Beispiel #1
0
def main():
    """Run the discordant-pair workflow end to end.

    Steps, in order:

    1. Parse the command line.
    2. Extract unmapped mates from the BAM given by ``args.r2c`` into
       ``args.outdir``.
    3. Align those mates against ``args.transcripts_fasta`` using
       ``args.nthreads`` threads.
    4. Look for discordant pairs in the resulting alignments and write them
       to ``<outdir>/discordant_pairs.bedpe``.
    5. Unless ``args.no_cleanup`` is set, call ``cleanup`` on the output
       directory.
    """
    args = parse_args()

    # extract unmapped read pairs
    r2c_bam = pysam.AlignmentFile(args.r2c)
    unmapped_fastas = find_unmapped_mates(r2c_bam, args.outdir)

    # align unmapped reads
    unmapped_bam_file = align_unmapped_mates(unmapped_fastas,
                                             args.transcripts_fasta,
                                             args.nthreads,
                                             args.outdir)

    # extract fusions
    unmapped_bam = pysam.AlignmentFile(unmapped_bam_file)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    genome_fasta = pysam.FastaFile(args.genome_fasta)
    adjs = find_discordant_pairs(unmapped_bam,
                                 transcripts_dict,
                                 genome_fasta,
                                 min_pairs=args.min_pairs)
    Adjacency.report_events(adjs, '{}/discordant_pairs.bedpe'.format(args.outdir))

    # The body below was previously indented with a tab under a
    # space-indented ``if``, which Python 3 rejects with TabError.
    if not args.no_cleanup:
        cleanup(args.outdir)
Beispiel #2
0
def main():
    """Find discordant read pairs among unmapped mates and report them.

    The function parses the command line, pulls unmapped mates out of the
    BAM named by ``args.r2c``, aligns them to ``args.transcripts_fasta``,
    searches the new alignments for discordant pairs (requiring
    ``args.min_pairs``), and writes the result to
    ``<outdir>/discordant_pairs.bedpe``.  Intermediate files in
    ``args.outdir`` are passed to ``cleanup`` unless ``args.no_cleanup``
    is set.
    """
    args = parse_args()
    outdir = args.outdir

    # extract unmapped read pairs
    r2c_bam = pysam.AlignmentFile(args.r2c)
    unmapped_fastas = find_unmapped_mates(r2c_bam, outdir)

    # align unmapped reads
    unmapped_bam_file = align_unmapped_mates(unmapped_fastas,
                                             args.transcripts_fasta,
                                             args.nthreads,
                                             outdir)

    # extract fusions
    unmapped_bam = pysam.AlignmentFile(unmapped_bam_file)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    genome_fasta = pysam.FastaFile(args.genome_fasta)
    adjs = find_discordant_pairs(unmapped_bam,
                                 transcripts_dict,
                                 genome_fasta,
                                 min_pairs=args.min_pairs)
    Adjacency.report_events(adjs, '%s/discordant_pairs.bedpe' % outdir)

    # Indented with spaces like the rest of the function; the original used
    # a tab here, which is a TabError under Python 3.
    if not args.no_cleanup:
        cleanup(outdir)
Beispiel #3
0
def main():
    """Detect events from contig alignments and write ``<outdir>/sv.bedpe``.

    Steps, in order:

    1. Parse the command line and open the BAM, FASTA and annotation inputs.
    2. Find events via the genome alignments (only when both the genome BAM
       and the annotation tabix are available) and via the transcript
       alignments (only when the transcript BAM is available).  The
       transcript pass receives the genome mappings and disables indels.
    3. Combine the two result sets and merge identical events from
       different contigs.
    4. When exactly two ``args.genome_index`` values were given, filter the
       merged events by subsequence and probe alignments, unless the
       respective ``disable_*_filtering`` flag is set.
    5. When ``args.r2c`` is given, compute read support and keep only events
       whose ``spanning`` count reaches ``args.min_support``.
    6. Determine the reading frame of the remaining events and write the
       report, with the software version, a timestamp and the command line
       as header.
    """
    args = parse_args()

    # NOTE(review): the truthiness checks below assume the create_pysam_*
    # helpers return a falsy value when no input was supplied - confirm.
    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta,
                  annot_tabix,
                  transcripts_dict,
                  args.outdir,
                  probe_len=args.probe_len,
                  debug=args.debug)

    def find_events(bam, target_fasta, target_type, **extra):
        # Single place for the options shared by the genome and transcript
        # passes, so the two calls cannot drift apart.
        return sf.find_events(
            bam,
            query_fasta,
            target_fasta,
            target_type,
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            only_sense_fusion=not args.include_nonsense_fusion,
            only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
            only_coding_fusion=not args.include_noncoding_fusion,
            only_fusions=args.only_fusions,
            **extra)

    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}

    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = find_events(
            gbam, genome_fasta, 'genome')

    if tbam:
        # The genome mappings (possibly still empty) are handed to the
        # transcript pass; indels are not called from transcript alignments.
        events['via_transcripts'], mappings['via_transcripts'] = find_events(
            tbam,
            transcripts_fasta,
            'transcripts',
            external_mappings=mappings['via_genome'],
            no_indels=True)

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)

    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    if events_merged and args.genome_index and len(args.genome_index) == 2:
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged,
                              query_fasta,
                              args.genome_index[0],
                              args.genome_index[1],
                              args.outdir,
                              subseq_len=args.subseq_len,
                              debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged,
                             args.genome_index[0],
                             args.genome_index[1],
                             args.outdir,
                             args.probe_len,
                             debug=args.debug)

    # read support
    if args.r2c:
        find_support(events_merged,
                     args.r2c,
                     args.query_fasta,
                     min_overlap=args.min_overhang,
                     num_procs=args.nproc,
                     debug=args.debug)
        events_filtered = [
            event for event in events_merged
            if event.spanning >= args.min_support
        ]
    else:
        events_filtered = events_merged

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    # Named ``timestamp`` rather than ``time`` to avoid shadowing the stdlib
    # module name inside this function.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '{} {}'.format(pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '{}/sv.bedpe'.format(args.outdir),
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '{} {}'.format(timestamp, cmd)))
def main():
    """Find events in contig alignments and report them as BEDPE.

    Events are searched in the genome alignments (when the genome BAM and
    the annotation tabix are both available) and in the transcript
    alignments (when the transcript BAM is available), then combined and
    merged across contigs.  Optional stages follow: subsequence/probe
    filtering when two ``args.genome_index`` values were supplied, and
    read-support filtering (``spanning >= args.min_support``) when
    ``args.r2c`` was supplied.  Finally the frame of each remaining event is
    set and the events are written to ``<outdir>/sv.bedpe`` with a header
    holding the software version, a timestamp and the command line.
    """
    args = parse_args()

    # NOTE(review): the ``if gbam`` / ``if tbam`` checks assume the
    # create_pysam_* helpers return something falsy for missing inputs -
    # confirm against their definitions.
    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta, annot_tabix, transcripts_dict, args.outdir,
                  probe_len=args.probe_len, debug=args.debug)

    def shared_options():
        # Keyword options common to both find_events passes.  Built on
        # demand so ``args`` is only read when a pass actually runs.
        return dict(
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            only_sense_fusion=not args.include_nonsense_fusion,
            only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
            only_coding_fusion=not args.include_noncoding_fusion,
            only_fusions=args.only_fusions,
        )

    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}

    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = sf.find_events(
            gbam, query_fasta, genome_fasta, 'genome', **shared_options())

    if tbam:
        # The transcript pass sees the genome mappings and skips indels.
        events['via_transcripts'], mappings['via_transcripts'] = sf.find_events(
            tbam, query_fasta, transcripts_fasta, 'transcripts',
            external_mappings=mappings['via_genome'],
            no_indels=True,
            **shared_options())

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)

    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    if events_merged and args.genome_index and len(args.genome_index) == 2:
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged, query_fasta,
                              args.genome_index[0], args.genome_index[1],
                              args.outdir,
                              subseq_len=args.subseq_len, debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged,
                             args.genome_index[0], args.genome_index[1],
                             args.outdir, args.probe_len, debug=args.debug)

    # read support
    if args.r2c:
        find_support(events_merged, args.r2c, args.query_fasta,
                     min_overlap=args.min_overhang, num_procs=args.nproc,
                     debug=args.debug)
        events_filtered = [event for event in events_merged
                           if event.spanning >= args.min_support]
    else:
        events_filtered = events_merged

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    # ``timestamp`` instead of ``time`` so the stdlib module name is not
    # shadowed within this function.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '%s/sv.bedpe' % args.outdir,
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '%s %s' % (timestamp, cmd)))