def main():
    """Run the discordant-pair pipeline from the command line.

    Steps, in order:
      1. extract unmapped read pairs from the BAM given by ``args.r2c``;
      2. align them against ``args.transcripts_fasta``;
      3. call ``find_discordant_pairs`` on the resulting alignments and write
         the events to ``<outdir>/discordant_pairs.bedpe``;
      4. call ``cleanup`` on the output directory unless ``args.no_cleanup``
         is set.

    Every pysam handle is opened in a ``with`` block so it is closed
    deterministically, and in particular before ``cleanup`` runs.
    """
    args = parse_args()

    # extract unmapped read pairs
    # NOTE(review): assumes find_unmapped_mates() is done reading the BAM by
    # the time it returns (its result is handed to the aligner) - confirm.
    with pysam.AlignmentFile(args.r2c) as r2c_bam:
        unmapped_fastas = find_unmapped_mates(r2c_bam, args.outdir)

    # align unmapped reads
    unmapped_bam_file = align_unmapped_mates(unmapped_fastas,
                                             args.transcripts_fasta,
                                             args.nthreads,
                                             args.outdir)

    # extract fusions
    with pysam.AlignmentFile(unmapped_bam_file) as unmapped_bam:
        transcripts_dict = Transcript.extract_transcripts(args.gtf)
        with pysam.FastaFile(args.genome_fasta) as genome_fasta:
            adjs = find_discordant_pairs(unmapped_bam,
                                         transcripts_dict,
                                         genome_fasta,
                                         min_pairs=args.min_pairs)
            # report while the handles are still open, in case the events
            # are produced lazily from them
            Adjacency.report_events(
                adjs, '{}/discordant_pairs.bedpe'.format(args.outdir))

    # all handles are closed at this point, so intermediate files can go
    if not args.no_cleanup:
        cleanup(args.outdir)
def main():
    """Run the discordant-pair pipeline from the command line.

    Steps, in order:
      1. extract unmapped read pairs from the BAM given by ``args.r2c``;
      2. align them against ``args.transcripts_fasta``;
      3. call ``find_discordant_pairs`` on the resulting alignments and write
         the events to ``<outdir>/discordant_pairs.bedpe``;
      4. call ``cleanup`` on the output directory unless ``args.no_cleanup``
         is set.

    Every pysam handle is opened in a ``with`` block so it is closed
    deterministically, and in particular before ``cleanup`` runs.
    """
    args = parse_args()

    # extract unmapped read pairs
    # NOTE(review): assumes find_unmapped_mates() is done reading the BAM by
    # the time it returns (its result is handed to the aligner) - confirm.
    with pysam.AlignmentFile(args.r2c) as r2c_bam:
        unmapped_fastas = find_unmapped_mates(r2c_bam, args.outdir)

    # align unmapped reads
    unmapped_bam_file = align_unmapped_mates(unmapped_fastas,
                                             args.transcripts_fasta,
                                             args.nthreads,
                                             args.outdir)

    # extract fusions
    with pysam.AlignmentFile(unmapped_bam_file) as unmapped_bam:
        transcripts_dict = Transcript.extract_transcripts(args.gtf)
        with pysam.FastaFile(args.genome_fasta) as genome_fasta:
            adjs = find_discordant_pairs(unmapped_bam,
                                         transcripts_dict,
                                         genome_fasta,
                                         min_pairs=args.min_pairs)
            # report while the handles are still open, in case the events
            # are produced lazily from them
            Adjacency.report_events(
                adjs, '%s/discordant_pairs.bedpe' % args.outdir)

    # all handles are closed at this point, so intermediate files can go
    if not args.no_cleanup:
        cleanup(args.outdir)
def main():
    """Find structural-variant events and write them to ``<outdir>/sv.bedpe``.

    Pipeline, in order:
      1. open the input BAM/FASTA/tabix handles and load transcripts from
         ``args.gtf``;
      2. call ``SVFinder.find_events`` on the genome alignments (only when
         both the genome BAM and the annotation tabix are available) and on
         the transcript alignments (only when the transcript BAM is
         available);
      3. combine the two result sets and merge them with ``Adjacency.merge``;
      4. optionally filter by subsequence / probe alignments when exactly two
         ``args.genome_index`` values were supplied;
      5. optionally gather read support from ``args.r2c`` and keep only
         events whose ``spanning`` is at least ``args.min_support``;
      6. set the reading frame of the remaining events and report them, with
         a header recording software version, timestamp and command line.
    """
    args = parse_args()

    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta,
                  annot_tabix,
                  transcripts_dict,
                  args.outdir,
                  probe_len=args.probe_len,
                  debug=args.debug)

    # options passed unchanged to both find_events() calls below; kept in one
    # place so the genome and transcript passes cannot drift apart
    shared_opts = dict(
        min_indel_size=args.min_indel_size,
        min_dup_size=args.min_dup_size,
        min_indel_flanking=args.min_indel_flanking,
        no_utr=args.no_utr,
        max_novel_len=args.max_novel_len,
        max_homol_len=args.max_homol_len,
        only_sense_fusion=not args.include_nonsense_fusion,
        only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
        only_coding_fusion=not args.include_noncoding_fusion,
        only_fusions=args.only_fusions,
    )

    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}

    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = sf.find_events(
            gbam, query_fasta, genome_fasta, 'genome', **shared_opts)

    if tbam:
        # the transcript pass receives the genome mappings (empty if the
        # genome pass was skipped) and has indels turned off
        events['via_transcripts'], mappings['via_transcripts'] = sf.find_events(
            tbam, query_fasta, transcripts_fasta, 'transcripts',
            external_mappings=mappings['via_genome'],
            no_indels=True,
            **shared_opts)

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)

    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    if events_merged and args.genome_index and len(args.genome_index) == 2:
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged,
                              query_fasta,
                              args.genome_index[0],
                              args.genome_index[1],
                              args.outdir,
                              subseq_len=args.subseq_len,
                              debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged,
                             args.genome_index[0],
                             args.genome_index[1],
                             args.outdir,
                             args.probe_len,
                             debug=args.debug)

    # read support
    if args.r2c:
        find_support(events_merged,
                     args.r2c,
                     args.query_fasta,
                     min_overlap=args.min_overhang,
                     num_procs=args.nproc,
                     debug=args.debug)
        events_filtered = [
            event for event in events_merged
            if event.spanning >= args.min_support
        ]
    else:
        events_filtered = events_merged

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    # named `timestamp` rather than `time` so the stdlib module name is not
    # shadowed inside this function
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '{} {}'.format(pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '{}/sv.bedpe'.format(args.outdir),
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '{} {}'.format(timestamp, cmd)))
def main():
    """Find structural-variant events and write them to ``<outdir>/sv.bedpe``.

    Pipeline, in order:
      1. open the input BAM/FASTA/tabix handles and load transcripts from
         ``args.gtf``;
      2. call ``SVFinder.find_events`` on the genome alignments (only when
         both the genome BAM and the annotation tabix are available) and on
         the transcript alignments (only when the transcript BAM is
         available);
      3. combine the two result sets and merge them with ``Adjacency.merge``;
      4. optionally filter by subsequence / probe alignments when exactly two
         ``args.genome_index`` values were supplied;
      5. optionally gather read support from ``args.r2c`` and keep only
         events whose ``spanning`` is at least ``args.min_support``;
      6. set the reading frame of the remaining events and report them, with
         a header recording software version, timestamp and command line.
    """
    args = parse_args()

    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta, annot_tabix, transcripts_dict, args.outdir,
                  probe_len=args.probe_len, debug=args.debug)

    # options passed unchanged to both find_events() calls below; kept in one
    # place so the genome and transcript passes cannot drift apart
    shared_opts = {
        'min_indel_size': args.min_indel_size,
        'min_dup_size': args.min_dup_size,
        'min_indel_flanking': args.min_indel_flanking,
        'no_utr': args.no_utr,
        'max_novel_len': args.max_novel_len,
        'max_homol_len': args.max_homol_len,
        'only_sense_fusion': not args.include_nonsense_fusion,
        'only_exon_bound_fusion': not args.include_non_exon_bound_fusion,
        'only_coding_fusion': not args.include_noncoding_fusion,
        'only_fusions': args.only_fusions,
    }

    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}

    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = sf.find_events(
            gbam, query_fasta, genome_fasta, 'genome', **shared_opts)

    if tbam:
        # the transcript pass receives the genome mappings (empty if the
        # genome pass was skipped) and has indels turned off
        events['via_transcripts'], mappings['via_transcripts'] = sf.find_events(
            tbam, query_fasta, transcripts_fasta, 'transcripts',
            external_mappings=mappings['via_genome'],
            no_indels=True,
            **shared_opts)

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)

    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    if events_merged and args.genome_index and len(args.genome_index) == 2:
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged, query_fasta,
                              args.genome_index[0], args.genome_index[1],
                              args.outdir,
                              subseq_len=args.subseq_len,
                              debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged,
                             args.genome_index[0], args.genome_index[1],
                             args.outdir, args.probe_len,
                             debug=args.debug)

    # read support
    if args.r2c:
        find_support(events_merged, args.r2c, args.query_fasta,
                     min_overlap=args.min_overhang,
                     num_procs=args.nproc,
                     debug=args.debug)
        events_filtered = [event for event in events_merged
                           if event.spanning >= args.min_support]
    else:
        events_filtered = events_merged

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    # named `timestamp` rather than `time` so the stdlib module name is not
    # shadowed inside this function
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '%s/sv.bedpe' % args.outdir,
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '%s %s' % (timestamp, cmd)))