def main():
    """Entry point: map contig alignments to annotated transcripts and report
    mappings, splice junctions, and novel splicing events.

    Reads all inputs named on the command line, runs ExonMapper over the
    contig-to-genome alignments, merges junctions/events across contigs,
    optionally adds read support and genome corroboration, then writes the
    three output files into args.outdir.
    """
    args = parse_args()

    # open all inputs up front
    bam = create_pysam_bam(args.bam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)
    genome_bam = create_pysam_bam(args.genome_bam) if args.genome_bam else None

    em = ExonMapper(annot_tabix, transcripts_dict, genome_fasta, debug=args.debug)

    # primary annotation plus any supplementary GTFs
    annots = [args.gtf]
    if args.suppl_annot:
        annots.extend(args.suppl_annot)
    accessory_known_features = extract_features(annots)

    mappings, junc_adjs, events = em.map_aligns(
        bam,
        query_fasta,
        genome_fasta,
        accessory_known_features=accessory_known_features,
        max_diff=args.max_diff_splice)

    # collapse identical junctions/events found in different contigs
    juncs_merged = Adjacency.merge(junc_adjs)
    events_merged = Adjacency.merge(events)

    if args.r2c:
        # gather read support for junctions and events in a single pass
        all_adjs = []
        if juncs_merged:
            all_adjs.extend(juncs_merged)
        if events_merged:
            all_adjs.extend(events_merged)
        if all_adjs:
            find_support(all_adjs, args.r2c, args.query_fasta,
                         num_procs=args.nproc, debug=args.debug)
        # NOTE(review): source formatting was ambiguous — filtering by
        # min_support is assumed to apply only when read support was
        # computed (i.e. inside the r2c branch); confirm against history
        if events_merged:
            filter_events(events_merged, args.min_support)

    if genome_bam:
        corroborate_genome(events_merged, genome_bam)

    # report, stamping outputs with command line and software version
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)
    em.output_mappings(mappings, '%s/mappings.tsv' % args.outdir)
    em.output_juncs(juncs_merged, '%s/junctions.bed' % args.outdir)
    em.output_events(events_merged, '%s/novel_splicing.bedpe' % args.outdir,
                     header=(software, '%s %s' % (time, cmd)))
def main():
    """Run the novel-splicing pipeline: align-map contigs, merge junctions and
    events, optionally attach read support, and write the output files."""
    args = parse_args()
    bam = create_pysam_bam(args.bam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    genome_bam = None
    if args.genome_bam:
        genome_bam = create_pysam_bam(args.genome_bam)

    em = ExonMapper(annot_tabix, transcripts_dict, genome_fasta,
                    debug=args.debug)

    # known features come from the main GTF and any supplementary annotations
    annots = [args.gtf]
    if args.suppl_annot:
        annots.extend(args.suppl_annot)
    accessory_known_features = extract_features(annots)

    mappings, junc_adjs, events = em.map_aligns(
        bam, query_fasta, genome_fasta,
        accessory_known_features=accessory_known_features,
        max_diff=args.max_diff_splice)

    juncs_merged = Adjacency.merge(junc_adjs)
    events_merged = Adjacency.merge(events)

    if args.r2c:
        # pool junctions and events so read support is found in one call
        all_adjs = list(juncs_merged) if juncs_merged else []
        if events_merged:
            all_adjs.extend(events_merged)
        if all_adjs:
            find_support(all_adjs,
                         args.r2c,
                         args.query_fasta,
                         num_procs=args.nproc,
                         debug=args.debug)
        # NOTE(review): source formatting was ambiguous — min_support
        # filtering assumed to run only when support was computed; confirm
        if events_merged:
            filter_events(events_merged, args.min_support)

    if genome_bam:
        corroborate_genome(events_merged, genome_bam)

    # metadata header for the reports
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)

    em.output_mappings(mappings, '%s/mappings.tsv' % args.outdir)
    em.output_juncs(juncs_merged, '%s/junctions.bed' % args.outdir)
    em.output_events(events_merged,
                     '%s/novel_splicing.bedpe' % args.outdir,
                     header=(software, '%s %s' % (time, cmd)))
def extract_multiple_contig_events(events_by_contig):
    """Finds multiple contigs that map to same event, for use in same_event()"""
    # pool every event across all contigs, then merge identical events
    pooled = []
    for contig_events in events_by_contig.values():
        pooled.extend(contig_events)
    merged = Adjacency.merge(pooled)

    # a merged event backed by >1 contig carries a comma-joined seq_id;
    # collect every contig name appearing in such seq_ids
    return {name
            for event in merged
            if ',' in event.seq_id
            for name in event.seq_id.split(',')}
def extract_multiple_contig_events(events_by_contig):
    """Finds multiple contigs that map to same event, for use in same_event()

    Args:
        events_by_contig: dict mapping a contig name to a list of its events

    Returns:
        set of contig names whose events merged with events from other
        contigs (the merged event's seq_id lists the contigs comma-joined)
    """
    events = []
    for contig_events in events_by_contig.values():
        events.extend(contig_events)
    events_merged = Adjacency.merge(events)
    # bug fix: `Set()` is undefined (Python 2 `sets.Set`, removed in Python 3)
    # and would raise NameError; the builtin set() is the correct type
    contigs = set()
    for event in events_merged:
        if ',' in event.seq_id:
            for contig in event.seq_id.split(','):
                contigs.add(contig)
    return contigs
def main():
    """Run the structural-variant pipeline: find events from genome and
    transcriptome alignments, merge/filter them, attach read support,
    set reading frame, and report to <outdir>/sv.bedpe.
    """
    args = parse_args()

    # open all inputs up front
    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta,
                  annot_tabix,
                  transcripts_dict,
                  args.outdir,
                  probe_len=args.probe_len,
                  debug=args.debug)

    # events/mappings keyed by the alignment source
    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}
    # fix: removed unused local `gene_hits = None` (never read in this function)

    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = sf.find_events(
            gbam,
            query_fasta,
            genome_fasta,
            'genome',
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            only_sense_fusion=not args.include_nonsense_fusion,
            only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
            only_coding_fusion=not args.include_noncoding_fusion,
            only_fusions=args.only_fusions)

    if tbam:
        # transcriptome pass skips indels and reuses the genome mappings
        events['via_transcripts'], mappings['via_transcripts'] = sf.find_events(
            tbam,
            query_fasta,
            transcripts_fasta,
            'transcripts',
            external_mappings=mappings['via_genome'],
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            no_indels=True,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            only_sense_fusion=not args.include_nonsense_fusion,
            only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
            only_coding_fusion=not args.include_noncoding_fusion,
            only_fusions=args.only_fusions)

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)
    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    if events_merged and args.genome_index and len(args.genome_index) == 2:
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged,
                              query_fasta,
                              args.genome_index[0],
                              args.genome_index[1],
                              args.outdir,
                              subseq_len=args.subseq_len,
                              debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged,
                             args.genome_index[0],
                             args.genome_index[1],
                             args.outdir,
                             args.probe_len,
                             debug=args.debug)

    # read support
    if args.r2c:
        find_support(events_merged,
                     args.r2c,
                     args.query_fasta,
                     min_overlap=args.min_overhang,
                     num_procs=args.nproc,
                     debug=args.debug)
        events_filtered = [
            event for event in events_merged
            if event.spanning >= args.min_support
        ]
    else:
        events_filtered = events_merged

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '{} {}'.format(pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '{}/sv.bedpe'.format(args.outdir),
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '{} {}'.format(time, cmd)))
def main():
    """Detect structural variants from genome and transcriptome contig
    alignments, filter and support-check them, and write sv.bedpe."""
    args = parse_args()
    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta, annot_tabix, transcripts_dict, args.outdir,
                  probe_len=args.probe_len, debug=args.debug)

    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}
    gene_hits = None

    # fusion-related switches shared by both find_events() passes
    fusion_flags = dict(
        only_sense_fusion=not args.include_nonsense_fusion,
        only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
        only_coding_fusion=not args.include_noncoding_fusion,
        only_fusions=args.only_fusions,
    )

    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = sf.find_events(
            gbam, query_fasta, genome_fasta, 'genome',
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            **fusion_flags)

    if tbam:
        # the transcriptome pass reuses genome mappings and skips indels
        events['via_transcripts'], mappings['via_transcripts'] = sf.find_events(
            tbam, query_fasta, transcripts_fasta, 'transcripts',
            external_mappings=mappings['via_genome'],
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            no_indels=True,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            **fusion_flags)

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)
    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    have_index = args.genome_index and len(args.genome_index) == 2
    if events_merged and have_index:
        index_prefix, index_path = args.genome_index[0], args.genome_index[1]
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged, query_fasta, index_prefix,
                              index_path, args.outdir,
                              subseq_len=args.subseq_len, debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged, index_prefix, index_path,
                             args.outdir, args.probe_len, debug=args.debug)

    # read support
    if not args.r2c:
        events_filtered = events_merged
    else:
        find_support(events_merged, args.r2c, args.query_fasta,
                     min_overlap=args.min_overhang,
                     num_procs=args.nproc, debug=args.debug)
        events_filtered = [e for e in events_merged
                           if e.spanning >= args.min_support]

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '%s/sv.bedpe' % args.outdir,
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '%s %s' % (time, cmd)))