def main(argv=None):
    """Script entry point.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    reads gene models from ``options.stdin`` via the GTF iterator, fetches
    the variants overlapping each gene (extended by ``options.border``
    bases on either side) and builds the alleles of every transcript.

    Depending on ``--output-section`` the alleles are written as cds
    sequences, peptide sequences, a cds-to-reference map, a gtf file
    and/or a tab-separated table.  Knocked-out alleles only appear in the
    fasta and gtf output if ``--with-knockouts`` is set; they are always
    part of the table.

    Raises
    ------
    ValueError
        If no genome file is given, if no source of variants is given, or
        if a tablename is supplied without a database.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-t", "--tablename", dest="tablename", type="string",
                      help="tablename to get variants from (in samtools pileup format) [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")

    parser.add_option("-f", "--exons-file", dest="filename_exons", type="string",
                      help="filename with transcript model information (gtf formatted file) [default=%default].")

    parser.add_option("-r", "--filename-reference", dest="filename_reference", type="string",
                      help="filename with transcript models of a reference gene set. Stop codons that do not"
                      " overlap any of the exons in this file are ignore (gtf-formatted file) [default=%default].")

    parser.add_option("--vcf-file", dest="filename_vcf", type="string",
                      help="filename with variants in VCF format. Should be indexed by tabix [default=%default].")

    parser.add_option("--pileup-file", dest="filename_pileup", type="string",
                      help="filename with variants in samtools pileup format. Should be indexed by tabix [default=%default].")

    parser.add_option("--vcf-sample", dest="vcf_sample", type="string",
                      help="sample id for species of interest in vcf formatted file [default=%default].")

    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
                      help="filename of a list of transcript ids that are selenoproteins [default=%default].")

    parser.add_option("-m", "--module", dest="modules", type="choice", action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")

    parser.add_option("-o", "--output-section", dest="output", type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")

    parser.add_option("-k", "--with-knockouts", dest="with_knockouts", action="store_true",
                      help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        filename_pileup=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    # The genome is queried for every gene (contig length, exon and intron
    # sequences), so fail early with a clear message instead of failing
    # on a None object inside the main loop.
    if not options.genome_file:
        raise ValueError("please supply a genome file (--genome-file).")
    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    if options.filename_seleno:
        # consume the list inside the context manager so that the
        # file handle is closed again
        with open(options.filename_seleno, "r") as inf:
            seleno = set(IOTools.readList(inf))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLite database, pileup file or vcf file
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    # no explicit section or "all" selects every output section
    output_all = len(options.output) == 0 or "all" in options.output

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype",
             ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min(min(x.start for x in y) for y in transcripts)
        overall_end = max(max(x.end for x in y) for y in transcripts)
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        # extend the gene region by the border, clipped to the contig
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            # single-argument call: same text under Python 2 and Python 3
            print("# collected variants: %s" % (variants,))

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants: %s" % (variants,))

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon %s" % (key,))
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron %s" % (key,))
                    # only show the first and last 30 bases of each intron
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] + variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(transcript,
                                   variant_exons,
                                   variant_introns,
                                   all_exons,
                                   all_introns,
                                   offsets,
                                   is_seleno=transcript_id in seleno,
                                   reference_coordinates=False,
                                   )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        # one CDS entry per exon: the length of an exon's
                        # CDS part is the distance to the next cds start
                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                # mirror the interval at the contig end
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            last_cds_start = cds_start

                        # the last exon extends to the end of the cds
                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig - allele.reference_first_stop_start, )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Script entry point.

    Parses command line options from ``sys.argv`` unless *argv* is given,
    reads gene models from ``options.stdin`` via the GTF iterator, fetches
    the variants overlapping each gene (extended by ``options.border``
    bases on either side) and builds the alleles of every transcript.

    Depending on ``--output-section`` the alleles are written as cds
    sequences, peptide sequences, a cds-to-reference map, a gtf file
    and/or a tab-separated table.  Knocked-out alleles only appear in the
    fasta and gtf output if ``--with-knockouts`` is set; they are always
    part of the table.

    Raises
    ------
    ValueError
        If no genome file is given, if no source of variants is given, or
        if a tablename is supplied without a database.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-t",
        "--tablename",
        dest="tablename",
        type="string",
        help="tablename to get variants from (in samtools pileup format) [default=%default]."
    )

    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="sqlite3 database [default=%default].")

    parser.add_option(
        "-f",
        "--exons-file",
        dest="filename_exons",
        type="string",
        help="filename with transcript model information (gtf formatted file) [default=%default]."
    )

    parser.add_option(
        "-r",
        "--filename-reference",
        dest="filename_reference",
        type="string",
        help="filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file) [default=%default]."
    )

    parser.add_option(
        "--vcf-file",
        dest="filename_vcf",
        type="string",
        help="filename with variants in VCF format. Should be indexed by tabix [default=%default]."
    )

    parser.add_option(
        "--pileup-file",
        dest="filename_pileup",
        type="string",
        help="filename with variants in samtools pileup format. Should be indexed by tabix [default=%default]."
    )

    parser.add_option(
        "--vcf-sample",
        dest="vcf_sample",
        type="string",
        help="sample id for species of interest in vcf formatted file [default=%default]."
    )

    parser.add_option(
        "-s",
        "--seleno-tsv-file",
        dest="filename_seleno",
        type="string",
        help="filename of a list of transcript ids that are selenoproteins [default=%default]."
    )

    parser.add_option("-m",
                      "--module",
                      dest="modules",
                      type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")

    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")

    parser.add_option(
        "-k",
        "--with-knockouts",
        dest="with_knockouts",
        action="store_true",
        help="add alleles that are knocked out to fasta and gtf files [default=%default]."
    )

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        filename_pileup=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    # The genome is queried for every gene (contig length, exon and intron
    # sequences), so fail early with a clear message instead of failing
    # on a None object inside the main loop.
    if not options.genome_file:
        raise ValueError("please supply a genome file (--genome-file).")
    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    if options.filename_seleno:
        # consume the list inside the context manager so that the
        # file handle is closed again
        with open(options.filename_seleno, "r") as inf:
            seleno = set(IOTools.readList(inf))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLite database, pileup file or vcf file
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    # no explicit section or "all" selects every output section
    output_all = len(options.output) == 0 or "all" in options.output

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(("gene_id", "transcript_id",
                                         "allele_id", "contig", "strand",
                                         "is_wildtype",
                                         ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min(min(x.start for x in y) for y in transcripts)
        overall_end = max(max(x.end for x in y) for y in transcripts)
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        # extend the gene region by the border, clipped to the contig
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug(
            "%s: found %i variants in %s:%i..%i" %
            (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug(
            "%s: found %i variants after merging in %s:%i..%i" %
            (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    # only show the first and last 30 bases of each intron
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        # one CDS entry per exon: the length of an exon's
                        # CDS part is the distance to the next cds start
                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                # mirror the interval at the contig end
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            last_cds_start = cds_start

                        # the last exon extends to the end of the cds
                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % ("\t".join(
                        (gene_id, transcript_id, allele_id, contig, strand,
                         "%i" % is_wildtype)), "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()