def buildVariantSequences(indexed_variants, sequences):
    '''build variant sequences by inserting ``variants`` into ``sequences``.

    For each sequence, two alleles are returned. Both alleles are
    initialized as wildtype sequences. In the absence of any phasing
    information, variants are preferably added to the second allele,
    such that the wild-type status of the first allele is preserved as
    much as possible.

    ``sequences`` maps ``(feature_start, feature_end)`` to a sequence.
    ``indexed_variants`` must provide ``find(start, end)`` yielding
    3-tuples ``(x, y, z)`` in which ``z`` is itself a tuple; each hit is
    flattened to ``(x, y) + z`` before being handed to
    ``Variants.buildAlleles``.

    returns a dictionary mapping ``(feature_start, feature_end)`` to the
    tuple ``(allele1, allele2)``.
    '''
    result = {}

    # ``items()`` exists on both python 2 and python 3 dictionaries,
    # ``iteritems()`` only on python 2.
    for (feature_start, feature_end), sequence in sequences.items():
        variants = [
            (x, y,) + z
            for (x, y, z) in indexed_variants.find(feature_start,
                                                   feature_end)]
        allele1, allele2 = Variants.buildAlleles(
            sequence,
            variants,
            reference_start=feature_start)
        result[(feature_start, feature_end)] = (allele1, allele2)

    return result
def buildVariantSequences(indexed_variants, sequences):
    '''insert the variants of ``indexed_variants`` into ``sequences``.

    ``sequences`` maps ``(start, end)`` intervals to sequences. For every
    interval the overlapping variants are looked up with
    ``indexed_variants.find(start, end)``, flattened from ``(x, y, z)``
    to ``(x, y) + z`` and applied via ``Variants.buildAlleles`` using the
    interval start as ``reference_start``.

    Two alleles are built per sequence. Both start out as wildtype; in
    the absence of phasing information variants are preferably added to
    the second allele so that the first allele stays wildtype as far as
    possible.

    returns a dictionary mapping each ``(start, end)`` interval to the
    pair ``(allele1, allele2)``.
    '''
    alleles_by_interval = {}

    for interval, wildtype in sequences.items():
        start, end = interval

        flattened = []
        for first, second, rest in indexed_variants.find(start, end):
            flattened.append((first, second) + rest)

        first_allele, second_allele = Variants.buildAlleles(
            wildtype, flattened, reference_start=start)

        alleles_by_interval[(start, end)] = (first_allele, second_allele)

    return alleles_by_interval
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Gene models are read in GTF format from ``options.stdin`` and
    processed gene by gene: the variants overlapping the gene (extended
    by ``options.border`` on either side) are fetched from the selected
    variant source, merged and filtered, inserted into the exon and
    intron sequences, and the alleles of every transcript are built and
    written to the requested output sections (cds, map, peptide, table,
    gtf).

    Raises ValueError if no variant source is specified, if a tablename
    is given without a database, or if a gene is read although no genome
    file was supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-t", "--tablename", dest="tablename", type="string",
                      help="tablename to get variants from (in samtools pileup format) [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")

    parser.add_option("-f", "--exons-file", dest="filename_exons", type="string",
                      help="filename with transcript model information (gtf formatted file) [default=%default].")

    parser.add_option("-r", "--filename-reference", dest="filename_reference", type="string",
                      help="filename with transcript models of a reference gene set. Stop codons that do not"
                      " overlap any of the exons in this file are ignore (gtf-formatted file) [default=%default].")

    parser.add_option("--vcf-file", dest="filename_vcf", type="string",
                      help="filename with variants in VCF format. Should be indexed by tabix [default=%default].")

    parser.add_option("--pileup-file", dest="filename_pileup", type="string",
                      help="filename with variants in samtools pileup format. Should be indexed by tabix [default=%default].")

    parser.add_option("--vcf-sample", dest="vcf_sample", type="string",
                      help="sample id for species of interest in vcf formatted file [default=%default].")

    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
                      help="filename of a list of transcript ids that are selenoproteins [default=%default].")

    parser.add_option("-m", "--module", dest="modules", type="choice", action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")

    parser.add_option("-o", "--output-section", dest="output", type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")

    parser.add_option("-k", "--with-knockouts", dest="with_knockouts", action="store_true",
                      help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        # was misspelled ``filename_referenec``, so the default was
        # registered under a name that no option uses.
        filename_reference=None,
        filename_seleno=None,
        filename_pileup=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    # transcript ids of selenoproteins; only used for membership tests
    if options.filename_seleno:
        # close the list file once it has been read
        with open(options.filename_seleno, "r") as infile_seleno:
            seleno = set(IOTools.readList(infile_seleno))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # select the source of variants: sqlite table, pileup file or VCF file
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    # no section given means all sections are written
    output_all = len(options.output) == 0 or "all" in options.output

    def _open_section(section, filename):
        '''open the output file of ``section`` if requested, else None.'''
        if section in options.output or output_all:
            return E.openOutputFile(filename)
        return None

    outfile_cds = _open_section("cds", "cds.fasta")
    outfile_map = _open_section("map", "map.psl")
    outfile_peptides = _open_section("peptide", "peptides.fasta")
    outfile_alleles = _open_section("table", "table")
    if outfile_alleles is not None:
        outfile_alleles.write("\t".join(
            ("gene_id",
             "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype",
             ("\t".join(Allele._fields)))) + "\n")
    outfile_gtf = _open_section("gtf", "gtf")

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        # fail with a clear message instead of an AttributeError on None
        if fasta is None:
            raise ValueError(
                "please supply a genome file (--genome-file).")

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            # a single pre-formatted argument prints identically with the
            # python 2 print statement and the python 3 print function
            print("# collected variants: %s" % str(variants))

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants: %s" % str(variants))

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon %s" % str(key))
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron %s" % str(key))
                    # only the first and last 30 positions are shown
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(transcript,
                                   variant_exons,
                                   variant_introns,
                                   all_exons,
                                   all_introns,
                                   offsets,
                                   is_seleno=transcript_id in seleno,
                                   reference_coordinates=False,
                                   )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"

                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        # the length of an exon's CDS part is the distance
                        # to the CDS start of the following exon; each
                        # entry is written once its successor is known
                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                # flip to positive strand coordinates
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            last_cds_start = cds_start

                        # the last exon extends to the end of the CDS
                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Gene models are read in GTF format from ``options.stdin``. For each
    gene the variants overlapping it (extended by ``options.border`` on
    either side) are fetched from the selected variant source, merged and
    filtered and inserted into the exon and intron sequences. The alleles
    of every transcript are then built and written to the requested
    output sections (cds, map, peptide, table, gtf).

    Raises ValueError if no variant source is specified, if a tablename
    is given without a database, or if a gene is read although no genome
    file was supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-t", "--tablename", dest="tablename", type="string",
        help="tablename to get variants from (in samtools pileup format) [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")

    parser.add_option(
        "-f", "--exons-file", dest="filename_exons", type="string",
        help="filename with transcript model information (gtf formatted file) [default=%default].")

    parser.add_option(
        "-r", "--filename-reference", dest="filename_reference",
        type="string",
        help="filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file) [default=%default].")

    parser.add_option(
        "--vcf-file", dest="filename_vcf", type="string",
        help="filename with variants in VCF format. Should be indexed by tabix [default=%default].")

    parser.add_option(
        "--pileup-file", dest="filename_pileup", type="string",
        help="filename with variants in samtools pileup format. Should be indexed by tabix [default=%default].")

    parser.add_option(
        "--vcf-sample", dest="vcf_sample", type="string",
        help="sample id for species of interest in vcf formatted file [default=%default].")

    parser.add_option(
        "-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
        help="filename of a list of transcript ids that are selenoproteins [default=%default].")

    parser.add_option("-m", "--module", dest="modules", type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf",
                               "map"),
                      help="sections to output [default=%default].")

    parser.add_option(
        "-k", "--with-knockouts", dest="with_knockouts",
        action="store_true",
        help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        # was misspelled ``filename_referenec``, so the default was
        # registered under a name that no option uses.
        filename_reference=None,
        filename_seleno=None,
        filename_pileup=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    fasta = None
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # transcript ids of selenoproteins; only used for membership tests
    seleno = set()
    if options.filename_seleno:
        # close the list file once it has been read
        with open(options.filename_seleno, "r") as infile_seleno:
            seleno = set(IOTools.readList(infile_seleno))

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # select the source of variants: sqlite table, pileup file or VCF file
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    # no section given means all sections are written
    output_all = len(options.output) == 0 or "all" in options.output

    def _wanted(section):
        '''return True if ``section`` was requested for output.'''
        return section in options.output or output_all

    # files are opened in the order cds, map, peptide, table, gtf
    outfile_cds = E.openOutputFile("cds.fasta") if _wanted("cds") else None
    outfile_map = E.openOutputFile("map.psl") if _wanted("map") else None
    outfile_peptides = (E.openOutputFile("peptides.fasta")
                        if _wanted("peptide") else None)

    if _wanted("table"):
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype", ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    outfile_gtf = E.openOutputFile("gtf") if _wanted("gtf") else None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        # fail with a clear message instead of an AttributeError on None
        if fasta is None:
            raise ValueError(
                "please supply a genome file (--genome-file).")

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    # only the first and last 30 positions are shown
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"

                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        # the length of an exon's CDS part is the distance
                        # to the CDS start of the following exon; each
                        # entry is written once its successor is known
                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                # flip to positive strand coordinates
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            last_cds_start = cds_start

                        # the last exon extends to the end of the CDS
                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % ("\t".join(
                        (gene_id, transcript_id, allele_id, contig, strand,
                         "%i" % is_wildtype)), "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intervals are read in GTF format from ``options.stdin``. For each
    interval the variants of every track (a table in the sqlite
    database) are fetched, the two alleles per track are built from the
    reference sequence and a MAF alignment block containing the
    reference and all alleles is written to ``options.stdout``
    (optionally gzip-compressed).

    Raises ValueError if database or tracks are missing or if an
    interval is read although no genome file was supplied, and
    AttributeError if the track pattern does not match a track name.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-t", "--tracks", dest="tracks", type="string", action="append",
                      help="tracks (tablenames) to use in sqlite database [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")

    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="name of reference [default=%default].")

    parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
                      help="if set, the gene_id will be added to the alignment header [default=%default].")

    parser.add_option("-z", "--compress", dest="compress", action="store_true",
                      help="compress output with gzip [default=%default].")

    parser.add_option("-p", "--pattern-identifier", dest="pattern_track", type="string",
                      help="regular expression pattern for track [default=%default].")

    parser.set_defaults(
        genome_file=None,
        tracks=[],
        database="csvdb",
        output=[],
        border=0,
        # the option is stored in ``reference``; the default used to be
        # registered as ``reference_name`` and thus never took effect,
        # leaving ``options.reference`` as None unless -r was given.
        reference="reference",
        # raw string: ``\S`` is not a valid string escape
        pattern_track=r"(\S+)",
        is_gtf=True,
        compress=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if not options.database or not options.tracks:
        raise ValueError("please supply both database and tracks")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    # the same iterator is used whether or not --is-gtf is set
    infile_gff = GTF.iterator(options.stdin)

    dbhandle = sqlite3.connect(options.database)

    # the table name cannot be bound as an SQL parameter and is
    # interpolated; contig and coordinates are passed as bound values.
    statement = '''SELECT pos, reference, genotype
    FROM %(track)s
    WHERE contig = ?
    AND pos BETWEEN ? AND ?
    '''

    counts = E.Counter()
    tracks = options.tracks
    try:
        translated_tracks = [
            re.search(options.pattern_track, track).groups()[0]
            for track in tracks]
    except AttributeError:
        raise AttributeError(
            "pattern `%s` does not match input tracks." %
            options.pattern_track)

    if options.compress:
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"

    def _add_gaps(sequence, colcounts):
        '''output gapped sequence.

        Each column of ``sequence`` is padded with ``-`` up to the
        width recorded for it in ``colcounts``.
        '''
        # colcounts holds floats (numpy.ones default dtype); a string
        # can only be repeated by an integer.
        return "".join(
            c + "-" * (int(colcounts[x]) - len(c))
            for x, c in enumerate(sequence))

    for gff in infile_gff:
        counts.input += 1

        # fail with a clear message instead of an AttributeError on None
        if fasta is None:
            raise ValueError(
                "please supply a genome file (--genome-file).")

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        # from here on the contig name is used without a "chr" prefix
        if contig.startswith("chr"):
            contig = contig[3:]
        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants
        all_variants = []
        for track in options.tracks:
            cc = dbhandle.cursor()
            cc.execute(statement % {"track": track},
                       (contig, extended_start, extended_end))
            # materialize: the variants are counted below
            all_variants.append(
                list(map(Variants.Variant._make, cc.fetchall())))
            cc.close()

        E.debug("%s:%i..%i collected %i variants for %i tracks" %
                (contig,
                 region_start, region_end,
                 sum([len(x) for x in all_variants]),
                 len(all_variants)))

        reference_seq = fasta.getSequence(
            contig, "+", region_start, region_end)
        lseq = len(reference_seq)
        alleles = collections.defaultdict(list)

        # build allele sequences for track and count maximum chars per mali
        # column
        colcounts = numpy.ones(lseq)
        for track, variants in zip(translated_tracks, all_variants):
            variants = Variants.updateVariants(variants, lcontig, "+")
            a = Variants.buildAlleles(reference_seq,
                                      variants,
                                      reference_start=region_start)

            alleles[track] = a
            for allele in a:
                for column, c in enumerate(allele):
                    colcounts[column] = max(colcounts[column], len(c))

        # realign gapped regions
        alignIndels(alleles, colcounts)

        if options.is_gtf:
            outfile.write("a gene_id=%s\n" % gff.gene_id)
        else:
            outfile.write("a\n")

        if is_positive:
            pos = region_start
        else:
            pos = lcontig - region_start

        # reference row
        outfile.write(maf_format % {
            "name": ".".join((options.reference, contig)),
            "pos": pos,
            "size": lseq,
            "strand": strand,
            "lcontig": lcontig,
            "seq": _add_gaps(reference_seq, colcounts),
        })

        # one row per allele and track
        for track in translated_tracks:
            for aid, allele in enumerate(alleles[track]):
                seq = _add_gaps(allele, colcounts)
                if not is_positive:
                    # NOTE(review): the return value is discarded, so the
                    # written sequence is not complemented. Kept as is, as
                    # the reference row is not complemented either -
                    # confirm the intended reverse strand output.
                    Genomics.complement(seq)
                outfile.write(maf_format % {
                    "name": ".".join((track + "-%i" % aid, contig)),
                    "pos": pos,
                    "size": len(seq) - seq.count("-"),
                    "strand": strand,
                    "lcontig": lcontig,
                    "seq": seq,
                })

        outfile.write("\n")

    dbhandle.close()

    if options.compress:
        # writes the gzip trailer; the wrapped ``options.stdout`` itself
        # is not closed by GzipFile.close().
        outfile.close()

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intervals are read in GTF format from ``options.stdin``. For each
    interval the variants of every track (a table in the sqlite
    database) are fetched, the two alleles per track are built from the
    reference sequence and a MAF alignment block containing the
    reference and all alleles is written to ``options.stdout``
    (optionally gzip-compressed).

    Raises ValueError if database or tracks are missing or if an
    interval is read although no genome file was supplied, and
    AttributeError if the track pattern does not match a track name.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-t", "--tracks", dest="tracks", type="string", action="append",
                      help="tracks (tablenames) to use in sqlite database [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")

    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="name of reference [default=%default].")

    parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
                      help="if set, the gene_id will be added to the alignment header [default=%default].")

    parser.add_option("-z", "--compress", dest="compress", action="store_true",
                      help="compress output with gzip [default=%default].")

    parser.add_option("-p", "--pattern-identifier", dest="pattern_track", type="string",
                      help="regular expression pattern for track [default=%default].")

    parser.set_defaults(
        genome_file=None,
        tracks=[],
        database="csvdb",
        output=[],
        border=0,
        # the option is stored in ``reference``; the default used to be
        # registered as ``reference_name`` and thus never took effect,
        # leaving ``options.reference`` as None unless -r was given.
        reference="reference",
        # raw string: ``\S`` is not a valid string escape
        pattern_track=r"(\S+)",
        is_gtf=True,
        compress=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.database or not options.tracks:
        raise ValueError("please supply both database and tracks")

    fasta = None
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # the same iterator is used whether or not --is-gtf is set
    infile_gff = GTF.iterator(options.stdin)

    dbhandle = sqlite3.connect(options.database)

    # the table name cannot be bound as an SQL parameter and is
    # interpolated; contig and coordinates are passed as bound values.
    statement = '''SELECT pos, reference, genotype
    FROM %(track)s
    WHERE contig = ?
    AND pos BETWEEN ? AND ?
    '''

    counts = E.Counter()
    tracks = options.tracks
    try:
        translated_tracks = [
            re.search(options.pattern_track, track).groups()[0]
            for track in tracks]
    except AttributeError:
        raise AttributeError(
            "pattern `%s` does not match input tracks." %
            options.pattern_track)

    if options.compress:
        # NOTE(review): text is written to this GzipFile below; confirm
        # that compressed output works when options.stdout is a text
        # stream.
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"

    def _add_gaps(sequence, colcounts):
        '''output gapped sequence.

        Each column of ``sequence`` is padded with ``-`` up to the
        width recorded for it in ``colcounts``.
        '''
        padded = []
        for x, c in enumerate(sequence):
            # colcounts holds floats (numpy.ones default dtype); a
            # string can only be repeated by an integer.
            padded.append(c + "-" * (int(colcounts[x]) - len(c)))
        return "".join(padded)

    for gff in infile_gff:
        counts.input += 1

        # fail with a clear message instead of an AttributeError on None
        if fasta is None:
            raise ValueError(
                "please supply a genome file (--genome-file).")

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        # from here on the contig name is used without a "chr" prefix
        if contig.startswith("chr"):
            contig = contig[3:]
        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants
        all_variants = []
        for track in options.tracks:
            cc = dbhandle.cursor()
            cc.execute(statement % {"track": track},
                       (contig, extended_start, extended_end))
            all_variants.append(
                list(map(Variants.Variant._make, cc.fetchall())))
            cc.close()

        E.debug("%s:%i..%i collected %i variants for %i tracks" %
                (contig,
                 region_start, region_end,
                 sum([len(x) for x in all_variants]),
                 len(all_variants)))

        reference_seq = fasta.getSequence(
            contig, "+", region_start, region_end)
        lseq = len(reference_seq)
        alleles = collections.defaultdict(list)

        # build allele sequences for track and count maximum chars per mali
        # column
        colcounts = numpy.ones(lseq)
        for track, variants in zip(translated_tracks, all_variants):
            variants = Variants.updateVariants(variants, lcontig, "+")
            a = Variants.buildAlleles(reference_seq,
                                      variants,
                                      reference_start=region_start)

            alleles[track] = a
            for allele in a:
                for column, c in enumerate(allele):
                    colcounts[column] = max(colcounts[column], len(c))

        # realign gapped regions
        alignIndels(alleles, colcounts)

        if options.is_gtf:
            outfile.write("a gene_id=%s\n" % gff.gene_id)
        else:
            outfile.write("a\n")

        if is_positive:
            pos = region_start
        else:
            pos = lcontig - region_start

        # reference row
        outfile.write(maf_format % {
            "name": ".".join((options.reference, contig)),
            "pos": pos,
            "size": lseq,
            "strand": strand,
            "lcontig": lcontig,
            "seq": _add_gaps(reference_seq, colcounts),
        })

        # one row per allele and track
        for track in translated_tracks:
            for aid, allele in enumerate(alleles[track]):
                seq = _add_gaps(allele, colcounts)
                if not is_positive:
                    # NOTE(review): the return value is discarded, so the
                    # written sequence is not complemented. Kept as is, as
                    # the reference row is not complemented either -
                    # confirm the intended reverse strand output.
                    Genomics.complement(seq)
                outfile.write(maf_format % {
                    "name": ".".join((track + "-%i" % aid, contig)),
                    "pos": pos,
                    "size": len(seq) - seq.count("-"),
                    "strand": strand,
                    "lcontig": lcontig,
                    "seq": seq,
                })

        outfile.write("\n")

    dbhandle.close()

    if options.compress:
        # writes the gzip trailer; the wrapped ``options.stdout`` itself
        # is not closed by GzipFile.close().
        outfile.close()

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.stop()