def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Genes are read from stdin as GTF and processed one at a time. For each
    gene the variants overlapping the gene (extended by ``border`` bases on
    each side) are fetched from one of three sources (sqlite table, pileup
    file or VCF file), merged and filtered for conflicts, and applied to the
    exon and intron sequences. Alleles are then built per transcript and
    written to the requested output sections (cds, peptide, table, gtf, map).

    Raises
    ------
    ValueError
        If no genome file is given, if a tablename is given without a
        database, or if no source of variants is specified.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome [default=%default].")

    parser.add_option(
        "-t", "--tablename", dest="tablename", type="string",
        help="tablename to get variants from (in samtools pileup format) [default=%default].")

    parser.add_option(
        "-d", "--database", dest="database", type="string",
        help="sqlite3 database [default=%default].")

    parser.add_option(
        "-f", "--exons-file", dest="filename_exons", type="string",
        help="filename with transcript model information (gtf formatted file) [default=%default].")

    parser.add_option(
        "-r", "--filename-reference", dest="filename_reference", type="string",
        help="filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file) [default=%default].")

    parser.add_option(
        "--vcf-file", dest="filename_vcf", type="string",
        help="filename with variants in VCF format. Should be indexed by tabix [default=%default].")

    parser.add_option(
        "--pileup-file", dest="filename_pileup", type="string",
        help="filename with variants in samtools pileup format. "
        "Should be indexed by tabix [default=%default].")

    parser.add_option(
        "--vcf-sample", dest="vcf_sample", type="string",
        help="sample id for species of interest in vcf formatted file [default=%default].")

    parser.add_option(
        "-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
        help="filename of a list of transcript ids that are selenoproteins [default=%default].")

    parser.add_option(
        "-m", "--module", dest="modules", type="choice", action="append",
        choices=("gene-counts", "transcript-effects"),
        help="modules to apply [default=%default].")

    parser.add_option(
        "-o", "--output-section", dest="output", type="choice", action="append",
        choices=("all", "peptide", "cds", "table", "gtf", "map"),
        help="sections to output [default=%default].")

    parser.add_option(
        "-k", "--with-knockouts", dest="with_knockouts", action="store_true",
        help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        filename_pileup=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    # The genome is needed for every gene (contig length and sequences), so
    # fail with a clear message instead of an AttributeError on ``None``.
    if not options.genome_file:
        raise ValueError("please supply a genome file (--genome-file).")
    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    if options.filename_seleno:
        with open(options.filename_seleno, "r") as inf:
            seleno = set(IOTools.readList(inf))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLlite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    # no section selected means: output everything
    output_all = not options.output or "all" in options.output

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype",
             ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1

        # widen the query window by ``border`` bases, clipped to the contig
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    # only the first and last 30 bases of each intron are shown
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(transcript,
                                   variant_exons,
                                   variant_introns,
                                   all_exons,
                                   all_introns,
                                   offsets,
                                   is_seleno=transcript_id in seleno,
                                   reference_coordinates=False,
                                   )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"

                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        # Each iteration closes and writes the previous
                        # exon, then opens the next one.
                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            last_cds_start = cds_start

                        # the last exon extends to the end of the cds
                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and \
                        not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Each positional argument is a file holding one set of values. For every
    pair of files one tab-separated line is written to stdout with the set
    sizes and the sizes of union, intersection and both differences. With
    ``--add-percent`` six ratio columns are appended.

    Column labels for the sets are taken, in order of preference, from
    ``--header-names``, from the titles read from the input files, or from
    the filenames.

    Raises
    ------
    ValueError
        If fewer than two filenames are given, or if the number of supplied
        headers differs from the number of filenames.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: set_diff.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option(
        "-p", "--add-percent", dest="add_percent", action="store_true",
        help="add percentage information to each line.")

    parser.add_option(
        "-t", "--header-names", dest="headers", type="string",
        help="comma separated list of headers. If empty or set to '-', filenames are used.")

    parser.add_option(
        "--skip-header", dest="add_header", action="store_false",
        help="do not add header to flat format.")

    parser.add_option(
        "--output-with-header", dest="write_header", action="store_true",
        help="write header and exit.")

    parser.add_option(
        "--with-title", dest="with_title", action="store_true",
        help="use column titles in input data [%default].")

    parser.add_option(
        "--no-title", dest="with_title", action="store_false",
        help="there are no titles in input data [%default].")

    parser.set_defaults(
        add_percent=False,
        percent_format="%5.2f",
        headers=None,
        add_header=True,
        write_header=False,
        with_title=True,
    )

    # pass argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv)

    if options.add_header:
        options.stdout.write(
            "set1\tset2\tn1\tn2\tunion\tinter\tunique1\tunique2")
        if options.add_percent:
            options.stdout.write(
                "\tpinter\tpunique1\tpunique2\tpcov1\tpcov2\tpcovmax")
        options.stdout.write("\n")

    if options.write_header:
        sys.exit(0)

    if len(args) < 2:
        raise ValueError("please supply at least two filenames.")

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError(
                    "please supply the same number of headers as there are filenames.")

    for f in args:
        if options.with_title:
            with IOTools.open_file(f, "r") as infile:
                title, data = IOTools.readList(
                    infile, with_title=options.with_title)
            titles.append(title)
        else:
            with open(f, "r") as infile:
                data = IOTools.readList(infile)
        sets.append(set(data))

    # Only fall back to titles/filenames when no headers were supplied;
    # explicitly given headers must not be overwritten.
    if not headers:
        headers = titles if titles else args

    # one format per percentage column
    percent_row_format = "\t".join([options.percent_format] * 6)

    for x in range(len(sets) - 1):
        set1 = sets[x]
        for y in range(x + 1, len(sets)):
            set2 = sets[y]
            l1, l2 = len(set1), len(set2)

            nunion = len(set1.union(set2))
            ninter = len(set1.intersection(set2))
            nunique1 = len(set1.difference(set2))
            nunique2 = len(set2.difference(set1))

            options.stdout.write(
                "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i" %
                (headers[x], headers[y],
                 l1, l2,
                 nunion, ninter, nunique1, nunique2))

            if options.add_percent:
                # fixed values for empty sets avoid a division by zero
                if l1 == 0:
                    ri, r1, r2 = 0, 1, 0
                    c1, c2, cm = 1, 0, 0
                elif l2 == 0:
                    ri, r1, r2 = 0, 0, 1
                    c1, c2, cm = 0, 1, 0
                else:
                    ri = ninter / float(nunion)
                    r1 = nunique1 / float(l1)
                    r2 = nunique2 / float(l2)
                    c1, c2 = (ninter / float(l1), ninter / float(l2))
                    cm = max(c1, c2)

                options.stdout.write(
                    "\t" + percent_row_format % (ri, r1, r2, c1, c2, cm))

            options.stdout.write("\n")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fastq records from stdin and applies the operation selected with
    ``--method``, writing the result to stdout. For paired data
    (``--pair-fastq-file``) the methods ``sample`` and ``sort`` write the
    second pair to the file given by ``--output-filename-pattern``.

    Raises
    ------
    ValueError
        If no method is specified, if paired output is requested without an
        output filename pattern, or if method ``apply`` is used without a
        file of identifiers.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("apply",
                 "change-format",
                 "renumber-reads",
                 "sample",
                 "sort",
                 "trim3",
                 "trim5",
                 "unique",
                 "reverse-complement",
                 "grep"),
        help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        method=None,
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        apply=None,
        map_tsv_file=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method is None:
        raise ValueError("no method specified, please use --method")

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        # compile once; the pattern is anchored at the start of the sequence
        matcher = re.compile(options.grep_pattern)
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if matcher.match(record.seq):
                options.stdout.write("%s\n" % record)
                c.output += 1

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            with IOTools.open_file(options.output_filename_pattern,
                                   "w") as outfile2, \
                    IOTools.open_file(options.pair) as infile2:
                # both mates of a pair are kept or dropped together
                for record1, record2 in zip(Fastq.iterate(options.stdin),
                                            Fastq.iterate(infile2)):
                    c.input += 1
                    if random.random() <= sample_threshold:
                        c.output += 1
                        outfile1.write("%s\n" % record1)
                        outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        # --map-tsv-file stores into ``map_tsv_file``; ``apply`` is kept as
        # a fallback for callers that set it programmatically.
        filename_ids = options.map_tsv_file or options.apply
        if not filename_ids:
            raise ValueError("please supply a file with identifiers for "
                             "method apply (--map-tsv-file)")
        with IOTools.open_file(filename_ids) as infile_ids:
            ids = set(IOTools.readList(infile_ids))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            # compare only the part of the identifier before the first space
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            with IOTools.open_file(options.pair) as infile2:
                for record1, record2 in zip(Fastq.iterate(options.stdin),
                                            Fastq.iterate(infile2)):
                    # key on the identifier without its last two characters;
                    # the output below re-appends "/1" and "/2"
                    entries1[record1.identifier[:-2]] = (record1.seq,
                                                         record1.quals)
                    entries2[record2.identifier[:-2]] = (record2.seq,
                                                         record2.quals)

            assert len(set(entries1).intersection(entries2)) == \
                len(entries1), \
                "paired files do not contain the same reads "\
                "need to reconcile files"

            outfile1 = options.stdout
            with IOTools.open_file(options.output_filename_pattern,
                                   "w") as outfile2:
                for entry in sorted(entries1):
                    outfile1.write("@%s/1\n%s\n+\n%s\n" %
                                   (entry,
                                    entries1[entry][0],
                                    entries1[entry][1]))
                    outfile2.write("@%s/2\n%s\n+\n%s\n" %
                                   (entry,
                                    entries2[entry][0],
                                    entries2[entry][1]))

    elif options.method == "renumber-reads":
        for id_count, record in enumerate(Fastq.iterate(options.stdin),
                                          start=1):
            c.input += 1
            record.identifier = options.renumber_pattern % id_count
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier,
                                  record.seq,
                                  record.quals))
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()