def maskSequences(self, sequences):
    '''Mask a collection of sequences with an external command.

    The sequences are written as numbered fasta records to a temporary
    file. ``self.mCommand`` is interpolated with this function's local
    variables and run through the shell; its stdout is parsed as fasta.

    Arguments
    ---------
    sequences : iterable
        Sequences to mask.

    Returns
    -------
    list
        The ``sequence`` of every record in the command output, in
        output order.

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero return code.
    '''
    # NOTE: `outfile` and `infile` keep their names because the command
    # template is filled in from locals() (presumably via %(infile)s).
    outfile, infile = tempfile.mkstemp()
    try:
        try:
            for x, s in enumerate(sequences):
                os.write(outfile, ">%i\n%s\n" % (x, s))
        finally:
            # close the descriptor even if writing failed
            os.close(outfile)

        statement = self.mCommand % locals()
        E.debug("statement: %s" % statement)

        process = subprocess.Popen(statement,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   close_fds=True)
        (out, err) = process.communicate()

        if process.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        return [record.sequence
                for record in FastaIterator.iterate(StringIO.StringIO(out))]
    finally:
        # previously the temporary file was left behind whenever the
        # command failed or its output could not be parsed
        os.remove(infile)
def maskSequences(self, sequences):
    '''Mask a collection of sequences with an external command.

    The sequences are written as numbered fasta records to a named
    temporary file. ``self.mCommand`` is interpolated with this
    function's local variables and run through the shell; its decoded
    stdout is parsed as fasta.

    Arguments
    ---------
    sequences : iterable
        Sequences to mask.

    Returns
    -------
    list
        The ``sequence`` of every record in the command output, in
        output order.

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero return code.
    '''
    # delete=False: the file has to outlive the handle so the external
    # command can open it by name; it is removed explicitly below.
    outf = tempfile.NamedTemporaryFile(mode="w+t", delete=False)
    # NOTE: `infile` keeps its name because the command template is
    # filled in from locals().
    infile = outf.name
    try:
        with outf:
            for x, s in enumerate(sequences):
                outf.write(">%i\n%s\n" % (x, s))

        statement = self.mCommand % locals()
        E.debug("statement: %s" % statement)

        process = subprocess.Popen(statement,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   close_fds=True)
        (out, err) = process.communicate()

        if process.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        return [record.sequence
                for record in FastaIterator.iterate(StringIO(out.decode()))]
    finally:
        # previously the temporary file was leaked on every error path
        os.remove(infile)
def maskSequences(self, sequences):
    '''Mask a collection of sequences with an external command.

    Writes the sequences as numbered fasta records to a temporary file,
    runs ``self.mCommand`` (interpolated with this function's local
    variables) through the shell and parses the fasta records that the
    command prints to stdout.

    Arguments
    ---------
    sequences : iterable
        Sequences to mask.

    Returns
    -------
    list
        The ``sequence`` of every record in the command output, in
        output order.

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero return code.
    '''
    # NOTE: `outfile` and `infile` keep their names because the command
    # template is filled in from locals().
    outfile, infile = tempfile.mkstemp()
    try:
        try:
            for x, s in enumerate(sequences):
                os.write(outfile, ">%i\n%s\n" % (x, s))
        finally:
            # never leave the descriptor open, even when a write fails
            os.close(outfile)

        statement = self.mCommand % locals()
        E.debug("statement: %s" % statement)

        child = subprocess.Popen(statement,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 close_fds=True)
        (out, err) = child.communicate()

        if child.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        return [entry.sequence
                for entry in FastaIterator.iterate(StringIO(out))]
    finally:
        # clean up on success and on failure alike
        os.remove(infile)
def filterByCoverage(infiles, outfile):
    '''Write the contigs whose average coverage exceeds a threshold.

    ``infiles[0]`` is the contig fasta file. Every further entry is
    mapped to a coverage table in the results database; only the table
    whose name matches the contig file is queried. Contigs with an
    average coverage above ``PARAMS["coverage_filter"]`` are written to
    ``outfile`` in fasta format, titled with the first word of their
    original title.
    '''
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]

    contigs = set()
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    try:
        cc = dbh.cursor()
        for infile in infiles[1:]:
            dirsplit = infile.split("/")
            infile = os.path.join(
                PARAMS["results_resultsdir"],
                dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
            tablename = P.toTable(os.path.basename(infile))
            if P.snip(contig_file, ".fa") != P.snip(
                    os.path.basename(infile), ".coverage.load"):
                continue
            statement = """SELECT contig_id ave FROM (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id) WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    finally:
        # the connection used to be left open
        dbh.close()

    with open(outfile, "w") as outf:
        for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
            identifier = fasta.title.split(" ")[0]
            if identifier in contigs:
                outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''Compute CpG content and length for a named subset of sequences.

    Arguments
    ---------
    interval_names : string
        File with one sequence identifier per line.
    sequence_file : string
        Fasta file containing the sequences.
    outfile : string
        Output table; a log is written to ``outfile + ".log"``.
    header_line : bool
        If true, the first line of `interval_names` is skipped.

    Records whose first title word is listed in `interval_names` are
    copied to a temporary fasta file, which is then piped through
    ``fasta2table.py``.
    '''
    with open(interval_names) as interval_file:
        if header_line:
            interval_file.readline()
        # rstrip rather than line[:-1]: a last line without a trailing
        # newline must not lose its final character
        interval_set = set(line.rstrip("\n") for line in interval_file)

    temp = P.getTempFile("/ifs/scratch")
    with open(sequence_file) as sequences:
        for record in FastaIterator.iterate(sequences):
            seq_id = record.title.split(" ")[0]
            if seq_id in interval_set:
                temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    # P.run() is called without arguments, so it presumably reads
    # `statement`, `inf` and `outfile` from this frame - keep the names.
    inf = temp.name
    statement = '''cat %(inf)s | python %(scriptsdir)s/fasta2table.py -s cpg -s length --log=%(outfile)s.log > %(outfile)s'''
    P.run()
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta
    sequences that primers are to be designed for

    One ``input.dir/<title>.input`` file is written for every record of
    ``infile[0]`` whose title is listed in ``identifiers.tsv``. Each
    file holds the sequence id, every ``constraints`` entry of PARAMS
    and the sequence template. `outfiles` is not used here.
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob(
        "mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)

    # the constraint lines are identical for every sequence
    constraints = [
        "%s=%s\n" % (key.replace("constraints_", "").upper(), value)
        for key, value in PARAMS.items()
        if "constraints" in key
    ]

    E.info("collecting sequences")
    # the fasta file used to be opened a second time into an unused,
    # never closed handle; it is now opened only here
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title not in ids:
            continue
        filename = os.path.join(
            "input.dir",
            f.title.replace(" ", "_").replace("/", "_") +
            ".input").replace('"', '')
        outf = IOTools.openFile(filename, "w")
        try:
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for constraint in constraints:
                outf.write(constraint)
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % f.sequence)
        finally:
            outf.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Builds a map from the second ``|``-separated field of each fasta
    title in the genome directory to the full title, then rewrites the
    second column of the tab-separated stdin through that map.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            identifier = fasta.title.split("|")
            gi = identifier[1]
            contigs_map[gi] = fasta.title

    # stream stdin instead of loading it with readlines()
    for line in options.stdin:
        # rstrip rather than line[:-1]: the last line may lack a newline,
        # in which case slicing would cut the final character of the id
        data = line.rstrip("\n").split("\t")
        gi = data[1]
        assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi
        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads contigs in fasta format from stdin and, for every contig,
    counts the reads of the bam file aligned to it per species (second
    ``|``-separated field of the read name). The species with the most
    reads is written next to the contig name.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
                      help="supply bam file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contigs
    E.info("reading in contig file")
    contigs = {}
    for fasta in FastaIterator.iterate(options.stdin):
        contigs[fasta.title] = (1, len(fasta.sequence) - 1)
    E.info("read %i contigs" % len(contigs))

    # read in bamfile
    E.info("reading bam file")
    samfile = pysam.Samfile(options.bamfile)

    E.info("iterating over contigs")
    c = 0
    for contig, (start, end) in contigs.items():

        #################################
        # NB this is specific for my data!
        contig = contig.split(" ")[0]
        #################################

        species_counts = collections.defaultdict(int)
        for alignment in samfile.fetch(contig, start, end):
            species_id = alignment.qname.split("|")[1]
            species_counts[species_id] += 1

        # at the moment ignore if there are no counts
        if not species_counts:
            E.warn("no reads map to %s" % contig)
            continue

        # first species (in dictionary order) with the highest count;
        # the maximum used to be recomputed for every species
        top_dog = max(species_counts, key=species_counts.get)
        c += 1

        E.info("species %s assigned to contig number %i" % (top_dog, c))
        options.stdout.write("%s\t%s\n" % (contig, top_dog))

    # write footer and output benchmark information.
    E.Stop()
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''Compute nucleotide/CpG composition and length for named sequences.

    Arguments
    ---------
    interval_names : string
        File with one sequence identifier per line.
    sequence_file : string
        Fasta file containing the sequences.
    outfile : string
        Output table; a log is written to ``outfile + ".log"``.
    header_line : bool
        If true, the first line of `interval_names` is skipped.

    Records whose first title word is listed in `interval_names` are
    copied to a temporary fasta file, which is then piped through
    ``cgat fasta2table``.
    '''
    with open(interval_names) as interval_file:
        if header_line:
            interval_file.readline()
        # rstrip rather than line[:-1] so that a final line without a
        # newline keeps its last character
        interval_set = set(line.rstrip("\n") for line in interval_file)

    temp = P.getTempFile("/ifs/scratch")
    with open(sequence_file) as sequences:
        for record in FastaIterator.iterate(sequences):
            seq_id = record.title.split(" ")[0]
            if seq_id in interval_set:
                temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    # P.run() is called without arguments, so it presumably reads
    # `statement`, `inf` and `outfile` from this frame - keep the names.
    inf = temp.name
    statement = ''' cat %(inf)s | cgat fasta2table -s na -s cpg -s length --log=%(outfile)s.log > %(outfile)s'''
    P.run()
def countCompleteGenes(infile, outfile):
    '''
    tabulate how many genes look complete, i.e. begin with
    a start codon and end with a stop codon

    Writes the number of genes and the proportions with a start codon,
    with a stop codon and with both to `outfile`.
    '''
    start_codon = "ATG"
    stop_codons = ["TAG", "TAA", "TGA"]

    ntotal = nstart = nstop = nstart_nstop = 0
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        sequence = fasta.sequence
        has_start = sequence.startswith(start_codon)
        has_stop = sequence[-3:] in stop_codons

        ntotal += 1
        nstart += 1 if has_start else 0
        nstop += 1 if has_stop else 0
        nstart_nstop += 1 if (has_start and has_stop) else 0

    outf = open(outfile, "w")
    outf.write("total_genes\tpstart\tpstop\tpstart_stop\n")
    # proportions are computed after the header has been written
    columns = [ntotal]
    for count in (nstart, nstop, nstart_nstop):
        columns.append(float(count) / ntotal)
    outf.write("\t".join(str(column) for column in columns) + "\n")
    outf.close()
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta
    sequences that primers are to be designed for

    For every record of ``infile[0]`` whose title is listed in
    ``identifiers.tsv`` a file ``input.dir/<title>.input`` is written
    with the sequence id, all ``constraints`` entries of PARAMS and the
    sequence template. `outfiles` is not used here.
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob(
        "mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)

    E.info("collecting sequences")
    # the fasta file is opened once; an additional, unused handle used
    # to be opened above and was never closed
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title not in ids:
            continue

        safe_title = f.title.replace(" ", "_").replace("/", "_")
        path = os.path.join("input.dir", safe_title + ".input")
        outf = IOTools.openFile(path.replace('"', ''), "w")
        try:
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.items():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (
                        key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % f.sequence)
        finally:
            # close the handle even if a write fails
            outf.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads contigs/scaffolds in fasta format from stdin and writes one
    row with their number, total length, NX, median, mean and maximum
    length. Only contigs of at least ``--filter-length`` bases count.

    Raises ValueError if no contig passes the length filter.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n", dest="N", type="int",
                      help="e.g N50 - the length at which 50% of contigs are equal or above")
    parser.add_option("-f", "--filter-length", dest="filter_length", type="int",
                      help="calculate stats on contigs longer than -f")

    parser.set_defaults(N=50,
                        filter_length=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    f = options.filter_length
    N = options.N

    # collect the lengths of all contigs/scaffolds passing the filter
    contig_lengths = []
    for record in FastaIterator.iterate(options.stdin):
        contig_length = len(record.sequence)
        if contig_length >= f:
            contig_lengths.append(contig_length)

    if not contig_lengths:
        raise ValueError("no contigs of length >= %i in input" % f)

    number_of_contigs = len(contig_lengths)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(contig_lengths)
    median_length = np.median(contig_lengths)
    max_length = max(contig_lengths)

    # NX: walk the contigs from longest to shortest and report the
    # length at which the running total reaches N% of all bases.
    # The previous loop added the longest contig repeatedly and then
    # indexed the sorted list with the repeat count, which returned the
    # wrong contig and could raise IndexError.
    sorted_lengths = sorted(contig_lengths, reverse=True)
    total_length = sum(sorted_lengths)
    threshold = total_length * (float(N) / 100)
    cum_length = 0
    nx_length = sorted_lengths[-1]
    for length in sorted_lengths:
        cum_length += length
        if cum_length >= threshold:
            nx_length = length
            break

    # output the results
    options.stdout.write(
        "nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    options.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (number_of_contigs,
                                                       total_length,
                                                       nx_length,
                                                       str(median_length),
                                                       str(mean_length),
                                                       str(max_length)))

    # write footer and output benchmark information.
    E.Stop()
def filterContigs(infile, outfile, length):
    '''
    copy the contigs of at least `length` bases to `outfile`
    '''
    records = FastaIterator.iterate(IOTools.openFile(infile))

    outf = open(outfile, "w")
    # shorter contigs are dropped, everything else is written unchanged
    outf.writelines(
        ">%s\n%s\n" % (record.title, record.sequence)
        for record in records
        if not len(record.sequence) < length)
    outf.close()
def contig_to_stats(contigs_file, stats_file, params):
    """
    calculate descriptive stats for a set of
    contigs / scaffolds

    Arguments
    ---------
    contigs_file : string
        Fasta file with the contigs/scaffolds.
    stats_file : string
        Output file; receives a header and one row with the number of
        scaffolds, total length, NX, median, mean and maximum length.
    params : dict
        Must provide ``filter`` (minimum length, falsy for none) and
        ``scaffold_n`` (the X of NX, in percent).

    Raises
    ------
    ValueError
        If no scaffold passes the length filter.
    """
    PARAMS = params

    f = PARAMS["filter"] or 0
    N = PARAMS["scaffold_n"]

    # collect the lengths of all contigs/scaffolds passing the filter
    scaffold_lengths = []
    with open(contigs_file) as inf:
        for record in FastaIterator.iterate(inf):
            scaffold_length = len(record.sequence)
            if scaffold_length >= f:
                scaffold_lengths.append(scaffold_length)

    if not scaffold_lengths:
        raise ValueError("no scaffolds of length >= %s in %s" %
                         (f, contigs_file))

    number_of_scaffolds = len(scaffold_lengths)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(scaffold_lengths)
    median_length = np.median(scaffold_lengths)
    max_length = max(scaffold_lengths)

    # NX: walk the scaffolds from longest to shortest and report the
    # length at which the running total reaches N% of all bases.
    # The previous loop added the longest scaffold repeatedly and used
    # the repeat count as an index, which reported the wrong scaffold
    # and could raise IndexError.
    sorted_lengths = sorted(scaffold_lengths, reverse=True)
    total_length = sum(sorted_lengths)
    threshold = total_length * (float(N) / 100)
    cum_length = 0
    nx_length = sorted_lengths[-1]
    for length in sorted_lengths:
        cum_length += length
        if cum_length >= threshold:
            nx_length = length
            break

    # output the results; the file used to be left unclosed
    with open(stats_file, "w") as outf:
        outf.write(
            "nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
        outf.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
            number_of_scaffolds,
            total_length,
            nx_length,
            str(median_length),
            str(mean_length),
            str(max_length),
        ))
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    write a two column table with the name and the
    length of every scaffold in `contigs_file`

    `params` is accepted but not used.
    '''
    inf = open(contigs_file)
    outf = open(outfile, "w")

    rows = ["scaffold_name\tlength\n"]
    outf.write(rows[0])
    # one row per fasta record, in file order
    outf.writelines(
        "%s\t%i\n" % (record.title, len(list(record.sequence)))
        for record in FastaIterator.iterate(inf))
    outf.close()
def collectGenomeSizes(infile, outfile):
    '''
    write the name and sequence length of each genome

    The genome name is the basename of `infile` without the ``.fna``
    suffix; one row is written per fasta entry.
    '''
    to_cluster = True

    outf = open(outfile, "w")
    outf.write("genome\tlength\n")

    # assume single fasta entry
    entries = FastaIterator.iterate(IOTools.openFile(infile))
    outf.writelines(
        "%s\t%s\n" % (P.snip(os.path.basename(infile), ".fna"),
                      str(len(list(entry.sequence))))
        for entry in entries)
    outf.close()
def contig_to_stats(contigs_file, stats_file, params):
    '''
    calculate descriptive stats for a set of
    contigs / scaffolds

    Arguments
    ---------
    contigs_file : string
        Fasta file with the contigs/scaffolds.
    stats_file : string
        Output file; receives a header and one row with the number of
        scaffolds, total length, NX, median, mean and maximum length.
    params : dict
        Must provide ``filter`` (minimum length, falsy for none) and
        ``scaffold_n`` (the X of NX, in percent).

    Raises
    ------
    ValueError
        If no scaffold passes the length filter.
    '''
    PARAMS = params

    f = PARAMS["filter"] or 0
    N = PARAMS["scaffold_n"]

    # lengths of all contigs/scaffolds that pass the filter
    with open(contigs_file) as inf:
        all_lengths = [len(record.sequence)
                       for record in FastaIterator.iterate(inf)]
    scaffold_lengths = [length for length in all_lengths if length >= f]

    if not scaffold_lengths:
        raise ValueError("no scaffolds of length >= %s in %s" %
                         (f, contigs_file))

    number_of_scaffolds = len(scaffold_lengths)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(scaffold_lengths)
    median_length = np.median(scaffold_lengths)
    max_length = max(scaffold_lengths)

    # NX: length of the scaffold at which the running total, taken from
    # the longest scaffold downwards, reaches N% of all bases. The old
    # loop kept adding the longest scaffold and indexed the sorted list
    # with the number of additions - wrong result, possible IndexError.
    total_length = sum(scaffold_lengths)
    threshold = total_length * (float(N) / 100)
    cum_length = 0
    nx_length = min(scaffold_lengths)
    for length in sorted(scaffold_lengths, reverse=True):
        cum_length += length
        if cum_length >= threshold:
            nx_length = length
            break

    # output the results; the handle used to be left unclosed
    with open(stats_file, "w") as outf:
        outf.write(
            "nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n"
            % N)
        outf.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
                   (number_of_scaffolds, total_length, nx_length,
                    str(median_length), str(mean_length), str(max_length)))
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming

    `infiles` is a pair of (fasta file, identifier file). Every fasta
    record whose title is NOT among the identifiers is copied to
    `outfile`.
    '''
    fasta, identifiers = infiles

    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    E.info("collecting sequences")
    # the fasta file is opened once; an extra, unused handle used to be
    # opened and never closed
    outf = IOTools.openFile(outfile, "w")
    try:
        for f in FastaIterator.iterate(IOTools.openFile(fasta)):
            if f.title not in ids:
                outf.write(">%s\n%s\n" % (f.title, f.sequence))
    finally:
        # close the output even if reading or writing fails
        outf.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Two conversions are performed, each only if its input is supplied:

    * ``--profilematrix``: every row of the whitespace-separated matrix
      is normalised so that its numeric columns (all but the first)
      sum to one; rows summing to zero are skipped.
    * ``--fastafile``: every fasta record is written as a fastq record
      with a constant phred quality of 30.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # "-pm" is not a valid optparse option string (short options are a
    # single character), so only the long form is registered.
    parser.add_option("--profilematrix", dest="matrixfile", type="string",
                      help="name of profile file you want to convert")

    # the fasta loop below reads options.fastafile, which no option
    # defined before
    parser.add_option("--fastafile", dest="fastafile", type="string",
                      help="name of fasta file to convert to fastq")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.matrixfile:
        # read the lines of the file; the option value itself is only
        # the file name
        with IOTools.openFile(options.matrixfile) as inf:
            for line in inf:
                fields = line.strip().split()
                values = [float(col) for col in fields[1:]]
                total = sum(values)
                if total == 0:
                    continue
                normalised = [fields[0]] + [value / total for value in values]
                options.stdout.write("\t".join(map(str, normalised)) + "\n")

    if options.fastafile:
        for fasta_read in FastaIterator.iterate(
                IOTools.openFile(options.fastafile)):
            read_sequence = fasta_read.sequence
            read_name = fasta_read.title
            quals = '.' * len(read_sequence)
            new_fastq = Fastq.Record(identifier=read_name,
                                     seq=read_sequence,
                                     quals=quals)
            new_fastq.fromPhred([30] * len(read_sequence),
                                format='illumina-1.8')
            options.stdout.write(str(new_fastq) + "\n")

    # write footer and output benchmark information.
    E.Stop()
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    write a two column table with the name and the length of
    every scaffold that is longer than ``params["filter"]``
    '''
    PARAMS = params
    f = PARAMS["filter"] if PARAMS["filter"] else 0

    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")

    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        if not scaffold_length > f:
            continue
        # rename sequences if they have a space in them
        scaffold_name = record.title.replace(" ", "_")
        outf.write("%s\t%i\n" % (scaffold_name, scaffold_length))

    outf.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Maps the gi number in the second column of the tab-separated stdin
    to the full fasta title of the matching genome. The gi number of a
    genome is the second ``|``-separated field of its fasta title.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-dir",
                      dest="genome_dir",
                      type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            gi = fasta.title.split("|")[1]
            contigs_map[gi] = fasta.title

    # iterate stdin lazily rather than loading it with readlines()
    for line in options.stdin:
        # strip only the newline: line[:-1] removed the last character
        # of the gi number when the final line had no newline
        data = line.rstrip("\n").split("\t")
        gi = data[1]
        assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi
        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()
def countCompleteGenes(infile, outfile):
    '''
    report how many genes are complete, where complete means
    starting with a start codon and ending with a stop codon

    `outfile` receives the number of genes followed by the proportions
    with a start codon, with a stop codon and with both.
    '''
    start = "ATG"
    stop = ["TAG", "TAA", "TGA"]

    # tallies: [total, with start, with stop, with start and stop]
    tallies = [0, 0, 0, 0]
    for gene in FastaIterator.iterate(IOTools.openFile(infile)):
        begins = gene.sequence.startswith(start)
        ends = gene.sequence[-3:] in stop
        tallies[0] += 1
        if begins:
            tallies[1] += 1
        if ends:
            tallies[2] += 1
        if begins and ends:
            tallies[3] += 1

    ntotal = tallies[0]
    outf = open(outfile, "w")
    outf.write("total_genes\tpstart\tpstop\tpstart_stop\n")
    row = [str(ntotal)]
    row.extend(str(float(tally) / ntotal) for tally in tallies[1:])
    outf.write("\t".join(row) + "\n")
    outf.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Counts, for every fasta record on stdin, the occurrences of each
    k-mer over the alphabet A, C, T, G and writes a table with one row
    per k-mer and one column per record. With ``--output-proportion``
    counts are divided by the record's total k-mer count.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option(
        "-p", "--output-proportion", dest="proportion", action="store_true",
        help="output proportions - overides the default output")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do not allow greater than octonucleotide
    assert options.kmer <= 8, "cannot handle kmer of length %i" % options.kmer

    E.info("retrieving %imer sequences" % options.kmer)
    # All 4**k k-mers. These used to be collected from the permutations
    # of a 4*k letter list, which yields the same set but enumerates
    # (4k)!/(3k)! tuples to find it.
    kmers = ["".join(letters)
             for letters in itertools.product("ACTG", repeat=options.kmer)]

    E.info("matching %imers in file" % options.kmer)
    # count the number of kmers in each sequence
    result = {}

    # NB assume that non fasta files are caught by FastaIterator
    total_entries = 0
    for fasta in FastaIterator.iterate(options.stdin):
        total_entries += 1
        sequence = fasta.sequence
        # str.count, like the regular expression search used before,
        # counts non-overlapping occurrences
        result[fasta.title] = dict(
            (kmer, sequence.count(kmer)) for kmer in kmers)

    E.info("writing results")
    # sorted so that the table layout is reproducible between runs
    headers = sorted(result)
    # as before, no rows are written when there is no sequence
    rows = sorted(kmers) if result else []

    # write header row
    options.stdout.write("kmer\t" + "\t".join(headers) + "\n")

    # output proportions if required - normalises by
    # the total number of kmers counted in the sequence
    E.info("computing total counts")
    totals = dict((header, sum(result[header].values()))
                  for header in headers)

    for row in rows:
        if options.proportion:
            values = [str(float(result[header][row]) / totals[header])
                      for header in headers]
        else:
            values = [str(result[header][row]) for header in headers]
        options.stdout.write("\t".join([row] + values) + "\n")

    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.Stop()
def findTATABox(infiles, outfile):
    '''find TATA box in promotors.

    There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box

    `infiles` is a pair of (bed file, genome file). The following files
    are written:

    * ``outfile + ".fasta"``: sequences of the extended intervals
    * ``outfile + ".prf"``: the profile passed to the match tool
    * ``outfile + ".match"``: raw output of the match tool
    * ``outfile + ".table.gz"``: one row per forward-strand match
    * ``outfile``: the same matches as bed-like rows
    * ``outfile + ".summary"``: the counters collected while parsing
    '''

    # 1. create fasta file - look for TATA box
    #
    bedfile, genomefile = infiles

    # NOTE: P.run() is called without arguments; it presumably picks up
    # `statement` and the interpolated names from this frame, so local
    # names must not be changed - TODO confirm.
    statement = ''' slopBed -i %(bedfile)s -l %(tata_search_upstream)i -r %(tata_search_downstream)i -s -g %(genomefile)s | cgat bed2fasta --use-strand --genome=%(genome_dir)s/%(genome)s --log=%(outfile)s.log > %(outfile)s.fasta '''

    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    # this first value is overwritten by the next assignment
    match_profile = 'minFP_good.prf'
    match_profile = outfile + ".prf"

    # NOTE(review): the profile text is a single line here; the match
    # tool presumably expects one entry per line - confirm.
    prf = '''tata.prf prf to minimize sum of both errors - derived from minSUM.prf MIN_LENGTH 300 0.0 1.000 0.716 0.780 M00216 V$TATA_C 1.000 0.738 0.856 M00252 V$TATA_01 1.000 0.717 0.934 M00311 V$ATATA_B 1.000 0.711 0.784 M00320 V$MTATA_B // '''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = ''' %(match_executable)s %(match_matrix)s %(outfile)s.fasta %(outfile)s.match %(match_profile)s -u >> %(outfile)s.log '''
    P.run()

    # map each transcript to the genomic interval that was searched,
    # parsed from the fasta titles ("<id> <contig>:<start>..<end> (<strand>)")
    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            "(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity sequence"
    )

    def _grouper(infile):
        '''yield (sequence id, list of MATCH) for each sequence block.'''
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                # emit the matches collected for the previous sequence
                if r:
                    yield pid, r
                r = []
                pid = re.match("Inspecting sequence ID\s+(\S+)",
                               line).groups()[0]
                continue

            elif line.startswith(" Total"):
                break

            # skip everything before the first sequence block
            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")
            ]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(
                MATCH._make((pid, transfac_id, int(pos), strand,
                             float(core_similarity), float(matrix_similarity),
                             sequence)))

        # NOTE(review): `pid` is only assigned once an "Inspecting
        # sequence ID" line has been seen; a file without one would
        # fail here - confirm this cannot happen.
        yield pid, r

    # distance between the start of the searched interval and the TSS
    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand", "start", "end",
                          "relative_start", "relative_end", "transfac_id",
                          "core_similarity", "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(IOTools.openFile(outfile +
                                                            ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            # only matches on the forward strand of the sequence are kept
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(
                map(str, (transcript_id, strand, genome_start, genome_end,
                          relative_start, relative_end, match.transfac_id,
                          match.core_similarity, match.matrix_similarity,
                          match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(
                map(str, (contig, genome_start, genome_end, transcript_id,
                          strand, match.matrix_similarity))) + "\n")

        # a promotor is "filtered" if none of its matches was written
        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
def main(argv=None):
    """Compare the sequences of two fasta files.

    The two positional arguments are the files to compare. Sequences
    are paired by identifier (after ``MapIdentifiers`` has been applied
    with ``--pattern1``/``--pattern2``) and compared case-insensitively.
    Differing pairs are classified as ``first``, ``prefix``, ``last``,
    ``selenocysteine``, ``masked`` or ``other``; sequences present in
    only one file are counted as missed. What is written to stdout is
    selected with ``--output-section``; summary counts are logged.

    Raises ValueError if not exactly two files are given or if either
    file yields no sequences, and ImportError if
    ``--correct-gap-shift`` is set but ``alignlib_lite`` is missing.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--correct-gap-shift", dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1", "--pattern1", dest="pattern1", type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2", "--pattern2", dest="pattern2", type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1="(\S+)",
                        pattern2="(\S+)",
                        output=[])

    # NOTE(review): `argv` is not passed on to E.Start here - confirm
    # that this is intended.
    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    # title -> sequence for both files
    seqs1 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[0], "r"))
    ])
    seqs2 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[1], "r"))
    ])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of set 1 that were also found in set 2
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    # sequence differences imply the per-identifier diff line
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        # length of the shorter sequence
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    l = len(differences)
                    # check for Selenocysteins: every difference
                    # involves a "U"
                    if len(
                            filter(lambda x: x[0] == "U" or x[1] == "U",
                                   differences)) == l:
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues: every difference
                    # involves an "N" or "X"
                    elif len(
                            filter(lambda x: x[0] in "NX" or x[1] in "NX",
                                   differences)) == l:
                        ndiff_masked += 1
                        status = "masked"

                # correct for different gap lengths
                if options.correct_shift:

                    map_a2b = alignlib_lite.py_makeAlignmentVector()

                    # a and b are the current positions in s1 and s2
                    a, b = 0, 0
                    keep = False

                    # NOTE(review): x is never changed inside the loop,
                    # so `x < m` only matters when m is 0 - confirm.
                    x = 0
                    while x < m and not (a == len(s1) and b == len(s2)):
                        try:
                            if s1[a] != s2[b]:
                                # skip runs of "N" present in only one
                                # of the two sequences
                                while s1[a] == "N" and s2[b] != "N":
                                    a += 1
                                while s1[a] != "N" and s2[b] == "N":
                                    b += 1

                                if s1[a] != s2[b]:
                                    break
                        except IndexError:
                            print "# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (
                                k, x, a, b, len(s1), len(s2))
                            break

                        a += 1
                        b += 1
                        map_a2b.addPairExplicit(a, b, 0.0)
                    # the else branch runs only if the loop ended
                    # without a break, i.e. both ends were reached
                    else:
                        keep = True
                        nfixed += 1
                        f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                        print "fix\t%s\t%s" % (k, str(f))

                    if not keep:
                        print "# warning: not fixable: %s" % k

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in seqs2.keys():
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend: # seqs1: number of sequences in set 1 # seqs2: number of sequences in set 2 # same: number of identical sequences # diff: number of sequences with differences # nmissed1: sequences in set 1 that are not found in set 2 # nmissed2: sequences in set 2 that are not found in set 1 # Type of sequence differences # first: only the first residue is different # last: only the last residue is different # prefix: one sequence is prefix of the other # selenocysteine: difference due to selenocysteines # masked: difference due to masked residues # fixed: fixed 
differences # other: other differences """)

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    # "other" is whatever remains after the classified differences
    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i"
        % (ndiff, ndiff_first, ndiff_last, ndiff_prefix, ndiff_selenocysteine,
           ndiff_masked, nfixed, ndiff - ndiff_first - ndiff_last -
           ndiff_prefix - ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
parser.set_defaults( input_format="fasta", output_format="fasta", method = None, parameters = "", gop = -10.0, gep = -1.0, alignment_method = "sw", ) (options, args) = E.Start( parser ) options.parameters = options.parameters.split(",") iterator = FastaIterator.iterate( sys.stdin ) if options.method == "add": mali = Mali.Mali() mali.readFromFile( open(options.parameters[0], "r"), format = options.input_format ) del options.parameters[0] old_length = mali.getLength() new_mali = convertMali2Mali( mali ) if options.alignment_method == "sw": alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep ) else:
def main(argv=None):
    """Enumerate all single-residue variants of the input sequences.

    Reads fasta records from stdin and writes one row per position and
    alternative amino acid with a running id, the record title, the
    1-based position, the reference residue, the variant, its count and
    its weight (count divided by the sum of all counts at the position).

    For peptide input every residue has a count of one. With
    ``--is-cds`` the input is translated and the count of a variant is
    the number of single-nucleotide changes of the codon that translate
    to it. Positions whose residue is not one of the twenty amino acids
    are skipped.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: analyze_codonbias_shannon.py 2864 2010-03-03 10:18:16Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--is-cds", dest="is_cds", action="store_true",
                      help="input are cds (nucleotide) sequences [%default]")

    parser.set_defaults(is_cds=False)

    (options, args) = E.Start(parser, argv=argv)

    options.stdout.write(
        "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate(options.stdin):
        identifier = entry.title

        if not options.is_cds:
            sequence = entry.sequence.upper()
            # the same uniform table is shared by all positions
            uniform = dict.fromkeys(alphabet, 1)
            weights = [uniform] * len(sequence)
        else:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate(cds_sequence)
            weights = []
            for codon_start in range(0, len(cds_sequence), 3):
                codon = cds_sequence[codon_start:codon_start + 3]
                counts = collections.defaultdict(int)
                # translate every codon that differs in one nucleotide
                for x in range(3):
                    original_na = codon[x]
                    for na in "ACGT":
                        if na != original_na:
                            mutated = codon[:x] + na + codon[x + 1:]
                            counts[Genomics.translate(mutated)] += 1
                weights.append(counts)

        for pos, ref in enumerate(sequence):
            if ref not in alphabet:
                continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref:
                    continue
                snpid += 1
                fields = ("%010i" % snpid,
                          identifier,
                          str(pos + 1),
                          ref,
                          variant,
                          "%i" % w[variant],
                          "%6.4f" % (w[variant] / t))
                options.stdout.write("%s\n" % "\t".join(fields))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fasta entries from stdin and, for each entry, counts the
    non-overlapping occurrences of every possible kmer over the
    alphabet A, C, G, T.  A table with one row per kmer and one column
    per fasta title is written to stdout.  With ``--output-proportion``
    each count is divided by the total number of kmer hits in the
    respective sequence.

    Raises:
        ValueError: if no kmer length was supplied.
        AssertionError: if the kmer length exceeds 8.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option(
        "-p", "--output-proportion", dest="proportion", action="store_true",
        help="output proportions - overides the default output")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # -k has no default; fail with a clear message instead of a
    # TypeError further down
    if options.kmer is None:
        raise ValueError("no kmer length supplied (-k/--kmer-size)")

    # do not allow greater than octonucleotide
    assert options.kmer <= 8, "cannot handle kmer of length %i" % options.kmer

    E.info("retrieving %imer sequences" % options.kmer)

    # All kmers over the alphabet.  itertools.product enumerates the 4^k
    # kmers directly; permuting a list with k copies of each nucleotide
    # yields the same set but visits (4k)!/(3k)! tuples.
    kmers = ["".join(letters)
             for letters in itertools.product("ACTG", repeat=options.kmer)]

    E.info("matching %imers in file" % options.kmer)

    # count the number of kmers in each sequence
    # NB assume that non fasta files are caught by FastaIterator
    result = {}
    total_entries = 0
    for fasta in FastaIterator.iterate(options.stdin):
        total_entries += 1
        sequence = fasta.sequence
        # str.count counts non-overlapping occurrences, like a literal
        # regular expression search would
        result[fasta.title] = dict(
            (kmer, sequence.count(kmer)) for kmer in kmers)

    E.info("writing results")

    headers = sorted(result.keys())
    # no rows are written if there was no input
    rows = sorted(kmers) if result else []

    # write header row
    options.stdout.write("kmer\t" + "\t".join(headers) + "\n")

    # output proportions if required - normalises by the total
    # number of kmer hits in the sequence
    E.info("computing total counts")
    totals = dict(
        (header, sum(result[header].values())) for header in headers)

    for row in rows:
        if options.proportion:
            # a sequence without any kmer hit has no defined proportion
            values = [
                str(float(result[header][row]) / totals[header])
                if totals[header] else str(float("nan"))
                for header in headers]
        else:
            values = [str(result[header][row]) for header in headers]
        options.stdout.write("\t".join([row] + values) + "\n")

    E.info("written kmer counts for %i contigs" % total_entries)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fasta sequences from stdin, scans them with a Nubiscan matcher
    and writes one tab-separated line per reported match to stdout.
    How q-values are obtained depends on ``--fdr-control``:

    ``all``
        all sequences (optionally masked) are handed to a single
        ``MatcherRandomisationSequences`` run.
    ``xall``
        each sequence is scanned separately without q-value cutoff; the
        q-values are then recomputed from the pooled p-values with
        ``Stats.doFDR`` and filtered by ``--qvalue``.
    ``per-sequence``
        each sequence is scanned and filtered on its own.

    Raises:
        ValueError: if the motif name is not known.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--iterations", dest="iterations", type="int",
        help="number of iterations for sampling [default=%default].")

    parser.add_option(
        "-q", "--qvalue", dest="qvalue_threshold", type="float",
        help="qvalue threshold [default=%default].")

    parser.add_option(
        "--without-combine", dest="combine", action="store_false",
        help="do not combine overlapping motifs [default=%default].")

    parser.add_option(
        "-f", "--fdr-control", dest="fdr_control", type="choice",
        choices=("per-sequence", "all", "xall"),
        help="scope over which the FDR is controlled [default=%default].")

    parser.add_option(
        "-m", "--motif", dest="motif", type="choice",
        choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
        help="motif matrix to scan with [default=%default].")

    parser.add_option(
        "-a", "--arrangements", dest="arrangements", type="string",
        help="',' separated list of repeat arrangements [default=%default]")

    parser.add_option(
        "-x", "--mask", dest="mask", type="choice",
        choices=("dust", "repeatmasker"),
        help="mask sequences before scanning [default=%default]")

    parser.add_option(
        "--output-stats", dest="output_stats", action="store_true",
        help="output stats [default=%default].")

    parser.add_option(
        "--add-sequence", dest="add_sequence", action="store_true",
        help="add sequence information [default=%default].")

    parser.set_defaults(
        iterations=100,
        qvalue_threshold=0.05,
        motif="rxrvdr",
        fdr_control="all",
        combine=True,
        arrangements=None,
        mask=None,
        output_stats=False,
        add_sequence=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = (
            ["DR%s" % x for x in range(0, 15)] +
            ["ER%s" % x for x in range(0, 15)])
    else:
        options.arrangements = options.arrangements.split(",")

    # header line
    options.stdout.write("%s" % "\t".join(Nubiscan.NubiscanMatch._fields))
    if options.add_sequence:
        options.stdout.write("\tsequence")
    options.stdout.write("\n")

    if options.motif == 'nr':
        sense_matrix = NR
    elif options.motif == "rxrvdr":
        sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1":
        sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2":
        sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences(
                [x.sequence for x in seqs], options.mask)
        else:
            masked_seqs = [x.sequence for x in seqs]

        ninput = len(seqs)

        # matches refer to sequences by index; map the index back to the
        # first word of the fasta title
        map_id2title = dict(
            enumerate([re.sub(r"\s.*", "", x.title) for x in seqs]))

        matcher = Nubiscan.MatcherRandomisationSequences(
            sense_matrix, samples=options.iterations)

        results = matcher.run(masked_seqs,
                              options.arrangements,
                              qvalue_threshold=options.qvalue_threshold)

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        for r in results:

            if r.alternatives:
                alternatives = ",".join(
                    [x.arrangement for x in r.alternatives])
            else:
                alternatives = ""

            options.stdout.write("\t".join((
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives)))

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-":
                    s = Genomics.complement(s)
                # upper-case the first and last six bases, lower-case
                # whatever lies between them
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write("\t%s" % s)

            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile("fdr")
            try:
                outfile.write("bin\thist\tnobserved\n")
                for bin_edge, hist, nobs in zip(matcher.bin_edges,
                                                matcher.hist,
                                                matcher.nobservations):
                    outfile.write("%f\t%f\t%f\n" % (bin_edge, hist, nobs))
            finally:
                outfile.close()

    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run(seq.sequence,
                             options.arrangements,
                             qvalue_threshold=None)
            for m in mm:
                matches.append(m._replace(sequence=seq.title))

        # estimate qvalues for all matches across all sequences
        pvalues = [x.pvalue for x in matches]
        fdr = Stats.doFDR(pvalues)
        qvalues = fdr.mQValues

        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold:
                continue
            results.append(m._replace(qvalue=qvalue))

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        # output
        for r in results:
            options.stdout.write("\t".join((
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue)) + "\n")
            noutput += 1

    elif options.fdr_control == "per-sequence":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run(seq.sequence,
                                 options.arrangements,
                                 qvalue_threshold=options.qvalue_threshold)

            if options.combine:
                result = Nubiscan.combineMotifs(result)

            t = re.sub(r" .*", "", seq.title)
            for r in result:
                options.stdout.write("\t".join((
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue)) + "\n")
                noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fasta sequences from stdin, scans them with a Nubiscan matcher
    and writes one tab-separated line per reported match to stdout.
    The ``--fdr-control`` option selects how q-values are obtained:
    ``all`` runs one ``MatcherRandomisationSequences`` over all
    (optionally masked) sequences, ``xall`` scans each sequence without
    cutoff and recomputes q-values from the pooled p-values with
    ``Stats.doFDR``, and ``per-sequence`` scans and filters every
    sequence on its own.

    Raises:
        ValueError: if the motif name is not known.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--iterations", dest="iterations", type="int",
        help="number of iterations for sampling [default=%default].")

    parser.add_option(
        "-q", "--qvalue", dest="qvalue_threshold", type="float",
        help="qvalue threshold [default=%default].")

    parser.add_option(
        "--without-combine", dest="combine", action="store_false",
        help="do not combine overlapping motifs [default=%default].")

    parser.add_option(
        "-f", "--fdr-control", dest="fdr_control", type="choice",
        choices=("per-sequence", "all", "xall"),
        help="scope over which the FDR is controlled [default=%default].")

    parser.add_option(
        "-m", "--motif", dest="motif", type="choice",
        choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
        help="motif matrix to scan with [default=%default].")

    parser.add_option(
        "-a", "--arrangements", dest="arrangements", type="string",
        help="',' separated list of repeat arrangements [default=%default]")

    parser.add_option(
        "-x", "--mask", dest="mask", type="choice",
        choices=("dust", "repeatmasker"),
        help="mask sequences before scanning [default=%default]")

    parser.add_option(
        "--output-stats", dest="output_stats", action="store_true",
        help="output stats [default=%default].")

    parser.add_option(
        "--add-sequence", dest="add_sequence", action="store_true",
        help="add sequence information [default=%default].")

    parser.set_defaults(
        iterations=100,
        qvalue_threshold=0.05,
        motif="rxrvdr",
        fdr_control="all",
        combine=True,
        arrangements=None,
        mask=None,
        output_stats=False,
        add_sequence=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        direct_repeats = ["DR%s" % x for x in range(0, 15)]
        everted_repeats = ["ER%s" % x for x in range(0, 15)]
        options.arrangements = direct_repeats + everted_repeats
    else:
        options.arrangements = options.arrangements.split(",")

    # header line
    options.stdout.write("%s" % "\t".join(Nubiscan.NubiscanMatch._fields))
    if options.add_sequence:
        options.stdout.write("\tsequence")
    options.stdout.write("\n")

    if options.motif == 'nr':
        sense_matrix = NR
    elif options.motif == "rxrvdr":
        sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1":
        sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2":
        sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences(
                [x.sequence for x in seqs], options.mask)
        else:
            masked_seqs = [x.sequence for x in seqs]

        ninput = len(seqs)

        # matches refer to sequences by index; map the index back to the
        # first word of the fasta title
        map_id2title = dict(
            enumerate([re.sub(r"\s.*", "", x.title) for x in seqs]))

        matcher = Nubiscan.MatcherRandomisationSequences(
            sense_matrix, samples=options.iterations)

        results = matcher.run(masked_seqs,
                              options.arrangements,
                              qvalue_threshold=options.qvalue_threshold)

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        for r in results:

            if r.alternatives:
                alternatives = ",".join(
                    [x.arrangement for x in r.alternatives])
            else:
                alternatives = ""

            options.stdout.write("\t".join((
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives)))

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-":
                    s = Genomics.complement(s)
                # upper-case the first and last six bases, lower-case
                # whatever lies between them
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write("\t%s" % s)

            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile("fdr")
            try:
                outfile.write("bin\thist\tnobserved\n")
                for bin_edge, hist, nobs in zip(matcher.bin_edges,
                                                matcher.hist,
                                                matcher.nobservations):
                    outfile.write("%f\t%f\t%f\n" % (bin_edge, hist, nobs))
            finally:
                outfile.close()

    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run(seq.sequence,
                             options.arrangements,
                             qvalue_threshold=None)
            for m in mm:
                matches.append(m._replace(sequence=seq.title))

        # estimate qvalues for all matches across all sequences
        pvalues = [x.pvalue for x in matches]
        fdr = Stats.doFDR(pvalues)
        qvalues = fdr.mQValues

        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold:
                continue
            results.append(m._replace(qvalue=qvalue))

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        # output
        for r in results:
            options.stdout.write("\t".join((
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue)) + "\n")
            noutput += 1

    elif options.fdr_control == "per-sequence":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run(seq.sequence,
                                 options.arrangements,
                                 qvalue_threshold=options.qvalue_threshold)

            if options.combine:
                result = Nubiscan.combineMotifs(result)

            t = re.sub(r" .*", "", seq.title)
            for r in result:
                options.stdout.write("\t".join((
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue)) + "\n")
                noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def _iterate_genes(entries, t2g):
    """yield ``(gene_id, sequences)`` for each run of consecutive
    fasta entries that map to the same gene.

    *entries* are fasta records, *t2g* maps the first word of a record
    title to a gene identifier.  Sequences are upper-cased.  Every
    record, including the very first one, ends up in exactly one group.

    Raises:
        KeyError: if a transcript is missing from *t2g*.
        AssertionError: if a gene re-appears after another gene.
    """
    seen = set()
    current_gene = None
    sequences = []
    for entry in entries:
        gene_id = t2g[entry.title.split()[0]]
        if sequences and gene_id != current_gene:
            yield current_gene, sequences
            seen.add(current_gene)
            sequences = []
        if not sequences:
            # check this is the first time we've dealt with this gene
            assert gene_id not in seen, (
                "the fasta does not appear to be sorted in gene order, the"
                " same gene is observed in non-consecutive positions!")
            current_gene = gene_id
        sequences.append(entry.sequence.upper())

    # catch last gene
    if sequences:
        yield current_gene, sequences


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The fasta file is read twice.  In the first pass all sequences are
    shredded into kmers with a ``KmerCounter``; in the second pass the
    number of unique and non-unique kmers is counted per transcript or,
    with ``--method=gene``, per gene (all transcripts of a gene pooled)
    and written to stdout together with the unique fraction.

    With ``--subset`` only the first x transcripts (or genes) are
    analysed.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str",
                      help="name of fasta infile")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("transcript", "gene"),
                      help="count unique kmers per transcript or gene")

    parser.add_option("--genemap", dest="genemap", type="str",
                      help="file mapping transcripts to genes")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(fasta=None,
                        method="transcript",
                        genemap=None,
                        kmer=10,
                        subset=None)

    # honour an explicitly supplied argv
    (options, args) = E.Start(parser, argv=argv)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    # first pass: iterate fasta entries, shred and identify kmers
    with IOTools.openFile(options.fasta) as fasta_file:
        entries = FastaIterator.iterate(fasta_file)

        # total entries also acts as the index for the entry_id
        total_entries = 0

        options.stdout.write("%s\n" % "\t".join(
            ("id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

        if options.method == 'gene':
            E.info("shredding genes to identify unique kmers")
            assert options.genemap, (
                "to perform a gene-level unique kmer count, "
                "you must supply a transcript2gene map (--genemap)")

            t2g = {}
            with IOTools.openFile(options.genemap, "r") as inf:
                for line in inf:
                    transcript, gene = line.strip().split("\t")
                    t2g[transcript] = gene

            for gene_id, sequences in _iterate_genes(entries, t2g):
                if options.subset and total_entries >= options.subset:
                    break
                k.shred(sequences, options.kmer)
                if total_entries % 1000 == 0:
                    E.info("1st shred complete for %i genes" %
                           total_entries)
                total_entries += 1

            E.info("1st shred complete for %i genes" % total_entries)

        elif options.method == 'transcript':
            E.info("shredding transcripts to identify unique kmers")
            for entry in entries:
                if total_entries % 1000 == 0:
                    E.info("1st shred complete for %i transcripts" %
                           total_entries)
                if options.subset and total_entries >= options.subset:
                    break
                k.shred([entry.sequence.upper()], options.kmer)
                total_entries += 1

            E.info("1st shred complete for %i transcripts" % total_entries)

    # second pass: iterate fasta entries, shred and count unique kmers
    total_entries = 0
    with IOTools.openFile(options.fasta) as fasta_file:
        entries = FastaIterator.iterate(fasta_file)

        if options.method == 'gene':
            E.info("re-shredding fasta to count gene unique kmers")
            for gene_id, sequences in _iterate_genes(entries, t2g):
                if options.subset and total_entries >= options.subset:
                    break
                unique, non_unique = k.countUniqueKmers(
                    sequences, options.kmer)
                fraction = np.divide(float(unique), (unique + non_unique))
                options.stdout.write("%s\n" % "\t".join(
                    map(str, (gene_id, unique, non_unique, fraction))))
                if total_entries % 1000 == 0:
                    E.info("2nd shred complete for %i genes" %
                           total_entries)
                # one gene written, one increment
                total_entries += 1

        elif options.method == 'transcript':
            E.info("re-shredding fasta to count transcript unique kmers")
            for entry in entries:
                if total_entries % 1000 == 0:
                    E.info("2nd shred complete for %i transcripts" %
                           total_entries)
                if options.subset and total_entries >= options.subset:
                    break
                transcript_id = entry.title.split()[0]
                total_entries += 1
                unique, non_unique = k.countUniqueKmers(
                    [entry.sequence.upper()], options.kmer)
                fraction = np.divide(float(unique), (unique + non_unique))
                options.stdout.write("%s\n" % "\t".join(
                    map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)

    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The fasta file is read twice.  In the first pass every sequence is
    shredded into kmers with a ``KmerCounter``; in the second pass the
    number of unique and non-unique kmers of each entry is counted and
    written to stdout together with the unique fraction.

    With ``--subset`` only the first x entries are analysed.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str",
                      help="name of fasta infile")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(
        fasta=None,
        kmer=10,
        subset=None)

    # honour an explicitly supplied argv
    (options, args) = E.Start(parser, argv=argv)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    # first pass: iterate transcripts, shred and identify unique kmers
    with IOTools.openFile(options.fasta) as fasta_file:
        # total entries also acts as the index for the entry_id
        total_entries = 0

        options.stdout.write("%s\n" % "\t".join((
            "id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

        E.info("shredding fasta to identify unique kmers")
        for entry in FastaIterator.iterate(fasta_file):
            if total_entries % 1000 == 0:
                E.info("1st shred complete for %i entries" % total_entries)
            if options.subset and total_entries >= options.subset:
                break
            k.shred(entry.sequence.upper(), options.kmer)
            total_entries += 1

    # second pass: iterate transcripts, shred and count unique kmers
    total_entries = 0
    with IOTools.openFile(options.fasta) as fasta_file:
        E.info("re-shredding fasta to count unique kmers")
        for entry in FastaIterator.iterate(fasta_file):
            if total_entries % 1000 == 0:
                E.info("2nd shred complete for %i entries" % total_entries)

            transcript_id = entry.title.split()[0]

            if options.subset and total_entries >= options.subset:
                break

            total_entries += 1
            unique, non_unique = k.countUniqueKmers(
                entry.sequence.upper(), options.kmer)
            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write("%s\n" % "\t".join(
                map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)

    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Splits a fasta file (``--file`` or stdin) into several files.  An
    identifier is extracted from each description line with
    ``--input-pattern``, optionally translated through the map given by
    ``--map`` (entries without a mapping are dropped) and then handed,
    together with the sequence, to a ``Files``/``FilesChunks`` writer.
    Finally, output below ``--min-size`` is deleted.

    Entries whose description line does not match the input pattern are
    reported and skipped.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $")

    parser.add_option(
        "-f", "--file", dest="input_filename", type="string",
        help="input filename. If not given, stdin is used.",
        metavar="FILE")

    parser.add_option(
        "-i", "--input-pattern", dest="input_pattern", type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o", "--output-pattern", dest="output_pattern", type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n", "--num-sequences", dest="num_sequences", type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option(
        "-m", "--map", dest="map_filename", type="string",
        help="map filename. Map identifiers to filenames",
        metavar="FILE")

    parser.add_option(
        "-s", "--skip-identifiers", dest="skip_identifiers",
        action="store_true",
        help="do not write identifiers.",
        metavar="FILE")

    parser.add_option(
        "--min-size", dest="min_size", type="int",
        help="minimum cluster size.")

    parser.set_defaults(
        input_filename=None,
        map_filename=None,
        skip_identifiers=False,
        input_pattern=r"^(\S+)",
        min_size=0,
        num_sequences=None,
        output_pattern="%s")

    # honour an explicitly supplied argv
    (options, args) = E.Start(parser, argv=argv)

    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        with open(options.map_filename, "r") as map_file:
            map_id2filename = IOTools.ReadMap(map_file)
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)
    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0

    try:
        for seq in FastaIterator.iterate(infile):
            ninput += 1

            if rx:
                match = rx.search(seq.title)
                if match is None:
                    # without an identifier of its own the entry would
                    # end up in the file of the previous entry - skip it
                    print("# parsing error in description line %s" %
                          (seq.title))
                    continue
                identifier = match.groups()[0]
            else:
                identifier = seq.title

            if map_id2filename:
                if identifier in map_id2filename:
                    identifier = map_id2filename[identifier]
                else:
                    continue

            files.Write(identifier, seq)
            noutput += 1
    finally:
        # never close stdin
        if options.input_filename:
            infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # clusters sizes are only available once both the fasta
    # file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads contigs (fasta) from stdin and, for each contig that is
    listed in the species map, writes a randomly positioned stretch of
    the same length taken from the mapped genome to stdout.

    The species map is a tab-separated file with the contig in the
    first and the genome (the fasta title of a sequence found in
    ``--genome-dir``) in the second column.  Contigs that are missing
    from the map or that are longer than their genome are reported and
    skipped.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: contigs2random_sample.py 2871 2010-03-03 10:20:44Z nicki $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--species-map", dest="species_map", type="string",
        help="text file specifying the mapping between contig and genome")

    parser.add_option(
        "-g", "--genome-dir", dest="genome_dir", type="string",
        help="specify directory where genome / genomes are stored")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contig lengths into dictionary
    E.info("reading contigs file")
    c_contigs = 0
    contigs_lengths = {}
    for fasta in FastaIterator.iterate(options.stdin):
        c_contigs += 1
        # titles of fasta records must be single strings with no special
        # characters
        contigs_lengths[fasta.title.split(" ")[0]] = len(fasta.sequence)

    E.info("read %i contigs" % c_contigs)

    # read in mapping between species and contigs
    species_map = {}
    with open(options.species_map) as map_file:
        for line in map_file:
            # only strip the line break so that a last line without one
            # does not lose a character
            data = line.rstrip("\n").split("\t")
            contig, species = data[0], data[1]
            species_map[contig] = species

    # read genomes into memory
    # NB this may need optimising if using large
    # genomes or many genomes
    E.info("reading genomes from %s" % options.genome_dir)

    # The directory must ONLY contain genome files!!
    genomes_sequences = {}
    c_genomes = 0
    for genome_file in glob.glob(os.path.join(options.genome_dir, "*")):
        c_genomes += 1
        for fasta in FastaIterator.iterate(IOTools.openFile(genome_file)):
            genomes_sequences[fasta.title] = fasta.sequence
    E.info("read %i genomes from %s" % (c_genomes, options.genome_dir))

    # iterate over the contigs and sample from the respective genome
    E.info("iterating over contigs")
    c_contigs_output = 0
    for contig, length in contigs_lengths.items():
        if contig not in species_map:
            E.warn("contig %s not in species map file" % contig)
            continue

        genome = species_map[contig]
        genome_sequence = genomes_sequences[genome]
        genome_length = len(genome_sequence)

        if length > genome_length:
            E.warn("contig %s is longer than genome %s - skipped" %
                   (contig, genome))
            continue

        # 0-based start chosen such that the sample has exactly the
        # length of the contig and lies completely within the genome
        start = random.randint(0, genome_length - length)
        sampled_seq = genome_sequence[start:start + length]

        c_contigs_output += 1
        options.stdout.write(
            ">%s_random\n%s\n" % (contig + "_%s" % species_map[contig],
                                  sampled_seq))

    E.info("written %i contigs" % c_contigs_output)

    # write footer and output benchmark information.
    E.Stop()
def runMEMEOnSequences(infile, outfile, background=None, psp=None):
    '''run MEME on fasta sequences to find motifs

    By default MEME calculates a zero-th order background
    model from the nucleotide frequencies in the input set.

    To use a different background set, a background file
    created by fasta-get-markov must be supplied.

    To perform discriminative analysis a position specific prior
    (psp) file must be provided. This can be generated using
    generatePSP.

    If *infile* contains fewer than two sequences, MEME is not run and
    an empty *outfile* is created instead.
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs < 2:
        E.warn("%s: less than 2 sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # Get the total length of the sequences to decide the memory
    with IOTools.open_file(infile, "r") as fasta_reader:
        total_seqs_length = sum(
            len(fasta_seq.sequence)
            for fasta_seq in FastaIterator.iterate(fasta_reader))

    # NOTE(review): job_memory, job_options, job_threads and
    # cluster_parallel_environment are not referenced again in this
    # function; presumably P.run() picks them up from the caller's
    # local variables, just like the %(...)s placeholders of the
    # statement - confirm before renaming or removing any local.

    # If the length of all sequences is higher than 160,000bp
    # Up the memory
    job_memory = "2G"
    if total_seqs_length > 160000:
        job_memory = "4G"

    if PARAMS.get("meme_revcomp", True):
        revcomp = "-revcomp"
    else:
        revcomp = ""

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    if background:
        background_model = "-bfile %s" % background
    else:
        background_model = ""

    if psp:
        E.info("Running MEME in descriminative mode")
        psp_file = "-psp %s" % psp
    else:
        psp_file = ""

    statement = (
        " meme %(infile)s -dna %(revcomp)s -p %(meme_threads)s"
        " -mod %(meme_model)s -nmotifs %(meme_nmotifs)s"
        " -oc %(tmpdir)s -maxsize %(meme_max_size)s"
        " %(background_model)s %(psp_file)s %(meme_options)s"
        " 2> %(outfile)s.log "
    )

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in cluster_options
    # through job_options
    if int(PARAMS["meme_threads"]) != 1:
        job_options = str(PARAMS["meme_job_options"])
        job_threads = int(PARAMS["meme_threads"])
        cluster_parallel_environment = str(
            PARAMS["meme_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads contigs (fasta) from stdin and, for each contig that is
    listed in the species map, writes a randomly positioned stretch of
    the same length taken from the mapped genome to stdout.

    The species map is a tab-separated file with the contig in the
    first and the genome (the fasta title of a sequence found in
    ``--genome-dir``) in the second column.  Contigs that are missing
    from the map or that are longer than their genome are reported and
    skipped.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: contigs2random_sample.py 2871 2010-03-03 10:20:44Z nicki $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--species-map", dest="species_map", type="string",
        help="text file specifying the mapping between contig and genome")

    parser.add_option(
        "-g", "--genome-dir", dest="genome_dir", type="string",
        help="specify directory where genome / genomes are stored")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contig lengths into dictionary
    E.info("reading contigs file")
    contigs_lengths = {}
    c_contigs = 0
    for fasta in FastaIterator.iterate(options.stdin):
        c_contigs += 1
        # titles of fasta records must be single strings with no special
        # characters
        name = fasta.title.split(" ")[0]
        contigs_lengths[name] = len(fasta.sequence)

    E.info("read %i contigs" % c_contigs)

    # read in mapping between species and contigs
    species_map = {}
    with open(options.species_map) as map_file:
        for line in map_file:
            # only strip the line break so that a last line without one
            # does not lose a character
            data = line.rstrip("\n").split("\t")
            contig, species = data[0], data[1]
            species_map[contig] = species

    # read genomes into memory
    # NB this may need optimising if using large
    # genomes or many genomes
    E.info("reading genomes from %s" % options.genome_dir)

    # The directory must ONLY contain genome files!!
    genomes_sequences = {}
    c_genomes = 0
    for genome_file in glob.glob(os.path.join(options.genome_dir, "*")):
        c_genomes += 1
        for fasta in FastaIterator.iterate(IOTools.openFile(genome_file)):
            genomes_sequences[fasta.title] = fasta.sequence
    E.info("read %i genomes from %s" % (c_genomes, options.genome_dir))

    # iterate over the contigs and sample from the respective genome
    E.info("iterating over contigs")
    c_contigs_output = 0
    for contig, length in contigs_lengths.items():
        if contig not in species_map:
            E.warn("contig %s not in species map file" % contig)
            continue

        genome = species_map[contig]
        genome_sequence = genomes_sequences[genome]
        genome_length = len(genome_sequence)

        if length > genome_length:
            E.warn("contig %s is longer than genome %s - skipped" %
                   (contig, genome))
            continue

        # 0-based start chosen such that the sample has exactly the
        # length of the contig and lies completely within the genome
        start = random.randint(0, genome_length - length)
        sampled_seq = genome_sequence[start:start + length]

        c_contigs_output += 1
        options.stdout.write(
            ">%s_random\n%s\n" % (contig + "_%s" % species_map[contig],
                                  sampled_seq))

    E.info("written %i contigs" % c_contigs_output)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads contigs (fasta) from stdin and assigns to each contig the
    species that contributes most of the reads aligned to it in the
    bam file given by ``--bam-file``.  The species is taken from the
    second ``|``-separated field of the read name.  One line
    ``contig<TAB>species`` is written per contig with at least one
    read; contigs without reads are reported and skipped.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--bam-file", dest="bamfile", type="string",
                      help="supply bam file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contigs
    E.info("reading in contig file")
    contigs = {}
    for fasta in FastaIterator.iterate(options.stdin):
        # NOTE(review): the fetched window omits the last base of the
        # contig and - if fetch() is 0-based - the first one as well;
        # confirm this is intended
        contigs[fasta.title] = (1, len(fasta.sequence) - 1)
    E.info("read %i contigs" % len(contigs))

    # read in bamfile
    E.info("reading bam file")
    samfile = pysam.Samfile(options.bamfile)

    E.info("iterating over contigs")
    c = 0
    try:
        for contig, coords in contigs.items():

            #################################
            # NB this is specific for my data!
            contig = contig.split(" ")[0]
            #################################

            species_counts = collections.defaultdict(int)
            for alignment in samfile.fetch(contig, coords[0], coords[1]):
                species_id = alignment.qname.split("|")[1]
                species_counts[species_id] += 1

            # at the moment ignore if there are no counts
            if not species_counts:
                E.warn("no reads map to %s" % contig)
                continue

            # species with the highest read count; on ties the first
            # one in iteration order wins
            top_dog = max(species_counts, key=species_counts.get)
            c += 1

            E.info("species %s assigned to contig number %i" % (top_dog, c))
            options.stdout.write("%s\t%s\n" % (contig, top_dog))
    finally:
        samfile.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Compare two FASTA files sequence by sequence and summarise differences.

    The two positional arguments are the files to compare.  Identifiers are
    first rewritten by ``MapIdentifiers`` using ``--pattern1``/``--pattern2``.
    Sequences are compared case-insensitively and every differing pair is
    classified as ``first``, ``prefix``, ``last``, ``selenocysteine``,
    ``masked`` or ``other``.  Depending on ``--output-section`` the
    per-sequence status, the sequences themselves and the identifiers
    missing from either file are written to ``options.stdout``; summary
    counts are logged via ``E.info``.

    :param argv: command line arguments; ``sys.argv`` is used when ``None``.
    :raises ValueError: if not exactly two files are given or a file
        contains no sequences.
    :raises ImportError: if ``--correct-gap-shift`` is requested but
        ``alignlib_lite`` cannot be imported.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--correct-gap-shift", dest="correct_shift",
        action="store_true",
        help="correct gap length shifts in alignments. "
        "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1", "--pattern1", dest="pattern1", type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2", "--pattern2", dest="pattern2", type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option(
        "-o", "--output-section", dest="output", type="choice",
        action="append",
        choices=("diff", "missed", "seqdiff"),
        help="what to output [%default]")

    # raw strings: "\S" in a normal literal is an invalid escape sequence
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    # hand the normalised argv on so that an explicit *argv* is honoured
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    def _read_sequences(filename):
        # load title -> sequence and close the file when done
        with IOTools.openFile(filename, "r") as inf:
            return dict((x.title, x.sequence)
                        for x in FastaIterator.iterate(inf))

    seqs1 = _read_sequences(args[0])
    seqs2 = _read_sequences(args[1])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = set()

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2.add(k)

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
            continue

        status = "other"
        ndiff += 1

        if s1[1:] == s2[1:]:
            ndiff_first += 1
            status = "first"
        elif s1[:m] == s2[:m]:
            ndiff_prefix += 1
            status = "prefix"
        elif s1[:-1] == s2[:-1]:
            ndiff_last += 1
            status = "last"
        elif len(s1) == len(s2):
            # get all differences: the first and last residues
            # can be different for peptide sequences when
            # comparing my translations with ensembl peptides.
            differences = [(c1, c2)
                           for c1, c2 in zip(s1[1:-1], s2[1:-1])
                           if c1 != c2]

            # NOTE(review): when only the first and last residue differ,
            # `differences` is empty and the pair is counted as
            # selenocysteine - kept as in the original.
            # check for Selenocysteins
            if all(c1 == "U" or c2 == "U" for c1, c2 in differences):
                ndiff_selenocysteine += 1
                status = "selenocysteine"
            # check for masked residues
            elif all(c1 in "NX" or c2 in "NX" for c1, c2 in differences):
                ndiff_masked += 1
                status = "masked"

        # correct for different gap lengths
        # NOTE(review): the nesting level of this section was
        # reconstructed from a whitespace-collapsed source - confirm.
        if options.correct_shift:
            map_a2b = alignlib_lite.py_makeAlignmentVector()

            a, b = 0, 0
            keep = False

            # NOTE(review): x is never advanced, so the loop is bounded
            # only by a/b reaching the sequence ends, a mismatch or an
            # IndexError - kept as in the original.
            x = 0
            while x < m and not (a == len(s1) and b == len(s2)):
                try:
                    if s1[a] != s2[b]:
                        # skip runs of N present in only one sequence
                        while s1[a] == "N" and s2[b] != "N":
                            a += 1
                        while s1[a] != "N" and s2[b] == "N":
                            b += 1

                        if s1[a] != s2[b]:
                            break
                except IndexError:
                    print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" %
                          (k, x, a, b, len(s1), len(s2)))
                    break

                a += 1
                b += 1
                map_a2b.addPairExplicit(a, b, 0.0)
            else:
                # loop finished without break: both ends were reached
                keep = True
                nfixed += 1
                f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                print("fix\t%s\t%s" % (k, str(f)))

            if not keep:
                print("# warning: not fixable: %s" % k)

        if write_diff:
            options.stdout.write("---- %s ---- %s\n" % (k, status))

        if write_seqdiff:
            options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(seqs2):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend: """)

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
else: files = Files( output_pattern = options.output_pattern, skip_identifiers = options.skip_identifiers ) if options.input_pattern: rx = re.compile( options.input_pattern ) else: rx = None ninput = 0 noutput = 0 identifier = None chunk = 0 for seq in FastaIterator.iterate( infile ): ninput += 1 if rx: try: identifier = rx.search(seq.title).groups()[0] except AttributeError: print "# parsing error in description line %s" % (seq.title) else: identifier = seq.title if map_id2filename: if identifier in map_id2filename: identifier = map_id2filename[identifier] else:
def main(argv=None):
    """Add sequences read from stdin to an existing multiple alignment.

    With ``--method=add`` the first entry of ``--parameters`` names the
    alignment file.  Each FASTA record from ``sys.stdin`` is aligned
    against a profile built from the growing alignment and appended to
    it; the resulting alignment is written to ``sys.stdout``.

    :param argv: command line arguments; ``sys.argv`` is used when ``None``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequences2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("add", ),
                      help="""method to use to build multiple alignment.""")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-a", "--alignment-method", dest="alignment_method",
                      type="choice",
                      choices=("sw", "nw"),
                      help="alignment_method [%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method=None,
        parameters="",
        gop=-10.0,
        gep=-1.0,
        alignment_method="sw",
    )

    # hand the normalised argv on so that an explicit *argv* is honoured
    (options, args) = E.Start(parser, argv=argv)

    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.iterate(sys.stdin)

    if options.method == "add":

        mali = Mali.Mali()

        # first parameter is the alignment to extend; close it once read
        with open(options.parameters[0], "r") as infile:
            mali.readFromFile(infile, format=options.input_format)
        del options.parameters[0]

        old_length = mali.getLength()

        new_mali = convertMali2Mali(mali)

        if options.alignment_method == "sw":
            alignator = alignlib_lite.py_makeAlignatorFullDP(
                options.gop, options.gep)
        else:
            alignator = alignlib_lite.py_makeAlignatorFullDPGlobal(
                options.gop, options.gep)

        # A for-loop ends cleanly on exhaustion; the former
        # ``while 1: iterator.next()`` only works on Python 2 and lets
        # StopIteration escape when the iterator is a generator.
        for cur_record in iterator:
            # keep honouring a None sentinel from older iterators
            if cur_record is None:
                break

            map_mali2seq = alignlib_lite.py_makeAlignataVector()

            sequence = alignlib_lite.py_makeSequence(cur_record.sequence)
            profile = alignlib_lite.py_makeProfileFromMali(new_mali)

            if options.loglevel >= 4:
                options.stdlog.write(profile.Write())

            alignator.Align(profile, sequence, map_mali2seq)

            if options.loglevel >= 3:
                options.stdlog.write(map_mali2seq.Write())

            # add sequence to mali
            a = alignlib_lite.py_makeAlignatumFromString(cur_record.sequence)
            a.thisown = 0

            new_mali.addAlignatum(a, map_mali2seq, 1, 1, 1, 1, 1)

            identifier = cur_record.title
            mali.mIdentifiers.append(identifier)
            mali.mMali[identifier] = Mali.AlignedString(
                identifier, 0, len(cur_record.sequence),
                new_mali.getRow(new_mali.getWidth() - 1).getString())

        # substitute: refresh the rows that were present before adding
        # NOTE(review): placement after the loop was reconstructed from a
        # whitespace-collapsed source; the final row strings are the same
        # either way.
        for x in range(old_length):
            mali.mMali[mali.mIdentifiers[x]].mString = new_mali.getRow(
                x).getString()

        mali.writeToFile(sys.stdout, format=options.output_format)

    E.Stop()
def findTATABox(infiles, outfile):
    '''find TATA box in promotors.

    There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box

    Steps:

    1. extend the intervals in the bed file and extract their sequence
       into ``<outfile>.fasta``,
    2. write a match profile to ``<outfile>.prf`` and run the TRANSFAC
       ``match`` executable, producing ``<outfile>.match``,
    3. parse the match output and write forward-strand matches to
       ``<outfile>.table.gz`` (one row per match) and ``<outfile>`` (bed),
       plus counters to ``<outfile>.summary``.

    :param infiles: tuple ``(bedfile, genomefile)``.
    :param outfile: name of the bed output file; also the prefix of all
        auxiliary files.
    '''

    # NOTE: the locals below that look unused (bedfile, genomefile,
    # match_executable, match_matrix, match_profile) appear in the
    # %(...)s placeholders of ``statement`` - presumably P.run() picks
    # them up from this frame, so do not rename or remove them.
    bedfile, genomefile = infiles

    # 1. create fasta file - look for TATA box
    statement = (
        " slopBed -i %(bedfile)s"
        " -l %(tata_search_upstream)i"
        " -r %(tata_search_downstream)i"
        " -s"
        " -g %(genomefile)s"
        " | cgat bed2fasta"
        " --use-strand"
        " --genome=%(genome_dir)s/%(genome)s"
        " --log=%(outfile)s.log"
        " > %(outfile)s.fasta ")

    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    match_profile = outfile + ".prf"

    # NOTE(review): this literal is reproduced exactly as it appears in
    # the (whitespace-collapsed) source; the profile presumably needs
    # one entry per line - confirm the layout against the upstream file.
    prf = (
        "tata.prf"
        " prf to minimize sum of both errors - derived from minSUM.prf"
        " MIN_LENGTH 300"
        " 0.0"
        " 1.000 0.716 0.780 M00216 V$TATA_C"
        " 1.000 0.738 0.856 M00252 V$TATA_01"
        " 1.000 0.717 0.934 M00311 V$ATATA_B"
        " 1.000 0.711 0.784 M00320 V$MTATA_B"
        " // ")

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = (
        " %(match_executable)s"
        " %(match_matrix)s"
        " %(outfile)s.fasta"
        " %(outfile)s.match"
        " %(match_profile)s"
        " -u"
        " >> %(outfile)s.log ")
    P.run()

    # map transcript id -> genomic location of the searched region,
    # parsed from the fasta titles
    transcript2pos = {}
    with IOTools.openFile(outfile + ".fasta") as inf:
        for entry in FastaIterator.iterate(inf):
            transcript_id, contig, start, end, strand = re.match(
                r"(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)",
                entry.title).groups()
            transcript2pos[transcript_id] = (
                contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity sequence")

    def _grouper(infile):
        '''yield (sequence id, [MATCH, ...]) per sequence in match output.'''
        r = []
        # id of the sequence currently being read; None until the first
        # "Inspecting sequence ID" header has been seen
        pid = None
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                if r:
                    yield pid, r
                r = []
                pid = re.match(
                    r"Inspecting sequence ID\s+(\S+)", line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            # ignore everything before the first header and blank lines
            if pid is None:
                continue
            if line[:-1].strip() == "":
                continue

            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")]
            pos, strand = re.match(r"(\d+) \((\S)\)", v).groups()
            r.append(MATCH._make((pid, transfac_id, int(pos), strand,
                                  float(core_similarity),
                                  float(matrix_similarity), sequence)))

        # flush the last sequence; without a header there is nothing to
        # report (previously this raised on the unbound pid)
        if pid is not None:
            yield pid, r

    offset = PARAMS["tata_search_upstream"]

    c = E.Counter()
    found = set()

    with IOTools.openFile(outfile + ".table.gz", "w") as outf:
        with IOTools.openFile(outfile, "w") as bedf:
            with IOTools.openFile(outfile + ".match") as matchf:

                outf.write("\t".join(("transcript_id", "strand",
                                      "start", "end",
                                      "relative_start", "relative_end",
                                      "transfac_id",
                                      "core_similarity",
                                      "matrix_similarity",
                                      "sequence")) + "\n")

                for transcript_id, matches in _grouper(matchf):
                    contig, seq_start, seq_end, strand = \
                        transcript2pos[transcript_id]
                    c.promotor_with_matches += 1
                    nmatches = 0
                    found.add(transcript_id)

                    for match in matches:
                        c.matches_total += 1
                        lmatch = len(match.sequence)

                        # only matches on the "+" strand of the
                        # extracted sequence are reported
                        if match.strand == "-":
                            c.matches_wrong_strand += 1
                            continue

                        # get genomic location of match
                        if strand == "+":
                            genome_start = seq_start + match.pos
                        else:
                            genome_start = seq_end - match.pos - lmatch

                        genome_end = genome_start + lmatch

                        # get relative location of match
                        if strand == "+":
                            tss_start = seq_start + offset
                            relative_start = genome_start - tss_start
                        else:
                            tss_start = seq_end - offset
                            relative_start = tss_start - genome_end

                        relative_end = relative_start + lmatch

                        outf.write("\t".join(map(str, (
                            transcript_id, strand,
                            genome_start, genome_end,
                            relative_start, relative_end,
                            match.transfac_id,
                            match.core_similarity,
                            match.matrix_similarity,
                            match.sequence))) + "\n")
                        c.matches_output += 1
                        nmatches += 1

                        bedf.write("\t".join(map(
                            str, (contig, genome_start, genome_end,
                                  transcript_id, strand,
                                  match.matrix_similarity))) + "\n")

                    if nmatches == 0:
                        c.promotor_filtered += 1
                    else:
                        c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
def main(argv=None):
    """Enumerate all single-residue substitutions of the input sequences.

    FASTA records are read from ``options.stdin``.  For every residue
    that is one of the 20 standard amino acids, one tab-separated line
    per alternative amino acid is written to ``options.stdout`` with a
    running snp id, the sequence title, the 1-based position, reference,
    variant, a count and the count divided by the sum of counts at that
    position.

    With ``--is-cds`` the input is nucleotide sequence: it is translated
    and the count of a variant is the number of single-base changes of
    the codon that translate to it.  Otherwise every amino acid has
    count 1.

    :param argv: command line arguments, passed on to ``E.Start``.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c", "--is-cds", dest="is_cds", action="store_true",
                      help="input are cds (nucleotide) sequences [%default]")

    parser.set_defaults(is_cds=False, )

    (options, args) = E.Start(parser, argv=argv)

    emit = options.stdout.write
    emit("snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    def _codon_tally(codon):
        # count, per translated residue, the single-base changes of codon
        tally = collections.defaultdict(int)
        for offset in range(0, 3):
            present = codon[offset]
            for base in "ACGT":
                if base != present:
                    mutated = codon[:offset] + base + codon[offset + 1:]
                    tally[Genomics.translate(mutated)] += 1
        return tally

    snpid = 0

    for record in FastaIterator.iterate(options.stdin):
        identifier = record.title

        if not options.is_cds:
            sequence = record.sequence.upper()
            # one shared table: every amino acid is equally weighted
            uniform = dict.fromkeys(alphabet, 1)
            weights = [uniform] * len(sequence)
        else:
            cds_sequence = record.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % record.title

            sequence = Genomics.translate(cds_sequence)
            weights = [_codon_tally(cds_sequence[first:first + 3])
                       for first in range(0, len(cds_sequence), 3)]

        for index, ref in enumerate(sequence):
            if ref in alphabet:
                tally = weights[index]
                total = float(sum(tally.values()))
                for variant in alphabet:
                    if variant != ref:
                        snpid += 1
                        fields = (
                            "%010i" % snpid,
                            identifier,
                            str(index + 1),
                            ref,
                            variant,
                            "%i" % tally[variant],
                            "%6.4f" % (tally[variant] / total),
                        )
                        emit("%s\n" % "\t".join(fields))

    E.Stop()
def main(argv=None):
    """Count unique and non-unique kmers for every entry of a FASTA file.

    The FASTA file is read twice: the first pass shreds every sequence
    into kmers via ``KmerCounter.shred``, the second pass asks
    ``KmerCounter.countUniqueKmers`` for the per-entry counts and writes
    ``id, unique_kmers, non_unique_kmers, fraction_unique`` to
    ``options.stdout``.

    :param argv: command line arguments; ``sys.argv`` is used when empty.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str",
                      help="name of fasta infile")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(fasta=None, kmer=10, subset=None)

    # hand the normalised argv on so that an explicit *argv* is honoured
    (options, args) = E.Start(parser, argv=argv)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    def _subset_reached(count):
        # True once --subset entries have been processed
        return bool(options.subset) and count >= options.subset

    options.stdout.write("%s\n" % "\t".join(
        ("id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # iterate transcripts, shred and identify unique kmers
    E.info("shredding fasta to identify unique kmers")
    # total entries also acts as the index for the entry_id
    total_entries = 0
    # the context manager closes the fasta handle after each pass
    with IOTools.openFile(options.fasta) as inf:
        for entry in FastaIterator.iterate(inf):
            if total_entries % 1000 == 0:
                E.info("1st shred complete for %i entries" % total_entries)
            if _subset_reached(total_entries):
                break
            k.shred(entry.sequence.upper(), options.kmer)
            total_entries += 1

    # iterate transcripts, shred and count unique kmers
    E.info("re-shredding fasta to count unique kmers")
    total_entries = 0
    with IOTools.openFile(options.fasta) as inf:
        for entry in FastaIterator.iterate(inf):
            if total_entries % 1000 == 0:
                E.info("2nd shred complete for %i entries" % total_entries)
            if _subset_reached(total_entries):
                break

            transcript_id = entry.title.split()[0]
            total_entries += 1

            unique, non_unique = k.countUniqueKmers(
                entry.sequence.upper(), options.kmer)
            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write(
                "%s\n" % "\t".join(
                    map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)

    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))
    E.Stop()
def main(argv=None):
    """Extend a multiple alignment with the sequences supplied on stdin.

    Only ``--method=add`` does any work: the alignment named by the first
    ``--parameters`` entry is loaded, every FASTA record from
    ``sys.stdin`` is aligned to a profile of the current alignment and
    appended, and the result is written to ``sys.stdout`` in
    ``--output-format``.

    :param argv: command line arguments; ``sys.argv`` is used when ``None``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequences2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=(
                          "plain", "fasta", "clustal", "stockholm", "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("add",),
                      help="""method to use to build multiple alignment.""")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-a", "--alignment-method", dest="alignment_method",
                      type="choice",
                      choices=("sw", "nw"),
                      help="alignment_method [%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method=None,
        parameters="",
        gop=-10.0,
        gep=-1.0,
        alignment_method="sw",
    )

    # pass argv through; it was previously normalised but never used
    (options, args) = E.Start(parser, argv=argv)

    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.iterate(sys.stdin)

    if options.method == "add":

        mali = Mali.Mali()

        # the alignment to extend is the first parameter
        with open(options.parameters[0], "r") as mali_file:
            mali.readFromFile(mali_file, format=options.input_format)
        del options.parameters[0]

        old_length = mali.getLength()

        new_mali = convertMali2Mali(mali)

        if options.alignment_method == "sw":
            alignator = alignlib_lite.py_makeAlignatorFullDP(
                options.gop, options.gep)
        else:
            alignator = alignlib_lite.py_makeAlignatorFullDPGlobal(
                options.gop, options.gep)

        # Iterate with the iterator protocol instead of calling
        # ``iterator.next()`` by hand: that method does not exist on
        # Python 3 and raises StopIteration at the end of a generator.
        for cur_record in iterator:
            # older iterators signalled the end with None
            if cur_record is None:
                break

            map_mali2seq = alignlib_lite.py_makeAlignataVector()

            sequence = alignlib_lite.py_makeSequence(cur_record.sequence)
            profile = alignlib_lite.py_makeProfileFromMali(new_mali)

            if options.loglevel >= 4:
                options.stdlog.write(profile.Write())

            alignator.Align(profile, sequence, map_mali2seq)

            if options.loglevel >= 3:
                options.stdlog.write(map_mali2seq.Write())

            # add sequence to mali
            a = alignlib_lite.py_makeAlignatumFromString(cur_record.sequence)
            a.thisown = 0

            new_mali.addAlignatum(a, map_mali2seq, 1, 1, 1, 1, 1)

            title = cur_record.title
            mali.mIdentifiers.append(title)
            mali.mMali[title] = Mali.AlignedString(
                title, 0, len(cur_record.sequence),
                new_mali.getRow(new_mali.getWidth() - 1).getString())

        # substitute: copy the re-aligned strings of the original rows
        # NOTE(review): position relative to the loop was reconstructed
        # from a whitespace-collapsed source; the end state is identical.
        for x in range(old_length):
            mali.mMali[mali.mIdentifiers[x]].mString = new_mali.getRow(
                x).getString()

        mali.writeToFile(sys.stdout, format=options.output_format)

    E.Stop()
def main(argv=None):
    """Split a FASTA file into several files keyed by sequence identifier.

    Sequences are read from ``--file`` (or stdin).  The identifier is the
    first group of ``--input-pattern`` applied to the title, optionally
    translated through the ``--map`` file (sequences missing from the map
    are skipped), and each sequence is handed to a ``Files`` /
    ``FilesChunks`` writer.  With ``--min-size`` the writer is asked to
    delete small clusters afterwards.

    :param argv: command line arguments; ``sys.argv`` is used when ``None``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $"
    )

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i", "--input-pattern", dest="input_pattern", type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o", "--output-filename-pattern", dest="output_pattern",
        type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n", "--num-sequences", dest="num_sequences", type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m", "--map", dest="map_filename", type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s", "--skip-identifiers", dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size", dest="min_size", type="int",
                      help="minimum cluster size.")

    # raw string: "\S" in a normal literal is an invalid escape sequence
    parser.set_defaults(input_filename=None,
                        map_filename=None,
                        skip_identifiers=False,
                        input_pattern=r"^(\S+)",
                        min_size=0,
                        num_sequences=None,
                        output_pattern="%s")

    # pass argv through; it was previously normalised but never used
    (options, args) = E.start(parser, argv=argv)

    if options.map_filename:
        # close the map file as soon as it has been read
        with open(options.map_filename, "r") as mapfile:
            map_id2filename = IOTools.ReadMap(mapfile)
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)
    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    rx = re.compile(options.input_pattern) if options.input_pattern else None

    if options.input_filename:
        infile = IOTools.open_file(options.input_filename, "r")
    else:
        infile = sys.stdin

    ninput = 0
    noutput = 0

    try:
        for seq in FastaIterator.iterate(infile):
            ninput += 1

            if rx:
                found = rx.search(seq.title)
                if found is None:
                    # Skip the record: carrying on would file it under
                    # the identifier of the previous sequence (or None).
                    print("# parsing error in description line %s" %
                          (seq.title))
                    continue
                identifier = found.groups()[0]
            else:
                identifier = seq.title

            if map_id2filename:
                if identifier in map_id2filename:
                    identifier = map_id2filename[identifier]
                else:
                    continue

            files.Write(identifier, seq)
            noutput += 1
    finally:
        # only close what was opened here, never stdin
        if options.input_filename:
            infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # clusters sizes are only available once both the fasta
    # file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.stop()
def main(argv=None):
    """Count unique and non-unique kmers per transcript or per gene.

    The FASTA file is read twice.  The first pass shreds the sequences
    into kmers (``KmerCounter.shred``); the second pass obtains the
    unique / non-unique counts (``KmerCounter.countUniqueKmers``) and
    writes ``id, unique_kmers, non_unique_kmers, fraction_unique`` to
    ``options.stdout``.

    With ``--method=gene`` consecutive transcripts mapping to the same
    gene in ``--genemap`` are processed together, so the FASTA file has
    to be sorted by gene.  ``--subset`` limits processing to the first x
    transcripts or genes.

    :param argv: command line arguments; ``sys.argv`` is used when empty.
    :raises AssertionError: if no fasta file is given, if the gene method
        is used without a gene map, or if a gene is seen in
        non-consecutive positions.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str",
                      help="name of fasta infile")

    parser.add_option(
        "--method", dest="method", type="choice",
        choices=("transcript", "gene"),
        help="count unique kmers per transcript or gene",
    )

    parser.add_option("--genemap", dest="genemap", type="str",
                      help="file mapping transcripts to genes")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(fasta=None, method="transcript", genemap=None,
                        kmer=10, subset=None)

    # pass argv through; it was previously normalised but never used
    (options, args) = E.Start(parser, argv=argv)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    def _subset_reached(count):
        # True once --subset entries have been processed
        return bool(options.subset) and count >= options.subset

    def _gene_groups(records, t2g):
        """Yield (gene_id, [sequences]) per run of same-gene records.

        Every record contributes its sequence, including the first one
        of the file and the records of the last gene.
        """
        seen = set()
        current_gene = None
        sequences = []
        for entry in records:
            gene_id = t2g[entry.title.split()[0]]
            if gene_id != current_gene:
                if current_gene is not None:
                    yield current_gene, sequences
                # check this is the first time we've dealt with this gene
                assert gene_id not in seen, (
                    "the fasta does not appear to be sorted in gene order, the"
                    " same gene is observed in non-consecutive positions!"
                )
                seen.add(gene_id)
                current_gene = gene_id
                sequences = []
            sequences.append(entry.sequence.upper())
        if current_gene is not None:
            yield current_gene, sequences

    def _write_counts(identifier, sequences):
        # count kmers for one output row and write it
        unique, non_unique = k.countUniqueKmers(sequences, options.kmer)
        fraction = np.divide(float(unique), (unique + non_unique))
        options.stdout.write("%s\n" % "\t".join(
            map(str, (identifier, unique, non_unique, fraction))))

    options.stdout.write("%s\n" % "\t".join(
        ("id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # first pass: iterate fasta entries, shred and identify kmers.
    # total entries also acts as the index for the entry_id
    total_entries = 0

    if options.method == "gene":
        E.info("shredding genes to identify unique kmers")

        assert options.genemap, (
            "to perform a gene-level unique kmer count, "
            "you must supply a transcript2gene map (--genemap)"
        )

        t2g = {}
        with IOTools.openFile(options.genemap, "r") as inf:
            for line in inf:
                transcript, gene = line.strip().split("\t")
                t2g[transcript] = gene

        with IOTools.openFile(options.fasta) as inf:
            for gene_id, sequences in _gene_groups(
                    FastaIterator.iterate(inf), t2g):
                if _subset_reached(total_entries):
                    break
                k.shred(sequences, options.kmer)
                if total_entries % 1000 == 0:
                    E.info("1st shred complete for %i genes" % total_entries)
                total_entries += 1

        E.info("1st shred complete for %i genes" % total_entries)

    elif options.method == "transcript":
        E.info("shredding transcripts to identify unique kmers")

        with IOTools.openFile(options.fasta) as inf:
            for entry in FastaIterator.iterate(inf):
                if total_entries % 1000 == 0:
                    E.info("1st shred complete for %i transcripts" %
                           total_entries)
                if _subset_reached(total_entries):
                    break
                k.shred([entry.sequence.upper()], options.kmer)
                total_entries += 1

        E.info("1st shred complete for %i transcripts" % total_entries)

    # second pass: iterate fasta entries, shred and count unique kmers
    total_entries = 0

    if options.method == "gene":
        E.info("re-shredding fasta to count gene unique kmers")

        with IOTools.openFile(options.fasta) as inf:
            for gene_id, sequences in _gene_groups(
                    FastaIterator.iterate(inf), t2g):
                if _subset_reached(total_entries):
                    break
                _write_counts(gene_id, sequences)
                if total_entries % 1000 == 0:
                    E.info("2nd shred complete for %i genes" % total_entries)
                # count each gene exactly once so that --subset and the
                # final log line refer to the number of genes written
                total_entries += 1

    if options.method == "transcript":
        E.info("re-shredding fasta to count transcript unique kmers")

        with IOTools.openFile(options.fasta) as inf:
            for entry in FastaIterator.iterate(inf):
                if total_entries % 1000 == 0:
                    E.info("2nd shred complete for %i transcripts" %
                           total_entries)
                if _subset_reached(total_entries):
                    break
                transcript_id = entry.title.split()[0]
                total_entries += 1
                _write_counts(transcript_id, [entry.sequence.upper()])

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)

    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))
    E.Stop()