def buildInputFiles(infile, outfiles):
    '''build input file based on parameters and fasta sequences
    that primers are to be designed for'''

    PARAMS["constraints_primer_mispriming_library"] = glob.glob(
        "mispriming.dir/*.lib")[0]
    primer_thermodynamics_parameters_path = PARAMS[
        "general_primer_thermodynamics_parameters_path"]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.open_file(fasta)

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)

    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.open_file(fasta)):
        if f.title in ids:
            outf = IOTools.open_file(os.path.join(
                "input.dir",
                f.title.replace(" ", "_").replace("/", "_")
                + ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.items():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (
                        key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n" % seq)
            outf.write("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=%s\n=\n" %
                       primer_thermodynamics_parameters_path)
            outf.close()
def countMotifs(infile, motifs):
    '''find regular expression *motifs* in sequences within fasta
    formatted *infile*.
    '''

    it = FastaIterator.FastaIterator(infile)
    positions = []
    while 1:
        try:
            seq = next(it)
        except StopIteration:
            break
        if not seq:
            break

        rseq = Genomics.reverse_complement(seq.sequence)
        lsequence = len(seq.sequence)
        pos = []
        for motif, pattern in motifs:
            for x in pattern.finditer(seq.sequence):
                pos.append((motif, "+", x.start(), x.end()))
            for x in pattern.finditer(rseq):
                pos.append(
                    (motif, "-", lsequence - x.end(), lsequence - x.start()))

        positions.append((seq.title, pos))

    return positions
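# A minimal usage sketch for countMotifs, assuming motifs are passed as
# (name, compiled_pattern) tuples; the motif names and file name below are
# illustrative only:
#
#   import re
#   motifs = [("CpG", re.compile("CG")),
#             ("TATA", re.compile("TATA[AT]A"))]
#   with open("sequences.fasta") as infile:
#       for title, hits in countMotifs(infile, motifs):
#           for motif, strand, start, end in hits:
#               print(title, motif, strand, start, end)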
def segmentUngapped(infile, gap_char, min_gap_size=0):

    iterator = FastaIterator.FastaIterator(infile)

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub(r"\s.*", "", cur_record.title)
        size = len(cur_record.sequence)

        last_end = 0
        for start, end in gapped_regions(cur_record.sequence, gap_char):
            if end - start < min_gap_size:
                continue

            if last_end != 0:
                yield (contig, last_end, start, 0)
            last_end = end

        if last_end < size:
            yield (contig, last_end, size, 0)
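# gapped_regions is used by segmentUngapped and segmentGaps but is not
# defined in this section; a minimal sketch of the assumed behaviour
# (yield 0-based (start, end) intervals for each run of gap characters):
#
#   def gapped_regions(sequence, gap_char):
#       is_gap, start = False, 0
#       for pos, c in enumerate(sequence):
#           if c in gap_char and not is_gap:
#               start, is_gap = pos, True
#           elif c not in gap_char and is_gap:
#               yield (start, pos)
#               is_gap = False
#       if is_gap:
#           yield (start, len(sequence))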
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"],
                     PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            # keep contigs whose average coverage exceeds the threshold
            statement = """SELECT contig_id, ave
                           FROM (SELECT contig_id, AVG(coverage) as ave
                                 FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(iotools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def segmentWithCpG(infile, with_contig_sizes=False):
    '''segment a fasta file, output locations of CpG.'''

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)

    segments, contig_sizes = [], collections.OrderedDict()

    for cur_record in iterator:
        ninput += 1
        contig = re.sub(r"\s.*", "", cur_record.title)
        last = None
        contig_sizes[contig] = (0, len(cur_record.sequence))
        for pos, this in enumerate(cur_record.sequence.upper()):
            if last == "C" and this == "G":
                segments.append((contig, pos - 1, pos + 1, 1.0))
            last = this

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    if with_contig_sizes:
        return segments, contig_sizes

    return segments
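# A minimal usage sketch, assuming an open fasta handle (file name
# illustrative); each segment is (contig, start, end, value) in 0-based,
# half-open coordinates covering one CpG dinucleotide:
#
#   with open("genome.fasta") as infile:
#       for contig, start, end, value in segmentWithCpG(infile):
#           print("%s\t%i\t%i" % (contig, start, end))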
def maskSequences(self, sequences):
    '''mask a collection of sequences.'''

    with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as outf:
        for x, s in enumerate(sequences):
            outf.write(">%i\n%s\n" % (x, s))
        infile = outf.name

    statement = self.mCommand % locals()

    E.debug("statement: %s" % statement)

    s = subprocess.Popen(statement,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)

    (out, err) = s.communicate()

    if s.returncode != 0:
        raise RuntimeError(
            "Error in running %s \n%s\nTemporary directory" %
            (statement, err))

    result = [
        x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))]

    os.remove(infile)

    return result
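# A minimal usage sketch, assuming maskSequences is a method on a masker
# class whose mCommand template references the temporary file as
# %(infile)s; the subclass name below is hypothetical:
#
#   masker = MaskerDustMasker()
#   masked = masker.maskSequences(["ACGTACGTACGT", "NNNNACGTACGT"])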
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, return CpG content file
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    P.run()
def RenameFastaTitle(fastafile, file2tax, outfile):
    taxonomy = file2tax[fastafile]
    suffix = 1
    for fasta in fastaiterator.iterate(iotools.open_file(fastafile)):
        suffix_str = "(" + str(suffix) + ")"
        new_title = taxonomy + suffix_str
        suffix += 1
        outfile.write(">" + new_title + "\n" + fasta.sequence + "\n")
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)
    infile = IOTools.open_file(options.stdin.name)
    iterator = FastaIterator.FastaIterator(infile)
    # outfile_info = IOTools.open_file(options.info_file, "w")

    d = collections.OrderedDict()
    cluster_dict = dict()

    # first iterate over the fasta file and generate a dict
    # with the name (title) as the key and the sequence as the value,
    # removing any pseudo sequences
    for cur_record in iterator:
        # temporary fix: bedtools getfasta --name seems to have changed
        # the way it names the fasta titles; this handles the current
        # format for the time being
        m = re.match(r"(chr\d+.tRNA\d+-\S+-(pseudo)?)::\S+([+|-])",
                     cur_record.title.replace("(", "").replace(")", ""))
        if m is None:
            continue
        if m.group(2) == "pseudo":
            pass
        else:
            key = str(m.group(1) + m.group(3))
            d[key] = cur_record.sequence

    # then iterate over the dict and write each sequence with a CCA tail
    for key, value in d.items():
        options.stdout.write((">%s\n%scca\n") % (key, value))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: fastas2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    (options, args) = E.start(parser)

    if len(args) < 2:
        raise ValueError(
            "please supply at least two filenames to concatenate.")

    iterators = []
    for a in args:
        iterators.append(
            FastaIterator.FastaIterator(iotools.open_file(a, "r")))

    ninput, noutput, nerrors = 0, 0, 0

    while 1:
        sequences = []
        ids = []

        for iterator in iterators:
            try:
                cur_record = next(iterator)
            except StopIteration:
                break

            sequences.append(re.sub(" ", "", cur_record.sequence))
            ids.append(cur_record.title)

        if not sequences:
            break
        ninput += 1

        if len(sequences) != len(iterators):
            raise ValueError("unequal number of sequences in files")

        noutput += 1
        options.stdout.write(">%s\n%s\n" % (ids[0], "".join(sequences)))

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.stop()
def specformatter(Infile, Outfile):
    infile = iotools.open_file(Infile)
    fastas = fastaiterator.iterate(infile)
    outfile = open(Outfile, "w")
    for fasta in fastas:
        name = fasta.title.split(";")[6]
        specID = name.split("(")[1]
        specID = specID[:-1]
        genusspecies = name.split("(")[0]
        genus = genusspecies.split("_")[0]
        species = genusspecies.split("_")[1]
        newtitle = " ".join([specID, genus, species])
        outfile.write(">" + newtitle + "\n" + fasta.sequence + "\n")
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)
    infile = IOTools.open_file(options.stdin.name)
    iterator = FastaIterator.FastaIterator(infile)
    # outfile_info = IOTools.open_file(options.info_file, "w")

    d = collections.OrderedDict()
    cluster_dict = dict()

    # first iterate over the fasta file and generate a dict
    # with the name (title) as the key and the sequence as the value,
    # skipping any pseudo sequences
    for cur_record in iterator:
        key = cur_record.title
        if "pseudo" in key:
            pass
        else:
            d[key] = cur_record.sequence

    # then iterate over the dict and write each sequence with a CCA tail
    for key, value in d.items():
        options.stdout.write((">%s\n%scca\n") % (key, value))

    E.stop()
def segmentGaps(infile, gap_char):

    iterator = FastaIterator.FastaIterator(infile)

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub(r"\s.*", "", cur_record.title)

        for start, end in gapped_regions(cur_record.sequence, gap_char):
            yield (contig, start, end, 0)
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.open_file(fasta)

    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.open_file(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.open_file(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
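# readIdentifiers is used by buildInputFiles and buildMisprimingLib but is
# not defined in this section; a minimal sketch, assuming one identifier
# per line in the first column of the file:
#
#   def readIdentifiers(identifiers):
#       with IOTools.open_file(identifiers) as inf:
#           return set(line.split("\t")[0].strip() for line in inf)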
def segmentFixedWidthWindows(infile, window_size, window_shift):
    """return a list of fixed-width windows with their G+C content."""

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)
    # note: window_shift is overridden so that windows do not overlap
    window_shift = window_size
    # at most 50% can be gap
    gap_cutoff = int(window_size // 2)
    segments = []

    while 1:
        ninput += 1
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub(r"\s.*", "", cur_record.title)
        seq = cur_record.sequence
        size = len(cur_record.sequence)

        for x in range(0, size, window_shift):
            s = seq[x:x + window_size].upper()
            gc, at = 0, 0
            for c in s:
                if c in "GC":
                    gc += 1
                elif c in "AT":
                    at += 1

            # skip segments containing mostly gaps
            if window_size - (gc + at) > gap_cutoff:
                nskipped += 1
                continue

            segments.append(
                (contig, x, x + window_size, float(gc) / (gc + at)))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped_windows=%i" %
           (ninput, noutput, nskipped))

    return segments
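# A minimal usage sketch (file name and window size illustrative); each
# segment is (contig, start, end, gc_proportion):
#
#   with open("genome.fasta") as infile:
#       for contig, start, end, gc in segmentFixedWidthWindows(
#               infile, window_size=1000, window_shift=1000):
#           print("%s\t%i\t%i\t%f" % (contig, start, end, gc))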
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only on the top 10% of intervals (peakval).
    Also, only the segment of 200 bp around the peak is used and not
    the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to avoid
      the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
        return

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(motifs_max_size)s
    %(meme_options)s
    > %(outfile)s.log
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--correct-gap-shift", dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1", "--pattern1", dest="pattern1", type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2", "--pattern2", dest="pattern2", type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option("-o", "--output-section", dest="output",
                      type="choice", action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1="(\S+)",
                        pattern2="(\S+)",
                        output=[])

    (options, args) = E.start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(iotools.open_file(args[0], "r"))])
    seqs2 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(iotools.open_file(args[1], "r"))])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"
            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    l = len(differences)
                    # check for selenocysteines
                    if len([x for x in differences
                            if x[0] == "U" or x[1] == "U"]) == l:
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"
                    # check for masked residues
                    elif len([x for x in differences
                              if x[0] in "NX" or x[1] in "NX"]) == l:
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, "
                              "l1=%i, l2=%i" %
                              (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)

                # check if we have reached the end:
                else:
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(list(seqs2.keys())):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, "
        "masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.stop()
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-w", "--weights-tsv-file", dest="filename_weights", type=str,
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_argument("-s", "--section", dest="sections", nargs="*",
                        type=str,
                        choices=("length", "sequence", "hid", "na", "aa",
                                 "cpg", "dn", "degeneracy", "gaps",
                                 "codons", "codon-usage",
                                 "codon-translator", "codon-bias"),
                        help="which sections to output ")

    parser.add_argument(
        "-t", "--sequence-type", dest="seqtype", type=str,
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids .")

    parser.add_argument(
        "-e", "--regex-identifier", dest="regex_identifier", type=str,
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_argument(
        "--split-fasta-identifier", dest="split_id", action="store_true",
        help="split fasta description line (starting >) and use "
        "only text before first space")

    parser.add_argument(
        "--add-total", dest="add_total", action="store_true",
        help="add a row with column totals at the end of the table")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (args) = E.start(parser, argv=argv)

    rx = re.compile(args.regex_identifier)

    reference_codons = []
    if args.filename_weights:
        args.filename_weights = args.filename_weights.split(",")
        for filename in args.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    iotools.ReadMap(iotools.open_file(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        args.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate Kullback-Leibler distance
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in list(a.items()):
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                args.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                  (args.filename_weights[x],
                                   args.filename_weights[y],
                                   d))

    iterator = FastaIterator.FastaIterator(args.stdin)

    def getCounter(section):

        if args.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(
                    args.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections require the sequence length to be a multiple
            # of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(
                    reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif args.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in args.sections:
        totals[section] = getCounter(section)

    args.stdout.write("id")
    for section in args.sections:
        args.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    args.stdout.write("\n")
    args.stdout.flush()

    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        id = rx.search(cur_record.title).groups()[0]

        if args.split_id is True:
            args.stdout.write("%s" % id.split()[0])
        else:
            args.stdout.write("%s" % id)
        args.stdout.flush()

        for section in args.sections:
            s = getCounter(section)
            s.loadSequence(sequence, args.seqtype)
            totals[section].addProperties(s)

            args.stdout.write("\t" + "\t".join(s.getFields()))

        args.stdout.write("\n")

    if args.add_total:
        args.stdout.write("total")
        for section in args.sections:
            args.stdout.write("\t" + "\t".join(totals[section].getFields()))
        args.stdout.write("\n")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--trna-scheme", dest="trna_scheme", type="choice",
        choices=("tDR-5'", "tRH-DA"),
        help="name of the tRNA scheme to make bed file for "
        "[default=%default]")

    parser.set_defaults(trna_scheme=None)

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    outfile = IOTools.open_file(options.stdout.name, "w")

    trna_options = ["tRH-5'", "tRH-DA", "tRH-DTA", "tRH-AT", "tRH-3'",
                    "tRF-5'", "tRF-3'", "tRF-D", "tRF-DA", "tRF-A",
                    "tRF-AT", "tRF-T"]
    for trna in trna_options:
        infile = IOTools.open_file(options.stdin.name)
        iterator = FastaIterator.FastaIterator(infile)

        d = collections.OrderedDict()
        cluster_dict = dict()

        # first iterate over the fasta file
        for cur_record in iterator:

            title = cur_record.title
            m = re.match(r"(cluster\d+):chr\S+.tRNA\d+-(\S+)-\((\S+)\)",
                         title)
            cluster = m.group(1)
            trna_group = m.group(2)
            strand = m.group(3)

            chrom = cluster + ":" + trna_group + "-"
            score = "."
            print(trna)
            if trna == "tRH-5'":
                start = "1"
                end = "33"
            elif trna == "tRH-DA":
                start = "14"
                end = "43"
            elif trna == "tRH-DTA":
                start = "17"
                end = "54"
            elif trna == "tRH-AT":
                start = "38"
                end = "69"
            elif trna == "tRH-3'":
                start = "43"
                end = "73"
            elif trna == "tRF-5'":
                start = "1"
                end = "15"
            elif trna == "tRF-3'":
                start = "58"
                end = "73"
            elif trna == "tRF-D":
                start = "8"
                end = "23"
            elif trna == "tRF-DA":
                start = "20"
                end = "35"
            elif trna == "tRF-A":
                start = "27"
                end = "42"
            elif trna == "tRF-AT":
                start = "33"
                end = "53"
            elif trna == "tRF-T":
                start = "45"
                end = "71"
            else:
                print("tRNA fragment not implemented")
                break

            outfile.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
                          (chrom, start, end, trna, score, strand))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--info-file-out", dest="info_file", type="str",
                      help="name of the info file [default=%default]")

    parser.set_defaults(info_file="info_file.fa")

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)
    infile = IOTools.open_file(options.stdin.name)
    iterator = FastaIterator.FastaIterator(infile)
    outfile_info = IOTools.open_file(options.info_file, "w")

    d = collections.OrderedDict()
    cluster_dict = dict()

    # first iterate over the fasta file and generate a dict
    # with the sequence as the key and the name as the value;
    # only add a sequence the first time it occurs
    for cur_record in iterator:
        key = cur_record.sequence
        if key in d:
            pass
        else:
            d[key] = cur_record.title

    # next iterate over the dict and give each cluster a number;
    # this is used later to map back to the info file name
    n = 0
    for key, value in d.items():
        n += 1
        cluster_dict[key] = n

        # output this to stdout
        m = re.match(r"(chr\d+).tRNA\d+-(\S+)-(\S+)", value)
        value = m.group(1) + "-" + m.group(2) + "-" + m.group(3)
        options.stdout.write(">cluster%s:%s\n%s\n" % (n, value, key))

    # iterate over the infile again, this time using the
    # sequence to pull out the cluster it belongs to
    infile = IOTools.open_file(options.stdin.name)
    iterator = FastaIterator.FastaIterator(infile)

    for cur_record in iterator:
        cluster = cluster_dict[cur_record.sequence]
        outfile_info.write(">cluster%s:%s\n%s\n" %
                           (cluster, cur_record.title, cur_record.sequence))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i", "--input-pattern", dest="input_pattern", type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o", "--output-filename-pattern", dest="output_pattern",
        type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n", "--num-sequences", dest="num_sequences", type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m", "--map", dest="map_filename", type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s", "--skip-identifiers", dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size", dest="min_size", type="int",
                      help="minimum cluster size.")

    parser.set_defaults(input_filename=None,
                        map_filename=None,
                        skip_identifiers=False,
                        input_pattern="^(\S+)",
                        min_size=0,
                        num_sequences=None,
                        output_pattern="%s")

    (options, args) = E.start(parser)

    if options.input_filename:
        infile = iotools.open_file(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = iotools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)
    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print("# parsing error in description line %s" % (seq.title))
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # cluster sizes are only available once both the fasta
    # file and the map have been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("-p", "--output-proportion", dest="proportion",
                      action="store_true",
                      help="output proportions - overrides the default "
                      "output")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # do not allow greater than octonucleotide
    assert options.kmer <= 8, "cannot handle kmer of length %i" % options.kmer

    # how we deal with the nucleotides depends on the kmer length
    nucleotides = []
    for nucleotide in ["A", "C", "T", "G"]:
        nucleotides = nucleotides + \
            [x for x in itertools.repeat(nucleotide, options.kmer)]

    E.info("retrieving %imer sequences" % options.kmer)
    # get all kmer sequences to query
    kmers = set()
    for kmer in itertools.permutations(nucleotides, options.kmer):
        kmers.add(kmer)

    E.info("matching %imers in file" % options.kmer)
    # count the number of kmers in each sequence
    result = {}

    # NB assume that non fasta files are caught by FastaIterator
    total_entries = 0
    for fasta in FastaIterator.iterate(options.stdin):
        total_entries += 1
        result[fasta.title] = {}
        for kmer in kmers:
            counts = [m.start()
                      for m in re.finditer("".join(kmer), fasta.sequence)]
            result[fasta.title][kmer] = len(counts)

    E.info("writing results")
    # write out the results
    headers = sorted(result.keys())
    rows = set()
    for kmer_counts in list(result.values()):
        for kmer, count in kmer_counts.items():
            rows.add("".join(kmer))

    # write header row
    options.stdout.write("kmer\t" + "\t".join(headers) + "\n")

    # output proportions if required - normalises by
    # sequence length
    E.info("computing total counts")
    totals = {}
    for header in headers:
        totals[header] = sum([result[header][tuple(row)] for row in rows])

    for row in sorted(rows):
        if options.proportion:
            options.stdout.write("\t".join(
                [row] + [str(float(result[header][tuple(row)]) /
                             totals[header]) for header in headers]) + "\n")
        else:
            options.stdout.write("\t".join(
                [row] + [str(result[header][tuple(row)])
                         for header in headers]) + "\n")

    E.info("written kmer counts for %i contigs" % total_entries)

    # write footer and output benchmark information.
    E.stop()
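# The permutations-over-repeated-letters construction above enumerates
# every possible kmer; an equivalent, more direct sketch using
# itertools.product (shown for illustration only):
#
#   kmers = set(itertools.product("ACTG", repeat=options.kmer))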
# tally which base occurs at each of the first three positions of each
# transcript (A/G/T/C or other); the one* counters and the tails of the
# elif chains are reconstructed here from the surrounding counter names
oneA = 0
oneG = 0
oneT = 0
oneC = 0
oneO = 0
twoA = 0
twoG = 0
twoT = 0
twoC = 0
twoO = 0
thrA = 0
thrG = 0
thrT = 0
thrC = 0
thrO = 0
i = 0

# iterate over the transcripts and tally the base at each position
for transcript in FastaIterator.iterate(infile):
    if transcript.sequence[0] == "A":
        oneA += 1
    elif transcript.sequence[0] == "G":
        oneG += 1
    elif transcript.sequence[0] == "T":
        oneT += 1
    elif transcript.sequence[0] == "C":
        oneC += 1
    else:
        oneO += 1
    if transcript.sequence[1] == "A":
        twoA += 1
    elif transcript.sequence[1] == "G":
        twoG += 1
    elif transcript.sequence[1] == "T":
        twoT += 1
    elif transcript.sequence[1] == "C":
        twoC += 1
    else:
        twoO += 1
    if transcript.sequence[2] == "A":
        thrA += 1
    elif transcript.sequence[2] == "G":
        thrG += 1
    elif transcript.sequence[2] == "T":
        thrT += 1
    elif transcript.sequence[2] == "C":
        thrC += 1
    else:
        thrO += 1
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-c", "--is-cds", dest="is_cds", action="store_true",
                        help="input are cds (nucleotide) sequences ")

    parser.set_defaults(
        is_cds=False,
    )

    (args) = E.start(parser, argv=argv)

    args.stdout.write(
        "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate(args.stdin):

        identifier = entry.title

        if args.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate(cds_sequence)
            weights = []
            for pos, cds_pos in enumerate(range(0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos + 3]
                counts = collections.defaultdict(int)
                for x in range(0, 3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna:
                            continue
                        taa = Genomics.translate(
                            codon[:x] + na + codon[x + 1:])
                        counts[taa] += 1
                weights.append(counts)

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet:
                counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate(sequence):

            if ref not in alphabet:
                continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref:
                    continue
                snpid += 1
                args.stdout.write(
                    "%s\n" % "\t".join(
                        ("%010i" % snpid,
                         identifier,
                         str(pos + 1),
                         ref,
                         variant,
                         "%i" % w[variant],
                         "%6.4f" % (w[variant] / t))))

    E.stop()
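# Worked example for the cds weighting above: for the codon "ATG" (Met),
# the nine single-base changes translate to L:2 (CTG, TTG), V:1 (GTG),
# K:1 (AAG), T:1 (ACG), R:1 (AGG) and I:3 (ATA, ATC, ATT), so variant I
# is written with counts=3 and weight 3/9 = 0.3333, while amino acids not
# reachable by a single base change get counts=0.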
def findTATABox(infiles, outfile):
    '''find TATA box in promotors. There are several matrices to
    choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | cgat bed2fasta
       --use-strand
       --genome=%(genome_dir)s/%(genome)s
       --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''
    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    match_profile = 'minFP_good.prf'
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with iotools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
    %(match_executable)s
    %(match_matrix)s
    %(outfile)s.fasta
    %(outfile)s.match
    %(match_profile)s
    -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(iotools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            r"(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity "
        "sequence")

    def _grouper(infile):
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match(
                    r"Inspecting sequence ID\s+(\S+)", line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")]
            pos, strand = re.match(r"(\d+) \((\S)\)", v).groups()
            r.append(MATCH._make((pid, transfac_id, int(pos), strand,
                                  float(core_similarity),
                                  float(matrix_similarity), sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = iotools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand",
                          "start", "end",
                          "relative_start", "relative_end",
                          "transfac_id",
                          "core_similarity",
                          "matrix_similarity",
                          "sequence")) + "\n")

    bedf = iotools.openFile(outfile, "w")

    c = E.Counter()
    found = set()

    for transcript_id, matches in _grouper(
            iotools.openFile(outfile + ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(map(str, (
                transcript_id, strand,
                genome_start, genome_end,
                relative_start, relative_end,
                match.transfac_id,
                match.core_similarity,
                match.matrix_similarity,
                match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(map(
                str,
                (contig, genome_start, genome_end, transcript_id, strand,
                 match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with iotools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--output-quality-format", dest="q_format", type=int,
                        help="sequence quality format, e.g 33 = +33/Sanger")

    parser.add_argument("--output-paired-end", dest="paired",
                        action="store_true",
                        help="generate paired end reads")

    parser.add_argument("--insert-length-mean", dest="insert_mean",
                        type=float,
                        help="mean insert length.")

    parser.add_argument("--insert-length-sd", dest="insert_sd", type=float,
                        help="insert length standard deviation.")

    parser.add_argument("--counts-method", dest="counts_method", type=str,
                        choices=("reads", "copies"),
                        help="simulate a ground truth number of reads per "
                        "entry or copies per entry.")

    parser.add_argument("--counts-min", dest="counts_min", type=float,
                        help="minimum number of reads/read pairs per fasta "
                        "entry or copies per entry.")

    parser.add_argument("--counts-max", dest="counts_max", type=float,
                        help="maximum number of reads/read pairs per fasta "
                        "entry or copies per entry.")

    parser.add_argument("--output-read-length", dest="read_length", type=int,
                        help="read length.")

    parser.add_argument("--sequence-error-phred", dest="phred", type=int,
                        help="phred quality score.")

    parser.add_argument("--output-counts", dest="output_counts", type=str,
                        help="name for counts outfile.")

    parser.add_argument("--output-fastq2", dest="fastq2_out", type=str,
                        help="filename for second fastq outfile.")

    parser.add_argument("--premrna-fraction", dest="premrna_fraction",
                        type=float,
                        help="the fraction of reads to simulate from "
                        "pre-mRNA")

    parser.add_argument("--infile-premrna-fasta", dest="premrna_fasta",
                        type=str,
                        help="filename for pre-mRNA fasta.")

    parser.set_defaults(q_format=33,
                        paired=False,
                        insert_mean=0,
                        insert_sd=1,
                        counts_method="reads",
                        counts_min=1,
                        counts_max=1,
                        read_length=50,
                        fastq2_out=None,
                        output_counts=None,
                        phred=30,
                        premrna_fraction=0,
                        premrna_fasta=None)

    (args) = E.start(parser)

    if args.paired:
        assert args.fastq2_out, ("must specify a second fastq outfile for "
                                 "paired end (--output-fastq2)")
        outf2 = iotools.open_file(args.fastq2_out, "w")

    if args.premrna_fraction:
        assert args.premrna_fasta, ("must specify the location of the "
                                    "fasta file for the pre-mRNA")

    # the sequence quality string will always be the same so define here
    sequence_quality = chr(args.q_format + args.phred)
    qual = "".join([sequence_quality] * args.read_length)

    if args.premrna_fraction:
        iterator = FastaIterator.iterate_together(
            args.stdin, iotools.open_file(args.premrna_fasta))
    else:
        iterator = FastaIterator.FastaIterator(args.stdin)

    # set a cut off of twice the read/pair length for short entries
    if args.paired:
        minimum_entry_length = (
            2 * ((args.read_length * 2) + args.insert_mean))
    else:
        minimum_entry_length = 2 * args.read_length

    c = collections.Counter()
    counts = collections.Counter()
    copies = collections.Counter()

    for f_entry in iterator:

        if args.premrna_fraction:

            assert getTitle(f_entry[0]) == getTitle(f_entry[1]), (
                "entry ids do not match: %s != %s" %
                (f_entry[0].title, f_entry[1].title))
            entry = f_entry[0]
            pre_entry = f_entry[1]

        else:
            entry = f_entry

        # reject short fasta entries
        if len(entry.sequence) < minimum_entry_length:
            E.info("skipping short transcript: %s length=%i" %
                   (entry.title, len(entry.sequence)))
            c['skipped'] += 1
            continue

        else:
            c['not_skipped'] += 1

        if args.paired:
            fragment_length = ((2 * args.read_length) + args.insert_mean)
        else:
            fragment_length = args.read_length

        reads_per_entry = float(len(entry.sequence)) / fragment_length

        if args.counts_method == "reads":
            # np.random.randint excludes the upper bound, hence the + 1
            n_reads = np.random.randint(args.counts_min,
                                        args.counts_max + 1)

            n_copies = float(n_reads) / reads_per_entry

            if args.premrna_fraction:
                n_reads_pre = int(round(n_reads * args.premrna_fraction))

        elif args.counts_method == "copies":

            # random float [0-1]
            rand = np.random.random_sample()

            n_copies = (args.counts_min +
                        (rand * (args.counts_max - args.counts_min)))

            n_reads = int(round(n_copies * reads_per_entry, 0))

            # as n_reads must be rounded to int, need to redefine n_copies
            n_copies = float(n_reads) / reads_per_entry

            if args.premrna_fraction:
                reads_per_pre_entry = (float(len(pre_entry.sequence)) /
                                       fragment_length)
                n_copies_pre = n_copies * args.premrna_fraction
                n_reads_pre = int(round(n_copies_pre * reads_per_pre_entry,
                                        0))

                # as n_reads_pre must be rounded to int, need to
                # redefine n_copies_pre
                n_copies_pre = float(n_reads_pre) / reads_per_pre_entry

        entry_id = getTitle(entry)

        counts[entry_id] = n_reads
        copies[entry_id] = n_copies

        if "N" in entry.sequence.upper():
            E.warn("fasta entry %s contains unknown bases ('N')" % entry_id)

        for i in range(0, n_reads):

            read = generateRead(entry=entry.sequence.upper(),
                                read_length=args.read_length,
                                error_rate=args.phred,
                                paired=args.paired,
                                insert_mean=args.insert_mean,
                                insert_sd=args.insert_sd)

            if args.paired:
                r1, r2 = read
                h1 = "@%s_%i/1" % (entry_id, i)
                h2 = "@%s_%i/2" % (entry_id, i)

                args.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

            else:
                h = "@%s_%i/1" % (entry_id, i)

                args.stdout.write("\n".join((h, read, "+", qual)) + "\n")

        if args.premrna_fraction:
            c['pre_counts'] += n_reads_pre
            c['pre_copies'] += n_copies_pre

            for i in range(0, n_reads_pre):

                read = generateRead(entry=pre_entry.sequence.upper(),
                                    read_length=args.read_length,
                                    error_rate=args.phred,
                                    paired=args.paired,
                                    insert_mean=args.insert_mean,
                                    insert_sd=args.insert_sd)

                if args.paired:
                    r1, r2 = read
                    h1 = "@%s_pre-mRNA_%i/1" % (entry_id, i)
                    h2 = "@%s_pre-mRNA_%i/2" % (entry_id, i)

                    args.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                    outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

                else:
                    h = "@%s_pre-mRNA_%i/1" % (entry_id, i)

                    args.stdout.write("\n".join((h, read, "+", qual)) + "\n")

    if args.paired:
        outf2.close()

    with iotools.open_file(args.output_counts, "w") as counts_out:

        counts_out.write("%s\n" % "\t".join(("id", "read_count", "tpm")))

        sum_copies = sum(copies.values())
        sum_counts = sum(counts.values())

        for entry_id, count in counts.items():
            tpm = 1000000 * (float(copies[entry_id]) / sum_copies)
            counts_out.write(
                "%s\n" % "\t".join(map(str, (entry_id, count, tpm))))

    E.info("Reads simulated for %i fasta entries, %i entries skipped" %
           (c['not_skipped'], c['skipped']))

    E.info("Simulated: %i reads (%i mRNA, %i pre-mRNA), "
           "%f transcripts (%f mRNA, %f pre-mRNA)" %
           (sum_counts + c['pre_counts'], sum_counts, c['pre_counts'],
            sum_copies + c['pre_copies'], sum_copies, c['pre_copies']))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--outputdir", dest="outdir", type="string",
                      help="output directory to save plots")

    parser.add_option("-f", "--fasta", dest="fasta_file", type="string",
                      help="fasta file containing tRNA cluster fasta seqs")

    parser.set_defaults(fasta_file=None,
                        outdir=None)

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    dict_trna = {}
    for record in FastaIterator.iterate(
            IOTools.open_file(options.fasta_file)):
        title = record.title.strip("-")
        length = len(record.sequence)
        dict_trna[title] = length

    # for each read in the bamfile, find the end position and then plot
    # this using the length of the tRNA cluster
    samfile = pysam.AlignmentFile(options.stdin.name, "rb")
    refname = ""
    values = []
    n = 0
    for line in samfile:
        if line.reference_name == refname:
            if line.reference_end is None:
                pass
            else:
                end = int(line.reference_end) - int(line.reference_start)
                values.append(end)
        elif line.reference_name != refname:
            n += 1
            if n > 1:
                values = pd.Series(values)
                percent = values.value_counts() / values.count() * 100
                percent = percent.sort_index()
                percent = pd.DataFrame(percent)
                percent.rename(columns={0: 'Percent'}, inplace=True)

                # length of each tRNA from fasta
                length = dict_trna[refname.strip("-")] + 1

                temp_df = pd.DataFrame(0, index=range(1, length),
                                       columns=['A'])
                temp_df = pd.concat([temp_df, percent], axis=1)
                percent = temp_df.fillna(0)

                refname = options.outdir + refname.strip("-")
                outfile = refname + ".csv"
                outfig = refname + ".eps"

                percent.to_csv(outfile)

                g = sns.factorplot(x=percent.index, y="Percent",
                                   data=percent, size=8, kind="bar",
                                   palette="Blues")
                g.set_xlabels('position from 5\' end')
                g.set_xticklabels(rotation=90)
                g.savefig(outfig, format='eps')

                values = []
                refname = line.reference_name
            else:
                refname = line.reference_name

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    fastafile = IOTools.open_file(options.stdin.name)
    fasta = FastaIterator.FastaIterator(fastafile)

    for line in fasta:
        chrom = line.title
        total_len = len(line.sequence)
        trna_list = []
        string = None
        n = 0
        # split the sequence into runs of upper- and lower-case letters
        for letter in line.sequence:
            n += 1
            if n == 1:
                string = letter
            else:
                if string.isupper() and letter.isupper():
                    string = str(string) + str(letter)
                elif string.isupper() and letter.islower():
                    trna_list.append(string)
                    string = letter
                elif string.islower() and letter.islower():
                    string = str(string) + str(letter)
                elif string.islower() and letter.isupper():
                    trna_list.append(string)
                    string = letter
        trna_list.append(string)

        start = 1
        end = 1
        chrom = line.title
        for sequence in trna_list:
            start = end
            end = start + len(sequence)
            if sequence.islower():
                strand = chrom.split("(")[1].split(")")[0]
                options.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
                                     (chrom, start, end, chrom, ".", strand))

    E.stop()
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m", "--method", dest="methods", type=str, action="append",
        choices=("translate",
                 "translate-to-stop",
                 "truncate-at-stop",
                 "back-translate",
                 "mark-codons",
                 "apply-map",
                 "build-map",
                 "pseudo-codons",
                 "filter",
                 "interleaved-codons",
                 "map-codons",
                 "remove-gaps",
                 "mask-seg",
                 "mask-bias",
                 "mask-codons",
                 "mask-incomplete-codons",
                 "mask-stops",
                 "mask-soft",
                 "map-identifier",
                 "nop",
                 "remove-stops",
                 "upper",
                 "lower",
                 "reverse-complement",
                 "sample",
                 "shuffle"),
        help="method to apply to sequences.")

    parser.add_argument(
        "-p", "--parameters", dest="parameters", type=str,
        help="parameter stack for methods that require one ")

    parser.add_argument(
        "-x", "--ignore-errors", dest="ignore_errors", action="store_true",
        help="ignore errors.")

    parser.add_argument("--sample-proportion", dest="sample_proportion",
                        type=float,
                        help="sample proportion.")

    parser.add_argument(
        "--exclude-pattern", dest="exclude_pattern", type=str,
        help="exclude all sequences with ids matching pattern ")

    parser.add_argument(
        "--include-pattern", dest="include_pattern", type=str,
        help="include only sequences with ids matching pattern ")

    parser.add_argument(
        "--filter-method", dest="filter_methods", type=str, action="append",
        help="filtering methods to apply ")

    parser.add_argument(
        "-t", "--sequence-type", dest="type", type=str,
        choices=("aa", "na"),
        help="sequence type (aa or na) . This option determines "
        "which characters to use for masking.")

    parser.add_argument(
        "-l", "--template-identifier", dest="template_identifier", type=str,
        help="template for numerical identifier "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.add_argument(
        "--map-tsv-file", dest="map_tsv_file", type=str,
        help="input filename with map for identifiers. "
        "The first row is a header")

    parser.add_argument(
        "--fold-width", dest="fold_width", type=int,
        help="fold width for sequence output. 0 is unfolded ")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
        input_filename_fasta="-",
        input_filename_map=None,
        fold_width=80)

    (args, unknown) = E.start(parser, unknowns=True)

    if len(unknown) > 0:
        args.input_filename_fasta = unknown[0]

    args.parameters = args.parameters.split(",")

    rx_include, rx_exclude = None, None
    if args.include_pattern:
        rx_include = re.compile(args.include_pattern)
    if args.exclude_pattern:
        rx_exclude = re.compile(args.exclude_pattern)

    iterator = FastaIterator.FastaIterator(args.stdin)

    nseq = 0

    map_seq2nid = {}

    map_identifier = ("apply-map" in args.methods or
                      "map-identifier" in args.methods)
    if map_identifier:
        if args.input_filename_map is None:
            raise ValueError("for method=map-identifier use --map-tsv-file")
        with iotools.open_file(args.input_filename_map) as infile:
            map_identifier = iotools.read_map(infile, has_header=True)

    if args.type == "na":
        mask_chars = args.na_mask_chars
        mask_char = args.na_mask_char
    else:
        mask_chars = args.aa_mask_chars
        mask_char = args.aa_mask_char

    if "map-codons" in args.methods:
        map_codon2code = iotools.ReadMap(open(args.parameters[0], "r"))
        del args.parameters[0]

    if "mask-soft" in args.methods:
        f = args.parameters[0]
        del args.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in args.methods or "back-translate" in args.methods:

        # open a second stream to read sequences from
        f = args.parameters[0]
        del args.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "sample" in args.methods:
        if not args.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = args.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in args.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [line[:-1]
                              for line in iotools.open_file(f.split("=")[1])]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by 3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    iterator = pysam.FastxFile(args.input_filename_fasta)

    c = E.Counter()

    fold_width = args.fold_width

    def fold(s, w):
        return "\n".join([s[x:x + w] for x in range(0, len(s), w)])

    for record in iterator:
        c.nseq += 1
        c.input += 1
        nseq += 1

        sequence = re.sub(" ", "", record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(record.name):
            c.skipped += 1
            continue

        if rx_exclude and rx_exclude.search(record.name):
            c.skipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or record.name in filter_id_list):
            c.skipped += 1
            continue

        for method in args.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                ls = len(re.sub('[%s]' % args.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        record.name, ls)
                    c.errors += 1
                    if args.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (record.name, other_record.title))

                other_sequence = re.sub("[ %s]" % args.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % args.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in args.gap_chars:
                        codon = args.gap_char * 3
                    else:
                        codon = other_sequence[x:x + 3]
                        x += 3
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = " ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                chars = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = args.na_mask_char
                elif method == "remove-stops":
                    char = args.gap_char

                for x in sequence:

                    if x not in args.gap_chars:
                        codon.append(x.upper())

                    chars.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):

                            for xx in chars:
                                if xx in args.gap_chars:
                                    new_sequence.append(xx)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += chars

                        chars = []
                        codon = []

                new_sequence += chars

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # get next hard-masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break

                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # check lengths of unmasked and soft-masked sequences match
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (record.name))

                # if the hard-masked seq contains repeats (N), replace N
                # with lowercase sequence from the unmasked version
                if sequence == hm_sequence:
                    pass
                else:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":

                seq = []
                for s in sequence:
                    if s in args.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                sequence = " ".join([sequence[x:x + 3]
                                     for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match("^(\S+)", record.name).groups()[0]
                if id in map_seq2nid:
                    rest = record.name[len(id):]
                    record.name = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match("^(\S+)", record.name).groups()[0]
                new_id = args.template_identifier % nseq
                if id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" % id)
                map_seq2nid[id] = new_id
                record.name = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([x for x in seq[x:x + 3] if x in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (record.name, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths "
                        "%i - %i" %
                        (record.name, len(other_sequence) * 3,
                         len(sequence)))

                seq = list(sequence)
                pos = 0
                for x in other_sequence:
                    if x in args.aa_mask_chars:
                        if x.isupper():
                            seq[pos:pos + 3] = [args.na_mask_char.upper()] * 3
                        else:
                            seq[pos:pos + 3] = [args.na_mask_char.lower()] * 3
                    pos += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            c.skipped += 1
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            c.skipped += 1
            continue

        record.sequence = sequence
        if fold_width >= 0:
            if record.comment:
                args.stdout.write(">{} {}\n{}\n".format(
                    record.name, record.comment,
                    fold(record.sequence, fold_width)))
            else:
                args.stdout.write(">{}\n{}\n".format(
                    record.name, fold(record.sequence, fold_width)))
        else:
            args.stdout.write(str(record) + "\n")
        c.output += 1

    if "build-map" in args.methods:
        p = args.parameters[0]
        if p:
            outfile = iotools.open_file(p, "w")
        else:
            outfile = args.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in list(map_seq2nid.items()):
            outfile.write("%s\t%s\n" % (old_id, new_id))

        if p:
            outfile.close()

    E.info(c)
    E.stop()
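# A minimal usage sketch for this script; the script name and file names
# are illustrative:
#
#   cat peptides.fasta | python fasta2fasta.py --method=upper
#   cat cds.fasta | python fasta2fasta.py --method=translate --fold-width=60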