def maskSequences(self, sequences):
    '''mask a collection of sequences.'''

    with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as outf:
        for x, s in enumerate(sequences):
            outf.write(">%i\n%s\n" % (x, s))
        infile = outf.name

    statement = self.mCommand % locals()

    E.debug("statement: %s" % statement)

    s = subprocess.Popen(statement,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)

    (out, err) = s.communicate()

    if s.returncode != 0:
        raise RuntimeError(
            "Error in running %s \n%s\nTemporary directory" %
            (statement, err))

    result = [
        x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))]

    os.remove(infile)

    return result
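# A minimal usage sketch for maskSequences above: the method expects the
# enclosing class to provide ``mCommand``, a printf-style template expanded
# against locals(), so ``%(infile)s`` refers to the temporary fasta it writes.
# The subclass name and command line below are illustrative assumptions, not
# this module's actual definitions:
#
#   class DustMasker(Masker):
#       mCommand = "dustmasker -in %(infile)s -outfmt fasta"
#
#   masked = DustMasker().maskSequences(["ACGTACGTACGT", "acgtACGTacgt"])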
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob(
        "mispriming.dir/*.lib")[0]
    primer_thermodynamics_parameters_path = PARAMS[
        "general_primer_thermodynamics_parameters_path"]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.open_file(fasta)

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)

    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.open_file(fasta)):
        if f.title in ids:
            outf = IOTools.open_file(os.path.join(
                "input.dir",
                f.title.replace(" ", "_").replace("/", "_") +
                ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.items():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (
                        key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n" % seq)
            outf.write("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=%s\n=\n" %
                       primer_thermodynamics_parameters_path)
            outf.close()
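# ``readIdentifiers`` is called above (and again in buildMisprimingLib below)
# but is not defined in this excerpt.  A minimal sketch of what such a helper
# might look like, assuming the identifiers file holds one identifier per line
# in its first column; this is an illustration, not the pipeline's actual code.
def readIdentifiers(identifiers):
    '''return a set of sequence identifiers read from a tab-separated file.'''
    with IOTools.open_file(identifiers) as inf:
        return set(line.split("\t")[0].strip() for line in inf if line.strip())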
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, return CpG content file
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    P.run()
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"],
                     PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id, ave FROM
                           (SELECT contig_id, AVG(coverage) as ave
                            FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    # debug output of the contigs that passed the coverage filter
    print(contigs)
    for fasta in FastaIterator.iterate(iotools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def RenameFastaTitle(fastafile, file2tax, outfile):

    taxonomy = file2tax[fastafile]
    suffix = 1
    for fasta in fastaiterator.iterate(iotools.open_file(fastafile)):
        suffix_str = "(" + str(suffix) + ")"
        new_title = taxonomy + suffix_str
        suffix += 1
        outfile.write(">" + new_title + "\n" + fasta.sequence + "\n")
def specformatter(Infile, Outfile):

    infile = iotools.open_file(Infile)
    fastas = fastaiterator.iterate(infile)
    outfile = open(Outfile, "w")

    for fasta in fastas:
        name = fasta.title.split(";")[6]
        specID = name.split("(")[1]
        specID = specID[:-1]
        genusspecies = name.split("(")[0]
        genus = genusspecies.split("_")[0]
        species = genusspecies.split("_")[1]
        newtitle = " ".join([specID, genus, species])
        outfile.write(">" + newtitle + "\n" + fasta.sequence + "\n")

    outfile.close()
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.open_file(fasta)

    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.open_file(outfile, "w")

    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.open_file(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))

    outf.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--outputdir", dest="outdir", type="string",
                      help="output directory to save plots")

    parser.add_option("-f", "--fasta", dest="fasta_file", type="string",
                      help="fasta file containing tRNA cluster fasta seqs")

    parser.set_defaults(fasta_file=None,
                        outdir=None)

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    dict_trna = {}
    for record in FastaIterator.iterate(IOTools.open_file(options.fasta_file)):
        title = record.title.strip("-")
        length = len(record.sequence)
        dict_trna[title] = length

    # For each read in the bamfile find the end position and then plot this
    # using the length of the tRNA cluster
    samfile = pysam.AlignmentFile(options.stdin.name, "rb")
    refname = ""
    values = []
    n = 0
    for line in samfile:
        if line.reference_name == refname:
            if line.reference_end is None:
                pass
            else:
                end = int(line.reference_end) - int(line.reference_start)
                values.append(end)
        elif line.reference_name != refname:
            n += 1
            if n > 1:
                values = pd.Series(values)
                percent = values.value_counts() / values.count() * 100
                percent = percent.sort_index()
                percent = pd.DataFrame(percent)
                percent.rename(columns={0: 'Percent'}, inplace=True)

                # length of each tRNA from fasta
                length = dict_trna[refname.strip("-")] + 1
                temp_df = pd.DataFrame(0, index=range(1, length), columns=['A'])
                temp_df = pd.concat([temp_df, percent], axis=1)
                percent = temp_df.fillna(0)

                refname = options.outdir + refname.strip("-")
                outfile = refname + ".csv"
                outfig = refname + ".eps"

                percent.to_csv(outfile)

                # note: recent seaborn releases rename factorplot/size
                # to catplot/height
                g = sns.factorplot(x=percent.index, y="Percent",
                                   data=percent, size=8,
                                   kind="bar", palette="Blues")
                g.set_xlabels("position from 5' end")
                g.set_xticklabels(rotation=90)
                g.savefig(outfig, format='eps')

                values = []
                refname = line.reference_name
            else:
                refname = line.reference_name

    E.stop()
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--correct-gap-shift", dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1", "--pattern1", dest="pattern1", type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2", "--pattern2", dest="pattern2", type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option("-o", "--output-section", dest="output",
                      type="choice", action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1="(\S+)",
                        pattern2="(\S+)",
                        output=[])

    (options, args) = E.start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(iotools.open_file(args[0], "r"))])
    seqs2 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(iotools.open_file(args[1], "r"))])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"
            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    l = len(differences)

                    # check for Selenocysteines
                    if len([x for x in differences
                            if x[0] == "U" or x[1] == "U"]) == l:
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif len([x for x in differences
                              if x[0] in "NX" or x[1] in "NX"]) == l:
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" %
                              (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)

                    # check if we have reached the end:
                else:
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(list(seqs2.keys())):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, "
        "masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.stop()
twoA = 0
twoG = 0
twoT = 0
twoC = 0
twoO = 0
thrA = 0
thrG = 0
thrT = 0
thrC = 0
thrO = 0
i = 0

# Iterate over the transcripts and tally the base observed at the first and
# second positions of each sequence.  The oneA/oneG/oneT/oneC/oneO counters
# are assumed to be initialised earlier in the script (not shown in this
# excerpt).
for transcript in FastaIterator.iterate(infile):
    if transcript.sequence[0] == "A":
        oneA += 1
    elif transcript.sequence[0] == "G":
        oneG += 1
    elif transcript.sequence[0] == "T":
        oneT += 1
    elif transcript.sequence[0] == "C":
        oneC += 1
    else:
        oneO += 1
    if transcript.sequence[1] == "A":
        twoA += 1
    elif transcript.sequence[1] == "G":
        twoG += 1
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-c", "--is-cds", dest="is_cds", action="store_true",
                        help="input are cds (nucleotide) sequences ")

    parser.set_defaults(
        is_cds=False,
    )

    (args) = E.start(parser, argv=argv)

    args.stdout.write(
        "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate(args.stdin):

        identifier = entry.title

        if args.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate(cds_sequence)
            weights = []

            for pos, cds_pos in enumerate(range(0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos + 3]
                counts = collections.defaultdict(int)
                for x in range(0, 3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna:
                            continue
                        taa = Genomics.translate(
                            codon[:x] + na + codon[x + 1:])
                        counts[taa] += 1
                weights.append(counts)

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet:
                counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate(sequence):
            if ref not in alphabet:
                continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref:
                    continue
                snpid += 1
                args.stdout.write(
                    "%s\n" % "\t".join(
                        ("%010i" % snpid,
                         identifier,
                         str(pos + 1),
                         ref,
                         variant,
                         "%i" % w[variant],
                         "%6.4f" % (w[variant] / t),
                         )))

    E.stop()
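# Worked example for the cds-mode weighting above, as a quick sanity check:
# for the codon ATG (Met) the nine possible single-nucleotide substitutions
# translate to L:2, V:1, K:1, T:1, R:1 and I:3, so t = 9 and the reported
# weight for variant I is 3/9 = 0.3333, while e.g. variant V gets 1/9 = 0.1111.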
def findTATABox(infiles, outfile):
    '''find TATA box in promotors. There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    #
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | cgat bed2fasta
      --use-strand
      --genome=%(genome_dir)s/%(genome)s
      --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''
    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    match_profile = 'minFP_good.prf'
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
 MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with iotools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
    %(match_executable)s %(match_matrix)s
    %(outfile)s.fasta %(outfile)s.match %(match_profile)s -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(iotools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            "(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity sequence")

    def _grouper(infile):
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                    r = []
                pid = re.match(
                    "Inspecting sequence ID\s+(\S+)", line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(
                MATCH._make((pid, transfac_id, int(pos), strand,
                             float(core_similarity),
                             float(matrix_similarity),
                             sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = iotools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand",
                          "start", "end",
                          "relative_start", "relative_end",
                          "transfac_id",
                          "core_similarity",
                          "matrix_similarity",
                          "sequence")) + "\n")

    bedf = iotools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(
            iotools.openFile(outfile + ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(map(str, (
                transcript_id, strand,
                genome_start, genome_end,
                relative_start, relative_end,
                match.transfac_id,
                match.core_similarity,
                match.matrix_similarity,
                match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(map(str, (
                contig, genome_start, genome_end,
                transcript_id, strand,
                match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with iotools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("-p", "--output-proportion", dest="proportion",
                      action="store_true",
                      help="output proportions - overrides the default output")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # do not allow greater than octonucleotide
    assert options.kmer <= 8, "cannot handle kmer of length %i" % options.kmer

    # how we deal with the nucleotides depends on the kmer length
    nucleotides = []
    for nucleotide in ["A", "C", "T", "G"]:
        nucleotides = nucleotides + \
            [x for x in itertools.repeat(nucleotide, options.kmer)]

    E.info("retrieving %imer sequences" % options.kmer)
    # get all kmer sequences to query
    kmers = set()
    for kmer in itertools.permutations(nucleotides, options.kmer):
        kmers.add(kmer)

    E.info("matching %imers in file" % options.kmer)
    # count the number of kmers in each sequence
    result = {}
    # NB assume that non fasta files are caught by FastaIterator
    total_entries = 0
    for fasta in FastaIterator.iterate(options.stdin):
        total_entries += 1
        result[fasta.title] = {}
        for kmer in kmers:
            counts = [m.start()
                      for m in re.finditer("".join(kmer), fasta.sequence)]
            result[fasta.title][kmer] = len(counts)

    E.info("writing results")
    # write out the results
    headers = sorted(result.keys())
    rows = set()
    for kmer_counts in list(result.values()):
        for kmer, count in kmer_counts.items():
            rows.add("".join(kmer))

    # write header row
    options.stdout.write("kmer\t" + "\t".join(headers) + "\n")

    # output proportions if required - normalises by
    # sequence length
    E.info("computing total counts")
    totals = {}
    for header in headers:
        totals[header] = sum([result[header][tuple(row)] for row in rows])

    for row in sorted(rows):
        if options.proportion:
            options.stdout.write("\t".join(
                [row] +
                [str(float(result[header][tuple(row)]) / totals[header])
                 for header in headers]) + "\n")
        else:
            options.stdout.write("\t".join(
                [row] +
                [str(result[header][tuple(row)])
                 for header in headers]) + "\n")

    E.info("written kmer counts for %i contigs" % total_entries)

    # write footer and output benchmark information.
    E.stop()
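# The permutation-based enumeration in the script above walks every arrangement
# of a 4*k-element nucleotide pool before deduplicating, which is far more work
# than the 4**k distinct kmers it ultimately produces.  A minimal alternative
# sketch using itertools.product follows; this is an observation about the
# approach, not a change to the script's behaviour.  itertools is already
# imported by the script; the import is repeated so the sketch stands alone.
import itertools


def enumerate_kmers(k):
    '''return the set of all 4**k DNA kmers as tuples of single characters,
    matching the tuple keys used by the counting code above.'''
    return set(itertools.product("ACTG", repeat=k))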
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i", "--input-pattern", dest="input_pattern", type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o", "--output-filename-pattern", dest="output_pattern", type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n", "--num-sequences", dest="num_sequences", type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m", "--map", dest="map_filename", type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s", "--skip-identifiers", dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size", dest="min_size", type="int",
                      help="minimum cluster size.")

    parser.set_defaults(input_filename=None,
                        map_filename=None,
                        skip_identifiers=False,
                        input_pattern="^(\S+)",
                        min_size=0,
                        num_sequences=None,
                        output_pattern="%s")

    (options, args) = E.start(parser)

    if options.input_filename:
        infile = iotools.open_file(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = iotools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)
    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print("# parsing error in description line %s" % (seq.title))
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # cluster sizes are only available once both the fasta
    # file and the map have been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.stop()
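# ``Files`` and ``FilesChunks`` are used by main() above but are not part of
# this excerpt.  A minimal sketch of the simpler of the two, inferred from how
# main() calls it (Write and DeleteFiles); the internals below are assumptions
# for illustration, not the script's actual classes.
import collections
import os


class Files:

    def __init__(self, output_pattern="%s", skip_identifiers=False):
        self.output_pattern = output_pattern
        self.skip_identifiers = skip_identifiers
        # number of sequences written per output file
        self.counts = collections.defaultdict(int)

    def Write(self, identifier, seq):
        '''append *seq* to the output file derived from *identifier*.'''
        filename = self.output_pattern % identifier
        with open(filename, "a") as outf:
            if self.skip_identifiers:
                outf.write("%s\n" % seq.sequence)
            else:
                outf.write(">%s\n%s\n" % (seq.title, seq.sequence))
        self.counts[filename] += 1

    def DeleteFiles(self, min_size=0):
        '''remove output files holding fewer than *min_size* sequences and
        return the number of files deleted.'''
        ndeleted = 0
        for filename, counts in self.counts.items():
            if counts < min_size:
                os.remove(filename)
                ndeleted += 1
        return ndeleted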