def rename(direc, db, files):
    '''
    This isn't really for bioinformatics, this is more for the pipeline, to
    rename the files generated by cluster.py with a little human interaction.
    '''
    names = []
    seqdb = dict((s.name, s) for s in io.open(db, 'r'))
    nt_dir, aa_dir = direc + 'nt' + sep, direc + 'aa' + sep
    for f in files:
        seq = io.open(nt_dir + f, 'r').next()
        ids = seq.defline.split(', ')
        print("File\033[33;1m", f,
              "\033[0mis described by the following sequences:")
        try:
            for id in ids:
                seqdb[id]
                print("* " + seqdb[id].name + ': ' +
                      seqdb[id].defline.split('[')[0])
        except KeyError:
            print("* (none)")
            continue
        pre = get_input("\033[33;1mWhat should we call this file " +
                        "(or hit enter to skip)? \033[0m")
        fpre = f[:f.find('.')]
        if pre != "":
            count = 0
            while True:
                rpre = pre + ((" (%d)" % count) if count > 0 else "")
                try:
                    fh = open(nt_dir + rpre + ".fasta", 'r')
                    fh.close()
                    count += 1
                    continue
                except IOError:
                    nt_old, nt_new = nt_dir + fpre, nt_dir + rpre
                    aa_old, aa_new = aa_dir + fpre, aa_dir + rpre
                    print("Renaming " + fpre + ".* to " + rpre + ".*")
                    try:
                        mv(nt_old + ".fasta", nt_new + ".fasta")
                        mv(aa_old + ".fasta", aa_new + ".fasta")
                        mv(nt_old + ".clustalw", nt_new + ".clustalw")
                        mv(aa_old + ".clustalw", aa_new + ".clustalw")
                        names.append(nt_new + '.clustalw')
                        names.append(aa_new + '.clustalw')
                    except OSError:
                        pass
                    break
    return names

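# A minimal usage sketch for rename(), assuming the layout produced by the
# clustering step (a working directory holding nt/ and aa/ subdirectories of
# cluster fastas and alignments). The directory and database names below are
# hypothetical placeholders, not pipeline defaults.
def _example_rename(direc='run1' + sep, db='reference.fasta'):
    from os import listdir
    fastas = [f for f in listdir(direc + 'nt' + sep) if f.endswith('.fasta')]
    # rename() prompts for a human-readable name for each cluster and returns
    # the .clustalw paths under their new names.
    return rename(direc, db, fastas)
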
def run(infile, outfile, **kwargs):
    n = 0
    for seq in io.open(infile, 'r'):
        n += 1
        if n > 1:
            seqtype = seq.type
            break
    if n > 1:
        cmd = "clustalw"
        try:
            ignore = open('/dev/null', 'w')
        except IOError:
            ignore = open('nul', 'w')
        if seqtype == 'nucl':
            defaults = {
                'OUTORDER': 'ALIGNED',
                'GAPOPEN': '10',
                'GAPEXT': '0.1',
                'DNAMATRIX': 'IUB'
            }
            others = ["-%s=%s" % (arg, kwargs.get(arg, defaults[arg]))
                      for arg in set(name.upper() for name in kwargs) &
                      set(defaults.keys())]
            subprocess.call([cmd, "-INFILE=" + infile, "-ALIGN",
                             "-TYPE=DNA", "-OUTFILE=" + outfile] + others,
                            stdout=ignore)
        else:
            defaults = {
                'OUTORDER': 'ALIGNED',
                'GAPOPEN': '10',
                'GAPEXT': '0.1',
                'MATRIX': 'BLOSUM'
            }
            others = ["-%s=%s" % (arg, kwargs.get(arg, defaults[arg]))
                      for arg in set(name.upper() for name in kwargs) &
                      set(defaults.keys())]
            subprocess.call([cmd, "-INFILE=" + infile, "-ALIGN",
                             "-TYPE=PROTEIN", "-OUTFILE=" + outfile] + others,
                            stdout=ignore)
        pos = infile.rfind('.')
        if pos > -1:
            prefix = infile[:pos]
        else:
            prefix = infile
        remove(prefix + '.dnd')

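# A usage sketch for the clustalw wrapper above, assuming the `clustalw`
# binary is on the PATH and that 'genes.fasta' (a hypothetical multi-FASTA
# file) contains at least two sequences; with fewer than two the wrapper does
# nothing. Option names are matched case-insensitively against the defaults,
# but values are looked up under the upper-cased key, so pass overrides in
# upper case.
def _example_clustal():
    run('genes.fasta', 'genes.clustalw', GAPOPEN='12', GAPEXT='0.2')
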
def var(files):
    '''
    Returns plot data and metadata for plotting later on in the pipeline.
    '''
    sort = {}
    for f in files:
        seqs = [s for s in io.open(f)]
        type = set(s.type for s in seqs)
        if len(type) > 1:
            type = set(['prot'])
        fid = (type.pop(), f)
        seqs = [''.join(s.seq.split('-')).strip() for s in seqs]
        seqs = [translate(s) if fid[0] == 'nucl' else s for s in seqs]
        sset = frozenset(seqs)
        srtr = (len(seqs), sset)
        sort[srtr] = sort.get(srtr, set()) | set([fid])
    couples = []
    for partners in sort.values():
        # Strip the alignment extension so matching nt/aa files share a name.
        trim = lambda x: '.'.join(x.split('.')[:-1]) \
            if x.endswith('.clustalw') or \
            x.endswith('.clustal') or \
            x.endswith('.aln') else x
        names = ', '.join(set(trim(f.split(sep)[-1]) for type, f in partners))
        pair = {}
        for type, f in partners:
            if len(pair) == 2:
                break
            if type in pair:
                continue
            pair[type] = f
        if 0 < len(pair) < 2:
            raise TypeError("Unmatched clustal alignment(s): " +
                            ", ".join(f for type, f in partners))
        if len(pair) == 0:
            continue
        couples.append((pair['nucl'], pair['prot'], names))
    for nt, aa, strain in couples:
        plotdata = {
            'nt': SaySNPs(nt),
            'aa': SaySNPs(aa)
        }
        metadata = {'strain': strain, 'filename': strain + '.pdf'}
        yield {'plotdata': plotdata, 'metadata': metadata}
    raise StopIteration

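# A usage sketch for var(): it expects matching nucleotide and amino acid
# alignments of the same clusters (for example the nt/<digest>.clustalw and
# aa/<digest>.clustalw pairs written by the clustering step) and yields one
# plotdata/metadata dictionary per matched pair. The paths here are
# hypothetical.
def _example_var():
    pairs = ['nt' + sep + 'cluster1.clustalw',
             'aa' + sep + 'cluster1.clustalw']
    for item in var(pairs):
        print(item['metadata']['filename'], len(item['plotdata']['nt']['var']))
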
def SaySNPs(input):
    '''
    Takes a clustalw alignment and will return a dictionary of data relevant
    to plotting the sequence variance for the sequences in the given clustalw
    alignment. These data are:
    * `var`: the measure of sequence variation,
    * `starts`: the starting positions for each gene model in amino acids,
    * `ends`: the ending positions for each gene model in amino acids, and
    * `count`: the number of sequences with a particular gene model.
    The values given in `starts`, `ends`, and `count` are sorted so that the
    nth element in `starts` corresponds to the nth value in `ends` and the
    nth value in `count`.
    '''
    catalogue = []
    lengths = {}
    for seq in io.open(input, 'r'):
        key = (seq.start, seq.end)
        lengths[key] = lengths.get(key, 0) + 1
        for (i, c) in zip(xrange(key[1] - key[0] + 1), seq):
            if i >= len(catalogue):
                catalogue.append({})
            if c != " ":
                catalogue[i][c] = catalogue[i].get(c, 0) + 1
    calc = []
    for s in catalogue:
        tot = float(sum(s.values()))
        calc.append(1.0 - sum((s[c] / tot) ** 2 for c in s))
    llist = sorted(list(lengths.keys()))
    return {
        'var': calc,
        'starts': [s for s, e in llist],
        'ends': [e for s, e in llist],
        'count': [lengths[k] for k in llist]
    }

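# The per-column score computed above is one minus the sum of squared residue
# frequencies (a Gini-Simpson style index): 0.0 for a perfectly conserved
# column, approaching 1.0 as the column grows more mixed. A minimal worked
# example on a hypothetical alignment column of five residues:
def _example_column_variation():
    column = {'A': 3, 'T': 2}  # hypothetical residue counts for one column
    tot = float(sum(column.values()))
    # 1 - (0.6 ** 2 + 0.4 ** 2) = 0.48
    return 1.0 - sum((n / tot) ** 2 for n in column.values())
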
def run(db, sfile, mega_blast=False, **kwargs):
    '''
    Takes a database and a query and runs the appropriate type of BLAST on
    them. The database can be an existing BLAST database or a fasta/fastq
    file. If it is a sequence file, this function will look in the places
    where BLAST would look for an existing database created from that file
    and use that instead. If there is no such database, this function will
    make one for you and then use the newly created database with BLAST.

    Optional named arguments can currently only be `evalue`, `num_threads`,
    `gapopen`, or `gapextend`. They correspond to the BLAST options of the
    same name.
    '''
    cmds = {
        'prot': {'prot': 'blastp', 'nucl': 'tblastn'},
        'nucl': {'nucl': 'blastn', 'prot': 'blastx'}
    }
    seq = io.open(sfile, 'r').next()
    qtype = seq.type

    # Pull a BLASTDB location out of any .ncbirc files on the search path.
    rcloc = ''
    for loc in (".:~:" + (getenv("NCBI") or "")).split(':'):
        if loc and loc[-1] != sep:
            loc += sep
        try:
            for line in (l.strip() for l in open(loc + '.ncbirc', 'r')):
                pos = line.find('=')
                if pos >= 0 and line[:pos].strip() == "BLASTDB":
                    rcloc = line[pos + 1:].strip()
        except IOError:
            pass

    # Look for an existing formatted database (protein or nucleotide).
    dbtype = None
    bdbenv = getenv("BLASTDB")
    dblocations = (":." + ((':' + bdbenv) if bdbenv else '') +
                   ((':' + rcloc) if rcloc else '')).split(':')
    for loc in dblocations:
        if loc and loc[-1] != sep:
            loc += sep
        try:
            open(loc + db + '.pin', 'r')
            dbtype = 'prot'
            break
        except IOError:
            try:
                open(loc + db + '.nin', 'r')
                dbtype = 'nucl'
                break
            except IOError:
                pass

    if not dbtype:
        # No formatted database found; treat db as a sequence file and find
        # or build a database from it.
        odb = db
        pos = db.rfind(".")
        for seq in io.open(db, 'r'):
            dbtype = seq.type
            break
        if not dbtype:
            raise IOError("Database not found: " + odb)
        ndb = None
        sp = db.rfind(sep)
        if sp > -1:
            dbdir, db = db[:sp], db[sp + 1:pos]
        else:
            dbdir, db = '.', db[:pos]
        # Check whether a database built from this file already exists in the
        # same directory by reading the title stored in each index file.
        for file in listdir(dbdir):
            dpos = file.rfind('.')
            if dpos >= 0 and file[dpos + 1:] == dbtype[0] + 'in':
                fh = open(dbdir + sep + file, 'r')
                c = ord(fh.read(12)[-1])
                fname = fh.read(c)
                if fname[0] in ("'", '"'):
                    fname = fname[1:-1]
                if fname.endswith(odb):
                    ndb = dbdir + sep + file[:dpos]
                    break
        if not ndb:
            ndb = '_'.join(db.split())
            try:
                ignore = open('/dev/null', 'w')
            except IOError:
                ignore = open('nul', 'w')
            try:
                # possible race condition
                open(ndb, 'r').close()
            except IOError:
                subprocess.call(["makeblastdb", "-in", '"%s"' % odb,
                                 "-out", ndb, "-dbtype", dbtype],
                                stdout=ignore)
                try:
                    for suff in ['in', 'hr', 'sq']:
                        name = ndb + '.' + dbtype[0] + suff
                        shutil.move(name, dbdir + sep + name)
                except shutil.Error:
                    pass
            db = dbdir + sep + ndb
        else:
            db = ndb

    allowed = set(["evalue", "gapopen", "gapextend", "num_threads"]) & \
        set(kwargs.keys())
    cmd = cmds[qtype][dbtype]
    pn = ["-db", "-query"]
    if mega_blast:
        cmd = "megablast"
        pn = ["-d", "-i"]
        allowed = set(["e", "a"]) & set(kwargs.keys())
    proc = subprocess.Popen([cmd, pn[0], db, pn[1], sfile] +
                            [arg for pair in [["-" + k, str(kwargs[k])]
                                              for k in allowed]
                             for arg in pair],
                            bufsize=1, stdout=subprocess.PIPE)
    return Result(iter(proc.stdout.readline, ''))

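# A usage sketch for the BLAST wrapper above, assuming the NCBI BLAST+
# binaries (blastn/blastp/blastx/tblastn and makeblastdb) are on the PATH.
# The file names are hypothetical. The program is picked from the query and
# database types (e.g. a nucleotide query against a protein database runs
# blastx), and iterating over the returned Result yields one hit record at a
# time.
def _example_blast():
    for hit in run('reference.fasta', 'queries.fasta',
                   evalue=1e-5, num_threads=4):
        print(hit['query']['name'], hit['subject']['name'], hit['expect'])
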
def GeneFromBLAST(db, sequences, pref, names):
    '''
    BLASTs database against sequences, and for those results that pass the
    length and percent identity requirements, attempts to locate the full
    gene that corresponds to that BLAST hit. Genes that are found are saved
    in the subdirectory sequences under the given directory, divided
    depending on whether the sequence is amino acid or nucleotide.
    '''
    PIPING = True
    wd = options.DIRECTORY + 'sequences' + sep
    for d in [options.DIRECTORY, wd]:
        try:
            mkdir(d)
        except OSError:
            pass
    subj = dict((s.name, s) for s in io.open(db, 'r'))
    options.debug("Database sequences loaded from file %s." % db)
    try:
        orfs = dict((s.name, [orf for orf in ORFGenerator(s)])
                    for s in io.open(sequences, 'r'))
        options.debug("ORFs loaded from file %s." % sequences)
    except IOError:
        options.debug("No file \"" + sequences + "\", skipping.")
        return

    def target():
        while 1:
            try:
                res = qin.get(PIPING, 1)
            except queue.Empty:
                if not PIPING:
                    break
                else:
                    continue
            qname, sname = res['query']['name'], res['subject']['name']
            start, end = res['query']['start'], res['query']['end']
            alignments = []
            max_match = (options.MIN_IDENTITY, None)
            if subj[sname].type == 'nucl':
                subject = translate(subj[sname])
            else:
                subject = subj[sname]
            while qname:
                try:
                    o = orfs[qname]
                    break
                except KeyError:
                    qname = qname[:-1]
            if not qname:
                qin.task_done()
                continue
            for orf in o:
                if in_range(orf, start, end, 0):
                    orf = orf[:-3]
                    query = translate(orf)
                    options.debug("Aligning %33s v. %33s." % (qname, sname))
                    alignment = align(subject.seq, query.seq)
                    alignments.append((orf, sname, alignment))
            for orf, refname, aln in alignments:
                hitlen = aln['sublength']
                region = orf[-3 * hitlen:]
                identity = float(aln['identities']) / aln['length']
                if identity >= max_match[0]:
                    max_match = (identity, (region, sname, aln))
            if max_match[1]:
                seq, name, _ = max_match[1]
                odl = subject.defline.split('[')[0].strip()
                src = seq.original.name
                start, end, strand = seq.start, seq.end, seq.step
                defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \
                    (odl + (' ' if odl else ''), src, start, end, strand)
                new = Sequence(name.strip(), seq.seq, defline=defline,
                               original=seq.original, type=seq.type,
                               start=seq.start, end=seq.end, step=seq.step)
                qout.put(new)
            qin.task_done()

    def in_range(seq, start, end, frame):
        ss, se = sorted((seq.start, seq.end))
        os, oe = sorted((start, end))
        frame = int(frame)
        return (ss < oe and se > os and
                (se % 3 == oe % 3 or ss % 3 == oe % 3))

    qout = queue.Queue()
    qin = ThreadQueue(target)
    blastopts = {
        'evalue': options.MAX_EVALUE,
        'num_threads': options.NUM_THREADS
    }
    for res in BLAST.run(db, sequences, **blastopts):
        if float(res['expect']) > options.MAX_EVALUE:
            continue
        sbjl = len(subj[res['subject']['name']])
        ident = float(res['identities'].split('(')[1][:-2]) / 100
        lerr = float(res['subject']['length']) / sbjl
        if ident >= options.MIN_IDENTITY:
            if lerr >= (1.0 - options.LENGTH_ERR):
                qin.put(res)
    PIPING = False
    options.debug("BLAST done.")
    target()
    qin.join()
    options.debug("Done Aligning sequences.")
    options.debug("Now writing sequences (%d)." % qout.qsize())
    seqs = {}
    nuc_file = io.open(wd + pref + '.fasta', 'w')
    count = 0
    while 1:
        try:
            seq = qout.get(False)
            if seq.seq not in seqs:
                seqs[seq.seq] = set()
            seqs[seq.seq].add(seq)
            nuc_file.write(seq)
            count += 1
            options.debug("Wrote %s (%d)." % (seq.name, count))
        except queue.Empty:
            break
    nuc_file.close()
    options.debug("Done Aligning sequences.")
    gh = io.open(wd + pref + '.gff3', 'w')
    names.append(wd + pref + '.fasta')
    for id in seqs:
        gh.write(ann(seqs[id].copy().pop(), pref, 'gene',
                     homologs=','.join(s.name for s in seqs[id])))
    gh.close()

def run(subject, query, prefix, names):
    GeneFromBLAST(subject, query, prefix, names)


if __name__ == '__main__':
    options.START_CODONS = ['TTG']
    import sys
    f = io.open(sys.argv[1], 'r')
    for seq in f:
        print(seq.name + ' ' + seq.defline)
        for orf in ORFGenerator(seq):
            print('%d ... %d' % (orf.start, orf.end))

def run(direc, inputs):
    '''
    Takes a collection of files generated by gene prediction, creates
    clusters based off of the genes that have homology to those predicted
    genes, and creates new fasta files in the clusters subdirectory under the
    given directory, separated according to whether they are nucleotide or
    amino acid sequences. These new fasta files are then used to create
    clustalw alignments of the genes if more than one sequence exists in the
    fasta file.
    '''
    clusters = {}
    all_ids = set()
    ids = {}
    q = queue.Queue()
    filenames = []

    def run_clustal():
        while not q.empty():
            cid = q.get()
            dig = hashlib.md5()
            dig.update(' '.join(cid))
            dig = dig.hexdigest()
            fpre = direc + 'nt' + sep + dig
            apre = direc + 'aa' + sep + dig
            fname = fpre + ".fasta"
            aname = apre + ".fasta"
            fh = io.open(fname, 'w')
            ah = io.open(aname, 'w')
            for ipt in clusters:
                counter = 0
                name = '_'.join(ipt.split(sep)[-1].split('.')[0].split())
                for cluster in clusters[ipt]:
                    if cid & cluster[0]:
                        nm = name + '_' + str(counter)
                        seq = cluster[1]
                        curr = sequ.Sequence(nm, seq, defline=', '.join(cid))
                        tr = tran.translate(curr)
                        tr.name = curr.name
                        fh.write(curr)
                        ah.write(tr)
                        counter += 1
            fh.close()
            ah.close()
            try:
                clustal.run(fname, fpre + '.clustalw')
                clustal.run(aname, apre + '.clustalw')
                filenames.append(dig + '.fasta')
            except ValueError:
                pass
            q.task_done()

    if direc:
        for d in [direc, direc + 'nt' + sep, direc + 'aa' + sep]:
            try:
                mkdir(d)
            except OSError:
                pass
    for ipt in inputs:
        seqs = {}
        ids[ipt] = set()
        for seq in io.open(ipt, 'r'):
            ids[ipt].add(seq.name)
            all_ids.add(seq.name)
            if seq.seq not in seqs:
                seqs[seq.seq] = set()
            seqs[seq.seq].add(seq.name)
        clusters[ipt] = [(seqs[k], k) for k in seqs]
        del seqs
    sub_ids = []
    while all_ids:
        cid = all_ids.pop()
        subcluster = (all_ids | set([cid])) & \
            set(i for ipt in clusters
                for cluster in clusters[ipt]
                for i in cluster[0] if cid in cluster[0])
        for ipt in clusters:
            for cluster in clusters[ipt]:
                if cid in cluster[0]:
                    subcluster = (subcluster & cluster[0]) | \
                        (subcluster - ids[ipt])
        sub_ids.append(subcluster)
        all_ids -= subcluster
    for cid in sub_ids:
        q.put(cid)
    threads = []
    for i in xrange(options.NUM_PROCESSES - 1):
        curr = threading.Thread(target=run_clustal)
        threads.append(curr)
        curr.start()
    run_clustal()
    q.join()
    return filenames

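# A usage sketch for the clustering step above: given the per-strain gene
# predictions written by GeneFromBLAST, it fills <direc>/nt/ and <direc>/aa/
# with one fasta (and, where a cluster holds more than one sequence, one
# clustalw alignment) per cluster of homologous genes. The directory and
# input names below are hypothetical.
def _example_cluster():
    predictions = ['sequences' + sep + 'strainA.fasta',
                   'sequences' + sep + 'strainB.fasta']
    # Returns the basenames of the cluster fasta files that were processed.
    return run('clusters' + sep, predictions)
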