def target():
    '''
    Worker loop: consume hit records (dicts with 'query' and 'subject'
    entries) from the `qin` queue, find the ORF matching the query name,
    re-align it against the subject, and put the best-scoring trimmed
    Sequence on `qout`.

    Relies on module globals: qin/qout (queues), PIPING (run flag),
    orfs (name -> ORF list), subj (name -> subject Sequence), options,
    translate, align, in_range, Sequence.
    '''
    while 1:
        try:
            # queue.get(block, timeout): block up to 1s while PIPING is truthy.
            res = qin.get(PIPING, 1)
        except queue.Empty:
            if not PIPING:
                # Producer finished and the queue is drained: stop the worker.
                break
            else:
                continue
        qname, sname = res['query']['name'], res['subject']['name']
        start, end = res['query']['start'], res['query']['end']
        alignments = []
        # Best match seen so far as (identity, payload); anything below
        # options.MIN_IDENTITY is discarded because it can never win.
        max_match = (options.MIN_IDENTITY, None)
        if subj[sname].type == 'nucl':
            subject = translate(subj[sname])
        else:
            subject = subj[sname]
        # Query names may carry trailing decoration not present in the
        # `orfs` keys; strip one character at a time until a key matches.
        while qname:
            try:
                o = orfs[qname]
                break
            except KeyError:
                qname = qname[:-1]
        if not qname:
            # No ORF entry matches any prefix of the name; acknowledge and
            # move on to the next record.
            qin.task_done()
            continue
        for orf in o:
            if in_range(orf, start, end, 0):
                orf = orf[:-3]  # drop the trailing stop codon before translating
                query = translate(orf)
                options.debug("Aligning %33s v. %33s." % (qname, sname))
                alignment = align(subject.seq, query.seq)
                alignments.append((orf, sname, alignment))
        for orf, refname, aln in alignments:
            hitlen = aln['sublength']
            # Convert the aligned protein length back to nucleotides
            # (3 nt per residue), keeping the 3' end of the ORF.
            region = orf[-3 * hitlen:]
            identity = float(aln['identities']) / aln['length']
            if identity >= max_match[0]:
                max_match = (identity, (region, sname, aln))
        if max_match[1]:
            seq, name, _ = max_match[1]
            # Original defline text up to the first bracketed attribute.
            odl = subject.defline.split('[')[0].strip()
            src = seq.original.name
            # NOTE(review): strand is taken from seq.step — presumably the
            # slice step encodes strand direction; confirm upstream.
            start, end, strand = seq.start, seq.end, seq.step
            defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \
                (odl + (' ' if odl else ''), src, start, end, strand)
            new = Sequence(name.strip(), seq.seq, defline=defline,
                           original=seq.original, type=seq.type,
                           start=seq.start, end=seq.end, step=seq.step)
            qout.put(new)
        qin.task_done()
def var(files):
    '''
    Returns plot data and metadata for plotting later on in the pipeline.

    Groups `files` by their (gap-stripped, translated) sequence content so
    that a nucleotide file and its protein counterpart land in the same
    group, then yields one payload per matched pair:

        {'plotdata': {'nt': SaySNPs(nt), 'aa': SaySNPs(aa)},
         'metadata': {'strain': names, 'filename': names + '.pdf'}}

    Raises TypeError if a group contains files of only one type (an
    unmatched clustal alignment).
    '''
    # Strip clustal-style extensions from a basename for display purposes.
    # Fixed: the condition previously tested the stale loop variable `f`
    # (whatever file the grouping loop ended on) instead of the argument
    # `x`, so trimming silently depended on unrelated input order.
    trim = lambda x: '.'.join(x.split('.')[:-1]) \
        if x.endswith('.clustalw') or \
           x.endswith('.clustal') or \
           x.endswith('.aln') else x

    groups = {}
    for fname in files:
        seqs = [s for s in io.open(fname)]
        types = set(s.type for s in seqs)  # renamed: don't shadow builtin `type`
        if len(types) > 1:
            # Mixed-type files are treated as protein.
            types = set(['prot'])
        fid = (types.pop(), fname)
        # Normalize content: drop alignment gaps, then translate nucleotide
        # input so equivalent nucl/prot files hash to the same group key.
        seqs = [''.join(s.seq.split('-')).strip() for s in seqs]
        seqs = [translate(s) if fid[0] == 'nucl' else s for s in seqs]
        key = (len(seqs), frozenset(seqs))
        groups[key] = groups.get(key, set()) | set([fid])

    couples = []
    for partners in groups.values():
        names = ', '.join(set(trim(f.split(sep)[-1]) for _, f in partners))
        # Pick one file of each type from the group.
        pair = {}
        for ftype, f in partners:
            if len(pair) == 2:
                break
            if ftype in pair:
                continue
            pair[ftype] = f
        if 0 < len(pair) < 2:
            raise TypeError("Unmatched clustal alignment(s): " +
                            ", ".join(f for _, f in partners))
        if len(pair) == 0:
            continue
        couples.append((pair['nucl'], pair['prot'], names))

    for nt, aa, strain in couples:
        plotdata = {
            'nt': SaySNPs(nt),
            'aa': SaySNPs(aa)
        }
        metadata = {'strain': strain, 'filename': strain + '.pdf'}
        yield {'plotdata': plotdata, 'metadata': metadata}
    # Fixed: `raise StopIteration` inside a generator is converted to
    # RuntimeError on Python 3.7+ (PEP 479); a plain return ends the
    # generator correctly.
    return
def run_clustal():
    '''
    Worker loop: drain cluster-ID sets from the queue `q`, write the
    matching cluster sequences (nucleotide plus translation) to md5-named
    FASTA files, and run clustal on both files.

    Relies on module globals: q (queue of cluster-ID sets), clusters
    (input path -> iterable of (id-set, sequence) pairs), direc, sep,
    filenames (shared result list), and the sequ/tran/clustal/io helpers.
    '''
    # NOTE(review): empty()/get() is not an atomic pair — with multiple
    # workers a get() here can block forever once the queue drains between
    # the two calls; confirm the worker/queue setup at the call site.
    while not q.empty():
        cid = q.get()
        # Name the output files after the md5 of the cluster IDs so reruns
        # on the same cluster are deterministic.
        # NOTE(review): hashlib's update() requires bytes on Python 3 —
        # this str argument only works on Python 2; confirm interpreter.
        dig = hashlib.md5()
        dig.update(' '.join(cid))
        dig = dig.hexdigest()
        fpre = direc + 'nt' + sep + dig  # nucleotide output prefix
        apre = direc + 'aa' + sep + dig  # amino-acid output prefix
        fname = fpre + ".fasta"
        aname = apre + ".fasta"
        fh = io.open(fname, 'w')
        ah = io.open(aname, 'w')
        for ipt in clusters:
            counter = 0
            # Sequence names derive from the input file's basename (sans
            # extension), with internal whitespace collapsed to underscores.
            name = '_'.join(ipt.split(sep)[-1].split('.')[0].split())
            for cluster in clusters[ipt]:
                # cluster is (id-set, sequence); keep it if it shares any
                # ID with the requested set.
                if cid & cluster[0]:
                    nm = name + '_' + str(counter)
                    seq = cluster[1]
                    curr = sequ.Sequence(nm, seq, defline=', '.join(cid))
                    tr = tran.translate(curr)
                    tr.name = curr.name
                    fh.write(curr)
                    ah.write(tr)
                    counter += 1
        fh.close()
        ah.close()
        try:
            clustal.run(fname, fpre + '.clustalw')
            clustal.run(aname, apre + '.clustalw')
            filenames.append(dig + '.fasta')
        except ValueError:
            # Presumably clustal.run raises ValueError on failure; the
            # cluster is skipped and never recorded in `filenames`.
            pass
        q.task_done()
def OptimalCTether(reference, translation, extend=1, create=10):
    '''
    This function will take two sequences: a `reference` sequence and
    another protein sequence (`translation`; usually, this is an open
    reading frame that has been translated). Needleman-Wunsch alignment
    will be performed and the substring of translation with the highest
    identity that begins with a start codon [default: `['ATG']`] is
    reported.

    This function returns a dictionary of relevant information from the
    alignment; specifically, the alignments itself [keys: `query`,
    `subject`], the score [key: `score`], the length of the alignment
    [key: `length`], the length of the substring of translation used
    [key: `sublength`], the number of identities [key: `identities`],
    and the number of gaps [key: `gaps`].
    '''
    # Translated start codons (e.g. 'M' for ATG): the reported substring of
    # `translation` must begin with one of these residues.
    starts = set(translate(s) for s in options.START_CODONS)
    v, w = reference, translation
    # Accept either Sequence-like objects (with a .seq attribute) or plain
    # strings.
    try:
        v = v.seq
    except AttributeError:
        pass
    try:
        w = w.seq
    except AttributeError:
        pass
    if not starts & set(w):
        raise ValueError("Open reading frame does not contain a start codon.")
    # Work on the reversed sequences: anchoring the C-terminal (tether) end
    # at index 0 lets the DP scan for the best start-codon-bounded prefix.
    v, w = v[::-1], w[::-1]
    lv, lw = len(v), len(w)
    rv, rw = range(lv + 1), range(lw + 1)
    # gpc: gap-open bookkeeping — `create` where a gap may be opened, 0
    # where a gap is already running.  (not (i | j)) is true only at [0][0].
    gpc = [[create * int(not (i | j)) for i in rw] for j in rv]
    # mat: DP score matrix, seeded with affine edge penalties; the extra
    # `create` at the origin charges a gap-open when the first residues of
    # v and w differ.
    mat = [[-(i + j) * extend - create * (not (i | j) and w[0] != v[0])
            for i in rw] for j in rv]
    # pnt: traceback pointers along the edges (vertical gap above the
    # diagonal, horizontal gap below, diagonal at the origin).
    pnt = [[VGAP_MARK if i > j else HGAP_MARK if j > i else DIAG_MARK
            for i in rw] for j in rv]
    # NOTE(review): this matrix is never read — `ids` is rebound to an int
    # before use below.
    ids = [[0 for i in rw] for j in rv]
    # optimal = [best score, i endpoint, j endpoint]
    optimal = [None, 0, 0]
    for i in range(lv):
        for j in range(lw):
            # Candidate moves: diagonal (substitution via scoring matrix
            # `bl`), vertical gap, horizontal gap — each paired with its
            # traceback mark so max() picks both score and direction.
            vals = [[mat[i][j] + bl[v[i]][w[j]], DIAG_MARK],
                    [mat[i + 1][j] - extend - gpc[i + 1][j], VGAP_MARK],
                    [mat[i][j + 1] - extend - gpc[i][j + 1], HGAP_MARK]]
            mat[i + 1][j + 1], pnt[i + 1][j + 1] = max(vals)
            # A gap may be opened next only after a diagonal move.
            gpc[i + 1][j + 1] = create * int(pnt[i + 1][j + 1] == DIAG_MARK)
            # Record the best endpoint that (a) beats the current optimum,
            # (b) lies within LENGTH_ERR of the full reference length, and
            # (c) ends on a start residue (remember: sequences are
            # reversed, so this is the start codon of the substring).
            if (optimal[0] is None or mat[i + 1][j + 1] > optimal[0]) and \
                    abs(lv - i) / float(lv) <= options.LENGTH_ERR and \
                    w[j] in starts:
                optimal = [mat[i + 1][j + 1], i + 1, j + 1]
    # Traceback from the chosen endpoint back to the origin.
    i, j = optimal[1], optimal[2]
    seq, ids = ['', ''], 0
    gapcount, length, sublen = 0, 0, 0
    # One step of traceback per pointer mark; each lambda maps
    # (seq, i, j, sublen, gapcount, ids) -> the updated state.
    methods = {
        VGAP_MARK: lambda s, i, j, l, g, n:
            (['-' + s[0], w[j - 1] + s[1]], i, j - 1, l + 1, g + 1, n),
        DIAG_MARK: lambda s, i, j, l, g, n:
            ([v[i - 1] + s[0], w[j - 1] + s[1]], i - 1, j - 1, l + 1, g,
             n + (w[j - 1] == v[i - 1])),
        HGAP_MARK: lambda s, i, j, l, g, n:
            ([v[i - 1] + s[0], '-' + s[1]], i - 1, j, l, g + 1, n)
    }
    while [i, j] != [0, 0]:
        length += 1
        state = (seq, i, j, sublen, gapcount, ids)
        seq, i, j, sublen, gapcount, ids = methods[pnt[i][j]](*state)
    # Reverse the built strings back into the original orientation.
    return {
        'subject': seq[0][::-1],
        'query': seq[1][::-1],
        'score': optimal[0],
        'gaps': gapcount,
        'length': length,
        'sublength': sublen,
        'identities': ids
    }