def target():
    '''
    Worker loop that turns BLAST hit records from `qin` into gene sequences
    on `qout`.

    For every record the loop looks up the ORFs of the query sequence, aligns
    each ORF that `in_range` accepts for the hit's query coordinates against
    the subject sequence (translated first when its type is 'nucl'), and
    puts the best-scoring region on `qout` as a new `Sequence`. A region only
    qualifies when its identity ratio is at least `options.MIN_IDENTITY`;
    on ties the later ORF wins.

    `qin`, `qout`, `PIPING`, `subj`, `orfs` and the helper functions are taken
    from the surrounding scope. The loop ends once `PIPING` is false and
    `qin` is empty.
    '''
    while True:
        try:
            # Block for up to one second while the producer is still piping
            # results; once it is done, poll without blocking.
            res = qin.get(PIPING, 1)
        except queue.Empty:
            if not PIPING:
                break
            continue

        # Everything after a successful get() runs under try/finally so the
        # item is always marked done. Assuming qin follows the queue.Queue
        # task_done()/join() contract, a skipped task_done() would leave
        # join() waiting forever after a failure.
        try:
            qname, sname = res['query']['name'], res['subject']['name']
            start, end = res['query']['start'], res['query']['end']

            if subj[sname].type == 'nucl':
                subject = translate(subj[sname])
            else:
                subject = subj[sname]

            # The reported query name may carry extra trailing characters:
            # shorten it one character at a time until it names a known
            # ORF list (or nothing is left).
            while qname and qname not in orfs:
                qname = qname[:-1]
            if not qname:
                continue

            alignments = []
            for orf in orfs[qname]:
                if in_range(orf, start, end, 0):
                    # Drop the last three positions before translating
                    # (presumably the stop codon -- TODO confirm).
                    orf = orf[:-3]
                    query = translate(orf)
                    options.debug("Aligning %33s v. %33s." % (qname, sname))
                    alignments.append((orf, align(subject.seq, query.seq)))

            best_identity, best_region = options.MIN_IDENTITY, None
            for orf, aln in alignments:
                # Keep only the trailing 3 * sublength positions of the ORF.
                region = orf[-3 * aln['sublength']:]
                identity = float(aln['identities']) / aln['length']
                if identity >= best_identity:
                    best_identity, best_region = identity, region

            if best_region is not None:
                # Keep whatever precedes the first '[' of the subject's
                # defline and append the location of the region found.
                odl = subject.defline.split('[')[0].strip()
                defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \
                    (odl + (' ' if odl else ''), best_region.original.name,
                     best_region.start, best_region.end, best_region.step)
                qout.put(Sequence(sname.strip(), best_region.seq,
                                  defline=defline,
                                  original=best_region.original,
                                  type=best_region.type,
                                  start=best_region.start,
                                  end=best_region.end,
                                  step=best_region.step))
        finally:
            qin.task_done()
def run_predict():
    '''
    Drain the queue `q` of strain file paths and run `genepredict.run` on
    each one.

    The strain name is the last `sep`-separated component of the path with
    its final extension removed. `q`, `infile`, `filenames` and the other
    helpers come from the surrounding scope, which is not visible here. The
    loop ends as soon as `q` is empty.
    '''
    while True:
        try:
            strainf = q.get(False)
        except queue.Empty:
            break

        # Always mark the item done, even when prediction fails in an
        # unexpected way; otherwise a join() on the queue could wait forever
        # (assuming q follows the queue.Queue contract).
        try:
            strain = strainf.split(sep)[-1]
            pos = strain.rfind(".")
            # Strip the last extension, but leave names alone when the only
            # dot is a leading one ('.name') or directly follows a leading
            # dot ('..name').
            if pos > 1 or (pos == 1 and strain[0] != "."):
                strain = strain[:pos]

            options.debug("Predicting for %s." % strain)

            try:
                genepredict.run(infile, strainf, strain, filenames)
            except RuntimeError as err:
                # Still best effort: one failing strain must not stop the
                # others, but the failure is reported instead of vanishing.
                options.debug("Prediction failed for %s: %s" % (strain, err))
        finally:
            q.task_done()
def GeneFromBLAST(db, sequences, pref, names):
    '''
    BLASTs database against sequences, and for those results that pass the
    length and percent identity requirements, attempts to locate the full
    gene that corresponds to that BLAST hit.

    Genes that are found are written to `<pref>.fasta` in the subdirectory
    `sequences` of `options.DIRECTORY`, and one annotation per distinct
    sequence is written to `<pref>.gff3` in the same place.

    db        -- path of the database sequence file.
    sequences -- path of the file whose sequences are searched for ORFs.
    pref      -- prefix used for the output file names and passed to `ann`.
    names     -- list; the path of the FASTA file written is appended to it.

    Returns None. If `sequences` cannot be read (IOError) a debug message is
    emitted and nothing is written.
    '''

    # Read by the worker closure below: True while BLAST results may still
    # arrive, False once every result has been queued.
    PIPING = True
    wd = options.DIRECTORY + 'sequences' + sep

    for d in [options.DIRECTORY, wd]:
        try:
            mkdir(d)
        except OSError:
            # Best effort (presumably the directory already exists).
            pass

    subj = dict((s.name, s) for s in io.open(db, 'r'))
    options.debug("Database sequences loaded from file %s." % db)

    try:
        orfs = dict((s.name, list(ORFGenerator(s)))
                    for s in io.open(sequences, 'r'))
    except IOError:
        options.debug("No file \"" + sequences + ",\" skipping.")
        return
    else:
        options.debug("ORFs loaded from file %s." % sequences)

    def in_range(seq, start, end, frame):
        # True when seq overlaps the interval start..end and one of the
        # modulo-3 comparisons below holds. `frame` is accepted for the
        # caller's sake but is not used.
        ss, se = sorted((seq.start, seq.end))
        hit_lo, hit_hi = sorted((start, end))

        # NOTE(review): the second modulo test compares the low end of seq
        # with the HIGH end of the hit; confirm whether the hit's low end
        # was intended. Behavior is left unchanged here.
        return (ss < hit_hi and se > hit_lo and
                (se % 3 == hit_hi % 3 or ss % 3 == hit_hi % 3))

    def locate_gene(res):
        # Return the best-matching ORF region for one BLAST result as a new
        # Sequence, or None when no ORF qualifies.
        qname, sname = res['query']['name'], res['subject']['name']
        start, end = res['query']['start'], res['query']['end']

        if subj[sname].type == 'nucl':
            subject = translate(subj[sname])
        else:
            subject = subj[sname]

        # The reported query name may carry extra trailing characters:
        # shorten it one character at a time until it names a known ORF
        # list (or nothing is left).
        while qname and qname not in orfs:
            qname = qname[:-1]
        if not qname:
            return None

        alignments = []
        for orf in orfs[qname]:
            if in_range(orf, start, end, 0):
                # Drop the last three positions before translating
                # (presumably the stop codon -- TODO confirm).
                orf = orf[:-3]
                query = translate(orf)
                options.debug("Aligning %33s v. %33s." % (qname, sname))
                alignments.append((orf, align(subject.seq, query.seq)))

        # Highest identity ratio wins; it must reach options.MIN_IDENTITY
        # and on ties the later ORF replaces the earlier one.
        best_identity, region = options.MIN_IDENTITY, None
        for orf, aln in alignments:
            identity = float(aln['identities']) / aln['length']
            # Keep only the trailing 3 * sublength positions of the ORF.
            candidate = orf[-3 * aln['sublength']:]
            if identity >= best_identity:
                best_identity, region = identity, candidate

        if region is None:
            return None

        # Keep whatever precedes the first '[' of the subject's defline and
        # append the location of the region found.
        odl = subject.defline.split('[')[0].strip()
        defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \
            (odl + (' ' if odl else ''), region.original.name,
             region.start, region.end, region.step)
        return Sequence(sname.strip(), region.seq, defline=defline,
                        original=region.original, type=region.type,
                        start=region.start, end=region.end,
                        step=region.step)

    def target():
        # Worker loop: consume BLAST results from qin until BLAST is done
        # and the queue is empty, pushing located genes onto qout.
        while True:
            try:
                # Block for up to one second while results are still being
                # piped in; afterwards poll without blocking.
                res = qin.get(PIPING, 1)
            except queue.Empty:
                if not PIPING:
                    break
                continue

            # Always mark the item done so that qin.join() cannot hang when
            # processing one result fails (assuming ThreadQueue follows the
            # queue.Queue task_done()/join() contract).
            try:
                new = locate_gene(res)
                if new is not None:
                    qout.put(new)
            finally:
                qin.task_done()

    qout = queue.Queue()
    qin = ThreadQueue(target)

    blastopts = {
        'evalue': options.MAX_EVALUE,
        'num_threads': options.NUM_THREADS
    }

    for res in BLAST.run(db, sequences, **blastopts):
        if float(res['expect']) > options.MAX_EVALUE:
            continue

        sbjl = len(subj[res['subject']['name']])
        # The percentage is the text after '(' minus its last two
        # characters (presumably '%)' -- e.g. "12/34 (35%)").
        ident = float(res['identities'].split('(')[1][:-2]) / 100
        lerr = float(res['subject']['length']) / sbjl

        if ident >= options.MIN_IDENTITY:
            if lerr >= (1.0 - options.LENGTH_ERR):
                qin.put(res)

    PIPING = False
    options.debug("BLAST done.")

    # Help drain the queue from this thread as well, then wait for the rest.
    target()
    qin.join()
    options.debug("Done Aligning sequences.")

    options.debug("Now writing sequences (%d)." % qout.qsize())
    # Maps raw sequence data to the set of Sequence objects sharing it.
    seqs = {}
    count = 0
    nuc_file = io.open(wd + pref + '.fasta', 'w')
    try:
        while True:
            try:
                seq = qout.get(False)
            except queue.Empty:
                break
            if seq.seq not in seqs:
                seqs[seq.seq] = set()
            seqs[seq.seq].add(seq)
            nuc_file.write(seq)
            count += 1
            options.debug("Wrote %s (%d)." % (seq.name, count))
    finally:
        # Close the handle even if a write fails part-way through.
        nuc_file.close()
    options.debug("Done writing sequences.")

    gh = io.open(wd + pref + '.gff3', 'w')
    try:
        names.append(wd + pref + '.fasta')
        for key in seqs:
            homologs = seqs[key]
            # One annotation per distinct sequence: an arbitrary member of
            # the set represents it, and every member's name is listed as a
            # homolog.
            gh.write(ann(homologs.copy().pop(), pref, 'gene',
                         homologs=','.join(s.name for s in homologs)))
    finally:
        gh.close()