Ejemplo n.º 1
0
    def target():
        '''
        Worker loop that consumes hits from `qin` and emits sequences on `qout`.

        Each hit names a query and a subject. The query's open reading frames
        that overlap the hit are translated and aligned against the (protein)
        subject; the frame with the highest identity that reaches
        `options.MIN_IDENTITY` is re-packaged as a new Sequence and put on
        `qout`. The loop ends once `qin` reports empty while PIPING is false.
        '''
        while True:
            try:
                # Presumably (block, timeout) of a standard Queue.get.
                hit = qin.get(PIPING, 1)
            except queue.Empty:
                if PIPING:
                    continue
                break

            query_info = hit['query']
            qname = query_info['name']
            sname = hit['subject']['name']
            start = query_info['start']
            end = query_info['end']
            candidates = []
            best_identity, best_hit = options.MIN_IDENTITY, None

            # Alignment is done in protein space.
            subject = (translate(subj[sname]) if subj[sname].type == 'nucl'
                       else subj[sname])

            # The reported query name may carry a suffix that the ORF table
            # lacks: shorten it one character at a time until a key matches.
            while qname:
                try:
                    frames = orfs[qname]
                except KeyError:
                    qname = qname[:-1]
                else:
                    break
            if not qname:
                qin.task_done()
                continue

            for frame in frames:
                if not in_range(frame, start, end, 0):
                    continue
                # Drop the last three positions before translating.
                trimmed = frame[:-3]
                query = translate(trimmed)
                options.debug("Aligning %33s v. %33s." % (qname, sname))
                result = align(subject.seq, query.seq)
                candidates.append((trimmed, sname, result))

            for frame, _refname, aln in candidates:
                # Keep only the tail of the frame that the alignment used.
                region = frame[-3 * aln['sublength']:]
                identity = float(aln['identities']) / aln['length']
                if identity >= best_identity:
                    best_identity = identity
                    best_hit = (region, sname, aln)

            if best_hit:
                region, name, _ = best_hit
                prefix = subject.defline.split('[')[0].strip()
                if prefix:
                    prefix += ' '
                source = region.original.name
                defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % (
                    prefix, source, region.start, region.end, region.step)

                qout.put(Sequence(name.strip(), region.seq, defline=defline,
                                  original=region.original, type=region.type,
                                  start=region.start, end=region.end,
                                  step=region.step))
            qin.task_done()
Ejemplo n.º 2
0
def var(files):
    '''
    Returns plot data and metadata for plotting later on in the pipeline.

    `files` is an iterable of alignment file paths. Files are grouped by
    their gap-stripped sequence content (nucleotide sequences are translated
    first), so that a nucleotide alignment and the protein alignment of the
    same sequences land in the same group.

    This is a generator. For every group it yields a dictionary with:

      * `plotdata`: `{'nt': SaySNPs(<nucl file>), 'aa': SaySNPs(<prot file>)}`
      * `metadata`: `{'strain': <names>, 'filename': <names> + '.pdf'}`

    where `<names>` is the comma-separated set of base file names in the
    group, with a trailing clustal extension removed.

    Raises TypeError when a group contains files of only one sequence type.
    '''
    clustal_exts = ('.clustalw', '.clustal', '.aln')

    def trim(name):
        # Strip the final extension, but only from clustal alignment names.
        # The extension test is made on the name being trimmed.
        if name.endswith(clustal_exts):
            return '.'.join(name.split('.')[:-1])
        return name

    groups = {}
    for f in files:
        seqs = list(io.open(f))
        kinds = set(s.type for s in seqs)
        # A file mixing several sequence types is treated as protein.
        kind = 'prot' if len(kinds) > 1 else kinds.pop()
        seqs = [''.join(s.seq.split('-')).strip() for s in seqs]
        if kind == 'nucl':
            seqs = [translate(s) for s in seqs]
        key = (len(seqs), frozenset(seqs))
        groups.setdefault(key, set()).add((kind, f))

    couples = []
    for partners in groups.values():
        names = ', '.join(set(trim(path.split(sep)[-1])
                              for _, path in partners))
        # Keep the first file seen for each sequence type, two types at most.
        pair = {}
        for kind, path in partners:
            if len(pair) == 2:
                break
            if kind in pair:
                continue
            pair[kind] = path
        if not pair:
            continue
        if len(pair) < 2:
            raise TypeError("Unmatched clustal alignment(s): " +
                            ", ".join(path for _, path in partners))
        couples.append((pair['nucl'], pair['prot'], names))

    for nt, aa, strain in couples:
        plotdata = {
            'nt': SaySNPs(nt),
            'aa': SaySNPs(aa)
        }
        metadata = {'strain': strain, 'filename': strain + '.pdf'}

        yield {'plotdata': plotdata, 'metadata': metadata}
    # Falling off the end finishes the generator; an explicit
    # `raise StopIteration` here is turned into RuntimeError by PEP 479.
Ejemplo n.º 3
0
    def run_clustal():
        '''
        Worker loop that turns cluster ids from `q` into clustal alignments.

        For each id set taken from `q`, every cluster (from `clusters`) that
        shares a member with it is written as a nucleotide FASTA record under
        `direc + 'nt'` and, translated, as a protein record under
        `direc + 'aa'`. Both files are named after the MD5 digest of the
        space-joined ids. Clustal is then run on both files; if both runs
        succeed, `<digest>.fasta` is appended to `filenames`.
        '''
        from contextlib import closing

        while not q.empty():
            cid = q.get()
            label = ' '.join(cid)
            if not isinstance(label, bytes):
                # hashlib accepts only bytes on Python 3; on Python 2 a plain
                # str already is bytes and is hashed unchanged.
                label = label.encode('utf-8')
            dig = hashlib.md5(label).hexdigest()

            fpre = direc + 'nt' + sep + dig
            apre = direc + 'aa' + sep + dig
            fname = fpre + ".fasta"
            aname = apre + ".fasta"

            # closing() guarantees both handles are released even when
            # writing or translating a record raises.
            with closing(io.open(fname, 'w')) as fh, \
                    closing(io.open(aname, 'w')) as ah:
                for ipt in clusters:
                    # Base file name up to its first dot, whitespace -> '_'.
                    name = '_'.join(ipt.split(sep)[-1].split('.')[0].split())
                    counter = 0
                    for cluster in clusters[ipt]:
                        if not cid & cluster[0]:
                            continue
                        curr = sequ.Sequence(name + '_' + str(counter),
                                             cluster[1],
                                             defline=', '.join(cid))
                        tr = tran.translate(curr)
                        tr.name = curr.name
                        fh.write(curr)
                        ah.write(tr)
                        counter += 1

            try:
                clustal.run(fname, fpre + '.clustalw')
                clustal.run(aname, apre + '.clustalw')
                filenames.append(dig + '.fasta')
            except ValueError:
                # Best effort: a cluster clustal rejects is simply left out
                # of `filenames`.
                pass

            q.task_done()
Ejemplo n.º 4
0
def OptimalCTether(reference, translation, extend=1, create=10):
    '''
    This function will take two sequences: a `reference` sequence and another
    protein sequence (`translation`; usually, this is an open reading frame
    that has been translated). Needleman-Wunsch alignment will be performed
    and the substring of translation with the highest score that begins
    with a start codon (taken from `options.START_CODONS`) is reported.

    Either argument may be a plain sequence or an object with a `seq`
    attribute. `extend` and `create` are the penalties subtracted for
    extending and for opening a gap, respectively.

    This function returns a dictionary of relevant information from the
    alignment; specifically, the alignment itself [keys: `query`, `subject`],
    the score [key: `score`], the length of the alignment [key: `length`], the
    length of the substring of translation used [key: `sublength`], the number
    of identities [key: `identities`], and the number of gaps [key: `gaps`].
    If no cell of the matrix qualifies as an end point, the alignment strings
    are empty, the counts are zero and `score` is None.

    Raises ValueError if `translation` contains no start residue.
    '''

    starts = set(translate(s) for s in options.START_CODONS)
    v = getattr(reference, 'seq', reference)
    w = getattr(translation, 'seq', translation)
    if not starts & set(w):
        raise ValueError("Open reading frame does not contain a start codon.")

    # Both sequences are aligned back to front, so the traceback from the
    # chosen cell to the origin covers the tail ends of the originals.
    v, w = v[::-1], w[::-1]
    lv, lw = len(v), len(w)
    rv, rw = range(lv + 1), range(lw + 1)
    # gpc: gap-creation penalty still owed when a gap is opened from a cell.
    gpc = [[create * int(not (i | j)) for i in rw] for j in rv]
    # mat: score matrix, rows indexed by `v`, columns by `w`.
    mat = [[-(i + j) * extend - create * (not (i | j) and w[0] != v[0])
           for i in rw] for j in rv]
    # pnt: traceback marks; the borders point straight back to the origin.
    pnt = [[VGAP_MARK if i > j else HGAP_MARK if j > i else DIAG_MARK
           for i in rw] for j in rv]
    optimal = [None, 0, 0]
    for i in range(lv):
        for j in range(lw):
            vals = [[mat[i][j] + bl[v[i]][w[j]], DIAG_MARK],
                    [mat[i + 1][j] - extend - gpc[i + 1][j], VGAP_MARK],
                    [mat[i][j + 1] - extend - gpc[i][j + 1], HGAP_MARK]]
            mat[i + 1][j + 1], pnt[i + 1][j + 1] = max(vals)
            # Opening a gap is only charged right after a diagonal move.
            gpc[i + 1][j + 1] = create * int(pnt[i + 1][j + 1] == DIAG_MARK)
            # A cell may end the alignment only on a start residue of `w`
            # and when the reference is covered to within LENGTH_ERR.
            if (optimal[0] is None or mat[i + 1][j + 1] > optimal[0]) and \
                    abs(lv - i) / float(lv) <= options.LENGTH_ERR and \
                    w[j] in starts:
                optimal = [mat[i + 1][j + 1], i + 1, j + 1]

    # Walk back from the best end point to the origin.
    i, j = optimal[1], optimal[2]
    subject, query = '', ''
    ids, gapcount, length, sublen = 0, 0, 0, 0
    while [i, j] != [0, 0]:
        length += 1
        mark = pnt[i][j]
        if mark == VGAP_MARK:
            # Gap in the reference: consume one residue of the translation.
            subject = '-' + subject
            query = w[j - 1] + query
            j -= 1
            sublen += 1
            gapcount += 1
        elif mark == DIAG_MARK:
            # Aligned pair: consume one residue of each sequence.
            subject = v[i - 1] + subject
            query = w[j - 1] + query
            ids += (w[j - 1] == v[i - 1])
            i -= 1
            j -= 1
            sublen += 1
        elif mark == HGAP_MARK:
            # Gap in the translation: consume one residue of the reference.
            subject = v[i - 1] + subject
            query = '-' + query
            i -= 1
            gapcount += 1
        else:
            raise KeyError(mark)

    return {
        'subject': subject[::-1],
        'query': query[::-1],
        'score': optimal[0],
        'gaps': gapcount,
        'length': length,
        'sublength': sublen,
        'identities': ids
    }