Beispiel #1
0
 def lookForORF(self, seq, rec) :
     """Scan a candidate region for an open reading frame.

     Finds the first start codon in the reading frame implied by the left
     flanking length, then the first in-frame stop codon after it.  If the
     resulting ORF covers >= 60% of the non-flanking region, the record's
     coordinates and flanking lengths are updated in place and 0 is
     returned; otherwise 6 is returned and `rec` is left unchanged.

     :param seq: sequence source handed to self.get_seq
     :param rec: dict with 'coordinates' (indices 1/2 = start/end,
         3 = strand), 'flanking' [left, right] and 'CIGAR' entries
         -- layout inferred from the indices used here, TODO confirm
     :returns: 0 on success, 6 when no acceptable ORF is found
     """
     # Work on a copy of the flanking lengths; only committed on success.
     coordinates, edges = rec['coordinates'], rec['flanking'][:]
     seq = self.get_seq(seq, *coordinates)
     #if (len(seq) - sum(edges)) % 3 == 0 :
     startCodon, stopCodon = 0, 0
     # Find the first start codon in the frame set by the left flank.
     for s in xrange(edges[0]%3, len(seq), 3) :
         c = seq[s:s+3]
         if c in {'ATG', 'TTG', 'GTG'} :
             startCodon, edges[0] = 1, 0
             e0 = s+3
             # Map the codon offset back onto genome coordinates
             # (strand-aware).
             new_s = coordinates[1] + s if coordinates[3] == '+' else coordinates[2] - s
             break
     if not startCodon :
         return 6
     # Find the first in-frame stop codon after the start codon.
     for e in xrange(e0, len(seq), 3) :
         c = seq[e:e+3]
         if c in {'TAG', 'TAA', 'TGA'} :
             stopCodon, edges[1] = 1, 0
             new_e = coordinates[1] + e+2 if coordinates[3] == '+' else coordinates[2] - e-2
             break
     # Accept only if the ORF spans >= 60% of the non-flanking region.
     if startCodon and stopCodon and abs(e-s) + 1 >= 0.6 * (abs(coordinates[2]-coordinates[1])+1 - sum(rec['flanking'])) :
         c2 = (new_s, new_e) if coordinates[3] == '+' else (new_e, new_s)
         if c2[0] != coordinates[1] or c2[1] != coordinates[2] :
             # Coordinates moved: tag the alignment status as EXEMPT.
             rec['CIGAR'] = rec['CIGAR'].rsplit(':', 1)[0] + ':EXEMPT'
         coordinates[1:3] = c2
         rec['flanking'] = edges
         return 0
     return 6
Beispiel #2
0
 def get_similar(bsn, ortho_pairs):
     """Decide whether the two genes in a BLAST hit group are similar
     enough to be recorded as an orthologous pair.

     Walks the CIGAR string (column 14) of every alignment part, counting
     query amino acids covered by in-frame matched blocks.  As soon as the
     matched length or the matched proportion passes the configured
     thresholds, the pair is registered in `ortho_pairs` and the scan stops.

     :param bsn: list of BLAST-tab rows for one query/subject pair
         (col 12 = query length in nt, cols 6-9 = coordinates,
         col 14 = CIGAR -- layout inferred from the indices used here)
     :param ortho_pairs: dict of accepted pairs, updated in place
     """
     key = tuple(sorted([bsn[0][0], bsn[0][1]]))
     if key in ortho_pairs:
         # Pair was already accepted earlier; nothing to do.
         return
     matched_aa = {}
     # Query length converted from nucleotides to amino acids.
     len_aa = int(int(bsn[0][12]) / 3)
     for part in bsn:
         # NOTE(review): e_i and e_j are unpacked but never used.
         s_i, e_i, s_j, e_j = [int(x) for x in part[6:10]]
         for s, t in re.findall(r'(\d+)([A-Z])', part[14]):
             frame_i, frame_j = s_i % 3, s_j % 3
             s = int(s)
             if t == 'M':
                 # Count matched positions only when both sequences are in
                 # the same reading frame (unless incomplete CDS allowed).
                 if frame_i == frame_j or params['incompleteCDS']:
                     matched_aa.update({
                         (s_i + x): 1
                         for x in xrange((3 - (frame_i - 1)) % 3, s)
                     })
                 s_i += s
                 s_j += s
                 # Accept as soon as either the absolute matched length or
                 # the matched proportion reaches its threshold.
                 if len(matched_aa) * 3 >= min(
                         params['match_len2'],
                         params['match_len']) or len(matched_aa) >= (
                             min(params['match_prop'],
                                 params['match_prop2']) - 0.1) * len_aa:
                     ortho_pairs[key] = 1
                     return
             elif t == 'I':
                 # Insertion: advance the first cursor only.
                 s_i += s
             else:
                 # Deletion: advance the second cursor only.
                 s_j += s
Beispiel #3
0
    def do_polish_with_SNPs(self, reference, snp_file):
        """Apply SNP/indel calls to a reference and write the polished
        sequences to '<prefix>.fasta' (`prefix` is a module-level name).

        :param reference: FASTA file to polish
        :param snp_file: whitespace-separated variant file, one variant per
            line: contig, 1-based site, ..., variant in the last column
            ('+SEQ' insertion, '-SEQ' deletion, otherwise a substitution
            base); an empty string means "no variants"
        :returns: name of the polished FASTA file
        """
        sequence = readFasta(reference)
        snps = {n: [] for n in sequence}
        if snp_file != '':
            with open(snp_file) as fin:
                for line in fin:
                    part = line.strip().split()
                    snps[part[0]].append([int(part[1]), part[-1]])
            self.snps = snps

        # Convert to mutable per-base lists so indels can splice in place.
        for n, s in sequence.items():
            sequence[n] = list(s)

        for cont, sites in snps.items():
            # Apply right-to-left so earlier sites keep their coordinates.
            # NOTE(review): assumes `sites` is sorted by position --
            # confirm against the producer of snp_file.
            for site, base in reversed(sites):
                if base.startswith('+'):
                    # Insertion before `site` (1-based -> 0-based).
                    sequence[cont][site - 1:site - 1] = base[1:]
                elif base.startswith('-'):
                    # Deletion of len(base)-1 bases starting at `site`.
                    sequence[cont][site - 1:(site + len(base) - 2)] = []
                else:
                    # Single-base substitution.
                    sequence[cont][site - 1] = base

        with open('{0}.fasta'.format(prefix), 'w') as fout:
            for n, s in sorted(sequence.items()):
                s = ''.join(s)
                # Wrap sequences at 100 characters per line.
                fout.write('>{0}\n{1}\n'.format(
                    n, '\n'.join([
                        s[site:(site + 100)]
                        for site in xrange(0, len(s), 100)
                    ])))
        return '{0}.fasta'.format(prefix)
Beispiel #4
0
    def write_down(filename, regions, repeats, mutations, reference, query, tag) :
        """Write aligned regions, repeats and variants as a GFF3 file.

        :param filename: output path (opened with uopen)
        :param regions: rows of aligned regions; indices used here:
            0 score, 1 contig, 2-3 span, 7-10 aligned source span/strand
            -- layout inferred from the format strings, TODO confirm
        :param repeats: rows [contig, start, end, type]; type 0 marks a
            repetitive region, anything else an uncertain call
        :param mutations: {contig: {site: {alter: source-row}}}
        :param reference: reference name echoed into the header
        :param query: query name echoed into the header
        :param tag: tag echoed into the header
        """
        with uopen(filename, 'w') as fout:
            fout.write('##gff-version 3\n')
            fout.write('## Reference: {0}\n'.format(reference))
            fout.write('## Query: {0}\n'.format(query))
            fout.write('## Tag: {0}\n'.format(tag))

            fout.write('\n'.join(['{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t{5}'.format(r[1], r[2], r[3], r[0], r[10], '/inference="Aligned%20with%20{0}:{1}-{2}"'.format(*r[7:10])) for r in regions]) + '\n')
            fout.write('\n'.join(['{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="{3}"'.format(r[0], r[1], r[2], 'Repetitive%20region' if r[3] == 0 else 'Uncertain%20base%20calling%20or%20ambigious%20alignment') for r in repeats]) + '\n')
            for contig, variation in sorted(mutations.items()):
                for site, alters in sorted(variation.items()) :
                    for alter, source in alters.items() :
                        # Classify the variant by which side carries a gap.
                        if source[6][0] == '-' :
                            # Gap in field 6: insertion of field 7.
                            difference = '+{0}'.format(source[7])
                            origin = '.'
                        elif source[7][0] == '-' :
                            # Gap in field 7: deletion of field 6.
                            difference = '-{0}'.format(source[6])
                            origin = source[6]
                        else :
                            # Plain substitution.
                            difference = source[7]
                            origin = source[6]
                        compare = ''
                        # Each supporting alignment occupies 9 consecutive
                        # fields in `source`.
                        for id in xrange(0, len(source), 9) :
                            compare += '{0}:{1}-{2}:{3};'.format(source[id+0], abs(source[id+4]), abs(source[id+5]), source[id+1])
                        fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t{3}\n'.format(contig, source[2], source[3], '/replace="{0}";/compare="{1}";/origin="{2}"'.format(difference, compare[:-1], origin)))
Beispiel #5
0
    def inter_loci_overlap(self, alleles, parameters) :
        """Resolve overlaps between regions assigned to different loci.

        Regions are compared pairwise along each contig; when two regions
        from different loci overlap by at least `merging_prop` of either
        region's length, the one with the clearly lower identity (difference
        beyond ~0.05) is discarded by blanking its locus name.

        :param alleles: {locus: [region rows]}; row indices used here:
            0 locus, 1 identity, 2 contig, 3 start, 4 end, 5 strand,
            6-7 flanking, 8 status, 9 CIGAR
        :param parameters: dict providing 'merging_prop'
        :returns: list of record dicts for the surviving regions
        """
        regions = [reg for region in alleles.values() for reg in region]
        # sort with contig name and start points
        regions.sort(key=itemgetter(2,3))

        for id, regi in enumerate(regions) :
            if regi[0] == '' : continue
            todel, deleted = [], 0
            for jd in xrange(id+1, len(regions)) :
                regj = regions[jd]
                # Skip already-discarded regions and same-locus pairs.
                if regj[0] == '' or regi[0] == regj[0]: continue
                # Sorted order: no later region can still overlap regi.
                if regi[2] != regj[2] or regj[3] > regi[4] :
                    break
                overlap = min(regi[4], regj[4]) - regj[3] + 1
                if (regi[-1] != '' and float(overlap) >= parameters['merging_prop'] * (regi[4]-regi[3]+1)) or \
                   (regj[-1] != '' and float(overlap) >= parameters['merging_prop'] * (regj[4]-regj[3]+1)) :
                    delta = regi[1] - regj[1]
                    # Identities within ~0.05 of each other: keep both.
                    if delta > 0.05 :
                        todel.append(jd)
                    elif delta <= -0.05 :
                        deleted = 1
                        break
            if deleted == 0 :
                # regi survived: discard the regions it dominated.
                for jd in todel:
                    regions[jd][0] = ''
            else :
                regi[0] = ''

        return [{'locus':reg[0], 'identity':reg[1], 'CIGAR':reg[9], 'coordinates':[reg[2], int(reg[3]), int(reg[4]), reg[5]], 'flanking':reg[6:8], 'status':reg[8], 'accepted':(0 if reg[8] == '' else 128)} for reg in regions if reg[0] != '' and reg[-1] != '']
Beispiel #6
0
    def reScore(self, ref, qry, blastab, mode, perBatch=10000):
        """Recompute alignment scores (columns 2 and 11) of a BLAST table.

        Sequences are loaded on first use and converted to integer-encoded
        numpy arrays; scores are recomputed batch-wise from each row's
        CIGAR (column 14) via cigar2score.

        :param ref: reference FASTQ/FASTA file name
        :param qry: query FASTQ/FASTA file name
        :param blastab: numpy array of BLAST-tab rows, updated in place
        :param mode: scoring mode forwarded to cigar2score
        :param perBatch: rows processed per progress message
        :returns: the (modified) blastab array
        """
        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)
        # Encode sequences as small integers for fast slicing/arithmetic.
        for k, v in self.qrySeq.items():
            self.qrySeq[k] = nucEncoder[np.array(list(v)).view(asc2int)]
        for k, v in self.refSeq.items():
            self.refSeq[k] = nucEncoder[np.array(list(v)).view(asc2int)]

        nTab = len(blastab)
        for bId in xrange(0, blastab.shape[0], perBatch):
            logger('Update scores: {0} / {1}'.format(bId, nTab))
            tabs = blastab[bId:bId + perBatch]
            #scores = np.array([ cigar2score([t[14], self.refSeq[str(t[1])][t[8]-1:t[9]] if t[8] < t[9] else 4 - self.refSeq[str(t[1])][t[9]-1:t[8]][::-1], self.qrySeq[str(t[0])][t[6]-1:t[7]], t[6], mode, 6, 1]) for t in tabs ])
            scores = np.array(
                list(
                    map(cigar2score, ([
                        t[14],
                        # Complement (4 - code) and reverse the reference
                        # slice when the hit is on the reverse strand
                        # (t[8] > t[9]).
                        self.refSeq[str(t[1])][t[8] - 1:t[9]] if t[8] < t[9]
                        else 4 - self.refSeq[str(t[1])][t[9] - 1:t[8]][::-1],
                        self.qrySeq[str(t[0])][t[6] - 1:t[7]], t[6], mode, 6, 1
                    ] for t in tabs))))
            tabs.T[2], tabs.T[11] = scores.T
        return blastab
Beispiel #7
0
def main(mgs):
    """Summarize genotype-estimation reports.

    For each file in `mgs`, collect every run (a run starts at a "logp:"
    line) together with its per-genotype records, keep the run with the
    highest log-probability, and print its genotypes sorted by descending
    mean proportion as tab-separated lines.

    :param mgs: iterable of report file names (opened via uopen)
    """
    for mg in mgs:
        res = []
        with uopen(mg) as fin:
            for line in fin:
                logp = re.findall(r'logp:\t([-eE\d\.]+)', line)
                if len(logp):
                    # A new run starts; remember its log-probability.
                    logp = float(logp[0])
                    res.append([logp])
                else:
                    genotype = re.findall(
                        r'Genotype (\d+):\tMean proportion:\t([eE\d\.]+)\tCI95%:\t(\[ [eE\d\.]+ - [eE\d\.]+ \])',
                        line)
                    if len(genotype):
                        # The two empty trailing fields are filled in by
                        # the follow-up detail line below.
                        res[-1].append(
                            [genotype[0][1], genotype[0][2], '', '', ''])
                    elif len(res) and len(
                            res[-1]) > 1 and res[-1][-1][-1] == '':
                        part = line.strip().split('\t')
                        res[-1][-1][2:] = [
                            part[0], part[1], part[3] + ' ' + part[5]
                        ]
        try:
            # Run with the highest logp (raises ValueError when empty).
            res = max(res)
            # Sort genotypes by descending mean proportion.
            res[1:] = sorted(res[1:], key=lambda x: -float(x[0]))
            for i in xrange(1, len(res)):
                r = res[i]
                print('{0}\t{1}\t{2}'.format(mg, i, '\t'.join(r)))
        except (ValueError, IndexError, TypeError):
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; only skip empty or malformed reports.
            pass
Beispiel #8
0
def assignment3(dists, res, encode, presence):
    """Assign hierarchical cluster ids in immutable mode.

    For each (idx, ref) pair in `dists`, copy the reference genome's
    cluster ids into `res[idx]` from the first level whose distance
    threshold admits this genome, then update the per-cluster `presence`
    quality records.

    :param dists: rows [idx, ref, _, shared, quality]
    :param res: cluster-assignment matrix, updated in place
    :param encode: maps cluster ids to row indices of `presence`
    :param presence: per-cluster, per-level quality matrix, updated in place

    NOTE(review): relies on the module-level `mat` for the locus count.
    """
    n_loci = mat.shape[1] - 1
    for id in xrange(len(dists)):
        idx, ref, _, s, ql = dists[id]
        gl = presence[encode[res[ref, 1:]], np.arange(1, res.shape[1])]
        # Cap stored qualities at this genome's own quality.
        gl[gl > ql] = ql
        # Normalized allelic distance per level, rounded to loci counts.
        d = (n_loci * (gl - s).astype(float) / gl + 0.5).astype(int)
        # First level whose distance threshold admits this genome.
        jd = np.argmax((d - np.arange(n_loci)) <= 0) + 1
        res[idx, jd:] = res[ref, jd:]
        gl = presence[encode[res[idx, jd:]], np.arange(jd, presence.shape[1])]
        # Record this genome's quality where it exceeds the stored value.
        presence[encode[res[idx, jd:]][gl < ql],
                 np.arange(jd, presence.shape[1])[gl < ql]] = ql
Beispiel #9
0
def assignment(dists, res):
    """Assign hierarchical cluster ids for one batch of genomes.

    For each (idx, ref, d1, d2) row: starting at distance level d1, merge
    genome idx into ref's clusters.  Where the two disagree, either the
    larger cluster id is relabelled to the smaller one (keeping ids
    minimal) or ref's assignment is inherited outright.  res[idx, 0]
    tracks the smallest level at which idx has been (re)assigned.

    :param dists: rows [idx, ref, d1, d2]
    :param res: cluster-assignment matrix, updated in place

    NOTE(review): depends on the module-level `n_loci`; statement order is
    significant because `res` is mutated in place while being read.
    """
    for id in xrange(len(dists)):
        idx, ref, d1, d2 = dists[id]
        for d in xrange(d1, n_loci + 1):
            if res[idx, d] != res[ref, d]:
                if d >= res[idx, 0]:
                    if d >= d2:
                        # Merge: relabel the larger cluster id to the
                        # smaller one across all earlier genomes.
                        if res[idx, d] < res[ref, d]:
                            grps = [res[idx, d], res[ref, d]]
                        else:
                            grps = [res[ref, d], res[idx, d]]
                        res[:idx, d][res[:idx, d] == grps[1]] = grps[0]
                        res[idx, d] = grps[0]
                else:
                    if res[idx, d] < res[ref, d]:
                        res[:idx, d][res[:idx, d] == res[ref, d]] = res[idx, d]
                    else:
                        # Inherit all remaining levels from the reference.
                        res[idx, d:] = res[ref, d:]
                        break
            else:
                # Levels already agree from here on.
                break
        if res[idx, 0] > d1:
            res[idx, 0] = d1
    return
Beispiel #10
0
def tab2overlaps(tabs, ovl_l, ovl_p, nTab, overlaps):
    """Record pairs of alignments that overlap substantially.

    `tabs` rows are [contig, id, start, end], sorted by (contig, start).
    Two rows overlap "substantially" when the shared span reaches
    min(ovl_l, ovl_p * length) of the first row, or ovl_p of the second
    row's length.  At most one million pairs are stored per call; the
    last row of `overlaps` acts as a resume checkpoint holding the
    (outer, inner) indices to restart from, or -1 once the scan finished.

    :returns: the `overlaps` buffer (filled in place).
    """
    CAP = 1000000
    found = 0
    start = overlaps[-1, 0]
    resume_inner = overlaps[-1, 1]
    for ia in xrange(start, nTab):
        ta = tabs[ia]
        # Minimum overlap required relative to ta's own length.
        need = min(ovl_l, ovl_p * (ta[3] - ta[2] + 1))
        # On the resumed outer row, continue from the saved inner index.
        inner = xrange(resume_inner, nTab) if ia == start else xrange(ia + 1, nTab)
        for ib in inner:
            tb = tabs[ib]
            if ta[0] != tb[0] or tb[2] > ta[3]:
                # Different contig, or no later row can still overlap.
                break
            shared = min(ta[3], tb[3]) - tb[2] + 1
            if shared >= need or shared >= ovl_p * (tb[3] - tb[2] + 1):
                overlaps[found, :] = [ta[1], tb[1], shared]
                found += 1
                if found == CAP:
                    # Buffer full: checkpoint the position and stop.
                    overlaps[-1, :2] = [ia, ib]
                    break
        if found == CAP:
            break
    if found < CAP:
        overlaps[-1, :] = -1
    return overlaps
Beispiel #11
0
    def runDiamond(self, ref, qry, nhits=10, frames='7') :
        """Align translated query proteins against translated reference
        proteins with DIAMOND blastp.

        The query keeps the single frame with the fewest stop codons; the
        reference is translated in the requested frames, split at stop
        codons into chunks, distributed over 5 batch files that are
        searched separately, and the results are parsed in parallel.

        :param ref: reference FASTA/FASTQ file name
        :param qry: query FASTA/FASTQ file name
        :param nhits: maximum hits per query (diamond -k)
        :param frames: frame selection passed to transeq
        :returns: numpy array of alignments
        """
        logger('Run diamond starts')

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq :
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq :
            self.refSeq, self.refQual = readFastq(ref)

        qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
        with open(qryAA, 'w') as fout :
            for n, ss in sorted(qryAASeq.items()) :
                # Keep the translation frame with the fewest stops ('X').
                _, id, s = min([ (len(s[:-1].split('X')), id, s) for id, s in enumerate(ss) ])
                fout.write('>{0}:{1}\n{2}\n'.format(n, id+1, s))

        diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qryAA)
        p = Popen(diamond_fmt.split(), stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()

        refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
        toWrite = []
        for n, ss in sorted(refAASeq.items()) :
            for id, s in enumerate(ss) :
                # Split each translation at stop codons into chunks of at
                # least 1000 aa (the final chunk may be shorter).
                cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
                cdss[-1] = cdss[-1][:-1]
                cdsi = np.cumsum([0]+list(map(len, cdss[:-1])))
                for ci, cs in zip(cdsi, cdss) :
                    if len(cs) :
                        # Encode the chunk offset in the header so the
                        # original coordinates can be reconstructed.
                        toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(n, id+1, ci, cs))

        # Spread the reference chunks over 5 batch files/searches.
        for id in xrange(5) :
            with open('{0}.{1}'.format(refAA, id), 'w') as fout :
                for line in toWrite[id::5] :
                    fout.write(line)
            diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond, refAA='{0}.{1}'.format(refAA, id), qryAA=qryAA, aaMatch='{0}.{1}'.format(aaMatch, id), n_thread=self.n_thread, min_id=self.min_id*100., nhits=nhits, min_ratio=self.min_ratio*100.)
            Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()
        blastab = []
        # Parse the 5 result files in parallel; workers return npy paths.
        for r in self.pool.imap_unordered(parseDiamond, [ ['{0}.{1}'.format(aaMatch, id), self.refSeq, self.qrySeq, self.min_id, self.min_cov, self.min_ratio] for id in xrange(5) ]) :
            if r is not None :
                blastab.append(np.load(r, allow_pickle=True))
                os.unlink(r)
        blastab = np.vstack(blastab)
        logger('Run diamond finishes. Got {0} alignments'.format(blastab.shape[0]))
        return blastab
Beispiel #12
0
def global_difference2(g):
    """Compute pairwise sequence differences within one gene group.

    Only genomes that occur exactly once in the group are compared.
    Non-ACGT characters are masked as gaps before comparison.

    :param g: array whose column 1 holds genome names and column 4 the
        integer-encoded sequences -- layout inferred from the indices
        used here, TODO confirm
    :returns: {(genome1, genome2): [mutations, aligned-length]} for pairs
        whose aligned length passes the configured minimum
    """
    # Keep genomes that appear exactly once in this group.
    _, idx, cnt = np.unique(g.T[1], return_counts=True, return_index=True)
    idx = idx[cnt == 1]
    names, seqs = g[idx, 1], np.vstack(g[idx, 4])
    # NOTE(review): `comparable` appears to be unused.
    comparable = np.zeros(shape=[seqs.shape[0], seqs.shape[0]])
    # Mask anything that is not A/C/G/T (ASCII 65/67/71/84) as '-' (45).
    seqs[np.in1d(seqs, [65, 67, 71, 84], invert=True).reshape(seqs.shape)] = 45
    diff = compare_seq(
        seqs, np.zeros(shape=[seqs.shape[0], seqs.shape[0], 2],
                       dtype=np.uint8))
    res = {}
    for i, n1 in enumerate(names):
        for j in xrange(i + 1, len(names)):
            n2 = names[j]
            # Require a minimum aligned length before trusting the pair.
            if diff[i, j, 1] >= min(params['match_len2'],
                                    seqs.shape[1] * params['match_prop2']):
                res[(n1, n2)] = diff[i, j, :]
    return res
Beispiel #13
0
    def ovlFilter(self, blastab, params):
        """Drop alignments dominated by an overlapping, clearly better
        scoring alignment of the same query/subject pair.

        Rows are marked deleted by setting the score column (2) negative
        and are removed at the end.

        :param blastab: numpy array of BLAST-tab rows (cols 8/9 = subject
            coordinates, col 11 = score), modified during filtering
        :param params: sequence whose items [1:] are (coverage, delta)
        :returns: the filtered table
        """
        coverage, delta = params[1:]
        logger('Run filtering. Start with {0} hits.'.format(len(blastab)))
        # Encode reverse-strand hits with negative coordinates so every
        # hit sorts by its left-most subject position; restored below.
        blastab[blastab.T[8] > blastab.T[9], 8:10] *= -1

        blastab = pd.DataFrame(blastab).sort_values(by=[1, 0, 8, 6]).values
        for i, t1 in enumerate(blastab):
            if t1[2] < 0: continue
            toDel = []
            for j in xrange(i + 1, blastab.shape[0]):
                t2 = blastab[j]
                if t2[2] < 0: continue
                # Sorted order: stop when the query/subject pair changes
                # or t2 starts beyond the end of t1.
                if np.any(t1[:2] != t2[:2]) or t1[9] < t2[8]:
                    break
                # Subject-side overlap length.
                c = min(t1[9], t2[9]) - t2[8] + 1
                if (c >= coverage * (t1[9] - t1[8] + 1)
                        and t2[11] - t1[11] >= delta):
                    # t1 mostly covered by a better-scoring t2: drop t1.
                    t1[2] = -1.
                    break
                elif (c >= coverage * (t2[9] - t2[8] + 1)
                      and t1[11] - t2[11] >= delta):
                    # t2 mostly covered by a better-scoring t1.
                    toDel.append(j)
                elif c >= (t1[9] - t1[8] +
                           1) and c < coverage * (t2[9] - t2[8] + 1):
                    # t1 fully contained in t2 on the subject; check the
                    # query side as well before dropping t1.
                    c2 = min(t1[7], t2[7]) - max(t2[6], t1[6]) + 1
                    if c2 >= (t1[7] - t1[6] +
                              1) and c2 < coverage * (t2[7] - t2[6] + 1):
                        # BUGFIX: was `t1[2] == -1`, a no-op comparison --
                        # the row was never actually marked for deletion.
                        t1[2] = -1.
                        break
                elif c >= (t2[9] - t2[8] +
                           1) and c < coverage * (t1[9] - t1[8] + 1):
                    # Symmetric case: t2 fully contained in t1.
                    c2 = min(t1[7], t2[7]) - max(t2[6], t1[6]) + 1
                    if c2 >= (t2[7] - t2[6] +
                              1) and c2 < coverage * (t1[7] - t1[6] + 1):
                        toDel.append(j)
            if t1[2] >= 0:
                # Delete the dominated hits only if t1 itself survived.
                for j in toDel:
                    blastab[j][2] = -1.
        blastab = blastab[blastab.T[2] >= 0]
        # Restore the original strand encoding.
        blastab[blastab.T[8] < 0, 8:10] *= -1
        logger('Done filtering. End with {0} hits.'.format(blastab.shape[0]))
        return blastab
Beispiel #14
0
def global_difference(bsn_file, orthoGroup, counts=3000):
    """Estimate pairwise genome divergence from single-copy gene groups.

    Selects up to `counts` high-scoring gene groups with unique genome
    representation (skipping groups linked to an already-used ortholog
    cluster), computes per-gene pairwise differences in parallel, and
    summarizes each genome pair as (mean difference, dispersion factor).

    :param bsn_file: npz file of gene groups (loaded with np.load)
    :param orthoGroup: dict whose keys are (tag, member) ortholog links
    :param counts: maximum number of gene groups to use
    :returns: numpy array of ((genome1, genome2), (mean, sigma)) rows
    """
    groups = np.load(bsn_file)
    genes = []
    for gene, g in groups.items():
        _, idx, cnt = np.unique(g.T[1], return_counts=True, return_index=True)
        # Favor groups with many single-copy genomes (dominant term),
        # then lower summed scores.
        score = (np.sum(cnt == 1) - 1) * (2**41) - np.sum(g[idx[cnt == 1], 2],
                                                          dtype=int)
        if score > 0:
            genes.append([score, gene])
    genes = sorted(genes, reverse=True)

    # Greedily keep at most one gene per ortholog cluster.
    og = np.array(list(orthoGroup.keys()))
    grp_order, all_useds = [], set([])
    for score, gene in genes:
        tag = groups[gene][0][0]
        if tag not in all_useds:
            grp_order.append(gene)
            used = og[og.T[0] == tag, 1]
            all_useds |= set(used.tolist())
    genes = grp_order[:counts]

    global_differences = {}
    # Process genes in batches of 100 across the worker pool.
    for iter in xrange(0, len(genes), 100):
        logger('finding ANIs between genomes. {0}/{1}'.format(
            iter, len(genes)))
        #diffs = list(map(global_difference2, [groups[i] for i in genes[iter:iter+100]]))
        diffs = pool.map(global_difference2,
                         [groups[i] for i in genes[iter:iter + 100]])
        for diff in diffs:
            for pair, (mut, aln) in diff.items():
                if pair not in global_differences:
                    global_differences[pair] = []
                if aln:
                    # At least half a mutation so log() below is finite.
                    global_differences[pair].append(max(float(mut), .5) / aln)
    for pair, info in global_differences.items():
        diff = np.log(info)
        # Floor the mean log-difference at -4.605 (~1% divergence) and
        # clamp the spread into [0.693, 1.386] (= factor 2..4).
        mean_diff = max(np.mean(diff), -4.605)
        sigma = min(max(np.sqrt(np.mean((diff - mean_diff)**2)) * 3, 0.693),
                    1.386)
        global_differences[pair] = (np.exp(mean_diff), np.exp(sigma))
    return pd.DataFrame(list(global_differences.items())).values
Beispiel #15
0
 def __readAssembly(self, assembly):
     """Read an assembly in FASTA or FASTQ format.

     FASTQ input is additionally dumped as a plain FASTA file; per-base
     quality strings are retained in memory.

     :param assembly: file name (opened via uopen; a leading '@' is
         sniffed as FASTQ)
     :returns: (seq, fasfile) where seq maps name -> [length, depth,
         sequence, qualities] for FASTQ or [length, 0., sequence] for
         FASTA, and fasfile is the name of a FASTA copy of the input
     """
     seq = {}
     # Sniff the format from the first character of the file.
     with uopen(assembly) as fin:
         header = fin.read(1)
     with uopen(assembly) as fin:
         if header == '@':
             for id, line in enumerate(fin):
                 if id % 4 == 0:
                     part = line[1:].strip().split()
                     name = part[0]
                     # [length, depth, sequence, qualities]; depth is the
                     # 3rd header field when present.
                     seq[name] = [
                         0,
                         float(part[2]) if len(part) > 2 else 0., None, None
                     ]
                 elif id % 4 == 1:
                     seq[name][2] = line.strip()
                     seq[name][0] = len(seq[name][2])
                 elif id % 4 == 3:
                     seq[name][3] = np.array(list(line.strip()))
             fasfile = assembly.rsplit('.', 1)[0] + '.fasta'
             logger('Write fasta sequences into {0}'.format(fasfile))
             with open(fasfile, 'w') as fout:
                 for n, s in sorted(seq.items()):
                     # Wrap sequences at 100 bases per line.
                     fout.write('>{0}\n{1}\n'.format(
                         n, '\n'.join([
                             s[2][site:(site + 100)]
                             for site in xrange(0, len(s[2]), 100)
                         ])))
         else:
             # Plain FASTA input: use the file itself as fasfile.
             fasfile = assembly
             for id, line in enumerate(fin):
                 if line.startswith('>'):
                     name = line[1:].strip().split()[0]
                     seq[name] = [0, 0., []]
                 else:
                     seq[name][2].extend(line.strip().split())
             for n, s in seq.items():
                 s[2] = ''.join(s[2])
                 s[0] = len(s[2])
     return seq, fasfile
Beispiel #16
0
    def runUBlast(self, ref, qry, nhits=6, frames='7'):
        """Search translated query proteins against translated reference
        proteins with uSEARCH (ublast) and convert the hits back to
        nucleotide coordinates.

        The query keeps the single frame with the fewest stop codons; the
        reference is translated in the requested frames and searched in 5
        batches whose parsed results are concatenated.

        :param ref: reference FASTA/FASTQ file name
        :param qry: query FASTA/FASTQ file name
        :param nhits: maximum hits reported per query
        :param frames: frame selection passed to transeq
        :returns: pandas DataFrame of BLAST-tab style alignments
        """
        logger('Run uBLAST starts')

        def parseUBlast(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            # Parse one usearch userout table: rescale identities, convert
            # amino-acid coordinates and CIGARs to nucleotide space, and
            # filter by identity, coverage and length ratio.
            blastab = pd.read_csv(fin, sep='\t', header=None)
            blastab[2] /= 100.
            blastab = blastab[blastab[2] >= min_id]
            blastab[3], blastab[4] = blastab[3] * 3, blastab[4] * 3

            # Sequence names carry the reading frame as a ':<frame>'
            # suffix added when the translations were written.
            qf, rf = blastab[0].str.rsplit(
                ':', 1, expand=True), blastab[1].str.rsplit(':',
                                                            1,
                                                            expand=True)
            if np.all(qf[0].str.isdigit()):
                qf[0] = qf[0].astype(int)
            if np.all(rf[0].str.isdigit()):
                rf[0] = rf[0].astype(int)
            blastab[0], qf = qf[0], qf[1].astype(int)
            blastab[1], rf = rf[0], rf[1].astype(int)
            # Query coordinates back to nucleotides (forward frames 1-3).
            blastab[6], blastab[
                7] = blastab[6] * 3 + qf - 3, blastab[7] * 3 + qf - 1
            blastab[14] = [[
                [3 * vv[0], vv[1]] for vv in v
            ] for v in map(getCIGAR, zip(blastab[15], blastab[14]))]

            blastab[12], blastab[13] = blastab[0].apply(lambda x: len(qryseq[
                str(x)])), blastab[1].apply(lambda x: len(refseq[str(x)]))

            # Reference coordinates: frames <= 3 are forward, frames 4-6
            # map onto the reverse strand.
            rf3 = (rf <= 3)
            blastab.loc[rf3,
                        8], blastab.loc[rf3, 9] = blastab.loc[rf3, 8] * 3 + rf[
                            rf3] - 3, blastab.loc[rf3, 9] * 3 + rf[rf3] - 1
            blastab.loc[~rf3, 8], blastab.loc[
                ~rf3, 9] = blastab.loc[~rf3, 13] - (
                    blastab.loc[~rf3, 8] * 3 + rf[~rf3] - 3 -
                    3) + 1, blastab.loc[~rf3, 13] - (blastab.loc[~rf3, 9] * 3 +
                                                     rf[~rf3] - 3 - 1) + 1
            # Clip alignments that run past a sequence end.
            d = np.max([
                blastab[7] - blastab[12], blastab[9] - blastab[13],
                1 - blastab[9],
                np.zeros(blastab.shape[0], dtype=int)
            ],
                       axis=0)
            blastab[7] -= d

            def ending(x, y):
                # Shrink the last CIGAR block by the clipped length.
                x[-1][0] -= y

            np.vectorize(ending)(blastab[14], d)
            d[~rf3] *= -1
            blastab[9] -= d
            blastab = blastab[
                (blastab[7] - blastab[6] + 1 >= min_ratio * blastab[12])
                & (blastab[7] - blastab[6] + 1 >= min_cov)]
            return blastab.drop(columns=[15, 16])

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        qryAASeq = transeq(self.qrySeq, frame='F')
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                # Keep the translation frame with the fewest stops ('X').
                _, id, s = min([(len(s[:-1].split('X')), id, s)
                                for id, s in enumerate(ss)])
                fout.write('>{0}:{1}\n{2}\n'.format(n, id + 1, s))

        refAASeq = transeq(self.refSeq, frames)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for id, s in enumerate(ss):
                toWrite.append('>{0}:{1}\n{2}\n'.format(n, id + 1, s))

        blastab = []
        for id in xrange(5):
            with open(refAA, 'w') as fout:
                # BUGFIX: the stride was 4 (`toWrite[id::4]`), so the
                # id == 4 batch re-searched a subset of batch 0 and its
                # hits were duplicated in the concatenated output.
                # Stride 5 partitions the records exactly once over the
                # 5 batches, matching runDiamond.
                for line in toWrite[id::5]:
                    fout.write(line)

            ublast_cmd = '{usearch} -self -threads {n_thread} -db {refAA} -ublast {qryAA} -mid {min_id} -query_cov {min_ratio} -evalue 1 -accel 0.9 -maxhits {nhits} -userout {aaMatch} -ka_dbsize 5000000 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(
                usearch=usearch,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio)
            p = Popen(ublast_cmd.split(),
                      stderr=PIPE,
                      stdout=PIPE,
                      universal_newlines=True).communicate()
            if os.path.getsize(aaMatch) > 0:
                blastab.append(
                    parseUBlast(open(aaMatch), self.refSeq, self.qrySeq,
                                self.min_id, self.min_cov, self.min_ratio))
        blastab = pd.concat(blastab)
        logger('Run uBLAST finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab
Beispiel #17
0
def _linearMerge(data):
    """Merge co-linear partial hits of one gene into joint hit groups.

    Consecutive hits of the same gene on the same contig (or spanning two
    contig edges) are combined when they are close enough (gapDist),
    similar enough in identity, and the combined reference/query lengths
    agree within lenDiff.  The best-scoring consistent set of groups is
    then selected greedily and written into an appended per-row group
    column of the match table.

    :param data: (matches, params) where matches is a numpy object array
        of BLAST-tab rows for one gene, sorted by contig and position,
        and params[1:] = (gapDist, lenDiff)
    :returns: the match table, reduced to grouped rows, with one extra
        group column per row
    """
    matches, params = data
    # Append an (initially empty) group column to every match row.
    grpCol = pd.Series(data=[[]] * matches.shape[0])
    matches = np.hstack([matches, grpCol.values[:, np.newaxis]])
    gapDist, lenDiff = params[1:]
    gene, geneLen = matches[0][0], matches[0][12]
    tailing = 20

    def resolve_edges(edges):
        # Try to join a hit ending near a contig edge (edges[0]) with a
        # hit starting near another contig's edge (edges[1]).
        grps = []
        for id, m1 in edges[0]:
            for jd, m2 in edges[1]:
                if (m1[1] == m2[1] and max(abs(m1[8]), abs(m1[9])) > min(abs(m2[8]), abs(m2[9])) ) or \
                   abs(m1[2]-m2[2]) > 0.3 or m1[6] >= m2[6] or m1[7] >= m2[7] or m2[6]-m1[7]-1 >= gapDist:
                    continue
                rLen = m2[7] - m1[6] + 1
                # Distances from the hits to their contig ends.
                # NOTE(review): g2 is also computed from m1's fields --
                # verify whether it was meant to use m2[8]/m2[13].
                g1 = -m1[9] - 1 if m1[9] < 0 else m1[13] - m1[9]
                g2 = m1[8] - 1 if m1[8] > 0 else m1[13] + m1[8]
                qLen = m1[9] - m1[8] + 1 + m2[9] - m2[8] + 1 + g1 + g2
                if g1 + g2 >= gapDist or min(rLen, qLen) * lenDiff < max(
                        rLen, qLen):
                    continue
                overlap = sorted([m1[7] - m2[6] + 1, -g1 - g2], reverse=True)

                rLen1, rLen2 = m1[7] - m1[6] + 1, m2[7] - m2[6] + 1
                if overlap[0] > 0:
                    # Discount the doubly-counted overlap region.
                    score = m1[11] + m2[11] - overlap[0] * min(
                        float(m1[11]) / rLen1,
                        float(m2[11]) / rLen2)
                    ident = (m1[2] * rLen1 + m2[2] * rLen2 - overlap[0] *
                             min(m1[2], m2[2])) / (rLen1 + rLen2 - overlap[0])
                else:
                    score = m1[11] + m2[11]
                    ident = (m1[2] * rLen1 + m2[2] * rLen2) / (rLen1 + rLen2)
                if overlap[1] < 0:
                    # Penalize the unaligned gap between the two hits.
                    score += overlap[1] / 3.
                if score > m1[11] and score > m2[11]:
                    grps.append([score, ident, rLen, 1, id, jd])
        return grps

    groups = []
    prev, edges = matches[0][1], [[], []]
    nSave = len(matches)

    for id, m1 in enumerate(matches):
        rLen1 = m1[7] - m1[6] + 1
        # Every hit is also a candidate group on its own.
        groups.append([m1[11], m1[2], rLen1, 0, id])
        if m1[6] > tailing and (
            (m1[8] > 0 and m1[8] - 1 <= gapDist) or
            (m1[8] < 0 and m1[13] + m1[8] < gapDist)
        ):  # any hit within the last 150 bps to either end of a scaffold is a potential fragmented gene
            edges[1].append([id, m1])
        if m1[7] <= m1[12] - tailing:
            if (m1[8] > 0 and m1[13] - m1[9] <= gapDist) or (
                    m1[8] < 0 and -1 - m1[9] < gapDist):
                edges[0].append([id, m1])
            for jd in xrange(id + 1, nSave):
                m2 = matches[jd]
                # BUGFIX: was `m1[1] != m2[2]`, comparing a contig name
                # (col 1) with an identity value (col 2) -- always true,
                # so same-contig merging never happened.  Compare contig
                # names, as resolve_edges does.
                if m1[1] != m2[1] or (m1[8] < 0 and m2[8] > 0) or m2[8] - m1[
                        9] - 1 >= gapDist:  # maximum 300bps between two continuous hits in the same scaffold
                    break
                rLen, qLen = m2[7] - m1[6] + 1, m2[9] - m1[8] + 1
                if abs(m1[2]-m2[2]) > 0.3 or m1[9] >= m2[9] or m1[6] >= m2[6] or m1[7] >= m2[7] or m2[6] - m1[7] -1 >= gapDist \
                   or min(rLen, qLen)*lenDiff < max(rLen, qLen) :
                    continue
                rLen2 = m2[7] - m2[6] + 1
                overlap = sorted([m1[7] - m2[6] + 1, m1[9] - m2[8] + 1],
                                 reverse=True)
                if overlap[0] > 0:
                    # Discount the doubly-counted overlap region.
                    score = m1[11] + m2[11] - overlap[0] * min(
                        float(m1[11]) / rLen1,
                        float(m2[11]) / rLen2)
                    ident = (m1[2] * rLen1 + m2[2] * rLen2 - overlap[0] *
                             min(m1[2], m2[2])) / (rLen1 + rLen2 - overlap[0])
                else:
                    score = m1[11] + m2[11]
                    ident = (m1[2] * rLen1 + m2[2] * rLen2) / (rLen1 + rLen2)
                if overlap[1] < 0:
                    score += overlap[1] / 3.
                if score > m1[11] and score > m2[11]:
                    groups.append([score, ident, rLen, 0, id, jd])
    if len(edges[0]) and len(edges[1]):
        groups.extend(resolve_edges(edges))
    if len(groups) > len(matches):
        # Greedy selection: best-scoring groups first.  A match may be
        # used as a left end (key 4) and a right end (key 5) at most once.
        groups.sort(reverse=True)
        usedMatches, usedGroups = {}, []
        for grp in groups:
            if (grp[4], 4) in usedMatches or (grp[-1], 5) in usedMatches:
                continue
            if grp[3] > 0:
                if (grp[4], 5) in usedMatches or (grp[-1], 4) in usedMatches:
                    continue
            if grp[4] != grp[-1]:
                lMat, rMat = matches[grp[4]], matches[grp[-1]]
                il, im = sorted([grp[4], grp[-1]])
                skp = 0
                # The merge is invalid if any same-contig match between
                # the two ends is already claimed by another group.
                for i in xrange(il + 1, im):
                    if matches[i][1] in {lMat[1], rMat[1]}:
                        if (i, 4) in usedMatches or (i, 5) in usedMatches:
                            skp = 1
                            break
                if skp:
                    continue
                for i in xrange(il + 1, im):
                    if matches[i][1] in {lMat[1], rMat[1]}:
                        # Mark in-between matches as consumed (flag 0).
                        usedMatches[(i, 4)] = usedMatches[(i, 5)] = 0
            usedGroups.append(grp)
            usedMatches[(grp[4], 4)] = usedMatches[(grp[-1], 5)] = 1
            if grp[3] > 0:
                usedMatches[(grp[4], 5)] = usedMatches[(grp[-1], 4)] = 1

        # Chain adjacent groups that share a boundary match.
        usedGroups.sort(key=itemgetter(4), reverse=True)
        for gId in xrange(len(usedGroups) - 1):
            g1, g2 = usedGroups[gId:gId + 2]
            if g1[4] == g2[-1]:
                m = matches[g1[4]]
                # Subtract the shared match so it is not counted twice.
                score = g1[0] + g2[0] - m[11]
                length = g1[2] + g2[2] - (m[7] - m[6] + 1)
                iden = (g1[1] * g1[2] + g2[1] * g2[2] - min(g1[1], g2[1]) *
                        (m[7] - m[6] + 1)) / length
                usedGroups[gId + 1] = [score, iden, length, 0, g2[4]] + g1[4:]
                g1[1] = -1  # mark g1 as absorbed into the chained group
    else:
        usedGroups = groups
        usedMatches = {(k, k): 1 for k in np.arange(matches.shape[0])}
    # Write [score, identity, length, member-uids...] into the group
    # column of every member match.
    for g in usedGroups:
        if g[1] >= 0:
            ids = [matches[i][15] for i in g[4:]]
            for i in g[4:]:
                matches[i, -1] = g[:3] + ids
    ids = {k[0] for k, v in usedMatches.items() if v == 1}
    matches = matches[np.array(list(ids))]
    return matches
Beispiel #18
0
def hierCC(args):
    """Assign allelic profiles into hierarchical clusters (hierCC).

    Reads a tab-delimited allelic-profile matrix (first column assumed to
    be the type id), optionally reuses assignments from a previous run
    (``params.incremental``), clusters the remaining profiles in batches
    of 100 through a worker pool, and writes both a NumPy archive (for
    incremental reuse) and a gzipped text table (for inspection).

    Parameters
    ----------
    args : list
        Raw command-line arguments, parsed by ``get_args``.
    """
    params = get_args(args)
    ot = time.time()
    profile_file, cluster_file, old_cluster = params.profile, params.output + '.npz', params.incremental

    global mat, n_loci
    mat = pd.read_csv(profile_file, sep='\t', header=None, dtype=str).values
    # Keep the id column plus every column whose header is not '#'-commented.
    allele_columns = np.array(
        [i == 0 or (not h.startswith('#')) for i, h in enumerate(mat[0])])
    mat = mat[1:, allele_columns].astype(int)
    n_loci = mat.shape[1] - 1

    logger(
        '{0}: Loaded in allelic profiles with dimension: {1} and {2}. The first column is assumed to be type id.'
        .format(time.time() - ot, *mat.shape))
    if not params.immutable:
        # Order profiles by the number of absent alleles (stable sort).
        absence = np.sum(mat <= 0, 1)
        mat = mat[np.argsort(absence, kind='mergesort')]

    if os.path.isfile(old_cluster):
        od = np.load(old_cluster, allow_pickle=True)
        cls = od['hierCC']

        typed = {c[0]: id for id, c in enumerate(cls) if c[0] > 0}
        if len(typed) > 0:
            logger('{0}: Loaded in {1} old hierCC assignments.'.format(
                time.time() - ot, len(typed)))
            # Move already-typed profiles to the front so they are reused.
            mat_idx = np.array([t in typed for t in mat.T[0]])
            mat[:] = np.vstack([mat[mat_idx], mat[(mat_idx) == False]])
    else:
        typed = {}

    logger('{0}: Start hierCC assignments'.format(time.time() - ot))
    pool = Pool(10)
    try:
        res = np.repeat(mat.T[0], mat.shape[1]).reshape(mat.shape)
        res[1:, 0] = n_loci + 1
        for index in xrange(0, mat.shape[0], 100):
            to_run = []
            for idx in np.arange(index, index + 100):
                if idx < mat.shape[0]:
                    if mat[idx, 0] in typed:
                        # Reuse the previous assignment verbatim.
                        res[idx, :] = cls[typed[mat[idx, 0]], :]
                    else:
                        to_run.append(idx)
            if len(to_run) == 0:
                continue
            if not params.immutable:
                dists = np.vstack(pool.map(get_distance, to_run))
                assignment(dists, res)
            else:
                dists = np.vstack(pool.map(get_distance2, to_run))
                assignment2(dists, res)

            logger('{0}: Assigned {1} of {2} types into hierCC.'.format(
                time.time() - ot, index, mat.shape[0]))
    finally:
        # BUGFIX: the worker pool was previously leaked; release it.
        pool.close()
        pool.join()
    res.T[0] = mat.T[0]
    np.savez_compressed(cluster_file, hierCC=res)

    if not params.delta:
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(id) for id in np.arange(n_loci)])))
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(rr) for rr in r]) + '\n')
    else:
        # BUGFIX: materialize as a list -- a bare map() iterator (Python 3)
        # would be exhausted by the header line, leaving every data row empty.
        deltas = [int(d) for d in params.delta.split(',')]
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(id) for id in deltas])))
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(r[id + 1]) for id in deltas]) + '\n')
    del res
    logger('NUMPY clustering result (for incremental hierCC): {0}.npz'.format(
        params.output))
    logger('TEXT  clustering result (for visual inspection): {0}.hierCC.gz'.
           format(params.output))
Beispiel #19
0
def assignment2(dists, res):
    """Propagate hierCC assignments between profile rows (immutable mode).

    Parameters
    ----------
    dists : iterable of (idx, ref, jd) triples
        ``idx``: row of *res* to update; ``ref``: row to copy from;
        ``jd``: first column (distance level) from which the reference
        row's assignment is inherited.
    res : 2-D array
        Assignment matrix, modified in place.
    """
    # Unpack each triple directly instead of indexing with a counter
    # (the old loop shadowed the builtin `id` and used xrange(len(...))).
    for idx, ref, jd in dists:
        res[idx, jd:] = res[ref, jd:]
Beispiel #20
0
    def runDiamond(self, ref, qry, nhits=10, frames='7'):
        """Align translated queries against translated references with
        DIAMOND and return the hits as a BLAST-tab style DataFrame.

        Parameters
        ----------
        ref, qry : str
            Paths to the reference / query sequence files.
        nhits : int
            Maximum hits kept per query (diamond ``-k``).
        frames : str
            Frames handed to ``transeq`` for the reference translation.

        Returns
        -------
        pandas.DataFrame
            Concatenated alignment table from all five search rounds.
        """
        logger('Run diamond starts')

        def parseDiamond(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            # Convert diamond's SAM-like output (--outfmt 101) into
            # BLAST-tab style rows, mapping amino-acid coordinates back
            # onto the nucleotide sequences (hence the factors of 3).
            blastab = []
            for line in fin:
                if line.startswith('@'):
                    continue
                part = line.strip().split('\t')
                if part[2] == '*': continue
                qn, qf = part[0].rsplit(':', 1)
                rn, rf, rx = part[2].rsplit(':', 2)
                rs = int(part[3]) + int(rx)
                ql, rl = len(qryseq[str(qn)]), len(refseq[str(rn)])
                qm = len(part[9])
                if qm * 3 < min_cov: continue
                cov_ratio = qm * 3. / ql
                if cov_ratio < min_ratio: continue
                cigar = [[int(n) * 3, t]
                         for n, t in re.findall(r'(\d+)([A-Z])', part[5])]
                cl = np.sum([c[0] for c in cigar])
                variation = float(part[12][5:]) * 3 if part[12].startswith(
                    'NM:') else float(re.findall('NM:i:(\d+)', line)[0]) * 3

                iden = 1 - round(variation / cl, 3)
                if iden < min_id: continue
                qf, rf = int(qf), int(rf)
                qs = int(part[18][5:]) if part[18].startswith('ZS:') else int(
                    re.findall('ZS:i:(\d+)', line)[0])

                rm = int(
                    np.sum([c[0] for c in cigar if c[1] in {'M', 'D'}]) / 3)
                # Frames 1-3 are forward; 4-6 are on the reverse strand.
                if rf <= 3:
                    rs, r_e = rs * 3 + rf - 3, (rs + rm - 1) * 3 + rf - 1
                else:
                    rs, r_e = rl - (rs * 3 + rf - 6) + 1, rl - (
                        (rs + rm - 1) * 3 + rf - 4) + 1
                if qf <= 3:
                    qs, qe = qs * 3 + qf - 3, (qs + qm - 1) * 3 + qf - 1
                else:
                    qs, qe = ql - (qs * 3 + qf - 6) + 1, ql - (
                        (qs + qm - 1) * 3 + qf - 4) + 1
                    qs, qe, rs, r_e = qe, qs, r_e, rs
                    cigar = list(reversed(cigar))

                cd = [c[0] for c in cigar if c[1] != 'M']
                score = int(
                    part[14][5:]) if part[14].startswith('ZR:') else int(
                        re.findall('ZR:i:(\d+)', line)[0])
                blastab.append([
                    qn, rn, iden, cl,
                    int(variation - sum(cd)),
                    len(cd), qs, qe, rs, r_e, 0.0, score, ql, rl, cigar
                ])
            # NOTE(review): on an all-filtered input this DataFrame is
            # empty and the column selection below would raise -- confirm
            # callers never hit that case.
            blastab = pd.DataFrame(blastab)
            blastab[[0, 1]] = blastab[[0, 1]].astype(str)
            return blastab

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        # Translate each query in every forward frame and keep the frame
        # with the fewest stop codons ('X').
        qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                _, id, s = min([(len(s[:-1].split('X')), id, s)
                                for id, s in enumerate(ss)])
                fout.write('>{0}:{1}\n{2}\n'.format(n, id + 1, s))

        diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qryAA)
        p = Popen(diamond_fmt.split(),
                  stderr=PIPE,
                  stdout=PIPE,
                  universal_newlines=True).communicate()

        refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for id, s in enumerate(ss):
                # Split long translations at stop codons into <=1000 aa
                # chunks, remembering each chunk's offset in the frame.
                cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
                cdss[-1] = cdss[-1][:-1]
                cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
                for ci, cs in zip(cdsi, cdss):
                    if len(cs):
                        toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(
                            n, id + 1, ci, cs))

        blastab = []
        for id in xrange(5):
            # Search against one fifth of the reference chunks per round.
            with open(refAA, 'w') as fout:
                for line in toWrite[id::5]:
                    fout.write(line)
            diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio * 100.)
            p = Popen(diamond_cmd.split(),
                      stdout=PIPE,
                      stderr=PIPE,
                      universal_newlines=True).communicate()
            # BUGFIX: reset `tab` every round.  Previously a round with an
            # empty output file either raised NameError (first round) or
            # silently re-appended the previous round's table.
            tab = None
            if os.path.getsize(aaMatch) > 0:
                # BUGFIX: close the result file handle deterministically.
                with open(aaMatch) as fin:
                    tab = parseDiamond(fin, self.refSeq, self.qrySeq,
                                       self.min_id, self.min_cov,
                                       self.min_ratio)
                os.unlink(aaMatch)
            if tab is not None:
                blastab.append(tab)
        blastab = pd.concat(blastab)
        logger('Run diamond finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab
Beispiel #21
0
def loadBam(prefix, reference, bams, sequences, snps):
    """Derive a consensus FASTQ (base + quality per site) from BAM pileups.

    Two passes of ``samtools mpileup`` are run over *bams*:

    1. Allele counts are sampled at marked sites (``sequences[contig][s] > 0``)
       and at every 5th site; Gaussian mixture models with 1-5 components
       (tied covariance) are fitted to the (major, minor) count pairs to
       estimate the divergence of the most divergent read population.
    2. Every site is re-visited; the most frequent base becomes the
       consensus call and a Phred-like quality (clipped to 1-40) is
       computed from the log-odds of the fitted divergence.

    The consensus is written to ``<prefix>.fastq``.  ``snps`` is accepted
    for interface compatibility but is not used in this function.
    """
    sites = []
    # --- Pass 1: sample per-site major/minor allele counts. ---
    p = subprocess.Popen('samtools mpileup -ABQ0 {0}'.format(
        ' '.join(bams)).split(),
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    for line in p.stdout:
        part = line.strip().split('\t')
        s = int(part[1]) - 1
        if s % 100000 == 0:
            sys.stdout.write('# {0}\n'.format(s))
        if sequences[part[0]][s] > 0 or s % 5 == 0:
            # Pool the base columns of all BAMs, then strip pileup
            # artefacts: read starts ('^' + mapping quality), read ends,
            # deletions and indel markers.
            bases = ''.join(part[4::3])
            bases = re.sub('[\*\$\+-]', '', re.sub(r'\^.', '', bases.upper()))
            bases = re.split('(\d+)', bases)
            # Drop the inserted/deleted sequence that follows each length tag.
            for i in range(1, len(bases), 2):
                bases[i + 1] = bases[i + 1][int(bases[i]):]
            types, cnts = np.unique(list(''.join(bases[::2])),
                                    return_counts=True)
            if np.sum(cnts) >= 3:
                if types.size > 1:
                    cnts.sort()
                    # Record [major-allele count, sum of minor counts].
                    sites.append([cnts[-1], np.sum(cnts[:-1])])
                else:
                    sites.append([cnts[0], 0])
    sites = np.array(sites)
    ave_depth = np.max([np.median(np.sum(sites, 1)), 2.])
    sys.stdout.write(
        '{3}: Average read depth: {0}; Sites between {1} and {2} will be used for hybrid estimation.\n'
        .format(ave_depth, ave_depth / 2., ave_depth * 3., prefix))
    # Keep only sites whose total depth lies within [ave/2, ave*3].
    sites = sites[(ave_depth / 2. <= np.sum(sites, 1))
                  & (np.sum(sites, 1) <= ave_depth * 3)]

    # Model selection by BIC over 1-5 mixture components; each size is
    # refitted up to 20 times, re-initializing whenever the BIC improves.
    m = GaussianMixture(n_components=1, covariance_type='tied')
    m.fit(sites)
    best_model = [m.bic(sites), m]
    for n_components in xrange(2, 6):
        sys.stdout.write('# Testing {0} components.\n'.format(n_components))
        m = GaussianMixture(n_components=n_components, covariance_type='tied')
        for i in xrange(20):
            m.fit(sites)
            bic = m.bic(sites)
            if bic < best_model[0]:
                best_model = [bic, m]
                m = GaussianMixture(n_components=n_components,
                                    covariance_type='tied')
    m = best_model[1]
    # The most divergent component has the largest minor/total mean ratio.
    mId = np.argmax(m.means_.T[1] / np.sum(m.means_, 1))
    sys.stdout.write(
        '{3}: Find {0} GMM components. The most divergent group is {1} and counts for {2} of total sites.\n'
        .format(m.n_components, m.means_[mId].tolist(), m.weights_[mId],
                prefix))
    mDiv = m.means_[mId][0] / np.sum(m.means_[mId])
    # 10*log10 odds matrix used below for the quality computation.
    mDiv = 10 * np.log10([[mDiv, 1 - mDiv], [1 - mDiv, mDiv]])

    seq = {n: list(s) for n, s in readFasta(reference).items()}
    qual = {n: [0] * len(s) for n, s in seq.items()}

    # --- Pass 2: call a consensus base and quality at every site. ---
    lowQ, lowC, highQ = 0, 0, 0
    p = subprocess.Popen('samtools mpileup -ABQ0 {0}'.format(
        ' '.join(bams)).split(),
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    for line in p.stdout:
        part = line.strip().split('\t')
        s = int(part[1]) - 1
        if s % 100000 == 0:
            sys.stdout.write('# {0}\n'.format(s))
        bases = ''.join(part[4::3])
        bases = re.sub('[\*\$\+-]', '', re.sub(r'\^.', '', bases.upper()))
        bases = re.split('(\d+)', bases)
        for i in range(1, len(bases), 2):
            bases[i + 1] = bases[i + 1][int(bases[i]):]
        types, cnts = np.unique(list(''.join(bases[::2])), return_counts=True)
        if types.size > 0:
            depth = np.sum(cnts)
            if cnts.size == 1:
                g, mId = [cnts[0], 0], 0
            elif cnts.size > 1:
                mId = np.argmax(cnts)
                g = [cnts[mId], depth - cnts[mId]]
            seq[part[0]][s] = types[mId]
            if depth >= 3 and depth / 3. <= ave_depth <= depth * 3.:
                # Quality = clipped log-likelihood ratio of the two
                # hypotheses under the fitted divergence model.
                q = min(
                    40,
                    max(
                        1,
                        int(round(
                            np.sum(g * mDiv[0]) - np.sum(g * mDiv[1]), 0))))
                qual[part[0]][s] = q
                if q < 10:
                    lowQ += 1
                else:
                    highQ += 1
            else:
                lowC += 1
    # Encode qualities as Phred+33 characters and write the FASTQ.
    qual = {n: ''.join([chr(ss + 33) for ss in s]) for n, s in qual.items()}
    with open(prefix + '.fastq', 'w') as fout:
        for n, s in seq.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(n, ''.join(s), qual[n]))
    sys.stdout.write(
        '{0}: {1} good sites; {2} low covered sites; {3} low quality sites;\n'.
        format(prefix, highQ, lowC, lowQ))

    return
Beispiel #22
0
def getClust(prefix, genes, params):
    """Cluster genes with mmseqs linclust (up to three iterations) and
    select one exemplar sequence per cluster.

    Parameters
    ----------
    prefix : str
        Output prefix; writes ``<prefix>.clust.exemplar`` (FASTA) and
        ``<prefix>.clust.tab`` (gene -> exemplar mapping).
    genes : str
        Path to the input gene FASTA file.
    params : dict
        Needs keys 'translate', 'identity', 'coverage', 'n_thread'.

    Returns
    -------
    (str, str)
        Paths of the exemplar FASTA and the cluster table.
    """
    groups = {}
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    try:
        if not params['translate']:
            geneFile = genes
        else:
            # Cluster on protein sequences: translate frame 1 first.
            na_seqs = readFasta(genes)
            aa_seqs = transeq(na_seqs, frame='1', transl_table='starts')
            with open(os.path.join(dirPath, 'seq.aa'), 'w') as fout:
                # BUGFIX: iterate .items() -- iterating the dict itself
                # yields bare keys and cannot be unpacked into (n, s).
                for n, s in aa_seqs.items():
                    fout.write('>{0}\n{1}\n'.format(n, s[0]))
            geneFile = os.path.join(dirPath, 'seq.aa')
        seqDb = os.path.join(dirPath, 'seq.db')
        tmpDb = os.path.join(dirPath, 'tmp')
        lcDb = os.path.join(dirPath, 'seq.lc')
        tabFile = os.path.join(dirPath, 'clust.tab')
        refFile = os.path.join(dirPath, 'seq.ref')

        nRef = 999999999999999
        for ite in xrange(3):
            # Start every iteration with fresh working databases.
            if os.path.isdir(tmpDb):
                shutil.rmtree(tmpDb)
            os.makedirs(tmpDb)
            if os.path.isfile(seqDb):
                list(map(os.unlink, glob.glob(seqDb + '*')))
            if os.path.isfile(lcDb):
                list(map(os.unlink, glob.glob(lcDb + '*')))
            subprocess.Popen('{0} createdb {2} {1} -v 0'.format(
                externals['mmseqs'], seqDb, geneFile).split()).communicate()
            subprocess.Popen('{0} linclust {1} {2} {3} --min-seq-id {4} -c {5} --threads {6} -v 0'.format( \
                externals['mmseqs'], seqDb, lcDb, tmpDb, params['identity'], params['coverage'], params['n_thread']).split(), stdout=subprocess.PIPE).communicate()
            subprocess.Popen('{0} createtsv {1} {1} {2} {3}'.format(\
                externals['mmseqs'], seqDb, lcDb, tabFile).split(), stdout = subprocess.PIPE).communicate()
            with open(tabFile) as fin:
                for line in fin:
                    part = line.strip().split()
                    groups[part[1]] = part[0]
            tmp = []
            with open(geneFile) as fin:
                # Keep the first member seen of each cluster as exemplar.
                toWrite, used_grps = False, {None: 1}
                for line in fin:
                    if line.startswith('>'):
                        name = line[1:].strip().split()[0]
                        grp = groups.get(name, None)
                        toWrite = False if grp in used_grps else True
                        if toWrite:
                            used_grps[grp] = name
                    if toWrite:
                        tmp.append(line)
                for gene, grp in groups.items():
                    if grp in used_grps:
                        groups[gene] = used_grps[grp]
            with open(refFile, 'w') as fout:
                for line in tmp:
                    fout.write(line)
            # Stop once another round no longer shrinks the exemplar set.
            if nRef <= len(used_grps):
                break
            nRef = len(used_grps)
            geneFile = refFile
        if not params['translate']:
            shutil.copy2(refFile, '{0}.clust.exemplar'.format(prefix))
        else:
            # Write exemplars back out as their nucleotide sequences.
            rSeq = readFasta(refFile)
            na_seqs = dict(na_seqs)
            with open('{0}.clust.exemplar'.format(prefix), 'w') as fout:
                # BUGFIX: .items() here as well (readFasta returns a dict).
                for n, s in rSeq.items():
                    fout.write('>{0}\n{1}\n'.format(n, na_seqs[n]))
    finally:
        shutil.rmtree(dirPath)
    with open('{0}.clust.tab'.format(prefix), 'w') as fout:
        # Collapse transitive links so each gene maps to its final exemplar.
        for gene, grp in sorted(groups.items()):
            g = gene
            while g != grp:
                g, grp = grp, groups[grp]
            groups[gene] = grp
            fout.write('{0}\t{1}\n'.format(gene, grp))

    return '{0}.clust.exemplar'.format(prefix), '{0}.clust.tab'.format(prefix)
Beispiel #23
0
    def linear_merge(self, blasttab, min_iden, min_frag_prop, min_frag_len, max_dist=300, diag_diff=1.5, max_diff=200, **params) :
        """Merge co-linear fragmented BLAST hits into single alignments.

        Rows of *blasttab* follow the BLAST tabular layout (query, subject,
        identity, length, mismatch, gapopen, q.start, q.end, s.start,
        s.end, evalue, score, q.len, s.len, CIGAR).  The list is modified
        in place; the de-duplicated / merged rows are returned.

        Parameters
        ----------
        min_iden : float
            Minimum identity a merged alignment must retain.
        min_frag_prop, min_frag_len :
            Minimum merged length, relative to the query / absolute.
        max_dist : int
            Maximum gap between mergeable fragments.
        diag_diff, max_diff :
            Maximum ratio / absolute difference between q- and s-spans.
        """
        # Encode reverse-strand hits with negative subject coordinates so
        # both strands sort and compare uniformly; restored at the end.
        for part in blasttab :
            if part[8] > part[9] :
                part[8], part[9] = -part[8], -part[9]

        blasttab.sort(key=itemgetter(0,1,6,8,-11))
        nB = len(blasttab)
        # Drop near-duplicate hits (all four coordinates within a total of
        # 5), keeping the copy with the higher identity.
        for id, p1 in enumerate(blasttab) :
            if p1[0] == '' : continue
            for jd in range(id+1, nB) :
                p2 = blasttab[jd]
                if p2[0] == '' : continue
                if (p1[0], p1[1]) != (p2[0], p2[1]) or p2[6] - p1[6] > 4 :
                    break
                d = abs(p1[6]-p2[6]) + abs(p1[7]-p2[7]) + abs(p1[8]-p2[8]) + abs(p1[9]-p2[9])
                if d <= 5 :
                    if p1[2] >= p2[2] :
                        p2[0] = ''
                    else :
                        p1[0] = ''
                        break
        blasttab = [p for p in blasttab if p[0] != '']

        # Collect candidate fragment pairs that look co-linear and close
        # enough to merge, scoring each candidate join.
        nB = len(blasttab)
        syntenies = []
        for id, p1 in enumerate(blasttab) :
            for jd in range(id+1, nB)  :
                p2 = blasttab[jd]

                if p1[0] != p2[0] or p1[1] != p2[1] or p2[6] - p1[7] > max_dist :
                    break
                elif p1[8] < 0 < p2[8] or p2[8] < p1[8] + 15 or p2[9] < p1[9] + 15 or p2[7]<p1[7]+15 or p2[6] < p1[6]+15 or p2[8] - p1[9] > max_dist :
                    continue
                m, n = p2[7] - p1[6] + 1, p2[9] - p1[8] + 1
                if m < min_frag_len or m < min_frag_prop*p1[12] or max(m, n) - min(m, n) > max_diff or max(m,n) > diag_diff * min(m, n) :
                    continue
                o_len = 0 if p2[6] > p1[7] else p1[7] - p2[6] + 1
                p1_len = p1[7] - p1[6] + 1 - o_len
                p2_len = p2[7] - p2[6] + 1 - o_len

                iden = (p1[2]*p1_len + p2[2]*p2_len +max(p1[2], p2[2])*o_len)/(p1_len+p2_len+o_len)
                if iden < min_iden :
                    continue

                p1s, p2s = p1[11]/(p1[7]-p1[6]+1), p2[11]/(p2[7]-p2[6]+1)
                dist = max(p2[6]-p1[7]-1, p2[8]-p1[9]-1, 0)
                score = p1s*p1_len +p2s*p2_len+max(p1s, p2s)*o_len - dist
                if score > 0 :
                    syntenies.append([id, jd, iden, score])
        syn_score = {}
        for id , syn in enumerate(syntenies) :
            if syn[0] not in syn_score and syn[1] not in syn_score :
                p1, p2 = blasttab[syn[0]][:], blasttab[syn[1]]
                c1, c2 = p1[-1], p2[-1]
                r_dist, q_dist = p2[6]-p1[7]-1, p2[8]-p1[9]-1
                if min(r_dist, q_dist) < 0 :
                    # Fragments overlap: trim the CIGAR of the side with
                    # the lower per-base score until the overlap is gone.
                    p1s, p2s = p1[11]/(p1[7]-p1[6]+1), p2[11]/(p2[7]-p2[6]+1)
                    if p1s <= p2s :
                        cc = [ [int(n), t] for n,t in re.findall(r'(\d+)([A-Z])', c1)]
                        i = -1
                    else :
                        cc = [ [int(n), t] for n,t in re.findall(r'(\d+)([A-Z])', c2)]
                        i = 0

                    while min(r_dist, q_dist) < 0 :
                        if cc[i][1] == 'M' :
                            d = min(cc[i][0], max(-r_dist, -q_dist, 0))
                            r_dist, q_dist = d + r_dist, d + q_dist
                        elif cc[i][1] == 'D' :
                            d = min(cc[i][0], max(-r_dist, -q_dist, 0))
                            r_dist += d
                        elif cc[i][1] == 'I' :
                            d = min(cc[i][0], max(-r_dist, -q_dist, 0))
                            q_dist += d
                        else :
                            # BUGFIX: `raise 'unknown'` raised a plain
                            # string, which is a TypeError in modern
                            # Python; raise a proper exception instead.
                            raise ValueError('unknown CIGAR operation: {0}'.format(cc[i][1]))
                        if d >= cc[i][0] :
                            cc = cc[1:] if i == 0 else cc[:-1]
                        else :
                            cc[i][0] -= d
                    if i == -1 :
                        c1 = ''.join([ '{0}{1}'.format(*c) for c in cc ])
                    else :
                        c2 = ''.join([ '{0}{1}'.format(*c) for c in cc ])
                gap=[]
                if r_dist > 0 :
                    gap.append('{0}D'.format(r_dist))
                if q_dist > 0 :
                    gap.append('{0}I'.format(q_dist))
                if len(gap) == 0 :
                    # Fuse same-typed CIGAR runs that meet at the join.
                    cc1 = re.findall(r'(^.*?)(\d+)([A-Z]$)', c1)[0]
                    cc2 = re.findall(r'(^\d+)([A-Z])(.*$)', c2)[0]
                    if cc1[2] == cc2[1] :
                        c1 = '{0}{1}{2}'.format(cc1[0], int(cc1[1])+int(cc2[0]), cc1[2])
                        c2 = cc2[2]
                p1[7], p1[9], p1[2], p1[11], p1[14] = p2[7], p2[9], syn[2], syn[3], ''.join([c1] + gap + [c2])
                blasttab.append(p1)
            syn_score[syn[0]] = id
            syn_score[syn[1]] = id

        # Restore real subject coordinates and pad the CIGAR with 'D' runs
        # so it covers the full query length.
        for part in blasttab :
            if part[8] < 0 :
                part[8], part[9] = -part[8], -part[9]
            x = ['{0}D'.format(part[6]-1), part[-1]] if part[6] > 1 else [part[-1]]
            if part[7] < part[12] :
                x.append('{0}D'.format(part[12]-part[7]))
            if len(x) : part[-1] = ''.join(x)
        return blasttab
Beispiel #24
0
def alignAgainst(data) :
    """Align a query genome against a reference and emit variants as GFF3.

    Parameters
    ----------
    data : tuple
        (prefix, aligner, db, (rtag, reference), (tag, query)).
        When `aligner` is a list, work is delegated to lastAgainst().

    Returns
    -------
    list
        [tag, '<prefix>.gff.gz'] on success; [tag, query] when the query
        file cannot be parsed.
    """
    prefix, aligner, db, (rtag, reference), (tag, query) = data
    if isinstance(aligner, list) :
        return lastAgainst(tag, query, db, prefix, reference, aligner[1])
    try :
        qrySeq, qryQual = readFastq(query)
    except :
        # Unreadable query: hand it back unaligned.
        return [tag, query]
    refSeq, refQual = readFastq(reference)
    # Run the aligner with CIGAR output (-c); results stream in PAF-like
    # tab-separated records.
    proc = subprocess.Popen('{0} -c -t1 --frag=yes -A1 -B14 -O24,60 -E2,1 -r100 -g1000 -P -N5000 -f1000,5000 -n2 -m50 -s200 -z200 -2K10m --heap-sort=yes --secondary=yes {1} {2}'.format(
                                aligner, db, query).split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    alignments = []
    for lineId, line in enumerate(proc.stdout) :
        part = line.strip().split('\t')
        part[1:4] = [int(p) for p in part[1:4]]
        part[6:11] = [int(p) for p in part[6:11]]
        # part[11] <- numeric value of the 14th column's tag payload;
        # part[12] <- stable per-line id; part[13] <- per-base score.
        # NOTE(review): assumes column 13 carries the score tag for this
        # exact option set -- confirm against the aligner's output spec.
        part[11] = float(part[13][5:])
        part[12], part[13] = lineId, part[11]/part[10]
        # Slots 14-16: mutations, reference repeats, query repeats.
        part[14:17] = [[], [], []]
        alignments.append(part)
    proc.wait()
    
    deleteChain = {}
    nItem = len(alignments)
    
    # Mark alignments that are >=90% covered by a better-scoring one,
    # first on query coordinates, then on reference coordinates.
    alignments.sort(key=lambda x:x[:4])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] : break

            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if s > e+10 :
                break
            if (e-s) >= 0.9 * (p1[3]-p1[2]) and p2[13] - 0.1 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[3]-p2[2]) and p1[13] - 0.1 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])
    alignments.sort(key=lambda x:x[5:9])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] : break

            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if s > e+10 :
                break
            
            if (e-s) >= 0.9 * (p1[8]-p1[7]) and p2[13] - 0.05 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[8]-p2[7]) and p1[13] - 0.05 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])

    # Resolve the domination chains from the highest score downwards: an
    # alignment is dropped if any of its dominators survives.
    deleted = {}
    for p in sorted(alignments, key=lambda x:x[11], reverse=True) :
        id = p[12]
        if id in deleteChain :
            for jd in deleteChain[id] :
                if jd not in deleted :
                    deleted[id] = 1
                    break
    alignments = [p for p in alignments if p[12] not in deleted]
    
    # repeats in qry
    nItem = len(alignments)
    alignments.sort(key=lambda x:x[:4])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] : break
            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if e > s :
                p1[16].append([s, e])
                p2[16].append([s, e])
            else :
                break
    # repeats in ref
    alignments.sort(key=lambda x:x[5:9])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] : break
            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if e > s :
                p1[15].append([s, e])
                p2[15].append([s, e])
            else :
                break
    
    maskedRegion = {}
    refRepeat = []
    for p in alignments :
        # prepare a unique set of repeat region
        qryRepeat = []
        if len(p[16]) > 0 :
            qryRepeat.append(p[16][0])
            for pp in p[16][1:] :
                if pp[0] > qryRepeat[-1][1]+20 :
                    qryRepeat.append(pp)
                elif pp[1] > qryRepeat[-1][1]:
                    qryRepeat[-1][1] = pp[1]
        ref = [refSeq[p[5]], refQual[p[5]]]
        qry = [qrySeq[p[0]], qryQual[p[0]]]
        cigar = p[-1][5:]
        # d: +1 forward strand, -1 reverse strand.
        d = 1 if p[4] == '+' else -1
        if d < 0 :
            qryRepeat = [[q[1], q[0], -1, -1] for q in qryRepeat]
        else :
            qryRepeat = [[q[0], q[1], -1, -1] for q in reversed(qryRepeat)]

        # Walk the CIGAR, collecting mutations and translating query
        # repeat intervals into reference coordinates.
        mut = []
        alnSite = [p[7], p[2] if d > 0 else p[3]-1]
        for cl, ct in re.findall(r'(\d+)([MID])', cigar) :
            cl = int(cl)
            if ct == 'M' :
                # extract aligned sequences
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d > 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)])
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[1][(alnSite[1]-cl+1):(alnSite[1]+1)]))

                e =[alnSite[0]+cl, alnSite[1]+cl*d]
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e[1] :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0] + d*(qr[0] - alnSite[1])
                        if d*qr[1] <= d*e[1] :
                            qr[3] = alnSite[0] + d*(qr[1] - alnSite[1])
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                for id, (rr, rr1, qq, qq1) in enumerate(np.array([list(r), list(r1), list(q), list(q1)]).T) :
                    # Mask sites whose base quality (either side) is below
                    # chr 43 ('+', Phred+33 Q10).
                    if ord(rr1) < 43 or ord(qq1) < 43 :
                        maskedRegion[(p[5], alnSite[0]+id)] = 0
                    if rr != qq and rr != 'N' and qq != 'N' :
                        mut.append([alnSite[0]+id, alnSite[1]+id*d, rr, qq, p[4]])
                alnSite = e
            elif ct == 'I' :
                # NOTE(review): the strand conditions here (d < 0 for q vs
                # d > 0 in the M-branch above) and the use of qry[0] for q1
                # look inconsistent with the M-branch -- possibly a bug;
                # confirm before changing.
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d < 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)] )
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)] ))
                
                e = alnSite[1] + cl*d
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0]
                        if d*qr[1] <= d*e :
                            qr[3] = alnSite[0]
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                
                if ord(min(list(q1))) >= 43 :
                    mut.append([alnSite[0], min(alnSite[1], e), '.', '+' + q, p[4]])
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[1] = e
            elif ct == 'D' :
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                if ord(min(list(r1))) >= 43 :
                    mut.append([alnSite[0], int(alnSite[1]+0.5*d), '.', '-' + r, p[4]])
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[0]+=cl
        p[14] = mut
        refRepeat.extend([ [p[5], pp[0], pp[1]] for pp in p[15] ])

    # Merge reference repeat intervals that lie within 20 bp of each other.
    repeats = []
    if len(refRepeat) :
        refRepeat.sort()
        repeats = [refRepeat[0]]
        for p in refRepeat[1:] :
            if p[0] != repeats[-1][0] or p[1] - 20 > repeats[-1][2] :
                repeats.append(p)
            elif p[2] > repeats[-1][2] :
                repeats[-1][2] = p[2]

    for p in repeats :
        for site in xrange(p[1], p[2]) :
            maskedRegion[(p[0], site)] = 1

    # Re-derive contiguous masked intervals for the GFF 'unsure' records.
    repeats = []
    for cont, site in sorted(maskedRegion) :
        if len(repeats) == 0 or repeats[-1][0] != cont or repeats[-1][2]+1 < site :
            repeats.append([cont, site, site])
        else :
            repeats[-1][2] = site
  
    # Keep only mutations outside masked regions (indel starts may touch
    # a low-quality mask but not a repeat mask).
    mutations = []
    alignments = [aln for aln in alignments if aln[9] >= 100]
    for aln in alignments :
        for m in aln[14] :
            if len(m[3]) == 1 :
                if (aln[5], m[0]) not in maskedRegion :
                    mutations.append([aln[5], aln[0]] + m)
            elif maskedRegion.get((aln[5], m[0]), 0) != 1 :
                if m[3].startswith('-') and maskedRegion.get((aln[5], m[0]+len(m[3])-2), 0) > 0 :
                    continue
                mutations.append([aln[5], aln[0]] + m)
    # Emit alignments, uncertain regions and variants as GFF3.
    with uopen(prefix + '.gff.gz', 'w') as fout :
        fout.write('##gff-version 3\n')
        fout.write('## Reference: {0}\n'.format(reference))
        fout.write('## Query: {0}\n'.format(query))
        fout.write('## Tag: {0}\n'.format(tag))
        for aln in alignments :
            if aln[5] == aln[0] and aln[2] == aln[7] and aln[3] == aln[8] :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Self%20Alignments"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4], aln[0], aln[2]+1, aln[3], 
                ))
            else :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Aligned%20with%20{5}:{6}-{7}"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4], aln[0], aln[2]+1, aln[3], 
                ))
                
        for p in repeats :
            fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="Uncertain%20base%20calling%20or%20ambigious%20alignment"\n'.format(
                p[0], p[1]+1, p[2]+1, 
            ))
        for mut in mutations :
            e1 = mut[2] if not mut[5].startswith('-') else mut[2] + len(mut[5]) - 2
            e2 = mut[3] if not mut[5].startswith('+') else mut[3] + len(mut[5]) - 2
            if len(mut[5]) > 26 :
                mut[5] = '{0}[{1}bps]'.format(mut[5][0], len(mut[5])-1)

            fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t/replace="{7}";/compare="{3}:{4}-{5}:{8}";/origin="{6}"\n'.format(
                mut[0], mut[2]+1, e1+1, mut[1], mut[3]+1, e2+1, mut[4], mut[5], mut[6]
            ))

    return [tag, prefix + '.gff.gz']
Beispiel #25
0
def loadBam(prefix, reference, bams, sequences, snps):
    """Generate a consensus FASTQ for covered reference contigs from mapped reads.

    Runs `samtools depth` on each BAM to find reference contigs with any
    read coverage, writes those contigs as a Pilon reference, runs Pilon
    (--fix snps,indels,gaps --vcf), then converts the resulting VCF into
    a per-base consensus with Phred-like qualities, written to
    `<prefix>.metaCaller.fastq`.

    :param prefix: path prefix used for all intermediate and output files
    :param reference: FASTA file with the reference sequences
    :param bams: list of BAM paths; entries may be None and are skipped
    :param sequences: mapping of contig name -> per-site values; a site
        contributes to the mix-up estimate only when its value is >= 0
        (presumably a validity/coverage mask -- confirm against callers)
    :param snps: unused here; kept for interface compatibility
    :return: path of the generated FASTQ file
    """
    sequence = readFasta(reference)
    # Attach a per-base quality array (initialised to 0) to every contig.
    sequence = {n: [s, [0] * len(s)] for n, s in sequence.items()}

    # Contig names that received any read coverage in at least one BAM.
    sites = {}
    for bam in bams:
        if bam is not None:
            depth = subprocess.Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **externals).split(),
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True)
            try:
                # samtools depth emits no header line; header=None keeps the
                # first record as data (the previous default consumed it as
                # column names, losing that row and potentially a contig
                # that only appeared there).
                d = pd.read_csv(depth.stdout, sep='\t', header=None).values
                sites.update({cName: 1 for cName in np.unique(d.T[0])})
            except Exception:
                # Best effort: a BAM with no coverage yields empty output,
                # which pandas raises on.
                pass

    # Keep only covered contigs and write them as the Pilon reference,
    # wrapped at 100 bases per line.
    sequence = {n: s for n, s in sequence.items() if n in sites}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([
                    s[0][site:(site + 100)]
                    for site in xrange(0, len(s[0]), 100)
                ])))

    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        prefix=prefix, bam_opt=bam_opt, **externals)
    subprocess.Popen(pilon_cmd.split(),
                     stdout=subprocess.PIPE,
                     universal_newlines=True).communicate()

    # First VCF pass: at ambiguous biallelic SNP sites (het genotype or
    # low QD) collect [majority count, sum of minority counts] to
    # estimate p, the proportion of reads supporting the majority base.
    uncertains = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if sequences[part[0]][int(part[1]) - 1] >= 0:
                if len(part[3]) == 1 and len(part[4]) == 1:
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])  # DP=<depth>
                    if dp >= 3:
                        qd = int(pp[4][3:])  # QD-style field at index 4
                        if part[-1] == '0/1' or qd < 10:
                            bcs = sorted(
                                [float(bc) for bc in pp[5][3:].split(',')])
                            uncertains.append([bcs[-1], np.sum(bcs[:-1])])
    uncertains = np.array(uncertains)
    if uncertains.size:
        p = np.sum(uncertains.T[0]) / np.sum(uncertains)
    else:
        # No ambiguous site was observed; fall back to a high majority
        # proportion so the quality model below still works (the
        # original code crashed on the empty array here).
        p = 0.99
    # Per-read quality contribution derived from the odds of p.
    qPerRead = 10 * (np.log10(p) - np.log10(1 - p))
    for n in sequence:
        sequence[n][0] = list(sequence[n][0])

    # Second VCF pass: apply base calls and convert evidence to qualities.
    highQ, lowQ, lowC = 0, 0, 0
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if len(part[3]) == 1 and len(part[4]) == 1:
                s = int(part[1]) - 1
                pp = part[7].split(';')
                dp = float(pp[0][3:])
                qd = int(pp[4][3:])
                if part[-1] == '0/1' or qd < 10:
                    # Ambiguous site: call the majority base from the
                    # per-base counts (BC field), or keep REF if no counts.
                    bcs = np.array([int(bc) for bc in pp[5][3:].split(',')])
                    if np.sum(bcs) > 0:
                        sequence[part[0]][0][s] = ['A', 'C', 'G',
                                                   'T'][np.argmax(bcs)]
                    else:
                        sequence[part[0]][0][s] = part[3]
                    if dp < 3:
                        lowC += 1
                    else:
                        bcs.sort()
                        bcs = [bcs[-1], np.sum(bcs[:-1])]
                        # q1: how plausible this majority/minority split is
                        # under the estimated mix-up rate; q2: quality score.
                        q1 = binom.cdf(bcs[0], bcs[0] + bcs[1], p)
                        q2 = qPerRead * (bcs[0] - bcs[1]) if q1 >= 0.05 else 1
                        if q2 >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = min(40, max(1, int(q2)))
                else:
                    if dp < 3:
                        lowC += 1
                    else:
                        if qd >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = qd
                    if part[-1] == '1/1':
                        sequence[part[0]][0][s] = part[4]

    logger(
        '{0}: Expected mix-up: {1} {2} ; Got highQ {3} ; lowQ {4} ; lowC {5}'.
        format(prefix, uncertains.shape[0], p, highQ, lowQ, lowC))
    # Emit the consensus as FASTQ; qualities are Phred+33 encoded.
    with open('{0}.metaCaller.fastq'.format(prefix), 'w') as fout:
        p = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(
                p + '_' + n, ''.join(s), ''.join([chr(qq + 33) for qq in q])))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    os.unlink('{0}.mapping.fasta'.format(prefix))
    os.unlink('{0}.mapping.reference.fasta'.format(prefix))
    return '{0}.metaCaller.fastq'.format(prefix)
Beispiel #26
0
 def make_alignment( filename ) :
     """Parse a LAST MAF alignment file and resolve overlapping alignments.

     Each alignment block ('a' line with score, two 's' lines for
     reference and query, optional 'p'/'q' quality lines) is converted to
     a comparison record via last_package.call_mutation.  Overlapping
     alignments are then trimmed or discarded in both query and reference
     coordinates, repetitive regions are collected, and the surviving
     per-site variants are accumulated.

     Returns a tuple (comparisons, repetitive_regions, mutations) where
     mutations is {ref_contig: {site: {alter: details}}}.
     """
     comparisons = []
     with open(filename, 'r') as fin:
         for line in fin:
             if line[0] == 'a' :
                 # 'a score=NNN' starts a new alignment block; keep the score.
                 comparison = [ int(line.split(' ', 2)[1][6:]) ]
             elif line[0] == 's' :
                 # sequence line: name, start, aligned length, strand, source size, text
                 part = line.strip().split()[1:]
                 part[1:5] = [int(part[1]), int(part[2]), part[3], int(part[4])]
                 if part[3] == '+' :
                     # convert 0-based start/length to 1-based inclusive coordinates
                     part[1:3] = [part[1]+1, part[1]+part[2]]
                 else :
                     # reverse strand: coordinates counted back from the sequence end
                     part[1:3] = [part[4]-part[1], part[4]-part[1]-part[2]+1]
                 comparison.extend(part)
                 if len(comparison) >= 13 :
                     # both 's' lines seen: mark columns where either side is N/n
                     comparison.append([int((m in 'nN') or (n in 'Nn')) for m, n in zip(comparison[6], comparison[12])])
             elif line[0] in 'pq' :
                 # probability/quality line: flag columns with low-quality symbols
                 part = line.strip().split()
                 comparison[13] = [ max(comparison[13][id], int(b in '!"#$%&\'()*+,-./')) for id, b in enumerate(part[-1])]
             elif len(line.strip()) == 0 :
                 # blank line ends a block; keep alignments scoring >= 200
                 if comparison[0] >= 200 : 
                     comparisons.append( last_package.call_mutation(comparison) )
     
     # remove significant low identity regions in query
     comparisons.sort(key=lambda x: min(x[8:10]) )
     comparisons.sort(key=lambda x: x[7] )
 
     low_q = []
     for id, regi in enumerate(comparisons) :
         if len(regi) == 0 : continue
         for jd in xrange(id+1, len(comparisons)) :
             regj = comparisons[jd]
             if len(regj) == 0 : continue
             if regi[7] != regj[7] : break
             si, ei = sorted(regi[8:10])
             sj, ej = sorted(regj[8:10])
             s = max(si, sj)
             e = min(ei, ej)
             if e >= s :
                 # the two alignments overlap in query coordinates [s, e]
                 overlap_i = last_package.sub_comparison(regi, qry_coords=[s, e])
                 overlap_j = last_package.sub_comparison(regj, qry_coords=[s, e])
                 
                 if overlap_i[0] < 0.95 * overlap_j[0] and ( regi[0] < regj[0] or ei < ej ) : 
                     if s - si >= 30 :
                         # keep the non-overlapping prefix of regi; excise the rest
                         comparisons[id] = last_package.sub_comparison(regi, qry_coords=[si, s-1])
                         #if overlap_i[3] < overlap_i[2] and overlap_i[4] == '+' :
                             #print comparisons[id]
                         overlap_i[12] = 'E'
                         low_q.append(overlap_i)
                         if overlap_i[3] >= overlap_i[2]  :
                             regi = comparisons[id]
                         if len(regi) == 0: break
                     else :
                         # overlap covers nearly all of regi: mark it excluded
                         comparisons[id][12] = 'E'
                         break
                 elif overlap_i[0] * 0.95 > overlap_j[0] :
                     if ej - e >= 30 :
                         comparisons[jd] = last_package.sub_comparison(regj, qry_coords=[e+1, ej])
                         overlap_j[12] = 'E'
                         if overlap_j[3] >= overlap_j[2]  :
                             low_q.append(overlap_j)
                     else :
                         comparisons[jd][12] = 'E'
                 elif s == si and e == ei and regj[0] > regi[0]*3 and overlap_i[0] <= overlap_j[0] :
                     # regi fully contained in a much stronger regj: drop regi
                     comparisons[id][12] = 'E'
                     break
                 elif s == sj and e == ej and regi[0] > regj[0]*3 and overlap_i[0] >= overlap_j[0] :
                     comparisons[jd][12] = 'E'
                 else :
                     # comparable scores: mark both as duplicated/ambiguous
                     comparisons[id][12] = 'D'
                     comparisons[jd][12] = 'D'
             else :
                 break
 
     # remove significant low identity regions in reference
     comparisons = sorted([x for x in comparisons if len(x) > 0] + low_q, key=lambda x: x[2] )
     comparisons.sort(key=lambda x: x[1] )
 
     for id, regi in enumerate(comparisons) :
         if len(regi) == 0 : continue
         for jd in xrange(id+1, len(comparisons)) :
             regj = comparisons[jd]
             if len(regj) == 0 : continue
             if regi[1] != regj[1] : break
             si, ei = regi[2:4]
             sj, ej = regj[2:4]
             s = max(si, sj)
             e = min(ei, ej)
             if e >= s :                
                 # overlap in reference coordinates [s, e]
                 overlap_i = last_package.sub_comparison(regi, ref_coords=[s, e])
                 overlap_j = last_package.sub_comparison(regj, ref_coords=[s, e])
                 
                 if overlap_i[0] < 0.95 * overlap_j[0] and ( regi[0] < regj[0] or ei < ej ) : 
                     if s - si >= 30 :
                         comparisons[id] = last_package.sub_comparison(regi, ref_coords=[si, s-1])
                         regi = comparisons[id]
                         if len(regi) == 0: break
                     else :
                         comparisons[id] = []
                         break
                 elif overlap_i[0] * 0.95 > overlap_j[0] :
                     if ej - e >= 30 :
                         comparisons[jd] = last_package.sub_comparison(regj, ref_coords=[e+1, ej])
                     else :
                         comparisons[jd] = []
                 elif overlap_i[0] == overlap_j[0] and len(overlap_i) == len(overlap_j) :
                     # identical scores: deduplicate if the variant lists agree
                     if si == sj and ei == ej:
                         diff = 0
                         for i, i_snp in enumerate(overlap_i[13:]) :
                             j_snp = overlap_j[13+i]
                             if i_snp[0] != j_snp[0] or i_snp[5] != j_snp[5] :
                                 diff = 1
                                 break
                         if diff == 0 :
                             # same span, same variants: keep only one copy
                             if comparisons[id][12] in 'DE':
                                 comparisons[id] = [] 
                                 break
                             else: 
                                 comparisons[jd] = []
                     elif si <= sj and ei >= ej :
                         diff = 0
                         for i, i_snp in enumerate(overlap_i[13:]) :
                             j_snp = overlap_j[13+i]
                             if i_snp[0] != j_snp[0] or i_snp[5] != j_snp[5] :
                                 diff = 1
                                 break
                         if diff == 0 :
                             # regj is contained within regi with no new variants
                             comparisons[jd] = []
                     elif si >= sj and ei <= ej:
                         diff = 0
                         for i, i_snp in enumerate(overlap_i[13:]) :
                             j_snp = overlap_j[13+i]
                             if i_snp[0] != j_snp[0] or i_snp[5] != j_snp[5] :
                                 diff = 1
                                 break
                         if diff == 0 :
                             comparisons[id] = []
                             break
             else :
                 break
         if len(comparisons[id]) > 0 :
             # drop low-quality spans that fell outside the trimmed region
             regi = comparisons[id]
             regi[6] = [lq for lq in regi[6] if lq[1] >= regi[2] and lq[0] <= regi[3]]
     
     # mark repetitive regions in query
     repeats = []
     mutations = {}
     comparisons = sorted([x for x in comparisons if len(x) > 0 and x[12] != 'E'], key=lambda x: min(x[8:10]) )
     comparisons.sort(key=lambda x: x[7] )
     
     for id, regi in enumerate(comparisons) :
         for jd in xrange(id+1, len(comparisons)) :
             regj = comparisons[jd]
             if regi[7] != regj[7] : break
             si, ei = sorted(regi[8:10])
             sj, ej = sorted(regj[8:10])
             s = max(si, sj)
             e = min(ei, ej)
             if e >= s :
                 # flag variants inside the repeat overlap so they are not called
                 for mut in regi[13:] :
                     if abs(min(mut[2:4])) <= e and abs(max(mut[2:4])) >= s :
                         mut[6] = 1
                 for mut in regj[13:] :
                     if abs(min(mut[2:4])) <= e and abs(max(mut[2:4])) >= s :
                         mut[6] = 1
                 overlap_i = last_package.sub_comparison(regi, qry_coords=[s, e])
                 overlap_j = last_package.sub_comparison(regj, qry_coords=[s, e])
                 regi[6] = [lq for lq in regi[6] if lq[0] <overlap_i[2] or lq[1] > overlap_i[3]]
                 regj[6] = [lq for lq in regj[6] if lq[0] <overlap_j[2] or lq[1] > overlap_j[3]]
                 repeats.append(overlap_i[1:4] + [0])
                 repeats.append(overlap_j[1:4] + [0])
         
     # identify repetitive regions in the reference
     comparisons.sort(key=lambda x: x[2] )
     comparisons.sort(key=lambda x: x[1] )
 
     for id, regi in enumerate(comparisons) :
         if len(regi) == 0 : continue
         for jd in xrange(id+1, len(comparisons)) :
             regj = comparisons[jd]
             if regi[1] != regj[1] : break
             si, ei = sorted(regi[2:4])
             sj, ej = sorted(regj[2:4])
             s = max(si, sj)
             e = min(ei, ej)
             if e >= s :
                 overlap_i = last_package.sub_comparison(regi, ref_coords=[s, e])
                 overlap_j = last_package.sub_comparison(regj, ref_coords=[s, e])
                 if len(overlap_i) == len(overlap_j) :
                     # same number of variants: only repetitive if they disagree
                     diff = 0
                     for i, i_snp in enumerate(overlap_i[13:]) :
                         j_snp = overlap_j[13+i]
                         if i_snp[0] != j_snp[0] or i_snp[5] != j_snp[5] :
                             diff = 1
                             break
                     if diff == 1 :
                         for mut in regi[13:] :
                             if abs(mut[0]) <= e and abs(mut[1]) >= s :
                                 mut[6] = 1
                         for mut in regj[13:] :
                             if abs(mut[0]) <= e and abs(mut[1]) >= s :
                                 mut[6] = 1
                         regi[6] = [lq for lq in regi[6] if lq[0] <s or lq[1] > e]
                         regj[6] = [lq for lq in regj[6] if lq[0] <s or lq[1] > e]
                         repeats.append([regi[1], s, e, 0])
                 else :
                     # different variant counts: always treat overlap as repeat
                     for mut in regi[13:] :
                         if abs(mut[0]) <= e and abs(mut[1]) >= s :
                             mut[6] = 1
                     for mut in regj[13:] :
                         if abs(mut[0]) <= e and abs(mut[1]) >= s :
                             mut[6] = 1
                     regi[6] = [lq for lq in regi[6] if lq[0] <s or lq[1] > e]
                     regj[6] = [lq for lq in regj[6] if lq[0] <s or lq[1] > e]
                     repeats.append([regi[1], s, e, 0])
         # accumulate unflagged variants into the mutations dictionary
         for mut in regi[13:] :
             if mut[6] == 0 :
                 if regi[1] not in mutations : 
                     mutations[ regi[1] ] = {}
                 if mut[0] not in mutations[ regi[1] ]:
                     mutations[ regi[1] ] [ mut[0] ] = {}
                 if mut[5] not in mutations[ regi[1] ] [ mut[0] ] :
                     mutations[ regi[1] ] [ mut[0] ] [ mut[5] ] = [regi[7], regi[10]] + mut
                 else : 
                     mutations[ regi[1] ] [ mut[0] ] [ mut[5] ].extend([regi[7], regi[10]] + mut)
         repeats.extend([[regi[1]]+ lq[:2] + [1] for lq in regi[6]])
     
     # merge adjacent/overlapping repeat intervals per contig
     repeats.sort(key=lambda x:x[1])
     repeats.sort(key=lambda x:x[0])
     repetitive_regions = []
     for rep in repeats:
         if len(repetitive_regions) == 0 or repetitive_regions[-1][0] != rep[0] or repetitive_regions[-1][2]+1 < rep[1] :
             repetitive_regions.append(rep)
         elif rep[2] > repetitive_regions[-1][2] :
             repetitive_regions[-1][2] = rep[2]
             if repetitive_regions[-1][3] > 0 :
                 repetitive_regions[-1][3] = rep[3]
     # sites inside repeats or weak alignments are excluded from calling
     nocall = {}
     for r in repetitive_regions + [c[1:] for c in comparisons if float(c[0])/(abs(c[3]-c[2])+1) < 0.7 or c[0] < 200] :
         for s in xrange(r[1], r[2]+1) :
             nocall[(r[0], s)] = 1
     mutations = { contig:{ site:alters for site, alters in variation.items() if (contig, site) not in nocall } for contig, variation in mutations.items() }
     comparisons = [c for c in comparisons if float(c[0])/(abs(c[3]-c[2])+1) >= 0.7 and c[0] >= 200]
     return comparisons, repetitive_regions, mutations
Beispiel #27
0
def write_output(prefix, prediction, genomes, clust_ref, old_prediction):
    """Write gene predictions as GFF plus an allele FASTA.

    Reads the tab-separated prediction table, trims/extends each hit to
    codon boundaries, merges near-adjacent fragments of the same gene,
    assigns allele IDs (writing novel alleles to `<prefix>.allele.fna`),
    determines CDS start/stop via translation, and emits one GFF line per
    prediction to `<prefix>.EToKi.gff`.

    :param prefix: output file prefix
    :param prediction: path of the tab-separated prediction table
    :param genomes: mapping of genome/contig name -> [.., sequence]
    :param clust_ref: mapping of ortholog group -> reference allele sequence
    :param old_prediction: mapping of contig -> prior annotations used for
        locus_tag carry-over
    """
    predictions, alleles = {}, {}

    allele_file = open('{0}.allele.fna'.format(prefix), 'w')
    prediction = pd.read_csv(prediction, sep='\t', header=None).values
    for part in prediction:
        #with open(prediction) as fin :
        #for line in fin :
        #part = line.strip().split()
        if part[0] not in alleles:
            # first sighting of this ortholog group: reference is allele 1
            alleles[part[0]] = {clust_ref[part[0]]: 1}
            allele_file.write('>{0}_{1}\n{2}\n'.format(part[0], 1,
                                                       clust_ref[part[0]]))

        # l/r: unaligned bases on the left/right; d: query strand direction
        if part[9] < part[10]:
            l, r, d = min(part[7] - 1,
                          part[9] - 1), min(part[12] - part[8],
                                            part[13] - part[10]), 1
        else:
            l, r, d = min(part[7] - 1,
                          part[13] - part[9]), min(part[12] - part[8],
                                                   part[10] - 1), -1
        # extend to the reference start if close, else trim to a codon boundary
        if l <= 6 and part[7] - l == 1:
            part[7], part[9] = part[7] - l, part[9] - l * d
        else:
            ll = (part[7] - 1) % 3
            if ll > 0:
                part[7], part[9] = part[7] + 3 - ll, part[9] + (3 - ll) * d
        if r <= 6 and part[8] + r == part[12]:
            part[8], part[10] = part[8] + r, part[10] + r * d
        else:
            rr = (part[12] - part[8]) % 3
            if rr > 0:
                # NOTE(review): `(3 + rr)` is asymmetric with the left-side
                # `(3 - ll)` adjustment above -- looks like it should be
                # `(3 - rr)` to move the query coordinate by the same amount
                # as part[8]; confirm against the upstream implementation.
                part[8], part[10] = part[8] - 3 + rr, part[10] - (3 + rr) * d

        # normalise query coordinates to (start, end, strand)
        if part[9] < part[10]:
            part[9:12] = part[9], part[10], '+'
        else:
            part[9:12] = part[10], part[9], '-'

        if part[4] not in predictions:
            predictions[part[4]] = []
        elif predictions[part[4]][-1][2] == part[2]:
            # consecutive hit of the same gene: merge if close enough
            prev = predictions[part[4]][-1]
            if prev[5] == part[5] and part[7] - prev[8] < 500:
                if part[11] == '+' and part[9] - prev[10] < 500:
                    prev[8], prev[10] = part[8], part[10]
                    continue
                elif part[11] == '-' and prev[9] - part[10] < 500:
                    prev[8], prev[9] = part[8], part[9]
                    continue
            # unmergeable split hit: flag both pieces as fragments (-1)
            predictions[part[4]][-1][1], part[1] = -1, -1
        predictions[part[4]].append(part)

    # op caches (contig, scan offset, prior annotations) across predictions
    op = ['', 0, []]
    with open('{0}.EToKi.gff'.format(prefix), 'w') as fout:
        for gid, (g, predict) in enumerate(predictions.items()):
            predict.sort(key=itemgetter(5, 9, 10))
            for pid, pred in enumerate(predict):
                if pred[1] == -1 or (pred[10] - pred[9] + 1) <= 0.8 * pred[12]:
                    # fragmented or too-short hit: no allele assignment
                    cds, allele_id = 'fragment:{0:.2f}%'.format(
                        (pred[10] - pred[9] + 1) * 100 / pred[12]), 'uncertain'
                    start, stop = pred[9:11]
                else:
                    # pad up to 60 bp upstream / 600 bp downstream (in codons)
                    # to search for start and stop codons
                    s, e = pred[9:11]
                    if pred[11] == '+':
                        s2, e2 = s - min(int(3 * ((s - 1) / 3)), 60), e + min(
                            3 * int((pred[13] - e) / 3), 600)
                        seq = genomes[pred[5]][1][(s2 - 1):e2]
                        lp, rp = s - s2, e2 - e
                    else:
                        s2, e2 = s - min(int(3 * ((s - 1) / 3)), 600), e + min(
                            3 * int((pred[13] - e) / 3), 60)
                        seq = rc(genomes[pred[5]][1][(s2 - 1):e2])
                        rp, lp = s - s2, e2 - e

                    # the unpadded matched sequence defines the allele
                    seq2 = seq[(lp):(len(seq) - rp)]
                    if seq2 not in alleles[pred[0]]:
                        if pred[3] == pred[0] and pred[7] == 1 and pred[
                                8] == pred[12]:
                            alleles[pred[0]][seq2] = len(alleles[pred[0]]) + 1
                        else:
                            # partial or cross-group match: low-quality allele
                            alleles[pred[0]][seq2] = 'LowQ{0}'.format(
                                len(alleles[pred[0]]) + 1)
                        allele_id = str(alleles[pred[0]][seq2])
                        allele_file.write('>{0}_{1}\n{2}\n'.format(
                            pred[0], allele_id, seq2))
                    else:
                        allele_id = str(alleles[pred[0]][seq2])

                    # translate in the relevant frame(s) to locate start/stop
                    frames = sorted(set([0, len(seq) % 3]))
                    for frame, aa_seq in zip(
                            frames,
                            transeq({'n': seq},
                                    transl_table='starts',
                                    frame=','.join(
                                        [str(f + 1) for f in frames]))['n']):
                        cds = 'CDS'
                        # prefer a start codon shortly after the pad; fall
                        # back to the last one before it
                        s0, s1 = aa_seq.find('M', int(lp / 3),
                                             int(lp / 3 + 30)), aa_seq.rfind(
                                                 'M', 0, int(lp / 3))
                        start = s0 if s0 >= 0 else s1
                        if start < 0:
                            cds, start = 'nostart', int(lp / 3)
                        stop = aa_seq.find('X', start)
                        if 0 <= stop < lp / 3 + 30:
                            # early stop: retry from a later start codon
                            s0 = aa_seq.find('M', stop, int(lp / 3 + 30))
                            if s0 >= 0:
                                start = s0
                                stop = aa_seq.find('X', start)
                        if stop < 0:
                            cds = 'nostop'
                        elif (stop - start + 1) * 3 <= 0.8 * pred[12]:
                            cds = 'premature stop:{0:.2f}%'.format(
                                (stop - start + 1) * 300 / pred[12])

                        if cds == 'CDS':
                            # map amino-acid positions back to genomic bases
                            if pred[11] == '+':
                                start, stop = s2 + start * 3 + frame, s2 + stop * 3 + 2 + frame
                            else:
                                start, stop = e2 - stop * 3 - 2 - frame, e2 - start * 3 - frame
                            break
                        else:
                            start, stop = s, e
                            if frame > 0:
                                cds = 'frameshift'

                # carry over locus_tags from in-frame overlapping old annotations
                if pred[5] != op[0]:
                    op = [pred[5], 0, old_prediction.get(pred[5], [])]
                old_tag = []
                for k in xrange(op[1], len(op[2])):
                    opd = op[2][k]
                    if opd[2] < start:
                        op[1] = k + 1
                    elif opd[1] > stop:
                        break
                    elif opd[3] != pred[11]:
                        continue
                    ovl = min(opd[2], stop) - max(opd[1], start) + 1
                    if ovl >= 300 or ovl >= 0.6 * (
                            opd[2] - opd[1] + 1) or ovl >= 0.6 * (stop -
                                                                  start + 1):
                        frame = min((opd[1] - start) % 3, (opd[2] - stop) % 3)
                        if frame == 0:
                            old_tag.append('{0}:{1}-{2}'.format(*opd))

                fout.write(
                    '{0}\t{1}\tEToKi-ortho\t{2}\t{3}\t.\t{4}\t.\tID={5};{12}inference=ortholog group:{6},allele ID:{7},matched region:{8}-{9}{10}{11}\n'
                    .format(
                        pred[5],
                        'CDS' if cds == 'CDS' else 'pseudogene',
                        start,
                        stop,
                        pred[11],
                        '{0}_{1}_{2}'.format(prefix, gid, pid),
                        pred[0],
                        allele_id,
                        s,
                        e,
                        '' if pred[0] == pred[3] else
                        ',structure variant group:' + pred[3],
                        '' if cds == 'CDS' else ';pseudogene=' + cds,
                        '' if len(old_tag) == 0 else 'locus_tag={0};'.format(
                            ','.join(old_tag)),
                    ))
    allele_file.close()
    return
Beispiel #28
0
def filt_per_group(data):
    """Filter paralogous sequences out of one ortholog group.

    Compares every pair of member sequences (plus the group reference)
    against expected between-genome divergences; if incompatible pairs
    exist, builds a phylogeny of representative sequences and iteratively
    cuts away the subtree that resolves the most incompatibilities,
    keeping the side containing the reference.

    :param data: tuple (mat, ref, global_file) where mat is a numpy record
        matrix (column 1 = genome id, column 4 = encoded sequence), ref is
        the reference sequence string, and global_file is an .npz of
        per-genome-pair divergence statistics
    :return: the (possibly reduced) mat containing only retained members
    """
    mat, ref, global_file = data
    global_differences = dict(np.load(global_file))
    nMat = mat.shape[0]
    # stack member sequences plus the reference as a byte matrix;
    # anything that is not A/C/G/T becomes '-' (45)
    seqs = np.vstack([
        np.vstack(mat.T[4]),
        np.array(list(ref)).view(asc2int).astype(np.uint8)[np.newaxis, :]
    ])
    seqs[np.in1d(seqs, [65, 67, 71, 84], invert=True).reshape(seqs.shape)] = 45
    diff = compare_seq(
        seqs, np.zeros(shape=[seqs.shape[0], seqs.shape[0], 2],
                       dtype=int)).astype(float)
    incompatible, distances = {}, np.zeros(
        shape=[seqs.shape[0], seqs.shape[0]], dtype=float)
    for i1, m1 in enumerate(mat):
        for i2 in xrange(i1 + 1, nMat):
            m2 = mat[i2]
            mut, aln = diff[i1, i2]
            if aln > 0:
                # normalise the observed distance by the expected divergence
                # of this genome pair (default (0.01, 4) when unknown)
                gd = global_differences.get(tuple(sorted([m1[1], m2[1]])),
                                            (0.01, 4))
                distances[i1, i2] = distances[i2, i1] = max(
                    0., 1 - (aln - mut) / aln / (1 - gd[0]))
                difference = mut / aln / gd[0] / gd[1]
            else:
                # no aligned overlap: treat as distant
                distances[i1, i2] = distances[i2, i1] = 0.8
                difference = 1.5
            if difference > 1.:
                # more divergent than expected: likely paralogous pair
                incompatible[(i1, i2)] = 1

    if len(incompatible) > 0:
        # cluster near-identical members; each group is represented by its
        # first member in the phylogeny below
        groups = []
        for j, m in enumerate(mat):
            novel = 1
            for g in groups:
                if diff[g[0], j, 0] <= 0.6 * (
                        1.0 - params['clust_identity']) * diff[g[0], j, 1]:
                    g.append(j)
                    novel = 0
                    break
            if novel:
                groups.append([j])
        group_tag = {gg: g[0] for g in groups for gg in g}
        try:
            # tostring() on older numpy returns bytes; decode when possible
            tags = {
                g[0]: mat[g[0]][4].tostring().decode('ascii')
                for g in groups
            }
        except:
            tags = {g[0]: mat[g[0]][4].tostring() for g in groups}

        tags.update({'REF': ref})

        # re-key incompatibilities by group representative
        ic2 = {}
        for i1, i2 in incompatible:
            t1, t2 = group_tag[i1], group_tag[i2]
            if t1 != t2:
                t1, t2 = str(t1), str(t2)
                if t1 not in ic2: ic2[t1] = {}
                if t2 not in ic2: ic2[t2] = {}
                ic2[t1][t2] = ic2[t2][t1] = 1
        incompatible = ic2

        # build a tree of the representatives; retry up to 3 times with
        # extra trailing newlines (works around occasional tool hiccups)
        for ite in xrange(3):
            try:
                tmpFile = tempfile.NamedTemporaryFile(dir='.', delete=False)
                for n, s in tags.items():
                    tmpFile.write('>X{0}\n{1}\n{2}'.format(
                        n, s, '\n' * ite).encode('utf-8'))
                tmpFile.close()
                cmd = params[params['orthology']].format(
                    tmpFile.name, **
                    params) if len(tags) < 500 else params['nj'].format(
                        tmpFile.name, **params)
                phy_run = subprocess.Popen(shlex.split(cmd),
                                           stdin=subprocess.PIPE,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE,
                                           universal_newlines=True)
                gene_phy = ete3.Tree(phy_run.communicate()[0].replace("'", ''))
                break
            except:
                # give up after 3 failed attempts and keep the full matrix
                if ite == 2:
                    return mat
            finally:
                os.unlink(tmpFile.name)
        # strip the 'X' prefix added to leaf names above
        for n in gene_phy.get_leaves():
            if len(n.name):
                n.name = n.name[1:]

        node = gene_phy.get_midpoint_outgroup()
        if node is not None:
            gene_phy.set_outgroup(node)

        # iteratively cut the branch whose removal resolves the most
        # incompatible pairs (ties broken by branch length)
        for ite in xrange(3000):
            gene_phy.ic, gene_phy.dist = {}, 0.
            rdist = sum([c.dist for c in gene_phy.get_children()])
            for c in gene_phy.get_children():
                c.dist = rdist
            for node in gene_phy.iter_descendants('postorder'):
                if node.is_leaf():
                    node.ic = {
                        tuple(sorted([node.name, n2])): 1
                        for n2 in incompatible.get(node.name, {})
                    }
                else:
                    # XOR of child sets: a pair separated by this branch
                    # appears exactly once below it
                    node.ic = {}
                    for c in node.get_children():
                        for x in c.ic:
                            if x in node.ic:
                                node.ic.pop(x)
                            else:
                                node.ic[x] = 1
            cut_node = max([[len(n.ic), n.dist, n]
                            for n in gene_phy.iter_descendants('postorder')],
                           key=lambda x: (x[0], x[1]))
            if cut_node[0] > 0:
                cut_node = cut_node[2]
                prev_node = cut_node.up
                cut_node.detach()
                # always keep the side that contains the reference
                if 'REF' in cut_node.get_leaf_names():
                    gene_phy = cut_node
                elif prev_node.is_root():
                    gene_phy = gene_phy.get_children()[0]
                else:
                    prev_node.delete(preserve_branch_length=True)

                # prune resolved incompatibilities
                tips = set(gene_phy.get_leaf_names())
                for r1 in list(incompatible.keys()):
                    if r1 not in tips:
                        rr = incompatible.pop(r1, None)
                        for r2 in rr:
                            incompatible.get(r2, {}).pop(r1, None)
                for r1 in list(incompatible.keys()):
                    if len(incompatible[r1]) == 0:
                        incompatible.pop(r1, None)
                if len(incompatible) == 0:
                    break

                logger('     Iteration {0}. Remains {1} tips.'.format(
                    ite + 1, len(gene_phy.get_leaf_names())))
            else:
                break
        if len(gene_phy.get_leaf_names()) < len(tags):
            # expand surviving representatives back to all group members
            groups = {str(g[0]): g for g in groups}
            tips = sorted([
                nn for n in gene_phy.get_leaf_names()
                for nn in groups.get(n, [])
            ])
            mat = mat[tips]
    return mat
Beispiel #29
0
    def do_polish(self, reference, reads, reassemble=False, onlySNP=False) :
        '''Polish `reference` with `reads` using Pilon.

        Maps the reads with the configured mapper, keeps only contigs that
        received any read coverage, runs Pilon on them, and applies the
        accepted changes back onto the sequences.

        Parameters:
            reference: path to the FASTA file to polish.
            reads: read sets, passed through to the mapper helpers.
            reassemble: if True, also ask Pilon to fix assembly breaks.
            onlySNP: if True, apply only single-base substitutions.

        Returns the polished FASTA filename ('<prefix>.fasta').

        NOTE(review): relies on module-level globals `parameters`, `prefix`,
        `readFasta` and `logger` defined elsewhere in this file.
        '''
        if parameters.get('SNP', None) is not None :
            # A pre-computed SNP set bypasses the mapping/Pilon workflow entirely.
            return self.do_polish_with_SNPs(reference, parameters['SNP'])

        # 1) Map the reads with whichever mapper was configured.
        if parameters['mapper'] == 'minimap2' :
            bams = self.__run_minimap(prefix, reference, reads )
        elif parameters['mapper'] != 'bwa' :
            bams = self.__run_bowtie(prefix, reference, reads )
        else :
            bams = self.__run_bwa(prefix, reference, reads )

        # 2) Identify contigs that received at least one covered site.
        sites = {}
        for bam in bams :
            if bam is not None :
                depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **parameters).split(), stdout=PIPE, universal_newlines=True)
                for line in depth.stdout :
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0 :
                        sites[part[0]] = 1
                depth.stdout.close()
                depth.wait()  # reap the samtools child; the original leaked zombie processes
        sequence = readFasta(reference)
        sequence = {n:s for n,s in sequence.items() if n in sites}

        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout :
            for n, s in sorted(sequence.items()) :
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[site:(site+100)] for site in xrange(0, len(s), 100)])))

        # 3) Run Pilon; 'all,breaks' additionally re-assembles broken regions.
        bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
        fix_mode = 'all,breaks' if reassemble else 'all'
        pilon_cmd = '{pilon} --fix {fix_mode} --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(fix_mode=fix_mode, bam_opt=bam_opt, **parameters)
        Popen( pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

        if not os.path.isfile('{0}.mapping.vcf'.format(prefix)) :
            # Fallback: a restricted fix set sometimes succeeds where 'all' fails.
            pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
            Popen( pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

        # 4) Collect changes Pilon accepted: PASS, or imputation fraction >= 0.75,
        #    with a pure-nucleotide ALT allele.
        snps = []
        with open('{0}.mapping.vcf'.format(prefix)) as fin, open('{0}.mapping.changes'.format(prefix), 'w') as fout :
            for line in fin :
                if line.startswith('#') : continue
                part = line.strip().split('\t')
                if part[-1] != '0/0':
                    try :
                        if (part[6] == 'PASS' or float(part[7][-4:]) >= 0.75) and re.match(r'^[ACGTN]+$', part[4]):
                            if (not onlySNP) or (len(part[3]) == 1 and len(part[4]) == 1 ) :
                                snps.append( [ part[0], int(part[1])-1, part[3], part[4] ] )
                                fout.write(line)
                    except (ValueError, IndexError) :
                        # Malformed VCF record -- skip it (narrowed from a bare `except`).
                        pass

        os.unlink('{0}.mapping.vcf'.format(prefix))

        # 5) Apply changes from the end of each contig backwards so earlier
        #    coordinates are not shifted by indels already applied.
        sequence = {n: list(s) for n, s in sequence.items()}
        for n, site, ori, alt in reversed(snps) :
            s = sequence[n]
            end = site + len(ori)
            s[site:end] = alt
        logger('Observed and corrected {0} changes using PILON'.format(len(snps)))
        with open('{0}.fasta'.format(prefix), 'w') as fout :
            for n, s in sorted(sequence.items()) :
                s = ''.join(s)
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[site:(site+100)] for site in xrange(0, len(s), 100)])))
        return '{0}.fasta'.format(prefix)
Beispiel #30
0
    def get_quality(self, reference, reads ) :
        '''Map reads back to the assembly and encode per-base consensus quality.

        Runs the configured mapper, derives a robust per-contig depth estimate,
        runs Pilon for a variant assessment, and writes a FASTQ file whose
        quality string reflects how well each base is supported by the reads.

        Parameters:
            reference: path to the assembly FASTA.
            reads: read sets, passed through to the mapper helpers.

        Returns the result filename ('<prefix>.result.fastq').

        NOTE(review): relies on module-level globals `parameters`, `prefix`,
        `readFasta` and `logger` defined elsewhere in this file.
        '''
        if parameters['mapper'] == 'minimap2' :
            bams = self.__run_minimap(prefix, reference, reads, )
        elif parameters['mapper'] != 'bwa' :
            bams = self.__run_bowtie(prefix, reference, reads, )
        else :
            bams = self.__run_bwa(prefix, reference, reads, )

        # Each contig becomes [sequence, quality]; '!' encodes Phred 0 (unsupported).
        sequence = readFasta(reference)
        for n, s in sequence.items() :
            q = ['!'] * len(s)
            sequence[n] = [s, q]

        # Accumulate per-site read depth over all BAM files.
        sites = { n:np.array([0 for ss in s[1] ]) for n, s in sequence.items() }
        for bam in bams :
            if bam is not None :
                depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **parameters).split(), stdout=PIPE, universal_newlines=True)
                for line in depth.stdout :
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0 :
                        sites[part[0]][int(part[1]) - 1] += float(part[2])
                depth.stdout.close()
                depth.wait()  # reap the samtools child; the original leaked zombie processes

        # Per contig: [length, robust depth (max of median and geometric mean), relative depth].
        sites = {n:[s.size, np.max([np.median(s), np.exp(np.mean(np.log(s + 0.5)))-0.5]), 0.] for n, s in sites.items()}
        # Average depth over the largest contigs covering >= half of the assembly (N50-style).
        depth = np.array(list(sites.values()))
        depth = depth[np.argsort(-depth.T[0])]
        size = np.sum(depth.T[0])
        acc = [0, 0]
        for d in depth :
            acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0]*d[1]
            if acc[0] *2 >= size :
                break
        ave_depth = acc[1]/acc[0]
        exp_mut_depth = max(ave_depth * 0.2, 2.)
        for n, s in sites.items() :
            s[2] = s[1]/ave_depth
        logger('Average read depth: {0}'.format(ave_depth))

        # Drop uncovered contigs and re-write the reference for Pilon.
        sequence = {n:s for n, s in sequence.items() if sites[n][1]>0.}
        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout :
            for n, s in sorted(sequence.items()) :
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[0][site:(site+100)] for site in xrange(0, len(s[0]), 100)])))
        bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
        pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
        Popen( pilon_cmd.split(), stdout=PIPE, universal_newlines=True ).communicate()
        if not os.path.isfile('{0}.mapping.vcf'.format(prefix)) :
            # Fallback: a restricted fix set sometimes succeeds where 'all,breaks' fails.
            pilon_cmd = '{pilon} --fix snps,indels,gaps,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
            Popen( pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

        cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
        logger('Contigs with less than {0} depth will be removed from the assembly'.format(cont_depth[0]*ave_depth))
        logger('Contigs with more than {0} depth will be treated as duplicates'.format(cont_depth[1]*ave_depth))
        indels = []
        with open('{0}.mapping.vcf'.format(prefix)) as fin, open('{0}.mapping.difference'.format(prefix), 'w') as fout :
            for line in fin :
                if line.startswith('#') : continue
                part = line.strip().split('\t')
                if sites[part[0]][2] < cont_depth[0] or sites[part[0]][2] >= cont_depth[1] :
                    continue
                if part[-1] == '1/1':
                    # BUGFIX: `site` was read here before assignment -- it was only
                    # bound in the '0/0' branch below, so the first homozygous indel
                    # raised NameError (or silently reused a stale coordinate from a
                    # previous iteration). Derive the 1-based POS from this record.
                    site = int(part[1])
                    if len(part[3]) > 1 :
                        indels.append([part[0], max(0, site-1), site-1+len(part[3])+2])
                    elif len(part[4]) > 1 and part[4] != '<DUP>' :
                        indels.append([part[0], max(0, site-2), site-1+len(part[3])+2])

                try:
                    if part[-1] == '0/0' and len(part[3]) == 1 and len(part[4]) == 1 :
                        pp = part[7].split(';')
                        dp = float(pp[0][3:])
                        af = 100 - sorted([float(af) for af in pp[6][3:].split(',')])[-1]
                        if af <= 20 and dp >= 2 and dp * af/100. <= exp_mut_depth and (part[6] == 'PASS' or (part[6] == 'LowCov' and parameters['metagenome'])) :
                            site = int(part[1])-1
                            qual = chr(int(pp[4][3:])+33)
                            sequence[part[0]][1][site] = qual
                        else :
                            fout.write(line)
                    else :
                        fout.write(line)
                except Exception :
                    # Malformed record: keep it in the difference report
                    # (narrowed from a bare `except`).
                    fout.write(line)
        # Mask qualities around accepted indels.
        for n, s, e in indels :
            sequence[n][1][s:e] = ['!'] * len(sequence[n][1][s:e])

        # Upgrade quality around externally-provided variants, if any.
        if self.snps is not None :
            for n, snvs in self.snps.items() :
                if n not in sequence : continue  # contig may have been dropped above
                for site, snv in snvs :
                    if snv.find('N') >= 0 : continue
                    if snv.startswith('+') :
                        s, e = site-4, site+3+len(snv)
                    else :
                        s, e = site-4, site+4
                    # Clamp to the contig so a site near an end cannot wrap around
                    # via a negative index or run past the end of the sequence.
                    for k in xrange(max(s, 0), min(e, len(sequence[n][1]))) :
                        sequence[n][1][k] = max(chr(40+33), sequence[n][1][k])

        with open('{0}.result.fastq'.format(prefix), 'w') as fout :
            p = prefix.rsplit('/', 1)[-1]
            for n, (s, q) in sequence.items() :
                if sites[n][2] >= cont_depth[0] :
                    fout.write( '@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format( p+'_'+n, s, ''.join(q), *sites[n] ) )
        os.unlink( '{0}.mapping.vcf'.format(prefix) )
        logger('Final result is written into {0}'.format('{0}.result.fastq'.format(prefix)))
        return '{0}.result.fastq'.format(prefix)