Example #1
def MLSTdb(args):
    params = getParams(args)
    database, refset, alleleFasta, refstrain, max_iden, min_iden, coverage, paralog, relaxEnd = (
        params['database'], params['refset'], params['alleleFasta'],
        params['refstrain'], params['max_iden'], params['min_iden'],
        params['coverage'], params['paralog'], params['relaxEnd'])
    if os.path.isfile(alleleFasta):
        alleles = readFasta(uopen(alleleFasta))
    else:
        alleles = readFasta(StringIO(alleleFasta))
    alleles = [allele for allele in alleles
               if allele['value_id'].isdigit() and int(allele['value_id']) > 0
               and allele['fieldname'].find('/') < 0]
    allele_text, refAlleles = '', ''  # initialise both; they are returned even when refset is None
    if refset is not None:
        if refstrain:
            if os.path.isfile(refstrain):
                references = readFasta(uopen(refstrain))
            else:
                references = readFasta(StringIO(refstrain))
        else:
            loci, references = {}, []
            for allele in alleles:
                if allele['fieldname'] not in loci:
                    loci[allele['fieldname']] = 1
                    references.append(allele)

        allele_text, refAlleles = buildReference(alleles, references, max_iden,
                                                 min_iden, coverage, paralog,
                                                 relaxEnd)
        if refset:
            with open(str(refset), 'w') as fout:
                fout.write(refAlleles + '\n')
            logger('A file of reference alleles has been generated:  {0}'.format(
                refset))
    if database:
        conversion = [[], []]
        for allele in alleles:
            conversion[0].append(get_md5(allele['value']))
            conversion[1].append([allele['fieldname'], int(allele['value_id'])])
        # The file itself is written by to_csv below; the snippet as shown also
        # opened `database` for writing first (redundant) and had these lines
        # dedented outside the `if database:` block, which would raise a
        # NameError whenever `database` was unset.
        conversion = pd.DataFrame(conversion[1], index=conversion[0])
        conversion.to_csv(database, header=False)
        logger('A lookup table of all alleles has been generated:  {0}'.format(
            database))
    return allele_text, refAlleles
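
Every example on this page leans on module-level helpers that are not shown, most importantly `readFasta` (plus `uopen` in Example #1 for transparently reading possibly gzip-compressed input). The signature clearly varies across the project (Example #1 receives allele records with 'fieldname'/'value_id'/'value' keys; Example #12 passes filename=/qual= keywords), but for the common {name: sequence} usage a minimal sketch might look like the following. The names and behaviour here are assumptions, not the project's actual code:

import gzip

def uopen(fname):
    # Hypothetical helper: open plain or gzip-compressed text transparently.
    return gzip.open(fname, 'rt') if fname.endswith('.gz') else open(fname)

def readFasta(fin):
    # Hypothetical minimal FASTA reader returning {sequence_name: sequence}.
    # Accepts either a path or an already-open file object, since the
    # examples call it both ways.
    if isinstance(fin, str):
        fin = uopen(fin)
    seqs, name = {}, None
    for line in fin:
        line = line.strip()
        if line.startswith('>'):
            name = line[1:].split()[0]
            seqs[name] = []
        elif name is not None:
            seqs[name].append(line)
    return {n: ''.join(s) for n, s in seqs.items()}
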
Example #2
    def get_allele_info(allele_file):
        if os.path.isfile(allele_file + '.stat'):
            with open(allele_file + '.stat') as fin:
                return json.load(fin)
        alleles = readFasta(allele_file)
        allele_aa = transeq(alleles)
        allele_stat = {}
        for n, s in alleles.items():
            locus, allele_id = n.rsplit('_', 1)
            if locus not in allele_stat:
                allele_stat[locus] = {}

            if len(s) % 3 > 0:
                pseudo = 2  # frameshift
            else:
                aa = allele_aa.get(n + '_1', 'A')
                if aa[:-1].find('X') >= 0:
                    pseudo = 3  # premature
                elif s[:3] not in ('ATG', 'GTG', 'TTG'):
                    pseudo = 4  # no start
                elif aa[-1] != 'X':
                    pseudo = 5  # no stop
                else:
                    pseudo = 6  # intact
            allele_stat[locus][
                allele_id] = int(allele_id) * 1000000 + len(s) * 10 + pseudo
        with open(allele_file + '.stat', 'w') as fout:
            json.dump(allele_stat, fout)
        return allele_stat
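
The value stored per allele packs three fields into a single integer, allele_id * 1000000 + len(s) * 10 + pseudo, so callers can sort numerically and still recover each field. A small decoding sketch (the encoding implicitly caps sequence lengths at 99,999 bp):

def unpack_allele_stat(v):
    # Inverse of allele_id * 1000000 + len(s) * 10 + pseudo.
    pseudo = v % 10            # 2 frameshift, 3 premature, 4 no start, 5 no stop, 6 intact
    length = (v % 1000000) // 10
    allele_id = v // 1000000
    return allele_id, length, pseudo

# allele 7, 1302 bp, intact CDS:
assert unpack_allele_stat(7 * 1000000 + 1302 * 10 + 6) == (7, 1302, 6)
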
Example #3
    def do_polish_with_SNPs(self, reference, snp_file) :
        sequence = readFasta(reference)
        snps = { n:[] for n in sequence }
        if snp_file != '' :
            with open(snp_file) as fin :
                for line in fin :
                    part = line.strip().split()
                    snps[part[0]].append([int(part[1]), part[-1]])
            self.snps = snps

        for n, s in sequence.items() :
            sequence[n] = list(s)

        for cont, sites in snps.items() :
            for site,base in reversed(sites) :
                if base.startswith('+') :
                    sequence[cont][site-1:site-1] = base[1:]
                elif base.startswith('-') :
                    sequence[cont][site-1:(site+len(base)-2)] = []
                else :
                    sequence[cont][site-1] = base

        with open('{0}.fasta'.format(prefix), 'w') as fout :
            for n, s in sorted(sequence.items()) :
                s = ''.join(s)
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[site:(site+100)] for site in range(0, len(s), 100)])))
        return '{0}.fasta'.format(prefix)
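
The `snp_file` format is implied by the parser above: whitespace-separated columns, the first a contig name, the second a 1-based position, and the last the edit to apply (a single substitute base, `+SEQ` for an insertion before that position, or `-SEQ` naming the bases to delete). A fabricated example (`prefix` in the snippet is a module-level global of the surrounding project):

contig_1	1042	T
contig_1	2310	+GA
contig_2	77	-ACT

The first line substitutes position 1042 with T, the second inserts GA before position 2310, and the third deletes the three bases starting at position 77.
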
Example #4
def addGenes(genes, gene_file):
    for gfile in gene_file.split(','):
        if gfile == '': continue
        gprefix = gfile.split('.')[0]
        ng = readFasta(gfile)
        for name in ng:
            s = ng[name]
            if checkCDS(name, s):
                genes['{0}:{1}'.format(gprefix, name)] = [
                    gfile, '', 0, 0, '+',
                    int(hashlib.sha1(s.encode('utf-8')).hexdigest(), 16), s
                ]
    return genes
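
`checkCDS` is another helper not shown on this page. Judging from the pseudo-gene tests in Examples #2 and #5, it plausibly verifies that a sequence is a complete coding frame; a hedged sketch only, not the project's actual implementation:

def checkCDS(name, s):
    # Assumed behaviour: in-frame length, a recognised start codon,
    # a stop at the end, and no internal stop codon.
    s = s.upper()
    if len(s) % 3 != 0 or len(s) < 6:
        return False
    if s[:3] not in ('ATG', 'GTG', 'TTG'):
        return False
    stops = {'TAA', 'TAG', 'TGA'}
    internal = (s[i:i+3] for i in range(3, len(s) - 3, 3))
    return s[-3:] in stops and not any(c in stops for c in internal)
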
Example #5
def cgMLST(allele_profile, allele_file):
    def get_allele_info(alleles):
        allele_aa = transeq(alleles)
        allele_stat = {}
        for n, s in alleles.items():  # .iteritems() is Python 2 only
            locus, allele_id = n.rsplit('_', 1)
            if locus not in allele_stat:
                allele_stat[locus] = {}

            if len(s) % 3 > 0:
                pseudo = 2  # frameshift
            else:
                aa = allele_aa.get(n + '_1', 'A')
                if aa[:-1].find('X') >= 0:
                    pseudo = 3  # premature
                elif s[:3] not in ('ATG', 'GTG', 'TTG'):
                    pseudo = 4  # no start
                elif aa[-1] != 'X':
                    pseudo = 5  # no stop
                else:
                    pseudo = 6  # intact
            allele_stat[locus][allele_id] = [len(s), pseudo]
        return allele_stat

    matrix = pd.read_csv(allele_profile, sep='\t', header=None,
                         dtype=str).values  # .as_matrix() was removed in pandas 1.0
    loci = np.array([not m.startswith('#') for m in matrix[0]])
    data = matrix[1:, loci]
    data[np.in1d(data, ['-', 'n', 'N']).reshape(data.shape)] = '0'

    data = data.astype(int)
    data[data < 0] = 0
    loci = matrix[0][loci]
    genomes = matrix[1:, 0]

    allele_stat = get_allele_info(readFasta(allele_file))

    genome_stat = {genome: [0 for l in loci] for genome in genomes}
    locus_stat = [[
        locus,
        len(allele_stat[locus]),
        np.mean([v[0] for v in allele_stat[locus].values()]),
        np.min([v[0] for v in allele_stat[locus].values()]),
        np.max([v[0] for v in allele_stat[locus].values()])
    ] for locus in loci]
    for g, d in zip(genomes, data):
        for i, dd in enumerate(d):
            genome_stat[g][i] = dd * 10 + allele_stat.get(loci[i], {}).get(
                str(dd), [0, 0])[-1]
    return genome_stat, locus_stat
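
Each cell of `genome_stat` again packs two fields, allele_id * 10 + pseudo (0 for a missing locus), symmetric to the encoding in Example #2:

def unpack_call(cell):
    # genome_stat cells encode allele_id * 10 + pseudo; 0 marks an absent locus.
    return cell // 10, cell % 10

assert unpack_call(76) == (7, 6)   # allele 7, intact CDS
assert unpack_call(0) == (0, 0)    # locus missing
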
Example #6
def loadBam(prefix, reference, bams, sequences, snps):
    sites = []
    p = subprocess.Popen('samtools mpileup -ABQ0 {0}'.format(
        ' '.join(bams)).split(),
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    for line in p.stdout:
        part = line.strip().split('\t')
        s = int(part[1]) - 1
        if s % 100000 == 0:
            sys.stdout.write('# {0}\n'.format(s))
        if sequences[part[0]][s] > 0 or s % 5 == 0:
            bases = ''.join(part[4::3])
            bases = re.sub(r'[\*\$\+-]', '', re.sub(r'\^.', '', bases.upper()))
            bases = re.split(r'(\d+)', bases)
            for i in range(1, len(bases), 2):
                bases[i + 1] = bases[i + 1][int(bases[i]):]
            types, cnts = np.unique(list(''.join(bases[::2])),
                                    return_counts=True)
            if np.sum(cnts) >= 3:
                if types.size > 1:
                    cnts.sort()
                    sites.append([cnts[-1], np.sum(cnts[:-1])])
                else:
                    sites.append([cnts[0], 0])
    sites = np.array(sites)
    ave_depth = np.max([np.median(np.sum(sites, 1)), 2.])
    sys.stdout.write(
        '{3}: Average read depth: {0}; Sites between {1} and {2} will be used for hybrid estimation.\n'
        .format(ave_depth, ave_depth / 2., ave_depth * 3., prefix))
    sites = sites[(ave_depth / 2. <= np.sum(sites, 1))
                  & (np.sum(sites, 1) <= ave_depth * 3)]

    m = GaussianMixture(n_components=1, covariance_type='tied')
    m.fit(sites)
    best_model = [m.bic(sites), m]
    for n_components in range(2, 6):
        sys.stdout.write('# Testing {0} components.\n'.format(n_components))
        m = GaussianMixture(n_components=n_components, covariance_type='tied')
        for i in range(20):
            m.fit(sites)
            bic = m.bic(sites)
            if bic < best_model[0]:
                best_model = [bic, m]
                m = GaussianMixture(n_components=n_components,
                                    covariance_type='tied')
    m = best_model[1]
    mId = np.argmax(m.means_.T[1] / np.sum(m.means_, 1))
    sys.stdout.write(
        '{3}: Find {0} GMM components. The most divergent group is {1} and counts for {2} of total sites.\n'
        .format(m.n_components, m.means_[mId].tolist(), m.weights_[mId],
                prefix))
    mDiv = m.means_[mId][0] / np.sum(m.means_[mId])
    mDiv = 10 * np.log10([[mDiv, 1 - mDiv], [1 - mDiv, mDiv]])

    seq = {n: list(s) for n, s in readFasta(reference).items()}
    qual = {n: [0] * len(s) for n, s in seq.items()}

    lowQ, lowC, highQ = 0, 0, 0
    p = subprocess.Popen('samtools mpileup -ABQ0 {0}'.format(
        ' '.join(bams)).split(),
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    for line in p.stdout:
        part = line.strip().split('\t')
        s = int(part[1]) - 1
        if s % 100000 == 0:
            sys.stdout.write('# {0}\n'.format(s))
        bases = ''.join(part[4::3])
        bases = re.sub(r'[\*\$\+-]', '', re.sub(r'\^.', '', bases.upper()))
        bases = re.split(r'(\d+)', bases)
        for i in range(1, len(bases), 2):
            bases[i + 1] = bases[i + 1][int(bases[i]):]
        types, cnts = np.unique(list(''.join(bases[::2])), return_counts=True)
        if types.size > 0:
            depth = np.sum(cnts)
            if cnts.size == 1:
                g, mId = [cnts[0], 0], 0
            elif cnts.size > 1:
                mId = np.argmax(cnts)
                g = [cnts[mId], depth - cnts[mId]]
            seq[part[0]][s] = types[mId]
            if depth >= 3 and depth / 3. <= ave_depth <= depth * 3.:
                q = min(
                    40,
                    max(
                        1,
                        int(round(
                            np.sum(g * mDiv[0]) - np.sum(g * mDiv[1]), 0))))
                qual[part[0]][s] = q
                if q < 10:
                    lowQ += 1
                else:
                    highQ += 1
            else:
                lowC += 1
    qual = {n: ''.join([chr(ss + 33) for ss in s]) for n, s in qual.items()}
    with open(prefix + '.fastq', 'w') as fout:
        for n, s in seq.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(n, ''.join(s), qual[n]))
    sys.stdout.write(
        '{0}: {1} good sites; {2} low covered sites; {3} low quality sites;\n'.
        format(prefix, highQ, lowC, lowQ))

    return
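
The pileup base-column cleanup above is easy to misread. Here is the same regex pipeline applied to a fabricated `samtools mpileup` bases string, showing how mapping-quality markers, read ends and indel records are stripped before counting:

import re
import numpy as np

bases = 'aA^].,+2ACgg$t*'   # fabricated pileup bases column
bases = re.sub(r'[\*\$\+-]', '', re.sub(r'\^.', '', bases.upper()))
# '^.' removes read-start markers together with their mapping quality;
# '[*$+-]' removes deletion stars, read-end markers and the indel
# prefixes, leaving only the indel length digits behind.
bases = re.split(r'(\d+)', bases)          # ['AA.,', '2', 'ACGGT']
for i in range(1, len(bases), 2):
    bases[i + 1] = bases[i + 1][int(bases[i]):]   # skip the inserted bases
types, cnts = np.unique(list(''.join(bases[::2])), return_counts=True)
print({str(t): int(c) for t, c in zip(types, cnts)})
# {',': 1, '.': 1, 'A': 2, 'G': 2, 'T': 1}
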
Example #7
def filt_genes(prefix, groups, global_file, conflicts, first_classes=None):
    outPos = np.ones(16, dtype=bool)
    outPos[[3, 4, 5, 10, 15]] = False

    c2 = {c: {} for c in np.unique(conflicts.T[:2])}
    for c in conflicts:
        c2[c[0]][c[1]] = c2[c[1]][c[0]] = c[2]
    conflicts = c2

    clust_ref = readFasta(params['clust'])
    for gene, g in groups.items():
        g.T[2] *= g.T[3]
        g[:] = g[np.argsort(-g.T[2], kind='mergesort')]
    used, results, run = {}, {}, {}
    group_id = 0
    with open('{0}.Prediction'.format(prefix), 'w') as fout:
        while len(groups) > 0:
            genes = get_gene(groups, first_classes, cnt=50)
            if len(genes) <= 0:
                continue
            to_run, to_run_id = [], []
            min_score, min_rank = genes[-1][1], genes[0][2]
            genes = {gene: score for gene, score, _ in genes}
            if params['orthology'] in ('ml', 'nj'):
                for gene, score in genes.items():
                    if gene not in run:
                        mat = groups[gene]
                        _, bestPerGenome, matInGenome = np.unique(
                            mat.T[1], return_index=True, return_inverse=True)
                        region_score = mat.T[2] / mat[
                            bestPerGenome[matInGenome], 2]
                        if region_score.size >= bestPerGenome.size * 2:
                            used2, kept = set([]), np.ones(mat.shape[0],
                                                           dtype=bool)
                            for id, m in enumerate(mat):
                                if m[5] in used2:
                                    kept[id] = False
                                else:
                                    used2.update(conflicts.get(m[5], {}))
                            mat = mat[kept]
                            _, bestPerGenome, matInGenome = np.unique(
                                mat.T[1],
                                return_index=True,
                                return_inverse=True)
                            region_score = mat.T[2] / mat[
                                bestPerGenome[matInGenome], 2]
                        if region_score.size > bestPerGenome.size * 3 and len(
                                region_score) > 500:
                            region_score2 = sorted(region_score, reverse=True)
                            cut = region_score2[bestPerGenome.size * 3 - 1]
                            if cut >= params['clust_identity']:
                                cut = min(
                                    region_score2[bestPerGenome.size * 5] if
                                    len(region_score) > bestPerGenome.size * 5
                                    else params['clust_identity'], 1.0 - 0.6 *
                                    (1.0 - params['clust_identity']))
                            mat = mat[region_score >= cut]

                        to_run.append([mat, clust_ref[mat[0][0]], global_file])
                        to_run_id.append(gene)
                working_groups = pool.map(filt_per_group, to_run)
                #working_groups = [filt_per_group(d) for d in to_run]
                for gene, working_group in zip(to_run_id, working_groups):
                    groups[gene] = working_group
                    run[gene] = 1
            else:
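                # NOTE: `mat` is unbound at this point in the snippet as shown;
                # the original presumably binds it per candidate gene
                # (e.g. mat = groups[gene]) in a loop like the 'ml'/'nj' branch above.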
                _, bestPerGenome, matInGenome = np.unique(mat.T[1],
                                                          return_index=True,
                                                          return_inverse=True)
                region_score = mat.T[2] / mat[bestPerGenome[matInGenome], 2]
                mat[:] = mat[region_score >= params['clust_identity']]
                used2, kept = set([]), np.ones(mat.shape[0], dtype=bool)
                for id, m in enumerate(mat):
                    for mmm in m[6]:
                        if mmm[15] in used2:
                            kept[id] = False
                            break
                    if kept[id]:
                        used2 |= {mmm[15] for mmm in m[6]}
                mat = mat[kept]
                _, bestPerGenome, matInGenome = np.unique(mat.T[1],
                                                          return_index=True,
                                                          return_inverse=True)

            while len(genes):
                score, gene = max([[
                    np.sum(groups[gene][np.unique(groups[gene].T[1],
                                                  return_index=True)[1]].T[2]),
                    gene
                ] for gene in genes])
                if score < min_score:
                    break
                mat = groups.pop(gene, [])
                genes.pop(gene)

                paralog, paralog2 = 0, 0
                supergroup = {}
                used2 = {}
                for m in mat:
                    gid = m[5]
                    conflict = used.get(gid, None)
                    if conflict is not None:
                        if not isinstance(conflict, int):
                            superC = results[conflict]
                            supergroup[superC] = supergroup.get(superC, 0) + 1
                        elif conflict > 0:
                            if m[6].shape[0] <= 1 and m[3] >= params[
                                    'clust_identity']:
                                paralog = 1
                                break
                            else:
                                paralog2 += 1
                        m[3] = -1
                    else:
                        for g2, gs in conflicts.get(gid, {}).items():
                            if gs == 1:
                                if g2 not in used:
                                    used2[g2] = m[0]
                            elif gs == 2:
                                used2[g2] = 1
                            else:
                                used[g2] = 0
                if paralog or paralog2 * 3 >= mat.shape[0]:
                    continue
                else:
                    used.update(used2)

                pangene = mat[0][0]
                if len(supergroup):
                    pg, pid = max(supergroup.items(), key=itemgetter(1))
                    if pid * 3 >= mat.shape[0] or (pid * 5 >= mat.shape[0]
                                                   and pid > 1):
                        pangene = pg

                results[mat[0][0]] = pangene
                logger(
                    '{4} / {5}: pan gene "{3}" : "{0}" picked from rank {1} and score {2}'
                    .format(mat[0][0], min_rank, score, pangene, len(results),
                            len(groups) + len(results)))

                for grp in mat[mat.T[3] > 0]:
                    group_id += 1
                    for g in grp[6]:
                        fout.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            pangene, min_rank, group_id, grp[1],
                            '\t'.join(g[outPos].astype(str).tolist())))
    return '{0}.Prediction'.format(prefix)
Example #8
def ortho(args):
    global params
    params.update(add_args(args).__dict__)
    params.update(externals)

    global pool
    pool = Pool(params['n_thread'])
    genomes, genes = readGFF(params['GFFs'])
    genes = addGenes(genes, params['genes'])
    if params.get('old_prediction', None) is None:
        params['old_prediction'] = params['prefix'] + '.old_prediction.npz'
        old_predictions = {}
        for n, g in genes.items():
            if g[1] != '':
                if g[1] not in old_predictions:
                    old_predictions[g[1]] = []
                old_predictions[g[1]].append([n, g[2], g[3], g[4]])
        for gene, g in old_predictions.items():
            old_predictions[gene] = np.array(sorted(g), dtype=object)
        np.savez_compressed(params['old_prediction'], **old_predictions)
        del old_predictions, n, g

    genomes, genes, encodes = encodeNames(genomes, genes)
    if params.get('prediction', None) is None:
        first_classes = load_priority(params.get('priority', ''), genes,
                                      encodes)

        if params.get('clust', None) is None:
            params['genes'] = writeGenes('{0}.genes'.format(params['prefix']),
                                         genes, first_classes)
            del genes
            params['clust'], params['uc'] = getClust(
                params['prefix'], params['genes'],
                dict(identity=params['clust_identity'],
                     coverage=params['clust_match_prop'],
                     n_thread=params['n_thread']))
        genes = readFasta(params['clust'])

        if params.get('self_bsn', None) is None:
            params['self_bsn'] = params['prefix'] + '.self_bsn.npy'
            orthoGroup = get_similar_pairs(params['prefix'], params['clust'],
                                           first_classes, params)
            np.save(params['self_bsn'], orthoGroup)
        else:
            orthoGroup = np.load(params['self_bsn'])
        orthoGroup = dict([[tuple(g), 1] for g in orthoGroup] +
                          [[(g[1], g[0]), 1]
                           for g in orthoGroup] + [[(g, g), 0] for g in genes])

        if params.get('map_bsn', None) is None or params.get(
                'conflicts', None) is None:
            blastab, conflicts = get_map_bsn(params['prefix'], params['clust'],
                                             genomes, orthoGroup)
            blastab = np.split(
                blastab,
                np.cumsum(np.unique(blastab.T[0], return_counts=True)[1])[:-1])

            params['map_bsn'], params['conflicts'] = params[
                'prefix'] + '.map_bsn.npz', params['prefix'] + '.conflicts.npz'
            np.savez_compressed(params['map_bsn'],
                                **{str(b[0, 0]): b
                                   for b in blastab})
            np.savez_compressed(params['conflicts'], conflicts=conflicts)
            del blastab, conflicts

        if params.get('global', None) is None:
            params['global'] = params['prefix'] + '.global.npy'
            global_differences = global_difference(params['map_bsn'],
                                                   orthoGroup, 3000)
            np.save(params['global'], global_differences)
            del global_differences

        blastab = precluster(params['map_bsn'], params['global'])
        #np.savez_compressed(params['map_bsn'], **blastab)
        params['prediction'] = filt_genes(
            params['prefix'], blastab, params['global'],
            np.load(params['conflicts'])['conflicts'], first_classes)
    else:
        genes = {n: s[-1] for n, s in genes.items()}
    pool.close()
    old_predictions = dict(
        np.load(params['old_prediction'], allow_pickle=True)
    ) if 'old_prediction' in params else {}  # object arrays need allow_pickle on modern numpy

    write_output(params['prefix'], params['prediction'], genomes, genes,
                 old_predictions)
Example #9
    def get_quality(self, reference, reads ) :
        if parameters['mapper'] == 'minimap2' :
            bams = self.__run_minimap(prefix, reference, reads, )
        elif parameters['mapper'] != 'bwa' :
            bams = self.__run_bowtie(prefix, reference, reads, )
        else :
            bams = self.__run_bwa(prefix, reference, reads, )
        
        sequence = readFasta(reference)
        for n, s in sequence.items() :
            q = ['!'] * len(s)
            sequence[n] = [s, q]

        sites = { n:np.array([0 for ss in s[1] ]) for n, s in sequence.items() }
        for bam in bams :
            if bam is not None :
                depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **parameters).split(), stdout=PIPE, universal_newlines=True)
                for line in depth.stdout :
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0 :
                        sites[part[0]][int(part[1]) - 1] += float(part[2])
        sites = {n:[s.size, np.max([np.median(s), np.exp(np.mean(np.log(s + 0.5)))-0.5]), 0.] for n, s in sites.items()}
        depth = np.array(list(sites.values()))
        depth = depth[np.argsort(-depth.T[0])]
        size = np.sum(depth.T[0])
        acc = [0, 0]
        for d in depth :
            acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0]*d[1]
            if acc[0] *2 >= size :
                break
        ave_depth = acc[1]/acc[0]
        exp_mut_depth = max(ave_depth * 0.2, 2.)
        for n, s in sites.items() :
            s[2] = s[1]/ave_depth
        logger('Average read depth: {0}'.format(ave_depth))
        sequence = {n:s for n, s in sequence.items() if sites[n][1]>0.}
        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout :
            for n, s in sorted(sequence.items()) :
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[0][site:(site+100)] for site in range(0, len(s[0]), 100)])))
        bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
        pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
        Popen( pilon_cmd.split(), stdout=PIPE, universal_newlines=True ).communicate()
        if not os.path.isfile('{0}.mapping.vcf'.format(prefix)) :
            pilon_cmd = '{pilon} --fix snps,indels,gaps,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
            Popen( pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()                    

        cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
        logger('Contigs with less than {0} depth will be removed from the assembly'.format(cont_depth[0]*ave_depth))
        logger('Contigs with more than {0} depth will be treated as duplicates'.format(cont_depth[1]*ave_depth))
        indels = []
        with open('{0}.mapping.vcf'.format(prefix)) as fin, open('{0}.mapping.difference'.format(prefix), 'w') as fout :
            for line in fin :
                if line.startswith('#') : continue
                part = line.strip().split('\t')
                if sites[part[0]][2] < cont_depth[0] or sites[part[0]][2] >= cont_depth[1] :
                    continue
                if part[-1] == '1/1':
                    site = int(part[1])  # NOTE: the snippet referenced `site` here before assigning it
                    if len(part[3]) > 1 :
                        indels.append([part[0], max(0, site-1), site-1+len(part[3])+2])
                    elif len(part[4]) > 1 and part[4] != '<DUP>' :
                        indels.append([part[0], max(0, site-2), site-1+len(part[3])+2])

                try:
                    if part[-1] == '0/0' and len(part[3]) == 1 and len(part[4]) == 1 :
                        pp = part[7].split(';')
                        dp = float(pp[0][3:])
                        af = 100 - sorted([float(af) for af in pp[6][3:].split(',')])[-1]
                        if af <= 20 and dp >= 2 and dp * af/100. <= exp_mut_depth and (part[6] == 'PASS' or (part[6] == 'LowCov' and parameters['metagenome'])) :
                            site = int(part[1])-1
                            qual = chr(int(pp[4][3:])+33)
                            sequence[part[0]][1][site] = qual
                        else :
                            fout.write(line)
                    else :
                        fout.write(line)
                except Exception :
                    fout.write(line)
        for n, s, e in indels :
            sequence[n][1][s:e] = ['!'] * len(sequence[n][1][s:e])
            
        if self.snps is not None :
            for n, snvs in self.snps.items() :
                for site, snv in snvs :
                    if snv.find('N') >= 0 : continue
                    if snv.startswith('+') :
                        s, e = site-4, site+3+len(snv)
                    else :
                        s, e = site-4, site+4
                    for k in range(s, e) :
                        sequence[n][1][k] = max(chr(40+33), sequence[n][1][k])

        with open('{0}.result.fastq'.format(prefix), 'w') as fout :
            p = prefix.rsplit('/', 1)[-1]
            for n, (s, q) in sequence.items() :
                if sites[n][2] >= cont_depth[0] :
                    fout.write( '@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format( p+'_'+n, s, ''.join(q), *sites[n] ) )
        os.unlink( '{0}.mapping.vcf'.format(prefix) )
        logger('Final result is written into {0}'.format('{0}.result.fastq'.format(prefix)))
        return '{0}.result.fastq'.format(prefix)
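
Examples #6, #9, #11 and #12 all write FASTQ whose quality strings are built with chr(q + 33), i.e. standard Phred+33 encoding. A small round-trip sketch:

def phred_encode(quals):
    # Phred+33 ASCII encoding, as used for the .fastq outputs on this page.
    return ''.join(chr(q + 33) for q in quals)

def phred_decode(qstring):
    return [ord(c) - 33 for c in qstring]

assert phred_encode([0, 10, 40]) == '!+I'
assert phred_decode('!+I') == [0, 10, 40]
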
Example #10
    def do_polish(self, reference, reads, reassemble=False, onlySNP=False) :
        if parameters.get('SNP', None) is not None :
            return self.do_polish_with_SNPs(reference, parameters['SNP'])
        else :
            if parameters['mapper'] == 'minimap2' :
                bams = self.__run_minimap(prefix, reference, reads )
            elif parameters['mapper'] != 'bwa' :
                bams = self.__run_bowtie(prefix, reference, reads )
            else :
                bams = self.__run_bwa(prefix, reference, reads )
            sites = {}
            for bam in bams :
                if bam is not None :
                    depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **parameters).split(), stdout=PIPE, universal_newlines=True)
                    for line in depth.stdout :
                        part = line.strip().split()
                        if len(part) > 2 and float(part[2]) > 0 :
                            sites[part[0]] = 1
            sequence = readFasta(reference)
            sequence = {n:s for n,s in sequence.items() if n in sites}

            with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout :
                for n, s in sorted(sequence.items()) :
                    fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[site:(site+100)] for site in range(0, len(s), 100)])))

            bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
            if reassemble :
                pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
                Popen( pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()
            else :
                pilon_cmd = '{pilon} --fix all --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
                Popen( pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()
            
            if not os.path.isfile('{0}.mapping.vcf'.format(prefix)) :
                pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
                Popen( pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()                    
            
            snps = []
            with open('{0}.mapping.vcf'.format(prefix)) as fin, open('{0}.mapping.changes'.format(prefix), 'w') as fout :
                for line in fin :
                    if line.startswith('#') : continue
                    part = line.strip().split('\t')
                    if part[-1] != '0/0':
                        try :
                            if (part[6] == 'PASS' or float(part[7][-4:]) >= 0.75) and re.match(r'^[ACGTN]+$', part[4]):
                                if (not onlySNP) or (len(part[3]) == 1 and len(part[4]) == 1 ) :
                                    snps.append( [ part[0], int(part[1])-1, part[3], part[4] ] )
                                    fout.write(line)
                        except Exception :
                            pass

            os.unlink('{0}.mapping.vcf'.format(prefix))
            for n in sequence.keys() :
                sequence[n] = list(sequence[n])
            for n, site, ori, alt in reversed(snps) :
                s = sequence[n]
                end = site + len(ori)
                s[site:end] = alt
            logger('Observed and corrected {0} changes using PILON'.format(len(snps)))
            with open('{0}.fasta'.format(prefix), 'w') as fout :
                for n, s in sorted(sequence.items()) :
                    s = ''.join(s)
                    fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[site:(site+100)] for site in range(0, len(s), 100)])))
            return '{0}.fasta'.format(prefix)
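
Both polishing functions apply their edits in reverse coordinate order (reversed(snps)) so that earlier insertions and deletions do not shift the positions of edits still to come. A toy demonstration with fabricated 0-based (contig, site, ref, alt) records:

seq = list('ACGTACGT')
snps = [['chr', 1, 'C', 'CTT'], ['chr', 5, 'C', '']]
for n, site, ori, alt in reversed(snps):
    end = site + len(ori)
    seq[site:end] = alt          # later edits applied first keep earlier sites valid
assert ''.join(seq) == 'ACTTGTAGT'
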
Example #11
def loadBam(prefix, reference, bams, sequences, snps):
    sequence = readFasta(reference)
    sequence = {n: [s, [0] * len(s)] for n, s in sequence.items()}

    sites = {}
    for bam in bams:
        if bam is not None:
            depth = subprocess.Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **externals).split(),
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True)
            try:
                # header=None: `samtools depth` output has no header row, so the
                # first site must not be consumed as column names.
                d = pd.read_csv(depth.stdout, sep='\t', header=None).values
                sites.update({cName: 1 for cName in np.unique(d.T[0])})
            except Exception:
                pass

    sequence = {n: s for n, s in sequence.items() if n in sites}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([
                    s[0][site:(site + 100)]
                    for site in range(0, len(s[0]), 100)
                ])))

    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        prefix=prefix, bam_opt=bam_opt, **externals)
    subprocess.Popen(pilon_cmd.split(),
                     stdout=subprocess.PIPE,
                     universal_newlines=True).communicate()

    uncertains = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if sequences[part[0]][int(part[1]) - 1] >= 0:
                if len(part[3]) == 1 and len(part[4]) == 1:
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])
                    if dp >= 3:
                        qd = int(pp[4][3:])
                        if part[-1] == '0/1' or qd < 10:
                            bcs = sorted(
                                [float(bc) for bc in pp[5][3:].split(',')])
                            uncertains.append([bcs[-1], np.sum(bcs[:-1])])
    uncertains = np.array(uncertains)
    p = np.sum(uncertains.T[0]) / np.sum(uncertains)
    qPerRead = 10 * (np.log10(p) - np.log10(1 - p))
    for n in sequence:
        sequence[n][0] = list(sequence[n][0])

    highQ, lowQ, lowC = 0, 0, 0
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if len(part[3]) == 1 and len(part[4]) == 1:
                s = int(part[1]) - 1
                pp = part[7].split(';')
                dp = float(pp[0][3:])
                qd = int(pp[4][3:])
                if part[-1] == '0/1' or qd < 10:
                    bcs = np.array([int(bc) for bc in pp[5][3:].split(',')])
                    if np.sum(bcs) > 0:
                        sequence[part[0]][0][s] = ['A', 'C', 'G',
                                                   'T'][np.argmax(bcs)]
                    else:
                        sequence[part[0]][0][s] = part[3]
                    if dp < 3:
                        lowC += 1
                    else:
                        bcs.sort()
                        bcs = [bcs[-1], np.sum(bcs[:-1])]
                        q1 = binom.cdf(bcs[0], bcs[0] + bcs[1], p)
                        q2 = qPerRead * (bcs[0] - bcs[1]) if q1 >= 0.05 else 1
                        if q2 >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = min(40, max(1, int(q2)))
                else:
                    if dp < 3:
                        lowC += 1
                    else:
                        if qd >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = qd
                    if part[-1] == '1/1':
                        sequence[part[0]][0][s] = part[4]

    logger(
        '{0}: Expected mix-up: {1} {2} ; Got highQ {3} ; lowQ {4} ; lowC {5}'.
        format(prefix, uncertains.shape[0], p, highQ, lowQ, lowC))
    with open('{0}.metaCaller.fastq'.format(prefix), 'w') as fout:
        p = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(
                p + '_' + n, ''.join(s), ''.join([chr(qq + 33) for qq in q])))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    os.unlink('{0}.mapping.fasta'.format(prefix))
    os.unlink('{0}.mapping.reference.fasta'.format(prefix))
    return '{0}.metaCaller.fastq'.format(prefix)
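
The per-site quality in this variant is a log-likelihood ratio: with p the genome-wide fraction of majority-allele reads among mixed calls, each net supporting read contributes 10 * log10(p / (1 - p)) dB, so q2 = qPerRead * (majority - minority). A compact sanity check with an assumed p:

import numpy as np

p = 0.9                                        # assumed majority fraction
qPerRead = 10 * (np.log10(p) - np.log10(1 - p))
# one net supporting read is worth about 9.5 dB of confidence here
assert round(qPerRead, 1) == 9.5
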
Example #12
    def get_quality(self, reference, reads):
        if parameters['mapper'] != 'bwa':
            bams = self.__run_bowtie(reference, reads)
        else:
            bams = self.__run_bwa(reference, reads)

        sequence = readFasta(filename=reference, qual=0)
        for n, s in sequence.items():  # .iteritems() is Python 2 only
            s[1] = list(s[1])

        sites = {
            n: np.array([0 for ss in s[1]])
            for n, s in sequence.items()
        }
        for bam in bams:
            if bam is not None:
                depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                    bam=bam, **parameters).split(),
                              stdout=PIPE,
                              universal_newlines=True).communicate()[0]  # text mode so split('\n') works on Python 3
                for line in depth.split('\n'):
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0:
                        sites[part[0]][int(part[1]) - 1] += float(part[2])
        sites = {n: [s.size, np.mean(s), 0.] for n, s in sites.items()}
        depth = np.array(list(sites.values()))  # list() required on Python 3
        depth = depth[np.argsort(-depth.T[0])]
        size = np.sum(depth.T[0])
        acc = [0, 0]
        for d in depth:
            acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0] * d[1]
            if acc[0] * 2 >= size:
                break
        ave_depth = acc[1] / acc[0]
        exp_mut_depth = max(ave_depth * 0.2, 1.)
        for n, s in sites.items():
            s[2] = s[1] / ave_depth
        logger('Average read depth: {0}'.format(ave_depth))
        logger('Sites with more than {0} or 15% unsupported reads are not called'.
               format(exp_mut_depth))
        sequence = {n: s for n, s in sequence.items() if sites[n][1] > 0.}
        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
            for n, s in sorted(sequence.items()):
                fout.write('>{0}\n{1}\n'.format(
                    n, '\n'.join([
                        s[0][site:(site + 100)]
                        for site in range(0, len(s[0]), 100)
                    ])))
        bam_opt = ' '.join(
            ['--bam {0}'.format(b) for b in bams if b is not None])
        pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
        Popen(pilon_cmd.split(), stdout=PIPE).communicate()

        cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
        logger(
            'Contigs with less than {0} depth will be removed from the assembly'
            .format(cont_depth[0] * ave_depth))
        logger(
            'Contigs with more than {0} depth will be treated as duplicates'.
            format(cont_depth[1] * ave_depth))

        with open('{0}.mapping.vcf'.format(prefix)) as fin, open(
                '{0}.mapping.difference'.format(prefix), 'w') as fout:
            for line in fin:
                if line.startswith('#'): continue
                part = line.strip().split('\t')
                if sites[part[0]][2] < cont_depth[0] or sites[
                        part[0]][2] >= cont_depth[1]:
                    continue
                try:
                    if part[-1] == '0/0' and len(part[3]) == 1 and len(
                            part[4]) == 1:
                        dp, af = float(part[7].split(';', 1)[0][3:]), float(
                            part[7][-4:])
                        if af < 0.15 and dp >= 3 and dp * af <= exp_mut_depth:
                            if part[6] == 'PASS' or (part[6] == 'LowCov' and
                                                     parameters['metagenome']):
                                site = int(part[1]) - 1
                                qual = chr(int(part[7].split(';')[4][3:]) + 33)
                                sequence[part[0]][1][site] = qual
                        else:
                            fout.write(line)
                    else:
                        fout.write(line)
                except Exception:
                    fout.write(line)
        if self.snps is not None:
            for n, snvs in self.snps.items():
                for site, snv in snvs:
                    if snv.find('N') >= 0: continue
                    if snv.startswith('+'):
                        s, e = site - 4, site + 3 + len(snv)
                    else:
                        s, e = site - 4, site + 4
                    for k in range(s, e):
                        sequence[n][1][k] = max(chr(40 + 33),
                                                sequence[n][1][k])

        with open('{0}.result.fastq'.format(prefix), 'w') as fout:
            for n, (s, q) in sequence.items():
                if sites[n][2] >= cont_depth[0]:
                    fout.write('@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format(
                        n, s, ''.join(q), *sites[n]))
        os.unlink('{0}.mapping.vcf'.format(prefix))
        logger('Final result is written into {0}'.format(
            '{0}.result.fastq'.format(prefix)))
        return '{0}.result.fastq'.format(prefix)