Exemple #1
0
def assemble(args):
    """Parse the arguments, gather the read libraries and run the pipeline.

    Sets the module-level ``parameters``, ``prefix`` and ``reads``.  Each
    entry of ``parameters['pe']`` must name exactly two comma-separated
    files and each entry of ``parameters['se']`` exactly one.  Unless
    ``parameters['onlyEval']`` is set, the reads are handed to
    ``mainprocess().launch``; otherwise ``parameters['reference']`` is used
    as the assembly.  The report from ``postprocess().launch`` is printed as
    indented, key-sorted JSON.
    """
    global reads, prefix, parameters
    parameters = add_args(args).__dict__
    parameters.update(externals)
    prefix = parameters['prefix']

    reads = []
    # One rule per library type: (specs, required file count, error template).
    library_rules = (
        (parameters['pe'], 2,
         'Allows 2 reads per PE library. You specified {0}'),
        (parameters['se'], 1,
         'Allows one file per SE library. You specified {0}'),
    )
    for specs, expected, template in library_rules:
        for spec in specs:
            file_names = spec.split(',')
            # str.split always yields at least one item, so every spec is
            # validated and stored.
            assert len(file_names) == expected, template.format(
                len(file_names))
            reads.append(file_names)

    n_files = sum([len(lib) for lib in reads])
    logger('Load in {0} read files from {1} libraries'.format(
        n_files, len(reads)))

    if parameters['onlyEval']:
        assembly = parameters['reference']
    else:
        assembly = mainprocess().launch(reads)

    report = postprocess().launch(assembly)
    import json
    print(json.dumps(report, sort_keys=True, indent=2))
Exemple #2
0
    def returnOverlap(self, blastab, param):
        """Collect overlaps between BLAST hits using ``tab2overlaps``.

        ``param[1:]`` must hold exactly two values; both are forwarded to
        ``tab2overlaps``.  Every hit is reduced to
        ``[contig index, column 15, min(col 8, col 9), max(col 8, col 9)]``
        and the hits are sorted by contig index and coordinates.

        Returns the vertically stacked buffer rows whose third column is
        positive, gathered over all ``tab2overlaps`` rounds.
        """
        logger('Calculate overlaps.')

        ovl_l, ovl_p = param[1:]

        # Each name in column 1 ends up mapped to the index of the LAST row
        # that carries it.
        contig_ids = {}
        for row_idx, hit in enumerate(blastab):
            contig_ids[hit[1]] = row_idx

        hits = []
        for hit in blastab:
            hits.append([contig_ids[hit[1]], hit[15]] +
                        sorted([hit[8], hit[9]]))
        hits.sort(key=itemgetter(0, 2, 3))
        hits = np.array(hits, dtype=int)
        n_hits = len(hits)

        # Fixed-size work buffer.  Its last row is seeded with [0, 1, -1] and
        # the loop keeps calling tab2overlaps until that row's first value
        # turns negative.
        buf = np.empty(shape=[1000001, 3], dtype=int)
        buf[-1, :] = [0, 1, -1]
        collected = []
        while buf[-1, 0] >= 0:
            logger('Searching {0} / {1} tabs'.format(buf[-1, 0], n_hits))
            buf[:-1, :] = -1
            buf = tab2overlaps(hits, ovl_l, ovl_p, n_hits, buf)
            collected.append(buf[buf[:, 2] > 0])

        stacked = np.vstack(collected)
        logger('Identified {0} overlaps.'.format(len(stacked)))
        return stacked
Exemple #3
0
def evaluate(profile, cluster, stepwise, ave_gene_length=1000.) :
    """Score a hierarchical clustering against its allelic profiles.

    profile         -- path of a tab-delimited profile table; columns whose
                       header starts with '#' are dropped, except those
                       containing '#ST' (column 0 is used when none does).
    cluster         -- path of a tab-delimited cluster table; column 0 plus
                       every column whose header does not start with '#'.
    stepwise        -- only every `stepwise`-th cluster level is kept.
    ave_gene_length -- forwarded unchanged to ``get_silhouette``.

    Rows are restricted to the names present in both tables and the profile
    rows are reordered to follow the cluster rows.  The three metrics are
    stored in 'evalHCC.npz' in the current working directory.
    """
    with uopen(profile) as handle :
        logger('Loading profiles ...')
        header = handle.readline().strip().split('\t')
        st_columns = [idx for idx, field in enumerate(header) if field.find('#ST') >= 0]
        if not st_columns :
            st_columns = [0]
        allele_columns = [idx for idx, field in enumerate(header) if not field.startswith('#')]
        profile_table = pd.read_csv(handle, sep='\t', header=None, index_col=0,
                                    usecols=st_columns + allele_columns)
        profile_names = profile_table.index.values
        profile_matrix = profile_table.values

    with uopen(cluster) as handle :
        logger('Loading hierCC ...')
        header = handle.readline().strip().split('\t')
        level_columns = [0] + [idx for idx, field in enumerate(header) if not field.startswith('#')]
        cluster_table = pd.read_csv(handle, sep='\t', header=None, index_col=0,
                                    usecols=level_columns)
        cluster_names = cluster_table.index.values
        cluster_matrix = cluster_table.values
        kept_levels = np.arange(0, cluster_matrix.shape[1], stepwise)
        cluster_matrix = cluster_matrix[:, kept_levels]

    # Keep only the clustered entries that also have a profile ...
    shared = np.in1d(cluster_names, profile_names)
    cluster_matrix = cluster_matrix[shared]
    cluster_names = cluster_names[shared]

    # ... and put the profile rows into the same order as the cluster rows.
    rank = {name: pos for pos, name in enumerate(cluster_names)}
    pairs = np.array([[pos, rank[name]]
                      for pos, name in enumerate(profile_names) if name in rank])
    row_order = pairs[np.argsort(pairs.T[1]), 0]
    profile_names = profile_names[row_order]
    profile_matrix = profile_matrix[row_order]

    shannon = shannon_index(cluster_matrix)
    similarity = get_similarity('adjusted_rand_score', cluster_matrix, stepwise)
    silhouette = get_silhouette(profile_matrix, cluster_matrix, stepwise, ave_gene_length)

    np.savez_compressed('evalHCC.npz', shannon=shannon, similarity=similarity, silhouette=silhouette)
    logger('Done. Results saved in evalHCC.npz')
Exemple #4
0
    def get_quality(self, reference, reads ) :
        """Map reads onto `reference`, run pilon and write a FASTQ with per-base qualities.

        Steps, as implemented below:
          1. Map `reads` with minimap2 or bwa; any other value of
             ``parameters['mapper']`` selects bowtie.
          2. Accumulate ``samtools depth`` output per contig and derive an
             average depth from the largest contigs that together cover at
             least half of the total length.
          3. Run pilon on the contigs with non-zero depth and translate its
             VCF into quality characters; records that are not accepted as
             confident reference calls are copied to
             ``<prefix>.mapping.difference``.
          4. Write ``<prefix>.result.fastq`` for contigs whose relative depth
             reaches the lower ``parameters['cont_depth']`` bound, then delete
             the pilon VCF.

        Uses the module-level ``parameters`` and ``prefix`` and ``self.snps``.

        Returns the path of the written FASTQ file.
        """
        if parameters['mapper'] == 'minimap2' :
            bams = self.__run_minimap(prefix, reference, reads, )
        elif parameters['mapper'] != 'bwa' :
            bams = self.__run_bowtie(prefix, reference, reads, )
        else :
            bams = self.__run_bwa(prefix, reference, reads, )

        # Every contig starts with the lowest quality character at each base.
        sequence = readFasta(reference)
        for n, s in sequence.items() :
            q = ['!'] * len(s)
            sequence[n] = [s, q]

        sites = { n:np.array([0 for ss in s[1] ]) for n, s in sequence.items() }
        for bam in bams :
            if bam is not None :
                depth_proc = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **parameters).split(), stdout=PIPE, universal_newlines=True)
                for line in depth_proc.stdout :
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0 :
                        sites[part[0]][int(part[1]) - 1] += float(part[2])
                # Release the pipe and reap the child once its output is consumed.
                depth_proc.stdout.close()
                depth_proc.wait()
        # Per contig: [length, max(median depth, geometric-mean-like depth), relative depth (filled in below)].
        sites = {n:[s.size, np.max([np.median(s), np.exp(np.mean(np.log(s + 0.5)))-0.5]), 0.] for n, s in sites.items()}
        depth = np.array(list(sites.values()))
        depth = depth[np.argsort(-depth.T[0])]
        size = np.sum(depth.T[0])
        # Length-weighted mean depth over the longest contigs covering >= 50% of the bases.
        acc = [0, 0]
        for d in depth :
            acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0]*d[1]
            if acc[0] *2 >= size :
                break
        ave_depth = acc[1]/acc[0]
        exp_mut_depth = max(ave_depth * 0.2, 2.)
        for n, s in sites.items() :
            s[2] = s[1]/ave_depth
        logger('Average read depth: {0}'.format(ave_depth))
        sequence = {n:s for n, s in sequence.items() if sites[n][1]>0.}
        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout :
            for n, s in sorted(sequence.items()) :
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[0][start:(start+100)] for start in xrange(0, len(s[0]), 100)])))
        bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
        pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
        Popen( pilon_cmd.split(), stdout=PIPE, universal_newlines=True ).communicate()
        # Second attempt with a reduced --fix list when the first run left no VCF behind.
        if not os.path.isfile('{0}.mapping.vcf'.format(prefix)) :
            pilon_cmd = '{pilon} --fix snps,indels,gaps,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
            Popen( pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

        cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
        logger('Contigs with less than {0} depth will be removed from the assembly'.format(cont_depth[0]*ave_depth))
        logger('Contigs with more than {0} depth will be treated as duplicates'.format(cont_depth[1]*ave_depth))
        indels = []
        with open('{0}.mapping.vcf'.format(prefix)) as fin, open('{0}.mapping.difference'.format(prefix), 'w') as fout :
            for line in fin :
                if line.startswith('#') : continue
                part = line.strip().split('\t')
                if sites[part[0]][2] < cont_depth[0] or sites[part[0]][2] >= cont_depth[1] :
                    continue
                if part[-1] == '1/1':
                    # Mask the neighbourhood of multi-base REF/ALT records.  The
                    # position comes from this record's POS column (part[1]);
                    # the previous code read a `site` variable that was never
                    # assigned from the current record.
                    if len(part[3]) > 1 :
                        pos = int(part[1])
                        indels.append([part[0], max(0, pos-1), pos-1+len(part[3])+2])
                    elif len(part[4]) > 1 and part[4] != '<DUP>' :
                        pos = int(part[1])
                        indels.append([part[0], max(0, pos-2), pos-1+len(part[3])+2])

                try:
                    if part[-1] == '0/0' and len(part[3]) == 1 and len(part[4]) == 1 :
                        pp = part[7].split(';')
                        dp = float(pp[0][3:])
                        af = 100 - sorted([float(af) for af in pp[6][3:].split(',')])[-1]
                        if af <= 20 and dp >= 2 and dp * af/100. <= exp_mut_depth and (part[6] == 'PASS' or (part[6] == 'LowCov' and parameters['metagenome'])) :
                            site = int(part[1])-1
                            qual = chr(int(pp[4][3:])+33)
                            sequence[part[0]][1][site] = qual
                        else :
                            fout.write(line)
                    else :
                        fout.write(line)
                except Exception :
                    # Best effort: a record that cannot be parsed is reported as
                    # a difference.  KeyboardInterrupt/SystemExit still propagate.
                    fout.write(line)
        for n, s, e in indels :
            sequence[n][1][s:e] = ['!'] * len(sequence[n][1][s:e])

        if self.snps is not None :
            for n, snvs in self.snps.items() :
                for site, snv in snvs :
                    if snv.find('N') >= 0 : continue
                    if snv.startswith('+') :
                        s, e = site-4, site+3+len(snv)
                    else :
                        s, e = site-4, site+4
                    quals = sequence[n][1]
                    # Clamp the window to the contig: a negative start would
                    # otherwise wrap around to the end of the sequence and an
                    # end past the last base would raise IndexError.
                    for k in xrange(max(s, 0), min(e, len(quals))) :
                        quals[k] = max(chr(40+33), quals[k])

        with open('{0}.result.fastq'.format(prefix), 'w') as fout :
            p = prefix.rsplit('/', 1)[-1]
            for n, (s, q) in sequence.items() :
                if sites[n][2] >= cont_depth[0] :
                    fout.write( '@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format( p+'_'+n, s, ''.join(q), *sites[n] ) )
        os.unlink( '{0}.mapping.vcf'.format(prefix) )
        logger('Final result is written into {0}'.format('{0}.result.fastq'.format(prefix)))
        return '{0}.result.fastq'.format(prefix)
Exemple #5
0
    def reduce_depth(self, reads):
        """Trim leading barcode-like bases and subsample libraries that exceed ``max_base``.

        `reads` is a list of libraries, each a dict mapping a library type
        ('MP', 'PE' or anything else, handled as single-end) to a list of
        gzip-compressed FASTQ files.

        For every file the number of bases and reads is counted, and the base
        composition of the first 10 positions is sampled from the first
        200000 lines to decide how many leading bases to cut.  When
        ``parameters['max_base']`` is positive and smaller than the library
        size, the base budget is handed out to 'MP', then 'PE', then 'SE'.
        The rewritten files are named
        ``<prefix>.2.<lib_id>.{m,r1,r2,s}.fastq.gz``; read names are replaced
        unless ``parameters['noRename']`` is set.  The input files are
        deleted afterwards.

        Returns a list parallel to `reads` mapping library type to the new
        file names.
        """
        # NOTE(review): every command below is a shell string with file names
        # interpolated unquoted (shell=True); names containing spaces or shell
        # metacharacters will break or be interpreted by the shell.
        encode = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        read_stats = [{} for library in reads]
        new_reads = [{} for library in reads]
        for lib_id, (libraries, stat,
                     new_libs) in enumerate(zip(reads, read_stats, new_reads)):
            read_information = [0, 0]
            for lib_type, library in libraries.items():
                stat[lib_type] = []
                for fname in library:
                    # `wc` over the sequence lines: lines, words, characters.
                    p = Popen("{pigz} -cd {0}|awk 'NR%4==2'|wc".format(
                        fname, **externals),
                              shell=True,
                              stdout=PIPE,
                              universal_newlines=True).communicate()[0].strip(
                              ).split()
                    n_base, n_read = int(p[2]) - int(p[1]), int(p[0])
                    read_information[0] += n_base
                    read_information[1] += n_read
                    # Counts of A/C/G/T/other at each of the first 10 positions.
                    bcomp = [[0, 0, 0, 0, 0] for i in range(10)]
                    p = Popen(
                        "{pigz} -cd {0}|head -200000|awk 'NR%20==2'".format(
                            fname, **externals),
                        shell=True,
                        stdout=PIPE,
                        stderr=PIPE,
                        universal_newlines=True)
                    for line in p.stdout:
                        for b, bc in zip(line[:10], bcomp):
                            bc[encode.get(b, 4)] += 1
                    # Close both pipes and reap the shell so that no file
                    # descriptors or zombie processes pile up per input file.
                    p.stdout.close()
                    p.stderr.close()
                    p.wait()
                    # Cut up to and including the last position (scanning from
                    # position 10 backwards) that is dominated by one symbol
                    # (>= 80%) or, within the first two positions, holds more
                    # than 10% non-ACGT symbols.
                    seq_start = 0
                    for c in range(9, -1, -1):
                        bc = bcomp[c]
                        if max(bc) / 0.8 >= sum(bc) or (c < 2 and
                                                        bc[4] > 0.1 * sum(bc)):
                            seq_start = c + 1
                            break
                    stat[lib_type].append([n_base, seq_start])
            logger('Obtained {1} bases in {2} reads after Trimming in Lib {0}'.
                   format(lib_id, *read_information))
            n_base = read_information[0]
            sample_freq2 = float(
                parameters['max_base']
            ) / n_base if parameters['max_base'] > 0 and n_base > 0 else 1.
            if sample_freq2 >= 1:
                for ss in stat.values():
                    for s in ss:
                        s.append(sample_freq2)
            else:
                # Spend the base budget on MP first, then PE, then SE.
                max_base = float(parameters['max_base'])
                for lib_type in ('MP', 'PE', 'SE'):
                    if lib_type in stat:
                        ss = stat[lib_type]
                        n_base = sum([s[0] for s in ss])
                        sample_freq = float(max_base) / n_base
                        for s in ss:
                            s.append(sample_freq)
                        max_base = 0. if n_base >= max_base else max_base - n_base
            if 0 < sample_freq2 < 1:
                logger('Read depth too high. Subsampling.')

            for lib_type, library in libraries.items():
                # A sampling frequency of 0 means the budget was used up by
                # an earlier library type; such libraries get no output.
                if stat[lib_type][0][-1] > 0:
                    if lib_type == 'MP':
                        new_libs[lib_type] = [
                            '{0}.2.{1}.m.fastq.gz'.format(
                                parameters['prefix'], lib_id)
                        ]
                    elif lib_type == 'PE':
                        new_libs[lib_type] = [
                            '{0}.2.{1}.r1.fastq.gz'.format(
                                parameters['prefix'], lib_id),
                            '{0}.2.{1}.r2.fastq.gz'.format(
                                parameters['prefix'], lib_id)
                        ]
                    else:
                        new_libs[lib_type] = [
                            '{0}.2.{1}.s.fastq.gz'.format(
                                parameters['prefix'], lib_id)
                        ]
                    for f_id, (lib, s, nlib) in enumerate(
                            zip(library, stat[lib_type], new_libs[lib_type])):
                        sample_freq = s[-1]
                        if parameters['noRename'] == False:
                            if s[1] > 0:
                                logger(
                                    'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                                    .format(s[1], lib))
                                Popen(
                                    "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print \"@{4}_{5}_\"nr}} else {{print \"+\"}} }} }}'|{pigz} > {1}"
                                    .format(lib, nlib, min(sample_freq,
                                                           1.), s[1] + 1,
                                            lib_id, lib_type, **externals),
                                    shell=True).wait()
                            else:
                                Popen(
                                    "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print \"@{4}_{5}_\"nr}} else {{print \"+\"}} }} }}'|{pigz} > {1}"
                                    .format(lib, nlib, min(sample_freq,
                                                           1.), s[1] + 1,
                                            lib_id, lib_type, **externals),
                                    shell=True).wait()
                        else:
                            if s[1] > 0:
                                logger(
                                    'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                                    .format(s[1], lib))
                                Popen(
                                    "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print $0}} else {{print \"+\"}} }} }}'|{pigz} > {1}"
                                    .format(lib, nlib, min(sample_freq, 1.),
                                            s[1] + 1, lib_id, **externals),
                                    shell=True).wait()
                            else:
                                Popen(
                                    "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print $0}} else {{print \"+\"}} }} }}'|{pigz} > {1}"
                                    .format(lib, nlib, min(sample_freq, 1.),
                                            s[1] + 1, lib_id, **externals),
                                    shell=True).wait()
                for lib in library:
                    # Best-effort clean-up of the input file; only file-system
                    # errors (already gone, no permission) are ignored.
                    try:
                        os.unlink(lib)
                    except OSError:
                        pass
        return new_reads
Exemple #6
0
def hierCC(args):
    """Run complete-linkage hierarchical clustering on allelic profiles.

    Reads the tab-delimited profile table ``params.profile`` (first column is
    taken as the type id; other columns whose header starts with '#' are
    dropped), optionally reorders it so that types found in a previous
    ``params.incremental`` result come first, and optionally restricts
    clustering to the groups listed in ``params.partition``.

    Outputs:
      * ``<output>.completeCC.npz`` -- array ``completeCC`` with one row per
        type and one column per distance level.
      * ``<output>.completeCC.gz``  -- text table of the same assignments, or
        only the levels listed in ``params.delta`` when that is set.

    Sets the module-level ``mat`` and ``n_loci``.
    """
    params = get_args(args)
    ot = time.time()
    cluster_file = params.output + '.completeCC.npz'
    pool = Pool(10)

    global mat, n_loci
    mat = pd.read_csv(params.profile, sep='\t', header=None, dtype=str).values
    allele_columns = np.array(
        [i == 0 or (not h.startswith('#')) for i, h in enumerate(mat[0])])
    mat = mat[1:, allele_columns].astype(int)
    n_loci = mat.shape[1] - 1

    logger(
        '{0}: Loaded in allelic profiles with dimension: {1} and {2}. The first column is assumed to be type id.'
        .format(time.time() - ot, *mat.shape))

    if os.path.isfile(params.incremental):
        od = np.load(params.incremental, allow_pickle=True)
        cls = od['completeCC']

        typed = {c[0]: idx for idx, c in enumerate(cls) if c[0] > 0}
        if len(typed) > 0:
            logger('{0}: Loaded in {1} old completeCC assignments.'.format(
                time.time() - ot, len(typed)))
            # Move the previously typed rows to the front, keeping their order.
            mat_idx = np.array([t in typed for t in mat.T[0]])
            mat[:] = np.vstack([mat[mat_idx], mat[(mat_idx) == False]])
    else:
        typed = {}

    if os.path.isfile(params.partition):
        st_idx = {str(st): idx for idx, st in enumerate(mat.T[0])}
        from collections import defaultdict
        partitions = defaultdict(list)
        for st, grp in pd.read_csv(params.partition, sep='\t',
                                   dtype=str).values:
            partitions[grp].append(st_idx[st])
            st_idx[st] = -1
        logger('{0}: Load in {1} partition(s)'.format(time.time() - ot,
                                                      len(partitions)))
        # What is left are the types that belong to no partition.
        st_idx = {k: v for k, v in st_idx.items() if v >= 0}
    else:
        partitions = {'all': np.arange(mat.shape[0])}
        st_idx = {}

    # Every type starts as its own cluster at every level; types outside any
    # partition are zeroed and dropped before saving.
    res = np.repeat(mat.T[0], mat.shape[1]).reshape(mat.shape)
    res[list(st_idx.values()), :] = 0
    try:
        for key, indices in sorted(partitions.items()):
            if len(indices) <= 1:
                continue
            logger('{0}: Partition {1} contains {2} STs'.format(
                time.time() - ot, key, len(indices)))
            mat2 = mat[indices]
            logger(
                '{0}: Start to calculate pairwise distances'.format(
                    time.time() - ot))
            dist = get_distances(params.output, mat2, pool)
            logger('{0}: Start complete linkage clustering'.format(
                time.time() - ot))
            cls = linkage(ssd.squareform(dist), method='complete')
            logger('{0}: Start completeCC assignments'.format(
                time.time() - ot))
            # descendents[i] lists the leaves below linkage node i; leaves
            # come first, merged nodes are filled in as they are visited.
            descendents = [[i] for i in np.arange(dist.shape[0])
                           ] + [None for i in np.arange(dist.shape[0] - 1)]
            for idx, c in enumerate(cls.astype(int)):
                n_id = idx + dist.shape[0]
                d = sorted([int(c[0]), int(c[1])],
                           key=lambda x: descendents[x][0])
                min_id = descendents[d[0]][0]
                descendents[n_id] = descendents[d[0]] + descendents[d[1]]
                # From the merge distance onwards, the joining group takes
                # over the label of the group with the smaller first leaf.
                for tgt in descendents[d[1]]:
                    res[indices[tgt], c[2] + 1:] = res[indices[min_id],
                                                       c[2] + 1:]
    finally:
        # The worker pool is only used for the distance calculations; shut
        # it down instead of leaving its processes to the interpreter exit.
        pool.close()
        pool.join()
    res = res[res.T[0] > 0]
    np.savez_compressed(cluster_file, completeCC=res)

    if not params.delta:
        with uopen(params.output + '.completeCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(level) for level in np.arange(n_loci)])))
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(rr) for rr in r]) + '\n')
    else:
        # Must be a list: it is iterated once for the header and again for
        # every row.  A bare map() object is exhausted after the header on
        # Python 3, which left every data row empty.
        deltas = [int(level) for level in params.delta.split(',')]
        with uopen(params.output + '.completeCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(level) for level in deltas])))
            # NOTE(review): rows hold only the selected levels and no ST id,
            # although the header starts with '#ST_id' -- confirm intent.
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(r[level + 1])
                                      for level in deltas]) + '\n')
    del res
    logger(
        'NUMPY clustering result (for incremental completeCC): {0}.completeCC.npz'
        .format(params.output))
    logger(
        'TEXT  clustering result (for visual inspection): {0}.completeCC.gz'.
        format(params.output))
Exemple #7
0
def nomenclature(query, reference, ref_aa='', **params):
    """Call alleles in `query` by comparing it against `reference`.

    query     -- passed to ``seqOperation().write_query``.
    reference -- reference set, used as the nucleotide query of the search.
    ref_aa    -- translated reference file; regenerated with
                 ``seqOperation().write_refsets(reference)`` when it is not
                 an existing file.
    **params  -- accepted but not referenced in this function body; all
                 settings are read from the module-level ``parameters``.

    Returns the alleles produced by ``blastParser.form_alleles``.
    """
    # write query
    logger('EnSign starts')
    sequence, qry_fna, qry_faa = seqOperation().write_query(query)
    # .values() instead of the Python-2-only .itervalues(), so the size report
    # works on both Python 2 and 3.
    logger('Read in {0} bases as query'.format(
        sum([len(s[0]) for s in sequence.values()])))
    # write refset
    if not os.path.isfile(str(ref_aa)):
        ref_aa = seqOperation().write_refsets(reference)
        logger('Prepare translated references')
    # do comparison
    blasttab = dualBlast().run_ublast(fna_target=qry_fna,
                                      faa_target=qry_faa,
                                      fna_query=reference,
                                      faa_query=ref_aa)
    # filter
    blasttab_parser = blastParser()
    blasttab = blasttab_parser.linear_merge(blasttab, **parameters)
    logger('Merge closely located hits. {0} hits'.format(len(blasttab)))
    loci = blasttab_parser.parse_ublast(blasttab, parameters)
    logger('Identify homologous groups. {0} groups'.format(
        len([1 for lc in loci if lc != '__non_specific__'])))
    regions = blasttab_parser.inter_loci_overlap(loci, parameters)
    logger('Resolve potential paralogs. {0} regions'.format(len(regions)))

    # submission
    alleles = blasttab_parser.form_alleles(regions, sequence,
                                           parameters['unique_key'],
                                           parameters['high_quality'],
                                           parameters)
    logger('Generate allelic sequences. {0} remains'.format(len(alleles)))
    return alleles
Exemple #8
0
def read_matrix(fname):
    """Load a tab-delimited SNP matrix and collapse identical site patterns.

    Header handling:
      * '## Constant_bases'    -> counts stored under the ASCII codes
                                  65, 67, 71, 84 ('A', 'C', 'G', 'T').
      * '## Sequence_length:'  -> [name, length] entries.
      * '## Missing_region:'   -> [name, int, int] entries.
      * the first other line is the column header: columns not starting
        with '#' are samples and columns starting with '#!W' are weights.
        If that line does not start with '#', every column after the first
        two is taken as a sample.

    Rows with fewer than two distinct non-gap states are skipped.  Rows that
    contain '.' or multi-character states are stored as integer codes with
    0 meaning '-'; all other rows are stored as ASCII codes.

    Returns ``(names, sites, snps, seqLens, missing)``:
      names   -- sample names,
      sites   -- object array of [column 0, int(column 1), pattern index,
                 state labels] per kept row,
      snps    -- object array of [index, weight rounded up, pattern, type],
                 with one extra constant pattern per ``Constant_bases`` entry,
      seqLens, missing -- the header entries described above.
    """
    # A dict from the start, so a file without a '## Constant_bases' line
    # still supports the .items() loop at the end.
    invariant = {}
    seqLens, missing = [], []

    with uopen(fname) as fin:
        for line_id, line in enumerate(fin):
            if line.startswith('##'):
                if line.startswith('## Constant_bases'):
                    part = line[2:].strip().split()
                    invariant = dict(
                        zip([65, 67, 71, 84], [float(v) for v in part[1:]]))
                elif line.startswith('## Sequence_length:'):
                    part = line[2:].strip().split()
                    seqLens.append([part[1], int(part[2])])
                elif line.startswith('## Missing_region:'):
                    part = line[2:].strip().split()
                    missing.append([part[1], int(part[2]), int(part[3])])
            elif line.startswith('#'):
                part = np.array(line.strip().split('\t'))
                cols = np.where(~np.char.startswith(part, '#'))[0]
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break
            else:
                part = np.array(line.strip().split('\t'))
                # Index arrays, like the branch above: the code below
                # concatenates them into `usecols` and tests `w_cols.size`,
                # neither of which works with boolean masks.
                cols = np.arange(2, part.size)
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break

        bases, weights, sites = [], [], []
        for mat in pd.read_csv(fin,
                               header=None,
                               sep='\t',
                               usecols=cols.tolist() + w_cols.tolist() +
                               [0, 1],
                               chunksize=10000,
                               engine='c',
                               dtype=str,
                               low_memory=False,
                               na_filter=False):
            mat = mat.values
            # Progress line: first site of the chunk, peak memory, sites kept.
            logger('{0}\t{1}\t{2}\t{3}'.format(\
                mat[0, 0], mat[0, 1], \
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, len(sites) ))

            for m in mat:
                # '-' is prepended so that it is always among the states;
                # bidx[1:] are then the per-sample state indices.
                btype, bidx = np.unique(['-'] + m[cols].tolist(),
                                        return_inverse=True)
                if btype.size <= 2:
                    continue
                sites.append([m[0], int(m[1]), 1, np.array([])])
                weights.append(
                    m[w_cols].astype(float).prod() if w_cols.size else 1.)
                if '.' in btype or max(map(len, btype)) > 1:
                    # Renumber the states so that '-' becomes code 0 and the
                    # states sorting before it shift up by one (45 is only a
                    # temporary marker).
                    missing_val = np.where(btype == '-')[0][0]
                    bidx[bidx == missing_val] = 45
                    bidx[bidx < missing_val] += 1
                    bidx[bidx == 45] = 0
                    sites[-1][3] = np.array(['-'] +
                                            btype[btype != '-'].tolist())
                    sites[-1][2] = 2
                    bases.append(bidx[1:])
                else:
                    bases.append(
                        np.array(list(map(ord, btype)),
                                 dtype=np.uint8)[bidx[1:]])

    bases, weights, sites = np.vstack(bases), np.array(weights), np.array(
        sites, dtype=object)
    # Sorting the rows puts identical patterns next to each other so they can
    # be merged in one pass, adding up their weights.
    indices = np.lexsort(bases.T)
    snps = []

    for idx in indices:
        s, b, w = sites[idx], bases[idx], weights[idx]
        if not snps or np.any(b != snps[-1][2]):
            snps.append([len(snps), w, b, s[2]])
        else:
            snps[-1][1] += w
        s[2] = snps[-1][0]

    # One constant pattern per invariant base, weighted by its count.
    for inv in invariant.items():
        b_key = np.array([inv[0]] * len(names), dtype=np.uint8)
        snps.append([len(snps), float(inv[1]), b_key, 0])
    for snp in snps:
        snp[1] = np.ceil(snp[1])
    return names, sites, np.array(snps, dtype=object), np.array(
        seqLens, dtype=object), np.array(missing, dtype=object)
Exemple #9
0
def filt_genes(prefix, groups, global_file, conflicts, first_classes=None):
    """Pick gene groups batch by batch and write the accepted ones to disk.

    Batches of candidate genes are drawn with ``get_gene`` until ``groups`` is
    empty. Within a batch the gene with the highest summed score is taken
    first, checked against the ids already claimed by earlier genes, and, if
    accepted, written to ``'{prefix}.Prediction'``.

    Relies on the module-level names ``params``, ``pool``, ``readFasta``,
    ``get_gene``, ``filt_per_group`` and ``logger``.

    Args:
        prefix: Output prefix; the result file is ``'{prefix}.Prediction'``.
        groups: Dict mapping a gene to a 2-D array of candidate rows. It is
            consumed (entries are popped) and its arrays are modified in
            place. Columns used here: 0 (name, used as key into the cluster
            references and ``results``), 1 (grouping key for ``np.unique`` —
            presumably the genome, TODO confirm), 2 and 3 (multiplied into
            the ranking score), 5 (id checked against ``conflicts``/``used``)
            and 6 (the records that get written out).
        global_file: Passed through unchanged to ``filt_per_group``.
        conflicts: Array of rows ``(id_a, id_b, code)``; re-indexed below into
            a symmetric nested dict.
        first_classes: Passed through unchanged to ``get_gene``.

    Returns:
        The path ``'{prefix}.Prediction'``.
    """
    # Mask over the 16 fields of each written record: fields 3, 4, 5, 10 and
    # 15 are left out of the output line.
    outPos = np.ones(16, dtype=bool)
    outPos[[3, 4, 5, 10, 15]] = False

    # Re-index the conflict rows as conflicts[a][b] == conflicts[b][a] == code.
    c2 = {c: {} for c in np.unique(conflicts.T[:2])}
    for c in conflicts:
        c2[c[0]][c[1]] = c2[c[1]][c[0]] = c[2]
    conflicts = c2

    clust_ref = readFasta(params['clust'])
    for gene, g in groups.items():
        # Scale column 2 by column 3 in place, then order the rows by the
        # scaled value, highest first (mergesort keeps ties in input order).
        g.T[2] *= g.T[3]
        g[:] = g[np.argsort(-g.T[2], kind='mergesort')]
    # used:    id -> 0, 1, or the name (column 0) of the row that claimed it.
    # results: first-row name of an accepted group -> pan gene name it got.
    # run:     genes whose group has already been through filt_per_group.
    used, results, run = {}, {}, {}
    group_id = 0
    with open('{0}.Prediction'.format(prefix), 'w') as fout:
        while len(groups) > 0:
            genes = get_gene(groups, first_classes, cnt=50)
            # NOTE(review): nothing in this function changes `groups` before
            # this `continue`, so termination depends on get_gene eventually
            # returning a non-empty batch — confirm against get_gene.
            if len(genes) <= 0:
                continue
            # min_score comes from the last entry of the batch and min_rank
            # from the first; the comprehension below has its own scope, so
            # its `min_rank` does not overwrite this one.
            to_run, to_run_id, min_score, min_rank = [], [], genes[-1][
                1], genes[0][2]
            genes = {gene: score for gene, score, min_rank in genes}
            if params['orthology'] in ('ml', 'nj'):
                for gene, score in genes.items():
                    if gene not in run:
                        mat = groups[gene]
                        # bestPerGenome: first row index of each distinct
                        # column-1 value; matInGenome maps every row back to
                        # its distinct value. region_score is each row's
                        # score relative to that first (best, given the sort
                        # above) row.
                        _, bestPerGenome, matInGenome = np.unique(
                            mat.T[1], return_index=True, return_inverse=True)
                        region_score = mat.T[2] / mat[
                            bestPerGenome[matInGenome], 2]
                        if region_score.size >= bestPerGenome.size * 2:
                            # At least twice as many rows as distinct
                            # column-1 values: walk the rows in score order
                            # and drop any row whose id conflicts with a row
                            # that was kept earlier.
                            used2, kept = set([]), np.ones(mat.shape[0],
                                                           dtype=bool)
                            for id, m in enumerate(mat):
                                if m[5] in used2:
                                    kept[id] = False
                                else:
                                    used2.update(conflicts.get(m[5], {}))
                            mat = mat[kept]
                            _, bestPerGenome, matInGenome = np.unique(
                                mat.T[1],
                                return_index=True,
                                return_inverse=True)
                            region_score = mat.T[2] / mat[
                                bestPerGenome[matInGenome], 2]
                        if region_score.size > bestPerGenome.size * 3 and len(
                                region_score) > 500:
                            # Still a large group: derive a relative-score
                            # cut-off from the sorted scores and keep only
                            # the rows at or above it.
                            region_score2 = sorted(region_score, reverse=True)
                            cut = region_score2[bestPerGenome.size * 3 - 1]
                            if cut >= params['clust_identity']:
                                cut = min(
                                    region_score2[bestPerGenome.size * 5] if
                                    len(region_score) > bestPerGenome.size * 5
                                    else params['clust_identity'], 1.0 - 0.6 *
                                    (1.0 - params['clust_identity']))
                            mat = mat[region_score >= cut]

                        to_run.append([mat, clust_ref[mat[0][0]], global_file])
                        to_run_id.append(gene)
                # Refine all queued groups through the worker pool and store
                # the refined groups back under their gene.
                working_groups = pool.map(filt_per_group, to_run)
                #working_groups = [filt_per_group(d) for d in to_run]
                for gene, working_group in zip(to_run_id, working_groups):
                    groups[gene] = working_group
                    run[gene] = 1
            else:
                # NOTE(review): this branch reads `mat` before assigning it.
                # On the first pass nothing in this function has bound `mat`
                # yet; on later passes it is whatever the selection loop below
                # left behind. The filtered `mat` is also never stored back
                # into `groups`. It looks like a per-gene loop is missing
                # here — confirm against the upstream version before relying
                # on this path.
                _, bestPerGenome, matInGenome = np.unique(mat.T[1],
                                                          return_index=True,
                                                          return_inverse=True)
                region_score = mat.T[2] / mat[bestPerGenome[matInGenome], 2]
                mat[:] = mat[region_score >= params['clust_identity']]
                used2, kept = set([]), np.ones(mat.shape[0], dtype=bool)
                for id, m in enumerate(mat):
                    # Drop a row as soon as one of its records reuses a
                    # field-15 value seen in an earlier kept row.
                    for mmm in m[6]:
                        if mmm[15] in used2:
                            kept[id] = False
                            break
                    if kept[id]:
                        used2 |= {mmm[15] for mmm in m[6]}
                mat = mat[kept]
                _, bestPerGenome, matInGenome = np.unique(mat.T[1],
                                                          return_index=True,
                                                          return_inverse=True)

            while len(genes):
                # Score of a gene = sum of column 2 over the first row of each
                # distinct column-1 value; take the best-scoring gene left in
                # the batch.
                score, gene = max([[
                    np.sum(groups[gene][np.unique(groups[gene].T[1],
                                                  return_index=True)[1]].T[2]),
                    gene
                ] for gene in genes])
                # Stop the batch once the best remaining score falls below
                # the batch's own minimum; the rest is left in `groups` for a
                # later get_gene call.
                if score < min_score:
                    break
                mat = groups.pop(gene, [])
                genes.pop(gene)

                paralog, paralog2 = 0, 0
                supergroup = {}
                used2 = {}
                for m in mat:
                    gid = m[5]
                    conflict = used.get(gid, None)
                    if conflict is not None:
                        if not isinstance(conflict, int):
                            # The id was claimed under a gene name: count a
                            # vote for the pan gene that gene resolved to.
                            superC = results[conflict]
                            supergroup[superC] = supergroup.get(superC, 0) + 1
                        elif conflict > 0:
                            # The id is marked 1: a single-record row at or
                            # above the identity threshold rejects the whole
                            # gene; any other row is only counted.
                            if m[6].shape[0] <= 1 and m[3] >= params[
                                    'clust_identity']:
                                paralog = 1
                                break
                            else:
                                paralog2 += 1
                        # Rows with an already-claimed id are excluded from
                        # the output (only rows with column 3 > 0 are
                        # written below).
                        m[3] = -1
                    else:
                        # Unclaimed id: stage marks for every id it conflicts
                        # with. Code 1 records this row's name (unless the id
                        # is already claimed), code 2 records 1, and any
                        # other code writes 0 straight into `used`.
                        for g2, gs in conflicts.get(gid, {}).items():
                            if gs == 1:
                                if g2 not in used:
                                    used2[g2] = m[0]
                            elif gs == 2:
                                used2[g2] = 1
                            else:
                                used[g2] = 0
                # Reject the gene on a hard hit, or when at least a third of
                # its rows were counted as conflicts; the staged marks are
                # then discarded.
                if paralog or paralog2 * 3 >= mat.shape[0]:
                    continue
                else:
                    used.update(used2)

                pangene = mat[0][0]
                if len(supergroup):
                    # Adopt the most-voted existing pan gene if it covers at
                    # least a third of the rows, or at least a fifth with
                    # more than one vote.
                    pg, pid = max(supergroup.items(), key=itemgetter(1))
                    if pid * 3 >= mat.shape[0] or (pid * 5 >= mat.shape[0]
                                                   and pid > 1):
                        pangene = pg

                results[mat[0][0]] = pangene
                logger(
                    '{4} / {5}: pan gene "{3}" : "{0}" picked from rank {1} and score {2}'
                    .format(mat[0][0], min_rank, score, pangene, len(results),
                            len(groups) + len(results)))

                # One output group id per surviving row; each of the row's
                # records becomes one tab-separated line with the masked
                # fields removed.
                for grp in mat[mat.T[3] > 0]:
                    group_id += 1
                    for g in grp[6]:
                        fout.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            pangene, min_rank, group_id, grp[1],
                            '\t'.join(g[outPos].astype(str).tolist())))
    return '{0}.Prediction'.format(prefix)
Exemple #10
0
    def runDiamond(self, ref, qry, nhits=10, frames='7'):
        """Align translated reference pieces against the translated queries with DIAMOND.

        The query proteins (one chosen forward translation per sequence) are
        built into a DIAMOND database. The reference translations are cut
        into pieces, spread over five FASTA files, and each file is searched
        with ``diamond blastp``. The five result files are parsed through
        ``self.pool`` and the parsed tables are stacked.

        Args:
            ref: Reference file, read with ``readFastq`` when ``self.refSeq``
                is empty.
            qry: Query file, read with ``readFastq`` when ``self.qrySeq`` is
                empty.
            nhits: Value passed to DIAMOND's ``-k`` option.
            frames: Frame selector handed to ``transeq`` for the reference.

        Returns:
            A NumPy array with all parsed alignments stacked row-wise.
        """
        logger('Run diamond starts')

        n_chunks = 5
        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        # Query side: per sequence keep the translation that splits into the
        # fewest 'X'-separated pieces (ties resolved by the lower frame).
        qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
        with open(qryAA, 'w') as fout:
            for name, translations in sorted(qryAASeq.items()):
                ranked = [(len(aa[:-1].split('X')), frame_id, aa)
                          for frame_id, aa in enumerate(translations)]
                _, best_frame, best_aa = min(ranked)
                fout.write('>{0}:{1}\n{2}\n'.format(name, best_frame + 1,
                                                    best_aa))

        makedb_cmd = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qryAA)
        Popen(makedb_cmd.split(),
              stderr=PIPE,
              stdout=PIPE,
              universal_newlines=True).communicate()

        # Reference side: cut every translation at the first 'X' found after
        # at least 1000 residues, and remember each piece's start offset in
        # its header.
        refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
        toWrite = []
        for name, translations in sorted(refAASeq.items()):
            for frame_id, aa in enumerate(translations):
                pieces = re.findall('.{1000,}?X|.{1,1000}$', aa + 'X')
                pieces[-1] = pieces[-1][:-1]
                offsets = np.cumsum([0] +
                                    [len(piece) for piece in pieces[:-1]])
                toWrite.extend(
                    '>{0}:{1}:{2}\n{3}\n'.format(name, frame_id + 1, offset,
                                                 piece)
                    for offset, piece in zip(offsets, pieces) if len(piece))

        for chunk_id in xrange(n_chunks):
            chunk_fasta = '{0}.{1}'.format(refAA, chunk_id)
            with open(chunk_fasta, 'w') as fout:
                fout.writelines(toWrite[chunk_id::n_chunks])
            diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond,
                refAA=chunk_fasta,
                qryAA=qryAA,
                aaMatch='{0}.{1}'.format(aaMatch, chunk_id),
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio * 100.)
            Popen(diamond_cmd.split(),
                  stdout=PIPE,
                  stderr=PIPE,
                  universal_newlines=True).communicate()

        parse_jobs = [[
            '{0}.{1}'.format(aaMatch, chunk_id), self.refSeq, self.qrySeq,
            self.min_id, self.min_cov, self.min_ratio
        ] for chunk_id in xrange(n_chunks)]
        blastab = []
        for parsed_file in self.pool.imap_unordered(parseDiamond, parse_jobs):
            if parsed_file is None:
                continue
            # Each worker hands back the path of a saved array; load it and
            # remove the temporary file.
            blastab.append(np.load(parsed_file, allow_pickle=True))
            os.unlink(parsed_file)
        blastab = np.vstack(blastab)
        logger('Run diamond finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab
Exemple #11
0
    def init_cleanup(self, reads):
        """Merge the runs of each library and, unless disabled, trim them with BBDuk.

        For every library the first (and, when present, second) file of each
        run is concatenated into ``{prefix}.0.{lib}.{1,2}.fastq.gz``. When
        ``parameters['noTrim']`` equals ``False``, BBDuk is run on the merged
        files with a one-hour watchdog. On success the trimmed files replace
        the merged ones; on failure the merged files are kept and any partial
        BBDuk output is removed.

        Args:
            reads: Iterable of libraries; each library is a sequence of runs
                and each run is indexable by mate (``run[0]``, ``run[1]``).

        Returns:
            A list with one entry per library: the list of file names to use
            downstream (trimmed files on success, merged files otherwise).
        """
        prefix = parameters['prefix']

        def concatenate(sources, target):
            # Same effect as `cat a b > target`, but with an argv list and a
            # Python-side redirect so file names never go through a shell.
            with open(target, 'wb') as merged:
                Popen(['cat'] + list(sources), stdout=merged).wait()

        def discard(paths):
            # Best-effort cleanup: a file that is already gone must not stop
            # the removal of the remaining ones.
            for path in paths:
                try:
                    os.unlink(path)
                except OSError:
                    pass

        new_reads = []
        for lib_id, library in enumerate(reads):
            library_file = ['{0}.0.{1}.1.fastq.gz'.format(prefix, lib_id)]
            concatenate([run[0] for run in library], library_file[0])
            if len(library[0]) > 1:
                library_file.append('{0}.0.{1}.2.fastq.gz'.format(
                    prefix, lib_id))
                concatenate([run[1] for run in library], library_file[1])

            # Kept in a separate local so the `reads` argument is not
            # shadowed while it is being iterated.
            if len(library_file) == 1:
                bb_inputs = 'in=' + library_file[0]
                library_file2 = ['{0}.1.{1}.1.fastq.gz'.format(prefix, lib_id)]
                outputs = 'out=' + library_file2[0]
            else:
                bb_inputs = 'in=' + library_file[0] + ' in2=' + library_file[1]
                library_file2 = [
                    '{0}.1.{1}.1.fastq.gz'.format(prefix, lib_id),
                    '{0}.1.{1}.2.fastq.gz'.format(prefix, lib_id),
                    '{0}.1.{1}.3.fastq.gz'.format(prefix, lib_id)
                ]
                outputs = 'out=' + library_file2[0] + ' out2=' + library_file2[
                    1] + ' outs=' + library_file2[2]

            # Deliberately `== False` rather than `not ...`: a missing/None
            # setting must keep skipping the trimming step as before.
            if parameters['noTrim'] == False:
                bb_cmd = '{bbduk} -Xmx{memory} threads=8 rref={adapters} overwrite=t qout=33 k=23 mink=13 minlength=23 tbo=t entropy=0.75 entropywindow=25 mininsert=23 maxns=2 ktrim=r trimq={read_qual} {read} {outputs}'.format(
                    read=bb_inputs, outputs=outputs, **parameters)
                # Text mode so the report on stderr can be searched with a
                # str pattern on Python 3 as well as Python 2.
                bb_run = Popen(bb_cmd.split(),
                               stdout=PIPE,
                               stderr=PIPE,
                               universal_newlines=True)
                timer = Timer(3600, kill_child_proc, [bb_run])
                try:
                    timer.start()
                    bb_out = bb_run.communicate()
                finally:
                    timer.cancel()

                if bb_run.returncode == 0:
                    new_reads.append(library_file2)
                    discard(library_file)
                    stat = re.findall(
                        r'Result:\s+(\d+) reads .+\s+(\d+) bases', bb_out[1])
                    if stat:
                        logger('Obtained {1} bases in {0} reads after BBDuk2'.
                               format(*stat[0]))
                else:
                    new_reads.append(library_file)
                    stat = re.findall(
                        r'Input:\s+(\d+) reads .+\s+(\d+) bases', bb_out[1])
                    if stat:
                        logger(
                            'BBDuk2 failed! Use original reads with {1} bases in {0} reads'
                            .format(*stat[0]))
                    else:
                        # No statistics in the report (e.g. killed by the
                        # watchdog): still tell the operator what happened.
                        logger('BBDuk2 failed! Use original reads')
                    discard(library_file2)
            else:
                new_reads.append(library_file)
        return new_reads
Exemple #12
0
def hierCC(args):
    """Assign allelic profiles to hierarchical clusters and save the result.

    Reads a tab-separated profile table (first row is the header, first
    column the type id; header names starting with '#' are dropped apart
    from the first column), optionally reuses assignments from an earlier
    run, computes distances in batches of 100 profiles through a worker pool
    and writes two outputs:

    * ``<output>.npz`` with the full assignment matrix under the key
      ``hierCC``;
    * ``<output>.hierCC.gz`` with either every level or, when
      ``params.delta`` is set, only the comma-separated levels listed there.

    Sets the module-level globals ``mat`` and ``n_loci``, which the distance
    workers presumably read (verify against get_distance/get_distance2).

    Args:
        args: Command-line arguments, parsed by ``get_args``.
    """
    params = get_args(args)
    ot = time.time()
    profile_file, cluster_file, old_cluster = params.profile, params.output + '.npz', params.incremental

    global mat, n_loci
    mat = pd.read_csv(profile_file, sep='\t', header=None, dtype=str).values
    allele_columns = np.array(
        [i == 0 or (not h.startswith('#')) for i, h in enumerate(mat[0])])
    mat = mat[1:, allele_columns].astype(int)
    n_loci = mat.shape[1] - 1

    logger(
        '{0}: Loaded in allelic profiles with dimension: {1} and {2}. The first column is assumed to be type id.'
        .format(time.time() - ot, *mat.shape))
    if not params.immutable:
        # Profiles with fewer missing alleles (values <= 0) go first; the
        # stable sort keeps the input order among ties.
        absence = np.sum(mat <= 0, 1)
        mat = mat[np.argsort(absence, kind='mergesort')]

    typed = {}
    # Guard against an unset path: os.path.isfile(None) raises on Python 3.
    if old_cluster and os.path.isfile(old_cluster):
        od = np.load(old_cluster, allow_pickle=True)
        cls = od['hierCC']

        typed = {c[0]: id for id, c in enumerate(cls) if c[0] > 0}
        if len(typed) > 0:
            logger('{0}: Loaded in {1} old hierCC assignments.'.format(
                time.time() - ot, len(typed)))
            # Move the already-typed profiles to the front, order otherwise
            # unchanged.
            mat_idx = np.array([t in typed for t in mat.T[0]], dtype=bool)
            mat[:] = np.vstack([mat[mat_idx], mat[~mat_idx]])

    logger('{0}: Start hierCC assignments'.format(time.time() - ot))
    # Created after `mat` is final so the workers see the same matrix.
    pool = Pool(10)
    try:
        res = np.repeat(mat.T[0], mat.shape[1]).reshape(mat.shape)
        res[1:, 0] = n_loci + 1
        for index in xrange(0, mat.shape[0], 100):
            to_run = []
            for idx in np.arange(index, min(index + 100, mat.shape[0])):
                if mat[idx, 0] in typed:
                    # Known type: copy its previous assignment verbatim.
                    res[idx, :] = cls[typed[mat[idx, 0]], :]
                else:
                    to_run.append(idx)
            if len(to_run) == 0:
                continue
            if not params.immutable:
                dists = np.vstack(pool.map(get_distance, to_run))
                assignment(dists, res)
            else:
                dists = np.vstack(pool.map(get_distance2, to_run))
                assignment2(dists, res)

            logger('{0}: Assigned {1} of {2} types into hierCC.'.format(
                time.time() - ot, index, mat.shape[0]))
    finally:
        # The pool used to be left running until interpreter exit.
        pool.close()
        pool.join()

    res.T[0] = mat.T[0]
    np.savez_compressed(cluster_file, hierCC=res)

    ordered = res[np.argsort(res.T[0])]
    if not params.delta:
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(id) for id in np.arange(n_loci)])))
            for r in ordered:
                fout.write('\t'.join([str(rr) for rr in r]) + '\n')
    else:
        # A list, not a bare map(): on Python 3 the iterator was used up by
        # the header line, which left every data row empty.
        deltas = [int(d) for d in params.delta.split(',')]
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(id) for id in deltas])))
            for r in ordered:
                # Lead with the type id so the rows match the '#ST_id'
                # header, as in the full-table branch above.
                fout.write('\t'.join([str(r[0])] +
                                     [str(r[id + 1])
                                      for id in deltas]) + '\n')
    del ordered
    del res
    logger('NUMPY clustering result (for incremental hierCC): {0}.npz'.format(
        params.output))
    logger('TEXT  clustering result (for visual inspection): {0}.hierCC.gz'.
           format(params.output))
0
    def do_polish(self, reference, reads, reassemble=False, onlySNP=False):
        """Polish ``reference`` with the mapped reads using Pilon.

        When ``parameters['SNP']`` is set the work is delegated to
        ``do_polish_with_SNPs``. Otherwise the reads are mapped with the
        configured mapper, contigs without any covered position are dropped,
        Pilon is run on the rest, and the accepted changes from its VCF are
        applied to the sequences.

        Args:
            reference: FASTA file to polish.
            reads: Read libraries handed to the mapper.
            reassemble: When true Pilon is run with ``--fix all,breaks``
                instead of ``--fix all``.
            onlySNP: When true only single-base substitutions are applied.

        Returns:
            Path of the polished FASTA, ``'{prefix}.fasta'`` (or whatever
            ``do_polish_with_SNPs`` returns).
        """
        if parameters.get('SNP', None) is not None:
            return self.do_polish_with_SNPs(reference, parameters['SNP'])

        if parameters['mapper'] == 'minimap2':
            bams = self.__run_minimap(prefix, reference, reads)
        elif parameters['mapper'] != 'bwa':
            bams = self.__run_bowtie(prefix, reference, reads)
        else:
            bams = self.__run_bwa(prefix, reference, reads)

        # Names of contigs with at least one position of non-zero depth.
        covered = set()
        for bam in bams:
            if bam is None:
                continue
            depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **parameters).split(),
                          stdout=PIPE,
                          universal_newlines=True)
            try:
                for line in depth.stdout:
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0:
                        covered.add(part[0])
            finally:
                # Close the pipe and reap the child instead of leaving both
                # to the garbage collector.
                depth.stdout.close()
                depth.wait()

        sequence = readFasta(reference)
        sequence = {n: s for n, s in sequence.items() if n in covered}

        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
            for n, s in sorted(sequence.items()):
                fout.write('>{0}\n{1}\n'.format(
                    n, '\n'.join([
                        s[site:(site + 100)]
                        for site in xrange(0, len(s), 100)
                    ])))

        bam_opt = ' '.join(
            ['--bam {0}'.format(b) for b in bams if b is not None])

        def run_pilon(fix):
            # `fix` is spliced in before formatting so it cannot collide
            # with a key in `parameters`.
            pilon_cmd = ('{pilon} --fix ' + fix +
                         ' --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'
                         ).format(bam_opt=bam_opt, **parameters)
            Popen(pilon_cmd.split(),
                  stdout=PIPE,
                  stderr=PIPE,
                  universal_newlines=True).communicate()

        vcf_file = '{0}.mapping.vcf'.format(prefix)
        run_pilon('all,breaks' if reassemble else 'all')
        if not os.path.isfile(vcf_file):
            # No VCF from the first attempt: retry with a narrower fix list.
            run_pilon('snps,indels,gaps')

        snps = []
        with open(vcf_file) as fin, open(
                '{0}.mapping.changes'.format(prefix), 'w') as fout:
            for line in fin:
                if line.startswith('#'):
                    continue
                part = line.strip().split('\t')
                if part[-1] == '0/0':
                    continue
                try:
                    # A change is trusted when Pilon passed it or the last
                    # four characters of INFO parse to a value >= 0.75.
                    trusted = part[6] == 'PASS' or float(part[7][-4:]) >= 0.75
                    ori, alt = part[3], part[4]
                    site = int(part[1]) - 1
                except (ValueError, IndexError):
                    # Malformed record: skip it, but let every other kind of
                    # error surface instead of being silently swallowed.
                    continue
                if not (trusted and re.match(r'^[ACGTN]+$', alt)):
                    continue
                if onlySNP and not (len(ori) == 1 and len(alt) == 1):
                    continue
                snps.append([part[0], site, ori, alt])
                fout.write(line)

        os.unlink(vcf_file)
        for n in sequence.keys():
            sequence[n] = list(sequence[n])
        # Apply from the last record backwards so that length-changing edits
        # do not shift the coordinates of the records still to be applied.
        for n, site, ori, alt in reversed(snps):
            s = sequence[n]
            end = site + len(ori)
            s[site:end] = alt
        logger('Observed and corrected {0} changes using PILON'.format(
            len(snps)))
        with open('{0}.fasta'.format(prefix), 'w') as fout:
            for n, s in sorted(sequence.items()):
                s = ''.join(s)
                fout.write('>{0}\n{1}\n'.format(
                    n, '\n'.join([
                        s[site:(site + 100)]
                        for site in xrange(0, len(s), 100)
                    ])))
        return '{0}.fasta'.format(prefix)
Exemple #14
0
    def runUBlast(self, ref, qry, nhits=6, frames='7'):
        """Align translated references against the translated queries with uBLAST.

        The query proteins (one chosen forward translation per sequence) are
        written to a FASTA file. The reference translations are split into
        five interleaved parts; each part is written as the search database
        and searched with ``usearch -ublast``. The per-part hit tables are
        converted to nucleotide coordinates and concatenated.

        Args:
            ref: Reference file, read with ``readFastq`` when ``self.refSeq``
                is empty.
            qry: Query file, read with ``readFastq`` when ``self.qrySeq`` is
                empty.
            nhits: Value passed to usearch's ``-maxhits`` option.
            frames: Frame selector handed to ``transeq`` for the reference.

        Returns:
            A pandas DataFrame with the alignments of all parts.
        """
        logger('Run uBLAST starts')

        def parseUBlast(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            # Convert one usearch user-format table (amino-acid coordinates)
            # into nucleotide coordinates and apply the length filters.
            blastab = pd.read_csv(fin, sep='\t', header=None)
            blastab[2] /= 100.
            blastab = blastab[blastab[2] >= min_id]
            blastab[3], blastab[4] = blastab[3] * 3, blastab[4] * 3

            # Names look like '<sequence>:<frame>'; `n` is given by keyword
            # because newer pandas no longer accepts it positionally.
            qf = blastab[0].str.rsplit(':', n=1, expand=True)
            rf = blastab[1].str.rsplit(':', n=1, expand=True)
            if np.all(qf[0].str.isdigit()):
                qf[0] = qf[0].astype(int)
            if np.all(rf[0].str.isdigit()):
                rf[0] = rf[0].astype(int)
            blastab[0], qf = qf[0], qf[1].astype(int)
            blastab[1], rf = rf[0], rf[1].astype(int)
            blastab[6], blastab[
                7] = blastab[6] * 3 + qf - 3, blastab[7] * 3 + qf - 1
            blastab[14] = [[
                [3 * vv[0], vv[1]] for vv in v
            ] for v in map(getCIGAR, zip(blastab[15], blastab[14]))]

            blastab[12], blastab[13] = blastab[0].apply(lambda x: len(qryseq[
                str(x)])), blastab[1].apply(lambda x: len(refseq[str(x)]))

            # Frames 1-3 are mapped directly, the others are mirrored
            # against the reference length (column 13).
            rf3 = (rf <= 3)
            blastab.loc[rf3,
                        8], blastab.loc[rf3, 9] = blastab.loc[rf3, 8] * 3 + rf[
                            rf3] - 3, blastab.loc[rf3, 9] * 3 + rf[rf3] - 1
            blastab.loc[~rf3, 8], blastab.loc[
                ~rf3, 9] = blastab.loc[~rf3, 13] - (
                    blastab.loc[~rf3, 8] * 3 + rf[~rf3] - 3 -
                    3) + 1, blastab.loc[~rf3, 13] - (blastab.loc[~rf3, 9] * 3 +
                                                     rf[~rf3] - 3 - 1) + 1
            # d: how far each alignment runs past the end of its query or
            # reference (never negative).
            d = np.max([
                blastab[7] - blastab[12], blastab[9] - blastab[13],
                1 - blastab[9],
                np.zeros(blastab.shape[0], dtype=int)
            ],
                       axis=0)
            blastab[7] -= d

            # Shorten the last CIGAR segment by the same overhang. A plain
            # loop is used because np.vectorize without `otypes` calls its
            # function an extra time on the first element (shortening the
            # first alignment twice) and raises on empty input.
            for cigar, overhang in zip(blastab[14], d):
                cigar[-1][0] -= overhang

            d[~rf3] *= -1
            blastab[9] -= d
            blastab = blastab[
                (blastab[7] - blastab[6] + 1 >= min_ratio * blastab[12])
                & (blastab[7] - blastab[6] + 1 >= min_cov)]
            return blastab.drop(columns=[15, 16])

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        # Per query keep the forward translation that splits into the fewest
        # 'X'-separated pieces (ties resolved by the lower frame).
        qryAASeq = transeq(self.qrySeq, frame='F')
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                _, id, s = min([(len(s[:-1].split('X')), id, s)
                                for id, s in enumerate(ss)])
                fout.write('>{0}:{1}\n{2}\n'.format(n, id + 1, s))

        refAASeq = transeq(self.refSeq, frames)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for id, s in enumerate(ss):
                toWrite.append('>{0}:{1}\n{2}\n'.format(n, id + 1, s))

        # The number of passes and the slicing stride must agree; with five
        # passes and a stride of four the last database repeated sequences
        # from the first and their hits were reported twice.
        n_parts = 5
        blastab = []
        for part_id in xrange(n_parts):
            with open(refAA, 'w') as fout:
                fout.writelines(toWrite[part_id::n_parts])

            ublast_cmd = '{usearch} -self -threads {n_thread} -db {refAA} -ublast {qryAA} -mid {min_id} -query_cov {min_ratio} -evalue 1 -accel 0.9 -maxhits {nhits} -userout {aaMatch} -ka_dbsize 5000000 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(
                usearch=usearch,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio)
            Popen(ublast_cmd.split(),
                  stderr=PIPE,
                  stdout=PIPE,
                  universal_newlines=True).communicate()
            if os.path.getsize(aaMatch) > 0:
                # Hand over the path so pandas opens and closes the file
                # itself; the previous open() handle was never closed.
                blastab.append(
                    parseUBlast(aaMatch, self.refSeq, self.qrySeq,
                                self.min_id, self.min_cov, self.min_ratio))
        blastab = pd.concat(blastab)
        logger('Run uBLAST finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab
Exemple #15
0
    def get_quality(self, reference, reads):
        """Derive per-base qualities for ``reference`` from read mapping and Pilon.

        Reads are mapped with the configured mapper, per-contig depth is
        measured with ``samtools depth`` and Pilon is run on the contigs with
        non-zero mean depth. Reference-supported single-base sites that pass
        the depth and allele-frequency limits get their quality set from the
        VCF; every other record is copied to ``{prefix}.mapping.difference``.
        Contigs at or above the lower depth limit are written to
        ``{prefix}.result.fastq``.

        Args:
            reference: FASTA file to evaluate.
            reads: Read libraries handed to the mapper.

        Returns:
            Path of the result file, ``'{prefix}.result.fastq'``.
        """
        if parameters['mapper'] != 'bwa':
            bams = self.__run_bowtie(reference, reads)
        else:
            bams = self.__run_bwa(reference, reads)

        sequence = readFasta(filename=reference, qual=0)
        # .items() works on Python 2 and 3 alike (iteritems is Python 2 only).
        for n, s in sequence.items():
            s[1] = list(s[1])

        sites = {
            n: np.zeros(len(s[1]), dtype=int)
            for n, s in sequence.items()
        }
        for bam in bams:
            if bam is not None:
                # Text mode so the output can be split on '\n' under
                # Python 3 as well.
                depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                    bam=bam, **parameters).split(),
                              stdout=PIPE,
                              universal_newlines=True).communicate()[0]
                for line in depth.split('\n'):
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0:
                        sites[part[0]][int(part[1]) - 1] += float(part[2])
        # Per contig: [length, mean depth, depth relative to the average].
        sites = {n: [s.size, np.mean(s), 0.] for n, s in sites.items()}
        # list(...) is required: np.array on a Python 3 dict view yields a
        # 0-d object array instead of a table.
        depth = np.array(list(sites.values()))
        depth = depth[np.argsort(-depth.T[0])]
        size = np.sum(depth.T[0])
        # Average depth over the longest contigs that together cover at
        # least half of the assembly.
        acc = [0, 0]
        for d in depth:
            acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0] * d[1]
            if acc[0] * 2 >= size:
                break
        ave_depth = acc[1] / acc[0]
        exp_mut_depth = max(ave_depth * 0.2, 1.)
        for n, s in sites.items():
            s[2] = s[1] / ave_depth
        logger('Average read depth: {0}'.format(ave_depth))
        logger('Sites with over {0} or 15% unsupported reads is not called'.
               format(exp_mut_depth))
        sequence = {n: s for n, s in sequence.items() if sites[n][1] > 0.}
        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
            for n, s in sorted(sequence.items()):
                fout.write('>{0}\n{1}\n'.format(
                    n, '\n'.join([
                        s[0][site:(site + 100)]
                        for site in range(0, len(s[0]), 100)
                    ])))
        bam_opt = ' '.join(
            ['--bam {0}'.format(b) for b in bams if b is not None])
        pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
        Popen(pilon_cmd.split(), stdout=PIPE).communicate()

        cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
        logger(
            'Contigs with less than {0} depth will be removed from the assembly'
            .format(cont_depth[0] * ave_depth))
        logger(
            'Contigs with more than {0} depth will be treated as duplicates'.
            format(cont_depth[1] * ave_depth))

        with open('{0}.mapping.vcf'.format(prefix)) as fin, open(
                '{0}.mapping.difference'.format(prefix), 'w') as fout:
            for line in fin:
                if line.startswith('#'):
                    continue
                part = line.strip().split('\t')
                if sites[part[0]][2] < cont_depth[0] or sites[
                        part[0]][2] >= cont_depth[1]:
                    continue
                try:
                    if part[-1] == '0/0' and len(part[3]) == 1 and len(
                            part[4]) == 1:
                        dp, af = float(part[7].split(';', 1)[0][3:]), float(
                            part[7][-4:])
                        if af < 0.15 and dp >= 3 and dp * af <= exp_mut_depth:
                            if part[6] == 'PASS' or (part[6] == 'LowCov' and
                                                     parameters['metagenome']):
                                site = int(part[1]) - 1
                                qual = chr(int(part[7].split(';')[4][3:]) + 33)
                                sequence[part[0]][1][site] = qual
                        else:
                            fout.write(line)
                    else:
                        fout.write(line)
                except (ValueError, IndexError, KeyError):
                    # A record that cannot be parsed is kept as a difference;
                    # anything else (e.g. an I/O error) is allowed to surface.
                    fout.write(line)
        if self.snps is not None:
            floor_qual = chr(40 + 33)
            for n, snvs in self.snps.items():
                if n not in sequence:
                    # The contig was dropped above for having no coverage.
                    continue
                quals = sequence[n][1]
                for site, snv in snvs:
                    if snv.find('N') >= 0:
                        continue
                    if snv.startswith('+'):
                        s, e = site - 4, site + 3 + len(snv)
                    else:
                        s, e = site - 4, site + 4
                    # Clamp the window to the contig: a negative start used
                    # to wrap around to the end of the contig and an
                    # overlong end raised IndexError.
                    for k in range(max(s, 0), min(e, len(quals))):
                        quals[k] = max(floor_qual, quals[k])

        with open('{0}.result.fastq'.format(prefix), 'w') as fout:
            for n, (s, q) in sequence.items():
                if sites[n][2] >= cont_depth[0]:
                    fout.write('@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format(
                        n, s, ''.join(q), *sites[n]))
        os.unlink('{0}.mapping.vcf'.format(prefix))
        logger('Final result is written into {0}'.format(
            '{0}.result.fastq'.format(prefix)))
        return '{0}.result.fastq'.format(prefix)
Exemple #16
0
def read_matrix(fname) :
    """Load a tab-delimited SNP matrix and collapse its rows into unique patterns.

    Leading ``##`` lines are scanned for ``Constant_bases``,
    ``Sequence_length:`` and ``Missing_region:`` records.  The first line that
    does not start with ``##`` is consumed as the column header; everything
    after it is streamed through ``pd.read_csv`` in chunks of 10000 rows.

    Args:
        fname: path handed to ``uopen``.

    Returns:
        A 5-tuple ``(names, sites, patterns, seqLens, missing)``:

        * names    -- array of sample column names taken from the header line.
        * sites    -- one ``[column0, column1, pattern_id, categories]`` entry
                      per data row whose pattern flag is > 0.
        * patterns -- sorted list of
                      ``[pattern_id, ceil(weight), uint8 array, flag]``; flag is
                      1 when every code of the pattern is >= 45, 2 otherwise,
                      and 0 for patterns created only from ``Constant_bases``.
        * seqLens  -- ``[name, length]`` pairs from ``## Sequence_length:``.
        * missing  -- ``[name, start, end]`` from ``## Missing_region:``.
    """
    sites, snps = [], {}
    invariant = []
    seqLens, missing = [], []
    
    # 256-entry lookup table: everything defaults to 45 (ord('-')); the listed
    # symbols are remapped to the codes of the second array.
    # NOTE(review): presumably `asc2int` views one character as one integer
    # code, so '' becomes 0 -- confirm against its definition.
    validate = np.repeat(45, 256).astype(np.uint8)
    validate[np.array(['A', 'C', 'G', 'T', '.', '+', '-', '', '*']).view(asc2int)] = np.array(['A', 'C', 'G', 'T', '', '', '-', '', '']).view(asc2int)
    
    with uopen(fname) as fin :
        for line_id, line in enumerate(fin) :
            if line.startswith('##'):
                if line.startswith('## Constant_bases') :
                    part = line[2:].strip().split()
                    # pair the four counts with the codes of A, C, G, T
                    invariant = list(zip([65, 67, 71, 84], [float(v) for v in part[1:]]))
                elif line.startswith('## Sequence_length:') :
                    part = line[2:].strip().split()
                    seqLens.append([part[1], int(part[2])])
                elif line.startswith('## Missing_region:') :
                    part = line[2:].strip().split()
                    missing.append([part[1], int(part[2]), int(part[3])])
            elif line.startswith('#') :
                # header line: sample columns are the ones NOT starting with
                # '#', weight columns are the ones starting with '#!W'
                part = np.array(line.strip().split('\t'))
                cols = np.where((1 - np.char.startswith(part, '#')).astype(bool))[0]
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break
            else :
                # header line without a leading '#': the first two columns are
                # skipped and every other column is a sample.  Integer index
                # arrays are built here (not boolean masks) because the code
                # below needs indices for `usecols` and the sample count for
                # `cols.shape[0]`.
                part = np.array(line.strip().split('\t'))
                cols = np.arange(2, part.size)
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break
        # NOTE(review): `mat[:, cols]` below indexes by file column number, so
        # it assumes read_csv keeps every file column (no '#' columns other
        # than the first two and the '#!W' ones) -- confirm with real inputs.
        for mat in pd.read_csv(fin, header=None, sep='\t', usecols=cols.tolist() + w_cols.tolist() + [0,1], chunksize=10000) :
            mat = mat.values
            # progress line: first row's two leading columns, peak RSS, counts
            logger('{0}\t{1}\t{2}\t{3}\t{4}'.format(\
                mat[0, 0], mat[0, 1], \
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, len(sites), len(snps)))
            # translate every character of every sample cell through `validate`
            bk = validate[mat[:, cols].astype('str').view(asc2int)].reshape(mat.shape[0], -1, cols.shape[0])
            bk = np.moveaxis(bk, 1, 2)
            if bk.shape[2] > 1 :
                # a '-' code followed by a non-zero second code is reset to 0
                bk[(bk[:, :, 1] != 0) & (bk[:, :, 0] == 45), 0] = 0
            b_keys = bk[:, :, 0]
            # row weight is the product of the '#!W' columns, or 1 without any
            weights = mat[:, w_cols].astype(float).prod(1) if w_cols.size else np.ones(mat.shape[0], dtype=float)
            for (b_key, site, w) in zip(b_keys, mat, weights) :
                b_key = tuple(b_key)
                if min(b_key) == 0 :
                    # at least one cell had code 0: re-encode the row by the
                    # rank of each raw cell value.  The appended '' forces the
                    # empty/gap category to sort first so it gets index 0.
                    bk2 = np.concatenate([site[cols], ['']])
                    bk2[bk2 == '-'] = ''
                    category, b_key = np.unique(bk2, return_inverse=True)
                    if category[0] == '' :
                        category[0] = '-'
                    b_key = tuple(b_key[:-1].tolist())
                else :
                    category = []
                # snps[pattern] = [pattern_id, flag, accumulated weight]
                if b_key in snps :
                    snps[b_key][2] += w
                elif min(b_key) >= 45 : 
                    snps[b_key] = [len(snps), 1, w]
                else :
                    snps[b_key] = [len(snps), 2, w]
                    
                if snps[b_key][1] > 0 :
                    sites.append([ site[0], site[1], snps[b_key][0], np.array(category) ])

    # add the constant-base counts as patterns where every sample has that base
    for inv in invariant :
        b_key = tuple([inv[0]] * len(names))
        if b_key not in snps :
            snps[b_key] = [len(snps), 0, float(inv[1])]
        else :
            snps[b_key][2] += float(inv[1])
    return names, sites, sorted([[info[0], int(math.ceil(info[2])), np.array(line, dtype=np.uint8), info[1]] for line, info in snps.items() ]), seqLens, missing
Exemple #17
0
def buildReference(targets, sources, max_iden=0.9,  min_iden=0.6, coverage=0.7, paralog=0.1, relaxEnd=False) :
    """Select exemplar alleles from `targets` using `sources` as the reference set.

    Both inputs are iterables of dicts with the keys ``fieldname``,
    ``value_id`` and ``value``; each entry is written to FASTA as
    ``>{fieldname}_{value_id}``.  The pipeline is:

    1. nucleotide pre-filter through ``minimapFilter``;
    2. translated search with ``mmseqs search`` / ``convertalis``;
    3. removal of loci whose cross-locus identity exceeds ``1 - paralog``;
    4. clustering of the surviving alleles with ``mmseqs cluster``.

    All intermediate files live in a temporary ``NS_*`` directory created in
    the current working directory; it is always removed before returning.

    Args:
        targets: candidate alleles.
        sources: reference alleles; their order defines locus priority.
        max_iden: identity (and coverage) threshold for the final clustering.
        min_iden: minimum identity for the searches.
        coverage: minimum coverage for the searches.
        paralog: margin used both for dropping loci and for keeping alleles.
        relaxEnd: when True, same-locus hits are accepted without requiring
            the alignment to reach both ends.

    Returns:
        FASTA text (header and sequence lines joined by newlines) of the
        selected alleles.  If any step fails the error is logged and whatever
        has been collected so far (usually an empty string) is returned.
    """
    # reversed() so that, for a repeated fieldname, the smallest index wins
    orderedLoci = { t['fieldname']:i for i, t in reversed(list(enumerate(sources))) }
    refsets = []
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    try:
        tmpDir = os.path.join(dirPath, 'tmp')
        sourceFna = os.path.join(dirPath, 'sourceFna')
        sourceFaa = os.path.join(dirPath, 'sourceFaa')
        targetFna = os.path.join(dirPath, 'targetFna')
        targetFiltFna = os.path.join(dirPath, 'targetFiltFna')
        targetFiltFaa = os.path.join(dirPath, 'targetFiltFaa')
        targetClsFna = os.path.join(dirPath, 'targetClsFna')
        targetResFna = os.path.join(dirPath, 'targetResFna')
        alnFaa = os.path.join(dirPath, 'alnFaa')
        
        with open(sourceFna+'.fas', 'w') as fout :
            fout.write('\n'.join(['>{fieldname}_{value_id}\n{value}'.format(**s) for s in sources]))
        with open(targetFna+'.fas', 'w') as fout :
            fout.write('\n'.join(['>{fieldname}_{value_id}\n{value}'.format(**t) for t in targets]))
        targetFiltFna, goodCandidates, crossSites = minimapFilter(sourceFna+'.fas', targetFna+'.fas', targetFiltFna, max_iden, min_iden, coverage, paralog, relaxEnd, orderedLoci)
        logger('identifed {0} good exemplar alleles after nucleic search'.format(len(goodCandidates)))
        
        subprocess.Popen('{0} createdb {1}.fas {1} --dont-split-seq-by-len'.format(mmseqs, sourceFna).split(), stdout = subprocess.PIPE).communicate()
        subprocess.Popen('{0} createdb {1}.fas {1} --dont-split-seq-by-len'.format(mmseqs, targetFiltFna).split(), stdout = subprocess.PIPE).communicate()
        subprocess.Popen('{0} translatenucs {1} {2}'.format(mmseqs, sourceFna, sourceFaa).split(), stdout = subprocess.PIPE).communicate()
        subprocess.Popen('{0} translatenucs {1} {2}'.format(mmseqs, targetFiltFna, targetFiltFaa).split(), stdout = subprocess.PIPE).communicate()
        # retry the search up to 9 times, lowering the thread count (9, 5, 1)
        # every three attempts
        for ite in range(9) :
            if os.path.isdir(tmpDir) :
                shutil.rmtree(tmpDir)
            p=subprocess.Popen('{0} search {2} {1} {3} {4} -c {6} --min-seq-id {5} --threads {t}'.format(mmseqs, sourceFaa, targetFiltFaa, alnFaa, tmpDir, min_iden, coverage, t=9-4*int(ite/3) ).split(), stdout = subprocess.PIPE)
            p.communicate()
            if p.returncode == 0 :
                break
            time.sleep(1)
        subprocess.Popen('{0} convertalis {2} {1} {3} {3}.tab --format-mode 2 --threads 8'.format(mmseqs, sourceFaa, targetFiltFaa, alnFaa).split(), stdout = subprocess.PIPE).communicate()
        
        with open(alnFaa + '.tab') as fin :
            for line in fin :
                part = line.strip().split('\t')
                # locus name = sequence name without its trailing "_<value_id>"
                qLoc, rLoc = part[0].rsplit('_', 1)[0], part[1].rsplit('_', 1)[0]
                if qLoc == rLoc:
                    # same locus: keep the best identity, requiring matching
                    # start coordinates and matching distance to the end
                    # unless relaxEnd is set
                    if relaxEnd or (int(part[8]) == int(part[6]) and int(part[12]) - int(part[7]) == int(part[13]) - int(part[9])) :
                        goodCandidates[part[0]] = max(goodCandidates.get(part[0], 0), float(part[2]))
                elif orderedLoci[qLoc] > orderedLoci[rLoc] and crossSites.get(part[0], 0) < float(part[2]) :
                    # hit against a higher-priority locus: remember best identity
                    crossSites[part[0]] = float(part[2])
        logger('identifed a total of {0} good exemplar alleles after amino search'.format(len(goodCandidates)))
        
        nLoci = len(orderedLoci)
        for s in sources :
            key = '{0}_{1}'.format(s['fieldname'], s['value_id'])
            if crossSites.get(key, 0) > 1-paralog :
                orderedLoci.pop(s['fieldname'], None)
        if nLoci > len(orderedLoci) :
            logger('Total of {0} loci are not suitable for MLST scheme [due to paralog setting]. There are {1} left'.format(nLoci - len(orderedLoci), len(orderedLoci)))

        # copy only records of retained loci whose same-locus identity beats
        # their cross-locus identity by more than `paralog`
        with open(targetFna+'.fas') as fin, open(targetClsFna+'.fas', 'w') as fout :
            writable = False
            for line in fin :
                if line.startswith('>') :
                    name = line[1:].strip().split()[0]
                    locus = name.rsplit('_', 1)[0]
                    writable = locus in orderedLoci and goodCandidates.get(name, 0) - crossSites.get(name, 0) > paralog
                if writable :
                    fout.write(line)
        subprocess.Popen('{0} createdb {1}.fas {1} --dont-split-seq-by-len'.format(mmseqs, targetClsFna).split(), stdout = subprocess.PIPE).communicate()
        # same retry scheme as for the search above
        for ite in range(9) :
            if os.path.isdir(tmpDir) :
                shutil.rmtree(tmpDir)
            p = subprocess.Popen('{0} cluster {1} {2} {3} -c {5} --min-seq-id {4} --threads {t}'.format(mmseqs, targetClsFna, targetResFna, tmpDir, max_iden, max_iden, t=9-4*int(ite/3)).split(), stdout = subprocess.PIPE)
            p.communicate()
            if p.returncode == 0 :
                break
            time.sleep(1)
        subprocess.Popen('{0} createtsv {1} {2} {3} {3}.tab'.format(mmseqs, targetClsFna, targetClsFna, targetResFna).split(), stdout = subprocess.PIPE).communicate()
        # from here on goodCandidates holds the first column of the cluster table
        goodCandidates = {}
        with open(targetResFna + '.tab') as fin :
            for line in fin :
                goodCandidates[line.split('\t', 1)[0]] = 1
        logger('There are {0} good exemplar alleles left after final clustering'.format(len(goodCandidates)))

        with open(targetClsFna+'.fas') as fin:
            writable = False
            for line in fin :
                if line.startswith('>') :
                    name = line[1:].strip().split()[0]
                    writable = name in goodCandidates
                if writable :
                    refsets.append(line.strip())
    except Exception as e :
        # Best effort: keep returning the partial result, but leave a trace
        # instead of silently hiding the failure.  KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        logger('buildReference failed, returning the alleles collected so far: {0}'.format(e))
    finally:
        shutil.rmtree(dirPath)
    return '\n'.join(refsets)
Exemple #18
0
def loadBam(prefix, reference, bams, sequences, snps):
    """Call a consensus FASTQ for `reference` from the given BAM files.

    Steps:

    1. keep only reference sequences that appear in the output of
       ``samtools depth`` for at least one BAM;
    2. run pilon on the reduced reference to obtain ``{prefix}.mapping.vcf``;
    3. first VCF pass: estimate `p`, the fraction of reads supporting the
       major base at uncertain single-base sites;
    4. second VCF pass: assign a base and a quality to every single-base
       record;
    5. write ``{prefix}.metaCaller.fastq`` and delete the intermediate files.

    Args:
        prefix: path prefix of every file that is created.
        reference: file handed to ``readFasta``.
        bams: BAM file names; ``None`` entries are skipped.
        sequences: mapping name -> per-position values; VCF records whose
            value is negative are ignored in the first pass.
        snps: not used by this function.

    Returns:
        The path ``{prefix}.metaCaller.fastq``.
    """
    sequence = readFasta(reference)
    # name -> [sequence, per-base quality (initially 0)]
    sequence = {n: [s, [0] * len(s)] for n, s in sequence.items()}

    sites = {}
    for bam in bams:
        if bam is not None:
            depth = subprocess.Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **externals).split(),
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True)
            try:
                d = pd.read_csv(depth.stdout, sep='\t').values
                sites.update({cName: 1 for cName in np.unique(d.T[0])})
            except Exception:
                # best effort: a BAM whose depth table cannot be parsed
                # (e.g. it is empty) simply contributes no contigs
                pass
            finally:
                # release the pipe and reap the child process
                depth.stdout.close()
                depth.wait()

    sequence = {n: s for n, s in sequence.items() if n in sites}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            # wrap the sequence at 100 bases per line
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([
                    s[0][site:(site + 100)]
                    for site in range(0, len(s[0]), 100)
                ])))

    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        prefix=prefix, bam_opt=bam_opt, **externals)
    subprocess.Popen(pilon_cmd.split(),
                     stdout=subprocess.PIPE,
                     universal_newlines=True).communicate()

    # Pass 1: collect [major base count, sum of the other counts] for every
    # single-base site with depth >= 3 that is heterozygous or has the 5th
    # INFO value < 10.
    uncertains = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if sequences[part[0]][int(part[1]) - 1] >= 0:
                if len(part[3]) == 1 and len(part[4]) == 1:
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])
                    if dp >= 3:
                        qd = int(pp[4][3:])
                        if part[-1] == '0/1' or qd < 10:
                            bcs = sorted(
                                [float(bc) for bc in pp[5][3:].split(',')])
                            uncertains.append([bcs[-1], np.sum(bcs[:-1])])
    uncertains = np.array(uncertains)
    # NOTE(review): when no uncertain site was collected, `uncertains` is
    # empty and the next line raises IndexError -- confirm whether callers
    # can hit this and what `p` should be in that case.
    p = np.sum(uncertains.T[0]) / np.sum(uncertains)
    # log-odds of `p` scaled by 10, used as the quality gained per extra read
    qPerRead = 10 * (np.log10(p) - np.log10(1 - p))
    for n in sequence:
        sequence[n][0] = list(sequence[n][0])

    # Pass 2: write the called base and quality for every single-base record
    highQ, lowQ, lowC = 0, 0, 0
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if len(part[3]) == 1 and len(part[4]) == 1:
                s = int(part[1]) - 1
                pp = part[7].split(';')
                dp = float(pp[0][3:])
                qd = int(pp[4][3:])
                if part[-1] == '0/1' or qd < 10:
                    # uncertain site: take the most frequent base, or the
                    # reference base when there are no counts at all
                    bcs = np.array([int(bc) for bc in pp[5][3:].split(',')])
                    if np.sum(bcs) > 0:
                        sequence[part[0]][0][s] = ['A', 'C', 'G',
                                                   'T'][np.argmax(bcs)]
                    else:
                        sequence[part[0]][0][s] = part[3]
                    if dp < 3:
                        lowC += 1
                    else:
                        bcs.sort()
                        bcs = [bcs[-1], np.sum(bcs[:-1])]
                        # quality is only computed when the binomial CDF of
                        # the major count under `p` is at least 0.05
                        q1 = binom.cdf(bcs[0], bcs[0] + bcs[1], p)
                        q2 = qPerRead * (bcs[0] - bcs[1]) if q1 >= 0.05 else 1
                        if q2 >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = min(40, max(1, int(q2)))
                else:
                    if dp < 3:
                        lowC += 1
                    else:
                        if qd >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = qd
                    if part[-1] == '1/1':
                        sequence[part[0]][0][s] = part[4]

    logger(
        '{0}: Expected mix-up: {1} {2} ; Got highQ {3} ; lowQ {4} ; lowC {5}'.
        format(prefix, uncertains.shape[0], p, highQ, lowQ, lowC))
    with open('{0}.metaCaller.fastq'.format(prefix), 'w') as fout:
        # from here on `p` is the basename of the prefix, used in read names
        p = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(
                p + '_' + n, ''.join(s), ''.join([chr(qq + 33) for qq in q])))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    os.unlink('{0}.mapping.fasta'.format(prefix))
    os.unlink('{0}.mapping.reference.fasta'.format(prefix))
    return '{0}.metaCaller.fastq'.format(prefix)
Exemple #19
0
    def runDiamond(self, ref, qry, nhits=10, frames='7'):
        """Align translated sequences with DIAMOND and return the hits.

        The query sequences are translated with ``transeq`` (frame ``'F'``)
        and indexed with ``diamond makedb``; the reference sequences are
        translated in `frames`, cut into pieces at ``X`` symbols and searched
        in five interleaved batches with ``diamond blastp``.

        Args:
            ref: reference file, read with ``readFastq`` if ``self.refSeq``
                is empty.
            qry: query file, read with ``readFastq`` if ``self.qrySeq`` is
                empty.
            nhits: value passed to diamond's ``-k`` option.
            frames: frame specification passed to ``transeq`` for the
                reference.

        Returns:
            A ``pandas.DataFrame`` with one row per accepted alignment and the
            columns built in ``parseDiamond`` (names, identity, alignment
            length, mismatches, gaps, coordinates, score, lengths, cigar).

        Raises:
            ValueError: from ``pd.concat`` when no batch produced any hit.
        """
        logger('Run diamond starts')

        def parseDiamond(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Parse DIAMOND SAM-like output; return a DataFrame or None if empty."""
            blastab = []
            for line in fin:
                if line.startswith('@'):
                    continue
                part = line.strip().split('\t')
                if part[2] == '*': continue
                # names carry the frame (and piece offset) after ':'
                qn, qf = part[0].rsplit(':', 1)
                rn, rf, rx = part[2].rsplit(':', 2)
                rs = int(part[3]) + int(rx)
                ql, rl = len(qryseq[str(qn)]), len(refseq[str(rn)])
                qm = len(part[9])
                # lengths are in amino acids; x3 converts to nucleotides
                if qm * 3 < min_cov: continue
                cov_ratio = qm * 3. / ql
                if cov_ratio < min_ratio: continue
                cigar = [[int(n) * 3, t]
                         for n, t in re.findall(r'(\d+)([A-Z])', part[5])]
                cl = np.sum([c[0] for c in cigar])
                variation = float(part[12][5:]) * 3 if part[12].startswith(
                    'NM:') else float(re.findall(r'NM:i:(\d+)', line)[0]) * 3

                iden = 1 - round(variation / cl, 3)
                if iden < min_id: continue
                qf, rf = int(qf), int(rf)
                qs = int(part[18][5:]) if part[18].startswith('ZS:') else int(
                    re.findall(r'ZS:i:(\d+)', line)[0])

                rm = int(
                    np.sum([c[0] for c in cigar if c[1] in {'M', 'D'}]) / 3)
                # frames 1-3 are forward, 4-6 are mapped from the far end
                if rf <= 3:
                    rs, r_e = rs * 3 + rf - 3, (rs + rm - 1) * 3 + rf - 1
                else:
                    rs, r_e = rl - (rs * 3 + rf - 6) + 1, rl - (
                        (rs + rm - 1) * 3 + rf - 4) + 1
                if qf <= 3:
                    qs, qe = qs * 3 + qf - 3, (qs + qm - 1) * 3 + qf - 1
                else:
                    qs, qe = ql - (qs * 3 + qf - 6) + 1, ql - (
                        (qs + qm - 1) * 3 + qf - 4) + 1
                    # report the query in ascending order and flip the rest
                    qs, qe, rs, r_e = qe, qs, r_e, rs
                    cigar = list(reversed(cigar))

                cd = [c[0] for c in cigar if c[1] != 'M']
                score = int(
                    part[14][5:]) if part[14].startswith('ZR:') else int(
                        re.findall(r'ZR:i:(\d+)', line)[0])
                blastab.append([
                    qn, rn, iden, cl,
                    int(variation - sum(cd)),
                    len(cd), qs, qe, rs, r_e, 0.0, score, ql, rl, cigar
                ])
            if not blastab:
                # every record was filtered out: an empty DataFrame has no
                # columns 0/1, so signal "nothing" to the caller instead
                return None
            blastab = pd.DataFrame(blastab)
            blastab[[0, 1]] = blastab[[0, 1]].astype(str)
            return blastab

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                # keep the frame whose translation has the fewest 'X' breaks
                _, frame_id, s = min([(len(s[:-1].split('X')), frame_id, s)
                                      for frame_id, s in enumerate(ss)])
                fout.write('>{0}:{1}\n{2}\n'.format(n, frame_id + 1, s))

        diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qryAA)
        Popen(diamond_fmt.split(),
              stderr=PIPE,
              stdout=PIPE,
              universal_newlines=True).communicate()

        refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for frame_id, s in enumerate(ss):
                # cut into pieces of >= 1000 residues ending at an 'X', plus
                # a final remainder; the sentinel 'X' is stripped again
                cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
                cdss[-1] = cdss[-1][:-1]
                # offset of every piece inside the translated frame
                cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
                for ci, cs in zip(cdsi, cdss):
                    if len(cs):
                        toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(
                            n, frame_id + 1, ci, cs))

        blastab = []
        for batch in range(5):
            with open(refAA, 'w') as fout:
                for line in toWrite[batch::5]:
                    fout.write(line)
            diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio * 100.)
            Popen(diamond_cmd.split(),
                  stdout=PIPE,
                  stderr=PIPE,
                  universal_newlines=True).communicate()
            # reset per batch so an empty batch neither fails on an unbound
            # name nor re-appends the previous batch's table
            tab = None
            if os.path.getsize(aaMatch) > 0:
                with open(aaMatch) as fin:
                    tab = parseDiamond(fin, self.refSeq, self.qrySeq,
                                       self.min_id, self.min_cov,
                                       self.min_ratio)
                os.unlink(aaMatch)
            if tab is not None:
                blastab.append(tab)
        blastab = pd.concat(blastab)
        logger('Run diamond finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab
Exemple #20
0
    def runMMseq(self, ref, qry):
        """Search translated queries against `ref` with MMseqs2 and return the hits.

        Databases are created for `ref` and `qry`, the query is translated
        with ``translatenucs`` and ``mmseqs search`` is retried up to nine
        times.  From the fourth attempt on, a second search against ORFs
        extracted from the reference (``extractorfs``) is tried as well.  The
        result is exported with ``convertalis`` and parsed by ``parseMMSeq``.

        Args:
            ref: reference sequence file.
            qry: query sequence file.

        Returns:
            A ``pandas.DataFrame`` of alignments with 15 positional columns
            (see ``parseMMSeq``), restricted to rows whose column 3 is at
            least ``self.min_cov`` and at least ``self.min_ratio`` times
            column 12.
        """
        logger('Run MMSeqs starts')

        def parseMMSeq(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Convert the convertalis table into the 15-column hit layout."""
            blastab = pd.read_csv(fin, sep='\t', header=None)
            blastab = blastab[blastab[2] >= min_id]
            qlen = blastab[0].apply(lambda r: len(qryseq[r]))
            rlen = blastab[1].apply(lambda r: len(refseq[r]))
            # cigar lengths are multiplied by 3 (amino acids -> nucleotides)
            cigar = blastab[14].apply(lambda x: [[int(n) * 3, t] for n, t in re
                                                 .findall(r'(\d+)([A-Z])', x)])
            # columns 6/7 converted from amino-acid to nucleotide coordinates
            ref_sites = pd.concat([3 * (blastab[6] - 1) + 1, 3 * blastab[7]],
                                  keys=[0, 1],
                                  axis=1)
            # d = number of bases by which the end exceeds the query length
            d = ref_sites[1] - qlen
            d[d < 0] = 0

            def ending(x, y):
                # shorten the last cigar element in place by the overhang
                x[-1][0] -= y

            np.vectorize(ending)(cigar, d)
            ref_sites[1] -= d

            # remove the same overhang from the matching end of columns 8/9,
            # which depends on the orientation of the hit
            direction = (blastab[8] < blastab[9])
            qry_sites = pd.concat([blastab[8], blastab[9] - d], axis=1)
            qry_sites[~direction] = pd.concat([blastab[8] - d, blastab[9]],
                                              axis=1)[~direction]

            blastab = pd.DataFrame(
                np.hstack([
                    blastab[[0, 1, 2]],
                    np.apply_along_axis(lambda x: x[1] - x[0] + 1, 1,
                                        ref_sites.values)[:, np.newaxis],
                    pd.DataFrame(np.zeros([blastab.shape[0], 2], dtype=int)),
                    ref_sites, qry_sites, blastab[[10, 11]],
                    qlen[:, np.newaxis], rlen[:, np.newaxis], cigar[:,
                                                                    np.newaxis]
                ]))
            return blastab[(blastab[3] >= min_cov)
                           & (blastab[3] >= blastab[12] * min_ratio)]

        tmpDir = os.path.join(self.dirPath, 'tmp')
        refNA = os.path.join(self.dirPath, 'refNA')
        qryNA = os.path.join(self.dirPath, 'qryNA')

        refCDS = os.path.join(self.dirPath, 'refCDS')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch2')

        Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
            mmseqs, ref, refNA).split(),
              stdout=PIPE).communicate()
        Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
            mmseqs, qry, qryNA).split(),
              stdout=PIPE).communicate()
        Popen('{0} translatenucs {1} {2}'.format(mmseqs, qryNA, qryAA).split(),
              stdout=PIPE).communicate()
        for ite in range(9):
            # mmseqs needs a fresh temporary directory for every attempt
            if os.path.isdir(tmpDir):
                shutil.rmtree(tmpDir)
            p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(\
                mmseqs, qryAA, refNA, aaMatch, tmpDir, self.n_thread, self.min_id, self.min_ratio).split(), stdout=PIPE)
            p.communicate()
            if p.returncode == 0:
                break
            if ite > 2:
                # after repeated failures, also try searching extracted ORFs
                Popen('{0} extractorfs {2} {3}'.format(mmseqs, qryAA, refNA,
                                                       refCDS).split(),
                      stdout=PIPE).communicate()
                p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(\
                    mmseqs, qryAA, refCDS, aaMatch, tmpDir, self.n_thread, self.min_id, self.min_ratio).split(), stdout=PIPE)
                p.communicate()
                if p.returncode == 0:
                    break
            time.sleep(1)
        # the column list is appended as ONE argv element so it is not split
        Popen('{0} convertalis {1} {2} {3} {3}.tab --threads {4} --format-output'.format(\
            mmseqs, qryAA, refNA, aaMatch, self.n_thread).split() + ['query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,raw,qlen,tlen,cigar'], stdout=PIPE).communicate()

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)
        # close the result table as soon as it has been parsed
        with open(aaMatch + '.tab') as fin:
            blastab = parseMMSeq(fin, self.refSeq, self.qrySeq,
                                 self.min_id, self.min_cov, self.min_ratio)
        logger('Run MMSeqs finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab
Exemple #21
0
def buildReference(alleles,
                   references,
                   max_iden=0.9,
                   min_iden=0.6,
                   coverage=0.7,
                   paralog=0.1,
                   relaxEnd=False):
    """Filter `alleles` against `references` and pick exemplar alleles.

    Both inputs are lists of dicts with the keys ``fieldname``, ``value_id``
    and ``value``.  Alleles are clustered with ``clust``, the cluster
    exemplars are compared with the references through ``uberBlast``, and the
    hits are used to detect cross-locus (paralogous) matches.

    Note that the ``fieldname`` of every rejected entry in `alleles` is set
    to ``''`` in place.

    Args:
        alleles: candidate alleles (modified in place, see above).
        references: reference alleles; their order defines locus priority.
        max_iden: clustering identity; same-locus hits at or above it are
            treated as too close to be exemplars.
        min_iden: minimum identity of a hit.
        coverage: minimum covered fraction of a hit.
        paralog: not used by this implementation.
        relaxEnd: not used by this implementation.

    Returns:
        ``(alleles_fasta, refsets_fasta)``: FASTA text of all retained
        alleles and of the selected exemplar alleles.

    Raises:
        Any exception raised while clustering or searching.  The temporary
        ``NS_*`` directory is removed in every case.
    """
    # reversed() so that, for a repeated fieldname, the smallest index wins
    orderedLoci = {
        t['fieldname']: i
        for i, t in reversed(list(enumerate(references)))
    }
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    # No `except` here on purpose: swallowing the error would only surface
    # later as a NameError on `blastab`/`crossSites`, hiding the real cause.
    try:
        sourceFna = os.path.join(dirPath, 'sourceFna')
        clsFna = os.path.join(dirPath, 'clsFna')
        targetFna = os.path.join(dirPath, 'targetFna')
        with open(sourceFna, 'w') as fout:
            fout.write('\n'.join([
                '>{fieldname}_{value_id}\n{value}'.format(**s) for s in alleles
            ]))
        with open(targetFna, 'w') as fout:
            fout.write('\n'.join([
                '>{fieldname}_{value_id}\n{value}'.format(**t)
                for t in references
            ]))
        # get cluster
        exampler, cluster = clust('-i {0} -p {1} -d {2} -c 1 -t 8'.format(\
            sourceFna, clsFna, max_iden).split())
        tooClose, goodCandidates, crossSites = {}, {}, {}
        with open(cluster) as fin:
            for line in fin:
                part = line.strip().split()
                # locus name = sequence name without its trailing "_<value_id>"
                locus = [p.rsplit('_', 1)[0] for p in part]
                if locus[0] != locus[1]:
                    # two alleles of different loci share a cluster
                    crossSites[part[0]] = locus[1]
                    crossSites[part[1]] = locus[0]

        # compare with references
        blastab = uberBlast('-r {0} -q {1} -f --blastn --diamondSELF --min_id {2} --min_ratio {3} -t 8 -p -s 1 -e 0,3'.format(\
            targetFna, exampler, min_iden, coverage ).split())
    finally:
        shutil.rmtree(dirPath)
    for tab in blastab:
        locus = [p.rsplit('_', 1)[0] for p in tab[:2]]
        # c: covered fraction (columns 6/7 over column 12)
        c = (tab[7] - tab[6] + 1) / tab[12]
        # e: largest offset between the two alignment ends; 0 = end-to-end
        e = max(abs(tab[8] - tab[6]),
                abs(tab[12] - tab[7] - (tab[13] - tab[9])))
        if c >= coverage and tab[2] >= min_iden:
            if locus[0] != locus[1]:
                crossSites[tab[0]] = locus[1]
                crossSites[tab[1]] = locus[0]
            elif e <= 0:
                if tab[2] >= max_iden and tab[0] != tab[1]:
                    tooClose[tab[0]] = 1
                else:
                    goodCandidates[tab[0]] = tab[2]
    # a locus is paralogous when one of its references cross-matches a locus
    # that comes later in `references`
    paralogous_loci = {}
    for ref in references:
        key = '{0}_{1}'.format(ref['fieldname'], ref['value_id'])
        if key in crossSites and orderedLoci[ref['fieldname']] < orderedLoci[
                crossSites[key]]:
            paralogous_loci[ref['fieldname']] = 1
    refsets = []
    for allele in alleles:
        if allele['fieldname'] in paralogous_loci:
            allele['fieldname'] = ''
        else:
            key = '{0}_{1}'.format(allele['fieldname'], allele['value_id'])
            if key in crossSites:
                allele['fieldname'] = ''
            elif key in goodCandidates and key not in tooClose:
                refsets.append(
                    '>{fieldname}_{value_id}\n{value}'.format(**allele))
    # only alleles that were not blanked above are reported
    alleles = [
        '>{fieldname}_{value_id}\n{value}'.format(**allele)
        for allele in alleles if allele['fieldname'] != ''
    ]
    logger('removed {0} paralogous sites.'.format(len(paralogous_loci)))
    logger('obtained {0} alleles and {1} references alleles'.format(
        len(alleles), len(refsets)))
    return '\n'.join(alleles), '\n'.join(refsets)
Exemple #22
0
    def reduce_depth(self, reads):
        """Subsample read libraries to ``parameters['max_base']`` and trim barcodes.

        For every file the number of bases is counted and the first ten
        positions of a sample of reads are inspected; a start position is
        trimmed when a position is dominated by one symbol.  Each file is then
        rewritten through ``zcat | awk | gzip`` as
        ``{prefix}.2.{library index}.{file index + 1}.fastq.gz``.

        Args:
            reads: list of libraries, each a list of file names readable by
                ``zcat``.

        Returns:
            A list parallel to `reads` holding the names of the files that
            were written for each library.

        Side effects:
            Every processed input file is deleted with ``os.unlink``.
        """
        encode = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        read_stats = [[] for library in reads]
        new_reads = [[] for library in reads]
        for lib_id, (library, stat,
                     new_lib) in enumerate(zip(reads, read_stats, new_reads)):
            for fname in library:
                # `wc` on the sequence lines: characters minus words gives
                # the base count (one word and one newline per line)
                p = Popen("zcat {0}|awk 'NR%4==2'|wc".format(fname),
                          shell=True,
                          stdout=PIPE).communicate()[0].strip().split()
                n_base = int(p[2]) - int(p[1])
                # per-position counts of A, C, G, T, other for positions 0-9
                bcomp = [[0, 0, 0, 0, 0] for i in range(10)]
                # text mode so that bases are str (not int bytes) on Python 3
                # and can be looked up in `encode`
                p = Popen("zcat {0}|head -400000|awk 'NR%20==2'".format(fname),
                          shell=True,
                          stdout=PIPE,
                          stderr=PIPE,
                          universal_newlines=True)
                for line in p.stdout:
                    for b, bc in zip(line[:10], bcomp):
                        bc[encode.get(b, 4)] += 1
                # drain stderr and reap the sampling pipeline
                p.communicate()
                seq_start = 0
                # scan from position 9 down to 0; trim up to the last position
                # where one symbol makes up >= 80% of the sample, or (for the
                # first two positions) where > 10% are not A/C/G/T
                for c in range(9, -1, -1):
                    bc = bcomp[c]
                    if max(bc) / 0.8 >= sum(bc) or (c < 2
                                                    and bc[4] > 0.1 * sum(bc)):
                        seq_start = c + 1
                        break
                stat.append([n_base, seq_start])
            n_base = sum([s[0] for s in stat])
            # an empty library (n_base == 0) is kept as is rather than
            # dividing by zero
            sample_freq = float(parameters['max_base']) / n_base if parameters[
                'max_base'] > 0 and n_base > 0 else 1.
            if sample_freq >= 1 or len(stat) < 3:
                sample_freqs = [sample_freq for s in stat]
            else:
                # three files: fill the budget from the first two files
                # first, and use the third only for what is left
                n_base2 = sum([s[0] for s in stat[:2]])
                if float(parameters['max_base']) <= n_base2:
                    sample_freqs = [
                        float(parameters['max_base']) / n_base2,
                        float(parameters['max_base']) / n_base2, 0.
                    ]
                else:
                    sample_freqs = [
                        1., 1.,
                        (float(parameters['max_base']) - n_base2) / stat[2][0]
                    ]
            if sample_freqs[0] < 1 and sample_freqs[0] > 0:
                logger('Read depth too high. Subsample to every {0:.2f} read'.
                       format(1. / sample_freqs[0]))

            for f_id, (lib, s, sample_freq) in enumerate(
                    zip(library, stat, sample_freqs)):
                if sample_freq > 0:
                    new_lib.append('{0}.2.{1}.{2}.fastq.gz'.format(
                        parameters['prefix'], lib_id, f_id + 1))
                    # The awk filter keeps record nr when int(nr*freq) grows,
                    # i.e. roughly every 1/freq-th read.  The four variants
                    # differ only in renaming the header (@<lib_id>_<nr>) and
                    # in trimming the first s[1] bases.
                    if parameters['noRename'] == False:
                        if s[1] > 0:
                            logger(
                                'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                                .format(s[1], lib))
                            Popen(
                                "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print \"@{4}_\"nr}} else {{print \"+\"}} }} }}'|gzip > {1}"
                                .format(lib, new_lib[-1], min(sample_freq, 1.),
                                        s[1] + 1, lib_id),
                                shell=True).wait()
                        else:
                            Popen(
                                "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print \"@{4}_\"nr}} else {{print \"+\"}} }} }}'|gzip > {1}"
                                .format(lib, new_lib[-1], min(sample_freq, 1.),
                                        s[1] + 1, lib_id),
                                shell=True).wait()
                    else:
                        if s[1] > 0:
                            logger(
                                'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                                .format(s[1], lib))
                            Popen(
                                "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print $0}} else {{print \"+\"}} }} }}'|gzip > {1}"
                                .format(lib, new_lib[-1], min(sample_freq, 1.),
                                        s[1] + 1, lib_id),
                                shell=True).wait()
                        else:
                            Popen(
                                "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print $0}} else {{print \"+\"}} }} }}'|gzip > {1}"
                                .format(lib, new_lib[-1], min(sample_freq, 1.),
                                        s[1] + 1, lib_id),
                                shell=True).wait()

                # the input file is removed whether or not it was sampled
                os.unlink(lib)
        return new_reads
Exemple #23
0
def filt_per_group(data):
    """Prune one group of hits down to a mutually compatible subset.

    Pairs of rows whose sequences differ by more than an expected threshold
    are flagged as incompatible. Rows are then clustered into near-identical
    groups, a gene tree is built from one representative per group plus the
    reference, and branches are cut from that tree until no incompatible
    pair remains among its leaves.

    Args:
        data: a 3-tuple ``(mat, ref, global_file)``.
            ``mat`` is a 2-D array of rows; column 4 of each row holds the
            row's sequence as an array (presumably uint8 ASCII codes -
            confirm against the caller) and column 1 is the key used to look
            up the expected pairwise difference (presumably a genome id).
            ``ref`` is the reference sequence as a string.
            ``global_file`` is a path readable by ``np.load``.

    Returns:
        ``mat`` unchanged when no incompatible pair exists, when the tree
        could not be built after three attempts, or when no leaf was pruned;
        otherwise ``mat`` restricted to the rows whose group representative
        survived in the tree.
    """
    mat, ref, global_file = data
    global_differences = dict(np.load(global_file))
    nMat = mat.shape[0]

    # Stack every row's sequence with the reference appended as the last row.
    seqs = np.vstack([
        np.vstack(mat.T[4]),
        np.array(list(ref)).view(asc2int).astype(np.uint8)[np.newaxis, :]
    ])
    # Anything that is not A/C/G/T (ASCII 65/67/71/84) becomes '-' (ASCII 45).
    seqs[np.in1d(seqs, [65, 67, 71, 84], invert=True).reshape(seqs.shape)] = 45
    # diff[i, j] is unpacked below as (mut, aln); presumably the number of
    # mismatches and of comparable sites - confirm against compare_seq.
    diff = compare_seq(
        seqs, np.zeros(shape=[seqs.shape[0], seqs.shape[0], 2],
                       dtype=int)).astype(float)

    # Flag every pair of rows whose observed difference exceeds the expected
    # one. Pairs without any comparable site are always flagged.
    incompatible = {}
    for i1, m1 in enumerate(mat):
        for i2 in xrange(i1 + 1, nMat):
            m2 = mat[i2]
            mut, aln = diff[i1, i2]
            if aln > 0:
                gd = global_differences.get(tuple(sorted([m1[1], m2[1]])),
                                            (0.01, 4))
                difference = mut / aln / gd[0] / gd[1]
            else:
                difference = 1.5
            if difference > 1.:
                incompatible[(i1, i2)] = 1

    if not incompatible:
        return mat

    # Greedy clustering: a row joins the first group whose representative
    # (the group's first member) is similar enough, else it starts a group.
    groups = []
    for j in xrange(nMat):
        for g in groups:
            if diff[g[0], j, 0] <= 0.6 * (
                    1.0 - params['clust_identity']) * diff[g[0], j, 1]:
                g.append(j)
                break
        else:
            groups.append([j])
    group_tag = {gg: g[0] for g in groups for gg in g}

    # One sequence per group representative. ``tobytes`` replaces the
    # deprecated ``tostring`` alias, which newer NumPy no longer provides.
    try:
        tags = {
            g[0]: mat[g[0]][4].tobytes().decode('ascii')
            for g in groups
        }
    except UnicodeDecodeError:
        tags = {g[0]: mat[g[0]][4].tobytes() for g in groups}
    tags['REF'] = ref

    # Re-express the incompatible row pairs as a symmetric mapping between
    # group representatives, keyed by str so they match the tree leaf names.
    ic2 = {}
    for i1, i2 in incompatible:
        t1, t2 = group_tag[i1], group_tag[i2]
        if t1 != t2:
            t1, t2 = str(t1), str(t2)
            ic2.setdefault(t1, {})[t2] = 1
            ic2.setdefault(t2, {})[t1] = 1
    incompatible = ic2

    # Build the gene tree with the external program, retrying up to three
    # times; each retry appends extra trailing newlines to the FASTA input.
    for ite in xrange(3):
        tmp_name = None
        try:
            # The context manager closes the handle even if a write fails.
            with tempfile.NamedTemporaryFile(dir='.',
                                             delete=False) as tmpFile:
                tmp_name = tmpFile.name
                for n, s in tags.items():
                    tmpFile.write('>X{0}\n{1}\n{2}'.format(
                        n, s, '\n' * ite).encode('utf-8'))
            template = (params[params['orthology']]
                        if len(tags) < 500 else params['nj'])
            cmd = template.format(tmp_name, **params)
            phy_run = subprocess.Popen(shlex.split(cmd),
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       universal_newlines=True)
            gene_phy = ete3.Tree(phy_run.communicate()[0].replace("'", ''))
            break
        except Exception:
            # Best effort: give up and keep every row after the last attempt.
            if ite == 2:
                return mat
        finally:
            # Only remove the file if it was actually created.
            if tmp_name is not None:
                os.unlink(tmp_name)

    # Strip the 'X' prefix that was added to every name in the FASTA input.
    for n in gene_phy.get_leaves():
        if len(n.name):
            n.name = n.name[1:]

    node = gene_phy.get_midpoint_outgroup()
    if node is not None:
        gene_phy.set_outgroup(node)

    for ite in xrange(3000):
        gene_phy.ic, gene_phy.dist = {}, 0.
        # Every child of the root is given the summed length of all root
        # branches before the branches are ranked.
        rdist = sum([c.dist for c in gene_phy.get_children()])
        for c in gene_phy.get_children():
            c.dist = rdist
        # node.ic collects the incompatible pairs that cross the branch
        # above the node: a pair seen in two children lies entirely inside
        # the subtree and is cancelled out.
        for node in gene_phy.iter_descendants('postorder'):
            if node.is_leaf():
                node.ic = {
                    tuple(sorted([node.name, n2])): 1
                    for n2 in incompatible.get(node.name, {})
                }
            else:
                node.ic = {}
                for c in node.get_children():
                    for x in c.ic:
                        if x in node.ic:
                            node.ic.pop(x)
                        else:
                            node.ic[x] = 1
        # Cut the branch separating the most incompatible pairs; ties are
        # broken by the longer branch.
        cut_node = max([[len(n.ic), n.dist, n]
                        for n in gene_phy.iter_descendants('postorder')],
                       key=lambda x: (x[0], x[1]))
        if cut_node[0] > 0:
            cut_node = cut_node[2]
            prev_node = cut_node.up
            cut_node.detach()
            if 'REF' in cut_node.get_leaf_names():
                # Always keep the side of the cut that holds the reference.
                gene_phy = cut_node
            elif prev_node.is_root():
                gene_phy = gene_phy.get_children()[0]
            else:
                prev_node.delete(preserve_branch_length=True)

            # Forget incompatibilities that involve removed leaves.
            tips = set(gene_phy.get_leaf_names())
            for r1 in list(incompatible.keys()):
                if r1 not in tips:
                    rr = incompatible.pop(r1, None)
                    for r2 in rr:
                        incompatible.get(r2, {}).pop(r1, None)
            for r1 in list(incompatible.keys()):
                if len(incompatible[r1]) == 0:
                    incompatible.pop(r1, None)
            if len(incompatible) == 0:
                break

            logger('     Iteration {0}. Remains {1} tips.'.format(
                ite + 1, len(gene_phy.get_leaf_names())))
        else:
            break

    # Keep only the rows whose group representative is still in the tree.
    if len(gene_phy.get_leaf_names()) < len(tags):
        groups = {str(g[0]): g for g in groups}
        tips = sorted([
            nn for n in gene_phy.get_leaf_names()
            for nn in groups.get(n, [])
        ])
        mat = mat[tips]
    return mat