Python logger Examples

Programming Language: Python

Namespace/Package Name: configure

Method/Function: logger

Examples at hotexamples.com: 23

Python logger - 23 examples found. These are the top-rated real-world Python examples of configure.logger, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: assemble.py Project: shulp2211/EToKi

def assemble(args):
    """Run the assembly workflow for the command-line arguments *args*.

    The arguments are parsed with ``add_args`` and merged with ``externals``;
    the resulting dict is published, together with ``prefix`` and ``reads``,
    as module-level globals.  Each entry of ``parameters['pe']`` must name
    exactly two comma-separated files and each entry of ``parameters['se']``
    exactly one (checked with ``assert``).  Unless ``parameters['onlyEval']``
    is truthy the reads are passed to ``mainprocess().launch``; otherwise
    ``parameters['reference']`` is taken as the assembly.  The report
    returned by ``postprocess().launch`` is printed as indented JSON.
    """
    global reads, prefix, parameters
    parameters = add_args(args).__dict__
    parameters.update(externals)
    prefix = parameters['prefix']

    # (entries, required number of files, complaint) per library layout.
    # Paired-end libraries are collected before single-end ones.
    layouts = (
        (parameters['pe'], 2,
         'Allows 2 reads per PE library. You specified {0}'),
        (parameters['se'], 1,
         'Allows one file per SE library. You specified {0}'),
    )
    reads = []
    for entries, n_expected, complaint in layouts:
        for entry in entries:
            file_names = entry.split(',')
            assert len(file_names) == n_expected, complaint.format(
                len(file_names))
            reads.append(file_names)

    n_files = sum(len(library) for library in reads)
    logger('Load in {0} read files from {1} libraries'.format(
        n_files, len(reads)))

    if parameters['onlyEval']:
        assembly = parameters['reference']
    else:
        assembly = mainprocess().launch(reads)

    report = postprocess().launch(assembly)
    import json
    print(json.dumps(report, sort_keys=True, indent=2))

Example #2

Show file

    def returnOverlap(self, blastab, param):
        """Collect overlapping hits from *blastab*.

        Each hit is reduced to four integers: the index of the last row that
        shares its column-1 value, its column-15 value, and its column-8/9
        coordinates in ascending order.  The reduced rows are sorted on
        (index, lower coordinate, upper coordinate) and fed repeatedly to
        ``tab2overlaps`` through a fixed-size buffer.  The loop continues
        while the first cell of the buffer's last row is non-negative; rows
        whose third column is positive are kept after every pass.

        Args:
            blastab: iterable of hit records indexable at 1, 8, 9 and 15.
            param: sequence whose elements after the first are the two
                overlap settings forwarded to ``tab2overlaps``.

        Returns:
            A 2-D integer array stacking the rows kept from every pass.
        """
        logger('Calculate overlaps.')

        ovl_l, ovl_p = param[1:]

        # Later rows overwrite earlier ones, so each key maps to its last row.
        last_row = {}
        for row_no, hit in enumerate(blastab):
            last_row[hit[1]] = row_no

        rows = []
        for hit in blastab:
            rows.append([last_row[hit[1]], hit[15]] + sorted([hit[8], hit[9]]))
        rows.sort(key=itemgetter(0, 2, 3))
        tabs = np.array(rows, dtype=int)
        n_tabs = len(tabs)

        # The final row of the buffer is the only one initialised here; it is
        # what the loop condition and the progress message read back.
        buffer = np.empty(shape=[1000001, 3], dtype=int)
        buffer[-1, :] = [0, 1, -1]
        batches = []
        while buffer[-1, 0] >= 0:
            logger('Searching {0} / {1} tabs'.format(buffer[-1, 0], n_tabs))
            buffer[:-1, :] = -1
            buffer = tab2overlaps(tabs, ovl_l, ovl_p, n_tabs, buffer)
            batches.append(buffer[buffer.T[2] > 0][:])
        res = np.vstack(batches)
        logger('Identified {0} overlaps.'.format(len(res)))
        return res

Example #3

Show file

File: evalHCC.py Project: shulp2211/EToKi

def evaluate(profile, cluster, stepwise, ave_gene_length=1000.) :
    """Score a clustering table against allelic profiles.

    Both inputs are tab-separated files opened through ``uopen`` whose first
    line is a header.  From the profile file the columns whose header
    contains '#ST' (or column 0 when there is none) plus every column whose
    header does not start with '#' are loaded.  From the cluster file column
    0 plus every non-'#' column is loaded, and then every *stepwise*-th
    cluster column is kept.  Rows are restricted to names present in both
    tables and the profile rows are reordered to follow the cluster rows.

    The Shannon index, the 'adjusted_rand_score' similarity and the
    silhouette (which also receives *ave_gene_length*) are computed by the
    corresponding helpers and saved together in 'evalHCC.npz'.
    """
    with uopen(profile) as handle :
        logger('Loading profiles ...')
        header = handle.readline().strip().split('\t')
        st_cols = np.where([field.find('#ST') >= 0 for field in header])[0].tolist()
        if not st_cols :
            st_cols = [0]
        allele_cols = np.where([not field.startswith('#') for field in header])[0].tolist()
        profile_table = pd.read_csv(handle, sep='\t', header=None, index_col=0,
                                    usecols=st_cols + allele_cols)
        profile_names = profile_table.index.values
        profile_mat = profile_table.values

    with uopen(cluster) as handle :
        logger('Loading hierCC ...')
        header = handle.readline().strip().split('\t')
        level_cols = np.where([not field.startswith('#') for field in header])[0].tolist()
        cluster_table = pd.read_csv(handle, sep='\t', header=None, index_col=0,
                                    usecols=[0] + level_cols)
        cluster_names = cluster_table.index.values
        cluster_mat = cluster_table.values
        kept_levels = np.arange(0, cluster_mat.shape[1], stepwise)
        cluster_mat = cluster_mat[:, kept_levels]

    # Keep only the cluster rows that also have a profile.
    shared = np.in1d(cluster_names, profile_names)
    cluster_mat = cluster_mat[shared]
    cluster_names = cluster_names[shared]

    # Reorder the profile rows so they line up with the cluster rows.
    rank_of = {name: rank for rank, name in enumerate(cluster_names)}
    pairing = np.array([[row, rank_of[name]]
                        for row, name in enumerate(profile_names)
                        if name in rank_of])
    row_order = pairing[np.argsort(pairing.T[1]), 0]
    profile_names = profile_names[row_order]
    profile_mat = profile_mat[row_order]

    shannon = shannon_index(cluster_mat)
    similarity = get_similarity('adjusted_rand_score', cluster_mat, stepwise)
    silhouette = get_silhouette(profile_mat, cluster_mat, stepwise, ave_gene_length)

    np.savez_compressed('evalHCC.npz', shannon=shannon, similarity=similarity, silhouette=silhouette)
    logger('Done. Results saved in evalHCC.npz')

Example #4

Show file

File: assemble.py Project: nickp60/EToKi

    def get_quality(self, reference, reads ) :
        """Map reads onto *reference*, polish with Pilon and write a FASTQ.

        Steps, in order:
          1. Align *reads* with the mapper named in ``parameters['mapper']``
             ('minimap2', 'bwa', or bowtie for any other value).
          2. Sum per-base depth from ``samtools depth`` over all BAM files.
             Per contig, keep its length and the larger of the median depth
             and a 0.5-shifted geometric mean.  The average depth is the
             length-weighted mean over the longest contigs that together
             cover at least half of the total length.
          3. Write contigs with non-zero depth to
             '<prefix>.mapping.reference.fasta' and run Pilon on them,
             retrying with a narrower ``--fix`` list if no VCF was produced.
          4. Walk the VCF.  Homozygous-variant indels are recorded for
             masking; confident reference calls get a Phred+33 quality
             character; all other records of contigs inside the depth window
             are copied to '<prefix>.mapping.difference'.
          5. Reset quality to '!' over recorded indels, raise quality to at
             least chr(73) around sites listed in ``self.snps``, and write
             '<prefix>.result.fastq'.  The intermediate VCF is deleted.

        Args:
            reference: path of the reference FASTA given to the mapper and
                to ``readFasta``.
            reads: read libraries, forwarded unchanged to the mapper helper.

        Returns:
            The string '<prefix>.result.fastq'.
        """
        if parameters['mapper'] == 'minimap2' :
            bams = self.__run_minimap(prefix, reference, reads, )
        elif parameters['mapper'] != 'bwa' :
            bams = self.__run_bowtie(prefix, reference, reads, )
        else :
            bams = self.__run_bwa(prefix, reference, reads, )

        # Each contig becomes [sequence, per-base quality list]; '!' is the
        # lowest Phred+33 quality.
        sequence = readFasta(reference)
        for n, s in sequence.items() :
            q = ['!'] * len(s)
            sequence[n] = [s, q]

        sites = { n:np.array([0 for ss in s[1] ]) for n, s in sequence.items() }
        for bam in bams :
            if bam is not None :
                depth_proc = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **parameters).split(), stdout=PIPE, universal_newlines=True)
                for line in depth_proc.stdout :
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0 :
                        sites[part[0]][int(part[1]) - 1] += float(part[2])
                # Reap the child and release its pipe instead of leaving a
                # zombie process behind for every BAM file.
                depth_proc.stdout.close()
                depth_proc.wait()
        # Per contig: [length, representative depth, depth relative to the
        # average (filled in below)].
        sites = {n:[s.size, np.max([np.median(s), np.exp(np.mean(np.log(s + 0.5)))-0.5]), 0.] for n, s in sites.items()}
        depth = np.array(list(sites.values()))
        depth = depth[np.argsort(-depth.T[0])]
        size = np.sum(depth.T[0])
        acc = [0, 0]
        for d in depth :
            acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0]*d[1]
            if acc[0] *2 >= size :
                break
        ave_depth = acc[1]/acc[0]
        exp_mut_depth = max(ave_depth * 0.2, 2.)
        for n, s in sites.items() :
            s[2] = s[1]/ave_depth
        logger('Average read depth: {0}'.format(ave_depth))
        sequence = {n:s for n, s in sequence.items() if sites[n][1]>0.}
        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout :
            for n, s in sorted(sequence.items()) :
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[0][site:(site+100)] for site in xrange(0, len(s[0]), 100)])))
        bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
        pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
        Popen( pilon_cmd.split(), stdout=PIPE, universal_newlines=True ).communicate()
        if not os.path.isfile('{0}.mapping.vcf'.format(prefix)) :
            pilon_cmd = '{pilon} --fix snps,indels,gaps,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
            Popen( pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

        cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
        logger('Contigs with less than {0} depth will be removed from the assembly'.format(cont_depth[0]*ave_depth))
        logger('Contigs with more than {0} depth will be treated as duplicates'.format(cont_depth[1]*ave_depth))
        indels = []
        with open('{0}.mapping.vcf'.format(prefix)) as fin, open('{0}.mapping.difference'.format(prefix), 'w') as fout :
            for line in fin :
                if line.startswith('#') : continue
                part = line.strip().split('\t')
                if sites[part[0]][2] < cont_depth[0] or sites[part[0]][2] >= cont_depth[1] :
                    continue
                if part[-1] == '1/1':
                    is_deletion = len(part[3]) > 1
                    is_insertion = len(part[4]) > 1 and part[4] != '<DUP>'
                    if is_deletion or is_insertion :
                        # Use this record's own 1-based POS.  The previous
                        # code read `site`, which is only assigned further
                        # down for a *different* record (or not at all,
                        # raising NameError on the first indel).
                        pos = int(part[1])
                        start = pos-1 if is_deletion else pos-2
                        indels.append([part[0], max(0, start), pos-1+len(part[3])+2])

                try:
                    if part[-1] == '0/0' and len(part[3]) == 1 and len(part[4]) == 1 :
                        pp = part[7].split(';')
                        dp = float(pp[0][3:])
                        af = 100 - sorted([float(af) for af in pp[6][3:].split(',')])[-1]
                        if af <= 20 and dp >= 2 and dp * af/100. <= exp_mut_depth and (part[6] == 'PASS' or (part[6] == 'LowCov' and parameters['metagenome'])) :
                            site = int(part[1])-1
                            qual = chr(int(pp[4][3:])+33)
                            sequence[part[0]][1][site] = qual
                        else :
                            fout.write(line)
                    else :
                        fout.write(line)
                except Exception :
                    # Best effort: a record that cannot be parsed is kept as
                    # a difference.  KeyboardInterrupt/SystemExit propagate.
                    fout.write(line)
        for n, s, e in indels :
            sequence[n][1][s:e] = ['!'] * len(sequence[n][1][s:e])

        if self.snps is not None :
            for n, snvs in self.snps.items() :
                for site, snv in snvs :
                    if snv.find('N') >= 0 : continue
                    if snv.startswith('+') :
                        s, e = site-4, site+3+len(snv)
                    else :
                        s, e = site-4, site+4
                    quals = sequence[n][1]
                    # Clamp the window to the contig so that a site near an
                    # end neither wraps around via negative indices nor
                    # raises IndexError.
                    for k in xrange(max(s, 0), min(e, len(quals))) :
                        quals[k] = max(chr(40+33), quals[k])

        with open('{0}.result.fastq'.format(prefix), 'w') as fout :
            p = prefix.rsplit('/', 1)[-1]
            for n, (s, q) in sequence.items() :
                if sites[n][2] >= cont_depth[0] :
                    fout.write( '@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format( p+'_'+n, s, ''.join(q), *sites[n] ) )
        os.unlink( '{0}.mapping.vcf'.format(prefix) )
        logger('Final result is written into {0}'.format('{0}.result.fastq'.format(prefix)))
        return '{0}.result.fastq'.format(prefix)

Example #5

Show file

File: prepare.py Project: zheminzhou/EToKi

    def reduce_depth(self, reads):
        """Trim barcode-like leading bases and subsample over-deep libraries.

        *reads* is a sequence of library sets, each a dict mapping a library
        type ('MP', 'PE', anything else is written as single-end) to a list
        of compressed FASTQ paths.  For every file the method:

          * counts bases and reads with ``pigz | awk | wc``;
          * samples every 5th read among the first 50,000 and, looking at
            the first 10 positions from the 10th backwards, takes the first
            position found whose composition is dominated (>= 80 %) by one
            symbol, or (for the first two positions) has more than 10 %
            non-ACGT symbols, as the last base to trim.

        If ``parameters['max_base']`` is positive and smaller than the total
        base count, the budget is spent on 'MP', then 'PE', then 'SE' files
        and each file receives a sampling fraction.  Files are rewritten
        through an awk pipeline to '<prefix>.2.<lib_id>.*.fastq.gz', with
        read names replaced unless ``parameters['noRename']`` is set, and
        the input files are then deleted.

        Returns:
            A list parallel to *reads*; each entry maps library type to the
            list of files written for it.
        """
        encode = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        read_stats = [{} for library in reads]
        new_reads = [{} for library in reads]

        # Shell pipelines keyed by (rename read headers?, trim leading bases?).
        # Placeholders: {0} input file, {1} output file, {2} sampling
        # fraction, {3} 1-based first base kept, {4} library index,
        # {5} library type (only used by the renaming variants).
        awk_pipelines = {
            (True, True):
                "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print \"@{4}_{5}_\"nr}} else {{print \"+\"}} }} }}'|{pigz} > {1}",
            (True, False):
                "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print \"@{4}_{5}_\"nr}} else {{print \"+\"}} }} }}'|{pigz} > {1}",
            (False, True):
                "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print $0}} else {{print \"+\"}} }} }}'|{pigz} > {1}",
            (False, False):
                "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print $0}} else {{print \"+\"}} }} }}'|{pigz} > {1}",
        }

        for lib_id, (libraries, stat,
                     new_libs) in enumerate(zip(reads, read_stats, new_reads)):
            read_information = [0, 0]  # [total bases, total reads]
            for lib_type, library in libraries.items():
                stat[lib_type] = []
                for fname in library:
                    # `wc` on the sequence lines: lines, words, characters.
                    p = Popen("{pigz} -cd {0}|awk 'NR%4==2'|wc".format(
                        fname, **externals),
                              shell=True,
                              stdout=PIPE,
                              universal_newlines=True).communicate()[0].strip(
                              ).split()
                    n_base, n_read = int(p[2]) - int(p[1]), int(p[0])
                    read_information[0] += n_base
                    read_information[1] += n_read

                    # Base composition of the first 10 positions; slot 4
                    # counts anything that is not A/C/G/T.
                    bcomp = [[0, 0, 0, 0, 0] for i in range(10)]
                    p = Popen(
                        "{pigz} -cd {0}|head -200000|awk 'NR%20==2'".format(
                            fname, **externals),
                        shell=True,
                        stdout=PIPE,
                        stderr=PIPE,
                        universal_newlines=True)
                    for line in p.stdout:
                        for b, bc in zip(line[:10], bcomp):
                            bc[encode.get(b, 4)] += 1
                    # Drain stderr and reap the shell so that neither the
                    # process nor its pipes are leaked.
                    p.communicate()

                    seq_start = 0
                    for c in range(9, -1, -1):
                        bc = bcomp[c]
                        if max(bc) / 0.8 >= sum(bc) or (c < 2 and
                                                        bc[4] > 0.1 * sum(bc)):
                            seq_start = c + 1
                            break
                    stat[lib_type].append([n_base, seq_start])
            logger('Obtained {1} bases in {2} reads after Trimming in Lib {0}'.
                   format(lib_id, *read_information))
            n_base = read_information[0]
            sample_freq2 = float(
                parameters['max_base']
            ) / n_base if parameters['max_base'] > 0 and n_base > 0 else 1.
            if sample_freq2 >= 1:
                for ss in stat.values():
                    for s in ss:
                        s.append(sample_freq2)
            else:
                # Spend the base budget on MP first, then PE, then SE.
                # NOTE(review): a listed library type with zero bases would
                # divide by zero here - confirm whether that can occur.
                max_base = float(parameters['max_base'])
                for lib_type in ('MP', 'PE', 'SE'):
                    if lib_type in stat:
                        ss = stat[lib_type]
                        n_base = sum([s[0] for s in ss])
                        sample_freq = float(max_base) / n_base
                        for s in ss:
                            s.append(sample_freq)
                        max_base = 0. if n_base >= max_base else max_base - n_base
            if 0 < sample_freq2 < 1:
                logger('Read depth too high. Subsampling.')

            for lib_type, library in libraries.items():
                if stat[lib_type][0][-1] > 0:
                    if lib_type == 'MP':
                        new_libs[lib_type] = [
                            '{0}.2.{1}.m.fastq.gz'.format(
                                parameters['prefix'], lib_id)
                        ]
                    elif lib_type == 'PE':
                        new_libs[lib_type] = [
                            '{0}.2.{1}.r1.fastq.gz'.format(
                                parameters['prefix'], lib_id),
                            '{0}.2.{1}.r2.fastq.gz'.format(
                                parameters['prefix'], lib_id)
                        ]
                    else:
                        new_libs[lib_type] = [
                            '{0}.2.{1}.s.fastq.gz'.format(
                                parameters['prefix'], lib_id)
                        ]
                    for f_id, (lib, s, nlib) in enumerate(
                            zip(library, stat[lib_type], new_libs[lib_type])):
                        sample_freq = s[-1]
                        rename = parameters['noRename'] == False
                        trim = s[1] > 0
                        if trim:
                            logger(
                                'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                                .format(s[1], lib))
                        # Surplus positional arguments are ignored by
                        # str.format, so one call serves all four pipelines.
                        Popen(
                            awk_pipelines[(rename, trim)].format(
                                lib, nlib, min(sample_freq, 1.), s[1] + 1,
                                lib_id, lib_type, **externals),
                            shell=True).wait()
                for lib in library:
                    try:
                        os.unlink(lib)
                    except OSError:
                        # Best effort: the input may already be gone.
                        pass
        return new_reads

Example #6

Show file

File: completeCC.py Project: zheminzhou/EToKi

def hierCC(args):
    """Assign complete-linkage cluster ids to allelic profiles.

    The profile table (``params.profile``, tab-separated, first row is the
    header) is reduced to its first column plus every column whose header
    does not start with '#', and converted to integers; the first column is
    taken as the type id.  Optional inputs:

      * ``params.incremental``: an existing '.npz' whose 'completeCC' array
        is loaded; profiles whose id already appears there are moved to the
        front of the matrix.
      * ``params.partition``: a two-column table (id, group); clustering is
        then run separately inside each group, and ids that appear in no
        group get all-zero rows that are dropped from the result.

    For each partition with more than one member, pairwise distances from
    ``get_distances`` are clustered with complete linkage, and every member
    of a merged cluster inherits, from the merge distance onwards, the
    assignment of the member that comes first.

    Outputs '<output>.completeCC.npz' (array 'completeCC') and a text table
    '<output>.completeCC.gz' holding either all levels or, when
    ``params.delta`` is given, only the comma-separated levels listed there.
    """
    params = get_args(args)
    ot = time.time()
    cluster_file = params.output + '.completeCC.npz'
    pool = Pool(10)

    global mat, n_loci
    mat = pd.read_csv(params.profile, sep='\t', header=None, dtype=str).values
    allele_columns = np.array(
        [i == 0 or (not h.startswith('#')) for i, h in enumerate(mat[0])])
    mat = mat[1:, allele_columns].astype(int)
    n_loci = mat.shape[1] - 1

    logger(
        '{0}: Loaded in allelic profiles with dimension: {1} and {2}. The first column is assumed to be type id.'
        .format(time.time() - ot, *mat.shape))

    if os.path.isfile(params.incremental):
        od = np.load(params.incremental, allow_pickle=True)
        cls = od['completeCC']

        typed = {c[0]: id for id, c in enumerate(cls) if c[0] > 0}
        if len(typed) > 0:
            logger('{0}: Loaded in {1} old completeCC assignments.'.format(
                time.time() - ot, len(typed)))
            # Already-typed profiles first, the rest afterwards.
            mat_idx = np.array([t in typed for t in mat.T[0]])
            mat[:] = np.vstack([mat[mat_idx], mat[(mat_idx) == False]])
    else:
        typed = {}

    if os.path.isfile(params.partition):
        st_idx = {str(st): id for id, st in enumerate(mat.T[0])}
        from collections import defaultdict
        partitions = defaultdict(list)
        for st, grp in pd.read_csv(params.partition, sep='\t',
                                   dtype=str).values:
            partitions[grp].append(st_idx[st])
            st_idx[st] = -1  # mark as belonging to a partition
        logger('{0}: Load in {1} partition(s)'.format(time.time() - ot,
                                                      len(partitions)))
        # What remains are the profiles that belong to no partition.
        st_idx = {k: v for k, v in st_idx.items() if v >= 0}
    else:
        partitions = {'all': np.arange(mat.shape[0])}
        st_idx = {}

    # Every profile starts as its own cluster at every level.
    res = np.repeat(mat.T[0], mat.shape[1]).reshape(mat.shape)
    res[list(st_idx.values()), :] = 0
    for key, indices in sorted(partitions.items()):
        if len(indices) <= 1:
            continue
        logger('{0}: Partition {1} contains {2} STs'.format(
            time.time() - ot, key, len(indices)))
        mat2 = mat[indices]
        logger(
            '{0}: Start to calculate pairwise distances'.format(time.time() -
                                                                ot))
        dist = get_distances(params.output, mat2, pool)
        logger('{0}: Start complete linkage clustering'.format(time.time() -
                                                               ot))
        cls = linkage(ssd.squareform(dist), method='complete')
        logger('{0}: Start completeCC assignments'.format(time.time() - ot))
        # Leaves hold themselves; internal nodes are filled in as the
        # linkage rows are replayed.
        descendents = [[i] for i in np.arange(dist.shape[0])
                       ] + [None for i in np.arange(dist.shape[0] - 1)]
        for idx, c in enumerate(cls.astype(int)):
            n_id = idx + dist.shape[0]
            d = sorted([int(c[0]), int(c[1])], key=lambda x: descendents[x][0])
            min_id = descendents[d[0]][0]
            descendents[n_id] = descendents[d[0]] + descendents[d[1]]
            for tgt in descendents[d[1]]:
                res[indices[tgt], c[2] + 1:] = res[indices[min_id], c[2] + 1:]

    # The workers are only used by get_distances; release them instead of
    # leaving ten idle processes alive until interpreter exit.
    pool.close()
    pool.join()

    res = res[res.T[0] > 0]
    np.savez_compressed(cluster_file, completeCC=res)

    if not params.delta:
        with uopen(params.output + '.completeCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(id) for id in np.arange(n_loci)])))
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(rr) for rr in r]) + '\n')
    else:
        # Must be a list: a bare map() is a one-shot iterator on Python 3,
        # so the header would exhaust it and every data row would be empty.
        deltas = [int(d) for d in params.delta.split(',')]
        with uopen(params.output + '.completeCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(id) for id in deltas])))
            # NOTE(review): the header announces an ST_id column but the rows
            # below contain only the selected levels - confirm whether r[0]
            # should be written as well.
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(r[id + 1]) for id in deltas]) + '\n')
    del res
    logger(
        'NUMPY clustering result (for incremental completeCC): {0}.completeCC.npz'
        .format(params.output))
    logger(
        'TEXT  clustering result (for visual inspection): {0}.completeCC.gz'.
        format(params.output))

Example #7

Show file

def nomenclature(query, reference, ref_aa='', **params):
    """Call alleles in *query* against the *reference* gene set.

    The query is written out as nucleotide and amino-acid files, compared
    with the references through ``dualBlast().run_ublast``, and the hits are
    merged, grouped into loci, de-overlapped and finally turned into alleles
    by the ``blastParser`` helpers.

    Args:
        query: query input handed to ``seqOperation().write_query``.
        reference: nucleotide reference set.
        ref_aa: path of the translated references; regenerated from
            *reference* when it is not an existing file.
        **params: accepted for the callers' convenience.
            NOTE(review): the body reads the module-level ``parameters``,
            not ``params`` - confirm that this is intended.

    Returns:
        The alleles produced by ``blastParser.form_alleles``.
    """
    # write query
    logger('EnSign starts')
    sequence, qry_fna, qry_faa = seqOperation().write_query(query)
    # .values() works on Python 2 and 3; .itervalues() exists only on 2.
    logger('Read in {0} bases as query'.format(
        sum(len(s[0]) for s in sequence.values())))
    # write refset
    if not os.path.isfile(str(ref_aa)):
        ref_aa = seqOperation().write_refsets(reference)
        logger('Prepare translated references')
    # do comparison
    blasttab = dualBlast().run_ublast(fna_target=qry_fna,
                                      faa_target=qry_faa,
                                      fna_query=reference,
                                      faa_query=ref_aa)
    # filter
    blasttab_parser = blastParser()
    blasttab = blasttab_parser.linear_merge(blasttab, **parameters)
    logger('Merge closely located hits. {0} hits'.format(len(blasttab)))
    loci = blasttab_parser.parse_ublast(blasttab, parameters)
    logger('Identify homologous groups. {0} groups'.format(
        len([1 for lc in loci if lc != '__non_specific__'])))
    regions = blasttab_parser.inter_loci_overlap(loci, parameters)
    logger('Resolve potential paralogs. {0} regions'.format(len(regions)))

    # submission
    alleles = blasttab_parser.form_alleles(regions, sequence,
                                           parameters['unique_key'],
                                           parameters['high_quality'],
                                           parameters)
    logger('Generate allelic sequences. {0} remains'.format(len(alleles)))
    #results = blasttab_parser.typing(alleles, parameters, dbname, scheme, submission=submission)
    return alleles

Example #8

Show file

def read_matrix(fname):
    """Load a tab-separated SNP matrix and collapse identical site patterns.

    Leading '##' lines are parsed for three kinds of metadata:
    '## Constant_bases' (counts for A, C, G, T, keyed by their ASCII codes),
    '## Sequence_length:' and '## Missing_region:'.  The first line that
    does not start with '##' is the column header: columns starting with
    '#!W' carry per-site weights, other columns starting with '#' are
    skipped, and the rest are samples.  If the header does not start with
    '#', the first two columns are taken as sequence name and site, and
    everything after them as samples.

    The body is read in chunks of 10,000 rows.  Sites with fewer than two
    distinct states besides the added '-' are dropped.  Sites containing
    '.' or multi-character states are stored as state indices with '-'
    mapped to 0 (category 2); all other sites are stored as ASCII codes
    (category 1).  Identical site patterns are merged with their weights
    summed, the constant-base counts are appended as extra patterns, and
    all weights are rounded up.

    Returns:
        (names, sites, snps, seqLens, missing): the sample names; one
        [sequence, position, pattern id, states] record per kept site; one
        [id, weight, pattern, category] record per distinct pattern; and
        the two metadata tables.  All but ``names`` are object arrays.
    """
    # A dict, so that files without a '## Constant_bases' line simply add no
    # invariant patterns instead of failing on `.items()` below.
    invariant = {}
    seqLens, missing = [], []

    with uopen(fname) as fin:
        for line_id, line in enumerate(fin):
            if line.startswith('##'):
                if line.startswith('## Constant_bases'):
                    part = line[2:].strip().split()
                    invariant = dict(
                        zip([65, 67, 71, 84], [float(v) for v in part[1:]]))
                elif line.startswith('## Sequence_length:'):
                    part = line[2:].strip().split()
                    seqLens.append([part[1], int(part[2])])
                elif line.startswith('## Missing_region:'):
                    part = line[2:].strip().split()
                    missing.append([part[1], int(part[2]), int(part[3])])
            elif line.startswith('#'):
                part = np.array(line.strip().split('\t'))
                cols = np.where(
                    (1 - np.char.startswith(part, '#')).astype(bool))[0]
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break
            else:
                part = np.array(line.strip().split('\t'))
                sample_mask = np.ones(part.shape, dtype=bool)
                sample_mask[:2] = False
                # Integer positions, as in the branch above: `usecols` needs
                # integers and `w_cols.size` must count weight columns, which
                # boolean masks got wrong on both counts.
                cols = np.where(sample_mask)[0]
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break

        bases, weights, sites = [], [], []
        for mat in pd.read_csv(fin,
                               header=None,
                               sep='\t',
                               usecols=cols.tolist() + w_cols.tolist() +
                               [0, 1],
                               chunksize=10000,
                               engine='c',
                               dtype=str,
                               low_memory=False,
                               na_filter=False):
            mat = mat.values
            # Progress line: first site of the chunk, peak RSS, sites so far.
            logger('{0}\t{1}\t{2}\t{3}'.format(\
                mat[0, 0], mat[0, 1], \
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, len(sites) ))

            for m in mat:
                # '-' is prepended so that it is always one of the states.
                btype, bidx = np.unique(['-'] + m[cols].tolist(),
                                        return_inverse=True)
                if btype.size <= 2:
                    continue
                sites.append([m[0], int(m[1]), 1, np.array([])])
                weights.append(
                    m[w_cols].astype(float).prod() if w_cols.size else 1.)
                if '.' in btype or max(map(len, btype)) > 1:
                    # Renumber the states so that '-' becomes 0, using 45 as
                    # a temporary placeholder while the others shift up.
                    missing_val = np.where(btype == '-')[0][0]
                    bidx[bidx == missing_val] = 45
                    bidx[bidx < missing_val] += 1
                    bidx[bidx == 45] = 0
                    sites[-1][3] = np.array(['-'] +
                                            btype[btype != '-'].tolist())
                    sites[-1][2] = 2
                    bases.append(bidx[1:])
                else:
                    bases.append(
                        np.array(list(map(ord, btype)),
                                 dtype=np.uint8)[bidx[1:]])

    bases, weights, sites = np.vstack(bases), np.array(weights), np.array(
        sites, dtype=object)
    # Sorting brings identical patterns next to each other.
    indices = np.lexsort(bases.T)
    snps = []

    for idx in indices:
        s, b, w = sites[idx], bases[idx], weights[idx]
        if not snps or np.any(b != snps[-1][2]):
            snps.append([len(snps), w, b, s[2]])
        else:
            snps[-1][1] += w
        s[2] = snps[-1][0]

    for inv in invariant.items():
        b_key = np.array([inv[0]] * len(names), dtype=np.uint8)
        snps.append([len(snps), float(inv[1]), b_key, 0])
    for snp in snps:
        snp[1] = np.ceil(snp[1])
    return names, sites, np.array(snps, dtype=object), np.array(
        seqLens, dtype=object), np.array(missing, dtype=object)

Example #9

Show file

def filt_genes(prefix, groups, global_file, conflicts, first_classes=None):
    """Greedily pick pan genes from `groups` and write them to '<prefix>.Prediction'.

    The function depends on module-level names that are not defined in this
    block: `params`, `pool`, `readFasta`, `get_gene`, `filt_per_group`,
    `logger` and `itemgetter`.

    Args:
        prefix: Output prefix; results are written to '<prefix>.Prediction'.
        groups: Dict mapping a gene to a 2-D array of candidate hits. It is
            mutated in place (rows re-scored and re-sorted) and emptied as
            genes are consumed.
        global_file: Passed through unchanged to `filt_per_group`.
        conflicts: Array-like of rows whose first two columns are ids and
            whose third column is a conflict code (compared against 1 and 2
            below).
        first_classes: Forwarded to `get_gene`.

    Returns:
        The path '<prefix>.Prediction'.
    """
    # Boolean mask applied to every entry of grp[6] when it is written out;
    # positions 3, 4, 5, 10 and 15 are dropped from the output.
    outPos = np.ones(16, dtype=bool)
    outPos[[3, 4, 5, 10, 15]] = False

    # Re-index the conflict rows as a symmetric dict of dicts:
    # conflicts[a][b] == conflicts[b][a] == third column of the row.
    c2 = {c: {} for c in np.unique(conflicts.T[:2])}
    for c in conflicts:
        c2[c[0]][c[1]] = c2[c[1]][c[0]] = c[2]
    conflicts = c2

    clust_ref = readFasta(params['clust'])
    # Scale column 2 by column 3 in place, then order each group's rows by
    # descending column 2 (stable sort).
    for gene, g in groups.items():
        g.T[2] *= g.T[3]
        g[:] = g[np.argsort(-g.T[2], kind='mergesort')]
    # used:    id -> marker written by earlier picks (see the loop below)
    # results: representative gene of a picked group -> assigned pan gene
    # run:     genes whose group has already been through filt_per_group
    used, results, run = {}, {}, {}
    group_id = 0
    with open('{0}.Prediction'.format(prefix), 'w') as fout:
        while len(groups) > 0:
            genes = get_gene(groups, first_classes, cnt=50)
            # NOTE(review): if get_gene returns nothing while `groups` is
            # still non-empty and unchanged, this `continue` repeats the same
            # call forever -- confirm get_gene guarantees progress.
            if len(genes) <= 0:
                continue
            to_run, to_run_id, min_score, min_rank = [], [], genes[-1][
                1], genes[0][2]
            genes = {gene: score for gene, score, min_rank in genes}
            if params['orthology'] in ('ml', 'nj'):
                for gene, score in genes.items():
                    if gene not in run:
                        mat = groups[gene]
                        # For every row, divide its column 2 by column 2 of
                        # the first row that shares the same column-1 value.
                        _, bestPerGenome, matInGenome = np.unique(
                            mat.T[1], return_index=True, return_inverse=True)
                        region_score = mat.T[2] / mat[
                            bestPerGenome[matInGenome], 2]
                        if region_score.size >= bestPerGenome.size * 2:
                            # At least two rows per distinct column-1 value on
                            # average: drop rows whose id (column 5) conflicts
                            # with a row kept earlier, then re-score.
                            used2, kept = set([]), np.ones(mat.shape[0],
                                                           dtype=bool)
                            for id, m in enumerate(mat):
                                if m[5] in used2:
                                    kept[id] = False
                                else:
                                    used2.update(conflicts.get(m[5], {}))
                            mat = mat[kept]
                            _, bestPerGenome, matInGenome = np.unique(
                                mat.T[1],
                                return_index=True,
                                return_inverse=True)
                            region_score = mat.T[2] / mat[
                                bestPerGenome[matInGenome], 2]
                        if region_score.size > bestPerGenome.size * 3 and len(
                                region_score) > 500:
                            # Large group: derive a relative-score cut-off
                            # from the sorted scores and keep only rows at or
                            # above it.
                            region_score2 = sorted(region_score, reverse=True)
                            cut = region_score2[bestPerGenome.size * 3 - 1]
                            if cut >= params['clust_identity']:
                                cut = min(
                                    region_score2[bestPerGenome.size * 5] if
                                    len(region_score) > bestPerGenome.size * 5
                                    else params['clust_identity'], 1.0 - 0.6 *
                                    (1.0 - params['clust_identity']))
                            mat = mat[region_score >= cut]

                        to_run.append([mat, clust_ref[mat[0][0]], global_file])
                        to_run_id.append(gene)
                # Filter the queued groups in parallel and store the results
                # back into `groups`.
                working_groups = pool.map(filt_per_group, to_run)
                #working_groups = [filt_per_group(d) for d in to_run]
                for gene, working_group in zip(to_run_id, working_groups):
                    groups[gene] = working_group
                    run[gene] = 1
            else:
                # NOTE(review): `mat` is a local that has not been assigned on
                # the first pass through this branch (it is only set in the
                # branch above and in the inner loop below), so this looks
                # like it raises UnboundLocalError or reuses a stale array.
                # The values computed here are also not written back to
                # `groups` -- confirm the intended behaviour.
                _, bestPerGenome, matInGenome = np.unique(mat.T[1],
                                                          return_index=True,
                                                          return_inverse=True)
                region_score = mat.T[2] / mat[bestPerGenome[matInGenome], 2]
                mat[:] = mat[region_score >= params['clust_identity']]
                used2, kept = set([]), np.ones(mat.shape[0], dtype=bool)
                for id, m in enumerate(mat):
                    for mmm in m[6]:
                        if mmm[15] in used2:
                            kept[id] = False
                            break
                    if kept[id]:
                        used2 |= {mmm[15] for mmm in m[6]}
                mat = mat[kept]
                _, bestPerGenome, matInGenome = np.unique(mat.T[1],
                                                          return_index=True,
                                                          return_inverse=True)

            while len(genes):
                # Pick the candidate with the highest score, where the score
                # sums column 2 over the first row of each distinct column-1
                # value.
                score, gene = max([[
                    np.sum(groups[gene][np.unique(groups[gene].T[1],
                                                  return_index=True)[1]].T[2]),
                    gene
                ] for gene in genes])
                if score < min_score:
                    break
                mat = groups.pop(gene, [])
                genes.pop(gene)

                paralog, paralog2 = 0, 0
                supergroup = {}
                used2 = {}
                for m in mat:
                    gid = m[5]
                    conflict = used.get(gid, None)
                    if conflict is not None:
                        if not isinstance(conflict, int):
                            # Non-int marker: treated as a key into `results`;
                            # count how many rows point at each pan gene.
                            superC = results[conflict]
                            supergroup[superC] = supergroup.get(superC, 0) + 1
                        elif conflict > 0:
                            if m[6].shape[0] <= 1 and m[3] >= params[
                                    'clust_identity']:
                                paralog = 1
                                break
                            else:
                                paralog2 += 1
                        # Exclude this row from the output below (only rows
                        # with column 3 > 0 are written).
                        m[3] = -1
                    else:
                        # Record markers for ids that conflict with this row;
                        # codes 1 and 2 are staged in used2, any other code is
                        # written to `used` immediately.
                        for g2, gs in conflicts.get(gid, {}).items():
                            if gs == 1:
                                if g2 not in used:
                                    used2[g2] = m[0]
                            elif gs == 2:
                                used2[g2] = 1
                            else:
                                used[g2] = 0
                # Reject the candidate if a paralog was flagged or at least a
                # third of its rows were counted in paralog2; otherwise commit
                # the staged markers.
                if paralog or paralog2 * 3 >= mat.shape[0]:
                    continue
                else:
                    used.update(used2)

                pangene = mat[0][0]
                if len(supergroup):
                    # Reuse an existing pan gene when enough rows (a third, or
                    # a fifth with more than one row) already point at it.
                    pg, pid = max(supergroup.items(), key=itemgetter(1))
                    if pid * 3 >= mat.shape[0] or (pid * 5 >= mat.shape[0]
                                                   and pid > 1):
                        pangene = pg

                results[mat[0][0]] = pangene
                logger(
                    '{4} / {5}: pan gene "{3}" : "{0}" picked from rank {1} and score {2}'
                    .format(mat[0][0], min_rank, score, pangene, len(results),
                            len(groups) + len(results)))

                # One output line per entry of grp[6], for every row that was
                # not excluded above; group_id increases once per row.
                for grp in mat[mat.T[3] > 0]:
                    group_id += 1
                    for g in grp[6]:
                        fout.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            pangene, min_rank, group_id, grp[1],
                            '\t'.join(g[outPos].astype(str).tolist())))
    return '{0}.Prediction'.format(prefix)

Example #10

Show file

File: uberBlast.py Project: tauqeer9/PEPPAN

    def runDiamond(self, ref, qry, nhits=10, frames='7'):
        """Align translated reference chunks against the translated query with DIAMOND.

        The query is translated with frame 'F' and, per sequence, the frame
        with the fewest 'X'-separated segments is written to the DIAMOND
        database. The reference is translated with `frames`, cut into pieces
        at 'X' after at least 1000 residues, and spread round-robin over five
        files that are searched one after another. Each output file is handed
        to `parseDiamond` through `self.pool`; the returned .npy files are
        loaded, deleted and stacked.

        Args:
            ref: Reference file, read with `readFastq` if `self.refSeq` is empty.
            qry: Query file, read with `readFastq` if `self.qrySeq` is empty.
            nhits: Value passed to DIAMOND's `-k` option.
            frames: Frame selection forwarded to `transeq` for the reference.

        Returns:
            The stacked array of alignments.
        """
        logger('Run diamond starts')

        ref_aa_path = os.path.join(self.dirPath, 'refAA')
        qry_aa_path = os.path.join(self.dirPath, 'qryAA')
        match_path = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        # Query side: keep one translation frame per sequence.
        qry_translated = transeq(self.qrySeq,
                                 frame='F',
                                 transl_table=self.table_id)
        with open(qry_aa_path, 'w') as qry_out:
            for name, frame_seqs in sorted(qry_translated.items()):
                candidates = [(len(aa[:-1].split('X')), frame_no, aa)
                              for frame_no, aa in enumerate(frame_seqs)]
                _, frame_no, aa = min(candidates)
                qry_out.write('>{0}:{1}\n{2}\n'.format(name, frame_no + 1, aa))

        makedb_cmd = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qry_aa_path)
        Popen(makedb_cmd.split(),
              stderr=PIPE,
              stdout=PIPE,
              universal_newlines=True).communicate()

        # Reference side: one FASTA record per non-empty piece, tagged with
        # its frame number and its offset within the translated frame.
        ref_translated = transeq(self.refSeq,
                                 frames,
                                 transl_table=self.table_id)
        records = []
        for name, frame_seqs in sorted(ref_translated.items()):
            for frame_no, aa in enumerate(frame_seqs):
                pieces = re.findall('.{1000,}?X|.{1,1000}$', aa + 'X')
                pieces[-1] = pieces[-1][:-1]
                offsets = np.cumsum([0] + list(map(len, pieces[:-1])))
                for offset, piece in zip(offsets, pieces):
                    if len(piece):
                        records.append('>{0}:{1}:{2}\n{3}\n'.format(
                            name, frame_no + 1, offset, piece))

        for chunk in xrange(5):
            chunk_ref = '{0}.{1}'.format(ref_aa_path, chunk)
            chunk_out = '{0}.{1}'.format(match_path, chunk)
            with open(chunk_ref, 'w') as ref_out:
                ref_out.writelines(records[chunk::5])
            search_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond,
                refAA=chunk_ref,
                qryAA=qry_aa_path,
                aaMatch=chunk_out,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio * 100.)
            Popen(search_cmd.split(),
                  stdout=PIPE,
                  stderr=PIPE,
                  universal_newlines=True).communicate()

        jobs = [[
            '{0}.{1}'.format(match_path, chunk), self.refSeq, self.qrySeq,
            self.min_id, self.min_cov, self.min_ratio
        ] for chunk in xrange(5)]
        tables = []
        for npy_file in self.pool.imap_unordered(parseDiamond, jobs):
            if npy_file is None:
                continue
            tables.append(np.load(npy_file, allow_pickle=True))
            os.unlink(npy_file)
        blastab = np.vstack(tables)
        logger('Run diamond finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab

Example #11

Show file

    def init_cleanup(self, reads):
        """Merge the runs of every library and optionally trim them with BBDuk.

        For each library the first (and, for paired data, second) read files
        of all runs are concatenated into '<prefix>.0.<lib>.<n>.fastq.gz'.
        Unless `parameters['noTrim']` is set, BBDuk is run on the merged files
        with a one-hour timeout; on success the trimmed files
        ('<prefix>.1.<lib>.<n>.fastq.gz') replace the merged ones, on failure
        the merged files are kept and any partial BBDuk output is removed.

        Args:
            reads: Iterable of libraries; each library is a sequence of runs
                and each run a sequence of one (single-end) or two
                (paired-end) file names.

        Returns:
            A list with one entry per library: the list of read files to use
            downstream.
        """
        prefix = parameters['prefix']
        new_reads = []
        for lib_id, library in enumerate(reads):
            library_file = ['{0}.0.{1}.1.fastq.gz'.format(prefix, lib_id)]
            # NOTE(review): file names are interpolated into a shell command;
            # names containing spaces or shell metacharacters will break it.
            Popen('cat {0} > {1}'.format(' '.join([run[0] for run in library]),
                                         library_file[0]),
                  shell=True).wait()
            if len(library[0]) > 1:
                library_file.append('{0}.0.{1}.2.fastq.gz'.format(
                    prefix, lib_id))
                Popen('cat {0} > {1}'.format(
                    ' '.join([run[1] for run in library]), library_file[1]),
                      shell=True).wait()

            # Build the BBDuk in=/out= arguments without rebinding `reads`.
            if len(library_file) == 1:
                bb_input = 'in=' + library_file[0]
                library_file2 = ['{0}.1.{1}.1.fastq.gz'.format(prefix, lib_id)]
                outputs = 'out=' + library_file2[0]
            else:
                bb_input = 'in=' + library_file[0] + ' in2=' + library_file[1]
                library_file2 = [
                    '{0}.1.{1}.1.fastq.gz'.format(prefix, lib_id),
                    '{0}.1.{1}.2.fastq.gz'.format(prefix, lib_id),
                    '{0}.1.{1}.3.fastq.gz'.format(prefix, lib_id)
                ]
                outputs = ('out=' + library_file2[0] + ' out2=' +
                           library_file2[1] + ' outs=' + library_file2[2])

            if parameters['noTrim'] == False:
                # Text mode so that stderr can be searched with a str pattern
                # on Python 3 as well as Python 2.
                bb_run = Popen('{bbduk} -Xmx{memory} threads=8 rref={adapters} overwrite=t qout=33 k=23 mink=13 minlength=23 tbo=t entropy=0.75 entropywindow=25 mininsert=23 maxns=2 ktrim=r trimq={read_qual} {read} {outputs}'.format(
                    read=bb_input, outputs=outputs, **parameters).split(),
                               stdout=PIPE,
                               stderr=PIPE,
                               universal_newlines=True)
                timer = Timer(3600, kill_child_proc, [bb_run])
                try:
                    timer.start()
                    bb_out = bb_run.communicate()
                finally:
                    timer.cancel()

                if bb_run.returncode == 0:
                    new_reads.append(library_file2)
                    stale_files = library_file
                    stat = re.findall(
                        r'Result:\s+(\d+) reads .+\s+(\d+) bases', bb_out[1])
                    if stat:
                        logger('Obtained {1} bases in {0} reads after BBDuk2'.
                               format(*stat[0]))
                else:
                    new_reads.append(library_file)
                    stale_files = library_file2
                    stat = re.findall(
                        r'Input:\s+(\d+) reads .+\s+(\d+) bases', bb_out[1])
                    if stat:
                        logger(
                            'BBDuk2 failed! Use original reads with {1} bases in {0} reads'
                            .format(*stat[0]))
                    else:
                        logger('BBDuk2 failed! Use original reads')

                # Best-effort cleanup: a file that is already missing is fine,
                # and one failure must not stop the remaining removals.
                for fname in stale_files:
                    try:
                        os.unlink(fname)
                    except OSError:
                        pass
            else:
                new_reads.append(library_file)
        return new_reads

Example #12

Show file

File: hierCC.py Project: shulp2211/EToKi

def hierCC(args):
    """Assign allelic profiles to hierarchical clusters and write the results.

    Reads a tab-separated profile table (first row is a header, first column
    the type id, columns whose header starts with '#' are ignored), optionally
    reuses assignments from a previous '.npz' result, computes distances in
    batches of 100 profiles with a worker pool, and writes:

    * '<output>.npz'       -- numpy archive with the full assignment matrix
    * '<output>.hierCC.gz' -- text table, either all levels or only the
      levels listed in `params.delta`

    The module-level globals `mat` and `n_loci` are (re)assigned here; they
    are set before the pool is created so that the worker functions can read
    them.

    Args:
        args: Command-line arguments, parsed by `get_args`.
    """
    global mat, n_loci

    params = get_args(args)
    ot = time.time()
    profile_file, cluster_file, old_cluster = params.profile, params.output + '.npz', params.incremental

    mat = pd.read_csv(profile_file, sep='\t', header=None, dtype=str).values
    allele_columns = np.array(
        [i == 0 or (not h.startswith('#')) for i, h in enumerate(mat[0])])
    mat = mat[1:, allele_columns].astype(int)
    n_loci = mat.shape[1] - 1

    logger(
        '{0}: Loaded in allelic profiles with dimension: {1} and {2}. The first column is assumed to be type id.'
        .format(time.time() - ot, *mat.shape))
    if not params.immutable:
        # Profiles with fewer missing alleles are processed first.
        absence = np.sum(mat <= 0, 1)
        mat = mat[np.argsort(absence, kind='mergesort')]

    typed = {}
    old_assignments = None
    # Guard against an unset path: os.path.isfile(None) raises on Python 3.
    if old_cluster and os.path.isfile(old_cluster):
        od = np.load(old_cluster, allow_pickle=True)
        old_assignments = od['hierCC']

        typed = {c[0]: row for row, c in enumerate(old_assignments) if c[0] > 0}
        if len(typed) > 0:
            logger('{0}: Loaded in {1} old hierCC assignments.'.format(
                time.time() - ot, len(typed)))
            # Move the already-typed profiles to the front.
            mat_idx = np.array([t in typed for t in mat.T[0]], dtype=bool)
            mat[:] = np.vstack([mat[mat_idx], mat[~mat_idx]])

    logger('{0}: Start hierCC assignments'.format(time.time() - ot))
    pool = Pool(10)
    try:
        res = np.repeat(mat.T[0], mat.shape[1]).reshape(mat.shape)
        res[1:, 0] = n_loci + 1
        for index in xrange(0, mat.shape[0], 100):
            to_run = []
            for idx in np.arange(index, min(index + 100, mat.shape[0])):
                if mat[idx, 0] in typed:
                    res[idx, :] = old_assignments[typed[mat[idx, 0]], :]
                else:
                    to_run.append(idx)
            if len(to_run) == 0:
                continue
            if not params.immutable:
                dists = np.vstack(pool.map(get_distance, to_run))
                assignment(dists, res)
            else:
                dists = np.vstack(pool.map(get_distance2, to_run))
                assignment2(dists, res)

            logger('{0}: Assigned {1} of {2} types into hierCC.'.format(
                time.time() - ot, index, mat.shape[0]))
    finally:
        # Release the worker processes even if an assignment step fails.
        pool.close()
        pool.join()

    res.T[0] = mat.T[0]
    np.savez_compressed(cluster_file, hierCC=res)

    if not params.delta:
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(level) for level in np.arange(n_loci)])))
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(rr) for rr in r]) + '\n')
    else:
        # Must be a list: it is iterated once for the header and again for
        # every row, and a Python 3 `map` object is exhausted after one pass.
        deltas = [int(d) for d in params.delta.split(',')]
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(level) for level in deltas])))
            # NOTE(review): the header starts with '#ST_id' but the rows below
            # contain only the selected levels, not r[0] -- confirm whether
            # the type id column should be written as well.
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(r[level + 1]) for level in deltas]) + '\n')
    del res
    logger('NUMPY clustering result (for incremental hierCC): {0}.npz'.format(
        params.output))
    logger('TEXT  clustering result (for visual inspection): {0}.hierCC.gz'.
           format(params.output))

Example #13

Show file

File: assemble.py Project: nickp60/EToKi

    def do_polish(self, reference, reads, reassemble=False, onlySNP=False):
        """Polish `reference` with Pilon using reads mapped onto it.

        If `parameters['SNP']` is set, the work is delegated to
        `do_polish_with_SNPs`. Otherwise the reads are mapped with the
        configured mapper, contigs without any covered position are dropped,
        Pilon is run on the remainder, and the accepted changes from its VCF
        are applied to the sequences.

        Side effects: writes '<prefix>.mapping.reference.fasta',
        '<prefix>.mapping.changes' and '<prefix>.fasta'; deletes
        '<prefix>.mapping.vcf' once it has been parsed.

        Args:
            reference: Path of the FASTA file to polish.
            reads: Read libraries, passed to the mapper.
            reassemble: If true, Pilon is first run with '--fix all,breaks'
                instead of '--fix all'.
            onlySNP: If true, only single-base substitutions are applied.

        Returns:
            The path '<prefix>.fasta' (or whatever `do_polish_with_SNPs`
            returns when `parameters['SNP']` is set).
        """
        if parameters.get('SNP', None) is not None:
            return self.do_polish_with_SNPs(reference, parameters['SNP'])

        if parameters['mapper'] == 'minimap2':
            bams = self.__run_minimap(prefix, reference, reads)
        elif parameters['mapper'] != 'bwa':
            bams = self.__run_bowtie(prefix, reference, reads)
        else:
            bams = self.__run_bwa(prefix, reference, reads)

        # Names of contigs that have at least one position with depth > 0.
        covered = set()
        for bam in bams:
            if bam is None:
                continue
            depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **parameters).split(), stdout=PIPE, universal_newlines=True)
            try:
                for line in depth.stdout:
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0:
                        covered.add(part[0])
            finally:
                # Close the pipe and reap the child so it does not linger.
                depth.stdout.close()
                depth.wait()

        sequence = readFasta(reference)
        sequence = {n: s for n, s in sequence.items() if n in covered}

        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
            for n, s in sorted(sequence.items()):
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([s[site:(site + 100)] for site in xrange(0, len(s), 100)])))

        vcf_file = '{0}.mapping.vcf'.format(prefix)
        bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])

        # Try the requested fix mode first; if Pilon leaves no VCF behind,
        # retry once with the more conservative mode.
        fix_modes = ['all,breaks' if reassemble else 'all', 'snps,indels,gaps']
        for attempt, fix_mode in enumerate(fix_modes):
            if attempt > 0 and os.path.isfile(vcf_file):
                break
            pilon_cmd = ('{pilon} --fix ' + fix_mode + ' --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}').format(bam_opt=bam_opt, **parameters)
            Popen(pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

        snps = []
        with open(vcf_file) as fin, open('{0}.mapping.changes'.format(prefix), 'w') as fout:
            for line in fin:
                if line.startswith('#'):
                    continue
                part = line.strip().split('\t')
                if part[-1] == '0/0':
                    continue
                try:
                    accepted = (part[6] == 'PASS' or float(part[7][-4:]) >= 0.75) and re.match(r'^[ACGTN]+$', part[4])
                    if accepted and ((not onlySNP) or (len(part[3]) == 1 and len(part[4]) == 1)):
                        snps.append([part[0], int(part[1]) - 1, part[3], part[4]])
                        fout.write(line)
                except (IndexError, ValueError):
                    # Malformed or unexpected record: skip it, as before.
                    pass

        os.unlink(vcf_file)
        for n in sequence.keys():
            sequence[n] = list(sequence[n])
        # Apply changes from the last record to the first so that earlier
        # coordinates stay valid when an edit changes the contig length.
        for n, site, ori, alt in reversed(snps):
            s = sequence[n]
            end = site + len(ori)
            s[site:end] = alt
        logger('Observed and corrected {0} changes using PILON'.format(len(snps)))
        with open('{0}.fasta'.format(prefix), 'w') as fout:
            for n, s in sorted(sequence.items()):
                s = ''.join(s)
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([s[site:(site + 100)] for site in xrange(0, len(s), 100)])))
        return '{0}.fasta'.format(prefix)

Example #14

Show file

    def runUBlast(self, ref, qry, nhits=6, frames='7'):
        """Align the translated reference against the translated query with uBLAST.

        The query is translated with frame 'F' and one frame per sequence is
        kept (the one with the fewest 'X'-separated segments). The reference
        is translated with `frames` and searched in five disjoint chunks; the
        hits of every chunk are converted by the nested `parseUBlast` and
        concatenated.

        Args:
            ref: Reference file, read with `readFastq` if `self.refSeq` is empty.
            qry: Query file, read with `readFastq` if `self.qrySeq` is empty.
            nhits: Value passed to usearch's `-maxhits`.
            frames: Frame selection forwarded to `transeq` for the reference.

        Returns:
            A pandas DataFrame with the hits of all chunks.
        """
        logger('Run uBLAST starts')

        def parseUBlast(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Convert one usearch '-userout' table into filtered hit rows.

            Identity is rescaled to a fraction and filtered by `min_id`;
            lengths and coordinates are multiplied by three and shifted by
            the frame number parsed from the ':<frame>' suffix of the query
            and target names; hits are finally filtered by `min_ratio` and
            `min_cov`.
            """
            blastab = pd.read_csv(fin, sep='\t', header=None)
            blastab[2] /= 100.
            blastab = blastab[blastab[2] >= min_id]
            blastab[3], blastab[4] = blastab[3] * 3, blastab[4] * 3

            # `n` must be passed by keyword: it is keyword-only in pandas 2.x.
            qf = blastab[0].str.rsplit(':', n=1, expand=True)
            rf = blastab[1].str.rsplit(':', n=1, expand=True)
            if np.all(qf[0].str.isdigit()):
                qf[0] = qf[0].astype(int)
            if np.all(rf[0].str.isdigit()):
                rf[0] = rf[0].astype(int)
            blastab[0], qf = qf[0], qf[1].astype(int)
            blastab[1], rf = rf[0], rf[1].astype(int)
            blastab[6], blastab[7] = (blastab[6] * 3 + qf - 3,
                                      blastab[7] * 3 + qf - 1)
            blastab[14] = [[
                [3 * vv[0], vv[1]] for vv in v
            ] for v in map(getCIGAR, zip(blastab[15], blastab[14]))]

            blastab[12] = blastab[0].apply(lambda x: len(qryseq[str(x)]))
            blastab[13] = blastab[1].apply(lambda x: len(refseq[str(x)]))

            # Frames 1-3 are handled directly; for the other frames the
            # target coordinates are mirrored against the target length.
            rf3 = (rf <= 3)
            blastab.loc[rf3, 8], blastab.loc[rf3, 9] = (
                blastab.loc[rf3, 8] * 3 + rf[rf3] - 3,
                blastab.loc[rf3, 9] * 3 + rf[rf3] - 1)
            blastab.loc[~rf3, 8], blastab.loc[~rf3, 9] = (
                blastab.loc[~rf3, 13] -
                (blastab.loc[~rf3, 8] * 3 + rf[~rf3] - 3 - 3) + 1,
                blastab.loc[~rf3, 13] -
                (blastab.loc[~rf3, 9] * 3 + rf[~rf3] - 3 - 1) + 1)
            # Overhang of the hit beyond the sequence ends (never negative);
            # trimmed from the end coordinates and the last CIGAR entry.
            d = np.max([
                blastab[7] - blastab[12], blastab[9] - blastab[13],
                1 - blastab[9],
                np.zeros(blastab.shape[0], dtype=int)
            ],
                       axis=0)
            blastab[7] -= d

            def ending(x, y):
                x[-1][0] -= y

            np.vectorize(ending)(blastab[14], d)
            d[~rf3] *= -1
            blastab[9] -= d
            blastab = blastab[
                (blastab[7] - blastab[6] + 1 >= min_ratio * blastab[12])
                & (blastab[7] - blastab[6] + 1 >= min_cov)]
            return blastab.drop(columns=[15, 16])

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        qryAASeq = transeq(self.qrySeq, frame='F')
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                _, frame_no, s = min([(len(aa[:-1].split('X')), idx, aa)
                                      for idx, aa in enumerate(ss)])
                fout.write('>{0}:{1}\n{2}\n'.format(n, frame_no + 1, s))

        refAASeq = transeq(self.refSeq, frames)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for frame_no, s in enumerate(ss):
                toWrite.append('>{0}:{1}\n{2}\n'.format(n, frame_no + 1, s))

        # The stride must equal the number of chunks; with a smaller stride
        # the last chunk repeats records of the first and yields duplicate
        # hits.
        n_chunks = 5
        blastab = []
        for chunk in xrange(n_chunks):
            with open(refAA, 'w') as fout:
                for line in toWrite[chunk::n_chunks]:
                    fout.write(line)

            ublast_cmd = '{usearch} -self -threads {n_thread} -db {refAA} -ublast {qryAA} -mid {min_id} -query_cov {min_ratio} -evalue 1 -accel 0.9 -maxhits {nhits} -userout {aaMatch} -ka_dbsize 5000000 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(
                usearch=usearch,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio)
            Popen(ublast_cmd.split(),
                  stderr=PIPE,
                  stdout=PIPE,
                  universal_newlines=True).communicate()
            if os.path.getsize(aaMatch) > 0:
                # Close the result file before the next chunk overwrites it.
                with open(aaMatch) as fin:
                    blastab.append(
                        parseUBlast(fin, self.refSeq, self.qrySeq,
                                    self.min_id, self.min_cov, self.min_ratio))
        blastab = pd.concat(blastab)
        logger('Run uBLAST finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab

Example #15

Show file

File: assemble.py Project: khaledk2/EToKi

    def get_quality(self, reference, reads):
        """Estimate per-base quality of an assembly and write it as FASTQ.

        Reads are mapped onto `reference`, the per-contig mean depth is
        computed from `samtools depth`, Pilon is run on the covered contigs,
        and reference-matching single-base VCF records that pass the depth
        and allele-fraction filters set the quality character of their site.
        Everything else from the VCF is copied to
        '<prefix>.mapping.difference'. Sites around the changes recorded in
        `self.snps` get a quality of at least chr(40 + 33).

        Side effects: writes '<prefix>.mapping.reference.fasta',
        '<prefix>.mapping.difference' and '<prefix>.result.fastq'; deletes
        '<prefix>.mapping.vcf'.

        Args:
            reference: Path of the assembly FASTA file.
            reads: Read libraries, passed to the mapper.

        Returns:
            The path '<prefix>.result.fastq'.
        """
        if parameters['mapper'] != 'bwa':
            bams = self.__run_bowtie(reference, reads)
        else:
            bams = self.__run_bwa(reference, reads)

        sequence = readFasta(filename=reference, qual=0)
        for n, s in sequence.items():
            s[1] = list(s[1])

        # Per-contig depth accumulator, one integer slot per base.
        sites = {
            n: np.zeros(len(s[1]), dtype=int)
            for n, s in sequence.items()
        }
        for bam in bams:
            if bam is not None:
                # Text mode so the output can be split on '\n' under Python 3.
                depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                    bam=bam, **parameters).split(),
                              stdout=PIPE,
                              universal_newlines=True).communicate()[0]
                for line in depth.split('\n'):
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0:
                        sites[part[0]][int(part[1]) - 1] += float(part[2])
        # contig -> [length, mean depth, depth relative to the average]
        sites = {n: [s.size, np.mean(s), 0.] for n, s in sites.items()}
        # list() is required: a dict view is not converted element-wise.
        depth = np.array(list(sites.values()))
        depth = depth[np.argsort(-depth.T[0])]
        size = np.sum(depth.T[0])
        # Average depth over the longest contigs that together cover at least
        # half of the assembly.
        acc = [0, 0]
        for d in depth:
            acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0] * d[1]
            if acc[0] * 2 >= size:
                break
        ave_depth = acc[1] / acc[0]
        exp_mut_depth = max(ave_depth * 0.2, 1.)
        for n, s in sites.items():
            s[2] = s[1] / ave_depth
        logger('Average read depth: {0}'.format(ave_depth))
        logger('Sites with over {0} or 15% unsupported reads is not called'.
               format(exp_mut_depth))
        sequence = {n: s for n, s in sequence.items() if sites[n][1] > 0.}
        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
            for n, s in sorted(sequence.items()):
                fout.write('>{0}\n{1}\n'.format(
                    n, '\n'.join([
                        s[0][site:(site + 100)]
                        for site in range(0, len(s[0]), 100)
                    ])))
        bam_opt = ' '.join(
            ['--bam {0}'.format(b) for b in bams if b is not None])
        pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
        Popen(pilon_cmd.split(), stdout=PIPE).communicate()

        cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
        logger(
            'Contigs with less than {0} depth will be removed from the assembly'
            .format(cont_depth[0] * ave_depth))
        logger(
            'Contigs with more than {0} depth will be treated as duplicates'.
            format(cont_depth[1] * ave_depth))

        with open('{0}.mapping.vcf'.format(prefix)) as fin, open(
                '{0}.mapping.difference'.format(prefix), 'w') as fout:
            for line in fin:
                if line.startswith('#'):
                    continue
                part = line.strip().split('\t')
                rel_depth = sites[part[0]][2]
                if rel_depth < cont_depth[0] or rel_depth >= cont_depth[1]:
                    continue
                try:
                    if part[-1] == '0/0' and len(part[3]) == 1 and len(
                            part[4]) == 1:
                        dp, af = float(part[7].split(';', 1)[0][3:]), float(
                            part[7][-4:])
                        if af < 0.15 and dp >= 3 and dp * af <= exp_mut_depth:
                            if part[6] == 'PASS' or (part[6] == 'LowCov' and
                                                     parameters['metagenome']):
                                site = int(part[1]) - 1
                                qual = chr(int(part[7].split(';')[4][3:]) + 33)
                                sequence[part[0]][1][site] = qual
                        else:
                            fout.write(line)
                    else:
                        fout.write(line)
                except (ValueError, IndexError, KeyError):
                    # Record could not be interpreted: report it as a
                    # difference rather than dropping it.
                    fout.write(line)
        if self.snps is not None:
            for n, snvs in self.snps.items():
                quals = sequence[n][1]
                for site, snv in snvs:
                    if snv.find('N') >= 0:
                        continue
                    if snv.startswith('+'):
                        s, e = site - 4, site + 3 + len(snv)
                    else:
                        s, e = site - 4, site + 4
                    # Clamp the window to the contig: a negative start would
                    # wrap to the other end and an end past the contig would
                    # raise IndexError.
                    for k in range(max(s, 0), min(e, len(quals))):
                        quals[k] = max(chr(40 + 33), quals[k])

        with open('{0}.result.fastq'.format(prefix), 'w') as fout:
            for n, (s, q) in sequence.items():
                if sites[n][2] >= cont_depth[0]:
                    fout.write('@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format(
                        n, s, ''.join(q), *sites[n]))
        os.unlink('{0}.mapping.vcf'.format(prefix))
        logger('Final result is written into {0}'.format(
            '{0}.result.fastq'.format(prefix)))
        return '{0}.result.fastq'.format(prefix)

Example #16

Show file

def read_matrix(fname) :
    """Parse a tab-delimited SNP matrix into site records and unique column patterns.

    The file is opened with ``uopen``. Lines starting with ``##`` carry
    metadata (constant base counts, sequence lengths, missing regions). The
    first line starting with a single ``#`` -- or, failing that, the first
    other line -- is treated as the column header. All following rows are
    streamed through pandas in chunks of 10000 rows.

    Returns a 5-tuple ``(names, sites, patterns, seqLens, missing)``:
      * names    -- header labels of the selected sample columns
      * sites    -- one ``[col0, col1, pattern_index, categories]`` per row
      * patterns -- list sorted by pattern index of
                    ``[index, ceil(weight) as int, uint8 key array, type]``
                    where type is 0 (only from Constant_bases), 1 (every code
                    in the key is >= 45) or 2 (any other key)
      * seqLens  -- ``[name, length]`` pairs from ``## Sequence_length:``
      * missing  -- ``[name, start, end]`` triples from ``## Missing_region:``
    """
    sites, snps = [], {}
    invariant = []
    seqLens, missing = [], []
    
    # Lookup table indexed by character code. Every entry defaults to 45
    # (ASCII '-'); A/C/G/T keep their own code and '.', '+', '*' and the empty
    # string are mapped to the code of the empty string.
    # presumably that code is 0 (the loop below tests for 0); it depends on the
    # externally defined asc2int dtype -- TODO confirm
    validate = np.repeat(45, 256).astype(np.uint8)
    validate[np.array(['A', 'C', 'G', 'T', '.', '+', '-', '', '*']).view(asc2int)] = np.array(['A', 'C', 'G', 'T', '', '', '-', '', '']).view(asc2int)
    
    with uopen(fname) as fin :
        # Consume the header; the loop breaks at the column-header line so that
        # pandas can continue reading the data rows from the same handle.
        for line_id, line in enumerate(fin) :
            if line.startswith('##'):
                if line.startswith('## Constant_bases') :
                    part = line[2:].strip().split()
                    # pair the ASCII codes of A, C, G, T with the listed counts
                    invariant = list(zip([65, 67, 71, 84], [float(v) for v in part[1:]]))
                elif line.startswith('## Sequence_length:') :
                    part = line[2:].strip().split()
                    seqLens.append([part[1], int(part[2])])
                elif line.startswith('## Missing_region:') :
                    part = line[2:].strip().split()
                    missing.append([part[1], int(part[2]), int(part[3])])
            elif line.startswith('#') :
                part = np.array(line.strip().split('\t'))
                # sample columns: every header field that does not start with '#'
                cols = np.where((1 - np.char.startswith(part, '#')).astype(bool))[0]
                # weight columns: header fields starting with '#!W'
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break
            else :
                # No '#'-prefixed header line: this line is used as the header
                # and the first two columns are excluded from the samples.
                # NOTE(review): cols and w_cols are boolean masks here, whereas
                # the branch above builds integer index arrays; both forms are
                # passed to usecols below -- confirm this branch still works.
                part = np.array(line.strip().split('\t'))
                cols = np.ones(part.shape, dtype=bool)
                cols[:2] = False
                w_cols = np.char.startswith(part, '#!W')
                names = part[cols]
                break
        # NOTE(review): if the file holds only '##' lines, cols/w_cols/names are
        # never bound and the next statement raises NameError.
        for mat in pd.read_csv(fin, header=None, sep='\t', usecols=cols.tolist() + w_cols.tolist() + [0,1], chunksize=10000) :
            mat = mat.values
            # progress line: first row's two leading columns, peak RSS, counts so far
            logger('{0}\t{1}\t{2}\t{3}\t{4}'.format(\
                mat[0, 0], mat[0, 1], \
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, len(sites), len(snps)))
            # Translate every cell through the lookup table, then arrange the
            # result as (row, sample column, character position).
            bk = validate[mat[:, cols].astype('str').view(asc2int)].reshape(mat.shape[0], -1, cols.shape[0])
            bk = np.moveaxis(bk, 1, 2)
            if bk.shape[2] > 1 :
                # a cell whose second character is non-zero and whose first
                # character translated to 45 is flagged with 0
                bk[(bk[:, :, 1] != 0) & (bk[:, :, 0] == 45), 0] = 0
            # the key of a row is the translated first character of each cell
            b_keys = bk[:, :, 0]
            # row weight: product of the weight columns, or 1 when there are none
            weights = mat[:, w_cols].astype(float).prod(1) if w_cols.size else np.ones(mat.shape[0], dtype=float)
            for (b_key, site, w) in zip(b_keys, mat, weights) :
                b_key = tuple(b_key)
                if min(b_key) == 0 :
                    # At least one cell was flagged with 0: re-encode the row as
                    # indices into its sorted unique raw values. The appended ''
                    # (also standing in for '-') guarantees that category 0 is
                    # the empty/gap state, which is then relabelled '-'.
                    bk2 = np.concatenate([site[cols], ['']])
                    bk2[bk2 == '-'] = ''
                    category, b_key = np.unique(bk2, return_inverse=True)
                    if category[0] == '' :
                        category[0] = '-'
                    b_key = tuple(b_key[:-1].tolist())
                else :
                    category = []
                # snps maps key -> [pattern index, type, accumulated weight]
                if b_key in snps :
                    snps[b_key][2] += w
                elif min(b_key) >= 45 : 
                    snps[b_key] = [len(snps), 1, w]
                else :
                    snps[b_key] = [len(snps), 2, w]
                    
                if snps[b_key][1] > 0 :
                    sites.append([ site[0], site[1], snps[b_key][0], np.array(category) ])

    # Add the constant-base counts as patterns in which every sample carries the
    # same base; a pattern created here gets type 0.
    for inv in invariant :
        b_key = tuple([inv[0]] * len(names))
        if b_key not in snps :
            snps[b_key] = [len(snps), 0, float(inv[1])]
        else :
            snps[b_key][2] += float(inv[1])
    return names, sites, sorted([[info[0], int(math.ceil(info[2])), np.array(line, dtype=np.uint8), info[1]] for line, info in snps.items() ]), seqLens, missing

Example #17

Show file

def buildReference(targets, sources, max_iden=0.9,  min_iden=0.6, coverage=0.7, paralog=0.1, relaxEnd=False) :
    """Select exemplar alleles from `targets` using minimap and MMseqs2 searches.

    Parameters
    ----------
    targets, sources : iterables of dicts with the keys 'fieldname',
        'value_id' and 'value'; each is written as a FASTA record named
        '<fieldname>_<value_id>'.
    max_iden : identity (and coverage) threshold of the final `mmseqs cluster`.
    min_iden : minimum identity for the nucleotide and amino-acid searches.
    coverage : coverage threshold passed to the searches.
    paralog  : a locus is dropped when one of its source alleles has a
        cross-locus hit with identity above ``1 - paralog``; an allele is kept
        only when its same-locus identity exceeds its cross-locus identity by
        more than `paralog`.
    relaxEnd : when true, same-locus amino-acid hits are accepted without the
        end-to-end check.

    Returns
    -------
    str : FASTA text of the retained exemplar alleles joined by newlines.
        The function is best-effort: when a step fails the error is logged
        and whatever was collected so far (normally nothing) is returned.
    """
    def run(cmd) :
        # Run a whitespace-split external command, discard its stdout and
        # return its exit status.
        proc = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
        proc.communicate()
        return proc.returncode

    # Iterating in reverse makes the FIRST occurrence of a locus win.
    orderedLoci = { t['fieldname']:i for i, t in reversed(list(enumerate(sources))) }
    refsets = []
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    try:
        tmpDir = os.path.join(dirPath, 'tmp')
        sourceFna = os.path.join(dirPath, 'sourceFna')
        sourceFaa = os.path.join(dirPath, 'sourceFaa')
        targetFna = os.path.join(dirPath, 'targetFna')
        targetFiltFna = os.path.join(dirPath, 'targetFiltFna')
        targetFiltFaa = os.path.join(dirPath, 'targetFiltFaa')
        targetClsFna = os.path.join(dirPath, 'targetClsFna')
        targetResFna = os.path.join(dirPath, 'targetResFna')
        alnFaa = os.path.join(dirPath, 'alnFaa')

        with open(sourceFna+'.fas', 'w') as fout :
            fout.write('\n'.join(['>{fieldname}_{value_id}\n{value}'.format(**s) for s in sources]))
        with open(targetFna+'.fas', 'w') as fout :
            fout.write('\n'.join(['>{fieldname}_{value_id}\n{value}'.format(**t) for t in targets]))

        # Stage 1: nucleotide-level filter.
        targetFiltFna, goodCandidates, crossSites = minimapFilter(sourceFna+'.fas', targetFna+'.fas', targetFiltFna, max_iden, min_iden, coverage, paralog, relaxEnd, orderedLoci)
        logger('identifed {0} good exemplar alleles after nucleic search'.format(len(goodCandidates)))

        # Stage 2: amino-acid search of the filtered targets against the sources.
        run('{0} createdb {1}.fas {1} --dont-split-seq-by-len'.format(mmseqs, sourceFna))
        run('{0} createdb {1}.fas {1} --dont-split-seq-by-len'.format(mmseqs, targetFiltFna))
        run('{0} translatenucs {1} {2}'.format(mmseqs, sourceFna, sourceFaa))
        run('{0} translatenucs {1} {2}'.format(mmseqs, targetFiltFna, targetFiltFaa))
        # Up to nine attempts; the thread count drops 9 -> 5 -> 1 every three tries.
        for ite in range(9) :
            if os.path.isdir(tmpDir) :
                shutil.rmtree(tmpDir)
            status = run('{0} search {2} {1} {3} {4} -c {6} --min-seq-id {5} --threads {t}'.format(mmseqs, sourceFaa, targetFiltFaa, alnFaa, tmpDir, min_iden, coverage, t=9-4*(ite//3)))
            if status == 0 :
                break
            time.sleep(1)
        run('{0} convertalis {2} {1} {3} {3}.tab --format-mode 2 --threads 8'.format(mmseqs, sourceFaa, targetFiltFaa, alnFaa))

        with open(alnFaa + '.tab') as fin :
            for line in fin :
                part = line.strip().split('\t')
                qLoc, rLoc = part[0].rsplit('_', 1)[0], part[1].rsplit('_', 1)[0]
                if qLoc == rLoc:
                    # presumably columns 6-9 are query/target start and end and
                    # 12/13 the two lengths (convertalis --format-mode 2), so
                    # this requires equal starts and equal unaligned tails --
                    # TODO confirm against the mmseqs documentation
                    if relaxEnd or (int(part[8]) == int(part[6]) and int(part[12]) - int(part[7]) == int(part[13]) - int(part[9])) :
                        goodCandidates[part[0]] = max(goodCandidates.get(part[0], 0), float(part[2]))
                elif orderedLoci[qLoc] > orderedLoci[rLoc] and crossSites.get(part[0], 0) < float(part[2]) :
                    # best identity of a hit against an earlier-ordered locus
                    crossSites[part[0]] = float(part[2])
        logger('identifed a total of {0} good exemplar alleles after amino search'.format(len(goodCandidates)))

        # Stage 3: drop loci whose source allele looks paralogous.
        nLoci = len(orderedLoci)
        for s in sources :
            key = '{0}_{1}'.format(s['fieldname'], s['value_id'])
            if crossSites.get(key, 0) > 1-paralog :
                orderedLoci.pop(s['fieldname'], None)
        if nLoci > len(orderedLoci) :
            logger('Total of {0} loci are not suitable for MLST scheme [due to paralog setting]. There are {1} left'.format(nLoci - len(orderedLoci), len(orderedLoci)))

        # Stage 4: keep only targets of surviving loci that are clearly closer
        # to their own locus than to any other.
        with open(targetFna+'.fas') as fin, open(targetClsFna+'.fas', 'w') as fout :
            writable = False
            for line in fin :
                if line.startswith('>') :
                    name = line[1:].strip().split()[0]
                    locus = name.rsplit('_', 1)[0]
                    writable = locus in orderedLoci and goodCandidates.get(name, 0) - crossSites.get(name, 0) > paralog
                if writable :
                    fout.write(line)

        # Stage 5: cluster the survivors and keep the cluster representatives.
        run('{0} createdb {1}.fas {1} --dont-split-seq-by-len'.format(mmseqs, targetClsFna))
        for ite in range(9) :
            if os.path.isdir(tmpDir) :
                shutil.rmtree(tmpDir)
            status = run('{0} cluster {1} {2} {3} -c {5} --min-seq-id {4} --threads {t}'.format(mmseqs, targetClsFna, targetResFna, tmpDir, max_iden, max_iden, t=9-4*(ite//3)))
            if status == 0 :
                break
            time.sleep(1)
        run('{0} createtsv {1} {2} {3} {3}.tab'.format(mmseqs, targetClsFna, targetClsFna, targetResFna))
        goodCandidates = {}
        with open(targetResFna + '.tab') as fin :
            for line in fin :
                # the first column of each row names the retained allele
                goodCandidates[line.split('\t', 1)[0]] = 1
        logger('There are {0} good exemplar alleles left after final clustering'.format(len(goodCandidates)))

        with open(targetClsFna+'.fas') as fin:
            writable = False
            for line in fin :
                if line.startswith('>') :
                    name = line[1:].strip().split()[0]
                    writable = name in goodCandidates
                if writable :
                    refsets.append(line.strip())
    except Exception as err :
        # Keep the best-effort contract, but report the failure instead of
        # discarding it silently.
        logger('buildReference failed; returning the exemplars collected so far: {0!r}'.format(err))
    finally:
        shutil.rmtree(dirPath)
    # The return sits outside `finally` so that KeyboardInterrupt/SystemExit
    # are no longer swallowed.
    return '\n'.join(refsets)

Example #18

Show file

def loadBam(prefix, reference, bams, sequences, snps):
    """Call a consensus with pilon from BAM files and write it as FASTQ.

    Parameters:
        prefix:    path prefix for all intermediate and output files.
        reference: passed to ``readFasta`` to load the reference contigs.
        bams:      iterable of BAM paths; ``None`` entries are skipped.
        sequences: indexed as ``sequences[contig][position]``; only positions
                   with a value >= 0 contribute to the error-rate estimate.
                   (meaning of the values is not visible here -- TODO confirm)
        snps:      not used by this function.

    Returns:
        The path ``'<prefix>.metaCaller.fastq'``.

    Side effects: runs samtools and pilon, writes and later deletes
    ``<prefix>.mapping.*`` files, and logs a summary line.
    """
    sequence = readFasta(reference)
    # per contig: [sequence, per-base quality list initialised to 0]
    sequence = {n: [s, [0] * len(s)] for n, s in sequence.items()}

    # Collect the names of the contigs that appear in `samtools depth` output.
    sites = {}
    for bam in bams:
        if bam is not None:
            depth = subprocess.Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **externals).split(),
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True)
            try:
                # NOTE(review): read_csv is called without header=None, so the
                # first output line is consumed as a header -- confirm intended.
                d = pd.read_csv(depth.stdout, sep='\t').values
                sites.update({cName: 1 for cName in np.unique(d.T[0])})
            except:
                # any failure (e.g. empty output) leaves `sites` unchanged
                pass

    # Keep only covered contigs and write them as FASTA wrapped at 100 columns.
    sequence = {n: s for n, s in sequence.items() if n in sites}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([
                    s[0][site:(site + 100)]
                    for site in xrange(0, len(s[0]), 100)
                ])))

    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        prefix=prefix, bam_opt=bam_opt, **externals)
    subprocess.Popen(pilon_cmd.split(),
                     stdout=subprocess.PIPE,
                     universal_newlines=True).communicate()

    # Pass 1 over the VCF: gather [major count, sum of the other counts] for
    # single-base records that are heterozygous ('0/1') or have a low value in
    # the fifth ';'-separated field of column 8.
    # presumably the fields are DP=, ..., QD= and BC= of pilon's INFO column
    # (the code strips a 3-character prefix from each) -- TODO confirm
    uncertains = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if sequences[part[0]][int(part[1]) - 1] >= 0:
                if len(part[3]) == 1 and len(part[4]) == 1:
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])
                    if dp >= 3:
                        qd = int(pp[4][3:])
                        if part[-1] == '0/1' or qd < 10:
                            bcs = sorted(
                                [float(bc) for bc in pp[5][3:].split(',')])
                            uncertains.append([bcs[-1], np.sum(bcs[:-1])])
    # NOTE(review): if no record qualified, `uncertains` is an empty 1-D array
    # and `uncertains.T[0]` fails -- confirm callers guarantee at least one.
    uncertains = np.array(uncertains)
    # p: share of the major-base counts among all collected counts
    p = np.sum(uncertains.T[0]) / np.sum(uncertains)
    # log-odds of p scaled by 10, used below as a per-read quality increment
    qPerRead = 10 * (np.log10(p) - np.log10(1 - p))
    for n in sequence:
        # strings become lists so individual bases can be replaced
        sequence[n][0] = list(sequence[n][0])

    # Pass 2 over the VCF: set the consensus base and quality for each
    # single-base record and count high-quality, low-quality and low-coverage sites.
    highQ, lowQ, lowC = 0, 0, 0
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if len(part[3]) == 1 and len(part[4]) == 1:
                s = int(part[1]) - 1
                pp = part[7].split(';')
                dp = float(pp[0][3:])
                qd = int(pp[4][3:])
                if part[-1] == '0/1' or qd < 10:
                    # uncertain call: take the base with the highest count, or
                    # the reference base when all counts are zero
                    bcs = np.array([int(bc) for bc in pp[5][3:].split(',')])
                    if np.sum(bcs) > 0:
                        sequence[part[0]][0][s] = ['A', 'C', 'G',
                                                   'T'][np.argmax(bcs)]
                    else:
                        sequence[part[0]][0][s] = part[3]
                    if dp < 3:
                        lowC += 1
                    else:
                        bcs.sort()
                        bcs = [bcs[-1], np.sum(bcs[:-1])]
                        # q1: binom.cdf of the major count given p; when it is
                        # below 0.05 the quality is forced to 1
                        q1 = binom.cdf(bcs[0], bcs[0] + bcs[1], p)
                        q2 = qPerRead * (bcs[0] - bcs[1]) if q1 >= 0.05 else 1
                        if q2 >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        # stored quality is clamped to the range 1..40
                        sequence[part[0]][1][s] = min(40, max(1, int(q2)))
                else:
                    if dp < 3:
                        lowC += 1
                    else:
                        if qd >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = qd
                    if part[-1] == '1/1':
                        # homozygous alternative call replaces the reference base
                        sequence[part[0]][0][s] = part[4]

    logger(
        '{0}: Expected mix-up: {1} {2} ; Got highQ {3} ; lowQ {4} ; lowC {5}'.
        format(prefix, uncertains.shape[0], p, highQ, lowQ, lowC))
    with open('{0}.metaCaller.fastq'.format(prefix), 'w') as fout:
        # `p` is reused here for the basename of the prefix
        p = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items():
            # qualities are written as characters with an offset of 33
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(
                p + '_' + n, ''.join(s), ''.join([chr(qq + 33) for qq in q])))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    os.unlink('{0}.mapping.fasta'.format(prefix))
    os.unlink('{0}.mapping.reference.fasta'.format(prefix))
    return '{0}.metaCaller.fastq'.format(prefix)

Example #19

Show file

File: uberBlast.py Project: shulp2211/EToKi

    def runDiamond(self, ref, qry, nhits=10, frames='7'):
        """Align translated query against translated reference with diamond.

        The query is translated in forward frames and, per sequence, the
        frame with the fewest 'X'-separated segments is written to a protein
        database. The reference is translated with `frames`, cut into chunks
        at 'X' characters, and searched in five interleaved batches.

        Parameters:
            ref, qry: sequence files; loaded with ``readFastq`` only when
                ``self.refSeq`` / ``self.qrySeq`` are still empty.
            nhits:    value of diamond's ``-k`` option.
            frames:   frame selector passed to ``transeq`` for the reference.

        Returns:
            pandas.DataFrame with one row per alignment (15 columns, see
            ``parseDiamond``); an empty DataFrame when nothing was aligned.
        """
        logger('Run diamond starts')

        def parseDiamond(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Convert diamond SAM output (--outfmt 101) into table rows.

            Returns a DataFrame, or None when no record passes the filters.
            """
            blastab = []
            for line in fin:
                if line.startswith('@'):
                    continue
                part = line.strip().split('\t')
                if part[2] == '*': continue
                # names carry the frame (and, for the db side, the chunk offset)
                qn, qf = part[0].rsplit(':', 1)
                rn, rf, rx = part[2].rsplit(':', 2)
                rs = int(part[3]) + int(rx)
                ql, rl = len(qryseq[str(qn)]), len(refseq[str(rn)])
                qm = len(part[9])
                # amino-acid lengths are converted to nucleotides (x3)
                if qm * 3 < min_cov: continue
                cov_ratio = qm * 3. / ql
                if cov_ratio < min_ratio: continue
                cigar = [[int(n) * 3, t]
                         for n, t in re.findall(r'(\d+)([A-Z])', part[5])]
                cl = np.sum([c[0] for c in cigar])
                # optional tags are read from their usual column, with a
                # regex search over the whole line as fallback
                variation = float(part[12][5:]) * 3 if part[12].startswith(
                    'NM:') else float(re.findall(r'NM:i:(\d+)', line)[0]) * 3

                iden = 1 - round(variation / cl, 3)
                if iden < min_id: continue
                qf, rf = int(qf), int(rf)
                qs = int(part[18][5:]) if part[18].startswith('ZS:') else int(
                    re.findall(r'ZS:i:(\d+)', line)[0])

                rm = int(
                    np.sum([c[0] for c in cigar if c[1] in {'M', 'D'}]) / 3)
                # frames 1-3 are forward, 4-6 reverse: map amino-acid
                # coordinates back to nucleotide coordinates
                if rf <= 3:
                    rs, r_e = rs * 3 + rf - 3, (rs + rm - 1) * 3 + rf - 1
                else:
                    rs, r_e = rl - (rs * 3 + rf - 6) + 1, rl - (
                        (rs + rm - 1) * 3 + rf - 4) + 1
                if qf <= 3:
                    qs, qe = qs * 3 + qf - 3, (qs + qm - 1) * 3 + qf - 1
                else:
                    qs, qe = ql - (qs * 3 + qf - 6) + 1, ql - (
                        (qs + qm - 1) * 3 + qf - 4) + 1
                    # report the query on its forward strand
                    qs, qe, rs, r_e = qe, qs, r_e, rs
                    cigar = list(reversed(cigar))

                cd = [c[0] for c in cigar if c[1] != 'M']
                score = int(
                    part[14][5:]) if part[14].startswith('ZR:') else int(
                        re.findall(r'ZR:i:(\d+)', line)[0])
                blastab.append([
                    qn, rn, iden, cl,
                    int(variation - sum(cd)),
                    len(cd), qs, qe, rs, r_e, 0.0, score, ql, rl, cigar
                ])
            if not blastab:
                # An empty DataFrame has no columns 0/1, so the cast below
                # would raise KeyError; the caller already skips None.
                return None
            blastab = pd.DataFrame(blastab)
            blastab[[0, 1]] = blastab[[0, 1]].astype(str)
            return blastab

        refAA = os.path.join(self.dirPath, 'refAA')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch')

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)

        qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
        with open(qryAA, 'w') as fout:
            for n, ss in sorted(qryAASeq.items()):
                # pick the frame with the fewest 'X'-separated segments
                _, frame_id, s = min([(len(s[:-1].split('X')), frame_id, s)
                                      for frame_id, s in enumerate(ss)])
                fout.write('>{0}:{1}\n{2}\n'.format(n, frame_id + 1, s))

        diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
            diamond=diamond, qryAA=qryAA)
        Popen(diamond_fmt.split(),
              stderr=PIPE,
              stdout=PIPE,
              universal_newlines=True).communicate()

        refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
        toWrite = []
        for n, ss in sorted(refAASeq.items()):
            for frame_id, s in enumerate(ss):
                # cut at the first 'X' after at least 1000 residues; the
                # sentinel 'X' appended here is stripped from the last chunk
                cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
                cdss[-1] = cdss[-1][:-1]
                cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
                for ci, cs in zip(cdsi, cdss):
                    if len(cs):
                        toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(
                            n, frame_id + 1, ci, cs))

        blastab = []
        for batch in range(5):
            with open(refAA, 'w') as fout:
                for line in toWrite[batch::5]:
                    fout.write(line)
            diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
                diamond=diamond,
                refAA=refAA,
                qryAA=qryAA,
                aaMatch=aaMatch,
                n_thread=self.n_thread,
                min_id=self.min_id * 100.,
                nhits=nhits,
                min_ratio=self.min_ratio * 100.)
            Popen(diamond_cmd.split(),
                  stdout=PIPE,
                  stderr=PIPE,
                  universal_newlines=True).communicate()
            # Reset per batch: otherwise an empty result file would re-append
            # the previous batch's table (or hit an unbound name on batch 0).
            tab = None
            if os.path.getsize(aaMatch) > 0:
                with open(aaMatch) as fin:
                    tab = parseDiamond(fin, self.refSeq, self.qrySeq,
                                       self.min_id, self.min_cov,
                                       self.min_ratio)
                os.unlink(aaMatch)
            if tab is not None:
                blastab.append(tab)
        # pd.concat raises on an empty list; report "no alignments" instead
        blastab = pd.concat(blastab) if blastab else pd.DataFrame()
        logger('Run diamond finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab

Example #20

Show file

    def runMMseq(self, ref, qry):
        """Align translated query against the reference with MMseqs2.

        Builds MMseqs databases for `ref` and `qry`, translates the query,
        runs ``mmseqs search`` (retrying up to nine times, with an
        ``extractorfs`` fallback after the third failure), exports the hits
        with ``convertalis`` and converts them to a nucleotide-scaled table.

        Parameters:
            ref, qry: sequence files; also loaded with ``readFastq`` when
                ``self.refSeq`` / ``self.qrySeq`` are still empty.

        Returns:
            pandas.DataFrame of alignments filtered by ``self.min_id``,
            ``self.min_cov`` and ``self.min_ratio``.
        """
        logger('Run MMSeqs starts')

        def parseMMSeq(fin, refseq, qryseq, min_id, min_cov, min_ratio):
            """Turn the convertalis table into the 15-column alignment table.

            Input columns follow the --format-output list used below:
            query, target, pident, alnlen, mismatch, gapopen, qstart, qend,
            tstart, tend, evalue, raw, qlen, tlen, cigar.
            """
            blastab = pd.read_csv(fin, sep='\t', header=None)
            blastab = blastab[blastab[2] >= min_id]
            qlen = blastab[0].apply(lambda r: len(qryseq[r]))
            rlen = blastab[1].apply(lambda r: len(refseq[r]))
            # cigar lengths are scaled from amino acids to nucleotides
            cigar = blastab[14].apply(lambda x: [[int(n) * 3, t] for n, t in re
                                                 .findall(r'(\d+)([A-Z])', x)])
            # qstart/qend (columns 6, 7) converted to 1-based nucleotide sites
            ref_sites = pd.concat([3 * (blastab[6] - 1) + 1, 3 * blastab[7]],
                                  keys=[0, 1],
                                  axis=1)
            # d: number of bases by which the scaled end overshoots the
            # sequence length (never negative)
            d = ref_sites[1] - qlen
            d[d < 0] = 0

            # Trim the overshoot from the last cigar operation. A plain loop
            # is used on purpose: np.vectorize without otypes calls the
            # function an extra time on the first element (trimming it
            # twice) and refuses empty input.
            for ops, overhang in zip(cigar, d):
                ops[-1][0] -= int(overhang)
            ref_sites[1] -= d

            # tstart/tend (columns 8, 9): the overshoot is removed from the
            # end for forward hits and from the start for reverse hits
            direction = (blastab[8] < blastab[9])
            qry_sites = pd.concat([blastab[8], blastab[9] - d], axis=1)
            qry_sites[~direction] = pd.concat([blastab[8] - d, blastab[9]],
                                              axis=1)[~direction]

            # .values is used before adding an axis because indexing a
            # Series with [:, np.newaxis] is not supported by current pandas
            blastab = pd.DataFrame(
                np.hstack([
                    blastab[[0, 1, 2]],
                    np.apply_along_axis(lambda x: x[1] - x[0] + 1, 1,
                                        ref_sites.values)[:, np.newaxis],
                    pd.DataFrame(np.zeros([blastab.shape[0], 2], dtype=int)),
                    ref_sites, qry_sites, blastab[[10, 11]],
                    qlen.values[:, np.newaxis],
                    rlen.values[:, np.newaxis],
                    cigar.values[:, np.newaxis]
                ]))
            # column 3 is the aligned length, column 12 the first length column
            return blastab[(blastab[3] >= min_cov)
                           & (blastab[3] >= blastab[12] * min_ratio)]

        tmpDir = os.path.join(self.dirPath, 'tmp')
        refNA = os.path.join(self.dirPath, 'refNA')
        qryNA = os.path.join(self.dirPath, 'qryNA')

        refCDS = os.path.join(self.dirPath, 'refCDS')
        qryAA = os.path.join(self.dirPath, 'qryAA')
        aaMatch = os.path.join(self.dirPath, 'aaMatch2')

        Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
            mmseqs, ref, refNA).split(),
              stdout=PIPE).communicate()
        Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
            mmseqs, qry, qryNA).split(),
              stdout=PIPE).communicate()
        Popen('{0} translatenucs {1} {2}'.format(mmseqs, qryNA, qryAA).split(),
              stdout=PIPE).communicate()
        for ite in range(9):
            if os.path.isdir(tmpDir):
                shutil.rmtree(tmpDir)
            p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(\
                mmseqs, qryAA, refNA, aaMatch, tmpDir, self.n_thread, self.min_id, self.min_ratio).split(), stdout=PIPE)
            p.communicate()
            if p.returncode == 0:
                break
            if ite > 2:
                # fallback after repeated failures: search against extracted
                # ORFs of the reference instead of the raw nucleotide db
                Popen('{0} extractorfs {2} {3}'.format(mmseqs, qryAA, refNA,
                                                       refCDS).split(),
                      stdout=PIPE).communicate()
                p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(\
                    mmseqs, qryAA, refCDS, aaMatch, tmpDir, self.n_thread, self.min_id, self.min_ratio).split(), stdout=PIPE)
                p.communicate()
                if p.returncode == 0:
                    break
            time.sleep(1)
        # NOTE(review): convertalis is always given refNA, even when the
        # successful search ran against refCDS -- confirm this is intended.
        Popen('{0} convertalis {1} {2} {3} {3}.tab --threads {4} --format-output'.format(\
            mmseqs, qryAA, refNA, aaMatch, self.n_thread).split() + ['query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,raw,qlen,tlen,cigar'], stdout=PIPE).communicate()

        if not self.qrySeq:
            self.qrySeq, self.qryQual = readFastq(qry)
        if not self.refSeq:
            self.refSeq, self.refQual = readFastq(ref)
        # the result file is closed as soon as it has been parsed
        with open(aaMatch + '.tab') as fin:
            blastab = parseMMSeq(fin, self.refSeq, self.qrySeq,
                                 self.min_id, self.min_cov, self.min_ratio)
        logger('Run MMSeqs finishes. Got {0} alignments'.format(
            blastab.shape[0]))
        return blastab

Example #21

Show file

def buildReference(alleles,
                   references,
                   max_iden=0.9,
                   min_iden=0.6,
                   coverage=0.7,
                   paralog=0.1,
                   relaxEnd=False):
    """Filter alleles against reference alleles and pick exemplar alleles.

    Parameters:
        alleles, references: lists of dicts with the keys 'fieldname',
            'value_id' and 'value'. NOTE: the 'fieldname' of every rejected
            entry in `alleles` is overwritten with '' in place.
        max_iden: clustering identity; same-locus hits between different
            alleles at or above it mark the allele as too close.
        min_iden: minimum identity of a hit to be considered.
        coverage: minimum covered fraction of a hit to be considered.
        paralog, relaxEnd: accepted for interface compatibility; not used
            by this implementation.

    Returns:
        (alleles_fasta, refsets_fasta): two newline-joined FASTA strings --
        all alleles that were not rejected, and the subset chosen as
        exemplars.

    Raises:
        Whatever ``clust``/``uberBlast`` or the file operations raise; the
        temporary directory is removed first.
    """
    # Iterating in reverse makes the FIRST occurrence of a locus win.
    orderedLoci = {
        t['fieldname']: i
        for i, t in reversed(list(enumerate(references)))
    }
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    # No `except: pass` here: everything after this block needs `blastab`
    # and the three dicts, so swallowing an error only replaced it with a
    # misleading UnboundLocalError further down.
    try:
        sourceFna = os.path.join(dirPath, 'sourceFna')
        clsFna = os.path.join(dirPath, 'clsFna')
        targetFna = os.path.join(dirPath, 'targetFna')
        with open(sourceFna, 'w') as fout:
            fout.write('\n'.join([
                '>{fieldname}_{value_id}\n{value}'.format(**s) for s in alleles
            ]))
        with open(targetFna, 'w') as fout:
            fout.write('\n'.join([
                '>{fieldname}_{value_id}\n{value}'.format(**t)
                for t in references
            ]))
        # get cluster
        exampler, cluster = clust('-i {0} -p {1} -d {2} -c 1 -t 8'.format(\
            sourceFna, clsFna, max_iden).split())
        tooClose, goodCandidates, crossSites = {}, {}, {}
        with open(cluster) as fin:
            for line in fin:
                part = line.strip().split()
                locus = [p.rsplit('_', 1)[0] for p in part]
                if locus[0] != locus[1]:
                    # two alleles of different loci were clustered together
                    crossSites[part[0]] = locus[1]
                    crossSites[part[1]] = locus[0]

        # compare with references
        blastab = uberBlast('-r {0} -q {1} -f --blastn --diamondSELF --min_id {2} --min_ratio {3} -t 8 -p -s 1 -e 0,3'.format(\
            targetFna, exampler, min_iden, coverage ).split())
    finally:
        shutil.rmtree(dirPath)

    for tab in blastab:
        locus = [p.rsplit('_', 1)[0] for p in tab[:2]]
        # covered fraction; float() keeps this a true division on Python 2
        c = float(tab[7] - tab[6] + 1) / tab[12]
        # largest unaligned difference at either end of the hit
        e = max(abs(tab[8] - tab[6]),
                abs(tab[12] - tab[7] - (tab[13] - tab[9])))
        if c >= coverage and tab[2] >= min_iden:
            if locus[0] != locus[1]:
                crossSites[tab[0]] = locus[1]
                crossSites[tab[1]] = locus[0]
            elif e <= 0:
                if tab[2] >= max_iden and tab[0] != tab[1]:
                    tooClose[tab[0]] = 1
                else:
                    goodCandidates[tab[0]] = tab[2]

    # A locus is paralogous when its reference allele cross-matches a locus
    # that is ordered after it.
    paralogous_loci = {}
    for ref in references:
        key = '{0}_{1}'.format(ref['fieldname'], ref['value_id'])
        if key in crossSites and orderedLoci[ref['fieldname']] < orderedLoci[
                crossSites[key]]:
            paralogous_loci[ref['fieldname']] = 1

    refsets = []
    for allele in alleles:
        if allele['fieldname'] in paralogous_loci:
            allele['fieldname'] = ''
        else:
            key = '{0}_{1}'.format(allele['fieldname'], allele['value_id'])
            if key in crossSites:
                allele['fieldname'] = ''
            elif key in goodCandidates and key not in tooClose:
                refsets.append(
                    '>{fieldname}_{value_id}\n{value}'.format(**allele))
    alleles = [
        '>{fieldname}_{value_id}\n{value}'.format(**allele)
        for allele in alleles if allele['fieldname'] != ''
    ]
    logger('removed {0} paralogous sites.'.format(len(paralogous_loci)))
    logger('obtained {0} alleles and {1} references alleles'.format(
        len(alleles), len(refsets)))
    return '\n'.join(alleles), '\n'.join(refsets)

Example #22

Show file

    def reduce_depth(self, reads):
        """Subsample, trim and optionally rename gzipped FASTQ read files.

        Parameters:
            reads: list of libraries, each a list of gzipped FASTQ paths.

        Returns:
            A list parallel to `reads` holding, per library, the paths of the
            newly written ``<prefix>.2.<lib_id>.<file_no>.fastq.gz`` files.

        Reads the module-level ``parameters`` keys 'max_base', 'prefix' and
        'noRename'. Every input file is deleted after it has been processed.
        All work is done by shell pipelines (zcat/awk/wc/head/gzip).
        """
        encode = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        read_stats = [[] for library in reads]
        new_reads = [[] for library in reads]
        for lib_id, (library, stat,
                     new_lib) in enumerate(zip(reads, read_stats, new_reads)):
            for fname in library:
                # wc over the sequence lines prints three counts; the base
                # count is taken as the third minus the second
                p = Popen("zcat {0}|awk 'NR%4==2'|wc".format(fname),
                          shell=True,
                          stdout=PIPE).communicate()[0].strip().split()
                n_base = int(p[2]) - int(p[1])
                # symbol counts (A, C, G, T, other) at the first 10 read
                # positions, sampled from every fifth read among the first
                # 400000 lines
                bcomp = [[0, 0, 0, 0, 0] for i in range(10)]
                p = Popen("zcat {0}|head -400000|awk 'NR%20==2'".format(fname),
                          shell=True,
                          stdout=PIPE,
                          stderr=PIPE)
                # NOTE(review): the pipe is opened in binary mode; under
                # Python 3 `line` would be bytes and its items ints, which
                # never match the str keys of `encode` -- confirm interpreter.
                for line in p.stdout:
                    for b, bc in zip(line[:10], bcomp):
                        bc[encode.get(b, 4)] += 1
                # Scan from position 9 down to 0 for the last position that
                # looks like a barcode: one symbol makes up at least 80% of the
                # counts, or (first two positions only) more than 10% of the
                # symbols are not A/C/G/T. Reads are later cut after it.
                seq_start = 0
                for c in range(9, -1, -1):
                    bc = bcomp[c]
                    if max(bc) / 0.8 >= sum(bc) or (c < 2
                                                    and bc[4] > 0.1 * sum(bc)):
                        seq_start = c + 1
                        break
                stat.append([n_base, seq_start])
            # fraction of reads to keep so the library stays within max_base
            n_base = sum([s[0] for s in stat])
            sample_freq = float(parameters['max_base']) / n_base if parameters[
                'max_base'] > 0 else 1.
            if sample_freq >= 1 or len(stat) < 3:
                sample_freqs = [sample_freq for s in stat]
            else:
                # Three files: the first two share the budget first and the
                # third only receives what is left (possibly nothing).
                # presumably the first two are the paired files -- TODO confirm
                n_base2 = sum([s[0] for s in stat[:2]])
                if float(parameters['max_base']) <= n_base2:
                    sample_freqs = [
                        float(parameters['max_base']) / n_base2,
                        float(parameters['max_base']) / n_base2, 0.
                    ]
                else:
                    sample_freqs = [
                        1., 1.,
                        (float(parameters['max_base']) - n_base2) / stat[2][0]
                    ]
            if sample_freqs[0] < 1 and sample_freqs[0] > 0:
                logger('Read depth too high. Subsample to every {0:.2f} read'.
                       format(1. / sample_freqs[0]))

            # The four awk programs below share one skeleton: a record (nr) is
            # printed when int(nr*freq) exceeds int((nr-1)*freq); sequence and
            # quality lines are printed whole or from column {3} on, the '+'
            # line is normalised, and the header is either replaced by
            # "@<lib_id>_<nr>" or kept as is.
            for f_id, (lib, s, sample_freq) in enumerate(
                    zip(library, stat, sample_freqs)):
                if sample_freq > 0:
                    new_lib.append('{0}.2.{1}.{2}.fastq.gz'.format(
                        parameters['prefix'], lib_id, f_id + 1))
                    if parameters['noRename'] == False:
                        if s[1] > 0:
                            # rename reads and trim the leading bases
                            logger(
                                'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                                .format(s[1], lib))
                            Popen(
                                "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print \"@{4}_\"nr}} else {{print \"+\"}} }} }}'|gzip > {1}"
                                .format(lib, new_lib[-1], min(sample_freq, 1.),
                                        s[1] + 1, lib_id),
                                shell=True).wait()
                        else:
                            # rename reads, no trimming
                            Popen(
                                "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print \"@{4}_\"nr}} else {{print \"+\"}} }} }}'|gzip > {1}"
                                .format(lib, new_lib[-1], min(sample_freq, 1.),
                                        s[1] + 1, lib_id),
                                shell=True).wait()
                    else:
                        if s[1] > 0:
                            # keep read names, trim the leading bases
                            logger(
                                'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                                .format(s[1], lib))
                            Popen(
                                "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print $0}} else {{print \"+\"}} }} }}'|gzip > {1}"
                                .format(lib, new_lib[-1], min(sample_freq, 1.),
                                        s[1] + 1, lib_id),
                                shell=True).wait()
                        else:
                            # keep read names, no trimming
                            Popen(
                                "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print $0}} else {{print \"+\"}} }} }}'|gzip > {1}"
                                .format(lib, new_lib[-1], min(sample_freq, 1.),
                                        s[1] + 1, lib_id),
                                shell=True).wait()

                # the input file is removed whether or not it was sampled
                # NOTE(review): presumably `reads` are intermediate files owned
                # by the pipeline -- confirm before passing user data
                os.unlink(lib)
        return new_reads

Example #23

Show file

def filt_per_group(data):
    """Filter one group of hits down to a mutually compatible subset.

    Pairs of rows in ``mat`` whose normalised mismatch rate exceeds 1 are
    flagged as incompatible.  If any such pair exists, near-identical rows
    are collapsed into clusters, a tree is built from one representative per
    cluster plus the reference, and branches are cut from the tree until no
    incompatible pair remains.  The subtree that contains the reference leaf
    (``'REF'``) is always the one that is kept.

    Args:
        data: a 3-tuple ``(mat, ref, global_file)``.
            ``mat`` is a NumPy array of records; column 1 is used as the key
            for looking up pair-specific thresholds and column 4 holds the
            per-row sequence array (presumably uint8 ASCII codes -- TODO
            confirm against the caller).
            ``ref`` is the reference sequence as a string.
            ``global_file`` is a path passed to ``np.load``; the loaded
            mapping supplies ``(rate, fold)`` thresholds per pair, with
            ``(0.01, 4)`` as the default.

    Returns:
        ``mat`` unchanged when nothing is incompatible or when the tree could
        not be built after three attempts; otherwise ``mat`` restricted to
        the rows whose cluster representative survived the pruning.
    """
    mat, ref, global_file = data
    global_differences = dict(np.load(global_file))
    nMat = mat.shape[0]

    # Stack every row's sequence plus the reference (last row) into one matrix.
    seqs = np.vstack([
        np.vstack(mat.T[4]),
        np.array(list(ref)).view(asc2int).astype(np.uint8)[np.newaxis, :]
    ])
    # 65/67/71/84 are ASCII A/C/G/T; every other code is replaced by 45 ('-').
    seqs[np.isin(seqs, [65, 67, 71, 84], invert=True)] = 45

    # diff[i, j] unpacks below as (mut, aln); presumably mismatch count and
    # aligned length as computed by compare_seq -- TODO confirm.
    diff = compare_seq(
        seqs, np.zeros(shape=[seqs.shape[0], seqs.shape[0], 2],
                       dtype=int)).astype(float)

    # Flag row pairs whose mismatch rate exceeds gd[0] * gd[1]; pairs without
    # any aligned position are always flagged (difference fixed at 1.5).
    incompatible = {}
    for i1, m1 in enumerate(mat):
        for i2 in xrange(i1 + 1, nMat):
            m2 = mat[i2]
            mut, aln = diff[i1, i2]
            if aln > 0:
                gd = global_differences.get(tuple(sorted([m1[1], m2[1]])),
                                            (0.01, 4))
                difference = mut / aln / gd[0] / gd[1]
            else:
                difference = 1.5
            if difference > 1.:
                incompatible[(i1, i2)] = 1

    if not incompatible:
        return mat

    # Greedy clustering: a row joins the first cluster whose representative
    # (g[0]) is close enough, otherwise it starts a new cluster.
    groups = []
    identity_slack = 0.6 * (1.0 - params['clust_identity'])
    for j in xrange(nMat):
        for g in groups:
            if diff[g[0], j, 0] <= identity_slack * diff[g[0], j, 1]:
                g.append(j)
                break
        else:
            groups.append([j])
    group_tag = {gg: g[0] for g in groups for gg in g}

    # One sequence per cluster representative.  ``tobytes`` replaces the
    # ``tostring`` alias that was removed in NumPy 2.0.  Fall back to raw
    # bytes when the data is not plain ASCII.
    try:
        tags = {
            g[0]: mat[g[0]][4].tobytes().decode('ascii')
            for g in groups
        }
    except UnicodeError:
        tags = {g[0]: mat[g[0]][4].tobytes() for g in groups}

    tags.update({'REF': ref})

    # Re-express incompatibilities between cluster representatives, keyed by
    # their string names (the same names the tree leaves will carry).
    ic2 = {}
    for i1, i2 in incompatible:
        t1, t2 = group_tag[i1], group_tag[i2]
        if t1 != t2:
            t1, t2 = str(t1), str(t2)
            ic2.setdefault(t1, {})[t2] = 1
            ic2.setdefault(t2, {})[t1] = 1
    incompatible = ic2

    # Build the tree with the configured external command; up to three
    # attempts.  Retry number ``ite`` appends ``ite`` extra newlines after
    # each record in the FASTA input.
    for ite in xrange(3):
        tmp_name = None
        try:
            with tempfile.NamedTemporaryFile(dir='.', delete=False) as tmpFile:
                tmp_name = tmpFile.name
                for n, s in tags.items():
                    tmpFile.write('>X{0}\n{1}\n{2}'.format(
                        n, s, '\n' * ite).encode('utf-8'))
            # Fewer than 500 sequences use the configured orthology method;
            # larger sets use the 'nj' command.
            method = params['orthology'] if len(tags) < 500 else 'nj'
            cmd = params[method].format(tmp_name, **params)
            phy_run = subprocess.Popen(shlex.split(cmd),
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       universal_newlines=True)
            gene_phy = ete3.Tree(phy_run.communicate()[0].replace("'", ''))
            break
        except Exception:
            # Best effort: give up after the third failure and keep all rows.
            if ite == 2:
                return mat
        finally:
            # Only remove a file that this attempt actually created, and do
            # not let a cleanup failure mask the outcome of the attempt.
            if tmp_name is not None:
                try:
                    os.unlink(tmp_name)
                except OSError:
                    pass

    # Strip the 'X' prefix that was added to every FASTA header above.
    for n in gene_phy.get_leaves():
        if len(n.name):
            n.name = n.name[1:]

    node = gene_phy.get_midpoint_outgroup()
    if node is not None:
        gene_phy.set_outgroup(node)

    for ite in xrange(3000):
        gene_phy.ic, gene_phy.dist = {}, 0.
        # Give both sides of the root the full root-spanning length so the
        # tie-break on branch length below treats them equally.
        rdist = sum([c.dist for c in gene_phy.get_children()])
        for c in gene_phy.get_children():
            c.dist = rdist
        # node.ic holds the incompatible pairs carried by an odd number of
        # the node's children, i.e. pairs with one member below the node and
        # the other elsewhere; pairs fully contained below the node cancel.
        for node in gene_phy.iter_descendants('postorder'):
            if node.is_leaf():
                node.ic = {
                    tuple(sorted([node.name, n2])): 1
                    for n2 in incompatible.get(node.name, {})
                }
            else:
                node.ic = {}
                for c in node.get_children():
                    for x in c.ic:
                        if x in node.ic:
                            node.ic.pop(x)
                        else:
                            node.ic[x] = 1
        # Cut the branch that separates the most incompatible pairs; ties
        # are broken by the longer branch.
        cut_node = max([[len(n.ic), n.dist, n]
                        for n in gene_phy.iter_descendants('postorder')],
                       key=lambda x: (x[0], x[1]))
        if cut_node[0] > 0:
            cut_node = cut_node[2]
            prev_node = cut_node.up
            cut_node.detach()
            if 'REF' in cut_node.get_leaf_names():
                # The reference is in the detached part: keep that part.
                gene_phy = cut_node
            elif prev_node.is_root():
                gene_phy = gene_phy.get_children()[0]
            else:
                # Remove the now single-child parent, keeping path lengths.
                prev_node.delete(preserve_branch_length=True)

            # Drop incompatibilities that involve leaves no longer present.
            tips = set(gene_phy.get_leaf_names())
            for r1 in list(incompatible.keys()):
                if r1 not in tips:
                    rr = incompatible.pop(r1, None)
                    for r2 in rr:
                        incompatible.get(r2, {}).pop(r1, None)
            for r1 in list(incompatible.keys()):
                if len(incompatible[r1]) == 0:
                    incompatible.pop(r1, None)
            if len(incompatible) == 0:
                break

            logger('     Iteration {0}. Remains {1} tips.'.format(
                ite + 1, len(gene_phy.get_leaf_names())))
        else:
            break

    # Keep only rows whose cluster representative is still a leaf ('REF' has
    # no entry in ``groups`` and therefore contributes nothing).
    if len(gene_phy.get_leaf_names()) < len(tags):
        groups = {str(g[0]): g for g in groups}
        tips = sorted([
            nn for n in gene_phy.get_leaf_names()
            for nn in groups.get(n, [])
        ])
        mat = mat[tips]
    return mat