Example 1
0
def genotype_and_saving(inputs, pool, **params) :
    """Genotype each new input against the database and each other, then save.

    inputs : sequence of per-sample records indexed positionally; from the
             usage below, i[0] is an internal index, i[2] a mash sketch path,
             i[3] the tag that run_mash reports hits under, and i[4] a default
             barcode field value.  (Inferred from indexing -- TODO confirm
             against the caller.)
    pool   : multiprocessing-style pool exposing map().
    params : must supply 'barcode_dist' (distance cutoffs, one per barcode
             level), 'dbname', 'mash' and 'n_thread'.

    Returns whatever save2mash() returns.
    """
    input_idx = {i[3]:i[0] for i in inputs}

    # NOTE(review): assigned but never used below; candidate for removal.
    barcode_dist = params['barcode_dist']
    # Seed every input with its own default value repeated once per barcode
    # level; individual levels get overwritten below when a close hit exists.
    codes = {i[0]:[i[4] for b in params['barcode_dist']] for i in inputs}
    res = {}
    # Compare each input sketch against the existing database (the None in
    # slot 1 presumably selects the db as the reference -- see utils.run_mash)
    # and keep the single best hit per input.  Each result row r[0] looks
    # like [query_tag, hit, distance, ...] from the indexing below.
    for r in pool.map(utils.run_mash, [[i[2], None, 1, params] for i in inputs]) :
        if len(r) > 0 :
            res[input_idx[r[0][0]]] = r[0]
    
    # All-vs-all comparison among the new inputs themselves: paste the
    # individual sketches into one .msh, run mash on it against itself.
    merged_input = os.path.join(params['dbname'], 'merged_input.msh')
    subprocess.Popen('{mash} paste {0} {1}'.format(merged_input, ' '.join([i[2] for i in inputs]), **params).split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    
    for r in utils.run_mash([merged_input, merged_input, params['n_thread'], params]) :
        # Consider only one direction of each pair (later index vs earlier),
        # and replace the database hit when the in-batch hit is closer
        # (smaller distance at r[2]).
        if input_idx[r[0]] > input_idx[r[1]] :
            if input_idx[r[0]] not in res or res[input_idx[r[0]]][2] > r[2] :
                r[1] = input_idx[r[1]]
                res[input_idx[r[0]]] = r
    os.unlink(merged_input)

    # Copy barcode fields from each input's best hit, level by level, for as
    # long as the hit distance stays within that level's cutoff.
    for idx, code in sorted(codes.iteritems()) :
        if res.get(idx, None) is not None :
            best_sim, best_hit = res[idx][2], res[idx][1]
            if isinstance(best_hit, basestring) :
                # Database hit: a dotted barcode string; each field apparently
                # carries a one-character level prefix that is stripped here
                # (matches the 'a' + index keys built in query_sample).
                best_hit = [int(r[1:]) for r in best_hit.split('.')]
            else :
                # In-batch hit: best_hit is the other input's index, so reuse
                # the codes already assigned to that input.
                best_hit = codes[best_hit]
            for i, d in enumerate(params['barcode_dist']) :
                if d >= best_sim :
                    code[i] = int(best_hit[i])
                else :
                    break
    return save2mash(inputs, codes, **params)
Example 2
0
def query_sample(params) :
    params = utils.load_paramDict(params)
    #params = utils.load_params(sys.argv)
    assert 'query' in params and os.path.isfile(params['query']), 'no query'
    
    existing_data = os.path.join(params['dbname'], 'db_metadata.msg')
    assert existing_data, 'no data in the database.'
    data = pd.read_msgpack(existing_data)
    
    if params.get('dtype', 'fasta') == 'read' :
        msh_file = utils.get_mash(params['query'], is_read=True, **params)
        result = utils.run_mash([msh_file, None, params['n_thread'], params], is_read=True)
    else :
        msh_file = utils.get_mash(params['query'], is_read=False, **params)
        result = utils.run_mash([msh_file, None, params['n_thread'], params])
    os.unlink(msh_file)
    
    if len(result) > 0 :
        r_id = np.array([r[2] for r in result])
        result = np.array([r[1].split('.') for r in result])
        result, r_id = result[(1-r_id >= 0.98* (1-r_id[0])) & (r_id <= params['barcode_dist'][0])], r_id[(1-r_id >= 0.98* (1-r_id[0])) & (r_id <= params['barcode_dist'][0])]

        groups = {}
        m = {'a'+k:[n, a] for k, n, a in data[['index', 'organism_name', 'assembly_accession']].as_matrix()}
        matches = [ dict(record='.'.join(r), similarity=1-i, organism_name=m[r[-1]][0], assembly_accession=m[r[-1]][1]) for r, i in zip(result, r_id) ]
        for id, (dcut, dgroup) in enumerate(zip(params['barcode_dist'], result.T[:-1])) :
            dgroup[r_id > dcut] = ''
            g = np.unique(dgroup, return_index=True)
            tags = ['.'.join(r) for r in result[g[1], :(id+1)] if r[-1] != '']
            info = [ [i, -id, '.'.join(hit)] for i, hit in zip(r_id[g[1]], result[g[1]]) if hit[id] != '' ]
            for t, i in zip(tags, info) :
                groups[t] = i

        groups = [dict(group=c, similarity=1.0-d[0]) for c, d in sorted(groups.iteritems(), key=lambda x:x[1])]
        for g in groups :
            g.update(utils.retrieve_info(g['group'], data=data, **params))
    else :
        groups, matches, result = [], [], 'unknown'
    print json.dumps(dict(groups=groups, matches=matches), sort_keys=True, indent=2)
Example 3
0
import os, sys, pandas as pd, numpy as np, json, msgpack
import utils

if __name__ == '__main__':
    params = utils.load_params(sys.argv)
    assert 'query' in params and os.path.isfile(params['query']), 'no query'

    existing_data = os.path.join(params['dbname'], 'db_metadata.msg')
    assert existing_data, 'no data in the database.'
    data = pd.read_msgpack(existing_data)

    if params.get('dtype', 'fasta') == 'read':
        msh_file = utils.get_mash(params['query'], is_read=True, **params)
        result = utils.run_mash([msh_file, None, params['n_thread'], params],
                                is_read=True)
    else:
        msh_file = utils.get_mash(params['query'], is_read=False, **params)
        result = utils.run_mash([msh_file, None, params['n_thread'], params])
    os.unlink(msh_file)

    if len(result) > 0:
        r_id = np.array([r[2] for r in result])
        result = np.array([r[1].split('.') for r in result])
        result, r_id = result[(1 - r_id >= 0.98 * (1 - r_id[0]))
                              & (r_id <= params['barcode_dist'][0])], r_id[
                                  (1 - r_id >= 0.98 * (1 - r_id[0]))
                                  & (r_id <= params['barcode_dist'][0])]

        groups = {}
        m = {
            'a' + k: [n, a]