Example #1
def query_metadata(params):
    params = utils.load_paramDict(params)
    fout = open(params['seqlist'], 'w') if params.get('seqlist', None) is not None else sys.stdout
    data = utils.load_database(**params)

    db_columns = params['db_columns'] + params['metadata_columns'] + params['taxa_columns']
    if params.get('default', None) is not None:
        tmp = {v['MapDB']: v for v in params['default_bowtie']}
        filters = tmp[params['default']]
    else:
        filters = {key: params[key] for key in db_columns + ['name', 'tag', 'min', 'max', 'group'] if params.get(key, None) is not None}
    for fld, value in filters.iteritems():
        if fld in db_columns:
            data = data[data[fld].isin(value.split(','))]
        elif fld == 'min':
            data = data[data['size'].astype(int) >= int(value)]
        elif fld == 'max':
            data = data[data['size'].astype(int) <= int(value)]
        elif fld == 'group':
            data = data[data['barcode'].str.contains(value)]
        elif fld == 'tag':
            data = data.reset_index(drop=True)
            # Split each barcode string (e.g. 'a12.b34.c56') into its numeric
            # per-level components, one column per barcode tag.
            barcodes = pd.DataFrame(data['barcode'].apply(lambda barcode: [int(b[1:]) for b in barcode.split('.')]).tolist(), columns=params['barcode_tag'])

            # Each clause is '<tag1>==<tag2>' or '<tag1>!=<tag2>', comparing
            # two single-character tag levels of the barcode.
            for f in value.split(';'):
                f = f.strip()
                g1, g2 = f[0], f[-1]
                if f.find('==') > 0:
                    barcodes = barcodes[barcodes[g1] == barcodes[g2]]
                else:
                    barcodes = barcodes[barcodes[g1] != barcodes[g2]]
            data = data.loc[barcodes.index].reset_index(drop=True)

    data.to_csv(fout, index=False, sep='\t')
    if fout is not sys.stdout:
        fout.close()
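
A minimal invocation sketch. The parameter values are hypothetical, and utils.load_paramDict is assumed to fill in configuration defaults such as db_columns, metadata_columns and barcode_tag:

# Hypothetical call: list genomes between 1 and 10 Mb whose barcode contains
# 'a10'; with no 'seqlist' given, the table goes to stdout.
query_metadata({'dbname': 'refseq', 'min': '1000000', 'max': '10000000', 'group': 'a10'})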
Example #2
def update(params):
    params = utils.load_paramDict(params)
    assert 'seqlist' in params, 'Please feed in a tab-delimited table in "seqlist="'

    exist_db = os.path.join(params['dbname'], 'db_metadata.msg')
    modified = update_data(exist_db, params['seqlist'], **params)
    pd.DataFrame(modified, columns=['#index', 'field', 'oldValue',
                                    'newValue']).to_csv(sys.stdout,
                                                        index=False,
                                                        sep='\t')
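
A hedged usage sketch; the file name is illustrative, and update_data is assumed to diff the supplied table against the stored metadata:

# Hypothetical call: apply edits from a curated table and print the
# '#index / field / oldValue / newValue' change log to stdout.
update({'dbname': 'refseq', 'seqlist': 'curated_metadata.tsv'})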
Example #3
def query_sample(params):
    params = utils.load_paramDict(params)
    assert 'query' in params and os.path.isfile(params['query']), 'no query'

    existing_data = os.path.join(params['dbname'], 'db_metadata.msg')
    assert os.path.isfile(existing_data), 'no data in the database.'
    data = pd.read_msgpack(existing_data)
    
    if params.get('dtype', 'fasta') == 'read':
        msh_file = utils.get_mash(params['query'], is_read=True, **params)
        result = utils.run_mash([msh_file, None, params['n_thread'], params], is_read=True)
    else:
        msh_file = utils.get_mash(params['query'], is_read=False, **params)
        result = utils.run_mash([msh_file, None, params['n_thread'], params])
    os.unlink(msh_file)
    
    if len(result) > 0:
        r_id = np.array([r[2] for r in result])
        result = np.array([r[1].split('.') for r in result])
        # Keep hits whose similarity is within 98% of the best hit's and whose
        # distance is below the loosest barcode cutoff.
        mask = (1 - r_id >= 0.98 * (1 - r_id[0])) & (r_id <= params['barcode_dist'][0])
        result, r_id = result[mask], r_id[mask]

        groups = {}
        # Map the finest barcode level ('a' + index) to organism name and accession.
        m = {'a' + k: [n, a] for k, n, a in data[['index', 'organism_name', 'assembly_accession']].as_matrix()}
        matches = [dict(record='.'.join(r), similarity=1 - i, organism_name=m[r[-1]][0], assembly_accession=m[r[-1]][1]) for r, i in zip(result, r_id)]
        # Walk the barcode levels from coarsest to finest, blanking out levels
        # whose distance cutoff is not met, and keep one representative per group.
        for id, (dcut, dgroup) in enumerate(zip(params['barcode_dist'], result.T[:-1])):
            dgroup[r_id > dcut] = ''
            g = np.unique(dgroup, return_index=True)
            tags = ['.'.join(r) for r in result[g[1], :(id + 1)] if r[-1] != '']
            info = [[i, -id, '.'.join(hit)] for i, hit in zip(r_id[g[1]], result[g[1]]) if hit[id] != '']
            for t, i in zip(tags, info):
                groups[t] = i

        groups = [dict(group=c, similarity=1.0 - d[0]) for c, d in sorted(groups.iteritems(), key=lambda x: x[1])]
        for g in groups:
            g.update(utils.retrieve_info(g['group'], data=data, **params))
    else:
        groups, matches, result = [], [], 'unknown'
    print json.dumps(dict(groups=groups, matches=matches), sort_keys=True, indent=2)
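
A usage sketch with hypothetical values; the function prints a JSON document containing the hierarchical 'groups' and per-genome 'matches' built above:

# Hypothetical call: place an assembly against the database; 'dtype' defaults
# to 'fasta', so raw reads would instead pass dtype='read'.
query_sample({'dbname': 'refseq', 'query': 'assembly.fna', 'n_thread': 4})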
Example #4
def query_read(params):
    params = utils.load_paramDict(params)
    params['bootstrap'] = int(
        params['bootstrap']) if 'bootstrap' in params else 0

    data = utils.load_database(**params)

    # 'stage' allows a run to resume midway: '0' runs everything, '1' skips
    # the mapping, '2' also skips the summary matrix, and so on.
    if params.get('stage', '0') in '0':
        bowtie2matrix(**params)
    if params.get('stage', '0') in '01':
        summary_matrix(data, **params)
    if params.get('stage', '0') in '012':
        ipopt(least_amount=[params['minFreq'], params['minNum']], **params)
    if params.get('stage', '0') in '0123':
        qvector = os.path.join(params['workspace'], 'ipopt.qmatrix.solution')
        assign_reads(data, qvector, **params)
    if params.get('stage', '0') in '01234':
        assign = os.path.join(params['workspace'], 'read_assignment.gz')
        profiling(data, assign, **params)

    # Compress any per-run fastq files left in the workspace.
    import glob
    for fname in glob.glob(os.path.join(params['workspace'], 'r?.fastq')):
        subprocess.Popen(['gzip', '-f', fname]).communicate()
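
A sketch of resuming a run midway, with hypothetical values; the intermediate files are the ones the stages above read from params['workspace']:

# Hypothetical call: stage '2' skips mapping and the summary matrix, rerunning
# only the ipopt solve, read assignment and profiling.
query_read({'dbname': 'refseq', 'workspace': 'sample_01', 'stage': '2',
            'minFreq': '0.0001', 'minNum': '10'})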
Example #5
def db_index(params):
    params = utils.load_paramDict(params)
    if params.get('update', False):
        summary_link = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt'
        summary_file = 'assembly_summary_refseq.txt'
        utils.get_file(summary_link, summary_file)
        params['seqlist'] = summary_file

    if params.get('update', False) or not os.path.isfile(
            os.path.join(params['taxonomy_db'],
                         'names.dmp')) or not os.path.isfile(
                             os.path.join(params['taxonomy_db'], 'nodes.dmp')):
        taxdump = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
        import tarfile
        utils.get_file(taxdump, 'taxdump.tar.gz')
        if not os.path.isdir(params['taxonomy_db']):
            os.makedirs(params['taxonomy_db'])

        with tarfile.open('taxdump.tar.gz', 'r') as tf:
            tf.extractall(path=params['taxonomy_db'])
        os.unlink('taxdump.tar.gz')

    assert 'seqlist' in params, 'use seqlist to bring in a list of genomes.'

    exist_db = os.path.join(params['dbname'], 'db_metadata.msg')
    existing, entries = load_data(exist_db, params['seqlist'], **params)
    entries = add_taxa_col(entries, **params)
    # Order entries so prokaryotes are indexed first: +100 if the superkingdom
    # is not Archaea/Bacteria, +10 for animal/plant kingdoms, +1 for the
    # largest-genome phyla; mergesort keeps the order stable within each class.
    phylum_order = [(m[0] not in ('Archaea', 'Bacteria')) * 100 +
                    (m[1] in ('Metazoa', 'Viridiplantae', 'nan')) * 10 +
                    (m[2] in ('nan', 'Chordata', 'Arthropoda', 'Streptophyta', 'Echinodermata', 'Platyhelminthes', 'Mollusca'))
                    for m in entries[params['taxa_columns'][-1:-4:-1]].as_matrix()]
    entries = entries.loc[np.argsort(phylum_order,
                                     kind='mergesort')].reset_index(drop=True)

    index_id = max(existing['index'].as_matrix().astype(
        int)) + 1 if existing.shape[0] > 0 else 0

    pool, batches = Pool(params['n_thread']), params['n_thread'] * 3

    # sha256 digests already in the database; used to skip duplicate genomes.
    sha_dict = {c: 1 for c in existing['sha256'].as_matrix()}
    sha_dict[''] = 1

    for group_id in np.arange(0, entries.shape[0], batches):
        inputs2 = pool.map(
            mash_proc,
            [[idx, record['file_path'], record['url_path'], params]
             for idx, record in entries.loc[group_id:(group_id + batches -
                                                      1)].iterrows()])
        inputs = []
        for i in inputs2:
            entries.loc[i[0], 'sha256'] = i[1]
            if i[1] not in sha_dict:
                # Unseen genome: assign it the next index and queue it.
                sha_dict[i[1]] = 1
                inputs.append(list(i) + [index_id])
                index_id += 1
            elif i[2] != '':
                # Duplicate of a genome already indexed; discard its file.
                os.unlink(i[2])
        if not len(inputs):
            continue

        if kill_signal:
            sys.exit(0)
        results = genotype_and_saving(inputs, pool, **params)

        for idx, size, c, fmsh, index_id2 in results:
            # Copy the row so the assignments below do not write into a view
            # of 'entries'.
            genome = entries.loc[idx].copy()
            genome['index'] = str(index_id2)
            genome['barcode'] = c
            genome['size'] = str(size)
            existing = existing.append(genome)
            os.unlink(fmsh)
            print time.strftime('%X %x %Z'), ':', genome['organism_name'], c
        existing.to_msgpack(exist_db)
        if kill_signal:
            sys.exit(0)
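
Two typical invocations, sketched with hypothetical values (the RefSeq and taxdump URLs are the ones hard-coded above):

# Hypothetical calls: refresh the RefSeq assembly summary and index what is
# new, or index a local, hand-made seqlist without touching NCBI.
db_index({'dbname': 'refseq', 'update': True, 'n_thread': 8})
db_index({'dbname': 'refseq', 'seqlist': 'my_genomes.tsv', 'n_thread': 8})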
Example #6
def db_MapDB(params):
    params = utils.load_paramDict(params)
    params['dbtype'] = params.get('dbtype', 'minimap2')
    # Note: the original test 'c not in ('sha256')' checked substring
    # membership in the string 'sha256'; a direct comparison is intended.
    db_columns = [
        c for c in params['db_columns'] + params['metadata_columns'] +
        params['taxa_columns'] if c != 'sha256'
    ]

    assert params.get('seqlist', None) is not None, 'seqlist is required. '

    data = utils.load_database(**params)

    if params['seqlist'] in ('stdin', '-', ''):
        fin = sys.stdin
    else:
        fin = open(params['seqlist'])
    glist = pd.read_csv(fin, delimiter='\t', dtype='str')
    fin.close()

    mapdb = params['MapDB']
    mapdb = os.path.join(params['bowtie_db'], mapdb)
    start_id = 0

    indices = {i: 1 for i in glist['index'].tolist()}

    if len(glob.glob(mapdb + '.*')) > 0:
        assert params.get('mode', '') in (
            'overwrite', 'append'
        ), 'Old database with same name present. You have to use a new name with "MapDB=", or choose between "mode=overwrite" and "mode=append".'
        if params.get('mode', '') == 'overwrite':
            for fname in glob.glob(mapdb + '.*'):
                os.unlink(fname)
        elif params.get('mode', '') == 'append':
            # Resume chunk numbering after the last existing chunk and mark
            # genomes already present so they are not packed a second time.
            for fname in glob.glob(mapdb + '.*.taxa.gz'):
                i = int(fname.rsplit('.', 3)[1])
                if i >= start_id:
                    start_id = i + 1
                with gzip.open(fname) as fin:
                    for line in fin:
                        indices[line.strip().split()[1]] = 2
    data = data.set_index('index', drop=False)
    data['size'] = data['size'].astype(int)
    data = data.loc[[i for i, t in indices.iteritems()
                     if t == 1]].sort_values(by=['size'], ascending=[False])
    # Cap each database chunk at 3.8e9 bases and compute the minimum number
    # of chunks needed for the total sequence size.
    min_file_num = int(np.ceil(
        np.sum(data['size']).astype(float) / 3800000000))

    # First-fit packing, largest genomes first: scan buckets round-robin from
    # the one after the last used, placing each genome in the first with room.
    buckets = [[0, []] for n in xrange(min_file_num)]
    id = -1
    for index, size, file_path, url_path in data[[
            'index', 'size', 'file_path', 'url_path'
    ]].values:
        size, done = int(size), 0
        for id in range(id + 1, len(buckets)) + range(id + 1):
            b = buckets[id]
            if b[0] + size <= 3800000000:
                b[0] += size
                b[1].append([index, size, file_path, url_path])
                done = 1
                break
        if done == 0:
            # No bucket can take this genome; open a new one.
            buckets.append([size, [[index, size, file_path, url_path]]])
    if params['dbtype'] == 'minimap2':
        pool = Pool(min(params['n_thread'], len(buckets)))
        result = pool.imap_unordered(create_db, [[
            params['minimap2'], mapdb, start_id + id, bucket[1],
            params['dbtype']
        ] for id, bucket in enumerate(buckets)])
    else:
        result = map(create_db, [[
            params['malt_build'], mapdb, start_id + id, bucket[1],
            params['dbtype']
        ] for id, bucket in enumerate(buckets)])
    for r in result:
        if r[2] != 0:
            print 'Database {0}.{1} FAILED with code {2}!'.format(*r)

    with open(mapdb + '.info', 'w') as fout:
        for id, bucket in enumerate(buckets):
            for b, _, _, _ in bucket[1]:
                fout.write('{0}\t{1}\n'.format(b, id + start_id))
    print 'Done'

if __name__ == '__main__':
    db_MapDB(
        dict([[k.strip() for k in arg.split('=', 1)]
              for arg in sys.argv[1:]]))
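
Since the __main__ guard parses 'key=value' arguments, a shell-level sketch (the script and file names are hypothetical):

# Hypothetical shell usage: pack the listed genomes into minimap2 chunks,
# appending to an existing MapDB rather than overwriting it.
#   python db_MapDB.py dbname=refseq seqlist=selected.tsv MapDB=representatives mode=append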