def query_metadata(params):
    # Query the genome metadata table and write the matching records as a
    # tab-delimited "seqlist" (to the file named by seqlist=, or to stdout).
    params = utils.load_paramDict(params)
    fout = open(params['seqlist'], 'w') if params.get('seqlist', None) is not None else sys.stdout
    data = utils.load_database(**params)
    db_columns = params['db_columns'] + params['metadata_columns'] + params['taxa_columns']
    if params.get('default', None) is not None:
        # Use one of the pre-defined filter sets, keyed by its MapDB name.
        tmp = {v['MapDB']: v for v in params['default_bowtie']}
        filters = tmp[params['default']]
    else:
        # Otherwise build the filter set from the supplied parameters.
        filters = {key: params[key]
                   for key in db_columns + ['name', 'tag', 'min', 'max', 'group']
                   if params.get(key, None) is not None}
    for fld, value in filters.iteritems():
        if fld in db_columns:
            data = data[data[fld].isin(value.split(','))]
        elif fld == 'min':
            data = data[data['size'].astype(int) >= int(value)]
        elif fld == 'max':
            data = data[data['size'].astype(int) <= int(value)]
        elif fld == 'group':
            data = data[data['barcode'].str.contains(value)]
        elif fld == 'tag':
            # Split each barcode "a1.b2.c3..." into its per-level numeric IDs,
            # then evaluate expressions such as "0==1" or "1!=2" on the levels.
            data = data.reset_index(drop=True)
            barcodes = pd.DataFrame(
                data['barcode'].apply(lambda barcode: [int(b[1:]) for b in barcode.split('.')]).tolist(),
                columns=params['barcode_tag'])
            for f in value.split(';'):
                f = f.strip()
                g1, g2 = f[0], f[-1]
                if f.find('==') > 0:
                    barcodes = barcodes[barcodes[g1] == barcodes[g2]]
                else:
                    barcodes = barcodes[barcodes[g1] != barcodes[g2]]
            data = data.loc[barcodes.index].reset_index(drop=True)
    data.to_csv(fout, index=False, sep='\t')
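
# A minimal sketch of the "tag=" mini-language handled above; it is not part
# of the pipeline, and the column names and data below are invented for
# illustration. Each barcode level becomes one integer column; "1==2" keeps
# rows whose second and third levels agree, "0!=1" keeps rows whose first two
# levels differ.
def _demo_tag_filter():
    import pandas as pd
    barcodes = pd.DataFrame(
        [[0, 1, 1], [0, 2, 2], [3, 3, 4]], columns=['0', '1', '2'])
    kept = barcodes[barcodes['1'] == barcodes['2']]   # tag=1==2
    kept = kept[kept['0'] != kept['1']]               # tag=0!=1
    return kept.index.tolist()                        # -> [0, 1] here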
def update(params):
    # Apply metadata edits from a tab-delimited table to the existing database
    # and report every modified field on stdout.
    params = utils.load_paramDict(params)
    assert 'seqlist' in params, 'Please feed in a tab-delimited table with "seqlist="'
    exist_db = os.path.join(params['dbname'], 'db_metadata.msg')
    modified = update_data(exist_db, params['seqlist'], **params)
    pd.DataFrame(modified, columns=['#index', 'field', 'oldValue', 'newValue']).to_csv(sys.stdout, index=False, sep='\t')
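
# Hypothetical invocation sketch (the database directory 'refseq' and the edit
# table 'edits.tsv' are assumed to exist; update_data defines the real table
# format). The call prints one row per changed field.
def _demo_update_call():
    update(dict(dbname='refseq', seqlist='edits.tsv'))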
def query_sample(params):
    # Find database genomes similar to a query assembly (or read set) with
    # mash, then group the hits by their hierarchical barcode clusters.
    params = utils.load_paramDict(params)
    assert 'query' in params and os.path.isfile(params['query']), 'no query'
    existing_data = os.path.join(params['dbname'], 'db_metadata.msg')
    assert os.path.isfile(existing_data), 'no data in the database.'
    data = pd.read_msgpack(existing_data)
    if params.get('dtype', 'fasta') == 'read':
        msh_file = utils.get_mash(params['query'], is_read=True, **params)
        result = utils.run_mash([msh_file, None, params['n_thread'], params], is_read=True)
    else:
        msh_file = utils.get_mash(params['query'], is_read=False, **params)
        result = utils.run_mash([msh_file, None, params['n_thread'], params])
    os.unlink(msh_file)
    if len(result) > 0:
        r_id = np.array([r[2] for r in result])               # mash distances
        result = np.array([r[1].split('.') for r in result])  # barcode levels of each hit
        # Keep hits whose similarity is within 98% of the best hit and whose
        # distance falls inside the first (widest) barcode cutoff.
        accepted = (1 - r_id >= 0.98 * (1 - r_id[0])) & (r_id <= params['barcode_dist'][0])
        result, r_id = result[accepted], r_id[accepted]
        groups = {}
        m = {'a' + k: [n, a] for k, n, a in data[['index', 'organism_name', 'assembly_accession']].as_matrix()}
        matches = [dict(record='.'.join(r), similarity=1 - i,
                        organism_name=m[r[-1]][0], assembly_accession=m[r[-1]][1])
                   for r, i in zip(result, r_id)]
        # Walk the barcode hierarchy level by level, blanking hits that fall
        # outside each level's distance cutoff, and record the best hit per group.
        for id, (dcut, dgroup) in enumerate(zip(params['barcode_dist'], result.T[:-1])):
            dgroup[r_id > dcut] = ''
            g = np.unique(dgroup, return_index=True)
            tags = ['.'.join(r) for r in result[g[1], :(id + 1)] if r[-1] != '']
            info = [[i, -id, '.'.join(hit)] for i, hit in zip(r_id[g[1]], result[g[1]]) if hit[id] != '']
            for t, i in zip(tags, info):
                groups[t] = i
        groups = [dict(group=c, similarity=1.0 - d[0])
                  for c, d in sorted(groups.iteritems(), key=lambda x: x[1])]
        for g in groups:
            g.update(utils.retrieve_info(g['group'], data=data, **params))
    else:
        groups, matches, result = [], [], 'unknown'
    print json.dumps(dict(groups=groups, matches=matches), sort_keys=True, indent=2)
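
# A toy illustration (invented numbers) of the hit-acceptance rule above: a
# hit is kept when its similarity is at least 98% of the best hit's similarity
# and its mash distance is below the widest barcode cutoff.
def _demo_hit_filter():
    import numpy as np
    r_id = np.array([0.01, 0.015, 0.30])   # mash distances, best hit first
    cutoff = 0.05                           # e.g. params['barcode_dist'][0]
    accepted = (1 - r_id >= 0.98 * (1 - r_id[0])) & (r_id <= cutoff)
    return accepted.tolist()                # -> [True, True, False]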
def query_read(params):
    # Map reads onto the reference MapDB(s) and run the staged profiling
    # pipeline; stage= selects the first stage to run, and each stage writes
    # the inputs of the next one into the workspace.
    params = utils.load_paramDict(params)
    params['bootstrap'] = int(params['bootstrap']) if 'bootstrap' in params else 0
    data = utils.load_database(**params)
    if params.get('stage', '0') in '0':
        bowtie2matrix(**params)
    if params.get('stage', '0') in '01':
        summary_matrix(data, **params)
    if params.get('stage', '0') in '012':
        qvector = ipopt(least_amount=[params['minFreq'], params['minNum']], **params)
    if params.get('stage', '0') in '0123':
        qvector = os.path.join(params['workspace'], 'ipopt.qmatrix.solution')
        assign_reads(data, qvector, **params)
    if params.get('stage', '0') in '01234':
        assign = os.path.join(params['workspace'], 'read_assignment.gz')
        profiling(data, assign, **params)
    import glob
    for fname in glob.glob(os.path.join(params['workspace'], 'r?.fastq')):
        subprocess.Popen(['gzip', '-f', fname]).communicate()
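
# The stage gating above relies on substring membership: stage '2' is in
# '012', '0123' and '01234', so every step from the ipopt solve onwards is
# re-run. A minimal sketch of the same idiom (the step names are invented):
def _demo_stage_gate(stage='2'):
    executed = []
    for step, stages in [('map', '0'), ('matrix', '01'), ('solve', '012'),
                         ('assign', '0123'), ('profile', '01234')]:
        if stage in stages:
            executed.append(step)
    return executed   # stage='2' -> ['solve', 'assign', 'profile']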
def db_index(params):
    # Build or extend the genome index: fetch the RefSeq assembly summary and
    # NCBI taxonomy dumps if requested, mash-sketch every new genome, assign
    # hierarchical barcodes, and append the records to db_metadata.msg.
    params = utils.load_paramDict(params)
    if params.get('update', False):
        summary_link = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt'
        summary_file = 'assembly_summary_refseq.txt'
        utils.get_file(summary_link, summary_file)
        params['seqlist'] = summary_file
    if (params.get('update', False)
            or not os.path.isfile(os.path.join(params['taxonomy_db'], 'names.dmp'))
            or not os.path.isfile(os.path.join(params['taxonomy_db'], 'nodes.dmp'))):
        taxdump = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
        import tarfile
        utils.get_file(taxdump, 'taxdump.tar.gz')
        if not os.path.isdir(params['taxonomy_db']):
            os.makedirs(params['taxonomy_db'])
        with tarfile.open('taxdump.tar.gz', 'r') as tf:
            tf.extractall(path=params['taxonomy_db'])
        os.unlink('taxdump.tar.gz')
    assert 'seqlist' in params, 'use seqlist= to bring in a list of genomes.'
    exist_db = os.path.join(params['dbname'], 'db_metadata.msg')
    existing, entries = load_data(exist_db, params['seqlist'], **params)
    entries = add_taxa_col(entries, **params)
    # Sort the entries so that Bacteria/Archaea come first, then other
    # microbes, then plants and animals; the stable mergesort keeps the input
    # order within each class.
    phylum_order = [(m[0] not in ('Archaea', 'Bacteria')) * 100 +
                    (m[1] in ('Metazoa', 'Viridiplantae', 'nan')) * 10 +
                    (m[2] in ('nan', 'Chordata', 'Arthropoda', 'Streptophyta',
                              'Echinodermata', 'Platyhelminthes', 'Mollusca'))
                    for m in entries[params['taxa_columns'][-1:-4:-1]].as_matrix()]
    entries = entries.loc[np.argsort(phylum_order, kind='mergesort')].reset_index(drop=True)
    index_id = max(existing['index'].as_matrix().astype(int)) + 1 if existing.shape[0] > 0 else 0
    pool, batches = Pool(params['n_thread']), params['n_thread'] * 3
    # Skip genomes whose sequence sha256 is already in the database.
    sha_dict = {c: 1 for c in existing['sha256'].as_matrix()}
    sha_dict[''] = 1
    for group_id in np.arange(0, entries.shape[0], batches):
        inputs2 = pool.map(mash_proc,
                           [[idx, record['file_path'], record['url_path'], params]
                            for idx, record in entries.loc[group_id:(group_id + batches - 1)].iterrows()])
        inputs = []
        for i in inputs2:
            entries.loc[i[0], 'sha256'] = i[1]
            if i[1] not in sha_dict:
                sha_dict[i[1]] = 1
                inputs.append(list(i) + [index_id])
                index_id += 1
            elif i[2] != '':
                os.unlink(i[2])
        if not len(inputs):
            continue
        if kill_signal:
            sys.exit(0)
        results = genotype_and_saving(inputs, pool, **params)
        for idx, size, c, fmsh, index_id2 in results:
            genome = entries.loc[idx]
            genome['index'] = str(index_id2)
            genome['barcode'] = c
            genome['size'] = str(size)
            existing = existing.append(genome)
            os.unlink(fmsh)
            print time.strftime('%X %x %Z'), ':', genome['organism_name'], c
        existing.to_msgpack(exist_db)
        if kill_signal:
            sys.exit(0)
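
# A toy version (made-up taxon rows) of the three-digit sort key used above:
# the hundreds digit demotes non-prokaryotes, the tens digit demotes animals,
# plants and unclassified kingdoms, and the units digit demotes a fixed list
# of large eukaryotic phyla.
def _demo_phylum_order():
    rows = [('Bacteria', 'nan', 'Proteobacteria'),
            ('Eukaryota', 'Metazoa', 'Chordata'),
            ('Eukaryota', 'Fungi', 'Ascomycota')]
    keys = [(m[0] not in ('Archaea', 'Bacteria')) * 100 +
            (m[1] in ('Metazoa', 'Viridiplantae', 'nan')) * 10 +
            (m[2] in ('nan', 'Chordata', 'Arthropoda', 'Streptophyta',
                      'Echinodermata', 'Platyhelminthes', 'Mollusca'))
            for m in rows]
    return keys   # -> [10, 111, 100]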
def db_MapDB(params):
    # Pack the selected genomes into mapping databases (minimap2 or MALT),
    # keeping each sub-database under ~3.8 Gbp of sequence.
    params = utils.load_paramDict(params)
    params['dbtype'] = params.get('dbtype', 'minimap2')
    db_columns = [c for c in params['db_columns'] + params['metadata_columns'] + params['taxa_columns']
                  if c not in ('sha256',)]
    assert params.get('seqlist', None) is not None, 'seqlist is required.'
    data = utils.load_database(**params)
    if params['seqlist'] in ('stdin', '-', ''):
        fin = sys.stdin
    else:
        fin = open(params['seqlist'])
    glist = pd.read_csv(fin, delimiter='\t', dtype='str')
    fin.close()
    mapdb = os.path.join(params['bowtie_db'], params['MapDB'])
    start_id = 0
    indices = {i: 1 for i in glist['index'].tolist()}
    if len(glob.glob(mapdb + '.*')) > 0:
        assert params.get('mode', '') in ('overwrite', 'append'), \
            'Old database with the same name present. Use a new name with "MapDB=", or choose between "mode=overwrite" and "mode=append".'
        if params.get('mode', '') == 'overwrite':
            for fname in glob.glob(mapdb + '.*'):
                os.unlink(fname)
        elif params.get('mode', '') == 'append':
            # Find the next free sub-database id and mark genomes that are
            # already present so they are not packed twice.
            for fname in glob.glob(mapdb + '.*.taxa.gz'):
                i = int(fname.rsplit('.', 3)[1])
                if i >= start_id:
                    start_id = i + 1
                with gzip.open(fname) as fin:
                    for line in fin:
                        indices[line.strip().split()[1]] = 2
    data = data.set_index('index', drop=False)
    data['size'] = data['size'].astype(int)
    data = data.loc[[i for i, t in indices.iteritems() if t == 1]].sort_values(by=['size'], ascending=[False])
    # First-fit decreasing bin packing: place each genome (largest first) into
    # the next bucket, scanning round-robin, that still has room for it.
    min_file_num = int(np.ceil(np.sum(data['size']).astype(float) / 3800000000))
    buckets = [[0, []] for n in xrange(min_file_num)]
    id = -1
    for index, size, file_path, url_path in data[['index', 'size', 'file_path', 'url_path']].values:
        size, done = int(size), 0
        for id in range(id + 1, len(buckets)) + range(id + 1):
            b = buckets[id]
            if b[0] + size <= 3800000000:
                b[0] += size
                b[1].append([index, size, file_path, url_path])
                done = 1
                break
        if done == 0:
            buckets.append([size, [[index, size, file_path, url_path]]])
    if params['dbtype'] == 'minimap2':
        pool = Pool(min(params['n_thread'], len(buckets)))
        result = pool.imap_unordered(create_db,
                                     [[params['minimap2'], mapdb, start_id + id, bucket[1], params['dbtype']]
                                      for id, bucket in enumerate(buckets)])
    else:
        result = map(create_db,
                     [[params['malt_build'], mapdb, start_id + id, bucket[1], params['dbtype']]
                      for id, bucket in enumerate(buckets)])
    for r in result:
        if r[2] != 0:
            print 'Database {0}.{1} FAILED with code {2}!'.format(*r)
    with open(mapdb + '.info', 'w') as fout:
        for id, bucket in enumerate(buckets):
            for b, _, _, _ in bucket[1]:
                fout.write('{0}\t{1}\n'.format(b, id + start_id))
    print 'Done'


if __name__ == '__main__':
    db_MapDB(dict([[k.strip() for k in arg.split('=', 1)] for arg in sys.argv[1:]]))
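
# A compact illustration (invented sizes) of the bucket-packing loop in
# db_MapDB: genomes are taken largest-first and dropped into the next bucket,
# scanning round-robin from the last one used, that still has room; a genome
# that fits nowhere opens a new bucket. Capacity here is 10 instead of 3.8 Gbp.
def _demo_bucket_packing(sizes=(9, 7, 4, 3, 1), capacity=10, min_buckets=2):
    buckets = [[0, []] for _ in range(min_buckets)]
    id = -1
    for size in sorted(sizes, reverse=True):
        done = 0
        for id in list(range(id + 1, len(buckets))) + list(range(id + 1)):
            if buckets[id][0] + size <= capacity:
                buckets[id][0] += size
                buckets[id][1].append(size)
                done = 1
                break
        if not done:
            buckets.append([size, [size]])
    return [b[1] for b in buckets]   # -> [[9, 1], [7], [4, 3]]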