def sng_to_queue(q, processes=30, file=input_dir):
    """Seed *q* with the input file path, then fan out one job per line.

    Parameters
    ----------
    q : multiprocessing.Manager().Queue
        Shared queue; the file path is put on it first, then each worker
        receives ``(line index, q)`` via ``sng_from_line_2_queue``.
    processes : int
        Size of the worker pool.
    file : str
        Path of the input file to process.
    """
    # Consumers expect the file path as the very first queue item.
    q.put(file)
    worker_pool = Pool(processes=processes)
    total_lines = get_num_lines(file)
    for line_no in range(total_lines):
        worker_pool.apply_async(sng_from_line_2_queue, (line_no, q))
    # Stop accepting work, then block until every submitted job finishes.
    worker_pool.close()
    worker_pool.join()
def data_from_queue(
        q,
        map_file,
        idx_file,
        print_step=5000,
):
    """Collect scaffold results from the queue and serialize them to disk.

    The first item on *q* is the input file path; every following item is a
    ``(mol_index, sng)`` tuple where ``sng`` is either ``None`` or an
    iterable of ``(scaffold_smiles, atom_idxs, nh_idxs, np_idxs)`` tuples
    (shape assumed from the field accesses below — TODO confirm against the
    producer, ``sng_from_line_2_queue``).

    Parameters
    ----------
    q : queue
        Queue fed by the scaffold-extraction workers.
    map_file : str
        Output path for the serialized ``DicIdxLs`` message
        (scaffold idx -> per-molecule atom index lists).
    idx_file : str
        Output path for the serialized ``DicIdxSm`` message
        (scaffold idx -> scaffold SMILES).
    print_step : int
        Progress is printed every *print_step* processed items.
    """
    dic_scaffold = DicIdxLs()
    dic_sm_idx = DicSmIdx()
    # First queue item is the input file path (put there by the producer).
    file = q.get()
    print("Extracting scaffolds from " + file)  # was missing the space
    num_lines = get_num_lines(file)
    idx_sc = 0  # next free scaffold index
    for i in range(num_lines):
        if i % print_step == 0:
            print(i)
        mol_index, sng = q.get()
        if sng is None:
            continue
        try:
            for sng_i in sng:
                sng_pb = TupMolLsatom()
                sng_pb.idx_mol = mol_index
                sng_pb.ls_atom.idx_atom.extend(sng_i[1])
                sng_pb.ls_nh.idx_atom.extend(sng_i[2])
                sng_pb.ls_np.idx_atom.extend(sng_i[3])
                # Assign a fresh index the first time a SMILES is seen.
                if sng_i[0] not in dic_sm_idx.sm_sc:
                    dic_sm_idx.sm_sc[sng_i[0]] = idx_sc
                    idx_sc += 1
                dic_scaffold.smiles_scaffold[
                    dic_sm_idx.sm_sc[sng_i[0]]].dic_mol_atoms.extend([sng_pb])
        except Exception:
            # Best-effort: skip malformed entries instead of aborting the
            # whole run (was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit).
            continue
    # Invert SMILES -> index into index -> SMILES for the companion file.
    dic_idx_sm = DicIdxSm()
    for k, v in dic_sm_idx.sm_sc.items():
        dic_idx_sm.sm_sc[v] = k
    with open(map_file, 'wb') as f:
        f.write(dic_scaffold.SerializeToString())
    with open(idx_file, 'wb') as f:
        f.write(dic_idx_sm.SerializeToString())
def sql_from_queue(
        q,
        dic_path,
        db_name,
        print_step=5000,
):
    """Drain ``(mol_index, sng)`` tuples from *q* into a PostgreSQL schema.

    (Re)creates three tables — ``mols``, ``scaffolds``, ``maps`` — then
    inserts one row per molecule and one ``maps`` row per extracted
    scaffold occurrence. All values are passed as bound parameters
    (psycopg2 ``%s`` placeholders) instead of f-string interpolation:
    SMILES strings may contain quotes, which previously broke the
    statement and was an SQL-injection hazard.

    Parameters
    ----------
    q : queue
        First item is the input file path; subsequent items are
        ``(mol_index, sng)`` tuples from the workers.
    dic_path : str
        Path to a JSON file holding psycopg2 connection keyword arguments.
    db_name : str
        Unused; kept for backward compatibility with existing callers.
    print_step : int
        Progress is printed every *print_step* molecules.
    """
    with open(dic_path) as f:
        dic = json.load(f)
    file = q.get()
    num_lines = get_num_lines(file)
    # One connection for the whole run instead of reconnecting per row.
    conn = psycopg2.connect(**dic)
    cur = conn.cursor()
    try:
        # (Re)create the schema; drop order respects the FK constraints.
        cur.execute(
            '''
            drop table if exists maps;
            drop table if exists scaffolds;
            drop table if exists mols;
            CREATE TABLE scaffolds (id integer PRIMARY KEY, smiles varchar);
            CREATE TABLE mols (id integer PRIMARY KEY, smiles varchar);
            CREATE TABLE maps (
                id integer PRIMARY KEY,
                sc_id integer references scaffolds(id),
                mol_id integer references mols(id),
                ls_atom_idx varchar,
                ls_nh varchar,
                ls_np varchar
            );
            '''
        )
        conn.commit()
        map_id = 0
        sc_key = 0  # next free scaffold id
        for i in range(num_lines):
            if i % print_step == 0:
                print(i)
            mol_index, sng = q.get()
            mol_smiles = smiles_from_line(mol_index, file).strip()
            cur.execute(
                "insert into mols(id, smiles) values (%s, %s);",
                (mol_index, mol_smiles),
            )
            conn.commit()
            if sng is None:
                continue
            for i_sng in sng:
                # One round trip: fetchone() is None iff the scaffold is
                # unseen (replaces the count(*) + second select pair).
                cur.execute(
                    "select id from scaffolds where smiles = %s;",
                    (i_sng[0],),
                )
                row = cur.fetchone()
                if row is None:
                    sc_id = sc_key
                    cur.execute(
                        "insert into scaffolds(id, smiles) values (%s, %s);",
                        (sc_id, i_sng[0]),
                    )
                    conn.commit()
                    sc_key += 1
                else:
                    sc_id = row[0]
                cur.execute(
                    "insert into maps(id, sc_id, mol_id, ls_atom_idx,"
                    " ls_nh, ls_np) values (%s, %s, %s, %s, %s, %s);",
                    (
                        map_id,
                        sc_id,
                        mol_index,
                        str(i_sng[1]),
                        str(i_sng[2]),
                        str(i_sng[3]),
                    ),
                )
                conn.commit()
                map_id += 1
    finally:
        # Release the connection even if an insert raises mid-run.
        cur.close()
        conn.close()