def read_and_write(inp_mol2_path, out_mol2_path, verbose):
    if verbose:
        sys.stdout.write('Processing %s' % os.path.basename(inp_mol2_path))
        sys.stdout.flush()
        start = time.time()

    # Gzipped output must be opened with gzip and written in binary mode.
    if inp_mol2_path.endswith('.gz'):
        write_mode = 'wb'
        open_file = gzip.open
    else:
        write_mode = 'w'
        open_file = open

    with open_file(out_mol2_path, write_mode) as outfile:
        prev_molecule = ''
        if inp_mol2_path.endswith('.gz'):
            # Gzipped input yields bytes IDs and lines.
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1
                mol_idx = b'%s_%d' % (id_, cnt)
                cont[1] = mol_idx + b'\n'
                outfile.write(b''.join(cont))
                prev_molecule = id_
        else:
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1
                mol_idx = '%s_%d' % (id_, cnt)
                cont[1] = mol_idx + '\n'
                outfile.write(''.join(cont))
                prev_molecule = id_

    if verbose:
        elapsed = time.time() - start
        n_molecules = i + 1
        sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                         (n_molecules, n_molecules / elapsed))
        sys.stdout.flush()
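# A minimal sketch of the split_multimol2 contract these functions rely on,
# using a hypothetical input file 'molecules.mol2': the generator lazily yields
# [molecule_id, lines] pairs, and for '.gz' input both elements are bytes
# rather than str, which is why read_and_write branches on the file suffix.
from biopandas.mol2 import split_multimol2

for mol2_id, mol2_lines in split_multimol2('molecules.mol2'):
    # mol2_id is the identifier line that follows '@<TRIPOS>MOLECULE';
    # mol2_lines holds the raw lines of that record, newlines included.
    print(mol2_id, len(mol2_lines))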
def read_and_write(mol2_files, id_file_path, verbose, n_cpus):
    if verbose:
        sys.stdout.write('Using selection: %s\n' % SELECTION)
        sys.stdout.flush()

    with open(id_file_path, 'w') as f:
        for mol2_file in mol2_files:
            if verbose:
                start = time.time()
                sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
                sys.stdout.flush()

            cnt = 0
            # Gzipped multi-mol2 files yield bytes, so they need a separate processor.
            if mol2_file.endswith('.gz'):
                data_processor_fn = data_processor_gz
            else:
                data_processor_fn = data_processor

            for chunk in lazy_imap(data_processor=data_processor_fn,
                                   data_generator=split_multimol2(mol2_file),
                                   n_cpus=n_cpus):
                for mol2_id in chunk:
                    if mol2_id:
                        f.write('%s\n' % mol2_id)
                cnt += len(chunk)

            if verbose:
                elapsed = time.time() - start
                sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
                sys.stdout.flush()
def process_ligands(target):
    """Get information for all ligands associated with the target."""
    ligand_list = []
    num_atoms_target = len(protein_dict[target]["node_features"])
    for fname in ["actives_final.mol2", "decoys_final.mol2"]:
        # Label: 1 for actives, 0 for decoys.
        response = int(fname.startswith("a"))
        # Split the mol2 file with multiple ligands by ligand. This yields a
        # list of pair sub-lists, the first element of which is the ligand
        # code, and the second of which is the associated coordinate and
        # bond text.
        curr_info = list(split_multimol2(f"raw/{target}/{fname}"))
        curr_info = [[f"{target}_{entry[0]}"]
                     + process_ligand_text(entry[1], num_atoms_target)
                     + [[[response]]]
                     for entry in curr_info]
        curr_info = [
            dict(zip(("id", "node_features", "graph", "targets"), curr_info[i]))
            for i in range(len(curr_info))
        ]
        ligand_list += curr_info
    return ligand_list
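# Sketch of how the dictionaries returned by process_ligands might be consumed,
# assuming a hypothetical DUD-E style target name "abl1" and that
# process_ligand_text returns [node_features, graph] as the key order above
# implies; the value stored under "targets" is [[1]] for actives and [[0]]
# for decoys.
for ligand in process_ligands("abl1"):
    label = ligand["targets"][0][0]
    print(ligand["id"], "active" if label else "decoy")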
def ligands_reader():
    '''
    Parses the selected MOL2 file with structures of previously docked
    ligands using the BioPandas module. Lists all atoms from all ligands
    with their coordinates.

    :return: atom symbols, numbers, and coordinates, plus the ligand model number
    :rtype: list of lists
    '''
    window = Tk()
    path = os.path.normpath(os.getcwd() + os.sep + os.pardir)
    path = os.path.join(path, 'files')
    ligands_path_string = filedialog.askopenfilename(
        initialdir=path,
        title="SELECT LIGANDS STRUCTURE:",
        filetypes=(("MOL2 files", "*.mol2"), ("all files", "*.*")))
    ligands_name = os.path.basename(ligands_path_string)
    window.destroy()

    ligands_data = []
    model_number = 1
    # split_multimol2 opens the file itself, so no separate open() is needed.
    for ligand in split_multimol2(ligands_path_string):
        pmol = PandasMol2().read_mol2_from_list(mol2_lines=ligand[1],
                                                mol2_code=ligand[0])
        atom_coord = pmol.df[['atom_name', 'atom_id', 'x', 'y', 'z']]
        atom_coord = atom_coord.assign(column=model_number)
        model_number += 1
        model_data = atom_coord.values.tolist()
        ligands_data = ligands_data + model_data
    return ligands_data
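# Optional follow-up sketch: the list of lists returned by ligands_reader() can
# be wrapped in a DataFrame for inspection. The column names mirror the slice
# taken above plus the appended model number (the field added via assign()).
import pandas as pd

ligand_atoms = ligands_reader()
atoms_df = pd.DataFrame(
    ligand_atoms,
    columns=['atom_name', 'atom_id', 'x', 'y', 'z', 'model_number'])
print(atoms_df.groupby('model_number').size())  # atoms per docked ligand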
def read_and_write(q_path, d_path, verbose, cache, output_file, n_cpus):
    dct_results = {'dbase': [], 'query': [], 'atoms': [], 'charges': []}

    d_base = os.path.basename(d_path)
    q_base = os.path.basename(q_path)

    if verbose:
        start = time.time()
        sys.stdout.write('Processing %s/%s' % (d_base, q_base))
        sys.stdout.flush()

    cnt = 0
    if q_path.endswith('.gz'):
        data_processor_fn = data_processor_gz
    else:
        data_processor_fn = data_processor

    for chunk in lazy_imap(data_processor=data_processor_fn,
                           data_generator=zip(split_multimol2(d_path),
                                              split_multimol2(q_path)),
                           n_cpus=n_cpus):
        for dbase_id, query_id, atoms, charges in chunk:
            dct_results['dbase'].append(dbase_id)
            dct_results['query'].append(query_id)
            dct_results['atoms'].append(atoms)
            dct_results['charges'].append(charges)
        cnt += len(chunk)

    # Serial (single-process) variant of the loop above, left as a reference:
    """
    q_pdmol = PandasMol2()
    d_pdmol = PandasMol2()

    for q_mol2, d_mol2 in zip(split_multimol2(q_path),
                              split_multimol2(d_path)):
        cnt += 1
        d_pdmol.read_mol2_from_list(mol2_code=d_mol2[0], mol2_lines=d_mol2[1])
        d_pdmol._df = d_pdmol.df[(d_pdmol.df['atom_type'] != 'H')]

        if q_mol2[0] in cache:
            q_pdmol = cache[q_mol2[0]]
        else:
            q_pdmol.read_mol2_from_list(mol2_code=q_mol2[0], mol2_lines=q_mol2[1])
            q_pdmol._df = q_pdmol.df[(q_pdmol.df['atom_type'] != 'H')]
            cache[q_mol2[0]] = q_pdmol

        atoms, charges = get_atom_matches(q_pdmol, d_pdmol)
        dct_results['query'].append(q_mol2[0])
        dct_results['dbase'].append(d_mol2[0])
        dct_results['atoms'].append(atoms)
        dct_results['charges'].append(charges)
    """

    with open(output_file + '_charge.tsv', 'w') as f1,\
            open(output_file + '_atomtype.tsv', 'w') as f2:

        columns = PandasMol2().read_mol2(q_path).df['atom_name'].values
        f1.write('dbase\tquery\t%s\n' % '\t'.join(columns))
        f2.write('dbase\tquery\t%s\n' % '\t'.join(columns))

        for i in range(len(dct_results['dbase'])):
            s1 = '%s\t%s\t%s\n' % (dct_results['dbase'][i],
                                   dct_results['query'][i],
                                   '\t'.join(format(x, "1.2f")
                                             for x in dct_results['charges'][i]))
            f1.write(s1)
            s2 = '%s\t%s\t%s\n' % (dct_results['dbase'][i],
                                   dct_results['query'][i],
                                   '\t'.join(dct_results['atoms'][i]))
            f2.write(s2)

    if verbose:
        elapsed = time.time() - start
        n_molecules = cnt + 1
        sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                         (n_molecules, n_molecules / elapsed))
        sys.stdout.flush()
def read_and_write(inp_mol2_path, report_path, output_dir, query_path,
                   sortby, separator, verbose, id_suffix, selection):
    if verbose:
        sys.stdout.write('Processing %s' % os.path.basename(inp_mol2_path))
        sys.stdout.flush()

    df = pd.read_table(report_path,
                       usecols=['Name', 'ShapeQuery'] + sortby,
                       sep=separator)
    if sortby:
        df.sort_values(sortby, inplace=True, ascending=False)

    if selection:
        selection_str = parse_selection_string(selection, df_name='df')
        mask = pd.eval(selection_str)
        df = df[mask]

    dbase_query_pairs = [(d, q) for d, q in
                         zip(df['Name'].values, df['ShapeQuery'].values)]
    query_names = {q for q in df['ShapeQuery'].values}
    query_mol2s = {}

    # The query file is treated as multi-conformer if it holds more than one record.
    multiconf_query = False
    for idx, cont in enumerate(split_multimol2(query_path)):
        if idx >= 1:
            multiconf_query = True
            break

    cnt = -1
    if query_path.endswith('.gz'):
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            # Gzipped input yields bytes; decode to text and split into lines
            # (the trailing newlines are removed by the split).
            cont = b''.join(cont).decode('utf-8').split('\n')
            if multiconf_query:
                mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
            else:
                mol_idx = id_.decode('utf-8')
            if mol_idx in query_names:
                if id_suffix:
                    cont[1] = mol_idx
                query_mol2s[mol_idx] = '\n'.join(cont)
    else:
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            if multiconf_query:
                mol_idx = '%s_%d' % (id_, cnt)
            else:
                mol_idx = id_
            if mol_idx in query_names:
                if id_suffix:
                    cont[1] = mol_idx + '\n'
                query_mol2s[mol_idx] = ''.join(cont)

    out_path_base = os.path.join(
        output_dir, os.path.basename(inp_mol2_path).split('.mol2')[0])
    out_path_q = '%s_%s' % (out_path_base, 'query.mol2')
    out_path_d = '%s_%s' % (out_path_base, 'dbase.mol2')

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Cache each database molecule on disk so it can be written out in the
        # order given by the (sorted and filtered) report.
        for id_, cont in split_multimol2(inp_mol2_path):
            if id_:
                tmp_path = os.path.join(tmpdirname, id_)
                with open(tmp_path, 'wb') as f:
                    pickle.dump(''.join(cont), f)

        with open(out_path_d, 'w') as dof,\
                open(out_path_q, 'w') as qof:

            if verbose:
                start = time.time()

            cnt = 0
            for d, q in dbase_query_pairs:
                cnt += 1
                qof.write(query_mol2s[q])
                with open(os.path.join(tmpdirname, d), 'rb') as pkl:
                    pkl_cont = pickle.load(pkl)
                dof.write(pkl_cont)

    if verbose:
        elapsed = time.time() - start
        n_molecules = cnt + 1
        sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                         (n_molecules, n_molecules / elapsed))
        sys.stdout.flush()
def main(input_dir, output_dir, atomtype_selection, charge_selection,
         input_mol2, verbose):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    all_tsv_base = [f for f in os.listdir(input_dir) if f.endswith('.tsv')]
    all_tsv_full = [os.path.join(input_dir, f) for f in all_tsv_base]

    a_inlist, c_inlist = get_tsv_pairs(all_tsv_full)
    a_outlist, c_outlist = get_tsv_pairs(all_tsv_base)
    a_outlist = [os.path.join(output_dir, f) for f in a_outlist]
    c_outlist = [os.path.join(output_dir, f) for f in c_outlist]

    for a_in, a_out, c_in, c_out in zip(a_inlist, a_outlist,
                                        c_inlist, c_outlist):

        if verbose:
            start = time.time()
            sys.stdout.write('Processing %s/%s' % (os.path.basename(a_in),
                                                   os.path.basename(c_in)))
            sys.stdout.flush()

        df_charge = pd.read_table(c_in, sep='\t')
        for c in df_charge.columns[2:]:
            df_charge[c] = pd.to_numeric(df_charge[c])
        df_atom = pd.read_table(a_in, sep='\t')
        mol2_cnt = df_atom.shape[0]

        if atomtype_selection:
            atom_sele = parse_selection_string(s=atomtype_selection,
                                               columns=df_atom.columns,
                                               df_name='df_atom')
            for sele in atom_sele:
                df_atom = pd.eval(sele)
        if charge_selection:
            charge_sele = parse_selection_string(s=charge_selection,
                                                 columns=df_charge.columns,
                                                 df_name='df_charge')
            for sele in charge_sele:
                df_charge = pd.eval(sele)

        # Keep only rows that satisfy both the atom-type and the charge selection.
        selection_indices = set(df_charge.index).intersection(
            set(df_atom.index))
        selection_indices = sorted(list(selection_indices))

        df_atom.loc[selection_indices].to_csv(a_out, sep='\t')
        df_charge.loc[selection_indices].to_csv(c_out, sep='\t')

        if input_mol2:
            input_mol2_path_query = os.path.join(
                input_mol2,
                os.path.basename(c_out).replace('_charge.tsv', '_query.mol2'))
            input_mol2_path_dbase = input_mol2_path_query.replace(
                '_query.mol2', '_dbase.mol2')

            if not os.path.exists(input_mol2_path_query)\
                    and os.path.exists(input_mol2_path_query + '.gz'):
                input_mol2_path_query += '.gz'
            if not os.path.exists(input_mol2_path_dbase)\
                    and os.path.exists(input_mol2_path_dbase + '.gz'):
                input_mol2_path_dbase += '.gz'

            output_mol2_path_query = os.path.join(
                output_dir,
                os.path.basename(c_out).replace('_charge.tsv', '_query.mol2'))
            output_mol2_path_dbase = output_mol2_path_query.replace(
                '_query.mol2', '_dbase.mol2')

            if input_mol2_path_query.endswith('.gz'):
                output_mol2_path_query += '.gz'
                query_write_mode = 'wb'
                query_open_file = gzip.open
            else:
                query_write_mode = 'w'
                query_open_file = open

            if input_mol2_path_dbase.endswith('.gz'):
                output_mol2_path_dbase += '.gz'
                dbase_write_mode = 'wb'
                dbase_open_file = gzip.open
            else:
                dbase_write_mode = 'w'
                dbase_open_file = open

            with query_open_file(output_mol2_path_query,
                                 query_write_mode) as opq,\
                    dbase_open_file(output_mol2_path_dbase,
                                    dbase_write_mode) as opd:

                for i in selection_indices:
                    mol2_q_cont = ('DID NOT FIND %s\n'
                                   % (df_atom.loc[i]['query']))
                    mol2_d_cont = ('DID NOT FIND %s\n'
                                   % (df_atom.loc[i]['dbase']))

                    # Scan the paired mol2 files for the record at position i.
                    for idx, mol2 in enumerate(
                            split_multimol2(input_mol2_path_query)):
                        if idx == i:
                            mol2_q_cont = mol2[1]
                            break
                    for idx, mol2 in enumerate(
                            split_multimol2(input_mol2_path_dbase)):
                        if idx == i:
                            mol2_d_cont = mol2[1]
                            break

                    if query_write_mode == 'wb':
                        opq.write(b''.join(mol2_q_cont))
                    else:
                        opq.write(''.join(mol2_q_cont))
                    if dbase_write_mode == 'wb':
                        opd.write(b''.join(mol2_d_cont))
                    else:
                        opd.write(''.join(mol2_d_cont))

        if verbose:
            elapsed = time.time() - start
            n_molecules = mol2_cnt
            sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                             (n_molecules, n_molecules / elapsed))
            sys.stdout.flush()
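# Usage sketch for main() above with hypothetical directory names; both
# selection arguments are left empty here because the selection-string grammar
# is defined by parse_selection_string elsewhere in the project.
main(input_dir='tsv_results',
     output_dir='tsv_filtered',
     atomtype_selection='',
     charge_selection='',
     input_mol2='mol2_pairs',
     verbose=1)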
def main(config):
    start_time = time.time()
    env_path = config
    load_dotenv(dotenv_path=env_path)

    converted_mols = {}
    pdbs = {}

    input_list = pd.read_csv(os.getenv('INPUT_LIST'), sep=';')
    mols2 = split_multimol2(os.getenv('MOL2_FILE'))
    pdb_path = os.getenv('PDBS_FILE_FOLDER')

    # Load every ensemble PDB, keyed by the numeric suffix of its file name.
    for pdb in os.listdir(pdb_path):
        name = pdb.split('_')[-1].replace('.pdb', '')
        pdbs[name] = PandasPdb().read_pdb('{}/{}'.format(pdb_path, pdb)).df

    # Index the docked poses by the second '|'-separated field of the mol2 code.
    for mol2 in mols2:
        pmol = PandasMol2().read_mol2_from_list(mol2_lines=mol2[1],
                                                mol2_code=mol2[0])
        converted_mols[pmol.code.split("|")[1]] = pmol.df

    def create_atom_dimensional_position(x, y, z):
        return np.array((x, y, z))

    def atom_is_close_to_atom(residue_atom, atom):
        return np.linalg.norm(residue_atom - atom) <= float(
            os.getenv('MAX_DISTANCE'))

    def remove_hydrogen_atoms(df):
        # Drops every ATOM record whose atom name contains an 'H'.
        return df['ATOM'][~df['ATOM']['atom_name'].str.contains('H')]

    def get_interactions_molecule_for_residues(molecule, residues):
        matched_atoms = []
        mol_points = []
        for midx, mol_row in molecule.iterrows():
            mol_points.append(
                create_atom_dimensional_position(
                    x=mol_row['x'],
                    y=mol_row['y'],
                    z=mol_row['z'],
                ))
        for pidx, protein_row in remove_hydrogen_atoms(residues).iterrows():
            protein_tag = '{}_{}'.format(protein_row['residue_name'],
                                         protein_row['residue_number'])
            protein_ad = create_atom_dimensional_position(
                x=protein_row['x_coord'],
                y=protein_row['y_coord'],
                z=protein_row['z_coord'])
            for point in mol_points:
                if atom_is_close_to_atom(
                        protein_ad, point) and protein_tag not in matched_atoms:
                    matched_atoms.append(protein_tag)
        return matched_atoms

    def interact_with_expected_residues(item, expected_residues):
        print('> {}'.format(item['NAME']))
        m = converted_mols[item['NAME']]
        r = pdbs[str(int(item['Gold.Ensemble.ID']))]
        interactions = get_interactions_molecule_for_residues(m, r)
        return [key for key in interactions if key in expected_residues]

    pre_dataframe = pd.DataFrame(columns=[
        'molecule_name', 'pdb', 'score', 'residues', 'residues_quantity'
    ])

    print('Start Molecule Analyze')
    for i, row in input_list.iterrows():
        reactions = interact_with_expected_residues(
            row, list(os.getenv('RESIDUES').split(",")))
        # Note: DataFrame.append requires pandas < 2.0; newer pandas needs pd.concat.
        pre_dataframe = pre_dataframe.append(
            {
                'molecule_name': row['NAME'],
                'pdb': row['Gold.Ensemble.ID'],
                'score': float(row['Gold.Chemscore.Fitness']),
                'residues': ", ".join(reactions),
                'residues_quantity': len(reactions),
            },
            ignore_index=True)

    dataframe = pd.DataFrame(columns=[
        'molecule_name', 'v', 'pdb', 'score', 'residues', 'residues_quantity'
    ])

    # Merge variants of the same molecule (names differing only by an
    # uppercase letter tag), keeping the best-scoring pose.
    cached_mol_names = []
    for i, row in pre_dataframe.iterrows():
        new_name = re.sub(r'[A-Z]', '', row['molecule_name'])
        _var = re.search(r'[A-Z]', row['molecule_name'])
        _var = _var.group(0) if _var else ''
        if new_name in cached_mol_names:
            old_score = float(dataframe.loc[dataframe['molecule_name'] ==
                                            new_name]['score'].values[0])
            idx = dataframe.index[dataframe['molecule_name'] == new_name][0]
            if old_score < row['score']:
                dataframe.loc[idx, 'v'] = _var
                dataframe.loc[idx, 'pdb'] = row['pdb']
                dataframe.loc[idx, 'score'] = row['score']
                dataframe.loc[idx, 'residues'] = row['residues']
                dataframe.loc[idx, 'residues_quantity'] = row['residues_quantity']
        else:
            cached_mol_names.append(new_name)
            dataframe = dataframe.append(
                {
                    'molecule_name': new_name,
                    'v': _var,
                    'pdb': row['pdb'],
                    'score': row['score'],
                    'residues': row['residues'],
                    'residues_quantity': row['residues_quantity'],
                },
                ignore_index=True)

    # Write the ranked results as a ';'-separated CSV.
    body = dataframe.sort_values(by=['residues_quantity'],
                                 ascending=False).to_csv(index=False, sep=';')
    with open(os.getenv('OUTPUT_LIST'), "w") as file:
        file.write(body)

    print("Process took: %s seconds" % (time.time() - start_time))