def make_fragment_list(self):
    """
    Enumerate the fragments produced by breaking the bonds of the molecule
    one bond index at a time, without retaining parent/child info.

    For every index in ``range(self.num_bonds)``, ``remove_bonds`` is applied
    to a writable copy of ``self.molH``, the resulting disconnected pieces
    are recorded, and ``remove_bonds(..., undo=True)`` is called before the
    next index is processed.

    :returns: list of Python dicts, one per fragment, with keys

        * ``'frag_mol_h'``: fragment as returned by ``Chem.GetMolFrags``
        * ``'frag_mol'``: the same fragment after ``Chem.RemoveHs``
        * ``'frag_mass'``: ``CalcExactMolWt`` of the fragment
        * ``'frag_smiles'``: ``Chem.MolToSmiles(fragment, True)``
        * ``'fragment_mass'``: same value as ``'frag_mass'``
    """
    work_mol = Chem.RWMol(self.molH)
    collected = []
    for bond_idx in range(self.num_bonds):
        # Remove bonds from H'ed molecule
        remove_bonds(work_mol, self.mol, bond_idx)
        pieces = Chem.GetMolFrags(work_mol, sanitizeFrags=False, asMols=True)
        for piece in pieces:
            entry = {}
            entry['frag_mol_h'] = piece
            entry['frag_mol'] = Chem.RemoveHs(piece)
            entry['frag_mass'] = CalcExactMolWt(piece)
            entry['frag_smiles'] = Chem.MolToSmiles(piece, True)
            entry['fragment_mass'] = CalcExactMolWt(piece)
            collected.append(entry)
        # Restore broken bonds
        remove_bonds(work_mol, self.mol, bond_idx, undo=True)
    return collected
def filter_pubchem(ms):
    """
    Return the molecular formulas of the molecules in ``ms`` that pass a
    set of mass, charge and element-composition filters.

    A molecule is kept when all of the following hold:

    * its exact mass lies in the closed interval [100, 1500];
    * its formal charge is zero;
    * its atom symbols include both 'C' and 'H';
    * it has at most five atoms each of S, Cl, Br and B;
    * every element present belongs to the allowed element set.

    :param ms: iterable of molecule objects.
    :returns: list of formula strings (``CalcMolFormula``) for kept molecules.
    """
    allowed = {'C', 'H', 'O', 'N', 'S', 'P', 'Cl', 'B', 'Br', 'Se'}
    capped = ('S', 'Cl', 'Br', 'B')
    formulas = []
    for mol in ms:
        mass = CalcExactMolWt(mol)
        if mass < 100 or mass > 1500:
            continue
        if GetFormalCharge(mol) != 0:
            continue
        counts = Counter(atom.GetSymbol() for atom in mol.GetAtoms())
        if 'C' not in counts or 'H' not in counts:
            continue
        # A Counter yields 0 for absent symbols, so no membership test needed.
        if any(counts[symbol] > 5 for symbol in capped):
            continue
        if set(counts).issubset(allowed):
            formulas.append(CalcMolFormula(mol))
    return formulas
def add_exact_mass(specs):
    """
    Compute the exact mass of every spectrum's structure and store it under
    the ``'exact_mass'`` metadata key.

    The structure is parsed from the ``'smiles'`` entry, falling back to the
    ``'inchi'`` entry when the SMILES cannot be parsed.  When the computed
    mass deviates from the stored ``'parent_mass'`` (0.0 if absent) by more
    than 1 in either direction, both values are printed for inspection.

    :param specs: iterable of objects exposing ``get(key[, default])`` and
        ``set(key, value)``; each one is modified in place.
    :returns: None
    """
    for s in specs:
        mol = MolFromSmiles(s.get('smiles'))
        if mol is None:
            mol = MolFromInchi(s.get('inchi'))
        exact_mass_smi = CalcExactMolWt(mol)
        # The comparison must be applied to the absolute difference; the
        # previous form took abs() of the boolean `diff > 1`, so negative
        # deviations were never reported.
        deviation = abs(exact_mass_smi - s.get('parent_mass', 0.0))
        if deviation > 1:
            print(exact_mass_smi, s.get('parent_mass'))
        s.set('exact_mass', exact_mass_smi)
def process_line(line):
    """
    Parse one whitespace-separated text line whose first field is a SMILES.

    :param line: text line with at least two whitespace-separated fields.
    :returns: tuple ``(field0, field1, exact_mass, heavy_atom_count)``, or
        ``None`` when the first field cannot be parsed as a SMILES.
    """
    fields = line.strip().split()
    mol = Chem.MolFromSmiles(fields[0])
    if not mol:
        return None
    exact_mass = CalcExactMolWt(mol)
    heavy_atoms = mol.GetNumHeavyAtoms()
    return fields[0], fields[1], exact_mass, heavy_atoms
def annotate_ms(ms_pred, smi, ion_mode='+', treeDepth=2):
    """
    Annotate the peaks of a predicted spectrum with candidate neutral losses.

    Candidate losses are obtained by subtracting the formula of every
    generated fragment from the molecular formula, together with the
    variants carrying one more / one fewer hydrogen; only formulas accepted
    by ``check_formula`` are kept.  A peak within 0.5 of the precursor m/z is
    labelled ``['precursor']``; otherwise it is labelled with every candidate
    loss whose mass is within 0.5 of ``precursor - mz``; otherwise it gets
    empty-string annotations.

    :param ms_pred: mapping / DataFrame with ``'mz'`` and ``'intensity'``.
    :param smi: SMILES of the molecule.
    :param ion_mode: ``'+'`` adds 1.0032 to the exact mass to obtain the
        precursor m/z, anything else subtracts it (only M+H and M-H are
        considered).
    :param treeDepth: depth forwarded to ``generateFragments``.
    :returns: ``pandas.DataFrame`` with columns ``'mz'``, ``'intensity'``,
        ``'annotate_loss'`` and ``'exact_mass'``, one row per input peak.
    """
    mzs = np.array(ms_pred['mz'])
    intensities = np.array(ms_pred['intensity'])
    mol = Chem.MolFromSmiles(smi)

    # only M+H and M-H is considered now.
    # NOTE(review): 1.0032 is kept from the original code; the proton mass
    # is about 1.00728 - confirm which value is intended.
    if ion_mode == '+':
        precursor = CalcExactMolWt(mol) + 1.0032
    else:
        precursor = CalcExactMolWt(mol) - 1.0032
    formula = CalcMolFormula(mol)

    # Honour the caller-supplied depth (it used to be hard-coded to 2).
    frags = np.unique(generateFragments(smi, treeDepth=treeDepth))
    frags_formula = np.unique(
        [CalcMolFormula(Chem.MolFromSmiles(s)) for s in frags])

    loss_formula = []
    for frag_formula in frags_formula:
        loss = subtract_formula(formula, frag_formula)
        if loss == '':
            continue
        if check_formula(loss):
            loss_formula.append(loss)
        add_H = add_formula(loss, 'H')
        de_H = subtract_formula(loss, 'H')
        if check_formula(add_H):
            loss_formula.append(add_H)
        if check_formula(de_H):
            loss_formula.append(de_H)
    loss_formula = np.unique(loss_formula)
    loss_mass = np.array([getFormulaExactMass(f) for f in loss_formula])

    ms_new = pd.DataFrame(
        columns=['mz', 'intensity', 'annotate_loss', 'exact_mass'])
    for mz, intensity in zip(mzs, intensities):
        diff = precursor - mz
        # Boolean mask instead of min(): stays valid when no candidate loss
        # exists (min() of an empty array raises ValueError).
        close = np.abs(loss_mass - diff) < 0.5
        if abs(diff) < 0.5:
            # `elif` below keeps this label from being overwritten.
            annotate_loss = ['precursor']
            accurate_mass = [precursor]
        elif close.any():
            match = np.where(close)[0]
            annotate_loss = loss_formula[match]
            accurate_mass = precursor - loss_mass[match]
        else:
            annotate_loss = ''
            accurate_mass = ''
        ms_new.loc[len(ms_new)] = [mz, intensity, annotate_loss, accurate_mass]
    return ms_new
def vec2ms(smi, vec, direction='forward', maxmz=1500, norm=True):
    """
    Turn an intensity vector into a peak table.

    Positions whose value exceeds 5 % of the vector maximum are kept as
    peaks.  In ``'reverse'`` direction each position ``p`` is mapped to
    ``mass - p``, where ``mass`` is the rounded exact mass of ``smi`` plus 2.

    :param smi: SMILES; only parsed when ``direction == 'reverse'``.
    :param vec: 1-D numpy array of intensities.
    :param direction: ``'forward'`` or ``'reverse'``.
    :param maxmz: accepted for interface compatibility; not read here.
    :param norm: when true, divide intensities by ``max + 1e-6``.
    :returns: ``pandas.DataFrame`` with columns ``'mz'`` and ``'intensity'``.
    """
    is_reverse = direction == 'reverse'
    if is_reverse:
        mass = round(CalcExactMolWt(Chem.MolFromSmiles(smi))) + 2

    threshold = 0.05 * max(vec)
    positions = np.nonzero(vec > threshold)[0]
    heights = vec[positions]
    # Negative intensities are clamped to zero.
    heights[heights < 0] = 0

    if is_reverse:
        positions = mass - positions
    if norm:
        heights = heights / (max(heights) + 10**-6)
    return pd.DataFrame({'mz': positions, 'intensity': heights})
def writeSDF(smiles, file):
    """
    Write the molecules described by ``smiles`` to an SD file.

    Entries for which ``CalcExactMolWt`` fails (e.g. because the SMILES
    could not be parsed) are skipped.

    :param smiles: iterable of SMILES strings.
    :param file: path of the SD file to create / overwrite.
    :returns: None
    """
    # The context manager guarantees the output handle is closed (and its
    # buffer flushed) even if writing fails part-way through.
    with open(file, 'w') as f:
        for smi in tqdm(smiles):
            m = Chem.MolFromSmiles(smi)
            try:
                CalcExactMolWt(m)
            except Exception:
                continue
            sio = StringIO()
            w = Chem.SDWriter(sio)
            w.write(m)
            # Flush explicitly so the record is in the buffer before it is
            # read, rather than relying on the writer being garbage-collected.
            w.flush()
            f.write(sio.getvalue())
            w.close()
def ms2vec(smi, peakindex, peakintensity, direction='forward', maxmz=1500):
    """
    Bin a peak list into a fixed-length, max-normalised intensity vector.

    In ``'forward'`` direction a peak at m/z ``j`` is stored at index
    ``round(j)``.  In any other direction it is stored at
    ``mass - round(j)``, where ``mass`` is the rounded exact mass of ``smi``
    plus 2.  Peaks whose rounded m/z is ``>= maxmz`` or whose target index
    falls outside ``[0, maxmz)`` are ignored.

    :param smi: SMILES; only parsed when the direction is not ``'forward'``.
    :param peakindex: sequence of peak m/z values.
    :param peakintensity: sequence of intensities, parallel to ``peakindex``.
    :param direction: ``'forward'`` or anything else for the reversed axis.
    :param maxmz: length of the output vector.
    :returns: 1-D numpy array of length ``maxmz`` divided by ``max + 1e-6``.
    """
    forward = direction == 'forward'
    if not forward:
        # The molecular mass is only needed to mirror the m/z axis.
        mass = round(CalcExactMolWt(Chem.MolFromSmiles(smi))) + 2

    output = np.zeros(maxmz)
    for i, j in enumerate(peakindex):
        nominal = int(round(j))
        if nominal >= maxmz:
            continue
        position = nominal if forward else mass - nominal
        # Valid indices are 0 .. maxmz-1: the former `> maxmz` test let
        # index == maxmz through (IndexError), and negative indices would
        # silently wrap around to the end of the vector.
        if position < 0 or position >= maxmz:
            continue
        output[position] = float(peakintensity[i])
    return output / (max(output) + 10**-6)
def model_predict(smi, model):
    """
    Predict a spectrum for ``smi`` by merging the model's forward and
    reverse outputs.

    The forward prediction supplies the peaks with m/z up to half of
    ``exact mass + 2``; the reverse prediction supplies the peaks above
    that value.  The merged table is sorted by m/z and its intensities are
    divided by the largest intensity.

    :param smi: SMILES of the molecule.
    :param model: object whose ``predict`` returns a
        ``(forward, reverse)`` pair of batched vectors.
    :returns: ``pandas.DataFrame`` with columns ``'mz'`` and ``'intensity'``.
    """
    mass = CalcExactMolWt(Chem.MolFromSmiles(smi)) + 2
    cutoff = 0.5 * mass

    batch = np.array([morgan_fp(smi)])
    forward_vec, reverse_vec = model.predict(batch)

    forward = vec2ms(smi, forward_vec[0], norm=False, direction='forward')
    reverse = vec2ms(smi, reverse_vec[0], norm=False, direction='reverse')
    lower_half = forward[forward.mz <= cutoff]
    upper_half = reverse[reverse.mz > cutoff]

    merged = pd.concat([lower_half, upper_half])
    merged = merged.sort_values('mz').reset_index(drop=True)
    merged['intensity'] = merged['intensity'] / max(merged['intensity'])
    return merged
def get_major_product(product_list):
    """
    Input: list of product SMILES strings
    Output: SMILES of heaviest species, or None when the list is empty or
    none of its entries can be parsed

    This function is needed in cases where a counterion is included in the
    intended products. This often happens e.g. with an amine and HCl. Since
    the logP calculator can't parse SMILES strings with "~" or "." in them,
    we need to go from e.g. "OCCNCCO~Cl" to "OCCNCCO". After the former is
    broken into a list (["OCCNCCO","Cl"]), this function uses molecular
    weight to determine which species is the "primary" product.

    On ties the first of the equally heavy species is returned.
    """
    # Initialised up front so an empty input returns None instead of
    # raising UnboundLocalError at the return statement.
    heaviest_species = None
    max_molecular_weight = -math.inf
    for product in product_list:
        molecule = Chem.rdmolfiles.MolFromSmiles(product)
        if molecule is None:
            # Unparsable fragment: it cannot be weighed, so it cannot win.
            continue
        molecular_weight = CalcExactMolWt(molecule)
        if molecular_weight > max_molecular_weight:
            max_molecular_weight = molecular_weight
            heaviest_species = product
    return heaviest_species
def identification(ms, candidates, model, method='correlation'):
    """
    Rank candidate structures by how well their predicted spectrum matches
    the query spectrum ``ms``.

    :param ms: query spectrum, passed unchanged to the scoring function.
    :param candidates: sequence of structure strings, either all InChI or
        all SMILES; the notation is detected from the first entry.
    :param model: model handed to ``model_predict``.
    :param method: ``'residual'`` or ``'correlation'``; any other value
        selects the Jaccard score.
    :returns: ``pandas.DataFrame`` with columns ``'SMILES'``, ``'InChI'``,
        ``'mass'``, ``'scores'`` and ``'pred_ms'``, sorted by descending
        score.  Candidates that cannot be parsed are left out; an empty
        candidate list yields an empty frame.
    """
    columns = ['SMILES', 'InChI', 'mass', 'scores', 'pred_ms']
    # Guard: the notation sniffing below indexes candidates[0], which used
    # to raise IndexError for an empty candidate list.
    if len(candidates) == 0:
        return pd.DataFrame({name: [] for name in columns})

    if method == 'residual':
        score = ms_residual
    elif method == 'correlation':
        score = ms_correlation
    else:
        score = ms_jaccard

    if 'InChI=' in candidates[0]:
        read_candidate = Chem.MolFromInchi
    else:
        read_candidate = Chem.MolFromSmiles

    rows = {name: [] for name in columns}
    for candidate in candidates:
        try:
            mol = read_candidate(candidate)
            smi = Chem.MolToSmiles(mol)
            inchi = Chem.MolToInchi(mol)
            mass = CalcExactMolWt(mol)
        except Exception:
            # Skip unparsable candidates, but let KeyboardInterrupt and
            # SystemExit propagate (a bare `except:` would swallow them).
            continue
        pms = model_predict(smi, model)
        rows['SMILES'].append(smi)
        rows['InChI'].append(inchi)
        rows['mass'].append(mass)
        rows['scores'].append(score(ms, pms))
        rows['pred_ms'].append(pms)

    output = pd.DataFrame(rows)
    return output.sort_values('scores', ascending=False)
def recursive_tree(all_frags, all_frag_masses, relationships, mol): f_tree = FragTree(mol) #parent = mol parent = Chem.MolToSmiles(Chem.AddHs(mol), False) if not parent in all_frags: all_frags.append(parent) all_frag_masses.append(CalcExactMolWt(mol)) #print len(all_frags),len(relationships) for i, f in enumerate(f_tree.fragment_list): if f['frag_mol'].GetNumBonds() > 6: fragment = f[ 'frag_smiles'] #the identifier to look up the molecule with #fragment = Chem.MolToSmiles(Chem.AddHs(f['frag_mol']),False) if not fragment in all_frags: all_frags.append(fragment) all_frag_masses.append(f['frag_mass']) myrel = (all_frags.index(parent), all_frags.index(fragment)) if not myrel in relationships: relationships.append(myrel) recursive_tree(all_frags, all_frag_masses, relationships, f['frag_mol']) return all_frags, all_frag_masses, relationships
save_npz('DeepEI/data/neims_spec_nist.npz', spec_vecs1) # NEIMS spectra of MassBank exist_smiles = nist_smiles[keep] data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp') msbk_smiles = [] msbk_spec = [] msbk_masses = [] for i, (param, ms) in enumerate(tqdm(data)): smi = param['smiles'] try: smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi)) except: smi = smi try: mass = CalcExactMolWt(Chem.MolFromSmiles(smi)) except: continue msbk_masses.append(mass) msbk_smiles.append(smi) msbk_spec.append(ms2vec(ms[:, 0], ms[:, 1])) pred_smiles = [] for smi in msbk_smiles: if smi in exist_smiles: continue else: pred_smiles.append(smi) writeSDF(pred_smiles, 'Temp/mol.sdf') cwd = 'E:\\project\\deep-molecular-massspec' cmd = 'python make_spectra_prediction.py --input_file=E:/project/DeepEI/Temp/mol.sdf --output_file=E:/project/DeepEI/Temp/mol_anno.sdf --weights_dir=model/massspec_weights'
def predict():
    """
    Request handler: generate molecules for the requested target conditions
    and return the best-matching ones as JSON.

    The JSON request body supplies ``"conditions"``, ``"num_rounds"``,
    ``"loyality"`` and ``"num_of_mols"``.  In every round molecules are
    sampled from ``model``, standardised, sent to the remote prediction
    endpoint, and scored against twelve rule checks.  Molecules whose
    fraction of passed checks is at least ``loyality`` (and which pass at
    least one check) are collected together with their predicted melting
    point, boiling point and water solubility and the mean squared error
    between those scaled values and the scaled requested conditions.

    :returns: JSON response with the first ``num_of_mols`` collected
        entries, ordered by ascending mean squared error.
    """
    payload = request.get_json()
    print("Data requested")
    print(payload)
    conditions = payload["conditions"]
    num_rounds = payload["num_rounds"]
    loyality = payload["loyality"]
    num_of_mols = payload["num_of_mols"]

    result_arr = []
    for round_no in range(num_rounds):
        print(f"round {round_no}")
        number_generate = 100

        endp = torch.tensor(scaler.transform(np.array([conditions])))
        print(endp.shape)
        # Stringified copy of the scaled conditions; not read again below.
        scaled_copy = deepcopy(endp)
        scaled_copy = [str(row) for row in list(scaled_copy.numpy())]
        endp = endp.repeat(100, 1)
        endp = endp.unsqueeze(0).repeat(3, 1, 1)
        endp = endp.float().cuda()

        sampled = model.sample(endp, number_generate, dataset.model)
        valid = len(sampled) * 100 / number_generate
        print("valid : {} %".format(valid))

        standardized = [robust_standardizer(mol) for mol in sampled]
        mols = [mol for mol in standardized if mol is not None]
        print("Mols obtained")
        print(mols)

        vals_another = requests.post(
            "https://backend.syntelly.com/tempSmilesArrToPredict",
            json={'smiles': mols}).json()
        # Replace numeric endpoint ids by their names, in place.
        for entry in vals_another:
            for e in entry['data']:
                e["endpoint_id"] = endpoints_id2name[e["endpoint_id"]]
        # One {endpoint name: value} mapping per returned molecule.
        e2v = [
            {e['endpoint_id']: e['value'] for e in entry['data']}
            for entry in vals_another
        ]

        smiles = [entry['smiles'] for entry in vals_another]
        mols = [robust_standardizer(smi) for smi in smiles]
        mols = [Chem.MolFromSmiles(smi) for smi in mols]

        molecular_weights = [CalcExactMolWt(mol) for mol in mols]
        logp = [MolLogP(mol) for mol in mols]
        atom_count = [mol.GetNumAtoms() for mol in mols]
        molar_reflactivity = [MolMR(mol) for mol in mols]
        numRings = [CalcNumRings(mol) for mol in mols]
        numRotBonds = [CalcNumRotatableBonds(mol) for mol in mols]
        numHAcceptors = [NumHAcceptors(mol) for mol in mols]
        numHDonors = [NumHDonors(mol) for mol in mols]

        bcf = [props['Bioconcentration factor'] for props in e2v]
        dev_tox = [props['Developmental toxicity'] for props in e2v]
        flash_point = [props['Flash point'] for props in e2v]
        boiling_point = [props['Boiling point'] for props in e2v]
        melting_points = [props['Melting point'] for props in e2v]
        water_solubility = [props['Water Solubility'] for props in e2v]

        result = [0] * len(smiles)
        for idx in range(len(smiles)):
            checks = (
                160 <= molecular_weights[idx] <= 480,
                -0.4 <= logp[idx] <= 5.6,
                20 <= atom_count[idx] <= 70,
                40 <= molar_reflactivity[idx] <= 130,
                bcf[idx] < 3,
                dev_tox[idx] == 'Negative',
                flash_point[idx] > (350 - 273.15),
                boiling_point[idx] > (300 - 273.15),
                numRings[idx] > 0,
                numRotBonds[idx] < 5,
                numHAcceptors[idx] <= 10,
                numHDonors[idx] <= 5,
            )
            passed = sum(1 for ok in checks if ok)
            if passed / 12 >= loyality:
                result[idx] = passed
        print(result)

        for idx, score in enumerate(result):
            if score <= 0:
                continue
            observed = (melting_points[idx], boiling_point[idx],
                        water_solubility[idx])
            error = mean_squared_error(
                scaler.transform(np.array([[observed[0], observed[1],
                                            observed[2]]])),
                scaler.transform(np.array([conditions])))
            result_arr.append((smiles[idx], score, observed, error))

    result_arr.sort(key=lambda item: item[3])
    print(result_arr[:num_of_mols])
    return jsonify(result_arr[:num_of_mols])
energies.append(energy) summary = pd.DataFrame({'smiles': smiles, 'ion_mode': modes, 'energy': energies}) # example 1 idx = 551 smi = smiles[idx] mol = Chem.MolFromSmiles(smi) ms_pred = model_predict(smi, model) ms_real = ms[idx] # annotation mzs = np.array(ms_pred['mz']) intensities = np.array(ms_pred['intensity']) mol = Chem.MolFromSmiles(smi) precursor = CalcExactMolWt(mol) - 1.0032 formula = CalcMolFormula(mol) frags = np.unique(generateFragments(smi, treeDepth=2)) frags_new = [Chem.MolFromSmiles(s) for s in frags] frags_formula = np.unique([CalcMolFormula(f) for f in frags_new]) loss_formula = [] for f in frags_formula: l = subtract_formula(formula, f) if l == '': continue if check_formula(l): loss_formula.append(l) add_H = add_formula(l, 'H') de_H = subtract_formula(l, 'H') if check_formula(add_H): loss_formula.append(add_H)
def collect():
    """
    Build the training arrays from the molecule database and save them
    under ``DeepEI/data/``.

    Every index in ``range(len(all_mol))`` is read with ``read_mol``.  A
    record is skipped when it cannot be read, when its exact mass exceeds
    2000, when it contains an element outside the allowed set, or when any
    feature computation fails.  For the remaining records the retention
    indices, CDK descriptors, exact masses, peak vectors, Morgan
    fingerprints, CDK fingerprints and SMILES are written to disk.

    :returns: None
    """
    allowed_elements = frozenset(
        ['C', 'H', 'O', 'N', 'S', 'P', 'Si', 'F', 'Cl', 'Br', 'I'])

    all_smiles = []
    Peak_data = []
    RI_data = []
    Morgan_fp = []
    CDK_fp = []
    CDK_des = []
    MolWt = []
    for i in tqdm(range(len(all_mol))):
        # `except Exception` (not a bare `except:`) so that Ctrl-C and
        # SystemExit still stop this long-running loop.
        try:
            m = read_mol(i)
        except Exception:
            continue
        try:
            mol = Chem.MolFromSmiles(m['smiles'])
            molwt = CalcExactMolWt(mol)
            if molwt > 2000:
                continue
            smiles = Chem.MolToSmiles(mol)
            # check element
            elements = parser_formula(MolToFormula(MolFromSmiles(smiles)))
            for e in elements:
                if e not in allowed_elements:
                    raise ValueError('contain uncommon element')
            morgan_fp = np.array(
                AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=4096))
            cdk_fp = get_cdk_fingerprints(smiles)
            cdk_des = np.array(get_cdk_descriptors(smiles))
            ri = list(m['RI'].values())
            peak_vec = ms2vec(m['peakindex'], m['peakintensity'])
        except Exception:
            # Any failure while featurising drops the whole record, so the
            # output arrays stay aligned.
            continue

        all_smiles.append(smiles)
        Peak_data.append(peak_vec)
        RI_data.append(ri)
        Morgan_fp.append(morgan_fp)
        CDK_fp.append(cdk_fp)
        CDK_des.append(cdk_des)
        MolWt.append(molwt)

    # save
    np.save('DeepEI/data/retention.npy', np.array(RI_data))
    np.save('DeepEI/data/descriptor.npy', np.array(CDK_des))
    np.save('DeepEI/data/molwt.npy', np.array(MolWt))

    Peak_data = csr_matrix(np.array(Peak_data))
    Morgan_fp = csr_matrix(np.array(Morgan_fp))
    CDK_fp = csr_matrix(np.array(CDK_fp))
    save_npz('DeepEI/data/peakvec.npz', Peak_data)
    save_npz('DeepEI/data/morgan.npz', Morgan_fp)
    save_npz('DeepEI/data/fingerprints.npz', CDK_fp)

    with open('DeepEI/data/all_smiles.json', 'w') as t:
        json.dump(all_smiles, t)
from libmetgem import msp
from scipy.sparse import load_npz, csr_matrix
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
from DeepEI.predict import predict_fingerprint
from DeepEI.utils import ms2vec, vec2ms, get_cdk_fingerprints

# NOTE(review): `pd` and `np` are used below but are not imported in the
# lines visible here - confirm they are imported elsewhere in the file.

# Read the spectral library and keep the records whose SMILES yields an
# exact mass; all other records are skipped.
data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp')
smiles = []
spec = []
molwt = []
for i, (param, ms) in enumerate(tqdm(data)):
    smi = param['smiles']
    try:
        mass = CalcExactMolWt(Chem.MolFromSmiles(smi))
    except:
        continue
    molwt.append(mass)
    smiles.append(smi)
    # Columns 0 and 1 of each spectrum are handed to ms2vec.
    spec.append(ms2vec(ms[:, 0], ms[:, 1]))

# Select the fingerprint ids whose recorded f1 value exceeds 0.5.
mlp = pd.read_csv('Fingerprint/results/mlp_result.txt', sep='\t', header=None)
mlp.columns = ['id', 'accuracy', 'precision', 'recall', 'f1']
fpkeep = mlp['id'][np.where(mlp['f1'] > 0.5)[0]]

spec = np.array(spec)
pred_fps = predict_fingerprint(
    spec, fpkeep)  # predict fingerprint of the "unknown"
def mass(self):
    """
    float: the mass of the molecule, as computed by ``CalcExactMolWt``.
    """
    exact_weight = CalcExactMolWt(self)
    return exact_weight