def make_fragment_list(self):
        """
        Enumerate the fragments obtained by breaking the molecule one bond index at a time.

        For every index in ``range(self.num_bonds)`` the bond(s) selected by
        ``remove_bonds`` are removed from a writable copy of ``self.molH``, each
        resulting connected component is recorded, and the copy is restored
        before moving on to the next index.

        :returns: list of dicts, one per fragment, with keys

            * **frag_mol_h**: the fragment as returned by ``Chem.GetMolFrags``
            * **frag_mol**: the same fragment after ``Chem.RemoveHs``
            * **frag_mass**: exact mass of ``frag_mol_h``
            * **frag_smiles**: SMILES of ``frag_mol_h`` (isomeric)
            * **fragment_mass**: same value as ``frag_mass`` (both keys are kept
              because callers may rely on either name)
        """
        fragment_list = []
        # Work on a single editable copy; self.molH itself is never modified here.
        wm = Chem.RWMol(self.molH)
        for bond in range(self.num_bonds):
            # Remove bonds from H'ed molecule
            remove_bonds(wm, self.mol, bond)
            for mol in Chem.GetMolFrags(wm, sanitizeFrags=False, asMols=True):
                # Computed once and stored under both historical key names.
                exact_mass = CalcExactMolWt(mol)
                fragment_list.append({
                    'frag_mol_h': mol,
                    'frag_mol': Chem.RemoveHs(mol),
                    'frag_mass': exact_mass,
                    'frag_smiles': Chem.MolToSmiles(mol, True),
                    'fragment_mass': exact_mass
                })

            # Restore broken bonds
            remove_bonds(wm, self.mol, bond, undo=True)
        return fragment_list
Beispiel #2
0
def filter_pubchem(ms):
    """Return the molecular formulas of the molecules in *ms* that pass all filters.

    A molecule is kept when:

    * its exact mass lies in [100, 1500],
    * its formal charge is zero,
    * its atom symbols include both 'C' and 'H',
    * it has at most five atoms each of S, Cl, Br and B, and
    * every atom symbol belongs to the allowed element set.

    NOTE(review): symbols come from ``m.GetAtoms()``, so presumably only
    explicit hydrogens count towards the 'H' requirement; confirm that callers
    pass molecules with hydrogens added.

    :param ms: iterable of RDKit molecules
    :returns: list of formula strings (``CalcMolFormula``) in input order
    """
    allowed = {'C', 'H', 'O', 'N', 'S', 'P', 'Cl', 'B', 'Br', 'Se'}
    capped = ('S', 'Cl', 'Br', 'B')
    formulas = []
    for m in ms:
        exact_mass = CalcExactMolWt(m)
        if exact_mass > 1500 or exact_mass < 100:
            continue
        if GetFormalCharge(m) != 0:
            continue

        counts = Counter(atom.GetSymbol() for atom in m.GetAtoms())
        if 'C' not in counts or 'H' not in counts:
            continue
        # A Counter yields 0 for absent symbols, so no membership test is needed.
        if any(counts[symbol] > 5 for symbol in capped):
            continue
        if allowed.issuperset(counts):
            formulas.append(CalcMolFormula(m))
    return formulas
def add_exact_mass(specs):
    """Store the exact mass of each spectrum's structure under ``'exact_mass'``.

    The structure is parsed from the ``'smiles'`` entry, falling back to the
    ``'inchi'`` entry when SMILES parsing yields ``None``. If the computed
    exact mass differs from ``'parent_mass'`` (0.0 when missing) by more than
    1 in either direction, both values are printed for inspection.

    :param specs: iterable of objects exposing ``get(key[, default])`` and
        ``set(key, value)`` (presumably spectrum objects; verify against caller)
    :returns: None; each item of *specs* is updated in place
    """
    for s in specs:
        mol = MolFromSmiles(s.get('smiles'))
        if mol is None:
            mol = MolFromInchi(s.get('inchi'))
        exact_mass_smi = CalcExactMolWt(mol)
        # The absolute value must wrap the difference, not the comparison:
        # the old form abs(a - b > 1) never fired when parent_mass was larger.
        if abs(exact_mass_smi - s.get('parent_mass', 0.0)) > 1:
            print(exact_mass_smi, s.get('parent_mass'))
        s.set('exact_mass', exact_mass_smi)
Beispiel #4
0
def process_line(line):
    tmp = line.strip().split()
    m = Chem.MolFromSmiles(tmp[0])
    if m:
        mw = CalcExactMolWt(m)
        hac = m.GetNumHeavyAtoms()
        return tmp[0], tmp[1], mw, hac
    else:
        return None
Beispiel #5
0
def annotate_ms(ms_pred, smi, ion_mode='+', treeDepth=2):
    """Annotate predicted peaks with the precursor or with candidate neutral losses.

    Candidate losses are the formula differences between the molecule and each
    generated fragment, plus the same difference with one H added or removed,
    keeping only those accepted by ``check_formula``. A peak is annotated as
    the precursor when it lies within 0.5 of the precursor m/z; otherwise with
    every candidate loss whose mass is within 0.5 of ``precursor - mz``.

    :param ms_pred: mapping/DataFrame with ``'mz'`` and ``'intensity'`` entries
    :param smi: SMILES of the molecule
    :param ion_mode: ``'+'`` adds 1.0032 to the exact mass, anything else
        subtracts it (only M+H and M-H are considered)
    :param treeDepth: depth forwarded to ``generateFragments``
    :returns: DataFrame with columns ``mz``, ``intensity``, ``annotate_loss``,
        ``exact_mass``; unannotated peaks carry empty strings in the last two
    """
    mzs = np.array(ms_pred['mz'])
    intensities = np.array(ms_pred['intensity'])
    mol = Chem.MolFromSmiles(smi)
    # only M+H and M-H is considered now.
    if ion_mode == '+':
        precursor = CalcExactMolWt(mol) + 1.0032
    else:
        precursor = CalcExactMolWt(mol) - 1.0032
    formula = CalcMolFormula(mol)
    # Honour the caller's depth; it used to be hard-coded to 2 here.
    frags = np.unique(generateFragments(smi, treeDepth=treeDepth))
    frags_formula = np.unique(
        [CalcMolFormula(Chem.MolFromSmiles(s)) for s in frags])

    loss_formula = []
    for f in frags_formula:
        loss = subtract_formula(formula, f)
        if loss == '':
            continue
        if check_formula(loss):
            loss_formula.append(loss)
        add_H = add_formula(loss, 'H')
        de_H = subtract_formula(loss, 'H')
        if check_formula(add_H):
            loss_formula.append(add_H)
        if check_formula(de_H):
            loss_formula.append(de_H)
    loss_formula = np.unique(loss_formula)
    loss_mass = np.array([getFormulaExactMass(f) for f in loss_formula])

    ms_new = pd.DataFrame(columns=['mz', 'intensity', 'annotate_loss', 'exact_mass'])
    for mz, intensity in zip(mzs, intensities):
        diff = precursor - mz
        # Boolean mask instead of min(): stays valid when there are no candidate losses.
        near = np.abs(loss_mass - diff) < 0.5
        if abs(diff) < 0.5:
            annotate_loss = ['precursor']
            accurate_mass = [precursor]
        elif near.any():
            # elif, so that a precursor annotation is not overwritten below.
            match = np.where(near)[0]
            annotate_loss = loss_formula[match]
            accurate_mass = precursor - loss_mass[match]
        else:
            annotate_loss = ''
            accurate_mass = ''
        ms_new.loc[len(ms_new)] = [mz, intensity, annotate_loss, accurate_mass]
    return ms_new
Beispiel #6
0
def vec2ms(smi, vec, direction='forward', maxmz=1500, norm=True):
    """Convert an intensity vector into a peak table.

    Positions whose value exceeds 5% of the vector maximum become peaks. In
    ``'forward'`` direction the position is the m/z; in ``'reverse'`` direction
    the m/z is ``round(exact_mass(smi)) + 2 - position``.

    :param smi: SMILES of the molecule (only parsed when ``direction == 'reverse'``)
    :param vec: 1-D sequence/array of intensities indexed by position
    :param direction: ``'forward'`` or ``'reverse'``
    :param maxmz: unused; kept for interface compatibility
    :param norm: divide intensities by their maximum (plus 1e-6)
    :returns: DataFrame with columns ``mz`` and ``intensity``; empty when the
        vector has no peak above the threshold
    """
    vec = np.asarray(vec)
    if vec.size:
        peakindex = np.where(vec > 0.05 * vec.max())[0]
    else:
        peakindex = np.array([], dtype=int)
    # Fancy indexing copies, so clipping below never touches the caller's array.
    peakintensity = vec[peakindex]
    peakintensity[peakintensity < 0] = 0
    if direction == 'reverse':
        mass = round(CalcExactMolWt(Chem.MolFromSmiles(smi))) + 2
        peakindex = mass - peakindex
    # Guard: max() of an empty selection used to raise ValueError here.
    if norm and peakintensity.size:
        peakintensity = peakintensity / (peakintensity.max() + 10**-6)
    output = pd.DataFrame({'mz': peakindex, 'intensity': peakintensity})
    return output
Beispiel #7
0
def writeSDF(smiles, file):
    """Write the parseable molecules among *smiles* to an SD file.

    SMILES for which no exact mass can be computed (e.g. because parsing
    returned ``None``) are skipped silently.

    :param smiles: iterable of SMILES strings
    :param file: path of the SD file to create or overwrite
    :returns: None
    """
    # Context manager guarantees the file is flushed and closed, even on error.
    with open(file, 'w') as f:
        for smi in tqdm(smiles):
            m = Chem.MolFromSmiles(smi)
            try:
                # Used purely as a validity probe; the value is discarded.
                CalcExactMolWt(m)
            except Exception:
                continue
            sio = StringIO()
            w = Chem.SDWriter(sio)
            w.write(m)
            # Push the record into the buffer before reading it back.
            w.flush()
            f.write(sio.getvalue())
Beispiel #8
0
def ms2vec(smi, peakindex, peakintensity, direction='forward', maxmz=1500):
    """Convert a peak list into a fixed-length, max-normalised intensity vector.

    In ``'forward'`` direction a peak at m/z ``j`` is stored at position
    ``round(j)``. In any other direction it is stored at
    ``round(exact_mass(smi)) + 2 - round(j)``. Peaks whose rounded m/z is
    ``>= maxmz`` or whose target position falls outside ``[0, maxmz)`` are
    dropped.

    :param smi: SMILES of the molecule; only parsed for the reverse direction
    :param peakindex: sequence of peak m/z values
    :param peakintensity: sequence of peak intensities, aligned with *peakindex*
    :param direction: ``'forward'`` or anything else for reverse
    :param maxmz: length of the returned vector
    :returns: numpy array of length *maxmz*, divided by ``max + 1e-6``
    """
    reverse = direction != 'forward'
    if reverse:
        # The molecule mass is only needed to mirror positions.
        mass = round(CalcExactMolWt(Chem.MolFromSmiles(smi))) + 2
    output = np.zeros(maxmz)
    for i, j in enumerate(peakindex):
        mz = int(round(j))
        if mz >= maxmz:
            continue
        position = mass - mz if reverse else mz
        # Reject both ends: a negative index would wrap around silently and
        # position == maxmz would raise IndexError.
        if position < 0 or position >= maxmz:
            continue
        output[position] = float(peakintensity[i])
    output = output / (output.max() + 10**-6)
    return output
Beispiel #9
0
def model_predict(smi, model):
    """Predict a spectrum for *smi* by merging the model's forward and reverse outputs.

    The forward prediction supplies peaks with ``mz <= 0.5 * (exact_mass + 2)``
    and the reverse prediction supplies the peaks above that value. The merged
    table is sorted by m/z and scaled so that the largest intensity is 1.

    :param smi: SMILES of the molecule
    :param model: object whose ``predict`` returns a (forward, reverse) pair of
        batches for a batch of fingerprints
    :returns: DataFrame with columns ``mz`` and ``intensity``; returned
        unscaled when it is empty or its maximum intensity is not positive
    """
    mass = CalcExactMolWt(Chem.MolFromSmiles(smi)) + 2
    input_data = np.array([morgan_fp(smi)])
    pred_spec_forward, pred_spec_reverse = model.predict(input_data)
    forward = vec2ms(smi, pred_spec_forward[0], norm=False, direction='forward')
    reverse = vec2ms(smi, pred_spec_reverse[0], norm=False, direction='reverse')

    half_mass = 0.5 * mass
    forward = forward[forward.mz <= half_mass]
    reverse = reverse[reverse.mz > half_mass]
    output = pd.concat([forward, reverse])
    output = output.sort_values('mz')
    output = output.reset_index(drop=True)

    # Only rescale when there is a positive maximum: an empty table used to
    # raise from max(), and an all-zero one turned into NaN.
    if len(output) > 0:
        top = output['intensity'].max()
        if top > 0:
            output['intensity'] = output['intensity'] / top
    return output
Beispiel #10
0
def get_major_product(product_list):
    """
    Input: list of product SMILES strings
    Output: SMILES of heaviest species

    This function is needed in cases where a counterion is included in the intended products.
    This often happens e.g. with an amine and HCl. Since the logP calculator can't parse
    SMILES strings with "~" or "." in them, we need to go from e.g. "OCCNCCO~Cl" to "OCCNCCO".
    After the former is broken into a list (["OCCNCCO","Cl"]), this function uses molecular weight
    to determine which species is the "primary" product.

    When several species share the largest exact mass, the first one wins.

    Raises:
        ValueError: if ``product_list`` is empty.
    """
    def _exact_mass(smiles):
        return CalcExactMolWt(Chem.rdmolfiles.MolFromSmiles(smiles))

    # Sentinel default lets us report an empty input explicitly instead of
    # failing on an unbound local as the manual loop did.
    missing = object()
    heaviest_species = max(product_list, key=_exact_mass, default=missing)
    if heaviest_species is missing:
        raise ValueError("product_list must contain at least one SMILES string")
    return heaviest_species
Beispiel #11
0
def identification(ms, candidates, model, method='correlation'):
    """Rank candidate structures by how well their predicted spectrum matches *ms*.

    :param ms: measured spectrum, passed unchanged to the scoring function
    :param candidates: sequence of SMILES or InChI strings; the format is
        decided from the first entry (``'InChI='`` substring) and applied to all
    :param model: prediction model forwarded to ``model_predict``
    :param method: ``'residual'``, ``'correlation'``, or anything else for the
        Jaccard score
    :returns: DataFrame with columns ``SMILES``, ``InChI``, ``mass``,
        ``scores``, ``pred_ms``, sorted by descending score. Candidates that
        cannot be parsed are skipped; an empty *candidates* yields an empty
        frame with the same columns.
    """
    columns = ('SMILES', 'InChI', 'mass', 'scores', 'pred_ms')
    # Nothing to rank; also avoids indexing candidates[0] below.
    if len(candidates) == 0:
        return pd.DataFrame({name: [] for name in columns})

    if method == 'residual':
        score = ms_residual
    elif method == 'correlation':
        score = ms_correlation
    else:
        score = ms_jaccard

    if 'InChI=' in candidates[0]:
        read_candidate = Chem.MolFromInchi
    else:
        read_candidate = Chem.MolFromSmiles

    smiles = []
    scores = []
    inchis = []
    masses = []
    pred_ms = []
    for candidate in candidates:
        try:
            mol = read_candidate(candidate)
            smi = Chem.MolToSmiles(mol)
            inchi = Chem.MolToInchi(mol)
            mass = CalcExactMolWt(mol)
        except Exception:
            # Unparseable candidate: skip it, but let Ctrl-C / SystemExit through.
            continue
        pms = model_predict(smi, model)
        smiles.append(smi)
        inchis.append(inchi)
        scores.append(score(ms, pms))
        masses.append(mass)
        pred_ms.append(pms)

    output = pd.DataFrame({
        'SMILES': smiles,
        'InChI': inchis,
        'mass': masses,
        'scores': scores,
        'pred_ms': pred_ms
    })
    return output.sort_values('scores', ascending=False)
def recursive_tree(all_frags, all_frag_masses, relationships, mol):
    """Recursively extend a fragmentation graph with the fragments of *mol*.

    The three list arguments are mutated in place and also returned.

    :param all_frags: list of SMILES identifiers seen so far
    :param all_frag_masses: exact masses, aligned index-for-index with *all_frags*
    :param relationships: list of ``(parent_index, fragment_index)`` pairs
        referring to positions in *all_frags*
    :param mol: molecule to fragment at this level
    :returns: ``(all_frags, all_frag_masses, relationships)``
    """
    f_tree = FragTree(mol)
    parent = Chem.MolToSmiles(Chem.AddHs(mol), False)
    if parent not in all_frags:
        all_frags.append(parent)
        all_frag_masses.append(CalcExactMolWt(mol))

    for frag in f_tree.fragment_list:
        # Only fragments with more than six bonds are recorded and expanded.
        if frag['frag_mol'].GetNumBonds() <= 6:
            continue

        # the identifier to look up the molecule with
        fragment = frag['frag_smiles']
        if fragment not in all_frags:
            all_frags.append(fragment)
            all_frag_masses.append(frag['frag_mass'])

        edge = (all_frags.index(parent), all_frags.index(fragment))
        if edge in relationships:
            # Edge already known: do not descend into this fragment again.
            continue
        relationships.append(edge)
        recursive_tree(all_frags, all_frag_masses, relationships,
                       frag['frag_mol'])

    return all_frags, all_frag_masses, relationships
Beispiel #13
0
# Save spec_vecs1 (built outside this excerpt) as a sparse .npz file.
save_npz('DeepEI/data/neims_spec_nist.npz', spec_vecs1)

# NEIMS spectra of MassBank
# Subset of nist_smiles selected by `keep` (both defined outside this excerpt);
# used further down to skip structures that are already covered.
exist_smiles = nist_smiles[keep]
data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp')
msbk_smiles = []
msbk_spec = []
msbk_masses = []
for i, (param, ms) in enumerate(tqdm(data)):
    smi = param['smiles']
    # Round-trip the SMILES through RDKit; on any failure the raw string is kept.
    # NOTE(review): bare except also swallows KeyboardInterrupt.
    try:
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
    except:
        smi = smi
    # Records whose exact mass cannot be computed are dropped entirely.
    try:
        mass = CalcExactMolWt(Chem.MolFromSmiles(smi))
    except:
        continue
    msbk_masses.append(mass)
    msbk_smiles.append(smi)
    # Columns 0 and 1 of `ms` are passed as the two ms2vec arguments
    # (presumably m/z and intensity -- TODO confirm against the ms2vec in scope).
    msbk_spec.append(ms2vec(ms[:, 0], ms[:, 1]))

# Keep only the structures that are not already present in exist_smiles.
pred_smiles = []
for smi in msbk_smiles:
    if smi in exist_smiles:
        continue
    else:
        pred_smiles.append(smi)
writeSDF(pred_smiles, 'Temp/mol.sdf')
# Working directory and command line for the external prediction script;
# this excerpt only defines them and does not execute anything.
cwd = 'E:\\project\\deep-molecular-massspec'
cmd = 'python make_spectra_prediction.py --input_file=E:/project/DeepEI/Temp/mol.sdf --output_file=E:/project/DeepEI/Temp/mol_anno.sdf --weights_dir=model/massspec_weights'
def predict():
    """Generate molecules for the requested property targets and return the best ones.

    Reads a JSON body with the keys

    * ``conditions``: target property values, fed to ``scaler.transform`` and
      compared against the predicted (melting point, boiling point, water
      solubility) of each molecule
    * ``num_rounds``: number of sampling rounds
    * ``loyality``: minimum fraction (0-1) of the twelve rule checks a molecule
      must pass to be kept
    * ``num_of_mols``: maximum number of results to return

    Each round samples molecules from the generative model, standardises them,
    obtains predicted endpoints from the remote prediction service, scores
    every molecule against twelve property rules and keeps those that pass the
    ``loyality`` threshold.

    :returns: JSON response with up to ``num_of_mols`` entries of the form
        ``[smiles, passed_checks, [melting, boiling, solubility], mse]``,
        sorted by ascending mean squared error to the scaled target
    """
    req_data = request.get_json()
    print("Data requested")
    print(req_data)
    conditions = req_data["conditions"]
    num_rounds = req_data["num_rounds"]
    loyality = req_data["loyality"]
    num_of_mols = req_data["num_of_mols"]

    number_generate = 100
    result_arr = []
    for round_idx in range(num_rounds):
        print(f"round {round_idx}")
        # Scaled target, reused for the model input and for the error metric below.
        target_scaled = scaler.transform(np.array([conditions]))
        endp = torch.tensor(target_scaled)
        print(endp.shape)

        # One copy of the target per molecule to generate, then stacked three
        # times along a new leading axis.
        # NOTE(review): the meaning of the factor 3 is not visible here
        # (presumably a model layer count) -- confirm.
        endp = endp.repeat(number_generate, 1)
        endp = endp.unsqueeze(0)
        endp = endp.repeat(3, 1, 1)
        endp = endp.float().cuda()

        res = model.sample(endp, number_generate, dataset.model)
        valid = len(res) * 100 / number_generate
        print("valid : {} %".format(valid))
        mols = [smi for smi in map(robust_standardizer, res) if smi is not None]
        print("Mols obtained")
        print(mols)

        vals_another = requests.post("https://backend.syntelly.com/tempSmilesArrToPredict",
                                     json={'smiles': mols}).json()
        # Replace numeric endpoint ids by their names, in place.
        for entry in vals_another:
            for e in entry['data']:
                e["endpoint_id"] = endpoints_id2name[e["endpoint_id"]]
        # Per molecule: endpoint name -> predicted value.
        e2v = [{e['endpoint_id']: e['value'] for e in entry['data']}
               for entry in vals_another]
        smiles = [entry['smiles'] for entry in vals_another]
        mols = [Chem.MolFromSmiles(robust_standardizer(smi)) for smi in smiles]

        result = []
        for mol, props in zip(mols, e2v):
            checks = (
                160 <= CalcExactMolWt(mol) <= 480,
                -0.4 <= MolLogP(mol) <= 5.6,
                20 <= mol.GetNumAtoms() <= 70,
                40 <= MolMR(mol) <= 130,
                props['Bioconcentration factor'] < 3,
                props['Developmental toxicity'] == 'Negative',
                props['Flash point'] > (350 - 273.15),
                props['Boiling point'] > (300 - 273.15),
                CalcNumRings(mol) > 0,
                CalcNumRotatableBonds(mol) < 5,
                NumHAcceptors(mol) <= 10,
                NumHDonors(mol) <= 5,
            )
            passed = sum(checks)
            # Molecules below the requested pass ratio are recorded as 0.
            result.append(passed if passed / 12 >= loyality else 0)

        print(result)
        for smi, score, props in zip(smiles, result, e2v):
            if score > 0:
                predicted = (props['Melting point'], props['Boiling point'],
                             props['Water Solubility'])
                error = mean_squared_error(
                    scaler.transform(np.array([predicted])), target_scaled)
                result_arr.append((smi, score, predicted, error))

    # Closest match to the requested conditions first.
    result_arr.sort(key=lambda x: x[3])

    print(result_arr[:num_of_mols])
    return jsonify(result_arr[:num_of_mols])
    energies.append(energy)
# Summary table; smiles, modes and energies are collected outside this excerpt.
summary = pd.DataFrame({'smiles': smiles, 'ion_mode': modes, 'energy': energies})


# example 1
# Pick one entry and compare its predicted spectrum with the stored one.
idx = 551
smi = smiles[idx]
mol = Chem.MolFromSmiles(smi)
ms_pred = model_predict(smi, model)
ms_real = ms[idx]

# annotation
mzs = np.array(ms_pred['mz'])
intensities = np.array(ms_pred['intensity'])
mol = Chem.MolFromSmiles(smi)
# Precursor m/z taken as exact mass minus 1.0032
# (presumably the M-H ion -- TODO confirm the constant).
precursor = CalcExactMolWt(mol) - 1.0032
formula = CalcMolFormula(mol)
# Unique fragments, then the unique set of their molecular formulas.
frags = np.unique(generateFragments(smi, treeDepth=2))
frags_new = [Chem.MolFromSmiles(s) for s in frags]
frags_formula = np.unique([CalcMolFormula(f) for f in frags_new])
# Candidate neutral losses: molecule formula minus fragment formula,
# plus the same loss with one extra H, each kept only if check_formula accepts it.
loss_formula = []
for f in frags_formula:
    l = subtract_formula(formula, f)
    if l == '':
        continue
    if check_formula(l):
        loss_formula.append(l)
    add_H = add_formula(l, 'H')
    # NOTE(review): de_H is computed but not used within this excerpt.
    de_H = subtract_formula(l, 'H')
    if check_formula(add_H):
        loss_formula.append(add_H)
Beispiel #16
0
def collect():
    """Build and save the feature/target arrays for every usable molecule record.

    For each index of ``all_mol`` the record is read with ``read_mol``; records
    that cannot be read, parsed or featurised are skipped, as are molecules
    with an exact mass above 2000 or containing elements outside
    C, H, O, N, S, P, Si, F, Cl, Br, I.

    Files written:

    * ``DeepEI/data/retention.npy``: retention-index values per molecule
    * ``DeepEI/data/descriptor.npy``: CDK descriptors
    * ``DeepEI/data/molwt.npy``: exact masses
    * ``DeepEI/data/peakvec.npz``: spectrum vectors (sparse)
    * ``DeepEI/data/morgan.npz``: Morgan fingerprints (sparse)
    * ``DeepEI/data/fingerprints.npz``: CDK fingerprints (sparse)
    * ``DeepEI/data/all_smiles.json``: SMILES, in the same row order

    :returns: None
    """
    allowed_elements = {
        'C', 'H', 'O', 'N', 'S', 'P', 'Si', 'F', 'Cl', 'Br', 'I'
    }
    all_smiles = []
    Peak_data = []
    RI_data = []
    Morgan_fp = []
    CDK_fp = []
    CDK_des = []
    MolWt = []
    for i in tqdm(range(len(all_mol))):
        # `except Exception` (not a bare except) so Ctrl-C can stop this long loop.
        try:
            m = read_mol(i)
        except Exception:
            continue
        try:
            mol = Chem.MolFromSmiles(m['smiles'])
            molwt = CalcExactMolWt(mol)
            if molwt > 2000:
                continue
            smiles = Chem.MolToSmiles(mol)
            # check element
            elements = parser_formula(MolToFormula(MolFromSmiles(smiles)))
            if any(e not in allowed_elements for e in elements):
                # contains an uncommon element
                continue
            morgan_fp = np.array(
                AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=4096))
            cdk_fp = get_cdk_fingerprints(smiles)
            cdk_des = np.array(get_cdk_descriptors(smiles))
            ri = list(m['RI'].values())
            peak_vec = ms2vec(m['peakindex'], m['peakintensity'])
        except Exception:
            # Any failure while featurising drops the whole record, so the
            # output arrays stay row-aligned.
            continue

        all_smiles.append(smiles)
        Peak_data.append(peak_vec)
        RI_data.append(ri)
        Morgan_fp.append(morgan_fp)
        CDK_fp.append(cdk_fp)
        CDK_des.append(cdk_des)
        MolWt.append(molwt)

    # save
    np.save('DeepEI/data/retention.npy', np.array(RI_data))
    np.save('DeepEI/data/descriptor.npy', np.array(CDK_des))
    np.save('DeepEI/data/molwt.npy', np.array(MolWt))

    Peak_data = csr_matrix(np.array(Peak_data))
    Morgan_fp = csr_matrix(np.array(Morgan_fp))
    CDK_fp = csr_matrix(np.array(CDK_fp))
    save_npz('DeepEI/data/peakvec.npz', Peak_data)
    save_npz('DeepEI/data/morgan.npz', Morgan_fp)
    save_npz('DeepEI/data/fingerprints.npz', CDK_fp)

    with open('DeepEI/data/all_smiles.json', 'w') as t:
        json.dump(all_smiles, t)
0
    from libmetgem import msp
    from scipy.sparse import load_npz, csr_matrix
    from tqdm import tqdm
    from rdkit import Chem
    from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
    from DeepEI.predict import predict_fingerprint
    from DeepEI.utils import ms2vec, vec2ms, get_cdk_fingerprints

    data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp')
    smiles = []
    spec = []
    molwt = []
    for i, (param, ms) in enumerate(tqdm(data)):
        smi = param['smiles']
        try:
            mass = CalcExactMolWt(Chem.MolFromSmiles(smi))
        except:
            continue
        molwt.append(mass)
        smiles.append(smi)
        spec.append(ms2vec(ms[:, 0], ms[:, 1]))

    mlp = pd.read_csv('Fingerprint/results/mlp_result.txt',
                      sep='\t',
                      header=None)
    mlp.columns = ['id', 'accuracy', 'precision', 'recall', 'f1']
    fpkeep = mlp['id'][np.where(mlp['f1'] > 0.5)[0]]

    spec = np.array(spec)
    pred_fps = predict_fingerprint(
        spec, fpkeep)  # predict fingerprint of the "unknown"
Beispiel #18
0
    def mass(self):
        """ float: the exact mass of the molecule, as given by ``CalcExactMolWt``. """

        exact_mass = CalcExactMolWt(self)
        return exact_mass