Esempio n. 1
0
def get_lipinksi_test(mol, rule_test):
    """Check a molecule against a simplified Lipinski-style filter.

    The molecule passes when its exact molecular weight lies strictly
    between 300 and 500 and its Crippen logP, Lipinski H-bond donor count
    and Lipinski H-bond acceptor count are each ``<= rule_test``.

    :param mol: RDKit molecule; its property cache is refreshed in place
        (non-strict) before any descriptor is computed.
    :param rule_test: Single upper bound applied to logP, HBD and HBA alike.
    :return: ``True`` if every criterion holds, otherwise ``False``.
    """
    mol.UpdatePropertyCache(strict=False)
    exact_weight = rdMolDescriptors.CalcExactMolWt(mol)

    # CalcCrippenDescriptors returns the pair (logP, MR); only logP is used.
    log_p = rdMolDescriptors.CalcCrippenDescriptors(mol)[0]
    donors = rdMolDescriptors.CalcNumLipinskiHBD(mol)
    acceptors = rdMolDescriptors.CalcNumLipinskiHBA(mol)

    within_limit = all(
        value <= rule_test for value in (log_p, donors, acceptors))
    return 300 < exact_weight < 500 and within_limit
Esempio n. 2
0
    def testMolWt(self):
        """Check the average and exact molecular weights reported for "C".

        ``rdMD._CalcMolWt`` is called with and without ``True`` as its second
        positional argument, on the parsed molecule and again on the result of
        ``Chem.AddHs``; the expected values are the same for both molecules.
        ``rdMD.CalcExactMolWt`` is then checked on a freshly parsed molecule.
        """
        tolerance = .001

        methane = Chem.MolFromSmiles("C")
        self.assertTrue(feq(rdMD._CalcMolWt(methane), 16.043, tolerance))
        self.assertTrue(feq(rdMD._CalcMolWt(methane, True), 12.011, tolerance))

        # Explicit hydrogens must not change either value.
        methane_with_hs = Chem.AddHs(methane)
        self.assertTrue(
            feq(rdMD._CalcMolWt(methane_with_hs), 16.043, tolerance))
        self.assertTrue(
            feq(rdMD._CalcMolWt(methane_with_hs, True), 12.011, tolerance))

        methane = Chem.MolFromSmiles("C")
        self.assertTrue(feq(rdMD.CalcExactMolWt(methane), 16.031, tolerance))
Esempio n. 3
0
    def predict(self, mol, selected_descriptors):
        """Run the models needed for the requested descriptors.

        Recognised descriptor names are ``'logP'``, ``'sol'``, ``'mp'``,
        ``'pka'`` and ``'mol_wt'``; any other entry is ignored. The models
        are chained: solubility needs logP, and melting point needs both
        logP and solubility, so requesting a later stage also computes and
        returns the earlier ones.

        :param mol: RDKit molecule to evaluate.
        :param selected_descriptors: Iterable of descriptor names.
        :return: Dict mapping each computed descriptor name to its value.
        """
        names = list(selected_descriptors)
        need_mp = 'mp' in names
        need_sol = need_mp or 'sol' in names
        need_logp = need_sol or 'logP' in names
        need_pka = 'pka' in names
        need_mol_wt = 'mol_wt' in names

        # The atom-pair fingerprint is always computed, even if unused.
        fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol)
        results = {}

        if need_logp:
            logP = self.logP_model.run(fp)
            results['logP'] = logP

        if need_sol:
            logP_sol = self.logP_solubility_model.run(logP)
            atom_pair_sol = self.atom_pair_sol_model.run(fp)
            combined_sol = self.combined_model.run(
                mol, logP, logP_sol, atom_pair_sol)
            # The reported value is the combined prediction after conversion
            # by logs_to_mg_ml.
            results['sol'] = logs_to_mg_ml(combined_sol, mol)

        if need_mp:
            results['mp'] = self.melting_point_model.run(combined_sol, logP)

        if need_pka:
            avalon = GetAvalonFP(mol)
            maacs = MACCSkeys.GenMACCSKeys(mol)
            # The pKa model takes the three fingerprints joined with "+".
            results['pka'] = self.pKa_model.run(avalon + maacs + fp)

        if need_mol_wt:
            results['mol_wt'] = rdMolDescriptors.CalcExactMolWt(mol)

        return results
Esempio n. 4
0
def evaluate_chem_mol(mol):
    """Score a molecule against fixed MW, cLogP and TPSA windows.

    :param mol: RDKit molecule to evaluate.
    :return: List of four booleans. The first is ``True`` when all
        descriptors could be computed; the others report whether
        ``320 < MW < 420``, ``2 < cLogP < 3`` and ``40 < TPSA < 60``
        (all bounds exclusive). If any descriptor computation fails, all
        four entries are ``False``.
    """
    try:
        Chem.GetSSSR(mol)
        clogp = Crippen.MolLogP(mol)
        mw = MolDescriptors.CalcExactMolWt(mol)
        tpsa = Descriptors.TPSA(mol)
    except Exception:
        # Best-effort scoring: any descriptor failure means "fails every
        # check". Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        return [False] * 4

    return [
        True,
        320 < mw < 420,
        2 < clogp < 3,
        40 < tpsa < 60,
    ]
Esempio n. 5
0
    def choose(self, mol):
        """Return the largest covalent unit.

        Fragments are ranked by atom count (hydrogens included). A tie is
        resolved in favour of the higher molecular weight, and a remaining
        tie in favour of the SMILES that sorts first. When
        ``self.prefer_organic`` is set, an organic fragment always wins over
        a non-organic one regardless of size.

        Raises ``TypeError`` if ``mol`` yields no fragments, because no
        candidate is ever selected.

        :param mol: The molecule to choose the largest fragment from.
        :type mol: :rdkit:`Mol <Chem.rdchem.Mol-class.html>`
        :return: The largest fragment.
        :rtype: :rdkit:`Mol <Chem.rdchem.Mol-class.html>`
        """
        log.debug('Running LargestFragmentChooser')
        # TODO: Alternatively allow a list of fragments to be passed as the mol parameter
        best = None
        for frag in Chem.GetMolFrags(mol, asMols=True):
            frag_smiles = Chem.MolToSmiles(frag, isomericSmiles=True)
            log.debug('Fragment: %s', frag_smiles)
            frag_organic = is_organic(frag)
            if self.prefer_organic and best:
                # An organic incumbent is never displaced by a non-organic one
                if best['organic'] and not frag_organic:
                    continue
                # A non-organic incumbent is dropped for any organic fragment
                if frag_organic and not best['organic']:
                    best = None
            # Heavy atoms plus the hydrogens attached to each of them
            n_atoms = sum(1 + atom.GetTotalNumHs() for atom in frag.GetAtoms())
            if best and n_atoms < best['atoms']:
                continue
            frag_weight = rdMolDescriptors.CalcExactMolWt(frag)
            if best and n_atoms == best['atoms']:
                # Same size: the lighter fragment loses
                if frag_weight < best['weight']:
                    continue
                # Same size and weight: the later SMILES loses
                if frag_weight == best['weight'] and frag_smiles > best['smiles']:
                    continue
            log.debug('New largest fragment: %s (%s)', frag_smiles, n_atoms)
            best = {
                'smiles': frag_smiles,
                'fragment': frag,
                'atoms': n_atoms,
                'weight': frag_weight,
                'organic': frag_organic,
            }
        return best['fragment']
Esempio n. 6
0
def parse_epilion(abbr: str) -> dict:
    """Resolve a lipid abbreviation to its formula, exact mass and picture.

    The abbreviation is first converted to an epilion ID, which is then
    decoded to SMILES by the fatty-acid parser or, failing that, by the
    phospholipid parser.

    :param abbr: Lipid abbreviation as entered by the user.
    :return: Dict with keys ``'id'``, ``'formula'``, ``'exactmass'`` (string
        with four decimals) and ``'img'`` (PNG as a base64 data URL). The
        dict is empty when the abbreviation cannot be parsed or the
        structure cannot be generated; both cases are logged.
    """

    fa_decoder = ParserFA()
    pl_decoder = ParserPL()

    info_dct = {}

    converter = Converter(abbr_cfg_path)
    epilion_id = converter.convert_abbr(abbr)

    if fa_decoder.is_fa(epilion_id):
        smi = fa_decoder.get_smi_fa(epilion_id)
    elif pl_decoder.is_pl(epilion_id):
        smi = pl_decoder.get_smi_pl(epilion_id)
    else:
        # No SMILES exists for this ID. Stop here: carrying on would reference
        # an unbound name, including inside the error handler below.
        logger.info(f'Can NOT parse abbreviation: {epilion_id}')
        return info_dct
    logger.info(epilion_id + ': ' + smi)

    try:
        mol = Chem.MolFromSmiles(smi)
        AllChem.Compute2DCoords(mol)
        # m_mass = Descriptors.MolWt(mol)
        m_exactmass = rdMolDescriptors.CalcExactMolWt(mol)
        m_formula = rdMolDescriptors.CalcMolFormula(mol)
        img = Draw.MolToImage(mol, size=(600, 400))
        img_io = BytesIO()
        img.save(img_io, format='png')
        img_data = base64.b64encode(img_io.getvalue())
        img_data_url = r'data:image/png;base64,' + img_data.decode("utf-8")

        info_dct['id'] = epilion_id
        info_dct['formula'] = m_formula
        info_dct['exactmass'] = '%.4f' % m_exactmass
        info_dct['img'] = img_data_url

    except Exception as e:
        # Best effort: a structure that cannot be built or drawn yields an
        # empty result instead of an exception.
        logger.error(f'! FAILED: {epilion_id}')
        logger.error(f'! FAILED to generate structure from SMILES: {smi}')
        logger.error(e)

    return info_dct
Esempio n. 7
0
def get_filter_values(mol):
    """
    Compute every value the filters look at for one molecule.

    The result is a dictionary of descriptors (weight, logP, H-bond counts,
    tPSA, bond, ring and atom counts, formal charge, BRICS bond count,
    chiral centre count, the distinct element symbols) plus two bookkeeping
    entries: "is_good" (initially True) and "rejections" (initially empty).
    """

    assert isinstance(mol, Chem.Mol)

    values = {
        "MW": desc.CalcExactMolWt(mol),
        "logP": crip.MolLogP(mol),
        "HBA": lip.NumHAcceptors(mol),
        "HBD": lip.NumHDonors(mol),
        "tPSA": desc.CalcTPSA(mol),
        "rot_bonds": lip.NumRotatableBonds(mol),
    }
    # Every bond that is not rotatable is counted as rigid.
    values["rigid_bonds"] = mol.GetNumBonds() - values["rot_bonds"]
    values["num_rings"] = lip.RingCount(mol)
    values["num_hetero_atoms"] = lip.NumHeteroatoms(mol)
    # Formal charge as reported by RDKit; not cross-checked here.
    values["charge"] = rdmolops.GetFormalCharge(mol)

    carbon_count, charge_count, largest_ring = get_atom_props(mol)
    values["num_carbons"] = carbon_count
    values["num_charges"] = charge_count
    values["max_ring_size"] = largest_ring

    try:
        ratio = float(values["num_hetero_atoms"]) / float(
            values["num_carbons"])
    except ZeroDivisionError:
        # No carbons at all: use a very large sentinel ratio.
        ratio = 100000000
    values["hc_ratio"] = ratio

    # Number of BRICS bonds, used as a measure of complexity.
    values["fc"] = sum(1 for _ in Brics.FindBRICSBonds(mol))
    # Defaults to True; filters that run later may flip it.
    values["is_good"] = True

    # Distinct element symbols present in the molecule.
    symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
    values["atoms"] = list(set(symbols))

    chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    values["num_chiral_centers"] = len(chiral_centers)
    # Filled in later with the reasons for rejection.
    values["rejections"] = []

    return values
Esempio n. 8
0
 def __init__(self, mol, atom_count=None, MW=None, Tb=None):
     """Fragment a molecule and store the inputs for later estimation.

     :param mol: An RDKit ``Mol`` (or subclass), or anything accepted by
         ``Chem.MolFromSmiles``.
     :param atom_count: Total atom count including hydrogens; counted from
         the hydrogen-added molecule when omitted.
     :param MW: Molecular weight; computed with ``CalcExactMolWt`` on the
         hydrogen-added molecule when omitted.
     :param Tb: Boiling point to use as-is; when omitted it is estimated
         from the fragment counts via ``self.Tb``.
     """
     # isinstance also accepts Mol subclasses, which the previous exact type
     # comparison wrongly sent to the SMILES parser.
     if isinstance(mol, Chem.rdchem.Mol):
         self.rdkitmol = mol
     else:
         self.rdkitmol = Chem.MolFromSmiles(mol)

     # Both derived quantities need the hydrogen-added molecule. Building it
     # only when atom_count was missing made `MW=None` with an explicit
     # atom_count fail with AttributeError.
     if atom_count is None or MW is None:
         self.rdkitmol_Hs = Chem.AddHs(self.rdkitmol)

     if atom_count is None:
         self.atom_count = len(self.rdkitmol_Hs.GetAtoms())
     else:
         self.atom_count = atom_count
     if MW is None:
         self.MW = rdMolDescriptors.CalcExactMolWt(self.rdkitmol_Hs)
     else:
         self.MW = MW

     self.counts, self.success, self.status = smarts_fragment(J_BIGGS_JOBACK_SMARTS_id_dict, rdkitmol=self.rdkitmol)

     # A caller-supplied Tb takes precedence; estimate only when it is
     # missing. The previous condition was inverted, which discarded the
     # supplied value and stored None when nothing was supplied.
     if Tb is None:
         self.Tb_estimated = self.Tb(self.counts)
     else:
         self.Tb_estimated = Tb
Esempio n. 9
0
def get_monoisotopic_mz_and_z(structure):
    """
    Determines the monoisotopic mass value and charge of an ion provided as a SMILES string or .sdf file.

    NOTE(review): "monoiso_mz" is the value of ``CalcExactMolWt`` as returned;
    it is not divided by the charge, so for multiply charged ions it is a
    mass rather than an m/z. Confirm what callers expect before changing it.

    :param structure:    str     a valid SMILES string OR a path to an .sdf file containing a single ion structure.
    :return out_dict:    dict    w/ entries "charge" (int), "monoiso_mz" (float) and "mol" (rdkit mol obj).
    :raises TypeError:           if the input is neither a valid SMILES nor a readable .sdf path.
    :raises NotImplementedError: if no molecule could be obtained (including an .sdf with no records).
    :raises ValueError:          if the structure has zero formal charge.
    """
    # First interpretation: the input is a SMILES string.
    try:
        mol = Chem.MolFromSmiles(structure)
    except TypeError:
        # Not something the SMILES parser accepts; try it as a path instead.
        mol = None

    # Second interpretation: the input is a path to an .sdf file.
    if mol is None:
        try:
            sdf_mols = [entry for entry in Chem.SDMolSupplier(structure)]
        except OSError:
            raise TypeError(
                'The provide structure was neither a valid SMILES string nor a path to an .sdf file.'
            )
        # A file without records used to fail with a bare IndexError; it now
        # falls through to the "could not be analyzed" error below.
        mol = sdf_mols[0] if sdf_mols else None

    # ensure mol exists
    if not mol:
        raise NotImplementedError(
            'For unknown reasons, the provided structure could not be analyzed.'
        )

    # determine properties of mol
    monoiso_mz = rdMolDescriptors.CalcExactMolWt(mol)
    charge = rdmolops.GetFormalCharge(mol)

    # ensure provided structure is of an ion
    if not charge:
        raise ValueError(
            'Provided structures must be of ions, not neutral molecules.')

    return {'charge': int(charge), 'monoiso_mz': monoiso_mz, 'mol': mol}
Esempio n. 10
0
 def calculate_properties(self, smiles=None, mol=None, props=None):
     """Calculate the requested basic properties and store them in self.data.

     :param smiles: SMILES string, used only when ``mol`` is not given.
     :param mol: RDKit molecule; takes precedence over ``smiles``.
     :param props: Names of the properties to compute (``'py_formula'``,
         ``'py_em'``, ``'py_n_Cl_Br'``, ``'py_na'``, ``'py_mw'``,
         ``'py_fsp3'``, ``'py_rb'``, ``'py_tpsa'``, ``'py_clogp'``,
         ``'py_nar'``, ``'py_nhba'``, ``'py_nhbd'``); other names are
         ignored.
     :return: error flag (bool) -- ``True`` when nothing was computed
         because no properties were requested, no structure was supplied
         or the SMILES could not be parsed; ``False`` otherwise.
     """
     # None replaces the former mutable default; both mean "nothing requested".
     if props is None or len(props) == 0:
         return True
     if mol is None:
         if smiles is None:
             # Nothing to parse: report the error rather than crash in RDKit.
             return True
         mol = Chem.MolFromSmiles(smiles)
     if mol is None:
         return True
     if 'py_formula' in props:
         self.data['py_formula'] = desc.CalcMolFormula(mol)
     if 'py_em' in props:
         self.data['py_em'] = round(desc.CalcExactMolWt(mol), 5)
     if 'py_n_Cl_Br' in props:
         # Combined number of chlorine and bromine atoms
         self.data['py_n_Cl_Br'] = sum(
             1 for atom in mol.GetAtoms()
             if atom.GetSymbol() in ('Cl', 'Br'))
     if 'py_na' in props:
         self.data['py_na'] = mol.GetNumAtoms()
     if 'py_mw' in props:
         self.data['py_mw'] = desc._CalcMolWt(mol)
     if 'py_fsp3' in props:
         self.data['py_fsp3'] = desc.CalcFractionCSP3(mol)
     if 'py_rb' in props:
         self.data['py_rb'] = desc.CalcNumRotatableBonds(mol)
     if 'py_tpsa' in props:
         self.data['py_tpsa'] = desc.CalcTPSA(mol)
     if 'py_clogp' in props:
         # CalcCrippenDescriptors returns a tuple; element 0 is stored
         self.data['py_clogp'] = desc.CalcCrippenDescriptors(mol)[0]
     if 'py_nar' in props:
         self.data['py_nar'] = desc.CalcNumAromaticRings(mol)
     if 'py_nhba' in props:
         self.data['py_nhba'] = desc.CalcNumHBA(mol)
     if 'py_nhbd' in props:
         self.data['py_nhbd'] = desc.CalcNumHBD(mol)
     return False
Esempio n. 11
0
def choose_largest_fragment(mol):
    """Return the largest covalent unit.

    Fragments are ranked by atom count (hydrogens included). A tie is
    resolved in favour of the higher molecular weight, and a remaining tie
    in favour of the SMILES that sorts first.

    Raises ``TypeError`` if ``mol`` yields no fragments, because no
    candidate is ever selected.

    :param mol: The molecule to choose the largest fragment from.
    :type mol: :rdkit:`Mol <Chem.rdchem.Mol-class.html>`
    :return: The largest fragment.
    :rtype: :rdkit:`Mol <Chem.rdchem.Mol-class.html>`
    """
    # TODO: Alternatively allow a list of fragments to be passed as the mol parameter
    best = None
    for frag in Chem.GetMolFrags(mol, asMols=True):
        frag_smiles = Chem.MolToSmiles(frag, isomericSmiles=True)
        # Heavy atoms plus the hydrogens attached to each of them
        n_atoms = sum(1 + atom.GetTotalNumHs() for atom in frag.GetAtoms())
        if best and n_atoms < best['atoms']:
            continue
        frag_weight = rdMolDescriptors.CalcExactMolWt(frag)
        if best and n_atoms == best['atoms']:
            # Same size: the lighter fragment loses
            if frag_weight < best['weight']:
                continue
            # Same size and weight: the later SMILES loses
            if frag_weight == best['weight'] and frag_smiles > best['smiles']:
                continue
        best = {
            'smiles': frag_smiles,
            'fragment': frag,
            'atoms': n_atoms,
            'weight': frag_weight,
        }
    return best['fragment']
Esempio n. 12
0
def computeFeatures(mol):
	"""Build the flat feature list for one molecule.

	The list starts with the ring count, eight element counts (N, O, C, B,
	P, S, F, I), the double-bond count, the Labute ASA and the exact
	molecular weight. It continues with every SlogP_VSA value and ends with
	every value of the ``recurseMolHCount`` result, in iteration order.
	"""
	ring_count = rdMolDescriptors.CalcNumRings(mol)
	# The rotatable-bond count is computed but is not part of the returned list.
	rotatable_bonds = rdMolDescriptors.CalcNumRotatableBonds(mol)
	element_counts = [
		countNitrogens(mol),
		countOxygens(mol),
		countCarbons(mol),
		countBorons(mol),
		countPhos(mol),
		countSulfurs(mol),
		countFluorine(mol),
		countIodine(mol),
	]
	double_bonds = countDoubleBonds(mol)
	labute_asa = rdMolDescriptors.CalcLabuteASA(mol)
	exact_weight = rdMolDescriptors.CalcExactMolWt(mol)
	slogp_vsa = rdMolDescriptors.SlogP_VSA_(mol)
	h_distribution = recurseMolHCount(mol)

	features = [ring_count] + element_counts
	features += [double_bonds, labute_asa, exact_weight]
	features.extend(slogp_vsa)
	features.extend(h_distribution[key] for key in h_distribution)
	return features
Esempio n. 13
0
def feature_fp(smiles):
    """Return the MQN vector of a SMILES followed by 16 scalar descriptors.

    The rotatable-bond count appears twice (first and third of the appended
    values), which keeps the vector layout as it has always been.
    """
    mol = Chem.MolFromSmiles(smiles)
    fp = rdMolDescriptors.MQNs_(mol)

    # Appended in exactly this order after the MQN values.
    extra_descriptors = (
        rdMolDescriptors.CalcNumRotatableBonds,
        rdMolDescriptors.CalcExactMolWt,
        rdMolDescriptors.CalcNumRotatableBonds,
        rdMolDescriptors.CalcFractionCSP3,
        rdMolDescriptors.CalcNumAliphaticCarbocycles,
        rdMolDescriptors.CalcNumAliphaticHeterocycles,
        rdMolDescriptors.CalcNumAliphaticRings,
        rdMolDescriptors.CalcNumAromaticCarbocycles,
        rdMolDescriptors.CalcNumAromaticHeterocycles,
        rdMolDescriptors.CalcNumAromaticRings,
        rdMolDescriptors.CalcNumBridgeheadAtoms,
        rdMolDescriptors.CalcNumRings,
        rdMolDescriptors.CalcNumAmideBonds,
        rdMolDescriptors.CalcNumHeterocycles,
        rdMolDescriptors.CalcNumSpiroAtoms,
        rdMolDescriptors.CalcTPSA,
    )
    for calc in extra_descriptors:
        fp.append(calc(mol))

    return np.array(fp)
def calculate_scalar_descriptors(molecule, symbols):
    """Return a 1-D array of scalar descriptors for one molecule.

    Layout, in order: asphericity, the Crippen descriptors, seven further
    whole-molecule descriptors, the number of H, C, N, O and F entries in
    ``symbols``, eight ring-related counts, and finally the amide-bond,
    H-bond-acceptor and H-bond-donor counts.

    :param molecule: RDKit molecule the descriptors are computed on.
    :param symbols: Sequence of element symbols; only used for the
        elemental distribution.
    """
    features = [rdMD.CalcAsphericity(molecule)]
    features.extend(rdMD.CalcCrippenDescriptors(molecule))

    global_descriptors = (
        rdMD.CalcExactMolWt,
        rdMD.CalcEccentricity,
        rdMD.CalcFractionCSP3,
        rdMD.CalcLabuteASA,
        rdMD.CalcNPR1,
        rdMD.CalcNPR2,
        rdMD.CalcHallKierAlpha,
    )
    for calc in global_descriptors:
        features.append(calc(molecule))

    # elemental distribution
    symbols = np.array(symbols)
    for element in ('H', 'C', 'N', 'O', 'F'):
        features.append(np.sum(symbols == element))

    # ring features, followed by the remaining counts
    count_descriptors = (
        rdMD.CalcNumAliphaticCarbocycles,
        rdMD.CalcNumAliphaticHeterocycles,
        rdMD.CalcNumAromaticCarbocycles,
        rdMD.CalcNumAromaticHeterocycles,
        rdMD.CalcNumSaturatedCarbocycles,
        rdMD.CalcNumSaturatedHeterocycles,
        rdMD.CalcNumSpiroAtoms,  # atom shared between rings with one bond
        rdMD.CalcNumBridgeheadAtoms,  # atom shared between rings with at least two bonds
        rdMD.CalcNumAmideBonds,
        rdMD.CalcNumHBA,  # number of hydrogen acceptors
        rdMD.CalcNumHBD,  # number of hydrogen donors
    )
    for calc in count_descriptors:
        features.append(calc(molecule))

    return np.array(features)
Esempio n. 15
0
def _select_fa_entry(fa_df, link, c_num, db_num, omega_type):
    """Return the first fatty-acid row matching link, carbon and DB count.

    When ``omega_type`` is non-zero the match is narrowed to rows with that
    omega value before the first row is taken.
    """
    base_query = 'Link == "%s" and C == % i and DB == %i' % (
        link, c_num, db_num)
    matched_df = fa_df.query(base_query)
    if omega_type == 0:
        return matched_df.head(1)
    return matched_df.query(
        'Link == "%s" and omega == %i' % (link, omega_type)).head(1)


def _build_lpp_mol(lpp_dct, mzcalc, fingerprint_gen):
    """Build the annotated RDKit molecule for one LPP record.

    Adds 'PRECURSOR_JSON', 'EXACT_MASS' and 'FINGERPRINT' to ``lpp_dct`` in
    place and copies every entry of ``lpp_dct`` onto the molecule as a
    string property.
    """
    lpp_mol = Chem.MolFromSmiles(str(lpp_dct['LPP_SMILES']))
    AllChem.Compute2DCoords(lpp_mol)
    lpp_mol.SetProp('_Name', str(lpp_dct['LM_ID']))
    lpp_mass = Descriptors.MolWt(lpp_mol)
    lpp_exactmass = rdMolDescriptors.CalcExactMolWt(lpp_mol)
    lpp_formula = rdMolDescriptors.CalcMolFormula(lpp_mol)
    lpp_mol.SetProp('EXACT_MASS', '%.6f' % lpp_exactmass)
    lpp_mol.SetProp('NOMINAL_MASS', '%.3f' % lpp_mass)
    lpp_mol.SetProp('FORMULA', lpp_formula)
    lpp_sn2_smi = lpp_dct['SN2_SMILES']

    # PC records whose sn2 SMILES does not end in 'C(O)=O)=O' get the
    # [M+HCOO]- precursor; everything else gets [M-H]-.
    if str(lpp_dct['LPP_CLASS']) == 'PC' and lpp_sn2_smi[-9:] != r'C(O)=O)=O':
        charge_mode = '[M+HCOO]-'
    else:
        charge_mode = '[M-H]-'
    precursor_elem = mzcalc.get_elements(lpp_formula)
    precursor_formula = mzcalc.get_formula(precursor_elem, charge=charge_mode)
    precursor_mz = mzcalc.get_mono_mz(lpp_formula, charge=charge_mode)
    precursor_info = '{"%s": ["%s", %f]}' % (
        charge_mode, precursor_formula[0], precursor_mz)

    lpp_dct['PRECURSOR_JSON'] = precursor_info
    lpp_mol.SetProp('PRECURSOR_JSON', precursor_info)
    lpp_dct['EXACT_MASS'] = lpp_exactmass
    fp_mz_lst = fingerprint_gen.get_fingerprint(lpp_dct)
    lpp_dct['FINGERPRINT'] = fp_mz_lst
    lpp_mol.SetProp('FINGERPRINT', json.dumps(fp_mz_lst))

    # This pass copies every dict entry as str(), which also replaces the
    # formatted EXACT_MASS and the JSON FINGERPRINT set above.
    for key in lpp_dct.keys():
        lpp_mol.SetProp(key, str(lpp_dct[key]))

    return lpp_mol


def theolpp(usr_params):
    """
    Generate theoretical oxidized phospholipids (LPP) and export them.

    For every phospholipid of the selected class in the input table, both
    fatty-acid chains are oxidized and all allowed sn1/sn2 combinations are
    merged into LPP structures. The unique structures are written to an SDF
    file (and optionally an MSP file), then summarised to
    ``<sdf>.xlsx`` and ``<sdf>_FA_SUM.xlsx``.

    param_dct = {'lipid_class': lipid_class, 'ox_level': ox_level,
                 'oap_mode': oap_mode, 'ocp_mode': ocp_mode,
                 'lyso_oap_mode': lyso_oap_mode, 'lyso_ocp_mode': lyso_ocp_mode,
                 'ox_max': ox_max, 'keto_max': keto_max, 'ooh_max': ooh_max, 'epoxy_max': epoxy_max,
                 'lipid_lst_path': lipid_lst_path, 'lipid_tab': lipid_tab,
                 'prostane_mode': prostane_mode, 'ox_prostane_mode': ox_prostane_mode,
                 'sdf_path': sdf_path, 'msp_mode': msp_mode, 'msp_path': msp_path,
                 'mod_lst_path': mod_lst_path, 'fa_lst_path': fa_lst_path, 'prostane_mod_path': prostane_mod_path,
                 'prostane_abbr_path': prostane_abbr_path, 'frag_pattern_path': frag_pattern_path}
    :param usr_params: dict with the keys listed above; 'pl_hg_path' is
        read as well.
    :return: tuple of two status strings (number of LPP generated, number
        of phospholipids processed and elapsed time).
    """

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    t_start = time.perf_counter()

    pl_table = usr_params['lipid_lst_path']
    fa_table = usr_params['fa_lst_path']
    mod_table = usr_params['mod_lst_path']
    isop_cfg = usr_params['prostane_mod_path']
    isopabbr_cfg = usr_params['prostane_abbr_path']
    # pl_class_use_lst = ['PA', 'PC', 'PE', 'PG', 'PI', 'PIP', 'PS']
    pl_class = usr_params['lipid_class']
    pl_class_use_lst = [pl_class]
    ox_level = usr_params['ox_level']

    oap_mode = usr_params['oap_mode']
    ocp_mode = usr_params['ocp_mode']
    lyso_oap_mode = usr_params['lyso_oap_mode']
    lyso_ocp_mode = usr_params['lyso_ocp_mode']

    ox_max = usr_params['ox_max']
    keto_max = usr_params['keto_max']
    ooh_max = usr_params['ooh_max']
    epoxy_max = usr_params['epoxy_max']

    prostane_mode = usr_params['prostane_mode']
    prostane_ox_mode = usr_params['ox_prostane_mode']
    save_sdf = usr_params['sdf_path']
    save_spectra = usr_params['msp_mode']
    save_msp = usr_params['msp_path']
    score_xlsx = usr_params['frag_pattern_path']
    pl_fp_xlsx = usr_params['pl_hg_path']

    # NOTE(review): `sheetname` is the keyword of older pandas releases
    # (newer ones use `sheet_name`); left unchanged to match the pandas
    # version that also still provides pd.Panel, used further down.
    pl_df = pd.read_excel(pl_table, sheetname=usr_params['lipid_tab'])
    fa_df = pd.read_csv(fa_table, index_col=0)
    print(pl_df.head())

    # Select export species OAP, OCP, Lyso OAP, Lyso OCP
    ban_lst = ['LYSOLYSO']
    if oap_mode == 0:
        ban_lst.extend(['UNMODOAP', 'OAPUNMOD', 'OAPOAP'])
    if ocp_mode == 0:
        ban_lst.extend(['UNMODOCP', 'OCPUNMOD', 'OCPOCP'])
    if lyso_oap_mode == 0:
        ban_lst.extend(['LYSOOAP', 'OAPLYSO'])
    if lyso_ocp_mode == 0:
        ban_lst.extend(['LYSOOCP', 'OCPLYSO'])
    if ox_level == 1:
        ban_lst.extend(
            ['OAPOAP', 'OCPOCP', 'OAPOCP', 'OCPOAP', 'OAPUNMOD', 'OCPUNMOD'])

    ox_param_dct = {
        'MAX_MOD': ox_max,
        'MAX_KETO': keto_max,
        'MAX_OOH': ooh_max,
        'MAX_EPOXY': epoxy_max
    }

    # The MSP file is opened up front so that a bad path fails before the
    # enumeration starts.
    if save_spectra == 1 and len(save_msp) > 0:
        msp_obj = open(save_msp, mode='w')
    else:
        msp_obj = None
    sdf_dct = {}

    parser = PLParser()
    # Not used below; kept in case the constructor has side effects.
    abbr_gen = AbbrGenerator()

    frag_gen = TheoFrag(pl_class, score_xlsx)
    fingerprint_gen = FingerprintGen(pl_fp_xlsx)

    c_lst = []

    # Cache of oxidation results per fatty-acid abbreviation
    fa_lpp_df_dct = {}

    sum_theo_lpp_dct = {}
    for (_idx, _row) in pl_df.iterrows():

        _pl_abbr = str(_row['phospholipids'])

        _pl_elem_lst, pl_info_dct = parser.get_composition(_pl_abbr)
        print('PL composition ==>', _pl_elem_lst)
        _pl_hg_abbr = _pl_elem_lst[0]

        # get smiles from abbr

        if _pl_hg_abbr in pl_class_use_lst:
            c_lst.append(_pl_abbr)

            # prepare output
            _pl_lpp_df = pd.DataFrame()

            print('Start oxidation of ==>', _pl_abbr)
            _pl_sn1_abbr = _pl_elem_lst[1]
            _pl_sn2_abbr = _pl_elem_lst[2]
            if len(pl_info_dct.keys()) > 0:
                # Both chains use the same lookup. The sn1 omega branch used
                # to build its query without the "and" between the Link and
                # C terms, which is not a valid query expression.
                sn1_fa_df = _select_fa_entry(
                    fa_df, pl_info_dct['sn1_link'],
                    int(pl_info_dct['sn1_c_num']),
                    int(pl_info_dct['sn1_db_num']),
                    int(pl_info_dct['sn1_omega_type']))
                sn2_fa_df = _select_fa_entry(
                    fa_df, pl_info_dct['sn2_link'],
                    int(pl_info_dct['sn2_c_num']),
                    int(pl_info_dct['sn2_db_num']),
                    int(pl_info_dct['sn2_omega_type']))

                _pl_sn1_smiles = sn1_fa_df.loc[_pl_sn1_abbr, 'SMILES']
                _pl_sn2_smiles = sn2_fa_df.loc[_pl_sn2_abbr, 'SMILES']
                print('sn1 =>', _pl_sn1_smiles, '|| sn2 =>', _pl_sn2_smiles)

            else:
                _pl_sn1_smiles = ''
                _pl_sn2_smiles = ''

            # check if FA already oxidized to speed up
            if _pl_sn1_abbr in fa_lpp_df_dct.keys():
                sn1_mod_sum_df = fa_lpp_df_dct[_pl_sn1_abbr]
            else:
                sn1_link_dct = fa_link_filter(_pl_sn1_smiles)
                sn1_mod_sum_df = oxidizer(sn1_link_dct, mod_table, isop_cfg,
                                          isopabbr_cfg, ox_level, ox_param_dct,
                                          prostane_mode, prostane_ox_mode)
                fa_lpp_df_dct[_pl_sn1_abbr] = sn1_mod_sum_df.copy()

            if _pl_sn2_abbr in fa_lpp_df_dct.keys():
                sn2_mod_sum_df = fa_lpp_df_dct[_pl_sn2_abbr]
            else:
                sn2_link_dct = fa_link_filter(_pl_sn2_smiles)
                sn2_mod_sum_df = oxidizer(sn2_link_dct, mod_table, isop_cfg,
                                          isopabbr_cfg, ox_level, ox_param_dct,
                                          prostane_mode, prostane_ox_mode)
                fa_lpp_df_dct[_pl_sn2_abbr] = sn2_mod_sum_df.copy()

            for (_sn1_idx, _sn1_row) in sn1_mod_sum_df.iterrows():
                _sn1_mod_smiles = _sn1_row['FULL_SMILES']
                _sn1_abbr_str = _sn1_row['FA_ABBR']
                _sn1_typ_str = _sn1_row['FA_TYPE']
                _sn1_formula_str = _sn1_row['FA_FORMULA']

                for (_sn2_idx, _sn2_row) in sn2_mod_sum_df.iterrows():
                    _sn2_mod_smiles = _sn2_row['FULL_SMILES']
                    _sn2_abbr_str = _sn2_row['FA_ABBR']
                    _sn2_typ_str = _sn2_row['FA_TYPE']
                    _sn2_formula_str = _sn2_row['FA_FORMULA']

                    # The combined type, e.g. sn1 type + sn2 type, decides
                    # whether this pair is exported.
                    _oap_ocp_lst = [_sn1_typ_str, _sn2_typ_str]
                    _lpp_typ = ''.join(_oap_ocp_lst)

                    if _lpp_typ not in ban_lst:
                        _lpp_smiles = LPPmerge.pl_lpp(_pl_hg_abbr,
                                                      sn1=_sn1_mod_smiles,
                                                      sn2=_sn2_mod_smiles)
                        _lpp_id_str = str(''.join([
                            _pl_hg_abbr, '(', _sn1_abbr_str, '/',
                            _sn2_abbr_str, ')'
                        ]))

                        _lpp_sub_class_json = '{"SN1": "%s", "SN2": "%s"}' % (
                            _sn1_typ_str, _sn2_typ_str)

                        _lpp_info_dct = {
                            'LPP_ORIGIN': _pl_abbr,
                            'LPP_SMILES': _lpp_smiles,
                            'LPP_CLASS': _pl_hg_abbr,
                            'SN1_SMILES': _sn1_mod_smiles,
                            'SN2_SMILES': _sn2_mod_smiles,
                            'SN1_ABBR': _sn1_abbr_str,
                            'SN2_ABBR': _sn2_abbr_str,
                            'SN1_JSON': _sn1_row['FA_JSON'],
                            'SN2_JSON': _sn2_row['FA_JSON'],
                            'SN1_FORMULA': _sn1_formula_str,
                            'SN2_FORMULA': _sn2_formula_str,
                            'LM_ID': _lpp_id_str,
                            'SN_JSON': _lpp_sub_class_json
                        }
                        if save_spectra == 1:
                            _lpp_info_dct['MSP_JSON'] = frag_gen.calc_frags(
                                _lpp_info_dct)

                        _lpp_info_se = pd.Series(data=_lpp_info_dct)
                        _pl_lpp_df[_lpp_id_str] = _lpp_info_se

                        # check if same lpp generated already
                        # Currently use bulk settings
                        if _lpp_id_str in sdf_dct.keys():
                            # Known structure: only extend its origin list
                            _lpp_origin = sdf_dct[_lpp_id_str]['LPP_ORIGIN']
                            _lpp_origin_lst = _lpp_origin.split(',')
                            if _pl_abbr not in _lpp_origin_lst:
                                _lpp_origin_lst.append(_pl_abbr)
                                sdf_dct[_lpp_id_str]['LPP_ORIGIN'] = ','.join(
                                    _lpp_origin_lst)
                        else:
                            sdf_dct[_lpp_id_str] = _lpp_info_dct.copy()

                        # clean memory by deleting these dicts and series
                        del _lpp_info_dct, _lpp_info_se

            # generate summary table
            _pl_lpp_df = _pl_lpp_df.transpose()
            print('==> %i of LPP generated !!' % _pl_lpp_df.shape[0])
            print('==> ==> Move to next lipid==> ')
            # print(_pl_lpp_df.head())
            sum_theo_lpp_dct[_pl_abbr] = _pl_lpp_df

            # create sdf
            # for (_lpp_i, _lpp_r) in _pl_lpp_df.iterrows():

    # NOTE(review): pd.Panel no longer exists in current pandas releases; it
    # is only used here to print the shape. Left unchanged (see the
    # read_excel note above).
    sum_theo_lpp_pl = pd.Panel(data=sum_theo_lpp_dct)
    print(sum_theo_lpp_pl.shape)

    # write to sdf
    print('==>Start to generate SDF ==> MSP mode = %i' % save_spectra)
    print('!! %i structures in total !!' % len(sdf_dct.keys()))

    mzcalc = MZcalc()

    # The handle is kept so it can be closed explicitly: the summaries below
    # re-read the SDF from disk and need the data flushed.
    sdf_file = open(save_sdf, mode='w')
    sdf_writer = Chem.SDWriter(sdf_file)
    try:
        if save_spectra == 1:
            for _lpp_dct in sdf_dct.values():
                # Only structures with at least one MSP entry are exported
                if len(json.loads(_lpp_dct['MSP_JSON']).keys()) > 0:
                    _lpp_mol = _build_lpp_mol(_lpp_dct, mzcalc,
                                              fingerprint_gen)
                    sdf_writer.write(_lpp_mol)
                    if len(save_msp) > 0:
                        MSPcreator.to_msp(msp_obj, _lpp_dct)

        elif save_spectra == 0:
            for _lpp_dct in sdf_dct.values():
                _lpp_mol = _build_lpp_mol(_lpp_dct, mzcalc, fingerprint_gen)
                sdf_writer.write(_lpp_mol)
    finally:
        sdf_writer.close()
        sdf_file.close()
        if msp_obj is not None:
            msp_obj.close()

    SDFsummary.sdf2xlsx(save_sdf, str(save_sdf)[:-4] + '.xlsx')
    # if save_spectra == 1:
    SDFsummary.sdf2sum_fa(save_sdf, str(save_sdf)[:-4] + '_FA_SUM.xlsx')

    t_spent = time.perf_counter() - t_start
    info_updater_1 = '=>%i of LPP generated ==> ' % len(sdf_dct.keys())
    info_updater_2 = '=>==> %i of phospholipids processed in %.3fs ==> ==> Finished !!!!!!' % (
        len(c_lst), t_spent)

    return info_updater_1, info_updater_2
def mol_weight_from_smiles(smile):
    """Return the exact molecular weight of the molecule written as SMILES.

    The SMILES string is parsed with ``Chem.MolFromSmiles`` and the resulting
    molecule is handed to ``rdMolDescriptors.CalcExactMolWt``.
    """
    molecule = Chem.MolFromSmiles(smile)
    exact_weight = rdMolDescriptors.CalcExactMolWt(molecule)
    return exact_weight
Esempio n. 17
0
 def HasMatch(self, mol):
     """Return True when the exact weight of ``mol`` is outside the window.

     The window is ``[self.minMw, self.maxMw]`` (both ends inclusive). After
     the check, an unparsable SMILES is fed to the parser and an error
     message is logged before the result is returned.
     """
     exact_weight = rdMolDescriptors.CalcExactMolWt(mol)
     inside_window = self.minMw <= exact_weight <= self.maxMw
     outside_window = not inside_window
     # NOTE(review): these two calls look intentional (they exercise the RDKit
     # parser/logger from inside the matcher) -- confirm before removing.
     Chem.MolFromSmiles("---")
     Chem.LogErrorMsg("dasfsadf")
     return outside_window
Esempio n. 18
0
    def MW(self) -> float:
        """Exact molecular weight of ``self.mol``, rounded with ``round()``."""
        exact_weight = rdMolDescriptors.CalcExactMolWt(self.mol)
        return round(exact_weight)
Esempio n. 19
0
def _read_text_file(path):
    """Return the whole content of the text file at ``path``."""
    with open(path, 'r') as handle:
        return handle.read()


def build_reactions(perturbations_all_paths, mcs_neighbours):
    """Build a reaction SMILES for every perturbation (pair of ligand PDBs).

    For each pair, the ligands most similar to a query ligand are used to
    compute a maximum common substructure (MCS). The MCS is deleted from both
    members of the pair and what remains is written as ``A_rest>>B_rest``.

    Args:
        perturbations_all_paths: indexable sequence of pairs
            ``(path_to_ligand_A_pdb, path_to_ligand_B_pdb)``.
        mcs_neighbours: number of most-similar ligands passed to the MCS
            search.

    Returns:
        A list with one ``[perturbation, reaction, member1, member2]`` entry
        per pair, where ``perturbation`` is ``"ligA>ligB"`` and the other three
        are SMILES strings. Returns ``None`` as soon as an MCS SMARTS pattern
        cannot be built (results gathered so far are discarded).
    """
    perturbation_reactions = []

    # Parse and fingerprint every ligand once up front. The set of ligands
    # does not depend on the pair being processed, so doing this inside the
    # loop only repeated the same file reads and fingerprint computations.
    all_members = []
    all_members_FPs = []
    for member in itertools.chain.from_iterable(perturbations_all_paths):
        member_mol = rdmolfiles.MolFromPDBBlock(_read_text_file(member))
        all_members.append(member_mol)
        all_members_FPs.append(FingerprintMols.FingerprintMol(member_mol))

    for perturbation_pair_path in perturbations_all_paths:
        # NOTE(review): the query ligand is always taken from the FIRST pair,
        # not from the pair being processed, so every perturbation ends up
        # with the same neighbours -- confirm whether `perturbation_pair_path`
        # was intended here.
        first_pair = perturbations_all_paths[0]
        candidate1 = rdmolfiles.MolFromPDBBlock(_read_text_file(first_pair[0]))
        candidate2 = rdmolfiles.MolFromPDBBlock(_read_text_file(first_pair[1]))

        # NOTE(review): the original comment said "lowest weight", but the
        # comparison selects the member with the larger (or equal) exact
        # weight. Behaviour is kept as it was -- confirm the intent.
        size_member1 = rdMolDescriptors.CalcExactMolWt(candidate1)
        size_member2 = rdMolDescriptors.CalcExactMolWt(candidate2)
        query_mol = candidate1 if size_member1 >= size_member2 else candidate2
        query_member = FingerprintMols.FingerprintMol(query_mol)

        similarities = [
            AllChem.DataStructs.FingerprintSimilarity(
                query_member, target_fp, metric=DataStructs.DiceSimilarity)
            for target_fp in all_members_FPs
        ]
        similarities_to_query = dict(zip(all_members, similarities))

        # Rank ligands by similarity (highest first) and keep only the first
        # ligand seen for each distinct similarity value; ligands that score
        # exactly the same as one already kept are dropped.
        ranked = sorted(similarities_to_query.items(),
                        key=lambda kv: kv[1],
                        reverse=True)
        seen_scores = set()
        similar_hits = []
        for mol, score in ranked:
            if score not in seen_scores:
                seen_scores.add(score)
                similar_hits.append(mol)

        # keep the requested number of most similar ligands (rdkit molecules):
        neighbours = similar_hits[:mcs_neighbours]

        # regenerate the perturbation name (A>B) from the two paths:
        ligA = perturbation_pair_path[0].replace("../fesetup/poses/",
                                                 "").replace(
                                                     "/ligand.pdb", "")
        ligB = perturbation_pair_path[1].replace("../fesetup/poses/",
                                                 "").replace(
                                                     "/ligand.pdb", "")
        perturbation = str(ligA) + ">" + str(ligB)

        # read in the PDB files of the pair being processed:
        member1 = rdmolfiles.MolFromPDBBlock(
            _read_text_file(perturbation_pair_path[0]))
        member2 = rdmolfiles.MolFromPDBBlock(
            _read_text_file(perturbation_pair_path[1]))

        # generate MCS (complete rings only) using the neighbours list:
        print("##########")
        print(str(perturbation) + ":")
        MCS_object = rdFMCS.FindMCS(neighbours, completeRingsOnly=True)
        MCS_SMARTS = Chem.MolFromSmarts(MCS_object.smartsString)

        if MCS_SMARTS is None:
            print("Could not generate MCS pattern")
            return

        # Use the SMARTS pattern to isolate the unique parts of each member.
        # If several unique fragments remain in one molecule they are written
        # as fragment1.fragment2 ('.' signifies a non-bonded connection).
        member1_stripped = AllChem.DeleteSubstructs(member1, MCS_SMARTS)
        member2_stripped = AllChem.DeleteSubstructs(member2, MCS_SMARTS)
        member1_stripped_smiles = str(Chem.MolToSmiles(member1_stripped))
        member2_stripped_smiles = str(Chem.MolToSmiles(member2_stripped))

        # construct the reaction SMILES string from the two members:
        reaction = member1_stripped_smiles + ">>" + member2_stripped_smiles
        print(reaction)

        # combine all results (name of perturbation, reaction SMILES,
        # ligand A SMILES and ligand B SMILES)
        perturbation_reactions.append([
            perturbation, reaction, member1_stripped_smiles,
            member2_stripped_smiles
        ])

    return perturbation_reactions
Esempio n. 20
0
 def HasMatch(self, mol):
     """Return True when the exact weight of ``mol`` is outside the window.

     The window is ``[self.minMw, self.maxMw]``, both ends inclusive.
     """
     exact_weight = rdMolDescriptors.CalcExactMolWt(mol)
     inside_window = self.minMw <= exact_weight <= self.maxMw
     return not inside_window
# Accumulators for the scan below.
# NOTE(review): nothing in this fragment fills `lipinski_violators` or updates
# `counter`; the snippet is cut off right after the final `if`, so presumably
# the missing body does that -- confirm against the full script.
lipinski_violators = []
counter = 0

print("Scanning molecules for Lipinski violations.")
for mol in tqdm(mols):
    # Assume no violations
    dono_viol = False
    acceptor_viol = False
    mw_viol = False
    logp_viol = False

    # Use RDKit functions to get hdonors, acceptors, molecular weight and
    # logP.
    hdonors = Lipinski.NHOHCount(mol)
    hacceptors = Lipinski.NOCount(mol)
    mw = rdMolDescriptors.CalcExactMolWt(mol)
    logp = Crippen.MolLogP(mol)

    # Check each property against its threshold (donors > 5, acceptors > 10,
    # weight > 500, logP > 5) to see whether the current mol violates a rule.
    if hdonors > 5:
        dono_viol = True
    if hacceptors > 10:
        acceptor_viol = True
    if mw > 500:
        mw_viol = True
    if logp > 5:
        logp_viol = True

    # Check if the violation sum is greater than one and assign the molecule
    # as a violator saving the index to a list.
    if sum([dono_viol, acceptor_viol, mw_viol, logp_viol]) > 1:
 def compute_Wt(self, mol_input):
     """Return the exact molecular weight RDKit computes for ``mol_input``."""
     exact_weight = rdMolDescriptors.CalcExactMolWt(mol_input)
     return exact_weight
Esempio n. 23
0
                return


def check_maxes(formd, maxes):
    """Return True when every count in ``formd`` is strictly below its limit.

    ``formd`` maps a key to a value and ``maxes`` maps the same keys to their
    limits. Every entry is looked up before the result is computed, so a key
    missing from ``maxes`` always raises ``KeyError``.
    """
    within_limit = []
    for element, count in formd.items():
        within_limit.append(count < maxes[element])
    return all(within_limit)


def rec_formula(mz, ppm=5):
    """Start the formula search for mass ``mz`` within a ppm tolerance.

    Args:
        mz: target mass.
        ppm: tolerance in parts per million; the accepted window is
            ``mz -/+ mz * ppm * 1e-6``.

    Returns:
        The generator produced by ``_rec_form`` for an all-zero starting
        formula (one entry per key of ``eles``).
    """
    maxes = dict(get_elemaxs(mz))
    error = mz * (ppm * 1E-6)
    mlow, mhigh = mz - error, mz + error
    formula = {e: 0 for e in eles.keys()}
    # `_rec_form` takes `ele_idx` as its first parameter; the previous call
    # omitted it and therefore always raised TypeError.
    # NOTE(review): 0 is assumed to be the intended starting index -- confirm
    # once `_rec_form` is finished.
    return _rec_form(0, formula, mlow, mhigh, maxes)


def _rec_form(ele_idx, formula, mlow, mhigh, maxes):
    """Yield ``formula`` when it passes the formula and per-element checks.

    NOTE(review): this generator looks unfinished -- `ele_idx` is never used,
    the result of `check_formula` is overwritten, and the `else` branch
    references the undefined name `ele`. Confirm the intended recursion
    before relying on it.
    """
    good_form = check_formula(formula, mlow, mhigh)
    # NOTE(review): unconditionally overrides the check above (possibly a
    # debugging leftover), so only `under_maxes` decides the outcome.
    good_form = True
    under_maxes = check_maxes(formula, maxes)
    if good_form and under_maxes:
        yield formula
    else:
        # NOTE(review): `ele` is not defined in this function; reaching this
        # line raises NameError.
        formula[ele]
        pass


# Exact weight and molecular formula of every sample molecule.
sm = get_soome_mols()
masses = list(map(rdMolDescriptors.CalcExactMolWt, sm))
formulas = list(map(rdMolDescriptors.CalcMolFormula, sm))
Esempio n. 24
0
def formatdb(smiles):
    """Convert a tab-separated SMILES file into an annotated table.

    The input file has no header; column 0 holds the SMILES and column 1 an
    identifier. For every structure the InChI, InChIKey (whole and its first
    two dash-separated parts), molecular formula and exact mass are computed,
    the classification returned by ``query_inchikey`` is merged in, rows
    without an InChIKey are dropped and the result is written to
    ``<smiles>_FORMATED.txt`` (tab-separated).

    Args:
        smiles: path of the input file. NOTE: the file is deleted as soon as
            it has been read.

    Returns:
        The string ``'Done'``.
    """
    df = pd.read_csv(smiles, sep='\t', header=None)
    os.remove(smiles)

    smi = list(df[0])
    mols = [Chem.MolFromSmiles(x) for x in smi]
    inchi = []
    ikeys = []
    ikey1 = []
    ikey2 = []
    form = []
    exmass = []
    for mol in mols:
        # Compute the whole row before touching the result lists. Appending
        # inside the try block could leave the lists with different lengths
        # when a later step failed (a value was appended, then the except
        # branch appended '' to the same list again).
        try:
            mol_inchi = Chem.rdinchi.MolToInchi(mol)[0]
            ikey = Chem.rdinchi.InchiToInchiKey(mol_inchi)
            key_parts = ikey.split('-')
            row = (mol_inchi, ikey, key_parts[0], key_parts[1],
                   rdMD.CalcMolFormula(mol), rdMD.CalcExactMolWt(mol))
        except Exception:
            # Best effort: a structure that cannot be processed (e.g. an
            # unparsable SMILES) becomes an empty row that is dropped below.
            row = ('', '', '', '', '', '')

        inchi.append(row[0])
        ikeys.append(row[1])
        ikey1.append(row[2])
        ikey2.append(row[3])
        form.append(row[4])
        exmass.append(row[5])

    data = {
        'inchikey': ikeys,
        'MonoisotopicMass': exmass,
        'InChI': inchi,
        'SMILES': smi,
        'Identifier': list(df[1]),
        'InChIKey2': ikey2,
        'InChIKey1': ikey1,
        'MolecularFormula': form
    }

    cn = [
        "inchikey", "MonoisotopicMass", "InChI", "SMILES", "Identifier",
        "InChIKey2", "InChIKey1", "MolecularFormula"
    ]
    formdata = pd.DataFrame(data, columns=cn)

    classy = query_inchikey(ikeys)

    # If the structure do not show a classification, try query
    #in_process = get_class(list(df[0]), chunksize=100)
    #classy = poll(in_process)

    classy = classy[['inchikey', 'kingdom', 'superclass', 'class', 'subclass']]
    classy.columns = [
        'inchikey', 'kingdom_name', 'superclass_name', 'class_name',
        'subclass_name'
    ]

    formfinal = pd.merge(formdata, classy, how='left', on=['inchikey'])

    formfinal = formfinal.fillna('')
    formfinal.drop('inchikey', axis=1, inplace=True)

    # Positions of the structures for which no InChIKey could be computed.
    # NOTE(review): positional dropping assumes the merge kept one row per
    # input structure (i.e. `classy` has unique inchikeys) -- confirm.
    failed_rows = [pos for pos, key in enumerate(ikeys) if key == '']
    formfinal.drop(formfinal.index[failed_rows], inplace=True)

    formfinal.to_csv(smiles + '_FORMATED.txt', index=False, sep='\t')
    return 'Done'
Esempio n. 25
0
def smiles_to_all_labels(df):
    """Return a copy of ``df`` extended with descriptor columns.

    Every function found in the fragment module ``f`` and the Lipinski module
    ``l`` is applied to each molecule parsed from ``df['SMILES']`` and stored
    in a column named after the function. Functions whose name starts with
    ``_`` are skipped, as are functions whose name equals a column of ``df``
    holding a single distinct value. The columns ``fr_Al_COO``,
    ``HeavyAtomCount``, ``GetNumAtoms`` and ``CalcExactMolWt`` are always
    (re)computed at the end.

    Args:
        df: DataFrame with a ``SMILES`` column.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    feature_df = df.copy()

    # Parse each SMILES once and reuse the molecule for every descriptor
    # instead of re-parsing it for each column.
    molecules = [chem.MolFromSmiles(smiles) for smiles in df['SMILES']]

    # get all functions of modules
    all_lipinski = inspect.getmembers(l, inspect.isfunction)
    all_fragments = inspect.getmembers(f, inspect.isfunction)

    # bad features have the same value for all our compounds
    # (`DataFrame.items` replaces `iteritems`, which pandas 2.0 removed).
    bad_features = {
        column_name
        for column_name, column_data in df.items()
        if len(set(column_data.values)) == 1
    }

    def add_descriptor_columns(named_functions):
        """Add one column per public, non-excluded descriptor function."""
        for name, func in named_functions:
            if name.startswith('_') or name in bad_features:
                continue
            feature_df[name] = [func(molecule) for molecule in molecules]

    add_descriptor_columns(all_fragments)
    print('fragments over')

    # The original filter here looked the name up in `all_fragments` (by the
    # Lipinski loop index), which tested the wrong function name and could
    # raise IndexError when the Lipinski module has more functions.
    add_descriptor_columns(all_lipinski)
    print('lipinski over')

    feature_df["fr_Al_COO"] = [f.fr_Al_COO(mol) for mol in molecules]

    # Built from a fresh list: the original kept appending to the fr_Al_COO
    # list, giving a column twice as long as the frame.
    feature_df["HeavyAtomCount"] = [l.HeavyAtomCount(mol) for mol in molecules]

    # add getnumatoms as feature
    feature_df["GetNumAtoms"] = [mol.GetNumAtoms() for mol in molecules]

    # add CalcExactMolWt as feature
    feature_df["CalcExactMolWt"] = [
        molDesc.CalcExactMolWt(mol) for mol in molecules
    ]

    return feature_df
Esempio n. 26
0
  53.49...

"""

# Heavy-atom variant of MolWt: forwards the molecule to MolWt with True as the
# second positional argument.
HeavyAtomMolWt = lambda x: MolWt(x, True)
# The documentation (with its doctest examples) and a version tag are attached
# to the lambda as attributes.
HeavyAtomMolWt.__doc__ = """The average molecular weight of the molecule ignoring hydrogens

  >>> HeavyAtomMolWt(Chem.MolFromSmiles('CC'))
  24.02...
  >>> HeavyAtomMolWt(Chem.MolFromSmiles('[NH4+].[Cl-]'))
  49.46

"""
HeavyAtomMolWt.version = "1.0.0"

# Thin pass-through to _rdMolDescriptors.CalcExactMolWt: all positional and
# keyword arguments are forwarded unchanged.
ExactMolWt = lambda *x, **y: _rdMolDescriptors.CalcExactMolWt(*x, **y)
# The version tag is taken from the wrapped implementation's module.
ExactMolWt.version = _rdMolDescriptors._CalcExactMolWt_version
ExactMolWt.__doc__ = """The exact molecular weight of the molecule

  >>> ExactMolWt(Chem.MolFromSmiles('CC'))
  30.04...
  >>> ExactMolWt(Chem.MolFromSmiles('[13CH3]C'))
  31.05...

"""


def NumValenceElectrons(mol):
    """ The number of valence electrons the molecule has

  >>> NumValenceElectrons(Chem.MolFromSmiles('CC'))
def create_features(data, types="train"):
    """Build the feature blocks used for modelling from a SMILES table.

    Args:
        data: DataFrame with a ``SMILES`` column and, when
            ``types == "train"``, an ``ACTIVE`` column.
        types: ``"train"`` (labels are read from ``ACTIVE``) or ``"test"``
            (no labels).

    Returns:
        dict with the keys
        ``"Morgan"``      DataFrame of 512-bit Morgan fingerprints (radius 2),
        ``"Despcritor"``  DataFrame with NumAtoms, ExactMolWt, fr_Al_COO,
                          HsNumAtoms and the double/single/aromatic/triple
                          bond counts,
        ``"molvec"``      DataFrame of mol2vec embeddings,
        ``"y"``           integer label array, or ``None`` for test data.

    Raises:
        ValueError: if ``types`` is neither ``"train"`` nor ``"test"``
            (previously this surfaced as an unbound ``y`` at the very end).
    """
    if types == "train":
        y = np.array(data['ACTIVE'].astype(int))
    elif types == "test":
        y = None
    else:
        raise ValueError(
            'types must be "train" or "test", got %r' % (types,))

    # Work on a copy so that the new columns are not written into a slice of
    # the caller's frame.
    data = data[["SMILES"]].copy()
    data["SMILES_str"] = data["SMILES"]
    data["SMILES"] = data["SMILES"].apply(lambda x: Chem.MolFromSmiles(x))
    data["NumAtoms"] = data["SMILES"].apply(
        lambda x: x.GetNumAtoms())  #l.HeavyAtomCount(m)
    data["ExactMolWt"] = data["SMILES"].apply(lambda x: d.CalcExactMolWt(x))
    data["fr_Al_COO"] = data["SMILES"].apply(lambda x: f.fr_Al_COO(x))
    # atom count with the hydrogens explicitly present
    data["HsNumAtoms"] = data["SMILES"].apply(
        lambda x: Chem.AddHs(x).GetNumAtoms())

    # One "document" per molecule: its bond types joined by spaces.
    BondType = [
        " ".join(str(bond.GetBondType()) for bond in m.GetBonds())
        for m in data["SMILES"]
    ]

    vec = CountVectorizer().fit(BondType)
    vocabulary = vec.vocabulary_
    # `vocabulary_` maps each term to its column index, but iterating the dict
    # gives the terms in insertion order. Order the labels by column index so
    # that every count column gets the right name.
    bond_columns = sorted(vocabulary, key=vocabulary.get)
    # Convert the sparse counts to a plain dense array; reuse the index of
    # `data` so that the column-wise concat below lines the rows up.
    train_tfidf = pd.DataFrame(vec.transform(BondType).toarray(),
                               columns=bond_columns,
                               index=data.index)

    data = pd.concat([data, train_tfidf], axis=1)
    #data.columns
    #['SMILES', 'ACTIVE', 'SMILES_str', 'NumAtoms', 'ExactMolWt', 'fr_Al_COO','HsNumAtoms', 'double', 'single', 'aromatic', 'triple']
    # A bond type that does not occur in this data set has no count column;
    # reindex fills it with zeros instead of failing with KeyError.
    traindata = data.reindex(columns=[
        'NumAtoms', 'ExactMolWt', 'fr_Al_COO', 'HsNumAtoms', 'double',
        'single', 'aromatic', 'triple'
    ],
                             fill_value=0)

    finger = [
        np.array(AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=512))
        for x in data["SMILES"]
    ]
    finger = pd.DataFrame(finger)
    finger.columns = ["morgan_" + str(x) for x in finger.columns]

    model = word2vec.Word2Vec.load('models/model_300dim.pkl')
    data['sentence'] = data.apply(
        lambda x: MolSentence(mol2alt_sentence(x['SMILES'], 1)), axis=1)
    m2v = [
        DfVec(x) for x in sentences2vec(data['sentence'], model, unseen='UNK')
    ]
    m2v = np.array([x.vec for x in m2v])
    m2v = pd.DataFrame(m2v)
    m2v.columns = ["m2v_" + str(x) for x in m2v.columns]

    # NOTE: the key "Despcritor" is misspelled but kept because callers look
    # it up by this exact name.
    datadict = {
        "Morgan": finger,
        "Despcritor": traindata,
        "molvec": m2v,
        'y': y
    }

    return datadict