Ejemplo n.º 1
0
def has_carboxylate(SMILES):
    """Tell whether RDKit's ``fr_COO`` fragment counter matches a SMILES.

    The SMILES string is parsed with ``Chem.MolFromSmiles`` and the
    resulting molecule is handed to ``Fragments.fr_COO``.

    Returns True when that count is greater than zero, False otherwise.
    """
    molecule = Chem.MolFromSmiles(SMILES)
    return Fragments.fr_COO(molecule) > 0
Ejemplo n.º 2
0
def recursive_dance(mols, check=None, thres=0.4):
    """Collect the SMILES reachable by repeatedly applying ``dance_atom``.

    Every molecule in ``mols`` is expanded with ``dance_atom``.  A product
    is stored in ``check`` (as the string from ``Chem.MolToSmiles``) when
    the parent molecule's ratio of ``Fragments.fr_Ar_N`` to its number of
    aromatic atoms is below ``thres``.  As long as a pass adds new SMILES,
    the function recurses on molecules rebuilt from everything in ``check``.

    Args:
        mols: iterable of RDKit molecules to expand.
        check: set of SMILES collected so far; it is mutated in place.  A
            fresh set is created when omitted, so separate top-level calls
            no longer share state through a mutable default argument.
        thres: upper bound (exclusive) on the aromatic-nitrogen ratio.

    Returns:
        A list of molecules built from every SMILES in ``check``.

    Raises:
        ZeroDivisionError: if a molecule that yields products has no
            aromatic atoms (behaviour kept from the original code).
    """
    if check is None:
        check = set()
    before_n = len(check)
    print(before_n)
    for mol in mols:
        danced_mols = dance_atom(mol)
        for mol_conv in danced_mols:
            # NOTE(review): the ratio is computed on the parent `mol`, not on
            # `mol_conv` -- confirm that this is the intended filter.
            aro_n = Fragments.fr_Ar_N(mol)
            aro_a = len(mol.GetAromaticAtoms())
            ratio = float(aro_n) / float(aro_a)
            if ratio < thres:
                check.add(Chem.MolToSmiles(mol_conv))
    after_n = len(check)
    # Formatted explicitly so the output is "<before> <after>" on both
    # Python 2 and Python 3.
    print("%d %d" % (before_n, after_n))
    if before_n < after_n:
        next_mols = [Chem.MolFromSmiles(smi) for smi in check]
        # Forward `thres`; the original silently fell back to the default
        # threshold on every recursive pass.
        recursive_dance(next_mols, check=check, thres=thres)
    return [Chem.MolFromSmiles(smi) for smi in check]
Ejemplo n.º 3
0
def recursive_dance(mols, check=None, thres=0.4):
    """Collect the SMILES reachable by repeatedly applying ``dance_atom``.

    Every molecule in ``mols`` is expanded with ``dance_atom``.  A product
    is stored in ``check`` (as the string from ``Chem.MolToSmiles``) when
    the parent molecule's ratio of ``Fragments.fr_Ar_N`` to its number of
    aromatic atoms is below ``thres``.  As long as a pass adds new SMILES,
    the function recurses on molecules rebuilt from everything in ``check``.

    Args:
        mols: iterable of RDKit molecules to expand.
        check: set of SMILES collected so far; it is mutated in place.  A
            fresh set is created when omitted, so separate top-level calls
            no longer share state through a mutable default argument.
        thres: upper bound (exclusive) on the aromatic-nitrogen ratio.

    Returns:
        A list of molecules built from every SMILES in ``check``.

    Raises:
        ZeroDivisionError: if a molecule that yields products has no
            aromatic atoms (behaviour kept from the original code).
    """
    if check is None:
        check = set()
    before_n = len(check)
    print(before_n)
    for mol in mols:
        danced_mols = dance_atom(mol)
        for mol_conv in danced_mols:
            # NOTE(review): the ratio is computed on the parent `mol`, not on
            # `mol_conv` -- confirm that this is the intended filter.
            aro_n = Fragments.fr_Ar_N(mol)
            aro_a = len(mol.GetAromaticAtoms())
            ratio = float(aro_n) / float(aro_a)
            if ratio < thres:
                check.add(Chem.MolToSmiles(mol_conv))
    after_n = len(check)
    # Formatted explicitly so the output is "<before> <after>" on both
    # Python 2 and Python 3.
    print("%d %d" % (before_n, after_n))
    if before_n < after_n:
        next_mols = [Chem.MolFromSmiles(smi) for smi in check]
        # Forward `thres`; the original silently fell back to the default
        # threshold on every recursive pass.
        recursive_dance(next_mols, check=check, thres=thres)
    return [Chem.MolFromSmiles(smi) for smi in check]
Ejemplo n.º 4
0
def calc_rdkit(mol):
    """Evaluate a fixed battery of RDKit descriptors on one molecule.

    The descriptors are listed below as (module, function names) groups and
    are evaluated in exactly that order.  The values are packed into a
    NumPy array and wrapped in a ``pandas.Series`` with the default integer
    index, so a value is identified only by its position in the table.

    Args:
        mol: the molecule object handed to every descriptor function.

    Returns:
        pandas.Series holding one value per descriptor.
    """
    descriptor_table = (
        (Crippen, ("MolLogP", "MolMR")),
        (Descriptors, (
            "FpDensityMorgan1", "FpDensityMorgan2", "FpDensityMorgan3",
            "FractionCSP3", "HeavyAtomMolWt",
            "MaxAbsPartialCharge", "MaxPartialCharge",
            "MinAbsPartialCharge", "MinPartialCharge",
            "MolWt", "NumRadicalElectrons", "NumValenceElectrons",
        )),
        (EState.EState, (
            "MaxAbsEStateIndex", "MaxEStateIndex",
            "MinAbsEStateIndex", "MinEStateIndex",
        )),
        (EState.EState_VSA, (
            "EState_VSA1", "EState_VSA10", "EState_VSA11",
            "EState_VSA2", "EState_VSA3", "EState_VSA4", "EState_VSA5",
            "EState_VSA6", "EState_VSA7", "EState_VSA8", "EState_VSA9",
        )),
        (Fragments, (
            "fr_Al_COO", "fr_Al_OH", "fr_Al_OH_noTert", "fr_aldehyde",
            "fr_alkyl_carbamate", "fr_alkyl_halide", "fr_allylic_oxid",
            "fr_amide", "fr_amidine", "fr_aniline",
            "fr_Ar_COO", "fr_Ar_N", "fr_Ar_NH", "fr_Ar_OH", "fr_ArN",
            "fr_aryl_methyl", "fr_azide", "fr_azo", "fr_barbitur",
            "fr_benzene", "fr_benzodiazepine", "fr_bicyclic",
            "fr_C_O", "fr_C_O_noCOO", "fr_C_S", "fr_COO", "fr_COO2",
            "fr_diazo", "fr_dihydropyridine", "fr_epoxide", "fr_ester",
            "fr_ether", "fr_furan", "fr_guanido", "fr_halogen",
            "fr_hdrzine", "fr_hdrzone", "fr_HOCCN",
            "fr_imidazole", "fr_imide", "fr_Imine",
            "fr_isocyan", "fr_isothiocyan",
            "fr_ketone", "fr_ketone_Topliss", "fr_lactam", "fr_lactone",
            "fr_methoxy", "fr_morpholine", "fr_N_O",
            "fr_Ndealkylation1", "fr_Ndealkylation2",
            "fr_NH0", "fr_NH1", "fr_NH2", "fr_Nhpyrrole",
            "fr_nitrile", "fr_nitro", "fr_nitro_arom",
            "fr_nitro_arom_nonortho", "fr_nitroso",
            "fr_oxazole", "fr_oxime", "fr_para_hydroxylation",
            "fr_phenol", "fr_phenol_noOrthoHbond",
            "fr_phos_acid", "fr_phos_ester",
            "fr_piperdine", "fr_piperzine",
            "fr_priamide", "fr_prisulfonamd", "fr_pyridine", "fr_quatN",
            "fr_SH", "fr_sulfide", "fr_sulfonamd", "fr_sulfone",
            "fr_term_acetylene", "fr_tetrazole", "fr_thiazole",
            "fr_thiocyan", "fr_thiophene", "fr_unbrch_alkane", "fr_urea",
        )),
        (GraphDescriptors, (
            "BalabanJ", "BertzCT",
            "Chi0", "Chi0n", "Chi0v",
            "Chi1", "Chi1n", "Chi1v",
            "Chi2n", "Chi2v", "Chi3n", "Chi3v", "Chi4n", "Chi4v",
            "HallKierAlpha", "Ipc", "Kappa1", "Kappa2", "Kappa3",
        )),
        (Lipinski, (
            "HeavyAtomCount", "NHOHCount", "NOCount",
            "NumAliphaticCarbocycles", "NumAliphaticHeterocycles",
            "NumAliphaticRings",
            "NumAromaticCarbocycles", "NumAromaticHeterocycles",
            "NumAromaticRings",
            "NumHAcceptors", "NumHDonors", "NumHeteroatoms",
            "NumRotatableBonds",
            "NumSaturatedCarbocycles", "NumSaturatedHeterocycles",
            "NumSaturatedRings", "RingCount",
        )),
        (MolSurf, (
            "LabuteASA",
            "PEOE_VSA1", "PEOE_VSA10", "PEOE_VSA11", "PEOE_VSA12",
            "PEOE_VSA13", "PEOE_VSA14",
            "PEOE_VSA2", "PEOE_VSA3", "PEOE_VSA4", "PEOE_VSA5",
            "PEOE_VSA6", "PEOE_VSA7", "PEOE_VSA8", "PEOE_VSA9",
            "SlogP_VSA1", "SlogP_VSA10", "SlogP_VSA11", "SlogP_VSA12",
            "SlogP_VSA2", "SlogP_VSA3", "SlogP_VSA4", "SlogP_VSA5",
            "SlogP_VSA6", "SlogP_VSA7", "SlogP_VSA8", "SlogP_VSA9",
            "SMR_VSA1", "SMR_VSA10",
            "SMR_VSA2", "SMR_VSA3", "SMR_VSA4", "SMR_VSA5",
            "SMR_VSA6", "SMR_VSA7", "SMR_VSA8", "SMR_VSA9",
            "TPSA",
        )),
    )

    values = []
    for module, names in descriptor_table:
        for name in names:
            values.append(getattr(module, name)(mol))
    return pd.Series(np.array(values))
Ejemplo n.º 5
0
def COOH(mol):
    """Return the result of ``Fragments.fr_Ar_COO`` for ``mol``.

    NOTE(review): only the aromatic counter is used here; aliphatic acids
    (``fr_Al_COO``) are not included -- confirm this is intended.
    """
    aromatic_acid_count = Fragments.fr_Ar_COO(mol)
    return aromatic_acid_count
Ejemplo n.º 6
0
def NO2(mol):
    """Return the result of ``Fragments.fr_nitro`` for ``mol``."""
    nitro_count = Fragments.fr_nitro(mol)
    return nitro_count
Ejemplo n.º 7
0
def SR(mol):
    """Return the result of ``Fragments.fr_sulfide`` for ``mol``."""
    sulfide_count = Fragments.fr_sulfide(mol)
    return sulfide_count
Ejemplo n.º 8
0
def SH(mol):
    """Return the result of ``Fragments.fr_SH`` for ``mol``."""
    thiol_count = Fragments.fr_SH(mol)
    return thiol_count
Ejemplo n.º 9
0
def OH(mol):
    """Return ``Fragments.fr_Ar_OH(mol)`` plus ``Fragments.fr_Al_OH(mol)``."""
    return Fragments.fr_Ar_OH(mol) + Fragments.fr_Al_OH(mol)
Ejemplo n.º 10
0
def NH2(mol):
    """Return the result of ``Fragments.fr_NH2`` for ``mol``."""
    primary_amine_count = Fragments.fr_NH2(mol)
    return primary_amine_count
def _write_feature_file(in_path, out_path, header, passthrough):
    """Derive RDKit features for every row of one CSV file.

    The first line of ``in_path`` is skipped.  Every following line is
    stripped of surrounding newline/space characters and split on commas.
    Field 10 is parsed as a SMILES string, and the output row is made of
    the first eight input fields, the computed features (phosphorus
    fragment count, aromatic carbocycle count, MolLogP, PEOE_VSA1, Morgan
    fingerprint bit string with radius 2) and the input fields named by
    ``passthrough``.

    Args:
        in_path: path of the comma-separated input file.
        out_path: path of the CSV file to create or overwrite.
        header: list of column names written as the first output row.
        passthrough: indices of input fields copied verbatim to the end of
            each output row.
    """
    # Both files are managed by `with` so they are closed even when a row
    # fails to parse; the original left the input handle open on error.
    with open(in_path, 'r') as infile:
        infile.readline()  # discard the header line of the input file
        with open(out_path, 'w') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(header)
            for raw_line in infile:
                fields = raw_line.strip('\n\r ').split(",")
                mol = Chem.MolFromSmiles(fields[10].strip())
                fingerprint_bit_string = GetMorganFingerprintAsBitVect(
                    mol, 2).ToBitString()
                fr_phos = (Fragments.fr_phos_acid(mol) +
                           Fragments.fr_phos_ester(mol))
                aromatic_cc = Lipinski.NumAromaticCarbocycles(mol)
                molLogP = Crippen.MolLogP(mol)
                peoe_vsa1 = MolSurf.PEOE_VSA1(mol)
                writer.writerow(
                    fields[:8] +
                    [fr_phos, aromatic_cc, molLogP, peoe_vsa1,
                     fingerprint_bit_string] +
                    [fields[i] for i in passthrough])


def main():
    """Build the train and test feature CSV files.

    ``molecule_training.csv`` is converted to
    ``train_molecule_new_features.csv`` (input fields 10 and 11 are copied
    through as 'smiles' and 'target'), then ``molecule_TestFeatures.csv``
    is converted to ``test_molecule_new_features.csv`` (only field 10 is
    copied through as 'smiles').
    """
    feature_header = [
        'index', 'Maximum Degree', 'Minimum Degree', 'Molecular Weight',
        'Number of H-Bond Donors', 'Number of Rings',
        'Number of Rotatable Bonds', 'Polar Surface Area', 'fr_phos',
        'aromatic_carbocycles', 'MolLogP', 'PEOE_VSA1', 'Fingerprint',
    ]

    _write_feature_file(
        "molecule_training.csv",
        'train_molecule_new_features.csv',
        feature_header + ['smiles', 'target'],
        (10, 11))

    _write_feature_file(
        "molecule_TestFeatures.csv",
        'test_molecule_new_features.csv',
        feature_header + ['smiles'],
        (10,))
Ejemplo n.º 12
0
def makeFeatures(fileName):
    """Write per-molecule feature records for FKBP12 binders to a text file.

    Molecules are read from ``FKBP12_binders.sdf``.  For the molecule at
    position ``i`` the Gaussian log ``gaussian_files/drug_<i>.log`` is
    parsed with ``gaussian.parseGaussianLog``.  A molecule is counted when
    the log reports ``converged == "True"`` and its
    "BindingDB Target Chain Sequence" property equals the expected
    sequence.  Among counted molecules, only the first one for each
    "BindingDB Ligand Name" produces a record.

    A record is one value per line (volume, Labute ASA, atom/bond/heavy
    atom counts, the multipole and solvation values from the log, then the
    fragment counts), followed by a blank line, a ``KI: <value>`` line and
    a blank separator line.

    Args:
        fileName: path of the features file to create or overwrite.

    Side effects:
        Rebinds the module-level ``featuresFile`` to the output file object
        (closed on return) and prints two summary lines.
    """
    from rdkit import Chem
    from rdkit.Chem import Fragments
    from rdkit.Chem import AllChem
    from rdkit.Chem import MolSurf

    global featuresFile, numFeatures
    featuresFile = open(fileName, 'w')      # Molecule features output file

    # try/finally guarantees the output file is closed even when parsing a
    # log or reading a molecule property raises.
    try:
        # open database file
        drugDB = Chem.SDMolSupplier("FKBP12_binders.sdf")

        if debug:
            print("\n\tNo features data file found. Writing new features data file.\n")

        # load fragment descriptor
        # NOTE(review): private RDKit API with an installation-specific
        # absolute path, kept as in the original -- confirm it is needed.
        Fragments._LoadPatterns(fileName='/usr/local/anaconda/pkgs/rdkit-2015.03.1-np19py27_0/share/RDKit/Data/FragmentDescriptors.csv')

        target_sequence = "MGVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPFKFMLGKQEVIRGWEEGVAQMSVGQRAKLTISPDYAYGATGHPGIIPPHATLVFDVELLKLE"

        # Fragment counters in output order; looked up after the
        # _LoadPatterns call above, as the original did.
        fragment_counters = (
            Fragments.fr_Al_OH,       # aliphatic alcohols
            Fragments.fr_Ar_OH,       # aromatic alcohols
            Fragments.fr_ketone,      # number of ketones
            Fragments.fr_ether,       # number of ether oxygens
            Fragments.fr_ester,       # number of esters
            Fragments.fr_aldehyde,    # number of aldehydes
            Fragments.fr_COO,         # number of carboxylic acids
            Fragments.fr_benzene,     # number of benzenes
            Fragments.fr_Ar_N,        # number of aromatic nitrogens
            Fragments.fr_NH0,         # number of tertiary amines
            Fragments.fr_NH1,         # number of secondary amines
            Fragments.fr_NH2,         # number of primary amines
            Fragments.fr_amide,       # number of amides
            Fragments.fr_SH,          # number of thiol groups
            Fragments.fr_nitro,       # number of nitro groups
            Fragments.fr_furan,       # number of furan rings
            Fragments.fr_imidazole,   # number of imidazole rings
            Fragments.fr_oxazole,     # number of oxazole rings
            Fragments.fr_morpholine,  # number of morpholine rings
            Fragments.fr_halogen,     # number of halogens
        )

        seen_names = set()    # ligand names that already produced a record
        convergedCount = 0    # converged molecules with the right sequence

        for molCount, mol in enumerate(drugDB):
            gaussian_log_file = "gaussian_files/drug_" + str(molCount) + ".log"
            (converged, dipole, quadrupole, octapole, hexadecapole,
             dg_solv) = gaussian.parseGaussianLog(gaussian_log_file)

            # parseGaussianLog reports convergence as the string "True".
            if converged != "True":
                continue
            if mol.GetProp("BindingDB Target Chain Sequence") != target_sequence:
                continue
            convergedCount += 1

            ligand_name = mol.GetProp("BindingDB Ligand Name")
            if ligand_name in seen_names:
                continue    # duplicate ligand: counted but not written again
            seen_names.add(ligand_name)

            values = [
                AllChem.ComputeMolVolume(mol),
                MolSurf.pyLabuteASA(mol),
                mol.GetNumAtoms(),
                mol.GetNumBonds(),
                mol.GetNumHeavyAtoms(),
                dipole,
                quadrupole,
                octapole,
                hexadecapole,
                dg_solv,
            ]
            values.extend(counter(mol) for counter in fragment_counters)

            record = "".join("{}\n".format(value) for value in values)
            record += "\nKI: {}\n".format(mol.GetProp("Ki (nM)"))
            record += "\n"        # Use a blank line to divide molecule data
            featuresFile.write(record)

        # Formatted so the output matches the original Python 2 print
        # statements byte for byte.
        print("Number of molecules with converged gaussian log files and correct sequence: {} \n".format(convergedCount))
        print("Number of overlap drugs: {}".format(convergedCount - len(seen_names)))
    finally:
        featuresFile.close()
Ejemplo n.º 13
0
def smiles_to_all_labels(df):
    """Return a copy of ``df`` extended with RDKit feature columns.

    Every public function found by ``inspect`` in the fragments module
    (``f``) and in the Lipinski module (``l``) is applied to each molecule
    parsed from ``df['SMILES']``, and its results are stored in a column
    named after the function.  Functions whose name starts with ``_``, or
    whose name matches a column of ``df`` holding a single distinct value,
    are skipped.  Four columns are then always (re)computed: ``fr_Al_COO``,
    ``HeavyAtomCount``, ``GetNumAtoms`` and ``CalcExactMolWt``.

    Args:
        df: DataFrame with a ``SMILES`` column; it is not modified.

    Returns:
        A new DataFrame containing the original columns plus the features.
    """
    smilesList = df['SMILES']
    feature_df = df.copy()

    # get all functions of modules
    all_lipinski = inspect.getmembers(l, inspect.isfunction)
    all_fragments = inspect.getmembers(f, inspect.isfunction)

    # bad features have the same value for all our compounds
    # (`items()` replaces `iteritems()`, which pandas 2.0 removed)
    bad_features = set()
    for columnName, columnData in df.items():
        if len(set(columnData.values)) == 1:
            bad_features.add(columnName)

    # Parse each SMILES once instead of once per feature.
    molecules = [chem.MolFromSmiles(smiles) for smiles in smilesList]

    # Each function is filtered by its OWN name; the original Lipinski loop
    # tested the fragment name found at the same index.
    feature_groups = (
        (all_fragments, 'fragments over'),
        (all_lipinski, 'lipinski over'),
    )
    for members, done_message in feature_groups:
        for name, func in members:
            if name.startswith('_') or name in bad_features:
                continue
            feature_df[name] = [func(molecule) for molecule in molecules]
        print(done_message)

    # These columns are always present, even if filtered out above.  Each
    # one gets its own list; the original reused the fr_Al_COO list for
    # HeavyAtomCount, which produced a column of double length.
    feature_df["fr_Al_COO"] = [f.fr_Al_COO(molecule) for molecule in molecules]
    feature_df["HeavyAtomCount"] = [
        l.HeavyAtomCount(molecule) for molecule in molecules
    ]

    # add getnumatoms as feature
    feature_df["GetNumAtoms"] = [
        molecule.GetNumAtoms() for molecule in molecules
    ]

    # add CalcExactMolWt as feature
    feature_df["CalcExactMolWt"] = [
        molDesc.CalcExactMolWt(molecule) for molecule in molecules
    ]

    return feature_df
def create_features(data, types="train"):
    """Build Morgan, descriptor and mol2vec feature tables from SMILES.

    Args:
        data: DataFrame with a ``SMILES`` column and, when ``types`` is
            ``"train"``, an ``ACTIVE`` column convertible to int.
        types: ``"train"`` to also return labels, ``"test"`` for no labels.

    Returns:
        dict with the keys
        ``"Morgan"``     - DataFrame of 512-bit Morgan fingerprints
                           (radius 2), columns ``morgan_<i>``;
        ``"Despcritor"`` - DataFrame with NumAtoms, ExactMolWt, fr_Al_COO,
                           HsNumAtoms and the per-molecule counts of
                           double/single/aromatic/triple bonds (the
                           misspelled key is kept because callers read it);
        ``"molvec"``     - DataFrame of mol2vec vectors, columns ``m2v_<i>``;
        ``"y"``          - integer label array for "train", None for "test".

    Raises:
        ValueError: if ``types`` is neither ``"train"`` nor ``"test"``.
    """
    if types == "train":
        y = np.array(data['ACTIVE'].astype(int))
    elif types == "test":
        y = None
    else:
        # Fail fast; the original only failed at the very end with an
        # UnboundLocalError on `y`.
        raise ValueError(
            'types must be "train" or "test", got {!r}'.format(types))

    # Work on an independent, positionally indexed copy so that the column
    # assignments below do not write into a slice of the caller's frame and
    # every returned table lines up row by row.
    data = data[["SMILES"]].reset_index(drop=True)
    data["SMILES_str"] = data["SMILES"]
    data["SMILES"] = data["SMILES"].apply(lambda x: Chem.MolFromSmiles(x))
    data["NumAtoms"] = data["SMILES"].apply(lambda x: x.GetNumAtoms())
    data["ExactMolWt"] = data["SMILES"].apply(lambda x: d.CalcExactMolWt(x))
    data["fr_Al_COO"] = data["SMILES"].apply(lambda x: f.fr_Al_COO(x))
    # AddHs makes the hydrogens explicit before counting atoms
    data["HsNumAtoms"] = data["SMILES"].apply(
        lambda x: Chem.AddHs(x).GetNumAtoms())

    # One whitespace-separated "document" of bond-type names per molecule.
    bond_docs = [
        " ".join(str(bond.GetBondType()) for bond in mol.GetBonds())
        for mol in data["SMILES"]
    ]

    vec = CountVectorizer().fit(bond_docs)
    # convert the sparse count matrix to a plain dense array
    bond_matrix = vec.transform(bond_docs).toarray()
    vocabulary = vec.vocabulary_
    # vocabulary_ maps term -> column index; order the labels by that index
    # rather than by dict iteration order, which is not the column order.
    ordered_terms = sorted(vocabulary, key=vocabulary.get)
    bond_counts = pd.DataFrame(
        bond_matrix, columns=ordered_terms, index=data.index)
    # A bond type absent from the whole data set becomes a zero column
    # instead of a KeyError.
    bond_counts = bond_counts.reindex(
        columns=['double', 'single', 'aromatic', 'triple'], fill_value=0)

    traindata = pd.concat(
        [data[['NumAtoms', 'ExactMolWt', 'fr_Al_COO', 'HsNumAtoms']],
         bond_counts],
        axis=1)

    finger = [
        np.array(AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=512))
        for x in data["SMILES"]
    ]
    finger = pd.DataFrame(finger)
    finger.columns = ["morgan_" + str(x) for x in finger.columns]

    model = word2vec.Word2Vec.load('models/model_300dim.pkl')
    data['sentence'] = data.apply(
        lambda x: MolSentence(mol2alt_sentence(x['SMILES'], 1)), axis=1)
    m2v = [
        DfVec(x) for x in sentences2vec(data['sentence'], model, unseen='UNK')
    ]
    m2v = np.array([x.vec for x in m2v])
    m2v = pd.DataFrame(m2v)
    m2v.columns = ["m2v_" + str(x) for x in m2v.columns]

    datadict = {
        "Morgan": finger,
        "Despcritor": traindata,
        "molvec": m2v,
        'y': y
    }

    return datadict