Example #1
0
# --- Setup: imports, MHFP encoder, and Spark session ---
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType
import pyspark.sql.functions as F
from db_config import url, properties
from mhfp.encoder import MHFPEncoder

# System prerequisite for building mhfp/rdkit; run this in a shell, NOT in
# Python (leaving it as a bare statement was a syntax error):
#   sudo apt-get install libboost-all-dev
mhfp_encoder = MHFPEncoder()
spark = SparkSession.builder.getOrCreate()

# Define several UDFs via the pyspark.sql.functions module
def filename(path):
    """Return *path* unchanged; pass-through used as the source-file UDF."""
    result = path
    return result
def _count_carbons(x):
    # NOTE(review): lowercasing counts both aromatic 'c' and aliphatic 'C',
    # but also matches the 'c' in two-letter symbols such as 'Cl' — confirm
    # this is the intended carbon count for these SMILES strings.
    return str(x).lower().count('c')

def _mhfp_from_smiles(x):
    # MHFP fingerprint of a SMILES string, stringified by the StringType UDF.
    return mhfp_encoder.encode(x, radius=3, rings=True, kekulize=True, sanitize=True)

countCarbons = F.udf(_count_carbons, IntegerType())
sourceFile = F.udf(filename, StringType())
mhfp_smiles = F.udf(_mhfp_from_smiles, StringType())
# Load the raw tab-separated ZINC file, attach the MHFP fingerprint column,
# and de-duplicate on the SMILES string before inspecting the result.
reader = spark.read.format('csv').option('delimiter', '\t').option('header', 'false')
df = reader.load('s3a://zincdata/zinc/AA/AAAA.txt')
# NOTE(review): header is 'false', so Spark auto-names columns _c0, _c1, ...;
# referencing a 'smiles' column only works if a schema is applied elsewhere — confirm.
df = df.withColumn('mhfp', mhfp_smiles('smiles')).dropDuplicates(['smiles'])
df.show()
# Write the DataFrame to the database via JDBC (currently disabled)
#df.write.jdbc(url='jdbc:%s' % url, table="zincmap", mode='append', properties=properties)






Example #2
0
# Build (or load) MHFP fingerprints for the first ~2000 DrugBank molecules,
# caching the list in 'fps.dat' so later runs skip the expensive encoding.
if not os.path.isfile('fps.dat'):
    # Initialize the accumulator explicitly — the original appended to `fps`
    # without a visible definition, risking a NameError on a cold cache.
    fps = []
    with open('drugbank.smi', 'r') as f:
        for i, line in enumerate(f):
            fields = line.split()
            if not fields:
                # Skip blank lines instead of raising IndexError on [0].
                continue
            mol = AllChem.MolFromSmiles(fields[0].strip())
            if mol:
                fps.append(enc.encode_mol(mol))
            if i >= 2000:  # same cutoff as the original post-increment check
                break
    # Context managers close the cache file deterministically (the original
    # passed bare open() handles to pickle and leaked them).
    with open('fps.dat', 'wb') as out:
        pickle.dump(fps, out)
else:
    with open('fps.dat', 'rb') as cached:
        fps = pickle.load(cached)

m = enc.encode(
    "N=C(N)NCCC[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)Cc1ccccc1)C(=O)N1CCC[C@H]1C(=O)NCC(=O)NCC(=O)NCC(=O)NCC(=O)N[C@@H](CC(=O)N)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H]([C@@H](C)CC)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CC(C)C)C(=O)O"
)
n = enc.encode(
    "O=C(N1[C@@H](CCC1)C(=O)NNC(=O)N)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H]1NC(=O)CC1)Cc1[nH]cnc1)Cc1c2c([nH]c1)cccc2)CO)Cc1ccc(O)cc1)COC(C)(C)C)CC(C)C)CCCN=C(N)N"
)
q = enc.encode(
    "[C@@H](C(=O)NCC(=O)N[C@@H](C(=O)N[C@@H](C)C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)NCCO)Cc1c2c(cccc2)[nH]c1)CC(C)C)Cc1c[nH]c2c1cccc2)CC(C)C)Cc1c[nH]c2c1cccc2)CC(C)C)Cc1c[nH]c2c1cccc2)C(C)C)C(C)C)C(C)C)CC(C)C)(C(C)C)NC=O"
)
r = enc.encode('CNCNCNCNC')

# Populate an LSH Forest with every cached fingerprint, then build the index.
# NOTE(review): the meaning of (512, 64) is not shown here — presumably the
# number of hash dimensions and trees; confirm against the LSHForest API docs.
lf_classic = LSHForest(512, 64)
for i, e in enumerate(fps):
    lf_classic.add(i, e)  # key = position in fps, value = fingerprint
lf_classic.index()

# Start timing the query phase (the timed section continues past this excerpt).
start = timer()