Example #1
def from_csv():
    # Read SMILES strings from the first column of data.csv,
    # tokenize each one, and print the resulting token list.
    with open('data.csv') as data:
        next(data, None)  # skip the header row
        for line in data:
            smiles = line.split(',')[0]
            print(list(tokenize_smiles(smiles)))
    print('end')
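All five examples rely on a tokenize_smiles function that is defined elsewhere and not shown here. As a point of reference only, a minimal sketch of such a tokenizer, assuming it is a generator that yields one SMILES token at a time and keeps multi-character units (bracket atoms, two-letter elements such as Cl and Br, ring-closure labels like %10) together, could look like this; the real implementation may differ:

import re

# Hypothetical regex-based tokenizer; the tokenize_smiles used in
# these examples is assumed to behave similarly but is not shown.
SMILES_TOKEN_RE = re.compile(r"\[[^\]]+\]|Br|Cl|%\d{2}|.")

def tokenize_smiles(smiles):
    # Yield one token at a time so callers can wrap the result in list().
    for token in SMILES_TOKEN_RE.findall(smiles):
        yield token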
Example #2
def get_tokens_dict():
    # Build a token -> integer index mapping over every SMILES
    # string in the file.
    st_dict = {}
    with open("canonized_smiles.txt", 'r') as f:
        smiles = [line.rstrip('\n') for line in f]
    # Indices start at 2; 0 and 1 are never assigned, presumably
    # reserved for special purposes such as padding.
    index = 2
    for smile in smiles:
        for token in tokenize_smiles(smile):
            if token not in st_dict:
                st_dict[token] = index
                index += 1
    return st_dict
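A quick usage sketch: if canonized_smiles.txt contained only the two strings CCO and c1ccccc1, the returned dictionary would be {'C': 2, 'O': 3, 'c': 4, '1': 5} (the exact contents depend on the input file and on how tokenize_smiles splits it; note that indices 0 and 1 are never handed out):

tokens_dict = get_tokens_dict()
print(tokens_dict)  # e.g. {'C': 2, 'O': 3, 'c': 4, '1': 5} for a file
                    # containing only CCO and c1ccccc1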
Example #3
def get_smiles_as_vectors():
    # Encode every SMILES string in the file as a vector of
    # integer token indices, returned in file order.
    with open("canonized_smiles.txt", 'r') as f:
        smiles = [line.rstrip('\n') for line in f]
    tokens_dict = get_tokens_dict()
    vectors = []
    for smile in smiles:
        vector = [tokens_dict[token] for token in tokenize_smiles(smile)]
        vectors.append(vector)
    return vectors
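The vectors returned above have one entry per token, so their lengths differ from molecule to molecule. A common next step, not part of the original examples, is to right-pad them to a common length before stacking them into an array for a model; the sketch below assumes 0 as the padding index, which may be why get_tokens_dict never assigns 0:

import numpy as np

def pad_vectors(vectors, pad_value=0):
    # Hypothetical helper: right-pad each vector with pad_value to the
    # length of the longest one, then stack into a 2-D integer array.
    max_len = max(len(v) for v in vectors)
    return np.array([v + [pad_value] * (max_len - len(v)) for v in vectors])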
Example #4
def get_smiles_numerical_vectors_dict():
    # Like get_smiles_as_vectors, but keyed by the SMILES string
    # itself: maps each SMILES to its vector of token indices.
    with open("canonized_smiles.txt", 'r') as f:
        smiles = [line.rstrip('\n') for line in f]
    tokens_dict = get_tokens_dict()
    d = {}
    for smile in smiles:
        d[smile] = [tokens_dict[token] for token in tokenize_smiles(smile)]
    return d
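The dictionary variant makes it easy to look up the numerical encoding of a single molecule by its SMILES string. A usage sketch (CCO and the printed indices are purely illustrative; the actual values depend on the token dictionary):

vectors_dict = get_smiles_numerical_vectors_dict()
print(vectors_dict.get('CCO'))  # e.g. [2, 2, 3] if 'C' -> 2 and 'O' -> 3;
                                # None if CCO is not in the file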
Example #5
import numpy as np

def from_npy():
    # .npy is NumPy's binary format, so it must be read with np.load;
    # np.loadtxt expects a plain-text file and would fail here.
    smiles = np.load("canonized_data.npy")
    for smile in smiles:
        print(list(tokenize_smiles(smile)))
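This assumes canonized_data.npy was written with np.save. If so, it could have been produced from the text file used in the earlier examples like this (a sketch, not part of the original examples):

import numpy as np

with open("canonized_smiles.txt") as f:
    smiles = [line.rstrip('\n') for line in f]
np.save("canonized_data.npy", np.array(smiles))  # stored as a string array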