import numpy as np


def from_csv():
    # Print the token list for the SMILES string in the first column of data.csv.
    with open('data.csv') as data:  # 'with' closes the file (the original leaked the handle)
        next(data, None)  # skip the header row
        for line in data:
            smiles = line.split(',')[0]
            print(list(tokenize_smiles(smiles)))
    print('end')

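# tokenize_smiles is called throughout this module but defined elsewhere; the
# version below is only an illustrative stand-in, assuming a simple regex-based
# SMILES tokenizer that keeps bracket atoms, two-letter elements, stereo marks,
# and two-digit ring closures together as single tokens.
import re

_SMILES_TOKEN_RE = re.compile(r"\[[^\]]+\]|Br|Cl|Si|Se|@@|%\d{2}|.")


def tokenize_smiles(smiles):
    # Yields tokens one at a time, e.g. 'CC(=O)O' -> 'C', 'C', '(', '=', 'O', ')', 'O'
    for match in _SMILES_TOKEN_RE.finditer(smiles):
        yield match.group(0)
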
def get_tokens_dict():
    # Build a token -> integer-id vocabulary over every SMILES string in the file.
    with open("canonized_smiles.txt", 'r') as f:
        smiles = [line.rstrip('\n') for line in f]
    st_dict = {}
    index = 2  # start at 2, leaving ids 0 and 1 unassigned (e.g. for padding/special tokens)
    for smile in smiles:
        for token in tokenize_smiles(smile):
            if token not in st_dict:
                st_dict[token] = index
                index += 1
    return st_dict

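# For example, if "canonized_smiles.txt" contained the single line 'CC(=O)O',
# get_tokens_dict() would return {'C': 2, '(': 3, '=': 4, 'O': 5, ')': 6};
# ids 0 and 1 are never assigned, presumably reserved for special tokens.
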
def get_smiles_as_vectors():
    # Encode every SMILES string in the file as a list of integer token ids.
    with open("canonized_smiles.txt", 'r') as f:
        smiles = [line.rstrip('\n') for line in f]
    tokens_dict = get_tokens_dict()
    return [[tokens_dict[token] for token in tokenize_smiles(smile)]
            for smile in smiles]

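# The vectors above are ragged (one length per SMILES string). A common next
# step, sketched here as an assumption rather than part of the original module,
# is right-padding with the reserved id 0 so they form a rectangular array:
def pad_vectors(vectors, pad_value=0):
    # Right-pad every vector with pad_value to the length of the longest one.
    max_len = max(len(v) for v in vectors)
    return [v + [pad_value] * (max_len - len(v)) for v in vectors]
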
def get_smiles_numerical_vectors_dict():
    # Map each SMILES string in the file to its list of integer token ids.
    with open("canonized_smiles.txt", 'r') as f:
        smiles = [line.rstrip('\n') for line in f]
    tokens_dict = get_tokens_dict()
    return {smile: [tokens_dict[token] for token in tokenize_smiles(smile)]
            for smile in smiles}

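# Continuing the single-line-file example above:
# get_smiles_numerical_vectors_dict()['CC(=O)O'] -> [2, 2, 3, 4, 5, 6, 5]
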
def from_npy():
    # .npy files are binary NumPy arrays, so np.load is needed here
    # (np.loadtxt only parses plain-text files and would fail on this format).
    smiles = np.load("canonized_data.npy")
    for smile in smiles:
        print(list(tokenize_smiles(str(smile))))
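
# Minimal driver, assuming the data files above sit next to this script;
# swap in from_npy() or the vector builders depending on which input you have.
if __name__ == '__main__':
    from_csv()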