def draw_at_temp(temp, path_data, save_path):
    """
    Draw the most common scaffolds sampled at one temperature with rdkit.

    Every '.pkl' file of path_data whose name contains 'scaf' and whose
    temperature field equals temp is loaded with hp.load_obj. For each
    entry of the loaded dict, two grid pictures are written (the 5 and
    the 20 most common scaffolds), each labelled with the share of every
    scaffold and captioned with the epoch and the SSE value.

    Parameters:
    - temp: temperature to draw; compared as a float with the third
    '_'-separated field of the file name.
    - path_data (string): directory with the pickled scaffolds. It is
    concatenated as-is with the file name, so it has to end with a
    path separator.
    - save_path (string): prefix of the pictures, which are saved as
    '{save_path}{name_data}/{te}/{epoch}_top_{N}.png'.
    """
    suffix = '.pkl'
    # sizes of the two grids drawn for every scaffolds collection
    figure_top_common_combined = 5
    top_common_combined = 20
    to_plot = [figure_top_common_combined, top_common_combined]

    for filename in os.listdir(path_data):
        if not filename.endswith(suffix):
            continue
        # strip only the trailing extension, not every '.pkl' occurrence
        name = filename[:-len(suffix)]
        # filter on the name before parsing its fields, so unrelated
        # pickles with another naming scheme are skipped, not fatal
        if 'scaf' not in name:
            continue

        fields = name.split('_')
        epoch = fields[1]
        te = fields[2]
        if float(temp) != float(te):
            continue

        data_ = hp.load_obj(path_data + name)
        for name_data, data in data_.items():
            # some molecules are put as a list with the string
            # error; we remove them for drawing
            # note that they are very rare
            scaffolds = [x for x in data if type(x) is str]
            counter = collections.Counter(scaffolds)
            # number of scaffold occurrences; the same for both grids
            total = sum(counter.values())

            save_dir_common = f'{save_path}{name_data}/{te}/'
            os.makedirs(save_dir_common, exist_ok=True)

            for top_common in to_plot:
                common = counter.most_common(top_common)
                mols = [Chem.MolFromSmiles(x[0]) for x in common]
                # legend: share of each scaffold among all occurrences
                repet = [f'{100*x[1]/total:.2f}%' for x in common]

                # one common plot with all the selected scaffolds
                common_top = Draw.MolsToGridImage(mols,
                                                  molsPerRow=5,
                                                  subImgSize=(242, 242),
                                                  legends=repet)
                save_filename = f'{save_dir_common}{epoch}_top_{top_common}.png'
                common_top.save(save_filename)

                # add SSE, computed by sdi on the counts of the
                # scaffolds that were drawn
                sse = sdi(dict(common), scaled=True)
                img = Image.open(save_filename)

                # small counts are spelled out in the caption
                number_t_write = len(common)
                if number_t_write < 10:
                    p = inflect.engine()
                    number_t_write = p.number_to_words(
                        number_t_write).title()

                text = f'{number_t_write} most common scaffolds at epoch {epoch} (SSE = {sse:.02}):'
                # NOTE(review): presumably writes the caption on the
                # picture and saves it back to save_filename; confirm
                # against add_txt_on_img
                add_txt_on_img(img, text, save_filename)
def get_back_scaffolds(space, scaffolds_type):
    """
    Load the scaffolds saved for one of the configured data spaces.

    Parameters:
    - space (string): key read in both config['DATA'] and
    config['AUGMENTATION'], e.g. 'source_space' or 'target_space'.
    - scaffolds_type: key of the entry to return from the loaded
    scaffolds object.

    Returns the scaffolds_type entry of the object stored in
    'results/data/{name}/{min_len}_{max_len}_x{aug}/scaf.pkl', where
    name is the configured data file name without '.txt'.

    Raises ValueError if that file does not exist.

    Note: config, min_len and max_len are not parameters; they are
    read from the enclosing scope.
    """
    name = config['DATA'][space].replace('.txt', '')
    aug = int(config['AUGMENTATION'][space])
    # hp.load_obj is given the path without the '.pkl' extension
    path_to_scaf = f'results/data/{name}/{min_len}_{max_len}_x{aug}/scaf'
    scaf_file = f'{path_to_scaf}.pkl'

    if not os.path.isfile(scaf_file):
        # report the probed file so the missing data can be located
        raise ValueError(
            f'Scaffolds cannot be accessed at {scaf_file}. '
            'Are they missing for the source or target space?')

    return hp.load_obj(path_to_scaf)[scaffolds_type]
def draw_scaffolds(top_common, path):
    """
    Function to draw the most common scaffolds with rdkit.

    The scaffolds are loaded with hp.load_obj from f'{path}scaf'; one
    picture is saved per entry of the loaded dict, as
    f'{path}top_{top_common}_{name_data}.png'. Each scaffold is
    labelled with its count and its share of all occurrences.

    Parameters:
    - top_common (int): how many of the most common
    scaffolds to draw.
    - path (string): Path to save the scaffolds picture
    and to get the scaffolds data.
    """
    scaffolds_by_set = hp.load_obj(f'{path}scaf')
    mols_per_row = 5

    for name_data, entries in scaffolds_by_set.items():
        # Note that some molecules are put as a list
        # with a string error; we remove them for drawing
        # Note 2: they occur very rarely
        counter = collections.Counter(
            [entry for entry in entries if type(entry) is str])
        total = sum(counter.values())

        # most_common already caps the selection at top_common
        selected = counter.most_common(top_common)
        mols = [Chem.MolFromSmiles(smiles) for smiles, _ in selected]
        legends = [
            f'{count}({100*count/total:.2f}%)' for _, count in selected
        ]

        grid = Draw.MolsToGridImage(mols,
                                    molsPerRow=mols_per_row,
                                    subImgSize=(150, 150),
                                    legends=legends)
        grid.save(f'{path}top_{top_common}_{name_data}.png')
# canonical SMILES dir_split_data = f'results/data/{name_data}/{min_len}_{max_len}_x{augmentation}/' with open(f'{dir_split_data}data_tr.txt', 'r') as f: data_training = f.readlines() with open(f'{dir_split_data}data_val.txt', 'r') as f: data_validation = f.readlines() #################################### #################################### # Start iterating over the files t0 = time.time() for filename in os.listdir(path_gen): if filename.endswith('.pkl'): name = filename.replace('.pkl', '') data = hp.load_obj(path_gen + name) valids = [] n_valid = 0 for gen_smile in data: if len(gen_smile) != 0 and isinstance(gen_smile, str): gen_smile = gen_smile.replace(pad_char, '') gen_smile = gen_smile.replace(end_char, '') gen_smile = gen_smile.replace(start_char, '') mol = Chem.MolFromSmiles(gen_smile) if mol is not None: cans = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
#################################### # path to the saved novo data path_novo = f'results/{name_data}/novo_molecules/' # path to save the scores save_path = f'results/{name_data}/analysis/' os.makedirs(save_path, exist_ok=True) #################################### #################################### # start iterating over the files for filename in os.listdir(path_novo): if filename.endswith('.pkl'): name = filename.replace('.pkl', '') data = hp.load_obj(f'{path_novo}{name}') novo_tr = data['novo_tr'] # We do a double check because rdkit # throws weird errors sometimes data_rdkit = [] for i, x in enumerate(novo_tr): mol = Chem.MolFromSmiles(x) if mol is not None: data_rdkit.append(mol) save_name = name.split('_')[1] + '_' + name.split('_')[2] # descriptors desc_names = re.compile(FP.DESCRIPTORS['names']) functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
factor=factor, min_lr=min_lr) #################################### #################################### # Path to the data augmentation = int(config['AUGMENTATION'][mode]) dir_split_data = f'results/data/{name_data}/{min_len}_{max_len}_x{augmentation}/' if verbose: print(f'Data path : {dir_split_data}') # load partitions partition = {} path_partition_train = f'{dir_split_data}idx_tr' path_partition_valid = f'{dir_split_data}idx_val' partition['train'] = hp.load_obj(path_partition_train) partition['val'] = hp.load_obj(path_partition_valid) # get back the name of the training data from parameters path_data = f'{dir_split_data}{min_len}_{max_len}_x{augmentation}.txt' # finally, we infer the vocab size from the len # of the tokenization used vocab_size = len(indices_token) #################################### #################################### # Create the generators tr_generator = data_generator.DataGenerator(partition['train'], batch_size, max_len_model,
scaf_source = get_back_scaffolds('source_space', scaffolds_type) # load the target set (from which the # fine-tuning set comes from) scaffolds # if provided if config['DATA']['target_space']: scaf_target = get_back_scaffolds('target_space', scaffolds_type) #################################### #################################### # scaffolds of the fine-tuning # molecules if mode == 'fine_tuning': aug = int(config['AUGMENTATION']['fine_tuning']) path_to_scaf = f'results/data/{name_data}/{min_len}_{max_len}_x{aug}/scaf' scaf_ft = hp.load_obj(path_to_scaf)[scaffolds_type] #################################### #################################### # start iterating over the files t0 = time.time() for filename in os.listdir(path_scaf): if filename.endswith('.pkl') and 'scaf' in filename: name = filename.replace('.pkl', '') data = hp.load_obj(f'{path_scaf}{name}')[scaffolds_type] checked_scaf = [] n_valid = 0 for gen_scaf in data: if len(gen_scaf) != 0 and isinstance(gen_scaf, str):
#################################### # and do the plot flatui_alone = ['#000000', '#000000'] sns.set_palette(flatui_alone) dashe_space = 25 dashe_len = 12.5 dashes = None dict_src = {} dict_tgt = {} # start plotting for filename in os.listdir(path_fcd): if filename.endswith('.pkl'): name = filename.replace('.pkl', '') data = hp.load_obj(f'{path_fcd}{name}') epoch = int(name.split('_')[1]) te = name.split('_')[2] if float(temp) == float(te): dict_src[epoch] = data['f_dist_src'] dict_tgt[epoch] = data['f_dist_tgt'] do_plot(dict_src, dict_tgt, f'{save_path}frechet_distance_{te}.png', dashes=dashes) end = time.time() if verbose: print(f'FRECHET PLOT DONE in {end - start:.04} seconds') ####################################
def get_dict_with_data(data_name, min_len, max_len, aug):
    """
    Build the plotting records of one dataset for the current descriptor.

    The descriptors are loaded with hp.load_obj from
    'results/data/{data_name}/{min_len}_{max_len}_x{aug}/desc.pkl' and
    the entry desc_to_plot (read from the enclosing scope) is selected.

    Returns a dict with two aligned entries: 'seq_time', a list holding
    data_name once per value, and 'value', the selected values.
    """
    desc_path = f'results/data/{data_name}/{min_len}_{max_len}_x{aug}/desc.pkl'
    values = hp.load_obj(desc_path)[desc_to_plot]
    # one label per value so both entries have the same length
    labels = [data_name for _ in range(len(values))]
    return {'seq_time': labels, 'value': values}
# get back dataset descriptor src_space_name = config['DATA']['source_space'] src_space_name = src_space_name.replace('.txt', '') dict_temp = get_dict_with_data(src_space_name, min_len, max_len, int(config['AUGMENTATION']['source_space'])) df = update_df(df, dict_temp) for fname in os.listdir(path_des): if fname.endswith('.pkl'): if 'desc' in fname and str(temp) in fname: name = fname.replace('.pkl', '') epoch = int(name.split('_')[1]) seq_time = f'epoch {epoch}' # get values data = hp.load_obj(path_des + fname) values = data[desc_to_plot] # add to dataframe dict_temp = { 'seq_time': [seq_time] * len(values), 'value': values } df = update_df(df, dict_temp) dict_temp = get_dict_with_data(name_data, min_len, max_len, int(config['AUGMENTATION']['fine_tuning'])) df = update_df(df, dict_temp) tgt_space_name = config['DATA']['target_space'] tgt_space_name = tgt_space_name.replace('.txt', '')