def do_data_analysis(data_rdkit, descriptor_name, save_dir, verbose=False):
    """ Function to analyze a dataset. Will compute: the descriptors
    specified in descriptor_name, Morgan fingerprints, and Murcko
    and generic scaffolds.

    Parameters:
    - data_rdkit: list of RDKit mol.
    - descriptor_name (string): name(s) of the descriptors to compute.
    - save_dir (string): path to save the output of the analysis.
    - verbose (bool): whether to print progress information.
    """

    # Compute the descriptors with rdkit, as specified in descriptor_name
    # (by default, the names defined in the fixed parameter file)
    desc_names = re.compile(descriptor_name)
    functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
    descriptors = hp_chem.rdkit_desc(data_rdkit, functions, names)
    hp.save_obj(descriptors, f'{save_dir}desc')

    # Compute fingerprints
    fingerprint = hp_chem.fingerprint_calc(data_rdkit, verbose=verbose)
    fp_dict = {'fingerprint': fingerprint}
    hp.save_obj(fp_dict, f'{save_dir}fp')

    # Extract Murcko and generic scaffolds
    scaf, generic_scaf = hp_chem.extract_murcko_scaffolds(data_rdkit)
    desc_scaf = {'scaffolds': scaf, 'generic_scaffolds': generic_scaf}
    hp.save_obj(desc_scaf, f'{save_dir}scaf')
    hp.write_in_file(f'{save_dir}generic_scaffolds.txt', generic_scaf)
    hp.write_in_file(f'{save_dir}scaffolds.txt', scaf)
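
# For reference: a minimal sketch of what a scaffold helper such as
# hp_chem.extract_murcko_scaffolds might do internally, using RDKit's
# MurckoScaffold module. This is an illustrative assumption, not the
# actual hp_chem implementation.
def _sketch_extract_murcko_scaffolds(mols):
    """Return Murcko scaffolds and their generic (all-carbon, single-bond) versions as SMILES."""
    from rdkit import Chem
    from rdkit.Chem.Scaffolds import MurckoScaffold

    scaf, generic_scaf = [], []
    for mol in mols:
        core = MurckoScaffold.GetScaffoldForMol(mol)
        scaf.append(Chem.MolToSmiles(core))
        generic_scaf.append(Chem.MolToSmiles(MurckoScaffold.MakeScaffoldGeneric(core)))
    return scaf, generic_scaf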
def do_processing(split, data_path, augmentation, min_len, max_len, save_dir, verbose=True):
    """ Function to process a dataset.

    Parameters:
    - split (float): fraction used to split the dataset between the
    training set and the validation set. E.g., if split is 0.8, 80% of
    the data will go in the training set and 20% in the validation set.
    - data_path (string): path to the dataset.
    - augmentation (int): number of alternative SMILES to add per molecule.
    E.g., if augmentation is 10, SMILES enumeration adds 10 different
    SMILES encodings for each SMILES (i.e. up to 11 representations in
    total for a given molecule in the dataset).
    - min_len (int): minimum length of SMILES to be kept in the dataset.
    - max_len (int): maximum length of SMILES to be kept in the dataset.
    - save_dir (string): directory to save the processed dataset.
    - verbose (bool): whether to print progress information.
    """

    # load the data with the right SMILES length limits,
    # both as a list of strings and as RDKit mols
    data_ori, data_rdkit = load_data(data_path, min_len, max_len, verbose=verbose)

    # we save the data without augmentation if it was
    # not already saved. We will need it to check the novelty
    # of the generated SMILES
    if not os.path.isfile(f'{save_dir}pruned.txt'):
        hp.write_in_file(f'{save_dir}pruned.txt', data_ori)

    if verbose: print('Start data analysis')
    do_data_analysis(data_rdkit, FP.DESCRIPTORS['names'], save_dir)

    # draw top scaffolds
    if verbose: print('Start drawing scaffolds')
    top_common = 20
    draw_scaffolds(top_common, save_dir)

    if verbose: print('Start data processing')

    # define the indices for the training-validation split
    # and shuffle them
    all_idx = np.arange(len(data_ori))
    idx_split = int(split * len(all_idx))
    np.random.shuffle(all_idx)

    # we need to be careful about the case where
    # idx_split = 0, i.e. when there is only one
    # SMILES in the data, e.g. for fine-tuning
    if idx_split == 0:
        # in this case, we use the unique SMILES both
        # for training and validation
        idx_tr_canon = [0]
        idx_val_canon = [0]
    else:
        idx_tr_canon = all_idx[:idx_split]
        idx_val_canon = all_idx[idx_split:]

    assert len(idx_tr_canon) != 0
    assert len(idx_val_canon) != 0

    if verbose:
        print(f'Size of the training set after split: {len(idx_tr_canon)}')
        print(f'Size of the validation set after split: {len(idx_val_canon)}')

    d = dict(enumerate(data_ori))
    data_tr = [d.get(item) for item in idx_tr_canon]
    data_val = [d.get(item) for item in idx_val_canon]
    hp.write_in_file(f'{save_dir}data_tr.txt', data_tr)
    hp.write_in_file(f'{save_dir}data_val.txt', data_val)

    if augmentation > 0:
        if verbose: print(f'Data augmentation {augmentation}-fold start')

        # Augment the training and the validation splits separately,
        # to avoid having the same molecule represented in both splits
        tr_aug = augment_dataset(data_tr, augmentation, min_len, max_len, verbose=False)
        val_aug = augment_dataset(data_val, augmentation, min_len, max_len, verbose=False)

        # Merge with the original data and shuffle
        full_training_set = list(set(data_tr + tr_aug))
        shuffle(full_training_set)
        full_validation_set = list(set(data_val + val_aug))
        shuffle(full_validation_set)
        full_datalist = full_training_set + full_validation_set

        if verbose:
            print(f'Size of the training set after augmentation: {len(full_training_set)}')
            print(f'Size of the validation set after augmentation: {len(full_validation_set)}')

        # Create the partitions for the data generators
        # with the full augmented dataset
        idx_tr = np.arange(len(full_training_set))
        idx_val = np.arange(len(full_training_set),
                            len(full_training_set) + len(full_validation_set))

        # Save
        hp.write_in_file(f'{save_dir}{save_name}.txt', full_datalist)
        hp.save_obj(list(idx_tr), f'{save_dir}idx_tr')
        hp.save_obj(list(idx_val), f'{save_dir}idx_val')

    else:
        # Save
        hp.write_in_file(f'{save_dir}{save_name}.txt', data_ori)
        hp.save_obj(list(idx_tr_canon), f'{save_dir}idx_tr')
        hp.save_obj(list(idx_val_canon), f'{save_dir}idx_val')
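
# For reference: a minimal sketch of how a helper like augment_dataset can
# enumerate alternative SMILES with RDKit. The actual augment_dataset used
# above may differ (e.g., in duplicate handling or tokenization checks).
def _sketch_augment_dataset(smiles_list, augmentation, min_len, max_len):
    """Generate up to `augmentation` random SMILES per input molecule."""
    from rdkit import Chem

    augmented = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue
        for _ in range(augmentation):
            # doRandom=True writes the molecule with a randomized atom order,
            # i.e. an alternative valid SMILES for the same molecule
            rand_smi = Chem.MolToSmiles(mol, canonical=False, doRandom=True)
            if min_len <= len(rand_smi) <= max_len:
                augmented.append(rand_smi)
    return augmented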
    # Now prune the valid SMILES: deduplicate them and keep
    # only the ones not present in the training/validation data
    unique_set = set(valids)
    n_unique = len(unique_set)

    novo_tr = list(unique_set - set(data_training))
    n_novo_tr = len(novo_tr)
    novo_val = list(unique_set - set(data_validation))
    n_novo_val = len(novo_val)

    novo_analysis = {'n_valid': n_valid,
                     'n_unique': n_unique,
                     'n_novo_tr': n_novo_tr,
                     'n_novo_val': n_novo_val,
                     'novo_tr': novo_tr}

    # we also save the novo molecules as .txt
    novo_name = f'{save_path}molecules_{name}'
    with open(f'{novo_name}.txt', 'w+') as f:
        for item in novo_tr:
            f.write(f'{item}\n')

    hp.save_obj(novo_analysis, novo_name)

    if verbose: print(f'sampling analysis for {name} done')
else:
    print(f'There are only {n_valid} valid SMILES for {name}')

end = time.time()
if verbose: print(f'NOVO ANALYSIS DONE in {end - start:.04} seconds')
####################################
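
# For reference: `valids` and `n_valid` above come from an earlier validity
# check on the sampled SMILES. A minimal sketch of such a check with RDKit
# (the actual code may, e.g., also strip padding/start/end tokens first):
def _sketch_get_valid_smiles(generated_smiles):
    """Keep only strings that RDKit can parse, returned in canonical form."""
    from rdkit import Chem

    valids = []
    for smi in generated_smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            valids.append(Chem.MolToSmiles(mol))
    return valids, len(valids)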
####################################
# start the sampling of new SMILES
epoch = model_path.split('/')[-1].replace('.h5', '')
if verbose: print(f'Sampling from model saved at epoch {epoch}')

model = load_model(model_path)

generated_smi = []
counter = 0
start_sampling = time.time()
for n in range(n_sample):
    generated_smi.append(sample(model, temp,
                                start_char, end_char, max_len + 1,
                                indices_token, token_indices))

    # From 100 molecules to sample onwards, we report
    # the progress to the user every 10% of the samples
    if n_sample >= 100:
        if len(generated_smi) % int(0.1 * n_sample) == 0:
            counter += 10
            delta_time = time.time() - start_sampling
            start_sampling = start_sampling + delta_time
            print(f'Status for model from epoch {epoch}: '
                  f'{counter}% of the molecules sampled in {delta_time:.2f} seconds')

hp.save_obj(generated_smi, f'{save_path}{epoch}_{temp}')

end = time.time()
if verbose: print(f'SAMPLING DONE for model from epoch {epoch} in {end - start:.2f} seconds')
####################################
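
# For reference: a minimal sketch of the temperature trick used inside a
# `sample`-style function for a token-level generative model. The names and
# the exact encoding used by the real `sample` function are assumptions.
def _sketch_temperature_pick(probabilities, temp):
    """Sample one token index from a model's output distribution at a given temperature."""
    import numpy as np

    # rescale the distribution: temp < 1 sharpens it (more conservative),
    # temp > 1 flattens it (more diverse molecules, but more invalid SMILES)
    logits = np.log(np.asarray(probabilities, dtype=np.float64) + 1e-12) / temp
    probs = np.exp(logits)
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))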
# We re-parse every SMILES as a double check, because RDKit
# occasionally rejects strings that passed the earlier filters
data_rdkit = []
for x in novo_tr:
    mol = Chem.MolFromSmiles(x)
    if mol is not None:
        data_rdkit.append(mol)

save_name = name.split('_')[1] + '_' + name.split('_')[2]

# descriptors
desc_names = re.compile(FP.DESCRIPTORS['names'])
functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
desc_dict = hp_chem.rdkit_desc(data_rdkit, functions, names)
hp.save_obj(desc_dict, f'{save_path}desc_{save_name}')

# scaffolds
scaf, generic_scaf = hp_chem.extract_murcko_scaffolds(data_rdkit)
desc_scaf = {'scaffolds': scaf, 'generic_scaffolds': generic_scaf}
hp.save_obj(desc_scaf, f'{save_path}scaf_{save_name}')
hp.write_in_file(f'{save_path}{save_name}_scaffolds.txt', scaf)
hp.write_in_file(f'{save_path}{save_name}_generic_scaffolds.txt', generic_scaf)

# fingerprints
fingerprint = hp_chem.fingerprint_calc(data_rdkit, verbose=verbose)
fp_dict = {'fingerprint': fingerprint}
hp.save_obj(fp_dict, f'{save_path}fp_{save_name}')

end = time.time()
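
# For reference: a minimal sketch of a Morgan fingerprint helper such as
# hp_chem.fingerprint_calc, using RDKit. Radius and bit size here are
# assumptions; the actual helper may use different settings.
def _sketch_fingerprint_calc(mols, radius=2, n_bits=2048):
    """Compute Morgan (ECFP-like) bit-vector fingerprints for a list of RDKit mols."""
    from rdkit.Chem import AllChem

    return [AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            for mol in mols]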
if mode == 'fine_tuning':
    # Load the pretrained model and transfer its weights
    path_model = config['FINETUNING']['path_model']
    if path_model is None:
        raise ValueError('You did not provide a path to a model to be loaded for fine-tuning')
    pre_model = load_model(path_model)
    pre_weights = pre_model.get_weights()
    seqmodel.model.set_weights(pre_weights)

if verbose: seqmodel.model.summary()

history = seqmodel.model.fit_generator(generator=tr_generator,
                                       validation_data=val_generator,
                                       use_multiprocessing=True,
                                       epochs=epochs,
                                       callbacks=[checkpointer, lr_reduction],
                                       workers=n_workers)

# Save the loss history
hp.save_obj(history.history, f'{save_path}history')

end = time.time()
print(f'TRAINING DONE in {end - start:.05} seconds')
####################################
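
# For reference: `checkpointer` and `lr_reduction` are defined earlier in the
# training script. A minimal sketch of how such Keras callbacks are typically
# configured (filename pattern and hyperparameters here are assumptions):
def _sketch_training_callbacks(save_path):
    """Build a per-epoch checkpoint callback and a plateau-based learning rate scheduler."""
    from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

    checkpointer = ModelCheckpoint(filepath=save_path + '{epoch:02d}.h5',
                                   save_best_only=False,
                                   verbose=1)
    lr_reduction = ReduceLROnPlateau(monitor='val_loss',
                                     factor=0.5,
                                     patience=3,
                                     min_lr=1e-6,
                                     verbose=1)
    return checkpointer, lr_reduction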