def do_data_analysis(data_rdkit, descriptor_name, save_dir, verbose=False):
    """
    Function to analize a dataset. Will compute: descritpor as specify in
    descriptors_name, Morgan fingerprint, Murcko and generic scaffolds.
    
    Parameters:
    - data_rdkit: list of RDKit mol.
    - descriptor_name (string): contain name of descriptor to compute.
    - save_dir (string): Path to save the output of the analysis.
    """

    # Compute the descriptors with RDKit,
    # as specified in descriptor_name
    desc_names = re.compile(descriptor_name)
    functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
    descriptors = hp_chem.rdkit_desc(data_rdkit, functions, names)
    hp.save_obj(descriptors, f'{save_dir}desc')

    # Compute fingerprints
    fingerprint = hp_chem.fingerprint_calc(data_rdkit, verbose=verbose)
    fp_dict = {'fingerprint': fingerprint}
    hp.save_obj(fp_dict, f'{save_dir}fp')

    # Extract Murcko and generic scaffolds
    scaf, generic_scaf = hp_chem.extract_murcko_scaffolds(data_rdkit)
    desc_scaf = {'scaffolds': scaf, 'generic_scaffolds': generic_scaf}
    hp.save_obj(desc_scaf, f'{save_dir}scaf')
    hp.write_in_file(f'{save_dir}generic_scaffolds.txt', generic_scaf)
    hp.write_in_file(f'{save_dir}scaffolds.txt', scaf)
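

def _example_data_analysis(save_dir='results/example_run/'):
    """
    Usage sketch (illustrative, not part of the original pipeline): build
    RDKit mols from a few arbitrary SMILES and run do_data_analysis on them.
    The SMILES and the default save_dir are assumptions for the example.
    """
    from rdkit import Chem
    example_smiles = ['CCO', 'c1ccccc1', 'CC(=O)Nc1ccc(O)cc1']
    mols = [Chem.MolFromSmiles(s) for s in example_smiles]
    do_data_analysis(mols, FP.DESCRIPTORS['names'], save_dir)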


def do_processing(split,
                  data_path,
                  augmentation,
                  min_len,
                  max_len,
                  save_dir,
                  verbose=True):
    """
    Function to process a dataset.
    
    Parameters:
    - split (float): value used to split the dataset between
    the training set and the validation set. E.g., if split is 0.8,
    80% of the data will go in the training set, and 20% in the 
    validation set.
    - data_path (string): path to the dataset.
    - augmentation (int): value to augment the dataset. E.g., if augmentation
    is 10, the SMILES enumeration will be done to add 10 different 
    SMILES encoding for each SMILES (i.e. resulting in a total of 11 representations)
    for a given SMILES in the dataset.
    - min_len (int): minimum length of SMILES to be kept in the dataset.
    - max_len (int): maximum length of SMILES to be kept in the dataset.
    - save_dir (string): directory to save the processed dataset.
    """

    # load the data with right SMILES limits,
    # both in a list and in rdkit mol format
    data_ori, data_rdkit = load_data(data_path,
                                     min_len,
                                     max_len,
                                     verbose=verbose)

    # we save the data without augmentation if it was
    # not already saved. We will need it to check the novelty
    # of the generated SMILES
    if not os.path.isfile(f'{save_dir}pruned.txt'):
        hp.write_in_file(f'{save_dir}pruned.txt', data_ori)

    if verbose: print('Start data analysis')
    do_data_analysis(data_rdkit, FP.DESCRIPTORS['names'], save_dir)

    # draw top scaffolds
    if verbose: print('Start drawing scaffolds')
    top_common = 20
    draw_scaffolds(top_common, save_dir)

    if verbose: print('Start data processing')
    # define index for the tr-val split
    # and shuffle them
    all_idx = np.arange(len(data_ori))
    idx_split = int(split * len(all_idx))
    np.random.shuffle(all_idx)
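    # e.g. (illustrative numbers) with split = 0.8 and 1000 SMILES,
    # idx_split = 800: after shuffling, the first 800 indices form the
    # training set and the remaining 200 the validation set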

    # we need to be careful about the case where
    # idx_split = 0 when there is only one
    # SMILES in the data, e.g. for fine-tuning
    if idx_split == 0:
        # in this case, we use the single SMILES for both
        # the training and the validation set
        idx_tr_canon = [0]
        idx_val_canon = [0]
    else:
        idx_tr_canon = all_idx[:idx_split]
        idx_val_canon = all_idx[idx_split:]

    assert len(idx_tr_canon) != 0
    assert len(idx_val_canon) != 0

    if verbose:
        print(f'Size of the training set after split: {len(idx_tr_canon)}')
        print(f'Size of the validation set after split: {len(idx_val_canon)}')

    data_tr = [data_ori[i] for i in idx_tr_canon]
    data_val = [data_ori[i] for i in idx_val_canon]
    hp.write_in_file(f'{save_dir}data_tr.txt', data_tr)
    hp.write_in_file(f'{save_dir}data_val.txt', data_val)

    # NOTE: save_name is not defined in this snippet; as an assumption,
    # we derive a placeholder from the dataset filename so the saving
    # steps below can run
    save_name = os.path.splitext(os.path.basename(data_path))[0]

    if augmentation > 0:
        if verbose:
            print(f'Data augmentation {augmentation}-fold start')

        # Augment the training and validation splits separately.
        # Doing these steps separately avoids having the same
        # molecule represented in both splits
        tr_aug = augment_dataset(data_tr,
                                 augmentation,
                                 min_len,
                                 max_len,
                                 verbose=False)
        val_aug = augment_dataset(data_val,
                                  augmentation,
                                  min_len,
                                  max_len,
                                  verbose=False)

        # Merge with the original data and shuffle
        full_training_set = list(set(data_tr + tr_aug))
        shuffle(full_training_set)
        full_validation_set = list(set(data_val + val_aug))
        shuffle(full_validation_set)
        full_datalist = full_training_set + full_validation_set

        if verbose:
            print(
                f'Size of the training set after augmentation: {len(full_training_set)}'
            )
            print(
                f'Size of the validation set after augmentation: {len(full_validation_set)}'
            )

        # Create the partitions for the data generators
        # with the full augmented dataset
        idx_tr = np.arange(len(full_training_set))
        idx_val = np.arange(len(full_training_set),
                            len(full_training_set) + len(full_validation_set))

        # Save
        hp.write_in_file(f'{save_dir}{save_name}.txt', full_datalist)
        hp.save_obj(list(idx_tr), save_dir + 'idx_tr')
        hp.save_obj(list(idx_val), save_dir + 'idx_val')
    else:
        # Save
        hp.write_in_file(f'{save_dir}{save_name}.txt', data_ori)
        hp.save_obj(list(idx_tr_canon), f'{save_dir}idx_tr')
        hp.save_obj(list(idx_val_canon), f'{save_dir}idx_val')
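

def _example_processing(data_path='data/example_smiles.txt',
                        save_dir='results/example_run/'):
    """
    Usage sketch (illustrative, not part of the original pipeline): run
    do_processing with an 80/20 split and 5-fold augmentation, keeping
    SMILES of length 1 to 140. The default paths and values here are
    assumptions for the example.
    """
    do_processing(split=0.8,
                  data_path=data_path,
                  augmentation=5,
                  min_len=1,
                  max_len=140,
                  save_dir=save_dir,
                  verbose=True)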
Example #3
                # Now let's prune the valid SMILES to the unique and novel ones
                unique_set = set(valids)
                n_unique = len(unique_set)
                novo_tr = list(unique_set - set(data_training))
                n_novo_tr = len(novo_tr)
                novo_val = list(unique_set - set(data_validation))
                n_novo_val = len(novo_val)
                novo_analysis = {
                    'n_valid': n_valid,
                    'n_unique': n_unique,
                    'n_novo_tr': n_novo_tr,
                    'n_novo_val': n_novo_val,
                    'novo_tr': novo_tr
                }
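
                # For example (illustrative values): if valids contains
                # ['CCO', 'CCO', 'c1ccccc1', 'CCN'] and only 'c1ccccc1'
                # already appears in data_training, then n_unique = 3
                # and n_novo_tr = 2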

                # we save the novo molecules also as .txt
                novo_name = f'{save_path}molecules_{name}'
                with open(f'{novo_name}.txt', 'w') as f:
                    for item in novo_tr:
                        f.write(f'{item}\n')

                hp.save_obj(novo_analysis, novo_name)

                if verbose: print(f'sampling analysis for {name} done')
            else:
                print(f'There are {n_valid} valid SMILES for {name}')

    end = time.time()
    if verbose: print(f'NOVO ANALYSIS DONE in {end - start:.04} seconds')
    ####################################
Example #4
####################################
# start the sampling of new SMILES
epoch = model_path.split('/')[-1].replace('.h5', '')
if verbose: print(f'Sampling from model saved at epoch {epoch}')

model = load_model(model_path)

generated_smi = []
counter = 0
start_sampling = time.time()
for n in range(n_sample):
    generated_smi.append(sample(model, temp,
                                start_char, end_char, max_len + 1,
                                indices_token, token_indices))

    # When at least 100 molecules are requested,
    # we report the sampling progress to the user
    # every 10% of the total
    if n_sample >= 100:
        if len(generated_smi) % int(0.1 * n_sample) == 0:
            counter += 10
            delta_time = time.time() - start_sampling
            start_sampling = start_sampling + delta_time
            print(f'Status for model from epoch {epoch}: {counter}% of the molecules sampled in {delta_time:.2f} seconds')

hp.save_obj(generated_smi, f'{save_path}{epoch}_{temp}')

end = time.time()
if verbose: print(f'SAMPLING DONE for model from epoch {epoch} in {end - start:.2f} seconds')
####################################
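
# A minimal sketch of the temperature trick a sample() helper like the one
# above typically relies on (assumption; the project's actual sample() is
# defined elsewhere): rescale the model's next-token probabilities by the
# sampling temperature before drawing the next character.
import numpy as np

def sample_next_index(probs, temp=1.0):
    """Draw a token index from a probability vector rescaled by temperature."""
    probs = np.asarray(probs, dtype=np.float64)
    logits = np.log(probs + 1e-12) / temp          # low temp -> sharper distribution
    exp_logits = np.exp(logits - logits.max())     # softmax with numerical stability
    rescaled = exp_logits / exp_logits.sum()
    return int(np.random.choice(len(rescaled), p=rescaled))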
     
            # We double-check the SMILES here because RDKit
            # sometimes throws unexpected errors
            data_rdkit = []
            for x in novo_tr:
                mol = Chem.MolFromSmiles(x)
                if mol is not None:
                    data_rdkit.append(mol)

            save_name = name.split('_')[1] + '_' + name.split('_')[2]

            # descriptors
            desc_names = re.compile(FP.DESCRIPTORS['names'])
            functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
            desc_dict = hp_chem.rdkit_desc(data_rdkit, functions, names)
            hp.save_obj(desc_dict, save_path + f'desc_{save_name}')

            # scaffolds
            scaf, generic_scaf = hp_chem.extract_murcko_scaffolds(data_rdkit)
            desc_scaf = {'scaffolds': scaf, 'generic_scaffolds': generic_scaf}
            hp.save_obj(desc_scaf, f'{save_path}scaf_{save_name}')
            hp.write_in_file(f'{save_path}{save_name}_scaffolds.txt', scaf)
            hp.write_in_file(f'{save_path}{save_name}_generic_scaffolds.txt',
                             generic_scaf)

            # fingerprints
            fingerprint = hp_chem.fingerprint_calc(data_rdkit, verbose=verbose)
            fp_dict = {'fingerprint': fingerprint}
            hp.save_obj(fp_dict, save_path + f'fp_{save_name}')
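
            # For reference, the hp_chem helpers above presumably wrap
            # standard RDKit calls (assumption); the direct equivalents
            # would look roughly like:
            #   from rdkit.Chem import AllChem
            #   from rdkit.Chem.Scaffolds import MurckoScaffold
            #   scaffold = MurckoScaffold.GetScaffoldForMol(mol)
            #   generic  = MurckoScaffold.MakeScaffoldGeneric(scaffold)
            #   fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)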

    end = time.time()
    if mode == 'fine_tuning':
        # Load the pretrained model
        path_model = config['FINETUNING']['path_model']
        if path_model is None:
            raise ValueError(
                'You did not provide a path to a model to be loaded for fine-tuning'
            )

        pre_model = load_model(path_model)
        pre_weights = pre_model.get_weights()
        seqmodel.model.set_weights(pre_weights)

    if verbose:
        seqmodel.model.summary()

    history = seqmodel.model.fit_generator(
        generator=tr_generator,
        validation_data=val_generator,
        use_multiprocessing=True,
        epochs=epochs,
        callbacks=[checkpointer, lr_reduction],
        workers=n_workers)

    # Save the loss history
    hp.save_obj(history.history, f'{save_path}history')

    end = time.time()
    print(f'TRAINING DONE in {end - start:.05} seconds')
    ####################################
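
# A minimal sketch of how the 'checkpointer' and 'lr_reduction' callbacks
# used above are typically constructed in Keras (assumption: their actual
# definitions live elsewhere in this project; the filepath pattern and
# hyperparameters below are placeholders):
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

checkpointer = ModelCheckpoint(filepath='models/epoch_{epoch:02d}.h5',
                               save_weights_only=True,
                               period=1)
lr_reduction = ReduceLROnPlateau(monitor='val_loss',
                                 factor=0.5,
                                 patience=3,
                                 min_lr=1e-5)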