def draw_at_temp(temp, path_data, save_path):
    """
    Function to draw scaffolds with rdkit
    
    temp: temperature
    path_data: path to data
    save_path: path to save pictures
    """
    for filename in os.listdir(path_data):
        if filename.endswith('.pkl'):
            name = filename.replace('.pkl', '')
            epoch = name.split('_')[1]
            te = name.split('_')[2]

            if float(temp) == float(te) and 'scaf' in name:
                data_ = hp.load_obj(path_data + name)

                for name_data, data in data_.items():
                    # some molecules come back as a list together with
                    # an error string; we drop non-string entries before
                    # drawing. Note that such cases are very rare.
                    data = [x for x in data if type(x) is str]
                    counter = collections.Counter(data)

                    figure_top_common_combined = 5
                    top_common_combined = 20
                    to_plot = [figure_top_common_combined, top_common_combined]

                    for top_common in to_plot:
                        common = counter.most_common(top_common)

                        # total number of scaffold occurrences,
                        # used for the percentage legends
                        total = sum(counter.values())

                        mols = [Chem.MolFromSmiles(x[0]) for x in common]
                        repet = [f'{100*x[1]/total:.2f}%' for x in common]

                        # draw the most common scaffolds in a single grid image
                        common_top = Draw.MolsToGridImage(mols,
                                                          molsPerRow=5,
                                                          subImgSize=(242,
                                                                      242),
                                                          legends=repet)

                        save_dir_common = f'{save_path}{name_data}/{te}/'
                        os.makedirs(save_dir_common, exist_ok=True)
                        save_filename = f'{save_dir_common}{epoch}_top_{top_common}.png'
                        common_top.save(save_filename)

                        # compute the scaled Shannon entropy (SSE) and write it on the image
                        sse = sdi(dict(common), scaled=True)
                        img = Image.open(save_filename)
                        number_t_write = len(common)
                        if number_t_write < 10:
                            p = inflect.engine()
                            number_t_write = p.number_to_words(
                                number_t_write).title()
                        text = f'{number_t_write} most common scaffolds at epoch {epoch} (SSE = {sse:.02}):'
                        add_txt_on_img(img, text, save_filename)
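# The helpers sdi() and add_txt_on_img() used above are not shown in this
# snippet. The sketches below are hypothetical minimal versions, assuming
# sdi() returns the (optionally scaled) Shannon diversity index of the
# scaffold counts and add_txt_on_img() writes a caption on the saved picture.
import math

from PIL import ImageDraw

def sdi(counts, scaled=False):
    """Shannon diversity index of a {scaffold: count} dict (sketch)."""
    total = sum(counts.values())
    h = -sum((c / total) * math.log(c / total) for c in counts.values() if c)
    if scaled and len(counts) > 1:
        # scale by the maximal entropy so the value lies in [0, 1]
        h /= math.log(len(counts))
    return h

def add_txt_on_img(img, text, save_filename):
    """Write `text` on a PIL image and save it (sketch)."""
    draw = ImageDraw.Draw(img)
    draw.text((10, 10), text, fill='black')
    img.save(save_filename)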
    def get_back_scaffolds(space, scaffolds_type):
        name = config['DATA'][space]
        name = name.replace('.txt', '')
        aug = int(config['AUGMENTATION'][space])
        path_to_scaf = f'results/data/{name}/{min_len}_{max_len}_x{aug}/scaf'

        if not os.path.isfile(f'{path_to_scaf}.pkl'):
            raise ValueError(
                'Scaffolds cannot be accessed. Are they missing for the source or target space?'
            )
        else:
            scaf = hp.load_obj(path_to_scaf)[scaffolds_type]

        return scaf
def draw_scaffolds(top_common, path):
    """ 
    Function to draw scaffolds with rdkit. 
    
    Parameters:
    - dict_scaf: dictionnary with scaffolds.
    - top_common (int): how many of the most common
    scaffolds to draw.
    - path (string): Path to save the scaffolds picture
    and to get the scaffolds data.
    """

    path_scaffolds = f'{path}scaf'
    data_ = hp.load_obj(path_scaffolds)

    for name_data, data in data_.items():
        # Note that some molecules come back as a list with
        # an error string; we remove them before drawing.
        # Note 2: such cases occur very rarely.
        data = [x for x in data if type(x) is str]
        counter = collections.Counter(data)
        common = counter.most_common(top_common)

        total = sum(counter.values())
        mols = [Chem.MolFromSmiles(x[0]) for x in common[:top_common]]
        repet = [
            str(x[1]) + f'({100*x[1]/total:.2f}%)' for x in common[:top_common]
        ]

        molsPerRow = 5
        common_top = Draw.MolsToGridImage(mols,
                                          molsPerRow=molsPerRow,
                                          subImgSize=(150, 150),
                                          legends=repet)

        common_top.save(f'{path}top_{top_common}_{name_data}.png')
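# Hypothetical usage sketch (the path below is an assumption, not one of the
# project's real result folders): draw the 20 most common scaffolds stored in
# <path>scaf.pkl and save the grid image in the same folder.
if __name__ == '__main__':
    draw_scaffolds(top_common=20, path='results/data/my_dataset/')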
    # canonical SMILES
    dir_split_data = f'results/data/{name_data}/{min_len}_{max_len}_x{augmentation}/'

    with open(f'{dir_split_data}data_tr.txt', 'r') as f:
        data_training = f.readlines()
    with open(f'{dir_split_data}data_val.txt', 'r') as f:
        data_validation = f.readlines()
    ####################################

    ####################################
    # Start iterating over the files
    t0 = time.time()
    for filename in os.listdir(path_gen):
        if filename.endswith('.pkl'):
            name = filename.replace('.pkl', '')
            data = hp.load_obj(path_gen + name)

            valids = []
            n_valid = 0

            for gen_smile in data:
                if len(gen_smile) != 0 and isinstance(gen_smile, str):
                    gen_smile = gen_smile.replace(pad_char, '')
                    gen_smile = gen_smile.replace(end_char, '')
                    gen_smile = gen_smile.replace(start_char, '')

                    mol = Chem.MolFromSmiles(gen_smile)
                    if mol is not None:
                        cans = Chem.MolToSmiles(mol,
                                                isomericSmiles=True,
                                                canonical=True)
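                        # Plausible continuation (assumption): the snippet is
                        # truncated here; keep the canonical SMILES and count
                        # the molecule as valid.
                        valids.append(cans)
                        n_valid += 1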
    ####################################
    # path to the saved de novo molecules
    path_novo = f'results/{name_data}/novo_molecules/'

    # path to save the scores
    save_path = f'results/{name_data}/analysis/'
    os.makedirs(save_path, exist_ok=True)
    ####################################

    ####################################
    # start iterating over the files
    for filename in os.listdir(path_novo):
        if filename.endswith('.pkl'):
            name = filename.replace('.pkl', '')
            data = hp.load_obj(f'{path_novo}{name}')
            novo_tr = data['novo_tr']

            # We re-parse with RDKit as a double check,
            # because it occasionally throws errors on rare inputs
            data_rdkit = []
            for i, x in enumerate(novo_tr):
                mol = Chem.MolFromSmiles(x)
                if mol is not None:
                    data_rdkit.append(mol)

            save_name = name.split('_')[1] + '_' + name.split('_')[2]

            # descriptors
            desc_names = re.compile(FP.DESCRIPTORS['names'])
            functions, names = hp_chem.get_rdkit_desc_functions(desc_names)
    ####################################

    ####################################
    # Path to the data
    augmentation = int(config['AUGMENTATION'][mode])
    dir_split_data = f'results/data/{name_data}/{min_len}_{max_len}_x{augmentation}/'
    if verbose: print(f'Data path : {dir_split_data}')

    # load partitions
    partition = {}
    path_partition_train = f'{dir_split_data}idx_tr'
    path_partition_valid = f'{dir_split_data}idx_val'

    partition['train'] = hp.load_obj(path_partition_train)
    partition['val'] = hp.load_obj(path_partition_valid)

    # get back the name of the training data from parameters
    path_data = f'{dir_split_data}{min_len}_{max_len}_x{augmentation}.txt'

    # finally, infer the vocabulary size from the length
    # of the token-index mapping used for tokenization
    vocab_size = len(indices_token)
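    # For illustration only (hypothetical mapping, not the project's actual
    # vocabulary): indices_token maps integer indices to tokens, e.g.
    # indices_token = {0: 'G', 1: 'E', 2: 'A', 3: 'C', 4: 'c', 5: '1'}
    # would give vocab_size == 6.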
    ####################################

    ####################################
    # Create the generators
    tr_generator = data_generator.DataGenerator(partition['train'],
                                                batch_size,
                                                max_len_model,
    scaf_source = get_back_scaffolds('source_space', scaffolds_type)

    # load the scaffolds of the target set (from which
    # the fine-tuning set is drawn), if provided
    if config['DATA']['target_space']:
        scaf_target = get_back_scaffolds('target_space', scaffolds_type)
    ####################################

    ####################################
    # scaffolds of the fine-tuning
    # molecules
    if mode == 'fine_tuning':
        aug = int(config['AUGMENTATION']['fine_tuning'])
        path_to_scaf = f'results/data/{name_data}/{min_len}_{max_len}_x{aug}/scaf'
        scaf_ft = hp.load_obj(path_to_scaf)[scaffolds_type]
    ####################################

    ####################################
    # start iterating over the files
    t0 = time.time()
    for filename in os.listdir(path_scaf):
        if filename.endswith('.pkl') and 'scaf' in filename:
            name = filename.replace('.pkl', '')
            data = hp.load_obj(f'{path_scaf}{name}')[scaffolds_type]

            checked_scaf = []
            n_valid = 0

            for gen_scaf in data:
                if len(gen_scaf) != 0 and isinstance(gen_scaf, str):
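                    # Plausible continuation (assumption): the snippet is
                    # truncated here; re-parse the scaffold with RDKit and
                    # keep only valid SMILES.
                    mol = Chem.MolFromSmiles(gen_scaf)
                    if mol is not None:
                        checked_scaf.append(Chem.MolToSmiles(mol))
                        n_valid += 1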
    ####################################
    # and do the plot
    flatui_alone = ['#000000', '#000000']
    sns.set_palette(flatui_alone)
    dashe_space = 25
    dashe_len = 12.5
    dashes = None

    dict_src = {}
    dict_tgt = {}

    # start plotting
    for filename in os.listdir(path_fcd):
        if filename.endswith('.pkl'):
            name = filename.replace('.pkl', '')
            data = hp.load_obj(f'{path_fcd}{name}')
            epoch = int(name.split('_')[1])
            te = name.split('_')[2]
            if float(temp) == float(te):
                dict_src[epoch] = data['f_dist_src']
                dict_tgt[epoch] = data['f_dist_tgt']

    do_plot(dict_src,
            dict_tgt,
            f'{save_path}frechet_distance_{te}.png',
            dashes=dashes)

    end = time.time()
    if verbose: print(f'FRECHET PLOT DONE in {end - start:.04} seconds')
    ####################################
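# do_plot() is defined elsewhere in the project; the sketch below is a
# hypothetical minimal version, assuming it plots the Frechet ChemNet
# Distance to the source and target spaces as a function of the epoch.
import matplotlib.pyplot as plt

def do_plot(dict_src, dict_tgt, save_filename, dashes=None):
    """Plot FCD vs. epoch for the source and target spaces (sketch)."""
    epochs = sorted(dict_src)
    plt.figure()
    plt.plot(epochs, [dict_src[e] for e in epochs],
             linestyle='--' if dashes else '-', label='source space')
    plt.plot(epochs, [dict_tgt[e] for e in epochs], label='target space')
    plt.xlabel('Epoch')
    plt.ylabel('Frechet ChemNet Distance')
    plt.legend()
    plt.savefig(save_filename)
    plt.close()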
def get_dict_with_data(data_name, min_len, max_len, aug):
    data_des = hp.load_obj(
        f'results/data/{data_name}/{min_len}_{max_len}_x{aug}/desc.pkl')
    des = data_des[desc_to_plot]
    dict_temp = {'seq_time': [data_name] * len(des), 'value': des}
    return dict_temp
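# update_df() is not shown in this snippet; a minimal sketch, assuming it
# simply appends the new records to the existing dataframe.
import pandas as pd

def update_df(df, dict_temp):
    """Append the rows in dict_temp to df (sketch)."""
    return pd.concat([df, pd.DataFrame(dict_temp)], ignore_index=True)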
    # get back the source space dataset descriptors
    src_space_name = config['DATA']['source_space']
    src_space_name = src_space_name.replace('.txt', '')
    dict_temp = get_dict_with_data(src_space_name, min_len, max_len,
                                   int(config['AUGMENTATION']['source_space']))
    df = update_df(df, dict_temp)

    for fname in os.listdir(path_des):
        if fname.endswith('.pkl'):
            if 'desc' in fname and str(temp) in fname:
                name = fname.replace('.pkl', '')
                epoch = int(name.split('_')[1])
                seq_time = f'epoch {epoch}'

                # get values
                data = hp.load_obj(path_des + fname)
                values = data[desc_to_plot]

                # add to dataframe
                dict_temp = {
                    'seq_time': [seq_time] * len(values),
                    'value': values
                }
                df = update_df(df, dict_temp)

    dict_temp = get_dict_with_data(name_data, min_len, max_len,
                                   int(config['AUGMENTATION']['fine_tuning']))
    df = update_df(df, dict_temp)

    tgt_space_name = config['DATA']['target_space']
    tgt_space_name = tgt_space_name.replace('.txt', '')