    def __init__(self, experiment_name):
        self._encoder = SMILESEncoder()

        # Read parameters used during training
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        # Read data
        if os.path.isfile('../data/' + self._file_name + '.csv'):
            self._data = pd.read_csv('../data/' + self._file_name + '.csv',
                                     header=None).values[:, 0]
        elif os.path.isfile('../data/' + self._file_name + '.tar.xz'):
            # Skip the first line (empty) and the last line (NaN)
            self._data = pd.read_csv('../data/' + self._file_name + '.tar.xz',
                                     compression='xz',
                                     header=None).values[1:-1, 0]

        # Clean data from start, end and padding token
        for i, mol_dat in enumerate(self._data):
            self._data[i] = clean_molecule(mol_dat, self._model_type)
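
For reference, a minimal sketch of the .ini layout this constructor expects; the section and key names come from the lookups above, while all values below are purely illustrative:

import configparser

sample_ini = """
[MODEL]
model = ForwardRNN
hidden_units = 256

[DATA]
data = smiles_dataset
encoding_size = 55
molecular_size = 75

[TRAINING]
epochs = 10
n_folds = 5
learning_rate = 0.001
batch_size = 128

[EVALUATION]
samples = 100
temp = 0.7
starting_token = G
"""

config = configparser.ConfigParser()
config.read_string(sample_ini)
assert config['MODEL']['model'] == 'ForwardRNN'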
Example #2
    def __init__(self, experiment_name='ForwardRNN'):

        self._encoder = SMILESEncoder()

        # Read all parameters from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        # self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode([self._config['EVALUATION']['starting_token']])

        # Read starting model
        self._start_model = self._config['FINETUNING']['start_model']

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode([self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size, self._learning_rate,
                               self._hidden_units, self._generation, self._missing_token)

        self._data = self._encoder.encode_from_file(self._file_name)
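
The FineTuner expects the same .ini layout sketched under Example #1, plus a [FINETUNING] section naming the checkpoint to start from. A hypothetical fragment (the value is illustrative; fine_tuning() resolves it relative to the experiment's evaluation directory):

[FINETUNING]
start_model = models/model_fold_1_epochs_9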
Example #3
def _train(args):
    """
    Train the NADE model
    :param args: parsed arguments
    """
    if K.backend() != 'tensorflow':
        raise NotImplementedError(
            "This repository only supports the TensorFlow backend.")

    batch_size_ = 512
    nb_users = 6040
    nb_movies = 3706
    data_sample = 1.0
    input_dim0 = 6040
    input_dim1 = 5
    std = 0.0
    alpha = 1.0

    print('Loading data...')
    train_file_list = sorted(glob.glob(os.path.join('data/train_set',
                                                    'part*')))
    val_file_list = sorted(glob.glob(os.path.join('data/val_set/', 'part*')))
    test_file_list = sorted(glob.glob(os.path.join('data/test_set/', 'part*')))

    train_file_list = [
        dfile for dfile in train_file_list if os.stat(dfile).st_size != 0
    ]
    val_file_list = [
        dfile for dfile in val_file_list if os.stat(dfile).st_size != 0
    ]
    test_file_list = [
        dfile for dfile in test_file_list if os.stat(dfile).st_size != 0
    ]

    print("Shuffle the data...")
    random.shuffle(train_file_list)
    random.shuffle(val_file_list)
    random.shuffle(test_file_list)
    train_file_list = train_file_list[:max(
        int(len(train_file_list) * data_sample), 1)]

    print('Instantiate DataSet classes...')
    train_set = DataSet(train_file_list,
                        num_users=nb_users,
                        num_items=nb_movies,
                        batch_size=batch_size_,
                        mode=0)
    val_set = DataSet(val_file_list,
                      num_users=nb_users,
                      num_items=nb_movies,
                      batch_size=batch_size_,
                      mode=1)
    test_set = DataSet(test_file_list,
                       num_users=nb_users,
                       num_items=nb_movies,
                       batch_size=batch_size_,
                       mode=2)

    rating_freq = np.zeros((input_dim0, input_dim1))
    init_b = np.zeros((input_dim0, input_dim1))
    for batch in val_set.generate(max_iters=1):
        inp_r = batch[0]['input_ratings']
        out_r = batch[0]['output_ratings']
        inp_m = batch[0]['input_masks']
        out_m = batch[0]['output_masks']
        rating_freq += inp_r.sum(axis=0)

    log_rating_freq = np.log(rating_freq + 1e-8)
    log_rating_freq_diff = np.diff(log_rating_freq, axis=1)
    init_b[:, 1:] = log_rating_freq_diff
    init_b[:, 0] = log_rating_freq[:, 0]
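    # The biases are initialized so that the cumulative sum of init_b along the
    # rating axis recovers log_rating_freq; assuming the NADE layer accumulates
    # these biases across rating bins (as in CF-NADE), a softmax over them then
    # starts out matching each item's empirical marginal rating distribution.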

    new_items = np.where(rating_freq.sum(axis=1) == 0)[0]

    input_layer = Input(shape=(input_dim0, input_dim1), name='input_ratings')
    output_ratings = Input(shape=(input_dim0, input_dim1),
                           name='output_ratings')

    input_masks = Input(shape=(input_dim0, ), name='input_masks')
    output_masks = Input(shape=(input_dim0, ), name='output_masks')

    print("Build NADE architecture...")
    # nade_layer = Dropout(0.0)(input_layer)
    nade_layer = input_layer
    nade_layer = NADE(hidden_dim=args.hidden_dim,
                      activation='tanh',
                      bias=True,
                      W_regularizer=keras.regularizers.l2(0.02),
                      V_regularizer=keras.regularizers.l2(0.02),
                      b_regularizer=keras.regularizers.l2(0.02),
                      c_regularizer=keras.regularizers.l2(0.02),
                      args=args)(nade_layer)

    predicted_ratings = Lambda(prediction_layer,
                               output_shape=prediction_output_shape,
                               name='predicted_ratings')(nade_layer)

    d = Lambda(d_layer, output_shape=d_output_shape, name='d')(input_masks)

    sum_masks = add([input_masks, output_masks])
    D = Lambda(D_layer, output_shape=D_output_shape, name='D')(sum_masks)

    loss_out = Lambda(rating_cost_lambda_func,
                      output_shape=(1, ),
                      name='nade_loss')([
                          nade_layer, output_ratings, input_masks,
                          output_masks, D, d
                      ])

    cf_nade_model = Model(
        inputs=[input_layer, output_ratings, input_masks, output_masks],
        outputs=[loss_out, predicted_ratings])

    print("Get NADE model summary...")
    cf_nade_model.summary()
    # Use Adam optimizer
    adam = Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    # Compile NADE model
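    # The 'nade_loss' output of the model is itself the training loss (computed
    # by the rating_cost_lambda_func Lambda above), so the loss below simply
    # returns y_pred unchanged and Keras minimizes the model-computed value.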
    cf_nade_model.compile(loss={
        'nade_loss': lambda y_true, y_pred: y_pred
    },
                          optimizer=adam)

    # Create EvaluationCallback for NADE model on train and validation sets
    train_evaluation_callback = EvaluationCallback(data_set=train_set,
                                                   new_items=new_items,
                                                   training_set=True)
    valid_evaluation_callback = EvaluationCallback(data_set=val_set,
                                                   new_items=new_items,
                                                   training_set=False)

    print('Training...')
    cf_nade_model.fit_generator(
        train_set.generate(),
        steps_per_epoch=(train_set.get_corpus_size() // batch_size_),
        epochs=args.n_epochs,
        validation_data=val_set.generate(),
        validation_steps=(val_set.get_corpus_size() // batch_size_),
        shuffle=True,
        callbacks=[
            train_set, val_set, train_evaluation_callback,
            valid_evaluation_callback
        ],
        verbose=1)

    print('Testing...')
    rate_score = np.array([1, 2, 3, 4, 5], np.float32)

    squared_error = []
    n_samples = []
    for i, batch in enumerate(test_set.generate(max_iters=1)):
        inp_r = batch[0]['input_ratings']
        out_r = batch[0]['output_ratings']
        inp_m = batch[0]['input_masks']
        out_m = batch[0]['output_masks']

        pred_batch = cf_nade_model.predict(batch[0])[1]
        true_r = out_r.argmax(axis=2) + 1
        pred_r = (pred_batch *
                  rate_score[np.newaxis, np.newaxis, :]).sum(axis=2)

        pred_r[:, new_items] = 3

        mask = out_r.sum(axis=2)

        se = np.sum(np.square(true_r - pred_r) * mask)
        n = np.sum(mask)
        squared_error.append(se)
        n_samples.append(n)

    total_squared_error = np.array(squared_error).sum()
    total_n_samples = np.array(n_samples).sum()
    rmse = np.sqrt(total_squared_error / (total_n_samples * 1.0 + 1e-8))
    print("test set RMSE is %f" % rmse)
Example #4
def init_model(opts):
    model_config = opts["model"]
    model_config["in_feats"] = 28 * 28
    model = NADE(**model_config)
    return model
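
A hypothetical call, assuming opts comes from a parsed config whose "model" section holds the NADE constructor's remaining keyword arguments (the key name below is illustrative):

opts = {"model": {"hidden_dim": 500}}
model = init_model(opts)  # in_feats is fixed to 28 * 28 = 784 (MNIST)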
Example #5
    def iterative_algorithm(
        self,
        name,
        pop_size=100,
        genome_length=20,
        lim_percentage=20,
        corruption_level=0.2,
        num_epochs=50,
        lr=0.1,
        max_evaluations=200000,
        unique_training=False,
        hiddens=300,
        rtr=True,
        w=10
        ):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path), "w")
        self.mask = np.random.binomial(1, 0.5, genome_length)
        trials = max_evaluations / pop_size
        population_limit = int(pop_size * (lim_percentage / 100.0))
        # self.dA = dA(n_visible=genome_length, n_hidden=hiddens)
        # self.dA.build_dA(corruption_level)
        # self.build_sample_dA()
        self.NADE = NADE(n_visible=genome_length, n_hidden=hiddens)
        # self.NADE.build_NADE()
        new_population = np.random.binomial(1, 0.5, (pop_size, genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write("{0},{1},{2},{3}\n".format(
            np.mean(self.population_fitnesses),
            np.min(self.population_fitnesses),
            np.max(self.population_fitnesses),
            np.std(self.population_fitnesses)))
        for iteration in range(0, trials):
            print "iteration:", iteration
            population = new_population
            self.population = new_population
            rw = self.tournament_selection_replacement(population)
            good_strings, good_strings_fitnesses = self.get_good_strings(
                population,
                population_limit,
                unique=unique_training,
                fitnesses=self.population_fitnesses)
            print "training A/E"
            training_data = np.array(good_strings)
            self.train_NADE(training_data, num_epochs=num_epochs, lr=lr)
            print "sampling..."
            sampled_population = np.array(
                self.NADE.sample_multiple(n=len(new_population)), "b")
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                    population,
                    sampled_population,
                    population_fitnesses=self.population_fitnesses,
                    sample_fitnesses=self.sample_fitnesses,
                    w=w)
            else:
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                         np.min(self.population_fitnesses),
                                         np.max(self.population_fitnesses))
            print "best from previous:", (
                self.fitness(new_population[np.argmax(self.population_fitnesses)]))
            fitfile.write("{0},{1},{2},{3}\n".format(
                np.mean(self.population_fitnesses),
                np.min(self.population_fitnesses),
                np.max(self.population_fitnesses),
                np.std(self.population_fitnesses)))
            fitfile.flush()
        fitfile.close()
        return new_population
Example #6
class AESolver(object):
    """
    The Denoising Autoencoder Genetic Algorithm
    """
    def __init__(self,fitness_f):
        super(AESolver, self).__init__()
        self.FITNESS_F = fitness_f
        if self.FITNESS_F == "hiff":
            self.HIFF = HIFF(NUMGENES=128,K=2,P=7)
            self.fitness = self.hiff_fitness
        elif self.FITNESS_F == "knapsack":
            self.fitness = self.knapsack_fitness
        elif self.FITNESS_F == "max_ones":
            self.fitness = self.max_ones_fitness
        elif self.FITNESS_F == "left_ones":
            self.fitness = self.left_ones_fitness

    def generate_random_string(self,l=20):
        return [random.choice([0,1]) for i in range(l)]

    def knapsack_fitness(self,string):
        knapsack = self.knapsack
        weights = []
        for i,c in enumerate(knapsack.capacities):
            weights.append(np.sum(np.array(knapsack.constraints[i])*string))
        over = 0
        for i,w in enumerate(weights):
            if w > knapsack.capacities[i]:
                over += (w - knapsack.capacities[i])
        if over > 0:
            return -over
        else:
            _fitness = np.sum(np.array(knapsack.values)*string)
            return _fitness

    def hiff_fitness(self,string):
        fitness = self.HIFF.H(string)
        return fitness

    def max_ones_fitness(self,string):
        fitness = np.sum(string^self.mask)
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def left_ones_fitness(self,_string):
        string =_string^self.mask
        fitness = sum(string[0:len(string)/2]) - sum(string[len(string)/2:])
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def tournament_selection_replacement(self,
                                         population,
                                         fitnesses=None,
                                         pop_size=None):
        if pop_size is None:
            pop_size = len(population)
        if fitnesses is None:
            fitnesses = self.fitness_many(population)
        new_population = []
        while len(new_population) < pop_size:
            child_1 = int(np.random.random() * pop_size)
            child_2 = int(np.random.random() * pop_size)
            if fitnesses[child_1] > fitnesses[child_2]:
                new_population.append(copy.deepcopy(population[child_1]))
            else:
                new_population.append(copy.deepcopy(population[child_2]))
        return new_population

    def get_good_strings(self,strings,lim=20,unique=False,fitnesses=None):
        if fitnesses is None:
            fitnesses = [self.fitness(s) for s in strings]
        sorted_fitnesses = sorted(range(len(fitnesses)),
                                  key=lambda k: fitnesses[k])
        sorted_fitnesses.reverse()
        if not unique:
            return ([strings[i] for i in sorted_fitnesses[0:lim]],
                    [fitnesses[k] for k in sorted_fitnesses[0:lim]])
        else:
            uniques = {}
            good_pop = []
            good_pop_fitnesses = []
            index = 0
            while len(good_pop) < lim and index < len(sorted_fitnesses):
                key = str(strings[sorted_fitnesses[index]])
                if key not in uniques:
                    uniques[key] = 0
                    good_pop.append(strings[sorted_fitnesses[index]])
                    good_pop_fitnesses.append(
                        fitnesses[sorted_fitnesses[index]]
                        )
                index += 1
            if len(good_pop) == lim:
                return [good_pop,good_pop_fitnesses]
            else:
                while len(good_pop) < lim:
                    good_pop.append(self.generate_random_string(
                                        l=len(strings[0]))
                                    )
                    good_pop_fitnesses.append(self.fitness(good_pop[-1]))
                return [good_pop,good_pop_fitnesses]

    def RTR(self,
            population,
            sampled_population,
            population_fitnesses,
            sample_fitnesses,
            w=None):
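        # Restricted tournament replacement: each sampled individual is compared
        # against a random window of w members of the current population and
        # replaces its Hamming-nearest neighbour there, but only when it is
        # fitter. Replacing similar individuals preserves population diversity.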
        if w is None:
            w = len(population)/20
        _population = np.array(population)
        for ind_i,individual in enumerate(sampled_population):
            indexes = np.random.choice(len(_population), w, replace=False)
            distances = cdist(_population[indexes],[individual],"hamming")
            replacement = indexes[np.argmin(distances.flatten())]
            if population_fitnesses[replacement] < sample_fitnesses[ind_i]:
                _population[replacement] = individual
                population_fitnesses[replacement] = sample_fitnesses[ind_i]
        return _population

    def fitness_many(self,strings):
        return [self.fitness(s) for s in strings]

    def train_dA(self,
                 data,
                 corruption_level=0.2,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,batch_size=20,number_batches=None)
        sgd_optimizer(self.dA.params,[self.dA.input],self.dA.cost,train_set,
                      lr=lr,num_epochs=num_epochs,save=False,
                      output_folder=output_folder,iteration=iteration)

    def train_NADE(self,
                 data,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,batch_size=20,number_batches=None)
        sgd_optimizer(self.NADE.params,[self.NADE.v],self.NADE.cost,train_set,
                      lr=lr,num_epochs=num_epochs,save=False,
                      output_folder=output_folder,iteration=iteration)

    def build_sample_dA(self):  
        self.sample_dA = theano.function([self.dA.input],self.dA.sample)

    def iterative_algorithm(
        self,
        name,
        pop_size=100,
        genome_length=20,
        lim_percentage=20,
        corruption_level=0.2,
        num_epochs=50,
        lr = 0.1,
        max_evaluations=200000,
        unique_training=False,
        hiddens=300,
        rtr = True,
        w=10
        ):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path),"w")
        self.mask = np.random.binomial(1,0.5,genome_length)
        trials = max_evaluations/pop_size
        population_limit = int(pop_size*(lim_percentage/100.0))
        # self.dA = dA(n_visible=genome_length,n_hidden=hiddens)
        # self.dA.build_dA(corruption_level)
        # self.build_sample_dA()
        self.NADE = NADE(n_visible=genome_length,n_hidden=hiddens)
        # self.NADE.build_NADE()
        new_population = np.random.binomial(1,0.5,(pop_size,genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write("{0},{1},{2},{3}\n".format(
            np.mean(self.population_fitnesses),
            np.min(self.population_fitnesses),
            np.max(self.population_fitnesses),
            np.std(self.population_fitnesses)))
        for iteration in range(0,trials):
            print "iteration:",iteration
            population = new_population
            self.population = new_population
            rw = self.tournament_selection_replacement(population)
            good_strings,good_strings_fitnesses=self.get_good_strings(
                                          population,
                                          population_limit,
                                          unique=unique_training,
                                          fitnesses=self.population_fitnesses
                                        )
            print "training A/E"
            training_data = np.array(good_strings)
            self.train_NADE(training_data,
                          num_epochs=num_epochs,
                          lr=lr)
            print "sampling..."
            sampled_population = np.array(self.NADE.sample_multiple(n=len(new_population)),"b")
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                              population,
                              sampled_population,
                              population_fitnesses=self.population_fitnesses,
                              sample_fitnesses=self.sample_fitnesses,
                              w=w
                              )
            else:
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                         np.min(self.population_fitnesses),
                                         np.max(self.population_fitnesses))
            print "best from previous:",(
              self.fitness(new_population[np.argmax(self.population_fitnesses)])
                )
            fitfile.write("{0},{1},{2},{3}\n".format(
                np.mean(self.population_fitnesses),
                np.min(self.population_fitnesses),
                np.max(self.population_fitnesses),
                np.std(self.population_fitnesses)))
            fitfile.flush()
        fitfile.close()
        return new_population
Example #7
class Sampler():
    def __init__(self, experiment_name):
        self._encoder = SMILESEncoder()

        # Read parameters used during training
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        # Read data
        if os.path.isfile('../data/' + self._file_name + '.csv'):
            self._data = pd.read_csv('../data/' + self._file_name + '.csv',
                                     header=None).values[:, 0]
        elif os.path.isfile('../data/' + self._file_name + '.tar.xz'):
            # Skip the first line (empty) and the last line (NaN)
            self._data = pd.read_csv('../data/' + self._file_name + '.tar.xz',
                                     compression='xz',
                                     header=None).values[1:-1, 0]

        # Clean data from start, end and padding token
        for i, mol_dat in enumerate(self._data):
            self._data[i] = clean_molecule(mol_dat, self._model_type)

    def sample(self,
               N=100,
               stor_dir='../evaluation',
               T=0.7,
               fold=[1],
               epoch=[9],
               valid=True,
               novel=True,
               unique=True,
               write_csv=True):
        '''Sample from a model until a fixed number of novel, valid and unique molecules is obtained.
        :param N:         number of samples
        :param stor_dir:  directory where the generated SMILES are saved
        :param T:         temperature
        :param fold:      folds to use for sampling
        :param epoch:     epochs to use for sampling
        :param valid:     if True, only accept valid SMILES
        :param novel:     if True, only accept novel SMILES
        :param unique:    if True, only accept unique SMILES
        :param write_csv: if True, the generated SMILES are written to stor_dir
        :return: res_molecules: list with all generated SMILES
        '''

        res_molecules = []
        print('Sampling: started')
        for f in fold:
            for e in epoch:
                self._model.build(stor_dir + '/' + self._experiment_name +
                                  '/models/model_fold_' + str(f) + '_epochs_' +
                                  str(e))

                new_molecules = []
                while len(new_molecules) < N:
                    new_mol = self._encoder.decode(
                        self._model.sample(self._starting_token, T))

                    # Remove leftover tokens from generation
                    new_mol = clean_molecule(new_mol[0], self._model_type)

                    # If not valid, get new molecule
                    if valid and not check_valid(new_mol):
                        continue

                    # If not unique, get new molecule
                    if unique and (new_mol in new_molecules):
                        continue

                    # If not novel, get new molecule
                    if novel and (new_mol in self._data):
                        continue

                    # If all conditions checked, add new molecule
                    new_molecules.append(new_mol)

                # Prepare name for file
                name = 'molecules_fold_' + str(f) + '_epochs_' + str(
                    e) + '_T_' + str(T) + '_N_' + str(N) + '.csv'
                if unique:
                    name = 'unique_' + name
                if valid:
                    name = 'valid_' + name
                if novel:
                    name = 'novel_' + name

                # Store final molecules
                if write_csv:
                    if not os.path.exists(stor_dir + '/' +
                                          self._experiment_name +
                                          '/molecules/'):
                        os.makedirs(stor_dir + '/' + self._experiment_name +
                                    '/molecules/')
                    mol = np.array(new_molecules).reshape(-1)
                    pd.DataFrame(mol).to_csv(stor_dir + '/' +
                                             self._experiment_name +
                                             '/molecules/' + name,
                                             header=None)

            res_molecules.append(new_molecules)

        print('Sampling: done')
        return res_molecules
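
A minimal usage sketch for the class above, assuming an experiment file ../experiments/BIMODAL.ini and trained checkpoints under ../evaluation/BIMODAL/models/ already exist:

sampler = Sampler('BIMODAL')
molecules = sampler.sample(N=100, stor_dir='../evaluation', T=0.7,
                           fold=[1], epoch=[9], valid=True, novel=True,
                           unique=True, write_csv=False)
print(len(molecules[0]))  # 100 novel, valid and unique SMILES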
Example #8
class AESolver(object):
    """
    The Denoising Autoencoder Genetic Algorithm
    """
    def __init__(self,fitness_f):
        super(AESolver, self).__init__()
        self.FITNESS_F = fitness_f
        if self.FITNESS_F == "hiff":
            self.HIFF = HIFF(NUMGENES=128,K=2,P=7)
            self.fitness = self.hiff_fitness
        elif self.FITNESS_F == "knapsack":
            self.knapsack = pickle.load(open("weing8.pkl"))
            self.fitness = self.knapsack_fitness
        elif self.FITNESS_F == "max_ones":
            self.fitness = self.max_ones_fitness
        elif self.FITNESS_F == "left_ones":
            self.fitness = self.left_ones_fitness
        elif self.FITNESS_F == "royal_road":
            self.fitness = self.royal_road
        elif self.FITNESS_F == "churchill":
            self.fitness = self.churchills_road
            self.optimum = 33

    def generate_random_string(self,l=20):
        return [random.choice([0,1]) for i in range(l)]

    def churchills_road(self,input,k=4,l=4):
        fitness = 0
        for partitions in range(0,l):
            first_part = sum(input[partitions*k*2:partitions*k*2+k])
            second_part = sum(input[(partitions*k*2)+k:(partitions*k*2)+k*2])
            if first_part == k and second_part == 0:
                fitness += 8
            if first_part == 0 and second_part == k:
                fitness += 8
        if sum(input[0:k]) == k and sum(input[len(input)-k:]) == k:
            fitness += 1
        if sum(input[0:k]) == 0 and sum(input[len(input)-k:]) == 0:
            fitness += 1
        return fitness
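
    # Worked example (illustrative): with k=4, l=4 and the 32-bit string
    # [1,1,1,1, 0,0,0,0] * 4, every partition has first_part == k and
    # second_part == 0, scoring 8 * 4 = 32; neither edge bonus fires, since
    # the first block is all ones while the last block is all zeros.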

    def knapsack_fitness(self,string):
        knapsack = self.knapsack
        weights = []
        for i,c in enumerate(knapsack.capacities):
            weights.append(np.sum(np.array(knapsack.constraints[i])*string))
        over = 0
        for i,w in enumerate(weights):
            if w > knapsack.capacities[i]:
                over += (w - knapsack.capacities[i])
        if over > 0:
            return -over
        else:
            _fitness = np.sum(np.array(knapsack.values)*string)
            return _fitness

    def hiff_fitness(self,string):
        fitness = self.HIFF.H(string)
        return fitness

    def max_ones_fitness(self,string):
        fitness = np.sum(string^self.mask)
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def left_ones_fitness(self,_string):
        string =_string^self.mask
        fitness = sum(string[0:len(string)/2]) - sum(string[len(string)/2:])
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def royal_road(self,string, order=8):
        """Royal Road Function R1 as presented by Melanie Mitchell in : 
        "An introduction to Genetic Algorithms".
        """
        individual = string^self.mask
        nelem = len(individual) / order
        max_value = int(2**order - 1)
        total = 0
        for i in xrange(nelem):
            value = int("".join(map(str, individual[i*order:i*order+order])), 2)
            total += int(order) * int(value/max_value)
        return total
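
    # Worked example (illustrative): with order=8 and a 16-bit individual that
    # is all ones after the XOR mask, both 8-bit blocks decode to 255, which
    # equals max_value, so each contributes order * 1 = 8 and R1 returns 16.
    # Any block short of all ones has value < max_value and contributes 0.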

    # def tournament_selection_replacement(self,
    #                                      population,
    #                                      fitnesses=None,
    #                                      pop_size=None):
    #     if pop_size == None:
    #         pop_size = len(population)
    #     if fitnesses == None:
    #         fitnesses = self.fitness_many(population)
    #     new_population = []
    #     while len(new_population) < pop_size:
    #         child_1 = int(np.random.random() * pop_size)
    #         child_2 = int(np.random.random() * pop_size)
    #         if fitnesses[child_1] > fitnesses[child_2]:
    #             new_population.append(copy.deepcopy(population[child_1]))
    #         else:
    #             new_population.append(copy.deepcopy(population[child_2]))
    #     return new_population

    def tournament_selection_replacement(self,
                                         population,
                                         fitnesses=None,
                                         pop_size=None,
                                         tournament_size=2):
        if pop_size is None:
            pop_size = len(population)
        if fitnesses is None:
            fitnesses = self.fitness_many(population)
        new_population = []
        while len(new_population) < pop_size:
            contenders=np.random.randint(0,len(population),tournament_size)
            # print "contenders:",contenders
            t_fitnesses = [fitnesses[c] for c in contenders]
            # print "fitnesses:",t_fitnesses
            # print "best_fitness:",np.argmax(t_fitnesses)
            # print "winner:",contenders[np.argmax(t_fitnesses)]
            winner = copy.deepcopy(population[contenders[np.argmax(t_fitnesses)]])
            new_population.append(winner)
        return new_population

    def get_good_strings(self,strings,lim=20,unique=False,fitnesses=None):
        if fitnesses is None:
            fitnesses = [self.fitness(s) for s in strings]
        sorted_fitnesses = sorted(range(len(fitnesses)),
                                  key=lambda k: fitnesses[k])
        sorted_fitnesses.reverse()
        if not unique:
            return ([strings[i] for i in sorted_fitnesses[0:lim]],
                    [fitnesses[k] for k in sorted_fitnesses[0:lim]])
        else:
            uniques = {}
            good_pop = []
            good_pop_fitnesses = []
            index = 0
            while len(good_pop) < lim and index < len(sorted_fitnesses):
                key = str(strings[sorted_fitnesses[index]])
                if key not in uniques:
                    uniques[key] = 0
                    good_pop.append(strings[sorted_fitnesses[index]])
                    good_pop_fitnesses.append(
                        fitnesses[sorted_fitnesses[index]]
                        )
                index += 1
            if len(good_pop) == lim:
                return [good_pop,good_pop_fitnesses]
            else:
                while len(good_pop) < lim:
                    good_pop.append(self.generate_random_string(
                                        l=len(strings[0]))
                                    )
                    good_pop_fitnesses.append(self.fitness(good_pop[-1]))
                return [good_pop,good_pop_fitnesses]

    def RTR(self,
            population,
            sampled_population,
            population_fitnesses,
            sample_fitnesses,
            w=None):
        if w is None:
            w = len(population)/20
        _population = np.array(population)
        for ind_i,individual in enumerate(sampled_population):
            indexes = np.random.choice(len(_population), w, replace=False)
            distances = cdist(_population[indexes],[individual],"hamming")
            replacement = indexes[np.argmin(distances.flatten())]
            if population_fitnesses[replacement] < sample_fitnesses[ind_i]:
                _population[replacement] = individual
                population_fitnesses[replacement] = sample_fitnesses[ind_i]
        return _population

    def fitness_many(self,strings):
        return [self.fitness(s) for s in strings]

    def train_dA(self,
                 data,
                 corruption_level=0.2,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,batch_size=20,number_batches=None)
        sgd_optimizer(self.dA.params,[self.dA.input],self.dA.cost,train_set,
                      lr=lr,num_epochs=num_epochs,save=False,
                      output_folder=output_folder,iteration=iteration)

    def train_NADE(self,
                 data,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,batch_size=20,number_batches=None)
        sgd_optimizer(self.NADE.params,[self.NADE.v],self.NADE.cost,train_set,
                      lr=lr,num_epochs=num_epochs,save=True,
                      output_folder=output_folder,iteration=iteration)

    def build_sample_dA(self):  
        self.sample_dA = theano.function([self.dA.input],self.dA.sample)

    def iterative_algorithm(
        self,
        name,
        pop_size=100,
        genome_length=20,
        lim_percentage=20,
        corruption_level=0.2,
        num_epochs=50,
        lr = 0.1,
        max_evaluations=200000,
        unique_training=False,
        hiddens=300,
        rtr = True,
        w=10
        ):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path),"w")
        self.mask = np.random.binomial(1,0.5,genome_length)
        trials = max_evaluations/pop_size
        population_limit = int(pop_size*(lim_percentage/100.0))
        # self.dA = dA(n_visible=genome_length,n_hidden=hiddens)
        # self.dA.build_dA(corruption_level)
        # self.build_sample_dA()
        self.NADE = NADE(n_visible=genome_length,n_hidden=hiddens)
        # self.NADE.build_NADE()
        new_population = np.random.binomial(1,0.5,(pop_size,genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write("{0},{1},{2},{3}\n".format(
            np.mean(self.population_fitnesses),
            np.min(self.population_fitnesses),
            np.max(self.population_fitnesses),
            np.std(self.population_fitnesses)))
        for iteration in range(0,trials):
            print "iteration:",iteration
            population = new_population
            self.population = new_population
            rw = self.tournament_selection_replacement(population,
                                                       fitnesses=self.population_fitnesses,
                                                       pop_size=population_limit,
                                                       tournament_size=4)
            if not rtr:
                good_strings,good_strings_fitnesses=self.get_good_strings(
                                              population,
                                              population_limit,
                                              unique=unique_training,
                                              fitnesses=self.population_fitnesses
                                            )
                training_data = np.array(good_strings)
            else:
                training_data = np.array(rw)
            print "training A/E"
            self.train_NADE(training_data,
                          num_epochs=num_epochs,
                          lr=lr)
            print "sampling..."
            # sampled_population = [np.array(self.NADE.sample(),"b") for i in range(len(self.population))]
            sampled_population = np.array(self.NADE.sample_multiple(n=len(new_population)),"b")
            # pdb.set_trace()
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                              population,
                              sampled_population,
                              population_fitnesses=self.population_fitnesses,
                              sample_fitnesses=self.sample_fitnesses,
                              w=w
                              )
            else:
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                         np.min(self.population_fitnesses),
                                         np.max(self.population_fitnesses))
            print "best from previous:",(
              self.fitness(new_population[np.argmax(self.population_fitnesses)])
                )
            fitfile.write("{0},{1},{2},{3}\n".format(
                np.mean(self.population_fitnesses),
                np.min(self.population_fitnesses),
                np.max(self.population_fitnesses),
                np.std(self.population_fitnesses)))
            fitfile.flush()
            # self.optimum is only defined for the "churchill" fitness function
            if hasattr(self, "optimum") and \
                    np.max(self.population_fitnesses) == self.optimum:
                pickle.dump(
                    {"pop": self.population,
                     "fitnesses": self.population_fitnesses,
                     "iteration": iteration},
                    open("final_shit.pkl", "w"))
                break
        fitfile.close()
        return new_population
Example #9
class FineTuner():

    def __init__(self, experiment_name='ForwardRNN'):

        self._encoder = SMILESEncoder()

        # Read all parameters from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        # self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode([self._config['EVALUATION']['starting_token']])

        # Read starting model
        self._start_model = self._config['FINETUNING']['start_model']

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode([self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size, self._learning_rate,
                               self._hidden_units, self._generation, self._missing_token)

        self._data = self._encoder.encode_from_file(self._file_name)

    def fine_tuning(self, stor_dir='../evaluation/', restart=False):
        '''Perform fine-tuning and store statistics.
        NOTE: The directory should be prepared with the correct name and model.
        NOTE: No molecules are generated and no validation is performed here; to sample molecules, use the Sampler.
        :param stor_dir:    directory to store data
        :param restart:     if True, resume from the last successfully completed epoch
        :return:
        '''

        # Create directories
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/models'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/models')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/molecules'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules')

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE random
        if self._model_type == 'NADE' and self._generation == 'random':
            # The first column stores the correct SMILES; the remaining columns store SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Build model
        self._model.build(stor_dir + '/' + self._experiment_name + '/' + self._start_model)

        # Store total Statistics
        tot_stat = []

        # only single fold
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing files
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name + '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Check whether the current epoch was already completed successfully; otherwise continue with normal training
                if check_model(self._model_type, self._experiment_name, stor_dir, fold, i) and check_molecules(
                        self._experiment_name, stor_dir, fold, i) and tmp_stat_file.shape[0] > i:
                    # Load model
                    self._model.build(
                        stor_dir + '/' + self._experiment_name + '/models/model_fold_' + str(fold) + '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())

                    # Skip this epoch
                    continue

                else:
                    restart = False

            # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
            # to  (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(self._data.reshape(-1, self._molecular_size, self._encoding_size),
                                          label.reshape(-1, self._molecular_size), epochs=1,
                                          batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(
                stor_dir + '/' + self._experiment_name + '/models/model_fold_' + str(fold) + '_epochs_' + str(i))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name + '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(
                    i) + '.csv', header=None)

            # Store statistic
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name + '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)
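
A minimal usage sketch for the class above, assuming ../experiments/ForwardRNN.ini exists and its [FINETUNING] start_model entry points at a stored checkpoint:

tuner = FineTuner('ForwardRNN')
tuner.fine_tuning(stor_dir='../evaluation/', restart=False)
# Models, statistics and sampled molecules are written under ../evaluation/ForwardRNN/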
Example #10
    init_b[:, 1:] = log_rating_freq_diff
    init_b[:, 0] = log_rating_freq[:, 0]

    new_items = np.where(rating_freq.sum(axis=1) == 0)[0]

    input_layer = Input(shape=(input_dim0, input_dim1), name='input_ratings')
    output_ratings = Input(shape=(input_dim0, input_dim1),
                           name='output_ratings')
    input_masks = Input(shape=(input_dim0, ), name='input_masks')
    output_masks = Input(shape=(input_dim0, ), name='output_masks')

    nade_layer = Dropout(0.0)(input_layer)
    nade_layer = NADE(hidden_dim=hidden_dim,
                      activation='tanh',
                      bias=True,
                      W_regularizer=keras.regularizers.l2(0.02),
                      V_regularizer=keras.regularizers.l2(0.02),
                      b_regularizer=keras.regularizers.l2(0.02),
                      c_regularizer=keras.regularizers.l2(0.02))(nade_layer)

    predicted_ratings = Lambda(prediction_layer,
                               output_shape=prediction_output_shape,
                               name='predicted_ratings')(nade_layer)

    d = Lambda(d_layer, output_shape=d_output_shape, name='d')(input_masks)

    sum_masks = add([input_masks, output_masks])
    D = Lambda(D_layer, output_shape=D_output_shape, name='D')(sum_masks)

    loss_out = Lambda(rating_cost_lambda_func,
                      output_shape=(1, ),
Example #11
class AESolver(object):
    """
    The Denoising Autoencoder Genetic Algorithm
    """
    def __init__(self, fitness_f):
        super(AESolver, self).__init__()
        self.FITNESS_F = fitness_f
        if self.FITNESS_F == "hiff":
            self.HIFF = HIFF(NUMGENES=128, K=2, P=7)
            self.fitness = self.hiff_fitness
        elif self.FITNESS_F == "knapsack":
            self.knapsack = pickle.load(open("weing8.pkl"))
            self.fitness = self.knapsack_fitness
        elif self.FITNESS_F == "max_ones":
            self.fitness = self.max_ones_fitness
        elif self.FITNESS_F == "left_ones":
            self.fitness = self.left_ones_fitness
        elif self.FITNESS_F == "royal_road":
            self.fitness = self.royal_road
        elif self.FITNESS_F == "churchill":
            self.fitness = self.churchills_road
            self.optimum = 33

    def generate_random_string(self, l=20):
        return [random.choice([0, 1]) for i in range(l)]

    def churchills_road(self, input, k=4, l=4):
        fitness = 0
        for partitions in range(0, l):
            first_part = sum(input[partitions * k * 2:partitions * k * 2 + k])
            second_part = sum(input[(partitions * k * 2) +
                                    k:(partitions * k * 2) + k * 2])
            if first_part == k and second_part == 0:
                fitness += 8
            if first_part == 0 and second_part == k:
                fitness += 8
        if sum(input[0:k]) == k and sum(input[len(input) - k:]) == k:
            fitness += 1
        if sum(input[0:k]) == 0 and sum(input[len(input) - k:]) == 0:
            fitness += 1
        return fitness

    def knapsack_fitness(self, string):
        knapsack = self.knapsack
        weights = []
        for i, c in enumerate(knapsack.capacities):
            weights.append(np.sum(np.array(knapsack.constraints[i]) * string))
        over = 0
        for i, w in enumerate(weights):
            if w > knapsack.capacities[i]:
                over += (w - knapsack.capacities[i])
        if over > 0:
            return -over
        else:
            _fitness = np.sum(np.array(knapsack.values) * string)
            return _fitness

    def hiff_fitness(self, string):
        fitness = self.HIFF.H(string)
        return fitness

    def max_ones_fitness(self, string):
        fitness = np.sum(string ^ self.mask)
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def left_ones_fitness(self, _string):
        string = _string ^ self.mask
        fitness = sum(string[0:len(string) / 2]) - sum(
            string[len(string) / 2:])
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def royal_road(self, string, order=8):
        """Royal Road Function R1 as presented by Melanie Mitchell in : 
        "An introduction to Genetic Algorithms".
        """
        individual = string ^ self.mask
        nelem = len(individual) / order
        max_value = int(2**order - 1)
        total = 0
        for i in xrange(nelem):
            value = int(
                "".join(map(str, individual[i * order:i * order + order])), 2)
            total += int(order) * int(value / max_value)
        return total

    # def tournament_selection_replacement(self,
    #                                      population,
    #                                      fitnesses=None,
    #                                      pop_size=None):
    #     if pop_size == None:
    #         pop_size = len(population)
    #     if fitnesses == None:
    #         fitnesses = self.fitness_many(population)
    #     new_population = []
    #     while len(new_population) < pop_size:
    #         child_1 = int(np.random.random() * pop_size)
    #         child_2 = int(np.random.random() * pop_size)
    #         if fitnesses[child_1] > fitnesses[child_2]:
    #             new_population.append(copy.deepcopy(population[child_1]))
    #         else:
    #             new_population.append(copy.deepcopy(population[child_2]))
    #     return new_population

    def tournament_selection_replacement(self,
                                         population,
                                         fitnesses=None,
                                         pop_size=None,
                                         tournament_size=2):
        if pop_size is None:
            pop_size = len(population)
        if fitnesses is None:
            fitnesses = self.fitness_many(population)
        new_population = []
        while len(new_population) < pop_size:
            contenders = np.random.randint(0, len(population), tournament_size)
            # print "contenders:",contenders
            t_fitnesses = [fitnesses[c] for c in contenders]
            # print "fitnesses:",t_fitnesses
            # print "best_fitness:",np.argmax(t_fitnesses)
            # print "winner:",contenders[np.argmax(t_fitnesses)]
            winner = copy.deepcopy(
                population[contenders[np.argmax(t_fitnesses)]])
            new_population.append(winner)
        return new_population
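
    # Usage sketch (hypothetical numbers): draw `tournament_size` random
    # contenders with replacement, keep a deep copy of the fittest, and
    # repeat until `pop_size` winners are collected.
    #
    #   parents = ga.tournament_selection_replacement(
    #       population, fitnesses=fits, pop_size=20, tournament_size=4)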

    def get_good_strings(self, strings, lim=20, unique=False, fitnesses=None):
        if fitnesses is None:
            fitnesses = [self.fitness(s) for s in strings]
        sorted_fitnesses = sorted(range(len(fitnesses)),
                                  key=lambda k: fitnesses[k])
        sorted_fitnesses.reverse()
        if not unique:
            return ([strings[i] for i in sorted_fitnesses[0:lim]],
                    [fitnesses[k] for k in sorted_fitnesses[0:lim]])
        else:
            uniques = {}
            good_pop = []
            good_pop_fitnesses = []
            index = 0
            while len(good_pop) < lim and index < len(sorted_fitnesses):
                key = str(strings[sorted_fitnesses[index]])
                if key not in uniques:
                    uniques[key] = 0
                    good_pop.append(strings[sorted_fitnesses[index]])
                    good_pop_fitnesses.append(
                        fitnesses[sorted_fitnesses[index]])
                index += 1
            if len(good_pop) == lim:
                return [good_pop, good_pop_fitnesses]
            else:
                while len(good_pop) < lim:
                    good_pop.append(
                        self.generate_random_string(l=len(strings[0])))
                    good_pop_fitnesses.append(self.fitness(good_pop[-1]))
                return [good_pop, good_pop_fitnesses]
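
    # get_good_strings returns the `lim` fittest strings (optionally unique),
    # padding with random strings when fewer than `lim` unique ones exist, e.g.:
    #
    #   elites, elite_fits = ga.get_good_strings(population, lim=20, unique=True)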

    def RTR(self,
            population,
            sampled_population,
            population_fitnesses,
            sample_fitnesses,
            w=None):
        if w is None:
            w = len(population) // 20
        _population = np.array(population)
        for ind_i, individual in enumerate(sampled_population):
            indexes = np.random.choice(len(_population), w, replace=False)
            distances = cdist(_population[indexes], [individual], "hamming")
            replacement = indexes[np.argmin(distances.flatten())]
            if population_fitnesses[replacement] < sample_fitnesses[ind_i]:
                _population[replacement] = individual
                population_fitnesses[replacement] = sample_fitnesses[ind_i]
        return _population
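
    # Restricted Tournament Replacement (RTR) in brief: for every sampled
    # individual, draw a window of `w` random incumbents, find the closest one
    # by Hamming distance, and replace it only if the newcomer is fitter.
    # Competing against similar genotypes preserves diversity (niching), e.g.:
    #
    #   population = ga.RTR(population, samples,
    #                       population_fitnesses=pop_fits,
    #                       sample_fitnesses=samp_fits, w=10)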

    def fitness_many(self, strings):
        return [self.fitness(s) for s in strings]

    def train_dA(self,
                 data,
                 corruption_level=0.2,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,
                                    batch_size=20,
                                    number_batches=None)
        sgd_optimizer(self.dA.params, [self.dA.input],
                      self.dA.cost,
                      train_set,
                      lr=lr,
                      num_epochs=num_epochs,
                      save=False,
                      output_folder=output_folder,
                      iteration=iteration)

    def train_NADE(self,
                   data,
                   num_epochs=200,
                   lr=0.1,
                   output_folder="",
                   iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,
                                    batch_size=20,
                                    number_batches=None)
        sgd_optimizer(self.NADE.params, [self.NADE.v],
                      self.NADE.cost,
                      train_set,
                      lr=lr,
                      num_epochs=num_epochs,
                      save=True,
                      output_folder=output_folder,
                      iteration=iteration)

    def build_sample_dA(self):
        self.sample_dA = theano.function([self.dA.input], self.dA.sample)

    def iterative_algorithm(self,
                            name,
                            pop_size=100,
                            genome_length=20,
                            lim_percentage=20,
                            corruption_level=0.2,
                            num_epochs=50,
                            lr=0.1,
                            max_evaluations=200000,
                            unique_training=False,
                            hiddens=300,
                            rtr=True,
                            w=10):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path), "w")
        self.mask = np.random.binomial(1, 0.5, genome_length)
        trials = max_evaluations // pop_size
        population_limit = int(pop_size * (lim_percentage / 100.0))
        self.NADE = NADE(n_visible=genome_length, n_hidden=hiddens)
        new_population = np.random.binomial(1, 0.5, (pop_size, genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write(
            "{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),
                                       np.min(self.population_fitnesses),
                                       np.max(self.population_fitnesses),
                                       np.std(self.population_fitnesses)))
        for iteration in range(0, trials):
            print "iteration:", iteration
            population = new_population
            self.population = new_population
            rw = self.tournament_selection_replacement(
                population,
                fitnesses=self.population_fitnesses,
                pop_size=population_limit,
                tournament_size=4)
            if not rtr:
                good_strings, good_strings_fitnesses = self.get_good_strings(
                    population,
                    population_limit,
                    unique=unique_training,
                    fitnesses=self.population_fitnesses)
                training_data = np.array(good_strings)
            else:
                training_data = np.array(rw)
            print "training A/E"
            self.train_NADE(training_data, num_epochs=num_epochs, lr=lr)
            print "sampling..."
            # sampled_population = [np.array(self.NADE.sample(),"b") for i in range(len(self.population))]
            sampled_population = np.array(
                self.NADE.sample_multiple(n=len(new_population)), "b")
            # pdb.set_trace()
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                    population,
                    sampled_population,
                    population_fitnesses=self.population_fitnesses,
                    sample_fitnesses=self.sample_fitnesses,
                    w=w)
            else:
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                         np.min(self.population_fitnesses),
                                         np.max(self.population_fitnesses))
            print "best from previous:", (self.fitness(
                new_population[np.argmax(self.population_fitnesses)]))
            fitfile.write("{0},{1},{2},{3}\n".format(
                np.mean(self.population_fitnesses),
                np.min(self.population_fitnesses),
                np.max(self.population_fitnesses),
                np.std(self.population_fitnesses)))
            fitfile.flush()
            if np.max(self.population_fitnesses) == self.optimum:
                pickle.dump(
                    {
                        "pop": self.population,
                        "fitnesses": self.population_fitnesses,
                        "iteration": iteration
                    }, open("final_shit.pkl", "w"))
                break
        fitfile.close()
        return new_population
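
# Usage sketch (hypothetical settings; `GA` stands for whatever class these
# methods belong to, with `fitness` and `optimum` configured for the chosen
# benchmark). Results land under results/autoencoder/<name>/fitnesses.dat:
#
#   ga = GA()
#   final_pop = ga.iterative_algorithm('maxones_run', pop_size=100,
#                                      genome_length=64, num_epochs=50,
#                                      max_evaluations=200000, rtr=True)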
Example No. 12
    # Assumed setup for this excerpt: the original script defines these names
    # earlier; the imports are standard, but the constant values below are
    # illustrative guesses only.
    import torch
    from torch import optim

    num_classes = 4
    num_samples_per_class = 100
    inp_dimensions = 10
    num_training_iterations = 500

    # Define datapoints for each class: one Gaussian cluster per class,
    # centred on a distinct binary mean pattern.
    inps = torch.stack([
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([1, 0, 1, 1, 0, 0, 1, 0, 0, 1]),
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([0, 0, 1, 0, 0, 1, 0, 1, 0, 1]),
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([1, 1, 0, 1, 1, 0, 0, 1, 0, 0]),
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 1])
    ],
                       dim=0)

    # Define one model per class.
    models = [
        NADE(inp_dimensions, inp_dimensions // 2) for _ in range(num_classes)
    ]

    # Train each model one by one.
    for inp, model in zip(inps, models):

        # Optimization scheme.
        optimizer = optim.SGD(model.parameters(), lr=0.01)

        for _ in range(num_training_iterations):

            # Zero out previous gradients.
            model.zero_grad()

            # Compute log-likelihoods per sample.
            log_likelihoods = model(inp)

            # Assumed completion (the original excerpt breaks off here):
            # maximise the mean log-likelihood by descending its negative.
            loss = -log_likelihoods.mean()
            loss.backward()
            optimizer.step()
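
    # With one density model per class, classification could follow the usual
    # generative-classifier recipe (a sketch; `test_inps` is hypothetical):
    # score each input under every NADE and pick the most likely class.
    #
    #   with torch.no_grad():
    #       preds = torch.stack([m(test_inps) for m in models]).argmax(dim=0)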
class Trainer:
    def __init__(self, experiment_name='ForwardRNN'):

        self._encoder = SMILESEncoder()

        # Read all parameter from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        self._data = self._encoder.encode_from_file(self._file_name)
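
    # Usage sketch (assuming an experiment file '../experiments/ForwardRNN.ini'
    # containing the MODEL, DATA, TRAINING and EVALUATION sections read above):
    #
    #   t = Trainer('ForwardRNN')
    #   t.complete_run(stor_dir='../evaluation/')      # train on all data
    #   t.single_run(stor_dir='../evaluation/')        # 80/20 train/validation
    #   t.cross_validation(stor_dir='../evaluation/')  # k-fold cross-validation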

    def complete_run(self, stor_dir='../evaluation/', restart=False):
        '''Training on the complete data set without validation'''

        # Create directories
        for sub in ('models', 'molecules', 'statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/' + sub,
                        exist_ok=True)

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if self._model_type == 'NADE' and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]
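
            # Shape walkthrough (illustrative): with data of shape
            # (N, A+1, molecular_size, encoding_size), column 0 holds the
            # complete SMILES used as labels, and the remaining A columns hold
            # copies with missing values used as inputs; the labels are
            # therefore repeated A times to stay aligned with the inputs.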

        # Build model
        self._model.build()

        # Store total Statistics
        tot_stat = []

        # only single fold
        fold = 1

        # Shuffle data before training (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
        # to  (all_SMILES, molecular_size, encoding_size))
        self._data, label = shuffle(
            self._data.reshape(-1, self._molecular_size, self._encoding_size),
            label.reshape(-1, self._molecular_size))

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            # With restart read existing files
            if restart:
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Check whether the current epoch completed successfully; otherwise continue with normal training
                if check_model(self._model_type, self._experiment_name,
                               stor_dir, fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_stat_file.shape[0] > i:

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1,
                                                                 -1).tolist())
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model
            statistic = self._model.train(self._data,
                                          label,
                                          epochs=1,
                                          batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) + '_epochs_' +
                             str(i))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) +
                '.csv',
                header=None)

            # Store statistic
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)

    def single_run(self, stor_dir='../evaluation/', restart=False):
        '''Training with a validation split, storing models and results'''

        # Create directories
        for sub in ('models', 'molecules', 'statistic', 'validation'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/' + sub,
                        exist_ok=True)

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if (self._model_type in ('NADE', 'NADE_v2')
                and self._generation == 'random'):
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test data
        train_data, test_data, train_label, test_label = train_test_split(
            self._data, label, test_size=1. / 5, random_state=1, shuffle=True)
        # Build model
        self._model.build()

        # Store total Statistics
        tot_stat = []

        # Store validation loss
        tot_loss = []

        # only single fold
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing files
                tmp_val_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Check whether the current epoch completed successfully; otherwise continue with normal training
                if check_model(self._model_type, self._experiment_name,
                               stor_dir, fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_val_file.shape[
                                       0] > i and tmp_stat_file.shape[0] > i:

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1,
                                                                 -1).tolist())
                    tot_loss.append(tmp_val_file[i, 1])

                    # Skip this epoch
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
            # to  (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(
                train_data.reshape(-1, self._molecular_size,
                                   self._encoding_size),
                train_label.reshape(-1, self._molecular_size),
                epochs=1,
                batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) + '_epochs_' +
                             str(i))

            # Test model on validation set
            tot_loss.append(
                self._model.validate(
                    test_data.reshape(-1, self._molecular_size,
                                      self._encoding_size),
                    test_label.reshape(-1, self._molecular_size)))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) +
                '.csv',
                header=None)

            # Store statistic
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)

            # Store validation data
            pd.DataFrame(np.array(tot_loss).reshape(
                -1, 1)).to_csv(stor_dir + '/' + self._experiment_name +
                               '/validation/val_fold_' + str(fold) + '.csv',
                               header=None)

    def cross_validation(self, stor_dir='../evaluation/', restart=False):
        '''Perform cross-validation and store the results'''

        # Create directories
        for sub in ('models', 'molecules', 'statistic', 'validation'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/' + sub,
                        exist_ok=True)

        self._kf = KFold(n_splits=self._n_folds, shuffle=True, random_state=2)

        # Count iterations
        fold = 0

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if self._model_type == 'NADE' and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test data
        for train, test in self._kf.split(self._data):

            # Shuffle index within test and train set
            np.random.shuffle(train)
            np.random.shuffle(test)

            fold += 1

            self._model.build()

            # Store total statistics
            tot_stat = []

            # Store validation loss
            tot_loss = []

            for i in range(self._epochs):
                print('Fold:', fold)
                print('Epoch:', i)

                if restart:
                    tmp_val_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/validation/val_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()

                    tmp_stat_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/statistic/stat_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()

                    # Check whether the current epoch completed successfully; otherwise continue with normal training
                    if check_model(
                            self._model_type, self._experiment_name, stor_dir,
                            fold, i) and check_molecules(
                                self._experiment_name, stor_dir, fold,
                                i) and tmp_val_file.shape[
                                    0] > i and tmp_stat_file.shape[0] > i:

                        # Load model
                        self._model.build(stor_dir + '/' +
                                          self._experiment_name +
                                          '/models/model_fold_' + str(fold) +
                                          '_epochs_' + str(i))

                        # Fill statistic and loss list
                        tot_stat.append(tmp_stat_file[i,
                                                      1:].reshape(1,
                                                                  -1).tolist())
                        tot_loss.append(tmp_val_file[i, 1])

                        # Skip this epoch
                        continue

                    else:
                        restart = False

                # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
                # to  (all_SMILES, molecular_size, encoding_size))
                statistic = self._model.train(
                    self._data[train].reshape(-1, self._molecular_size,
                                              self._encoding_size),
                    label[train].reshape(-1, self._molecular_size),
                    epochs=1,
                    batch_size=self._batch_size)

                tot_stat.append(statistic.tolist())

                # Store model
                self._model.save(stor_dir + '/' + self._experiment_name +
                                 '/models/model_fold_' + str(fold) +
                                 '_epochs_' + str(i))

                # Test model on validation set
                tot_loss.append(
                    self._model.validate(
                        self._data[test].reshape(-1, self._molecular_size,
                                                 self._encoding_size),
                        label[test].reshape(-1, self._molecular_size)))

                # Sample new molecules
                new_molecules = []
                for s in range(self._samples):
                    mol = self._encoder.decode(
                        self._model.sample(self._starting_token, self._T))
                    new_molecules.append(
                        clean_molecule(mol[0], self._model_type))

                # Store new molecules
                new_molecules = np.array(new_molecules)
                pd.DataFrame(new_molecules).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/molecules/molecule_fold_' + str(fold) + '_epochs_' +
                    str(i) + '.csv',
                    header=None)

                # Store statistic
                store_stat = np.array(tot_stat).reshape(i + 1, -1)
                pd.DataFrame(np.array(store_stat)).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None)

                # Store validation data
                pd.DataFrame(np.array(tot_loss).reshape(-1, 1)).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv',
                    header=None)