Example #1
    def __init__(self, experiment_name):
        self._encoder = SMILESEncoder()

        # Read parameters used during training
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        # Read data
        if os.path.isfile('../data/' + self._file_name + '.csv'):
            self._data = pd.read_csv('../data/' + self._file_name + '.csv',
                                     header=None).values[:, 0]
        elif os.path.isfile('../data/' + self._file_name + '.tar.xz'):
            # Skip the first line (empty) and the last line (NaN)
            self._data = pd.read_csv('../data/' + self._file_name + '.tar.xz',
                                     compression='xz',
                                     header=None).values[1:-1, 0]

        # Clean data from start, end and padding token
        for i, mol_dat in enumerate(self._data):
            self._data[i] = clean_molecule(mol_dat, self._model_type)
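
A minimal usage sketch for the constructor above (assuming it belongs to the Sampler class shown in Example #7, and that an experiment named 'ForwardRNN' has a config at '../experiments/ForwardRNN.ini'; both names are assumptions):

# Hypothetical usage; 'ForwardRNN' is an assumed experiment name.
sampler = Sampler('ForwardRNN')
print(len(sampler._data))  # number of cleaned training SMILES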
Example #2
    def __init__(self, experiment_name='ForwardRNN'):

        self._encoder = SMILESEncoder()

        # Read all parameters from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        # self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode([self._config['EVALUATION']['starting_token']])

        # Read starting model
        self._start_model = self._config['FINETUNING']['start_model']

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode([self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size, self._learning_rate,
                               self._hidden_units, self._generation, self._missing_token)

        self._data = self._encoder.encode_from_file(self._file_name)
Example #3
def _train(args):
    """
    Train the NADE model
    :param args: parsed arguments
    """
    if K.backend() != 'tensorflow':
        print("This repository only support tensorflow backend.")
        raise NotImplementedError()

    batch_size_ = 512
    nb_users = 6040
    nb_movies = 3706
    data_sample = 1.0
    input_dim0 = 6040
    input_dim1 = 5
    std = 0.0
    alpha = 1.0
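    # Note: 6040 users and 3706 movies match the MovieLens-1M dataset;
    # treating these constants as ML-1M dimensions is an assumption.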

    print('Loading data...')
    train_file_list = sorted(glob.glob(os.path.join('data/train_set',
                                                    'part*')))
    val_file_list = sorted(glob.glob(os.path.join('data/val_set/', 'part*')))
    test_file_list = sorted(glob.glob(os.path.join('data/test_set/', 'part*')))

    train_file_list = [
        dfile for dfile in train_file_list if os.stat(dfile).st_size != 0
    ]
    val_file_list = [
        dfile for dfile in val_file_list if os.stat(dfile).st_size != 0
    ]
    test_file_list = [
        dfile for dfile in test_file_list if os.stat(dfile).st_size != 0
    ]

    print("Shuffle the data...")
    random.shuffle(train_file_list)
    random.shuffle(val_file_list)
    random.shuffle(test_file_list)
    train_file_list = train_file_list[:max(
        int(len(train_file_list) * data_sample), 1)]

    print('Instantiate DataSet classes...')
    train_set = DataSet(train_file_list,
                        num_users=nb_users,
                        num_items=nb_movies,
                        batch_size=batch_size_,
                        mode=0)
    val_set = DataSet(val_file_list,
                      num_users=nb_users,
                      num_items=nb_movies,
                      batch_size=batch_size_,
                      mode=1)
    test_set = DataSet(test_file_list,
                       num_users=nb_users,
                       num_items=nb_movies,
                       batch_size=batch_size_,
                       mode=2)

    rating_freq = np.zeros((nb_users, input_dim1))
    init_b = np.zeros((nb_users, input_dim1))
    for batch in val_set.generate(max_iters=1):
        inp_r = batch[0]['input_ratings']
        out_r = batch[0]['output_ratings']
        inp_m = batch[0]['input_masks']
        out_m = batch[0]['output_masks']
        rating_freq += inp_r.sum(axis=0)

    log_rating_freq = np.log(rating_freq + 1e-8)
    log_rating_freq_diff = np.diff(log_rating_freq, axis=1)
    init_b[:, 1:] = log_rating_freq_diff
    init_b[:, 0] = log_rating_freq[:, 0]
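    # init_b now holds log-frequency increments: a cumulative sum along
    # axis 1 recovers log_rating_freq, matching the cumulative (ordinal)
    # rating parameterization used by CF-NADE.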

    new_items = np.where(rating_freq.sum(axis=1) == 0)[0]

    input_layer = Input(shape=(input_dim0, input_dim1), name='input_ratings')
    output_ratings = Input(shape=(input_dim0, input_dim1),
                           name='output_ratings')

    input_masks = Input(shape=(input_dim0, ), name='input_masks')
    output_masks = Input(shape=(input_dim0, ), name='output_masks')

    print("Build NADE architecture...")
    # nade_layer = Dropout(0.0)(input_layer)
    nade_layer = input_layer
    nade_layer = NADE(hidden_dim=args.hidden_dim,
                      activation='tanh',
                      bias=True,
                      W_regularizer=keras.regularizers.l2(0.02),
                      V_regularizer=keras.regularizers.l2(0.02),
                      b_regularizer=keras.regularizers.l2(0.02),
                      c_regularizer=keras.regularizers.l2(0.02),
                      args=args)(nade_layer)

    predicted_ratings = Lambda(prediction_layer,
                               output_shape=prediction_output_shape,
                               name='predicted_ratings')(nade_layer)

    d = Lambda(d_layer, output_shape=d_output_shape, name='d')(input_masks)

    sum_masks = add([input_masks, output_masks])
    D = Lambda(D_layer, output_shape=D_output_shape, name='D')(sum_masks)

    loss_out = Lambda(rating_cost_lambda_func,
                      output_shape=(1, ),
                      name='nade_loss')([
                          nade_layer, output_ratings, input_masks,
                          output_masks, D, d
                      ])

    cf_nade_model = Model(
        inputs=[input_layer, output_ratings, input_masks, output_masks],
        outputs=[loss_out, predicted_ratings])

    print("Get NADE model summary...")
    cf_nade_model.summary()
    # Use Adam optimizer
    adam = Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    # Compile NADE model
    cf_nade_model.compile(loss={
        'nade_loss': lambda y_true, y_pred: y_pred
    },
                          optimizer=adam)

    # Create EvaluationCallback for NADE model on train and validation sets
    train_evaluation_callback = EvaluationCallback(data_set=train_set,
                                                   new_items=new_items,
                                                   training_set=True)
    valid_evaluation_callback = EvaluationCallback(data_set=val_set,
                                                   new_items=new_items,
                                                   training_set=False)

    print('Training...')
    cf_nade_model.fit_generator(
        train_set.generate(),
        steps_per_epoch=(train_set.get_corpus_size() // batch_size_),
        epochs=args.n_epochs,
        validation_data=val_set.generate(),
        validation_steps=(val_set.get_corpus_size() // batch_size_),
        shuffle=True,
        callbacks=[
            train_set, val_set, train_evaluation_callback,
            valid_evaluation_callback
        ],
        verbose=1)

    print('Testing...')
    rate_score = np.array([1, 2, 3, 4, 5], np.float32)

    squared_error = []
    n_samples = []
    for i, batch in enumerate(test_set.generate(max_iters=1)):
        inp_r = batch[0]['input_ratings']
        out_r = batch[0]['output_ratings']
        inp_m = batch[0]['input_masks']
        out_m = batch[0]['output_masks']

        pred_batch = cf_nade_model.predict(batch[0])[1]
        true_r = out_r.argmax(axis=2) + 1
        pred_r = (pred_batch *
                  rate_score[np.newaxis, np.newaxis, :]).sum(axis=2)

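        # Fall back to the midpoint rating (3) for items never rated in
        # the frequency pass; the model has no signal for them.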
        pred_r[:, new_items] = 3

        mask = out_r.sum(axis=2)

        se = np.sum(np.square(true_r - pred_r) * mask)
        n = np.sum(mask)
        squared_error.append(se)
        n_samples.append(n)

    total_squared_error = np.array(squared_error).sum()
    total_n_samples = np.array(n_samples).sum()
    rmse = np.sqrt(total_squared_error / (total_n_samples * 1.0 + 1e-8))
    print("test set RMSE is %f" % rmse)
Example #4
def init_model(opts):
    model_config = opts["model"]
    model_config["in_feats"] = 28 * 28
    model = NADE(**model_config)
    return model
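
A minimal usage sketch (the opts layout and the "hidden_dim" key are assumptions; init_model itself only sets "in_feats"):

# Hypothetical opts dict; keys other than "model" and "in_feats" are assumed.
opts = {"model": {"hidden_dim": 500}}
model = init_model(opts)  # NADE over flattened 28x28 inputs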
Example #5
 def iterative_algorithm(
     self,
     name,
     pop_size=100,
     genome_length=20,
     lim_percentage=20,
     corruption_level=0.2,
     num_epochs=50,
     lr = 0.1,
     max_evaluations=200000,
     unique_training=False,
     hiddens=300,
     rtr = True,
     w=10
     ):
     results_path = "results/autoencoder/{0}/".format(name)
     ensure_dir(results_path)
     fitfile = open("{0}fitnesses.dat".format(results_path),"w")
     self.mask = np.random.binomial(1,0.5,genome_length)
     trials = max_evaluations // pop_size
     population_limit = int(pop_size*(lim_percentage/100.0))
     # self.dA = dA(n_visible=genome_length,n_hidden=hiddens)
     # self.dA.build_dA(corruption_level)
     # self.build_sample_dA()
     self.NADE = NADE(n_visible=genome_length,n_hidden=hiddens)
     # self.NADE.build_NADE()
     new_population = np.random.binomial(1,0.5,(pop_size,genome_length))
     self.population_fitnesses = self.fitness_many(new_population)
     fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),np.min(self.population_fitnesses),np.max(self.population_fitnesses),np.std(self.population_fitnesses)))
     for iteration in range(trials):
         print("iteration:", iteration)
         population = new_population
         self.population = new_population
         rw = self.tournament_selection_replacement(population)
         good_strings,good_strings_fitnesses=self.get_good_strings(
                                       population,
                                       population_limit,
                                       unique=unique_training,
                                       fitnesses=self.population_fitnesses
                                     )
         print "training A/E"
         training_data = np.array(good_strings)
         self.train_NADE(training_data,
                       num_epochs=num_epochs,
                       lr=lr)
         print "sampling..."
         sampled_population = np.array(self.NADE.sample_multiple(n=len(new_population)),"b")
         self.sample_fitnesses = self.fitness_many(sampled_population)
         if rtr:
             new_population = self.RTR(
                           population,
                           sampled_population,
                           population_fitnesses=self.population_fitnesses,
                           sample_fitnesses=self.sample_fitnesses,
                           w=w
                           )
         else:
             new_population = sampled_population
             new_population[0:1] = good_strings[0:1]
             self.population_fitnesses = self.sample_fitnesses
             self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
         print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                      np.min(self.population_fitnesses),
                                      np.max(self.population_fitnesses))
         print "best from previous:",(
           self.fitness(new_population[np.argmax(self.population_fitnesses)])
             )
         fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),np.min(self.population_fitnesses),np.max(self.population_fitnesses),np.std(self.population_fitnesses)))
         fitfile.flush()
     fitfile.close()
     return new_population
Example #6
class AESolver(object):
    """
    The Denoising Autoencoder Genetic Algorithm
    """
    def __init__(self,fitness_f):
        super(AESolver, self).__init__()
        self.FITNESS_F = fitness_f
        if self.FITNESS_F == "hiff":
            self.HIFF = HIFF(NUMGENES=128,K=2,P=7)
            self.fitness = self.hiff_fitness
        elif self.FITNESS_F == "knapsack":
            self.fitness = self.knapsack_fitness
        elif self.FITNESS_F == "max_ones":
            self.fitness = self.max_ones_fitness
        elif self.FITNESS_F == "left_ones":
            self.fitness = self.left_ones_fitness

    def generate_random_string(self,l=20):
        return [random.choice([0,1]) for i in range(l)]

    def knapsack_fitness(self,string):
        knapsack = self.knapsack
        weights = []
        for i,c in enumerate(knapsack.capacities):
            weights.append(np.sum(np.array(knapsack.constraints[i])*string))
        over = 0
        for i,w in enumerate(weights):
            if w > knapsack.capacities[i]:
                over += (w - knapsack.capacities[i])
        if over > 0:
            return -over
        else:
            _fitness = np.sum(np.array(knapsack.values)*string)
            return _fitness

    def hiff_fitness(self,string):
        fitness = self.HIFF.H(string)
        return fitness

    def max_ones_fitness(self,string):
        fitness = np.sum(string^self.mask)
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def left_ones_fitness(self,_string):
        string =_string^self.mask
        fitness = sum(string[0:len(string)//2]) - sum(string[len(string)//2:])
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def tournament_selection_replacement(self,
                                         population,
                                         fitnesses=None,
                                         pop_size=None):
        if pop_size is None:
            pop_size = len(population)
        if fitnesses is None:
            fitnesses = self.fitness_many(population)
        new_population = []
        while len(new_population) < pop_size:
            child_1 = int(np.random.random() * pop_size)
            child_2 = int(np.random.random() * pop_size)
            if fitnesses[child_1] > fitnesses[child_2]:
                new_population.append(copy.deepcopy(population[child_1]))
            else:
                new_population.append(copy.deepcopy(population[child_2]))
        return new_population

    def get_good_strings(self,strings,lim=20,unique=False,fitnesses=None):
        if fitnesses is None:
            fitnesses = [self.fitness(s) for s in strings]
        sorted_fitnesses = sorted(range(len(fitnesses)),
                                  key=lambda k: fitnesses[k])
        sorted_fitnesses.reverse()
        if not unique:
            return ([strings[i] for i in sorted_fitnesses[0:lim]],
                    [fitnesses[k] for k in sorted_fitnesses[0:lim]])
        else:
            uniques = {}
            good_pop = []
            good_pop_fitnesses = []
            index = 0
            while len(good_pop) < lim and index < len(sorted_fitnesses):
                key = str(strings[sorted_fitnesses[index]])
                if key not in uniques:
                    uniques[key] = 0
                    good_pop.append(strings[sorted_fitnesses[index]])
                    good_pop_fitnesses.append(
                        fitnesses[sorted_fitnesses[index]]
                        )
                index += 1
            if len(good_pop) == lim:
                return [good_pop,good_pop_fitnesses]
            else:
                while len(good_pop) < lim:
                    good_pop.append(self.generate_random_string(
                                        l=len(strings[0]))
                                    )
                    good_pop_fitnesses.append(self.fitness(good_pop[-1]))
                return [good_pop,good_pop_fitnesses]

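    # Restricted tournament replacement (RTR): each sampled individual is
    # compared with the Hamming-nearest of w randomly chosen incumbents and
    # replaces it only if fitter, which helps preserve population diversity.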
    def RTR(self,
            population,
            sampled_population,
            population_fitnesses,
            sample_fitnesses,
            w=None):
        if w is None:
            w = len(population) // 20
        _population = np.array(population)
        for ind_i,individual in enumerate(sampled_population):
            indexes = np.random.choice(len(_population), w, replace=False)
            distances = cdist(_population[indexes],[individual],"hamming")
            replacement = indexes[np.argmin(distances.flatten())]
            if population_fitnesses[replacement] < sample_fitnesses[ind_i]:
                _population[replacement] = individual
                population_fitnesses[replacement] = sample_fitnesses[ind_i]
        return _population

    def fitness_many(self,strings):
        return [self.fitness(s) for s in strings]

    def train_dA(self,
                 data,
                 corruption_level=0.2,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,batch_size=20,number_batches=None)
        sgd_optimizer(self.dA.params,[self.dA.input],self.dA.cost,train_set,
                      lr=lr,num_epochs=num_epochs,save=False,
                      output_folder=output_folder,iteration=iteration)

    def train_NADE(self,
                 data,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,batch_size=20,number_batches=None)
        sgd_optimizer(self.NADE.params,[self.NADE.v],self.NADE.cost,train_set,
                      lr=lr,num_epochs=num_epochs,save=False,
                      output_folder=output_folder,iteration=iteration)

    def build_sample_dA(self):  
        self.sample_dA = theano.function([self.dA.input],self.dA.sample)

    def iterative_algorithm(
        self,
        name,
        pop_size=100,
        genome_length=20,
        lim_percentage=20,
        corruption_level=0.2,
        num_epochs=50,
        lr = 0.1,
        max_evaluations=200000,
        unique_training=False,
        hiddens=300,
        rtr = True,
        w=10
        ):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path),"w")
        self.mask = np.random.binomial(1,0.5,genome_length)
        trials = max_evaluations // pop_size
        population_limit = int(pop_size*(lim_percentage/100.0))
        # self.dA = dA(n_visible=genome_length,n_hidden=hiddens)
        # self.dA.build_dA(corruption_level)
        # self.build_sample_dA()
        self.NADE = NADE(n_visible=genome_length,n_hidden=hiddens)
        # self.NADE.build_NADE()
        new_population = np.random.binomial(1,0.5,(pop_size,genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),np.min(self.population_fitnesses),np.max(self.population_fitnesses),np.std(self.population_fitnesses)))
        for iteration in range(trials):
            print("iteration:", iteration)
            population = new_population
            self.population = new_population
            rw = self.tournament_selection_replacement(population)
            good_strings,good_strings_fitnesses=self.get_good_strings(
                                          population,
                                          population_limit,
                                          unique=unique_training,
                                          fitnesses=self.population_fitnesses
                                        )
            print "training A/E"
            training_data = np.array(good_strings)
            self.train_NADE(training_data,
                          num_epochs=num_epochs,
                          lr=lr)
            print "sampling..."
            sampled_population = np.array(self.NADE.sample_multiple(n=len(new_population)),"b")
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                              population,
                              sampled_population,
                              population_fitnesses=self.population_fitnesses,
                              sample_fitnesses=self.sample_fitnesses,
                              w=w
                              )
            else:
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                         np.min(self.population_fitnesses),
                                         np.max(self.population_fitnesses))
            print "best from previous:",(
              self.fitness(new_population[np.argmax(self.population_fitnesses)])
                )
            fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),np.min(self.population_fitnesses),np.max(self.population_fitnesses),np.std(self.population_fitnesses)))
            fitfile.flush()
        fitfile.close()
        return new_population
Example #7
class Sampler:
    def __init__(self, experiment_name):
        self._encoder = SMILESEncoder()

        # Read parameters used during training
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        # Read data
        if os.path.isfile('../data/' + self._file_name + '.csv'):
            self._data = pd.read_csv('../data/' + self._file_name + '.csv',
                                     header=None).values[:, 0]
        elif os.path.isfile('../data/' + self._file_name + '.tar.xz'):
            # Skip the first line (empty) and the last line (NaN)
            self._data = pd.read_csv('../data/' + self._file_name + '.tar.xz',
                                     compression='xz',
                                     header=None).values[1:-1, 0]

        # Clean data from start, end and padding token
        for i, mol_dat in enumerate(self._data):
            self._data[i] = clean_molecule(mol_dat, self._model_type)

    def sample(self,
               N=100,
               stor_dir='../evaluation',
               T=0.7,
               fold=[1],
               epoch=[9],
               valid=True,
               novel=True,
               unique=True,
               write_csv=True):
        '''Sample from a model, where the number of novel, valid, unique molecules is fixed
        :param N:         number of samples
        :param stor_dir:  directory where the generated SMILES are saved
        :param T:         temperature
        :param fold:      folds to use for sampling
        :param epoch:     epochs to use for sampling
        :param valid:     if True, only accept valid SMILES
        :param novel:     if True, only accept novel SMILES
        :param unique:    if True, only accept unique SMILES
        :param write_csv: if True, the generated SMILES are written to stor_dir
        :return: res_molecules: list of all generated SMILES
        '''

        res_molecules = []
        print('Sampling: started')
        for f in fold:
            for e in epoch:
                self._model.build(stor_dir + '/' + self._experiment_name +
                                  '/models/model_fold_' + str(f) + '_epochs_' +
                                  str(e))

                new_molecules = []
                while len(new_molecules) < N:
                    new_mol = self._encoder.decode(
                        self._model.sample(self._starting_token, T))

                    # Remove remains from generation
                    new_mol = clean_molecule(new_mol[0], self._model_type)

                    # If not valid, get new molecule
                    if valid and not check_valid(new_mol):
                        continue

                    # If not unique, get new molecule
                    if unique and (new_mol in new_molecules):
                        continue

                    # If not novel, get new molecule
                    if novel and (new_mol in self._data):
                        continue

                    # If all conditions checked, add new molecule
                    new_molecules.append(new_mol)

                # Prepare name for file
                name = 'molecules_fold_' + str(f) + '_epochs_' + str(
                    e) + '_T_' + str(T) + '_N_' + str(N) + '.csv'
                if unique:
                    name = 'unique_' + name
                if valid:
                    name = 'valid_' + name
                if novel:
                    name = 'novel_' + name

                # Store final molecules
                if write_csv:
                    if not os.path.exists(stor_dir + '/' +
                                          self._experiment_name +
                                          '/molecules/'):
                        os.makedirs(stor_dir + '/' + self._experiment_name +
                                    '/molecules/')
                    mol = np.array(new_molecules).reshape(-1)
                    pd.DataFrame(mol).to_csv(stor_dir + '/' +
                                             self._experiment_name +
                                             '/molecules/' + name,
                                             header=None)

            res_molecules.append(new_molecules)

        print('Sampling: done')
        return res_molecules
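
A minimal usage sketch (experiment name, fold, and epoch values are assumptions; they must match models already stored under stor_dir):

# Hypothetical call; assumes a trained model exists for fold 1, epoch 9.
sampler = Sampler('ForwardRNN')
mols = sampler.sample(N=10, T=0.7, fold=[1], epoch=[9], write_csv=False)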
Example #8
class AESolver(object):
    """
    The Denoising Autoencoder Genetic Algorithm
    """
    def __init__(self,fitness_f):
        super(AESolver, self).__init__()
        self.FITNESS_F = fitness_f
        if self.FITNESS_F == "hiff":
            self.HIFF = HIFF(NUMGENES=128,K=2,P=7)
            self.fitness = self.hiff_fitness
        elif self.FITNESS_F == "knapsack":
            self.knapsack = pickle.load(open("weing8.pkl", "rb"))
            self.fitness = self.knapsack_fitness
        elif self.FITNESS_F == "max_ones":
            self.fitness = self.max_ones_fitness
        elif self.FITNESS_F == "left_ones":
            self.fitness = self.left_ones_fitness
        elif self.FITNESS_F == "royal_road":
            self.fitness = self.royal_road
        elif self.FITNESS_F == "churchill":
            self.fitness = self.churchills_road
            self.optimum = 33

    def generate_random_string(self,l=20):
        return [random.choice([0,1]) for i in range(l)]

    def churchills_road(self,input,k=4,l=4):
        fitness = 0
        for partitions in range(0,l):
            first_part = sum(input[partitions*k*2:partitions*k*2+k])
            second_part = sum(input[(partitions*k*2)+k:(partitions*k*2)+k*2])
            if first_part == k and second_part == 0:
                fitness += 8
            if first_part == 0 and second_part == k:
                fitness += 8
        if sum(input[0:k]) == k and sum(input[len(input)-k:]) == k:
            fitness += 1
        if sum(input[0:k]) == 0 and sum(input[len(input)-k:]) == 0:
            fitness += 1
        return fitness

    def knapsack_fitness(self,string):
        knapsack = self.knapsack
        weights = []
        for i,c in enumerate(knapsack.capacities):
            weights.append(np.sum(np.array(knapsack.constraints[i])*string))
        over = 0
        for i,w in enumerate(weights):
            if w > knapsack.capacities[i]:
                over += (w - knapsack.capacities[i])
        if over > 0:
            return -over
        else:
            _fitness = np.sum(np.array(knapsack.values)*string)
            return _fitness

    def hiff_fitness(self,string):
        fitness = self.HIFF.H(string)
        return fitness

    def max_ones_fitness(self,string):
        fitness = np.sum(string^self.mask)
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def left_ones_fitness(self,_string):
        string =_string^self.mask
        fitness = sum(string[0:len(string)//2]) - sum(string[len(string)//2:])
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def royal_road(self,string, order=8):
        """Royal Road Function R1 as presented by Melanie Mitchell in : 
        "An introduction to Genetic Algorithms".
        """
        individual = string^self.mask
        nelem = len(individual) // order
        max_value = int(2**order - 1)
        total = 0
        for i in range(nelem):
            value = int("".join(map(str, individual[i*order:i*order+order])), 2)
            total += int(order) * int(value/max_value)
        return total

    # def tournament_selection_replacement(self,
    #                                      population,
    #                                      fitnesses=None,
    #                                      pop_size=None):
    #     if pop_size == None:
    #         pop_size = len(population)
    #     if fitnesses == None:
    #         fitnesses = self.fitness_many(population)
    #     new_population = []
    #     while len(new_population) < pop_size:
    #         child_1 = int(np.random.random() * pop_size)
    #         child_2 = int(np.random.random() * pop_size)
    #         if fitnesses[child_1] > fitnesses[child_2]:
    #             new_population.append(copy.deepcopy(population[child_1]))
    #         else:
    #             new_population.append(copy.deepcopy(population[child_2]))
    #     return new_population

    def tournament_selection_replacement(self,
                                         population,
                                         fitnesses=None,
                                         pop_size=None,
                                         tournament_size=2):
        if pop_size is None:
            pop_size = len(population)
        if fitnesses is None:
            fitnesses = self.fitness_many(population)
        new_population = []
        while len(new_population) < pop_size:
            contenders=np.random.randint(0,len(population),tournament_size)
            # print "contenders:",contenders
            t_fitnesses = [fitnesses[c] for c in contenders]
            # print "fitnesses:",t_fitnesses
            # print "best_fitness:",np.argmax(t_fitnesses)
            # print "winner:",contenders[np.argmax(t_fitnesses)]
            winner = copy.deepcopy(population[contenders[np.argmax(t_fitnesses)]])
            new_population.append(winner)
        return new_population

    def get_good_strings(self,strings,lim=20,unique=False,fitnesses=None):
        if fitnesses is None:
            fitnesses = [self.fitness(s) for s in strings]
        sorted_fitnesses = sorted(range(len(fitnesses)),
                                  key=lambda k: fitnesses[k])
        sorted_fitnesses.reverse()
        if not unique:
            return ([strings[i] for i in sorted_fitnesses[0:lim]],
                    [fitnesses[k] for k in sorted_fitnesses[0:lim]])
        else:
            uniques = {}
            good_pop = []
            good_pop_fitnesses = []
            index = 0
            while len(good_pop) < lim and index < len(sorted_fitnesses):
                key = str(strings[sorted_fitnesses[index]])
                if key not in uniques:
                    uniques[key] = 0
                    good_pop.append(strings[sorted_fitnesses[index]])
                    good_pop_fitnesses.append(
                        fitnesses[sorted_fitnesses[index]]
                        )
                index += 1
            if len(good_pop) == lim:
                return [good_pop,good_pop_fitnesses]
            else:
                while len(good_pop) < lim:
                    good_pop.append(self.generate_random_string(
                                        l=len(strings[0]))
                                    )
                    good_pop_fitnesses.append(self.fitness(good_pop[-1]))
                return [good_pop,good_pop_fitnesses]

    def RTR(self,
            population,
            sampled_population,
            population_fitnesses,
            sample_fitnesses,
            w=None):
        if w is None:
            w = len(population) // 20
        _population = np.array(population)
        for ind_i,individual in enumerate(sampled_population):
            indexes = np.random.choice(len(_population), w, replace=False)
            distances = cdist(_population[indexes],[individual],"hamming")
            replacement = indexes[np.argmin(distances.flatten())]
            if population_fitnesses[replacement] < sample_fitnesses[ind_i]:
                _population[replacement] = individual
                population_fitnesses[replacement] = sample_fitnesses[ind_i]
        return _population

    def fitness_many(self,strings):
        return [self.fitness(s) for s in strings]

    def train_dA(self,
                 data,
                 corruption_level=0.2,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,batch_size=20,number_batches=None)
        sgd_optimizer(self.dA.params,[self.dA.input],self.dA.cost,train_set,
                      lr=lr,num_epochs=num_epochs,save=False,
                      output_folder=output_folder,iteration=iteration)

    def train_NADE(self,
                 data,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,batch_size=20,number_batches=None)
        sgd_optimizer(self.NADE.params,[self.NADE.v],self.NADE.cost,train_set,
                      lr=lr,num_epochs=num_epochs,save=True,
                      output_folder=output_folder,iteration=iteration)

    def build_sample_dA(self):  
        self.sample_dA = theano.function([self.dA.input],self.dA.sample)

    def iterative_algorithm(
        self,
        name,
        pop_size=100,
        genome_length=20,
        lim_percentage=20,
        corruption_level=0.2,
        num_epochs=50,
        lr = 0.1,
        max_evaluations=200000,
        unique_training=False,
        hiddens=300,
        rtr = True,
        w=10
        ):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path),"w")
        self.mask = np.random.binomial(1,0.5,genome_length)
        trials = max_evaluations // pop_size
        population_limit = int(pop_size*(lim_percentage/100.0))
        # self.dA = dA(n_visible=genome_length,n_hidden=hiddens)
        # self.dA.build_dA(corruption_level)
        # self.build_sample_dA()
        self.NADE = NADE(n_visible=genome_length,n_hidden=hiddens)
        # self.NADE.build_NADE()
        new_population = np.random.binomial(1,0.5,(pop_size,genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),np.min(self.population_fitnesses),np.max(self.population_fitnesses),np.std(self.population_fitnesses)))
        for iteration in range(trials):
            print("iteration:", iteration)
            population = new_population
            self.population = new_population
            rw = self.tournament_selection_replacement(population,
                                                       fitnesses=self.population_fitnesses,
                                                       pop_size=population_limit,
                                                       tournament_size=4)
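            # With RTR enabled the model is trained on tournament winners
            # (rw); otherwise on the elitist top slice of the population.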
            if not rtr:
                good_strings,good_strings_fitnesses=self.get_good_strings(
                                              population,
                                              population_limit,
                                              unique=unique_training,
                                              fitnesses=self.population_fitnesses
                                            )
                training_data = np.array(good_strings)
            else:
                training_data = np.array(rw)
            print "training A/E"
            self.train_NADE(training_data,
                          num_epochs=num_epochs,
                          lr=lr)
            print "sampling..."
            # sampled_population = [np.array(self.NADE.sample(),"b") for i in range(len(self.population))]
            sampled_population = np.array(self.NADE.sample_multiple(n=len(new_population)),"b")
            # pdb.set_trace()
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                              population,
                              sampled_population,
                              population_fitnesses=self.population_fitnesses,
                              sample_fitnesses=self.sample_fitnesses,
                              w=w
                              )
            else:
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                         np.min(self.population_fitnesses),
                                         np.max(self.population_fitnesses))
            print "best from previous:",(
              self.fitness(new_population[np.argmax(self.population_fitnesses)])
                )
            fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),np.min(self.population_fitnesses),np.max(self.population_fitnesses),np.std(self.population_fitnesses)))
            fitfile.flush()
            if np.max(self.population_fitnesses) == getattr(self, "optimum", None):
                pickle.dump({"pop":self.population,"fitnesses":self.population_fitnesses,"iteration":iteration},open("final_shit.pkl","wb"))
                break
        fitfile.close()
        return new_population
Example #9
class FineTuner():

    def __init__(self, experiment_name='ForwardRNN'):

        self._encoder = SMILESEncoder()

        # Read all parameters from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        # self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode([self._config['EVALUATION']['starting_token']])

        # Read starting model
        self._start_model = self._config['FINETUNING']['start_model']

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode([self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size, self._learning_rate,
                               self._hidden_units, self._generation, self._missing_token)

        self._data = self._encoder.encode_from_file(self._file_name)

    def fine_tuning(self, stor_dir='../evaluation/', restart=False):
        '''Perform fine-tuning and store statistics
        NOTE: The directory should be prepared with the correct name and model
        NOTE: Molecules are neither generated nor validated here; use the Sampler to sample molecules
        :param stor_dir:    directory to store data
        :param restart:     if True, resume from the last completed epoch
        :return:
        '''

        # Create directories
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/models'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/models')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/molecules'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules')

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE random
        if self._model_type == 'NADE' and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Build model
        self._model.build(stor_dir + '/' + self._experiment_name + '/' + self._start_model)

        # Store total Statistics
        tot_stat = []

        # only single fold
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing files
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name + '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Check if current epoch is successfully completed else continue with normal training
                if check_model(self._model_type, self._experiment_name, stor_dir, fold, i) and check_molecules(
                        self._experiment_name, stor_dir, fold, i) and tmp_stat_file.shape[0] > i:
                    # Load model
                    self._model.build(
                        stor_dir + '/' + self._experiment_name + '/models/model_fold_' + str(fold) + '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())

                    # Skip this epoch
                    continue

                else:
                    restart = False

            # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
            # to  (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(self._data.reshape(-1, self._molecular_size, self._encoding_size),
                                          label.reshape(-1, self._molecular_size), epochs=1,
                                          batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(
                stor_dir + '/' + self._experiment_name + '/models/model_fold_' + str(fold) + '_epochs_' + str(i))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name + '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(
                    i) + '.csv', header=None)

            # Store statistic
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name + '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)
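
A minimal usage sketch (the experiment name is an assumption; '../evaluation/<experiment>' must already contain the pre-trained start_model named in the [FINETUNING] section of the .ini file):

# Hypothetical call; assumes the evaluation directory is prepared.
tuner = FineTuner('ForwardRNN')
tuner.fine_tuning(stor_dir='../evaluation/', restart=False)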
Example #10
    init_b[:, 1:] = log_rating_freq_diff
    init_b[:, 0] = log_rating_freq[:, 0]

    new_items = np.where(rating_freq.sum(axis=1) == 0)[0]

    input_layer = Input(shape=(input_dim0, input_dim1), name='input_ratings')
    output_ratings = Input(shape=(input_dim0, input_dim1),
                           name='output_ratings')
    input_masks = Input(shape=(input_dim0, ), name='input_masks')
    output_masks = Input(shape=(input_dim0, ), name='output_masks')

    nade_layer = Dropout(0.0)(input_layer)
    nade_layer = NADE(hidden_dim=hidden_dim,
                      activation='tanh',
                      bias=True,
                      W_regularizer=keras.regularizers.l2(0.02),
                      V_regularizer=keras.regularizers.l2(0.02),
                      b_regularizer=keras.regularizers.l2(0.02),
                      c_regularizer=keras.regularizers.l2(0.02))(nade_layer)

    predicted_ratings = Lambda(prediction_layer,
                               output_shape=prediction_output_shape,
                               name='predicted_ratings')(nade_layer)

    d = Lambda(d_layer, output_shape=d_output_shape, name='d')(input_masks)

    sum_masks = add([input_masks, output_masks])
    D = Lambda(D_layer, output_shape=D_output_shape, name='D')(sum_masks)

    loss_out = Lambda(rating_cost_lambda_func,
                      output_shape=(1, ),
Example #11
class AESolver(object):
    """
    The Denoising Autoencoder Genetic Algorithm
    """
    def __init__(self, fitness_f):
        super(AESolver, self).__init__()
        self.FITNESS_F = fitness_f
        if self.FITNESS_F == "hiff":
            self.HIFF = HIFF(NUMGENES=128, K=2, P=7)
            self.fitness = self.hiff_fitness
        elif self.FITNESS_F == "knapsack":
            self.knapsack = pickle.load(open("weing8.pkl", "rb"))
            self.fitness = self.knapsack_fitness
        elif self.FITNESS_F == "max_ones":
            self.fitness = self.max_ones_fitness
        elif self.FITNESS_F == "left_ones":
            self.fitness = self.left_ones_fitness
        elif self.FITNESS_F == "royal_road":
            self.fitness = self.royal_road
        elif self.FITNESS_F == "churchill":
            self.fitness = self.churchills_road
            self.optimum = 33

    def generate_random_string(self, l=20):
        return [random.choice([0, 1]) for i in range(l)]

    def churchills_road(self, input, k=4, l=4):
        fitness = 0
        for partitions in range(0, l):
            first_part = sum(input[partitions * k * 2:partitions * k * 2 + k])
            second_part = sum(input[(partitions * k * 2) +
                                    k:(partitions * k * 2) + k * 2])
            if first_part == k and second_part == 0:
                fitness += 8
            if first_part == 0 and second_part == k:
                fitness += 8
        if sum(input[0:k]) == k and sum(input[len(input) - k:]) == k:
            fitness += 1
        if sum(input[0:k]) == 0 and sum(input[len(input) - k:]) == 0:
            fitness += 1
        return fitness

    def knapsack_fitness(self, string):
        knapsack = self.knapsack
        weights = []
        for i, c in enumerate(knapsack.capacities):
            weights.append(np.sum(np.array(knapsack.constraints[i]) * string))
        over = 0
        for i, w in enumerate(weights):
            if w > knapsack.capacities[i]:
                over += (w - knapsack.capacities[i])
        if over > 0:
            return -over
        else:
            _fitness = np.sum(np.array(knapsack.values) * string)
            return _fitness

    def hiff_fitness(self, string):
        fitness = self.HIFF.H(string)
        return fitness

    def max_ones_fitness(self, string):
        fitness = np.sum(string ^ self.mask)
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def left_ones_fitness(self, _string):
        string = _string ^ self.mask
        fitness = sum(string[0:len(string) // 2]) - sum(
            string[len(string) // 2:])
        if cache:
            self.cache_fitness(fitness)
        return fitness

    def royal_road(self, string, order=8):
        """Royal Road Function R1 as presented by Melanie Mitchell in : 
        "An introduction to Genetic Algorithms".
        """
        individual = string ^ self.mask
        nelem = len(individual) // order
        max_value = int(2**order - 1)
        total = 0
        for i in range(nelem):
            value = int(
                "".join(map(str, individual[i * order:i * order + order])), 2)
            total += int(order) * int(value / max_value)
        return total

    # def tournament_selection_replacement(self,
    #                                      population,
    #                                      fitnesses=None,
    #                                      pop_size=None):
    #     if pop_size == None:
    #         pop_size = len(population)
    #     if fitnesses == None:
    #         fitnesses = self.fitness_many(population)
    #     new_population = []
    #     while len(new_population) < pop_size:
    #         child_1 = int(np.random.random() * pop_size)
    #         child_2 = int(np.random.random() * pop_size)
    #         if fitnesses[child_1] > fitnesses[child_2]:
    #             new_population.append(copy.deepcopy(population[child_1]))
    #         else:
    #             new_population.append(copy.deepcopy(population[child_2]))
    #     return new_population

    def tournament_selection_replacement(self,
                                         population,
                                         fitnesses=None,
                                         pop_size=None,
                                         tournament_size=2):
        if pop_size is None:
            pop_size = len(population)
        if fitnesses is None:
            fitnesses = self.fitness_many(population)
        new_population = []
        while len(new_population) < pop_size:
            contenders = np.random.randint(0, len(population), tournament_size)
            # print "contenders:",contenders
            t_fitnesses = [fitnesses[c] for c in contenders]
            # print "fitnesses:",t_fitnesses
            # print "best_fitness:",np.argmax(t_fitnesses)
            # print "winner:",contenders[np.argmax(t_fitnesses)]
            winner = copy.deepcopy(
                population[contenders[np.argmax(t_fitnesses)]])
            new_population.append(winner)
        return new_population
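    # Sketch: tournament_size=4 draws four indices uniformly at random (with
    # replacement) and copies the fittest contender into the new population;
    # larger tournaments raise the selection pressure.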

    def get_good_strings(self, strings, lim=20, unique=False, fitnesses=None):
        if fitnesses is None:
            fitnesses = [self.fitness(s) for s in strings]
        sorted_fitnesses = sorted(range(len(fitnesses)),
                                  key=lambda k: fitnesses[k])
        sorted_fitnesses.reverse()
        if not unique:
            return ([strings[i] for i in sorted_fitnesses[0:lim]],
                    [fitnesses[k] for k in sorted_fitnesses[0:lim]])
        else:
            uniques = {}
            good_pop = []
            good_pop_fitnesses = []
            index = 0
            while len(good_pop) < lim and index < len(sorted_fitnesses):
                key = str(strings[sorted_fitnesses[index]])
                if key not in uniques:
                    uniques[key] = 0
                    good_pop.append(strings[sorted_fitnesses[index]])
                    good_pop_fitnesses.append(
                        fitnesses[sorted_fitnesses[index]])
                index += 1
            if len(good_pop) == lim:
                return [good_pop, good_pop_fitnesses]
            else:
                while len(good_pop) < lim:
                    good_pop.append(
                        self.generate_random_string(l=len(strings[0])))
                    good_pop_fitnesses.append(self.fitness(good_pop[-1]))
                return [good_pop, good_pop_fitnesses]

    def RTR(self,
            population,
            sampled_population,
            population_fitnesses,
            sample_fitnesses,
            w=None):
        if w is None:
            w = len(population) // 20
        _population = np.array(population)
        for ind_i, individual in enumerate(sampled_population):
            indexes = np.random.choice(len(_population), w, replace=False)
            distances = cdist(_population[indexes], [individual], "hamming")
            replacement = indexes[np.argmin(distances.flatten())]
            if population_fitnesses[replacement] < sample_fitnesses[ind_i]:
                _population[replacement] = individual
                population_fitnesses[replacement] = sample_fitnesses[ind_i]
        return _population
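    # Sketch: restricted tournament replacement (RTR). Each sampled individual
    # challenges the Hamming-nearest of w randomly chosen residents and
    # replaces it only if strictly fitter, preserving diversity (niching).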

    def fitness_many(self, strings):
        return [self.fitness(s) for s in strings]

    def train_dA(self,
                 data,
                 corruption_level=0.2,
                 num_epochs=200,
                 lr=0.1,
                 output_folder="",
                 iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,
                                    batch_size=20,
                                    number_batches=None)
        sgd_optimizer(self.dA.params, [self.dA.input],
                      self.dA.cost,
                      train_set,
                      lr=lr,
                      num_epochs=num_epochs,
                      save=False,
                      output_folder=output_folder,
                      iteration=iteration)

    def train_NADE(self,
                   data,
                   num_epochs=200,
                   lr=0.1,
                   output_folder="",
                   iteration=0):
        train_data = data
        train_set = SequenceDataset(train_data,
                                    batch_size=20,
                                    number_batches=None)
        sgd_optimizer(self.NADE.params, [self.NADE.v],
                      self.NADE.cost,
                      train_set,
                      lr=lr,
                      num_epochs=num_epochs,
                      save=True,
                      output_folder=output_folder,
                      iteration=iteration)

    def build_sample_dA(self):
        self.sample_dA = theano.function([self.dA.input], self.dA.sample)

    def iterative_algorithm(self,
                            name,
                            pop_size=100,
                            genome_length=20,
                            lim_percentage=20,
                            corruption_level=0.2,
                            num_epochs=50,
                            lr=0.1,
                            max_evaluations=200000,
                            unique_training=False,
                            hiddens=300,
                            rtr=True,
                            w=10):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path), "w")
        self.mask = np.random.binomial(1, 0.5, genome_length)
        trials = max_evaluations // pop_size
        population_limit = int(pop_size * (lim_percentage / 100.0))
        # self.dA = dA(n_visible=genome_length,n_hidden=hiddens)
        # self.dA.build_dA(corruption_level)
        # self.build_sample_dA()
        self.NADE = NADE(n_visible=genome_length, n_hidden=hiddens)
        # self.NADE.build_NADE()
        new_population = np.random.binomial(1, 0.5, (pop_size, genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write(
            "{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),
                                       np.min(self.population_fitnesses),
                                       np.max(self.population_fitnesses),
                                       np.std(self.population_fitnesses)))
        for iteration in range(0, trials):
            print "iteration:", iteration
            population = new_population
            self.population = new_population
            rw = self.tournament_selection_replacement(
                population,
                fitnesses=self.population_fitnesses,
                pop_size=population_limit,
                tournament_size=4)
            if not rtr:
                good_strings, good_strings_fitnesses = self.get_good_strings(
                    population,
                    population_limit,
                    unique=unique_training,
                    fitnesses=self.population_fitnesses)
                training_data = np.array(good_strings)
            else:
                training_data = np.array(rw)
            print "training A/E"
            self.train_NADE(training_data, num_epochs=num_epochs, lr=lr)
            print "sampling..."
            # sampled_population = [np.array(self.NADE.sample(),"b") for i in range(len(self.population))]
            sampled_population = np.array(
                self.NADE.sample_multiple(n=len(new_population)), "b")
            # pdb.set_trace()
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                    population,
                    sampled_population,
                    population_fitnesses=self.population_fitnesses,
                    sample_fitnesses=self.sample_fitnesses,
                    w=w)
            else:
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print "{0},{1},{2}\n".format(np.mean(self.population_fitnesses),
                                         np.min(self.population_fitnesses),
                                         np.max(self.population_fitnesses))
            print "best from previous:", (self.fitness(
                new_population[np.argmax(self.population_fitnesses)]))
            fitfile.write("{0},{1},{2},{3}\n".format(
                np.mean(self.population_fitnesses),
                np.min(self.population_fitnesses),
                np.max(self.population_fitnesses),
                np.std(self.population_fitnesses)))
            fitfile.flush()
            if np.max(self.population_fitnesses) == self.optimum:
                # Binary mode is required for pickle.
                with open("final_shit.pkl", "wb") as f:
                    pickle.dump(
                        {
                            "pop": self.population,
                            "fitnesses": self.population_fitnesses,
                            "iteration": iteration
                        }, f)
                break
        fitfile.close()
        return new_population
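A minimal driver for the loop above might look like the following sketch. The class name (`EDA`), its constructor, and the `optimum` attribute are assumptions for illustration; the listing does not show how the class is instantiated.

if __name__ == '__main__':
    eda = EDA()  # hypothetical class name; constructor not shown in the listing
    eda.optimum = 100  # assumed known optimum used by the early-stopping check
    final_pop = eda.iterative_algorithm('max_ones_run',
                                        pop_size=100,
                                        genome_length=100,
                                        lim_percentage=20,
                                        num_epochs=50,
                                        lr=0.1,
                                        max_evaluations=200000,
                                        rtr=True,
                                        w=10)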
Example #12
    # Define datapoints for each class.
    inps = torch.stack([
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([1, 0, 1, 1, 0, 0, 1, 0, 0, 1]),
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([0, 0, 1, 0, 0, 1, 0, 1, 0, 1]),
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([1, 1, 0, 1, 1, 0, 0, 1, 0, 0]),
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 1])
    ],
                       dim=0)

    # Define one model per class.
    models = [
        NADE(inp_dimensions, inp_dimensions // 2) for _ in range(num_classes)
    ]

    # Train each model one by one.
    for inp, model in zip(inps, models):

        # Optimization scheme.
        optimizer = optim.SGD(model.parameters(), lr=0.01)

        for _ in range(num_training_iterations):

            # Zero out previous gradients.
            model.zero_grad()

            # Compute log-likelihoods per sample.
            log_likelihoods = model(inp)

            # The listing breaks off here; a standard completion minimizes the
            # negative mean log-likelihood by gradient descent.
            loss = -log_likelihoods.mean()
            loss.backward()
            optimizer.step()
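Since Example #12 fits one density model per class, the snippet extends naturally into a generative classifier: score a batch under every model and predict the class whose NADE assigns the highest log-likelihood. The helper below is a sketch under that assumption, reusing the `models` list and assuming `model(x)` returns one log-likelihood per row of `x`.

    def classify(x, models):
        # Shape (num_samples, num_classes): one log-likelihood per model.
        scores = torch.stack([model(x) for model in models], dim=1)
        return torch.argmax(scores, dim=1)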
class Trainer():
    def __init__(self, experiment_name='ForwardRNN'):

        self._encoder = SMILESEncoder()

        # Read all parameters from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        self._data = self._encoder.encode_from_file(self._file_name)

    def complete_run(self, stor_dir='../evaluation/', restart=False):
        '''Training without validation on complete data'''

        # Create directories (idempotent)
        for sub in ('models', 'molecules', 'statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/' + sub,
                        exist_ok=True)

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if self._model_type == 'NADE' and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Build model
        self._model.build()

        # Store total statistics
        tot_stat = []

        # Only a single fold
        fold = 1

        # Shuffle data before training (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
        # to  (all_SMILES, molecular_size, encoding_size))
        self._data, label = shuffle(
            self._data.reshape(-1, self._molecular_size, self._encoding_size),
            label.reshape(-1, self._molecular_size))

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            # On restart, read the existing statistics file
            if restart:
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Resume support: if this epoch already completed successfully
                # on a previous run, reload and skip it; otherwise fall back to
                # normal training.
                if (check_model(self._model_type, self._experiment_name,
                                stor_dir, fold, i)
                        and check_molecules(self._experiment_name, stor_dir,
                                            fold, i)
                        and tmp_stat_file.shape[0] > i):

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistics list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1,
                                                                 -1).tolist())
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model
            statistic = self._model.train(self._data,
                                          label,
                                          epochs=1,
                                          batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) + '_epochs_' +
                             str(i))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) +
                '.csv',
                header=None)

            # Store statistics
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(store_stat).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)

    def single_run(self, stor_dir='../evaluation/', restart=False):
        '''Training with a validation split, storing models and statistics'''

        # Create directories (idempotent)
        for sub in ('models', 'molecules', 'statistic', 'validation'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/' + sub,
                        exist_ok=True)

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if (self._model_type == 'NADE' or self._model_type
                == 'NADE_v2') and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test data
        train_data, test_data, train_label, test_label = train_test_split(
            self._data, label, test_size=1. / 5, random_state=1, shuffle=True)
        # Build model
        self._model.build()

        # Store total statistics
        tot_stat = []

        # Store validation loss
        tot_loss = []

        # Only a single fold
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing files
                tmp_val_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Resume support: if this epoch already completed successfully
                # on a previous run, reload and skip it; otherwise fall back to
                # normal training.
                if (check_model(self._model_type, self._experiment_name,
                                stor_dir, fold, i)
                        and check_molecules(self._experiment_name, stor_dir,
                                            fold, i)
                        and tmp_val_file.shape[0] > i
                        and tmp_stat_file.shape[0] > i):

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1,
                                                                 -1).tolist())
                    tot_loss.append(tmp_val_file[i, 1])

                    # Skip this epoch
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
            # to  (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(
                train_data.reshape(-1, self._molecular_size,
                                   self._encoding_size),
                train_label.reshape(-1, self._molecular_size),
                epochs=1,
                batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) + '_epochs_' +
                             str(i))

            # Test model on validation set
            tot_loss.append(
                self._model.validate(
                    test_data.reshape(-1, self._molecular_size,
                                      self._encoding_size),
                    test_label.reshape(-1, self._molecular_size)))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) +
                '.csv',
                header=None)

            # Store statistics
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(store_stat).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)

            # Store validation data
            pd.DataFrame(np.array(tot_loss).reshape(
                -1, 1)).to_csv(stor_dir + '/' + self._experiment_name +
                               '/validation/val_fold_' + str(fold) + '.csv',
                               header=None)

    def cross_validation(self, stor_dir='../evaluation/', restart=False):
        '''Perform cross-validation and store data'''

        # Create directories (idempotent)
        for sub in ('models', 'molecules', 'statistic', 'validation'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/' + sub,
                        exist_ok=True)

        self._kf = KFold(n_splits=self._n_folds, shuffle=True, random_state=2)

        # Fold counter
        fold = 0

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if (self._model_type == 'NADE') and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test data
        for train, test in self._kf.split(self._data):

            # Shuffle indices within the train and test sets
            np.random.shuffle(train)
            np.random.shuffle(test)

            fold += 1

            self._model.build()

            # Store total statistics
            tot_stat = []

            # Store validation loss
            tot_loss = []

            for i in range(self._epochs):
                print('Fold:', fold)
                print('Epoch:', i)

                if restart:
                    tmp_val_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/validation/val_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()

                    tmp_stat_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/statistic/stat_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()

                    # Resume support: if this epoch already completed
                    # successfully on a previous run, reload and skip it;
                    # otherwise fall back to normal training.
                    if (check_model(self._model_type, self._experiment_name,
                                    stor_dir, fold, i)
                            and check_molecules(self._experiment_name,
                                                stor_dir, fold, i)
                            and tmp_val_file.shape[0] > i
                            and tmp_stat_file.shape[0] > i):

                        # Load model
                        self._model.build(stor_dir + '/' +
                                          self._experiment_name +
                                          '/models/model_fold_' + str(fold) +
                                          '_epochs_' + str(i))

                        # Fill statistic and loss list
                        tot_stat.append(tmp_stat_file[i,
                                                      1:].reshape(1,
                                                                  -1).tolist())
                        tot_loss.append(tmp_val_file[i, 1])

                        # Skip this epoch
                        continue

                    else:
                        restart = False

                # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
                # to  (all_SMILES, molecular_size, encoding_size))
                statistic = self._model.train(
                    self._data[train].reshape(-1, self._molecular_size,
                                              self._encoding_size),
                    label[train].reshape(-1, self._molecular_size),
                    epochs=1,
                    batch_size=self._batch_size)

                tot_stat.append(statistic.tolist())

                # Store model
                self._model.save(stor_dir + '/' + self._experiment_name +
                                 '/models/model_fold_' + str(fold) +
                                 '_epochs_' + str(i))

                # Test model on validation set
                tot_loss.append(
                    self._model.validate(
                        self._data[test].reshape(-1, self._molecular_size,
                                                 self._encoding_size),
                        label[test].reshape(-1, self._molecular_size)))

                # Sample new molecules
                new_molecules = []
                for s in range(self._samples):
                    mol = self._encoder.decode(
                        self._model.sample(self._starting_token, self._T))
                    new_molecules.append(
                        clean_molecule(mol[0], self._model_type))

                # Store new molecules
                new_molecules = np.array(new_molecules)
                pd.DataFrame(new_molecules).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/molecules/molecule_fold_' + str(fold) + '_epochs_' +
                    str(i) + '.csv',
                    header=None)

                # Store statistics
                store_stat = np.array(tot_stat).reshape(i + 1, -1)
                pd.DataFrame(store_stat).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None)

                # Store validation data
                pd.DataFrame(np.array(tot_loss).reshape(-1, 1)).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv',
                    header=None)