    def __init__(self, experiment_name):
        self._encoder = SMILESEncoder()

        # Read the parameters used during training
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)
        else:
            raise ValueError('Unknown model type: ' + self._model_type)

        # Read data
        if os.path.isfile('../data/' + self._file_name + '.csv'):
            self._data = pd.read_csv('../data/' + self._file_name + '.csv',
                                     header=None).values[:, 0]
        elif os.path.isfile('../data/' + self._file_name + '.tar.xz'):
            # Skip the first line (empty) and the last line (NaN)
            self._data = pd.read_csv('../data/' + self._file_name + '.tar.xz',
                                     compression='xz',
                                     header=None).values[1:-1, 0]
        else:
            raise FileNotFoundError('No data file found for ' + self._file_name)

        # Strip start, end, and padding tokens from each molecule
        for i, mol_dat in enumerate(self._data):
            self._data[i] = clean_molecule(mol_dat, self._model_type)
Example 2
    def __init__(self, experiment_name='ForwardRNN'):

        self._encoder = SMILESEncoder()

        # Read all parameters from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        # self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode([self._config['EVALUATION']['starting_token']])

        # Read starting model
        self._start_model = self._config['FINETUNING']['start_model']

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)

        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)

        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode([self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size, self._learning_rate,
                               self._hidden_units, self._generation, self._missing_token)
        else:
            raise ValueError('Unknown model type: ' + self._model_type)

        self._data = self._encoder.encode_from_file(self._file_name)
Example 3
def _train(args):
    """
    Train the NADE model
    :param args: parsed arguments
    """
    if K.backend() != 'tensorflow':
        raise NotImplementedError(
            "This repository only supports the TensorFlow backend.")

    batch_size_ = 512
    nb_users = 6040      # users in MovieLens-1M
    nb_movies = 3706     # movies with at least one rating
    data_sample = 1.0    # fraction of training files to use
    input_dim0 = 6040    # one slot per user
    input_dim1 = 5       # rating levels 1-5 (one-hot)
    std = 0.0
    alpha = 1.0

    print('Loading data...')
    train_file_list = sorted(glob.glob(os.path.join('data/train_set',
                                                    'part*')))
    val_file_list = sorted(glob.glob(os.path.join('data/val_set/', 'part*')))
    test_file_list = sorted(glob.glob(os.path.join('data/test_set/', 'part*')))

    train_file_list = [
        dfile for dfile in train_file_list if os.stat(dfile).st_size != 0
    ]
    val_file_list = [
        dfile for dfile in val_file_list if os.stat(dfile).st_size != 0
    ]
    test_file_list = [
        dfile for dfile in test_file_list if os.stat(dfile).st_size != 0
    ]

    print("Shuffle the data...")
    random.shuffle(train_file_list)
    random.shuffle(val_file_list)
    random.shuffle(test_file_list)
    train_file_list = train_file_list[:max(
        int(len(train_file_list) * data_sample), 1)]

    print('Instantiate DataSet classes...')
    train_set = DataSet(train_file_list,
                        num_users=nb_users,
                        num_items=nb_movies,
                        batch_size=batch_size_,
                        mode=0)
    val_set = DataSet(val_file_list,
                      num_users=nb_users,
                      num_items=nb_movies,
                      batch_size=batch_size_,
                      mode=1)
    test_set = DataSet(test_file_list,
                       num_users=nb_users,
                       num_items=nb_movies,
                       batch_size=batch_size_,
                       mode=2)

    rating_freq = np.zeros((input_dim0, input_dim1))
    init_b = np.zeros((input_dim0, input_dim1))
    for batch in val_set.generate(max_iters=1):
        # Accumulate how often each rating level is observed.
        inp_r = batch[0]['input_ratings']
        rating_freq += inp_r.sum(axis=0)

    # Initialize the biases from empirical log rating frequencies:
    # b[:, 0] = log f_0 and b[:, k] = log f_k - log f_{k-1} for k > 0.
    log_rating_freq = np.log(rating_freq + 1e-8)
    log_rating_freq_diff = np.diff(log_rating_freq, axis=1)
    init_b[:, 1:] = log_rating_freq_diff
    init_b[:, 0] = log_rating_freq[:, 0]

    # Indices with no observed ratings; their predictions are pinned later.
    new_items = np.where(rating_freq.sum(axis=1) == 0)[0]

    input_layer = Input(shape=(input_dim0, input_dim1), name='input_ratings')
    output_ratings = Input(shape=(input_dim0, input_dim1),
                           name='output_ratings')

    input_masks = Input(shape=(input_dim0, ), name='input_masks')
    output_masks = Input(shape=(input_dim0, ), name='output_masks')

    print("Build NADE architecture...")
    # nade_layer = Dropout(0.0)(input_layer)
    nade_layer = input_layer
    nade_layer = NADE(hidden_dim=args.hidden_dim,
                      activation='tanh',
                      bias=True,
                      W_regularizer=keras.regularizers.l2(0.02),
                      V_regularizer=keras.regularizers.l2(0.02),
                      b_regularizer=keras.regularizers.l2(0.02),
                      c_regularizer=keras.regularizers.l2(0.02),
                      args=args)(nade_layer)

    predicted_ratings = Lambda(prediction_layer,
                               output_shape=prediction_output_shape,
                               name='predicted_ratings')(nade_layer)

    d = Lambda(d_layer, output_shape=d_output_shape, name='d')(input_masks)

    sum_masks = add([input_masks, output_masks])
    D = Lambda(D_layer, output_shape=D_output_shape, name='D')(sum_masks)

    loss_out = Lambda(rating_cost_lambda_func,
                      output_shape=(1, ),
                      name='nade_loss')([
                          nade_layer, output_ratings, input_masks,
                          output_masks, D, d
                      ])

    cf_nade_model = Model(
        inputs=[input_layer, output_ratings, input_masks, output_masks],
        outputs=[loss_out, predicted_ratings])

    print("Get NADE model summary...")
    cf_nade_model.summary()
    # Use Adam optimizer
    adam = Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    # Compile NADE model
    cf_nade_model.compile(loss={
        'nade_loss': lambda y_true, y_pred: y_pred
    },
                          optimizer=adam)

    # Create EvaluationCallback for NADE model on train and validation sets
    train_evaluation_callback = EvaluationCallback(data_set=train_set,
                                                   new_items=new_items,
                                                   training_set=True)
    valid_evaluation_callback = EvaluationCallback(data_set=val_set,
                                                   new_items=new_items,
                                                   training_set=False)

    print('Training...')
    cf_nade_model.fit_generator(
        train_set.generate(),
        steps_per_epoch=(train_set.get_corpus_size() // batch_size_),
        epochs=args.n_epochs,
        validation_data=val_set.generate(),
        validation_steps=(val_set.get_corpus_size() // batch_size_),
        shuffle=True,
        callbacks=[
            train_set, val_set, train_evaluation_callback,
            valid_evaluation_callback
        ],
        verbose=1)

    print('Testing...')
    rate_score = np.array([1, 2, 3, 4, 5], np.float32)

    squared_error = []
    n_samples = []
    for batch in test_set.generate(max_iters=1):
        out_r = batch[0]['output_ratings']

        pred_batch = cf_nade_model.predict(batch[0])[1]
        true_r = out_r.argmax(axis=2) + 1
        pred_r = (pred_batch *
                  rate_score[np.newaxis, np.newaxis, :]).sum(axis=2)

        # Fall back to the mid rating where no training signal exists.
        pred_r[:, new_items] = 3

        mask = out_r.sum(axis=2)

        se = np.sum(np.square(true_r - pred_r) * mask)
        n = np.sum(mask)
        squared_error.append(se)
        n_samples.append(n)

    total_squared_error = np.array(squared_error).sum()
    total_n_samples = np.array(n_samples).sum()
    rmse = np.sqrt(total_squared_error / (total_n_samples * 1.0 + 1e-8))
    print("test set RMSE is %f" % rmse)
Example 4
def init_model(opts):
    # Build a NADE from the config dict, forcing the input size to a
    # flattened 28 x 28 image (e.g. MNIST).
    model_config = opts["model"]
    model_config["in_feats"] = 28 * 28
    model = NADE(**model_config)
    return model
Example 5
    def iterative_algorithm(self,
                            name,
                            pop_size=100,
                            genome_length=20,
                            lim_percentage=20,
                            corruption_level=0.2,
                            num_epochs=50,
                            lr=0.1,
                            max_evaluations=200000,
                            unique_training=False,
                            hiddens=300,
                            rtr=True,
                            w=10):
        results_path = "results/autoencoder/{0}/".format(name)
        ensure_dir(results_path)
        fitfile = open("{0}fitnesses.dat".format(results_path), "w")
        self.mask = np.random.binomial(1, 0.5, genome_length)
        # Integer division so the result is usable as a range bound.
        trials = max_evaluations // pop_size
        population_limit = int(pop_size * (lim_percentage / 100.0))
        # Earlier denoising-autoencoder (dA) variant replaced by the NADE below.
        self.NADE = NADE(n_visible=genome_length, n_hidden=hiddens)
        new_population = np.random.binomial(1, 0.5, (pop_size, genome_length))
        self.population_fitnesses = self.fitness_many(new_population)
        fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),
                                                 np.min(self.population_fitnesses),
                                                 np.max(self.population_fitnesses),
                                                 np.std(self.population_fitnesses)))
        for iteration in range(trials):
            print("iteration:", iteration)
            population = new_population
            self.population = new_population
            rw = self.tournament_selection_replacement(population)
            good_strings, good_strings_fitnesses = self.get_good_strings(
                population,
                population_limit,
                unique=unique_training,
                fitnesses=self.population_fitnesses)
            print("training A/E")
            training_data = np.array(good_strings)
            self.train_NADE(training_data, num_epochs=num_epochs, lr=lr)
            print("sampling...")
            sampled_population = np.array(
                self.NADE.sample_multiple(n=len(new_population)), "b")
            self.sample_fitnesses = self.fitness_many(sampled_population)
            if rtr:
                new_population = self.RTR(
                    population,
                    sampled_population,
                    population_fitnesses=self.population_fitnesses,
                    sample_fitnesses=self.sample_fitnesses,
                    w=w)
            else:
                new_population = sampled_population
                new_population[0:1] = good_strings[0:1]
                self.population_fitnesses = self.sample_fitnesses
                self.population_fitnesses[0:1] = good_strings_fitnesses[0:1]
            print("{0},{1},{2}".format(np.mean(self.population_fitnesses),
                                       np.min(self.population_fitnesses),
                                       np.max(self.population_fitnesses)))
            print("best from previous:",
                  self.fitness(new_population[np.argmax(self.population_fitnesses)]))
            fitfile.write("{0},{1},{2},{3}\n".format(np.mean(self.population_fitnesses),
                                                     np.min(self.population_fitnesses),
                                                     np.max(self.population_fitnesses),
                                                     np.std(self.population_fitnesses)))
            fitfile.flush()
        fitfile.close()
        return new_population
Example 6
    init_b[:, 1:] = log_rating_freq_diff
    init_b[:, 0] = log_rating_freq[:, 0]

    new_items = np.where(rating_freq.sum(axis=1) == 0)[0]

    input_layer = Input(shape=(input_dim0, input_dim1), name='input_ratings')
    output_ratings = Input(shape=(input_dim0, input_dim1),
                           name='output_ratings')
    input_masks = Input(shape=(input_dim0, ), name='input_masks')
    output_masks = Input(shape=(input_dim0, ), name='output_masks')

    nade_layer = Dropout(0.0)(input_layer)
    nade_layer = NADE(hidden_dim=hidden_dim,
                      activation='tanh',
                      bias=True,
                      W_regularizer=keras.regularizers.l2(0.02),
                      V_regularizer=keras.regularizers.l2(0.02),
                      b_regularizer=keras.regularizers.l2(0.02),
                      c_regularizer=keras.regularizers.l2(0.02))(nade_layer)

    predicted_ratings = Lambda(prediction_layer,
                               output_shape=prediction_output_shape,
                               name='predicted_ratings')(nade_layer)

    d = Lambda(d_layer, output_shape=d_output_shape, name='d')(input_masks)

    sum_masks = add([input_masks, output_masks])
    D = Lambda(D_layer, output_shape=D_output_shape, name='D')(sum_masks)

    loss_out = Lambda(rating_cost_lambda_func,
                      output_shape=(1, ),
                      name='nade_loss')([
                          nade_layer, output_ratings, input_masks,
                          output_masks, D, d
                      ])
Example 7
    # Define datapoints for each class.
    inps = torch.stack([
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([1, 0, 1, 1, 0, 0, 1, 0, 0, 1]),
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([0, 0, 1, 0, 0, 1, 0, 1, 0, 1]),
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([1, 1, 0, 1, 1, 0, 0, 1, 0, 0]),
        torch.randn(num_samples_per_class, inp_dimensions) / 10 +
        torch.tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 1])
    ],
                       dim=0)

    # Define one model per class.
    models = [
        NADE(inp_dimensions, inp_dimensions // 2) for _ in range(num_classes)
    ]

    # Train each model one by one.
    for inp, model in zip(inps, models):

        # Optimization scheme.
        optimizer = optim.SGD(model.parameters(), lr=0.01)

        for _ in range(num_training_iterations):

            # Zero out previous gradients.
            model.zero_grad()

            # Compute log-likelihoods per sample.
            log_likelihoods = model(inp)

            # Assumed continuation: maximize the log-likelihood by
            # minimizing its negative mean, then take an SGD step.
            loss = -log_likelihoods.mean()
            loss.backward()
            optimizer.step()