def data_iter(train_file, validate_file, config, batch_size, num_embed, vocab_path, max_length=100):
    logger.info('Loading data...')
    x_train, y_train, vocab, vocab_inv, n_class = data_helpers.load_data(train_file, config, max_length, None)
    embed_size = num_embed
    sentence_size = x_train.shape[1]
    vocab_size = len(vocab)
    util.save_to_pickle(vocab_path, vocab)
    x_dev, y_dev, _, _, _ = data_helpers.load_data(validate_file, config, max_length, vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_train = x_train[shuffle_indices]
    y_train = y_train[shuffle_indices]

    logger.info('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    logger.info('train shape: %(shape)s', {'shape': x_train.shape})
    logger.info('valid shape: %(shape)s', {'shape': x_dev.shape})
    logger.info('sentence max words: %(shape)s', {'shape': sentence_size})
    logger.info('embedding size: %(msg)s', {'msg': embed_size})
    logger.info('vocab size: %(msg)s', {'msg': vocab_size})

    train = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(x_dev, y_dev, batch_size)

    return train, valid, sentence_size, embed_size, vocab_size, n_class
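# Example usage (a minimal sketch; the file paths, the empty config dict, and the
# mx.mod.Module setup below are illustrative assumptions, not part of this repo):
#
#   train, valid, sentence_size, embed_size, vocab_size, n_class = data_iter(
#       'data/train.txt', 'data/valid.txt', config={}, batch_size=64,
#       num_embed=300, vocab_path='checkpoint/vocab.pkl')
#   # the two NDArrayIter objects can then be fed to an MXNet Module:
#   # module.fit(train_data=train, eval_data=valid, num_epoch=10)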
def preprocess_purpose_data(max_len=60, portion=0.85, augmentation=False, deduplicate=True):
    """Preprocess the purpose dataset. Only the training split is augmented, never the test split."""
    sentences, _, purposes, augment_sentences = _preprocess_dataset_small(
        max_len, augmentation, deduplicate=deduplicate)

    if augmentation:
        data = list(zip(sentences, purposes, augment_sentences))
        word_to_idx = compute_word_to_idx(sentences + augment_sentences)
    else:
        data = list(zip(sentences, purposes))
        word_to_idx = compute_word_to_idx(sentences)

    # shuffle all data, then split into train/test
    random.shuffle(data)
    end = int(len(data) * portion)
    train_data, test_data = data[:end], data[end:]

    if augmentation:
        # augment only train_data: append the augmented sentences (and their labels) to the originals
        train_sentences, train_purposes, train_augment_sentences = zip(*train_data)
        train_sentences += train_augment_sentences
        train_purposes += train_purposes
        train_data = list(zip(train_sentences, train_purposes))
        # rebuild test_data without augmentation
        test_sentences, test_purposes, _ = zip(*test_data)
        test_data = list(zip(test_sentences, test_purposes))

    # save as pickle for later use
    save_to_pickle('processed_data/purpose.train.pkl', [train_data, word_to_idx])
    save_to_pickle('processed_data/purpose.test.pkl', [test_data, word_to_idx])
def preprocess_polarity_data(max_len=60, portion=0.85, deduplicate=True):
    small_sentences, small_polarities, _, _ = _preprocess_dataset_small(max_len, deduplicate=deduplicate)
    large_sentences, large_polarities, polarity_to_idx = _preprocess_dataset_large(max_len, deduplicate=deduplicate)
    combined_sentences = small_sentences + large_sentences
    combined_polarities = small_polarities + large_polarities

    data = []
    if deduplicate:
        # keep only the first occurrence of each sentence
        seen = {}
        for sent, polarity in zip(combined_sentences, combined_polarities):
            key = ''.join(sent)
            if key not in seen:
                seen[key] = True
                data.append((sent, polarity))
        print('unique sentences:', len(seen), 'duplicates:', len(combined_sentences) - len(seen))
    else:
        data = list(zip(combined_sentences, combined_polarities))

    # shuffle all data
    random.shuffle(data)
    word_to_idx = compute_word_to_idx(combined_sentences)
    end = int(len(data) * portion)
    train_data, test_data = data[:end], data[end:]

    # save as pickle for later use
    save_to_pickle('processed_data/polarity.train.pkl', [train_data, word_to_idx, polarity_to_idx])
    save_to_pickle('processed_data/polarity.test.pkl', [test_data, word_to_idx, polarity_to_idx])
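# Example of reading the pickles back (a sketch; assumes load_pickle is the
# counterpart of save_to_pickle used elsewhere in this repo):
#
#   train_data, word_to_idx, polarity_to_idx = load_pickle('processed_data/polarity.train.pkl')
#   test_data, _, _ = load_pickle('processed_data/polarity.test.pkl')
#   # each item in train_data / test_data is a (tokenized_sentence, polarity) pair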
def load_glove_model(self, path_to_glove, word_to_idx,
                     saved_embedding='processed_data/glove_embedding.pkl', regenerate=True):
    """
    Overwrite nn.Embedding.weight with pre-trained GloVe vectors.

    First load the pre-trained GloVe model, i.e. a word-vector lookup table,
    then keep only the words that appear in our dataset (based on word_to_idx)
    and overwrite the corresponding rows of the initial nn.Embedding.weight.
    Credit: https://github.com/pytorch/text/issues/30
    """
    if regenerate:
        count = 0
        with open(path_to_glove, 'r') as f:
            for line in f:
                row = line.split()
                word, vector = row[0], row[1:]
                # only update words that appear in both word_to_idx and GloVe;
                # words missing from GloVe keep their initial weights
                if word in word_to_idx:
                    count += 1
                    vector = torch.FloatTensor(list(map(float, vector)))
                    # overwrite the initial embedding weight for this word
                    self.embeddings.weight.data[word_to_idx[word]] = vector
        print('num of words in both word_to_idx and glove:', count)
        save_to_pickle(saved_embedding, self.embeddings.weight.data)
    else:
        self.embeddings.weight.data.copy_(load_pickle(saved_embedding))
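# Example usage (a sketch; the model class name, its constructor arguments, and
# the GloVe file path are assumptions for illustration only):
#
#   model = TextCNN(vocab_size=len(word_to_idx), embed_dim=300, n_class=n_class)
#   model.load_glove_model('glove/glove.6B.300d.txt', word_to_idx)
#   # on later runs the cached embedding can be reused without re-parsing GloVe:
#   # model.load_glove_model('glove/glove.6B.300d.txt', word_to_idx, regenerate=False)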
def __init__(self, train_file, validate_file, config, vocab_path, max_length):
    logger.info('Loading data...')
    x_train, x_train_len, y_train, vocab, vocab_inv, self.n_class = \
        self.load_data(train_file, config, max_length, None)
    self.sentence_size = x_train.shape[1]
    self.vocab_size = len(vocab)
    util.save_to_pickle(vocab_path, vocab)
    x_dev, x_dev_len, y_dev, _, _, _ = self.load_data(validate_file, config, max_length, vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_train = x_train[shuffle_indices]
    x_train_len = x_train_len[shuffle_indices]
    y_train = y_train[shuffle_indices]

    # replicate random training examples so the data size is a multiple of batch_size (disabled)
    # rest = batch_size - len(x_train) % batch_size
    # random_indices = np.random.randint(x_train.shape[0], size=rest)
    # x_train = np.concatenate((x_train, x_train[random_indices, :]), axis=0)
    # x_train_len = np.concatenate((x_train_len, x_train_len[random_indices]), axis=0)
    # y_train = np.concatenate((y_train, y_train[random_indices]), axis=0)

    self.x_train = mx.nd.array(x_train)
    self.x_train_len = mx.nd.array(x_train_len)
    self.y_train = mx.nd.array(y_train)
    self.x_dev = mx.nd.array(x_dev)
    self.x_dev_len = mx.nd.array(x_dev_len)
    self.y_dev = mx.nd.array(y_dev)

    logger.info('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    logger.info('train shape: %(shape)s', {'shape': x_train.shape})
    logger.info('valid shape: %(shape)s', {'shape': x_dev.shape})
def train_epochs(resume=False, use_glove=True):
    """Train for multiple epochs."""
    print('total epochs: ', cfg.EPOCHS, '; use_glove: ', use_glove)
    training_data, word_to_idx, label_to_idx = data_loader()
    model, best_acc, start_epoch = get_model(word_to_idx, label_to_idx, resume, use_glove)

    losses = []
    loss_function = nn.NLLLoss()
    if cfg.RUN_MODE == 'CNN':
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
        # optimizer = optim.SGD(model.parameters(), lr=0.1)
        # optimizer = optim.Adagrad(model.parameters(), lr=0.01, weight_decay=0.01)
    else:
        # optimizer = optim.Adam(model.parameters(), lr=0.001)
        optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=0.1)
        # optimizers below are not working
        # optimizer = optim.Adagrad(model.parameters(), lr=0.001)

    since = time.time()
    training_error_rates = []
    test_error_rates = []
    for epoch in range(1 + start_epoch, start_epoch + cfg.EPOCHS + 1):
        train_error, train_loss = train(model, loss_function, optimizer, training_data, word_to_idx)
        losses.append(train_loss)
        training_error_rates.append(train_error)
        test_error_rate = get_error_rate(model, training=False)
        test_error_rates.append(test_error_rate)
        acc = 1 - test_error_rate
        print('epoch: {}, time: {:.2f}s, cost so far: {}, accuracy: {:.3f}'.format(
            epoch, (time.time() - since), train_loss.numpy(), acc))
        if acc > best_acc:
            save_checkpoint(model, acc, epoch)
            best_acc = acc

    # save losses and error rates for later analysis
    save_to_pickle('checkpoint/all_losses.p', losses)
    save_to_pickle('checkpoint/training_error_rates.p', training_error_rates)
    save_to_pickle('checkpoint/test_error_rates.p', test_error_rates)
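# Example follow-up (a sketch; assumes matplotlib is available and that load_pickle
# mirrors save_to_pickle):
#
#   train_epochs(resume=False, use_glove=True)
#   # losses = load_pickle('checkpoint/all_losses.p')
#   # test_err = load_pickle('checkpoint/test_error_rates.p')
#   # plt.plot([l.item() for l in losses]); plt.plot(test_err); plt.show()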
def main():
    # batch_size = 128  # for mini-batch gradient descent
    # epochs = 10
    args = read_args()
    x_train, x_test, y_train, y_test = load_dataset()
    # toresults = get_time_str()

    # TODO 3: Build the Keras model
    args.input_num = x_train.shape[1]
    model = create_model(args=args)

    # Data augmentation (disabled)
    # datagen = ImageDataGenerator(
    #     rotation_range=10,
    #     width_shift_range=0.1,
    #     height_shift_range=0.1,
    #     horizontal_flip=True,
    # )
    # datagen.fit(x_train)
    # hist = model.fit_generator(datagen.flow(x_train, y_train, batch_size=32),
    #                            steps_per_epoch=x_train.shape[0] // args.batch_size,
    #                            epochs=args.epochs,
    #                            verbose=1,
    #                            validation_data=(x_test, y_test),
    #                            workers=4)

    # TODO 4: Fit the model
    hist = model.fit(x_train, y_train,
                     batch_size=args.batch_size,
                     epochs=args.epochs,
                     verbose=1,
                     validation_data=(x_test, y_test))

    score = model.evaluate(x_test, y_test, verbose=0)
    print(score)
    y_test_pred_mat = model.predict(x_test)
    y_test_norm = np.argmax(y_test, axis=1)
    predictions = np.argmax(y_test_pred_mat, axis=1)

    # TODO 5: Evaluate the model, calculating the metrics.
    # Option 1: Use the model.evaluate() method. For this, the model must
    # already be compiled with the metrics.
    #   performance = model.evaluate(x_test, y_test)
    # Option 2: Use the model.predict() method and calculate the metrics with
    # sklearn. We recommend this because you can store the predictions if you
    # need more analysis later. Also, if you calculate the metrics in a
    # notebook, you can compare multiple classifiers.
    #   predictions = ...
    #   performance = ...

    # TODO 6: Save the results.
    # predictions as CSV via pandas
    results = pandas.DataFrame(y_test_norm, columns=['true_label'])
    results.loc[:, 'predicted'] = predictions
    results.to_csv('predictions_{}.csv'.format(args.experiment_name), index=False)

    # training history and hyperparameters as pickle
    params_dict = get_keras_model_history_params(model, [('args', args.__dict__)])
    save_to_pickle(params_dict, 'params_{}.pick'.format(args.experiment_name))

    # save accuracy and loss plots
    save_fig(hist)
    print(model.summary())