def main(args):
    word2vec = KeyedVectors.load(args.word2vec, mmap='r')
    data = data_utils.load_data(args.file)
    if args.use_gensim:
        data_utils.load_gensim(args.word2vec)
    else:
        data_utils.load_embeddings(args.embeddings)
        data_utils.load_w2i(args.w2i)
    compute_recall(data, word2vec)
def main(args):
    global device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    word2vec = KeyedVectors.load(args.word2vec, mmap='r')
    data = data_utils.load_data(args.file)
    if args.use_gensim:
        data_utils.load_gensim(args.word2vec)
    else:
        data_utils.load_embeddings(args.embeddings)
        data_utils.load_w2i(args.w2i)
    run(data, word2vec)
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    vocab_size = 75100
    embedding_path = './data/embeddings.npy'
    embedding = utils.load_embeddings(embedding_path, vocab_size, FLAGS.embedding_dim)
    return x_train, y_train, vocab_processor, x_dev, y_dev, embedding
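# A minimal sketch of what a helper like utils.load_embeddings(path, vocab_size, dim)
# might do for the call above, assuming the .npy file stores a pretrained embedding
# matrix. The real utils module is not shown in this snippet, so the name
# load_embeddings_npy and its behavior are illustrative only.
import numpy as np

def load_embeddings_npy(embedding_path, vocab_size, embedding_dim):
    # Load the precomputed embedding matrix from disk.
    embedding = np.load(embedding_path)
    # Sanity-check that the stored matrix matches the expected vocabulary size and dimension.
    assert embedding.shape == (vocab_size, embedding_dim), (
        "embedding matrix shape {} does not match ({}, {})".format(
            embedding.shape, vocab_size, embedding_dim))
    return embedding.astype(np.float32)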
def create_model(data_loaders, word_vocab, wordpiece_vocab, hierarchy, total_wordpieces):
    from model import E2EETModel, MentionLevelModel
    if cf.TASK == "end_to_end":
        model = E2EETModel(
            embedding_dim=cf.EMBEDDING_DIM,
            hidden_dim=cf.HIDDEN_DIM,
            vocab_size=len(wordpiece_vocab),
            label_size=len(hierarchy),
            model_options=cf.MODEL_OPTIONS,
            total_wordpieces=total_wordpieces,
            category_counts=hierarchy.get_train_category_counts(),
            hierarchy_matrix=hierarchy.hierarchy_matrix,
            embedding_model=cf.EMBEDDING_MODEL,
            vocab_size_word=len(word_vocab),
            pretrained_embeddings=None if cf.EMBEDDING_MODEL in ["random", "bert"]
            else load_embeddings(cf.EMBEDDING_MODEL, word_vocab, cf.EMBEDDING_DIM))
    elif cf.TASK == "mention_level":
        model = MentionLevelModel(
            embedding_dim=cf.EMBEDDING_DIM,
            hidden_dim=cf.HIDDEN_DIM,
            vocab_size=len(wordpiece_vocab),
            label_size=len(hierarchy),
            model_options=cf.MODEL_OPTIONS,
            total_wordpieces=total_wordpieces,
            category_counts=hierarchy.get_train_category_counts(),
            hierarchy_matrix=hierarchy.hierarchy_matrix,
            context_window=cf.MODEL_OPTIONS['context_window'],
            mention_window=cf.MODEL_OPTIONS['mention_window'],
            attention_type=cf.MODEL_OPTIONS['attention_type'],
            use_context_encoders=cf.MODEL_OPTIONS['use_context_encoders'])
    return model
def do_train(args):
    # Set up some parameters.
    config = Config()
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    report = None

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = NGramModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train_raw, dev_raw, train, dev)
def get_all(args):
    """ Gets the training and test data, and templates. """
    global embedding_size
    global num_words

    print("Load data...")
    data_train = data_utils.load_data(args.folder + "/train_data.json")
    data_test = data_utils.load_data(args.folder + "/dev_data.json")
    print(len(data_train))

    data_utils.load_embeddings(args.embeddings)
    embedding_size = len(data_utils.embeddings[0])
    data_utils.load_w2i(args.w2i)
    num_words = len(data_utils.w2i)
    w2emb = data_utils.load_w2emb(args.w2emb)
    templates_emb = data_utils.load_templates("../../data/templates.pkl")
    gensim_model = KeyedVectors.load(args.word2vec, mmap='r')

    print("Do the templates...")
    templates_emb = [y for x in templates_emb for y in x]
    cut_templates = [temp[-args.max_length:] for temp in templates_emb]
    templates_emb = [
        np.pad(temp1, ((0, args.max_length - len(temp1)), (0, 0)),
               "constant", constant_values=(num_words))
        for temp1 in cut_templates
    ]
    templates_emb = torch.Tensor(templates_emb)

    print("Go through training data...")
    training_data = get_data(args.saved_train, data_train, data_utils.embeddings,
                             data_utils.w2i, gensim_model, args)
    test_data = get_data(args.saved_test, data_test, data_utils.embeddings,
                         data_utils.w2i, gensim_model, args)
    return training_data, test_data, templates_emb, w2emb
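# Illustrative only: the padding step above takes each embedded template, a
# (length, embedding_dim) array, truncates it to the last max_length rows, and pads
# the end with the constant num_words so every template becomes a fixed
# (max_length, embedding_dim) block. A tiny standalone example with made-up sizes:
import numpy as np

max_length, embedding_dim, pad_value = 4, 3, 99
template = np.ones((2, embedding_dim))                      # a template of length 2
padded = np.pad(template,
                ((0, max_length - len(template)), (0, 0)),  # pad rows only, at the end
                "constant", constant_values=pad_value)
print(padded.shape)                                         # (4, 3)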
def do_seq2seq_prediction():
    config = Config()

    # Load training data
    helper, data = load_and_preprocess_data('data/data.txt')
    inputs = data
    labels = inputs
    train_examples_raw = list(zip(inputs, labels))  # This is a list of (input, label) tuples.

    # Load pretrained embedding matrix
    # Embedding matrix has shape of (n_tokens, embed_size)
    embeddings = load_embeddings('data/vocab.txt', 'data/wordVectors.txt', helper)
    # config.n_tokens = embeddings.shape[0]
    # config.embed_size = embeddings.shape[1]
    helper.save(config.model_path)

    # Create and train a seq2seq autoencoder
    with tf.Graph().as_default():
        print("Building model...")
        cell_size = 100
        cell_type = "lstm"
        cell_init = "identity"
        clip_gradients = True
        activation_choice = "tanh"
        print("We are considering {:} of size N = {:} with activation being {:}.".format(
            cell_type, cell_size, activation_choice))
        if clip_gradients:
            print("Gradient clipping turned on.")
        else:
            print("Gradient clipping turned off.")
        model = Seq2seq_autoencoder(helper, config, embeddings, cell_size, cell_type,
                                    cell_init, clip_gradients, activation_choice)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init)
            model.fit(sess, saver, train_examples_raw)
def test_encoding(args):
    config = build_seq2seq_config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = load_data(args.data)
    embeddings = load_embeddings(args, helper)
    config.n_tokens = embeddings.shape[0]
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        model = Seq2seq_autoencoder(helper, config, embeddings)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init)
            print(model.config.model_output)
            saver.restore(sess, model.config.model_output)
            encodings = model.encode(sess, input_data)
            print(encodings)
def do_evaluate(args):
    config = build_seq2seq_config(args)
    helper = ModelHelper.load(args.model_path)
    inputs_raw = load_data(args.data)
    embeddings = load_embeddings(args, helper)
    config.n_tokens = embeddings.shape[0]
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        model = Seq2seq_autoencoder(helper, config, embeddings)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        vectorized = model.helper.vectorize(inputs_raw)
        inputs = model.preprocess_sequence_data(zip(vectorized, vectorized))
        with tf.Session() as sess:
            sess.run(init)
            saver.restore(sess, model.config.model_output)
            test_loss = model.evaluate(sess, inputs)
            result_path = "ae_results/%s.txt" % args.model_path.split('/')[1]
            with open(result_path, 'w') as result_file:
                result_file.write("%f" % test_loss)
def do_train(args):
    config = Config(args)
    print("== Seq2Seq Config ==")
    print(" Cell size:", config.cell_size)
    print(" Cell type:", config.cell_type)
    print(" Cell init:", config.cell_init)
    print(" Activation:", config.activation_choice)
    print(" Gradient clipping:", config.clip_gradients)
    print(" Feed decoder:", config.feed_decoder)

    # Load training data
    helper, train, dev = load_and_preprocess_data(args)
    inputs = train
    labels = train
    train_examples_raw = list(zip(inputs, labels))  # This is a list of (input, label) tuples.
    dev_set_raw = list(zip(dev, dev))

    # Load pretrained embedding matrix
    # Embedding matrix has shape of (n_tokens, embed_size)
    embeddings = load_embeddings(args, helper)
    config.n_tokens = embeddings.shape[0]
    config.embed_size = embeddings.shape[1]
    helper.save(config.model_path)

    # Create and train a seq2seq autoencoder
    with tf.Graph().as_default():
        print("Building model...")
        model = Seq2seq_autoencoder(helper, config, embeddings)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init)
            model.fit(sess, saver, train_examples_raw, dev_set_raw)
x_dev = pad_dataset(eval_df.text_tokenized.values.tolist(), 128)


def create_label(label):
    if label == 1:
        return [0, 1]
    elif label == 0:
        return [1, 0]


y_train = np.array((train_df['class'].apply(create_label)).values.tolist())
y_dev = np.array((eval_df['class'].apply(create_label)).values.tolist())

vocab_size = len(wdict)
embedding_path = FLAGS.embeddings_path
embedding = utils.load_embeddings(embedding_path, vocab_size, FLAGS.embedding_dim)
print("Embeddings loaded, Vocabulary Size: {:d}. Starting training ...".format(vocab_size))


def prepare_filepath_for_storing_model(output_dir: str) -> str:
    """Prepare the filepath where the trained model will be stored.

    :param output_dir: Directory where to store outputs (trained models).
    :return: path_to_store_model: Path where to store the trained model.
    """
    path_to_store_model = os.path.join(output_dir, 'models')
    if not os.path.exists(path_to_store_model):
        os.makedirs(path_to_store_model)
    return path_to_store_model
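# Illustrative usage only; the calling code is not part of this snippet, and the
# output directory and checkpoint filename below are hypothetical.
model_dir = prepare_filepath_for_storing_model('./output')      # creates ./output/models if needed
model_path = os.path.join(model_dir, 'text_classifier.ckpt')    # hypothetical checkpoint name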
train_df.drop(columns=["text"], inplace=True)

logger.info("Building dataset...")
vocab2id = build_vocab(docs=X_train, min_count=config.min_count)
pkl.dump(
    vocab2id,
    open(os.path.join(args.model_dir, "vocab_{}.vocab".format(args.model)), "wb"))
train_data = build_dataset(X_train, vocab2id, max_doc_len=config.max_doc_len)
train_df.drop(columns=["text_words"], inplace=True)

logger.info("Loading embeddings...")
embeddings = load_embeddings(args.embedding_path, vocab2id)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

if args.nsplits > 1:
    SKF = StratifiedKFold(n_splits=args.nsplits, shuffle=True)
    for fold_idx, (train_idx, val_idx) in enumerate(SKF.split(X_train, y_train)):
        logger.info("*" * 20 + "Training {}-fold...".format(fold_idx))
        model = load_model(config, args.model, embeddings, embeddings.shape[1],
                           len(y_train.unique()),
def main():
    l.write('# Loading and Setting Up Data')

    l.write('Loading Training Data')
    with s3_read('ml/data/news_classifier/train_data.json') as file:
        data = pd.read_json(file, orient="records")
        data = data[:1000]

    l.write('Loading embeddings')
    with s3_read('ml/glove_embeddings/glove.6B.100d.txt') as file:
        embeddings = data_utils.load_embeddings(file, embedding_dim=100)

    l.write('Preparing data')
    train_test_split = 0.95
    split_idx = math.floor(len(data) * train_test_split)
    train_data = data.iloc[0:split_idx]
    valid_data = data.iloc[split_idx:]

    encoding = WordEmbeddingEncoding(data, embeddings)
    encoding.prepare()

    train_dataset = WordTokenDataset(train_data, encoding)
    train_dataset.prepare()

    valid_dataset = WordTokenDataset(valid_data, encoding)
    valid_dataset.prepare()

    print('# Training the Model')
    hyperparams_list = [
        {'weighting': 'uniform', 'lr': 0.001, 'batch_size': 100},
        {'weighting': 'uniform', 'lr': 0.01, 'batch_size': 100},
        {'weighting': 'uniform', 'lr': 0.001, 'batch_size': 50},
        {'weighting': 'uniform', 'lr': 0.01, 'batch_size': 50},
    ]

    models = []
    train_losses_list = []
    valid_losses = []
    accepted_tokens = {t for t in embeddings.index}

    for i, hyperparams in enumerate(hyperparams_list):
        l.write(f'Model {i+1} / {len(hyperparams_list)}')
        start_time = time()

        batch_size = hyperparams['batch_size']
        lr = hyperparams['lr']
        weighting = hyperparams['weighting']

        # 1. Setup Data Loader
        data_loader = DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 collate_fn=data_utils.collate_samples)

        # 2. Create the Model
        model = Model(embeddings=embeddings,
                      n_classes=encoding.n_classes(),
                      weighting=weighting)

        # 3. Setup Criterion and Optimizer
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # 4. Train the Model
        train_losses = train(model, criterion, optimizer, train_dataset,
                             data_loader, epochs=EPOCHS)

        # 5. Calculate Validation Loss
        with torch.no_grad():
            valid_samples = valid_dataset[:]
            outputs = model(valid_samples)
            valid_loss = criterion(outputs, valid_samples.label)
            valid_losses.append(valid_loss)

        end_time = time()
        models.append(model)
        train_losses_list.append(train_losses)
        l.write(f'Model completed in {(end_time - start_time)/60:.02f}m.\n')

    l.write('# Results')
    uniform_mask = [hp['weighting'] == 'uniform' for hp in hyperparams_list]
    models = [m for i, m in enumerate(models) if uniform_mask[i]]
    train_losses_list = [
        losses for i, losses in enumerate(train_losses_list) if uniform_mask[i]
    ]
    valid_losses = [
        loss.item() for i, loss in enumerate(valid_losses) if uniform_mask[i]
    ]

    best_model_idx = valid_losses.index(min(valid_losses))
    best_model = models[best_model_idx]
    l.write(f'Best Model: {best_model_idx+1}')

    l.write('Computing Model Accuracy...')
    samples = valid_dataset[:]
    predictions = best_model.predict(samples)
    total = len(samples.label)
    correct = torch.sum(predictions == samples.label)
    l.write(f'Accuracy of Model: {(float(correct) / total)*100:.02f}%.')

    l.write('Persisting Models...')
    with s3_write('ml/models/news_classifier/glove_model.torch', 'b') as file:
        torch.save(best_model.state_dict(), file)

    l.write('Done!')
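# A minimal sketch of what a GloVe loader like data_utils.load_embeddings could look
# like, assuming it returns a pandas DataFrame indexed by token (the scripts above
# iterate embeddings.index and pass embeddings.values to torch.FloatTensor). The real
# helper is not shown here, so the name and behavior below are illustrative only.
import pandas as pd

def load_glove_embeddings(file, embedding_dim=100):
    tokens, vectors = [], []
    for line in file:
        parts = line.rstrip().split(" ")
        if len(parts) != embedding_dim + 1:
            continue  # skip malformed lines
        tokens.append(parts[0])
        vectors.append([float(v) for v in parts[1:]])
    # One row per token, one column per embedding dimension.
    return pd.DataFrame(vectors, index=tokens)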
import models
import train_utils
import helpers

##############################################################################
# Settings
##############################################################################
CUDA = False

##############################################################################
# Load the dataset
##############################################################################
Data = namedtuple("Data",
                  "corpus train dev test embeddings word_to_index")

data_utils.download_ask_ubuntu_dataset()

EMBEDDINGS, WORD_TO_INDEX = data_utils.load_embeddings()
CORPUS = data_utils.load_corpus(WORD_TO_INDEX)
TRAIN_DATA = data_utils.load_train_data()
DEV_DATA, TEST_DATA = data_utils.load_eval_data()

DATA = Data(CORPUS, TRAIN_DATA, DEV_DATA, TEST_DATA,
            EMBEDDINGS, WORD_TO_INDEX)

##############################################################################
# Train and evaluate the models for Part 1
##############################################################################
RESULTS = []
MARGINS = [0.2]
MAX_EPOCHS = 50
BATCH_SIZE = 32
FILTER_WIDTHS = [3]
POOL_METHOD = "average"
def main():
    print('Loading and Setting Up Data...')
    embeddings = data_utils.load_embeddings(
        './data/glove.6B/glove.6B.100d.txt', embedding_dim=100)
    data = pd.read_json('./data/train_data.json', orient='records')

    train_test_split = 0.95
    split_idx = math.floor(len(data) * train_test_split)
    train_data = data.iloc[0:split_idx]
    valid_data = data.iloc[split_idx:]

    encoding = WordEmbeddingEncoding(data, embeddings)
    encoding.prepare()

    train_dataset = WordTokenDataset(train_data, encoding)
    train_dataset.prepare()

    valid_dataset = WordTokenDataset(valid_data, encoding)
    valid_dataset.prepare()

    print('Creating Model...')
    hyperparams_list = [
        {'weighting': 'uniform', 'lr': 0.001, 'batch_size': 100},
        {'weighting': 'uniform', 'lr': 0.01, 'batch_size': 100},
        {'weighting': 'uniform', 'lr': 0.001, 'batch_size': 50},
        {'weighting': 'uniform', 'lr': 0.01, 'batch_size': 50},
        {'weighting': 'tf_idf', 'lr': 0.001, 'batch_size': 100},
        {'weighting': 'tf_idf', 'lr': 0.01, 'batch_size': 100},
        {'weighting': 'tf_idf', 'lr': 0.001, 'batch_size': 50},
        {'weighting': 'tf_idf', 'lr': 0.01, 'batch_size': 50},
    ]

    class Model(torch.nn.Module):
        def __init__(self, embeddings, n_classes, weighting):
            super(Model, self).__init__()
            self.weighting = weighting
            torch_embeddings = torch.FloatTensor(embeddings.values)
            self.embedding_bag = torch.nn.EmbeddingBag.from_pretrained(
                torch_embeddings, mode='sum')
            self.linear = torch.nn.Linear(self.embedding_bag.embedding_dim, n_classes)

        def forward(self, samples):
            if self.weighting == 'tf_idf':
                weights = samples.create_tf_idf_weights()
            else:
                weights = samples.create_uniform_weights()
            x = self.embedding_bag(samples.sequence, samples.offset,
                                   per_sample_weights=weights)
            output = self.linear(x)
            return output

        def predict(self, samples):
            with torch.no_grad():
                outputs = self(samples)
                predictions = torch.argmax(outputs, axis=1)
                return predictions

    print('Training the Model...')

    def train(model, criterion, optimizer, dataset, data_loader, epochs, log=True):
        train_losses = []
        for epoch in range(epochs):
            losses = []
            for i, samples in enumerate(data_loader):
                optimizer.zero_grad()
                output = model(samples)
                loss = criterion(output, samples.label)
                loss.backward()
                optimizer.step()
                losses.append(loss)

            train_loss = torch.mean(torch.stack(losses))
            train_losses.append(train_loss)

            if log and (epoch + 1) % 10 == 0:
                train_loss_estimator_size = 10000
                train_loss_estimator_start = max(
                    1, len(dataset) - train_loss_estimator_size)
                random_start = torch.randint(high=train_loss_estimator_start,
                                             size=(1, )).item()
                samples = dataset[random_start:(random_start +
                                                train_loss_estimator_size)]
                predictions = model.predict(samples)
                labels = samples.label
                total = len(labels)
                correct = torch.sum(labels == predictions)
                print(f'Epoch {epoch + 1}')
                print(f'Accuracy: {float(correct)/total*100:.02f}%.')
                print(f'Training Loss: {train_loss.item()}')
                print()

        return train_losses

    models = []
    train_losses_list = []
    valid_losses = []
    accepted_tokens = {t for t in embeddings.index}

    for i, hyperparams in enumerate(hyperparams_list):
        print(f'Starting training Model {i+1} / {len(hyperparams_list)}...')
        start_time = time()

        batch_size = hyperparams['batch_size']
        lr = hyperparams['lr']
        weighting = hyperparams['weighting']

        # 1. Setup Data Loader
        data_loader = DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 collate_fn=data_utils.collate_samples)

        # 2. Create the Model
        model = Model(embeddings=embeddings,
                      n_classes=encoding.n_classes(),
                      weighting=weighting)

        # 3. Setup Criterion and Optimizer
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # 4. Train the Model
        train_losses = train(model, criterion, optimizer, train_dataset,
                             data_loader, epochs=EPOCHS)

        # 5. Calculate Validation Loss
        with torch.no_grad():
            valid_samples = valid_dataset[:]
            outputs = model(valid_samples)
            valid_loss = criterion(outputs, valid_samples.label)
            valid_losses.append(valid_loss)

        end_time = time()
        models.append(model)
        train_losses_list.append(train_losses)
        print(f'Model completed in {(end_time - start_time)/60:.02f}m.')
        print()

    print('Checking Results...')
    uniform_mask = [hp['weighting'] == 'uniform' for hp in hyperparams_list]

    uniform_models = [m for i, m in enumerate(models) if uniform_mask[i]]
    uniform_train_losses_list = [
        losses for i, losses in enumerate(train_losses_list) if uniform_mask[i]
    ]
    uniform_valid_losses = [
        loss.item() for i, loss in enumerate(valid_losses) if uniform_mask[i]
    ]

    tf_idf_models = [m for i, m in enumerate(models) if not uniform_mask[i]]
    tf_idf_train_losses_list = [
        losses for i, losses in enumerate(train_losses_list) if not uniform_mask[i]
    ]
    tf_idf_valid_losses = [
        loss.item() for i, loss in enumerate(valid_losses) if not uniform_mask[i]
    ]

    best_uniform_model_idx = uniform_valid_losses.index(min(uniform_valid_losses))
    best_uniform_model = uniform_models[best_uniform_model_idx]

    best_tf_idf_model_idx = tf_idf_valid_losses.index(min(tf_idf_valid_losses))
    best_tf_idf_model = tf_idf_models[best_tf_idf_model_idx]

    print(f'Best Uniform Model: {best_uniform_model_idx+1}')
    print(f'Best TF-IDF Model: {best_tf_idf_model_idx+1}')

    print('Computing Uniform Model Accuracy...')
    samples = valid_dataset[:]
    uniform_predictions = best_uniform_model.predict(samples)
    total = len(samples.label)
    correct = torch.sum(uniform_predictions == samples.label)
    print(f'Accuracy of Uniform Model: {(float(correct) / total)*100:.02f}%.')

    print('Computing TF-IDF Model Accuracy...')
    tf_idf_predictions = best_tf_idf_model.predict(samples)
    total = len(samples.label)
    correct = torch.sum(tf_idf_predictions == samples.label)
    print(f'Accuracy of TF-IDF Model: {(float(correct) / total)*100:.02f}%.')

    print('Persisting Models...')
    torch.save(best_uniform_model.state_dict(), './models/uniform_glove_model.torch')
    torch.save(best_tf_idf_model.state_dict(), './models/tf_idf_model.torch')

    print('Done!')
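# Illustrative only: reloading one of the persisted models for inference. This assumes
# the Model class, embeddings DataFrame, and encoding object are available exactly as
# constructed in the script above; none of this reload code appears in the original.
reloaded = Model(embeddings=embeddings, n_classes=encoding.n_classes(), weighting='uniform')
reloaded.load_state_dict(torch.load('./models/uniform_glove_model.torch'))
reloaded.eval()  # switch to inference mode before calling reloaded.predict(...)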