def __init__(self, model, size, embedding_dim, vec_dir, ordered_words,
             type='glove', freeze=True):
    assert type in ('glove',), 'only GloVe supported'
    self.model = model.add_subcollection('PretrainedEmbedding')
    self.size = size
    self.embedding_dim = embedding_dim
    self.freeze = freeze
    self.embedding = self.model.lookup_parameters_from_numpy(
        load_glove(ordered_words, self.embedding_dim, vec_dir))
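# Usage sketch (not part of the original code): how the lookup above might be
# queried with DyNet. It assumes the enclosing class is a PretrainedEmbedding
# wrapper whose __init__ is shown above, and that load_glove returns a
# (size, embedding_dim) NumPy array aligned with ordered_words; the vocabulary
# and the 'glove/' directory below are purely illustrative.
import dynet as dy

pc = dy.ParameterCollection()
ordered_words = ['<pad>', '<unk>', 'the', 'cat']  # illustrative vocabulary
emb = PretrainedEmbedding(model=pc, size=len(ordered_words), embedding_dim=50,
                          vec_dir='glove/', ordered_words=ordered_words,
                          type='glove', freeze=True)

dy.renew_cg()
# Passing update=not freeze keeps the pretrained vectors fixed during backprop.
cat_vector = dy.lookup(emb.embedding, ordered_words.index('cat'),
                       update=not emb.freeze)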
def train(args):
    with open(args.train_data, 'rb') as f:
        train_dataset: SNLIDataset = pickle.load(f)
    with open(args.valid_data, 'rb') as f:
        valid_dataset: SNLIDataset = pickle.load(f)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True, num_workers=2,
                              collate_fn=train_dataset.collate,
                              pin_memory=True)
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=args.batch_size,
                              shuffle=False, num_workers=2,
                              collate_fn=valid_dataset.collate,
                              pin_memory=True)
    word_vocab = train_dataset.word_vocab
    label_vocab = train_dataset.label_vocab

    model = SNLIModel(num_classes=len(label_vocab), num_words=len(word_vocab),
                      word_dim=args.word_dim, hidden_dim=args.hidden_dim,
                      clf_hidden_dim=args.clf_hidden_dim,
                      clf_num_layers=args.clf_num_layers,
                      use_leaf_rnn=args.leaf_rnn,
                      use_batchnorm=args.batchnorm,
                      intra_attention=args.intra_attention,
                      dropout_prob=args.dropout)
    if args.glove:
        logging.info('Loading GloVe pretrained vectors...')
        model.word_embedding.weight.data.zero_()
        glove_weight = load_glove(
            path=args.glove, vocab=word_vocab,
            init_weight=model.word_embedding.weight.data.numpy())
        glove_weight[word_vocab.pad_id] = 0
        model.word_embedding.weight.data.set_(torch.FloatTensor(glove_weight))
    if args.fix_word_embedding:
        logging.info('Will not update word embeddings')
        model.word_embedding.weight.requires_grad = False
    if args.gpu > -1:
        logging.info(f'Using GPU {args.gpu}')
        model.cuda(args.gpu)
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(params=params)
    criterion = nn.CrossEntropyLoss()
    train_summary_writer = tensorboard.FileWriter(
        logdir=os.path.join(args.save_dir, 'log', 'train'), flush_secs=10)
    valid_summary_writer = tensorboard.FileWriter(
        logdir=os.path.join(args.save_dir, 'log', 'valid'), flush_secs=10)

    def run_iter(batch, is_training):
        model.train(is_training)
        pre = wrap_with_variable(batch['pre'], volatile=not is_training,
                                 gpu=args.gpu)
        hyp = wrap_with_variable(batch['hyp'], volatile=not is_training,
                                 gpu=args.gpu)
        pre_length = wrap_with_variable(batch['pre_length'],
                                        volatile=not is_training, gpu=args.gpu)
        hyp_length = wrap_with_variable(batch['hyp_length'],
                                        volatile=not is_training, gpu=args.gpu)
        label = wrap_with_variable(batch['label'], volatile=not is_training,
                                   gpu=args.gpu)
        logits = model(pre=pre, pre_length=pre_length,
                       hyp=hyp, hyp_length=hyp_length)
        label_pred = logits.max(1)[1]
        accuracy = torch.eq(label, label_pred).float().mean()
        loss = criterion(input=logits, target=label)
        if is_training:
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(parameters=params, max_norm=5)
            optimizer.step()
        return loss, accuracy

    def add_scalar_summary(summary_writer, name, value, step):
        value = unwrap_scalar_variable(value)
        summ = summary.scalar(name=name, scalar=value)
        summary_writer.add_summary(summary=summ, global_step=step)

    num_train_batches = len(train_loader)
    validate_every = num_train_batches // 10
    best_valid_accuracy = 0
    iter_count = 0
    for epoch_num in range(1, args.max_epoch + 1):
        logging.info(f'Epoch {epoch_num}: start')
        for batch_iter, train_batch in enumerate(train_loader):
            if args.anneal_temperature and iter_count % 500 == 0:
                gamma = 0.00001
                new_temperature = max([0.5, math.exp(-gamma * iter_count)])
                model.encoder.gumbel_temperature = new_temperature
                logging.info(f'Iter #{iter_count}: '
                             f'Set Gumbel temperature to {new_temperature:.4f}')
            train_loss, train_accuracy = run_iter(batch=train_batch,
                                                  is_training=True)
            iter_count += 1
            add_scalar_summary(summary_writer=train_summary_writer,
                               name='loss', value=train_loss, step=iter_count)
            add_scalar_summary(summary_writer=train_summary_writer,
                               name='accuracy', value=train_accuracy,
                               step=iter_count)
            if (batch_iter + 1) % validate_every == 0:
                valid_loss_sum = valid_accuracy_sum = 0
                num_valid_batches = len(valid_loader)
                for valid_batch in valid_loader:
                    valid_loss, valid_accuracy = run_iter(batch=valid_batch,
                                                          is_training=False)
                    valid_loss_sum += unwrap_scalar_variable(valid_loss)
                    valid_accuracy_sum += unwrap_scalar_variable(valid_accuracy)
                valid_loss = valid_loss_sum / num_valid_batches
                valid_accuracy = valid_accuracy_sum / num_valid_batches
                add_scalar_summary(summary_writer=valid_summary_writer,
                                   name='loss', value=valid_loss,
                                   step=iter_count)
                add_scalar_summary(summary_writer=valid_summary_writer,
                                   name='accuracy', value=valid_accuracy,
                                   step=iter_count)
                progress = epoch_num + batch_iter / num_train_batches
                logging.info(f'Epoch {progress:.2f}: '
                             f'valid loss = {valid_loss:.4f}, '
                             f'valid accuracy = {valid_accuracy:.4f}')
                if valid_accuracy > best_valid_accuracy:
                    best_valid_accuracy = valid_accuracy
                    model_filename = (f'model-{progress:.2f}'
                                      f'-{valid_loss:.4f}'
                                      f'-{valid_accuracy:.4f}.pkl')
                    model_path = os.path.join(args.save_dir, model_filename)
                    torch.save(model.state_dict(), model_path)
                    print(f'Saved the new best model to {model_path}')
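# The training loop above relies on two helpers, wrap_with_variable and
# unwrap_scalar_variable, that are not shown in this section. Below is a
# minimal sketch of what they presumably do under the legacy (pre-0.4) PyTorch
# Variable API; the actual implementations in the original codebase may differ.
from torch.autograd import Variable


def wrap_with_variable(tensor, volatile, gpu):
    # Wrap a plain tensor in a Variable; volatile=True disables graph
    # construction for inference-only passes in pre-0.4 PyTorch.
    if gpu > -1:
        tensor = tensor.cuda(gpu)
    return Variable(tensor, volatile=volatile)


def unwrap_scalar_variable(var):
    # Pull the Python number out of a one-element Variable (pass through otherwise).
    if isinstance(var, Variable):
        return var.data[0]
    return var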
def train(args):
    with open(args.train_data, 'rb') as f:
        train_dataset: SNLIDataset = pickle.load(f)
    with open(args.valid_data, 'rb') as f:
        valid_dataset: SNLIDataset = pickle.load(f)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True, num_workers=2,
                              collate_fn=train_dataset.collate,
                              pin_memory=True)
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=args.batch_size,
                              shuffle=False, num_workers=2,
                              collate_fn=valid_dataset.collate,
                              pin_memory=True)
    word_vocab = train_dataset.word_vocab
    label_vocab = train_dataset.label_vocab

    model = SNLIModel(num_classes=len(label_vocab), num_words=len(word_vocab),
                      word_dim=args.word_dim, hidden_dim=args.hidden_dim,
                      clf_hidden_dim=args.clf_hidden_dim,
                      clf_num_layers=args.clf_num_layers,
                      use_leaf_rnn=args.leaf_rnn,
                      use_batchnorm=args.batchnorm,
                      intra_attention=args.intra_attention,
                      dropout_prob=args.dropout,
                      bidirectional=args.bidirectional)
    if args.glove:
        logging.info('Loading GloVe pretrained vectors...')
        glove_weight = load_glove(
            path=args.glove, vocab=word_vocab,
            init_weight=model.word_embedding.weight.data.numpy())
        glove_weight[word_vocab.pad_id] = 0
        model.word_embedding.weight.data.set_(torch.FloatTensor(glove_weight))
    if args.fix_word_embedding:
        logging.info('Will not update word embeddings')
        model.word_embedding.weight.requires_grad = False
    model.to(args.device)
    logging.info(f'Using device {args.device}')
    if args.optimizer == 'adam':
        optimizer_class = optim.Adam
    elif args.optimizer == 'adagrad':
        optimizer_class = optim.Adagrad
    elif args.optimizer == 'adadelta':
        optimizer_class = optim.Adadelta
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optimizer_class(params=params, weight_decay=args.l2reg)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='max',
                                               factor=0.5, patience=10,
                                               verbose=True)
    criterion = nn.CrossEntropyLoss()
    train_summary_writer = SummaryWriter(
        log_dir=os.path.join(args.save_dir, 'log', 'train'))
    valid_summary_writer = SummaryWriter(
        log_dir=os.path.join(args.save_dir, 'log', 'valid'))

    def run_iter(batch, is_training):
        model.train(is_training)
        pre = batch['pre'].to(args.device)
        hyp = batch['hyp'].to(args.device)
        pre_length = batch['pre_length'].to(args.device)
        hyp_length = batch['hyp_length'].to(args.device)
        label = batch['label'].to(args.device)
        logits = model(pre=pre, pre_length=pre_length,
                       hyp=hyp, hyp_length=hyp_length)
        label_pred = logits.max(1)[1]
        accuracy = torch.eq(label, label_pred).float().mean()
        loss = criterion(input=logits, target=label)
        if is_training:
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(parameters=params, max_norm=5)
            optimizer.step()
        return loss, accuracy

    def add_scalar_summary(summary_writer, name, value, step):
        if torch.is_tensor(value):
            value = value.item()
        summary_writer.add_scalar(tag=name, scalar_value=value,
                                  global_step=step)

    num_train_batches = len(train_loader)
    validate_every = num_train_batches // 10
    best_valid_accuracy = 0
    iter_count = 0
    for epoch_num in range(args.max_epoch):
        logging.info(f'Epoch {epoch_num}: start')
        for batch_iter, train_batch in enumerate(train_loader):
            if iter_count % args.anneal_temperature_every == 0:
                rate = args.anneal_temperature_rate
                new_temperature = max([0.5, math.exp(-rate * iter_count)])
                model.encoder.gumbel_temperature = new_temperature
                logging.info(f'Iter #{iter_count}: '
                             f'Set Gumbel temperature to {new_temperature:.4f}')
            train_loss, train_accuracy = run_iter(batch=train_batch,
                                                  is_training=True)
            iter_count += 1
            add_scalar_summary(summary_writer=train_summary_writer,
                               name='loss', value=train_loss, step=iter_count)
            add_scalar_summary(summary_writer=train_summary_writer,
                               name='accuracy', value=train_accuracy,
                               step=iter_count)
            if (batch_iter + 1) % validate_every == 0:
                torch.set_grad_enabled(False)
                valid_loss_sum = valid_accuracy_sum = 0
                num_valid_batches = len(valid_loader)
                for valid_batch in valid_loader:
                    valid_loss, valid_accuracy = run_iter(batch=valid_batch,
                                                          is_training=False)
                    valid_loss_sum += valid_loss.item()
                    valid_accuracy_sum += valid_accuracy.item()
                torch.set_grad_enabled(True)
                valid_loss = valid_loss_sum / num_valid_batches
                valid_accuracy = valid_accuracy_sum / num_valid_batches
                scheduler.step(valid_accuracy)
                add_scalar_summary(summary_writer=valid_summary_writer,
                                   name='loss', value=valid_loss,
                                   step=iter_count)
                add_scalar_summary(summary_writer=valid_summary_writer,
                                   name='accuracy', value=valid_accuracy,
                                   step=iter_count)
                progress = epoch_num + batch_iter / num_train_batches
                logging.info(f'Epoch {progress:.2f}: '
                             f'valid loss = {valid_loss:.4f}, '
                             f'valid accuracy = {valid_accuracy:.4f}')
                if valid_accuracy > best_valid_accuracy:
                    best_valid_accuracy = valid_accuracy
                    model_filename = (f'model-{progress:.2f}'
                                      f'-{valid_loss:.4f}'
                                      f'-{valid_accuracy:.4f}.pkl')
                    model_path = os.path.join(args.save_dir, model_filename)
                    torch.save(model.state_dict(), model_path)
                    print(f'Saved the new best model to {model_path}')
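# For reference, a sketch of how the command-line flags read by the newer
# train() above might be declared. Only the attribute names are taken from the
# code; the flag spellings and defaults below are assumptions, not the
# project's actual CLI.
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    # Data and checkpoints
    parser.add_argument('--train-data', required=True)
    parser.add_argument('--valid-data', required=True)
    parser.add_argument('--save-dir', required=True)
    parser.add_argument('--glove', default=None)
    # Model
    parser.add_argument('--word-dim', type=int, default=300)
    parser.add_argument('--hidden-dim', type=int, default=300)
    parser.add_argument('--clf-hidden-dim', type=int, default=1024)
    parser.add_argument('--clf-num-layers', type=int, default=1)
    parser.add_argument('--leaf-rnn', action='store_true')
    parser.add_argument('--bidirectional', action='store_true')
    parser.add_argument('--batchnorm', action='store_true')
    parser.add_argument('--intra-attention', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--fix-word-embedding', action='store_true')
    # Optimization
    parser.add_argument('--optimizer', default='adam',
                        choices=('adam', 'adagrad', 'adadelta'))
    parser.add_argument('--l2reg', type=float, default=0.0)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--max-epoch', type=int, default=10)
    parser.add_argument('--device', default='cuda:0')
    parser.add_argument('--anneal-temperature-every', type=int, default=1000)
    parser.add_argument('--anneal-temperature-rate', type=float, default=1e-5)
    return parser.parse_args()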
def prepare_answer(train_doc, train_answer, train_candidates,
                   test_doc, test_answer, test_candidates,
                   val_doc=None, val_answer=None, val_candidates=None,
                   max_document_length=1000, max_answer_length=20,
                   max_vocabulary_size=50000, embeddings_size=50):
    """
    Prepares a dataset for use by a question-answer like model.

    This version uses the candidates generated previously for the training,
    test and validation sets as the candidate KPs for all three sets.

    :param train_doc: the training documents
    :param train_answer: the KPs for the training documents
    :param train_candidates: the candidate KPs for the training documents
    :param test_doc: the test documents
    :param test_answer: the KPs for the test documents
    :param test_candidates: the candidate KPs for the test documents
    :param val_doc: the validation documents (can be None)
    :param val_answer: the KPs for the validation documents (can be None)
    :param val_candidates: the candidate KPs for the validation documents (can be None)
    :param max_document_length: the maximum length of the documents (longer documents will be truncated!)
    :param max_answer_length: the maximum length of the answers (longer answers will be truncated!)
    :param max_vocabulary_size: the maximum size of the vocabulary to use
           (i.e. we keep only the top max_vocabulary_size words)
    :param embeddings_size: the size of the GloVe embeddings to use
    :return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix, dictionary)
             containing the training, test and validation sets, an embedding matrix for an
             Embedding layer, and the fitted dictionary
    """

    # Prepare validation return data
    val_q = None
    val_a = None
    val_y = None

    # Prepare the return values: lists that will hold questions (documents),
    # answers (keyphrases), and truth values
    train_q = []
    test_q = []
    train_a = []
    test_a = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_q = []
        val_a = []
        val_y = []

    documents_full = []
    for key, doc in train_doc.items():
        documents_full.append(token for token in doc)
    for key, doc in test_doc.items():
        documents_full.append(token for token in doc)
    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append(token for token in doc)

    logging.debug("Fitting dictionary on %s documents..." % len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens"
                  % len(dictionary.word_index))

    # Pair up each document with a candidate keyphrase and its truth value
    for key, document in train_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in train_candidates[key]:
            train_q.append(doc_sequence)
            train_a.append(dictionary.token_list_to_sequence(kp))
            train_y.append([0, 1] if kp in train_answer[key] else [1, 0])

    for key, document in test_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in test_candidates[key]:
            test_q.append(doc_sequence)
            test_a.append(dictionary.token_list_to_sequence(kp))
            test_y.append([0, 1] if kp in test_answer[key] else [1, 0])

    if val_doc and val_answer:
        for key, document in val_doc.items():
            doc_sequence = dictionary.token_list_to_sequence(document)
            for kp in val_candidates[key]:
                val_q.append(doc_sequence)
                val_a.append(dictionary.token_list_to_sequence(kp))
                val_y.append([0, 1] if kp in val_answer[key] else [1, 0])

    logging.debug("Longest training document   : %s tokens" % len(max(train_q, key=len)))
    logging.debug("Longest training answer     : %s tokens" % len(max(train_a, key=len)))
    logging.debug("Longest test document       : %s tokens" % len(max(test_q, key=len)))
    logging.debug("Longest test answer         : %s tokens" % len(max(test_a, key=len)))
    if val_doc and val_answer:
        logging.debug("Longest validation document : %s tokens" % len(max(val_q, key=len)))
        logging.debug("Longest validation answer   : %s tokens" % len(max(val_a, key=len)))

    train_q = np.asarray(pad_sequences(train_q, maxlen=max_document_length,
                                       padding='post', truncating='post'))
    train_a = np.asarray(pad_sequences(train_a, maxlen=max_answer_length,
                                       padding='post', truncating='post'))
    test_q = np.asarray(pad_sequences(test_q, maxlen=max_document_length,
                                      padding='post', truncating='post'))
    test_a = np.asarray(pad_sequences(test_a, maxlen=max_answer_length,
                                      padding='post', truncating='post'))
    if val_doc and val_answer:
        val_q = np.asarray(pad_sequences(val_q, maxlen=max_document_length,
                                         padding='post', truncating='post'))
        val_a = np.asarray(pad_sequences(val_a, maxlen=max_answer_length,
                                         padding='post', truncating='post'))

    logging.debug("Training set documents size   : %s", np.shape(train_q))
    logging.debug("Training set answers size     : %s", np.shape(train_a))
    logging.debug("Test set documents size       : %s", np.shape(test_q))
    logging.debug("Test set answers size         : %s", np.shape(test_a))
    if val_doc and val_answer:
        logging.debug("Validation set documents size : %s", np.shape(val_q))
        logging.debug("Validation set answers size   : %s", np.shape(val_a))

    # Prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..."
                  % (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector

    return [train_q, train_a], train_y, [test_q, test_a], test_y, \
        [val_q, val_a], val_y, embedding_matrix, dictionary
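# Toy usage sketch (invented data) showing the shapes prepare_answer expects:
# documents, gold keyphrases and candidate keyphrases are dictionaries keyed by
# document id, with documents as token lists and keyphrases as lists of token
# lists. Running it assumes the repo's dict (Dictionary), glove and
# pad_sequences dependencies are importable.
train_doc = {'doc1': ['neural', 'keyphrase', 'extraction', 'with', 'deep', 'learning']}
train_answer = {'doc1': [['keyphrase', 'extraction'], ['deep', 'learning']]}
train_candidates = {'doc1': [['keyphrase', 'extraction'],
                             ['deep', 'learning'],
                             ['neural', 'keyphrase']]}  # last one is a negative example

test_doc = {'doc2': ['recurrent', 'networks', 'for', 'keyphrase', 'extraction']}
test_answer = {'doc2': [['keyphrase', 'extraction']]}
test_candidates = {'doc2': [['keyphrase', 'extraction'], ['recurrent', 'networks']]}

(train_x, train_y, test_x, test_y,
 val_x, val_y, embedding_matrix, dictionary) = prepare_answer(
    train_doc, train_answer, train_candidates,
    test_doc, test_answer, test_candidates,
    max_document_length=100, max_answer_length=5,
    max_vocabulary_size=1000, embeddings_size=50)

# train_x is [documents, answers]: arrays of shape (num_pairs, max_document_length)
# and (num_pairs, max_answer_length); train_y holds one [1, 0] / [0, 1] label per pair.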
def prepare_sequential(train_doc, train_answer, test_doc, test_answer,
                       val_doc, val_answer,
                       max_document_length=1000, max_vocabulary_size=50000,
                       embeddings_size=50, stem_test=False):
    """
    Prepares a dataset for use by a sequential, categorical model.

    :param train_doc: the training documents
    :param train_answer: the KPs for the training documents
    :param test_doc: the test documents
    :param test_answer: the KPs for the test documents
    :param val_doc: the validation documents (can be None)
    :param val_answer: the KPs for the validation documents (can be None)
    :param max_document_length: the maximum length of the documents (longer documents will be truncated!)
    :param max_vocabulary_size: the maximum size of the vocabulary to use
           (i.e. we keep only the top max_vocabulary_size words)
    :param embeddings_size: the size of the GloVe embeddings to use
    :param stem_test: set to True if the test set answers are stemmed
    :return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix)
             containing the training, test and validation sets, and an embedding matrix
             for an Embedding layer
    """

    train_answer_seq = make_sequential(train_doc, train_answer)

    if not stem_test:
        test_answer_seq = make_sequential(test_doc, test_answer)
    else:
        import copy
        stemmed_test_doc = copy.deepcopy(test_doc)
        stemmed_test_doc = stem_dataset(stemmed_test_doc)
        test_answer_seq = make_sequential(stemmed_test_doc, test_answer)

    # Prepare validation return data
    val_x = None
    val_y = None

    if val_doc and val_answer:
        val_answer_seq = make_sequential(val_doc, val_answer)

    # Transform the documents to sequences
    documents_full = []
    train_y = []
    test_y = []
    if val_doc and val_answer:
        val_y = []

    for key, doc in train_doc.items():
        documents_full.append(token for token in doc)
        train_y.append(train_answer_seq[key])
    for key, doc in test_doc.items():
        documents_full.append(token for token in doc)
        test_y.append(test_answer_seq[key])
    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append(token for token in doc)
            val_y.append(val_answer_seq[key])

    logging.debug("Fitting dictionary on %s documents..." % len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens"
                  % len(dictionary.word_index))

    # Now we can prepare the actual input
    train_x = dictionary.texts_to_sequences(train_doc.values())
    test_x = dictionary.texts_to_sequences(test_doc.values())
    if val_doc and val_answer:
        val_x = dictionary.texts_to_sequences(val_doc.values())

    logging.debug("Longest training document   : %s tokens" % len(max(train_x, key=len)))
    logging.debug("Longest test document       : %s tokens" % len(max(test_x, key=len)))
    if val_doc and val_answer:
        logging.debug("Longest validation document : %s tokens" % len(max(val_x, key=len)))

    train_x = np.asarray(pad_sequences(train_x, maxlen=max_document_length,
                                       padding='post', truncating='post'))
    train_y = pad_sequences(train_y, maxlen=max_document_length,
                            padding='post', truncating='post')
    train_y = make_categorical(train_y)

    test_x = np.asarray(pad_sequences(test_x, maxlen=max_document_length,
                                      padding='post', truncating='post'))
    test_y = pad_sequences(test_y, maxlen=max_document_length,
                           padding='post', truncating='post')
    test_y = make_categorical(test_y)

    if val_doc and val_answer:
        val_x = np.asarray(pad_sequences(val_x, maxlen=max_document_length,
                                         padding='post', truncating='post'))
        val_y = pad_sequences(val_y, maxlen=max_document_length,
                              padding='post', truncating='post')
        val_y = make_categorical(val_y)

    logging.debug("Training set samples size   : %s", np.shape(train_x))
    logging.debug("Training set answers size   : %s", np.shape(train_y))
    logging.debug("Test set samples size       : %s", np.shape(test_x))
    logging.debug("Test set answers size       : %s", np.shape(test_y))
    if val_doc and val_answer:
        logging.debug("Validation set samples size : %s", np.shape(val_x))
        logging.debug("Validation set answers size : %s", np.shape(val_y))

    # Prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..."
                  % (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector

    return train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix
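# Toy usage sketch (invented data) for prepare_sequential. Here the answers are
# turned into per-token label sequences by make_sequential, so train_y / test_y
# come back as one categorical label per document token, padded or truncated to
# max_document_length alongside the inputs.
train_doc = {'doc1': ['neural', 'keyphrase', 'extraction', 'with', 'deep', 'learning']}
train_answer = {'doc1': [['keyphrase', 'extraction'], ['deep', 'learning']]}

test_doc = {'doc2': ['recurrent', 'networks', 'for', 'keyphrase', 'extraction']}
test_answer = {'doc2': [['keyphrase', 'extraction']]}

train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix = \
    prepare_sequential(train_doc, train_answer, test_doc, test_answer,
                       val_doc=None, val_answer=None,
                       max_document_length=100, max_vocabulary_size=1000,
                       embeddings_size=50)

# train_x: (num_train_docs, max_document_length) matrix of word indices
# train_y: per-token categorical labels aligned with train_x
# embedding_matrix: (vocabulary_size, embeddings_size) GloVe weights for an Embedding layer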