Example #1
    def __init__(self,
                 model,
                 size,
                 embedding_dim,
                 vec_dir,
                 ordered_words,
                 type='glove',
                 freeze=True):
        assert type in ('glove', ), 'only GloVe supported'

        # Keep the pretrained vectors in their own DyNet subcollection so they
        # are grouped separately from the rest of the model's parameters.
        self.model = model.add_subcollection('PretrainedEmbedding')

        self.size = size
        self.embedding_dim = embedding_dim
        # freeze marks whether the GloVe vectors should stay fixed during training.
        self.freeze = freeze
        self.embedding = self.model.lookup_parameters_from_numpy(
            load_glove(ordered_words, self.embedding_dim, vec_dir))
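A hedged usage sketch for the fragment above, assuming DyNet (ParameterCollection, lookup_parameters_from_numpy, dy.lookup) and taking PretrainedEmbedding to be the class that owns this __init__; load_glove is assumed to return a numpy array of shape (len(ordered_words), embedding_dim), and the vocabulary and paths below are illustrative only.

import dynet as dy

pc = dy.ParameterCollection()
vocab_words = ['<pad>', '<unk>', 'the', 'cat']      # illustrative vocabulary
emb = PretrainedEmbedding(model=pc,
                          size=len(vocab_words),
                          embedding_dim=300,
                          vec_dir='glove/',          # assumed GloVe directory
                          ordered_words=vocab_words,
                          type='glove',
                          freeze=True)
# dy.lookup's update flag is one conventional way to honour freeze at lookup time.
vec = dy.lookup(emb.embedding, vocab_words.index('cat'), update=not emb.freeze)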
Example #2
def train(args):
    with open(args.train_data, 'rb') as f:
        train_dataset: SNLIDataset = pickle.load(f)
    with open(args.valid_data, 'rb') as f:
        valid_dataset: SNLIDataset = pickle.load(f)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=2,
                              collate_fn=train_dataset.collate,
                              pin_memory=True)
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=2,
                              collate_fn=valid_dataset.collate,
                              pin_memory=True)
    word_vocab = train_dataset.word_vocab
    label_vocab = train_dataset.label_vocab

    model = SNLIModel(num_classes=len(label_vocab),
                      num_words=len(word_vocab),
                      word_dim=args.word_dim,
                      hidden_dim=args.hidden_dim,
                      clf_hidden_dim=args.clf_hidden_dim,
                      clf_num_layers=args.clf_num_layers,
                      use_leaf_rnn=args.leaf_rnn,
                      use_batchnorm=args.batchnorm,
                      intra_attention=args.intra_attention,
                      dropout_prob=args.dropout)
    if args.glove:
        logging.info('Loading GloVe pretrained vectors...')
        model.word_embedding.weight.data.zero_()
        glove_weight = load_glove(
            path=args.glove,
            vocab=word_vocab,
            init_weight=model.word_embedding.weight.data.numpy())
        glove_weight[word_vocab.pad_id] = 0
        model.word_embedding.weight.data.set_(torch.FloatTensor(glove_weight))
    if args.fix_word_embedding:
        logging.info('Will not update word embeddings')
        model.word_embedding.weight.requires_grad = False
    if args.gpu > -1:
        logging.info(f'Using GPU {args.gpu}')
        model.cuda(args.gpu)
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(params=params)
    criterion = nn.CrossEntropyLoss()

    train_summary_writer = tensorboard.FileWriter(logdir=os.path.join(
        args.save_dir, 'log', 'train'),
                                                  flush_secs=10)
    valid_summary_writer = tensorboard.FileWriter(logdir=os.path.join(
        args.save_dir, 'log', 'valid'),
                                                  flush_secs=10)

    def run_iter(batch, is_training):
        model.train(is_training)
        pre = wrap_with_variable(batch['pre'],
                                 volatile=not is_training,
                                 gpu=args.gpu)
        hyp = wrap_with_variable(batch['hyp'],
                                 volatile=not is_training,
                                 gpu=args.gpu)
        pre_length = wrap_with_variable(batch['pre_length'],
                                        volatile=not is_training,
                                        gpu=args.gpu)
        hyp_length = wrap_with_variable(batch['hyp_length'],
                                        volatile=not is_training,
                                        gpu=args.gpu)
        label = wrap_with_variable(batch['label'],
                                   volatile=not is_training,
                                   gpu=args.gpu)
        logits = model(pre=pre,
                       pre_length=pre_length,
                       hyp=hyp,
                       hyp_length=hyp_length)
        label_pred = logits.max(1)[1]
        accuracy = torch.eq(label, label_pred).float().mean()
        loss = criterion(input=logits, target=label)
        if is_training:
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(parameters=params, max_norm=5)
            optimizer.step()
        return loss, accuracy

    def add_scalar_summary(summary_writer, name, value, step):
        value = unwrap_scalar_variable(value)
        summ = summary.scalar(name=name, scalar=value)
        summary_writer.add_summary(summary=summ, global_step=step)

    num_train_batches = len(train_loader)
    validate_every = num_train_batches // 10
    best_valid_accuracy = 0
    iter_count = 0
    for epoch_num in range(1, args.max_epoch + 1):
        logging.info(f'Epoch {epoch_num}: start')
        for batch_iter, train_batch in enumerate(train_loader):
            if args.anneal_temperature and iter_count % 500 == 0:
                gamma = 0.00001
                new_temperature = max([0.5, math.exp(-gamma * iter_count)])
                model.encoder.gumbel_temperature = new_temperature
                logging.info(
                    f'Iter #{iter_count}: '
                    f'Set Gumbel temperature to {new_temperature:.4f}')
            train_loss, train_accuracy = run_iter(batch=train_batch,
                                                  is_training=True)
            iter_count += 1
            add_scalar_summary(summary_writer=train_summary_writer,
                               name='loss',
                               value=train_loss,
                               step=iter_count)
            add_scalar_summary(summary_writer=train_summary_writer,
                               name='accuracy',
                               value=train_accuracy,
                               step=iter_count)

            if (batch_iter + 1) % validate_every == 0:
                valid_loss_sum = valid_accuracy_sum = 0
                num_valid_batches = len(valid_loader)
                for valid_batch in valid_loader:
                    valid_loss, valid_accuracy = run_iter(batch=valid_batch,
                                                          is_training=False)
                    valid_loss_sum += unwrap_scalar_variable(valid_loss)
                    valid_accuracy_sum += unwrap_scalar_variable(
                        valid_accuracy)
                valid_loss = valid_loss_sum / num_valid_batches
                valid_accuracy = valid_accuracy_sum / num_valid_batches
                add_scalar_summary(summary_writer=valid_summary_writer,
                                   name='loss',
                                   value=valid_loss,
                                   step=iter_count)
                add_scalar_summary(summary_writer=valid_summary_writer,
                                   name='accuracy',
                                   value=valid_accuracy,
                                   step=iter_count)
                progress = epoch_num + batch_iter / num_train_batches
                logging.info(f'Epoch {progress:.2f}: '
                             f'valid loss = {valid_loss:.4f}, '
                             f'valid accuracy = {valid_accuracy:.4f}')
                if valid_accuracy > best_valid_accuracy:
                    best_valid_accuracy = valid_accuracy
                    model_filename = (f'model-{progress:.2f}'
                                      f'-{valid_loss:.4f}'
                                      f'-{valid_accuracy:.4f}.pkl')
                    model_path = os.path.join(args.save_dir, model_filename)
                    torch.save(model.state_dict(), model_path)
                    print(f'Saved the new best model to {model_path}')
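The loop above targets the pre-0.4 PyTorch Variable API through two project helpers, wrap_with_variable and unwrap_scalar_variable. Below is a minimal sketch of what such helpers conventionally look like under that old API; the project's actual implementations may differ.

from torch.autograd import Variable

def wrap_with_variable(tensor, volatile, gpu):
    # volatile=True skips graph construction for inference-only passes (pre-0.4 API).
    if gpu > -1:
        tensor = tensor.cuda(gpu)
    return Variable(tensor, volatile=volatile)

def unwrap_scalar_variable(var):
    # Accept either a one-element Variable or a plain Python number.
    if isinstance(var, Variable):
        return var.data[0]
    return var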
Example #3
def train(args):
    with open(args.train_data, 'rb') as f:
        train_dataset: SNLIDataset = pickle.load(f)
    with open(args.valid_data, 'rb') as f:
        valid_dataset: SNLIDataset = pickle.load(f)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=2,
                              collate_fn=train_dataset.collate,
                              pin_memory=True)
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=2,
                              collate_fn=valid_dataset.collate,
                              pin_memory=True)
    word_vocab = train_dataset.word_vocab
    label_vocab = train_dataset.label_vocab

    model = SNLIModel(num_classes=len(label_vocab),
                      num_words=len(word_vocab),
                      word_dim=args.word_dim,
                      hidden_dim=args.hidden_dim,
                      clf_hidden_dim=args.clf_hidden_dim,
                      clf_num_layers=args.clf_num_layers,
                      use_leaf_rnn=args.leaf_rnn,
                      use_batchnorm=args.batchnorm,
                      intra_attention=args.intra_attention,
                      dropout_prob=args.dropout,
                      bidirectional=args.bidirectional)
    if args.glove:
        logging.info('Loading GloVe pretrained vectors...')
        glove_weight = load_glove(
            path=args.glove,
            vocab=word_vocab,
            init_weight=model.word_embedding.weight.data.numpy())
        glove_weight[word_vocab.pad_id] = 0
        model.word_embedding.weight.data.set_(torch.FloatTensor(glove_weight))
    if args.fix_word_embedding:
        logging.info('Will not update word embeddings')
        model.word_embedding.weight.requires_grad = False
    model.to(args.device)
    logging.info(f'Using device {args.device}')
    if args.optimizer == 'adam':
        optimizer_class = optim.Adam
    elif args.optimizer == 'adagrad':
        optimizer_class = optim.Adagrad
    elif args.optimizer == 'adadelta':
        optimizer_class = optim.Adadelta
    else:
        raise ValueError(f'Unknown optimizer: {args.optimizer}')
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optimizer_class(params=params, weight_decay=args.l2reg)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                               mode='max',
                                               factor=0.5,
                                               patience=10,
                                               verbose=True)
    criterion = nn.CrossEntropyLoss()

    train_summary_writer = SummaryWriter(
        log_dir=os.path.join(args.save_dir, 'log', 'train'))
    valid_summary_writer = SummaryWriter(
        log_dir=os.path.join(args.save_dir, 'log', 'valid'))

    def run_iter(batch, is_training):
        model.train(is_training)
        pre = batch['pre'].to(args.device)
        hyp = batch['hyp'].to(args.device)
        pre_length = batch['pre_length'].to(args.device)
        hyp_length = batch['hyp_length'].to(args.device)
        label = batch['label'].to(args.device)
        logits = model(pre=pre,
                       pre_length=pre_length,
                       hyp=hyp,
                       hyp_length=hyp_length)
        label_pred = logits.max(1)[1]
        accuracy = torch.eq(label, label_pred).float().mean()
        loss = criterion(input=logits, target=label)
        if is_training:
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(parameters=params, max_norm=5)
            optimizer.step()
        return loss, accuracy

    def add_scalar_summary(summary_writer, name, value, step):
        if torch.is_tensor(value):
            value = value.item()
        summary_writer.add_scalar(tag=name,
                                  scalar_value=value,
                                  global_step=step)

    num_train_batches = len(train_loader)
    validate_every = num_train_batches // 10
    best_valid_accuracy = 0
    iter_count = 0
    for epoch_num in range(args.max_epoch):
        logging.info(f'Epoch {epoch_num}: start')
        for batch_iter, train_batch in enumerate(train_loader):
            if iter_count % args.anneal_temperature_every == 0:
                rate = args.anneal_temperature_rate
                new_temperature = max([0.5, math.exp(-rate * iter_count)])
                model.encoder.gumbel_temperature = new_temperature
                logging.info(
                    f'Iter #{iter_count}: '
                    f'Set Gumbel temperature to {new_temperature:.4f}')
            train_loss, train_accuracy = run_iter(batch=train_batch,
                                                  is_training=True)
            iter_count += 1
            add_scalar_summary(summary_writer=train_summary_writer,
                               name='loss',
                               value=train_loss,
                               step=iter_count)
            add_scalar_summary(summary_writer=train_summary_writer,
                               name='accuracy',
                               value=train_accuracy,
                               step=iter_count)

            if (batch_iter + 1) % validate_every == 0:
                torch.set_grad_enabled(False)
                valid_loss_sum = valid_accuracy_sum = 0
                num_valid_batches = len(valid_loader)
                for valid_batch in valid_loader:
                    valid_loss, valid_accuracy = run_iter(batch=valid_batch,
                                                          is_training=False)
                    valid_loss_sum += valid_loss.item()
                    valid_accuracy_sum += valid_accuracy.item()
                torch.set_grad_enabled(True)
                valid_loss = valid_loss_sum / num_valid_batches
                valid_accuracy = valid_accuracy_sum / num_valid_batches
                scheduler.step(valid_accuracy)
                add_scalar_summary(summary_writer=valid_summary_writer,
                                   name='loss',
                                   value=valid_loss,
                                   step=iter_count)
                add_scalar_summary(summary_writer=valid_summary_writer,
                                   name='accuracy',
                                   value=valid_accuracy,
                                   step=iter_count)
                progress = epoch_num + batch_iter / num_train_batches
                logging.info(f'Epoch {progress:.2f}: '
                             f'valid loss = {valid_loss:.4f}, '
                             f'valid accuracy = {valid_accuracy:.4f}')
                if valid_accuracy > best_valid_accuracy:
                    best_valid_accuracy = valid_accuracy
                    model_filename = (f'model-{progress:.2f}'
                                      f'-{valid_loss:.4f}'
                                      f'-{valid_accuracy:.4f}.pkl')
                    model_path = os.path.join(args.save_dir, model_filename)
                    torch.save(model.state_dict(), model_path)
                    print(f'Saved the new best model to {model_path}')
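Both training scripts call a project-local load_glove(path=..., vocab=..., init_weight=...) helper. Below is a hedged sketch of such a loader, assuming a standard glove.*.txt text file (a word followed by its float components per line) and a vocab object exposing a word-to-id mapping; the attribute name word_to_id is an assumption, and rows for words missing from GloVe keep their initial values.

import numpy as np

def load_glove(path, vocab, init_weight):
    weight = init_weight
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word, vector = values[0], values[1:]
            if word in vocab.word_to_id:          # assumed vocab API
                weight[vocab.word_to_id[word]] = np.asarray(vector, dtype=np.float32)
    return weight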
Example #4
def prepare_answer(train_doc,
                   train_answer,
                   train_candidates,
                   test_doc,
                   test_answer,
                   test_candidates,
                   val_doc=None,
                   val_answer=None,
                   val_candidates=None,
                   max_document_length=1000,
                   max_answer_length=20,
                   max_vocabulary_size=50000,
                   embeddings_size=50):
    """
        Prepares a dataset for use by a question-answer like model. This version will use the patterns generated
        previously for the training, test and validation sets as candidate for all three sets.

        :param train_doc: the training documents
        :param train_answer: the KPs for the training documents
        :param train_candidates: the candidate KPs for the training documents
        :param test_doc: the test documents
        :param test_answer: the KPs for the test documents
        :param test_candidates: the candidate KPs for the test documents
        :param val_doc: the validation documents (can be None)
        :param val_answer: the KPs for the validation documents (can be None)
        :param val_candidates: the candidate KPs for the validation documents (can be None)
        :param max_document_length: the maximum length of the documents (shorter documents will be truncated!)
        :param max_answer_length: the maximum length of the answers (shorter answers will be truncated!)
        :param max_vocabulary_size: the maximum size of the vocabulary to use
        (i.e. we keep only the top max_vocabulary_size words)
        :param embeddings_size: the size of the GLoVE embeddings to use
        :return:  a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
        test and validation set, and an embedding matrix for an Embedding layer
        """

    # Prepare validation return data
    val_q = None
    val_a = None
    val_y = None

    # Prepare the return values: lists that will hold questions (documents), answers (keyphrases), and truth values
    train_q = []
    test_q = []
    train_a = []
    test_a = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_q = []
        val_a = []
        val_y = []

    documents_full = []
    for key, doc in train_doc.items():
        documents_full.append([token for token in doc])
    for key, doc in test_doc.items():
        documents_full.append([token for token in doc])

    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append([token for token in doc])

    logging.debug("Fitting dictionary on %s documents..." %
                  len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens" %
                  len(dictionary.word_index))

    # Pair up each document with a candidate keyphrase and its truth value
    for key, document in train_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in train_candidates[key]:
            train_q.append(doc_sequence)
            train_a.append(dictionary.token_list_to_sequence(kp))
            train_y.append([0, 1] if kp in train_answer[key] else [1, 0])

    for key, document in test_doc.items():
        doc_sequence = dictionary.token_list_to_sequence(document)
        for kp in test_candidates[key]:
            test_q.append(doc_sequence)
            test_a.append(dictionary.token_list_to_sequence(kp))
            test_y.append([0, 1] if kp in test_answer[key] else [1, 0])

    if val_doc and val_answer:
        for key, document in val_doc.items():
            doc_sequence = dictionary.token_list_to_sequence(document)
            for kp in val_candidates[key]:
                val_q.append(doc_sequence)
                val_a.append(dictionary.token_list_to_sequence(kp))
                val_y.append([0, 1] if kp in val_answer[key] else [1, 0])

    logging.debug("Longest training document   : %s tokens" %
                  len(max(train_q, key=len)))
    logging.debug("Longest training answer     : %s tokens" %
                  len(max(train_a, key=len)))
    logging.debug("Longest test document       : %s tokens" %
                  len(max(test_q, key=len)))
    logging.debug("Longest test answer         : %s tokens" %
                  len(max(test_a, key=len)))
    if val_doc and val_answer:
        logging.debug("Longest validation document : %s tokens" %
                      len(max(val_q, key=len)))
        logging.debug("Longest validation answer   : %s tokens" %
                      len(max(val_a, key=len)))

    train_q = np.asarray(
        pad_sequences(train_q,
                      maxlen=max_document_length,
                      padding='post',
                      truncating='post'))
    train_a = np.asarray(
        pad_sequences(train_a,
                      maxlen=max_answer_length,
                      padding='post',
                      truncating='post'))

    test_q = np.asarray(
        pad_sequences(test_q,
                      maxlen=max_document_length,
                      padding='post',
                      truncating='post'))
    test_a = np.asarray(
        pad_sequences(test_a,
                      maxlen=max_answer_length,
                      padding='post',
                      truncating='post'))

    if val_doc and val_answer:
        val_q = np.asarray(
            pad_sequences(val_q,
                          maxlen=max_document_length,
                          padding='post',
                          truncating='post'))
        val_a = np.asarray(
            pad_sequences(val_a,
                          maxlen=max_answer_length,
                          padding='post',
                          truncating='post'))

    logging.debug("Training set documents size   : %s", np.shape(train_q))
    logging.debug("Training set answers size     : %s", np.shape(train_a))
    logging.debug("Test set documents size       : %s", np.shape(test_q))
    logging.debug("Test set answers size         : %s ", np.shape(test_a))

    if val_doc and val_answer:
        logging.debug("Validation set documents size : %s", np.shape(val_q))
        logging.debug("Validation set answers size   : %s ", np.shape(val_a))

    # prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..." %
                  (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return [train_q, train_a], train_y, [test_q, test_a], test_y, [
        val_q, val_a
    ], val_y, embedding_matrix, dictionary
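A hedged usage sketch for prepare_answer. The input containers are assumed to be dicts keyed by document id, holding token lists for the documents and lists of tokenized keyphrases for the answers and candidates; the toy data below is illustrative only, and the call still needs GloVe files on disk for glove.load_glove.

train_doc = {'d1': ['neural', 'keyphrase', 'extraction', 'with', 'rnns']}
train_answer = {'d1': [['keyphrase', 'extraction']]}
train_candidates = {'d1': [['keyphrase', 'extraction'], ['neural']]}
test_doc = {'d2': ['sequence', 'labeling', 'for', 'keyphrases']}
test_answer = {'d2': [['sequence', 'labeling']]}
test_candidates = {'d2': [['sequence', 'labeling'], ['keyphrases']]}

(train_x, train_y, test_x, test_y, val_x, val_y,
 embedding_matrix, dictionary) = prepare_answer(
    train_doc, train_answer, train_candidates,
    test_doc, test_answer, test_candidates,
    max_document_length=100, max_answer_length=5,
    max_vocabulary_size=1000, embeddings_size=50)
# train_x is the pair [documents, candidate keyphrases]; train_y holds one
# [not-keyphrase, keyphrase] label per (document, candidate) pair.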
Example #5
def prepare_sequential(train_doc,
                       train_answer,
                       test_doc,
                       test_answer,
                       val_doc,
                       val_answer,
                       max_document_length=1000,
                       max_vocabulary_size=50000,
                       embeddings_size=50,
                       stem_test=False):
    """
        Prepares a dataset for use by a sequential, categorical model.

        :param train_doc: the training documents
        :param train_answer: the KPs for the training documents
        :param test_doc: the test documents
        :param test_answer: the KPs for the test documents
        :param val_doc: the validation documents (can be None)
        :param val_answer: the KPs for the validation documents (can be None)
        :param max_document_length: the maximum length of the documents (longer documents will be truncated!)
        :param max_vocabulary_size: the maximum size of the vocabulary to use
        (i.e. we keep only the top max_vocabulary_size words)
        :param embeddings_size: the size of the GloVe embeddings to use
        :param stem_test: set the value to True if the test set answers are stemmed
        :return: a tuple (train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix) containing the training,
        test and validation set, and an embedding matrix for an Embedding layer
        """

    train_answer_seq = make_sequential(train_doc, train_answer)

    if not stem_test:
        test_answer_seq = make_sequential(test_doc, test_answer)
    else:
        import copy
        stemmed_test_doc = copy.deepcopy(test_doc)
        stemmed_test_doc = stem_dataset(stemmed_test_doc)
        test_answer_seq = make_sequential(stemmed_test_doc, test_answer)

    # Prepare validation return data
    val_x = None
    val_y = None

    if val_doc and val_answer:
        val_answer_seq = make_sequential(val_doc, val_answer)

    # Transform the documents to sequence
    documents_full = []
    train_y = []
    test_y = []

    if val_doc and val_answer:
        val_y = []

    for key, doc in train_doc.items():
        documents_full.append([token for token in doc])
        train_y.append(train_answer_seq[key])
    for key, doc in test_doc.items():
        documents_full.append([token for token in doc])
        test_y.append(test_answer_seq[key])

    if val_doc and val_answer:
        for key, doc in val_doc.items():
            documents_full.append([token for token in doc])
            val_y.append(val_answer_seq[key])

    logging.debug("Fitting dictionary on %s documents..." %
                  len(documents_full))

    dictionary = dict.Dictionary(num_words=max_vocabulary_size)
    dictionary.fit_on_texts(documents_full)

    logging.debug("Dictionary fitting completed. Found %s unique tokens" %
                  len(dictionary.word_index))

    # Now we can prepare the actual input
    train_x = dictionary.texts_to_sequences(train_doc.values())
    test_x = dictionary.texts_to_sequences(test_doc.values())
    if val_doc and val_answer:
        val_x = dictionary.texts_to_sequences(val_doc.values())

    logging.debug("Longest training document : %s tokens" %
                  len(max(train_x, key=len)))
    logging.debug("Longest test document :     %s tokens" %
                  len(max(test_x, key=len)))
    if val_doc and val_answer:
        logging.debug("Longest validation document : %s tokens" %
                      len(max(val_x, key=len)))

    train_x = np.asarray(
        pad_sequences(train_x,
                      maxlen=max_document_length,
                      padding='post',
                      truncating='post'))
    train_y = pad_sequences(train_y,
                            maxlen=max_document_length,
                            padding='post',
                            truncating='post')
    train_y = make_categorical(train_y)

    test_x = np.asarray(
        pad_sequences(test_x,
                      maxlen=max_document_length,
                      padding='post',
                      truncating='post'))
    test_y = pad_sequences(test_y,
                           maxlen=max_document_length,
                           padding='post',
                           truncating='post')
    test_y = make_categorical(test_y)

    if val_doc and val_answer:
        val_x = np.asarray(
            pad_sequences(val_x,
                          maxlen=max_document_length,
                          padding='post',
                          truncating='post'))
        val_y = pad_sequences(val_y,
                              maxlen=max_document_length,
                              padding='post',
                              truncating='post')
        val_y = make_categorical(val_y)

    logging.debug("Training set samples size   : %s", np.shape(train_x))
    logging.debug("Training set answers size   : %s", np.shape(train_y))
    logging.debug("Test set samples size       : %s", np.shape(test_x))
    logging.debug("Test set answers size       : %s ", np.shape(test_y))

    if val_doc and val_answer:
        logging.debug("Validation set samples size : %s", np.shape(val_x))
        logging.debug("Validation set answers size : %s ", np.shape(val_y))

    # prepare the matrix for the embedding layer
    word_index = dictionary.word_index
    embeddings_index = glove.load_glove('', embeddings_size)

    num_words = min(max_vocabulary_size, 1 + len(word_index))

    logging.debug("Building embedding matrix of size [%s,%s]..." %
                  (num_words, embeddings_size))

    embedding_matrix = np.zeros((num_words, embeddings_size))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix
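A hedged usage sketch for prepare_sequential under the same assumptions as above: *_doc maps document ids to token lists, *_answer maps ids to tokenized keyphrases, and make_sequential is taken to turn those into per-token labels; GloVe files are still required by glove.load_glove.

train_doc = {'d1': ['neural', 'keyphrase', 'extraction', 'with', 'rnns']}
train_answer = {'d1': [['keyphrase', 'extraction']]}
test_doc = {'d2': ['sequence', 'labeling', 'for', 'keyphrases']}
test_answer = {'d2': [['sequence', 'labeling']]}
val_doc, val_answer = None, None        # the validation split is optional

(train_x, train_y, test_x, test_y,
 val_x, val_y, embedding_matrix) = prepare_sequential(
    train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
    max_document_length=100, max_vocabulary_size=1000, embeddings_size=50)
# train_x: padded word-id sequences; train_y: categorical per-token labels aligned
# with train_x; embedding_matrix: GloVe vectors indexed by the dictionary's word ids.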