Example #1
def import_sejm_data():
    es = Elasticsearch()
    df = load_clean_df()

    embeddings = load_embeddings(LEXRANK_WEIGHTED, dim=None)
    df['embedding'] = embeddings

    def to_doc(row):
        return {"_index": 'sejm', "_id": row.id, "_source": row.to_dict()}

    docs_gen = (to_doc(row) for index_, row in df.iterrows())

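    # Index settings: raise max_result_window for deep paging, and map the
    # 'embedding' field as a 768-dimensional dense_vector so it can be used
    # for vector similarity scoring.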
    sejm_settings = {
        "settings": {
            "max_result_window": 300000
        },
        "mappings": {
            "properties": {
                "embedding": {
                    "type": "dense_vector",
                    "dims": 768
                }
            }
        }
    }

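    # ignore=400 keeps the call from failing if the 'sejm' index already exists.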
    es.indices.create(index='sejm', ignore=400, body=sejm_settings)
    helpers.bulk(es, tqdm(docs_gen, total=len(df)))
Example #2
def main(test_file, vocab_file, embeddings_file, pretrained_file, max_length=50, gpu_index=0, batch_size=128):
    """
    Test the ESIM model with pretrained weights on some dataset.
    Args:
        test_file: The path to a file containing preprocessed NLI data.
        vocab_file: The path to the vocabulary file of the model being
            tested.
        embeddings_file: The path to the file containing the pretrained
            word embeddings.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script.
        max_length: The maximum length of the input sequences. Defaults to 50.
        gpu_index: The index of the GPU to use if CUDA is available.
            Defaults to 0.
        batch_size: The size of the batches used for testing. Defaults to 128.
    """
    device = torch.device("cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
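    # hidden_size and num_classes are inferred from the shapes of the saved
    # projection and classification layers, so they always match the values
    # used during training.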
    hidden_size = checkpoint["model"]["projection.0.weight"].size(0)
    num_classes = checkpoint["model"]["classification.6.weight"].size(0)
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")    
    test_data = LCQMC_Dataset(test_file, vocab_file, max_length)
    test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
    print("\t* Building model...")
    model = ESIM(hidden_size, embeddings=embeddings, num_classes=num_classes, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(device), 20 * "=")
    batch_time, total_time, accuracy, auc = test(model, test_loader)
    print("\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%, auc: {:.4f}\n".format(batch_time, total_time, (accuracy*100), auc))
Example #3
def main(test_file,
         vocab_file,
         embeddings_file,
         pretrained_file,
         max_length=50,
         gpu_index=0,
         batch_size=128):

    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = LCQMC_Dataset(test_file, vocab_file, max_length)
    test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
    print("\t* Building model...")
    model = SiaGRU(embeddings, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing SiaGRU model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, auc = test(model, test_loader)
    print(
        "\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%, auc: {:.4f}\n"
        .format(batch_time, total_time, (accuracy * 100), auc))
Example #4
def model_load_test(test_df,
                    vocab_file,
                    embeddings_file,
                    pretrained_file,
                    test_prediction_dir,
                    test_prediction_name,
                    mode,
                    num_labels=2,
                    max_length=64,
                    gpu_index=0,
                    batch_size=128):

    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    model = ABCNN(embeddings,
                  num_labels=num_labels,
                  num_layer=1,
                  linear_size=300,
                  max_length=max_length,
                  device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing ABCNN model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, predictions = test(model, test_loader)
    print(
        "\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%\n"
        .format(batch_time, total_time, (accuracy * 100)))
    test_prediction = pd.DataFrame({'prediction': predictions})
    if not os.path.exists(test_prediction_dir):
        os.makedirs(test_prediction_dir)
    test_prediction.to_csv(os.path.join(test_prediction_dir,
                                        test_prediction_name),
                           index=False)
Example #5
def main(vocab_file,
         embeddings_file,
         pretrained_file,
         max_length=50,
         gpu_index=0,
         batch_size=128):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    # test_data = LCQMC_Dataset(test_file, vocab_file, max_length)
    # test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
    print("\t* Building model...")
    model = SiaGRU(embeddings, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing SiaGRU model on device: {} ".format(device),
          20 * "=")

    database = [
        line for line in open('./data/rumors.txt', 'r', encoding='utf-8')
    ]

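    # init_csv pairs every input sentence with every rumor in the database, so
    # row i of work_data.csv corresponds to inputs[i // len(database)] and
    # database[i % len(database)] in the loop below.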
    while True:
        input("enter to continue")
        inputs = [
            line for line in open('./data/input.txt', 'r', encoding='utf-8')
        ]
        init_csv(inputs, database, './data/work_data.csv')
        dataset = LCQMC_Dataset('./data/work_data.csv', vocab_file, max_length)
        dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)
        prob = get_score(model, dataloader)
        for i, p in enumerate(prob):
            if p > 0.5:
                print("text:", inputs[i // len(database)])
                print("rumor:", database[i % len(database)])
                print("prob:", p)
Example #6
def main(args):
    print(20 * "=", " Preparing for training ", 20 * "=")
    if not os.path.exists(args.result):
        os.makedirs(args.result)

    # -------------------- Load pretrained model ------------------- #
    checkpoints = torch.load(args.pretrained_file)
    # Model parameters can either be recovered directly from the checkpoint (below) or defined explicitly beforehand.
    # hidden_size = checkpoints["model"]["projection.0.weight"].size(0)
    # num_classes = checkpoints["model"]["classification.6.weight"].size(0)
    # -------------------- Data loading ------------------- #
    print("\t* Loading test data...")
    test_data = LCQMC_dataset(args.test_file,
                              args.vocab_file,
                              args.max_length,
                              test_flag=True)
    test_loader = DataLoader(test_data, batch_size=args.batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(args.embed_file)
    model = ESIM(args, embeddings=embeddings).to(args.device)
    model.load_state_dict(checkpoints["model"])
    print(20 * "=", " Testing ESIM model on device: {} ".format(args.device),
          20 * "=")
    all_predict = predict(model, test_loader)
    index = np.arange(len(all_predict))
    # --------------------- Generate submission file --------------------------
    df_test = pd.DataFrame(columns=['index', 'prediction'])
    df_test['index'] = index
    df_test['prediction'] = all_predict
    df_test.to_csv(args.submit_example_path,
                   index=False,
                   columns=['index', 'prediction'],
                   sep='\t')
Example #7
    def _create_loss(self):
        '''  Create loss, output projection, RNN cell, embeddings  '''
        print 'Creating loss...  ',
        start = time.time()
        xavier = tf.contrib.layers.xavier_initializer()
        # use output projection if we're using sampled softmax
        if config.NUM_SAMPLES > 0 and config.NUM_SAMPLES < self.dec_vocab:
            proj_w_size = config.HIDDEN_SIZE
            w = tf.Variable(xavier([proj_w_size, self.dec_vocab]), name='w')
            b = tf.Variable(xavier([self.dec_vocab]), name='b')
            self.output_projection = (w, b)

        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(tf.transpose(w), b, inputs,
                                              labels, config.NUM_SAMPLES,
                                              self.dec_vocab)

        self.softmax_loss = sampled_loss

        single_cell = tf.nn.rnn_cell.GRUCell(config.HIDDEN_SIZE)
        self.cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] *
                                                config.NUM_LAYERS)

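        # With pretrained embeddings, prepend a zero row for PAD and three
        # trainable rows for the special flag tokens before the pretrained matrix.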
        if self.pretrained:
            # set up variables for special tokens and concat with pretrained
            pad = tf.zeros([1, config.EMBED_SIZE])
            flags = tf.Variable(xavier([3, config.EMBED_SIZE],
                                       dtype=tf.float32),
                                name='flags')
            embeddings = tf.constant(data.load_embeddings(self.data_path),
                                     dtype=tf.float32)
            self.embeddings = tf.concat(0, [pad, flags, embeddings])
        else:
            self.embeddings = tf.Variable(xavier(
                [self.enc_vocab, config.EMBED_SIZE]),
                                          dtype=tf.float32)

        feed_prev = self.feed_prev_placeholder
        self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            self.targets,
            self.decoder_masks,
            config.BUCKETS,
            lambda x, y: self._seq_f(x, y, feed_prev),
            softmax_loss_function=self.softmax_loss)

        # If we use output projection, we need to project outputs for decoding.

        def project_outputs(cur, bucket):
            if self.output_projection:
                return [
                    tf.matmul(output, self.output_projection[0]) +
                    self.output_projection[1]
                    for output in self.outputs[bucket]
                ]
            return tf.constant(False)

        for bucket in xrange(len(config.BUCKETS)):
            cur = self.outputs[bucket]
            self.outputs[bucket] = tf.cond(
                self.feed_prev_placeholder,
                lambda: project_outputs(cur, bucket), lambda: cur)

        print 'Took', time.time() - start, 'seconds'
Example #8
parser.add_argument('--mode', type=str, default='demo', help='train/test/demo')
parser.add_argument('--demo_model',
                    type=str,
                    default='1521112368',
                    help='model for test and demo')
args = parser.parse_args()

## get char embeddings
# word2id: assigns an index to every unique character, with UNK as the last entry
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
print("\n========word2id=========\n", word2id)
if args.embedding_type == 'random':
    # Randomly generate the embedding matrix (3905 characters, 300 features by default, i.e. a 3905*300 matrix)
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embeddings = load_embeddings(args.embedding_dim, word2id,
                                 args.embedding_type)
    # Chinese word vectors trained with gensim (word2vec) on the Chinese Wikipedia corpus

print("\n=========embeddings==========\n", embeddings, "\ndim(embeddings)=",
      embeddings.shape)

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'ner_train_data')
    test_path = os.path.join('.', args.test_data, 'ner_test_data')
    train_data = read_corpus(train_path)  # read the training set
    test_data = read_corpus(test_path)  # read the test set
    test_size = len(test_data)
    print('train_data=\n', train_data)
    #print("\n==========train_data================\n",train_data)
    #print("\n==========test_data================\n",test_data)
Example #9
def main(train_file,
         dev_file,
         embeddings_file,
         vocab_file,
         target_dir,
         max_length=50,
         epochs=50,
         batch_size=128,
         lr=0.0005,
         patience=5,
         max_grad_norm=10.0,
         gpu_index=0,
         checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints will be saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_Dataset(train_file, vocab_file, max_length)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = LCQMC_Dataset(dev_file, vocab_file, max_length)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(embeddings_file)
    model = BIMPM(embeddings, device=device).to(device)
    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
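    # ReduceLROnPlateau in 'max' mode multiplies the learning rate by 0.85
    # whenever validation accuracy fails to improve.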
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, auc = validate(model, dev_loader, criterion)
    print(
        "\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
        .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training BIMPM model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print(
            "-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
            .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses
                }, os.path.join(target_dir, "best.pth.tar"))
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
Example #10
def run_task(task_name, task_data, task_format, model_name, model_id):
    mode = config.layer_mode
    train_data, dev_data, test_data = None, None, None

    if config.train:
        dev_data = load_embeddings(task_name, task_data, task_format, 'dev',
                                   model_name, model_id, config.label_map)
        train_data = dev_data if config.sample else load_embeddings(
            task_name, task_data, task_format, 'train', model_name, model_id,
            config.label_map)

    if config.export:
        test_data = load_embeddings(task_name, task_data, task_format, 'test',
                                    model_name, model_id, config.label_map)

    if config.dry_run:
        print('loaded data, now stopping dry run')
        return

    layers_labels = []

    n_workers = config.num_workers if config.num_workers > 0 else 1
    if n_workers == 1:
        for layer in range(*config.layer_range):
            layer_labels, layer_preds = run_layer(train_data, dev_data,
                                                  test_data, task_name,
                                                  model_name, mode, layer)
            if config.export and layer_labels is not None:
                if len(layers_labels) == 0:
                    layers_labels.append(layer_labels)
                layers_labels.append(layer_preds)
    else:
        procs_queue, procs_running = [], []
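        # Manual worker pool: queue one run_layer process per layer and keep
        # at most n_workers of them running at a time.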
        for layer in range(*config.layer_range):
            p = mp.Process(target=run_layer,
                           args=(train_data, dev_data, test_data, task_name,
                                 model_name, mode, layer))
            procs_queue.append(p)
            # run_layer(train_data, dev_data, test_data, task_name, model_name, mode, layer)

        for p in procs_queue:
            while len(procs_running) >= n_workers:
                time.sleep(1)
                for i in range(n_workers):
                    if not procs_running[i].is_alive():
                        procs_running.pop(i)
                        break

            procs_running.append(p)
            p.start()

        for p in procs_running:
            p.join()

    if len(layers_labels) > 0:
        preds_dir = os.path.join(task_name, 'predictions', config.name)
        preds_path = data_path(preds_dir, mode, 'json', model_name)

        with open(preds_path, 'w') as f:
            for labels in zip(*layers_labels):
                labels = [config.label_map[lab] for lab in labels]
                f.write('\t'.join(labels) + '\n')

        print(f'Saved layer-wise predictions to {preds_path}')

    if config.report:
        summaries = []

        for layer in range(*config.layer_range):
            summary_dir = os.path.join(task_name, 'summaries', config.name,
                                       mode)
            summary_path = data_path(summary_dir, mode, 'json', model_name,
                                     layer)

            if not os.path.exists(summary_path):
                print('skipping, {} does not exist'.format(summary_path))
            else:
                with open(summary_path) as f:
                    summary = json.load(f)
                summaries.append((layer, summary))

        report(summaries)
Example #11
                         n_gram=config.n_gram,
                         context_mode=config.context_mode)
validation_corpus = Corpus(args.data + "/valid.txt",
                           dictionary,
                           create_dict=True,
                           use_cuda=args.cuda,
                           n_gram=config.n_gram,
                           context_mode=config.context_mode)

# TensorboardX object
writer = SummaryWriter("saved_runs/" + args.save)

# Word embeddings
embedding = nn.Embedding(len(dictionary), config.em_size, padding_idx=0)
if config.pre_trained:
    load_embeddings(embedding, dictionary.word2idx, config.pre_trained,
                    config.em_size)

# Model, Optimizer and Loss
model = LSTM_LM(embedding, config)
optimizer = optim.Adam(model.parameters(), lr=config.lr)
criterion = nn.CrossEntropyLoss(ignore_index=0)

if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()

total_params = sum(x.size()[0] *
                   x.size()[1] if len(x.size()) > 1 else x.size()[0]
                   for x in model.parameters())
print('Model total parameters:', total_params, flush=True)
Example #12
def test(
    name: str,
    model_type: str,
    step: int = None,
    mode: str = 'test',
    data_seed: int = None,
    data_name: str = 'SNLI',
    data_embedding: str = 'GloVe',
    data_pad: bool = True,
    batch_size: int = 10,
    print_errors: bool = False,
    print_errors_limit: int = 10,
    **kwargs,
) -> None:
    model_path = build.get_model_path(name)

    model = getattr(nn, model_type)(embeddings=data.load_embeddings(
        data_name, data_embedding, data_seed),
                                    **kwargs)
    log.info(str(model))
    log.debug('Model parameters:\n\n\t' +
              '\n\t'.join(graph.print_trainable_variables().split('\n')))

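    # Restore the checkpoint and run the model over the requested split,
    # collecting predictions and gold labels for the metrics printed below.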
    with tf.Session(config=_make_config()) as sess:
        dataset = data.load_dataset(data_name, mode, data_embedding, data_seed)

        data_iter, data_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='data_handle',
            dataset=dataset,
            batch_size=batch_size,
            shuffle=False,
            pad=data_pad,
            session=sess)

        _restore_model(sess, model_path, step)

        y_preds, y_trues = [], []  # type: ignore
        sess.run(data_iter.initializer)
        while True:
            try:
                true, pred = sess.run(
                    [model.y, model.prediction],
                    feed_dict={
                        model.handle: data_hd,
                        model.keep_prob: 1.0,
                        model.is_training: False
                    })
                y_preds.extend(np.squeeze(pred).tolist())
                y_trues.extend(np.squeeze(true).tolist())
            except tf.errors.OutOfRangeError:
                break

    # print accuracy
    print('Acc: %.4f' % sklearn.metrics.accuracy_score(y_trues, y_preds))

    # Print confusion matrix
    labels = list(
        sorted(data.SNLI.LABELS.keys(), key=lambda x: data.SNLI.LABELS[x]))
    cm = sklearn.metrics.confusion_matrix(y_trues,
                                          y_preds,
                                          labels=range(len(labels)))
    tmpl = '%15s ' * (len(labels) + 2)
    print(tmpl % tuple([''] + labels + ['']))
    corr = 0
    for i in range(len(labels)):
        stats = cm[i]
        prob = stats[i] / sum(stats)
        corr += stats[i]
        print(tmpl %
              tuple([labels[i]] + list(map(str, cm[i])) + ['%.4f' % prob]))
    print(tmpl % tuple(['%d / %d' % (corr, len(y_trues))] +
                       [''] * len(labels) + ['%.4f' % (corr / len(y_trues))]))

    # Print errors
    if print_errors:
        tmpl = '\n%4d. Pred: %-20s  True: %s\n      %s\n      %s'
        for i, (y_pred, y_true) in enumerate(zip(y_preds, y_trues)):
            if y_pred != y_true and print_errors_limit != 0:
                s1 = ' '.join(dataset.x1_words[i])
                s2 = ' '.join(dataset.x2_words[i])
                l_pred = labels[y_pred]
                l_true = labels[y_true]
                print(tmpl % (i, l_pred, l_true, s1, s2))
                print_errors_limit -= 1
Example #13
def train(name: str,
          model_type: str,
          batch_size: int = 256,
          epoch_num: int = 200,
          keep_prob: float = 0.8,
          train_regex_list: t.Union[t.List[str], str] = None,
          optim_manager_type: str = 'NotChange',
          data_name: str = 'SNLI',
          data_embedding: str = 'GloVe',
          data_argument: bool = False,
          data_pad: bool = True,
          data_cache: bool = False,
          data_seed: int = None,
          record_every: int = 64000,
          validate_every: int = 640000,
          save_every: int = 6400000,
          restore_from: str = None,
          restore_step: int = None,
          profiling: bool = False,
          clip_norm: int = None,
          seed: int = None,
          debug: bool = False,
          **kwargs) -> None:

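    # record_every, validate_every and save_every are expressed in training
    # instances; `step` below advances by batch_size on every iteration.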
    # Data preparation
    model_path = build.get_model_path(name)
    shutil.rmtree(model_path, ignore_errors=True)  # remove previous trained

    # Network setup
    model = getattr(nn, model_type)(embeddings=data.load_embeddings(
        data_name, data_embedding, data_seed),
                                    **_select_kwargs_regex(kwargs,
                                                           r'^optim[0-9]*_',
                                                           invert=True))
    log.info(str(model))
    log.debug('Model parameters:\n\n\t' +
              '\n\t'.join(graph.print_trainable_variables().split('\n')))

    # Control randomization
    if seed:
        log.info(
            'Set random seed for data shuffling and graph computation: %d' %
            seed)
        tf.set_random_seed(seed)

    train_summary = _make_model_summary(model)

    with tf.Session(config=_make_config()) as sess:
        if debug:
            from tensorflow.python import debug as tf_debug
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        dataset_opts = {
            'pad': data_pad,
            'batch_size': batch_size,
            'session': sess,
        }
        train_iter, train_hd = _make_dataset_iterator(
            type_name='one_shot_iterator',
            handle_name='train_handle',
            dataset=data.load_dataset(data_name, 'train', data_embedding,
                                      data_seed),
            argument=data_argument,
            bucket_boundaries=[20, 50],
            repeat_num=epoch_num,
            cache=data_cache,
            seed=seed,
            **dataset_opts)
        valid_iter, valid_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='valid_handle',
            dataset=data.load_dataset(data_name, 'validation', data_embedding,
                                      data_seed),
            shuffle=False,
            cache=True,
            **dataset_opts)
        test_iter, test_hd = _make_dataset_iterator(
            type_name='initializable_iterator',
            handle_name='test_handle',
            dataset=data.load_dataset(data_name, 'test', data_embedding,
                                      data_seed),
            shuffle=False,
            cache=True,
            **dataset_opts)

        om = _make_optim_manager(optim_manager_type, model.loss, clip_norm,
                                 train_regex_list, kwargs)

        test_wtr = tf.summary.FileWriter(os.path.join(model_path, 'test'))
        train_wtr = tf.summary.FileWriter(os.path.join(model_path, 'train'),
                                          sess.graph)
        # Build a validation summary writer for each optimizer
        valid_wtr = {}
        for optim in om.optims:
            valid_wtr[optim.get_name()] = tf.summary.FileWriter(
                os.path.join(model_path, 'valid-%s' % optim.get_name()))

        if restore_from:
            _copy_checkpoint(restore_from, model_path, restore_step)
            _restore_model(sess, model_path, restore_step)
            # Evaluate the pretrained model
            step = restore_step
            _iterate_dataset(sess, model, valid_iter, valid_hd,
                             valid_wtr[om.optim.get_name()], step)
            _iterate_dataset(sess, model, test_iter, test_hd, test_wtr, step)
        else:
            sess.run(tf.global_variables_initializer())
            step = 0

        if profiling:
            _profile_and_exit(sess, model, om.optim_op, train_hd)

        pbar = tqdm.tqdm(total=save_every, desc='Train', unit=' inst')
        try:
            while True:
                feed_dict = {
                    model.handle: train_hd,
                    model.keep_prob: keep_prob,
                    model.is_training: True
                }
                if om.feed_lr:
                    feed_dict[om.lr_op] = om.lr_val
                if step % record_every == 0:
                    summary, _, loss = sess.run(
                        [train_summary, om.optim_op, model.loss],
                        feed_dict=feed_dict)
                    pbar.set_postfix(loss='{:.3f}'.format(loss))
                    train_wtr.add_summary(summary, step)
                else:
                    sess.run([om.optim_op], feed_dict=feed_dict)

                if step and step % validate_every == 0:
                    pbar.set_description('Valid')
                    valid_acc = _iterate_dataset(
                        sess, model, valid_iter, valid_hd,
                        valid_wtr[om.optim.get_name()], step)
                    # Update upon the validation performance
                    om.update(valid_acc, step)
                    pbar.set_description('Test')
                    _iterate_dataset(sess, model, test_iter, test_hd, test_wtr,
                                     step)
                    pbar.set_description('Train')

                if step and step % save_every == 0:
                    save_path = _save_model(sess, model_path, step)
                    pbar.set_description(save_path)
                    pbar.update(batch_size)
                    pbar.close()
                    pbar = tqdm.tqdm(total=save_every,
                                     desc='Train',
                                     unit=' inst')
                else:
                    pbar.update(batch_size)

                step += batch_size

        except tf.errors.OutOfRangeError:
            save_path = _save_model(sess, model_path, step)
            pbar.set_description(save_path)
            log.info('Training finished!')
Example #14
from data import model_path, load_embeddings
from tqdm import tqdm
from glob import glob

skip = [
    ('sbert-lexrank-top1', 5),
    ('sbert-lexrank-top1', 10),
    ('sbert-tf-idf-top1', 5),
    ('sbert-tf-idf-top1', 10),
    ('use-lexrank-top1', 5),
    ('use-lexrank-top1', 10),
    ('use-tf-idf-top1', 5),
    ('use-tf-idf-top1', 10),
    ('use-tf-idf-top1', 15),
    ('use-tf-idf-top1', 20),
    ('use-tf-idf-top5', 5),
    ('use-tf-idf-top5', 10),
]

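# Call load_embeddings for every (embedding file, n_neighbors) combination,
# skipping the pairs listed above and n_neighbors of 25/50 for 'top' embeddings.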
for emb_file in tqdm(sorted(glob(model_path + 'embeddings/*.pkl')),
                     position=0):
    emb = emb_file.split('/')[-1][:-4]

    for n_neighbors in tqdm([5, 10, 15, 20, 25, 50], position=1, leave=False):
        if (emb, n_neighbors) in skip:
            continue
        if 'top' in emb and n_neighbors in [25, 50]:
            continue
        load_embeddings(emb, dim=5, n_neighbors=n_neighbors)
Example #15
def model_train_validate_test(train_df,
                              dev_df,
                              test_df,
                              embeddings_file,
                              vocab_file,
                              target_dir,
                              mode,
                              max_length=64,
                              num_labels=2,
                              epochs=50,
                              batch_size=256,
                              lr=0.0005,
                              patience=3,
                              max_grad_norm=10.0,
                              gpu_index=0,
                              if_save_model=False,
                              checkpoint=None):

    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints will be saved
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if (embeddings_file is not None):
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = ABCNN(embeddings=embeddings,
                  num_labels=num_labels,
                  num_layer=1,
                  linear_size=300,
                  max_length=max_length,
                  device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')
    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _, = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))

    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ABCNN model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _, = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0

            if (if_save_model):
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    }, os.path.join(target_dir, "best.pth.tar"))

                print("Model saved successfully!\n")

            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(
                model, test_loader, criterion)
            print("Test accuracy: {:.4f}%\n".format(test_accuracy * 100))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir,
                                                "test_prediction.csv"),
                                   index=False)

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
Example #16
import sys
import logging as log

from data import load_data, load_embeddings
from model import create_model, train_model
from evaluation import evaluate_model
from sklearn.model_selection import train_test_split, StratifiedKFold

log.basicConfig(format='%(asctime)s %(message)s', level=log.INFO)

try:
    experiment = experiments[sys.argv[1]]
except KeyError:
    log.error("experiment \"{0}\" does not exist".format(sys.argv[1]))
    sys.exit(1)

X_train, X_test, y_train, y_test, word_index = load_data(experiment)

if "embedding_file" in experiment:
    embedding_matrix = load_embeddings(experiment, word_index)
    model = create_model(experiment,
                         X_train,
                         y_train,
                         embedding_matrix=embedding_matrix,
                         word_index=word_index)
else:
    model = create_model(experiment, X_train, y_train, word_index=word_index)

model = train_model(model, X_train, y_train)
evaluate_model(model, X_test, y_test)
#pred = model.predict_classes(X_test)
# for p in pred:
#     print (p)
Example #17
def main(args):
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where model checkpoints will be saved
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)

    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = LCQMC_dataset(args.train_file, args.vocab_file, args.max_length, test_flag=False)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
    print("\t* Loading valid data...")
    dev_data = LCQMC_dataset(args.dev_file, args.vocab_file, args.max_length, test_flag=False)
    dev_loader = DataLoader(dev_data, batch_size=args.batch_size, shuffle=True)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    embeddings = load_embeddings(args.embed_file)
    model = ESIM(args, embeddings=embeddings).to(args.device)

    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()  # Cross-entropy loss
    # Keep only the parameters that require gradient updates
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=args.lr)  # Optimizer
    # Learning-rate schedule
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                           factor=0.85, patience=0)

    best_score = 0.0
    start_epoch = 1

    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if args.checkpoint:
        # Load the checkpoint from file to resume training the model
        checkpoints = torch.load(args.checkpoint)
        start_epoch = checkpoints["epoch"] + 1
        best_score = checkpoints["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".format(start_epoch))
        model.load_state_dict(checkpoints["model"])  # model weights
        optimizer.load_state_dict(checkpoints["optimizer"])
        epochs_count = checkpoints["epochs_count"]
        train_losses = checkpoints["train_losses"]
        valid_losses = checkpoints["valid_losses"]

        # Changed so that the pre-training validation pass (loss and accuracy before resuming) only runs when a checkpoint is loaded.
        _, valid_loss, valid_accuracy, auc = validate(model, dev_loader, criterion)
        print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}"
              .format(valid_loss, (valid_accuracy * 100), auc))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training ESIM model on device: {}".format(args.device), 20 * "=")
    patience_counter = 0

    for epoch in range(start_epoch, args.epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer,
                                                       criterion, epoch, args.max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100)))

        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, epoch_auc = validate(model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
              .format(epoch_time, epoch_loss, (epoch_accuracy * 100), epoch_auc))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            # Save the best result; every parameter needed to resume can be found in the checkpoint.
            torch.save(
                {
                    "epoch": epoch,
                    "model": model.state_dict(),
                    "best_score": best_score,
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                os.path.join(args.target_dir, "new_best.pth.tar"))
        # Save the model at each epoch (optional).
        torch.save(
            {
                "epoch": epoch,
                "model": model.state_dict(),
                "best_score": best_score,
                "optimizer": optimizer.state_dict(),
                "epochs_count": epochs_count,
                "train_losses": train_losses,
                "valid_losses": valid_losses},
            os.path.join(args.target_dir, "new_esim_{}.pth.tar".format(epoch)))

        if patience_counter >= args.patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break