Example #1
def train(nvsm, device, optimizer, epochs, train_loader, eval_loader,
          k_values, loss_function, lamb, print_every):
    for epoch in tqdm(range(epochs),
                      desc = 'Epochs',
                      ncols = 70):
        tqdm_train_loader = tqdm(
            enumerate(train_loader),
            desc  = 'Batch',
            total = len(train_loader),
            ncols = 70,
            leave = True
        )
        for i, (n_grams, doc_ids) in tqdm_train_loader:
            n_grams    = n_grams.to(device)
            doc_ids    = doc_ids.to(device)
            optimizer.zero_grad()
            pred_proba = nvsm(n_grams, doc_ids)
            loss       = loss_function(nvsm, pred_proba, lamb)
            loss.backward()
            optimizer.step()
            if i % print_every == 0:
                nvsm.eval()
                recall_at_ks = evaluate(
                    nvsm          = nvsm,
                    device        = device,
                    eval_loader   = eval_loader,
                    recalls       = k_values,
                    loss_function = loss_function,
                )
                nvsm.train()
                model_eval = generate_eval(k_values, recall_at_ks)
                print(f'  [{epoch:3}, {i:5d}]: {loss:5.4f} || {model_eval}')
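The loop above calls loss_function(nvsm, pred_proba, lamb) without showing its definition. A minimal sketch, assuming the objective is a negative log-likelihood over pred_proba plus an L2 penalty on the model parameters weighted by lamb (the actual NVSM loss may differ):

import torch

def loss_function(nvsm, pred_proba, lamb):
    # Hypothetical loss: mean negative log-likelihood of the predicted
    # probabilities plus lamb-weighted L2 regularization over all parameters.
    nll = -torch.log(pred_proba + 1e-10).mean()
    l2_term = sum(p.pow(2.0).sum() for p in nvsm.parameters())
    return nll + lamb * l2_term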
Example #2
def main():
    data = np.array(list(extract_features.extract(args.video)))
    t = data[:, 0]
    X = data[:, 1:]

    with open(args.model, 'rb') as model_file:
        model = pickle.load(model_file)
    y = evaluate_model.evaluate(X, model)

    doc = write_clips.create_document(t, y, args.video)
    with open(args.output, 'wb') as output_file:
        doc.write(output_file)
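This script reads args.video, args.model, and args.output from a module-level args object that is not shown. A plausible sketch of the missing setup (the argument names are assumptions), together with the imports the snippet relies on:

import argparse
import pickle

import numpy as np

import extract_features
import evaluate_model
import write_clips

# Hypothetical CLI definition; the real flag names may differ.
parser = argparse.ArgumentParser(description='Score a video and write clips.')
parser.add_argument('video', help='input video file')
parser.add_argument('model', help='pickled model file')
parser.add_argument('output', help='output document path')
args = parser.parse_args()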
Example #3
def do_run(models):
    start_time = datetime.datetime.now()
    save_last_run()
    msgBody = ""
    log = ""

    failed = False
    datasets = makeDatasets()
    for m, dataset, args, expected in models:
        try:
            ds = datasets[dataset]
            log += sh("rake create_model_%s_%s" % (m, ds.rake_suffix))
            log += "\n\n"

            msgBody += "**********************************\n"
            msgBody += "%s  %s " % (m, args)

            import evaluate_model
            eval_start = datetime.datetime.now()
            actual, out_fname = evaluate_model.evaluate(
                run_description=str(args),
                corpus_fn=ds.corpus_fn,
                model_fn="%s/models/%s.pck" % (ds.dir, m),
                output_dir="%s/output" % ds.dir,
                gtruth_tag_fn=ds.gtruth_tag_fn,
                map_fn=ds.map_fn,
                **args)
            eval_stop = datetime.datetime.now()
            msgBody += "\n(outfile: %s)\n" % out_fname
            if expected is not None:
                assert actual == expected, (actual, expected)
                msgBody += "got %d, as expected, in %s\n" % (
                    actual, str(eval_stop - eval_start))
            else:
                msgBody += "Didn't check expected.  got %d\n" % (actual)

        except Exception:
            stackTrace = traceback.format_exc()
            failed = True
            msgBody += "failed with %s!\n\n" % m
            msgBody += stackTrace
    end_time = datetime.datetime.now()
    msgBody = "Ran %d tests in " % len(models) + str(
        end_time - start_time) + "\n" + msgBody
    return failed, msgBody
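The helper sh() is expected to run a shell command (here a rake task) and return its output for the log; it is not defined in the snippet. A minimal sketch using subprocess:

import subprocess

def sh(cmd):
    # Run a shell command, fail loudly on a non-zero exit code, and return
    # the combined stdout/stderr so it can be appended to the run log.
    result = subprocess.run(cmd, shell=True, check=True, text=True,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    return result.stdout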
Example #4
def create_subdirectory_if_not_exists(dir_name):
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
        print("Directory", dir_name, "created.")
    else:
        print("Directory", dir_name, "already exists.")


# Create output directory to store preprocessed data and trained model
create_subdirectory_if_not_exists("out")

# Define file locations
train_data = "out/preprocessed_train.npz"
test_data = "out/preprocessed_test.npz"
model_file = "out/model.h5"

if not os.path.isfile(train_data) or not os.path.isfile(test_data):
    # Preprocess data
    preprocess(train_data=train_data, test_data=test_data)
    print("Data preprocessed and saved locally.")
else:
    print("Preprocessed data exists.")

# Train model
build(train_data=train_data, save_file=model_file)

# Evaluate model
scores = evaluate(model_file, test_data)
print("Final scores:", scores)
Example #5
import os, shutil
from keras import backend

import settings
from train_model import train
from evaluate_model import evaluate

for config in settings.TESTS_CONFIG:
    print("Training model ", str(config))
    train(config)
    file = os.path.basename(config.file_name)
    os.rename(config.file_name,
              os.path.join(settings.completed_tests_folder, file))
    shutil.move(config.model_dir,
                os.path.join(settings.results_folder, str(config)))
    print("Evaluating model ", str(config))
    evaluate(config)
    backend.clear_session()
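The loop assumes each entry in settings.TESTS_CONFIG exposes at least file_name, model_dir, and a meaningful str() representation. A hypothetical minimal shape for such a config object (not the actual settings module):

from dataclasses import dataclass

@dataclass
class TestConfig:
    # Hypothetical config; field names beyond file_name and model_dir
    # are assumptions used only for illustration.
    name: str        # used for result folder naming via str(config)
    file_name: str   # spec file moved to completed_tests_folder after training
    model_dir: str   # directory where train() writes the model

    def __str__(self):
        return self.name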
Example #6
            # 7, 8, 9, 10
            new_y.append(2)
    y = new_y
    X = dataset[:, 0:11]

    # Split the dataset: 80% train / 20% test
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=5)

    # Now we tune the hyperparameters using only the training data
    best_parameters = svc_tunning(x_train, y_train)
    # best_parameters = {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
    # best_parameters = {'C': 10, 'gamma': 0.9 , 'kernel': 'rbf'}

    # Train an SVC with the best parameters on the training data
    trained_model = svc_train(x_train,
                              y_train,
                              c=best_parameters["C"],
                              gamma=best_parameters["gamma"],
                              kernel=best_parameters["kernel"])

    # Evaluate the model
    print("Evaluate model\n")
    evaluate(trained_model, x_train, y_train)

    # Test the model
    print("Test model\n")
    test(trained_model, x_test, y_test)
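svc_tunning() is expected to search for the best C, gamma, and kernel on the training split only and return a dict with those keys. A sketch assuming it wraps scikit-learn's GridSearchCV around an SVC (the grid values are assumptions):

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def svc_tunning(x_train, y_train):
    # Hypothetical grid search over the SVC hyperparameters that the
    # caller reads back: 'C', 'gamma' and 'kernel'.
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 0.9],
        'kernel': ['rbf', 'linear'],
    }
    search = GridSearchCV(SVC(), param_grid, cv=5)
    search.fit(x_train, y_train)
    return search.best_params_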
Example #7
def main():
    #mypath = r'/home/connlab/108IR/will/final/NVSM_pytorch/'
    mypath = r'C:/Users/willll/Desktop/WIillll/IRCLass/Final/NVSM_pytorch'
    pretrained_model = 'bert-base-uncased'
    glove_path = Path(mypath + '/glove')
    model_folder = Path(mypath + '/models')
    # data_folder           = Path(mypath + '/data/processed')
    data_folder = Path(mypath + '/Willll/fakedoc')
    testing_query_folder = Path(mypath + '/Willll/test/query')
    model_path = model_folder / 'nvsm_bert.pt'
    batch_size = 140  # for 150, 8053 / 8113MB GPU memory, to tweak
    epochs = 1
    docs, queries, tokenizer = load_data(data_folder, testing_query_folder,
                                         pretrained_model)
    # docs = docs[:20]
    doc_names = [doc['name'] for doc in docs]
    n_grams, document_ids = create_dataset(
        tok_docs=[doc['tokens'] for doc in docs], tokenizer=tokenizer, n=30)
    print('N-grams number', len(n_grams))
    k_values = [1, 3, 5, 10]
    (train_data, eval_data,
     eval_train_data) = create_pytorch_datasets(n_grams, document_ids)
    print('Train dataset size', len(train_data))
    print('Eval dataset size', len(eval_data))
    print('Eval (training) dataset size', len(eval_train_data))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_data, batch_size=batch_size, shuffle=False)
    eval_train_loader = DataLoader(eval_train_data,
                                   batch_size=batch_size,
                                   shuffle=False)
    device = torch.device('cuda')
    lamb = 1e-3
    nvsm = NVSMBERT(
        pretrained_model=pretrained_model,
        n_doc=len(doc_names),
        dim_doc_emb=20,
        neg_sampling_rate=10,
    ).to(device)
    # BERT custom optimizer
    param_optimizer = list(nvsm.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=5e-5,
                         warmup=0.1,
                         t_total=len(train_loader) * epochs)
    train(nvsm=nvsm,
          device=device,
          optimizer=optimizer,
          epochs=epochs,
          train_loader=train_loader,
          eval_loader=eval_train_loader,
          k_values=k_values,
          loss_function=loss_function,
          lamb=lamb,
          print_every=10000)
    torch.save(nvsm.state_dict(), model_path)
    nvsm.eval()
    recall_at_ks = evaluate(
        nvsm=nvsm,
        device=device,
        eval_loader=eval_loader,
        recalls=k_values,
        loss_function=loss_function,
    )
    print(generate_eval(k_values, recall_at_ks))
    queries_text = [query['tokens'] for query in queries]
    queries_name = [query['name'] for query in queries]
    evaluation_results, ranksResults = evaluate_queries_bert(
        nvsm, queries_text, doc_names, tokenizer, batch_size, device)
    print(evaluation_results)
    # print(len(ranksResults))
    for query_name, query_text, doc_idx in zip(queries_name, queries_text,
                                               evaluation_results):
        print(f'{query_name} {query_text:35} -> {doc_names[doc_idx]}')

    with open(mypath + '/Willll/result.txt', 'w') as f:
        f.write('Query,RetrievedDocuments\n')
        resuList = ' '
        for qIndex, qName in enumerate(queries_name):
            f.write(f'{qName},')
            f.write(
                f'{resuList.join(doc_names[x] for x in ranksResults[qIndex])}\n'
            )
Example #8
def main():
    model_folder = Path('../../models')
    data_folder = Path('../../data/processed')
    model_path = model_folder / 'nvsm_30_20_10.pt'
    batch_size = 1000
    voc, stoi, itos, docs = load_data(model_folder, data_folder)
    doc_names = [doc['name'] for doc in docs]
    print('Vocabulary size', len(voc))
    n_grams, document_ids = create_dataset(
        tok_docs=[doc['tokens'] for doc in docs], stoi=stoi, n=10)
    print('N-grams number', len(n_grams))
    k_values = [1, 3, 5, 10]
    (train_data, eval_data,
     eval_train_data) = create_pytorch_datasets(n_grams, document_ids)
    print('Train dataset size', len(train_data))
    print('Eval dataset size', len(eval_data))
    print('Eval (training) dataset size', len(eval_train_data))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_data, batch_size=batch_size, shuffle=False)
    eval_train_loader = DataLoader(eval_train_data,
                                   batch_size=batch_size,
                                   shuffle=False)
    device = torch.device('cuda')
    lamb = 1e-3  # regularization weight in the loss
    nvsm = NVSMLinear(n_doc=len(doc_names),
                      n_tok=len(stoi),
                      dim_doc_emb=20,
                      dim_tok_emb=30,
                      neg_sampling_rate=10,
                      pad_token_id=stoi['<PAD>']).to(device)
    optimizer = optim.Adam(nvsm.parameters(), lr=1e-3)
    train(nvsm=nvsm,
          device=device,
          optimizer=optimizer,
          epochs=120,
          train_loader=train_loader,
          eval_loader=eval_train_loader,
          k_values=k_values,
          loss_function=loss_function,
          lamb=lamb,
          print_every=500)
    torch.save(nvsm.state_dict(), model_path)
    nvsm.eval()
    recall_at_ks = evaluate(
        nvsm=nvsm,
        device=device,
        eval_loader=eval_loader,
        recalls=k_values,
        loss_function=loss_function,
    )
    print(generate_eval(k_values, recall_at_ks))
    queries_text = [
        'violence king louis decapitated', 'domain language translate',
        'governement robespierre', 'perfect imperfect information',
        'ontology translation', 'high levels of political violence',
        'state education system which promotes civic values',
        'political struggles',
        'Almost all future revolutionary movements looked back to the Revolution as their predecessor',
        'Habermas argued that the dominant cultural model in 17th century France was a "representational" culture',
        'mathematical model winning strategy',
        'solutions for two-person zero-sum games',
        'cooperative coalitions bargaining', 'eigenvalue',
        'graph, dimension and components', 'inner product vertex'
    ]
    evaluation_results = evaluate_queries(nvsm, queries_text, doc_names, stoi,
                                          batch_size, device)
    for query, doc_idx in zip(queries_text, evaluation_results):
        print(f'{query:35} -> {doc_names[doc_idx]}')
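generate_eval() turns the recall@k results into the one-line summary that gets printed; its definition is not included. A minimal sketch, assuming recall_at_ks is a sequence aligned with k_values:

def generate_eval(k_values, recall_at_ks):
    # Hypothetical formatter: pair each cutoff k with its recall value
    # and build a compact string for logging.
    return ' | '.join(f'recall@{k}: {recall:.4f}'
                      for k, recall in zip(k_values, recall_at_ks))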
Example #9
def train(data_loader, data_size, batch_size, embedding_dim, hidden_dim,
          sentence_length, num_layers, epochs, learning_rate, tag2id,
          model_saved_path, train_log_path, validate_log_path,
          train_history_image_path):
    '''
    data_loader: dataset loader, built earlier via load_dataset
    data_size:   number of samples in the training and validation sets
    batch_size:  number of samples per batch
    embedding_dim:   dimension of the word embeddings
    hidden_dim:      dimension of the hidden layer
    sentence_length: maximum text length
    num_layers:      number of stacked LSTM layers
    epochs:          number of training epochs
    learning_rate:   learning rate
    tag2id:          mapping dictionary from tag to id
    model_saved_path:  path where the model is saved
    train_log_path:    path where the training log is saved
    validate_log_path: path where the validation log is saved
    train_history_image_path:  path where the training-history plots are saved
    '''
    # Load the Chinese character-to-id mapping table into memory
    char2id = json.load(
        open("./data/char_to_id.json", mode="r", encoding="utf-8"))
    # Initialize the BiLSTM_CRF model
    model = BiLSTM_CRF(vocab_size=len(char2id),
                       tag_to_ix=tag2id,
                       embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       batch_size=batch_size,
                       num_layers=num_layers,
                       sequence_length=sentence_length)

    # Define the optimizer. SGD was used originally (the optimizers with GPU
    # acceleration for PyTorch's Embedding are SGD and SparseAdam).
    # Parameters:
    # lr:          optimizer learning rate
    # momentum:    momentum factor, accelerates the gradient descent
    # optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.85, weight_decay=1e-4)
    optimizer = optim.Adam(params=model.parameters(),
                           lr=learning_rate,
                           betas=(0.9, 0.999),
                           eps=1e-8,
                           weight_decay=1e-4)

    # Set the learning-rate schedule for the optimizer
    # Parameters:
    # optimizer:    the optimizer to update
    # step_size:    update frequency, i.e. how many epochs between learning-rate updates
    # gamma:        learning-rate decay factor,
    #               the ratio by which the rate is decayed relative to the previous value, default 0.1
    #   Example:
    #   initial learning rate lr = 0.5,    step_size = 20,    gamma = 0.1
    #              lr = 0.5     if epoch < 20
    #              lr = 0.05    if 20 <= epoch < 40
    #              lr = 0.005   if 40 <= epoch < 60
    # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.8)

    # Lists that hold the training loss, accuracy, recall and F1 values
    train_loss_list = []
    train_acc_list = []
    train_recall_list = []
    train_f1_list = []
    train_log_file = open(train_log_path, mode="w", encoding="utf-8")
    # Lists that hold the validation loss, accuracy, recall and F1 values
    validate_loss_list = []
    validate_acc_list = []
    validate_recall_list = []
    validate_f1_list = []
    validate_log_file = open(validate_log_path, mode="w", encoding="utf-8")
    # Build the id-to-tag mapping from tag2id
    id2tag = {v: k for k, v in tag2id.items()}
    # Build the id-to-character mapping from char2id
    id2char = {v: k for k, v in char2id.items()}

    # Loop for the configured number of epochs
    for epoch in range(epochs):
        # Print the current epoch before the progress bar
        tqdm.write("Epoch {}/{}".format(epoch + 1, epochs))
        # Counters for correctly recognized, predicted and gold entities
        total_acc_entities_length, \
        total_predict_entities_length, \
        total_gold_entities_length = 0, 0, 0
        # Step counter, accumulated loss, accuracy and f1 for this epoch
        step, total_loss, correct, f1 = 1, 0.0, 0, 0

        # Training phase of the current epoch
        for inputs, labels in tqdm(data_loader["train"]):
            # Wrap the data in Variable
            inputs, labels = Variable(inputs), Variable(labels)
            # Zero the gradients before each batch, otherwise they accumulate
            optimizer.zero_grad()
            # This calls neg_log_likelihood() of the BiLSTM_CRF class
            loss = model.neg_log_likelihood(inputs, labels)
            # Loss of the current step, taken from the tensor
            step_loss = loss.data
            # Accumulate the per-step loss
            total_loss += step_loss
            # Best decoded path list; this calls forward() of the BiLSTM_CRF class
            best_path_list = model(inputs)
            # Evaluation metrics: batch accuracy, recall, F1 and entity counts
            step_acc, step_recall, f1_score, acc_entities_length, \
            predict_entities_length, gold_entities_length = evaluate(inputs.tolist(),
                                                                     labels.tolist(),
                                                                     best_path_list,
                                                                     id2char,
                                                                     id2tag)
            # Per-step training log content
            '''
            log_text = "Epoch: %s | Step: %s " \
                       "| loss: %.5f " \
                       "| acc: %.5f " \
                       "| recall: %.5f " \
                       "| f1 score: %.5f" % \
                       (epoch, step, step_loss, step_acc, step_recall, f1_score)
            '''

            # Accumulate the correct, predicted and gold entity counts
            total_acc_entities_length += acc_entities_length
            total_predict_entities_length += predict_entities_length
            total_gold_entities_length += gold_entities_length

            # Backpropagate the loss
            loss.backward()
            # optimizer.step() applies the gradients and updates the parameters
            optimizer.step()
            # Write the training log
            # train_log_file.write(log_text + "\n")
            step += 1
        # Mean loss of this epoch (total loss divided by the number of training samples)
        epoch_loss = total_loss / data_size["train"]
        # Accuracy and recall of this epoch (default to 0 if nothing was counted)
        total_acc, total_recall = 0, 0
        if total_predict_entities_length > 0:
            total_acc = total_acc_entities_length / total_predict_entities_length
        if total_gold_entities_length > 0:
            total_recall = total_acc_entities_length / total_gold_entities_length
        # F1 of this epoch
        total_f1 = 0
        if total_acc + total_recall != 0:
            total_f1 = 2 * total_acc * total_recall / (total_acc +
                                                       total_recall)
        log_text = "Epoch: %s " \
                   "| mean loss: %.5f " \
                   "| total acc: %.5f " \
                   "| total recall: %.5f " \
                   "| total f1 score: %.5f" % (epoch, epoch_loss,
                                               total_acc,
                                               total_recall,
                                               total_f1)

        # Update the learning rate after this epoch; must come after the optimizer step
        # scheduler.step()

        # Record the training loss (for plotting), accuracy, recall and f1 of this epoch
        train_loss_list.append(epoch_loss)
        train_acc_list.append(total_acc)
        train_recall_list.append(total_recall)
        train_f1_list.append(total_f1)
        train_log_file.write(log_text + "\n")

        # Counters for correctly recognized, predicted and gold entities
        total_acc_entities_length, \
        total_predict_entities_length, \
        total_gold_entities_length = 0, 0, 0
        # Step counter, accumulated loss, accuracy and f1 for the validation pass
        step, total_loss, correct, f1 = 1, 0.0, 0, 0

        # Validation phase of the current epoch
        with torch.no_grad():
            for inputs, labels in tqdm(data_loader["validation"]):
                # Wrap the data in Variable
                inputs, labels = Variable(inputs), Variable(labels)
                # This calls neg_log_likelihood of the BiLSTM_CRF class
                # and returns the final CRF log-likelihood result
                try:
                    loss = model.neg_log_likelihood(inputs, labels)
                except Exception:
                    continue
                # Loss of the current step, taken from the tensor
                step_loss = loss.data
                # Accumulate the per-step loss
                total_loss += step_loss
                # Best decoded path list; this calls forward() of the BiLSTM_CRF class
                best_path_list = model(inputs)
                # Evaluation metrics: batch accuracy, recall, F1 and entity counts
                step_acc, step_recall, f1_score, acc_entities_length, \
                predict_entities_length, gold_entities_length = evaluate(inputs.tolist(),
                                                                         labels.tolist(),
                                                                         best_path_list,
                                                                         id2char,
                                                                         id2tag)

                # Per-step validation log content
                '''
                log_text = "Epoch: %s | Step: %s " \
                           "| loss: %.5f " \
                           "| acc: %.5f " \
                           "| recall: %.5f " \
                           "| f1 score: %.5f" % \
                           (epoch, step, step_loss, step_acc, step_recall, f1_score)
                '''

                # Accumulate the correct, predicted and gold entity counts
                total_acc_entities_length += acc_entities_length
                total_predict_entities_length += predict_entities_length
                total_gold_entities_length += gold_entities_length

                # Write the validation log
                # validate_log_file.write(log_text + "\n")
                step += 1

            # Mean loss over the validation set (total loss divided by the number of samples)
            epoch_loss = total_loss / data_size["validation"]
            # Overall accuracy and recall (default to 0 if nothing was counted)
            total_acc, total_recall = 0, 0
            if total_predict_entities_length > 0:
                total_acc = total_acc_entities_length / total_predict_entities_length
            if total_gold_entities_length > 0:
                total_recall = total_acc_entities_length / total_gold_entities_length
            # Overall F1
            total_f1 = 0
            if total_acc + total_recall != 0.0:
                total_f1 = 2 * total_acc * total_recall / (total_acc +
                                                           total_recall)
            log_text = "Epoch: %s " \
                       "| mean loss: %.5f " \
                       "| total acc: %.5f " \
                       "| total recall: %.5f " \
                       "| total f1 score: %.5f" % (epoch, epoch_loss,
                                                   total_acc,
                                                   total_recall,
                                                   total_f1)

            # Record the validation loss (for plotting), accuracy, recall and f1
            validate_loss_list.append(epoch_loss)
            validate_acc_list.append(total_acc)
            validate_recall_list.append(total_recall)
            validate_f1_list.append(total_f1)
            validate_log_file.write(log_text + "\n")

    # Save the model
    torch.save(model.state_dict(), model_saved_path)

    # Save the loss history as an image
    save_train_history_image(train_loss_list, validate_loss_list,
                             train_history_image_path, "Loss")
    # Save the accuracy history as an image
    save_train_history_image(train_acc_list, validate_acc_list,
                             train_history_image_path, "Acc")
    # Save the recall history as an image
    save_train_history_image(train_recall_list, validate_recall_list,
                             train_history_image_path, "Recall")
    # Save the F1 history as an image
    save_train_history_image(train_f1_list, validate_f1_list,
                             train_history_image_path, "F1")
    print("train Finished".center(100, "-"))
Example #10
def train(data_loader, data_size, batch_size, embedding_dim, hidden_dim,
          sentence_length, num_layers, epochs, learning_rate, tag2id, 
          model_saved_path, train_log_path,
          validate_log_path, train_history_image_path):

    char2id = json.load(open("./data/char_to_id.json", mode="r", encoding="utf-8"))
    # Initialize the BiLSTM model
    model = BiLSTM(vocab_size=len(char2id), tag_to_ix=tag2id,
                   embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                   batch_size=batch_size, num_layers=num_layers,
                   sequence_length=sentence_length)

    # Define the optimizer. SGD is used because the optimizers with GPU
    # acceleration for torch's Embedding are SGD and SparseAdam.
    # Parameters:
    #   params      model parameters to update
    #   lr          optimizer learning rate
    #   momentum    momentum factor, accelerates the gradient descent
    optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.85)

    # Set the learning-rate schedule for the optimizer
    # Parameters:
    #   optimizer   the optimizer to update
    #   step_size   update frequency, i.e. how many epochs between learning-rate updates
    #   gamma       learning-rate decay factor,
    #               the ratio by which the rate is decayed relative to the previous value, default 0.1
    # ----------------------------------------------------------------------
    #   Example:
    #   >>> # initial learning rate lr = 0.5, step_size = 20, gamma = 0.1
    #   >>> # lr = 0.5     if epoch < 20
    #   >>> # lr = 0.05    if 20 <= epoch < 40
    #   >>> # lr = 0.005   if 40 <= epoch < 60
    scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.2)

    train_loss_list = []
    train_acc_list = []
    train_recall_list = []
    train_f1_list = []
    train_log_file = open(train_log_path, mode="w", encoding="utf-8")
    # Lists that record the validation loss (for plotting) and the validation log file
    validate_loss_list = []
    validate_acc_list = []
    validate_recall_list = []
    validate_f1_list = []
    validate_log_file = open(validate_log_path, mode="w", encoding="utf-8")
    # Invert the tag-to-id mapping
    id2tag = {v: k for k, v in tag2id.items()}
    # Invert the character-to-id mapping
    id2char = {v: k for k, v in char2id.items()}

    for epoch in range(epochs):
        # Print the current epoch before the progress bar
        tqdm.write("Epoch {}/{}".format(epoch + 1, epochs))
        # Counters for correctly recognized, predicted and gold entities
        total_acc_entities_length, \
        total_predict_entities_length, \
        total_gold_entities_length = 0, 0, 0
        # Step counter, accumulated loss, accuracy and f1 for this epoch
        step, total_loss, correct, f1 = 1, 0.0, 0, 0

        for inputs, labels in tqdm(data_loader["train"]):
            # Wrap the data in Variable
            inputs, labels = Variable(inputs), Variable(labels)
            # PyTorch accumulates gradients, so clear them before each batch
            optimizer.zero_grad()
            # Call the model's neg_log_likelihood method, which returns
            # the final negative log-likelihood loss
            loss = model.neg_log_likelihood(inputs, labels)
            # Loss of the current step, taken from the tensor
            step_loss = loss.data
            # Accumulate the per-step loss
            total_loss += step_loss
            # Best decoded path list
            best_path_list = model(inputs)
            # Evaluation metrics: batch accuracy, recall, F1 and entity counts
            step_acc, step_recall, f1_score, acc_entities_length, \
            predict_entities_length, gold_entities_length = evaluate(inputs.tolist(),
                                                                     labels.tolist(),
                                                                     best_path_list,
                                                                     id2char,
                                                                     id2tag)
            # Per-step training log line
            log_text = "Epoch: %s | Step: %s " \
                       "| loss: %.5f " \
                       "| acc: %.5f " \
                       "| recall: %.5f " \
                       "| f1 score: %.5f" % \
                       (epoch, step, step_loss, step_acc, step_recall, f1_score)
            # Accumulate the correct, predicted and gold entity counts
            total_acc_entities_length += acc_entities_length
            total_predict_entities_length += predict_entities_length
            total_gold_entities_length += gold_entities_length
            loss.backward()
            # optimizer.step() applies the gradients and updates the parameters
            optimizer.step()
            # Write the training log
            train_log_file.write(log_text + "\n")
            step += 1
        # Mean loss of this epoch (total loss divided by the number of training samples)
        epoch_loss = total_loss / data_size["train"]
        # Overall accuracy
        total_acc = total_acc_entities_length / total_predict_entities_length
        # Overall recall
        total_recall = total_acc_entities_length / total_gold_entities_length
        # Overall F1
        total_f1 = 0
        if total_acc + total_recall != 0:
            total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
        log_text = "Epoch: %s " \
                   "| mean loss: %.5f " \
                   "| total acc: %.5f " \
                   "| total recall: %.5f " \
                   "| total f1 score: %.5f" % (epoch, epoch_loss,
                                               total_acc,
                                               total_recall,
                                               total_f1)
        # Update the learning rate after this epoch;
        # must come after the optimizer step
        scheduler.step()
        # Record the training loss (for plotting), accuracy, recall and f1 of this epoch
        train_loss_list.append(epoch_loss)
        train_acc_list.append(total_acc)
        train_recall_list.append(total_recall)
        train_f1_list.append(total_f1)
        train_log_file.write(log_text + "\n")
    # Save the model
    torch.save(model.state_dict(), model_saved_path)

    # Save the loss history as an image
    save_train_history_image(train_loss_list,
                             validate_loss_list,
                             train_history_image_path,
                             "Loss")
    # Save the accuracy history as an image
    save_train_history_image(train_acc_list,
                             validate_acc_list,
                             train_history_image_path,
                             "Acc")
    # Save the recall history as an image
    save_train_history_image(train_recall_list,
                             validate_recall_list,
                             train_history_image_path,
                             "Recall")
    # Save the F1 history as an image
    save_train_history_image(train_f1_list,
                             validate_f1_list,
                             train_history_image_path,
                             "F1")
    print("train Finished".center(100, "-"))
Example #11
def main():
    #mypath = r'/home/connlab/108IR/will/final/NVSM_pytorch/'
    mypath = r'C:/Users/willll/Desktop/WIillll/IRfinal/NVSM_pytorch'
    print(mypath)
    pretrained_model      = 'bert-base-uncased'
    glove_path            = Path(mypath + '/glove')
    model_folder          = Path(mypath + '/models')
    # data_folder           = Path(mypath + '/data/processed')
    data_folder           = Path(mypath + '/Willll/fakedoc')
    testing_query_folder  = Path(mypath + '/Willll/test/query')
    model_path            = model_folder / 'nvsm_bert.pt'
    batch_size            = 140 # for 150, 8053 / 8113MB GPU memory, to tweak
    epochs                = 3
    docs, queries, tokenizer      = load_data(
        data_folder,
        testing_query_folder,
        pretrained_model
    )
    # docs = docs[:20]
    doc_names             = [doc['name'] for doc in docs]
    n_grams, document_ids = create_dataset(
        tok_docs  = [doc['tokens'] for doc in docs],
        tokenizer = tokenizer,
        n         = 10
    )

    print('N-grams number', len(n_grams))
    k_values              = [1, 3, 5, 10]
    (train_data,
     eval_data,
     eval_train_data)     = create_pytorch_datasets(n_grams, document_ids)
    print('Train dataset size', len(train_data))
    print('Eval dataset size', len(eval_data))
    print('Eval (training) dataset size', len(eval_train_data))


    eval_loader           = DataLoader(eval_data, batch_size = batch_size, shuffle = False)
    eval_train_loader     = DataLoader(eval_train_data, batch_size = batch_size, shuffle = False)
    device                = torch.device('cuda')
    lamb                  = 1e-3

    nvsm                  = NVSMBERT(
        pretrained_model  = pretrained_model,
        n_doc             = len(doc_names),
        dim_doc_emb       = 20,
        neg_sampling_rate = 10,
    ).to(device)
    #torch.save(nvsm.state_dict(), model_path)
    nvsm.load_state_dict(torch.load(model_path))
    nvsm.eval()
    recall_at_ks = evaluate(
        nvsm          = nvsm,
        device        = device,
        eval_loader   = eval_loader,
        recalls       = k_values,
        loss_function = loss_function,
    )
    print(generate_eval(k_values, recall_at_ks))
    queries_text             = [query['tokens'] for query in queries]
    queries_name             = [query['name'] for query in queries]
#    queries_text          = [
#        'violence king louis decapitated',
#        'domain language translate',
#        'governement robespierre',
#        'perfect imperfect information',
#        'ontology translation',
#        'high levels of political violence',
#        'state education system which promotes civic values',
#        'political struggles',
#        'Almost all future revolutionary movements looked back to the Revolution as their predecessor',
#        'Habermas argued that the dominant cultural model in 17th century France was a "representational" culture',
#        'mathematical model winning strategy',
#        'solutions for two-person zero-sum games',
#        'cooperative coalitions bargaining',
#        'eigenvalue',
#        'graph, dimension and components',
#        'inner product vertex'
#    ]

    evaluation_results, ranksResults = evaluate_queries_bert(
        nvsm,
        queries_text,
        doc_names,
        tokenizer,
        batch_size,
        device
    )
    print(evaluation_results)
    # print(len(ranksResults))
    for query_name, query_text, doc_idx in zip(queries_name, queries_text, evaluation_results):
        print(f'{query_name} {query_text:35} -> {doc_names[doc_idx]}')

    with open(mypath + '/Willll/result.txt', 'w') as f:
        f.write('Query,RetrievedDocuments\n')
        resuList = ' '
        for qIndex, qName in enumerate(queries_name):
            f.write(f'{qName},')
            f.write(f'{resuList.join(doc_names[x] for x in ranksResults[qIndex])}\n')
Example #12
        y_train,
        penalty=best_parameters["model_non_regularized"]["Penalty"],
        C=best_parameters["model_non_regularized"]["C"],
        solver=best_parameters["model_non_regularized"]["Solver"],
        multi_class=best_parameters["model_non_regularized"]["MultiClass"],
        max_iter=1000)

    lg = LogisticRegression(
        penalty=best_parameters["model_regularized"]["Penalty"],
        C=best_parameters["model_regularized"]["C"],
        solver=best_parameters["model_regularized"]["Solver"],
        multi_class=best_parameters["model_regularized"]["MultiClass"],
        max_iter=1000)
    cvs = cross_val_score(lg, x_train, y_train, cv=4)

    # Evaluate the model
    print("\n Evaluate the model \n")
    print("\nRegularized:")
    evaluate(trained_model_regularized, x_train, y_train)
    print("\n Cross_validation")
    print(cvs)
    print("\nNon Regularized:")
    evaluate(trained_model_non_regularized, x_train, y_train)

    # Test the model
    print("\n Test the model \n")
    print("\nRegularized:")
    test(trained_model_regularized, x_test, y_test)
    print("\nNon Regularized:")
    test(trained_model_non_regularized, x_test, y_test)
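best_parameters is read as a nested dict with per-model entries keyed 'Penalty', 'C', 'Solver', and 'MultiClass'; how it is built is not shown. One plausible way such an entry could be produced with GridSearchCV (the helper name and grid values are assumptions):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

def tune_logistic_regression(x_train, y_train):
    # Hypothetical tuning helper returning the keys the snippet expects.
    param_grid = {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'saga'],
        'multi_class': ['auto', 'multinomial'],
    }
    search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=4)
    search.fit(x_train, y_train)
    best = search.best_params_
    return {
        'Penalty': best['penalty'],
        'C': best['C'],
        'Solver': best['solver'],
        'MultiClass': best['multi_class'],
    }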
Example #13
def main():
    pretrained_model = 'bert-base-uncased'
    glove_path = Path('../../glove')
    model_folder = Path('../../models')
    data_folder = Path('../../data/processed')
    model_path = model_folder / 'nvsm_bert.pt'
    batch_size = 140  # for 150, 8053 / 8113MB GPU memory, to tweak
    epochs = 3
    docs, tokenizer = load_data(data_folder, pretrained_model)
    # docs = docs[:20]
    doc_names = [doc['name'] for doc in docs]
    n_grams, document_ids = create_dataset(
        tok_docs=[doc['tokens'] for doc in docs], tokenizer=tokenizer, n=10)
    print('N-grams number', len(n_grams))
    k_values = [1, 3, 5, 10]
    (train_data, eval_data,
     eval_train_data) = create_pytorch_datasets(n_grams, document_ids)
    print('Train dataset size', len(train_data))
    print('Eval dataset size', len(eval_data))
    print('Eval (training) dataset size', len(eval_train_data))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_data, batch_size=batch_size, shuffle=False)
    eval_train_loader = DataLoader(eval_train_data,
                                   batch_size=batch_size,
                                   shuffle=False)
    device = torch.device('cuda')
    lamb = 1e-3
    nvsm = NVSMBERT(
        pretrained_model=pretrained_model,
        n_doc=len(doc_names),
        dim_doc_emb=20,
        neg_sampling_rate=10,
    ).to(device)
    # BERT custom optimizer
    param_optimizer = list(nvsm.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=5e-5,
                         warmup=0.1,
                         t_total=len(train_loader) * epochs)
    train(nvsm=nvsm,
          device=device,
          optimizer=optimizer,
          epochs=epochs,
          train_loader=train_loader,
          eval_loader=eval_train_loader,
          k_values=k_values,
          loss_function=loss_function,
          lamb=lamb,
          print_every=500)
    torch.save(nvsm.state_dict(), model_path)
    nvsm.eval()
    recall_at_ks = evaluate(
        nvsm=nvsm,
        device=device,
        eval_loader=eval_loader,
        recalls=k_values,
        loss_function=loss_function,
    )
    print(generate_eval(k_values, recall_at_ks))
    queries_text = [
        'violence king louis decapitated', 'domain language translate',
        'governement robespierre', 'perfect imperfect information',
        'ontology translation', 'high levels of political violence',
        'state education system which promotes civic values',
        'political struggles',
        'Almost all future revolutionary movements looked back to the Revolution as their predecessor',
        'Habermas argued that the dominant cultural model in 17th century France was a "representational" culture',
        'mathematical model winning strategy',
        'solutions for two-person zero-sum games',
        'cooperative coalitions bargaining', 'eigenvalue',
        'graph, dimension and components', 'inner product vertex'
    ]
    evaluation_results = evaluate_queries_bert(nvsm, queries_text, doc_names,
                                               tokenizer, batch_size, device)
    for query, doc_idx in zip(queries_text, evaluation_results):
        print(f'{query:35} -> {doc_names[doc_idx]}')
Example #14
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Import custom functions for evaluation and video recording
from evaluate_model import evaluate
from record_model import record

from nes_py.wrappers import JoypadSpace
import gym_tetris
from gym_tetris.actions import MOVEMENT, SIMPLE_MOVEMENT, TRAIN_MOVEMENT
from stable_baselines.common.vec_env import DummyVecEnv, VecVideoRecorder
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

env = gym_tetris.make('TetrisA-v3')
env = JoypadSpace(env, TRAIN_MOVEMENT)
env = DummyVecEnv([lambda: env])

model = DQN.load("TetrisA-v2_DQN_200k", env=env, verbose=1)
mean_reward = evaluate(model=model, env=env, episode=20, render=True)

#status = evaluate(model, env, num_steps=12000, render = True)
#status = record( model=model, env=env, num_episodes=3)

print(mean_reward)
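evaluate() comes from the project's own evaluate_model module. A sketch of what it might do, assuming it rolls the DQN policy out for a fixed number of episodes in the single-environment DummyVecEnv and returns the mean episode reward:

import numpy as np

def evaluate(model, env, episode=20, render=False):
    # Hypothetical evaluation loop for a stable-baselines model wrapped
    # in a DummyVecEnv with a single environment.
    episode_rewards = []
    for _ in range(episode):
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action, _states = model.predict(obs)
            obs, rewards, dones, infos = env.step(action)
            total_reward += float(rewards[0])
            done = bool(dones[0])
            if render:
                env.render()
        episode_rewards.append(total_reward)
    return float(np.mean(episode_rewards))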