Example #1
def predict():
    test_contents, test_labels = load_corpus('./dataset/test.txt',
                                             word2id,
                                             max_sen_len=50)
    # Load the test set
    test_dataset = TensorDataset(
        torch.from_numpy(test_contents).type(torch.float),
        torch.from_numpy(test_labels).type(torch.long))
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=2)
    # Load the trained model
    model = TextCNN(config)
    model.load_state_dict(torch.load(config.model_path, map_location=device))
    model.eval()
    model.to(device)

    # Evaluation loop (no gradients needed at test time)
    count, correct = 0, 0
    with torch.no_grad():
        for batch_x, batch_y in test_dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            correct += (output.argmax(1) == batch_y).sum().item()
            count += len(batch_x)

    # Print accuracy
    print('test accuracy is {:.2f}%.'.format(100 * correct / count))
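Example #1 assumes a load_corpus helper that is not shown. A minimal sketch of what it might look like, assuming a tab-separated label<TAB>text file format and 0 as the padding id (both assumptions, not the project's actual code):

import numpy as np

# Hypothetical load_corpus: reads "label<TAB>text" lines, maps tokens to ids
# via word2id, and pads/truncates each sequence to max_sen_len.
def load_corpus(path, word2id, max_sen_len=50):
    contents, labels = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            label, _, text = line.strip().partition('\t')
            ids = [word2id.get(tok, 0) for tok in text.split()][:max_sen_len]
            ids += [0] * (max_sen_len - len(ids))  # pad to fixed length
            contents.append(ids)
            labels.append(int(label))
    return np.array(contents), np.array(labels)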
Example #2
def train(**kwargs):

    opt.parse(kwargs)
    device = torch.device(
        "cuda:{}".format(opt.gpu_id) if torch.cuda.is_available() else "cpu")
    opt.device = device

    x_text, y = load_data_and_labels("./data/rt-polarity.pos",
                                     "./data/rt-polarity.neg")
    x_train, x_test, y_train, y_test = train_test_split(
        x_text, y, test_size=opt.test_size)

    train_data = Data(x_train, y_train)
    test_data = Data(x_test, y_test)

    train_loader = DataLoader(train_data,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    test_loader = DataLoader(test_data,
                             batch_size=opt.batch_size,
                             shuffle=False,
                             collate_fn=collate_fn)

    print("{} train data: {}, test data: {}".format(now(), len(train_data),
                                                    len(test_data)))

    model = TextCNN(opt)
    print("{} init model finished".format(now()))

    if opt.use_gpu:
        model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.lr,
                           weight_decay=opt.weight_decay)

    for epoch in range(opt.epochs):
        total_loss = 0.0
        model.train()
        for step, batch_data in enumerate(train_loader):
            x, labels = batch_data
            labels = torch.LongTensor(labels)
            if opt.use_gpu:
                labels = labels.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        acc = test(model, test_loader)
        print("{} {} epoch: loss: {}, acc: {}".format(now(), epoch, total_loss,
                                                      acc))
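The per-epoch accuracy comes from a test helper that is not shown. A minimal sketch, assuming it receives the same (x, labels) batches that collate_fn produces and that opt is the same global config object:

import torch

# Hypothetical test helper: accuracy over a data loader, no gradients.
def test(model, data_loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, labels in data_loader:
            labels = torch.LongTensor(labels)
            if opt.use_gpu:
                labels = labels.to(opt.device)
            output = model(x)
            correct += (output.argmax(1) == labels).sum().item()
            total += len(labels)
    model.train()
    return correct / total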
Example #3
def test():
    # Configuration file
    cf = Config('./config.yaml')
    # Use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Test data (no need to shuffle at evaluation time)
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=False)

    # Pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Model
    model = TextCNN(cf, torch.tensor(embedding_matrix))

    model.load_state_dict(torch.load("./output/model.bin", map_location=device))
    # Move the model to the target device
    model.to(device)

    # Run in parallel across GPUs when more than one is available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    
    # Evaluation

    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    for batch in tqdm(test_dataloader, desc="batch"):
        label_id = batch['label_id'].squeeze(1).to(device)
        segment_ids = batch['segment_ids'].to(device)
        with torch.no_grad():
            pred = model.get_labels(segment_ids)
        y_pred = np.hstack((y_pred, pred))
        y_test = np.hstack((y_test, label_id.to("cpu").numpy()))

    # Evaluation report
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test, y_pred, target_names=get_labels('./data/label')))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
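get_labels('./data/label') supplies the class names for the report. A plausible sketch, assuming the label file stores one class name per line in label-id order (an assumption about the data format):

# Hypothetical get_labels: one class name per line, ordered by label id.
def get_labels(path):
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]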
Example #4
def objective(trial):

    model = TextCNN(trial, len(id2vocab), CLS)
    model.to(device)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.NLLLoss()
    
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = []
        for batch in train_iter:           
            text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label.to(device)
            model.zero_grad()
            out = model(text_idx_batch)
            loss = criterion(out, label_idx_batch)
            loss.backward()
            epoch_loss.append(loss.item())
            optimizer.step()   
        #print(f'Epoch[{epoch}] - Loss:{sum(epoch_loss)/len(epoch_loss)}')

        model.eval()
        predict_all = np.array([], dtype=int)
        labels_all = np.array([], dtype=int)
        with torch.no_grad():        
            for batch in val_iter:
                text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label
                pred = model(text_idx_batch)
                pred = torch.max(pred.data, 1)[1].cpu().numpy()
                predict_all = np.append(predict_all, pred)
                
                truth = label_idx_batch.cpu().numpy()
                labels_all = np.append(labels_all, truth)            
            
        acc = metrics.accuracy_score(labels_all, predict_all)
        
        trial.report(acc, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return acc
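Because objective returns validation accuracy and reports it each epoch, it plugs straight into an Optuna study. A typical driver, using a median pruner so trial.should_prune() can actually fire (n_trials is an arbitrary choice):

import optuna

study = optuna.create_study(direction="maximize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)
print("Best accuracy:", study.best_value)
print("Best hyperparameters:", study.best_params)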
Example #5
def train():
    train_contents, train_labels = load_corpus('./dataset/train.txt',
                                               word2id,
                                               max_sen_len=50)
    val_contents, val_labels = load_corpus('./dataset/validation.txt',
                                           word2id,
                                           max_sen_len=50)
    # Merge the training and validation sets
    contents = np.vstack([train_contents, val_contents])
    labels = np.concatenate([train_labels, val_labels])
    # Build the training dataset and loader
    train_dataset = TensorDataset(
        torch.from_numpy(contents).type(torch.float),
        torch.from_numpy(labels).type(torch.long))
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=2)
    model = TextCNN(config)
    if config.model_path:
        model.load_state_dict(torch.load(config.model_path, map_location=device))
    model.to(device)
    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    # Loss function
    criterion = nn.CrossEntropyLoss()
    # Training loop
    for epoch in range(config.epochs):
        for batch_idx, (batch_x, batch_y) in enumerate(train_dataloader):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            loss = criterion(output, batch_y)
            # 'and', not '&': bitwise '&' binds tighter than '==', which silently dropped the verbose check
            if batch_idx % 200 == 0 and config.verbose:
                print("Train Epoch:{}[{}/{} ({:.0f}%)]\tLoss:{:.6f}".format(
                    epoch + 1, batch_idx * len(batch_x),
                    len(train_dataloader.dataset),
                    100. * batch_idx / len(train_dataloader), loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # Save the model
    torch.save(model.state_dict(), './models/model.pth')
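This train() pairs with the predict() of Example #1: both come from the same project and share config and word2id. A typical entry point, assuming config is a mutable settings object visible to both functions:

if __name__ == '__main__':
    train()
    config.model_path = './models/model.pth'  # evaluate the checkpoint just saved
    predict()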
Example #6
File: main.py Project: scuhz/BERT
    PAD = 0
    model_name = 'GoogleNews-vectors-negative300.bin'
    word2vec = gensim.models.KeyedVectors.load_word2vec_format(model_name,
                                                               binary=True)
    vocab_file = make_vocab(data_file, vocab_output_file)
    vocab2idx = convert_vocab_to_idx(vocab_output_file)
    word_embedding = load_word_embedding(vocab2idx, word2vec)

    # Train/test split
    X, Y = load_data(data_file, vocab2idx)
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.1,
                                                        random_state=1)
    train_dataset = TensorDataset(torch.from_numpy(x_train),
                                  torch.from_numpy(y_train))
    test_dataset = TensorDataset(torch.from_numpy(x_test),
                                 torch.from_numpy(y_test))
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64)

    # Model setup
    model = TextCNN(word_embedding)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_func = nn.CrossEntropyLoss()

    # Model training
    train(model, device, optimizer, loss_func)
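The final train(...) call refers to a helper defined elsewhere in main.py. A minimal sketch consistent with the call signature, assuming train_loader is visible in the enclosing scope and the labels are integer class indices (both assumptions):

# Hypothetical train helper matching the call above.
def train(model, device, optimizer, loss_func, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device).long()
            optimizer.zero_grad()
            loss = loss_func(model(x), y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print('epoch {}: loss {:.4f}'.format(epoch + 1, total_loss))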
Example #7
def train():
    # Configuration file
    cf = Config('./config.yaml')
    # Use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data, batch_size=cf.batch_size, shuffle=True)
    # Test data, used here as the validation set (no need to shuffle)
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=False)

    # Pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Model
    model = TextCNN(cf, torch.tensor(embedding_matrix))
    # Adam optimizer over the trainable parameters only
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

    # Move the model to the target device
    model.to(device)

    # Run in parallel across GPUs when more than one is available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    
    # Training
    start_time = time.time()

    total_batch = 0  # total batches seen
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch index of the last improvement
    require_improvement = 1000  # stop early after 1000 batches without improvement

    flag = False
    model.train()
    for epoch_id in trange(cf.epoch,desc="Epoch"):
        for batch in tqdm(train_dataloader, desc="batch"):
            label_id = batch['label_id'].squeeze(1).to(device)
            segment_ids = batch['segment_ids'].to(device)

            loss = model(segment_ids,label_id)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1 

            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train,acc_train = model.get_loss_acc(segment_ids,label_id)
                loss_val,acc_val = evaluate(model,test_dataloader,device)
                
                if acc_val > best_acc_val:
                    # Save the best checkpoint; unwrap DataParallel so it
                    # loads back into a plain model (as test() expects)
                    best_acc_val = acc_val
                    last_improved = total_batch
                    to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
                    torch.save(to_save.state_dict(), "./output/model.bin")
                    improved_str = "*"
                else:
                    improved_str = ""
                
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))
                
                model.train()

            if total_batch - last_improved > require_improvement:
                print("长时间未优化")
                flag = True
                break
        if flag:
            break
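The evaluate helper called in the loop is not shown. A minimal sketch, assuming the model exposes the same get_loss_acc(segment_ids, label_id) interface used above and returns scalar values:

import torch

# Hypothetical evaluate helper: average loss and accuracy over a loader.
def evaluate(model, dataloader, device):
    model.eval()
    total_loss, total_acc, batches = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in dataloader:
            label_id = batch['label_id'].squeeze(1).to(device)
            segment_ids = batch['segment_ids'].to(device)
            loss, acc = model.get_loss_acc(segment_ids, label_id)
            total_loss += float(loss)
            total_acc += float(acc)
            batches += 1
    return total_loss / batches, total_acc / batches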
Example #8
def train(config):
    try:
        split = config["split"]
        data_path = config["data_path"]
        pretrained_model_dir = config["pretrained_model_dir"]
        pretrained_model_file = config["pretrained_model_file"]
        last_model_path = config["last_model_path"]
        save_to = config["save_to"]
        min_freq = config["min_freq"]
        batch_size = config["batch_size"]
        max_sent_length = config["max_sent_length"]
        embed_dim = config["embed_dim"]
        filter_num = config["filter_num"]
        filter_widths = config["filter_widths"]
        learning_rate = config["learning_rate"]
        patience = config["patience"]
        lr_decay = config["lr_decay"]
        max_num_trial = config["max_num_trial"]
        max_epoch = config["max_epoch"]
        save_every = config["save_every"]
        cuda = config["cuda"]
        debug = config["debug"]
    except KeyError as e:
        print("Missing config key: {}".format(e))
        exit(1)

    if not Path(save_to).exists():
        Path(save_to).mkdir()
    device = torch.device("cuda:0" if (
        torch.cuda.is_available() and cuda) else "cpu")

    # build torchtext field
    TEXT = torchtext.data.Field(tokenize='spacy', lower=True)
    LABEL = torchtext.data.Field(dtype=torch.long)

    train_data, test_data = IMDB.splits(TEXT, LABEL, root=data_path)
    if debug:
        train_data, val_data = train_data.split(split_ratio=0.1)
    train_data, val_data = train_data.split(split_ratio=0.7)
    train_iter, val_iter = torchtext.data.Iterator.splits(
        (train_data, val_data), batch_size=batch_size, device=device)

    pretrained_vector = None
    if (pretrained_model_file is not None) and (pretrained_model_dir
                                                is not None):
        pretrained_vector = Vectors(name=pretrained_model_file,
                                    cache=pretrained_model_dir)

    TEXT.build_vocab(train_data, min_freq=min_freq, vectors=pretrained_vector)
    LABEL.build_vocab(train_data)

    logging.info("saving TEXT/LABEL vocabulary...")
    with open(f"{save_to}/TEXT_vocab.bin", "wb") as f:
        dill.dump(TEXT, f)
    with open(f"{save_to}/LABEL_vocab.bin", "wb") as f:
        dill.dump(LABEL, f)

    if pretrained_vector is not None:
        assert embed_dim == TEXT.vocab.vectors.shape[-1], "incompatible embeddings"
    embed_num, class_num = len(TEXT.vocab), len(LABEL.vocab)

    model = TextCNN(embed_num,
                    embed_dim,
                    class_num,
                    filter_num,
                    filter_widths,
                    from_pretrained=TEXT.vocab.vectors).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    cross_entropy = nn.CrossEntropyLoss(weight=torch.tensor(
        [0, 0, 1.0, 1.0], device=device))  # class [<unk>,<pad>,'pos','neg']
    if last_model_path is not None:
        # load model
        logging.info(f'load model from  {last_model_path}')
        params = torch.load(last_model_path,
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        logging.info('restore parameters of the optimizers')
        optimizer.load_state_dict(torch.load(last_model_path + '.optim'))

    model.train()

    epoch = 0
    cur_trial = 0
    cur_patience = 0
    model_save_path = f"{save_to}/model_best"
    hist_valid_scores = []
    train_time = begin_time = time.time()
    logging.info("begin training!")
    while True:
        epoch += 1
        train_loss = 0.0
        step = 0
        for batch in train_iter:
            feature, target = batch.text.T, batch.label.squeeze(0)
            step += 1
            optimizer.zero_grad()
            res = model(feature)
            loss = cross_entropy(res, target)
            train_loss += loss.item()  # .item() avoids retaining the autograd graph
            loss.backward()
            optimizer.step()
        train_loss = train_loss / step
        val_loss, accuracy = evaluate(model, val_iter, cross_entropy)

        logging.info(
            f'epoch {epoch}\t train_loss: {train_loss}\t val_loss:{val_loss}\t val_accuracy:{accuracy}  speed:{time.time()-train_time:.2f}s/epoch\t time elapsed {time.time()-begin_time:.2f}s'
        )
        train_time = time.time()

        is_better = len(
            hist_valid_scores) == 0 or val_loss < min(hist_valid_scores)
        hist_valid_scores.append(val_loss)

        if epoch % save_every == 0:
            model.save(f"{save_to}/model_step_{epoch}")
            torch.save(optimizer.state_dict(),
                       f"{save_to}/model_step_{epoch}.optim")
        if is_better:
            cur_patience = 0
            print(f'save currently the best model to [{model_save_path}]')
            model.save(model_save_path)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif cur_patience < patience:
            cur_patience += 1
            print('hit patience %d' % cur_patience)

            if cur_patience == patience:
                cur_trial += 1
                print(f'hit #{cur_trial} trial')
                if cur_trial == max_num_trial:
                    print('early stop!')
                    exit(0)

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * lr_decay
                logging.info(
                    f'load previously best model and decay learning rate to {lr}'
                )

                # load model
                params = torch.load(model_save_path,
                                    map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)

                logging.info('restore parameters of the optimizers')
                optimizer.load_state_dict(
                    torch.load(model_save_path + '.optim'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                cur_patience = 0

        if epoch == max_epoch:
            print('reached maximum number of epochs!')
            exit(0)
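Every key that train() reads is unpacked at the top of the function, so a driving config is easy to reconstruct. The values below are illustrative placeholders, not the project's actual settings:

config = {
    "split": 0.7,
    "data_path": "./data",
    "pretrained_model_dir": "./.vector_cache",
    "pretrained_model_file": "glove.6B.300d.txt",
    "last_model_path": None,
    "save_to": "./checkpoints",
    "min_freq": 2,
    "batch_size": 64,
    "max_sent_length": 400,
    "embed_dim": 300,
    "filter_num": 100,
    "filter_widths": [3, 4, 5],
    "learning_rate": 1e-3,
    "patience": 5,
    "lr_decay": 0.5,
    "max_num_trial": 3,
    "max_epoch": 50,
    "save_every": 5,
    "cuda": True,
    "debug": False,
}
train(config)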