Example 1
def predict():
    # Load the test set
    test_contents, test_labels = load_corpus('./dataset/test.txt',
                                             word2id,
                                             max_sen_len=50)
    test_dataset = TensorDataset(
        torch.from_numpy(test_contents).type(torch.float),
        torch.from_numpy(test_labels).type(torch.long))
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=2)
    # Load the trained model
    model = TextCNN(config)
    model.load_state_dict(torch.load(config.model_path))
    model.eval()
    model.to(device)

    # Test loop; no_grad() skips building the autograd graph during inference
    count, correct = 0, 0
    with torch.no_grad():
        for batch_x, batch_y in test_dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            correct += (output.argmax(1) == batch_y).sum().item()
            count += len(batch_x)

    # Print the accuracy
    print('test accuracy is {:.2f}%.'.format(100 * correct / count))
Example 2
def build_textcnn_model(vocab, config, train=True):
    model = TextCNN(vocab.vocab_size, config)
    if train:
        model.train()
    else:
        model.eval()

    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    return model
Example 3
def build_textcnn_model(vocab, config, train=True):
    model = TextCNN(vocab.vocab_size, config)
    if train:
        model.train()
        # call train() before training the model
    else:
        model.eval()
        # call eval() before testing: it freezes BatchNorm and Dropout so they
        # use the values learned during training instead of batch statistics
    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    return model
Example 4
def test():
    # Config file
    cf = Config('./config.yaml')
    # Use the GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Test data (no need to shuffle at test time)
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=False)

    # Pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Model
    model = TextCNN(cf, torch.tensor(embedding_matrix))

    # model.load_state_dict(torch.load("./output/model.bin",map_location='cpu'))
    model.load_state_dict(torch.load("./output/model.bin"))
    # Move the model to the target device
    model.to(device)

    # Run the model in parallel across GPUs
    if torch.cuda.device_count()>1:
        model = torch.nn.DataParallel(model)
    
    # Evaluation
    start_time = time.time()

    data_len = len(test_dataloader)

    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    for step,batch in enumerate(tqdm(test_dataloader,"batch",total=len(test_dataloader))):
        
        label_id = batch['label_id'].squeeze(1).to(device) 
        segment_ids = batch['segment_ids'].to(device) 
        with torch.no_grad():
            pred = model.get_labels(segment_ids)
        y_pred = np.hstack((y_pred,pred))
        y_test = np.hstack((y_test,label_id.to("cpu").numpy()))

    # Per-class evaluation metrics
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test, y_pred, target_names=get_labels('./data/label')))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
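
If a single headline number is needed alongside the per-class report, overall accuracy can be read off the confusion matrix diagonal. A minimal sketch using the cm computed above, relying only on standard NumPy:

# correctly classified counts sit on the diagonal of the confusion matrix
accuracy = np.trace(cm) / cm.sum()
print("Accuracy: {:.4f}".format(accuracy))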
Example 5
def build_textcnn_model(vocab, config, train=True):
    model = TextCNN(vocab.vocab_size, config)
    if train:
        model.train()
        # call train() before training the model
    else:
        model.eval()
        # call eval() before testing: it freezes BatchNorm and Dropout so they
        # use the values learned during training instead of batch statistics

    # train() and eval() exist because some layers behave differently between
    # training and evaluation, e.g. Batch Normalization and Dropout.
    # BatchNorm normalizes the activations of intermediate layers and applies a
    # learned affine transform so the extracted feature distribution is not
    # destroyed; since all parameters are fixed once training ends, BN behaves
    # differently at training and test time.
    # Dropout counters overfitting: in each training batch, ignoring half of
    # the feature detectors noticeably reduces overfitting.
    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    return model
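
The comments above are easy to verify directly. A minimal sketch (not from the original repository) showing that Dropout is stochastic in train mode and a no-op in eval mode:

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(1, 8)

drop.train()
print(drop(x))  # about half the entries zeroed, survivors scaled by 1 / (1 - p) = 2

drop.eval()
print(drop(x))  # identical to the input: dropout is disabled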
Example 6
def objective(trial):

    model = TextCNN(trial, len(id2vocab), CLS)
    model.to(device)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.NLLLoss()
    
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = []
        for batch in train_iter:           
            text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label.to(device)
            model.zero_grad()
            out = model(text_idx_batch)
            loss = criterion(out, label_idx_batch)
            loss.backward()
            epoch_loss.append(loss.item())
            optimizer.step()   
        #print(f'Epoch[{epoch}] - Loss:{sum(epoch_loss)/len(epoch_loss)}')

        model.eval()
        predict_all = np.array([], dtype=int)
        labels_all = np.array([], dtype=int)
        with torch.no_grad():        
            for batch in val_iter:
                text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label
                pred = model(text_idx_batch)
                pred = torch.max(pred.data, 1)[1].cpu().numpy()
                predict_all = np.append(predict_all, pred)
                
                truth = label_idx_batch.cpu().numpy()
                labels_all = np.append(labels_all, truth)            
            
        acc = metrics.accuracy_score(labels_all, predict_all)
        
        trial.report(acc, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return acc
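
The objective above is only useful once it is handed to an Optuna study. A hypothetical driver, where the pruner choice and trial count are assumptions rather than part of the original snippet:

import optuna

# maximize validation accuracy; MedianPruner honors the trial.report() /
# trial.should_prune() calls made inside the objective above
study = optuna.create_study(direction="maximize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)
print(study.best_trial.params)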
Example 7
output = None
dl_output = None
ml_output = None

FEATURE_LABEL = [
    "PROJECT_NAME", "BUSINESS_UNIT", "REGION_ID", "REP_OFFICE_ID",
    "CUSTOMER_ID", "PROJECT_LEVEL_NAME", "BUSINESS_GROUP_NAME",
    "DELIVERY_TYPE", "PROJECT_LABEL"
]

# Deep Learning
if args.snapshot is not None:
    net.load_state_dict(torch.load(args.snapshot))

    net.eval()
    feature = []
    for label in FEATURE_LABEL:
        text = getattr(args, label)
        text = text_fields.preprocess(text)
        text = [[text_fields.vocab.stoi[x] for x in text]]
        x = text_fields.tensor_type(text)
        feature.append(x)

    # `volatile` Variables were removed in PyTorch 0.4; torch.no_grad() is the
    # idiomatic way to disable autograd during inference
    with torch.no_grad():
        dl_output = net(feature).int().squeeze(0).tolist()

# Machine Learning
if args.machine_learning_model is not None:
    classifiers = np.load(args.machine_learning_model)
Example 8
        loss = criterion(target, label)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        current_count += sample_batched['data'].size()[0]
        sys.stdout.write('epoch {0} / {1}: {2} / {3}\r'.format(
            epoch, nb_epoch, current_count, len(dataset_train)))

    sys.stdout.write('epoch {0} / {1}: {2} / {3}\n'.format(
        epoch, nb_epoch, current_count, len(dataset_train)))

    # Compute the loss on the dev set
    text_cnn.eval()
    with torch.no_grad():
        for i_batch, sample_batched in enumerate(data_loader_dev):
            data = sample_batched['data']
            label = sample_batched['label']
            if use_cuda:
                data = data.cuda()
                label = label.cuda()
            pred = text_cnn(data)
            loss = criterion(pred, label)
            dev_loss += loss.item()

    total_loss /= float(len(data_loader_train))
    dev_loss /= float(len(data_loader_dev))
    print('\ttrain loss: {:.4f}, dev loss: {:.4f}'.format(
        total_loss, dev_loss))
Example 9
import pickle

import torch
from sanic import Sanic
from sanic.request import Request
from sanic.response import HTTPResponse
from torchtext.data.utils import get_tokenizer

# TextCNN and TextProcessor are project-local helpers, assumed importable
from torch_config import EMBEDDINGS_DIR

app = Sanic('PyTorch API')

embeddings = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

model = TextCNN(
    embeddings=embeddings,
    n_filters=64,
    filter_sizes=[2, 3],
    dropout=0.0,
)

device = torch.device('cpu')
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()

text_processing = TextProcessor(
    wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
    tokenizer=get_tokenizer('basic_english'),
    standardize=True,
    min_len=3,
)


@app.post('/game')
async def game(request: Request):
    q = request.form.get('q', None)

    if q is None:
        return HTTPResponse(status=400)
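
The handler body is cut off above after the missing-parameter check. A hedged sketch of how inference could continue, mirroring the single-text pattern at the end of Example 10 (the process() call and the JSON response shape are assumptions, and the endpoint is kept separate from the truncated one):

from sanic.response import json as json_response

@app.post('/game_sketch')  # hypothetical endpoint
async def game_sketch(request: Request):
    q = request.form.get('q', None)
    if q is None:
        return HTTPResponse(status=400)
    x = text_processing.process(q)  # assumed TextProcessor API, as used in Example 10
    x = torch.tensor(x).unsqueeze(dim=0)
    with torch.no_grad():
        score = model(x).squeeze().item()
    return json_response({'score': score})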
Example 10
def main():
    device = torch.device('cuda')

    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

    text_processor = TextProcessor(
        wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )

    dataset = TextDataset(CORPUS_DIR, text_processor)

    # split into training and test set; computing the second length as the
    # remainder keeps the two sizes summing to len(dataset), which
    # random_split requires even when int() truncation rounds the first down
    train_len = int(len(dataset) * DATA_SPLIT)
    train_set, test_set = torch.utils.data.random_split(
        dataset, [train_len, len(dataset) - train_len])

    # count number of samples in each class
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1

    # get relative weights for classes
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum

    # reverse the weights since we're getting the inverse for the sampler
    class_count = list(reversed(class_count))

    # set weight for every sample
    weights = [class_count[int(x[1].item())] for x in train_set]

    # weighted sampler
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)

    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))

    # number of filters in each convolutional filter
    N_FILTERS = 64

    # sizes and number of convolutional layers
    FILTER_SIZES = [2, 3]

    # dropout for between conv and dense layers
    DROPOUT = 0.5

    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)

    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    EPOCHS = 12

    best_acc = 0.0

    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)

        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word indices vector and corresponding labels
            x, labels = data

            # send to device
            x = x.to(device)
            labels = labels.to(device)

            # make predictions
            predictions = model(x).squeeze()

            # calculate loss
            loss = criterion(predictions, labels)

            # learning stuff...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate
        with torch.no_grad():
            model.eval()

            correct = 0
            wrong = 0
            m = [[0, 0], [0, 0]]

            for data in test_loader:
                x, label = data
                x = x.to(device)

                predictions = model(x).squeeze()

                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0

                    m[y][y_pred] += 1

                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1

            model.train()

            acc = correct / (correct + wrong)
            if acc > best_acc:
                best_acc = acc
                # remove stale checkpoints; the glob pattern must match the
                # filename format used when saving below
                for file in glob.glob('models/state_*.pth'):
                    os.remove(file)
                torch.save(model.state_dict(), f'models/state_{epoch}.pth')

            print()
            print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
            print('[[TN, FP], [FN, TP]]')
            print(m)
            print()

    # put into evaluation mode
    model.eval()

    text_processor.do_standardize = True

    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
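
The two-class weight computation above generalizes to any number of classes. A sketch of the same inverse-frequency idea using collections.Counter (the helper name is illustrative, not from the original code):

from collections import Counter

def inverse_frequency_weights(dataset):
    # count samples per class
    counts = Counter(int(label.item()) for _, label in dataset)
    total = sum(counts.values())
    # a class's weight is the inverse of its relative frequency
    class_weight = {c: total / n for c, n in counts.items()}
    return [class_weight[int(label.item())] for _, label in dataset]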
Example 11
        print('epoch: %d, [iter: %d / all %d], class_loss: %f, domain_s_loss: %f, domain_t_loss: %f' \
        % (epoch, i, len_dataloader, class_loss.cpu().data.numpy(),
           domain_s_loss.cpu().data.numpy(), domain_t_loss.cpu().data.numpy()))
        logging.info('epoch: %d, [iter: %d / all %d], class_loss: %f, domain_s_loss: %f, domain_t_loss: %f' \
        % (epoch, i, len_dataloader, class_loss.cpu().data.numpy(),
           domain_s_loss.cpu().data.numpy(), domain_t_loss.cpu().data.numpy()))

        checkpoint_path = 'checkpoint/WithoutImage_' + str(epoch + 1) + '.pkl'
        torch.save(model.state_dict(), checkpoint_path)

# test
model = TextCNN(args, W)
model.load_state_dict(torch.load(checkpoint_path))
if torch.cuda.is_available():
    model.cuda()
model.eval()
test_sub = np.zeros((len(label_df['id']), 3), dtype=float)
batch = len(label_df['id']) // args.batch_size

for i, (test_data, event_labels) in enumerate(test_loader):
    test_text, test_mask = to_var(test_data[0]), to_var(test_data[1])

    test_text = test_text.long()
    test_mask = test_mask.float()
    test_outputs, domain_outputs = model(test_text, test_mask)
    if i != batch:
        test_sub[i * args.batch_size:(i + 1) *
                 args.batch_size, :] = to_np(test_outputs)
    else:
        test_sub[i *
                 args.batch_size:len(test_df['id']), :] = to_np(test_outputs)
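
The index arithmetic for the final partial batch is easy to get wrong. An equivalent accumulate-then-concatenate version (a sketch, not the original code) sidesteps it entirely:

outputs = []
for test_data, event_labels in test_loader:
    test_text = to_var(test_data[0]).long()
    test_mask = to_var(test_data[1]).float()
    test_outputs, _ = model(test_text, test_mask)
    outputs.append(to_np(test_outputs))
test_sub = np.concatenate(outputs, axis=0)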
Example 12
def train():
    # Config file
    cf = Config('./config.yaml')
    # Use the GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data, batch_size=cf.batch_size, shuffle=True)
    # Test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=True)

    # Pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Model
    model = TextCNN(cf, torch.tensor(embedding_matrix))
    # Adam optimizer, restricted to trainable parameters
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

    # Move the model to the target device
    model.to(device)

    # Run the model in parallel across GPUs
    if torch.cuda.device_count()>1:
        model = torch.nn.DataParallel(model)
    
    # Training
    start_time = time.time()

    total_batch = 0  # total number of batches seen
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch at which the last improvement occurred
    require_improvement = 1000  # stop early after 1000 batches without improvement

    flag = False
    model.train()
    for epoch_id in trange(cf.epoch,desc="Epoch"):
        for step,batch in enumerate(tqdm(train_dataloader,"batch",total=len(train_dataloader))):
            
            label_id = batch['label_id'].squeeze(1).to(device) 
            segment_ids = batch['segment_ids'].to(device) 

            loss = model(segment_ids,label_id)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1 

            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train,acc_train = model.get_loss_acc(segment_ids,label_id)
                loss_val,acc_val = evaluate(model,test_dataloader,device)
                
                if acc_val > best_acc_val:
                    # save the best result so far
                    best_acc_val = acc_val
                    last_improved = total_batch
                    torch.save(model.state_dict(),"./output/model.bin")
                    improved_str = "*"
                else:
                    improved_str = ""
                
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))
                
                model.train()

            if total_batch - last_improved > require_improvement:
                print("长时间未优化")
                flag = True
                break
        if flag:
            break
Example 13
                       label_df.comment_all.apply(lambda x: ' '.join(jieba.cut(cut_sub(x).lower())))

text_train = pd.DataFrame(train_df)
text_test = pd.DataFrame(test_df)

train_data, test_data, label_data, W = load_weight(args, text_train, text_test,
                                                   label_df)

# Build the test dataset (without images)
test_data = DatasetWithoutImg(label_data, mode='test_images')
test_loader = DataLoader(dataset=test_data,
                         batch_size=args.batch_size,
                         shuffle=False)

# test
model = TextCNN(args, W)  # instantiate the network; eval() must be called on an instance
model.eval()
test_sub = np.zeros((len(label_df['id']), 3), dtype=float)
batch = len(label_df['id']) // args.batch_size

for i, (test_data, event_labels) in enumerate(test_loader):
    test_text, test_mask = to_var(test_data[0]), to_var(test_data[1])

    test_text = test_text.long()
    test_mask = test_mask.float()
    test_outputs, domain_outputs = model(test_text, test_mask)
    if i != batch:
        test_sub[i * args.batch_size:(i + 1) *
                 args.batch_size, :] = to_np(test_outputs)
    else:
        test_sub[i *
                 args.batch_size:len(test_df['id']), :] = to_np(test_outputs)
Example 14
class Trainer:
    def __init__(self, config):
        self.config = config
        self.train_data_loader = None
        self.eval_data_loader = None

        # Load the datasets
        self.load_data()
        self.train_inputs, self.train_labels, label_to_idx = self.train_data_loader.gen_data(
        )
        self.vocab_size = self.train_data_loader.vocab_size
        self.word_vectors = self.train_data_loader.word_vectors
        print(f"train data size: {len(self.train_labels)}")
        print(f"vocab size: {self.vocab_size}")
        self.label_list = [value for key, value in label_to_idx.items()]

        self.eval_inputs, self.eval_labels = self.eval_data_loader.gen_data()

        # Initialize the model
        self.model = TextCNN(config=self.config,
                             vocab_size=self.vocab_size,
                             word_vectors=self.word_vectors)

    def load_data(self):
        """Load the training and evaluation datasets."""
        self.train_data_loader = TrainData(self.config)
        self.config.test_data = self.config.eval_data  # use the validation set for in-training testing
        self.eval_data_loader = TestData(self.config)

    def train(self):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9,
                                    allow_growth=True)
        sess_config = tf.ConfigProto(log_device_placement=False,
                                     allow_soft_placement=True,
                                     gpu_options=gpu_options)
        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())  # initialize variables
            current_step = 0

            # Create the train/eval summary paths and writer objects
            train_summary_path = os.path.join(
                self.config.BASE_DIR, self.config.summary_path + "/train")
            if not os.path.exists(train_summary_path):
                os.makedirs(train_summary_path)
            train_summary_writer = tf.summary.FileWriter(
                train_summary_path, sess.graph)
            eval_summary_path = os.path.join(
                self.config.BASE_DIR, self.config.summary_path + "/eval")
            if not os.path.exists(eval_summary_path):
                os.makedirs(eval_summary_path)
            eval_summary_writer = tf.summary.FileWriter(
                eval_summary_path, sess.graph)

            # Train & Eval Process
            for epoch in range(self.config.epochs):
                print(f"----- Epoch {epoch + 1}/{self.config.epochs} -----")
                for batch in self.train_data_loader.next_batch(
                        self.train_inputs, self.train_labels,
                        self.config.batch_size):
                    summary, loss, predictions = self.model.train(
                        sess, batch, self.config.keep_prob)
                    train_summary_writer.add_summary(summary)
                    if self.config.num_classes == 1:
                        acc = get_binary_metrics(pred_y=predictions.tolist(),
                                                 true_y=batch['y'])
                        print("Train step: {}, acc: {:.3f}".format(
                            current_step, acc))
                    elif self.config.num_classes > 1:
                        acc = get_multi_metrics(pred_y=predictions.tolist(),
                                                true_y=batch['y'])
                        print("Train step: {}, acc: {:.3f}".format(
                            current_step, acc))

                    current_step += 1

                    if self.eval_data_loader and current_step % self.config.ckeckpoint_every == 0:
                        eval_losses = []
                        eval_accs = []
                        for eval_batch in self.eval_data_loader.next_batch(
                                self.eval_inputs, self.eval_labels,
                                self.config.batch_size):
                            eval_summary, eval_loss, eval_predictions = self.model.eval(
                                sess, eval_batch)
                            eval_summary_writer.add_summary(eval_summary)
                            eval_losses.append(eval_loss)
                            if self.config.num_classes == 1:
                                acc = get_binary_metrics(
                                    pred_y=eval_predictions.tolist(),
                                    true_y=eval_batch['y'])
                                eval_accs.append(acc)
                            elif self.config.num_classes > 1:
                                acc = get_multi_metrics(
                                    pred_y=eval_predictions.tolist(),
                                    true_y=eval_batch['y'])
                                eval_accs.append(acc)
                        print(
                            f"Eval \tloss: {list_mean(eval_losses)}, acc: {list_mean(eval_accs)}"
                        )

                        if self.config.ckpt_model_path:
                            save_path = os.path.join(
                                self.config.BASE_DIR,
                                self.config.ckpt_model_path)
                            if not os.path.exists(save_path):
                                os.makedirs(save_path)
                            model_save_path = os.path.join(
                                save_path, self.config.model_name)
                            self.model.saver.save(sess,
                                                  model_save_path,
                                                  global_step=current_step)
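
A hypothetical entry point for the Trainer above; the Config constructor and path are assumptions mirroring the other examples in this listing:

if __name__ == "__main__":
    config = Config("./config.yaml")  # assumed config loader, as in Examples 4 and 12
    trainer = Trainer(config)
    trainer.train()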