Esempio n. 1
0
def predict():
    """Predict a label for every row of the test set and write ../data/submission.csv.

    Rows whose text is missing get a default label of 0; every other row
    gets the model's argmax class mapped through ``config.id2label``.
    """
    model = Model().to(device)
    model.load_state_dict(torch.load(config.model_save_path))
    test_df = pd.read_csv(config.test_path)
    # Keep the full text column (NaNs included) so predictions can be
    # realigned onto the original row order below.
    texts = test_df['微博中文内容'].tolist()
    # Fix: drop only rows whose *text* is NaN. The original
    # dropna(axis=0) removed rows with NaN in any column, which could
    # produce fewer predictions than non-null texts and desynchronise
    # the realignment loop below.
    test_df.dropna(subset=['微博中文内容'], inplace=True)
    test_dataset = MyDataset(test_df, 'test')
    test_loader = DataLoader(test_dataset,
                             batch_size=config.batch_size,
                             shuffle=False)
    model.eval()
    tmp = []
    preds_list = []
    with torch.no_grad():
        for batch_x, _ in tqdm(test_loader):
            batch_x = batch_x.to(device)
            probs = model(batch_x)
            # Class index with the highest score for each sample.
            _, preds = torch.max(probs, 1)
            tmp += [p.item() for p in preds]
    # Walk the original rows; null texts were never fed to the model,
    # so they consume no entry from `tmp`.
    cnt = 0
    for text in texts:
        if pd.isnull(text):
            # Missing text: fall back to label 0.
            preds_list.append(0)
        else:
            preds_list.append(config.id2label[tmp[cnt]])
            cnt += 1
    submission = pd.read_csv(config.sample_submission_path)
    submission['y'] = preds_list
    submission.to_csv('../data/submission.csv', index=False)
Esempio n. 2
0
def predict():
    """Predict fake/real/ncw class probabilities for the test set and
    write them to submission.csv, one column group per comment variant
    ('0c', '2c', 'all').
    """
    comment_dict = {0: '0c', 1: '2c', 2: 'all'}
    fake_prob_label = defaultdict(list)
    real_prob_label = defaultdict(list)
    ncw_prob_label = defaultdict(list)
    test_df = pd.read_csv(config.test_path)
    test_df.fillna({'content': ''}, inplace=True)
    test_dataset = MyDataset(test_df, 'test')
    test_loader = DataLoader(test_dataset,
                             batch_size=config.batch_size,
                             shuffle=False)
    for i in range(3):
        model = Model().to(device)
        # Fix: the original computed the checkpoint path with
        # os.path.join(...) but discarded the result and never called
        # load_state_dict, so inference ran on randomly initialised
        # weights.  Load the checkpoint explicitly.
        # NOTE(review): the same file is loaded on all three passes;
        # per-variant files (e.g. '{}_task{}.bin') may be intended —
        # confirm against training code.
        weights_path = os.path.join(config.model_path,
                                    '{}.bin'.format(model_name))
        model.load_state_dict(torch.load(weights_path))
        model.eval()
        with torch.no_grad():
            for batch_x, _ in tqdm(test_loader):
                batch_x = batch_x.to(device)
                logits = model(batch_x)
                # Convert raw logits to per-class probabilities.
                probs = torch.softmax(logits, 1)
                fake_prob_label[i] += [p[0].item() for p in probs]
                real_prob_label[i] += [p[1].item() for p in probs]
                ncw_prob_label[i] += [p[2].item() for p in probs]
    submission = pd.read_csv(config.sample_submission_path)
    for i in range(3):
        submission['fake_prob_label_{}'.format(
            comment_dict[i])] = fake_prob_label[i]
        submission['real_prob_label_{}'.format(
            comment_dict[i])] = real_prob_label[i]
        submission['ncw_prob_label_{}'.format(
            comment_dict[i])] = ncw_prob_label[i]
    submission.to_csv('submission.csv', index=False)
Esempio n. 3
0
# Training starts here.
# ==================================================
# with tf.Graph().as_default():
#     session_conf = tf.ConfigProto(
#       allow_soft_placement=config.allow_soft_placement,
#       log_device_placement=config.log_device_placement)
config = tf.ConfigProto()
config.gpu_options.allow_growth=True   # Do not grab all GPU memory up front; allocate on demand.
with tf.Session(config=config) as sess:
    # Build the model graph.
    # NOTE(review): original comment said "import the conv/pooling
    # network", but the kwargs below (size_layer/num_layers) suggest an
    # RNN-style Model — confirm against the Model definition.
    model = Model(
        # rnn_size=config_parm.rnn_size,
        # num_layers=config_parm.num_layers,
        seq_len=config_parm.seq_len,
        embedding_size=config_parm.embedding_size,
        num_classes=config_parm.num_classes,
        learning_rate=config_parm.learning_rate,
        vocab_size=len(dictionary),
        size_layer = config_parm.size_layer,
        num_layers = config_parm.num_layers,

        )
    # Checkpoints live under runs/<model_dir> relative to the CWD.
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", config_parm.model_dir))

    ckpt = tf.train.get_checkpoint_state(out_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        # Resume from the latest checkpoint if one exists.
        print('Reloading model parameters..')
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        # Initialize all variables
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())  # tf.metrics.accuracy creates two local variables.
Esempio n. 4
0
def train(train_data, val_data, fold_idx=None):
    """Train a Model on train_data, validate each epoch on val_data.

    Saves the best-scoring weights (by validation score) to a path that
    encodes the model name, task and optional fold index.  Adjusts the
    learning rate after ``patience_epoch`` epochs without improvement,
    and stops entirely after ``adjust_lr_num`` such adjustments.

    Args:
        train_data: data accepted by ``MyDataset(..., 'train', task)``.
        val_data: data accepted by ``MyDataset(..., 'val', task)``.
        fold_idx: cross-validation fold index, or None for a single run.
            When not None, the best score is recorded in ``model_score``.
    """
    train_dataset = MyDataset(train_data, 'train', task)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True)
    val_dataset = MyDataset(val_data, 'val', task)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False)

    model = Model().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_config.learning_rate)
    # gamma=0.1: each scheduler.step() divides the learning rate by 10.
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)

    if fold_idx is None:
        print('start')
        model_save_path = os.path.join(
            config.model_path, '{}_task{}.bin'.format(model_name, task))
    else:
        print('start fold: {}'.format(fold_idx + 1))
        model_save_path = os.path.join(
            config.model_path,
            '{}_task{}_fold{}.bin'.format(model_name, task, fold_idx))

    best_val_score = 0
    last_improved_epoch = 0
    adjust_lr_num = 0
    for cur_epoch in range(config.epochs_num):
        start_time = int(time.time())
        model.train()
        print('epoch:{}, step:{}'.format(cur_epoch + 1, len(train_loader)))
        cur_step = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            logits = model(batch_x)
            # probs = nn.Softmax(logits)
            train_loss = criterion(logits, batch_y)
            train_loss.backward()
            optimizer.step()

            cur_step += 1
            if cur_step % config.train_print_step == 0:
                # NOTE(review): get_score is fed the loss value here, not
                # predictions vs. labels — confirm that is intended.
                msg = 'the current step: {0}/{1}, train loss: {2:>5.2}, train score: {3:>6.2%}'
                print(
                    msg.format(cur_step, len(train_loader), train_loss.item(),
                               get_score(train_loss.item())))
        val_loss, val_score = evaluate(model, val_loader, criterion)
        # Checkpoint whenever validation score matches or beats the best so far.
        if val_score >= best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), model_save_path)
            improved_str = '*'
            last_improved_epoch = cur_epoch
        else:
            improved_str = ''
        msg = 'the current epoch: {0}/{1}, val loss: {2:>5.2}, val score: {3:>6.2%}, cost: {4}s {5}'
        end_time = int(time.time())
        print(
            msg.format(cur_epoch + 1, config.epochs_num, val_loss, val_score,
                       end_time - start_time, improved_str))
        if cur_epoch - last_improved_epoch >= model_config.patience_epoch:
            if adjust_lr_num >= model_config.adjust_lr_num:
                print("No optimization for a long time, auto stopping...")
                break
            print("No optimization for a long time, adjust lr...")
            scheduler.step()
            last_improved_epoch = cur_epoch  # Reset the counter, otherwise the lr would be lowered on every following epoch.
            adjust_lr_num += 1
    del model
    gc.collect()

    if fold_idx is not None:
        model_score[fold_idx] = best_val_score
Esempio n. 5
0
# NOTE(review): this assignment shadows any module named `config`, so the
# `config.allow_soft_placement` / `config.log_device_placement` reads
# below return the ConfigProto field defaults (False), not project
# settings — confirm intent.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # Do not grab all GPU memory up front; allocate on demand.
tf.reset_default_graph()
graph_1 = tf.Graph()
with graph_1.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=config.allow_soft_placement,
        log_device_placement=config.log_device_placement)
    # Fix: allow_growth was set on `config`, which is never passed to
    # tf.Session (the session uses session_conf), so the flag had no
    # effect.  Apply it to the proto the session actually receives.
    session_conf.gpu_options.allow_growth = True

    with tf.Session(config=session_conf) as sess:
        model = Model(
            seq_len=config_parm.seq_len,
            embedding_size=config_parm.embedding_size,
            num_classes=config_parm.num_classes,
            learning_rate=config_parm.learning_rate,
            vocab_size=len(word2id),
            size_layer=config_parm.size_layer,
            num_layers=config_parm.num_layers,
        )
        # Restore trained weights; prediction requires a checkpoint.
        ckpt = tf.train.get_checkpoint_state("runs/{}".format(
            config_parm.model_dir))
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(config_parm.model_dir))

        # Generate batches for one epoch
        batches = data_helpers.batch_iter(list(
Esempio n. 6
0
from utils.predict import predict
from flask import Flask, request, jsonify
import os
from flask_cors import CORS, cross_origin
from model.bert import Model
app = Flask(__name__)
# Enable cross-origin requests so browser clients on other origins can call the API.
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

# Load the German DistilBERT model once at startup so all requests reuse it.
model = Model(model_name='distilbert-base-german-cased')


@app.route('/predict', methods=['POST'])
@cross_origin()
def get_prediciton():
    """Return the prediction for the 'text' field of the posted JSON body."""
    payload = request.get_json()
    prediction = predict(payload['text'])
    return jsonify({"text": prediction})


@app.route('/predict/word', methods=['POST'])
@cross_origin()
def get_prediciton_word():
    """Return the model's next-word prediction for the posted JSON 'text'."""
    payload = request.get_json()
    return jsonify(model.predict_next_word(payload['text']))


if __name__ == '__main__':
    # Listen on all interfaces; port comes from $PORT (default 8080), debug mode on.
    app.run(debug=True, host='0.0.0.0', port=int(os.environ.get('PORT', 8080)))