# Example #1
def main(arch):
    """Load the test set, restore the best checkpoint for `arch`, and write predictions."""
    logger = init_logger(log_name=arch, log_dir=config['log_dir'])
    logger.info("seed is %d" % args['seed'])
    seed_everything(seed=args['seed'])

    checkpoint_path = os.path.join(
        config['checkpoint_dir'].format(arch=arch),
        config['best_model_name'].format(arch=arch))

    # Use the first configured GPU when any are listed, otherwise the CPU.
    gpus = config['n_gpus']
    device = 'cuda:%d' % gpus[0] if gpus else 'cpu'

    # Load the test dataset from disk.
    logger.info('starting load test data from disk')
    transformer = DataTransformer(vocab_path=config['vocab_path'],
                                  test_file=config['test_file_path'],
                                  logger=logger,
                                  skip_header=False,
                                  is_train_mode=False,
                                  seed=args['seed'])
    transformer.build_vocab()
    transformer.sentence2id(raw_data_path=config['raw_test_path'],
                            x_var=config['x_var'],
                            y_var=config['y_var'])
    embedding_weight = transformer.build_embedding_matrix(
        embedding_path=config['embedding_weight_path'])

    loader = DataLoader(logger=logger,
                        is_train_mode=False,
                        x_var=config['x_var'],
                        y_var=config['y_var'],
                        skip_header=False,
                        data_path=config['test_file_path'],
                        batch_size=args['batch_size'],
                        max_sentence_length=config['max_length'],
                        device=device)
    test_iter = loader.make_iter()

    # Build the model; architecture-specific settings come from the config.
    logger.info("initializing model")
    net = Model(num_classes=config['num_classes'],
                embedding_dim=config['embedding_dim'],
                model_config=config['models'][arch],
                embedding_weight=embedding_weight,
                vocab_size=len(transformer.vocab),
                device=device)

    # Run prediction with the restored checkpoint and persist the results.
    logger.info('predicting model....')
    predictor = Predicter(model=net,
                          logger=logger,
                          n_gpu=config['n_gpus'],
                          test_data=test_iter,
                          checkpoint_path=checkpoint_path,
                          label_to_id=config['label_to_id'])
    predictions = predictor.predict()
    test_write(data=predictions,
               filename=config['result_path'],
               raw_text_path=config['raw_test_path'])

    # Release cached GPU memory.
    if len(config['n_gpus']) > 0:
        torch.cuda.empty_cache()
# Example #2
def main(arch):
    """Load the test set, build the model named by `arch`, and write its predictions.

    Supported architectures: cnn, cnn_crf, bilstm, bilstm_crf, lattice_lstm,
    bert_lstm.

    Raises:
        ValueError: if `arch` does not name a supported architecture.
    """
    logger = init_logger(log_name=arch, log_dir=config['log_dir'])
    logger.info("seed is %d" % args['seed'])
    seed_everything(seed=args['seed'])
    checkpoint_path = os.path.join(config['checkpoint_dir'].format(arch=arch),
                                   config['best_model_name'].format(arch=arch))
    # First configured GPU if any, otherwise CPU.
    device = 'cuda:%d' % config['n_gpus'][0] if len(
        config['n_gpus']) else 'cpu'

    logger.info('starting load test data from disk')
    # BERT-style architectures use the tokenizer's default special tokens.
    default_tk = 'bert' in arch
    data_transformer = DataTransformer(vocab_path=config['vocab_path'],
                                       rev_vocab_path=config['rev_vocab_path'],
                                       all_data_path=config['all_data_path'],
                                       test_file=config['test_file_path'],
                                       logger=logger,
                                       skip_header=False,
                                       is_train_mode=False,
                                       default_token=default_tk,
                                       seed=args['seed'])
    data_transformer.build_vocab()
    data_transformer.sentence2id(raw_data_path=config['raw_test_path'],
                                 x_var=config['x_var'],
                                 y_var=config['y_var'])
    embedding_weight, words_embedding, gaz_tree = data_transformer.build_embedding_matrix(
        embedding_path=config['embedding_weight_path'],
        dict_path=config['embedding_dict_path'])
    # Lattice models are fed one sentence at a time.
    bs = 1 if 'lattice' in arch else args['batch_size']
    test_loader = DataLoader(logger=logger,
                             is_train_mode=False,
                             vocab=data_transformer.vocab,
                             rev_vocab=data_transformer.rev_vocab,
                             x_var=config['x_var'],
                             y_var=config['y_var'],
                             skip_header=False,
                             data_path=config['test_file_path'],
                             batch_size=bs,
                             max_sentence_length=config['max_length'],
                             gaz=gaz_tree,
                             default_token=default_tk,
                             device=device)
    test_iter = test_loader.make_iter()

    logger.info("initializing model")
    if arch in ('cnn', 'cnn_crf'):
        model = CNN(num_classes=config['num_classes'],
                    embedding_dim=config['embedding_dim'],
                    model_config=config['models'][arch],
                    embedding_weight=embedding_weight,
                    vocab_size=len(data_transformer.vocab),
                    device=device)
    elif arch in ('bilstm', 'bilstm_crf'):
        model = BiLSTM(num_classes=config['num_classes'],
                       embedding_dim=config['embedding_dim'],
                       model_config=config['models'][arch],
                       embedding_weight=embedding_weight,
                       vocab_size=len(data_transformer.vocab),
                       device=device)
    elif arch == 'lattice_lstm':
        model = Lattice(num_classes=config['num_classes'],
                        embedding_dim=config['embedding_dim'],
                        model_config=config['models'][arch],
                        embedding_weight=embedding_weight,
                        vocab_size=len(data_transformer.vocab),
                        dict_size=len(data_transformer.word_vocab),
                        pretrain_dict_embedding=words_embedding,
                        device=device)
    elif arch == 'bert_lstm':
        model = BERT_LSTM(num_classes=config['num_classes'],
                          model_config=config['models'][arch],
                          device=device)
    else:
        # Fail fast with a clear message instead of a NameError on `model` below.
        raise ValueError("unsupported arch: %s" % arch)

    logger.info('predicting model....')
    predicter = Predicter(model=model,
                          model_name=arch,
                          logger=logger,
                          n_gpu=config['n_gpus'],
                          test_data=test_iter,
                          checkpoint_path=checkpoint_path,
                          label_to_id=config['label_to_id'])

    predictions = predicter.predict()
    test_write(data=predictions,
               filename=config['result_path'],
               raw_text_path=config['raw_test_path'])

    # Release cached GPU memory.
    if len(config['n_gpus']) > 0:
        torch.cuda.empty_cache()
def main(arch):
    """Evaluate `arch` on the test set, dump CoNLL-format predictions, and score them.

    Writes `test_predict.out` ("token gold pred" per line, blank line between
    sentences), scores it with conlleval.py, and optionally writes the raw
    prediction file via test_write().
    """
    logger = init_logger(log_name=arch, log_dir=config['log_dir'])
    logger.info("seed is %d" % args['seed'])
    seed_everything(seed=args['seed'])
    checkpoint_path = os.path.join(config['checkpoint_dir'].format(arch=arch),
                                   config['best_model_name'].format(arch=arch))
    # First configured GPU if any, otherwise CPU.
    device = 'cuda:%d' % config['n_gpus'][0] if len(config['n_gpus']) else 'cpu'

    # Load the test dataset from disk.
    logger.info('starting load test data from disk')
    data_transformer = DataTransformer(vocab_path=config['vocab_path'],
                                       test_file=config['test_file_path'],
                                       label_to_id=config['label_to_id'],
                                       logger=logger,
                                       skip_header=False,
                                       is_train_mode=False,
                                       seed=args['seed'])
    data_transformer.build_vocab()
    data_transformer.sentence2id(raw_data_path=config['raw_test_path'],
                                 raw_target_path=config['target_test_path'],
                                 x_var=config['x_var'],
                                 y_var=config['y_var'])
    embedding_weight = data_transformer.build_embedding_matrix(
        embedding_path=config['embedding_weight_path'])
    test_loader = DataLoader(logger=logger,
                             is_train_mode=False,
                             x_var=config['x_var'],
                             y_var=config['y_var'],
                             skip_header=False,
                             data_path=config['test_file_path'],
                             batch_size=args['batch_size'],
                             max_sentence_length=config['max_length'],
                             device=device)
    test_iter = test_loader.make_iter()

    # Build the model and its predictor.
    logger.info("initializing model")
    bilstm = Model(num_classes=config['num_classes'],
                   embedding_dim=config['embedding_dim'],
                   model_config=config['models'][arch],
                   embedding_weight=embedding_weight,
                   vocab_size=len(data_transformer.vocab),
                   device=device)
    logger.info('predicting model....')
    predicter = Predicter(model=bilstm,
                          logger=logger,
                          n_gpu=config['n_gpus'],
                          test_data=test_iter,
                          checkpoint_path=checkpoint_path,
                          label_to_id=config['label_to_id'],
                          evaluate=F1_score(num_classes=config['num_classes']),
                          total_evaluate=Classification_Report(num_classes=config['num_classes']),
                          bioes=config['bioes'],
                          sep=config['sep'],
                          i2v=data_transformer.i2v,
                          i2l=data_transformer.id_to_label)

    # Run prediction and report micro-averaged scores.
    results, text_epoch, tags, predictions, f1 = predicter.predict()
    print('-------Test Micro score------')
    print('Loss: {}, Token_Acc: {}, Token_F1: {}, F1: {}'.format(results['test_loss'], results['test_acc'],
                                                                 results['test_f1'], f1))

    # Dump token/gold/pred triples in CoNLL format. conlleval expects '-' as
    # the tag separator, so '_'-separated tags are rewritten on the fly.
    sep = config['sep']  # hoisted: loop-invariant
    # Explicit UTF-8: tokens may be non-ASCII and the platform default
    # encoding (e.g. cp1252 on Windows) would raise or corrupt them.
    with open('test_predict.out', 'w', encoding='utf-8') as f:
        for sentence, tag, pred in zip(text_epoch, tags, predictions):
            for token, t, p in zip(sentence, tag, pred):
                if sep == '_':
                    t = t.replace(sep, '-')
                    p = p.replace(sep, '-')
                f.write(' '.join([token, t, p]) + '\n')
            f.write('\n')
    # NOTE(review): fixed command string, so os.system is safe here; a
    # subprocess.run([...]) call would surface failures but changes behavior.
    os.system('python conlleval.py test_predict.out')
    if args['prediction']:
        test_write(data=predictions, filename=config['result_path'], raw_text_path=config['raw_test_path'])

    # Release cached GPU memory.
    if len(config['n_gpus']) > 0:
        torch.cuda.empty_cache()
# Example #4
def main(arch):
    """Train the model named by `arch` on the configured dataset.

    Supported architectures: cnn, cnn_crf, bilstm, bilstm_crf, lattice_lstm,
    bert_lstm.

    Raises:
        ValueError: if `arch` does not name a supported architecture.
    """
    logger = init_logger(log_name=arch, log_dir=config['log_dir'])
    logger.info("seed is %d" % args['seed'])
    seed_everything(seed=args['seed'])
    # First configured GPU if any, otherwise CPU.
    device = 'cuda:%d' % config['n_gpus'][0] if len(
        config['n_gpus']) else 'cpu'

    logger.info('starting load train data from disk')
    # BERT-style architectures use the tokenizer's default special tokens.
    default_tk = 'bert' in arch

    data_transformer = DataTransformer(logger=logger,
                                       is_train_mode=True,
                                       all_data_path=config['all_data_path'],
                                       vocab_path=config['vocab_path'],
                                       rev_vocab_path=config['rev_vocab_path'],
                                       max_features=config['max_features'],
                                       label_to_id=config['label_to_id'],
                                       train_file=config['train_file_path'],
                                       valid_file=config['valid_file_path'],
                                       valid_size=config['valid_size'],
                                       min_freq=config['min_freq'],
                                       seed=args['seed'],
                                       default_token=default_tk)

    data_transformer.build_vocab()

    data_transformer.sentence2id(raw_data_path=config['raw_train_path'],
                                 raw_target_path=config['raw_target_path'],
                                 x_var=config['x_var'],
                                 y_var=config['y_var'])

    char_embedding_weight, words_embedding, gaz_tree = data_transformer.build_embedding_matrix(
        embedding_path=config['embedding_weight_path'],
        dict_path=config['embedding_dict_path'])
    # glove_embedding_weight = data_transformer.build_embedding_matrix(embedding_path = config['glove_weight_path'])
    # embedding_weight = np.concatenate((word2vec_embedding_weight,glove_embedding_weight),axis=1)
    embedding_weight = char_embedding_weight
    # Lattice models are fed one sentence at a time.
    bs = 1 if 'lattice' in arch else config['batch_size']

    train_loader = DataLoader(logger=logger,
                              vocab=data_transformer.vocab,
                              rev_vocab=data_transformer.rev_vocab,
                              is_train_mode=True,
                              x_var=config['x_var'],
                              y_var=config['y_var'],
                              skip_header=False,
                              data_path=config['train_file_path'],
                              batch_size=bs,
                              max_sentence_length=config['max_length'],
                              gaz=gaz_tree,
                              default_token=default_tk,
                              device=device)

    val_loader = DataLoader(logger=logger,
                            vocab=data_transformer.vocab,
                            rev_vocab=data_transformer.rev_vocab,
                            is_train_mode=True,
                            x_var=config['x_var'],
                            y_var=config['y_var'],
                            skip_header=False,
                            data_path=config['valid_file_path'],
                            batch_size=bs,
                            max_sentence_length=config['max_length'],
                            gaz=gaz_tree,
                            device=device)

    train_iter = train_loader.make_iter()
    val_iter = val_loader.make_iter()

    logger.info("initializing model")
    if arch in ('cnn', 'cnn_crf'):
        model = CNN(num_classes=config['num_classes'],
                    embedding_dim=config['embedding_dim'],
                    model_config=config['models'][arch],
                    embedding_weight=embedding_weight,
                    vocab_size=len(data_transformer.vocab),
                    device=device)
    elif arch in ('bilstm', 'bilstm_crf'):
        model = BiLSTM(num_classes=config['num_classes'],
                       embedding_dim=config['embedding_dim'],
                       model_config=config['models'][arch],
                       embedding_weight=embedding_weight,
                       vocab_size=len(data_transformer.vocab),
                       device=device)
    elif arch == 'lattice_lstm':
        model = Lattice(num_classes=config['num_classes'],
                        embedding_dim=config['embedding_dim'],
                        model_config=config['models'][arch],
                        embedding_weight=embedding_weight,
                        vocab_size=len(data_transformer.vocab),
                        dict_size=len(data_transformer.word_vocab),
                        pretrain_dict_embedding=words_embedding,
                        device=device)
    elif arch == 'bert_lstm':
        model = BERT_LSTM(num_classes=config['num_classes'],
                          model_config=config['models'][arch],
                          device=device)
    else:
        # Fail fast with a clear message instead of a NameError on `model` below.
        raise ValueError("unsupported arch: %s" % arch)
    optimizer = optim.Adam(params=model.parameters(),
                           lr=config['learning_rate'],
                           weight_decay=config['weight_decay'])

    logger.info("initializing callbacks")

    # Checkpointing: keeps the best (and optionally per-epoch) weights.
    model_checkpoint = ModelCheckpoint(
        checkpoint_dir=config['checkpoint_dir'],
        mode=config['mode'],
        monitor=config['monitor'],
        save_best_only=config['save_best_only'],
        best_model_name=config['best_model_name'],
        epoch_model_name=config['epoch_model_name'],
        arch=arch,
        logger=logger)

    # Records training curves to figures/JSON logs.
    train_monitor = TrainingMonitor(fig_dir=config['figure_dir'],
                                    json_dir=config['log_dir'],
                                    arch=arch)

    # Halve the LR when the monitored metric plateaus.
    lr_scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                     factor=0.5,
                                     patience=config['lr_patience'],
                                     min_lr=1e-9,
                                     epsilon=1e-5,
                                     verbose=1,
                                     mode=config['mode'])

    logger.info('training model....')
    trainer = Trainer(model=model,
                      model_name=arch,
                      train_data=train_iter,
                      val_data=val_iter,
                      optimizer=optimizer,
                      epochs=config['epochs'],
                      label_to_id=config['label_to_id'],
                      evaluate=F1_score(num_classes=config['num_classes']),
                      logger=logger,
                      model_checkpoint=model_checkpoint,
                      training_monitor=train_monitor,
                      resume=args['resume'],
                      lr_scheduler=lr_scheduler,
                      n_gpu=config['n_gpus'],
                      avg_batch_loss=True)

    trainer.summary()

    trainer.train()

    # Release cached GPU memory.
    if len(config['n_gpus']) > 0:
        torch.cuda.empty_cache()
def main(arch):
    """Train the configured Model for `arch` with checkpointing and early stopping.

    Raises:
        ValueError: if `config['optimizer']` is neither 'adam' nor 'sgd'.
    """
    logger = init_logger(log_name=arch, log_dir=config['log_dir'])
    logger.info("seed is %d" % args['seed'])
    seed_everything(seed=args['seed'])
    # First configured GPU if any, otherwise CPU.
    device = 'cuda:%d' % config['n_gpus'][0] if len(
        config['n_gpus']) else 'cpu'
    # Load the dataset.
    logger.info('starting load train data from disk')

    # **************************** data generation ***********************
    data_transformer = DataTransformer(logger=logger,
                                       is_train_mode=True,
                                       all_data_path=config['all_data_path'],
                                       vocab_path=config['vocab_path'],
                                       max_features=config['max_features'],
                                       label_to_id=config['label_to_id'],
                                       train_file=config['train_file_path'],
                                       valid_file=config['valid_file_path'],
                                       valid_size=config['valid_size'],
                                       min_freq=config['min_freq'],
                                       seed=args['seed'])
    # Build the vocabulary.
    data_transformer.build_vocab()
    # Convert sentences to id sequences.
    data_transformer.sentence2id(
        raw_data_path=config['raw_train_path'],
        raw_target_path=config['raw_target_path'],
        raw_val_path=config['raw_val_path'],
        raw_val_target_path=config['raw_val_target_path'],
        x_var=config['x_var'],
        y_var=config['y_var'])
    # Build the pretrained embedding matrix.
    word2vec_embedding_weight = data_transformer.build_embedding_matrix(
        embedding_path=config['embedding_weight_path'])
    # glove_embedding_weight = data_transformer.build_embedding_matrix(embedding_path = config['glove_weight_path'])
    # embedding_weight = np.concatenate((word2vec_embedding_weight,glove_embedding_weight),axis=1)
    embedding_weight = word2vec_embedding_weight
    # Training data loader.
    train_loader = DataLoader(logger=logger,
                              is_train_mode=True,
                              x_var=config['x_var'],
                              y_var=config['y_var'],
                              skip_header=False,
                              data_path=config['train_file_path'],
                              batch_size=config['batch_size'],
                              max_sentence_length=config['max_length'],
                              device=device)
    # Validation data loader.
    val_loader = DataLoader(logger=logger,
                            is_train_mode=True,
                            x_var=config['x_var'],
                            y_var=config['y_var'],
                            skip_header=False,
                            data_path=config['valid_file_path'],
                            batch_size=config['batch_size'],
                            max_sentence_length=config['max_length'],
                            device=device)
    # Build the batch iterators.
    train_iter = train_loader.make_iter()
    val_iter = val_loader.make_iter()

    # **************************** model & optimizer ***********************
    logger.info("initializing model")
    model = Model(num_classes=config['num_classes'],
                  embedding_dim=config['embedding_dim'],
                  model_config=config['models'][arch],
                  embedding_weight=embedding_weight,
                  vocab_size=len(data_transformer.vocab),
                  device=device)
    if config['optimizer'] == 'adam':
        optimizer = optim.Adam(params=model.parameters(),
                               lr=config['learning_rate'],
                               weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'sgd':
        optimizer = optim.SGD(params=model.parameters(),
                              lr=config['learning_rate'],
                              weight_decay=config['weight_decay'],
                              momentum=config['momentum'])
    else:
        # Fail fast instead of hitting a NameError on `optimizer` below.
        raise ValueError("unsupported optimizer: %s" % config['optimizer'])

    # **************************** callbacks ***********************
    logger.info("initializing callbacks")
    # Checkpointing: keeps the best (and optionally per-epoch) weights.
    model_checkpoint = ModelCheckpoint(
        checkpoint_dir=config['checkpoint_dir'],
        mode=config['mode'],
        monitor=config['monitor'],
        save_best_only=config['save_best_only'],
        best_model_name=config['best_model_name'],
        epoch_model_name=config['epoch_model_name'],
        arch=arch,
        logger=logger)
    # Records training curves to figures/JSON logs.
    train_monitor = TrainingMonitor(fig_dir=config['figure_dir'],
                                    json_dir=config['log_dir'],
                                    arch=arch)
    # LR schedule: gently decay on plateau of the monitored metric.
    lr_scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                     factor=0.995,
                                     patience=config['lr_patience'],
                                     min_lr=0.00005,
                                     epsilon=1e-8,
                                     verbose=1,
                                     mode=config['mode'])
    early_stopping = EarlyStopping(min_delta=0.0,
                                   patience=config['early_patience'],
                                   mode=config['mode'],
                                   monitor=config['monitor'],
                                   logger=logger)
    # **************************** training model ***********************
    logger.info('training model....')
    trainer = Trainer(model=model,
                      train_data=train_iter,
                      val_data=val_iter,
                      optimizer=optimizer,
                      epochs=config['epochs'],
                      label_to_id=config['label_to_id'],
                      evaluate=F1_score(num_classes=config['num_classes']),
                      logger=logger,
                      model_checkpoint=model_checkpoint,
                      training_monitor=train_monitor,
                      resume=args['resume'],
                      lr_scheduler=lr_scheduler,
                      n_gpu=config['n_gpus'],
                      avg_batch_loss=True,
                      bioes=True,
                      early_stopping=early_stopping,
                      sep=config['sep'],
                      train_loader=train_loader)
    # Print the model structure.
    trainer.summary()
    # Fit the model.
    trainer.train()
    # Release cached GPU memory.
    if len(config['n_gpus']) > 0:
        torch.cuda.empty_cache()