def decodeAPI(config_file, dataset_name):
    """
    Author: Thanh Thieu
    Decode tag sequences. Configuration comes from a file.

    :param config_file: Path to a decode config file.
    :param dataset_name: Name of the dataset to be replaced in directory paths.
    :return: Tuple ``(decode_results, pred_scores)`` from the loaded model.
    """
    print('NCRFpp: Sequence decoding for "{}" dataset'.format(dataset_name))
    data = Data()
    data.HP_gpu = torch.cuda.is_available()
    data.read_config(config_file)
    _substituteDatasetName(data, dataset_name)
    # Removed dead local: `status = data.status.lower()` was assigned here
    # but never used anywhere in this function.
    print("Seed num:", seed_num)

    # Loading the .dset file restores the training-time configuration, so the
    # decode config (and the dataset-name substitution) must be re-applied
    # on top of it — hence the second read_config/_substituteDatasetName.
    data.load(data.dset_dir)
    data.read_config(config_file)
    _substituteDatasetName(data, dataset_name)
    print(data.raw_dir)
    data.generate_instance('raw')
    print("nbest: %s" % (data.nbest))
    decode_results, pred_scores = load_model_decode(data, 'raw')
    if data.nbest:
        data.write_nbest_decoded_results(decode_results, pred_scores, 'raw')
    else:
        data.write_decoded_results(decode_results, 'raw')

    return decode_results, pred_scores
Exemple #2
0
def get_ncrf_data_object(model_name):  #, input_path, output_path):
    """Build a Data object preloaded from the named model's .dset file.

    Looks up the model's paths in MODEL_PATHS, restores the saved data
    settings from disk, and configures it for CPU decoding.
    """
    paths = MODEL_PATHS[model_name]
    ncrf_data = Data()
    ncrf_data.dset_dir = paths['dset']
    ncrf_data.load(ncrf_data.dset_dir)
    #ncrf_data.raw_dir = input_path
    #ncrf_data.decode_dir = output_path
    ncrf_data.HP_gpu = False
    ncrf_data.load_model_dir = paths['model']
    ncrf_data.nbest = None
    return ncrf_data
Exemple #3
0
def easyTrain(confdict):
    """Train a model from a configuration dict and return its f1 score."""
    print('Model Train')
    data = Data()
    data.read_config(confdict)
    data.HP_gpu = torch.cuda.is_available()
    data_initialization(data)
    # Build instances for every split before training.
    for split in ('train', 'dev', 'test'):
        data.generate_instance(split)
    data.build_pretrain_emb()
    return train(data)
Exemple #4
0
def dispatch(config=None, status="train", data=None):
    """
    Run NCRF++ in train or decode mode.

    :param config: Config file/dict for ``Data.read_config``; used to build
        ``data`` when none is given, and re-read in the decode path after
        loading the .dset file.
    :param status: Effectively unused (kept for backward compatibility) —
        the mode is always taken from ``data.status``; see NOTE below.
    :param data: Optional pre-configured Data object; built from ``config``
        when None.
    :return: ``train(data)``'s result in train mode, ``(decode_results,
        pred_scores)`` in decode mode, None on an invalid status.
    """
    if data is None:
        data = Data()
        data.HP_gpu = torch.cuda.is_available()
        data.read_config(config)
    else:
        data.HP_gpu = torch.cuda.is_available()

    data.show_data_summary()
    # NOTE(review): the ``status`` parameter is shadowed here, so the mode
    # always comes from the config; honoring the parameter would change
    # existing behavior, so only documenting it.
    status = data.status.lower()
    print("Seed num:", seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        return train(data)
    elif status == 'decode':
        print("MODEL: decode")
        # Loading the .dset restores training-time settings; re-apply the
        # decode config on top of them.
        data.load(data.dset_dir)
        data.read_config(config)
        print(data.raw_dir)
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest and not data.sentence_classification:
            data.write_nbest_decoded_results(decode_results, pred_scores,
                                             'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
        # Fix: previously the decode results were computed and discarded
        # (implicit None return). Return them for consistency with the
        # train branch and with decodeAPI.
        return decode_results, pred_scores
    else:
        print(
            "Invalid argument! Please use valid arguments! (train/test/decode)"
        )
Exemple #5
0
def easyDecode(confdict, verbose=True):
    """
    Decode raw data with a trained model configured by ``confdict``.

    :param confdict: Config file path or dict understood by Data.read_config.
    :param verbose: When True, print the full data summary before decoding.
    :return: Tuple ``(decode_results, pred_scores)``.
    """
    data = Data()
    data.read_config(confdict)
    print('Model Decode')
    try:
        data.load(data.dset_dir)
    except TypeError:
        # dset_dir missing/None: fall back to the exported-model format.
        data.load_export(data.xpt_dir)
    # Re-apply the decode config on top of the settings restored from disk.
    data.read_config(confdict)
    data.HP_gpu = torch.cuda.is_available()
    print('Decoding source: ', data.raw_dir)
    if verbose:
        data.show_data_summary()
    data.generate_instance('raw')
    decode_results, pred_scores = load_model_decode(data, 'raw')
    if data.decode_dir:
        data.write_decoded_results(decode_results, 'raw')
    # Fix: results were previously computed and silently dropped; return
    # them so callers do not have to re-read decode_dir from disk.
    return decode_results, pred_scores
Exemple #6
0
def main():
    """Decode an input file with a trained NCRF++ model (docopt CLI).

    Expects ``<model_dir>`` to contain ``model.dset`` and
    ``model.best.model``; writes decoded results to ``<output_file>``.
    """
    doc = """
    Usage:
    predict <model_dir>  <input_file> <output_file>
    """

    args = docopt(doc)
    config = {}
    config['status'] = 'decode'
    config['raw_dir'] = args['<input_file>']
    config['decode_dir'] = args['<output_file>']
    dset_dir = os.path.join(args['<model_dir>'], 'model.dset')
    load_model_dir = os.path.join(args['<model_dir>'], 'model.best.model')
    config['dset_dir'] = dset_dir
    config['load_model_dir'] = load_model_dir

    data = Data()
    data.read_config(config)

    data.HP_gpu = torch.cuda.is_available()
    # Bug fix: was print("Seed num: %s", seed_num) — printf-style template
    # passed as a plain argument, which printed the literal "%s".
    print(f"Seed num: {seed_num}")

    if data.status == 'decode':
        print("MODEL: decode")
        # Loading the .dset restores training-time settings; re-apply the
        # CLI-built config on top of them.
        data.load(data.dset_dir)
        data.read_config(config)
        print(f"Reading from {data.raw_dir}")
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print(f"nbest: {data.nbest}")
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest:
            data.write_nbest_decoded_results(decode_results, pred_scores,
                                             'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        print("Invalid command")
Exemple #7
0
            "%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
            % (name, time_cost, speed, acc, p, r, f))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f" %
              (name, time_cost, speed, acc))
    return pred_results, pred_scores


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--config', help='Configuration File')

    args = parser.parse_args()
    data = Data()
    data.HP_gpu = torch.cuda.is_available()
    data.read_config(args.config)
    data.show_data_summary()
    status = data.status.lower()
    print("Seed num:", seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        train(data)
    elif status == 'decode':
        print("MODEL: decode")
Exemple #8
0
    print("Seg: ", seg)
    print("Train file:", train_file)
    print("Dev file:", dev_file)
    print("Test file:", test_file)
    print("Char emb:", char_emb)
    print("Bichar emb:", bichar_emb)
    print("Gaz file:", gaz_file)
    if status == 'train':
        print("Model saved to:", save_model_dir)
    # 立即把stdout缓存内容输出
    sys.stdout.flush()

    if status == 'train':
        data = Data()
        data.model_name = model_name
        data.HP_gpu = gpu
        data.use_bichar = conf_dict['use_bichar']
        data.HP_batch_size = conf_dict['HP_batch_size']  # 1
        data.HP_iteration = conf_dict['HP_iteration']  # 100
        data.HP_lr = conf_dict['HP_lr']  # 0.015
        data.HP_lr_decay = conf_dict['HP_lr_decay']  # 0.5
        data.HP_hidden_dim = conf_dict['HP_hidden_dim']
        data.MAX_SENTENCE_LENGTH = conf_dict['MAX_SENTENCE_LENGTH']
        data.HP_lstm_layer = conf_dict['HP_lstm_layer']
        data_initialization(data, gaz_file, train_file, dev_file, test_file)

        if data.model_name in ['CNN_model', 'LSTM_model']:
            data.generate_instance_with_gaz_2(train_file, 'train')
            data.generate_instance_with_gaz_2(dev_file, 'dev')
            data.generate_instance_with_gaz_2(test_file, 'test')
        elif data.model_name in ['WC-LSTM_model']:
    print ("Status:", status)
    print ("Seg: ", seg)
    print ("Train file:", train_file)
    print ("Dev file:", dev_file)
    print ("Test file:", test_file)
    print ("Raw file:", raw_file)
    print ("Char emb:", char_emb)
    print ("Bichar emb:", bichar_emb)
    print ("Gaz file:",gaz_file)
    if status == 'train':
        print ("Model saved to:", save_model_dir)
    sys.stdout.flush()
    
    if status == 'train':
        data = Data()
        data.HP_gpu = gpu
        data.HP_use_char = False
        data.HP_batch_size = 10
        data.use_bigram = False
        data.gaz_dropout = 0.5
        data.norm_gaz_emb = False
        data.HP_fix_gaz_emb = False
        data_initialization(data, gaz_file, train_file, dev_file, test_file)

        data.generate_instance_with_gaz(train_file,'train')
        data.generate_instance_with_gaz(dev_file,'dev')
        data.generate_instance_with_gaz(test_file,'test')

        data.build_word_pretrain_emb(char_emb)
        data.build_biword_pretrain_emb(bichar_emb)
        data.build_gaz_pretrain_emb(gaz_file)
def main():
    """Command-line entry point: train or decode an NCRF++ model.

    Settings come either from ``--config`` or, when it is the literal
    string 'None', from the individual CLI flags. A handful of flags
    (--lr, --batch-size, --cpu, --model-prefix, --output-tsv) override
    whichever source was used.
    """
    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--config', help='Configuration File', default='None')
    parser.add_argument('--wordemb',
                        help='Embedding for words',
                        default='None')
    parser.add_argument('--charemb',
                        help='Embedding for chars',
                        default='None')
    parser.add_argument('--status',
                        choices=['train', 'decode'],
                        help='update algorithm',
                        default='train')
    parser.add_argument('--savemodel',
                        default="data/model/saved_model.lstmcrf.")
    parser.add_argument('--savedset', help='Dir of saved data setting')
    parser.add_argument('--train', default="data/conll03/train.bmes")
    parser.add_argument('--dev', default="data/conll03/dev.bmes")
    parser.add_argument('--test', default="data/conll03/test.bmes")
    parser.add_argument('--seg', default="True")
    parser.add_argument('--random-seed', type=int, default=42)
    parser.add_argument('--lr', type=float)
    parser.add_argument('--batch-size', type=int)
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    parser.add_argument('--output-tsv')
    parser.add_argument('--model-prefix')
    parser.add_argument('--cpu', action='store_true')

    args = parser.parse_args()

    # Set random seed (note: this local shadows any module-level seed_num)
    seed_num = args.random_seed
    random.seed(seed_num)
    torch.manual_seed(seed_num)
    np.random.seed(seed_num)

    data = Data()
    data.random_seed = seed_num
    data.HP_gpu = torch.cuda.is_available()
    if args.config == 'None':
        # No config file: assemble settings from the individual CLI flags.
        data.train_dir = args.train
        data.dev_dir = args.dev
        data.test_dir = args.test
        data.model_dir = args.savemodel
        data.dset_dir = args.savedset
        print("Save dset directory:", data.dset_dir)
        save_model_dir = args.savemodel
        data.word_emb_dir = args.wordemb
        data.char_emb_dir = args.charemb
        if args.seg.lower() == 'true':
            data.seg = True
        else:
            data.seg = False
        print("Seed num:", seed_num)
    else:
        data.read_config(args.config)
    # CLI overrides applied on top of either settings source.
    if args.lr:
        data.HP_lr = args.lr
    if args.batch_size:
        data.HP_batch_size = args.batch_size
    data.output_tsv_path = args.output_tsv
    if args.cpu:
        data.HP_gpu = False
    if args.model_prefix:
        data.model_dir = args.model_prefix

    # data.show_data_summary()
    status = data.status.lower()
    print("Seed num:", seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        train(data)
    elif status == 'decode':
        print("MODEL: decode")
        # Loading the .dset restores training-time settings; re-apply the
        # decode config on top of them.
        data.load(data.dset_dir)
        # Robustness fix: only re-read the config when one was actually
        # given — read_config('None') would try to open a file literally
        # named "None" (the earlier branch already uses this sentinel).
        if args.config != 'None':
            data.read_config(args.config)
        # NOTE(review): CLI overrides (--lr, --cpu, etc.) applied before
        # data.load() are clobbered by it and not re-applied here —
        # pre-existing behavior, preserved.
        print(data.raw_dir)
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest and not data.sentence_classification:
            data.write_nbest_decoded_results(decode_results, pred_scores,
                                             'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        print(
            "Invalid argument! Please use valid arguments! (train/test/decode)"
        )
Exemple #11
0
    args = parser.parse_args()

    decode = Data()
    decode.read_config(args.decode)

    if args.train is not None:
        #TRAIN
        train_data = Data()
        encoding_index = train_data.encoding
        train_data.read_config(args.train)
        train_enc = l.Encoding(train_data.encoding, train_data.postag_type)
        dict_encoded, all_sent, _ = train_enc.encode(train_data.dev_gold)
        processing.write_to_conllu(dict_encoded, train_data.dev_enc_dep2label,
                                   0)
        train_data.HP_gpu = torch.cuda.is_available()
        print("Seed num:", seed_num)
        train_enc = l.Encoding(train_data.encoding, train_data.postag_type)
        dict_encoded, all_sent, _ = train_enc.encode(train_data.train_gold)
        processing.write_to_conllu(dict_encoded,
                                   train_data.train_enc_dep2label, 0)
        data_initialization(train_data)
        train_data.generate_instance('train')
        train_data.generate_instance('dev')
        #train_data.generate_instance('test')
        train_data.build_pretrain_emb()
        train(train_data, decode, args)

    else:
        #DECODE
        decode.HP_gpu = torch.cuda.is_available()
Exemple #12
0
            data.seg = False
        # print("Seed num:",seed_num)
    else:
        data.read_config(args.config)
    # data.show_data_summary()
    status = data.status.lower()

    if args.seed_num != 0:
        random.seed(args.seed_num)
        np.random.seed(args.seed_num)
        torch.manual_seed(args.seed_num)
        torch.cuda.manual_seed_all(args.seed_num)
    print("Seed num:", args.seed_num)

    data.model_dir = args.savemodel
    data.HP_gpu = args.gpu

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        train(data)
    elif status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
        data.read_config(args.config)
        print(data.raw_dir)
        # exit(0)
Exemple #13
0
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f"%(name, time_cost, speed, acc))
    return pred_results, pred_scores




if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--msf', action='store_true', default = False)
    parser.add_argument('--config',  help='Configuration File' )

    args = parser.parse_args()
    data = Data()
    #data.HP_gpu = torch.cuda.is_available()
    data.HP_gpu = False
    data.read_config(args.config)
    status = data.status.lower()
    print("Seed num:",seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        train(data, args.msf)
    elif status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
Exemple #14
0
    random.seed(seed_num)
    torch.manual_seed(seed_num)
    np.random.seed(seed_num)
    torch.cuda.manual_seed(seed_num)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    ## weibo
    train_file = "data/Weibo/weiboNER.train"
    dev_file = "data/Weibo/weiboNER.dev"
    test_file = "data/Weibo/weiboNER.test"

    word_emb_file = "data/gigaword_chn.all.a2b.uni.ite50.vec"
    print(train_file)
    data = Data()
    data.HP_gpu = False  #是否使用GPU
    data.norm_gaz_emb = False  #词向量是否归一化
    data.HP_fix_gaz_emb = True  #词向量表大小是否固定
    data.HP_bilstm = True
    data.random_seed = seed_num

    # 整体参数设定位置
    data.HP_lr = 0.01
    data.HP_lr_decay = 0.01
    data.HP_iteration = 150
    data.HP_batch_size = 20
    data.gaz_dropout = 0.4
    data.weight_decay = 0.00000005
    data.use_clip = False  #是否控制梯度
    data.HP_clip = 30  #最大梯度
    # LSTM参数
Exemple #15
0
import os

os.environ["CUDA_VISIBLE_DEVICES"] = '3'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    parser.add_argument('--config',
                        help='Configuration File',
                        default='train.config')
    args = parser.parse_args()

    data = Data()
    data.read_config(args.config)
    status = data.status.lower()
    data.HP_gpu = data.HP_gpu and torch.cuda.is_available()
    print("Seed num:", seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        # data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        name = 'domain-cws'
        train(data, name)
    elif status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
        data.read_config(args.config)
Exemple #16
0
 parser.add_argument('--seg', default="True") 
 parser.add_argument('--raw') 
 parser.add_argument('--loadmodel')
 parser.add_argument('--output') 
 args = parser.parse_args()
 data = Data()
 
 data.train_dir = args.train 
 data.dev_dir = args.dev 
 data.test_dir = args.test
 data.model_dir = args.savemodel
 data.dset_dir = args.savedset
 print("aaa",data.dset_dir)
 status = args.status.lower()
 save_model_dir = args.savemodel
 data.HP_gpu = torch.cuda.is_available()
 print("Seed num:",seed_num)
 data.number_normalized = True
 data.word_emb_dir = "../data/glove.6B.100d.txt"
 
 if status == 'train':
     print("MODEL: train")
     data_initialization(data)
     data.use_char = True
     data.HP_batch_size = 10
     data.HP_lr = 0.015
     data.char_seq_feature = "CNN"
     data.generate_instance('train')
     data.generate_instance('dev')
     data.generate_instance('test')
     data.build_pretrain_emb()
Exemple #17
0
# -*- coding: utf-8 -*-
# @Author: Jie
# @Date:   2017-06-15 14:11:08
# @Last Modified by:   Jie Yang,     Contact: [email protected]
# @Last Modified time: 2018-07-06 11:08:27
import time
import sys
import argparse
import random
import copy
import torch
import gc
import pickle as pickle
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from utils.metric import get_ner_fmeasure
from model.bilstmcrf import BiLSTM_CRF as SeqModel
from utils.data import Data
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

def data_initialization(data, gaz_file, train_file, dev_file, test_file):
    data.build_alphabet(train_file)
    data.build_alphabet(dev_file)
    data.build_alphabet(test_file)