def decodeAPI(config_file, dataset_name):
    """
    Author: Thanh Thieu

    Decode tag sequences. Configuration comes from a file.

    :param config_file: Path to a decode config file.
    :param dataset_name: Name of the dataset to be substituted into directory paths.
    :return: Tuple ``(decode_results, pred_scores)`` from the loaded model.
    """
    print('NCRFpp: Sequence decoding for "{}" dataset'.format(dataset_name))
    data = Data()
    data.HP_gpu = torch.cuda.is_available()
    data.read_config(config_file)
    _substituteDatasetName(data, dataset_name)
    print("Seed num:", seed_num)
    # Loading the dset snapshot overwrites configured values, so the config
    # (and the dataset-name substitution) is deliberately re-applied after load.
    data.load(data.dset_dir)
    data.read_config(config_file)
    _substituteDatasetName(data, dataset_name)
    print(data.raw_dir)
    data.generate_instance('raw')
    print("nbest: %s" % (data.nbest))
    decode_results, pred_scores = load_model_decode(data, 'raw')
    if data.nbest:
        data.write_nbest_decoded_results(decode_results, pred_scores, 'raw')
    else:
        data.write_decoded_results(decode_results, 'raw')
    return decode_results, pred_scores
def get_ncrf_data_object(model_name):  # , input_path, output_path):
    """Return a Data object preconfigured for the named pretrained model.

    Looks the model up in MODEL_PATHS, loads its dset snapshot, and points
    load_model_dir at the saved weights. Always runs on CPU.
    """
    paths = MODEL_PATHS[model_name]
    data = Data()
    data.dset_dir = paths['dset']
    data.load(data.dset_dir)
    data.HP_gpu = False
    # data.raw_dir = input_path
    # data.decode_dir = output_path
    data.load_model_dir = paths['model']
    data.nbest = None
    return data
def easyTrain(confdict):
    """Train a model from a config dict and return the resulting f1 score."""
    print('Model Train')
    data = Data()
    data.read_config(confdict)
    data.HP_gpu = torch.cuda.is_available()
    data_initialization(data)
    # Build instances for every split before training.
    for split in ('train', 'dev', 'test'):
        data.generate_instance(split)
    data.build_pretrain_emb()
    return train(data)
def dispatch(config=None, status="train", data=None):
    """Run either training or decoding, driven by ``data.status``.

    NOTE(review): the ``status`` parameter is immediately shadowed by
    ``data.status.lower()`` below, so it is effectively unused — confirm
    with callers before removing it.

    :param config: Config file/dict for Data.read_config (used when data is None,
        and re-read in the decode branch).
    :param status: Unused; see note above.
    :param data: Optional pre-built Data object; a fresh one is created otherwise.
    :return: ``train(data)``'s result in the train branch, else None.
    """
    if data is None:
        data = Data()
        data.HP_gpu = torch.cuda.is_available()
        data.read_config(config)
    else:
        data.HP_gpu = torch.cuda.is_available()
    data.show_data_summary()
    status = data.status.lower()
    print("Seed num:", seed_num)
    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        for split in ('train', 'dev', 'test'):
            data.generate_instance(split)
        data.build_pretrain_emb()
        return train(data)
    elif status == 'decode':
        print("MODEL: decode")
        # load() restores saved alphabets but clobbers configured paths,
        # so the config is read again afterwards.
        data.load(data.dset_dir)
        data.read_config(config)
        print(data.raw_dir)
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest and not data.sentence_classification:
            data.write_nbest_decoded_results(decode_results, pred_scores, 'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        print(
            "Invalid argument! Please use valid arguments! (train/test/decode)"
        )
def easyDecode(confdict, verbose=True):
    """Decode raw input using a saved model described by a config dict.

    :param confdict: Config dict/file understood by ``Data.read_config``.
    :param verbose: When True, print a data summary before decoding.
    :return: Tuple ``(decode_results, pred_scores)``.  (Previously these were
        computed but silently discarded; returning them is backward-compatible.)
    """
    data = Data()
    data.read_config(confdict)
    print('Model Decode')
    try:
        data.load(data.dset_dir)
    except TypeError:
        # dset_dir missing/invalid — fall back to the exported settings.
        data.load_export(data.xpt_dir)
    # load() overwrites configured values, so re-apply the config.
    data.read_config(confdict)
    data.HP_gpu = torch.cuda.is_available()
    print('Decoding source: ', data.raw_dir)
    if verbose:
        data.show_data_summary()
    data.generate_instance('raw')
    decode_results, pred_scores = load_model_decode(data, 'raw')
    if data.decode_dir:
        data.write_decoded_results(decode_results, 'raw')
    return decode_results, pred_scores
def main():
    """CLI entry point: decode <input_file> with the model in <model_dir>."""
    doc = """
    Usage:
        predict <model_dir> <input_file> <output_file>
    """
    args = docopt(doc)
    config = {}
    config['status'] = 'decode'
    config['raw_dir'] = args['<input_file>']
    config['decode_dir'] = args['<output_file>']
    # Model directory layout: model.dset (alphabets/settings) + model.best.model (weights).
    dset_dir = os.path.join(args['<model_dir>'], 'model.dset')
    load_model_dir = os.path.join(args['<model_dir>'], 'model.best.model')
    config['dset_dir'] = dset_dir
    config['load_model_dir'] = load_model_dir
    data = Data()
    data.read_config(config)
    data.HP_gpu = torch.cuda.is_available()
    # BUG FIX: print() was given the format string and seed_num as two separate
    # arguments, so the literal "%s" was printed; interpolate instead.
    print("Seed num: %s" % seed_num)
    if data.status == 'decode':
        print("MODEL: decode")
        # load() clobbers configured paths, so re-read the config after it.
        data.load(data.dset_dir)
        data.read_config(config)
        print(f"Reading from {data.raw_dir}")
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print(f"nbest: {data.nbest}")
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest:
            data.write_nbest_decoded_results(decode_results, pred_scores, 'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        print("Invalid command")
"%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (name, time_cost, speed, acc, p, r, f)) else: print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f" % (name, time_cost, speed, acc)) return pred_results, pred_scores if __name__ == '__main__': parser = argparse.ArgumentParser(description='Tuning with NCRF++') # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train') parser.add_argument('--config', help='Configuration File') args = parser.parse_args() data = Data() data.HP_gpu = torch.cuda.is_available() data.read_config(args.config) data.show_data_summary() status = data.status.lower() print("Seed num:", seed_num) if status == 'train': print("MODEL: train") data_initialization(data) data.generate_instance('train') data.generate_instance('dev') data.generate_instance('test') data.build_pretrain_emb() train(data) elif status == 'decode': print("MODEL: decode")
print("Seg: ", seg) print("Train file:", train_file) print("Dev file:", dev_file) print("Test file:", test_file) print("Char emb:", char_emb) print("Bichar emb:", bichar_emb) print("Gaz file:", gaz_file) if status == 'train': print("Model saved to:", save_model_dir) # 立即把stdout缓存内容输出 sys.stdout.flush() if status == 'train': data = Data() data.model_name = model_name data.HP_gpu = gpu data.use_bichar = conf_dict['use_bichar'] data.HP_batch_size = conf_dict['HP_batch_size'] # 1 data.HP_iteration = conf_dict['HP_iteration'] # 100 data.HP_lr = conf_dict['HP_lr'] # 0.015 data.HP_lr_decay = conf_dict['HP_lr_decay'] # 0.5 data.HP_hidden_dim = conf_dict['HP_hidden_dim'] data.MAX_SENTENCE_LENGTH = conf_dict['MAX_SENTENCE_LENGTH'] data.HP_lstm_layer = conf_dict['HP_lstm_layer'] data_initialization(data, gaz_file, train_file, dev_file, test_file) if data.model_name in ['CNN_model', 'LSTM_model']: data.generate_instance_with_gaz_2(train_file, 'train') data.generate_instance_with_gaz_2(dev_file, 'dev') data.generate_instance_with_gaz_2(test_file, 'test') elif data.model_name in ['WC-LSTM_model']:
# Echo the run configuration so the log records exactly which files were used.
print ("Status:", status)
print ("Seg: ", seg)
print ("Train file:", train_file)
print ("Dev file:", dev_file)
print ("Test file:", test_file)
print ("Raw file:", raw_file)
print ("Char emb:", char_emb)
print ("Bichar emb:", bichar_emb)
print ("Gaz file:", gaz_file)
if status == 'train':
    print ("Model saved to:", save_model_dir)
# Flush buffered stdout before the long-running steps below.
sys.stdout.flush()
if status == 'train':
    # Configure a fresh Data container for gazetteer-based training.
    data = Data()
    data.HP_gpu = gpu
    data.HP_use_char = False
    data.HP_batch_size = 10
    data.use_bigram = False
    data.gaz_dropout = 0.5
    data.norm_gaz_emb = False
    data.HP_fix_gaz_emb = False
    # Build alphabets from all splits plus the gazetteer file.
    data_initialization(data, gaz_file, train_file, dev_file, test_file)
    data.generate_instance_with_gaz(train_file, 'train')
    data.generate_instance_with_gaz(dev_file, 'dev')
    data.generate_instance_with_gaz(test_file, 'test')
    # Pretrained embeddings: char-level, bichar-level, and gazetteer words.
    data.build_word_pretrain_emb(char_emb)
    data.build_biword_pretrain_emb(bichar_emb)
    data.build_gaz_pretrain_emb(gaz_file)
def main(): parser = argparse.ArgumentParser(description='Tuning with NCRF++') # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train') parser.add_argument('--config', help='Configuration File', default='None') parser.add_argument('--wordemb', help='Embedding for words', default='None') parser.add_argument('--charemb', help='Embedding for chars', default='None') parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train') parser.add_argument('--savemodel', default="data/model/saved_model.lstmcrf.") parser.add_argument('--savedset', help='Dir of saved data setting') parser.add_argument('--train', default="data/conll03/train.bmes") parser.add_argument('--dev', default="data/conll03/dev.bmes") parser.add_argument('--test', default="data/conll03/test.bmes") parser.add_argument('--seg', default="True") parser.add_argument('--random-seed', type=int, default=42) parser.add_argument('--lr', type=float) parser.add_argument('--batch-size', type=int) parser.add_argument('--raw') parser.add_argument('--loadmodel') parser.add_argument('--output') parser.add_argument('--output-tsv') parser.add_argument('--model-prefix') parser.add_argument('--cpu', action='store_true') args = parser.parse_args() # Set random seed seed_num = args.random_seed random.seed(seed_num) torch.manual_seed(seed_num) np.random.seed(seed_num) data = Data() data.random_seed = seed_num data.HP_gpu = torch.cuda.is_available() if args.config == 'None': data.train_dir = args.train data.dev_dir = args.dev data.test_dir = args.test data.model_dir = args.savemodel data.dset_dir = args.savedset print("Save dset directory:", data.dset_dir) save_model_dir = args.savemodel data.word_emb_dir = args.wordemb data.char_emb_dir = args.charemb if args.seg.lower() == 'true': data.seg = True else: data.seg = False print("Seed num:", seed_num) else: data.read_config(args.config) if args.lr: data.HP_lr = args.lr if args.batch_size: 
# NOTE(review): fragment — `data`, `args`, and `seed_num` are defined earlier
# in the original file; the first assignment presumably sits under an
# `if args.batch_size:` guard there — confirm against the full source.
data.HP_batch_size = args.batch_size
data.output_tsv_path = args.output_tsv
if args.cpu:
    # Force CPU even when CUDA is available.
    data.HP_gpu = False
if args.model_prefix:
    data.model_dir = args.model_prefix
# data.show_data_summary()
status = data.status.lower()
print("Seed num:", seed_num)
if status == 'train':
    print("MODEL: train")
    data_initialization(data)
    data.generate_instance('train')
    data.generate_instance('dev')
    data.generate_instance('test')
    data.build_pretrain_emb()
    train(data)
elif status == 'decode':
    print("MODEL: decode")
    # load() restores saved alphabets but clobbers configured paths,
    # so the config file is read again afterwards.
    data.load(data.dset_dir)
    data.read_config(args.config)
    print(data.raw_dir)
    # exit(0)
    data.show_data_summary()
    data.generate_instance('raw')
    print("nbest: %s" % (data.nbest))
    decode_results, pred_scores = load_model_decode(data, 'raw')
    if data.nbest and not data.sentence_classification:
        data.write_nbest_decoded_results(decode_results, pred_scores, 'raw')
    else:
        data.write_decoded_results(decode_results, 'raw')
else:
    print(
        "Invalid argument! Please use valid arguments! (train/test/decode)"
    )
# NOTE(review): fragment — `parser`, `l`, `processing`, `data_initialization`,
# `train`, and `seed_num` are defined earlier in the original file.
args = parser.parse_args()
# Decode-side configuration is loaded unconditionally.
decode = Data()
decode.read_config(args.decode)
if args.train is not None:
    #TRAIN
    train_data = Data()
    encoding_index = train_data.encoding
    train_data.read_config(args.train)
    # Encode the gold dev treebank into dep2label sequence labels.
    train_enc = l.Encoding(train_data.encoding, train_data.postag_type)
    dict_encoded, all_sent, _ = train_enc.encode(train_data.dev_gold)
    processing.write_to_conllu(dict_encoded, train_data.dev_enc_dep2label, 0)
    train_data.HP_gpu = torch.cuda.is_available()
    print("Seed num:", seed_num)
    # Same encoding pass for the gold training treebank.
    train_enc = l.Encoding(train_data.encoding, train_data.postag_type)
    dict_encoded, all_sent, _ = train_enc.encode(train_data.train_gold)
    processing.write_to_conllu(dict_encoded, train_data.train_enc_dep2label, 0)
    data_initialization(train_data)
    train_data.generate_instance('train')
    train_data.generate_instance('dev')
    #train_data.generate_instance('test')
    train_data.build_pretrain_emb()
    train(train_data, decode, args)
else:
    #DECODE
    decode.HP_gpu = torch.cuda.is_available()
data.seg = False # print("Seed num:",seed_num) else: data.read_config(args.config) # data.show_data_summary() status = data.status.lower() if args.seed_num != 0: random.seed(args.seed_num) np.random.seed(args.seed_num) torch.manual_seed(args.seed_num) torch.cuda.manual_seed_all(args.seed_num) print("Seed num:", args.seed_num) data.model_dir = args.savemodel data.HP_gpu = args.gpu if status == 'train': print("MODEL: train") data_initialization(data) data.generate_instance('train') data.generate_instance('dev') data.generate_instance('test') data.build_pretrain_emb() train(data) elif status == 'decode': print("MODEL: decode") data.load(data.dset_dir) data.read_config(args.config) print(data.raw_dir) # exit(0)
print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f"%(name, time_cost, speed, acc)) return pred_results, pred_scores if __name__ == '__main__': parser = argparse.ArgumentParser(description='Tuning with NCRF++') # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train') parser.add_argument('--msf', action='store_true', default = False) parser.add_argument('--config', help='Configuration File' ) args = parser.parse_args() data = Data() #data.HP_gpu = torch.cuda.is_available() data.HP_gpu = False data.read_config(args.config) status = data.status.lower() print("Seed num:",seed_num) if status == 'train': print("MODEL: train") data_initialization(data) data.generate_instance('train') data.generate_instance('dev') data.generate_instance('test') data.build_pretrain_emb() train(data, args.msf) elif status == 'decode': print("MODEL: decode") data.load(data.dset_dir)
# Fix all RNG seeds so runs are reproducible, and pin the visible GPU.
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)
torch.cuda.manual_seed(seed_num)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## weibo
train_file = "data/Weibo/weiboNER.train"
dev_file = "data/Weibo/weiboNER.dev"
test_file = "data/Weibo/weiboNER.test"
word_emb_file = "data/gigaword_chn.all.a2b.uni.ite50.vec"
print(train_file)

data = Data()
data.HP_gpu = False  # whether to use the GPU
data.norm_gaz_emb = False  # whether to normalize the word (gazetteer) embeddings
data.HP_fix_gaz_emb = True  # whether the gazetteer embedding table is frozen
data.HP_bilstm = True
data.random_seed = seed_num

# Global hyper-parameter settings.
data.HP_lr = 0.01
data.HP_lr_decay = 0.01
data.HP_iteration = 150
data.HP_batch_size = 20
data.gaz_dropout = 0.4
data.weight_decay = 0.00000005
data.use_clip = False  # whether to clip gradients
data.HP_clip = 30  # maximum gradient value

# LSTM parameters
import os
# Pin this process to GPU index 3.
os.environ["CUDA_VISIBLE_DEVICES"] = '3'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    parser.add_argument('--config', help='Configuration File', default='train.config')
    args = parser.parse_args()
    data = Data()
    data.read_config(args.config)
    status = data.status.lower()
    # Only use the GPU when both configured and actually available.
    data.HP_gpu = data.HP_gpu and torch.cuda.is_available()
    print("Seed num:", seed_num)
    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        # data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        name = 'domain-cws'
        train(data, name)
    elif status == 'decode':
        print("MODEL: decode")
        # load() restores saved alphabets but clobbers configured paths,
        # so the config is re-read afterwards.
        data.load(data.dset_dir)
        data.read_config(args.config)
# NOTE(review): fragment — `parser`, other `--train/--dev/--test/...` options,
# `data_initialization`, and `seed_num` are defined earlier in the original file.
parser.add_argument('--seg', default="True")
parser.add_argument('--raw')
parser.add_argument('--loadmodel')
parser.add_argument('--output')
args = parser.parse_args()

data = Data()
data.train_dir = args.train
data.dev_dir = args.dev
data.test_dir = args.test
data.model_dir = args.savemodel
data.dset_dir = args.savedset
print("aaa", data.dset_dir)
status = args.status.lower()
save_model_dir = args.savemodel
data.HP_gpu = torch.cuda.is_available()
print("Seed num:", seed_num)

data.number_normalized = True
data.word_emb_dir = "../data/glove.6B.100d.txt"

if status == 'train':
    print("MODEL: train")
    data_initialization(data)
    # Hyper-parameters fixed for this experiment.
    data.use_char = True
    data.HP_batch_size = 10
    data.HP_lr = 0.015
    data.char_seq_feature = "CNN"
    data.generate_instance('train')
    data.generate_instance('dev')
    data.generate_instance('test')
    data.build_pretrain_emb()
# -*- coding: utf-8 -*- # @Author: Jie # @Date: 2017-06-15 14:11:08 # @Last Modified by: Jie Yang, Contact: [email protected] # @Last Modified time: 2018-07-06 11:08:27 import time import sys import argparse import random import copy import torch import gc import pickle as pickle import torch.autograd as autograd import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import numpy as np from utils.metric import get_ner_fmeasure from model.bilstmcrf import BiLSTM_CRF as SeqModel from utils.data import Data seed_num = 100 random.seed(seed_num) torch.manual_seed(seed_num) np.random.seed(seed_num) def data_initialization(data, gaz_file, train_file, dev_file, test_file): data.build_alphabet(train_file) data.build_alphabet(dev_file) data.build_alphabet(test_file)