def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
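# A minimal sketch of the helper factory assumed above (modelled on the
# common sequence-tagging utilities of the same name; the $UNK$ token and
# the (char_ids, word_id) tuple layout are assumptions, not necessarily
# this snippet's exact implementation).
UNK = "$UNK$"

def get_processing_word(vocab_words=None, vocab_chars=None,
                        lowercase=False, chars=False, allow_unk=True):
    def f(word):
        # optionally collect character ids before normalizing the word
        if vocab_chars is not None and chars:
            char_ids = [vocab_chars[c] for c in word if c in vocab_chars]
        if lowercase:
            word = word.lower()
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            elif allow_unk:
                word = vocab_words[UNK]
            else:
                raise KeyError("unknown key is not allowed: " + str(word))
        if vocab_chars is not None and chars:
            return char_ids, word
        return word
    return f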
def do_shell(args):
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = NERModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            saver.restore(session, model.config.model_output)

            while True:
                # Simple REPL: read a sentence, tag it, print the result.
                try:
                    # plain input(), not eval(input()): the raw string is
                    # tokenized below, so evaluating it was both unnecessary
                    # and unsafe
                    sentence = input("input> ")
                    tokens = sentence.strip().split(" ")
                    for sentence, _, predictions in model.output(
                            session, [(tokens, ["O"] * len(tokens))]):
                        predictions = [LBLS[l] for l in predictions]
                        print_sentence(sys.stdout, sentence,
                                       [""] * len(tokens), predictions)
                except EOFError:
                    print("Closing session.")
                    break
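# A minimal sketch of the print_sentence helper the REPL relies on:
# column-aligned token / label / prediction rows (the exact layout in the
# original codebase is an assumption).
def print_sentence(output, sentence, labels, predictions):
    spacings = [max(len(tok), len(lbl), len(pred))
                for tok, lbl, pred in zip(sentence, labels, predictions)]
    for row in (sentence, labels, predictions):
        for item, spacing in zip(row, spacings):
            output.write(item + " " * (spacing - len(item) + 1))
        output.write("\n")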
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--do_train", default=False, action='store_true')
    parser.add_argument('--do_eval', default=False, action='store_true')
    parser.add_argument("--do_predict", default=False, action='store_true')
    parser.add_argument('--markup', default='bios', type=str,
                        choices=['bios', 'bio'])
    parser.add_argument("--arch", default='bilstm_crf', type=str)
    parser.add_argument('--learning_rate', default=0.001, type=float)
    parser.add_argument('--seed', default=1234, type=int)
    parser.add_argument('--gpu', default='', type=str)
    parser.add_argument('--epochs', default=50, type=int)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--embedding_size', default=128, type=int)
    parser.add_argument('--hidden_size', default=384, type=int)
    parser.add_argument("--grad_norm", default=5.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--task_name", type=str, default='ner')
    args = parser.parse_args()

    args.data_dir = config.data_dir
    if not config.output_dir.exists():
        config.output_dir.mkdir()  # was args.output_dir, which is not set yet
    args.output_dir = config.output_dir / '{}'.format(args.arch)
    if not args.output_dir.exists():
        args.output_dir.mkdir()
    init_logger(log_file=str(args.output_dir / '{}-{}.log'.format(
        args.arch, args.task_name)))
    seed_everything(args.seed)

    if args.gpu != '':
        args.device = torch.device(f"cuda:{args.gpu}")
    else:
        args.device = torch.device("cpu")

    args.id2label = {i: label for i, label in enumerate(config.label2id)}
    args.label2id = config.label2id

    processor = CluenerProcessor(data_dir=config.data_dir)
    processor.get_vocab()

    model = NERModel(vocab_size=len(processor.vocab),
                     embedding_size=args.embedding_size,
                     hidden_size=args.hidden_size,
                     device=args.device,
                     label2id=args.label2id)
    model.to(args.device)

    if args.do_train:
        train(args, model, processor)
    if args.do_eval:
        model_path = args.output_dir / 'best-model.bin'
        model = load_model(model, model_path=str(model_path))
        evaluate(args, model, processor)
    if args.do_predict:
        predict(args, model, processor)
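# A minimal sketch of the seed_everything utility assumed above; seeding
# the Python, NumPy and PyTorch RNGs is the usual recipe, though the
# project's own helper may also set cuDNN determinism flags.
import os
import random
import numpy as np
import torch

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)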
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _ = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_pos = get_processing_word(vocab_pos, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, processing_pos, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_pos, config.max_iter)

    # build model
    model = NERModel(config, embeddings, embeddings_uni, pos_embeddings,
                     ntags=len(vocab_tags), nchars=len(vocab_chars),
                     vocab_words=idx2words, NE_dic=NE_dic)
    model.build()

    # train, evaluate and interact
    # (`state` and `file` are assumed to be module-level globals set by the
    # caller)
    if state == "train":
        model.train(train, dev, vocab_tags)
    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)
    else:  # state == "predict"
        convert(file)
        t2o("data_format/test_convert.txt", "data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)
        model.evaluate(test, vocab_tags)
        tagging("data_format/test_convert.txt")
class nlu(object):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags), logger=None)
    model.build()

    idx_to_tag = {idx: tag for tag, idx in vocab_tags.items()}

    saver = tf.train.Saver()
    sess = tf.Session()
    saver.restore(sess, config.model_output)

    @staticmethod
    def rec(sentence):
        try:
            processing_word = get_processing_word(nlu.vocab_words,
                                                  lowercase=config.lowercase)
            words_raw = character_separation(sentence)[0].split(' ')
            # Python 2 str -> unicode; on Python 3 the tokens are already str
            words_raw = [unicode(word, 'utf-8') for word in words_raw]
            words = list(map(processing_word, words_raw))
            pred_ids, _ = nlu.model.predict_batch(nlu.sess, [words])
            preds = [nlu.idx_to_tag[idx] for idx in list(pred_ids[0])]
            print_sentence(nlu.model.logger, {"x": words_raw, "y": preds})
            return preds
        except EOFError:
            print("Closing session.")

# Example: nlu.rec('请播放电视剧三生三世十里桃花')
def do_evaluate(args):
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = read_conll(args.data)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = NERModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            saver.restore(session, model.config.model_output)
            for sentence, labels, predictions in model.output(session,
                                                              input_data):
                predictions = [LBLS[l] for l in predictions]
                print_sentence(args.output, sentence, labels, predictions)
def main(args):
    # User parameters
    parser = OptionParser()
    parser.add_option("-m", "--model", default="", help="Model location")
    parser.add_option("-i", "--input", default="",
                      help="Input file, one sample per line")
    parser.add_option("-o", "--output", default="",
                      help="Output file location")
    parser.add_option("--output_format", default="iob",
                      help="Whether to output predicted tokens in IOB format "
                           "or src/tgt format. [iob|st]")
    parser.add_option('--get_probs', default=0,
                      help="Get normalized log likelihoods of each sample")
    parser.add_option('--get_vectors', default=0,
                      help="Get output vectors of the second-to-last layer "
                           "in the network. Currently only tested with the "
                           "CNN-BLSTM-CRF configuration")
    opts = parser.parse_args(args)[0]

    # Check parameter validity
    assert opts.output_format in ["iob", "st"]
    assert os.path.isfile(opts.model)
    # need the params file to reload the model
    assert os.path.isfile(opts.model + "_parameters.pkl")
    assert os.path.isfile(opts.input)

    # Add parameters
    parameters = {'reload': True, 'tag': True, 'repickle_data': True}

    # Load existing model
    print("Loading model...")
    model = NERModel(model_path=opts.model, parameters=parameters)
    parameters = model.parameters
    parameters['input'] = opts.input
    parameters['output'] = opts.output
    parameters['output_format'] = opts.output_format
    parameters['model'] = model.model
    parameters['get_probs'] = int(opts.get_probs) == 1
    parameters['get_vectors'] = int(opts.get_vectors) == 1

    print('Tagging...')
    start = time.time()
    load_data_and_predict(parameters)
    print('---- lines tagged in %.4fs ----' % (time.time() - start))
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=4)
    model.build()

    # train, then time evaluation on the test set
    print(vocab_tags)
    model.train(train, dev, vocab_tags)
    stime = time.time()
    model.evaluate(test, vocab_tags)
    etime = time.time()
    print(etime - stime)
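# A small illustration (hypothetical helper, not part of the snippet above)
# of how a combined tag such as "B-LOC" decomposes into the separate
# vocab_iob / vocab_type targets used by the niob/ntype heads.
def split_tag(tag):
    if tag == "O":
        return "O", None
    iob, _, etype = tag.partition("-")  # "B-LOC" -> ("B", "LOC")
    return iob, etype

assert split_tag("B-LOC") == ("B", "LOC")
assert split_tag("I-MISC") == ("I", "MISC")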
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    # prefix/suffix vocabularies
    vocab_pref_suff = load_vocab(config.PS_filename)
    vocab_pref_suff_2 = load_vocab(config.PS_filename_2)
    vocab_pref_suff_4 = load_vocab(config.PS_filename_4)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          vocab_pref_suff, vocab_pref_suff_2,
                                          vocab_pref_suff_4, lowercase=True,
                                          chars=config.chars,
                                          Pref_Suff=config.pref_suff)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         Geoparser=True)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets: dev, test and train hold the raw words and tags, which
    # the processing functions map to word and tag indices; model.evaluate
    # below then calls run_evaluate inside run_epoch
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train and evaluate
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
def do_train(args):
    # Set up some parameters.
    config = Config(args)
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    report = None  # Report(Config.eval_output)

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = NERModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)
            if report:
                report.log_output(model.output(session, dev_raw))
                report.save()
            else:
                # Save predictions in a text file.
                output = model.output(session, dev_raw)
                sentences, labels, predictions = list(zip(*output))
                predictions = [[LBLS[l] for l in preds]
                               for preds in predictions]
                output = list(zip(sentences, labels, predictions))

                with open(model.config.conll_output, 'w') as f:
                    write_conll(f, output)
                with open(model.config.eval_output, 'w') as f:
                    for sentence, labels, predictions in output:
                        print_sentence(f, sentence, labels, predictions)
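# A minimal sketch of the write_conll serializer used above: one
# token / gold-label / prediction triple per line, with a blank line
# between sentences (the column order is an assumption).
def write_conll(f, output):
    for sentence, labels, predictions in output:
        for token, label, pred in zip(sentence, labels, predictions):
            f.write("{}\t{}\t{}\n".format(token, label, pred))
        f.write("\n")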
# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                                      lowercase=True, chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, config.max_iter)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags),
                 nchars=len(vocab_chars))
model.build()

# train, evaluate and interact
model.train(train, dev, vocab_tags)
model.evaluate(test, vocab_tags)
model.interactive_shell(vocab_tags, processing_word)
# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, processing_pos, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, processing_pos, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, processing_pos, config.max_iter)

# build model
lmwords = len(vocab_words)
lmposs = len(pos_tags)
model = NERModel(config, embeddings, dic_embeddings, pos_embeddings,
                 syl_embeddings, morph_embeddings, ntags=len(vocab_tags),
                 nchars=len(vocab_chars), nsyls=len(vocab_syls),
                 nmorphs=len(vocab_morphs), nwords=lmwords, nposs=lmposs)
model.build()

# train and evaluate
model.train(train, dev, vocab_tags)
model.evaluate(test, vocab_tags, test_flag=1)
# model.interactive_shell(vocab_tags, processing_word)
vocab_tags = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                                      lowercase=True, chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, config.max_iter)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags),
                 nchars=len(vocab_chars))
model.build()

# train, evaluate and interact
model.train(train, dev, vocab_tags)
model.evaluate(test, vocab_tags)
model.interactive_shell(vocab_tags, processing_word)
pairs_batch_train = DataLoader(dataset=data_train,
                               batch_size=batch_size,
                               shuffle=True,
                               collate_fn=prepare_data.collate,
                               pin_memory=True)
pairs_batch_dev = DataLoader(dataset=data_dev,
                             batch_size=batch_size,
                             shuffle=True,
                             collate_fn=prepare_data.collate,
                             pin_memory=True)

# initialize the model
model = NERModel(word_embedding_dim, char_embedding_dim, morph_embedding_dim,
                 word_hidden_size, char_hidden_size, morph_hidden_size,
                 len(char2idx), len(morph2idx), len(tag2idx) + 1,
                 word_num_layers, char_num_layers, morph_num_layers,
                 dropout_prob).to(device)
model.train()

criterion = nn.NLLLoss()
optimizer = radam.RAdam(model.parameters(), lr=learning_rate)
print(model)

total_trainable_params = sum(p.numel() for p in model.parameters()
                             if p.requires_grad)
print('The number of trainable parameters is: %d' % total_trainable_params)

# train the model
if not skip_training:
    train(model, word_num_layers, char_num_layers, morph_num_layers,
          num_epochs, pairs_batch_train, pairs_batch_dev, word_hidden_size,
def main():
    hp = parse_args()

    # Set up model directories
    model_name = get_model_name(hp)
    model_path = path.join(hp.model_dir, model_name)
    best_model_path = path.join(model_path, 'best_models')
    if not path.exists(model_path):
        os.makedirs(model_path)
    if not path.exists(best_model_path):
        os.makedirs(best_model_path)

    # Set random seed
    torch.manual_seed(hp.seed)

    # Hacky way of assigning the number of labels.
    encoder = Encoder(model=hp.model, model_size=hp.model_size,
                      fine_tune=hp.fine_tune,
                      cased=True)  # case-preserved

    # Load data
    logging.info("Loading data")
    train_iter, val_iter, test_iter, num_labels = NERDataset.iters(
        hp.data_dir, encoder, batch_size=hp.batch_size,
        eval_batch_size=hp.eval_batch_size, train_frac=hp.train_frac)
    logging.info("Data loaded")

    # Initialize the model
    model = NERModel(encoder, num_labels=num_labels, **vars(hp)).cuda()
    sys.stdout.flush()

    if not hp.fine_tune:
        optimizer = torch.optim.Adam(model.get_other_params(), lr=hp.lr)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=hp.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', patience=5, factor=0.5, verbose=True)

    steps_done = 0
    max_f1 = 0
    init_num_stuck_evals = 0
    num_steps = (hp.n_epochs * len(train_iter.data())) // hp.real_batch_size
    # Quantize the number of training steps to a multiple of the eval interval
    num_steps = (num_steps // hp.eval_steps) * hp.eval_steps
    logging.info("Total training steps: %d" % num_steps)

    location = path.join(model_path, "model.pt")
    if path.exists(location):
        logging.info("Loading previous checkpoint")
        checkpoint = torch.load(location)
        model.encoder.weighing_params = checkpoint['weighing_params']
        if hp.fine_tune:
            model.encoder.model.load_state_dict(checkpoint['encoder'])
        model.span_net.load_state_dict(checkpoint['span_net'])
        model.label_net.load_state_dict(checkpoint['label_net'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        steps_done = checkpoint['steps_done']
        init_num_stuck_evals = checkpoint['num_stuck_evals']
        max_f1 = checkpoint['max_f1']
        torch.set_rng_state(checkpoint['rng_state'])
        logging.info("Steps done: %d, Max F1: %.3f" % (steps_done, max_f1))

    if not hp.eval:
        train(hp, model, train_iter, val_iter, optimizer, scheduler,
              model_path, best_model_path, init_steps=steps_done,
              max_f1=max_f1, eval_steps=hp.eval_steps, num_steps=num_steps,
              init_num_stuck_evals=init_num_stuck_evals)

    val_f1, test_f1 = final_eval(hp, model, best_model_path, val_iter,
                                 test_iter)

    perf_dir = path.join(hp.model_dir, "perf")
    if not path.exists(perf_dir):
        os.makedirs(perf_dir)
    if hp.slurm_job_id and hp.slurm_array_id:
        perf_file = path.join(
            perf_dir, hp.slurm_job_id + "_" + hp.slurm_array_id + ".txt")
    else:
        perf_file = path.join(model_path, "perf.txt")
    with open(perf_file, "w") as f:
        f.write("%s\n" % model_path)
        f.write("%s\t%.4f\n" % ("Valid", val_f1))
        f.write("%s\t%.4f\n" % ("Test", test_f1))
corpus = read_lines('/eng.txt')
datax, datay, tag_to_int = read_corpus(corpus)
corpus_test = read_lines('/eng_test.txt')
testx, testy, _ = read_corpus(corpus_test)
corpus_validate = read_lines('/eng_validate.txt')
validatex, validatey, _ = read_corpus(corpus_validate)

embed_size = 50
scrf_size = 100
allowed_span_length = 6
epochs = 100
validate_epochs = len(validatex)
test_epochs = len(testx)

model = NERModel(embed_size, scrf_size, tag_to_int, tag_to_int['<STOP>'],
                 tag_to_int['<START>'], allowed_span_length)
optimizer = optim.Adagrad(model.parameters(), lr=0.009)
word_dict = gs.Word2Vec(datax + validatex + testx, min_count=1,
                        size=embed_size)
data_loader = DataLoader(word_dict, datax, datay, testx, testy,
                         validatex, validatey)
train(model, data_loader, optimizer, epochs, validate_epochs)
test(model, data_loader, test_epochs)
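# Usage note: the `size` keyword above is the gensim < 4.0 spelling
# (gensim >= 4.0 renamed it to `vector_size`); trained vectors are looked
# up through the model's `wv` attribute, e.g. (assuming "the" occurs in
# the training sentences):
vector = word_dict.wv["the"]
assert vector.shape == (embed_size,)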
parser.add_argument('--decay_rate', type=float, default=0.05,
                    help='decay rate')
parser.add_argument('--plot_interval', type=int, default=2000,
                    help='plot every # steps')
args = parser.parse_args()

torch.manual_seed(args.seed)

# =============== Load device ===============
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, "
              "so you should probably run with --cuda")
device = torch.device("cuda" if args.cuda else "cpu")

# =============== Load data ===============
cleaner = data.Cleaner(args)
raw_train_data, raw_dev_data, raw_test_data = cleaner.clean()
dataset = data.Dataset(raw_train_data, raw_dev_data, raw_test_data, args)
word2idx, tag2idx, char2idx = (dataset.word_to_id, dataset.tag_to_id,
                               dataset.char_to_id)
train_data, dev_data, test_data = (dataset.train_data, dataset.dev_data,
                                   dataset.test_data)
print("{} / {} / {} sentences in train / dev / test.".format(
    len(train_data), len(dev_data), len(test_data)))

# =============== Build the model ===============
model = NERModel(word2idx, tag2idx, char2idx, args)
if args.cuda:
    model.to(device)
print('Model initialized, n_params = {}'.format(
    sum(p.numel() for p in model.parameters() if p.requires_grad)))

# =============== Train the model ===============
all_f1, all_acc = create_and_train_model(model, train_data, dev_data,
                                         test_data, tag2idx, args)
print('f1 = {}'.format(all_f1))
print('acc = {}'.format(all_acc))
def train():
    # configuration
    options.init(FLAGS)

    # read data
    print("Preparing data...")
    data = data_loader.ConllLoader()
    options.opts.vocab_size = data.vocab_size
    options.opts.num_tags = data.num_tags
    options.opts.dim_handcraft = data.dim_handcraft
    options.opts.char_vocab_size = data.char_vocab_size
    opts = options.opts

    # print the configuration
    for item in opts.__dict__:
        print("{:20s}: {}".format(item, opts.__dict__[item]))

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-opts.init_scale,
                                                    opts.init_scale)
        # build the graphs
        print("\n\nBuilding graphs...")
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = NERModel(data.dwords, is_training=True, dtype=tf.float32)
            if opts.restore:
                m.restore(session, opts.restore)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = NERModel(data.dwords, is_training=False,
                              dtype=tf.float32)
            mtest = NERModel(data.dwords, is_training=False, dtype=tf.float32)

        tf.global_variables_initializer().run()

        best_valid = -np.inf
        best_test = -np.inf
        start_time = time.time()
        print("\n\nRunning epochs...")
        try:
            for i in range(opts.max_max_epoch):
                # exponential learning-rate decay after max_epoch
                lr_decay = opts.learning_rate_decay ** max(
                    i - opts.max_epoch, 0.0)
                m.assign_lr(session, opts.learning_rate * lr_decay)
                print("Epoch: %d Learning rate: %f" % (i + 1,
                                                       session.run(m.lr)))
                run_epoch(session, m, data, "train", display=1)

                print("Validating...")
                valid_score = run_epoch(session, mvalid, data, "valid")
                if valid_score > best_valid:
                    print("New best score on validation dataset:",
                          valid_score)
                    best_valid = valid_score
                    # mvalid.save(session, name="model")
                if (i + 1) % 10 == 0:
                    print("Test...")
                    test_score = run_epoch(session, mtest, data, "test")
                    if test_score > best_test:
                        print("New best score on test dataset:", test_score)
                        best_test = test_score
        except KeyboardInterrupt:
            record.logging("epochs finished = {}".format(i + 1))
        record.record(opts, best_valid, best_test, start_time)
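# The exponential learning-rate schedule above, worked through with
# assumed values learning_rate = 1.0, learning_rate_decay = 0.5 and
# max_epoch = 4: the rate stays at 1.0 while i <= 4 and halves each
# epoch after that.
for i in range(8):
    lr_decay = 0.5 ** max(i - 4, 0.0)
    print(i + 1, 1.0 * lr_decay)  # epochs 1-5: 1.0; then 0.5, 0.25, 0.125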
def train_eval(args, train_data_path, valid_data_path):
    index = read_pickle(args.index_path)
    word2index, tag2index = index['word2id'], index['tag2id']
    args.num_labels = len(tag2index)
    args.vocab_size = len(word2index) + 1

    set_seed(args.seed_num)

    train_dataloader, train_samples = get_dataloader(train_data_path,
                                                     args.train_batch_size,
                                                     True)
    valid_dataloader, _ = get_dataloader(valid_data_path,
                                         args.valid_batch_size, False)

    if args.model == 'bert':
        bert_config = BertConfig(args.bert_config_path)
        model = NERBert(bert_config, args)
        model.load_state_dict(torch.load(args.bert_model_path), strict=False)
    else:
        if args.embedding:
            word_embedding_matrix = read_pickle(args.embedding_data_path)
            model = NERModel(args, word_embedding_matrix)
        else:
            model = NERModel(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.model == 'bert':
        # no weight decay for bias and LayerNorm parameters of the encoder
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if 'bert' not in n],
             'lr': 5e-5, 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                        and 'bert' in n],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay) and 'bert' in n],
             'weight_decay': 0.0}
        ]
        warmup_proportion = 0.1
        num_train_optimization_steps = int(
            train_samples / args.train_batch_size /
            args.gradient_accumulation_steps) * args.epochs
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_optimization_steps)
    else:
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=current_learning_rate
        )

    if args.init_checkpoint:
        # Restore model from the checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint,
                                             'checkpoint'))
        init_step = checkpoint['step']
        model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    global_step = init_step
    best_score = 0.0

    logging.info('Start Training...')
    logging.info('init_step = %d' % global_step)

    for epoch_id in range(int(args.epochs)):
        tr_loss = 0
        model.train()
        for step, train_batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in train_batch)
            _, loss = model(batch[0], batch[1])
            if n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
            if (step + 1) % 500 == 0:
                print(loss.item())

            if args.do_valid and global_step % args.valid_step == 1:
                true_res = []
                pred_res = []
                len_res = []
                model.eval()
                for valid_step, valid_batch in enumerate(valid_dataloader):
                    valid_batch = tuple(t.to(device) for t in valid_batch)
                    with torch.no_grad():
                        logit = model(valid_batch[0])
                    if args.model == 'bert':
                        # the first token is '[CLS]', so drop it
                        len_res.extend(
                            torch.sum(valid_batch[0].gt(0),
                                      dim=-1).detach().cpu().numpy() - 1)
                        true_res.extend(
                            valid_batch[1].detach().cpu().numpy()[:, 1:])
                        pred_res.extend(logit.detach().cpu().numpy()[:, 1:])
                    else:
                        len_res.extend(
                            torch.sum(valid_batch[0].gt(0),
                                      dim=-1).detach().cpu().numpy())
                        true_res.extend(valid_batch[1].detach().cpu().numpy())
                        pred_res.extend(logit.detach().cpu().numpy())
                acc, score = cal_score(true_res, pred_res, len_res, tag2index)
                score = f1_score(true_res, pred_res, len_res, tag2index)
                logging.info('Evaluation: epoch: {}, acc: {}, fscore: {}'.format(
                    str(epoch_id), acc, score))
                if score >= best_score:
                    best_score = score
                    if args.model == 'bert':
                        # Only save the model itself
                        model_to_save = (model.module
                                         if hasattr(model, 'module')
                                         else model)
                        output_dir = '{}_{}'.format('bert', str(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        output_model_file = os.path.join(output_dir,
                                                         WEIGHTS_NAME)
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        output_config_file = os.path.join(output_dir,
                                                          CONFIG_NAME)
                        with open(output_config_file, 'w') as f:
                            f.write(model_to_save.config.to_json_string())
                    else:
                        save_variable_list = {
                            'step': global_step,
                            'current_learning_rate': args.learning_rate,
                            'warm_up_steps': step
                        }
                        save_model(model, optimizer, save_variable_list, args)
                model.train()
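# The optimizer-step budget above, worked through with assumed values
# train_samples = 10000, train_batch_size = 32,
# gradient_accumulation_steps = 2 and epochs = 3: BertAdam's t_total counts
# *optimizer* steps, not forward passes, so accumulation divides the
# per-epoch count.
num_train_optimization_steps = int(10000 / 32 / 2) * 3  # int(156.25) * 3 = 468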
                                      lowercase=True, chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# pre-trained embeddings are skipped in this run
# embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
embeddings = None

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, config.max_iter)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags),
                 nchars=len(vocab_chars))
model.build()

# time evaluation on the dev set (training is skipped here)
# model.train(train, dev, vocab_tags)
import time
start = time.time()
model.evaluate(dev, vocab_tags)
print(time.time() - start)
# model.interactive_shell(vocab_tags, processing_word, test)
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)

# get processing functions
processing_word = get_processing_word(vocab_words,
                                      lowercase=config.lowercase)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, config.max_iter)

# get logger
logger = get_logger(config.log_path)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags), logger=logger)
model.build()

# train (evaluation and the interactive shell are disabled here)
model.train(train, dev, vocab_tags)
# model.evaluate(test, vocab_tags)
# model.interactive_shell(vocab_tags, processing_word)
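# A minimal sketch of the get_logger helper assumed above: a module logger
# plus a file handler with timestamps (the format string is an assumption).
import logging

def get_logger(filename):
    logger = logging.getLogger("logger")
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(format="%(message)s", level=logging.DEBUG)
    handler = logging.FileHandler(filename)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter(
        "%(asctime)s:%(levelname)s: %(message)s"))
    logging.getLogger().addHandler(handler)
    return logger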
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--lr", type=float, default=0.0001)
    parser.add_argument("--n_epochs", type=int, default=1)
    parser.add_argument("--finetuning", dest="finetuning",
                        action="store_true")
    parser.add_argument("--logdir", type=str, default="checkpoints/01")
    parser.add_argument("--trainset", type=str, default="data/train.txt")
    parser.add_argument("--validset", type=str, default="data/valid.txt")
    parser.add_argument("--model", type=str, default="bert-base-cased")
    hp = parser.parse_args()

    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device('cpu')

    model = NERModel(tag_size=len(TAGS), device='cpu', finetuning=True,
                     bert_model=hp.model)
    model = nn.DataParallel(model)

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)
    if tokenizer is not None:
        print("success")

    train_dataset = NerDataset(hp.trainset, tokenizer=tokenizer)
    eval_dataset = NerDataset(hp.validset, tokenizer=tokenizer)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 num_workers=4,
    char2idx = pickle.load(f)
with open('weights/morph_dict_lower.pkl', 'rb') as f:
    morph2idx = pickle.load(f)

word2morph = word_to_morph(whole_data_morphs)
indexed_data = data_to_idx(whole_data, embeddings)
indexed_char = char_to_idx(whole_data, char2idx)
indexed_morph = morph_to_idx(whole_data, morph2idx, word2morph)
indexed_whole_data = combine_data(indexed_data, indexed_char, indexed_morph,
                                  MAX_SEQ_LENGTH)

# initialize the model
model = NERModel(word_embedding_dim, char_embedding_dim, morph_embedding_dim,
                 word_hidden_size, char_hidden_size, morph_hidden_size,
                 len(char2idx), len(morph2idx), num_tags,
                 word_num_layers, char_num_layers, morph_num_layers,
                 dropout_prob).to(device)

# load the trained weights (upper- or lowercase variant)
if not lowercase_model:
    model.load_state_dict(torch.load('weights/model_upper.pt',
                                     map_location=torch.device('cpu')))
else:
    model.load_state_dict(torch.load('weights/model_lower.pt',
                                     map_location=torch.device('cpu')))
model.eval()

batch_size = 1
# get processing functions
processing_word = get_processing_word(vocab_words,
                                      lowercase=config.lowercase)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, config.max_iter)

# get logger
logger = get_logger(config.log_path)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags), logger=logger)
model.build()

# train and interact (test-set evaluation is disabled here)
model.train(train, dev, vocab_tags)
# model.evaluate(test, vocab_tags)
model.interactive_shell(vocab_tags, processing_word)
train_dataset = dataset.NERdataset(train_sentences, train_tags)
val_dataset = dataset.NERdataset(val_sentences, val_tags)

train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=config.TRAIN_BATCH_SIZE
)
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=config.VALID_BATCH_SIZE
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NERModel(num_tags)
model.to(device)

# group parameters so that bias and LayerNorm weights get no weight decay
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay)
        ],
            if arr[i][j] < 3:
                arr[i][j] = enc_tags.inverse_transform([arr[i][j]])[0]
            elif arr[i][j] == 3:
                arr[i][j] = 'X'
            else:
                raise KeyError(str(arr[i][j]) +
                               ' as key not found in Label Encoder')
    return arr


if __name__ == "__main__":
    my_parser = argparse.ArgumentParser()
    my_parser.version = '1.0'
    my_parser.add_argument('-g', '--grouped_entities', action='store_true',
                           help='if used, evaluate all metrics on exact '
                                'entity-level matching instead of just '
                                'wordpiece-level tokens')
    args = my_parser.parse_args()
    grouped_entities = args.grouped_entities

    meta_data = joblib.load(config.METADATA_PATH)
    enc_tags = meta_data['enc_tags']
    num_tags = len(list(enc_tags.classes_))

    sentences, tags = preprocess_data(enc_tags)
    test_dataloader = get_dataloader(sentences, tags)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = NERModel(num_tags)
    model.load_state_dict(torch.load(config.MODEL_PATH, map_location=device))

    tags_ypred, tags_ytrue = evaluate(test_dataloader, model, device,
                                      num_tags,
                                      grouped_entities=grouped_entities)
    tags_ypred = decode_transform(tags_ypred, enc_tags)
    tags_ytrue = decode_transform(tags_ytrue, enc_tags)
    print(seqeval_classification_report(tags_ytrue, tags_ypred))
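# seqeval expects tags grouped per sentence, which is what makes
# grouped-entity evaluation score whole spans rather than word pieces;
# a tiny illustrative call (made-up tags):
y_true = [["B-PER", "I-PER", "O"], ["B-LOC"]]
y_pred = [["B-PER", "I-PER", "O"], ["O"]]
print(seqeval_classification_report(y_true, y_pred))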