def predict(self, test_data, model_details=None, options=None):
    if options is None:
        options = {}
    super(knet, self).predict(test_data, model_details, options)
    assert len(test_data) != 0, "test_data list shouldn't be empty"
    self.test_file = test_data[0]
    assert os.path.exists(self.test_file), "File doesn't exist: " + self.test_file

    print("Start Predicting")
    direct_entity, direct_context, self.predict_types = util.raw2npy(self.test_file)
    embedding = np.load(self.embedding)

    model = models.KA_D("KA+D", self.disamb_file)
    sess = tf.Session()
    w2v = util.build_vocab(self.glove, model.word_size)
    sess.run(model.initializer)
    model.saver.restore(sess, self.model_name)

    util.printlog("Begin computing direct outputs")
    self.final_result = util.direct(w2v, sess, model, direct_entity,
                                    direct_context, embedding, self.type_file)

    dir_name = os.path.dirname(test_data[0])
    output_file = os.path.join(dir_name, "entity_typing_test_output.txt")
    final_str = ""
    for i in range(len(self.final_result)):
        final_str = "{}\n{}\t{}\t{}".format(final_str,
                                            " ".join(direct_entity[i]),
                                            self.predict_types[i],
                                            self.final_result[i].lower())
    with open(output_file, 'w') as fout:
        fout.write(final_str.strip())
    return output_file
def main(args):
    with open('data/multim_poem.json') as f, open('data/unim_poem.json') as unif:
        multim = json.load(f)
        unim = json.load(unif)

    if args.bert:
        word2idx, idx2word = util.build_vocab_bert(unim + multim, args.threshold)
    else:
        word2idx, idx2word = util.build_vocab(unim + multim, args.threshold)
    sys.stderr.write('vocab size {}\n'.format(len(word2idx)))

    if args.bert:
        with open('./data/vocab_bert.pkl', 'wb') as f:
            pickle.dump([word2idx, idx2word], f)
    with open(args.vocab_path, 'wb') as f:
        pickle.dump([word2idx, idx2word], f)
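# A minimal usage sketch (not part of the original): reload the vocabulary that
# main() pickles above. The path './data/vocab.pkl' is only an assumed value for
# `args.vocab_path`.
import pickle

with open('./data/vocab.pkl', 'rb') as f:
    word2idx, idx2word = pickle.load(f)
print('loaded vocab size:', len(word2idx))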
import numpy as np
from sklearn.metrics import accuracy_score as acc
from tqdm import tqdm

import util
# FrequencyModel is defined elsewhere in this project.


def score(w, models, log_prob=True):
    raw = models["UK"].transform(w, log_prob=log_prob) / \
        models["USA"].transform(w, log_prob=log_prob)
    return max(raw, 1 / raw) - 1


models = {}
for country in ["UK", "USA"]:
    print("Country", country)
    data = list(map(str, util.load_data("./data/%s_tokenized.txt" % country)[5].tolist()))
    vocab = util.build_vocab(data)
    inverted_vocab = {k: v for v, k in enumerate(vocab)}

    docs = []
    for d in tqdm(data, desc="Processing docs"):
        docs.append(np.array(list(map(
            lambda x: inverted_vocab[x] if x in inverted_vocab else -1,
            d.split(" ")))))

    model = FrequencyModel(inverted_vocab)
    model.fit(docs)
    models[country] = model
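# Illustrative only (not in the original script): once both frequency models are
# fitted, score() can rank words by how differently they are used in the UK and
# USA corpora; the candidate words below are placeholders.
candidates = ["colour", "color", "holiday"]
divergence = {w: score(w, models) for w in candidates}
for w, s in sorted(divergence.items(), key=lambda kv: -kv[1]):
    print(w, round(s, 3))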
import pandas as pd
import numpy as np
import random
import re

import util

# regexp = re.compile('[,.;:@#?!&$”\"\-]+')
SPECIAL = ",.;:@#?!&$”\"\-"

inverted_vocabs = {}
for c in ["UK", "USA"]:
    data = list(map(str, util.load_data("./data/%s_tokenized.txt" % c)[5].tolist()))
    vocab = util.build_vocab(data, least_freq=21)
    inverted_vocabs[c] = {k: v for v, k in enumerate(vocab)}

print("start joint_vocab")
joint_vocab = set(inverted_vocabs["UK"].keys()) & set(inverted_vocabs["USA"].keys())
joint_vocab = {w for w in joint_vocab
               if not any(special in w for special in SPECIAL)}

word_list = pd.read_csv("./data/word_list.csv", encoding="gbk")

# Clean word list
words = word_list["Word"]
for i in range(len(words)):
    words[i] = re.sub(r"\(.+\)", "", words[i])
    words[i] = re.sub(r"\[.+\]", "", words[i])
    words[i] = re.sub(r"\r\n", ",", words[i])
    words[i] = re.sub(" ", "", words[i])
word_list["Word"] = words

# Match joint words
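# Hypothetical continuation (the original snippet ends at the comment above):
# one plausible way to keep only word-list entries whose comma-separated
# variants appear in the shared UK/USA vocabulary.
matched_words = [w for w in word_list["Word"]
                 if any(variant in joint_vocab for variant in str(w).split(","))]
print("matched words:", len(matched_words))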
def run(data_file, is_train=False, **args):
    is_test = not is_train
    batchsize = args['batchsize']
    model_name = args['model_name']
    optimizer_name = args['optimizer']
    save_dir = args['save_dir']
    print args
    if save_dir[-1] != '/':
        save_dir = save_dir + '/'

    # TODO: check save_dir exists
    if not os.path.isdir(save_dir):
        err_msg = 'There is no dir : {}\n'.format(save_dir)
        err_msg += '##############################\n'
        err_msg += '## Please run the following: \n'
        err_msg += '## $ mkdir {}\n'.format(save_dir)
        err_msg += '##############################\n'
        raise ValueError(err_msg)

    save_name = args['save_name']
    if save_name == '':
        save_name = '_'.join([model_name, optimizer_name])
    save_name = save_dir + save_name

    xp = cuda.cupy if args['gpu'] >= 0 else np
    efficient_gpu = False
    if args['gpu'] >= 0:
        cuda.get_device(args['gpu']).use()
        xp.random.seed(1234)
        efficient_gpu = args.get('efficient_gpu', False)

    def to_gpu(x):
        if args['gpu'] >= 0:
            return chainer.cuda.to_gpu(x)
        return x

    # load files
    dev_file = args['dev_file']
    test_file = args['test_file']
    delimiter = args['delimiter']
    input_idx = map(int, args['input_idx'].split(','))
    output_idx = map(int, args['output_idx'].split(','))
    word_input_idx = input_idx[0]  # NOTE: word_idx is first column!
    additional_input_idx = input_idx[1:]

    sentences_train = []
    if is_train:
        sentences_train = util.read_conll_file(filename=data_file, delimiter=delimiter)
        if len(sentences_train) == 0:
            s = str(len(sentences_train))
            err_msg = 'Invalid training sizes: {} sentences. '.format(s)
            raise ValueError(err_msg)
    else:
        # Predict
        sentences_train = util.read_raw_file(filename=data_file, delimiter=u' ')

    # sentences_train = sentences_train[:100]
    sentences_dev = []
    sentences_test = []
    if dev_file:
        sentences_dev = util.read_conll_file(dev_file, delimiter=delimiter)
    if test_file:
        sentences_test = util.read_conll_file(test_file, delimiter=delimiter)

    save_vocab = save_name + '.vocab'
    save_vocab_char = save_name + '.vocab_char'
    save_tags_vocab = save_name + '.vocab_tag'
    save_train_config = save_name + '.train_config'

    # TODO: check unknown pos tags
    # TODO: compute unk words
    vocab_adds = []
    if is_train:
        sentences_words_train = [[w_obj[word_input_idx] for w_obj in sentence]
                                 for sentence in sentences_train]
        vocab = util.build_vocab(sentences_words_train)
        vocab_char = util.build_vocab(util.flatten(sentences_words_train))
        vocab_tags = util.build_tag_vocab(sentences_train)

        # Additional features
        for ad_feat_id in additional_input_idx:
            sentences_additional_train = [[feat_obj[ad_feat_id] for feat_obj in sentence]
                                          for sentence in sentences_train]
            vocab_add = util.build_vocab(sentences_additional_train)
            vocab_adds.append(vocab_add)
    elif is_test:
        vocab = util.load_vocab(save_vocab)
        vocab_char = util.load_vocab(save_vocab_char)
        vocab_tags = util.load_vocab(save_tags_vocab)

    if args.get('word_emb_file', False):
        # set Pre-trained embeddings
        # emb_file = './emb/glove.6B.100d.txt'
        emb_file = args['word_emb_file']
        word_emb_vocab_type = args.get('word_emb_vocab_type')

        def assert_word_emb_shape(shape1, shape2):
            err_msg = '''Pre-trained embedding size is not equal to `--n_word_emb` ({} != {})'''
            if shape1 != shape2:
                err_msg = err_msg.format(str(shape1), str(shape2))
                raise ValueError(err_msg)

        def assert_no_emb(word_vecs):
            err_msg = '''There are no embeddings!
            Please check your file `--word_emb_file`'''
            if word_vecs.shape[0] == 0:
                raise ValueError(err_msg)

        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(emb_file)
            vocab = vocab_glove
        elif word_emb_vocab_type == 'replace_only':
            word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab)
            assert_no_emb(word_vecs)
        elif word_emb_vocab_type == 'additional':
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(emb_file)
            additional_vecs = []
            for word, word_idx in sorted(vocab_glove.items(), key=lambda x: x[1]):
                if word not in vocab:
                    vocab[word] = len(vocab)
                    additional_vecs.append(word_vecs[word_idx])
            additional_vecs = np.array(additional_vecs, dtype=np.float32)

    if args.get('vocab_file', False):
        vocab_file = args['vocab_file']
        vocab = util.load_vocab(vocab_file)

    if args.get('vocab_char_file', False):
        vocab_char_file = args['vocab_char_file']
        vocab_char = util.load_vocab(vocab_char_file)

    vocab_tags_inv = dict((v, k) for k, v in vocab_tags.items())
    PAD_IDX = vocab[PADDING]
    UNK_IDX = vocab[UNKWORD]
    CHAR_PAD_IDX = vocab_char[PADDING]
    CHAR_UNK_IDX = vocab_char[UNKWORD]

    tmp_xp = xp
    if efficient_gpu:
        tmp_xp = np  # use CPU (numpy)

    def parse_to_word_ids(sentences, word_input_idx, vocab):
        return util.parse_to_word_ids(sentences, xp=tmp_xp, vocab=vocab,
                                      UNK_IDX=UNK_IDX, idx=word_input_idx)

    def parse_to_char_ids(sentences):
        return util.parse_to_char_ids(sentences, xp=tmp_xp, vocab_char=vocab_char,
                                      UNK_IDX=CHAR_UNK_IDX, idx=word_input_idx)

    def parse_to_tag_ids(sentences):
        return util.parse_to_tag_ids(sentences, xp=tmp_xp, vocab=vocab_tags,
                                     UNK_IDX=-1, idx=-1)

    x_train = parse_to_word_ids(sentences_train, word_input_idx, vocab)
    x_char_train = parse_to_char_ids(sentences_train)
    y_train = parse_to_tag_ids(sentences_train)
    x_train_additionals = [parse_to_word_ids(sentences_train, ad_feat_id, vocab_adds[i])
                           for i, ad_feat_id in enumerate(additional_input_idx)]

    x_dev = parse_to_word_ids(sentences_dev, word_input_idx, vocab)
    x_char_dev = parse_to_char_ids(sentences_dev)
    y_dev = parse_to_tag_ids(sentences_dev)
    x_dev_additionals = [parse_to_word_ids(sentences_dev, ad_feat_id, vocab_adds[i])
                         for i, ad_feat_id in enumerate(additional_input_idx)]

    y_dev_cpu = [[w[-1] for w in sentence] for sentence in sentences_dev]

    # tag_names = []
    tag_names = list(set([tag[2:] if len(tag) >= 2 else tag[0]
                          for tag in vocab_tags.keys()]))

    x_test = parse_to_word_ids(sentences_test, word_input_idx, vocab)
    x_char_test = parse_to_char_ids(sentences_test)
    y_test = parse_to_tag_ids(sentences_test)
    x_test_additionals = [parse_to_word_ids(sentences_test, ad_feat_id, vocab_adds[i])
                          for i, ad_feat_id in enumerate(additional_input_idx)]

    cnt_train_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_train])
    cnt_train_word = sum([d.size for d in x_train])
    unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word

    cnt_dev_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_dev])
    cnt_dev_word = sum([d.size for d in x_dev])
    unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1)

    logging.info('train:' + str(len(x_train)))
    logging.info('dev :' + str(len(x_dev)))
    logging.info('test :' + str(len(x_test)))
    logging.info('vocab :' + str(len(vocab)))
    logging.info('vocab_tags:' + str(len(vocab_tags)))
    logging.info('unk count (train):' + str(cnt_train_unk))
    logging.info('unk rate (train):' + str(unk_train_unk_rate))
    logging.info('cnt all words (train):' + str(cnt_train_word))
    logging.info('unk count (dev):' + str(cnt_dev_unk))
    logging.info('unk rate (dev):' + str(unk_dev_unk_rate))
    logging.info('cnt all words (dev):' + str(cnt_dev_word))

    # show model config
    logging.info('######################')
    logging.info('## Model Config')
    logging.info('model_name:' + str(model_name))
    logging.info('batchsize:' + str(batchsize))
    logging.info('optimizer:' + str(optimizer_name))
    # Save model config
    logging.info('######################')
    logging.info('## Model Save Config')
    logging.info('save_dir :' + str(save_dir))

    # save vocab
    logging.info('save_vocab :' + save_vocab)
    logging.info('save_vocab_char :' + save_vocab_char)
    logging.info('save_tags_vocab :' + save_tags_vocab)
    logging.info('save_train_config :' + save_train_config)

    init_emb = None

    if is_train:
        util.write_vocab(save_vocab, vocab)
        util.write_vocab(save_vocab_char, vocab_char)
        util.write_vocab(save_tags_vocab, vocab_tags)
        util.write_vocab(save_train_config, args)

    n_vocab_add = [len(_vadd) for _vadd in vocab_adds]

    net = BiLSTM_CNN_CRF(n_vocab=len(vocab),
                         n_char_vocab=len(vocab_char),
                         emb_dim=args['n_word_emb'],
                         hidden_dim=args['n_hidden'],
                         n_layers=args['n_layer'],
                         init_emb=init_emb,
                         char_input_dim=args['n_char_emb'],
                         char_hidden_dim=args['n_char_hidden'],
                         n_label=len(vocab_tags),
                         n_add_feature_dim=args['n_add_feature_emb'],
                         n_add_feature=len(n_vocab_add),
                         n_vocab_add=n_vocab_add,
                         use_cudnn=args['use_cudnn'])
    my_cudnn(args['use_cudnn'])

    if args.get('word_emb_file', False):
        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1])
            net.word_embed.W.data = word_vecs[:]
        elif word_emb_vocab_type == 'replace_only':
            assert_no_emb(word_vecs)
            assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1])
            net.word_embed.W.data[word_ids] = word_vecs[:]
        elif word_emb_vocab_type == 'additional':
            assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1])
            v_size = additional_vecs.shape[0]
            net.word_embed.W.data[-v_size:] = additional_vecs[:]

    if args.get('return_model', False):
        return net

    if args['gpu'] >= 0:
        net.to_gpu()

    init_alpha = args['init_lr']
    if optimizer_name == 'adam':
        opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9)
    elif optimizer_name == 'adadelta':
        opt = optimizers.AdaDelta()
    if optimizer_name == 'sgd_mom':
        opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9)
    if optimizer_name == 'sgd':
        opt = optimizers.SGD(lr=init_alpha)

    opt.setup(net)
    opt.add_hook(chainer.optimizer.GradientClipping(5.0))

    def eval_loop(x_data, x_char_data, y_data, x_train_additionals=[]):
        # dev or test
        net.set_train(train=False)
        iteration_list = range(0, len(x_data), batchsize)
        # perm = np.random.permutation(len(x_data))
        sum_loss = 0.0
        predict_lists = []
        for i_index, index in enumerate(iteration_list):
            x = x_data[index:index + batchsize]
            x_char = x_char_data[index:index + batchsize]
            target_y = y_data[index:index + batchsize]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[to_gpu(_) for _ in x_ad[index:index + batchsize]]
                                for x_ad in x_train_additionals]

            output = net(x_data=x, x_char_data=x_char, x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            sum_loss += loss.data
            predict_lists.extend(predict)

        _, predict_tags = zip(*predict_lists)
        predicted_results = []
        for predict in predict_tags:
            predicted = [vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)]
            predicted_results.append(predicted)

        return predict_lists, sum_loss, predicted_results

    if args['model_filename']:
        model_filename = args['model_filename']
        serializers.load_hdf5(model_filename, net)

    if is_test:
        # predict
        # model_filename = args['model_filename']
        # model_filename = save_dir + model_filename
        # serializers.load_hdf5(model_filename, net)
        vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()])
        x_predict = x_train
        x_char_predict = x_char_train
        y_predict = y_train

        if dev_file:
            predict_dev, loss_dev, predict_dev_tags = eval_loop(x_dev, x_char_dev, y_dev)
            gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
            result, phrase_info = util.conll_eval(gold_predict_pairs, flag=False,
                                                  tag_class=tag_names)
            all_result = result['All_Result']
            print 'all_result:', all_result

        predict_pairs, _, _tmp = eval_loop(x_predict, x_char_predict, y_predict)
        _, predict_tags = zip(*predict_pairs)
        predicted_output = args['predicted_output']
        predicted_results = []
        for predict in predict_tags:
            predicted = [vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)]
            predicted_results.append(predicted)

        f = open(predicted_output, 'w')
        for predicted in predicted_results:
            for tag in predicted:
                f.write(tag + '\n')
            f.write('\n')
        f.close()

        return False

    tmax = args['max_iter']
    t = 0.0
    prev_dev_accuracy = 0.0
    prev_dev_f = 0.0
    for epoch in xrange(args['max_iter']):

        # train
        net.set_train(train=True)
        iteration_list = range(0, len(x_train), batchsize)
        perm = np.random.permutation(len(x_train))
        sum_loss = 0.0
        predict_train = []
        for i_index, index in enumerate(iteration_list):
            data = [(x_train[i], x_char_train[i], y_train[i])
                    for i in perm[index:index + batchsize]]
            x, x_char, target_y = zip(*data)

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[to_gpu(x_ad[add_i])
                                 for add_i in perm[index:index + batchsize]]
                                for x_ad in x_train_additionals]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            output = net(x_data=x, x_char_data=x_char, x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            # loss
            sum_loss += loss.data

            # update
            net.zerograds()
            loss.backward()
            opt.update()

            predict_train.extend(predict)

        # Evaluation
        train_accuracy = util.eval_accuracy(predict_train)

        logging.info('epoch:' + str(epoch))
        logging.info(' [train]')
        logging.info(' loss :' + str(sum_loss))
        logging.info(' accuracy :' + str(train_accuracy))

        # Dev
        predict_dev, loss_dev, predict_dev_tags = eval_loop(
            x_dev, x_char_dev, y_dev, x_dev_additionals)
        gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
        result, phrase_info = util.conll_eval(gold_predict_pairs, flag=False,
                                              tag_class=tag_names)
        all_result = result['All_Result']

        # Evaluation
        dev_accuracy = util.eval_accuracy(predict_dev)
        logging.info(' [dev]')
        logging.info(' loss :' + str(loss_dev))
        logging.info(' accuracy :' + str(dev_accuracy))
        logging.info(' f_measure :' + str(all_result[-1]))

        dev_f = all_result[-1]

        if prev_dev_f < dev_f:
            logging.info(' [update best model on dev set!]')
            dev_list = [prev_dev_f, dev_f]
            dev_str = ' ' + ' => '.join(map(str, dev_list))
            logging.info(dev_str)
            prev_dev_f = dev_f

            # Save model
            model_filename = save_name + '_epoch' + str(epoch)
            serializers.save_hdf5(model_filename + '.model', net)
            serializers.save_hdf5(model_filename + '.state', opt)
linkmanual = np.load(datadir + '/linkmanual.npy')

####### build model
if modelname == "SA":
    model = model.SA("SA")
elif modelname == "MA":
    model = model.MA("MA")
elif modelname == "KA":
    model = model.KA("KA")
elif modelname == "KA+D":
    model = model.KA_D("KA+D")
else:
    raise ValueError("No such model!")

sess = tf.Session()
w2v = util.build_vocab(w2vfile, model.word_size)
sess.run(model.initializer)

if args.load_model:
    model.saver.restore(sess, args.load_model)
elif not training:
    raise ValueError("Must load a model for testing!")

####### direct
if direct:
    util.printlog("Begin computing direct outputs")
    util.direct(w2v, sess, model, direct_entity, direct_context, embedding)

####### train
elif training:
    util.printlog("Begin training")
def train(self, train_data=None, options=None):
    if options is None:
        options = {}
    super(knet, self).train(train_data, options)
    if not os.path.exists(self.model_dir):
        os.makedirs(self.model_dir)

    util.printlog("Loading Data")
    embedding = np.load(self.embedding)
    train_entity = np.load(self.train_entity)
    train_context = np.load(self.train_context)
    train_label = np.load(self.train_labels)
    train_fbid = np.load(self.train_fbid)
    valid_entity = np.load(self.valid_entity)
    valid_context = np.load(self.valid_context)
    valid_label = np.load(self.valid_labels)
    valid_fbid = np.load(self.valid_fbid)

    train_size = len(train_entity)
    if train_size < 500:
        batch_size = train_size
        iter_num = train_size
        check_freq = train_size
    elif train_size < 10000:
        batch_size = train_size / 100
        iter_num = train_size / 10
        check_freq = train_size / 100
    else:
        batch_size = train_size / 1000
        iter_num = train_size / 100
        check_freq = train_size / 1000
    batch_size = int(batch_size)
    iter_num = int(iter_num)
    check_freq = int(check_freq)

    model = models.KA_D("KA+D", self.disamb_file)
    sess = tf.Session()
    w2v = util.build_vocab(self.glove, model.word_size)
    sess.run(model.initializer)

    util.printlog("Begin training")
    for i in range(iter_num):
        if i % check_freq == 0:
            util.printlog("Validating after running " +
                          str(int(i * batch_size / train_size)) + " epochs")
            util.test(w2v, model, valid_entity, valid_context, valid_label,
                      valid_fbid, embedding, batch_size, sess, "all")
            model.saver.save(sess, os.path.join(self.model_dir, str(i)))

        fd = model.fdict(w2v, (i * batch_size) % train_size, batch_size, 1,
                         train_entity, train_context, train_label, train_fbid,
                         embedding, False)
        fd[model.kprob] = 0.5
        sess.run(model.train, feed_dict=fd)

        if batch_size != train_size and i % int(train_size / batch_size / 10) == 0:
            util.printlog("Epoch {}, Batch {}".format(
                int((i * batch_size) / train_size),
                int((i * batch_size) % train_size / batch_size)))

    model.saver.save(sess, self.model_name)
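# A quick sanity check (not part of the original) of the batch-size heuristic in
# train() above, using a hypothetical corpus of 50,000 training entities.
train_size = 50000
assert int(train_size / 1000) == 50    # batch_size
assert int(train_size / 100) == 500    # iter_num (number of training steps)
assert int(train_size / 1000) == 50    # check_freq: validate/checkpoint every 50 steps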
def main():
    '''
    Main function that coordinates the entire process.
    Parses arguments that specify the exercise and the experiment that should be run.
    Initializes the model and the checkpoint managers.
    '''
    parser = argparse.ArgumentParser(
        description='Define configuration of experiments')
    parser.add_argument('--mode', type=str, nargs='+',
                        choices=['train', 'evaluate', 'generate'], required=True)
    parser.add_argument('--experiment', type=str,
                        choices=['a', 'b', 'c'], required=True)
    parser.add_argument('--id', type=str, required=False)
    parser.add_argument('--epochs', type=int, default=EPOCHS, required=False)
    args = parser.parse_args()

    # Setting Experiment Id
    if args.id is None:
        exp_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        print(f"No Experiment Id Set, Creating New: {exp_id}")
    else:
        exp_id = args.id
        print(f"Using Experiment Id: {exp_id}")

    # Setting Directories
    base_dir = f"{OUTPUT_DIR}/exp_{args.experiment}/{exp_id}"
    log_dir = f"{base_dir}/logs"
    submission_dir = f"{base_dir}/submissions"
    if not os.path.exists(submission_dir):
        os.makedirs(submission_dir)
    ckpt_dir = f"{base_dir}/ckpts"

    print(f"Experiment Directory: {base_dir}")
    print(f"Using Tensorflow Version: {tf.__version__}")

    print("Building Vocabulary...")
    build_vocab(input_file=PATH_TRAIN, output_file=PATH_VOCAB,
                top_k=VOCAB_SIZE, special=SPECIAL)
    word2id, id2word = build_vocab_lookup(PATH_VOCAB, "<unk>")

    # Setting Experiment Specific Configurations
    if args.experiment == 'a':
        lstm_hidden_state_size = 512
        word_embeddings = None
    elif args.experiment == 'b':
        lstm_hidden_state_size = 512
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)
    elif args.experiment == 'c':
        lstm_hidden_state_size = 1024
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)
    else:
        raise ValueError(f"Unknown Experiment {args.experiment}")

    print(f'Initializing Model...')
    model = LanguageModel(vocab_size=VOCAB_SIZE,
                          sentence_length=SENTENCE_LENGTH,
                          embedding_size=EMBEDDING_SIZE,
                          hidden_state_size=lstm_hidden_state_size,
                          output_size=LSTM_OUTPUT_SIZE,
                          batch_size=BATCH_SIZE,
                          word_embeddings=word_embeddings,
                          index_to_word_table=id2word)

    print(f'Initializing Optimizer...')
    optimizer = tf.keras.optimizers.Adam()

    ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, net=model)
    manager = tf.train.CheckpointManager(ckpt, ckpt_dir, max_to_keep=5)

    if manager.latest_checkpoint:
        print(f"Restoring Model from {manager.latest_checkpoint}...")
        ckpt.restore(manager.latest_checkpoint)
        model_loaded = True
    else:
        print("Initializing Model from Scratch")
        model_loaded = False

    if "train" in args.mode:
        print(f"Starting Training...")
        train_summary_writer = tf.summary.create_file_writer(f"{log_dir}/train")
        with train_summary_writer.as_default():
            train(ckpt=ckpt, manager=manager, model=model, optimizer=optimizer,
                  word2id=word2id, id2word=id2word, epochs=args.epochs)
        model_loaded = True

    if "evaluate" in args.mode:
        print(f"Starting Evaluation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to be evaluated'
        test_summary_writer = tf.summary.create_file_writer(f"{log_dir}/evaluate")
        with test_summary_writer.as_default():
            evaluate(model=model,
                     word2id=word2id,
                     id2word=id2word,
                     step=optimizer.iterations,
                     path_submission=f"{submission_dir}/group35.perplexity{args.experiment.upper()}")

    if "generate" in args.mode:
        print(f"Starting Generation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to start generation'
        generate_summary_writer = tf.summary.create_file_writer(f"{log_dir}/generate")
        with generate_summary_writer.as_default():
            generate(word2id, id2word, model=model,
                     path_submission=f"{submission_dir}/group35.continuation")
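# Hypothetical invocation (not in the original): main() parses sys.argv itself,
# so the flags can be injected before calling it; the script name "main.py" is
# only assumed.
import sys

sys.argv = ["main.py", "--mode", "train", "evaluate",
            "--experiment", "b", "--epochs", "10"]
main()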
def run(data_file, is_train=False, **args):
    is_test = not is_train
    batchsize = args['batchsize']
    model_name = args['model_name']
    optimizer_name = args['optimizer']
    save_dir = args['save_dir']
    print args
    if save_dir[-1] != '/':
        save_dir = save_dir + '/'

    # TODO: check save_dir exists
    if not os.path.isdir(save_dir):
        err_msg = 'There is no dir : {}\n'.format(save_dir)
        err_msg += '##############################\n'
        err_msg += '## Please run the following: \n'
        err_msg += '## $ mkdir {}\n'.format(save_dir)
        err_msg += '##############################\n'
        raise ValueError(err_msg)

    save_name = args['save_name']
    if save_name == '':
        save_name = '_'.join([model_name, optimizer_name])
    save_name = save_dir + save_name

    xp = cuda.cupy if args['gpu'] >= 0 else np
    if args['gpu'] >= 0:
        cuda.get_device(args['gpu']).use()
        xp.random.seed(1234)

    # load files
    dev_file = args['dev_file']
    test_file = args['test_file']
    delimiter = args['delimiter']

    sentences_train = []
    if is_train:
        sentences_train = util.read_conll_file(filename=data_file, delimiter=delimiter,
                                               input_idx=0, output_idx=-1)
        if len(sentences_train) == 0:
            s = str(len(sentences_train))
            err_msg = 'Invalid training sizes: {} sentences. '.format(s)
            raise ValueError(err_msg)
    else:
        # Predict
        sentences_train = util.read_raw_file(filename=data_file, delimiter=u' ')

    # sentences_train = sentences_train[:100]
    sentences_dev = []
    sentences_test = []
    if dev_file:
        sentences_dev = util.read_conll_file(dev_file, delimiter=delimiter,
                                             input_idx=0, output_idx=-1)
    if test_file:
        sentences_test = util.read_conll_file(test_file, delimiter=delimiter,
                                              input_idx=0, output_idx=-1)

    save_vocab = save_name + '.vocab'
    save_vocab_char = save_name + '.vocab_char'
    save_tags_vocab = save_name + '.vocab_tag'
    save_train_config = save_name + '.train_config'

    # TODO: check unknown pos tags
    # TODO: compute unk words
    if is_train:
        sentences_words_train = [w_obj[0] for w_obj in sentences_train]
        vocab = util.build_vocab(sentences_words_train)
        vocab_char = util.build_vocab(util.flatten(sentences_words_train))
        vocab_tags = util.build_tag_vocab(sentences_train)
    elif is_test:
        vocab = util.load_vocab(save_vocab)
        vocab_char = util.load_vocab(save_vocab_char)
        vocab_tags = util.load_vocab(save_tags_vocab)

    PAD_IDX = vocab[PADDING]
    UNK_IDX = vocab[UNKWORD]
    CHAR_PAD_IDX = vocab_char[PADDING]
    CHAR_UNK_IDX = vocab_char[UNKWORD]

    def parse_to_word_ids(sentences):
        return util.parse_to_word_ids(sentences, xp=xp, vocab=vocab,
                                      UNK_IDX=UNK_IDX, idx=0)

    def parse_to_char_ids(sentences):
        return util.parse_to_char_ids(sentences, xp=xp, vocab_char=vocab_char,
                                      UNK_IDX=CHAR_UNK_IDX, idx=0)

    def parse_to_tag_ids(sentences):
        return util.parse_to_tag_ids(sentences, xp=xp, vocab=vocab_tags,
                                     UNK_IDX=-1, idx=-1)

    # if is_train:
    x_train = parse_to_word_ids(sentences_train)
    x_char_train = parse_to_char_ids(sentences_train)
    y_train = parse_to_tag_ids(sentences_train)

    # elif is_test:
    #     x_predict = parse_to_word_ids(sentences_predict)
    #     x_char_predict = parse_to_char_ids(sentences_predict)
    #     y_predict = parse_to_tag_ids(sentences_predict)

    x_dev = parse_to_word_ids(sentences_dev)
    x_char_dev = parse_to_char_ids(sentences_dev)
    y_dev = parse_to_tag_ids(sentences_dev)

    x_test = parse_to_word_ids(sentences_test)
    x_char_test = parse_to_char_ids(sentences_test)
    y_test = parse_to_tag_ids(sentences_test)

    cnt_train_unk = sum([xp.sum(d == UNK_IDX) for d in x_train])
    cnt_train_word = sum([d.size for d in x_train])
    unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word

    cnt_dev_unk = sum([xp.sum(d == UNK_IDX) for d in x_dev])
    cnt_dev_word = sum([d.size for d in x_dev])
    unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1)

    logging.info('train:' + str(len(x_train)))
    logging.info('dev :' + str(len(x_dev)))
    logging.info('test :' + str(len(x_test)))
    logging.info('vocab :' + str(len(vocab)))
    logging.info('vocab_tags:' + str(len(vocab_tags)))
    logging.info('unk count (train):' + str(cnt_train_unk))
    logging.info('unk rate (train):' + str(unk_train_unk_rate))
    logging.info('cnt all words (train):' + str(cnt_train_word))
    logging.info('unk count (dev):' + str(cnt_dev_unk))
    logging.info('unk rate (dev):' + str(unk_dev_unk_rate))
    logging.info('cnt all words (dev):' + str(cnt_dev_word))

    # show model config
    logging.info('######################')
    logging.info('## Model Config')
    logging.info('model_name:' + str(model_name))
    logging.info('batchsize:' + str(batchsize))
    logging.info('optimizer:' + str(optimizer_name))
    # Save model config
    logging.info('######################')
    logging.info('## Model Save Config')
    logging.info('save_dir :' + str(save_dir))

    # save vocab
    logging.info('save_vocab :' + save_vocab)
    logging.info('save_vocab_char :' + save_vocab_char)
    logging.info('save_tags_vocab :' + save_tags_vocab)
    logging.info('save_train_config :' + save_train_config)
    util.write_vocab(save_vocab, vocab)
    util.write_vocab(save_vocab_char, vocab_char)
    util.write_vocab(save_tags_vocab, vocab_tags)
    util.write_vocab(save_train_config, args)

    net = BiLSTM_CNN_CRF(n_vocab=len(vocab),
                         n_char_vocab=len(vocab_char),
                         emb_dim=args['n_word_emb'],
                         hidden_dim=args['n_hidden'],
                         n_layers=args['n_layer'],
                         init_emb=None,
                         n_label=len(vocab_tags))

    if args['word_emb_file']:
        # set Pre-trained embeddings
        # emb_file = './emb/glove.6B.100d.txt'
        emb_file = args['word_emb_file']
        word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab)
        net.word_embed.W.data[word_ids] = word_vecs

    if args['gpu'] >= 0:
        net.to_gpu()

    init_alpha = args['init_lr']
    if optimizer_name == 'adam':
        opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9)
    elif optimizer_name == 'adadelta':
        opt = optimizers.AdaDelta()
    if optimizer_name == 'sgd_mom':
        opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9)
    if optimizer_name == 'sgd':
        opt = optimizers.SGD(lr=init_alpha)

    opt.setup(net)
    opt.add_hook(chainer.optimizer.GradientClipping(5.0))

    def eval_loop(x_data, x_char_data, y_data):
        # dev or test
        net.set_train(train=False)
        iteration_list = range(0, len(x_data), batchsize)
        # perm = np.random.permutation(len(x_data))
        sum_loss = 0.0
        predict_lists = []
        for i_index, index in enumerate(iteration_list):
            x = x_data[index:index + batchsize]
            x_char = x_char_data[index:index + batchsize]
            target_y = y_data[index:index + batchsize]

            output = net(x_data=x, x_char_data=x_char)
            predict, loss = net.predict(output, target_y)

            sum_loss += loss.data
            predict_lists.extend(predict)
        return predict_lists, sum_loss

    if is_test:
        # predict
        model_filename = args['model_filename']
        model_filename = save_dir + model_filename
        serializers.load_hdf5(model_filename, net)

        vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()])
        x_predict = x_train
        x_char_predict = x_char_train
        y_predict = y_train
        predict_pairs, _ = eval_loop(x_predict, x_char_predict, y_predict)
        _, predict_tags = zip(*predict_pairs)
        predicted_output = args['predicted_output']
        predicted_results = []
        for predict in predict_tags:
            predicted = [vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)]
            predicted_results.append(predicted)

        f = open(predicted_output, 'w')
        for predicted in predicted_results:
            for tag in predicted:
                f.write(tag + '\n')
            f.write('\n')
        f.close()

        return False

    tmax = args['max_iter']
    t = 0.0
    for epoch in xrange(args['max_iter']):

        # train
        net.set_train(train=True)
        iteration_list = range(0, len(x_train), batchsize)
        perm = np.random.permutation(len(x_train))
        sum_loss = 0.0
        predict_train = []
        for i_index, index in enumerate(iteration_list):
            data = [(x_train[i], x_char_train[i], y_train[i])
                    for i in perm[index:index + batchsize]]
            x, x_char, target_y = zip(*data)

            output = net(x_data=x, x_char_data=x_char)
            predict, loss = net.predict(output, target_y)

            # loss
            sum_loss += loss.data

            # update
            net.zerograds()
            loss.backward()
            opt.update()

            predict_train.extend(predict)

        # Evaluation
        train_accuracy = util.eval_accuracy(predict_train)

        logging.info('epoch:' + str(epoch))
        logging.info(' [train]')
        logging.info(' loss :' + str(sum_loss))
        logging.info(' accuracy :' + str(train_accuracy))

        # Dev
        predict_dev, loss_dev = eval_loop(x_dev, x_char_dev, y_dev)

        # Evaluation
        dev_accuracy = util.eval_accuracy(predict_dev)
        logging.info(' [dev]')
        logging.info(' loss :' + str(loss_dev))
        logging.info(' accuracy :' + str(dev_accuracy))

        # Save model
        model_filename = save_name + '_epoch' + str(epoch)
        serializers.save_hdf5(model_filename + '.model', net)
        serializers.save_hdf5(model_filename + '.state', opt)
# Load data
print("Loading data...")
labels, sentences = load_data_and_labels_from_csv_file(data_file)

params = {'max_chars_features': 500}
lines_chars_level_features = generate_char_level_features(
    sentences, params['max_chars_features'])
params['max_chars_features'] = max(
    [len(lines) for lines in lines_chars_level_features])
lines_chars_level_features = np.array(lines_chars_level_features)

# Build vocabulary
print("Build the vocabulary")
vocabulary = build_vocab(lines_chars_level_features, max_vocab_size=10000)
# print(vocabulary)

# Pad sentences
print("Padding sentences...")
x_text = pad_sentences(lines_chars_level_features,
                       max_sequence_length=params['max_chars_features'])
seq_len = len(x_text[0])
print("The sequence length is: ", seq_len)

# Represent each sentence as a sequence of char indices
x = text_to_sequence(x_text, vocabulary)

# Shuffle data
# np.random.seed(1)  # same shuffling each time
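# Hypothetical continuation (the original snippet stops at the shuffle comment):
# shuffle the index sequences and their labels together before any train/dev split.
# np.random.seed(1)  # uncomment for the same shuffling each time
shuffle_indices = np.random.permutation(np.arange(len(labels)))
x_shuffled = np.array(x)[shuffle_indices]
y_shuffled = np.array(labels)[shuffle_indices]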