def get_slot(self, sentence):
    # Dictionaries
    w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
        self.vocab_sentence)
    w2id_slot, id2w_slot = data_helper.initialize_vocabulary(
        self.vocab_slot)

    jieba.load_userdict("../data_resource/doctor_dict.txt")
    jieba.load_userdict("../data_resource/disease_dict.txt")
    jieba.load_userdict("../data_resource/division_dict.txt")
    jieba.load_userdict("../data_resource/week_dict.txt")
    jieba.load_userdict("../data_resource/other_dict.txt")

    model = load_model(model_file)

    _WORD_FILTER = re.compile("([.,!?\"':;)(])")
    sentence = _WORD_FILTER.sub('', sentence)
    if not sentence.isalpha():
        return "sentence should be words!"

    seg_gen = list(jieba.cut(sentence, cut_all=False))
    _sentence = " ".join(seg_gen)

    # Get token-ids for the input sentence.
    token_ids = data_helper.sentence_to_token_ids(
        tf.compat.as_bytes(_sentence), w2id_sentence)
    # Add GO symbol at the end of sentence
    if data_helper.GO_ID not in token_ids:
        token_ids.append(data_helper.GO_ID)

    pred = model.predict_on_batch(np.array(token_ids)[np.newaxis, :])
    _pred = np.argmax(pred, -1)[0].tolist()
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_helper.EOS_ID in _pred:
        _pred = _pred[:_pred.index(data_helper.EOS_ID)]

    slot_list = [
        tf.compat.as_str(id2w_slot[slot_pred]) for slot_pred in _pred
    ]
    slot_dictionary = {'disease': '', 'division': '', 'doctor': '', 'time': ''}
    for index, item in enumerate(slot_list):
        if item == 'b-disease':
            slot_dictionary['disease'] = seg_gen[index]
        elif item == 'b-division':
            slot_dictionary['division'] = seg_gen[index]
        elif item == 'b-doctor':
            slot_dictionary['doctor'] = seg_gen[index]
        elif item == 'b-time':
            slot_dictionary['time'] = seg_gen[index]
    return slot_dictionary
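
# --- Illustration (not part of the model code) ----------------------------
# A minimal, standalone sketch of the prediction post-processing performed in
# get_slot above: argmax over the class axis, then truncation at the first
# EOS id. EOS_ID = 2 is an assumed stand-in for data_helper.EOS_ID.
import numpy as np

EOS_ID = 2
pred = np.random.rand(1, 6, 10)           # (batch=1, seq_len=6, n_classes=10)
_pred = np.argmax(pred, -1)[0].tolist()   # best class id at each position
if EOS_ID in _pred:
    _pred = _pred[:_pred.index(EOS_ID)]   # drop EOS and everything after it
print(_pred)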
def train():
    print('Applying Parameters:')
    for k, v in FLAGS.__dict__['__flags'].items():
        print('%s: %s' % (k, str(v)))
    print("Preparing data in %s" % FLAGS.data_dir)

    (in_seq_train, out_seq_train, label_train,
     in_seq_dev, out_seq_dev, label_dev,
     in_seq_test, out_seq_test, label_test,
     vocab_path, tag_vocab_path, label_vocab_path) = \
        data_helper.prepare_multi_task_data(
            FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size)

    result_dir = FLAGS.train_dir + '/test_results'
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)
    current_taging_valid_out_file = result_dir + '/tagging.valid.hyp.txt'
    current_taging_test_out_file = result_dir + '/tagging.test.hyp.txt'

    vocab, rev_vocab = data_helper.initialize_vocabulary(vocab_path)
    tag_vocab, rev_tag_vocab = data_helper.initialize_vocabulary(
        tag_vocab_path)
    label_vocab, rev_label_vocab = data_helper.initialize_vocabulary(
        label_vocab_path)

    with tf.Session() as sess:
        # Create model.
        print("Max sequence length: %d." % _buckets[0][0])
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))
        model, model_test = create_model(sess, len(vocab), len(tag_vocab),
                                         len(label_vocab))
        print("Creating model with source_vocab_size=%d, "
              "target_vocab_size=%d, and label_vocab_size=%d." %
              (len(vocab), len(tag_vocab), len(label_vocab)))

        # Read data into buckets and compute their sizes.
        print("Reading train/valid/test data (training set limit: %d)." %
              FLAGS.max_train_data_size)
        dev_set = read_data(in_seq_dev, out_seq_dev, label_dev)
        test_set = read_data(in_seq_test, out_seq_test, label_test)
        train_set = read_data(in_seq_train, out_seq_train, label_train)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        best_valid_score = 0
        best_test_score = 0
        while model.global_step.eval() < FLAGS.max_training_steps:
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, tags, tag_weights, batch_sequence_length, labels = \
                model.get_batch(train_set, bucket_id)
            if task['joint'] == 1:
                _, step_loss, tagging_logits, classification_logits = \
                    model.joint_step(sess, encoder_inputs, tags, tag_weights,
                                     labels, batch_sequence_length, bucket_id,
                                     False)
            elif task['tagging'] == 1:
                _, step_loss, tagging_logits = model.tagging_step(
                    sess, encoder_inputs, tags, tag_weights,
                    batch_sequence_length, bucket_id, False)
            elif task['intent'] == 1:
                _, step_loss, classification_logits = \
                    model.classification_step(sess, encoder_inputs, labels,
                                              batch_sequence_length, bucket_id,
                                              False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics,
            # and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("global step %d step-time %.2f. Training perplexity %.2f"
                      % (model.global_step.eval(), step_time, perplexity))
                sys.stdout.flush()
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0

                def run_valid_test(data_set, mode):  # mode: Eval, Test
                    # Run evals on development/test set and print the accuracy.
                    word_list = list()
                    ref_tag_list = list()
                    hyp_tag_list = list()
                    ref_label_list = list()
                    hyp_label_list = list()
                    correct_count = 0
                    accuracy = 0.0
                    tagging_eval_result = dict()
                    for bucket_id in xrange(len(_buckets)):
                        eval_loss = 0.0
                        count = 0
                        for i in xrange(len(data_set[bucket_id])):
                            count += 1
                            encoder_inputs, tags, tag_weights, sequence_length, labels = \
                                model_test.get_one(data_set, bucket_id, i)
                            tagging_logits = []
                            classification_logits = []
                            if task['joint'] == 1:
                                _, step_loss, tagging_logits, classification_logits = \
                                    model_test.joint_step(
                                        sess, encoder_inputs, tags, tag_weights,
                                        labels, sequence_length, bucket_id,
                                        True)
                            elif task['tagging'] == 1:
                                _, step_loss, tagging_logits = \
                                    model_test.tagging_step(
                                        sess, encoder_inputs, tags, tag_weights,
                                        sequence_length, bucket_id, True)
                            elif task['intent'] == 1:
                                _, step_loss, classification_logits = \
                                    model_test.classification_step(
                                        sess, encoder_inputs, labels,
                                        sequence_length, bucket_id, True)
                            eval_loss += step_loss / len(data_set[bucket_id])
                            hyp_label = None
                            if task['intent'] == 1:
                                ref_label_list.append(
                                    rev_label_vocab[labels[0][0]])
                                hyp_label = np.argmax(
                                    classification_logits[0], 0)
                                hyp_label_list.append(
                                    rev_label_vocab[hyp_label])
                                if labels[0] == hyp_label:
                                    correct_count += 1
                            if task['tagging'] == 1:
                                word_list.append([
                                    rev_vocab[x[0]]
                                    for x in encoder_inputs[:sequence_length[0]]
                                ])
                                ref_tag_list.append([
                                    rev_tag_vocab[x[0]]
                                    for x in tags[:sequence_length[0]]
                                ])
                                hyp_tag_list.append([
                                    rev_tag_vocab[np.argmax(x)]
                                    for x in tagging_logits[:sequence_length[0]]
                                ])

                    accuracy = float(correct_count) * 100 / count
                    if task['intent'] == 1:
                        print(" %s accuracy: %.2f %d/%d" %
                              (mode, accuracy, correct_count, count))
                        sys.stdout.flush()
                    if task['tagging'] == 1:
                        if mode == 'Eval':
                            taging_out_file = current_taging_valid_out_file
                        elif mode == 'Test':
                            taging_out_file = current_taging_test_out_file
                        tagging_eval_result = conlleval(
                            hyp_tag_list, ref_tag_list, word_list,
                            taging_out_file)
                        print(" %s f1-score: %.2f" %
                              (mode, tagging_eval_result['f1']))
                        sys.stdout.flush()
                    return accuracy, tagging_eval_result

                # valid
                valid_accuracy, valid_tagging_result = run_valid_test(
                    dev_set, 'Eval')
                if task['tagging'] == 1 \
                        and valid_tagging_result['f1'] > best_valid_score:
                    best_valid_score = valid_tagging_result['f1']
                    # save the best output file
                    subprocess.call([
                        'mv', current_taging_valid_out_file,
                        current_taging_valid_out_file +
                        '.best_f1_%.2f' % best_valid_score
                    ])
                # test, run test after each validation for development purpose.
                test_accuracy, test_tagging_result = run_valid_test(
                    test_set, 'Test')
                if task['tagging'] == 1 \
                        and test_tagging_result['f1'] > best_test_score:
                    best_test_score = test_tagging_result['f1']
                    # save the best output file
                    subprocess.call([
                        'mv', current_taging_test_out_file,
                        current_taging_test_out_file +
                        '.best_f1_%.2f' % best_test_score
                    ])
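
# --- Illustration (not part of the model code) ----------------------------
# Standalone sketch of the bucket-sampling scheme in the training loop above:
# a bucket is drawn with probability proportional to its size by comparing a
# uniform random number against the cumulative scale. The bucket sizes below
# are hypothetical.
import numpy as np

train_bucket_sizes = [1200, 800, 400]
train_total_size = float(sum(train_bucket_sizes))
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]
random_number_01 = np.random.random_sample()
bucket_id = min(i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01)
print(bucket_id)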
def decode(self, sentence=None):
    # Dictionaries
    w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
        self.vocab_sentence)
    w2id_slot, id2w_slot = data_helper.initialize_vocabulary(
        self.vocab_slot)

    jieba.load_userdict("../data_resource/doctor_dict.txt")
    jieba.load_userdict("../data_resource/disease_dict.txt")
    jieba.load_userdict("../data_resource/division_dict.txt")
    jieba.load_userdict("../data_resource/other_dict.txt")

    model = load_model(model_file)

    if sentence is None:
        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            seg_gen = jieba.cut(sentence, cut_all=False)
            _sentence = " ".join(seg_gen)
            # Get token-ids for the input sentence.
            token_ids = data_helper.sentence_to_token_ids(
                tf.compat.as_bytes(_sentence), w2id_sentence)
            print(token_ids)
            # Add GO symbol at the end of sentence
            if data_helper.GO_ID not in token_ids:
                token_ids.append(data_helper.GO_ID)

            pred = model.predict_on_batch(np.array(token_ids)[np.newaxis, :])
            _pred = np.argmax(pred, -1)[0].tolist()
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_helper.EOS_ID in _pred:
                _pred = _pred[:_pred.index(data_helper.EOS_ID)]

            print(" ".join([
                tf.compat.as_str(id2w_slot[slot_pred]) for slot_pred in _pred
            ]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
    elif sentence.isalpha():
        seg_gen = jieba.cut(sentence, cut_all=False)
        _sentence = " ".join(seg_gen)
        # Get token-ids for the input sentence.
        token_ids = data_helper.sentence_to_token_ids(
            tf.compat.as_bytes(_sentence), w2id_sentence)
        # Add GO symbol at the end of sentence
        if data_helper.GO_ID not in token_ids:
            token_ids.append(data_helper.GO_ID)

        pred = model.predict_on_batch(np.array(token_ids)[np.newaxis, :])
        _pred = np.argmax(pred, -1)[0].tolist()
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_helper.EOS_ID in _pred:
            _pred = _pred[:_pred.index(data_helper.EOS_ID)]

        return " ".join([
            tf.compat.as_str(id2w_slot[slot_pred]) for slot_pred in _pred
        ])
    else:
        raise ValueError('sentence should be a string of words!')
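
# --- Illustration (not part of the model code) ----------------------------
# Standalone sketch of the jieba segmentation step used in decode(): the
# sentence is cut into words and re-joined with spaces, the whitespace-
# tokenized form that data_helper.sentence_to_token_ids expects. The input
# sentence is a hypothetical example ("I want to register with pediatrics").
import jieba

seg_gen = jieba.cut("我想掛小兒科", cut_all=False)
_sentence = " ".join(seg_gen)
print(_sentence)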
def train(self):
    # Prepare data
    sentence_train, slot_train, sentence_dev, slot_dev, vocab_sentence,\
        vocab_slot = data_helper.prepare_data(
            "data",
            sentence_training_file,
            slot_training_file,
            sentence_developing_file,
            slot_developing_file,
            from_vocabulary_size=2000,
            to_vocabulary_size=2000,
            tokenizer=None)

    sentence_developing, slot_devloping = data_helper.read_data(
        sentence_dev, slot_dev, max_size=None)
    sentence_training, slot_training = data_helper.read_data(
        sentence_train, slot_train, max_size=None)

    ## TODO: uncomment to train on a subset while debugging
    #sentence_training, slot_training = sentence_training[:1000],\
    #    slot_training[:1000]

    # Dictionaries
    w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
        vocab_sentence)
    w2id_slot, id2w_slot = data_helper.initialize_vocabulary(vocab_slot)

    # For conlleval script
    words_train = [
        list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
        for w in sentence_training
    ]
    labels_train = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in slot_training
    ]
    words_val = [
        list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
        for w in sentence_developing
    ]
    labels_val = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in slot_devloping
    ]

    # Define model
    n_vocab = len(w2id_sentence)
    n_classes = len(w2id_slot)

    #model = Sequential()
    #model.add(Embedding(n_vocab, 100))
    #model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
    #model.add(Dropout(0.25))
    #model.add(GRU(100, return_sequences=True))
    #model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
    #model.compile('rmsprop', 'categorical_crossentropy')

    ## Training
    ##n_epochs = 30
    #n_epochs = 1

    train_f_scores = []
    val_f_scores = []
    best_val_f1 = 0

    #print("Training =>")
    #train_pred_label = []
    #avgLoss = 0
    #for i in range(n_epochs):
    #    print("Training epoch {}".format(i))
    #    bar = progressbar.ProgressBar(max_value=len(sentence_training))
    #    for n_batch, sent in bar(enumerate(sentence_training)):
    #        label = slot_training[n_batch]
    #        # Make labels one hot
    #        label = np.eye(n_classes)[label][np.newaxis, :]
    #        # View each sentence as a batch
    #        sent = sent[np.newaxis, :]
    #        if sent.shape[1] > 1:  # ignore 1-word sentences
    #            loss = model.train_on_batch(sent, label)
    #            avgLoss += loss
    #        pred = model.predict_on_batch(sent)
    #        pred = np.argmax(pred, -1)[0]
    #        train_pred_label.append(pred)
    #    avgLoss = avgLoss / (n_batch + 1)
    #    predword_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
    #                      for y in train_pred_label]
    #    con_dict = conlleval(predword_train, labels_train,
    #                         words_train, 'measure.txt')
    #    train_f_scores.append(con_dict['f1'])
    #    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
    #        avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))
    #    # Save model
    #    model.save(filepath_model)
    #    gc.collect()

    print("Validating =>")
    from keras.models import load_model
    model = load_model(filepath_model)

    labels_pred_val = []
    avgLoss = 0
    bar = progressbar.ProgressBar(max_value=len(sentence_developing))
    for n_batch, sent in bar(enumerate(sentence_developing)):
        label = slot_devloping[n_batch]
        label = np.eye(n_classes)[label][np.newaxis, :]
        sent = sent[np.newaxis, :]
        if sent.shape[1] > 1:  # skip 1-word sentences (Keras batch issue)
            loss = model.test_on_batch(sent, label)
            avgLoss += loss
        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred, -1)[0]
        labels_pred_val.append(pred)
    avgLoss = avgLoss / (n_batch + 1)  # enumerate is zero-based
    gc.collect()

    predword_val = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in labels_pred_val
    ]
    con_dict = conlleval(predword_val, labels_val, words_val, 'measure.txt')
    val_f_scores.append(con_dict['f1'])
    # conlleval returns precision under 'p' and recall under 'r'.
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

    if con_dict['f1'] > best_val_f1:
        best_val_f1 = con_dict['f1']
        with open('model_architecture.json', 'w') as outf:
            outf.write(model.to_json())
        model.save_weights('best_model_weights.h5', overwrite=True)

    print("Best validation F1 score = {}".format(best_val_f1))
    print()
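
# --- Illustration (not part of the model code) ----------------------------
# A sketch of reloading the best checkpoint written above, using the standard
# Keras model_from_json API together with the saved weight file.
from keras.models import model_from_json

with open('model_architecture.json') as inf:
    model = model_from_json(inf.read())
model.load_weights('best_model_weights.h5')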
def train(self):
    sentence_developing, slot_devloping = data_helper.read_data(
        self.sentence_dev, self.slot_dev, max_size=None)
    sentence_training, slot_training = data_helper.read_data(
        self.sentence_train, self.slot_train, max_size=None)

    # Make toy data; uncomment this block to train on a small subset
    #n_toy = 1000
    #sentence_training, slot_training = sentence_training[:n_toy],\
    #    slot_training[:n_toy]
    #sentence_developing, slot_devloping = sentence_developing[:round(n_toy/2)],\
    #    slot_devloping[:round(n_toy/2)]

    # Dictionaries
    w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
        self.vocab_sentence)
    w2id_slot, id2w_slot = data_helper.initialize_vocabulary(
        self.vocab_slot)

    # For conlleval script
    words_train = [
        list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
        for w in sentence_training
    ]
    labels_train = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in slot_training
    ]
    words_val = [
        list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
        for w in sentence_developing
    ]
    labels_val = [
        list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        for y in slot_devloping
    ]

    # Define model
    n_vocab = len(w2id_sentence)
    n_classes = len(w2id_slot)

    model = Sequential()
    model.add(Embedding(n_vocab, 100))
    model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
    model.add(Dropout(0.25))
    model.add(GRU(100, return_sequences=True))
    model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
    model.compile('rmsprop', 'categorical_crossentropy')

    # Training
    #n_epochs = 30
    n_epochs = 1
    train_f_scores = []
    val_f_scores = []
    best_val_f1 = 0

    print("Training =>")
    train_pred_label = []
    avgLoss = 0
    for i in range(n_epochs):
        print("Training epoch {}".format(i))
        bar = progressbar.ProgressBar(max_value=len(sentence_training))
        for n_batch, sent in bar(enumerate(sentence_training)):
            label = slot_training[n_batch]
            # Make labels one hot
            label = np.eye(n_classes)[label][np.newaxis, :]
            # View each sentence as a batch
            sent = sent[np.newaxis, :]
            if sent.shape[1] > 1:  # ignore 1-word sentences
                loss = model.train_on_batch(sent, label)
                avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            train_pred_label.append(pred)
        avgLoss = avgLoss / (n_batch + 1)  # enumerate is zero-based

        predword_train = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in train_pred_label
        ]
        con_dict = conlleval(predword_train, labels_train, words_train,
                             'measure.txt')
        train_f_scores.append(con_dict['f1'])
        # conlleval returns precision under 'p' and recall under 'r'.
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))
        # Save model
        model.save(model_file)

        print("Validating =>")
        labels_pred_val = []
        avgLoss = 0
        bar = progressbar.ProgressBar(max_value=len(sentence_developing))
        for n_batch, sent in bar(enumerate(sentence_developing)):
            label = slot_devloping[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]
            if sent.shape[1] > 1:  # skip 1-word sentences (Keras batch issue)
                loss = model.test_on_batch(sent, label)
                avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)
        avgLoss = avgLoss / (n_batch + 1)

        predword_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in labels_pred_val
        ]
        con_dict = conlleval(predword_val, labels_val, words_val,
                             'measure.txt')
        val_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

        if con_dict['f1'] > best_val_f1:
            best_val_f1 = con_dict['f1']
            with open('model_architecture.json', 'w') as outf:
                outf.write(model.to_json())
            model.save_weights('best_model_weights.h5', overwrite=True)

    print("Best validation F1 score = {}".format(best_val_f1))
    print()

    # Work around a TensorFlow teardown bug (BaseSession.__del__)
    gc.collect()
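
# --- Illustration (not part of the model code) ----------------------------
# Standalone sketch of the one-hot label encoding used in the training loop
# above: indexing np.eye(n_classes) with an id sequence yields a
# (seq_len, n_classes) matrix, and np.newaxis adds the batch dimension.
import numpy as np

n_classes = 4
label = np.array([0, 2, 3, 1])                     # slot ids for 4 tokens
one_hot = np.eye(n_classes)[label][np.newaxis, :]  # shape (1, 4, 4)
print(one_hot.shape)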