def __init__(self):
    # Load train/dev sentences and slot labels, plus both vocabularies,
    # from the "data" directory via the data_helper module.
    (self.sentence_train, self.slot_train,
     self.sentence_dev, self.slot_dev,
     self.vocab_sentence, self.vocab_slot) = data_helper.prepare_data(
        "data",
        sentence_training_file, slot_training_file,
        sentence_developing_file, slot_developing_file,
        from_vocabulary_size=2000, to_vocabulary_size=2000,
        tokenizer=None)
import os

import torch
import torch.optim as optim


def main():
    train_data = [data1, data2, data3, data4]
    test_data = [data5]

    # Build the character vocabulary over the training data,
    # then index both splits with it.
    lang = Lang()
    arg = ARG()
    lang.insert_data(train_data)
    train_data_all = prepare_data(train_data, lang.char2idx, arg)
    test_data_all = prepare_data(test_data, lang.char2idx, arg)
    #print(lang.char2idx)

    # Prepare for training
    model = RNNReader(arg, len(lang.char2idx))
    optimizer = optim.SGD(model.parameters(), lr=1e-4)

    # Train the model, or reload a previously saved one
    model_file = 'model_1.pkl'
    if os.path.exists(model_file):
        model = torch.load(model_file)
    else:
        train(model, optimizer, train_data_all, test_data_all, test_data)
        torch.save(model, model_file)

    predict(model, test_data_all, test_data)
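# The Lang, ARG, RNNReader, prepare_data, train, and predict helpers used in
# main() are defined elsewhere in the project. For illustration only, a
# minimal, hypothetical sketch of what Lang is assumed to do (a char-to-index
# vocabulary builder; the real class may differ):
class Lang:
    def __init__(self):
        # Assumption: index 0 reserved for padding, 1 for unknown characters.
        self.char2idx = {'<pad>': 0, '<unk>': 1}

    def insert_data(self, dataset):
        # Assumption: each sample is an iterable of characters.
        for sample in dataset:
            for ch in sample:
                if ch not in self.char2idx:
                    self.char2idx[ch] = len(self.char2idx)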
def train(self):
    # Prepare data
    (sentence_train, slot_train, sentence_dev, slot_dev,
     vocab_sentence, vocab_slot) = data_helper.prepare_data(
        "data",
        sentence_training_file, slot_training_file,
        sentence_developing_file, slot_developing_file,
        from_vocabulary_size=2000, to_vocabulary_size=2000,
        tokenizer=None)
    sentence_developing, slot_developing = data_helper.read_data(
        sentence_dev, slot_dev, max_size=None)
    sentence_training, slot_training = data_helper.read_data(
        sentence_train, slot_train, max_size=None)
    ## TODO:
    #sentence_training, slot_training = sentence_training[:1000],\
    #    slot_training[:1000]

    # Dictionaries
    w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
        vocab_sentence)
    w2id_slot, id2w_slot = data_helper.initialize_vocabulary(vocab_slot)

    # Decode ids back to strings for the conlleval script
    words_train = [list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
                   for w in sentence_training]
    labels_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                    for y in slot_training]
    words_val = [list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
                 for w in sentence_developing]
    labels_val = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                  for y in slot_developing]

    # Define model
    n_vocab = len(w2id_sentence)
    n_classes = len(w2id_slot)
    #model = Sequential()
    #model.add(Embedding(n_vocab, 100))
    #model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
    #model.add(Dropout(0.25))
    #model.add(GRU(100, return_sequences=True))
    #model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
    #model.compile('rmsprop', 'categorical_crossentropy')

    ## Training
    ##n_epochs = 30
    #n_epochs = 1

    train_f_scores = []
    val_f_scores = []
    best_val_f1 = 0

    #print("Training =>")
    #train_pred_label = []
    #avgLoss = 0
    #for i in range(n_epochs):
    #    print("Training epoch {}".format(i))
    #    bar = progressbar.ProgressBar(max_value=len(sentence_training))
    #    for n_batch, sent in bar(enumerate(sentence_training)):
    #        label = slot_training[n_batch]
    #        # Make labels one hot
    #        label = np.eye(n_classes)[label][np.newaxis, :]
    #        # View each sentence as a batch
    #        sent = sent[np.newaxis, :]
    #        if sent.shape[1] > 1:  # ignore 1-word sentences
    #            loss = model.train_on_batch(sent, label)
    #            avgLoss += loss
    #            pred = model.predict_on_batch(sent)
    #            pred = np.argmax(pred, -1)[0]
    #            train_pred_label.append(pred)
    #    avgLoss = avgLoss / n_batch
    #    predword_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
    #                      for y in train_pred_label]
    #    con_dict = conlleval(predword_train, labels_train,
    #                         words_train, 'measure.txt')
    #    train_f_scores.append(con_dict['f1'])
    #    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
    #        avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))
    #    # Save model
    #    model.save(filepath_model)
    #    gc.collect()

    print("Validating =>")
    from keras.models import load_model
    model = load_model(filepath_model)

    labels_pred_val = []
    avgLoss = 0
    bar = progressbar.ProgressBar(max_value=len(sentence_developing))
    for n_batch, sent in bar(enumerate(sentence_developing)):
        label = slot_developing[n_batch]
        label = np.eye(n_classes)[label][np.newaxis, :]
        sent = sent[np.newaxis, :]
        if sent.shape[1] > 1:  # some bug in keras
            loss = model.test_on_batch(sent, label)
            avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)
    avgLoss = avgLoss / n_batch
    gc.collect()

    predword_val = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                    for y in labels_pred_val]
    con_dict = conlleval(predword_val, labels_val,
                         words_val, 'measure.txt')
    val_f_scores.append(con_dict['f1'])
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

    if con_dict['f1'] > best_val_f1:
        best_val_f1 = con_dict['f1']
        # Persist the best model: architecture as JSON, weights as HDF5
        with open('model_architecture.json', 'w') as outf:
            outf.write(model.to_json())
        model.save_weights('best_model_weights.h5', overwrite=True)
        print("Best validation F1 score = {}".format(best_val_f1))
    print()
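# For completeness, a short sketch of how the best model persisted by train()
# above could be restored later. model_from_json and load_weights are standard
# Keras calls; the file names match those written in train(), and the compile
# settings mirror the (commented-out) model definition.
from keras.models import model_from_json


def load_best_model():
    with open('model_architecture.json') as inf:
        model = model_from_json(inf.read())
    model.load_weights('best_model_weights.h5')
    model.compile('rmsprop', 'categorical_crossentropy')
    return model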
saver = tf.train.Saver(variables_to_restore)
for name in variables_to_restore:
    print(name)


@log_time_delta
def predict(model, sess, batch, test):
    # Score every batch, then truncate padding rows back to the test size
    scores = []
    for data in batch:
        score = model.predict(sess, data)
        scores.extend(score)
    return np.array(scores[:len(test)])


text = "怎么 提取 公积金 ?"  # "How do I withdraw my housing provident fund?"
split_text = data_helper.encode_to_split(text, alphabet)

# Feed the same encoded text as both question and answer input
mb_q, mb_q_mask = data_helper.prepare_data([split_text])
mb_a, mb_a_mask = data_helper.prepare_data([split_text])

data = (mb_q, mb_a, mb_q_mask, mb_a_mask)
score = model.predict(sess, data)
print(score)

feed_dict = {
    model.question: data[0],
    model.answer: data[1],
    model.q_mask: data[2],
    model.a_mask: data[3],
    model.dropout_keep_prob_holder: 1.0
}
# Fetch the position embedding for the first (only) element of the batch
position_emb = sess.run(model.position_embedding, feed_dict=feed_dict)[0]
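# The @log_time_delta decorator used above is not defined in this snippet.
# A minimal sketch of what it is assumed to do (hypothetical implementation:
# log how long the wrapped call takes):
import time
from functools import wraps


def log_time_delta(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        print("{} took {:.3f}s".format(func.__name__, time.time() - start))
        return result
    return wrapper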