def cnn_lstm_DA(args):
    """Train (or, with ``args.test_only``, only evaluate) a CNN/LSTM dialog QA model.

    Loads train/dev JSON data, builds word and entity dictionaries, constructs
    either a CNN_LSTM_DA_Model or CNN_LSTM_Model per ``args.model_to_run``,
    then runs mini-batch training, evaluating on dev every 100 updates and
    saving the best-scoring weights to ``args.save_model``.

    Args:
        args: parsed command-line namespace; fields used include train_file,
            dev_file, stopwords, embedding_size, embedding_file, model_to_run,
            utterance_filters, query_filters, learning_rate, dropout,
            hidden_size, pre_trained, batch_size, nb_epoch, test_only,
            save_model.

    Raises:
        ValueError: if ``args.model_to_run`` names an unknown model.
    """
    logger_exp.info('-' * 50)
    logger_exp.info('Load data files..')
    # get prune dictionaries
    redundent_1, redundent_2 = tools.prune_data(args.train_file)
    # load training data; max_d/max_q/max_s are padding sizes reused for dev
    train_examples, max_d, max_q, max_s = tools.load_jsondata(
        args.train_file, redundent_1, redundent_2, args.stopwords)
    # load development data (its own max sizes are intentionally ignored)
    dev_examples, _, _, _ = tools.load_jsondata(args.dev_file, redundent_1,
                                                redundent_2, args.stopwords)
    num_train = len(train_examples[0])
    num_dev = len(dev_examples[0])

    logger_exp.info('-' * 50)
    logger_exp.info('Build dictionary..')
    word_dict = tools.build_dict(train_examples[0], train_examples[1])
    # entity dictionary for the entire dataset: @ent* markers seen in the
    # vocabulary plus the answer entities, with a reserved <unk_entity> at 0
    entity_markers = list(
        set([w for w in word_dict.keys() if w.startswith('@ent')]
            + train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logger_exp.info('Entity markers: %d' % len(entity_dict))
    num_labels = len(entity_dict)

    logger_exp.info('-' * 50)
    # Load embedding file; the actual embedding size may differ from the
    # requested one, so args.embedding_size is overwritten from the matrix
    embeddings = tools.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)
    (vocab_size, args.embedding_size) = embeddings.shape

    logger_exp.info('Building Model..')
    # build model; fail fast on an unknown name instead of hitting a
    # NameError on an unbound cnn_model further down
    if args.model_to_run == 'cnn_lstm_DA':
        cnn_model = CNN_LSTM_DA_Model(
            'CNN_LSTM_DA_Model', num_labels, vocab_size, args.embedding_size,
            max_d, max_q, max_s,
            nb_filters_utterance=args.utterance_filters,
            nb_filters_query=args.query_filters,
            learning_rate=args.learning_rate,
            dropout=args.dropout,
            nb_hidden_unit=args.hidden_size)
    elif args.model_to_run == 'cnn_lstm':
        cnn_model = CNN_LSTM_Model(
            'CNN_LSTM_Model', num_labels, vocab_size, args.embedding_size,
            max_d, max_q, max_s,
            nb_filters_utterance=args.utterance_filters,
            nb_filters_query=args.query_filters,
            learning_rate=args.learning_rate,
            dropout=args.dropout,
            nb_hidden_unit=args.hidden_size)
    else:
        raise ValueError('Unknown model_to_run: %r' % args.model_to_run)
    cnn_model.load_embedding(np.array([embeddings]))
    if args.pre_trained is not None:
        cnn_model.load_weights(args.pre_trained)
    logger_exp.info('Done.')

    logger_exp.info('-' * 50)
    logger_exp.info(args)
    logger_exp.info('-' * 50)
    logger_exp.info('Initial test..')
    # vectorize development data
    dev_x1, dev_x2, dev_l, dev_y, dev_qmask, dev_dmask = tools.vectorize(
        dev_examples, word_dict, entity_dict, max_d, max_q, max_s)
    assert len(dev_x1) == num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, dev_qmask, dev_dmask,
                           args.batch_size)
    dev_acc = eval_acc(cnn_model, all_dev, max_d, max_q, max_s)
    logger_exp.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc
    if args.test_only:
        return
    # save an initial checkpoint before training starts
    cnn_model.save_model(args.save_model)

    # Training
    logger_exp.info('-' * 50)
    logger_exp.info('Start training..')
    # vectorize training data
    train_x1, train_x2, train_l, train_y, train_qmask, train_dmask = tools.vectorize(
        train_examples, word_dict, entity_dict, max_d, max_q, max_s)
    assert len(train_x1) == num_train
    train_x1, train_x2, train_l, train_y, train_qmask, train_dmask = pre_shuffle(
        train_x1, train_x2, train_l, train_y, train_qmask, train_dmask)
    start_time = time.time()
    n_updates = 0
    all_train = gen_examples(train_x1, train_x2, train_l, train_y,
                             train_qmask, train_dmask, args.batch_size)
    for epoch in range(args.nb_epoch):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_x2, mb_l, mb_y, mb_qmask,
                  mb_dmask) in enumerate(all_train):
            logger_exp.info('#Examples = %d' % (len(mb_x1)))
            # rearrange each batch of dialogs: one array per scene position
            # (range, not Python-2 xrange — this file is Python 3)
            newx1 = []
            for i in range(len(mb_x1[0])):
                newx1.append(np.array([scene[i] for scene in mb_x1]))
            hist = cnn_model.fit(
                newx1 + [np.array(mb_x2)] + [np.array(mb_l)]
                + [np.array(mb_qmask)] + [np.array(mb_dmask)],
                np.array(mb_y),
                batch_size=args.batch_size,
                verbose=0)
            logger_exp.info(
                'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                % (epoch, idx, len(all_train), hist.history['loss'][0],
                   time.time() - start_time))
            n_updates += 1
            # evaluate every 100 batches on a dev-sized random train sample
            if n_updates % 100 == 0:
                samples = sorted(
                    np.random.choice(num_train, min(num_train, num_dev),
                                     replace=False))
                sample_train = gen_examples(
                    [train_x1[k] for k in samples],
                    [train_x2[k] for k in samples], train_l[samples],
                    [train_y[k] for k in samples], train_qmask[samples],
                    train_dmask[samples], args.batch_size)
                logger_exp.info('Train accuracy: %.2f %%' % eval_acc(
                    cnn_model, sample_train, max_d, max_q, max_s))
                dev_acc = eval_acc(cnn_model, all_dev, max_d, max_q, max_s)
                logger_exp.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logger_exp.info(
                        'Best dev accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                        % (epoch, n_updates, dev_acc))
                    cnn_model.save_model(args.save_model)
def train_sentiment():
    """Train the sentiment classifier and report its validation accuracy.

    Loads manually labeled sentiment data and a separate validation set,
    builds and saves the word dictionary, trains a model with per-class
    weights, writes misclassified validation samples to
    ``./data/wrong_analysis/sentiment_wrong_result.txt``, and returns the
    validation accuracy.

    Returns:
        float: accuracy of the trained model on the validation set.
    """
    # load data
    samples, labels, ID2label = load_training_data_sentiment(
        './data/manually_labeled_data_sentiment.txt')
    samples_val, labels_val = load_val_data_sentiment(
        './data/sentiment_test_data.txt')
    # build the word dictionary and persist it (word_dict, not `dict`,
    # to avoid shadowing the builtin)
    word_dict = tools.build_dict(samples, tools.MAX_NB_WORDS)
    tools.save_dict(word_dict)
    # inverse-frequency class weights to counter label imbalance
    # NOTE(review): raises ZeroDivisionError if a class is absent from the
    # training labels — assumed both sentiment classes are always present
    sentiment_weight = {}
    for i in range(2):
        sentiment_weight[i] = len(labels) / labels.count(i)
    print(len(word_dict))
    # load pre-trained embeddings for the dictionary words
    embedding_matrix, nb_words, EMBEDDING_DIM = tools.load_embedding(word_dict)
    N_label = len(ID2label)
    # normalize/pad inputs to fixed length 100
    X, y = tools.normalize_training_data(samples, labels, N_label, word_dict,
                                         100)
    X_val, y_val = tools.normalize_training_data(samples_val, labels_val,
                                                 N_label, word_dict, 100)
    print(len(X))
    print(len(y))
    # shuffle samples/labels in lockstep with X/y
    NUM = len(X)
    indices = np.arange(NUM)
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    samples = np.asarray(samples)
    samples = samples[indices]
    labels = np.asarray(labels)
    labels = labels[indices]
    # all data is used for training; validation comes from the separate file
    training_ratio = 1
    N_train = int(NUM * training_ratio)
    X_train = X[:N_train]
    y_train = y[:N_train]
    # initialize the sample weights as all 1
    sample_weights = np.ones(len(y_train))
    model = tools.define_model(tools.MAX_SEQUENCE_LENGTH, embedding_matrix,
                               nb_words, EMBEDDING_DIM, N_label)
    # forward slash is portable; the original 'code\model_sentiment' put a
    # literal backslash in the filename on POSIX systems
    model_save_path = 'code/model_sentiment'  # where the best model is saved
    model = tools.train_model(model, X_train, y_train, X_val, y_val,
                              sample_weights, model_save_path,
                              sentiment_weight)
    # score and accuracy on the validation set
    score, acc = model.evaluate(X_val, y_val, batch_size=2000)
    print('Test score:', score)
    print('Test accuracy:', acc)
    # concrete predicted value for each text, converted to label ids
    pred = model.predict(X_val, batch_size=2000)
    labels_pred = tools.probs2label(pred)
    # save the misclassified validation samples for error analysis;
    # the context manager guarantees the file is closed even on error
    with codecs.open('./data/wrong_analysis/sentiment_wrong_result.txt',
                     "w", encoding='utf-8', errors='ignore') as writer_sentiment:
        for i in range(len(labels_val)):
            if labels_val[i] != labels_pred[i]:
                writer_sentiment.write(samples_val[i] + '\t' +
                                       ID2label[labels_val[i]] + '\t' +
                                       ID2label[labels_pred[i]] + '\n')
    return acc
def train_class():
    """Train the fine-grained (class2) classifier and report accuracies.

    Trains on 90%% of the labeled class2 data, evaluates on the remaining
    10%%, writes misclassified samples for both the fine-grained (class2)
    and the merged coarse (class1) label sets to the wrong_analysis files,
    and returns both accuracies.

    Returns:
        tuple[float, float]: (class2 accuracy, class1 accuracy) on the
        held-out validation split.
    """
    # load class data
    samples, labels, ID2label = load_training_data_class2(
        tools.PATH + '/data/class2_labels.txt',
        tools.PATH + '/data/manually_labeled_data_class2.txt')
    # build the word dictionary and persist it (word_dict, not `dict`,
    # to avoid shadowing the builtin)
    word_dict = tools.build_dict(samples, tools.MAX_NB_WORDS)
    print(len(word_dict))
    tools.save_dict(word_dict)
    # load pre-trained embeddings for the dictionary words
    embedding_matrix, nb_words, EMBEDDING_DIM = tools.load_embedding(word_dict)
    N_label = len(ID2label)
    # normalize/pad inputs to fixed length 100
    X, y = tools.normalize_training_data(samples, labels, N_label, word_dict,
                                         100)
    print(len(X))
    print(len(y))
    # shuffle samples/labels in lockstep with X/y before splitting
    NUM = len(X)
    indices = np.arange(NUM)
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    samples = np.asarray(samples)
    samples = samples[indices]
    labels = np.asarray(labels)
    labels = labels[indices]
    # 90/10 train/validation split
    training_ratio = 0.9
    N_train = int(NUM * training_ratio)
    X_train = X[:N_train]
    y_train = y[:N_train]
    X_val = X[N_train:]
    y_val = y[N_train:]
    samples_val = samples[N_train:]
    labels_val = labels[N_train:]
    # initialize the sample weights as all 1
    sample_weights = np.ones(len(y_train))
    model = tools.define_model(tools.MAX_SEQUENCE_LENGTH, embedding_matrix,
                               nb_words, EMBEDDING_DIM, N_label)
    # forward slash is portable; the original 'code\model_class2' put a
    # literal backslash in the filename on POSIX systems
    model_save_path = 'code/model_class2'  # where the best model is saved
    model = tools.train_model(model, X_train, y_train, X_val, y_val,
                              sample_weights, model_save_path)
    # score and accuracy on the validation split
    score, accuracy_class2 = model.evaluate(X_val, y_val, batch_size=2000)
    print('Test score:', score)
    print('Test accuracy:', accuracy_class2)
    # concrete predicted value for each text, converted to label ids
    pred = model.predict(X_val, batch_size=2000)
    labels_pred = tools.probs2label(pred)
    # save the misclassified samples (fine-grained class2 labels);
    # the context manager guarantees the file is closed even on error
    with codecs.open(tools.PATH + '/data/wrong_analysis/class2_wrong_result.txt',
                     "w", encoding='utf-8', errors='ignore') as writer_class2:
        for i in range(len(labels_val)):
            if labels_val[i] != labels_pred[i]:
                writer_class2.write(samples_val[i] + '\t' +
                                    ID2label[labels_val[i]] + '\t' +
                                    ID2label[labels_pred[i]] + '\n')
    # merge class2 into class1 and re-score at the coarse level
    class2_class1 = load_class2_to_class1(tools.PATH +
                                          '/data/class2_class1.txt')
    N_class1_true = 0
    wrong_class = []
    for i in range(len(labels_val)):
        if class2_class1[ID2label[labels_val[i]]] == class2_class1[ID2label[
                labels_pred[i]]]:
            N_class1_true += 1
        else:
            wrong_class.append(class2_class1[ID2label[labels_val[i]]] + "\t" +
                               class2_class1[ID2label[labels_pred[i]]] +
                               "\t" + samples_val[i])
    # save the misclassified samples (coarse class1 labels)
    with codecs.open(tools.PATH + '/data/wrong_analysis/class1_wrong_result.txt',
                     "w", encoding='utf-8', errors='ignore') as writer:
        writer.write("original_label" + "\t" + "predict_label" + "\t" +
                     "sample" + "\n")
        for item in wrong_class:
            writer.write(item + '\n')
    accuracy_class1 = N_class1_true / len(labels_val)
    print(accuracy_class1)
    return accuracy_class2, accuracy_class1