def load_data(config):
    """
    Load training examples and pretrained word embeddings from disk.
    Return training inputs, labels and pretrained embeddings.
    """
    # Load raw data
    wq_file = config["webquestions_examples_file"]
    n_neg_sample = config["num_classes"] - 1
    x_u, x_r, y, max_len = dh.get_training_examples_for_softmax(wq_file, n_neg_sample)

    # Pad sentences
    pad = lambda x: dh.pad_sentences(x, max_len)
    pad_lst = lambda x: list(map(pad, x))
    x_u = list(map(pad, x_u))
    x_r = list(map(pad_lst, x_r))

    # Load tokens and pretrained embeddings
    we_file = config["word_embeddings_file"]
    voc_size = config["vocabulary_size"]
    embedding_size = config["embedding_size"]
    tokens, U = dh.get_pretrained_wordvec_from_file(we_file, (voc_size, embedding_size))

    # Represent sentences as list(nparray) of ints
    dctize = lambda word: tokens[word] if word in tokens else tokens["pad"]
    dctizes = lambda words: list(map(dctize, words))
    dctizess = lambda wordss: list(map(dctizes, wordss))
    x_u_i = np.array(list(map(dctizes, x_u)))
    x_r_i = np.array(list(map(dctizess, x_r)))
    y = np.array(y)
    return (x_u_i, x_r_i, y, max_len, U)
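# Every snippet in this file leans on a pad_sentences helper from data_helpers, with
# slightly different signatures from project to project (positional max length,
# sequence_length=, testStringLength=, or no length argument at all). The sketch below
# is only an assumption of the common behaviour -- right-pad each tokenised sentence
# with "<PAD/>" to a fixed length -- not the actual implementation used by any of these
# projects; some variants return just the padded list instead of a (list, length) tuple.
def pad_sentences(sentences, sequence_length=None, padding_word="<PAD/>"):
    """Right-pad (and truncate) each tokenised sentence to a fixed length."""
    if sequence_length is None:
        sequence_length = max(len(s) for s in sentences)
    padded = []
    for sentence in sentences:
        sentence = sentence[:sequence_length]
        padded.append(sentence + [padding_word] * (sequence_length - len(sentence)))
    return padded, sequence_length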
def prepare_sentences(sentences, vocabulary, max_length): print(sentences) sentences_processed = process_sentences(sentences) sentences_padded, _ = pad_sentences(sentences_processed, sequence_length=max_length) x, _ = build_input_data(sentences_padded, 0, vocabulary) return x
def preencode(df): sentences = make_text_matrix(df) s = [x.split() for x in sentences['text'].values] l = sentences['target'].values sentences_padded = pad_sentences(s) vocabulary, vocabulary_inv = build_vocab(sentences_padded) x, y = build_input_data(sentences_padded, l, vocabulary) return x,y,vocabulary,vocabulary_inv
def preprocess(model): dict_sentences = {} reverse_dict = {} match_dictionary = {} pair_list = [] import sys i = 0 k = 0 maxlen = 0 # this reads in one line at a time from stdin for line in sys.stdin: i+=1 tokens = line.split("\t") sent1 = tokens[0] sent2 = tokens[1] if clean_sent_cond(sent1) or clean_sent_cond(sent2): continue else: k += 1 if not sent1 in dict_sentences: dict_sentences[sent1] = len(dict_sentences) + 1 if not sent2 in dict_sentences: dict_sentences[sent2] = len(dict_sentences) + 1 index_1 = dict_sentences[sent1] index_2 = dict_sentences[sent2] if not index_1 in match_dictionary: match_dictionary[index_1] = [] if not index_2 in match_dictionary: match_dictionary[index_2] = [] match_dictionary[index_1].append(index_2) match_dictionary[index_2].append(index_1) pair_list.append((index_1, index_2)) if i % 10000 == 0: print(str(k) + "/" + str(i)) if k == 500000: break; i = 0 for entry in dict_sentences: simple_sent1 = filter(lambda x: len(x) > 1, data_helpers.clean_str(entry).split(" ")) sent1 = data_helpers.build_input_data(data_helpers.pad_sentences([simple_sent1], 40, padding_word="<PAD/>"), model.vocab) reverse_dict[dict_sentences[entry]] = sent1 if i % 10000 == 0: print(i) i += 1 random.shuffle(pair_list) pickle.dump(reverse_dict, open("sentences_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL) print("writing sentences " + str(len(reverse_dict))) pickle.dump(match_dictionary, open("pairs_index_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL) print("writing map " + str(len(match_dictionary))) pickle.dump(pair_list, open("pairs_list_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL) print("pairs " + str(len(pair_list)))
def transform_testdata(test_strs, vocabulary): test_strs = [ Lemmatizer(data_helpers.clean_str(sent)) for sent in test_strs ] test_strs = [s.split(" ") for s in test_strs] test_strs_padded = data_helpers.pad_sentences(test_strs, testStringLength=90) x = np.array([[ vocabulary[word] if word in vocabulary else vocabulary["UNKNOWN_WORD"] for word in sentence ] for sentence in test_strs_padded]) return x
def predict(self, x_raw): x_raw = list(x_raw) x_raw = [s.strip() for s in x_raw] x_raw = [list(s) for s in x_raw] x_pad, _ = data_helpers.pad_sentences(x_raw, sequence_length) x_test = np.array([[vocabulary.get(word, 0) for word in sentence] for sentence in x_pad]) # Get the placeholders from the graph by name input_x = self.graph.get_operation_by_name("input_x").outputs[0] # input_y = graph.get_operation_by_name("input_y").outputs[0] dropout_keep_prob = self.graph.get_operation_by_name( "dropout_keep_prob").outputs[0] # Tensors we want to evaluate predictions = self.graph.get_operation_by_name( "output/predictions").outputs[0] # Generate batches for one epoch batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) # Collect the predictions here all_predictions = [] for x_test_batch in batches: batch_predictions = self.sess.run(predictions, { input_x: x_test_batch, dropout_keep_prob: 1.0 }) all_predictions = np.concatenate( [all_predictions, batch_predictions]) return all_predictions
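# The predict() method above batches its input with data_helpers.batch_iter, which is
# not shown in this file. A minimal sketch, assuming the usual generator that yields
# mini-batches for a given number of epochs and optionally reshuffles each epoch; the
# real helper may differ (for example in how it handles the final partial batch).
import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield successive mini-batches of `data` for `num_epochs` epochs."""
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(np.ceil(data_size / float(batch_size)))
    for _ in range(num_epochs):
        indices = np.random.permutation(data_size) if shuffle else np.arange(data_size)
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min(start + batch_size, data_size)
            yield data[indices[start:end]]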
from data_helpers import get_labels
from data_helpers import plot_confusion_matrix
from data_helpers import multiclass_roc_auc_score
# Assumed imports for the helpers used below, matching the other snippets in this file
from data_helpers import load_data_and_labels, pad_sentences
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from matplotlib import rcParams

rcParams.update({'figure.autolayout': True})

print('Loading annotated social text data')
x_text, y_class = load_data_and_labels()
y = get_labels()
sentences_padded, sequence_length = pad_sentences(x_text)

# Global variables
embedding_dim = 200
num_filters = 512
drop = 0.5
epochs = 1
batch_size = 100

# Define 10-fold cross-validation test harness
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cvscores = []
auc_scores = []
print('10 fold CV starting')
print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparatopn # ================================================== # Load data print("Loading data...") x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) x_eval = data_helpers.load_test_data(FLAGS.test_data_file) # Pad sentences sentences_padded_all, max_length = data_helpers.pad_sentences(x_text + x_eval) sentences_padded, max_length = data_helpers.pad_sentences(x_text, max_length) # Build vocabulary vocabulary, vocabulary_inv = data_helpers.build_vocab(sentences_padded_all) x, y = data_helpers.build_input_data(sentences_padded, y, vocabulary) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
# ======================== preprocess data ======================== #
# TODO: After all training is complete, use argparse to store the params.
positive_data_file = "./data/rt-polaritydata/rt-polarity.pos"
negative_data_file = "./data/rt-polaritydata/rt-polarity.neg"

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(positive_data_file, negative_data_file)

# Pad sentences
print("Padding sentences...")
x_text = data_helpers.pad_sentences(x_text)
print("The sequence length is: ", len(x_text[0]))

# Build vocabulary
vocabulary, vocabulary_inv = data_helpers.build_vocab(x_text)

# Represent each sentence as a sequence of word indices
x = data_helpers.build_index_sentence(x_text, vocabulary)
y = y.argmax(axis=1)  # y: [1, 1, 1, ..., 0, 0, 0]. 1 for positive, 0 for negative

# Shuffle data
np.random.seed(42)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
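# Several snippets above and below turn padded token lists into integer matrices via
# build_vocab / build_input_data. The sketches below are assumptions about the
# conventional behaviour (vocabulary ordered by frequency, per-token index lookup);
# the actual data_helpers implementations may differ, and some projects call
# build_input_data with only (sentences, vocabulary) and get a single array back.
from collections import Counter
import numpy as np

def build_vocab(sentences):
    """Map each word to an index (most frequent first) and keep the inverse list."""
    word_counts = Counter(word for sentence in sentences for word in sentence)
    vocabulary_inv = [word for word, _ in word_counts.most_common()]
    vocabulary = {word: i for i, word in enumerate(vocabulary_inv)}
    return vocabulary, vocabulary_inv

def build_input_data(sentences, labels, vocabulary):
    """Replace every word by its vocabulary index; return (x, y) as numpy arrays."""
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return x, y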
def main(argv): parser = argparse.ArgumentParser( description='CNN baseline for DSTC5 SAP Task') parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='') parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='') parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='') args = parser.parse_args() # load parameters params = data_helpers.load_params("parameters/cnn.txt") pprint(params) ctx_len = int(params['context_length']) train_utters = [] trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True) sys.stderr.write('Loading training instances ... ') transcript_contexts = [] for call in trainset: for i, (log_utter, translations, label_utter) in enumerate(call): transcript = data_helpers.tokenize_and_lower( log_utter['transcript']) transcript_contexts += [transcript] speech_act = label_utter['speech_act'] sa_label_list = [] for sa in speech_act: sa_label_list += [ '%s_%s' % (sa['act'], attr) for attr in sa['attributes'] ] sa_label_list = sorted(set(sa_label_list)) # train_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])] train_utters += [(transcript, log_utter['speaker'], sa_label_list, log_utter['utter_index'])] sys.stderr.write('Done\n') test_utters = [] testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True) sys.stderr.write('Loading testing instances ... ') transcript_contexts = [] for call in testset: for i, (log_utter, translations, label_utter) in enumerate(call): try: translation = data_helpers.tokenize_and_lower( translations['translated'][0]['hyp']) except: translation = '' transcript_contexts += [translation] speech_act = label_utter['speech_act'] sa_label_list = [] for sa in speech_act: sa_label_list += [ '%s_%s' % (sa['act'], attr) for attr in sa['attributes'] ] sa_label_list = sorted(set(sa_label_list)) # test_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])] test_utters += [(translation, log_utter['speaker'], sa_label_list, log_utter['utter_index'])] # pprint(train_utters[:2]) # pprint(test_utters[:2]) # dump_corpus(train_utters, "dstc5_train.txt") # dump_corpus(test_utters, "dstc5_test.txt") # build vocabulary utters = [utter[0].split(' ') for utter in train_utters] max_sent_len = int(params['max_sent_len']) pad_utters = data_helpers.pad_sentences(utters, max_sent_len) vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters) print("vocabulary size: %d" % len(vocabulary)) # build input train_inputs = data_helpers.build_input_data(pad_utters, vocabulary) utters = [utter[0].split(' ') for utter in test_utters] pad_utters = data_helpers.pad_sentences(utters, max_sent_len) test_inputs = data_helpers.build_input_data(pad_utters, vocabulary) # make windowed input data as context train_inputs = data_helpers.build_windowed_input(train_inputs, ctx_len) test_inputs = data_helpers.build_windowed_input(test_inputs, ctx_len) # build labels sa_train_labels = [utter[2] for utter in train_utters] sa_test_labels = [utter[2] for utter in test_utters] label_binarizer = preprocessing.MultiLabelBinarizer() label_binarizer.fit(sa_train_labels + sa_test_labels) train_labels = label_binarizer.transform(sa_train_labels) test_labels = label_binarizer.transform(sa_test_labels) # split speakers into 
two sets tourist_train_indices = [ i for i, utter in enumerate(train_utters) if utter[1].lower() == 'tourist' ] guide_train_indices = [ i for i, utter in enumerate(train_utters) if utter[1].lower() == 'guide' ] tourist_test_indices = [ i for i, utter in enumerate(test_utters) if utter[1].lower() == 'tourist' ] guide_test_indices = [ i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide' ] np.random.shuffle(tourist_train_indices) np.random.shuffle(guide_train_indices) # np.random.shuffle(tourist_test_indices) # np.random.shuffle(guide_test_indices) tourist_train_inputs = train_inputs[tourist_train_indices] tourist_train_labels = train_labels[tourist_train_indices] guide_train_inputs = train_inputs[guide_train_indices] guide_train_labels = train_labels[guide_train_indices] tourist_test_inputs = test_inputs[tourist_test_indices] tourist_test_labels = test_labels[tourist_test_indices] guide_test_inputs = test_inputs[guide_test_indices] guide_test_labels = test_labels[guide_test_indices] # load pre-trained word embeddings embedding_dim = int(params['embedding_dim']) embedding_matrix = data_helpers.load_embedding( vocabulary, embedding_dim=embedding_dim, embedding=params['embedding']) run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer, tourist_train_inputs, tourist_train_labels, tourist_test_inputs, tourist_test_labels) run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer, guide_train_inputs, guide_train_labels, guide_test_inputs, guide_test_labels) print("")
model_dir = 'word2vec_models'
model_name = "{:d}features_{:d}minwords_{:d}context_{:s}".format(embedding_dim, min_word_count, context, model_variation)
print("Loading word2vec model: " + model_name)
model_name = os.path.join(model_dir, model_name)
eb = word2vec.Word2Vec.load(model_name)

print("Loading CNN")
arch = 'imdb_' + model_variation + '7_arch.json'
weights = 'imdb_' + model_variation + '7.h5'
model = model_from_json(open(arch).read())
model.load_weights(weights)

print("Padding sentences")
pad_size = model.input_shape[1]
sentences = data_helpers.pad_sentences(sentences, sequence_length=pad_size)

x = []
sentence = sentences[0]
for word in sentence:
    try:
        vect = eb[word]
    except:
        vect = eb['<PAD/>']
    x.append(vect)
x = np.asarray(x)
pred = model.predict_classes(x, batch_size=1)
print(pred)
# for sentence in sentences:
#     temp = []
def main(argv): parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task') parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='') parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='') parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='') parser.add_argument('--roletype', dest='roletype', action='store', choices=['guide', 'tourist'], required=True, help='speaker') args = parser.parse_args() threshold_predictor = None train_utters = [] trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True) sys.stderr.write('Loading training instances ... ') for call in trainset: for (log_utter, translations, label_utter) in call: if log_utter['speaker'].lower() != args.roletype: continue transcript = data_helpers.tokenize_and_lower(log_utter['transcript']) speech_act = label_utter['speech_act'] sa_label_list = [] for sa in speech_act: sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']] sa_label_list = sorted(set(sa_label_list)) train_utters += [(transcript, log_utter['speaker'], sa_label_list)] sys.stderr.write('Done\n') test_utters = [] testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True) sys.stderr.write('Loading testing instances ... ') for call in testset: for (log_utter, translations, label_utter) in call: if log_utter['speaker'].lower() != args.roletype: continue try: translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp']) except: translation = '' speech_act = label_utter['speech_act'] sa_label_list = [] for sa in speech_act: sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']] sa_label_list = sorted(set(sa_label_list)) test_utters += [(translation, log_utter['speaker'], sa_label_list)] pprint(train_utters[:2]) pprint(test_utters[:2]) # load parameters params = data_helpers.load_params("parameters/cnn.txt") pprint(params) num_epochs = int(params['num_epochs']) validation_split = float(params['validation_split']) batch_size = int(params['batch_size']) multilabel = params['multilabel']=="true" # build vocabulary sents = [utter[0].split(' ') for utter in train_utters] max_sent_len = int(params['max_sent_len']) pad_sents = data_helpers.pad_sentences(sents, max_sent_len) vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_sents) print("vocabulary size: %d" % len(vocabulary)) # params['max_sent_len'] = max_sent_len # build inputs train_inputs = data_helpers.build_input_data(pad_sents, vocabulary) test_sents = [utter[0].split(' ') for utter in test_utters] test_pad_sents = data_helpers.pad_sentences(test_sents, max_sent_len) test_inputs = data_helpers.build_input_data(test_pad_sents, vocabulary) # build labels sa_train_labels = [utter[2] for utter in train_utters] sa_test_labels = [utter[2] for utter in test_utters] label_binarizer = preprocessing.MultiLabelBinarizer() label_binarizer.fit(sa_train_labels+sa_test_labels) train_labels = label_binarizer.transform(sa_train_labels) test_labels = label_binarizer.transform(sa_test_labels) # split and shuffle data indices = np.arange(train_inputs.shape[0]) np.random.shuffle(indices) train_inputs = train_inputs[indices] train_labels = train_labels[indices] num_validation = int(validation_split * train_inputs.shape[0]) # x_train = train_inputs[:-num_validation] # y_train = 
train_labels[:-num_validation] # x_val = train_inputs[-num_validation:] # y_val = train_labels[-num_validation:] x_train = train_inputs y_train = train_labels x_test = test_inputs y_test = test_labels # construct a pytorch data_loader x_train = torch.from_numpy(x_train).long() y_train = torch.from_numpy(y_train).float() dataset_tensor = data_utils.TensorDataset(x_train, y_train) train_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=False) x_test = torch.from_numpy(x_test).long() y_test = torch.from_numpy(y_test).long() dataset_tensor = data_utils.TensorDataset(x_test, y_test) test_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=False) # load pre-trained word embeddings embedding_dim = int(params['embedding_dim']) embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding']) # load model model = SluConvNet(params, embedding_matrix, len(vocabulary), y_train.shape[1]) if torch.cuda.is_available(): model = model.cuda() learning_rate = float(params['learning_rate']) optimizer = optim.Adam(model.parameters(), lr=learning_rate) loss_fn = nn.MultiLabelSoftMarginLoss() # loss_fn = nn.BCEWithLogitsLoss() for epoch in range(num_epochs): model.train() # set the model to training mode (apply dropout etc) for i, (inputs, labels) in enumerate(train_loader): inputs, labels = autograd.Variable(inputs), autograd.Variable(labels) if torch.cuda.is_available(): inputs, labels = inputs.cuda(), labels.cuda() preds = model(inputs) if torch.cuda.is_available(): preds = preds.cuda() loss = loss_fn(preds, labels) optimizer.zero_grad() loss.backward() optimizer.step() if i % 100 == 0: print("current loss: %.4f" % loss) model.eval() # set the model to evaluation mode # if threshold_predictor is None: threshold_predictor = train_threshold(model, train_loader, y_train.numpy()) # count_predictor = train_count(model, train_loader, y_train.numpy()) true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel, threshold_predictor) # true_acts, pred_acts, metrics = evaluate_count(model, label_binarizer, test_loader, y_test, multilabel, count_predictor) print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2])) # end of training true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel) print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2])) with open(("pred_result_%s.txt" % args.roletype), "w") as f: for pred_act, true_act in zip(pred_acts, true_acts): f.write("pred: %s\ntrue: %s\n\n" % (', '.join(pred_act), ', '.join(true_act)))
def sent_embedding(sent): sent = remove_stop_word(sent) sent = split_sentence([sent]) sent = pad_sentences(sent) sent_vec = sentence_word2vec(sent) return sent_vec
print("") # CHANGE THIS: Load data. Load your own data here x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) y_test = np.argmax(y_test, axis=1) vocabulary = pickle.load( open( os.path.abspath(os.path.join(FLAGS.checkpoint_dir, "..", "vocab.txt")), "rb")) sequence_length = pickle.load( open(os.path.abspath(os.path.join(FLAGS.checkpoint_dir, "..", "len.txt")), "rb")) # Map data into vocabulary x_pad, _ = data_helpers.pad_sentences(x_raw, sequence_length) x_test = np.array([[vocabulary.get(word, 0) for word in sentence] for sentence in x_pad]) x_readable = np.array([[word.encode('utf-8') for word in sentence] for sentence in x_raw]) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement)
num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1)) num_total += len(batchY) dev_acc = num_correct * 100 / float(num_total) print('Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f \ --- Dev Accuracy thus far: %.3f' % (iteration, train_time, train_acc, dev_acc), file=logs) if __name__ == '__main__': mix_model = mix_cnn() print('Loading data...') # word2vec = data_helpers.load_google_word2vec('/Users/guo/TrainData/google300/GoogleNews-vectors-negative300.bin') word2vec = data_helpers.load_pretrained_word2vec('VecForMR_.txt') sentences, labels = data_helpers.load_data_and_labels() sentences_padded = data_helpers.pad_sentences(sentences) x, y = data_helpers.build_input_data_with_word2vec(sentences_padded, labels, word2vec) mix_model.dic = data_helpers.buildGram(sentences, min1=6, min2=7) mix_model.initTheta() x_sent, mix_model.idf = data_helpers.buildDocsTFIDF(mix_model.dic, sentences) x_sent = np.array(x_sent) # randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) cv = 1 cv_length = len(y)/10 sample_test = shuffle_indices[cv_length * cv:cv_length * (cv + 1)] sample_train = np.concatenate((shuffle_indices[:cv_length * cv], shuffle_indices[cv_length * (cv + 1):])) # x_shuffled = x[shuffle_indices] # y_shuffled = y[shuffle_indices]
def main(_): text, y_test = data_helpers.load_test_and_labels(FLAGS.test_file) with open('vocab_index.pkl', 'rb') as tr_file: train_int_to_vab = pickle.load(tr_file) #print (train_int_to_vab) train_to_int = {word: word_i for word_i, word in train_int_to_vab.items()} test_ids = [[ train_to_int.get(term, train_to_int['<UNK>']) for term in line ] for line in text] x_test = data_helpers.pad_sentences(test_ids, 20) print(x_test[:3]) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): # Load the saved meta graph and restore variables saver = tf.train.import_meta_graph( "{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) # Get the placeholders from the graph by name input_x = graph.get_operation_by_name("input_x").outputs[0] # input_y = graph.get_operation_by_name("input_y").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] # Tensors we want to evaluate predictions = graph.get_operation_by_name( "output/predictions").outputs[0] # Generate batches for one epoch batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1) # Collect the predictions here all_predictions = [] batch_predictions = sess.run(predictions, { input_x: x_test, dropout_keep_prob: 1.0 }) all_predictions = np.concatenate( [all_predictions, batch_predictions]) #for x_test_batch in batches: print(all_predictions) # Print accuracy if y_test is defined if y_test is not None: correct_predictions = float(sum(all_predictions == y_test)) print("Total number of test examples: {}".format(len(y_test))) print("Accuracy: {:g}".format(correct_predictions / float(len(y_test)))) # Save the evaluation to a csv predictions_human_readable = np.column_stack( (np.array(text), all_predictions)) out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv") print("Saving evaluation to {0}".format(out_path)) with open(out_path, 'w') as f: csv.writer(f).writerows(predictions_human_readable)
def main(_): # Load data print("Loading data...") x_, y = data_helpers.load_data_and_labels(FLAGS.train_file) train_int_to_vab, train_to_int = data_helpers.cret_dict(x_) #保存对应的词和词索引 #print (train_int_to_vab) embeding_matric = data_helpers.word_matric(train_to_int) #存储所有字的文件,以便测试加载 pickle.dump(train_int_to_vab, open('./vocab_index.pkl', 'wb')) #print(train_int_to_vab) train_ids = [[ train_to_int.get(term, train_to_int['<UNK>']) for term in line ] for line in x_] x_ = data_helpers.pad_sentences(train_ids, 20) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x_[shuffle_indices] y = np.array(y) y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) y_train = np_utils.to_categorical(y_train) y_dev = np_utils.to_categorical(y_dev) # Training # ================================================== with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(train_int_to_vab), embedding_size=FLAGS.embedding_dim, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, word_embedding_matrix=embeding_matric, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-3) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # Initialize all variables sess.run(tf.global_variables_initializer()) def train_step(x_batch, y_batch): """ A single training step """ feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy = sess.run([ train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy ], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) train_summary_writer.add_summary(summaries, step) def dev_step(x_batch, y_batch, writer=None): """ Evaluates model on a dev set """ feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } step, summaries, loss, accuracy = sess.run( [global_step, dev_summary_op, cnn.loss, cnn.accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) if writer: writer.add_summary(summaries, step) # Generate batches batches = data_helpers.batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) # Training loop. For each batch... for batch in batches: x_batch, y_batch = zip(*batch) train_step(x_batch, y_batch) current_step = tf.train.global_step(sess, global_step) if current_step % FLAGS.evaluate_every == 0: print("\nEvaluation:") dev_step(x_dev, y_dev, writer=dev_summary_writer) print("") if current_step % FLAGS.checkpoint_every == 0: path = saver.save(sess, checkpoint_prefix, global_step=current_step) print("Saved model checkpoint to {}\n".format(path))
def main(argv): parser = argparse.ArgumentParser( description='CNN baseline for DSTC5 SAP Task') parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='') parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='') parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='') args = parser.parse_args() train_utters = [] trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True) sys.stderr.write('Loading training instances ... ') for call in trainset: context_utters = [] context_utter_str = '<PAD/>' context_labels = [] context_label = ['INI_OPENING'] last_speaker = None for (log_utter, translations, label_utter) in call: transcript = data_helpers.tokenize_and_lower( log_utter['transcript']) speech_act = label_utter['speech_act'] sa_label_list = [] for sa in speech_act: sa_label_list += [ '%s_%s' % (sa['act'], attr) for attr in sa['attributes'] ] sa_label_list = sorted(set(sa_label_list)) if last_speaker is not None and log_utter[ 'speaker'] != last_speaker: if len(context_utters) > 0: context_utter_str = ' <pause> '.join(context_utters) context_label = context_labels[-1] else: context_utter_str = '<PAD/>' context_label = ['INI_OPENING'] context_utters = [] context_labels = [] last_speaker = None if last_speaker is None or log_utter['speaker'] == last_speaker: context_utters += [transcript] # cumulate context utters context_labels += [sa_label_list] last_speaker = log_utter['speaker'] train_utters += [ (transcript, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], context_label) ] # train_utters += [(transcript, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)] sys.stderr.write('Done\n') test_utters = [] testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True) sys.stderr.write('Loading testing instances ... 
') for call in testset: context_utters = [] context_utter_str = '<PAD/>' context_labels = [] context_label = ['INI_OPENING'] last_speaker = None for (log_utter, translations, label_utter) in call: try: translation = data_helpers.tokenize_and_lower( translations['translated'][0]['hyp']) except: translation = '' speech_act = label_utter['speech_act'] sa_label_list = [] for sa in speech_act: sa_label_list += [ '%s_%s' % (sa['act'], attr) for attr in sa['attributes'] ] sa_label_list = sorted(set(sa_label_list)) if last_speaker is not None and log_utter[ 'speaker'] != last_speaker: if len(context_utters) > 0: context_utter_str = ' <pause> '.join(context_utters) context_label = context_labels[-1] else: context_utter_str = '' context_label = ['INI_OPENING'] context_utters = [] context_labels = [] last_speaker = None if last_speaker is None or log_utter['speaker'] == last_speaker: context_utters += [translation] # cumulate context utters context_labels += [sa_label_list] last_speaker = log_utter['speaker'] test_utters += [ (translation, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], context_label) ] # test_utters += [(translation, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)] # pprint(train_utters[:2]) # pprint(test_utters[:2]) # dump_corpus(train_utters, "dstc5_train.txt") # dump_corpus(test_utters, "dstc5_test.txt") # load parameters params = data_helpers.load_params("parameters/cnn.txt") pprint(params) # build vocabulary utters = [utter[0].split(' ') for utter in train_utters] ctx_utters = [utter[1].split(' ') for utter in train_utters] print("max context utter length: %d " % max([len(ctx_utter) for ctx_utter in ctx_utters])) max_sent_len = int(params['max_sent_len']) pad_utters = data_helpers.pad_sentences(utters, max_sent_len) pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len) vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_ctx_utters) print("vocabulary size: %d" % len(vocabulary)) # build input train_inputs = data_helpers.build_input_data(pad_utters, vocabulary) train_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary) utters = [utter[0].split(' ') for utter in test_utters] ctx_utters = [utter[1].split(' ') for utter in test_utters] pad_utters = data_helpers.pad_sentences(utters, max_sent_len) pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len) test_inputs = data_helpers.build_input_data(pad_utters, vocabulary) test_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary) # build labels sa_train_labels = [utter[3] for utter in train_utters] sa_test_labels = [utter[3] for utter in test_utters] sa_train_ctx_labels = [utter[5] for utter in train_utters] sa_test_ctx_labels = [utter[5] for utter in test_utters] label_binarizer = preprocessing.MultiLabelBinarizer() label_binarizer.fit(sa_train_labels + sa_test_labels) train_labels = label_binarizer.transform(sa_train_labels) test_labels = label_binarizer.transform(sa_test_labels) train_ctx_labels = label_binarizer.transform(sa_train_ctx_labels) test_ctx_labels = label_binarizer.transform(sa_test_ctx_labels) # split speakers into two sets tourist_train_indices = [ i for i, utter in enumerate(train_utters) if utter[2].lower() == 'tourist' ] guide_train_indices = [ i for i, utter in enumerate(train_utters) if utter[2].lower() == 'guide' ] tourist_test_indices = [ i for i, utter in enumerate(test_utters) if utter[2].lower() == 'tourist' ] guide_test_indices = [ i for i, utter in 
enumerate(test_utters) if utter[2].lower() == 'guide' ] np.random.shuffle(tourist_train_indices) np.random.shuffle(guide_train_indices) tourist_train_inputs = train_inputs[tourist_train_indices] tourist_train_ctx_inputs = train_ctx_inputs[tourist_train_indices] tourist_train_labels = train_labels[tourist_train_indices] tourist_train_ctx_labels = train_ctx_labels[tourist_train_indices] guide_train_inputs = train_inputs[guide_train_indices] guide_train_ctx_inputs = train_ctx_inputs[guide_train_indices] guide_train_labels = train_labels[guide_train_indices] guide_train_ctx_labels = train_ctx_labels[guide_train_indices] tourist_test_inputs = test_inputs[tourist_test_indices] tourist_test_ctx_inputs = test_ctx_inputs[tourist_test_indices] tourist_test_labels = test_labels[tourist_test_indices] tourist_test_ctx_labels = test_ctx_labels[tourist_test_indices] guide_test_inputs = test_inputs[guide_test_indices] guide_test_ctx_inputs = test_ctx_inputs[guide_test_indices] guide_test_labels = test_labels[guide_test_indices] guide_test_ctx_labels = test_ctx_labels[guide_test_indices] # load pre-trained word embeddings embedding_dim = int(params['embedding_dim']) embedding_matrix = data_helpers.load_embedding( vocabulary, embedding_dim=embedding_dim, embedding=params['embedding']) run_slu_task(embedding_matrix, vocabulary, label_binarizer, tourist_train_inputs, tourist_train_ctx_inputs, tourist_train_labels, tourist_train_ctx_labels, tourist_test_inputs, tourist_test_ctx_inputs, tourist_test_labels, tourist_test_ctx_labels) run_slu_task(embedding_matrix, vocabulary, label_binarizer, guide_train_inputs, guide_train_ctx_inputs, guide_train_labels, guide_train_ctx_labels, guide_test_inputs, guide_test_ctx_inputs, guide_test_labels, guide_test_ctx_labels) print("")
def main(_): # Load data print("Loading data...") x_, y = data_helpers.build_train_data(FLAGS.label_file, FLAGS.train_file) train_int_to_vab, train_to_int = data_helpers.cret_dict(x_) #保存对应的词和词索引 #存储所有字的文件,以便测试加载 pickle.dump(train_int_to_vab, open('./vocab_index.pkl', 'wb')) train_ids = [[ train_to_int.get(term, train_to_int['<UNK>']) for term in line ] for line in x_] x_ = data_helpers.pad_sentences(train_ids, 20) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x_[shuffle_indices] y = np.array(y) y_shuffled = y[shuffle_indices] folids_list = data_helpers.cross_validation_split_for_smp( x_shuffled, y_shuffled) for i in range(10): if not os.path.exists('save_model/' + str(i) + '/'): os.makedirs(os.path.join('save_model', str(i))) else: continue for i in range(10): best_acc = 0.0 print(i) print('##################') x_train, y_train, x_dev, y_dev = folids_list[i] y_train = np_utils.to_categorical(y_train) y_dev = np_utils.to_categorical(y_dev) # ================================================== with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): rnn = TextRNN(sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(train_int_to_vab), batch_size=FLAGS.batch_size, embedding_size=FLAGS.embedding_dim, hidden_size=FLAGS.hidden_size, num_layers=FLAGS.num_layers #word_embedding_matrix=embeding_matric ) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-3) grads_and_vars = optimizer.compute_gradients(rnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # Initialize all variables sess.run(tf.global_variables_initializer()) def train_step(x_batch, y_batch): """ A single training step """ feed_dict = { rnn.input_x: x_batch, rnn.input_y: y_batch, rnn.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, loss, accuracy = sess.run( [train_op, global_step, rnn.loss, rnn.accuracy], feed_dict) return step, loss, accuracy def dev_step(x_batch, y_batch): """ Evaluates model on a dev set """ feed_dict = { rnn.input_x: x_batch, rnn.input_y: y_batch, rnn.dropout_keep_prob: 1.0 } step, loss, accuracy = sess.run( [global_step, rnn.loss, rnn.accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print('dev') print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) return accuracy def save_best_model(sess, path): path = saver.save(sess, path) for epoch in range(FLAGS.num_epochs): print('epoch', epoch) # Generate batches for batch_i, (x_batch, y_batch) in enumerate( data_helpers.get_batches(y_train, x_train, FLAGS.batch_size)): step, train_loss, train_accuracy = train_step( x_batch, y_batch) #print('step',step) if batch_i % FLAGS.evaluate_every == 0: time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, train_loss, train_accuracy)) #===================== accuracy = dev_step(x_dev, y_dev) if accuracy > best_acc: best_acc = accuracy print('save_model' + str(i) + '/best_model.ckpt') save_best_model( sess, 'save_model/' + str(i) + '/best_model.ckpt')
# Define Parameters tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") tf.flags.DEFINE_string("sentence", "the movie was bad", "sentence to classify") FLAGS = tf.flags.FLAGS ####################################################################################################################### # process the raw sentence new_review = data_helpers.clean_senetnce(FLAGS.sentence) # load vocabulary sentences, _ = data_helpers.load_data_and_labels() sequence_length = max(len(x) for x in sentences) sentences_padded = data_helpers.pad_sentences(sentences) vocabulary, vocabulary_inv = data_helpers.build_vocab(sentences_padded) num_padding = sequence_length - len(new_review) new_sentence = new_review + ["<PAD/>"] * num_padding # convert sentence to input matrix array = [] for word in new_sentence: try: word_vector=vocabulary[word] except KeyError: word_vector=vocabulary["<PAD/>"] array.append(word_vector) x=np.array([array])
def main(argv): parser = argparse.ArgumentParser( description='CNN baseline for DSTC5 SAP Task') parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='') parser.add_argument('--devset', dest='devset', action='store', metavar='DEVSET', required=True, help='') parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='') parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='') args = parser.parse_args() # load parameters params = data_helpers.load_params("parameters/cnn.txt") pprint(params) trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True) devset = dataset_walker.dataset_walker(args.devset, dataroot=args.dataroot, labels=True, translations=True) testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True) train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset( trainset, devset, testset) train_utters += dev_utters # pprint(train_utters[:2]) # pprint(test_utters[:2]) # dump_corpus(train_utters, "dstc5_train.txt") # dump_corpus(test_utters, "dstc5_test.txt") # build vocabulary utters = [[char for char in utter[0]] for utter in train_utters] max_sent_len = int(params['max_sent_len']) pad_utters = data_helpers.pad_sentences(utters, max_sent_len) vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters) print("vocabulary size: %d" % len(vocabulary)) # build input train_inputs = data_helpers.build_input_data(pad_utters, vocabulary) utters = [[char for char in utter[0]] for utter in test_utters] pad_utters = data_helpers.pad_sentences(utters, max_sent_len) test_inputs = data_helpers.build_input_data(pad_utters, vocabulary) # build labels sa_train_labels = [utter[2] for utter in train_utters] sa_test_labels = [utter[2] for utter in test_utters] label_binarizer = preprocessing.MultiLabelBinarizer() label_binarizer.fit(sa_train_labels + sa_test_labels) train_labels = label_binarizer.transform(sa_train_labels) test_labels = label_binarizer.transform(sa_test_labels) # split speakers into two sets tourist_train_indices = [ i for i, utter in enumerate(train_utters) if utter[1].lower() == 'tourist' ] guide_train_indices = [ i for i, utter in enumerate(train_utters) if utter[1].lower() == 'guide' ] tourist_test_indices = [ i for i, utter in enumerate(test_utters) if utter[1].lower() == 'tourist' ] guide_test_indices = [ i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide' ] np.random.shuffle(tourist_train_indices) np.random.shuffle(guide_train_indices) # np.random.shuffle(tourist_test_indices) # np.random.shuffle(guide_test_indices) tourist_train_inputs = train_inputs[tourist_train_indices] tourist_train_labels = train_labels[tourist_train_indices] guide_train_inputs = train_inputs[guide_train_indices] guide_train_labels = train_labels[guide_train_indices] tourist_test_inputs = test_inputs[tourist_test_indices] tourist_test_labels = test_labels[tourist_test_indices] guide_test_inputs = test_inputs[guide_test_indices] guide_test_labels = test_labels[guide_test_indices] # load pre-trained word embeddings embedding_dim = int(params['embedding_dim']) embedding_matrix = data_helpers.load_embedding( vocabulary, embedding_dim=embedding_dim, embedding=params['embedding']) run_slu_task(embedding_matrix, vocabulary, label_binarizer, tourist_train_inputs, tourist_train_labels, tourist_test_inputs, tourist_test_labels) 
run_slu_task(embedding_matrix, vocabulary, label_binarizer, guide_train_inputs, guide_train_labels, guide_test_inputs, guide_test_labels) print("")
def main(argv): parser = argparse.ArgumentParser( description='CNN baseline for DSTC5 SAP Task') parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='') parser.add_argument('--devset', dest='devset', action='store', metavar='DEVSET', required=True, help='') parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='') parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='') args = parser.parse_args() # load parameters params = data_helpers.load_params("parameters/cnn.txt") pprint(params) trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True) devset = dataset_walker.dataset_walker(args.devset, dataroot=args.dataroot, labels=True, translations=True) testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True) train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset_multitask( trainset, devset, testset) train_utters += dev_utters context_case = 1 # 여기다가 previous labels context 를 구성하는 코드를 작성하자! # 1) 이전 화행 N개 (speaker 구분안함) # 2) 이전 턴의 상대방 발화들의 모든 화행 (n개) if context_case == 1: pass else: pass # pprint(train_utters[:2]) # pprint(test_utters[:2]) # dump_corpus(train_utters, "dstc5_train.txt") # dump_corpus(test_utters, "dstc5_test.txt") # build vocabulary utters = [[char for char in utter[0]] for utter in train_utters] max_sent_len = int(params['max_sent_len']) pad_utters = data_helpers.pad_sentences(utters, max_sent_len) vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters) print("vocabulary size: %d" % len(vocabulary)) # build input train_inputs = data_helpers.build_input_data(pad_utters, vocabulary) utters = [[char for char in utter[0]] for utter in test_utters] pad_utters = data_helpers.pad_sentences(utters, max_sent_len) test_inputs = data_helpers.build_input_data(pad_utters, vocabulary) # build labels train_labels_category = [utter[3] for utter in train_utters] test_labels_category = [utter[3] for utter in test_utters] train_labels_attr = [utter[4] for utter in train_utters] test_labels_attr = [utter[4] for utter in test_utters] train_labels_sa = [utter[5] for utter in train_utters] test_labels_sa = [utter[5] for utter in test_utters] label_binarizer_category = preprocessing.MultiLabelBinarizer() label_binarizer_category.fit(train_labels_category + test_labels_category) label_binarizer_attr = preprocessing.MultiLabelBinarizer() label_binarizer_attr.fit(train_labels_attr + test_labels_attr) label_binarizer_sa = preprocessing.MultiLabelBinarizer() label_binarizer_sa.fit(train_labels_sa + test_labels_sa) train_labels_category = label_binarizer_category.transform( train_labels_category) test_labels_category = label_binarizer_category.transform( test_labels_category) train_labels_attr = label_binarizer_attr.transform(train_labels_attr) test_labels_attr = label_binarizer_attr.transform(test_labels_attr) train_labels_sa = label_binarizer_sa.transform(train_labels_sa) test_labels_sa = label_binarizer_sa.transform(test_labels_sa) # split speakers into two sets tourist_train_indices = [ i for i, utter in enumerate(train_utters) if utter[1].lower() == 'tourist' ] guide_train_indices = [ i for i, utter in enumerate(train_utters) if utter[1].lower() == 'guide' ] tourist_test_indices = [ i for i, utter in enumerate(test_utters) if utter[1].lower() == 'tourist' ] guide_test_indices = [ i for i, utter in enumerate(test_utters) 
if utter[1].lower() == 'guide' ] np.random.shuffle(tourist_train_indices) np.random.shuffle(guide_train_indices) # np.random.shuffle(tourist_test_indices) # np.random.shuffle(guide_test_indices) tourist_train_inputs = train_inputs[tourist_train_indices] tourist_train_labels_category = train_labels_category[ tourist_train_indices] tourist_train_labels_attr = train_labels_attr[tourist_train_indices] tourist_train_labels_sa = train_labels_sa[tourist_train_indices] tourist_train_labels = (tourist_train_labels_category, tourist_train_labels_attr, tourist_train_labels_sa) guide_train_inputs = train_inputs[guide_train_indices] guide_train_labels_category = train_labels_category[guide_train_indices] guide_train_labels_attr = train_labels_attr[guide_train_indices] guide_train_labels_sa = train_labels_sa[guide_train_indices] guide_train_labels = (guide_train_labels_category, guide_train_labels_attr, guide_train_labels_sa) tourist_test_inputs = test_inputs[tourist_test_indices] tourist_test_labels_category = test_labels_category[tourist_test_indices] tourist_test_labels_attr = test_labels_attr[tourist_test_indices] tourist_test_labels_sa = test_labels_sa[tourist_test_indices] tourist_test_labels = (tourist_test_labels_category, tourist_test_labels_attr, tourist_test_labels_sa) guide_test_inputs = test_inputs[guide_test_indices] guide_test_labels_category = test_labels_category[guide_test_indices] guide_test_labels_attr = test_labels_attr[guide_test_indices] guide_test_labels_sa = test_labels_sa[guide_test_indices] guide_test_labels = (guide_test_labels_category, guide_test_labels_attr, guide_test_labels_sa) # load pre-trained word embeddings embedding_dim = int(params['embedding_dim']) embedding_matrix = data_helpers.load_embedding( vocabulary, embedding_dim=embedding_dim, embedding=params['embedding']) run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa, tourist_train_inputs, tourist_train_labels, tourist_test_inputs, tourist_test_labels) run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa, guide_train_inputs, guide_train_labels, guide_test_inputs, guide_test_labels)
FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") x_raw = data_helpers.load_test_data( '/Users/Winnerineast/Documents/haodaifu/NewData/tobetrained.csv') # Map data into vocabulary vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") vocabulary, vocabulary_inv, max_length = data_helpers.restore_vocabulary( vocab_path) sentences_padded, tmp_length = data_helpers.pad_sentences(x_raw, max_length) x_test, y_test = data_helpers.build_input_data(sentences_padded, None, vocabulary) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default():