query_maxlen = max(map(len, (x for x, _ in train_data + test_data)))
print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Query max length:', query_maxlen, 'words')
print('Number of training data:', len(train_data))
print('Number of test data:', len(test_data))
print('-')
print('Here\'s what a "data" tuple looks like (query, answer):')
print(train_data[0])
print('-')
print('Vectorizing the word sequences...')
print('Number of entities', len(entities))
queries_train, answers_train = vectorize(train_data, w2i, query_maxlen, w2i_label)
queries_test, answers_test = vectorize(test_data, w2i, query_maxlen, w2i_label)
# queries_dev, answers_dev = vectorize(dev_data, w2i, query_maxlen, w2i_label)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
print('answers: binary (1 or 0) tensor of shape (samples, vocab_size)')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)

# max_mem_len = 3
mem_key_len = 2  # ['Blade Runner', 'directed_by']
mem_val_len = 1  # ['Ridley Scott']
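# A minimal sketch of what a vectorize() helper along these lines might do,
# assuming queries are zero-padded to query_maxlen via w2i and each example has
# a single answer label that is one-hot encoded over w2i_label. The repo's
# actual helper may differ; this only illustrates the shapes printed above.
import numpy as np

def vectorize_sketch(data, w2i, query_maxlen, w2i_label):
    queries = np.zeros((len(data), query_maxlen), dtype=np.int64)
    answers = np.zeros((len(data), len(w2i_label)), dtype=np.float32)
    for i, (query, answer) in enumerate(data):
        ids = [w2i.get(w, 0) for w in query][:query_maxlen]  # 0 used as pad/unk id
        queries[i, :len(ids)] = ids
        answers[i, w2i_label[answer]] = 1
    return queries, answers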
from sklearn import svm
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import time

import process_data  # project-local preprocessing helpers

#TODO: fix up preprocessing data code and upload it as a separate file to WMD as well
#TODO: debug low dimensional approximation
#TODO: run more extensive tests on performance of mSDA with and without low dimensional approximation

#fetch training documents from 20 newsgroups dataset in random order
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
all_20news = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)
all_raw_data = all_20news.data  #all the data
all_data_stringsList = process_data.createWordLists(process_data.unicodeToString(all_raw_data))
all_data_words = process_data.preprocess_by_word(all_data_stringsList)
all_labels = all_20news.target  #all the labels
all_full_data = process_data.vectorize(all_data_words)  #convert to bag of words
all_full_data = all_full_data.transpose()  #so rows are data, columns are features (format we predominantly use)

num_mostCommon = 800
all_mostCommonFeatures_data = process_data.getMostCommonFeatures(all_full_data, num_mostCommon)
train_data, train_labels, test_data, test_labels = process_data.splitTrainTest(all_mostCommonFeatures_data, all_labels)
print "Shape of training data: ", train_data.shape
print "Shape of test data: ", test_data.shape

#classify with linear SVM
#transpose because sklearn requires (#data x #features)
#note: SVC() defaults to an RBF kernel, so request the linear kernel explicitly
clf_baseline = svm.SVC(kernel='linear').fit(train_data.transpose(), train_labels)
baseline_preds = clf_baseline.predict(test_data.transpose())
base_accuracy = np.mean(baseline_preds == test_labels)
print "Accuracy with linear SVM on basic representation: ", base_accuracy
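# For reference, a roughly equivalent bag-of-words SVM baseline can be built
# entirely from scikit-learn pieces. This is only a sketch, not the project's
# process_data pipeline: the tokenization, feature selection, and train/test
# split here are assumptions, so the accuracy will not match exactly.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

news = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)
bow = CountVectorizer(max_features=800, stop_words='english').fit_transform(news.data)
X_tr, X_te, y_tr, y_te = train_test_split(bow, news.target, test_size=0.2, random_state=42)
clf_sketch = LinearSVC().fit(X_tr, y_tr)
print "Sketch baseline accuracy: ", clf_sketch.score(X_te, y_te)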
def __init__(self, max_mem_size, batch_size, device):
    '''max_mem_size means how many memories can be visited for one query
    '''
    self.device = device

    train_data = load_pickle('pickle/mov_task1_qa_pipe_train.pickle')
    test_data = load_pickle('pickle/mov_task1_qa_pipe_test.pickle')
    dev_data = load_pickle('pickle/mov_task1_qa_pipe_dev.pickle')
    kv_pairs = load_pickle('pickle/mov_kv_pairs.pickle')
    train_k = np.array(load_pickle('pickle/mov_train_k.pickle'))
    train_v = np.array(load_pickle('pickle/mov_train_v.pickle'))
    test_k = np.array(load_pickle('pickle/mov_test_k.pickle'))
    test_v = np.array(load_pickle('pickle/mov_test_v.pickle'))
    dev_k = np.array(load_pickle('pickle/mov_dev_k.pickle'))
    dev_v = np.array(load_pickle('pickle/mov_dev_v.pickle'))
    entities = load_pickle('pickle/mov_entities.pickle')
    entity_size = len(entities)  # TODO
    vocab = load_pickle('pickle/mov_vocab.pickle')
    self.vocab_size = len(vocab)
    stopwords = load_pickle('pickle/mov_stopwords.pickle')
    w2i = load_pickle('pickle/mov_w2i.pickle')
    i2w = load_pickle('pickle/mov_i2w.pickle')
    w2i_label = load_pickle('pickle/mov_w2i_label.pickle')
    i2w_label = load_pickle('pickle/mov_i2w_label.pickle')

    # filter data which have zero KVs or too many KVs
    print('before filter:', len(train_data), len(test_data))
    train_data, train_k, train_v = filter_data(train_data, train_k, train_v, 0, 100)
    test_data, test_k, test_v = filter_data(test_data, test_k, test_v, 0, 100)
    dev_data, dev_k, dev_v = filter_data(dev_data, dev_k, dev_v, 0, 100)
    print('after filter:', len(train_data), len(test_data))

    query_maxlen = max(map(len, (x for x, _ in train_data + test_data)))
    print('-')
    print('Vocab size:', self.vocab_size, 'unique words')
    print('Query max length:', query_maxlen, 'words')
    print('Number of training data:', len(train_data))
    print('Number of test data:', len(test_data))
    print('-')
    print('Here\'s what a "data" tuple looks like (query, answer):')
    print(train_data[0])
    print('-')
    print('Vectorizing the word sequences...')
    print('Number of entities', len(entities))
    # TODO, change the vectorize function
    self.queries_train, self.answers_train = vectorize(train_data, w2i, query_maxlen, w2i_label)
    self.queries_test, self.answers_test = vectorize(test_data, w2i, query_maxlen, w2i_label)
    # queries_dev, answers_dev = vectorize(dev_data, w2i, query_maxlen, w2i_label)
    print('-')
    print('queries: integer tensor of shape (samples, max_length)')
    print('queries_train shape:', self.queries_train.shape)
    print('queries_test shape:', self.queries_test.shape)
    print('-')
    print('answers: binary (1 or 0) tensor of shape (samples, len(w2i_label))')
    print('answers_train shape:', self.answers_train.shape)
    print('answers_test shape:', self.answers_test.shape)

    # max_mem_len = 3
    self.query_max_len = query_maxlen
    self.max_mem_size = max_mem_size
    self.mem_key_len = 2  # ['Blade Runner', 'directed_by']
    self.mem_val_len = 1  # ['Ridley Scott']
    self.vec_train_k = vectorize_kv(train_k, self.mem_key_len, self.max_mem_size, w2i)
    self.vec_train_v = vectorize_kv(train_v, self.mem_val_len, self.max_mem_size, w2i)
    self.vec_test_k = vectorize_kv(test_k, self.mem_key_len, self.max_mem_size, w2i)
    self.vec_test_v = vectorize_kv(test_v, self.mem_val_len, self.max_mem_size, w2i)
    print('vec_k', self.vec_train_k.shape)
    print('vec_v', self.vec_train_v.shape)
    assert len(self.vec_train_k) == len(self.queries_train)
    assert len(self.vec_test_k) == len(self.queries_test)

    self.batch_size = batch_size
    print("The batch size is %d." % self.batch_size)
    self.num_qa_pairs_train = len(self.vec_train_k)
    self.num_steps = int(self.num_qa_pairs_train / self.batch_size)  # drop the tail
    self.current_step = -1  # current_step should be in [0, num_steps-1]
    print("There are %d steps in one epoch" % self.num_steps)
    self.answer_set_size = len(w2i_label)
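
# A hypothetical next_batch() method, sketched only to show how current_step,
# num_steps, and batch_size set up in __init__ are meant to interact; the
# repo's real batching method may slice or shuffle the data differently.
def next_batch(self):
    # advance the step counter, wrapping around at the end of an epoch
    self.current_step = (self.current_step + 1) % self.num_steps
    s = self.current_step * self.batch_size
    e = s + self.batch_size
    return (self.vec_train_k[s:e], self.vec_train_v[s:e],
            self.queries_train[s:e], self.answers_train[s:e])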
print('Build dictionary..')
word_dict = build_dict(train_d + train_q)
entity_markers = list(set([w for w in word_dict.keys() if w.startswith('@entity')] + train_a))
entity_markers = ['<unk_entity>'] + entity_markers
entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
print('Entity markers: %d' % len(entity_dict))
num_labels = len(entity_dict)

doc_maxlen = max(map(len, (d for d in train_d)))
query_maxlen = max(map(len, (q for q in train_q)))
print('doc_maxlen:', doc_maxlen, ', q_maxlen:', query_maxlen)

v_train_d, v_train_q, v_train_y, _ = vectorize(train_d, train_q, train_a, word_dict, entity_dict, doc_maxlen, query_maxlen)
v_dev_d, v_dev_q, v_dev_y, _ = vectorize(dev_d, dev_q, dev_a, word_dict, entity_dict, doc_maxlen, query_maxlen)
print('vectorized shape')
print(v_train_d.shape, v_train_q.shape, v_train_y.shape)
print(v_dev_d.shape, v_dev_q.shape, v_dev_y.shape)

vocab_size = max(word_dict.values()) + 1
print('vocab_size:', vocab_size)
embd_size = 100
rnn_half_hidden_size = 64
glove_embd_w = load_glove_weights('./dataset', 100, vocab_size, word_dict)

model = Net(vocab_size, embd_size, rnn_half_hidden_size, glove_embd_w, doc_maxlen, query_maxlen, len(entity_dict))
print(model.summary())
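# A minimal sketch of a build_dict() along these lines, assuming it simply
# assigns an integer id to every word seen in the documents and queries,
# with id 0 reserved for padding/unknown words (consistent with
# vocab_size = max(word_dict.values()) + 1 above). The actual helper may
# order words differently or cap the vocabulary size.
from collections import Counter

def build_dict_sketch(sentences):
    counter = Counter(w for sent in sentences for w in sent)
    return {w: i + 1 for i, (w, _) in enumerate(counter.most_common())}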
test_k = np.array(load_pickle('pickle/mov_test_k.pickle'))
test_v = np.array(load_pickle('pickle/mov_test_v.pickle'))

# filter data which have zero KVs or too many KVs
print('before filter:', len(test_data))
test_data, test_k, test_v = filter_data(test_data, test_k, test_v, 0, 100)
print('after filter:', len(test_data))

vocab = load_pickle('pickle/mov_vocab.pickle')
vocab_size = len(vocab)
w2i = load_pickle('pickle/mov_w2i.pickle')
i2w = load_pickle('pickle/mov_i2w.pickle')
w2i_label = load_pickle('pickle/mov_w2i_label.pickle')
i2w_label = load_pickle('pickle/mov_i2w_label.pickle')

queries_test, answers_test = vectorize(test_data, w2i, max_query_len, w2i_label, True)
vec_test_k = vectorize_kv(test_k, 2, max_mem_size, w2i)
vec_test_v = vectorize_kv(test_v, 1, max_mem_size, w2i)

model = load_model(model_name)
# ret = model.evaluate([vec_test_k, vec_test_v, queries_test], answers_test, verbose=1)
# print('=====result=====')
# print('loss: {:.5f}, acc: {:.5f}'.format(ret[0], ret[1]))

print('=====wrong examples=====')
pred = model.predict([vec_test_k, vec_test_v, queries_test], batch_size=32, verbose=1)
wrong_ct = 0
for i, (p, a) in enumerate(zip(pred, answers_test)):
    p_id = np.argmax(p)
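    # The loop is cut off above; a plausible continuation (an assumption, not
    # the repo's actual code) would compare the argmax of each prediction with
    # the gold label, assuming answers_test is one-hot over w2i_label and
    # i2w_label maps label ids back to their answer strings:
    a_id = np.argmax(a)
    if p_id != a_id:
        wrong_ct += 1
        print('Q:', test_data[i][0])
        print('predicted:', i2w_label[p_id], ', gold:', i2w_label[a_id])
print('wrong: %d / %d' % (wrong_ct, len(answers_test)))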