def main():
    """Embed the SwDA train/test utterances with a saved model and pickle them.

    Reads ``model_path`` and ``dataset_path`` from the enclosing module,
    embeds the ``.train`` and ``.test`` splits, and writes both (X, y)
    pairs into one pickle file named after the model.

    Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
    """
    model, utt_train = load_all_models(model_path)
    path = dataset_path
    train_x, train_y = grab_data(path + '.train', model)
    test_x, test_y = grab_data(path + '.test', model)
    # 'with' guarantees the file is closed even if a dump raises
    # (original used open()/close(), leaking the handle on error).
    # Protocol -1 = highest available pickle protocol.
    with open('swda_embedded_' + model_path.split('/')[-1] + '.pkl', 'wb') as f:
        pkl.dump((train_x, train_y), f, -1)
        pkl.dump((test_x, test_y), f, -1)
def evaluate_model(embedding_model_location, with_context=False, f=e_add, aggregated_tagset=False):
    """Train and score dialogue-act classifiers on embedded utterances.

    Parameters
    ----------
    embedding_model_location : path of the saved utterance-embedding model
        passed to ``load_all_models``.
    with_context : if True, mix each utterance with its context via ``f``
        (presumably the previous utterance — TODO confirm against the
        represent_mix_* helpers).
    f : combination function used only when ``with_context`` is True
        (e.g. ``e_add`` or ``concat``).
    aggregated_tagset : forwarded to ``encode_tags``; presumably collapses
        the tag set into coarser classes — verify in encode_tags.

    Prints KNN / NB / MLP accuracies; returns nothing.
    """
    # load training and test data
    train_utt, train_Y, test_utt, test_Y = load_data()
    # uncomment this to find baseline scores
    # baseline_scores(train_utt, train_Y, test_utt, test_Y)
    print "Creating representations"
    # Load utterance embedding models
    embedding_model, _ = load_all_models(embedding_model_location)
    # represent utterances in some way,
    # train_X = represent_simple(train_utt, embedding_model)
    # test_X = represent_simple(test_utt, embedding_model)
    if with_context:
        # ---------- lqrz: add or concatenate the previous utterance
        # alternative combination functions:
        # f = concat
        # f = e_add
        # NOTE(review): train uses a tag lookup while test embeds the raw
        # utterances — asymmetry looks intentional but is worth confirming.
        test_X = represent_mix_simple(test_utt, test_Y, embedding_model, f)
        train_X = represent_mix_lookup(train_Y, embedding_model, f)
        # ----------
    else:
        train_X = represent_lookup(train_Y, embedding_model)
        test_X = represent_simple(test_utt, embedding_model)
    # encode tags — NB: this re-binds train_Y/test_Y, so it must run AFTER
    # the represent_* calls above, which consume the raw (string) tags.
    train_Y, test_Y = encode_tags(train_Y, test_Y, aggregated_tagset=aggregated_tagset)
    # print np.array(train_X).shape, np.array(test_X).shape, np.array(train_Y).shape, np.array(test_Y).shape
    print "Training classifiers"
    # Train classifiers, print scores
    print "Model: ", embedding_model_location
    print "KNN Accuracy: ", cl.KNN_classifier(train_X, train_Y, test_X, test_Y)
    # print "SVM Accuracy: ", cl.SVM_classifier(train_X, train_Y, test_X, test_Y)
    print "NB Accuracy: ", cl.NB_classifier(train_X, train_Y, test_X, test_Y)
    print "MLP Accuracy: ", cl.MLP_classifier(train_X, train_Y, test_X, test_Y, n_iter=10)
    # Tail of the enclosing loader function (its `def` line is outside this
    # view): group utterance token lists by their dialogue-act tag.
    for tag, tokens in get_utterances_from_file(fname):
        # remove id from tag (tags look like "tag/id" — keep only the tag part)
        tag = tag.split("/")[0]
        utterances[tag].append(tokens)
    return utterances


if __name__ == '__main__':
    # Script entry point: embed sampled utterances for a handful of tags
    # and plot them in 2D.
    #TODO: change paths accordingly
    embedding_model_filename = 'data/test'  # path to doc2vec model
    utterance_filename = 'data/swda_utterances.txt'  # path to utterance file
    model, _ = load_all_models(embedding_model_filename)
    utterances = load_utterances(utterance_filename)
    #TODO: select tags to plot (max 5)
    tags_to_plot = ['qw', 'ft', 'ar', 'fa', 't1']
    #TODO: select nr of samples
    n_samples = 500
    # One row per sampled utterance vector; layer1_size is the embedding
    # dimensionality reported by the (gensim-style) model — TODO confirm.
    samples = np.empty((n_samples * len(tags_to_plot), model.layer1_size))
    for i, tag in enumerate(tags_to_plot):
        tag_samples = get_vector_samples(tag, utterances, model, n_samples)
        # fill the contiguous row range reserved for this tag
        samples[i * n_samples:(i + 1) * n_samples, :] = tag_samples
    plotGraph(samples, n_samples, tags_to_plot, dimensions='2D')