src_vecs.normalize()
trg_vecs = WordVecs(args.trg_embedding)
trg_vecs.mean_center()
trg_vecs.normalize()

# Set up the projection dataset
trans = 'lexicons/bingliu_en_{0}.one-2-one.txt'.format(args.lang)
pdataset = ProjectionDataset(trans, src_vecs, trg_vecs)

# Learn the translation matrix W
print('Projecting src embeddings to trg space...')
W = get_projection_matrix(pdataset, src_vecs, trg_vecs)
print('W done')

# Project the source matrix to the new shared space
src_vecs._matrix = np.dot(src_vecs._matrix, W)
print('src_vecs done')

# Open datasets
src_dataset = General_Dataset(args.src_dataset, None,
                              rep=word_reps, binary=args.binary)
print('src_dataset done')
trg_dataset = General_Dataset(args.trg_dataset, None,
                              rep=word_reps, binary=args.binary)
print('trg_dataset done')

# Get joint vocabulary and maximum sentence length
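
# ----------------------------------------------------------------------------
# Reference only: `get_projection_matrix` is imported from the project's
# utilities and is not shown here. The sketch below is one plausible, minimal
# implementation (an orthogonal Procrustes mapping in the style of Artetxe et
# al.), NOT the project's actual code. It assumes `pdataset` exposes parallel
# source/target word lists (`_Xtrain`, `_ytrain`), that `WordVecs` objects can
# be indexed by word, and that numpy is imported as `np` as in this script.
# ----------------------------------------------------------------------------
def get_projection_matrix_sketch(pdataset, src_vecs, trg_vecs):
    # Stack the embeddings of the translation pairs into two aligned matrices
    X = np.array([src_vecs[w] for w in pdataset._Xtrain])
    Y = np.array([trg_vecs[w] for w in pdataset._ytrain])
    # Orthogonal Procrustes: with X^T Y = U S V^T, the orthogonal W that
    # minimizes ||XW - Y||_F is W = U V^T
    u, s, vt = np.linalg.svd(np.dot(X.T, Y))
    return np.dot(u, vt)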
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-src_vecs',
                        default='embeddings/original/google.txt',
                        help="source language vectors (default: GoogleNewsVecs)")
    parser.add_argument('-trg_vecs',
                        default='embeddings/original/sg-300-{0}.txt',
                        help="target language vectors (default: SGNS on Wikipedia)")
    parser.add_argument('-trans',
                        default='lexicons/bingliu/en-{0}.txt',
                        help="translation pairs (default: Bing Liu Sentiment Lexicon translations)")
    parser.add_argument('-dataset',
                        default='opener_sents',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument('-bi',
                        default=[True, False],
                        nargs='+',
                        type=str2bool,
                        help="list of booleans: True is binary only, False is 4-class only, 'True False' is both (default: [True, False])")
    args = parser.parse_args()

    # Loop over the three target languages
    for lang in ['es', 'ca', 'eu']:
        print('################ {0} ##############'.format(lang))

        # Import monolingual vectors
        print('importing word embeddings')
        src_vecs = WordVecs(args.src_vecs)
        src_vecs.mean_center()
        src_vecs.normalize()

        trg_vecs = WordVecs(args.trg_vecs.format(lang))
        trg_vecs.mean_center()
        trg_vecs.normalize()

        # Set up the projection dataset
        pdataset = ProjectionDataset(args.trans.format(lang), src_vecs, trg_vecs)

        # Learn the translation matrix W
        W = get_W(pdataset, src_vecs, trg_vecs)

        # Project the source matrix to the new shared space
        src_vecs._matrix = np.dot(src_vecs._matrix, W)

        # Import datasets (representation depends on the final classifier)
        print('importing datasets')
        binary_dataset = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                                         src_vecs, binary=True, rep=ave_vecs,
                                         one_hot=False, lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join('datasets', lang, args.dataset),
                                               trg_vecs, binary=True, rep=ave_vecs,
                                               one_hot=False, lowercase=False)
        fine_dataset = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                                       src_vecs, binary=False, rep=ave_vecs,
                                       one_hot=False, lowercase=False)
        fine_cross_dataset = General_Dataset(os.path.join('datasets', lang, args.dataset),
                                             trg_vecs, binary=False, rep=ave_vecs,
                                             one_hot=False, lowercase=False)

        # Train a linear SVM on English and evaluate on the target language
        if True in args.bi:
            best_c, best_f1 = get_best_C(binary_dataset, binary_cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(binary_dataset._Xtrain, binary_dataset._ytrain)
            cpred = clf.predict(binary_cross_dataset._Xtest)
            cf1 = macro_f1(binary_cross_dataset._ytest, cpred)
            print_prediction(clf, binary_cross_dataset,
                             os.path.join('predictions', lang, 'artetxe',
                                          '{0}-bi.txt'.format(args.dataset)))
            print('-binary-')
            print('Acc: {0:.3f}'.format(
                clf.score(binary_cross_dataset._Xtest, binary_cross_dataset._ytest)))
            print('Macro F1: {0:.3f}'.format(cf1))
            print()

        if False in args.bi:
            best_c, best_f1 = get_best_C(fine_dataset, fine_cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(fine_dataset._Xtrain, fine_dataset._ytrain)
            cpred = clf.predict(fine_cross_dataset._Xtest)
            cf1 = macro_f1(fine_cross_dataset._ytest, cpred)
            print_prediction(clf, fine_cross_dataset,
                             os.path.join('predictions', lang, 'artetxe',
                                          '{0}-4cls.txt'.format(args.dataset)))
            print('-fine-')
            print('Acc: {0:.3f}'.format(
                clf.score(fine_cross_dataset._Xtest, fine_cross_dataset._ytest)))
            print('Macro F1: {0:.3f}'.format(cf1))
            print()
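
# ----------------------------------------------------------------------------
# Reference only: `get_best_C` is imported from the project's utilities. The
# sketch below shows the assumed behaviour, NOT the project's actual code: fit
# a LinearSVC on the English training split for each candidate C and keep the
# value with the best macro F1 on the cross-lingual dev split. The C grid, the
# use of macro F1, and the dev-split attributes (`_Xdev`, `_ydev`) are
# assumptions; only `_Xtrain`/`_ytrain` appear in the code above.
# ----------------------------------------------------------------------------
def get_best_C_sketch(dataset, cross_dataset,
                      cs=(0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10)):
    best_c, best_f1 = 0, 0
    for c in cs:
        clf = LinearSVC(C=c)
        clf.fit(dataset._Xtrain, dataset._ytrain)
        pred = clf.predict(cross_dataset._Xdev)
        f1 = macro_f1(cross_dataset._ydev, pred)
        if f1 > best_f1:
            best_c, best_f1 = c, f1
    return best_c, best_f1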
def test_embeddings(file, threshold, file_type):
    emotions = ["anger", "anticipation", "disgust", "fear",
                "joy", "sadness", "surprise", "trust"]

    # Import the dataset, where each example is the list of words in the tweet
    dataset = Fine_Grained_Emotion_Dataset('data', None, rep=words,
                                           threshold=threshold)

    print('Basic statistics')
    table = []
    for i, emo in enumerate(emotions):
        train = dataset._ytrain[:, i].sum()
        test = dataset._ytest[:, i].sum()
        table.append((emo, train, test))
    print(tabulate.tabulate(table, headers=['emotion', '#train', '#test']))

    #### Get parameters ####
    max_length = 0
    vocab = {}
    for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
        if len(sent) > max_length:
            max_length = len(sent)
        for w in sent:
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1

    wordvecs = {}
    print('Importing vectors')
    for line in open(file):
        try:
            split = line.split()
            word = split[0]
            vec = np.array(split[1:], dtype='float32')
            if word in vocab:
                wordvecs[word] = vec
        except ValueError:
            pass
    dim = len(vec)

    oov = len(vocab) - len(wordvecs)
    print('OOV: {0}'.format(oov))

    # Add vectors for <unk>
    add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
    W, word_idx_map = get_W(wordvecs, dim=dim)

    # TODO: change this so I don't have to import vectors I don't need
    vecs = WordVecs(file)
    vecs._matrix = W
    vecs._w2idx = word_idx_map
    vecs.vocab_length, vecs.vector_size = W.shape
    ave_dataset = Fine_Grained_Emotion_Dataset('data', vecs, rep=ave_vecs)

    # Get padded word indexes for all X
    Xtrain = np.array([get_idx_from_sent(' '.join(sent), word_idx_map,
                                         max_l=max_length, k=dim)
                       for sent in dataset._Xtrain])
    Xdev = np.array([get_idx_from_sent(' '.join(sent), word_idx_map,
                                       max_l=max_length, k=dim)
                     for sent in dataset._Xdev])
    Xtest = np.array([get_idx_from_sent(' '.join(sent), word_idx_map,
                                        max_l=max_length, k=dim)
                      for sent in dataset._Xtest])

    #### Test models ####
    names = ['LSTM', 'BiLSTM', 'CNN']

    # Mean and standard deviation of each emotion over runs
    all_emo_results = []
    all_emo_std_devs = []

    # Mean and standard deviation of the averaged emotions
    averaged_results = []
    averaged_std_devs = []

    # Test each model
    for name in names:

        print('Getting best parameters')
        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_params.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file,
            Xtrain, dataset._ytrain, Xdev, dataset._ydev, wordvecs, W)

        print('Testing {0}'.format(name))

        # Keep the results for the 5 runs over the dataset
        model_results = []
        model_average_results = []

        # 5 runs to get average and standard deviation
        for i, it in enumerate(range(5)):
            print('Run: {0}'.format(i + 1))

            # Create and train a new classifier for each iteration
            if name == 'LSTM':
                model = create_LSTM(wordvecs, dim=best_dim, output_dim=8,
                                    dropout=best_dropout, weights=W, train=True)
            elif name == 'BiLSTM':
                model = create_BiLSTM(wordvecs, dim=best_dim, output_dim=8,
                                      dropout=best_dropout, weights=W, train=True)
            elif name == 'CNN':
                model = create_cnn(W, Xtrain.shape[1])

            h = model.fit(Xtrain, dataset._ytrain,
                          validation_data=[Xdev, dataset._ydev],
                          nb_epoch=best_epoch, verbose=0)
            pred = model.predict(Xtest)
            pred = np.array([cutoff(x) for x in pred])
            y = dataset._ytest

            # Per-emotion accuracy, precision, recall and F1
            emo_results = []
            for j in range(len(emotions)):
                emo_y = y[:, j]
                emo_pred = pred[:, j]
                mm = MyMetrics(emo_y, emo_pred, one_hot=False, average='binary')
                acc = mm.accuracy()
                precision, recall, f1 = mm.get_scores()
                emo_results.append([acc, precision, recall, f1])
            emo_results = np.array(emo_results)
            model_results.append(emo_results)

            # print('F1 scores')
            # for emo, result in zip(emotions, emo_results):
            #     a, p, r, f = result
            #     print('{0}: {1:.3f}'.format(emo, f))

            ave_acc, ave_prec, ave_rec, mac_f1 = emo_results.mean(axis=0)
            mic_prec, mic_rec, mic_f1 = micro_f1(dataset._ytest, pred)
            model_average_results.append((ave_acc, mic_prec, mic_rec, mic_f1))
            print('acc: {0:.3f} micro-prec: {1:.3f} micro-rec: {2:.3f} micro-f1: {3:.3f}'.format(
                ave_acc, mic_prec, mic_rec, mic_f1))
            print()

        model_results = np.array(model_results)
        model_average_results = np.array(model_average_results)

        average_model_results = model_results.mean(axis=0)
        model_std_dev_results = model_results.std(axis=0)
        overall_avg = model_average_results.mean(axis=0)
        overall_std = model_average_results.std(axis=0)

        all_emo_results.append(average_model_results)
        all_emo_std_devs.append(model_std_dev_results)
        averaged_results.append(overall_avg)
        averaged_std_devs.append(overall_std)

    return names, all_emo_results, all_emo_std_devs, averaged_results, averaged_std_devs, dim
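
# ----------------------------------------------------------------------------
# Reference only: `cutoff` and `micro_f1` are imported from the project's
# utilities. The sketches below show the assumed behaviour, NOT the actual
# code: `cutoff` binarizes the per-emotion sigmoid outputs at a threshold so a
# tweet can carry several labels, and `micro_f1` pools counts over all eight
# emotions. The 0.5 threshold and the sklearn-based averaging are assumptions.
# ----------------------------------------------------------------------------
from sklearn.metrics import precision_recall_fscore_support

def cutoff_sketch(probs, threshold=0.5):
    # Hard 0/1 decision per emotion from the model's sigmoid outputs
    return np.array([1 if p > threshold else 0 for p in probs])

def micro_f1_sketch(y_true, y_pred):
    # Micro-averaged precision/recall/F1 over the multi-label indicator matrix
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred,
                                                       average='micro')
    return prec, rec, f1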