def main():
    """Machine-translation baseline.

    Trains linear SVMs on English averaged-embedding features (4-class and
    binary variants of the chosen dataset) and evaluates them on the
    machine-translated target-language data under datasets/trans/<lang>/.
    Prints acc/prec/rec/f1 per language and dumps predictions to
    predictions/<lang>/mt/.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default='opener_sents',
                        help="dataset to train and test on (default: opener)")
    args = parser.parse_args()
    # English embeddings; NOTE(review): hard-coded local path — consider a CLI flag.
    vecs = WordVecs('/home/jeremy/NS/Keep/Temp/Exps/EMBEDDINGS/BLSE/google.txt')
    # English training data, 4-class and binary views of the same corpus.
    en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                         vecs, one_hot=False, rep=ave_vecs, lowercase=False)
    en_binary = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                                vecs, one_hot=False, rep=ave_vecs,
                                binary=True, lowercase=False)
    langs = ['es', 'ca', 'eu']
    for lang in langs:
        print('#### {0} ####'.format(lang))
        # Target-language data translated to English ('trans' directory).
        cross_dataset = General_Dataset(os.path.join('datasets', 'trans', lang, args.dataset),
                                        vecs, one_hot=False, rep=ave_vecs,
                                        lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join('datasets', 'trans', lang, args.dataset),
                                               vecs, one_hot=False, rep=ave_vecs,
                                               binary=True, lowercase=False)
        # --- binary task ---
        print('-binary-')
        # presumably selects C against the cross-lingual data — confirm in get_best_C.
        best_c, best_f1 = get_best_C(en_binary, binary_cross_dataset)
        clf = LinearSVC(C=best_c)
        clf.fit(en_binary._Xtrain, en_binary._ytrain)
        acc, prec, rec, f1 = scores(clf, binary_cross_dataset)
        print_prediction(clf, binary_cross_dataset,
                         os.path.join('predictions', lang, 'mt',
                                      '{0}-bi.txt'.format(args.dataset)))
        print('acc: {0:.3f}'.format(acc))
        print('prec: {0:.3f}'.format(prec))
        print('rec: {0:.3f}'.format(rec))
        print('f1: {0:.3f}'.format(f1))
        # --- fine-grained (4-class) task ---
        print('-fine-')
        best_c, best_f1 = get_best_C(en, cross_dataset)
        clf = LinearSVC(C=best_c)
        clf.fit(en._Xtrain, en._ytrain)
        acc, prec, rec, f1 = scores(clf, cross_dataset)
        print_prediction(clf, cross_dataset,
                         os.path.join('predictions', lang, 'mt',
                                      '{0}-4cls.txt'.format(args.dataset)))
        print('acc: {0:.3f}'.format(acc))
        print('prec: {0:.3f}'.format(prec))
        print('rec: {0:.3f}'.format(rec))
        print('f1: {0:.3f}'.format(f1))
def test_embeddings(bi, embedding_file, file_type):
    """Train and evaluate an LSTM or BiLSTM sentiment classifier per dataset.

    bi: if true, use a bidirectional lstm, otherwise use a normal lstm
    embedding_file: the word embeddings file
    file_type: word2vec, glove, tang, bin

    Evaluates on six benchmarks: SST fine/binary (Socher et al., 2013),
    OpeNER (Agerri et al., 2016), SenTube auto/tablets (Severyn et al., 2016)
    and a SemEval twitter corpus. For each dataset it tunes dev parameters,
    trains 5 runs with checkpointing, reloads the best checkpoint per run,
    and returns (names, mean results, std devs, embedding dim).
    """
    print('importing vectors...')
    vecs = WordVecs(embedding_file, file_type)
    dim = vecs.vector_size
    # NOTE(review): lstm_dim and dropout are set here but the values actually
    # used below come from get_dev_params — these look vestigial.
    lstm_dim = 50
    dropout = .3
    train = True  # passed through to create_(Bi)LSTM; presumably trainable embeddings
    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         None, one_hot=True, binary=False,
                                         rep=words)
    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           None, one_hot=True, binary=True,
                                           rep=words)
    opener_dataset = General_Dataset('datasets/opener', None,
                                     one_hot=True, rep=words)
    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto', None,
                                           rep=words, binary=True, one_hot=True)
    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets', None,
                                              rep=words, binary=True, one_hot=True)
    semeval_dataset = Semeval_Dataset('datasets/semeval', None,
                                      rep=words, one_hot=True)
    datasets = [st_fine, st_binary, opener_dataset, sentube_auto_dataset,
                sentube_tablets_dataset, semeval_dataset]
    names = ['sst_fine', 'sst_binary', 'opener', 'sentube_auto',
             'sentube_tablets', 'semeval']
    # Collect results here
    results = []
    std_devs = []
    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))
        # Longest sentence (for padding) and token-frequency vocab over all splits.
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
            if len(sent) > max_length:
                max_length = len(sent)
            for w in sent:
                if w not in vocab:
                    vocab[w] = 1
                else:
                    vocab[w] += 1
        # Restrict the embedding table to in-vocab words, then add random
        # vectors for the remaining (unknown) words.
        wordvecs = {}
        for w in vecs._w2idx.keys():
            if w in vocab:
                wordvecs[w] = vecs[w]
        add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
        W, word_idx_map = get_W(wordvecs, dim=dim)
        print('Converting and Padding dataset...')
        dataset = convert_dataset(dataset, word_idx_map, max_length)
        output_dim = dataset._ytest.shape[1]
        """
        Get best Dev params
        ===========================================================
        """
        if bi:
            dev_params_file = 'dev_params/'+str(W.shape[1])+'_bilstm.dev.txt'
        else:
            dev_params_file = 'dev_params/'+str(W.shape[1])+'_lstm.dev.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, bi,
            dataset._Xtrain, dataset._ytrain,
            dataset._Xdev, dataset._ydev, wordvecs)
        """
        Test model 5 times and get averages and std dev.
        """
        print('Running 5 runs to get average and standard deviations')
        dataset_results = []
        for i, it in enumerate(range(5)):
            np.random.seed()  # re-seed from OS entropy so each run differs
            print(i+1)
            if bi:
                clf = create_BiLSTM(wordvecs, best_dim, output_dim,
                                    best_dropout, weights=W, train=train)
                checkpoint = ModelCheckpoint(
                    'models/bilstm/' + name + '/run' + str(i+1)
                    + '/weights.{epoch:03d}-{val_acc:.4f}.hdf5',
                    monitor='val_acc', verbose=1, save_best_only=True,
                    mode='auto')
            else:
                checkpoint = ModelCheckpoint(
                    'models/lstm/' + name + '/run' + str(i+1)
                    + '/weights.{epoch:03d}-{val_acc:.4f}.hdf5',
                    monitor='val_acc', verbose=1, save_best_only=True,
                    mode='auto')
                clf = create_LSTM(wordvecs, best_dim, output_dim,
                                  best_dropout, weights=W, train=train)
            h = clf.fit(dataset._Xtrain, dataset._ytrain,
                        validation_data=[dataset._Xdev, dataset._ydev],
                        epochs=best_epoch, verbose=1, callbacks=[checkpoint])
            # Pick the checkpoint with the highest val_acc encoded in its
            # filename (weights.<epoch>-<val_acc>.hdf5) and reload it.
            if bi:
                base_dir = 'models/bilstm/' + name + '/run' + str(i+1)
                weights = os.listdir(base_dir)
            else:
                base_dir = 'models/lstm/' + name + '/run' + str(i+1)
                weights = os.listdir(base_dir)
            best_val = 0
            best_weights = ''
            for weight in weights:
                val_acc = re.sub('weights.[0-9]*-', '', weight)
                val_acc = re.sub('.hdf5', '', val_acc)
                val_acc = float(val_acc)
                if val_acc > best_val:
                    best_val = val_acc
                    best_weights = weight
            clf = load_model(os.path.join(base_dir, best_weights))
            pred = clf.predict(dataset._Xtest, verbose=1)
            classes = clf.predict_classes(dataset._Xtest, verbose=1)
            if bi:
                prediction_file = 'predictions/bilstm/' + name + '/run' + str(i+1) + '/pred.txt'
                w2idx_file = 'predictions/bilstm/' + name + '/w2idx.pkl'
            else:
                prediction_file = 'predictions/lstm/' + name + '/run' + str(i+1) + '/pred.txt'
                w2idx_file = 'predictions/lstm/' + name + '/w2idx.pkl'
            print_prediction(prediction_file, classes)
            with open(w2idx_file, 'wb') as out:
                pickle.dump(word_idx_map, out)
            # 'binary' averaging for 2-class data, otherwise micro-averaged.
            labels = sorted(set(dataset._ytrain.argmax(1)))
            if len(labels) == 2:
                average = 'binary'
            else:
                average = 'micro'
            mm = MyMetrics(dataset._ytest, pred, labels=labels, average=average)
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])
        # Get the average and std deviation over the runs
        # NOTE(review): this comment used to say "10 runs" but the loop above
        # performs 5 runs.
        dataset_results = np.array(dataset_results)
        ave_results = dataset_results.mean(axis=0)
        std_results = dataset_results.std(axis=0)
        print(u'acc: {0:.3f} \u00B1{1:.3f}'.format(ave_results[0], std_results[0]))
        print(u'prec: {0:.3f} \u00B1{1:.3f}'.format(ave_results[1], std_results[1]))
        print(u'recall: {0:.3f} \u00B1{1:.3f}'.format(ave_results[2], std_results[2]))
        print(u'f1: {0:.3f} \u00B1{1:.3f}'.format(ave_results[3], std_results[3]))
        results.append(ave_results)
        std_devs.append(std_results)
    # Append an 'overall' row averaging across datasets.
    results.append(list(np.array(results).mean(axis=0)))
    std_devs.append(list(np.array(std_devs).mean(axis=0)))
    names.append('overall')
    return names, results, std_devs, dim
parser.add_argument('-lr', '--learning_rate', default=0.001, type=float) parser.add_argument('-wd', '--weight_decay', default=3e-5, type=float) parser.add_argument('-cuda', default=True, type=str2bool) parser.add_argument('-seed', default=123, type=int) args = parser.parse_args() print_args(args) args.cuda = args.cuda and torch.cuda.is_available() np.random.seed(args.seed) torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) print('Importing embeddings...') vecs = WordVecs(args.embeddings) synonyms1, synonyms2, neg = get_syn_ant(args.src_lang, vecs) cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.trg_lang, vecs) pdataset = ProjectionDataset( 'lexicons/{0}_{1}.txt'.format(args.src_lang, args.trg_lang), vecs, vecs) print('Importing datasets...') # Get training, dev, and test data if args.src_dataset == 'opener': if args.binary: train_data, dev_data, test_data = open_dataset( 'datasets/OpeNER/preprocessed/binary/en') else:
def test_embeddings(embedding_file, file_type):
    """Evaluate word embeddings with an averaged-vector logistic regression.

    embedding_file: the word embeddings file
    file_type: word2vec, glove, tang, bin

    Uses the average of the word embeddings of each text as features for an
    L2-regularized logistic regression, evaluated on six benchmarks:
    Stanford Sentiment corpus, fine and binary (Socher et al., 2013),
    OpeNER corpus (Agerri et al., 2016),
    SenTube auto/tablets corpora (Severyn et al., 2016),
    SemEval 2013 twitter corpus - task 2.

    Returns (names, results, dim) where results holds
    [acc, precision, recall, f1] per dataset plus an 'overall' average row.
    """
    print('importing vectors...')
    vecs = WordVecs(embedding_file, file_type)
    dim = vecs.vector_size
    print('importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         vecs, one_hot=False, binary=False,
                                         rep=ave_vecs)
    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           vecs, one_hot=False, binary=True,
                                           rep=ave_vecs)
    opener_dataset = General_Dataset('datasets/opener', vecs,
                                     one_hot=False, rep=ave_vecs)
    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto', vecs,
                                           rep=ave_vecs, binary=True,
                                           one_hot=False)
    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets', vecs,
                                              rep=ave_vecs, binary=True,
                                              one_hot=False)
    semeval_dataset = Semeval_Dataset('datasets/semeval', vecs,
                                      rep=ave_vecs, one_hot=False)
    datasets = [st_fine, st_binary, opener_dataset, sentube_auto_dataset,
                sentube_tablets_dataset, semeval_dataset]
    names = ['sst_fine', 'sst_binary', 'opener', 'sentube_auto',
             'sentube_tablets', 'semeval']
    # Collect results here
    results = []
    for name, dataset in zip(names, datasets):
        print('Testing vectors on {0}...'.format(name))
        # Get best parameters (C tuned on dev data inside get_best_C)
        best_c, best_f1 = get_best_C(dataset)
        # Get predictions
        classifier = LogisticRegression(C=best_c)
        # fix: the fit() return value was bound to an unused local before
        classifier.fit(dataset._Xtrain, dataset._ytrain)
        pred = classifier.predict(dataset._Xtest)
        predictions_file = "predictions/ave/" + name + '/pred.txt'
        print_prediction(predictions_file, pred)
        # Get results: binary averaging for 2-class data, otherwise micro.
        labels = sorted(set(dataset._ytrain))
        if len(labels) == 2:
            average = 'binary'
        else:
            average = 'micro'
        mm = MyMetrics(dataset._ytest, pred, labels=labels, average=average,
                       one_hot=False)
        acc, precision, recall, f1 = mm.get_scores()
        results.append([acc, precision, recall, f1])
    # Add overall results
    results.append(list(np.array(results).mean(axis=0)))
    names.append('overall')
    return names, results, dim
def main():
    """Train an RNN-based BLSE model (rnn_blse or rnn_attn_blse).

    Loads source/target datasets and embeddings, builds the chosen model,
    trains it with a joint projection + classification objective, reloads the
    best dev-F1 checkpoint and evaluates on the target-language test set.
    Results are printed and appended to a report file under results/.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-sl', '--source_lang',
                        help="source language: es, ca, eu, en (default: en)",
                        default='en')
    parser.add_argument('-tl', '--target_lang',
                        help="target language: es, ca, eu, en (default: es)",
                        default='es')
    parser.add_argument('-bi', '--binary',
                        help="binary or 4-class (default: True)",
                        default=True, type=str2bool)
    parser.add_argument('-e', '--epochs',
                        help="training epochs (default: 200)",
                        default=200, type=int)
    # NOTE(review): help text says default .001 but the actual default is .1.
    parser.add_argument(
        '-a', '--alpha',
        help="trade-off between projection and classification objectives (default: .001)",
        default=.1, type=float)
    parser.add_argument('-pl', '--proj_loss',
                        help="projection loss: mse, cosine (default: cosine)",
                        default='cosine')
    # NOTE(review): help text says default 50 but the actual default is 21.
    parser.add_argument('-bs', '--batch_size',
                        help="classification batch size (default: 50)",
                        default=21, type=int)
    parser.add_argument(
        '-sv', '--src_vecs',
        help=" source language vectors (default: GoogleNewsVecs )",
        default='embeddings/original/google.txt')
    parser.add_argument(
        '-tr', '--trans',
        help='translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
        default='bingliu')
    parser.add_argument(
        '-da', '--dataset',
        help="dataset to train and test on (default: opener_sents)",
        default='opener_sents',
    )
    parser.add_argument(
        '-sd', '--savedir',
        help="where to dump weights during training (default: ./models)",
        default='models')
    # fix: the help strings for -lr/-m/-cu were copy-pasted from -sd.
    parser.add_argument(
        '-lr', '--learning_rate',
        help="learning rate (default: 0.0001)",
        default=0.0001, type=float)
    parser.add_argument(
        '-m', '--model',
        help="model to train: rnn_blse, rnn_attn_blse (default: rnn_attn_blse)",
        default='rnn_attn_blse')
    # fix: was type=bool — argparse bool('False') is True, so the flag could
    # never be turned off from the command line; use str2bool like -bi.
    parser.add_argument(
        '-cu', '--to_cuda',
        help="move the model to CUDA (default: True)",
        default=True, type=str2bool)
    args = parser.parse_args()
    if args.model not in ['rnn_attn_blse', 'rnn_blse']:
        print("no such model: {}".format(args.model))
        exit(1)
    # If there's no savedir, create it
    os.makedirs(args.savedir, exist_ok=True)
    if args.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'
    weight_dir = "{}/{}/{}-{}-{}".format(args.savedir, args.model,
                                         args.dataset, args.target_lang, b)
    # Strip trailing zeros from the learning rate so file names stay short.
    results_file_name = "results/report_{}_alpha-{}_batch_size-{}_epochs-{}_lr-{}.txt".format(
        args.model, args.alpha, args.batch_size, args.epochs,
        '{0:.15f}'.format(args.learning_rate).rstrip('0').rstrip('.'))
    # import datasets (representation will depend on final classifier)
    print()
    print('training model')
    print('Parameters:')
    print('model: {0}'.format(args.model))
    print('binary: {0}'.format(b))
    print('epochs: {0}'.format(args.epochs))
    print('alpha (projection loss coef): {0}'.format(args.alpha))
    print('batchsize: {0}'.format(args.batch_size))
    print('learning rate: {0}'.format(args.learning_rate))
    print('weight_dir: {0}'.format(weight_dir))
    print('results_file_name: {0}'.format(results_file_name))
    print()
    print('importing datasets')
    dataset = General_Dataset(os.path.join('datasets', args.source_lang, args.dataset),
                              None, binary=args.binary, rep=words,
                              one_hot=False)
    cross_dataset = General_Dataset(os.path.join('datasets', args.target_lang, args.dataset),
                                    None, binary=args.binary, rep=words,
                                    one_hot=False)
    # Import monolingual vectors
    print('importing word embeddings')
    trg_vecs_file_path = "embeddings/original/sg-300-{}.txt".format(
        args.target_lang)
    print("trg_vecs_file_path: {}".format(trg_vecs_file_path))
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(trg_vecs_file_path)
    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(args.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.target_lang, trg_vecs)
    # Import translation pairs
    translation_file_path = "lexicons/{}/en-{}.txt".format(
        args.trans, args.target_lang)
    print("translation_file_path: {}".format(translation_file_path))
    pdataset = ProjectionDataset(translation_file_path, src_vecs, trg_vecs)
    # Set up model
    if args.model == 'rnn_blse':
        model = RNN_BLSE(
            src_vecs, trg_vecs, pdataset, dataset, cross_dataset,
            projection_loss=args.proj_loss,
            output_dim=output_dim,
            batch_size=args.batch_size,
            to_cuda=args.to_cuda,
            src_syn1=synonyms1, src_syn2=synonyms2, src_neg=neg,
            trg_syn1=cross_syn1, trg_syn2=cross_syn2, trg_neg=cross_neg,
        )
    elif args.model == 'rnn_attn_blse':
        model = Rnn_Attn_BLSE(
            src_vecs, trg_vecs, pdataset, dataset, cross_dataset,
            projection_loss=args.proj_loss,
            output_dim=output_dim,
            to_cuda=args.to_cuda,
            batch_size=args.batch_size,
            src_syn1=synonyms1, src_syn2=synonyms2, src_neg=neg,
            trg_syn1=cross_syn1, trg_syn2=cross_syn2, trg_neg=cross_neg,
        )
    if torch.cuda.is_available() and args.to_cuda:
        print("cuda is available")
        model.cuda()
    else:
        print("cuda is not available")
    # Loss Functions (fix: removed dead proj_criterion pre-assignment that
    # was always overwritten or followed by exit(1))
    class_criterion = nn.CrossEntropyLoss()
    if args.proj_loss == 'mse':
        proj_criterion = nn.MSELoss()
    elif args.proj_loss == 'cosine':
        proj_criterion = cosine_loss
    else:
        print("no projection criterion supported: {}".format(args.proj_loss))
        exit(1)
    # Optimizer
    optim = torch.optim.Adam(model.parameters(), args.learning_rate)
    # Fit model
    results_file = open(results_file_name, "w+")
    trainer = Trainer(model, args.alpha, optim, args.learning_rate,
                      class_criterion, proj_criterion, args.epochs,
                      args.batch_size, results_file, weight_dir, args.to_cuda)
    best_model_file_path = trainer.train(pdataset._Xtrain, pdataset._ytrain,
                                         dataset._Xtrain, dataset._ytrain)
    # Get best dev f1 and weights
    print("looking in dir: {}".format(weight_dir))
    best_f1, best_params = get_best_model_params(best_model_file_path)
    best_model = torch.load(best_model_file_path)
    state_dict = best_model.state_dict()
    model.load_state_dict(state_dict)
    print()
    print('Dev set')
    print('best dev f1: {0:.3f}'.format(best_f1))
    print('parameters: epochs {0} batch size {1} alpha {2} learning rate {3}'.
          format(*best_params))
    results_file.write('\n')
    results_file.write('Dev set\n')
    results_file.write('best dev f1: {0:.3f}\n'.format(best_f1))
    results_file.write(
        'parameters: epochs {0} batch size {1} alpha {2}\n'.format(
            *best_params))
    # Evaluate on test set
    model.eval()
    model.evaluate(cross_dataset._Xtest, cross_dataset._ytest,
                   results_file=results_file, src=False)
    model.confusion_matrix(cross_dataset._Xtest, cross_dataset._ytest,
                           src=False, results_file=results_file)
    results_file.close()
def test_embeddings(file, threshold, file_type):
    """Evaluate embeddings on fine-grained (8-way, multi-label) emotion data.

    file: path to a plain-text embedding file (one word + vector per line)
    threshold: passed to Fine_Grained_Emotion_Dataset — presumably a label
        cutoff; confirm in the dataset class
    file_type: declared but unused in this function body

    Trains LSTM, BiLSTM and CNN models 5 times each and returns
    (names, per-emotion means, per-emotion std devs, averaged means,
    averaged std devs, dim).
    """
    emotions = [
        "anger", "anticipation", "disgust", "fear", "joy", "sadness",
        "surprise", "trust"
    ]
    # Import dataset where each test example is the words in the tweet
    dataset = Fine_Grained_Emotion_Dataset('data', None,
                                           rep=words, threshold=threshold)
    print('Basic statistics')
    # Per-emotion positive-label counts in train/test.
    table = []
    for i, emo in enumerate(emotions):
        train = dataset._ytrain[:, i].sum()
        test = dataset._ytest[:, i].sum()
        table.append((emo, train, test))
    print(tabulate.tabulate(table, headers=['emotion', '#train', '#test']))

    #### Get Parameters ####
    # Longest sentence (for padding) and token-frequency vocab over all splits.
    max_length = 0
    vocab = {}
    for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(
            dataset._Xtest):
        if len(sent) > max_length:
            max_length = len(sent)
        for w in sent:
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1

    # Read only the embedding rows whose word occurs in the vocab.
    wordvecs = {}
    print('Importing vectors')
    for line in open(file):
        try:
            split = line.split()
            word = split[0]
            vec = np.array(split[1:], dtype='float32')
            if word in vocab:
                wordvecs[word] = vec
        except ValueError:
            # malformed line (e.g. a header) — skip it
            pass
    # NOTE(review): relies on `vec` leaking from the loop above; crashes if
    # the file is empty, and uses the *last* line's length as the dimension.
    dim = len(vec)
    oov = len(vocab) - len(wordvecs)
    print('OOV: {0}'.format(oov))

    # Add vectors for <unk>
    add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
    W, word_idx_map = get_W(wordvecs, dim=dim)

    # TODO: change this so I don't have to import vectors I don't need
    # Re-reads the file just to get a WordVecs object, then overwrites its
    # internals with the trimmed matrix built above.
    vecs = WordVecs(file)
    vecs._matrix = W
    vecs._w2idx = word_idx_map
    vecs.vocab_length, vecs.vector_size = W.shape
    ave_dataset = Fine_Grained_Emotion_Dataset('data', vecs, rep=ave_vecs)

    # Get padded word indexes for all X
    Xtrain = np.array([
        get_idx_from_sent(' '.join(sent), word_idx_map,
                          max_l=max_length, k=dim)
        for sent in dataset._Xtrain
    ])
    Xdev = np.array([
        get_idx_from_sent(' '.join(sent), word_idx_map,
                          max_l=max_length, k=dim)
        for sent in dataset._Xdev
    ])
    Xtest = np.array([
        get_idx_from_sent(' '.join(sent), word_idx_map,
                          max_l=max_length, k=dim)
        for sent in dataset._Xtest
    ])

    #### Test Models ####
    names = ['LSTM', 'BiLSTM', 'CNN']
    # Keep all mean and standard deviations of each emotion over datasets here
    all_emo_results = []
    all_emo_std_devs = []
    # Keep all mean and standard deviations of the averaged emotions here
    averaged_results = []
    averaged_std_devs = []

    # TEST EACH MODEL
    for name in names:
        print('Getting best parameters')
        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_params.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, Xtrain, dataset._ytrain, Xdev,
            dataset._ydev, wordvecs, W)
        print('Testing {0}'.format(name))
        # Keep the results for the 5 runs over the dataset
        model_results = []
        model_average_results = []
        # 5 runs to get average and standard deviation
        for i, it in enumerate(range(5)):
            print('Run: {0}'.format(i + 1))
            # create and train a new classifier for each iteration
            if name == 'LSTM':
                model = create_LSTM(wordvecs, dim=best_dim, output_dim=8,
                                    dropout=best_dropout, weights=W,
                                    train=True)
            elif name == 'BiLSTM':
                model = create_BiLSTM(wordvecs, dim=best_dim, output_dim=8,
                                      dropout=best_dropout, weights=W,
                                      train=True)
            elif name == 'CNN':
                model = create_cnn(W, Xtrain.shape[1])
            h = model.fit(Xtrain, dataset._ytrain,
                          validation_data=[Xdev, dataset._ydev],
                          nb_epoch=best_epoch, verbose=0)
            pred = model.predict(Xtest)
            # cutoff() presumably binarizes each emotion's score — confirm.
            pred = np.array([cutoff(x) for x in pred])
            y = dataset._ytest
            # Score each emotion column independently as a binary task.
            emo_results = []
            for j in range(len(emotions)):
                emo_y = y[:, j]
                emo_pred = pred[:, j]
                mm = MyMetrics(emo_y, emo_pred, one_hot=False,
                               average='binary')
                acc = mm.accuracy()
                precision, recall, f1 = mm.get_scores()
                emo_results.append([acc, precision, recall, f1])
            emo_results = np.array(emo_results)
            model_results.append(emo_results)
            # print('F1 scores')
            # for emo, result in zip(emotions, emo_results):
            #     a, p, r, f = result
            #     print('{0}: {1:.3f}'.format(emo, f))
            # Macro average over emotions, plus micro-averaged P/R/F1.
            ave_acc, ave_prec, ave_rec, mac_f1 = emo_results.mean(axis=0)
            mic_prec, mic_rec, mic_f1 = micro_f1(dataset._ytest, pred)
            model_average_results.append((ave_acc, mic_prec, mic_rec, mic_f1))
            print(
                'acc: {0:.3f} micro-prec:{1:.3f} micro-rec:{2:.3f} micro-f1:{3:.3f}'
                .format(ave_acc, mic_prec, mic_rec, mic_f1))
            print()
        # Aggregate the 5 runs for this model.
        model_results = np.array(model_results)
        model_average_results = np.array(model_average_results)
        average_model_results = model_results.mean(axis=0)
        model_std_dev_results = model_results.std(axis=0)
        overall_avg = model_average_results.mean(axis=0)
        overall_std = model_average_results.std(axis=0)
        all_emo_results.append(average_model_results)
        all_emo_std_devs.append(model_std_dev_results)
        averaged_results.append(overall_avg)
        averaged_std_devs.append(overall_std)
    return names, all_emo_results, all_emo_std_devs, averaged_results, averaged_std_devs, dim
def main():
    """Train the original (non-RNN) BLSE model and evaluate cross-lingually.

    Loads source/target datasets and embeddings, fits BLSE with a joint
    projection + classification objective, reloads the best dev-F1 weights
    from savedir, then evaluates on the target-language test set, writes
    predictions, prints a confusion matrix and plots training curves.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-sl', '--source_lang',
                        help="source language: es, ca, eu, en (default: en)",
                        default='en')
    parser.add_argument('-tl', '--target_lang',
                        help="target language: es, ca, eu, en (default: es)",
                        default='es')
    parser.add_argument('-bi', '--binary',
                        help="binary or 4-class (default: True)",
                        default=True, type=str2bool)
    parser.add_argument('-e', '--epochs',
                        help="training epochs (default: 200)",
                        default=200, type=int)
    parser.add_argument(
        '-a', '--alpha',
        help="trade-off between projection and classification objectives (default: .001)",
        default=.001, type=float)
    # NOTE(review): help says default cosine but the actual default is 'mse'.
    parser.add_argument('-pl', '--proj_loss',
                        help="projection loss: mse, cosine (default: cosine)",
                        default='mse')
    # NOTE(review): help says default 50 but the actual default is 20.
    parser.add_argument('-bs', '--batch_size',
                        help="classification batch size (default: 50)",
                        default=20, type=int)
    parser.add_argument(
        '-sv', '--src_vecs',
        help=" source language vectors (default: GoogleNewsVecs )",
        default='google.txt')
    parser.add_argument(
        '-tv', '--trg_vecs',
        help=" target language vectors (default: SGNS on Wikipedia)",
        default='sg-300-es.txt')
    parser.add_argument(
        '-tr', '--trans',
        help='translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
        default='lexicons/bingliu/en-es.txt')
    parser.add_argument(
        '-da', '--dataset',
        help="dataset to train and test on (default: opener_sents)",
        default='opener_sents',
    )
    parser.add_argument(
        '-sd', '--savedir',
        help="where to dump weights during training (default: ./models)",
        default='models/blse')
    args = parser.parse_args()

    # import datasets (representation will depend on final classifier)
    print('importing datasets')
    dataset = General_Dataset(os.path.join('datasets', args.source_lang, args.dataset),
                              None, binary=args.binary, rep=words,
                              one_hot=False)
    cross_dataset = General_Dataset(os.path.join('datasets', args.target_lang, args.dataset),
                                    None, binary=args.binary, rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(args.trg_vecs)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(args.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.target_lang, trg_vecs)

    # Import translation pairs
    pdataset = ProjectionDataset(args.trans, src_vecs, trg_vecs)

    if args.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'

    # Set up model
    blse = BLSE(
        src_vecs, trg_vecs, pdataset, dataset, cross_dataset,
        projection_loss=args.proj_loss,
        output_dim=output_dim,
        src_syn1=synonyms1, src_syn2=synonyms2, src_neg=neg,
        trg_syn1=cross_syn1, trg_syn2=cross_syn2, trg_neg=cross_neg,
    )

    # If there's no savedir, create it
    os.makedirs(args.savedir, exist_ok=True)

    # Fit model
    blse.fit(pdataset._Xtrain, pdataset._ytrain,
             dataset._Xtrain, dataset._ytrain,
             weight_dir=args.savedir,
             batch_size=args.batch_size,
             alpha=args.alpha,
             epochs=args.epochs)

    # Get best dev f1 and weights
    best_f1, best_params, best_weights = get_best_run(args.savedir)
    blse.load_weights(best_weights)
    print()
    print('Dev set')
    print('best dev f1: {0:.3f}'.format(best_f1))
    print(
        'parameters: epochs {0} batch size {1} alpha {2}'.format(*best_params))

    # Evaluate on test set: once for the console, once writing predictions.
    blse.evaluate(cross_dataset._Xtest, cross_dataset._ytest, src=False)
    blse.evaluate(cross_dataset._Xtest, cross_dataset._ytest, src=False,
                  outfile=os.path.join(
                      'predictions', args.target_lang, 'blse',
                      '{0}-{1}-alpha{2}-epoch{3}-batch{4}.txt'.format(
                          args.dataset, b, args.alpha, best_params[0],
                          args.batch_size)))
    blse.confusion_matrix(cross_dataset._Xtest, cross_dataset._ytest,
                          src=False)
    blse.plot()
def main():
    """Barista baseline: bilingual-embedding SVM evaluated per target language.

    For each target language (es, ca, eu) it builds a joint vocabulary from
    the English and target datasets, loads the corresponding bilingual
    'barista' embeddings restricted to that vocab, trains LinearSVC models on
    English averaged-embedding features and evaluates on the target language
    (binary and/or 4-class, per the -bi flag). Predictions are written to
    predictions/<lang>/barista/.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default='opener',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help='List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False], nargs='+', type=str2bool)
    args = parser.parse_args()

    langs = ['es', 'ca', 'eu']
    for lang in langs:
        print('#### {0} ####'.format(lang))
        # First pass: token-level datasets only to collect the vocabulary.
        en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                             None, one_hot=False, rep=words)
        cross_dataset = General_Dataset(os.path.join('datasets', lang, args.dataset),
                                        None, one_hot=False, rep=words)
        # BUG FIX: dict.update() returns None, so the original
        # `vocab = en.vocab.update(...)` always passed vocab=None to WordVecs,
        # silently disabling the vocabulary restriction.
        en.vocab.update(cross_dataset.vocab)
        vocab = en.vocab
        vecs = WordVecs(
            'embeddings/barista/sg-300-window4-negative20_en_{0}.txt'.format(
                lang), vocab=vocab)
        # Second pass: averaged-vector representations for classification.
        en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                             vecs, one_hot=False, rep=ave_vecs,
                             lowercase=False)
        en_binary = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                                    vecs, one_hot=False, rep=ave_vecs,
                                    binary=True, lowercase=False)
        cross_dataset = General_Dataset(os.path.join('datasets', lang, args.dataset),
                                        vecs, one_hot=False, rep=ave_vecs,
                                        lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                               vecs, one_hot=False,
                                               rep=ave_vecs, binary=True,
                                               lowercase=False)
        if True in args.bi:
            print('-binary-')
            best_c, best_f1 = get_best_C(en_binary, binary_cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(en_binary._Xtrain, en_binary._ytrain)
            acc, f1 = scores(clf, binary_cross_dataset, 'binary')
            print_prediction(
                clf, binary_cross_dataset,
                os.path.join('predictions', lang, 'barista',
                             '{0}-bi.txt'.format(args.dataset)))
            print('acc: {0:.3f}'.format(acc))
            print('f1: {0:.3f}'.format(f1))
        if False in args.bi:
            print('-fine-')
            best_c, best_f1 = get_best_C(en, cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(en._Xtrain, en._ytrain)
            acc, f1 = scores(clf, cross_dataset)
            print_prediction(
                clf, cross_dataset,
                os.path.join('predictions', lang, 'barista',
                             '{0}-4cls.txt'.format(args.dataset)))
            print('acc: {0:.3f}'.format(acc))
            print('f1: {0:.3f}'.format(f1))
30, 60 ]: clf = LinearSVC(C=c) clf.fit(dataset._Xtrain, dataset._ytrain) pred = clf.predict(dataset._Xdev) f1 = per_class_f1(dataset._ydev, pred).mean() if f1 > best_f1: best_f1 = f1 best_c = c return best_c, best_f1 if __name__ == '__main__': embeddingdir = '/home/jeremy/NS/Keep/Temp/Exps/EMBEDDINGS' amazon_vecs = WordVecs( os.path.join(embeddingdir, 'SubjQuant/amazon-sg-300.txt')) twitter_vecs = WordVecs( os.path.join(embeddingdir, 'twitter_embeddings.txt')) pdataset = ProjectionDataset('lexicons/general_vocab.txt', amazon_vecs, twitter_vecs) books = Book_Dataset(amazon_vecs, rep=ave_vecs, one_hot=False, binary=True) dvd = DVD_Dataset(amazon_vecs, rep=ave_vecs, one_hot=False, binary=True) electronics = Electronics_Dataset(amazon_vecs, rep=ave_vecs, binary=True, one_hot=False) kitchen = Kitchen_Dataset(amazon_vecs, rep=ave_vecs, binary=True,
def run_model_on_datasets_with_embeddings(embedding_file, file_type):
    """Train/evaluate the transformer model on each enabled benchmark dataset.

    embedding_file: the word embeddings file
    file_type: word2vec, glove

    For every dataset in `datasetNames` (several are commented out), builds a
    padded index representation, runs the transformer `hp.run_exps_amount`
    times and collects mean/std of [acc, precision, recall, micro_f1].
    Returns (datasetNames + ['overall'], results, std_devs, dim).
    """
    print('importing word embedding vectors...')
    vecs = WordVecs(embedding_file, file_type)  # load the word2vec dictionary.
    dim = vecs.vector_size  # dimensionality of the word embeddings

    # For collecting results to return
    results = []
    std_devs = []
    datasetNames = [
        # 'sst_fine',
        # 'sst_binary',
        # 'opener',
        # 'sentube_auto',
        'sentube_tablets',
        'semeval',
    ]

    # train & test the model on every dataset above
    for datasetName in datasetNames:
        # dataset_load_start = datetime.now()
        if datasetName == 'sst_fine':
            dataset = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                                 None, one_hot=True,
                                                 binary=False, rep=words)
        elif datasetName == 'sst_binary':
            dataset = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                                 None, one_hot=True,
                                                 binary=True, rep=words)
        elif datasetName == 'opener':
            dataset = General_Dataset('datasets/opener', None,
                                      one_hot=True, rep=words)
        elif datasetName == 'sentube_auto':
            dataset = General_Dataset('datasets/SenTube/auto', None,
                                      rep=words, binary=True, one_hot=True)
        elif datasetName == 'sentube_tablets':
            dataset = General_Dataset('datasets/SenTube/tablets', None,
                                      rep=words, binary=True, one_hot=True)
        elif datasetName == 'semeval':
            dataset = Semeval_Dataset('datasets/semeval', None,
                                      rep=words, one_hot=True)
        print('Loading & Testing on {}:'.format(datasetName))

        # if hp.lowercase_all_sentences:
        #     for sent in dataset._Xtrain:
        #         for word in sent:
        #             if word != word.lower():
        #                 print("Word has an uppercase character:", word.decode('utf-8'))

        # find out the max length of sentences in the dataset and construct the vocab frequency dict.
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
            if len(sent) > max_length:
                max_length = len(sent)
            for w in sent:
                if w not in vocab:
                    vocab[w] = 1
                else:
                    vocab[w] += 1

        # create a dict of words that are in our word2vec embeddings
        # wordvecs: String -> embedding_vec
        wordvecs = {}
        for w in vecs._w2idx.keys():
            if w in vocab:
                wordvecs[w] = vecs[w]

        # Assign random w2v vectors to the unknown words. These are random uniformly distrubuted vectors of size dim.
        # NOTE(review): elsewhere in this codebase add_unknown_words is called
        # as add_unknown_words(wordvecs, vocab, ...) — confirm this variant's
        # signature really takes `vecs` first.
        add_unknown_words(vecs, wordvecs, vocab, min_df=1, dim=dim)
        W, word_idx_map = get_W(wordvecs, dim=dim)  # Get the w2v index map for out final vocab

        print('Converting dataset to being right padded...')
        dataset = convert_dataset(dataset, word_idx_map, datasetName, max_length)
        output_dim = dataset._ytest.shape[1]

        # Test model hp.run_exps_amount times and get averages and std dev.
        dataset_results = []
        for i in range(1, hp.run_exps_amount + 1):
            tf.reset_default_graph()  # Clears the current loaded tensorflow graph.
            # NOTE(review): the Variable and the placeholder share the name
            # "embedding_table", and this assign op is never run in a session
            # here — presumably the placeholder is fed inside
            # createAndTrainTransformer; verify the Variable/assign are needed.
            w2i = tf.Variable(tf.constant(0.0, shape=[W.shape[0], W.shape[1]]),
                              trainable=False, name="embedding_table")
            wordIndxToVec_tensor = tf.placeholder(
                tf.float32, [W.shape[0], W.shape[1]],
                name="embedding_table")  # [vobab_size x word_embedding_dim]
            w2i.assign(wordIndxToVec_tensor)

            start_time = datetime.now()  # Print time for logging.
            clf, best_mm_val, best_mm_test = createAndTrainTransformer(
                dataset, W, wordIndxToVec_tensor, output_dim, datasetName,
                max_length)
            print("Finished run #", i,
                  "Time taken: " + str(datetime.now() - start_time))

            mm = best_mm_test
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])
            if hp.run_exps_amount == 1:
                acc, precision, recall, micro_f1 = mm.get_scores()
                dataset_results.append([acc, precision, recall, micro_f1])  # add twice so the average is the same... avoid running multiple runs this way.
            if hp.run_exps_amount != 1:
                # Print the metrics for this run, unless we're running experiment only once.
                # (The run result is duplicated so std() over the pair is 0.)
                this_run_result = []
                this_run_result.append([acc, precision, recall, micro_f1])
                this_run_result.append([acc, precision, recall, micro_f1])
                this_run_result = np.array(this_run_result)
                this_run_ave_results = this_run_result.mean(axis=0)
                this_run_std_results = this_run_result.std(axis=0)
                printMetrics(this_run_ave_results, this_run_std_results,
                             datasetName)

        # Get the average and std deviation over 10 runs with 10 random seeds
        dataset_results = np.array(dataset_results)
        ave_results = dataset_results.mean(axis=0)
        std_results = dataset_results.std(axis=0)
        printMetrics(ave_results, std_results, datasetName)
        results.append(ave_results)
        std_devs.append(std_results)

    # Append an 'overall' row averaging across datasets.
    results.append(list(np.array(results).mean(axis=0)))
    std_devs.append(list(np.array(std_devs).mean(axis=0)))
    datasetNames.append('overall')
    return datasetNames, results, std_devs, dim
def train_model_with_different_params(params):
    """Grid-search hyper-parameters for one model / target language.

    Trains ``params.model`` (must be 'rnn_attn_blse' or 'rnn_blse') once per
    combination of projection loss, alpha, learning rate and batch size,
    keeps only the best run's artifacts on disk (by cross-lingual F1), and
    logs every improvement plus a final summary to a report file under
    ``results/``.
    """
    # Train a certain model (rnn_blse / rnn_attn_blse) for a certain target language
    # for different combinations of hyper parameters
    if params.model not in ['rnn_attn_blse', 'rnn_blse']:
        print("no such model: {}".format(params.model))
        exit(1)

    # If there's no savedir, create it
    os.makedirs(params.savedir, exist_ok=True)

    # Binary task -> 2 output classes ('bi'); otherwise 4-class ('4cls').
    if params.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'

    weight_dir = "{}/{}/{}-{}-{}".format(params.savedir, params.model,
                                         params.dataset, params.target_lang,
                                         b)

    # Human-readable report of the parameter search; "w+" truncates any
    # previous report with the same name.
    best_params_file_name = "results/best_params_report_{}_{}_{}.txt".format(
        params.model, params.target_lang, b)
    best_params_file = open(best_params_file_name, "w+")
    best_params_file.write("Start parameter search:\n")
    best_params_file.write("Model: {}\n".format(params.model))
    best_params_file.write("is_binary: {}\n".format(params.binary))
    best_params_file.write("target_lang: {}\n".format(params.target_lang))

    # Best-so-far bookkeeping; old_* remember files to delete once beaten.
    best_f1 = 0.0
    # NOTE(review): best_params stays None (and rest_of_scores empty) if no
    # run ever beats f1 == 0.0 -- the final summary below would then raise.
    best_params = None
    old_file_name = None
    old_results_file_name = None
    rest_of_scores = []

    print('importing datasets')
    dataset = General_Dataset(os.path.join('datasets', params.source_lang,
                                           params.dataset),
                              None,
                              binary=params.binary,
                              rep=words,
                              one_hot=False)
    cross_dataset = General_Dataset(os.path.join('datasets',
                                                 params.target_lang,
                                                 params.dataset),
                                    None,
                                    binary=params.binary,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    trg_vecs_file_path = "embeddings/original/sg-300-{}.txt".format(
        params.target_lang)
    print("trg_vecs_file_path: {}".format(trg_vecs_file_path))
    src_vecs = WordVecs(params.src_vecs)
    trg_vecs = WordVecs(trg_vecs_file_path)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(params.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(params.target_lang,
                                                    trg_vecs)

    # Import translation pairs
    translation_file_path = "lexicons/{}/en-{}.txt".format(
        params.trans, params.target_lang)
    print("translation_file_path: {}".format(translation_file_path))
    pdataset = ProjectionDataset(translation_file_path, src_vecs, trg_vecs)

    # Exhaustive grid search over the four hyper-parameter lists.
    for proj_loss in params.proj_losses:
        for alpha in params.alphas:
            for learning_rate in params.learning_rates:
                for batch_size in params.batch_sizes:
                    best_model_file_path, acc, prec, rec, f1, results_file_name = train_model(
                        params.model, dataset, cross_dataset, src_vecs,
                        trg_vecs, synonyms1, synonyms2, neg, cross_syn1,
                        cross_syn2, cross_neg, pdataset, weight_dir,
                        proj_loss, alpha, learning_rate, batch_size,
                        output_dim, b, params)
                    if f1 > best_f1:
                        # New best configuration: echo it to stdout and to
                        # the report file.
                        print()
                        print("Found new set of best hyper params:")
                        print("f1: {0:.3f}".format(f1))
                        print("acc: {0:.3f}".format(acc))
                        print("prec: {0:.3f}".format(prec))
                        print("rec: {0:.3f}".format(rec))
                        print('model: {0}'.format(params.model))
                        print('is_binary: {0}'.format(params.binary))
                        print('epochs: {0}'.format(params.epochs))
                        print('proj_loss: {0}'.format(proj_loss))
                        print('alpha (projection loss coef): {0}'.format(
                            alpha))
                        print('batch size: {0}'.format(batch_size))
                        print('learning rate: {0}'.format(learning_rate))
                        print('weight_dir: {0}'.format(weight_dir))
                        print('best_model_file_path: {0}'.format(
                            best_model_file_path))
                        print()
                        best_params_file.write("\n")
                        best_params_file.write(
                            "Found new set of best hyper params:\n")
                        # NOTE(review): the trailing-colon placement in the
                        # next four lines ("f1 0.123:") comes from the
                        # original format strings and is kept as-is.
                        best_params_file.write(
                            "f1 {0:.3f}:\n".format(f1))
                        best_params_file.write(
                            "acc {0:.3f}:\n".format(acc))
                        best_params_file.write(
                            "prec {0:.3f}:\n".format(prec))
                        best_params_file.write(
                            "rec {0:.3f}:\n".format(rec))
                        best_params_file.write('model: {0}\n'.format(
                            params.model))
                        best_params_file.write('is_binary: {0}\n'.format(
                            params.binary))
                        best_params_file.write('epochs: {0}\n'.format(
                            params.epochs))
                        best_params_file.write(
                            'proj_loss: {0}\n'.format(proj_loss))
                        best_params_file.write(
                            "alpha (projection loss coef): {0}\n".format(
                                alpha))
                        best_params_file.write(
                            'batch size: {0}\n'.format(batch_size))
                        best_params_file.write(
                            'learning: {0}\n'.format(learning_rate))
                        best_params_file.write(
                            'weight_dir: {0}\n'.format(weight_dir))
                        best_params_file.write(
                            'best_model_file_path: {0}\n'.format(
                                best_model_file_path))
                        # Drop the previously-best model/results files now
                        # that this configuration beats them.
                        if old_file_name != None:
                            os.remove(old_file_name)
                        if old_results_file_name != None:
                            os.remove(old_results_file_name)
                        # NOTE(review): params.model is the model *name*
                        # string, so this saves the string rather than the
                        # trained weights -- presumably train_model already
                        # wrote the real checkpoint to best_model_file_path;
                        # confirm.
                        torch.save(params.model, best_model_file_path)
                        old_file_name = best_model_file_path
                        old_results_file_name = results_file_name
                        best_f1 = f1
                        rest_of_scores = [acc, prec, rec]
                        best_params = [
                            proj_loss, alpha, learning_rate, batch_size
                        ]
                    else:
                        # Not an improvement: discard this run's artifacts.
                        os.remove(results_file_name)
                        os.remove(best_model_file_path)

    # Final summary of the search, to stdout and to the report file.
    print("")
    print("Done parameters search")
    print("best f1: {0:.3f}".format(best_f1))
    print("its acc: {0:.3f}".format(rest_of_scores[0]))
    print("its prec: {0:.3f}".format(rest_of_scores[1]))
    print("its rec: {0:.3f}".format(rest_of_scores[2]))
    print("best_params:")
    print('model: {0}'.format(params.model))
    print('is_binary: {0}'.format(params.binary))
    print('proj_loss: {0}'.format(best_params[0]))
    print('alpha (projection loss coef): {0}'.format(best_params[1]))
    print('learning rate: {0}'.format(best_params[2]))
    print('batch size: {0}'.format(best_params[3]))
    print("")
    best_params_file.write("\n")
    best_params_file.write("Done parameters search\n")
    best_params_file.write("best f1: {0:.3f}\n".format(best_f1))
    best_params_file.write("its acc: {0:.3f}\n".format(rest_of_scores[0]))
    best_params_file.write("its prec: {0:.3f}\n".format(rest_of_scores[1]))
    best_params_file.write("its rec: {0:.3f}\n".format(rest_of_scores[2]))
    best_params_file.write('model: {0}\n'.format(params.model))
    best_params_file.write('is_binary: {0}\n'.format(params.binary))
    best_params_file.write('proj_loss: {0}\n'.format(best_params[0]))
    best_params_file.write("alpha (projection loss coef): {0}\n".format(
        best_params[1]))
    best_params_file.write('learning: {0}\n'.format(best_params[2]))
    best_params_file.write('batch size: {0}\n'.format(best_params[3]))
    best_params_file.close()
def test_embeddings(embedding_file, file_type):
    """Evaluate averaged-embedding features with logistic regression.

    Tang et al. (2014) embeddings and classification approach on a number
    of benchmark datasets.

    Parameters
    ----------
    embedding_file : str
        Path to the pretrained word-embedding file.
    file_type : str
        Embedding file format identifier passed through to WordVecs.

    Returns
    -------
    tuple
        (names, results, dim): dataset names (plus an 'overall' row),
        per-dataset [acc, precision, recall, f1] rows, and the embedding
        dimensionality.
    """
    print('importing vectors...')
    vecs = WordVecs(embedding_file, file_type)
    dim = vecs.vector_size

    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         None,
                                         one_hot=False,
                                         binary=False,
                                         rep=words)
    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           None,
                                           one_hot=False,
                                           binary=True,
                                           rep=words)
    opener_dataset = General_Dataset('datasets/opener',
                                     vecs,
                                     one_hot=False,
                                     rep=words)
    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           vecs._w2idx,
                                           rep=words,
                                           binary=True,
                                           one_hot=False)
    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              vecs._w2idx,
                                              rep=words,
                                              binary=True,
                                              one_hot=False)
    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                      vecs._w2idx,
                                      rep=words,
                                      one_hot=False)

    datasets = [
        st_fine, st_binary, opener_dataset, sentube_auto_dataset,
        sentube_tablets_dataset, semeval_dataset
    ]
    names = [
        'sst_fine', 'sst_binary', 'opener', 'sentube_auto',
        'sentube_tablets', 'semeval'
    ]

    # Collect one [acc, precision, recall, f1] row per dataset here.
    results = []
    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))

        # Represent each text as a single feature vector built from its
        # word embeddings.
        Xtrain = np.array(
            [conv_tweet(' '.join(t), vecs) for t in dataset._Xtrain])
        Xtest = np.array(
            [conv_tweet(' '.join(t), vecs) for t in dataset._Xtest])
        Xdev = np.array([conv_tweet(' '.join(t), vecs) for t in dataset._Xdev])

        # get best regularization strength on dev set (the second returned
        # value, a learning rate, is unused by LogisticRegression)
        best_C, _ = get_best_C(Xtrain, dataset._ytrain, Xdev, dataset._ydev)
        clf = LogisticRegression(C=best_C)
        clf.fit(Xtrain, dataset._ytrain)
        pred = clf.predict(Xtest)

        predictions_file = "predictions/joint/" + name + '/pred.txt'
        print_prediction(predictions_file, pred)

        # Binary F1 for two-class tasks, micro-averaged otherwise.
        labels = sorted(set(dataset._ytrain))
        average = 'binary' if len(labels) == 2 else 'micro'
        mm = MyMetrics(dataset._ytest,
                       pred,
                       one_hot=False,
                       labels=labels,
                       average=average)
        acc, precision, recall, f1 = mm.get_scores()
        results.append([acc, precision, recall, f1])

    # Append the column-wise mean over all datasets as an 'overall' row.
    results.append(list(np.array(results).mean(axis=0)))
    names.append('overall')
    return names, results, dim
def main():
    """Train and evaluate the BLE cross-lingual sentiment model.

    Parses CLI options, loads the English source data plus one target
    language, fits the BLE model (binary or 4-class), reloads the best
    checkpoint found in the weight directory, and writes evaluation plots
    and predictions whose file names encode the best hyper-parameters.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', help="target language: es, ca, eu",
                        default='es')
    parser.add_argument('-bi', help="binary or 4-class",
                        default=False, type=str2bool)
    parser.add_argument('-epoch', default=300, type=int)
    parser.add_argument('-alpha', default=.5, type=float)
    parser.add_argument('-batch_size', default=200, type=int)
    parser.add_argument('-src_vecs', default='embeddings/original/google.txt')
    parser.add_argument('-trg_vecs',
                        default='embeddings/original/sg-300-es.txt')
    parser.add_argument(
        '-trans',
        help='translation pairs',
        default=
        'lexicons/bingliu_en_es.one-2-one_AND_Negators_Intensifiers_Diminishers.txt'
    )
    parser.add_argument('-dataset', default='opener')
    args = parser.parse_args()

    # import datasets (representation will depend on final classifier)
    print('importing datasets')
    dataset = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                              None,
                              binary=args.bi,
                              rep=words,
                              one_hot=False)
    cross_dataset = General_Dataset(os.path.join('datasets', args.l,
                                                 args.dataset),
                                    None,
                                    binary=args.bi,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(args.trg_vecs)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant('en', src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.l, trg_vecs)

    # Import translation pairs
    pdataset = ProjectionDataset(args.trans, src_vecs, trg_vecs)

    # initialize classifier: 2 outputs for the binary task, 4 otherwise
    output_dim = 2 if args.bi else 4
    ble = BLE(src_vecs, trg_vecs, pdataset, dataset, cross_dataset,
              synonyms1, synonyms2, neg, cross_syn1, cross_syn2, cross_neg,
              output_dim)

    # train model
    print('training model')
    print('Parameters:')
    print('lang: {0}'.format(args.l))
    print('binary: {0}'.format(args.bi))
    print('epoch: {0}'.format(args.epoch))
    print('alpha: {0}'.format(args.alpha))
    print('batchsize: {0}'.format(args.batch_size))
    print('src vecs: {0}'.format(args.src_vecs))
    print('trg_vecs: {0}'.format(args.trg_vecs))
    print('trans dict: {0}'.format(args.trans))
    print('dataset: {0}'.format(args.dataset))

    b = 'bi' if args.bi else '4cls'
    weight_dir = os.path.join('models',
                              '{0}-{1}-{2}'.format(args.dataset, args.l, b))
    ble.fit(pdataset._Xtrain,
            pdataset._ytrain,
            dataset._Xtrain,
            dataset._ytrain,
            weight_dir=weight_dir,
            alpha=args.alpha,
            epochs=args.epoch,
            batch_size=args.batch_size)

    # get the best weights found during training
    best_f1, best_params, best_weights = get_best_run(weight_dir)
    epochs, batch_size, alpha = best_params
    ble.load_weights(best_weights)

    # evaluate: output file names encode the task and best hyper-parameters
    suffix = '{0}-{1}-alpha{2}-epoch{3}-batch{4}'.format(
        args.dataset, b, alpha, epochs, batch_size)
    ble.plot(outfile=os.path.join('figures', 'syn-ant', args.l, 'ble',
                                  suffix + '.pdf'))
    # The 4-class task is scored with macro-averaged F1; the binary task
    # uses evaluate()'s default averaging, as in the original branches.
    eval_kwargs = {} if args.bi else {'average': 'macro'}
    ble.evaluate(cross_dataset._Xtest, cross_dataset._ytest, src=False,
                 **eval_kwargs)
    ble.evaluate(cross_dataset._Xtest,
                 cross_dataset._ytest,
                 src=False,
                 outfile=os.path.join('predictions', args.l, 'ble',
                                      suffix + '.txt'),
                 **eval_kwargs)
def test_embeddings(file, file_type):
    """Train and evaluate a CNN sentiment classifier on several benchmarks.

    For every benchmark dataset this builds a padded word-index
    representation, looks up dev-tuned CNN hyper-parameters, trains the CNN
    five times (keeping each run's best checkpoint by validation accuracy),
    and records test-set metrics.

    Parameters
    ----------
    file : str
        Path to the word-embedding file.
    file_type : str
        Accepted for interface compatibility; WordVecs is called without it
        here, so the format is determined by WordVecs itself.

    Returns
    -------
    tuple
        (names, results, std_devs, dim): dataset names, per-dataset mean
        [acc, precision, recall, f1] over the 5 runs, their standard
        deviations, and the embedding dimensionality.
    """
    print('Importing vecs...')
    vecs = WordVecs(file)

    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         None,
                                         one_hot=True,
                                         binary=False,
                                         rep=words)
    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           None,
                                           one_hot=True,
                                           binary=True,
                                           rep=words)
    opener_dataset = General_Dataset('datasets/opener',
                                     vecs,
                                     one_hot=True,
                                     rep=word_reps)
    # NOTE(review): twitter_dataset is loaded but never evaluated below; it
    # is kept to preserve the original behavior -- confirm whether it should
    # be added to `datasets`/`names`.
    twitter_dataset = Semeval_Dataset('datasets/twitter',
                                      vecs._w2idx,
                                      rep=word_reps,
                                      one_hot=True)
    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           vecs._w2idx,
                                           rep=word_reps,
                                           binary=True,
                                           one_hot=True)
    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              vecs._w2idx,
                                              rep=word_reps,
                                              binary=True,
                                              one_hot=True)
    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                      vecs,
                                      rep=words,
                                      one_hot=True)

    datasets = [
        st_fine, st_binary, opener_dataset, sentube_auto_dataset,
        sentube_tablets_dataset, semeval_dataset
    ]
    names = [
        'sst_fine', 'sst_binary', 'opener', 'sentube_auto',
        'sentube_tablets', 'semeval'
    ]
    dim = vecs.vector_size

    # Collect per-dataset averages and std deviations here.
    # FIX: these lists were previously re-initialized inside the dataset
    # loop and never appended to, so the function always returned empty
    # result lists.
    results = []
    std_devs = []
    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))

        # Longest sentence (for padding) and raw term frequencies.
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(
                dataset._Xtest):
            if len(sent) > max_length:
                max_length = len(sent)
            for w in sent:
                if w not in vocab:
                    vocab[w] = 1
                else:
                    vocab[w] += 1

        # Keep only embeddings for words occurring in this dataset; give
        # unseen words random vectors of the same dimensionality.
        wordvecs = {}
        for w in vecs._w2idx.keys():
            if w in vocab:
                wordvecs[w] = vecs[w]
        add_unknown_words(wordvecs, vocab, min_df=1, k=dim)
        W, word_idx_map = get_W(wordvecs, k=dim)

        print('Converting and Padding dataset...')
        dataset = convert_dataset(dataset, word_idx_map, max_length)
        output_dim = dataset._ytest.shape[1]

        # Get best dev-set hyper-parameters (cached per embedding width).
        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_cnn.dev.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, max_length, dataset._Xtrain,
            dataset._ytrain, dataset._Xdev, dataset._ydev, W)

        # Train 5 times with fresh initializations.
        # FIX: dataset_results used to be reset inside this loop, which
        # discarded all but the last run's scores.
        dataset_results = []
        for i in range(5):
            checkpoint = ModelCheckpoint(
                'models/cnn/' + name + '/run' + str(i + 1) +
                '/weights.{epoch:03d}-{val_acc:.4f}.hdf5',
                monitor='val_acc',
                verbose=1,
                save_best_only=True,
                mode='auto')
            clf = create_cnn(W,
                             max_length,
                             dim=best_dim,
                             dropout=best_dropout,
                             output_dim=output_dim)
            clf.fit(dataset._Xtrain,
                    dataset._ytrain,
                    validation_data=[dataset._Xdev, dataset._ydev],
                    epochs=best_epoch,
                    verbose=1,
                    callbacks=[checkpoint])

            # Reload the checkpoint with the highest validation accuracy,
            # parsed from the saved file names.
            base_dir = 'models/cnn/' + name + '/run' + str(i + 1)
            best_val = 0
            best_weights = ''
            for weight in os.listdir(base_dir):
                val_acc = re.sub('weights.[0-9]*-', '', weight)
                val_acc = re.sub('.hdf5', '', val_acc)
                val_acc = float(val_acc)
                if val_acc > best_val:
                    best_val = val_acc
                    best_weights = weight
            clf = load_model(os.path.join(base_dir, best_weights))

            pred = clf.predict(dataset._Xtest, verbose=1)
            classes = clf.predict_classes(dataset._Xtest, verbose=1)
            prediction_file = 'predictions/cnn/' + name + '/run' + str(
                i + 1) + '/pred.txt'
            w2idx_file = 'predictions/cnn/' + name + '/w2idx.pkl'
            print_prediction(prediction_file, classes)
            with open(w2idx_file, 'wb') as out:
                pickle.dump(word_idx_map, out)

            # Binary F1 for two-class tasks, micro-averaged otherwise.
            labels = sorted(set(dataset._ytrain.argmax(1)))
            if len(labels) == 2:
                average = 'binary'
            else:
                average = 'micro'
            mm = MyMetrics(dataset._ytest, pred, labels=labels,
                           average=average)
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])

        # Aggregate the 5 runs into a mean and std-dev row per dataset.
        dataset_results = np.array(dataset_results)
        results.append(list(dataset_results.mean(axis=0)))
        std_devs.append(list(dataset_results.std(axis=0)))

    return names, results, std_devs, dim
# --- Script-level setup: CLI arguments, pretrained embeddings, and the
# shared task vocabulary ---
parser = argparse.ArgumentParser()
parser.add_argument("--NUM_LAYERS", "-nl", default=1, type=int)
parser.add_argument("--HIDDEN_DIM", "-hd", default=100, type=int)
parser.add_argument("--BATCH_SIZE", "-bs", default=50, type=int)
parser.add_argument("--EMBEDDING_DIM", "-ed", default=300, type=int)
parser.add_argument("--TRAIN_EMBEDDINGS", "-te", action="store_true")
parser.add_argument("--AUXILIARY_TASK", "-aux", default="negation_scope")
parser.add_argument("--EMBEDDINGS",
                    "-emb",
                    default="../../embeddings/google.txt")
args = parser.parse_args()
print(args)

# Get embeddings (CHANGE TO GLOVE OR FASTTEXT EMBEDDINGS)
embeddings = WordVecs(args.EMBEDDINGS)
w2idx = embeddings._w2idx

# Create shared vocabulary for tasks
vocab = Vocab(train=True)

# Update with word2idx from pretrained embeddings so we don't lose them
# making sure to change them by one to avoid overwriting the UNK token
# at index 0
with_unk = {}
for word, idx in embeddings._w2idx.items():
    with_unk[word] = idx + 1
vocab.update(with_unk)

# Import datasets
# This will update vocab with words not found in embeddings
# NOTE(review): this chunk begins mid-statement -- the first line below is
# the tail of a parser.add_argument call (the source-embedding option) whose
# opening is outside this view.
default="../../embeddings/BLSE/google.txt")
parser.add_argument('-te',
                    '--trg_embedding',
                    default="../../embeddings/BLSE/sg-300-es.txt")
parser.add_argument('-sd',
                    '--src_dataset',
                    default="datasets/training/en/raw")
parser.add_argument('-td',
                    '--trg_dataset',
                    default="datasets/training/es/raw")
args = parser.parse_args()

# Import monolingual vectors, then mean-center and length-normalize them
# before learning the projection.
print('importing word embeddings')
src_vecs = WordVecs(args.src_embedding)
src_vecs.mean_center()
src_vecs.normalize()
trg_vecs = WordVecs(args.trg_embedding)
trg_vecs.mean_center()
trg_vecs.normalize()

# Setup projection dataset (Bing Liu lexicon translation pairs for the
# target language given by args.lang)
trans = 'lexicons/bingliu_en_{0}.one-2-one.txt'.format(args.lang)
pdataset = ProjectionDataset(trans, src_vecs, trg_vecs)

# learn the translation matrix W
print('Projecting src embeddings to trg space...')
W = get_projection_matrix(pdataset, src_vecs, trg_vecs)
print('W done')
def main():
    """Artetxe-style projection baseline for cross-lingual sentiment.

    For each target language (es/ca/eu): learn a linear projection W from
    translation pairs, project the English embeddings into the target
    space, then train a linear SVM on averaged vectors and report
    cross-lingual accuracy and macro F1 for the binary and/or 4-class task.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-src_vecs',
        default='embeddings/original/google.txt',
        help=" source language vectors (default: GoogleNewsVecs )")
    parser.add_argument(
        '-trg_vecs',
        default='embeddings/original/sg-300-{0}.txt',
        help=" target language vectors (default: SGNS on Wikipedia)")
    parser.add_argument(
        '-trans',
        help=
        'translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
        default='lexicons/bingliu/en-{0}.txt')
    parser.add_argument('-dataset',
                        default='opener_sents',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help=
        'List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False],
        nargs='+',
        type=str2bool)
    args = parser.parse_args()

    def run_task(train_data, cross_data, outfile, header):
        # Tune C on dev data, fit a linear SVM, dump predictions, then
        # report accuracy and macro F1 on the cross-lingual test set.
        best_c, best_f1 = get_best_C(train_data, cross_data)
        clf = LinearSVC(C=best_c)
        clf.fit(train_data._Xtrain, train_data._ytrain)
        cpred = clf.predict(cross_data._Xtest)
        cf1 = macro_f1(cross_data._ytest, cpred)
        print_prediction(clf, cross_data, outfile)
        print(header)
        print('Acc: {0:.3f}'.format(
            clf.score(cross_data._Xtest, cross_data._ytest)))
        print('Macro F1: {0:.3f}'.format(cf1))
        print()

    # Loop over the three languages
    for lang in ['es', 'ca', 'eu']:
        print('################ {0} ##############'.format(lang))

        # Import monolingual vectors
        print('importing word embeddings')
        src_vecs = WordVecs(args.src_vecs)
        src_vecs.mean_center()
        src_vecs.normalize()
        trg_vecs = WordVecs(args.trg_vecs.format(lang))
        trg_vecs.mean_center()
        trg_vecs.normalize()

        # Setup projection dataset and learn the translation matrix W
        pdataset = ProjectionDataset(args.trans.format(lang), src_vecs,
                                     trg_vecs)
        W = get_W(pdataset, src_vecs, trg_vecs)

        # project the source matrix to the new shared space
        src_vecs._matrix = np.dot(src_vecs._matrix, W)

        # Import datasets (representation will depend on final classifier)
        print('importing datasets')
        binary_dataset = General_Dataset(os.path.join(
            'datasets', 'en', args.dataset),
                                         src_vecs,
                                         binary=True,
                                         rep=ave_vecs,
                                         one_hot=False,
                                         lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                               trg_vecs,
                                               binary=True,
                                               rep=ave_vecs,
                                               one_hot=False,
                                               lowercase=False)
        fine_dataset = General_Dataset(os.path.join('datasets', 'en',
                                                    args.dataset),
                                       src_vecs,
                                       binary=False,
                                       rep=ave_vecs,
                                       one_hot=False,
                                       lowercase=False)
        fine_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                             trg_vecs,
                                             binary=False,
                                             rep=ave_vecs,
                                             one_hot=False,
                                             lowercase=False)

        # Train and evaluate a linear SVM for each requested task.
        if True in args.bi:
            run_task(
                binary_dataset, binary_cross_dataset,
                os.path.join('predictions', lang, 'artetxe',
                             '{0}-bi.txt'.format(args.dataset)), '-binary-')
        if False in args.bi:
            run_task(
                fine_dataset, fine_cross_dataset,
                os.path.join('predictions', lang, 'artetxe',
                             '{0}-4cls.txt'.format(args.dataset)), '-fine-')
def main():
    """MUSE baseline for cross-lingual sentiment classification.

    For each target language (ca/eu/es): load the pre-aligned MUSE
    embeddings for English and the target language, then train a linear
    SVM on averaged vectors and report cross-lingual accuracy and macro F1
    for the binary and/or 4-class task.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-vec_dir',
                        default='../deployment/MUSE/',
                        help=" directory that hold MUSE vectors")
    parser.add_argument('-dataset',
                        default='opener',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help=
        'List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False],
        nargs='+',
        type=str2bool)
    args = parser.parse_args()

    def run_task(train_data, cross_data, outfile, header):
        # Tune C on dev data, fit a linear SVM, dump predictions, then
        # report accuracy and macro F1 on the cross-lingual test set.
        best_c, best_f1 = get_best_C(train_data, cross_data)
        clf = LinearSVC(C=best_c)
        clf.fit(train_data._Xtrain, train_data._ytrain)
        cpred = clf.predict(cross_data._Xtest)
        cf1 = macro_f1(cross_data._ytest, cpred)
        print_prediction(clf, cross_data, outfile)
        print(header)
        print('Acc: {0:.3f}'.format(
            clf.score(cross_data._Xtest, cross_data._ytest)))
        print('Macro F1: {0:.3f}'.format(cf1))
        print()

    # Loop over the three languages
    for lang in ['ca', 'eu', 'es']:
        print('################ {0} ##############'.format(lang))

        # Import monolingual vectors (already aligned by MUSE)
        print('importing word embeddings')
        src_vecs = WordVecs(
            os.path.join(args.vec_dir, 'en-{0}'.format(lang), 'muse-en.txt'))
        trg_vecs = WordVecs(
            os.path.join(args.vec_dir, 'en-{0}'.format(lang),
                         'muse-{0}.txt'.format(lang)))

        # Import datasets (representation will depend on final classifier)
        print('importing datasets')
        binary_dataset = General_Dataset(os.path.join(
            'datasets', 'en', args.dataset),
                                         src_vecs,
                                         binary=True,
                                         rep=ave_vecs,
                                         one_hot=False,
                                         lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                               trg_vecs,
                                               binary=True,
                                               rep=ave_vecs,
                                               one_hot=False,
                                               lowercase=False)
        fine_dataset = General_Dataset(os.path.join('datasets', 'en',
                                                    args.dataset),
                                       src_vecs,
                                       binary=False,
                                       rep=ave_vecs,
                                       one_hot=False,
                                       lowercase=False)
        fine_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                             trg_vecs,
                                             binary=False,
                                             rep=ave_vecs,
                                             one_hot=False,
                                             lowercase=False)

        # Train and evaluate a linear SVM for each requested task.
        if True in args.bi:
            run_task(
                binary_dataset, binary_cross_dataset,
                os.path.join('predictions', lang, 'muse',
                             '{0}-bi.txt'.format(args.dataset)), '-binary-')
        if False in args.bi:
            run_task(
                fine_dataset, fine_cross_dataset,
                os.path.join('predictions', lang, 'muse',
                             '{0}-4cls.txt'.format(args.dataset)), '-fine-')