예제 #1
0
def _report_scores(train_data, test_data, out_file):
    """Tune C against test_data, fit a LinearSVC on train_data, dump
    predictions to out_file and print accuracy/precision/recall/f1."""
    best_c, _ = get_best_C(train_data, test_data)
    clf = LinearSVC(C=best_c)
    clf.fit(train_data._Xtrain, train_data._ytrain)
    acc, prec, rec, f1 = scores(clf, test_data)
    print_prediction(clf, test_data, out_file)
    print('acc:   {0:.3f}'.format(acc))
    print('prec:  {0:.3f}'.format(prec))
    print('rec:   {0:.3f}'.format(rec))
    print('f1:    {0:.3f}'.format(f1))


def main():
    """Machine-translation baseline: train English SVMs (binary and 4-class)
    on averaged word embeddings and evaluate them on machine-translated
    es/ca/eu versions of the same dataset."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default='opener_sents', help="dataset to train and test on (default: opener)")
    args = parser.parse_args()

    # Pretrained English embeddings used by the ave_vecs representation.
    vecs = WordVecs('/home/jeremy/NS/Keep/Temp/Exps/EMBEDDINGS/BLSE/google.txt')

    # English training data: 4-class and binary variants of the same dataset.
    en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                         vecs, one_hot=False, rep=ave_vecs, lowercase=False)
    en_binary = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                                vecs, one_hot=False, rep=ave_vecs, binary=True, lowercase=False)

    langs = ['es', 'ca', 'eu']

    for lang in langs:
        print('#### {0} ####'.format(lang))
        # Machine-translated target-language test data, 4-class and binary.
        cross_dataset = General_Dataset(os.path.join('datasets', 'trans', lang, args.dataset),
                                        vecs, one_hot=False, rep=ave_vecs, lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join('datasets', 'trans', lang, args.dataset),
                                               vecs, one_hot=False, rep=ave_vecs,
                                               binary=True, lowercase=False)

        print('-binary-')
        _report_scores(en_binary, binary_cross_dataset,
                       os.path.join('predictions', lang, 'mt', '{0}-bi.txt'.format(args.dataset)))

        print('-fine-')
        _report_scores(en, cross_dataset,
                       os.path.join('predictions', lang, 'mt', '{0}-4cls.txt'.format(args.dataset)))
예제 #2
0
def test_embeddings(bi, embedding_file, file_type):
    """
    Train and evaluate (Bi)LSTM sentiment classifiers on six benchmarks.

    bi: if true, use a bidirectional lstm, otherwise use a normal lstm
    embedding_file: the word embeddings file
    file_type:      word2vec, glove, tang, bin

    For each benchmark: build the vocabulary, keep only in-vocabulary
    embeddings, read the best hyperparameters from a dev-params file,
    then train the model 5 times, checkpointing weights every epoch and
    reloading the checkpoint with the best validation accuracy for testing.

    Benchmarks:
    Stanford Sentiment corpus (Socher et al., 2013)
    OpeNER corpus (Agerri et al., 2016)
    Sentube Corpora (Severyn et al., 2016)
    Semeval 2016 twitter corpus - task A

    Returns (names, results, std_devs, dim): per-dataset means and standard
    deviations of [acc, precision, recall, f1] over the 5 runs, plus an
    'overall' average row, and the embedding dimensionality.
    """

    print('importing vectors...')
    vecs = WordVecs(embedding_file, file_type)
    dim = vecs.vector_size
    # NOTE(review): lstm_dim and dropout are never read below -- the values
    # from get_dev_params are used instead; only `train` is actually used.
    lstm_dim=50
    dropout=.3
    train=True

    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                            None,
                                            one_hot=True,
                                            binary=False,
                                            rep=words)


    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                            None,
                                            one_hot=True,
                                            binary=True,
                                            rep=words)

    opener_dataset = General_Dataset('datasets/opener',
                                     None,
                                     one_hot=True,
                                     rep=words)


    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           None, rep=words,
                                           binary=True,
                                           one_hot=True)

    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              None, rep=words,
                                              binary=True,
                                              one_hot=True)

    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                                None, rep=words,
                                                one_hot=True)

    datasets = [st_fine, st_binary, opener_dataset,
                sentube_auto_dataset, sentube_tablets_dataset, semeval_dataset]

    names = ['sst_fine', 'sst_binary', 'opener',
             'sentube_auto', 'sentube_tablets', 'semeval']

    # Collect results here
    results = []
    std_devs = []


    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))

        # Longest sentence (for padding) and term frequencies over
        # train + dev + test.
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
            if len(sent) > max_length:
                max_length = len(sent)
            for w in sent:
                if w not in vocab:
                    vocab[w] = 1
                else:
                    vocab[w] += 1

        # Keep only the embeddings for words that occur in this dataset.
        wordvecs = {}
        for w in vecs._w2idx.keys():
            if w in vocab:
                wordvecs[w] = vecs[w]

        # Give OOV words vectors and build the embedding matrix + word index.
        add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
        W, word_idx_map = get_W(wordvecs, dim=dim)

        print('Converting and Padding dataset...')

        # Replace tokens with padded index sequences for the Keras models.
        dataset = convert_dataset(dataset, word_idx_map, max_length)


        output_dim = dataset._ytest.shape[1]

        """
        Get best Dev params
        ===========================================================
        """
        if bi:
            dev_params_file = 'dev_params/'+str(W.shape[1])+'_bilstm.dev.txt'
        else:
            dev_params_file = 'dev_params/'+str(W.shape[1])+'_lstm.dev.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(name, dev_params_file, bi,
                   dataset._Xtrain, dataset._ytrain, dataset._Xdev, dataset._ydev, wordvecs)



        """
        Test model 5 times and get averages and std dev.
        """
        print('Running 5 runs to get average and standard deviations')
        dataset_results = []
        # `it` is unused; the loop just runs 5 independent training runs,
        # each reseeded from OS entropy via np.random.seed().
        for i, it in enumerate(range(5)):
            np.random.seed()
            print(i+1)

            if bi:
                clf = create_BiLSTM(wordvecs, best_dim, output_dim, best_dropout, weights=W, train=train)
                checkpoint = ModelCheckpoint('models/bilstm/' + name +'/run'+ str(i+1)+'/weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
            else:
                checkpoint = ModelCheckpoint('models/lstm/' + name + '/run'+ str(i+1)+'/weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
                clf = create_LSTM(wordvecs, best_dim, output_dim, best_dropout, weights=W, train=train)

            # Training history is unused; training is driven for best_epoch
            # epochs with per-epoch checkpointing of improved val_acc.
            h = clf.fit(dataset._Xtrain, dataset._ytrain, validation_data=[dataset._Xdev, dataset._ydev],
                        epochs=best_epoch, verbose=1, callbacks=[checkpoint])

            if bi:
                base_dir = 'models/bilstm/'+ name +'/run'+ str(i+1)
                weights = os.listdir(base_dir)
            else:
                base_dir = 'models/lstm/' + name + '/run'+str(i+1)
                weights = os.listdir(base_dir)

            # Pick the checkpoint with the highest validation accuracy by
            # parsing the 'weights.{epoch:03d}-{val_acc:.4f}.hdf5' filename.
            # NOTE(review): the '.' in these patterns is an unescaped regex
            # wildcard -- works for these names, but worth tightening.
            best_val = 0
            best_weights = ''
            for weight in weights:
                val_acc = re.sub('weights.[0-9]*-', '', weight)
                val_acc = re.sub('.hdf5', '', val_acc)
                val_acc = float(val_acc)
                if val_acc > best_val:
                    best_val = val_acc
                    best_weights = weight

            # Reload the best checkpoint and evaluate it on the test set.
            clf = load_model(os.path.join(base_dir, best_weights))
            pred = clf.predict(dataset._Xtest, verbose=1)
            # NOTE(review): predict_classes is a legacy Sequential-model API
            # (removed in newer Keras/TF) -- confirm the pinned version.
            classes = clf.predict_classes(dataset._Xtest, verbose=1)
            if bi:
                prediction_file = 'predictions/bilstm/' + name + '/run' + str(i+1) + '/pred.txt'
                w2idx_file = 'predictions/bilstm/' + name + '/w2idx.pkl'
            else:
                prediction_file = 'predictions/lstm/' + name + '/run' + str(i+1) + '/pred.txt'
                w2idx_file = 'predictions/lstm/' + name + '/w2idx.pkl'
            print_prediction(prediction_file, classes)
            with open(w2idx_file, 'wb') as out:
                pickle.dump(word_idx_map, out)

            # Binary F1 for 2-class tasks, micro-averaged otherwise.
            labels = sorted(set(dataset._ytrain.argmax(1)))
            if len(labels) == 2:
                average = 'binary'
            else:
                average = 'micro'
            mm = MyMetrics(dataset._ytest, pred, labels=labels, average=average)
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])



        # Mean and standard deviation over the 5 runs above
        dataset_results = np.array(dataset_results)
        ave_results = dataset_results.mean(axis=0)
        std_results = dataset_results.std(axis=0)
        print(u'acc: {0:.3f} \u00B1{1:.3f}'.format(ave_results[0], std_results[0]))
        print(u'prec: {0:.3f} \u00B1{1:.3f}'.format(ave_results[1], std_results[1]))
        print(u'recall: {0:.3f} \u00B1{1:.3f}'.format(ave_results[2], std_results[2]))
        print(u'f1: {0:.3f} \u00B1{1:.3f}'.format(ave_results[3], std_results[3]))

        results.append(ave_results)
        std_devs.append(std_results)

    # Append an 'overall' row averaging across datasets.
    results.append(list(np.array(results).mean(axis=0)))
    std_devs.append(list(np.array(std_devs).mean(axis=0)))
    names.append('overall')

    return names, results, std_devs, dim
예제 #3
0
    parser.add_argument('-lr', '--learning_rate', default=0.001, type=float)
    parser.add_argument('-wd', '--weight_decay', default=3e-5, type=float)
    parser.add_argument('-cuda', default=True, type=str2bool)
    parser.add_argument('-seed', default=123, type=int)
    args = parser.parse_args()

    print_args(args)
    args.cuda = args.cuda and torch.cuda.is_available()

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    print('Importing embeddings...')
    vecs = WordVecs(args.embeddings)

    synonyms1, synonyms2, neg = get_syn_ant(args.src_lang, vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.trg_lang, vecs)
    pdataset = ProjectionDataset(
        'lexicons/{0}_{1}.txt'.format(args.src_lang, args.trg_lang), vecs,
        vecs)

    print('Importing datasets...')

    # Get training, dev, and test data
    if args.src_dataset == 'opener':
        if args.binary:
            train_data, dev_data, test_data = open_dataset(
                'datasets/OpeNER/preprocessed/binary/en')
        else:
def test_embeddings(embedding_file, file_type):
    """
    Evaluate word embeddings with an averaged-vector + logistic regression
    baseline.

    embedding_file: the word embeddings file
    file_type:      word2vec, glove, tang, bin

    Use averaged word embeddings for each word in a text as features
    for l2 regularized logistic regression, tested on six benchmarks:

    Stanford Sentiment corpus (Socher et al., 2013)
    OpeNER corpus (Agerri et al., 2016)
    Sentube Corpora (Severyn et al., 2016)
    Semeval 2013 twitter corpus - task 2

    Returns (names, results, dim): per-dataset [acc, precision, recall, f1]
    plus an 'overall' mean row, and the embedding dimensionality.
    """

    print('importing vectors...')
    vecs = WordVecs(embedding_file, file_type)
    dim = vecs.vector_size

    print('importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                            vecs,
                                            one_hot=False,
                                            binary=False,
                                            rep=ave_vecs)

    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                            vecs,
                                            one_hot=False,
                                            binary=True,
                                            rep=ave_vecs)

    opener_dataset = General_Dataset('datasets/opener',
                                     vecs,
                                     one_hot=False,
                                     rep=ave_vecs)

    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           vecs, rep=ave_vecs,
                                           binary=True,
                                           one_hot=False)

    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              vecs, rep=ave_vecs,
                                              binary=True,
                                              one_hot=False)

    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                                vecs, rep=ave_vecs,
                                                one_hot=False)

    datasets = [st_fine, st_binary, opener_dataset,
                sentube_auto_dataset, sentube_tablets_dataset, semeval_dataset]

    names = ['sst_fine', 'sst_binary', 'opener',
             'sentube_auto', 'sentube_tablets', 'semeval']

    # Collect results here
    results = []

    for name, dataset in zip(names, datasets):
        print('Testing vectors on {0}...'.format(name))

        # Tune the regularization strength on this dataset's dev split.
        best_c, _ = get_best_C(dataset)

        # Fit and predict. LogisticRegression.fit returns the estimator
        # itself, so there is no training history to keep.
        classifier = LogisticRegression(C=best_c)
        classifier.fit(dataset._Xtrain, dataset._ytrain)
        pred = classifier.predict(dataset._Xtest)
        predictions_file = "predictions/ave/" + name + '/pred.txt'
        print_prediction(predictions_file, pred)

        # Binary F1 for 2-class tasks, micro-averaged otherwise.
        labels = sorted(set(dataset._ytrain))
        if len(labels) == 2:
            average = 'binary'
        else:
            average = 'micro'
        mm = MyMetrics(dataset._ytest, pred, labels=labels,
                       average=average, one_hot=False)
        acc, precision, recall, f1 = mm.get_scores()
        results.append([acc, precision, recall, f1])

    # Add overall results
    results.append(list(np.array(results).mean(axis=0)))
    names.append('overall')

    return names, results, dim
예제 #5
0
파일: main.py 프로젝트: UriSha/blse
def main():
    """Train and evaluate a cross-lingual RNN-based BLSE sentiment model.

    Parses command-line options, loads source/target word embeddings, the
    source and target sentiment datasets and a translation lexicon, trains
    the selected model (rnn_blse or rnn_attn_blse) while checkpointing
    weights under --savedir, then reloads the best dev-F1 checkpoint and
    evaluates it on the target-language test set.  Results are printed and
    written to a report file under results/.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-sl',
                        '--source_lang',
                        help="source language: es, ca, eu, en (default: en)",
                        default='en')
    parser.add_argument('-tl',
                        '--target_lang',
                        help="target language: es, ca, eu, en (default: es)",
                        default='es')
    parser.add_argument('-bi',
                        '--binary',
                        help="binary or 4-class (default: True)",
                        default=True,
                        type=str2bool)
    parser.add_argument('-e',
                        '--epochs',
                        help="training epochs (default: 200)",
                        default=200,
                        type=int)
    parser.add_argument(
        '-a',
        '--alpha',
        help=
        "trade-off between projection and classification objectives (default: .1)",
        default=.1,
        type=float)
    parser.add_argument('-pl',
                        '--proj_loss',
                        help="projection loss: mse, cosine (default: cosine)",
                        default='cosine')
    parser.add_argument('-bs',
                        '--batch_size',
                        help="classification batch size (default: 21)",
                        default=21,
                        type=int)
    parser.add_argument(
        '-sv',
        '--src_vecs',
        help=" source language vectors (default: GoogleNewsVecs )",
        default='embeddings/original/google.txt')
    parser.add_argument(
        '-tr',
        '--trans',
        help=
        'translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
        default='bingliu')
    parser.add_argument(
        '-da',
        '--dataset',
        help="dataset to train and test on (default: opener_sents)",
        default='opener_sents',
    )
    parser.add_argument(
        '-sd',
        '--savedir',
        help="where to dump weights during training (default: ./models)",
        default='models')
    parser.add_argument(
        '-lr',
        '--learning_rate',
        help="optimizer learning rate (default: 0.0001)",
        default=0.0001,
        type=float)
    parser.add_argument(
        '-m',
        '--model',
        help="model to train: rnn_attn_blse, rnn_blse (default: rnn_attn_blse)",
        default='rnn_attn_blse')
    parser.add_argument(
        '-cu',
        '--to_cuda',
        help="run on CUDA when available (default: True)",
        default=True,
        # str2bool, not bool: bool('False') is True, so type=bool made it
        # impossible to disable CUDA from the command line.
        type=str2bool)
    args = parser.parse_args()

    # Fail fast on an unknown model name.
    if args.model not in ['rnn_attn_blse', 'rnn_blse']:
        print("no such model: {}".format(args.model))
        exit(1)

    # If there's no savedir, create it
    os.makedirs(args.savedir, exist_ok=True)

    if args.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'

    weight_dir = "{}/{}/{}-{}-{}".format(args.savedir, args.model,
                                         args.dataset, args.target_lang, b)

    # Learning rate is rendered as a plain decimal (no scientific notation)
    # for a stable, readable file name.
    results_file_name = "results/report_{}_alpha-{}_batch_size-{}_epochs-{}_lr-{}.txt".format(
        args.model, args.alpha, args.batch_size, args.epochs,
        '{0:.15f}'.format(args.learning_rate).rstrip('0').rstrip('.'))

    # import datasets (representation will depend on final classifier)
    print()
    print('training model')
    print('Parameters:')
    print('model:     {0}'.format(args.model))
    print('binary:     {0}'.format(b))
    print('epochs:      {0}'.format(args.epochs))
    print('alpha (projection loss coef):      {0}'.format(args.alpha))
    print('batchsize:  {0}'.format(args.batch_size))
    print('learning rate:  {0}'.format(args.learning_rate))
    print('weight_dir:  {0}'.format(weight_dir))
    print('results_file_name:  {0}'.format(results_file_name))
    print()

    print('importing datasets')

    dataset = General_Dataset(os.path.join('datasets', args.source_lang,
                                           args.dataset),
                              None,
                              binary=args.binary,
                              rep=words,
                              one_hot=False)

    cross_dataset = General_Dataset(os.path.join('datasets', args.target_lang,
                                                 args.dataset),
                                    None,
                                    binary=args.binary,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    trg_vecs_file_path = "embeddings/original/sg-300-{}.txt".format(
        args.target_lang)
    print("trg_vecs_file_path: {}".format(trg_vecs_file_path))
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(trg_vecs_file_path)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(args.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.target_lang, trg_vecs)

    # Import translation pairs
    translation_file_path = "lexicons/{}/en-{}.txt".format(
        args.trans, args.target_lang)
    print("translation_file_path: {}".format(translation_file_path))
    pdataset = ProjectionDataset(translation_file_path, src_vecs, trg_vecs)

    # Set up model (args.model was validated above, so one branch runs).
    if args.model == 'rnn_blse':
        model = RNN_BLSE(
            src_vecs,
            trg_vecs,
            pdataset,
            dataset,
            cross_dataset,
            projection_loss=args.proj_loss,
            output_dim=output_dim,
            batch_size=args.batch_size,
            to_cuda=args.to_cuda,
            src_syn1=synonyms1,
            src_syn2=synonyms2,
            src_neg=neg,
            trg_syn1=cross_syn1,
            trg_syn2=cross_syn2,
            trg_neg=cross_neg,
        )
    elif args.model == 'rnn_attn_blse':
        model = Rnn_Attn_BLSE(
            src_vecs,
            trg_vecs,
            pdataset,
            dataset,
            cross_dataset,
            projection_loss=args.proj_loss,
            output_dim=output_dim,
            to_cuda=args.to_cuda,
            batch_size=args.batch_size,
            src_syn1=synonyms1,
            src_syn2=synonyms2,
            src_neg=neg,
            trg_syn1=cross_syn1,
            trg_syn2=cross_syn2,
            trg_neg=cross_neg,
        )

    if torch.cuda.is_available() and args.to_cuda:
        print("cuda is available")
        model.cuda()
    else:
        print("cuda is not available")

    # Loss Functions
    class_criterion = nn.CrossEntropyLoss()

    if args.proj_loss == 'mse':
        proj_criterion = nn.MSELoss()
    elif args.proj_loss == 'cosine':
        proj_criterion = cosine_loss
    else:
        print("no projection criterion supported: {}".format(args.proj_loss))
        exit(1)

    # Optimizer
    optim = torch.optim.Adam(model.parameters(), args.learning_rate)

    # Fit model
    results_file = open(results_file_name, "w+")
    trainer = Trainer(model, args.alpha, optim, args.learning_rate,
                      class_criterion, proj_criterion, args.epochs,
                      args.batch_size, results_file, weight_dir, args.to_cuda)

    best_model_file_path = trainer.train(pdataset._Xtrain, pdataset._ytrain,
                                         dataset._Xtrain, dataset._ytrain)

    # Get best dev f1 and weights
    print("looking in dir: {}".format(weight_dir))
    best_f1, best_params = get_best_model_params(best_model_file_path)
    best_model = torch.load(best_model_file_path)
    state_dict = best_model.state_dict()
    model.load_state_dict(state_dict)

    print()
    print('Dev set')
    print('best dev f1: {0:.3f}'.format(best_f1))
    print('parameters: epochs {0} batch size {1} alpha {2} learning rate {3}'.
          format(*best_params))

    results_file.write('\n')
    results_file.write('Dev set\n')
    results_file.write('best dev f1: {0:.3f}\n'.format(best_f1))
    results_file.write(
        'parameters: epochs {0} batch size {1} alpha {2}\n'.format(
            *best_params))

    # Evaluate on test set
    model.eval()

    model.evaluate(cross_dataset._Xtest,
                   cross_dataset._ytest,
                   results_file=results_file,
                   src=False)

    model.confusion_matrix(cross_dataset._Xtest,
                           cross_dataset._ytest,
                           src=False,
                           results_file=results_file)

    results_file.close()
예제 #6
0
def test_embeddings(file, threshold, file_type):
    """
    Train LSTM, BiLSTM and CNN models for multi-label emotion detection
    (8 emotions) and report per-emotion and micro-averaged scores.

    file:      word embeddings text file, one word followed by its vector
               per line
    threshold: passed through to Fine_Grained_Emotion_Dataset
    file_type: NOTE(review): never used in this function -- confirm whether
               it can be dropped from the signature

    Each model is trained 5 times with the best dev-set hyperparameters;
    returns (names, all_emo_results, all_emo_std_devs, averaged_results,
    averaged_std_devs, dim) with per-emotion means/std-devs and the
    micro-averaged means/std-devs over the 5 runs.
    """
    emotions = [
        "anger", "anticipation", "disgust", "fear", "joy", "sadness",
        "surprise", "trust"
    ]

    # Import dataset where each test example is the words in the tweet
    dataset = Fine_Grained_Emotion_Dataset('data',
                                           None,
                                           rep=words,
                                           threshold=threshold)

    # Per-emotion positive counts in train and test (labels are multi-hot).
    print('Basic statistics')
    table = []
    for i, emo in enumerate(emotions):
        train = dataset._ytrain[:, i].sum()
        test = dataset._ytest[:, i].sum()
        table.append((emo, train, test))
    print(tabulate.tabulate(table, headers=['emotion', '#train', '#test']))

    #### Get Parameters ####
    # Longest sentence (for padding) and term frequencies over all splits.
    max_length = 0
    vocab = {}
    for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(
            dataset._Xtest):
        if len(sent) > max_length:
            max_length = len(sent)
        for w in sent:
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1

    wordvecs = {}

    # Stream the embedding file, keeping only in-vocabulary vectors;
    # lines that don't parse as floats (e.g. a header) are skipped.
    print('Importing vectors')
    for line in open(file):
        try:
            split = line.split()
            word = split[0]
            vec = np.array(split[1:], dtype='float32')
            if word in vocab:
                wordvecs[word] = vec
        except ValueError:
            pass

    # NOTE(review): dim comes from whatever `vec` was bound to by the last
    # loop iteration; this raises NameError if the file was empty or no line
    # parsed -- confirm inputs always contain at least one valid vector.
    dim = len(vec)

    oov = len(vocab) - len(wordvecs)
    print('OOV: {0}'.format(oov))

    # Add vectors for <unk>
    add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
    W, word_idx_map = get_W(wordvecs, dim=dim)

    # TODO: change this so I don't have to import vectors I don't need
    vecs = WordVecs(file)
    vecs._matrix = W
    vecs._w2idx = word_idx_map
    vecs.vocab_length, vecs.vector_size = W.shape

    # NOTE(review): ave_dataset is never used below -- confirm it is needed.
    ave_dataset = Fine_Grained_Emotion_Dataset('data', vecs, rep=ave_vecs)

    # Get padded word indexes for all X
    Xtrain = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xtrain
    ])
    Xdev = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xdev
    ])
    Xtest = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xtest
    ])

    #### Test Models ####

    names = ['LSTM', 'BiLSTM', 'CNN']

    # Keep all mean and standard deviations of each emotion over datasets here
    all_emo_results = []
    all_emo_std_devs = []

    # Keep all mean and standard deviations of the averaged emotions here
    averaged_results = []
    averaged_std_devs = []

    # TEST EACH MODEL
    for name in names:

        print('Getting best parameters')

        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_params.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, Xtrain, dataset._ytrain, Xdev,
            dataset._ydev, wordvecs, W)

        print('Testing {0}'.format(name))

        # Keep the results for the 5 runs over the dataset
        model_results = []
        model_average_results = []

        # 5 runs to get average and standard deviation
        for i, it in enumerate(range(5)):
            print('Run: {0}'.format(i + 1))

            # create and train a new classifier for each iteration
            if name == 'LSTM':
                model = create_LSTM(wordvecs,
                                    dim=best_dim,
                                    output_dim=8,
                                    dropout=best_dropout,
                                    weights=W,
                                    train=True)
            elif name == 'BiLSTM':
                model = create_BiLSTM(wordvecs,
                                      dim=best_dim,
                                      output_dim=8,
                                      dropout=best_dropout,
                                      weights=W,
                                      train=True)
            elif name == 'CNN':
                model = create_cnn(W, Xtrain.shape[1])

            # NOTE(review): nb_epoch is the Keras 1.x spelling (epochs in
            # Keras 2+) -- confirm the pinned Keras version.
            h = model.fit(Xtrain,
                          dataset._ytrain,
                          validation_data=[Xdev, dataset._ydev],
                          nb_epoch=best_epoch,
                          verbose=0)
            pred = model.predict(Xtest)

            # Binarize the sigmoid outputs with the cutoff helper.
            pred = np.array([cutoff(x) for x in pred])
            y = dataset._ytest

            # Per-emotion binary scores on each label column.
            emo_results = []
            for j in range(len(emotions)):
                emo_y = y[:, j]
                emo_pred = pred[:, j]
                mm = MyMetrics(emo_y,
                               emo_pred,
                               one_hot=False,
                               average='binary')
                acc = mm.accuracy()
                precision, recall, f1 = mm.get_scores()
                emo_results.append([acc, precision, recall, f1])

            emo_results = np.array(emo_results)
            model_results.append(emo_results)

            # print('F1 scores')
            # for emo, result in zip(emotions, emo_results):
            #    a, p, r, f = result
            #    print('{0}: {1:.3f}'.format(emo, f))
            ave_acc, ave_prec, ave_rec, mac_f1 = emo_results.mean(axis=0)
            mic_prec, mic_rec, mic_f1 = micro_f1(dataset._ytest, pred)
            model_average_results.append((ave_acc, mic_prec, mic_rec, mic_f1))

            print(
                'acc: {0:.3f} micro-prec:{1:.3f} micro-rec:{2:.3f} micro-f1:{3:.3f}'
                .format(ave_acc, mic_prec, mic_rec, mic_f1))
            print()

        # Aggregate the 5 runs: per-emotion and micro-averaged mean/std-dev.
        model_results = np.array(model_results)
        model_average_results = np.array(model_average_results)
        average_model_results = model_results.mean(axis=0)
        model_std_dev_results = model_results.std(axis=0)
        overall_avg = model_average_results.mean(axis=0)
        overall_std = model_average_results.std(axis=0)

        all_emo_results.append(average_model_results)
        all_emo_std_devs.append(model_std_dev_results)

        averaged_results.append(overall_avg)
        averaged_std_devs.append(overall_std)

    return names, all_emo_results, all_emo_std_devs, averaged_results, averaged_std_devs, dim
예제 #7
0
def main():
    """Train a BLSE model on a source-language dataset and evaluate it
    cross-lingually on a target-language test set.

    Steps: parse CLI arguments, load the source/target datasets and
    monolingual embeddings, load the translation-pair projection data,
    fit the joint projection + classification objective, reload the
    weights with the best dev-set f1, then print test-set scores,
    write predictions, and show a confusion matrix.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-sl', '--source_lang',
                        help="source language: es, ca, eu, en (default: en)",
                        default='en')
    parser.add_argument('-tl', '--target_lang',
                        help="target language: es, ca, eu, en (default: es)",
                        default='es')
    parser.add_argument('-bi', '--binary',
                        help="binary or 4-class (default: True)",
                        default=True,
                        type=str2bool)
    parser.add_argument('-e', '--epochs',
                        help="training epochs (default: 200)",
                        default=200,
                        type=int)
    parser.add_argument('-a', '--alpha',
                        help="trade-off between projection and classification objectives (default: .001)",
                        default=.001,
                        type=float)
    parser.add_argument('-pl', '--proj_loss',
                        help="projection loss: mse, cosine (default: cosine)",
                        default='mse')
    # BUG FIX: the help text previously said "(default: 50)" although the
    # actual default is 20; the text now matches the real default.
    parser.add_argument('-bs', '--batch_size',
                        help="classification batch size (default: 20)",
                        default=20,
                        type=int)
    parser.add_argument('-sv', '--src_vecs',
                        help=" source language vectors (default: GoogleNewsVecs )",
                        default='google.txt')
    parser.add_argument('-tv', '--trg_vecs',
                        help=" target language vectors (default: SGNS on Wikipedia)",
                        default='sg-300-es.txt')
    parser.add_argument('-tr', '--trans',
                        help='translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
                        default='lexicons/bingliu/en-es.txt')
    parser.add_argument('-da', '--dataset',
                        help="dataset to train and test on (default: opener_sents)",
                        default='opener_sents')
    parser.add_argument('-sd', '--savedir',
                        help="where to dump weights during training (default: ./models)",
                        default='models/blse')
    args = parser.parse_args()

    # Import datasets (representation will depend on the final classifier).
    print('importing datasets')
    dataset = General_Dataset(os.path.join('datasets', args.source_lang,
                                           args.dataset),
                              None,
                              binary=args.binary,
                              rep=words,
                              one_hot=False)
    cross_dataset = General_Dataset(os.path.join('datasets', args.target_lang,
                                                 args.dataset),
                                    None,
                                    binary=args.binary,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors for each language.
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(args.trg_vecs)

    # Sentiment synonyms and antonyms, tracked to check how they move
    # during training.
    synonyms1, synonyms2, neg = get_syn_ant(args.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.target_lang, trg_vecs)

    # Translation pairs used for the projection objective.
    pdataset = ProjectionDataset(args.trans, src_vecs, trg_vecs)

    # Output dimensionality plus a short tag used in prediction filenames.
    if args.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'

    # Set up model.
    blse = BLSE(src_vecs,
                trg_vecs,
                pdataset,
                dataset,
                cross_dataset,
                projection_loss=args.proj_loss,
                output_dim=output_dim,
                src_syn1=synonyms1,
                src_syn2=synonyms2,
                src_neg=neg,
                trg_syn1=cross_syn1,
                trg_syn2=cross_syn2,
                trg_neg=cross_neg)

    # If there's no savedir, create it.
    os.makedirs(args.savedir, exist_ok=True)

    # Fit model; weights are checkpointed to args.savedir during training.
    blse.fit(pdataset._Xtrain,
             pdataset._ytrain,
             dataset._Xtrain,
             dataset._ytrain,
             weight_dir=args.savedir,
             batch_size=args.batch_size,
             alpha=args.alpha,
             epochs=args.epochs)

    # Reload the weights that achieved the best dev-set f1.
    best_f1, best_params, best_weights = get_best_run(args.savedir)
    blse.load_weights(best_weights)
    print()
    print('Dev set')
    print('best dev f1: {0:.3f}'.format(best_f1))
    print(
        'parameters: epochs {0} batch size {1} alpha {2}'.format(*best_params))

    # Evaluate on the target-language test set, once to stdout and once
    # writing predictions to disk.
    blse.evaluate(cross_dataset._Xtest, cross_dataset._ytest, src=False)
    blse.evaluate(cross_dataset._Xtest,
                  cross_dataset._ytest,
                  src=False,
                  outfile=os.path.join(
                      'predictions', args.target_lang, 'blse',
                      '{0}-{1}-alpha{2}-epoch{3}-batch{4}.txt'.format(
                          args.dataset, b, args.alpha, best_params[0],
                          args.batch_size)))

    blse.confusion_matrix(cross_dataset._Xtest,
                          cross_dataset._ytest,
                          src=False)

    blse.plot()
예제 #8
0
def main():
    """Cross-lingual SVM baseline using Barista bilingual embeddings.

    For each target language: collects the joint English/target
    vocabulary, loads only the needed embedding vectors, represents
    sentences as averaged word vectors, tunes C on the dev set, trains a
    LinearSVC on English, and evaluates on the target-language test set
    (binary and/or 4-class, per the -bi flag).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset',
                        default='opener',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help=
        'List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False],
        nargs='+',
        type=str2bool)
    args = parser.parse_args()

    langs = ['es', 'ca', 'eu']

    for lang in langs:
        print('#### {0} ####'.format(lang))
        # First pass: load raw token datasets only to collect the vocabulary.
        en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                             None,
                             one_hot=False,
                             rep=words)
        cross_dataset = General_Dataset(os.path.join('datasets', lang,
                                                     args.dataset),
                                        None,
                                        one_hot=False,
                                        rep=words)
        # BUG FIX: .update() mutates in place and returns None, so the
        # original `vocab = en.vocab.update(...)` always passed vocab=None
        # to WordVecs, silently disabling vocabulary filtering.
        en.vocab.update(cross_dataset.vocab)
        vocab = en.vocab

        vecs = WordVecs(
            'embeddings/barista/sg-300-window4-negative20_en_{0}.txt'.format(
                lang),
            vocab=vocab)

        # Second pass: re-load the datasets as averaged embedding vectors.
        en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                             vecs,
                             one_hot=False,
                             rep=ave_vecs,
                             lowercase=False)
        en_binary = General_Dataset(os.path.join('datasets', 'en',
                                                 args.dataset),
                                    vecs,
                                    one_hot=False,
                                    rep=ave_vecs,
                                    binary=True,
                                    lowercase=False)

        cross_dataset = General_Dataset(os.path.join('datasets', lang,
                                                     args.dataset),
                                        vecs,
                                        one_hot=False,
                                        rep=ave_vecs,
                                        lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                               vecs,
                                               one_hot=False,
                                               rep=ave_vecs,
                                               binary=True,
                                               lowercase=False)

        if True in args.bi:
            print('-binary-')
            # Tune C on dev data, train on English, test cross-lingually.
            best_c, best_f1 = get_best_C(en_binary, binary_cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(en_binary._Xtrain, en_binary._ytrain)
            acc, f1 = scores(clf, binary_cross_dataset, 'binary')
            print_prediction(
                clf, binary_cross_dataset,
                os.path.join('predictions', lang, 'barista',
                             '{0}-bi.txt'.format(args.dataset)))
            print('acc: {0:.3f}'.format(acc))
            print('f1:  {0:.3f}'.format(f1))

        if False in args.bi:
            print('-fine-')
            best_c, best_f1 = get_best_C(en, cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(en._Xtrain, en._ytrain)
            acc, f1 = scores(clf, cross_dataset)
            print_prediction(
                clf, cross_dataset,
                os.path.join('predictions', lang, 'barista',
                             '{0}-4cls.txt'.format(args.dataset)))

            print('acc: {0:.3f}'.format(acc))
            print('f1:  {0:.3f}'.format(f1))
예제 #9
0
            30, 60
    ]:
        clf = LinearSVC(C=c)
        clf.fit(dataset._Xtrain, dataset._ytrain)
        pred = clf.predict(dataset._Xdev)
        f1 = per_class_f1(dataset._ydev, pred).mean()
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
    return best_c, best_f1


if __name__ == '__main__':

    embeddingdir = '/home/jeremy/NS/Keep/Temp/Exps/EMBEDDINGS'
    amazon_vecs = WordVecs(
        os.path.join(embeddingdir, 'SubjQuant/amazon-sg-300.txt'))
    twitter_vecs = WordVecs(
        os.path.join(embeddingdir, 'twitter_embeddings.txt'))

    pdataset = ProjectionDataset('lexicons/general_vocab.txt', amazon_vecs,
                                 twitter_vecs)

    books = Book_Dataset(amazon_vecs, rep=ave_vecs, one_hot=False, binary=True)
    dvd = DVD_Dataset(amazon_vecs, rep=ave_vecs, one_hot=False, binary=True)
    electronics = Electronics_Dataset(amazon_vecs,
                                      rep=ave_vecs,
                                      binary=True,
                                      one_hot=False)
    kitchen = Kitchen_Dataset(amazon_vecs,
                              rep=ave_vecs,
                              binary=True,
예제 #10
0
def run_model_on_datasets_with_embeddings(embedding_file, file_type):
    """Train and evaluate the transformer model on several datasets using
    the given pre-trained embeddings.

    embedding_file: path to the word embeddings file
    file_type:      word2vec, glove

    Returns (datasetNames, results, std_devs, dim) where results/std_devs
    hold per-dataset [acc, precision, recall, f1] averaged over
    hp.run_exps_amount runs, with an appended 'overall' row, and dim is
    the embedding dimensionality.
    """
    print('importing word embedding vectors...')
    vecs = WordVecs(embedding_file, file_type)  # load the word2vec dictionary.
    dim = vecs.vector_size      # dimensionality of the word embeddings

    # For collecting results to return
    results = []
    std_devs = []


    # Datasets to run; commented-out entries are disabled on purpose.
    datasetNames = [
                # 'sst_fine',
                # 'sst_binary',
                # 'opener',
                # 'sentube_auto',
                'sentube_tablets',
                'semeval',
                ]

    # train & test the model on every dataset above
    for datasetName in datasetNames:
        # dataset_load_start = datetime.now()
        # NOTE(review): if datasetName matches none of the branches below,
        # `dataset` keeps its previous value — safe only while datasetNames
        # stays restricted to the names handled here.
        if datasetName == 'sst_fine':
            dataset = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                            None,
                                            one_hot=True,
                                            binary=False,
                                            rep=words)
        elif datasetName == 'sst_binary':
            dataset = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                            None,
                                            one_hot=True,
                                            binary=True,
                                            rep=words)
        elif datasetName == 'opener':
            dataset = General_Dataset('datasets/opener',
                                             None,
                                             one_hot=True,
                                             rep=words)
        
        elif datasetName == 'sentube_auto':
            dataset = General_Dataset('datasets/SenTube/auto',
                                                   None, rep=words,
                                                   binary=True,
                                                   one_hot=True)
        elif datasetName == 'sentube_tablets':
            dataset = General_Dataset('datasets/SenTube/tablets',
                                                      None, rep=words,
                                                      binary=True,
                                                      one_hot=True)
        elif datasetName == 'semeval':
            dataset = Semeval_Dataset('datasets/semeval',
                                                        None, rep=words,
                                                        one_hot=True)


        print('Loading & Testing on {}:'.format(datasetName))

        # if hp.lowercase_all_sentences:
        #     for sent in dataset._Xtrain:
        #         for word in sent:
        #             if word != word.lower():
        #                 print("Word has an uppercase character:", word.decode('utf-8'))


        # find out the max length of sentences in the dataset and construct the vocab frequency dict.
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
            if len(sent) > max_length:
                max_length = len(sent)
            for w in sent:
                if w not in vocab:
                    vocab[w] = 1
                else:
                    vocab[w] += 1

        # create a dict of words that are in our word2vec embeddings
        # wordvecs: String -> embedding_vec
        wordvecs = {}
        for w in vecs._w2idx.keys():
            if w in vocab:
                wordvecs[w] = vecs[w]

        # Assign random w2v vectors to the unknown words. These are random uniformly distrubuted vectors of size dim.
        add_unknown_words(vecs, wordvecs, vocab, min_df=1, dim=dim)
        W, word_idx_map = get_W(wordvecs, dim=dim)  # Get the w2v index map for out final vocab

        print('Converting dataset to being right padded...')
        dataset = convert_dataset(dataset, word_idx_map, datasetName, max_length)
        output_dim = dataset._ytest.shape[1]

        # Test model hp.run_exps_amount times and get averages and std dev.
        dataset_results = []
        for i in range(1, hp.run_exps_amount + 1):  
            tf.reset_default_graph()  # Clears the current loaded tensorflow graph.

            # TF1-style embedding table: a non-trainable variable fed from a
            # placeholder holding the pre-trained matrix W.
            # NOTE(review): w2i.assign(...) only builds an assign op; it is
            # never executed here — presumably the session run happens inside
            # createAndTrainTransformer. Also, the variable and the
            # placeholder share the name "embedding_table" — confirm this is
            # intentional.
            w2i = tf.Variable(tf.constant(0.0, shape=[W.shape[0], W.shape[1]]),
                trainable=False, name="embedding_table")
            wordIndxToVec_tensor = tf.placeholder(tf.float32, [W.shape[0], W.shape[1]], name="embedding_table")     # [vobab_size x word_embedding_dim]
            w2i.assign(wordIndxToVec_tensor)

            start_time = datetime.now()     # Print time for logging.
            clf, best_mm_val, best_mm_test = createAndTrainTransformer(dataset, W, wordIndxToVec_tensor, output_dim, datasetName, max_length)
            print("Finished run #", i, "Time taken: " + str(datetime.now() - start_time))

            mm = best_mm_test
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])
            if hp.run_exps_amount == 1:
                # Deliberate quirk: with a single run the result is appended
                # twice so mean == the single run and std dev == 0.
                acc, precision, recall, micro_f1 = mm.get_scores()
                dataset_results.append([acc, precision, recall, micro_f1])  # add twice so the average is the same... avoid running multiple runs this way.

            if hp.run_exps_amount != 1:   # Print the metrics for this run, unless we're running experiment only once.
                # The duplicate append mirrors the single-run trick above:
                # mean equals this run's scores and std dev is zero.
                this_run_result = []
                this_run_result.append([acc, precision, recall, micro_f1])
                this_run_result.append([acc, precision, recall, micro_f1])
                this_run_result = np.array(this_run_result)
                this_run_ave_results = this_run_result.mean(axis=0) 
                this_run_std_results = this_run_result.std(axis=0)
                printMetrics(this_run_ave_results, this_run_std_results, datasetName)

        # Get the average and std deviation over 10 runs with 10 random seeds    
        dataset_results = np.array(dataset_results)
        ave_results = dataset_results.mean(axis=0)
        std_results = dataset_results.std(axis=0)
        printMetrics(ave_results, std_results, datasetName)
        
        results.append(ave_results)
        std_devs.append(std_results)

    # Append an 'overall' row: the mean across datasets.
    results.append(list(np.array(results).mean(axis=0)))
    std_devs.append(list(np.array(std_devs).mean(axis=0)))
    datasetNames.append('overall')
    
    return datasetNames, results, std_devs, dim
예제 #11
0
def train_model_with_different_params(params):
    """Grid-search hyperparameters for one model/target-language pair.

    Trains params.model (rnn_blse / rnn_attn_blse) for every combination
    of proj_losses x alphas x learning_rates x batch_sizes, keeps only
    the checkpoint and results file of the best dev-f1 configuration
    (deleting the rest), and logs progress to a best-params report file.
    """
    if params.model not in ['rnn_attn_blse', 'rnn_blse']:
        print("no such model: {}".format(params.model))
        exit(1)

    # If there's no savedir, create it
    os.makedirs(params.savedir, exist_ok=True)

    # Output dimensionality and a short tag used in file names.
    if params.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'

    weight_dir = "{}/{}/{}-{}-{}".format(params.savedir, params.model,
                                         params.dataset, params.target_lang, b)
    best_params_file_name = "results/best_params_report_{}_{}_{}.txt".format(
        params.model, params.target_lang, b)
    best_params_file = open(best_params_file_name, "w+")

    best_params_file.write("Start parameter search:\n")
    best_params_file.write("Model: {}\n".format(params.model))
    best_params_file.write("is_binary: {}\n".format(params.binary))
    best_params_file.write("target_lang: {}\n".format(params.target_lang))

    # Search state: best score so far, its hyperparameters, and the
    # files belonging to the previous best (removed when beaten).
    best_f1 = 0.0
    best_params = None
    old_file_name = None
    old_results_file_name = None
    rest_of_scores = []

    print('importing datasets')

    dataset = General_Dataset(os.path.join('datasets', params.source_lang,
                                           params.dataset),
                              None,
                              binary=params.binary,
                              rep=words,
                              one_hot=False)

    cross_dataset = General_Dataset(os.path.join('datasets',
                                                 params.target_lang,
                                                 params.dataset),
                                    None,
                                    binary=params.binary,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    trg_vecs_file_path = "embeddings/original/sg-300-{}.txt".format(
        params.target_lang)
    print("trg_vecs_file_path: {}".format(trg_vecs_file_path))
    src_vecs = WordVecs(params.src_vecs)
    trg_vecs = WordVecs(trg_vecs_file_path)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(params.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(params.target_lang,
                                                    trg_vecs)

    # Import translation pairs
    translation_file_path = "lexicons/{}/en-{}.txt".format(
        params.trans, params.target_lang)
    print("translation_file_path: {}".format(translation_file_path))
    pdataset = ProjectionDataset(translation_file_path, src_vecs, trg_vecs)

    # Full grid search over the four hyperparameter lists.
    for proj_loss in params.proj_losses:
        for alpha in params.alphas:
            for learning_rate in params.learning_rates:
                for batch_size in params.batch_sizes:
                    best_model_file_path, acc, prec, rec, f1, results_file_name = train_model(
                        params.model, dataset, cross_dataset, src_vecs,
                        trg_vecs, synonyms1, synonyms2, neg, cross_syn1,
                        cross_syn2, cross_neg, pdataset, weight_dir, proj_loss,
                        alpha, learning_rate, batch_size, output_dim, b,
                        params)

                    # Strict improvement only: a run tying best_f1 is
                    # discarded. NOTE(review): if every run scores f1 == 0.0,
                    # best_params stays None and rest_of_scores stays empty,
                    # so the summary prints below would crash — confirm this
                    # cannot happen in practice.
                    if f1 > best_f1:
                        print()
                        print("Found new set of best hyper params:")
                        print("f1:      {0:.3f}".format(f1))
                        print("acc:      {0:.3f}".format(acc))
                        print("prec:      {0:.3f}".format(prec))
                        print("rec:      {0:.3f}".format(rec))
                        print('model:     {0}'.format(params.model))
                        print('is_binary:     {0}'.format(params.binary))
                        print('epochs:      {0}'.format(params.epochs))
                        print('proj_loss:      {0}'.format(proj_loss))
                        print('alpha (projection loss coef):      {0}'.format(
                            alpha))
                        print('batch size:  {0}'.format(batch_size))
                        print('learning rate:  {0}'.format(learning_rate))
                        print('weight_dir:  {0}'.format(weight_dir))
                        print('best_model_file_path:  {0}'.format(
                            best_model_file_path))
                        print()

                        best_params_file.write("\n")
                        best_params_file.write(
                            "Found new set of best hyper params:\n")
                        best_params_file.write(
                            "f1       {0:.3f}:\n".format(f1))
                        best_params_file.write(
                            "acc       {0:.3f}:\n".format(acc))
                        best_params_file.write(
                            "prec       {0:.3f}:\n".format(prec))
                        best_params_file.write(
                            "rec       {0:.3f}:\n".format(rec))
                        best_params_file.write('model:     {0}\n'.format(
                            params.model))
                        best_params_file.write('is_binary:     {0}\n'.format(
                            params.binary))
                        best_params_file.write('epochs:      {0}\n'.format(
                            params.epochs))
                        best_params_file.write(
                            'proj_loss:      {0}\n'.format(proj_loss))
                        best_params_file.write(
                            "alpha (projection loss coef):      {0}\n".format(
                                alpha))
                        best_params_file.write(
                            'batch size:  {0}\n'.format(batch_size))
                        best_params_file.write(
                            'learning:  {0}\n'.format(learning_rate))
                        best_params_file.write(
                            'weight_dir:  {0}\n'.format(weight_dir))
                        best_params_file.write(
                            'best_model_file_path:  {0}\n'.format(
                                best_model_file_path))

                        # Drop the files of the previous best configuration.
                        # NOTE(review): prefer `is not None` over `!= None`.
                        if old_file_name != None:
                            os.remove(old_file_name)

                        if old_results_file_name != None:
                            os.remove(old_results_file_name)

                        # NOTE(review): params.model is the model *name*
                        # string, so this saves the string to
                        # best_model_file_path (overwriting whatever
                        # train_model wrote there) — confirm the actual
                        # weights are persisted elsewhere.
                        torch.save(params.model, best_model_file_path)
                        old_file_name = best_model_file_path
                        old_results_file_name = results_file_name
                        best_f1 = f1
                        rest_of_scores = [acc, prec, rec]

                        best_params = [
                            proj_loss, alpha, learning_rate, batch_size
                        ]

                    else:
                        # Not an improvement: discard this run's artifacts.
                        os.remove(results_file_name)
                        os.remove(best_model_file_path)

    # Final summary to stdout and to the report file.
    print("")
    print("Done parameters search")
    print("best f1: {0:.3f}".format(best_f1))
    print("its acc: {0:.3f}".format(rest_of_scores[0]))
    print("its prec: {0:.3f}".format(rest_of_scores[1]))
    print("its rec: {0:.3f}".format(rest_of_scores[2]))
    print("best_params:")
    print('model:     {0}'.format(params.model))
    print('is_binary:     {0}'.format(params.binary))
    print('proj_loss:     {0}'.format(best_params[0]))
    print('alpha (projection loss coef):      {0}'.format(best_params[1]))
    print('learning rate:  {0}'.format(best_params[2]))
    print('batch size:  {0}'.format(best_params[3]))
    print("")

    best_params_file.write("\n")
    best_params_file.write("Done parameters search\n")
    best_params_file.write("best f1: {0:.3f}\n".format(best_f1))
    best_params_file.write("its acc: {0:.3f}\n".format(rest_of_scores[0]))
    best_params_file.write("its prec: {0:.3f}\n".format(rest_of_scores[1]))
    best_params_file.write("its rec: {0:.3f}\n".format(rest_of_scores[2]))
    best_params_file.write('model:     {0}\n'.format(params.model))
    best_params_file.write('is_binary:     {0}\n'.format(params.binary))
    best_params_file.write('proj_loss:      {0}\n'.format(best_params[0]))
    best_params_file.write("alpha (projection loss coef):      {0}\n".format(
        best_params[1]))
    best_params_file.write('learning:  {0}\n'.format(best_params[2]))
    best_params_file.write('batch size:  {0}\n'.format(best_params[3]))
    best_params_file.close()
예제 #12
0
def test_embeddings(embedding_file, file_type):
    """
    Tang et al. (2014) embeddings and classification approach
    on a number of benchmark datasets.

    Returns (names, results, dim): per-dataset
    [acc, precision, recall, f1] rows plus an appended 'overall'
    average row, and the embedding dimensionality.
    """

    print('importing vectors...')
    vecs = WordVecs(embedding_file, file_type)
    dim = vecs.vector_size

    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         None,
                                         one_hot=False,
                                         binary=False,
                                         rep=words)

    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           None,
                                           one_hot=False,
                                           binary=True,
                                           rep=words)

    opener_dataset = General_Dataset('datasets/opener',
                                     vecs,
                                     one_hot=False,
                                     rep=words)

    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           vecs._w2idx,
                                           rep=words,
                                           binary=True,
                                           one_hot=False)

    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              vecs._w2idx,
                                              rep=words,
                                              binary=True,
                                              one_hot=False)

    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                      vecs._w2idx,
                                      rep=words,
                                      one_hot=False)

    datasets = [
        st_fine, st_binary, opener_dataset, sentube_auto_dataset,
        sentube_tablets_dataset, semeval_dataset
    ]

    names = [
        'sst_fine', 'sst_binary', 'opener', 'sentube_auto', 'sentube_tablets',
        'semeval'
    ]

    # Collect results here
    results = []

    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))

        # Convert each tokenized sentence into a single feature vector.
        Xtrain = np.array(
            [conv_tweet(' '.join(t), vecs) for t in dataset._Xtrain])
        Xtest = np.array(
            [conv_tweet(' '.join(t), vecs) for t in dataset._Xtest])
        Xdev = np.array([conv_tweet(' '.join(t), vecs) for t in dataset._Xdev])

        # get best parameters on dev set
        best_C, best_rate = get_best_C(Xtrain, dataset._ytrain, Xdev,
                                       dataset._ydev)

        clf = LogisticRegression(C=best_C)
        # FIX: fit() was assigned to an unused local `h`; call it directly.
        clf.fit(Xtrain, dataset._ytrain)
        pred = clf.predict(Xtest)
        # CONSISTENCY: build the path with os.path.join, as elsewhere in
        # this file, instead of string concatenation.
        predictions_file = os.path.join('predictions', 'joint', name,
                                        'pred.txt')
        print_prediction(predictions_file, pred)

        # Binary averaging for two-label tasks, micro otherwise.
        labels = sorted(set(dataset._ytrain))
        if len(labels) == 2:
            average = 'binary'
        else:
            average = 'micro'
        mm = MyMetrics(dataset._ytest,
                       pred,
                       one_hot=False,
                       labels=labels,
                       average=average)
        acc, precision, recall, f1 = mm.get_scores()
        results.append([acc, precision, recall, f1])

    # Append the mean over all datasets as an 'overall' row.
    results.append(list(np.array(results).mean(axis=0)))
    names.append('overall')

    return names, results, dim
예제 #13
0
def main():
    """Train a BLE (bilingual embeddings) sentiment model and evaluate it
    cross-lingually.

    Parses CLI arguments, loads datasets, embeddings, and translation
    pairs, trains the model while checkpointing weights, reloads the
    best dev-f1 weights, then plots synonym/antonym movement and writes
    test-set predictions.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', help="target language: es, ca, eu", default='es')
    parser.add_argument('-bi',
                        help="binary or 4-class",
                        default=False,
                        type=str2bool)
    parser.add_argument('-epoch', default=300, type=int)
    parser.add_argument('-alpha', default=.5, type=float)
    parser.add_argument('-batch_size', default=200, type=int)
    parser.add_argument('-src_vecs', default='embeddings/original/google.txt')
    parser.add_argument('-trg_vecs',
                        default='embeddings/original/sg-300-es.txt')
    parser.add_argument(
        '-trans',
        help='translation pairs',
        default=
        'lexicons/bingliu_en_es.one-2-one_AND_Negators_Intensifiers_Diminishers.txt'
    )
    parser.add_argument('-dataset', default='opener')
    args = parser.parse_args()

    # import datasets (representation will depend on final classifier)
    print('importing datasets')

    dataset = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                              None,
                              binary=args.bi,
                              rep=words,
                              one_hot=False)

    cross_dataset = General_Dataset(os.path.join('datasets', args.l,
                                                 args.dataset),
                                    None,
                                    binary=args.bi,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(args.trg_vecs)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant('en', src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.l, trg_vecs)

    # Import translation pairs
    pdataset = ProjectionDataset(args.trans, src_vecs, trg_vecs)

    # Initialize classifier.
    # DRY FIX: the original duplicated the whole BLE(...) call in an
    # if/else that differed only in the final output-dim argument.
    output_dim = 2 if args.bi else 4
    ble = BLE(src_vecs, trg_vecs, pdataset, dataset, cross_dataset,
              synonyms1, synonyms2, neg, cross_syn1, cross_syn2, cross_neg,
              output_dim)

    # train model
    print('training model')
    print('Parameters:')
    print('lang:       {0}'.format(args.l))
    print('binary:     {0}'.format(args.bi))
    print('epoch:      {0}'.format(args.epoch))
    print('alpha:      {0}'.format(args.alpha))
    print('batchsize:  {0}'.format(args.batch_size))
    print('src vecs:   {0}'.format(args.src_vecs))
    print('trg_vecs:   {0}'.format(args.trg_vecs))
    print('trans dict: {0}'.format(args.trans))
    print('dataset:    {0}'.format(args.dataset))
    # Short tag used in checkpoint-directory and output filenames.
    if args.bi:
        b = 'bi'
    else:
        b = '4cls'

    weight_dir = os.path.join('models',
                              '{0}-{1}-{2}'.format(args.dataset, args.l, b))
    ble.fit(pdataset._Xtrain,
            pdataset._ytrain,
            dataset._Xtrain,
            dataset._ytrain,
            weight_dir=weight_dir,
            alpha=args.alpha,
            epochs=args.epoch,
            batch_size=args.batch_size)

    # get the best weights
    best_f1, best_params, best_weights = get_best_run(weight_dir)
    epochs, batch_size, alpha = best_params
    ble.load_weights(best_weights)

    # Evaluate: the binary branch uses the metric default averaging,
    # the 4-class branch macro averaging; filenames carry the tag b.
    if args.bi:
        ble.plot(outfile=os.path.join(
            'figures', 'syn-ant', args.l, 'ble',
            '{0}-bi-alpha{1}-epoch{2}-batch{3}.pdf'.format(
                args.dataset, alpha, epochs, batch_size)))
        ble.evaluate(cross_dataset._Xtest, cross_dataset._ytest, src=False)
        ble.evaluate(cross_dataset._Xtest,
                     cross_dataset._ytest,
                     src=False,
                     outfile=os.path.join(
                         'predictions', args.l, 'ble',
                         '{0}-bi-alpha{1}-epoch{2}-batch{3}.txt'.format(
                             args.dataset, alpha, epochs, batch_size)))
    else:
        ble.plot(outfile=os.path.join(
            'figures', 'syn-ant', args.l, 'ble',
            '{0}-4cls-alpha{1}-epoch{2}-batch{3}.pdf'.format(
                args.dataset, alpha, epochs, batch_size)))
        ble.evaluate(cross_dataset._Xtest,
                     cross_dataset._ytest,
                     average='macro',
                     src=False)
        ble.evaluate(cross_dataset._Xtest,
                     cross_dataset._ytest,
                     average='macro',
                     src=False,
                     outfile=os.path.join(
                         'predictions', args.l, 'ble',
                         '{0}-4cls-alpha{1}-epoch{2}-batch{3}.txt'.format(
                             args.dataset, alpha, epochs, batch_size)))
예제 #14
0
def test_embeddings(file, file_type):
    """Evaluate the word embeddings in ``file`` by training CNN sentiment
    classifiers on a suite of benchmark datasets.

    For every dataset, hyperparameters are tuned on the dev set, the CNN is
    trained 5 times, and the test scores of the runs are averaged.

    Parameters
    ----------
    file : str
        Path to the embedding file (handed to ``WordVecs``).
    file_type :
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(names, results, std_devs, dim)`` where ``results[i]`` /
        ``std_devs[i]`` hold the mean / standard deviation of
        ``[acc, precision, recall, f1]`` over the 5 runs on dataset
        ``names[i]``, and ``dim`` is the embedding dimensionality.
    """
    import numpy as np  # local import keeps the block self-contained

    print('Importing vecs...')
    vecs = WordVecs(file)

    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         None,
                                         one_hot=True,
                                         binary=False,
                                         rep=words)

    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           None,
                                           one_hot=True,
                                           binary=True,
                                           rep=words)

    opener_dataset = General_Dataset('datasets/opener',
                                     vecs,
                                     one_hot=True,
                                     rep=word_reps)

    twitter_dataset = Semeval_Dataset('datasets/twitter',
                                      vecs._w2idx,
                                      rep=word_reps,
                                      one_hot=True)

    # NOTE(review): some datasets receive `vecs` and others `vecs._w2idx` as
    # the second argument — confirm which signature each class expects.
    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           vecs._w2idx,
                                           rep=word_reps,
                                           binary=True,
                                           one_hot=True)

    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              vecs._w2idx,
                                              rep=word_reps,
                                              binary=True,
                                              one_hot=True)

    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                      vecs,
                                      rep=words,
                                      one_hot=True)

    datasets = [
        st_fine, st_binary, opener_dataset, sentube_auto_dataset,
        sentube_tablets_dataset, semeval_dataset
    ]

    names = [
        'sst_fine', 'sst_binary', 'opener', 'sentube_auto', 'sentube_tablets',
        'semeval'
    ]

    dim = vecs.vector_size

    # BUGFIX: these were (re-)initialized inside the per-dataset loop and
    # nothing was ever appended to them, so the function always returned
    # empty result lists.
    results = []
    std_devs = []

    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))

        # Longest sentence (for padding) and the dataset's word counts.
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(
                dataset._Xtest):
            max_length = max(max_length, len(sent))
            for w in sent:
                vocab[w] = vocab.get(w, 0) + 1

        # Keep only the pretrained vectors that occur in this dataset, then
        # add random vectors for out-of-vocabulary words.
        wordvecs = {w: vecs[w] for w in vecs._w2idx.keys() if w in vocab}
        add_unknown_words(wordvecs, vocab, min_df=1, k=dim)
        W, word_idx_map = get_W(wordvecs, k=dim)

        print('Converting and Padding dataset...')
        dataset = convert_dataset(dataset, word_idx_map, max_length)
        output_dim = dataset._ytest.shape[1]

        # Get best Dev params
        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_cnn.dev.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, max_length, dataset._Xtrain,
            dataset._ytrain, dataset._Xdev, dataset._ydev, W)

        # BUGFIX: collect the scores of all 5 runs here (previously the list
        # was reset on every run, discarding earlier scores).
        dataset_results = []

        for i in range(5):
            checkpoint = ModelCheckpoint(
                'models/cnn/' + name + '/run' + str(i + 1) +
                '/weights.{epoch:03d}-{val_acc:.4f}.hdf5',
                monitor='val_acc',
                verbose=1,
                save_best_only=True,
                mode='auto')
            clf = create_cnn(W,
                             max_length,
                             dim=best_dim,
                             dropout=best_dropout,
                             output_dim=output_dim)

            clf.fit(dataset._Xtrain,
                    dataset._ytrain,
                    validation_data=[dataset._Xdev, dataset._ydev],
                    epochs=best_epoch,
                    verbose=1,
                    callbacks=[checkpoint])

            # Reload the checkpoint with the highest validation accuracy,
            # parsed out of the saved weight file name.
            base_dir = 'models/cnn/' + name + '/run' + str(i + 1)
            best_val = 0
            best_weights = ''
            for weight in os.listdir(base_dir):
                val_acc = float(
                    re.sub('.hdf5', '', re.sub('weights.[0-9]*-', '',
                                               weight)))
                if val_acc > best_val:
                    best_val = val_acc
                    best_weights = weight

            clf = load_model(os.path.join(base_dir, best_weights))

            pred = clf.predict(dataset._Xtest, verbose=1)
            classes = clf.predict_classes(dataset._Xtest, verbose=1)
            prediction_file = 'predictions/cnn/' + name + '/run' + str(
                i + 1) + '/pred.txt'
            w2idx_file = 'predictions/cnn/' + name + '/w2idx.pkl'
            print_prediction(prediction_file, classes)
            with open(w2idx_file, 'wb') as out:
                pickle.dump(word_idx_map, out)

            # 'binary' average for 2-class data, 'micro' otherwise.
            labels = sorted(set(dataset._ytrain.argmax(1)))
            average = 'binary' if len(labels) == 2 else 'micro'
            mm = MyMetrics(dataset._ytest,
                           pred,
                           labels=labels,
                           average=average)
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])

        # BUGFIX: aggregate the 5 runs into the returned lists.
        run_scores = np.array(dataset_results)
        results.append(run_scores.mean(axis=0))
        std_devs.append(run_scores.std(axis=0))

    return names, results, std_devs, dim
예제 #15
0
    # NOTE(review): truncated fragment — the enclosing function header (its
    # `def` line) is not visible here, so this block is incomplete as shown.
    parser = argparse.ArgumentParser()
    parser.add_argument("--NUM_LAYERS", "-nl", default=1, type=int)
    parser.add_argument("--HIDDEN_DIM", "-hd", default=100, type=int)
    parser.add_argument("--BATCH_SIZE", "-bs", default=50, type=int)
    parser.add_argument("--EMBEDDING_DIM", "-ed", default=300, type=int)
    parser.add_argument("--TRAIN_EMBEDDINGS", "-te", action="store_true")
    parser.add_argument("--AUXILIARY_TASK", "-aux", default="negation_scope")
    parser.add_argument("--EMBEDDINGS",
                        "-emb",
                        default="../../embeddings/google.txt")

    args = parser.parse_args()
    print(args)

    # Get embeddings (CHANGE TO GLOVE OR FASTTEXT EMBEDDINGS)
    embeddings = WordVecs(args.EMBEDDINGS)
    w2idx = embeddings._w2idx

    # Create shared vocabulary for tasks
    vocab = Vocab(train=True)

    # Update with word2idx from pretrained embeddings so we don't lose them
    # making sure to change them by one to avoid overwriting the UNK token
    # at index 0
    with_unk = {}
    for word, idx in embeddings._w2idx.items():
        with_unk[word] = idx + 1
    vocab.update(with_unk)

    # Import datasets
    # This will update vocab with words not found in embeddings
    # NOTE(review): the lines below do not follow from the lines above — a
    # second, unrelated fragment starts here in the middle of a
    # `parser.add_argument(...)` call (only its `default=` keyword argument
    # survives). The two pieces appear to have been fused during extraction;
    # recover the missing text from the original source.
                        default="../../embeddings/BLSE/google.txt")
    parser.add_argument('-te',
                        '--trg_embedding',
                        default="../../embeddings/BLSE/sg-300-es.txt")
    parser.add_argument('-sd',
                        '--src_dataset',
                        default="datasets/training/en/raw")
    parser.add_argument('-td',
                        '--trg_dataset',
                        default="datasets/training/es/raw")

    args = parser.parse_args()

    # Import monolingual vectors
    # Both embedding spaces are mean-centered and normalized before the
    # projection is learned below.
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_embedding)
    src_vecs.mean_center()
    src_vecs.normalize()
    trg_vecs = WordVecs(args.trg_embedding)
    trg_vecs.mean_center()
    trg_vecs.normalize()

    # Setup projection dataset
    # NOTE(review): `args.lang` is not declared by any visible add_argument
    # call — presumably it is defined in the missing part of this fragment;
    # verify against the original source.
    trans = 'lexicons/bingliu_en_{0}.one-2-one.txt'.format(args.lang)
    pdataset = ProjectionDataset(trans, src_vecs, trg_vecs)

    # learn the translation matrix W
    print('Projecting src embeddings to trg space...')
    W = get_projection_matrix(pdataset, src_vecs, trg_vecs)
    print('W done')
예제 #17
0
def main():
    """Cross-lingual sentiment evaluation with a learned projection.

    For each target language (es, ca, eu): load the English and target
    embeddings, learn a translation matrix from a bilingual lexicon, project
    the English vectors into the target space, then train a linear SVM on
    English averaged-vector data and evaluate it on the target-language test
    set (binary and/or 4-class, per the -bi flag).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-src_vecs',
        default='embeddings/original/google.txt',
        help=" source language vectors (default: GoogleNewsVecs )")
    parser.add_argument(
        '-trg_vecs',
        default='embeddings/original/sg-300-{0}.txt',
        help=" target language vectors (default: SGNS on Wikipedia)")
    parser.add_argument(
        '-trans',
        help=
        'translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
        default='lexicons/bingliu/en-{0}.txt')
    parser.add_argument('-dataset',
                        default='opener_sents',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help=
        'List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False],
        nargs='+',
        type=str2bool)
    args = parser.parse_args()

    def _evaluate(train, test, outfile, header):
        # Tune C on the target data, fit on English, dump predictions, and
        # report accuracy plus macro F1 on the target test set.
        c, _ = get_best_C(train, test)
        svm = LinearSVC(C=c)
        svm.fit(train._Xtrain, train._ytrain)
        f1 = macro_f1(test._ytest, svm.predict(test._Xtest))
        print_prediction(svm, test, outfile)
        print(header)
        print('Acc: {0:.3f}'.format(svm.score(test._Xtest, test._ytest)))
        print('Macro F1: {0:.3f}'.format(f1))
        print()

    # Loop over the three target languages
    for lang in ['es', 'ca', 'eu']:
        print('################ {0} ##############'.format(lang))

        # Load and preprocess both monolingual embedding spaces.
        print('importing word embeddings')
        en_vecs = WordVecs(args.src_vecs)
        en_vecs.mean_center()
        en_vecs.normalize()

        foreign_vecs = WordVecs(args.trg_vecs.format(lang))
        foreign_vecs.mean_center()
        foreign_vecs.normalize()

        # Learn the translation matrix from the bilingual lexicon and map
        # the English matrix into the shared (target) space.
        lexicon = ProjectionDataset(args.trans.format(lang), en_vecs,
                                    foreign_vecs)
        W = get_W(lexicon, en_vecs, foreign_vecs)
        en_vecs._matrix = np.dot(en_vecs._matrix, W)

        # Averaged-vector datasets for both classification setups.
        print('importing datasets')
        en_path = os.path.join('datasets', 'en', args.dataset)
        trg_path = os.path.join('datasets', lang, args.dataset)
        common = dict(rep=ave_vecs, one_hot=False, lowercase=False)
        bin_train = General_Dataset(en_path, en_vecs, binary=True, **common)
        bin_test = General_Dataset(trg_path, foreign_vecs, binary=True,
                                   **common)
        fine_train = General_Dataset(en_path, en_vecs, binary=False, **common)
        fine_test = General_Dataset(trg_path, foreign_vecs, binary=False,
                                    **common)

        if True in args.bi:
            _evaluate(
                bin_train, bin_test,
                os.path.join('predictions', lang, 'artetxe',
                             '{0}-bi.txt'.format(args.dataset)), '-binary-')

        if False in args.bi:
            _evaluate(
                fine_train, fine_test,
                os.path.join('predictions', lang, 'artetxe',
                             '{0}-4cls.txt'.format(args.dataset)), '-fine-')
예제 #18
0
def main():
    """Cross-lingual sentiment evaluation with pretrained MUSE embeddings.

    For each target language (ca, eu, es): load the aligned English and
    target MUSE vectors, train a linear SVM on English averaged-vector data,
    and evaluate it on the target-language test set (binary and/or 4-class,
    per the -bi flag).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-vec_dir',
                        default='../deployment/MUSE/',
                        help=" directory that hold MUSE vectors")
    parser.add_argument('-dataset',
                        default='opener',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help=
        'List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False],
        nargs='+',
        type=str2bool)
    args = parser.parse_args()

    def _report(train, test, outfile, header):
        # Tune C on the target data, fit on English, dump predictions, and
        # report accuracy plus macro F1 on the target test set.
        c, _ = get_best_C(train, test)
        svm = LinearSVC(C=c)
        svm.fit(train._Xtrain, train._ytrain)
        f1 = macro_f1(test._ytest, svm.predict(test._Xtest))
        print_prediction(svm, test, outfile)
        print(header)
        print('Acc: {0:.3f}'.format(svm.score(test._Xtest, test._ytest)))
        print('Macro F1: {0:.3f}'.format(f1))
        print()

    # Loop over the three target languages
    for lang in ['ca', 'eu', 'es']:
        print('################ {0} ##############'.format(lang))

        # Load the language-pair-aligned MUSE embedding spaces.
        print('importing word embeddings')
        pair_dir = os.path.join(args.vec_dir, 'en-{0}'.format(lang))
        en_vecs = WordVecs(os.path.join(pair_dir, 'muse-en.txt'))
        foreign_vecs = WordVecs(
            os.path.join(pair_dir, 'muse-{0}.txt'.format(lang)))

        # Averaged-vector datasets for both classification setups.
        print('importing datasets')
        en_path = os.path.join('datasets', 'en', args.dataset)
        trg_path = os.path.join('datasets', lang, args.dataset)
        common = dict(rep=ave_vecs, one_hot=False, lowercase=False)
        bin_train = General_Dataset(en_path, en_vecs, binary=True, **common)
        bin_test = General_Dataset(trg_path, foreign_vecs, binary=True,
                                   **common)
        fine_train = General_Dataset(en_path, en_vecs, binary=False, **common)
        fine_test = General_Dataset(trg_path, foreign_vecs, binary=False,
                                    **common)

        if True in args.bi:
            _report(
                bin_train, bin_test,
                os.path.join('predictions', lang, 'muse',
                             '{0}-bi.txt'.format(args.dataset)), '-binary-')

        if False in args.bi:
            _report(
                fine_train, fine_test,
                os.path.join('predictions', lang, 'muse',
                             '{0}-4cls.txt'.format(args.dataset)), '-fine-')