    src_vecs.normalize()
    trg_vecs = WordVecs(args.trg_embedding)
    trg_vecs.mean_center()
    trg_vecs.normalize()

    # Setup projection dataset
    trans = 'lexicons/bingliu_en_{0}.one-2-one.txt'.format(args.lang)
    pdataset = ProjectionDataset(trans, src_vecs, trg_vecs)

    # learn the translation matrix W
    print('Projecting src embeddings to trg space...')
    W = get_projection_matrix(pdataset, src_vecs, trg_vecs)
    print('W done')

    # project the source matrix to the new shared space
    src_vecs._matrix = np.dot(src_vecs._matrix, W)
    print('src_vecs done')

    # open datasets
    src_dataset = General_Dataset(args.src_dataset,
                                  None,
                                  rep=word_reps,
                                  binary=args.binary)
    print('src_dataset done')
    trg_dataset = General_Dataset(args.trg_dataset,
                                  None,
                                  rep=word_reps,
                                  binary=args.binary)
    print('trg_dataset done')

    # get joint vocabulary and maximum sentence length
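
The helper get_projection_matrix (and the equivalent get_W in the next example) learns a linear map from the source to the target embedding space from the translation pairs. A minimal sketch, assuming the projection dataset exposes the paired words as pdataset._Xtrain / pdataset._ytrain and that WordVecs supports lookup by word; the original may instead use an orthogonal (Procrustes) solution:

import numpy as np

def get_projection_matrix_sketch(pdataset, src_vecs, trg_vecs):
    # Stack the embeddings of the translation pairs...
    X = np.array([src_vecs[w] for w in pdataset._Xtrain])
    Y = np.array([trg_vecs[w] for w in pdataset._ytrain])
    # ...and solve the least-squares mapping X @ W ~= Y.
    W, *_ = np.linalg.lstsq(X, Y, rcond=None)
    return W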
Example #2
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-src_vecs',
        default='embeddings/original/google.txt',
        help="source language vectors (default: GoogleNews vectors)")
    parser.add_argument(
        '-trg_vecs',
        default='embeddings/original/sg-300-{0}.txt',
        help="target language vectors (default: SGNS trained on Wikipedia)")
    parser.add_argument(
        '-trans',
        default='lexicons/bingliu/en-{0}.txt',
        help="translation pairs (default: Bing Liu sentiment lexicon translations)")
    parser.add_argument(
        '-dataset',
        default='opener_sents',
        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        default=[True, False],
        nargs='+',
        type=str2bool,
        help="list of booleans: True runs only the binary task, False runs only "
             "the 4-class task, and 'True False' runs both (default: [True, False])")
    args = parser.parse_args()

    # Loop over the three languages
    for lang in ['es', 'ca', 'eu']:
        print('################ {0} ##############'.format(lang))

        # Import monolingual vectors
        print('importing word embeddings')
        src_vecs = WordVecs(args.src_vecs)
        src_vecs.mean_center()
        src_vecs.normalize()

        trg_vecs = WordVecs(args.trg_vecs.format(lang))
        trg_vecs.mean_center()
        trg_vecs.normalize()

        # Setup projection dataset
        pdataset = ProjectionDataset(args.trans.format(lang), src_vecs,
                                     trg_vecs)

        # learn the translation matrix W
        W = get_W(pdataset, src_vecs, trg_vecs)

        # project the source matrix to the new shared space
        src_vecs._matrix = np.dot(src_vecs._matrix, W)

        # Import datasets (representation will depend on final classifier)
        print('importing datasets')
        binary_dataset = General_Dataset(os.path.join('datasets', 'en',
                                                      args.dataset),
                                         src_vecs,
                                         binary=True,
                                         rep=ave_vecs,
                                         one_hot=False,
                                         lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                               trg_vecs,
                                               binary=True,
                                               rep=ave_vecs,
                                               one_hot=False,
                                               lowercase=False)

        fine_dataset = General_Dataset(os.path.join('datasets', 'en',
                                                    args.dataset),
                                       src_vecs,
                                       binary=False,
                                       rep=ave_vecs,
                                       one_hot=False,
                                       lowercase=False)
        fine_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                             trg_vecs,
                                             binary=False,
                                             rep=ave_vecs,
                                             one_hot=False,
                                             lowercase=False)

        # Train linear SVM classifier
        if True in args.bi:
            best_c, best_f1 = get_best_C(binary_dataset, binary_cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(binary_dataset._Xtrain, binary_dataset._ytrain)
            cpred = clf.predict(binary_cross_dataset._Xtest)
            cf1 = macro_f1(binary_cross_dataset._ytest, cpred)
            print_prediction(
                clf, binary_cross_dataset,
                os.path.join('predictions', lang, 'artetxe',
                             '{0}-bi.txt'.format(args.dataset)))
            print('-binary-')
            print('Acc: {0:.3f}'.format(
                clf.score(binary_cross_dataset._Xtest,
                          binary_cross_dataset._ytest)))
            print('Macro F1: {0:.3f}'.format(cf1))
            print()

        if False in args.bi:
            best_c, best_f1 = get_best_C(fine_dataset, fine_cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(fine_dataset._Xtrain, fine_dataset._ytrain)
            cpred = clf.predict(fine_cross_dataset._Xtest)
            cf1 = macro_f1(fine_cross_dataset._ytest, cpred)
            print_prediction(
                clf, fine_cross_dataset,
                os.path.join('predictions', lang, 'artetxe',
                             '{0}-4cls.txt'.format(args.dataset)))
            print('-fine-')
            print('Acc: {0:.3f}'.format(
                clf.score(fine_cross_dataset._Xtest,
                          fine_cross_dataset._ytest)))
            print('Macro F1: {0:.3f}'.format(cf1))
            print()
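
Two helpers in this example are defined elsewhere in the repository: ave_vecs (the sentence representation passed as rep=) and get_best_C (the hyperparameter search). A minimal sketch of both, assuming ave_vecs averages the embeddings of the in-vocabulary tokens, get_best_C picks the LinearSVC C that maximizes macro F1 on the target-language dev split, and the datasets expose _Xdev / _ydev attributes:

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

def ave_vecs_sketch(sentence, wordvecs):
    # Average the vectors of the tokens found in the embedding vocabulary.
    vecs = [wordvecs[w] for w in sentence if w in wordvecs._w2idx]
    return np.mean(vecs, axis=0) if vecs else np.zeros(wordvecs.vector_size)

def get_best_C_sketch(dataset, cross_dataset,
                      cs=(0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10)):
    # Fit on the English training split and keep the C with the best
    # macro F1 on the cross-lingual dev split.
    best_c, best_f1 = None, -1.0
    for c in cs:
        clf = LinearSVC(C=c)
        clf.fit(dataset._Xtrain, dataset._ytrain)
        pred = clf.predict(cross_dataset._Xdev)
        f1 = f1_score(cross_dataset._ydev, pred, average='macro')
        if f1 > best_f1:
            best_c, best_f1 = c, f1
    return best_c, best_f1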
Example #3
def test_embeddings(file, threshold, file_type):
    emotions = [
        "anger", "anticipation", "disgust", "fear", "joy", "sadness",
        "surprise", "trust"
    ]

    # Import dataset where each test example is the words in the tweet
    dataset = Fine_Grained_Emotion_Dataset('data',
                                           None,
                                           rep=words,
                                           threshold=threshold)

    print('Basic statistics')
    table = []
    for i, emo in enumerate(emotions):
        train = dataset._ytrain[:, i].sum()
        test = dataset._ytest[:, i].sum()
        table.append((emo, train, test))
    print(tabulate.tabulate(table, headers=['emotion', '#train', '#test']))

    #### Get Parameters ####
    max_length = 0
    vocab = {}
    for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(
            dataset._Xtest):
        if len(sent) > max_length:
            max_length = len(sent)
        for w in sent:
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1

    wordvecs = {}

    print('Importing vectors')
    for line in open(file):
        try:
            split = line.split()
            word = split[0]
            vec = np.array(split[1:], dtype='float32')
            if word in vocab:
                wordvecs[word] = vec
        except ValueError:
            pass

    dim = len(vec)

    oov = len(vocab) - len(wordvecs)
    print('OOV: {0}'.format(oov))

    # Add vectors for <unk>
    add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
    W, word_idx_map = get_W(wordvecs, dim=dim)

    # TODO: change this so I don't have to import vectors I don't need
    vecs = WordVecs(file)
    vecs._matrix = W
    vecs._w2idx = word_idx_map
    vecs.vocab_length, vecs.vector_size = W.shape

    ave_dataset = Fine_Grained_Emotion_Dataset('data', vecs, rep=ave_vecs)

    # Get padded word indexes for all X
    Xtrain = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xtrain
    ])
    Xdev = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xdev
    ])
    Xtest = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xtest
    ])
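
    # Hypothetical sketch (not the original implementation) of what
    # get_idx_from_sent is assumed to do: map each token to its row in
    # word_idx_map, truncate to max_l, and right-pad with zeros so every
    # example has the same length (k, the embedding dimension, is unused here).
    def _get_idx_from_sent_sketch(sent, word_idx_map, max_l, k=300):
        idxs = [word_idx_map[w] for w in sent.split() if w in word_idx_map]
        idxs = idxs[:max_l]
        return idxs + [0] * (max_l - len(idxs))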

    #### Test Models ####

    names = ['LSTM', 'BiLSTM', 'CNN']

    # Keep all mean and standard deviations of each emotion over datasets here
    all_emo_results = []
    all_emo_std_devs = []

    # Keep all mean and standard deviations of the averaged emotions here
    averaged_results = []
    averaged_std_devs = []

    # TEST EACH MODEL
    for name in names:

        print('Getting best parameters')

        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_params.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, Xtrain, dataset._ytrain, Xdev,
            dataset._ydev, wordvecs, W)

        print('Testing {0}'.format(name))

        # Keep the results for the 5 runs over the dataset
        model_results = []
        model_average_results = []

        # 5 runs to get average and standard deviation
        for i, it in enumerate(range(5)):
            print('Run: {0}'.format(i + 1))

            # create and train a new classifier for each iteration
            if name == 'LSTM':
                model = create_LSTM(wordvecs,
                                    dim=best_dim,
                                    output_dim=8,
                                    dropout=best_dropout,
                                    weights=W,
                                    train=True)
            elif name == 'BiLSTM':
                model = create_BiLSTM(wordvecs,
                                      dim=best_dim,
                                      output_dim=8,
                                      dropout=best_dropout,
                                      weights=W,
                                      train=True)
            elif name == 'CNN':
                model = create_cnn(W, Xtrain.shape[1])

            h = model.fit(Xtrain,
                          dataset._ytrain,
                          validation_data=[Xdev, dataset._ydev],
                          nb_epoch=best_epoch,
                          verbose=0)
            pred = model.predict(Xtest)

            pred = np.array([cutoff(x) for x in pred])
            y = dataset._ytest

            emo_results = []
            for j in range(len(emotions)):
                emo_y = y[:, j]
                emo_pred = pred[:, j]
                mm = MyMetrics(emo_y,
                               emo_pred,
                               one_hot=False,
                               average='binary')
                acc = mm.accuracy()
                precision, recall, f1 = mm.get_scores()
                emo_results.append([acc, precision, recall, f1])

            emo_results = np.array(emo_results)
            model_results.append(emo_results)

            # print('F1 scores')
            # for emo, result in zip(emotions, emo_results):
            #    a, p, r, f = result
            #    print('{0}: {1:.3f}'.format(emo, f))
            ave_acc, ave_prec, ave_rec, mac_f1 = emo_results.mean(axis=0)
            mic_prec, mic_rec, mic_f1 = micro_f1(dataset._ytest, pred)
            model_average_results.append((ave_acc, mic_prec, mic_rec, mic_f1))

            print(
                'acc: {0:.3f} micro-prec:{1:.3f} micro-rec:{2:.3f} micro-f1:{3:.3f}'
                .format(ave_acc, mic_prec, mic_rec, mic_f1))
            print()

        model_results = np.array(model_results)
        model_average_results = np.array(model_average_results)
        average_model_results = model_results.mean(axis=0)
        model_std_dev_results = model_results.std(axis=0)
        overall_avg = model_average_results.mean(axis=0)
        overall_std = model_average_results.std(axis=0)

        all_emo_results.append(average_model_results)
        all_emo_std_devs.append(model_std_dev_results)

        averaged_results.append(overall_avg)
        averaged_std_devs.append(overall_std)

    return names, all_emo_results, all_emo_std_devs, averaged_results, averaged_std_devs, dim
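
The cutoff and micro_f1 helpers used above are likewise defined elsewhere; a minimal sketch, assuming a 0.5 decision threshold on the sigmoid outputs and sklearn-style micro averaging over the eight emotion labels:

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def cutoff_sketch(probs, threshold=0.5):
    # Binarize one example's sigmoid outputs at a (hypothetical) 0.5 threshold.
    return np.array([1 if p >= threshold else 0 for p in probs])

def micro_f1_sketch(y_true, y_pred):
    # Micro-averaged precision, recall and F1 pooled over all emotion labels.
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred,
                                                       average='micro')
    return prec, rec, f1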