Example 1
0
def _train_and_report(train_data, cross_data, out_path):
    """Tune C on dev data, fit a linear SVM on *train_data*, then score it on
    the machine-translated *cross_data* test set, dumping predictions to
    *out_path* and printing acc/prec/rec/f1."""
    best_c, _best_f1 = get_best_C(train_data, cross_data)
    clf = LinearSVC(C=best_c)
    clf.fit(train_data._Xtrain, train_data._ytrain)
    acc, prec, rec, f1 = scores(clf, cross_data)
    print_prediction(clf, cross_data, out_path)
    print('acc:   {0:.3f}'.format(acc))
    print('prec:  {0:.3f}'.format(prec))
    print('rec:   {0:.3f}'.format(rec))
    print('f1:    {0:.3f}'.format(f1))


def main():
    """Machine-translation baseline: train averaged-embedding SVMs on English
    data and evaluate them on translated target-language data (es/ca/eu),
    in both binary and 4-class ("fine") settings."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default='opener_sents', help="dataset to train and test on (default: opener)")
    args = parser.parse_args()

    # Pretrained English embeddings (hard-coded experiment path kept as-is).
    vecs = WordVecs('/home/jeremy/NS/Keep/Temp/Exps/EMBEDDINGS/BLSE/google.txt')

    # English training data: 4-class and binary variants of the same dataset.
    en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                         vecs, one_hot=False, rep=ave_vecs, lowercase=False)
    en_binary = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                                vecs, one_hot=False, rep=ave_vecs, binary=True, lowercase=False)

    for lang in ['es', 'ca', 'eu']:
        print('#### {0} ####'.format(lang))
        # Machine-translated target-language data under datasets/trans/<lang>/.
        cross_dataset = General_Dataset(os.path.join('datasets', 'trans', lang, args.dataset),
                                        vecs, one_hot=False, rep=ave_vecs, lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join('datasets', 'trans', lang, args.dataset),
                                               vecs, one_hot=False, rep=ave_vecs,
                                               binary=True, lowercase=False)

        print('-binary-')
        _train_and_report(en_binary, binary_cross_dataset,
                          os.path.join('predictions', lang, 'mt', '{0}-bi.txt'.format(args.dataset)))

        print('-fine-')
        _train_and_report(en, cross_dataset,
                          os.path.join('predictions', lang, 'mt', '{0}-4cls.txt'.format(args.dataset)))
Example 2
0
File: main.py  Project: UriSha/blse
def main():
    """Train an RNN-based BLSE cross-lingual sentiment model and evaluate it.

    Parses hyper-parameters from the command line, loads source/target
    datasets, monolingual embeddings and translation pairs, trains the chosen
    model (rnn_blse or rnn_attn_blse) with Trainer, reloads the checkpoint
    with the best dev F1 and reports test-set scores on the target language.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-sl',
                        '--source_lang',
                        help="source language: es, ca, eu, en (default: en)",
                        default='en')
    parser.add_argument('-tl',
                        '--target_lang',
                        help="target language: es, ca, eu, en (default: es)",
                        default='es')
    parser.add_argument('-bi',
                        '--binary',
                        help="binary or 4-class (default: True)",
                        default=True,
                        type=str2bool)
    parser.add_argument('-e',
                        '--epochs',
                        help="training epochs (default: 200)",
                        default=200,
                        type=int)
    parser.add_argument(
        '-a',
        '--alpha',
        help=
        "trade-off between projection and classification objectives (default: .1)",
        default=.1,
        type=float)
    parser.add_argument('-pl',
                        '--proj_loss',
                        help="projection loss: mse, cosine (default: cosine)",
                        default='cosine')
    parser.add_argument('-bs',
                        '--batch_size',
                        help="classification batch size (default: 21)",
                        default=21,
                        type=int)
    parser.add_argument(
        '-sv',
        '--src_vecs',
        help=" source language vectors (default: GoogleNewsVecs )",
        default='embeddings/original/google.txt')
    parser.add_argument(
        '-tr',
        '--trans',
        help=
        'translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
        default='bingliu')
    parser.add_argument(
        '-da',
        '--dataset',
        help="dataset to train and test on (default: opener_sents)",
        default='opener_sents',
    )
    parser.add_argument(
        '-sd',
        '--savedir',
        help="where to dump weights during training (default: ./models)",
        default='models')
    parser.add_argument(
        '-lr',
        '--learning_rate',
        help="optimizer learning rate (default: 0.0001)",
        default=0.0001,
        type=float)
    parser.add_argument(
        '-m',
        '--model',
        help="model architecture: rnn_blse, rnn_attn_blse (default: rnn_attn_blse)",
        default='rnn_attn_blse')
    parser.add_argument(
        '-cu',
        '--to_cuda',
        # NOTE: type=bool was a bug here — bool('False') is True, so the flag
        # could never be switched off from the CLI. str2bool parses it the
        # same way as the --binary flag above.
        help="run on GPU when available (default: True)",
        default=True,
        type=str2bool)
    args = parser.parse_args()

    # Bail out early on an unsupported architecture.
    if args.model not in ['rnn_attn_blse', 'rnn_blse']:
        print("no such model: {}".format(args.model))
        exit(1)

    # If there's no savedir, create it.
    os.makedirs(args.savedir, exist_ok=True)

    # Binary vs. 4-class determines the classifier output size and the tag
    # used in weight/prediction paths.
    if args.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'

    weight_dir = "{}/{}/{}-{}-{}".format(args.savedir, args.model,
                                         args.dataset, args.target_lang, b)

    # Render the learning rate without trailing zeros so file names stay short.
    results_file_name = "results/report_{}_alpha-{}_batch_size-{}_epochs-{}_lr-{}.txt".format(
        args.model, args.alpha, args.batch_size, args.epochs,
        '{0:.15f}'.format(args.learning_rate).rstrip('0').rstrip('.'))

    # import datasets (representation will depend on final classifier)
    print()
    print('training model')
    print('Parameters:')
    print('model:     {0}'.format(args.model))
    print('binary:     {0}'.format(b))
    print('epochs:      {0}'.format(args.epochs))
    print('alpha (projection loss coef):      {0}'.format(args.alpha))
    print('batchsize:  {0}'.format(args.batch_size))
    print('learning rate:  {0}'.format(args.learning_rate))
    print('weight_dir:  {0}'.format(weight_dir))
    print('results_file_name:  {0}'.format(results_file_name))
    print()

    print('importing datasets')

    dataset = General_Dataset(os.path.join('datasets', args.source_lang,
                                           args.dataset),
                              None,
                              binary=args.binary,
                              rep=words,
                              one_hot=False)

    cross_dataset = General_Dataset(os.path.join('datasets', args.target_lang,
                                                 args.dataset),
                                    None,
                                    binary=args.binary,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    trg_vecs_file_path = "embeddings/original/sg-300-{}.txt".format(
        args.target_lang)
    print("trg_vecs_file_path: {}".format(trg_vecs_file_path))
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(trg_vecs_file_path)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(args.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.target_lang, trg_vecs)

    # Import translation pairs
    translation_file_path = "lexicons/{}/en-{}.txt".format(
        args.trans, args.target_lang)
    print("translation_file_path: {}".format(translation_file_path))
    pdataset = ProjectionDataset(translation_file_path, src_vecs, trg_vecs)

    # Set up model (args.model was validated above, so exactly one branch runs).
    if args.model == 'rnn_blse':
        model = RNN_BLSE(
            src_vecs,
            trg_vecs,
            pdataset,
            dataset,
            cross_dataset,
            projection_loss=args.proj_loss,
            output_dim=output_dim,
            batch_size=args.batch_size,
            to_cuda=args.to_cuda,
            src_syn1=synonyms1,
            src_syn2=synonyms2,
            src_neg=neg,
            trg_syn1=cross_syn1,
            trg_syn2=cross_syn2,
            trg_neg=cross_neg,
        )
    else:  # 'rnn_attn_blse'
        model = Rnn_Attn_BLSE(
            src_vecs,
            trg_vecs,
            pdataset,
            dataset,
            cross_dataset,
            projection_loss=args.proj_loss,
            output_dim=output_dim,
            to_cuda=args.to_cuda,
            batch_size=args.batch_size,
            src_syn1=synonyms1,
            src_syn2=synonyms2,
            src_neg=neg,
            trg_syn1=cross_syn1,
            trg_syn2=cross_syn2,
            trg_neg=cross_neg,
        )

    if torch.cuda.is_available() and args.to_cuda:
        print("cuda is available")
        model.cuda()
    else:
        print("cuda is not available")

    # Loss functions: classification is always cross-entropy; the projection
    # criterion is selectable. (The original also assigned an MSELoss default
    # that the if/elif below always overwrote — removed as dead code.)
    class_criterion = nn.CrossEntropyLoss()

    if args.proj_loss == 'mse':
        proj_criterion = nn.MSELoss()
    elif args.proj_loss == 'cosine':
        proj_criterion = cosine_loss
    else:
        print("no projection criterion supported: {}".format(args.proj_loss))
        exit(1)

    # Optimizer
    optim = torch.optim.Adam(model.parameters(), args.learning_rate)

    # Fit model; make sure the results directory exists before opening the report.
    os.makedirs(os.path.dirname(results_file_name), exist_ok=True)
    results_file = open(results_file_name, "w+")
    trainer = Trainer(model, args.alpha, optim, args.learning_rate,
                      class_criterion, proj_criterion, args.epochs,
                      args.batch_size, results_file, weight_dir, args.to_cuda)

    best_model_file_path = trainer.train(pdataset._Xtrain, pdataset._ytrain,
                                         dataset._Xtrain, dataset._ytrain)

    # Get best dev f1 and weights
    print("looking in dir: {}".format(weight_dir))
    best_f1, best_params = get_best_model_params(best_model_file_path)
    best_model = torch.load(best_model_file_path)
    state_dict = best_model.state_dict()
    model.load_state_dict(state_dict)

    print()
    print('Dev set')
    print('best dev f1: {0:.3f}'.format(best_f1))
    print('parameters: epochs {0} batch size {1} alpha {2} learning rate {3}'.
          format(*best_params))

    results_file.write('\n')
    results_file.write('Dev set\n')
    results_file.write('best dev f1: {0:.3f}\n'.format(best_f1))
    results_file.write(
        'parameters: epochs {0} batch size {1} alpha {2}\n'.format(
            *best_params))

    # Evaluate on test set
    model.eval()

    model.evaluate(cross_dataset._Xtest,
                   cross_dataset._ytest,
                   results_file=results_file,
                   src=False)

    model.confusion_matrix(cross_dataset._Xtest,
                           cross_dataset._ytest,
                           src=False,
                           results_file=results_file)

    results_file.close()
Example 3
0
def test_embeddings(file, threshold, file_type):
    """Evaluate LSTM, BiLSTM and CNN multi-label emotion classifiers with the
    given pretrained word embeddings.

    Each model is trained 5 times; per-emotion and micro-averaged scores are
    aggregated into means and standard deviations.

    :param file: path to a whitespace-separated embedding file (word followed
        by its float components).
    :param threshold: forwarded to Fine_Grained_Emotion_Dataset.
    :param file_type: unused; kept only for signature compatibility.
    :return: (names, all_emo_results, all_emo_std_devs, averaged_results,
        averaged_std_devs, dim).
    """
    emotions = [
        "anger", "anticipation", "disgust", "fear", "joy", "sadness",
        "surprise", "trust"
    ]

    # Import dataset where each test example is the words in the tweet
    dataset = Fine_Grained_Emotion_Dataset('data',
                                           None,
                                           rep=words,
                                           threshold=threshold)

    # Per-emotion label counts for train and test splits.
    print('Basic statistics')
    table = []
    for i, emo in enumerate(emotions):
        train = dataset._ytrain[:, i].sum()
        test = dataset._ytest[:, i].sum()
        table.append((emo, train, test))
    print(tabulate.tabulate(table, headers=['emotion', '#train', '#test']))

    #### Get Parameters ####
    # Longest sentence (for padding) and vocabulary counts over all splits.
    max_length = 0
    vocab = {}
    for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(
            dataset._Xtest):
        if len(sent) > max_length:
            max_length = len(sent)
        for w in sent:
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1

    wordvecs = {}

    print('Importing vectors')
    # Read the embedding file inside a context manager so the handle is always
    # closed (the original leaked it via a bare open() in the for statement).
    vec = None
    with open(file) as embedding_file:
        for line in embedding_file:
            try:
                split = line.split()
                word = split[0]
                vec = np.array(split[1:], dtype='float32')
                if word in vocab:
                    wordvecs[word] = vec
            except (ValueError, IndexError):
                # ValueError: non-numeric components (e.g. a header line);
                # IndexError: blank lines (split[0] on an empty list).
                pass

    # The original read the dimensionality off the leaked loop variable and
    # raised NameError if no line parsed; fail with an explicit error instead.
    if vec is None:
        raise ValueError('no parseable vectors found in {0}'.format(file))
    dim = len(vec)

    oov = len(vocab) - len(wordvecs)
    print('OOV: {0}'.format(oov))

    # Add vectors for <unk>
    add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
    W, word_idx_map = get_W(wordvecs, dim=dim)

    # TODO: change this so I don't have to import vectors I don't need
    vecs = WordVecs(file)
    vecs._matrix = W
    vecs._w2idx = word_idx_map
    vecs.vocab_length, vecs.vector_size = W.shape

    # NOTE(review): ave_dataset is never read below — presumably kept for its
    # construction side effects; confirm before removing.
    ave_dataset = Fine_Grained_Emotion_Dataset('data', vecs, rep=ave_vecs)

    # Get padded word indexes for all X
    Xtrain = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xtrain
    ])
    Xdev = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xdev
    ])
    Xtest = np.array([
        get_idx_from_sent(' '.join(sent),
                          word_idx_map,
                          max_l=max_length,
                          k=dim) for sent in dataset._Xtest
    ])

    #### Test Models ####

    names = ['LSTM', 'BiLSTM', 'CNN']

    # Keep all mean and standard deviations of each emotion over datasets here
    all_emo_results = []
    all_emo_std_devs = []

    # Keep all mean and standard deviations of the averaged emotions here
    averaged_results = []
    averaged_std_devs = []

    # TEST EACH MODEL
    for name in names:

        print('Getting best parameters')

        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_params.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, Xtrain, dataset._ytrain, Xdev,
            dataset._ydev, wordvecs, W)

        print('Testing {0}'.format(name))

        # Keep the results for the 5 runs over the dataset
        model_results = []
        model_average_results = []

        # 5 runs to get average and standard deviation
        for i in range(5):
            print('Run: {0}'.format(i + 1))

            # create and train a new classifier for each iteration
            if name == 'LSTM':
                model = create_LSTM(wordvecs,
                                    dim=best_dim,
                                    output_dim=8,
                                    dropout=best_dropout,
                                    weights=W,
                                    train=True)
            elif name == 'BiLSTM':
                model = create_BiLSTM(wordvecs,
                                      dim=best_dim,
                                      output_dim=8,
                                      dropout=best_dropout,
                                      weights=W,
                                      train=True)
            elif name == 'CNN':
                model = create_cnn(W, Xtrain.shape[1])

            # Training history is unused, so the fit() return value is dropped.
            model.fit(Xtrain,
                      dataset._ytrain,
                      validation_data=[Xdev, dataset._ydev],
                      nb_epoch=best_epoch,
                      verbose=0)
            pred = model.predict(Xtest)

            # Binarize sigmoid outputs before scoring.
            pred = np.array([cutoff(x) for x in pred])
            y = dataset._ytest

            # Per-emotion accuracy / precision / recall / f1.
            emo_results = []
            for j in range(len(emotions)):
                emo_y = y[:, j]
                emo_pred = pred[:, j]
                mm = MyMetrics(emo_y,
                               emo_pred,
                               one_hot=False,
                               average='binary')
                acc = mm.accuracy()
                precision, recall, f1 = mm.get_scores()
                emo_results.append([acc, precision, recall, f1])

            emo_results = np.array(emo_results)
            model_results.append(emo_results)

            # Macro scores are the per-emotion means; micro scores pool labels.
            ave_acc, ave_prec, ave_rec, mac_f1 = emo_results.mean(axis=0)
            mic_prec, mic_rec, mic_f1 = micro_f1(dataset._ytest, pred)
            model_average_results.append((ave_acc, mic_prec, mic_rec, mic_f1))

            print(
                'acc: {0:.3f} micro-prec:{1:.3f} micro-rec:{2:.3f} micro-f1:{3:.3f}'
                .format(ave_acc, mic_prec, mic_rec, mic_f1))
            print()

        # Aggregate the 5 runs into means and standard deviations.
        model_results = np.array(model_results)
        model_average_results = np.array(model_average_results)
        average_model_results = model_results.mean(axis=0)
        model_std_dev_results = model_results.std(axis=0)
        overall_avg = model_average_results.mean(axis=0)
        overall_std = model_average_results.std(axis=0)

        all_emo_results.append(average_model_results)
        all_emo_std_devs.append(model_std_dev_results)

        averaged_results.append(overall_avg)
        averaged_std_devs.append(overall_std)

    return names, all_emo_results, all_emo_std_devs, averaged_results, averaged_std_devs, dim
Example 4
0
def main():
    """Train a BLSE cross-lingual sentiment model and evaluate it on the
    target language.

    Loads source/target datasets, monolingual embeddings and translation
    pairs, fits BLSE, restores the best dev-F1 weights and reports test-set
    scores, predictions and a confusion matrix.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-sl',
                        '--source_lang',
                        help="source language: es, ca, eu, en (default: en)",
                        default='en')
    parser.add_argument('-tl',
                        '--target_lang',
                        help="target language: es, ca, eu, en (default: es)",
                        default='es')
    parser.add_argument('-bi',
                        '--binary',
                        help="binary or 4-class (default: True)",
                        default=True,
                        type=str2bool)
    parser.add_argument('-e',
                        '--epochs',
                        help="training epochs (default: 200)",
                        default=200,
                        type=int)
    parser.add_argument(
        '-a',
        '--alpha',
        help=
        "trade-off between projection and classification objectives (default: .001)",
        default=.001,
        type=float)
    parser.add_argument('-pl',
                        '--proj_loss',
                        help="projection loss: mse, cosine (default: mse)",
                        default='mse')
    parser.add_argument('-bs',
                        '--batch_size',
                        help="classification batch size (default: 20)",
                        default=20,
                        type=int)
    parser.add_argument(
        '-sv',
        '--src_vecs',
        help=" source language vectors (default: GoogleNewsVecs )",
        default='google.txt')
    parser.add_argument(
        '-tv',
        '--trg_vecs',
        help=" target language vectors (default: SGNS on Wikipedia)",
        default='sg-300-es.txt')
    parser.add_argument(
        '-tr',
        '--trans',
        help=
        'translation pairs (default: Bing Liu Sentiment Lexicon Translations)',
        default='lexicons/bingliu/en-es.txt')
    parser.add_argument(
        '-da',
        '--dataset',
        help="dataset to train and test on (default: opener_sents)",
        default='opener_sents',
    )
    parser.add_argument(
        '-sd',
        '--savedir',
        help="where to dump weights during training (default: models/blse)",
        default='models/blse')
    args = parser.parse_args()

    # import datasets (representation will depend on final classifier)
    print('importing datasets')

    dataset = General_Dataset(os.path.join('datasets', args.source_lang,
                                           args.dataset),
                              None,
                              binary=args.binary,
                              rep=words,
                              one_hot=False)

    cross_dataset = General_Dataset(os.path.join('datasets', args.target_lang,
                                                 args.dataset),
                                    None,
                                    binary=args.binary,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(args.trg_vecs)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(args.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.target_lang, trg_vecs)

    # Import translation pairs
    pdataset = ProjectionDataset(args.trans, src_vecs, trg_vecs)

    # Binary vs. 4-class determines the classifier output size and the tag
    # used in prediction file names.
    if args.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'

    # Set up model
    blse = BLSE(
        src_vecs,
        trg_vecs,
        pdataset,
        dataset,
        cross_dataset,
        projection_loss=args.proj_loss,
        output_dim=output_dim,
        src_syn1=synonyms1,
        src_syn2=synonyms2,
        src_neg=neg,
        trg_syn1=cross_syn1,
        trg_syn2=cross_syn2,
        trg_neg=cross_neg,
    )

    # If there's no savedir, create it
    os.makedirs(args.savedir, exist_ok=True)

    # Fit model
    blse.fit(pdataset._Xtrain,
             pdataset._ytrain,
             dataset._Xtrain,
             dataset._ytrain,
             weight_dir=args.savedir,
             batch_size=args.batch_size,
             alpha=args.alpha,
             epochs=args.epochs)

    # Get best dev f1 and weights
    best_f1, best_params, best_weights = get_best_run(args.savedir)
    blse.load_weights(best_weights)
    print()
    print('Dev set')
    print('best dev f1: {0:.3f}'.format(best_f1))
    print(
        'parameters: epochs {0} batch size {1} alpha {2}'.format(*best_params))

    # Evaluate on test set: once for console scores, once writing predictions.
    blse.evaluate(cross_dataset._Xtest, cross_dataset._ytest, src=False)

    blse.evaluate(cross_dataset._Xtest,
                  cross_dataset._ytest,
                  src=False,
                  outfile=os.path.join(
                      'predictions', args.target_lang, 'blse',
                      '{0}-{1}-alpha{2}-epoch{3}-batch{4}.txt'.format(
                          args.dataset, b, args.alpha, best_params[0],
                          args.batch_size)))

    blse.confusion_matrix(cross_dataset._Xtest,
                          cross_dataset._ytest,
                          src=False)

    blse.plot()
Example 5
0
def main():
    """MUSE baseline for cross-lingual sentiment.

    For each target language (ca/eu/es), averaged MUSE bilingual embeddings
    feed a linear SVM trained on the English data and evaluated on the
    target-language test set, in the binary and/or 4-class setting selected
    by -bi.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('-vec_dir',
                        default='../deployment/MUSE/',
                        help=" directory that hold MUSE vectors")
    parser.add_argument('-dataset',
                        default='opener',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help=
        'List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False],
        nargs='+',
        type=str2bool)
    args = parser.parse_args()

    def run_eval(train_split, cross_split, header, out_path):
        # Tune C on dev data, refit on the English training split, then score
        # the target-language test split and dump the predictions.
        c_value, _ = get_best_C(train_split, cross_split)
        svm = LinearSVC(C=c_value)
        svm.fit(train_split._Xtrain, train_split._ytrain)
        predicted = svm.predict(cross_split._Xtest)
        macro = macro_f1(cross_split._ytest, predicted)
        print_prediction(svm, cross_split, out_path)
        print(header)
        print('Acc: {0:.3f}'.format(
            svm.score(cross_split._Xtest, cross_split._ytest)))
        print('Macro F1: {0:.3f}'.format(macro))
        print()

    # Loop over the three languages
    for lang in ['ca', 'eu', 'es']:
        print('################ {0} ##############'.format(lang))

        # Import monolingual vectors
        print('importing word embeddings')
        muse_dir = os.path.join(args.vec_dir, 'en-{0}'.format(lang))
        english_vecs = WordVecs(os.path.join(muse_dir, 'muse-en.txt'))
        target_vecs = WordVecs(
            os.path.join(muse_dir, 'muse-{0}.txt'.format(lang)))

        # Import datasets (representation will depend on final classifier)
        print('importing datasets')
        en_path = os.path.join('datasets', 'en', args.dataset)
        cross_path = os.path.join('datasets', lang, args.dataset)

        bin_train = General_Dataset(en_path,
                                    english_vecs,
                                    binary=True,
                                    rep=ave_vecs,
                                    one_hot=False,
                                    lowercase=False)
        bin_cross = General_Dataset(cross_path,
                                    target_vecs,
                                    binary=True,
                                    rep=ave_vecs,
                                    one_hot=False,
                                    lowercase=False)
        fine_train = General_Dataset(en_path,
                                     english_vecs,
                                     binary=False,
                                     rep=ave_vecs,
                                     one_hot=False,
                                     lowercase=False)
        fine_cross = General_Dataset(cross_path,
                                     target_vecs,
                                     binary=False,
                                     rep=ave_vecs,
                                     one_hot=False,
                                     lowercase=False)

        # Train linear SVM classifier
        if True in args.bi:
            run_eval(bin_train, bin_cross, '-binary-',
                     os.path.join('predictions', lang, 'muse',
                                  '{0}-bi.txt'.format(args.dataset)))
        if False in args.bi:
            run_eval(fine_train, fine_cross, '-fine-',
                     os.path.join('predictions', lang, 'muse',
                                  '{0}-4cls.txt'.format(args.dataset)))
Example 6
0
            30, 60
    ]:
        clf = LinearSVC(C=c)
        clf.fit(dataset._Xtrain, dataset._ytrain)
        pred = clf.predict(dataset._Xdev)
        f1 = per_class_f1(dataset._ydev, pred).mean()
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
    return best_c, best_f1


if __name__ == '__main__':

    embeddingdir = '/home/jeremy/NS/Keep/Temp/Exps/EMBEDDINGS'
    amazon_vecs = WordVecs(
        os.path.join(embeddingdir, 'SubjQuant/amazon-sg-300.txt'))
    twitter_vecs = WordVecs(
        os.path.join(embeddingdir, 'twitter_embeddings.txt'))

    pdataset = ProjectionDataset('lexicons/general_vocab.txt', amazon_vecs,
                                 twitter_vecs)

    books = Book_Dataset(amazon_vecs, rep=ave_vecs, one_hot=False, binary=True)
    dvd = DVD_Dataset(amazon_vecs, rep=ave_vecs, one_hot=False, binary=True)
    electronics = Electronics_Dataset(amazon_vecs,
                                      rep=ave_vecs,
                                      binary=True,
                                      one_hot=False)
    kitchen = Kitchen_Dataset(amazon_vecs,
                              rep=ave_vecs,
                              binary=True,
Example 7
0
def run_model_on_datasets_with_embeddings(embedding_file, file_type):
    """
    Train and test the transformer model on every benchmark dataset listed
    in ``datasetNames`` below, using the given pretrained word embeddings.

    embedding_file: the word embeddings file
    file_type:      word2vec, glove

    Returns (datasetNames, results, std_devs, dim): per-dataset mean and
    standard deviation of [acc, precision, recall, f1] over
    ``hp.run_exps_amount`` runs, plus an 'overall' row averaging across
    datasets, and the embedding dimensionality.
    """
    print('importing word embedding vectors...')
    vecs = WordVecs(embedding_file, file_type)  # load the word2vec dictionary.
    dim = vecs.vector_size      # dimensionality of the word embeddings

    # For collecting results to return
    results = []
    std_devs = []


    # Datasets to run on; commented-out entries are supported but disabled.
    datasetNames = [
                # 'sst_fine',
                # 'sst_binary',
                # 'opener',
                # 'sentube_auto',
                'sentube_tablets',
                'semeval',
                ]

    # train & test the model on every dataset above
    for datasetName in datasetNames:
        # dataset_load_start = datetime.now()
        if datasetName == 'sst_fine':
            dataset = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                            None,
                                            one_hot=True,
                                            binary=False,
                                            rep=words)
        elif datasetName == 'sst_binary':
            dataset = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                            None,
                                            one_hot=True,
                                            binary=True,
                                            rep=words)
        elif datasetName == 'opener':
            dataset = General_Dataset('datasets/opener',
                                             None,
                                             one_hot=True,
                                             rep=words)

        elif datasetName == 'sentube_auto':
            dataset = General_Dataset('datasets/SenTube/auto',
                                                   None, rep=words,
                                                   binary=True,
                                                   one_hot=True)
        elif datasetName == 'sentube_tablets':
            dataset = General_Dataset('datasets/SenTube/tablets',
                                                      None, rep=words,
                                                      binary=True,
                                                      one_hot=True)
        elif datasetName == 'semeval':
            dataset = Semeval_Dataset('datasets/semeval',
                                                        None, rep=words,
                                                        one_hot=True)


        print('Loading & Testing on {}:'.format(datasetName))

        # if hp.lowercase_all_sentences:
        #     for sent in dataset._Xtrain:
        #         for word in sent:
        #             if word != word.lower():
        #                 print("Word has an uppercase character:", word.decode('utf-8'))


        # find out the max length of sentences in the dataset and construct the vocab frequency dict.
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
            if len(sent) > max_length:
                max_length = len(sent)
            for w in sent:
                if w not in vocab:
                    vocab[w] = 1
                else:
                    vocab[w] += 1

        # create a dict of words that are in our word2vec embeddings
        # wordvecs: String -> embedding_vec
        wordvecs = {}
        for w in vecs._w2idx.keys():
            if w in vocab:
                wordvecs[w] = vecs[w]

        # Assign random w2v vectors to the unknown words. These are random uniformly distrubuted vectors of size dim.
        add_unknown_words(vecs, wordvecs, vocab, min_df=1, dim=dim)
        W, word_idx_map = get_W(wordvecs, dim=dim)  # Get the w2v index map for out final vocab

        print('Converting dataset to being right padded...')
        dataset = convert_dataset(dataset, word_idx_map, datasetName, max_length)
        output_dim = dataset._ytest.shape[1]

        # Test model hp.run_exps_amount times and get averages and std dev.
        dataset_results = []
        for i in range(1, hp.run_exps_amount + 1):
            tf.reset_default_graph()  # Clears the current loaded tensorflow graph.

            # NOTE(review): this assign op is created but never run in a
            # session, and the Variable and the placeholder share the name
            # "embedding_table" — presumably createAndTrainTransformer feeds
            # wordIndxToVec_tensor itself; confirm.
            w2i = tf.Variable(tf.constant(0.0, shape=[W.shape[0], W.shape[1]]),
                trainable=False, name="embedding_table")
            wordIndxToVec_tensor = tf.placeholder(tf.float32, [W.shape[0], W.shape[1]], name="embedding_table")     # [vobab_size x word_embedding_dim]
            w2i.assign(wordIndxToVec_tensor)

            start_time = datetime.now()     # Print time for logging.
            clf, best_mm_val, best_mm_test = createAndTrainTransformer(dataset, W, wordIndxToVec_tensor, output_dim, datasetName, max_length)
            print("Finished run #", i, "Time taken: " + str(datetime.now() - start_time))

            mm = best_mm_test
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])
            # NOTE(review): with run_exps_amount == 1 the same scores end up
            # appended twice; the mean equals the single run and std is 0.
            if hp.run_exps_amount == 1:
                acc, precision, recall, micro_f1 = mm.get_scores()
                dataset_results.append([acc, precision, recall, micro_f1])  # add twice so the average is the same... avoid running multiple runs this way.

            if hp.run_exps_amount != 1:   # Print the metrics for this run, unless we're running experiment only once.
                # The same scores are added twice here, so the per-run std
                # printed below is always 0 — only the averages are meaningful.
                this_run_result = []
                this_run_result.append([acc, precision, recall, micro_f1])
                this_run_result.append([acc, precision, recall, micro_f1])
                this_run_result = np.array(this_run_result)
                this_run_ave_results = this_run_result.mean(axis=0)
                this_run_std_results = this_run_result.std(axis=0)
                printMetrics(this_run_ave_results, this_run_std_results, datasetName)

        # Get the average and std deviation over 10 runs with 10 random seeds
        dataset_results = np.array(dataset_results)
        ave_results = dataset_results.mean(axis=0)
        std_results = dataset_results.std(axis=0)
        printMetrics(ave_results, std_results, datasetName)

        results.append(ave_results)
        std_devs.append(std_results)

    # Append an 'overall' row: the column-wise mean across all datasets.
    results.append(list(np.array(results).mean(axis=0)))
    std_devs.append(list(np.array(std_devs).mean(axis=0)))
    datasetNames.append('overall')

    return datasetNames, results, std_devs, dim
Esempio n. 8
0
    parser.add_argument('-lr', '--learning_rate', default=0.001, type=float)
    parser.add_argument('-wd', '--weight_decay', default=0.0, type=float)
    parser.add_argument('-cuda', default=True, type=str2bool)
    parser.add_argument('-seed', default=123, type=int)
    args = parser.parse_args()

    print_args(args)
    args.cuda = args.cuda and torch.cuda.is_available()

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    print('Importing embeddings...')
    src_vecs = WordVecs(args.src_embeddings)
    trg_vecs = WordVecs(args.trg_embeddings)

    pdataset = ProjectionDataset(
        'lexicons/bingliu_{0}_{1}.txt'.format(args.src_lang, args.trg_lang),
        src_vecs, trg_vecs)

    print('Importing datasets...')
    # Get training, dev, and test data
    if args.binary:
        train_data, dev_data, test_data = open_dataset(
            os.path.join('annotation', args.src_dataset, args.src_lang))
        train_data = [(l, r, t, y) for l, r, t, y in train_data if y in [0, 2]]
        train_data = [(l, r, t, y) if y == 0 else (l, r, t, 1)
                      for l, r, t, y in train_data]
        dev_data = [(l, r, t, y) for l, r, t, y in dev_data if y in [0, 2]]
Esempio n. 9
0
def train_model_with_different_params(params):
    """Grid-search hyper parameters for a certain model (rnn_blse /
    rnn_attn_blse) on a certain target language.

    For every combination of projection loss, alpha, learning rate and
    batch size listed on ``params``, trains the model, keeps only the
    artefacts (saved weights and results file) of the best run by
    cross-lingual f1, and writes a running report to
    ``results/best_params_report_<model>_<lang>_<bi|4cls>.txt``.
    """
    if params.model not in ['rnn_attn_blse', 'rnn_blse']:
        print("no such model: {}".format(params.model))
        exit(1)

    # If there's no savedir, create it
    os.makedirs(params.savedir, exist_ok=True)

    # Binary sentiment uses 2 output classes, fine-grained uses 4.
    if params.binary:
        output_dim = 2
        b = 'bi'
    else:
        output_dim = 4
        b = '4cls'

    weight_dir = "{}/{}/{}-{}-{}".format(params.savedir, params.model,
                                         params.dataset, params.target_lang, b)
    best_params_file_name = "results/best_params_report_{}_{}_{}.txt".format(
        params.model, params.target_lang, b)
    best_params_file = open(best_params_file_name, "w+")

    best_params_file.write("Start parameter search:\n")
    best_params_file.write("Model: {}\n".format(params.model))
    best_params_file.write("is_binary: {}\n".format(params.binary))
    best_params_file.write("target_lang: {}\n".format(params.target_lang))

    best_f1 = 0.0
    best_params = None            # [proj_loss, alpha, lr, batch_size] of best run
    old_file_name = None          # weights file of the current best run
    old_results_file_name = None  # results file of the current best run
    rest_of_scores = []           # [acc, prec, rec] of the best run

    print('importing datasets')

    dataset = General_Dataset(os.path.join('datasets', params.source_lang,
                                           params.dataset),
                              None,
                              binary=params.binary,
                              rep=words,
                              one_hot=False)

    cross_dataset = General_Dataset(os.path.join('datasets',
                                                 params.target_lang,
                                                 params.dataset),
                                    None,
                                    binary=params.binary,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    trg_vecs_file_path = "embeddings/original/sg-300-{}.txt".format(
        params.target_lang)
    print("trg_vecs_file_path: {}".format(trg_vecs_file_path))
    src_vecs = WordVecs(params.src_vecs)
    trg_vecs = WordVecs(trg_vecs_file_path)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant(params.source_lang, src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(params.target_lang,
                                                    trg_vecs)

    # Import translation pairs
    translation_file_path = "lexicons/{}/en-{}.txt".format(
        params.trans, params.target_lang)
    print("translation_file_path: {}".format(translation_file_path))
    pdataset = ProjectionDataset(translation_file_path, src_vecs, trg_vecs)

    for proj_loss in params.proj_losses:
        for alpha in params.alphas:
            for learning_rate in params.learning_rates:
                for batch_size in params.batch_sizes:
                    best_model_file_path, acc, prec, rec, f1, results_file_name = train_model(
                        params.model, dataset, cross_dataset, src_vecs,
                        trg_vecs, synonyms1, synonyms2, neg, cross_syn1,
                        cross_syn2, cross_neg, pdataset, weight_dir, proj_loss,
                        alpha, learning_rate, batch_size, output_dim, b,
                        params)

                    if f1 > best_f1:
                        print()
                        print("Found new set of best hyper params:")
                        print("f1:      {0:.3f}".format(f1))
                        print("acc:      {0:.3f}".format(acc))
                        print("prec:      {0:.3f}".format(prec))
                        print("rec:      {0:.3f}".format(rec))
                        print('model:     {0}'.format(params.model))
                        print('is_binary:     {0}'.format(params.binary))
                        print('epochs:      {0}'.format(params.epochs))
                        print('proj_loss:      {0}'.format(proj_loss))
                        print('alpha (projection loss coef):      {0}'.format(
                            alpha))
                        print('batch size:  {0}'.format(batch_size))
                        print('learning rate:  {0}'.format(learning_rate))
                        print('weight_dir:  {0}'.format(weight_dir))
                        print('best_model_file_path:  {0}'.format(
                            best_model_file_path))
                        print()

                        best_params_file.write("\n")
                        best_params_file.write(
                            "Found new set of best hyper params:\n")
                        best_params_file.write(
                            "f1       {0:.3f}:\n".format(f1))
                        best_params_file.write(
                            "acc       {0:.3f}:\n".format(acc))
                        best_params_file.write(
                            "prec       {0:.3f}:\n".format(prec))
                        best_params_file.write(
                            "rec       {0:.3f}:\n".format(rec))
                        best_params_file.write('model:     {0}\n'.format(
                            params.model))
                        best_params_file.write('is_binary:     {0}\n'.format(
                            params.binary))
                        best_params_file.write('epochs:      {0}\n'.format(
                            params.epochs))
                        best_params_file.write(
                            'proj_loss:      {0}\n'.format(proj_loss))
                        best_params_file.write(
                            "alpha (projection loss coef):      {0}\n".format(
                                alpha))
                        best_params_file.write(
                            'batch size:  {0}\n'.format(batch_size))
                        best_params_file.write(
                            'learning:  {0}\n'.format(learning_rate))
                        best_params_file.write(
                            'weight_dir:  {0}\n'.format(weight_dir))
                        best_params_file.write(
                            'best_model_file_path:  {0}\n'.format(
                                best_model_file_path))

                        # Keep only the artefacts of the best run so far.
                        if old_file_name is not None:
                            os.remove(old_file_name)

                        if old_results_file_name is not None:
                            os.remove(old_results_file_name)

                        # NOTE(review): params.model is the model *name*
                        # string, so this torch.save overwrites the weights
                        # file train_model just produced with that string —
                        # confirm this is intended.
                        torch.save(params.model, best_model_file_path)
                        old_file_name = best_model_file_path
                        old_results_file_name = results_file_name
                        best_f1 = f1
                        rest_of_scores = [acc, prec, rec]

                        best_params = [
                            proj_loss, alpha, learning_rate, batch_size
                        ]

                    else:
                        # Not the best run: drop its artefacts immediately.
                        os.remove(results_file_name)
                        os.remove(best_model_file_path)

    if best_params is None:
        # Bug fix: previously the summary below ran unconditionally and
        # crashed with an IndexError when no run improved on f1 = 0.0.
        print("")
        print("Done parameters search: no run improved on f1 = 0.0")
        best_params_file.write("\n")
        best_params_file.write(
            "Done parameters search: no run improved on f1 = 0.0\n")
        best_params_file.close()
        return

    print("")
    print("Done parameters search")
    print("best f1: {0:.3f}".format(best_f1))
    print("its acc: {0:.3f}".format(rest_of_scores[0]))
    print("its prec: {0:.3f}".format(rest_of_scores[1]))
    print("its rec: {0:.3f}".format(rest_of_scores[2]))
    print("best_params:")
    print('model:     {0}'.format(params.model))
    print('is_binary:     {0}'.format(params.binary))
    print('proj_loss:     {0}'.format(best_params[0]))
    print('alpha (projection loss coef):      {0}'.format(best_params[1]))
    print('learning rate:  {0}'.format(best_params[2]))
    print('batch size:  {0}'.format(best_params[3]))
    print("")

    best_params_file.write("\n")
    best_params_file.write("Done parameters search\n")
    best_params_file.write("best f1: {0:.3f}\n".format(best_f1))
    best_params_file.write("its acc: {0:.3f}\n".format(rest_of_scores[0]))
    best_params_file.write("its prec: {0:.3f}\n".format(rest_of_scores[1]))
    best_params_file.write("its rec: {0:.3f}\n".format(rest_of_scores[2]))
    best_params_file.write('model:     {0}\n'.format(params.model))
    best_params_file.write('is_binary:     {0}\n'.format(params.binary))
    best_params_file.write('proj_loss:      {0}\n'.format(best_params[0]))
    best_params_file.write("alpha (projection loss coef):      {0}\n".format(
        best_params[1]))
    best_params_file.write('learning:  {0}\n'.format(best_params[2]))
    best_params_file.write('batch size:  {0}\n'.format(best_params[3]))
    best_params_file.close()
Esempio n. 10
0
def test_embeddings(embedding_file, file_type):
    """
    Tang et al. (2014) embeddings and classification approach
    on a number of benchmark datasets.

    embedding_file: path to the word embedding file
    file_type:      embedding format, e.g. word2vec or glove

    Returns (names, results, dim): for each dataset in ``names`` a row of
    [accuracy, precision, recall, f1] on its test set, plus an 'overall'
    row with the column-wise mean, and the embedding dimensionality.
    """

    print('importing vectors...')
    vecs = WordVecs(embedding_file, file_type)
    dim = vecs.vector_size

    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         None,
                                         one_hot=False,
                                         binary=False,
                                         rep=words)

    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           None,
                                           one_hot=False,
                                           binary=True,
                                           rep=words)

    # NOTE(review): opener is given the WordVecs object while the SenTube and
    # semeval datasets are given vecs._w2idx — confirm this asymmetry is
    # intended.
    opener_dataset = General_Dataset('datasets/opener',
                                     vecs,
                                     one_hot=False,
                                     rep=words)

    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           vecs._w2idx,
                                           rep=words,
                                           binary=True,
                                           one_hot=False)

    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              vecs._w2idx,
                                              rep=words,
                                              binary=True,
                                              one_hot=False)

    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                      vecs._w2idx,
                                      rep=words,
                                      one_hot=False)

    datasets = [
        st_fine, st_binary, opener_dataset, sentube_auto_dataset,
        sentube_tablets_dataset, semeval_dataset
    ]

    names = [
        'sst_fine', 'sst_binary', 'opener', 'sentube_auto', 'sentube_tablets',
        'semeval'
    ]

    # Collect results here
    results = []

    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))

        # Convert each tokenized sentence into a single feature vector.
        Xtrain = np.array(
            [conv_tweet(' '.join(t), vecs) for t in dataset._Xtrain])
        Xtest = np.array(
            [conv_tweet(' '.join(t), vecs) for t in dataset._Xtest])
        Xdev = np.array([conv_tweet(' '.join(t), vecs) for t in dataset._Xdev])

        # get best parameters on dev set
        best_C, best_rate = get_best_C(Xtrain, dataset._ytrain, Xdev,
                                       dataset._ydev)

        clf = LogisticRegression(C=best_C)
        clf.fit(Xtrain, dataset._ytrain)
        pred = clf.predict(Xtest)
        predictions_file = "predictions/joint/" + name + '/pred.txt'
        print_prediction(predictions_file, pred)

        # Binary averaging for 2-class tasks, micro otherwise.
        labels = sorted(set(dataset._ytrain))
        if len(labels) == 2:
            average = 'binary'
        else:
            average = 'micro'
        mm = MyMetrics(dataset._ytest,
                       pred,
                       one_hot=False,
                       labels=labels,
                       average=average)
        acc, precision, recall, f1 = mm.get_scores()
        results.append([acc, precision, recall, f1])

    # Append an 'overall' row: the column-wise mean across all datasets.
    results.append(list(np.array(results).mean(axis=0)))
    names.append('overall')

    return names, results, dim
Esempio n. 11
0
def main():
    """Train and evaluate a BLE bilingual sentiment model.

    Command-line driven: trains on the English dataset together with a
    translation-pair projection lexicon, picks the best saved weights from
    the weight directory, and evaluates on the target-language test set
    (writing plots and prediction files).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', help="target language: es, ca, eu", default='es')
    parser.add_argument('-bi',
                        help="binary or 4-class",
                        default=False,
                        type=str2bool)
    parser.add_argument('-epoch', default=300, type=int)
    parser.add_argument('-alpha', default=.5, type=float)
    parser.add_argument('-batch_size', default=200, type=int)
    parser.add_argument('-src_vecs', default='embeddings/original/google.txt')
    parser.add_argument('-trg_vecs',
                        default='embeddings/original/sg-300-es.txt')
    parser.add_argument(
        '-trans',
        help='translation pairs',
        default=
        'lexicons/bingliu_en_es.one-2-one_AND_Negators_Intensifiers_Diminishers.txt'
    )
    parser.add_argument('-dataset', default='opener')
    args = parser.parse_args()

    # import datasets (representation will depend on final classifier)
    print('importing datasets')

    dataset = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                              None,
                              binary=args.bi,
                              rep=words,
                              one_hot=False)

    cross_dataset = General_Dataset(os.path.join('datasets', args.l,
                                                 args.dataset),
                                    None,
                                    binary=args.bi,
                                    rep=words,
                                    one_hot=False)

    # Import monolingual vectors
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_vecs)
    trg_vecs = WordVecs(args.trg_vecs)

    # Get sentiment synonyms and antonyms to check how they move during training
    synonyms1, synonyms2, neg = get_syn_ant('en', src_vecs)
    cross_syn1, cross_syn2, cross_neg = get_syn_ant(args.l, trg_vecs)

    # Import translation pairs
    pdataset = ProjectionDataset(args.trans, src_vecs, trg_vecs)

    # initialize classifier: 2 output classes for binary, 4 for fine-grained
    # (previously the whole constructor call was duplicated in each branch).
    output_dim = 2 if args.bi else 4
    b = 'bi' if args.bi else '4cls'  # tag used in model/figure/prediction names
    ble = BLE(src_vecs, trg_vecs, pdataset, dataset, cross_dataset, synonyms1,
              synonyms2, neg, cross_syn1, cross_syn2, cross_neg, output_dim)

    # train model
    print('training model')
    print('Parameters:')
    print('lang:       {0}'.format(args.l))
    print('binary:     {0}'.format(args.bi))
    print('epoch:      {0}'.format(args.epoch))
    print('alpha:      {0}'.format(args.alpha))
    print('batchsize:  {0}'.format(args.batch_size))
    print('src vecs:   {0}'.format(args.src_vecs))
    print('trg_vecs:   {0}'.format(args.trg_vecs))
    print('trans dict: {0}'.format(args.trans))
    print('dataset:    {0}'.format(args.dataset))

    weight_dir = os.path.join('models',
                              '{0}-{1}-{2}'.format(args.dataset, args.l, b))
    ble.fit(pdataset._Xtrain,
            pdataset._ytrain,
            dataset._Xtrain,
            dataset._ytrain,
            weight_dir=weight_dir,
            alpha=args.alpha,
            epochs=args.epoch,
            batch_size=args.batch_size)

    # get the best weights saved during training
    best_f1, best_params, best_weights = get_best_run(weight_dir)
    epochs, batch_size, alpha = best_params
    ble.load_weights(best_weights)

    # evaluate: binary uses the default average, 4-class uses macro
    # (previously both branches were duplicated except for the 'bi'/'4cls'
    # tag and the average keyword).
    suffix = '{0}-{1}-alpha{2}-epoch{3}-batch{4}'.format(
        args.dataset, b, alpha, epochs, batch_size)
    eval_kwargs = {} if args.bi else {'average': 'macro'}
    ble.plot(outfile=os.path.join('figures', 'syn-ant', args.l, 'ble',
                                  suffix + '.pdf'))
    ble.evaluate(cross_dataset._Xtest, cross_dataset._ytest, src=False,
                 **eval_kwargs)
    ble.evaluate(cross_dataset._Xtest,
                 cross_dataset._ytest,
                 src=False,
                 outfile=os.path.join('predictions', args.l, 'ble',
                                      suffix + '.txt'),
                 **eval_kwargs)
Esempio n. 12
0
def test_embeddings(file, file_type):
    """Train and evaluate a CNN sentiment classifier on benchmark datasets.

    file:      path to the word embedding file
    file_type: embedding format (currently unused, see NOTE below)

    Returns (names, results, std_devs, dim): per-dataset mean and standard
    deviation of [acc, precision, recall, f1] over 5 runs, and the
    embedding dimensionality.
    """
    print('Importing vecs...')
    #vec_file = sys.argv[1]
    #vec_file = '/home/jeremy/Escritorio/sentiment_retrofitting/embeddings/sswe-u-50.txt'
    # NOTE(review): file_type is accepted but never passed on — presumably
    # WordVecs(file) auto-detects the format; confirm.
    vecs = WordVecs(file)

    print('Importing datasets...')
    st_fine = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                         None,
                                         one_hot=True,
                                         binary=False,
                                         rep=words)

    st_binary = Stanford_Sentiment_Dataset('datasets/stanford_sentanalysis',
                                           None,
                                           one_hot=True,
                                           binary=True,
                                           rep=words)

    opener_dataset = General_Dataset('datasets/opener',
                                     vecs,
                                     one_hot=True,
                                     rep=word_reps)

    twitter_dataset = Semeval_Dataset('datasets/twitter',
                                      vecs._w2idx,
                                      rep=word_reps,
                                      one_hot=True)

    sentube_auto_dataset = General_Dataset('datasets/SenTube/auto',
                                           vecs._w2idx,
                                           rep=word_reps,
                                           binary=True,
                                           one_hot=True)

    sentube_tablets_dataset = General_Dataset('datasets/SenTube/tablets',
                                              vecs._w2idx,
                                              rep=word_reps,
                                              binary=True,
                                              one_hot=True)

    semeval_dataset = Semeval_Dataset('datasets/semeval',
                                      vecs,
                                      rep=words,
                                      one_hot=True)

    datasets = [
        st_fine, st_binary, opener_dataset, sentube_auto_dataset,
        sentube_tablets_dataset, semeval_dataset
    ]

    names = [
        'sst_fine', 'sst_binary', 'opener', 'sentube_auto', 'sentube_tablets',
        'semeval'
    ]

    dim = vecs.vector_size

    # Collect per-dataset averages here. Bug fix: these were previously
    # re-created inside the loop below, so the function always returned
    # empty results and std_devs.
    results = []
    std_devs = []

    for name, dataset in zip(names, datasets):
        print('Testing on {0}...'.format(name))

        # Find the longest sentence and build a vocab frequency dict.
        max_length = 0
        vocab = {}
        for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(
                dataset._Xtest):
            if len(sent) > max_length:
                max_length = len(sent)
            for w in sent:
                if w not in vocab:
                    vocab[w] = 1
                else:
                    vocab[w] += 1

        # Keep only embeddings for words that actually occur in the data.
        wordvecs = {}
        for w in vecs._w2idx.keys():
            if w in vocab:
                wordvecs[w] = vecs[w]

        # Random vectors for out-of-vocabulary words, then the index map.
        add_unknown_words(wordvecs, vocab, min_df=1, k=dim)
        W, word_idx_map = get_W(wordvecs, k=dim)

        print('Converting and Padding dataset...')

        dataset = convert_dataset(dataset, word_idx_map, max_length)

        output_dim = dataset._ytest.shape[1]
        """
        Get best Dev params
        ===========================================================
        """

        dev_params_file = 'dev_params/' + str(W.shape[1]) + '_cnn.dev.txt'
        best_dim, best_dropout, best_epoch, best_f1 = get_dev_params(
            name, dev_params_file, max_length, dataset._Xtrain,
            dataset._ytrain, dataset._Xdev, dataset._ydev, W)

        # One [acc, prec, rec, f1] row per run. Bug fix: this was previously
        # reset inside the run loop, losing all but the last run's scores.
        dataset_results = []

        for i in range(5):
            checkpoint = ModelCheckpoint(
                'models/cnn/' + name + '/run' + str(i + 1) +
                '/weights.{epoch:03d}-{val_acc:.4f}.hdf5',
                monitor='val_acc',
                verbose=1,
                save_best_only=True,
                mode='auto')
            clf = create_cnn(W,
                             max_length,
                             dim=best_dim,
                             dropout=best_dropout,
                             output_dim=output_dim)

            clf.fit(dataset._Xtrain,
                    dataset._ytrain,
                    validation_data=[dataset._Xdev, dataset._ydev],
                    epochs=best_epoch,
                    verbose=1,
                    callbacks=[checkpoint])

            # Reload the checkpoint with the best validation accuracy
            # (parsed out of the saved file name).
            base_dir = 'models/cnn/' + name + '/run' + str(i + 1)
            weights = os.listdir(base_dir)
            best_val = 0
            best_weights = ''
            for weight in weights:
                val_acc = re.sub('weights.[0-9]*-', '', weight)
                val_acc = re.sub('.hdf5', '', val_acc)
                val_acc = float(val_acc)
                if val_acc > best_val:
                    best_val = val_acc
                    best_weights = weight

            clf = load_model(os.path.join(base_dir, best_weights))

            pred = clf.predict(dataset._Xtest, verbose=1)
            classes = clf.predict_classes(dataset._Xtest, verbose=1)
            prediction_file = 'predictions/cnn/' + name + '/run' + str(
                i + 1) + '/pred.txt'
            w2idx_file = 'predictions/cnn/' + name + '/w2idx.pkl'
            print_prediction(prediction_file, classes)
            with open(w2idx_file, 'wb') as out:
                pickle.dump(word_idx_map, out)

            labels = sorted(set(dataset._ytrain.argmax(1)))
            if len(labels) == 2:
                average = 'binary'
            else:
                average = 'micro'
            mm = MyMetrics(dataset._ytest,
                           pred,
                           labels=labels,
                           average=average)
            acc, precision, recall, micro_f1 = mm.get_scores()
            dataset_results.append([acc, precision, recall, micro_f1])

        # Aggregate the 5 runs for this dataset. Bug fix: previously nothing
        # was appended to results/std_devs at all.
        dataset_results = np.array(dataset_results)
        results.append(dataset_results.mean(axis=0))
        std_devs.append(dataset_results.std(axis=0))

    return names, results, std_devs, dim
Esempio n. 13
0
    parser = argparse.ArgumentParser()
    parser.add_argument("--NUM_LAYERS", "-nl", default=1, type=int)
    parser.add_argument("--HIDDEN_DIM", "-hd", default=100, type=int)
    parser.add_argument("--BATCH_SIZE", "-bs", default=50, type=int)
    parser.add_argument("--EMBEDDING_DIM", "-ed", default=300, type=int)
    parser.add_argument("--TRAIN_EMBEDDINGS", "-te", action="store_true")
    parser.add_argument("--AUXILIARY_TASK", "-aux", default="negation_scope")
    parser.add_argument("--EMBEDDINGS",
                        "-emb",
                        default="../../embeddings/google.txt")

    args = parser.parse_args()
    print(args)

    # Get embeddings (CHANGE TO GLOVE OR FASTTEXT EMBEDDINGS)
    embeddings = WordVecs(args.EMBEDDINGS)
    w2idx = embeddings._w2idx

    # Create shared vocabulary for tasks
    vocab = Vocab(train=True)

    # Update with word2idx from pretrained embeddings so we don't lose them
    # making sure to change them by one to avoid overwriting the UNK token
    # at index 0
    with_unk = {}
    for word, idx in embeddings._w2idx.items():
        with_unk[word] = idx + 1
    vocab.update(with_unk)

    # Import datasets
    # This will update vocab with words not found in embeddings
Esempio n. 14
0
def main():
    """Cross-lingual sentiment evaluation with bilingual (Barista) embeddings.

    For each target language (es, ca, eu): build a shared vocabulary from the
    English training data and the target-language test data, load the bilingual
    embedding space restricted to that vocabulary, train a LinearSVC on
    averaged English word vectors, and evaluate on the target language.
    Runs the binary and/or 4-class setting depending on ``-bi``.

    Side effects: prints metrics to stdout and writes prediction files under
    ``predictions/<lang>/barista/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset',
                        default='opener',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument(
        '-bi',
        help=
        'List of booleans. True is only binary, False is only 4 class. True False is both. (default: [True, False])',
        default=[True, False],
        nargs='+',
        type=str2bool)
    args = parser.parse_args()

    langs = ['es', 'ca', 'eu']

    for lang in langs:
        print('#### {0} ####'.format(lang))
        # First pass: load datasets with no vectors (rep=words) purely to
        # collect the vocabulary needed to filter the embedding file.
        en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                             None,
                             one_hot=False,
                             rep=words)
        cross_dataset = General_Dataset(os.path.join('datasets', lang,
                                                     args.dataset),
                                        None,
                                        one_hot=False,
                                        rep=words)
        # BUG FIX: the original did `vocab = en.vocab.update(...)`, but
        # .update() mutates in place and returns None, so WordVecs received
        # vocab=None and the combined vocabulary was silently discarded.
        # Merge explicitly and pass the merged vocabulary.
        vocab = en.vocab
        vocab.update(cross_dataset.vocab)

        vecs = WordVecs(
            'embeddings/barista/sg-300-window4-negative20_en_{0}.txt'.format(
                lang),
            vocab=vocab)

        # Second pass: reload datasets represented as averaged word vectors.
        en = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                             vecs,
                             one_hot=False,
                             rep=ave_vecs,
                             lowercase=False)
        en_binary = General_Dataset(os.path.join('datasets', 'en',
                                                 args.dataset),
                                    vecs,
                                    one_hot=False,
                                    rep=ave_vecs,
                                    binary=True,
                                    lowercase=False)

        cross_dataset = General_Dataset(os.path.join('datasets', lang,
                                                     args.dataset),
                                        vecs,
                                        one_hot=False,
                                        rep=ave_vecs,
                                        lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join(
            'datasets', lang, args.dataset),
                                               vecs,
                                               one_hot=False,
                                               rep=ave_vecs,
                                               binary=True,
                                               lowercase=False)

        if True in args.bi:
            print('-binary-')
            # Tune C on the cross-lingual dev split, then retrain and score.
            best_c, best_f1 = get_best_C(en_binary, binary_cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(en_binary._Xtrain, en_binary._ytrain)
            acc, f1 = scores(clf, binary_cross_dataset, 'binary')
            print_prediction(
                clf, binary_cross_dataset,
                os.path.join('predictions', lang, 'barista',
                             '{0}-bi.txt'.format(args.dataset)))
            print('acc: {0:.3f}'.format(acc))
            print('f1:  {0:.3f}'.format(f1))

        if False in args.bi:
            print('-fine-')
            best_c, best_f1 = get_best_C(en, cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(en._Xtrain, en._ytrain)
            # NOTE(review): scores() is called without an averaging argument
            # here (4-class case), matching the binary call's pattern elsewhere
            # in this file — presumably it defaults to micro/macro averaging.
            acc, f1 = scores(clf, cross_dataset)
            print_prediction(
                clf, cross_dataset,
                os.path.join('predictions', lang, 'barista',
                             '{0}-4cls.txt'.format(args.dataset)))

            print('acc: {0:.3f}'.format(acc))
            print('f1:  {0:.3f}'.format(f1))