Example #1
import argparse
import os

import numpy as np
from sklearn.svm import LinearSVC

# The remaining helpers (WordVecs, ProjectionDataset, General_Dataset, get_W,
# get_best_C, ave_vecs, macro_f1, print_prediction, str2bool) are assumed to be
# provided by the repository's utility modules; they are not defined in this example.


def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-src_vecs',
                        default='embeddings/original/google.txt',
                        help="source language vectors (default: GoogleNewsVecs)")
    parser.add_argument('-trg_vecs',
                        default='embeddings/original/sg-300-{0}.txt',
                        help="target language vectors (default: SGNS on Wikipedia)")
    parser.add_argument('-trans',
                        default='lexicons/bingliu/en-{0}.txt',
                        help="translation pairs (default: Bing Liu Sentiment Lexicon Translations)")
    parser.add_argument('-dataset',
                        default='opener_sents',
                        help="dataset to train and test on (default: opener)")
    parser.add_argument('-bi',
                        default=[True, False],
                        nargs='+',
                        type=str2bool,
                        help="list of booleans: True for binary only, False for 4-class only, "
                             "'True False' for both (default: [True, False])")
    args = parser.parse_args()
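    # str2bool (used as the type of the -bi flag) is assumed to be a small
    # helper along the lines of:
    #
    #   def str2bool(v):
    #       return str(v).lower() in ('yes', 'true', 't', '1')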

    # Loop over the three languages
    for lang in ['es', 'ca', 'eu']:
        print('################ {0} ##############'.format(lang))

        # Import monolingual vectors
        print('importing word embeddings')
        src_vecs = WordVecs(args.src_vecs)
        src_vecs.mean_center()
        src_vecs.normalize()

        trg_vecs = WordVecs(args.trg_vecs.format(lang))
        trg_vecs.mean_center()
        trg_vecs.normalize()
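        # mean_center() and normalize() are assumed to subtract the column-wise
        # mean from the embedding matrix and rescale each row to unit length,
        # roughly:
        #
        #   X -= X.mean(axis=0)
        #   X /= np.linalg.norm(X, axis=1, keepdims=True)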

        # Setup projection dataset
        pdataset = ProjectionDataset(args.trans.format(lang), src_vecs,
                                     trg_vecs)

        # learn the translation matrix W
        W = get_W(pdataset, src_vecs, trg_vecs)
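        # get_W is assumed to fit a linear map that sends the source-side
        # dictionary entries onto their target-side translations, for example
        # via least squares (variable names below are illustrative only):
        #
        #   X = np.array([src_vecs[w] for w in src_words])
        #   Y = np.array([trg_vecs[w] for w in trg_words])
        #   W, *_ = np.linalg.lstsq(X, Y, rcond=None)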

        # project the source matrix to the new shared space
        src_vecs._matrix = np.dot(src_vecs._matrix, W)

        # Import datasets (representation will depend on final classifier)
        print('importing datasets')
        binary_dataset = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                                         src_vecs,
                                         binary=True,
                                         rep=ave_vecs,
                                         one_hot=False,
                                         lowercase=False)
        binary_cross_dataset = General_Dataset(os.path.join('datasets', lang, args.dataset),
                                               trg_vecs,
                                               binary=True,
                                               rep=ave_vecs,
                                               one_hot=False,
                                               lowercase=False)

        fine_dataset = General_Dataset(os.path.join('datasets', 'en', args.dataset),
                                       src_vecs,
                                       binary=False,
                                       rep=ave_vecs,
                                       one_hot=False,
                                       lowercase=False)
        fine_cross_dataset = General_Dataset(os.path.join('datasets', lang, args.dataset),
                                             trg_vecs,
                                             binary=False,
                                             rep=ave_vecs,
                                             one_hot=False,
                                             lowercase=False)
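        # rep=ave_vecs: each sentence is assumed to be represented as the
        # average of its word embeddings, roughly:
        #
        #   def ave_vecs(sentence, vecs):
        #       vectors = [vecs[w] for w in sentence if w in vecs]
        #       return np.mean(vectors, axis=0)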

        # Train linear SVM classifier
        if True in args.bi:
            best_c, best_f1 = get_best_C(binary_dataset, binary_cross_dataset)
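            # get_best_C is assumed to sweep a range of C values, training on
            # the English split and scoring on the cross-lingual dev split, and
            # to return the best-performing C; macro_f1 itself is presumably a
            # thin wrapper around sklearn.metrics.f1_score with average='macro'.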
            clf = LinearSVC(C=best_c)
            clf.fit(binary_dataset._Xtrain, binary_dataset._ytrain)
            cpred = clf.predict(binary_cross_dataset._Xtest)
            cf1 = macro_f1(binary_cross_dataset._ytest, cpred)
            print_prediction(
                clf, binary_cross_dataset,
                os.path.join('predictions', lang, 'artetxe',
                             '{0}-bi.txt'.format(args.dataset)))
            print('-binary-')
            print('Acc: {0:.3f}'.format(
                clf.score(binary_cross_dataset._Xtest,
                          binary_cross_dataset._ytest)))
            print('Macro F1: {0:.3f}'.format(cf1))
            print()

        if False in args.bi:
            best_c, best_f1 = get_best_C(fine_dataset, fine_cross_dataset)
            clf = LinearSVC(C=best_c)
            clf.fit(fine_dataset._Xtrain, fine_dataset._ytrain)
            cpred = clf.predict(fine_cross_dataset._Xtest)
            cf1 = macro_f1(fine_cross_dataset._ytest, cpred)
            print_prediction(
                clf, fine_cross_dataset,
                os.path.join('predictions', lang, 'artetxe',
                             '{0}-4cls.txt'.format(args.dataset)))
            print('-fine-')
            print('Acc: {0:.3f}'.format(
                clf.score(fine_cross_dataset._Xtest,
                          fine_cross_dataset._ytest)))
            print('Macro F1: {0:.3f}'.format(cf1))
            print()
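
if __name__ == '__main__':
    # The usual entry-point guard, assumed here so the example runs as a script.
    main()

# A typical invocation from the repository root might look like the following
# (the script's filename is not given in this excerpt; artetxe.py is used only
# for illustration):
#
#   python artetxe.py -dataset opener_sents -bi True False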

Example #2

    parser.add_argument('-te',
                        '--trg_embedding',
                        default="../../embeddings/BLSE/sg-300-es.txt")
    parser.add_argument('-sd',
                        '--src_dataset',
                        default="datasets/training/en/raw")
    parser.add_argument('-td',
                        '--trg_dataset',
                        default="datasets/training/es/raw")

    args = parser.parse_args()
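    # args.src_embedding and args.lang are used below; their add_argument
    # definitions belong to the full script and are not shown in this excerpt.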

    # Import monolingual vectors
    print('importing word embeddings')
    src_vecs = WordVecs(args.src_embedding)
    src_vecs.mean_center()
    src_vecs.normalize()
    trg_vecs = WordVecs(args.trg_embedding)
    trg_vecs.mean_center()
    trg_vecs.normalize()

    # Setup projection dataset
    trans = 'lexicons/bingliu_en_{0}.one-2-one.txt'.format(args.lang)
    pdataset = ProjectionDataset(trans, src_vecs, trg_vecs)

    # learn the translation matrix W
    print('Projecting src embeddings to trg space...')
    W = get_projection_matrix(pdataset, src_vecs, trg_vecs)
    print('W done')
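    # get_projection_matrix plays the same role as get_W in Example #1: it is
    # assumed to learn a linear map from source-side dictionary entries to
    # their target-side translations.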

    # project the source matrix to the new shared space