def command_classify(_args):
        """Run the classify command and print the resulting measures.

        Reads the training and testing friend pairs, both models and the linear
        transformation from the locations named in ``_args``.

        * If ``_args.cross_validation`` is truthy, the two pair lists are
          concatenated, turned into features/labels and handed to
          ``classifier.classify_with_cross_validation``; each measure is printed
          as ``mean ± delta`` followed by the metrics and confusion matrices of
          the means.
        * Otherwise features are built separately for the training and testing
          pairs (the scaler returned for the training pairs is reused for the
          testing pairs) and handed to ``classifier.classify``.

        :param _args: object exposing ``training_friends_file_name``,
            ``testing_friends_file_name``, ``translation_matrix_file_name``,
            ``classifier``, ``cross_validation`` and ``backwards`` (plus whatever
            ``util.read_models`` reads from it).
        :return: ``None``; results are written to standard output.
        """
        pairs_for_training = util.read_words(_args.training_friends_file_name)
        pairs_for_testing = util.read_words(_args.testing_friends_file_name)
        model_es, model_pt = util.read_models(_args)

        transformation = linear_trans.load_linear_transformation(_args.translation_matrix_file_name)

        # Resolved before branching, so an unknown classifier key fails on both paths.
        chosen_clf = CLF_OPTIONS[_args.classifier]

        if not _args.cross_validation:
            train_features, train_labels, fitted_scaler = classifier.features_labels_and_scaler(
                pairs_for_training, model_es, model_pt, transformation, backwards=_args.backwards)
            test_features, test_labels, _ = classifier.features_labels_and_scaler(
                pairs_for_testing, model_es, model_pt, transformation, scaler=fitted_scaler,
                backwards=_args.backwards)
            # NOTE(review): chosen_clf is not forwarded on this path, so classify()
            # presumably falls back to its own default -- confirm this is intended.
            split_measures = classifier.classify(train_features, test_features, train_labels, test_labels)

            print('')

            __print_metrics_matrix(split_measures)
            __print_confusion_matrix(split_measures)
            return

        all_pairs = pairs_for_training + pairs_for_testing
        features, labels, _ = classifier.features_labels_and_scaler(all_pairs, model_es, model_pt, transformation,
                                                                    backwards=_args.backwards)
        # FIXME: scaling should probably be fitted on each training fold separately
        # instead of once over the whole data set.
        cv_measures = classifier.classify_with_cross_validation(features, labels, clf=chosen_clf)
        print('')

        print("Cross-validation measures with 95% of confidence:")

        line_template = "{measure_name}: {mean:0.4f} ± {delta:0.4f} --- [{inf:0.4f}, {sup:0.4f}]"
        for measure_name, (mean, delta) in cv_measures.items():
            lower_bound = mean - delta
            upper_bound = mean + delta
            print(line_template.format(measure_name=measure_name, mean=mean, delta=delta,
                                       inf=lower_bound, sup=upper_bound))

        print('')

        # The matrix printers receive plain values, so drop the deltas.
        means_only = {}
        for measure_name, (mean, _delta) in cv_measures.items():
            means_only[measure_name] = mean
        __print_metrics_matrix(means_only)
        __print_confusion_matrix(means_only)
    def command_out_of_vocabulary(_args):
        """Print, per language, the words reported as out of vocabulary.

        Reads the friend pairs from ``_args.friends_file_name`` and both models
        via ``util.read_models``. For each language it prints a heading
        (``OOV es:`` / ``OOV pt:``) followed by every word yielded by
        ``word_vectors.words_out_of_vocabulary`` for that model, one per line,
        with an empty line between the two sections.

        :param _args: object exposing ``friends_file_name`` (plus whatever
            ``util.read_models`` reads from it).
        :return: ``None``; the report is written to standard output.
        """
        friend_pairs = util.read_words(_args.friends_file_name)
        model_es, model_pt = util.read_models(_args)

        # Both word streams are created up front and consumed lazily below.
        spanish_words = (pair.word_es for pair in friend_pairs)
        portuguese_words = (pair.word_pt for pair in friend_pairs)

        sections = (
            ("OOV es:", model_es, spanish_words),
            ("OOV pt:", model_pt, portuguese_words),
        )
        for position, (heading, model, words) in enumerate(sections):
            if position > 0:
                # Blank separator line between consecutive sections.
                print('')
            print(heading)
            for missing_word in word_vectors.words_out_of_vocabulary(model, words):
                print(missing_word)
    def command_out_of_vocabulary(args_):
        """Report the out-of-vocabulary words of the friend pairs on stdout.

        The friend pairs come from ``args_.friends_file_name`` and the two
        models from ``util.read_models(args_)``. The Spanish section
        (``OOV es:``) is printed first, then an empty line, then the Portuguese
        section (``OOV pt:``); each section lists what
        ``word_vectors.words_out_of_vocabulary`` yields, one word per line.

        :param args_: object exposing ``friends_file_name`` (plus whatever
            ``util.read_models`` reads from it).
        :return: ``None``.
        """

        def report(heading, model, candidate_words):
            # One section: heading line followed by every reported word.
            print(heading)
            for oov_word in word_vectors.words_out_of_vocabulary(
                    model, candidate_words):
                print(oov_word)

        pairs = util.read_words(args_.friends_file_name)
        model_es, model_pt = util.read_models(args_)
        # Lazy streams; each is consumed only when its section is printed.
        es_candidates = (pair.word_es for pair in pairs)
        pt_candidates = (pair.word_pt for pair in pairs)

        report("OOV es:", model_es, es_candidates)
        print('')
        report("OOV pt:", model_pt, pt_candidates)
Example #4
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import random

from sklearn import svm

from falsefriends import classifier
from falsefriends import linear_trans
from falsefriends import util, word_vectors

# Timestamped INFO-level logging for the progress messages below.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

training_friend_pairs = util.read_words('resources/sepulveda2011_training.txt')

model_es = word_vectors.load_model('resources/big/vectors_es_100.bin')
model_pt = word_vectors.load_model('resources/big/vectors_pt_100.bin')

logging.info("computing equal words...")
# Keys that appear in both models' vocabularies.
equal_words = model_es.vocab.keys() & model_pt.vocab.keys()

print("Equal words number in the Wikipedia's:", len(equal_words))

SAMPLE_SIZE = 20
# random.sample() needs a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError since 3.11. Sorting gives a deterministic order, so the
# sample is reproducible under a seeded RNG. The size is capped so that a small
# intersection does not raise ValueError.
sample_size = min(SAMPLE_SIZE, len(equal_words))
print("Sample", sample_size, "equal words found:",
      random.sample(sorted(equal_words), sample_size))

T = linear_trans.load_linear_transformation(
    'resources/big/trans_es_100_pt_100.npz')
# NOTE(review): `bilingual_lexicon`, `np` and `PARENT_DIR` are used below but are
# neither imported nor defined in the visible part of this script -- confirm
# where they are supposed to come from.
lexicon = bilingual_lexicon.most_frequent_bilingual_lexicon_based_on_external_count(
    model_es.vocab, model_pt.vocab)

logging.info("getting vector pairs")
# bilingual_lexicon_vectors yields pairs; zip(*...) splits them into two
# parallel tuples, X and Y.
X, Y = zip(*word_vectors.bilingual_lexicon_vectors(
    model_es, model_pt, bilingual_lexicon=lexicon))

# Stack each tuple vertically into a single array so rows can be selected by
# index in the loop below.
X_array = np.vstack(X)
Y_array = np.vstack(Y)

logging.info(
    "Computing linear transformations and classifying with cross-validation..."
)
# Header of the results table.
# NOTE(review): no row is printed by the visible loop body (`measures` is
# computed and then unused) -- the snippet looks cut off; confirm.
print("   N\t Acc")

friend_pairs = util.read_words(PARENT_DIR +
                               '/resources/sepulveda2011_original.txt')

clf = classifier.build_classifier()

# For each sample size (50, 100, ... below len(X)): pick that many row indices
# at random without replacement, compute a linear transformation from the
# selected rows only, and cross-validate the classifier on features built with
# that transformation.
# NOTE: range() is evaluated once, before `X` is rebound inside the loop, so
# the bounds are taken from the tuple unpacked above.
for size in range(50, len(X), 50):
    indices = np.random.choice(list(range(X_array.shape[0])),
                               size,
                               replace=False)

    # Rebinds `T`, replacing the transformation loaded from disk earlier.
    T = linear_trans.linear_transformation(list(X_array[indices, :]),
                                           list(Y_array[indices, :]))

    # Rebinds `X` (previously the tuple from the lexicon) to the feature matrix.
    X, y = classifier.features_and_labels(friend_pairs, model_es, model_pt, T)

    measures = classifier.classify_with_cross_validation(X, y, clf=clf)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import random

from sklearn import svm

from falsefriends import classifier
from falsefriends import linear_trans
from falsefriends import util, word_vectors

# Timestamped INFO-level logging for the progress messages below.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

training_friend_pairs = util.read_words('resources/sepulveda2011_training.txt')

model_es = word_vectors.load_model('resources/big/jairo/vectors_es.bin')
model_pt = word_vectors.load_model('resources/big/jairo/vectors_pt.bin')

logging.info("computing equal words...")
# Keys that appear in both models' vocabularies.
equal_words = model_es.vocab.keys() & model_pt.vocab.keys()

print("Equal words number in the Wikipedia's:", len(equal_words))

SAMPLE_SIZE = 20
# random.sample() needs a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError since 3.11. Sorting gives a deterministic order, so the
# sample is reproducible under a seeded RNG. The size is capped so that a small
# intersection does not raise ValueError.
sample_size = min(SAMPLE_SIZE, len(equal_words))
print("Sample", sample_size, "equal words found:", random.sample(sorted(equal_words), sample_size))

T = linear_trans.load_linear_transformation('resources/big/jairo/linear_trans.npz')

clf = svm.SVC()

# The scaler returned alongside the training features is kept as well.
# NOTE(review): nothing in the visible part of the script uses these results or
# `clf` afterwards -- the snippet appears to be cut off here.
X_train, y_train, scaler = classifier.features_labels_and_scaler(training_friend_pairs, model_es, model_pt, T)
    def command_classify(args_):
        """Run the classify command and print the resulting measures.

        Loads the training and testing friend pairs, both models and the linear
        transformation named in ``args_``, and builds the classifier from
        ``CLF_OPTIONS[args_.classifier]``.

        * With ``args_.cross_validation`` truthy, features are extracted from
          the concatenation of both pair lists and evaluated with
          ``classifier.classify_with_cross_validation``. Every measure is
          printed as ``mean ± delta`` with its interval, and the means are
          passed to the matrix printers.
        * Otherwise features are extracted separately from the training and
          the testing pairs and evaluated with ``classifier.classify``.

        :param args_: object exposing ``training_friends_file_name``,
            ``testing_friends_file_name``, ``translation_matrix_file_name``,
            ``classifier``, ``cross_validation``, ``backwards``, ``top`` and
            ``use_taxonomy`` (plus whatever ``util.read_models`` reads from it).
        :return: ``None``; results are written to standard output.
        """

        def extract_features(pairs):
            # Single place for the feature-extraction options shared by every
            # call; the option attributes are read at call time.
            return classifier.features_and_labels(
                pairs,
                model_es,
                model_pt,
                transformation,
                backwards=args_.backwards,
                topx=args_.top,
                use_taxonomy=args_.use_taxonomy)

        train_pairs = util.read_words(args_.training_friends_file_name)
        test_pairs = util.read_words(args_.testing_friends_file_name)
        model_es, model_pt = util.read_models(args_)

        transformation = linear_trans.load_linear_transformation(
            args_.translation_matrix_file_name)

        estimator = classifier.build_classifier(CLF_OPTIONS[args_.classifier])

        if args_.cross_validation:
            features, labels = extract_features(train_pairs + test_pairs)
            cv_results = classifier.classify_with_cross_validation(
                features, labels, clf=estimator)
            print('')

            print("Cross-validation measures with 95% of confidence:")

            row_template = "{measure_name}: {mean:0.4f} ± {delta:0.4f} --- [{inf:0.4f}, {sup:0.4f}]"
            # The matrix printers receive plain values, so collect the means
            # while the rows are being printed.
            summary = {}
            for measure_name, (mean, delta) in cv_results.items():
                print(row_template.format(measure_name=measure_name,
                                          mean=mean,
                                          delta=delta,
                                          inf=mean - delta,
                                          sup=mean + delta))
                summary[measure_name] = mean
        else:
            X_train, y_train = extract_features(train_pairs)
            X_test, y_test = extract_features(test_pairs)
            summary = classifier.classify(X_train, X_test, y_train, y_test,
                                          estimator)

        # Common tail of both modes: blank line, then the two matrices.
        print('')

        _print_metrics_matrix(summary)
        _print_confusion_matrix(summary)