def command_classify(_args):
    """Train and evaluate the false-friends classifier.

    Reads the training/testing friend-pair files and the two word-vector
    models named in ``_args``, loads the linear translation matrix T, and
    either cross-validates on the pooled data (``_args.cross_validation``)
    or trains on the training pairs and evaluates on the testing pairs.
    Results are printed; nothing is returned.
    """
    training_friend_pairs = util.read_words(_args.training_friends_file_name)
    testing_friend_pairs = util.read_words(_args.testing_friends_file_name)
    model_es, model_pt = util.read_models(_args)
    T = linear_trans.load_linear_transformation(_args.translation_matrix_file_name)
    clf = CLF_OPTIONS[_args.classifier]
    if _args.cross_validation:
        X, y, _ = classifier.features_labels_and_scaler(
            training_friend_pairs + testing_friend_pairs, model_es, model_pt, T,
            backwards=_args.backwards)
        # FIXME: I think it should scale on each different training set.
        measures = classifier.classify_with_cross_validation(X, y, clf=clf)
        print('')
        print("Cross-validation measures with 95% of confidence:")
        for measure_name, (mean, delta) in measures.items():
            print("{measure_name}: {mean:0.4f} ± {delta:0.4f} --- [{inf:0.4f}, {sup:0.4f}]".format(
                measure_name=measure_name, mean=mean, delta=delta,
                inf=mean - delta, sup=mean + delta))
        print('')
        mean_measures = {measure_name: mean
                         for measure_name, (mean, delta) in measures.items()}
        __print_metrics_matrix(mean_measures)
        __print_confusion_matrix(mean_measures)
    else:
        X_train, y_train, scaler = classifier.features_labels_and_scaler(
            training_friend_pairs, model_es, model_pt, T,
            backwards=_args.backwards)
        # Scale the test features with the scaler fitted on the training set.
        X_test, y_test, _ = classifier.features_labels_and_scaler(
            testing_friend_pairs, model_es, model_pt, T, scaler=scaler,
            backwards=_args.backwards)
        # Bug fix: `clf` (chosen via _args.classifier) was built above but never
        # passed here, so this path silently evaluated a default classifier.
        measures = classifier.classify(X_train, X_test, y_train, y_test, clf=clf)
        print('')
        __print_metrics_matrix(measures)
        __print_confusion_matrix(measures)
def command_out_of_vocabulary(_args):
    """Print the friend-pair words missing from each word-vector model.

    Loads the friend pairs named in ``_args`` plus the es/pt models, then
    lists the out-of-vocabulary Spanish words followed by the Portuguese
    ones, separated by a blank line.
    """
    pairs = util.read_words(_args.friends_file_name)
    model_es, model_pt = util.read_models(_args)

    def report(header, model, words):
        # Print one OOV section: its header, then each missing word.
        print(header)
        for word in word_vectors.words_out_of_vocabulary(model, words):
            print(word)

    report("OOV es:", model_es, (pair.word_es for pair in pairs))
    print('')
    report("OOV pt:", model_pt, (pair.word_pt for pair in pairs))
def command_out_of_vocabulary(args_):
    """Report friend-pair words absent from the es and pt vector models.

    Output format: an "OOV es:" section, a blank line, then an "OOV pt:"
    section, each listing one missing word per line.
    """
    pairs = util.read_words(args_.friends_file_name)
    model_es, model_pt = util.read_models(args_)

    # (header, model, lazily-extracted words) for each language section.
    sections = (
        ("OOV es:", model_es, (pair.word_es for pair in pairs)),
        ("OOV pt:", model_pt, (pair.word_pt for pair in pairs)),
    )
    for index, (header, model, words) in enumerate(sections):
        if index:
            print('')
        print(header)
        for missing in word_vectors.words_out_of_vocabulary(model, words):
            print(missing)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Load the 100-dim es/pt vector models, report the vocabulary overlap,
sample a few shared words, and load the es→pt translation matrix."""
import logging
import random

from sklearn import svm

from falsefriends import classifier
from falsefriends import linear_trans
from falsefriends import util, word_vectors

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

training_friend_pairs = util.read_words('resources/sepulveda2011_training.txt')

model_es = word_vectors.load_model('resources/big/vectors_es_100.bin')
model_pt = word_vectors.load_model('resources/big/vectors_pt_100.bin')

logging.info("computing equal words...")
# Dict-view intersection: words spelled identically in both vocabularies.
equal_words = model_es.vocab.keys() & model_pt.vocab.keys()
print("Equal words number in the Wikipedia's:", len(equal_words))

SAMPLE_SIZE = 20
# Fix: random.sample() requires a sequence — sampling directly from a set was
# deprecated in Python 3.9 and raises TypeError from 3.11.  Sorting first also
# makes the draw reproducible under a fixed seed, and clamping the size avoids
# a ValueError when the overlap is smaller than SAMPLE_SIZE.
print("Sample", SAMPLE_SIZE, "equal words found:",
      random.sample(sorted(equal_words), min(SAMPLE_SIZE, len(equal_words))))

T = linear_trans.load_linear_transformation(
    'resources/big/trans_es_100_pt_100.npz')
# Build the bilingual lexicon from the most frequent shared entries of the two
# vocabularies, using externally-provided frequency counts.
lexicon = bilingual_lexicon.most_frequent_bilingual_lexicon_based_on_external_count(
    model_es.vocab, model_pt.vocab)
logging.info("getting vector pairs")
# One es vector (X) and its paired pt vector (Y) per lexicon entry.
X, Y = zip(*word_vectors.bilingual_lexicon_vectors(
    model_es, model_pt, bilingual_lexicon=lexicon))
X_array = np.vstack(X)
Y_array = np.vstack(Y)
logging.info(
    "Computing linear transformations and classifying with cross-validation..."
)
print(" N\t Acc")
friend_pairs = util.read_words(PARENT_DIR +
                               '/resources/sepulveda2011_original.txt')
clf = classifier.build_classifier()
# Sweep the number of lexicon pairs used to fit the translation matrix.
# NOTE(review): range() evaluates len(X) once, so rebinding X inside the body
# does not change the iteration count — but X is shadowed below (vector tuples
# here vs. classifier features inside the loop); consider renaming one of them.
for size in range(50, len(X), 50):
    # Draw `size` distinct row indices to train this iteration's matrix on.
    indices = np.random.choice(list(range(X_array.shape[0])), size,
                               replace=False)
    T = linear_trans.linear_transformation(list(X_array[indices, :]),
                                           list(Y_array[indices, :]))
    X, y = classifier.features_and_labels(friend_pairs, model_es, model_pt, T)
    # presumably the continuation (not visible here) prints `size`/`measures`
    # to fill the " N\t Acc" table — confirm against the full file.
    measures = classifier.classify_with_cross_validation(X, y, clf=clf)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Load the "jairo" es/pt vector models, report the vocabulary overlap,
sample shared words, load the translation matrix, and build SVM features."""
import logging
import random

from sklearn import svm

from falsefriends import classifier
from falsefriends import linear_trans
from falsefriends import util, word_vectors

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

training_friend_pairs = util.read_words('resources/sepulveda2011_training.txt')

model_es = word_vectors.load_model('resources/big/jairo/vectors_es.bin')
model_pt = word_vectors.load_model('resources/big/jairo/vectors_pt.bin')

logging.info("computing equal words...")
# Dict-view intersection: words spelled identically in both vocabularies.
equal_words = model_es.vocab.keys() & model_pt.vocab.keys()
print("Equal words number in the Wikipedia's:", len(equal_words))

SAMPLE_SIZE = 20
# Fix: random.sample() requires a sequence — sampling directly from a set was
# deprecated in Python 3.9 and raises TypeError from 3.11.  Sorting first also
# makes the draw reproducible under a fixed seed, and clamping the size avoids
# a ValueError when the overlap is smaller than SAMPLE_SIZE.
print("Sample", SAMPLE_SIZE, "equal words found:",
      random.sample(sorted(equal_words), min(SAMPLE_SIZE, len(equal_words))))

T = linear_trans.load_linear_transformation('resources/big/jairo/linear_trans.npz')

clf = svm.SVC()

X_train, y_train, scaler = classifier.features_labels_and_scaler(
    training_friend_pairs, model_es, model_pt, T)
def command_classify(args_):
    """Train and evaluate the false-friends classifier.

    Loads the training/testing friend pairs, the es/pt vector models and the
    translation matrix named in ``args_``, builds the classifier selected by
    ``args_.classifier``, then either cross-validates on the pooled pairs
    (``args_.cross_validation``) or trains on the training pairs and scores
    the testing pairs.  Metrics are printed; nothing is returned.
    """
    train_pairs = util.read_words(args_.training_friends_file_name)
    test_pairs = util.read_words(args_.testing_friends_file_name)
    model_es, model_pt = util.read_models(args_)
    T = linear_trans.load_linear_transformation(
        args_.translation_matrix_file_name)
    clf = classifier.build_classifier(CLF_OPTIONS[args_.classifier])

    def featurize(pairs):
        # Same feature-extraction options on every path.
        return classifier.features_and_labels(
            pairs, model_es, model_pt, T, backwards=args_.backwards,
            topx=args_.top, use_taxonomy=args_.use_taxonomy)

    if args_.cross_validation:
        X, y = featurize(train_pairs + test_pairs)
        measures = classifier.classify_with_cross_validation(X, y, clf=clf)
        print('')
        print("Cross-validation measures with 95% of confidence:")
        line = ("{measure_name}: {mean:0.4f} ± {delta:0.4f} "
                "--- [{inf:0.4f}, {sup:0.4f}]")
        for name, (mean, delta) in measures.items():
            print(line.format(measure_name=name, mean=mean, delta=delta,
                              inf=mean - delta, sup=mean + delta))
        print('')
        # The matrices below expect point estimates, so drop the deltas.
        reported = {name: mean for name, (mean, _delta) in measures.items()}
    else:
        X_train, y_train = featurize(train_pairs)
        X_test, y_test = featurize(test_pairs)
        reported = classifier.classify(X_train, X_test, y_train, y_test, clf)
        print('')
    _print_metrics_matrix(reported)
    _print_confusion_matrix(reported)