def command_classify(_args):
    """Train and evaluate the friend-pair classifier, printing the measures.

    Reads the training and testing friend pairs, the two word-vector models
    and the linear transformation from the locations given in ``_args``, and
    then runs one of two evaluation modes:

    * ``_args.cross_validation`` truthy: cross-validates over the
      concatenation of the training and testing pairs and prints every
      measure as ``mean ± delta`` together with the interval bounds.
    * otherwise: builds features for the training pairs, reuses the scaler
      returned for them on the testing pairs, and evaluates on the latter.

    In both modes the metrics matrix and the confusion matrix are printed
    last. Nothing is returned.

    Args:
        _args: Parsed command-line namespace. The attributes read here are
            ``training_friends_file_name``, ``testing_friends_file_name``,
            ``translation_matrix_file_name``, ``classifier``,
            ``cross_validation`` and ``backwards``; the whole namespace is
            also handed to ``util.read_models``.

    Raises:
        KeyError: If ``_args.classifier`` is not a key of ``CLF_OPTIONS``.
    """
    train_pairs = util.read_words(_args.training_friends_file_name)
    test_pairs = util.read_words(_args.testing_friends_file_name)
    model_es, model_pt = util.read_models(_args)
    T = linear_trans.load_linear_transformation(_args.translation_matrix_file_name)
    clf = CLF_OPTIONS[_args.classifier]

    if not _args.cross_validation:
        X_train, y_train, scaler = classifier.features_labels_and_scaler(
            train_pairs, model_es, model_pt, T, backwards=_args.backwards)
        # The scaler returned for the training pairs is reused on the test pairs.
        X_test, y_test, _ = classifier.features_labels_and_scaler(
            test_pairs, model_es, model_pt, T, scaler=scaler, backwards=_args.backwards)

        # NOTE(review): `clf` is selected above but not forwarded to
        # `classifier.classify`, so the --classifier choice appears to apply
        # only to cross-validation. Confirm whether `classify` accepts a
        # classifier argument before changing this call.
        holdout_measures = classifier.classify(X_train, X_test, y_train, y_test)

        print('')
        __print_metrics_matrix(holdout_measures)
        __print_confusion_matrix(holdout_measures)
        return

    all_pairs = train_pairs + test_pairs
    X, y, _ = classifier.features_labels_and_scaler(
        all_pairs, model_es, model_pt, T, backwards=_args.backwards)
    # FIXME: I think it should scale on each different training set.
    cv_measures = classifier.classify_with_cross_validation(X, y, clf=clf)

    print('')
    print("Cross-validation measures with 95% of confidence:")

    line_template = "{measure_name}: {mean:0.4f} ± {delta:0.4f} --- [{inf:0.4f}, {sup:0.4f}]"
    mean_measures = {}
    for name, (center, half_width) in cv_measures.items():
        print(line_template.format(
            measure_name=name,
            mean=center,
            delta=half_width,
            inf=center - half_width,
            sup=center + half_width))
        # Only the means are passed on to the matrix printers below.
        mean_measures[name] = center

    print('')
    __print_metrics_matrix(mean_measures)
    __print_confusion_matrix(mean_measures)
# Experiment: among the words present in both the Spanish and the Portuguese
# vocabularies, report the share that a classifier trained on the labelled
# friend pairs predicts as positive.
# NOTE(review): `training_friend_pairs` is not defined in this fragment; it is
# expected to come from the enclosing scope.
model_es = word_vectors.load_model('resources/big/jairo/vectors_es.bin')
model_pt = word_vectors.load_model('resources/big/jairo/vectors_pt.bin')

logging.info("computing equal words...")
# Set intersection of the two vocabularies' keys.
equal_words = model_es.vocab.keys() & model_pt.vocab.keys()

print("Equal words number in the Wikipedia's:", len(equal_words))

SAMPLE_SIZE = 20
# random.sample() requires a sequence (passing a set raises TypeError on
# Python 3.11+) and a sample size no larger than the population, so the set is
# turned into a sorted list and the size is clamped.
sample_size = min(SAMPLE_SIZE, len(equal_words))
print("Sample", sample_size, "equal words found:", random.sample(sorted(equal_words), sample_size))

T = linear_trans.load_linear_transformation('resources/big/jairo/linear_trans.npz')

clf = svm.SVC()

X_train, y_train, scaler = classifier.features_labels_and_scaler(training_friend_pairs, model_es, model_pt, T)

logging.info("training...")
clf.fit(X_train, y_train)

# Pair every shared word with itself; the third FriendPair field is left as
# None (presumably the unknown label -- confirm against classifier.FriendPair).
equal_friend_pairs = (classifier.FriendPair(word, word, None) for word in equal_words)

logging.info("computing features...")
# Pass the scaler by keyword, as the other call sites do, so it cannot be
# bound to the wrong positional parameter. Only the feature matrix (index 0 of
# the returned triple) is needed here.
X_equal = classifier.features_labels_and_scaler(equal_friend_pairs, model_es, model_pt, T, scaler=scaler)[0]

logging.info("predicting equal words...")
y_equal = clf.predict(X_equal)

# NOTE(review): the printed value is sum/len of the predictions, i.e. a
# fraction rather than a percentage, and it assumes the positive (cognate)
# label is encoded as 1 -- confirm against the label encoding.
print("Cognates percentage in equal words in Wikipedia's:", sum(y_equal) / len(y_equal))

# TODO: same for similar words