def _embed_dataset(model, dataset, n_items, embedding_size):
    """Run ``model`` over every batch of ``dataset`` and stack the outputs.

    Args:
        model: Callable applied to each batch; its result must expose
            ``.numpy()``.
        dataset: Iterable of batches; each batch must support ``len()``.
        n_items: Total number of rows to allocate in the output.
        embedding_size: Number of columns to allocate in the output.

    Returns:
        A ``(n_items, embedding_size)`` NumPy array filled batch by batch,
        in iteration order.
    """
    embeddings = np.zeros([n_items, embedding_size])
    offset = 0
    for batch in dataset:
        batch_size = len(batch)
        embeddings[offset:offset + batch_size] = model(batch).numpy()
        offset += batch_size
    return embeddings


def evaluate_LFW(model,
                 embedding_size,
                 use_flipped_images=False,
                 N_folds=5,
                 distance_metric=1,
                 verbose=1):
    """Embed the image pairs listed in ``LFW_PAIRS_PATH`` and score them.

    Args:
        model: Callable mapping an image batch to an object with ``.numpy()``.
        embedding_size: Width of the embedding produced by ``model``.
        use_flipped_images: When true, a second pass is run on the dataset
            built with ``flip=True`` and the two embeddings are averaged
            element-wise before evaluation.
        N_folds: Forwarded to ``evaluate`` as ``nrof_folds``.
        distance_metric: Forwarded unchanged to ``evaluate``.
        verbose: ``0`` is silent, any truthy value prints the summary
            metrics, ``>= 2`` additionally prints progress messages.

    Returns:
        The ``accuracy`` value returned by ``evaluate`` (presumably one
        entry per fold, since its mean and std are reported -- confirm
        against ``evaluate``).
    """
    pairs = read_pairs(os.path.expanduser(LFW_PAIRS_PATH))
    paths, actual_issame = get_paths(os.path.expanduser(LFW_DIR), pairs)
    n_paths = len(paths)

    ds = tf_dataset_from_paths(paths, flip=False)
    if verbose >= 2:
        print("Feed forward all pairs")
    embeddings = _embed_dataset(model, ds, n_paths, embedding_size)

    if use_flipped_images:
        if verbose >= 2:
            print("Feed forward all pairs - flipped")
        flip_ds = tf_dataset_from_paths(paths, flip=True)
        flip_embeddings = _embed_dataset(model, flip_ds, n_paths,
                                         embedding_size)
        # Only the averaged embedding is evaluated; the concatenated
        # (2 * embedding_size) buffer the code used to build was never read.
        embeddings = (embeddings + flip_embeddings) / 2

    if verbose >= 2:
        print("Calculating metrics")

    tpr, fpr, accuracy, val, val_std, far, best_thresholds = evaluate(
        embeddings,
        actual_issame,
        nrof_folds=N_folds,
        distance_metric=distance_metric)

    if verbose:
        print('Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy)))
        print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' %
              (val, val_std, far))
        print('threshold : %2.5f+-%2.5f' %
              (np.mean(best_thresholds), np.std(best_thresholds)))
        auc = metrics.auc(fpr, tpr)
        print('Area Under Curve (AUC): %1.3f' % auc)
        # EER is the x in [0, 1] where 1 - x equals the interpolated TPR(x).
        eer = brentq(lambda x: 1. - x - interpolate.interp1d(fpr, tpr)(x), 0.,
                     1.)
        print('Equal Error Rate (EER): %1.3f' % eer)
    return accuracy
def gensim_classifier():
    """Train a Word2Vec + one-vs-rest LinearSVC tweet classifier and score it.

    Steps: tokenise the labelled tweets on whitespace, train a Word2Vec
    model on them, average the word vectors per tweet, impute missing
    feature values, fit the classifier on the training part, then write the
    predictions to ``data/w2v_linsvc.csv``, the fitted classifier to
    ``model/w2v_linsvc.pkl`` and pass the results to ``evaluate``.

    The train/test boundary comes from the project-local
    ``train_test_split(0.80, sentences)``; labels are sliced with the
    ``index_value`` it returns.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # split all sentences to list of words
    sentences = [tweet.split() for tweet in tweet_list]

    # parameters for model
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # Initialize and train the model
    w2v_model = Word2Vec(sentences,
                         workers=num_workers,
                         size=num_features,
                         min_count=min_word_count,
                         window=context,
                         sample=downsampling,
                         seed=1)

    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
    # Fit the imputer on the training data only and reuse its statistics for
    # the test data, so both sets are filled (and shaped) consistently.
    imputer = Imputer().fit(train_vector)
    train_vector = imputer.transform(train_vector)
    test_vector = imputer.transform(test_vector)

    # train model and predict
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(
        train_vector, label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # store the model to mmap-able files
    create_directory('model')
    # Persist the fitted wrapper: OneVsRestClassifier fits clones of its
    # estimator, so ``model`` itself is still unfitted at this point.
    joblib.dump(classifier_fitted, 'model/%s.pkl' % 'w2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)

    evaluate(binarise_result, binarise_labels[index_value:], label_score,
             'w2v_linsvc')
def gensim_classifier():
  """Train a Word2Vec + one-vs-rest LinearSVC tweet classifier and score it.

  Tweets are split on whitespace, embedded by averaging Word2Vec vectors,
  imputed, and classified. Predictions go to ``data/w2v_linsvc.csv``, the
  fitted classifier to ``model/w2v_linsvc.pkl``, and the binarised results
  are handed to ``evaluate``.

  The train/test boundary is the ``index_value`` returned by the
  project-local ``train_test_split(0.80, sentences)``.
  """
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  # split all sentences to list of words
  sentences = [tweet.split() for tweet in tweet_list]

  # parameters for model
  num_features = 100
  min_word_count = 1
  num_workers = 4
  context = 2
  downsampling = 1e-3

  # Initialize and train the model
  w2v_model = Word2Vec(sentences, workers=num_workers,
                       size=num_features, min_count=min_word_count,
                       window=context, sample=downsampling, seed=1)

  index_value, train_set, test_set = train_test_split(0.80, sentences)
  train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
  test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
  # Learn the fill values from the training data only, then apply the same
  # fitted imputer to the test data instead of re-fitting on it.
  imputer = Imputer().fit(train_vector)
  train_vector = imputer.transform(train_vector)
  test_vector = imputer.transform(test_vector)

  # train model and predict
  model = LinearSVC()
  classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
  result = classifier_fitted.predict(test_vector)

  # output result to csv
  create_directory('data')
  result.tofile("data/w2v_linsvc.csv", sep=',')

  # store the model to mmap-able files
  create_directory('model')
  # ``model`` is only the template estimator (OneVsRestClassifier fits
  # clones of it), so the fitted wrapper is what has to be saved.
  joblib.dump(classifier_fitted, 'model/%s.pkl' % 'w2v_linsvc')

  # evaluation
  label_score = classifier_fitted.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(label_list, classes=class_list)

  evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')
def find_alphas(preprocess=None):
    """Evaluate naive Bayes for alpha = 0.1 .. 5.0 and write a report.

    The training data under ``./data/`` is analysed once, then
    ``naive_bayes_on_validation_set`` is run for each alpha in steps of 0.1
    and its output is scored with ``evaluate``.

    Args:
        preprocess: Forwarded to ``naive_bayes_on_validation_set``. When it
            is not ``None`` its ``str()`` is also embedded in the report
            file name.

    Side effects:
        Overwrites ``naive_bayes_with_different_alphas[_<preprocess>_].txt``
        in the current working directory.
    """
    index, categories, categories_words, N_words = analyze_training_data(
        './data/')
    results = {}
    for alpha in [i / 10 for i in range(1, 51)]:
        predictions, labels = naive_bayes_on_validation_set(
            './data/',
            index,
            categories,
            categories_words,
            N_words,
            alpha=alpha,
            preprocess=preprocess)
        results[alpha] = evaluate(predictions, labels)

    if preprocess is not None:
        report_path = ('naive_bayes_with_different_alphas_' +
                       str(preprocess) + '_.txt')
    else:
        report_path = 'naive_bayes_with_different_alphas.txt'

    # The context manager closes the file even if formatting a result raises.
    with open(report_path, 'w+') as f:
        for alpha, result in results.items():
            print("##############################\nRESULTS FOR ALPHA={}\n".
                  format(alpha),
                  result,
                  file=f)
 def perform_on_specific_setting(self, method, k):
     """Run kNN on the validation set for one (method, k) and save the result.

     Args:
         method: Forwarded to ``self.knn_on_validation_set`` as ``method``.
         k: Forwarded to ``self.knn_on_validation_set`` as ``k``.

     Side effects:
         Overwrites ``knn_with_<self.preprocess>.txt`` in the current
         working directory with the output of ``evaluate``.
     """
     predictions, labels = self.knn_on_validation_set(k=k, method=method)
     result = evaluate(predictions, labels)
     # ``with`` guarantees the handle is closed even if writing fails.
     with open('knn_with_' + str(self.preprocess) + '.txt', 'w+') as f:
         print("##############################\nRESULTS FOR M={} K={}\n".format(
             method, k),
               result,
               file=f)
def search_best_n_est_and_max_depth(data_path):
    """Grid-search (n_estimators, max_depth) and write every result to disk.

    ``validation`` is called for each combination of ``num_estimators`` in
    ``range(20, 200, 20)`` and ``max_depth`` in ``range(100, 500, 50)``; its
    output is scored with ``evaluate``.

    Args:
        data_path: Forwarded unchanged to ``validation``.

    Side effects:
        Overwrites ``random_forest_with_different_parameters.txt`` in the
        current working directory.
    """
    results = {}
    for num_estimators in range(20, 200, 20):
        for max_depth in range(100, 500, 50):
            predictions, labels = validation(data_path, num_estimators, max_depth)
            results[(num_estimators, max_depth)] = evaluate(predictions, labels)

    # The context manager closes the file even if a write raises.
    with open('random_forest_with_different_parameters.txt', 'w+') as f:
        for params, result in results.items():
            print("##############################\nRESULTS FOR (N_EST, MAX_DEPTH)={}\n".format(params),
                  result, file=f)
def search_best_c(data_path):
    """Evaluate the model for C = 0.1 .. 2.0 and write every result to disk.

    ``validation(data_path, C)`` is called for each C in steps of 0.1 and
    its output is scored with ``evaluate``.

    Args:
        data_path: Forwarded unchanged to ``validation``.

    Side effects:
        Overwrites ``SVM_with_different_Cs.txt`` in the current working
        directory.
    """
    results = {}
    for C in [i / 10 for i in range(1, 21)]:
        predictions, labels = validation(data_path, C)
        results[C] = evaluate(predictions, labels)

    # The context manager closes the file even if a write raises.
    with open('SVM_with_different_Cs.txt', 'w+') as f:
        for C, result in results.items():
            print("##############################\nRESULTS FOR C={}\n".format(C),
                  result,
                  file=f)
def lin_svc():
    """Train a TF-IDF + one-vs-rest LinearSVC tweet classifier and score it.

    The labelled tweets are vectorised with word uni- and bi-gram TF-IDF,
    split into train and test parts, and classified. The test labels and
    predictions are written under ``data/``, the fitted classifier and
    vectoriser are saved via ``save_model`` / ``save_vectoriser``, and the
    binarised results are handed to ``evaluate``.
    """
    label_list = get_labels()
    tweet_list = get_labelled_tweets()
    # vectorise using tf-idf
    # The boolean options are passed as real bools: recent scikit-learn
    # releases validate parameter types and reject the integer ``1``.
    vectoriser = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )

    ## do transformation into vector
    # fit_transform learns the vocabulary and vectorises in a single pass;
    # ``vectoriser`` is the fitted object afterwards.
    vectorised_tweet_list = vectoriser.fit_transform(tweet_list)
    fitted_vectoriser = vectoriser
    # NOTE(review): test_size=0.8 trains on only 20% of the data -- confirm
    # this is intended and not a swapped train/test fraction.
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list, label_list, test_size=0.8, random_state=42)

    # train model and predict
    model = LinearSVC()
    ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')

    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # evaluation
    label_score = ovr_classifier.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)

    evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
def lin_svc():
  """Train a TF-IDF + one-vs-rest LinearSVC tweet classifier and score it.

  Tweets are vectorised with word uni- and bi-gram TF-IDF, split into train
  and test parts, and classified. Test labels and predictions are written
  under ``data/``, the fitted classifier and vectoriser are saved via
  ``save_model`` / ``save_vectoriser``, and the binarised results are
  passed to ``evaluate``.
  """
  label_list = get_labels()
  tweet_list = get_labelled_tweets()
  # vectorise using tf-idf
  # Real bools are used for the flag options: recent scikit-learn releases
  # validate parameter types and reject the integer ``1``.
  vectoriser = TfidfVectorizer(min_df=3,
                               max_features=None,
                               strip_accents='unicode',
                               analyzer='word',
                               token_pattern=r'\w{1,}',
                               ngram_range=(1, 2),
                               use_idf=True,
                               smooth_idf=True,
                               sublinear_tf=True,)

  ## do transformation into vector
  # fit_transform learns the vocabulary and vectorises in a single pass;
  # ``vectoriser`` is the fitted object afterwards.
  vectorised_tweet_list = vectoriser.fit_transform(tweet_list)
  fitted_vectoriser = vectoriser
  # NOTE(review): test_size=0.8 trains on only 20% of the data -- confirm
  # this is intended and not a swapped train/test fraction.
  train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                          label_list,
                                                                          test_size=0.8,
                                                                          random_state=42)

  # train model and predict
  model = LinearSVC()
  ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
  result = ovr_classifier.predict(test_vector)

  # output result to csv
  create_directory('data')
  save_to_csv("data/testset_labels.csv", test_labels)
  result.tofile("data/tfidf_linsvc.csv", sep=',')

  save_model(ovr_classifier, 'tfidf_linsvc')
  save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

  # evaluation
  label_score = ovr_classifier.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(test_labels, classes=class_list)

  evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
 def find_results_of_ks(self):
     """Evaluate kNN for every method in ``self.methods`` with k in 1, 3, 5.

     Each run's wall-clock duration is printed to stdout. Once all runs
     have finished, the ``evaluate`` output of every (method, k) pair is
     written to ``knn_with_different_ks.txt`` in the current working
     directory, overwriting any existing file.
     """
     results = {m: {} for m in self.methods}
     for m in self.methods:
         for k in [1, 3, 5]:
             t1 = time.time()
             predictions, labels = self.knn_on_validation_set(k=k, method=m)
             results[m][k] = evaluate(predictions, labels)
             t2 = time.time()
             print(t2 - t1, 'secs')

     # ``with`` guarantees the handle is closed even if a write raises.
     with open('knn_with_different_ks.txt', 'w+') as f:
         for m in self.methods:
             for k, result in results[m].items():
                 print(
                     "##############################\nRESULTS FOR M={} K={}\n".
                     format(m, k),
                     result,
                     file=f)