Beispiel #1
0
 def k_nearst_neighbour_classify(self):
     """Run the k-NN classifier on the test split and show its accuracy.

     Predictions for ``self.X_test`` are obtained from a fresh
     ``KNNClassifier`` given ``self.X_train`` / ``self.y_train``, scored with
     ``self.main.accuracy`` against ``self.y_test``, and the textual score is
     written to ``self.knn_acc_label``.
     """
     classifier = KNNClassifier()
     predicted = classifier.predict_classification(
         self.X_train, self.y_train, self.X_test)
     score_text = str(self.main.accuracy(self.y_test, predicted))
     # Only the first four characters of the stringified score are displayed.
     self.knn_acc_label.setText(score_text[:4])
Beispiel #2
0
    def test_knn(self):
        """Classify every test item with KNNClassifier and write out the labels.

        Prints a short banner (train/test sizes and the k setting), builds a
        ``KNNClassifier`` from ``self.train_data`` / ``self.train_labels`` with
        ``k=self.k`` and ``best_words=self.best_words``, classifies each entry
        of ``self.test_data`` and hands the resulting labels to ``self.write``
        together with a timestamped path under ``f_runout/``.

        ``self.k`` may be a single integer or an iterable of values; an
        iterable is rendered as its items joined by ``-`` in the banner and in
        the output file name.
        """
        from classifiers import KNNClassifier

        # isinstance (rather than an exact type comparison) also accepts int
        # subclasses; anything else is treated as an iterable of k values.
        if isinstance(self.k, int):
            k = "%s" % self.k
        else:
            k = "-".join(str(i) for i in self.k)

        print("KNNClassifier")
        print("---" * 45)
        print("Train num = %s" % self.train_num)
        print("Test num = %s" % self.test_num)
        print("K = %s" % k)

        knn = KNNClassifier(self.train_data, self.train_labels, k=self.k, best_words=self.best_words)

        print("KNNClassifiers is testing ...")
        classify_labels = [knn.classify(data) for data in self.test_data]
        print("KNNClassifiers tests over.")

        # The timestamp keeps successive runs from overwriting each other.
        filepath = "f_runout/KNN-%s-train-%d-test-%d-f-%d-k-%s-%s.xls" % \
                   (self.type,
                    self.train_num, self.test_num,
                    self.feature_num, k,
                    datetime.datetime.now().strftime(
                        "%Y-%m-%d-%H-%M-%S"))

        self.write(filepath, classify_labels)
Beispiel #3
0
def match_codes():
    """Matching full dataset in parallel with customizable embedding method"""

    # 0.398657046863 hard, 0.275397642306 soft

    working_directory = './data/'

    data_codes, data_descriptions = get_data_to_match('slim')

    official_codes, official_descriptions = get_official_data()

    level = 1
    model = HashingEmbedder(
        level=level, analyzer='char_wb', ngram_range=(4, 5), norm='l2'
    )  # word2vecEmbedder() # word2vecEmbedder() # HashingEmbedder() #  [HashingEmbedder(level=level, analyzer='char', ngram_range=(3,5), norm='l2')] #[HashingEmbedder(level=level, analyzer='char', ngram_range=(2,3))]
    model.embed_data(data_descriptions)

    print 'loaded and embedded data'

    # test_nNN(model, data_descriptions, data_codes)

    official_code_labels = None
    true_data_codes = None
    use_section = False
    if use_section:
        official_code_labels = get_section_codes(model.official_codes)
        true_data_codes = get_section_codes(data_codes)
    else:
        official_code_labels = coarsen_codes(model.official_codes)
        true_data_codes = coarsen_codes(data_codes)

    nNN = 4
    classifier = KNNClassifier(n_neighbors=nNN)
    classifier.fit(model.official_embeddings, official_code_labels)
    pred_codes = classifier.predict(
        model.data_embeddings, pbar=True
    )  # classifier.predict_with_edit_dist(model.data_embeddings, data_descriptions, model.official_descriptions)

    errors = pred_codes - true_data_codes
    print 'Correctly predicted', 1.0 * np.sum(
        errors == 0) / errors.shape[0], 'percent of top level codes'
    # plot_confusion_matrix(true_data_codes, pred_codes)

    model = word2vecEmbedder()
    model.embed_data(data_descriptions)
    classifier = KNNClassifier(n_neighbors=nNN)
    classifier.fit(model.official_embeddings, official_code_labels)
    pred_codes = classifier.predict(
        model.data_embeddings
    )  # classifier.predict_with_edit_dist(model.data_embeddings, data_descriptions, model.official_descriptions)
def classify(data, classifier, num_classes, train_labels, train_features,
             test_labels, test_features):
    """
    Dispatch one classification run to the backend selected by ``classifier``.

    Used by FoldRunner to execute classification with the current classifier.
    :param data: the configuration dictionary
    :param classifier: current classifier name (from the classifiers list in the configuration file)
    :param num_classes: the number of distinct labels (binary or multiclass classification)
    :param train_labels: the labels of all train instances
    :param train_features: the features of all train instances
    :param test_labels: the labels of all test instances
    :param test_features: the features of all test instances
    :return: the value returned by the selected backend's ``classify``
             (the confusion matrix of the classification), or ``None`` when
             ``classifier`` matches none of the known names
    """
    fold = (train_labels, train_features, test_labels, test_features)

    # Handlers are wrapped in lambdas so that only the selected backend module
    # is touched; the backends take slightly different leading arguments.
    backends = (
        ("NN_keras", lambda: nnk.classify(data, num_classes, *fold)),
        ("NN_scikit-learn", lambda: nns.classify(*fold)),
        ("KNN", lambda: knn.classify(data, *fold)),
        ("NaiveBayes", lambda: nb.classify(*fold)),
        ("RandomForest", lambda: rf.classify(*fold)),
        ("LogisticRegression", lambda: lr.classify(*fold)),
    )

    for name, run in backends:
        if classifier == name:
            return run()
    return None
Beispiel #5
0
    def classify_custom_input(self, custom_input_vector):
        """Classify one custom input with every model and display each result.

        The vector is wrapped in a one-element list, predicted with naive
        Bayes, k-NN, random forest and decision tree (each given
        ``self.X_train`` / ``self.y_train``), and the first prediction of each
        model is written, as text, to that model's label widget.

        :param custom_input_vector: feature vector of the single sample to
            classify
        """
        nb = NaiveBayesClassifier()
        nb.train(self.X_train, self.y_train)
        prediction = nb.predict([custom_input_vector])
        self.custom_text_nb_label.setText(str(prediction[0]))

        knn = KNNClassifier()
        prediction = knn.predict_classification(self.X_train, self.y_train,
                                                [custom_input_vector])
        self.custom_text_knn_label.setText(str(prediction[0]))

        rf = SklearnRandomForest()
        prediction = rf.random_forest(self.X_train, self.y_train,
                                      [custom_input_vector])
        # Fix: the random-forest result belongs on the rf label; it was
        # previously written to the decision-tree label.
        self.custom_text_rf_label.setText(str(prediction[0]))

        dt = SklearnDecisionTree()
        prediction = dt.decision_tree(self.X_train, self.y_train,
                                      [custom_input_vector])
        # Fix: the decision-tree result belongs on the dt label; it was
        # previously written to the random-forest label.
        self.custom_text_dt_label.setText(str(prediction[0]))
Beispiel #6
0
def test_nNN(model, data_descriptions, data_codes, nNNmin=2, nNNmax=10):
    for nNN in xrange(nNNmin, nNNmax + 1):
        classifier = KNNClassifier(n_neighbors=nNN)

        t1 = time()
        classifier.fit(model.official_embeddings,
                       get_section_codes(model.official_codes))
        # classifier.fit(model.official_embeddings, coarsen_codes(model.official_codes))
        pred_section_codes = classifier.predict_with_edit_dist(
            model.data_embeddings, data_descriptions,
            model.official_descriptions)
        # true_coarse_codes = coarsen_codes(data_codes) # .reshape((-1,1))
        # errors = pred_codes - true_coarse_codes
        true_section_codes = get_section_codes(data_codes)  # .reshape((-1,1))
        errors = pred_section_codes - true_section_codes

        print '------------------------------'
        print 'nNN:', nNN
        print 'Correctly predicted', 1.0 * np.sum(
            errors ==
            0) / errors.shape[0], 'percent of top level codes w/ edit dist kNN'
        print 'Took', time() - t1, 'seconds'

        t1 = time()
        # pred_codes = classifier.predict(model.data_embeddings)
        # errors = pred_codes - true_coarse_codes
        pred_section_codes = classifier.predict(model.data_embeddings)
        errors = pred_section_codes - true_section_codes
        print 'Correctly predicted', 1.0 * np.sum(
            errors ==
            0) / errors.shape[0], 'percent of top level code w/ euclidean kNN'
        print 'Took', time() - t1, 'seconds'
        print '------------------------------'
Beispiel #7
0
    def test_knn(self):
        """Classify every test item with KNNClassifier and write out the labels.

        Prints a short banner (train/test sizes and the k setting) followed by
        some diagnostic output about the training data, builds a
        ``KNNClassifier`` from ``self.train_data`` / ``self.train_labels`` with
        ``k=self.k`` and ``best_words=self.best_words``, classifies each entry
        of ``self.test_data`` and hands the resulting labels to ``self.write``
        together with a timestamped path under ``f_runout/``.

        ``self.k`` may be a single integer or an iterable of values; an
        iterable is rendered as its items joined by ``-`` in the banner and in
        the output file name.
        """
        from classifiers import KNNClassifier

        # isinstance (rather than an exact type comparison) also accepts int
        # subclasses; anything else is treated as an iterable of k values.
        if isinstance(self.k, int):
            k = "%s" % self.k
        else:
            k = "-".join(str(i) for i in self.k)

        print("KNNClassifier")
        print("---" * 45)
        print("Train num = %s" % self.train_num)
        print("Test num = %s" % self.test_num)
        print("K = %s" % k)

        # Diagnostic dump of the training set: labels, size and first sample.
        print(self.train_labels)
        print(len(self.train_data))
        print(self.train_data[0])

        knn = KNNClassifier(self.train_data, self.train_labels, k=self.k, best_words=self.best_words)

        print("KNNClassifiers is testing ...")
        classify_labels = [knn.classify(data) for data in self.test_data]
        print("KNNClassifiers tests over.")

        # The timestamp keeps successive runs from overwriting each other.
        filepath = "f_runout/KNN-%s-train-%d-test-%d-f-%d-k-%s-%s.xls" % \
                   (self.type,
                    self.train_num, self.test_num,
                    self.feature_num, k,
                    datetime.datetime.now().strftime(
                        "%Y-%m-%d-%H-%M-%S"))

        self.write(filepath, classify_labels)
Beispiel #8
0
 def KNNClassifier(self,
                   dataset=None,
                   class_column=None,
                   name=None,
                   pipeline=None,
                   K=5,
                   kernel="euclidean",
                   algo="auto",
                   weights="uniform",
                   kernel_params=None):
     """Create a ``KNNClassifier`` bound to this object's client.

     Every argument is forwarded unchanged to the ``KNNClassifier``
     constructor, with ``client=self.client`` added.

     :param kernel_params: extra kernel parameters; ``None`` (the default)
         is replaced by a fresh empty dict on every call, so no dict object
         is shared between calls.
     :return: the newly constructed ``KNNClassifier``
     """
     # A ``{}`` default would be a single dict shared by every call; use a
     # None sentinel and allocate a new dict per call instead.
     if kernel_params is None:
         kernel_params = {}
     return KNNClassifier(dataset=dataset,
                          class_column=class_column,
                          name=name,
                          pipeline=pipeline,
                          K=K,
                          kernel=kernel,
                          algo=algo,
                          weights=weights,
                          kernel_params=kernel_params,
                          client=self.client)
Beispiel #9
0
        localAreaDensity=-1,  # Using numActiveColumnsPerInhArea
        numActiveColumnsPerInhArea=64,
        # All input activity can contribute to feature output
        stimulusThreshold=0,
        synPermInactiveDec=synPermDec,
        synPermActiveInc=synPermInc,
        synPermConnected=synPermConn,
        maxBoost=1.0,
        seed=1956,  # The seed that Grok uses
        spVerbosity=1)

    # Instantiate the spatial pooler test bench.
    tb = VisionTestBench(sp)

    # Instantiate the classifier
    clf = KNNClassifier()

    # Train the spatial pooler on trainingVectors.
    numCycles = tb.train(trainingVectors, trainingTags, clf, maxTrainingCycles,
                         minAccuracy)

    # Save the permanences and connections after training.
    #tb.savePermanences('perms.jpg')
    #tb.showPermanences()
    #tb.showConnections()

    # Get testing images and convert them to vectors.
    testingImages, testingTags = data.getImagesAndTags(testingDataset)
    testingVectors = encoder.imagesToVectors(testingImages)

    # Reverse the order of the vectors and tags for testing
Beispiel #10
0

# Read the run mode from the first command-line argument; terminate() is
# invoked when the argument is missing or is not one of func_mode_list.
if len(sys.argv) >= 2:
    mode = sys.argv[1]
    if mode not in func_mode_list:
        terminate()
else:
    terminate()


def show_plot_sample():
    """Show 25 randomly chosen test samples with their true and predicted labels.

    Each sample is drawn (with replacement) from the module-level ``testX``,
    reshaped to 28x28, and rendered in one cell of a 5x5 grid annotated with
    its entry in ``testY`` and the output of the module-level ``knn.predict``.
    """
    figure = plt.figure(figsize=(8, 8))
    figure.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
    last_index = len(testX) - 1
    for cell in tqdm(range(25)):
        sample_idx = random.randint(0, last_index)
        pixels = np.reshape(testX[sample_idx], [28, 28])
        # Subplot positions are 1-based; ticks are hidden to keep the grid clean.
        axes = figure.add_subplot(5, 5, cell + 1, xticks=[], yticks=[])
        axes.imshow(pixels, cmap=plt.cm.binary, interpolation='nearest')
        axes.text(0, 2, "label:" + str(testY[sample_idx]))
        axes.text(0, 4, "predict:" + str(knn.predict(testX[sample_idx])))
    plt.show()


if __name__ == '__main__':
    # These names are module globals on purpose: show_plot_sample() reads
    # testX, testY and knn directly.
    trainX, trainY, testX, testY = load_mnist()
    knn = KNNClassifier(train_data=trainX, train_labels=trainY, ord=2)
    # Any mode other than 'run_sample' runs the accuracy test with K=1.
    if mode != 'run_sample':
        knn.test_acc(test_data=testX, test_label=testY, K=1)
    else:
        show_plot_sample()