Example #1
from numpy import unique
from sklearn.neighbors import NearestCentroid

class NearestMeanClassifier(BaseClassifier):
    def __init__(self, feature_length, num_classes):
        super().__init__(feature_length, num_classes)
        self.num_classes = num_classes

        # model build
        # pass a float shrink_threshold to get a Nearest Shrunken Centroid
        # classifier (shrinkage requires the default Euclidean metric)
        self.model = NearestCentroid(metric='manhattan')

    def train(self, features, labels):
        """
        Using a set of features and labels, trains the classifier and returns the training accuracy.
        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to train to predict
        :return: Prediction accuracy, as a float between 0 and 1
        """
        labels = self.labels_to_categorical(labels)
        self.model.fit(features, labels)
        accuracy = self.model.score(features, labels)
        return accuracy


    def get_prediction(self, features):
        '''
        Return the model's predictions for the given samples.
        :param features: samples to predict
        :return: predictions from the model
        '''
        return self.model.predict(features)

    def predict(self, features, labels):
        """
        Using a set of features and labels, predicts the labels from the features,
        and returns the accuracy of predicted vs actual labels.
        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to test prediction accuracy on
        :return: Prediction accuracy, as a float between 0 and 1
        """
        labels = self.labels_to_categorical(labels)
        accuracy = self.model.score(features, labels)
        return accuracy

    def labels_to_categorical(self, labels):
        '''
        Convert labels from strings to numeric IDs.
        :param labels: list of string labels
        :return: labels converted to numbers
        '''
        _, IDs = unique(labels, return_inverse=True)
        return IDs
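The comment in __init__ above alludes to the shrunken variant. Below is a minimal standalone sketch (with synthetic data, purely for illustration) of how a float shrink_threshold turns NearestCentroid into a Nearest Shrunken Centroid classifier:

import numpy as np
from sklearn.neighbors import NearestCentroid

# hypothetical toy data, just for illustration
rng = np.random.RandomState(0)
X = rng.randn(60, 8)
y = rng.randint(0, 3, 60)

plain = NearestCentroid().fit(X, y)                          # plain centroids
shrunken = NearestCentroid(shrink_threshold=0.2).fit(X, y)   # shrunken centroids
print(plain.score(X, y), shrunken.score(X, y))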
Example #2
def test_kernel_sef():
    """
    Performs some basic testing using the KernelSEF
    :return:
    """
    np.random.seed(1)
    train_data = np.random.randn(100, 50)
    train_labels = np.random.randint(0, 2, 100)

    proj = KernelSEF(train_data,
                     50,
                     output_dimensionality=12,
                     kernel_type='rbf')
    proj._initialize(train_data)
    proj_data = proj.transform(train_data, batch_size=8)
    assert proj_data.shape[0] == 100
    assert proj_data.shape[1] == 12

    ncc = NearestCentroid()
    ncc.fit(proj_data, train_labels)
    acc_before = ncc.score(proj_data, train_labels)

    loss = proj.fit(data=train_data,
                    target_labels=train_labels,
                    epochs=200,
                    target='supervised',
                    batch_size=8,
                    regularizer_weight=0,
                    learning_rate=0.0001,
                    verbose=False)

    # Ensure that loss is reducing
    assert loss[0] > loss[-1]

    proj_data = proj.transform(train_data, batch_size=8)
    assert proj_data.shape[0] == 100
    assert proj_data.shape[1] == 12

    ncc = NearestCentroid()
    ncc.fit(proj_data, train_labels)
    acc_after = ncc.score(proj_data, train_labels)

    assert acc_after > acc_before
def test_pickle():
    import pickle

    # classification
    obj = NearestCentroid()
    obj.fit(iris.data, iris.target)
    score = obj.score(iris.data, iris.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert_equal(type(obj2), obj.__class__)
    score2 = obj2.score(iris.data, iris.target)
    assert_array_equal(score, score2,
                       "Failed to generate same score"
                       " after pickling (classification).")
def nc_fit(Xtrain, Xtest, Xtrain_lbls, Xtest_lbls, name, data, t0=None):
    # Resolve the timer start here: a default of t0=time() would be evaluated
    # once at definition time, not at each call
    if t0 is None:
        t0 = time()
    # Create a nearest centroid classifier
    clf = NearestCentroid()
    # Train with the data
    clf.fit(Xtrain, Xtrain_lbls)

    # Create prediction for test data
    y_pred_test = clf.predict(Xtest)

    # How well does it fit
    score = clf.score(Xtest, Xtest_lbls)

    print('%-9s\t%.2fs\t%-9s\t%-9s'
          % (name, (time() - t0), score, data))

    return y_pred_test
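A hedged usage sketch for nc_fit, assuming the Iris data as a stand-in train/test split (the name and data arguments only label the printed row):

from time import time
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
Xtr, Xte, ytr, yte = train_test_split(iris.data, iris.target, random_state=0)
# prints: name, elapsed seconds, test score, data tag; returns test predictions
preds = nc_fit(Xtr, Xte, ytr, yte, name='NC', data='iris', t0=time())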
Example #6
class NearestCentroidClassfier(Classifier):
    def __init__(self,
                 train_set=None,
                 val_set=None,
                 data_file=None,
                 header=0,
                 test_size=0.2,
                 feature_col_range=[2, 9],
                 label_col=-1,
                 features_degree=3):

        Classifier.__init__(self, train_set, val_set, data_file, header,
                            test_size, feature_col_range, label_col,
                            features_degree, True)

    def fit(self, metric='manhattan'):

        print('Using Nearest Centroid Classifier...')

        # shrink_threshold must be a non-negative float and is only supported
        # with the Euclidean metric, so shrinkage is disabled here
        self.model = Model(metric=metric, shrink_threshold=None)

        self.model.fit(self.X_train, self.y_train)

        print('\nTrain Set Accuracy: ',
              self.model.score(self.X_train, self.y_train) * 100)

        # Predicting the Test set results
        if len(self.X_test) > 0:
            print('\nEvaluating on test set...')
            y_pred = self.predict(self.X_test)
            self.score = self.evaluate(X=self.X_test, y=self.y_test)

            # Making the Confusion Matrix
            self.cm = confusion_matrix(self.y_test,
                                       y_pred,
                                       labels=list(range(num_labels)))

    def probality(self,
                  X=None,
                  data_file=None,
                  header=0,
                  feature_col_range=[1, 9]):
        print('probality() is not supported')
        return None
Example #7
    def test_disambiguator_store(self):
        # Create a silly classifier that disambiguates between "stam" (tree
        # trunk) or "romp" (body trunk) as the Dutch translation of the
        # English noun "trunk"
        lempos = u"trunk/n"
        # FIXME: store_fit() should only accept unicode strings
        target_names = u"stam romp".encode("utf-8").split()
        vocab = u"boom hoofd".split()

        X = np.array([[0, 1],
                      [1, 0],
                      [0, 1],
                      [1, 0]])
        y = np.array([1, 0, 1, 0])

        estimator = NearestCentroid()
        estimator.fit(X, y)

        centroids = estimator.centroids_
        score = estimator.score(X, y)

        # Store estimator
        fname = tempfile.NamedTemporaryFile().name
        f = DisambiguatorStore(fname, "w")
        f.save_estimator(NearestCentroid())
        f.save_vocab(vocab)
        f.store_fit(lempos, estimator)
        f.save_target_names(lempos, target_names)
        f.close()

        # Restore estimator
        f2 = DisambiguatorStore(fname)
        estimator2 = f2.load_estimator()
        vocab2 = f2.load_vocab()
        f2.restore_fit(lempos, estimator2)
        target_names2 = f2.load_target_names(lempos)
        centroids2 = estimator2.centroids_
        score2 = estimator2.score(X, y)

        assert_array_equal(centroids, centroids2)
        assert target_names == target_names2
        assert vocab == vocab2
        assert score == score2
class scikit_NearestCentroid(MLAlgo):
    def __init__(self):
        self.clf = NearestCentroid()
        self.className = self.__class__.__name__

    def train(self, train_data):
        train_X = train_data[:, :-1]
        train_Y = train_data[:, -1]
        self.clf.fit(train_X, train_Y)
        print("NearestCentroid model built.")
        return self.className + " Training finished...\n"

    def test(self, test_data):
        test_X = test_data[:, :-1]
        test_Y = test_data[:, -1]
        print("Accuracy: ", self.clf.score(test_X, test_Y))
        return self.className + " Testing finished...\n"

    def predict(self, predict_data):
        print("Predictions: ", self.clf.predict(predict_data))
        return self.className + " Prediction finished...\n"

    def cross_validate(self, train_data):
        X_ = train_data[:, :-1]
        Y_ = train_data[:, -1]
        predicted = cross_val_predict(self.clf, X_, Y_, cv=10)
        print("Cross-validation accuracy: ",
              metrics.accuracy_score(Y_, predicted))

        if metrics.accuracy_score(Y_,
                                  predicted) > MLAlgo.cross_validate_accuracy:
            MLAlgo.cross_validate_accuracy = metrics.accuracy_score(
                Y_, predicted)
            MLAlgo.classifier = self.clf
            MLAlgo.trained_instance = self

        return self.className + " Cross validation finished...\n"
Example #9
print("Nested Scores:\n{}".format(knnc_nested_scores))
print("Max score: {} (index {})\n".format(knnc_nested_scores.max(),
                                          np.argmax(knnc_nested_scores)))

#----------------------------------------------------------------------------------------------
#Confusion Matrix KNN NC

shrink_threshold = 0
# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_knnc, y, random_state=3)

knnc_classifier = NearestCentroid(shrink_threshold=shrink_threshold).fit(
    X_train, y_train)

print("KNN Nearest Centroid Confusion Matrix using the best parameters")
print("Train score:{}".format(knnc_classifier.score(X_train, y_train)))
print("Test score:{}\n".format(knnc_classifier.score(X_test, y_test)))

class_names = [
    'grab', 'hit', 'massage', 'pat', 'pinch', 'poke', 'press', 'rub',
    'scratch', 'slap', 'squeeze', 'stroke', 'tap', 'tickle'
]
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(knnc_classifier,
                                 X_test,
                                 y_test,
                                 display_labels=class_names,
                                 normalize=normalize)
    disp.ax_.set_title(title)
Example #10
def nearest_centroid(x_train, y_train, x_test, y_test):
    from sklearn.neighbors import NearestCentroid
    clf = NearestCentroid()
    clf.fit(x_train, y_train)
    value = clf.score(x_test, y_test)
    return "{0:.2f}".format(value)
# 21 features
print(x.shape)
print(y.shape)

# Split into train and test sets
#x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.15,random_state=8, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.15,
                                                    shuffle=True)

#Baseline - Nearest centroid
tic = timeit.default_timer()
nc = NearestCentroid()
nc.fit(x_train, y_train)
print('Train-acc:', nc.score(x_train, y_train))
print('Test-acc:', nc.score(x_test, y_test))
toc = timeit.default_timer()
elapsed_time = toc - tic
NC_time = elapsed_time
print('Elapsed time: ', elapsed_time, 'seconds')

# Gaussian Naive Bayes - assumes a Gaussian distribution
tic = timeit.default_timer()
model = GaussianNB()
model.fit(x_train, y_train)
print('Train-acc:', model.score(x_train, y_train))
print('Test-acc:', model.score(x_test, y_test))
toc = timeit.default_timer()
elapsed_time = toc - tic
def test_ncm(alg_list, datasets, seed=28):
    """
    Evaluates the algorithms specified in the datasets provided.

    Parameters
    ----------

    alg_list : list
        The list of algorithms. Each item must be a quintuple (alg, names, keys, ks, cons), where 'alg' is the algorithm,
        'names' is a list of display names and 'keys' a list of key-names (both parallel to 'ks'), 'ks' is the list of
        centroid counts to evaluate, and 'cons' is the initialization code of the algorithm.

    datasets : list
        The list of datasets to use. Each item must be a pair (str, frac), where 'str' is the name of the dataset
        and 'frac' is the fraction of the dataset to take (for big datasets).

    """
    print("* NEAREST CENTROIDS TEST STARTED")
    mms = MinMaxScaler()
    rownames = ["FOLD " + str(i + 1) for i in range(10)]

    results = {}

    for dset, f in datasets:
        print("** DATASET ", dset)

        folds, [n, d, c] = ds.reduced_dobscv10(dset, f)

        print("** SIZE ", n, " x ", d, " [", c, " classes]")

        results[dset] = {}

        norm_folds = []

        for i, (xtr, ytr, xtst, ytst) in enumerate(folds):
            print("*** NORMALIZING FOLD ", i + 1)
            # Normalizing
            xtr = mms.fit_transform(xtr)
            xtst = mms.transform(xtst)
            norm_folds.append((xtr, ytr, xtst, ytst))

        for j, (dml, dml_name, dml_key, ks, cons) in enumerate(alg_list):
            print("*** EVALUATING DML ", dml_name)

            results[dset] = defaultdict(lambda: np.zeros([12, 3]))

            for i, (xtr, ytr, xtst, ytst) in enumerate(norm_folds):
                print("**** FOLD ", i + 1)
                np.random.seed(seed)

                try:
                    print("***** TRAINING")
                    start = time.time()  # Start timer
                    dml.fit(xtr, ytr)  # Fitting distance
                    end = time.time()  # Stop timer
                    elapsed = end - start  # Timer measurement

                    xtr2 = dml.transform()
                    xtst2 = dml.transform(xtst)

                    for namek, keyk, k in zip(dml_name, dml_key, ks):
                        print("****** TEST NCM [", k, " CTRD]")

                        if k == 1:
                            ncm = NearestCentroid()
                        else:
                            ncm = NCMC_Classifier(k)

                        ncm.fit(xtr2, ytr)

                        results[dset][keyk][i, 0] = ncm.score(xtr2, ytr)    # Train score
                        results[dset][keyk][i, 1] = ncm.score(xtst2, ytst)  # Test score
                        results[dset][keyk][i, 2] = elapsed                 # Time score
                except Exception:
                    print("--- ERROR IN DML ", dml_name)
                    for keyk in dml_key:
                        results[dset][keyk][i, 0] = np.nan  # Train score
                        results[dset][keyk][i, 1] = np.nan  # Test score
                        results[dset][keyk][i, 2] = np.nan  # Time score

                    traceback.print_exc()

            for keyk, namek in zip(dml_key, dml_name):
                results[dset][keyk][10, :] = np.mean(
                    results[dset][keyk][:10, :], axis=0)
                results[dset][keyk][11, :] = np.std(
                    results[dset][keyk][:10, :], axis=0)

                # Saving results
                r = pd.DataFrame(results[dset][keyk],
                                 columns=['TRAIN', 'TEST', 'TIME'],
                                 index=rownames + ["MEAN", "STD"])

                r.to_csv("../results/cv-ncm-" + keyk + "-" + dset + ".csv")
                r.to_html("../results/cv-ncm-" + keyk + "-" + dset + ".html",
                          classes=[table_css(), "kfoldtable meanstd"])

                print("RESULTS: ", dset, ", dml = ", namek)
                print(r)
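A hedged sketch of how test_ncm might be invoked; the NCA import assumes a pyDML-style package exposing fit/transform, and the dataset keys are placeholders for whatever ds.reduced_dobscv10 knows about:

from dml import NCA  # assumed pyDML-style metric learner

alg_list = [
    # (alg, names, keys, ks, cons), with names/keys parallel to ks
    (NCA(), ['NCA-NCM', 'NCA-NCMC-3'], ['nca-ncm', 'nca-ncmc-3'], [1, 3], 'NCA()'),
]
datasets = [('iris', 1.0), ('sonar', 1.0)]  # hypothetical dataset keys
test_ncm(alg_list, datasets, seed=28)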
y_predicted_train = model.predict(x_train)
print("Accuracy Score - train dataset:",
      metrics.accuracy_score(y_train, y_predicted_train))

#print(metrics.accuracy_score(y_train, y_predicted_train))
print(metrics.confusion_matrix(y_test, y_pred), "\n")

print(classification_report(y_test, y_pred))  # (y_true, y_pred) order

# Nearest Centroid Classifier

nc = NearestCentroid()
nc.fit(x_train, y_train)

score = nc.score(x_train, y_train)
print("Score: ", score)

cv_scores = cross_val_score(nc, x_train, y_train, cv=10)
print("CV average score: %.2f" % cv_scores.mean())

y_pred = nc.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

cr = classification_report(y_test, y_pred)
print(cr)

##################################################
Example #14
def evaluate_ncc(train_data, train_labels, test_data, test_labels):
    ncc = NearestCentroid()
    ncc.fit(train_data, train_labels)
    ncc_test = ncc.score(test_data, test_labels)
    return ncc_test
Example #15
# Check which value of k works best; only tested up to k=100 for lack of compute. The curve
# should eventually turn back down, since averaging over too many neighbours stops helping.


def check_results_different_k(from_k, to_k, X_train, X_val):
    # note: y_train and y_val come from the enclosing scope
    scores = []
    k_values = []
    for k in range(from_k, to_k):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        scores.append(knn.score(X_val, y_val))
        k_values.append(k)
    plt.plot(k_values, scores)
    plt.show()


start_time = time.time()
check_results_different_k(2, 20, X_train_image_flatten, X_val_image_flatten)
print("--- %s seconds ---" % (time.time() - start_time))


k_best_result = 100
knn = KNeighborsClassifier(n_neighbors=k_best_result)
knn.fit(X_train_image_flatten, y_train)
print(knn.score(X_test_image_flatten, y_test))

# nearest centroid
nc = NearestCentroid()
nc.fit(X_train_image_flatten, y_train)
print(nc.score(X_test_image_flatten, y_test))
# Splitting data

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=44, shuffle=True
)


# ----------------------------------------------------
# Applying NearestCentroid Model


NearestNeighbors = NearestCentroid()
NearestNeighbors.fit(X_train, y_train)

# Calculating Details
print("NearestNeighbors Train Score is : ", NearestNeighbors.score(X_train, y_train))
print("NearestNeighbors Test Score is : ", NearestNeighbors.score(X_test, y_test))
print("NearestNeighbors Classes are : ", NearestNeighbors.classes_)
print("----------------------------------------------------")

# Calculating Prediction
y_pred = NearestNeighbors.predict(X_test)
print("Predicted Value for NearestNeighbors is : ", y_pred[:10])

# ----------------------------------------------------
# Calculating Confusion Matrix
CM = confusion_matrix(y_test, y_pred)
print("Confusion Matrix is : \n", CM)

# drawing confusion matrix
sns.heatmap(CM, center=True)
Example #17
        performances = []

        # go through, adding `increment` features at a time
        for attempt in range(0, train.shape[1] + 1, increment):
            cnt += 1

            i += increment
            used_features.append(i)
            loo = LeaveOneOut()
            X = train[:, 0:i]
            scores = []
            for train_index, test_index in loo.split(X):
                sample_train, sample_test = X[train_index], X[test_index]
                outcome_train, outcome_test = y[train_index], y[test_index]
                classifier.fit(sample_train, outcome_train)
                loo_score = classifier.score(sample_test, outcome_test)
                scores.append(loo_score)

            performance = mean(scores)
            # break when the performance has not improved in the last 5 iterations
            if cnt >= 6:
                last_5 = performances[-5:]
                if min(last_5) >= performance:
                    break
            performances.append(performance)

        # select the number of features with the highest performance
        performances = np.array(performances)
        # index of the highest-performing fit
        max_performance = np.where(performances == np.amax(performances))[0][0]
Example #18
print(knn.score(X_test_image, y_test))

# Check which value of k works best; only tested up to k=100 for lack of compute. The curve
# should eventually turn back down, since averaging over too many neighbours stops helping.


def check_results_different_k(from_k, to_k, X_train, X_val):
    # note: y_train and y_val come from the enclosing scope
    scores = []
    k_values = []
    for k in range(from_k, to_k):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        scores.append(knn.score(X_val, y_val))
        k_values.append(k)
    plt.plot(k_values, scores)
    plt.show()


start_time = time.time()
check_results_different_k(2, 20, X_train_image, X_val_image)
print("--- %s seconds ---" % (time.time() - start_time))

k_best_result = 15
knn = KNeighborsClassifier(n_neighbors=k_best_result)
knn.fit(X_train_image, y_train)
print(knn.score(X_test_image, y_test))

# nearest centroid
nc = NearestCentroid()
nc.fit(X_train_image, y_train)
print(nc.score(X_test_image, y_test))
Example #19
def NearestCentroidClassifier(x, y):
    clf = NearestCentroid()
    clf.fit(x, y)
    ac = clf.score(x, y)
    return ac
Example #20

 
# Assumed imports so the snippet runs standalone
from keras.datasets import mnist
from sklearn import preprocessing
from sklearn.neighbors import NearestCentroid

(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten the 28x28 images into 784-dimensional vectors
x_train = X_train.reshape(60000, 28 * 28)
x_test = X_test.reshape(10000, 28 * 28)

# Scale features to [0, 1] using statistics from the training set only
scaler = preprocessing.MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

model = NearestCentroid()
model.fit(x_train, y_train)

print("Accuracy for the test set: ", model.score(x_test, y_test))
print("Accuracy for the train set: ", model.score(x_train, y_train))

Example #21
# Standardizing the features (fit the scaler on the training data only)
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Make an instance of the model, keeping 95% of the variance
pca = PCA(.95)

# apply PCA in order to get fewer dimensions to work with;
# fit on the training set, then reuse the same projection for the test set
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# making our predictions
predictions = []

kNearestNeighbor(x_train_pca, y_train, x_test_pca, predictions, 1)

# transform the list into an array
predictions = np.asarray(predictions)

# evaluating accuracy (parenthesize, or '%' binds before '*' and the string is repeated)
accuracy = accuracy_score(y_test, predictions)
print('\nThe accuracy of our classifier is %d%%' % (accuracy * 100))

# train using nearest centroid
ncc = NearestCentroid()
ncc.fit(x_train, y_train)
# get the model accuracy
modelscore = ncc.score(x_test, y_test)
    labels_process_r = labels_process[indexes]

    # Extracting sets
    test_index = int(test_ratio * data_full.shape[0])
    test_data, train_data = np.split(data_full_r, [test_index])
    test_labels_f, train_labels_f = np.split(labels_full_r, [test_index])
    test_labels_m, train_labels_m = np.split(labels_merged_r, [test_index])
    test_labels_p, train_labels_p = np.split(labels_process_r, [test_index])

    # Normalizing sets
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    # Training & Testing Model with Full labels
    model.fit(train_data, train_labels_f)
    acc_full.append(model.score(test_data, test_labels_f))

    # Training & Testing Model with Merged labels
    model.fit(train_data, train_labels_m)
    acc_merged.append(model.score(test_data, test_labels_m))

    # Training & Testing Model with Process labels
    model.fit(train_data, train_labels_p)
    acc_process.append(model.score(test_data, test_labels_p))

# Training with set A and Testing with set B
model.fit(data_A, labels_half)
acc_cross_ab = model.score(data_B, labels_half)

# Training with set B and Testing with set A
model.fit(data_B, labels_half)
Example #23
def evaluate(task,
             represent,
             prepare=None,
             batchsize=None,
             invariant=False,
             verbose=False,
             params=[10**i for i in range(-2, 3)],
             intercept=False,
             n_folds=2,
             n_jobs=-1,
             random_state=0,
             mean_clf=False):
    '''evaluates a representation method on a given task
  Args:
    task: string name of task
    represent: function that transforms a list of documents to a matrix with len(documents) rows
    prepare: returns aggregate information used by represent (should be limited to n-gram vocab, NOT feature counts, etc.)
    batchsize: number of documents represent should process at a time
    invariant: representation method does not depend on the batch (unlike e.g. SIF weighted features); if False, batchsize must be None
    verbose: print progress information
    params: cross-validation parameters
    intercept: whether to fit an intercept in the linear model
    n_folds: number of folds to use when cross-validating
    n_jobs: number of threads to run when cross-validating
    random_state: cross-validation seed
    mean_clf: use the nearest-centroid (mean) classifier instead of logit
  Returns:
    if accuracy task: (train acc, test acc); if regression: (Pearson r, Spearman rho); if retrieval: (acc, F1)
  '''

    assert batchsize is None or invariant, "cannot construct in batches if not invariant"

    if task in TASKMAP['train-test split']:
        (dtrain, ltrain), (dtest, ltest) = TASKMAP['train-test split'][task]()
        info = () if prepare is None else prepare(dtrain + dtest)
        root = '\rBuilding ' + task.upper() + ' Train' if verbose else ''
        Xtrain = batched_build(dtrain, represent, info, root, batchsize)
        Ytrain = np.array(ltrain)
        root = '\rBuilding ' + task.upper() + ' Test' if verbose else ''
        Xtest = batched_build(dtest, represent, info, root, batchsize)
        Ytest = np.array(ltest)
        if mean_clf:
            clf = NearestCentroid()
        else:
            clf = LogitCV(Cs=params,
                          fit_intercept=intercept,
                          cv=n_folds,
                          dual=np.less(*Xtrain.shape),
                          solver='liblinear',
                          n_jobs=n_jobs,
                          random_state=random_state)
            if verbose:
                write('\rCross-Validating and Fitting ' + task.upper() +
                      10 * ' ')
        clf.fit(Xtrain, Ytrain)
        train = 100.0 * clf.score(Xtrain, Ytrain)
        test = 100.0 * clf.score(Xtest, Ytest)

    elif task in TASKMAP['cross-validation']:
        documents, labels = TASKMAP['cross-validation'][task]()
        info = () if prepare is None else prepare(documents)
        train = 0.0
        test = 0.0
        Y = np.array(labels)
        if invariant:
            root = '\rBuilding ' + task.upper() if verbose else ''
            X = batched_build(documents, represent, info, root, batchsize)
            # shuffle=True is required when random_state is set (recent scikit-learn)
            for i, (tr, te) in enumerate(
                    StratifiedKFold(n_splits=10, shuffle=True,
                                    random_state=random_state).split(X, Y)):
                if mean_clf:
                    clf = NearestCentroid()
                else:
                    if verbose:
                        write('\rCross-Validating and Fitting ' +
                              task.upper() + ' Fold ' + str(i + 1) + 10 * ' ')
                    clf = LogitCV(Cs=params,
                                  fit_intercept=intercept,
                                  cv=n_folds,
                                  dual=np.less(*X.shape),
                                  solver='liblinear',
                                  n_jobs=n_jobs,
                                  random_state=random_state)
                clf.fit(X[tr], Y[tr])
                train += clf.score(X[tr], Y[tr])
                test += clf.score(X[te], Y[te])
        else:
            # shuffle=True is required when random_state is set (recent scikit-learn)
            for i, (tr, te) in enumerate(
                    StratifiedKFold(n_splits=10, shuffle=True,
                                    random_state=random_state).split(
                                        documents, Y)):
                root = '\rBuilding ' + task.upper() + ' Fold ' + str(
                    i + 1) + ' Train' if verbose else ''
                Xtrain = batched_build([documents[i] for i in tr], represent,
                                       info, root, batchsize)
                root = '\rBuilding ' + task.upper() + ' Fold ' + str(
                    i + 1) + ' Test' if verbose else ''
                Xtest = batched_build([documents[i] for i in te], represent,
                                      info, root, batchsize)
                if mean_clf:
                    clf = NearestCentroid()
                else:
                    if verbose:
                        write('\rCross-Validating and Fitting ' +
                              task.upper() + ' Fold ' + str(i + 1) + 10 * ' ')
                    clf = LogitCV(Cs=params,
                                  fit_intercept=intercept,
                                  cv=n_folds,
                                  dual=np.less(*Xtrain.shape),
                                  solver='liblinear',
                                  n_jobs=n_jobs,
                                  random_state=random_state)
                clf.fit(Xtrain, Y[tr])
                train += clf.score(Xtrain, Y[tr])
                test += clf.score(Xtest, Y[te])
        train *= 10.0
        test *= 10.0

    elif task in TASKMAP['pairwise task']:
        (d1train, d2train, ltrain), (d1test, d2test,
                                     ltest) = TASKMAP['pairwise task'][task]()
        info = () if prepare is None else prepare(d1train + d2train + d1test +
                                                  d2test)
        root = '\rBuilding ' + task.upper() + ' Train' if verbose else ''
        Xtrain = batched_build(d1train + d2train, represent, info, root,
                               batchsize)
        m = int(Xtrain.shape[0] / 2)
        if task == 'sts':
            Ptrain = np.zeros(m)
            nz = norm(Xtrain[:m], axis=1) * norm(Xtrain[m:], axis=1) > 0.0
            Ptrain[nz] = np.sum(normalize(Xtrain[:m][nz]) *
                                normalize(Xtrain[m:][nz]),
                                axis=1)
        else:
            Xtrain = np.hstack(
                [abs(Xtrain[:m] - Xtrain[m:]), Xtrain[:m] * Xtrain[m:]])
        root = '\rBuilding ' + task.upper() + ' Test' if verbose else ''
        Xtest = batched_build(d1test + d2test, represent, info, root,
                              batchsize)
        m = int(Xtest.shape[0] / 2)
        if task == 'sts':
            Ptest = np.zeros(m)
            nz = norm(Xtest[:m], axis=1) * norm(Xtest[m:], axis=1) > 0.0
            Ptest[nz] = np.sum(normalize(Xtest[:m][nz]) *
                               normalize(Xtest[m:][nz]),
                               axis=1)
        else:
            Xtest = np.hstack(
                [abs(Xtest[:m] - Xtest[m:]), Xtest[:m] * Xtest[m:]])
        if verbose:
            write('\rCross-Validating and Fitting ' + task.upper() + 10 * ' ')
        if task in {'sick_r', 'sts'}:
            if task == 'sts':
                Ytest = np.array([float(y) for y in ltrain + ltest])
                P = np.concatenate([Ptrain, Ptest])
            else:
                Ytrain = np.array([float(y) for y in ltrain])
                Ytest = np.array([float(y) for y in ltest])
                reg = RidgeCV(alphas=params, fit_intercept=intercept)
                reg.fit(Xtrain, Ytrain)
                P = reg.predict(Xtest)
            r = 100.0 * pearsonr(Ytest, P)[0]
            rho = 100.0 * spearmanr(Ytest, P)[0]
            if verbose:
                write('\r' + task.upper() + ': r=' + str(r) + ', rho=' +
                      str(rho) + 10 * ' ' + '\n')
            return r, rho
        else:
            clf = LogitCV(Cs=params,
                          fit_intercept=intercept,
                          cv=n_folds,
                          dual=np.less(*Xtrain.shape),
                          solver='liblinear',
                          n_jobs=n_jobs,
                          random_state=random_state)
            if task == 'mrpc':
                Ytrain = np.array([int(y) for y in ltrain])
                Ytest = np.array([int(y) for y in ltest])
                clf.fit(Xtrain, Ytrain)
                acc = 100.0 * clf.score(Xtest, Ytest)
                f1 = 100.0 * f1_score(Ytest, clf.predict(Xtest))
                if verbose:
                    write('\r' + task.upper() + ': Acc=' + str(acc) + ', F1=' +
                          str(f1) + 10 * ' ' + '\n')
                return acc, f1
            else:
                Ytrain = np.array(ltrain)
                Ytest = np.array(ltest)
                clf.fit(Xtrain, Ytrain)
                train = 100.0 * clf.score(Xtrain, Ytrain)
                test = 100.0 * clf.score(Xtest, Ytest)

    else:
        raise NotImplementedError

    if verbose:
        write('\r' + task.upper() + ': Train Acc=' + str(train) +
              ', Test Acc=' + str(test) + 10 * ' ' + '\n')
    return train, test
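A hedged sketch of calling evaluate with a trivial hand-rolled represent function; the task key 'mr' is an assumption about what TASKMAP contains:

import numpy as np

def represent(docs):
    # toy representation: document length and average token length
    return np.array([[len(d), np.mean([len(t) for t in d.split()] or [0.0])]
                     for d in docs])

# mean_clf=True swaps the logistic model for the NearestCentroid classifier
train_acc, test_acc = evaluate('mr', represent, mean_clf=True, verbose=True)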
Example #24
        cnt += 1
        i += increment  # how many features to add each time
        used_features.append(i)
        X = train[:, 0:i]
        # LeaveOneOut yields index splits so that each sample is used for testing once
        loo = LeaveOneOut()
        scores = []

        for train_index, test_index in loo.split(X):
            X_train, X_test = X[train_index], X[test_index]
            #print(X_train)
            y_train, y_test = y[train_index], y[test_index]
            classifier.fit(X_train, y_train)
            score = classifier.score(X_test, y_test)
            scores.append(score)

        performance = mean(scores)

        if cnt >= 6:
            last_5 = performances[-5:]
            # break if the performance has not improved in the last 5 iterations
            if min(last_5) >= performance:
                break
        performances.append(performance)

    # find model with best performance and extract the features used
    f_cnt = performances.index(max(performances))
    best_features = used_features[f_cnt]
Example #25
w1 = n_samples / (n_classes * n_samples1)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

weights = y_train.map(lambda y: w0 if y == 0 else w1)

from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report
# Creating the Nearest Centroid Classifier
model = NearestCentroid()

# Training the classifier
model.fit(X_train, y_train.values.ravel())

# Weighted accuracy on the training set
print("Weighted Train Score :", model.score(X_train, y_train, sample_weight=weights))

# Printing Accuracy on Training and Test sets
print(f"Training Set Score : {model.score(X_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(X_test, y_test) * 100} %")

# Printing classification report of classifier on the test set data
print(
    f"Model Classification Report : \n{classification_report(y_test, model.predict(X_test))}"
)
'''
Extra Tree Classifier
'''

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, KFold