def nearest_neighbors(X, Y):
    ''' This function implements the K-nearest neighbors and nearest centroid
    classifiers from the nearest-neighbors ML family. It evaluates each model
    with 5-fold stratified cross-validation and reports its performance. '''

    # Importing libraries (np.mean is used below to average the fold scores)
    import numpy as np
    from sklearn import neighbors

    # Fit and score the KNN model with 5-fold stratified CV
    accuracy_scores, precision_scores, recall_scores, f1_scores = model_compilation(
        X, Y, neighbors.KNeighborsClassifier(n_neighbors=15, weights='uniform'))

    # Statistical measurement of the model
    print(" ======= KNN (neighbor = 5 and weight = uniform) ======= ")
    print("Accuracy: ", np.mean(accuracy_scores))
    print("Precision: ", np.mean(precision_scores))
    print("Recall: ", np.mean(recall_scores))
    print("F1: ", np.mean(f1_scores))

    # Fit and score the nearest centroid model with 5-fold stratified CV
    accuracy_scores, precision_scores, recall_scores, f1_scores = model_compilation(
        X, Y, neighbors.NearestCentroid())

    # Statistical measurement of the model
    print(" ======= Nearest Centroid ======= ")
    print("Accuracy: ", np.mean(accuracy_scores))
    print("Precision: ", np.mean(precision_scores))
    print("Recall: ", np.mean(recall_scores))
    print("F1: ", np.mean(f1_scores))
Example #2
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            # x14 == x10
            # x8 == x3
            # x9 == x6^2 - C
            ('drop', transformers.ColumnDropper(columns=(7, 8, 11, 12, 13, 14))),
            ('scale',
             preprocessing.StandardScaler(with_mean=True, with_std=True)),
            ('expand',
             preprocessing.PolynomialFeatures(degree=2,
                                              interaction_only=True,
                                              include_bias=False)),
            ('select',
             feature_selection.SelectKBest(
                 k=26, score_func=feature_selection.mutual_info_classif)),
            ('estim',
             neighbors.NearestCentroid(metric='euclidean',
                                       shrink_threshold=None)),
        ])

        pipe.fit(x, y)
        self._model = pipe.predict
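Note: transformers.ColumnDropper is project-local, not part of scikit-learn. A minimal stand-in with the assumed behavior (drop the listed column indices):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns=()):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # keep every column whose index is not in self.columns
        keep = [i for i in range(X.shape[1]) if i not in set(self.columns)]
        return np.asarray(X)[:, keep]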
Example #3
def get_classifier(classifier_str):
    '''
    This function maps the classifier string classifier_str to the
    corresponding classifier object with the default parameters set.
    '''

    # SVC
    if (classifier_str == 'linearsvc'):
        cl = svm.LinearSVC(**svm_default_param)
    elif (classifier_str == 'svc_linear'):
        libsvm_default_param['kernel'] = 'linear'
        cl = svm.SVC(**libsvm_default_param)
    elif (classifier_str == 'svc_rbf'):
        libsvm_default_param['kernel'] = 'rbf'
        cl = svm.SVC(**libsvm_default_param)
    # polynomial, sigmoid kernel
    # nuSVC
    # Nearest Neighbors (Euclidean distance used by default)
    elif (classifier_str == 'kn_uniform'):
        kn_default_param['weights'] = 'uniform'
        cl = neighbors.KNeighborsClassifier(**kn_default_param)
    elif (classifier_str == 'kn_distance'):
        kn_default_param['weights'] = 'distance'
        cl = neighbors.KNeighborsClassifier(**kn_default_param)
    elif (classifier_str == 'rn_uniform'):
        rn_default_param['weights'] = 'uniform'
        cl = neighbors.RadiusNeighborsClassifier(**rn_default_param)
    elif (classifier_str == 'rn_distance'):
        rn_default_param['weights'] = 'distance'
        cl = neighbors.RadiusNeighborsClassifier(**rn_default_param)
    elif (classifier_str == 'nc'):
        cl = neighbors.NearestCentroid()
    # LDA and QDA, priors are by default set to 1/len(class) for each class
    # (sklearn.lda/sklearn.qda were removed; use sklearn.discriminant_analysis)
    elif (classifier_str == 'lda'):
        cl = discriminant_analysis.LinearDiscriminantAnalysis()
    elif (classifier_str == 'qda'):
        cl = discriminant_analysis.QuadraticDiscriminantAnalysis()
    # Gaussian naive Bayes
    # from the code it is unclear how priors are set
    elif (classifier_str == 'gnb'):
        cl = naive_bayes.GaussianNB()
    elif (classifier_str == 'mnb'):
        cl = naive_bayes.MultinomialNB()
    elif (classifier_str == 'bnb'):
        cl = naive_bayes.BernoulliNB()
    # Decision tree
    elif (classifier_str == 'dtree'):
        cl = tree.DecisionTreeClassifier()
    elif (classifier_str == 'rforest'):
        cl = ensemble.RandomForestClassifier()
    else:
        # raise error if classifier not found
        raise ValueError('Classifier not implemented: %s' % (classifier_str))

    return cl
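Note: the *_default_param dicts are module-level and not shown; plausible minimal definitions and a usage example (the parameter values here are illustrative only):

from sklearn import svm, neighbors

svm_default_param = {'C': 1.0}
libsvm_default_param = {'C': 1.0}
kn_default_param = {'n_neighbors': 5}
rn_default_param = {'radius': 1.0}

clf = get_classifier('kn_uniform')  # -> KNeighborsClassifier(n_neighbors=5, weights='uniform')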
Example #4
 def select_model(classifier_method):
     """
     Initializes desired classifier
     :param classifier_method: desired classifier, expects 'KNN' or 'Rocchio'
     :return: classifier sklearn object
     """
     if classifier_method == 'KNN':
         return neighbors.KNeighborsClassifier(n_neighbors=NEIGHBORS)
     elif classifier_method == 'Rocchio':
         return neighbors.NearestCentroid()
     else:
         print("Error. Expects 'KNN' or 'Rocchio' only.")
Example #5
 def select_model(classifier_method, number_of_neighbors):
     """
     Initializes desired classifier
     :param classifier_method: desired classifier, expects 'KNN' or 'Rocchio'
     :param number_of_neighbors: the number of neighbors for knn model, default 10
     :return: classifier sklearn object
     """
     if classifier_method == 'KNN':
         return neighbors.KNeighborsClassifier(n_neighbors=number_of_neighbors, metric='manhattan')
     elif classifier_method == 'Rocchio':
         return neighbors.NearestCentroid()
     else:
         print("Error. Expects 'KNN' or 'Rocchio' only.")
Example #6
def train_test(x_tr, y_tr, x_te, y_te, name):
    algorithms = {
        'ada_boost': ensemble.AdaBoostClassifier(),
        'bagging': ensemble.BaggingClassifier(),
        'extra_trees': ensemble.ExtraTreesClassifier(),
        'random_forest': ensemble.RandomForestClassifier(),
        'logistic_regression': linear_model.LogisticRegression(),
        'passive_aggressive': linear_model.PassiveAggressiveClassifier(),
        'ridge': linear_model.RidgeClassifier(),
        'sgd': linear_model.SGDClassifier(),
        'bernoulli': naive_bayes.BernoulliNB(),
        'gaussian': naive_bayes.GaussianNB(),
        'k_neighbors': neighbors.KNeighborsClassifier(),
        'nearest_centroid': neighbors.NearestCentroid(),
        'mlp': neural_network.MLPClassifier(),
        'linear_svc': svm.LinearSVC(),
        'decision_tree': tree.DecisionTreeClassifier(),
        'extra_tree': tree.ExtraTreeClassifier(),
        'gradient_boosting': ensemble.GradientBoostingClassifier(),
        'hist_gradient_boosting': HistGradientBoostingClassifier()
    }
    res = {}
    try:
        clf = GridSearchCV(algorithms.get(name),
                           getattr(CVParameters, name),
                           cv=2,
                           n_jobs=-1)
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        clf.fit(x_tr, y_tr)
        tr_time = time.perf_counter() - start
        print(tr_time)
        print(clf.best_params_)
        print(clf.best_score_)
        tr_score = clf.score(x_tr, y_tr)
        score = clf.score(x_te, y_te)
        tr_fscore = f1_score(y_tr, clf.predict(x_tr), average='weighted')
        fscore = f1_score(y_te, clf.predict(x_te), average='weighted')
        print(tr_score, score, tr_fscore, fscore)
        res = {
            name: {
                'test': score,
                'train': tr_score,
                'f1_test': fscore,
                'f1_train': tr_fscore,
                'tr_time': tr_time
            }
        }
        res[name].update(clf.best_params_)
    except Exception as e:
        print(e)
    return res
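Note: CVParameters is a project class whose attributes hold one GridSearchCV parameter grid per key of the algorithms dict; getattr(CVParameters, name) looks the grid up by that key. A minimal sketch for two of the models (grids are illustrative):

class CVParameters:
    k_neighbors = {'n_neighbors': [3, 5, 11], 'weights': ['uniform', 'distance']}
    nearest_centroid = {'shrink_threshold': [None, 0.1, 0.5]}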
Example #7
def feature(model, args, writer, epoch):
    model.eval()
    transform = transforms.Compose([
        # transforms.ToPILImage(),
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize(mean=np.array([0.485, 0.456, 0.406]),
                             std=np.array([0.229, 0.224, 0.225])),
    ])

    # These lists hold one exemplar feature/label per batch for fitting the
    # centroid classifier below (the names shadow sklearn's NearestCentroid).
    NearestCentroid, NearestCentroid_label, features, labels = [], [], [], []
    preserved_features, preserved_labels = [], []
    fea, l = torch.zeros(0), torch.zeros(0)

    train_set = torchvision.datasets.ImageFolder(root=args.preserved_sample,
                                                 transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=3,
                                               shuffle=False)
    for i, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        output = model.module.forward(data)
        preserved_features.extend(output.data)
        preserved_labels.extend(target.data.cpu().numpy())
        NearestCentroid.append(output[0].data.cpu().numpy())
        NearestCentroid_label.append(target[0].data.cpu().numpy())
        fea = torch.cat((fea, output.data.cpu()))
        l = torch.cat((l, target.data.cpu().float()))

    train_set = torchvision.datasets.ImageFolder(root=args.train_set,
                                                 transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.test_batch_size,
                                               shuffle=False)
    for i, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        output = model.forward(data)
        features.extend(output.data)
        labels.extend(target.data.cpu().numpy())
        fea = torch.cat((fea, output.data.cpu()))
        l = torch.cat((l, target.data.cpu().float()))

    clf = neighbors.NearestCentroid()
    clf.fit(NearestCentroid, NearestCentroid_label)

    writer.add_embedding(mat=fea, metadata=l, global_step=epoch)
    return features, labels, clf, preserved_features, preserved_labels
Example #8
def create_ts_and_targets(data_cols, cents):
    print("**extracting time series from raw accidents")

    #initialize nearest centroid classifier with precomputed centroids
    dummy_classes = [i for i in range(K)]
    nn_clf = nbr.NearestCentroid()
    nn_clf.fit(cents, dummy_classes)

    #this is the idx of the first accident in the window
    time_stamps = data_cols[0]
    acc_start_idx = extHelp.index_for_t(WIND_EXT_START, time_stamps)
    wind_idx = 0

    #each loop extracts one row for final predictor time series
    feat_ts = []
    target_ts = []

    log_count = 0
    while (time_stamps[acc_start_idx] < WIND_EXT_END):
        curr_t = WIND_EXT_START + (wind_idx * STEP_SZ)

        log_count += 1
        if (log_count % 25 == 0):
            print("**current t\n\t" + str(curr_t) + " of\n\t" +
                  str(WIND_EXT_END))

        #windows of accidents, both historical and on the forecasting horizon
        prev_day, prev_week, prev_month = extract_hist_winds(
            curr_t, data_cols, acc_start_idx)
        fwd_horizon = extract_target_wind(curr_t, data_cols, acc_start_idx)

        prev_day_probs = rel_freqs(nn_clf, prev_day)
        prev_week_probs = rel_freqs(nn_clf, prev_week)
        prev_month_probs = rel_freqs(nn_clf, prev_month)
        target_probs = rel_freqs(nn_clf, fwd_horizon)
        time_embed = extHelp.embed_time(curr_t, WIND_EXT_START)

        #concat all probabilities and time embedding into single step
        concat_feats = time_embed + prev_day_probs + prev_week_probs + prev_month_probs

        feat_ts.append(concat_feats)
        target_ts.append(target_probs)

        #bring index to first accident >= current_t + STEP_SZ
        acc_start_idx += extHelp.index_for_t(curr_t + STEP_SZ,
                                             time_stamps[acc_start_idx:])
        wind_idx += 1
    return feat_ts, target_ts
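Note: rel_freqs is not shown. Judging from its use, it assigns each accident in a window to its nearest centroid and returns the relative frequency of each of the K classes (K is the module-level constant used above). A sketch under that assumption:

import numpy as np

def rel_freqs(nn_clf, window):
    # window: rows of accident features; returns K relative frequencies
    if len(window) == 0:
        return [0.0] * K
    assigned = nn_clf.predict(window)
    counts = np.bincount(assigned, minlength=K)
    return list(counts / counts.sum())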
Example #9
 def __init__(self, data, algorithm, k=10):
     """
     Runs the specified algorithm on processed data and calculates accuracy.
     :param data: Data set.
     :param algorithm: String represent algorithm to use: 'KNN' or 'Rocchio'
     :param k: Optional - initializes 'KNN' algorithm number of neighbors (default = 10).
     """
     self._name = algorithm
     self._data = data
     if algorithm == "KNN":
         self.algorithm = neighbors.KNeighborsClassifier(n_neighbors=k, p=1)
     elif algorithm == "Rocchio":
         self.algorithm = neighbors.NearestCentroid()
     else:
         print("Please enter one of : KNN or Rocchio")
     self._accuracy = 0
Example #10
def nm_alg(teachingSet, testSet, features, distanceMetrics, normalization,
           metric):

    trainDataFeatures, trainDataLabelFeatures = prepareDataSet(
        teachingSet, features, normalization)
    testDataFeatures, testDataLabelFeatures = prepareDataSet(
        testSet, features, normalization)

    classifier = neighbors.NearestCentroid(metric=distanceMetrics,
                                           shrink_threshold=None)

    classifier.fit(trainDataFeatures, trainDataLabelFeatures)
    predictions = classifier.predict(testDataFeatures)

    score = eval(metric + "_score")(testDataLabelFeatures, predictions)
    accuracy_confusion_matrix = confusion_matrix(testDataLabelFeatures,
                                                 predictions)

    return score, accuracy_confusion_matrix
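Note: the eval(metric + "_score") lookup only works if the matching *_score function was imported into this namespace. A safer equivalent resolves it from sklearn.metrics by name:

from sklearn import metrics as skmetrics

score_func = getattr(skmetrics, metric + "_score")  # e.g. "accuracy" -> accuracy_score
score = score_func(testDataLabelFeatures, predictions)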
Example #11
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop', transformers.ColumnDropper(
                columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
            )),
            ('scale', preprocessing.StandardScaler(
                with_mean=True,
                with_std=True
            )),
            ('select', feature_selection.SelectKBest(
                k=101,
                score_func=feature_selection.f_classif
            )),
            ('estim', neighbors.NearestCentroid(
                metric='euclidean',
                shrink_threshold=None
            )),
        ])

        pipe.fit(x, y)
        self._model = pipe.predict
Example #12
 def fit(self, X, y):
     self.centroids_ = neighbors.NearestCentroid(metric="cosine")\
               .fit(X, y).centroids_
     return self
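Note: newer scikit-learn releases restrict NearestCentroid prediction to a couple of metrics, which is presumably why this wrapper keeps only the centroids fitted under the cosine metric. A matching predict step could rank by cosine distance to those centroids (a sketch, assuming integer class labels 0..n-1 in sorted order):

import numpy as np
from sklearn.metrics.pairwise import cosine_distances

def predict(self, X):
    # index of the closest centroid row == predicted class label
    return np.argmin(cosine_distances(X, self.centroids_), axis=1)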
Example #13
test_file = 'BRENNT_' + client + '_Test.csv'
aux_path = client + '/'
cat_list = [2, 5, 6, 23, 24, 25, 26, 27]
stats_file = client + '.stats'
name_list = client + '.names'

DPlib.getLabels(data_path, data_file, cat_list, aux_path, stats_file)

DATA, LABEL = DPlib.getAllModData(data_path, data_file, aux_path, name_list,
                                  stats_file)
tDATA, tLABEL = DPlib.getAllModData(data_path, test_file, aux_path, name_list,
                                    stats_file)

clfkNNu = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform', p=5)
clfkNNd = neighbors.KNeighborsClassifier(n_neighbors=3, weights='distance', p=5)
clfkNNc = neighbors.NearestCentroid()

clfkNNu.fit(DATA, LABEL)
clfkNNd.fit(DATA, LABEL)
clfkNNc.fit(DATA, LABEL)

pLABELkNNu = clfkNNu.predict(tDATA)
pLABELkNNd = clfkNNd.predict(tDATA)
pLABELkNNc = clfkNNc.predict(tDATA)

V = [pLABELkNNu, pLABELkNNd, pLABELkNNc]

pLABELmajority = []
for ii in range(len(V[0])):
    summ = 0
    for jj in range(3):
Example #14
    vectorizer.fit(x_train_raw)
    my_representations.append({"name":name, "x_train":vectorizer.transform(x_train_raw), "x_test":vectorizer.transform(x_test_raw)})
    if name == 'tf':
        print(len(vectorizer.vocabulary_))


###########################
# learning



from sklearn import naive_bayes, linear_model, svm, ensemble, neighbors, metrics
from sklearn.ensemble import RandomForestClassifier

# configure
learners = [{"name":"LR", "model":linear_model.LogisticRegression(C=1,class_weight='balanced')},
            {"name":"SVM", "model":svm.LinearSVC(C=1,class_weight='balanced')},
            {"name":"5-NN", "model":neighbors.KNeighborsClassifier(n_neighbors=5)},
            {"name":"Rochio", "model":neighbors.NearestCentroid()},
            {"name":"N.B.", "model":naive_bayes.MultinomialNB(alpha=1)},
            {"name":"R.F.", "model":RandomForestClassifier(n_estimators = 100)}]
# fit and test
for representation in my_representations:
    print "\tRepresentation:", representation["name"]
    for learner in learners:
        learner['model'].fit(representation["x_train"], y_train)
        preds = learner['model'].predict(representation["x_test"])
        print "%s:\tAccuracy: %0.3f\tF1 macro: %0.3f"%(learner['name'],
                    metrics.accuracy_score(y_test, preds), metrics.f1_score(y_test, preds, average='macro'))
    print "----------------"
Example #15
## refer to 1.1.11. Stochastic Gradient Descent - SGD

## 1.6. Nearest Neighbors
models.append( {"name": "1.6.3. KNeighborsRegressor uniform", \
    "model": neighbors.KNeighborsRegressor(weights = "uniform")} )
models.append( {"name": "1.6.3. KNeighborsRegressor distance", \
    "model": neighbors.KNeighborsRegressor(weights = "distance")} )

#ValueError: Input contains NaN
#models.append( {"name": "1.6.3. RadiusNeighborsRegressor uniform", \
#				"model": neighbors.RadiusNeighborsRegressor(weights = "uniform")} )
#ZeroDivisionError: Weights sum to zero, can't be normalized
#models.append( {"name": "1.6.3. RadiusNeighborsRegressor distance", \
#				"model": neighbors.RadiusNeighborsRegressor(weights = "distance")} )
models.append( {"name": "1.6.3. NearestCentroid", \
    "model": neighbors.NearestCentroid()} )

## 1.7. Gaussian Processes
## too slow?
#models.append( {"name": "1.7. Gaussian Processes", \
#				"model": gaussian_process.GaussianProcess()} )

## 1.8. Cross decomposition
models.append( {"name": "1.8. Cross decomposition PLSRegression", \
    "model": cross_decomposition.PLSRegression()} )
models.append( {"name": "1.8. Cross decomposition PLSCanonical", \
    "model": cross_decomposition.PLSCanonical()} )
# slow
#models.append( {"name": "1.8. Cross decomposition CCA", \
#				"model": cross_decomposition.CCA()} )
Example #16
def select_three_sample(model, args, epoch, writer):
    model.eval()
    num_each_class = [500, 500, 500, 500, 500, 500, 500, 50, 50, 50]
    # num_each_class = [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500]
    transform = transforms.Compose([
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize(mean=np.array([0.485, 0.456, 0.406]),
                             std=np.array([0.229, 0.224, 0.225])),
    ])
    train_set = torchvision.datasets.ImageFolder(root=args.train_set,
                                                 transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.test_batch_size,
                                               shuffle=False)

    NearestCentroid, KNeighbors, features, label, labels = [], [], [], [], []
    for i, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        output = model(data)
        features.extend(output.data)
        KNeighbors.extend(output.data.cpu().numpy())
        labels.extend(target.data.cpu().numpy())

    count = 0
    l2_dist = PairwiseDistance(2)
    destination = os.path.join(args.check_path, 'epoch' + str(epoch))
    if not os.path.exists(destination):
        os.mkdir(destination)
    for i in range(len(num_each_class)):
        num_sample = features[count:count + num_each_class[i]]
        m = torch.tensor(np.zeros(args.embedding_size)).float().cuda()
        for x in num_sample:
            m += x
        m /= num_each_class[i]

        sample1 = min(num_sample, key=lambda x: l2_dist.forward_val(x, m))
        sample2 = max(num_sample,
                      key=lambda x: l2_dist.forward_val(x, sample1))
        sample3 = max(num_sample,
                      key=lambda x: l2_dist.forward_val(x, sample2))
        NearestCentroid.append(sample1.cpu().numpy())
        label.append(i)

        sample1_loc, sample2_loc, sample3_loc = -1, -1, -1
        for j in range(len(num_sample)):
            if (num_sample[j] == sample1).all():
                sample1_loc = j
            if (num_sample[j] == sample2).all():
                sample2_loc = j
            if (num_sample[j] == sample3).all():
                sample3_loc = j

        frame = pd.read_csv(args.train_set_csv)
        destination_class = os.path.join(
            destination, str(frame['name'][count + sample1_loc]))
        if not os.path.exists(destination_class):
            os.mkdir(destination_class)
        sample1_source = os.path.join(
            args.train_set, str(frame['name'][count + sample1_loc]),
            str(frame['id'][count + sample1_loc]) + '.png')
        sample2_source = os.path.join(
            args.train_set, str(frame['name'][count + sample2_loc]),
            str(frame['id'][count + sample2_loc]) + '.png')
        sample3_source = os.path.join(
            args.train_set, str(frame['name'][count + sample3_loc]),
            str(frame['id'][count + sample3_loc]) + '.png')
        shutil.copy(sample1_source, destination_class + '/sample1.png')
        shutil.copy(sample2_source, destination_class + '/sample2.png')
        shutil.copy(sample3_source, destination_class + '/sample3.png')
        count += num_each_class[i]

    clf = neighbors.NearestCentroid()
    clf.fit(NearestCentroid, label)

    return features, labels, clf, destination
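Note: PairwiseDistance here is a project-local class (the forward_val method does not exist on torch.nn.PairwiseDistance). A minimal stand-in computing the Lp distance between two feature tensors:

import torch

class PairwiseDistance:
    def __init__(self, p):
        self.p = p

    def forward_val(self, x1, x2):
        # scalar Lp distance between two equally-shaped tensors
        return torch.norm(x1 - x2, p=self.p)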
Example #17
def errorCorrectionTrain(input_images,
                         output,
                         parameters=None,
                         debug=False,
                         partition=None,
                         part=None,
                         multilabel=1):
    try:
        use_coord = parameters.get('use_coord', True)
        use_joint = parameters.get('use_joint', True)
        patch_size = parameters.get('patch_size', 1)

        border = patch_size * 2

        if patch_size == 0:
            border = 2

        normalize_input = parameters.get('normalize_input', True)

        method = parameters.get('method', 'lSVC')
        method2 = parameters.get('method2', method)
        method_n = parameters.get('method_n', 15)
        method2_n = parameters.get('method2_n', method_n)
        method_random = parameters.get('method_random', None)
        method_max_features = parameters.get('method_max_features', 'auto')
        method_n_jobs = parameters.get('method_n_jobs', 1)
        primary_features = parameters.get('primary_features', 1)

        training_images = []
        training_diff = []
        training_images_direct = []
        training_direct = []

        if debug:
            print("errorCorrectionTrain use_coord={} use_joint={} patch_size={} normalize_input={} method={} output={} partition={} part={}".\
                    format(repr(use_coord),repr(use_joint),repr(patch_size),repr(normalize_input),method,output,partition,part))

        coords = None
        total_mask_size = 0
        total_diff_mask_size = 0

        for (i, inp) in enumerate(input_images):
            mask = None
            diff = None
            mask_diff = None

            if inp[-2] is not None:
                mask = extract_part(
                    minc2_file(inp[-2]).data, partition, part, border)

            ground_data = minc2_file(inp[-1]).data
            auto_data = minc2_file(inp[-3]).data

            ground_shape = ground_data.shape
            ground = extract_part(ground_data, partition, part, border)
            auto = extract_part(auto_data, partition, part, border)

            shape = ground_shape
            if coords is None and use_coord:
                c = np.mgrid[0:shape[0], 0:shape[1], 0:shape[2]]
                coords = [
                    extract_part((c[j] - shape[j] / 2.0) / (shape[j] / 2.0),
                                 partition, part, border) for j in range(3)
                ]

            features = [
                extract_part(minc2_file(k).data, partition, part, border)
                for k in inp[0:-3]
            ]

            mask_size = shape[0] * shape[1] * shape[2]

            if debug:
                print("Training data size:{}".format(len(features)))
                if mask is not None:
                    mask_size = np.sum(mask)
                    print("Mask size:{}".format(mask_size))
                else:
                    print("Mask absent")
            total_mask_size += mask_size

            if multilabel > 1:
                diff = (ground != auto)
                total_diff_mask_size += np.sum(diff)

                if mask is not None:
                    mask_diff = diff & (mask > 0)
                    print("Sample {} mask_diff={} diff={}".format(
                        i, np.sum(mask_diff), np.sum(diff)))
                    #print(mask_diff)
                    training_diff.append(diff[mask > 0])
                    training_direct.append(ground[mask_diff])
                else:
                    mask_diff = diff
                    training_diff.append(diff)
                    training_direct.append(ground[diff])

                training_images.append(
                    prepare_features(features,
                                     coords,
                                     mask=mask,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))

                training_images_direct.append(
                    prepare_features(features,
                                     coords,
                                     mask=mask_diff,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))

            else:
                mask_diff = mask
                if mask is not None:
                    training_diff.append(ground[mask > 0])
                else:
                    training_diff.append(ground)

                training_images.append(
                    prepare_features(features,
                                     coords,
                                     mask=mask,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))

            if debug:
                print("feature size:{}".format(len(training_images[-1])))

            if i == 0 and parameters.get('dump', False):
                print("Dumping feature images...")
                for (j, k) in enumerate(training_images[-1]):
                    test = np.zeros(mask.shape)
                    test[mask > 0] = k
                    out = minc2_file()
                    out.imitate(inp[0], path="dump_{}.mnc".format(j))
                    out.data = test

        # calculate normalization coefficients

        if debug: print("Done")

        clf = None
        clf2 = None

        if total_mask_size > 0:
            training_X = convert_image_list(training_images)
            training_Y = np.ravel(
                np.concatenate(tuple(j for j in training_diff)))

            if debug: print("Fitting 1st...")

            if method == "xgb":
                clf = None
            elif method == "SVM":
                clf = svm.SVC()
            elif method == "nuSVM":
                clf = svm.NuSVC()
            elif method == 'NC':
                clf = neighbors.NearestCentroid()
            elif method == 'NN':
                clf = neighbors.KNeighborsClassifier(method_n)
            elif method == 'RanForest':
                clf = ensemble.RandomForestClassifier(
                    n_estimators=method_n,
                    n_jobs=method_n_jobs,
                    max_features=method_max_features,
                    random_state=method_random)
            elif method == 'AdaBoost':
                clf = ensemble.AdaBoostClassifier(n_estimators=method_n,
                                                  random_state=method_random)
            elif method == 'AdaBoostPP':
                clf = Pipeline(steps=[('normalizer', Normalizer()),
                                      ('AdaBoost',
                                       ensemble.AdaBoostClassifier(
                                           n_estimators=method_n,
                                           random_state=method_random))])
            elif method == 'tree':
                clf = tree.DecisionTreeClassifier(random_state=method_random)
            elif method == 'ExtraTrees':
                clf = ensemble.ExtraTreesClassifier(
                    n_estimators=method_n,
                    max_features=method_max_features,
                    n_jobs=method_n_jobs,
                    random_state=method_random)
            elif method == 'Bagging':
                clf = ensemble.BaggingClassifier(
                    n_estimators=method_n,
                    max_features=method_max_features,
                    n_jobs=method_n_jobs,
                    random_state=method_random)
            elif method == 'dumb':
                clf = dummy.DummyClassifier(strategy="constant", constant=0)
            else:
                clf = svm.LinearSVC()

            #scores = cross_validation.cross_val_score(clf, training_X, training_Y)
            #print scores
            if method == "xgb":
                xg_train = xgb.DMatrix(training_X, label=training_Y)
                param = {}
                num_round = 100
                # use softmax multi-class classification
                param['objective'] = 'multi:softmax'
                # scale weight of positive examples
                param['eta'] = 0.1
                param['max_depth'] = 8
                param['silent'] = 1
                param['nthread'] = 4
                param['num_class'] = 2
                clf = xgb.train(param, xg_train, num_round)
            elif method != 'dumb':
                clf.fit(training_X, training_Y)

            if multilabel > 1 and method != 'dumb':
                if debug: print("Fitting direct...")

                training_X = convert_image_list(training_images_direct)
                training_Y = np.ravel(
                    np.concatenate(tuple(j for j in training_direct)))

                if method2 == "xgb":
                    clf2 = None
                if method2 == "SVM":
                    clf2 = svm.SVC()
                elif method2 == "nuSVM":
                    clf2 = svm.NuSVC()
                elif method2 == 'NC':
                    clf2 = neighbors.NearestCentroid()
                elif method2 == 'NN':
                    clf2 = neighbors.KNeighborsClassifier(method_n)
                elif method2 == 'RanForest':
                    clf2 = ensemble.RandomForestClassifier(
                        n_estimators=method_n,
                        n_jobs=method_n_jobs,
                        max_features=method_max_features,
                        random_state=method_random)
                elif method2 == 'AdaBoost':
                    clf2 = ensemble.AdaBoostClassifier(
                        n_estimators=method_n, random_state=method_random)
                elif method2 == 'AdaBoostPP':
                    clf2 = Pipeline(steps=[('normalizer', Normalizer()),
                                           ('AdaBoost',
                                            ensemble.AdaBoostClassifier(
                                                n_estimators=method_n,
                                                random_state=method_random))])
                elif method2 == 'tree':
                    clf2 = tree.DecisionTreeClassifier(
                        random_state=method_random)
                elif method2 == 'ExtraTrees':
                    clf2 = ensemble.ExtraTreesClassifier(
                        n_estimators=method_n,
                        max_features=method_max_features,
                        n_jobs=method_n_jobs,
                        random_state=method_random)
                elif method2 == 'Bagging':
                    clf2 = ensemble.BaggingClassifier(
                        n_estimators=method_n,
                        max_features=method_max_features,
                        n_jobs=method_n_jobs,
                        random_state=method_random)
                elif method2 == 'dumb':
                    clf2 = dummy.DummyClassifier(strategy="constant",
                                                 constant=0)
                else:
                    clf2 = svm.LinearSVC()

                if method2 == "xgb":
                    xg_train = xgb.DMatrix(training_X, label=training_Y)

                    param = {}
                    num_round = 100
                    # use softmax multi-class classification
                    param['objective'] = 'multi:softmax'
                    # scale weight of positive examples
                    param['eta'] = 0.1
                    param['max_depth'] = 8
                    param['silent'] = 1
                    param['nthread'] = 4
                    param['num_class'] = multilabel

                    clf2 = xgb.train(param, xg_train, num_round)

                elif method2 != 'dumb':
                    clf2.fit(training_X, training_Y)

            #print(clf.score(training_X,training_Y))

            if debug:
                print(clf)
                print(clf2)
        else:
            print("Warning : zero total mask size!, using null classifier")
            clf = dummy.DummyClassifier(strategy="constant", constant=0)

        if method == 'xgb' and method2 == 'xgb':
            #save
            clf.save_model(output)
            clf2.save_model(output + '_2')
        else:
            with open(output, 'wb') as f:
                pickle.dump([clf, clf2], f, -1)

    except mincError as e:
        print("Exception in linear_registration:{}".format(str(e)))
        traceback.print_exc(file=sys.stdout)
        raise
    except:
        print("Exception in linear_registration:{}".format(sys.exc_info()[0]))
        traceback.print_exc(file=sys.stdout)
        raise
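For the non-xgb path the function pickles the pair [clf, clf2]; the matching load step is simply:

import pickle

with open(output, 'rb') as f:
    clf, clf2 = pickle.load(f)  # clf2 is None unless multilabel > 1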
Example #18
from sklearn import tree, svm, neighbors, linear_model

clftree = tree.DecisionTreeClassifier()  # Tree
clfsvm = svm.SVC()  # Support Vector Machine (SVM)
clfnc = neighbors.NearestCentroid()  # Nearest Centroid (NC)
clfsgd = linear_model.SGDClassifier(
    loss='hinge', penalty='l2',
    max_iter=1000)  # Stochastic Gradient Descent (SGD)

# [height, weight, shoe_size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42],
     [181, 85, 43]]

Y = [
    'male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
    'female', 'male', 'male'
]

clftree = clftree.fit(X, Y)  # Tree
clfsvm = clfsvm.fit(X, Y)  # SVM
clfnc = clfnc.fit(X, Y)  # NC
clfsgd = clfsgd.fit(X, Y)  # SGD

prediction_tree = clftree.predict([[190, 70, 43]])  # Tree
prediction_svm = clfsvm.predict([[190, 70, 43]])  # SVM
prediction_nc = clfnc.predict([[190, 70, 43]])  # NC
prediction_sgd = clfsgd.predict([[190, 70, 43]])  # SGD

print('Tree', prediction_tree)  # Tree
print('SVM', prediction_svm)  # SVM
print('NC', prediction_nc)  # NC
print('SGD', prediction_sgd)  # SGD
Example #19
 def __init__(self, **kwargs):
     self.classifier = neighbors.NearestCentroid(**kwargs)
Example #20
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn import datasets, neighbors

neighbors_cnt = 15

if __name__ == "__main__":
    print("Loading data...")
    data = datasets.load_iris()
    X, y = data.data[:, :2], data.target
    step = 0.01
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    for shrinkage in [None, 0.2]:
        model = neighbors.NearestCentroid(shrink_threshold=shrinkage)
        model.fit(X, y)
        y_prediction = model.predict(X)
        print(shrinkage, np.mean(y_prediction == y))
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                             np.arange(y_min, y_max, step))
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
                    edgecolor='k', s=20)
        plt.title("3-Class classification (shrink_threshold=%r)"
                  % shrinkage)
Example #21
    }]
}

# Train and test
for clf, name, parameters in (
    (RidgeClassifier(), "Ridge Classifier of linear_model",
     parameters_condition['RidgeClassifier']),
    (Perceptron(), "Perceptron of linear_model",
     parameters_condition['Perceptron']),
    (PassiveAggressiveClassifier(), "Passive-Aggressive of linear_model",
     parameters_condition['PassiveAggressiveClassifier']),
    (SGDClassifier(), "SGD model of linear_model",
     parameters_condition['SGDClassifier']),
    (neighbors.KNeighborsClassifier(), "kNN",
     parameters_condition['KNeighborsClassifier']),
    (neighbors.NearestCentroid(), "NearestCentroid",
     parameters_condition['NearestCentroid']),
    (svm.SVC(), "SVC", parameters_condition['SVC']),
    (svm.LinearSVC(), "LinearSVC", parameters_condition['LinearSVC']),
    (svm.NuSVC(), "NuSVC", parameters_condition['NuSVC']),
    (MultinomialNB(), "MultinomialNB", parameters_condition['MultinomialNB']),
    (BernoulliNB(), "BernoulliNB", parameters_condition['BernoulliNB']),
    (RandomForestClassifier(), "Random forest",
     parameters_condition['RandomForestClassifier'])):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf=clf, parameters=parameters))

# make some plots
import matplotlib.pyplot as plt
Example #22
 def get_skl_estimator(self, **default_parameters):
     return neighbors.NearestCentroid(**default_parameters)
Example #23
    def run(self):
        start_time = datetime.datetime.now()
        best_acc, best_epoch, fts_means = 0., 0, None
        for epoch in range(1, self.args.epoch + 1):
            if self.scheduler is not None:
                self.scheduler.step()

            if self.increment_phase > 0:
                new_loss, ebd_loss = self.train_increment(
                    epoch=epoch,
                    model=self.model,
                    criterion=self.criterion,
                    embedding_loss=self.embedding_loss,
                    optimizer=self.optimizer,
                    new_loader=self.sampler_train_loader,
                    train_loader=self.train_loader_old)

                if epoch % 2 == 0:
                    validate_start = datetime.datetime.now()

                    with torch.no_grad():
                        new_embeddings, new_targets = self.extractEmbeddings(
                            self.model, self.train_loader)
                        old_embeddings, old_targets = self.extractEmbeddings(
                            self.model, self.train_loader_old)

                    ########################################
                    embeddings = torch.cat((new_embeddings, old_embeddings))
                    targets = np.append(new_targets, old_targets)
                    fts_means, labels = self.extract_feature_mean(
                        embeddings, targets)
                    clf_knn = neighbors.KNeighborsClassifier(
                        n_neighbors=self.args.vote).fit(
                            embeddings.cpu().data.numpy(), targets)
                    clf_ncm = neighbors.NearestCentroid().fit(
                        fts_means.cpu().data.numpy(), labels)
                    # clf_ncm = neighbors.NearestCentroid().fit(self.means.cpu().data.numpy(), labels)

                    #############################################
                    # New Train accuracy
                    new_train_accy, new_train_fts, new_train_lbls = self.validate(
                        args=self.args,
                        model=self.model,
                        clf_knn=clf_knn,
                        loader=self.train_loader,
                        clf_ncm=clf_ncm)

                    # Old train acc
                    old_train_accy, old_train_fts, old_train_lbls = self.validate(
                        args=self.args,
                        model=self.model,
                        clf_knn=clf_knn,
                        loader=self.train_loader_old,
                        clf_ncm=clf_ncm)

                    # Test accuracy
                    valid_accy, pred_fts, pred_lbls = self.validate(
                        args=self.args,
                        model=self.model,
                        loader=self.test_loader,
                        clf_knn=clf_knn,
                        clf_ncm=clf_ncm)

                    self.log(epoch, new_loss, new_train_accy, valid_accy,
                             validate_start, fts_means, pred_lbls, best_acc,
                             ebd_loss, old_train_accy)

                    if (valid_accy[1] > best_acc) or epoch == self.args.epoch:
                        best_acc = max(best_acc, valid_accy[1])
                        best_epoch = epoch
                        self.save_model(epoch,
                                        fts_means,
                                        preserved_embedding=None)

            elif self.increment_phase == 0:
                train_loss = self.train_epoch(
                    epoch=epoch,
                    model=self.model,
                    criterion=self.criterion,
                    optimizer=self.optimizer,
                    new_loader=self.sampler_train_loader,
                    pairwise=self.args.pairwise)

                if epoch % 4 == 0:
                    validate_start = datetime.datetime.now()

                    # Validate
                    with torch.no_grad():
                        embeddings, targets = self.extractEmbeddings(
                            model=self.model, train_loader=self.train_loader)

                    fts_means, labels = self.extract_feature_mean(
                        embeddings, targets)  # [n, feature_dimension], [n]

                    clf_knn = neighbors.KNeighborsClassifier(
                        n_neighbors=self.args.vote).fit(
                            embeddings.cpu().data.numpy(), targets)
                    clf_ncm = neighbors.NearestCentroid().fit(
                        fts_means.cpu().data.numpy(), labels)

                    # Train accuracy
                    train_accy, train_fts, train_lbls = self.validate(
                        args=self.args,
                        model=self.model,
                        loader=self.train_loader,
                        clf_knn=clf_knn,
                        clf_ncm=clf_ncm)

                    # Test accuracy
                    valid_accy, pred_fts, pred_lbls = self.validate(
                        args=self.args,
                        model=self.model,
                        loader=self.test_loader,
                        clf_knn=clf_knn,
                        clf_ncm=clf_ncm)

                    self.log(epoch,
                             train_loss,
                             train_accy,
                             valid_accy,
                             validate_start,
                             fts_means,
                             pred_lbls,
                             best_acc=best_acc)

                    if (train_accy[1] >= 0.96 and valid_accy[1] > best_acc
                        ) or epoch == self.args.epoch:
                        best_acc = max(best_acc, valid_accy[1])
                        best_epoch = epoch
                        preserved_embedding = self.preserve_image(
                            epoch, embeddings, targets, fts_means,
                            self.classes)
                        self.save_model(epoch, fts_means, preserved_embedding)

            elif self.increment_phase == -1:
                losses = self.train_cross_entropy(
                    train_loader=self.train_loader,
                    model=self.model,
                    criterion=self.criterion,
                    optimizer=self.optimizer,
                    epoch=epoch)
                # TODO

            end_time = datetime.datetime.now()
            self.f.write(
                'Best accy: {:.4f}, Best_epoch: {}, Time consumed: {}mins'.
                format(best_acc, best_epoch,
                       int(((end_time - start_time).seconds) / 60)))
            print('Best accy: {:.4f}, Best_epoch: {}, Time consumed: {}mins'.
                  format(best_acc, best_epoch,
                         int(((end_time - start_time).seconds) / 60)))
Example #24
# Level 2 Score: 

clf = linear_model.ElasticNetCV(cv=5, verbose=0)
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5,seed=rnd, category="regressor", filename = "ElasticNet", setused=setused)


# Level 2 Score: 

clf = linear_model.BayesianRidge()
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="regressor", filename = "BayesianRidge", setused=setused)


# Level 2 Score: 

clf = neighbors.NearestCentroid()
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="regressor", filename = "NearCentroid", setused=setused)


# Level 2 Score: 

clf = naive_bayes.GaussianNB()
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="classifier", filename = "GaussianNB", setused=setused)


# Level 2 Score, k=   2: 
# Level 2 Score, k=   4: 
# Level 2 Score, k=   8: 
# Level 2 Score, k=  16: 
# Level 2 Score, k=  32: 
# Level 2 Score, k=  64: 
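Note: blend_proba is a project helper that is not shown; it appears to produce out-of-fold level-2 predictions for stacking. A minimal sketch of the idea for the regressor path, assuming NumPy inputs (the real helper's exact signature and file-writing side effects are not reproduced):

import numpy as np
from sklearn.model_selection import KFold

def blend_proba(clf, X_train, y, X_test, nfolds=5, seed=0, **kwargs):
    # out-of-fold predictions for the train set, fold-averaged for the test set
    oof = np.zeros(len(X_train))
    test_pred = np.zeros(len(X_test))
    for tr, va in KFold(n_splits=nfolds, shuffle=True, random_state=seed).split(X_train):
        clf.fit(X_train[tr], y[tr])
        oof[va] = clf.predict(X_train[va])
        test_pred += clf.predict(X_test) / nfolds
    return oof, test_pred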
Example #25
# for c in c_logreg_values:
#     logreg_digit = linear_model.LogisticRegression(C=c, solver='liblinear')
#     logreg_digit.fit(digit_data_train, digit_targets_train)
#     digit_ypred_test = logreg_digit.predict(digit_data_test)
#     logreg_liblinear_acc.append(metrics.accuracy_score(digit_targets_test, digit_ypred_test))
#
# x_axis = c_logreg_values
# plt.plot(x_axis, logreg_acc_lbfgs_multi, x_axis, logreg_liblinear_acc)
# plt.title('Logistic Regression Accuracy Scores for different parameters and solvers')
# plt.legend(('lbgfs with multinomial multiclass', 'liblinear'))
# plt.xlabel('C')
# plt.ylabel('Accuracy Score')
# plt.axis('tight')
# plt.show()

nc_digit = neighbors.NearestCentroid()
nc_digit.fit(digit_data_train, digit_targets_train)

digit_targets_pred_nc = nc_digit.predict(digit_data_test)
print("Nearest Centroid Accuracy (Digit Dataset):",
      metrics.accuracy_score(digit_targets_test, digit_targets_pred_nc))

qda_digit = discriminant_analysis.QuadraticDiscriminantAnalysis()
qda_digit.fit(digit_data_train, digit_targets_train)
digit_targets_pred_qda = qda_digit.predict(digit_data_test)
print("QDA Accuracy (Digit Dataset):",
      metrics.accuracy_score(digit_targets_test, digit_targets_pred_qda))

# iris_data_split = np.reshape(iris_data, [10, len(iris_data)/10])
# iris_targets_split = np.reshape(iris_targets [10, len(iris_data)/10])