def train_nearest_centroid(params, x_train, y_train, n_folds, random_state, stratified=True, shuffle=True): # Model and hyperparameter selection if stratified: kf = StratifiedKFold(n_splits=n_folds, random_state=random_state, shuffle=shuffle) else: kf = KFold(n_splits=n_folds, random_state=random_state, shuffle=shuffle) nearest_centroid_model = NearestCentroid(**params) i = 0 # Model Training for (train_index, test_index) in kf.split(x_train, y_train): # cross-validation randomly splits train data into train and validation data print('\n Fold %d' % (i + 1)) x_train_cv, x_val_cv = x_train.iloc[train_index], x_train.iloc[ test_index] y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[ test_index] # declare your model nearest_centroid_model.fit(x_train_cv, y_train_cv) # predict train and validation set accuracy and get eval metrics scores_cv = nearest_centroid_model.predict(x_train_cv) scores_val = nearest_centroid_model.predict(x_val_cv) # training evaluation train_pc = accuracy_score(y_train_cv, scores_cv) train_pp = precision_score(y_train_cv, scores_cv) train_re = recall_score(y_train_cv, scores_cv) print('\n train-Accuracy: %.6f' % train_pc) print(' train-Precision: %.6f' % train_pp) print(' train-Recall: %.6f' % train_re) eval_pc = accuracy_score(y_val_cv, scores_val) eval_pp = precision_score(y_val_cv, scores_val) eval_re = recall_score(y_val_cv, scores_val) print('\n eval-Accuracy: %.6f' % eval_pc) print(' eval-Precision: %.6f' % eval_pp) print(' eval-Recall: %.6f' % eval_re) i = i + 1 # return model for evaluation and prediction return nearest_centroid_model
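# Note on the CV helper above (a standalone sketch, not from the original):
# precision_score and recall_score default to average='binary', so the fold
# loop only works as written for two-class targets. For multi-class labels,
# pass an explicit average such as 'macro', 'micro', or 'weighted':
from sklearn.metrics import precision_score, recall_score

y_true = [0, 1, 2, 2, 1, 0]
y_pred = [0, 2, 2, 2, 1, 0]
print(precision_score(y_true, y_pred, average='macro'))  # the default 'binary' would raise here
print(recall_score(y_true, y_pred, average='macro'))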
def build_model():
    clf = NearestCentroid()
    clf.fit(trainX, trainY)
    joblib.dump(clf, 'models/nearest.model')
    predictY = clf.predict(testX)
    acc = get_acc(predictY, testY)
    print('* acc on test:', acc)
    predictY = clf.predict(validX)
    acc = get_acc(predictY, validY)
    print('* acc on valid:', acc)
    return 0
def pickData(filename, class_numbers, training_instances, test_instances): data1 = np.genfromtxt(filename, delimiter=",") #### Reading File array = np.array(data1) data = array class_count = 0 test_instance = test_instances training_instance = training_instances count = 1 file_name = filename if (file_name == "HandWrittenLetters.txt"): class_count = 39 elif (file_name == "ATNTFaceImages400.txt"): class_count = 10 for i in range(len(class_numbers)): column_from = (class_numbers[i] - 1) * class_count column_to = column_from + class_count training_column_end = column_to - test_instance train_label = data[0, column_from:training_column_end] train_data = data[1:, column_from:training_column_end] test_label = data[0, training_column_end:column_to] test_data = data[1:, training_column_end:column_to] if (count == 1): train_label_final = train_label test_label_final = test_label train_data_final = train_data test_data_final = test_data count = 0 else: train_label_final = np.hstack((train_label_final, train_label)) test_label_final = np.hstack((test_label_final, test_label)) train_data_final = np.hstack((train_data_final, train_data)) test_data_final = np.hstack((test_data_final, test_data)) train_data_final_t = train_data_final.transpose() test_data_final_t = test_data_final.transpose() outfile(train_data_final, test_data_final, train_label_final, test_label_final) clf = NearestCentroid() clf.fit(train_data_final_t, train_label_final) predictions = clf.predict(test_data_final_t) print("Test set predictions:\n{}".format(clf.predict(test_data_final_t))) print("Test set accuracy: {:.2f}".format( clf.score(test_data_final_t, test_label_final)))
def nearest_centroid(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans nearest_centroid")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    clf = NearestCentroid()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print("#########################################################################################################\n")
    print("Nearest Centroid Classifier ")
    print("classification accuracy:", metrics.accuracy_score(y, y_pred))
    print("precision:", metrics.precision_score(y, y_pred))
    print("recall:", metrics.recall_score(y, y_pred))
    print("f1 score:", metrics.f1_score(y, y_pred))
    print("\n")
    print("#########################################################################################################\n")
    results = Output + "Nearest_Centroid_metrics.txt"
    out = open(results, "w")
    out.write("Nearest Centroid Classifier estimator accuracy\n")
    out.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    out.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    out.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    out.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    out.write("\n")
    out.write("True Value, Predicted Value, Iteration\n")
    for n in range(len(y)):
        out.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
    out.close()
    title = "Nearest Centroid Classifier"
    save = Output + "Nearest_Centroid_Classifier_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans nearest_centroid")
def rocchio_algorithm():
    clf = NearestCentroid()
    clf.fit(X_train, Y_train)
    pred_result = clf.predict(X_test)
    print(pred_result)
    print()
    print(Y_test)
    print('classification report: ')
    print(classification_report(Y_test, pred_result))
    print('f1 score')
    print(f1_score(Y_test, pred_result, average='macro'))
    print('accuracy score')
    print(accuracy_score(Y_test, pred_result))
    precision = precision_score(Y_test, pred_result, average=None)
    print("Precision : ")
    print(precision)
    recall = recall_score(Y_test, pred_result, average=None)
    print("Recall : ")
    print(recall)
def handwritingClassTest(self):
    hwLabels = []
    # load the training data set
    trainingFileList = listdir(Config.DATAS + 'KNN/digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = self.img2vector(
            Config.DATAS + 'KNN/digits/trainingDigits/%s' % fileNameStr)
    # train the classifier
    clf = NearestCentroid()
    clf.fit(trainingMat, hwLabels)
    testFileList = listdir(Config.DATAS + 'KNN/digits/testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = self.img2vector(Config.DATAS +
                                          'KNN/digits/testDigits/%s' % fileNameStr)
        # predict returns an array with one label per input row
        classifierResult = clf.predict(vectorUnderTest)[0]
        print("the classifier came back with: %d, the real answer is: %d" % (
            classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
def flw_dataset_classify(): f = Feature() paths, classes = loadFaceData('face.csv', nrows=82) X = [] y = [] for index, path in enumerate(paths): ar = f.getFeature(path) print(index, path) if ar.all() == 0: continue X.append(ar) y.append(classes[index]) X = np.array(X) y = np.array(y) print(X.shape) print(X) print(y) X_train_data, X_test_data, y_train_data, y_test_data = train_test_split( X, y, test_size=0.3, stratify=y) nearestCentroid = NearestCentroid() nearestCentroid.fit(X_train_data, y_train_data) predict_y = nearestCentroid.predict(X_test_data) acc = accuracy_score(y_test_data, predict_y) print(acc)
def classifyUsingKNNCentroid(trainX, trainY, testX, testY): print( '################# classifyUsingKNNCentroid() started ##################' ) start_time = time.time() clf = NearestCentroid() print('KNN Initialized') clf.fit(trainX, trainY) print('KNN Trained') predictedY = clf.predict(testX) print('KNN prediction completed') accuracy = accuracy_score(testY, predictedY) confusionMatrix = confusion_matrix(testY, predictedY) f1Score = f1_score(testY, predictedY, average='weighted') print('accuracy:', accuracy) print('confusionMatrix: ', confusionMatrix) print('f1Score: ', f1Score) print( '################# classifyUsingKNNCentroid() finished ##################' ) print("--- %s seconds ---" % (time.time() - start_time))
def exeML(mlmethod, xtr, ytr, xte, yte, islog=True, isfeatureselection=True):
    if islog:
        xtr = np.log(np.abs(xtr)).tolist()
        ytr = np.log(np.abs(ytr)).tolist()
        xte = np.log(np.abs(xte)).tolist()
        yte = np.log(np.abs(yte)).tolist()
    if isfeatureselection:
        estimator = SVR(kernel="linear")
        selector = RFE(estimator, n_features_to_select=100, step=1)
        selector = selector.fit(xtr, ytr)
        xtr = np.array(xtr)[:, selector.support_].tolist()
        xte = np.array(xte)[:, selector.support_].tolist()
    np.random.seed(1000)
    if mlmethod == "SVM":
        clf = svm.SVR(kernel='poly')
    elif mlmethod == "NeaNei":
        clf = NearestCentroid()
    elif mlmethod == "dtree":
        clf = tree.DecisionTreeClassifier()
    elif mlmethod == "lda":
        clf = lda(solver="svd")
    predval = []
    clf.fit(xtr, ytr)
    for i in range(len(xte)):
        # predict expects a 2D array, so wrap the single sample in a list
        predval.append(float(clf.predict([xte[i]])[0]))
    return predval
def r3_get_r2(r3list, label, app_statedict: dict, psdict: dict): print('Using history data to extract r2 belonging to r3') df = pd.DataFrame(columns=list(psdict.keys())) r3combi = {} # generating dataframe for key, ps in psdict.items(): app_state = app_statedict[key] state_num = len(app_state) clf = NearestCentroid() clf.fit(np.append([0], np.array([i.center_value for i in app_state])).reshape(-1, 1), np.array(range(state_num + 1))) df[key] = clf.predict(ps.values.reshape(-1, 1)) for idn, r3 in enumerate(r3list): idx = label == idn tempt = df.iloc[idx] combination = set([tuple(i) for i in list(tempt.values)]) r3combi[idn] = combination r2list = [] state_count = [] for r2row in combination: app_state_tuple = [] for kk, key in enumerate(df.columns): if r2row[kk] > 0: app_state_tuple.append(app_statedict[key][r2row[kk] - 1]) if app_state_tuple != []: r2list.append(State_r2(tuple(app_state_tuple))) state_count.append(np.count_nonzero((tempt == np.array(r2row)).all(1))) else: r2list.append(State_r2(None)) state_count.append(np.count_nonzero((tempt == np.array(r2row)).all(1))) r3.set_state_r2_list(r2list, False) r3.statecount = state_count if (len(r2list) != len(state_count)): print() print('r2 extracting finished')
def nearest_centroid(input_file, Output, test_size):
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print(X_train.shape, X_test.shape)
    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Nearest Centroid Classifier ")
    print("classification accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("precision:", metrics.precision_score(y_test, y_pred))
    print("recall:", metrics.recall_score(y_test, y_pred))
    print("f1 score:", metrics.f1_score(y_test, y_pred))
    print("\n")
    results = Output + "Nearest_Centroid_metrics_test.txt"
    out = open(results, "w")
    out.write("Nearest Centroid Classifier estimator accuracy\n")
    out.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    out.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    out.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    out.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    out.write("\n")
    out.write("True Value, Predicted Value, Iteration\n")
    for n in range(len(y_test)):
        out.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n + 1)))
    out.close()
    title = "Nearest Centroid %f" % test_size
    save = Output + "Nearest_Centroid_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans nearest_centroid")
def vote_scheme(df_peaks):
    # Apply a voting scheme using nearest centroids to see if peaks in the
    # other axes are near the one found for X. If a peak gets 3 votes then it
    # gives higher confidence that the motion was genuine
    n_peaks = [len(df_peaks[col].dropna()) for col in df_peaks.columns]

    # Use the direction with the greatest number of peaks as the base;
    # the other two directions vote on which peaks they have matching.
    # If they all have the same number of peaks, default to the X-axis
    if len(set(n_peaks)) == 1:
        base_dir = df_peaks.iloc[:, 0]
        voting_dirs = df_peaks.iloc[:, 1:]
    else:
        lower_n_peak = [peak != max(n_peaks) for peak in n_peaks]
        base_dir = df_peaks.X_filt_hp
        voting_dirs = df_peaks.loc[:, lower_n_peak]

    X = np.array(base_dir.values).reshape(-1, 1)
    y = np.array(base_dir.index.values)
    clf = NearestCentroid()
    clf.fit(X, y)

    total_votes = np.ones(len(base_dir))
    for col in voting_dirs:
        votes = clf.predict(
            np.array(voting_dirs[col].dropna().values).reshape(-1, 1))
        total_votes[votes] += 1
    peaks = (total_votes == len(df_peaks.columns))
    #print(peaks, votes)
    return peaks
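# How the voting trick above works (a toy sketch with illustrative numbers):
# fitting NearestCentroid with one sample per "class" makes each base peak its
# own centroid, so predict() returns the index of the nearest base peak for
# every vote.
import numpy as np
from sklearn.neighbors import NearestCentroid

base_peaks = np.array([10, 50, 90]).reshape(-1, 1)  # sample indices of the base peaks
labels = np.arange(len(base_peaks))                 # one label per peak
clf = NearestCentroid().fit(base_peaks, labels)
print(clf.predict(np.array([[12], [48]])))          # -> [0 1]: nearest base peak per vote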
def kNCN(x, Y, newData):
    global model_kncn, modelCreated_kncn, predictBuf_kncn, pbDetected_kncn
    if not modelCreated_kncn:
        print('Training Initiated. . .')
        feature = np.array(x, dtype=np.float32)
        # np.int was removed from NumPy; use the builtin int instead
        label = np.array(Y, dtype=int)
        model_kncn = NearestCentroid(metric='euclidean', shrink_threshold=None)
        model_kncn.fit(feature, label)
        modelCreated_kncn = True
        print('Training Complete')
    else:
        predicted = model_kncn.predict(newData)
        predictBuf_kncn = np.array(predicted, dtype=int)
        for i in range(len(predictBuf_kncn)):
            if predictBuf_kncn[i] == 0:
                for j in newData:
                    print('>>>Cost:', j, '\n>>>Prediction: Prolonged')
                pbDetected_kncn += 1
                print('Prolonged detection number', pbDetected_kncn)
            elif predictBuf_kncn[i] == 1:
                for j in newData:
                    print('>>>Cost:', j, '\n>>>Prediction: Right')
            elif predictBuf_kncn[i] == 2:
                for j in newData:
                    print('>>>Cost:', j, '\n>>>Prediction: Left')
            else:
                print('>>>hmmm...')
def predictor(final):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neighbors import NearestCentroid
    from sklearn import svm
    from sklearn.model_selection import cross_val_score

    KNN = KNeighborsClassifier()
    cen = NearestCentroid()
    SVM = svm.SVC()
    TrainX = final[1]
    TrainY = final[2]
    TestX = final[3]
    KNN.fit(TrainX, TrainY)
    cen.fit(TrainX, TrainY)
    SVM.fit(TrainX, TrainY)
    abc = []
    # print("The predicted values using KNN are", KNN.predict(TestX))
    abc.append(KNN.predict(TestX))
    # print("The predicted values using Centroid are", cen.predict(TestX))
    abc.append(cen.predict(TestX))
    # print("The predicted values using SVM are", SVM.predict(TestX))
    abc.append(SVM.predict(TestX))
    return abc
def scut_fbp_test(): f = Feature() # af1and5 0.890287769784 paths, classes = loadFaceData( './dataset/af1and5.csv', nrows=100) # './dataset/all(round_score).csv' for full class X = [] y = [] for index, path in enumerate(paths): ar = f.getFeature(path) print(index, path) if ar.all() == 0: continue X.append(ar) y.append(round(classes[index])) X = np.array(X) y = np.array(y) print(X.shape) print(X) print(y) X_train_data, X_test_data, y_train_data, y_test_data = train_test_split( X, y, test_size=0.3, stratify=y) nearestCentroid = NearestCentroid() nearestCentroid.fit(X_train_data, y_train_data) predict_y = nearestCentroid.predict(X_test_data) acc = accuracy_score(y_test_data, predict_y) print(acc)
def sk_nearest_neighbour(X_train, y_train, X_test, y_test):
    """
    Wrapper over sklearn's NearestCentroid (despite the function name,
    this is the nearest-centroid classifier, not k-NN).
    """
    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    c = np.sum(y_pred == y_test)
    accuracy = c * 100.0 / len(y_test)
    return accuracy, y_pred
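# The manual accuracy above matches the estimator's built-in score method
# (mean accuracy, inherited from ClassifierMixin); a quick sketch with toy data:
import numpy as np
from sklearn.neighbors import NearestCentroid

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([0, 0, 1, 1])
clf = NearestCentroid().fit(X, y)
assert clf.score(X, y) == np.mean(clf.predict(X) == y)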
def run_nearest_neighbour(feature, label): print ('nearest neighbour begin...\n') x_train, x_test, y_train, y_test = tts(feature, label, test_size=0.2) clf = NearestCentroid() clf.fit(x_train, y_train) preds = clf.predict(x_test) run_result(y_test, preds) print ('...nearest neighbour complete\n')
def knn_classify0(training_set, training_labels, test_set, test_labels, num_neighbors): clf = NearestCentroid(metric='euclidean') clf.fit(training_set, training_labels) input_test_predictions = clf.predict(test_set) test_result = np.sum( input_test_predictions == test_labels) * 100.0 / float( len(test_labels)) # type: ndarray return 0.0, test_result
def ml_algo(inp): df = pd.read_csv("data/final_preprocess.csv") X = np.array(df.drop(['Result'], axis=1)) y = np.array(df['Result']) X, y = shuffle(X, y, random_state=1) X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.2) model_centroid = NearestCentroid().fit(X_train, y_train) model_knn = KNeighborsClassifier(25).fit(X_train, y_train) model_svm = SVC().fit(X_train, y_train) model_lr = LinearRegression().fit(X_train, y_train) model_nb = BernoulliNB().fit(X_train, y_train) # criterion-> gini or entropy; splitter-> best or random; max_depth-> any integer value or None; # min_samples_split-> min no. of samples reqd. to split an internal node; # min_samples_leaf -> The minimum number of samples required to be at a leaf node. # min_impurity_split -> It defines the threshold for early stopping tree growth. model_dtree = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5).fit( X_train, y_train) # print ("[1] ACCURACY OF DIFFERENT MODELS ",'\n___________________') accu_centroid = model_centroid.score(X_test, y_test) # print ("NearestCentroid -> ", accu_centroid) accu_knn = model_knn.score(X_test, y_test) # print ("Knn -> ",accu_knn) accu_svm = model_svm.score(X_test, y_test) # print ("SVM -> ", accu_svm,) accu_lr = model_lr.score(X_test, y_test) # print ("Linear Regr -> ", accu_lr) accu_nb = model_nb.score(X_test, y_test) # print ("Naive Bayes -> ", accu_nb) accu_dtree = model_dtree.score(X_test, y_test) # print ("Decission Tree -> ", accu_dtree, "\n") result_centroid = model_centroid.predict(inp) result_knn = model_knn.predict(inp) result_svm = model_svm.predict(inp) result_lr = model_lr.predict(inp) result_nb = model_nb.predict(inp) result_dtree = model_dtree.predict(inp) # disease-name, description, [list of step to be taken], [list of to whom we can contact] # print ("[2] PREDICTION ",'\n___________________') # print ("NearestCentroid -> ", result_centroid) # print ("knn -> ", result_centroid) # print ("svm -> ", result_svm) # print ("LinearReg -> ", result_lr) # print ("Naive Bayes -> ", result_nb) # print ("Decission Tree -> ", result_dtree) # return map_disease[str(result_knn[0])] return result_knn[0]
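# Caveat for the model comparison above (an illustrative sketch, not from the
# original): LinearRegression.score returns the R^2 of the regression fit, not
# classification accuracy, so accu_lr is not directly comparable to the
# classifiers' scores. Thresholding the regression output gives an accuracy:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

X = np.array([[0], [1], [2], [3]])
y = np.array([0, 0, 1, 1])
lr = LinearRegression().fit(X, y)
print(lr.score(X, y))                                      # R^2, not accuracy
print(accuracy_score(y, (lr.predict(X) > 0.5).astype(int)))  # thresholded accuracy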
def _clustering(self, targetgame, games):
    '''
    Find similar games with clustering
    TODO
    '''
    preparegames = list(map(lambda x: [i[1] for i in x.data], games))
    preparegame = list(map(lambda x: x[1], targetgame.data))
    labels = list(range(len(games)))
    clf = NearestCentroid()
    clf.fit(preparegames, labels)
    # predict expects a 2D array: one row per sample
    print(clf.predict([preparegame]))
def nearestCentroid(self, x_train, y_train, x_test, y_test): # Test with Nearest Centroid clf = None clf = NearestCentroid(metric='euclidean') # Train created model clf.fit(x_train, y_train) # Predict on test data prediction = None prediction = clf.predict(x_test) # Log results self.logResults(y_test, prediction, kn=False)
def KNN_K1(Xtrain, Ytrain, Xtest, err, Name):
    from sklearn.neighbors import NearestCentroid
    clf = NearestCentroid()
    clf.fit(Xtrain, Ytrain)
    a = clf.predict(Xtest)
    result = pd.read_csv("./upload" + ".csv", sep=',', delimiter=None)
    result['proba'] = a * (1.0 - err[0]) + (1 - a) * err[1]
    result.to_csv(Name[0] + str(Name[1]) + ".csv",
                  sep=',',
                  encoding='utf-8',
                  index=False)
def knn_scargc(X, y, Ut): '''clf = KNeighborsClassifier(n_neighbors=1).fit(X, y) predicted_label = clf.predict(Ut.reshape(1, -1))''' best_distance, ind = NearestNeighbors(n_neighbors=1).fit(X).kneighbors( Ut.reshape(1, -1)) nearest = X[ind] clf = NearestCentroid(metric='euclidean').fit(X, y) predicted_label = clf.predict(Ut.reshape(1, -1)) #print(clf.centroids_) #exemplo [[ 0.25940611 -0.02868181] [ 5.450457 5.40674248]] #print("nearest scikit",nearest[0][0]) #print("predicted scikit",predicted_label) return predicted_label, best_distance, nearest[0]
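# The commented-out clf.centroids_ above is the fitted per-class centroid
# matrix with shape (n_classes, n_features); a standalone sketch:
import numpy as np
from sklearn.neighbors import NearestCentroid

X = np.array([[0.2, -0.1], [0.3, 0.0], [5.4, 5.3], [5.5, 5.5]])
y = np.array([0, 0, 1, 1])
clf = NearestCentroid(metric='euclidean').fit(X, y)
print(clf.centroids_)  # one row per class: the class means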
class centroid():
    def __init__(self, bm):
        self.input = File[bm]
        self.clf = NearestCentroid()
        self.x = []  # store the properties of the object
        self.y = []  # store the kind of class
        self.readData(bm)
        print('centroid Fitting...')
        self.clf.fit(self.x, self.y)
        print('Done!')

    def readData(self, bm):
        '''
        readData: get the data in the source data
        '''
        read = open(self.input, 'r')
        for line in read.readlines():
            one = line.strip('\n').split(' ')
            tmp = list()
            for i in range(N[bm]):
                tmp.append(float(one[i]))
            self.x.append(tmp)
            self.y.append(int(one[N[bm]]))
        read.close()
        self.x = np.array(self.x)
        self.y = np.array(self.y)

    def doIt(self):
        accuracy = 0
        for i in range(len(self.y)):
            # predict takes a 2D array, so wrap the single sample in a list
            if self.y[i] == self.clf.predict([self.x[i]])[0]:
                accuracy += 1
        print(accuracy * 1.0 / len(self.y))
        return accuracy * 1.0 / len(self.y)

    def predict(self, x):
        return self.clf.predict(x)
def predictor(TrainX_F, TrainY_F, TestX): cen = NearestCentroid() SVM = svm.SVC() regr = LinearRegression() cen.fit(TrainX_F, TrainY_F) SVM.fit(TrainX_F, TrainY_F) regr.fit(TrainX_F, TrainY_F) print("Centroid Predicted Labels: ", end='') print(cen.predict(TestX)) print("SVM Predicted Labels: ", end='') print(SVM.predict(TestX)) print("LR Predicted Labels: ", end='') print(regr.predict(TestX))
def itemB():
    train_dataset = load_nebulosa_train()
    # remove missing values
    # print(train_dataset)
    train_dataset = train_dataset[~np.isnan(train_dataset).any(axis=1)]
    train_dataset = train_dataset[:, 2:]
    train_target = train_dataset[:, -1]
    train_dataset = train_dataset[:, :-2]
    # train_dataset = normalize(train_dataset, axis=0)

    test_dataset = load_nebulosa_test()
    # remove missing values
    test_dataset = test_dataset[~np.isnan(test_dataset).any(axis=1)]
    test_dataset = test_dataset[:, 2:]
    test_target = test_dataset[:, -1]
    test_dataset = test_dataset[:, :-2]
    # print(test_dataset)
    # test_dataset = normalize(test_dataset, axis=1)
    # print(test_dataset)

    kbest = SelectKBest(f_classif, k=3).fit(train_dataset, train_target)
    train_dataset = kbest.transform(train_dataset)
    test_dataset = kbest.transform(test_dataset)
    # print(train_dataset)

    n_train_samples = train_dataset.shape[0]
    n_train_features = train_dataset.shape[1]
    # print("Nebulosa Train dataset: %d samples (%d features)" % (n_train_samples, n_train_features))
    n_test_samples = test_dataset.shape[0]
    n_test_features = test_dataset.shape[1]
    # print("Nebulosa Test dataset: %d samples (%d features)" % (n_test_samples, n_test_features))

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)
    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: accuracy (test): %.2f" % nn_accuracy_test)

    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)
    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: accuracy (test): %.2f" % nc_accuracy_test)
class NearestCentroidImpl(): def __init__(self, metric='euclidean', shrink_threshold=None): self._hyperparams = { 'metric': metric, 'shrink_threshold': shrink_threshold } def fit(self, X, y=None): self._sklearn_model = SKLModel(**self._hyperparams) if (y is not None): self._sklearn_model.fit(X, y) else: self._sklearn_model.fit(X) return self def predict(self, X): return self._sklearn_model.predict(X)
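# A short usage sketch for the wrapper above (the toy data is illustrative;
# SKLModel is assumed to be an alias for sklearn.neighbors.NearestCentroid):
import numpy as np

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
model = NearestCentroidImpl(metric='euclidean').fit(X, y)
print(model.predict([[0.5, 0.5]]))  # -> [2]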
class Knn(): def __init__(self, method, n_neighbors, weights, radius): if method == 'knn_class': self.clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) elif method == 'knn_rad': self.clf = RadiusNeighborsClassifier(radius=radius) elif method == 'knn_cent': self.clf = NearestCentroid() def train_model(self, train): self.clf.fit(train[0], train[1]) def predict(self, data): return self.clf.predict(data) def test_model(self, test): return self.clf.score(test[0], test[1])
def vote_scheme(df_peaks):
    # Apply a voting scheme using nearest centroids to see if peaks in the
    # other axes are near the one found for X. If a peak gets 3 votes then it
    # gives higher confidence that the motion was genuine
    n_peaks = [len(df_peaks[col].dropna()) for col in df_peaks.columns]

    # Use the direction with the greatest number of peaks as the base;
    # the other two directions vote on which peaks they have matching.
    # If they all have the same number of peaks, default to the X-axis
    if len(set(n_peaks)) == 1:
        base_dir = df_peaks.iloc[:, 0]
        voting_dirs = df_peaks.iloc[:, 1:]
    else:
        highest_n_peak = [peak == max(n_peaks) for peak in n_peaks]
        lower_n_peak = [peak != max(n_peaks) for peak in n_peaks]
        base_dir = df_peaks.iloc[:, highest_n_peak]
        voting_dirs = df_peaks.iloc[:, lower_n_peak].dropna()

    if len(base_dir) == 0 or len(voting_dirs) == 0:
        df_peaks_voted = pd.DataFrame()
        return df_peaks_voted

    # Fit a nearest-centroid model so both directions can vote for the
    # closest base peak; allow for a 5% max fluctuation
    X = np.array(base_dir.values).reshape(-1, 1)
    y = np.array(base_dir.index.values)
    clf = NearestCentroid()
    clf.fit(X, y)

    total_votes = np.ones(len(base_dir))
    for col in voting_dirs:
        votes = clf.predict(np.array(voting_dirs[col].values).reshape(-1, 1))
        total_votes[votes] += 1
    peaks_votes = (total_votes == len(df_peaks.columns))

    # Check how much each row differs - max 5%
    df_base = base_dir[peaks_votes]
    df_base.reset_index(drop=True, inplace=True)
    df_peaks_temp = pd.concat([df_base, voting_dirs], axis=1)
    df_peaks_temp['10pt_diff'] = df_peaks_temp['X_filt_bp'].sub(
        df_peaks_temp['Y_filt_bp']).abs() < 10
    df_peaks_voted = df_peaks_temp[df_peaks_temp['10pt_diff']]
    df_peaks_voted.drop('10pt_diff', axis=1, inplace=True)
    return df_peaks_voted
def itemA():
    train_dataset = load_nebulosa_train()
    train_target = train_dataset[:, -1]
    train_dataset = train_dataset[:, :-1]
    nam_target = np.where(np.isnan(train_target))
    train_target = np.delete(train_target, nam_target)
    train_dataset = np.delete(train_dataset, nam_target, 0)
    train_dataset = np.nan_to_num(train_dataset)

    test_dataset = load_nebulosa_test()
    test_target = test_dataset[:, -1]
    test_dataset = test_dataset[:, :-1]
    nam_target = np.where(np.isnan(test_target))
    test_target = np.delete(test_target, nam_target)
    test_dataset = np.delete(test_dataset, nam_target, 0)
    test_dataset = np.nan_to_num(test_dataset)

    n_train_samples = train_dataset.shape[0]
    n_train_features = train_dataset.shape[1]
    print("Nebulosa Train dataset: %d samples (%d features)" %
          (n_train_samples, n_train_features))
    n_test_samples = test_dataset.shape[0]
    n_test_features = test_dataset.shape[1]
    print("Nebulosa Test dataset: %d samples (%d features)" %
          (n_test_samples, n_test_features))

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)
    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: accuracy (test): %.2f" % nn_accuracy_test)

    # train_target[18] = 1
    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)
    # print(nc_target_pred_test)
    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: accuracy (test): %.2f" % nc_accuracy_test)
def predict_with_nearestcentroid(train_features, test_features, train_labels,
                                 test_labels, metric='cosine'):
    """Use a nearest centroid classifier to evaluate the results.

    :train_features, test_features: the feature vectors of the train and test sets
    :train_labels, test_labels: the labels of the train and test sets
    :metric: the metric used to compute distances
    :return: CRR and the center vector of each class
    :rtype: tuple
    """
    clf = NearestCentroid(metric=metric)
    clf.fit(train_features, train_labels)
    predicted = clf.predict(test_features)
    return cal_crr(test_labels, predicted), clf.centroids_
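# A direct sketch of the cosine-metric variant used above. Note that older
# scikit-learn releases accept any pairwise metric here, while recent ones
# restrict NearestCentroid to 'euclidean' and 'manhattan':
import numpy as np
from sklearn.neighbors import NearestCentroid

train_features = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
train_labels = np.array([0, 0, 1, 1])
clf = NearestCentroid(metric='cosine')
clf.fit(train_features, train_labels)
print(clf.predict([[0.8, 0.2]]))  # -> [0]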
def main(CV=False, PLOT=True): """Entry Point. Parameters ---------- CV: bool Cross-validation flag PLOT: bool Plotting flag """ _data = fetch_data() if CV: method, params = cross_validate(_data) else: method = 'l2' params = {'metric': chisquare} data = normalise(_data, method) X_train, y_train = data['train'] X_test, y_test = data['test'] classifier = NearestCentroid(**params) classifier.fit(X_train, y_train) print('ACCURACY: ', classifier.score(X_test, y_test)) if PLOT: y_hat = classifier.predict(X_test) cnf_matrix = confusion_matrix(y_test, y_hat) plot_confusion_matrix(cnf_matrix, classes=list(set(y_test)), title='Nearest Centroid\nConfusion Matrix', cmap=plt.cm.Blues) plt.savefig('data/out/nc_cnf_matrix.pdf', format='pdf', dpi=300, transparent=True)
def text_classify(X_train, X_test, y_train, y_test):
    """
    machine learning classifiers
    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :return:
    """
    # print('=' * 100)
    # print('start launching MLP Classifier......')
    # mlp = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(50, 30, 20, 20, 20, 30, 50), random_state=1)
    # mlp.fit(X_train, y_train)
    # print('finish launching MLP Classifier, the test accuracy is {:.5%}'.format(mlp.score(X_test, y_test)))

    print('=' * 100)
    print('start launching SVM Classifier......')
    svc = svm.SVC(decision_function_shape='ovo')
    svc.fit(X_train, y_train)
    print('finish launching SVM Classifier, the test accuracy is {:.5%}'.format(
        accuracy_score(svc.predict(X_test), y_test)))

    print('=' * 100)
    print('start launching Decision Tree Classifier......')
    dtree = tree.DecisionTreeClassifier()
    dtree.fit(X_train, y_train)
    print('finish launching Decision Tree Classifier, the test accuracy is {:.5%}'.format(
        accuracy_score(dtree.predict(X_test), y_test)))

    print('=' * 100)
    print('start launching Nearest Centroid Classifier......')
    nc = NearestCentroid()
    nc.fit(X_train, y_train)
    print('finish launching Nearest Centroid Classifier, the test accuracy is {:.5%}'.format(
        accuracy_score(nc.predict(X_test), y_test)))

    print('=' * 100)
    print('start launching Random Forest Classifier......')
    rf = RandomForestClassifier(n_estimators=20)
    rf.fit(X_train, y_train)
    print('finish launching Random Forest Classifier, the test accuracy is {:.5%}'.format(
        accuracy_score(rf.predict(X_test), y_test)))
# Takes a list, creates a csv file
def submitFile(x, pre):
    f = open(pre + '_submission.csv', 'w')
    for val in x:
        f.write(str(val) + ',\r')
    f.close()


# ==============================================================================
#    Nearest centroid classifier
# ==============================================================================
from sklearn.neighbors import NearestCentroid

ncc = NearestCentroid()
ncc.fit(features_to_train, targets_to_train)
predicted_targets = ncc.predict(features_to_test)

# Just print out the precision and f1 scores
print('precision: %0.5f' % metrics.precision_score(rf_benchmark_targets, predicted_targets))
print('f1 score: %0.5f' % metrics.f1_score(rf_benchmark_targets, predicted_targets))

# The following scores are used for classification models
# (zero_one_score / zero_one were renamed in later scikit-learn releases)
print('accuracy: %0.5f' % metrics.accuracy_score(rf_benchmark_targets, predicted_targets))
print('loss: %d' % metrics.zero_one_loss(rf_benchmark_targets, predicted_targets, normalize=False))

# ==============================================================================
#    Multinomial naive bayes
# ==============================================================================
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
from sklearn.neighbors import NearestCentroid
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))
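# The same estimator also implements the "nearest shrunken centroid" variant:
# shrink_threshold moves each class centroid toward the overall centroid and
# zeroes out small per-feature deviations, which acts as feature selection.
# A sketch on iris (the threshold values are illustrative):
from sklearn.datasets import load_iris
from sklearn.neighbors import NearestCentroid

X, y = load_iris(return_X_y=True)
for t in [None, 0.1, 0.5]:
    clf = NearestCentroid(shrink_threshold=t).fit(X, y)
    print(t, clf.score(X, y))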
class clusteringST:
    """
    Identification of sub-types for prediction
    """

    def __init__(self, verbose=True):
        self.verbose = verbose

    def fit(self, net_data_low, nSubtypes=3, reshape_w=True):
        # net_data_low = net_data_low_main.copy()
        self.flag_2level = False
        self.nnet_cluster = net_data_low.shape[1]
        self.nSubtypes = nSubtypes
        # ind_low_scale = cls.get_ind_high2low(low_res_template,orig_template)
        # self.ind_low_scale = ind_low_scale
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        # net_data_low = transform_low_scale(ts_data,self.ind_low_scale)
        # self.net_data_low = net_data_low
        self.normalized_net_template = []
        for i in range(net_data_low.shape[1]):
            # average template
            if nSubtypes < 1:
                self.normalized_net_template.append(np.zeros_like(net_data_low[0, i, :]).astype(float))
            else:
                self.normalized_net_template.append(np.mean(net_data_low[:, i, :], axis=0))

            # identity matrix of the correlation between subjects
            # tmp_subj_identity = np.corrcoef(net_data_low[:,i,:])
            # ind_st = cls.hclustering(tmp_subj_identity,nSubtypes)

            # subjects X network_nodes
            ind_st = cls.hclustering(net_data_low[:, i, :] - self.normalized_net_template[-1], nSubtypes)
            # ind_st = cls.hclustering(net_data_low[:,i,:],nSubtypes)
            for j in range(nSubtypes):
                if j == 0:
                    st_templates_tmp = np.median(net_data_low[:, i, :][ind_st == j + 1, :], axis=0)[np.newaxis, ...]
                else:
                    st_templates_tmp = np.vstack(
                        (
                            st_templates_tmp,
                            np.median(net_data_low[:, i, :][ind_st == j + 1, :], axis=0)[np.newaxis, ...],
                        )
                    )

            # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
            if i == 0:
                self.st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                self.st_templates = np.vstack((self.st_templates, st_templates_tmp[np.newaxis, ...]))
            del st_templates_tmp

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low, self.st_templates)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def norm_subjects(self, data, ref=[]):
        if len(data.shape) == 2:
            ref_avg_rmaps = ref.mean()
            avrg_rmaps = data.mean(1)
            scaling_factor = ref_avg_rmaps / avrg_rmaps
            return data * scaling_factor.reshape(-1, 1)
        else:
            ref_avg_rmaps = np.array(self.normalized_net_template).mean(1)
            avrg_rmaps = data.mean(2)
            scaling_factor = ref_avg_rmaps / avrg_rmaps
            print(ref_avg_rmaps.shape, avrg_rmaps.shape, scaling_factor.shape)
            return np.swapaxes(np.swapaxes(data, 0, 2) * np.swapaxes(scaling_factor, 0, 1), 0, 2)

    def robust_st(self, net_data_low, nSubtypes, n_iter=50):
        bs_cluster = []
        n = net_data_low.shape[0]
        stab_ = np.zeros((n, n)).astype(float)
        rs = ShuffleSplit(net_data_low.shape[0], n_iter=n_iter, test_size=0.05, random_state=1)
        for train, test in rs:
            # identity matrix of the correlation between subjects
            ind_st = cls.hclustering(net_data_low[train, :], nSubtypes)
            mat_ = (cls.ind2matrix(ind_st) > 0).astype(float)
            for ii in range(len(train)):
                stab_[train, train[ii]] += mat_[:, ii]
        stab_ = stab_ / n_iter
        ms = KMeans(nSubtypes)
        ind = ms.fit_predict(stab_)
        # row_clusters = linkage(stab_, method='ward')
        # ind = fcluster(row_clusters, nSubtypes, criterion='maxclust')
        return ind + 1, stab_

    def fit_robust(self, net_data_low, nSubtypes=3, reshape_w=True, stab_thereshold=0.5):
        self.flag_2level = False
        self.nnet_cluster = net_data_low.shape[1]
        self.nSubtypes = nSubtypes
        self.normalized_net_template = []
        for i in range(net_data_low.shape[1]):
            # average template
            self.normalized_net_template.append(np.mean(net_data_low[:, i, :], axis=0))
            # self.normalized_net_template.append(np.zeros_like(net_data_low[0,i,:]))

            # identity matrix of the correlation between subjects
            # ind_st = cls.hclustering(net_data_low[:,i,:],nSubtypes)
            ind_st, stab_ = self.robust_st(net_data_low[:, i, :] - self.normalized_net_template[-1], nSubtypes)

            for j in range(nSubtypes):
                mask_stable = (stab_[ind_st == j + 1, :].mean(0) > stab_thereshold)[ind_st == j + 1]
                if self.verbose:
                    print("Robust: new N ", mask_stable.sum(), " old N ", mask_stable.shape)
                data_ = net_data_low[ind_st == j + 1, i, :][mask_stable, :]
                if j == 0:
                    st_templates_tmp = np.median(data_, axis=0)[np.newaxis, ...]
                else:
                    st_templates_tmp = np.vstack((st_templates_tmp, np.median(data_, axis=0)[np.newaxis, ...]))

            # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
            if i == 0:
                self.st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                self.st_templates = np.vstack((self.st_templates, st_templates_tmp[np.newaxis, ...]))
            del st_templates_tmp

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low, self.st_templates)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def fit_robust_network(self, net_data_low, nSubtypes=3, reshape_w=True, stab_thereshold=0.5):
        self.flag_2level = False
        self.nnet_cluster = 1
        self.nSubtypes = nSubtypes
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        self.normalized_net_template = []
        # average template
        self.normalized_net_template.append(np.mean(net_data_low[:, :], axis=0))
        # self.normalized_net_template.append(np.zeros_like(net_data_low[0,:]))

        # identity matrix of the correlation between subjects
        ind_st, stab_ = self.robust_st(net_data_low - self.normalized_net_template[-1], nSubtypes)
        for j in range(nSubtypes):
            mask_stable = (stab_[ind_st == j + 1, :].mean(0) > stab_thereshold)[ind_st == j + 1]
            if self.verbose:
                print("Robust: new N ", mask_stable.sum(), " old N ", mask_stable.shape)
            data_ = net_data_low[ind_st == j + 1, :][mask_stable, :]
            if j == 0:
                st_templates_tmp = np.median(data_, axis=0)[np.newaxis, ...]
            else:
                st_templates_tmp = np.vstack((st_templates_tmp, np.median(data_, axis=0)[np.newaxis, ...]))

        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        self.st_templates = st_templates_tmp[np.newaxis, ...]
        del st_templates_tmp

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low, self.st_templates)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def fit_network(self, net_data_low, nSubtypes=3, reshape_w=True):
        self.flag_2level = False
        self.nnet_cluster = 1
        self.nSubtypes = nSubtypes
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        self.normalized_net_template = []
        # average template
        self.normalized_net_template.append(np.mean(net_data_low, axis=0))
        # self.normalized_net_template.append(np.zeros_like(net_data_low[0,:]))

        # identity matrix of the correlation between subjects
        ind_st = cls.hclustering(net_data_low - self.normalized_net_template[-1], nSubtypes)
        for j in range(nSubtypes):
            data_tmp = np.median(net_data_low[ind_st == j + 1, :] - self.normalized_net_template[-1], axis=0)[
                np.newaxis, ...
            ]
            if j == 0:
                st_templates_tmp = data_tmp
            else:
                st_templates_tmp = np.vstack((st_templates_tmp, data_tmp))

        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        self.st_templates = st_templates_tmp[np.newaxis, ...]
        del st_templates_tmp

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low, self.st_templates)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def fit_2level(self, net_data_low_l1, net_data_low_l2, nSubtypes_l1=5, nSubtypes_l2=2, reshape_w=True):
        self.flag_2level = True
        self.nnet_cluster = net_data_low_l1.shape[1]
        self.nSubtypes = nSubtypes_l1 * nSubtypes_l2
        self.nSubtypes_l1 = nSubtypes_l1
        self.nSubtypes_l2 = nSubtypes_l2
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        self.net_data_low = net_data_low_l1
        self.net_data_low_l2 = net_data_low_l2

        ####
        # LEVEL 1
        ####
        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        st_templates = []
        for i in range(net_data_low_l1.shape[1]):
            # identity matrix of the correlation between subjects
            ind_st = cls.hclustering(net_data_low_l1[:, i, :], nSubtypes_l1)
            for j in range(nSubtypes_l1):
                if j == 0:
                    st_templates_tmp = net_data_low_l1[:, i, :][ind_st == j + 1, :].mean(axis=0)[np.newaxis, ...]
                else:
                    st_templates_tmp = np.vstack(
                        (st_templates_tmp, net_data_low_l1[:, i, :][ind_st == j + 1, :].mean(axis=0)[np.newaxis, ...])
                    )
            if i == 0:
                st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                st_templates = np.vstack((st_templates, st_templates_tmp[np.newaxis, ...]))

        self.st_templates_l1 = st_templates
        # calculate the weights for each subject
        # W --> Dimensions: nSubjects, nNetwork_low, nSubtypes
        net_data_low_l2_tmp = np.vstack((net_data_low_l1, net_data_low_l2))
        self.W_l1 = self.compute_weights(net_data_low_l2_tmp, self.st_templates_l1)

        ####
        # LEVEL 2
        ####
        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        st_templates = []
        # st_templates = self.st_templates_l1.copy()
        # st_templates = st_templates[:,:,np.newaxis,:]
        for i in range(net_data_low_l2.shape[1]):
            # Iterate on all the Level1 subtypes (normal variability subtypes)
            for k in range(self.st_templates_l1.shape[1]):
                # Find the L1 subtype
                max_w = np.max(self.W_l1[:, i, :], axis=1)
                mask_selected_subj = self.W_l1[:, i, k] == max_w
                template2substract = self.st_templates_l1[i, k, :]
                if np.sum(mask_selected_subj) <= 3:
                    print("Less than 2 subjects for network: " + str(i) + " level1 ST: " + str(k))
                    for j in range(nSubtypes_l2):
                        if (k == 0) & (j == 0):
                            st_templates_tmp = self.st_templates_l1[i, k, :][np.newaxis, ...]
                        else:
                            st_templates_tmp = np.vstack(
                                (st_templates_tmp, self.st_templates_l1[i, k, :][np.newaxis, ...])
                            )
                else:
                    # identity matrix of the correlation between subjects
                    ind_st = cls.hclustering(
                        net_data_low_l2_tmp[:, i, :][mask_selected_subj, ...] - template2substract, nSubtypes_l2
                    )
                    # ind_st = cls.hclustering(net_data_low[:,i,:],nSubtypes)
                    if len(np.unique(ind_st)) < nSubtypes_l2:
                        print(
                            "Clustering generated fewer classes than requested, nsubjects: "
                            + str(len(ind_st))
                            + " network: "
                            + str(i)
                            + " level1 ST: "
                            + str(k)
                        )
                    # if (i==6) & (k==3):
                    #     print(ind_st)
                    for j in range(nSubtypes_l2):
                        if (k == 0) & (j == 0):
                            st_templates_tmp = (
                                net_data_low_l2_tmp[:, i, :][mask_selected_subj, ...][ind_st == j + 1, :]
                                - template2substract
                            ).mean(axis=0)[np.newaxis, ...]
                        else:
                            st_templates_tmp = np.vstack(
                                (
                                    st_templates_tmp,
                                    (
                                        net_data_low_l2_tmp[:, i, :][mask_selected_subj, ...][ind_st == j + 1, :]
                                        - template2substract
                                    ).mean(axis=0)[np.newaxis, ...],
                                )
                            )
            if i == 0:
                st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                print(st_templates.shape, st_templates_tmp.shape)
                st_templates = np.vstack((st_templates, st_templates_tmp[np.newaxis, ...]))

        self.st_templates_l2 = st_templates
        # calculate the weights for each subject
        self.W_l2 = self.compute_weights(net_data_low_l2, self.st_templates_l2)
        if reshape_w:
            return self.reshapeW(self.W_l2)
        else:
            return self.W_l2

    def compute_weights_old(self, net_data_low, st_templates):
        # calculate the weights for each subject
        W = np.zeros((net_data_low.shape[0], st_templates.shape[0], st_templates.shape[1]))
        for i in range(net_data_low.shape[0]):
            for j in range(st_templates.shape[0]):
                for k in range(st_templates.shape[1]):
                    # Demean
                    average_template = np.median(self.net_data_low[:, j, :], axis=0)
                    # average_template = self.st_templates[j,:,:].mean(axis=0)
                    dm_map = net_data_low[i, j, :] - average_template
                    dm_map = preprocessing.scale(dm_map)
                    st_dm_map = st_templates[j, k, :] - average_template
                    W[i, j, k] = np.corrcoef(st_dm_map, dm_map)[-1, 0:-1]
        return W

    def compute_weights(self, net_data_low, st_templates=None, mask_part=None):
        # `st_templates == []` is ambiguous once an ndarray is passed in,
        # so use an explicit None/empty check instead
        if st_templates is None or len(st_templates) == 0:
            st_templates = self.st_templates
        # calculate the weights for each subject
        for j in range(st_templates.shape[0]):
            average_template = self.normalized_net_template[j]
            if len(net_data_low.shape) == 2:
                rmaps = net_data_low - average_template
            else:
                rmaps = net_data_low[:, j, :] - average_template
            st_rmap = st_templates[j, :, :] - average_template
            tmp_rmap = self.compute_w(rmaps, st_rmap, mask_part)
            if j == 0:
                W = np.zeros((net_data_low.shape[0], st_templates.shape[0], tmp_rmap.shape[1]))
            W[:, j, :] = tmp_rmap
        return np.nan_to_num(W)

    def compute_w_global(self, X, ref):
        range_ = 1
        if len(X.shape) == 3:
            # multiple networks
            for net in range(X.shape[1]):
                if len(ref.shape) > 2:
                    range_ = ref.shape[1]
                w_global = np.corrcoef(ref[net, ...], X[:, net, :])[range_:, 0:range_]
        else:
            # one network
            if len(ref.shape) > 1:
                range_ = ref.shape[0]
            w_global = np.corrcoef(ref, X)[range_:, 0:range_]
        return w_global

    def compute_w(self, X, ref, mask_part=None):
        if mask_part is not None and len(mask_part) > 0:
            # sub-weights based on the partition
            w_ = []
            list_id = np.unique(mask_part)
            for idx in np.delete(list_id, np.where(list_id == 0)):
                mask_ = mask_part == idx
                w_.append(self.compute_w_global(X[..., mask_], ref[..., mask_]))
            w_ = np.hstack(w_)
        else:
            # global mode, no sub-partition
            w_ = self.compute_w_global(X, ref)
        return w_

    def compute_weights_l2(self, net_data_low):
        corrected_ndl = net_data_low.copy()
        W_l1 = self.compute_weights(net_data_low, self.st_templates_l1)
        # calculate the weights for each subject
        for i in range(net_data_low.shape[1]):
            for k in range(self.st_templates_l1.shape[1]):
                # Find the L1 subtype
                max_w = np.max(W_l1[:, i, :], axis=1)
                mask_selected_subj = W_l1[:, i, k] == max_w
                corrected_ndl[mask_selected_subj, i, :] = (
                    corrected_ndl[mask_selected_subj, i, :] - self.st_templates_l1[i, k, :]
                )
        return self.compute_weights(corrected_ndl, self.st_templates_l2)

    def transform(self, net_data_low, mask_part=None, reshape_w=True):
        """
        Calculate the weights for each previously computed sub-type
        """
        # compute the low scale version of the data
        # net_data_low = transform_low_scale(ts_data,self.ind_low_scale)
        if self.flag_2level:
            # calculate the weights for each subject
            # W = self.compute_weights(net_data_low,self.st_templates_l2)
            W = self.compute_weights_l2(net_data_low)
        else:
            # calculate the weights for each subject
            W = self.compute_weights(net_data_low, self.st_templates, mask_part)
        if reshape_w:
            return self.reshapeW(W)
        else:
            return W

    def reshapeW(self, W):
        # reshape the matrix from [subjects, nSubtypes, weights] to [subjects, vector of weights]
        xw = W.reshape((W.shape[0], W.shape[1] * W.shape[2]))
        return xw

    def fit_dev(self, net_data, nnet_cluster="auto", nSubtypes=3):
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes
        if nnet_cluster == "auto":
            # self.nnet_cluster = self.getClusters(net_data)
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo="meanshift")
        else:
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo="kmeans")
        # self.valid_cluster = self.clust_list
        # self.valid_net_idx = range(len(self.valid_cluster))
        for i in range(net_data.shape[0]):
            if i == 0:
                self.assign_net = self.assigneDist(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
            else:
                self.assign_net = np.vstack(
                    ((self.assign_net, self.assigneDist(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)))
                )
        print("Size of the new data map: ", self.assign_net.shape)

        # group subjects by the networks that classify them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net, self.nSubtypes)
        # save the centroids in a classifier
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net, self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        # print("score: ", self.clf_subtypes.score(self.assign_net, self.consensus))
        return self.consensus
# range(2, 74) means it goes from col 2 to col 73
df_input_data = df_input[list(range(2, 74))].values
# test with a few good features as determined through PCA?
df_input_target = df_input[list(range(0, 1))].values
colors = numpy.random.rand(len(df_input_target))

# splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_input_data, df_input_target.tolist())

# nearest centroid classifier
from sklearn.neighbors import NearestCentroid
knc = NearestCentroid()
knc.fit(X_train[:], numpy.ravel(y_train[:]))
predicted = knc.predict(X_test)
print(y_test[60:90], len(y_test[60:90]))
print(predicted[60:90], len(predicted[60:90]))
print(knc.classes_)

# Prediction Performance Measurement
matches = (predicted == [item for sublist in y_test for item in sublist])
print(matches.sum())
print(len(matches))
print(matches[10:50], len(matches[10:50]))
print("Accuracy : ", (matches.sum() / float(len(matches))))
class clusteringST:
    '''
    Identification of sub-types for prediction
    '''

    def getClusters(self, net_data):
        self.avg_bin_mat = np.zeros((net_data.shape[0], net_data.shape[0]))
        self.avg_n_clusters = 0
        self.clust_list = []
        for i in range(net_data.shape[2]):
            ms = MeanShift()
            ms.fit(net_data[:, :, i])
            self.clust_list.append(ms)
            labels = ms.labels_
            cluster_centers = ms.cluster_centers_
            n_clusters_ = len(np.unique(labels))
            #print(labels,cluster_centers.shape,n_clusters_)
            #bin_mat = np.zeros(avg_bin_mat.shape)
            bin_mat = cls.ind2matrix(labels + 1) > 0
            self.avg_bin_mat += bin_mat
            self.avg_n_clusters += n_clusters_
        self.avg_bin_mat /= net_data.shape[2]
        self.avg_n_clusters /= net_data.shape[2]
        return self.avg_n_clusters

    def getMeanClustering(self):
        return self.avg_bin_mat

    def get_match_network(self, net_data, ncluster, algo='kmeans'):
        '''
        net_data: 3d volume (subjects x vecnetwork x vecnetwork)
        ncluster: number of groups to partition the subjects
        algo: (default: kmeans) kmeans, meanshift.
        '''
        valid_net_idx = []
        valid_cluster = []
        self.avg_bin_mat = np.zeros((net_data.shape[0], net_data.shape[0]))
        self.avg_n_clusters = 0
        for i in range(net_data.shape[2]):
            # Compute a clustering for each network
            if algo == 'kmeans':
                clust = KMeans(init='k-means++', n_clusters=ncluster, n_init=10)
            else:
                clust = MeanShift()
            #t0 = time.time()
            clust.fit(net_data[:, :, i])
            #t_batch = time.time() - t0
            # Compute the stability matrix among networks
            bin_mat = cls.ind2matrix(clust.labels_ + 1) > 0
            self.avg_bin_mat += bin_mat
            self.avg_n_clusters += len(np.unique(clust.labels_))
            valid_cluster.append(clust)
            valid_net_idx.append(i)
        self.avg_bin_mat /= net_data.shape[2]
        self.avg_n_clusters /= net_data.shape[2]
        return valid_cluster, valid_net_idx

    def assigneSubtype(self, nets, valid_cluster, valid_net_idx):
        classes = []
        dist_centroid = np.array([])
        for i in range(len(valid_net_idx)):
            classes.append(valid_cluster[i].predict(nets[:, valid_net_idx[i]])[0])
            #points = np.vstack((nets[:,valid_net_idx[i]],valid_cluster[i].cluster_centers_))
            #dist_ = squareform(pdist(points, metric='euclidean'))[0,1:]
            #classes.append(np.argmin(dist_))
            points = np.vstack((nets[:, valid_net_idx[i]], valid_cluster[i].cluster_centers_))
            dist_ = squareform(pdist(points, metric='euclidean'))[0, 1:]
            dist_centroid = np.hstack((dist_centroid, dist_))
        return classes, dist_centroid

    def assigneDist(self, nets, valid_cluster, valid_net_idx):
        classes = np.array([])
        for i in range(len(valid_net_idx)):
            #print(np.hstack((classes,(valid_cluster[i].transform(nets[:,valid_net_idx[i]])[0]))))
            points = np.vstack((nets[:, valid_net_idx[i]], valid_cluster[i].cluster_centers_))
            dist_ = squareform(pdist(points, metric='euclidean'))[0, 1:]
            #dist_ = squareform(pdist(points, metric='correlation'))[0,1:]
            classes = np.hstack((classes, dist_))
            #classes.append(np.argmin(dist_))
        return classes

    def fit_old(self, net_data, nnet_cluster='auto', nSubtypes=3):
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes
        if nnet_cluster == 'auto':
            #self.nnet_cluster = self.getClusters(net_data)
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo='meanshift')
        else:
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo='kmeans')
        #self.valid_cluster = self.clust_list
        #self.valid_net_idx = range(len(self.valid_cluster))
        self.assign_net = np.array([])
        self.dist_net = np.array([])
        for i in range(net_data.shape[0]):
            if i == 0:
                classes_, dist_ = self.assigneSubtype(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
                self.dist_net = dist_
                self.assign_net = classes_
            else:
                classes_, dist_ = self.assigneSubtype(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
                self.dist_net = np.vstack((self.dist_net, dist_))
                self.assign_net = np.vstack((self.assign_net, classes_))

        # group subjects by the networks that classify them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net, self.nSubtypes)
        # save the centroids in a classifier
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net, self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        #print("score: ", self.clf_subtypes.score(self.assign_net, self.consensus))
        return self.consensus

    def transform_low_scale_old(self, net_data):
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        nnet_cluster = np.max(self.ind_low_scale)
        net_data_low = []
        net_data_low = np.zeros((net_data.shape[0], nnet_cluster, net_data.shape[2]))
        for i in range(nnet_cluster):
            # average the appropriate parcels and scale them
            #net_data_low[:,i,:] = preprocessing.scale(net_data[:,self.ind_low_scale==i+1,:].mean(axis=1), axis=1)
            net_data_low[:, i, :] = net_data[:, self.ind_low_scale == i + 1, :].mean(axis=1)
        return net_data_low

    def fit(self, net_data_low, nSubtypes=3, reshape_w=True):
        self.nnet_cluster = net_data_low.shape[1]
        self.nSubtypes = nSubtypes
        #ind_low_scale = cls.get_ind_high2low(low_res_template,orig_template)
        #self.ind_low_scale = ind_low_scale
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        #net_data_low = transform_low_scale(ts_data,self.ind_low_scale)
        self.net_data_low = net_data_low

        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        st_templates = []
        for i in range(len(net_data_low[1])):
            # identity matrix of the correlation between subjects
            #tmp_subj_identity = np.corrcoef(net_data_low[:,i,:])
            #ind_st = cls.hclustering(tmp_subj_identity,nSubtypes)
            # subjects X network_nodes
            #ind_st = cls.hclustering(net_data_low[:,i,:]-np.mean(net_data_low[:,i,:],axis=0),nSubtypes)
            ind_st = cls.hclustering(net_data_low[:, i, :], nSubtypes)
            for j in range(nSubtypes):
                if j == 0:
                    st_templates_tmp = net_data_low[:, i, :][ind_st == j + 1, :].mean(axis=0)[np.newaxis, ...]
                else:
                    st_templates_tmp = np.vstack(
                        (st_templates_tmp, net_data_low[:, i, :][ind_st == j + 1, :].mean(axis=0)[np.newaxis, ...]))
            if i == 0:
                st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                st_templates = np.vstack((st_templates, st_templates_tmp[np.newaxis, ...]))

        self.st_templates = st_templates

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def compute_weights(self, net_data_low):
        # calculate the weights for each subject
        W = np.zeros((net_data_low.shape[0], self.st_templates.shape[0], self.st_templates.shape[1]))
        for i in range(net_data_low.shape[0]):
            for j in range(self.st_templates.shape[0]):
                for k in range(self.st_templates.shape[1]):
                    # Demean
                    average_template = np.median(self.net_data_low[:, j, :], axis=0)
                    #average_template = self.st_templates[j,:,:].mean(axis=0)
                    dm_map = net_data_low[i, j, :] - average_template
                    dm_map = preprocessing.scale(dm_map)
                    st_dm_map = self.st_templates[j, k, :] - average_template
                    W[i, j, k] = np.corrcoef(st_dm_map, dm_map)[-1, 0:-1]
        return W

    def transform(self, net_data_low, reshape_w=True):
        '''
        Calculate the weights for each previously computed sub-type
        '''
        # compute the low scale version of the data
        #net_data_low = transform_low_scale(ts_data,self.ind_low_scale)

        # calculate the weights for each subject
        W = self.compute_weights(net_data_low)
        if reshape_w:
            return self.reshapeW(W)
        else:
            return W

    def reshapeW(self, W):
        # reshape the matrix from [subjects, nSubtypes, weights] to [subjects, vector of weights]
        xw = W.reshape((W.shape[0], W.shape[1] * W.shape[2]))
        return xw

    def fit_dev(self, net_data, nnet_cluster='auto', nSubtypes=3):
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes
        if nnet_cluster == 'auto':
            #self.nnet_cluster = self.getClusters(net_data)
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo='meanshift')
        else:
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo='kmeans')
        #self.valid_cluster = self.clust_list
        #self.valid_net_idx = range(len(self.valid_cluster))
        for i in range(net_data.shape[0]):
            if i == 0:
                self.assign_net = self.assigneDist(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
            else:
                self.assign_net = np.vstack(((self.assign_net, self.assigneDist(net_data[i, :, :], self.valid_cluster, self.valid_net_idx))))
        print('Size of the new data map: ', self.assign_net.shape)

        # group subjects by the networks that classify them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net, self.nSubtypes)
        # save the centroids in a classifier
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net, self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        #print("score: ", self.clf_subtypes.score(self.assign_net, self.consensus))
        return self.consensus
#!/usr/bin/python
from sklearn.neighbors import NearestCentroid
import numpy

X = numpy.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = numpy.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)
# predict expects a 2D array: one row per sample
print(clf.predict([[0, 1]]))
# SGD classifier - gives about 73% accuracy
cl4 = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                    fit_intercept=True, n_iter=5, shuffle=True, verbose=0,
                    epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal',
                    eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, average=False)
cl4.fit(X_train, target)
pr4 = cl4.predict(X_test)
allpred += pr4
print "SGD: " + "%.2f" % (evaluate(pr4, test_jokes)) + "%"

# Nearest centroid classifier (not kNN) - gives about 59% accuracy
cl5 = NearestCentroid()
cl5.fit(X_train, target)
pr5 = cl5.predict(X_test)
print "Nearest centroid: " + "%.2f" % (evaluate(pr5, test_jokes)) + "%"

# Decision tree classifier - gives about 75% accuracy
cl6 = tree.DecisionTreeClassifier()
cl6.fit(X_train, target)
pr6 = cl6.predict(X_test)
allpred += pr6
print "Decision tree: " + "%.2f" % (evaluate(pr6, test_jokes)) + "%"

# majority vote over the summed 0/1 predictions
maxpred = max(allpred)
pr7 = [1 if x > maxpred / 2 else 0 for x in allpred]
print "Bagging: " + "%.2f" % (evaluate(pr7, test_jokes)) + "%"
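# The block above sums 0/1 prediction vectors from several classifiers and
# thresholds the sum to get an ensemble vote. A minimal sketch of that idea on
# stand-in arrays; the toy matrix and the threshold "more than half the models"
# are illustrative assumptions (the snippet above thresholds on maxpred/2
# instead).
import numpy as np

preds = np.array([[1, 0, 1, 1],    # classifier 1
                  [1, 0, 0, 1],    # classifier 2
                  [0, 0, 1, 1]])   # classifier 3
votes = preds.sum(axis=0)          # per-sample vote counts
n_models = preds.shape[0]
majority = [1 if v > n_models / 2.0 else 0 for v in votes]
print "Majority vote:", majority   # -> [1, 0, 1, 1]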
from sklearn.neighbors.nearest_centroid import NearestCentroid
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)
print clf.predict([[-0.8, -1]])
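# For reference, the fitted model above exposes the per-class centroids, and
# predict assigns each row of a 2-D array to its nearest centroid. A short
# continuation of the same toy session:
print clf.centroids_                   # one row per class, ordered like clf.classes_
print clf.classes_
print clf.predict([[-1, 0], [2, 2]])   # -> [1 2]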
    all_instances.append(row1)
    if row1[0] > maxlength:
        maxlength = row1[0]

for row2 in negative:
    row2 = row2[:-1]
    row2 = row2.split(',')
    row2 = [int(i) for i in row2]
    all_instances.append(row2)
    if row2[0] > maxlength:
        maxlength = row2[0]

# normalise the first feature (float() avoids integer truncation in Python 2)
for instance in all_instances:
    instance[0] = instance[0] / float(maxlength)

random.shuffle(all_instances)
# print all_instances[0:700]
print "all_instances size: ", len(all_instances)

train_set = np.array(all_instances[0:700])
test_set = np.array(all_instances[700:])   # was [701:], which silently dropped instance 700
print train_set[:,:-1]

X = np.array(train_set[:,:-1])
Y = np.array(train_set[:,-1])
clf = NearestCentroid()
clf.fit(X, Y)
prediction = clf.predict(test_set[:,:-1])
evaluation(prediction, test_set[:,-1])
def myclassify_AudPow(numfiers, xtrain_1, xtrain_2, ytrain_1, ytrain_2, xtest):
    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest = xtest[~np.isnan(xtest).any(axis=1),:]
    xtest = xtest[~np.isinf(xtest).any(axis=1),:]

    xtrain = np.append(xtrain_1, xtrain_2, 0)
    ytrain = np.append(ytrain_1, ytrain_2)
    ytrain = np.ravel(ytrain)

    xtrunclength = sio.loadmat('../Files/xtrunclength.mat')
    xtrunclength = xtrunclength['xtrunclength'][0]

    # if xtest is an NxM matrix, returns an N x numfiers matrix where each
    # column corresponds to one classifier's prediction vector
    count = 0
    # print numfiers
    predictionMat = np.empty((xtest.shape[0], numfiers))
    predictionStringMat = []
    finalPredMat = []

    bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
    bagging2.fit(xtrain, ytrain)
    #print bagging2.score(xtest, ytest)
    ytest = bagging2.predict(xtest)
    predictionMat[:,count] = ytest
    count += 1

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        #print bagging1.score(xtest, ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # voting classifiers combine completely different machine learning
        # classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()

        eclf = VotingClassifier(estimators=[('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain, ytrain)
        #print(eclf.score(xtest, ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     scores = crossvalidation.cross_val_score(claf, xtrain, ytrain, scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain, ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        #print(qda.score(xtest, ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # classifies based on the k nearest neighbors, where k is defined by the user
        knn1 = neighbors.KNeighborsClassifier()
        knn1.fit(xtrain, ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        modeCol = predWindowVecModeFinder(tempCol, xtrunclength)
        modeStr = predVec2Str(modeCol)
        predictionStringMat.append(modeStr)
        finalPredMat += map(int, modeCol)

    return predictionStringMat, finalPredMat
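# The long if-chain above fills one column of predictionMat per classifier. A
# compact sketch of the same pattern with a plain list of estimators; the toy
# arrays and the particular estimator list are illustrative assumptions.
import numpy as np
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

xtrain = np.array([[0., 0.], [0., 1.], [5., 5.], [5., 6.]])
ytrain = np.array([0, 0, 1, 1])
xtest = np.array([[0., 0.5], [5., 5.5]])

classifiers = [NearestCentroid(),
               KNeighborsClassifier(n_neighbors=1),
               DecisionTreeClassifier()]
predictionMat = np.empty((xtest.shape[0], len(classifiers)))
for count, clf in enumerate(classifiers):
    clf.fit(xtrain, ytrain)
    predictionMat[:, count] = clf.predict(xtest)   # one column per classifier
print predictionMat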
def ncentr(train, labels, test):
    clf = NearestCentroid()
    clf.fit(train, labels)
    return clf.predict(test)
class Gestures:
    def removeMag(self, line):
        return line[6:]

    def __init__(self):
        x = []
        y = []
        small = False
        #clf = svm.LinearSVC()
        self.clf = NearestCentroid()
        folder = "gyro_side\\"
        files = ['still.csv', 'yes.csv', 'no.csv']
        for i in range(3):
            f = open(folder + files[i], 'r')
            for line in f.readlines():
                #print line
                line = [int(a) for a in line.split(',')]
                lines = [self.removeMag(line[9*j:9*j+9]) for j in range(9)]
                # smallLine = []
                # for j in range(5):
                #     smallLine = smallLine + line[6*j:6*j+3]
                # if small:
                #     line = smallLine
                # if len(x) == 0:
                #     x = np.array(np.array([line]))
                # else:
                #     x = np.append(x, np.array([line]), axis=0)
                # #print np.shape(x)
                x += [reduce(lambda x, y: x + y, lines[:5], [])]
                y += [i]
                x += [reduce(lambda x, y: x + y, lines[4:], [])]
                y += [i]
                try:
                    z = 1
                except Exception as e:
                    #print e
                    print i, line
                    #z = 1/0
            f.close()
        x = np.array([np.array(z) for z in x])
        y = np.array(y)
        print y
        print np.shape(y)
        print np.shape(x)
        print type(x[0]), np.array(x[0])
        self.clf.fit(x, y)
        self.data = []
        #self.ser = serial.Serial('COM3', 9600)
        print "Classifier trained"

    def setInitialData(self, init):
        self.data = []
        for i in init:
            self.data = np.append(self.data, self.removeMag(i))

    def updateData(self, line):
        self.data = np.append(self.data[len(self.removeMag([[]]*9)):], np.array([self.removeMag(line)]))

    def predictGesture(self, line):
        self.updateData(line)
        # note: newer scikit-learn versions expect a 2-D array here
        return self.clf.predict(np.array(self.data))
from sklearn import metrics
from sklearn.neighbors.nearest_centroid import NearestCentroid   # was missing
import numpy
import transform_data_to_format as tdtf

# one of the loaders below must be uncommented before fitting
#train_x, train_y = tdtf.read_data_to_ndarray("../data/train.csv", 42000)
#train_x, train_y = tdtf.read_data_to_ndarray("../data/train.csv", 2100)
#valid_x, valid_y = tdtf.read_data_to_ndarray("../data/valid.csv", 21000)
#test_x = tdtf.read_test_data_to_ndarray("../data/test.csv", 28000)

clf = NearestCentroid()
clf.fit(train_x, train_y)
#NearestCentroid(metric='euclidean', shrink_threshold=None)
#pred_y = clf.predict(test_x)
#pred_train_y = clf.predict(train_x[0:21000])
pred_valid_y = clf.predict(valid_x)
#print pred_y
#tdtf.write_to_csv(pred_y, "../data/MNIST_NearestNeighborsCentroid.out")

#print("Classification report for classifier %s:\n%s\n"
#      % (clf, metrics.classification_report(train_y, pred_train_y)))
'''
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(train_y[0:21000], pred_train_y)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(train_y[0:21000], pred_train_y))
'''
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(valid_y, pred_valid_y)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(valid_y, pred_valid_y))
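# Since the CSV loaders above are commented out, here is a self-contained
# variant of the same evaluation pattern using scikit-learn's bundled digits
# data as a stand-in for the MNIST CSVs; the half/half split is an assumption
# for illustration.
from sklearn import metrics
from sklearn.datasets import load_digits
from sklearn.neighbors import NearestCentroid

digits = load_digits()
n = len(digits.data) // 2
clf = NearestCentroid()
clf.fit(digits.data[:n], digits.target[:n])      # train on the first half
pred = clf.predict(digits.data[n:])              # evaluate on the second half
print metrics.classification_report(digits.target[n:], pred)
print metrics.confusion_matrix(digits.target[n:], pred)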
class TwoWordRecognizer:
    def scaler(self, arr):
        return arr / np.max(np.abs(arr)) * 100

    def get_startingpoint(self, arr):
        arr = np.abs(arr)
        st_i = 0
        e_i = STEPS
        old_value = np.sum(arr[st_i:e_i,0])
        while e_i < arr.shape[0]:
            arr_sum = np.sum(arr[st_i:e_i,0])
            if arr_sum > old_value * FACTOR:
                return st_i
            else:
                if old_value < arr_sum:
                    old_value = arr_sum
            st_i += STEPS
            e_i += STEPS
        return 10000

    def get_endingpoint(self, arr):
        arr = np.abs(arr)
        e_i = arr.shape[0] - 1
        st_i = e_i - STEPS
        old_value = np.sum(arr[st_i:e_i,0])
        while st_i > 0:
            arr_sum = np.sum(arr[st_i:e_i,0])
            if arr_sum > old_value * FACTOR:
                return e_i
            else:
                if old_value < arr_sum:
                    old_value = arr_sum
            st_i -= STEPS
            e_i -= STEPS
        return 10000

    def euclidean_distance(self, arr1, arr2):
        # zero-pad the shorter signal so both have the same length
        a1 = arr1.copy()
        a2 = arr2.copy()
        if a1.shape[0] < a2.shape[0]:
            zero_rows = a2[a1.shape[0]:a2.shape[0],[0,1]].copy()
            zero_rows[:,:] = 0
            a1 = np.concatenate((a1, zero_rows))
        elif a1.shape[0] > a2.shape[0]:
            zero_rows = a1[a2.shape[0]:a1.shape[0],[0,1]].copy()
            zero_rows[:,:] = 0
            a2 = np.concatenate((a2, zero_rows))
        dist = np.sqrt((a2[:,0] - a1[:,0])**2)
        return np.sum(dist)

    def loadReferenceWords(self, word1_path, word2_path):
        fs, self.word1 = wavfile.read(word1_path)
        fs, self.word2 = wavfile.read(word2_path)
        self.word1 = self.scaler(self.word1)
        self.word2 = self.scaler(self.word2)
        self.word1 = self.word1[self.get_startingpoint(self.word1):self.get_endingpoint(self.word1),:]
        self.word2 = self.word2[self.get_startingpoint(self.word2):self.get_endingpoint(self.word2),:]

    def loadData(self, ressourcepath1, ressourcepath2):
        print(ressourcepath1)
        fullpath1 = [ressourcepath1 + fname for fname in os.listdir(ressourcepath1)]
        fullpath2 = [ressourcepath2 + fname for fname in os.listdir(ressourcepath2)]

        counter = 0
        for path in fullpath1:
            fs, w1 = wavfile.read(path)
            w1 = self.scaler(w1)
            w1 = w1[self.get_startingpoint(w1):self.get_endingpoint(w1),:]
            row = np.array([self.euclidean_distance(self.word1, w1), self.euclidean_distance(self.word2, w1)])
            if counter == 0:
                X = row
                y = np.array([1])
                counter = 1
            else:
                X = np.vstack((X, row))
                y = np.hstack((y, np.array([1])))

        for path in fullpath2:
            fs, w2 = wavfile.read(path)
            w2 = self.scaler(w2)
            w2 = w2[self.get_startingpoint(w2):self.get_endingpoint(w2),:]
            X = np.vstack((X, np.array([self.euclidean_distance(self.word1, w2), self.euclidean_distance(self.word2, w2)])))
            y = np.hstack((y, np.array([2])))

        from sklearn.neighbors.nearest_centroid import NearestCentroid
        self.clf = NearestCentroid()
        self.clf.fit(X, y)
        #import matplotlib.pyplot as plt
        #plt.scatter(X[:,0], X[:,1])
        #plt.show()

    def predict(self, input_path):
        fs, raw_arr = wavfile.read(input_path)
        raw_arr = self.scaler(raw_arr)
        word = raw_arr[self.get_startingpoint(raw_arr):self.get_endingpoint(raw_arr),:]
        x0 = np.array([self.euclidean_distance(self.word1, word), self.euclidean_distance(self.word2, word)])
        return self.clf.predict(x0.reshape(1, -1))   # predict expects a 2-D array
data = np.array([])
print "Starting to read"
for i in range(5):
    line = ser.readline()[:-2]
    line = removeMag([int(a) for a in line.split(',')])
    if small:
        line = smallLine
    data = np.append(data, np.array(line))

prevs = [0, 0]
while True:
    #fullLine = reduce(lambda a, b: a + b, data, [])
    #print data
    p = clf.predict(np.array(data))[0]   # take the scalar label out of the returned array
    if p != 0:
        prevs[p-1] += 1
    if p == 0:
        if prevs[0] > 2 or prevs[1] > 5:
            if prevs[0] > prevs[1]:
                print "Yes"
            else:
                print "No"
        prevs = [0, 0]
    line = ser.readline()[:-2]
    line = removeMag([int(a) for a in line.split(',')])
    smallLine = line[0:3]
    if small:
        line = smallLine
    #data = np.append(data[(3 if small else 6):], np.array([line]))
#print y_test
#scores = cross_validation.cross_val_score(clf, data[:, 3:15], data[:, 2], cv=5)
#print scores

# Nearest Neighbor
nbrs = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
nbrs_y_pred = nbrs.predict(X_test)
nbrs_pr = precision_score(y_test, nbrs_y_pred)
nbrs_rc = recall_score(y_test, nbrs_y_pred)
nbrs_CM = confusion_matrix(y_test, nbrs_y_pred)
print "------------------"
print "\tNearest Neighbor"
print "------------------"
print "Real: "
print y_test
print "Predict"
print nbrs_y_pred
print "Precision:"   # nbrs_pr is a precision, not a generic score
print nbrs_pr

# Nearest Centroid
clf = NearestCentroid().fit(X_train, y_train)
print "------------------"
print "\tNearest Centroid"
print "------------------"
print "Real: "
print y_test
print "Predict"
print clf.predict(X_test)
print "Score: "
print clf.score(X_test, y_test)
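# The two blocks above report different metrics (precision for the kNN model,
# accuracy for the centroid model), which makes them hard to compare. A small
# sketch that scores both the same way; X_train/X_test/y_train/y_test are
# assumed to be the same variables used above.
for name, model in [("kNN (k=2)", KNeighborsClassifier(n_neighbors=2)),
                    ("Nearest centroid", NearestCentroid())]:
    model.fit(X_train, y_train)
    print name, "accuracy:", model.score(X_test, y_test)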
def nn_centroid(self, X, y, test):
    clf = NearestCentroid()
    clf.fit(X, y)
    t = clf.predict(test)
    print("nn_centroid:", t)
    return t
# Nearest Centroid classification
start = int(round(time.time() * 1000))
classifier = NearestCentroid()
classifier.fit(X_lda, y_train)
# NearestCentroid(metric='euclidean', shrink_threshold=None)  <- repr of the fitted estimator
print (classifier)
print("---------(5) Cross validation accuracy--------")
print(cross_validation.cross_val_score(classifier, X_lda, y_train, cv=5))
end = int(round(time.time() * 1000))
print("--Centroid fitting finished in ", (end-start), "ms--------------")
print("---------Test-set dimensions after PCA--------")
print(X_test.shape)
expected = y_test
# note: the model was fitted on the projected X_lda, so X_test must be passed
# through the same projection before predict, or the dimensions will not match
predicted = classifier.predict(X_test)
print("--------------------Results-------------------")
print("Classification report for Centroid classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
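# To avoid the train/test projection mismatch flagged above, the transform and
# the classifier can be chained in a Pipeline so the test data is projected
# automatically. A hedged sketch: LinearDiscriminantAnalysis is assumed as the
# LDA step, and X_train/X_test/y_train/y_test are the untransformed arrays.
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import NearestCentroid

pipe = Pipeline([('lda', LinearDiscriminantAnalysis()),
                 ('centroid', NearestCentroid())])
pipe.fit(X_train, y_train)        # fits the LDA projection, then the centroids
predicted = pipe.predict(X_test)  # X_test is projected with the same LDA first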
clf = NearestCentroid()
X = []
Y = []
# note: range(0, 4) etc. take four samples per gesture; samples 4, 9, and 14
# are left out, with sample 14 used below as the test input
for i in range(0, 4):
    X.append([int(mean_avg[i]), ratio_avg[i]])
    Y.append(0)
for i in range(5, 9):
    X.append([int(mean_avg[i]), ratio_avg[i]])
    Y.append(1)
for i in range(10, 14):
    X.append([int(mean_avg[i]), ratio_avg[i]])
    Y.append(2)
# print X
# print Y
clf.fit(X, Y)

res = clf.predict([[mean_avg[14], ratio_avg[14]]])[0]   # [0]: compare the scalar label
if res == 0:
    print "rock"
if res == 1:
    print "scissor"
if res == 2:
    print "paper"

"""
# Plot individual channels data
plt.figure(1)
plt.subplot(431)
plt.plot(x)
plt.ylabel('x')

#plt.figure(2)
plt.subplot(412)
svm_model.fit(X_cropped, y_cropped)
y_train_predicted = svm_model.predict(X_train)
print "SVM Error rate on training data (t1): ", ml_aux.get_error_rate(y_train, y_train_predicted)
# ml_aux.plot_confusion_matrix(y_train, y_train_predicted, "CM SVM Training (t1)")
# plt.show()
y_validation_predicted = svm_model.predict(X_validation)
print "SVM Error rate on validation (t1): ", ml_aux.get_error_rate(y_validation, y_validation_predicted)

# Start Nearest Centroid Classification (despite the kNC name, there is no k)
print "Performing Nearest Centroid Classification:"
from sklearn.neighbors.nearest_centroid import NearestCentroid
knnc_model = NearestCentroid()
knnc_model.fit(X_cropped, y_cropped)
y_validation_predicted = knnc_model.predict(X_validation)
print "Error Rate on Nearest Centroid (t1) Validation: ", ml_aux.get_error_rate(y_validation, y_validation_predicted)

# Start Bagging Classification
print "Performing Bagging Classification:"
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# Bagging
bagging1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=2), max_samples=1.0, max_features=0.1)
bagging1.fit(X_cropped, y_cropped)
y_validation_predicted = bagging1.predict(X_validation)
print "Error Rate kNN with Bagging Validation: ", ml_aux.get_error_rate(y_validation, y_validation_predicted)
knc3.fit(df_input3_data, numpy.ravel(df_input3_target))
pickle.dump(knc3, open('model_knc_t3.pkl', 'wb'))

knc4 = NearestCentroid()
knc4.fit(df_input4_data, numpy.ravel(df_input4_target))
pickle.dump(knc4, open('model_knc_t4.pkl', 'wb'))

knc5 = NearestCentroid()
knc5.fit(df_input5_data, numpy.ravel(df_input5_target))
pickle.dump(knc5, open('model_knc_t5.pkl', 'wb'))

# knc = KMeans(n_clusters=5, random_state=RandomState(9))
# knc.fit(df_input_data, numpy.ravel(df_input_target))
# pickle.dump(knc, open('model_knc_train.pkl', 'wb'))

predicted1 = knc1.predict(df_input1_data)
predicted2 = knc2.predict(df_input2_data)
predicted3 = knc3.predict(df_input3_data)
predicted4 = knc4.predict(df_input4_data)
predicted5 = knc5.predict(df_input5_data)
# predicted = knc.predict(df_input_data)

matches1 = (predicted1 == [item for sublist in df_input1_target for item in sublist])
matches2 = (predicted2 == [item for sublist in df_input2_target for item in sublist])
matches3 = (predicted3 == [item for sublist in df_input3_target for item in sublist])
matches4 = (predicted4 == [item for sublist in df_input4_target for item in sublist])
matches5 = (predicted5 == [item for sublist in df_input5_target for item in sublist])
# matches = (predicted == [item for sublist in df_input_target for item in sublist])

print 'using excess rock & uncats removed'
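# For completeness, a minimal sketch of reading one of the pickled models back
# and predicting with it; the file name matches the dump above, and
# df_input4_data is assumed to be the same array used there.
import pickle
model = pickle.load(open('model_knc_t4.pkl', 'rb'))
print model.predict(df_input4_data)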
print 'Reading features... Done!'

# STEP 2 - computing scores
print 'Training...'
tfidf = models.TfidfModel(dictionary=features)  # tf-idf model to be queried
tfidf.save('reuters/data/tfidf.model')

# STEP 3 - computing centroids
tfidf = models.TfidfModel.load('reuters/data/tfidf.model')
features = corpora.Dictionary.load_from_text('reuters/data/word.dict')
by_bow = Corpus2Dictionary(features)

train_corpus = ReutersCorpus('training')
tfidf_train = tfidf[by_bow[by_word[train_corpus]]]
X = matutils.corpus2csc(tfidf_train)  # gensim corpus -> scipy sparse matrix
X = X.transpose()  # from csc (document per column) to csr (document per row)
y = train_corpus.category_mask  # label for each doc

rocchio = NearestCentroid()
rocchio.fit(X, y)
print 'Training... Done!'

# STEP 4 - evaluate prediction
test_corpus = ReutersCorpus('test')
tfidf_test = tfidf[by_bow[by_word[test_corpus]]]
# num_terms is required: otherwise X shrinks to the largest feature actually seen
X = matutils.corpus2csc(tfidf_test, num_terms=len(features))
X = X.transpose()
y_true = test_corpus.category_mask
y_pred = rocchio.predict(X)

# print precision_score(y_true, y_pred)
print rocchio.score(X, y_true)
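# The gensim pipeline above amounts to a Rocchio-style classifier: TF-IDF
# vectors plus NearestCentroid. The same idea in pure scikit-learn, as a
# hedged sketch with toy documents standing in for the Reuters corpus:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestCentroid

docs = ["wheat corn harvest", "corn exports wheat",
        "oil barrel prices", "crude oil barrel"]
labels = ["grain", "grain", "oil", "oil"]

vec = TfidfVectorizer()
X = vec.fit_transform(docs)       # sparse TF-IDF matrix, one document per row
rocchio = NearestCentroid()
rocchio.fit(X, labels)            # centroids = per-class mean TF-IDF vectors
print rocchio.predict(vec.transform(["wheat harvest report"]))   # -> ['grain']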
def myclassify_practice_set(numfiers, xtrain, ytrain, xtltrain, xtltest, xtest, ytarget=None, testing=False, grids='ABCDEFGHI'):
    # NOTE: we might not need xtltrain
    # xtrain and ytrain are the training set; xtltrain holds the indices of the
    # corresponding recordings in xtrain and ytrain and is always present.
    # xtest is the testing set; xtltest holds the corresponding recording
    # indices (for the practice set, xtltest = xtrunclength).
    # ytarget is optional and depends on whether a labelled testing set is used.

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)
    # print 'finished removal of Nans'

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    # if xtest is an NxM matrix, returns an N x numfiers matrix where each
    # column corresponds to one classifier's prediction vector
    count = 0
    # print numfiers
    predictionMat = np.empty((xtest.shape[0], numfiers))
    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # svc1 = SVC()
    # svc1.fit(xtrain, ytrain)
    # ytest = svc1.predict(xtest)
    # predictionMat[:,count] = ytest
    # count += 1

    if count < numfiers:
        # voting classifiers combine completely different machine learning
        # classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()

        eclf = VotingClassifier(estimators=[('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain, ytrain)
        #print(eclf.score(xtest, ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     scores = crossvalidation.cross_val_score(claf, xtrain, ytrain, scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
        bagging2.fit(xtrain, ytrain)
        #print bagging2.score(xtest, ytest)
        ytest = bagging2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        #print bagging1.score(xtest, ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain, ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        #print(qda.score(xtest, ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # classifies based on the k nearest neighbors, where k is defined by the user
        knn1 = neighbors.KNeighborsClassifier()
        knn1.fit(xtrain, ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    # print xtltest
    # print len(ytest)

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)
            ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
            modeStrans = predVec2Str(ytarg)
        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += map(int, modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    if testing == False:
        if ytarget is not None:   # 'ytarget != None' compares elementwise on arrays
            #print targets1
            #print ""
            #print predictions1
            confusionme = confusion_matrix(targets1[0], predictions1[0])
            #print "Confusion Matrix is: "
            #print confusionme

    return predictionStringMat, targetStringMat, finalPredMat
#train
np.random.seed(i)
random.seed(i)
random.shuffle(FRAMES)

data = np.array([[frame.distances[joint] for joint in frame.distances.keys()] for frame in FRAMES])
target = np.array([frame.label for frame in FRAMES])

# split the frames half/half into train and test
indices = np.random.permutation(len(data))
data_train = data[indices[:-len(data)/2]]
target_train = target[indices[:-len(data)/2]]
data_test = data[indices[-len(data)/2:]]
target_test = target[indices[-len(data)/2:]]

knn = NearestCentroid()   # (historical name: this is a centroid model, not kNN)
knn.fit(data_train, target_train)
accuracy = sum(1 for (actual, correct) in zip(knn.predict(data_test), target_test)
               if actual == correct) / float(len(target_test))

if scale:
    if accuracy > last_accuracy:
        times_better += 1
    elif accuracy < last_accuracy:
        times_wrong += 1
    else:
        times_same += 1
    last_accuracy = accuracy
    print times_better / float(times_better + times_wrong + times_same)
    print (times_better, times_wrong, times_same)
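# The accuracy above is computed by hand from zip(); scikit-learn can run the
# split-and-score loop directly. A sketch against the same data/target arrays
# built above (sklearn.cross_validation matches the era of this code; newer
# versions use sklearn.model_selection instead):
from sklearn.cross_validation import cross_val_score
print cross_val_score(NearestCentroid(), data, target, cv=5).mean()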