Example #1
def ensemble(train_features, train_labels, test_features, test_labels):
    print("\n\nEnsemble")
    print(
        "===================================================================")
    ks = [1, 3, 5]
    for k in ks:
        print("k = ", k)
        m_knn = knc(n_neighbors=k)

        parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
        svr = svm.SVC(probability=True)
        m_svm = GridSearchCV(svr, parameters).fit(train_features, train_labels)

        m_mlp = MLPClassifier(max_iter=1000)

        clf1 = VotingClassifier(estimators=[('knn', m_knn), ('svm', m_svm),
                                            ('mlp', m_mlp)],
                                voting='hard')
        clf2 = VotingClassifier(estimators=[('knn', m_knn), ('svm', m_svm),
                                            ('mlp', m_mlp)],
                                voting='soft')

        clf1.fit(train_features, train_labels)
        clf2.fit(train_features, train_labels)

        result1 = clf1.predict(test_features)
        result2 = clf2.predict(test_features)

        printResult(test_labels, result1)
        printResult(test_labels, result2)
    print(
        "===================================================================")
Example #2
def model_data(training_data):
    dtc = DecisionTreeClassifier(random_state=9, min_samples_split=5)
    dtc.fit(training_data['data'], training_data['result'])

    nn = MLPClassifier(solver='lbfgs',
                       alpha=1e-5,
                       hidden_layer_sizes=(5, 2),
                       random_state=1)
    nn.fit(training_data['data'], training_data['result'])

    svc = SVC(C=100, kernel="linear")
    svc.fit(training_data['data'], training_data['result'])

    rfc = RFC(n_estimators=10,
              criterion='entropy',
              max_depth=10,
              min_samples_split=5,
              bootstrap=True,
              random_state=None)
    rfc.fit(training_data['data'], training_data['result'])

    knc_map = knc(n_neighbors=15, weights='distance')
    knc_map.fit(training_data['data'], training_data['result'])

    gbc_map = gbc(n_estimators=150, verbose=0)
    gbc_map.fit(training_data['data'], training_data['result'])

    return {
        'Decision Tree Classifier': dtc,
        'Neural Networks': nn,
        'Support Vector Machines': svc,
        'Random Forest Classification': rfc,
        'k Nearest Neighbours': knc_map,
        'Gradient Boosting Classifier': gbc_map
    }
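The aliases behind the classifiers above are not shown; presumably they map to the imports below. The usage sketch assumes training_data and test_data are dicts with 'data' and 'result' keys, as in the function itself:

# Assumed imports behind the aliases used in model_data.
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as gbc
from sklearn.neighbors import KNeighborsClassifier as knc

# Usage sketch: score every fitted model on held-out data.
models = model_data(training_data)
for name, model in models.items():
    print(name, model.score(test_data['data'], test_data['result']))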
Example #3
def voxelizePart(grid, scale, translate, dims, cloud, labels, partId,
                 outputPath):
    bbmin = np.min(cloud, axis=0)
    bbmax = np.max(cloud, axis=0)
    center = 0.5 * (bbmax - bbmin)
    w1s = np.where(grid == 1)
    grid_xyz = [[x, y, z] for x, y, z in zip(w1s[0], w1s[1], w1s[2])]
    grid_xyz = np.array(grid_xyz)
    grid_xyz_sc = []
    for p in grid_xyz:
        trans_p = [0, 0, 0]
        trans_p[0] = scale * ((1 / scale) * center[0] - 0.5 + float(
            (p[0] + 0.5) / dims)) + translate[0]
        trans_p[1] = scale * ((1 / scale) * center[1] - 0.5 + float(
            (p[1] + 0.5) / dims)) + translate[1]
        trans_p[2] = scale * ((1 / scale) * center[2] - 0.5 + float(
            (p[2] + 0.5) / dims)) + translate[2]
        grid_xyz_sc.append(trans_p)
    grid_xyz_sc = np.array(grid_xyz_sc)
    #grid_xyz_sc is now in the same coordinate frame as the point-cloud

    clf = knc(n_neighbors=1)
    clf.fit(cloud, labels)
    voxelLabels = clf.predict(grid_xyz_sc)
    partIndices = voxelLabels == partId
    partVoxelIndices = grid_xyz[partIndices, :]
    partvox = np.zeros((dims, dims, dims, 1))
    partvox[partVoxelIndices[:, 0], partVoxelIndices[:, 2],
            partVoxelIndices[:, 1], 0] = 1
    partvox = partvox.astype('int')
    partbinvox = binvox_rw.Voxels(partvox, (dims, dims, dims), [0, 0, 0], 1,
                                  'xzy')
    partname = 'model_' + str(partId) + '.binvox'
    with open(os.path.join(outputPath, partname), 'wb') as f:
        binvox_rw.write(partbinvox, f)
Example #4
def bias_variance(data, nbrs, bootstrap=10):
    result_dict = {}
    for bootstrap_num in range(bootstrap):  # bootstrap is a count, not an iterable
        train, test = split(data)
        classifier = knc(n_neighbors=nbrs)
        coordinates_train, labels_train = train[:, :2], train[:, -2]
        classifier.fit(coordinates_train, labels_train)
        for instance in test:
            coordinates_test = instance[:2]
            pred = classifier.predict([coordinates_test])[0]  # predict expects a 2-D array
            # numpy slices are unhashable, so key on a tuple
            result_dict.setdefault(tuple(instance[:-1]), []).append(pred)
    return result_dict
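The split helper is external to the snippet; a minimal sketch, assuming a plain random 70/30 row split of a NumPy array:

import numpy as np

# Hypothetical split helper: random 70/30 partition of the rows of data.
def split(data, train_frac=0.7):
    idx = np.random.permutation(len(data))
    cut = int(train_frac * len(data))
    return data[idx[:cut]], data[idx[cut:]]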
def runAlgo(filename):
    # Read a given dataset to csv and then run it through six different
    # classifiers. 3 Normal classifiers and 3 ensembles
    print(filename)
    d, t = load_csv(filename)
    runModel("Decision Tree", tree.DecisionTreeClassifier(), d, t)
    runModel("KNearesest Ne", knc(), d, t)
    runModel("Neural Networ", MLPClassifier(hidden_layer_sizes=(30,30,30)), d, t)
    runModel("Bagging      ", BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.9, max_features=0.9), d, t)
    runModel("Random Forest", RandomForestClassifier(n_estimators=100), d, t)
    runModel("AdaBoost     ", AdaBoostClassifier(n_estimators=100), d, t)
Example #7
def knn(f_train, l_train, f_test):
    from sklearn.neighbors import KNeighborsClassifier as knc
    import time
    clf = knc(n_neighbors=3)
    start_time = time.time()
    clf.fit(f_train, l_train)
    print("Training Time: %s seconds" % (time.time() - start_time))
    start_time = time.time()
    pre = clf.predict(f_test)
    print("Predicting Time: %s seconds" % (time.time() - start_time))
    return pre
Example #8
def handwritingClassTestSKL():
    # labels of the training set
    hwLabels = []
    # file names under the trainingDigits directory
    trainingFileList = listdir("trainingDigits")
    # number of files in the directory
    m = len(trainingFileList)
    # initialise the m x 1024 training matrix
    trainingMat = np.zeros((m, 1024))
    # parse each training sample's class from its file name
    for i in range(m):
        # get the file name
        fileNameStr = trainingFileList[i]
        #fileStr = fileNameStr.split('.')[0]
        # extract the digit class
        classNumStr = int(fileNameStr.split('_')[0])
        # append the class to hwLabels
        hwLabels.append(classNumStr)
        # store each file's 1x1024 vector in trainingMat
        trainingMat[i, :] = img2vector("trainingDigits/%s" % fileNameStr)
    """
    SK-learn method
    """
    # build the kNN classifier
    neigh = knc(n_neighbors=3, algorithm='auto')
    # fit the model; trainingMat is the training matrix, hwLabels the matching labels
    neigh.fit(trainingMat, hwLabels)

    # file list under the testDigits directory
    testFileList = listdir("testDigits")
    # error counter
    errorCount = 0.0
    # number of test samples
    mTest = len(testFileList)
    # parse each test sample's class from its file name and classify it
    for i in range(mTest):
        # get the file name
        fileNameStr = testFileList[i]
        #fileStr = fileNameStr.split('.')[0]
        # extract the digit class
        classNumStr = int(fileNameStr.split('_')[0])
        # get the 1x1024 vector for this test sample
        vectorUnderTest = img2vector("testDigits/%s" % fileNameStr)
        # get the prediction
        #classifierResult =  classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        classifierResult = neigh.predict(vectorUnderTest)[0]
        print("the classifier came back with: %d, the real answer is: %d" %
              (classifierResult, classNumStr))
        if classifierResult != classNumStr: errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %.2f%%" %
          (errorCount / float(mTest) * 100))
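img2vector is the usual helper from this style of digit example; a sketch under the assumption that each file holds a 32x32 grid of '0'/'1' characters:

import numpy as np

# Hypothetical img2vector: flatten a 32x32 text digit into a 1x1024 row vector.
def img2vector(filename):
    vect = np.zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):
            line = fr.readline()
            for j in range(32):
                vect[0, 32 * i + j] = int(line[j])
    return vect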
Example #9
def run(menu, value):
    dataset = Dataset(menu.getValue("d") + ".data")
    percent = int(menu.getValue("p")) / 100
    nNeighbors = int(menu.getValue("k"))
    algorithm = menu.getValue("a")
    tr_data, te_data, tr_target, te_target = ms.train_test_split(
        dataset.data, dataset.target, test_size=percent)

    classifier = Classifier(nNeighbors) if algorithm == "Quade" else knc(
        n_neighbors=nNeighbors)
    classifier.fit(tr_data, tr_target)
    predicted_target = classifier.predict(te_data)
    print("Accuracy of Algorithm:", accuracy(te_target, predicted_target))
Example #10
def knn_function(train_features, train_labels, test_features, test_labels):
    print("\n\nKNN")
    print(
        "===================================================================")
    ks = [1, 3, 5]
    for k in ks:
        knn = knc(n_neighbors=k)
        knn.fit(train_features, train_labels)
        result = knn.predict(test_features)
        print("k = ", k)
        printResult(test_labels, result)
    print(
        "===================================================================")
def train_model_knc (features, labels) :
	# Scaling is very important for distance based classifiers
	scaler = StandardScaler()
	clf_knc = knc()

	# Transforms are applied exactly in the order specified
	estimators = [('sscaler', scaler), ('knc', clf_knc)]
	# p = 2 corresponds to Euclidean distance, p = 1 corresponds to Manhattan distance
	params_dict = {'knc__n_neighbors': [5, 8, 10, 15, 20, 25, 30], 'knc__weights':['uniform', 'distance'], 'knc__p': [1, 2]}
	
	clf = GridSearchCV(Pipeline(estimators), params_dict, scoring = 'roc_auc', cv = 5)
	clf.fit(features, labels)

	print ("Best estimator: ", clf.best_estimator_)
	print ("Best best scores: %.4f" %(clf.best_score_))
	#print ("Best grid scores: ", clf.grid_scores_)
	return clf
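A usage sketch on synthetic binary data (make_classification is stand-in data, not part of the original; roc_auc scoring requires a two-class problem):

from sklearn.datasets import make_classification

# Fit the tuned pipeline on synthetic data and predict a few rows.
features, labels = make_classification(n_samples=300, random_state=0)
clf = train_model_knc(features, labels)
print(clf.predict(features[:5]))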
def make_prediction_grid(points, outcomes, limits, steps=1, k=5):
    (x_min, x_max, y_min, y_max) = limits
    xs = np.arange(x_min, x_max, steps)
    ys = np.arange(y_min, y_max, steps)

    knn = knc(n_neighbors=k)
    knn.fit(points, outcomes)

    (xx, yy) = np.meshgrid(xs, ys)

    prediction_grid = np.zeros(xx.shape, dtype=int)

    for i, x in enumerate(xs):
        for j, y in enumerate(ys):
            p = np.array([x, y])
            prediction_grid[j, i] = knn.predict([p])[0]

    return (xx, yy, prediction_grid)
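A usage sketch on synthetic data (the points, outcomes and the 0.05 step are assumptions; note that steps is the grid step size, not a count):

import numpy as np
import matplotlib.pyplot as plt

points = np.random.rand(50, 2)
outcomes = (points[:, 0] > points[:, 1]).astype(int)
xx, yy, grid = make_prediction_grid(points, outcomes, (0, 1, 0, 1), steps=0.05, k=5)
plt.pcolormesh(xx, yy, grid, shading='auto', alpha=0.4)  # decision regions
plt.scatter(points[:, 0], points[:, 1], c=outcomes)      # training points
plt.show()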
def covid_knn(trainfile, testfile, process_rank):
    with open(trainfile) as covidfile:
        cases = pd.read_csv(covidfile, index_col="id")

    with open(testfile) as casefile:
        tests = pd.read_csv(casefile, index_col="id")

    features = ['age', 'bmi', 'HbA1c']
    cases = normalizeDF(cases, features)

    features.append('resp_disease')
    X = cases[features].values
    y = cases['death_risk'].values

    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # knn classification
    classifier = knc(n_neighbors=6)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # knn testing with input files
    tests = normalizeDF(tests, features)
    Z = tests[features].values
    Z_pred = classifier.predict(Z)
    print("The predictions for: " + testfile)
    print(Z_pred)

    if process_rank == 0:
        accuracy = acc(y_test, y_pred) * 100
        print("Accuracy of the model is: ")
        print(accuracy)
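normalizeDF is defined elsewhere; a minimal sketch, assuming plain min-max scaling of the named columns:

# Hypothetical normalizeDF: min-max scale the given columns into [0, 1].
def normalizeDF(df, features):
    df = df.copy()
    for col in features:
        col_min, col_max = df[col].min(), df[col].max()
        df[col] = (df[col] - col_min) / (col_max - col_min)
    return df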
Example #14
from time import time
from split import data_split

X_train, X_test, Y_train, Y_test = data_split()

from sklearn.neighbors import KNeighborsClassifier as knc

t0 = time()

clf = knc(n_neighbors=20)
clf.fit(X_train, Y_train)

print("Training Time: " + str(round(time() - t0, 3)) + "s")

t1 = time()

pred = clf.predict(X_test)

print("Testing Time: " + str(round(time() - t1, 3)) + "s")

from sklearn.metrics import accuracy_score

print(accuracy_score(Y_test, pred))
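The data_split helper lives in split.py; a self-contained sketch, with sklearn's digits data standing in for whatever dataset the original used:

# Hypothetical split.py: wrap train_test_split around a concrete dataset.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

def data_split():
    digits = load_digits()
    return train_test_split(digits.data, digits.target,
                            test_size=0.2, random_state=0)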
	model.fit( X , y )

	X_test = [ row[ :-1 ] for row in test_data ]
	y_real = [ row[ -1 ] for row in test_data ]
	y_pred = model.predict( X_test )
	print( report( y_real , y_pred ) )
	tp = lambda x : 1 if x == 'spam' else 0
	real = [ tp( v ) for v in y_real ]
	pred = [ tp( v ) for v in y_pred ]
	print( mean_absolute_error( real , pred ) )
	print( mean_squared_error( real , pred ) )

if __name__ == '__main__' :
	if len( sys.argv ) > 2 :
		train_fpath , test_fpath = sys.argv[ 1: ]
		train_data = import_csv( train_fpath )
		test_data = import_csv( test_fpath )
		''' DECISION TREE '''
		cf = dtc( criterion = 'gini' , max_depth = 50 )
		classify( cf , train_data , test_data , 'decision_tree' )
		
		''' NEAREST NEIGHBORS '''
		cf = knc( n_neighbors = 1 , metric = 'hamming' )
		classify( cf , train_data , test_data , 'knearest_neighbors' )
		
		''' NAIVE BAYES '''
		cf = mnb( alpha = 100.0 )
		classify( cf , train_data , test_data , 'naive_bayes' )
	else :
		print "Usage python %s [train_csv_file] [test_csv_file]" % sys.argv[ 0 ]
Example #16
test_data = np.array(test)

print("CONVERTED TO NUMPY ...")
print()

print("SPLITTING TRAIN DATA ...")
print()

features = []
target = []

#obtaining categories in target    
target = train['Category'].values

features = train_data[0:, 1:5]
model = knc(n_neighbors=9)

#model = knc(n_neighbors = 9, n_jobs = -1)
#
print('TRAINING THE MODEL USING...', model)

print()
print('MODELLING DATA...')
#splitting into train and target variables
print()

model.fit(features,target)

print('MODELLING DONE...')
print()
print('PREDICTING TEST SET...')
Example #17
from sklearn.preprocessing import StandardScaler as ss

df = pd.read_csv('Social_Network_Ads.csv')

#sns.scatterplot(df['EstimatedSalary'],df['Purchased'])
df.drop('User ID', inplace=True, axis=1)
#print(df.info())
gen = pd.get_dummies(df['Gender'], drop_first=True)
df.drop('Gender', inplace=True, axis=1)
#print(gen.head())
dff = pd.concat([df, gen], axis=1)
#print(dff.info())
x = dff.drop('Purchased', axis=1)
y = dff['Purchased']
print(y.head())

sss = ss()
xx = sss.fit_transform(x)

xtrain, xtest, ytrain, ytest = train_test_split(xx,
                                                y,
                                                test_size=0.3,
                                                random_state=101)

cm = knc(n_neighbors=3)
cm.fit(xtrain, ytrain)
pdata = cm.predict(xtest)

creport = cr(ytest, pdata)

print(creport)
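For reference, the aliases in this snippet beyond the ss import shown presumably map to:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.metrics import classification_report as cr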
Example #18
y = train_data['Survived']
X = train_data.drop(['Survived'], axis=1)

# ML

from sklearn.tree import DecisionTreeClassifier as dtc

model1 = dtc()
model1.fit(X, y)

from sklearn.neighbors import KNeighborsClassifier as knc

model2 = knc(n_neighbors=5)
model2.fit(X, y)

from sklearn.svm import SVC

model4 = SVC(C=1.0, kernel='rbf', degree=3)
model4.fit(X, y)

from sklearn.ensemble import RandomForestClassifier as rfc

model3 = rfc(n_estimators=100,
             max_depth=3,
             max_features=0.5,
             min_samples_leaf=32)
model3.fit(X, y)
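A quick comparison sketch for the four fitted models (the cross-validation is an addition, not part of the original notebook):

from sklearn.model_selection import cross_val_score

# Cross-validated accuracy for each of the model types fitted above.
for name, m in [('tree', model1), ('knn', model2),
                ('forest', model3), ('svm', model4)]:
    print(name, cross_val_score(m, X, y, cv=5).mean())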
def classify(s):
    import pandas as pd
    import numpy
    import pandas_montecarlo
    from scipy.stats import shapiro, kruskal, f_oneway
    from sklearn.ensemble import RandomForestClassifier as rfc
    from sklearn.neighbors import KNeighborsClassifier as knc
    from sklearn.svm import SVC as svc
    from sklearn.linear_model import LogisticRegression as lgr
    ## RandomForest Classifier with monte carlo simulated training set
    numpy.random.seed(s)

    #df = pd.read_csv("mc_test_data.csv")
    #df = pd.read_csv("rndf_filt_data.csv")
    df = pd.read_csv("data.csv")
    #random forest selected the following columns as most predictive
    df = df[['diagnosis','area_worst','concave points_mean','concave points_worst','perimeter_worst','radius_worst']]

    #print(df.head())
    #df = df.drop(["id","Unnamed: 32"],axis=1)
    #df = df.drop(["Unnamed: 0"],axis=1)
    df = df.replace({'diagnosis': "M"}, 1)
    df = df.replace({'diagnosis': "B"}, 0)

    #split dataset for mc seed and testing

    df_mc, df = numpy.split(df, [int(.7*len(df))])

    #split dataset by class
    #df_1 = pd.read_csv("mc_data_M.csv").drop(["Unnamed: 0"],axis=1)
    #df_0 = pd.read_csv("mc_data_B.csv").drop(["Unnamed: 0"],axis=1)
    df_1 = df_mc.loc[df_mc.diagnosis==1]
    df_0 = df_mc.loc[df_mc.diagnosis==0]
    df_1 = df_1.drop(["diagnosis"],axis=1)
    df_0 = df_0.drop(["diagnosis"],axis=1)

    #simulate class 0 data
    mc_sim_df_0 = pd.DataFrame()
    mc_sim_df_0['diagnosis']= ['0'] * len(df_0.index)
    for col in df_0.columns:
        col_sim = df_0[col].montecarlo(sims = 2, bust = 0, goal = 0).data
        col_sim = col_sim.drop(["original"],axis = 1)
        for col2 in col_sim.columns:
            mc_sim_df_0[col]=col_sim[col2]
            #if(shapiro(mc_sim_df_1[col])[1]>0.05):
                #print(kruskal(mc_sim_df_1[col],df_1[col]))
            #else:
                #print(f_oneway(mc_sim_df_1[col],df_1[col]))

    #simulate class 1 data
    mc_sim_df_1 = pd.DataFrame()
    mc_sim_df_1['diagnosis']= ['1'] * len(df_1.index)
    for col in df_1.columns:
        col_sim = df_1[col].montecarlo(sims = 2, bust = 0, goal = 0).data
        col_sim = col_sim.drop(["original"],axis = 1)
        for col2 in col_sim.columns:
            mc_sim_df_1[col]=col_sim[col2]
            #if(shapiro(mc_sim_df_1[col])[1]>0.05):
                #print(kruskal(mc_sim_df_1[col],df_1[col]))
            #else:
                #print(f_oneway(mc_sim_df_1[col],df_1[col]))


    #diag = mc_sim_df_1.append(mc_sim_df_0)['diagnosis']
    mc_sim_df = pd.concat([mc_sim_df_1, mc_sim_df_0])  # DataFrame.append was removed in pandas 2.x
    #shuffling dataframe for good luck
    #mc_sim_df = mc_sim_df.sample(frac=1)
    #mc_sim_df['diagnosis']=diag
    mc_sim_df.head(20)


    #values formatted
    labels = df["diagnosis"]
    df = df.drop("diagnosis",axis=1)
    dfDev, dfTes = numpy.split(df, [int(.7*len(df))])
    DDev, DTes = numpy.split(labels, [int(.7*len(labels))])

    #DTrn =  mc_sim_df['diagnosis']
    #dfTrn = mc_sim_df.drop(['diagnosis'], axis = 1)
    DTrn =  df_mc['diagnosis']
    dfTrn = df_mc.drop(['diagnosis'], axis = 1)
    
    scores = []

    #run each model and record dev-set accuracy
    models = [
        rfc(),                 #random forest
        knc(),                 #k nearest neighbours
        svc(kernel="linear"),  #linear SVM
        svc(kernel="rbf"),     #RBF SVM
        lgr(),                 #logistic regression
    ]
    for model in models:
        model = model.fit(dfTrn.values, DTrn)
        preds = model.predict(dfDev)  #avoid shadowing the pandas import with "pd"
        hit = 0
        for i in range(len(preds)):
            if int(preds[i]) == int(DDev.iloc[i]):
                hit += 1
        scores.append(hit / len(preds))

    return scores
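A usage sketch: since classify seeds numpy per call, averaging over several seeds gives a steadier picture of the five accuracies:

import numpy as np

# Mean dev-set accuracy per model (rfc, knc, linear svc, rbf svc, lgr).
all_scores = np.array([classify(s) for s in range(5)])
print(all_scores.mean(axis=0))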
Example #20
df.dropna(thresh=8, inplace=True)
df.shape  #players=1130

#set missings to 0
df.fillna(value=0, inplace=True)
df.isnull().sum()

#set explanatory and response variables
explanatory = [
    col for col in df.columns if col not in ['playerid', 'inducted', 'year']
]
df_exp = df[explanatory]
df_res = df.inducted

#KNN
knn = knc(p=2)  #specify Euclidean distance

param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_accuracy = gscv(knn, param_grid, cv=10,
                   scoring='accuracy').fit(df_exp, df_res)

param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_f1 = gscv(knn, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)

param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_auc = gscv(knn, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)

#Naive Bayes
nb = mnb()
nb_accuracy = cvs(nb, df_exp, df_res, cv=10, scoring='accuracy')
nb_f1 = cvs(nb, df_exp, df_res, cv=10, scoring='f1')
Example #21
sex = 'F'
scaler = StandardScaler()

data_partial = data[data['Sex'] == sex].drop('Sex', axis=1)
# corr_matrix_f, corr_matrix_m = data_f.corr(), data_m.corr()
# plot_corr_matrices(corr_matrix_f, corr_matrix_m)

y = data_partial['EmoState']
X = scaler.fit_transform(data_partial.drop('EmoState', axis=1))
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=71)

models = (('DTC', dtc()), ('SVM', svc(C=10)), ('KNN', knc(n_neighbors=10)),
          ('SGDC', sgdc()), ('GNBC', gnbc()), ('MLPC',
                                               mlpc(max_iter=1000,
                                                    learning_rate='adaptive')))
results = []
names = []
seed = 13
scoring = 'accuracy'

for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = model_selection.cross_val_score(model,
                                                 X_train,
                                                 y_train,
                                                 cv=kfold,
                                                 scoring=scoring)
Example #22
train_x, test_x, train_y, test_y = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

train_x.head()
train_y.head()
test_x.head()
test_y.head()

# KNN using sklearn
# Importing Knn algorithm from sklearn.neighbors

from sklearn.neighbors import KNeighborsClassifier as knc

# for 3 nearest neighbours
neighbour = knc(n_neighbors=3)

# Fitting with training data
neighbour.fit(train_x, train_y)

# train accuracy
train_acc = np.mean(neighbour.predict(train_x) == train_y)
train_acc  # 95.71%

# test accuracy
test_acc = np.mean(neighbour.predict(test_x) == test_y)
test_acc  # 93.55%

# for 5 nearest neighbours
neighbour = knc(n_neighbors=5)
Example #23
import sys
import sklearn
from classifier_utils import *
from sklearn.neighbors import KNeighborsClassifier as knc

if __name__ == '__main__' :
	if len( sys.argv ) > 3 :
		infilepath , k , dist = sys.argv[ 1: ]
		data = import_csv( infilepath )
		cf = knc( n_neighbors = int( k ) , metric = dist )
		stats = cross_validation( data , cf )
		print "PARAMS: K=%s , metric=%s" % ( k , dist )
		print_stats( stats )
	else :
		print "Usage python %s [csv_file] [neighbors] [distance]" % sys.argv[ 0 ]
Example #24
b = pd.read_csv('ft.csv',index_col=0)
b = np.array(b)

valid_data = valid_data * b.T
test_data = test_data * b.T
train_data = train_data * b.T
k = 101
nbs = 92
c = np.argsort(b,axis = 0)
for k in range(1,k,1):
    d = c[-k:-1,0]
    print(d)
    d = list(d)

    d.append(c[-1,0])

    td = train_data[:,d]
    tsd = test_data[:,d]
    vld = valid_data[:,d]
    #train_data = train_data[0:100,:]
    #train_class = train_class[0:100]
    for nb in range(91,nbs,1):    
        clf = knc(n_neighbors = nb)
        clf.fit(td,train_class)
        print()
        print(k, nb)
        print('scr')
        print(clf.score(tsd, test_class))
    
    
Example #25
sns.countplot(zoo['animal name'])

from sklearn.model_selection import train_test_split

train,test = train_test_split(zoo,test_size=0.2)

train.head()

train.shape

test.shape

from sklearn.neighbors import KNeighborsClassifier as knc

neigh = knc(n_neighbors = 3)

neigh

neigh.fit(train.iloc[:,2:17],train.iloc[:,17])

train_acc = np.mean(neigh.predict(train.iloc[:,2:17])==train.iloc[:,17])

train_acc

test_acc = np.mean(neigh.predict(test.iloc[:,2:17])==test.iloc[:,17])

test_acc

neigh1 = knc(n_neighbors = 5)
Example #26
def knn(test_mode=False, custom_data=False):

    results = {
        'test': {
            'accuracy': None,
            'confusion': None
        },
        'train': {
            'accuracy': [],
            'confusion': []
        },
        'best_model': None,
        'best_acc': 0
    }

    settings = {
        'cv_iter': 100,
        'cv_score': 'accuracy',
        'n_cv': 3,
        'n_folds': 10,
        'n_samples': 2000,
    }

    if test_mode:
        settings['n_samples'] = 100

    data_path = os.path.join(
        Path(__file__).resolve().parents[2], 'data', 'processed')

    if custom_data:
        data = np.load(os.path.join(data_path, 'vectors.npy'))
        X = data.item()['data']
        y = data.item()['labels']

        X_train = X[:60000, :]
        X_test = X[60000:, :]
        y_train = y[:60000]
        y_test = y[60000:]
        del X, y, data

        metric = cosine

    else:
        train_data = os.path.join(data_path, 'training.pt')
        test_data = os.path.join(data_path, 'test.pt')
        X_train, y_train = convert_ds_to_np(train_data)
        X_test, y_test = convert_ds_to_np(test_data)  # was train_data: the test set must come from test.pt

        metric = 'euclidean'

    X_train = X_train[:settings['n_samples'], :]
    y_train = y_train[:settings['n_samples']]
    X_test = X_test[:settings['n_samples'], :]
    y_test = y_test[:settings['n_samples']]

    # model set up using pipeline for randomized CV
    clf = knc(metric=metric, algorithm='brute')

    cv_opts = {'n_neighbors': randint(2, 10)}

    model = RandomizedSearchCV(clf,
                               cv_opts,
                               n_jobs=-1,
                               n_iter=settings['cv_iter'],
                               cv=settings['n_cv'],
                               scoring=settings['cv_score'])

    kf = StratifiedKFold(n_splits=settings['n_folds'], shuffle=True)

    for i, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
        X_trn = X_train[train_idx]
        X_vld = X_train[valid_idx]
        y_trn = y_train[train_idx]
        y_vld = y_train[valid_idx]

        model.fit(X_trn, y_trn)

        y_pred = model.predict(X_vld)
        this_acc = accuracy_score(y_pred, y_vld)
        results['train']['accuracy'].append(this_acc)
        results['train']['confusion'].append(confusion_matrix(y_pred, y_vld))

        print('[{}/{}]: this={} : best={}'.format(i + 1, settings['n_folds'],
                                                  this_acc,
                                                  results['best_acc']))
        if this_acc > results['best_acc']:
            results['best_acc'] = this_acc
            results['best_model'] = copy(model)

    # get test performance with best model:
    y_pred = results['best_model'].predict(X_test)
    results['test']['accuracy'] = accuracy_score(y_pred, y_test)
    results['test']['confusion'] = confusion_matrix(y_pred, y_test)

    return results
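convert_ds_to_np is external; a hedged sketch assuming the .pt files are old torchvision-style archives holding an (images, labels) tensor pair:

import torch

# Hypothetical convert_ds_to_np: load a (images, labels) .pt pair and return
# flattened float32 features with integer labels.
def convert_ds_to_np(path):
    images, labels = torch.load(path)
    X = images.numpy().reshape(len(images), -1).astype('float32')
    y = labels.numpy()
    return X, y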
Example #27
        output.append(-1)
    elif x > -0.5 and x < 0.5:
        output.append(0)
#%%

data_ml = pd.DataFrame(data2_norm_gen)

data_ml1 = data_ml.drop(['missed_book', "date"], axis=1)

from sklearn.model_selection import train_test_split as tts

x_train1, x_test1, y_train1, y_test1 = tts(data_ml1, output)

from sklearn.neighbors import KNeighborsClassifier as knc

cla = knc()

cla.fit(x_train1, y_train1)

pred = cla.predict(x_test1)

from sklearn.metrics import accuracy_score as ac

acc = ac(y_test1, pred)

#%% Plotting hexbin for data 2 and patterns

day_section = []

for i in data2.hour:
    if i >= 0 and i < 3:
# Import required libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn import metrics
import numpy as np

# Load dataset
iris = datasets.load_iris()

# Feature and label differentiation
x = iris.data
y = iris.target

# Set initial K value
k = 3

# Checks from 3 - 23 to assess best fit K value for the model.
# Iterates through KNN model creation to find best fit.
while k < 23:
    x_train, x_test, y_train, y_test = train_test_split(
        x, y,
        test_size=0.2)  # 0.2 value changed to 0.3 to test 30% test size also
    knn = knc(n_neighbors=k)
    knn.fit(x_train, y_train)
    test_prediction = knn.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, test_prediction)
    print(k, "=", accuracy)
    k += 2
# Code run through 10 times with test_size set to 0.2 and a further 10 times with test_size set to 0.3
# Results saved to tables. See Images/k_values.PNG
Example #29
y = []
for line in infile:
    y += [line]
infile.close()

#Get X and Y
train_x = pd.DataFrame(data)
train_y = pd.Series(y)

##########################################################################
#kNN
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.metrics import confusion_matrix as cm

#set up and fit
neigh = knc()
neigh.fit(train_x, train_y)

#Test the training set
train_pred = neigh.predict(train_x)

#confusion matrix
confusion = cm(train_y, train_pred)
fun = lambda x: x / sum(x)
cm_perc = np.apply_along_axis(fun, 1, confusion)

# array([[0.60714286, 0.15079365, 0.15873016, 0.08333333],
#        [0.09896907, 0.78762887, 0.06597938, 0.04742268],
#        [0.21988528, 0.12810707, 0.56978967, 0.08221797],
#        [0.23360656, 0.17213115, 0.23155738, 0.36270492]])
Example #30
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.preprocessing import scale

df = pd.read_csv('wine.data', index_col=None)
target = df[df.columns[0]]
x = df.drop(df.columns[0], axis=1)
# sklearn.cross_validation was removed; model_selection.KFold takes n_splits instead
kf = KFold(n_splits=5, shuffle=True, random_state=42)
unscaled_result = list()
for i in range(1, 50):
    cs_result = cross_val_score(knc(n_neighbors=i), X=x, y=target, cv=kf)
    unscaled_result.append(cs_result.mean())
max_unscaled = max(unscaled_result)
print(unscaled_result.index(max_unscaled) + 1)
print(max_unscaled)
scaled_x = scale(X=x)
scaled_result = list()
for i in range(1, 50):
    cs_result = cross_val_score(knc(n_neighbors=i),
                                X=scaled_x,
                                y=target,
                                cv=kf)
    scaled_result.append(cs_result.mean())
max_scaled = max(scaled_result)
print(scaled_result.index(max_scaled) + 1)
print(max_scaled)
#drop 27 players where all B/P/F stats are missing
df.dropna(thresh=8, inplace=True) 
df.shape #players=1130

#set missings to 0
df.fillna(value=0, inplace=True)
df.isnull().sum()

#set explanatory and response variables
explanatory = [col for col in df.columns if col not in ['playerid', 'inducted','year']]
df_exp = df[explanatory]
df_res = df.inducted

#KNN
knn=knc(p = 2) #specify Euclidean distance

param_grid = dict(n_neighbors=range(1,30, 2)) #set up grid for results
kn_accuracy=gscv(knn, param_grid, cv=10, scoring='accuracy').fit(df_exp, df_res)

param_grid = dict(n_neighbors=range(1,30, 2)) #set up grid for results
kn_f1=gscv(knn, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)

param_grid = dict(n_neighbors=range(1,30, 2)) #set up grid for results
kn_auc=gscv(knn, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)

#Naive Bayes
nb = mnb()
nb_accuracy = cvs(nb, df_exp, df_res, cv=10, scoring='accuracy')
nb_f1 = cvs(nb, df_exp, df_res, cv=10, scoring='f1')
nb_auc = cvs(nb, df_exp, df_res, cv=10, scoring='roc_auc')
Example #32
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')

print(classification_report(y_test, y_pred))

#applying k-fold cross validation
from sklearn.model_selection import cross_val_score as cvs
accuracies = cvs(estimator=classifier,X=x_train,y=y_train,cv=10)
print(accuracies.mean())
print(accuracies.std())

"""K-NN"""

from sklearn.neighbors import KNeighborsClassifier as knc
classifier=knc(n_neighbors=10,metric='minkowski',p=2)
classifier.fit(x_train, y_train)

#predicting the test set results
y_pred=classifier.predict(x_test)

from sklearn.metrics import confusion_matrix, classification_report

cm=confusion_matrix(y_test, y_pred)
plt.figure(figsize = (5,5))
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')

print(classification_report(y_test, y_pred))
Example #33
valid_class = np.ravel(np.array(valid_class))

print(train_data.shape)
print(test_data.shape)
print(valid_data.shape)


'''
b = pd.read_csv('ft.csv',index_col=0)
b = np.array(b)

valid_data = valid_data * b.T
test_data = test_data * b.T
train_data = train_data * b.T
'''
#train_data = train_data[0:100,:]
#train_class = train_class[0:100]
# note: the svc/svm names below were left over from an SVM example; this tunes a KNN
knn_est = knc()
nb = range(31, 52, 5)
knn_parameters = {'n_neighbors': nb}

clf = gsc(knn_est, knn_parameters)

clf.fit(train_data, train_class)


print(clf.cv_results_)  # grid_scores_ was removed in newer sklearn
print(clf.best_score_)
print(clf.best_estimator_)
print(clf.best_params_)
import numpy as np
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.metrics import accuracy_score
from time import time

X = np.array([[2, 5], [3, 6], [1, 7], [1, 2], [4, 3], [6, 8], [7, 3], [6, 1],
              [8, 7], [9, 3]])
Y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2])

startTime = time()
clf = knc(n_neighbors=3)
clf.fit(X, Y)
pred = clf.predict([[0, 1]])
print(pred)

testX = np.array([[1, 9], [3, 1], [4, 7], [6, 5], [5, 5], [7, 9]])
testY = np.array([1, 1, 1, 2, 2, 2])

pred = clf.predict(testX)
print("Accuracy ",
      accuracy_score(testY, pred) * 100)

print("Time ", round(time() - startTime, 3), "sec")
Example #35
                       dict(zip(range(nodes_n), [[it] for it in dm.index])))
names = dict(zip(range(nodes_n), dm.index))
G = nx.relabel_nodes(G, names)
mst = nx.minimum_spanning_tree(G)
mst_json = json_graph.node_link_data(mst)
for each in mst_json['links']:
    each['type'] = 'mst'

#mst_json['links'] = []
# If you uncomment the line above, only the KNN edges are displayed; otherwise the MST edges are drawn as well.

# KNN part
# KNN config
nearest_num = 1
###
M = knc(weights='distance', metric='precomputed')
M.fit(dm.values.tolist(), list(dm.index))

query_dict = {}
for _idx, name in enumerate(mst_json['nodes']):
    name = name['id']
    query_dict[name] = _idx

for _idx, name in enumerate(list(dm.index)):
    temp = M.kneighbors(np.array(dm.values.tolist()[_idx]).reshape(1, -1),
                        n_neighbors=nearest_num + 1)
    for num in range(nearest_num):
        links = {
            'source': query_dict[name],
            'target': query_dict[list(dm.index)[temp[1][0][num + 1]]],
            'weight': temp[0][0][num + 1],
    G, {_i: list(distance.columns)[_i]
        for _i in range(len(G.nodes()))})
# The graph above has no labels yet; we need to assign them.
mst_G = nx.mst.minimum_spanning_tree(G)
# generate a new graph which is MST version of ori G.
for edge in mst_G.edges():
    draw_data.append(
        go.Scatter(
            x=[vals[names.index(edge[0]), 0], vals[names.index(edge[1]), 0]],
            y=[vals[names.index(edge[0]), 1], vals[names.index(edge[1]), 1]],
            mode='lines',
            showlegend=False,
            line=dict(color=MST_color, width=MST_width)))

# KNN
M = knc(n_neighbors=3, weights='distance', metric='precomputed')
M.fit(distance.values.tolist(), list(distance.index))

# As before, the model has no labels, so the whole row is used below to represent each sample.
for _idx in range(len(names)):
    temp = M.kneighbors(distance.values.tolist()[_idx], n_neighbors=2)[1]
    # n_neighbors=2 asks for the single closest sample plus the point itself, which is always its own nearest neighbour.

    # temp is a nested index structure.
    # the final [1:] drops the point itself; entries run from nearest to farthest.
    for _x in temp[0][1:]:
        if type(dot_color) == list:
            current_color = dot_color[_idx]
        else:
            current_color = ['#000000']
        draw_data.append(
Example #37
#XTRAINN=np.array(XTRAINN)
#

#xtr,xts,ytr,yts=train_test_split(XTRAINN,Y=YTRAINN,test_size=0.2,random_state=1)

#svc_op.fit(xtrain2,ytrain2)
#acc=[]
#acc=cvs(estimator=svc_op,X=xts,y=yts,cv=3)
#acc.mean()
#

y_stpred_svc=svc_op.predict(xtest2)

#KNN
from sklearn.neighbors import KNeighborsClassifier as knc
knn=knc()
param=[{'n_neighbors':[240,250,230],'p':[2],'metric':['minkowski']}]
gs_knn=GridSearchCV(estimator=knn,param_grid=param,verbose=5,n_jobs=-1,cv=10)
gs_knn.fit(xtrain2,ytrain2)#try later.....this skips fitting on kfold data

gs_knn.best_params_
gs_knn.best_score_

knnf=knc(n_neighbors=230 ,p=2 ,metric='minkowski')
knnf.fit(xtrain2,ytrain2)

# score notes for different n_neighbors values:
# 200 = 59.82
# 300 = 59.881
# 250 = 59.94

#knnf.fit(xtrain2,ytrain2)