def makeGraph(relPath, columns, resultColumn, k, tp):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, k, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)
    ks = [1, 2, 3, 5, 7, 9, 11, 13, 15]
    means = []
    for j in ks:
        print("Using k = " + str(j))
        correctPercentage = 0
        for i in range(len(trainingSets)):
            tset = []
            aset = []
            for index, row in dataSet.iterrows():
                tupla = (row[resultColumn], index)  # (label, index) key used by the k-fold splitter
                if tupla in trainingSets[i]:
                    tset.append(row.tolist())
                if tupla in avaliationSets[i]:
                    aset.append(row.tolist())
            knn = Knn(tset, j, tp=tp)  # renamed from `k`, which shadowed the fold-count parameter
            correctPercentage += knn.test(aset)
        generalMean = correctPercentage / len(trainingSets)
        means.append(generalMean)
    matplotlib.pyplot.plot(ks, means)
    matplotlib.pyplot.show()
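# Hedged usage sketch (the file path, column list, and the meaning of `tp`
# are my assumptions, not from the source): 10-fold CV, plotting the mean
# accuracy for each swept value of k.
makeGraph("data/dataset.csv", ["f1", "f2", "class"], "class", 10, 2)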
def _makeGraph(relPath, columns, resultColumn):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, 10, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)
    ks = [1, 3]
    nPrototypes = [3, 5, 10, 20]
    for k in ks:
        meansGeral = []
        meansFalse = []
        meansTrue = []
        for j in nPrototypes:
            correctnessPercentage = 0
            correctTrue = 0
            correctFalse = 0
            for i in range(len(trainingSets)):
                print("\n --------- FOLD " + str(i + 1) + " ----------------")
                tset = []
                aset = []
                for index, row in dataSet.iterrows():
                    tupla = (row[resultColumn], index)
                    if tupla in trainingSets[i]:
                        tset.append(row.tolist())
                    if tupla in avaliationSets[i]:
                        aset.append(row.tolist())
                # compress the fold's training set to j prototypes, then run KNN on them
                lvq = LVQ3(tset, resultColumn)
                newtset = lvq.run(nPrototypes=j)
                kn = Knn(newtset, k)
                result = kn.test(aset)
                correctnessPercentage += result[0]
                classErrors = result[1]
                classNumbers = result[2]
                correctFalse += (classErrors[False] / classNumbers[False]) if False in classErrors else 0
                correctTrue += (classErrors[True] / classNumbers[True]) if True in classErrors else 0
            meansGeral.append(correctnessPercentage / len(trainingSets))
            meansFalse.append(correctFalse / len(trainingSets))
            meansTrue.append(correctTrue / len(trainingSets))
        # one accuracy-vs-prototype-count plot per k
        plt.ylim(0, 1)
        plt.plot(nPrototypes, meansGeral, 'r', label='general')
        plt.plot(nPrototypes, meansFalse, 'g', label='false')
        plt.plot(nPrototypes, meansTrue, 'b', label='true')
        plt.legend(loc='upper left')
        plt.show()
def _LVQ3(relPath, columns, resultColumn):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, 10, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)
    for i in range(len(trainingSets)):
        print("\n --------- FOLD " + str(i + 1) + " ----------------")
        tset = []
        aset = []
        for index, row in dataSet.iterrows():
            tupla = (row[resultColumn], index)
            if tupla in trainingSets[i]:
                tset.append(row.tolist())
            if tupla in avaliationSets[i]:
                aset.append(row.tolist())
        print("------------- SIMPLE KNN ----------------")
        knn = Knn(tset, 3)
        knn.test(aset)
        # same fold, but with the training set compressed to LVQ3 prototypes
        lvq = LVQ3(tset, resultColumn)
        newtset = lvq.run()
        print("-------------- LVQ3 ----------------------")
        knn = Knn(newtset, 3)
        knn.test(aset)
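# Hedged usage sketch (path and column list are placeholders, not from the
# source): per-fold comparison of plain KNN vs KNN on LVQ3 prototypes.
_LVQ3("data/dataset.csv", ["f1", "f2", "class"], "class")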
def compare(filename):  # filename will be Tp1_data.csv
    showPlots = False
    Xs, Ys = get_data(filename)
    X_r, X_t, Y_r, Y_t = train_test_split(Xs, Ys, test_size=0.33, stratify=Ys)
    folds = 5
    Kf = StratifiedKFold(Y_r, n_folds=folds)
    KnnErr, bestN, KnnPred = Knn(Kf, X_r, Y_r, X_t, Y_t, showPlots)
    print("KnnErr, best_N:", KnnErr, bestN)
    LogScore, bestC, LogPred = Logistic(Kf, X_r, Y_r, X_t, Y_t, showPlots)
    print("LogisticScore, best_C:", LogScore, bestC)
    NBScore, bestBandwidth, NBPred = NaiveBayes(Kf, X_r, Y_r, X_t, Y_t, showPlots)
    print("NBScore, best_Bandwidth:", NBScore, bestBandwidth)
    # McNemar statistic: (|e01 - e10| - 1)^2 / (e01 + e10)
    MCNemarKnn_Log = MCNemar(KnnPred, LogPred, Y_t)
    MCNemarNB_Log = MCNemar(NBPred, LogPred, Y_t)
    MCNemarNB_Knn = MCNemar(KnnPred, NBPred, Y_t)
    print()
    print("McNemar:")
    print("MCNemarKnn_Log", MCNemarKnn_Log)
    print("MCNemarNB_Log", MCNemarNB_Log)
    print("MCNemarNB_Knn", MCNemarNB_Knn)
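# Hedged sketch (an assumption, not the original MCNemar implementation):
# the continuity-corrected McNemar statistic computed from two classifiers'
# test-set predictions, matching the formula in the comment above.
import numpy as np

def mcnemar_statistic(pred_a, pred_b, y_true):
    pred_a, pred_b, y_true = map(np.asarray, (pred_a, pred_b, y_true))
    e01 = np.sum((pred_a != y_true) & (pred_b == y_true))  # only A wrong
    e10 = np.sum((pred_a == y_true) & (pred_b != y_true))  # only B wrong
    if e01 + e10 == 0:
        return 0.0  # the classifiers never disagree; no evidence either way
    return (abs(e01 - e10) - 1) ** 2 / (e01 + e10)  # ~ chi-squared, 1 dof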
def knnTest(feature_len, all_lines, all_features, all_labels):
    counts = {}
    for i in range(10):
        print("Test %d:" % (i + 1))
        # 80/20 train/test split of the current draw of the data
        train_features = all_features[0:int(0.8 * len(all_features))]
        train_labels = all_labels[0:int(0.8 * len(all_features))]
        test_features = all_features[int(0.8 * len(all_features)):]
        test_labels = all_labels[int(0.8 * len(all_features)):]
        length = len(test_labels)
        for k in range(1, 5):
            rate = 0
            print("k = %d: " % k, end=" ")
            for j in range(length):
                res = Knn(train_features, train_labels, test_features[j], k)
                if res == test_labels[j]:
                    rate += 1
            print(rate / length)
            if k not in counts:
                counts[k] = rate / length
            else:
                counts[k] += rate / length
        # redraw features and labels for the next round
        all_features, all_labels = now_provider.getFeatureAndLabel(all_lines, feature_len)
    # accumulated accuracy over the 10 runs, per k
    for x in counts:
        print(x, counts[x])
def simpleKnn(relPath, columns, resultColumn, k, tp):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, k, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)
    for i in range(len(trainingSets)):
        tset = []
        aset = []
        for index, row in dataSet.iterrows():
            tupla = (row[resultColumn], index)
            if tupla in trainingSets[i]:
                tset.append(row.tolist())
            if tupla in avaliationSets[i]:
                aset.append(row.tolist())
        knn = Knn(tset, 1, tp=tp)  # renamed from `k`, which shadowed the fold-count parameter
        knn.test(aset)
def Prediction(trainUser, testUser, trainBook, numberof_k, smilarityname, predictdict):
    MeanAbsolute = 0
    predictsum = 0
    totalpredict = len(testUser)
    weightedMeanAbsolute = 0
    if smilarityname == "cosine":
        TrainSquares = squareforcos(trainUser)
    for testkey in testUser:
        if smilarityname == "cosine":
            cossim = cosineSim(testUser[testkey], trainUser, trainBook, TrainSquares)
        if smilarityname == "correlation":
            cossim = correlation(testUser[testkey], trainUser, trainBook)
        if smilarityname == "adjcosine":
            cossim = adjCosineSim(testUser[testkey], trainUser, trainBook)
        if len(cossim) != 0:
            # print("\nTEST ID ->", testkey, " With K=", numberof_k, "neighbours", len(cossim))
            mean, weightedmean, prediction = Knn(cossim, numberof_k, trainBook, predictdict[testkey])
            MeanAbsolute += mean
            weightedMeanAbsolute += weightedmean
            predictsum += prediction
        else:
            # If the test user has no item in common with the training data,
            # fall back to predicting the user's own mean rating.
            lenisbn = len(testUser[testkey])
            predictsum += lenisbn
            testusersum = sum(testUser[testkey].values()) / lenisbn
            testMean = 0
            testWmean = 0
            for key, value in testUser[testkey].items():
                predictdict[testkey][key][1], predictdict[testkey][key][2] = round(testusersum), round(testusersum)
                testMean += abs(value - testusersum)
                testWmean += abs(value - testusersum)
            testMean = testMean / lenisbn
            testWmean = testWmean / lenisbn
            MeanAbsolute += testMean
            weightedMeanAbsolute += testWmean
    MeanAbsolute = MeanAbsolute / totalpredict
    weightedMeanAbsolute = weightedMeanAbsolute / totalpredict
    print("\nSimilarity function:", smilarityname,
          "\nNumber of neighbours ==", numberof_k,
          "\nMEAN ABSOLUTE ERROR:", MeanAbsolute,
          "\nWeighted MEAN ABSOLUTE ERROR:", weightedMeanAbsolute,
          "\nTotal predictions:", totalpredict)
    return MeanAbsolute, weightedMeanAbsolute
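# Hedged usage sketch (the data shapes are my assumption: trainUser/testUser
# map user id -> {isbn: rating}, and predictdict mirrors testUser with
# prediction slots per book): sweep the three similarity measures at k = 10.
import copy

for name in ("cosine", "correlation", "adjcosine"):
    mae, wmae = Prediction(trainUser, testUser, trainBook, 10, name,
                           copy.deepcopy(predictdict))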
def main():
    # ASSERTS
    knn_numeric = Knn.Knn_numeric(0)
    filename = 'data/dataset1-1.csv'
    comp_number = 1  # number of principal components
    # PCA
    # pca_inst = pca.PCA()
    # (data, target, all_data) = pca_inst.get_data(filename, comp_number)
    # LDA
    lda_inst = lda.LDA()
    (data, target, all_data) = lda_inst.get_data(filename, comp_number)
    # accuracy per neighbour count
    (k, values) = knn_numeric.get_acuraccy_by_neighbor(data, target, all_data)
    print(list(zip(k, values)))
def compareTest(feature_len, all_lines, all_features, all_labels):
    count = {}
    for i in range(10):
        print("\nTest %d" % (i + 1))
        train_features = all_features[0:int(0.8 * len(all_features))]
        train_labels = all_labels[0:int(0.8 * len(all_features))]
        test_features = all_features[int(0.8 * len(all_features)):]
        test_labels = all_labels[int(0.8 * len(all_features)):]
        length = len(test_labels)

        rate = 0
        print("NaiveBayes : ", end="")
        new_bayes = NaiveBayes(train_features, train_labels, feature_len)
        new_bayes.train()
        for j in range(length):
            res = new_bayes.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        count["NaiveBayes"] = count.get("NaiveBayes", 0) + rate / length

        rate = 0
        print("KNN : ", end="")
        for j in range(length):
            res = Knn(train_features, train_labels, test_features[j], 3)
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        count["KNN"] = count.get("KNN", 0) + rate / length

        rate = 0
        print("Logistic : ", end="")
        new_logistic = Logistic(train_features, train_labels, feature_len, alpha=5, tol=0.000001)
        new_logistic.train()
        for j in range(length):
            res = new_logistic.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        count["Logistic"] = count.get("Logistic", 0) + rate / length

        rate = 0
        print("NeuralNetwork : ", end="")
        new_NN = NeuralNetwork(train_features, train_labels, feature_len, hidden_num=32, learn_rate=100)
        new_NN.train()
        for j in range(length):
            res = new_NN.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        count["NeuralNetwork"] = count.get("NeuralNetwork", 0) + rate / length

        rate = 0
        print("Tree : ", end="")
        new_tree = Tree(train_features, train_labels, len(train_features[0]), 3, 8)
        new_tree.train()
        for j in range(length):
            res = new_tree.predictTree(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        count["Tree"] = count.get("Tree", 0) + rate / length

        rate = 0
        print("AdaBoost : ", end="")
        new_boost = AdaBoost(train_features, train_labels, len(train_features[0]), 28, mode=2)
        new_boost.train()
        for j in range(length):
            res = new_boost.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        count["AdaBoost"] = count.get("AdaBoost", 0) + rate / length

        rate = 0
        print("RandomForest : ", end="")
        new_forest = RandomForest(30)
        new_forest.buildTrees(train_features, train_labels, len(train_features[0]), 3, 6)
        for j in range(length):
            res = new_forest.predictForest(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        count["RandomForest"] = count.get("RandomForest", 0) + rate / length

        rate = 0
        print("SVM : ", end="")
        new_svm = SVM(train_features, train_labels, C=43, function='RBF', d=0.53)
        new_svm.train()
        for j in range(length):
            res = new_svm.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        count["SVM"] = count.get("SVM", 0) + rate / length

        # redraw features and labels for the next round
        all_features, all_labels = now_provider.getFeatureAndLabel(all_lines, feature_len)

    print("\nAverage:")
    for x in count:
        print(x, end=": ")
        print(count[x] / 10)
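# Hedged refactoring sketch (my suggestion, not part of the original code):
# the eight train/score blocks above share one shape, so a small helper
# could collapse them; `predict_fn` absorbs the differing method names
# (predict / predictTree / predictForest).
def _score(name, predict_fn, test_features, test_labels, count):
    correct = sum(1 for f, l in zip(test_features, test_labels) if predict_fn(f) == l)
    rate = correct / len(test_labels)
    print("%s : %s" % (name, rate))
    count[name] = count.get(name, 0) + rate
    return rate

# e.g. _score("KNN", lambda f: Knn(train_features, train_labels, f, 3),
#             test_features, test_labels, count)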
### RANDOM FOREST ###
rf = RandomForest(X_train, y_train)
rf.predict(X_test)
rf.set_prediction_data()
#rf.plot_num_deaths_per_age()
#rf.plot_num_deaths_per_gender()
#rf.ageScore(age)
#rf.genderScore(gender)
#rf.deathScore(death)
###############################

### KNN ###
knn = Knn(X_train, y_train)
knn.predict(X_test)
knn.set_prediction_data()
#knn.plot_num_patient_neg_summary_based_on_gender()
#knn.plot_num_patient_neg_summary_baseg_on_age()
#knn.plot_num_patient_neg_summary_based_on_is_from_wuhan()
#knn.ageScore(age)
#knn.genderScore(gender)
#knn.deathScore(death)
###################################

T_1 = df.drop(columns=[
    'reporting date', 'summary', 'location', 'country', 'symptom', 'death'
])
.format(X_test.size, X_test.shape, X_test[0]))
print(
    "Testing target (y_target) has {} elements\ny_test.shape = {} --> A single array of 30 elements\n"
    .format(y_test.size, y_test.shape))
print("target_labels = {}".format(y_test))

### view data
# plt.figure()
# plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap, edgecolor='k', s=20)
# plt.show()

# a = [1, 1, 1, 2, 2, 3, 4, 5, 6]
# from collections import Counter
# most_common = Counter(a).most_common(1)
# print(most_common[0][0])

from KNN import Knn

# instantiate a Knn classifier (clf), passing in the number of neighbors (default is 3)
clf = Knn(k=5)
clf.fit(X_train, y_train)  # pass the training data to your Knn classifier
predictions = clf.predict(X_test)
# predict() algorithm...
# 1) calculate the distance between X_test and every training data entry
# 2) find the k nearest training entries, i.e. the smallest Euclidean distances
# 3) match the k nearest entries against the flower-class target array and select the most common class
acc = np.sum(predictions == y_test) / len(y_test)
print("accuracy = {}".format(acc))
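# Hedged sketch of what the imported Knn class might look like (an assumption
# built from the three predict() steps described above, not the actual KNN.py):
import numpy as np
from collections import Counter

class KnnSketch:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        # lazy learner: just memorize the training data
        self.X_train, self.y_train = np.asarray(X), np.asarray(y)

    def predict(self, X):
        return np.array([self._predict_one(x) for x in np.asarray(X)])

    def _predict_one(self, x):
        # 1) Euclidean distance to every training entry
        dists = np.linalg.norm(self.X_train - x, axis=1)
        # 2) indices of the k nearest entries
        nearest = np.argsort(dists)[:self.k]
        # 3) majority vote over their labels
        return Counter(self.y_train[nearest]).most_common(1)[0][0]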