def computePredictions(userIds, userIdsTest, userData, userDataTest,
                       userItemMatrix, userClusters, userClustersTest, metric):
    """Predict item scores for test users from same-cluster training users.

    For every test user, the neighbours are the training users that fall in
    the same cluster and whose id differs from the test user's id.  Each
    item score is the sum over those neighbours of
    ``rating * similarity(test user, neighbour)``.  The sum is NOT
    normalised by the total similarity (that normalisation was disabled in
    the original code), so scores are only comparable within one user row.

    Args:
        userIds: ids of the training users, aligned with ``userClusters``.
        userIdsTest: ids of the test users, aligned with ``userDataTest``.
        userData: feature rows of the training users.
        userDataTest: feature rows of the test users (needs ``.shape``).
        userItemMatrix: training user x item ratings (needs ``.shape``).
        userClusters: cluster label per training user.
        userClustersTest: cluster label per test user.
        metric: forwarded unchanged to ``similarity``.

    Returns:
        ``numpy.ndarray`` of shape (number of test users, number of items).
    """
    predictions = np.zeros((userDataTest.shape[0], userItemMatrix.shape[1]))
    itemCount = predictions.shape[1]

    for userIndex in range(predictions.shape[0]):
        neighbors = [
            i for i, cluster in enumerate(userClusters)
            if userIds[i] != userIdsTest[userIndex]
            and cluster == userClustersTest[userIndex]
        ]

        # The similarity depends only on the (test user, neighbour) pair, so
        # compute it once per neighbour instead of once per neighbour per item.
        weights = [
            similarity(userDataTest[userIndex], userData[neighborIndex], metric)
            for neighborIndex in neighbors
        ]

        for itemIndex in range(itemCount):
            ratingSum = 0
            for neighborIndex, sim in zip(neighbors, weights):
                ratingSum += userItemMatrix[neighborIndex][itemIndex] * sim
            predictions[userIndex][itemIndex] = ratingSum

    return predictions
def main():
    """Run a K=5 nearest-neighbour classification over the MNIST test frame.

    Reads the module-level ``MNISTtrain_df`` / ``MNISTtest_df`` frames, takes
    column 0 as the label and everything except the ``'label'`` column as the
    image data, and for every test row asks ``subtractor`` for a frame whose
    first K ``'label'`` entries are used as the neighbours (presumably sorted
    by distance -- confirm against ``subtractor``).  ``compute_mode`` turns
    those labels into the prediction.  Progress and the final accuracy are
    printed; nothing is returned.
    """
    K = 5
    train_labels = MNISTtrain_df.iloc[:, 0]  # label data
    train_images = MNISTtrain_df.drop('label', axis=1)  # pixel values for image
    test_labels = MNISTtest_df.iloc[:, 0]  # label data
    test_images = MNISTtest_df.drop('label', axis=1)  # pixel values for image

    num_correct = 0  # number of correct predictions so far
    for index, row in test_images.iterrows():
        print(index)
        df = subtractor(row.values, train_images, train_labels)
        print(df)
        # Labels of the first K rows of the frame returned by subtractor.
        neighbors = [df['label'][i] for i in range(K)]
        print(neighbors)
        prediction = compute_mode(neighbors)
        print('Expected ' + str(test_labels[index]) + ', Got ' +
              str(prediction) + " for K = " + str(K))
        if test_labels[index] == prediction:
            num_correct = num_correct + 1

    print("Number correct is ", num_correct)
    # Divide by the number of rows actually evaluated instead of a
    # hard-coded 50, so the figure stays right if the test set changes size.
    num_tested = len(test_images)
    accur = 100.0 * num_correct / num_tested if num_tested else 0.0
    print('Accuracy of KNN was ' + str(accur) + '% with k equal to ' +
          str(K))
def knn(self, trainingSet, testInstance, k):
    """Find the ``k`` nearest training rows and tally their class labels.

    Args:
        trainingSet: pandas DataFrame; the last column (by position) is
            read as the class label of each row.
        testInstance: object with ``.shape[1]``; that value is passed to
            ``self.euclideanDistance`` as the length argument.
        k: number of neighbours to use.  If it exceeds the number of
            training rows, all rows are used; values <= 0 give no neighbours.

    Returns:
        ``(sortedVotes, neighbors)`` where ``sortedVotes`` is a list of
        ``(label, count)`` pairs ordered by descending count and
        ``neighbors`` is the list of row positions, nearest first.
    """
    print(k)
    length = testInstance.shape[1]

    distances = {}
    for x in range(len(trainingSet)):
        dist = self.euclideanDistance(testInstance, trainingSet.iloc[x], length)
        distances[x] = dist[0]

    sorted_d = sorted(distances.items(), key=lambda item: item[1])
    # Slicing clamps to the available rows instead of raising IndexError.
    neighbors = [position for position, _ in sorted_d[:max(k, 0)]]

    classVotes = {}
    for position in neighbors:
        # Positional access: plain ``row[-1]`` is a *label* lookup and fails
        # (or is deprecated) depending on how the columns are named.
        response = trainingSet.iloc[position].iloc[-1]
        classVotes[response] = classVotes.get(response, 0) + 1

    sortedVotes = sorted(classVotes.items(), key=lambda item: item[1], reverse=True)
    return (sortedVotes, neighbors)
def calculateNeighbors(data, encodings):
    """Attach neighbour and cluster-membership lists to ``data`` in place.

    Three columns are written to ``data``:

    * ``'neighbors'``: indices returned by a ball-tree ``NearestNeighbors``
      query of ``encodings`` against itself, using at most 50 neighbours.
    * ``'UMAP_neighbors'``: indices returned by
      ``umap.umap_.nearest_neighbors`` (20 neighbours) on the UMAP embedding
      of ``encodings``.
    * ``'cluster_list'``: for each row, the index labels of all rows sharing
      its ``'HDBSCAN_clusters'`` value, or ``[]`` when that value is -1.

    Args:
        data: pandas DataFrame with an ``'HDBSCAN_clusters'`` column;
            assumed to have one row per row of ``encodings``, in the same
            order -- TODO confirm against the caller.
        encodings: feature vectors passed to scikit-learn / UMAP.

    Returns:
        None; ``data`` is modified in place.
    """
    print("[INFO] calculating neighbours")

    n_neighbors = min(50, int(data.shape[0]))
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(encodings)
    _, neighbor_indices = nbrs.kneighbors(encodings)

    reducer = umap.UMAP(n_neighbors=2, min_dist=0.2, metric='euclidean',
                        random_state=42).fit(encodings)
    embedding = reducer.transform(encodings)
    knn_indices, _, _ = umap.umap_.nearest_neighbors(
        embedding, n_neighbors=20, metric='euclidean', metric_kwds={},
        angular=False, random_state=np.random.RandomState(42))

    # Index the result arrays by row position: they are positional, so using
    # the frame's index labels only works when the index is 0..n-1.
    row_count = len(data)
    data['neighbors'] = [neighbor_indices[pos].tolist() for pos in range(row_count)]
    data['UMAP_neighbors'] = [knn_indices[pos].tolist() for pos in range(row_count)]

    # Group once instead of filtering the whole frame again for every row.
    members = {
        cluster_id: labels.tolist()
        for cluster_id, labels in data.groupby('HDBSCAN_clusters').groups.items()
    }
    data['cluster_list'] = [
        # Fresh list per row so cells do not alias one another.
        list(members.get(cluster_id, [])) if cluster_id != -1 else []
        for cluster_id in data['HDBSCAN_clusters']
    ]
def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` rows of ``train`` closest to ``test_row``.

    Closeness is whatever ``euclidean_distance(test_row, train_row)``
    returns; rows are ranked by ascending value and ties keep their order
    in ``train``.  Asking for more rows than ``train`` holds raises
    ``IndexError``.
    """
    scored = [
        (candidate, euclidean_distance(test_row, candidate))
        for candidate in train
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[position][0] for position in range(num_neighbors)]
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` entries of ``trainingSet`` nearest to ``testInstance``.

    ``euclideanDistance`` is called with ``len(testInstance) - 1`` as its
    length argument (presumably so a trailing label field is ignored --
    confirm against ``euclideanDistance``).  Results are ordered nearest
    first; ``IndexError`` is raised when ``k`` exceeds ``len(trainingSet)``.
    """
    feature_count = len(testInstance) - 1

    scored = []
    for idx in range(len(trainingSet)):
        candidate = trainingSet[idx]
        gap = euclideanDistance(testInstance, candidate, feature_count)
        scored.append((candidate, gap))

    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[position][0] for position in range(k)]
def getNearestNeighbors(data, datapoint, k):
    """Return the ``k`` nearest entries of ``data`` and their distances.

    Distances come from ``euclideanDistance(datapoint, entry,
    len(datapoint))``.

    Returns:
        ``(neighbors, edist)``: two parallel lists, nearest first, holding
        the selected entries and their distances.  ``IndexError`` is raised
        when ``k`` exceeds ``len(data)``.
    """
    dims = len(datapoint)

    scored = []
    for idx in range(len(data)):
        entry = data[idx]
        scored.append((entry, euclideanDistance(datapoint, entry, dims)))
    scored.sort(key=lambda pair: pair[1])

    nearest = [scored[rank] for rank in range(k)]
    neighbors = [entry for entry, _ in nearest]
    edist = [gap for _, gap in nearest]
    return neighbors, edist
def getNeighbors(trainingSet, testInstance, k):
    """Pick the ``k`` training instances with the smallest distance.

    The length handed to ``euclideanDistance`` is ``len(testInstance) - 1``
    (presumably the last field is a label -- verify against the helper).
    Raises ``IndexError`` if ``k`` is larger than ``len(trainingSet)``.
    """
    length = len(testInstance) - 1

    # Distance from the test instance to every training instance.
    distances = [
        (trainingSet[i], euclideanDistance(testInstance, trainingSet[i], length))
        for i in range(len(trainingSet))
    ]

    # Order all pairs by ascending distance.
    distances.sort(key=lambda entry: entry[1])

    # Keep the instances of the k closest pairs.
    neighbors = []
    rank = 0
    while rank < k:
        neighbors.append(distances[rank][0])
        rank += 1
    return neighbors
def get_neighbors(self, vectors, test_row, num_neighbors):
    """Return positions of the ``num_neighbors`` vectors nearest ``test_row``.

    ``vectors`` is iterated with ``.items()``; each value is compared to
    ``test_row`` through ``self.euclidean_distance(test_row, value,
    len(test_row))``.  The result holds iteration positions (0-based), not
    the keys, ordered nearest first.  ``IndexError`` is raised when
    ``num_neighbors`` exceeds the number of items.
    """
    dim = len(test_row)
    scored = [
        (position, self.euclidean_distance(test_row, vector, dim))
        for position, (_, vector) in enumerate(vectors.items())
    ]
    scored.sort(key=lambda pair: pair[1])
    return [scored[rank][0] for rank in range(num_neighbors)]
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` nearest training instances, closest first.

    Uses ``euclideanDistance`` with a length of ``len(testInstance) - 1``
    (presumably skipping a trailing label -- confirm against the helper).
    ``IndexError`` is raised when ``k`` exceeds ``len(trainingSet)``.
    """
    length = len(testInstance) - 1

    def distance_to(instance):
        # Distance between the test instance and one training instance.
        return euclideanDistance(testInstance, instance, length)

    # Pair every training instance with its distance, then sort by distance.
    ranked = sorted(
        ((trainingSet[x], distance_to(trainingSet[x]))
         for x in range(len(trainingSet))),
        key=operator.itemgetter(1),
    )

    # The first k pairs are the nearest neighbours.
    return list(map(lambda x: ranked[x][0], range(k)))
def getNeighbors(trainingSet, testInstance, k):
    """Return the most frequent value among the ``k`` nearest neighbours.

    Neighbours are the ``k`` entries of ``trainingSet`` with the smallest
    ``euclideanDistance(testInstance, entry, len(testInstance) - 1)``.
    The value tallied for each neighbour is its element at index 0; on a
    tie the value seen first among the nearest neighbours wins.
    ``IndexError`` is raised when ``k`` exceeds ``len(trainingSet)`` or
    when there are no neighbours to vote.
    """
    length = len(testInstance) - 1

    scored = []
    for idx in range(len(trainingSet)):
        candidate = trainingSet[idx]
        scored.append((candidate, euclideanDistance(testInstance, candidate, length)))
    ranked = sorted(scored, key=operator.itemgetter(1))
    neighbors = [ranked[position][0] for position in range(k)]

    classVotes = {}
    for neighbor in neighbors:
        # NOTE(review): the vote is taken from element 0 of the neighbour,
        # while the distance length suggests the label may be the LAST
        # element -- confirm which field holds the class.
        response = neighbor[0]
        classVotes[response] = classVotes.get(response, 0) + 1

    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]
def get_neighbors(train, row, k, dimensionality):
    """Return the labels of the ``k`` training points nearest to ``row``.

    Args:
        train: the reduced training data set; its ``'label'``, ``'x'``,
            ``'y'`` (and, for 3 dimensions, ``'z'``) entries are read.
        row: the point of interest, passed to ``get_eucladian_distance``.
        k: how many labels to return.
        dimensionality: 3 selects the x/y/z coordinates; any other value
            selects x/y.

    Returns:
        List of ``k`` labels ordered by ascending distance.
    """
    axes = ('x', 'y', 'z') if dimensionality == 3 else ('x', 'y')

    labels = []
    distances = []
    # zip walks the label and coordinate columns in lockstep.
    for label, *coords in zip(train['label'], *(train[axis] for axis in axes)):
        distances.append(get_eucladian_distance(row, np.array(tuple(coords))))
        labels.append(label)

    ranked = pd.DataFrame(data={'label': labels, 'distance': distances})
    # Ascending sort, then renumber so positions 0..k-1 are the nearest.
    ranked = ranked.sort_values(by=['distance'], ascending=True).reset_index()

    return [ranked['label'][i] for i in range(k)]
def predict(self, X_test, X_train, y_train):
    """Predict a class for every sample in ``X_test`` by k-nearest-neighbour vote.

    Args:
        X_test: iterable of samples to classify.
        X_train: iterable of observed samples.
        y_train: labels, indexed by the position of the sample in
            ``X_train``.

    Returns:
        ``numpy.ndarray`` with one predicted label per test sample, as
        produced by ``self._majority_vote``.
    """
    classes = np.unique(y_train)
    y_pred = []
    # Determine the class of each sample
    for test_sample in X_test:
        distances = []
        neighbors = []
        # Distance from each observed sample to the sample being predicted
        for j, observed_sample in enumerate(X_train):
            distance = ml_helpers.euclidean_distance(
                test_sample, observed_sample)
            distances.append(distance)
            neighbors.append([distance, y_train[j]])
        neighbors = np.array(neighbors)
        # Rank on a separate numeric array: when the labels are strings the
        # combined [distance, label] array becomes a string array, and
        # sorting its first column would order distances lexicographically.
        nearest = np.argsort(np.asarray(distances, dtype=float))[:self.k]
        k_nearest_neighbors = neighbors[nearest]
        # Majority vote among the k neighbours decides the prediction
        label = self._majority_vote(k_nearest_neighbors, classes)
        y_pred.append(label)
    return np.array(y_pred)
# Cluster the users of `dataset`, score items for every user and print up to
# three recommended item numbers per user.
#
# dataset    -- passed to readUserData / readUserItemMatrix
# algorithm, nClusters, metric -- passed to computeClusters; metric is also
#               passed to silhouetteScore and similarity
#
# Nothing is returned; all results are printed (Python 2 print statements).
# NOTE(review): the nesting below was reconstructed from a whitespace-mangled
# source -- in particular the pairing of `else:` with `if userIndex % 2 == 0`
# should be confirmed against the original file.
def computeRecommendations(dataset, algorithm, nClusters, metric):
    userData, userIds = readUserData(dataset)
    userItemMatrix = readUserItemMatrix(dataset)
    userData = scaleData(userData)
    userClusters = computeClusters(userData, algorithm, nClusters, metric)
    # Diagnostics: cluster assignment, silhouette score and raters().
    print userClusters
    print silhouetteScore(userData, userClusters, metric)
    print raters(nClusters, userClusters, userItemMatrix)
    # Full pairwise user similarity matrix.
    # NOTE(review): this matrix is filled but not read again in this
    # function; the prediction loop below calls similarity() directly.
    userSimilarityMatrix = np.empty((
        userData.shape[0],
        userData.shape[0],
    ))
    for i, u1 in enumerate(userData):
        for j, u2 in enumerate(userData):
            userSimilarityMatrix[i][j] = similarity(u1, u2, metric)
    predictions = np.zeros((
        userItemMatrix.shape[0],
        userItemMatrix.shape[1],
    ))
    for userIndex, userRow in enumerate(predictions):
        if userIndex % 2 == 0:
            # Even-indexed users: score from the other users in the same
            # cluster.
            neighbors = []
            for i, cluster in enumerate(userClusters):
                if i != userIndex and cluster == userClusters[userIndex]:
                    neighbors.append(i)
            for itemIndex, itemRating in enumerate(userRow):
                # Only items whose matrix entry for this user is 0 are scored.
                if userItemMatrix[userIndex][itemIndex] == 0:
                    ratingSum = 0
                    for neighborIndex in neighbors:
                        sim = similarity(userData[userIndex], userData[neighborIndex], metric)
                        ratingSum += userItemMatrix[neighborIndex][
                            itemIndex] * (sim)
                    predictions[userIndex][itemIndex] = ratingSum
        else:
            # Odd-indexed users: mark up to three randomly chosen items whose
            # matrix entry is 0 with a score of 1.
            randItens = [x for x in range(userItemMatrix.shape[1])]
            random.shuffle(randItens)
            count = 0
            for item in randItens:
                if userItemMatrix[userIndex][item] == 0:
                    predictions[userIndex][item] = 1
                    count += 1
                    if (count == 3):
                        break
    for userIndex, userRow in enumerate(predictions):
        # Indices of the three highest scores, highest first.
        recommendations = userRow.argsort()[-3:][::-1]
        print userIds[userIndex],
        for recommendation in recommendations:
            # Item numbers are printed 1-based; 0 is printed when the score
            # is not positive.
            if predictions[userIndex][recommendation] > 0:
                print recommendation + 1,
            else:
                print 0,
        # NOTE(review): the statement below ends in a bare line continuation;
        # the source appears truncated here -- restore the intended argument.
        print \
def main():
    """Implement KNN from scratch and report its accuracy on the MNIST subset.

    Two data sets are used: "MNIST_training.csv" supplies the training data
    in which the K nearest neighbours are searched, and "MNIST_test.csv"
    supplies the samples whose labels are predicted.  Per the original
    description the training data has 10 classes (0-9) with 95 samples each
    and the test data has 5 samples per class.

    The frames are read from the module-level ``MNISTtrain_df`` and
    ``MNISTtest_df``.  Progress and the final accuracy are printed; nothing
    is returned.
    """
    print("Hello World from KNNscratch.py Script!\n")

    # NOTE: the reduction_* calls below are disabled, so these banners are
    # printed without any dimensionality reduction actually running.
    # ---------------
    print("Commencing KNN using MNIST reduced to 2D by PCA")
    #reduction_2D("PCA", "2dPCA_MNISTtrain_data.csv", "2dPCA_MNISTtest_data.csv", "AccuracyWithPCA2D.txt")
    print("2D PCA finished\n")
    # ---------------
    print("Commencing KNN using MNIST reduced to 3D by PCA")
    #reduction_3D("PCA", "3dPCA_MNISTtrain_data.csv", "3dPCA_MNISTtest_data.csv", "AccuracyWithPCA3D.txt")
    print("3D PCA finished\n")
    # ---------------
    print("Commencing KNN using MNIST reduced to 2D by t-SNE")
    #reduction_2D("t-SNE", "2dtsne_MNISTtrain_data.csv", "2dtsne_MNISTtest_data.csv", "AccuracyWithtSNE2D.txt")
    print("2D t-SNE finished\n")
    # ---------------
    print("Commencing KNN using MNIST reduced to 3D by t-SNE")
    #reduction_3D("t-SNE", "3dtsne_MNISTtrain_data.csv", "3dtsne_MNISTtest_data.csv","AccuracyWithtSNE3D.txt")
    print("3D t-SNE finished\n")
    # ---------------
    print("Let's do this")

    K = 5
    train_labels = MNISTtrain_df.iloc[:, 0]  # label data
    train_images = MNISTtrain_df.drop('label', axis=1)  # pixel values for image
    test_labels = MNISTtest_df.iloc[:, 0]  # label data
    test_images = MNISTtest_df.drop('label', axis=1)  # pixel values for image

    num_correct = 0  # number of correct predictions so far
    for index, row in test_images.iterrows():
        print(index)
        # subtractor compares this test row against all training rows; its
        # result is read as a frame with a 'label' column (presumably sorted
        # by distance -- confirm against subtractor).
        df = subtractor(row.values, train_images, train_labels)
        print(df)
        neighbors = [df['label'][i] for i in range(K)]
        print(neighbors)
        prediction = compute_mode(neighbors)
        print('Expected ' + str(test_labels[index]) + ', Got ' +
              str(prediction) + " for K = " + str(K))
        if test_labels[index] == prediction:
            num_correct = num_correct + 1

    print("Number correct is ", num_correct)
    # Divide by the number of rows actually evaluated instead of a
    # hard-coded 50, so the figure stays right if the test set changes size.
    num_tested = len(test_images)
    accur = 100.0 * num_correct / num_tested if num_tested else 0.0
    print('Accuracy of KNN was ' + str(accur) + '% with k equal to ' +
          str(K))

    print("KNNscratch.py is finished, have a great day! :)")