import numpy as np


def computePredictions(userIds, userIdsTest, userData, userDataTest,
                       userItemMatrix, userClusters, userClustersTest, metric):
    # One prediction row per test user, one column per item.
    predictions = np.zeros((userDataTest.shape[0], userItemMatrix.shape[1]))

    for userIndex, userRow in enumerate(predictions):

        # Neighbors are the training users assigned to the same cluster as
        # this test user (excluding the test user itself, by id).
        neighbors = [
            i for i, cluster in enumerate(userClusters)
            if userIds[i] != userIdsTest[userIndex]
            and cluster == userClustersTest[userIndex]
        ]

        # Similarity does not depend on the item, so compute it once per
        # neighbor rather than once per (neighbor, item) pair.
        sims = [
            similarity(userDataTest[userIndex], userData[neighborIndex], metric)
            for neighborIndex in neighbors
        ]

        for itemIndex, itemRating in enumerate(userRow):
            ratingSum = 0
            for neighborIndex, sim in zip(neighbors, sims):
                ratingSum += userItemMatrix[neighborIndex][itemIndex] * sim
            predictions[userIndex][itemIndex] = ratingSum

    return predictions
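
The function above returns an unnormalized weighted sum, so test users with many neighbors accumulate larger scores. A minimal sketch of the usual fix, a similarity-weighted average over the neighbors who actually rated the item (an illustration, not the original code):

import numpy as np


def weightedAveragePrediction(ratings, sims):
    # ratings: one item's ratings from each neighbor (0 = unrated);
    # sims: each neighbor's similarity to the test user.
    ratings = np.asarray(ratings, dtype=float)
    sims = np.asarray(sims, dtype=float)
    rated = ratings != 0  # only neighbors who rated the item vote
    simSum = sims[rated].sum()
    if simSum == 0:  # no rated neighbors: no prediction
        return 0.0
    return float((ratings[rated] * sims[rated]).sum() / simSum)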
Example #2
def main():
    K = 5
    train_labels = MNISTtrain_df.iloc[:, 0]  # label column
    train_images = MNISTtrain_df.drop('label', axis=1)  # pixel values

    test_labels = MNISTtest_df.iloc[:, 0]  # label column
    test_images = MNISTtest_df.drop('label', axis=1)  # pixel values
    num_correct = 0  # count of correct predictions

    for index, row in test_images.iterrows():
        # subtractor is assumed to return a frame of training labels sorted
        # by ascending distance to this test image.
        df = subtractor(row.values, train_images, train_labels)
        neighbors = [df['label'][i] for i in range(K)]  # K nearest labels
        prediction = compute_mode(neighbors)
        print('Expected ' + str(test_labels[index]) + ', Got ' +
              str(prediction) + ' for K = ' + str(K))
        if test_labels[index] == prediction:
            num_correct += 1

    # Accuracy over all test samples, rather than a hard-coded count.
    accur = (num_correct / len(test_images)) * 100
    print('Accuracy of KNN was ' + str(accur) + '% with k equal to ' +
          str(K))
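
This loop leans on two helpers not shown here, subtractor and compute_mode. A minimal stand-in for compute_mode (an assumption; the original may differ) is a plain majority vote:

from collections import Counter


def compute_mode(labels):
    # Most common label among the K neighbor labels;
    # ties go to the label encountered first.
    return Counter(labels).most_common(1)[0][0]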
Example #3
    def knn(self, trainingSet, testInstance, k):
        # Distance from the test instance to every training row.
        distances = {}
        length = testInstance.shape[1]

        for x in range(len(trainingSet)):
            dist = self.euclideanDistance(testInstance, trainingSet.iloc[x], length)
            distances[x] = dist[0]

        # Sort row indices by ascending distance and keep the k nearest.
        sorted_d = sorted(distances.items(), key=lambda item: item[1])
        neighbors = [sorted_d[x][0] for x in range(k)]

        # Majority vote over the neighbors' class labels (last column).
        classVotes = {}
        for neighborIndex in neighbors:
            response = trainingSet.iloc[neighborIndex][-1]
            classVotes[response] = classVotes.get(response, 0) + 1

        sortedVotes = sorted(classVotes.items(), key=lambda item: item[1],
                             reverse=True)

        return (sortedVotes, neighbors)
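
The method assumes a euclideanDistance helper on the same class; one compatible sketch, matching the dist[0] indexing above (an assumption, not the original):

import numpy as np


def euclideanDistance(self, instance1, instance2, length):
    # instance1 is a one-row DataFrame, instance2 a Series; compare the
    # first `length` columns and return a one-element array so the caller
    # can index dist[0].
    diff = np.asarray(instance1)[:, :length] - np.asarray(instance2)[:length]
    return np.sqrt(np.sum(diff ** 2, axis=1))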
Example #4
import numpy as np
import umap
import umap.umap_
from sklearn.neighbors import NearestNeighbors


def calculateNeighbors(data, encodings):
    print("[INFO] calculating neighbours")
    # Cap the neighborhood size at 50 (or the number of rows, if smaller).
    n_neighbors = min(int(data.shape[0]), 50)
    nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                            algorithm='ball_tree').fit(encodings)
    distances, neighbor_indices = nbrs.kneighbors(encodings)

    # Project the encodings to 2D with UMAP, then find nearest neighbors
    # in the embedded space as well.
    reducer = umap.UMAP(n_neighbors=2,
                        min_dist=0.2,
                        metric='euclidean',
                        random_state=42).fit(encodings)
    embedding = reducer.transform(encodings)
    knn_indices, knn_dists, rp_forest = umap.umap_.nearest_neighbors(
        embedding,
        n_neighbors=20,
        metric='euclidean',
        metric_kwds={},
        angular=False,
        random_state=np.random.RandomState(42))

    # Store both neighbor lists per row (assumes a default RangeIndex).
    neighbors = []
    umapneighbors = []
    for i, row in data.iterrows():
        neighbors.append(neighbor_indices[i].tolist())
        umapneighbors.append(knn_indices[i].tolist())

    data['neighbors'] = neighbors
    data['UMAP_neighbors'] = umapneighbors

    # For each row, list every row in the same HDBSCAN cluster;
    # noise points (cluster -1) get an empty list.
    cluster = []
    for i, row in data.iterrows():
        c = data.at[i, 'HDBSCAN_clusters']
        if c != -1:
            cluster.append(data[data['HDBSCAN_clusters'] == c].index.tolist())
        else:
            cluster.append([])

    data['cluster_list'] = cluster
Example #5
def get_neighbors(train, test_row, num_neighbors):
    # Distance from the test row to every training row.
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    # Sort by distance and keep the num_neighbors closest rows.
    distances.sort(key=lambda tup: tup[1])
    neighbors = [distances[i][0] for i in range(num_neighbors)]
    return neighbors
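
A usage sketch with a toy data set, assuming the usual euclidean_distance helper over all columns except the trailing class label (the helper is an assumption, not shown in the original):

from math import sqrt


def euclidean_distance(row1, row2):
    # Squared differences over every column except the last (the label).
    return sqrt(sum((row1[i] - row2[i]) ** 2 for i in range(len(row1) - 1)))


# Rows are [x, y, label]; print the three nearest rows to the first one.
dataset = [[2.7, 2.5, 0], [1.4, 2.3, 0], [7.6, 2.7, 1], [8.6, -0.2, 1]]
print(get_neighbors(dataset, dataset[0], num_neighbors=3))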
Example #6
import operator


def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance) - 1  # skip the trailing class label
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))  # ascending by distance
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors
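
A vote step like the one in Example #10 typically follows; a minimal sketch, assuming the class label sits in the last position of each row (not part of this example):

import operator


def getResponse(neighbors):
    # Majority vote over the neighbors' class labels.
    classVotes = {}
    for neighbor in neighbors:
        response = neighbor[-1]
        classVotes[response] = classVotes.get(response, 0) + 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1),
                         reverse=True)
    return sortedVotes[0][0]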
Example #7
import operator


def getNearestNeighbors(data, datapoint, k):
    # Distance from the query datapoint to every row in the data set.
    distances = []
    length = len(datapoint)
    for x in range(len(data)):
        dist = euclideanDistance(datapoint, data[x], length)
        distances.append((data[x], dist))
    distances.sort(key=operator.itemgetter(1))
    # Return the k nearest rows together with their distances.
    neighbors = []
    edist = []
    for x in range(k):
        neighbors.append(distances[x][0])
        edist.append(distances[x][1])
    return neighbors, edist
Example #8
import operator


def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance) - 1
    # Compute the distance from the test instance to every training instance.
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    # Sort all distances in ascending order.
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    # Return the k nearest neighbors.
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors
Example #9
    def get_neighbors(self, vectors, test_row, num_neighbors):
        # vectors maps each word to its embedding; track positions so the
        # result can refer to neighbors by index.
        distances = list()
        dim = len(test_row)
        for count, (word, train_row) in enumerate(vectors.items()):
            dist = self.euclidean_distance(test_row, train_row, dim)
            distances.append((count, dist))
        distances.sort(key=lambda tup: tup[1])

        # Indices of the num_neighbors closest vectors.
        neighbors = []
        for i in range(num_neighbors):
            neighbors.append(distances[i][0])
        return neighbors
Example #10
import operator


def getNeighbors(trainingSet, testInstance, k):
    distance = []
    length = len(testInstance) - 1  # the last element is the class label
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distance.append((trainingSet[x], dist))
    distance.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distance[x][0])
    # Majority vote over the neighbors' class labels. The label is the last
    # element of each row, so index -1 (indexing [0] would vote on the first
    # feature instead).
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1),
                         reverse=True)
    return sortedVotes[0][0]
Example #11
import numpy as np
import pandas as pd


def get_neighbors(train, row, k, dimensionality):
    """
        Locate the most similar neighbors
            train: the reduced training data set
            row: the row of interest
            k: the k value
            dimensionality: the dimensionality of the reduced data (2 or 3)
    """
    distances = []
    labels = []
    neighbors = []
    # 3 dimensions
    if dimensionality == 3:
        # zip is a fast way to iterate over DataFrame columns in parallel
        for label, x1, y1, z1 in zip(train['label'], train['x'], train['y'],
                                     train['z']):
            dist = get_eucladian_distance(row, np.array((x1, y1, z1)))
            distances.append(dist)  # distance to this training point
            labels.append(label)  # and its label, kept in parallel
    # 2 dimensions
    else:
        for label, x1, y1 in zip(train['label'], train['x'], train['y']):
            dist = get_eucladian_distance(row, np.array((x1, y1)))
            distances.append(dist)
            labels.append(label)
    # Sort by ascending distance and keep the labels of the k nearest.
    df = pd.DataFrame(data={'label': labels, 'distance': distances})
    df = df.sort_values(by=['distance'], ascending=True)
    df = df.reset_index()
    for i in range(k):
        neighbors.append(df['label'][i])
    return neighbors
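
The DataFrame round-trip is not required; a vectorized alternative sketch with numpy argsort (same result under the same column assumptions, not the original code):

import numpy as np


def get_neighbors_vectorized(train, row, k, dimensionality):
    cols = ['x', 'y', 'z'] if dimensionality == 3 else ['x', 'y']
    points = train[cols].to_numpy(dtype=float)
    # Euclidean distance from `row` to every training point at once.
    dists = np.sqrt(((points - np.asarray(row, dtype=float)) ** 2).sum(axis=1))
    # Indices of the k smallest distances, then look up their labels.
    nearest = np.argsort(dists)[:k]
    return train['label'].to_numpy()[nearest].tolist()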
Example #12
    def predict(self, X_test, X_train, y_train):
        classes = np.unique(y_train)
        y_pred = []
        # Determine the class of each sample
        for test_sample in X_test:
            neighbors = []

            # Calculate the distance from each observed sample to the sample we wish to predict
            for j, observed_sample in enumerate(X_train):
                distance = ml_helpers.euclidean_distance(
                    test_sample, observed_sample)
                label = y_train[j]

                # Add neighbor information
                neighbors.append([distance, label])
            neighbors = np.array(neighbors)

            # Sort the list of observed samples from lowest to highest distance and select the k first
            k_nearest_neighbors = neighbors[neighbors[:, 0].argsort()][:self.k]

            # Do a majority vote among the k neighbors and set the prediction to the class receiving the most votes
            label = self._majority_vote(k_nearest_neighbors, classes)
            y_pred.append(label)
        return np.array(y_pred)
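
self._majority_vote is not shown; a minimal stand-in consistent with the call above (an assumption about the missing helper):

import numpy as np


def majority_vote(k_nearest_neighbors, classes):
    # k_nearest_neighbors is an array of [distance, label] rows;
    # each neighbor casts one vote for its class.
    counts = np.zeros(len(classes))
    for distance, label in k_nearest_neighbors:
        counts[np.where(classes == label)[0][0]] += 1
    # Return the class with the most votes.
    return classes[np.argmax(counts)]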
Example #13
import random

import numpy as np


def computeRecommendations(dataset, algorithm, nClusters, metric):

    userData, userIds = readUserData(dataset)
    userItemMatrix = readUserItemMatrix(dataset)

    userData = scaleData(userData)

    userClusters = computeClusters(userData, algorithm, nClusters, metric)
    print(userClusters)
    print(silhouetteScore(userData, userClusters, metric))
    print(raters(nClusters, userClusters, userItemMatrix))

    # Pairwise similarity between every pair of users, computed once.
    userSimilarityMatrix = np.empty((userData.shape[0], userData.shape[0]))
    for i, u1 in enumerate(userData):
        for j, u2 in enumerate(userData):
            userSimilarityMatrix[i][j] = similarity(u1, u2, metric)

    predictions = np.zeros((userItemMatrix.shape[0], userItemMatrix.shape[1]))

    for userIndex, userRow in enumerate(predictions):
        if userIndex % 2 == 0:
            # Even-indexed users: cluster-based predictions. Neighbors are
            # the other users assigned to the same cluster.
            neighbors = []
            for i, cluster in enumerate(userClusters):
                if i != userIndex and cluster == userClusters[userIndex]:
                    neighbors.append(i)

            for itemIndex, itemRating in enumerate(userRow):
                if userItemMatrix[userIndex][itemIndex] == 0:  # unrated only
                    ratingSum = 0
                    for neighborIndex in neighbors:
                        # Reuse the precomputed similarity matrix.
                        sim = userSimilarityMatrix[userIndex][neighborIndex]
                        ratingSum += userItemMatrix[neighborIndex][itemIndex] * sim
                    predictions[userIndex][itemIndex] = ratingSum
        else:
            # Odd-indexed users: mark three random unrated items instead of
            # cluster-based scores.
            randItens = [x for x in range(userItemMatrix.shape[1])]
            random.shuffle(randItens)
            count = 0
            for item in randItens:
                if userItemMatrix[userIndex][item] == 0:
                    predictions[userIndex][item] = 1
                    count += 1
                if count == 3:
                    break

    # Print each user's top-3 recommendations as 1-based item ids,
    # printing 0 when the predicted score is not positive.
    for userIndex, userRow in enumerate(predictions):
        recommendations = userRow.argsort()[-3:][::-1]
        print(userIds[userIndex], end=' ')
        for recommendation in recommendations:
            if predictions[userIndex][recommendation] > 0:
                print(recommendation + 1, end=' ')
            else:
                print(0, end=' ')
        print()
Example #14
def main():
    """
        Implement KNN from scratch in Python. You are given two data sets:
        MNIST_training.csv and MNIST_test.csv. "MNIST_training.csv" contains
        the training data in which you will find the K nearest neighbors,
        while "MNIST_test.csv" consists of the test data whose labels you need
        to predict. The training data contains 10 classes (i.e., 0, 1, 2, ...,
        9), each with 95 samples; the test data has 5 samples per class.
    """
    print("Hello World from KNNscratch.py Script!\n")

    # ---------------
    print("Commencing KNN using MNIST reduced to 2D by PCA")
    #reduction_2D("PCA", "2dPCA_MNISTtrain_data.csv", "2dPCA_MNISTtest_data.csv", "AccuracyWithPCA2D.txt")
    print("2D PCA finished\n")

    # ---------------
    print("Commencing KNN using MNIST reduced to 3D by PCA")
    #reduction_3D("PCA", "3dPCA_MNISTtrain_data.csv", "3dPCA_MNISTtest_data.csv", "AccuracyWithPCA3D.txt")
    print("3D PCA finished\n")

    # ---------------
    print("Commencing KNN using MNIST reduced to 2D by t-SNE")
    #reduction_2D("t-SNE", "2dtsne_MNISTtrain_data.csv", "2dtsne_MNISTtest_data.csv",  "AccuracyWithtSNE2D.txt")
    print("2D t-SNE finished\n")

    # ---------------
    print("Commencing KNN using MNIST reduced to 3D by t-SNE")
    #reduction_3D("t-SNE", "3dtsne_MNISTtrain_data.csv", "3dtsne_MNISTtest_data.csv","AccuracyWithtSNE3D.txt")
    print("3D t-SNE finished\n")

    # ---------------
    print("Running KNN on the full-dimensional data")
    K = 5
    train_labels = MNISTtrain_df.iloc[:, 0]  # label column
    train_images = MNISTtrain_df.drop('label', axis=1)  # pixel values

    test_labels = MNISTtest_df.iloc[:, 0]  # label column
    test_images = MNISTtest_df.drop('label', axis=1)  # pixel values
    num_correct = 0  # count of correct predictions

    for index, row in test_images.iterrows():
        # subtractor is assumed to return a frame of training labels sorted
        # by ascending distance to this test image.
        df = subtractor(row.values, train_images, train_labels)
        neighbors = [df['label'][i] for i in range(K)]  # K nearest labels
        prediction = compute_mode(neighbors)
        print('Expected ' + str(test_labels[index]) + ', Got ' +
              str(prediction) + ' for K = ' + str(K))
        if test_labels[index] == prediction:
            num_correct += 1

    # Accuracy over all test samples (the test set has 50 images).
    accur = (num_correct / len(test_images)) * 100
    print('Accuracy of KNN was ' + str(accur) + '% with k equal to ' +
          str(K))

    print("KNNscratch.py is finished, have a great day! :)")