def relevence_search(self, searchVector, formula="cosine", weighting='tf'):
     """Score every stored document vector against ``searchVector``.

     Parameters
     ----------
     searchVector
         Query vector, passed as the first argument to the scoring function.
     formula : str
         ``"cosine"`` scores with ``util.cosine``; ``"euclidean"`` scores
         with ``util.euclidean``.
     weighting : str
         Accepted for interface compatibility; this method does not use it.

     Returns
     -------
     dict
         Maps each key of ``self.documentVectors`` to its score, ordered
         from the highest score to the lowest.

     Raises
     ------
     ValueError
         If ``formula`` is neither ``"cosine"`` nor ``"euclidean"``.
     """
     # Fail fast on an unknown formula instead of reaching the scoring loop
     # with no score assigned.
     if formula not in ("cosine", "euclidean"):
         raise ValueError(
             "formula must be 'cosine' or 'euclidean', got {!r}".format(formula)
         )

     use_cosine = formula == "cosine"
     ratings = {
         key: (
             util.cosine(searchVector, value)
             if use_cosine
             else util.euclidean(searchVector, value)
         )
         for key, value in self.documentVectors.items()
     }

     # Highest score first; the sort is stable, so ties keep insertion order.
     return dict(sorted(ratings.items(), key=lambda item: item[1], reverse=True))
 def searchTf(self, query):
     """Score ``query`` against every vector in ``self.tfVectors``.

     The query is first converted with ``self.makeTfVector``. Returns a
     two-element list: the ``util.cosine`` results followed by the
     ``util.euclidean`` results, each in the iteration order of
     ``self.tfVectors``.
     """
     query_tf = self.makeTfVector(query)

     # First pass: cosine scores for every document vector.
     cosine_scores = []
     for doc_tf in self.tfVectors:
         cosine_scores.append(util.cosine(query_tf, doc_tf))

     # Second pass: euclidean scores for every document vector.
     euclidean_scores = []
     for doc_tf in self.tfVectors:
         euclidean_scores.append(util.euclidean(query_tf, doc_tf))

     return [cosine_scores, euclidean_scores]
 def search(self,searchList, formula="cosine", weighting="tf"):
     """Search for documents that match based on a list of terms.

     Parameters
     ----------
     searchList
         Query terms, forwarded to ``self.buildQueryVector``.
     formula : str
         ``"cosine"`` scores with ``util.cosine``; ``"euclidean"`` scores
         with ``util.euclidean``.
     weighting : str
         Forwarded unchanged to ``self.buildQueryVector``.

     Returns
     -------
     dict
         Maps each key of ``self.documentVectors`` to its score, ordered
         from the highest score to the lowest.

     Raises
     ------
     ValueError
         If ``formula`` is neither ``"cosine"`` nor ``"euclidean"``.
     """
     # Fail fast on an unknown formula instead of reaching the scoring loop
     # with no score assigned.
     if formula not in ("cosine", "euclidean"):
         raise ValueError(
             "formula must be 'cosine' or 'euclidean', got {!r}".format(formula)
         )

     queryVector = self.buildQueryVector(searchList, weighting)
     use_cosine = formula == "cosine"
     ratings = {
         key: (
             util.cosine(queryVector, value)
             if use_cosine
             else util.euclidean(queryVector, value)
         )
         for key, value in self.documentVectors.items()
     }

     # Highest score first; the sort is stable, so ties keep insertion order.
     return dict(sorted(ratings.items(), key=lambda item: item[1], reverse=True))
Example #4
0
    def search_tfidf_eul(self, searchList, method="1"):
        """Return up to ten ``(key, value)`` pairs ranked by ``util.euclidean``.

        The query vector is built with ``self.buildQueryVector(searchList,
        method)`` and compared against every entry of
        ``self.documentVectors``. Pairs are ordered from the largest
        ``util.euclidean`` result to the smallest.
        """
        query = self.buildQueryVector(searchList, method)
        distances = {
            doc_key: util.euclidean(query, doc_vector)
            for doc_key, doc_vector in self.documentVectors.items()
        }

        # NOTE(review): descending order puts the largest values first --
        # confirm that is intended if util.euclidean returns a distance.
        ranked = sorted(distances.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:10]
Example #5
0
def assign_datapoints(data: dict,
                      clusters: dict,
                      dist: int = 0) -> tuple:  # 0 is euclidean, 1 is squared
    """
    Assign each data point to the cluster whose centroid is nearest.

    Parameters
    ----------
    data : dict
        Maps data-point keys to mutable sequences. Index 2 of each value is
        read as the point's coordinates; index 3 is overwritten with the ID
        of the assigned cluster.
    clusters : dict
        Maps cluster IDs to mutable sequences. Index 0 is the centroid
        location; index 1 is replaced with a fresh list of the data-point
        keys assigned to that cluster.
    dist : int
        Distance flag forwarded to ``euclidean``:
        0 = normal, 1 = squared. The default is 0.

    Returns
    -------
    data : dict
        The same ``data`` object, with index 3 of every value updated to the
        nearest cluster ID.
    clusters : dict
        The same ``clusters`` object, with unchanged centroids and rebuilt
        member lists.

    Raises
    ------
    ValueError
        If ``dist`` is not 0 or 1, or if ``data`` is non-empty while
        ``clusters`` is empty.
    """
    # Reject unsupported flags up front; otherwise no distance would ever be
    # computed inside the loop below.
    if dist not in (0, 1):
        raise ValueError(
            "dist must be 0 (euclidean) or 1 (squared), got {!r}".format(dist)
        )
    if data and not clusters:
        raise ValueError("cannot assign data points: clusters is empty")

    # Discard the previous assignment before rebuilding the member lists.
    for cluster in clusters.values():
        cluster[1] = []

    for datapoint, record in data.items():
        # min() returns the first cluster with the smallest distance, which
        # matches taking element 0 of a stable ascending sort.
        nearest_id = min(
            clusters,
            key=lambda cluster_id: euclidean(record[2],
                                             clusters[cluster_id][0], dist),
        )

        # Record the assignment on both sides of the relation.
        record[3] = nearest_id
        clusters[nearest_id][1].append(datapoint)

    return data, clusters
Example #6
0
    def search_eul(self, searchList, method="0"):
        """Rank documents against a list of terms using ``util.euclidean``.

        Builds the query vector with ``self.buildQueryVector(searchList,
        method)``, scores every entry of ``self.documentVectors`` and returns
        at most ten ``(key, value)`` pairs, largest value first.
        """
        query = self.buildQueryVector(searchList, method)
        scored = [
            (doc_key, util.euclidean(query, doc_vector))
            for doc_key, doc_vector in self.documentVectors.items()
        ]

        # NOTE(review): descending order puts the largest values first --
        # confirm that is intended if util.euclidean returns a distance.
        scored.sort(key=lambda entry: entry[1], reverse=True)
        return scored[:10]
Example #7
0
    def search(self, relevanceType):
        """Search for documents that match based on a list of terms.

        Builds the query vector from ``self.queryList``, stores it on
        ``self.queryVector`` and scores it against every item of
        ``self.documentVectors``.

        Parameters
        ----------
        relevanceType : str
            ``'cs'`` scores with ``util.cosine``; ``'eu'`` scores with
            ``util.euclidean``.

        Returns
        -------
        list
            One score per item of ``self.documentVectors``, in iteration
            order.

        Raises
        ------
        ValueError
            If ``relevanceType`` is neither ``'cs'`` nor ``'eu'``.
        """
        self.queryVector = self.buildQueryVector(self.queryList)

        if relevanceType == 'cs':
            ratings = [
                util.cosine(self.queryVector, documentVector)
                for documentVector in self.documentVectors
            ]
        elif relevanceType == 'eu':
            ratings = [
                util.euclidean(self.queryVector, documentVector)
                for documentVector in self.documentVectors
            ]
        else:
            # Without this branch the return below would hit an unassigned
            # local for any other relevance type.
            raise ValueError(
                "relevanceType must be 'cs' or 'eu', got {!r}".format(
                    relevanceType)
            )
        return ratings
Example #8
0
 def _h_euclidean(self, coord1, coord2):
     """Return the result of ``util.euclidean`` for ``coord1`` and ``coord2``."""
     distance = util.euclidean(coord1, coord2)
     return distance
Example #9
0
 def _h_euclidean(self, coord1, coord2):
     """Delegate to ``util.euclidean`` with the two coordinates, in order."""
     result = util.euclidean(coord1, coord2)
     return result
Example #10
0
 def searchTFidfdist(self, searchList):
     """Score ``searchList`` against every vector in ``self.tfidfVec``.

     The query vector comes from ``self.makeVectorforTFidf``. Returns a list
     of ``util.euclidean`` results, one per item of ``self.tfidfVec``, in
     iteration order.
     """
     query_vec = self.makeVectorforTFidf(searchList)

     distances = []
     for doc_vec in self.tfidfVec:
         distances.append(util.euclidean(query_vec, doc_vec))
     return distances
Example #11
0
    # pick one of the data points for the test set
    for test in data.keys():

        # assign the other data points to the training set
        training = {}
        for key in data.keys():
            if key != test: training[key] = data[key]

        # perform the k-means clustering
        training, clusters, S = clustering(training, kclusters, max_iterations,
                                           dist)

        # determine the k nearest neighbours
        distances = {}
        for key in training:
            test_dist = euclidean(data[test][2], training[key][2], dist)
            distances[key] = [test_dist, training[key][3]]
        nearest = sorted(distances.items(), key=lambda x: x[1][0])[0:kmeans]

        # determine the nearest cluster, based on the majority vote of the k nearest neighbours
        counts = {}
        for key in clusters:
            counts[key] = 0
            for i in nearest:
                if i[1][1] == key: counts[key] += 1

        test_cluster = sorted(counts.items(), key=lambda x: x[1],
                              reverse=True)[0][0]

        # determine the nearest cluster label, depending on the majority vote of datapoints within the cluster
        cluster_type = sorted(clusters[test_cluster][2].items(),