def relevence_search(self, searchVector, formula="cosine", weighting='tf'):
    """Score every stored document vector against ``searchVector``.

    Parameters
    ----------
    searchVector :
        Query vector; passed unchanged as the first argument of the scoring
        function.
    formula : str
        ``"cosine"`` scores with ``util.cosine``; ``"euclidean"`` scores with
        ``util.euclidean``.
    weighting : str
        Kept for signature compatibility; this method does not read it.

    Returns
    -------
    dict
        Maps each key of ``self.documentVectors`` to its score, ordered from
        the highest score to the lowest.

    Raises
    ------
    ValueError
        If ``formula`` is not one of the supported names.
    """
    # Fail fast on an unsupported formula. Without this check the loop body
    # would reach ``ratings[key] = rating`` with ``rating`` never assigned.
    if formula not in ("cosine", "euclidean"):
        raise ValueError(
            "formula must be 'cosine' or 'euclidean', got {!r}".format(formula)
        )

    ratings = {}
    for key, value in self.documentVectors.items():
        if formula == "cosine":
            ratings[key] = util.cosine(searchVector, value)
        else:
            ratings[key] = util.euclidean(searchVector, value)

    # NOTE(review): the descending order is applied to euclidean scores too,
    # so the largest values come first -- confirm that callers expect this.
    return dict(sorted(ratings.items(), key=lambda item: item[1], reverse=True))
def searchTf(self, query):
    """Score ``query`` against every vector in ``self.tfVectors``.

    The query is first converted with ``self.makeTfVector``. The result is a
    two-element list: the ``util.cosine`` scores followed by the
    ``util.euclidean`` scores, each in ``self.tfVectors`` iteration order.
    """
    query_vec = self.makeTfVector(query)

    # First pass: cosine scores.
    cosine_scores = []
    for doc_vec in self.tfVectors:
        cosine_scores.append(util.cosine(query_vec, doc_vec))

    # Second pass: euclidean scores.
    euclidean_scores = []
    for doc_vec in self.tfVectors:
        euclidean_scores.append(util.euclidean(query_vec, doc_vec))

    return [cosine_scores, euclidean_scores]
def search(self, searchList, formula="cosine", weighting="tf"):
    """Search for documents that match based on a list of terms.

    Parameters
    ----------
    searchList :
        Query terms; forwarded to ``self.buildQueryVector``.
    formula : str
        ``"cosine"`` scores with ``util.cosine``; ``"euclidean"`` scores with
        ``util.euclidean``.
    weighting : str
        Forwarded to ``self.buildQueryVector`` as its second argument.

    Returns
    -------
    dict
        Maps each key of ``self.documentVectors`` to its score, ordered from
        the highest score to the lowest.

    Raises
    ------
    ValueError
        If ``formula`` is not one of the supported names.
    """
    # Validate before doing any work. Previously an unknown formula left
    # ``rating`` unassigned and the loop failed on its first iteration.
    if formula not in ("cosine", "euclidean"):
        raise ValueError(
            "formula must be 'cosine' or 'euclidean', got {!r}".format(formula)
        )

    queryVector = self.buildQueryVector(searchList, weighting)

    ratings = {}
    for key, value in self.documentVectors.items():
        if formula == "cosine":
            ratings[key] = util.cosine(queryVector, value)
        else:
            ratings[key] = util.euclidean(queryVector, value)

    # NOTE(review): the descending order is applied to euclidean scores too,
    # so the largest values come first -- confirm that callers expect this.
    return dict(sorted(ratings.items(), key=lambda item: item[1], reverse=True))
def search_tfidf_eul(self, searchList, method="1"):
    """Return up to ten ``(document key, score)`` pairs, highest score first.

    The query vector comes from ``self.buildQueryVector(searchList, method)``
    and each score is ``util.euclidean(query vector, document vector)``.
    """
    query_vec = self.buildQueryVector(searchList, method)

    score_by_doc = {
        doc_key: util.euclidean(query_vec, doc_vec)
        for doc_key, doc_vec in self.documentVectors.items()
    }

    # Stable descending sort on the score, then keep the first ten entries.
    ranked = sorted(score_by_doc.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:10]
def assign_datapoints(data: dict, clusters: dict, dist=0):  # 0 is euclidean, 1 is squared
    """
    Assign each data point to the cluster whose centroid is nearest.

    Distances are computed with ``euclidean(point, centroid, dist)``.

    Parameters
    ----------
    data : dict
        Maps each data point key to a mutable sequence. Index 2 holds the
        values passed to ``euclidean``; index 3 is overwritten with the ID of
        the nearest cluster.
    clusters : dict
        Maps each cluster ID to a mutable sequence. Index 0 holds the
        centroid; index 1 is replaced by a fresh list of the data point keys
        assigned to that cluster.
    dist : int or bool
        Forwarded to ``euclidean``: 0 = normal, 1 = squared.
        The default is 0.

    Returns
    -------
    data : dict
        The same object as the input, with index 3 of every entry updated.
    clusters : dict
        The same object as the input, with centroids untouched and the member
        lists rebuilt.

    Raises
    ------
    ValueError
        If ``dist`` is neither 0 nor 1, or if there are data points to assign
        but no clusters.
    """
    # An unsupported ``dist`` used to skip the distance computation, leaving
    # ``distance`` unbound on the first point and stale on later ones.
    if dist not in (0, 1):
        raise ValueError(
            "dist must be 0 (euclidean) or 1 (squared euclidean), got {!r}".format(dist)
        )
    if data and not clusters:
        raise ValueError("cannot assign data points: no clusters were given")

    # empty list of data points
    for key in clusters:
        clusters[key][1] = []

    # loop over all data points in data
    for datapoint in data:
        point = data[datapoint][2]
        # min() returns the first cluster with the smallest distance, which is
        # the same element a stable ascending sort would put at index 0.
        nearest_id = min(
            clusters,
            key=lambda cluster: euclidean(point, clusters[cluster][0], dist),
        )
        # assign the cluster ID as attribute to the data point
        data[datapoint][3] = nearest_id
        # assign the data point to the list of data points belonging to the centroid
        clusters[nearest_id][1].append(datapoint)

    return data, clusters
def search_eul(self, searchList, method="0"):
    """Search for documents that match based on a list of terms.

    Builds the query vector with ``self.buildQueryVector(searchList, method)``,
    scores every entry of ``self.documentVectors`` with ``util.euclidean`` and
    returns at most ten ``(document key, score)`` pairs, highest score first.
    """
    query_vec = self.buildQueryVector(searchList, method)

    score_by_doc = {}
    for doc_key, doc_vec in self.documentVectors.items():
        score_by_doc[doc_key] = util.euclidean(query_vec, doc_vec)

    # In-place stable sort, descending by score; slice off the top ten.
    ordered = list(score_by_doc.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered[:10]
def search(self, relevanceType):
    """Search for documents that match based on a list of terms.

    The query vector is built from ``self.queryList`` with
    ``self.buildQueryVector`` and stored on ``self.queryVector``.

    Parameters
    ----------
    relevanceType : str
        ``'cs'`` scores with ``util.cosine``; ``'eu'`` scores with
        ``util.euclidean``.

    Returns
    -------
    list
        One score per entry of ``self.documentVectors``, in iteration order.

    Raises
    ------
    ValueError
        If ``relevanceType`` is not one of the supported codes.
    """
    # Fail fast on an unknown code. Previously neither branch ran and the
    # final ``return ratings`` hit an unassigned local.
    if relevanceType not in ('cs', 'eu'):
        raise ValueError(
            "relevanceType must be 'cs' or 'eu', got {!r}".format(relevanceType)
        )

    self.queryVector = self.buildQueryVector(self.queryList)

    if relevanceType == 'cs':
        return [
            util.cosine(self.queryVector, documentVector)
            for documentVector in self.documentVectors
        ]
    return [
        util.euclidean(self.queryVector, documentVector)
        for documentVector in self.documentVectors
    ]
def _h_euclidean(self, coord1, coord2):
    """Return ``util.euclidean(coord1, coord2)``; ``self`` is not used."""
    result = util.euclidean(coord1, coord2)
    return result
def searchTFidfdist(self, searchList):
    """Score ``searchList`` against every vector in ``self.tfidfVec``.

    The query is converted with ``self.makeVectorforTFidf``. The result is a
    list of ``util.euclidean`` scores in ``self.tfidfVec`` iteration order.
    """
    query_vec = self.makeVectorforTFidf(searchList)

    scores = []
    for doc_vec in self.tfidfVec:
        scores.append(util.euclidean(query_vec, doc_vec))
    return scores
# pick one of the data points for the test set for test in data.keys(): # assign the other data points to the training set training = {} for key in data.keys(): if key != test: training[key] = data[key] # perform the k-means clustering training, clusters, S = clustering(training, kclusters, max_iterations, dist) # determine the k nearest neighbours distances = {} for key in training: test_dist = euclidean(data[test][2], training[key][2], dist) distances[key] = [test_dist, training[key][3]] nearest = sorted(distances.items(), key=lambda x: x[1][0])[0:kmeans] # determine the nearest cluster, based on the majority vote of the k nearest neighbours counts = {} for key in clusters: counts[key] = 0 for i in nearest: if i[1][1] == key: counts[key] += 1 test_cluster = sorted(counts.items(), key=lambda x: x[1], reverse=True)[0][0] # determine the nearest cluster label, depending on the majority vote of datapoints within the cluster cluster_type = sorted(clusters[test_cluster][2].items(),