def test_approx_nn(method, traindata, testdata, m, alpha):
    """
    Measures the average query time and average distance to the returned
    neighbor for one approximate-NN method over a test set.

    @param method: str - "hashing" (locality-sensitive hashing) or "kdtree"
    @param traindata: dict[int => dict[int => int/float]] - training documents
    @param testdata: dict[int => dict[int => int/float]] - query documents
    @param m: int - number of projections/hash bits (used by "hashing")
    @param alpha: float - approximation factor (used by "kdtree")
    @return TestResult with the per-query mean time and mean distance
    @raise ValueError: if method is not one of the two supported names
    """
    # Fail loudly on an unknown method; previously this fell through and
    # crashed later with an unbound-variable NameError on t0.
    if method not in ("hashing", "kdtree"):
        raise ValueError("unknown method: %s" % method)
    avg_distance = 0
    if method == "hashing":
        # Train outside the timed region.
        lsh = LocalitySensitiveHash(traindata, D=1000, m=m)
        t0 = time.time()
        for _, testdoc in testdata.items():
            avg_distance += lsh.nearest_neighbor(testdoc, depth=HW2_DEPTH).distance
    if method == "kdtree":
        # D is a module-level constant (document dimension) - TODO confirm
        # it matches the dimensionality of traindata.
        kdt = KDTree(D)
        for i, document in traindata.items():
            kdt.insert(make_dense(document), i)
        t0 = time.time()
        for _, testdoc in testdata.items():
            key = make_dense(testdoc)
            neighbor = kdt.nearest(key, alpha)
            # BUG FIX: was docdata[neighbor] (undefined name); the tree was
            # populated from traindata, so the neighbor id indexes traindata.
            avg_distance += EvalUtil.distance(testdoc, traindata[neighbor])
    # Finish timing and report per-query averages.
    mean_time = (time.time() - t0) / len(testdata)
    mean_distance = avg_distance / len(testdata)
    return TestResult(method, m=m, D=D, alpha=alpha,
                      avg_time=mean_time, avg_distance=mean_distance)
def test_approx_nn(method, traindata, testdata, m, alpha):
    """
    Times one approximate nearest-neighbor method and reports the mean
    per-query latency and mean distance to the neighbor it returns.

    @param method: str - "hashing" or "kdtree"
    @param traindata: dict[int => dict[int => int/float]] - documents to index
    @param testdata: dict[int => dict[int => int/float]] - documents to query
    @param m: int - number of hash projections (only used by "hashing")
    @param alpha: float - kd-tree approximation factor (only used by "kdtree")
    @return TestResult carrying avg_time and avg_distance per query
    @raise ValueError: for an unrecognized method name
    """
    if method not in ("hashing", "kdtree"):
        # Previously an unknown method reached the timing code with t0
        # unbound and died with a NameError; make the contract explicit.
        raise ValueError("unknown method: %s" % method)
    avg_distance = 0
    if method == "hashing":
        # Build the LSH index first so only queries are timed.
        lsh = LocalitySensitiveHash(traindata, D=1000, m=m)
        t0 = time.time()
        for _, testdoc in testdata.items():
            avg_distance += lsh.nearest_neighbor(testdoc, depth=HW2_DEPTH).distance
    if method == "kdtree":
        # D is a module-level constant (document dimension) - TODO confirm.
        kdt = KDTree(D)
        for i, document in traindata.items():
            kdt.insert(make_dense(document), i)
        t0 = time.time()
        for _, testdoc in testdata.items():
            key = make_dense(testdoc)
            neighbor = kdt.nearest(key, alpha)
            # BUG FIX: the original read docdata[neighbor], an undefined
            # name; neighbor ids refer to the traindata the tree indexed.
            avg_distance += EvalUtil.distance(testdoc, traindata[neighbor])
    mean_time = (time.time() - t0) / len(testdata)
    mean_distance = avg_distance / len(testdata)
    return TestResult(method, m=m, D=D, alpha=alpha,
                      avg_time=mean_time, avg_distance=mean_distance)
def test_kd_tree(n, D, n_test, alphas):
    """
    Tests the query time and distance for a random data set and test set

    @param n: int - the number of points of the dataset
    @param D: int - the dimension of the data points
        (NOTE(review): currently ignored - the module constant DOCDIM is
        used everywhere instead; kept in the signature for compatibility)
    @param n_test: int - the number of points to test
    @param alphas: [float] - a set of alphas to test
    @return [TestResult] array of objects of class TestResult, which has
        the average time and distance for a single query
    """
    documents = RandomData.random_dataset(n, DOCDIM)
    test_documents = RandomData.random_dataset(n_test, DOCDIM)
    rand_tree = KDTree(DOCDIM)
    for i, document in documents.items():
        # BUG FIX: .get(idx) returns None for a missing word id, which would
        # poison the key vector; default to 0 as the sibling variant does.
        key = [document.get(idx, 0) for idx in range(DOCDIM)]
        rand_tree.insert(key, i)
    times = []
    for alpha in alphas:
        start_time = time.clock()
        cum_dist = 0.0
        for i, test_document in test_documents.items():
            key = [test_document.get(idx, 0) for idx in range(DOCDIM)]
            doc_id = rand_tree.nearest(key, alpha)
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        duration = time.clock() - start_time
        times.append(
            TestResult("KDTree", n, DOCDIM, alpha,
                       duration / n_test, cum_dist / n_test))
    return times
def test_kd_tree(train_docs, test_docs, D, alphas):
    """
    Tests the query time and distance for the given training and testing sets

    @param train_docs: dict[int => dict[int => int/float]] - training documents
    @param test_docs: dict[int => dict[int => int/float]] - query documents
    @param D: int - the dimension of the data points
    @param alphas: [float] - a set of alphas to test
    @return [TestResult] array of objects of class TestResult, which has
        the average time and distance for a single query
    """
    # Populate the tree with the training data (untimed).
    print("Forming KD-tree")
    tree = KDTree(D)
    for i, document in train_docs.items():
        key = [document.get(idx, 0) for idx in range(0, D)]
        tree.insert(key, i)
    print("Done")
    times = []
    n = len(test_docs)
    for alpha in alphas:
        # BUG FIX: %d truncated the (float) alpha in the progress message;
        # %s renders it faithfully.
        print("Computing average lookup time and distance to nearest neighbor for alpha = %s" % alpha)
        start_time = time.clock()
        cum_dist = 0.0
        for _, test_doc in test_docs.items():
            key = [test_doc.get(idx, 0) for idx in range(0, D)]
            doc_id = tree.nearest(key, alpha)
            cum_dist += EvalUtil.distance(test_doc, train_docs[doc_id])
        duration = time.clock() - start_time
        times.append(TestResult("KDTree", n, D, alpha, duration / n, cum_dist / n))
        print("Average distance: %f" % (cum_dist / n))
        print("Average time: %f\n" % (duration / n))
    return times
class GaussianRandomProjection(object):
    """
    Approximate nearest-neighbor search via Gaussian random projections:
    each document is projected onto m random vectors and the resulting
    m-dimensional points are indexed in a KD-tree.

    @ivar documents: dict[int => dict[int => int/float]] list of documents
    @ivar D: int - dimension of vectors
    @ivar m: int - number of random projections
    @ivar projection_vectors: [[float]] - the projection vectors
    @ivar kdt: methods.KDTree - a KD-tree instance
    """

    def __init__(self, documents, D, m):
        """
        Creates a GaussianRandomProjection with the specified dimension
        and number of random projections

        @param documents: dict[int => dict[int => int/float]] - the documents
        @param D: int - dimension
        @param m: int - number of random projections
        """
        self.documents = documents
        self.D = D
        self.m = m
        self.projection_vectors = Helper.create_projection_vectors(D, m)
        # Index every document's m-dimensional projection by its id.
        self.kdt = KDTree(m)
        for doc_id, doc in documents.items():
            self.kdt.insert(self.hash_document(doc), doc_id)

    def nearest_neighbor(self, document, alpha):
        """
        Finds the approximate nearest neighbor for given document.

        @param document: dict[int => int/float] - document represented as
            dictionary of word ids => counts
        @param alpha: float - alpha for approximate k-nn
        @return NeighborDistance with the neighbor's id and its distance
        """
        hashed_document = self.hash_document(document)
        nearest_id = self.kdt.nearest(hashed_document, alpha)
        # Distance is measured in the original space, not the hashed one.
        distance = EvalUtil.distance(document, self.documents[nearest_id])
        return NeighborDistance(nearest_id, distance)

    def hash_document(self, document):
        """
        Hashes a document using the random projections

        @param document: dict[int => int/float] - document represented as
            dictionary of word ids => counts
        @return [float] - the document projected onto each of the m vectors
        """
        # (Dead commented-out scaffold removed.)
        return [self.project_document(document, self.projection_vectors[i])
                for i in range(self.m)]

    def project_document(self, document, vector):
        """
        Projects a document onto a vector.

        @param document: dict[int => int/float] - document represented as
            dictionary of word ids => counts
        @param vector: [float] - a vector on which to project the document
        @return float - the sparse dot product of document and vector
        """
        dotprod = 0.0
        # Sparse dot product: the dict only holds keys for nonzero counts.
        # Word ids are 1-based while the vector is 0-based, hence word - 1
        # (convention documented in the sibling implementation).
        for word in document:
            dotprod += document[word] * vector[word - 1]
        return dotprod
def test_kd_tree(documents, test_documents, D, alphas):
    """
    Times approximate KD-tree lookups for each alpha and reports the mean
    per-query time and distance over the test documents.

    @param documents: dict[int => dict[int => int/float]] - documents to index
    @param test_documents: dict[int => dict[int => int/float]] - query documents
    @param D: int - the dimension of the data points
    @param alphas: [float] - a set of alphas to test
    @return [TestResult] - one result per alpha
    """
    n = len(documents)
    n_test = len(test_documents)
    tree = KDTree(D)
    for i, document in documents.items():
        # BUG FIX: .get(idx) yields None for an absent word id; default to 0
        # so sparse documents produce a valid dense key (as the other
        # test_kd_tree variant does).
        key = [document.get(idx, 0) for idx in range(0, D)]
        tree.insert(key, i)
    print("Finished making random tree.")
    times = []
    for alpha in alphas:
        print("Running for alpha %s" % alpha)
        start_time = time.clock()
        cum_dist = 0.0
        print("Running for the test documents...")
        for i, test_document in test_documents.items():
            if i % 50 == 0:
                # Periodic progress indicator.
                print("  %s of %s" % (i, len(test_documents)))
            key = [test_document.get(idx, 0) for idx in range(0, D)]
            doc_id = tree.nearest(key, alpha)
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        print("Finished.")
        duration = time.clock() - start_time
        times.append(TestResult("KDTree", n, D, alpha, duration / n_test, cum_dist / n_test))
    return times
class GaussianRandomProjection(object):
    """
    Approximate k-NN over documents: every document is mapped to an
    m-dimensional point by projecting it onto m Gaussian random vectors,
    and the projected points are stored in a KD-tree for lookup.

    @ivar documents: dict[int => dict[int => int/float]] list of documents
    @ivar D: int - dimension of vectors
    @ivar m: int - number of random projections
    @ivar projection_vectors: [[float]] - the projection vectors
    @ivar kdt: methods.KDTree - a KD-tree instance
    """

    def __init__(self, documents, D, m):
        """
        Builds the projection vectors and indexes every document's
        projection in a KD-tree keyed by document id.

        @param documents: dict[int => dict[int => int/float]] - the documents
        @param D: int - dimension
        @param m: int - number of random projections
        """
        self.documents = documents
        self.D = D
        self.m = m
        self.projection_vectors = Helper.create_projection_vectors(D, m)
        self.kdt = KDTree(m)
        for ident, doc in documents.items():
            self.kdt.insert(self.hash_document(doc), ident)

    def nearest_neighbor(self, document, alpha):
        """
        Returns the approximate nearest neighbor of a document.

        @param document: dict[int => int/float] - word ids => counts
        @param alpha: float - alpha for approximate k-nn
        @return NeighborDistance - neighbor id plus original-space distance
        """
        projected = self.hash_document(document)
        match_id = self.kdt.nearest(projected, alpha)
        dist = EvalUtil.distance(document, self.documents[match_id])
        return NeighborDistance(match_id, dist)

    def hash_document(self, document):
        """
        Projects a document onto all m random vectors.

        @param document: dict[int => int/float] - word ids => counts
        @return [float] - the m projection values
        """
        return [self.project_document(document, self.projection_vectors[k])
                for k in range(self.m)]

    def project_document(self, document, vector):
        """
        Sparse dot product of a document with one projection vector.

        @param document: dict[int => int/float] - word ids => counts
        @param vector: [float] - a vector on which to project the document
        @return float - the dot product
        """
        total = 0.0
        # Only nonzero counts appear as keys; word ids run 1..1000 while
        # the vector is indexed 0..999, hence the -1 shift.
        for word_id, count in document.items():
            total += count * vector[word_id - 1]
        return total
def __init__(self, documents, D, m):
    """
    Constructs the projection index: draws m Gaussian projection vectors
    of dimension D, then inserts each document's m-dimensional projection
    into a KD-tree under its document id.

    @param documents: dict[int => dict[int => int/float]] - the documents
    @param D: int - dimension
    @param m: int - number of random projections
    """
    self.documents = documents
    self.D = D
    self.m = m
    self.projection_vectors = Helper.create_projection_vectors(D, m)
    # KD-tree lives in the m-dimensional projected space.
    self.kdt = KDTree(m)
    for ident, doc in documents.items():
        self.kdt.insert(self.hash_document(doc), ident)
class GaussianRandomProjection(object):
    """
    Indexes documents by their projections onto m Gaussian random vectors
    so that approximate nearest-neighbor queries can be answered with a
    KD-tree search in the projected space.

    @ivar documents: dict[int => dict[int => int/float]] list of documents
    @ivar D: int - dimension of vectors
    @ivar m: int - number of random projections
    @ivar projection_vectors: [[float]] - the projection vectors
    @ivar kdt: methods.KDTree - a KD-tree instance
    """

    def __init__(self, documents, D, m):
        """
        Sets up m random projection vectors of dimension D and inserts
        every document's projected point into the KD-tree.

        @param documents: dict[int => dict[int => int/float]] - the documents
        @param D: int - dimension
        @param m: int - number of random projections
        """
        self.documents = documents
        self.D = D
        self.m = m
        self.projection_vectors = Helper.create_projection_vectors(D, m)
        self.kdt = KDTree(m)
        for key, value in documents.items():
            self.kdt.insert(self.hash_document(value), key)

    def nearest_neighbor(self, document, alpha):
        """
        Looks up the approximate nearest neighbor of the given document.

        @param document: dict[int => int/float] - word ids => counts
        @param alpha: float - alpha for approximate k-nn
        @return NeighborDistance for the matched document
        """
        query = self.hash_document(document)
        best = self.kdt.nearest(query, alpha)
        # Report the true distance in document space, not projected space.
        best_dist = EvalUtil.distance(document, self.documents[best])
        return NeighborDistance(best, best_dist)

    def hash_document(self, document):
        """
        Computes the document's projection onto each random vector.

        @param document: dict[int => int/float] - word ids => counts
        @return [float] - m projection values, in vector order
        """
        projections = []
        for k in range(self.m):
            projections.append(
                self.project_document(document, self.projection_vectors[k]))
        return projections

    def project_document(self, document, vector):
        """
        Projects a document onto a single vector (sparse dot product).

        @param document: dict[int => int/float] - word ids => counts
        @param vector: [float] - a vector on which to project the document
        @return float - the dot product
        """
        result = 0.0
        # Sparse dot product: keys exist only for nonzero values.
        # Words are numbered starting at 1; the vector starts at index 0.
        for wid in document:
            result += document[wid] * vector[wid - 1]
        return result