Esempio n. 1
0
def test_approx_nn(method, traindata, testdata, m, alpha):
    """
    Times approximate nearest-neighbor queries and measures result quality.
    @param method: str - "hashing" (LSH) or "kdtree"
    @param traindata: dict[int => dict[int => int/float]] - training documents
    @param testdata: dict[int => dict[int => int/float]] - query documents
    @param m: int - number of projections / hash bits (used by LSH)
    @param alpha: float - approximation factor (used by the KD-tree)
    @return TestResult with the mean per-query time and mean distance
    @raise ValueError: if method is not one of the supported strings
    """
    avg_distance = 0
    if method == "hashing":
        #train
        lsh = LocalitySensitiveHash(traindata, D=1000, m=m)
        #time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            avg_distance += lsh.nearest_neighbor(testdoc,
                                                 depth=HW2_DEPTH).distance
    elif method == "kdtree":
        #train
        # NOTE(review): D here is a module-level constant -- confirm it matches
        # the dimensionality of the documents being inserted.
        kdt = KDTree(D)
        for i, document in traindata.iteritems():
            key = make_dense(document)
            kdt.insert(key, i)
        #time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            key = make_dense(testdoc)
            neighbor = kdt.nearest(key, alpha)
            # was: docdata[neighbor] -- an undefined name; the id returned by
            # kdt.nearest indexes the training set, as in the sibling helpers
            avg_distance += EvalUtil.distance(testdoc, traindata[neighbor])
    else:
        # previously fell through and crashed with UnboundLocalError on t0
        raise ValueError("unknown method: %s" % method)

    #finish timing, report results
    mean_time = (time.time() - t0) / len(testdata)
    mean_distance = avg_distance / len(testdata)
    return TestResult(method,
                      m=m,
                      D=D,
                      alpha=alpha,
                      avg_time=mean_time,
                      avg_distance=mean_distance)
Esempio n. 2
0
def test_approx_nn(method, traindata, testdata, m, alpha):
    """
    Benchmark one approximate nearest-neighbor method ("hashing" or
    "kdtree"): builds the index over traindata, then reports the average
    per-query time and average distance over testdata.
    """
    total_dist = 0
    if method == "hashing":
        # build the LSH index over the training documents
        index = LocalitySensitiveHash(traindata, D=1000, m=m)
        start = time.time()
        for doc_key, query in testdata.iteritems():
            total_dist += index.nearest_neighbor(query, depth=HW2_DEPTH).distance
    if method == "kdtree":
        # build the KD-tree over the training documents
        tree = KDTree(D)
        for doc_id, doc in traindata.iteritems():
            tree.insert(make_dense(doc), doc_id)
        start = time.time()
        for _, query in testdata.iteritems():
            hit = tree.nearest(make_dense(query), alpha)
            total_dist += EvalUtil.distance(query, docdata[hit])

    # stop the clock and average over the number of queries
    elapsed = time.time() - start
    return TestResult(method, m=m, D=D, alpha=alpha,
                      avg_time=elapsed / len(testdata),
                      avg_distance=total_dist / len(testdata))
Esempio n. 3
0
def test_kd_tree(n, D, n_test, alphas):
    """
    Tests the query time and distance for a random data set and test set
    @param n: int - the number of points of the dataset
    @param D: int - the dimension of the data points (currently unused: the
        module-level DOCDIM is used instead -- TODO confirm intent)
    @param n_test: int - the number of points to test
    @param alphas: [float] - a set of alphas to test
    @return [TestResult] array of objects of class TestResult, which has the
        average time and distance for a single query
    """
    documents = RandomData.random_dataset(n, DOCDIM)
    test_documents = RandomData.random_dataset(n_test, DOCDIM)

    rand_tree = KDTree(DOCDIM)
    for i, document in documents.iteritems():
        # densify the sparse document, defaulting absent coordinates to 0
        # (a bare .get(idx) put None into the key vector; the sibling
        # test_kd_tree variant uses .get(idx, 0))
        key = [document.get(idx, 0) for idx in xrange(0, DOCDIM)]
        rand_tree.insert(key, i)

    times = []
    for alpha in alphas:
        start_time = time.clock()
        cum_dist = 0.0
        for i, test_document in test_documents.iteritems():
            key = [test_document.get(idx, 0) for idx in xrange(0, DOCDIM)]
            doc_id = rand_tree.nearest(key, alpha)
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        duration = time.clock() - start_time
        times.append(
            TestResult("KDTree", n, DOCDIM, alpha, duration / n_test,
                       cum_dist / n_test))
    return times
Esempio n. 4
0
def test_kd_tree(train_docs, test_docs, D, alphas):
	"""
	Tests the query time and distance for the given training and testing sets
	@param D: int - the dimension of the data points
	@param alphas: [float] - a set of alphas to test
	@return [TestResult] array of objects of class TestResult, which has the average time and distance for a single query
	"""

	# Populate the tree with the training data
	print "Forming KD-tree"
	tree = KDTree(D)
	for i, document in train_docs.iteritems():
		key = [document.get(idx,0) for idx in xrange(0, D)]
		tree.insert(key, i)
	print "Done"

	times = []
	n = len(test_docs)
	for alpha in alphas:
		print "Computing average lookup time and distance to nearest neighbor for alpha = %d" %alpha
		start_time = time.clock()
		cum_dist = 0.0
		for i, test_doc in test_docs.iteritems():
			key = [test_doc.get(idx,0) for idx in xrange(0, D)]
			doc_id = tree.nearest(key, alpha)
			cum_dist += EvalUtil.distance(test_doc, train_docs[doc_id])
		duration = time.clock() - start_time
		times.append(TestResult("KDTree", n, D, alpha, duration / n, cum_dist / n))
		print "Average distance: %f" %(cum_dist / n)
		print "Average time: %f\n" %(duration / n)
	return times
Esempio n. 5
0
class GaussianRandomProjection(object):
    """
    Approximate nearest-neighbor search via Gaussian random projections.

    @ivar documents: dict[int => dict[int => int/float]] list of documents
    @ivar D: int - dimension of vectors
    @ivar m: int - number of random projections
    @ivar projection_vectors: [[float]] - the projection vectors
    @ivar kdt: methods.KDTree - a KD-tree instance
    """
    def __init__(self, documents, D, m):
        """
        Creates a GaussianRandomProjection with the specified dimension and
        number of random projections, and indexes every document's projected
        key in an m-dimensional KD-tree.
        @param documents: dict[int => dict[int => int/float]] - the documents
        @param D: int - dimension
        @param m: int - number of random projections
        """
        self.documents = documents
        self.D = D
        self.m = m
        self.projection_vectors = Helper.create_projection_vectors(D, m)
        self.kdt = KDTree(m)
        for doc_id, doc in documents.iteritems():
            self.kdt.insert(self.hash_document(doc), doc_id)

    def nearest_neighbor(self, document, alpha):
        """
        Finds the approximate nearest neighbor for given document.
        @param document: dict[int => int/float] - document represented as dictionary of word ids => counts
        @param alpha: float - alpha for approximate k-nn
        @return NeighborDistance with the neighbor's id and true distance
        """
        hashed_document = self.hash_document(document)
        nearest_id = self.kdt.nearest(hashed_document, alpha)
        # distance is computed in the original document space, not the
        # projected space
        distance = EvalUtil.distance(document, self.documents[nearest_id])
        return NeighborDistance(nearest_id, distance)

    def hash_document(self, document):
        """
        Hashes a document using the random projections
        @param document: dict[int => int/float] - document represented as dictionary of word ids => counts
        @return [float] - the document projected onto the m random vectors
        """
        # removed the commented-out placeholder scaffold that preceded this
        return [
            self.project_document(document, self.projection_vectors[i])
            for i in xrange(self.m)
        ]

    def project_document(self, document, vector):
        """
        Projects a document onto a vector (sparse dot product).
        @param document: dict[int => int/float] - document represented as dictionary of word ids => counts
        @param vector: [float] - a vector on which to project the document
        """
        dotprod = 0.0
        for word in document:
            # word ids are 1-based while the vector is 0-indexed
            dotprod += document[word] * vector[word - 1]
        return dotprod
Esempio n. 6
0
def test_kd_tree(documents, test_documents, D, alphas):
    n = len(documents)
    n_test = len(test_documents)

    tree = KDTree(D)
    for i, document in documents.iteritems():
        key = [document.get(idx) for idx in xrange(0, D)]
        tree.insert(key, i)

    print "Finished making random tree."
    times = []
    for alpha in alphas:
        print "Running for alpha", alpha
        start_time = time.clock()
        cum_dist = 0.0
        print "Running for the test documents..."
        for i, test_document in test_documents.iteritems():
            if i%50 == 0:
                print "  ", i, "of", len(test_documents)
            key = [test_document.get(idx) for idx in xrange(0, D)]
            doc_id = tree.nearest(key, alpha)
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        print "Finished."
        duration = time.clock() - start_time
        times.append(TestResult("KDTree", n, D, alpha, duration / n_test, cum_dist / n_test))
    return times
class GaussianRandomProjection(object):
    """
    Indexes documents by their Gaussian random projections for approximate
    nearest-neighbor lookup.

    @ivar documents: dict[int => dict[int => int/float]] list of documents
    @ivar D: int - dimension of vectors
    @ivar m: int - number of random projections
    @ivar projection_vectors: [[float]] - the projection vectors
    @ivar kdt: methods.KDTree - a KD-tree instance
    """
    def __init__(self, documents, D, m):
        """
        Builds m projection vectors and inserts every document's projected
        key into a KD-tree over R^m.
        @param documents: dict[int => dict[int => int/float]] - the documents
        @param D: int - dimension
        @param m: int - number of random projections
        """
        self.documents = documents
        self.D = D
        self.m = m
        self.projection_vectors = Helper.create_projection_vectors(D, m)
        self.kdt = KDTree(m)
        for identifier, doc in documents.iteritems():
            self.kdt.insert(self.hash_document(doc), identifier)

    def nearest_neighbor(self, document, alpha):
        """
        Returns the (approximate) nearest stored document to `document`.
        @param document: dict[int => int/float] - word id => count mapping
        @param alpha: float - alpha for approximate k-nn
        """
        key = self.hash_document(document)
        winner = self.kdt.nearest(key, alpha)
        # report the distance in the original (unprojected) document space
        return NeighborDistance(
            winner, EvalUtil.distance(document, self.documents[winner]))

    def hash_document(self, document):
        """
        Maps a sparse document to its m-dimensional projected key.
        @param document: dict[int => int/float] - word id => count mapping
        """
        return [self.project_document(document, self.projection_vectors[i])
                for i in xrange(self.m)]

    def project_document(self, document, vector):
        """
        Sparse dot product of a document with one projection vector; only
        the document's nonzero entries are visited.
        @param document: dict[int => int/float] - word id => count mapping
        @param vector: [float] - a vector on which to project the document
        """
        total = 0.0
        for wid in document:
            # word ids start at 1 while the vector is 0-indexed
            total += document[wid] * vector[wid - 1]
        return total
 def __init__(self, documents, D, m):
     """
     Creates a GaussianRandomProjection with the specified dimension and
     number of random projections
     @param documents: dict[int => dict[int => int/float]] - the documents
     @param D: int - dimension
     @param m: int - number of random projections
     """
     # NOTE(review): this def sits at a one-space indent and its enclosing
     # class is not visible here -- it appears to be a duplicated fragment of
     # GaussianRandomProjection.__init__; confirm it belongs in this file.
     self.documents = documents
     self.D = D
     self.m = m
     # m Gaussian projection vectors of dimension D (built by Helper)
     self.projection_vectors = Helper.create_projection_vectors(D, m)
     # index each document's m-dimensional projection for approximate k-nn
     self.kdt = KDTree(m)
     for doc_id, doc in documents.iteritems():
         self.kdt.insert(self.hash_document(doc), doc_id)
 def __init__(self, documents, D, m):
     """
     Creates a GaussianRandomProjection with the specified dimension and
     number of random projections
     @param documents: dict[int => dict[int => int/float]] - the documents
     @param D: int - dimension
     @param m: int - number of random projections
     """
     # NOTE(review): second orphaned copy of this constructor at a one-space
     # indent with no visible enclosing class -- likely a duplicated
     # fragment; confirm whether it should be removed.
     self.documents = documents
     self.D = D
     self.m = m
     # build the m random projection vectors over R^D
     self.projection_vectors = Helper.create_projection_vectors(D, m)
     # KD-tree over the projected (m-dimensional) keys
     self.kdt = KDTree(m)
     for doc_id, doc in documents.iteritems():
         self.kdt.insert(self.hash_document(doc), doc_id)
class GaussianRandomProjection(object):
    """
    Gaussian random projection index for approximate nearest neighbors.

    @ivar documents: dict[int => dict[int => int/float]] list of documents
    @ivar D: int - dimension of vectors
    @ivar m: int - number of random projections
    @ivar projection_vectors: [[float]] - the projection vectors
    @ivar kdt: methods.KDTree - a KD-tree instance
    """

    def __init__(self, documents, D, m):
        """
        Creates the projection vectors and inserts every document, keyed by
        its m-dimensional projection, into a KD-tree.
        @param documents: dict[int => dict[int => int/float]] - the documents
        @param D: int - dimension
        @param m: int - number of random projections
        """
        self.documents = documents
        self.D = D
        self.m = m
        self.projection_vectors = Helper.create_projection_vectors(D, m)
        self.kdt = KDTree(m)
        for key_id, sparse_doc in documents.iteritems():
            self.kdt.insert(self.hash_document(sparse_doc), key_id)

    def nearest_neighbor(self, document, alpha):
        """
        Looks up the approximate nearest neighbor of `document`.
        @param document: dict[int => int/float] - word id => count mapping
        @param alpha: float - alpha for approximate k-nn
        """
        match_id = self.kdt.nearest(self.hash_document(document), alpha)
        # distance is measured between the original sparse documents
        dist = EvalUtil.distance(document, self.documents[match_id])
        return NeighborDistance(match_id, dist)

    def hash_document(self, document):
        """
        Computes the document's m-dimensional projected key.
        @param document: dict[int => int/float] - word id => count mapping
        """
        projected = []
        for idx in xrange(self.m):
            projected.append(
                self.project_document(document, self.projection_vectors[idx]))
        return projected

    def project_document(self, document, vector):
        """
        Sparse dot product of `document` with `vector`: only keys present in
        the document (the nonzero entries) contribute.
        @param document: dict[int => int/float] - word id => count mapping
        @param vector: [float] - a vector on which to project the document
        """
        # word ids start at 1; the projection vector is 0-indexed
        return sum((document[w] * vector[w - 1] for w in document), 0.0)