Esempio n. 1
0
    def compare_algs_tfidf_simhashtfidf(self):
        """Measure how often SimHashTfIdf and DocSim agree on similarity order.

        For every triple ``(t1, t2, t3)`` returned by
        ``self.generate_random_triples()`` both models are asked whether
        ``t1`` is less similar to ``t2`` than it is to ``t3``.  Triples for
        which DocSim reports a (near) zero similarity for *both* pairs are
        skipped and do not count toward the total.

        Returns:
            float: fraction of the counted triples on which the two models
            give the same answer.  ``0.0`` is returned when no triple was
            counted (empty input or every triple skipped) instead of
            raising ``ZeroDivisionError``.
        """
        # Similarities whose magnitude is below this are treated as zero.
        epsilon = 0.000001

        triples = self.generate_random_triples()
        doc_sim = DocSim(self.document_set)
        sim_hash = SimHashTfIdf(self.document_set)

        compared = 0
        agreed = 0
        for t1, t2, t3 in triples:
            dsim1 = doc_sim.similarity(t1, t2)
            dsim2 = doc_sim.similarity(t1, t3)
            ssim1 = sim_hash.similarity(t1, t2)
            ssim2 = sim_hash.similarity(t1, t3)

            if abs(dsim1) < epsilon and abs(dsim2) < epsilon:
                continue

            compared += 1
            if (dsim1 < dsim2) == (ssim1 < ssim2):
                agreed += 1

        if compared == 0:
            return 0.0
        # Explicit float conversion keeps true division on Python 2 as well.
        return agreed / float(compared)
Esempio n. 2
0
    def benchmark_memory_tfidf(self, iterations):
        """Run random DocSim similarity queries and report peak memory use.

        A DocSim model is built from ``self.document_set`` and
        ``iterations`` similarity queries are run on pairs of documents
        chosen with ``self.select_random_document()``.

        Returns:
            The process's ``ru_maxrss`` value divided by 1000.
        """
        model = DocSim(self.document_set)
        for _ in range(iterations):
            first = self.select_random_document()
            second = self.select_random_document()
            model.similarity(first, second)

        # NOTE(review): the unit of ru_maxrss is platform-dependent
        # (kilobytes on Linux, bytes on macOS) -- confirm the intended unit.
        usage = resource.getrusage(resource.RUSAGE_SELF)
        return usage.ru_maxrss / 1000
Esempio n. 3
0
    def benchmark_tfidf(self, iterations):
        """Time building a DocSim model and running random similarity queries.

        The measured span covers construction of the DocSim model from
        ``self.document_set`` plus ``iterations`` similarity queries on
        pairs of documents chosen with ``self.select_random_document()``.

        Returns:
            float: elapsed time in seconds.
        """
        # time.clock() was removed in Python 3.8.  Prefer perf_counter()
        # where it exists and fall back to clock() only on interpreters
        # that predate it.
        timer = getattr(time, "perf_counter", None) or time.clock

        started = timer()
        model = DocSim(self.document_set)
        for _ in range(iterations):
            first = self.select_random_document()
            second = self.select_random_document()
            model.similarity(first, second)

        return timer() - started