def test_medium_minhashexcludes(self):
    # Minhash must exclude at least some misses: the candidate list for the
    # first test book has to contain at least 1 and fewer than 5 entries.
    manager = zTestDataManager.TestBookManager('samplebooks.zip', '../testbooks/')
    manager.unpack_archive(max=10)

    # Build an altered copy of the first test book.
    # NOTE(review): presumably make_error_dupes writes the copy next to the
    # other test books so the second get_testbooks() call sees it -- confirm.
    source_file = manager.get_testbooks()[0]
    error_maker = zTestDataManager.ErrorMaker()
    dupe_file = manager.make_error_dupes([source_file], errormaker=error_maker)[0]

    # Load every test book into a fresh library.
    all_files = manager.get_testbooks()
    library = Library.Library()
    for text_path in all_files:
        entry = Book(textfile=text_path)
        entry.initialize_text_data()
        library.add_book(entry)

    # The looked-up duplicate is not used by the assertions below; the call
    # is retained because it may raise if the duplicate was never added.
    library.get_book_textfile(dupe_file)

    target = library.get_book_textfile(source_file)
    candidates = library.get_possible_matches(target)
    candidate_count = len(candidates)

    self.assertLess(
        candidate_count,
        5,
        msg="minhash was not able to eliminate at least one possible book",
    )
    self.assertGreaterEqual(
        candidate_count,
        1,
        msg="minhash didn't get at least one hit",
    )

    # Hand the library to the manager's own verification and print its report.
    report = manager.verify_minhash_results(library)
    manager.print_formatted_minhash_results(report)
def test_medium_minhashfindsmatches(self):
    # Make certain minhash actually works: the possible matches reported for
    # a test book must include the altered duplicate generated from it.
    testmgr = zTestDataManager.TestBookManager('samplebooks.zip', '../testbooks/')
    testmgr.unpack_archive(max=2)

    # Generate a duplicate of the first test book via the error maker.
    # NOTE(review): presumably the duplicate is then returned by the second
    # get_testbooks() call and so ends up in the library -- confirm.
    testbook = testmgr.get_testbooks()[0]
    e = zTestDataManager.ErrorMaker()
    dupe = testmgr.make_error_dupes([testbook], errormaker=e)[0]

    # Load every test book into a fresh library.
    books = testmgr.get_testbooks()
    library = Library.Library()
    for book in books:
        mybook = Book(textfile=book)
        mybook.initialize_text_data()
        library.add_book(mybook)

    # Swap the file references for the Book objects held by the library.
    dupe = library.get_book_textfile(dupe)
    testablebook = library.get_book_textfile(testbook)
    possiblematches = library.get_possible_matches(testablebook)

    # Each match is an (id, score) pair; only the id matters here. The name
    # match_id avoids shadowing the builtin id().
    self.assertTrue(
        any(match_id == dupe.id for match_id, _score in possiblematches),
        msg="minhash did not return the generated duplicate as a possible match",
    )