def test_persists_get_pos(self): self.docset.add_dump("d1", dump1) self.docset.add_dump("d2", dump2) self.docset.save() docsetp = Docset(DOCSET_NAME) pos1 = docsetp.get_pos("http://example.com") self.assertEquals(("d1", 0, 117), pos1) pos2 = docsetp.get_pos("http://apple.com") self.assertEquals(("d1", 117, 113), pos2) pos3 = self.docset.get_pos("http://example.com/m.html") self.assertEquals(("d2", 0, 115), pos3) pos4 = self.docset.get_pos("http://example.com/z.html") self.assertEquals(("d2", 115, 115), pos4)
class TestDocset(unittest.TestCase): def setUp(self): global WROTE_DUMP if not WROTE_DUMP: self.__write_dumps() WROTE_DUMP = True self.docset = Docset(DOCSET_NAME) def tearDown(self): self.docset.delete() def test_delete(self): self.docset.add_dump("d1", dump1) self.docset.delete() # If this test is failing, then we might need to wait for DDFS garbage # collection here, but it seems to work fine for now. self.assertFalse("d1" in self.docset.dump_names()) def test_add_dump(self): self.docset.add_dump("d1", dump1) # check that it's in list of dumps self.assertTrue("d1" in self.docset.dump_names()) # check accessible over http from disco.util import urlresolve import urllib2 uri = list(self.docset.dump_uris())[0] httpuri = urlresolve(uri) d = urllib2.urlopen(httpuri).read() self.assertEquals(d, fixtures.warc_file1) def test_exists(self): self.assertFalse(self.docset.exists()) self.docset.add_dump("d1", dump1) self.assertTrue(self.docset.exists()) def test_doc_count(self): self.assertEquals(0, self.docset.doc_count) self.docset.add_dump("d1", dump1) self.assertEquals(2, self.docset.doc_count) self.docset.add_dump("d2", dump2) self.assertEquals(4, self.docset.doc_count) def test_persists_doc_count(self): self.docset.add_dump("d1", dump1) self.docset.save() self.assertEquals(2, Docset(DOCSET_NAME).doc_count) self.docset.add_dump("d2", dump2) self.docset.save() # import time # time.sleep(2.0) docsetp = Docset(DOCSET_NAME) self.assertEquals(4, docsetp.doc_count) def test_persists_doc_count_on_non_new_load(self): self.docset.add_dump("d1", dump1) self.docset.save() docsetp = Docset(DOCSET_NAME) docsetp.add_dump("d2", dump2) docsetp.save() self.assertEquals(4, Docset(DOCSET_NAME).doc_count) def test_get_pos(self): self.docset.add_dump("d1", dump1) self.docset.add_dump("d2", dump2) pos1 = self.docset.get_pos("http://example.com") self.assertEquals(("d1", 0, 117), pos1) pos2 = self.docset.get_pos("http://apple.com") self.assertEquals(("d1", 117, 113), pos2) pos3 = self.docset.get_pos("http://example.com/m.html") self.assertEquals(("d2", 0, 115), pos3) pos4 = self.docset.get_pos("http://example.com/z.html") self.assertEquals(("d2", 115, 115), pos4) def test_persists_get_pos(self): self.docset.add_dump("d1", dump1) self.docset.add_dump("d2", dump2) self.docset.save() docsetp = Docset(DOCSET_NAME) pos1 = docsetp.get_pos("http://example.com") self.assertEquals(("d1", 0, 117), pos1) pos2 = docsetp.get_pos("http://apple.com") self.assertEquals(("d1", 117, 113), pos2) pos3 = self.docset.get_pos("http://example.com/m.html") self.assertEquals(("d2", 0, 115), pos3) pos4 = self.docset.get_pos("http://example.com/z.html") self.assertEquals(("d2", 115, 115), pos4) def test_get(self): self.docset.add_dump("d1", dump1) self.docset.add_dump("d2", dump2) self.assertEquals(fixtures.example, self.docset.get("http://example.com")) self.assertEquals(fixtures.apple, self.docset.get("http://apple.com")) self.assertEquals(fixtures.examplez, self.docset.get("http://example.com/z.html")) def test_get_bad_uri_raises(self): from freequery.document.docset import DocumentNotFound self.docset.add_dump("d1", dump1) self.assertRaises(DocumentNotFound, lambda: self.docset.get_pos("http://baduri.example.com")) def __write_dumps(self): with open(dump1, "w+b") as f: f.write(fixtures.warc_file1) with open(dump2, "w+b") as f: f.write(fixtures.warc_file2)