Beispiel #1
0
class LinkParseJob(object):
    """Disco job that runs ``linkparse_map`` over every dump of a docset.

    The job output is stored in DDFS under the docset's link-file tag
    (``docset.ddfs_link_file_tag``).
    """

    def __init__(self, spec, verbose=False, **kwargs):
        """Prepare the job.

        spec    -- job specification; only ``spec.docset_name`` is read here.
        verbose -- when true, ``start`` prints each document and its links.
        kwargs  -- accepted for call compatibility; not used.
        """
        self.spec = spec
        self.docset = Docset(self.spec.docset_name)
        self.disco = Disco("disco://localhost")
        self.verbose = verbose

    def start(self):
        """Submit the job, block until it finishes, then tag its results."""
        from disco import func
        job = self.disco.new_job(
            name="linkparse",
            input=self.docset.dump_uris(),
            map_reader=docparse,
            map=linkparse_map,
            map_output_stream=(func.map_output_stream,
                               func.disco_output_stream,
                               LinkFileOutputStream.disco_output_stream),
            partitions=0,
            save=True,
        )
        results = job.wait()

        # Print *before* tagging: __tag_results deletes the temporary tag
        # that results[0] names, so iterating the results afterwards would
        # read through a tag that has already been removed.  The finally
        # clause keeps the guarantee that the results are always tagged,
        # even if printing fails.
        try:
            if self.verbose:
                self.__print_results(results)
        finally:
            self.__tag_results(results)

    def __tag_results(self, results):
        """Move the job's blobs to the docset's link-file tag.

        ``results[0]`` is treated as the temporary DDFS tag holding the job
        output; its blobs are put under the permanent tag and the temporary
        tag is then deleted.
        """
        from disco.ddfs import DDFS
        ddfs = DDFS()
        results_tag = results[0]
        ddfs.put(self.docset.ddfs_link_file_tag, list(ddfs.blobs(results_tag)))

        # remove old, temporary tag
        ddfs.delete(results_tag)

    def __print_results(self, results):
        """Print every document URI followed by its tab-indented link URIs."""
        for doc in result_iterator(results, tempdir=False, reader=doclinksparse):
            # Single-argument call form: same output under the Python 2
            # print statement and the Python 3 print function.
            print("%s\n\t%s" % (doc.uri, "\n\t".join(doc.link_uris)))
Beispiel #2
0
class TestDocset(unittest.TestCase):
    """Tests for Docset: adding dumps, counting, lookup and persistence.

    NOTE(review): these appear to run against a live DDFS (dumps are fetched
    back over HTTP in test_add_dump) -- confirm the required environment.
    """

    def setUp(self):
        # The dump fixture files only need to be written once per run; the
        # module-level WROTE_DUMP flag remembers that across test methods.
        global WROTE_DUMP
        if not WROTE_DUMP:
            self.__write_dumps()
            WROTE_DUMP = True
        self.docset = Docset(DOCSET_NAME)

    def tearDown(self):
        # Every test starts from a docset that has been deleted.
        self.docset.delete()

    def test_delete(self):
        self.docset.add_dump("d1", dump1)
        self.docset.delete()
        # If this test is failing, then we might need to wait for DDFS garbage
        # collection here, but it seems to work fine for now.
        self.assertFalse("d1" in self.docset.dump_names())

    def test_add_dump(self):
        self.docset.add_dump("d1", dump1)

        # check that it's in list of dumps
        self.assertTrue("d1" in self.docset.dump_names())

        # check accessible over http
        from disco.util import urlresolve
        import urllib2

        uri = list(self.docset.dump_uris())[0]
        httpuri = urlresolve(uri)
        d = urllib2.urlopen(httpuri).read()
        self.assertEqual(d, fixtures.warc_file1)

    def test_exists(self):
        self.assertFalse(self.docset.exists())
        self.docset.add_dump("d1", dump1)
        self.assertTrue(self.docset.exists())

    def test_doc_count(self):
        self.assertEqual(0, self.docset.doc_count)
        self.docset.add_dump("d1", dump1)
        self.assertEqual(2, self.docset.doc_count)
        self.docset.add_dump("d2", dump2)
        self.assertEqual(4, self.docset.doc_count)

    def test_persists_doc_count(self):
        # A freshly constructed Docset must see the count saved by another
        # instance, after each save.
        self.docset.add_dump("d1", dump1)
        self.docset.save()
        self.assertEqual(2, Docset(DOCSET_NAME).doc_count)
        self.docset.add_dump("d2", dump2)
        self.docset.save()
        # NOTE(review): the original carried a commented-out 2 second sleep
        # at this point, presumably to let the save settle -- reinstate a
        # wait here if this assertion turns out to be flaky.
        docsetp = Docset(DOCSET_NAME)
        self.assertEqual(4, docsetp.doc_count)

    def test_persists_doc_count_on_non_new_load(self):
        # Adding to a docset that was loaded from saved state must extend the
        # saved count rather than restart it.
        self.docset.add_dump("d1", dump1)
        self.docset.save()
        docsetp = Docset(DOCSET_NAME)
        docsetp.add_dump("d2", dump2)
        docsetp.save()
        self.assertEqual(4, Docset(DOCSET_NAME).doc_count)

    def test_get_pos(self):
        # Expected positions are (dump name, offset, length) for each URI in
        # the two fixture dumps.
        self.docset.add_dump("d1", dump1)
        self.docset.add_dump("d2", dump2)
        pos1 = self.docset.get_pos("http://example.com")
        self.assertEqual(("d1", 0, 117), pos1)
        pos2 = self.docset.get_pos("http://apple.com")
        self.assertEqual(("d1", 117, 113), pos2)
        pos3 = self.docset.get_pos("http://example.com/m.html")
        self.assertEqual(("d2", 0, 115), pos3)
        pos4 = self.docset.get_pos("http://example.com/z.html")
        self.assertEqual(("d2", 115, 115), pos4)

    def test_persists_get_pos(self):
        self.docset.add_dump("d1", dump1)
        self.docset.add_dump("d2", dump2)
        self.docset.save()
        # All lookups go through the reloaded instance; querying
        # self.docset here would only re-test the in-memory state.
        docsetp = Docset(DOCSET_NAME)
        pos1 = docsetp.get_pos("http://example.com")
        self.assertEqual(("d1", 0, 117), pos1)
        pos2 = docsetp.get_pos("http://apple.com")
        self.assertEqual(("d1", 117, 113), pos2)
        pos3 = docsetp.get_pos("http://example.com/m.html")
        self.assertEqual(("d2", 0, 115), pos3)
        pos4 = docsetp.get_pos("http://example.com/z.html")
        self.assertEqual(("d2", 115, 115), pos4)

    def test_get(self):
        self.docset.add_dump("d1", dump1)
        self.docset.add_dump("d2", dump2)
        self.assertEqual(fixtures.example, self.docset.get("http://example.com"))
        self.assertEqual(fixtures.apple, self.docset.get("http://apple.com"))
        self.assertEqual(fixtures.examplez, self.docset.get("http://example.com/z.html"))

    def test_get_bad_uri_raises(self):
        from freequery.document.docset import DocumentNotFound

        self.docset.add_dump("d1", dump1)
        # assertRaises calls get_pos with the given argument itself, so no
        # lambda wrapper is needed.
        self.assertRaises(DocumentNotFound,
                          self.docset.get_pos, "http://baduri.example.com")

    def __write_dumps(self):
        # Write the two WARC fixture payloads to the dump1 / dump2 paths.
        with open(dump1, "w+b") as f:
            f.write(fixtures.warc_file1)
        with open(dump2, "w+b") as f:
            f.write(fixtures.warc_file2)