Example #1
0
 def test_persists_doc_count_on_non_new_load(self):
     # A docset re-opened from saved state must keep counting from the
     # persisted total when another dump is added to it.
     self.docset.add_dump("d1", dump1)
     self.docset.save()

     # Re-open the saved docset, extend it and save again.
     reloaded = Docset(DOCSET_NAME)
     reloaded.add_dump("d2", dump2)
     reloaded.save()

     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(4, Docset(DOCSET_NAME).doc_count)
Example #2
0
 def test_persists_get_pos(self):
     # Positions recorded for both dumps must survive a save/reload cycle.
     self.docset.add_dump("d1", dump1)
     self.docset.add_dump("d2", dump2)
     self.docset.save()

     # Every lookup below goes through the freshly loaded instance; querying
     # self.docset instead would only exercise the in-memory state and would
     # not test persistence at all.
     docsetp = Docset(DOCSET_NAME)
     pos1 = docsetp.get_pos("http://example.com")
     self.assertEqual(("d1", 0, 117), pos1)
     pos2 = docsetp.get_pos("http://apple.com")
     self.assertEqual(("d1", 117, 113), pos2)
     pos3 = docsetp.get_pos("http://example.com/m.html")
     self.assertEqual(("d2", 0, 115), pos3)
     pos4 = docsetp.get_pos("http://example.com/z.html")
     self.assertEqual(("d2", 115, 115), pos4)
Example #3
0
class FreequeryClient(object):
    """Entry point for querying and maintaining the docset described by a spec.

    Holds the spec, a Discodex client and the `Docset` named by the spec, and
    exposes the query, index, link-parse and rank operations on top of them.
    """

    def __init__(self, spec):
        """Create a client for `spec`.

        `spec` is used as-is when it already is a `Spec`; any other value is
        passed to the `Spec` constructor.
        """
        self.spec = spec if isinstance(spec, Spec) else Spec(spec)
        self.discodex_client = DiscodexClient()
        self.docset = Docset(self.spec.docset_name)

    def query(self, q, ranked=True):
        """Return the `Document` instances matching the query string `q`.

        Each document gets a `score` built from the index result and, when
        `ranked` is true, a 'pagerank' entry looked up in the score DB. The
        documents are returned in the order the index query produced them;
        this method attaches scores but does not sort.

        Returns an empty list when nothing matches.
        Raises `Exception` when `ranked` is true and the score DB has no
        ranks for the matching URIs.
        """
        qq = Query.parse(q)
        raw = self.discodex_client.query(self.spec.invindex_name, qq)
        # Materialise the results: they are tested for emptiness and then
        # iterated twice, which a lazy map() object would not support.
        res = [TfIdf.undemux(entry) for entry in raw]
        if not res:
            return []

        pageranks = None
        if ranked:
            scoredb = ScoreDB(self.spec.scoredb_path)
            uris = [e[0] for e in res]
            pageranks = dict(scoredb.rank(uris))
            if not pageranks:
                raise Exception("no ranks available")

        docs = []
        for uri, scores in res:
            doc = self.docset.get(uri)
            doc.score = Score(**scores)
            if pageranks:
                doc.score['pagerank'] = pageranks[uri]
            # Replaces the document's `excerpt` callable with the excerpt
            # computed for this query.
            doc.excerpt = doc.excerpt(qq)
            docs.append(doc)
        return docs

    def index(self, **kwargs):
        """Start an `IndexJob` for the docset; `kwargs` go to the job.

        If the docset does not exist, prints an error message and terminates
        the process with exit status 1.
        """
        if not self.docset.exists():
            # sys.exit is the supported way to exit from a program; the bare
            # exit() helper is injected by the `site` module for interactive
            # use and is not guaranteed to exist.
            import sys
            print("fq: cannot index `%s': no such docset" % self.spec.docset_name)
            sys.exit(1)
        job = IndexJob(self.spec, self.discodex_client, **kwargs)
        job.start()

    def linkparse(self, **kwargs):
        """Start a `LinkParseJob` for the spec; `kwargs` go to the job."""
        job = LinkParseJob(self.spec, **kwargs)
        job.start()

    def rank(self, **kwargs):
        """Start a `PagerankJob` for the spec; `kwargs` go to the job."""
        job = PagerankJob(self.spec, **kwargs)
        job.start()
Example #4
0
 def __init__(self, spec):
     """Bind the client to `spec`, wrapping it in a `Spec` when necessary."""
     # Accept either a ready-made Spec or whatever Spec() can be built from.
     self.spec = spec if isinstance(spec, Spec) else Spec(spec)
     self.discodex_client = DiscodexClient()
     self.docset = Docset(self.spec.docset_name)
Example #5
0
class LinkParseJob(object):
    """Runs the "linkparse" Disco job over the dumps of a docset.

    The job's output blobs are re-tagged under the docset's link-file tag so
    they can be found again after the job's own results tag is removed.
    """

    def __init__(self, spec, verbose=False, **kwargs):
        """Prepare a job for the docset named by `spec`.

        `verbose` makes `start()` print the parsed links of every document.
        Extra keyword arguments are accepted and ignored.
        """
        self.spec = spec
        self.docset = Docset(self.spec.docset_name)
        self.disco = Disco("disco://localhost")
        self.verbose = verbose

    def start(self):
        """Run the job, wait for it to finish and tag its results."""
        from disco import func
        job = self.disco.new_job(
            name="linkparse",
            input=self.docset.dump_uris(),
            map_reader=docparse,
            map=linkparse_map,
            map_output_stream=(func.map_output_stream,
                               func.disco_output_stream,
                               LinkFileOutputStream.disco_output_stream),
            partitions=0,
            save=True,
        )
        results = job.wait()

        # Print before tagging: __tag_results deletes the tag that `results`
        # refers to, so iterating the results afterwards would read from a
        # tag that no longer exists.
        if self.verbose:
            self.__print_results(results)

        self.__tag_results(results)

    def __tag_results(self, results):
        """Move the job's result blobs under the docset's link-file tag."""
        from disco.ddfs import DDFS
        ddfs = DDFS()
        # The first entry of `results` is treated as the DDFS tag holding the
        # job output (presumably the only entry because the job is created
        # with save=True -- confirm against the Disco version in use).
        results_tag = results[0]
        ddfs.put(self.docset.ddfs_link_file_tag, list(ddfs.blobs(results_tag)))

        # remove old, temporary tag
        ddfs.delete(results_tag)

    def __print_results(self, results):
        """Print each result document's URI followed by its link URIs."""
        for doc in result_iterator(results, tempdir=False, reader=doclinksparse):
            print("%s\n\t%s" % (doc.uri, "\n\t".join(doc.link_uris)))
Example #6
0
 def __init__(self, spec, verbose=False, **kwargs):
     """Remember `spec` and `verbose`, then open the docset and Disco master.

     Extra keyword arguments are accepted and ignored.
     """
     self.verbose = verbose
     self.spec = spec
     # The docset is looked up by the name carried in the spec.
     self.docset = Docset(spec.docset_name)
     self.disco = Disco("disco://localhost")
Example #7
0
 def setUp(self):
     """Write the dump files once, then open a docset for the test."""
     global WROTE_DUMP
     dumps_missing = not WROTE_DUMP
     if dumps_missing:
         # Only the first test to run writes the dumps; the module-level
         # flag is set after the write so later tests skip it.
         self.__write_dumps()
         WROTE_DUMP = True
     self.docset = Docset(DOCSET_NAME)
Example #8
0
class TestDocset(unittest.TestCase):
    """Tests for `Docset`: adding dumps, counting and locating documents,
    and persistence of that state across save/reload."""

    def setUp(self):
        # The dump files are written once per test run; the module-level
        # WROTE_DUMP flag records that this has been done.
        global WROTE_DUMP
        if not WROTE_DUMP:
            self.__write_dumps()
            WROTE_DUMP = True
        self.docset = Docset(DOCSET_NAME)

    def tearDown(self):
        # Every test starts from a docset with no dumps.
        self.docset.delete()

    def test_delete(self):
        self.docset.add_dump("d1", dump1)
        self.docset.delete()
        # If this test is failing, then we might need to wait for DDFS garbage
        # collection here, but it seems to work fine for now.
        self.assertFalse("d1" in self.docset.dump_names())

    def test_add_dump(self):
        self.docset.add_dump("d1", dump1)

        # check that it's in list of dumps
        self.assertTrue("d1" in self.docset.dump_names())

        # check accessible over http
        from contextlib import closing
        from disco.util import urlresolve
        import urllib2

        uri = list(self.docset.dump_uris())[0]
        httpuri = urlresolve(uri)
        # closing() makes sure the HTTP response is released even if read()
        # raises.
        with closing(urllib2.urlopen(httpuri)) as response:
            d = response.read()
        self.assertEqual(d, fixtures.warc_file1)

    def test_exists(self):
        self.assertFalse(self.docset.exists())
        self.docset.add_dump("d1", dump1)
        self.assertTrue(self.docset.exists())

    def test_doc_count(self):
        self.assertEqual(0, self.docset.doc_count)
        self.docset.add_dump("d1", dump1)
        self.assertEqual(2, self.docset.doc_count)
        self.docset.add_dump("d2", dump2)
        self.assertEqual(4, self.docset.doc_count)

    def test_persists_doc_count(self):
        self.docset.add_dump("d1", dump1)
        self.docset.save()
        self.assertEqual(2, Docset(DOCSET_NAME).doc_count)
        self.docset.add_dump("d2", dump2)
        self.docset.save()
        # NOTE(review): a 2 second sleep before reloading was once tried here
        # and left commented out; it is not needed for the test to pass.
        docsetp = Docset(DOCSET_NAME)
        self.assertEqual(4, docsetp.doc_count)

    def test_persists_doc_count_on_non_new_load(self):
        self.docset.add_dump("d1", dump1)
        self.docset.save()
        # Extend a docset that was loaded from saved state rather than
        # created fresh.
        docsetp = Docset(DOCSET_NAME)
        docsetp.add_dump("d2", dump2)
        docsetp.save()
        self.assertEqual(4, Docset(DOCSET_NAME).doc_count)

    def test_get_pos(self):
        self.docset.add_dump("d1", dump1)
        self.docset.add_dump("d2", dump2)
        pos1 = self.docset.get_pos("http://example.com")
        self.assertEqual(("d1", 0, 117), pos1)
        pos2 = self.docset.get_pos("http://apple.com")
        self.assertEqual(("d1", 117, 113), pos2)
        pos3 = self.docset.get_pos("http://example.com/m.html")
        self.assertEqual(("d2", 0, 115), pos3)
        pos4 = self.docset.get_pos("http://example.com/z.html")
        self.assertEqual(("d2", 115, 115), pos4)

    def test_persists_get_pos(self):
        self.docset.add_dump("d1", dump1)
        self.docset.add_dump("d2", dump2)
        self.docset.save()
        # All four lookups go through the reloaded instance; querying
        # self.docset would only exercise the in-memory state.
        docsetp = Docset(DOCSET_NAME)
        pos1 = docsetp.get_pos("http://example.com")
        self.assertEqual(("d1", 0, 117), pos1)
        pos2 = docsetp.get_pos("http://apple.com")
        self.assertEqual(("d1", 117, 113), pos2)
        pos3 = docsetp.get_pos("http://example.com/m.html")
        self.assertEqual(("d2", 0, 115), pos3)
        pos4 = docsetp.get_pos("http://example.com/z.html")
        self.assertEqual(("d2", 115, 115), pos4)

    def test_get(self):
        self.docset.add_dump("d1", dump1)
        self.docset.add_dump("d2", dump2)
        self.assertEqual(fixtures.example, self.docset.get("http://example.com"))
        self.assertEqual(fixtures.apple, self.docset.get("http://apple.com"))
        self.assertEqual(fixtures.examplez, self.docset.get("http://example.com/z.html"))

    def test_get_bad_uri_raises(self):
        from freequery.document.docset import DocumentNotFound

        self.docset.add_dump("d1", dump1)
        # assertRaises calls get_pos with the given argument itself, so no
        # lambda wrapper is needed.
        self.assertRaises(DocumentNotFound,
                          self.docset.get_pos, "http://baduri.example.com")

    def __write_dumps(self):
        # Materialise the two WARC fixtures as the dump files on disk.
        with open(dump1, "w+b") as f:
            f.write(fixtures.warc_file1)
        with open(dump2, "w+b") as f:
            f.write(fixtures.warc_file2)