def test_persists_doc_count_on_non_new_load(self):
    """doc_count must accumulate when dumps are added through a re-loaded Docset.

    A second Docset instance (not freshly created) adds a dump; the count
    read by a third instance must include dumps from both writers.
    """
    self.docset.add_dump("d1", dump1)
    self.docset.save()
    # Re-load the docset from persistent storage and extend it.
    docsetp = Docset(DOCSET_NAME)
    docsetp.add_dump("d2", dump2)
    docsetp.save()
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(4, Docset(DOCSET_NAME).doc_count)
def test_persists_get_pos(self):
    """Byte positions must survive a save/re-load round trip.

    BUG FIX: the original queried self.docset (the in-memory instance)
    for pos3/pos4, which never exercised persistence for those URIs.
    All four lookups now go through the re-loaded instance.
    """
    self.docset.add_dump("d1", dump1)
    self.docset.add_dump("d2", dump2)
    self.docset.save()
    # Fresh instance backed only by what was saved.
    docsetp = Docset(DOCSET_NAME)
    self.assertEqual(("d1", 0, 117), docsetp.get_pos("http://example.com"))
    self.assertEqual(("d1", 117, 113), docsetp.get_pos("http://apple.com"))
    self.assertEqual(("d2", 0, 115), docsetp.get_pos("http://example.com/m.html"))
    self.assertEqual(("d2", 115, 115), docsetp.get_pos("http://example.com/z.html"))
class FreequeryClient(object): def __init__(self, spec): if isinstance(spec, Spec): self.spec = spec else: self.spec = Spec(spec) self.discodex_client = DiscodexClient() self.docset = Docset(self.spec.docset_name) def query(self, q, ranked=True): """Return a ranked list of matching `Document` instances.""" qq = Query.parse(q) res = self.discodex_client.query(self.spec.invindex_name, qq) res = map(TfIdf.undemux, res) if not res: return [] pageranks = None if ranked: scoredb = ScoreDB(self.spec.scoredb_path) uris = [e[0] for e in res] pageranks = dict(scoredb.rank(uris)) if not pageranks: raise Exception("no ranks available") docs = [] for uri,scores in res: doc = self.docset.get(uri) doc.score = Score(**scores) if pageranks: doc.score['pagerank'] = pageranks[uri] doc.excerpt = doc.excerpt(qq) docs.append(doc) return docs def index(self, **kwargs): if not self.docset.exists(): print "fq: cannot index `%s': no such docset" % self.spec.docset_name exit(1) job = IndexJob(self.spec, self.discodex_client, **kwargs) job.start() def linkparse(self, **kwargs): job = LinkParseJob(self.spec, **kwargs) job.start() def rank(self, **kwargs): job = PagerankJob(self.spec, **kwargs) job.start()
def __init__(self, spec):
    """Build the client from a `Spec` instance or anything `Spec()` accepts."""
    self.spec = spec if isinstance(spec, Spec) else Spec(spec)
    self.discodex_client = DiscodexClient()
    self.docset = Docset(self.spec.docset_name)
class LinkParseJob(object): def __init__(self, spec, verbose=False, **kwargs): self.spec = spec self.docset = Docset(self.spec.docset_name) self.disco = Disco("disco://localhost") self.verbose = verbose def start(self): from disco import func job = self.disco.new_job( name="linkparse", input=self.docset.dump_uris(), map_reader=docparse, map=linkparse_map, map_output_stream=(func.map_output_stream, func.disco_output_stream, LinkFileOutputStream.disco_output_stream), partitions=0, save=True, ) results = job.wait() self.__tag_results(results) if self.verbose: self.__print_results(results) def __tag_results(self, results): from disco.ddfs import DDFS ddfs = DDFS() results_tag = results[0] ddfs.put(self.docset.ddfs_link_file_tag, list(ddfs.blobs(results_tag))) # remove old, temporary tag ddfs.delete(results_tag) def __print_results(self, results): for doc in result_iterator(results, tempdir=False, reader=doclinksparse): print "%s\n\t%s" % (doc.uri, "\n\t".join(doc.link_uris))
def __init__(self, spec, verbose=False, **kwargs):
    """Prepare a link-parse job over the docset named by `spec`."""
    self.spec = spec
    self.verbose = verbose
    self.docset = Docset(self.spec.docset_name)
    self.disco = Disco("disco://localhost")
def setUp(self):
    """Write the dump fixture files once per process, then open the docset."""
    # WROTE_DUMP is a module-level flag so the fixture files are written
    # only for the first test that runs.
    global WROTE_DUMP
    if not WROTE_DUMP:
        self.__write_dumps()
        WROTE_DUMP = True
    self.docset = Docset(DOCSET_NAME)
class TestDocset(unittest.TestCase):
    """Integration tests for Docset: dump management, doc counts, byte-position
    lookups, document retrieval, and persistence across save/re-load."""

    def setUp(self):
        # Write the WARC dump fixture files only once per process.
        global WROTE_DUMP
        if not WROTE_DUMP:
            self.__write_dumps()
            WROTE_DUMP = True
        self.docset = Docset(DOCSET_NAME)

    def tearDown(self):
        self.docset.delete()

    def test_delete(self):
        self.docset.add_dump("d1", dump1)
        self.docset.delete()
        # If this test is failing, then we might need to wait for DDFS garbage
        # collection here, but it seems to work fine for now.
        self.assertFalse("d1" in self.docset.dump_names())

    def test_add_dump(self):
        self.docset.add_dump("d1", dump1)
        # check that it's in list of dumps
        self.assertTrue("d1" in self.docset.dump_names())
        # check accessible over http
        from disco.util import urlresolve
        import urllib2
        uri = list(self.docset.dump_uris())[0]
        httpuri = urlresolve(uri)
        d = urllib2.urlopen(httpuri).read()
        self.assertEqual(d, fixtures.warc_file1)

    def test_exists(self):
        self.assertFalse(self.docset.exists())
        self.docset.add_dump("d1", dump1)
        self.assertTrue(self.docset.exists())

    def test_doc_count(self):
        self.assertEqual(0, self.docset.doc_count)
        self.docset.add_dump("d1", dump1)
        self.assertEqual(2, self.docset.doc_count)
        self.docset.add_dump("d2", dump2)
        self.assertEqual(4, self.docset.doc_count)

    def test_persists_doc_count(self):
        self.docset.add_dump("d1", dump1)
        self.docset.save()
        self.assertEqual(2, Docset(DOCSET_NAME).doc_count)
        self.docset.add_dump("d2", dump2)
        self.docset.save()
        docsetp = Docset(DOCSET_NAME)
        self.assertEqual(4, docsetp.doc_count)

    def test_persists_doc_count_on_non_new_load(self):
        # A re-loaded (non-new) Docset adds a dump; the count must accumulate.
        self.docset.add_dump("d1", dump1)
        self.docset.save()
        docsetp = Docset(DOCSET_NAME)
        docsetp.add_dump("d2", dump2)
        docsetp.save()
        self.assertEqual(4, Docset(DOCSET_NAME).doc_count)

    def test_get_pos(self):
        self.docset.add_dump("d1", dump1)
        self.docset.add_dump("d2", dump2)
        self.assertEqual(("d1", 0, 117), self.docset.get_pos("http://example.com"))
        self.assertEqual(("d1", 117, 113), self.docset.get_pos("http://apple.com"))
        self.assertEqual(("d2", 0, 115), self.docset.get_pos("http://example.com/m.html"))
        self.assertEqual(("d2", 115, 115), self.docset.get_pos("http://example.com/z.html"))

    def test_persists_get_pos(self):
        self.docset.add_dump("d1", dump1)
        self.docset.add_dump("d2", dump2)
        self.docset.save()
        # BUG FIX: all four lookups go through the re-loaded instance; the
        # original queried self.docset for the "d2" URIs, which did not
        # exercise persistence at all.
        docsetp = Docset(DOCSET_NAME)
        self.assertEqual(("d1", 0, 117), docsetp.get_pos("http://example.com"))
        self.assertEqual(("d1", 117, 113), docsetp.get_pos("http://apple.com"))
        self.assertEqual(("d2", 0, 115), docsetp.get_pos("http://example.com/m.html"))
        self.assertEqual(("d2", 115, 115), docsetp.get_pos("http://example.com/z.html"))

    def test_get(self):
        self.docset.add_dump("d1", dump1)
        self.docset.add_dump("d2", dump2)
        self.assertEqual(fixtures.example, self.docset.get("http://example.com"))
        self.assertEqual(fixtures.apple, self.docset.get("http://apple.com"))
        self.assertEqual(fixtures.examplez, self.docset.get("http://example.com/z.html"))

    def test_get_bad_uri_raises(self):
        from freequery.document.docset import DocumentNotFound
        self.docset.add_dump("d1", dump1)
        self.assertRaises(DocumentNotFound,
                          lambda: self.docset.get_pos("http://baduri.example.com"))

    def __write_dumps(self):
        # Materialize the in-memory WARC fixtures as files for add_dump().
        with open(dump1, "w+b") as f:
            f.write(fixtures.warc_file1)
        with open(dump2, "w+b") as f:
            f.write(fixtures.warc_file2)