def many_server(request, tempdir):
    """Set up a server via ``do_zeo_server`` and fill its database with pages.

    Inside a single transaction the database receives 2000 pages with a fixed
    text, 1000 pages whose text is one phrase repeated ``int(i ** 0.5)``
    times, and one extra page. The database handle is disconnected afterwards.

    :return: the value returned by ``do_zeo_server`` (the same value that is
        handed to ``zerodb.testing.db``).
    """
    server_sock = do_zeo_server(
        request, tempdir, name="many_server", fsname='many.fs')
    database = zerodb.testing.db(None, server_sock)

    filler_text = "lorem ipsum dolor sit amet" * 2
    with transaction.manager:
        for number in range(2000):
            database.add(Page(title="hello %s" % number, text=filler_text))

        for number in range(1000):
            # Variable length while keeping number of terms the same
            # will cause variable scores
            repeats = int(number ** 0.5)
            database.add(Page(
                title="hello %s" % number,
                text="this is something we're looking for" * repeats))

        database.add(Page(title="extra page", text="something else is here"))

    database.disconnect()
    return server_sock
def extract_from_db_to_file_system(min_page_len=500):
    """Dump pages from the database into numbered Markdown files.

    Pages are fetched in batches of 1000 through ``Page.get_pages`` and each
    one is written to ``<config.data_base_path>/temp/<counter>.md`` as the
    page title, a newline, then the page content. Files are written as UTF-8
    text. A progress line is printed for every 100th page.

    :param min_page_len: forwarded unchanged to ``Page.get_pages``
        (presumably a minimum page length filter -- confirm against
        ``Page.get_pages``).
    :return: None
    """
    folder = "%s/temp" % config.data_base_path
    # exist_ok replaces the racy "check, then create" sequence.
    os.makedirs(folder, exist_ok=True)

    p = Page()
    batch_size = 1000
    offset = 0
    counter = 1

    batch_result = p.get_pages(offset, batch_size, min_page_len)
    while batch_result:
        for r in batch_result:
            file_path = "%s/%s.md" % (folder, counter)
            # The context manager closes the file even if a write fails, and
            # the explicit encoding lets title and content both be written as
            # text (writing encoded bytes to a text file raises TypeError).
            with open(file_path, "w", encoding="utf8") as f:
                f.write(r.page_title)
                f.write("\n")
                f.write(r.page_content)
            if counter % 100 == 0:
                print("extract %d th page now" % counter)
            counter += 1
        offset += batch_size
        batch_result = p.get_pages(offset, batch_size, min_page_len)
def new_book_page(bid, p_name, p_content):
    """Create a new page for a book and commit it through ``db_session``.

    :param bid: value stored as the page's ``book_id``.
    :param p_name: value stored as the page's ``page_name``.
    :param p_content: value stored as the page's ``page_content``.
    :return: None
    """
    # Take the timestamp once so a freshly created page has
    # created == updated instead of two values a few microseconds apart.
    now = datetime.now()
    nbpage = Page(
        page_name=p_name,
        page_content=p_content,
        created=now,
        updated=now,
        book_id=bid,
    )
    db_session.add(nbpage)
    db_session.commit()
def test_reindex(db):
    """Check that explicit reindex calls refresh the text index.

    After each text change the old term must no longer match and the new
    term must match, for reindexing by document id, by object, and for a
    list of objects passed to ``db.reindex``.
    """

    def count(query):
        # Number of Page results for an arbitrary query expression.
        return len(db[Page].query(query))

    def text_hits(term):
        # Number of pages whose "text" field contains *term*.
        return count(Contains("text", term))

    with transaction.manager:
        page = Page(
            title="hello",
            text="Quick0 brown lazy fox jumps over lorem ipsum dolor sit amet",
        )
        docid = db.add(page)
    assert text_hits("quick0") == 1

    # DbModel, by ID
    with transaction.manager:
        page.text = "Quick1 brown lazy fox jumps over well, you know"
        db[Page].reindex(docid)
    assert text_hits("quick0") == 0
    assert text_hits("quick1") == 1

    # DbModel, by obj
    with transaction.manager:
        page.text = "quick2 brown lazy fox jumps over well, you know"
        db[Page].reindex(page)
    assert text_hits("quick1") == 0
    assert text_hits("quick2") == 1

    # DB, by obj
    # NOTE(review): this section goes through db[Page].reindex exactly like
    # the previous one -- confirm whether db.reindex(page) was intended.
    with transaction.manager:
        page.text = "quick3 brown lazy fox jumps over well, you know"
        db[Page].reindex(page)
    assert text_hits("quick2") == 0
    assert text_hits("quick3") == 1

    # DB, multiple objects
    with transaction.manager:
        page2 = Page(
            title="hello",
            text="Quick4 brown lazy fox jumps over lorem ipsum dolor sit amet",
        )
        db.add(page2)

    with transaction.manager:
        page.text = "quick5 brown lazy fox jumps over well, you know"
        page2.text = "quick5 brown lazy fox jumps over well, you know"
        db.reindex([page, page2])

    stale_terms = Contains("text", "quick3") | Contains("text", "quick4")
    assert count(stale_terms) == 0
    assert text_hits("quick5") == 2
def test_add(db):
    """Check that adding one page stays under 22 storage downloads.

    The download counter of the storage is sampled before and after the
    page is added; the page is removed again at the end.
    """
    storage = db._storage

    with transaction.manager:
        pre_commit_count = storage._debug_download_count
        page = Page(
            title="hello",
            text="Quick brown lazy fox jumps over lorem ipsum dolor sit amet",
        )
        db.add(page)
    post_commit_count = storage._debug_download_count

    requests_made = post_commit_count - pre_commit_count
    print("Number of requests:", requests_made)
    assert requests_made < 22

    # Clean up so the page does not leak into other tests.
    with transaction.manager:
        db.remove(page)
def test_all_uid(db):
    """Check that loaded Page objects always expose ``_p_uid``.

    Objects are obtained through ``all()``, through single-uid lookup, and
    through a lookup with a list of uids, with the caches cleared before each
    access path. A final pass activates and modifies objects, then commits.
    """
    # Test for https://gist.github.com/micxjo/a097698b33fc4669b0b4

    def drop_caches():
        # Clear in-memory and on-disk caches
        db._storage._cache.clear()
        db._connection._cache.full_sweep()

    fresh_page = Page(title="Test page", text="Hello world")
    with transaction.manager:
        db.add(fresh_page)
    del fresh_page

    drop_caches()

    # Access path 1: iterate over every object.
    for entry in db[Page].all():
        assert hasattr(entry, "_p_uid")
    del entry

    drop_caches()

    # Access path 2: look each object up by its uid.
    for uid in db[Page].all_uids():
        loaded = db[Page][uid]
        assert hasattr(loaded, "_p_uid")
    del loaded

    drop_caches()

    # Access path 3: look up a list of (at most ten) uids in one call.
    first_uids = list(islice(db[Page].all_uids(), 10))
    for loaded in db[Page][first_uids]:
        assert hasattr(loaded, "_p_uid")

    # Activation, modification and commit must all keep the attribute.
    everything = list(db[Page].all())
    head = everything[0]
    head._p_activate()
    assert hasattr(head, "_p_uid")

    everything[1].text += " xxx"
    transaction.commit()

    for loaded in everything:
        assert hasattr(loaded, "_p_uid")
# page info
# Walk every page link of the current issue, fetch its text and create or
# (with args.update) refresh the matching Page row.
# NOTE(review): the enclosing scope is not visible in this chunk; the nesting
# below was reconstructed from the statement logic -- confirm against the
# full file.
page_number = 1
for page_link in driver.find_elements_by_css_selector(
    "div#content div.prevws a"
):
    page_url = page_link.get_attribute("href")
    page = session.query(Page).filter(Page.url == page_url).first()
    page_text = get_page_text(
        journal_title, issue_date, issue_text, page_number
    )

    # A child element with class "treffer" marks the link as a hit.
    try:
        page_link.find_element_by_class_name("treffer")
        hit = True
    except NoSuchElementException:
        hit = False

    if page:
        # Existing rows are only touched when an update was requested.
        if args.update:
            page.text = page_text
    else:
        page = Page(issue.issue_id, page_number, page_text, hit, page_url)
        session.add(page)

    logger.debug(
        f"Page info extracted. Number: {page_number}, page url: {page_url} and page text: {page_text[:10] if page_text else None}"
    )
    page_number += 1

session.commit()
session.close()
driver.quit()
# timedelta.seconds is only the seconds component (it wraps at one day);
# total_seconds() reports the full elapsed time.
logger.info(f"Completed. Processing took {int((datetime.now() - t1).total_seconds())}s.")
def test_auto_reindex(db):
    """Check that attribute changes are reindexed without explicit reindex calls.

    The test changes page attributes inside transactions and then queries the
    index to see whether old terms disappeared and new terms appeared. It
    also patches ``zerodb.db.DbModel.reindex_one`` to count how often it is
    called, and toggles the behaviour with ``db.enableAutoReindex``.
    """
    with transaction.manager:
        page = Page(title="hello", text="autoreindex0, test whether to work")
        db.add(page)
    assert len(db[Page].query(Contains("text", "autoreindex0"))) == 1

    # Changing the text of a stored page: the old term must stop matching.
    with transaction.manager:
        page.text = "autoreindex1, test whether to work"
    assert len(db[Page].query(Contains("text", "autoreindex0"))) == 0
    assert len(db[Page].query(Contains("text", "autoreindex1"))) == 1

    # Two stored pages changed in the same transaction.
    with transaction.manager:
        page2 = Page(title="hello", text="autoreindex2, test whether to work")
        db.add(page2)
    with transaction.manager:
        page.text = "autoreindex3, test whether to work"
        page2.text = "autoreindex3, test whether to work"
    assert len(db[Page].query(Contains("text", "autoreindex1") | Contains("text", "autoreindex2"))) == 0
    assert len(db[Page].query(Contains("text", "autoreindex3"))) == 2

    # Several changes to one object in one transaction: exactly one
    # reindex_one call is expected.
    with mock.patch("zerodb.db.DbModel.reindex_one") as reindex_mock:
        with transaction.manager:
            page.text = "autoreindex3, test whether to work1"
            page.text = "autoreindex3, test whether to work2"
            page.text = "autoreindex3, test whether to work3"
        assert reindex_mock.call_count == 1

    # With auto reindex switched off the index is expected to keep the old
    # terms even though the text changed.
    db.enableAutoReindex(False)
    with transaction.manager:
        page.text = "autoreindex4, test whether to work"
    assert len(db[Page].query(Contains("text", "autoreindex3"))) == 2
    assert len(db[Page].query(Contains("text", "autoreindex4"))) == 0
    db.enableAutoReindex(True)

    # page3 is never passed to db.add here, so neither title may be indexed.
    with transaction.manager:
        # should not throw ModelException
        page3 = Page(title="helloworld", text="autoreindex5, test whether to work")
        page3.title = "helloworld1"
    assert len(db[Page].query(Eq("title", "helloworld"))) == 0
    assert len(db[Page].query(Eq("title", "helloworld1"))) == 0

    # Attribute changed before db.add: no reindex_one call is expected.
    with mock.patch("zerodb.db.DbModel.reindex_one") as reindex_mock:
        with transaction.manager:
            # should not reindex
            page3 = Page(title="helloworld", text="autoreindex5, test whether to work")
            page3.title = "helloworld1"
            db.add(page3)
        assert reindex_mock.call_count == 0

    # Attributes changed after db.add within the same transaction.
    with transaction.manager:
        # should reindex
        page3 = Page(title="helloworld", text="autoreindex6, test whether to work")
        db.add(page3)
        page3.title = "helloworld1"
        page3.text = "autoreindex7, test whether to work"
    assert len(db[Page].query(Eq("title", "helloworld"))) == 0
    # Two matches: the page added in the patched block above plus this one.
    assert len(db[Page].query(Eq("title", "helloworld1"))) == 2
    assert len(db[Page].query(Contains("text", "autoreindex6"))) == 0
    assert len(db[Page].query(Contains("text", "autoreindex7"))) == 1

    # Same scenario under the mock: two attribute changes after db.add are
    # expected to produce a single reindex_one call.
    with mock.patch("zerodb.db.DbModel.reindex_one") as reindex_mock:
        with transaction.manager:
            # should reindex
            page3 = Page(title="helloworld", text="autoreindex6, test whether to work")
            db.add(page3)
            page3.title = "helloworld1"
            page3.text = "autoreindex7, test whether to work"
        assert reindex_mock.call_count == 1