def extract_from_db_to_file_system(min_page_len=500):
    """Dump pages from the database into numbered markdown files.

    Pages are fetched in batches of 1000 and written to
    ``<data_base_path>/temp/<counter>.md``, one file per page: the title on
    the first line, then the content.

    :param min_page_len: minimum page length, passed through to
        ``Page.get_pages`` as a filter.
    :return: None
    """
    folder = "%s/temp" % config.data_base_path
    # isdir() already implies existence; the original also tested exists().
    if not os.path.isdir(folder):
        os.makedirs(folder)

    p = Page()
    counter = 1
    batch_size = 1000
    offset = 0

    batch_result = p.get_pages(offset, batch_size, min_page_len)
    while batch_result:
        for r in batch_result:
            file_path = "%s/%s.md" % (folder, counter)
            # Open with an explicit encoding and a context manager. The
            # original wrote page_title.encode("utf8") (bytes) into a
            # text-mode file, which raises TypeError on Python 3.
            with open(file_path, "w", encoding="utf8") as f:
                f.write(r.page_title)
                f.write("\n")
                f.write(r.page_content)
            if counter % 100 == 0:
                print("extract %d th page now" % counter)
            counter += 1
        offset += batch_size
        batch_result = p.get_pages(offset, batch_size, min_page_len)
def __init__(self, website, initial_page):
    """Initialize crawler state for *website*, starting at *initial_page*.

    Also ensures the backing ``Page`` table exists.
    """
    self.website = website
    self.initial_page = initial_page
    # Host part of the site URL, used to keep the crawl on-domain.
    self.netloc = urlsplit(website).netloc
    # Visited-URL map and the frontier of URLs still to fetch.
    self.pages = {}
    self.queue = set()
    Page.create_table()
    self.id = -1  # sentinel until a real next-id is established
def many_server(request, tempdir):
    """Start a ZEO server pre-populated with ~3001 ``Page`` documents.

    2000 fixed-length filler pages, 1000 target pages whose text length
    grows with ``i`` (same terms, varying length, so scores vary), plus
    one extra page. Returns the server socket.
    """
    sock = do_zeo_server(request, tempdir, name="many_server", fsname='many.fs')
    db = zerodb.testing.db(None, sock)
    with transaction.manager:
        filler = "lorem ipsum dolor sit amet" * 2
        for i in range(2000):
            db.add(Page(title="hello %s" % i, text=filler))
        # Variable length while keeping number of terms the same
        # will cause variable scores
        for i in range(1000):
            repeats = int(i ** 0.5)
            db.add(Page(title="hello %s" % i,
                        text="this is something we're looking for" * repeats))
        db.add(Page(title="extra page", text="something else is here"))
    db.disconnect()
    return sock
def worker(domain):
    """Crawl worker: pull URLs from the shared LINKS_QUEUE, fetch each one,
    record its title/h1 to the DB and results.csv, and enqueue newly found
    on-domain links.

    Runs forever — the queue-drain exit condition below is commented out,
    so the loop has no active termination. Relies on module-level globals:
    LINKS_QUEUE, SCANNED_LINKS, BAD_PARTS, locker.
    """
    while True:
        # if LINKS_QUEUE.qsize() == 0:
        #     sleep(10)
        #     if LINKS_QUEUE.qsize() == 0:
        #         break
        #     continue
        url = LINKS_QUEUE.get()
        # Mark as scanned before fetching so other workers skip this URL
        # even if the request below fails.
        SCANNED_LINKS.add(url)
        try:
            with HTMLSession() as session:
                resp = session.get(url)
                assert resp.status_code == 200
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(e, type(e))
            continue
        # Missing <title>/<h1> elements degrade to a placeholder string.
        try:
            page_title = resp.html.xpath('//title')[0].text
        except IndexError:
            page_title = 'Not Found'
        try:
            page_h1 = resp.html.xpath('//h1')[0].text
        except IndexError:
            page_h1 = 'Not Found'
        Page.create(url=url, title=page_title, h1=page_h1)
        print('[OK]', url)
        # locker serializes appends to the shared CSV across workers.
        with locker:
            with open('results.csv', 'a') as f:
                f.write(f'{url}\t{page_title}\t{page_h1}\n')
        # Enqueue unseen, on-domain links (fragment stripped, bad URL parts
        # filtered out).
        for link in resp.html.absolute_links:
            link = link.split('#')[0]
            if domain not in link:
                continue
            if link in SCANNED_LINKS:
                continue
            if any(part in link for part in BAD_PARTS):
                continue
            LINKS_QUEUE.put(link)
def generate_adjacency_matrix(self, drop_static=False):
    """Build an adjacency matrix of crawled pages and write it to data\\matrix.json.

    The output maps page id -> list of ids of pages it links to.

    :param drop_static: when True, only pages whose content type contains
        "text/html" are included and links to excluded pages are dropped.
    """
    # Single filter pass. The original filtered twice with inconsistent
    # substrings ("html" in the list build, "text/html" in the loops);
    # the stricter "text/html" check was the effective one.
    pages = [page for page in Page.select()
             if not drop_static or "text/html" in page.content_type]
    ids = {page.url: int(page.id) for page in pages}
    matrix = {page.id: set() for page in pages}
    for page in pages:
        for link in json.loads(page.links):
            # Skip links whose target was never crawled (or was filtered
            # out). Previously, with drop_static=False, an un-crawled link
            # raised KeyError on ids[link].
            if link in ids:
                matrix[page.id].add(ids[link])
    # JSON has no set type; serialize neighbour sets as lists.
    serializable = {el: list(links) for el, links in matrix.items()}
    with open("data\\matrix.json", "w") as w:
        w.write(json.dumps(serializable))
def new_book_page(bid, p_name, p_content):
    """Create a new page in book *bid* and commit it to the database."""
    fresh_page = Page(
        page_name=p_name,
        page_content=p_content,
        created=datetime.now(),
        updated=datetime.now(),
        book_id=bid,
    )
    db_session.add(fresh_page)
    db_session.commit()
def extract_from_db_to_file_system(number_limit=-1, min_page_len=0):
    """Dump pages from the database into numbered markdown files under temp/.

    :param number_limit: the number limit; -1 means no limit
    :param min_page_len: minimum page length filter passed to Page.get_pages
    :return: None
    """
    folder = "temp"
    if not os.path.isdir(folder):
        os.makedirs(folder)
    p = Page()
    # File names continue from a previous run's offset; the total is used
    # only for the progress readout. NOTE(review): magic numbers inherited
    # from the original — confirm they still match the dataset.
    start = 1741794
    total = 670471
    counter = start
    for r in p.get_pages(number_limit, min_page_len):
        # Progress fraction. The original `print (counter - start)*1.0/total`
        # multiplied print()'s return value (None) on Python 3 -> TypeError.
        print((counter - start) * 1.0 / total)
        file_path = "%s/%d.md" % (folder, counter)
        # Text-mode write with explicit encoding; the original wrote
        # page_title.encode("utf8") (bytes) into a text-mode file.
        with open(file_path, "w", encoding="utf8") as f:
            f.write(r.page_title)
            f.write("\n")
            f.write(r.page_content)
        counter += 1
def test_add(db):
    """Committing a single new Page should need fewer than 22 storage downloads."""
    with transaction.manager:
        downloads_before = db._storage._debug_download_count
        page = Page(
            title="hello",
            text="Quick brown lazy fox jumps over lorem ipsum dolor sit amet",
        )
        db.add(page)
    downloads_after = db._storage._debug_download_count
    print("Number of requests:", downloads_after - downloads_before)
    assert downloads_after - downloads_before < 22
    # Clean up so later tests see an unchanged collection.
    with transaction.manager:
        db.remove(page)
def test_reindex(db):
    """Exercise every reindex entry point: by docid, by object, and in bulk."""

    def count(word):
        # Number of pages whose text contains *word*.
        return len(db[Page].query(Contains("text", word)))

    with transaction.manager:
        page = Page(
            title="hello",
            text="Quick0 brown lazy fox jumps over lorem ipsum dolor sit amet",
        )
        docid = db.add(page)
    assert count("quick0") == 1

    # DbModel, by ID
    with transaction.manager:
        page.text = "Quick1 brown lazy fox jumps over well, you know"
        db[Page].reindex(docid)
    assert count("quick0") == 0
    assert count("quick1") == 1

    # DbModel, by obj
    with transaction.manager:
        page.text = "quick2 brown lazy fox jumps over well, you know"
        db[Page].reindex(page)
    assert count("quick1") == 0
    assert count("quick2") == 1

    # DB, by obj
    with transaction.manager:
        page.text = "quick3 brown lazy fox jumps over well, you know"
        db[Page].reindex(page)
    assert count("quick2") == 0
    assert count("quick3") == 1

    # DB, multiple objects
    with transaction.manager:
        page2 = Page(
            title="hello",
            text="Quick4 brown lazy fox jumps over lorem ipsum dolor sit amet",
        )
        db.add(page2)
    with transaction.manager:
        page.text = "quick5 brown lazy fox jumps over well, you know"
        page2.text = "quick5 brown lazy fox jumps over well, you know"
        db.reindex([page, page2])
    assert len(db[Page].query(
        Contains("text", "quick3") | Contains("text", "quick4"))) == 0
    assert count("quick5") == 2
def load(self):
    """Rebuild crawler state from persisted ``Page`` rows.

    Fills ``self.pages`` with already-visited URLs and ``self.queue`` with
    links that were discovered but never fetched.

    :return: the highest page id seen (at least 1).
    """
    highest_id = 1
    visited = set()
    discovered = set()
    for page in Page.select():
        highest_id = max(highest_id, page.id)
        visited.add(page.url)
        discovered.update(json.loads(page.links))
    # dict.fromkeys defaults every value to None, matching the visited map.
    self.pages = dict.fromkeys(visited)
    self.queue = discovered - visited
    return highest_id
def get_page(self, url):
    """Fetch *url* (if not already visited), persist a Page row for it, and
    enqueue any newly discovered links.

    Non-HTML resources and failed fetches are recorded with an empty link
    list so they are not retried. ``self.id`` is advanced once per created
    row.
    """
    normalized_url = self.normalize(url)
    if normalized_url not in self.pages:
        headers = requests.head(url)
        content_type = headers.headers.get('content-type', '')
        if "text/html" in content_type:
            try:
                page = requests.get(url)
            except Exception as e:
                # Record the failure so the URL is not retried, then bail.
                logging.error(f"Requests get exception: {e}")
                Page.create(id=self.id, url=normalized_url,
                            status=headers.status_code,
                            content_type=content_type,
                            links=json.dumps([]))
                self.id += 1
                return
            logging.debug(f"Got {url} [{page.status_code}]")
            try:
                page_content = get_page_source(url)
            except Exception as e:
                # Fall back to the plain requests body if selenium fails.
                logging.error(f"Got selenium error: [{e}]")
                page_content = page.content
            links = [
                self.normalize(link)
                for link in self.parse_page(page_content)
            ]
            Page.create(id=self.id, url=normalized_url,
                        status=page.status_code,
                        content_type=content_type,
                        links=json.dumps(links))
            self.pages[normalized_url] = None
            for link in links:
                if link not in self.pages:
                    self.queue.add(link)
        else:
            logging.debug(f"Add {url} with content_type: {content_type}")
            # Store an empty JSON *list* — the original stored `{}` here,
            # inconsistent with every other branch, which stores a list.
            Page.create(id=self.id, url=normalized_url,
                        status=headers.status_code,
                        content_type=content_type,
                        links=json.dumps([]))
        self.id += 1
def test_all_uid(db):
    """Objects obtained through every access path must carry a `_p_uid`
    attribute, even after in-memory and on-disk caches are wiped."""
    # Test for https://gist.github.com/micxjo/a097698b33fc4669b0b4
    page = Page(title="Test page", text="Hello world")
    with transaction.manager:
        db.add(page)
    # Drop the only strong reference so the object must come back from storage.
    del page

    # Clear in-memory and on-disk caches
    db._storage._cache.clear()
    db._connection._cache.full_sweep()

    # Path 1: iterate the model directly.
    for item in db[Page].all():
        assert hasattr(item, "_p_uid")
        del item

    db._storage._cache.clear()
    db._connection._cache.full_sweep()

    # Path 2: look up each object by its uid.
    for uid in db[Page].all_uids():
        obj = db[Page][uid]
        assert hasattr(obj, "_p_uid")
        del obj

    db._storage._cache.clear()
    db._connection._cache.full_sweep()

    # Path 3: bulk lookup with a list of uids.
    uids = list(islice(db[Page].all_uids(), 10))
    objs = db[Page][uids]
    for obj in objs:
        assert hasattr(obj, "_p_uid")

    # Path 4: _p_uid must survive explicit activation and a commit that
    # modifies a sibling object.
    objs = list(db[Page].all())
    obj = objs[0]
    obj._p_activate()
    assert hasattr(obj, "_p_uid")
    objs[1].text += " xxx"
    transaction.commit()
    for obj in objs:
        assert hasattr(obj, "_p_uid")
def test_reindex(db):
    """Reindexing via docid, object, and DB-level bulk all update the index."""
    query = db[Page].query

    with transaction.manager:
        doc = Page(
            title="hello",
            text="Quick0 brown lazy fox jumps over lorem ipsum dolor sit amet",
        )
        docid = db.add(doc)
    assert len(query(Contains("text", "quick0"))) == 1

    # DbModel, by ID
    with transaction.manager:
        doc.text = "Quick1 brown lazy fox jumps over well, you know"
        db[Page].reindex(docid)
    assert len(query(Contains("text", "quick0"))) == 0
    assert len(query(Contains("text", "quick1"))) == 1

    # DbModel, by obj
    with transaction.manager:
        doc.text = "quick2 brown lazy fox jumps over well, you know"
        db[Page].reindex(doc)
    assert len(query(Contains("text", "quick1"))) == 0
    assert len(query(Contains("text", "quick2"))) == 1

    # DB, by obj
    with transaction.manager:
        doc.text = "quick3 brown lazy fox jumps over well, you know"
        db[Page].reindex(doc)
    assert len(query(Contains("text", "quick2"))) == 0
    assert len(query(Contains("text", "quick3"))) == 1

    # DB, multiple objects
    with transaction.manager:
        second = Page(
            title="hello",
            text="Quick4 brown lazy fox jumps over lorem ipsum dolor sit amet",
        )
        db.add(second)
    with transaction.manager:
        doc.text = "quick5 brown lazy fox jumps over well, you know"
        second.text = "quick5 brown lazy fox jumps over well, you know"
        db.reindex([doc, second])
    stale = Contains("text", "quick3") | Contains("text", "quick4")
    assert len(query(stale)) == 0
    assert len(query(Contains("text", "quick5"))) == 2
# page info page_number = 1 for page_link in driver.find_elements_by_css_selector( "div#content div.prevws a" ): page_url = page_link.get_attribute("href") page = session.query(Page).filter(Page.url == page_url).first() page_text = get_page_text( journal_title, issue_date, issue_text, page_number ) try: page_link.find_element_by_class_name("treffer") hit = True except NoSuchElementException: hit = False if page: if args.update: page.text = page_text else: page = Page(issue.issue_id, page_number, page_text, hit, page_url) session.add(page) logger.debug( f"Page info extracted. Number: {page_number}, page url: {page_url} and page text: {page_text[:10] if page_text else None}" ) page_number += 1 session.commit() session.close() driver.quit() logger.info(f"Completed. Processing took {(datetime.now() - t1).seconds}s.")
def test_auto_reindex(db):
    """Fields changed inside a transaction are reindexed automatically,
    exactly once per commit, and the feature can be toggled off and on via
    db.enableAutoReindex."""
    with transaction.manager:
        page = Page(title="hello", text="autoreindex0, test whether to work")
        db.add(page)
    assert len(db[Page].query(Contains("text", "autoreindex0"))) == 1

    # Mutating a committed object must move it in the index on commit.
    with transaction.manager:
        page.text = "autoreindex1, test whether to work"
    assert len(db[Page].query(Contains("text", "autoreindex0"))) == 0
    assert len(db[Page].query(Contains("text", "autoreindex1"))) == 1

    # Multiple objects changed in one transaction are all reindexed.
    with transaction.manager:
        page2 = Page(title="hello", text="autoreindex2, test whether to work")
        db.add(page2)
    with transaction.manager:
        page.text = "autoreindex3, test whether to work"
        page2.text = "autoreindex3, test whether to work"
    assert len(db[Page].query(Contains("text", "autoreindex1") | Contains("text", "autoreindex2"))) == 0
    assert len(db[Page].query(Contains("text", "autoreindex3"))) == 2

    # Several mutations of the same object within one transaction collapse
    # into a single reindex_one call.
    with mock.patch("zerodb.db.DbModel.reindex_one") as reindex_mock:
        with transaction.manager:
            page.text = "autoreindex3, test whether to work1"
            page.text = "autoreindex3, test whether to work2"
            page.text = "autoreindex3, test whether to work3"
        assert reindex_mock.call_count == 1

    # With auto-reindex disabled, changes are not reflected in the index.
    db.enableAutoReindex(False)
    with transaction.manager:
        page.text = "autoreindex4, test whether to work"
    assert len(db[Page].query(Contains("text", "autoreindex3"))) == 2
    assert len(db[Page].query(Contains("text", "autoreindex4"))) == 0
    db.enableAutoReindex(True)

    # Mutating a never-added object must not raise (should not throw
    # ModelException) and must not create index entries.
    with transaction.manager:
        page3 = Page(title="helloworld", text="autoreindex5, test whether to work")
        page3.title = "helloworld1"
    assert len(db[Page].query(Eq("title", "helloworld"))) == 0
    assert len(db[Page].query(Eq("title", "helloworld1"))) == 0

    # Mutations made *before* db.add are covered by the add itself — no
    # separate reindex call.
    with mock.patch("zerodb.db.DbModel.reindex_one") as reindex_mock:
        with transaction.manager:
            page3 = Page(title="helloworld", text="autoreindex5, test whether to work")
            page3.title = "helloworld1"
            db.add(page3)
        assert reindex_mock.call_count == 0

    # Mutations made *after* db.add within the same transaction do reindex.
    with transaction.manager:
        page3 = Page(title="helloworld", text="autoreindex6, test whether to work")
        db.add(page3)
        page3.title = "helloworld1"
        page3.text = "autoreindex7, test whether to work"
    assert len(db[Page].query(Eq("title", "helloworld"))) == 0
    assert len(db[Page].query(Eq("title", "helloworld1"))) == 2
    assert len(db[Page].query(Contains("text", "autoreindex6"))) == 0
    assert len(db[Page].query(Contains("text", "autoreindex7"))) == 1
    with mock.patch("zerodb.db.DbModel.reindex_one") as reindex_mock:
        with transaction.manager:
            page3 = Page(title="helloworld", text="autoreindex6, test whether to work")
            db.add(page3)
            page3.title = "helloworld1"
            page3.text = "autoreindex7, test whether to work"
        assert reindex_mock.call_count == 1