def crawl_pastes():
    print("crawling paste archive")
    tree = Downloader.downloadPage(PASTES_ARCHIVE_URL + "/archive")
    if tree is None:
        return
    # Relative paste links sit in the first column of the archive table.
    paste_links = tree.xpath('//table[@class="maintable"]//tbody//tr//td[1]//a/@href')
    start = time.time()
    # Fetch and parse only the links we have not processed before.
    pastes = p.map(DataParser.parse_link, [
        PASTES_ARCHIVE_URL + rel_link
        for rel_link in paste_links
        if rel_link.removeprefix('/') not in processed_links
    ])
    p.starmap(DataManager.saveToFile,
              [(paste, paste.relative_link) for paste in pastes if paste is not None])
    processed_links.extend(link.removeprefix('/') for link in paste_links)
    print(f"took {time.time() - start:.2f}s")
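# crawl_pastes relies on module-level state that is not shown above. A minimal
# sketch of that context, assuming a shared multiprocessing.Pool named `p` and
# an in-memory `processed_links` list; the base URL value is a guess, since
# only the constant's name appears in the source:
import time
from multiprocessing import Pool

PASTES_ARCHIVE_URL = "https://pastebin.com"  # assumed value, not shown in the source
processed_links: list[str] = []  # relative links that were already saved
p = Pool(4)  # worker pool shared by crawl_pastes; the pool size is arbitrary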
def parse_link(url: str) -> Optional[Paste]:
    tree = Downloader.downloadPage(url)
    if tree is None:
        return None
    try:
        date = tree.xpath(DataParser.DATE_SELECTOR)[0]
        username = tree.xpath(DataParser.USERNAME_SELECTOR)[0]
        title = tree.xpath(DataParser.TITLE_SELECTOR)[0]
        content = tree.xpath(DataParser.CONTENT_SELECTOR)[0]
    except IndexError:  # a selector matched nothing on this page
        print("Failed to process url:", url)
        return None
    # TODO: validate fetched results
    return Paste(author=DataParser.norm_author_title(username),
                 title=DataParser.norm_author_title(title),
                 content=DataParser.norm_content(content),
                 datetime=DataParser.strdate_to_arrow(date),
                 relative_link=url.split('/')[-1])
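# parse_link assumes a Paste model and a Downloader helper that are defined
# elsewhere. A minimal sketch of both, under the assumption that Paste is a
# plain dataclass (field names taken from the constructor call above) and that
# downloadPage returns a parsed lxml tree, or None on any HTTP failure:
from dataclasses import dataclass
from typing import Optional

import arrow
import requests
from lxml import html


@dataclass
class Paste:
    author: str
    title: str
    content: str
    datetime: arrow.Arrow
    relative_link: str


class Downloader:
    @staticmethod
    def downloadPage(url: str) -> Optional[html.HtmlElement]:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            return None
        return html.fromstring(response.text)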