Example #1
import time

# Assumes Downloader, DataParser, DataManager, PASTES_ARCHIVE_URL,
# processed_links and the worker pool p are defined at module level.
def crawl_pastes():
    print("doing some work")
    tree = Downloader.downloadPage(PASTES_ARCHIVE_URL + "/archive")
    # Relative links from the first column of the archive table.
    paste_links = tree.xpath(
        '//table[@class="maintable"]//tbody//tr//td[1]//a//@href')

    start = time.time()
    # Parse new pastes in parallel; processed_links stores links
    # without their leading slash, hence the [1:] comparison.
    pastes = p.map(DataParser.parse_link, [
        PASTES_ARCHIVE_URL + paste_rel_link for paste_rel_link in paste_links
        if paste_rel_link[1:] not in processed_links
    ])
    # Persist every paste that parsed successfully.
    p.starmap(DataManager.saveToFile,
              [(paste, paste.relative_link) for paste in pastes
               if paste is not None])

    # Mark all links as seen, including ones that failed to parse.
    processed_links.extend(link.removeprefix('/') for link in paste_links)
    print("took " + str(time.time() - start))
Example #2
    # Requires "from typing import Optional" at module level; the method
    # can return None, so the annotation should say so.
    @staticmethod
    def parse_link(url: str) -> Optional[Paste]:
        tree = Downloader.downloadPage(url)
        if tree is None:
            return None

        # Every selector must match at least one node; indexing an
        # empty XPath result raises IndexError.
        try:
            date = tree.xpath(DataParser.DATE_SELECTOR)[0]
            username = tree.xpath(DataParser.USERNAME_SELECTOR)[0]
            title = tree.xpath(DataParser.TITLE_SELECTOR)[0]
            content = tree.xpath(DataParser.CONTENT_SELECTOR)[0]
        except IndexError:
            print("Failed to process url:", url)
            return None

        # todo: validate fetched results
        return Paste(author=DataParser.norm_author_title(username),
                     title=DataParser.norm_author_title(title),
                     content=DataParser.norm_content(content),
                     datetime=DataParser.strdate_to_arrow(date),
                     link=url.split('/')[-1])
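
For completeness, here is a sketch of the Paste container implied by the constructor call above. The field names come from the example; the dataclass form and the arrow.Arrow type (suggested by strdate_to_arrow) are assumptions:

from dataclasses import dataclass
import arrow

@dataclass
class Paste:
    author: str            # normalized poster name
    title: str             # normalized paste title
    content: str           # normalized paste body
    datetime: arrow.Arrow  # parsed date, per strdate_to_arrow
    link: str              # last path segment of the paste URL

Note that Example #1 reads paste.relative_link rather than link, so that revision of the class may name the attribute differently or expose it as a property.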