# NOTE(review): "self.data = None" below appears to be the truncated tail of a
# class __init__ (presumably WikimediaItem) whose header was lost in this
# chunk -- confirm against the original file before relying on it.
self.data = None


class WikimediaScraper(PageScraper):
    """Scrapes title/url/snippet triples from a Wikimedia Commons search-result page."""

    def extract(self, page):
        """Return a list of WikimediaItem objects, one per search-result row on *page*."""
        titles = page.xpath(
            "//ul[@class='mw-search-results']/li/div[1]/a/@title")
        urls = page.xpath("//ul[@class='mw-search-results']/li/div[1]/a/@href")
        data = page.xpath("//ul[@class='mw-search-results']/li/div[3]/text()")
        items = []
        # zip pairs the three result columns and stops at the shortest list,
        # avoiding the IndexError the old index-based loop raised when the
        # page yielded fewer hrefs/snippets than titles.
        for title, url, snippet in zip(titles, urls, data):
            item = WikimediaItem()
            item.title = title
            item.url = url
            item.data = snippet
            items.append(item)
        return items


# Crawl search-result pages at offsets 20..100 (20 results per page).
start_pages = []
for i in range(1, 6):
    url = ("https://commons.wikimedia.org/w/index.php?title=Special:Search"
           "&limit=20&offset=" + str(i * 20) + "&profile=default&search=water")
    page = Page(url, WikimediaScraper())
    start_pages.append(page)

crawler = XCrawler(start_pages)
crawler.config.output_file_name = "wikimedia_search_results_crawler_output.csv"
crawler.run()
from xcrawler import XCrawler, Page, PageScraper


class Scraper(PageScraper):
    """Extracts the raw string representation of the fetched page."""

    def extract(self, page):
        # str(page) is the idiomatic spelling of page.__str__().
        return str(page)


# Fetch a single LAN address with a cookie preset.
start_page = Page("http://192.168.5.5", Scraper())
start_page.request.cookies = {"theme": "classic"}

crawler = XCrawler([start_page])
# (connect, read) timeouts, in seconds.
crawler.config.request_timeout = (5, 5)
crawler.config.output_file_name = "router_request_example_output.csv"
crawler.run()
# NOTE(review): the two methods below (extract/visit) appear to belong to a
# scraper class -- presumably "class TagsScraper(PageScraper):", which the
# start_pages list further down instantiates -- whose header was lost when
# this chunk was extracted. Confirm against the original file.
def extract(self, page):
    """Build a StackOverflowItem describing a tag-listing page."""
    item = StackOverflowItem()
    item.description = "A web page with tagged questions"
    item.url = page.url
    item.tag = page.xpath("//div[@class='tagged']/a/text()").get(0)
    item.related_tags = page.xpath(
        "//div[@class='module js-gps-related-tags']//div[not(@*)]/a/text()")
    return item

def visit(self, page):
    """Follow the first two question links, scraping each with QuestionScraper."""
    hrefs = page.xpath("//a[@class='question-hyperlink']/@href")[0:2]
    return [Page(url, QuestionScraper()) for url in page.to_urls(hrefs)]


class QuestionScraper(PageScraper):
    """Scrapes the title and vote count from an individual question page."""

    def extract(self, page):
        """Build a StackOverflowItem describing one question-detail page."""
        item = StackOverflowItem()
        item.description = "A web page with question details"
        item.url = page.url
        item.title = page.css_text("h1 a").get(0)
        item.votes = page.css_text(".question .vote-count-post").get(0).strip()
        return item


start_pages = [
    Page("http://stackoverflow.com/tags", TagsScraper()),
]

crawler = XCrawler(start_pages)
crawler.config.output_file_name = "stackoverflow_three_level_crawler_output.csv"
crawler.config.number_of_threads = 3
crawler.run()
def setup_xcrawler():
    """Return an XCrawler preconfigured with the module-level QUERY and SURE_VALUE."""
    crawler = XCrawler()
    crawler.query = QUERY
    crawler.sure_value = SURE_VALUE
    return crawler