# NOTE(review): "self.data = None" below appears to be the truncated tail of a
# class __init__ (presumably WikimediaItem) whose header was lost in this
# chunk -- confirm against the original file before relying on it.
self.data = None


class WikimediaScraper(PageScraper):
    """Scrapes title/url/snippet triples from a Wikimedia Commons search-result page."""

    def extract(self, page):
        """Return a list of WikimediaItem objects, one per search-result row on *page*."""
        titles = page.xpath(
            "//ul[@class='mw-search-results']/li/div[1]/a/@title")
        urls = page.xpath("//ul[@class='mw-search-results']/li/div[1]/a/@href")
        data = page.xpath("//ul[@class='mw-search-results']/li/div[3]/text()")
        items = []
        # zip pairs the three result columns and stops at the shortest list,
        # avoiding the IndexError the old index-based loop raised when the
        # page yielded fewer hrefs/snippets than titles.
        for title, url, snippet in zip(titles, urls, data):
            item = WikimediaItem()
            item.title = title
            item.url = url
            item.data = snippet
            items.append(item)
        return items


# Crawl search-result pages at offsets 20..100 (20 results per page).
start_pages = []
for i in range(1, 6):
    url = ("https://commons.wikimedia.org/w/index.php?title=Special:Search"
           "&limit=20&offset=" + str(i * 20) + "&profile=default&search=water")
    page = Page(url, WikimediaScraper())
    start_pages.append(page)

crawler = XCrawler(start_pages)
crawler.config.output_file_name = "wikimedia_search_results_crawler_output.csv"
crawler.run()
from xcrawler import XCrawler, Page, PageScraper


class Scraper(PageScraper):
    """Extracts the raw string representation of the fetched page."""

    def extract(self, page):
        # str(page) is the idiomatic spelling of page.__str__().
        return str(page)


# Fetch a single LAN address with a cookie preset.
start_page = Page("http://192.168.5.5", Scraper())
start_page.request.cookies = {"theme": "classic"}

crawler = XCrawler([start_page])
# (connect, read) timeouts, in seconds.
crawler.config.request_timeout = (5, 5)
crawler.config.output_file_name = "router_request_example_output.csv"
crawler.run()
# NOTE(review): the two methods below (extract/visit) appear to belong to a
# scraper class -- presumably "class TagsScraper(PageScraper):", which the
# start_pages list further down instantiates -- whose header was lost when
# this chunk was extracted. Confirm against the original file.
def extract(self, page):
    """Build a StackOverflowItem describing a tag-listing page."""
    item = StackOverflowItem()
    item.description = "A web page with tagged questions"
    item.url = page.url
    item.tag = page.xpath("//div[@class='tagged']/a/text()").get(0)
    item.related_tags = page.xpath(
        "//div[@class='module js-gps-related-tags']//div[not(@*)]/a/text()")
    return item

def visit(self, page):
    """Follow the first two question links, scraping each with QuestionScraper."""
    hrefs = page.xpath("//a[@class='question-hyperlink']/@href")[0:2]
    return [Page(url, QuestionScraper()) for url in page.to_urls(hrefs)]


class QuestionScraper(PageScraper):
    """Scrapes the title and vote count from an individual question page."""

    def extract(self, page):
        """Build a StackOverflowItem describing one question-detail page."""
        item = StackOverflowItem()
        item.description = "A web page with question details"
        item.url = page.url
        item.title = page.css_text("h1 a").get(0)
        item.votes = page.css_text(".question .vote-count-post").get(0).strip()
        return item


start_pages = [
    Page("http://stackoverflow.com/tags", TagsScraper()),
]

crawler = XCrawler(start_pages)
crawler.config.output_file_name = "stackoverflow_three_level_crawler_output.csv"
crawler.config.number_of_threads = 3
crawler.run()
def setup_xcrawler():
    """Return an XCrawler preconfigured with the module-level QUERY and SURE_VALUE."""
    crawler = XCrawler()
    crawler.query = QUERY
    crawler.sure_value = SURE_VALUE
    return crawler