def parse(self, response):
    # Debug output: current page and queue size.
    print(self.queue.current)
    print(len(self.queue.store))
    if self.queue.can_visit:
        self.queue.visit(response.url)
        # Pad closing tags so get_text() keeps adjacent words separated.
        soup = BeautifulSoup(
            response.css('body').get().replace('>', '> '), 'lxml')
        # Strip non-content tags before extracting the plain text.
        for tag in soup(['script', 'style', 'noscript']):
            tag.extract()
        result = soup.get_text()

        item = WikipediaItem()
        item['title'] = response.css('title::text').get()
        item['url'] = response.url
        item['content'] = result
        yield item

        new_urls = [
            response.urljoin(url)
            for url in response.css('a::attr(href)').getall()
        ]
        self.queue.extend(new_urls)

        while len(self.queue) > 0 and self.queue.can_visit:
            url = self.queue.pop()
            if url is not None and url not in self.queue.visited:
                yield scrapy.Request(url, callback=self.parse)
                if not self.queue.can_visit:
                    break
            else:
                break
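# The parse() above leans on an external crawl-queue helper exposing
# .current, .store, .can_visit, .visit(), .extend(), .pop() and .visited,
# none of which are shown. A minimal sketch of what such a helper could
# look like -- every name and the page-budget idea are inferred from the
# calls above, not taken from the original project:
class VisitQueue:
    def __init__(self, limit=1000):
        self.store = []        # URLs waiting to be crawled
        self.visited = set()   # URLs already handed to visit()
        self.current = None    # URL most recently visited
        self.limit = limit     # hypothetical page budget

    @property
    def can_visit(self):
        # Stop scheduling once the page budget is exhausted (assumption).
        return len(self.visited) < self.limit

    def visit(self, url):
        self.visited.add(url)
        self.current = url

    def extend(self, urls):
        # Queue only URLs that have not been visited yet.
        self.store.extend(u for u in urls if u not in self.visited)

    def pop(self):
        # Returning None on an empty store matches the None check above.
        return self.store.pop() if self.store else None

    def __len__(self):
        return len(self.store)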
def parse_wikipedia_page(self, response):
    item = WikipediaItem()
    item['url'] = response.url
    # Trim the trailing Wikipedia suffix from the <title> text.
    item['title'] = response.css('title::text').extract_first()[:-11]
    return item
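# Every snippet in this collection fills a project-specific WikipediaItem
# whose declaration is not shown, and the field sets differ from project
# to project. A minimal declaration covering the fields used by the two
# functions above would be (assumption, not the original item class):
import scrapy

class WikipediaItem(scrapy.Item):
    title = scrapy.Field()    # page <title> or first heading
    url = scrapy.Field()      # canonical page URL
    content = scrapy.Field()  # extracted body text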
def parse(self, response):
    # The // axis picks up the <li> entries regardless of intermediate
    # child elements under the category listing.
    titles = scrapy.Selector(response).xpath('//div[@id="mw-pages"]//li')
    for title in titles:
        item = WikipediaItem()
        url = title.xpath("a/@href").extract()
        item['title'] = title.xpath("a/text()").extract()
        item['url'] = url[0]
        yield item
def parse(self, response):
    titles = Selector(response).xpath('//div[@id="mw-pages"]//li')
    for title in titles:
        item = WikipediaItem()
        url = title.xpath("a/@href").extract()
        if url:
            item["title"] = title.xpath("a/text()").extract()
            item["url"] = urljoin("http://en.wikipedia.org", url[0])
            yield item
def parse(self, response):
    sel = Selector(response)
    titles = sel.xpath('//tr[@style="vertical-align: top;"]//li')
    items = []
    for title in titles:
        item = WikipediaItem()
        item["title"] = title.xpath("a/text()").extract()
        item["url"] = title.xpath("a/@href").extract()
        items.append(item)
    return items
def parse(self, response):
    item = WikipediaItem()
    item['title'] = response.xpath(
        '//*[@id="firstHeading"]/text()').extract_first()

    # Parent link shown at the top of the article body.
    item['parent'] = dict()
    item['parent']['link'] = self.__join_url(response.xpath(
        '//*[@id="mw-content-text"]/div/div[1]/a/@href').extract_first())
    item['parent']['title'] = response.xpath(
        '//*[@id="mw-content-text"]/div/div[1]/a/@title').extract_first()

    # Every <p> preceding the table of contents forms the introduction.
    item['introduction'] = response.xpath(
        '//div[@class="mw-parser-output"]/div[@id="toc"]'
        '/preceding-sibling::p').extract()

    item['categories'] = list()
    categories = response.xpath('//div[@class="mw-normal-catlinks"]/ul/li/a')
    for category in categories:
        link = dict()
        link['name'] = category.xpath('./text()').extract_first()
        link['link'] = self.__join_url(category.xpath('./@href').extract_first())
        item['categories'].append(link)

    item['languages'] = list()
    languages = response.xpath('//li[contains(@class,"interlanguage-link")]/a')
    for language in languages:
        link = dict()
        link['name'] = language.xpath('./text()').extract_first()
        link['link'] = self.__join_url(language.xpath('./@href').extract_first())
        item['languages'].append(link)

    # --- Parse content area ---
    item['content'] = content = dict()
    self.__parse_techniques(content, response)    # techniques
    self.__parse_software(content, response)      # software
    self.__parse_legal_issues(content, response)  # legal issues
    self.__parse_prevent(content, response)       # prevent scraping
    self.__parse_see_also(content, response)      # see also
    self.__parse_referenes(content, response)     # references
    # --- End content area ---

    yield item
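# The snippet above calls a private self.__join_url() helper that is not
# shown. Judging by its call sites it resolves a possibly-relative href
# against the wiki host; a plausible sketch, where the base URL is an
# assumption rather than the original value:
from urllib.parse import urljoin

def __join_url(self, href):
    # Return an absolute URL, tolerating a missing href.
    return urljoin('https://en.wikipedia.org', href) if href else None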
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    titles = hxs.select('//tr[@style="vertical-align: top;"]//li')
    items = []
    for title in titles:
        item = WikipediaItem()
        url = title.select("a/@href").extract()
        item["title"] = title.select("a/text()").extract()
        # extract() returns a list; resolve the first href against the site root.
        item["url"] = urljoin("http://en.wikipedia.org", url[0])
        items.append(item)
    return items
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    titles = hxs.select('//div[@id="mw-pages"]//li')
    items = []
    for title in titles:
        item = WikipediaItem()
        url = title.select("a/@href").extract()
        if url:
            item["title"] = title.select("a/text()").extract()
            item["url"] = urljoin("http://en.wikipedia.org", url[0])
            items.append(item)
    return items
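# HtmlXPathSelector and .select(), used in the two functions above, are
# pre-1.0 Scrapy APIs. On current Scrapy the same extraction is written
# against the response directly; a modern equivalent of the function
# above as a sketch:
from urllib.parse import urljoin

def parse(self, response):
    for title in response.xpath('//div[@id="mw-pages"]//li'):
        url = title.xpath('a/@href').get()
        if url:
            item = WikipediaItem()
            item['title'] = title.xpath('a/text()').getall()
            item['url'] = urljoin('http://en.wikipedia.org', url)
            yield item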
def parse_item(self, response):
    xpath_title = 'string(//h1[@id="firstHeading"])'
    xpath_content = '//div[@id="bodyContent"]'
    item = WikipediaItem()
    title = response.xpath(xpath_title).extract_first()
    content = response.xpath(xpath_content).extract_first()
    item['url'] = response.url
    item['title'] = title
    item['content'] = content
    yield item
def parse_page(self, response: Response):
    try:
        div = response.xpath('//div[@class="mw-parser-output"]')
        # Yield one item per paragraph of the article body.
        for element in div.xpath('p'):
            item = WikipediaItem()
            contents = element.xpath('string()').get()
            content = contents.encode('utf-8')
            item['topic'] = response.url
            item['text'] = content
            yield item
    except Exception as e:
        print(e)
def parse(self, response):
    # Keep only hrefs that are not fragment anchors.
    links = [link for link in response.xpath(self.body_link_selector).extract()
             if link[0] != '#']

    item = WikipediaItem()
    item['title'] = response.css(self.header_selector).extract_first()
    item['url'] = response.url
    # First body paragraph, stripped of markup and truncated to 255 chars.
    item['snippet'] = BeautifulSoup(
        response.xpath('//div[@id="mw-content-text"]/p[1]').extract_first(),
        "lxml").text[:255] + "..."
    item['links'] = links
    yield item

    self.visited_urls.add(response.url)
    for link in links:
        next_url = response.urljoin(link)
        if self.allowed_re.match(next_url) and next_url not in self.visited_urls:
            yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    allLinks = soup.select('p a[href]')
    # Follow every link found inside body paragraphs.
    for next_page in allLinks:
        next_url = 'http://vi.wikipedia.org' + next_page['href']
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=False)

    item = WikipediaItem()
    links = []
    for link in allLinks:
        # Keep only article links (/wiki/...), skipping namespaced pages.
        if link['href'].startswith('/wiki/') and ":" not in link['href']:
            links.append(link['title'])
    cnt = Counter(links)
    item['links'] = cnt
    item['title'] = soup.find("h1", {"id": "firstHeading"}).string
    yield item
def parse(self, response):
    item = WikipediaItem()
    soup = BeautifulSoup(response.body, "lxml")
    item['url'] = response.url
    item['name'] = soup.find("h1", {"id": "firstHeading"}).string
    item['description'] = BeautifulSoup(
        response.xpath('//div[@id="mw-content-text"]/p[1]').extract_first(),
        "lxml").text[:255] + "..."
    # Absolute, non-anchor links that pass the allowed-URL filter.
    item['links'] = [
        y for y in [
            response.urljoin(x)
            for x in response.xpath(self.body_link_selector).extract()
            if x[0] != "#"
        ]
        if self.allowed_re.match(y)
    ]
    yield item

    self.visited_urls.add(response.url)
    print(len(self.visited_urls))
    for link in item['links']:
        if link not in self.visited_urls:
            yield Request(link, callback=self.parse)
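# Both link-following variants above (this one and the earlier snippet
# that yields a 'snippet' field) assume spider-level state that is never
# shown: a link selector, a header selector, a compiled URL filter and a
# visited-URL set. A sketch of the enclosing spider, where every attribute
# value is a guess rather than the original definition:
import re
import scrapy

class WikipediaSpider(scrapy.Spider):
    name = 'wikipedia'
    start_urls = ['https://en.wikipedia.org/wiki/Web_scraping']

    # Assumed values -- the originals are defined elsewhere in the project.
    body_link_selector = '//div[@id="mw-content-text"]//a/@href'
    header_selector = 'h1#firstHeading::text'
    allowed_re = re.compile(r'https?://en\.wikipedia\.org/wiki/[^:]+$')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.visited_urls = set()  # de-duplicates requests across parses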