Example #1
    def parse(self, response):
        print(self.queue.current)
        print(len(self.queue.store))
        if self.queue.can_visit:
            self.queue.visit(response.url)
            # Add a space after every ">" so get_text() keeps text from
            # adjacent tags separated, then strip non-content tags.
            soup = BeautifulSoup(
                response.css('body').get().replace('>', '> '), 'lxml')
            for tag in soup(['script', 'style', 'noscript']):
                tag.extract()
            result = soup.get_text()

            item = WikipediaItem()
            item['title'] = response.css('title::text').get()
            item['url'] = response.url
            item['content'] = result
            yield item

            # Queue every outgoing link, then schedule requests until the
            # queue is empty or the visit budget runs out.
            new_urls = [
                response.urljoin(url)
                for url in response.css('a::attr(href)').getall()
            ]
            self.queue.extend(new_urls)
            while len(self.queue) > 0 and self.queue.can_visit:
                url = self.queue.pop()
                if url is not None and url not in self.queue.visited:
                    yield scrapy.Request(url, callback=self.parse)
                    if not self.queue.can_visit:
                        break
                else:
                    break
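Example #1 drives the crawl through a custom self.queue helper (current, store, can_visit, visit, extend, pop, visited) whose definition is not shown. Below is a minimal sketch of what such a visit-limited queue might look like, assuming a simple page-count budget; the names mirror the calls above, but the real class may differ.

from collections import deque


class VisitQueue:
    """Hypothetical URL queue matching the attributes used in Example #1."""

    def __init__(self, max_visits=100):
        self.store = deque()   # URLs waiting to be crawled
        self.visited = set()   # URLs already handed to the spider
        self.current = None    # URL most recently visited
        self.max_visits = max_visits

    @property
    def can_visit(self):
        # Stop scheduling once the visit budget is exhausted.
        return len(self.visited) < self.max_visits

    def visit(self, url):
        self.current = url
        self.visited.add(url)

    def extend(self, urls):
        self.store.extend(urls)

    def pop(self):
        return self.store.popleft() if self.store else None

    def __len__(self):
        return len(self.store)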
Example #2
    def parse_wikipedia_page(self, response):
        item = WikipediaItem()
        soup = BeautifulSoup(response.body, 'lxml')  # parsed but otherwise unused here

        item['url'] = response.url
        # Drop the trailing " - Wikipedia" suffix from the <title> text.
        item['title'] = response.css('title::text').extract_first()[:-11]

        return item
Example #3
    def parse(self, response):
        # The "//" step selects the <li> elements regardless of intermediate children.
        titles = scrapy.Selector(response).xpath('//div[@id="mw-pages"]//li')
        for title in titles:
            item = WikipediaItem()
            url = title.xpath("a/@href").extract()
            item['title'] = title.xpath("a/text()").extract()
            item['url'] = url[0]
            yield item
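The snippets above and below all fill a WikipediaItem, which lives in the project's items.py and is not shown here. A minimal sketch declaring the fields these examples assign (field names are taken from the examples; a real project only needs the subset it actually uses):

import scrapy


class WikipediaItem(scrapy.Item):
    # Fields assigned across the examples on this page; Examples #6 and #10
    # add further fields (parent, introduction, categories, languages,
    # topic, text) in exactly the same way.
    title = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    snippet = scrapy.Field()
    links = scrapy.Field()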
Example #4
    def parse(self, response):
        titles = Selector(response).xpath('//div[@id="mw-pages"]//li')

        for title in titles:
            item = WikipediaItem()
            url = title.xpath("a/@href").extract()
            if url:
                item["title"] = title.xpath("a/text()").extract()
                item["url"] = urljoin("http://en.wikipedia.org", url[0])
                yield item
Example #5
    def parse(self, response):
        sel = Selector(response)
        titles = sel.xpath('//tr[@style="vertical-align: top;"]//li')
        items = []
        for title in titles:
            item = WikipediaItem()
            item["title"] = title.xpath("a/text()").extract()
            item["url"] = title.xpath("a/@href").extract()
            items.append(item)
        return items
Example #6
    def parse(self, response):
        item = WikipediaItem()

        item['title'] = response.xpath(
            '//*[@id="firstHeading"]/text()').extract_first()

        item['parent'] = dict()
        item['parent']['link'] = self.__join_url(response.xpath(
            '//*[@id="mw-content-text"]/div/div[1]/a/@href').extract_first())
        item['parent']['title'] = response.xpath(
            '//*[@id="mw-content-text"]/div/div[1]/a/@title').extract_first()

        item['introduction'] = response.xpath(
            '//div[@class="mw-parser-output"]/div[@id="toc"]/preceding-sibling::p').extract()

        item['categories'] = list()
        categories = response.xpath(
            '//div[@class="mw-normal-catlinks"]/ul/li/a')
        for category in categories:
            link = dict()
            link['name'] = category.xpath('./text()').extract_first()
            link['link'] = self.__join_url(category.xpath('./@href').extract_first())
            item['categories'].append(link)

        item['languages'] = list()
        languages = response.xpath(
            '//li[contains(@class,"interlanguage-link")]/a')
        for language in languages:
            link = dict()
            link['name'] = language.xpath('./text()').extract_first()
            link['link'] = self.__join_url(
                language.xpath('./@href').extract_first())
            item['languages'].append(link)

        """
        Parse Content Area
        """
        item['content'] = content = dict()
        # parse techniques
        self.__parse_techniques(content, response)
        # parse software
        self.__parse_software(content, response)
        # parse legal issues
        self.__parse_legal_issues(content, response)
        # parse prevent scraping
        self.__parse_prevent(content, response)
        # parse see also
        self.__parse_see_also(content, response)
        # parse references
        self.__parse_referenes(content, response)
        """
        End Content Area
        """

        yield item
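Example #6 calls private helpers (__join_url and the __parse_* methods) that are defined elsewhere in its spider. A plausible sketch of the URL-joining helper is shown below, assuming it simply resolves a relative href against the wiki host; the __parse_* section parsers are omitted and the actual implementation may differ.

import scrapy
from urllib.parse import urljoin


class WikipediaSpider(scrapy.Spider):
    name = "wikipedia"

    def __join_url(self, href):
        # Hypothetical implementation: resolve a relative href such as
        # "/wiki/Web_scraping" against the wiki host, and pass None
        # through so a missing link does not raise.
        if href is None:
            return None
        return urljoin("https://en.wikipedia.org/", href)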
Example #7
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//tr[@style="vertical-align: top;"]//li')
        items = []
        for title in titles:
            item = WikipediaItem()
            url = title.select("a/@href").extract()
            item["title"] = title.select("a/text()").extract()
            # urljoin expects a single href string, so take the first match.
            item["url"] = urljoin("http://en.wikipedia.org", url[0])
            items.append(item)
        return items
Example #8
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//div[@id="mw-pages"]//li')
        items = []
        for title in titles:
            item = WikipediaItem()
            url = title.select("a/@href").extract()
            if url:
                item["title"] = title.select("a/text()").extract()
                item["url"] = urljoin("http://en.wikipedia.org", url[0])
                items.append(item)
        return items
Example #9
    def parse_item(self, response):
        xpath_title = 'string(//h1[@id="firstHeading"])'
        xpath_content = '//div[@id="bodyContent"]'

        item = WikipediaItem()

        title = response.xpath(xpath_title).extract_first()
        content = response.xpath(xpath_content).extract_first()

        item['url'] = response.url
        item['title'] = title
        item['content'] = content

        yield item
Example #10
    def parse_page(self, response: Response):
        try:
            div = response.xpath('//div[@class="mw-parser-output"]')
            for element in div.xpath('p'):
                item = WikipediaItem()

                contents = element.xpath('string()').get()
                content = contents.encode('utf-8')

                item['topic'] = response
                item['text'] = content

                yield item
        except Exception as e:
            print(e)
Example #11
    def parse(self, response):
        # Collect every href in the article body, skipping in-page "#" anchors.
        links = [
            link for link in response.xpath(self.body_link_selector).extract()
            if link[0] != '#'
        ]
        item = WikipediaItem()
        item['title'] = response.css(self.header_selector).extract_first()
        item['url'] = response.url
        item['snippet'] = BeautifulSoup(
            response.xpath('//div[@id="mw-content-text"]/p[1]').extract_first(),
            "lxml").text[:255] + "..."
        item['links'] = links
        yield item

        self.visited_urls.add(response.url)

        for link in links:
            next_url = response.urljoin(link)
            if self.allowed_re.match(next_url) and next_url not in self.visited_urls:
                yield scrapy.Request(next_url, callback=self.parse)
Example #12
    def parse(self, response):
        soup = BeautifulSoup(response.body, 'lxml')
        allLinks = soup.select('p a[href]')

        # Follow every link that appears inside a paragraph.
        for next_page in allLinks:
            next_url = 'http://vi.wikipedia.org' + next_page['href']
            yield scrapy.Request(next_url,
                                 callback=self.parse,
                                 dont_filter=False)

        # Build a single item for this page: count internal article links
        # (hrefs under /wiki/ without a namespace colon) and record the title.
        item = WikipediaItem()
        links = []
        for link in allLinks:
            if link['href'].startswith(
                    '/wiki/') and ":" not in link['href']:
                links.append(link['title'])
        item['links'] = Counter(links)
        item['title'] = soup.find("h1", {"id": "firstHeading"}).string

        yield item
Example #13
    def parse(self, response):
        item = WikipediaItem()
        soup = BeautifulSoup(response.body, "lxml")

        item['url'] = response.url
        item['name'] = soup.find("h1", {"id": "firstHeading"}).string
        item['description'] = BeautifulSoup(
            response.xpath('//div[@id="mw-content-text"]/p[1]').extract_first(),
            "lxml").text[:255] + "..."
        item['links'] = [
            y for y in [
                response.urljoin(x)
                for x in response.xpath(self.body_link_selector).extract()
                if x[0] != "#"
            ] if self.allowed_re.match(y)
        ]
        yield item

        self.visited_urls.add(response.url)
        print(len(self.visited_urls))

        for link in item['links']:
            if link not in self.visited_urls:
                yield Request(link, callback=self.parse)
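Examples #11 and #13 rely on spider attributes that are not shown: body_link_selector, header_selector, allowed_re, and visited_urls. A minimal scaffold showing one way those attributes could be declared; the selector strings, URL pattern, and start URL below are illustrative assumptions, not values taken from the original spiders.

import re

import scrapy


class WikipediaSpider(scrapy.Spider):
    name = "wikipedia"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = ["https://en.wikipedia.org/wiki/Web_scraping"]

    # Assumed values for the attributes referenced in Examples #11 and #13.
    body_link_selector = '//div[@id="mw-content-text"]//a/@href'
    header_selector = "h1#firstHeading::text"
    allowed_re = re.compile(r"https?://en\.wikipedia\.org/wiki/[^:#]+$")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.visited_urls = set()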