Example #1
    def parse(self, response):

        # Extract current revisions
        for revision in response.css('ul#pagehistory li'):

            comment = revision.css('span.comment').extract_first()
            date = revision.css('a.mw-changeslist-date::text').extract_first()
            rvid = revision.css(
                'a.mw-changeslist-date::attr(href)').extract_first()
            ip = revision.css('a.mw-anonuserlink bdi::text').extract_first()
            user = revision.css('a.mw-userlink bdi::text').extract_first()
            size = revision.css('span.history-size::text').extract_first()
            tags = revision.css('span.mw-tag-marker::text').extract()

            item = WikiItem(
                comment=comment,
                date=date,
                rvid=rvid,
                ip=ip,
                user=user,
                size=size,
                tags=tags,
            )

            yield item

        # Get next revision history page
        next_page = response.css('a.mw-nextlink::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            # Be kind and treat wikimedia with care
            # time.sleep(1)
            yield scrapy.Request(next_page, callback=self.parse)
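
The spiders in these examples all rely on a project-specific WikiItem that is not shown. A minimal sketch of the item Example #1 appears to expect, with field names inferred from the spider above (the real items.py may define more fields):

import scrapy


class WikiItem(scrapy.Item):
    # Fields inferred from the revision-history spider above; this is an
    # assumption, not the original project's definition.
    comment = scrapy.Field()
    date = scrapy.Field()
    rvid = scrapy.Field()
    ip = scrapy.Field()
    user = scrapy.Field()
    size = scrapy.Field()
    tags = scrapy.Field()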
Example #2
    def parse(self, response):
        rows = response.xpath('//div[@id="mw-content-text"]/div/table/tr')
        patterns = [
            './td[1]/i/a/text()', './td[1]/i/b/a/text()',
            './td[1]/i/span[2]//text()', './td[1]/i/b/span/text()'
        ]
        for row in rows:

            for pattern in patterns:
                film = row.xpath(pattern).extract()
                # film = row.xpath(pattern).extract_first()
                film = self.verify(film)
                if film:
                    break
            year = row.xpath('./td[2]/a/text()').extract()
            year = self.verify(year)
            awards = row.xpath('./td[3]/text()').extract()
            awards = self.verify(awards)
            nominations = row.xpath('./td[4]/text()').extract()
            nominations = self.verify(nominations)

            item = WikiItem()
            item['film'] = film
            item['year'] = year
            item['awards'] = awards
            item['nominations'] = nominations
            yield item
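
Example #2 calls a self.verify helper that is not included in the snippet. A minimal sketch, assuming it simply collapses the extracted list to the first non-empty, stripped string (or None when nothing matched):

    # Hypothetical verify() helper; the original project's implementation is
    # not shown above.
    def verify(self, values):
        for value in values:
            value = value.strip()
            if value:
                return value
        return None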
Example #3
    def parse_wikipedia_page(self, response):
        item = WikiItem()
        soup = BeautifulSoup(response.body)
        item['url'] = response.url
        item['title'] = soup.find('h1', {'id': 'firstHeading'}).string
        item['desc'] = soup.find('div', {'id': 'mw-content-text'}).find('p')

        # Collect the unique /wiki/ links from the description paragraph
        unique_links = set()
        for link in soup.find('div', {
                'id': 'mw-content-text'
        }).find('p').findAll('a', attrs={'href': re.compile('^/wiki/')}):
            unique_links.add('http://en.wikipedia.org{}'.format(
                link.get('href')))
        item['links'] = list(unique_links)

        # Get footer information from the page
        item['footer'] = soup.find('ul', {
            'id': 'footer-info'
        }).find('li', {
            'id': 'footer-info-lastmod'
        }).string

        return item
Example #4
    def parse(self, response):
        """Parse the wikipedia page of a historical figure"""

        # Extract the figure's image, birth and death date
        info_box = Selector(text=response.css('.infobox').extract_first())
        image_url = info_box.xpath('//a[@class="image"]//@src').extract_first()
        if image_url is not None:
            image_url = "https:" + image_url
        death_date = info_box.xpath(
            '//*[@class="dday deathdate"]//text()').extract_first()
        death_date_raw = info_box.xpath(
            "//th[contains(text(), 'Died')]/following-sibling::td/text()"
        ).extract_first()
        birth_date = info_box.xpath(
            '//*[@class="bday"]//text()').extract_first()
        birth_date_raw = info_box.xpath(
            "//th[contains(text(), 'Born')]/following-sibling::td/text()"
        ).extract_first()

        # Create the wiki item
        wiki_item = WikiItem()
        wiki_item["image_url"] = image_url
        wiki_item["death_date"] = death_date
        wiki_item["death_date_raw"] = death_date_raw
        wiki_item["birth_date"] = birth_date
        wiki_item["birth_date_raw"] = birth_date_raw
        wiki_item["curid"] = response.meta['curid']

        yield wiki_item
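
The callback in Example #4 reads a curid from response.meta. A minimal sketch of a request that would supply it, assuming the ids come from a self.curids list (the id source and URL pattern are illustrative assumptions):

    # Hypothetical request builder; not part of the original project.
    def start_requests(self):
        for curid in self.curids:
            url = "https://en.wikipedia.org/?curid={}".format(curid)
            yield scrapy.Request(url, callback=self.parse, meta={"curid": curid})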
Example #5
	def parse(self, response):  # first method in the spider file
		# Find all the table rows
		rows = response.xpath('//*[@id="mw-content-text"]/div/table/tbody/tr')

		# The movie title could be of different styles so we need to provide all the possibilities.
		patterns = ['./td[1]/i/a/text()', './td[1]/i/b/a/text()',
								'./td[1]/i/span[2]//text()', './td[1]/i/b/span/text()']
		for row in rows:
			# extract() will return a Python list, extract_first() will return the first element in the list
			# If you know the first element is what you want, you can use extract_first()
			for pattern in patterns:
				film = row.xpath(pattern).extract_first()
				if film:
					break
			# If the movie title is missing, then we just skip it.
			if not film:
				continue
			# Relative xpath for all the other columns
			year = row.xpath('./td[2]/a/text()').extract_first()
			awards = row.xpath('./td[3]/text()').extract_first()
			nominations = row.xpath('./td[4]/text()').extract_first().strip()

			# Initialize a new WikiItem instance for each movie.
			item = WikiItem()
			item['film'] = film
			item['year'] = year
			item['awards'] = awards
			item['nominations'] = nominations
			yield item
Example #6
    def parse_wiki_page(self, response):
        #print("Parsed: ", response.url)

        item = WikiItem()

        item['url'] = response.url

        item['title'] = BeautifulSoup(
            response.xpath('//h1[@id="firstHeading"]').extract_first(),
            "lxml").text

        item['info'] = BeautifulSoup(
            response.xpath('//div[@id="mw-content-text"]/*/p[1]').
            extract_first(), "lxml").text[:255] + "..."

        #print("Links: ", list(map(lambda link: link.url,
        #                   self.link_extractor.extract_links(response))))

        page_urls = set(
            map(lambda link: link.url,
                self.link_extractor.extract_links(response)))
        page_urls.discard(response.url)
        item['out_urls'] = page_urls

        return item
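
Examples #6 and #14 both reference a self.link_extractor that is never defined in the snippets. A minimal sketch, assuming Scrapy's built-in LinkExtractor restricted to article links (the allow pattern is an assumption):

import scrapy
from scrapy.linkextractors import LinkExtractor


class WikiSpider(scrapy.Spider):
    name = "wiki"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Only follow /wiki/ article links so the crawl stays on Wikipedia.
        self.link_extractor = LinkExtractor(allow=r"/wiki/")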
Example #7
    def createItem(self, response):
        item = WikiItem()

        # init Fields for correct sort
        item['uid'] = ""

        # URL from crawled Site (used for generatedUID -> elastic)
        m = re.search('(http[s]?:\/\/)?([^\/\s]+)(.*)', response.url)
        if m:
            relativeUrl = m.group(3)
            item['url'] = "https://de.wikipedia.org" + relativeUrl
        else:
            # Fall back to the crawled URL as-is (the regex should normally match).
            item['url'] = response.url

        responseSelector = Selector(response)

        # Plugin for easy HTML parsing
        soup = BeautifulSoup(responseSelector.extract(), 'html.parser')

        item['pageTitle'] = soup.find('title').text
        item['text'] = ""

        for p_tag in soup.findAll('p'):
            item['text'] = item['text'] + p_tag.text.replace(
                "\t", " ").replace("\r", " ").replace("\n", " ").replace(
                    "  ", " ").strip()

        # HTML Content of parsed Component
        item['html'] = responseSelector.extract()

        # Generated UID which is used as UID for Elastic, so every Item is Unique
        item['uid'] = self.generateUID(item, 'utf-8')
        return item
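
Example #7 ends by calling self.generateUID(item, 'utf-8'), which is not shown. A minimal sketch, assuming the UID is simply a hash of the crawled URL so repeated crawls keep the same Elasticsearch id:

    # Hypothetical generateUID(); the original implementation is not shown.
    def generateUID(self, item, encoding):
        import hashlib
        return hashlib.sha1(item['url'].encode(encoding)).hexdigest()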
Example #8
    def parse(self, response):
        # Find all the table rows
        #response object passes all the website info to the script
        # rows = response.xpath('//*[@id="mw-content-text"]/div/table/thead/tbody/tr')#[1:]
        # rows = response.xpath('//*[@id="mw-content-text"]/div/table/tbody/tr')[1:]
        rows = response.xpath('//*[@id="constituents"]/tbody/tr')[1:]

        for row in rows:
            # Relative xpath for all the other columns
            symbol = row.xpath('./td[1]/a/text()').extract_first()
            name = row.xpath('./td[2]/a/text()').extract_first()
            sector = row.xpath('./td[4]/text()').extract_first()
            sub_industry = row.xpath('./td[5]/text()').extract_first()
            hq = row.xpath('./td[6]/a/text()').extract_first()
            dt_add = row.xpath('./td[7]/text()').extract_first()
            founded = row.xpath('./td[9]/text()').extract_first().strip()

            item = WikiItem()
            item['symbol'] = symbol
            item['name'] = name
            item['sector'] = sector
            item['sub_industry'] = sub_industry
            item['hq'] = hq
            item['dt_add'] = dt_add
            item['founded'] = founded
            yield item
Example #9
    def parse(self, response):
        # Find all the table rows
        rows = response.xpath(
            '//*[@id="mw-content-text"]/div/table/tbody/tr')[1:]

        # The movie title could be of different styles so we need to provide all the possibilities.
        for row in rows:

            film = ''.join(row.xpath('./td[1]//text()').extract())
            # film = row.xpath('./td[1]//text()').extract_first()

            # Relative xpath for all the other columns
            year = int(row.xpath('./td[2]/a/text()').extract_first())
            awards = row.xpath('./td[3]/text()').extract_first()
            nominations = row.xpath('./td[4]/text()').extract_first().strip()
            is_bestpicture = bool(row.xpath('./@style'))

            # Initialize a new WikiItem instance for each movie.
            item = WikiItem()
            item['film'] = film
            item['year'] = year
            item['awards'] = awards
            item['nominations'] = nominations
            item['is_bestpicture'] = is_bestpicture

            yield item
Example #10
    def parse_wiki(self, response):
        def hana_upload(cursor, data):
            url = data[0].replace("'", "''")
            title = data[1].replace("'", "''")
            text = data[2].replace("'", "''")
            sql = f"insert into \"SYSTEM\".\"WIKI\" (TITLE, TEXT, URL) VALUES ('{title}','{text}','{url}')"
            cursor.execute(sql)

        def text_cleaner(value):
            value = ' '.join(value)
            value = value.replace('\n', '')
            value = unicodedata.normalize("NFKD", value)
            value = re.sub(r' , ', ', ', value)
            value = re.sub(r' \( ', ' (', value)
            value = re.sub(r' \) ', ') ', value)
            value = re.sub(r' \)', ') ', value)
            value = re.sub(r'\[\d.*\]', ' ', value)
            value = re.sub(r' +', ' ', value)
            return value.strip()

        print(f'Found a page: {response.url}')

        item = WikiItem()
        body = BeautifulSoup(response.body)

        item['url'] = response.url
        item['title'] = body.find("h1", {"id": "firstHeading"}).string

        # get the first paragraph
        strings = []
        try:
            for node in response.xpath('//*[@id="mw-content-text"]/div/p'):
                text = text_cleaner(node.xpath('string()').extract())
                if len(text):
                    strings.append(text)
        except Exception as error:
            strings.append(str(error))

        item['text'] = ' '.join(strings)

        data = [item['url'], item['title'], item['text']]
        # don't upload empty or broken data
        if data[0] is not None and data[1] is not None and data[2] is not None:
            hana_upload(cursor, data)
            # print(data[2])

        # load new pages
        base_url = self.base_url
        if response.url.startswith(base_url):
            links = response.xpath("//a/@href").extract()
            regex = re.compile(r'^/wiki/.*')
            selected_links = list(filter(regex.search, links))
            for link in selected_links:
                if ':' not in link:
                    # print(link)
                    absolute_next_page_url = base_url + link
                    yield Request(absolute_next_page_url)

        # parse_wiki is a generator (it yields Requests above), so the item
        # must be yielded rather than returned for Scrapy to receive it.
        yield item
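
The hana_upload helper in Example #10 (and #17) builds the INSERT by string formatting and escapes single quotes by hand. A safer sketch using DB-API placeholders, assuming a PEP 249 cursor with qmark ("?") parameters such as the one hdbcli provides:

        # Parameterized variant of hana_upload; a sketch, not the project's code.
        def hana_upload(cursor, data):
            url, title, text = data
            sql = 'INSERT INTO "SYSTEM"."WIKI" (TITLE, TEXT, URL) VALUES (?, ?, ?)'
            cursor.execute(sql, (title, text, url))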
Example #11
 def matchCuisine(self, cuisine, item):
     countBuffer = 0
     for word in map(str.lower, self.cuisineNamesList):
         cuisine_lower = cuisine.lower()
         if word in cuisine_lower:
             return WikiItem(cuisine=word, foodItem=item)
         countBuffer += 1
         if countBuffer >= self.cuisineNameCount - 1:
             print('###### Not a predefined cuisine:' + cuisine_lower + ':')
Example #12
	def parse_wikipedia_page(self, response):
		print('->', response.url)
		item = WikiItem()
		soup = BeautifulSoup(response.body)
		item['url'] = response.url
		item['title'] = soup.find('h1', {'id':'firstHeading'}).string
		item['desc'] = soup.find('div', {'id':'mw-content-text'}).find('p')

		links = set()
		for link in soup.find('div', {'id':'mw-content-text'}).find('p').findAll('a', attrs={'href': re.compile('^/wiki/')}):
			links.add('http://en.wikipedia.org{}'.format(link.get('href')))

		item['links'] = list(links)
		return item
Example #13
    def parse_wiki_page(self, response):
        #print("Parsed: ", response.url)

        item = WikiItem()

        item['url'] = response.url

        item['title'] = BeautifulSoup(
            response.xpath('//h1[@id="firstHeading"]').extract_first(),
            "lxml").text

        item['info'] = BeautifulSoup(
            response.xpath(
                '//div[@id="mw-content-text"]/*/p[1]').extract_first(),
            "lxml").text

        item['index'] = self.wiki_url_dict[response.url]

        return item
Example #14
    def parse(self, response):
        logger.debug("Parsing: {}".format(response.url))
        # ['url', 'text', 'fragment', 'nofollow']
        outlinks = [
            response.urljoin(link.url)
            for link in self.link_extractor.extract_links(response)[:100]
        ]
        logger.debug("Outlinks count ({}): {}".format(response.url,
                                                      len(outlinks)))

        item = WikiItem()
        item['url'] = response.url
        item['title'] = response.xpath(self.title_xpath).extract_first()
        item['snippet'] = response.xpath(
            self.snippet_xpath).extract_first()[:255] + "..."
        item['outlinks'] = outlinks
        yield item

        for link in outlinks:
            yield scrapy.Request(link, callback=self.parse)
Example #15
    def save(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        print(jsonresponse['query']['pages'])
        word = response.meta['word']
        pages = jsonresponse['query']['pages']
        for page in pages:
            item = WikiItem()
            item['page_id'] = page
            item['keyword'] = word
            try:
                item['title'] = pages[page]['title']
            except KeyError:
                print("NO TITLE")
                item['title'] = ''

            try:
                item['content'] = pages[page]['extract']
            except KeyError:
                print("NO CONTENT")
                item['content'] = ''

            yield item
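
The save callback in Example #15 expects the JSON produced by the MediaWiki action=query&prop=extracts endpoint plus a word value in response.meta. A minimal sketch of a request that would feed it (the word list and exact query parameters are assumptions, not the project's code):

    # Hypothetical request builder for the save() callback above.
    def start_requests(self):
        from urllib.parse import quote
        api = ("https://en.wikipedia.org/w/api.php"
               "?action=query&prop=extracts&explaintext=1&format=json&titles={}")
        for word in ["Python (programming language)"]:
            yield scrapy.Request(api.format(quote(word)),
                                 callback=self.save,
                                 meta={"word": word})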
Example #16
 def parse(self, response):
     pagina = response.url.split('/')[-1]
     carpeta = "pags_html"
     if not os.path.exists(carpeta):
         os.makedirs(carpeta)
     nombre_archivo = carpeta + '/pag_%s.html' % pagina.replace(':', '_')
     with open(nombre_archivo, 'wb') as f:
         f.write(response.body)

     self.log('Saved file %s' % nombre_archivo)
     page_info = WikiItem()
     page_info['url'] = response.url
     page_info['ranking'] = 1
     page_info['palabras'] = ""
     page_info['enlaces'] = response.css('a[href^="http"]').xpath('@href').extract()
     page_info['ruta'] = nombre_archivo
     for url in page_info['enlaces']:
         if self.cont < 30:
             self.cont = self.cont + 1
             print('Processing page number ' + str(self.cont))
             yield scrapy.Request(url=url, callback=self.parse)

     yield page_info
Example #17
File: wiki.py Project: maxneuds/twm
    def parse_wiki(self, response):
        def hana_upload(cursor, data):
            cat = data[0].replace("'", "''")
            title = data[1].replace("'", "''")
            url = data[2].replace("'", "''")
            text = data[3].replace("'", "''")
            sql = f"insert into \"SYSTEM\".\"WIKI\" (CAT, TITLE, URL, TEXT) VALUES ('{cat}','{title}','{url}','{text}')"
            cursor.execute(sql)

        def text_cleaner(value):
            value = ' '.join(value)
            value = value.replace('\n', '')
            value = unicodedata.normalize("NFKD", value)
            value = re.sub(r' , ', ', ', value)
            value = re.sub(r' \( ', ' (', value)
            value = re.sub(r' \) ', ') ', value)
            value = re.sub(r' \)', ') ', value)
            value = re.sub(r'\[\d.*\]', ' ', value)
            value = re.sub(r' +', ' ', value)
            return value.strip()

        print(f'Found a page: {response.url}')

        base_url = self.base_url
        category_url = self.category_url
        article_url = self.article_url

        # if category then crawl more pages
        if response.url.startswith(category_url):
            print(f'Crawl category: {response.url}')
            # all links
            # links = response.xpath("//a/@href").extract()
            # category links
            links = response.xpath(
                "//div[@class='mw-category-generated']//a/@href").extract()
            regex = re.compile(r'^/wiki/.*')
            selected_links = list(filter(regex.search, links))
            for link in selected_links:
                absolute_next_page_url = base_url + link
                # print(absolute_next_page_url)
                yield Request(absolute_next_page_url)
        # elif article, fetch the page
        elif response.url.startswith(article_url):
            print(f'Crawl article: {response.url}')
            item = WikiItem()
            body = BeautifulSoup(response.body)

            item['cat'] = self.category
            item['url'] = response.url
            item['title'] = body.find("h1", {"id": "firstHeading"}).string

            # get the first paragraph
            strings = []
            try:
                for node in response.xpath('//*[@id="mw-content-text"]/div/p'):
                    text = text_cleaner(node.xpath('string()').extract())
                    if len(text):
                        strings.append(text)
            except Exception as error:
                strings.append(str(error))

            item['text'] = ' '.join(strings)

            data = [item['cat'], item['title'], item['url'], item['text']]

            # don't upload empty or broken data
            if None not in data:
                global cursor
                hana_upload(cursor, data)
                print(f' -> Upload: {data[0]} > {data[1]} > {data[2]}')
                # return for scrapy
                yield item

        # else don't do anything
        else:
            print(f'Page is useless: {response.url}')
            pass
Example #18
 def createItem(self, response):
     item = WikiItem()
     soup = BeautifulSoup(response.extract(), 'html.parser')
     item['title'] = soup.h1.text
     item['content'] = soup.get_text()
     return item
Example #19
    def parse(self, response):
        item = WikiItem()
        title = response.xpath(
            '//h1[@id="firstHeading"]/text()').extract_first()
        item['title'] = title
        item['url'] = response.url
        # tr_list = response.xpath('//table[@class="infobox vcard"]/tr')
        tr_list = response.css('.infobox tr')
        image = tr_list.xpath('//a[@class="image"]/img/@src').extract_first()
        if image is not None:
            item['image'] = "https:" + image

        r_part = re.compile(r'\[\d.\]|\[\d\]')

        # the info box table on the right-hand side of the page
        info_box = []
        for tr in tr_list:
            th = tr.xpath('./th[@scope="row"]//text()').extract_first()
            if th is not None:
                td = re.sub(r_part, "",
                            "".join(tr.xpath('./td//text()').extract()))
                info_box.append({'key': th, 'value': stripTagSimple(td)})
        print(info_box)
        # print(title)

        pic = []
        thumb_tright = response.xpath(
            '//div[@class="thumb tright"]/div[@class="thumbinner"]')
        for p in thumb_tright:
            if p.xpath('./a/img/@src').extract_first() is not None:
                img = 'https:' + p.xpath('./a/img/@src').extract_first()
                img_desc = re.sub(
                    r_part, "", "".join(
                        p.xpath(
                            './div[@class="thumbcaption"]//text()').extract()))
                pic.append({'url': img, 'img_desc': stripTagSimple(img_desc)})
        # print(pic)
        item['pic'] = pic

        html_content = response.xpath(
            '//div[@id="mw-content-text"]').extract_first()
        soup = BeautifulSoup(html_content, 'html.parser')
        # remove the table-of-contents node
        catalog = soup.find('div', class_="toc")
        if catalog is not None:
            soup.find('div', class_="toc").decompose()
        # remove the references node
        ref = soup.find('ol', class_="references")
        if ref is not None:
            soup.find('ol', class_="references").decompose()

        # ps holds every paragraph in the article body
        div = soup.find(name='div', class_='mw-parser-output')
        ps = div.find_all('p', recursive=False)  # only direct children
        index = 0
        for p in ps:
            if p.get_text() == '':
                break
            index += 1
        summary = {}
        s_index = 0
        while s_index < index:
            summary[f'{s_index}'] = stripTagSimple(ps[s_index].get_text())
            s_index += 1
        print(summary)

        start = re.compile(r'<p>', re.DOTALL)
        search_result = start.search(soup.decode('utf-8'))
        if search_result is None:
            search_result = re.compile(r'<h2>',
                                       re.DOTALL).search(soup.decode('utf-8'))
        content_text = collections.OrderedDict()
        if search_result is not None:
            start_node = soup.decode('utf-8')[search_result.start():]
            lists = start_node.split('<h2>')

            i = 1
            while i < len(lists):
                lists[i] = '<h2>' + lists[i]
                final_soup = BeautifulSoup(lists[i], 'html.parser')
                para_title = final_soup.find(
                    'span', class_="mw-headline").get_text().strip()
                if para_title == "外部链接" or "参考" in para_title:  # skip "External links" / reference-type sections
                    i += 1
                    continue
                para_contents = final_soup.find_all(['p', 'li', 'table'])
                texts = []
                for para in para_contents:
                    if para.name == 'table':
                        texts.append(para.prettify())
                        continue
                    texts.append(stripTagSimple(para.get_text('', True)))
                content_text[para_title.replace('.', '点')] = texts
                i += 1
            catlinks = response.xpath(
                '//div[@class="catlinks"]/div[@id="mw-normal-catlinks"]//li')

            tag = {}
            j = 0
            for link in catlinks:
                href = 'https://zh.wikipedia.org' + link.xpath(
                    './a/@href').extract_first()
                cat = link.xpath('./a/text()').extract_first()
                tag[f'{j}'] = cat
                j += 1

            detail = {
                'title': title,
                'summary': summary,
                'infobox': info_box,
                'content': content_text,
                'category': tag,
            }
            item['detail'] = detail
            now_time = datetime.datetime.fromtimestamp(time.time())
            item['updateAt'] = now_time
            return item
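
Example #19 relies on a stripTagSimple helper that is not included in the snippet. A minimal sketch, assuming it only drops leftover footnote markers and collapses whitespace:

import re


# Hypothetical stand-in for stripTagSimple(); the real helper is not shown.
def stripTagSimple(text):
    text = re.sub(r'\[\d+\]', '', text)       # drop footnote markers such as [1]
    return re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace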
Example #20
    def parse(self, response):
        # if self.count > 10:
        #     return
        # else:
        #     self.count += 1
        json = response.json()
        json_pages = json['query']['pages']

        # the extracts payload may be split across continuation responses (redundant check)
        if 'continue' in json:
            print_debug_info('Error:', 'extracts need continue')

        # page_id is a dynamic JSON key, so iterate over the dict to read it
        for page_id, page_content in json_pages.items():

            # a page_id of "-1" means the entry does not exist (redundant check)
            if page_id != '-1':
                title = json_pages[page_id]['title']
                wiki = wikiapi.Wikipedia('zh', json, page_id)
                page = wiki.page(title)

                # only consider entries in the Main/Article namespace; see "https://en.wikipedia.org/wiki/Wikipedia:Namespace" (redundant check)
                if page.namespace == 0:

                    # this name is an alias; one extra request is needed to fetch the real title (redundant check)
                    if page_content['extract'] == '':
                        real_title = page.displaytitle
                        print_debug_info('Warning:', 'extracts is empty')
                        yield scrapy.Request(get_url_by_name(real_title))

                    # the entry is a complete entity and can be stored (normally only this branch runs)
                    else:
                        print_debug_info('Success:', page.namespace, page_id,
                                         title, response)

                        linked_items = []

                        # queue the entries linked from this one for crawling; this needs at least one extra request
                        query = link_query(page_id)
                        link_dict = query['link_dict']
                        redirect_dict = query['redirect_dict']
                        for link_title in link_dict:
                            link_page_id = link_dict[link_title]
                            linked_items.append({
                                'page_id': link_page_id,
                                'title': link_title
                            })
                            yield scrapy.Request(
                                get_url_by_page_id(link_page_id))

                        sections = {
                            'title': 'summary',
                            'text': page.summary,
                            'linked_words': make_linked_words(page.summary, link_dict, redirect_dict),
                            'sections': make_sections(page.sections, link_dict, redirect_dict),
                        }

                        yield WikiItem(page_id=int(page_id),
                                       title=title,
                                       sections=sections,
                                       linked_items=linked_items)

                # namespace is not 0 (redundant check)
                else:
                    print_debug_info('Warning:', 'ns is not 0')

            # page_id is -1: the entry does not exist and has no redirect (redundant check)
            else:
                print_debug_info('Warning', 'page_id is -1')
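
Example #20 also depends on helpers such as get_url_by_page_id and get_url_by_name that are not part of the snippet. A minimal sketch of what they might build, assuming the standard MediaWiki query API that the callback's JSON handling implies:

from urllib.parse import quote

# Hypothetical URL builders; the query parameters are assumptions based on
# the fields the parse() callback above reads from the response.
API = ('https://zh.wikipedia.org/w/api.php'
       '?action=query&prop=extracts&explaintext=1&format=json')


def get_url_by_page_id(page_id):
    return f'{API}&pageids={page_id}'


def get_url_by_name(title):
    return f'{API}&titles={quote(title)}'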