def parse(self, response):
    """Parse a 'history' channel list page.

    For each article teaser (div.box_list), build an IfengItem with
    url/title, fetch the article page synchronously and extract its body
    from div#artical_real, then yield the item if self.check() accepts
    the url. Finally follows the #pagenext link for pagination.

    NOTE(review): the blocking urllib.urlopen call defeats Scrapy's async
    downloader; a scrapy.Request with a detail-page callback would be the
    idiomatic fix, but that restructuring is out of scope here.
    """
    soup = BeautifulSoup(response.body, "lxml")
    divs = soup.findAll('div', {'class': 'box_list clearfix'})
    for div in divs:
        item = IfengItem()
        # Robustness fix: any of these lookups can return None on a
        # malformed teaser / article page; previously the resulting
        # AttributeError aborted the whole generator, dropping every
        # remaining item AND the pagination request below.
        try:
            h2 = div.find('h2')
            link = h2.find('a')
            url = link['href']
            item['url'] = url
            title = link['title']
            item['title'] = title
            response2 = urllib.urlopen(url)
            soup2 = BeautifulSoup(response2, "lxml")
            content = soup2.find('div', {'id': 'artical_real'}).get_text()
            item['content'] = content
        except AttributeError as e:
            # Skip this entry but keep processing the rest of the page.
            print(e)
            continue
        item['label'] = 'history'
        if self.check(item['url']):
            yield item
    # //*[@id="pagenext"] — next-page link (pagination).
    next_url = response.xpath(
        "//*[@id='pagenext'] /@href").extract()
    if next_url:
        yield scrapy.Request(next_url[0], callback=self.parse)
def parse(self, response):
    """Parse a 'history' channel list page (con_lis layout).

    For each teaser (div.con_lis.show), build an IfengItem with
    url/title, fetch the article page synchronously and extract its body
    from div#yc_con_txt, then yield the item if self.check() accepts the
    url. Finally follows the #pagenext link for pagination.

    NOTE(review): the blocking urllib.urlopen call defeats Scrapy's async
    downloader; a scrapy.Request with a detail-page callback would be the
    idiomatic fix, but that restructuring is out of scope here.
    """
    soup = BeautifulSoup(response.body, "lxml")
    # /html/body/div[4]/div[1]/div/div/div[1]/a — original locator note.
    divs = soup.findAll('div', {'class': 'con_lis show'})
    for div in divs:
        item = IfengItem()
        # Robustness fix: find() returns None when the expected node is
        # missing; previously the AttributeError aborted the whole
        # generator, dropping every remaining item AND the pagination
        # request below.
        try:
            url = div.find('a')['href']
            title = div.find('h4').get_text()
            item['url'] = url
            item['title'] = title
            response2 = urllib.urlopen(url)
            soup2 = BeautifulSoup(response2, "lxml")
            content = soup2.find('div', {'id': 'yc_con_txt'}).get_text()
            item['content'] = content
        except (AttributeError, TypeError) as e:
            # Skip this entry but keep processing the rest of the page.
            print(e)
            continue
        item['label'] = 'history'
        if self.check(item['url']):
            yield item
    # //*[@id="pagenext"] — next-page link (pagination).
    next_url = response.xpath(
        "//*[@id='pagenext'] /@href").extract()
    if next_url:
        yield scrapy.Request(next_url[0], callback=self.parse)
def parse_news(self, response):
    """Extract title, body and source link from an article page.

    Merges the extracted fields into the dict carried in
    response.meta['data'] and yields it wrapped in an IfengItem.
    """
    carried = response.meta['data']
    headline = response.css(
        'div#artical h1#artical_topic::text').extract_first()
    paragraphs = response.css(
        'div#main_content.js_selection_area p::text').extract()
    origin = response.css(
        'div#artical_sth.clearfix span.ss03 a::attr(href)').extract_first()
    carried.update({
        'content': '\n'.join(paragraphs),
        'source_url': origin,
        'title': headline,
        'response_url': response.url,
    })
    yield IfengItem(carried)
def parse(self, response):
    """Parse a 'military' channel list page (leftList layout).

    For each <li> in div.leftList, build an IfengItem with url/title,
    fetch the article page synchronously and extract its body from
    div#Cnt-Main-Article-QQ (best-effort: a missing body is logged and
    the item is still yielded without 'content', preserving the original
    behavior), then yield the item if self.check() accepts the url.
    """
    soup = BeautifulSoup(response.body, "lxml")
    root = soup.find('div', {'class': 'leftList'})
    lis = root.findAll('li')
    for li in lis:
        item = IfengItem()
        url = li.find('a')['href']
        item['url'] = url
        title = li.get_text()
        item['title'] = title
        # NOTE(review): blocking fetch inside a Scrapy callback defeats
        # the async downloader; consider scrapy.Request with a callback.
        response2 = urllib.urlopen(url)
        soup2 = BeautifulSoup(response2, "lxml")
        try:
            content = soup2.find(
                'div', {'id': 'Cnt-Main-Article-QQ'}).get_text()
            item['content'] = content
        except AttributeError as e:
            # Bug fix: the original printed AttributeError.message — an
            # attribute of the exception *class*, not the caught
            # instance — so the actual error text was never shown.
            print(e)
        item['label'] = 'military'
        if self.check(item['url']):
            yield item