Example #1
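    # NOTE (assumed context, not shown in the snippet): this method belongs to a Scrapy
    # spider class; it relies on module-level imports such as `import scrapy` and the
    # project's item class, e.g. `from <project>.items import QqproItem`.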
    def parse_roll_item(self, response):
        print '=====parse_roll_item:=====response:', response
        sel_a = response.xpath(
            '//div[@class="main"]/div[@class="mainCon"]/div[@class="list c"]')
        sel_b = sel_a.xpath('.//ul')
        sel_b = sel_b.xpath('.//li')  # chain from the <ul> nodes down to the <li> entries
        id = 0
        for site in sel_b:
            id += 1
            title = site.xpath('a/text()').extract()
            link = site.xpath('a/@href').extract()
            # NOTE: this reuses the link XPath; the release time is not extracted separately here
            time_release = site.xpath('a/@href').extract()
            type_news = site.xpath('span[@class="t-tit"]/text()').extract()
            #response_news=site.xpath('p[@class]/text() ').extract()

            item = QqproItem(title=title,
                             link=link,
                             time_release=time_release,
                             type_news=type_news)
            print 'id : ', id
            print 'response :  ', response
            # 'response_news' is never set on this item (see the commented-out XPath above),
            # so guard the debug print to avoid a KeyError
            if item.get('response_news'):
                print 'response_news:', item['response_news'][0].encode('utf-8')
            if item['title']:
                print 'title:', item['title'][0].encode('utf-8')
            else:
                print 'title:', 'null'
            yield item
Example #2
    def parse_roll_item(self, response):
        print '=====parse_roll_item:=====response:', response
        #sel=Selector(response)
        #sel_a=response.xpath('//div[@class="main"]/div[@class="mainBody"]/div[@class="mainCon"]/div[@class="list c"]')
        sel_a = response.xpath('//*[@id="artContainer"]')
        #sel_a=response.xpath('/html/body/div[@id="iBody"]/div[@class="wrap c"]/div[@class="main"]/div[@class="mainBody"]/div[@class="mainCon"]/div[@id="artContainer"]/ul/li')
        #sel_m=sel_a.xpath('.//div[@id="artContainer"]/ul/li')
        #sel_a=sel
        sel_b = response.xpath('//ul')
        sel_c = sel_b.xpath('.//li')  # relative lookup; a bare '//li' would rescan the whole document
        print 'sel_a : ', sel_a
        print 'sel_b : ', sel_b
        print 'sel_c : ', sel_c
        #print 'sel_m : ',sel_m
        id = 0
        for site in sel_c:
            id += 1

            title = site.xpath('a/text()').extract()
            link = site.xpath('a/@href').extract()
            time_release = site.xpath('span[2]/text()').extract()
            response_news = site.xpath('a/text()').extract()
            #type_news=site.xpath('span[@class="t-tit"]/text()' ).extract()
            type_news = site.xpath('span[2]/text()').extract()

            item = QqproItem(title=title,
                             link=link,
                             time_release=time_release,
                             type_news=type_news,
                             response_news=response_news)

            print 'id : ', id
            print 'response :  ', response
            if item['response_news']:
                print 'response_news:', item['response_news'][0].encode('utf-8')
            else:
                print 'response_news:', 'null'

            if item['title']:
                print 'title:', item['title'][0].encode('utf-8')
            else:
                print 'title:', 'null'
            yield item
Example #3
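    # NOTE (assumed context): `get_base_url` is `scrapy.utils.response.get_base_url` and
    # `urljoin` is `urlparse.urljoin` (this is Python 2 code); both are imported at module level.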
    def parse_base_item(self, response):
        ###  (1) News Center - Top Stories (新闻中心_要闻)
        print '=====parse_base_item:=====response:', response
        base_url = get_base_url(response)
        sel_a = response.xpath('//div[@id="news"]')
        sel_b = sel_a.xpath('.//div[@class="Q-tpWrap"]')  # NOTE: sel_a/sel_b are unused; the loop below works on the <em> nodes directly
        id = 0
        ## crawl the list of news headlines on the home page
        for site in response.xpath('//em[@class]'):
            id += 1
            title = site.xpath('span/span/a/text()').extract()
            link = site.xpath('span/span/a/@href').extract()
            response_news = site.xpath('../p[@class]/text()').extract()

            ##show contents
            item = QqproItem(title=title,
                             link=link,
                             response_news=response_news,
                             manufacturer='qq_center_yaowen')

            ## fetch the article body
            print 'main body loop:'
            print 'response = ', response
            url_n = ''.join(link)  # link was already extracted above, no need to query it again
            url_new = urljoin(base_url, url_n)
            yield scrapy.Request(url_new,
                                 callback=self.parse_body_center_yaowen,
                                 meta={'item': item})
Example #4
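    # NOTE (assumed context): besides the imports noted above, this example needs the
    # standard-library `hashlib` module imported at module level for the SHA-1 title hash.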
    def parse(self, response):
        print '=====parse_pro_item:=====response:', response
        # Two candidate list layouts on these pages:
        #   a:  /ul/li/div/div
        #   b:  /ul/li
        base_url = get_base_url(response)
        sel_b = response.xpath('//ul')
        sel_c = sel_b.xpath('.//li')
        sel_d = sel_c.xpath('.//div')
        sel_e = sel_d.xpath('.//div')
        id = 0
        for site in sel_e:
            id += 1
            #a:
            # variant a: layout used on the 楚 (Hubei) and 粤 (Guangdong) provincial pages
            title = site.xpath('h3/a/text() | a/text()').extract()
            print 'title: ', title
            if len(title) != 0:
                link = site.xpath('h3/a/@href |a/@href').extract()
                #time_release=site.xpath('   ./../div[@class="pubTime"]/text()').extract()
                response_news = site.xpath(
                    'p/text() | ./../p/text()').extract()
                #type_news=['']

                ##item: hash
                title_a = ''.join(title[0]).encode('utf-8')  ### strict format: hash the UTF-8 bytes of the title
                sha1obj = hashlib.sha1()
                sha1obj.update(title_a)
                hash = sha1obj.hexdigest()
                hash = [hash]  ## item fields are stored as lists
                print 'spider: hash: ', hash
                #b:
                # variant b (commented out): layouts for the 楚 (Hubei), 粤 (Guangdong) and 湘/豫 (Hunan/Henan) pages
                #title=site.xpath('div/div[2]/h3/a/text()       |div[2]/div[1]/a/text() |a/text()').extract()
                #link=site.xpath('div/div[2]/h3/a/@href         |div[2]/div[1]/a/@href  |a/@href').extract()
                #time_release=site.xpath('div/div[2]/h3/a/@href |div[2]/div[1]/a/@href  |a/@href').extract()
                #response_news=site.xpath('div/p/text()         |div[2]/p/text()        |../p/text()').extract()
                #type_news=site.xpath('div/div[2]/h3/a/text()   |div[2]/div[1]/a/text() |a/text()').extract()

                url_m = ''.join(link)
                url_new = urljoin(base_url, url_m)

                item = QqproItem(title=title,
                                 link=url_new,
                                 response_news=response_news,
                                 manufacturer='province')

                #yield item

                yield scrapy.Request(url_new,
                                     callback=self.parse_body,
                                     meta={'item': item})

            else:
                print 'parse: title is empty, skipping.\n\n'
Example #5
    def parse_china_item(self, response):
        print '=====parse_china_item:=====response:', response
        sel_a = response.xpath('.//div[@id="news"]')
        sel_b = sel_a.xpath('.//div[@class="Q-tpWrap"]')
        id = 0
        for site in sel_b:
            id += 1
            title = site.xpath('em[@class]/a/text()').extract()
            link = site.xpath('em[@class]/a/@href').extract()
            #link="http://news.qq.com/"+str(link)
            #print "=============link:       ",link
            # NOTE: this reuses the link XPath; the release time is not extracted separately here
            time_release = site.xpath('em[@class]/a/@href').extract()
            response_news = site.xpath('p[@class]/text()').extract()
            type_news = site.xpath('em[@class]/a/text()').extract()

            ##item: hash
            title_a = ''.join(title).encode('utf-8')  # encode before hashing to avoid a UnicodeEncodeError on Chinese titles
            sha1obj = hashlib.sha1()
            sha1obj.update(title_a)
            hash = sha1obj.hexdigest()
            print 'spider: hash: ', hash

            item = QqproItem(title=title,
                             link=link,
                             time_release=time_release,
                             response_news=response_news,
                             type_news=type_news,
                             hash=hash,
                             manufacturer='qq_center_china')

            print 'id : ', id
            print 'response :  ', response

            if item['response_news']:
                print 'response_news:', item['response_news'][0].encode('utf-8')
            else:
                print 'response_news:', 'null'

            if item['title']:
                print 'title:', item['title'][0].encode('utf-8')
            else:
                print 'title:', 'null'
            yield item
Example #6
    def parse(self, response):
        #print '=====parse:response:',response
        id = 0
        base_url = get_base_url(response)

        ## crawl the headline index list
        sel_b = response.xpath('//em[@class]')
        for site in sel_b:
            #print 'contents loop:'
            #print 'response= ',response
            id += 1

            ##item: title link time res type
            title = site.xpath('span/span/a/text() ').extract()
            if len(title) != 0:
                link = site.xpath('span/span/a/@href').extract()
                time_release = ['']
                #response_news=site.xpath('div[@class="Q-tpWrap"]/p[@class]/text() ').extract()
                response_news = site.xpath('./../p[@class]/text() ').extract()
                #type_news=site.xpath('span/span/a/text() ' ).extract()

                ##item: hash
                #title_a=''.join(title[0]).encode('utf-8')
                ##title_a=str( ( title ) ).encode('utf-8')##
                #sha1obj = hashlib.sha1()
                #sha1obj.update(title_a)
                #hash = sha1obj.hexdigest()
                #hash=[hash]  ## hash stored as a list
                ##print 'spider: hash: ',hash

                #item=QqproItem(title=title,link=link,time_release=time_release,\
                #        response_news=response_news,hash=hash,\
                #        manufacturer='s_yaowen')
                # slice off the "[u'...']" wrapper from the list's repr; ''.join(link) would be the more robust choice
                url_m = (str(link))[3:-2]
                url_new = urljoin(base_url, url_m)
                item = QqproItem(title=title,
                                 link=url_new,
                                 response_news=response_news,
                                 manufacturer='s_yaowen')
                yield scrapy.Request(url_new,
                                     callback=self.parse_body,
                                     meta={'item': item})
Example #7
    def parse(self, response):
        print '=====parse_surface_item:=====response:', response
        sel_a = response.xpath('//div[contains(@id,"newsContent")]')
        sel_b = sel_a.xpath('.//ul')
        sel_c = sel_b.xpath('.//li')
        id = 0
        base_url = get_base_url(response)
        for site in sel_c:
            id += 1

            ##item: ...
            title = site.xpath('a/text()').extract()
            if len(title) != 0:
                link = site.xpath('a/@href').extract()
                response_news = ['']

                item = QqproItem(title=title,
                                 link=link,
                                 response_news=response_news,
                                 manufacturer='surface')

                # slice off the "[u'...']" wrapper from the list's repr; ''.join(link) would be more robust
                url_m = (str(link))[3:-2]
                #url_m= ''.join(link)
                url_new = urljoin(base_url, url_m)
                yield scrapy.Request(url_new,
                                     callback=self.parse_body,
                                     meta={'item': item})