コード例 #1
0
    def parse_item(self, response):
        """Build one ``NewsItem`` from an article page and yield it.

        Fields are filled as follows:

        * ``link``  -- ``response.url``.
        * ``title`` -- first text node matched by ``//head/title``.
        * ``date``  -- the fourth-, third- and second-to-last pieces of the
          URL split on ``/``, joined with ``-``.
        * ``province`` / ``city`` -- the pair returned by
          ``get_province_city(title)``.
        * ``content`` -- concatenated text of the first XPath in the fallback
          list below that matches anything, or the literal string ``"null"``
          when nothing matches.

        A page that cannot be parsed (for example one without a ``<title>``)
        is logged with its traceback and skipped, so at most one item is
        yielded per response.

        :param response: the downloaded page; it is wrapped in an
            ``HtmlXPathSelector`` and its ``url`` attribute is read.
        """
        import logging  # function-scope import: only needed on the error path

        # Tried in order; the first expression that matches any text wins.
        content_xpaths = (
            "//div[@id='articlecontent']/p/text()",
            "//div[@id='articlecontent']/P/text()",
            "//div[@id='articlecontent']/FONT/text()",
            "//div[@id='articlecontent']/text()",
            "//font[@id='zoom']/text()",
            "//div[@id='zoom']/text()",
        )

        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//head/title/text()").extract()[0]

            url_parts = item['link'].split('/')
            item['date'] = (
                url_parts[-4] + "-" + url_parts[-3] + "-" + url_parts[-2])

            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city

            content_list = []
            for xpath in content_xpaths:
                content_list = hxs.select(xpath).extract()
                if content_list:
                    break
            if not content_list:
                print('None')
            # An empty join (no match, or only empty text nodes) is stored
            # as the string "null".
            item['content'] = ''.join(content_list) or "null"
        except Exception:
            # Keep the best-effort behaviour (skip the page) but leave a
            # trace instead of swallowing the error silently.
            logging.getLogger(__name__).exception(
                "parse_item failed for %s", getattr(response, 'url', response))
            return

        # Yield outside the try block so GeneratorExit and exceptions thrown
        # in by the consumer are not swallowed.
        yield item
        # Runs when the consumer resumes the generator after taking the item.
        # NOTE(review): a blocking sleep stalls the whole process; if this
        # runs under Scrapy, the DOWNLOAD_DELAY setting throttles without
        # blocking -- confirm and consider replacing.
        time.sleep(1)
コード例 #2
0
ファイル: yulin.py プロジェクト: sarsiz/news-spider-scrapy
    def parse_item(self, response):
        """Build one ``NewsItem`` from an article page and yield it.

        Fields are filled as follows:

        * ``link``  -- ``response.url``.
        * ``title`` -- first text node matched by ``//head/title``.
        * ``date``  -- first whitespace-separated token of the first text
          node matched by ``//div[@class='about']/span``.
        * ``province`` / ``city`` -- the pair returned by
          ``get_province_city(title)``.
        * ``content`` -- concatenated text of every ``<p>`` below
          ``//dl[@id='content']/dd``, or the literal string ``"null"`` when
          nothing matches.

        A page that cannot be parsed (for example one without a ``<title>``
        or without the date span) is logged with its traceback and skipped,
        so at most one item is yielded per response.

        :param response: the downloaded page; it is wrapped in an
            ``HtmlXPathSelector`` and its ``url`` attribute is read.
        """
        import logging  # function-scope import: only needed on the error path

        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//head/title/text()").extract()[0]

            about_tokens = hxs.select(
                "//div[@class='about']/span/text()").extract()[0].split()
            item['date'] = about_tokens[0]

            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city

            content_list = hxs.select(
                "//dl[@id='content']/dd//p/text()").extract()
            if not content_list:
                print('None')
            # An empty join (no match, or only empty text nodes) is stored
            # as the string "null".
            item['content'] = ''.join(content_list) or "null"
        except Exception:
            # Keep the best-effort behaviour (skip the page) but leave a
            # trace instead of swallowing the error silently.
            logging.getLogger(__name__).exception(
                "parse_item failed for %s", getattr(response, 'url', response))
            return

        # Yield outside the try block so GeneratorExit and exceptions thrown
        # in by the consumer are not swallowed.
        yield item
        # Runs when the consumer resumes the generator after taking the item;
        # random() gives a pause of less than one second.
        # NOTE(review): a blocking sleep stalls the whole process; if this
        # runs under Scrapy, the DOWNLOAD_DELAY setting throttles without
        # blocking -- confirm and consider replacing.
        time.sleep(random())
コード例 #3
0
    def parse_item(self, response):
        """Build one ``NewsItem`` from an article page and yield it.

        Fields are filled as follows:

        * ``link``  -- ``response.url``.
        * ``title`` -- first text node matched by ``//head/title``.
        * ``date``  -- the second-to-last piece of the URL split on ``/``.
        * ``province`` / ``city`` -- the pair returned by
          ``get_province_city(title)``.
        * ``content`` -- concatenated text of the ``<p>`` children of
          ``//div[@id='content']``, or the literal string ``"null"`` when
          nothing matches.

        A page that cannot be parsed (for example one without a ``<title>``)
        is logged with its traceback and skipped, so at most one item is
        yielded per response.

        :param response: the downloaded page; it is wrapped in an
            ``HtmlXPathSelector`` and its ``url`` attribute is read.
        """
        import logging  # function-scope import: only needed on the error path

        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//head/title/text()").extract()[0]
            item['date'] = item['link'].split('/')[-2]

            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city

            content_list = hxs.select(
                "//div[@id='content']/p/text()").extract()
            if not content_list:
                print('None')
            # An empty join (no match, or only empty text nodes) is stored
            # as the string "null".
            item['content'] = ''.join(content_list) or "null"
        except Exception:
            # Keep the best-effort behaviour (skip the page) but leave a
            # trace instead of swallowing the error silently.
            logging.getLogger(__name__).exception(
                "parse_item failed for %s", getattr(response, 'url', response))
            return

        # Yield outside the try block so GeneratorExit and exceptions thrown
        # in by the consumer are not swallowed.
        yield item
        # Runs when the consumer resumes the generator after taking the item.
        # NOTE(review): a blocking sleep stalls the whole process; if this
        # runs under Scrapy, the DOWNLOAD_DELAY setting throttles without
        # blocking -- confirm and consider replacing.
        time.sleep(0.5)
コード例 #4
0
    def parse_item(self, response):
        """Build one ``NewsItem`` from an article page and yield it.

        Fields are filled as follows:

        * ``link``  -- ``response.url``.
        * ``title`` -- first text node matched by ``//head/title``.
        * ``date``  -- sliced out of the second whitespace-separated token of
          the first ``<font style='FONT-SIZE: 9pt; COLOR: #666666'>`` text
          node and re-joined with ``-``.
        * ``province`` / ``city`` -- the pair returned by
          ``get_province_city(title)``.
        * ``content`` -- concatenated text of every ``<p>`` on the page, or
          the literal string ``"null"`` when there is none.

        A page that cannot be parsed (for example one without a ``<title>``
        or without the date element) is logged with its traceback and
        skipped, so at most one item is yielded per response.

        :param response: the downloaded page; it is wrapped in an
            ``HtmlXPathSelector`` and its ``url`` attribute is read.
        """
        import logging  # function-scope import: only needed on the error path

        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//head/title/text()").extract()[0]

            byline_tokens = hxs.select(
                "//font[@style='FONT-SIZE: 9pt; COLOR: #666666']/text()"
            ).extract()[0].split()
            stamp = byline_tokens[1]
            # Characters 0-3, 5-6 and 8..second-to-last are kept; positions 4,
            # 7 and the final character are dropped.
            # NOTE(review): presumably the token looks like YYYY?MM?DD? with
            # one-character separators -- confirm against a live page.
            item['date'] = stamp[:4] + "-" + stamp[5:7] + "-" + stamp[8:-1]

            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city

            content_list = hxs.select("//p/text()").extract()
            if not content_list:
                print('None')
            # An empty join (no match, or only empty text nodes) is stored
            # as the string "null".
            item['content'] = ''.join(content_list) or "null"
        except Exception:
            # Keep the best-effort behaviour (skip the page) but leave a
            # trace instead of swallowing the error silently.
            logging.getLogger(__name__).exception(
                "parse_item failed for %s", getattr(response, 'url', response))
            return

        # Yield outside the try block so GeneratorExit and exceptions thrown
        # in by the consumer are not swallowed.
        yield item
        # Runs when the consumer resumes the generator after taking the item;
        # random() gives a pause of less than one second.
        # NOTE(review): a blocking sleep stalls the whole process; if this
        # runs under Scrapy, the DOWNLOAD_DELAY setting throttles without
        # blocking -- confirm and consider replacing.
        time.sleep(random())
コード例 #5
0
    def parse_item(self, response):
        """Build one ``NewsItem`` from an article page and yield it.

        Fields are filled as follows:

        * ``link``  -- ``response.url``.
        * ``title`` -- first text node matched by ``//head/title``.
        * ``date``  -- the third- and second-to-last pieces of the URL split
          on ``/``, joined with ``-``.
        * ``province`` / ``city`` -- the pair returned by
          ``get_province_city(title)``.
        * ``content`` -- concatenated text of every ``<p>`` on the page, or
          the literal string ``"null"`` when there is none.

        A page that cannot be parsed (for example one without a ``<title>``)
        is logged with its traceback and skipped, so at most one item is
        yielded per response.

        :param response: the downloaded page; it is wrapped in an
            ``HtmlXPathSelector`` and its ``url`` attribute is read.
        """
        import logging  # function-scope import: only needed on the error path

        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//head/title/text()").extract()[0]

            url_parts = item['link'].split('/')
            item['date'] = url_parts[-3] + "-" + url_parts[-2]

            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city

            content_list = hxs.select("//p/text()").extract()
            if not content_list:
                print('None')
            # An empty join (no match, or only empty text nodes) is stored
            # as the string "null".
            item['content'] = ''.join(content_list) or "null"
        except Exception:
            # Keep the best-effort behaviour (skip the page) but leave a
            # trace instead of swallowing the error silently.
            logging.getLogger(__name__).exception(
                "parse_item failed for %s", getattr(response, 'url', response))
            return

        # Yield outside the try block so GeneratorExit and exceptions thrown
        # in by the consumer are not swallowed.
        yield item
        # Runs when the consumer resumes the generator after taking the item.
        # NOTE(review): a blocking sleep stalls the whole process; if this
        # runs under Scrapy, the DOWNLOAD_DELAY setting throttles without
        # blocking -- confirm and consider replacing.
        time.sleep(0.5)
コード例 #6
0
    def parse_item(self, response):
        """Build one ``NewsItem`` from an article page and yield it.

        Fields are filled as follows:

        * ``link``  -- ``response.url``.
        * ``title`` -- first text node matched by ``//head/title``.
        * ``date``  -- sliced out of the second whitespace-separated token of
          the first ``<font style='FONT-SIZE: 9pt; COLOR: #666666'>`` text
          node and re-joined with ``-``.
        * ``province`` / ``city`` -- the pair returned by
          ``get_province_city(title)``.
        * ``content`` -- concatenated text of every ``<p>`` on the page, or
          the literal string ``"null"`` when there is none.

        A page that cannot be parsed (for example one without a ``<title>``
        or without the date element) is logged with its traceback and
        skipped, so at most one item is yielded per response.

        :param response: the downloaded page; it is wrapped in an
            ``HtmlXPathSelector`` and its ``url`` attribute is read.
        """
        import logging  # function-scope import: only needed on the error path

        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//head/title/text()").extract()[0]

            byline_tokens = hxs.select(
                "//font[@style='FONT-SIZE: 9pt; COLOR: #666666']/text()"
            ).extract()[0].split()
            stamp = byline_tokens[1]
            # Characters 0-3, 5-6 and 8..second-to-last are kept; positions 4,
            # 7 and the final character are dropped.
            # NOTE(review): presumably the token looks like YYYY?MM?DD? with
            # one-character separators -- confirm against a live page.
            item['date'] = stamp[:4] + "-" + stamp[5:7] + "-" + stamp[8:-1]

            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city

            content_list = hxs.select("//p/text()").extract()
            if not content_list:
                print('None')
            # An empty join (no match, or only empty text nodes) is stored
            # as the string "null".
            item['content'] = ''.join(content_list) or "null"
        except Exception:
            # Keep the best-effort behaviour (skip the page) but leave a
            # trace instead of swallowing the error silently.
            logging.getLogger(__name__).exception(
                "parse_item failed for %s", getattr(response, 'url', response))
            return

        # Yield outside the try block so GeneratorExit and exceptions thrown
        # in by the consumer are not swallowed.
        yield item
        # Runs when the consumer resumes the generator after taking the item;
        # random() gives a pause of less than one second.
        # NOTE(review): a blocking sleep stalls the whole process; if this
        # runs under Scrapy, the DOWNLOAD_DELAY setting throttles without
        # blocking -- confirm and consider replacing.
        time.sleep(random())
コード例 #7
0
    def parse_item(self, response):
        """Build one ``NewsItem`` from an article page and yield it.

        Fields are filled as follows:

        * ``link``  -- ``response.url``.
        * ``title`` -- first text node of the ``<h3>`` inside the
          ``NewsContent`` block.
        * ``date``  -- derived from the second-to-last piece of the URL split
          on ``/``, which is read as a compact ``YYYY`` + month + day stamp
          of 6, 7 or 8 characters.  For any other length ``date`` is left
          unset, as before.
        * ``province`` / ``city`` -- the pair returned by
          ``get_province_city(title)``.
        * ``content`` -- concatenated text of the ``<p>`` children of
          ``//div[@id='content']/div[@class='nr']``, or the literal string
          ``"null"`` when nothing matches.

        A page that cannot be parsed (for example one without the title
        element) is logged with its traceback and skipped, so at most one
        item is yielded per response.

        :param response: the downloaded page; it is wrapped in an
            ``HtmlXPathSelector`` and its ``url`` attribute is read.
        """
        import logging  # function-scope import: only needed on the error path

        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select(
                "//div[@class='contentbody']/div[@class='contentleft']/div[@class='NewsContent']/h3/text()"
            ).extract()[0]

            stamp = item['link'].split('/')[-2]
            year = stamp[:4]
            if len(stamp) == 6:
                # YYYY + M + D
                item['date'] = year + '-' + stamp[-2] + '-' + stamp[-1]
            elif len(stamp) == 8:
                # YYYY + MM + DD
                item['date'] = year + '-' + stamp[-4:-2] + '-' + stamp[-2:]
            elif len(stamp) == 7:
                # Ambiguous: YYYY + M + DD or YYYY + MM + D.
                #
                # The characters are compared as digit strings.  The previous
                # code compared them with the ints 2 and 0: on Python 2 a str
                # always orders above an int, so the first branch was always
                # taken, and on Python 3 the comparison raises TypeError.
                #
                # A two-digit month can only start with '0' or '1', so any
                # larger first digit means a one-digit month.  (The old
                # threshold of 2 would have produced month "20" for a stamp
                # such as 2013205.)  Otherwise a '0' in the next position is
                # read as the end of a two-digit month, e.g. 2013101 -> 10-1.
                if stamp[4] > '1' or stamp[5] != '0':
                    item['date'] = year + '-' + stamp[-3] + '-' + stamp[-2:]
                else:
                    item['date'] = year + '-' + stamp[-4:-2] + '-' + stamp[-1]

            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city

            content_list = hxs.select(
                "//div[@id='content']/div[@class='nr']/p/text()").extract()
            if not content_list:
                print('None')
            # An empty join (no match, or only empty text nodes) is stored
            # as the string "null".
            item['content'] = ''.join(content_list) or "null"
        except Exception:
            # Keep the best-effort behaviour (skip the page) but leave a
            # trace instead of swallowing the error silently.
            logging.getLogger(__name__).exception(
                "parse_item failed for %s", getattr(response, 'url', response))
            return

        # Yield outside the try block so GeneratorExit and exceptions thrown
        # in by the consumer are not swallowed.
        yield item
        # Runs when the consumer resumes the generator after taking the item.
        # NOTE(review): a blocking sleep stalls the whole process; if this
        # runs under Scrapy, the DOWNLOAD_DELAY setting throttles without
        # blocking -- confirm and consider replacing.
        time.sleep(0.5)
コード例 #8
0
    def parse_item(self, response):
        """Build one ``NewsItem`` from an article page and yield it.

        Fields are filled as follows:

        * ``link``  -- ``response.url``.
        * ``title`` -- first text node matched by ``//head/title``.
        * ``date``  -- the fourth-, third- and second-to-last pieces of the
          URL split on ``/``, joined with ``-``.
        * ``province`` / ``city`` -- the pair returned by
          ``get_province_city(title)``.
        * ``content`` -- concatenated text of the first XPath in the fallback
          list below that matches anything, or the literal string ``"null"``
          when nothing matches.

        A page that cannot be parsed (for example one without a ``<title>``)
        is logged with its traceback and skipped, so at most one item is
        yielded per response.

        :param response: the downloaded page; it is wrapped in an
            ``HtmlXPathSelector`` and its ``url`` attribute is read.
        """
        import logging  # function-scope import: only needed on the error path

        # Tried in order; the first expression that matches any text wins.
        content_xpaths = (
            "//div[@id='articlecontent']/p/text()",
            "//div[@id='articlecontent']/P/text()",
            "//div[@id='articlecontent']/FONT/text()",
            "//div[@id='articlecontent']/text()",
            "//font[@id='zoom']/text()",
            "//div[@id='zoom']/text()",
        )

        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//head/title/text()").extract()[0]

            url_parts = item['link'].split('/')
            item['date'] = (
                url_parts[-4] + "-" + url_parts[-3] + "-" + url_parts[-2])

            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city

            content_list = []
            for xpath in content_xpaths:
                content_list = hxs.select(xpath).extract()
                if content_list:
                    break
            if not content_list:
                print('None')
            # An empty join (no match, or only empty text nodes) is stored
            # as the string "null".
            item['content'] = ''.join(content_list) or "null"
        except Exception:
            # Keep the best-effort behaviour (skip the page) but leave a
            # trace instead of swallowing the error silently.
            logging.getLogger(__name__).exception(
                "parse_item failed for %s", getattr(response, 'url', response))
            return

        # Yield outside the try block so GeneratorExit and exceptions thrown
        # in by the consumer are not swallowed.
        yield item
        # Runs when the consumer resumes the generator after taking the item.
        # NOTE(review): a blocking sleep stalls the whole process; if this
        # runs under Scrapy, the DOWNLOAD_DELAY setting throttles without
        # blocking -- confirm and consider replacing.
        time.sleep(1)
コード例 #9
0
    def parse_item(self, response):
        """Build one ``NewsItem`` from an article page and yield it.

        Fields are filled as follows:

        * ``link``  -- ``response.url``.
        * ``title`` -- first text node of the ``<h3>`` inside the
          ``NewsContent`` block.
        * ``date``  -- derived from the second-to-last piece of the URL split
          on ``/``, which is read as a compact ``YYYY`` + month + day stamp
          of 6, 7 or 8 characters.  For any other length ``date`` is left
          unset, as before.
        * ``province`` / ``city`` -- the pair returned by
          ``get_province_city(title)``.
        * ``content`` -- concatenated text of the ``<p>`` children of
          ``//div[@id='content']/div[@class='nr']``, or the literal string
          ``"null"`` when nothing matches.

        A page that cannot be parsed (for example one without the title
        element) is logged with its traceback and skipped, so at most one
        item is yielded per response.

        :param response: the downloaded page; it is wrapped in an
            ``HtmlXPathSelector`` and its ``url`` attribute is read.
        """
        import logging  # function-scope import: only needed on the error path

        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//div[@class='contentbody']/div[@class='contentleft']/div[@class='NewsContent']/h3/text()").extract()[0]

            stamp = item['link'].split('/')[-2]
            year = stamp[:4]
            if len(stamp) == 6:
                # YYYY + M + D
                item['date'] = year + '-' + stamp[-2] + '-' + stamp[-1]
            elif len(stamp) == 8:
                # YYYY + MM + DD
                item['date'] = year + '-' + stamp[-4:-2] + '-' + stamp[-2:]
            elif len(stamp) == 7:
                # Ambiguous: YYYY + M + DD or YYYY + MM + D.
                #
                # The characters are compared as digit strings.  The previous
                # code compared them with the ints 2 and 0: on Python 2 a str
                # always orders above an int, so the first branch was always
                # taken, and on Python 3 the comparison raises TypeError.
                #
                # A two-digit month can only start with '0' or '1', so any
                # larger first digit means a one-digit month.  (The old
                # threshold of 2 would have produced month "20" for a stamp
                # such as 2013205.)  Otherwise a '0' in the next position is
                # read as the end of a two-digit month, e.g. 2013101 -> 10-1.
                if stamp[4] > '1' or stamp[5] != '0':
                    item['date'] = year + '-' + stamp[-3] + '-' + stamp[-2:]
                else:
                    item['date'] = year + '-' + stamp[-4:-2] + '-' + stamp[-1]

            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city

            content_list = hxs.select("//div[@id='content']/div[@class='nr']/p/text()").extract()
            if not content_list:
                print('None')
            # An empty join (no match, or only empty text nodes) is stored
            # as the string "null".
            item['content'] = ''.join(content_list) or "null"
        except Exception:
            # Keep the best-effort behaviour (skip the page) but leave a
            # trace instead of swallowing the error silently.
            logging.getLogger(__name__).exception(
                "parse_item failed for %s", getattr(response, 'url', response))
            return

        # Yield outside the try block so GeneratorExit and exceptions thrown
        # in by the consumer are not swallowed.
        yield item
        # Runs when the consumer resumes the generator after taking the item.
        # NOTE(review): a blocking sleep stalls the whole process; if this
        # runs under Scrapy, the DOWNLOAD_DELAY setting throttles without
        # blocking -- confirm and consider replacing.
        time.sleep(0.5)