Example #1
    def parse_table(self, response):
        url = response.url
        data_table = response.xpath('/html/body/table')
        data_list = table_to_list(data_table)
        # nodeId = response.meta['nodeId']
        if data_list != []:
            for data in data_list[4:10]:
                if data[0] != '':
                    for rows in range(2, len(data_list[3])):
                        item = Puok_chinaisaPartpriceItem()
                        item['datadate'] = response.meta['datadate']
                        item['page_date'] = data_list[0][0]
                        # item['unit'] = data_list[1][13]
                        item['unit'] = u'元/吨'  # yuan per ton
                        item['part'] = data_list[2][rows]
                        item['mainsort'] = data[0]
                        item['mainsize'] = data[1]
                        item['price'] = data[rows]
                        item['update_dt'] = datetime.datetime.now()
                        item['source'] = url
                        yield item

        else:
            print(u'Reached the table data page, but the XPath found no data: '
                  + response.meta['title_name'] + ' ' + response.url)
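
Every example in this set passes the selected <table> node through a table_to_list helper that is not shown. Here is a minimal sketch of what it presumably does, assuming it flattens each <tr> into a list of stripped cell texts (the behavior is inferred from how its results are indexed above; the real helper is defined elsewhere in the project):

    def table_to_list(data_table):
        # data_table is a Scrapy SelectorList pointing at one or more
        # <table> nodes; each <tr> becomes a list of cell strings.
        rows = []
        for tr in data_table.xpath('.//tr'):
            cells = [''.join(td.xpath('.//text()').extract()).strip()
                     for td in tr.xpath('./td | ./th')]
            if cells:
                rows.append(cells)
        return rows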
Example #2
    def parse_newsContent(self, response):
        # The URL in hand may not yet be the real table address; resolve
        # the table's actual address first.
        realdatahref = response.xpath('//*[@id="shLink"]/@href').extract()
        if realdatahref != []:
            realdataURL = response.urljoin(realdatahref[0])
            request = scrapy.http.Request(realdataURL,
                                          callback=self.parse_table)
            request.meta['datadate'] = response.meta['datadate']
            request.meta['title_name'] = response.meta['title_name']
            yield request
        else:
            data_table = response.xpath('/html/body/div/table')
            data_list = table_to_list(data_table)
            # nodeId = response.meta['nodeId']
            if data_list != []:
                for data in data_list[1:]:
                    if data[0] != '':
                        for rows in range(2, len(data_list[1])):
                            item = Puok_chinaisaPartpriceItem()
                            item['datadate'] = response.meta['datadate']
                            item['page_date'] = response.meta['title_name']
                            item['unit'] = u'元/吨'  # yuan per ton
                            item['part'] = data_list[0][rows]
                            item['mainsort'] = data[0]
                            item['mainsize'] = data[1]
                            item['price'] = data[rows]
                            item['update_dt'] = datetime.datetime.now()
                            item['source'] = response.url
                            yield item
            else:
                print(u'parse_newsContent parsed no data ---> ' + response.meta[
                    'title_name'] + ' ' + response.url)
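
The hand-off above copies meta keys onto the new Request one at a time. Scrapy's Request also accepts the whole dict through its meta keyword argument, so the redirect branch could be written a little tighter; this sketch is meant as a method of the same spider class, alongside the parse_table of Example #1:

    import scrapy

    def parse_newsContent(self, response):
        realdatahref = response.xpath('//*[@id="shLink"]/@href').extract()
        if realdatahref:
            yield scrapy.Request(
                response.urljoin(realdatahref[0]),
                callback=self.parse_table,
                meta={'datadate': response.meta['datadate'],
                      'title_name': response.meta['title_name']})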
Example #3
    def parse_newsFrameContent(self, response):
        datadate = datetime.datetime.strptime(response.meta['title_date'],
                                              '%Y-%m-%d')
        realUrlPart = response.xpath('//*[@id="mframe"]/@src')
        # If non-empty, this page does not hold the article itself; the
        # article has to be fetched from another page.
        # The China Iron and Steel Association site is unusual: parse this
        # frame page first, pull the real HTML article address out of the
        # frame, and only then continue parsing.
        # /html/body/table
        if realUrlPart != []:
            # realURL is the actual article location
            realURL = response.urljoin(realUrlPart.extract()[0])
            request = scrapy.http.Request(realURL,
                                          callback=self.parse_newsContent)
            request.meta['title_name'] = response.meta['title_name']
            request.meta['datadate'] = datadate
            request.meta['frameURL'] = response.url
            request.meta['nodeId'] = response.meta['nodeId']
            yield request
        # Some pages carry the data directly on this page
        else:
            data_table = response.xpath(
                '//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/table')
            data_list = table_to_list(data_table)
            # nodeId = response.meta['nodeId']
            if data_list != []:
                for data in data_list[1:]:
                    if data[0] != '':
                        for rows in range(2, len(data_list[1])):
                            item = Puok_chinaisaPartpriceItem()
                            item['datadate'] = datadate
                            item['page_date'] = response.meta['title_name']
                            # item['page_title'] = response.meta['title_name']
                            item['unit'] = u'元/吨'  # yuan per ton
                            item['part'] = data_list[0][rows]
                            item['mainsort'] = data[0]
                            item['mainsize'] = data[1]
                            item['price'] = data[rows]
                            item['update_dt'] = datetime.datetime.now()
                            item['source'] = response.url
                            yield item
            else:
                print(u'Table not found on the first-pass page ---> ' +
                      response.meta['title_name'] + ' ' + response.url)
Example #4
    def parse_table(self, response):
        url = response.url
        data_table = response.xpath('/html/body/table')
        data_list = table_to_list(data_table)
        # nodeId = response.meta['nodeId']
        if data_list != []:
            for data in data_list[5:13]:
                for rows in range(2, 8):
                    item = Puok_chinaisaSortpriceItem()
                    item['datadate'] = response.meta['datadate']
                    item['page_date'] = data_list[0][0]
                    item['page_sort'] = data_list[1][0]
                    if rows < 6:
                        item['index_name'] = data_list[3][rows] + ' ' + data_list[4][rows]
                    else:
                        item['index_name'] = data_list[3][rows]
                    item['mainsort'] = data[0]
                    item['mainsize'] = data[1]
                    item['price'] = data[rows]
                    item['update_dt'] = datetime.datetime.now()
                    item['source'] = url
                    yield item

            for data in data_list[16:19]:
                for rows in [2, 4, 6, 7]:
                    item = Puok_chinaisaSortpriceItem()
                    item['datadate'] = response.meta['datadate']
                    item['page_date'] = data_list[0][0]
                    item['page_sort'] = data_list[13][0]
                    if rows < 6:
                        item['index_name'] = data_list[14][rows] + ' ' + data_list[15][rows]
                    else:
                        item['index_name'] = data_list[14][rows]
                    item['mainsort'] = data[0]
                    item['mainsize'] = u'无规格'  # no specification
                    item['price'] = data[rows]
                    item['update_dt'] = datetime.datetime.now()
                    item['source'] = url
                    yield item
        else:
            print('parse_table: table not found ----> ' + response.url)
Example #5
    def parse_newsContent(self, response):
        datadate = datetime.datetime.strptime(response.meta['datadate'],
                                              '%Y-%m-%d')
        # imageurl = ''
        # newsTitle = response.meta['newsTitle']
        # newsContentURL = response.meta['newsContentURL']

        data_table = response.xpath(
            '//*[@id="main-right"]/div[@class="right03"]/table')
        data_list = table_to_list(data_table)

        for data in data_list[1:]:
            item = Puok_cansiItem()
            item['datadate'] = (data[0].replace('\n', '')
                                       .replace('\t', '')
                                       .replace('\r', '-'))
            item['index_name'] = data[1]
            item['world'] = data[2].replace('%', '')
            item['china'] = data[3].replace('%', '')
            item['korea'] = data[4].replace('%', '')
            item['japan'] = data[5].replace('%', '')
            item['update_dt'] = datetime.datetime.now()
            item['source'] = response.url
            yield item
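
The item classes used across these examples (Puok_chinaisaPartpriceItem, Puok_chinaisaSortpriceItem, Puok_cansiItem) are defined elsewhere in the project. A plausible reconstruction as plain scrapy.Item subclasses, with every field name taken from the assignments above (nothing else about the real definitions is known):

    import scrapy

    class Puok_chinaisaPartpriceItem(scrapy.Item):
        datadate = scrapy.Field()
        page_date = scrapy.Field()
        unit = scrapy.Field()
        part = scrapy.Field()
        mainsort = scrapy.Field()
        mainsize = scrapy.Field()
        price = scrapy.Field()
        update_dt = scrapy.Field()
        source = scrapy.Field()

    class Puok_chinaisaSortpriceItem(scrapy.Item):
        datadate = scrapy.Field()
        page_date = scrapy.Field()
        page_sort = scrapy.Field()
        index_name = scrapy.Field()
        mainsort = scrapy.Field()
        mainsize = scrapy.Field()
        price = scrapy.Field()
        update_dt = scrapy.Field()
        source = scrapy.Field()

    class Puok_cansiItem(scrapy.Item):
        datadate = scrapy.Field()
        index_name = scrapy.Field()
        world = scrapy.Field()
        china = scrapy.Field()
        korea = scrapy.Field()
        japan = scrapy.Field()
        update_dt = scrapy.Field()
        source = scrapy.Field()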
Example #6
    def parse_newsFrameContent(self, response):
        datadate = datetime.datetime.strptime(response.meta['title_date'], '%Y-%m-%d')
        realUrlPart = response.xpath('//*[@id="mframe"]/@src')
        # If non-empty, this page does not hold the article itself; the
        # article has to be fetched from another page.
        # The China Iron and Steel Association site is unusual: parse this
        # frame page first, pull the real HTML article address out of the
        # frame, and only then continue parsing.
        if realUrlPart != []:
            # realURL is the actual article location
            realURL = response.urljoin(realUrlPart.extract()[0])
            request = scrapy.http.Request(realURL, callback=self.parse_newsContent)
            request.meta['title_name'] = response.meta['title_name']
            request.meta['datadate'] = datadate
            request.meta['frameURL'] = response.url
            request.meta['nodeId'] = response.meta['nodeId']
            yield request
        # The table data is on this page
        else:
            # find page_sort
            # page_sort1 = response.xpath('//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/p[3]/span//text()').extract()
            # page_sort2 = response.xpath('//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/p[5]/span//text()').extract()
            # if page_sort1 == [] and page_sort2 == []:
            #     page_sort1 = response.xpath(
            #         '//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/div[1]//text()').extract()
            #     page_sort2 = response.xpath(
            #         '//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/div[2]//text()').extract()
            #     if page_sort1 != [] and page_sort2 != []:
            #         page_sort1 = page_sort1[0]
            #         page_sort2 = page_sort2[0]
            #     else:
            #         print(u'Title not found... ' + response.url)
            # else:
            #     page_sort1 = page_sort1[0]
            #     page_sort2 = page_sort2[0]

            data_table1 = response.xpath('//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/table[1]')
            data_list1 = table_to_list(data_table1)

            # nodeId = response.meta['nodeId']
            if data_list1 != []:
                for data in data_list1[2:]:
                    for rows in range(2, 8):
                        item = Puok_chinaisaSortpriceItem()
                        item['datadate'] = datadate
                        item['page_date'] = response.meta['title_name']
                        # "Domestic market prices and indices for eight
                        # products (tax-inclusive)"
                        item['page_sort'] = u'国内市场八个品种价格及指数(含税价)'
                        if rows < 6:
                            item['index_name'] = data_list1[0][rows] + ' ' + data_list1[1][rows]
                        else:
                            item['index_name'] = data_list1[0][rows]
                        item['mainsort'] = data[0]
                        item['mainsize'] = data[1]
                        item['price'] = data[rows]
                        item['update_dt'] = datetime.datetime.now()
                        item['source'] = response.url
                        yield item
            else:
                print(u'parse_newsFrameContent: data_list1 not found --> ' + response.url)

            data_table2 = response.xpath('//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/table[2]')
            data_list2 = table_to_list(data_table2)
            if data_list2 != []:
                for data in data_list2[2:]:
                    for rows in range(1, 5):
                        item = Puok_chinaisaSortpriceItem()
                        item['datadate'] = datadate
                        item['page_date'] = response.meta['title_name']
                        # "Domestic composite steel price index and price
                        # indices for long and flat products (tax-inclusive)"
                        item['page_sort'] = u'国内市场钢材综合价格指数及长材、板材价格指数(含税)'
                        if rows < 3:
                            item['index_name'] = data_list2[0][rows] + ' ' + data_list2[1][rows]
                        else:
                            item['index_name'] = data_list2[0][rows]
                        item['mainsort'] = data[0]
                        item['mainsize'] = u'无规格'  # no specification
                        item['price'] = data[rows]
                        item['update_dt'] = datetime.datetime.now()
                        item['source'] = response.url
                        yield item
            else:
                print(u'parse_newsFrameContent: data_list2 not found --> ' + response.url)
Example #7
    def parse_newsContent(self, response):
        # The URL in hand may not yet be the real table address; resolve
        # the table's actual address first.
        realhref = response.xpath('//*[@id="shLink"]/@href').extract()
        if realhref != []:
            realURL = response.urljoin(realhref[0])
            # print 'table realURL--->' + realURL
            request = scrapy.http.Request(realURL, callback=self.parse_table)
            request.meta['datadate'] = response.meta['datadate']
            yield request
        else:
            # print 'parse_newsContent function error + ' + response.url
            page_sort1 = response.xpath('/html/body/div/p[1]/span/text()').extract()
            page_sort2 = response.xpath('/html/body/div/p[2]/span/text()').extract()
            if page_sort1 != [] and page_sort2 != []:
                page_sort1 = page_sort1[0]
                page_sort2 = page_sort2[0]

            data_table1 = response.xpath('/html/body/div/div[1]/table')
            if data_table1.extract() == []:
                data_table1 = response.xpath('/html/body/div/table[1]')

            data_list1 = table_to_list(data_table1)

            # nodeId = response.meta['nodeId']
            if data_list1 != []:
                for data in data_list1[2:]:
                    for rows in range(2, 8):
                        item = Puok_chinaisaSortpriceItem()
                        item['datadate'] = response.meta['datadate']
                        item['page_date'] = response.meta['title_name']
                        item['page_sort'] = page_sort1
                        if rows < 6:
                            item['index_name'] = data_list1[0][rows] + ' ' + data_list1[1][rows]
                        else:
                            item['index_name'] = data_list1[0][rows]
                        item['mainsort'] = data[0]
                        item['mainsize'] = data[1]
                        item['price'] = data[rows]
                        item['update_dt'] = datetime.datetime.now()
                        item['source'] = response.url
                        yield item
            else:
                print(u'parse_newsContent: data_list1 not found --> ' + response.url)

            data_table2 = response.xpath('/html/body/div/div[2]/table')
            if data_table2.extract() == []:
                data_table2 = response.xpath('/html/body/div/table[2]')

            data_list2 = table_to_list(data_table2)

            if data_list2 != []:
                for data in data_list2[2:]:
                    for rows in range(1, 5):
                        item = Puok_chinaisaSortpriceItem()
                        item['datadate'] = response.meta['datadate']
                        item['page_date'] = response.meta['title_name']
                        item['page_sort'] = page_sort2
                        if rows < 3:
                            item['index_name'] = data_list2[0][rows] + ' ' + data_list2[1][rows]
                        else:
                            item['index_name'] = data_list2[0][rows]
                        item['mainsort'] = data[0]
                        item['mainsize'] = u'无规格'  # no specification
                        item['price'] = data[rows]
                        item['update_dt'] = datetime.datetime.now()
                        item['source'] = response.url
                        yield item
            else:
                print(u'parse_newsContent: data_list2 not found --> ' + response.url)
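
Taken together, the callbacks form a chain: parse_newsFrameContent resolves the #mframe frame, parse_newsContent resolves the #shLink redirect (or parses inline tables), and parse_table scrapes the final table page. A minimal spider skeleton wiring them up; the spider name, list URL, and list-page parsing below are placeholders, since that code is not part of these excerpts:

    import scrapy

    class ChinaisaSpiderSketch(scrapy.Spider):
        name = 'chinaisa_prices'  # hypothetical; the real spider is not shown

        def start_requests(self):
            # Placeholder list-page URL
            yield scrapy.Request('http://example.com/news-list',
                                 callback=self.parse_list)

        def parse_list(self, response):
            # Hypothetical list parsing: each article link must carry the
            # meta keys that parse_newsFrameContent expects.
            for link in response.xpath('//a'):
                href = link.xpath('./@href').extract_first()
                if not href:
                    continue
                yield scrapy.Request(
                    response.urljoin(href),
                    callback=self.parse_newsFrameContent,
                    meta={'title_name': link.xpath('string(.)').extract_first(),
                          'title_date': '2020-01-01',  # normally read from the list
                          'nodeId': ''})               # placeholder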