def parse_table(self, response):
    """Parse the price table at /html/body/table and yield one
    Puok_chinaisaPartpriceItem per (category row, part column) cell.

    Table layout (as indexed below): row 0 holds the page date, row 2 the
    part names, row 3 fixes the column count, and rows 4..9 carry the data
    rows ([mainsort, mainsize, price, price, ...]).
    """
    url = response.url
    data_table = response.xpath('/html/body/table')
    data_list = table_to_list(data_table)
    if not data_list:
        # Reached a page that should hold the table, but the xpath found nothing.
        print(u'进入了table的数据页,但xpath未找到数据位置' + response.meta[
            'title_name'] + ' ' + response.url)
        return
    for data in data_list[4:10]:
        # Skip filler rows with an empty first cell.
        if data[0] != '':
            # Columns 0/1 are mainsort/mainsize; prices start at column 2.
            for col in range(2, len(data_list[3])):
                item = Puok_chinaisaPartpriceItem()
                item['datadate'] = response.meta['datadate']
                item['page_date'] = data_list[0][0]
                item['unit'] = u'元/吨'
                item['part'] = data_list[2][col]
                item['mainsort'] = data[0]
                item['mainsize'] = data[1]
                item['price'] = data[col]
                item['update_dt'] = datetime.datetime.now()
                item['source'] = url
                yield item
def parse_newsContent(self, response):
    """Resolve the real table location if this page is only a link page,
    otherwise parse the inline table and yield Puok_chinaisaPartpriceItem.

    The page fetched here may still not be the real table page: when an
    element #shLink exists, its href points at the actual table and a new
    request is scheduled to parse_table; otherwise the table is inline.
    """
    realdataherf = response.xpath('//*[@id="shLink"]/@href').extract()
    if realdataherf:
        # Indirect page: follow the real table URL, forwarding the meta
        # fields parse_table needs.
        realdataURL = response.urljoin(realdataherf[0])
        request = scrapy.http.Request(realdataURL, callback=self.parse_table)
        request.meta['datadate'] = response.meta['datadate']
        request.meta['title_name'] = response.meta['title_name']
        yield request
        return
    data_table = response.xpath('/html/body/div/table')
    data_list = table_to_list(data_table)
    if not data_list:
        print(u'parse_newsContent 方法中未解析到数据--->' + response.meta[
            'title_name'] + ' ' + response.url)
        return
    # Row 0 holds the part names; row 1 fixes the column count; data rows
    # follow as [mainsort, mainsize, price, price, ...].
    for data in data_list[1:]:
        if data[0] != '':
            for col in range(2, len(data_list[1])):
                item = Puok_chinaisaPartpriceItem()
                item['datadate'] = response.meta['datadate']
                item['page_date'] = response.meta['title_name']
                item['unit'] = u'元/吨'
                item['part'] = data_list[0][col]
                item['mainsort'] = data[0]
                item['mainsize'] = data[1]
                item['price'] = data[col]
                item['update_dt'] = datetime.datetime.now()
                item['source'] = response.url
                yield item
def parse_newsFrameContent(self, response):
    """Unwrap the CHINAISA frame page, or parse the table when it is inline.

    The site wraps articles in an iframe (#mframe): when the iframe src is
    present the real article URL is extracted and handed to
    parse_newsContent; some pages instead embed the data table directly,
    in which case it is parsed here and Puok_chinaisaPartpriceItem yielded.
    """
    datadate = datetime.datetime.strptime(response.meta['title_date'],
                                          '%Y-%m-%d')
    realUrlPart = response.xpath('//*[@id="mframe"]/@src')
    if realUrlPart:
        # Frame page: the iframe src is the real article URL.
        realURL = response.urljoin(realUrlPart.extract()[0])
        request = scrapy.http.Request(realURL,
                                      callback=self.parse_newsContent)
        request.meta['title_name'] = response.meta['title_name']
        request.meta['datadate'] = datadate
        request.meta['frameURL'] = response.url
        request.meta['nodeId'] = response.meta['nodeId']
        yield request
        return
    # Some pages carry the data table directly on this page.
    data_table = response.xpath(
        '//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/table')
    data_list = table_to_list(data_table)
    if not data_list:
        print(u'未找到首次解析页面的table--->' + response.meta[
            'title_name'] + ' ' + response.url)
        return
    # Row 0 holds part names; row 1 fixes the column count; data rows
    # follow as [mainsort, mainsize, price, price, ...].
    for data in data_list[1:]:
        if data[0] != '':
            for col in range(2, len(data_list[1])):
                item = Puok_chinaisaPartpriceItem()
                item['datadate'] = datadate
                item['page_date'] = response.meta['title_name']
                item['unit'] = u'元/吨'
                item['part'] = data_list[0][col]
                item['mainsort'] = data[0]
                item['mainsize'] = data[1]
                item['price'] = data[col]
                item['update_dt'] = datetime.datetime.now()
                item['source'] = response.url
                yield item
def parse_table(self, response):
    """Parse the two-section price table at /html/body/table and yield
    Puok_chinaisaSortpriceItem rows.

    Section 1 (rows 5..12): eight product varieties, price columns 2..7;
    headers live in rows 3/4 and the section title in row 1. Columns < 6
    carry a two-line header (name + spec) that is joined with a space.
    Section 2 (rows 16..18): composite/long/flat price indices with no
    spec; only columns 2, 4, 6, 7 carry values, headers in rows 14/15 and
    the section title in row 13.
    """
    url = response.url
    data_table = response.xpath('/html/body/table')
    data_list = table_to_list(data_table)
    if not data_list:
        print('parse_table function, not find table---->' + response.url)
        return
    # Section 1: eight varieties with sizes.
    for data in data_list[5:13]:
        for col in range(2, 8):
            item = Puok_chinaisaSortpriceItem()
            item['datadate'] = response.meta['datadate']
            item['page_date'] = data_list[0][0]
            item['page_sort'] = data_list[1][0]
            if col < 6:
                # Two-line column header: index name + specification.
                item['index_name'] = data_list[3][col] + ' ' + data_list[4][col]
            else:
                item['index_name'] = data_list[3][col]
            item['mainsort'] = data[0]
            item['mainsize'] = data[1]
            item['price'] = data[col]
            item['update_dt'] = datetime.datetime.now()
            item['source'] = url
            yield item
    # Section 2: composite indices, sparse columns, no specification.
    for data in data_list[16:19]:
        for col in (2, 4, 6, 7):
            item = Puok_chinaisaSortpriceItem()
            item['datadate'] = response.meta['datadate']
            item['page_date'] = data_list[0][0]
            item['page_sort'] = data_list[13][0]
            if col < 6:
                item['index_name'] = data_list[14][col] + ' ' + data_list[15][col]
            else:
                item['index_name'] = data_list[14][col]
            item['mainsort'] = data[0]
            item['mainsize'] = u'无规格'
            item['price'] = data[col]
            item['update_dt'] = datetime.datetime.now()
            item['source'] = url
            yield item
def parse_newsContent(self, response):
    """Parse the CANSI statistics table and yield one Puok_cansiItem per
    data row (world/china/korea/japan percentages with '%' stripped).
    """
    # Parses (and thereby validates) the date format carried in meta;
    # the parsed value itself is not stored on the items below.
    datadate = datetime.datetime.strptime(response.meta['datadate'],
                                          '%Y-%m-%d')
    table_sel = response.xpath(
        '//*[@id="main-right"]/div[@class="right03"]/table')
    rows = table_to_list(table_sel)
    # Row 0 is the header; each data row is
    # [date, index name, world, china, korea, japan].
    for row in rows[1:]:
        record = Puok_cansiItem()
        cleaned_date = row[0].replace('\n', '').replace('\t', '')
        record['datadate'] = cleaned_date.replace('\r', '-')
        record['index_name'] = row[1]
        record['world'] = row[2].replace('%', '')
        record['china'] = row[3].replace('%', '')
        record['korea'] = row[4].replace('%', '')
        record['japan'] = row[5].replace('%', '')
        record['update_dt'] = datetime.datetime.now()
        record['source'] = response.url
        yield record
def parse_newsFrameContent(self, response):
    """Unwrap the CHINAISA frame page, or parse the two inline price
    tables and yield Puok_chinaisaSortpriceItem rows.

    When the iframe (#mframe) src exists, the real article URL is handed
    to parse_newsContent. Otherwise the page carries two tables inline:
    table[1] — eight varieties with sizes, price columns 2..7, two-line
    headers in rows 0/1 for columns < 6; table[2] — composite/long/flat
    indices without sizes, price columns 1..4, two-line headers for
    columns < 3.
    """
    datadate = datetime.datetime.strptime(response.meta['title_date'],
                                          '%Y-%m-%d')
    realUrlPart = response.xpath('//*[@id="mframe"]/@src')
    if realUrlPart:
        # Frame page: the iframe src is the real article URL.
        realURL = response.urljoin(realUrlPart.extract()[0])
        request = scrapy.http.Request(realURL,
                                      callback=self.parse_newsContent)
        request.meta['title_name'] = response.meta['title_name']
        request.meta['datadate'] = datadate
        request.meta['frameURL'] = response.url
        request.meta['nodeId'] = response.meta['nodeId']
        yield request
        return
    # Table data is on this page; page_sort titles are hard-coded because
    # their on-page location varies between layouts.
    data_table1 = response.xpath(
        '//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/table[1]')
    data_list1 = table_to_list(data_table1)
    if data_list1:
        for data in data_list1[2:]:
            for col in range(2, 8):
                item = Puok_chinaisaSortpriceItem()
                item['datadate'] = datadate
                item['page_date'] = response.meta['title_name']
                item['page_sort'] = u'国内市场八个品种价格及指数(含税价)'
                if col < 6:
                    # Two-line column header: index name + specification.
                    item['index_name'] = data_list1[0][col] + ' ' + data_list1[1][col]
                else:
                    item['index_name'] = data_list1[0][col]
                item['mainsort'] = data[0]
                item['mainsize'] = data[1]
                item['price'] = data[col]
                item['update_dt'] = datetime.datetime.now()
                item['source'] = response.url
                yield item
    else:
        print(u'parse_newsFrameContent function, data_list1-->' + response.url)
    data_table2 = response.xpath(
        '//*[@id="wrapper"]/table[2]/tr[3]/td/table/tr[3]/td/table[2]')
    data_list2 = table_to_list(data_table2)
    if data_list2:
        for data in data_list2[2:]:
            for col in range(1, 5):
                item = Puok_chinaisaSortpriceItem()
                item['datadate'] = datadate
                item['page_date'] = response.meta['title_name']
                item['page_sort'] = u'国内市场钢材综合价格指数及长材、板材价格指数(含税)'
                if col < 3:
                    item['index_name'] = data_list2[0][col] + ' ' + data_list2[1][col]
                else:
                    item['index_name'] = data_list2[0][col]
                item['mainsort'] = data[0]
                item['mainsize'] = u'无规格'
                item['price'] = data[col]
                item['update_dt'] = datetime.datetime.now()
                item['source'] = response.url
                yield item
    else:
        print(u'parse_newsFrameContent function, data_list2-->' + response.url)
def parse_newsContent(self, response):
    """Resolve the real table location if this page is only a link page,
    otherwise parse the two inline price tables and yield
    Puok_chinaisaSortpriceItem rows.

    When #shLink exists its href points at the real table page, which is
    scheduled for parse_table. Otherwise the page holds two tables whose
    section titles are read from p[1]/p[2] spans; each table is looked up
    at its primary xpath with a fallback location for the older layout.
    """
    realherf = response.xpath('//*[@id="shLink"]/@href').extract()
    if realherf:
        # Indirect page: follow the real table URL.
        realURL = response.urljoin(realherf[0])
        request = scrapy.http.Request(realURL, callback=self.parse_table)
        request.meta['datadate'] = response.meta['datadate']
        yield request
        return
    page_sort1 = response.xpath('/html/body/div/p[1]/span/text()').extract()
    page_sort2 = response.xpath('/html/body/div/p[2]/span/text()').extract()
    if not page_sort1 or not page_sort2:
        # Previously this case fell through silently; log it so missing
        # section titles are visible during crawls.
        print('parse_newsContent: page_sort titles not found---->' + response.url)
        return
    page_sort1 = page_sort1[0]
    page_sort2 = page_sort2[0]
    # Table 1: eight varieties with sizes, price columns 2..7, two-line
    # headers in rows 0/1 for columns < 6.
    data_table1 = response.xpath('/html/body/div/div[1]/table')
    if data_table1.extract() == []:
        # Fallback for the older page layout.
        data_table1 = response.xpath('/html/body/div/table[1]')
    data_list1 = table_to_list(data_table1)
    if data_list1:
        for data in data_list1[2:]:
            for col in range(2, 8):
                item = Puok_chinaisaSortpriceItem()
                item['datadate'] = response.meta['datadate']
                item['page_date'] = response.meta['title_name']
                item['page_sort'] = page_sort1
                if col < 6:
                    item['index_name'] = data_list1[0][col] + ' ' + data_list1[1][col]
                else:
                    item['index_name'] = data_list1[0][col]
                item['mainsort'] = data[0]
                item['mainsize'] = data[1]
                item['price'] = data[col]
                item['update_dt'] = datetime.datetime.now()
                item['source'] = response.url
                yield item
    else:
        print(u'parse_newsContent function, data_list1-->' + response.url)
    # Table 2: composite indices without sizes, price columns 1..4,
    # two-line headers for columns < 3.
    data_table2 = response.xpath('/html/body/div/div[2]/table')
    if data_table2.extract() == []:
        data_table2 = response.xpath('/html/body/div/table[2]')
    data_list2 = table_to_list(data_table2)
    if data_list2:
        for data in data_list2[2:]:
            for col in range(1, 5):
                item = Puok_chinaisaSortpriceItem()
                item['datadate'] = response.meta['datadate']
                item['page_date'] = response.meta['title_name']
                item['page_sort'] = page_sort2
                if col < 3:
                    item['index_name'] = data_list2[0][col] + ' ' + data_list2[1][col]
                else:
                    item['index_name'] = data_list2[0][col]
                item['mainsort'] = data[0]
                item['mainsize'] = u'无规格'
                item['price'] = data[col]
                item['update_dt'] = datetime.datetime.now()
                item['source'] = response.url
                yield item
    else:
        print(u'parse_newsContent function, data_list2-->' + response.url)