def process_company_item(item, spider):
    """Normalize a crawled company item's ``doc`` in place.

    Drops crawl-only URL fields, adds computed fields (owner type and a
    string fnv hash of the site url), then fills any missing keys with
    extraction defaults and database-column defaults via ``setdefault``
    (existing keys are never overwritten).
    """
    now = datetime.datetime.utcnow()
    doc = item['doc']

    # These URLs were only needed while crawling; drop them before storage.
    for crawl_only_key in ('detail_url', 'about_url', 'contact_url'):
        doc.pop(crawl_only_key, None)

    # Fields computed from already-extracted values.
    computed = {
        'shop_owner_type': calc_shop_owner_type(doc['shop_name']),
        'shop_site_url_hash': str(fnvhash.fnv_64a_str(doc['shop_site_url'])),
    }

    # Defaults for fields the extractor may not have produced.
    extraction_defaults = {
        'crawl_time': now,
        'shop_site_type': spider.shop_site_type,
        'shop_name': None,
        'shop_site_url': None,
        'shop_products': None,
        'shop_launch_time': None,
        'shop_address': None,
        'shop_contacts': None,
        'shop_phone': None,
        'shop_cellphone': None,
        'shop_fax': None,
        'shop_email': None,
        'shop_qq': None,
    }

    # Default values for the remaining database columns.
    db_defaults = {
        'shop_type_id': None,
        'shop_area_id': None,
        'shop_certified': 1,
        'city_id': 1,
        'is_bad_url': 0,
        'is_bad_time': None,
        'deleted': 0,
        'isRead': 0,
        'isImport': 0,
    }

    # Only fill keys that are not already present.
    for defaults in (computed, extraction_defaults, db_defaults):
        for key, value in defaults.items():
            doc.setdefault(key, value)
def parse_list_page(self, response):
    """Parse a search-result list page and yield one item per shop entry.

    Per entry: extracts shop name, address and product text; computes the
    owner type, normalized detail url, site url and its fnv hash; fills
    the remaining document fields with defaults. When ``self.visit_detail``
    is set and a detail url exists, yields a Request for the detail page
    (carrying the item in ``meta``); otherwise yields the item directly.
    """
    multi_xpath = '//*[@id="sw_mod_searchlist"]/ul/li'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    multi_hxs = page_hxs.select(multi_xpath)

    for hxs in multi_hxs:
        # shop_products collects all shop-related description text:
        # main products, brief description and detail description.

        # --- extracted fields ---
        shop_name = ''.join(hxs.select('.//a[@class="sw-ui-font-title14"]//text()').extract())
        shop_address = ''.join(hxs.select('.//div[@class="sm-offerResult-address"]/a/text()').extract())
        # Only part of the main products appears here; the full list lives
        # in the title of the "main products" link.
        shop_part_products = ''.join(hxs.select('.//div[@class="sm-offerResult-sale"]//text()').extract())
        shop_brief = ''.join(hxs.select('.//div[@class="sm-offerResult-sub"]//text()').extract())
        creditdetail_url = ''.join(hxs.select('.//a[@class="sw-ui-font-title14"]/@href').extract())
        creditdetail_url = urllib.unquote(creditdetail_url).strip()

        # --- computed fields ---
        shop_products = shop_brief + shop_part_products
        creditdetail_url_query = get_url_query(creditdetail_url)
        # Strip the tracking parameter so urls/hashes are stable.
        creditdetail_url_query.pop('tracelog', None)
        creditdetail_url = change_url_query(creditdetail_url, creditdetail_url_query)
        shop_site_url = get_site_url(creditdetail_url)
        shop_owner_type = calc_shop_owner_type(shop_name)
        # str() for consistency with process_company_item, which stores the
        # fnv hash as a string.
        shop_site_url_hash = str(fnvhash.fnv_64a_str(shop_site_url))

        # --- fields with no corresponding data on this page ---
        lack_doc = {
            'shop_qq': None,
            'shop_email': None,
        }

        # --- default values for the remaining database columns ---
        default_doc = {
            'shop_type_id': None,
            'shop_area_id': None,
            'shop_site_type': self.shop_site_type,
            'shop_certified': 1,
            'city_id': 1,
            'is_bad_url': 0,
            'is_bad_time': None,
            'deleted': 0,
            'isRead': 0,
            'isImport': 0,
        }

        now = datetime.datetime.utcnow()
        doc = {
            'shop_name': shop_name,
            'shop_address': shop_address,
            'shop_products': shop_products,
            'shop_site_url': shop_site_url,
            'shop_site_url_hash': shop_site_url_hash,
            # BUG FIX: key was misspelled 'show_owner_type', so the computed
            # owner type was stored under a key nothing else reads.
            'shop_owner_type': shop_owner_type,
            'crawl_time': now,
        }
        doc.update(lack_doc)
        doc.update(default_doc)

        detail_url = creditdetail_url
        list_url = response.url
        query = response.meta.get('query')
        item = LegItem(collection=self.collection, doc=doc,
                       detail_url=detail_url, list_url=list_url, query=query)

        if detail_url and self.visit_detail:
            # Follow the credit-detail page; the item rides along in meta
            # so parse_detail_page can complete and yield it.
            detail_request = Request(detail_url, callback=self.parse_detail_page)
            detail_request.meta['item'] = item
            detail_request.meta['query'] = query
            yield detail_request
        else:
            yield item