def parse_content(self, response):
    """Parse the merchant detail page content."""
    log.info('Begin parse_content ' + response.url)
    item = MerchantItem()
    item['updated_at'] = int(time.time())
    item['url'] = response.url
    item['area'] = response.url.split('/')[4]
    item['merchant_id'] = response.url.split('/')[-2]
    try:
        item['merchant_name'] = response.xpath(
            '//span[@id="shop_name_val"]/text()')[0].extract()
        item['company_profile'] = response.xpath(
            '//div[@class="i-txt"]/span[@class="s-con"]/text()')[0].extract()
        item['service_area'] = response.xpath(
            '//div[@class="des"]/div[@class="item-des clearfix"]'
            '/div[@class="i-txt i-dTxt"]/text()')[0].extract()
        item['merchant_pic'] = response.urljoin(
            response.xpath('//div[@class="pic"]/img/@src')[0].extract())
        yield item
    except Exception as e:
        # log.warn("Fetched content: " + response.text)
        log.warn("spider error %s ( refer: %s )" % (e, response.url))
        log.error(e)
        if configs.USE_PROXY:
            proxy_pool.add_failed_time(
                response.meta['proxy'].replace('http://', ''))
def get_client(host: str, port: int) -> MongoClient:
    try:
        client = MongoClient(host, port, maxPoolSize=MAX_POOL_SIZE)
        log.info("Connected successfully!!!")
        return client
    except errors.ConnectionFailure as e:
        log.error(e)
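# For context, a minimal usage sketch of get_client. The host/port values and the
# database/collection names below are assumptions for illustration, not taken from
# the project's configuration.
MONGO_HOST = 'localhost'   # hypothetical host
MONGO_PORT = 27017         # hypothetical port

client = get_client(MONGO_HOST, MONGO_PORT)
if client is not None:     # get_client returns None when ConnectionFailure is caught
    collection = client['spider_db']['merchants']   # assumed database/collection names
    collection.insert_one({'merchant_id': '123', 'merchant_name': 'demo'})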
def process_request(self, request, spider):
    try:
        # Pick one proxy and reuse it for both the log line and the request.
        proxy = proxy_pool.random_choice_proxy()
        log.info("==== proxy = " + proxy + " ====")
        request.meta['proxy'] = "http://%s" % proxy
    except Exception as e:
        log.error(e)
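# The middleware above and the error branch of parse_content both rely on a proxy_pool
# object exposing random_choice_proxy() and add_failed_time(). Its implementation is not
# shown in this section; the sketch below is a minimal in-memory stand-in consistent with
# those two calls. The class name, storage, and eviction threshold are assumptions.
import random
from collections import defaultdict


class ProxyPool:
    def __init__(self, proxies, max_failures=3):
        self.proxies = list(proxies)          # entries like "1.2.3.4:8080"
        self.failures = defaultdict(int)
        self.max_failures = max_failures      # assumed eviction threshold

    def random_choice_proxy(self) -> str:
        # Pick any known proxy at random.
        return random.choice(self.proxies)

    def add_failed_time(self, proxy: str):
        # Count a failure and drop the proxy once it fails too often.
        self.failures[proxy] += 1
        if self.failures[proxy] >= self.max_failures and proxy in self.proxies:
            self.proxies.remove(proxy)


# proxy_pool = ProxyPool(["1.2.3.4:8080", "5.6.7.8:3128"])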
def parse_list(self, response):
    selector = Selector(response)
    items_selector = selector.xpath(
        '//div[@id="listITme"]//div[@class="gl-listItem"]')
    for item_selector in items_selector:
        strategy_id = item_selector.xpath(
            'a/@href').extract()[0].replace('/strategy/', '')
        # e.g. http://guju.com.cn/strategy/strategy_getStrategyInfo_ajax?strategyModel.id=4498
        next_url = (constant.PROTOCOL_HTTP + self.start_url_domain +
                    '/strategy/strategy_getStrategyInfo_ajax'
                    '?strategyModel.id={id}').format(id=strategy_id)
        if self.design_strategy_service.is_duplicate_url(next_url):
            log.info("================= filtered duplicate " + next_url + " ===========")
            continue
        yield scrapy.Request(next_url, self.parse_content, meta={'id': strategy_id})
def handle_item(self, design_topic_item: DesignTopicItem):
    if self.is_duplicate_url(design_topic_item['html_url']):
        return
    design_topic_model = self.get_model(design_topic_item)
    self.save_to_database(self.collection, design_topic_model)
    self.insert_to_redis(design_topic_model.html_url)
    log.info("=========================================================================================")
    log.info("html_url:" + design_topic_item['html_url'])
    log.info("title:" + design_topic_item['title'])
    log.info("description:" + design_topic_item['description'])
    log.info("=========================================================================================")
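# The pipelines in this section call is_duplicate_url() and insert_to_redis() to
# deduplicate items by URL. Their implementation is not shown here; a minimal sketch
# backed by a Redis set could look like the following. The connection parameters and
# the key name are assumptions.
import redis

redis_client = redis.Redis(host='localhost', port=6379, db=0)   # hypothetical connection
CRAWLED_URL_SET = 'crawled_urls'                                # hypothetical key name


def is_duplicate_url(url: str) -> bool:
    # SISMEMBER: was this URL stored on a previous run?
    return redis_client.sismember(CRAWLED_URL_SET, url)


def insert_to_redis(url: str):
    # SADD: remember the URL so later crawls skip it.
    redis_client.sadd(CRAWLED_URL_SET, url)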
def process_request(self, request, spider):
    try:
        log.info('Chrome driver begin...')
        self.driver.get(request.url)  # load the page with the Chrome driver
        # Return the rendered HTML to the spider.
        return HtmlResponse(url=request.url,
                            body=self.driver.page_source,
                            request=request,
                            encoding='utf-8',
                            status=200)
    except TimeoutException:
        return HtmlResponse(url=request.url,
                            request=request,
                            encoding='utf-8',
                            status=500)
    finally:
        log.info('Chrome driver end...')
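# To make Scrapy route requests through the Chrome-backed middleware above, it has to be
# registered in settings.py. The module path and class name below are assumptions; adjust
# them to the project's actual layout.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}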
def handle_item(self, item: CityItem):
    log.info('process item from worm url = ' + item['url'])
    if isinstance(item, CityItem):
        session = self.session()
        model = CityDO()
        model.name = item['name']
        model.url = item['url']
        model.create_at = item['create_at']
        try:
            m = session.query(CityDO).filter(
                CityDO.url == model.url).first()
            if m is None:
                # Insert a new record.
                log.info('add model from worm url = ' + model.url)
                session.add(model)
                session.flush()
                session.commit()
                log.info('spider_success url = ' + model.url)
        except Exception as error:
            session.rollback()
            log.error(error)
            raise
        finally:
            session.close()
    return item
def parse(self, response):
    """
    Parse the merchant list page: collect the detail-page links for
    parse_content and, if there is a next page, schedule parse() again.
    """
    log.info('Begin parse ' + response.url)
    merchant_list = response.xpath(
        '//div[@class="company-item"]//div[@class="ordinary clearfix"]')
    for merchant in merchant_list:
        item = MerchantListItem()
        item['list_url'] = response.urljoin(
            merchant.xpath('./div[@class="list-middle fl"]/h2/a/@href').extract_first())
        item['category_name'] = merchant.xpath(
            './div[@class="list-middle fl"]/h2/a/text()').extract_first()
        item['merchant_id'] = merchant.xpath(
            './div[@class="list-right fl"]/a/@shop_id').extract_first()
        log.info('a href = ' + item['list_url'])
        yield Request(url=item['list_url'], callback=self.parse_content)

    # Is there another page? If so, keep crawling.
    pages = response.xpath('//div[@class="p_page"]/a')
    cur_page_xpath = response.xpath(
        '//div[@class="p_page"]/span[@class="cur"]/text()')
    if not cur_page_xpath:
        # xpath() returns an empty SelectorList when nothing matches, never None.
        cur_page_num = 1
    else:
        cur_page_num = int(cur_page_xpath.extract_first())
    if pages:
        next_index = None
        for index, page_link in enumerate(pages):
            page_num_str = page_link.xpath('./text()').extract_first()
            if self.is_number(page_num_str):
                page_num = int(page_num_str)
                if page_num > cur_page_num:
                    next_index = index
                    break
        if next_index is not None:
            next_page_url = response.urljoin(
                pages[next_index].xpath('./@href').extract_first())
            log.info('next_page_url: ' + next_page_url)
            # Feed the "next page" link back into parse() itself.
            yield scrapy.Request(next_page_url, callback=self.parse)
def handle_item(self, item: MerchantItem):
    log.info('process item from worm url = ' + item['url'])
    if isinstance(item, MerchantItem):
        session = self.session()
        model = MerchantDO()
        model.updated_at = item['updated_at']
        model.merchant_name = item['merchant_name']
        model.company_profile = item['company_profile']
        model.service_area = item['service_area']
        model.merchant_pic = item['merchant_pic']
        model.merchant_id = item['merchant_id']
        model.url = item['url']
        model.area = item['area']
        try:
            m = session.query(MerchantDO).filter(
                MerchantDO.url == model.url).first()
            if m is None:
                # Insert a new record.
                log.info('add model from worm url = ' + model.url)
                session.add(model)
                session.flush()
                session.commit()
                log.info('spider_success url = ' + model.url)
            # else:  # update the existing record
            #     log.info("update model from gp url " + model.url)
            #     m.updated_at = item['updated_at']
            #     m.merchant_name = item['merchant_name']
            #     m.merchant_pic = item['merchant_pic']
            #     m.service_area = item['service_area']
            #     m.merchant_id = item['merchant_id']
            #     m.company_profile = item['company_profile']
            #     m.area = item['area']
            #     m.url = item['url']
        except Exception as error:
            session.rollback()
            log.error(error)
            raise
        finally:
            session.close()
    return item
def parse(self, response):
    """Parse the city list page."""
    log.info('Begin parse ' + response.url)
    city_list = response.xpath(
        '//div[@class="city_main"]/dl[@class="clearfix"]/dd/a')
    for city in city_list:
        item = CityItem()
        item['url'] = response.urljoin(city.xpath('./@href').extract_first())
        item['name'] = city.xpath('./text()').extract_first()
        item['create_at'] = int(time.time())
        log.info('a href = ' + item['url'])
        yield item
    log.info('End parse ' + response.url)
def handle_item(self, design_picture_item: DesignPictureItem):
    if self.is_duplicate_url(design_picture_item['img_url']):
        return
    design_picture_model = self.get_design_picture_model(design_picture_item)
    self.save_to_database(self.collection, design_picture_model)
    summary_model = self.find_one(self.summary_collection,
                                  {'id': design_picture_model.fid})
    if summary_model is None:
        summary_model = self.create_design_picture_summary_model(
            design_picture_model)
        self.save_to_database(self.summary_collection, summary_model)
    else:
        tags = list(
            set(summary_model['tags']).union(set(design_picture_model.tags)))
        summary_model['cid'].append(design_picture_model.id)
        self.update_one(self.summary_collection, {'id': summary_model['id']}, {
            'update_time': utils.get_utc_time(),
            'tags': tags,
            'cid': summary_model['cid']
        })
    self.insert_to_redis(design_picture_model.img_url)
    log.info("=========================================================================================")
    log.info("title:" + design_picture_item['title'])
    log.info("sub_title:" + design_picture_item['sub_title'])
    log.info("original_width:" + design_picture_item['img_width'])
    log.info("original_height:" + design_picture_item['img_height'])
    log.info("html_url:" + design_picture_item['html_url'])
    log.info("img_url:" + design_picture_item['img_url'])
    log.info("description:" + design_picture_item['description'])
    log.info("tags:%s" % ','.join(map(str, design_picture_item['tags'])))
    log.info("=========================================================================================")