def spider_closed(self, spider, reason):
    spider.logger.info('{0} closed, reason: {1}'.format(spider.name, reason))
    if spider.save_end_log:
        district_key = (spider.addr_district.district_key
                        if spider.addr_district is not None else None)
        if reason == "finished":
            SpiderLogService.save_spider_log(spider.name, spider.start_time,
                                             district_key, 'END')
        else:
            SpiderLogService.save_spider_log(spider.name, spider.start_time,
                                             district_key, 'ERROR')
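# Context sketch (not in the source): a spider_closed handler like the one above is
# normally connected through Scrapy's signal system. The extension class name below
# is hypothetical; only the wiring itself (crawler.signals.connect with
# signals.spider_closed) is standard Scrapy API.
from scrapy import signals


class SpiderLogExtension:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # Fire spider_closed(spider, reason) whenever a crawl finishes or errors out.
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext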
def process_response(self, request, response, spider):
    # Called with the response returned from the downloader.
    # Must either:
    # - return a Response object
    # - return a Request object
    # - or raise IgnoreRequest
    if response.status != 200:
        SpiderLogService.save_error_log(
            spider.name, response.status,
            spider.addr_district.district_key if spider.addr_district is not None else None,
            request.url, '')
    return response
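# Sketch (assumption, not shown in the source): for process_response to be invoked,
# the downloader middleware has to be enabled in settings.py. The module path and
# class name below are hypothetical; only the project package name hkpost_scrapy
# appears elsewhere in this code.
DOWNLOADER_MIDDLEWARES = {
    'hkpost_scrapy.middlewares.ErrorLogDownloaderMiddleware': 543,
}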
def __init__(self, category=None, district_key=None, *args, **kwargs):
    is_finished = SpiderLogService.is_spider_finished(self.name, district_key)
    # Ensure the attribute exists even when the crawl is skipped.
    self.addr_district = None
    self.save_end_log = False
    self.start_urls = []
    if is_finished:
        self.log("%s is already finished!" % self.name)
    else:
        self.addr_district = SpiderLogService.query_district(district_key)
        SpiderLogService.save_spider_log(UnitSpider.name, UnitSpider.start_time,
                                         district_key, 'START')
        self.save_end_log = True
        self.start_urls.append(
            building_url.format(zone=self.addr_district.zone_key,
                                district=self.addr_district.district_key,
                                sid=random.random()))
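# Usage sketch: because district_key is a constructor argument, the spider can be
# run per district from the command line with Scrapy's -a option. The spider name
# unit_spider matches the name used by the runner script below; the key value "KC"
# is illustrative only.
#
#     scrapy crawl unit_spider -a district_key=KC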
def __init__(self, category=None, *args, **kwargs):
    self.addr_district = None
    self.save_end_log = False
    is_finished = SpiderLogService.is_spider_finished(self.name)
    if is_finished:
        self.log("%s is already finished!" % self.name)
        self.start_urls = []
    else:
        SpiderLogService.save_spider_log(DistrictSpider.name, DistrictSpider.start_time,
                                         self.addr_district, 'START')
        self.save_end_log = True
        self.start_urls = utils.init_start_district_Urls()
        zones = utils.cache_zones
        for z in zones:
            zone = Zone()
            zone['id'] = z['id']
            zone['name'] = z['name']
            zone.save()
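# For reference, the SpiderLogService call surface assumed by the snippets in this
# section, reconstructed only from the calls that appear here. The argument names
# and the stub shape are assumptions; the real implementation lives in
# hkpost_scrapy.service.db_service and is not shown.
class SpiderLogService:
    @staticmethod
    def is_spider_finished(spider_name, district_key=None):
        """Return True when a crawl for this spider (and district) already ended."""
        ...

    @staticmethod
    def save_spider_log(spider_name, start_time, district_key, status):
        """Persist a crawl lifecycle record; status is 'START', 'END', or 'ERROR'."""
        ...

    @staticmethod
    def save_error_log(spider_name, http_status, district_key, url, message):
        """Persist a failed (non-200) request for later inspection."""
        ...

    @staticmethod
    def query_district(district_key):
        """Return the district record (with zone_key/district_key) or None."""
        ...

    @staticmethod
    def get_all_district():
        """Return every district record; used when no --dkey argument is given."""
        ...

    @staticmethod
    def query_building(district_key, street_key, street_no_key, estate_key,
                       building_key, building_name):
        """Return the stored building matching these keys, or None."""
        ...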
def check_building_phase(self, response):
    building = response.meta["building"]
    options = response.xpath("//select/option")
    # The phase-filtered query returned no matching building: save the building
    # without phase info.
    if len(building['estate_key']) > 0 and building['building_name'] not in [
            op.xpath("text()").extract()[0].strip() for op in options]:
        building_db = SpiderLogService.query_building(
            building['district_key'], building['street_key'],
            building['street_no_key'], building['estate_key'],
            building['building_key'], building['building_name'])
        # If the database already has this record, it was inserted before,
        # so only insert when it is missing.
        if building_db is None:
            building_5 = utils.copy_value_from_item(building, Building)
            building_5['phase_key'] = ''
            building_5['phase_name'] = ''
            building_5['building_name'] = building_5['building_name'].strip()
            full_name_reverse, full_name = utils.get_building_full_name(building_5)
            building_5['full_name_reverse'] = full_name_reverse
            building_5['full_name'] = full_name
            yield building_5
            next_url = floor_url.format(
                district=building_5['district_key'],
                building=building_5['building_key'].replace('&', '%26'),
                street=building_5['street_key'],
                estate=building_5['estate_key'],
                phase=building_5['phase_key'].replace('&', '%26'),
                strno=building_5['street_no_key'],
                sid=random.random())
            yield scrapy.Request(next_url, callback=self.parse_floor,
                                 meta={"building": building_5})
    else:
        for op in options:
            if len(op.xpath("@value").extract()[0]) > 0:
                if building['building_key'] == op.xpath("@value").extract()[0] \
                        and building['building_name'] == op.xpath("text()").extract()[0].strip():
                    building_db = SpiderLogService.query_building(
                        building['district_key'], building['street_key'],
                        building['street_no_key'], building['estate_key'],
                        building['building_key'], building['building_name'])
                    if building_db is not None:
                        # An empty stored phase_key means this building matches the
                        # phase condition, so update it with the phase info.
                        if building_db.phase_key == '' or len(building_db.phase_key) == 0:
                            building_db.phase_key = building['phase_key']
                            building_db.phase_name = building['phase_name']
                            full_name_reverse, full_name = utils.get_building_full_name(building)
                            building_db.full_name_reverse = full_name_reverse
                            building_db.full_name = full_name
                            building_db.save()
                            next_url = floor_url.format(
                                district=building['district_key'],
                                building=building['building_key'].replace('&', '%26'),
                                street=building['street_key'],
                                estate=building['estate_key'],
                                phase=building['phase_key'].replace('&', '%26'),
                                strno=building['street_no_key'],
                                sid=random.random())
                            yield scrapy.Request(next_url, callback=self.parse_floor,
                                                 meta={"building": building})
                        elif building_db.phase_key == building['phase_key'] \
                                and building_db.phase_name == building['phase_name']:
                            return
                        else:
                            building_5 = utils.copy_value_from_item(building, Building)
                            building_5['building_name'] = building_5['building_name'].strip()
                            full_name_reverse, full_name = utils.get_building_full_name(building_5)
                            building_5['full_name_reverse'] = full_name_reverse
                            building_5['full_name'] = full_name
                            yield building_5
                            next_url = floor_url.format(
                                district=building_5['district_key'],
                                building=building_5['building_key'].replace('&', '%26'),
                                street=building_5['street_key'],
                                estate=building_5['estate_key'],
                                phase=building_5['phase_key'].replace('&', '%26'),
                                strno=building_5['street_no_key'],
                                sid=random.random())
                            yield scrapy.Request(next_url, callback=self.parse_floor,
                                                 meta={"building": building_5})
                    else:
                        building_5 = utils.copy_value_from_item(building, Building)
                        building_5['building_name'] = building_5['building_name'].strip()
                        full_name_reverse, full_name = utils.get_building_full_name(building_5)
                        building_5['full_name_reverse'] = full_name_reverse
                        building_5['full_name'] = full_name
                        yield building_5
                        next_url = floor_url.format(
                            district=building_5['district_key'],
                            building=building_5['building_key'].replace('&', '%26'),
                            street=building_5['street_key'],
                            estate=building_5['estate_key'],
                            phase=building_5['phase_key'].replace('&', '%26'),
                            strno=building_5['street_no_key'],
                            sid=random.random())
                        yield scrapy.Request(next_url, callback=self.parse_floor,
                                             meta={"building": building_5})
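# Suggestion sketch (not in the source): the floor_url.format(...) call above is
# repeated in four branches with identical parameters; a small helper such as the
# hypothetical build_floor_url below would centralize it. floor_url and its
# placeholders (district, building, street, estate, phase, strno, sid) are taken
# from the code above.
import random


def build_floor_url(b):
    # Build the floor-listing URL for a building item, URL-escaping '&' in the key
    # fields the same way the spider does.
    return floor_url.format(
        district=b['district_key'],
        building=b['building_key'].replace('&', '%26'),
        street=b['street_key'],
        estate=b['estate_key'],
        phase=b['phase_key'].replace('&', '%26'),
        strno=b['street_no_key'],
        sid=random.random())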
import argparse
import logging
import time

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import defer

parser = argparse.ArgumentParser(description='manual to this script')
parser.add_argument('--dkey', type=str, default=None)
args = parser.parse_args()
district_key = args.dkey
print("crawl data by param: district_key = %s" % district_key)

# Print logs to the console.
configure_logging()
# CrawlerRunner picks up the project settings from settings.py.
runner = CrawlerRunner(get_project_settings())

from hkpost_scrapy.service.db_service import SpiderLogService

district_list = []
if district_key is None:
    district_list = SpiderLogService.get_all_district()
else:
    district_list.append(SpiderLogService.query_district(district_key))


@defer.inlineCallbacks
def crawl_address():
    if district_list:
        for district in district_list:
            logging.info("new cycle starting")
            yield runner.crawl("building_spider", district_key=district.district_key)
            time.sleep(3)
            yield runner.crawl("unit_spider", district_key=district.district_key)
            time.sleep(3)
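# The tail of the runner script is not shown above. With CrawlerRunner, the usual
# pattern is to kick off the inlineCallbacks chain, stop the reactor once it
# resolves, and then start the reactor; a minimal sketch of that standard pattern
# (an assumption about how this script ends, not taken from the source):
from twisted.internet import reactor

d = crawl_address()
d.addBoth(lambda _: reactor.stop())  # stop the event loop once all crawls finish
reactor.run()  # blocks here until crawling is done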