Example #1
 def spider_closed(self, spider, reason):
     spider.logger.info('{0} closed, reason: {1}'.format(spider.name, reason))
     if spider.save_end_log:
         # Record the final log entry: 'END' on a clean finish, 'ERROR' otherwise.
         status = 'END' if reason == "finished" else 'ERROR'
         SpiderLogService.save_spider_log(
             spider.name, spider.start_time,
             spider.addr_district.district_key
             if spider.addr_district is not None else None,
             status)
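This handler only runs if it has been connected to Scrapy's spider_closed signal. A minimal sketch of the usual wiring in an extension's from_crawler (the class name SpiderLogExtension is hypothetical):

from scrapy import signals

class SpiderLogExtension:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # Route the spider_closed signal to the handler shown above.
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext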
Example #2
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        if response.status != 200:
            SpiderLogService.save_error_log(spider.name, response.status,
                                            spider.addr_district.district_key if spider.addr_district is not None else None,
                                            request.url,
                                            '')
        return response
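process_response only fires when the middleware is enabled in the project settings. A plausible settings.py entry, assuming the class lives in hkpost_scrapy.middlewares (module path, class name, and priority value are guesses):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'hkpost_scrapy.middlewares.SpiderLogMiddleware': 543,
}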
Example #3
 def __init__(self, category=None, district_key=None, *args, **kwargs):
     super().__init__(*args, **kwargs)
     # Initialise defaults so spider_closed and the middleware can always read them.
     self.addr_district = None
     self.save_end_log = False
     self.start_urls = []
     is_finished = SpiderLogService.is_spider_finished(self.name, district_key)
     if is_finished:
         self.log("%s is already finished!" % self.name)
     else:
         self.addr_district = SpiderLogService.query_district(district_key)
         SpiderLogService.save_spider_log(UnitSpider.name,
                                          UnitSpider.start_time,
                                          district_key, 'START')
         self.save_end_log = True
         self.start_urls.append(
             building_url.format(zone=self.addr_district.zone_key,
                                 district=self.addr_district.district_key,
                                 sid=random.random()))
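district_key reaches __init__ through Scrapy's spider arguments, e.g. scrapy crawl unit_spider -a district_key=HK-CENTRAL. The programmatic equivalent, assuming the spider is registered as unit_spider (the key value is illustrative):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# Keyword arguments are forwarded to the spider's __init__.
process.crawl('unit_spider', district_key='HK-CENTRAL')
process.start()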
Example #4
 def __init__(self, category=None, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.addr_district = None
     self.save_end_log = False
     is_finished = SpiderLogService.is_spider_finished(self.name)
     if is_finished:
         self.log("%s is already finished!" % self.name)
         self.start_urls = []
     else:
         SpiderLogService.save_spider_log(DistrictSpider.name,
                                          DistrictSpider.start_time,
                                          self.addr_district, 'START')
         self.save_end_log = True
         self.start_urls = utils.init_start_district_Urls()
         # Save the cached zone list to the database.
         zones = utils.cache_zones
         for z in zones:
             zone = Zone()
             zone['id'] = z['id']
             zone['name'] = z['name']
             zone.save()
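Zone is not defined in these examples; the dict-style field assignment plus save() suggests an ORM-backed item. A minimal stand-in consistent with that usage (entirely an assumption):

import scrapy

class Zone(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()

    def save(self):
        # scrapy.Item has no save(); the project presumably adds its own
        # persistence (e.g. an INSERT through its DB service). Stubbed here.
        pass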
Example #5
    def check_building_phase(self, response):
        building = response.meta["building"]
        options = response.xpath("//select/option")
        # The phase query returned no matching building; save this building
        # without phase info:
        if len(building['estate_key']) > 0 and building['building_name'] not in [
                op.xpath("text()").extract()[0].strip() for op in options]:

            building_db = SpiderLogService.query_building(
                building['district_key'], building['street_key'],
                building['street_no_key'], building['estate_key'],
                building['building_key'], building['building_name'])

            # If a row already exists in the database, it was inserted before.
            if building_db is None:
                building_5 = utils.copy_value_from_item(building, Building)
                building_5['phase_key'] = ''
                building_5['phase_name'] = ''

                building_5['building_name'] = building_5['building_name'].strip()
                full_name_reverse, full_name = utils.get_building_full_name(
                    building_5)
                building_5['full_name_reverse'] = full_name_reverse
                building_5['full_name'] = full_name
                yield building_5
                next_url = floor_url.format(
                    district=building_5['district_key'],
                    building=building_5['building_key'].replace('&', '%26'),
                    street=building_5['street_key'],
                    estate=building_5['estate_key'],
                    phase=building_5['phase_key'].replace('&', '%26'),
                    strno=building_5['street_no_key'],
                    sid=random.random())
                yield scrapy.Request(next_url,
                                     callback=self.parse_floor,
                                     meta={"building": building_5})
        else:
            for op in options:
                op_value = op.xpath("@value").extract()[0]
                if len(op_value) > 0:
                    if (building['building_key'] == op_value
                            and building['building_name']
                            == op.xpath("text()").extract()[0].strip()):
                        building_db = SpiderLogService.query_building(
                            building['district_key'], building['street_key'],
                            building['street_no_key'], building['estate_key'],
                            building['building_key'],
                            building['building_name'])

                        if building_db is not None:
                            # An empty phase key means this building matches the
                            # phase condition, so update it with the phase info.
                            if not building_db.phase_key:
                                building_db.phase_key = building['phase_key']
                                building_db.phase_name = building['phase_name']
                                full_name_reverse, full_name = utils.get_building_full_name(
                                    building)
                                building_db.full_name_reverse = full_name_reverse
                                building_db.full_name = full_name
                                building_db.save()

                                next_url = floor_url.format(
                                    district=building['district_key'],
                                    building=building['building_key'].replace(
                                        '&', '%26'),
                                    street=building['street_key'],
                                    estate=building['estate_key'],
                                    phase=building['phase_key'].replace(
                                        '&', '%26'),
                                    strno=building['street_no_key'],
                                    sid=random.random())
                                yield scrapy.Request(
                                    next_url,
                                    callback=self.parse_floor,
                                    meta={"building": building})
                            elif (building_db.phase_key == building['phase_key']
                                  and building_db.phase_name
                                  == building['phase_name']):
                                # Same phase already stored; nothing more to do.
                                return
                            else:
                                building_5 = utils.copy_value_from_item(
                                    building, Building)
                                building_5['building_name'] = (
                                    building_5['building_name'].strip())
                                full_name_reverse, full_name = utils.get_building_full_name(
                                    building_5)
                                building_5['full_name_reverse'] = full_name_reverse
                                building_5['full_name'] = full_name
                                yield building_5
                                next_url = floor_url.format(
                                    district=building_5['district_key'],
                                    building=building_5['building_key'].replace('&', '%26'),
                                    street=building_5['street_key'],
                                    estate=building_5['estate_key'],
                                    phase=building_5['phase_key'].replace(
                                        '&', '%26'),
                                    strno=building_5['street_no_key'],
                                    sid=random.random())
                                yield scrapy.Request(
                                    next_url,
                                    callback=self.parse_floor,
                                    meta={"building": building_5})
                        else:
                            building_5 = utils.copy_value_from_item(
                                building, Building)
                            building_5['building_name'] = building_5['building_name'].strip()
                            full_name_reverse, full_name = utils.get_building_full_name(
                                building_5)
                            building_5['full_name_reverse'] = full_name_reverse
                            building_5['full_name'] = full_name
                            yield building_5
                            next_url = floor_url.format(
                                district=building_5['district_key'],
                                building=building_5['building_key'].replace(
                                    '&', '%26'),
                                street=building_5['street_key'],
                                estate=building_5['estate_key'],
                                phase=building_5['phase_key'].replace(
                                    '&', '%26'),
                                strno=building_5['street_no_key'],
                                sid=random.random())
                            yield scrapy.Request(next_url,
                                                 callback=self.parse_floor,
                                                 meta={"building": building_5})
Example #6
import argparse
import logging

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import defer, reactor, task

parser = argparse.ArgumentParser(description='manual to this script')
parser.add_argument('--dkey', type=str, default=None)
args = parser.parse_args()
district_key = args.dkey
print("crawl data by param :district_key = %s" % district_key)

# Print log output to the console
configure_logging()
# CrawlerRunner picks up the project settings from settings.py
runner = CrawlerRunner(get_project_settings())

from hkpost_scrapy.service.db_service import SpiderLogService

district_list = []
if district_key is None:
    district_list = SpiderLogService.get_all_district()
else:
    district_list.append(SpiderLogService.query_district(district_key))


@defer.inlineCallbacks
def crawl_address():
    if district_list:
        for district in district_list:
            logging.info("new cycle starting")
            yield runner.crawl("building_spider",
                               district_key=district.district_key)
            # Pause between crawls without blocking the Twisted reactor.
            yield task.deferLater(reactor, 3, lambda: None)
            yield runner.crawl("unit_spider",
                               district_key=district.district_key)
            yield task.deferLater(reactor, 3, lambda: None)
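As shown, the script never starts the Twisted reactor, so nothing would actually run. The standard CrawlerRunner completion (the same pattern as the Scrapy docs; names as imported above):

crawl_address().addBoth(lambda _: reactor.stop())
reactor.run()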