Beispiel #1
0
    def periodic_parse_list(self, response):
        """Parse one list-page response and schedule the follow-up requests.

        Houses are taken from both ``topData`` and ``data``.  Each one is
        tagged with ``is_vip`` (true when the entry has no ``id`` key).  A
        non-VIP house whose ``updatetime`` is older than ``self.epoch_ago``
        only marks the page as outdated; every other house gets a detail
        request and is counted for its city when that city is tracked in
        ``self.count_per_city``.

        The next list page is requested only while the regular ``data`` list
        is non-empty and no outdated house was seen; otherwise the per-city
        total is logged.

        Args:
            response: list response whose ``text`` is a JSON document and
                whose ``meta['rental']`` holds the list request meta
                (``id``, ``name`` and ``page`` are read from it).

        Yields:
            One detail request per house to crawl, then at most one request
            for the next list page.
        """
        data = json.loads(response.text)
        meta = response.meta['rental']

        regular_houses = data['data']['data']
        houses = data['data']['topData'] + regular_houses
        has_outdated = False

        for house in houses:
            house['is_vip'] = 'id' not in house

            # updatetime == creation time in 591...
            if not house['is_vip'] and house['updatetime'] < self.epoch_ago:
                has_outdated = True
                continue

            house_item = self.gen_shared_attrs(house, meta)
            # send non-gps request first as it may be closed soon
            yield self.gen_detail_request(
                util.DetailRequestMeta(house_item['vendor_house_id'], False))
            if meta.name in self.count_per_city:
                self.count_per_city[meta.name] += 1

        if regular_houses and not has_outdated:
            # only goto next page when there's response and not outdated
            yield self.gen_list_request(
                util.ListRequestMeta(meta.id, meta.name, meta.page + 1))
        else:
            # Cities missing from count_per_city are never counted above, so
            # read the total with a default instead of raising KeyError here.
            total = self.count_per_city.get(meta.name, 0)
            logging.info(f'[{meta.name}] total {total} house to crawl!')
Beispiel #2
0
    def periodic_parse_list(self, response):
        """Parse one list-page response and schedule the follow-up requests.

        Only the regular ``data`` list is used (the AD list is ignored, see
        the reference below).  A house whose ``updatetime`` is older than
        ``self.epoch_ago`` only marks the page as outdated; every other house
        gets a detail request and is counted for its city when that city is
        tracked in ``self.count_per_city``.

        The next list page is requested only while the list is non-empty and
        no outdated house was seen; otherwise the per-city total is logged.

        Args:
            response: list response whose ``text`` is a JSON document and
                whose ``meta['rental']`` holds the list request meta
                (``id``, ``name`` and ``page`` are read from it).

        Yields:
            One detail request per house to crawl, then at most one request
            for the next list page.
        """
        data = json.loads(response.text)
        meta = response.meta['rental']

        # per discussion in #8, we don't need AD list at all
        # ref: https://github.com/rentea-tw/rentea-crawler/issues/8#issuecomment-558021819
        houses = data['data']['data']
        has_outdated = False

        for house in houses:
            # updatetime == creation time in 591...
            if house['updatetime'] < self.epoch_ago:
                has_outdated = True
                continue

            house_item = self.gen_shared_attrs(house, meta)
            # send non-gps request first as it may be closed soon
            yield self.gen_detail_request(util.DetailRequestMeta(
                house_item['vendor_house_id'],
                False
            ))
            if meta.name in self.count_per_city:
                self.count_per_city[meta.name] += 1

        if houses and not has_outdated:
            # only goto next page when there's response and not outdated
            yield self.gen_list_request(util.ListRequestMeta(
                meta.id,
                meta.name,
                meta.page + 1
            ))
        else:
            # Cities missing from count_per_city are never counted above, so
            # read the total with a default instead of raising KeyError here.
            total = self.count_per_city.get(meta.name, 0)
            logging.info(f'[{meta.name}] total {total} house to crawl!')
Beispiel #3
0
    def parse_main_response(self, response):
        """Turn the parent's main-response items into follow-up detail requests.

        Only ``GenericHouseItem`` results of the parent implementation are
        considered; anything else it yields is dropped.  An item whose
        ``deal_status`` is ``DealStatusType.NOT_FOUND`` is passed through
        unchanged.  Every other item is attached, under
        ``meta['main_item']``, to a new ``Request`` whose default callback is
        ``self.parse_detail``.

        Args:
            response: the response handed on to the parent implementation.

        Yields:
            Either the untouched not-found item or a ``Request`` carrying the
            item in its meta.
        """
        for parsed in super().parse_main_response(response):
            # Skip original logic about GPS request generation
            if not isinstance(parsed, GenericHouseItem):
                continue

            if parsed['deal_status'] == DealStatusType.NOT_FOUND:
                yield parsed
                continue

            # Got an item that contains GPS!
            request_kwargs = {'callback': self.parse_detail}
            detail_meta = util.DetailRequestMeta(parsed['vendor_house_id'], True)
            # Entries from gen_detail_request_args are applied last, so they
            # take precedence over the default callback set above.
            request_kwargs.update(self.gen_detail_request_args(detail_meta))

            # NOTE(review): relies on gen_detail_request_args providing a
            # mutable 'meta' mapping -- confirm against its definition.
            request_kwargs['meta']['main_item'] = parsed
            yield Request(**request_kwargs)
Beispiel #4
0
    def count_and_parse_list(self, response):
        """Parse a list response, fan out list pages, and emit new houses.

        On the first page (``meta.page == 0``) the total record count is read
        from ``data['records']`` and logged, the page size is taken from the
        number of entries on this page (also stored in ``self.N_PAGE``), and
        a list request is yielded for every further page needed to cover the
        count.

        For every house on the page (with ``topData`` prepended unless
        ``self.novip`` is set) a ``HouseStats`` row is looked up or created
        for the current job.  Houses whose row already existed are skipped;
        new ones yield their raw item, their generic item and a detail
        request.

        Args:
            response: list response whose ``text`` is a JSON document and
                whose ``meta['rental']`` holds the list request meta
                (``id``, ``name`` and ``page`` are read from it).

        Yields:
            List requests for the remaining pages (first page only), then a
            ``RawHouseItem``, a ``GenericHouseItem`` and a detail request for
            each newly seen house.
        """
        meta = response.meta['rental']
        data = json.loads(response.text)

        if meta.page == 0:
            count = clean_number(data['records'])
            logging.info(f'[{meta.name}] total {count} house to crawl!')

            # #items return per request may differ from API endpoint
            page_size = len(data['data']['data'])
            self.N_PAGE = page_size

            # generate all list request as now we know number of result.
            # The loop uses the local page size so a later change of
            # self.N_PAGE cannot alter it while this generator is consumed,
            # and it is skipped for an empty first page, where a page size
            # of 0 would make the loop never terminate.
            if page_size > 0:
                cur_page = 1
                while cur_page * page_size < count:
                    yield self.gen_list_request(
                        util.ListRequestMeta(meta.id, meta.name, cur_page))
                    cur_page += 1

        houses = data['data']['data']

        if not self.novip:
            houses = data['data']['topData'] + houses

        for house in houses:
            # copy from twrh
            house['is_vip'] = 'id' not in house
            house_item = self.gen_shared_attrs(house, meta)
            house_id = house_item['vendor_house_id']

            _stats, created = HouseStats.get_or_create(
                job_id=self.job.id, house_id=house_id)

            # A pre-existing row means this house was already recorded for
            # this job, so nothing is emitted for it again.
            if not created:
                continue

            yield RawHouseItem(house_id=house_id,
                               vendor=self.vendor,
                               is_list=True,
                               raw=json.dumps(house, ensure_ascii=False))
            yield GenericHouseItem(**house_item)
            yield self.gen_detail_request(
                util.DetailRequestMeta(house_id, False))
 def parse_seed(self, seed):
     """Build a ``util.DetailRequestMeta`` by unpacking ``seed`` positionally."""
     meta_factory = util.DetailRequestMeta
     return meta_factory(*seed)