def parse_park(self, response):
    """Parse the place-data JSON of one park and fan out per-facility requests.

    The raw response is first handed to ``save_park_json``.  A ``ParkItem``
    is then built from the first entry of ``d.ListJsonPlaceInfos`` and
    yielded.  After that, for every entry of that park's
    ``JsonFacilityInfos`` (taken from the end of the list first):

    * unless ``get_env("CA_NOT_CRAWL_CAMPSITES")`` is truthy, a POST is
      yielded that sets the place/facility under a new ``cookiejar`` value;
      its response goes to ``after_set_park_facility``;
    * unless ``get_env("CA_NOT_CRAWL_RESERVATIONS")`` is truthy, a POST is
      yielded for the facility's reservations, reusing the ``cookiejar`` of
      the incoming response; its response goes to
      ``parse_campsites_reservations``.

    If the response lists no places at all, a warning is logged and nothing
    is yielded (previously this raised ``IndexError``).
    """
    self.save_park_json(response)

    # The body is UTF-8 encoded JSON.
    park_dict = json.loads(codecs.decode(response.body, 'utf8'))
    place_infos = park_dict['d']['ListJsonPlaceInfos']
    if not place_infos:
        logging.warning("parse_park: no place info in response from %s",
                        response.url)
        return
    park_info = place_infos[0]

    # parse park information
    park_item = ParkItem()
    park_item['name'] = park_info["Name"]
    park_item['parkId'] = park_info["PlaceId"]
    park_item['contractCode'] = "ca"
    park_item['_id'] = '%s::%s' % (park_item['parkId'], park_item['contractCode'])
    park_item['url'] = park_info["PlaceinfoUrl"]
    park_item['services'] = park_info["AllHightlights"].split(",")
    yield park_item

    # get each campsite group; the truthiness test also tolerates a null list
    facility_infos = park_info['JsonFacilityInfos']
    while facility_infos:
        facility = facility_infos.pop()
        facility_id = facility['FacilityId']
        place_id = facility['PlaceId']

        if not self.get_env("CA_NOT_CRAWL_CAMPSITES"):
            body = set_night_by_place_id_and_facility_id_on_unit_grid.copy()
            body['placeId'] = place_id
            body['facilityId'] = facility_id
            # Each facility gets its own cookiejar value.
            self.cookie_index += 1
            # step 4: click reserve button, first set night by place id and facility id
            yield Request(
                url=unique_url(self.url_set_by_place_id_facility_id),
                method="POST",
                body=json.dumps(body),
                meta={'cookiejar': self.cookie_index,
                      'FacilityId': facility_id,
                      'PlaceId': place_id},
                dont_filter=True,
                headers={'Content-Type': 'application/json; charset=UTF-8'},
                callback=self.after_set_park_facility,
            )

        if not self.get_env("CA_NOT_CRAWL_RESERVATIONS"):
            campsite_list_body = campsites_reservations_post_body.copy()
            campsite_list_body['FacilityId'] = facility_id
            campsite_list_body['PlaceId'] = place_id
            # step 7: get campsites reservations in each campsite group
            yield Request(
                url=unique_url(self.url_campsites_reservations),
                method="POST",
                meta={'cookiejar': response.meta['cookiejar'],
                      'FacilityId': facility_id,
                      'PlaceId': place_id},
                body=json.dumps(campsite_list_body),
                dont_filter=True,
                headers={'Content-Type': 'application/json'},
                callback=self.parse_campsites_reservations,
            )
def index_page(self, response):
    """Request the park information for the park named in the response meta.

    Copies the ``post_body_park_info_by_name`` template, sets its ``name``
    entry to ``response.meta['parkName']`` and yields a JSON POST that keeps
    the incoming ``cookiejar`` value.  The response is handled by
    ``set_select_park``.
    """
    payload = post_body_park_info_by_name.copy()
    payload['name'] = response.meta['parkName']

    # step 2: use park name get park information
    lookup_request = Request(
        url=unique_url(self.url_get_park_info_by_name),
        meta={'cookiejar': response.meta['cookiejar']},
        method="POST",
        body=json.dumps(payload),
        headers={'Content-Type': 'application/json; charset=UTF-8'},
        dont_filter=True,
        callback=self.set_select_park,
    )
    yield lookup_request
def after_set_park_facility(self, response):
    """Submit the advance-search form for the facility carried in the meta.

    Copies the ``advance_search_form`` template, fills in the facility id,
    the place id and ``self.first_date`` formatted as ``MM/DD/YYYY``, and
    yields a ``FormRequest`` whose response is handled by
    ``parse_campsite_list``.  ``cookiejar``, ``FacilityId`` and ``PlaceId``
    are forwarded unchanged in the request meta.
    """
    incoming = response.meta
    facility_id = incoming['FacilityId']
    place_id = incoming['PlaceId']

    # step 6: get campsites by click facility
    form_data = advance_search_form.copy()
    form_data.update({
        'ctl01$mainContent$hdnFacilityid': str(facility_id),
        'ctl01$mainContent$hdnPlaceid': str(place_id),
        'ctl01$mainContent$txtDateRange': self.first_date.strftime('%m/%d/%Y'),
    })

    yield FormRequest(
        url=unique_url(self.url_advance_search),
        meta={'cookiejar': incoming['cookiejar'],
              'FacilityId': facility_id,
              'PlaceId': place_id},
        formdata=form_data,
        callback=self.parse_campsite_list,
    )
def home_page(self, response):
    """Request the place data for the park stored in ``response.meta['park']``.

    Builds the JSON payload from the ``park_post_body`` template, filling
    ``Latitude``, ``Longitude`` and ``MapboxPlaceid`` (taken from the park's
    ``CityParkId``) inside ``googlePlaceSearchParameters``, and yields a POST
    that keeps the incoming ``cookiejar`` value.  The response is handled by
    ``parse_park``.
    """
    park = response.meta['park']

    # dict.copy() is shallow: the nested search-parameter dict would still be
    # the template's own object, so writing into it would modify the shared
    # module-level template.  Copy that level as well before filling it in.
    body = park_post_body.copy()
    search_params = dict(body['googlePlaceSearchParameters'])
    search_params['Latitude'] = str(park['Latitude'])
    search_params['Longitude'] = str(park['Longitude'])
    search_params['MapboxPlaceid'] = str(park['CityParkId'])
    body['googlePlaceSearchParameters'] = search_params

    # step 5: click reserve button, get google map place data
    yield Request(
        url=unique_url(self.url_get_google_map_place_data),
        method="POST",
        meta={'cookiejar': response.meta['cookiejar']},
        body=json.dumps(body),
        headers={'Content-Type': 'application/json; charset=UTF-8'},
        dont_filter=True,
        callback=self.parse_park,
    )
def set_select_park(self, response):
    """Select the first park returned by the park-name lookup.

    Decodes the UTF-8 JSON response, takes the first entry of its ``d``
    list, and posts the ``web_home`` form with the arrival date
    (``self.first_date`` as ``MM/DD/YYYY``) and the park's ``CityParkId``.
    The chosen park dict is forwarded as ``meta['park']`` together with the
    incoming ``cookiejar`` value; the response is handled by ``home_page``.

    If the lookup returned no parks, a warning is logged and nothing is
    yielded (previously this raised ``IndexError``).
    """
    parks = json.loads(codecs.decode(response.body, 'utf8'))
    matches = parks['d']
    if not matches:
        logging.warning("set_select_park: park lookup returned no results (%s)",
                        response.url)
        return
    park = matches[0]

    form_data = web_home.copy()
    form_data['ctl00$ctl00$mainContent$txtArrivalDate'] = \
        self.first_date.strftime('%m/%d/%Y')
    form_data['ctl00$ctl00$mainContent$hdnMasterPlaceId'] = str(park['CityParkId'])

    # step 3: set select park
    yield FormRequest(
        url=unique_url(self.url_webhome),
        meta={'cookiejar': response.meta['cookiejar'], 'park': park},
        method="POST",
        formdata=form_data,
        dont_filter=True,
        callback=self.home_page,
    )
def start_requests(self):
    """Yield one home-page request per park returned by ``get_crawl_parks``.

    When ``get_env("DEBUG")`` is truthy, the ``./receives`` directory is
    removed (if present) and recreated first.  The park list is logged at
    debug level, then consumed from its end; every request carries the
    current ``self.cookie_index`` as its ``cookiejar`` and the park's name
    as ``parkName``.  Responses are handled by ``index_page``.
    """
    if self.get_env("DEBUG"):
        dump_dir = './receives'
        if os.path.exists(dump_dir):
            shutil.rmtree(dump_dir)
        os.makedirs(dump_dir)

    crawl_parks = self.get_crawl_parks()
    separator = "======================================="
    logging.debug(separator)
    logging.debug("crawl_parks: %s", json.dumps(crawl_parks))
    logging.debug(separator)

    while crawl_parks:
        park = crawl_parks.pop()
        # step 1: Go to reserve california home page
        home_request = Request(
            url=unique_url(self.url_default),
            meta={'cookiejar': self.cookie_index,
                  'parkName': park['name']},
            dont_filter=True,
            callback=self.index_page,
        )
        yield home_request
        # Runs only once the generator is resumed; re-read the attribute
        # rather than a cached value, since it may have changed meanwhile.
        self.cookie_index += 1
def parse_campsite_list(self, response):
    """Yield one campsite-detail request per unit row of the search result.

    The response is first handed to ``save_campsite_list_html``.  Each
    extracted ``onclick`` value is turned into a reservation item by
    ``parse_campsite_from_url_link``; the item's facility id, site id, date
    and availability (status equal to ``'a'``) are substituted into
    ``self.url_template_campsite``.  ``cookiejar``, ``FacilityId`` and
    ``PlaceId`` are forwarded in the meta together with the ``SiteId``;
    responses are handled by ``parse_campsite``.
    """
    self.save_campsite_list_html(response)

    meta = response.meta
    onclick_values = response.xpath(
        '//div[@id="divUnitGridlist"]/div/table/tr[@class="unitdata"]/td[2]/@onclick'
    ).extract()

    for onclick in onclick_values:
        reservation_item = self.parse_campsite_from_url_link(
            onclick, meta['PlaceId'], meta['FacilityId'])
        is_available = reservation_item['status'] == 'a'
        url = self.url_template_campsite % (
            reservation_item['facilityId'],
            reservation_item['siteId'],
            reservation_item['date'],
            is_available,
        )
        # step 7: get each campsite information
        yield Request(
            url=unique_url(url),
            meta={'cookiejar': meta['cookiejar'],
                  'FacilityId': meta['FacilityId'],
                  'PlaceId': meta['PlaceId'],
                  'SiteId': reservation_item['siteId']},
            dont_filter=True,
            callback=self.parse_campsite,
        )