def process_store(self, store):
    """Yield one ``hourstudy`` record built from a store page.

    Text nodes under the ``col-lg-4`` column (excluding ``h2``/``strong``
    children) are stripped, emptied entries are dropped and anything listed
    in ``SIEVE`` is filtered out.  The remaining list is then read
    positionally: entry 1 is used as the street address and entry 2 is split
    into city / state / postcode.
    """
    raw_texts = store.xpath(
        '//div[@class="col-lg-4"]/div/*[not(self::h2 or self::strong)]//text()'
    ).extract()
    stripped = (text.strip() for text in raw_texts)
    final_data = [text for text in stripped if text and text not in SIEVE]

    # Entry 2 must look like "<city>,<state> <postcode>"; anything else
    # raises ValueError from the tuple unpacking.
    city, state_zip = final_data[2].split(',')
    state, pcode = state_zip.strip().split()

    # The value for a label is the entry that immediately follows it.
    phone = ''
    if 'Phone Number' in final_data:
        phone = final_data[final_data.index('Phone Number') + 1]

    opening_hours = ''
    if 'Store Hours' in final_data:
        hours_at = final_data.index('Store Hours') + 1
        opening_hours = self.parse_hours(final_data[hours_at])

    props = {
        'addr_full': final_data[1],
        'ref': store.url,
        'city': city,
        'postcode': pcode,
        'state': state,
        'website': store.url,
        'opening_hours': opening_hours,
        'phone': phone,
    }
    yield hourstudy(**props)
def parse_stores(self, response):
    """Yield ``hourstudy(raw, formatted)`` for a store JSON response.

    Nothing is yielded when the payload has no ``address1`` key or when
    ``self.parse_hours`` returns a falsy value for the ``hours`` field.
    """
    store_id = response.meta['id']
    payload = json.loads(response.body_as_unicode())
    if 'address1' not in payload:
        return

    # NOTE(review): this dict is populated but not passed to hourstudy();
    # it is kept so the same lookups/conversions still run.
    properties = {
        'addr_full': payload['address1'],
        'phone': payload['phoneNumber'],
        'city': payload['city'],
        'state': payload['stateCode'],
        'postcode': payload['postalCode'],
        'ref': store_id,
        'website': "http://www.acehardware.com/mystore/index.jsp?store=" + store_id,
        'lat': float(payload['latitude']),
        'lon': float(payload['longitude']),
    }

    formatted = self.parse_hours(payload['hours'])
    raw = payload['hours']
    if not formatted:
        return

    properties['opening_hours'] = formatted
    yield hourstudy(raw, formatted)
def parse(self, response):
    """Yield one ``hourstudy`` record per entry of the ``Stores`` list.

    Each store dict is consumed with ``pop``: latitude, longitude and id are
    taken directly, the fields named in ``NORMALIZE_KEYS`` are joined with
    ``", "`` under their new key, and ``Hours`` is passed through
    ``normalize_time``.  ``opening_hours`` is only set when that result is
    truthy.

    A payload whose ``Stores`` key is missing or null yields nothing.
    """
    data = json.loads(response.body_as_unicode())

    # ``or []`` covers both a missing key and an explicit JSON null.
    for store in data.get('Stores') or []:
        # A fresh dict per store: reusing one dict across iterations would
        # let an earlier store's 'opening_hours' leak into a later store
        # that has none.
        props = {
            'lat': store.pop('Latitude', None),
            'lon': store.pop('Longitude', None),
            'ref': store.pop('StoreID', None),
            'website': URL,
        }

        for new_key, old_keys in NORMALIZE_KEYS:
            # store.get() skips absent keys instead of raising KeyError,
            # matching the tolerant pop() default used for the value.
            props[new_key] = ", ".join([
                store.pop(key, '').strip()
                for key in old_keys
                if store.get(key)
            ])

        opening_hours = normalize_time(store.pop('Hours', ''))
        if opening_hours:
            props['opening_hours'] = opening_hours

        # Drop a raw 'Hours' entry should NORMALIZE_KEYS have produced one.
        props.pop('Hours', None)
        yield hourstudy(**props)
def parse_us(self, response):
    """Yield one ``hourstudy`` record per entry of the ``stores`` mapping.

    The mapping key is used as ``ref``.  ``opening_hours`` is added only
    when ``storeHours`` is present, non-empty, does not contain
    "Please call", and ``self.store_hours`` returns a truthy value for it.
    """
    payload = json.loads(response.body_as_unicode())
    stores = payload['stores']

    for store_key in stores:
        info = stores[store_key]
        properties = {
            'phone': info['phone'],
            'addr_full': info['address1'].title(),
            'city': info['city'].title(),
            'state': info['stateCode'],
            'postcode': info['postalCode'],
            'lon': float(info['longitude']),
            'lat': float(info['latitude']),
            'ref': store_key,
        }

        opening_hours = None
        if 'storeHours' in info:
            hours = info['storeHours']
            if hours and "Please call" not in hours:
                opening_hours = self.store_hours(hours)

        if opening_hours:
            properties['opening_hours'] = opening_hours

        yield hourstudy(**properties)
def parse(self, response):
    """Yield one ``hourstudy`` record per item in ``d.results``.

    The response body is parsed as JSON; latitude and longitude are
    converted to ``float`` and the two address lines are combined by
    ``self.parseAddr``.  ``StoreHours`` is passed through unchanged.
    """
    payload = json.loads(response.body_as_unicode())

    for entry in payload['d']['results']:
        fields = {
            'ref': entry['EntityID'],
            'lat': float(entry['Latitude']),
            'lon': float(entry['Longitude']),
            'addr_full': self.parseAddr(entry['Address1'], entry['Address2']),
            'city': entry['Locality'],
            'state': entry['AdminDistrict'],
            'postcode': entry['PostalCode'],
            'name': entry['MallName'],
            'phone': entry['Phone'],
            'opening_hours': entry['StoreHours'],
        }
        yield hourstudy(**fields)
def parse_stores(self, response):
    """Return an ``hourstudy`` record for a single store page.

    Address fields come from ``<meta>`` tags; coordinates and opening hours
    come from the first object of the JSON found via ``SCRIPT_JSON``.  The
    ``openingHours`` string is split on whitespace (after collapsing
    " - " to "-"), a trailing colon is removed from every token, and
    consecutive tokens are paired as "<first> <second>" joined by "; ".
    """
    app_json = json.loads(response.xpath(SCRIPT_JSON).extract_first())

    tokens = [
        re.sub(r'[:]$', '', token)
        for token in app_json[0]['openingHours'].replace(' - ', '-').split()
    ]
    # Pair token 0 with 1, 2 with 3, ...; an unpaired last token is dropped.
    pairs = zip(tokens[0::2], tokens[1::2])

    def meta(name):
        return response.xpath(
            '//meta[@name="' + name + '"]/@content').extract_first()

    props = {
        'addr_full': meta('address'),
        'phone': response.xpath(PHONE).extract_first(),
        'city': meta('city'),
        'state': meta('state'),
        'postcode': meta('zip'),
        'lat': float(app_json[0]['geo']['latitude']),
        'lon': float(app_json[0]['geo']['longitude']),
        'opening_hours': "; ".join(
            ['{} {}'.format(first, second) for first, second in pairs]),
        'ref': response.url,
        'website': response.url,
    }
    return hourstudy(**props)
def parse(self, response):
    """Yield one ``hourstudy`` record per store in the page's ``var stores``.

    The fourth ``<script>`` text node (index 3) is searched for the
    ``var stores = ...;`` assignment and the captured text is parsed as
    JSON.  An ``IndexError`` is raised when the script or the match is
    absent.
    """
    matches = response.xpath('//script/text()')[3].re("var stores =(.+?);\n")
    captured = [str(match) for match in matches]
    stores = json.loads(captured[0])

    for store in stores:
        fields = {
            'ref': store.get('_id'),
            'lat': float(store.get('latitude')),
            'lon': float(store.get('longitude')),
            'addr_full': store.get('address'),
            'city': store.get('city'),
            'state': store.get('state'),
            'postcode': store.get('zip'),
            'website': 'https://www.superonefoods.com/store-details/' + store.get('url'),
        }
        yield hourstudy(**fields)
def parse(self, response):
    """Yield one ``hourstudy`` record per ``response.properties.property``.

    ``lat_long`` is expected as two numbers separated by ", ".  The
    opening-hours string is built from every ``<time itemprop="openingHours">``
    ``datetime`` attribute found in the response and is recomputed for each
    store.
    """
    payload = json.loads(response.body_as_unicode())

    for store in payload['response']['properties']['property']:
        coordinates = store['lat_long'].split(', ')
        lat, lon = [float(part) for part in coordinates]

        hour_values = response.xpath(
            '//time[@itemprop="openingHours"]/@datetime').extract()

        properties = {
            "ref": store.get('property_id'),
            "opening_hours": '; '.join(hour_values),
            "addr_full": store.get('address'),
            "city": store.get('city'),
            "state": store.get('state'),
            "postcode": store.get('zip'),
            "lat": lat,
            "lon": lon,
        }
        yield hourstudy(**properties)
def parse_store(self, response):
    """Yield an ``hourstudy`` record from a page's second ld+json script.

    A stray ``//`` comment line embedded in the script text is removed
    before parsing, since it would not be valid JSON.  Address fields
    returned by ``self.address`` are merged in when that result is truthy.
    """
    ld_scripts = response.xpath(
        '//head/script[@type="application/ld+json"]/text()')
    script_text = ld_scripts[1].extract()

    stray_comment = '// if the location file does not have the hours separated into open/close for each day, remove the below section'
    data = json.loads(script_text.replace(stray_comment, ''))

    canonical = response.xpath('//head/link[@rel="canonical"]/@href')
    properties = {
        'phone': data['telephone'],
        'website': canonical[0].extract(),
        'ref': data['@id'],
        'opening_hours': self.store_hours(data['openingHoursSpecification']),
        'lon': float(data['geo']['longitude']),
        'lat': float(data['geo']['latitude']),
    }

    address = self.address(data['address'])
    if address:
        properties.update(address)

    yield hourstudy(**properties)
def parse(self, response):
    """Yield one ``hourstudy`` record per ``location-listing-item`` row.

    Every field is read relative to the row being processed.  The span
    lookups use ``.//`` rather than ``//``: an XPath starting with ``//``
    is evaluated against the whole document even when called on a
    sub-selector, which would give every row the values of the first
    matching span on the page.

    Raises ``AttributeError`` when a row lacks one of the expected spans
    (``extract_first()`` returns ``None``).
    """
    def span_text(node, css_class):
        # Relative lookup of the first text node under <span class=...>.
        return node.xpath(
            ".//span[@class='" + css_class + "']//text()"
        ).extract_first().strip()

    rows = response.xpath('.//div[@class="location-listing-item row"]')
    for store in rows:
        ref = self.parse_Ref(store)
        properties = {
            'ref': ref,
            'addr_full': span_text(store, 'address'),
            'city': span_text(store, 'city'),
            'state': span_text(store, 'state'),
            'postcode': span_text(store, 'zip'),
            'phone': span_text(store, 'phone'),
            'name': store.xpath(".//strong//text()").extract_first().strip(),
            'lon': store.xpath("@data-lon").extract_first(),
            'lat': store.xpath("@data-lat").extract_first(),
        }
        yield hourstudy(**properties)
def parse_stores(self, response):
    """Yield one ``hourstudy`` record per store in the JSON array body.

    All fields are read with ``dict.get`` so absent keys become ``None``.
    The phone number is passed through ``self._clean_text`` and
    ``warehouseHours`` through ``self.store_hours``.
    """
    website_template = 'https://www.costco.com/warehouse-locations/store-{}.html'

    for store in json.loads(response.body_as_unicode()):
        props = {
            'lat': store.get('latitude'),
            'lon': store.get('longitude'),
            'ref': store.get('identifier'),
            'phone': self._clean_text(store.get('phone')),
            'name': store.get('displayName'),
            'addr_full': store.get('address1'),
            'city': store.get('city'),
            'state': store.get('state'),
            'postcode': store.get('zipCode'),
            'country': store.get('country'),
            'website': website_template.format(store.get('identifier')),
            'opening_hours': self.store_hours(store.get('warehouseHours')),
        }
        yield hourstudy(**props)
def parse(self, response):
    """Yield one ``hourstudy`` record per GeoJSON-style feature.

    Coordinates are read as ``[lon, lat]`` from ``geometry.coordinates``.
    Opening hours are best-effort: if ``self.store_hours`` raises, the
    failure is logged with its traceback and the record is still yielded
    without ``opening_hours``.
    """
    data = json.loads(response.body_as_unicode())

    for store in data.get('features', []):
        store_info = store['properties']
        properties = {
            "ref": store_info['id'],
            'addr_full': store_info['addressLine1'],
            'city': store_info['addressLine3'],
            'state': store_info['subDivision'],
            'country': store_info['addressLine4'],
            'postcode': store_info['postcode'],
            'phone': store_info.get('telephone'),
            'lon': store['geometry']['coordinates'][0],
            'lat': store['geometry']['coordinates'][1],
        }

        hours = store_info.get('restauranthours')
        try:
            parsed_hours = self.store_hours(hours)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            self.logger.exception("Couldn't process opening hours: %s", hours)
        else:
            if parsed_hours:
                properties['opening_hours'] = parsed_hours

        yield hourstudy(**properties)
def parse(self, response):
    """Yield one ``hourstudy`` record per entry of ``staticLocations``.

    The locations are embedded as a JavaScript object literal, not JSON.
    The text between ``staticLocations=`` and the closing marker is cut out
    with ``find_between`` and rewritten step by step (minified booleans,
    escaped CR/LF, unquoted keys) until ``json.loads`` accepts it.
    """
    body = response.body_as_unicode()
    fragment = find_between(body, 'staticLocations=', "shown:!0}}]}")

    # Re-append the part of the end marker that belongs to the array.
    js_obj = fragment + "shown:!0}}]"

    # Order matters: each step works on the previous step's output.
    for old, new in (
            ('!0', 'true'),
            ('!1', 'false'),
            (',hours:', ',"hours":'),
            ('\\r\\n', '; ')):
        js_obj = js_obj.replace(old, new)

    # Put a space after "<word>:" when a time follows, so the key-quoting
    # pass below does not treat it as a key.
    js_obj = re.sub(r'(-?[a-zA-Z]+):(\d{1,2}:)', r'\1: \2', js_obj)
    # Quote bare lowercase keys (not followed by "/" or a space).
    cleaner_js = re.sub(r'([a-z_]+):([^/ ])', r'"\1":\2', js_obj)

    for entry in json.loads(cleaner_js):
        loc = entry['location']
        properties = {
            'ref': loc['id'],
            'name': loc['name'],
            'lat': loc['latitude'],
            'lon': loc['longitude'],
            'addr_full': loc['street_address'],
            'city': loc['locality'],
            'state': loc['region'],
            'postcode': loc['postal_code'],
            'phone': loc['phone'],
            'website': loc['facebook_url'],
            'opening_hours': loc['hours'],
        }
        yield hourstudy(**properties)
def parse_location(self, response):
    """Yield ``hourstudy(raw, formatted)`` for a location detail page.

    ``raw`` is the list of ``<li>`` texts under the ``loc_d_times`` list and
    ``formatted`` is what ``self.store_hours`` returns for them joined with
    commas.  Nothing is yielded when either is empty/falsy.

    Raises ``IndexError`` when the page has no ``itemprop="address"`` div.
    """
    phone = response.xpath(
        '//span[@itemprop="telephone"]/a/text()').extract_first()
    name = response.xpath(
        '//span[@itemprop="name"]/h2[@class="loc_d_title"]/text()'
    ).extract_first()
    address = response.xpath('//div[@itemprop="address"]')[0]

    def address_part(itemprop):
        return address.xpath(
            './/span[@itemprop="' + itemprop + '"]/text()').extract_first()

    # Unprocessed properties
    unp = {
        'phone': phone,
        'name': name,
        'ref': response.url,
        'website': response.url,
        'addr_full': address_part('streetAddress'),
        'city': address_part('addressLocality'),
        'state': address_part('addressRegion'),
        'postcode': address_part('postalCode'),
    }

    raw = response.xpath(
        '//ul[@class="loc_d_times row"]/li/text()').extract()
    formatted = self.store_hours(','.join(raw)) if raw else None
    if formatted:
        yield hourstudy(raw, formatted)

    # NOTE(review): the stripped properties are built but never emitted.
    properties = {key: text.strip() for key, text in unp.items() if text}
def parse(self, response):
    """Yield one ``hourstudy`` record per store in the JSON array body.

    Opening hours are hard-coded to ``09:00-21:00``.  As a guard, every
    ``H:MM``-style time found in the store's ``hours`` text must equal
    "9:00"; otherwise ``DifferentHours`` is raised so the hard-coded value
    does not go stale unnoticed.
    """
    for store in json.loads(response.body_as_unicode()):
        addr_full = store['address'] + ", " + store['city'] + " " + store['state'] + " " + store['zip']

        times = re.findall(r"(\d{1,2}:\d{1,2})", store['hours'])
        if any(time != "9:00" for time in times):
            raise DifferentHours("Store added with different hours than 09:00-21:00")

        properties = {
            'name': store['store'],
            'addr_full': addr_full,
            'street': store['address'],
            'city': store['city'],
            'state': store['state'],
            'postcode': store['zip'],
            'country': store['country'],
            'phone': store['phone'],
            'website': store['permalink'],
            'opening_hours': '09:00-21:00',
            'ref': store['id'] + " " + store['store'],
            'lat': float(store['lat']),
            'lon': float(store['lng']),
        }
        yield hourstudy(**properties)
def parse(self, response):
    """Yield one ``hourstudy`` record per entry of ``results``.

    Opening hours are assembled from four fields (Mon-Thu, Fri, Sat, Sun);
    a Sunday value of "Cerrado" becomes "Su off", and every " a " in the
    assembled string is replaced by "-".
    """
    payload = json.loads(response.body_as_unicode())

    for data in payload['results']:
        ref = data['id_suc']
        name = "Coto " + data['desc_suc']
        street = data['direccion']
        phone = data['telefono']
        lat = data['latitud']
        lon = data['longitud']

        day_parts = [
            "Mo-Th " + data['hor_lu_a_ju'],
            "Fr " + data['hor_vi'],
            "Sa " + data['hor_sa'],
        ]
        if data['hor_do'] == "Cerrado":
            day_parts.append("Su off")
        else:
            day_parts.append("Su " + data['hor_do'])

        opening_hours = "; ".join(day_parts).replace(' a ', '-')

        yield hourstudy(
            ref=ref,
            lat=lat,
            lon=lon,
            name=name,
            street=street,
            country="Argentina",
            phone=phone,
            addr_full=street,
            opening_hours=opening_hours,
        )
def parse(self, response):
    """Yield ``hourstudy(raw, formatted)`` for every store in the body.

    The body is JSON preceded/surrounded by stray markup; anything that
    looks like ``<tag>...</tag>`` between whitespace is removed before
    parsing.  Both arguments passed to ``hourstudy`` are the store's
    unmodified ``hours`` value.

    "No results" is logged only when the parsed list is empty.  (A
    ``for ... else`` clause runs whenever the loop ends without ``break``,
    so using it here would log the message after every successful run.)
    """
    cleaned_body = re.sub(r"\s<.*?>.*<.*?>\s", "", response.body_as_unicode())
    data = json.loads(cleaned_body)

    if not data:
        self.logger.info("No results")
        return

    for store in data:
        raw = store["hours"]
        formatted = store["hours"]
        yield hourstudy(raw, formatted)
def parse_store(self, response):
    """Yield ``hourstudy(raw, formatted)`` from a page's ld+json script.

    The script text is parsed with ``strict=False`` because the opening
    hours contain literal newlines, which strict JSON parsing rejects.
    Only the first object of the parsed array is used, and nothing is
    yielded when ``self.store_hours`` returns a falsy value.
    """
    script_text = response.xpath(
        '//script[@type="application/ld+json"]/text()').extract_first()
    store_data = json.loads(script_text, strict=False)[0]

    # NOTE(review): built but not passed to hourstudy(); kept so the same
    # key lookups and float conversions still happen.
    properties = {
        'website': store_data['url'],
        'name': store_data['name'],
        'phone': store_data['address']['telephone'],
        'ref': store_data['url'],
        'addr_full': store_data['address']['streetAddress'],
        'postcode': store_data['address']['postalCode'],
        'state': store_data['address']['addressRegion'],
        'city': store_data['address']['addressLocality'],
        'lon': float(store_data['geo']['longitude']),
        'lat': float(store_data['geo']['latitude']),
    }

    formatted = self.store_hours(store_data['openingHours'])
    if not formatted:
        return

    raw = store_data['openingHours']
    yield hourstudy(raw, formatted)
def parse_location(self, response):
    """Yield an ``hourstudy`` record for a location detail page.

    ``ref`` is the ``data-loc-id`` of the "set as location" button, falling
    back to the request URL when that attribute is missing or empty.
    Address fields returned by ``self.address(response)`` are merged in.

    Raises ``IndexError`` when the page has no ``<dl class="hours">``.
    """
    location_id = response.xpath(
        '//a[@class="btn set-as-location"]/@data-loc-id').extract_first()
    ref = location_id or response.request.url

    def distance_attr(attribute):
        return response.xpath(
            '//span[@id="currentlocdistanceid"]/@' + attribute
        ).extract_first()

    hours_node = response.xpath('//dl[@class="hours"]')

    properties = {
        "phone": response.xpath(
            '//div[@class="module"]/p/a/text()').extract_first(),
        "ref": ref,
        "name": response.xpath(
            '//div[@class="location-details"]/h1/text()').extract_first(),
        "opening_hours": self.store_hours(hours_node[0]),
        "lon": float(distance_attr('data-long')),
        "lat": float(distance_attr('data-lat')),
    }
    properties.update(self.address(response))

    yield hourstudy(**properties)
def parse(self, response):
    """Yield one ``hourstudy`` record per value of the JSON object body.

    The store's ``address`` string is split heuristically:

    * ``addr_full`` is the leading run of characters up to the first
      ``(``, ``,``, ``|``, ``.`` or ``)`` (empty when the address starts
      with one of those or is empty);
    * ``postcode`` is the first five-digit run in the text after the last
      comma, ``state`` the first pair of capital letters there; each
      defaults to ``''`` when not found.

    ``phone`` is included only when present and truthy.
    """
    stores = json.loads(response.body_as_unicode())

    for value in stores.values():
        address = value['address']
        state_zipcode = address.split(',')[-1]

        zipcodes = re.findall(r"(\d{5})", state_zipcode)
        zipcode = zipcodes[0] if zipcodes else ''

        states = re.findall(r"([A-Z]{2})", state_zipcode)
        state = states[0] if states else ''

        # Guarded: an address with no leading street text yields [] here.
        streets = re.findall(r"^[^(,|.)]+", address)
        addr_full = streets[0] if streets else ''

        properties = {
            'ref': value['ID'],
            'name': value['title'],
            'addr_full': addr_full,
            # NOTE(review): city is filled from 'title', same as name --
            # confirm this is intended.
            'city': value['title'],
            'state': state,
            'postcode': zipcode,
            'lat': value['location']['lat'],
            'lon': value['location']['lng'],
        }

        phone = value.get('phone')
        if phone:
            properties['phone'] = phone

        yield hourstudy(**properties)
def parse(self, response):
    """Yield ``hourstudy(raw, formatted)`` per entry of ``results``.

    ``raw`` is the first ``standardOpeningHours`` object of the location
    and ``formatted`` what ``self.store_hours`` returns for it; a store is
    skipped when that result is falsy.
    """
    payload = json.loads(response.body_as_unicode())

    for store in payload['results']:
        location = store['location']
        contact = location['contact']
        address = contact['address']

        # Each address line is prefixed with a space, so the result keeps
        # a leading space.
        addr_full = ''.join(' ' + line['text'] for line in address['lines'])

        # NOTE(review): built but not passed to hourstudy().
        properties = {
            'ref': location['id'],
            'name': location['name'],
            'addr_full': addr_full,
            'city': address['town'],
            'state': '',
            'country': 'United Kingdom',
            'postcode': address['postcode'],
            'lat': location['geo']['coordinates']['latitude'],
            'lon': location['geo']['coordinates']['longitude'],
            'phone': contact['phoneNumbers'][0]['number'],
        }

        raw = location['openingHours'][0]['standardOpeningHours']
        formatted = self.store_hours(raw)
        if not formatted:
            continue

        properties['opening_hours'] = formatted
        yield hourstudy(raw, formatted)
def parse(self, response):
    """Yield one ``hourstudy`` record per entry of ``locations``.

    Most fields come from the entry's ``bing`` object.  ``AddressLine`` is
    split at its first space into house number and street; an address
    without a space raises ``ValueError``.
    """
    locations = json.loads(response.body_as_unicode())['locations']

    for store in locations:
        details = store['bing']
        address_line = details['AddressLine']
        housenumber, street = address_line.split(' ', 1)

        properties = {
            "phone": details['Phone'],
            "ref": details['EntityID'],
            "name": store['post']['post_title'],
            "opening_hours": self.store_hours(details),
            "lat": details['Latitude'],
            "lon": details['Longitude'],
            "addr_full": details['AddressLine'],
            "housenumber": housenumber,
            "street": street,
            "city": details['Locality'],
            "state": details['AdminDistrict'],
            "postcode": details['PostalCode'],
            "country": details['CountryRegion'],
            "website": store['url'],
        }
        yield hourstudy(**properties)
def parse_link(self, response):
    """Yield ``hourstudy(raw, formatted)`` for a location page.

    ``raw`` is the list of text nodes of the ``hoursDay`` / ``hoursTime``
    spans inside the hours block; ``formatted`` is ``self.hours(raw)``.
    """
    website = response.xpath(
        '//head/meta[@property="og:url"]/@content').extract_first()

    # The following values are extracted but are not part of the emitted
    # record; only the hours pair is yielded.
    ref = website.split("/")[-1]
    lat = response.css("#h_lat::attr(value)").extract_first()
    lng = response.css("#h_lng::attr(value)").extract_first()
    blocks = response.css("#location_subcontainer .block")

    raw = response.xpath(
        '//span[@class="block hours"]/span[@class="hoursTime" or @class="hoursDay"]/text()'
    ).extract()
    formatted = self.hours(raw)

    yield hourstudy(raw, formatted)
def parse_store(self, response):
    """Yield ``hourstudy(raw, formatted)`` from a store page's inline JSON.

    The first ``text/javascript`` script is flattened (newlines and tabs
    removed) and the argument of its ``.push(...)`` call is parsed as JSON.
    ``raw`` is ``str()`` of ``seoData.openingHours``; ``formatted`` is the
    same text with ``[``, ``]`` and ``'`` removed.
    """
    script_text = response.xpath(
        '//script[@type="text/javascript"]/text()').extract_first()
    flattened = script_text.replace('\n', '').replace('\t', '')
    push_argument = flattened.split('.push(')[1].rstrip(')')
    data = json.loads(push_argument)

    # The store-finder state is parsed (and must be valid JSON) but its
    # contents are not used in the emitted record.
    state_text = response.xpath(
        '//script[@class="js-store-finder-initial-state"][@type="application/json"]/text()'
    ).extract_first()
    geodata = json.loads(state_text)

    raw = str(data['seoData']['openingHours'])
    formatted = raw
    for unwanted in ('[', ']', "'"):
        formatted = formatted.replace(unwanted, '')

    yield hourstudy(raw, formatted)
def parse(self, response):
    """Yield one ``hourstudy`` record per item of the JSON object body.

    The mapping key is the record's ``ref``.  ``AddressLine`` is split on
    commas: the last piece becomes ``city`` and the rest, re-joined with
    ", ", becomes ``addr_full``.  Missing ``AddressLine`` / ``postcode``
    default to empty strings.  ``opening_hours`` is added when
    ``self.store_hours(value)`` is truthy.
    """
    branches = json.loads(response.body_as_unicode())

    for key, value in branches.items():
        if 'AddressLine' in value:
            pieces = value['AddressLine'].split(',')
            address, city = ", ".join(pieces[:-1]), pieces[-1]
        else:
            address = city = ""

        postcode = value['postcode'] if 'postcode' in value else ""

        properties = {
            'ref': key,
            'name': value['branch_name'],
            'addr_full': address,
            'city': city,
            'country': 'United Kingdom',
            'postcode': postcode,
            'lat': value['Latitude'],
            'lon': value['Longitude'],
            'phone': value['telephone'],
        }

        opening_hours = self.store_hours(value)
        if opening_hours:
            properties['opening_hours'] = opening_hours

        yield hourstudy(**properties)
def parse(self, response):
    """Yield hour records for one results page, then follow pagination.

    For every entry of ``results`` the ``hours.operating`` value is passed
    to ``hourstudy`` as both raw and formatted.  When ``next`` is not
    ``None`` a request for that (joined) URL is yielded with this method
    as its callback.
    """
    page = json.loads(response.body_as_unicode())

    for store in page['results']:
        operating = store['hours']['operating']
        yield hourstudy(operating, operating)

    next_url = page['next']
    if next_url is None:
        return

    yield scrapy.Request(
        url=response.urljoin(next_url),
        headers=HEADERS,
        callback=self.parse,
    )
def parse_links(self, response):
    """Yield ``hourstudy(raw, formatted)`` for a location page.

    ``raw`` is the ``value`` of the ``hours`` input in the directions form
    and ``formatted`` is ``self.process_hours(raw)``.  Nothing is yielded
    when the page has no such input (or its value is empty).

    ``extract_first()`` returns a single string (or ``None``), so the whole
    value is used; indexing it with ``[0]`` would pass only its first
    character.
    """
    hours = response.xpath(
        '//form[@id="directions-form"]/input[@name="hours"]/@value'
    ).extract_first()

    if not hours:
        # No hours on the page: nothing to study.
        return

    raw = hours
    formatted = self.process_hours(hours)
    yield hourstudy(raw, formatted)
def parse_detail_product(self, response):
    """Yield ``hourstudy(**product)`` after adding opening hours.

    ``product`` is the dict passed along in ``response.meta``.  Its
    ``opening_hours`` is set from the rows of the ``hours-table`` via
    ``self.store_hours``, or to ``'24/7'`` when the page has no such rows.
    """
    product = response.meta.get('product')
    hour_rows = response.xpath('//table[@id="hours-table"]//tr')

    if len(hour_rows) > 0:
        opening_hours = self.store_hours(hour_rows)
    else:
        opening_hours = u'24/7'
    product['opening_hours'] = opening_hours

    yield hourstudy(**product)
def parse(self, response):
    """Yield one ``hourstudy`` record per store in the JSON array body.

    Opening hours are built from the store's ``Days`` list as
    "<first two letters of Day> <open>-<close>" entries joined with "; ",
    where open/close go through ``self.parse_times``.
    """
    for store in json.loads(response.body_as_unicode()):
        day_entries = [
            day['Day'][:2] + " " + self.parse_times(day['Open']) + "-" +
            self.parse_times(day['Close'])
            for day in store['Days']
        ]

        properties = {
            'addr_full': store['Street1'] + " " + store['Street2'],
            'phone': store['PhoneNumber'],
            'city': store['City'],
            'state': store['State'],
            'postcode': store['ZipCode'],
            'name': store['Name'],
            'ref': store['id'],
            'website': "https://www.simonmed.com/locations",
            'lat': store['Latitude'],
            'lon': store['Longitude'],
            'opening_hours': '; '.join(day_entries),
        }
        yield hourstudy(**properties)
def parse_center(self, response):
    """Yield an ``hourstudy`` record built from a page's ``<meta>`` tags.

    Location fields are read from Open Graph ``property`` metas, ``ref``
    from the ``twitter:title`` meta, and opening hours from every
    ``itemprop="openingHours"`` meta inside the ``hours_container`` div,
    joined with "; ".  ``float(None)`` raises ``TypeError`` when a
    coordinate meta is missing.
    """
    def og(prop):
        # First @content of <meta property="og:<prop>">, or None.
        return response.xpath(
            '//meta[@property="og:' + prop + '"]/@content').extract_first()

    fields = {
        'lat': float(og('latitude')),
        'lon': float(og('longitude')),
        'phone': og('phone_number'),
        'website': response.url,
        'ref': response.xpath(
            '//meta[@name="twitter:title"]/@content').extract_first(),
        'opening_hours': '; '.join(
            response.xpath(
                '//div[@class="hours_container"]//meta[@itemprop="openingHours"]/@content'
            ).extract()),
        'addr_full': og('street-address'),
        'city': og('locality'),
        'state': og('region'),
        'postcode': og('postal-code'),
        'country': og('country-name'),
    }
    yield hourstudy(**fields)