def parse(self, response):
    """Build one ChainItem from the values carried in ``response.meta``.

    Opening hours are scraped from the page only when the
    ``comming_soon`` meta value is the empty string; otherwise
    ``store_hours`` is left empty.

    Yields:
        A single populated ChainItem.
    """
    meta = response.meta

    if meta["comming_soon"] == "":
        day_nodes = response.xpath(
            "//div[@class='store-hours']//p[@class='day']/text()")
        hour_nodes = response.xpath(
            "//div[@class='store-hours']//p[@class='hour']/text()")
        # zip() pairs each day with its hours and stops at the shorter
        # list, so a day node without a matching hours node no longer
        # raises IndexError.
        open_hours = "; ".join(
            day.extract() + hrs.extract()
            for day, hrs in zip(day_nodes, hour_nodes))
    else:
        open_hours = ""

    item = ChainItem()
    item['store_name'] = meta["store_name"]
    item['store_number'] = ""
    item['address'] = meta["address"]
    item['phone_number'] = meta["phone_number"]
    item['city'] = meta["city"]
    item['state'] = meta["state"]
    item['zip_code'] = meta["zip_code"]
    item['country'] = meta["country"]
    item['latitude'] = ""
    item['longitude'] = ""
    item['store_hours'] = open_hours
    item['other_fields'] = ""
    item['coming_soon'] = meta["comming_soon"]
    yield item
def parse_page(self, response):
    """Scrape one store page that uses ``itemprop`` span markup.

    Each field is the first text match of its XPath, passed through
    ``self.validate``. The item is yielded only when a non-empty store
    name was found.

    Any failure while extracting is logged and the page is skipped; the
    previous bare ``except: pass`` hid every error (including
    KeyboardInterrupt) without a trace.
    """
    import logging

    def first_text(query):
        # First match of ``query`` on the page, cleaned by self.validate.
        return self.validate(response.xpath(query).extract_first())

    try:
        item = ChainItem()
        item['store_name'] = first_text('//span[@itemprop="name"]/text()')
        item['address'] = first_text(
            '//span[@itemprop="streetAddress"]//text()')
        item['city'] = first_text(
            '//span[@itemprop="addressLocality"]/text()')
        item['state'] = first_text(
            '//span[@itemprop="addressRegion"]/text()')
        item['zip_code'] = first_text(
            '//span[@itemprop="postalCode"]/text()')
        item['country'] = 'United States'
        item['phone_number'] = first_text(
            '//span[@itemprop="telephone"]/text()')
    except Exception:
        logging.getLogger(__name__).exception(
            "parse_page: could not extract store from %s",
            getattr(response, 'url', None))
        return

    # The yield sits outside the try so that nothing raised at the yield
    # point is swallowed by the handler above.
    if item['store_name'] != '':
        yield item
def parse_kensas(self, response):
    """Yield one ChainItem for a single-location page.

    The store name comes from the logo image's ``alt`` text and the
    postal address from the first ``<a>`` inside ``<address>``, which is
    split into street / city / state / zip with ``usaddress.parse``.
    """
    item = ChainItem()
    item['store_number'] = ''
    item['coming_soon'] = "0"
    item['store_name'] = response.xpath(
        './/a[@class="standard-logo"]/img/@alt').extract_first()

    # Drop blank text nodes and embedded newlines before tagging.
    cleaned = []
    for fragment in response.xpath('.//address/a[1]/text()').extract():
        if fragment.strip() != "":
            cleaned.append(fragment.strip().replace('\n', ''))

    city_tokens = []
    street_tokens = []
    state = zip_code = ''
    for token, label in usaddress.parse(" ".join(cleaned)):
        bare = token.replace(',', '')
        if label == 'PlaceName':
            city_tokens.append(bare)
        elif label == 'StateName':
            state = bare
        elif label == 'ZipCode':
            zip_code = bare
        else:
            # Every other label is treated as part of the street line.
            street_tokens.append(bare)

    # Each token keeps a trailing space, as in the accumulated strings.
    item['address'] = ''.join(tok + ' ' for tok in street_tokens)
    item['country'] = 'United States'
    item['city'] = ''.join(tok + ' ' for tok in city_tokens)
    item['state'] = state
    item['zip_code'] = zip_code
    # NOTE(review): this reads the first <address>/<a> text, which looks
    # like the same node used for the postal address above - confirm.
    item['phone_number'] = response.xpath(
        './/address/a/text()').extract_first()
    item['latitude'] = ''
    item['longitude'] = ''
    item['store_hours'] = ""
    item['other_fields'] = ""
    yield item
def parse_store(self, response):
    """Yield a ChainItem for every not-yet-seen store in a JSON response.

    The store list is read from the ``Data`` key of the decoded body.
    The state comes from ``response.meta['state_mine']`` and store
    numbers already present in ``self.uid_list`` are skipped.
    """
    stores = json.loads(response.body)

    try:
        stores_list = stores['Data']
    except (KeyError, TypeError):
        # Missing key, or a payload that is not a mapping at all. The
        # previous bare ``except`` also hid unrelated errors.
        stores_list = []
        print("+++++++++++++++++++++ no store lists")

    temp_state = response.meta['state_mine']

    if not stores_list:
        print('+++++++++++++++++++++++++ there are no any stores')
        return

    for store in stores_list:
        store_number = store['Number']
        if store_number in self.uid_list:
            print('+++++++++++++++++++++++++++++ already scraped')
            continue
        self.uid_list.append(store_number)

        address = store['AddressMain']
        item = ChainItem()
        item['store_name'] = store['Name']
        item['store_number'] = store_number
        item['address'] = address['Line']
        item['city'] = address['City']
        item['state'] = temp_state
        item['zip_code'] = address['PostalCode']
        item['country'] = 'Canada'
        item['phone_number'] = store['PhoneNumberHome']['Number']
        item['latitude'] = store['Coordinates']['Latitude']
        item['longitude'] = store['Coordinates']['Longitude']
        item['store_hours'] = store['OpeningHours']
        yield item
def parse_page(self, response):
    """Scrape one Canadian store page that uses ``itemprop`` span markup.

    The phone number is the last text fragment of the ``opentimes_box``
    block that contains a ``-``; it stays ``''`` when none does.

    Any failure while extracting is logged and the page is skipped; the
    previous bare ``except: pass`` hid every error (including
    KeyboardInterrupt) without a trace.
    """
    import logging

    def first_text(query):
        # First match of ``query`` on the page, cleaned by self.validate.
        return self.validate(response.xpath(query).extract_first())

    try:
        item = ChainItem()
        item['store_name'] = first_text('//span[@itemprop="name"]/text()')
        item['address'] = first_text(
            '//span[@itemprop="streetAddress"]/text()')
        item['city'] = first_text(
            '//span[@itemprop="addressLocality"]/text()')
        item['state'] = first_text(
            '//span[@itemprop="addressRegion"]/text()')
        item['zip_code'] = first_text(
            '//span[@itemprop="postalCode"]/text()')
        item['country'] = 'Canada'

        detail = self.eliminate_space(
            response.xpath(
                '//div[contains(@class, "opentimes_box")]//text()'
            ).extract())
        item['phone_number'] = ''
        for fragment in detail:
            if '-' in fragment:
                item['phone_number'] = fragment
    except Exception:
        logging.getLogger(__name__).exception(
            "parse_page: could not extract store from %s",
            getattr(response, 'url', None))
        return

    # The yield sits outside the try so that nothing raised at the yield
    # point is swallowed by the handler above.
    yield item
def body(self, response):
    """Yield a ChainItem for each new entry under ``features`` in the JSON body.

    Stores whose ``nid`` is already in ``self.history`` are skipped.
    Zip codes longer than six characters get a ``-`` inserted before the
    last four characters.

    Errors are logged instead of dropping into ``pdb.set_trace()``. The
    old bare ``except`` would block an unattended crawl on a debugger
    prompt and also fired when the generator was closed.
    """
    import logging

    print("========= Checking.......")
    try:
        store_list = json.loads(response.body)['features']
        for store in store_list:
            props = store['properties']

            item = ChainItem()
            item['store_name'] = self.validate(props['name'])
            item['store_number'] = self.validate(str(props['nid']))
            item['address'] = self.validate(props['thoroughfare'])
            # 'localty' is the key spelling this code has always read.
            item['city'] = self.validate(props['localty'])
            item['state'] = self.validate(props['state_code'])

            item['zip_code'] = self.validate(str(props['postal_code']))
            if len(item['zip_code']) > 6:
                item['zip_code'] = (
                    item['zip_code'][:-4] + '-' + item['zip_code'][-4:])

            item['country'] = 'United States'

            item['phone_number'] = self.validate(str(props['phone']))
            if item['phone_number'] == 'None':
                # str(None) produced the literal text "None".
                item['phone_number'] = ''

            # NOTE(review): GeoJSON orders coordinates as
            # [longitude, latitude]; if this feed follows that convention
            # these two fields are swapped - confirm against live data.
            coordinates = store['geometry']['coordinates']
            item['latitude'] = self.validate(str(coordinates[0]))
            item['longitude'] = self.validate(str(coordinates[1]))

            if item['store_number'] not in self.history:
                self.history.append(item['store_number'])
                yield item
    except Exception:
        logging.getLogger(__name__).exception(
            "body: failed to parse store feed from %s",
            getattr(response, 'url', None))
def parse_store(self, response):
    """Yield one ChainItem for a Canadian store detail page.

    The address block is read from ``span.address_wrapper``: the first
    non-blank fragment is the street line, and the second holds
    ``city,state`` on its first line and the postal code on its second.
    """
    item = ChainItem()
    item['store_number'] = ''
    item['country'] = 'Canada'
    item['latitude'] = ''
    item['longitude'] = ''
    item['store_name'] = self.validate(
        response.xpath(
            './/div[@class="get_in_touch"]/h3/text()').extract_first())
    item['other_fields'] = ""
    item['coming_soon'] = "0"

    # Keep only fragments with visible text, trimmed at both ends.
    fragments = []
    for text in response.xpath(
            './/span[@class="address_wrapper"]//text()').extract():
        if text.replace('\n', '').strip() != "":
            fragments.append(text.strip())

    item['address'] = fragments[0]
    item['address2'] = ''

    locality_lines = fragments[1].split('\n')
    city_state = locality_lines[0].split(',')
    item['city'] = city_state[0]
    item['state'] = city_state[1]
    item['zip_code'] = locality_lines[1]

    item['phone_number'] = response.xpath(
        './/li[@class="phone"]/p/a/text()').extract_first()
    item['store_hours'] = ''
    yield item
def parseStore(self, response):
    """Yield a ChainItem for every ``<li>`` in the ``vlist results`` list.

    Latitude and longitude are taken from the directions link: the text
    after the first ``=`` in its ``href``, split on ``,``.
    """
    for entry in response.xpath('//ol[@class="vlist results"]/li'):
        item = ChainItem()

        # The heading is split on "." and the second piece is the name.
        heading = self.validate(
            entry.xpath('.//div[@class="fn org"]/text()'))
        item['store_name'] = heading.split('.')[1].strip()
        item['store_number'] = ""
        item['address'] = self.validate(
            entry.xpath('.//div[@class="street-address"]/text()'))
        item['address2'] = ""

        # The phone is the first two whitespace-separated words, joined
        # without a separator.
        tel_words = entry.xpath(
            './/span[@class="tel"]/text()').extract_first().split()
        item['phone_number'] = tel_words[0] + tel_words[1]

        locality = self.validate(
            entry.xpath('.//span[@class="locality"]/text()'))
        item['city'] = locality.split(',')[0]
        item['state'] = self.validate(
            entry.xpath('.//span[@class="region"]/text()'))
        item['zip_code'] = self.validate(
            entry.xpath('.//span[@class="postal-code"]/text()'))
        item['country'] = "United States"

        directions_href = entry.xpath(
            './/a[contains(@id,"hlDirections")]/@href').extract_first()
        lat_lng = directions_href.split('=')[1].split(',')
        item['latitude'] = lat_lng[0]
        item['longitude'] = lat_lng[1]

        item['store_hours'] = ""
        item['other_fields'] = ""
        item['coming_soon'] = 0
        yield item
def parse_detail(self, response):
    """Yield one ChainItem for a shop detail page.

    When the ``h2.h2top`` heading is missing, nothing is yielded and a
    marker line is printed instead. Otherwise the first
    ``li.moreLandaddress`` element supplies the address and the second
    supplies the opening hours.
    """
    shop_name = response.xpath(
        '//h2[@class="h2top"]/text()').extract_first()
    if not shop_name:
        print('++++++++++++++++++++++++++++++++++++++++++ deleted')
        return

    sections = response.xpath(
        '//div[@class="addresses"]//ul//li[@class="moreLandaddress"]')

    address_lines = sections[0].xpath('.//p/text()').extract()
    address_lines[0] = address_lines[0].replace('\n', ' ')
    street = address_lines[0]

    # The second line is split on ", " into city, state and zip.
    locality = address_lines[1].split(', ')
    city = locality[0]
    state = locality[1]
    zip_code = locality[2]

    hours = ''
    for line in sections[1].xpath('.//p/text()').extract():
        # The text is escaped first, then the escaped en-dash sequence
        # is replaced with " - ".
        escaped = line.strip().encode('raw-unicode-escape')
        hours += escaped.replace('\u2013', ' - ') + '; '

    item = ChainItem()
    item['store_name'] = shop_name
    item['address'] = street
    item['city'] = city
    item['state'] = state
    item['zip_code'] = zip_code
    item['store_hours'] = hours
    item['country'] = 'United States'
    yield item
def body(self, response):
    """Yield a ChainItem per store found in the page's ``var locations`` script.

    The object literal assigned to ``var locations`` is cut out of the
    last matching ``<script>``, ``new google.maps.LatLng(a, b)`` is
    rewritten to ``[a, b]``, and the result is passed through
    ``self.fixLazyJson`` and decoded as JSON. The keys of the decoded
    object become the store numbers.

    Changes from the previous version:
      * a fresh ChainItem is created per store. One instance used to be
        mutated and re-yielded for every store, so all yielded items
        were the same object;
      * a page without a ``var locations = {`` script yields nothing
        instead of raising IndexError;
      * the unused ``data_list`` local is gone.
    """
    if not response.body:
        return

    data = ''
    for script in response.xpath('//script/text()').extract():
        if script.find('var locations = ') != -1:
            data = script

    if 'var locations = {' not in data:
        return

    data = data.split('var locations = {')[1].split('};')[0].strip()
    data = '{' + data + '}'
    # NOTE(review): the second replace rewrites every ")" in the blob,
    # not only the ones closing LatLng(...) - confirm that no address or
    # phone value contains parentheses.
    data = data.replace('new google.maps.LatLng(', '[').replace(')', ']')

    store_list = json.loads(self.fixLazyJson(data))
    for store_number in store_list:
        store = store_list[store_number]

        item = ChainItem()
        item['store_number'] = store_number
        item['address'] = store['address']
        item['city'] = store['city']
        item['state'] = store['state']
        item['zip_code'] = store['zip']
        item['phone_number'] = store['phone']
        item['coming_soon'] = '0'
        item['country'] = 'United States'
        item['latitude'] = store['point'][0]
        item['longitude'] = store['point'][1]
        yield item
def body(self, response):
    """Walk the ``p.store-p`` listing and emit items or follow-up requests.

    For each entry the link text is split on ``-`` (or on ``,`` when it
    has no ``-``) into location parts.

    * Entries whose status text contains "coming" or "opening" are
      yielded straight away as coming-soon items (city/state/country
      only).
    * Entries without a status text are followed to their detail page
      with ``self.parse_page`` as callback and the inferred country in
      ``meta``.

    Entries whose link has no text are skipped; previously
    ``'-' in None`` raised TypeError and aborted the whole listing.

    NOTE(review): entries that have a status text that is neither
    "coming" nor "opening" produce nothing - confirm that is intended.
    """
    print("========= Checking.......")
    for store in response.xpath('//p[@class="store-p"]'):
        status = store.xpath(
            './/span[@class="store-sub-title"]/text()').extract_first()
        detail = store.xpath('.//a/text()').extract_first()
        if detail is None:
            continue

        if '-' in detail:
            detail = detail.split('-')
        else:
            detail = detail.split(',')

        if status:
            lowered = status.lower()
            if 'coming' in lowered or 'opening' in lowered:
                item = ChainItem()
                item['city'] = self.validate(detail[0])
                item['state'] = self.validate(detail[1])
                item['country'] = 'United States'
                if item['state'] == 'Canada':
                    # The second part carried the country, not a state.
                    item['country'] = 'Canada'
                    item['state'] = ''
                item['coming_soon'] = '1'
                yield item
        else:
            country = 'United States'
            if 'UK' in detail[1] or 'Isle of Man' in detail[1]:
                country = 'United Kingdom'
            elif 'Manitoba' in detail[1] or 'Ontario' in detail[1]:
                country = 'Canada'
            yield scrapy.Request(
                url=store.xpath('.//a/@href').extract_first(),
                callback=self.parse_page,
                meta={'country': country})
def parse_store(self, response):
    """Yield a ChainItem for each ``wpseo-result`` block not seen before.

    Coordinates are cut out of the route link's ``onclick`` attribute
    (second and third quoted arguments). A store is treated as a
    duplicate when its phone number is already in ``self.uid_list``.
    """
    for block in response.xpath('//div[contains(@class, "wpseo-result")]'):
        def cleaned(query, node=block):
            # self.validate receives the selector list itself.
            return self.validate(node.xpath(query))

        item = ChainItem()
        item['store_name'] = block.xpath(
            './/span[@itemprop="name"]/text()').extract_first().strip()
        onclick = block.xpath(
            './/div[@class="wpseo-sl-route"]/a/@onclick').extract_first()
        item['store_number'] = ''
        item['address'] = cleaned('.//span[@itemprop="streetAddress"]/text()')
        item['address2'] = ''
        item['city'] = cleaned('.//span[@itemprop="addressLocality"]/text()')
        item['state'] = cleaned('.//span[@itemprop="addressRegion"]/text()')
        item['zip_code'] = cleaned('.//span[@itemprop="postalCode"]/text()')
        item['country'] = 'Canada'
        item['phone_number'] = cleaned(
            './/span[@itemprop="telephone"]/text()')

        quoted_args = onclick.split(', \'')
        item['latitude'] = quoted_args[1].split('\'')[0]
        item['longitude'] = quoted_args[2].split('\'')[0]

        # Every entry is followed by " ; ", including the last one.
        opening = block.xpath(
            './/span[@itemprop="openingHours"]/@content').extract()
        item['store_hours'] = ''.join(
            entry.replace(u'\xa0', '') + ' ; ' for entry in opening)

        item['store_type'] = ''
        item['other_fields'] = ''
        item['coming_soon'] = '0'

        if item['phone_number'] in self.uid_list:
            continue
        self.uid_list.append(item['phone_number'])
        yield item
def body(self, response):
    """Yield a ChainItem for each new entry under ``Location`` in the JSON body.

    Store hours are assembled from ``meetingsForDay`` as
    ``"<dayOfWeek> <first meeting> <openHours>"`` segments joined by
    ``", "``. Stores whose id is already in ``self.history`` are skipped.
    """
    print("========= Checking.......")
    for store in json.loads(response.body)['Location']:
        item = ChainItem()
        item['store_name'] = self.validate(store['name'])
        item['store_number'] = self.validate(str(store['id']))

        address = store['address']
        item['address'] = self.validate(address['address1'])
        item['address2'] = self.validate(address['address2'])
        item['city'] = self.validate(address['city'])
        item['state'] = self.validate(address['state'])
        # NOTE(review): the zip code is read from a nested
        # address -> address -> zipCode path; confirm against the feed.
        item['zip_code'] = self.validate(address['address']['zipCode'])
        item['country'] = self.validate(address['country'])
        item['latitude'] = self.validate(
            address['gpsCoordinates']['latitude'])
        item['longitude'] = self.validate(
            address['gpsCoordinates']['longitude'])

        segments = []
        for day in store['meetingsForDay']:
            segments.append(
                self.validate(day['dayOfWeek']) + ' ' +
                self.validate(day['meetings'][0]) + ' ' +
                self.validate(day['openHours']))
        item['store_hours'] = ', '.join(segments)

        if item['store_number'] in self.history:
            continue
        self.history.append(item['store_number'])
        yield item
def parse_page(self, response):
    """Yield one ChainItem for a location detail page.

    A store name containing "Coming Soon" is truncated at that marker
    and flagged with ``coming_soon = '1'``. The second address line is
    expected to look like ``"City, ST 12345"``.

    A missing zip now yields ``zip_code = ''``. The previous bare
    ``except: pass`` left the field unset and would also have hidden any
    unrelated error raised on that line.
    """
    item = ChainItem()
    item['store_name'] = self.validate(
        response.xpath('//div[@id="content"]/h2/text()').extract_first())
    item['coming_soon'] = '0'
    if 'Coming Soon' in item['store_name']:
        item['store_name'] = item['store_name'].split(
            'Coming Soon')[0].strip()
        item['coming_soon'] = '1'

    detail = response.xpath('//div[@id="location_information"]')
    address = detail.xpath('.//div[@id="address"]//p/text()').extract()
    item['address'] = self.validate(address[0])
    item['address2'] = ''

    addr = address[1].strip().split(',')
    item['city'] = self.validate(addr[0].strip())
    state_zip = addr[1].strip().split(' ')
    item['state'] = self.validate(state_zip[0].strip())
    try:
        zip_token = state_zip[1].strip()
    except IndexError:
        # The line ended after the state; there is no zip to record.
        item['zip_code'] = ''
    else:
        item['zip_code'] = self.validate(zip_token)

    item['country'] = 'United States'
    item['phone_number'] = self.validate(
        detail.xpath('.//p/span[@class="phone"]/text()').extract_first())

    # Each <li> contributes "<first text> <second text>".
    hour_parts = []
    for row in detail.xpath('.//div[@id="hours"]//li'):
        texts = row.xpath('.//text()').extract()
        hour_parts.append(
            self.validate(texts[0]) + ' ' + self.validate(texts[1]))
    item['store_hours'] = ', '.join(hour_parts)

    item['store_type'] = ''
    item['other_fields'] = ''
    yield item
def parse_store(self, response):
    """Yield a ChainItem for each result table whose address is new.

    When no result tables are found, the search zip taken from
    ``response.meta`` is flagged in ``self.flag_end`` and nothing is
    yielded.

    NOTE(review): ``tree`` is not defined in this function; it is
    presumably a module-level or otherwise shared parsed document -
    confirm where it comes from.
    """
    stores = tree.xpath('//li//div[@class="text"]//table//tbody')
    if not stores:
        search_zip = response.meta['search_zip']
        self.flag_end[search_zip] = 1
        print('============================== end pagination')
        return

    for store in stores:
        rows = store.xpath('.//tr')
        store_name = rows[0].xpath('.//td')[1].xpath(
            './a/text()')[0].strip()

        item = ChainItem()
        item['store_name'] = store_name

        # Join the address text nodes; each piece keeps a trailing space.
        address_nodes = rows[1].xpath('.//td')[1].xpath('./text()')
        temp_address = ''.join(node.strip() + ' ' for node in address_nodes)

        item['city'] = ''
        item['address'] = ''
        for token, label in usaddress.parse(temp_address):
            bare = token.replace(',', '')
            if label == 'PlaceName':
                item['city'] += bare + ' '
            elif label == 'StateName':
                item['state'] = bare
            elif label == 'ZipCode':
                item['zip_code'] = bare
            else:
                item['address'] += bare + ' '

        # The stripped street line is the de-duplication key.
        street_key = item['address'].strip()
        if street_key in self.history:
            continue
        self.history.append(item['address'].strip())
        item['country'] = rows[2].xpath('.//td')[1].xpath(
            './text()')[0].strip()
        yield item
def body(self, response):
    """Yield a ChainItem for every well-formed ``section group store`` block.

    City and zip come from the second ``<span>`` of the address
    paragraph (``"city,zip"``); state and country come from the block's
    ``data-region`` / ``data-country`` attributes.

    Malformed entries are still skipped, but the failure is now logged.
    The ``yield`` was moved out of the ``try``: with the previous bare
    ``except: pass`` around it, closing the generator was swallowed, the
    loop carried on to the next ``yield``, and Python raised
    ``RuntimeError: generator ignored GeneratorExit``.
    """
    import logging

    print("========= Checking.......")
    for store in response.xpath('//div[@class="section group store"]'):
        try:
            item = ChainItem()
            item['store_name'] = self.validate(
                store.xpath(
                    './/h3[@class="store_name"]/text()').extract_first())
            item['address'] = self.validate(
                store.xpath(
                    './/p[@class="address"]//span[1]/text()'
                ).extract_first())
            city_zip = self.validate(
                store.xpath(
                    './/p[@class="address"]//span[2]/text()'
                ).extract_first()).split(',')
            item['city'] = city_zip[0]
            item['state'] = self.validate(
                store.xpath('./@data-region').extract_first())
            item['zip_code'] = city_zip[1]
            item['country'] = self.validate(
                store.xpath('./@data-country').extract_first())
            item['phone_number'] = self.validate(
                store.xpath('.//p[@class="phone"]/text()').extract_first())
        except Exception:
            logging.getLogger(__name__).warning(
                "body: skipping malformed store entry", exc_info=True)
            continue
        yield item
def body(self, response):
    """Yield a ChainItem for every store in the JSON array body.

    ``phone_number`` is only set when the store's ``contact`` mapping
    has a ``store`` entry. The ``hours`` entries are joined with
    ``", "``.
    """
    print("========= Checking.......")
    for store in json.loads(response.body):
        item = ChainItem()
        item['store_name'] = store['name']
        item['store_number'] = store['id']

        location = store['address']
        item['address'] = location['address']
        item['address2'] = ''
        item['city'] = location['city']
        item['state'] = location['province']
        item['zip_code'] = location['postal_code']
        item['country'] = location['country']

        if 'store' in store['contact']:
            item['phone_number'] = store['contact']['store']

        item['latitude'] = store['coordinates']['latitude']
        item['longitude'] = store['coordinates']['longitude']
        item['store_hours'] = ', '.join(store['hours'])
        item['store_type'] = ''
        item['other_fields'] = ''
        item['coming_soon'] = ''
        yield item
def parse_store(self, response):
    """Yield a ChainItem for each new ``VWStoreInfo`` block on the page.

    The first paragraph is split on ``,``: the first piece is the street
    address, the second the city and the last the phone number. Blocks
    without an ``<h3>`` name, and blocks whose phone number is already
    in ``self.uid_list``, are skipped. State and coordinates come from
    ``response.meta``.

    Changes from the previous version: the name check uses ``is None``,
    the phone number is read with ``[-1]`` instead of scanning the list
    for its last index, and the duplicate check sits directly in the
    store loop so that ``continue`` always skips the store.
    """
    for store in response.xpath('//div[@class="VWStoreInfo"]'):
        item = ChainItem()
        item['store_name'] = store.xpath('.//h3/text()').extract_first()
        if item['store_name'] is None:
            continue

        item['store_number'] = ''
        address = store.xpath(".//p/text()").extract_first().split(',')
        item['address'] = address[0]
        item['phone_number'] = address[-1]

        if item['phone_number'] in self.uid_list:
            continue
        self.uid_list.append(item['phone_number'])

        item['address2'] = ""
        item['city'] = address[1].strip()
        item['state'] = response.meta['state']
        item['country'] = "United States"
        item['latitude'] = response.meta['lat']
        item['longitude'] = response.meta['lng']
        item['other_fields'] = ""
        item['store_hours'] = ";".join(
            store.xpath(".//p[2]/text()").extract())
        item['coming_soon'] = 0
        yield item
def parse_store(self, response):
    """Yield one ChainItem for a shop detail page.

    The text nodes of the ``shop-full-description`` paragraphs are read
    positionally: [0] is the street, [1] is ``"City, ST ZIP"`` and [2]
    is the opening hours. Coordinates come from ``response.meta``.
    """
    item = ChainItem()
    item['store_number'] = ''
    item['store_name'] = response.xpath(
        './/div[@class="page-title"]/h1/text()').extract_first()

    description = response.xpath(
        './/div[@class="shop-full-description"]/p/text()').extract()

    item['address'] = description[0].strip()
    item['address2'] = ''

    locality = description[1].split(',')
    item['city'] = locality[0].strip()
    # Everything but the last word after the comma is the state (joined
    # without spaces); the last word is the zip.
    region_words = locality[1].strip().split(' ')
    item['state'] = "".join(region_words[:-1])
    item['zip_code'] = region_words[-1]

    item['country'] = 'United States'
    item['phone_number'] = response.xpath(
        './/div[@class="shop-full-description"]/p/a/text()'
    ).extract_first()
    item['latitude'] = response.meta['lat']
    item['longitude'] = response.meta['lng']
    item['store_hours'] = description[2].strip()
    item['other_fields'] = ""
    item['coming_soon'] = "0"
    yield item
def body(self, response):
    """Yield a ChainItem for each location card on the page.

    The ``<address>`` text nodes are read positionally: [0] street,
    [1] ``"City, ST ZIP"``, [2] phone. When the collected hours text
    mentions "coming", the store is flagged as coming soon and its
    hours are cleared.
    """
    store_list = response.xpath(
        '//div[contains(@class, "cardgrid__card location__card")]')
    print("========= Checking.......", len(store_list))

    for card in store_list:
        item = ChainItem()
        item['store_name'] = self.validate(
            card.xpath('.//a[1]/text()').extract_first())

        detail = card.xpath('.//address/text()').extract()
        item['address'] = self.validate(detail[0])

        addr = detail[1].split(',')
        item['city'] = self.validate(addr[0].strip())
        state_zip = addr[1].strip().split(' ')
        item['state'] = self.validate(state_zip[0].strip())
        item['zip_code'] = self.validate(state_zip[1].strip())

        item['country'] = 'United States'
        item['phone_number'] = self.validate(detail[2])

        # Keep only paragraphs that are non-empty after validation.
        hour_parts = []
        for paragraph in card.xpath('.//p/text()').extract():
            if self.validate(paragraph) != '':
                hour_parts.append(self.validate(paragraph))
        item['store_hours'] = ', '.join(hour_parts)

        if 'coming' in item['store_hours'].lower():
            item['coming_soon'] = '1'
            item['store_hours'] = ''
        else:
            item['coming_soon'] = '0'
        yield item
def parse_store(self, temp):
    """Turn a list of text fragments for one store into a ChainItem.

    Args:
        temp: sequence of strings describing one store. Its length
            decides which positions hold the street, the
            ``"City, ST ZIP"`` line and the phone number.

    Returns:
        The populated ChainItem, or ``None`` when the fragments cannot
        be parsed.

    When ``self.contain_check(temp)`` is true the store is flagged as
    coming soon (country fixed to United States). Otherwise the country
    is "United States" if the zip parses as an integer and "Canada" if
    it does not or is missing.

    Compared with the previous version the three copies of the address
    parsing are one local helper, the unused ``zip_temp`` is gone, and
    the bare ``except`` clauses are narrowed so KeyboardInterrupt and
    SystemExit are no longer swallowed.
    """
    def fill_location(item, street, city_state_zip):
        # Populate address/city/state/zip from "City, ST ZIP".
        item['address'] = self.validate(street)
        parts = city_state_zip.split(',')
        item['city'] = self.validate(parts[0])
        item['state'] = self.validate(parts[1].strip().split(' ')[0])
        item['zip_code'] = self.validate(parts[1].strip().split(' ')[1])

    try:
        item = ChainItem()

        if self.contain_check(temp):
            item['store_name'] = self.validate(temp[0])
            if len(temp) == 4:
                fill_location(item, temp[2], temp[3])
            item['country'] = 'United States'
            item['coming_soon'] = '1'
            return item

        # len(temp) -> (street index, city/state/zip index, phone index)
        layouts = {
            3: (1, 2, None),
            4: (1, 2, 3),
            5: (2, 3, 4),
        }
        layout = layouts.get(len(temp))
        if layout is not None:
            street_idx, locality_idx, phone_idx = layout
            item['store_name'] = self.validate(temp[0])
            fill_location(item, temp[street_idx], temp[locality_idx])
            if phone_idx is not None:
                item['phone_number'] = self.validate(temp[phone_idx])

        try:
            int(item['zip_code'])
            item['country'] = 'United States'
        except (KeyError, TypeError, ValueError):
            # Zip missing or not purely numeric.
            item['country'] = 'Canada'
        item['coming_soon'] = '0'
        return item
    except Exception:
        # Best effort: the caller receives None for unparseable input.
        return None
def parse(self, response):
    """Yield a ChainItem for every store in the JSON array body.

    ``store_address`` is an HTML fragment. It is split into lines on
    ``<br />`` (or on ``</p>\\n<p>`` when no ``<br />`` is present); the
    last line is then split on single spaces and interpreted as
    city / state / zip depending on how many words it has.
    """
    store_list = json.loads(response.body)
    for store_info in store_list:
        item = ChainItem()
        item['store_number'] = store_info['id']
        item['store_name'] = store_info['title']
        item['address2'] = ''
        address1 = ''
        # Split the HTML address into its lines.
        if store_info['store_address'].find('<br />') != -1:
            address = store_info['store_address'].split('<br />')
        else:
            address = store_info['store_address'].split('</p>\n<p>')
        # Three lines: street, second address line, locality.
        # Otherwise: street, locality.
        if len(address) == 3:
            item['address'] = self.validate(address[0])
            item['address2'] = self.validate(address[1])
            address1 = address[2].split(' ')
        else:
            item['address'] = self.validate(address[0])
            address1 = address[1].split(' ')
        if len(address1) == 4:
            # Four words: two-word city, state, zip.
            item['city'] = self.validate(
                address1[0]) + ' ' + self.validate(address1[1])
            item['state'] = self.validate(address1[2])
            item['zip_code'] = self.validate(address1[3])
        else:
            # Tentatively treat the third word as the zip; int() below
            # decides whether that guess holds.
            item['zip_code'] = self.validate(address1[2])
            try:
                val = int(item['zip_code'])
                item['city'] = self.validate(address1[0])
                item['state'] = self.validate(address1[1])
                # A comma inside the second word means it is not a plain
                # state token.
                if address1[1].find(',') != -1:
                    if address1[1].split(',')[1] == '':
                        # Nothing after the comma: both words form the
                        # city and no state is recorded.
                        item['city'] = self.validate(address1[0] + ' ' +
                                                     address1[1])
                        item['state'] = ''
                    else:
                        # Text before the comma joins the city; text
                        # after it is the state.
                        item['city'] = self.validate(
                            address1[0]) + ' ' + self.validate(
                                address1[1].split(',')[0])
                        item['state'] = address1[1].split(',')[1]
            except ValueError:
                # Third word is not numeric: two-word city, state, and
                # no zip.
                item['city'] = self.validate(
                    address1[0]) + ' ' + self.validate(address1[1])
                item['state'] = self.validate(address1[2])
                item['zip_code'] = ''
        item['country'] = 'United States'
        item['phone_number'] = store_info['store_telephone']
        item['latitude'] = store_info['lat']
        item['longitude'] = store_info['lng']
        item['store_hours'] = self.validate(
            store_info['store_hours']).replace('<br />', '')
        item['other_fields'] = ""
        item['coming_soon'] = "0"
        yield item
def parse(self, response):
    """Yield a ChainItem for each entry under ``features`` in the JSON body.

    ``chain_id`` is copied from ``response.meta``; items whose
    ``chain_id`` is the empty string are not yielded. Store name and
    type are decided by the last ``filterType`` entry: "Walmart" for
    ``WALMARTLOCATION``, "McDonalds" for anything else.

    The optional phone and opening-hours lookups used bare
    ``except: pass``; they now catch only the errors a missing or
    non-string value can produce. As before, hours that fail part-way
    through keep the days collected so far.
    """
    weekdays = (
        ('Mon', 'hoursMonday'),
        ('Tue', 'hoursTuesday'),
        ('Wed', 'hoursWednesday'),
        ('Thu', 'hoursThursday'),
        ('Fri', 'hoursFriday'),
        ('Sat', 'hoursSaturday'),
        ('Sun', 'hoursSunday'),
    )

    stores = json.loads(response.body)
    if 'features' not in stores:
        return

    for store in stores['features']:
        props = store['properties']

        item = ChainItem()
        item['chain_id'] = response.meta['chain_id']
        item['store_number'] = props['identifiers']['gblnumber']
        item['address'] = props['addressLine1'].strip()
        try:
            item['phone_number'] = props['telephone']
        except KeyError:
            # The phone number is optional in this feed.
            pass
        item['latitude'] = store['geometry']['coordinates'][1]
        item['longitude'] = store['geometry']['coordinates'][0]
        item['city'] = props['addressLine3'].strip()
        item['state'] = props['subDivision'].strip()
        item['zip_code'] = props['postcode'].strip()
        item['country'] = props['addressLine4'].strip()
        item['geo_accuracy'] = "Exact"

        try:
            for position, (label, key) in enumerate(weekdays):
                entry = label + ': ' + props['restauranthours'][key]
                if position == 0:
                    item['store_hours'] = entry
                else:
                    item['store_hours'] += '; ' + entry
        except (KeyError, TypeError):
            # Hours block or a single day missing, or a non-string value.
            pass

        item['other_fields'] = ", ".join(props['filterType'])
        for filter_type in props['filterType']:
            if filter_type == 'WALMARTLOCATION':
                item['store_name'] = "Walmart"
                item['store_type'] = "Walmart"
            else:
                item['store_name'] = "McDonalds"
                item['store_type'] = "McDonalds"

        item['coming_soon'] = (
            "1" if props['openstatus'] == 'COMINGSOON' else "0")

        if item['chain_id'] != '':
            yield item
def body(self, response):
    """Yield a ChainItem for each new store in a JSONP response.

    The payload is the text after the fixed jQuery callback prefix, with
    the final two characters dropped, decoded with
    ``raw-unicode-escape`` and parsed as JSON. Store hours are rendered
    as ``"Mon:<open> - <close>; "`` for each day, Monday to Sunday,
    using ``self.parse_time``. Store numbers already in
    ``self.store_number`` are skipped.

    Failures are logged instead of dropping into ``pdb.set_trace()``.
    The old bare ``except`` handlers would block an unattended crawl on
    a debugger prompt and also fired when the generator was closed.
    """
    import logging

    log = logging.getLogger(__name__)
    weekdays = (
        ("Mon", 'mondayHours'),
        ("Tue", 'tuesdayHours'),
        ("Wed", 'wednesdayHours'),
        ("Thu", 'thursdayHours'),
        ("Fri", 'fridayHours'),
        ("Sat", 'saturdayHours'),
        ("Sun", 'sundayHours'),
    )

    try:
        data = response.body.split(
            'jQuery224004805107136324005_1498327407126(')[1].strip()[:-2]
        data = data.decode('raw-unicode-escape')
        store_list = json.loads(data)
    except Exception:
        log.exception("body: could not decode JSONP store payload")
        return

    if not store_list:
        return

    for store in store_list:
        try:
            item = ChainItem()
            item['phone_number'] = store['phoneNumber']
            item['store_number'] = store['storeNumber']
            item['country'] = 'United States'
            item['latitude'] = store['latitude']
            item['longitude'] = store['longitude']
            item['store_name'] = store['name']
            item['other_fields'] = ""
            item['coming_soon'] = "0"
            item['address'] = store['streetAddress']
            item['address2'] = ''
            item['city'] = store['city']
            item['state'] = self.validate(store['province'])
            item['zip_code'] = self.validate(store['postalCode'])
            item['store_hours'] = ''.join(
                label + ":" + self.parse_time(store[key]['openTime']) +
                " - " + self.parse_time(store[key]['closeTime']) + '; '
                for label, key in weekdays)
        except Exception:
            log.exception("body: skipping store that failed to parse")
            continue

        if item['store_number'] in self.store_number:
            continue
        self.store_number.append(item['store_number'])
        yield item
def parse(self, response):
    """Scrape every MasterCuts salon page listed in the sitemap.

    The sitemap and each salon page are fetched with ``urllib2`` and
    scanned line by line for known markup fragments. Country defaults
    to United States and is overridden to Puerto Rico for state ``PR``,
    and to Canada when the zip code contains a space.

    Changes from the previous version:
      * a fresh ChainItem is created per salon. One shared instance was
        re-yielded for every URL, so fields missing on a page silently
        kept the previous salon's values;
      * HTTP responses are closed after use;
      * the state/zip checks use locals, so a page lacking those fields
        no longer raises KeyError;
      * an hours marker on the last line of a page no longer raises
        StopIteration from ``next``.
    """
    from contextlib import closing

    sitemap_url = 'https://www.signaturestyle.com/content/dam/sitemaps/signaturestyle/sitemap_signaturestyle_en_us.xml'
    urls = []
    with closing(urllib2.urlopen(sitemap_url)) as sitemap:
        for line in sitemap:
            if '/mastercuts-' in line and '.html' in line:
                urls.append(line.split('<loc>')[1].split('<')[0])

    for url in urls:
        item = ChainItem()
        item['address2'] = ''
        item['country'] = 'United States'
        state = ''
        zip_code = ''
        sh = ''

        with closing(urllib2.urlopen(url)) as page:
            lines = iter(page)
            for line in lines:
                if 'var salonDetailSalonID = "' in line:
                    item['store_number'] = line.split(
                        'var salonDetailSalonID = "')[1].split('"')[0]
                if '<h2 class="hidden-xs salontitle_salonlrgtxt">' in line:
                    item['store_name'] = line.split(
                        '<h2 class="hidden-xs salontitle_salonlrgtxt">'
                    )[1].split('<')[0]
                if '<span itemprop="streetAddress">' in line:
                    item['address'] = line.split(
                        '<span itemprop="streetAddress">')[1].split('<')[0]
                if 'itemprop="addressLocality">' in line:
                    item['city'] = line.split(
                        'op="addressLocality">')[1].split('<')[0]
                if 'itemprop="addressRegion">' in line:
                    state = line.split(
                        'itemprop="addressRegion">')[1].split('<')[0]
                    item['state'] = state
                if '"postalCode">' in line:
                    zip_code = line.split('"postalCode">')[1].split('<')[0]
                    item['zip_code'] = zip_code
                if 'id="sdp-phone" href="">' in line:
                    item['phone_number'] = line.split(
                        'id="sdp-phone" href="">')[1].split('<')[0]
                if 'itemprop="latitude" content="' in line:
                    item['latitude'] = line.split(
                        'itemprop="latitude" content="')[1].split('"')[0]
                if 'itemprop="longitude" content="' in line:
                    item['longitude'] = line.split(
                        'itemprop="longitude" content="')[1].split('"')[0]
                if '<span class="' in line and 'day">' in line:
                    # The hours value is on the line after the day label.
                    following = next(lines, None)
                    if following is not None and 'content="' in following:
                        hours = following.split('content="')[1].split('"')[0]
                        sh = hours if sh == '' else sh + ';' + hours

        item['store_type'] = "MasterCuts"
        if state == 'PR':
            item['country'] = 'Puerto Rico'
        if ' ' in zip_code:
            item['country'] = 'Canada'
        item['other_fields'] = ''
        item['store_hours'] = sh
        item['coming_soon'] = '0'
        yield item
def parse_stores(self, response):
    """Yield a ChainItem per map marker found in the raw response body.

    The body is not parsed as JSON. The text between ``"markers":[{``
    and ``}],`` is cut out, split into one chunk per store on ``},{``,
    and each field is located by splitting on a marker string. After
    each step ``store`` is rebound to the text following the marker
    just consumed, so the field lookups depend on the order below.
    """
    if response.body:
        _response = response.body
        # Isolate the markers array contents.
        response_list = _response.split('"markers":[{')
        _response = response_list[1]
        response_list = _response.split('}],')
        # Drop two escaped markup sequences before splitting into stores.
        _response = response_list[0].replace('\\u0022\\u003E', '').replace('\\u003C\\/', '')
        store_list = _response.split('},{')
        for store in store_list:
            item = ChainItem()
            # latitude: text after the key, up to the next comma
            store_li = store.split('"latitude":')
            store = store_li[1]
            temp_list = store.split(',')
            item['latitude'] = temp_list[0]
            # longitude: same pattern
            store_li = store.split('"longitude":')
            store = store_li[1]
            temp_list = store.split(',')
            item['longitude'] = temp_list[0]
            # store name: up to the closing quote-comma
            store_li = store.split('"markername":"')
            store = store_li[1]
            temp_list = store.split('",')
            item['store_name'] = temp_list[0]
            # street address: text up to the next 'span'
            store_li = store.split('streetAddress')
            store = store_li[1]
            temp_list = store.split('span')
            item['address'] = temp_list[0].strip()
            # drop the final character of the stripped address
            item['address'] = item['address'][:-1]
            store_li = store.split('postalCode')
            store = store_li[1]
            temp_list = store.split('span')
            item['zip_code'] = temp_list[0].strip()
            store_li = store.split('addressLocality')
            store = store_li[1]
            temp_list = store.split('span')
            item['city'] = temp_list[0].replace('\\n', '').strip()
            store_li = store.split('addressRegion')
            store = store_li[1]
            temp_list = store.split('span')
            item['state'] = temp_list[0].strip()
            # store number / phone / hours: text up to the next 'div'
            store_li = store.split('Store #')
            store = store_li[1]
            temp_list = store.split('div')
            item['store_number'] = temp_list[0].strip()
            store_li = store.split('Phone:')
            store = store_li[1]
            temp_list = store.split('div')
            item['phone_number'] = temp_list[0].replace('a\\u003E', '').strip()
            store_li = store.split('Hours')
            store = store_li[1]
            temp_list = store.split('div')
            # Remove escaped markup from the hours text; escaped <br />
            # line breaks become "; " separators.
            temp_hour = temp_list[0].replace('a\\u003Eem\\u003Ep\\u003E\\n\\n\\u003Cp\\u003E', '').replace('p\\u003E\\n', '').replace('\\u003Cbr \\/\\u003E\\n', '; ').replace('\u0026nbsp;', ' ').replace('\n\u003Cp\u003E', '').strip()
            # If the escape sequence is still present, cut the last 15
            # characters off the hours text.
            if temp_hour.find('\u003E') != -1:
                item['store_hours'] = temp_hour[:-15]
            else:
                item['store_hours'] = temp_hour
            item['country'] = 'United States'
            yield item
    else:
        print('+++++++++++++++++++++++++ no response')
def body(self, response):
    """Yield a ChainItem for each new store under ``Stores`` in the JSON body.

    Store hours are built from two ``OperatingHours`` entries: index 0
    is reported as "Sun - Thu" and index 6 as "Fri - Sat". A store is a
    duplicate when address + phone number is already in
    ``self.history``.

    Failures are logged instead of dropping into ``pdb.set_trace()`` or
    being silently discarded. The old bare ``except`` handlers would
    block an unattended crawl on a debugger prompt and also fired when
    the generator was closed. A store whose hours cannot be parsed is
    still yielded, without ``store_hours``.
    """
    import logging

    log = logging.getLogger(__name__)

    def to_datetime(stamp):
        # Strip 6 leading and 2 trailing characters (presumably a
        # "/Date(<milliseconds>)/" wrapper - confirm) and read the rest
        # as epoch milliseconds.
        return datetime.datetime.utcfromtimestamp(
            int(int(stamp[6:-2]) / 1000))

    def span(entry):
        # "<open> to <close>" in 12-hour clock form, e.g. 09:00AM.
        closes = to_datetime(entry['ClosingTime'])
        opens = to_datetime(entry['OpeningTime'])
        return (opens.strftime('%I:%M%p') + ' to ' +
                closes.strftime('%I:%M%p'))

    try:
        store_list = json.loads(response.body)['Stores']
    except Exception:
        log.exception("body: could not read 'Stores' from response")
        return

    for store in store_list or []:
        try:
            item = ChainItem()
            item['store_name'] = store['Description']
            item['store_number'] = ''
            item['address'] = store['Address1']
            item['address2'] = store['Address2']
            item['city'] = store['City']
            item['state'] = store['State']
            item['zip_code'] = store['Zip']
            item['country'] = 'United States'
            item['phone_number'] = store['Phone']
            item['latitude'] = store['Latitude']
            item['longitude'] = store['Longitude']
            try:
                time1 = "Sun - Thu : " + span(store['OperatingHours'][0])
                time2 = "Fri - Sat : " + span(store['OperatingHours'][6])
                item['store_hours'] = time1 + ", " + time2
            except (KeyError, IndexError, TypeError, ValueError):
                log.warning(
                    "body: could not parse operating hours",
                    exc_info=True)
            item['store_type'] = store['RestType']
            history_key = item['address'] + item['phone_number']
        except Exception:
            log.exception("body: skipping store that failed to parse")
            continue

        if history_key not in self.history:
            self.history.append(history_key)
            yield item
def parse_detail(self, response):
    """Yield a single ``ChainItem`` describing the product on a detail page.

    ``Product_Name`` is the joined text of the product title heading.  Each
    element whose class contains ``data-pair-item`` is treated as a
    label/value pair: the first cleaned text fragment is the label and the
    second is the value.  The label is matched case-insensitively against
    the keywords in ``label_fields`` and the value is stored in the field of
    the first keyword that matches.

    The table is ordered so that "antal per enhet" is tested before
    "enhet"; otherwise a per-unit count row would also be stored as the
    unit, because the shorter keyword is a substring of the longer one.

    Rows with fewer than two text fragments are ignored.  A row that cannot
    be processed is logged and skipped; it never aborts the item.

    :param response: page response supporting ``xpath``.
    :returns: generator producing exactly one ``ChainItem``.
    """
    import logging

    log = logging.getLogger(__name__)

    # (lower-cased label keyword, item field); most specific keyword first.
    label_fields = (
        ('bruttovikt', 'Weight'),
        ('antal per enhet', 'Number_Per_Unit'),
        ('enhet', 'Unit'),
        ('lagringsform', 'Storage_Form'),
        ('antal/hel', 'Number_Whole_Package'),
        ('art.nr leveran', 'Art_Nr_Supplier'),
        ('artikelnr', 'Article_Number'),
        ('land', 'Country'),
        ('gtin', 'GTIN'),
        ('kategori', 'Category'),
    )

    item = ChainItem()
    title_parts = response.xpath(
        '//div[contains(@class, "content-top")]'
        '//h1[contains(@class, "product-title")]//text()').extract()
    item['Product_Name'] = ''.join(self.eliminate_space(title_parts))

    for row in response.xpath('//*[contains(@class, "data-pair-item")]'):
        try:
            cells = self.eliminate_space(row.xpath('.//text()').extract())
            if len(cells) < 2:
                # No value to store for this label.
                continue
            label = cells[0].lower()
            for keyword, field in label_fields:
                if keyword in label:
                    item[field] = cells[1]
                    break
        except (IndexError, KeyError, AttributeError, TypeError) as exc:
            log.warning('parse_detail: skipping unreadable data row: %r', exc)

    yield item
def parse_body(self, response):
    """Yield one ``ChainItem`` per store from ``response.meta['store_list']``.

    The list is read as a flat sequence of text lines.  Lines are buffered
    until one containing 'Click here to get directions' is seen; that line
    closes the current store, the buffered lines are interpreted, the item
    is yielded and the buffer is reset.

    Buffered lines are interpreted in order:

    * a line containing 'Phone' ends the address: the collected address text
      is split with ``usaddress.parse`` into street / city / state / zip,
      the phone number is taken from the line, and the lines after it
      become the store hours.  If the next line starts with a digit group
      (text before the first '-'), it is appended to the phone number
      instead and the hours start one line later;
    * a line whose last word contains 'Center' or 'Marketplace', or which
      contains 'Newport Square', completes the store name;
    * before the name is complete, lines are appended to the name;
    * after the name is complete and before the phone line, lines are
      appended to the address text.

    A phone line that is the last buffered line simply produces empty store
    hours.

    :param response: response whose ``meta['store_list']`` holds the lines.
    :returns: generator of ``ChainItem`` objects.
    """
    pending = []
    for line in response.meta['store_list']:
        if 'Click here to get directions' not in line:
            pending.append(line)
            continue

        item = ChainItem()
        item['store_name'] = ''
        item['store_number'] = ''
        item['country'] = 'United States'
        item['latitude'] = ''
        item['longitude'] = ''
        item['other_fields'] = ""
        item['coming_soon'] = "0"
        item['address2'] = ''

        name_done = False
        address_done = False
        address = ''
        for pos, entry in enumerate(pending):
            last_word = entry.split(' ')[-1]
            if 'Phone' in entry:
                address_done = True
                if ')' in address:
                    # Keep only the text following the first ')'.
                    address = address.split(')')[1]

                city = state = zip_code = street = ''
                for token, label in usaddress.parse(address):
                    token = token.replace(',', '')
                    if label == 'PlaceName':
                        city += token + ' '
                    elif label == 'StateName':
                        state = token
                    elif label == 'ZipCode':
                        zip_code = token
                    else:
                        street += token + ' '
                item['address'] = street
                item['city'] = city
                item['state'] = state
                item['zip_code'] = zip_code
                item['phone_number'] = entry.replace('Phone:', '').strip()

                # Slicing instead of indexing: there may be no next line.
                following = pending[pos + 1:pos + 2]
                if following and following[0].split('-')[0].isdigit():
                    item['store_hours'] = self.validate(
                        " ".join(pending[pos + 2:]))
                    item['phone_number'] += ' ' + following[0]
                else:
                    item['store_hours'] = self.validate(
                        " ".join(pending[pos + 1:]))
            elif ('Center' in last_word or 'Marketplace' in last_word
                  or 'Newport Square' in entry):
                item['store_name'] += entry
                name_done = True
            elif not name_done and not address_done:
                item['store_name'] += entry + ' '
            elif name_done and not address_done:
                address += entry + ' '

        yield item
        pending = []
def parse_store(self, response):
    """Yield one ``ChainItem`` per ``resource_locations_location`` block.

    The address text is split on commas and read in one of three layouts:

    * three parts with a single-word last part:
      ``street | city, state, zip``;
    * three parts with a multi-word last part:
      ``street, city, state zip``;
    * any other number of parts: ``street | city, state zip``.

    A store whose title, address or phone is missing, or whose address does
    not fit the layout it was matched to, is logged and skipped; no
    partially filled item is yielded for it.

    :param response: page response supporting ``xpath``.
    :returns: generator of ``ChainItem`` objects.
    """
    import logging

    log = logging.getLogger(__name__)

    def first_text(node, suffix):
        # First text node of the store's "..._content_<suffix>" div, or
        # None when the div is absent.
        return node.xpath(
            './/div[@class="resource_locations_location_content_' + suffix +
            '"]/text()').extract_first()

    for store in response.xpath(
            './/div[@class="resource_locations_location"]'):
        try:
            item = ChainItem()
            item['store_number'] = ''
            item['store_name'] = first_text(store, 'title').replace(
                u'\u2013', '')

            parts = first_text(store, 'address').split(',')
            if len(parts) == 3 and len(parts[2].strip().split(' ')) == 1:
                street_city = parts[0].strip().split('|')
                item['address'] = street_city[0].strip()
                item['city'] = street_city[1].strip()
                item['state'] = parts[1].strip()
                item['zip_code'] = parts[2].strip()
            elif len(parts) == 3:
                state_zip = parts[2].strip().split(' ')
                item['address'] = parts[0].strip()
                item['city'] = parts[1].strip()
                item['state'] = state_zip[0].strip()
                item['zip_code'] = state_zip[1].strip()
            else:
                street_city = parts[0].strip().split('|')
                state_zip = parts[1].strip().split(' ')
                item['address'] = street_city[0].strip()
                item['city'] = street_city[1].strip()
                item['state'] = state_zip[0].strip()
                item['zip_code'] = state_zip[1].strip()

            item['address2'] = ''
            item['country'] = 'United States'
            # Drop the 'Ph' label and anything after a '|' separator.
            item['phone_number'] = first_text(store, 'phone').replace(
                'Ph', '').strip().split('|')[0]
            item['latitude'] = ''
            item['longitude'] = ''
            item['store_hours'] = first_text(store, 'hours')
            item['other_fields'] = ""
            item['coming_soon'] = "0"
        except (AttributeError, IndexError) as exc:
            # AttributeError: a required div is missing (extract_first gave
            # None); IndexError: the address did not split as expected.
            log.warning('parse_store: skipping malformed store block: %r', exc)
            continue
        yield item