def process(self):
    soup = save_downloaded_soup('{}'.format(self.link),
                                os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        text = json.loads(soup.get_text())
        for poi_data in text:
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['street'])
            if 'BENU Gyógyszertár' not in poi_data['title']:
                name = poi_data['title'].strip()
                branch = None
            else:
                name = 'Benu gyógyszertár'
                branch = poi_data['title'].strip()
            code = 'hubenupha'
            website = poi_data['description'].strip() if poi_data['description'] is not None else None
            # Drop the fixed-length prefix from the description; guard against a missing value
            # so the slice does not fail on None.
            website = website[19:] if website is not None else None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            city = clean_city(poi_data['city'])
            postcode = poi_data['postal_code'].strip()
            lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session, lat, lon,
                                                   postcode)
            original = poi_data['street']
            ref = None
            if 'phone' in poi_data and poi_data['phone'] != '':
                phone = clean_phone(poi_data['phone'])
            else:
                phone = None
            email = None
            insert_data.append(
                [code, postcode, city, name, branch, website, original, street, housenumber,
                 conscriptionnumber, ref, phone, email, geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o,
                 sa_o, su_o, mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def process(self):
    soup = save_downloaded_soup('{}'.format(self.link),
                                os.path.join(self.download_cache, self.filename), POST_DATA)
    insert_data = []
    if soup is not None:
        text = json.loads(soup.get_text())
        for poi_data in text['results']:
            name = 'OMV'
            code = 'huomvfu'
            postcode = poi_data['postcode'].strip()
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['address_l'])
            city = clean_city(poi_data['town_l'])
            branch = None
            website = None
            nonstop = None
            if poi_data['open_hours'] is not None:
                oho, ohc = clean_opening_hours(poi_data['open_hours'])
                if oho == '00:00' and ohc == '24:00':
                    nonstop = True
                    oho, ohc = None, None
            else:
                oho, ohc = None, None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = oho
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = ohc
            original = poi_data['address_l']
            ref = None
            lat, lon = check_hu_boundary(poi_data['y'], poi_data['x'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session, lat, lon,
                                                   postcode)
            if 'telnr' in poi_data and poi_data['telnr'] != '':
                phone = clean_phone(poi_data['telnr'])
            else:
                phone = None
            email = None
            insert_data.append(
                [code, postcode, city, name, branch, website, original, street, housenumber,
                 conscriptionnumber, ref, phone, email, geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o,
                 sa_o, su_o, mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def test_clean_phone(self):
    for i in self.phones:
        original, ph = i['original'], i['converted']
        a = clean_phone(original)
        with self.subTest():
            self.assertEqual(ph, a)
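# A minimal sketch of the fixture shape test_clean_phone() above assumes: `self.phones` is an
# iterable of dicts carrying the raw 'original' phone string and the 'converted' value that
# clean_phone() is expected to return. The entry below is illustrative only and is not taken
# from the project's real test data.
#
# def setUp(self):
#     self.phones = [
#         {'original': '<raw phone number as scraped>', 'converted': '<expected normalized form>'},
#     ]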
def process(self):
    csv = save_downloaded_pd('{}'.format(self.link),
                             os.path.join(self.download_cache, self.filename))
    if csv is not None:
        csv[['Post code']] = csv[['Post code']].fillna('0000')
        csv[['Post code']] = csv[['Post code']].astype(int)
        csv[['Telephone']] = csv[['Telephone']].fillna('0')
        csv[['Telephone']] = csv[['Telephone']].astype(int)
        csv[['City']] = csv[['City']].fillna('')
        csv[['Name']] = csv[['Name']].fillna('')
        insert_data = []
        poi_dict = csv.to_dict('records')
        for poi_data in poi_dict:
            if poi_data['Brand'] == 'Shell':
                name = 'Shell'
                code = 'hushellfu'
            elif poi_data['Brand'] == 'Mobilpetrol':
                name = 'Mobil Petrol'
                code = 'humobpefu'
            else:
                # Skip rows with an unexpected brand so name/code are never referenced unassigned.
                continue
            postcode = poi_data['Post code']
            # Capitalize every word of the address except the last two (house number and suffix).
            street_tmp = poi_data['Address'].lower().split()
            for i in range(0, len(street_tmp) - 2):
                street_tmp[i] = street_tmp[i].capitalize()
            street_tmp = ' '.join(street_tmp)
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(street_tmp)
            if poi_data['City'] != '':
                city = clean_city(poi_data['City'].title())
            else:
                if poi_data['Name'] != '':
                    city = clean_city(poi_data['Name'].title())
                else:
                    city = None
            branch = poi_data['Name'].strip()
            website = None
            if poi_data['24 Hour'] is True:
                nonstop = True
                mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
                mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            else:
                nonstop = False
                mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = '06:00'
                mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = '22:00'
            original = poi_data['Address']
            ref = None
            lat, lon = check_hu_boundary(poi_data['GPS Latitude'], poi_data['GPS Longitude'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session, lat, lon,
                                                   postcode)
            if 'Telephone' in poi_data and poi_data['Telephone'] != '':
                phone = clean_phone(str(poi_data['Telephone']))
            else:
                phone = None
            email = None
            insert_data.append(
                [code, postcode, city, name, branch, website, original, street, housenumber,
                 conscriptionnumber, ref, phone, email, geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o,
                 sa_o, su_o, mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def process(self):
    soup = save_downloaded_soup('{}'.format(self.link),
                                os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        # Parse the HTML with Beautiful Soup and pull the raw JavaScript store list.
        pattern = re.compile(r'^\s*var\s*boltok_nyers.*')
        script = soup.find('script', text=pattern)
        m = pattern.match(script.get_text())
        data = m.group(0)
        data = clean_javascript_variable(data, 'boltok_nyers')
        text = json.loads(data)
        # for l in text:
        #     print('postcode: {postcode}; city: {city}; address: {address}; alt_name: {alt_name}'.format(
        #         postcode=l['A_IRSZ'], city=l['A_VAROS'], address=l['A_CIM'], alt_name=l['P_NAME']))
        for poi_data in text:
            # Assign: code, postcode, city, name, branch, website, original, street, housenumber,
            # conscriptionnumber, ref, geom
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['A_CIM'])
            city = clean_city(poi_data['A_VAROS'])
            postcode = poi_data['A_IRSZ'].strip()
            branch = poi_data['P_NAME'].strip()
            name = 'Príma' if 'Príma' in branch else 'CBA'
            code = 'huprimacon' if 'Príma' in branch else 'hucbacon'
            website = None
            nonstop = None
            mo_o = clean_opening_hours_2(poi_data['PS_OPEN_FROM_1']) if poi_data['PS_OPEN_FROM_1'] is not None else None
            th_o = clean_opening_hours_2(poi_data['PS_OPEN_FROM_2']) if poi_data['PS_OPEN_FROM_2'] is not None else None
            we_o = clean_opening_hours_2(poi_data['PS_OPEN_FROM_3']) if poi_data['PS_OPEN_FROM_3'] is not None else None
            tu_o = clean_opening_hours_2(poi_data['PS_OPEN_FROM_4']) if poi_data['PS_OPEN_FROM_4'] is not None else None
            fr_o = clean_opening_hours_2(poi_data['PS_OPEN_FROM_5']) if poi_data['PS_OPEN_FROM_5'] is not None else None
            sa_o = clean_opening_hours_2(poi_data['PS_OPEN_FROM_6']) if poi_data['PS_OPEN_FROM_6'] is not None else None
            su_o = clean_opening_hours_2(poi_data['PS_OPEN_FROM_7']) if poi_data['PS_OPEN_FROM_7'] is not None else None
            mo_c = clean_opening_hours_2(poi_data['PS_OPEN_TO_1']) if poi_data['PS_OPEN_TO_1'] is not None else None
            th_c = clean_opening_hours_2(poi_data['PS_OPEN_TO_2']) if poi_data['PS_OPEN_TO_2'] is not None else None
            we_c = clean_opening_hours_2(poi_data['PS_OPEN_TO_3']) if poi_data['PS_OPEN_TO_3'] is not None else None
            tu_c = clean_opening_hours_2(poi_data['PS_OPEN_TO_4']) if poi_data['PS_OPEN_TO_4'] is not None else None
            fr_c = clean_opening_hours_2(poi_data['PS_OPEN_TO_5']) if poi_data['PS_OPEN_TO_5'] is not None else None
            sa_c = clean_opening_hours_2(poi_data['PS_OPEN_TO_6']) if poi_data['PS_OPEN_TO_6'] is not None else None
            su_c = clean_opening_hours_2(poi_data['PS_OPEN_TO_7']) if poi_data['PS_OPEN_TO_7'] is not None else None
            original = poi_data['A_CIM']
            lat, lon = check_hu_boundary(poi_data['PS_GPS_COORDS_LAT'], poi_data['PS_GPS_COORDS_LNG'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session, lat, lon,
                                                   postcode)
            ref = None
            if 'PS_PUBLIC_TEL' in poi_data and poi_data['PS_PUBLIC_TEL'] != '':
                phone = clean_phone(poi_data['PS_PUBLIC_TEL'])
            else:
                phone = None
            if 'PS_PUBLIC_EMAIL' in poi_data and poi_data['PS_PUBLIC_EMAIL'] != '':
                email = poi_data['PS_PUBLIC_EMAIL']
            else:
                email = None
            insert_data.append(
                [code, postcode, city, name, branch, website, original, street, housenumber,
                 conscriptionnumber, ref, phone, email, geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o,
                 sa_o, su_o, mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def process(self):
    soup = save_downloaded_soup('{}'.format(self.link),
                                os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        # Parse the HTML with Beautiful Soup and extract the JavaScript `markers` array.
        pattern = re.compile(r'var\s*markers\s*=\s*((.*\n)*\]\;)', re.MULTILINE)
        script = soup.find('script', text=pattern)
        m = pattern.search(script.get_text())
        data = m.group(0)
        data = data.replace("'", '"')
        data = clean_javascript_variable(data, 'markers')
        text = json.loads(data)
        for poi_data in text:
            if poi_data['cim'] is not None and poi_data['cim'] != '':
                postcode, city, street, housenumber, conscriptionnumber = extract_all_address(
                    poi_data['cim'])
                name = 'Avia'
                code = 'huaviafu'
                branch = None
                if city is None:
                    city = poi_data['title']
                ref = poi_data['kutid'] if poi_data['kutid'] is not None and poi_data['kutid'] != '' else None
                lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
                geom = check_geom(lat, lon)
                postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session, lat, lon,
                                                       postcode)
                website = '/toltoallomas/?id={}'.format(str(poi_data['kutid'])) \
                    if poi_data['kutid'] is not None and poi_data['kutid'] != '' else None
                nonstop = None
                mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
                mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
                original = poi_data['cim']
                if 'tel' in poi_data and poi_data['tel'] != '':
                    phone = clean_phone(poi_data['tel'])
                else:
                    phone = None
                if 'email' in poi_data and poi_data['email'] != '':
                    email = clean_email(poi_data['email'])
                else:
                    email = None
                insert_data.append(
                    [code, postcode, city, name, branch, website, original, street, housenumber,
                     conscriptionnumber, ref, phone, email, geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o,
                     sa_o, su_o, mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def process(self):
    soup = save_downloaded_soup('{}'.format(self.link),
                                os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:
        # Parse the HTML with Beautiful Soup; the store list is embedded in a data-stores attribute.
        # script = soup.find('div', attrs={'data-stores': True})
        script = soup.find(attrs={'data-stores': True})
        text = json.loads(script['data-stores'])
        for poi_data in text:
            # Assign: code, postcode, city, name, branch, website, original, street, housenumber,
            # conscriptionnumber, ref, geom
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['address'])
            city = clean_city(poi_data['city'])
            branch = poi_data['name']
            if 'xpres' in poi_data['name']:
                name = 'Tesco Expressz'
                code = 'hutescoexp'
            elif 'xtra' in poi_data['name']:
                name = 'Tesco Extra'
                code = 'hutescoext'
            else:
                name = 'Tesco'
                code = 'hutescosup'
            website = poi_data['url']
            nonstop = None
            # Opening hours arrive as a JSON object keyed by weekday ('0' = Sunday), each value
            # holding an [open, close] pair.
            opening = json.loads(poi_data['opening'])
            mo_o = opening['1'][0]
            th_o = opening['2'][0]
            we_o = opening['3'][0]
            tu_o = opening['4'][0]
            fr_o = opening['5'][0]
            sa_o = opening['6'][0]
            su_o = opening['0'][0]
            mo_c = opening['1'][1]
            th_c = opening['2'][1]
            we_c = opening['3'][1]
            tu_c = opening['4'][1]
            fr_c = opening['5'][1]
            sa_c = opening['6'][1]
            su_c = opening['0'][1]
            lat, lon = check_hu_boundary(poi_data['gpslat'], poi_data['gpslng'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session, lat, lon, None)
            original = poi_data['address']
            ref = None
            if 'phone' in poi_data and poi_data['phone'] != '':
                phone = clean_phone(poi_data['phone'])
            else:
                phone = None
            email = None
            insert_data.append(
                [code, postcode, city, name, branch, website, original, street, housenumber,
                 conscriptionnumber, ref, phone, email, geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o,
                 sa_o, su_o, mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def process(self):
    if self.link:
        with open(self.link, 'r') as f:
            insert_data = []
            text = json.load(f)
            for poi_data in text['results']:
                first_element = next(iter(poi_data))
                if self.name == 'K&H bank':
                    name = 'K&H bank'
                    code = 'hukhbank'
                else:
                    name = 'K&H'
                    code = 'hukhatm'
                postcode, city, street, housenumber, conscriptionnumber = extract_all_address(
                    poi_data[first_element]['address'])
                branch = None
                website = None
                if code == 'hukhatm':
                    nonstop = True
                else:
                    nonstop = False
                mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
                mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
                lat, lon = check_hu_boundary(poi_data[first_element]['latitude'],
                                             poi_data[first_element]['longitude'])
                geom = check_geom(lat, lon)
                postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session, lat, lon,
                                                       postcode)
                original = poi_data[first_element]['address']
                ref = None
                if 'phoneNumber' in poi_data and poi_data['phoneNumber'] != '':
                    phone = clean_phone(poi_data['phoneNumber'])
                else:
                    phone = None
                email = None
                insert_data.append(
                    [code, postcode, city, name, branch, website, original, street, housenumber,
                     conscriptionnumber, ref, phone, email, geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o,
                     sa_o, su_o, mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c])
            if len(insert_data) < 1:
                logging.warning('Resultset is empty. Skipping ...')
            else:
                df = pd.DataFrame(insert_data)
                df.columns = POI_COLS
                insert_poi_dataframe(self.session, df)
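# Note on row layout: every process() above appends its rows in the same fixed order
# (code, postcode, city, name, branch, website, original, street, housenumber,
# conscriptionnumber, ref, phone, email, geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o,
# su_o, mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c) and then assigns df.columns = POI_COLS, so
# POI_COLS (defined elsewhere in the project) must list its 29 column names in exactly this
# sequence; the actual column names it uses are not shown here.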