def process(self):
    """Download the BENU pharmacy JSON feed and insert the POIs.

    Rows are appended to ``insert_data`` in POI_COLS order and bulk
    inserted at the end.
    """
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:  # FIX: identity check instead of `!= None`
        text = json.loads(soup.get_text())
        for poi_data in text:
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['street'])
            if 'BENU Gyógyszertár' not in poi_data['title']:
                name = poi_data['title'].strip()
                branch = None
            else:
                name = 'Benu gyógyszertár'
                branch = poi_data['title'].strip()
            code = 'hubenupha'
            website = poi_data['description'].strip() \
                if poi_data['description'] is not None else None
            # FIX: the [19:] prefix strip used to run unconditionally and
            # raised TypeError when 'description' was None.
            if website is not None:
                website = website[19:]
            nonstop = None
            # Per-day opening/closing columns (th_*/tu_* order follows the
            # POI_COLS column convention used throughout this file).
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            city = clean_city(poi_data['city'])
            postcode = poi_data['postal_code'].strip()
            lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            original = poi_data['street']
            ref = None
            if 'phone' in poi_data and poi_data['phone'] != '':
                phone = clean_phone(poi_data['phone'])
            else:
                phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Download the MOL Bubi bike-share XML feed and insert dock POIs.

    Station names look like "<ref>-<branch>"; both halves are stored.
    """
    xml = save_downloaded_xml(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    root = etree.fromstring(xml)
    for e in root.iter('place'):
        name = 'MOL Bubi'
        code = 'hububibir'
        housenumber = None
        conscriptionnumber = None
        street = None
        city = 'Budapest'
        branch = e.attrib['name'].split('-')[1].strip() \
            if e.attrib['name'] is not None else None
        # FIX: `ref` used to be parsed here and then unconditionally
        # overwritten with None further down; keep the parsed station ref.
        ref = e.attrib['name'].split('-')[0].strip() \
            if e.attrib['name'] is not None else None
        # NOTE(review): capacity is parsed but never written to the row —
        # POI_COLS apparently has no capacity column; confirm.
        capacity = e.attrib['bike_racks'].strip() \
            if e.attrib['bike_racks'] is not None else None
        website = None
        nonstop = True  # docks are available around the clock
        mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
        mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
        lat, lon = check_hu_boundary(e.attrib['lat'].replace(',', '.'),
                                     e.attrib['lng'].replace(',', '.'))
        geom = check_geom(lat, lon)
        postcode = query_postcode_osm_external(self.prefer_osm_postcode,
                                               self.session, lat, lon, None)
        original = None
        phone = None
        email = None
        insert_data.append([
            code, postcode, city, name, branch, website, original, street,
            housenumber, conscriptionnumber, ref, phone, email, geom,
            nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o, mo_c, th_c,
            we_c, tu_c, fr_c, sa_c, su_c
        ])
    # FIX: removed stray debug print(insert_data)
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the local CIB bank/ATM JSON file and insert the POIs.

    ``self.name`` selects between the bank-branch and ATM datasets;
    address fields are derived from the single address string.
    """
    if not self.link:
        return
    with open(self.link, 'r') as fh:
        insert_data = []
        payload = json.load(fh)
        for entry in payload['results']:
            # Each result is a one-key dict; the value holds the POI fields.
            key = next(iter(entry))
            record = entry[key]
            if self.name == 'CIB bank':
                name, code = 'CIB bank', 'hucibbank'
            else:
                name, code = 'CIB', 'hucibatm'
            postcode, city, street, housenumber, conscriptionnumber = \
                extract_all_address(record['address'])
            branch = None
            website = None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            lat, lon = check_hu_boundary(record['latitude'],
                                         record['longitude'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            original = record['address']
            ref = None
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def process(self):
    """Download the MOL filling-station JSON feed (POST) and insert POIs."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename), POST_DATA)
    insert_data = []
    if soup is not None:  # FIX: identity check instead of `!= None`
        text = json.loads(soup.get_text())
        for poi_data in text:
            name = 'MOL'
            code = 'humolfu'
            postcode = poi_data['postcode'].strip()
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['address'])
            city = clean_city(poi_data['city'])
            branch = None
            website = None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            original = poi_data['address']
            ref = None
            lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    # Build MOL Bubi dock POIs via the newer self.data container API.
    # Per-place failures are logged and skipped; a top-level failure logs
    # the raw soup for debugging.
    try:
        soup = save_downloaded_soup(
            '{}'.format(self.link),
            os.path.join(self.download_cache, self.filename), self.filetype)
        for pla in soup.findAll('place'):
            try:
                self.data.name = 'MOL Bubi'
                self.data.code = 'hububibir'
                self.data.city = 'Budapest'
                if pla.get('name') is not None and pla.get('name') != '':
                    # Dock names look like "<ref>-<branch>". A missing '-'
                    # raises IndexError, caught by the inner except below.
                    self.data.branch = pla.get('name').split('-')[1].strip() \
                        if pla.get('name') is not None else None
                    self.data.ref = pla.get('name').split('-')[0].strip() \
                        if pla.get('name') is not None else None
                self.data.nonstop = True
                # self.data.capacity = pla.attrib['bike_racks'].strip() \
                #     if pla.attrib['bike_racks'] is not None else None
                self.data.lat, self.data.lon = \
                    check_hu_boundary(pla.get('lat').replace(',', '.'),
                                      pla.get('lng').replace(',', '.'))
                # NOTE(review): arguments are passed as (lon, lat) here while
                # other call sites in this file pass (lat, lon) — confirm the
                # expected parameter order of query_postcode_osm_external.
                self.data.postcode = query_postcode_osm_external(
                    True, self.session, self.data.lon, self.data.lat, None)
                self.data.public_holiday_open = True
                self.data.add()
            except Exception as e:
                logging.error(e)
                logging.error(pla)
                logging.exception('Exception occurred')
    except Exception as e:
        logging.error(e)
        logging.exception('Exception occurred')
        logging.error(soup)
def process(self):
    """Download the OMV filling-station JSON feed (POST) and insert POIs.

    A single daily opening interval applies to all seven days; a
    00:00-24:00 interval is normalized to ``nonstop``.
    """
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename), POST_DATA)
    insert_data = []
    if soup is not None:  # FIX: identity check instead of `!= None`
        text = json.loads(soup.get_text())
        for poi_data in text['results']:
            name = 'OMV'
            code = 'huomvfu'
            postcode = poi_data['postcode'].strip()
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['address_l'])
            city = clean_city(poi_data['town_l'])
            branch = None
            website = None
            nonstop = None
            if poi_data['open_hours'] is not None:
                oho, ohc = clean_opening_hours(poi_data['open_hours'])
                if oho == '00:00' and ohc == '24:00':
                    # Full-day interval means the station is nonstop.
                    nonstop = True
                    oho, ohc = None, None
            else:
                oho, ohc = None, None
            # Same interval every day of the week.
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = oho
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = ohc
            original = poi_data['address_l']
            ref = None
            lat, lon = check_hu_boundary(poi_data['y'], poi_data['x'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            if 'telnr' in poi_data and poi_data['telnr'] != '':
                phone = clean_phone(poi_data['telnr'])
            else:
                phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Download the Hungarian Post XML dataset and insert POIs.

    ``ServicePointType`` selects the POI kind: 'PM' post office,
    'CS' parcel locker, 'PP' PostaPont partner.
    """
    xml = save_downloaded_xml(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    root = etree.fromstring(xml)
    for e in root.findall('post'):
        sp_type = e.find('ServicePointType').text
        if sp_type == 'PM':
            name = 'Posta'
            code = 'hupostapo'
        elif sp_type == 'CS':
            name = 'Posta csomagautomata'
            code = 'hupostacso'
        elif sp_type == 'PP':
            name = 'PostaPont'
            code = 'hupostapp'
        else:
            # FIX: previously fell through with name/code undefined (or
            # stale from the previous iteration), corrupting the row.
            logging.error('Non existing Posta type.')
            continue
        postcode = e.get('zipCode')
        street_tmp_1 = e.find('street/name').text.strip() \
            if e.find('street/name').text is not None else None
        street_tmp_2 = e.find('street/type').text.strip() \
            if e.find('street/type').text is not None else None
        if street_tmp_1 is None:
            street = None
        elif street_tmp_2 is None:
            street = street_tmp_1
        else:
            # Both parts present: join street name and street type.
            # (FIX: removed an unreachable 'Non handled state!' else branch.)
            street = '{} {}'.format(street_tmp_1, street_tmp_2)
        housenumber = e.find('street/houseNumber').text.strip().lower() \
            if e.find('street/houseNumber').text is not None else None
        conscriptionnumber = None
        city = clean_city(e.find('city').text)
        branch = e.find('name').text if e.find('name').text is not None else None
        website = None
        nonstop = None
        mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
        mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
        lat, lon = check_hu_boundary(
            e.find('gpsData/WGSLat').text.replace(',', '.'),
            e.find('gpsData/WGSLon').text.replace(',', '.'))
        geom = check_geom(lat, lon)
        postcode = query_postcode_osm_external(self.prefer_osm_postcode,
                                               self.session, lat, lon, postcode)
        original = None
        ref = None
        phone = None
        email = None
        insert_data.append([
            code, postcode, city, name, branch, website, original, street,
            housenumber, conscriptionnumber, ref, phone, email, geom,
            nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o, mo_c, th_c,
            we_c, tu_c, fr_c, sa_c, su_c
        ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Scrape Rossmann shops from the JS `places` variable and insert POIs."""
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename), None,
        self.verify_link)
    insert_data = []
    if soup is not None:  # FIX: identity check instead of `!= None`
        # The shop list is embedded in the page as a JS variable.
        pattern = re.compile(r'^\s*var\s*places.*')  # FIX: raw string regex
        script = soup.find('script', text=pattern)
        m = pattern.match(script.get_text())
        data = m.group(0)
        data = clean_javascript_variable(data, 'places')
        text = json.loads(data)
        for poi_data in text:
            poi_data = poi_data['addresses'][0]
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['address'])
            name = 'Rossmann'
            code = 'hurossmche'
            city = clean_city(poi_data['city'])
            postcode = poi_data['zip'].strip()
            branch = None
            website = None
            nonstop = False
            # Per-day opening hours; the *_o/*_c variable order (mo, th, we,
            # tu, fr, sa, su) mirrors the POI_COLS column convention.
            day_hours = []
            for day in ('monday', 'tuesday', 'wednesday', 'thursday',
                        'friday', 'saturday', 'sunday'):
                if poi_data['business_hours'][day] is not None:
                    day_hours.append(
                        clean_opening_hours(poi_data['business_hours'][day]))
                else:
                    day_hours.append((None, None))
            (mo_o, mo_c), (th_o, th_c), (we_o, we_c), (tu_o, tu_c), \
                (fr_o, fr_c), (sa_o, sa_c), (su_o, su_c) = day_hours
            lat, lon = check_hu_boundary(poi_data['position'][0],
                                         poi_data['position'][1])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            original = poi_data['address']
            ref = None
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Download the Spar store JSON feed and insert POIs.

    The store subtype (Expressz / Interspar / supermarket) is derived
    from the raw store name; the numeric store ref is extracted with
    PATTERN_SPAR_REF.
    """
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:  # FIX: identity check instead of `!= None`
        text = json.loads(soup.get_text())
        for poi_data in text:
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['address'])
            if 'xpres' in poi_data['name']:
                name = 'Spar Expressz'
                code = 'husparexp'
            elif 'INTER' in poi_data['name']:
                name = 'Interspar'
                code = 'husparint'
            else:
                # 'market' and any other variant are plain Spar supermarkets.
                name = 'Spar'
                code = 'husparsup'
            # Normalize casing before extracting branch/ref from the name.
            poi_data['name'] = poi_data['name'].replace('INTERSPAR', 'Interspar')
            poi_data['name'] = poi_data['name'].replace('SPAR', 'Spar')
            ref_match = PATTERN_SPAR_REF.search(poi_data['name'])
            ref = ref_match.group(1).strip() if ref_match is not None else None
            city = clean_city(poi_data['city'])
            postcode = poi_data['zipCode'].strip()
            branch = poi_data['name'].split('(')[0].strip()
            website = poi_data['pageUrl'].strip()
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            lat, lon = check_hu_boundary(poi_data['latitude'],
                                         poi_data['longitude'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            original = poi_data['address']
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Download the Shell/Mobilpetrol station CSV and insert POIs.

    Missing CSV cells are filled with neutral placeholders before type
    coercion; 24-hour stations become nonstop, others get a default
    06:00-22:00 schedule.
    """
    csv = save_downloaded_pd(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    if csv is not None:
        # Placeholder fills so astype(int) below cannot fail on NaN.
        csv[['Post code']] = csv[['Post code']].fillna('0000')
        csv[['Post code']] = csv[['Post code']].astype(int)
        csv[['Telephone']] = csv[['Telephone']].fillna('0')
        csv[['Telephone']] = csv[['Telephone']].astype(int)
        csv[['City']] = csv[['City']].fillna('')
        csv[['Name']] = csv[['Name']].fillna('')
        insert_data = []
        poi_dict = csv.to_dict('records')
        for poi_data in poi_dict:
            if poi_data['Brand'] == 'Shell':
                name = 'Shell'
                code = 'hushellfu'
            elif poi_data['Brand'] == 'Mobilpetrol':
                name = 'Mobil Petrol'
                code = 'humobpefu'
            else:
                # FIX: unknown brand used to fall through with name/code
                # undefined or stale from the previous row.
                logging.warning('Unknown brand: %s', poi_data['Brand'])
                continue
            postcode = poi_data['Post code']
            street_tmp = poi_data['Address'].lower().split()
            # Title-case all but the last two tokens (presumably the house
            # number part — TODO confirm against the data source).
            for i in range(0, len(street_tmp) - 2):
                street_tmp[i] = street_tmp[i].capitalize()
            street_tmp = ' '.join(street_tmp)
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(street_tmp)
            if poi_data['City'] != '':
                city = clean_city(poi_data['City'].title())
            elif poi_data['Name'] != '':
                city = clean_city(poi_data['Name'].title())
            else:
                city = None
            branch = poi_data['Name'].strip()
            website = None
            # `== True` kept deliberately: the value comes from pandas and
            # may be a numpy bool or NaN.
            if poi_data['24 Hour'] == True:
                nonstop = True
                mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
                mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            else:
                nonstop = False
                mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = '06:00'
                mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = '22:00'
            original = poi_data['Address']
            ref = None
            lat, lon = check_hu_boundary(poi_data['GPS Latitude'],
                                         poi_data['GPS Longitude'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            # FIX: Telephone was fillna('0') + astype(int) above, so the old
            # `!= ''` check never filtered the placeholder; compare to 0.
            if 'Telephone' in poi_data and poi_data['Telephone'] != 0:
                phone = clean_phone(str(poi_data['Telephone']))
            else:
                phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
        if len(insert_data) < 1:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
def online_poi_matching(args):
    """Match dataframe POIs against the OSM database and enrich them.

    args is a (data, comm_data) tuple: `data` is the POI dataframe being
    enriched in place and returned; `comm_data` maps poi_common_id to
    poi_type. For each row, an existing OSM POI is searched; on a hit the
    row gets OSM id/type/version/changeset/timestamp, refined coordinates
    and postcode, and live tags (via cache or the OSM API). On a miss the
    row is flagged new, optionally relocated into a matching building,
    and its postcode refined.
    """
    data, comm_data = args
    try:
        db = POIBase('{}://{}:{}@{}:{}/{}'.format(
            config.get_database_type(), config.get_database_writer_username(),
            config.get_database_writer_password(),
            config.get_database_writer_host(),
            config.get_database_writer_port(),
            config.get_database_poi_database()))
        pgsql_pool = db.pool
        session_factory = sessionmaker(pgsql_pool)
        Session = scoped_session(session_factory)
        session = Session()
        osm_live_query = OsmApi()
        for i, row in data.iterrows():
            # for i, row in data[data['poi_code'].str.contains('posta')].iterrows():
            try:
                # Try to search OSM POI with same type, and name contains
                # poi_search_name within the specified distance
                osm_query = db.query_osm_shop_poi_gpd(
                    row.get('poi_lon'), row.get('poi_lat'),
                    comm_data.loc[comm_data['pc_id'] == row.get(
                        'poi_common_id')]['poi_type'].values[0],
                    row.get('poi_search_name'),
                    row.get('poi_search_avoid_name'),
                    row.get('poi_addr_street'),
                    row.get('poi_addr_housenumber'),
                    row.get('poi_conscriptionnumber'), row.get('poi_city'),
                    row.get('osm_search_distance_perfect'),
                    row.get('osm_search_distance_safe'),
                    row.get('osm_search_distance_unsafe'))
                # Enrich our data with OSM database POI metadata
                if osm_query is not None:
                    row['poi_new'] = False
                    # Collect additional OSM metadata. Note: this needs
                    # style change during osm2pgsql
                    osm_id = osm_query['osm_id'].values[0] if osm_query.get(
                        'osm_id') is not None else None
                    osm_node = osm_query.get('node').values[0] \
                        if osm_query.get('node') is not None else None
                    # Set OSM POI coordinates for all kind of geom
                    lat = osm_query.get('lat').values[0]
                    lon = osm_query.get('lon').values[0]
                    if data.at[i, 'poi_lat'] != lat and data.at[i, 'poi_lon'] != lon:
                        logging.info(
                            'Using new coodinates %s %s instead of %s %s.',
                            lat, lon, data.at[i, 'poi_lat'],
                            data.at[i, 'poi_lon'])
                        data.at[i, 'poi_lat'] = lat
                        data.at[i, 'poi_lon'] = lon
                    # Map the textual OSM object type onto the enum.
                    if osm_node == 'node':
                        osm_node = OSM_object_type.node
                    elif osm_node == 'way':
                        osm_node = OSM_object_type.way
                    elif osm_node == 'relation':
                        osm_node = OSM_object_type.relation
                    else:
                        logging.warning('Illegal state: %s',
                                        osm_query['node'].values[0])
                    data.at[i, 'osm_id'] = osm_id
                    data.at[i, 'osm_node'] = osm_node
                    # Refine postcode
                    if row['preserve_original_post_code'] is not True:
                        # Current OSM postcode based on lat,long query.
                        postcode = query_postcode_osm_external(
                            config.get_geo_prefer_osm_postcode(), session,
                            lon, lat, row.get('poi_postcode'))
                        force_postcode_change = False  # TODO: Has to be a setting in app.conf
                        if force_postcode_change is True:
                            # Force to use datasource postcode
                            if postcode != row.get('poi_postcode'):
                                logging.info(
                                    'Changing postcode from %s to %s.',
                                    row.get('poi_postcode'), postcode)
                                data.at[i, 'poi_postcode'] = postcode
                        else:
                            # Try to use smart method for postcode check
                            ch_posctode = smart_postcode_check(
                                row, osm_query, postcode)
                            if ch_posctode is not None:
                                data.at[i, 'poi_postcode'] = ch_posctode
                    else:
                        logging.info('Preserving original postcode %s',
                                     row.get('poi_postcode'))
                    # NOTE(review): these `is not None` checks test the whole
                    # pandas Series, which is never None — likely intended to
                    # test the value; confirm.
                    data.at[i, 'osm_version'] = osm_query['osm_version'].values[0] \
                        if osm_query['osm_version'] is not None else None
                    data.at[i, 'osm_changeset'] = osm_query['osm_changeset'].values[0] \
                        if osm_query['osm_changeset'] is not None else None
                    if osm_query['osm_timestamp'] is not None:
                        osm_query['osm_timestamp'] = \
                            data.at[i, 'osm_timestamp'] = pd.to_datetime(
                                str((osm_query['osm_timestamp'].values[0])))
                    else:
                        osm_query['osm_timestamp'] = None
                    data.at[i, 'poi_distance'] = osm_query.get(
                        'distance').values[0] if osm_query.get(
                            'distance') is not None else None
                    # For OSM way also query node points
                    if osm_node == OSM_object_type.way:
                        logging.info(
                            'This is an OSM way looking for id %s nodes.',
                            osm_id)
                        # Add list of nodes to the dataframe
                        nodes = db.query_ways_nodes(osm_id)
                        data.at[i, 'osm_nodes'] = nodes
                    elif osm_node == OSM_object_type.relation:
                        logging.info(
                            'This is an OSM relation looking for id %s nodes.',
                            osm_id)
                        # Add list of relation nodes to the dataframe
                        nodes = db.query_relation_nodes(osm_id)
                        data.at[i, 'osm_nodes'] = nodes
                    logging.info(
                        'Old %s (not %s) type: %s POI within %s m: %s %s, %s %s (%s)',
                        data.at[i, 'poi_search_name'],
                        data.at[i, 'poi_search_avoid_name'],
                        data.at[i, 'poi_type'], data.at[i, 'poi_distance'],
                        data.at[i, 'poi_postcode'], data.at[i, 'poi_city'],
                        data.at[i, 'poi_addr_street'],
                        data.at[i, 'poi_addr_housenumber'],
                        data.at[i, 'poi_conscriptionnumber'])
                    try:
                        # Download OSM POI way live tags
                        if osm_node == OSM_object_type.way:
                            for rtc in range(0, RETRY):
                                logging.info(
                                    'Downloading OSM live tags to this way: %s.',
                                    osm_id)
                                cached_way = db.query_from_cache(
                                    osm_id, osm_node)
                                if cached_way is None:
                                    live_tags_container = osm_live_query.WayGet(
                                        osm_id)
                                    if live_tags_container is not None:
                                        data.at[
                                            i,
                                            'osm_live_tags'] = live_tags_container.get(
                                                'tag')
                                        cache_row = {
                                            'osm_id': int(osm_id),
                                            'osm_live_tags':
                                                live_tags_container.get('tag'),
                                            'osm_version':
                                                live_tags_container.get('version'),
                                            'osm_user':
                                                live_tags_container.get('user'),
                                            'osm_user_id':
                                                live_tags_container.get('uid'),
                                            'osm_changeset':
                                                live_tags_container.get('changeset'),
                                            'osm_timestamp':
                                                live_tags_container.get('timestamp'),
                                            'osm_object_type': osm_node,
                                            'osm_lat': None,
                                            'osm_lon': None,
                                            'osm_nodes':
                                                live_tags_container.get('nd')
                                        }
                                        get_or_create_cache(
                                            session, POI_OSM_cache, **cache_row)
                                        # Downloading referenced nodes of the way
                                        for way_nodes in live_tags_container['nd']:
                                            logging.debug(
                                                'Getting node %s belongs to way %s',
                                                way_nodes, osm_id)
                                            live_tags_node = osm_live_query.NodeGet(
                                                way_nodes)
                                            cache_row = {
                                                'osm_id': int(way_nodes),
                                                'osm_live_tags':
                                                    live_tags_node.get('tag'),
                                                'osm_version':
                                                    live_tags_node.get('version'),
                                                'osm_user':
                                                    live_tags_node.get('user'),
                                                'osm_user_id':
                                                    live_tags_node.get('uid'),
                                                'osm_changeset':
                                                    live_tags_node.get('changeset'),
                                                'osm_timestamp':
                                                    live_tags_node.get('timestamp'),
                                                'osm_object_type':
                                                    OSM_object_type.node,
                                                'osm_lat':
                                                    live_tags_node.get('lat'),
                                                'osm_lon':
                                                    live_tags_node.get('lon'),
                                                'osm_nodes': None
                                            }
                                            get_or_create_cache(
                                                session, POI_OSM_cache, **cache_row)
                                        break
                                    else:
                                        logging.warning(
                                            'Download of external data has failed.')
                                else:
                                    data.at[i, 'osm_live_tags'] = cached_way.get(
                                        'osm_live_tags')
                                    break
                            session.commit()
                        # Download OSM POI node live tags
                        elif osm_node == OSM_object_type.node:
                            for rtc in range(0, RETRY):
                                logging.info(
                                    'Downloading OSM live tags to this node: %s.',
                                    osm_id)
                                cached_node = db.query_from_cache(
                                    osm_id, osm_node)
                                if cached_node is None:
                                    live_tags_container = osm_live_query.NodeGet(
                                        osm_id)
                                    if live_tags_container is not None:
                                        data.at[
                                            i,
                                            'osm_live_tags'] = live_tags_container.get(
                                                'tag')
                                        cache_row = {
                                            'osm_id': int(osm_id),
                                            'osm_live_tags':
                                                live_tags_container.get('tag'),
                                            'osm_version':
                                                live_tags_container.get('version'),
                                            'osm_user':
                                                live_tags_container.get('user'),
                                            'osm_user_id':
                                                live_tags_container.get('uid'),
                                            'osm_changeset':
                                                live_tags_container.get('changeset'),
                                            'osm_timestamp':
                                                live_tags_container.get('timestamp'),
                                            'osm_object_type': osm_node,
                                            'osm_lat':
                                                live_tags_container.get('lat'),
                                            'osm_lon':
                                                live_tags_container.get('lon'),
                                            'osm_nodes': None
                                        }
                                        get_or_create_cache(
                                            session, POI_OSM_cache, **cache_row)
                                        break
                                    else:
                                        logging.warning(
                                            'Download of external data has failed.')
                                else:
                                    data.at[i, 'osm_live_tags'] = cached_node.get(
                                        'osm_live_tags')
                                    break
                            session.commit()
                        elif osm_node == OSM_object_type.relation:
                            for rtc in range(0, RETRY):
                                logging.info(
                                    'Downloading OSM live tags to this relation: %s.',
                                    osm_id)
                                live_tags_container = osm_live_query.RelationGet(
                                    abs(osm_id))
                                if live_tags_container is not None:
                                    data.at[
                                        i,
                                        'osm_live_tags'] = live_tags_container.get(
                                            'tag')
                                    break
                                else:
                                    logging.warning(
                                        'Download of external data has failed.')
                            session.commit()
                        else:
                            logging.warning('Invalid state for live tags.')
                    except Exception as e:
                        logging.warning(
                            'There was an error during OSM request: %s.', e)
                        logging.exception('Exception occurred')
                        # NOTE(review): cached_node is only bound on the node
                        # branch — this line can raise NameError for way/
                        # relation failures; confirm intent.
                        logging.warning('Live tag is: {}'.format(
                            cached_node.get('osm_live_tags')))
                # This is a new POI
                else:
                    # This is a new POI - will add fix me tag to the new items.
                    data.at[i, 'poi_new'] = True
                    # Get the first character of then name of POI and generate
                    # a floating number between 0 and 1 for a PostGIS function:
                    # https://postgis.net/docs/ST_LineInterpolatePoint.html
                    # If there is more than one POI in a building this will
                    # try to do a different location and not only on center
                    # or not only on edge
                    ib = row.get('poi_name')
                    if ib is not None:
                        ibp = 1 - (((ord(ib[0]) // 16) + 1) / 17)
                    else:
                        ibp = 0.50
                    # Refine postcode
                    osm_bulding_q = db.query_osm_building_poi_gpd(
                        row.get('poi_lon'), row.get('poi_lat'),
                        row.get('poi_city'), row.get('poi_postcode'),
                        row.get('poi_addr_street'),
                        row.get('poi_addr_housenumber'),
                        in_building_percentage=ibp)
                    if osm_bulding_q is not None:
                        # NOTE(review): stray trailing comma after this call
                        # makes the statement a discarded 1-tuple — harmless
                        # but should be removed.
                        logging.info(
                            'Relocating POI coordinates to the building with same address: %s %s, %s %s',
                            row.get('poi_lat'), row.get('poi_lon'),
                            osm_bulding_q.get('lat')[0],
                            osm_bulding_q.get('lon')[0]),
                        row['poi_lat'], row['poi_lon'] = osm_bulding_q.get(
                            'lat')[0], osm_bulding_q.get('lon')[0]
                    else:
                        logging.info(
                            'The POI is already in its building or there is no building match. \
Keeping POI coordinates as is as.')
                    if row['preserve_original_post_code'] is not True:
                        postcode = query_postcode_osm_external(
                            config.get_geo_prefer_osm_postcode(), session,
                            data.at[i, 'poi_lon'], data.at[i, 'poi_lat'],
                            row.get('poi_postcode'))
                        if postcode != row.get('poi_postcode'):
                            logging.info('Changing postcode from %s to %s.',
                                         row.get('poi_postcode'), postcode)
                            data.at[i, 'poi_postcode'] = postcode
                    else:
                        logging.info('Preserving original postcode %s',
                                     row.get('poi_postcode'))
                    logging.info(
                        'New %s (not %s) type: %s POI: %s %s, %s %s (%s)',
                        row.get('poi_search_name'),
                        row.get('poi_search_avoid_name'),
                        row.get('poi_type'), row.get('poi_postcode'),
                        row.get('poi_city'), row.get('poi_addr_street'),
                        row.get('poi_addr_housenumber'),
                        row.get('poi_conscriptionnumber'))
            except Exception as e:
                # Per-row failure: log and continue with the next POI.
                logging.error(e)
                logging.error(row)
                logging.exception('Exception occurred')
        session.commit()
        return data
    except Exception as e:
        logging.error(e)
        logging.exception('Exception occurred')
def process(self):
    """Scrape CBA / Príma shops from the JS `boltok_nyers` variable.

    PS_OPEN_FROM_1..7 / PS_OPEN_TO_1..7 hold per-day opening and closing
    times (1 = Monday ... 7 = Sunday).
    """
    soup = save_downloaded_soup(
        '{}'.format(self.link),
        os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:  # FIX: identity check instead of `!= None`
        # The shop list is embedded in the page as a JS variable.
        pattern = re.compile(r'^\s*var\s*boltok_nyers.*')  # FIX: raw string
        script = soup.find('script', text=pattern)
        m = pattern.match(script.get_text())
        data = m.group(0)
        data = clean_javascript_variable(data, 'boltok_nyers')
        text = json.loads(data)
        for poi_data in text:
            street, housenumber, conscriptionnumber = \
                extract_street_housenumber_better_2(poi_data['A_CIM'])
            city = clean_city(poi_data['A_VAROS'])
            postcode = poi_data['A_IRSZ'].strip()
            branch = poi_data['P_NAME'].strip()
            name = 'Príma' if 'Príma' in branch else 'CBA'
            code = 'huprimacon' if 'Príma' in branch else 'hucbacon'
            website = None
            nonstop = None
            # Collapse the 14 copy-pasted per-day ternaries into two loops.
            opens = [
                clean_opening_hours_2(poi_data['PS_OPEN_FROM_{}'.format(d)])
                if poi_data['PS_OPEN_FROM_{}'.format(d)] is not None else None
                for d in range(1, 8)
            ]
            closes = [
                clean_opening_hours_2(poi_data['PS_OPEN_TO_{}'.format(d)])
                if poi_data['PS_OPEN_TO_{}'.format(d)] is not None else None
                for d in range(1, 8)
            ]
            mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o = opens
            mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c = closes
            original = poi_data['A_CIM']
            lat, lon = check_hu_boundary(poi_data['PS_GPS_COORDS_LAT'],
                                         poi_data['PS_GPS_COORDS_LNG'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            ref = None
            if 'PS_PUBLIC_TEL' in poi_data and poi_data['PS_PUBLIC_TEL'] != '':
                phone = clean_phone(poi_data['PS_PUBLIC_TEL'])
            else:
                phone = None
            if 'PS_PUBLIC_EMAIL' in poi_data and poi_data['PS_PUBLIC_EMAIL'] != '':
                email = poi_data['PS_PUBLIC_EMAIL']
            else:
                email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop, mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c
            ])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Download the Avia fuel-station list and load the extracted POIs into the database.

    The station data lives in a JavaScript ``markers`` variable inside the
    downloaded page; it is located by regex, normalized to JSON and parsed.
    """
    soup = save_downloaded_soup('{}'.format(self.link),
                                os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:  # fix: identity comparison with None (was `soup != None`)
        # fix: raw string — '\s', '\]', '\;' are invalid escapes in a plain string.
        pattern = re.compile(r'var\s*markers\s*=\s*((.*\n)*\]\;)', re.MULTILINE)
        script = soup.find('script', text=pattern)
        m = pattern.search(script.get_text())
        data = m.group(0)
        # JavaScript uses single quotes; JSON requires double quotes.
        data = data.replace("'", '"')
        data = clean_javascript_variable(data, 'markers')
        text = json.loads(data)
        for poi_data in text:
            # Robustness fix: skip records without an address — every derived
            # field below depends on it, and continuing would otherwise reuse
            # stale values from the previous iteration (or raise NameError on
            # the first one).
            if poi_data['cim'] is None or poi_data['cim'] == '':
                continue
            postcode, city, street, housenumber, conscriptionnumber = extract_all_address(
                poi_data['cim'])
            name = 'Avia'
            code = 'huaviafu'
            branch = None
            if city is None:
                # extract_all_address may fail to find a city; fall back to the title.
                city = poi_data['title']
            ref = poi_data['kutid'] if poi_data['kutid'] is not None \
                and poi_data['kutid'] != '' else None
            lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
            geom = check_geom(lat, lon)
            # May override the extracted postcode with one derived from OSM data.
            postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session,
                                                   lat, lon, postcode)
            if poi_data['kutid'] is not None and poi_data['kutid'] != '':
                website = '/toltoallomas/?id={}'.format(str(poi_data['kutid']))
            else:
                website = None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            original = poi_data['cim']
            if 'tel' in poi_data and poi_data['tel'] != '':
                phone = clean_phone(poi_data['tel'])
            else:
                phone = None
            if 'email' in poi_data and poi_data['email'] != '':
                email = clean_email(poi_data['email'])
            else:
                email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original, street,
                housenumber, conscriptionnumber, ref, phone, email, geom, nonstop,
                mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Download the Tesco store list and load the extracted POIs into the database.

    The store data is attached to an HTML tag as a JSON ``data-stores``
    attribute. Store subtype (Expressz / Extra / supermarket) is inferred
    from the store name.
    """
    soup = save_downloaded_soup('{}'.format(self.link),
                                os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:  # fix: identity comparison with None (was `soup != None`)
        # Find whatever tag carries the JSON store list in 'data-stores'.
        script = soup.find(attrs={'data-stores': True})
        text = json.loads(script['data-stores'])
        for poi_data in text:
            # Assign: code, postcode, city, name, branch, website, original,
            # street, housenumber, conscriptionnumber, ref, geom
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['address'])
            city = clean_city(poi_data['city'])
            branch = poi_data['name']
            # Substring match tolerates both capitalizations ('Expressz'/'expressz').
            if 'xpres' in poi_data['name']:
                name = 'Tesco Expressz'
                code = 'hutescoexp'
            elif 'xtra' in poi_data['name']:
                name = 'Tesco Extra'
                code = 'hutescoext'
            else:
                name = 'Tesco'
                code = 'hutescosup'
            website = poi_data['url']
            nonstop = None
            # Opening hours: JSON object keyed by weekday ('1' = Monday ...
            # '6' = Saturday, '0' = Sunday), each value an [open, close] pair.
            # This replaces 14 copy-pasted index lookups.
            opening = json.loads(poi_data['opening'])
            day_keys = ('1', '2', '3', '4', '5', '6', '0')
            opens = [opening[day][0] for day in day_keys]
            closes = [opening[day][1] for day in day_keys]
            lat, lon = check_hu_boundary(poi_data['gpslat'], poi_data['gpslng'])
            geom = check_geom(lat, lon)
            # No provider postcode available; try to derive one from OSM data.
            postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session,
                                                   lat, lon, None)
            original = poi_data['address']
            ref = None
            if 'phone' in poi_data and poi_data['phone'] != '':
                phone = clean_phone(poi_data['phone'])
            else:
                phone = None
            email = None
            # opens/closes are already in the positional order POI_COLS expects.
            insert_data.append([code, postcode, city, name, branch, website, original,
                                street, housenumber, conscriptionnumber, ref, phone,
                                email, geom, nonstop] + opens + closes)
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Download the Foxpost parcel-locker list (plain JSON) and load the POIs
    into the database.

    Per-day opening hours arrive keyed by Hungarian day names and are
    normalized via clean_opening_hours().
    """
    soup = save_downloaded_soup('{}'.format(self.link),
                                os.path.join(self.download_cache, self.filename))
    insert_data = []
    if soup is not None:  # fix: identity comparison with None (was `soup != None`)
        text = json.loads(soup.get_text())
        for poi_data in text:
            name = 'Foxpost'
            code = 'hufoxpocso'
            postcode = poi_data['zip'].strip()
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['street'])
            city = clean_city(poi_data['city'])
            branch = poi_data['name']
            website = None
            nonstop = None
            # Opening hours keyed by Hungarian day names, Monday ('hetfo') first.
            # This loop replaces seven copy-pasted if/else blocks.
            opens = []
            closes = []
            for day_key in ('hetfo', 'kedd', 'szerda', 'csutortok',
                            'pentek', 'szombat', 'vasarnap'):
                day_hours = poi_data['open'][day_key]
                if day_hours is not None:
                    day_open, day_close = clean_opening_hours(day_hours)
                else:
                    day_open, day_close = None, None
                opens.append(day_open)
                closes.append(day_close)
            original = poi_data['address']
            ref = None
            lat, lon = check_hu_boundary(poi_data['geolat'], poi_data['geolng'])
            geom = check_geom(lat, lon)
            # May override the provider postcode with one derived from OSM data.
            postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session,
                                                   lat, lon, postcode)
            phone = None
            email = None
            # opens/closes are already in the positional order POI_COLS expects.
            insert_data.append([code, postcode, city, name, branch, website, original,
                                street, housenumber, conscriptionnumber, ref, phone,
                                email, geom, nonstop] + opens + closes)
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)
def process(self):
    """Parse the cached Kulcs patika JSON file and load the extracted POIs
    into the database.

    NOTE(review): unlike sibling providers this reads the already-downloaded
    cache file directly instead of going through save_downloaded_soup();
    the dead, commented-out download path has been removed.
    """
    with open(os.path.join(self.download_cache, self.filename), 'r') as f:
        insert_data = []
        text = json.load(f)
        for poi_data in text:
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['cim'])
            # Chain stores get the canonical chain name with the full title as
            # branch; independently named pharmacies keep their own name.
            if 'Kulcs patika' not in poi_data['nev']:
                name = poi_data['nev'].strip()
                branch = None
            else:
                name = 'Kulcs patika'
                branch = poi_data['nev'].strip()
            code = 'hukulcspha'
            website = poi_data['link'].strip() if poi_data['link'] is not None else None
            nonstop = None
            mo_o = th_o = we_o = tu_o = fr_o = sa_o = su_o = None
            mo_c = th_c = we_c = tu_c = fr_c = sa_c = su_c = None
            city = clean_city(poi_data['helyseg'])
            lat, lon = check_hu_boundary(poi_data['marker_position']['latitude'],
                                         poi_data['marker_position']['longitude'])
            geom = check_geom(lat, lon)
            # May override the provider postcode with one derived from OSM data.
            postcode = query_postcode_osm_external(self.prefer_osm_postcode, self.session,
                                                   lat, lon, poi_data['irsz'].strip())
            original = poi_data['cim']
            ref = None
            phone = None
            email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original, street,
                housenumber, conscriptionnumber, ref, phone, email, geom, nonstop,
                mo_o, th_o, we_o, tu_o, fr_o, sa_o, su_o,
                mo_c, th_c, we_c, tu_c, fr_c, sa_c, su_c])
    if len(insert_data) < 1:
        logging.warning('Resultset is empty. Skipping ...')
    else:
        df = pd.DataFrame(insert_data)
        df.columns = POI_COLS
        insert_poi_dataframe(self.session, df)