Example #1
0
 def process(self):
     """Turn the downloaded JSON feed into POI rows and pass them on.

     The document at ``self.link`` is fetched through
     ``save_downloaded_soup`` (with a file named ``self.filename`` under
     ``self.download_cache``), its text is decoded as a JSON list, and every
     entry becomes one row in the column order assigned from ``POI_COLS``.

     Entries whose title contains 'BENU Gyógyszertár' get the fixed name
     'Benu gyógyszertár' and keep the stripped title as branch; every other
     entry uses the stripped title as name and has no branch.

     Returns nothing. If the download helper yields ``None`` the method does
     nothing; if no rows were built only a warning is logged.
     """
     soup = save_downloaded_soup(
         '{}'.format(self.link),
         os.path.join(self.download_cache, self.filename))
     if soup is None:
         return
     insert_data = []
     for poi_data in json.loads(soup.get_text()):
         street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
             poi_data['street'])
         title = poi_data['title']
         if 'BENU Gyógyszertár' in title:
             name = 'Benu gyógyszertár'
             branch = title.strip()
         else:
             name = title.strip()
             branch = None
         code = 'hubenupha'
         description = poi_data['description']
         # Slice only when there is a description: the previous code sliced
         # unconditionally and raised TypeError for a None description.
         # NOTE(review): the first 19 characters are dropped, presumably a
         # fixed prefix in front of the actual link - confirm against the feed.
         website = description.strip()[19:] if description is not None else None
         city = clean_city(poi_data['city'])
         postcode = poi_data['postal_code'].strip()
         lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
         geom = check_geom(lat, lon)
         postcode = query_postcode_osm_external(
             self.prefer_osm_postcode, self.session, lat, lon, postcode)
         original = poi_data['street']
         ref = None
         if 'phone' in poi_data and poi_data['phone'] != '':
             phone = clean_phone(poi_data['phone'])
         else:
             phone = None
         email = None
         nonstop = None
         # No opening hours are read from this feed: the seven opening and
         # seven closing columns that follow `nonstop` stay empty.
         insert_data.append([
             code, postcode, city, name, branch, website, original,
             street, housenumber, conscriptionnumber, ref, phone, email,
             geom, nonstop
         ] + [None] * 14)
     if not insert_data:
         logging.warning('Resultset is empty. Skipping ...')
     else:
         df = pd.DataFrame(insert_data)
         df.columns = POI_COLS
         insert_poi_dataframe(self.session, df)
Example #2
0
 def process(self):
     """Turn the downloaded OMV station list into POI rows and pass them on.

     The document is requested through ``save_downloaded_soup`` with
     ``POST_DATA`` as third argument, its text is decoded as JSON, and every
     item of the ``'results'`` list becomes one row in the column order
     assigned from ``POI_COLS``. All rows carry the name 'OMV' and the code
     'huomvfu'.

     A single opening/closing pair per station is obtained from
     ``clean_opening_hours`` and copied to all seven days. A pair of
     '00:00'/'24:00' is stored as ``nonstop = True`` with empty hours.

     Returns nothing. If the download helper yields ``None`` the method does
     nothing; if no rows were built only a warning is logged.
     """
     soup = save_downloaded_soup(
         '{}'.format(self.link),
         os.path.join(self.download_cache, self.filename), POST_DATA)
     if soup is None:
         return
     insert_data = []
     text = json.loads(soup.get_text())
     for poi_data in text['results']:
         name = 'OMV'
         code = 'huomvfu'
         postcode = poi_data['postcode'].strip()
         street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
             poi_data['address_l'])
         city = clean_city(poi_data['town_l'])
         branch = None
         website = None
         # Defaults cover the "no opening hours given" case.
         nonstop = None
         oho, ohc = None, None
         if poi_data['open_hours'] is not None:
             oho, ohc = clean_opening_hours(poi_data['open_hours'])
             if oho == '00:00' and ohc == '24:00':
                 # Round-the-clock stations are flagged instead of timed.
                 nonstop = True
                 oho, ohc = None, None
         original = poi_data['address_l']
         ref = None
         lat, lon = check_hu_boundary(poi_data['y'], poi_data['x'])
         geom = check_geom(lat, lon)
         postcode = query_postcode_osm_external(
             self.prefer_osm_postcode, self.session, lat, lon, postcode)
         if 'telnr' in poi_data and poi_data['telnr'] != '':
             phone = clean_phone(poi_data['telnr'])
         else:
             phone = None
         email = None
         # Seven opening columns followed by seven closing columns, all
         # sharing the single pair read above.
         insert_data.append([
             code, postcode, city, name, branch, website, original,
             street, housenumber, conscriptionnumber, ref, phone, email,
             geom, nonstop
         ] + [oho] * 7 + [ohc] * 7)
     if not insert_data:
         logging.warning('Resultset is empty. Skipping ...')
     else:
         df = pd.DataFrame(insert_data)
         df.columns = POI_COLS
         insert_poi_dataframe(self.session, df)
 def test_clean_phone(self):
     """Check ``clean_phone`` against every fixture in ``self.phones``.

     Each fixture is a mapping holding the raw input under 'original' and
     the expected result under 'converted'.
     """
     for fixture in self.phones:
         original, expected = fixture['original'], fixture['converted']
         # The conversion runs inside the sub-test and the sub-test is
         # labelled with its input, so an exception or mismatch for one
         # fixture is reported with that input and the remaining fixtures
         # are still exercised.
         with self.subTest(original=original):
             self.assertEqual(expected, clean_phone(original))
Example #4
0
 def process(self):
     """Turn the downloaded station table into POI rows and pass them on.

     The table is loaded through ``save_downloaded_pd``. Missing post codes
     and telephone numbers are filled with zero placeholders and both
     columns are cast to int; missing 'City' and 'Name' cells become empty
     strings. Every record then becomes one row in the column order
     assigned from ``POI_COLS``.

     Only the brands 'Shell' and 'Mobilpetrol' are known; records with any
     other brand are skipped with a warning. Records flagged '24 Hour' are
     stored as nonstop without hours, all others get 06:00-22:00 for every
     day.

     Returns nothing. If the download helper yields ``None`` the method does
     nothing; if no rows were built only a warning is logged.
     """
     csv = save_downloaded_pd(
         '{}'.format(self.link),
         os.path.join(self.download_cache, self.filename))
     if csv is None:
         return
     csv[['Post code']] = csv[['Post code']].fillna('0000')
     csv[['Post code']] = csv[['Post code']].astype(int)
     csv[['Telephone']] = csv[['Telephone']].fillna('0')
     csv[['Telephone']] = csv[['Telephone']].astype(int)
     csv[['City']] = csv[['City']].fillna('')
     csv[['Name']] = csv[['Name']].fillna('')
     # Brand value in the table -> (POI name, POI code).
     brands = {
         'Shell': ('Shell', 'hushellfu'),
         'Mobilpetrol': ('Mobil Petrol', 'humobpefu'),
     }
     insert_data = []
     for poi_data in csv.to_dict('records'):
         brand = brands.get(poi_data['Brand'])
         if brand is None:
             # Previously an unknown brand either raised NameError (first
             # record) or silently reused name/code of the preceding record.
             logging.warning('Unknown brand: %s. Skipping ...',
                             poi_data['Brand'])
             continue
         name, code = brand
         postcode = poi_data['Post code']
         # Lower-case the address, then capitalise every word except the
         # last two, which stay lower-case.
         words = poi_data['Address'].lower().split()
         keep_from = max(len(words) - 2, 0)
         words = [w.capitalize() for w in words[:keep_from]] + words[keep_from:]
         street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
             ' '.join(words))
         if poi_data['City'] != '':
             city = clean_city(poi_data['City'].title())
         elif poi_data['Name'] != '':
             # Fall back to the station name when the city cell is empty.
             city = clean_city(poi_data['Name'].title())
         else:
             city = None
         branch = poi_data['Name'].strip()
         website = None
         # Equality (not truthiness or identity) is kept on purpose: the
         # cell type coming out of the table is not visible here.
         if poi_data['24 Hour'] == True:  # noqa: E712
             nonstop, opens, closes = True, None, None
         else:
             nonstop, opens, closes = False, '06:00', '22:00'
         original = poi_data['Address']
         ref = None
         lat, lon = check_hu_boundary(poi_data['GPS Latitude'],
                                      poi_data['GPS Longitude'])
         geom = check_geom(lat, lon)
         postcode = query_postcode_osm_external(
             self.prefer_osm_postcode, self.session, lat, lon, postcode)
         telephone = poi_data['Telephone']
         # The column is an int by now and missing numbers were filled with
         # 0, so the placeholder (not an empty string) marks "no phone".
         if telephone != 0:
             phone = clean_phone(str(telephone))
         else:
             phone = None
         email = None
         # Seven opening columns followed by seven closing columns.
         insert_data.append([
             code, postcode, city, name, branch, website, original,
             street, housenumber, conscriptionnumber, ref, phone, email,
             geom, nonstop
         ] + [opens] * 7 + [closes] * 7)
     if not insert_data:
         logging.warning('Resultset is empty. Skipping ...')
     else:
         df = pd.DataFrame(insert_data)
         df.columns = POI_COLS
         insert_poi_dataframe(self.session, df)
Example #5
0
    def process(self):
        """Turn the shop list embedded in the downloaded page into POI rows.

        The page is fetched through ``save_downloaded_soup``. The shop data
        is taken from the ``<script>`` element whose text matches the
        JavaScript variable ``boltok_nyers``; the matched text is passed to
        ``clean_javascript_variable`` and decoded as a JSON list. Every
        entry becomes one row in the column order assigned from
        ``POI_COLS``.

        Entries whose ``P_NAME`` contains 'Príma' are stored as 'Príma'
        (code 'huprimacon'), all others as 'CBA' (code 'hucbacon').

        Returns nothing. If the download helper yields ``None`` the method
        does nothing; if no rows were built only a warning is logged.
        """
        soup = save_downloaded_soup(
            '{}'.format(self.link),
            os.path.join(self.download_cache, self.filename))
        if soup is None:
            return
        # Raw string: '\s' is not a valid escape in a normal string literal.
        pattern = re.compile(r'^\s*var\s*boltok_nyers.*')
        script = soup.find('script', text=pattern)
        m = pattern.match(script.get_text())
        data = clean_javascript_variable(m.group(0), 'boltok_nyers')
        text = json.loads(data)
        # Field names for day 1..7, in the order the row expects them.
        open_keys = ['PS_OPEN_FROM_{}'.format(day) for day in range(1, 8)]
        close_keys = ['PS_OPEN_TO_{}'.format(day) for day in range(1, 8)]
        insert_data = []
        for poi_data in text:
            street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
                poi_data['A_CIM'])
            city = clean_city(poi_data['A_VAROS'])
            postcode = poi_data['A_IRSZ'].strip()
            branch = poi_data['P_NAME'].strip()
            is_prima = 'Príma' in branch
            name = 'Príma' if is_prima else 'CBA'
            code = 'huprimacon' if is_prima else 'hucbacon'
            website = None
            nonstop = None
            # A None value means no time is given for that day; it is kept
            # as None instead of being passed to the cleaner.
            opens = [
                clean_opening_hours_2(poi_data[key])
                if poi_data[key] is not None else None
                for key in open_keys
            ]
            closes = [
                clean_opening_hours_2(poi_data[key])
                if poi_data[key] is not None else None
                for key in close_keys
            ]
            original = poi_data['A_CIM']
            lat, lon = check_hu_boundary(poi_data['PS_GPS_COORDS_LAT'],
                                         poi_data['PS_GPS_COORDS_LNG'])
            geom = check_geom(lat, lon)
            postcode = query_postcode_osm_external(
                self.prefer_osm_postcode, self.session, lat, lon, postcode)
            ref = None
            if 'PS_PUBLIC_TEL' in poi_data and poi_data['PS_PUBLIC_TEL'] != '':
                phone = clean_phone(poi_data['PS_PUBLIC_TEL'])
            else:
                phone = None
            if 'PS_PUBLIC_EMAIL' in poi_data and poi_data['PS_PUBLIC_EMAIL'] != '':
                email = poi_data['PS_PUBLIC_EMAIL']
            else:
                email = None
            insert_data.append([
                code, postcode, city, name, branch, website, original,
                street, housenumber, conscriptionnumber, ref, phone, email,
                geom, nonstop
            ] + opens + closes)
        if not insert_data:
            logging.warning('Resultset is empty. Skipping ...')
        else:
            df = pd.DataFrame(insert_data)
            df.columns = POI_COLS
            insert_poi_dataframe(self.session, df)
Example #6
0
 def process(self):
     """Turn the marker list embedded in the downloaded page into POI rows.

     The page is fetched through ``save_downloaded_soup``. The data is taken
     from the ``<script>`` element that assigns the JavaScript variable
     ``markers``; single quotes are replaced by double quotes, the text is
     passed to ``clean_javascript_variable`` and decoded as a JSON list.
     Every entry becomes one row in the column order assigned from
     ``POI_COLS``, with the name 'Avia' and the code 'huaviafu'.

     Returns nothing. If the download helper yields ``None`` the method does
     nothing; if no rows were built only a warning is logged.
     """
     soup = save_downloaded_soup(
         '{}'.format(self.link),
         os.path.join(self.download_cache, self.filename))
     if soup is None:
         return
     # Raw string: '\s', '\]' and '\;' are not valid escapes in a normal
     # string literal. The regex itself is unchanged.
     pattern = re.compile(r'var\s*markers\s*=\s*((.*\n)*\]\;)',
                          re.MULTILINE)
     script = soup.find('script', text=pattern)
     m = pattern.search(script.get_text())
     data = m.group(0).replace("'", '"')
     data = clean_javascript_variable(data, 'markers')
     text = json.loads(data)
     insert_data = []
     for poi_data in text:
         # Reset the address parts for every entry. Without this an entry
         # with an empty 'cim' raised NameError (first entry) or silently
         # inherited the address of the preceding entry.
         postcode = city = street = housenumber = conscriptionnumber = None
         if poi_data['cim'] is not None and poi_data['cim'] != '':
             postcode, city, street, housenumber, conscriptionnumber = extract_all_address(
                 poi_data['cim'])
         name = 'Avia'
         code = 'huaviafu'
         branch = None
         if city is None:
             # Fall back to the marker title when no city was extracted.
             city = poi_data['title']
         kutid = poi_data['kutid']
         has_kutid = kutid is not None and kutid != ''
         ref = kutid if has_kutid else None
         lat, lon = check_hu_boundary(poi_data['lat'], poi_data['lng'])
         geom = check_geom(lat, lon)
         postcode = query_postcode_osm_external(
             self.prefer_osm_postcode, self.session, lat, lon, postcode)
         website = '/toltoallomas/?id={}'.format(str(kutid)) if has_kutid else None
         nonstop = None
         original = poi_data['cim']
         if 'tel' in poi_data and poi_data['tel'] != '':
             phone = clean_phone(poi_data['tel'])
         else:
             phone = None
         if 'email' in poi_data and poi_data['email'] != '':
             email = clean_email(poi_data['email'])
         else:
             email = None
         # No opening hours are read from this source: the seven opening and
         # seven closing columns that follow `nonstop` stay empty.
         insert_data.append([
             code, postcode, city, name, branch, website, original,
             street, housenumber, conscriptionnumber, ref, phone, email,
             geom, nonstop
         ] + [None] * 14)
     if not insert_data:
         logging.warning('Resultset is empty. Skipping ...')
     else:
         df = pd.DataFrame(insert_data)
         df.columns = POI_COLS
         insert_poi_dataframe(self.session, df)
Example #7
0
 def process(self):
     """Turn the store list embedded in the downloaded page into POI rows.

     The page is fetched through ``save_downloaded_soup``. The store data is
     the JSON held in the ``data-stores`` attribute of the first element
     that has one. Every store becomes one row in the column order assigned
     from ``POI_COLS``.

     The store name decides the brand: names containing 'xpres' become
     'Tesco Expressz', names containing 'xtra' become 'Tesco Extra', all
     others plain 'Tesco'.

     Returns nothing. If the download helper yields ``None`` the method does
     nothing; if no rows were built only a warning is logged.
     """
     soup = save_downloaded_soup(
         '{}'.format(self.link),
         os.path.join(self.download_cache, self.filename))
     if soup is None:
         return
     script = soup.find(attrs={'data-stores': True})
     text = json.loads(script['data-stores'])
     # Keys of the per-store 'opening' mapping in row order; key '0' fills
     # the last (su_*) column of each group.
     day_keys = ('1', '2', '3', '4', '5', '6', '0')
     insert_data = []
     for poi_data in text:
         street, housenumber, conscriptionnumber = extract_street_housenumber_better_2(
             poi_data['address'])
         city = clean_city(poi_data['city'])
         branch = poi_data['name']
         if 'xpres' in poi_data['name']:
             name = 'Tesco Expressz'
             code = 'hutescoexp'
         elif 'xtra' in poi_data['name']:
             name = 'Tesco Extra'
             code = 'hutescoext'
         else:
             name = 'Tesco'
             code = 'hutescosup'
         website = poi_data['url']
         nonstop = None
         # 'opening' is itself a JSON string; each day maps to a pair whose
         # first item is used as opening and second as closing value.
         opening = json.loads(poi_data['opening'])
         opens = [opening[day][0] for day in day_keys]
         closes = [opening[day][1] for day in day_keys]
         lat, lon = check_hu_boundary(poi_data['gpslat'],
                                      poi_data['gpslng'])
         geom = check_geom(lat, lon)
         # No postcode is read from the store data, so None is passed as the
         # current value.
         postcode = query_postcode_osm_external(
             self.prefer_osm_postcode, self.session, lat, lon, None)
         original = poi_data['address']
         ref = None
         if 'phone' in poi_data and poi_data['phone'] != '':
             phone = clean_phone(poi_data['phone'])
         else:
             phone = None
         email = None
         insert_data.append([
             code, postcode, city, name, branch, website, original,
             street, housenumber, conscriptionnumber, ref, phone, email,
             geom, nonstop
         ] + opens + closes)
     if not insert_data:
         logging.warning('Resultset is empty. Skipping ...')
     else:
         df = pd.DataFrame(insert_data)
         df.columns = POI_COLS
         insert_poi_dataframe(self.session, df)
 def process(self):
     """Turn the JSON file at ``self.link`` into POI rows and pass them on.

     Does nothing when ``self.link`` is falsy. Otherwise the file is opened
     and decoded as JSON, and every item of its ``'results'`` list becomes
     one row in the column order assigned from ``POI_COLS``. The address
     and coordinates of an item are read from the value stored under the
     item's first key.

     When ``self.name`` is 'K&H bank' rows are stored as 'K&H bank' (code
     'hukhbank', not nonstop); otherwise as 'K&H' (code 'hukhatm', nonstop).

     Returns nothing; if no rows were built only a warning is logged.
     """
     if not self.link:
         return
     with open(self.link, 'r') as source_file:
         insert_data = []
         payload = json.load(source_file)
         for entry in payload['results']:
             top_key = next(iter(entry))
             is_bank = self.name == 'K&H bank'
             name, code = ('K&H bank', 'hukhbank') if is_bank else ('K&H', 'hukhatm')
             details = entry[top_key]
             postcode, city, street, housenumber, conscriptionnumber = extract_all_address(
                 details['address'])
             branch = None
             website = None
             # Only the 'hukhatm' rows are flagged as nonstop.
             nonstop = not is_bank
             lat, lon = check_hu_boundary(details['latitude'],
                                          details['longitude'])
             geom = check_geom(lat, lon)
             postcode = query_postcode_osm_external(
                 self.prefer_osm_postcode, self.session, lat, lon,
                 postcode)
             original = details['address']
             ref = None
             # NOTE(review): the phone number is looked up on the outer item,
             # while address and coordinates come from the nested value -
             # confirm against the file layout that this is intended.
             raw_phone = entry.get('phoneNumber', '')
             phone = clean_phone(raw_phone) if raw_phone != '' else None
             email = None
             # No opening hours are read from this file: the seven opening
             # and seven closing columns stay empty.
             row = [
                 code, postcode, city, name, branch, website, original,
                 street, housenumber, conscriptionnumber, ref, phone,
                 email, geom, nonstop
             ]
             row.extend([None] * 14)
             insert_data.append(row)
         if len(insert_data) >= 1:
             df = pd.DataFrame(insert_data)
             df.columns = POI_COLS
             insert_poi_dataframe(self.session, df)
         else:
             logging.warning('Resultset is empty. Skipping ...')