def collect_data(self, data: Restaurant) -> Restaurant:
    """
    Retrieve data from cvrapi and modify the data through every class method
    prefixed by 'append_'.

    Note that it is important that we set the user agent when requesting.
    It should be on the form:
        '<company_name> - <project_name> - <contact_name> [<contact_phone_or_email>]'

    :param data: the restaurant to look up; its ``pnr`` is sent as the
        ``produ`` query parameter
    :return: whatever the parent class' ``collect_data`` returns for the
        (possibly enriched) restaurant
    """
    print('-' * 40)
    print(f'{data.name} | {data.pnr}')

    params = {
        'produ': data.pnr,
        'country': 'dk',
        'token': FilterXMLConfig.cvrapi_api_key()
    }
    headers = {'User-Agent': 'sw814f21 - FindSmiley app - Jonas Andersen'}

    res = get(self.URL, params=params, headers=headers)

    if res.status_code == 200:
        # Only decode the body once we know the lookup succeeded: an error
        # response is not guaranteed to carry a JSON body, and decoding it
        # up front would raise before the skip branch below is reached.
        content = json.loads(res.content.decode('utf-8'))
        for appender in self.appenders:
            data = appender(content, data)
    else:
        print(
            f'Skipping restaurant with p-nr {data.pnr}: record not found remotely'
        )

    return super().collect_data(data)
def _create_file(self):
    """
    (Re)create the log file containing an empty json skeleton: the current
    timestamp under 'time' and an empty object under 'log'.
    """
    with open(self.LOG_FILE, 'w') as log_file:
        created_at = datetime.now().strftime(FilterXMLConfig.iso_fmt())
        skeleton = {'time': created_at, 'log': {}}
        log_file.write(json.dumps(skeleton))
def pre_processing(self, data: list):
    """
    Fetch p-number information for every restaurant in one or more bulk
    requests before the per-restaurant processing starts.

    All non-None p-numbers are split through self.chunks() into groups of
    3000, and each group is POSTed to self.URL as a 'terms' query. Successful
    responses are handed to self.parse_response(); progress is printed as one
    dot per request, followed by 'Done!' after the last one.

    :param data: list of restaurants; only their ``pnr`` attribute is read
    """
    pnrs = [restaurant.pnr for restaurant in data if restaurant.pnr is not None]
    batches = list(self.chunks(pnrs, 3000))
    num_reqs = len(batches)

    print(
        f'Fetching pnr-info on {len(pnrs)} pnrs in {num_reqs} request(s):'
    )

    auth = (FilterXMLConfig.cvr_elastic_username(),
            FilterXMLConfig.cvr_elastic_password())

    for req_nr, batch in enumerate(batches, start=1):
        query = {
            'from': 0,
            'size': 3000,
            'query': {
                'terms': {
                    'VrproduktionsEnhed.pNummer': batch
                }
            },
            # restrict the returned documents to the fields we actually use
            '_source': [
                'VrproduktionsEnhed.livsforloeb.periode.gyldigFra',
                'VrproduktionsEnhed.livsforloeb.periode.gyldigTil',
                'VrproduktionsEnhed.pNummer',
                'VrproduktionsEnhed.produktionsEnhedMetadata.nyesteHovedbranche.branchekode',
                'VrproduktionsEnhed.produktionsEnhedMetadata.nyesteHovedbranche.branchetekst'
            ]
        }

        res = post(self.URL, json=query, auth=auth)
        if res.status_code != 200:
            print("Bad response code")
        else:
            self.parse_response(res.json())

        # terminal progress: a dot per finished request, 'Done!' on the last
        if req_nr == num_reqs:
            print('Done!', flush=True)
        else:
            print('.', end="", flush=True)
def from_json(cls, row: dict) -> SmileyReport:
    """
    Build one SmileyReport from its json representation.

    The dict is expected to look like a row of the smiley_reports list in:
        https://github.com/sw814f21/filter_xml#final-output

    :param row: dict with the keys 'report_id', 'smiley' and 'date'
    :return: the populated SmileyReport
    """
    report = SmileyReport()

    report.report_id = row['report_id']
    # the smiley value is stored as an int
    report.smiley = int(row['smiley'])
    # the date is parsed with the configured iso format
    report.date = datetime.strptime(row['date'], FilterXMLConfig.iso_fmt())

    return report
def from_json(cls, row: dict) -> Restaurant:
    """
    Build one Restaurant from its json representation.

    The dict is expected to look as defined by:
        https://github.com/sw814f21/filter_xml#final-output

    Empty start/end dates become '', empty coordinates and names become None;
    every other field is copied over as-is.

    :param row: dict holding one restaurant
    :return: the populated Restaurant
    """

    def parse_date(raw):
        # empty / missing dates are represented by an empty string
        if not raw:
            return ''
        return datetime.strptime(raw, FilterXMLConfig.iso_fmt())

    def parse_coordinate(raw):
        # falsy coordinates are treated as "unknown"
        if not raw:
            return None
        return float(raw)

    restaurant = Restaurant()

    restaurant.cvrnr = row['cvrnr']
    restaurant.pnr = row['pnr']
    restaurant.region = row['region']
    restaurant.industry_code = row['industry_code']
    restaurant.industry_text = row['industry_text']
    restaurant.start_date = parse_date(row['start_date'])
    restaurant.end_date = parse_date(row['end_date'])

    restaurant.smiley_reports = [
        SmileyReport.from_json(report_row)
        for report_row in row['smiley_reports']
    ]

    restaurant.city = row['city']
    restaurant.elite_smiley = row['elite_smiley']
    restaurant.geo_lat = parse_coordinate(row['geo_lat'])
    restaurant.geo_lng = parse_coordinate(row['geo_lng'])
    restaurant.niche_industry = row['niche_industry']
    restaurant.url = row['url']
    restaurant.address = row['address']

    raw_name = row['name']
    if raw_name:
        restaurant.name = raw_name.strip()
    else:
        restaurant.name = None

    restaurant.name_seq_nr = row['name_seq_nr']
    restaurant.zip_code = row['zip_code']
    restaurant.ad_protection = row['ad_protection']
    restaurant.company_type = row['company_type']
    restaurant.franchise_name = row['franchise_name']

    return restaurant
def get_cvr_handler() -> CVRHandlerBase:
    """
    Retrieve CVR handler as specified by 'provider' in config file.

    :return: a new instance of the handler class registered for the provider
    :raises KeyError: if the configured provider is not one of the known ones
    """
    # Single source of truth for the valid providers, so the error message
    # below can never drift from what is actually accepted.
    handlers = {
        'cvrapi': CVRHandlerCVRAPI,
        'cvr_elastic': CVRHandlerElastic,
        'scrape': CVRHandlerScrape,
    }

    provider = FilterXMLConfig.cvr_provider()
    handler_cls = handlers.get(provider)

    if handler_cls is None:
        valid = ' | '.join(handlers)
        raise KeyError(
            f'provider "{provider}" is invalid, please choose one of '
            f'[ {valid} ]')

    return handler_cls()
def process_smiley_json(self, data: RestaurantCatalog) -> None:
    """
    Processes smiley .json file.

    Includes only production units
    Applies filters from DataHandler
    Collects additional, external data through CVRHandler

    Restaurants that have already been processed (i.e., external data has been collected) are
    stored in processed_companies.csv - handled by PrevProcessedFile.

    Restaurants that have been processed during the current session are stored in temp.csv
    - handled by TempFile. This is done to save progress in the case of a crash during the run.

    Once data has been processed, keys are renamed. Cf. the translation map in _rename_keys()
    NOTE(review): neither PrevProcessedFile nor _rename_keys() is referenced in this
    method - confirm the two paragraphs above are still accurate.

    When the loop is done the collected catalog is diffed against what the outputter
    currently holds, and the resulting insert / update / delete sets are sent to the
    outputter under one shared timestamp token.

    :param data: catalog of restaurants to process
    """

    # restaurants already handled in an earlier (crashed) run of this session
    temp_file = TempFile()
    res = temp_file.get_all()

    total_rows = data.catalog_size
    row_index = 0

    # some cvr handlers want to see the whole catalog before the per-row loop
    if self._cvr_handler.PRE_PROCESSING_STEP:
        self._cvr_handler.pre_processing(data.catalog)

    for restaurant in data.catalog:
        # we use this to avoid using the same fallback in three separate if statements
        row_kept = False

        # if sample size CLI arg is supplied, stop when its reached
        if self._sample_size and res.catalog_size >= self._sample_size:
            break

        # first check if the restaurant is valid
        if restaurant.is_valid_production_unit():

            # then ensure it hasn't already been processed prior to a crash
            if not temp_file.contains(restaurant.name_seq_nr):

                # only sleep if --no-scrape is not passed, and if our cvr provider requests it.
                # (never before the very first row)
                if not self._skip_scrape and self._cvr_handler.SHOULD_SLEEP and row_index > 0:
                    time.sleep(self._cvr_handler.CRAWL_DELAY)

                # only collect data if we haven't passed --no-scrape
                if not self._skip_scrape:
                    restaurant = self._cvr_handler.collect_data(restaurant)

                # check filters to see if we should keep the row
                # otherwise add it to blacklist so we don't scrape it next time
                if self.post_filters.filter(restaurant):
                    if not self._skip_scrape:
                        restaurant = self._smiley_handler.collect_data(
                            restaurant)
                    res.add(restaurant)
                    row_kept = True
                    # persist right away so the row survives a crash
                    temp_file.add_data(restaurant)
                else:
                    Blacklist.add(restaurant)

        # if any check resulted in a row skip, decrement the total row count
        # for terminal output purposes
        if not row_kept:
            total_rows -= 1

        # progress output: sample mode reports collected samples, otherwise rows left
        if self._sample_size:
            if row_kept:
                print(
                    f'Collected {res.catalog_size} of {self._sample_size} samples'
                )
        else:
            print(f'{total_rows - res.catalog_size} rows to go')

        row_index += 1

    self.post_filters.log_filters()

    # one token for all three requests, so they are recognisable as one data version
    token = datetime.now().strftime(FilterXMLConfig.iso_fmt())

    # compare against what the outputter currently holds, then send each set
    res.setup_diff(self._outputter.get())

    self._outputter.insert(res.insert_set(), token)
    self._outputter.update(res.update_set(), token)
    self._outputter.delete(res.delete_set(), token)

    temp_file.close()
    Blacklist.close_file()
def date_string(self) -> str:
    """
    Date of this object as a string property, formatted with the configured
    ISO-8601 format
    """
    report_date = self.date
    return report_date.strftime(FilterXMLConfig.iso_fmt())
def end_date_string(self) -> str:
    """
    ISO-8601 formatted end date string property.

    Evaluates to an empty string when no end date is set.
    """
    if not self.end_date:
        return ''
    return self.end_date.strftime(FilterXMLConfig.iso_fmt())
class DatabaseOutputter(_BaseDataOutputter):
    """
    Outputter that talks to the data API at ENDPOINT.

    Insert, update and delete fall back to FileOutputter whenever the API
    does not acknowledge the request with a 200, including when the request
    itself fails (connection error, timeout, ...).
    """

    ENDPOINT = FilterXMLConfig.data_endpoint()

    def get(self) -> RestaurantCatalog:
        """
        Retrieve all current restaurants from the API

        :return: catalog holding every restaurant the API returned; empty if
                 the API answered with a non-200 status or the request failed
        """
        catalog = RestaurantCatalog()

        try:
            res = requests.get(self.ENDPOINT, timeout=4)
            if res.status_code == 200:
                catalog.add_many(
                    [Restaurant.from_json(row) for row in res.json()])
        # requests raises its own exception hierarchy (timeouts included),
        # which is not covered by the builtin ConnectionError; catch both.
        except (ConnectionError, requests.exceptions.RequestException):
            print('Failed to connect to API')

        return catalog

    def insert(self, data: Union[dict, list], token: str) -> None:
        """
        Send restaurants marked as insert to API

        :param data: a list of restaurants or a single restaurant
        :param token: an identifier for the current session, to ensure that separate
                      POST / PUT / DELETE requests are recognized as a single version of data
        """
        self._send('insert', data, token)

    def update(self, data: Union[dict, list], token: str) -> None:
        """
        Send restaurants marked as update to API

        :param data: a list of restaurants or a single restaurant
        :param token: an identifier for the current session, to ensure that separate
                      POST / PUT / DELETE requests are recognized as a single version of data
        """
        self._send('update', data, token)

    def delete(self, data: Union[dict, list], token: str) -> None:
        """
        Send restaurants marked as delete to API

        :param data: a list of restaurants or a single restaurant
        :param token: an identifier for the current session, to ensure that separate
                      POST / PUT / DELETE requests are recognized as a single version of data
        """
        self._send('delete', data, token)

    def _send(self, action: str, data: Union[dict, list], token: str) -> None:
        """
        Shared implementation of insert / update / delete.

        :param action: 'insert' (POST), 'update' (PUT) or 'delete' (DELETE);
                       also the name of the FileOutputter method used as fallback
        :param data: a list of restaurants or a single restaurant
        :param token: an identifier for the current session
        """
        # nothing to send
        if not data:
            return

        # looked up at call time so the HTTP verb follows the action
        senders = {
            'insert': requests.post,
            'update': requests.put,
            'delete': requests.delete,
        }
        payload = {'timestamp': token, 'data': data}

        try:
            res = senders[action](self.ENDPOINT, json=payload)
            delivered = res.status_code == 200
        except (ConnectionError, requests.exceptions.RequestException):
            # an unreachable API must not lose the data: treat it like a
            # rejected request and use the file fallback below
            delivered = False

        if not delivered:
            print(
                f'Failed to send {action} data to database, writing to file instead'
            )
            getattr(FileOutputter(), action)(data, token)