# Imports assumed by the functions below; app_logger, constants, headers,
# PROCESSED_DATA_DIR and RESULTS_DIR come from the project's own modules
# and are not reproduced here.
import json
import os
import random
import time
from http import HTTPStatus
from typing import List

import dask.dataframe as dd
import pandas as pd
import requests


def retrieve_reports(url, market, dst_file, date_val, cols) -> DownloadDetails:
    request_data = dict(market=market, date=date_val)
    response = requests.post(url=url,
                             headers=headers.get_random_headers(),
                             data=request_data)
    if response.status_code == requests.codes.ok:
        try:
            json_val = json.loads(response.text)
            results = json_val['results']
        except (ValueError, KeyError) as e:
            app_logger.error(e, exc_info=True)
        else:
            app_logger.info("Downloaded entries for date %s OK" % date_val)
            # save only if there's some non-empty data
            if results:
                if save_to_file(dst_csv_file_name=dst_file,
                                csv_cols=cols,
                                data=results):
                    app_logger.info("Saved entries for date %s OK" % date_val)
                    return DownloadDetails(skipped=False, status=True)
            else:
                app_logger.warning("Skipped empty entries for date %s" % date_val)
                return DownloadDetails(skipped=True, status=True)
    else:
        app_logger.error(
            "Data for %s is not available, request returned %d status" %
            (date_val, response.status_code))
    # fall-through: JSON parse error, missing 'results' key, or failed save
    return DownloadDetails(skipped=False, status=False)
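# Neither DownloadDetails nor save_to_file appears in this snippet; the
# sketch below is an assumption of their shape, inferred from the call sites
# above (a simple result record, and a CSV writer that reports success),
# not the project's actual implementation.
import csv
from typing import NamedTuple


class DownloadDetails(NamedTuple):
    skipped: bool
    status: bool


def save_to_file(dst_csv_file_name, csv_cols, data) -> bool:
    # write a header row followed by one row per result record;
    # return False on I/O errors so the caller can log the failure
    try:
        with open(dst_csv_file_name, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_cols, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(data)
        return True
    except OSError as e:
        app_logger.error("Unable to save %s: %s" % (dst_csv_file_name, str(e)))
        return False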
def del_file(fpath):
    try:
        os.remove(fpath)
    except Exception as e:
        app_logger.warning(
            "Unable to remove file %s, exception occurred: %s" % (fpath, str(e)))
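# clear_dir (called by fetch_data below) is not part of this snippet; a
# minimal sketch, assuming it removes every regular file in the given
# directory via del_file and leaves subdirectories untouched:
def clear_dir(dir_path):
    for entry in os.listdir(dir_path):
        fpath = dir_path / entry
        if fpath.is_file():
            del_file(fpath)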
def preprocess_data() -> bool:
    app_logger.info("Preparing data started...")
    start = time.time()
    series: List[dd.Series] = []
    for f in os.listdir(constants.DATA_DIR):
        if not f.endswith(".csv"):
            app_logger.warning("Non-CSV file found in DATA_DIR: %s" % f)
            continue
        app_logger.info("Processing %s" % f)
        try:
            if len(series) < 2:
                df = dd.read_csv(constants.DATA_DIR / f, header=None)
                if len(df.columns) != PriceDataProcessor.REQUIRED_CSV_FORMAT_COLUMNS_COUNT:
                    app_logger.error(
                        "File %s has an insufficient number of columns: required %d, found %d" %
                        (f, PriceDataProcessor.REQUIRED_CSV_FORMAT_COLUMNS_COUNT,
                         len(df.columns)))
                    continue
                # we are interested in the 4th column's values
                fourth_col: dd.Series = df.iloc[:, 3]
                unique_vals_series = fourth_col.drop_duplicates()
                series.append(unique_vals_series)
            if len(series) == 2:
                # merge the two Series into one and remove duplicates,
                # keeping the result in the first list element
                s = dd.concat(series).drop_duplicates()
                del series[-1]
                series[0] = s
        except Exception as e:
            app_logger.error("Processing file %s had errors: %s" % (f, str(e)))
        app_logger.info("Processing %s done" % f)
    if series:
        s: dd.Series = series[0]
        s.to_csv(constants.PROCESSED_DATA_DIR / "single.csv",
                 single_file=True,
                 index=False,
                 header=False)
    else:
        app_logger.error("Prepare data: could not generate the result CSV file")
    app_logger.info("Preparing data completed in %s seconds" %
                    str(time.time() - start))
    return bool(series)
def init_handler():
    processor = PriceDataProcessor()
    if processor.fetch_data():
        if processor.get_errors():
            problem_urls = processor.get_problem_urls()
            app_logger.warning("The following urls were not downloaded: %s" %
                               ", ".join(problem_urls))
        processor.preprocess_data()
    else:
        app_logger.error("All downloads failed")
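# A plausible entry point, assuming init_handler is meant to be invoked
# directly (the snippet does not show how the project actually wires it up):
if __name__ == "__main__":
    init_handler()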
def run(self):
    postcodes_file_path = PROCESSED_DATA_DIR / "single.csv"
    if not postcodes_file_path.exists():
        app_logger.error("No 'single.csv' file %s found. Exiting..." %
                         postcodes_file_path)
        return
    # order of column names is important
    columns = ["Address", "Postcode", "Council Tax band",
               "Local authority reference number"]
    for chunk_df in pd.read_csv(postcodes_file_path,
                                chunksize=100,
                                header=None,
                                usecols=[0]):
        for _, row in chunk_df.iterrows():
            postcode = row[0]
            result_file = RESULTS_DIR / "{}.csv".format(postcode.replace(" ", "_"))
            if result_file.exists():
                app_logger.warning("Skipping result file %s, already exists" %
                                   result_file)
                continue
            app_logger.info("Scraping %s postcode started" % postcode)
            items = self.query(postcode=postcode)
            result_list = [[result_item.address,
                            postcode,
                            result_item.council_tax_band,
                            result_item.local_auth_ref_number]
                           for result_item in items] if items else []
            result_df = pd.DataFrame(result_list, columns=columns)
            result_df.to_csv(result_file, index=False)
            if items:
                app_logger.info("Scraping %s postcode completed" % postcode)
            else:
                app_logger.info("Scraping %s postcode completed, "
                                "but it discovered no entries" % postcode)
            # sleep a random 5-60 seconds between requests
            secs = random.randint(5, 60)
            app_logger.info("Sleeping %d seconds" % secs)
            time.sleep(secs)
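# The result items returned by self.query are defined elsewhere in the
# scraper; a minimal sketch of the record shape run() relies on (three
# attributes read per item), assuming a simple NamedTuple:
from typing import NamedTuple


class ResultItem(NamedTuple):
    address: str
    council_tax_band: str
    local_auth_ref_number: str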
def fetch_data(self) -> bool:
    self.errors = False
    self.problem_urls = []
    self.clear_dir(constants.DATA_DIR)
    self.clear_dir(constants.PROCESSED_DATA_DIR)
    app_logger.info("Fetching started")
    if not constants.DOWNLOAD_LINKS:
        app_logger.warning("no links to download")
        self.errors = True
        return False
    start = time.time()
    for url in constants.DOWNLOAD_LINKS:
        app_logger.info("downloading %s..." % url)
        name = self.get_file_name(url)
        h = headers.get_random_headers()
        h['Referer'] = self.REFERER
        try:
            resp = requests.get(url=url, stream=True, headers=h)
            if resp.status_code != HTTPStatus.OK:
                app_logger.error("Request error: bad response code " +
                                 str(resp.status_code))
                self.problem_urls.append(url)
                self.errors = True
                continue
            app_logger.info("saving %s..." % url)
            with open(constants.DATA_DIR / name, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=self.CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)
            app_logger.info("saved")
        except requests.exceptions.RequestException as e:
            app_logger.error("Request error: " + str(e))
            self.problem_urls.append(url)
            self.errors = True
            # remove file data leftovers in case of errors
            # (it may be corrupted, incomplete, etc.)
            self.del_file(constants.DATA_DIR / name)
        except Exception as e:
            app_logger.error("General error: " + str(e))
            self.problem_urls.append(url)
            self.errors = True
            self.del_file(constants.DATA_DIR / name)
    app_logger.info("Fetching data completed in %s seconds" %
                    str(time.time() - start))
    # check if at least some urls have been downloaded without problems
    return not self.errors or (len(self.problem_urls) <
                               len(constants.DOWNLOAD_LINKS))
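# headers.get_random_headers and get_file_name are likewise not shown; one
# plausible shape, assuming the former rotates the User-Agent between
# requests and the latter takes the last path segment of the URL (the
# user-agent strings below are illustrative placeholders, not project data):
from urllib.parse import urlparse

_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
]


def get_random_headers() -> dict:
    return {"User-Agent": random.choice(_USER_AGENTS)}


def get_file_name(url: str) -> str:
    return urlparse(url).path.rsplit("/", 1)[-1]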