Code example #1
File: utils.py Project: varnie/niborstibor
def retrieve_reports(url, market, dst_file, date_val, cols) -> DownloadDetails:
    request_data = dict(market=market, date=date_val)

    response = requests.post(url=url,
                             headers=headers.get_random_headers(),
                             data=request_data)

    if response.status_code == requests.codes.ok:
        text = response.text
        try:
            json_val = json.loads(text)
            results = json_val['results']
        except (ValueError, KeyError) as e:
            app_logger.error(e, exc_info=True)
        else:
            app_logger.info("Downloaded entries for date %s OK" % date_val)
            # save if there's some non-empty data
            if results:
                if save_to_file(dst_csv_file_name=dst_file,
                                csv_cols=cols,
                                data=results):
                    app_logger.info("Saved entries for date %s OK" % date_val)
                    return DownloadDetails(skipped=False, status=True)
            else:
                app_logger.warning("Skipped empty entries for date %s" %
                                   date_val)
                return DownloadDetails(skipped=True, status=True)
    else:
        app_logger.error(
            "Data for %s is not available, request returned %d status" %
            (date_val, response.status_code))

    return DownloadDetails(skipped=False, status=False)
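A minimal call sketch for retrieve_reports; the endpoint URL, market code, file name, and column list below are hypothetical placeholders, not values from the project:

# Hypothetical invocation; url, market, dst_file and cols are
# illustrative assumptions.
details = retrieve_reports(url="https://example.com/api/reports",
                           market="nibor",
                           dst_file="reports_2020-01-02.csv",
                           date_val="2020-01-02",
                           cols=["date", "rate"])
if details.status and not details.skipped:
    print("Report saved")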
Code example #2
File: processor.py Project: varnie/PricePaidData
    @staticmethod
    def del_file(fpath):
        try:
            os.remove(fpath)
        except Exception as e:
            app_logger.warning(
                "Unable to remove file %s, exception occurred: %s" %
                (fpath, str(e)))
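Since del_file swallows removal errors and only logs a warning, it works as best-effort cleanup; a quick sketch (the path is a placeholder):

# Hypothetical best-effort cleanup call; a missing file only logs
# a warning instead of raising.
PriceDataProcessor.del_file("/tmp/partial_download.csv")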
Code example #3
File: processor.py Project: varnie/PricePaidData
    @staticmethod
    def preprocess_data() -> bool:
        app_logger.info("Preparing data started...")

        start = time.time()

        series: List[dd.Series] = []
        for f in os.listdir(constants.DATA_DIR):
            if not f.endswith(".csv"):
                app_logger.warning("non-CSV file found in DATA_DIR: %s" % f)
                continue

            app_logger.info("Processing %s" % f)
            try:
                if len(series) < 2:
                    df = dd.read_csv(constants.DATA_DIR / f, header=None)

                    if len(df.columns) != PriceDataProcessor.REQUIRED_CSV_FORMAT_COLUMNS_COUNT:
                        app_logger.error(
                            "File %s has an unexpected number of columns: "
                            "required %d, found %d" %
                            (f,
                             PriceDataProcessor.REQUIRED_CSV_FORMAT_COLUMNS_COUNT,
                             len(df.columns)))
                        continue

                    # we are interested in the 4th column's values
                    fourth_col: dd.Series = df.iloc[:, 3]
                    unique_vals_series = fourth_col.drop_duplicates()
                    series.append(unique_vals_series)

                if len(series) == 2:
                    # merge two Series into one and remove duplicates
                    s = dd.concat(series).drop_duplicates()

                    # keep the result Series in the first list's element
                    del series[-1]
                    series[0] = s

            except Exception as e:
                app_logger.error("Processing file %f had errors: " + str(e))

            app_logger.info("Processing %s done" % f)

        if series:
            s: dd.Series = series[0]
            s.to_csv(constants.PROCESSED_DATA_DIR / "single.csv",
                     single_file=True,
                     index=False,
                     header=False)
        else:
            app_logger.error(
                "Prepare data: could not generate the result CSV file")

        app_logger.info("Preparing data completed in %s seconds" %
                        str(time.time() - start))
        return bool(series)
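The loop above never holds more than two partial dask Series at a time: each new file's unique values are folded into the accumulated Series via concat plus drop_duplicates. A standalone sketch of that folding step with toy data:

import dask.dataframe as dd
import pandas as pd

# Two toy Series standing in for per-file unique-value Series.
accumulated = dd.from_pandas(pd.Series(["AB1 2CD", "EF3 4GH"]), npartitions=1)
fresh = dd.from_pandas(pd.Series(["EF3 4GH", "IJ5 6KL"]), npartitions=1)

# Same merge-and-dedupe step as in preprocess_data.
merged = dd.concat([accumulated, fresh]).drop_duplicates()
print(merged.compute().tolist())
# e.g. ['AB1 2CD', 'EF3 4GH', 'IJ5 6KL'] (order not guaranteed)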
Code example #4
File: app.py Project: varnie/PricePaidData
def init_handler():
    processor = PriceDataProcessor()
    if processor.fetch_data():
        if processor.get_errors():
            problem_urls = processor.get_problem_urls()
            app_logger.warning("The following urls were not downloaded: %s" %
                               ", ".join(problem_urls))

        processor.preprocess_data()
    else:
        app_logger.error("All downloads failed")
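A hypothetical entry point, assuming app.py is run directly:

if __name__ == "__main__":
    init_handler()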
Code example #5
    def run(self):
        postcodes_file_path = PROCESSED_DATA_DIR / "single.csv"

        if not postcodes_file_path.exists():
            app_logger.error("No 'single.csv' file %s found. Exiting..." % postcodes_file_path)
            return

        # order of column names is important
        columns = ["Address", "Postcode", "Council Tax band", "Local authority reference number"]

        for chunk_df in pd.read_csv(postcodes_file_path, chunksize=100, header=None, usecols=[0]):
            for _, row in chunk_df.iterrows():

                postcode = row[0]
                result_file = RESULTS_DIR / "{}.csv".format(postcode.replace(" ", "_"))
                if result_file.exists():
                    app_logger.warning("Skipping result file %s, already exists" % result_file)
                    continue

                app_logger.info("Scraping %s postcode started" % postcode)
                items = self.query(postcode=postcode)

                result_list = [[result_item.address,
                                postcode,
                                result_item.council_tax_band,
                                result_item.local_auth_ref_number] for result_item in items] if items else []

                result_df = pd.DataFrame(result_list, columns=columns)
                result_df.to_csv(result_file, index=False)
                if items:
                    app_logger.info("Scraping %s postcode completed" % postcode)
                else:
                    app_logger.info("Scraping %s postcode completed, but it discovered no entries" % postcode)

                # sleep a random 5 to 60 seconds between postcodes to throttle requests
                secs = random.randint(5, 60)
                app_logger.info("Sleeping %d seconds" % secs)
                time.sleep(secs)
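Because every postcode writes its own result file (even when the scrape finds nothing) and existing files are skipped, an interrupted run can simply be restarted and will resume where it stopped. Merging the per-postcode files afterwards might look like this sketch (assuming RESULTS_DIR holds only these result CSVs; the output name is a placeholder):

import pandas as pd
from pathlib import Path

RESULTS_DIR = Path("results")  # assumption: the directory the scraper writes to

# Concatenate all per-postcode CSVs into one frame; every file shares
# the same four columns, so a plain concat is enough.
merged = pd.concat((pd.read_csv(p) for p in sorted(RESULTS_DIR.glob("*.csv"))),
                   ignore_index=True)
merged.to_csv("council_tax_all.csv", index=False)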
Code example #6
File: processor.py Project: varnie/PricePaidData
    def fetch_data(self) -> bool:
        self.errors = False
        self.problem_urls = []

        self.clear_dir(constants.DATA_DIR)
        self.clear_dir(constants.PROCESSED_DATA_DIR)

        app_logger.info("Fetching started")

        if not constants.DOWNLOAD_LINKS:
            app_logger.warning("no links to download")
            self.errors = True
            return False

        start = time.time()

        for url in constants.DOWNLOAD_LINKS:
            app_logger.info("downloading %s..." % url)

            name = self.get_file_name(url)

            h = headers.get_random_headers()
            h['Referer'] = self.REFERER

            try:
                resp = requests.get(url=url, stream=True, headers=h)
                if resp.status_code != HTTPStatus.OK:
                    app_logger("Request error: bad response code " +
                               str(resp.status_code))
                    self.problem_urls.append(url)
                    self.errors = True
                    continue

                app_logger.info("saving %s..." % url)

                with open(constants.DATA_DIR / name, 'wb') as f:
                    for chunk in resp.iter_content(chunk_size=self.CHUNK_SIZE):
                        if chunk:
                            f.write(chunk)

                app_logger.info("saved")

            except requests.exceptions.RequestException as e:
                app_logger.error("Request error: " + str(e))
                self.problem_urls.append(url)
                self.errors = True

                # remove file data leftovers in case of errors
                # (it may be corrupted, incomplete, etc)
                self.del_file(constants.DATA_DIR / name)
            except Exception as e:
                app_logger.error("General error: " + str(e))
                self.problem_urls.append(url)
                self.errors = True
                self.del_file(constants.DATA_DIR / name)

        app_logger.info("Fetching data completed in %s seconds" %
                        str(time.time() - start))

        # check that at least one URL was downloaded successfully
        return not self.errors or (
            len(self.problem_urls) < len(constants.DOWNLOAD_LINKS))
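The core of the loop above is a streamed download with cleanup on failure; condensed into a standalone sketch (the URL, file name, and chunk size are placeholders):

import os
import requests

url = "https://example.com/data.csv"  # placeholder download link
try:
    resp = requests.get(url, stream=True, timeout=30)
    resp.raise_for_status()
    with open("data.csv", "wb") as f:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)
except requests.exceptions.RequestException as e:
    print("download failed:", e)
    # remove any partially written file, mirroring fetch_data's cleanup
    if os.path.exists("data.csv"):
        os.remove("data.csv")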