import os
import time
import urllib.request
from http.client import IncompleteRead

# Config and _logger are module-level helpers defined elsewhere in the package.


def download(self, area_name, year, month, page=1):
    """Downloads the page for the given search conditions and saves the HTML."""
    download_url = Config.get_url(area_name)
    download_file = Config.get_download_file(area_name, year, month, page)
    save_path = Config.get_download_path(download_file)
    if os.path.exists(save_path):
        _logger.info("skip download; file exists: {}".format(download_file))
        return
    form_data = self.get_form_data(year, month, page)
    req = urllib.request.Request(download_url, form_data)
    try:
        html_data = urllib.request.urlopen(req).read()
    except IncompleteRead as e:
        # Keep whatever was received before the connection dropped
        html_data = e.partial
    # Throttle requests so we do not hammer the server
    time.sleep(self.crawl_interval)
    self.check_html_no_data(html_data)
    if self.page_found:
        with open(save_path, mode="wb") as f:
            f.write(html_data)
        _logger.info("save {}".format(download_file))
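# A minimal sketch of what get_form_data could look like: urlopen requires a
# bytes POST body, so the parameters are urlencoded and then encoded. The
# field names ("year", "month", "page") are assumptions for illustration,
# not taken from the source.
import urllib.parse


def get_form_data(self, year, month, page):
    params = {"year": year, "month": month, "page": page}
    return urllib.parse.urlencode(params).encode("utf-8")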
def test_get_path():
    """Smoke-tests that every Config path helper returns a non-empty value."""
    assert Config.get_datastore_path("choka.csv")
    assert Config.get_download_path("choka_daikoku_2021_04_001.html")
    assert Config.test_resource("daikoku1.html")
    assert Config.get_url("daikoku")
    assert Config.get_download_file("daikoku", 2021, 4)
    assert Config.get_db_path()
    assert Config.get_config_path("config.toml")
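# One possible shape for these helpers, assuming downloads live in a "data"
# directory next to this module; the directory layout is an assumption.
# Note that get_download_path("") then resolves to the download directory
# itself, which is how reset_download below obtains it.
import os


class Config:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    @classmethod
    def get_download_path(cls, filename):
        return os.path.join(cls.BASE_DIR, "data", filename)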
def reset_download(self):
    """Deletes the downloaded HTML files from the download directory."""
    download_dir = Config.get_download_path("")
    _logger.info("initialize {}".format(download_dir))
    for download_file in os.listdir(download_dir):
        if download_file.endswith(".html"):
            os.remove(os.path.join(download_dir, download_file))
def run(self):
    """Reads the downloaded catch-report HTML files under the data
    directory one by one, extracts the catch information, and saves
    it in CSV format.
    """
    html_files = Config.list_download_dirs()
    for html_file in html_files:
        point = Config.get_point_from_html_filename(html_file)
        if not point:
            continue
        _logger.info("read {}".format(html_file))
        html_path = Config.get_download_path(html_file)
        parser = Parser(point).parse_html(html_path)
        self.append(parser)
    self.cleansing_fishing_summary()
    self.export()
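# A hedged end-to-end usage sketch. The class names Downloader and CsvBuilder
# are hypothetical stand-ins for whichever classes own download/reset_download
# and run; only the method calls themselves come from the code above.
downloader = Downloader()
downloader.reset_download()
page = 1
while True:
    downloader.download("daikoku", 2021, 4, page)
    if not downloader.page_found:
        break  # check_html_no_data detected an empty result page
    page += 1
CsvBuilder().run()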