def download(self, area_name, year, month, page=1):
    """Download the result page for the given search criteria and save the raw HTML.

    NOTE(review): the original docstring said the data is saved as CSV, but this
    method writes the raw HTML response bytes; CSV conversion presumably happens
    in a later parsing step — confirm against the rest of the pipeline.

    Args:
        area_name: Key identifying the target area (used for URL / path lookup).
        year: Target year of the report to request.
        month: Target month of the report to request.
        page: Result page number, 1-based (default 1).
    """
    download_url = Config.get_url(area_name)
    download_file = Config.get_download_file(area_name, year, month, page)
    save_path = Config.get_download_path(download_file)

    # Skip pages that were already fetched on a previous run.
    if os.path.exists(save_path):
        _logger.info(
            "skip download for file exist {}".format(download_file))
        return

    form_data = self.get_form_data(year, month, page)
    req = urllib.request.Request(download_url, form_data)
    try:
        # Use a context manager so the HTTP response/socket is always closed
        # (the original leaked the connection by calling .read() on the bare
        # urlopen() result).
        with urllib.request.urlopen(req) as response:
            html_data = response.read()
    except IncompleteRead as e:
        # Keep whatever partial body arrived; better than losing the page.
        html_data = e.partial

    # Throttle between requests to avoid hammering the server.
    time.sleep(self.crawl_interval)

    self.check_html_no_data(html_data)
    # check_html_no_data() sets self.page_found; only persist real result pages.
    if self.page_found:
        with open(save_path, mode="wb") as f:
            f.write(html_data)
        _logger.info("save {}".format(download_file))
def test_get_path():
    """Smoke-test every Config path/URL helper: each must return a truthy value."""
    datastore_path = Config.get_datastore_path("choka.csv")
    assert datastore_path

    download_path = Config.get_download_path("choka_daikoku_2021_04_001.html")
    assert download_path

    resource_path = Config.test_resource("daikoku1.html")
    assert resource_path

    area_url = Config.get_url("daikoku")
    assert area_url

    download_file = Config.get_download_file("daikoku", 2021, 4)
    assert download_file

    db_path = Config.get_db_path()
    assert db_path

    config_path = Config.get_config_path("config.toml")
    assert config_path
def test_get_url():
    """The 'daikoku' area key must resolve to the Yokohama fishing-piers URL."""
    expected = 'http://daikoku.yokohama-fishingpiers.jp/choka.php'
    actual = Config.get_url("daikoku")
    assert actual == expected