Beispiel #1
0
    def download(self, area_name, year, month, page=1):
        """
        Download one search-result page and save the raw response to disk.

        Nothing is requested when the target file already exists. Otherwise
        the form data built by ``get_form_data`` is sent as the request body
        to the URL configured for ``area_name``, and the response bytes are
        written unchanged to the download path (no conversion is done here).

        Args:
            area_name: Key used to look up the URL and to build the file name.
            year: Year of the search condition.
            month: Month of the search condition.
            page: Result page number (defaults to 1).

        Returns:
            None.
        """
        download_url = Config.get_url(area_name)
        download_file = Config.get_download_file(area_name, year, month, page)
        save_path = Config.get_download_path(download_file)
        if os.path.exists(save_path):
            _logger.info(
                "skip download for file exist {}".format(download_file))
            return

        form_data = self.get_form_data(year, month, page)
        req = urllib.request.Request(download_url, form_data)
        try:
            # The context manager closes the HTTP response even when the
            # read fails part-way, instead of leaving it to the GC.
            with urllib.request.urlopen(req) as response:
                html_data = response.read()
        except IncompleteRead as e:
            # Keep whatever bytes arrived before the connection was cut.
            html_data = e.partial
        # Pause for the configured interval before returning to the caller.
        time.sleep(self.crawl_interval)

        # NOTE(review): check_html_no_data presumably updates self.page_found
        # based on the downloaded content -- confirm against its definition.
        self.check_html_no_data(html_data)
        if self.page_found:
            with open(save_path, mode="wb") as f:
                f.write(html_data)
            _logger.info("save {}".format(download_file))
def test_get_path():
    """Smoke test: each Config path/URL helper returns a truthy value."""
    # Lambdas keep every lookup and call lazy, so the helpers still run
    # one at a time, in this order, and stop at the first failure.
    lookups = (
        lambda: Config.get_datastore_path("choka.csv"),
        lambda: Config.get_download_path("choka_daikoku_2021_04_001.html"),
        lambda: Config.test_resource("daikoku1.html"),
        lambda: Config.get_url("daikoku"),
        lambda: Config.get_download_file("daikoku", 2021, 4),
        lambda: Config.get_db_path(),
        lambda: Config.get_config_path("config.toml"),
    )
    for lookup in lookups:
        assert lookup()
Beispiel #3
0
 def reset_download(self):
     """
     Delete every ``.html`` file found directly in the download directory.

     Files with any other extension are left in place.
     """
     target_dir = Config.get_download_path("")
     _logger.info("initialize {}".format(target_dir))
     html_names = [
         entry for entry in os.listdir(target_dir) if entry.endswith(".html")
     ]
     for html_name in html_names:
         os.remove(os.path.join(target_dir, html_name))
Beispiel #4
0
 def run(self):
     """
     Parse the downloaded fishing-result HTML files one by one, collect the
     parsed results, then cleanse the summary and export it.

     Files for which no point can be derived from the file name are skipped.
     """
     for file_name in Config.list_download_dirs():
         fishing_point = Config.get_point_from_html_filename(file_name)
         if fishing_point:
             _logger.info("read {}".format(file_name))
             source_path = Config.get_download_path(file_name)
             parsed = Parser(fishing_point).parse_html(source_path)
             self.append(parsed)
     self.cleansing_fishing_summary()
     self.export()