Code Example #1
    def write_parsed(self, name: str, cache: DirectoryCache):
        " write the parsed dataframe to the cache, or remove the cached copy if nothing was parsed "
        key = f"{name}_parsed.txt"
        if self.df is None:
            cache.remove(key)
        else:
            content = dataframe_to_text(self.df)
            cache.write(key, content)
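
dataframe_to_text appears in all of these write methods but is not part of the excerpt. A minimal sketch, assuming it simply renders a pandas DataFrame into a deterministic text form so that cache comparisons stay stable (the project's actual formatting may differ):

    import pandas as pd

    def dataframe_to_text(df: pd.DataFrame) -> str:
        # render the frame deterministically: identical data must yield
        # identical text, otherwise every run would register as a change
        return df.to_string(index=False) + "\n"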
Code Example #2
    def write(self, name: str, cache: DirectoryCache, change_list: ChangeList):
        " write the data and source content to the cache, recording whether anything changed "
        key = f"{name}_data.txt"
        new_content = dataframe_to_text(self.df)

        old_content = cache.read(key)
        if old_content != new_content:
            cache.write(key, new_content)

            key = f"{name}_source.{self.content_type}"
            cache.write(key, self.content)
            change_list.record_changed(name, "source", self.endpoint)
        else:
            change_list.record_unchanged(name, "source", self.endpoint)
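
Both methods depend on a DirectoryCache that maps keys to files under a working directory. Its implementation is not shown here; the sketch below is an assumption based only on the calls used in these examples (read, write, remove, exists, list_html_files), with read returning None for a missing key so that the first write of a key always registers as a change:

    import os

    class DirectoryCache:
        " a hypothetical file-per-key cache; the project's real class may differ "

        def __init__(self, work_dir: str):
            self.work_dir = work_dir
            os.makedirs(work_dir, exist_ok=True)

        def _path(self, key: str) -> str:
            return os.path.join(self.work_dir, key)

        def exists(self, key: str) -> bool:
            return os.path.exists(self._path(key))

        def read(self, key: str):
            # a missing key reads as None, so "never cached" is treated
            # the same way as "content changed"
            if not self.exists(key): return None
            with open(self._path(key), encoding="utf-8") as f:
                return f.read()

        def write(self, key: str, content):
            # accept text or bytes; the real cache presumably normalizes
            # the two consistently for its comparisons
            mode = "wb" if isinstance(content, bytes) else "w"
            with open(self._path(key), mode) as f:
                f.write(content)

        def remove(self, key: str):
            if self.exists(key):
                os.remove(self._path(key))

        def list_html_files(self):
            return sorted(n for n in os.listdir(self.work_dir) if n.endswith(".html"))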
Code Example #3
class DataPipeline:
    def __init__(self, config: DataPipelineConfig):

        self.config = config

        self.change_list: ChangeList = None

        base_dir = config.base_dir

        self.cache_sources = DirectoryCache(os.path.join(base_dir, "sources"))

        self.cache_raw = DirectoryCache(os.path.join(base_dir, "raw"))
        self.cache_clean = DirectoryCache(os.path.join(base_dir, "clean"))
        self.cache_extract = DirectoryCache(os.path.join(base_dir, "extract"))
        self.cache_diff = DirectoryCache(os.path.join(base_dir, "diff"))

        self.url_manager = UrlManager(config.headless, config.browser)

        self.sources: UrlSources = None

        self._capture: SpecializedCapture = None

    def get_capture(self) -> SpecializedCapture:
        if self._capture is None:
            publish_dir = os.path.join(self.config.base_dir, 'captive-browser')
            driver = self.url_manager._captive.driver if self.url_manager._captive else None
            self._capture = SpecializedCapture(self.config.temp_dir,
                                               publish_dir, driver)
        return self._capture

    def shutdown_capture(self):
        if self._capture is not None:
            self._capture.close()
            if self.config.auto_push:
                self._capture.publish()
        self._capture = None

    def update_sources(self):
        " update the remote url sources "
        manager = UrlSourceManager(self.cache_sources)
        self.sources = manager.update_sources("scan")

    def process(self) -> Dict[str, str]:
        " run the pipeline "

        self.url_manager.reset()
        self.change_list = ChangeList(self.cache_raw)

        host = get_host()
        print(
            f"=== run started on {host} at {udatetime.to_logformat(self.change_list.start_date)}"
        )

        self.change_list.start_run()
        try:
            if self.sources is None:
                raise Exception("Sources not provided")
            src = self.sources.items[0]
            if src.name != "google-states-csv":
                raise Exception(
                    f"Expected first source to be google-states-csv, not {src.name}"
                )
            return self._main_loop(src, self.change_list)
        except Exception as ex:
            logger.exception(ex)
            self.change_list.abort_run(ex)
        finally:
            self.change_list.finish_run()

            self.shutdown_capture()

            logger.info(
                f"  in-memory content cache used {self.url_manager.size*1e-6:.1f} MB"
            )
            logger.info(
                f"=== run finished on {host} (started at {udatetime.to_logformat(self.change_list.start_date)})"
            )

    def format_html(self, rerun=False):
        " format raw html "
        is_first = True
        for key in self.cache_raw.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            if rerun or not self.cache_raw.exists(key):
                if is_first:
                    logger.info(f"format existing files...")
                    is_first = False
                logger.info(f"  format {key}")
                local_raw_content = self.cache_raw.read(key)
                formater = HtmlFormater()
                local_clean_content = formater.format(None, local_raw_content)
                self.cache_raw.write(key, local_clean_content)

    def clean_html(self, rerun=False):
        " generate clean files from existing raw html "
        is_first = True
        for key in self.cache_raw.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            if rerun or not self.cache_clean.exists(key):
                if is_first:
                    logger.info(f"clean existing files...")
                    is_first = False
                logger.info(f"  clean {key}")
                local_raw_content = self.cache_raw.read(key)
                cleaner = HtmlCleaner()
                local_clean_content = cleaner.clean(local_raw_content)
                self.cache_clean.write(key, local_clean_content)

    def extract_html(self, rerun=False):
        " generate extract files from existing clean html "

        self.change_list = ChangeList(self.cache_raw)
        self.change_list.load()

        is_first = True
        for key in self.cache_clean.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            if rerun or not self.cache_extract.exists(key):
                if is_first:
                    logger.info(f"extract existing files...")
                    is_first = False
                logger.info(f"  extract {key}")
                local_clean_content = self.cache_clean.read(key)

                item = self.change_list.get_item(key)
                if item is None:
                    logger.warning("   skip because it is a new item")
                    continue

                extracter = HtmlExtracter()
                local_extract_content = extracter.extract(
                    local_clean_content, item)
                self.cache_extract.write(key, local_extract_content)

    def _main_loop(self, source: UrlSource,
                   change_list: ChangeList) -> Dict[str, str]:
        def remove_duplicate_if_exists(location: str, source: str,
                                       other_state: str):
            key = location + ".html"

            self.cache_raw.remove(key)
            self.cache_clean.remove(key)
            change_list.record_duplicate(key, source,
                                         f"duplicate of {other_state}")

            if self.config.capture_image:
                c = self.get_capture()
                c.remove(location)

        def fetch_if_changed(location: str,
                             source: str,
                             xurl: str,
                             skip: bool = False) -> bool:

            key = location + ".html"

            if xurl == "" or xurl == None or xurl == "None":
                change_list.record_skip(key, source, xurl, "missing url")
                return

            mins = change_list.get_minutes_since_last_check(key)
            if self.config.trace:
                logger.info(f"  checked {key} {mins:.1f} minutes ago")
            if mins < 15.0:
                if self.config.rerun_now:
                    logger.info(f"{key}: checked {mins:.1f} mins ago")
                else:
                    logger.info(
                        f"{key}: checked {mins:.1f} mins ago -> skip b/c < 15 mins"
                    )
                    change_list.temporary_skip(key, source, xurl,
                                               "age < 15 mins")
                    return False

            if skip:
                change_list.record_skip(key, source, xurl, "skip flag set")
                return False

            if self.config.trace: logger.info(f"fetch {xurl}")
            remote_raw_content, status = self.url_manager.fetch(xurl)

            is_bad, msg = is_bad_content(remote_raw_content)
            if is_bad:
                change_list.record_failed(key, source, xurl, msg)
                return False

            if status > 300:
                change_list.record_failed(key, source, xurl,
                                          f"HTTP status {status}")
                return False

            remote_raw_content = remote_raw_content.replace(b"\r", b"")

            formater = HtmlFormater()
            remote_raw_content = formater.format(xurl, remote_raw_content)

            local_clean_content = self.cache_clean.read(key)
            cleaner = HtmlCleaner()
            remote_clean_content = cleaner.clean(remote_raw_content)

            if local_clean_content != remote_clean_content:

                self.cache_raw.write(key, remote_raw_content)
                self.cache_clean.write(key, remote_clean_content)
                change_list.record_changed(key, source, xurl)

                item = change_list.get_item(key)

                extracter = HtmlExtracter()
                remote_extract_content = extracter.extract(
                    remote_clean_content, item)
                self.cache_extract.write(key, remote_extract_content)

                if self.config.capture_image:
                    c = self.get_capture()
                    c.screenshot(key, f"Screenshot for {location}", xurl)

                return True
            else:
                change_list.record_unchanged(key, source, xurl)
                return False

        # -- get urls to hit
        if source.status != "valid":
            raise Exception(f"URL source {source.name} status is not valid")

        df_config = source.df
        if df_config is None:
            raise Exception(
                f"URL source {source.name} does not have any data loaded")

        # -- fetch pages
        skip = False
        err_cnt = 0

        for idx, r in df_config.iterrows():
            location = r["location"]
            source = r["source_name"]
            general_url = r["main_page"]
            data_url = r["data_page"]

            if general_url is None and data_url is None:
                logger.warning(f"  no urls for {location} -> skip")
                change_list.record_skip(location)
                continue

            if idx % 10 == 1: change_list.save_progress()

            if general_url is not None:
                try:
                    fetch_if_changed(location, source, general_url, skip=skip)
                except Exception as ex:
                    err_cnt += 1
                    if err_cnt > 10: break
                    change_list.record_failed(location, source, general_url,
                                              "Exception in code")
                    logger.exception(ex)
                    logger.error("    error -> continue to next page")

            if data_url is not None:
                if general_url == data_url:
                    remove_duplicate_if_exists(location + "_data", source,
                                               location)
                else:
                    try:
                        fetch_if_changed(location + "_data",
                                         source,
                                         data_url,
                                         skip=skip)
                    except Exception as ex:
                        err_cnt += 1
                        if err_cnt > 10: break
                        change_list.record_failed(location, source,
                                                  data_url,
                                                  "Exception in code")
                        logger.exception(ex)
                        logger.error("    error -> continue to next page")

        if err_cnt > 10:
            logger.error(f"  abort run due to {err_cnt} errors")

        change_list.write_html_to_cache(self.cache_raw, "RAW")
        change_list.write_html_to_cache(self.cache_clean, "CLEAN")
        change_list.write_html_to_cache(self.cache_extract, "EXTRACT")
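
For orientation, a hypothetical driver for the class above might look like the sketch below. The DataPipelineConfig fields are only inferred from the attribute accesses in this excerpt (base_dir, temp_dir, headless, browser, trace, rerun_now, auto_push, capture_image), so treat this as an illustration rather than the project's actual entry point:

    config = DataPipelineConfig(
        base_dir="data",        # hypothetical values; the real config
        temp_dir="data/tmp",    # is constructed elsewhere in the project
        headless=True,
        browser="chrome",
        trace=False,
        rerun_now=False,
        auto_push=False,
        capture_image=False,
    )

    pipeline = DataPipeline(config)
    pipeline.update_sources()   # refresh the list of URLs to scan
    pipeline.process()          # fetch, clean, and extract changed pages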
Code Example #4
    def write(self, cache: DirectoryCache, name: str):
        " write the status dataframe to the cache, if one was produced "
        if self.df_status is not None:
            content = dataframe_to_text(self.df_status)
            cache.write(name, content)