    def write_parsed(self, name: str, cache: DirectoryCache):

        key = f"{name}_parsed.txt"
        if self.df is None:
            cache.remove(key)
        else:
            content = dataframe_to_text(self.df)
            cache.write(key, content)
    def read(self, name: str, cache: DirectoryCache):

        key = f"{name}_data.txt"
        content = cache.read(key)
        if content is not None:
            self.df = dataframe_from_text(content)

        key = f"{name}_source.{self.content_type}"
        self.content = cache.read(key)
    def write(self, name: str, cache: DirectoryCache, change_list: ChangeList):

        key = f"{name}_data.txt"
        new_content = dataframe_to_text(self.df)

        old_content = cache.read(key)
        if old_content != new_content:
            cache.write(key, new_content)

            key = f"{name}_source.{self.content_type}"
            cache.write(key, self.content)
            change_list.record_changed(name, "source", self.endpoint)
        else:
            change_list.record_unchanged(name, "source", self.endpoint)
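# --- usage sketch (illustrative, not part of the original source) ---
# A minimal round-trip through DirectoryCache, assuming dataframe_to_text and
# dataframe_from_text are symmetric; the cache key below is hypothetical.
def _roundtrip_example(df, cache: DirectoryCache):
    cache.write("AZ_data.txt", dataframe_to_text(df))       # mirrors write() above
    content = cache.read("AZ_data.txt")
    return dataframe_from_text(content) if content is not None else None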
class PageCompare():
    def __init__(self, work_dir_a: str, work_dir_b: str, out_dir: str):
        self.cache_a = DirectoryCache(work_dir_a)
        self.cache_b = DirectoryCache(work_dir_b)
        self.out_dir = out_dir

    def process_all(self):

        ignore_list = ["main_sheet", "WV"]

        if not os.path.exists(self.out_dir): os.makedirs(self.out_dir)

        for fn in self.cache_a.list_html_files():
            x = fn.replace(".html", "")
            if x in ignore_list: continue

            content_a = self.cache_a.read(fn)
            content_b = self.cache_b.read(fn)
            if content_a == content_b:
                logger.info(f"=================| {fn}")
                logger.info("   data is SAME")
                continue

            fn_a = os.path.join(self.out_dir, f"{x}_A.html")
            fn_b = os.path.join(self.out_dir, f"{x}_B.html")
            if os.path.exists(fn_a): os.remove(fn_a)
            if os.path.exists(fn_b): os.remove(fn_b)

            if content_a == content_b:
                logger.info(f"=================| {fn}")
                logger.info("   data is FIXED")
                continue

            doc_a = html.fromstring(content_a)
            doc_b = html.fromstring(content_b)

            remove_identical_nodes(doc_a, doc_b)
            str_a = html.tostring(doc_a, pretty_print=True)
            str_b = html.tostring(doc_b, pretty_print=True)

            logger.info(f"=================| {fn}")
            logger.info("   data is different")

            with open(fn_a, "wb") as f:
                f.write(str_a)
            with open(fn_b, "wb") as f:
                f.write(str_b)
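# --- usage sketch (illustrative, not part of the original source) ---
# Assumes work_dir_a and work_dir_b hold the "<name>.html" snapshots from two runs;
# the paths are hypothetical. Pages that differ are written to out_dir as *_A/*_B pairs.
def _compare_example():
    compare = PageCompare("runs/snapshot_a", "runs/snapshot_b", "runs/diff")
    compare.process_all()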
    def read(self, cache: DirectoryCache, name: str):
        if len(self.names) == 0: raise Exception("No sources")

        content = cache.read(name)
        if content is None: return

        df = dataframe_from_text(content)
        df.index = df["names"]
        missing = ~df.index.isin(self.names)
        if missing.any():
            df.loc[missing, "status"] = "removed"
        df.reset_index(inplace=True, drop=True)
        self.df_status = df
class DataPipeline():
    def __init__(self, config: DataPipelineConfig):

        self.config = config

        self.change_list: ChangeList = None

        base_dir = config.base_dir

        self.cache_sources = DirectoryCache(os.path.join(base_dir, "sources"))

        self.cache_raw = DirectoryCache(os.path.join(base_dir, "raw"))
        self.cache_clean = DirectoryCache(os.path.join(base_dir, "clean"))
        self.cache_extract = DirectoryCache(os.path.join(base_dir, "extract"))
        self.cache_diff = DirectoryCache(os.path.join(base_dir, "diff"))

        self.url_manager = UrlManager(config.headless, config.browser)

        self.sources: UrlSources = None

        self._capture: SpecializedCapture = None

    def get_capture(self) -> SpecializedCapture:
        if self._capture is None:
            publish_dir = os.path.join(self.config.base_dir, 'captive-browser')
            driver = self.url_manager._captive.driver if self.url_manager._captive else None
            self._capture = SpecializedCapture(self.config.temp_dir,
                                               publish_dir, driver)
        return self._capture

    def shutdown_capture(self):
        if self._capture is not None:
            self._capture.close()
            if self.config.auto_push:
                self._capture.publish()
        self._capture = None

    def update_sources(self):
        " update the remote url sources "
        manager = UrlSourceManager(self.cache_sources)
        self.sources = manager.update_sources("scan")

    def process(self) -> Dict[str, str]:
        " run the pipeline "

        self.url_manager.reset()
        self.change_list = ChangeList(self.cache_raw)

        host = get_host()
        print(
            f"=== run started on {host} at {udatetime.to_logformat(self.change_list.start_date)}"
        )

        self.change_list.start_run()
        try:
            if self.sources is None:
                raise Exception("Sources not provided")
            src = self.sources.items[0]
            if src.name != "google-states-csv":
                raise Exception(
                    f"Expected first source to be google-states-csv, not {src.name}"
                )
            return self._main_loop(src, self.change_list)
        except Exception as ex:
            logger.exception(ex)
            self.change_list.abort_run(ex)
        finally:
            self.change_list.finish_run()

            self.shutdown_capture()

            logger.info(
                f"  [in-memory content cache took {self.url_manager.size*1e-6:.1f} MBs"
            )
            logger.info(
                f"run finished on {host} at {udatetime.to_logformat(self.change_list.start_date)}"
            )

    def format_html(self, rerun=False):
        " format raw html "
        is_first = True
        for key in self.cache_raw.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            if rerun or not self.cache_raw.exists(key):
                if is_first:
                    logger.info(f"format existing files...")
                    is_first = False
                logger.info(f"  format {key}")
                local_raw_content = self.cache_raw.read(key)
                formater = HtmlFormater()
                local_clean_content = formater.format(None, local_raw_content)
                self.cache_raw.write(key, local_clean_content)

    def clean_html(self, rerun=False):
        " generate clean files from existing raw html "
        is_first = True
        for key in self.cache_raw.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            if rerun or not self.cache_clean.exists(key):
                if is_first:
                    logger.info(f"clean existing files...")
                    is_first = False
                logger.info(f"  clean {key}")
                local_raw_content = self.cache_raw.read(key)
                cleaner = HtmlCleaner()
                local_clean_content = cleaner.clean(local_raw_content)
                self.cache_clean.write(key, local_clean_content)

    def extract_html(self, rerun=False):
        " generate extract files from existing clean html "

        self.change_list = ChangeList(self.cache_raw)
        self.change_list.load()

        is_first = True
        for key in self.cache_clean.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            if rerun or not self.cache_extract.exists(key):
                if is_first:
                    logger.info(f"extract existing files...")
                    is_first = False
                logger.info(f"  extract {key}")
                local_clean_content = self.cache_clean.read(key)

                item = self.change_list.get_item(key)
                if item is None:
                    logger.warning("   skip because it is a new item")
                    continue

                extracter = HtmlExtracter()
                local_extract_content = extracter.extract(
                    local_clean_content, item)
                self.cache_extract.write(key, local_extract_content)

    def _main_loop(self, source: UrlSource,
                   change_list: ChangeList) -> Dict[str, str]:
        def remove_duplicate_if_exists(location: str, source: str,
                                       other_state: str):
            key = location + ".html"

            self.cache_raw.remove(key)
            self.cache_clean.remove(key)
            change_list.record_duplicate(key, source,
                                         f"duplicate of {other_state}")

            if self.config.capture_image:
                c = self.get_capture()
                c.remove(location)

        def fetch_if_changed(location: str,
                             source: str,
                             xurl: str,
                             skip: bool = False) -> bool:

            key = location + ".html"

            if xurl == "" or xurl == None or xurl == "None":
                change_list.record_skip(key, source, xurl, "missing url")
                return

            mins = change_list.get_minutes_since_last_check(key)
            if self.config.trace:
                logger.info(f"  checked {key} {mins:.1f} minutes ago")
            if mins < 15.0:
                if self.config.rerun_now:
                    logger.info(f"{key}: checked {mins:.1f} mins ago")
                else:
                    logger.info(
                        f"{key}: checked {mins:.1f} mins ago -> skip b/c < 15 mins"
                    )
                    change_list.temporary_skip(key, source, xurl,
                                               "age < 15 mins")
                    return False

            if skip:
                change_list.record_skip(key, source, xurl, "skip flag set")
                return False

            if self.config.trace: logger.info(f"fetch {xurl}")
            remote_raw_content, status = self.url_manager.fetch(xurl)

            is_bad, msg = is_bad_content(remote_raw_content)
            if is_bad:
                change_list.record_failed(key, source, xurl, msg)
                return False

            if status > 300:
                change_list.record_failed(location, source, xurl,
                                          f"HTTP status {status}")
                return False

            remote_raw_content = remote_raw_content.replace(b"\r", b"")

            formater = HtmlFormater()
            remote_raw_content = formater.format(xurl, remote_raw_content)

            local_clean_content = self.cache_clean.read(key)
            cleaner = HtmlCleaner()
            remote_clean_content = cleaner.clean(remote_raw_content)

            if local_clean_content != remote_clean_content:

                self.cache_raw.write(key, remote_raw_content)
                self.cache_clean.write(key, remote_clean_content)
                change_list.record_changed(key, source, xurl)

                item = change_list.get_item(key)

                formatter = HtmlFormater()
                remote_raw_content = formatter.format(xurl, remote_raw_content)

                extracter = HtmlExtracter()
                remote_extract_content = extracter.extract(
                    remote_clean_content, item)
                self.cache_extract.write(key, remote_extract_content)

                if self.config.capture_image:
                    c = self.get_capture()
                    c.screenshot(key, f"Screenshot for {location}", xurl)
                return True
            else:
                change_list.record_unchanged(key, source, xurl)
                return False

        # -- get urls to hit
        if source.status != "valid":
            raise Exception(f"URL source {source.name} status is not valid")

        df_config = source.df
        if df_config is None:
            raise Exception(
                f"URL source {source.name} does not have any data loaded")

        # -- fetch pages
        skip = False
        err_cnt = 0

        for idx, r in df_config.iterrows():
            location = r["location"]
            source = r["source_name"]
            general_url = r["main_page"]
            data_url = r["data_page"]

            if general_url is None and data_url is None:
                logger.warning(f"  no urls for {location} -> skip")
                change_list.record_skip(location)
                continue

            if idx % 10 == 1: change_list.save_progress()

            if general_url is not None:
                try:
                    fetch_if_changed(location, source, general_url, skip=skip)
                except Exception as ex:
                    err_cnt += 1
                    if err_cnt > 10: break
                    change_list.record_failed(location, source, general_url,
                                              "Exception in code")
                    logger.exception(ex)
                    logger.error("    error -> continue to next page")

            if data_url is not None:
                if general_url == data_url:
                    remove_duplicate_if_exists(location + "_data", source,
                                               location)
                else:
                    try:
                        fetch_if_changed(location + "_data",
                                         source,
                                         data_url,
                                         skip=skip)
                    except Exception as ex:
                        err_cnt += 1
                        if err_cnt > 10: break
                        change_list.record_failed(location, source, data_url,
                                                  "Exception in code")
                        logger.exception(ex)
                        logger.error("    error -> continue to next page")

        if err_cnt > 10:
            logger.error(f"  abort run due to {err_cnt} errors")

        change_list.write_html_to_cache(self.cache_raw, "RAW")
        change_list.write_html_to_cache(self.cache_clean, "CLEAN")
        change_list.write_html_to_cache(self.cache_extract, "EXTRACT")
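# --- usage sketch (illustrative, not part of the original source) ---
# Assumes a fully populated DataPipelineConfig (its constructor is not shown in this
# excerpt). update_sources() must run first because process() reads self.sources.
def _pipeline_example(config: DataPipelineConfig):
    pipeline = DataPipeline(config)
    pipeline.update_sources()          # populate pipeline.sources from the remote list
    return pipeline.process()          # fetch/clean/extract and record the change list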
    def write(self, cache: DirectoryCache, name: str):
        if self.df_status is not None:
            content = dataframe_to_text(self.df_status)
            cache.write(name, content)
class PageParser():
    def __init__(self, work_dir: str):
        self.cache = DirectoryCache(work_dir)

    def strip_attributes(self, x: html.Element):
        x.attrib.clear()
        for y in x:
            self.strip_attributes(y)

    def extract_tables(self, tables: List) -> List[ContentTable]:
        result = []

        for t in tables:
            self.strip_attributes(t)
            x = ContentTable(t)
            result.append(x)
        return result

    def write_as_text(self, foutput, name: str, tables: List[ContentTable]):
        foutput.write(f"{name}\n")
        for i in range(len(tables)):
            foutput.write(f"{name}\tTable {i+1}\n")
            t = tables[i]
            for r in t.rows:
                foutput.write(f"{name}\tTable {i+1}")
                for c in r:
                    foutput.write("\t")
                    if c is not None:
                        c2 = unidecode(c)
                        foutput.write(c2)
                foutput.write(f"\n")
        foutput.write(f"\n")

    def write_as_html(self, foutput, name: str, url: str,
                      tables: List[ContentTable], html_doc: html.Element):

        s = html.Element("div")
        h = html.Element("h1")
        h.text = name
        s.append(h)

        m = html.Element("div")
        m.text = self.cache.read_date_time_str(name + ".html")
        s.append(m)

        for t in tables:
            s.append(t.new_element)

        x = html.Element("br")
        s.append(x)
        a = html.Element("a")
        a.attrib["href"] = url
        a.text = url
        s.append(a)

        h = html.Element("html")
        h.append(html.Element("body"))
        h[0].append(deepcopy(s))
        foutput.write(html.tostring(h, pretty_print=True))

        html_doc.append(s)
        html_doc.append(html.Element("hr"))

    def write_miss_to_html(self, name: str, url: str, msg: str,
                           html_doc: html.Element):

        s = html.Element("div")
        h = html.Element("h1")
        h.text = name
        s.append(h)

        m = html.Element("div")
        m.text = self.cache.read_date_time_str(name + ".html")
        s.append(m)

        m = html.Element("span")
        m.text = msg
        s.append(m)

        x = html.Element("br")
        s.append(x)
        a = html.Element("a")
        a.attrib["href"] = url
        a.text = url
        s.append(a)

        html_doc.append(s)
        html_doc.append(html.Element("hr"))

    def process_all(self):

        cl = ChangeList(self.cache)
        url_dict = cl.read_urls_as_dict()

        ignore_list = ["main_sheet", "WV"]
        missing_list = [
            "WI", "VA", "UT", "TN", "SC", "RI", "PA", "NJ", "NE", "ND",
            "ND_data", "NC", "MS", "MO", "MN", "KY", "KS", "IA", "HI", "GA",
            "DC_data", "AZ", "AL"
        ]
        table_list = [
            "WI_data", "WA", "VT", "TX", "SD", "SC_data", "OR", "OK", "OH",
            "NY", "NV", "NM", "NJ_data", "NH", "NC_data", "MT", "MN_data",
            "MI", "ME", "MD", "MA_data", "MA", "CO", "LA", "IN", "IL", "ID",
            "FL", "DE", "DC", "CT", "CO_data", "CA", "AR", "AK_data", "AK"
        ]

        out_file = os.path.join(self.cache.work_dir, "results.txt")

        old_date = self.cache.read_old_date()

        foutput = open(out_file, "w")
        foutput.write(f"Data Scanned at\t{old_date}\n")
        foutput.write(f"STATE RESULTS\n\n")

        html_out_dir = os.path.join(self.cache.work_dir, "tables")
        if not os.path.exists(html_out_dir): os.makedirs(html_out_dir)

        html_out_file = os.path.join(self.cache.work_dir, "tables",
                                     f"combined.html")
        html_doc = html.Element("html")
        html_doc.append(html.Element("body"))
        html_doc[0].append(html.Element("span"))
        html_doc[0][0].text = f"data scraped at {old_date}"
        html_doc[0].append(html.Element("hr"))

        for fn in self.cache.list_html_files():
            x = fn.replace(".html", "")
            if x in ignore_list: continue

            logger.info(f"=================| {fn}")
            content = self.cache.read(fn)

            tree = html.fromstring(content)
            tables = tree.xpath('//table')
            if len(tables) > 0:
                if x in missing_list:
                    foutput.write(f"{x}\t*** Found unexpected tables\n\n")
                    logger.warning(f"  found {len(tables)} unexpected tables")
                xlist = self.extract_tables(tables)

                xlist2 = [x for x in xlist if x.contains_data()]
                if len(xlist2) == 0:
                    foutput.write(f"{x}\tNo data tables\n\n")
                    self.write_miss_to_html(x, url_dict[x], "No data tables",
                                            html_doc)
                    continue

                #xlist = self.remove_nondata_tables(xlist)

                self.write_as_text(foutput, x, xlist2)

                html_out_file = os.path.join(self.cache.work_dir, "tables",
                                             f"{x}.html")
                with open(html_out_file, "wb") as foutput2:
                    self.write_as_html(foutput2, x, url_dict[x], xlist2,
                                       html_doc)

            else:
                if x in table_list:
                    foutput.write(f"{x}\t*** Missing expected tables\n\n")
                    logger.warning(f"  missing tables")
                else:
                    foutput.write(f"{x}\tNo tables in data\n\n")
                self.write_miss_to_html(x, url_dict[x], "No tables", html_doc)

            html_out_file = os.path.join(self.cache.work_dir, "tables",
                                         f"combined.html")
            with open(html_out_file, "wb") as foutput2:
                foutput2.write(html.tostring(html_doc, pretty_print=True))

        foutput.close()
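# --- usage sketch (illustrative, not part of the original source) ---
# Assumes work_dir holds the scraped "<STATE>.html" files and the ChangeList metadata
# that process_all() reads; the path is hypothetical.
def _parser_example():
    parser = PageParser("data/clean")
    parser.process_all()               # writes results.txt plus tables/<STATE>.html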
class SpecializedCapture():
    def __init__(self,
                 temp_dir: str,
                 publish_dir: str,
                 driver: CaptiveBrowser = None):
        self.temp_dir = temp_dir
        self.publish_dir = publish_dir

        self.cache_images = DirectoryCache(os.path.join(publish_dir, "images"))
        self.cache = DirectoryCache(os.path.join(publish_dir))

        self.changed = False
        self._is_internal_browser = driver is None
        self._browser: CaptiveBrowser = driver

    def get_browser(self) -> CaptiveBrowser:
        if self._browser is not None: return self._browser

        logger.info("  [start captive browser]")
        self._browser = CaptiveBrowser()
        atexit.register(self._browser.close)
        return self._browser

    def close(self):
        if self._browser and self._is_internal_browser:
            logger.info("  [stop captive browser]")
            self._browser.close()
            atexit.unregister(self._browser.close)
            self._browser = None

    def publish(self):
        if not self.changed:
            logger.info("  [nothing changed]")
        else:
            host = get_host()
            dt = datetime.now(timezone.utc)
            msg = f"{udatetime.to_displayformat(dt)} on {host} - Specialized Capture"
            util_git.push(self.publish_dir, msg)

    def remove(self, key: str):
        self.cache.remove(key)

        prefix = f"{key}_"
        for unique_key in self.cache_images.list_files():
            if unique_key == key or unique_key.startswith(prefix):
                self.cache_images.remove(unique_key)

    def screenshot(self, key: str, label: str, url: str):

        logger.info(f"  screenshot {key}")

        ximages_dir = os.path.join(self.temp_dir, "images")
        if not os.path.exists(ximages_dir): os.makedirs(ximages_dir)

        xpath = os.path.join(self.temp_dir, f"{key}.png")
        xpath_temp = os.path.join(self.temp_dir, f"{key}_temp.png")
        xpath_prev = os.path.join(self.temp_dir, f"{key}_prev.png")
        xpath_diff = os.path.join(self.temp_dir, f"{key}_diff.png")

        browser = self.get_browser()
        if browser is None: raise Exception("Could not get browser")

        logger.info(f"    1. get content from {url}")
        if not browser.navigate(url):
            logger.info("  page timed out -> skip")

        logger.info(f"    2. wait for 5 seconds")
        time.sleep(5)

        logger.info(f"    3. save screenshot to {xpath}")
        buffer_new = browser.screenshot(xpath_temp)
        if buffer_new is None:
            logger.error("      *** could not capture image")
            return

        if os.path.exists(xpath):
            buffer_old = imageio.imread(xpath, as_gray=True)
            is_same, buffer_diff = are_images_same(buffer_new, buffer_old)
            if is_same:
                logger.info("      images are the same -> return")
                if os.path.exists(xpath_diff): os.remove(xpath_diff)
                return
            else:
                logger.warning("      images are different")
                if os.path.exists(xpath_prev): os.remove(xpath_prev)
                if os.path.exists(xpath): os.rename(xpath, xpath_prev)
                os.rename(xpath_temp, xpath)
                imageio.imwrite(xpath_diff, buffer_diff, format="png")
        else:
            logger.warning("      image is new")
            os.rename(xpath_temp, xpath)

        dt = datetime.now(timezone.utc)
        timestamp = udatetime.to_filenameformat(dt)
        key_image = key + "_" + timestamp + ".png"

        logger.info(f"    4. publish unique image {key_image}")
        xkey_image = self.cache_images.import_file(key_image, xpath)

        # also make a copy in the temp dir so we can preview HTML
        xpath_unique = os.path.join(self.temp_dir, "images", xkey_image)
        shutil.copyfile(xpath, xpath_unique)

        logger.info("    5. publish HTML snippet")
        xpath_html = os.path.join(self.temp_dir, f"{key}.html")
        with open(xpath_html, "w") as f:
            f.write(f"""
    <html>
    <body>
            <h3>{label}</h3>
            <div>captured: {udatetime.to_displayformat(dt)}</div>
            <div>src: <a href='{url}'>{url}</a></div>
            <br />
            <img src='images/{xkey_image}'>
    </body>
    </html>
    """)
        self.cache.import_file(f"{key}.html", xpath_html)
        self.changed = True
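# --- usage sketch (illustrative, not part of the original source) ---
# Assumes a writable temp_dir and a publish_dir backed by git (publish() calls
# util_git.push); key, label, and url are hypothetical. With no driver supplied,
# get_browser() starts an internal CaptiveBrowser that close() shuts down.
def _capture_example():
    capture = SpecializedCapture("tmp/capture", "publish/captive-browser")
    try:
        capture.screenshot("AZ", "Arizona dashboard", "https://example.com/az")
        capture.publish()              # pushes only when a screenshot actually changed
    finally:
        capture.close()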