def write_parsed(self, name: str, cache: DirectoryCache):
    key = f"{name}_parsed.txt"
    if self.df is None:
        cache.remove(key)
    else:
        content = dataframe_to_text(self.df)
        cache.write(key, content)
def read(self, name: str, cache: DirectoryCache):
    key = f"{name}_data.txt"
    content = cache.read(key)
    if content is None:
        return
    self.df = dataframe_from_text(content)

    key = f"{name}_source.{self.content_type}"
    self.content = cache.read(key)
def write(self, name: str, cache: DirectoryCache, change_list: ChangeList):
    key = f"{name}_data.txt"
    new_content = dataframe_to_text(self.df)
    old_content = cache.read(key)
    if old_content != new_content:
        cache.write(key, new_content)

        key = f"{name}_source.{self.content_type}"
        cache.write(key, self.content)
        change_list.record_changed(name, "source", self.endpoint)
    else:
        change_list.record_unchanged(name, "source", self.endpoint)
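# Cache layout note (added commentary, not in the original source): each named
# source is persisted under three sibling keys in the DirectoryCache --
#   {name}_data.txt                 the parsed dataframe, serialized as text
#   {name}_source.{content_type}    the raw fetched content
#   {name}_parsed.txt               the optional parsed-output snapshot
# write() only touches the cached files (and records a change) when the
# serialized dataframe actually differs from what is already stored.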
class PageCompare():

    def __init__(self, work_dir_a: str, work_dir_b: str, out_dir: str):
        self.cache_a = DirectoryCache(work_dir_a)
        self.cache_b = DirectoryCache(work_dir_b)
        self.out_dir = out_dir

    def process_all(self):
        ignore_list = ["main_sheet", "WV"]

        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

        for fn in self.cache_a.list_html_files():
            x = fn.replace(".html", "")
            if x in ignore_list:
                continue

            content_a = self.cache_a.read(fn)
            content_b = self.cache_b.read(fn)

            if content_a == content_b:
                logger.info(f"=================| {fn}")
                logger.info("  data is SAME")
                continue

            fn_a = os.path.join(self.out_dir, f"{x}_A.html")
            fn_b = os.path.join(self.out_dir, f"{x}_B.html")
            if os.path.exists(fn_a):
                os.remove(fn_a)
            if os.path.exists(fn_b):
                os.remove(fn_b)

            if content_a == content_b:
                logger.info(f"=================| {fn}")
                logger.info("  data is FIXED")
                continue

            doc_a = html.fromstring(content_a)
            doc_b = html.fromstring(content_b)
            remove_identical_nodes(doc_a, doc_b)

            str_a = html.tostring(doc_a, pretty_print=True)
            str_b = html.tostring(doc_b, pretty_print=True)

            logger.info(f"=================| {fn}")
            logger.info("  data is different")
            with open(fn_a, "wb") as f:
                f.write(str_a)
            with open(fn_b, "wb") as f:
                f.write(str_b)
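# Usage sketch (added for illustration; the directory names below are
# hypothetical placeholders): diff the HTML captured by two scrape runs and
# write side-by-side _A/_B files for every page whose content differs.
def _example_compare_runs():
    compare = PageCompare("run_a/raw", "run_b/raw", "diff_out")
    compare.process_all()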
def read(self, cache: DirectoryCache, name: str):
    if len(self.names) == 0:
        raise Exception("No sources")

    content = cache.read(name)
    if content is None:
        return

    df = dataframe_from_text(content)
    df.index = df.names

    # mark any previously-known source that is no longer configured
    missing = ~df.index.isin(self.names)
    if missing.any():
        df.loc[missing, "status"] = "removed"

    df.reset_index(inplace=True, drop=True)
    self.df_status = df
class DataPipeline():

    def __init__(self, config: DataPipelineConfig):
        self.config = config

        self.change_list: ChangeList = None

        base_dir = config.base_dir
        self.cache_sources = DirectoryCache(os.path.join(base_dir, "sources"))
        self.cache_raw = DirectoryCache(os.path.join(base_dir, "raw"))
        self.cache_clean = DirectoryCache(os.path.join(base_dir, "clean"))
        self.cache_extract = DirectoryCache(os.path.join(base_dir, "extract"))
        self.cache_diff = DirectoryCache(os.path.join(base_dir, "diff"))

        self.url_manager = UrlManager(config.headless, config.browser)

        self.sources: UrlSources = None
        self._capture: SpecializedCapture = None

    def get_capture(self) -> SpecializedCapture:
        if self._capture is None:
            publish_dir = os.path.join(self.config.base_dir, 'captive-browser')
            driver = self.url_manager._captive.driver if self.url_manager._captive else None
            self._capture = SpecializedCapture(self.config.temp_dir, publish_dir, driver)
        return self._capture

    def shutdown_capture(self):
        if self._capture is not None:
            self._capture.close()
            if self.config.auto_push:
                self._capture.publish()
            self._capture = None

    def update_sources(self):
        " update the remote url sources "
        manager = UrlSourceManager(self.cache_sources)
        self.sources = manager.update_sources("scan")

    def process(self) -> Dict[str, str]:
        " run the pipeline "
        self.url_manager.reset()
        self.change_list = ChangeList(self.cache_raw)

        host = get_host()
        print(f"=== run started on {host} at {udatetime.to_logformat(self.change_list.start_date)}")

        self.change_list.start_run()
        try:
            if self.sources is None:
                raise Exception("Sources not provided")
            src = self.sources.items[0]
            if src.name != "google-states-csv":
                raise Exception(f"Expected first source to be google-states-csv, not {src.name}")
            return self._main_loop(src, self.change_list)
        except Exception as ex:
            logger.exception(ex)
            self.change_list.abort_run(ex)
        finally:
            self.change_list.finish_run()
            self.shutdown_capture()

            logger.info(f"  [in-memory content cache took {self.url_manager.size*1e-6:.1f} MBs]")
            logger.info(f"run finished on {host} at {udatetime.to_logformat(self.change_list.start_date)}")

    def format_html(self, rerun=False):
        " format raw html "
        is_first = True
        for key in self.cache_raw.list_html_files():
            if key == "index.html":
                continue
            if key == "google_sheet.html":
                continue
            if rerun or not self.cache_raw.exists(key):
                if is_first:
                    logger.info("format existing files...")
                    is_first = False
                logger.info(f"  format {key}")
                local_raw_content = self.cache_raw.read(key)

                formater = HtmlFormater()
                local_clean_content = formater.format(None, local_raw_content)
                self.cache_raw.write(key, local_clean_content)

    def clean_html(self, rerun=False):
        " generate clean files from existing raw html "
        is_first = True
        for key in self.cache_raw.list_html_files():
            if key == "index.html":
                continue
            if key == "google_sheet.html":
                continue
            if rerun or not self.cache_clean.exists(key):
                if is_first:
                    logger.info("clean existing files...")
                    is_first = False
                logger.info(f"  clean {key}")
                local_raw_content = self.cache_raw.read(key)

                cleaner = HtmlCleaner()
                local_clean_content = cleaner.clean(local_raw_content)
                self.cache_clean.write(key, local_clean_content)

    def extract_html(self, rerun=False):
        " generate extract files from existing clean html "
        self.change_list = ChangeList(self.cache_raw)
        self.change_list.load()

        is_first = True
        for key in self.cache_clean.list_html_files():
            if key == "index.html":
                continue
            if key == "google_sheet.html":
                continue
            if rerun or not self.cache_extract.exists(key):
                if is_first:
                    logger.info("extract existing files...")
                    is_first = False
                logger.info(f"  extract {key}")
                local_clean_content = self.cache_clean.read(key)

                item = self.change_list.get_item(key)
                if item is None:
                    logger.warning("  skip because it is a new item")
                    continue

                extracter = HtmlExtracter()
                local_extract_content = extracter.extract(local_clean_content, item)
                self.cache_extract.write(key, local_extract_content)

    def _main_loop(self, source: UrlSource, change_list: ChangeList) -> Dict[str, str]:

        def remove_duplicate_if_exists(location: str, source: str, other_state: str):
            key = location + ".html"

            self.cache_raw.remove(key)
            self.cache_clean.remove(key)
            change_list.record_duplicate(key, source, f"duplicate of {other_state}")

            if self.config.capture_image:
                c = self.get_capture()
                c.remove(location)

        def fetch_if_changed(location: str, source: str, xurl: str, skip: bool = False) -> bool:
            key = location + ".html"
            if xurl == "" or xurl is None or xurl == "None":
                change_list.record_skip(key, source, xurl, "missing url")
                return False

            mins = change_list.get_minutes_since_last_check(key)
            if self.config.trace:
                logger.info(f"  checked {key} {mins:.1f} minutes ago")
            if mins < 15.0:
                if self.config.rerun_now:
                    logger.info(f"{key}: checked {mins:.1f} mins ago")
                else:
                    logger.info(f"{key}: checked {mins:.1f} mins ago -> skip b/c < 15 mins")
                    change_list.temporary_skip(key, source, xurl, "age < 15 mins")
                    return False

            if skip:
                change_list.record_skip(key, source, xurl, "skip flag set")
                return False

            if self.config.trace:
                logger.info(f"fetch {xurl}")
            remote_raw_content, status = self.url_manager.fetch(xurl)
            is_bad, msg = is_bad_content(remote_raw_content)
            if is_bad:
                change_list.record_failed(key, source, xurl, msg)
                return False

            if status > 300:
                change_list.record_failed(location, source, xurl, f"HTTP status {status}")
                return False

            remote_raw_content = remote_raw_content.replace(b"\r", b"")

            formater = HtmlFormater()
            remote_raw_content = formater.format(xurl, remote_raw_content)

            local_clean_content = self.cache_clean.read(key)

            cleaner = HtmlCleaner()
            remote_clean_content = cleaner.clean(remote_raw_content)

            if local_clean_content != remote_clean_content:
                self.cache_raw.write(key, remote_raw_content)
                self.cache_clean.write(key, remote_clean_content)
                change_list.record_changed(key, source, xurl)

                item = change_list.get_item(key)

                formatter = HtmlFormater()
                remote_raw_content = formatter.format(xurl, remote_raw_content)

                extracter = HtmlExtracter()
                remote_extract_content = extracter.extract(remote_clean_content, item)
                self.cache_extract.write(key, remote_extract_content)

                if self.config.capture_image:
                    c = self.get_capture()
                    c.screenshot(key, f"Screenshot for {location}", xurl)
            else:
                change_list.record_unchanged(key, source, xurl)
                return False

        # -- get urls to hit
        if source.status != "valid":
            raise Exception(f"URL source {source.name} status is not valid")

        df_config = source.df
        if df_config is None:
            raise Exception(f"URL source {source.name} does not have any data loaded")

        # -- fetch pages
        skip = False
        err_cnt = 0

        for idx, r in df_config.iterrows():
            location = r["location"]
            source = r["source_name"]
            general_url = r["main_page"]
            data_url = r["data_page"]

            if general_url is None and data_url is None:
                logger.warning(f"  no urls for {location} -> skip")
                change_list.record_skip(location)
                continue

            if idx % 10 == 1:
                change_list.save_progress()

            if general_url is not None:
                try:
                    fetch_if_changed(location, source, general_url, skip=skip)
                except Exception as ex:
                    err_cnt += 1
                    if err_cnt > 10:
                        break
                    change_list.record_failed(location, source, general_url, "Exception in code")
                    logger.exception(ex)
                    logger.error("  error -> continue to next page")

            if data_url is not None:
                if general_url == data_url:
                    remove_duplicate_if_exists(location + "_data", source, location)
                else:
                    try:
                        fetch_if_changed(location + "_data", source, data_url, skip=skip)
                    except Exception as ex:
                        err_cnt += 1
                        if err_cnt > 10:
                            break
                        change_list.record_failed(location, source, data_url, "Exception in code")
                        logger.exception(ex)
                        logger.error("  error -> continue to next page")

        if err_cnt > 10:
            logger.error(f"  abort run due to {err_cnt} errors")

        change_list.write_html_to_cache(self.cache_raw, "RAW")
        change_list.write_html_to_cache(self.cache_clean, "CLEAN")
        change_list.write_html_to_cache(self.cache_extract, "EXTRACT")
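# Usage sketch (added for illustration): a typical driver for DataPipeline.
# The config object is taken as a parameter because DataPipelineConfig's
# constructor is not shown in this excerpt; only the fields the class reads
# above (base_dir, temp_dir, headless, browser, auto_push, capture_image,
# trace, rerun_now) are assumed to exist.
def _example_run_pipeline(config: DataPipelineConfig) -> None:
    pipeline = DataPipeline(config)
    pipeline.update_sources()   # refresh the remote URL source list
    pipeline.process()          # fetch, clean, and extract every configured page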
def write(self, cache: DirectoryCache, name: str):
    if self.df_status is not None:
        content = dataframe_to_text(self.df_status)
        cache.write(name, content)
class PageParser():

    def __init__(self, work_dir: str):
        self.cache = DirectoryCache(work_dir)

    def strip_attributes(self, x: html.Element):
        x.attrib.clear()
        for y in x:
            self.strip_attributes(y)

    def extract_tables(self, tables: List) -> List[ContentTable]:
        result = []
        for t in tables:
            self.strip_attributes(t)
            x = ContentTable(t)
            result.append(x)
        return result

    def write_as_text(self, foutput, name: str, tables: List[ContentTable]):
        foutput.write(f"{name}\n")
        for i in range(len(tables)):
            foutput.write(f"{name}\tTable {i+1}\n")
            t = tables[i]
            for r in t.rows:
                foutput.write(f"{name}\tTable {i+1}")
                for c in r:
                    foutput.write("\t")
                    if c is not None:
                        c2 = unidecode(c)
                        foutput.write(c2)
                foutput.write("\n")
            foutput.write("\n")

    def write_as_html(self, foutput, name: str, url: str,
                      tables: List[ContentTable], html_doc: html.Element):
        s = html.Element("div")

        h = html.Element("h1")
        h.text = name
        s.append(h)

        m = html.Element("div")
        m.text = self.cache.read_date_time_str(name + ".html")
        s.append(m)

        for t in tables:
            s.append(t.new_element)

        x = html.Element("br")
        s.append(x)

        a = html.Element("a")
        a.attrib["href"] = url
        a.text = url
        s.append(a)

        h = html.Element("html")
        h.append(html.Element("body"))
        h[0].append(deepcopy(s))
        foutput.write(html.tostring(h, pretty_print=True))

        html_doc.append(s)
        html_doc.append(html.Element("hr"))

    def write_miss_to_html(self, name: str, url: str, msg: str,
                           html_doc: html.Element):
        s = html.Element("div")

        h = html.Element("h1")
        h.text = name
        s.append(h)

        m = html.Element("div")
        m.text = self.cache.read_date_time_str(name + ".html")
        s.append(m)

        m = html.Element("span")
        m.text = msg
        s.append(m)

        x = html.Element("br")
        s.append(x)

        a = html.Element("a")
        a.attrib["href"] = url
        a.text = url
        s.append(a)

        html_doc.append(s)
        html_doc.append(html.Element("hr"))

    def process_all(self):
        cl = ChangeList(self.cache)
        url_dict = cl.read_urls_as_dict()

        ignore_list = ["main_sheet", "WV"]
        missing_list = [
            "WI", "VA", "UT", "TN", "SC", "RI", "PA", "NJ", "NE", "ND",
            "ND_data", "NC", "MS", "MO", "MN", "KY", "KS", "IA", "HI", "GA",
            "DC_data", "AZ", "AL"
        ]
        table_list = [
            "WI_data", "WA", "VT", "TX", "SD", "SC_data", "OR", "OK", "OH",
            "NY", "NV", "NM", "NJ_data", "NH", "NC_data", "MT", "MN_data",
            "MI", "ME", "MD", "MA_data", "MA", "CO", "LA", "IN", "IL", "ID",
            "FL", "DE", "DC", "CT", "CO_data", "CA", "AR", "AK_data", "AK"
        ]

        out_file = os.path.join(self.cache.work_dir, "results.txt")
        old_date = self.cache.read_old_date()

        foutput = open(out_file, "w")
        foutput.write(f"Data Scanned at\t{old_date}\n")
        foutput.write("STATE RESULTS\n\n")

        html_out_dir = os.path.join(self.cache.work_dir, "tables")
        if not os.path.exists(html_out_dir):
            os.makedirs(html_out_dir)

        html_out_file = os.path.join(self.cache.work_dir, "tables", "combined.html")
        html_doc = html.Element("html")
        html_doc.append(html.Element("body"))
        html_doc[0].append(html.Element("span"))
        html_doc[0][0].text = f"data scraped at {old_date}"
        html_doc[0].append(html.Element("hr"))

        for fn in self.cache.list_html_files():
            x = fn.replace(".html", "")
            if x in ignore_list:
                continue

            logger.info(f"=================| {fn}")

            content = self.cache.read(fn)
            tree = html.fromstring(content)

            tables = tree.xpath('//table')
            if len(tables) > 0:
                if x in missing_list:
                    foutput.write(f"{x}\t*** Found unexpected tables\n\n")
                    logger.warning(f"  found {len(tables)} unexpected tables")

                xlist = self.extract_tables(tables)
                xlist2 = [t for t in xlist if t.contains_data()]
                if len(xlist2) == 0:
                    foutput.write(f"{x}\tNo data tables\n\n")
                    self.write_miss_to_html(x, url_dict[x], "No data tables", html_doc)
                    continue

                #xlist = self.remove_nondata_tables(xlist)
                self.write_as_text(foutput, x, xlist2)

                html_out_file = os.path.join(self.cache.work_dir, "tables", f"{x}.html")
                with open(html_out_file, "wb") as foutput2:
                    self.write_as_html(foutput2, x, url_dict[x], xlist2, html_doc)
            else:
                if x in table_list:
                    foutput.write(f"{x}\t*** Missing expected tables\n\n")
                    logger.warning("  missing tables")
                else:
                    foutput.write(f"{x}\tNo tables in data\n\n")
                self.write_miss_to_html(x, url_dict[x], "No tables", html_doc)

        html_out_file = os.path.join(self.cache.work_dir, "tables", "combined.html")
        with open(html_out_file, "wb") as foutput2:
            foutput2.write(html.tostring(html_doc, pretty_print=True))
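# Usage sketch (added for illustration; the directory path is a hypothetical
# placeholder): scan a working directory of captured state pages and emit
# results.txt plus per-state and combined HTML table reports under tables/.
def _example_parse_pages():
    parser = PageParser("work/raw")
    parser.process_all()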
class SpecializedCapture():

    def __init__(self, temp_dir: str, publish_dir: str,
                 driver: CaptiveBrowser = None):
        self.temp_dir = temp_dir
        self.publish_dir = publish_dir

        self.cache_images = DirectoryCache(os.path.join(publish_dir, "images"))
        self.cache = DirectoryCache(os.path.join(publish_dir))

        self.changed = False
        self._is_internal_browser = driver is None
        self._browser: CaptiveBrowser = driver

    def get_browser(self) -> CaptiveBrowser:
        if self._browser is not None:
            return self._browser
        logger.info("  [start captive browser]")
        self._browser = CaptiveBrowser()
        atexit.register(self._browser.close)
        return self._browser

    def close(self):
        if self._browser and self._is_internal_browser:
            logger.info("  [stop captive browser]")
            self._browser.close()
            atexit.unregister(self._browser.close)
            self._browser = None

    def publish(self):
        if not self.changed:
            logger.info("  [nothing changed]")
        else:
            host = get_host()
            dt = datetime.now(timezone.utc)
            msg = f"{udatetime.to_displayformat(dt)} on {host} - Specialized Capture"
            util_git.push(self.publish_dir, msg)

    def remove(self, key: str):
        self.cache.remove(key)
        prefix = f"{key}_"
        for unique_key in self.cache_images.list_files():
            if unique_key == key or unique_key.startswith(prefix):
                self.cache_images.remove(unique_key)

    def screenshot(self, key: str, label: str, url: str):
        logger.info(f"  screenshot {key}")

        ximages_dir = os.path.join(self.temp_dir, "images")
        if not os.path.exists(ximages_dir):
            os.makedirs(ximages_dir)

        xpath = os.path.join(self.temp_dir, f"{key}.png")
        xpath_temp = os.path.join(self.temp_dir, f"{key}_temp.png")
        xpath_prev = os.path.join(self.temp_dir, f"{key}_prev.png")
        xpath_diff = os.path.join(self.temp_dir, f"{key}_diff.png")

        browser = self.get_browser()
        if browser is None:
            raise Exception("Could not get browser")

        logger.info(f"  1. get content from {url}")
        if not browser.navigate(url):
            logger.info("  page timed out -> skip")

        logger.info("  2. wait for 5 seconds")
        time.sleep(5)

        logger.info(f"  3. save screenshot to {xpath}")
        buffer_new = browser.screenshot(xpath_temp)
        if buffer_new is None:
            logger.error("  *** could not capture image")
            return

        if os.path.exists(xpath):
            buffer_old = imageio.imread(xpath, as_gray=True)
            is_same, buffer_diff = are_images_same(buffer_new, buffer_old)
            if is_same:
                logger.info("  images are the same -> return")
                if os.path.exists(xpath_diff):
                    os.remove(xpath_diff)
                return
            else:
                logger.warning("  images are different")
                if os.path.exists(xpath_prev):
                    os.remove(xpath_prev)
                if os.path.exists(xpath):
                    os.rename(xpath, xpath_prev)
                os.rename(xpath_temp, xpath)
                imageio.imwrite(xpath_diff, buffer_diff, format="png")
        else:
            logger.warning("  image is new")
            os.rename(xpath_temp, xpath)

        dt = datetime.now(timezone.utc)
        timestamp = udatetime.to_filenameformat(dt)

        key_image = key + "_" + timestamp + ".png"
        logger.info(f"  4. publish unique image {key_image}")
        xkey_image = self.cache_images.import_file(key_image, xpath)

        # also make a copy in the temp dir so we can preview HTML
        xpath_unique = os.path.join(self.temp_dir, "images", xkey_image)
        shutil.copyfile(xpath, xpath_unique)

        logger.info("  5. publish HTML snippet")
        xpath_html = os.path.join(self.temp_dir, f"{key}.html")
        with open(xpath_html, "w") as f:
            f.write(f"""
<html>
  <body>
    <h3>{label}</h3>
    <div>captured: {udatetime.to_displayformat(dt)}</div>
    <div>src: <a href='{url}'>{url}</a></div>
    <br />
    <img src='images/{xkey_image}'>
  </body>
</html>
""")
        self.cache.import_file(f"{key}.html", xpath_html)
        self.changed = True
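# Usage sketch (added for illustration; the paths, key, and URL below are
# hypothetical placeholders): capture a screenshot of a page, publish the
# unique image plus an HTML snippet, then shut the captive browser down.
def _example_capture_screenshot():
    capture = SpecializedCapture("temp", "publish")
    try:
        capture.screenshot("AZ", "Screenshot for AZ", "https://example.com/cases")
        capture.publish()   # pushes publish_dir via util_git when something changed
    finally:
        capture.close()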