def update_sources(self, mode: str) -> UrlSources:
    self.change_list = ChangeList(self.cache)
    self.change_list.load()
    self.change_list.start_run()

    sources = UrlSources()
    sources.scan(sources_config)
    sources.read(self.cache, "sources.txt")
    logger.info(f" found {len(sources.items)} sources")

    validator = UrlSourceValidator()
    for src in sources.items:
        if not src.check_mode(mode): continue

        src.update_from_remote()
        src.write_parsed(src.name, self.cache)

        if validator.validate(src):
            src.status = "valid"
            logger.info(f" {src.name}: save")
            src.write(src.name, self.cache, self.change_list)
            logger.info(f" {src.name}: updated from remote")
        else:
            src.status = "invalid"
            validator.display_status()
            if src.read(src.name, self.cache):
                logger.warning(f" {src.name}: use local cache")
            else:
                self.change_list.record_failed(src.name, "source", src.endpoint, "no local cache")

    sources.update_status()
    sources.write(self.cache, "sources.txt")
    self.change_list.finish_run()
    return sources
class DataPipeline():

    def __init__(self, config: DataPipelineConfig):
        self.config = config

        self.change_list: ChangeList = None

        base_dir = config.base_dir
        self.cache_sources = DirectoryCache(os.path.join(base_dir, "sources"))
        self.cache_raw = DirectoryCache(os.path.join(base_dir, "raw"))
        self.cache_clean = DirectoryCache(os.path.join(base_dir, "clean"))
        self.cache_extract = DirectoryCache(os.path.join(base_dir, "extract"))
        self.cache_convert = DirectoryCache(os.path.join(base_dir, "convert"))
        self.cache_diff = DirectoryCache(os.path.join(base_dir, "diff"))

        self.url_manager = UrlManager(config.headless, config.browser)

        self.sources: UrlSources = None
        self._capture: SpecializedCapture = None

    def get_capture(self) -> SpecializedCapture:
        if self._capture is None:
            publish_dir = os.path.join(self.config.base_dir, 'captive-browser')
            driver = self.url_manager._captive.driver if self.url_manager._captive else None
            self._capture = SpecializedCapture(self.config.temp_dir, publish_dir, driver)
        return self._capture

    def shutdown_capture(self):
        if self._capture is not None:
            self._capture.close()
            if self.config.auto_push:
                self._capture.publish()
        self._capture = None

    def update_sources(self):
        " update the remote url sources "
        manager = UrlSourceManager(self.cache_sources)
        self.sources = manager.update_sources("scan")

    def process(self) -> Dict[str, str]:
        " run the pipeline "
        self.url_manager.reset()
        self.change_list = ChangeList(self.cache_raw)

        host = get_host()
        print(f"=== run started on {host} at {udatetime.to_logformat(self.change_list.start_date)}")

        self.change_list.start_run()
        try:
            if self.sources is None:
                raise Exception("Sources not provided")
            src = self.sources.items[0]
            if src.name != "google-states-csv":
                raise Exception(f"Expected first source to be google-states-csv, not {src.name}")
            return self._main_loop(src, self.change_list)
        except Exception as ex:
            logger.exception(ex)
            self.change_list.abort_run(ex)
        finally:
            self.change_list.finish_run()
            self.shutdown_capture()
            logger.info(f" in-memory content cache took {self.url_manager.size*1e-6:.1f} MBs")
            logger.info(f"run finished on {host} at {udatetime.to_logformat(self.change_list.start_date)}")

    def format_html(self, rerun=False):
        " format raw html "
        is_first = True
        for key in self.cache_raw.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            if rerun or not self.cache_raw.exists(key):
                if is_first:
                    logger.info(f"format existing files...")
                    is_first = False
                logger.info(f" format {key}")
                local_raw_content = self.cache_raw.read(key)
                formater = HtmlFormater()
                local_clean_content = formater.format(None, local_raw_content)
                self.cache_raw.write(key, local_clean_content)

    def clean_html(self, rerun=False):
        " generate clean files from existing raw html "
        is_first = True
        for key in self.cache_raw.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            if rerun or not self.cache_clean.exists(key):
                if is_first:
                    logger.info(f"clean existing files...")
                    is_first = False
                logger.info(f" clean {key}")
                local_raw_content = self.cache_raw.read(key)
                cleaner = HtmlCleaner()
                local_clean_content = cleaner.clean(local_raw_content)
                self.cache_clean.write(key, local_clean_content)

    def extract_html(self, rerun=False):
        " generate extract files from existing clean html "
        self.change_list = ChangeList(self.cache_raw)
        self.change_list.load()

        is_first = True
        for key in self.cache_clean.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            if rerun or not self.cache_extract.exists(key):
                if is_first:
                    logger.info(f"extract existing files...")
                    is_first = False
                logger.info(f" extract {key}")
                local_clean_content = self.cache_clean.read(key)
                item = self.change_list.get_item(key)
                if item is None:
                    logger.warning(" skip because it is a new item")
                    continue
                extracter = HtmlExtracter()
                local_extract_content = extracter.extract(local_clean_content, item)
                self.cache_extract.write(key, local_extract_content)

    def convert_to_json(self, rerun=False):
        " get json data out of extracted html "
        self.change_list = ChangeList(self.cache_raw)
        self.change_list.load()

        is_first = True
        for key in self.cache_extract.list_html_files():
            if key == "index.html": continue
            if key == "google_sheet.html": continue
            xkey = key.replace(".html", ".json")
            if rerun or not self.cache_convert.exists(xkey):
                if is_first:
                    logger.info(f"convert existing files...")
                    is_first = False
                logger.info(f" convert {key}")
                local_extract_content = self.cache_extract.read(key)
                item = self.change_list.get_item(key)
                if item is None:
                    logger.warning(" skip because it is a new item")
                    continue
                converter = HtmlConverter()
                local_convert_content = converter.convert(key, local_extract_content, item)
                self.cache_convert.write(xkey, local_convert_content)

    def _main_loop(self, source: UrlSource, change_list: ChangeList) -> Dict[str, str]:

        def remove_duplicate_if_exists(location: str, source: str, other_state: str):
            key = location + ".html"

            self.cache_raw.remove(key)
            self.cache_clean.remove(key)
            change_list.record_duplicate(key, source, f"duplicate of {other_state}")

            if self.config.capture_image:
                c = self.get_capture()
                c.remove(location)

        def fetch_if_changed(location: str, source: str, xurl: str, skip: bool = False) -> bool:
            key = location + ".html"

            if xurl is None or xurl == "" or xurl == "None":
                change_list.record_skip(key, source, xurl, "missing url")
                return False

            mins = change_list.get_minutes_since_last_check(key)
            if self.config.trace: logger.info(f" checked {key} {mins:.1f} minutes ago")
            if mins < 15.0:
                if self.config.rerun_now:
                    logger.info(f"{key}: checked {mins:.1f} mins ago")
                else:
                    logger.info(f"{key}: checked {mins:.1f} mins ago -> skip b/c < 15 mins")
                    change_list.temporary_skip(key, source, xurl, "age < 15 mins")
                    return False

            if skip:
                change_list.record_skip(key, source, xurl, "skip flag set")
                return False

            if self.config.trace: logger.info(f"fetch {xurl}")
            remote_raw_content, status = self.url_manager.fetch(xurl)
            is_bad, msg = is_bad_content(remote_raw_content)
            if is_bad:
                change_list.record_failed(key, source, xurl, msg)
                return False

            if status > 300:
                change_list.record_failed(location, source, xurl, f"HTTP status {status}")
                return False

            remote_raw_content = remote_raw_content.replace(b"\r", b"")

            formater = HtmlFormater()
            remote_raw_content = formater.format(xurl, remote_raw_content)

            local_clean_content = self.cache_clean.read(key)

            cleaner = HtmlCleaner()
            remote_clean_content = cleaner.clean(remote_raw_content)

            if local_clean_content != remote_clean_content:
                self.cache_raw.write(key, remote_raw_content)
                self.cache_clean.write(key, remote_clean_content)
                change_list.record_changed(key, source, xurl)

                item = change_list.get_item(key)

                extracter = HtmlExtracter()
                remote_extract_content = extracter.extract(remote_clean_content, item)
                self.cache_extract.write(key, remote_extract_content)

                converter = HtmlConverter()
                remote_convert_content = converter.convert(key, remote_extract_content, item)
                self.cache_convert.write(key, remote_convert_content)

                if self.config.capture_image:
                    c = self.get_capture()
                    c.screenshot(key, f"Screenshot for {location}", xurl)
                return True
            else:
                change_list.record_unchanged(key, source, xurl)
                return False

        # -- get urls to hit
        if source.status != "valid":
            raise Exception(f"URL source {source.name} status is not valid")

        df_config = source.df
        if df_config is None:
            raise Exception(f"URL source {source.name} does not have any data loaded")

        # -- fetch pages
        skip = False
        err_cnt = 0

        for idx, r in df_config.iterrows():
            location = r["location"]
            source = r["source_name"]  # shadows the UrlSource argument; only the source name is needed below
            general_url = r["main_page"]
            data_url = r["data_page"]

            if general_url is None and data_url is None:
                logger.warning(f" no urls for {location} -> skip")
                change_list.record_skip(location)
                continue

            if idx % 10 == 1: change_list.save_progress()

            if general_url is not None:
                try:
                    fetch_if_changed(location, source, general_url, skip=skip)
                except Exception as ex:
                    err_cnt += 1
                    if err_cnt > 10: break
                    change_list.record_failed(location, source, general_url, "Exception in code")
                    logger.exception(ex)
                    logger.error(" error -> continue to next page")

            if data_url is not None:
                if general_url == data_url:
                    remove_duplicate_if_exists(location + "_data", source, location)
                else:
                    try:
                        fetch_if_changed(location + "_data", source, data_url, skip=skip)
                    except Exception as ex:
                        err_cnt += 1
                        if err_cnt > 10: break
                        change_list.record_failed(location, source, data_url, "Exception in code")
                        logger.exception(ex)
                        logger.error(" error -> continue to next page")

        if err_cnt > 10:
            logger.error(f" abort run due to {err_cnt} errors")

        change_list.write_html_to_cache(self.cache_raw, "RAW")
        change_list.write_html_to_cache(self.cache_clean, "CLEAN")
        change_list.write_html_to_cache(self.cache_extract, "EXTRACT")
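# Usage sketch (illustrative only, not part of the original module): drives the
# pipeline end to end. The DataPipelineConfig fields shown here are assumptions
# inferred from the attributes the class reads (base_dir, temp_dir, headless,
# browser, trace, rerun_now, capture_image, auto_push); the real constructor
# signature may differ.
#
#   config = DataPipelineConfig(base_dir="data", temp_dir="temp", headless=True,
#                               browser="chrome", trace=False, rerun_now=False,
#                               capture_image=False, auto_push=False)
#   pipeline = DataPipeline(config)
#   pipeline.update_sources()   # refresh the url source list ("scan" mode)
#   pipeline.process()          # fetch, clean, extract, and convert changed pages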
def process_all(self):
    cl = ChangeList(self.cache)
    url_dict = cl.read_urls_as_dict()

    ignore_list = ["main_sheet", "WV"]
    missing_list = [
        "WI", "VA", "UT", "TN", "SC", "RI", "PA", "NJ", "NE", "ND", "ND_data",
        "NC", "MS", "MO", "MN", "KY", "KS", "IA", "HI", "GA", "DC_data", "AZ", "AL"
    ]
    table_list = [
        "WI_data", "WA", "VT", "TX", "SD", "SC_data", "OR", "OK", "OH", "NY",
        "NV", "NM", "NJ_data", "NH", "NC_data", "MT", "MN_data", "MI", "ME",
        "MD", "MA_data", "MA", "CO", "LA", "IN", "IL", "ID", "FL", "DE", "DC",
        "CT", "CO_data", "CA", "AR", "AK_data", "AK"
    ]

    out_file = os.path.join(self.cache.work_dir, "results.txt")
    old_date = self.cache.read_old_date()

    foutput = open(out_file, "w")
    foutput.write(f"Data Scanned at\t{old_date}\n")
    foutput.write(f"STATE RESULTS\n\n")

    html_out_dir = os.path.join(self.cache.work_dir, "tables")
    if not os.path.exists(html_out_dir):
        os.makedirs(html_out_dir)

    html_out_file = os.path.join(self.cache.work_dir, "tables", f"combined.html")
    html_doc = html.Element("html")
    html_doc.append(html.Element("body"))
    html_doc[0].append(html.Element("span"))
    html_doc[0][0].text = f"data scraped at {old_date}"
    html_doc[0].append(html.Element("hr"))

    for fn in self.cache.list_html_files():
        x = fn.replace(".html", "")
        if x in ignore_list: continue

        logger.info(f"=================| {fn}")

        content = self.cache.read(fn)
        tree = html.fromstring(content)
        tables = tree.xpath('//table')
        if len(tables) > 0:
            if x in missing_list:
                foutput.write(f"{x}\t*** Found unexpected tables\n\n")
                logger.warning(f" found {len(tables)} unexpected tables")

            xlist = self.extract_tables(tables)
            xlist2 = [t for t in xlist if t.contains_data()]
            if len(xlist2) == 0:
                foutput.write(f"{x}\tNo data tables\n\n")
                self.write_miss_to_html(x, url_dict[x], "No data tables", html_doc)
                continue

            #xlist = self.remove_nondata_tables(xlist)
            self.write_as_text(foutput, x, xlist2)

            html_out_file = os.path.join(self.cache.work_dir, "tables", f"{x}.html")
            with open(html_out_file, "wb") as foutput2:
                self.write_as_html(foutput2, x, url_dict[x], xlist2, html_doc)
        else:
            if x in table_list:
                foutput.write(f"{x}\t*** Missing expected tables\n\n")
                logger.warning(f" missing tables")
            else:
                foutput.write(f"{x}\tNo tables in data\n\n")
            self.write_miss_to_html(x, url_dict[x], "No tables", html_doc)

    html_out_file = os.path.join(self.cache.work_dir, "tables", f"combined.html")
    with open(html_out_file, "wb") as foutput2:
        foutput2.write(html.tostring(html_doc, pretty_print=True))

    foutput.close()