def extract_meta(rawhtml):
    """Parse raw HTML and return a metadata dict.

    Currently extracts only 'title', taken from the first <h1> element's
    text (empty string when the page has no <h1>).
    """
    def _ex_title(soup):
        # xpath() returns a list of text nodes; empty list means no <h1>.
        s = soup.xpath("//h1/text()")
        return s[0] if s else ""  # idiom fix: truthiness instead of len(s) > 0

    meta = {}
    soup = lxsoup(rawhtml)
    meta['title'] = _ex_title(soup)
    return meta
def fetch_catalog_urls():
    """POST to the catalog endpoint and return the list of csv.zip URLs.

    Each raw href looks like:
        https://enfxfr.dol.gov/../data_catalog/OSHA/osha_accident_injury_20200727.csv.zip
    and is tidied to:
        https://enfxfr.dol.gov/data_catalog/OSHA/osha_accident_injury_20200727.csv.zip
    """
    mylog(CATALOG_URL, label="Fetching catalog")
    resp = requests.post(CATALOG_URL, data={"agency": "osha"})
    soup = lxsoup(resp.text)
    raw_hrefs = soup.xpath('//a[contains(@href, "csv.zip")]/@href')
    # Drop the spurious "../" path segment from every link.
    tidied = []
    for href in raw_hrefs:
        tidied.append(href.replace("../data_catalog", "data_catalog"))
    return tidied
def parse_description_page(txt):
    """Parse an OSHA SIC description page into a flat dict.

    Returns a dict with keys: division_code, division_name, group_code,
    group_name, sic_code, sic_name, sic_description, sic_examples.

    Raises StopIteration (via next()) if the page lacks the expected
    Division / Major Group / Description elements.
    """
    _rx_division = r'Division ([A-Z]): (.+)'

    def _extract_division(soup):
        # Division code/name are embedded in an <a title="Division X: Name">.
        dtext = next(t for t in soup.xpath('//a[contains(@title, "Division")]/@title')
                     if re.match(_rx_division, t))
        return re.match(_rx_division, dtext).groups()

    _rx_group = r'Major Group (\d+): (.+)'

    def _extract_group(soup):
        dtext = next(t for t in soup.xpath('//a[contains(@title, "Major Group")]/@title')
                     if re.match(_rx_group, t))
        return re.match(_rx_group, dtext).groups()

    _rx_sic = r'Description for (\d{4}): (.+)'

    def _extract_sic(soup):
        # The 4-digit SIC code and its name come from the page's <h2> heading.
        dtext = next(t for t in soup.xpath('//h2[contains(text(), "Description")]/text()')
                     if re.match(_rx_sic, t))
        return re.match(_rx_sic, dtext).groups()

    def _extract_sic_desc(soup):
        # Fix: removed a leftover `import pdb; pdb.set_trace()` debugger trap
        # that fired whenever more than one matching span was present — a
        # production hang. We now take the first matching span in all cases.
        els = soup.xpath('//span[contains(@class, "blueTen")]/text()')
        if not els:
            return ""
        return els[0]

    def _extract_sic_examples(soup):
        # Example industries are the <li> items immediately following the
        # description span; pages without examples yield an empty string.
        examples = soup.xpath('//span[contains(@class, "blueTen")]/following-sibling::ul[1]/li/text()')
        if not examples:
            return ""
        return '\n'.join(e.strip() for e in examples)

    soup = lxsoup(txt)
    d = {}
    d['division_code'], d['division_name'] = _extract_division(soup)
    d['group_code'], d['group_name'] = _extract_group(soup)
    d['sic_code'], d['sic_name'] = _extract_sic(soup)
    d['sic_description'] = _extract_sic_desc(soup)
    d['sic_examples'] = _extract_sic_examples(soup)
    return d
def _read_homepage():
    """Download (and cache) the homepage, returning it parsed as a soup."""
    cached_path = fetch_and_save(HOMEPAGE_URL)
    return lxsoup(cached_path.read_bytes())
def _read_manual():
    """Download (and cache) the manual's index page, returning it parsed as a soup."""
    cached_path = fetch_and_save(MANUEL_INDEX_URL)
    return lxsoup(cached_path.read_bytes())
def _get_desc_urls(srcpath):
    """Extract description-tab links from a saved index page.

    Relative hrefs containing "&tab=description" are resolved against
    MANUEL_INDEX_URL into absolute URLs.
    """
    soup = lxsoup(srcpath.read_bytes())
    rels = soup.xpath('//a[contains(@href, "&tab=description")]/@href')
    return [urljoin(MANUEL_INDEX_URL, rel) for rel in rels]