Example no. 1
0
def extract_meta(rawhtml):
    """Parse raw HTML and return a dict of page metadata (currently only 'title')."""

    def _first_h1(tree):
        # Text of the first <h1>, or "" when the page has none.
        matches = tree.xpath("//h1/text()")
        return matches[0] if matches else ""

    tree = lxsoup(rawhtml)
    return {'title': _first_h1(tree)}
def fetch_catalog_urls():
    """POST the catalog query for agency 'osha' and return tidied csv.zip URLs.

    Each scraped href looks like:
        https://enfxfr.dol.gov/../data_catalog/OSHA/osha_accident_injury_20200727.csv.zip
    and is normalized to:
        https://enfxfr.dol.gov/data_catalog/OSHA/osha_accident_injury_20200727.csv.zip
    """
    mylog(CATALOG_URL, label="Fetching catalog")
    response = requests.post(CATALOG_URL, data={"agency": "osha"})
    doc = lxsoup(response.text)
    hrefs = doc.xpath('//a[contains(@href, "csv.zip")]/@href')
    # Collapse the '../' segment the site emits in every catalog link.
    return [href.replace("../data_catalog", "data_catalog") for href in hrefs]
def parse_description_page(txt):
    """Parse an OSHA SIC description page into a flat dict.

    Args:
        txt: raw HTML of a single SIC description page.

    Returns:
        dict with keys: division_code, division_name, group_code, group_name,
        sic_code, sic_name, sic_description, sic_examples. The description and
        examples fields are "" when absent from the page.

    Raises:
        StopIteration: if the expected Division/Major Group/Description
        headings are missing from the page.
    """
    _rx_division = r'Division ([A-Z]): (.+)'
    def _extract_division(soup):
        # e.g. 'Division D: Manufacturing' -> ('D', 'Manufacturing')
        dtext = next(t for t in soup.xpath('//a[contains(@title, "Division")]/@title') if re.match(_rx_division, t))
        return re.match(_rx_division, dtext).groups()

    _rx_group = r'Major Group (\d+): (.+)'
    def _extract_group(soup):
        # e.g. 'Major Group 20: Food And Kindred Products' -> ('20', '...')
        dtext = next(t for t in soup.xpath('//a[contains(@title, "Major Group")]/@title') if re.match(_rx_group, t))
        return re.match(_rx_group, dtext).groups()

    _rx_sic = r'Description for (\d{4}): (.+)'
    def _extract_sic(soup):
        # e.g. 'Description for 2011: Meat Packing Plants' -> ('2011', '...')
        dtext = next(t for t in soup.xpath('//h2[contains(text(), "Description")]/text()') if re.match(_rx_sic, t))
        return re.match(_rx_sic, dtext).groups()

    def _extract_sic_desc(soup):
        # Pages normally carry one blueTen description span. The original code
        # dropped into pdb when several matched and then fell through,
        # returning None; take the first match instead so the result is
        # always a string.
        els = soup.xpath('//span[contains(@class, "blueTen")]/text()')
        return els[0] if els else ""

    def _extract_sic_examples(soup):
        # Example industries are the <li> items of the list that directly
        # follows the description span.
        examples = soup.xpath('//span[contains(@class, "blueTen")]/following-sibling::ul[1]/li/text()')
        if not examples:
            return ""
        return '\n'.join(e.strip() for e in examples)

    soup = lxsoup(txt)
    d = {}
    d['division_code'], d['division_name'] = _extract_division(soup)
    d['group_code'], d['group_name'] = _extract_group(soup)
    d['sic_code'], d['sic_name'] = _extract_sic(soup)
    d['sic_description'] = _extract_sic_desc(soup)
    d['sic_examples'] = _extract_sic_examples(soup)
    return d
 def _read_homepage():
     _srcpath = fetch_and_save(HOMEPAGE_URL)
     txt = _srcpath.read_bytes()
     return lxsoup(txt)
 def _read_manual():
     manual_srcpath = fetch_and_save(MANUEL_INDEX_URL)
     txt = manual_srcpath.read_bytes()
     return lxsoup(txt)
 def _get_desc_urls(srcpath):
     soup = lxsoup(srcpath.read_bytes())
     relpaths = soup.xpath('//a[contains(@href, "&tab=description")]/@href')
     urls = [urljoin(MANUEL_INDEX_URL, p) for p in relpaths]
     return urls