def __init__(self, soup_obj, process_html, *args, **kwargs):
    """Build the DataFrame from the page's table, or an empty frame if none."""
    temp = clean_find(soup_obj, ["div", {"class": "card-body"}])
    text = "".join(get_strings(clean_find(temp, ["h3", {"class": "h4"}])))
    self.title = text.split(".")[0]
    raw_rows = get_rows_in_table(soup_obj)
    if raw_rows is not None:
        header_row_text = self.get_table_header(soup_obj)
        rows = [extract_columns(row) for row in raw_rows]
        assert are_rowlens_equal(rows)
        # Reconcile an off-by-one between the header width and the row width.
        if len(header_row_text) == len(rows[0]) - 1:
            rows = drop_empty_first_row(rows)
        if len(header_row_text) != len(rows[0]):
            warn("Header: `" + "`, `".join(header_row_text) + "`")
            warn("Row: `" + "`, `".join(
                [get_string(e).strip() for e in rows[0]]) + "`")
            raise PageScrapeException(
                "Header has length {0} and rows are length {1}".format(
                    len(header_row_text), len(rows[0])))
        if not process_html:
            # Flatten each cell to its raw HTML string representation.
            rows = [[str(cell) for cell in row] for row in rows]
        else:
            raise NotImplementedError("process_html=True is not implemented")
        pandas.DataFrame.__init__(
            self, *args, data=rows, columns=header_row_text, **kwargs)
    else:
        pandas.DataFrame.__init__(self, *args, **kwargs)
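
# Usage sketch (hypothetical names): assuming this __init__ belongs to a
# pandas.DataFrame subclass, here called ReportTable, and that `html` holds a
# fetched disclosure page. `ReportTable` and `html` are illustrative only.
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup(html, "html.parser")
#     table = ReportTable(soup, process_html=False)
#     print(table.title, table.columns.tolist())
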
def get_report_metadata(soup_obj):
    """Extract report-level metadata from a report page; None for PDF pages."""
    if is_pdf_page(soup_obj):
        return None

    def temp_get_string(tag):
        try:
            text = remove_extra_whitespace(get_full_string(tag))
        except AttributeError:
            text = None
        return text

    d = {}
    d["report_name"] = temp_get_string(
        clean_find(soup_obj, ["h1", {"class": "mb-2"}]))
    d["filer_name"] = temp_get_string(
        clean_find(soup_obj, ["h2", {"class": "filedReport"}]))
    filing_datetime_string = temp_get_string(
        clean_find(soup_obj, ["p", {"class": "muted font-weight-bold"}]))
    candidacy_string = temp_get_string(
        clean_find(soup_obj, ["p", {"class": "muted"}]))
    d["date_filed"] = filing_datetime_string.split()[1]
    d["time_filed"] = " ".join(filing_datetime_string.split()[-2:])
    # Positional parsing of the candidacy sentence; pretty hacky, I admit.
    d["state"] = candidacy_string.split()[3]
    d["candidacy_date"] = candidacy_string.split()[-1]
    return d
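
# Usage sketch: assumes `soup` was parsed with BeautifulSoup from a non-PDF
# report page; get_report_metadata returns None when is_pdf_page(soup) is True.
#
#     meta = get_report_metadata(soup)
#     if meta is not None:
#         print(meta["filer_name"], meta["date_filed"], meta["time_filed"])
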
def ommitted_assets(soup_obj):
    """Return True if the "filing omitted assets" checkbox is checked."""
    checkbox = clean_find(
        soup_obj, ["input", {"name": "filing_omitted_assets"}])
    try:
        return checkbox["checked"] == "checked"
    except KeyError:
        return False
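
# Usage sketch: the KeyError fallback makes this safe to call on pages that
# lack the "filing_omitted_assets" input (it simply returns False).
#
#     if ommitted_assets(soup):
#         warn("Filer checked the omitted-assets box; tables may be incomplete")
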
def get_series_id(soup_obj):
    """Extract the series id; requires the "category" section as soup_obj."""

    def is_java_link(tag):
        try:
            return "javascript:showCat" in tag["href"]
        except KeyError:
            return False

    javascript_link = clean_find(soup_obj, is_java_link)
    if javascript_link:
        # The id is the first argument of the javascript:showCat(...) call.
        series_id = (javascript_link.get("href")
                     .split("javascript:showCat(")[-1]
                     .split(",")[0])
        return series_id
    else:
        warn("Manga does not have an id")
        return None
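
# Usage sketch (hypothetical names): `category_soup` stands in for the
# "category" section the docstring asks for; the returned id is the string
# before the first comma inside showCat(...), or None when no such link exists.
#
#     series_id = get_series_id(category_soup)
#     if series_id is not None:
#         fetch_category(series_id)  # fetch_category is illustrative only
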
def is_pdf_page(soup_obj):
    """Detect PDF-style pages by the presence of the "myCarousel" div."""
    return clean_find(soup_obj, ["div", {"id": "myCarousel"}]) is not None
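
# Usage sketch: a cheap page-type guard before attempting table extraction.
#
#     if is_pdf_page(soup):
#         warn("PDF-style page; no parseable tables")
#     else:
#         meta = get_report_metadata(soup)
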