def get_table_header(self, soup_obj):
    """Return the cleaned column titles from a table's first header row.

    Collects the ``th`` cells under ``soup_obj.thead.tr``, converts each to
    text with ``get_full_string``, strips surrounding whitespace, and drops
    cells that end up empty or equal to ``"#"``. Unless the first remaining
    title is ``"Document Title"``, ``"Row Number"`` is prepended.

    Args:
        soup_obj: Parsed table element exposing ``.thead.tr``.

    Returns:
        list: The header titles as stripped strings.
    """
    header_cells = clean_find_all(soup_obj.thead.tr, ["th"])
    # Strip each cell exactly once, then filter out blanks and the "#" marker.
    stripped = (get_full_string(cell).strip() for cell in header_cells)
    titles = [text for text in stripped if text not in ("", "#")]
    # Guard the empty case: indexing titles[0] on a header made up solely of
    # blank/"#" cells used to raise IndexError. Such a header now yields
    # ["Row Number"] (presumably standing in for the dropped "#" column --
    # confirm against callers).
    if not titles or titles[0] != "Document Title":
        titles = ["Row Number"] + titles
    return titles
def get_report_tables(soup_obj, *args, **kwargs):
    """Build a ``DataTable`` for every report section, keyed by its title.

    Looks up the ``section`` elements with class ``card mb-2`` via
    ``clean_find_all`` and wraps each one in a ``DataTable``; any extra
    positional and keyword arguments are forwarded to ``DataTable``.

    Args:
        soup_obj: Parsed document to search.
        *args: Forwarded to ``DataTable``.
        **kwargs: Forwarded to ``DataTable``.

    Returns:
        dict: Maps each ``DataTable``'s ``title`` attribute to the
        ``DataTable`` itself. Later tables overwrite earlier ones that
        share a title.

    Raises:
        Exception: If the number of matching sections is not exactly 12.
    """
    sections = clean_find_all(soup_obj, ['section', {'class': 'card mb-2'}])
    found = len(sections)
    if found != 12:
        raise Exception(
            "{} table headers found instead of 12!".format(found))
    # Lazily construct one DataTable per section, in document order.
    data_tables = (DataTable(section, *args, **kwargs) for section in sections)
    return {data_table.title: data_table for data_table in data_tables}
def get_rows_in_table(table_obj):
    """Return the ``tr`` elements with class ``nowrap`` inside ``table_obj``.

    Args:
        table_obj: Parsed element to search with ``clean_find_all``.

    Returns:
        The collection returned by ``clean_find_all`` when it holds at least
        one row, otherwise ``None``.
    """
    matched_rows = clean_find_all(table_obj, ['tr', {'class': 'nowrap'}])
    # Callers get None rather than an empty collection when nothing matches.
    return matched_rows if len(matched_rows) > 0 else None
def extract_columns(row_obj, colval="td"):
    """Call ``extract()`` on every ``colval`` cell of a row; return the results.

    Args:
        row_obj: Parsed row element to search with ``clean_find_all``.
        colval: Tag name of the cells to collect (``"td"`` by default).

    Returns:
        list: The value returned by ``extract()`` for each matching cell,
        in the order the cells were found.

    Raises:
        AssertionError: If no matching cell is found (only while assertions
        are enabled).
    """
    cells = clean_find_all(row_obj, [colval])
    assert len(cells) > 0
    detached = []
    for cell in cells:
        # NOTE(review): if these are bs4 tags, extract() also removes the
        # cell from its parent tree -- confirm callers expect row_obj to be
        # mutated.
        detached.append(cell.extract())
    return detached
def get_middleware_token(text):
    """Return the ``csrfmiddlewaretoken`` value found in an HTML document.

    Parses ``text`` and looks up the ``input`` element whose ``name``
    attribute is ``csrfmiddlewaretoken``.

    Args:
        text: HTML markup to parse.

    Returns:
        The ``value`` attribute of the single matching ``input`` element.

    Raises:
        AssertionError: If the number of matching inputs is not exactly one
        (only while assertions are enabled).
        KeyError: If the matching input has no ``value`` attribute.
    """
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (results can differ between machines) and emits a warning.
    # "html.parser" ships with the standard library, so it is always present.
    soup_obj = BeautifulSoup(text, "html.parser")
    token_inputs = clean_find_all(
        soup_obj, ['input', {'name': 'csrfmiddlewaretoken'}])
    assert len(token_inputs) == 1
    return token_inputs[0]["value"]