def parse_filing(filepath):
    """
    Parse an html file and collect its elements plus basic file metadata.

    The file is read once as bytes. The encoding reported by chardet is used
    to decode it; when decoding is not possible the raw bytes are handed to
    lxml instead and ``ENCODING`` is reported as an empty string.

    :param filepath: path of the html file to parse
    :return: dictionary with the keys ``FILEPATH``, ``NUMBER_OF_ELEMENTS``,
        ``FILE_SIZE``, ``FILE_SIZE_BYTES``, ``lxml_dict`` (index -> element
        matched by ``//*/body/*``), ``div_check`` (index -> element matched
        by ``//*/div/*`` in the prettified markup) and ``ENCODING``
    """
    # Read once inside a context manager so the handle is always closed.
    with open(filepath, "rb") as f:
        raw_bytes = f.read()

    try:
        charenc = chardet.detect(raw_bytes)['encoding']
        raw_text = raw_bytes.decode(charenc)
    except (TypeError, LookupError, UnicodeDecodeError):
        # TypeError: chardet reported no encoding (None);
        # LookupError: the reported codec is unknown to Python;
        # UnicodeDecodeError: the guess was wrong for some bytes.
        # In every case fall back to the undecoded bytes.
        charenc = ""
        raw_text = raw_bytes

    lxml_html = lxml.html.fromstring(raw_text)
    root = lxml_html.getroottree()
    soup = BeautifulSoup(lxml.html.tostring(root), 'lxml')

    # Direct children of <body>, keyed by their position.
    lxml_dict = dict(enumerate(root.xpath("//*/body/*")))

    # Children of every <div>, taken from the prettified re-serialisation.
    prettified_root = lxml.html.fromstring(soup.prettify())
    div_check = dict(enumerate(prettified_root.xpath("//*/div/*")))

    file_metadata = {}
    file_metadata['FILEPATH'] = filepath
    file_metadata['NUMBER_OF_ELEMENTS'] = len(lxml_dict)
    file_metadata['FILE_SIZE'] = file_size(filepath)
    file_metadata['FILE_SIZE_BYTES'] = os.stat(filepath).st_size
    file_metadata['lxml_dict'] = lxml_dict
    file_metadata['div_check'] = div_check
    file_metadata['ENCODING'] = charenc
    return file_metadata
def _load(self, filing_filepath=None, lxml_root=True, file_stats=True):
    """
    Load the filing text from disk and optionally parse it and record stats.

    Does nothing when ``self.is_loaded`` is already true.

    :param filing_filepath: path of the filing to load; falls back to
        ``self.filing_filepath`` when empty
    :param lxml_root: when true, parse the text with lxml and store the
        element tree in ``self.lxml_root`` (sets ``self.is_lxml_root``)
    :param file_stats: when true, set ``self.FILE_SIZE``,
        ``self.FILE_SIZE_BYTES`` and ``self.ENCODING``
    :return: None
    """
    if self.is_loaded:
        return

    if not filing_filepath:
        filing_filepath = self.filing_filepath

    # Read the bytes once, closing the handle; they feed the encoding
    # detection and double as the fallback content.
    with open(filing_filepath, "rb") as f:
        raw_bytes = f.read()

    try:
        result = chardet.detect(raw_bytes)
        if result:
            self.charenc = result['encoding']

        # Re-open in text mode so newline translation stays as before.
        with io.open(filing_filepath, "r", encoding=self.charenc) as f:
            self.filing_text = f.read()

        self.is_loaded = True
        logger.info("Filing Loaded")
    except (LookupError, UnicodeError):
        # Unknown codec or a wrong guess: keep the undecoded bytes.
        # NOTE(review): is_loaded intentionally stays unchanged on this
        # path, matching the previous behaviour - confirm this is wanted.
        logger.warning("Could not decode %s; keeping raw bytes",
                       filing_filepath)
        self.filing_text = raw_bytes

    if lxml_root:
        lxml_html = lxml.html.fromstring(self.filing_text)
        self.lxml_root = lxml_html.getroottree()
        self.is_lxml_root = True
        logger.info("Filing Lxml")

    if file_stats:
        # Report stats for the file that was actually loaded, which is not
        # necessarily self.filing_filepath when a path was passed in.
        self.FILE_SIZE = file_size(filing_filepath)
        self.FILE_SIZE_BYTES = os.stat(filing_filepath).st_size
        self.ENCODING = self.charenc
def _process_filing(self, raw_text, save_output=False):
    """
    Split a filing into its <DOCUMENT> sections and tabulate them.

    For every ``<DOCUMENT>...</DOCUMENT>`` section the FILENAME, TYPE,
    SEQUENCE and DESCRIPTION fields are extracted (empty string when a field
    is absent) and an output file name is derived. The resulting table is
    stored in ``self.df_sec_filing_contents`` and ``self.is_processed`` is
    set to True.

    :param raw_text: full text of the filing (str)
    :param save_output: when true, write each document and a
        ``*_FILING_CONTENTS.csv`` summary into ``self.filing_folder`` and add
        ``FILE_SIZE`` / ``FILE_SIZE_BYTES`` columns
    :return: None
    """
    elements_list = [('FILENAME', './/filename'), ('TYPE', './/type'),
                     ('SEQUENCE', './/sequence'),
                     ('DESCRIPTION', './/description')]

    xbrl_doc = re.compile(r'<DOCUMENT>(.*?)</DOCUMENT>', re.DOTALL)
    xbrl_text = re.compile(r'<(TEXT|text)>(.*?)</(TEXT|text)>',
                           re.MULTILINE | re.DOTALL)

    documents = xbrl_doc.findall(raw_text)

    filing_documents = {}

    for i, document in enumerate(documents, start=1):
        uue_filepath = None
        filing_document = {}

        lxml_html = lxml.html.fromstring(document)
        root = lxml_html.getroottree()

        for (element, element_path) in elements_list:
            try:
                filing_document[element] = root.xpath(
                    element_path)[0].text.strip()
            except (IndexError, AttributeError):
                # IndexError: tag not present; AttributeError: tag is empty.
                filing_document[element] = ""

        text_matches = xbrl_text.findall(document)
        if text_matches:
            document_text = text_matches[0][1]
        else:
            # A document without a <TEXT> section used to raise IndexError
            # and abort the whole filing; treat it as an empty body instead.
            logger.warning("Document %s has no <TEXT> section", i)
            document_text = ""

        document_text = document_text.replace("<XBRL>", "").replace(
            "</XBRL>", "").strip()
        document_text = document_text.replace("<XML>", "").replace(
            "</XML>", "").strip()

        # Only the first characters matter, so avoid lower-casing the
        # complete (possibly very large) strings.
        starts_with_begin = (
            document_text[:5].lower().startswith("begin")
            or document[:5].lower().startswith("begin"))

        if starts_with_begin:
            # Body is handed to uudecode via a temporary .uue file.
            uue_filepath = os.path.join(
                self.filing_folder, filing_document['FILENAME'] + ".uue")
            # Join the folder exactly once; joining it onto uue_filepath
            # duplicated the folder for relative filing folders.
            output_filepath = os.path.join(
                self.filing_folder, filing_document['FILENAME'])
            output_filename = os.path.basename(output_filepath)

            if save_output:
                with open(uue_filepath, 'w', encoding=self.charenc) as f:
                    f.write(document_text)

                uudecode(uue_filepath, out_file=output_filepath)
        else:
            try:
                sequence_number = int(filing_document['SEQUENCE'])
            except ValueError:
                # SEQUENCE missing or not numeric: use the document's
                # position in the filing instead of failing.
                sequence_number = i
            doc_num = str(sequence_number).zfill(4)

            output_filename = (
                f"{doc_num}-({filing_document['TYPE']}) "
                f"{filing_document['DESCRIPTION']} "
                f"{filing_document['FILENAME']}")
            output_filename = output_filename.replace(" ", "_").replace(
                ":", "").replace("__", "_")
            output_filename = format_filename(output_filename)

            output_filepath = os.path.join(self.filing_folder,
                                           output_filename)

            if save_output:
                with open(output_filepath, 'w', encoding=self.charenc) as f:
                    f.write(document_text)

        # NOTE(review): output_filepath already contains the filing folder,
        # so for an absolute folder os.path.join returns it unchanged. Left
        # as-is because consumers of the table may rely on the value.
        filing_document['RELATIVE_FILEPATH'] = os.path.join(
            os.path.basename(self.filing_folder), output_filepath)
        filing_document['DESCRIPTIVE_FILEPATH'] = output_filename

        if save_output:
            filing_document['FILE_SIZE'] = file_size(output_filepath)
            filing_document['FILE_SIZE_BYTES'] = os.stat(
                output_filepath).st_size

        filing_documents[i] = filing_document

        # Remove the temporary .uue file once it has been decoded.
        if uue_filepath and os.path.exists(uue_filepath):
            os.remove(uue_filepath)

    df_sec_filing_contents = pd.DataFrame.from_dict(filing_documents,
                                                    orient='index')

    if save_output:
        df_sec_filing_contents.to_csv(
            os.path.join(
                self.filing_folder,
                f"{os.path.basename(self.filing_folder)}_FILING_CONTENTS.csv"
            ))

    logger.info(df_sec_filing_contents)

    self.is_processed = True
    self.df_sec_filing_contents = df_sec_filing_contents
def complete_submission_filing(filepath, output_directory=None):
    """
    Extract every <DOCUMENT> of a complete submission file into a folder.

    Writes ``*_FILING_HEADER.csv``, one file per document and
    ``*_FILING_CONTENTS.csv`` into ``output_directory``. If the directory
    already exists nothing is extracted.

    :param filepath: path of the complete submission text file
    :param output_directory: folder to create and extract into (required
        despite the ``None`` default)
    :return: DataFrame with one row per document, or None when
        ``output_directory`` already exists
    :raises TypeError: if ``output_directory`` is None
    """
    elements_list = [('FILENAME', './/filename'), ('TYPE', './/type'),
                     ('SEQUENCE', './/sequence'),
                     ('DESCRIPTION', './/description')]

    if output_directory is None:
        # os.path.exists(None) raised an opaque TypeError before; keep the
        # exception type but say what is wrong.
        raise TypeError("output_directory is required")

    if os.path.exists(output_directory):
        logger.info(f"Folder Already Exists {output_directory}")
        return
    os.makedirs(output_directory)

    logger.info("extracting documents to {}".format(output_directory))

    xbrl_doc = re.compile(r'<DOCUMENT>(.*?)</DOCUMENT>', re.DOTALL)
    xbrl_text = re.compile(r'<(TEXT|text)>(.*?)</(TEXT|text)>',
                           re.MULTILINE | re.DOTALL)

    with open(filepath, "rb") as f:
        charenc = chardet.detect(f.read())['encoding']

    try:
        with open(filepath, "r", encoding=charenc) as f:
            raw_text = f.read()
    except (LookupError, UnicodeError):
        # The old fallback kept raw bytes, which the str regexes below
        # cannot search. Decode leniently so extraction can proceed.
        logger.warning("Could not decode %s as %s; using utf-8 with "
                       "replacement characters", filepath, charenc)
        charenc = "utf-8"
        with open(filepath, "r", encoding=charenc, errors="replace") as f:
            raw_text = f.read()

    sec_filing_header = parse_filing_header(raw_text)

    header_filepath = os.path.join(
        output_directory,
        f"{os.path.basename(output_directory)}_FILING_HEADER.csv")
    sec_filing_header.to_csv(header_filepath)

    documents = xbrl_doc.findall(raw_text)

    filing_documents = {}

    for i, document in enumerate(documents, start=1):
        uue_filepath = None
        filing_document = {}

        lxml_html = lxml.html.fromstring(document)
        root = lxml_html.getroottree()

        for (element, element_path) in elements_list:
            try:
                filing_document[element] = root.xpath(
                    element_path)[0].text.strip()
            except (IndexError, AttributeError):
                # IndexError: tag not present; AttributeError: tag is empty.
                filing_document[element] = ""

        text_matches = xbrl_text.findall(document)
        if text_matches:
            document_text = text_matches[0][1]
        else:
            # A document without a <TEXT> section used to raise IndexError
            # and abort the extraction; treat it as an empty body instead.
            logger.warning("Document %s has no <TEXT> section", i)
            document_text = ""

        document_text = document_text.replace("<XBRL>", "").replace(
            "</XBRL>", "").strip()
        document_text = document_text.replace("<XML>", "").replace(
            "</XML>", "").strip()

        # Only the first characters matter, so avoid lower-casing the
        # complete (possibly very large) strings.
        starts_with_begin = (
            document_text[:5].lower().startswith("begin")
            or document[:5].lower().startswith("begin"))

        if starts_with_begin:
            # Body is handed to uudecode via a temporary .uue file.
            uue_filepath = os.path.join(
                output_directory, filing_document['FILENAME'] + ".uue")
            # Join the folder exactly once; joining it onto uue_filepath
            # duplicated the folder for relative output directories.
            output_filepath = os.path.join(
                output_directory, filing_document['FILENAME'])
            output_filename = os.path.basename(output_filepath)

            with open(uue_filepath, 'w', encoding=charenc) as f:
                f.write(document_text)

            uudecode(uue_filepath, out_file=output_filepath)
        else:
            try:
                sequence_number = int(filing_document['SEQUENCE'])
            except ValueError:
                # SEQUENCE missing or not numeric: use the document's
                # position in the filing instead of failing.
                sequence_number = i
            doc_num = str(sequence_number).zfill(4)

            output_filename = (
                f"{doc_num}-({filing_document['TYPE']}) "
                f"{filing_document['DESCRIPTION']} "
                f"{filing_document['FILENAME']}")
            output_filename = output_filename.replace(" ", "_").replace(
                ":", "").replace("__", "_")
            output_filename = format_filename(output_filename)

            output_filepath = os.path.join(output_directory, output_filename)

            with open(output_filepath, 'w', encoding=charenc) as f:
                f.write(document_text)

        # NOTE(review): output_filepath already contains output_directory,
        # so for an absolute directory os.path.join returns it unchanged.
        # Left as-is because consumers of the table may rely on the value.
        filing_document['RELATIVE_FILEPATH'] = os.path.join(
            os.path.basename(output_directory), output_filepath)
        filing_document['DESCRIPTIVE_FILEPATH'] = output_filename
        filing_document['FILE_SIZE'] = file_size(output_filepath)
        filing_document['FILE_SIZE_BYTES'] = os.stat(output_filepath).st_size

        filing_documents[i] = filing_document

        # Remove the temporary .uue file once it has been decoded.
        if uue_filepath:
            os.remove(uue_filepath)

    df_sec_filing_contents = pd.DataFrame.from_dict(filing_documents,
                                                    orient='index')
    df_sec_filing_contents.to_csv(
        os.path.join(
            output_directory,
            f"{os.path.basename(output_directory)}_FILING_CONTENTS.csv"))
    logger.info(df_sec_filing_contents)
    return df_sec_filing_contents