Beispiel #1
0
def parse_filing(filepath):
    """
    Parse an html filing into lxml elements plus basic file metadata.

    The file is read once as bytes. ``chardet`` guesses the encoding and the
    bytes are decoded with it; when the guess is unusable (nothing detected,
    unknown codec, or bytes that do not decode) the undecoded bytes are
    handed to lxml instead and ``ENCODING`` is reported as ``""``.

    :param filepath: path of the html file
    :return: dictionary with the keys ``FILEPATH``, ``NUMBER_OF_ELEMENTS``
        (size of ``lxml_dict``), ``FILE_SIZE``, ``FILE_SIZE_BYTES``,
        ``lxml_dict`` (index -> element matched by ``//*/body/*``),
        ``div_check`` (index -> element matched by ``//*/div/*`` in a
        re-parse of the BeautifulSoup-prettified document) and ``ENCODING``
    """
    with open(filepath, "rb") as f:
        raw_bytes = f.read()

    try:
        charenc = chardet.detect(raw_bytes)['encoding']
        raw_text = raw_bytes.decode(charenc)
    except (LookupError, TypeError, ValueError):
        # TypeError: chardet reported no encoding (None);
        # LookupError: codec name unknown to Python;
        # ValueError: covers UnicodeDecodeError on a wrong guess.
        charenc = ""
        raw_text = raw_bytes

    lxml_html = lxml.html.fromstring(raw_text)
    root = lxml_html.getroottree()
    soup = BeautifulSoup(lxml.html.tostring(root), 'lxml')

    lxml_dict = dict(enumerate(root.xpath("//*/body/*")))

    # The div children come from a second parse of the prettified markup,
    # not from ``root`` itself.
    prettified_root = lxml.html.fromstring(soup.prettify())
    div_check = dict(enumerate(prettified_root.xpath("//*/div/*")))

    return {
        'FILEPATH': filepath,
        'NUMBER_OF_ELEMENTS': len(lxml_dict),
        'FILE_SIZE': file_size(filepath),
        'FILE_SIZE_BYTES': os.stat(filepath).st_size,
        'lxml_dict': lxml_dict,
        'div_check': div_check,
        'ENCODING': charenc,
    }
    def _load(self, filing_filepath=None, lxml_root=True, file_stats=True):
        """
        Read the filing from disk and optionally parse it and record stats.

        Returns immediately when ``self.is_loaded`` is already true. The
        encoding is guessed with ``chardet`` and stored in ``self.charenc``;
        the file is then read as text into ``self.filing_text``. If the text
        read fails, ``self.filing_text`` holds the raw bytes instead and
        ``self.is_loaded`` is left unchanged.

        :param filing_filepath: file to load; falls back to
            ``self.filing_filepath`` when empty
        :param lxml_root: when true, parse ``self.filing_text`` with lxml and
            store the tree in ``self.lxml_root`` (sets ``self.is_lxml_root``)
        :param file_stats: when true, set ``self.FILE_SIZE``,
            ``self.FILE_SIZE_BYTES`` and ``self.ENCODING``
        :return: None
        """
        if self.is_loaded:
            return

        if not filing_filepath:
            filing_filepath = self.filing_filepath

        with open(filing_filepath, "rb") as f:
            raw_bytes = f.read()

        try:
            detected = chardet.detect(raw_bytes)

            if detected:
                self.charenc = detected['encoding']

            # Re-read in text mode (rather than decoding ``raw_bytes``) so
            # newline translation and the locale default used for a ``None``
            # encoding behave as they always have.
            with io.open(filing_filepath, "r", encoding=self.charenc) as f:
                self.filing_text = f.read()

            self.is_loaded = True
            logger.info("Filing Loaded")

        except (LookupError, TypeError, ValueError):
            # Unknown codec or undecodable bytes: keep the raw bytes so lxml
            # can still attempt its own decoding below.
            logger.warning("Could not decode %s as text; keeping raw bytes",
                           filing_filepath)
            self.filing_text = raw_bytes

        if lxml_root:
            lxml_html = lxml.html.fromstring(self.filing_text)
            self.lxml_root = lxml_html.getroottree()
            self.is_lxml_root = True
            logger.info("Filing Lxml")

        if file_stats:
            # Stats describe the file that was actually loaded, which is the
            # explicit argument when one was given.
            self.FILE_SIZE = file_size(filing_filepath)
            self.FILE_SIZE_BYTES = os.stat(filing_filepath).st_size
            self.ENCODING = self.charenc
    def _process_filing(self, raw_text, save_output=False):
        """
        Split a complete submission into its documents and index them.

        Every ``<DOCUMENT>...</DOCUMENT>`` section of ``raw_text`` is parsed
        for its FILENAME, TYPE, SEQUENCE and DESCRIPTION tags (a tag that is
        missing or empty becomes ``""``) and for the payload of its first
        ``<TEXT>`` section, with ``<XBRL>``/``<XML>`` wrapper tags removed.
        Payloads starting with "begin" are treated as uuencoded; all others
        get a descriptive file name built from sequence, type, description
        and filename.

        The resulting table is stored in ``self.df_sec_filing_contents`` and
        ``self.is_processed`` is set to True.

        :param raw_text: text of the complete submission filing
        :param save_output: when true, write every document (uudecoding
            where needed) and a ``*_FILING_CONTENTS.csv`` index into
            ``self.filing_folder``, and record file sizes
        :return: None
        :raises IndexError: if a document contains no ``<TEXT>`` section
        :raises ValueError: if a non-uuencoded document has no integer
            SEQUENCE value
        """

        elements_list = [('FILENAME', './/filename'), ('TYPE', './/type'),
                         ('SEQUENCE', './/sequence'),
                         ('DESCRIPTION', './/description')]
        wrapper_tags = ("<XBRL>", "</XBRL>", "<XML>", "</XML>")

        xbrl_doc = re.compile(r'<DOCUMENT>(.*?)</DOCUMENT>', re.DOTALL)
        xbrl_text = re.compile(r'<(TEXT|text)>(.*?)</(TEXT|text)>',
                               re.MULTILINE | re.DOTALL)

        filing_documents = {}

        for i, document in enumerate(xbrl_doc.findall(raw_text), start=1):
            uue_filepath = None
            filing_document = {}

            root = lxml.html.fromstring(document).getroottree()

            for (element, element_path) in elements_list:
                try:
                    filing_document[element] = root.xpath(
                        element_path)[0].text.strip()
                except (IndexError, AttributeError):
                    # Tag absent (no match) or present without text.
                    filing_document[element] = ""

            # Group 2 of the first match is the text between the TEXT tags.
            document_text = xbrl_text.findall(document)[0][1]
            for tag in wrapper_tags:
                document_text = document_text.replace(tag, "")
            document_text = document_text.strip()

            if document_text.lower().startswith(
                    "begin") or document.lower().startswith("begin"):

                # Build both paths from the bare file name. Deriving the
                # output path from the already-joined .uue path would repeat
                # a relative filing_folder and strip ".uue" anywhere in it.
                output_filepath = os.path.join(self.filing_folder,
                                               filing_document['FILENAME'])
                uue_filepath = output_filepath + ".uue"
                output_filename = os.path.basename(output_filepath)

                if save_output:
                    with open(uue_filepath, 'w', encoding=self.charenc) as f:
                        f.write(document_text)

                    uudecode(uue_filepath, out_file=output_filepath)

            else:
                doc_num = f"{int(filing_document['SEQUENCE'])}".zfill(4)

                output_filename = (
                    f"{doc_num}-({filing_document['TYPE']}) "
                    f"{filing_document['DESCRIPTION']} "
                    f"{filing_document['FILENAME']}")
                output_filename = output_filename.replace(" ", "_").replace(
                    ":", "").replace("__", "_")

                output_filename = format_filename(output_filename)
                output_filepath = os.path.join(self.filing_folder,
                                               output_filename)

                if save_output:
                    with open(output_filepath, 'w',
                              encoding=self.charenc) as f:
                        f.write(document_text)

            # NOTE(review): output_filepath already includes filing_folder,
            # so this value is not relative to it -- kept as is in case
            # consumers of the table rely on it; confirm before changing.
            filing_document['RELATIVE_FILEPATH'] = os.path.join(
                os.path.basename(self.filing_folder), output_filepath)
            filing_document['DESCRIPTIVE_FILEPATH'] = output_filename

            if save_output:
                filing_document['FILE_SIZE'] = file_size(output_filepath)
                filing_document['FILE_SIZE_BYTES'] = os.stat(
                    output_filepath).st_size

            filing_documents[i] = filing_document

            # The intermediate .uue file is only needed for decoding.
            if uue_filepath and os.path.exists(uue_filepath):
                os.remove(uue_filepath)

        df_sec_filing_contents = pd.DataFrame.from_dict(filing_documents,
                                                        orient='index')

        if save_output:
            df_sec_filing_contents.to_csv(
                os.path.join(
                    self.filing_folder,
                    f"{os.path.basename(self.filing_folder)}_FILING_CONTENTS.csv"
                ))

        logger.info(df_sec_filing_contents)
        self.is_processed = True
        self.df_sec_filing_contents = df_sec_filing_contents
Beispiel #4
0
def complete_submission_filing(filepath, output_directory=None):
    """
    Extract every document of a complete submission filing into a folder.

    Nothing is extracted when ``output_directory`` already exists. Otherwise
    the folder is created, the filing header is written to
    ``<folder name>_FILING_HEADER.csv``, every ``<DOCUMENT>`` section is
    written to its own file (payloads starting with "begin" are uudecoded)
    and an index of the documents is written to
    ``<folder name>_FILING_CONTENTS.csv``.

    The file is read as text using the encoding guessed by ``chardet``. If
    that fails it is read as latin-1, which accepts any byte sequence, so the
    str-based regular expressions below always receive text.

    :param filepath: path of the complete submission text file
    :param output_directory: folder to create and extract into; must be
        given even though it defaults to None
    :return: DataFrame with one row per document, or None when
        ``output_directory`` already exists
    :raises TypeError: if ``output_directory`` is None
    :raises IndexError: if a document contains no ``<TEXT>`` section
    :raises ValueError: if a non-uuencoded document has no integer SEQUENCE
        value
    """
    if output_directory is None:
        raise TypeError("output_directory is required")

    elements_list = [('FILENAME', './/filename'), ('TYPE', './/type'),
                     ('SEQUENCE', './/sequence'), ('DESCRIPTION', './/description')]
    wrapper_tags = ("<XBRL>", "</XBRL>", "<XML>", "</XML>")

    if os.path.exists(output_directory):
        logger.info(f"Folder Already Exists {output_directory}")
        return

    os.makedirs(output_directory)

    logger.info("extracting documents to {}".format(output_directory))

    xbrl_doc = re.compile(r'<DOCUMENT>(.*?)</DOCUMENT>', re.DOTALL)
    xbrl_text = re.compile(r'<(TEXT|text)>(.*?)</(TEXT|text)>', re.MULTILINE | re.DOTALL)

    with open(filepath, "rb") as f:
        charenc = chardet.detect(f.read())['encoding']

    try:
        with io.open(filepath, "r", encoding=charenc) as f:
            raw_text = f.read()
    except (LookupError, TypeError, ValueError):
        # Unknown codec or undecodable bytes. latin-1 maps every byte to one
        # code point, so reading cannot fail and writing the documents back
        # with the same codec reproduces the original bytes.
        charenc = "latin-1"
        with io.open(filepath, "r", encoding=charenc) as f:
            raw_text = f.read()

    folder_name = os.path.basename(output_directory)

    sec_filing_header = parse_filing_header(raw_text)
    sec_filing_header.to_csv(
        os.path.join(output_directory, f"{folder_name}_FILING_HEADER.csv"))

    filing_documents = {}

    for i, document in enumerate(xbrl_doc.findall(raw_text), start=1):
        uue_filepath = None
        filing_document = {}

        root = lxml.html.fromstring(document).getroottree()

        for (element, element_path) in elements_list:
            try:
                filing_document[element] = root.xpath(element_path)[0].text.strip()
            except (IndexError, AttributeError):
                # Tag absent (no match) or present without text.
                filing_document[element] = ""

        # Group 2 of the first match is the text between the TEXT tags.
        document_text = xbrl_text.findall(document)[0][1]
        for tag in wrapper_tags:
            document_text = document_text.replace(tag, "")
        document_text = document_text.strip()

        if document_text.lower().startswith("begin") or document.lower().startswith("begin"):

            # Build both paths from the bare file name. Deriving the output
            # path from the already-joined .uue path would repeat a relative
            # output_directory and strip ".uue" anywhere in it.
            output_filepath = os.path.join(output_directory, filing_document['FILENAME'])
            uue_filepath = output_filepath + ".uue"
            output_filename = os.path.basename(output_filepath)

            with open(uue_filepath, 'w', encoding=charenc) as f:
                f.write(document_text)

            uudecode(uue_filepath, out_file=output_filepath)

        else:
            doc_num = f"{int(filing_document['SEQUENCE'])}".zfill(4)

            output_filename = (
                f"{doc_num}-({filing_document['TYPE']}) "
                f"{filing_document['DESCRIPTION']} "
                f"{filing_document['FILENAME']}")
            output_filename = output_filename.replace(" ", "_").replace(":", "").replace("__", "_")

            output_filename = format_filename(output_filename)
            output_filepath = os.path.join(output_directory, output_filename)

            with open(output_filepath, 'w', encoding=charenc) as f:
                f.write(document_text)

        # NOTE(review): output_filepath already includes output_directory, so
        # this value is not relative to it -- kept as is in case consumers of
        # the table rely on it; confirm before changing.
        filing_document['RELATIVE_FILEPATH'] = os.path.join(folder_name, output_filepath)
        filing_document['DESCRIPTIVE_FILEPATH'] = output_filename

        filing_document['FILE_SIZE'] = file_size(output_filepath)
        filing_document['FILE_SIZE_BYTES'] = os.stat(output_filepath).st_size

        filing_documents[i] = filing_document

        # The intermediate .uue file is only needed for decoding.
        if uue_filepath:
            os.remove(uue_filepath)

    df_sec_filing_contents = pd.DataFrame.from_dict(filing_documents, orient='index')
    df_sec_filing_contents.to_csv(
        os.path.join(output_directory, f"{folder_name}_FILING_CONTENTS.csv"))
    logger.info(df_sec_filing_contents)

    return df_sec_filing_contents