Ejemplo n.º 1
0
    def test_retrieve_list_of_tags_value(self):
        '''
        Values test case for the retrieve_list_of_tags function.

        Starting from one set of baseline arguments, each case replaces a
        single argument and expects the call to raise a ValueError.

        :return: None
        '''
        xbrl_extraction = XbrlExtraction()

        # baseline arguments shared by every case
        frame = self.input_data()
        base_column = 'name'
        base_folder = '/shares/xbrl_parsed_data'
        base_month = 'January'
        base_year = '2010'

        invalid_cases = (
            # column name that should not be present in the dataframe
            ('potatoe', base_folder, base_month, base_year),
            # invalid directory
            (base_column, 'not_real_directory', base_month, base_year),
            # invalid month
            (base_column, base_folder, 'Jan', base_year),
            # year string that is not a whole number
            (base_column, base_folder, base_month, '2k10'),
        )

        # no subTest on purpose: the first failing case stops the test
        for case_column, case_folder, case_month, case_year in invalid_cases:
            with self.assertRaises(ValueError):
                xbrl_extraction.retrieve_list_of_tags(frame, case_column,
                                                      case_folder, case_month,
                                                      case_year)
Ejemplo n.º 2
0
    def test_get_tag_counts_pos(self, mock_to_csv, mock_path):
        """
        Positive test case for the get_tag_counts function.

        Arguments:
            mock_to_csv: mock whose `called` flag is checked after the call
                         (presumably injected by a patch decorator that is
                         outside this view)
            mock_path:   mock whose exists() is forced to return True
                         (presumably also injected by a patch decorator)
        """
        xbrl_extraction = XbrlExtraction()

        # make the output folder look like it exists
        mock_path.exists.return_value = True
        frame = self.input_data()

        call_arguments = (frame, "name", "output_folder", "January", "2010")
        xbrl_extraction.get_tag_counts(*call_arguments)

        csv_was_written = mock_to_csv.called
        self.assertTrue(csv_was_written, "CSV file not saved")
    def test_get_filepaths_types(self):
        """
        Types test case for the get_filepaths function.

        An int passed in place of the directory must raise a TypeError.
        """
        xbrl_extraction = XbrlExtraction()
        int_instead_of_directory = 1

        self.assertRaises(TypeError, xbrl_extraction.get_filepaths,
                          int_instead_of_directory)
Ejemplo n.º 4
0
    def test_output_xbrl_month_pos(self, mock_to_csv, mock_path):
        """
        Positive test case for the output_xbrl_month function.

        Checks that the csv writer is called and that the output path it
        receives is built from the folder, year and month passed in.

        Arguments:
            mock_to_csv: mock standing in for the csv writer (presumably
                         injected by a patch decorator outside this view)
            mock_path:   injected mock; left unconfigured by this test
        """
        extractor = XbrlExtraction()
        df = self.data_input()

        extractor.output_xbrl_month(df, "output_folder", "January", "2010", "csv")

        self.assertTrue(mock_to_csv.called, "CSV file not saved")

        # The previous check was assertTrue(mock_to_csv, "<path>"), which can
        # never fail: a mock is always truthy and the path was only used as the
        # failure message. Inspect the recorded call instead.
        # NOTE(review): the path format below is inferred from the original
        # message string (with the real year/month substituted) - confirm it
        # matches output_xbrl_month.
        expected_path = ("output_folder" + "/" + "2010" + "-" + "January"
                         + "_xbrl_data.csv")
        positional, keyword = mock_to_csv.call_args
        # Only compare against string arguments: comparing a str with a
        # DataFrame argument would raise instead of returning a bool.
        string_arguments = [
            argument
            for argument in list(positional) + list(keyword.values())
            if isinstance(argument, str)
        ]
        self.assertIn(expected_path, string_arguments,
                      "CSV file saved to an unexpected path")
    def test_get_filepaths_pos(self, mock_listdir):
        """
        Positive test case for the get_filepaths function.

        The directory listing is mocked to contain a single html file, and the
        test checks the returned file paths together with the month and year
        returned for the directory.

        Arguments:
            mock_listdir: mock whose return value supplies the directory
                          listing (presumably injected by a patch decorator
                          outside this view)
        """
        extractor = XbrlExtraction()

        # Accounts_Monthly_Data-December2014/Prod224_0013_07971828_20140331.html
        file_name = "Prod224_0013_07971828_20140331.html"
        directory = "Accounts_Monthly_Data-December2014"
        mock_listdir.return_value = [file_name]

        files, month, year = extractor.get_filepaths(directory)

        # assertEqual (rather than assertTrue(a == b)) shows both values when
        # the comparison fails
        self.assertEqual(files, [directory + "/" + file_name],
                         "Directory not present")
        self.assertEqual(month, "December", "Incorrect month")
        self.assertEqual(year, "2014", "Incorrect year")
    def build_month_table(list_of_files):
        """
        Function which parses, sequentially, a list of xbrl/ html files,
        converting each parsed file into a dictionary and appending to a list.

        A progress bar is updated after every file and, once all files are
        done, the average processing time per file (in minutes) is printed.

        NOTE(review): this takes no `self`; presumably it is decorated as a
        staticmethod outside this view - confirm.

        Arguments:
            list_of_files: list of filepaths, each corresponding to a
                           xbrl/html file (list)

        Returns:
            results:       list with one entry per file, each being whatever
                           XbrlParser.process_account returned for it (list)
        Raises:
            None
        """

        process_start = time.time()
        total_files = len(list_of_files)

        # Empty table awaiting results
        results = []

        # For every file (counting from 1 so the progress bar ends at 100%)
        for count, file in enumerate(list_of_files, start=1):

            # Read the file and parse
            doc = XbrlParser.process_account(file)

            # append results to table
            results.append(doc)

            XbrlExtraction.progressBar("XBRL Accounts Parsed",
                                       count,
                                       total_files,
                                       bar_length=50,
                                       width=20)

        # The message promises an average, so divide the elapsed minutes by the
        # number of files; guard against an empty list to avoid dividing by 0.
        elapsed_minutes = (time.time() - process_start) / 60
        average_minutes = elapsed_minutes / total_files if total_files else 0.0

        print(
            "Average time to process an XBRL file: \x1b[31m{:f}\x1b[0m".
            format(average_minutes), "minutes")

        return results
Ejemplo n.º 7
0
    def test_get_tag_counts_values(self):
        """
        Values test case for the get_tag_counts function.

        Every argument set below is expected to make get_tag_counts raise a
        ValueError.
        """
        xbrl_extraction = XbrlExtraction()
        frame = self.input_data()

        # (column, output folder, month, year)
        invalid_cases = (
            ("three", "output_folder", "month", "year"),
            ("name", "output_folder", "notamonth", "year"),
            ("name", "notafolder", "January", "year"),
            ("name", "output_folder", "January", "notayear"),
        )

        # no subTest on purpose: the first failing case stops the test
        for case in invalid_cases:
            with self.assertRaises(ValueError):
                xbrl_extraction.get_tag_counts(frame, *case)
Ejemplo n.º 8
0
    def test_output_xbrl_month_values(self):
        """
        Values test case for the output_xbrl_month function.

        Every argument set below is expected to make output_xbrl_month raise a
        ValueError.
        """
        xbrl_extraction = XbrlExtraction()
        frame = self.data_input()

        # (output folder, month, year, file type)
        invalid_cases = (
            ("notafolder", "January", "year", "file_type"),
            ("output_folder", "notamonth", "year", "file_type"),
            ("output_folder", "January", "notayear", "file_type"),
        )

        # no subTest on purpose: the first failing case stops the test
        for case in invalid_cases:
            with self.assertRaises(ValueError):
                xbrl_extraction.output_xbrl_month(frame, *case)
Ejemplo n.º 9
0
    def test_retrieve_list_of_tags_pos(self, mock_open):
        '''
        Positive test case for the retrieve_list_of_tags function.

        Patches the builtin open and verifies it is called exactly once, with
        the expected "<folder>/<year>-<month>_list_of_tags.txt" path and in
        write mode.

        :param mock_open: mocked instance of the builtin python open method

        :return: None

        '''

        # test inputs
        frame = self.input_data()
        tag_column = 'name'
        out_folder = '/shares/xbrl_parsed_data'
        out_month = 'January'
        out_year = '2010'

        # path the tag list is expected to be written to
        expected_file = ''.join(
            [out_folder, '/', out_year, '-', out_month, '_list_of_tags.txt'])

        open_patcher = mock.patch('builtins.open', mock_open())
        with open_patcher as mocked_file:
            xbrl_extraction = XbrlExtraction()
            xbrl_extraction.retrieve_list_of_tags(frame, tag_column,
                                                  out_folder, out_month,
                                                  out_year)
            mocked_file.assert_called_once_with(expected_file, 'w')
Ejemplo n.º 10
0
    def test_get_tag_counts_types(self):
        """
        Types test case for the get_tag_counts function.

        Each argument set puts a number in one position that otherwise holds
        the dataframe or a string, and expects a TypeError.
        """

        xbrl_extraction = XbrlExtraction()
        frame = self.input_data()

        # (dataframe, column, output folder, month, year)
        wrongly_typed_cases = (
            (1.0, "name", "output_folder", "month", "year"),
            (frame, 1, "output_folder", "month", "year"),
            (frame, "name", 1, "month", "year"),
            (frame, "name", "output_folder", 1, "year"),
            (frame, "name", "output_folder", "month", 1),
        )

        # no subTest on purpose: the first failing case stops the test
        for case in wrongly_typed_cases:
            with self.assertRaises(TypeError):
                xbrl_extraction.get_tag_counts(*case)
Ejemplo n.º 11
0
def main():
    """
    Run every pipeline stage whose switch is turned on.

    Each stage is guarded by a name that is compared against str(True), i.e.
    the string "True". These switches, and all directory/path names used
    below, are not defined in this function.
    NOTE(review): presumably they are module-level values read from a config
    file - confirm.

    Stages, in order: XBRL web scraper, scraped-data validator, unpacker,
    parser, csv appender, PDF web scraper, then several stages (PDF to image,
    classifier training, binary classifier, OCR, NLP, XBRL/PDF merge) that
    currently only print a message.
    """
    print("-" * 50)

    # Execute module xbrl_web_scraper
    if xbrl_web_scraper == str(True):
        print("XBRL web scraper running...")
        print("Scraping XBRL data to:", scraped_dir)
        print("Running crawler from:", xbrl_scraper)
        # the crawl command is run from inside the scraper directory
        chdir(xbrl_scraper)
        print(getcwd())
        cmdlinestr = "scrapy crawl xbrl_scraper"
        # the command output is read and then discarded
        popen(cmdlinestr).read()

    # Validate xbrl data
    if xbrl_web_scraper_validator == str(True):
        validator = XbrlValidatorMethods()
        print("Validating xbrl web scraped data...")
        validator.validate_compressed_files(validator_scraped_dir)

    # Execute module xbrl_unpacker
    if xbrl_unpacker == str(True):
        print("XBRL unpacker running...")
        print("Unpacking zip files...")
        print("Reading from directory: ", unpacker_source_dir)
        print("Writing to directory: ", unpacker_destination_dir)
        unpacker = DataProcessing()
        unpacker.extract_compressed_files(unpacker_source_dir,
                                          unpacker_destination_dir)

    # Execute module xbrl_parser
    if xbrl_parser == str(True):
        print("XBRL parser running...")

        extractor = XbrlExtraction()

        # Create a list of months based on what quarter in the year has been specified
        # (the quarter is compared as a string: "1".."4")
        if xbrl_parser_process_quarter == "1":
            month_list = ['January', 'February', 'March']
        elif xbrl_parser_process_quarter == "2":
            month_list = ['April', 'May', 'June']
        elif xbrl_parser_process_quarter == "3":
            month_list = ['July', 'August', 'September']
        elif xbrl_parser_process_quarter == "4":
            month_list = ['October', 'November', 'December']
        else:
            # any other value (including the string "None") means the whole year
            month_list = [
                'January', 'February', 'March', 'April', 'May', 'June', 'July',
                'August', 'September', 'October', 'November', 'December'
            ]
            if xbrl_parser_process_quarter != "None":
                print(
                    "Invalid quarter specified...processing one year of data!")

        # Create a list of directories from each month present in the month list
        directory_list = []
        if xbrl_parser_custom_input == "None":
            for month in month_list:
                directory_list.append(xbrl_unpacked_data +
                                      "/Accounts_Monthly_Data-" + month +
                                      xbrl_parser_process_year)

        # If a custom list has been specified as a comma separated string, use this instead
        else:
            folder_list = xbrl_parser_custom_input.split(",")
            for folder in folder_list:
                directory_list.append(xbrl_unpacked_data + "/" + folder)

        for directory in directory_list:

            print("Parsing " + directory + "...")

            # Get all the filenames from the example folder
            files, folder_month, folder_year = extractor.get_filepaths(
                directory)

            print(len(files))

            # Here you can splice/truncate the number of files you want to process for testing
            # TO BE COMMENTED OUT AFTER TESTING
            # NOTE(review): this truncation is still active, so only the first
            # five files of each directory are parsed.
            #files = files[0:40]
            files = files[0:5]

            print(folder_month, folder_year)

            # Finally, build a table of all variables from all example (digital) documents
            # This can take a while
            results = extractor.build_month_table(files)

            # NOTE(review): .shape, .head and ["name"].tolist() below assume
            # build_month_table returns a pandas DataFrame here - TODO confirm.
            print(results.shape)

            # NOTE(review): the value returned by head(10) is not used.
            results.head(10)

            extractor.output_xbrl_month(results, xbrl_processed_csv,
                                        folder_month, folder_year)

            # Find list of all unique tags in dataset
            list_of_tags = results["name"].tolist()
            list_of_tags_unique = list(set(list_of_tags))

            # prints the length of the longest tag, not the tag itself
            print("Longest tag: ", len(max(list_of_tags_unique, key=len)))

            # Output all unique tags to a txt file
            extractor.retrieve_list_of_tags(results, "name", xbrl_tag_list,
                                            folder_month, folder_year)

            # Output all unique tags and their relative frequencies to a txt file
            extractor.get_tag_counts(results, "name", xbrl_tag_frequencies,
                                     folder_month, folder_year)

            # print(results.shape)

        #tempcsv = pd.read_csv("/shares/xbrl_parsed_data/2020-April_xbrl_data.csv", lineterminator='\n')
        #print(tempcsv.head(5000000))
        #print(tempcsv.shape)

    # Append XBRL data on an annual or quarterly basis
    if xbrl_file_appender == str(True):
        appender = XbrlCsvAppender()
        print("XBRL appender running...")
        appender.merge_files_by_year(xbrl_file_appender_indir,
                                     xbrl_file_appender_outdir,
                                     xbrl_file_appender_year,
                                     xbrl_file_appender_quarter)

    # Execute PDF web scraper
    if pdf_web_scraper == str(True):
        print("PDF web scraper running...")
        print("Scraping filed accounts as PDF data to:",
              filed_accounts_scraped_dir)
        print("Running crawler from:", filed_accounts_scraper)
        # the crawl command is run from inside the scraper directory
        chdir(filed_accounts_scraper)
        print(getcwd())
        paper_filing_cmdlinestr = "scrapy crawl latest_paper_filing"
        popen(paper_filing_cmdlinestr).read()

    # The remaining stages only print a message at present.

    # Convert PDF files to images
    if pdfs_to_images == str(True):
        print("Converting all PDFs to images...")

    # Train the Classifier model
    if train_classifier_model == str(True):
        print("Training classifier model...")

    # Execute binary Classifier
    if binary_classifier == str(True):
        print("Executing binary classifier...")

    # Execute OCR
    if ocr_functions == str(True):
        print("Running all OCR functions...")
        # instance to class

    # Execute NLP
    if nlp_functions == str(True):
        print("Running all NLP functions...")

    # Merge xbrl and PDF file data
    if merge_xbrl_to_pdf_data == str(True):
        print("Merging XBRL and PDF data...")
    """
Ejemplo n.º 12
0
    def test_retrieve_list_of_tags_type(self):
        '''
        Types test case for the retrieve_list_of_tags function.

        Each argument set has exactly one wrongly typed argument and is
        expected to raise a TypeError.

        :return: None
        '''
        xbrl_extraction = XbrlExtraction()
        frame = self.input_data()

        # (dataframe, column, folder, month, year)
        wrongly_typed_cases = (
            # dataframe given as a string
            ('df', 'name', 'output_folder', 'January', '2010'),
            # column given as a list
            (frame, ['name'], 'output_folder', 'January', '2010'),
            # folder given as a list
            (frame, 'name', ['output_folder'], 'January', '2010'),
            # month given as an int
            (frame, 'name', 'output_folder', 1, '2010'),
            # year given as an int
            (frame, 'name', 'output_folder', 'January', 2010),
        )

        # no subTest on purpose: the first failing case stops the test
        for case in wrongly_typed_cases:
            with self.assertRaises(TypeError):
                xbrl_extraction.retrieve_list_of_tags(*case)
Ejemplo n.º 13
0
    def test_output_xbrl_month_types(self):
        """
        Types test case for the output_xbrl_month function.

        Each call passes a number in exactly one argument position and expects
        a TypeError. All other arguments are strings (or the dataframe).
        """

        extractor = XbrlExtraction()
        df = self.data_input()

        # dataframe given as a float
        with self.assertRaises(TypeError):
            extractor.output_xbrl_month(1.0, "output_folder", "month", "year", "file_type")

        # output folder given as an int
        with self.assertRaises(TypeError):
            extractor.output_xbrl_month(df, 1, "month", "year", "file_type")

        # month given as an int
        with self.assertRaises(TypeError):
            extractor.output_xbrl_month(df, "output_folder", 1, "year", "file_type")

        # year given as an int
        with self.assertRaises(TypeError):
            extractor.output_xbrl_month(df, "output_folder", "month", 1, "file_type")

        # file type given as an int; the previous version of this case had its
        # other arguments shifted by one position (a leftover "name" column
        # argument), so they are realigned with the cases above
        with self.assertRaises(TypeError):
            extractor.output_xbrl_month(df, "output_folder", "month", "year", 1)
    def parse_directory(directory, processed_path, num_processes=1):
        """
        Takes a directory, parses all files contained there and saves them as
        csv files in a specified directory.

        The file list is split into chunks, one per process, and each chunk is
        handed to XbrlParser.build_month_table through a process pool. The
        per-chunk results are then merged, flattened and written out.

        NOTE(review): this takes no `self`; presumably it is decorated as a
        staticmethod outside this view - confirm.
        NOTE(review): if no files are found, chunk_len is 0 and range() is
        given a step of 0, which raises ValueError; num_processes=0 raises
        ZeroDivisionError.

        Arguments:
            directory: A directory (path) to be processed (str)
            processed_path: String of the path where processed files should be
                            saved (str)
            num_processes:  The number of cores to use in multiprocessing (int)
        Returns:
            None
        Raises:
            None
        """
        extractor = XbrlExtraction()
        parser = XbrlParser()

        # Get all the filenames from the example folder
        files, folder_month, folder_year = extractor.get_filepaths(directory)

        print(len(files))

        # Here you can splice/truncate the number of files you want to process
        # for testing
        # TO BE COMMENTED OUT AFTER TESTING
        # NOTE(review): this truncation is still active, so only the first ten
        # files are parsed.
        files = files[0:10]

        print(folder_month, folder_year)

        # Code needed to split files by the number of cores before passing in
        # as an argument
        # (ceil so that at most num_processes chunks are produced)
        chunk_len = math.ceil(len(files) / num_processes)
        files = [
            files[i:i + chunk_len] for i in range(0, len(files), chunk_len)
        ]

        # define number of processors
        pool = mp.Pool(processes=num_processes)
        # Finally, build a table of all variables from all example (digital)
        # documents splitting the load between cpu cores = num_processes
        # This can take a while (hopefully not anymore!!!)
        # r holds one result per chunk
        r = pool.map(parser.build_month_table, files)

        pool.close()
        pool.join()
        # combine resultant list of lists
        print("Combining lists...")
        r = [item for sublist in r for item in sublist]
        print("Flattening data....")
        # combine data and convert into dataframe
        results = parser.flatten_data(r)
        print(results.shape)

        # save to csv
        extractor.output_xbrl_month(results, processed_path, folder_month,
                                    folder_year)

        # Find list of all unique tags in dataset
        list_of_tags = results["name"].tolist()
        list_of_tags_unique = list(set(list_of_tags))

        # prints the length of the longest tag, not the tag itself
        print("Longest tag: ", len(max(list_of_tags_unique, key=len)))