def test_retrieve_list_of_tags_value(self):
    '''
    Value test case for the retrieve_list_of_tags function: each
    invalid argument combination should raise a ValueError.

    :return: None
    '''
    extractor = XbrlExtraction()
    frame = self.input_data()

    # Each tuple is one invalid (column, folder, month, year) combination.
    bad_arguments = [
        ('potatoe', '/shares/xbrl_parsed_data', 'January', '2010'),  # column not in dataframe
        ('name', 'not_real_directory', 'January', '2010'),           # invalid directory
        ('name', '/shares/xbrl_parsed_data', 'Jan', '2010'),         # invalid month
        ('name', '/shares/xbrl_parsed_data', 'January', '2k10'),     # year not also int
    ]
    for column, folder, month, year in bad_arguments:
        with self.assertRaises(ValueError):
            extractor.retrieve_list_of_tags(frame, column, folder, month,
                                            year)
def test_get_tag_counts_pos(self, mock_to_csv, mock_path):
    """
    Positive test case for the get_tag_counts function: with the
    output path mocked as present, a CSV should be written.
    """
    frame = self.input_data()
    extractor = XbrlExtraction()
    # Pretend the output folder exists so the save path is taken.
    mock_path.exists.return_value = True

    extractor.get_tag_counts(frame, "name", "output_folder", "January",
                             "2010")

    self.assertTrue(mock_to_csv.called, "CSV file not saved")
def test_get_filepaths_types(self):
    """
    Types test case for the get_filepaths function: a non-string
    directory argument should raise a TypeError.
    """
    extractor = XbrlExtraction()
    # An integer is not a valid directory path.
    with self.assertRaises(TypeError):
        extractor.get_filepaths(1)
def test_output_xbrl_month_pos(self, mock_to_csv, mock_path):
    """
    Positive test case for the output_xbrl_month function.

    Checks that DataFrame.to_csv is called and that the file is written
    to the expected "<folder>/<year>-<month>_xbrl_data.csv" path.
    """
    extractor = XbrlExtraction()
    df = self.data_input()
    extractor.output_xbrl_month(df, "output_folder", "January", "2010",
                                "csv")
    self.assertTrue(mock_to_csv.called, "CSV file not saved")
    # BUG FIX: the original second assertion was
    # self.assertTrue(mock_to_csv, "output_folder/...") which always
    # passes — a Mock object is truthy and the second argument is only
    # the failure message.  Assert on the path actually handed to
    # to_csv instead.
    # NOTE(review): assumes the output path is to_csv's first
    # positional argument — confirm against output_xbrl_month.
    expected_path = "output_folder" + "/" + "2010" + "-" + "January" \
        + "_xbrl_data.csv"
    self.assertEqual(mock_to_csv.call_args[0][0], expected_path,
                     "CSV not written to expected path")
def test_get_filepaths_pos(self, mock_listdir):
    """
    Positive test case for the get_filepaths function: the file list,
    month and year should all be derived from the directory name.
    """
    extractor = XbrlExtraction()
    # Accounts_Monthly_Data-December2014/Prod224_0013_07971828_20140331.html
    directory = "Accounts_Monthly_Data-December2014"
    filename = "Prod224_0013_07971828_20140331.html"
    mock_listdir.return_value = [filename]

    files, month, year = extractor.get_filepaths(directory)

    self.assertEqual(files, [directory + "/" + filename],
                     "Directory not present")
    self.assertEqual(month, "December", "Incorrect month")
    self.assertEqual(year, "2014", "Incorrect year")
def build_month_table(list_of_files):
    """
    Function which parses, sequentially, a list of xbrl/html files,
    converting each parsed file into a dictionary and appending to a
    list.

    Arguments:
        list_of_files: list of filepaths, each corresponding to a
                       xbrl/html file (list)
    Returns:
        results: list of dictionaries, each containing the parsed
                 content of a xbrl/html file (list)
    Raises:
        None
    """
    process_start = time.time()

    # Empty table awaiting results
    results = []
    total_files = len(list_of_files)

    # Parse every file, updating the progress bar as we go
    for count, file in enumerate(list_of_files, start=1):
        # Read the file and parse
        doc = XbrlParser.process_account(file)
        # append results to table
        results.append(doc)
        XbrlExtraction.progressBar("XBRL Accounts Parsed", count,
                                   total_files, bar_length=50, width=20)

    # BUG FIX: the original used "{:0f}" and passed a stray second
    # argument (2) to format(), which was silently discarded; "{:.2f}"
    # applies the intended 2-decimal rounding.
    # NOTE(review): the value printed is TOTAL elapsed minutes, not a
    # per-file average as the label claims — confirm intent before
    # changing the computation.
    print("Average time to process an XBRL file: \x1b[31m{:.2f}\x1b[0m"
          .format((time.time() - process_start) / 60), "minutes")

    return results
def test_get_tag_counts_values(self):
    """
    Values test case for the get_tag_counts function: each invalid
    argument value should raise a ValueError.
    """
    extractor = XbrlExtraction()
    frame = self.input_data()

    # Each tuple is one invalid (column, folder, month, year) combination.
    invalid_calls = [
        ("three", "output_folder", "month", "year"),       # column not in dataframe
        ("name", "output_folder", "notamonth", "year"),    # unrecognised month
        ("name", "notafolder", "January", "year"),         # folder does not exist
        ("name", "output_folder", "January", "notayear"),  # year not numeric
    ]
    for column, folder, month, year in invalid_calls:
        with self.assertRaises(ValueError):
            extractor.get_tag_counts(frame, column, folder, month, year)
def test_output_xbrl_month_values(self):
    """
    Values test case for the output_xbrl_month function: each invalid
    argument value should raise a ValueError.
    """
    extractor = XbrlExtraction()
    frame = self.data_input()

    # Each tuple is one invalid (folder, month, year, file_type) combination.
    invalid_calls = [
        ("notafolder", "January", "year", "file_type"),
        ("output_folder", "notamonth", "year", "file_type"),
        ("output_folder", "January", "notayear", "file_type"),
    ]
    for folder, month, year, file_type in invalid_calls:
        with self.assertRaises(ValueError):
            extractor.output_xbrl_month(frame, folder, month, year,
                                        file_type)
def test_retrieve_list_of_tags_pos(self, mock_open):
    '''
    Test to check if the open function is called and that the file name
    and directory is as expected. Additionally checks the file is in
    write mode.

    :param mock_open: mocked instance of the builtin python open method
    :return: None
    '''
    frame = self.input_data()
    folder = '/shares/xbrl_parsed_data'
    month = 'January'
    year = '2010'
    expected_name = folder + '/' + year + '-' + month \
        + '_list_of_tags.txt'

    with mock.patch('builtins.open', mock_open()) as mocked_file:
        XbrlExtraction().retrieve_list_of_tags(frame, 'name', folder,
                                               month, year)
        # Exactly one file opened, at the expected path, in write mode.
        mocked_file.assert_called_once_with(expected_name, 'w')
def test_get_tag_counts_types(self):
    """
    Types test case for the get_tag_counts function: every wrongly
    typed argument should raise a TypeError.
    """
    extractor = XbrlExtraction()
    frame = self.input_data()

    # One wrongly-typed argument per call, cycling through positions.
    bad_typed_calls = [
        (1.0, "name", "output_folder", "month", "year"),  # dataframe as float
        (frame, 1, "output_folder", "month", "year"),     # column as int
        (frame, "name", 1, "month", "year"),              # folder as int
        (frame, "name", "output_folder", 1, "year"),      # month as int
        (frame, "name", "output_folder", "month", 1),     # year as int
    ]
    for args in bad_typed_calls:
        with self.assertRaises(TypeError):
            extractor.get_tag_counts(*args)
def main():
    """
    Pipeline entry point: runs each processing stage (web scraper,
    validator, unpacker, parser, appender, PDF/OCR/NLP tooling) when
    its corresponding config flag equals the string "True".

    NOTE(review): every flag and path used here (xbrl_web_scraper,
    scraped_dir, xbrl_unpacked_data, ...) is read from module/config
    scope not visible in this chunk — confirm they are all defined
    before main() runs.
    """
    print("-" * 50)

    # Execute module xbrl_web_scraper
    if xbrl_web_scraper == str(True):
        print("XBRL web scraper running...")
        print("Scraping XBRL data to:", scraped_dir)
        print("Running crawler from:", xbrl_scraper)
        # scrapy is launched from the crawler's own directory
        chdir(xbrl_scraper)
        print(getcwd())
        cmdlinestr = "scrapy crawl xbrl_scraper"
        popen(cmdlinestr).read()

    # Validate xbrl data
    if xbrl_web_scraper_validator == str(True):
        validator = XbrlValidatorMethods()
        print("Validating xbrl web scraped data...")
        validator.validate_compressed_files(validator_scraped_dir)

    # Execute module xbrl_unpacker
    if xbrl_unpacker == str(True):
        print("XBRL unpacker running...")
        print("Unpacking zip files...")
        print("Reading from directory: ", unpacker_source_dir)
        print("Writing to directory: ", unpacker_destination_dir)
        unpacker = DataProcessing()
        unpacker.extract_compressed_files(unpacker_source_dir,
                                          unpacker_destination_dir)

    # Execute module xbrl_parser
    if xbrl_parser == str(True):
        print("XBRL parser running...")
        extractor = XbrlExtraction()
        # Create a list of months based on what quarter in the year has
        # been specified
        if xbrl_parser_process_quarter == "1":
            month_list = ['January', 'February', 'March']
        elif xbrl_parser_process_quarter == "2":
            month_list = ['April', 'May', 'June']
        elif xbrl_parser_process_quarter == "3":
            month_list = ['July', 'August', 'September']
        elif xbrl_parser_process_quarter == "4":
            month_list = ['October', 'November', 'December']
        else:
            # Fall back to a full year of months
            month_list = ['January', 'February', 'March', 'April', 'May',
                          'June', 'July', 'August', 'September', 'October',
                          'November', 'December']
            # Anything other than "None" or "1".."4" is an invalid quarter
            if xbrl_parser_process_quarter != "None":
                print(
                    "Invalid quarter specified...processing one year of data!")

        # Create a list of directories from each month present in the
        # month list
        directory_list = []
        if xbrl_parser_custom_input == "None":
            for month in month_list:
                directory_list.append(xbrl_unpacked_data
                                      + "/Accounts_Monthly_Data-" + month
                                      + xbrl_parser_process_year)
        # If a custom list has been specified as a comma separated string,
        # use this instead
        else:
            folder_list = xbrl_parser_custom_input.split(",")
            for folder in folder_list:
                directory_list.append(xbrl_unpacked_data + "/" + folder)

        for directory in directory_list:
            print("Parsing " + directory + "...")
            # Get all the filenames from the example folder
            files, folder_month, folder_year = extractor.get_filepaths(
                directory)
            print(len(files))
            # Here you can splice/truncate the number of files you want to
            # process for testing
            # TO BE COMMENTED OUT AFTER TESTING
            #files = files[0:40]
            files = files[0:5]
            print(folder_month, folder_year)
            # Finally, build a table of all variables from all example
            # (digital) documents
            # This can take a while
            results = extractor.build_month_table(files)
            print(results.shape)
            results.head(10)
            # NOTE(review): no file_type argument passed here, unlike the
            # 5-argument calls in the tests — presumably output_xbrl_month
            # has a default; confirm.
            extractor.output_xbrl_month(results, xbrl_processed_csv,
                                        folder_month, folder_year)
            # Find list of all unique tags in dataset
            list_of_tags = results["name"].tolist()
            list_of_tags_unique = list(set(list_of_tags))
            print("Longest tag: ", len(max(list_of_tags_unique, key=len)))
            # Output all unique tags to a txt file
            extractor.retrieve_list_of_tags(results, "name", xbrl_tag_list,
                                            folder_month, folder_year)
            # Output all unique tags and their relative frequencies to a
            # txt file
            extractor.get_tag_counts(results, "name", xbrl_tag_frequencies,
                                     folder_month, folder_year)
            # print(results.shape)
        #tempcsv = pd.read_csv("/shares/xbrl_parsed_data/2020-April_xbrl_data.csv", lineterminator='\n')
        #print(tempcsv.head(5000000))
        #print(tempcsv.shape)

    # Append XBRL data on an annual or quarterly basis
    if xbrl_file_appender == str(True):
        appender = XbrlCsvAppender()
        print("XBRL appender running...")
        appender.merge_files_by_year(xbrl_file_appender_indir,
                                     xbrl_file_appender_outdir,
                                     xbrl_file_appender_year,
                                     xbrl_file_appender_quarter)

    # Execute PDF web scraper
    if pdf_web_scraper == str(True):
        print("PDF web scraper running...")
        print("Scraping filed accounts as PDF data to:",
              filed_accounts_scraped_dir)
        print("Running crawler from:", filed_accounts_scraper)
        chdir(filed_accounts_scraper)
        print(getcwd())
        paper_filing_cmdlinestr = "scrapy crawl latest_paper_filing"
        popen(paper_filing_cmdlinestr).read()

    # Convert PDF files to images
    if pdfs_to_images == str(True):
        print("Converting all PDFs to images...")

    # Train the Classifier model
    if train_classifier_model == str(True):
        print("Training classifier model...")

    # Execute binary Classifier
    if binary_classifier == str(True):
        print("Executing binary classifier...")

    # Execute OCR
    if ocr_functions == str(True):
        print("Running all OCR functions...")
        # instance to class

    # Execute NLP
    if nlp_functions == str(True):
        print("Running all NLP functions...")

    # Merge xbrl and PDF file data
    if merge_xbrl_to_pdf_data == str(True):
        print("Merging XBRL and PDF data...")
    # NOTE(review): the original (whitespace-mangled) source ended this
    # block with a stray unmatched triple-quote ("""), presumably the
    # opening of a commented-out section lost in reformatting; kept as
    # this comment so the module stays syntactically valid — confirm
    # against the original file.
def test_retrieve_list_of_tags_type(self):
    '''
    Types test case for the retrieve_list_of_tags function: every
    wrongly typed argument should raise a TypeError.

    :return: None
    '''
    extractor = XbrlExtraction()
    frame = self.input_data()

    # One wrongly-typed argument per call, cycling through positions.
    bad_typed_calls = [
        ('df', 'name', 'output_folder', 'January', '2010'),     # dataframe as string
        (frame, ['name'], 'output_folder', 'January', '2010'),  # column as list
        (frame, 'name', ['output_folder'], 'January', '2010'),  # folder as list
        (frame, 'name', 'output_folder', 1, '2010'),            # month as int
        (frame, 'name', 'output_folder', 'January', 2010),      # year as int
    ]
    for args in bad_typed_calls:
        with self.assertRaises(TypeError):
            extractor.retrieve_list_of_tags(*args)
def test_output_xbrl_month_types(self):
    """
    Types test case for the output_xbrl_month function: every wrongly
    typed argument should raise a TypeError.
    """
    extractor = XbrlExtraction()
    df = self.data_input()
    # dataframe as float
    with self.assertRaises(TypeError):
        extractor.output_xbrl_month(1.0, "output_folder", "month", "year",
                                    "file_type")
    # folder as int
    with self.assertRaises(TypeError):
        extractor.output_xbrl_month(df, 1, "month", "year", "file_type")
    # month as int
    with self.assertRaises(TypeError):
        extractor.output_xbrl_month(df, "output_folder", 1, "year",
                                    "file_type")
    # year as int
    with self.assertRaises(TypeError):
        extractor.output_xbrl_month(df, "output_folder", "month", 1,
                                    "file_type")
    # file_type as int
    # BUG FIX: the original final call was
    # output_xbrl_month(df, "name", "output_folder", "month", 1) — an
    # argument pattern copied from the get_tag_counts tests, which
    # shifted every parameter by one position.
    with self.assertRaises(TypeError):
        extractor.output_xbrl_month(df, "output_folder", "month", "year",
                                    1)
def parse_directory(directory, processed_path, num_processes=1):
    """
    Takes a directory, parses all files contained there and saves them
    as csv files in a specified directory.

    Arguments:
        directory:      A directory (path) to be processed (str)
        processed_path: String of the path where processed files should
                        be saved (str)
        num_processes:  The number of cores to use in multiprocessing
                        (int)
    Returns:
        None
    Raises:
        None
    """
    extractor = XbrlExtraction()
    parser = XbrlParser()

    # Get all the filenames from the example folder
    files, folder_month, folder_year = extractor.get_filepaths(directory)
    print(len(files))

    # Here you can splice/truncate the number of files you want to process
    # for testing
    # NOTE(review): debug truncation still active — only the first 10
    # files are processed per run.
    files = files[0:10]  # TO BE COMMENTED OUT AFTER TESTING
    print(folder_month, folder_year)

    # Code needed to split files by the number of cores before passing in
    # as an argument: one roughly equal chunk of file paths per process
    chunk_len = math.ceil(len(files) / num_processes)
    files = [
        files[i:i + chunk_len] for i in range(0, len(files), chunk_len)
    ]

    # define number of processors
    pool = mp.Pool(processes=num_processes)

    # Finally, build a table of all variables from all example (digital)
    # documents splitting the load between cpu cores = num_processes
    # This can take a while (hopefully not anymore!!!)
    r = pool.map(parser.build_month_table, files)
    pool.close()
    pool.join()

    # combine resultant list of lists (one list of parsed dicts per chunk)
    print("Combining lists...")
    r = [item for sublist in r for item in sublist]

    print("Flattening data....")
    # combine data and convert into dataframe
    results = parser.flatten_data(r)
    print(results.shape)

    # save to csv
    extractor.output_xbrl_month(results, processed_path, folder_month,
                                folder_year)

    # Find list of all unique tags in dataset
    list_of_tags = results["name"].tolist()
    list_of_tags_unique = list(set(list_of_tags))
    print("Longest tag: ", len(max(list_of_tags_unique, key=len)))