Example #1
def profile_rtf_parser(full_file_path: str):
    """Profile the rtf parser"""
    try:
        start = time.time()
        file_gen = FileGenerator(project_file_path=full_file_path,
                                 file_ext='rtf')
        file_iter = iter(file_gen)
        rtf_parser = RtfParser()
        while True:
            try:
                rtf_parser.extract_text(current_file=next(file_iter))
            except StopIteration:
                print('finished processing rtf files')
                break
        end = time.time()
        print(f"Rtf Runtime: {end - start}")

        print(f"Rtfl file counter: {rtf_parser.file_counter}")
        print(f"Rtf error file counter: {rtf_parser.error_file_counter}")
        pprint(f"Rtf Content: {rtf_parser.mapping_dict.keys()}")
        create_admin_spreadsheet(write_to_path=admin_log_path,
                                 file_type='rtf',
                                 count_extracted=rtf_parser.file_counter,
                                 count_failed=rtf_parser.error_file_counter,
                                 failed_file_name=rtf_parser.error_files,
                                 failed_file_path=full_file_path,
                                 count=get_file_count_by_extension(
                                     file_path=full_file_path, file_ext='rtf'))
    except Exception as e:
        print(e)
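Both profiling helpers iterate over FileGenerator, which yields one file of the requested extension at a time. The class itself is not part of these examples, so the following is only a minimal sketch of that iteration pattern, with the directory walk and attribute names as assumptions:

import os
from typing import Iterator


class SimpleFileGenerator:
    """Hypothetical stand-in for FileGenerator: walk a directory tree and
    yield the full path of every file with a matching extension."""

    def __init__(self, project_file_path: str, file_ext: str):
        self.project_file_path = project_file_path
        self.file_ext = file_ext.lower().lstrip('.')

    def __iter__(self) -> Iterator[str]:
        for root, _dirs, files in os.walk(self.project_file_path):
            for name in files:
                if name.lower().endswith('.' + self.file_ext):
                    yield os.path.join(root, name)


# usage mirroring the profiling helper above
# for current_file in SimpleFileGenerator(r'V:\Dev\Historical\20190521\Document', 'rtf'):
#     rtf_parser.extract_text(current_file=current_file)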
Example #2
def profile_doc_parser(full_file_path: str):
    """Profile the doc parser"""
    try:
        start = time.time()
        r_path = r'C:\Users\wmurphy\Desktop\R_Scripts'
        r_executable = r'C:\Program Files\R\R-3.5.3\bin\Rscript'
        r_script = r'doc_to_csv_4_29_19.R'
        doc_to_csv_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\doc_to_csv'
        doc_parser = DocParser(r_executable=r_executable,
                               r_path=r_path,
                               r_script=r_script)
        # convert doc files to csv and write them to the doc_to_csv path
        doc_parser.run_doc_to_csv_r_script(file_path=full_file_path,
                                           timeout=str(20))
        file_gen = FileGenerator(project_file_path=doc_to_csv_path,
                                 file_ext='csv')
        file_iter = iter(file_gen)
        while True:
            try:
                current = doc_parser.extract_text(current_file=next(file_iter))
                print(current)
            except StopIteration:
                print('finished processing doc files')
                break
        doc_parser.remove_temp_doc_to_csv_files(
            doc_to_csv_write_path=doc_to_csv_path)
        end = time.time()
        print(f"Doc Runtime: {end - start}")
        print(f"Doc file counter: {doc_parser.file_counter}")
        print(f"Doc error file counter: {doc_parser.error_file_counter}")
        pprint(f"Doc Content: {doc_parser.mapping_dict.keys()}")
        create_admin_spreadsheet(write_to_path=admin_log_path,
                                 file_type='doc',
                                 count_extracted=doc_parser.file_counter,
                                 count_failed=doc_parser.error_file_counter,
                                 failed_file_name=doc_parser.error_files,
                                 failed_file_path=full_file_path,
                                 count=get_file_count_by_extension(
                                     file_path=full_file_path, file_ext='doc'))
        #print(f"Doc Content: {doc_parser.mapping_dict}")
    except Exception as e:
        print(e)
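profile_doc_parser delegates the .doc-to-.csv conversion to an R script via DocParser.run_doc_to_csv_r_script. That class is defined elsewhere; a plausible sketch of the call, assuming it simply shells out to Rscript and that the argument order is source directory followed by timeout, could look like this:

import os
import subprocess


def run_doc_to_csv_r_script(r_executable: str, r_path: str, r_script: str,
                            file_path: str, timeout: str) -> int:
    """Hypothetical sketch of DocParser.run_doc_to_csv_r_script: invoke the
    conversion script with Rscript and pass the source directory and timeout
    through as command-line arguments."""
    cmd = [r_executable, os.path.join(r_path, r_script), file_path, timeout]
    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode != 0:
        print(completed.stderr)
    return completed.returncode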
def run_eml_parser(project_file_path: str, metadata_file=None):
    """Run the eml data processing class application"""
    try:
        # NOTE: updated *6/27/2019*
        # step 1: get the unloaded documents from the current project directory
        file_gen = FileGenerator(project_file_path=project_file_path, file_ext='eml')
        file_iter = iter(file_gen)
        eml_parser = EmlParser()
        while True:
            try:
                eml_parser.extract_text(current_file=next(file_iter))
            except StopIteration:
                print('finished processing eml files')
                break
        print(f"Number of Eml files processed: {eml_parser.file_counter}")
        print(f"Number of Eml error files: {eml_parser.error_file_counter}")
        print(f"Mapping Dict: {eml_parser.mapping_dict}")
        serialize_data(
            data_set=eml_parser.mapping_dict,
            write_name='EmlParserMappingDict'
        )
        try:
            # step 5: sftp the serialized mapping data into the remote server
            data_serialization = DataSerialization()  # create an instance of the DataSerialization class
            remote_pkl_path = data_serialization.get_remote_pickle_path()  # get the remote pickle path
            local_pkl_path = data_serialization.get_local_pickle_path()  # get the local pickle path
            sftp_connection = SftpConnection()  # create an instance of the SftpConnection class
            sftp_connection.connect(  # connect and load serialized data to the remote server
                filename=f'EmlParserMappingDict_{d}.pickle',
                filepath=local_pkl_path,
                remote_path=remote_pkl_path
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 6: load in the meta data pandas DataFrame and sftp to the remote server
            md_log_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\metadata_log'
            md_file = 'metadata_log_06_26_2019'
            metadata_test_file_path = r'V:\Dev\Delta\20190619\Metadata'
            metadata_test_file = r'20190619.xls'
            meta_data = MetaData(metadata_log_file=os.path.join(md_log_path, md_file))
            meta_data_dict = meta_data.load_metadata_file(  # get the meta_data dictionary
                full_file_path=os.path.join(metadata_test_file_path, metadata_test_file)
            )
            print(f'Meta Data Dict := {meta_data_dict}')
            # # step 6.1: extract only the necessary columns from the meta_data data frame
            # md_df = meta_data_df.loc[:,['claim_id', 'Object_id', 'File_Name', 'Format',
            #                          'Version', 'Object_name', 'Prior_Version_Object_Name']]
            # step 6.2: serialize the data frame and sftp it to the remote server
            local_mapping_path = data_serialization.get_local_mapping_file_path()
            remote_mapping_path = data_serialization.get_remote_mapping_file_path()
            serialize_data(  # serialize the pickled meta data, data frame
                # NOTE: *updated 06/27/2019*
                # pass in the path from metadata_config.load_metadata_file function
                data_set=meta_data_dict,
                is_local=True,
                write_name='MetaData_DataFrame',
                is_pickle=False
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 7: create a new connection and load in the mapping data
            new_sftp_connection = SftpConnection()
            new_sftp_connection.connect(  # connect to the remote server and dump the pickled meta data, data frame
                filename=f'MetaData_DataFrame_{d}.pickle',
                filepath=local_mapping_path,
                remote_path=remote_mapping_path
            )

            # TEST: check the output of the file counter, error file counter and error files
            print(f"EML File count in filesystem: {get_file_count_by_extension(file_path=project_file_path, file_ext='eml')}")
            print(f"EML Error files list: {eml_parser.error_files}")
            print(f"EML Total files: {eml_parser.file_counter}")
            print(f"EML Total error files: {eml_parser.error_file_counter}")

            # step 8: update the admin file
            create_admin_spreadsheet(
                write_to_path=admin_dir, file_type='eml',
                count=get_file_count_by_extension(file_path=project_file_path, file_ext='eml'),
                count_extracted=eml_parser.file_counter, count_failed=eml_parser.error_file_counter,
                failed_file_path=project_file_path, failed_file_name=eml_parser.error_files
            )
            # step 9: execute the server side portion of the data pipeline
            remote_connection = RunRemoteScript(make_connection=True)  # connect to the remote server
            # step 9.1: execute the server side pipeline
            remote_connection.execute_server_side_pipeline()
        except Exception as e:
            logger.error(error=e)
    except Exception as e:
        logger.error(error=e)
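Every runner hands its mapping data to serialize_data and later uploads a file named <write_name>_{d}.pickle, so d is evidently a module-level date suffix. Neither the helper nor d is shown here; the sketch below is an assumption about both, including the date format and the output directory:

import os
import pickle
from datetime import date

d = date.today().strftime('%m_%d_%Y')  # assumed format of the global date suffix


def serialize_data(data_set, write_name: str, is_pickle: bool = True,
                   is_local: bool = False, write_dir: str = '.'):
    """Hypothetical sketch of serialize_data: write the mapping data to a dated
    file so the SFTP step can pick it up by name."""
    if is_pickle:
        out_path = os.path.join(write_dir, f'{write_name}_{d}.pickle')
        with open(out_path, 'wb') as fh:
            pickle.dump(data_set, fh, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # non-pickle branch; the real helper presumably writes a proper CSV
        out_path = os.path.join(write_dir, f'{write_name}_{d}.csv')
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write(str(data_set))
    return out_path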
def run_pdf_parser_v_1_1(project_file_path: str, metadata_file=None):
    pdf_parser = PdfParser()    # create an instance of the PdfParser class
    file_gen = FileGenerator(
        project_file_path=project_file_path, # create an instance of the FileGenerator class
        file_ext='pdf'
    )
    file_iter = iter(file_gen)  # create a file iterator
    pdf_ocr_two = PdfOcrTwo()

    while True:
        # step 1: iterate through the pdfs
        try:
            current_pdf = next(file_iter)   #  get the current pdf
            pdf_parser.extract_text(current_file=current_pdf)   # extract any text
            pdf_ocr_two.extract_img_minecart(full_file_name=current_pdf)   # extract pdf images
        except StopIteration:
            print("finished processing pdf files")
            break

    # step 2: run the pdf image algorithm
    pdf_name_list, pdf_img_data_struct = process_pdf_img_algo()

    for n in pdf_img_data_struct.keys():
        # step 3: get the current pdf image file name
        for m in pdf_parser.mapping_container:
            # step 3.1: search the mapping container for a matching file name
            fn = m['file_name'].split('.')[0]   # format the filename
            if n == fn:
                # step 3.2: if names match, search through the pdf img container for a matching pg
                for pg in pdf_img_data_struct[n].keys():
                    pg = int(pg) # convert to the same type as mapping_container pages
                    if pg in list(m.keys())[1:]:
                        # step 3.3: join the text and insert to the correct pdf's page
                        full_text = m[pg] + pdf_img_data_struct[n][str(pg)]
                        m[pg] = full_text
    pprint(f"PDF Text: {pdf_parser.mapping_container}")
    # NOTE: Comment back in after tests
    # step 4: write the mapping file to a pickle file and save in the pickle file directory
    serialize_data(
        data_set=pdf_parser.mapping_container,
        write_name='PdfParserMappingContainer'
    )
    try:
        # step 5: sftp the serialized mapping data into the remote server
        data_serialization = DataSerialization()    # create an instance of the DataSerialization class
        remote_pkl_path = data_serialization.get_remote_pickle_path()   # get the remote pickle path
        local_pkl_path = data_serialization.get_local_pickle_path()     # get the local pickle path
        sftp_connection = SftpConnection()  # create an instance of the SftpConnection class
        sftp_connection.connect(    # connect and load serialized data to the remote server
            filename=f'PdfParserMappingContainer_{d}.pickle',
            filepath=local_pkl_path,
            remote_path=remote_pkl_path
        )
    except Exception as e:
        logger.error(error=e)
    try:
        # step 6: load in the meta data pandas DataFrame and sftp to the remote server
        md_log_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\metadata_log'
        md_file = 'metadata_log_06_14_2019'
        metadata_test_file_path = r'V:\Dev\Delta\20190619\Metadata'
        metadata_test_file = r'20190619.xls'
        meta_data = MetaData(metadata_log_file=os.path.join(md_log_path, md_file))
        meta_data_dict = meta_data.load_metadata_file(  # get the meta_data dictionary
            full_file_path=os.path.join(metadata_test_file_path, metadata_test_file)
        )
        # # step 6.1: extract only the necessary columns from the meta_data data frame
        # md_df = meta_data_df.loc[:,['claim_id', 'Object_id', 'File_Name', 'Format',
        #                          'Version', 'Object_name', 'Prior_Version_Object_Name']]
        # step 6.2: serialize the data frame and sftp it to the remote server
        local_mapping_path = data_serialization.get_local_mapping_file_path()
        remote_mapping_path = data_serialization.get_remote_mapping_file_path()
        serialize_data(     # serialize the pickled meta data, data frame
            data_set=meta_data_dict,
            write_name='MetaData_DataFrame',
            is_pickle=False
        )
    except Exception as e:
        logger.error(error=e)
    try:
        # step 7: create a new connection and load in the mapping data
        new_sftp_connection = SftpConnection()
        new_sftp_connection.connect(    # connect to the remote server and dump the pickled meta data, data frame
            filename=f'MetaData_DataFrame_{d}.pickle',
            filepath=local_mapping_path,
            remote_path=remote_mapping_path
        )
        # step 8: update the admin file
        create_admin_spreadsheet(
            write_to_path=admin_dir, file_type='pdf',
            count=get_file_count_by_extension(file_path=project_file_path, file_ext='pdf'),
            count_extracted=pdf_parser.file_counter, count_failed=pdf_parser.error_file_counter,
            failed_file_path=project_file_path, failed_file_name=pdf_parser.error_files
        )
        # step 9: execute the server side portion of the data pipeline
        remote_connection = RunRemoteScript(make_connection=True) # connect to the remote server
        # step 9.1: execute the server side pipeline
        remote_connection.execute_server_side_pipeline()
    except Exception as e:
        logger.error(error=e)
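Step 3 of run_pdf_parser_v_1_1 folds the OCR output from process_pdf_img_algo back into the per-page entries of pdf_parser.mapping_container. Pulled out as a stand-alone helper, with the data shapes inferred from the loop above rather than from the parser classes, the same merge looks roughly like this:

def merge_ocr_text(mapping_container, pdf_img_data_struct):
    """Hypothetical helper mirroring step 3: append OCR'd image text onto the
    matching page of the matching PDF. mapping_container is assumed to be a
    list of dicts whose 'file_name' key holds the PDF name and whose remaining
    keys are integer page numbers; pdf_img_data_struct is assumed to map base
    file names to {str(page): text}."""
    for name, pages in pdf_img_data_struct.items():
        for mapping in mapping_container:
            if mapping['file_name'].split('.')[0] != name:
                continue
            for page_str, img_text in pages.items():
                page = int(page_str)
                if page in mapping:
                    mapping[page] = mapping[page] + img_text
    return mapping_container


# example shapes, mirroring the loop in run_pdf_parser_v_1_1
# mapping = [{'file_name': 'claim_123.pdf', 1: 'typed text '}]
# ocr = {'claim_123': {'1': 'scanned text'}}
# merge_ocr_text(mapping, ocr)   # page 1 becomes 'typed text scanned text'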
def run_doc_parser(project_file_path: str, metadata_file=None):
    raw_data = r'V:\Dev\Historical\20190521\Document'
    r_path = r'C:\Users\wmurphy\Desktop\R_Scripts'
    r_executable = r'C:\Program Files\R\R-3.5.3\bin\Rscript'
    r_script = r'doc_to_csv_4_29_19.R'
    doc_to_csv_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\doc_to_csv'
    try:
        # create an instance of the DocParser class
        doc_parser = DocParser(r_executable=r_executable, r_path=r_path, r_script=r_script)

        # convert doc files to csv and write them to the doc_to_csv path
        doc_parser.run_doc_to_csv_r_script(file_path=project_file_path, timeout=str(20))

        # NOTE: make sure this is pointed at the correct file location(doc_to_csv) path!
        file_gen = FileGenerator(project_file_path=doc_to_csv_path, file_ext='csv')
        file_iter = iter(file_gen)

        # extract the text from the converted doc files
        # NOTE: updated *06/27/19*
        # change current_file=doc_to_csv path since that's where the extracted txt files go
        while True:
            try:
                current = doc_parser.extract_text(current_file=next(file_iter))
                print(current)
            except StopIteration:
                print('finished processing doc files')
                break
        # print(f"Number of Docx files processed: {doc_parser.file_counter}")
        # print(f"Number of Docx error files: {doc_parser.error_file_counter}")
        # print(f"Mapping Dict: {doc_parser.mapping_dict}")

        # attempt to remove all the temporary files
        #doc_parser.remove_temp_doc_to_csv_files(doc_to_csv_write_path=doc_to_csv_path)
        serialize_data(
            data_set=doc_parser.mapping_dict,
            write_name='DocParserMappingDict'
        )
        try:
            # step 5: sftp the serialized mapping data into the remote server
            data_serialization = DataSerialization()  # create an instance of the DataSerialization class
            remote_pkl_path = data_serialization.get_remote_pickle_path()  # get the remote pickle path
            local_pkl_path = data_serialization.get_local_pickle_path()  # get the local pickle path
            sftp_connection = SftpConnection()  # create an instance of the SftpConnection class
            sftp_connection.connect(  # connect and load serialized data to the remote server
                filename=f'DocParserMappingDict_{d}.pickle',
                filepath=local_pkl_path,
                remote_path=remote_pkl_path
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 6: load in the meta data pandas DataFrame and sftp to the remote server
            md_log_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\metadata_log'
            md_file = 'metadata_log_06_27_2019'
            metadata_test_file_path = r'V:\Dev\Delta\20190619\Metadata'
            metadata_test_file = r'20190619.xls'
            meta_data = MetaData(metadata_log_file=os.path.join(md_log_path, md_file))

            meta_data_dict = meta_data.load_metadata_file(  # get the meta_data dictionary
                full_file_path=os.path.join(metadata_test_file_path, metadata_test_file)
            )
            # # step 6.1: extract only the necessary columns from the meta_data data frame
            # md_df = meta_data_df.loc[:,['claim_id', 'Object_id', 'File_Name', 'Format',
            #                          'Version', 'Object_name', 'Prior_Version_Object_Name']]
            # step 6.2: serialize the data frame and sftp it to the remote server
            local_mapping_path = data_serialization.get_local_mapping_file_path()
            remote_mapping_path = data_serialization.get_remote_mapping_file_path()
            serialize_data(  # serialize the pickled meta data, data frame
                data_set=meta_data_dict,
                write_name='MetaData_DataFrame',
                is_pickle=False
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 7: create a new connection and load in the mapping data
            new_sftp_connection = SftpConnection()
            new_sftp_connection.connect(  # connect to the remote server and dump the pickled meta data, data frame
                filename=f'MetaData_DataFrame_{d}.pickle',
                filepath=local_mapping_path,
                remote_path=remote_mapping_path
            )

            # step 8: update the admin file
            create_admin_spreadsheet(
                write_to_path=admin_dir, file_type='doc',
                count=get_file_count_by_extension(file_path=project_file_path, file_ext='doc'),
                count_extracted=doc_parser.file_counter, count_failed=doc_parser.error_file_counter,
                failed_file_path=project_file_path, failed_file_name=doc_parser.error_files
            )
            # step 9: execute the server side portion of the data pipeline
            remote_connection = RunRemoteScript(make_connection=True)  # connect to the remote server
            # step 9.1: execute the server side pipeline
            remote_connection.execute_server_side_pipeline()
        except Exception as e:
            logger.error(error=e)
    except Exception as e:
        print(e)
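run_doc_parser leaves the temporary-file clean-up commented out, while profile_doc_parser calls DocParser.remove_temp_doc_to_csv_files after extraction. A minimal sketch of what that clean-up presumably does, assuming the temporary files are just the .csv outputs in the doc_to_csv directory, is a glob-and-delete:

import glob
import os


def remove_temp_doc_to_csv_files(doc_to_csv_write_path: str) -> int:
    """Hypothetical sketch of DocParser.remove_temp_doc_to_csv_files: delete
    the temporary .csv files produced by the doc-to-csv R script once their
    text has been extracted, returning how many were removed."""
    removed = 0
    for csv_file in glob.glob(os.path.join(doc_to_csv_write_path, '*.csv')):
        try:
            os.remove(csv_file)
            removed += 1
        except OSError as err:
            print(f'could not remove {csv_file}: {err}')
    return removed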
def run_rtf_parser(project_file_path: str, metadata_file=None):
    raw_data = r'V:\Dev\Historical\20190521\Document'
    r_path = r'C:\Users\wmurphy\Desktop\R_Scripts'
    r_executable = r'C:\Program Files\R\R-3.5.3\bin\Rscript'
    r_script = r'doc_to_csv_4_29_19.R'
    try:
        file_gen = FileGenerator(project_file_path=project_file_path, file_ext='rtf')
        file_iter = iter(file_gen)
        rtf_parser = RtfParser()
        while True:
            try:
                print(f"RTF: Extracting Text.")
                rtf_parser.extract_text(current_file=next(file_iter))
            except StopIteration:
                print('finished processing rtf files')
                break
        # print(f"Number of Rtf files processed: {rtf_parser.file_counter}")
        # print(f"Number of Rtf error files: {rtf_parser.error_file_counter}")
        # print(f"Mapping Dict: {rtf_parser.mapping_dict}")
        serialize_data(
            data_set=rtf_parser.mapping_dict,
            write_name='RtfParserMappingDict'
        )
        try:
            # step 5: sftp the serialized mapping data into the remote server
            data_serialization = DataSerialization()  # create an instance of the DataSerialization class
            remote_pkl_path = data_serialization.get_remote_pickle_path()  # get the remote pickle path
            local_pkl_path = data_serialization.get_local_pickle_path()  # get the local pickle path
            sftp_connection = SftpConnection()  # create an instance of the SftpConnection class
            sftp_connection.connect(  # connect and load serialized data to the remote server
                filename=f'RtfParserMappingDict_{d}.pickle',
                filepath=local_pkl_path,
                remote_path=remote_pkl_path
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 6: load in the meta data pandas DataFrame and sftp to the remote server
            md_log_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\metadata_log'
            md_file = 'metadata_log_06_14_2019'
            metadata_test_file_path = r'V:\Dev\Delta\20190619\Metadata'
            metadata_test_file = r'20190619.xls'
            meta_data = MetaData(metadata_log_file=os.path.join(md_log_path, md_file))
            meta_data_dict = meta_data.load_metadata_file(  # get the meta_data dictionary
                full_file_path=os.path.join(metadata_test_file_path, metadata_test_file)
            )
            # # step 6.1: extract only the necessary columns from the meta_data data frame
            # md_df = meta_data_df.loc[:,['claim_id', 'Object_id', 'File_Name', 'Format',
            #                          'Version', 'Object_name', 'Prior_Version_Object_Name']]
            # step 6.2: serialize the data frame and sftp it to the remote server
            local_mapping_path = data_serialization.get_local_mapping_file_path()
            remote_mapping_path = data_serialization.get_remote_mapping_file_path()
            serialize_data(  # serialize the pickled meta data, data frame
                data_set=meta_data_dict,
                write_name='MetaData_DataFrame',
                is_pickle=False
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 7: create a new connection and load in the mapping data
            new_sftp_connection = SftpConnection()
            new_sftp_connection.connect(  # connect to the remote server and dump the pickled meta data, data frame
                filename=f'MetaData_DataFrame_{d}.pickle',
                filepath=local_mapping_path,
                remote_path=remote_mapping_path
            )
            # step 8: update the admin file
            create_admin_spreadsheet(
                write_to_path=admin_dir, file_type='rtf',
                count=get_file_count_by_extension(file_path=project_file_path, file_ext='rtf'),
                count_extracted=rtf_parser.file_counter, count_failed=rtf_parser.error_file_counter,
                failed_file_path=project_file_path, failed_file_name=rtf_parser.error_files
            )
            # step 9: execute the server side portion of the data pipeline
            remote_connection = RunRemoteScript(make_connection=True)  # connect to the remote server
            # step 9.1: execute the server side pipeline
            remote_connection.execute_server_side_pipeline()
        except Exception as e:
            logger.error(error=e)
    except Exception as e:
        logger.error(error=e)
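Each runner uploads its pickles through SftpConnection.connect(filename=..., filepath=..., remote_path=...). That class is not shown either; a hedged sketch of the same interface on top of paramiko, with the host and credential handling entirely assumed, might look like this:

import os
import paramiko


class SimpleSftpConnection:
    """Hypothetical stand-in for SftpConnection, sketched with paramiko.
    Host, port and credentials are placeholders, not the real configuration."""

    def __init__(self, host: str = 'remote.example.com', port: int = 22,
                 username: str = 'pipeline_user', password: str = 'change-me'):
        self.host, self.port = host, port
        self.username, self.password = username, password

    def connect(self, filename: str, filepath: str, remote_path: str) -> None:
        """Upload filepath/filename to remote_path/filename over SFTP."""
        transport = paramiko.Transport((self.host, self.port))
        try:
            transport.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            sftp.put(os.path.join(filepath, filename),
                     remote_path + '/' + filename)
            sftp.close()
        finally:
            transport.close()


# usage mirroring step 5 of the runners above
# SimpleSftpConnection().connect(filename=f'RtfParserMappingDict_{d}.pickle',
#                                filepath=local_pkl_path,
#                                remote_path=remote_pkl_path)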