Example #1
def profile_rtf_parser(full_file_path: str):
    """Profile the rtf parser"""
    try:
        start = time.time()
        file_gen = FileGenerator(project_file_path=full_file_path,
                                 file_ext='rtf')
        file_iter = file_gen.__iter__()
        rtf_parser = RtfParser()
        while True:
            try:
                rtf_parser.extract_text(current_file=next(file_iter))
            except StopIteration:
                print('finished processing rtf files')
                break
        end = time.time()
        print(f"Rtf Runtime: {end - start}")

        print(f"Rtfl file counter: {rtf_parser.file_counter}")
        print(f"Rtf error file counter: {rtf_parser.error_file_counter}")
        pprint(f"Rtf Content: {rtf_parser.mapping_dict.keys()}")
        create_admin_spreadsheet(write_to_path=admin_log_path,
                                 file_type='rtf',
                                 count_extracted=rtf_parser.file_counter,
                                 count_failed=rtf_parser.error_file_counter,
                                 failed_file_name=rtf_parser.error_files,
                                 failed_file_path=full_file_path,
                                 count=get_file_count_by_extension(
                                     file_path=full_file_path, file_ext='rtf'))
    except Exception as e:
        print(e)
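
Since FileGenerator implements __iter__, the manual next()/StopIteration loop above can also be written as a plain for loop. A minimal sketch, assuming the same FileGenerator and RtfParser imports used by this example and a hypothetical sample directory:

# sketch only: equivalent to the while/StopIteration loop in profile_rtf_parser
sample_path = r'C:\temp\rtf_samples'   # hypothetical directory of .rtf files
file_gen = FileGenerator(project_file_path=sample_path, file_ext='rtf')
rtf_parser = RtfParser()
for current_file in file_gen:
    rtf_parser.extract_text(current_file=current_file)
print('finished processing rtf files')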
Example #2
def profile_doc_parser(full_file_path: str):
    """Profile the doc parser"""
    try:
        start = time.time()
        r_path = r'C:\Users\wmurphy\Desktop\R_Scripts'
        r_executable = r'C:\Program Files\R\R-3.5.3\bin\Rscript'
        r_script = r'doc_to_csv_4_29_19.R'
        doc_to_csv_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\doc_to_csv'
        doc_parser = DocParser(r_executable=r_executable,
                               r_path=r_path,
                               r_script=r_script)
        # convert doc files to csv and write them to the doc_to_csv path
        doc_parser.run_doc_to_csv_r_script(file_path=full_file_path,
                                           timeout=str(20))
        file_gen = FileGenerator(project_file_path=doc_to_csv_path,
                                 file_ext='csv')
        file_iter = file_gen.__iter__()
        while True:
            try:
                current = doc_parser.extract_text(current_file=next(file_iter))
                print(current)
            except StopIteration:
                print('finished processing doc files')
                break
        doc_parser.remove_temp_doc_to_csv_files(
            doc_to_csv_write_path=doc_to_csv_path)
        end = time.time()
        print(f"Doc Runtime: {end - start}")
        print(f"Doc file counter: {doc_parser.file_counter}")
        print(f"Doc error file counter: {doc_parser.error_file_counter}")
        pprint(f"Doc Content: {doc_parser.mapping_dict.keys()}")
        create_admin_spreadsheet(write_to_path=admin_log_path,
                                 file_type='doc',
                                 count_extracted=doc_parser.file_counter,
                                 count_failed=doc_parser.error_file_counter,
                                 failed_file_name=doc_parser.error_files,
                                 failed_file_path=full_file_path,
                                 count=get_file_count_by_extension(
                                     file_path=full_file_path, file_ext='doc'))
        #print(f"Doc Content: {doc_parser.mapping_dict}")
    except Exception as e:
        print(e)
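
The start/end timing printed in these profiling helpers repeats the same pattern in each function. A small sketch of how it could be factored into a context manager; this is not part of the original module, and the label and path in the usage lines are placeholders:

import time
from contextlib import contextmanager

@contextmanager
def timed(label: str):
    """Print the wall-clock runtime of the wrapped block, mirroring the start/end pattern above."""
    start = time.time()
    try:
        yield
    finally:
        print(f"{label} Runtime: {time.time() - start}")

# usage sketch:
# with timed("Doc"):
#     profile_doc_parser(full_file_path=r'C:\temp\doc_samples')   # hypothetical path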
Example #3
def run_eml_parser(project_file_path: str, metadata_file=None):
    """Run the eml data processing class application"""
    try:
        # NOTE: updated *6/27/2019*
        # step 1: get the unloaded documents from the current project directory
        file_gen = FileGenerator(project_file_path=project_file_path, file_ext='eml')
        file_iter = file_gen.__iter__()
        eml_parser = EmlParser()
        while True:
            try:
                eml_parser.extract_text(current_file=next(file_iter))
            except StopIteration:
                print('finished processing eml files')
                break
        print(f"Number of Eml files processed: {eml_parser.file_counter}")
        print(f"Number of Eml error files: {eml_parser.error_file_counter}")
        print(f"Mapping Dict: {eml_parser.mapping_dict}")
        serialize_data(
            data_set=eml_parser.mapping_dict,
            write_name='EmlParserMappingDict'
        )
        try:
            # step 5: sftp the serialized mapping data into the remote server
            data_serialization = DataSerialization()  # create an instance of the DataSerialization class
            remote_pkl_path = data_serialization.get_remote_pickle_path()  # get the remote pickle path
            local_pkl_path = data_serialization.get_local_pickle_path()  # get the local pickle path
            sftp_connection = SftpConnection()  # create an instance of the SftpConnection class
            sftp_connection.connect(  # connect and load serialized data to the remote server
                filename=f'EmlParserMappingDict_{d}.pickle',
                filepath=local_pkl_path,
                remote_path=remote_pkl_path
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 6: load in the meta data pandas DataFrame and sftp to the remote server
            md_log_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\metadata_log'
            md_file = 'metadata_log_06_26_2019'
            metadata_test_file_path = r'V:\Dev\Delta\20190619\Metadata'
            metadata_test_file = r'20190619.xls'
            meta_data = MetaData(metadata_log_file=os.path.join(md_log_path, md_file))
            meta_data_dict = meta_data.load_metadata_file(  # get the meta_data dictionary
                full_file_path=os.path.join(metadata_test_file_path, metadata_test_file)
            )
            print(f'Meta Data Dict := {meta_data_dict}')
            # # step 6.1: extract only the necessary columns from the meta_data data frame
            # md_df = meta_data_df.loc[:,['claim_id', 'Object_id', 'File_Name', 'Format',
            #                          'Version', 'Object_name', 'Prior_Version_Object_Name']]
            # step 6.2: serialize the data frame and sftp it to the remote server
            local_mapping_path = data_serialization.get_local_mapping_file_path()
            remote_mapping_path = data_serialization.get_remote_mapping_file_path()
            serialize_data(  # serialize the pickled meta data, data frame
                # NOTE: *updated 06/27/2019*
                # pass in the path from metadata_config.load_metadata_file function
                data_set=meta_data_dict,
                is_local=True,
                write_name='MetaData_DataFrame',
                is_pickle=False
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 7: create a new connection and load in the mapping data
            new_sftp_connection = SftpConnection()
            new_sftp_connection.connect(  # connect to the remote server and dump the pickled meta data, data frame
                filename=f'MetaData_DataFrame_{d}.pickle',
                filepath=local_mapping_path,
                remote_path=remote_mapping_path
            )

            # TEST: check the output of the file counter, error file counter and error files
            print(f"EML File count in filesystem: {get_file_count_by_extension(file_path=project_file_path, file_ext='eml')}")
            print(f"EML Error files list: {eml_parser.error_files}")
            print(f"EML Total files: {eml_parser.file_counter}")
            print(f"EML Total error files: {eml_parser.error_file_counter}")

            # step 8: update the admin file
            create_admin_spreadsheet(
                write_to_path=admin_dir, file_type='eml',
                count=get_file_count_by_extension(file_path=project_file_path, file_ext='eml'),
                count_extracted=eml_parser.file_counter, count_failed=eml_parser.error_file_counter,
                failed_file_path=project_file_path, failed_file_name=eml_parser.error_files
            )
            # step 8: execute the server side portion of the data pipeline
            remote_connection = RunRemoteScript(make_connection=True)  # connect to the remote server
            # step 8.1: execute the server side pipeline
            remote_connection.execute_server_side_pipeline()
        except Exception as e:
            logger.error(error=e)
    except Exception as e:
        logger.error(error=e)
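
The {d} interpolated into the pickle filenames above comes from a module-level value that is not shown in these snippets. A plausible sketch, assuming it is the run date (str(d) would then also match the '-'-to-'_' replacement seen later in create_email):

from datetime import date

d = date.today()   # assumption: the run date, used to stamp names like f'EmlParserMappingDict_{d}.pickle'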
Example #4
def run_pdf_parser_v_1_1(project_file_path: str, metadata_file=None):
    """Run the pdf data processing class application"""
    pdf_parser = PdfParser()    # create an instance of the PdfParser class
    file_gen = FileGenerator(
        project_file_path=project_file_path, # create an instance of the FileGenerator class
        file_ext='pdf'
    )
    file_iter = file_gen.__iter__() # create a file iterator
    pdf_ocr_two = PdfOcrTwo()

    while True:
        # step 1: iterate through the pdfs
        try:
            current_pdf = next(file_iter)   #  get the current pdf
            pdf_parser.extract_text(current_file=current_pdf)   # extract any text
            pdf_ocr_two.extract_img_minecart(full_file_name=current_pdf)   # extract pdf images
        except StopIteration:
            print("finished processing pdf files")
            break

    # step 2: run the pdf image algorithm
    pdf_name_list, pdf_img_data_struct = process_pdf_img_algo()

    for n in pdf_img_data_struct.keys():
        # step 3: get the current pdf image file name
        for m in pdf_parser.mapping_container:
            # step 3.1: search the mapping container for a matching file name
            fn = m['file_name'].split('.')[0]   # format the filename
            if n == fn:
                # step 3.2: if names match, search through the pdf img container for a matching pg
                for pg in pdf_img_data_struct[n].keys():
                    pg = int(pg) # convert to the same type as mapping_container pages
                    if pg in list(m.keys())[1:]:
                        # step 3.3: join the text and insert to the correct pdf's page
                        full_text = m[pg] + pdf_img_data_struct[n][str(pg)]
                        m[pg] = full_text
    pprint(f"PDF Text: {pdf_parser.mapping_container}")
    # NOTE: Comment back in after tests
    # step 4: write the mapping file to a pickle file and save in the pickle file directory
    serialize_data(
        data_set=pdf_parser.mapping_container,
        write_name='PdfParserMappingContainer'
    )
    try:
        # step 5: sftp the serialized mapping data into the remote server
        data_serialization = DataSerialization()    # create an instance of the DataSerialization class
        remote_pkl_path = data_serialization.get_remote_pickle_path()   # get the remote pickle path
        local_pkl_path = data_serialization.get_local_pickle_path()     # get the local pickle path
        sftp_connection = SftpConnection()  # create an instance of the SftpConnection class
        sftp_connection.connect(    # connect and load serialized data to the remote server
            filename=f'PdfParserMappingContainer_{d}.pickle',
            filepath=local_pkl_path,
            remote_path=remote_pkl_path
        )
    except Exception as e:
        logger.error(error=e)
    try:
        # step 6: load in the meta data pandas DataFrame and sftp to the remote server
        md_log_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\metadata_log'
        md_file = 'metadata_log_06_14_2019'
        metadata_test_file_path = r'V:\Dev\Delta\20190619\Metadata'
        metadata_test_file = r'20190619.xls'
        meta_data = MetaData(metadata_log_file=os.path.join(md_log_path, md_file))
        meta_data_dict = meta_data.load_metadata_file(    # get the meta_data dictionary
            full_file_path=os.path.join(metadata_test_file_path, metadata_test_file)
        )
        # # step 6.1: extract only the necessary columns from the meta_data data frame
        # md_df = meta_data_df.loc[:,['claim_id', 'Object_id', 'File_Name', 'Format',
        #                          'Version', 'Object_name', 'Prior_Version_Object_Name']]
        # step 6.2: serialize the data frame and sftp it to the remote server
        local_mapping_path = data_serialization.get_local_mapping_file_path()
        remote_mapping_path = data_serialization.get_remote_mapping_file_path()
        serialize_data(     # serialize the pickled meta data, data frame
            data_set=meta_data_dict,
            write_name='MetaData_DataFrame',
            is_pickle=False
        )
    except Exception as e:
        logger.error(error=e)
    try:
        # step 7: create a new connection and load in the mapping data
        new_sftp_connection = SftpConnection()
        new_sftp_connection.connect(    # connect to the remote server and dump the pickled meta data, data frame
            filename=f'MetaData_DataFrame_{d}.pickle',
            filepath=local_mapping_path,
            remote_path=remote_mapping_path
        )
        # step 8: update the admin file
        create_admin_spreadsheet(
            write_to_path=admin_dir, file_type='pdf',
            count=get_file_count_by_extension(file_path=project_file_path, file_ext='pdf'),
            count_extracted=pdf_parser.file_counter, count_failed=pdf_parser.error_file_counter,
            failed_file_path=project_file_path, failed_file_name=pdf_parser.error_files
        )
        # step 9: execute the server side portion of the data pipeline
        remote_connection = RunRemoteScript(make_connection=True) # connect to the remote server
        # step 9.1: execute the server side pipeline
        remote_connection.execute_server_side_pipeline()
    except Exception as e:
        logger.error(error=e)
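
The step-3 merge above joins OCR text from pdf_img_data_struct into the page-keyed entries of pdf_parser.mapping_container. A toy walk-through with hypothetical data, assuming integer page keys plus a leading 'file_name' key on the mapping side and string page keys on the image side, as the code implies:

# sketch only: merge OCR'd image text into the matching pdf's page entry
mapping_entry = {'file_name': 'claim_123.pdf', 1: 'typed text on page 1. '}
img_struct = {'claim_123': {'1': 'ocr text from an embedded image'}}

name = mapping_entry['file_name'].split('.')[0]   # 'claim_123'
if name in img_struct:
    for pg_str, ocr_text in img_struct[name].items():
        pg = int(pg_str)
        if pg in mapping_entry:
            mapping_entry[pg] = mapping_entry[pg] + ocr_text

print(mapping_entry[1])   # typed text on page 1. ocr text from an embedded image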
Example #5
def run_doc_parser(project_file_path: str, metadata_file=None):
    """Run the doc data processing class application"""
    raw_data = r'V:\Dev\Historical\20190521\Document'
    r_path = r'C:\Users\wmurphy\Desktop\R_Scripts'
    r_executable = r'C:\Program Files\R\R-3.5.3\bin\Rscript'
    r_script = r'doc_to_csv_4_29_19.R'
    doc_to_csv_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\doc_to_csv'
    try:
        # create an instance of the DocParser class
        doc_parser = DocParser(r_executable=r_executable, r_path=r_path, r_script=r_script)

        # convert doc files to csv and write them to the doc_to_csv path
        doc_parser.run_doc_to_csv_r_script(file_path=project_file_path, timeout=str(20))

        # NOTE: make sure this is pointed at the correct file location(doc_to_csv) path!
        file_gen = FileGenerator(project_file_path=doc_to_csv_path, file_ext='csv')
        file_iter = file_gen.__iter__()

        # extract the text from the converted doc files
        # NOTE: updated *06/27/19*
        # change current_file=doc_to_csv path since that's where the extracted txt files go
        while True:
            try:
                current = doc_parser.extract_text(current_file=next(file_iter))
                print(current)
            except StopIteration:
                print('finished processing doc files')
                break
        # print(f"Number of Docx files processed: {doc_parser.file_counter}")
        # print(f"Number of Docx error files: {doc_parser.error_file_counter}")
        # print(f"Mapping Dict: {doc_parser.mapping_dict}")

        # attempt to remove all the temporary files
        #doc_parser.remove_temp_doc_to_csv_files(doc_to_csv_write_path=doc_to_csv_path)
        serialize_data(
            data_set=doc_parser.mapping_dict,
            write_name='DocParserMappingDict'
        )
        try:
            # step 5: sftp the serialized mapping data into the remote server
            data_serialization = DataSerialization()  # create an instance of the DataSerialization class
            remote_pkl_path = data_serialization.get_remote_pickle_path()  # get the remote pickle path
            local_pkl_path = data_serialization.get_local_pickle_path()  # get the local pickle path
            sftp_connection = SftpConnection()  # create an instance of the SftpConnection class
            sftp_connection.connect(  # connect and load serialized data to the remote server
                filename=f'DocParserMappingDict_{d}.pickle',
                filepath=local_pkl_path,
                remote_path=remote_pkl_path
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 6: load in the meta data pandas DataFrame and sftp to the remote server
            md_log_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\metadata_log'
            md_file = 'metadata_log_06_27_2019'
            metadata_test_file_path = r'V:\Dev\Delta\20190619\Metadata'
            metadata_test_file = r'20190619.xls'
            meta_data = MetaData(metadata_log_file=os.path.join(md_log_path, md_file))

            meta_data_dict = meta_data.load_metadata_file(  # get the meta_data data frame
                full_file_path=os.path.join(metadata_test_file_path, metadata_test_file)
            )
            # # step 6.1: extract only the necessary columns from the meta_data data frame
            # md_df = meta_data_df.loc[:,['claim_id', 'Object_id', 'File_Name', 'Format',
            #                          'Version', 'Object_name', 'Prior_Version_Object_Name']]
            # step 6.2: serialize the data frame and sftp it to the remote server
            local_mapping_path = data_serialization.get_local_mapping_file_path()
            remote_mapping_path = data_serialization.get_remote_mapping_file_path()
            serialize_data(  # serialize the pickled meta data, data frame
                data_set=meta_data_dict,
                write_name='MetaData_DataFrame',
                is_pickle=False
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 7: create a new connection and load in the mapping data
            new_sftp_connection = SftpConnection()
            new_sftp_connection.connect(  # connect to the remote server and dump the pickled meta data, data frame
                filename=f'MetaData_DataFrame_{d}.pickle',
                filepath=local_mapping_path,
                remote_path=remote_mapping_path
            )

            # step 8: update the admin file
            create_admin_spreadsheet(
                write_to_path=admin_dir, file_type='doc',
                count=get_file_count_by_extension(file_path=project_file_path, file_ext='doc'),
                count_extracted=doc_parser.file_counter, count_failed=doc_parser.error_file_counter,
                failed_file_path=project_file_path, failed_file_name=doc_parser.error_files
            )
            # step 9: execute the server side portion of the data pipeline
            remote_connection = RunRemoteScript(make_connection=True)  # connect to the remote server
            # step 9.1: execute the server side pipeline
            remote_connection.execute_server_side_pipeline()
        except Exception as e:
            logger.error(error=e)
    except Exception as e:
        print(e)
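
run_doc_to_csv_r_script itself is not shown in these examples. A purely illustrative sketch of how an Rscript conversion could be driven from Python with subprocess; the helper name and argument order are assumptions, not the DocParser implementation:

import os
import subprocess

def run_rscript(r_executable: str, r_path: str, r_script: str, file_path: str, timeout: str) -> None:
    """Illustrative only: invoke an Rscript that converts .doc files to .csv."""
    subprocess.run(
        [r_executable, os.path.join(r_path, r_script), file_path],
        check=True,              # raise if the R process exits non-zero
        timeout=int(timeout),    # the examples pass timeout as str(20)
    )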
Example #6
def run_rtf_parser(project_file_path: str, metadata_file=None):
    """Run the rtf data processing class application"""
    raw_data = r'V:\Dev\Historical\20190521\Document'
    r_path = r'C:\Users\wmurphy\Desktop\R_Scripts'
    r_executable = r'C:\Program Files\R\R-3.5.3\bin\Rscript'
    r_script = r'doc_to_csv_4_29_19.R'
    try:
        file_gen = FileGenerator(project_file_path=project_file_path, file_ext='rtf')
        file_iter = file_gen.__iter__()
        rtf_parser = RtfParser()
        while True:
            try:
                print(f"RTF: Extracting Text.")
                rtf_parser.extract_text(current_file=next(file_iter))
            except StopIteration:
                print('finished processing rtf files')
                break
        # print(f"Number of Rtf files processed: {rtf_parser.file_counter}")
        # print(f"Number of Rtf error files: {rtf_parser.error_file_counter}")
        # print(f"Mapping Dict: {rtf_parser.mapping_dict}")
        serialize_data(
            data_set=rtf_parser.mapping_dict,
            write_name='RtfParserMappingDict'
        )
        try:
            # step 5: sftp the serialized mapping data into the remote server
            data_serialization = DataSerialization()  # create an instance of the DataSerialization class
            remote_pkl_path = data_serialization.get_remote_pickle_path()  # get the remote pickle path
            local_pkl_path = data_serialization.get_local_pickle_path()  # get the local pickle path
            sftp_connection = SftpConnection()  # create an instance of the SftpConnection class
            sftp_connection.connect(  # connect and load serialized data to the remote server
                filename=f'RtfParserMappingDict_{d}.pickle',
                filepath=local_pkl_path,
                remote_path=remote_pkl_path
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 6: load in the meta data pandas DataFrame and sftp to the remote server
            md_log_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\metadata_log'
            md_file = 'metadata_log_06_14_2019'
            metadata_test_file_path = r'V:\Dev\Delta\20190619\Metadata'
            metadata_test_file = r'20190619.xls'
            meta_data = MetaData(metadata_log_file=os.path.join(md_log_path, md_file))
            meta_data_dict = meta_data.load_metadata_file(  # get the meta_data data frame
                full_file_path=os.path.join(metadata_test_file_path, metadata_test_file)
            )
            # # step 6.1: extract only the necessary columns from the meta_data data frame
            # md_df = meta_data_df.loc[:,['claim_id', 'Object_id', 'File_Name', 'Format',
            #                          'Version', 'Object_name', 'Prior_Version_Object_Name']]
            # step 6.2: serialize the data frame and sftp it to the remote server
            local_mapping_path = data_serialization.get_local_mapping_file_path()
            remote_mapping_path = data_serialization.get_remote_mapping_file_path()
            serialize_data(  # serialize the pickled meta data, data frame
                data_set=meta_data_dict,
                write_name='MetaData_DataFrame',
                is_pickle=False
            )
        except Exception as e:
            logger.error(error=e)
        try:
            # step 7: create a new connection and load in the mapping data
            new_sftp_connection = SftpConnection()
            new_sftp_connection.connect(  # connect to the remote server and dump the pickled meta data, data frame
                filename=f'MetaData_DataFrame_{d}.pickle',
                filepath=local_mapping_path,
                remote_path=remote_mapping_path
            )
            # step 8: update the admin file
            create_admin_spreadsheet(
                write_to_path=admin_dir, file_type='rtf',
                count=get_file_count_by_extension(file_path=project_file_path, file_ext='rtf'),
                count_extracted=rtf_parser.file_counter, count_failed=rtf_parser.error_file_counter,
                failed_file_path=project_file_path, failed_file_name=rtf_parser.error_files
            )
            # step 9: execute the server side portion of the data pipeline
            remote_connection = RunRemoteScript(make_connection=True)  # connect to the remote server
            # step 9.1: execute the server side pipeline
            remote_connection.execute_server_side_pipeline()
        except Exception as e:
            logger.error(error=e)
    except Exception as e:
        logger.error(error=e)
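
Examples #3 through #6 repeat the same serialize-and-SFTP step for each parser's mapping. A sketch of how that step could be shared, assuming the serialize_data, DataSerialization, SftpConnection, and module-level d used above:

def push_mapping_to_remote(mapping, write_name: str) -> None:
    """Sketch only: serialize a parser mapping and SFTP it, mirroring 'step 5' in the examples above."""
    serialize_data(data_set=mapping, write_name=write_name)
    data_serialization = DataSerialization()
    sftp_connection = SftpConnection()
    sftp_connection.connect(
        filename=f'{write_name}_{d}.pickle',
        filepath=data_serialization.get_local_pickle_path(),
        remote_path=data_serialization.get_remote_pickle_path(),
    )

# usage sketch:
# push_mapping_to_remote(rtf_parser.mapping_dict, 'RtfParserMappingDict')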
Example #7
def create_email(business_segment: str, hadoop_env: str, python_path: str, code_version: float,
                 hadoop_path: str, hive: str, daily_win_total: int, dms_total: int,
                 total_python: int, total_hdfs: int, total_hive: int):
    """creates the email that will be sent out to end users"""
    # step1: setup jinja2 template setup
    html_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\versioned_code\unstructured_data_pipeline_07_11_19\templates'
    html_file = 'count_loc_type.html'
    file_loader = FileSystemLoader(html_path)
    environment = Environment(loader=file_loader)
    template = environment.get_template(html_file)

    # system file counts
    pdf_fs_count = get_file_count_by_extension(file_path=test_path,  file_ext='pdf')
    rtf_fs_count = get_file_count_by_extension(file_path=test_path,  file_ext='rtf')
    docx_fs_count = get_file_count_by_extension(file_path=test_path, file_ext='docx')
    doc_fs_count = get_file_count_by_extension(file_path=test_path,  file_ext='doc')
    eml_fs_count = get_file_count_by_extension(file_path=test_path,  file_ext='eml')

    # total supported system files
    total_supported_sys_files = pdf_fs_count + rtf_fs_count + docx_fs_count + doc_fs_count + eml_fs_count

    # hive counts
    remote_script = RunRemoteScript(make_connection=True)
    pdf_hive_count = remote_script.execute_hive_count(hive_table='pdfdata_2019_07_01') # get the count of files from hive
    rtf_hive_count = remote_script.execute_hive_count(hive_table='rtfdata_2019_07_01')
    docx_hive_count = remote_script.execute_hive_count(hive_table='docxdata_2019_07_01')
    doc_hive_count = remote_script.execute_hive_count(hive_table='docdata_2019_07_01')
    eml_hive_count = remote_script.execute_hive_count(hive_table='emldata_2019_07_01')

    # total hive count
    total_hive_count = int(pdf_hive_count) + int(rtf_hive_count) + int(docx_hive_count) + int(doc_hive_count) + int(eml_hive_count)

    # dms counts
    md_log_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\metadata_log'
    md_file = 'metadata_log_06_14_2019'
    metadata_test_file_path = r'V:\Dev\Delta\20190619\Metadata'
    metadata_test_file = r'20190619.xls'
    meta_data = MetaData(metadata_log_file=os.path.join(md_log_path, md_file))
    meta_data_df = meta_data.load_metadata_as_df(os.path.join(metadata_test_file_path, metadata_test_file))

    pdf_dms_count = meta_data_df[meta_data_df['Format'] == 'pdf'].shape[0]
    rtf_dms_count = meta_data_df[meta_data_df['Format'] == 'rtf'].shape[0]
    docx_dms_count = meta_data_df[meta_data_df['Format'] == 'docx'].shape[0]
    doc_dms_count = meta_data_df[meta_data_df['Format'] == 'doc'].shape[0]
    eml_dms_count = meta_data_df[meta_data_df['Format'] == 'eml'].shape[0]

    # total dms count
    total_dms_count = pdf_dms_count + rtf_dms_count + docx_dms_count + doc_dms_count + eml_dms_count


    # python data mining count
    pdf_gen = FileGenerator(project_file_path=test_path, file_ext='pdf')
    pdf_iter = pdf_gen.__iter__()
    pdf_parser = PdfParser()
    while True:
        try:
            current = pdf_parser.extract_text(current_file=next(pdf_iter))
        except StopIteration:
            print('finished processing pdf files')
            break
    pdf_ocr_count = len(pdf_parser.mapping_container)

    rtf_gen = FileGenerator(project_file_path=test_path, file_ext='rtf')
    rtf_iter = rtf_gen.__iter__()
    rtf_parser = RtfParser()
    while True:
        try:
            current = rtf_parser.extract_text(current_file=next(rtf_iter))
        except StopIteration:
            print('finished processing rtf files')
            break
    rtf_ocr_count = len(rtf_parser.mapping_dict.keys())

    docx_gen = FileGenerator(project_file_path=test_path, file_ext='docx')
    docx_iter = docx_gen.__iter__()
    docx_parser = DocxParser()
    while True:
        try:
            current = docx_parser.extract_text(current_file=next(docx_iter))
        except StopIteration:
            print('finished processing docx files')
            break
    docx_ocr_count = len(docx_parser.mapping_dict.keys())

    eml_gen = FileGenerator(project_file_path=test_path, file_ext='eml')
    eml_iter = eml_gen.__iter__()
    eml_parser = EmlParser()
    while True:
        try:
            current = eml_parser.extract_text(current_file=next(eml_iter))
        except StopIteration:
            print('finished processing eml files')
            break
    eml_ocr_count = len(eml_parser.mapping_dict.keys())

    r_path = r'C:\Users\wmurphy\Desktop\R_Scripts'
    r_executable = r'C:\Program Files\R\R-3.5.3\bin\Rscript'
    r_script = r'doc_to_csv_4_29_19.R'
    doc_to_csv_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\doc_to_csv'

    doc_parser = DocParser(r_executable=r_executable, r_path=r_path, r_script=r_script)
    # convert the doc files to csv in the doc_to_csv path before iterating over them
    doc_parser.run_doc_to_csv_r_script(file_path=test_path, timeout=str(20))
    doc_gen = FileGenerator(project_file_path=doc_to_csv_path, file_ext='csv')
    doc_iter = doc_gen.__iter__()
    while True:
        try:
            current = doc_parser.extract_text(current_file=next(doc_iter))
        except StopIteration:
            print('finished processing doc files')
            break
    doc_ocr_count = len(doc_parser.mapping_dict.keys())

    # total ocr counts
    total_ocr_count = pdf_ocr_count + rtf_ocr_count + docx_ocr_count + doc_ocr_count + eml_ocr_count

    hive_tables = list(map(lambda x: x + '_' + str(d).replace('-', '_'),
                           ['docdata', 'docxdata', 'emldata', 'rtfdata', 'pdfdata']))


    output = template.render(
                             business_segment=business_segment,
                             hadoop_env=t_path, python_path=python_path,
                             code_version=code_version, linux_path='/genre/bda/apps/data_pipeline',
                             hive=hive, daily_win_total=total_supported_sys_files, dms_total=total_dms_count,
                             total_python=total_ocr_count, total_hdfs=total_hdfs,
                             total_hive=total_hive,
                             logfile_path=r'\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims\logging',
                             hive_tables=hive_tables,
                             pdf_hive_count=pdf_hive_count, pdf_fs_count=pdf_fs_count, rtf_hive_count=rtf_hive_count,
                             rtf_fs_count=rtf_fs_count, eml_hive_count=eml_hive_count, eml_fs_count=eml_fs_count,
                             doc_hive_count=doc_hive_count, doc_fs_count=doc_fs_count, docx_hive_count=docx_hive_count,
                             docx_fs_count=docx_fs_count, pdf_dms_count=pdf_dms_count,
                             rtf_dms_count=rtf_dms_count, docx_dms_count=docx_dms_count,
                             doc_dms_count=doc_dms_count, eml_dms_count=eml_dms_count, pdf_ocr_count=pdf_ocr_count,
                             eml_ocr_count=eml_ocr_count, rtf_ocr_count=rtf_ocr_count, docx_ocr_count=docx_ocr_count,
                             doc_ocr_count=doc_ocr_count)
    pipeline_email = PipelineEmailer()
    pipeline_email.host___ = "mail.genre.com"
    #pipeline_email.to___ = ['*****@*****.**']
    pipeline_email.to___ = ['*****@*****.**']
    pipeline_email.body___ = output
    pipeline_email.from___ = '*****@*****.**'
    pipeline_email.subject___ = 'Python Emailer Test' + str(datetime.now())
    pipeline_email.send_email()


# if __name__ == '__main__':
#     python_path = r'Y:\Shared\USD\Business Data and Analytics\Unstructured_Data_Pipeline\DMS_Claims'
#     py_path = os.path.splitdrive(python_path)[1]
#     create_email(business_segment='NA Prop/Fac', python_path=py_path, hadoop_env=t_path,
#                  code_version=1.0, hadoop_path='P:ath/to/hadoop', hive='MSEAULXDA03',
#                  daily_win_total=100, dms_total=100, total_python=100, total_hdfs=99, total_hive=100)
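
For reference, the hive_tables expression in create_email stamps each base table name with the run date. A small worked sketch, assuming d is a datetime.date:

from datetime import date

d = date(2019, 7, 1)
hive_tables = list(map(lambda x: x + '_' + str(d).replace('-', '_'),
                       ['docdata', 'docxdata', 'emldata', 'rtfdata', 'pdfdata']))
# ['docdata_2019_07_01', 'docxdata_2019_07_01', 'emldata_2019_07_01',
#  'rtfdata_2019_07_01', 'pdfdata_2019_07_01']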