Exemple #1
0
def process_input(app_context, base_dir):
    """Run text extraction over every input file of the request.

    For each entry returned by ``get_files`` the stored JSON document is
    loaded via ``get_json`` from ``base_dir``, its page paths are resolved and
    ``text_extraction`` is run. The per-file results are stored under
    ``app_context.application_context["outputs"]``.

    Args:
        app_context: Object exposing an ``application_context`` mapping; the
            mapping is updated in place with the ``"outputs"`` list.
        base_dir: Directory handed to ``get_json`` to load each file's JSON.

    Returns:
        ``(application_context, langs)`` on success, where ``langs`` holds one
        ``get_language()`` value per processed file; ``(None, None)`` if any
        step raised (the exception is logged, not propagated).
    """
    try:
        files = get_files(app_context.application_context)
        output_files = []
        langs = []
        for file_entry in files:
            # The request entry only supplies the file name; the document
            # itself is the first element of what get_json returns.
            file_json = get_json(base_dir, file_entry['file']['name'])[0]

            file_properties = File(file_json)
            if "page_info" in file_json.keys():
                page_paths = file_properties.get_pages()
            else:
                # NOTE(review): this uses config.BASE_DIR, not the base_dir
                # argument used for get_json above -- confirm it is intended.
                page_paths = doc_pre_processing(file_json['file']['name'],
                                                config.BASE_DIR)

            output_files.append(
                text_extraction(file_properties, page_paths, file_json))
            langs.append(file_properties.get_language())

        app_context.application_context["outputs"] = output_files
        log_info("successfully completed google vision ocr", None)

    except Exception as e:
        # Boundary handler: any failure is logged and reported to the caller
        # as (None, None) instead of being raised.
        log_exception("Error occured during google vision ocr",
                      app_context.application_context, e)
        return None, None

    return app_context.application_context, langs
Exemple #2
0
def process_input(app_context):
    """Run text extraction for every input file and record the results.

    Each entry returned by ``get_files`` yields one output dict holding the
    page paths (``"page_info"``), the original entry (``"file"``), the
    extraction result (``"pages"``) and a success message under ``"status"``.
    The list of these dicts is stored in
    ``app_context.application_context["outputs"]``.

    Args:
        app_context: Object exposing an ``application_context`` mapping; the
            mapping is updated in place.

    Returns:
        The updated ``application_context``, or ``None`` if any step raised
        (the exception is logged, not propagated).
    """
    try:
        results = []
        for entry in get_files(app_context.application_context):
            props = File(entry)
            # Use the page paths already recorded for the file when present,
            # otherwise run document pre-processing to obtain them.
            page_paths = (props.get_pages()
                          if "page_info" in entry.keys() else
                          doc_pre_processing(entry['file']['name'],
                                             config.BASE_DIR))
            extracted = text_extraction(page_paths)
            results.append({
                "status": {"message": "google-vision ocr run successfully"},
                "page_info": page_paths,
                "file": entry,
                "pages": extracted,
            })
        app_context.application_context["outputs"] = results
        log_info("successfully completed google vision ocr", None)
    except Exception as e:
        log_exception("Error occured during google vision ocr",
                      app_context.application_context, e)
        return None

    return app_context.application_context
Exemple #3
0
def preprocess_file(file_properties, lang, ocr_level):
    """Run text extraction on each page's regions and store it in the file.

    ``config.OCR_LEVEL[ocr_level]`` selects the granularity:

    * ``'words'``: for every line of a TEXT/TABLE region, the line's words are
      passed to ``text_extraction`` and the result replaces that line's
      ``'children'``; each region's ``'children'`` are then passed through
      ``merge_text``.
    * ``'lines'``: the lines of a TEXT/TABLE region are passed to
      ``text_extraction`` and the result replaces the region's ``'children'``.

    Regions of any other class are replaced by a deep copy of themselves. In
    both modes the page's region list is finally passed through ``merge_text``.

    Args:
        file_properties: Accessor object providing ``get_file``, ``get_pages``,
            ``get_pageinfo``, ``get_regions``, ``get_lines``,
            ``get_region_lines`` and ``get_region_words``.
        lang: Forwarded unchanged to ``text_extraction``.
        ocr_level: Key into ``config.OCR_LEVEL``.

    Returns:
        The object returned by ``file_properties.get_file()``, updated in
        place.
    """
    file = file_properties.get_file()
    page_paths = file_properties.get_pages()
    # get_pageinfo(0) is queried once; the same width/height is handed to
    # text_extraction for every page.
    width, height = file_properties.get_pageinfo(0)
    ocr_classes = ("TEXT", "TABLE")

    for page_index, page_path in enumerate(page_paths):
        page_regions = file_properties.get_regions(page_index)
        mode_height = frequent_height(file_properties.get_lines(page_index))
        level = config.OCR_LEVEL[ocr_level]

        if level == 'words':
            for idx, region in enumerate(page_regions):
                if region['class'] not in ocr_classes:
                    file['pages'][page_index]['regions'][idx] = copy.deepcopy(
                        region)
                else:
                    region_lines = file_properties.get_region_lines(
                        page_index, idx)
                    for line_index, _ in enumerate(region_lines):
                        words = file_properties.get_region_words(
                            page_index, idx, line_index)
                        # With IS_DYNAMIC the coordinate-adjusted words go in
                        # as the third argument; the unadjusted words are
                        # always passed as the fourth.
                        if config.IS_DYNAMIC:
                            adjusted = coord_adjustment(page_path, words)
                        else:
                            adjusted = words
                        ocr_result = text_extraction(lang, page_path,
                                                     adjusted, words, width,
                                                     height, mode_height)
                        out_lines = file['pages'][page_index]['regions'][idx][
                            'children']
                        out_lines[line_index]['children'] = ocr_result

                # Runs for every region, including the copied non-OCR ones.
                region_entry = file['pages'][page_index]['regions'][idx]
                region_entry['children'] = merge_text(
                    region_entry['children'], merge_tess_confidence=True)

            page_entry = file['pages'][page_index]
            page_entry['regions'] = merge_text(page_entry['regions'])

        elif level == 'lines':
            for idx, region in enumerate(page_regions):
                if region['class'] not in ocr_classes:
                    file['pages'][page_index]['regions'][idx] = copy.deepcopy(
                        region)
                    continue

                region_lines = file_properties.get_region_lines(
                    page_index, idx)
                if config.IS_DYNAMIC:
                    adjusted = coord_adjustment(page_path, region_lines)
                else:
                    adjusted = region_lines
                ocr_result = text_extraction(lang, page_path, adjusted,
                                             region_lines, width, height,
                                             mode_height)
                file['pages'][page_index]['regions'][idx][
                    'children'] = ocr_result

            page_entry = file['pages'][page_index]
            page_entry['regions'] = merge_text(page_entry['regions'])

    return file
Exemple #4
0
def preprocess_file(file_properties, lang, ocr_level):
    """Mask each page image, then run text extraction on the page's regions.

    For every page, ``mask_image`` is called and its result is recorded on the
    file through ``set_bg_image``. ``config.OCR_LEVEL[ocr_level]`` then selects
    the granularity:

    * ``'words'``: for every line of a region whose class is in
      ``config.ocr_class``, the line's words are passed to ``text_extraction``
      and the result replaces that line's ``'regions'``; each region's
      ``'regions'`` are then passed through ``merge_text``.
    * ``'lines'``: the lines of such a region are passed to
      ``text_extraction`` and the result is stored under the region's
      ``'children'``; the page's region list is then passed through
      ``merge_text``.

    Regions of any other class are replaced by a deep copy of themselves.

    Args:
        file_properties: Accessor object providing ``get_file``, ``get_pages``,
            ``get_pageinfo``, ``get_regions``, ``get_lines``,
            ``get_region_lines`` and ``get_region_words``.
        lang: Forwarded unchanged to ``text_extraction``.
        ocr_level: Key into ``config.OCR_LEVEL``.

    Returns:
        The file object (as returned by ``set_bg_image`` for the last page, or
        by ``file_properties.get_file()`` when there are no pages).
    """
    file = file_properties.get_file()
    page_paths = file_properties.get_pages()
    # get_pageinfo(0) is queried once; the same width/height is used for
    # every page.
    width, height = file_properties.get_pageinfo(0)

    for page_index, page_path in enumerate(page_paths):
        # The original format string had no placeholder, so the page index
        # was silently dropped from the message.
        print('processing for page : {}'.format(page_index))
        page_regions = file_properties.get_regions(page_index)

        save_path = mask_image(page_path, page_regions, page_index,
                               file_properties, width, height)
        file = set_bg_image(file, save_path, page_index)

        mode_height = frequent_height(file_properties.get_lines(page_index))
        level = config.OCR_LEVEL[ocr_level]

        if level == 'words':
            for idx, region in enumerate(page_regions):
                if region['class'] in config.ocr_class:
                    region_lines = file_properties.get_region_lines(
                        page_index, idx)
                    for line_index, _ in enumerate(region_lines):
                        region_words = file_properties.get_region_words(
                            page_index, idx, line_index)
                        if (config.IS_DYNAMIC
                                and region['class'] in config.DYNAMIC_CLASS):
                            # coord_adjustment receives a deep copy, so the
                            # words passed as the fourth argument below are
                            # not the object it was given.
                            ocr_input = coord_adjustment(
                                page_path, copy.deepcopy(region_words))
                        else:
                            ocr_input = region_words
                        region_ocr = text_extraction(lang, page_path,
                                                     ocr_input, region_words,
                                                     width, height,
                                                     mode_height)
                        file['pages'][page_index]['regions'][idx]['regions'][
                            line_index]['regions'] = region_ocr
                else:
                    file['pages'][page_index]['regions'][idx] = copy.deepcopy(
                        region)

                # Runs for every region, including the copied non-OCR ones.
                region_entry = file['pages'][page_index]['regions'][idx]
                region_entry['regions'] = merge_text(
                    region_entry['regions'], merge_tess_confidence=True)

        elif level == 'lines':
            for idx, region in enumerate(page_regions):
                if region['class'] in config.ocr_class:
                    region_lines = file_properties.get_region_lines(
                        page_index, idx)
                    if (config.IS_DYNAMIC
                            and region['class'] in config.DYNAMIC_CLASS):
                        ocr_input = coord_adjustment(
                            page_path, copy.deepcopy(region_lines))
                    else:
                        ocr_input = region_lines
                    region_ocr = text_extraction(lang, page_path, ocr_input,
                                                 region_lines, width, height,
                                                 mode_height)
                    # NOTE(review): the 'words' branch writes under 'regions'
                    # but this branch still writes under 'children' --
                    # confirm which key downstream consumers expect.
                    file['pages'][page_index]['regions'][idx][
                        'children'] = region_ocr
                else:
                    file['pages'][page_index]['regions'][idx] = copy.deepcopy(
                        region)

            file['pages'][page_index]['regions'] = merge_text(
                file['pages'][page_index]['regions'])

        # NOTE(review): app_context is neither a parameter nor a local here;
        # presumably a module-level name -- confirm it is in scope.
        log_info("successfully completed ocr for  page {}".format(page_index),
                 app_context.application_context)

    return file