def extract_images(app_context, base_dir):
    """Collect the image paths for every input file of the request.

    PDF inputs are expanded through ``extract_pdf_images``; image inputs
    (PNG, JPEG/JPG, BMP) are referenced directly as ``base_dir/<name>``.
    The file format is matched case-insensitively.

    Args:
        app_context: Object whose ``application_context`` attribute is passed
            to ``get_files`` and to the logging helpers.
        base_dir: Directory handed to ``extract_pdf_images`` and joined with
            the name of image inputs.

    Returns:
        A list with one entry per input file. For image inputs the entry is a
        single-element list holding the joined path; for PDFs it is whatever
        ``extract_pdf_images`` returns. ``None`` is returned as soon as one
        file has an unsupported format, or if an exception is raised while
        the files are being processed.
    """
    image_formats = ('png', 'jpeg', 'jpg', 'bmp')

    files = get_files(app_context.application_context)
    file_images = []
    try:
        for file in files:
            file_properties = File(file)
            file_format = file_properties.get_format()
            # Compare case-insensitively so that e.g. 'JPG' or 'Pdf' are
            # accepted just like 'jpg' and 'pdf'. Non-string formats are left
            # as they are and fall through to the "unsupported" branch.
            if isinstance(file_format, str):
                normalized_format = file_format.lower()
            else:
                normalized_format = file_format

            if normalized_format == 'pdf':
                filename = file_properties.get_name()
                image_paths = extract_pdf_images(filename, base_dir)
                file_images.append(image_paths)
            elif normalized_format in image_formats:
                filename = file_properties.get_name()
                image_paths = [os.path.join(base_dir, filename)]
                file_images.append(image_paths)
            else:
                # A single unsupported file aborts the whole batch.
                log_info(
                    "currently we do not support {} files .".format(
                        file_format), app_context.application_context)
                return None
    except Exception as e:
        log_error('error extracting images of' + str(e),
                  app_context.application_context, e)
        return None

    return file_images
Beispiel #2
0
def process_info(app_context, base_dir):
    """Run the tesseract OCR step over every input file of the request.

    For each entry returned by ``get_files`` the corresponding JSON document
    is loaded with ``get_json``, wrapped in ``File``, and passed through
    ``preprocess_file`` together with the OCR level and language obtained
    from ``get_ocr_config``. The processed document is tagged with the
    original ``file`` and ``config`` entries plus a success status, and the
    list of processed documents is stored under
    ``application_context["outputs"]``.

    Args:
        app_context: Object whose ``application_context`` mapping supplies the
            inputs and receives the ``"outputs"`` list.
        base_dir: Passed as the second argument to ``get_json``.

    Returns:
        ``app_context.application_context`` on success, ``None`` if any
        exception is raised (the exception is logged via ``log_exception``).
    """
    try:
        files = get_files(app_context.application_context)
        output = []
        for file_new in files:
            start_time = time.time()
            file = get_json(file_new['file']['name'], base_dir)[0]
            file_properties = File(file)
            ocr_level, lang = get_ocr_config(file_new,
                                             file_properties.get_pages())
            file = preprocess_file(file_properties, lang, ocr_level)
            file['file'] = file_new['file']
            file['config'] = file_new['config']
            file['status'] = {
                'code': 200,
                'message': "tesseract ocr successful"
            }
            output.append(file)

            elapsed = time.time() - start_time
            page_count = len(file_properties.get_pages())
            # Guard against documents without pages: the timing figure is
            # only logged, so it must not make the whole request fail with a
            # ZeroDivisionError after the OCR work has already succeeded.
            extraction_time = elapsed / page_count if page_count else elapsed
            log_info(
                'tesseract ocr per page completed in {}'.format(
                    extraction_time), app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed tesseract ocr", None)
    except Exception as e:
        log_exception("Error occured during tesseract ocr ",
                      app_context.application_context, e)
        return None

    return app_context.application_context
Beispiel #3
0
def process_input(app_context, base_dir):
    """Run google vision OCR text extraction for every input file.

    Each entry from ``get_files`` is resolved to its JSON document via
    ``get_json(base_dir, name)``. Page paths come from the document itself
    when it carries a ``"page_info"`` key, otherwise from
    ``doc_pre_processing``. The per-file result of ``text_extraction`` is
    collected under ``application_context["outputs"]`` and the language of
    each file is collected in a parallel list.

    Args:
        app_context: Object whose ``application_context`` mapping supplies the
            inputs and receives the ``"outputs"`` list.
        base_dir: Passed as the first argument to ``get_json``.

    Returns:
        ``(application_context, langs)`` on success, ``(None, None)`` if any
        exception is raised (the exception is logged via ``log_exception``).
    """
    try:
        entries = get_files(app_context.application_context)
        extraction_results = []
        langs = []
        for entry in entries:
            document = get_json(base_dir, entry['file']['name'])[0]
            properties = File(document)

            # Prefer page paths already present in the document; otherwise
            # generate them through the pre-processing step.
            has_page_info = "page_info" in document.keys()
            pages = (properties.get_pages() if has_page_info else
                     doc_pre_processing(document['file']['name'],
                                        config.BASE_DIR))

            extraction_results.append(
                text_extraction(properties, pages, document))
            langs.append(properties.get_language())

        app_context.application_context["outputs"] = extraction_results
        log_info("successfully completed google vision ocr", None)

    except Exception as e:
        log_exception("Error occured during google vision ocr",
                      app_context.application_context, e)
        return None, None

    return app_context.application_context, langs
Beispiel #4
0
def get_segmented_regions(app_context, base_dir):
    """Run the block-segmenter step over every input file of the request.

    Each entry from ``get_files`` is resolved with
    ``get_json(base_dir, name)`` and wrapped in ``File``. For every page an
    empty font-properties list is stored through ``set_font_properties``;
    the actual segmentation calls are currently commented out (see the note
    inside the page loop). The resulting documents, each tagged with a
    success status, are stored under ``application_context["outputs"]``.

    Args:
        app_context: Object whose ``application_context`` mapping supplies the
            inputs and receives the ``"outputs"`` list.
        base_dir: Passed as the first argument to ``get_json``.

    Returns:
        ``app_context.application_context`` on success, ``None`` if any
        exception is raised (the exception is logged via ``log_exception``).
    """
    try:
        files = get_files(app_context.application_context)
        output = []
        for file in files:
            document = get_json(base_dir, file['file']['name'])
            file_properties = File(document)
            page_counts = len(file_properties.get_pages())
            start_time = time.time()
            for page_index in range(page_counts):
                print('processing for page   :  ', page_index)
                # NOTE: the segmentation pipeline below is disabled; only an
                # empty font-properties list is written for each page.
                # page_lines   =  file_properties.get_lines(page_index)
                # page_regions =  file_properties.get_regions(page_index)
                # page_words   =  file_properties.get_words(page_index)
                # font_meta    = font_properties(file_properties.get_page(page_index))
                # page_regions =  region_unifier.region_unifier(page_lines,page_regions)
                # file_properties.set_regions(page_index, segment_regions(page_words,page_lines,page_regions))
                font_meta = []
                file_properties.set_font_properties(page_index, font_meta)

            segmented = file_properties.get_file()
            segmented['status'] = {'message': "block-segmenter successful"}
            output.append(segmented)

            elapsed = time.time() - start_time
            # Guard against documents without pages: the timing figure is
            # only logged and must not abort the request with a
            # ZeroDivisionError.
            extraction_time = elapsed / page_counts if page_counts else elapsed
            log_info('block segmentation per page completed in {}'.format(extraction_time), app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed block segmentation", None)
    except Exception as e:
        log_exception("Error occured during block segmentation ",  app_context.application_context, e)
        return None

    return app_context.application_context
Beispiel #5
0
def process_input(app_context):
    """Run google vision OCR text extraction for every input file.

    Page paths are taken from the input entry when it carries a
    ``"page_info"`` key, otherwise they are produced by
    ``doc_pre_processing``. For each file a result dict with the keys
    ``status``, ``page_info``, ``file`` and ``pages`` is built and the list of
    those dicts is stored under ``application_context["outputs"]``.

    Args:
        app_context: Object whose ``application_context`` mapping supplies the
            inputs and receives the ``"outputs"`` list.

    Returns:
        ``app_context.application_context`` on success, ``None`` if any
        exception is raised (the exception is logged via ``log_exception``).
    """
    try:
        results = []

        for entry in get_files(app_context.application_context):
            properties = File(entry)
            if "page_info" in entry.keys():
                pages = properties.get_pages()
            else:
                pages = doc_pre_processing(entry['file']['name'],
                                           config.BASE_DIR)
            extracted = text_extraction(pages)

            # Key order mirrors the order in which the fields are filled in.
            results.append({
                "status": {
                    'message': "google-vision ocr run successfully"
                },
                "page_info": pages,
                "file": entry,
                "pages": extracted,
            })
        app_context.application_context["outputs"] = results
        log_info("successfully completed google vision ocr", None)

    except Exception as e:
        log_exception("Error occured during google vision ocr",
                      app_context.application_context, e)
        return None

    return app_context.application_context
Beispiel #6
0
def get_layout(app_context):
    """Detect layout regions for every page of every input file.

    Each entry from ``get_files`` is resolved to its JSON document with
    ``get_json``. For every page the line coordinates (``get_coord`` over the
    page's lines) and the page path are passed to
    ``primalaynet.predict_primanet`` and the prediction is stored under
    ``file['pages'][idx]["regions"]``. The documents, tagged with the original
    ``file`` and ``config`` entries and a success status, are stored under
    ``application_context["outputs"]``.

    Args:
        app_context: Object whose ``application_context`` mapping supplies the
            inputs and receives the ``"outputs"`` list.

    Returns:
        ``app_context.application_context`` on success, ``None`` if any
        exception is raised (the exception is logged via ``log_exception``).
    """
    try:
        files = get_files(app_context.application_context)
        output = []
        for file_new in files:
            file = get_json(file_new['file']['name'])[0]
            file_properties = File(file)
            page_paths = file_properties.get_pages()
            start_time = time.time()
            for idx, page_path in enumerate(page_paths):
                page_lines = file_properties.get_lines(idx)
                line_coords = get_coord(page_lines)
                # Rebase the path onto 'upload/': everything up to and
                # including the last 'upload/' is replaced by a bare
                # 'upload/' prefix (a path without it just gets the prefix).
                page_path = 'upload/' + page_path.split('upload/')[-1]

                # Table/cell detection is currently disabled:
                # masked_image, table_and_lines = extract_table_line_regions(page_path)
                # cell_regions = cell_layout(table_and_lines,page_path)
                if torch.cuda.is_available():
                    # The former bare `torch.cuda.device(0)` call only built
                    # a context manager that was never entered, so it had no
                    # effect and has been dropped.
                    print("*******cuda available")
                    torch.cuda.empty_cache()
                # NOTE(review): fixed one-second pause before every
                # prediction; presumably a throttle - confirm it is needed.
                time.sleep(1)
                regions = primalaynet.predict_primanet(page_path, line_coords)
                # regions += cell_regions
                file['pages'][idx]["regions"] = regions
            file['file'] = file_new['file']
            file['config'] = file_new['config']
            file['status'] = {'message': "layout-detector successful"}
            output.append(file)

            elapsed = time.time() - start_time
            page_count = len(page_paths)
            # Guard against documents without pages: the timing figure is
            # only logged and must not abort the request with a
            # ZeroDivisionError.
            extraction_time = elapsed / page_count if page_count else elapsed
            log_info(
                'Layout detection per page completed in {}'.format(
                    extraction_time), app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed layout detection", None)
    except Exception as e:
        log_exception("Error occured during prima layout detection ",
                      app_context.application_context, e)
        return None

    return app_context.application_context
Beispiel #7
0
def get_layout(app_context):
    """Detect layout regions for every page of every input file.

    For each entry from ``get_files`` and each of its pages, the line
    coordinates (``get_coord`` over the page's lines) and the page path are
    passed to ``primalaynet.predict_primanet``; the prediction is stored under
    ``file['pages'][idx]["regions"]``. Every file is marked with a success
    message in its ``status`` dict and the list of files is stored under
    ``application_context["outputs"]``.

    Args:
        app_context: Object whose ``application_context`` mapping supplies the
            inputs and receives the ``"outputs"`` list.

    Returns:
        ``app_context.application_context`` on success, ``None`` if any
        exception is raised (the exception is logged via ``log_exception``).
    """
    try:
        files = get_files(app_context.application_context)
        output = []
        for file in files:
            file_properties = File(file)
            page_paths = file_properties.get_pages()
            for idx, page_path in enumerate(page_paths):
                page_lines = file_properties.get_lines(idx)
                line_coords = get_coord(page_lines)
                regions = primalaynet.predict_primanet(page_path, line_coords)
                file['pages'][idx]["regions"] = regions
            # Create the status dict when the input does not carry one, so a
            # missing key cannot fail the request after detection succeeded.
            file.setdefault('status', {})['message'] = "layout-detector successful"
            output.append(file)
        app_context.application_context["outputs"] = output
        log_info("successfully completed layout detection", None)
    except Exception as e:
        log_exception("Error occured during prima layout detection ",  app_context.application_context, e)
        return None

    return app_context.application_context