Example #1
0
 def __call__(self, data, context: dict = None):
     """Convert the document in ``data`` to this step's output type.

     Args:
         data: dict providing at least ``'filename'``; it is also passed to
             ``self.get_doc_to_type_kwargs`` to build the conversion arguments.
         context: optional context forwarded to ``get_doc_to_type_kwargs``.

     Returns:
         A new dict with ``'filename'`` (extension replaced by
         ``self.output_ext``), ``'content'`` (the ``doc_to_type`` result) and
         ``'type'`` (``self.output_mime_type``).
     """
     output_filename = change_ext(data['filename'], None, self.output_ext)
     conversion_kwargs = self.get_doc_to_type_kwargs(data, context=context)
     converted_content = doc_to_type(**conversion_kwargs)
     return {
         'filename': output_filename,
         'content': converted_content,
         'type': self.output_mime_type,
     }
Example #2
0
 def __call__(self, data):
     """POST the document content to ``self._api_url`` and wrap the response.

     Args:
         data: dict with ``'filename'``, ``'content'`` (request body) and
             ``'type'`` (sent as the ``Content-Type`` request header).

     Returns:
         A new dict with ``'filename'`` (extension replaced by ``.xml``),
         ``'content'`` (the response text) and ``'type'`` (the response's
         ``Content-Type`` header).

     Raises:
         Whatever ``response.raise_for_status()`` raises for error responses.
     """
     api_url = self._api_url
     request_headers = {'Content-Type': data['type']}
     response = requests_post(
         api_url, headers=request_headers, data=data['content'])
     # Fail before building the result if the service reported an error.
     response.raise_for_status()

     result = {}
     result['filename'] = change_ext(data['filename'], None, '.xml')
     result['content'] = response.text
     result['type'] = response.headers['Content-Type']
     return result
Example #3
0
 def process_request(self, data: dict, session: requests.Session, context: dict = None):
     """Send ``data`` via ``self.post_data`` and wrap the response.

     Args:
         data: dict providing at least ``'filename'``; passed on to
             ``post_data`` unchanged.
         session: the ``requests`` session handed to ``post_data``.
         context: optional context used to look up the request timeout.

     Returns:
         A new dict with ``'filename'`` (extension replaced by ``.xml``),
         ``'content'`` (the response text) and ``'type'`` (the response's
         ``Content-Type`` header).
     """
     post = self.post_data
     request_timeout = self.get_default_request_timeout(context=context)
     response = post(data=data, session=session, timeout=request_timeout)

     xml_filename = change_ext(data['filename'], None, '.xml')
     return {
         'filename': xml_filename,
         'content': response.text,
         'type': response.headers['Content-Type'],
     }
    def _do_convert(self,
                    temp_source_filename,
                    output_type: str = 'pdf',
                    remove_line_no: bool = True,
                    remove_header_footer: bool = True,
                    remove_redline: bool = True):
        """Run the document converter on a temporary source file.

        Makes sure the listener is started, builds the ``convert`` command
        line from the arguments and instance settings, and executes it via
        ``_exec_doc_converter``.

        Args:
            temp_source_filename: path of the file to convert.
            output_type: target format; also used as the output extension.
            remove_line_no: pass ``--remove-line-no`` when truthy.
            remove_header_footer: pass ``--remove-header-footer`` when truthy.
            remove_redline: pass ``--remove-redline`` when truthy.

        Returns:
            The path of the generated output file.

        Raises:
            UnoConnectionError: re-raised after stopping the listener.
            Exception: any other converter error is re-raised; the listener
                is stopped first only if ``self.stop_listener_on_error``.
            RuntimeError: if the converter finished but the output file
                does not exist.
        """
        self.start_listener_if_not_running()

        output_suffix = '-output.%s' % output_type
        temp_target_filename = change_ext(
            temp_source_filename, None, output_suffix)

        command = ['convert', '--format', output_type]
        clean_up_flags = (
            (remove_line_no, '--remove-line-no'),
            (remove_header_footer, '--remove-header-footer'),
            (remove_redline, '--remove-redline'),
        )
        command += [flag for enabled, flag in clean_up_flags if enabled]
        command += [
            '--port', str(self.port),
            '--output-file', str(temp_target_filename),
            temp_source_filename,
        ]
        # Listener-related switches follow the positional source filename.
        if self.no_launch:
            command += ['--no-launch']
        if self.keep_listener_running:
            command += ['--keep-listener-running']

        try:
            _exec_doc_converter(
                command,
                enable_debug=self.enable_debug,
                process_timeout=self.process_timeout)
        except UnoConnectionError:
            # A connection failure always stops the listener.
            self.stop_listener_if_running()
            raise
        except Exception:
            # Other failures stop the listener only when configured to.
            if self.stop_listener_on_error:
                self.stop_listener_if_running()
            raise

        if os.path.exists(temp_target_filename):
            return temp_target_filename
        raise RuntimeError('temp target file missing: %s' %
                           temp_target_filename)
 def __call__(self, data):
     """Convert the document in ``data`` to DOCX.

     Args:
         data: dict with ``'filename'``, ``'content'`` and ``'type'``.

     Returns:
         A new dict with ``'filename'`` (extension replaced by ``.docx``),
         ``'content'`` (the ``doc_to_docx`` result) and ``'type'``
         (``MimeTypes.DOCX``).
     """
     docx_filename = change_ext(data['filename'], None, '.docx')
     docx_content = doc_to_docx(data['content'], data['type'])
     return {
         'filename': docx_filename,
         'content': docx_content,
         'type': MimeTypes.DOCX,
     }
 def test_should_remove_gz_ext_before_replacing_ext(self):
     """'file.pdf.gz' with new extension '.svg.zip' becomes 'file.svg.zip'."""
     actual = change_ext('file.pdf.gz', None, '.svg.zip')
     assert actual == 'file.svg.zip'
 def test_should_replace_simple_ext_with_combined_ext(self):
     """'file.pdf' with new extension '.svg.zip' becomes 'file.svg.zip'."""
     actual = change_ext('file.pdf', None, '.svg.zip')
     assert actual == 'file.svg.zip'
 def test_should_replace_simple_ext_with_simple_ext(self):
     """'file.pdf' with new extension '.xml' becomes 'file.xml'."""
     actual = change_ext('file.pdf', None, '.xml')
     assert actual == 'file.xml'
def configure_pipeline(p, opt):
    """Attach all transforms to the Beam pipeline ``p`` as selected by ``opt``.

    Input source (checked in this order):

    * ``opt.lxml_path``: LXML/XML file pairs are discovered and read.
    * ``opt.pdf_xml_file_list``: PDF/XML pairs are read from a CSV with
      ``source_url`` and ``xml_url`` columns.
    * ``opt.pdf_path``: PDF/XML file pairs are discovered.

    For the two PDF sources the PDF content is converted to LXML. The LXML
    is then converted to annotated SVG pages. Optional outputs, each enabled
    by its own option: PNG pages (``save_png``), LXML (``save_lxml``), SVG
    pages (``save_svg``), block PNGs (``save_block_png``), TFRecords
    (``save_tfrecords``) and an annotation evaluation CSV
    (``annotation_evaluation_csv``).

    Nothing is returned; the function only extends ``p``.

    Raises:
        RuntimeError: if none of ``lxml_path``, ``pdf_path`` or
            ``pdf_xml_file_list`` is set.
    """
    # Target image size; only used when both width and height are given.
    image_size = ((opt.image_width, opt.image_height)
                  if opt.image_width and opt.image_height else None)
    page_range = opt.pages
    # First page number, used below for the '#page%d' URIs and 'page_no';
    # defaults to 1 when no page range is set.
    first_page = page_range[0] if page_range else 1
    xml_mapping = parse_xml_mapping(opt.xml_mapping_path)
    if opt.lxml_path:
        # LXML already exists: find LXML/XML pairs (at most opt.limit) and
        # read both files into each item.
        lxml_xml_file_pairs = (
            p | beam.Create(
                [[
                    join_if_relative_path(opt.base_data_path, s)
                    for s in [opt.lxml_path, opt.xml_path]
                ]]) | "FindFilePairs" >> TransformAndLog(
                    beam.FlatMap(lambda patterns: islice(
                        find_file_pairs_grouped_by_parent_directory_or_name(
                            patterns), opt.limit)),
                    log_prefix='file pairs: ',
                    log_level='debug') | PreventFusion()
            | "ReadFileContent" >> beam.Map(
                lambda filenames: {
                    'source_filename': filenames[0],
                    'xml_filename': filenames[1],
                    'lxml_content': read_all_from_path(filenames[0]),
                    'xml_content': read_all_from_path(filenames[1])
                }))
    elif opt.pdf_path or opt.pdf_xml_file_list:
        # PDF input: the pairs come either from a CSV file list or from
        # file discovery; the file list takes precedence.
        if opt.pdf_xml_file_list:
            pdf_xml_url_pairs = (
                p | "ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list,
                                                      limit=opt.limit)
                | "TranslateFilePairUrls" >>
                beam.Map(lambda row: (row['source_url'], row['xml_url'])))
        else:
            pdf_xml_url_pairs = (p | beam.Create([[
                join_if_relative_path(opt.base_data_path, s)
                for s in [opt.pdf_path, opt.xml_path]
            ]]) | "FindFilePairs" >> TransformAndLog(
                beam.FlatMap(lambda patterns: islice(
                    find_file_pairs_grouped_by_parent_directory_or_name(
                        patterns), opt.limit)),
                log_prefix='file pairs: ',
                log_level='debug'))
        # Read the PDF and XML content of every pair (counted as FILE_PAIR).
        pdf_xml_file_pairs = (
            pdf_xml_url_pairs | PreventFusion()
            | "ReadFileContent" >> TransformAndCount(
                beam.Map(
                    lambda filenames: {
                        'source_filename': filenames[0],
                        'xml_filename': filenames[1],
                        'pdf_content': read_all_from_path(filenames[0]),
                        'xml_content': read_all_from_path(filenames[1])
                    }), MetricCounters.FILE_PAIR))

        # Add 'lxml_content' to each item. Failures are reported through
        # log_fn and counted under CONVERT_PDF_TO_LXML_ERROR.
        lxml_xml_file_pairs = (
            pdf_xml_file_pairs | "ConvertPdfToLxml" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'lxml_content':
                            convert_pdf_bytes_to_lxml(v['pdf_content'],
                                                      path=v['source_filename'
                                                             ],
                                                      page_range=page_range)
                        }),
                    # we don't need the pdf_content unless we are writing tf_records
                    None if opt.save_tfrecords else {'pdf_content'}),
                log_fn=lambda e, v: (get_logger().warning(
                    'caught exception (ignoring item): %s, pdf: %s, xml: %s',
                    e,
                    v['source_filename'],
                    v['xml_filename'],
                    exc_info=e)),
                error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))
    else:
        raise RuntimeError('either lxml-path or pdf-path required')

    if opt.save_png or opt.save_tfrecords:
        # 'pdf_content' is only kept on lxml_xml_file_pairs when writing
        # TFRecords; otherwise the PNGs are rendered from pdf_xml_file_pairs.
        # NOTE(review): pdf_xml_file_pairs is not bound on the opt.lxml_path
        # branch above, and items there carry no 'pdf_content' - confirm this
        # block is only meant to be used with PDF input.
        with_pdf_png_pages = (
            (lxml_xml_file_pairs if opt.save_tfrecords else pdf_xml_file_pairs)
            | "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'pdf_png_pages':
                            list(
                                pdf_bytes_to_png_pages(v['pdf_content'],
                                                       dpi=opt.png_dpi,
                                                       image_size=image_size,
                                                       page_range=page_range))
                        }),
                    {'pdf_content'}  # we no longer need the pdf_content
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR))

        if opt.save_png:
            # Output path: source path relative to base_data_path, placed
            # under output_path with the extension replaced by '.png.zip'.
            _ = (with_pdf_png_pages | "SavePdfToPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v[
                                'source_filename']), None, '.png.zip')),
                    '.png', v['pdf_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.save_lxml:
        _ = (lxml_xml_file_pairs
             | "SaveLxml" >> TransformAndLog(
                 beam.Map(lambda v: save_file_content(
                     FileSystems.join(
                         opt.output_path,
                         change_ext(
                             relative_path(opt.base_data_path, v[
                                 'source_filename']), None, '.lxml.gz')), v[
                                     'lxml_content'])),
                 log_fn=lambda x: get_logger().info('saved lxml: %s', x)))

    # Add 'svg_pages' (annotated using xml_mapping) and drop the LXML/XML
    # content. When writing TFRecords the items that already carry
    # 'pdf_png_pages' are used so the PNG pages travel along.
    # The PAGE counter is fed with the number of SVG pages per item.
    annotation_results = ((
        with_pdf_png_pages if opt.save_tfrecords else lxml_xml_file_pairs
    ) | "ConvertLxmlToSvgAndAnnotate" >> TransformAndCount(
        MapOrLog(
            lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'svg_pages':
                        list(
                            convert_and_annotate_lxml_content(
                                v['lxml_content'],
                                v['xml_content'],
                                xml_mapping,
                                name=v['source_filename']))
                    }),
                # Won't need the XML anymore
                {'lxml_content', 'xml_content'}),
            log_fn=lambda e, v: (get_logger().warning(
                'caught exception (ignoring item): %s, source: %s, xml: %s',
                e,
                v['source_filename'],
                v['xml_filename'],
                exc_info=e)),
            error_count=MetricCounters.CONVERT_LXML_TO_SVG_ANNOT_ERROR),
        MetricCounters.PAGE,
        lambda v: len(v['svg_pages'])))

    if opt.save_svg:
        _ = (annotation_results | "SaveSvgPages" >> TransformAndLog(
            beam.Map(lambda v: save_svg_roots(
                FileSystems.join(
                    opt.output_path,
                    change_ext(
                        relative_path(opt.base_data_path, v['source_filename']
                                      ), None, '.svg.zip')), v['svg_pages'])),
            log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.annotation_evaluation_csv or opt.min_annotation_percentage:
        # Add 'annotation_evaluation' to each item. 'svg_pages' is kept only
        # when min_annotation_percentage is set, because the block PNG step
        # below then reads its input (including 'svg_pages') from here.
        annotation_evaluation_results = (
            annotation_results | "EvaluateAnnotations" >> TransformAndLog(
                beam.Map(lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'annotation_evaluation':
                            evaluate_document_by_page(
                                SvgStructuredDocument(v['svg_pages']))
                        }), None
                    if opt.min_annotation_percentage else {'svg_pages'})),
                log_fn=lambda x: get_logger().info(
                    'annotation evaluation result: %s: %s', x[
                        'source_filename'], x['annotation_evaluation'])))

    if opt.save_block_png or opt.save_tfrecords:
        color_map = parse_color_map_from_file(opt.color_map)
        # Replace 'svg_pages' with 'block_png_pages' (one PNG per SVG page).
        # The evaluated items are used when pages will be filtered by
        # annotation percentage, since the filter needs
        # 'annotation_evaluation'.
        with_block_png_pages = (
            (annotation_evaluation_results
             if opt.min_annotation_percentage else annotation_results)
            | "GenerateBlockPng" >> beam.Map(lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'block_png_pages': [
                            svg_page_to_blockified_png_bytes(
                                svg_page, color_map, image_size=image_size)
                            for svg_page in v['svg_pages']
                        ]
                    }), {'svg_pages'})))

        if opt.save_block_png:
            _ = (with_block_png_pages | "SaveBlockPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v[
                                'source_filename']), None, '.block-png.zip')),
                    '.png', v['block_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

        if opt.save_tfrecords:
            if opt.min_annotation_percentage:
                # Keep only the pages selected by
                # get_page_indices_with_min_annotation_percentage, applied to
                # both page lists; the remaining pages are counted as
                # FILTERED_PAGE.
                filtered_pages = (
                    with_block_png_pages | "FilterPages" >> TransformAndCount(
                        beam.Map(lambda v: filter_list_props_by_indices(
                            v,
                            get_page_indices_with_min_annotation_percentage(
                                v['annotation_evaluation'], opt.
                                min_annotation_percentage),
                            {'pdf_png_pages', 'block_png_pages'})),
                        MetricCounters.FILTERED_PAGE,
                        lambda v: len(v['block_png_pages'])))
            else:
                filtered_pages = with_block_png_pages
            # One record per page, pairing the rendered PDF page with its
            # block PNG.
            # NOTE(review): the page number is first_page + position in the
            # (possibly filtered) list - confirm this is intended when pages
            # have been filtered out above.
            _ = (filtered_pages | "WriteTFRecords" >> WritePropsToTFRecord(
                FileSystems.join(opt.output_path, 'data'), lambda v: ({
                    'input_uri':
                    v['source_filename'] + '#page%d' % (first_page + i),
                    'input_image':
                    pdf_png_page,
                    'annotation_uri':
                    (v['source_filename'] + '.annot' + '#page%d' %
                     (first_page + i)),
                    'annotation_image':
                    block_png_page,
                    'page_no':
                    first_page + i
                } for i, pdf_png_page, block_png_page in zip(
                    range(len(v['pdf_png_pages'])), v['pdf_png_pages'], v[
                        'block_png_pages']))))

    if opt.annotation_evaluation_csv:
        # The CSV option is split into name and extension; the extension is
        # passed separately as file_name_suffix.
        annotation_evaluation_csv_name, annotation_evaluation_ext = (
            os.path.splitext(opt.annotation_evaluation_csv))
        # NOTE(review): flake8 treats `# flake8: noqa` as a file-wide
        # directive; confirm whether a line-level `# noqa` was intended.
        _ = (  # flake8: noqa
            annotation_evaluation_results | "FlattenAnotationEvaluationResults"
            >> beam.FlatMap(lambda v: to_annotation_evaluation_csv_dict_rows(
                v['annotation_evaluation'],
                document=basename(v['source_filename'])))
            | "WriteAnnotationEvaluationToCsv" >> WriteDictCsv(
                join_if_relative_path(opt.output_path,
                                      annotation_evaluation_csv_name),
                file_name_suffix=annotation_evaluation_ext,
                columns=DEFAULT_EVALUATION_COLUMNS))
Example #10
0
 def __call__(self, data):
     """Convert the document in ``data`` to PDF.

     Args:
         data: dict with ``'filename'``, ``'content'`` and ``'type'``.

     Returns:
         A new dict with ``'filename'`` (extension replaced by ``.pdf``),
         ``'content'`` (the ``doc_to_pdf`` result) and ``'type'``
         (``MimeTypes.PDF``).
     """
     pdf_filename = change_ext(data['filename'], None, '.pdf')
     pdf_content = doc_to_pdf(data['content'], data['type'])
     return {
         'filename': pdf_filename,
         'content': pdf_content,
         'type': MimeTypes.PDF,
     }