def save_grobid_training_tei_structured_document(
        filename, grobid_training_tei_structured_document):
    try:
        xml = etree.tostring(grobid_training_tei_structured_document.root)
    except Exception as e:
        raise RuntimeError(
            'failed to convert to xml for %s due to %s' % (filename, e)
        ) from e
    save_file_content(filename, xml)

def main(argv=None):
    args = parse_args(argv)
    if args.debug:
        logging.getLogger().setLevel('DEBUG')
    structured_document = load_structured_document(args.lxml_path)
    xml_root = extract_structured_document_to_xml(
        structured_document,
        tag_scope=args.tag_scope
    )
    get_logger().info('writing result to: %s', args.output_path)
    save_file_content(
        args.output_path,
        etree.tostring(xml_root, pretty_print=True)
    )

def fix_jats_xml_file(input_file: str, output_file: str, log_file_enabled: bool = True):
    if log_file_enabled:
        LOGGER.info('processing: %r -> %r', input_file, output_file)
    else:
        LOGGER.debug('processing: %r -> %r', input_file, output_file)
    fixed_malformatted_xml = False
    with auto_download_input_file(input_file) as local_input_file:
        try:
            # attempt strict parsing first
            tree = parse_xml(local_input_file, filename=input_file, fix_xml=False)
        except ValueError:
            # retry with XML fixing enabled and record that we did so
            tree = parse_xml(local_input_file, filename=input_file, fix_xml=True)
            fixed_malformatted_xml = True
        root = tree.getroot()
        original_root = clone_node(root)
        fix_jats_xml_node(root)
        add_fix_xml_meta_data(
            root, original_root,
            fixed_malformatted_xml=fixed_malformatted_xml
        )
        output_bytes = etree.tostring(
            tree, xml_declaration=True, encoding=tree.docinfo.encoding
        )
        save_file_content(output_file, output_bytes)

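# A minimal usage sketch for fix_jats_xml_file above; the paths are
# hypothetical, and auto_download_input_file is assumed to accept remote
# URLs as well as local paths (as its name suggests). Strict parsing is
# tried first; the fix_xml=True retry only happens when parse_xml raises
# ValueError, which is then recorded in the fix-xml metadata.
def fix_jats_xml_example():
    fix_jats_xml_file(
        'gs://my-bucket/input/article.jats.xml',  # hypothetical source URL
        '/tmp/article.fixed.xml'  # hypothetical local output path
    )
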
def process_file(
        file_url: str,
        simple_runner: SimplePipelineRunner,
        get_output_file_for_source_url: Callable[[str], str],
        session: requests.Session,
        request_args: MultiDict = None):
    output_file_url = get_output_file_for_source_url(file_url)
    file_content = read_all_from_path(file_url)
    LOGGER.info(
        'read source content: %s (%s)',
        file_url, format_size(len(file_content))
    )
    data_type = guess_type(file_url)[0]
    LOGGER.debug('data_type: %s', data_type)
    LOGGER.debug('session: %s', session)
    # make the requests session (and any request args) available to the
    # pipeline steps via the conversion context
    context = {RequestsPipelineStep.REQUESTS_SESSION_KEY: session}
    if request_args:
        context['request_args'] = request_args
    result = simple_runner.convert(file_content, file_url, data_type, context=context)
    LOGGER.debug('result.keys: %s', result.keys())
    output_content = encode_if_text_type(result[DataProps.CONTENT])
    save_file_content(output_file_url, output_content)
    LOGGER.info(
        'saved output to: %s (%s)',
        output_file_url, format_size(len(output_content))
    )

def configure_pipeline(p, opt, pipeline, config):
    def get_pipeline_output_file(source_url, ext):
        return get_output_file(source_url, opt.base_data_path, opt.output_path, ext)

    def get_default_output_file_for_source_file(source_url):
        return get_pipeline_output_file(source_url, opt.output_suffix)

    def output_file_not_exists(source_url):
        return not _file_exists(
            get_default_output_file_for_source_file(source_url)
        )

    steps = pipeline.get_steps(config, opt)
    LOGGER.info('steps: %s', steps)

    input_urls = (
        p |
        FileUrlSource(opt) |
        PreventFusion()
    )

    if opt.resume:
        # skip files whose output already exists
        input_urls |= beam.Filter(output_file_not_exists)

    input_data = (
        input_urls |
        ReadFileContent() |
        "Determine Type" >> beam.Map(lambda d: extend_dict(d, {
            DataProps.TYPE: mimetypes.guess_type(d[DataProps.SOURCE_FILENAME])[0]
        }))
    )

    # chain the configured step transforms
    result = input_data
    for step in steps:
        LOGGER.debug('step: %s', step)
        result |= get_step_transform(step)

    _ = (
        result |
        beam.Map(lambda x: LOGGER.info('result: %s (%s)', x.keys(), x[DataProps.TYPE]))
    )

    _ = (  # noqa: F841
        result |
        "WriteOutput" >> TransformAndLog(
            beam.Map(lambda v: save_file_content(
                get_default_output_file_for_source_file(v[DataProps.SOURCE_FILENAME]),
                encode_if_text_type(v[DataProps.CONTENT])
            )),
            log_fn=lambda x: get_logger().info('saved output to: %s', x)
        )
    )

def configure_pipeline(p, opt):
    def get_pipeline_output_file(source_url, ext):
        return get_output_file(source_url, opt.base_data_path, opt.output_path, ext)

    if (opt.use_grobid and not opt.crf_model
            and not opt.cv_model_export_dir and not opt.lxml_file_list):
        extracted_xml = add_read_pdfs_to_grobid_xml_pipeline_steps(p, opt)
    else:
        extracted_xml = add_read_source_to_extracted_xml_pipeline_steps(
            p, opt, get_pipeline_output_file
        )

    _ = (  # flake8: noqa
        extracted_xml |
        "WriteXml" >> TransformAndLog(
            beam.Map(lambda v: save_file_content(
                get_pipeline_output_file(
                    v[DataProps.SOURCE_FILENAME], opt.output_suffix
                ),
                v[DataProps.EXTRACTED_XML]
            )),
            log_fn=lambda x: get_logger().info('saved xml to: %s', x)
        )
    )

def configure_pipeline(p, opt, pipeline, config):
    get_default_output_file_for_source_file = get_output_file_for_source_file_fn(opt)

    file_list = get_remaining_file_list_for_args(opt)
    LOGGER.debug('file_list: %s', file_list)
    if not file_list:
        LOGGER.info('no files to process')
        return

    steps = pipeline.get_steps(config, opt)
    LOGGER.info('steps: %s', steps)

    input_urls = (
        p |
        beam.Create(file_list) |
        PreventFusion()
    )

    input_data = (
        input_urls |
        ReadFileContent() |
        "Determine Type" >> beam.Map(lambda d: extend_dict(d, {
            DataProps.TYPE: mimetypes.guess_type(d[DataProps.SOURCE_FILENAME])[0]
        }))
    )

    # chain the configured step transforms
    result = input_data
    for step in steps:
        LOGGER.debug('step: %s', step)
        result |= get_step_transform(step)

    _ = (  # noqa: F841
        result |
        "WriteOutput" >> TransformAndLog(
            beam.Map(lambda v: save_file_content(
                get_default_output_file_for_source_file(v[DataProps.SOURCE_FILENAME]),
                encode_if_text_type(v[DataProps.CONTENT])
            )),
            log_fn=lambda x: get_logger().info('saved output to: %s', x)
        )
    )

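# The configure_pipeline variants above only build the Beam graph; a driver
# still has to create and run the pipeline. This is a minimal sketch using
# standard Apache Beam APIs; the opt/pipeline/config objects are assumed to
# come from the application's own argument parsing.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def run(opt, pipeline, config, pipeline_args=None):
    # exiting the `with` block runs the pipeline and waits for completion
    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
        configure_pipeline(p, opt, pipeline, config)
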
def save_lxml_structured_document(filename, lxml_structured_document):
    save_file_content(filename, etree.tostring(lxml_structured_document.root))

def save_model(output_filename, model_bytes):
    LOGGER.info('saving model to %s', output_filename)
    save_file_content(output_filename, model_bytes)

def save_model(output_filename, model_bytes):
    get_logger().info('saving model to %s', output_filename)
    save_file_content(output_filename, model_bytes)

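# Both save_model variants above delegate to save_file_content, so loading is
# the symmetric read. load_model here is a hypothetical helper shown for
# illustration only, using the read_all_from_path reader that appears in the
# other functions in this section.
def load_model(model_filename):
    # hypothetical counterpart to save_model
    return read_all_from_path(model_filename)
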
def fix_source_file_to(source_url: str, target_url: str, encoding: str = 'utf-8'):
    xml_bytes = read_all_from_path(source_url)
    fixed_xml_bytes = get_fixed_xml_bytes(xml_bytes, encoding=encoding)
    save_file_content(target_url, fixed_xml_bytes)

def fix_source_file_to(source_url: str, target_url: str):
    source_data = read_all_from_path(source_url)
    data = source_data
    # drop a stray closing </content> tag if there is no matching opening tag
    if b'</content>' in data and b'<content>' not in data:
        data = data.replace(b'</content>', b'')
    save_file_content(target_url, data)

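# An illustrative check of the stray-tag rule in the second fix_source_file_to
# above (pure bytes logic, no file I/O): a closing </content> with no matching
# opening <content> is removed; anything else is left untouched.
def _check_stray_content_tag_rule():
    data = b'<article>text</content></article>'
    if b'</content>' in data and b'<content>' not in data:
        data = data.replace(b'</content>', b'')
    assert data == b'<article>text</article>'
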
def configure_pipeline(p, opt):
    image_size = (
        (opt.image_width, opt.image_height)
        if opt.image_width and opt.image_height
        else None
    )
    page_range = opt.pages
    first_page = page_range[0] if page_range else 1
    xml_mapping = parse_xml_mapping(opt.xml_mapping_path)

    if opt.lxml_path:
        # pair up LXML and target XML files and read their contents
        lxml_xml_file_pairs = (
            p |
            beam.Create([[
                join_if_relative_path(opt.base_data_path, s)
                for s in [opt.lxml_path, opt.xml_path]
            ]]) |
            "FindFilePairs" >> TransformAndLog(
                beam.FlatMap(lambda patterns: islice(
                    find_file_pairs_grouped_by_parent_directory_or_name(patterns),
                    opt.limit
                )),
                log_prefix='file pairs: ',
                log_level='debug'
            ) |
            PreventFusion() |
            "ReadFileContent" >> beam.Map(lambda filenames: {
                'source_filename': filenames[0],
                'xml_filename': filenames[1],
                'lxml_content': read_all_from_path(filenames[0]),
                'xml_content': read_all_from_path(filenames[1])
            })
        )
    elif opt.pdf_path or opt.pdf_xml_file_list:
        if opt.pdf_xml_file_list:
            pdf_xml_url_pairs = (
                p |
                "ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list, limit=opt.limit) |
                "TranslateFilePairUrls" >> beam.Map(
                    lambda row: (row['source_url'], row['xml_url'])
                )
            )
        else:
            pdf_xml_url_pairs = (
                p |
                beam.Create([[
                    join_if_relative_path(opt.base_data_path, s)
                    for s in [opt.pdf_path, opt.xml_path]
                ]]) |
                "FindFilePairs" >> TransformAndLog(
                    beam.FlatMap(lambda patterns: islice(
                        find_file_pairs_grouped_by_parent_directory_or_name(patterns),
                        opt.limit
                    )),
                    log_prefix='file pairs: ',
                    log_level='debug'
                )
            )
        pdf_xml_file_pairs = (
            pdf_xml_url_pairs |
            PreventFusion() |
            "ReadFileContent" >> TransformAndCount(
                beam.Map(lambda filenames: {
                    'source_filename': filenames[0],
                    'xml_filename': filenames[1],
                    'pdf_content': read_all_from_path(filenames[0]),
                    'xml_content': read_all_from_path(filenames[1])
                }),
                MetricCounters.FILE_PAIR
            )
        )
        lxml_xml_file_pairs = (
            pdf_xml_file_pairs |
            "ConvertPdfToLxml" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'lxml_content': convert_pdf_bytes_to_lxml(
                            v['pdf_content'],
                            path=v['source_filename'],
                            page_range=page_range
                        )
                    }),
                    # we don't need the pdf_content unless we are writing tf_records
                    None if opt.save_tfrecords else {'pdf_content'}
                ),
                log_fn=lambda e, v: (
                    get_logger().warning(
                        'caught exception (ignoring item): %s, pdf: %s, xml: %s',
                        e, v['source_filename'], v['xml_filename'], exc_info=e
                    )
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR
            )
        )
    else:
        raise RuntimeError('either lxml-path or pdf-path required')

    if opt.save_png or opt.save_tfrecords:
        with_pdf_png_pages = (
            (lxml_xml_file_pairs if opt.save_tfrecords else pdf_xml_file_pairs) |
            "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'pdf_png_pages': list(pdf_bytes_to_png_pages(
                            v['pdf_content'],
                            dpi=opt.png_dpi,
                            image_size=image_size,
                            page_range=page_range
                        ))
                    }),
                    {'pdf_content'}  # we no longer need the pdf_content
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR
            )
        )

        if opt.save_png:
            _ = (
                with_pdf_png_pages |
                "SavePdfToPng" >> TransformAndLog(
                    beam.Map(lambda v: save_pages(
                        FileSystems.join(
                            opt.output_path,
                            change_ext(
                                relative_path(opt.base_data_path, v['source_filename']),
                                None, '.png.zip'
                            )
                        ),
                        '.png',
                        v['pdf_png_pages']
                    )),
                    log_fn=lambda x: get_logger().info('saved result: %s', x)
                )
            )

    if opt.save_lxml:
        _ = (
            lxml_xml_file_pairs |
            "SaveLxml" >> TransformAndLog(
                beam.Map(lambda v: save_file_content(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v['source_filename']),
                            None, '.lxml.gz'
                        )
                    ),
                    v['lxml_content']
                )),
                log_fn=lambda x: get_logger().info('saved lxml: %s', x)
            )
        )

    annotation_results = (
        (with_pdf_png_pages if opt.save_tfrecords else lxml_xml_file_pairs) |
        "ConvertLxmlToSvgAndAnnotate" >> TransformAndCount(
            MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'svg_pages': list(convert_and_annotate_lxml_content(
                            v['lxml_content'], v['xml_content'], xml_mapping,
                            name=v['source_filename']
                        ))
                    }),
                    # won't need the XML anymore
                    {'lxml_content', 'xml_content'}
                ),
                log_fn=lambda e, v: (
                    get_logger().warning(
                        'caught exception (ignoring item): %s, source: %s, xml: %s',
                        e, v['source_filename'], v['xml_filename'], exc_info=e
                    )
                ),
                error_count=MetricCounters.CONVERT_LXML_TO_SVG_ANNOT_ERROR
            ),
            MetricCounters.PAGE,
            lambda v: len(v['svg_pages'])
        )
    )

    if opt.save_svg:
        _ = (
            annotation_results |
            "SaveSvgPages" >> TransformAndLog(
                beam.Map(lambda v: save_svg_roots(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v['source_filename']),
                            None, '.svg.zip'
                        )
                    ),
                    v['svg_pages']
                )),
                log_fn=lambda x: get_logger().info('saved result: %s', x)
            )
        )

    if opt.annotation_evaluation_csv or opt.min_annotation_percentage:
        annotation_evaluation_results = (
            annotation_results |
            "EvaluateAnnotations" >> TransformAndLog(
                beam.Map(lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'annotation_evaluation': evaluate_document_by_page(
                            SvgStructuredDocument(v['svg_pages'])
                        )
                    }),
                    None if opt.min_annotation_percentage else {'svg_pages'}
                )),
                log_fn=lambda x: get_logger().info(
                    'annotation evaluation result: %s: %s',
                    x['source_filename'], x['annotation_evaluation']
                )
            )
        )

    if opt.save_block_png or opt.save_tfrecords:
        color_map = parse_color_map_from_file(opt.color_map)
        with_block_png_pages = (
            (annotation_evaluation_results if opt.min_annotation_percentage
                else annotation_results) |
            "GenerateBlockPng" >> beam.Map(lambda v: remove_keys_from_dict(
                extend_dict(v, {
                    'block_png_pages': [
                        svg_page_to_blockified_png_bytes(
                            svg_page, color_map, image_size=image_size
                        )
                        for svg_page in v['svg_pages']
                    ]
                }),
                {'svg_pages'}
            ))
        )

        if opt.save_block_png:
            _ = (
                with_block_png_pages |
                "SaveBlockPng" >> TransformAndLog(
                    beam.Map(lambda v: save_pages(
                        FileSystems.join(
                            opt.output_path,
                            change_ext(
                                relative_path(opt.base_data_path, v['source_filename']),
                                None, '.block-png.zip'
                            )
                        ),
                        '.png',
                        v['block_png_pages']
                    )),
                    log_fn=lambda x: get_logger().info('saved result: %s', x)
                )
            )

        if opt.save_tfrecords:
            if opt.min_annotation_percentage:
                # only keep pages meeting the minimum annotation percentage
                filtered_pages = (
                    with_block_png_pages |
                    "FilterPages" >> TransformAndCount(
                        beam.Map(lambda v: filter_list_props_by_indices(
                            v,
                            get_page_indices_with_min_annotation_percentage(
                                v['annotation_evaluation'],
                                opt.min_annotation_percentage
                            ),
                            {'pdf_png_pages', 'block_png_pages'}
                        )),
                        MetricCounters.FILTERED_PAGE,
                        lambda v: len(v['block_png_pages'])
                    )
                )
            else:
                filtered_pages = with_block_png_pages

            _ = (
                filtered_pages |
                "WriteTFRecords" >> WritePropsToTFRecord(
                    FileSystems.join(opt.output_path, 'data'),
                    lambda v: (
                        {
                            'input_uri': v['source_filename'] + '#page%d' % (first_page + i),
                            'input_image': pdf_png_page,
                            'annotation_uri': (
                                v['source_filename'] + '.annot'
                                + '#page%d' % (first_page + i)
                            ),
                            'annotation_image': block_png_page,
                            'page_no': first_page + i
                        }
                        for i, pdf_png_page, block_png_page in zip(
                            range(len(v['pdf_png_pages'])),
                            v['pdf_png_pages'],
                            v['block_png_pages']
                        )
                    )
                )
            )

    if opt.annotation_evaluation_csv:
        annotation_evaluation_csv_name, annotation_evaluation_ext = (
            os.path.splitext(opt.annotation_evaluation_csv)
        )
        _ = (  # flake8: noqa
            annotation_evaluation_results |
            "FlattenAnotationEvaluationResults" >> beam.FlatMap(
                lambda v: to_annotation_evaluation_csv_dict_rows(
                    v['annotation_evaluation'],
                    document=basename(v['source_filename'])
                )
            ) |
            "WriteAnnotationEvaluationToCsv" >> WriteDictCsv(
                join_if_relative_path(opt.output_path, annotation_evaluation_csv_name),
                file_name_suffix=annotation_evaluation_ext,
                columns=DEFAULT_EVALUATION_COLUMNS
            )
        )

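# A hedged options sketch for the preprocessing pipeline above:
# argparse.Namespace stands in for parsed CLI options. Every attribute name
# is taken from the function body; the values are hypothetical examples,
# not documented defaults.
from argparse import Namespace

example_opt = Namespace(
    base_data_path='gs://my-bucket/data',
    output_path='gs://my-bucket/preproc',
    lxml_path='*/*.lxml.gz',
    xml_path='*/*.nxml',
    pdf_path=None,
    pdf_xml_file_list=None,
    xml_mapping_path='annotation-xml-mapping.conf',
    pages=None,
    limit=100,
    image_width=None,
    image_height=None,
    png_dpi=90,
    color_map='color-map.conf',
    save_lxml=True,
    save_png=False,
    save_svg=True,
    save_block_png=False,
    save_tfrecords=False,
    min_annotation_percentage=None,
    annotation_evaluation_csv=None,
)
# with beam.Pipeline(options=PipelineOptions()) as p:
#     configure_pipeline(p, example_opt)
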
def save_structured_document(filename, structured_document):
    # only support saving lxml for now
    assert isinstance(structured_document, LxmlStructuredDocument)
    save_file_content(
        filename,
        etree.tostring(structured_document.root, pretty_print=True)
    )
    return filename