def test_should_not_call_get_cloud_project_if_cloud_is_false(
        self, get_cloud_project_mock):
    args = argparse.Namespace(**extend_dict(DEFAULT_ARGS, {
        'cloud': False
    }))
    process_cloud_args(args, output_path=DEFAULT_OUTPUT_PATH)
    get_cloud_project_mock.assert_not_called()
def read_examples(filenames, shuffle, num_epochs=None,
                  page_range=None, channel_colors=None):
    # Convert num_epochs == 0 -> num_epochs is None, if necessary
    num_epochs = num_epochs or None
    feature_map = DEFAULT_FEATURE_MAP
    if page_range is not None:
        feature_map = extend_dict(feature_map, PAGE_NO_FEATURE)
    map_keys_tracker = MapKeysTracker()
    dataset = TFRecordDataset(filenames, compression_type='GZIP')
    dataset = dataset.map(
        map_keys_tracker.wrap(partial(parse_example, feature_map=feature_map)))
    if page_range is not None:
        dataset = dataset.filter(lambda *x: page_no_is_within(
            map_keys_tracker.unwrap(x)['page_no'], page_range))
    if channel_colors is not None:
        dataset = dataset.filter(lambda *x: image_contains_any_of_the_colors(
            map_keys_tracker.unwrap(x)['annotation_image'], channel_colors))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
    dataset = dataset.repeat(num_epochs)
    return map_keys_tracker.unwrap(dataset.make_one_shot_iterator().get_next())
def test_should_convert_multiple_article_authors_of_single_reference(
        self, scienceparse_jats_xslt):
    authors = [AUTHOR_1, AUTHOR_2]
    jats = etree.fromstring(scienceparse_jats_xslt({
        'references': [
            extend_dict(REFERENCE_1, {
                'authors': [
                    '%s %s' % (author['first-name'], author['last-name'])
                    for author in authors
                ]
            })
        ]
    }))
    ref_list = _get_item(jats, 'back/ref-list')
    ref = _get_item(ref_list, 'ref')
    element_citation = _get_item(ref, 'element-citation')
    person_group = _get_item(element_citation, 'person-group')
    persons = person_group.xpath('name')
    assert len(persons) == 2
    for person, author in zip(persons, authors):
        assert _get_text(person, 'surname') == author['last-name']
        assert _get_text(person, 'given-names') == author['first-name']
def test_should_filter_by_channel_colors(self):
    with patch.object(examples_module, 'TFRecordDataset') as TFRecordDataset:
        with tf.Graph().as_default():
            TFRecordDataset.return_value = list_dataset([
                dict_to_example(extend_dict(
                    EXAMPLE_PROPS_1,
                    page_no=page_no,
                    annotation_image=image_with_color(some_color(page_no))
                )).SerializeToString()
                for page_no in [1, 2, 3, 4]
            ], tf.string)
            examples = read_examples(
                DATA_PATH, shuffle=False, num_epochs=1,
                page_range=(0, 100),
                channel_colors=[some_color(i) for i in [2, 3]])
            TFRecordDataset.assert_called_with(DATA_PATH, compression_type='GZIP')
            with tf.Session() as session:
                assert [
                    x['page_no'] for x in fetch_examples(session, examples)
                ] == [2, 3]
def test_should_not_call_get_cloud_project_if_project_was_specified(
        self, get_cloud_project_mock):
    args = argparse.Namespace(**extend_dict(DEFAULT_ARGS, {
        'cloud': True,
        'project': PROJECT_1
    }))
    process_cloud_args(args, output_path=DEFAULT_OUTPUT_PATH)
    get_cloud_project_mock.assert_not_called()
def test_should_use_get_cloud_project_if_project_is_empty(
        self, get_cloud_project_mock):
    get_cloud_project_mock.return_value = PROJECT_1
    args = argparse.Namespace(**extend_dict(DEFAULT_ARGS, {
        'cloud': True,
        'project': None
    }))
    process_cloud_args(args, output_path=DEFAULT_OUTPUT_PATH)
    assert args.project == PROJECT_1  # pylint: disable=no-member
def add_read_source_to_extracted_xml_pipeline_steps(p, opt, get_pipeline_output_file):
    if opt.lxml_file_list:
        lxml_urls = p | ReadFileList(
            opt.lxml_file_list, column=opt.lxml_file_column, limit=opt.limit)
        annotated_lxml = (
            lxml_urls |
            PreventFusion() |
            "ReadLxmlContent" >> TransformAndCount(
                MapOrLog(
                    lambda url: {
                        DataProps.SOURCE_FILENAME: url,
                        DataProps.STRUCTURED_DOCUMENT: load_structured_document(url)
                    },
                    error_count=MetricCounters.READ_LXML_ERROR),
                MetricCounters.FILES))
        extract_tag_scope = None
    else:
        annotated_lxml, extract_tag_scope = add_read_pdfs_to_annotated_lxml_pipeline_steps(
            p, opt, get_pipeline_output_file)

    extracted_xml = (
        annotated_lxml |
        "ExtractToXml" >> MapOrLog(
            lambda v: remove_keys_from_dict(
                extend_dict(v, {
                    DataProps.EXTRACTED_XML: extract_annotated_structured_document_to_xml(
                        v[DataProps.STRUCTURED_DOCUMENT],
                        tag_scope=extract_tag_scope)
                }),
                keys_to_remove={DataProps.STRUCTURED_DOCUMENT}),
            error_count=MetricCounters.EXTRACT_TO_XML_ERROR))

    if opt.use_grobid:
        enhancer = GrobidXmlEnhancer(
            opt.grobid_url, start_service=opt.start_grobid_service)
        extracted_xml = (
            extracted_xml |
            "GrobidEnhanceXml" >> MapOrLog(
                lambda v: extend_dict(v, {
                    DataProps.EXTRACTED_XML: enhancer(v[DataProps.EXTRACTED_XML])
                }),
                error_count=MetricCounters.GROBID_ERROR))
    return extracted_xml
def test_should_use_process_header_if_includes_only_contains_header(
        self, config, args, grobid_service_instance):
    args.grobid_action = None
    _run_pipeline(
        config, args,
        extend_dict(PDF_INPUT, {StepDataProps.INCLUDES: {FieldNames.TITLE}}))
    grobid_service_instance.assert_called_with(
        (PDF_INPUT['filename'], PDF_INPUT['content']),
        path=GrobidApiPaths.PROCESS_HEADER_DOCUMENT)
def test_should_use_process_full_text_if_includes_only_contains_references(
        self, config, args, grobid_service_instance):
    args.grobid_action = None
    _run_pipeline(
        config, args,
        extend_dict(PDF_INPUT, {StepDataProps.INCLUDES: {FieldNames.REFERENCES}}))
    grobid_service_instance.assert_called_with(
        (PDF_INPUT['filename'], PDF_INPUT['content']),
        path=GrobidApiPaths.PROCESS_FULL_TEXT_DOCUMENT)
def test_should_convert_venue_as_source(self, scienceparse_jats_xslt):
    jats = etree.fromstring(scienceparse_jats_xslt({
        'references': [
            extend_dict(REFERENCE_1, {'venue': VALUE_1})
        ]
    }))
    ref_list = _get_item(jats, 'back/ref-list')
    ref = _get_item(ref_list, 'ref')
    element_citation = _get_item(ref, 'element-citation')
    assert _get_text(element_citation, 'source') == VALUE_1
def test_should_convert_single_page_no(self, grobid_jats_xslt):
    jats = etree.fromstring(
        grobid_jats_xslt(
            _tei(references=[
                _reference(**extend_dict(REFERENCE_1, page='page1'))
            ])))
    ref_list = _get_item(jats, 'back/ref-list')
    ref = _get_item(ref_list, 'ref')
    element_citation = _get_item(ref, 'element-citation')
    assert _get_text(element_citation, 'fpage') == 'page1'
    assert _get_text(element_citation, 'lpage') == 'page1'
def add_read_pdfs_to_grobid_xml_pipeline_steps(p, opt):
    grobid_transformer = grobid_service(
        opt.grobid_url, opt.grobid_action, start_service=opt.start_grobid_service)
    return (
        p |
        PdfUrlSource(opt) |
        PreventFusion() |
        ReadPdfContent() |
        "Grobid" >> MapOrLog(
            lambda v: extend_dict(v, {
                DataProps.EXTRACTED_XML: grobid_transformer(
                    (v[DataProps.SOURCE_FILENAME], v[DataProps.PDF_CONTENT]))[1]
            }),
            error_count=MetricCounters.GROBID_ERROR))
def test_should_convert_year_and_month(self, grobid_jats_xslt):
    jats = etree.fromstring(
        grobid_jats_xslt(
            _tei(references=[
                _reference(**extend_dict(REFERENCE_1, year='2001', month='02'))
            ])))
    ref_list = _get_item(jats, 'back/ref-list')
    ref = _get_item(ref_list, 'ref')
    element_citation = _get_item(ref, 'element-citation')
    assert _get_text(element_citation, 'year') == '2001'
    assert _get_text(element_citation, 'month') == '02'
def test_should_use_unknown_class_weight_from_configuration(
        self, parse_color_map_from_file_mock, parse_json_file_mock):
    parse_color_map_from_file_mock.return_value = SOME_COLOR_MAP
    parse_json_file_mock.return_value = extend_dict(
        SOME_CLASS_WEIGHTS, {'unknown': 0.99})
    args = create_args(
        DEFAULT_ARGS,
        base_loss=BaseLoss.WEIGHTED_CROSS_ENTROPY,
        color_map=COLOR_MAP_FILENAME,
        class_weights=CLASS_WEIGHTS_FILENAME,
        use_separate_channels=True,
        use_unknown_class=True)
    model = Model(args)
    assert model.pos_weight[-1] == 0.99
def wrapper(x):
    data_type = x['type']
    if data_type in supported_types:
        get_logger().debug(
            'executing step %s: %s (%s)', step, x.keys(), data_type)
        result = extend_dict(x, step(x))
        get_logger().debug(
            'result of step %s: %s (%s)', step, result.keys(), result.get('type'))
        processed_counter.inc()
        return result
    get_logger().debug(
        'skipping step %s, %s not in supported types (%s)',
        step, data_type, supported_types)
    ignored_counter.inc()
    return x
def test_should_only_return_article_title_at_different_levels(
        self, grobid_jats_xslt, title_level):
    jats = etree.fromstring(
        grobid_jats_xslt(
            _tei(references=[
                _reference(**extend_dict(
                    REFERENCE_1,
                    article_title=ARTICLE_TITLE_1,
                    title_level=title_level))
            ])))
    ref_list = _get_item(jats, 'back/ref-list')
    ref = _get_item(ref_list, 'ref')
    element_citation = _get_item(ref, 'element-citation')
    assert _get_text(element_citation, 'article-title') == ARTICLE_TITLE_1
def test_should_fallback_to_collection_title_if_article_title_does_not_exist(
        self, grobid_jats_xslt):
    jats = etree.fromstring(
        grobid_jats_xslt(
            _tei(references=[
                _reference(**extend_dict(
                    REFERENCE_1,
                    article_title=None,
                    collection_title=COLLECTION_TITLE_1))
            ])))
    ref_list = _get_item(jats, 'back/ref-list')
    ref = _get_item(ref_list, 'ref')
    element_citation = _get_item(ref, 'element-citation')
    assert _get_text(element_citation, 'article-title') == COLLECTION_TITLE_1
def test_should_only_return_article_title_even_if_collection_title_exists(
        self, grobid_jats_xslt):
    jats = etree.fromstring(
        grobid_jats_xslt(
            _tei(references=[
                _reference(**extend_dict(
                    REFERENCE_1,
                    article_title=ARTICLE_TITLE_1,
                    collection_title=COLLECTION_TITLE_1))
            ])))
    ref_list = _get_item(jats, 'back/ref-list')
    ref = _get_item(ref_list, 'ref')
    element_citation = _get_item(ref, 'element-citation')
    assert _get_text(element_citation, 'article-title') == ARTICLE_TITLE_1
def configure_pipeline(p, opt, pipeline, config):
    def get_pipeline_output_file(source_url, ext):
        return get_output_file(source_url, opt.base_data_path, opt.output_path, ext)

    def get_default_output_file_for_source_file(source_url):
        return get_pipeline_output_file(source_url, opt.output_suffix)

    def output_file_not_exists(source_url):
        return not _file_exists(
            get_default_output_file_for_source_file(source_url))

    steps = pipeline.get_steps(config, opt)
    LOGGER.info('steps: %s', steps)

    input_urls = (
        p |
        FileUrlSource(opt) |
        PreventFusion())
    if opt.resume:
        input_urls |= beam.Filter(output_file_not_exists)

    input_data = (
        input_urls |
        ReadFileContent() |
        "Determine Type" >> beam.Map(lambda d: extend_dict(d, {
            DataProps.TYPE: mimetypes.guess_type(d[DataProps.SOURCE_FILENAME])[0]
        })))

    result = input_data
    for step in steps:
        LOGGER.debug('step: %s', step)
        result |= get_step_transform(step)

    _ = (
        result |
        beam.Map(lambda x: LOGGER.info(
            'result: %s (%s)', x.keys(), x[DataProps.TYPE])))

    _ = (  # noqa: F841
        result |
        "WriteOutput" >> TransformAndLog(
            beam.Map(lambda v: save_file_content(
                get_default_output_file_for_source_file(v[DataProps.SOURCE_FILENAME]),
                encode_if_text_type(v[DataProps.CONTENT]))),
            log_fn=lambda x: get_logger().info('saved output to: %s', x)))
def test_should_convert_multiple_collection_authors_of_single_reference(
        self, grobid_jats_xslt):
    authors = [AUTHOR_1, AUTHOR_2]
    jats = etree.fromstring(
        grobid_jats_xslt(
            _tei(references=[
                _reference(**extend_dict(REFERENCE_1, collection_authors=authors))
            ])))
    ref_list = _get_item(jats, 'back/ref-list')
    ref = _get_item(ref_list, 'ref')
    element_citation = _get_item(ref, 'element-citation')
    person_group = _get_item(element_citation, 'person-group')
    persons = person_group.xpath('name')
    assert len(persons) == 2
    for person, author in zip(persons, authors):
        assert _get_text(person, 'surname') == author['last-name']
        assert _get_text(person, 'given-names') == author['first-name']
def get_sentence_char_features(
        char_index, char_by_index_map, word_index_by_char_index_map,
        word_by_index_map):
    word_index = word_index_by_char_index_map.get(char_index, -10)
    d = extend_dict(
        {},
        get_char_features('char', char_by_index_map.get(char_index, '')),
        get_word_features('word', word_by_index_map.get(word_index, '')),
        {
            'char_index': char_index,
            'word_index': word_index,
            'bias': 1.0
        }
    )
    for i in range(1, 1 + 3):
        d.update(get_char_features(
            'char[-%d]' % i, char_by_index_map.get(char_index - i, '')))
        d.update(get_char_features(
            'char[+%d]' % i, char_by_index_map.get(char_index + i, '')))
        d.update(get_word_features(
            'word[-%d]' % i, word_by_index_map.get(word_index - i, '')))
        d.update(get_word_features(
            'word[+%d]' % i, word_by_index_map.get(word_index + i, '')))
    return d
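# Minimal usage sketch for get_sentence_char_features above. The toy index maps
# are hypothetical (not taken from the original test suite) and assume the
# get_char_features / get_word_features helpers used by the function are in scope.
# They describe the text "ab cd"; we request features for character index 3 ('c').
char_by_index_map = {0: 'a', 1: 'b', 2: ' ', 3: 'c', 4: 'd'}
word_by_index_map = {0: 'ab', 1: 'cd'}
word_index_by_char_index_map = {0: 0, 1: 0, 3: 1, 4: 1}
features = get_sentence_char_features(
    3, char_by_index_map, word_index_by_char_index_map, word_by_index_map)
# The returned dict holds the current-position 'char'/'word' features, a +/-3
# character and word context window, plus 'char_index', 'word_index' and 'bias'.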
def process_request(self, data: dict, session: requests.Session, context: dict = None):
    root = etree.fromstring(data['content'])
    matching_nodes = root.xpath(self._xpath)
    if not matching_nodes:
        LOGGER.info('xpath not matching any element: %s', self._xpath)
        return data
    for node in matching_nodes:
        value = get_text_content(node)
        LOGGER.debug('node for xpath %s: %s (text: %s)', self._xpath, node, value)
        response = session.post(
            self._api_url,
            data=value.encode('utf-8'),
            timeout=self.get_default_request_timeout(context=context))
        response.raise_for_status()
        revised_value = response.text
        LOGGER.debug('revised_value: %s (was: %s)', revised_value, value)
        if revised_value != value:
            apply_revised_value(node, revised_value)
    return extend_dict(data, {'content': etree.tostring(root)})
def configure_pipeline(p, opt, pipeline, config):
    get_default_output_file_for_source_file = get_output_file_for_source_file_fn(opt)
    file_list = get_remaining_file_list_for_args(opt)
    LOGGER.debug('file_list: %s', file_list)
    if not file_list:
        LOGGER.info('no files to process')
        return

    steps = pipeline.get_steps(config, opt)
    LOGGER.info('steps: %s', steps)

    input_urls = (
        p |
        beam.Create(file_list) |
        PreventFusion())

    input_data = (
        input_urls |
        ReadFileContent() |
        "Determine Type" >> beam.Map(lambda d: extend_dict(d, {
            DataProps.TYPE: mimetypes.guess_type(d[DataProps.SOURCE_FILENAME])[0]
        })))

    result = input_data
    for step in steps:
        LOGGER.debug('step: %s', step)
        result |= get_step_transform(step)

    _ = (  # noqa: F841
        result |
        "WriteOutput" >> TransformAndLog(
            beam.Map(lambda v: save_file_content(
                get_default_output_file_for_source_file(v[DataProps.SOURCE_FILENAME]),
                encode_if_text_type(v[DataProps.CONTENT]))),
            log_fn=lambda x: get_logger().info('saved output to: %s', x)))
def patch_preprocessing_pipeline(**kwargs):
    always_mock = {
        'find_file_pairs_grouped_by_parent_directory_or_name',
        'read_all_from_path',
        'pdf_bytes_to_png_pages',
        'convert_pdf_bytes_to_lxml',
        'convert_and_annotate_lxml_content',
        'svg_page_to_blockified_png_bytes',
        'save_svg_roots',
        'save_pages',
        'evaluate_document_by_page',
        'ReadDictCsv'
    }
    tfrecords_mock = Mock(name='tfrecords_mock')

    def DummyWritePropsToTFRecord(file_path, extract_props):
        return TransformAndLog(
            beam.Map(
                lambda v: tfrecords_mock(file_path, list(extract_props(v)))),
            log_fn=lambda x: get_logger().info('tfrecords: %s', x))

    with patch.multiple(
            PREPROCESSING_PIPELINE,
            WritePropsToTFRecord=DummyWritePropsToTFRecord,
            **{k: kwargs.get(k, DEFAULT) for k in always_mock}) as mocks:
        get_current_test_context().mocks = mocks
        mocks['read_all_from_path'].side_effect = fake_content
        mocks['convert_pdf_bytes_to_lxml'].side_effect = fake_lxml_for_pdf
        yield extend_dict(mocks, {'tfrecords': tfrecords_mock})
def configure_pipeline(p, opt):
    image_size = (
        (opt.image_width, opt.image_height)
        if opt.image_width and opt.image_height else None)
    page_range = opt.pages
    first_page = page_range[0] if page_range else 1
    xml_mapping = parse_xml_mapping(opt.xml_mapping_path)

    if opt.lxml_path:
        lxml_xml_file_pairs = (
            p |
            beam.Create([[
                join_if_relative_path(opt.base_data_path, s)
                for s in [opt.lxml_path, opt.xml_path]
            ]]) |
            "FindFilePairs" >> TransformAndLog(
                beam.FlatMap(lambda patterns: islice(
                    find_file_pairs_grouped_by_parent_directory_or_name(patterns),
                    opt.limit)),
                log_prefix='file pairs: ',
                log_level='debug') |
            PreventFusion() |
            "ReadFileContent" >> beam.Map(lambda filenames: {
                'source_filename': filenames[0],
                'xml_filename': filenames[1],
                'lxml_content': read_all_from_path(filenames[0]),
                'xml_content': read_all_from_path(filenames[1])
            }))
    elif opt.pdf_path or opt.pdf_xml_file_list:
        if opt.pdf_xml_file_list:
            pdf_xml_url_pairs = (
                p |
                "ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list, limit=opt.limit) |
                "TranslateFilePairUrls" >> beam.Map(
                    lambda row: (row['source_url'], row['xml_url'])))
        else:
            pdf_xml_url_pairs = (
                p |
                beam.Create([[
                    join_if_relative_path(opt.base_data_path, s)
                    for s in [opt.pdf_path, opt.xml_path]
                ]]) |
                "FindFilePairs" >> TransformAndLog(
                    beam.FlatMap(lambda patterns: islice(
                        find_file_pairs_grouped_by_parent_directory_or_name(patterns),
                        opt.limit)),
                    log_prefix='file pairs: ',
                    log_level='debug'))
        pdf_xml_file_pairs = (
            pdf_xml_url_pairs |
            PreventFusion() |
            "ReadFileContent" >> TransformAndCount(
                beam.Map(lambda filenames: {
                    'source_filename': filenames[0],
                    'xml_filename': filenames[1],
                    'pdf_content': read_all_from_path(filenames[0]),
                    'xml_content': read_all_from_path(filenames[1])
                }),
                MetricCounters.FILE_PAIR))
        lxml_xml_file_pairs = (
            pdf_xml_file_pairs |
            "ConvertPdfToLxml" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'lxml_content': convert_pdf_bytes_to_lxml(
                            v['pdf_content'],
                            path=v['source_filename'],
                            page_range=page_range)
                    }),
                    # we don't need the pdf_content unless we are writing tf_records
                    None if opt.save_tfrecords else {'pdf_content'}),
                log_fn=lambda e, v: (get_logger().warning(
                    'caught exception (ignoring item): %s, pdf: %s, xml: %s',
                    e, v['source_filename'], v['xml_filename'], exc_info=e)),
                error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))
    else:
        raise RuntimeError('either lxml-path or pdf-path required')

    if opt.save_png or opt.save_tfrecords:
        with_pdf_png_pages = (
            (lxml_xml_file_pairs if opt.save_tfrecords else pdf_xml_file_pairs) |
            "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'pdf_png_pages': list(pdf_bytes_to_png_pages(
                            v['pdf_content'],
                            dpi=opt.png_dpi,
                            image_size=image_size,
                            page_range=page_range))
                    }),
                    {'pdf_content'}  # we no longer need the pdf_content
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR))

        if opt.save_png:
            _ = (
                with_pdf_png_pages |
                "SavePdfToPng" >> TransformAndLog(
                    beam.Map(lambda v: save_pages(
                        FileSystems.join(
                            opt.output_path,
                            change_ext(
                                relative_path(opt.base_data_path, v['source_filename']),
                                None, '.png.zip')),
                        '.png',
                        v['pdf_png_pages'])),
                    log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.save_lxml:
        _ = (
            lxml_xml_file_pairs |
            "SaveLxml" >> TransformAndLog(
                beam.Map(lambda v: save_file_content(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v['source_filename']),
                            None, '.lxml.gz')),
                    v['lxml_content'])),
                log_fn=lambda x: get_logger().info('saved lxml: %s', x)))

    annotation_results = (
        (with_pdf_png_pages if opt.save_tfrecords else lxml_xml_file_pairs) |
        "ConvertLxmlToSvgAndAnnotate" >> TransformAndCount(
            MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'svg_pages': list(convert_and_annotate_lxml_content(
                            v['lxml_content'], v['xml_content'], xml_mapping,
                            name=v['source_filename']))
                    }),
                    # Won't need the XML anymore
                    {'lxml_content', 'xml_content'}),
                log_fn=lambda e, v: (get_logger().warning(
                    'caught exception (ignoring item): %s, source: %s, xml: %s',
                    e, v['source_filename'], v['xml_filename'], exc_info=e)),
                error_count=MetricCounters.CONVERT_LXML_TO_SVG_ANNOT_ERROR),
            MetricCounters.PAGE,
            lambda v: len(v['svg_pages'])))

    if opt.save_svg:
        _ = (
            annotation_results |
            "SaveSvgPages" >> TransformAndLog(
                beam.Map(lambda v: save_svg_roots(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v['source_filename']),
                            None, '.svg.zip')),
                    v['svg_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.annotation_evaluation_csv or opt.min_annotation_percentage:
        annotation_evaluation_results = (
            annotation_results |
            "EvaluateAnnotations" >> TransformAndLog(
                beam.Map(lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'annotation_evaluation': evaluate_document_by_page(
                            SvgStructuredDocument(v['svg_pages']))
                    }),
                    None if opt.min_annotation_percentage else {'svg_pages'})),
                log_fn=lambda x: get_logger().info(
                    'annotation evaluation result: %s: %s',
                    x['source_filename'], x['annotation_evaluation'])))

    if opt.save_block_png or opt.save_tfrecords:
        color_map = parse_color_map_from_file(opt.color_map)
        with_block_png_pages = (
            (annotation_evaluation_results
             if opt.min_annotation_percentage else annotation_results) |
            "GenerateBlockPng" >> beam.Map(lambda v: remove_keys_from_dict(
                extend_dict(v, {
                    'block_png_pages': [
                        svg_page_to_blockified_png_bytes(
                            svg_page, color_map, image_size=image_size)
                        for svg_page in v['svg_pages']
                    ]
                }),
                {'svg_pages'})))

        if opt.save_block_png:
            _ = (
                with_block_png_pages |
                "SaveBlockPng" >> TransformAndLog(
                    beam.Map(lambda v: save_pages(
                        FileSystems.join(
                            opt.output_path,
                            change_ext(
                                relative_path(opt.base_data_path, v['source_filename']),
                                None, '.block-png.zip')),
                        '.png',
                        v['block_png_pages'])),
                    log_fn=lambda x: get_logger().info('saved result: %s', x)))

        if opt.save_tfrecords:
            if opt.min_annotation_percentage:
                filtered_pages = (
                    with_block_png_pages |
                    "FilterPages" >> TransformAndCount(
                        beam.Map(lambda v: filter_list_props_by_indices(
                            v,
                            get_page_indices_with_min_annotation_percentage(
                                v['annotation_evaluation'],
                                opt.min_annotation_percentage),
                            {'pdf_png_pages', 'block_png_pages'})),
                        MetricCounters.FILTERED_PAGE,
                        lambda v: len(v['block_png_pages'])))
            else:
                filtered_pages = with_block_png_pages
            _ = (
                filtered_pages |
                "WriteTFRecords" >> WritePropsToTFRecord(
                    FileSystems.join(opt.output_path, 'data'),
                    lambda v: (
                        {
                            'input_uri': v['source_filename'] + '#page%d' % (first_page + i),
                            'input_image': pdf_png_page,
                            'annotation_uri': (
                                v['source_filename'] + '.annot' +
                                '#page%d' % (first_page + i)),
                            'annotation_image': block_png_page,
                            'page_no': first_page + i
                        }
                        for i, pdf_png_page, block_png_page in zip(
                            range(len(v['pdf_png_pages'])),
                            v['pdf_png_pages'],
                            v['block_png_pages']))))

    if opt.annotation_evaluation_csv:
        annotation_evaluation_csv_name, annotation_evaluation_ext = (
            os.path.splitext(opt.annotation_evaluation_csv))
        _ = (  # flake8: noqa
            annotation_evaluation_results |
            "FlattenAnotationEvaluationResults" >> beam.FlatMap(
                lambda v: to_annotation_evaluation_csv_dict_rows(
                    v['annotation_evaluation'],
                    document=basename(v['source_filename']))) |
            "WriteAnnotationEvaluationToCsv" >> WriteDictCsv(
                join_if_relative_path(opt.output_path, annotation_evaluation_csv_name),
                file_name_suffix=annotation_evaluation_ext,
                columns=DEFAULT_EVALUATION_COLUMNS))
assert class_weights_to_pos_weight({
    'a': 0.1,
    'b': 0.2,
    'c': 0.3
}, ['a', 'b'], True, DEFAULT_UNKNOWN_CLASS_WEIGHT) == (
    [0.1, 0.2, DEFAULT_UNKNOWN_CLASS_WEIGHT]
)


DEFAULT_ARGS = extend_dict(
    CORE_DEFAULT_ARGS,
    dict(
        pages=None,
        color_map=None,
        class_weights=None,
        channels=None,
        filter_annotated=False,
        use_separate_channels=False,
        use_unknown_class=False,
        debug=False
    )
)


def create_args(*args, **kwargs):
    d = extend_dict(*list(args) + [kwargs])
    return namedtuple('args', d.keys())(**d)


@pytest.mark.usefixtures(
    'parse_json_file_mock'
def test_should_preserve_other_input_props(self, config, args):
    result = _run_pipeline(config, args, extend_dict(
        _generate_content_with_title(TITLE_1),
        {'other': 'other1'}
    ))
    assert result['other'] == 'other1'
def test_should_parse_namespaces(self):
    opt = parse_args(_get_argv(
        extend_dict(DEFAULT_ARGS, {'namespaces': '{"xyz": "http://xyz"}'})))
    assert opt.namespaces == {'xyz': 'http://xyz'}
def add_read_pdfs_to_annotated_lxml_pipeline_steps(p, opt, get_pipeline_output_file):
    page_range = opt.pages
    cv_enabled = opt.cv_model_export_dir
    extract_tag_scope = None

    pdf_urls = p | PdfUrlSource(opt)

    lxml_content = (
        pdf_urls |
        PreventFusion() |
        ReadPdfContent() |
        "ConvertPdfToLxml" >> MapOrLog(
            lambda v: extend_dict(v, {
                DataProps.STRUCTURED_DOCUMENT: convert_pdf_bytes_to_structured_document(
                    v[DataProps.PDF_CONTENT],
                    path=v[DataProps.SOURCE_FILENAME],
                    page_range=page_range)
            }),
            log_fn=lambda e, v: (get_logger().warning(
                'caught exception (ignoring item): %s, pdf: %s',
                e, v[DataProps.SOURCE_FILENAME], exc_info=e)),
            error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))

    if cv_enabled:
        image_size = (
            (opt.image_width, opt.image_height)
            if opt.image_width and opt.image_height else None)
        inference_model_wrapper = InferenceModelWrapper(opt.cv_model_export_dir)
        cv_predictions = (
            lxml_content |
            "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        DataProps.PDF_PNG_PAGES: list(pdf_bytes_to_png_pages(
                            v[DataProps.PDF_CONTENT],
                            dpi=90,  # not used if the image is scaled
                            image_size=image_size,
                            page_range=page_range))
                    }),
                    keys_to_remove={DataProps.PDF_CONTENT}),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR) |
            "ComputerVisionPrediction" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        DataProps.CV_PREDICTION_PNG_PAGES: inference_model_wrapper(
                            v[DataProps.PDF_PNG_PAGES]),
                        DataProps.COLOR_MAP: inference_model_wrapper.get_color_map()
                    }),
                    keys_to_remove={DataProps.PDF_PNG_PAGES}),
                error_count=MetricCounters.CV_PREDICTION_ERROR))

        if opt.save_cv_output:
            _ = (
                cv_predictions |
                "SaveComputerVisionOutput" >> TransformAndLog(
                    beam.Map(lambda v: save_pages(
                        get_pipeline_output_file(
                            v[DataProps.SOURCE_FILENAME], OutputExt.CV_PNG),
                        '.png',
                        [
                            image_data_to_png(image_data)
                            for image_data in v[DataProps.CV_PREDICTION_PNG_PAGES]
                        ])),
                    log_fn=lambda x: get_logger().info('saved cv output: %s', x)))

        cv_annotated_lxml = (
            cv_predictions |
            "AnnotateLxmlUsingCvPrediction" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        DataProps.STRUCTURED_DOCUMENT: (
                            annotate_structured_document_using_predicted_image_data(
                                v[DataProps.STRUCTURED_DOCUMENT],
                                v[DataProps.CV_PREDICTION_PNG_PAGES],
                                v[DataProps.COLOR_MAP],
                                tag_scope=CV_TAG_SCOPE))
                    }),
                    keys_to_remove={DataProps.PDF_PNG_PAGES}),
                error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR))

        lxml_content = cv_annotated_lxml
        extract_tag_scope = CV_TAG_SCOPE

    if opt.crf_model:
        model = load_crf_model(opt.crf_model)
        crf_annotated_lxml = (
            lxml_content |
            "AnnotateLxmlUsingCrfPrediction" >> MapOrLog(
                lambda v: extend_dict(v, {
                    DataProps.STRUCTURED_DOCUMENT: predict_and_annotate_structured_document(
                        v[DataProps.STRUCTURED_DOCUMENT], model)
                }),
                error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR))

        lxml_content = crf_annotated_lxml
        extract_tag_scope = CRF_TAG_SCOPE

    if opt.save_annot_lxml:
        _ = (  # flake8: noqa
            lxml_content |
            "SaveAnnotLxml" >> TransformAndLog(
                beam.Map(lambda v: save_structured_document(
                    get_pipeline_output_file(
                        v[DataProps.SOURCE_FILENAME],
                        get_annot_lxml_ext(
                            crf_enabled=opt.crf_model, cv_enabled=cv_enabled)),
                    v[DataProps.STRUCTURED_DOCUMENT])),
                log_fn=lambda x: get_logger().info('saved annotated lxml to: %s', x)))

    return lxml_content, extract_tag_scope
def create_args(*args, **kwargs):
    d = extend_dict(*list(args) + [kwargs])
    return namedtuple('args', d.keys())(**d)
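# The dict helpers used throughout the snippets above are not defined in this
# listing. Below is a minimal sketch of the behaviour they are assumed to have
# (the actual project implementations may differ).
def extend_dict(*dicts, **kwargs):
    # Merge the given dicts (and any keyword overrides) into a new dict,
    # leaving the inputs untouched; later values win on key conflicts.
    result = {}
    for d in dicts:
        result.update(d)
    result.update(kwargs)
    return result


def remove_keys_from_dict(d, keys_to_remove=None):
    # Return a copy of d without the given keys; None/empty means "remove nothing".
    if not keys_to_remove:
        return d
    return {k: v for k, v in d.items() if k not in keys_to_remove}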