def test_should_merge_from_cv_tag_scope(self):
     """Check that merging keeps the default-scope tag and exposes the CV tag.

     The base document carries TAG_1 without a tag scope; the CV document
     carries TAG_2 under CV_TAG_SCOPE. After merging with
     ``cv_source_tag_scope=CV_TAG_SCOPE`` both tags must be retrievable from
     the merged document under their respective scopes.
     """
     base_token = SimpleToken(TEXT_1, tag_scope=None, tag=TAG_1)
     base_document = SimpleStructuredDocument(
         lines=[SimpleLine([base_token])]
     )

     cv_token = SimpleToken(TEXT_1, tag_scope=CV_TAG_SCOPE, tag=TAG_2)
     cv_document = SimpleStructuredDocument(
         lines=[SimpleLine([cv_token])]
     )

     merged_document = merge_with_cv_structured_document(
         base_document,
         cv_document,
         cv_source_tag_scope=CV_TAG_SCOPE
     )

     # Default scope still reports the tag of the base document.
     default_scope_tags = get_all_token_tags(merged_document)
     assert default_scope_tags == [TAG_1]

     # The CV scope reports the tag that came from the CV document.
     cv_scope_tags = get_all_token_tags(merged_document, scope=CV_TAG_SCOPE)
     assert cv_scope_tags == [TAG_2]
def load_and_convert_to_token_props(filename,
                                    cv_filename,
                                    cv_source_tag_scope,
                                    page_range=None):
    """Load a structured document and return its token props as a list.

    The document at ``filename`` is loaded with ``load_structured_document``.
    When ``cv_filename`` is truthy, a second document is loaded from it and
    merged into the first via ``merge_with_cv_structured_document``.

    Args:
        filename: location of the main structured document.
        cv_filename: location of the CV structured document; a falsy value
            skips the merge step.
        cv_source_tag_scope: forwarded to
            ``merge_with_cv_structured_document`` as ``cv_source_tag_scope``.
        page_range: forwarded as ``page_range`` to both load calls.

    Returns:
        A list built from ``structured_document_to_token_props``.

    Raises:
        RuntimeError: if any step fails; the message names ``filename`` and
            the original exception, which is chained via ``raise_from``.
    """
    try:
        structured_document = load_structured_document(filename,
                                                       page_range=page_range)
        if cv_filename:
            cv_structured_document = load_structured_document(
                cv_filename, page_range=page_range)
            structured_document = merge_with_cv_structured_document(
                structured_document,
                cv_structured_document,
                cv_source_tag_scope=cv_source_tag_scope)
        return list(structured_document_to_token_props(structured_document))
    # ``StandardError`` only exists on Python 2; on Python 3 referencing it
    # raises NameError and hides the real failure. ``Exception`` is its
    # parent on Python 2 and works on both versions.
    except Exception as e:
        raise_from(
            RuntimeError('failed to process %s (due to %s: %s)' %
                         (filename, type(e), e)), e)
def main(argv=None):
    """Run CRF prediction on an lxml structured document and save the result.

    Steps, in order: parse ``argv`` with ``parse_args``; switch the root
    logger to DEBUG when ``args.debug`` is set; load the document from
    ``args.lxml_path``; if ``args.cv_lxml_path`` is set, load that document
    too and merge it in; load the CRF model from ``args.crf_model``; annotate
    the document with the model using ``args.tag_scope``; write the document
    to ``args.output_path``.

    Args:
        argv: argument list handed to ``parse_args`` (``None`` by default).
    """
    args = parse_args(argv)

    if args.debug:
        root_logger = logging.getLogger()
        root_logger.setLevel('DEBUG')

    document = load_lxml_structured_document(args.lxml_path)

    cv_lxml_path = args.cv_lxml_path
    if cv_lxml_path:
        # Optional second document is merged into the main one before
        # prediction.
        cv_document = load_lxml_structured_document(cv_lxml_path)
        document = merge_with_cv_structured_document(document, cv_document)

    crf_model = load_crf_model(args.crf_model)

    predict_and_annotate_structured_document(
        document, crf_model, tag_scope=args.tag_scope
    )

    output_path = args.output_path
    get_logger().info('writing result to: %s', output_path)
    save_structured_document(output_path, document)