Beispiel #1
0
def get_parser_udf(
    structural=True,  # structural information
    blacklist=None,  # ignore tag types, default: style, script
    flatten=None,  # flatten tag types, default: span, br
    language="en",
    lingual=True,  # lingual information
    strip=True,
    replacements=None,  # default: map Unicode hyphen/dash/minus chars to "-"
    tabular=True,  # tabular information
    visual=False,  # visual information
    pdf_path=None,
):
    """Return an instance of ParserUDF.

    Every argument is forwarded to ``ParserUDF`` as a keyword argument of
    the same name.

    ``blacklist``, ``flatten`` and ``replacements`` default to ``None``; when
    left as ``None`` a fresh list holding the documented default is built on
    each call, so no list object is shared between calls (or between the
    ParserUDF instances created from them).

    :param structural: forwarded to ``ParserUDF(structural=...)``.
    :param blacklist: tag types to ignore; ``None`` means
        ``["style", "script"]``.
    :param flatten: tag types to flatten; ``None`` means ``["span", "br"]``.
    :param language: forwarded to ``ParserUDF(language=...)``.
    :param lingual: forwarded to ``ParserUDF(lingual=...)``.
    :param strip: forwarded to ``ParserUDF(strip=...)``.
    :param replacements: list of ``(pattern, replacement)`` pairs; ``None``
        means a single pair replacing the characters U+2010..U+2014 and
        U+2212 with ``"-"``.
    :param tabular: forwarded to ``ParserUDF(tabular=...)``.
    :param visual: forwarded to ``ParserUDF(visual=...)``.
    :param pdf_path: forwarded to ``ParserUDF(pdf_path=...)``.
    :return: the constructed ``ParserUDF`` instance.
    """
    # Build the list defaults per call instead of in the signature: a list
    # literal in the signature is created once and shared by every call.
    if blacklist is None:
        blacklist = ["style", "script"]
    if flatten is None:
        flatten = ["span", "br"]
    if replacements is None:
        replacements = [("[\u2010\u2011\u2012\u2013\u2014\u2212]", "-")]

    # Patch new_sessionmaker() under the namespace of fonduer.utils.udf
    # (the place where it is looked up, not where it is defined).
    # See more details in
    # https://docs.python.org/3/library/unittest.mock.html#where-to-patch
    # The patch is only active while ParserUDF is being constructed.
    with patch("fonduer.utils.udf.new_sessionmaker", autospec=True):
        parser_udf = ParserUDF(
            structural=structural,
            blacklist=blacklist,
            flatten=flatten,
            lingual=lingual,
            strip=strip,
            replacements=replacements,
            tabular=tabular,
            visual=visual,
            pdf_path=pdf_path,
            language=language,
        )
    return parser_udf
Beispiel #2
0
def get_parser_udf(
    structural=True,  # structural information
    blacklist=None,  # ignore tag types, default: style, script
    flatten=None,  # flatten tag types, default: span, br
    language="en",
    lingual=True,  # lingual information
    lingual_parser=None,
    strip=True,
    replacements=None,  # default: map Unicode hyphen/dash/minus chars to "-"
    tabular=True,  # tabular information
    visual=False,  # visual information
    visual_parser=None,
):
    """Return an instance of ParserUDF.

    Every argument is forwarded to ``ParserUDF`` as a keyword argument of
    the same name.

    ``blacklist``, ``flatten`` and ``replacements`` default to ``None``; when
    left as ``None`` a fresh list holding the documented default is built on
    each call, so no list object is shared between calls (or between the
    ParserUDF instances created from them).

    :param structural: forwarded to ``ParserUDF(structural=...)``.
    :param blacklist: tag types to ignore; ``None`` means
        ``["style", "script"]``.
    :param flatten: tag types to flatten; ``None`` means ``["span", "br"]``.
    :param language: forwarded to ``ParserUDF(language=...)``.
    :param lingual: forwarded to ``ParserUDF(lingual=...)``.
    :param lingual_parser: forwarded to ``ParserUDF(lingual_parser=...)``.
    :param strip: forwarded to ``ParserUDF(strip=...)``.
    :param replacements: list of ``(pattern, replacement)`` pairs; ``None``
        means a single pair replacing the characters U+2010..U+2014 and
        U+2212 with ``"-"``.
    :param tabular: forwarded to ``ParserUDF(tabular=...)``.
    :param visual: forwarded to ``ParserUDF(visual=...)``.
    :param visual_parser: forwarded to ``ParserUDF(visual_parser=...)``.
    :return: the constructed ``ParserUDF`` instance.
    """
    # Build the list defaults per call instead of in the signature: a list
    # literal in the signature is created once and shared by every call.
    if blacklist is None:
        blacklist = ["style", "script"]
    if flatten is None:
        flatten = ["span", "br"]
    if replacements is None:
        replacements = [("[\u2010\u2011\u2012\u2013\u2014\u2212]", "-")]

    parser_udf = ParserUDF(
        structural=structural,
        blacklist=blacklist,
        flatten=flatten,
        lingual=lingual,
        lingual_parser=lingual_parser,
        strip=strip,
        replacements=replacements,
        tabular=tabular,
        visual=visual,
        visual_parser=visual_parser,
        language=language,
    )
    return parser_udf
Beispiel #3
0
def _load_pyfunc(model_path: str) -> Any:
    """Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    Reads ``model.pkl`` from ``model_path``, takes the ``"fonduer_model"``
    object stored in it and re-attaches the components that are kept as
    separate entries of the pickled dict (preprocessor, parser, mention and
    candidate extractors, and either the Emmental model or the label models,
    depending on the configured model type).

    :param model_path: directory that contains ``model.pkl`` and the flavor
        configuration read by ``_get_flavor_configuration``.
    :return: the restored ``fonduer_model`` object.
    """

    # Load mention_classes
    _load_mention_classes(model_path)
    # Load candidate_classes
    _load_candidate_classes(model_path)
    # Load a pickled model. The file is opened in a ``with`` block so the
    # handle is closed even if unpickling raises.
    # NOTE: unpickling can execute arbitrary code; ``model_path`` must point
    # to a trusted model artifact.
    with open(os.path.join(model_path, "model.pkl"), "rb") as f:
        model = pickle.load(f)
    fonduer_model = model["fonduer_model"]
    # NOTE(review): the key is spelled "preprosessor" in the pickled dict;
    # it has to match whatever the saving code wrote, so it is kept as is.
    fonduer_model.preprocessor = model["preprosessor"]
    # The UDFs are stored as keyword-argument dicts and rebuilt here.
    fonduer_model.parser = ParserUDF(**model["parser"])
    fonduer_model.mention_extractor = MentionExtractorUDF(
        **model["mention_extractor"])
    fonduer_model.candidate_extractor = CandidateExtractorUDF(
        **model["candidate_extractor"])

    # Configure logging for Fonduer
    init_logging(log_dir="logs")

    pyfunc_conf = _get_flavor_configuration(model_path=model_path,
                                            flavor_name=pyfunc.FLAVOR_NAME)
    candidate_classes = fonduer_model.candidate_extractor.candidate_classes

    # Fall back to "emmental" when the flavor config has no model type.
    fonduer_model.model_type = pyfunc_conf.get(MODEL_TYPE, "emmental")
    if fonduer_model.model_type == "emmental":
        emmental.init()
        fonduer_model.featurizer = FeaturizerUDF(candidate_classes,
                                                 FeatureExtractor())
        fonduer_model.key_names = model["feature_keys"]
        fonduer_model.word2id = model["word2id"]

        # Load the emmental_model from the bytes stored in the pickle.
        # BytesIO(data) starts positioned at offset 0, ready for reading.
        buffer = BytesIO(model["emmental_model"])
        fonduer_model.emmental_model = torch.load(buffer)
    else:
        fonduer_model.labeler = LabelerUDF(candidate_classes)
        fonduer_model.key_names = model["labeler_keys"]

        fonduer_model.lfs = model["lfs"]

        # Rebuild each label model by copying its saved attribute dict onto
        # a fresh LabelModel instance.
        fonduer_model.label_models = []
        for state_dict in model["label_models_state_dict"]:
            label_model = LabelModel()
            label_model.__dict__.update(state_dict)
            fonduer_model.label_models.append(label_model)
    return fonduer_model