def get_parser_udf(
    structural=True,  # structural information
    blacklist=None,  # ignore tag types, default: style, script
    flatten=None,  # flatten tag types, default: span, br
    language="en",
    lingual=True,  # lingual information
    strip=True,
    replacements=None,  # default: Unicode hyphen/dash/minus variants -> "-"
    tabular=True,  # tabular information
    visual=False,  # visual information
    pdf_path=None,
):
    """Return an instance of ParserUDF.

    Every argument is forwarded unchanged to ``ParserUDF`` under the same
    keyword name.

    ``blacklist``, ``flatten`` and ``replacements`` default to ``None``, which
    selects ``["style", "script"]``, ``["span", "br"]`` and a single rule that
    rewrites the characters U+2010..U+2014 and U+2212 to ``"-"`` respectively.
    A fresh list is built on every call so that separate ``ParserUDF``
    instances never share (and cannot accidentally mutate) one default list.

    :return: the newly constructed ``ParserUDF``.
    """
    # Resolve the defaults per call instead of in the signature: a list used
    # as a default value is created once and shared by every invocation.
    if blacklist is None:
        blacklist = ["style", "script"]
    if flatten is None:
        flatten = ["span", "br"]
    if replacements is None:
        replacements = [("[\u2010\u2011\u2012\u2013\u2014\u2212]", "-")]

    # Patch new_sessionmaker() under the namespace of fonduer.utils.udf
    # See more details in
    # https://docs.python.org/3/library/unittest.mock.html#where-to-patch
    # NOTE(review): presumably this lets the UDF be constructed without a
    # real database session -- confirm against fonduer.utils.udf.
    with patch("fonduer.utils.udf.new_sessionmaker", autospec=True):
        parser_udf = ParserUDF(
            structural=structural,
            blacklist=blacklist,
            flatten=flatten,
            lingual=lingual,
            strip=strip,
            replacements=replacements,
            tabular=tabular,
            visual=visual,
            pdf_path=pdf_path,
            language=language,
        )
    return parser_udf
def get_parser_udf(
    structural=True,  # structural information
    blacklist=None,  # ignore tag types, default: style, script
    flatten=None,  # flatten tag types, default: span, br
    language="en",
    lingual=True,  # lingual information
    lingual_parser=None,
    strip=True,
    replacements=None,  # default: Unicode hyphen/dash/minus variants -> "-"
    tabular=True,  # tabular information
    visual=False,  # visual information
    visual_parser=None,
):
    """Return an instance of ParserUDF.

    Every argument is forwarded unchanged to ``ParserUDF`` under the same
    keyword name.

    ``blacklist``, ``flatten`` and ``replacements`` default to ``None``, which
    selects ``["style", "script"]``, ``["span", "br"]`` and a single rule that
    rewrites the characters U+2010..U+2014 and U+2212 to ``"-"`` respectively.
    A fresh list is built on every call so that separate ``ParserUDF``
    instances never share (and cannot accidentally mutate) one default list.

    :return: the newly constructed ``ParserUDF``.
    """
    # Resolve the defaults per call instead of in the signature: a list used
    # as a default value is created once and shared by every invocation.
    if blacklist is None:
        blacklist = ["style", "script"]
    if flatten is None:
        flatten = ["span", "br"]
    if replacements is None:
        replacements = [("[\u2010\u2011\u2012\u2013\u2014\u2212]", "-")]

    parser_udf = ParserUDF(
        structural=structural,
        blacklist=blacklist,
        flatten=flatten,
        lingual=lingual,
        lingual_parser=lingual_parser,
        strip=strip,
        replacements=replacements,
        tabular=tabular,
        visual=visual,
        visual_parser=visual_parser,
        language=language,
    )
    return parser_udf
def _load_pyfunc(model_path: str) -> Any:
    """Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    Reads ``model.pkl`` from ``model_path``, takes the pickled
    ``fonduer_model`` object out of it and re-attaches the pieces that are
    stored next to it in the same pickle: the preprocessor, the parser and
    the mention/candidate extractor UDFs, and then either the Emmental
    artifacts or the label-model artifacts, depending on the model type
    recorded in the pyfunc flavor configuration.

    :param model_path: directory that contains ``model.pkl`` and the files
        read by the mention/candidate class loaders and the flavor
        configuration lookup.
    :return: the restored ``fonduer_model`` object.
    """
    # The mention/candidate classes are loaded before unpickling.
    # NOTE(review): presumably the pickle refers to these classes, so they
    # must exist first -- confirm before reordering.
    # Load mention_classes
    _load_mention_classes(model_path)
    # Load candidate_classes
    _load_candidate_classes(model_path)

    # Load a pickled model.
    # SECURITY: unpickling executes arbitrary code embedded in the file; only
    # load model directories that come from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with open(os.path.join(model_path, "model.pkl"), "rb") as model_file:
        model = pickle.load(model_file)

    fonduer_model = model["fonduer_model"]
    # The "preprosessor" spelling is the key stored in the pickle; it has to
    # stay as is to match the data on disk.
    fonduer_model.preprocessor = model["preprosessor"]
    fonduer_model.parser = ParserUDF(**model["parser"])
    fonduer_model.mention_extractor = MentionExtractorUDF(
        **model["mention_extractor"]
    )
    fonduer_model.candidate_extractor = CandidateExtractorUDF(
        **model["candidate_extractor"]
    )

    # Configure logging for Fonduer
    init_logging(log_dir="logs")

    pyfunc_conf = _get_flavor_configuration(
        model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME
    )
    candidate_classes = fonduer_model.candidate_extractor.candidate_classes

    # A flavor configuration without a model type is treated as "emmental".
    fonduer_model.model_type = pyfunc_conf.get(MODEL_TYPE, "emmental")
    if fonduer_model.model_type == "emmental":
        emmental.init()
        fonduer_model.featurizer = FeaturizerUDF(
            candidate_classes, FeatureExtractor()
        )
        fonduer_model.key_names = model["feature_keys"]
        fonduer_model.word2id = model["word2id"]

        # Load the emmental_model from the bytes stored in the pickle.
        # torch.load is pickle-based as well, so the same trust requirement
        # applies to this payload.
        buffer = BytesIO(model["emmental_model"])
        fonduer_model.emmental_model = torch.load(buffer)
    else:
        fonduer_model.labeler = LabelerUDF(candidate_classes)
        fonduer_model.key_names = model["labeler_keys"]
        fonduer_model.lfs = model["lfs"]

        # Rebuild each label model by restoring its saved attribute dict onto
        # a freshly constructed instance.
        fonduer_model.label_models = []
        for state_dict in model["label_models_state_dict"]:
            label_model = LabelModel()
            label_model.__dict__.update(state_dict)
            fonduer_model.label_models.append(label_model)
    return fonduer_model