def __init__(
    self,
    model=None,
    model_path: str = None,
    entities_to_keep: List[str] = None,
    verbose: bool = False,
):
    """Evaluator wrapping a Flair ``SequenceTagger``.

    :param model: an already-loaded ``SequenceTagger`` instance
    :param model_path: path to load a ``SequenceTagger`` from when no
        ``model`` object is given
    :param entities_to_keep: entity types to restrict evaluation to
    :param verbose: forwarded to the base evaluator
    :raises ValueError: when neither ``model`` nor ``model_path`` is supplied
    """
    super().__init__(
        entities_to_keep=entities_to_keep,
        verbose=verbose,
    )

    # Prefer an explicit model object; fall back to loading from disk.
    if model is not None:
        self.model = model
    elif model_path is not None:
        self.model = SequenceTagger.load(model_path)
    else:
        raise ValueError(
            "Either model_path or model object must be supplied")

    self.spacy_tokenizer = SpacyTokenizer(
        model=spacy.load("en_core_web_lg"))
def build_spacy_tokenizer(model) -> Callable[[str], List[Token]]:
    """Return a ``text -> tokens`` callable backed by Flair's SpacyTokenizer.

    :param model: a spaCy language model used for tokenization
    :return: a callable mapping a string to a list of Flair ``Token`` objects
    """
    # Imported lazily so flair is only required when this factory is used.
    from flair.tokenization import SpacyTokenizer

    # The bound ``tokenize`` method already has the required signature,
    # so it can be handed back directly instead of wrapping it.
    return SpacyTokenizer(model).tokenize
def test_create_sentence_with_spacy_tokenizer():
    """A spaCy-tokenized Sentence should split 'I love Berlin.' into 4 tokens."""
    sentence: Sentence = Sentence(
        "I love Berlin.",
        use_tokenizer=SpacyTokenizer("en_core_sci_sm"))

    expected = ["I", "love", "Berlin", "."]
    assert len(sentence.tokens) == len(expected)
    # Check every token's surface form in order.
    for token, text in zip(sentence.tokens, expected):
        assert token.text == text
def predict():
    """Tag every ``*.txt`` file in TEST_FOLDER with the SI model.

    Reads each article, splits it into sentences, runs the tagger in
    mini-batches and writes the tagged sentences to a same-named file
    under ``../SI-predictions``.
    """
    model = SequenceTagger.load('../SI/SI_model.pt')

    # The splitter (and the spaCy pipeline it loads) is loop-invariant;
    # building it once avoids reloading 'en_core_web_sm' per input file.
    splitter = SegtokSentenceSplitter(
        tokenizer=SpacyTokenizer('en_core_web_sm'))

    files = sorted(glob.glob(os.path.join(TEST_FOLDER, '*.txt')))
    for file in files:
        filename = os.path.basename(file)
        # NOTE(review): files are opened with the platform-default encoding —
        # consider an explicit encoding='utf-8' if the corpus requires it.
        with open(file, 'r') as inputs, open(
                os.path.join('../SI-predictions', filename), 'w') as output:
            article = inputs.read()
            sentences = splitter.split(article)
            model.predict(sentences, mini_batch_size=16, verbose=True)
            for sentence in sentences:
                output.write(sentence.to_tagged_string())
def __init__(
    self,
    model=None,
    model_path: str = None,
    entities_to_keep: List[str] = None,
    verbose: bool = False,
    entity_mapping: Dict[str, str] = PRESIDIO_SPACY_ENTITIES,
):
    """Evaluator wrapping a Flair ``SequenceTagger``.

    :param model: an already-loaded ``SequenceTagger`` instance
    :param model_path: path to load a ``SequenceTagger`` from when no
        ``model`` object is given
    :param entities_to_keep: entity types to restrict evaluation to
    :param verbose: forwarded to the base evaluator
    :param entity_mapping: mapping from model entity names to spaCy ones
    :raises ValueError: when neither ``model`` nor ``model_path`` is supplied
    """
    super().__init__(
        entities_to_keep=entities_to_keep,
        verbose=verbose,
        entity_mapping=entity_mapping,
    )

    # Prefer an explicit model object; fall back to loading from disk.
    if model is not None:
        self.model = model
    elif model_path is not None:
        self.model = SequenceTagger.load(model_path)
    else:
        raise ValueError(
            "Either model_path or model object must be supplied")

    self.spacy_tokenizer = SpacyTokenizer(
        model=spacy.load("en_core_web_sm"))
s = re.sub("[\x00-\x1f\x7f\u0080-\u009f]+", '', s) # replace `NO-BREAK SPACE` return re.sub("\u00a0+", ' ', s) components = { "au": [ "house_name", "unit", "house_number", "street", "suburb", "city", "state", "postcode", "country" ], "jp": ["state", "city", "street", "house_number", "unit", "postcode", "country"] } tokenizers = { "au": SpacyTokenizer(spacy.load("en_core_web_sm")), "jp": SpacyTokenizer(spacy.load("ja_core_news_sm")) } embeddings = StackedEmbeddings( [FlairEmbeddings('multi-forward'), FlairEmbeddings('multi-backward')]) # stacked embedding is recommended but traditional embeddings are trained on single languages # embeddings = { # "au": StackedEmbeddings([ # WordEmbeddings('en'), # FlairEmbeddings('mix-forward'), # FlairEmbeddings('mix-backward'), # ]), # "jp": StackedEmbeddings([
def get_french_tokenizer():
    """Build a Flair ``SpacyTokenizer`` backed by a blank French pipeline."""
    pipeline = spacy.blank(name="fr")
    # Swap in the project-specific tokenizer before handing off to Flair.
    pipeline.tokenizer = get_tokenizer(model=pipeline)
    return SpacyTokenizer(pipeline)