Ejemplo n.º 1
0
    def __init__(
        self,
        model=None,
        model_path: str = None,
        entities_to_keep: List[str] = None,
        verbose: bool = False,
    ):
        """
        Set up an evaluator backed by a Flair sequence tagger.

        :param model: an already-loaded tagger object; stored as-is when given
        :param model_path: location handed to ``SequenceTagger.load`` when
            ``model`` is None
        :param entities_to_keep: forwarded unchanged to the parent constructor
        :param verbose: forwarded unchanged to the parent constructor
        :raises ValueError: if both ``model`` and ``model_path`` are None
        """
        super().__init__(entities_to_keep=entities_to_keep, verbose=verbose)

        # A tagger must come from somewhere: either the caller passes one in,
        # or it is loaded from the given path.
        if model is None and model_path is None:
            raise ValueError(
                "Either model_path or model object must be supplied")
        self.model = SequenceTagger.load(model_path) if model is None else model

        # Tokenizer wrapping the large English spaCy pipeline.
        nlp = spacy.load("en_core_web_lg")
        self.spacy_tokenizer = SpacyTokenizer(model=nlp)
Ejemplo n.º 2
0
def build_spacy_tokenizer(model) -> Callable[[str], List[Token]]:
    """Return a plain function that tokenizes text with a ``SpacyTokenizer``.

    :param model: passed straight to the ``SpacyTokenizer`` constructor
    :return: a callable taking a string and returning whatever the wrapped
        tokenizer's ``tokenize`` method returns for it
    """
    # Imported lazily so flair's tokenization module is only needed when a
    # spaCy-backed tokenizer is actually requested.
    from flair.tokenization import SpacyTokenizer

    wrapped = SpacyTokenizer(model)

    def tokenizer(text: str) -> List[Token]:
        tokens = wrapped.tokenize(text)
        return tokens

    return tokenizer
Ejemplo n.º 3
0
def test_create_sentence_with_spacy_tokenizer():
    """Check that a Sentence built with a SpacyTokenizer splits into 4 tokens."""
    expected_texts = ["I", "love", "Berlin", "."]
    spacy_tok = SpacyTokenizer("en_core_sci_sm")
    sentence: Sentence = Sentence("I love Berlin.", use_tokenizer=spacy_tok)

    # Length first, then each token's surface text in order.
    assert len(sentence.tokens) == len(expected_texts)
    for position, expected in enumerate(expected_texts):
        assert sentence.tokens[position].text == expected
Ejemplo n.º 4
0
def predict():
    """Tag every ``*.txt`` article in ``TEST_FOLDER`` and write the results.

    Loads the sequence tagger from ``../SI/SI_model.pt``, splits each article
    into sentences, runs prediction over them, and writes each sentence's
    ``to_tagged_string()`` output to a same-named file under
    ``../SI-predictions``. Files are processed in sorted path order.
    """
    output_dir = '../SI-predictions'
    model = SequenceTagger.load('../SI/SI_model.pt')
    files = sorted(glob.glob(os.path.join(TEST_FOLDER, '*.txt')))
    if not files:
        # Nothing to tag; skip loading the spaCy pipeline.
        return

    # The splitter does not depend on the file being processed, so build it
    # (and load its spaCy pipeline) once rather than once per article.
    splitter = SegtokSentenceSplitter(
        tokenizer=SpacyTokenizer('en_core_web_sm'))

    # Create the output directory up front so opening an output file cannot
    # fail merely because the directory is missing.
    os.makedirs(output_dir, exist_ok=True)

    for file in files:
        filename = os.path.basename(file)
        with open(file, 'r') as inputs, open(
                os.path.join(output_dir, filename), 'w') as output:
            article = inputs.read()
            sentences = splitter.split(article)
            model.predict(sentences, mini_batch_size=16, verbose=True)
            # NOTE(review): tagged strings are written back-to-back with no
            # separator between sentences - confirm this is intended.
            for sentence in sentences:
                output.write(sentence.to_tagged_string())
Ejemplo n.º 5
0
    def __init__(
        self,
        model=None,
        model_path: str = None,
        entities_to_keep: List[str] = None,
        verbose: bool = False,
        entity_mapping: Dict[str, str] = PRESIDIO_SPACY_ENTITIES,
    ):
        """
        Set up an evaluator backed by a Flair sequence tagger.

        :param model: an already-loaded tagger object; stored as-is when given
        :param model_path: location handed to ``SequenceTagger.load`` when
            ``model`` is None
        :param entities_to_keep: forwarded unchanged to the parent constructor
        :param verbose: forwarded unchanged to the parent constructor
        :param entity_mapping: forwarded unchanged to the parent constructor;
            defaults to ``PRESIDIO_SPACY_ENTITIES``
        :raises ValueError: if both ``model`` and ``model_path`` are None
        """
        super().__init__(
            entities_to_keep=entities_to_keep,
            verbose=verbose,
            entity_mapping=entity_mapping,
        )

        # Prefer a tagger handed in by the caller; otherwise load from disk.
        if model is not None:
            self.model = model
        elif model_path is not None:
            self.model = SequenceTagger.load(model_path)
        else:
            raise ValueError(
                "Either model_path or model object must be supplied")

        # Tokenizer wrapping the small English spaCy pipeline.
        nlp = spacy.load("en_core_web_sm")
        self.spacy_tokenizer = SpacyTokenizer(model=nlp)
Ejemplo n.º 6
0
    s = re.sub("[\x00-\x1f\x7f\u0080-\u009f]+", '', s)
    # replace `NO-BREAK SPACE`
    return re.sub("\u00a0+", ' ', s)


# Ordered field names per key ("au", "jp" - presumably country codes for
# address layouts; confirm against the code that consumes this table).
components = {
    "au": [
        "house_name",
        "unit",
        "house_number",
        "street",
        "suburb",
        "city",
        "state",
        "postcode",
        "country",
    ],
    "jp": [
        "state",
        "city",
        "street",
        "house_number",
        "unit",
        "postcode",
        "country",
    ],
}

# One SpacyTokenizer per key, each wrapping a freshly loaded spaCy pipeline.
tokenizers = {
    country: SpacyTokenizer(spacy.load(spacy_model))
    for country, spacy_model in (
        ("au", "en_core_web_sm"),
        ("jp", "ja_core_news_sm"),
    )
}

# Forward and backward 'multi' Flair embeddings combined into a single stack.
embeddings = StackedEmbeddings([
    FlairEmbeddings(lm_name)
    for lm_name in ('multi-forward', 'multi-backward')
])

# Stacked embeddings are recommended, but traditional embeddings are each trained on a single language.
# embeddings = {
#     "au": StackedEmbeddings([
#         WordEmbeddings('en'),
#         FlairEmbeddings('mix-forward'),
#         FlairEmbeddings('mix-backward'),
#     ]),
#     "jp": StackedEmbeddings([
Ejemplo n.º 7
0
def get_french_tokenizer():
    """Build a ``SpacyTokenizer`` around a blank French spaCy pipeline.

    The pipeline's tokenizer is replaced with whatever ``get_tokenizer``
    returns for that pipeline before it is wrapped.
    """
    french_pipeline = spacy.blank(name="fr")
    custom_tokenizer = get_tokenizer(model=french_pipeline)
    french_pipeline.tokenizer = custom_tokenizer
    return SpacyTokenizer(french_pipeline)