Example #1
import asyncio
import logging

import aioredis
import ujson
from transformers import (AutoModelForTokenClassification, AutoTokenizer,
                          TokenClassificationPipeline)


async def task():
    tokenizer = AutoTokenizer.from_pretrained(
        'Alaeddin/convbert-base-turkish-ner-cased')
    model = AutoModelForTokenClassification.from_pretrained(
        'Alaeddin/convbert-base-turkish-ner-cased')
    ner = TokenClassificationPipeline(model=model,
                                      tokenizer=tokenizer,
                                      grouped_entities=True)

    queue = await aioredis.create_redis_pool(
        "redis://redis:6379/0?encoding=utf-8")
    logging.warning("Connected to Redis")

    logging.warning("NER task is running asynchronously...")
    while True:
        # Atomically read up to 8 queued requests and trim them off the list.
        pipe = queue.pipeline()
        pipe.lrange("ner", 0, 7)
        pipe.ltrim("ner", 8, -1)
        requests, _ = await pipe.execute()

        for r in requests:
            r = ujson.loads(r)
            results = ner(r["text"])
            # Cast offsets to plain ints (they may be numpy integers) so ujson can serialize them.
            for i in range(len(results)):
                results[i]['start'] = int(results[i]['start'])
                results[i]['end'] = int(results[i]['end'])

            await queue.set(r["id"], ujson.dumps(results))

        await asyncio.sleep(0.1)  # yield to the event loop between polls
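For context, a producer for this worker could look roughly like the sketch below. The request format (a JSON object with "id" and "text" pushed onto the "ner" list, with the result stored under the request id) follows the worker above; the function name, the uuid-based id, and the polling interval are illustrative assumptions.

# Hypothetical client for the worker above (not part of the original example):
# push a request onto the "ner" list, then poll until the result appears under its id.
import asyncio
import uuid

import aioredis
import ujson


async def submit(text: str):
    redis = await aioredis.create_redis_pool(
        "redis://redis:6379/0?encoding=utf-8")
    req_id = str(uuid.uuid4())  # illustrative request id
    await redis.rpush("ner", ujson.dumps({"id": req_id, "text": text}))
    while True:
        result = await redis.get(req_id)
        if result is not None:
            return ujson.loads(result)
        await asyncio.sleep(0.1)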
Example #2
def pipeline(self, text: str):
    # TODO: this may need sentencizing (splitting the input into sentences first)
    # Run on CPU (-1); set a different device id here if a GPU is available.
    device = -1 if self.device.type == "cpu" else 0
    nlp = TokenClassificationPipeline(model=self.model,
                                      tokenizer=self.tokenizer,
                                      task="ner",
                                      device=device)
    res = nlp(inputs=text)
    return res
Example #3
def __init__(self):
    self.tokenizer = AutoTokenizer.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.model = AutoModelForTokenClassification.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.config = PretrainedConfig.from_pretrained(
        "Alaeddin/convbert-base-turkish-ner-cased")
    self.pipeline = pipeline('ner',
                             model=self.model,
                             tokenizer=self.tokenizer,
                             config=self.config)
    self.nlp = spacy.load("en_core_web_sm")
    self.nlp_grouped = TokenClassificationPipeline(
        model=self.model, tokenizer=self.tokenizer, grouped_entities=True)
Example #4
    def run_pipeline_test(self, model, tokenizer, feature_extractor):
        token_classifier = TokenClassificationPipeline(model=model,
                                                       tokenizer=tokenizer)

        outputs = token_classifier("A simple string")
        self.assertIsInstance(outputs, list)
        n = len(outputs)
        self.assertEqual(
            nested_simplify(outputs),
            [{
                "entity": ANY(str),
                "score": ANY(float),
                "start": ANY(int),
                "end": ANY(int),
                "index": ANY(int),
                "word": ANY(str),
            } for i in range(n)],
        )
        outputs = token_classifier(
            ["list of strings", "A simple string that is quite a bit longer"])
        self.assertIsInstance(outputs, list)
        self.assertEqual(len(outputs), 2)
        n = len(outputs[0])
        m = len(outputs[1])

        self.assertEqual(
            nested_simplify(outputs),
            [
                [{
                    "entity": ANY(str),
                    "score": ANY(float),
                    "start": ANY(int),
                    "end": ANY(int),
                    "index": ANY(int),
                    "word": ANY(str),
                } for i in range(n)],
                [{
                    "entity": ANY(str),
                    "score": ANY(float),
                    "start": ANY(int),
                    "end": ANY(int),
                    "index": ANY(int),
                    "word": ANY(str),
                } for i in range(m)],
            ],
        )

        self.run_aggregation_strategy(model, tokenizer)
Example #5
def create_pipeline(model_name):
    config = AutoConfig.from_pretrained(model_name)

    tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            use_fast=True,
            return_offsets_mapping=True
        )

    model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            config=config
        )
    NER_pipeline = TokenClassificationPipeline(model=model,
                                               tokenizer=tokenizer,
                                               framework='pt',
                                               task='ner',
                                               grouped_entities=True)

    return NER_pipeline
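A minimal usage sketch of create_pipeline, assuming a fine-tuned NER checkpoint such as dslim/bert-base-NER (the model name is illustrative, not taken from the example above):

# Hypothetical call to create_pipeline(); the checkpoint name is only an example.
ner = create_pipeline("dslim/bert-base-NER")
print(ner("Hugging Face is based in New York City."))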
Example #6
    def __init__(self,
                 model_type: str = "BERT",
                 model_name: str = "dslim/bert-base-NER"):

        self.adaptor = get_adaptor(model_type)

        model = AutoModelForTokenClassification.from_pretrained(model_name)

        super().__init__(model_type, model_name, model)

        device_number = detect_cuda_device_number()

        self._pipeline = TokenClassificationPipeline(model=self.model,
                                                     tokenizer=self.tokenizer,
                                                     device=device_number)

        self._trainer = TOCTrainer(self.model, model_type, self.tokenizer,
                                   self._device, self.logger)
Example #7
from transformers import ElectraForTokenClassification, TokenClassificationPipeline
from tokenization_kocharelectra import KoCharElectraTokenizer
from pprint import pprint

tokenizer = KoCharElectraTokenizer.from_pretrained(
    "monologg/kocharelectra-base-kmounlp-ner")
model = ElectraForTokenClassification.from_pretrained(
    "monologg/kocharelectra-base-kmounlp-ner")

ner = TokenClassificationPipeline(model=model,
                                  tokenizer=tokenizer,
                                  ignore_labels=["O"],
                                  grouped_entities=True,
                                  device=-1)

# The Korean sample sentence translates roughly to: "President Moon Jae-in attended
# the 'Deview 2019' event held at COEX in Seoul on the 28th, encouraged young
# developers, and presented the government's basic plan for artificial intelligence.
# Source: Media Today (http://www.mediatoday.co.kr)"
pprint(
    ner("문재인 대통령은 28일 서울 코엑스에서 열린 ‘데뷰 (Deview) 2019’ 행사에 참석해 젊은 개발자들을 격려하면서 우리 정부의 인공지능 기본구상을 내놓았다. 출처 : 미디어오늘 (http://www.mediatoday.co.kr)"))
Example #8
    def run_aggregation_strategy(self, model, tokenizer):
        token_classifier = TokenClassificationPipeline(
            model=model, tokenizer=tokenizer, aggregation_strategy="simple")
        self.assertEqual(token_classifier.aggregation_strategy,
                         AggregationStrategy.SIMPLE)
        outputs = token_classifier("A simple string")
        self.assertIsInstance(outputs, list)
        n = len(outputs)
        self.assertEqual(
            nested_simplify(outputs),
            [{
                "entity_group": ANY(str),
                "score": ANY(float),
                "start": ANY(int),
                "end": ANY(int),
                "word": ANY(str),
            } for i in range(n)],
        )

        token_classifier = TokenClassificationPipeline(
            model=model, tokenizer=tokenizer, aggregation_strategy="first")
        self.assertEqual(token_classifier.aggregation_strategy,
                         AggregationStrategy.FIRST)
        outputs = token_classifier("A simple string")
        self.assertIsInstance(outputs, list)
        n = len(outputs)
        self.assertEqual(
            nested_simplify(outputs),
            [{
                "entity_group": ANY(str),
                "score": ANY(float),
                "start": ANY(int),
                "end": ANY(int),
                "word": ANY(str),
            } for i in range(n)],
        )

        token_classifier = TokenClassificationPipeline(
            model=model, tokenizer=tokenizer, aggregation_strategy="max")
        self.assertEqual(token_classifier.aggregation_strategy,
                         AggregationStrategy.MAX)
        outputs = token_classifier("A simple string")
        self.assertIsInstance(outputs, list)
        n = len(outputs)
        self.assertEqual(
            nested_simplify(outputs),
            [{
                "entity_group": ANY(str),
                "score": ANY(float),
                "start": ANY(int),
                "end": ANY(int),
                "word": ANY(str),
            } for i in range(n)],
        )

        token_classifier = TokenClassificationPipeline(
            model=model, tokenizer=tokenizer, aggregation_strategy="average")
        self.assertEqual(token_classifier.aggregation_strategy,
                         AggregationStrategy.AVERAGE)
        outputs = token_classifier("A simple string")
        self.assertIsInstance(outputs, list)
        n = len(outputs)
        self.assertEqual(
            nested_simplify(outputs),
            [{
                "entity_group": ANY(str),
                "score": ANY(float),
                "start": ANY(int),
                "end": ANY(int),
                "word": ANY(str),
            } for i in range(n)],
        )

        with self.assertWarns(UserWarning):
            token_classifier = pipeline(task="ner",
                                        model=model,
                                        tokenizer=tokenizer,
                                        grouped_entities=True)
        self.assertEqual(token_classifier.aggregation_strategy,
                         AggregationStrategy.SIMPLE)
        with self.assertWarns(UserWarning):
            token_classifier = pipeline(task="ner",
                                        model=model,
                                        tokenizer=tokenizer,
                                        grouped_entities=True,
                                        ignore_subwords=True)
        self.assertEqual(token_classifier.aggregation_strategy,
                         AggregationStrategy.FIRST)
Example #9
def get_test_pipeline(self, model, tokenizer, feature_extractor):
    token_classifier = TokenClassificationPipeline(model=model,
                                                   tokenizer=tokenizer)
    return token_classifier, [
        "A simple string", "A simple string that is quite a bit longer"
    ]
Example #10
# news['entities'] = news.Article.apply(get_entities)



# huggingface NER pipeline
import pandas as pd
from transformers import TokenClassificationPipeline, TFAutoModelForTokenClassification, AutoTokenizer
# from transformers import pipeline
# nlp = pipeline('ner')
# print(pd.DataFrame(nlp(sentence))) # gives out tokens and labes

sentence = 'Apple and Microsoft plan to form a joint venture for the development of cloud-based computing ' \
           'infrastructure.'

## DistilBERT tokenizer and token classification
# Note: 'distilbert-base-cased' is a base checkpoint without a fine-tuned NER head,
# so the token-classification head is randomly initialized here.
nlp = TokenClassificationPipeline(
    model=TFAutoModelForTokenClassification.from_pretrained('distilbert-base-cased'),
    tokenizer=AutoTokenizer.from_pretrained('distilbert-base-cased'),
    framework='tf')
print(pd.DataFrame(nlp(sentence)))


from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
import tensorflow as tf
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids)))  # Batch size 1 (built but not passed to the model below)
print(model(input_ids))

import numpy as np
from transformers import AutoTokenizer, pipeline, TFDistilBertModel
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')