def test_issue1915():
    cfg = {"hidden_depth": 2}  # should error out
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.get_pipe("ner").add_label("answer")
    with pytest.raises(ValueError):
        nlp.begin_training(**cfg)

def add_maskedlm_pipe(nlp: Language):
    """Add a masked language-modeling head pipe to the pipeline."""
    wp = nlp.get_pipe(TRANSFORMERS_TOKENIZER)
    tokenizer = wp.model
    preprocessor = BertForMaskedLMPreprocessor(nlp.vocab, tokenizer)
    nlp.add_pipe(preprocessor, before=TRANSFORMERS_MODEL)
    bert = nlp.get_pipe(TRANSFORMERS_MODEL)
    config = bert.model.config
    model = BertOnlyMLMHead(config)
    pipe = BertForMaskedLM(nlp.vocab, model)
    nlp.add_pipe(pipe)

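# Usage sketch (the setup helper is hypothetical): add_maskedlm_pipe() assumes
# the TRANSFORMERS_TOKENIZER and TRANSFORMERS_MODEL pipes are already present,
# since both are looked up with nlp.get_pipe() before the MLM head is added.
nlp = create_bert_pipeline("bert-base-uncased")  # hypothetical factory
add_maskedlm_pipe(nlp)  # preprocessor goes before the model, MLM head last
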
def test_pipe_class_component_defaults():
    name = "test_class_component_defaults"

    @Language.factory(name)
    class Component:
        def __init__(
            self,
            nlp: Language,
            name: str,
            value1: StrictInt = 10,
            value2: StrictStr = "hello",
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    nlp.add_pipe(name)
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"

def test_pipe_class_component_model():
    name = "test_class_component_model"
    default_config = {
        "model": {
            "@architectures": "spacy.TextCatEnsemble.v2",
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
            "linear_model": {
                "@architectures": "spacy.TextCatBOW.v1",
                "exclusive_classes": False,
                "ngram_size": 1,
                "no_output_layer": False,
            },
        },
        "value1": 10,
    }

    @Language.factory(name, default_config=default_config)
    class Component:
        def __init__(self, nlp: Language, model: Model, name: str, value1: StrictInt):
            self.nlp = nlp
            self.model = model
            self.value1 = value1
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    nlp.add_pipe(name)
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 10
    assert isinstance(pipe.model, Model)

def test_simple_train():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    for i in range(5):
        for text, answer in [
            ("aaaa", 1.0),
            ("bbbb", 0),
            ("aa", 1.0),
            ("bbbbbbbbb", 0.0),
            ("aaaaaa", 1),
        ]:
            nlp.update([text], [{"cats": {"answer": answer}}])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5

def tagger():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("tagger"))
    tagger = nlp.get_pipe("tagger")
    # A model must be added for two reasons:
    # 1. having no model leads to an error during serialization,
    # 2. the affected line is the one for model serialization.
    tagger.begin_training(pipeline=nlp.pipeline)
    return tagger

def test_tagger_initialize_tag_map():
    """Test that Tagger.initialize() without gold tuples does not
    clobber the tag map."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    orig_tag_count = len(tagger.labels)
    tagger.add_label("A")
    nlp.initialize()
    assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)

def test_pipe_class_component_config():
    name = "test_class_component_config"

    @Language.factory(name)
    class Component:
        def __init__(
            self, nlp: Language, name: str, value1: StrictInt, value2: StrictStr
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = True
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @English.factory(name)
    class ComponentEN:
        def __init__(
            self, nlp: Language, name: str, value1: StrictInt, value2: StrictStr
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = False

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    with pytest.raises(ConfigValidationError):  # no config provided
        nlp.add_pipe(name)
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp.add_pipe(name, config={"value1": "10", "value2": "hello"})
    with pytest.warns(UserWarning):
        nlp.add_pipe(
            name, config={"value1": 10, "value2": "hello", "name": "wrong_name"}
        )
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"
    assert pipe.is_base is True
    assert pipe.name == name

    nlp_en = English()
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp_en.add_pipe(name, config={"value1": "10", "value2": "hello"})
    nlp_en.add_pipe(name, config={"value1": 10, "value2": "hello"})
    pipe = nlp_en.get_pipe(name)
    assert isinstance(pipe.nlp, English)
    assert pipe.value1 == 10
    assert pipe.value2 == "hello"
    assert pipe.is_base is False

def entity_linker():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("entity_linker"))
    entity_linker = nlp.get_pipe("entity_linker")
    # A model must be added for two reasons:
    # 1. having no model leads to an error during serialization,
    # 2. the affected line is the one for model serialization.
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    entity_linker.set_kb(kb)
    entity_linker.begin_training(pipeline=nlp.pipeline)
    return entity_linker

def test_tagger_begin_training_tag_map():
    """Test that Tagger.begin_training() without gold tuples does not
    clobber the tag map."""
    nlp = Language()
    tagger = nlp.create_pipe("tagger")
    orig_tag_count = len(tagger.labels)
    tagger.add_label("A", {"POS": "NOUN"})
    nlp.add_pipe(tagger)
    nlp.begin_training()
    assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN}
    assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)

def test_issue_3526_4(en_vocab):
    nlp = Language(vocab=en_vocab)
    patterns = [{"label": "ORG", "pattern": "Apple"}]
    config = {"overwrite_ents": True}
    ruler = nlp.add_pipe("entity_ruler", config=config)
    ruler.add_patterns(patterns)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True

def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True

def test_pipe_function_component():
    name = "test_component"

    @Language.component(name)
    def component(doc: Doc) -> Doc:
        return doc

    assert name in registry.factories
    nlp = Language()
    with pytest.raises(ValueError):
        nlp.add_pipe(component)
    nlp.add_pipe(name)
    assert name in nlp.pipe_names
    assert nlp.pipe_factories[name] == name
    assert Language.get_factory_meta(name)
    assert nlp.get_pipe_meta(name)
    pipe = nlp.get_pipe(name)
    assert pipe == component
    pipe = nlp.create_pipe(name)
    assert pipe == component

def test_pipe_class_component_init():
    name1 = "test_class_component1"
    name2 = "test_class_component2"

    @Language.factory(name1)
    class Component1:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    class Component2:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @Language.factory(name2)
    def factory(nlp: Language, name=name2):
        return Component2(nlp, name)

    nlp = Language()
    for name, Component in [(name1, Component1), (name2, Component2)]:
        assert name in registry.factories
        with pytest.raises(ValueError):
            nlp.add_pipe(Component(nlp, name))
        nlp.add_pipe(name)
        assert name in nlp.pipe_names
        assert nlp.pipe_factories[name] == name
        assert Language.get_factory_meta(name)
        assert nlp.get_pipe_meta(name)
        pipe = nlp.get_pipe(name)
        assert isinstance(pipe, Component)
        assert isinstance(pipe.nlp, Language)
        pipe = nlp.create_pipe(name)
        assert isinstance(pipe, Component)
        assert isinstance(pipe.nlp, Language)

def get_predictions(project: Project, model: Language = None, annotations=None):
    """Evaluate a label based on the project's model and gold annotations"""
    if model is None:
        model = get_model(project)
    tc = model.get_pipe("textcat")
    labels = {l.id: l.label for l in project.label_set.all()}
    gold = {}  # doc.id : {label: T/F, ..}
    if annotations is None:
        annotations = Annotation.objects.filter(
            document__gold=True, session__project=project
        )
    for a in annotations:
        gold.setdefault(a.document_id, {})[labels[a.label_id]] = a.accept
    docs = list(gold.keys())
    tokens = [get_tokens(model, doc) for doc in docs]
    for doc, result in zip(docs, tc.pipe(tokens)):
        for label, accept in gold[doc].items():
            predict = result.cats[label] > 0.5
            yield doc, label, accept, predict, result.cats[label]

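# A minimal sketch of consuming get_predictions(): aggregate the yielded
# (doc, label, accept, predict, score) tuples into per-label accuracy.
# label_accuracy() is a hypothetical helper, not part of the codebase.
from collections import Counter

def label_accuracy(project):
    hits, totals = Counter(), Counter()
    for doc_id, label, accept, predict, score in get_predictions(project):
        totals[label] += 1
        if bool(accept) == bool(predict):
            hits[label] += 1
    return {label: hits[label] / totals[label] for label in totals}
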
def test_pipe_class_component_model_custom():
    name = "test_class_component_model_custom"
    arch = f"{name}.arch"
    default_config = {"value1": 1, "model": {"@architectures": arch, "nO": 0, "nI": 0}}

    @Language.factory(name, default_config=default_config)
    class Component:
        def __init__(
            self, nlp: Language, model: Model, name: str, value1: StrictInt = 10
        ):
            self.nlp = nlp
            self.model = model
            self.value1 = value1
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @registry.architectures(arch)
    def make_custom_arch(nO: StrictInt, nI: StrictInt):
        return Linear(nO, nI)

    nlp = Language()
    config = {"value1": 20, "model": {"@architectures": arch, "nO": 1, "nI": 2}}
    nlp.add_pipe(name, config=config)
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 20
    assert isinstance(pipe.model, Model)
    assert pipe.model.name == "linear"

    nlp = Language()
    with pytest.raises(ConfigValidationError):
        config = {"value1": "20", "model": {"@architectures": arch, "nO": 1, "nI": 2}}
        nlp.add_pipe(name, config=config)
    with pytest.raises(ConfigValidationError):
        config = {"value1": 20, "model": {"@architectures": arch, "nO": 1.0, "nI": 2.0}}
        nlp.add_pipe(name, config=config)

def get_todo(session: Session, model: Language, n=10) -> OrderedDict:
    """Populate the queue of documents to code"""
    done = {
        a.document_id
        for a in Annotation.objects.filter(document__gold=False, label=session.label)
    }
    todo = Document.objects.filter(gold=False).exclude(pk__in=done)
    if session.query:
        todo = todo.filter(text__icontains=session.query)
    todo = list(todo.values_list("id", flat=True))
    logging.debug(
        "{ntodo} documents in todo (query: {q}, done={ndone})".format(
            ntodo=len(todo), ndone=len(done), q=session.query
        )
    )
    if len(todo) > settings.N_SAMPLE:
        todo = sample(todo, settings.N_SAMPLE)
    tc = model.get_pipe("textcat")
    tokens = [get_tokens(model, doc_id) for doc_id in todo]
    scores = [d.cats[session.label.label] for d in tc.pipe(tokens)]
    uncertainty = [abs(score - 0.5) for score in scores]
    index = list(argsort(uncertainty))[:n]
    return OrderedDict((todo[i], scores[i]) for i in index)

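# Minimal illustration of the uncertainty sampling used above: documents whose
# textcat score is closest to 0.5 are queued first. Pure-Python stand-in for
# the argsort call, with made-up doc ids and scores.
scores = {101: 0.93, 102: 0.48, 103: 0.07, 104: 0.55}
ranked = sorted(scores, key=lambda doc_id: abs(scores[doc_id] - 0.5))
assert ranked[:2] == [102, 104]  # the two most uncertain documents
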
def set_split_mode(nlp: Language, mode: str):
    if nlp.has_pipe("compound_splitter"):
        splitter = nlp.get_pipe("compound_splitter")
        splitter.split_mode = mode

def set_split_mode(nlp: Language, mode: str):
    splitter = nlp.get_pipe("CompoundSplitter")
    splitter.split_mode = mode

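# Usage sketch: the two set_split_mode() variants above differ in robustness.
# The first silently no-ops when no compound splitter is present; the second
# assumes a pipe registered as "CompoundSplitter" and raises KeyError if it is
# missing. Loading "ja_ginza" (assumed installed) provides such a pipe; the
# split modes follow SudachiPy's A/B/C granularity levels.
import spacy

nlp = spacy.load("ja_ginza")
set_split_mode(nlp, "C")  # prefer the coarsest segmentation
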
def get_predictions(nlp: Language, docs: List[dict]):
    from collections import Counter

    ner = nlp.get_pipe("ner")
    parses = list(nlp.pipe([t["text"] for t in docs]))
    beams = [
        ner.beam_parse([x], beam_width=16)[0]
        for x in tqdm(parses, desc="Predicting labels...")
    ]
    results = []
    for document, parse, beam in zip(docs, parses, beams):
        text = document["text"]
        entities = ner.moves.get_beam_annot(beam)
        # Keep at most one entry per (start, end) span; later candidates for an
        # already-seen span are skipped.
        words = Counter()
        start_end = {}
        for (estart, eend, etype), v in sorted(
            entities.items(), key=lambda x: (x[1], x[0])
        ):
            etype_str = parse.vocab.strings[etype]
            if (estart, eend) in start_end:
                print("Removing completely overlapping entry:", (estart, eend, etype_str))
                continue
            words[estart, eend, etype_str] = v
            start_end[estart, eend] = True
        words_items = sorted(words.items(), key=lambda x: (-x[1], x[0]))
        labels = []
        predicts = []
        unsure = 0.001
        max_per_type = Counter()
        for (estart, eend, etype), escore in words_items:
            # Convert token offsets to character offsets.
            # Invariant: parse[estart:eend].text should match text[cstart:cend]
            # modulo surrounding whitespace.
            cstart = parse[estart].idx
            if eend == len(parse):
                cend = len(text)
            else:
                cend = parse[eend].idx
            unsure += 0.5 - abs(escore - 0.5)
            if escore > 0.01:
                max_per_type[etype] += 1
                if max_per_type[etype] < 100:
                    labels.append((cstart, cend, etype))
                    predicts.append(
                        (cstart, cend, parse[estart:eend].text, etype, escore)
                    )
        results.append({
            "document": document,
            "labels": labels,
            "unsure": unsure / len(text),
            "predicts": predicts,
        })
    return results

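# Usage sketch with hypothetical input (assumes nlp is a loaded pipeline with
# an "ner" pipe): each result maps character-offset label spans back onto the
# source text, and "unsure" ranks documents for manual review.
example_docs = [{"text": "Apple is looking at buying a U.K. startup."}]
for result in get_predictions(nlp, example_docs):
    for cstart, cend, etype in result["labels"]:
        print(result["document"]["text"][cstart:cend], "->", etype)
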
def test_label_types():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    with pytest.raises(ValueError):
        nlp.get_pipe("textcat").add_label(9)