コード例 #1
0
ファイル: test_issue1501-2000.py プロジェクト: spacy-io/spaCy
def test_issue1915():
    """Regression test: begin_training must reject unsupported config keys."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.get_pipe("ner").add_label("answer")
    # "hidden_depth" is not a valid training option, so this should error out.
    bad_cfg = {"hidden_depth": 2}
    with pytest.raises(ValueError):
        nlp.begin_training(**bad_cfg)
コード例 #2
0
def test_issue1915():
    """begin_training should raise on unknown keyword options."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.get_pipe("ner").add_label("answer")
    with pytest.raises(ValueError):
        # "hidden_depth" is not an accepted option, so this should error out.
        nlp.begin_training(hidden_depth=2)
コード例 #3
0
def add_maskedlm_pipe(nlp: Language):
    """Insert a masked-LM preprocessor and prediction head into the pipeline.

    Assumes `nlp` already contains the transformers tokenizer and model pipes.
    """
    # Reuse the wordpiece tokenizer wrapped by the existing tokenizer pipe.
    tokenizer = nlp.get_pipe(TRANSFORMERS_TOKENIZER).model
    nlp.add_pipe(
        BertForMaskedLMPreprocessor(nlp.vocab, tokenizer),
        before=TRANSFORMERS_MODEL,
    )
    # Build the MLM head from the existing BERT model's own configuration.
    bert_config = nlp.get_pipe(TRANSFORMERS_MODEL).model.config
    head = BertOnlyMLMHead(bert_config)
    nlp.add_pipe(BertForMaskedLM(nlp.vocab, head))
コード例 #4
0
def test_pipe_class_component_defaults():
    """Class factories should fall back to the defaults declared in __init__."""
    name = "test_class_component_defaults"

    @Language.factory(name)
    class DefaultedComponent:
        def __init__(
            self,
            nlp: Language,
            name: str,
            value1: StrictInt = 10,
            value2: StrictStr = "hello",
        ):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    # No config supplied, so the declared defaults must apply.
    nlp.add_pipe(name)
    component = nlp.get_pipe(name)
    assert isinstance(component.nlp, Language)
    assert component.value1 == 10
    assert component.value2 == "hello"
コード例 #5
0
def test_pipe_class_component_model():
    """A class factory can receive a model built from its default config."""
    name = "test_class_component_model"
    default_config = {
        "model": {
            "@architectures": "spacy.TextCatEnsemble.v2",
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
            "linear_model": {
                "@architectures": "spacy.TextCatBOW.v1",
                "exclusive_classes": False,
                "ngram_size": 1,
                "no_output_layer": False,
            },
        },
        "value1": 10,
    }

    @Language.factory(name, default_config=default_config)
    class ModelComponent:
        def __init__(self, nlp: Language, model: Model, name: str,
                     value1: StrictInt):
            self.nlp = nlp
            self.model = model
            self.value1 = value1
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    nlp.add_pipe(name)
    component = nlp.get_pipe(name)
    # The default config was resolved into a real Model plus the scalar value.
    assert isinstance(component.nlp, Language)
    assert component.value1 == 10
    assert isinstance(component.model, Model)
コード例 #6
0
def test_simple_train():
    """Train the textcat on a toy task and check it learned the label."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    # Toy data: strings of "a" are positive, strings of "b" negative.
    examples = [
        ("aaaa", 1.0),
        ("bbbb", 0),
        ("aa", 1.0),
        ("bbbbbbbbb", 0.0),
        ("aaaaaa", 1),
    ]
    for _ in range(5):
        for text, answer in examples:
            nlp.update([text], [{"cats": {"answer": answer}}])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
コード例 #7
0
ファイル: test_textcat.py プロジェクト: spacy-io/spaCy
def test_simple_train():
    """Fit the textcat on a tiny dataset and verify the label is learned."""
    train_data = [
        ("aaaa", 1.0),
        ("bbbb", 0),
        ("aa", 1.0),
        ("bbbbbbbbb", 0.0),
        ("aaaaaa", 1),
    ]
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    nlp.get_pipe("textcat").add_label("answer")
    nlp.begin_training()
    # Five passes over the data are enough for this separable toy problem.
    for epoch in range(5):
        for text, answer in train_data:
            nlp.update([text], [{"cats": {"answer": answer}}])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
コード例 #8
0
def tagger():
    """Build and return a tagger pipe with an attached (trained) model."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("tagger"))
    pipe = nlp.get_pipe("tagger")
    # Training attaches a model, which is needed for two reasons:
    # 1. no model leads to an error in serialization,
    # 2. the affected line is the one for model serialization.
    pipe.begin_training(pipeline=nlp.pipeline)
    return pipe
コード例 #9
0
def test_tagger_initialize_tag_map():
    """Test that Tagger.initialize() without gold tuples does not clobber
    the tag map."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    labels_before = len(tagger.labels)
    tagger.add_label("A")
    nlp.initialize()
    # Only the explicitly added label should be new after initialization.
    assert len(nlp.get_pipe("tagger").labels) == labels_before + 1
コード例 #10
0
ファイル: test_pipe_factories.py プロジェクト: svlandeg/spaCy
def test_pipe_class_component_config():
    """Class factories must validate their config and warn on reserved keys."""
    name = "test_class_component_config"

    @Language.factory(name)
    class BaseComponent:
        def __init__(self, nlp: Language, name: str, value1: StrictInt,
                     value2: StrictStr):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = True
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @English.factory(name)
    class EnglishComponent:
        def __init__(self, nlp: Language, name: str, value1: StrictInt,
                     value2: StrictStr):
            self.nlp = nlp
            self.value1 = value1
            self.value2 = value2
            self.is_base = False

        def __call__(self, doc: Doc) -> Doc:
            return doc

    nlp = Language()
    # Required values missing entirely: no config provided.
    with pytest.raises(ConfigValidationError):
        nlp.add_pipe(name)
    # Invalid config: value1 must be a strict int, not a string.
    with pytest.raises(ConfigValidationError):
        nlp.add_pipe(name, config={"value1": "10", "value2": "hello"})
    # "name" is reserved, so supplying it in the config only warns.
    with pytest.warns(UserWarning):
        nlp.add_pipe(name,
                     config={
                         "value1": 10,
                         "value2": "hello",
                         "name": "wrong_name"
                     })
    base_pipe = nlp.get_pipe(name)
    assert isinstance(base_pipe.nlp, Language)
    assert base_pipe.value1 == 10
    assert base_pipe.value2 == "hello"
    assert base_pipe.is_base is True
    assert base_pipe.name == name

    # On an English pipeline the English-specific factory takes precedence.
    nlp_en = English()
    with pytest.raises(ConfigValidationError):  # invalid config
        nlp_en.add_pipe(name, config={"value1": "10", "value2": "hello"})
    nlp_en.add_pipe(name, config={"value1": 10, "value2": "hello"})
    en_pipe = nlp_en.get_pipe(name)
    assert isinstance(en_pipe.nlp, English)
    assert en_pipe.value1 == 10
    assert en_pipe.value2 == "hello"
    assert en_pipe.is_base is False
コード例 #11
0
def entity_linker():
    """Build and return an entity_linker pipe backed by a minimal KB."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("entity_linker"))
    pipe = nlp.get_pipe("entity_linker")
    # A knowledge base must be set before training can attach a model.
    pipe.set_kb(KnowledgeBase(nlp.vocab, entity_vector_length=1))
    # Training attaches a model, which is needed for two reasons:
    # 1. no model leads to an error in serialization,
    # 2. the affected line is the one for model serialization.
    pipe.begin_training(pipeline=nlp.pipeline)
    return pipe
コード例 #12
0
def test_tagger_begin_training_tag_map():
    """Test that Tagger.begin_training() without gold tuples does not clobber
    the tag map."""
    nlp = Language()
    tagger = nlp.create_pipe("tagger")
    labels_before = len(tagger.labels)
    tagger.add_label("A", {"POS": "NOUN"})
    nlp.add_pipe(tagger)
    nlp.begin_training()
    # The custom tag keeps its morphology mapping and is the only addition.
    assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN}
    assert len(nlp.get_pipe("tagger").labels) == labels_before + 1
コード例 #13
0
def test_issue_3526_4(en_vocab):
    """Entity ruler patterns and overwrite flag must survive a disk round-trip."""
    nlp = Language(vocab=en_vocab)
    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        # Serializing must not disturb the live pipe ...
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        # ... and the reloaded pipeline restores both patterns and config.
        reloaded = load(tmpdir)
        new_ruler = reloaded.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
コード例 #14
0
ファイル: test_issue3526.py プロジェクト: monasaad/CAPEsFinal
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    """An EntityRuler added as an instance round-trips through to_disk/load."""
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        # The in-memory ruler is unchanged by serialization.
        live_ruler = nlp.get_pipe("entity_ruler")
        assert live_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert live_ruler.overwrite is True
        # Reloading restores both the patterns and the overwrite setting.
        reloaded = load(tmpdir)
        new_ruler = reloaded.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
コード例 #15
0
def test_pipe_function_component():
    """Function components registered via @Language.component are added by name."""
    name = "test_component"

    @Language.component(name)
    def component(doc: Doc) -> Doc:
        return doc

    assert name in registry.factories
    nlp = Language()
    # The function object itself cannot be added; the registered name must be used.
    with pytest.raises(ValueError):
        nlp.add_pipe(component)
    nlp.add_pipe(name)
    assert name in nlp.pipe_names
    assert nlp.pipe_factories[name] == name
    assert Language.get_factory_meta(name)
    assert nlp.get_pipe_meta(name)
    # Both lookup and fresh creation resolve to the very same function.
    assert nlp.get_pipe(name) == component
    assert nlp.create_pipe(name) == component
コード例 #16
0
def test_pipe_class_component_init():
    """Components must be added by factory name, not as pre-built instances."""
    name1 = "test_class_component1"
    name2 = "test_class_component2"

    @Language.factory(name1)
    class ClassComponent:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    class FactoryMadeComponent:
        def __init__(self, nlp: Language, name: str):
            self.nlp = nlp

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @Language.factory(name2)
    def factory(nlp: Language, name=name2):
        return FactoryMadeComponent(nlp, name)

    nlp = Language()
    cases = [(name1, ClassComponent), (name2, FactoryMadeComponent)]
    for name, component_cls in cases:
        assert name in registry.factories
        # Passing an instance instead of the factory name is rejected.
        with pytest.raises(ValueError):
            nlp.add_pipe(component_cls(nlp, name))
        nlp.add_pipe(name)
        assert name in nlp.pipe_names
        assert nlp.pipe_factories[name] == name
        assert Language.get_factory_meta(name)
        assert nlp.get_pipe_meta(name)
        # Both lookup and fresh creation yield a properly initialized instance.
        for pipe in (nlp.get_pipe(name), nlp.create_pipe(name)):
            assert isinstance(pipe, component_cls)
            assert isinstance(pipe.nlp, Language)
コード例 #17
0
ファイル: ml.py プロジェクト: vanatteveldt/claxon
def get_predictions(project: Project,
                    model: Language = None,
                    annotations=None):
    """Evaluate a label based on the project's model and gold annotations.

    Yields (doc_id, label, gold_accept, predicted_accept, score) tuples.
    """
    if model is None:
        model = get_model(project)
    textcat = model.get_pipe("textcat")
    label_names = {lbl.id: lbl.label for lbl in project.label_set.all()}

    if annotations is None:
        annotations = Annotation.objects.filter(document__gold=True,
                                                session__project=project)
    # Gold standard per document: {doc_id: {label: accepted?}}
    gold = {}
    for annotation in annotations:
        doc_gold = gold.setdefault(annotation.document_id, {})
        doc_gold[label_names[annotation.label_id]] = annotation.accept

    doc_ids = list(gold.keys())
    tokens = [get_tokens(model, doc_id) for doc_id in doc_ids]
    for doc_id, result in zip(doc_ids, textcat.pipe(tokens)):
        for label, accept in gold[doc_id].items():
            score = result.cats[label]
            # Predicted "accept" whenever the classifier score passes 0.5.
            yield doc_id, label, accept, score > .5, score
コード例 #18
0
ファイル: test_pipe_factories.py プロジェクト: mosynaq/spaCy
def test_pipe_class_component_model_custom():
    """A user config can point the model at a custom registered architecture."""
    name = "test_class_component_model_custom"
    arch = f"{name}.arch"
    default_config = {"value1": 1, "model": {"@architectures": arch, "nO": 0, "nI": 0}}

    @Language.factory(name, default_config=default_config)
    class CustomModelComponent:
        def __init__(
            self, nlp: Language, model: Model, name: str, value1: StrictInt = 10
        ):
            self.nlp = nlp
            self.model = model
            self.value1 = value1
            self.name = name

        def __call__(self, doc: Doc) -> Doc:
            return doc

    @registry.architectures(arch)
    def make_custom_arch(nO: StrictInt, nI: StrictInt):
        return Linear(nO, nI)

    # Valid override: both the scalar value and the model dims are replaced.
    nlp = Language()
    good_config = {"value1": 20, "model": {"@architectures": arch, "nO": 1, "nI": 2}}
    nlp.add_pipe(name, config=good_config)
    pipe = nlp.get_pipe(name)
    assert isinstance(pipe.nlp, Language)
    assert pipe.value1 == 20
    assert isinstance(pipe.model, Model)
    assert pipe.model.name == "linear"

    # Invalid overrides: wrong types anywhere in the config must be rejected.
    nlp = Language()
    with pytest.raises(ConfigValidationError):
        bad_value = {"value1": "20", "model": {"@architectures": arch, "nO": 1, "nI": 2}}
        nlp.add_pipe(name, config=bad_value)
    with pytest.raises(ConfigValidationError):
        bad_dims = {"value1": 20, "model": {"@architectures": arch, "nO": 1.0, "nI": 2.0}}
        nlp.add_pipe(name, config=bad_dims)
コード例 #19
0
ファイル: ml.py プロジェクト: vanatteveldt/claxon
def get_todo(session: Session, model: Language, n=10) -> OrderedDict:
    """Populate the queue of documents to code.

    Returns an OrderedDict of {doc_id: score} for the `n` documents whose
    textcat score is closest to the decision boundary (most uncertain).
    """
    done = {
        a.document_id
        for a in Annotation.objects.filter(document__gold=False,
                                           label=session.label)
    }
    candidates = Document.objects.filter(gold=False).exclude(pk__in=done)
    if session.query:
        candidates = candidates.filter(text__icontains=session.query)
    todo = list(candidates.values_list("id", flat=True))
    logging.debug(
        "{ntodo} documents in todo (query: {q}, done={ndone})".format(
            ntodo=len(todo), ndone=len(done), q=session.query))
    # Keep scoring cheap by working on a bounded random sample.
    if len(todo) > settings.N_SAMPLE:
        todo = sample(todo, settings.N_SAMPLE)

    textcat = model.get_pipe("textcat")
    doc_tokens = [get_tokens(model, doc_id) for doc_id in todo]
    scores = [d.cats[session.label.label] for d in textcat.pipe(doc_tokens)]
    # Uncertainty is the distance from 0.5; smallest means most uncertain.
    uncertainty = [abs(score - 0.5) for score in scores]
    ranked = list(argsort(uncertainty))[:n]

    return OrderedDict((todo[i], scores[i]) for i in ranked)
コード例 #20
0
ファイル: __init__.py プロジェクト: megagonlabs/ginza
def set_split_mode(nlp: Language, mode: str):
    """Set the split mode on the compound_splitter pipe, if one is present."""
    if not nlp.has_pipe("compound_splitter"):
        return
    nlp.get_pipe("compound_splitter").split_mode = mode
コード例 #21
0
def set_split_mode(nlp: Language, mode: str):
    """Set the split mode on the pipeline's CompoundSplitter pipe.

    NOTE(review): unlike the guarded variant elsewhere, this looks the pipe
    up unconditionally — presumably the pipe is expected to exist here.
    """
    splitter = nlp.get_pipe("CompoundSplitter")
    splitter.split_mode = mode
コード例 #22
0
def get_predictions(nlp: Language, docs: List[dict]):
    """Run beam-search NER over `docs` and collect scored entity predictions.

    Each input doc is a dict with at least a 'text' key. Returns a list of
    dicts with the original document, character-offset `labels`, an `unsure`
    score (higher = model less confident overall), and all raw `predicts`.
    """
    from collections import Counter
    ner = nlp.get_pipe('ner')
    parses = list(nlp.pipe([t['text'] for t in docs]))
    # Beam-parse one doc at a time so a progress bar can be shown.
    beams = [
        ner.beam_parse([x], beam_width=16)[0]
        for x in tqdm(parses, desc="Predicting labels...")
    ]

    results = []
    # print(type(docs), type(parses), type(beams))
    # print(len(docs), len(parses), len(beams))
    items = zip(docs, parses, beams)
    for document, parse, beam in items:
        text = document['text']
        # if parse.ents:
        #     print("Entities:", text, parse.ents)
        # else:
        #     print("No entities found:", text, parse.ents)
        # Beam annotations map (start_token, end_token, type_id) -> score.
        entities = ner.moves.get_beam_annot(beam)
        words = Counter()
        start_end = {}
        # Sort by (score, span) so that for identical token spans the entry
        # kept below is the lowest-scoring one seen first.
        # NOTE(review): if the intent was to keep the HIGHEST-scoring entry
        # per span, this sort should be descending — confirm.
        for (estart, eend, etype), v in sorted(entities.items(),
                                               key=lambda x: (x[1], x[0])):
            etype_str = parse.vocab.strings[etype]
            if (estart, eend) in start_end:
                print("Removing completely overlapping entry:",
                      (estart, eend, etype_str))
                continue
            words[estart, eend, etype_str] = v
            start_end[estart, eend] = True

        # Best-scoring spans first; ties broken by span position.
        words_items = sorted(words.items(), key=lambda x: (-x[1], x[0]))
        labels = []
        predicts = []
        # Small epsilon avoids a zero `unsure` for an empty prediction set.
        unsure = 0.001
        # print(repr(text))
        max_per_type = Counter()
        for (estart, eend, etype), escore in words_items:
            # Convert token offsets to character offsets into `text`.
            cstart = parse[estart].idx
            if eend == len(parse):
                cend = len(text)
            else:
                # `eend` is exclusive, so the char end is the NEXT token's
                # start — this may include trailing whitespace; the commented
                # alternative below would end at the last token instead.
                cend = parse[eend].idx
                # cend = parse[eend-1].idx + len(parse[eend].text)
            # print(cstart, cend, estart, eend, f"'{parse[estart:eend]}', '{text[cstart:cend]}'", escore)
            # assert parse[estart:eend].text.strip() == text[cstart:cend].strip()
            # Accumulate uncertainty: maximal when the score is near 0.5.
            unsure += 0.5 - abs(escore - 0.5)
            if escore > 0.01:  # 0.4 <= escore:
                max_per_type[etype] += 1
                # NOTE(review): the counter is incremented before the check,
                # so at most 99 labels per type are kept — off-by-one if the
                # intended cap was 100.
                if max_per_type[etype] < 100:
                    labels.append((cstart, cend, etype))
                predicts.append(
                    (cstart, cend, parse[estart:eend].text, etype, escore))

        results.append({
            'document': document,
            'labels': labels,
            'unsure': unsure / len(text),
            'predicts': predicts,
        })

    return results
コード例 #23
0
def test_label_types():
    """add_label accepts strings only; other types raise ValueError."""
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    textcat = nlp.get_pipe("textcat")
    textcat.add_label("answer")
    with pytest.raises(ValueError):
        textcat.add_label(9)