Example #1
def setup_vocab(src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    for word in clusters:
        if word not in probs:
            probs[word] = -17.0
    lexicon = []
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
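The little-endian trick in the comment above can be checked with a short, self-contained sketch (the cluster bit string below is made up for illustration): reversing the Brown cluster string before int(..., 2) means that masking with 15 recovers the first four bits of the original string.

def first_four_cluster_bits(cluster):
    # Reverse the bit string so its first character becomes the lowest bit.
    encoded = int(cluster[::-1], 2)
    # 15 == 0b1111, so the mask extracts the first four bits of `cluster`.
    return format(encoded & 15, "04b")[::-1]

assert first_four_cluster_bits("1011010") == "1011"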
Example #2
def setup_vocab(src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / "vectors.tgz"
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / "vec.bin"))
    else:
        print("Warning: Word vectors file not found")
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / "clusters.txt")
    probs = _read_probs(src_dir / "words.sgt.prob")
    if not probs:
        min_prob = 0.0
    else:
        min_prob = min(probs.values())
    for word in clusters:
        if word not in probs:
            probs[word] = min_prob

    lexicon = []
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry["prob"] = float(prob)
            cluster = clusters.get(word, "0")
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry["cluster"] = int(cluster[::-1], 2)
            orth_senses = set()
            lemmas = []
            vocab[word] = entry
    vocab.dump(str(dst_dir / "lexemes.bin"))
    vocab.strings.dump(str(dst_dir / "strings.txt"))
Example #3
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    vocab2 = vocab2.from_bytes(vocab1.to_bytes())
    assert vocab2[strings[0]].norm_ == lex_attr
Example #4
def test_vocab_add_vector():
    vocab = Vocab()
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    cat = vocab["cat"]
    assert list(cat.vector) == [1.0, 1.0, 1.0]
    dog = vocab["dog"]
    assert list(dog.vector) == [2.0, 2.0, 2.0]
Example #5
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    with make_tempdir() as d:
        file_path = d / "vocab"
        vocab1.to_disk(file_path)
        vocab2 = vocab2.from_disk(file_path)
    assert vocab2[strings[0]].norm_ == lex_attr
Example #6
def test_pickle_vocab(text1, text2):
    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    vocab.set_vector("dog", numpy.ones((5,), dtype="f"))
    lex1 = vocab[text1]
    lex2 = vocab[text2]
    assert lex1.norm_ == text1[:-1]
    assert lex2.norm_ == text2[:-1]
    data = srsly.pickle_dumps(vocab)
    unpickled = srsly.pickle_loads(data)
    assert unpickled[text1].orth == lex1.orth
    assert unpickled[text2].orth == lex2.orth
    assert unpickled[text1].norm == lex1.norm
    assert unpickled[text2].norm == lex2.norm
    assert unpickled[text1].norm != unpickled[text2].norm
    assert unpickled.vectors is not None
    assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
Example #7
def load_vocab(path):
    path = Path(path)
    if not path.exists():
        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
    if not path.is_dir():
        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
    return Vocab.load(path)
Example #8
def setup_vocab(lex_attr_getters, tag_map, src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / 'vectors.bz2'
    if vectors_src.exists():
        write_binary_vectors(vectors_src.as_posix(), (dst_dir / 'vec.bin').as_posix())
    else:
        print("Warning: Word vectors file not found")
    vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=tag_map)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
    if not probs:
        oov_prob = -20
    else:
        oov_prob = min(probs.values())
    for word in clusters:
        if word not in probs:
            probs[word] = oov_prob

    lexicon = []
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        # First encode the strings into the StringStore. This way, we can map
        # the orth IDs to frequency ranks
        orth = vocab.strings[word]
    # Now actually load the vocab
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        if word in clusters:
            lexeme.cluster = int(clusters[word][::-1], 2)
        else:
            lexeme.cluster = 0
    vocab.dump((dst_dir / 'lexemes.bin').as_posix())
    with (dst_dir / 'strings.json').open('w') as file_:
        vocab.strings.dump(file_)
    with (dst_dir / 'oov_prob').open('w') as file_:
        file_.write('%f' % oov_prob)
Example #9
    def test_load_careful(self):
        config_data = {"labels": {"0": {"": True}, "1": {"": True}, "2": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "dobj": True, "neg": True, "csubjpass": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "relcl": True, "quantmod": True, "acomp": True, "compound": True, "pcomp": True, "intj": True, "poss": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "amod": True, "dative": True, "pobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True, "acl": True}, "3": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "acl": True, "poss": True, "neg": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "amod": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "quantmod": True, "acomp": True, "pcomp": True, "intj": True, "relcl": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "dobj": True, "dative": True, "pobj": True, "iobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True}, "4": {"ROOT": True}}, "seed": 0, "features": "basic", "beam_width": 1}

        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))

        moves = ArcEager(vocab.strings, config_data['labels'])
        templates = get_templates(config_data['features'])

        model = Model(moves.n_moves, templates, path.join(data_dir, 'deps'))

        parser = Parser(vocab.strings, moves, model)
Example #10
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    vocab1_b = vocab1.to_bytes()
    vocab2_b = vocab2.to_bytes()
    if strings1 == strings2:
        assert vocab1_b == vocab2_b
    else:
        assert vocab1_b != vocab2_b
    vocab1 = vocab1.from_bytes(vocab1_b)
    assert vocab1.to_bytes() == vocab1_b
    new_vocab1 = Vocab().from_bytes(vocab1_b)
    assert new_vocab1.to_bytes() == vocab1_b
    assert len(new_vocab1) == len(strings1)
    assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
Example #11
def setup_vocab(src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    senses = _read_senses(src_dir / 'supersenses.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0
    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    lexicon = []
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
            orth_senses = set()
            lemmas = []
            for pos in [NOUN, VERB, ADJ]:
                for lemma in lemmatizer(word.lower(), pos):
                    lemmas.append(lemma)
                    orth_senses.update(senses[lemma][pos])
            if word.lower() == 'dogging':
                print(word)
                print(lemmas)
                print([spacy.senses.STRINGS[si] for si in orth_senses])
            entry['senses'] = list(sorted(orth_senses))
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
Example #12
def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()

    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    else:
        print("Warning: Word vectors file not found")
    vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
    if not probs:
        oov_prob = -20
    else:
        oov_prob = min(probs.values())
    for word in clusters:
        if word not in probs:
            probs[word] = oov_prob

    lexicon = []
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        if word in clusters:
            lexeme.cluster = int(clusters[word][::-1], 2)
        else:
            lexeme.cluster = 0
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
    with (dst_dir / 'oov_prob').open('w') as file_:
        file_.write('%f' % oov_prob)
Example #13
def test_doc_token_api_vectors():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("apples", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    vocab.set_vector("oranges", vector=numpy.asarray([0.0, 1.0], dtype="f"))
    doc = Doc(vocab, words=["apples", "oranges", "oov"])
    assert doc.has_vector
    assert doc[0].has_vector
    assert doc[1].has_vector
    assert not doc[2].has_vector
    apples_norm = (0 * 0 + 2 * 2) ** 0.5
    oranges_norm = (0 * 0 + 1 * 1) ** 0.5
    cosine = ((0 * 0) + (2 * 1)) / (apples_norm * oranges_norm)
    assert doc[0].similarity(doc[1]) == cosine
Example #14
def test_Example_from_dict_with_empty_entities():
    annots = {
        "words": ["I", "like", "New", "York", "and", "Berlin", "."],
        "entities": [],
    }
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    # entities as empty list sets everything to O
    assert example.reference.has_annotation("ENT_IOB")
    assert len(list(example.reference.ents)) == 0
    assert all(token.ent_iob_ == "O" for token in example.reference)
    # various unset/missing entities leaves entities unset
    annots["entities"] = None
    example = Example.from_dict(predicted, annots)
    assert not example.reference.has_annotation("ENT_IOB")
    annots.pop("entities", None)
    example = Example.from_dict(predicted, annots)
    assert not example.reference.has_annotation("ENT_IOB")
Example #15
def test_issue1799():
    """Test sentence boundaries are deserialized correctly, even for
    non-projective sentences."""
    heads_deps = numpy.asarray(
        [
            [1, 397],
            [4, 436],
            [2, 426],
            [1, 402],
            [0, 8206900633647566924],
            [18446744073709551615, 440],
            [18446744073709551614, 442],
        ],
        dtype="uint64",
    )
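    # Columns are [HEAD, DEP]: HEAD is a relative offset to the head token,
    # stored as uint64, so 18446744073709551615 (2**64 - 1) encodes -1 and
    # ...614 encodes -2; the root has offset 0. The DEP column holds label
    # IDs/hashes, which is why "ROOT" is added to the StringStore below.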
    doc = Doc(Vocab(), words="Just what I was looking for .".split())
    doc.vocab.strings.add("ROOT")
    doc = doc.from_array([HEAD, DEP], heads_deps)
    assert len(list(doc.sents)) == 1
Example #16
def case_spacy(spacy_load_mock):
    import re
    data = re.sub(
        " +", "\t",
        """
# sent_id = testtext.1
1   Dies   Dies    PRON    PDS     _   2   sb      _   _
2   ist    sein    AUX     VAFIN   _   0   ROOT    _   _
3   ein    einen   DET     ART     _   4   nk      _   _
4   Test   Test    NOUN    NN      _   2   pd      _   _
5   .      .       PUNCT   $.      _   2   punct   _   _

""")

    result = conllu.parse(data)

    words = ["Dies", "ist", "ein", "Test", "."]
    vocab = Vocab(strings=words)

    heads = [1, 1, 3, 1, 1]
    tags = ["PDS", "VAFIN", "ART", "NN", "$."]
    pos = ["PRON", "AUX", "DET", "NOUN", "PUNCT"]
    lemmas = ["Dies", "sein", "einen", "Test", "."]
    deps = ["sb", "ROOT", "nk", "pd", "punct"]

    doc = Doc(vocab, words, pos=pos, tags=tags, lemmas=lemmas, deps=deps, heads=heads)

    spacy_annotator = spacy_load_mock.return_value
    spacy_annotator.return_value = doc

    annotator = wikiannotator.Annotator.createAnnotator('spacy', {'model_name': 'model_name'})
    spacy_load_mock.assert_called_once_with('model_name')

    return (
        annotator, wikiannotator.SpacyAnnotator,
        {
            'text': 'Dies ist ein Test.',
            'textname': 'testtext',
            'parse': result
        }
    )
Example #17
def test_issue7056():
    """Test that the Unshift transition works properly, and doesn't cause
    sentence segmentation errors."""
    vocab = Vocab()
    ae = ArcEager(
        vocab.strings,
        ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]))
    doc = Doc(vocab, words="Severe pain , after trauma".split())
    state = ae.init_batch([doc])[0]
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "L-amod")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "R-pobj")
    ae.apply_transition(state, "D")
    ae.apply_transition(state, "D")
    ae.apply_transition(state, "D")
    assert not state.eol()
Example #18
def test_issue850_basic():
    """Test Matcher matches with '*' operator and Boolean flag"""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    pattern = [{
        "LOWER": "bob"
    }, {
        "OP": "*",
        "LOWER": "and"
    }, {
        "LOWER": "frank"
    }]
    matcher.add("FarAway", None, pattern)
    doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
    match = matcher(doc)
    assert len(match) == 1
    ent_id, start, end = match[0]
    assert start == 0
    assert end == 4
Example #19
def init_vocab():
    return Vocab(
        lex_attr_getters={
            LOWER: lambda string: string.lower(),
            SHAPE: orth_funcs.word_shape,
            PREFIX: lambda string: string[0],
            SUFFIX: lambda string: string[-3:],
            CLUSTER: lambda string: 0,
            IS_ALPHA: orth_funcs.is_alpha,
            IS_ASCII: orth_funcs.is_ascii,
            IS_DIGIT: lambda string: string.isdigit(),
            IS_LOWER: orth_funcs.is_lower,
            IS_PUNCT: orth_funcs.is_punct,
            IS_SPACE: lambda string: string.isspace(),
            IS_TITLE: orth_funcs.is_title,
            IS_UPPER: orth_funcs.is_upper,
            IS_STOP: lambda string: False,
            IS_OOV: lambda string: True
        })
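A brief usage sketch for the vocab built above (assuming the orth_funcs helpers behave like spaCy's built-in lexeme attribute functions): the getters are applied lazily the first time a string is turned into a lexeme.

vocab = init_vocab()
lex = vocab["Apple"]
assert lex.lower_ == "apple"   # LOWER getter
assert lex.prefix_ == "A"      # PREFIX getter: first character
assert lex.suffix_ == "ple"    # SUFFIX getter: last three characters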
Example #20
    def initialize(self, get_examples: Callable[[], Iterable[Example]], *,
                   nlp: Optional[Language]):
        """Initialize the pipe for training, using data examples if available.

        get_examples (Callable[[], Iterable[Example]]): Optional function that
            returns gold-standard Example objects.
        nlp (Language): The current nlp object.

        DOCS: https://nightly.spacy.io/api/transformer#initialize
        """
        validate_get_examples(get_examples, "Transformer.initialize")
        docs = [Doc(Vocab(), words=["hello"])]
        self.model.initialize(X=docs)
        if nlp is not None:
            for i, (name1, proc1) in enumerate(nlp.pipeline):
                if proc1 is self:
                    for name2, proc2 in nlp.pipeline[i:]:
                        self.find_listeners(proc2)
                    break
Example #21
def test_vocab_prune_vectors():
    vocab = Vocab(vectors_name="test_vocab_prune_vectors")
    _ = vocab["cat"]  # noqa: F841
    _ = vocab["dog"]  # noqa: F841
    _ = vocab["kitten"]  # noqa: F841
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = [1.0, 1.2, 1.1]
    data[1] = [0.3, 1.3, 1.0]
    data[2] = [0.9, 1.22, 1.05]
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    vocab.set_vector("kitten", data[2])

    remap = vocab.prune_vectors(2, batch_size=2)
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
Example #22
def test_vocab_prune_vectors():
    vocab = Vocab()
    _ = vocab["cat"]  # noqa: F841
    _ = vocab["dog"]  # noqa: F841
    _ = vocab["kitten"]  # noqa: F841
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    data[2] = 1.1
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    vocab.set_vector("kitten", data[2])

    remap = vocab.prune_vectors(2)
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
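Both prune_vectors tests above compare the reported similarity against a cosine helper. A minimal sketch of such a helper, assuming it computes plain cosine similarity over numpy arrays:

import numpy

def cosine(vec1, vec2):
    # Cosine similarity: dot product divided by the product of the norms.
    return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))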
Example #23
def test_Example_missing_heads():
    vocab = Vocab()
    words = ["I", "like", "London", "and", "Berlin", "."]
    deps = ["nsubj", "ROOT", "dobj", None, "conj", "punct"]
    heads = [1, 1, 1, None, 2, 1]
    annots = {"words": words, "heads": heads, "deps": deps}
    predicted = Doc(vocab, words=words)

    example = Example.from_dict(predicted, annots)
    parsed_heads = [t.head.i for t in example.reference]
    assert parsed_heads[0] == heads[0]
    assert parsed_heads[1] == heads[1]
    assert parsed_heads[2] == heads[2]
    assert parsed_heads[4] == heads[4]
    assert parsed_heads[5] == heads[5]
    expected = [True, True, True, False, True, True]
    assert [t.has_head() for t in example.reference] == expected
    # Ensure that the missing head doesn't create an artificial new sentence start
    expected = [True, False, False, False, False, False]
    assert example.get_aligned_sent_starts() == expected
Example #24
def main(output_dir):
    ensure_dir(output_dir)
    ensure_dir(output_dir, "pos")
    ensure_dir(output_dir, "vocab")

    vocab = Vocab(tag_map=TAG_MAP)
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger.blank(vocab, Tagger.default_templates())

    for i in range(5):
        for words, tags in DATA:
            tokens = tokenizer.tokens_from_list(words)
            tagger.train(tokens, tags)
        random.shuffle(DATA)
    tagger.model.end_training()
    tagger.model.dump(path.join(output_dir, 'pos', 'model'))
    with io.open(path.join(output_dir, 'vocab', 'strings.json'), 'w', encoding='utf8') as file_:
        tagger.vocab.strings.dump(file_)
Example #25
    def __call__(self, current_file: ProtocolFile) -> Doc:

        with open(self.path) as file:
            current_transcription = file.read().split('\n')

        tokens, attributes = [], []
        for line in current_transcription:
            if line == '':
                continue
            _, speaker, start, end, text, confidence = line.split()
            start, end, confidence = map(float, (start, end, confidence))
            tokens.append(text)
            attributes.append((speaker, start, end, confidence))

        current_transcription = Doc(Vocab(), tokens)
        for token, (speaker, time_start, time_end,
                    alignment_confidence) in zip(current_transcription,
                                                 attributes):
            token._.speaker, token._.time_start, token._.time_end, token._.alignment_confidence = speaker, time_start, time_end, alignment_confidence

        return current_transcription
Example #26
def test_empty_doc():
    width = 128
    embed_size = 2000
    vocab = Vocab()
    doc = Doc(vocab, words=[])
    tok2vec = build_Tok2Vec_model(
        MultiHashEmbed(
            width=width,
            rows=[embed_size, embed_size, embed_size, embed_size],
            include_static_vectors=False,
            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
        ),
        MaxoutWindowEncoder(width=width,
                            depth=4,
                            window_size=1,
                            maxout_pieces=3),
    )
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update([doc])
    assert len(vectors) == 1
    assert vectors[0].shape == (0, width)
Example #27
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    with make_tempdir() as d:
        file_path1 = d / "vocab1"
        file_path2 = d / "vocab2"
        vocab1.to_disk(file_path1)
        vocab2.to_disk(file_path2)
        vocab1_d = Vocab().from_disk(file_path1)
        vocab2_d = Vocab().from_disk(file_path2)
        assert list(vocab1_d) == list(vocab1)
        assert list(vocab2_d) == list(vocab2)
        if strings1 == strings2:
            assert list(vocab1_d) == list(vocab2_d)
        else:
            assert list(vocab1_d) != list(vocab2_d)
Example #28
def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab

    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)

        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)

        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = spacy.load(nlp_dir)
        assert nlp3.lang == "en"
Example #29
def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias="douglas",
                   entities=["Q2", "Q3"],
                   probabilities=[0.4, 0.1])
    adam_hash = mykb.add_alias(alias="adam",
                               entities=["Q2"],
                               probabilities=[0.9])

    candidates = mykb.get_alias_candidates("adam")
    assert len(candidates) == 1
    assert candidates[0].entity == q2_hash
    assert candidates[0].entity_ == "Q2"
    assert candidates[0].alias == adam_hash
    assert candidates[0].alias_ == "adam"

    with make_tempdir() as d:
        mykb.to_disk(d / "kb")
        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(d / "kb")

        candidates = kb_new_vocab.get_alias_candidates("adam")
        assert len(candidates) == 1
        assert candidates[0].entity == q2_hash
        assert candidates[0].entity_ == "Q2"
        assert candidates[0].alias == adam_hash
        assert candidates[0].alias_ == "adam"

        assert kb_new_vocab.get_vector("Q2") == [2]
        assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
Example #30
    def __init__(self):
        self.nlp = spacy.load("en_core_sci_lg", disable=["tagger"])
        self.nlp.max_length = 2000000

        # We also need to detect language, or else we'll be parsing non-english text
        # as if it were English.
        self.nlp.add_pipe(LanguageDetector(),
                          name='language_detector',
                          last=True)

        # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
        abbreviation_pipe = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(abbreviation_pipe)

        # Our linker will look up named entities/concepts in the UMLS graph and normalize
        # the data for us.
        self.linker = UmlsEntityLinker(resolve_abbreviations=True)
        self.nlp.add_pipe(self.linker)

        new_vector = self.nlp(
            """Positive-sense single‐stranded ribonucleic acid virus, subgenus 
                   sarbecovirus of the genus Betacoronavirus. 
                   Also known as severe acute respiratory syndrome coronavirus 2, 
                   also known by 2019 novel coronavirus. It is 
                   contagious in humans and is the cause of the ongoing pandemic of 
                   coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious 
                   disease.""").vector

        vector_data = {
            "COVID-19": new_vector,
            "2019-nCoV": new_vector,
            "SARS-CoV-2": new_vector
        }

        vocab = Vocab()
        for word, vector in vector_data.items():
            self.nlp.vocab.set_vector(word, vector)
        return
Example #31
def test_graph_walk():
    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(
        doc,
        name="hello",
        nodes=[(0, ), (1, ), (2, ), (3, )],
        edges=[(0, 1), (0, 2), (0, 3), (3, 0)],
        labels=None,
        weights=None,
    )
    node0, node1, node2, node3 = list(graph.nodes)
    assert [tuple(h) for h in node0.heads()] == [(3, )]
    assert [tuple(h) for h in node1.heads()] == [(0, )]
    assert [tuple(h) for h in node0.walk_heads()] == [(3, ), (0, )]
    assert [tuple(h) for h in node1.walk_heads()] == [(0, ), (3, ), (0, )]
    assert [tuple(h) for h in node2.walk_heads()] == [(0, ), (3, ), (0, )]
    assert [tuple(h) for h in node3.walk_heads()] == [(0, ), (3, )]
    assert [tuple(t) for t in node0.walk_tails()] == [(1, ), (2, ), (3, ),
                                                      (0, )]
    assert [tuple(t) for t in node1.walk_tails()] == []
    assert [tuple(t) for t in node2.walk_tails()] == []
    assert [tuple(t) for t in node3.walk_tails()] == [(0, ), (1, ), (2, ),
                                                      (3, )]
Example #32
def test_extract_tokens(task_head, allennlp_tokens):
    tokenizer = Tokenizer(Vocab())
    input_tokens = list(tokenizer("test this sentence."))
    if allennlp_tokens:
        input_tokens = [spacy_to_allennlp_token(tok) for tok in input_tokens]

    tf = TextField(input_tokens, None)
    instance = Instance({"test": tf})

    tokens = task_head._extract_tokens(instance)

    assert all([isinstance(tok, Token) for tok in tokens])
    assert all(itok.text == otok.text
               for itok, otok in zip(input_tokens, tokens))
    assert all(itok.idx == otok.start
               for itok, otok in zip(input_tokens, tokens))
    if allennlp_tokens:
        assert all(itok.idx_end == otok.end
                   for itok, otok in zip(input_tokens, tokens))
    else:
        assert all(itok.idx + len(itok.text) == otok.end
                   for itok, otok in zip(input_tokens, tokens))
    assert all([tok.field == "test" for tok in tokens])
Example #33
def read_spacy_docs(filepath, vocab_filepath):
    """ Reads serialized spacy docs from a file into memory.
    
    Parameters
    ----------
    filepath: str
        File path to serialized spacy docs
    
    Returns
    -------
    list of spacy.tokens.doc.Doc
        List of spacy Docs loaded from file
    """
    from spacy.vocab import Vocab
    from spacy.tokens import DocBin
    with open(vocab_filepath, 'rb') as f:
        vocab = Vocab().from_bytes(f.read())

    with open(filepath, 'rb') as f:
        data = f.read()

    doc_bin = DocBin().from_bytes(data)
    docs = list(doc_bin.get_docs(vocab))
    return docs
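A hypothetical write-side counterpart to read_spacy_docs (the function name and signature are illustrative; DocBin.to_bytes and Vocab.to_bytes are standard spaCy APIs):

def write_spacy_docs(docs, vocab, filepath, vocab_filepath):
    from spacy.tokens import DocBin
    # Serialize the docs into a DocBin bytestream...
    with open(filepath, 'wb') as f:
        f.write(DocBin(docs=docs).to_bytes())
    # ...and the vocab alongside them, so read_spacy_docs can restore both.
    with open(vocab_filepath, 'wb') as f:
        f.write(vocab.to_bytes())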
Example #34
    def __call__(self, current_file: ProtocolFile) -> Doc:

        with open(self.path) as file:
            current_transcription = file.read().split('\n')

        tokens, speakers = [], []
        for line in current_transcription:
            # line should not be empty
            if line == '':
                continue
            line = line.split()
            # there should be at least one speaker and one token per line
            if len(line) < 2:
                continue
            speaker = line[0]
            for token in line[1:]:
                speakers.append(speaker)
                tokens.append(token)

        current_transcription = Doc(Vocab(), tokens)
        for token, speaker in zip(current_transcription, speakers):
            token._.speaker = speaker

        return current_transcription
Example #35
    def _make_task_prediction(
        self,
        single_forward_output: Dict,
        instance: Instance,
    ) -> TokenClassificationPrediction:
        # The dims are: top_k, tags
        tags: List[List[str]] = self._make_tags(
            single_forward_output["viterbi_paths"])
        # construct a spacy Doc
        pre_tokenized = not isinstance(single_forward_output["raw_text"], str)
        if pre_tokenized:
            # compose doc from tokens
            doc = Doc(Vocab(), words=single_forward_output["raw_text"])
        else:
            doc = self.backbone.tokenizer.nlp(
                single_forward_output["raw_text"])

        return TokenClassificationPrediction(
            tags=tags,
            scores=[
                score for tags, score in single_forward_output["viterbi_paths"]
            ],
            entities=self._make_entities(doc, tags, pre_tokenized),
        )
Example #36
def test_oracle_bad_tokenization(vocab, arc_eager):
    words_deps_heads = """
        [catalase] dep is
        : punct is
        that nsubj is
        is root is
        bad comp is
    """

    gold_words = []
    gold_deps = []
    gold_heads = []
    for line in words_deps_heads.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        word, dep, head = line.split()
        gold_words.append(word)
        gold_deps.append(dep)
        gold_heads.append(head)
    gold_heads = [gold_words.index(head) for head in gold_heads]
    for dep in gold_deps:
        arc_eager.add_action(2, dep)  # Left
        arc_eager.add_action(3, dep)  # Right
    reference = Doc(Vocab(),
                    words=gold_words,
                    deps=gold_deps,
                    heads=gold_heads)
    predicted = Doc(reference.vocab,
                    words=["[", "catalase", "]", ":", "that", "is", "bad"])
    example = Example(predicted=predicted, reference=reference)
    ae_oracle_actions = arc_eager.get_oracle_sequence(example, _debug=False)
    ae_oracle_actions = [
        arc_eager.get_class_name(i) for i in ae_oracle_actions
    ]
    assert ae_oracle_actions
Example #37
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('input_path')
    ap.add_argument('output_path')
    ap.add_argument('--append', action='store_true')
    args = ap.parse_args()

    if args.append:
        logging.info('Loading existing model...')
        model = Vocab().from_disk(args.output_path)
    else:
        model = Vocab()

    logging.info('Loading vectors into spacy...')
    load_vectors_into_model(args.input_path, model)

    logging.info('Writing model to disk...')
    model.to_disk(args.output_path)

    logging.info('Done!')
Example #38
import jsonlines
import spacy
import spacy.language
from spacy.tokens import Doc
from spacy.scorer import Scorer
from spacy.vocab import Vocab

# global variables
vocab = Vocab()
scorer = Scorer()

# path to jsonl overlap files
path_coco = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_coco.jsonl"
path_graf = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_leo.jsonl"
path_hoff = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jona.jsonl"
path_jthn = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jonathan.jsonl"


# functions
def jsonl_to_list(path):
    """takes path to jsonl file and returns list of dicts"""

    with jsonlines.open(path) as reader:
        list_of_dicts = list(reader)
        print("----")
        print(list_of_dicts)

    return list_of_dicts


def create_ent_set(span_list):
Example #39
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    # Reported in #2153
    vocab = Vocab(strings=strings)
    length = len(vocab)
    vocab.from_bytes(vocab.to_bytes())
    assert len(vocab) == length
Example #40
def test_doc_api_has_vector():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    doc = Doc(vocab, words=["kitten"])
    assert doc.has_vector
Example #41
def get_random_doc(n_words):
    vocab = Vocab()
    # Make the words numbers, so that they're easy to track.
    numbers = [str(i) for i in range(0, n_words)]
    return Doc(vocab, words=numbers)
Example #42
def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
    nlp = English()
    nlp_plain = English()
    # load both vec and hashvec tables
    with make_tempdir() as tmpdir:
        p = tmpdir / "test.hashvec"
        with open(p, "w") as fileh:
            fileh.write(floret_vectors_hashvec_str)
        convert_vectors(nlp, p, truncate=0, prune=-1, mode="floret")
        p = tmpdir / "test.vec"
        with open(p, "w") as fileh:
            fileh.write(floret_vectors_vec_str)
        convert_vectors(nlp_plain, p, truncate=0, prune=-1)

    word = "der"
    # ngrams: full padded word + padded 2-grams + padded 3-grams
    ngrams = nlp.vocab.vectors._get_ngrams(word)
    assert ngrams == ["<der>", "<d", "de", "er", "r>", "<de", "der", "er>"]
    # rows: 2 rows per ngram
    rows = OPS.xp.asarray(
        [
            h % nlp.vocab.vectors.shape[0] for ngram in ngrams
            for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
        ],
        dtype="uint32",
    )
    assert_equal(
        OPS.to_numpy(rows),
        numpy.asarray([5, 6, 7, 5, 8, 2, 8, 9, 3, 3, 4, 6, 7, 3, 0, 2]),
    )
    assert len(rows) == len(ngrams) * nlp.vocab.vectors.hash_count
    # all vectors are equivalent for plain static table vs. hash ngrams
    for word in nlp_plain.vocab.vectors:
        word = nlp_plain.vocab.strings.as_string(word)
        assert_almost_equal(nlp.vocab[word].vector,
                            nlp_plain.vocab[word].vector,
                            decimal=3)

        # every word has a vector
        assert nlp.vocab[word * 5].has_vector

    # n_keys is -1 for floret
    assert nlp_plain.vocab.vectors.n_keys > 0
    assert nlp.vocab.vectors.n_keys == -1

    # check that single and batched vector lookups are identical
    words = [s for s in nlp_plain.vocab.vectors]
    single_vecs = OPS.to_numpy(
        OPS.asarray([nlp.vocab[word].vector for word in words]))
    batch_vecs = OPS.to_numpy(nlp.vocab.vectors.get_batch(words))
    assert_equal(single_vecs, batch_vecs)

    # an empty key returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab[""].vector),
        numpy.zeros((nlp.vocab.vectors.shape[0], )),
    )
    # an empty batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
        numpy.zeros((1, nlp.vocab.vectors.shape[0])),
    )
    # an empty key within a batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
        numpy.zeros((nlp.vocab.vectors.shape[0], )),
    )

    # the loaded ngram vector table cannot be modified
    # except for clear: warning, then return without modifications
    vector = list(range(nlp.vocab.vectors.shape[1]))
    orig_bytes = nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.set_vector("the", vector)
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab[word].vector = vector
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.vectors.add("the", row=6)
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.vectors.resize(shape=(100, 10))
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.raises(ValueError):
        nlp.vocab.vectors.clear()

    # data and settings are serialized correctly
    with make_tempdir() as d:
        nlp.vocab.to_disk(d)
        vocab_r = Vocab()
        vocab_r.from_disk(d)
        assert nlp.vocab.vectors.to_bytes() == vocab_r.vectors.to_bytes()
        assert_equal(OPS.to_numpy(nlp.vocab.vectors.data),
                     OPS.to_numpy(vocab_r.vectors.data))
        assert_equal(nlp.vocab.vectors._get_cfg(), vocab_r.vectors._get_cfg())
        assert_almost_equal(
            OPS.to_numpy(nlp.vocab[word].vector),
            OPS.to_numpy(vocab_r[word].vector),
            decimal=6,
        )
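The padded character n-grams asserted near the top of this test can be reproduced with a small standalone sketch of the n-gram scheme (an illustration for minn=2, maxn=3, not spaCy's internal implementation):

def char_ngrams(word, minn=2, maxn=3):
    padded = f"<{word}>"
    ngrams = [padded]  # the full padded word comes first
    for n in range(minn, maxn + 1):
        ngrams.extend(padded[i:i + n] for i in range(len(padded) - n + 1))
    return ngrams

assert char_ngrams("der") == ["<der>", "<d", "de", "er", "r>", "<de", "der", "er>"]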
Example #43
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
Example #44
def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    vocab = Vocab()
    assert "hello" not in vocab
    vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
    assert "hello" in vocab
Example #45
    def test_load(self):
        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager)
Example #46
def main(json_loc: Path,
         train_file: Path,
         dev_file: Path,
         test_file: Path,
         test_split=0.189,
         train_split=0.709):
    """Creating the corpus from the Prodigy annotations."""
    Doc.set_extension("rel", default={})
    vocab = Vocab()

    docs = {"train": [], "dev": [], "test": []}
    ids = {"train": set(), "dev": set(), "test": set()}
    count_all = {"train": 0, "dev": 0, "test": 0}
    count_pos = {"train": 0, "dev": 0, "test": 0}

    long_rel_count = 0  # how many relations are dropped because their spans are too far apart
    error_count_rel = 0  # how often the label is something other than ARG0, ARG1, ARG

    with json_loc.open("r", encoding="utf8") as jsonfile:
        length_training_data = len([
            True for line in jsonfile if json.loads(line)["answer"] == "accept"
        ])
        msg.info(f"Number of accepted recipes: {length_training_data}")

    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)  #one recipe
            span_starts = set()

            if example["answer"] == "accept":
                neg = 0
                pos = 0
                try:
                    # Parse the tokens -> example["tokens"] = list of dicts
                    words = [t["text"] for t in example["tokens"]]  # all words
                    spaces = [t["ws"] for t in example["tokens"]]  # whether each word is followed by whitespace (True/False)
                    doc = Doc(vocab, words=words, spaces=spaces)

                    # Parse the entities
                    spans = example[
                        "spans"]  #list of dicts containing entities
                    entities = []
                    span_end_to_start = {}
                    ents_dict = {}
                    for span in spans:  #every detected span
                        entity = doc.char_span(
                            span["start"], span["end"], label=span["label"]
                        )  #"start" = wievielter character ist start character des spans im doc
                        span_end_to_start[span["token_end"]] = span[
                            "token_start"]  #end_token of span as key for start_token (start token = wievielter token in doc)
                        entities.append(entity)  #appended to list
                        span_starts.add(span["token_start"])  #added to set
                        ents_dict[span["token_start"]] = (span["label"],
                                                          span["token_start"])
                    doc.ents = entities  #entity list assigned as doc entites

                    # Parse the relations
                    rels = {}

                    # create token combinations
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 1a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRÄP"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 1a
                                        if DIFF_FRONT_BACK == True:

                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                rels[(x1, x2)] = {}

                                            else:
                                                pass
                                        #DIFF_FRONT_BACK 1b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                rels[(x1, x2)] = {
                                                }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...
                        #VERBS_TO_OTHER 1b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 2a
                                if DIFF_FRONT_BACK == True:

                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        rels[(x1, x2)] = {}

                                    else:
                                        pass
                                #DIFF_FRONT_BACK 2b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        rels[(x1, x2)] = {
                                        }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...

                    relations = example[
                        "relations"]  #relations is list of dict
                    for relation in relations:
                        # the 'head' and 'child' annotations refer to the end token in the span
                        # but we want the first token
                        start = span_end_to_start[relation[
                            "head"]]  #wievielter token ist start token des head
                        end = span_end_to_start[relation[
                            "child"]]  #wievielter token ist start token des child
                        label = relation["label"]

                        #DETAILED_ARGS 1a
                        if DETAILED_ARGS == True:
                            if label == "ARG0":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG0[ents_dict[end][
                                        0]]  #assign new label based on span type
                            elif label == "ARG1":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG1[ents_dict[end][0]]
                            elif label == "ARG":
                                if ents_dict[end][0] in ["Z", "TOOL"]:
                                    if ents_dict[end][0] == "Z":
                                        label = "Arg0Z"
                                    elif ents_dict[end][0] == "TOOL":
                                        label = "Arg1Tool"
                                else:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                            else:
                                error_count_rel += 1

                        #DETAILED_ARGS 1b
                        else:
                            label = MAP_LABELS_STANDARD[
                                label]  #MAP_LABELS = dict containing label as key

                        # Positive relations are being added
                        try:
                            if label not in rels[(
                                    start, end
                            )]:  #check if label already exists for token combination
                                rels[(
                                    start, end
                                )][label] = 1.0  #initialize label as new key with value 1.0
                                pos += 1  #positive case
                        except:
                            long_rel_count += 1  #error only if relation exists in annotation but isn't a valid token combi (too long/not starting from verb)
                            pass

                    # The annotation is complete, so fill in zero's where the data is missing
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 2a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRÄP"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 2a
                                        if DIFF_FRONT_BACK == True:
                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                #DETAILED_ARGS 2a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                            #DETAILED_ARGS 2b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0

                                        #DIFF_FRONT_BACK 2b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                #DETAILED_ARGS 3a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                                #DETAILED_ARGS 3b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0  #span combination with label as key gets 0 as value
                        #VERBS_TO_OTHER 2b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 3a
                                if DIFF_FRONT_BACK == True:
                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        #DETAILED_ARGS 4a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                    #DETAILED_ARGS 4b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                                #DIFF_FRONT_BACK 3b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        #DETAILED_ARGS 5a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                        #DETAILED_ARGS 5b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                    #print(rels)
                    doc._.rel = rels  # rels = {(1,1): {Arg0 : 1, Arg1 : 0, Arg : 0}, (1,2): {Arg0 : 0, ...}}

                    # only keeping documents with at least 1 positive case (if doc isn't annotated relations = empty list)
                    if pos > 0:

                        recipe_id = example["_input_hash"]

                        if len(docs["train"]) < round(
                                train_split * length_training_data):
                            ids["train"].add(recipe_id)
                            docs["train"].append(doc)
                            count_pos["train"] += pos
                            count_all["train"] += pos + neg
                        elif len(docs["test"]) < round(
                                test_split * length_training_data):
                            ids["test"].add(recipe_id)
                            docs["test"].append(doc)
                            count_pos["test"] += pos
                            count_all["test"] += pos + neg
                        else:
                            ids["dev"].add(recipe_id)
                            docs["dev"].append(doc)
                            count_pos["dev"] += pos
                            count_all["dev"] += pos + neg

                except KeyError as e:
                    msg.fail(
                        f"Skipping doc because of key error: {e} in {example['_input_hash']}"
                    )

    msg.info(
        f"{long_rel_count} relations have been cut because tokens are too far apart."
    )

    docbin = DocBin(docs=docs["train"], store_user_data=True)
    docbin.to_disk(train_file)
    msg.info(
        f"{len(docs['train'])} training recipes from {len(ids['train'])} unique recipes, "
        f"{count_pos['train']}/{count_all['train']} pos instances.")

    docbin = DocBin(docs=docs["dev"], store_user_data=True)
    docbin.to_disk(dev_file)
    msg.info(
        f"{len(docs['dev'])} dev recipes from {len(ids['dev'])} unique recipes, "
        f"{count_pos['dev']}/{count_all['dev']} pos instances.")

    docbin = DocBin(docs=docs["test"], store_user_data=True)
    docbin.to_disk(test_file)
    msg.info(
        f"{len(docs['test'])} test recipes from {len(ids['test'])} unique recipes, "
        f"{count_pos['test']}/{count_all['test']} pos instances.")
Example #47
    def test_load(self):
        vocab = Vocab.from_dir(path.join(English.default_data_dir(), 'vocab'))
Example #48
def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    vocab = Vocab(vectors_name="test_issue1807")
    assert "hello" not in vocab
    vocab.set_vector("hello", numpy.ones((50, ), dtype="f"))
    assert "hello" in vocab
Example #49
    def test_load(self):
        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))
Example #50
def vocab():
    return Vocab()
Example #51
    def test_load(self):
        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)