Example 1
def train_word_vectors(vectors_loc, lang='zh', model_name='zh_model'):
    """
    加载词向量数据 从零开始训练 spacy 模型
    :param vectors_loc:
    :param lang:
    :param model_name:
    :return:
    """
    if lang is None:
        nlp = Language()
    else:
        # create an empty language class
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        print(nr_row, nr_dim)

        nlp.vocab.reset_vectors(width=int(nr_dim))

        count = 0
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            count += 1
            print(f'{word} added {count / int(nr_row) * 100:.2f}%')

            # add the vector to the vocab
            nlp.vocab.set_vector(word, vector)
    nlp.to_disk("data/" + model_name)
    print('finished!')
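Once the vectors have been written out, the directory produced by to_disk can be loaded back with spacy.load. A minimal round-trip sketch, assuming the model above was saved under data/zh_model; the test word is a hypothetical example and should be any word that appeared in the vector file:

import spacy

nlp = spacy.load("data/zh_model")  # directory written by nlp.to_disk(...)
word = "北京"  # hypothetical test word
if nlp.vocab.has_vector(word):
    # the vector width should match nr_dim from the file header
    print(nlp.vocab.get_vector(word).shape)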
Example 2
    def load_tospacy(self, lang='en'):
        """
		loads glove vectors from file specified in intialization, set vectors and save to disk
		:param lang:
		:return:
		"""

        if lang is None:
            # create a blank multi-language class ('xx') if lang is None
            nlp = spacy.blank('xx')

        else:
            nlp = spacy.blank(lang)

        custom_log('PARSING GLOVE MODEL')

        with open(self.glovedir, 'r', encoding="utf8") as glove_file:
            for line in glove_file:
                split_line = line.split()
                word = split_line[0]
                embedding = np.array([float(val) for val in split_line[1:]])
                nlp.vocab.set_vector(word, embedding)

        custom_log('VECTORS SET, SAVING TO DISK')

        nlp.to_disk(r'glove6B/glove-6B')
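Unlike the other snippets on this page, this one never calls reset_vectors, so depending on the spaCy version the vectors table may still have zero width when set_vector is first called. A hedged sketch of initializing the width from the first GloVe line (the path and language code are assumptions):

import spacy

nlp = spacy.blank("en")
glove_path = "glove.6B.300d.txt"  # hypothetical path
with open(glove_path, "r", encoding="utf8") as glove_file:
    # each GloVe line is: word followed by nr_dim floats
    nr_dim = len(glove_file.readline().split()) - 1
nlp.vocab.reset_vectors(width=nr_dim)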
Example 3
def main(vectors_loc, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        print(nr_row, nr_dim)

        nlp.vocab.reset_vectors(width=int(nr_dim))

        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

            print(word)
    # test the vectors and similarity
    # text = '您好'
    # doc = nlp(text)
    # print(text, doc[0].similarity(doc[1]))
    nlp.to_disk("./zh_model")
Example 4
def test_package(nlp: Language, chdir):
    d = Path().cwd()
    modeld = d / "model"
    pkgd = d / "package"
    pkgd.mkdir()
    nlp.to_disk(modeld)
    package(modeld, pkgd)
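The package() helper used here is the programmatic form of the spacy package command; assuming the same directory layout, the rough CLI equivalent is python -m spacy package ./model ./package, which turns a directory written by to_disk into an installable Python package.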
Example 5
def test_serialize_config_language_specific():
    """Test that config serialization works as expected with language-specific
    factories."""
    name = "test_serialize_config_language_specific"

    @English.factory(name, default_config={"foo": 20})
    def custom_factory(nlp: Language, name: str, foo: int):
        return lambda doc: doc

    nlp = Language()
    assert not nlp.has_factory(name)
    nlp = English()
    assert nlp.has_factory(name)
    nlp.add_pipe(name, config={"foo": 100}, name="bar")
    pipe_config = nlp.config["components"]["bar"]
    assert pipe_config["foo"] == 100
    assert pipe_config["factory"] == name

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
    assert nlp2.has_factory(name)
    assert nlp2.pipe_names == ["bar"]
    assert nlp2.get_pipe_meta("bar").factory == name
    pipe_config = nlp2.config["components"]["bar"]
    assert pipe_config["foo"] == 100
    assert pipe_config["factory"] == name

    config = Config().from_str(nlp2.config.to_str())
    config["nlp"]["lang"] = "de"
    with pytest.raises(ValueError):
        # German doesn't have a factory, only English does
        load_model_from_config(config)
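If the component is meant to survive a change of language (such as the config edit to "de" above), it can instead be registered on the base class with @Language.factory, which makes the factory available to every language. A minimal sketch, using a hypothetical component name:

from spacy.language import Language

@Language.factory("shared_component", default_config={"foo": 20})
def create_shared_component(nlp: Language, name: str, foo: int):
    # registered on the base class, so any "lang" value in the config can resolve it
    return lambda doc: doc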
Example 6
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        [
            "i'm looking for a place in the north of town",
            [[31, 36, "LOCATION"]]
        ],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for itn in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            nlp.update([raw_text], [{"entities": entity_offsets}])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            if entity_offsets:
                raise Exception(ents)
Example 7
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)

    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
Example 8
def test_issue_3526_4(en_vocab):
    nlp = Language(vocab=en_vocab)
    patterns = [{"label": "ORG", "pattern": "Apple"}]
    config = {"overwrite_ents": True}
    ruler = nlp.add_pipe("entity_ruler", config=config)
    ruler.add_patterns(patterns)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
Example 9
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for itn in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            nlp.update([raw_text], [{"entities": entity_offsets}])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            if entity_offsets:
                raise Exception(ents)
Example 10
def main(lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    file_loc = './广电全量地址.txt'
    #file_loc = '/home/siy/Downloads/guizhou/new/txt/0.txt'
    nr_dim = 768
    nlp.vocab.reset_vectors(width=int(nr_dim))
    cnt = 0
    with open(file_loc, 'r') as f:
        # df = pd.read_csv(f)
        lines = f.readlines()
        np.random.shuffle(lines)
        lines = lines[:10000]
        for line in lines:
            # line = line.decode()
            # print(line)
            line = strQ2B(line)
            line = line.strip()
            line = clr(line)
            line = line.strip()
            vecs = []
            try:
                print(line)
                vecs = bc.encode(list(line))
            except Exception:
                traceback.print_exc()
                print(list(line))
                continue
            for char, vec in zip(line, vecs):
                try:
                    # use the vector already computed for this character above
                    nlp.vocab.set_vector(ord(char), vec)
                except BaseException:
                    traceback.print_exc()
                    print(char)
                    continue
            cnt += 1
            print('bingo,  i write in %s' % cnt)
    nlp.to_disk('./zh_models')
Example 11
def main(vectors_loc=None, lang=None):

    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(VECTORS_PATH, "rb") as file_:
        print("loading vectors...")
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode("utf8")
            pieces = line.rsplit(" ", int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    print("trainning tags...")
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)
    optimizer = nlp.begin_training()
    for i in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)

    # test the trained model
    test_text = "Eu desejo ouvir uma música muito boa"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

    print("Saved mode to nl_model_tagger")

    nlp.to_disk("/app/model")
Example 12
def test_serialize_with_custom_tokenizer():
    """Test that serialization with custom tokenizer works without token_match.
    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
    """
    prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
    suffix_re = re.compile(r"""""")
    infix_re = re.compile(r"""[~]""")

    def custom_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
        )

    nlp = Language()
    nlp.tokenizer = custom_tokenizer(nlp)
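    # (added sketch) quick smoke test of the custom tokenizer before serializing;
    # the input string is an assumption chosen to exercise the "1/" prefix and "~" infix
    doc = nlp("1/sample~text")
    print([t.text for t in doc])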
    with make_tempdir() as d:
        nlp.to_disk(d)
Example 13
def main(vectors_loc, lang=None, model_name='zh_model'):
    if lang is None:
        nlp = Language()
    else:
        # create an empty language class
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        print(nr_row, nr_dim)

        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            # add the vectors to the vocab
            nlp.vocab.set_vector(word, vector)
    nlp.to_disk("data/" + model_name)
    print('finished!')
Example 14
def test_serialize_with_custom_tokenizer():
    """Test that serialization with custom tokenizer works without token_match.
    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
    """
    prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
    suffix_re = re.compile(r"""""")
    infix_re = re.compile(r"""[~]""")

    def custom_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
        )

    nlp = Language()
    nlp.tokenizer = custom_tokenizer(nlp)
    with make_tempdir() as d:
        nlp.to_disk(d)
Example 15
def recreateWordVectors(vectors_loc="wordfeats/glove.6B/glove.6B.50d.txt",
                        save_loc="wordfeats"):

    lang = "en"
    if lang is None:
        nlp = Language()
    else:
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        # nr_row, nr_dim = header.split()
        nr_dim = 50
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

    nlp.to_disk(save_loc)
    return
Example 16
def main(vectors_loc, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    print('=' * 20)
    new_model = KeyedVectors.load_word2vec_format(vectors_loc, binary=True)
    for word in new_model.wv.index2word:
        vector = numpy.asarray([float(v) for v in new_model[word]], dtype='f')
        nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
        # print(word, vector)
    print('=' * 20)

    # test the vectors and similarity
    # text = '不同'
    # doc = nlp(text)
    # print(text, doc[0].similarity(doc[1]))
    # print('='*20)
    nlp.to_disk("./zh_model")
Example 17
def save_model(nlp: Language, path: Path) -> None:
    nlp.to_disk(path)
    logger.info(f"Saved the model in {str(path.absolute())}")
Example 18
def save_model(nlp: Language, output_path: str):
    nlp.to_disk(output_path)
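A load counterpart is straightforward, since spacy.load accepts the directory written by to_disk. A minimal sketch (the function name is an assumption):

import spacy
from spacy.language import Language

def load_model(model_path: str) -> Language:
    # mirror of save_model: read the pipeline back from the directory written by to_disk
    return spacy.load(model_path)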
Example 19
def test_serialize_language_meta_disk(meta_data):
    language = Language(meta=meta_data)
    with make_tempdir() as d:
        language.to_disk(d)
        new_language = Language().from_disk(d)
    assert new_language.meta == language.meta
Example 20
    words = [
        "neice",
        "king",
        "queen",
        "dude",
        "guy",
        "gal",
        "fire",
        "dog",
        "cat",
        "mouse",
        "red",
        "bluee",
        "green",
        "yellow",
        "water",
        "person",
        "family",
        "brother",
        "sister",
    ]
    nlp = spacy.load("en_core_web_md")
    vec_data = {w: nlp(w).vector for w in words}
    vocab = Vocab(strings=words)
    for word, vector in vec_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab, meta={"lang": "en"})
    vocab.to_disk("tests/custom_test_vocab")
    print("local vocab saved for spacy")
    nlp.to_disk("tests/custom_test_lang")
    print("local nlp saved for spacy")
Example 21
def save_model(model: Language, project=None, session=None):
    output_dir = get_model_dir(project, session)
    logging.debug("Saving model to {}...".format(output_dir))
    model.to_disk(output_dir)
    return output_dir
Example 22
def test_serialize_language_meta_disk(meta_data):
    language = Language(meta=meta_data)
    with make_tempdir() as d:
        language.to_disk(d)
        new_language = Language().from_disk(d)
    assert new_language.meta == language.meta