# Assumed imports for the snippets below (spaCy v2.x API). get_batch and
# build_textcat_model are external helpers not defined here; hypothetical
# sketches of both appear further down.
import pytest
import spacy
from spacy._ml import Tok2Vec
from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc
from spacy.vocab import Vocab


# Parametrization values here are illustrative, not the originals.
@pytest.mark.parametrize("batch_size,width,embed_size", [(1, 8, 100), (4, 8, 100)])
def test_tok2vec_batch_sizes(batch_size, width, embed_size):
    batch = get_batch(batch_size)
    tok2vec = Tok2Vec(width, embed_size)
    vectors, backprop = tok2vec.begin_update(batch)
    # One output array per Doc, each of shape (n_tokens, width).
    assert len(vectors) == len(batch)
    for doc_vec, doc in zip(vectors, batch):
        assert doc_vec.shape == (len(doc), width)

# Parametrization values here are illustrative, not the originals.
@pytest.mark.parametrize(
    "tok2vec_config",
    [{"width": 8, "embed_size": 100}, {"width": 16, "embed_size": 200}],
)
def test_tok2vec_configs(tok2vec_config):
    docs = get_batch(3)
    tok2vec = Tok2Vec(**tok2vec_config)
    vectors, backprop = tok2vec.begin_update(docs)
    assert len(vectors) == len(docs)
    assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"])
    backprop(vectors)

def test_empty_doc():
    # A Doc with no tokens should still yield an output of shape (0, width).
    width = 128
    embed_size = 2000
    vocab = Vocab()
    doc = Doc(vocab, words=[])
    tok2vec = Tok2Vec(width, embed_size)
    vectors, backprop = tok2vec.begin_update([doc])
    assert len(vectors) == 1
    assert vectors[0].shape == (0, width)

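# The tests above call a get_batch() helper that is not shown in this section.
# This is a minimal, hypothetical sketch of such a helper: it builds `size`
# Docs of increasing length from a fresh Vocab. The real helper may differ.
def get_batch(size):
    vocab = Vocab()
    docs = []
    for n in range(1, size + 1):
        words = ["w%d" % i for i in range(n)]  # n placeholder tokens
        docs.append(Doc(vocab, words=words))
    return docs
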
# Binary sentiment pipeline built on top of a pretrained vectors model.
def create_pipeline(width, embed_size, vectors_model):
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    textcat = TextCategorizer(
        nlp.vocab,
        labels=["POSITIVE", "NEGATIVE"],
        model=build_textcat_model(Tok2Vec(width=width, embed_size=embed_size), 2, width),
    )
    nlp.add_pipe(textcat)
    return nlp

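# build_textcat_model is referenced but not defined in this section. Below is
# a minimal, hypothetical sketch assuming the Thinc v7.x API that spaCy v2
# builds on: pool the per-token vectors into one vector per Doc, then apply a
# softmax classifier. The real model may differ.
def build_textcat_model(tok2vec, nr_class, width):
    from thinc.api import chain, flatten_add_lengths
    from thinc.t2v import Pooling, mean_pool
    from thinc.v2v import Softmax

    model = chain(
        tok2vec,                   # Docs -> per-token vectors
        flatten_add_lengths,       # concatenate batch, remembering Doc lengths
        Pooling(mean_pool),        # mean over tokens -> one vector per Doc
        Softmax(nr_class, width),  # class probabilities
    )
    model.tok2vec = tok2vec
    return model
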
# Variant that passes pretrained vectors into Tok2Vec; starts from a blank
# pipeline when no vectors model is given.
def create_pipeline(lang, width, embed_size, vectors):
    if vectors is None:
        nlp = spacy.blank(lang)
    else:
        print("Load vectors", vectors)
        nlp = spacy.load(vectors)
    print("Start training")
    tok2vec = Tok2Vec(width=width, embed_size=embed_size, pretrained_vectors=vectors)
    textcat = TextCategorizer(
        nlp.vocab,
        labels=["POSITIVE", "NEGATIVE"],
        model=build_textcat_model(tok2vec, 2, width),
    )
    nlp.add_pipe(textcat)
    return nlp

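# A hypothetical example of driving the pipeline above with the standard
# spaCy v2 training loop; the toy texts and iteration count are illustrative:
def train_textcat_example():
    nlp = create_pipeline("en", width=96, embed_size=2000, vectors=None)
    train_data = [
        ("This was great!", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
        ("Utterly terrible.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
    ]
    optimizer = nlp.begin_training()
    for i in range(10):
        losses = {}
        texts, annotations = zip(*train_data)
        nlp.update(texts, annotations, sgd=optimizer, losses=losses)
        print(i, losses)
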
# Four-class variant without pretrained vectors in the Tok2Vec layer.
def create_pipeline(lang, width, embed_size, vectors):
    if vectors is None:
        nlp = spacy.blank(lang)
    else:
        print("Load vectors", vectors)
        nlp = spacy.load(vectors)
    print("Start training")
    tok2vec = Tok2Vec(width=width, embed_size=embed_size)
    textcat = TextCategorizer(
        nlp.vocab,
        labels=["1", "2", "3", "4"],
        model=build_textcat_model(tok2vec, 4, width),
    )
    nlp.add_pipe(textcat)
    return nlp

# Presumably a shared pytest fixture providing a small Tok2Vec instance.
@pytest.fixture
def tok2vec():
    return Tok2Vec(8, 100)
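
# Hypothetical example of a test consuming the fixture above; pytest injects
# the Tok2Vec instance by matching the parameter name to the fixture:
def test_tok2vec_fixture_shapes(tok2vec):
    docs = get_batch(2)
    vectors, backprop = tok2vec.begin_update(docs)
    assert all(v.shape[1] == 8 for v in vectors)  # width set in the fixture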