import numpy
from numpy.testing import assert_equal
from spacy.tests.util import make_tempdir
from spacy.vectors import Vectors


def test_vectors_serialize():
    data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    # Round-trip through bytes.
    b = v.to_bytes()
    v_r = Vectors()
    v_r.from_bytes(b)
    assert_equal(v.data, v_r.data)
    assert v.key2row == v_r.key2row
    v.resize((5, 4))
    v_r.resize((5, 4))
    row = v.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f"))
    row_r = v_r.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f"))
    assert row == row_r
    assert_equal(v.data, v_r.data)
    assert v.is_full == v_r.is_full
    # Round-trip through disk.
    with make_tempdir() as d:
        v.to_disk(d)
        v_r.from_disk(d)
        assert_equal(v.data, v_r.data)
        assert v.key2row == v_r.key2row
        v.resize((5, 4))
        v_r.resize((5, 4))
        row = v.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f"))
        row_r = v_r.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f"))
        assert row == row_r
        assert_equal(v.data, v_r.data)
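A minimal standalone sketch of the invariant the test above exercises: Vectors.add returns the row index, and key2row maps the hashed key to that row. The names here are illustrative, not part of the test.

import numpy
from spacy.strings import hash_string
from spacy.vectors import Vectors

vecs = Vectors(shape=(2, 4))
row = vecs.add("word", vector=numpy.asarray([1, 2, 3, 4], dtype="f"))
assert vecs.key2row[hash_string("word")] == row
assert not vecs.is_full  # one of the two rows is still unset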
from collections import Counter
from pathlib import Path

import numpy as np
from spacy.attrs import ORTH
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.vectors import Vectors
from spacy.vocab import Vocab


class VocabBuilder(object):
    def __init__(self, rootDir='.cache', vectorPath='vectors', tokenizerPath='tokenizer'):
        self.vectorPath = Path.cwd() / rootDir / vectorPath
        self.tokenizerPath = Path.cwd() / rootDir / tokenizerPath
        self.tokenizer = Tokenizer(Vocab())
        self.vectors = Vectors(shape=(41299, 300))

    def _countWords(self, sequences, tokenizer):
        # Count token frequencies across all input sequences.
        self.tokenCounts = Counter()
        for seq in sequences:
            tokens = tokenizer(seq)
            for t in tokens:
                self.tokenCounts[t.text] += 1

    def fromDisk(self):
        self.tokenizer.from_disk(self.tokenizerPath)
        self.vectors.from_disk(self.vectorPath)

    def learnVocab(self, sequences, tokenizer, vectors, padToken='<pad>'):
        nlp = English()
        self._countWords(sequences, tokenizer=tokenizer)
        # Start from an empty vocab; map the pad token to a zero vector.
        nlp.vocab = Vocab()
        nlp.vocab.set_vector(padToken, np.zeros(vectors.data.shape[1]))
        for word in self.tokenCounts:
            idx = tokenizer(word)[0].lex_id
            nlp.vocab.set_vector(word, vectors.data[idx])
        # Rebuild the tokenizer so the pad token is kept as a single token.
        self.tokenizer = Tokenizer(
            nlp.vocab,
            rules={padToken: [{ORTH: padToken}]},
            prefix_search=nlp.tokenizer.prefix_search,
            suffix_search=nlp.tokenizer.suffix_search,
            token_match=nlp.tokenizer.token_match,
            infix_finditer=nlp.tokenizer.infix_finditer,
        )
        self.vectors = nlp.vocab.vectors

    def toDisk(self, tokenizerPath=None, vectorPath=None):
        self.tokenizer.to_disk(tokenizerPath or self.tokenizerPath)
        self.vectors.to_disk(vectorPath or self.vectorPath)
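A hypothetical usage sketch for VocabBuilder. The sequences and base pipeline are assumptions for illustration; learnVocab expects the tokenizer's lex_id values to index into the supplied vector table, which holds for a pretrained pipeline such as en_core_web_md.

import spacy

nlp = spacy.load('en_core_web_md')  # supplies both a tokenizer and pretrained vectors
sequences = ['the quick brown fox', 'jumps over the lazy dog']

builder = VocabBuilder(rootDir='.cache')
builder.learnVocab(sequences, tokenizer=nlp.tokenizer, vectors=nlp.vocab.vectors)
builder.toDisk()    # persist tokenizer and vectors under .cache/
builder.fromDisk()  # later runs can reload instead of rebuilding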
import os

# Module-level cache of loaded pipelines, keyed by embeddings path.
nlp_objects = {}


def get_nlp(model="en", embeddings_path=None):
    import spacy

    if embeddings_path not in nlp_objects:
        if embeddings_path is None:
            nlp_ = spacy.load(model)
        elif embeddings_path.endswith(".bin"):
            # Binary vector file: load the model without vectors, then
            # attach them (legacy spaCy 1.x API).
            nlp_ = spacy.load(model, vectors=False)
            nlp_.vocab.load_vectors_from_bin_loc(embeddings_path)
        elif os.path.isdir(embeddings_path):
            # Directory produced by Vectors.to_disk.
            from spacy.vectors import Vectors

            vectors = Vectors().from_disk(embeddings_path)
            nlp_ = spacy.load(model, vectors=False)
            nlp_.vocab.vectors = vectors
        else:
            nlp_ = spacy.load(model, vectors=embeddings_path)
        nlp_objects[embeddings_path] = nlp_
    return nlp_objects[embeddings_path]
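Hypothetical calls exercising each branch of get_nlp; the paths are placeholders, not real files.

nlp_default = get_nlp("en")                                 # plain model load
nlp_binary = get_nlp("en", embeddings_path="vectors.bin")   # .bin branch
nlp_from_dir = get_nlp("en", embeddings_path="my_vectors")  # Vectors.from_disk branch
assert get_nlp("en") is nlp_default                         # cached per embeddings_path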
import config
import utils
from train import spacy_tokenizer

import en_core_web_md
import numpy as np
from spacy.strings import hash_string
from spacy.vectors import Vectors

# for bundled
# set_data_path('./')

logger = utils.get_logger()

print('**************** Loading model... ****************')
# too slow
# _nlp = spacy.load('en_core_web_md')
_vectors = Vectors()
# _vectors.from_disk('/home/han/.local/lib/python3.6/site-packages/en_core_web_md/en_core_web_md-2.1.0/vocab/')
_vectors.from_disk('%s/%s-%s/vocab/' % (en_core_web_md.__path__[0],
                                        en_core_web_md.__name__,
                                        en_core_web_md.__version__))
_vector_size = _vectors.shape[1]


def get_sent_vector(sent):
    # use regexp tokenizer to speed up
    vs = np.array([get_word_vector(w) for w in spacy_tokenizer(sent)])
    if len(vs) > 0:
        return vs.sum(axis=0) / vs.shape[0]  # average of the token vectors
    return np.zeros(_vector_size)


def get_word_vector(w):
    h = hash_string(w.lower())
    i = _vectors.key2row.get(h, 0)  # fall back to row 0 for unknown words
    return _vectors.data[i]
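A small usage sketch for the helpers above, assuming the module imported cleanly (en_core_web_md installed, local config/utils/train modules present).

sent_vec = get_sent_vector('A quick test sentence.')
print(sent_vec.shape)   # (300,) for en_core_web_md vectors
print(get_word_vector('test')[:5])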