import torch
from pathlib import Path
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

# BiLSTMModel, MODEL_NAME, TOKENIZER, and DEVICE are defined elsewhere in this project.
def model_fn(model_dir):
    # Rebuild the architecture with a placeholder embedding matrix; the trained
    # weights (embeddings included) are restored from the checkpoint below.
    model = BiLSTMModel(torch.zeros((41299, 300)), nClasses=4,
                        hiddenSizeEncoder=2048, hiddenSizeCls=512,
                        layers=1, dropProb=0.0)
    weights = torch.load(Path(model_dir) / '{}.pt'.format(MODEL_NAME),
                         map_location=DEVICE)
    model.load_state_dict(weights)
    model.to(DEVICE)
    model.eval()

    # Restore the spaCy tokenizer that was serialized alongside the model.
    tokenizer = Tokenizer(Vocab())
    tokenizer.from_disk(Path(model_dir) / '{}'.format(TOKENIZER))
    return {'model': model, 'tokenizer': tokenizer}
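# A minimal sketch of a matching predict_fn, not part of the original handler.
# It assumes the request has already been deserialized into a list of raw
# strings, that token.lex_id indexes into the embedding matrix (mirroring
# learnVocab below), that the '<pad>' token occupies row 0 of that matrix, and
# that the model's forward accepts a (batch, seq) tensor of token indices.
def predict_fn(input_data, model_artifacts):
    model = model_artifacts['model']
    tokenizer = model_artifacts['tokenizer']
    batch = [torch.tensor([t.lex_id for t in tokenizer(text)], dtype=torch.long)
             for text in input_data]
    # Pad to the longest sequence in the batch (padding_value=0 -> '<pad>' row).
    padded = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True).to(DEVICE)
    with torch.no_grad():
        logits = model(padded)
    return torch.argmax(logits, dim=1).cpu().tolist()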
from collections import Counter
from pathlib import Path

import numpy as np
from spacy.attrs import ORTH
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.vectors import Vectors
from spacy.vocab import Vocab


class VocabBuilder(object):
    """Trims a pretrained vectors table to the training vocabulary and
    serializes the resulting tokenizer and vectors."""

    def __init__(self, rootDir='.cache', vectorPath='vectors', tokenizerPath='tokenizer'):
        self.vectorPath = Path.cwd() / rootDir / vectorPath
        self.tokenizerPath = Path.cwd() / rootDir / tokenizerPath
        self.tokenizer = Tokenizer(Vocab())
        self.vectors = Vectors(shape=(41299, 300))

    def _countWords(self, sequences, tokenizer):
        # Tally token frequencies over the raw training sequences.
        self.tokenCounts = Counter()
        for seq in sequences:
            tokens = tokenizer(seq)
            for t in tokens:
                self.tokenCounts[t.text] += 1

    def fromDisk(self):
        self.tokenizer.from_disk(self.tokenizerPath)
        self.vectors.from_disk(self.vectorPath)

    def learnVocab(self, sequences, tokenizer, vectors, padToken='<pad>'):
        nlp = English()
        self._countWords(sequences, tokenizer=tokenizer)
        # Start from an empty vocab: keep a zero vector for the pad token plus
        # the pretrained vector of every token observed in the data.
        nlp.vocab = Vocab()
        nlp.vocab.set_vector(padToken, np.zeros(vectors.data.shape[1]))
        for word in self.tokenCounts:
            idx = tokenizer(word)[0].lex_id
            nlp.vocab.set_vector(word, vectors.data[idx])
        # Rebuild the tokenizer over the trimmed vocab; the special-case rule
        # keeps the pad token from being split.
        self.tokenizer = Tokenizer(nlp.vocab,
                                   rules={padToken: [{ORTH: padToken}]},
                                   prefix_search=nlp.tokenizer.prefix_search,
                                   suffix_search=nlp.tokenizer.suffix_search,
                                   token_match=nlp.tokenizer.token_match,
                                   infix_finditer=nlp.tokenizer.infix_finditer)
        self.vectors = nlp.vocab.vectors

    def toDisk(self, tokenizerPath=None, vectorPath=None):
        self.tokenizer.to_disk(tokenizerPath or self.tokenizerPath)
        self.vectors.to_disk(vectorPath or self.vectorPath)
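# A minimal usage sketch for VocabBuilder, assuming a vectors-bearing spaCy
# model such as en_core_web_md is installed; the model name and the toy
# trainTexts list are illustrative, not from the original code.
import spacy
import torch

nlp = spacy.load('en_core_web_md')
trainTexts = ['a man inspects a uniform .', 'the man is sleeping .']  # placeholder data

Path('.cache').mkdir(exist_ok=True)  # ensure the cache directory exists before toDisk
builder = VocabBuilder()
builder.learnVocab(trainTexts, tokenizer=nlp.tokenizer, vectors=nlp.vocab.vectors)
builder.toDisk()  # writes .cache/tokenizer and .cache/vectors

# Later, e.g. before training, the trimmed artifacts can be reloaded and the
# vectors table used to initialize the model's embedding matrix:
restored = VocabBuilder()
restored.fromDisk()
embeddings = torch.tensor(restored.vectors.data)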