Code Example #1
File: serve.py  Project: carlward/short-answer-ml
# Imports assumed by this snippet; BiLSTMModel, MODEL_NAME, DEVICE and
# TOKENIZER are defined elsewhere in the project.
import torch
from pathlib import Path
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab


def model_fn(model_dir):
    # Rebuild the architecture with a placeholder embedding matrix
    # (41299 vocabulary rows x 300-dimensional vectors); the trained
    # embeddings are restored by load_state_dict below.
    model = BiLSTMModel(torch.zeros((41299, 300)),
                        nClasses=4,
                        hiddenSizeEncoder=2048,
                        hiddenSizeCls=512,
                        layers=1,
                        dropProb=0.0)
    # Load the trained weights from <model_dir>/<MODEL_NAME>.pt onto DEVICE.
    weights = torch.load(Path(model_dir) / '{}.pt'.format(MODEL_NAME),
                         map_location=DEVICE)
    model.load_state_dict(weights)
    model.to(DEVICE)
    model.eval()  # inference mode: disables dropout

    # Restore the serialized spaCy tokenizer saved alongside the model.
    tokenizer = Tokenizer(Vocab())
    tokenizer.from_disk(Path(model_dir) / '{}'.format(TOKENIZER))
    return {'model': model, 'tokenizer': tokenizer}
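
The model_fn(model_dir) signature matches the model-loading hook of SageMaker-style PyTorch serving, which the serve.py filename also suggests. Below is a minimal sketch of a companion predict_fn consuming the returned dict; the function name follows that serving convention, but its body, the raw-string input format, and the assumption that BiLSTMModel.forward accepts a (batch, seq_len) tensor of token ids are all illustrative, not taken from the project.

def predict_fn(input_data, artifacts):
    # Hypothetical inference hook; assumes input_data is a raw string.
    model, tokenizer = artifacts['model'], artifacts['tokenizer']
    doc = tokenizer(input_data)                    # spaCy Doc of the text
    ids = torch.tensor([[t.lex_id for t in doc]], device=DEVICE)
    with torch.no_grad():                          # no autograd at inference
        logits = model(ids)
    return int(logits.argmax(dim=-1))              # predicted class index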
Code Example #2
# Imports assumed by this snippet: Counter and Path are standard library;
# the rest come from spaCy (v2-style API) and NumPy.
from collections import Counter
from pathlib import Path

import numpy as np
from spacy.attrs import ORTH
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.vectors import Vectors
from spacy.vocab import Vocab


class VocabBuilder(object):
    def __init__(self,
                 rootDir='.cache',
                 vectorPath='vectors',
                 tokenizerPath='tokenizer'):
        self.vectorPath = Path.cwd() / rootDir / vectorPath
        self.tokenizerPath = Path.cwd() / rootDir / tokenizerPath
        self.tokenizer = Tokenizer(Vocab())
        self.vectors = Vectors(shape=(41299, 300))

    def _countWords(self, sequences, tokenizer):
        # Tally how often each surface form occurs across the corpus.
        self.tokenCounts = Counter()
        for seq in sequences:
            tokens = tokenizer(seq)
            for t in tokens:
                self.tokenCounts[t.text] += 1

    def fromDisk(self):
        # Restore a previously serialized tokenizer and vector table.
        self.tokenizer.from_disk(self.tokenizerPath)
        self.vectors.from_disk(self.vectorPath)

    def learnVocab(self, sequences, tokenizer, vectors, padToken='<pad>'):
        nlp = English()
        self._countWords(sequences, tokenizer=tokenizer)
        # Start from an empty vocab and keep vectors only for words seen
        # in the corpus, plus a zero vector for the padding token.
        nlp.vocab = Vocab()
        nlp.vocab.set_vector(padToken, np.zeros(vectors.data.shape[1]))
        for word in self.tokenCounts:
            # lex_id gives the word's row in the pretrained vector table.
            idx = tokenizer(word)[0].lex_id
            nlp.vocab.set_vector(word, vectors.data[idx])

        # Rebuild the tokenizer around the trimmed vocab, with a special
        # rule so padToken is always kept as a single token.
        self.tokenizer = Tokenizer(nlp.vocab,
                                   rules={padToken: [{
                                       ORTH: padToken
                                   }]},
                                   prefix_search=nlp.tokenizer.prefix_search,
                                   suffix_search=nlp.tokenizer.suffix_search,
                                   token_match=nlp.tokenizer.token_match,
                                   infix_finditer=nlp.tokenizer.infix_finditer)
        self.vectors = nlp.vocab.vectors

    def toDisk(self, tokenizerPath=None, vectorPath=None):
        # Persist the tokenizer and vector table, defaulting to the paths
        # configured in __init__.
        self.tokenizer.to_disk(tokenizerPath or self.tokenizerPath)
        self.vectors.to_disk(vectorPath or self.vectorPath)
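
A minimal usage sketch of VocabBuilder, assuming a spaCy pipeline with pretrained word vectors (so that lex_id values index rows of the vector table, as learnVocab requires); the model name and corpus strings are placeholders, not taken from the project.

import spacy

nlp = spacy.load('en_core_web_md')        # any model shipping word vectors
corpus = ['first student answer', 'second student answer']

builder = VocabBuilder()
builder.learnVocab(corpus,
                   tokenizer=nlp.tokenizer,
                   vectors=nlp.vocab.vectors)
builder.toDisk()    # writes to .cache/tokenizer and .cache/vectors

# Later (e.g. in serve.py's model_fn) the artifacts can be restored:
restored = VocabBuilder()
restored.fromDisk()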