def __init__(self, model, normalize=True, embedding_file=None,
             char_embedding_file=None, num_workers=None):
    """Load a saved DocReader model and set up tokenization workers.

    Args:
        model: path to saved model file.
        normalize: squash output score to 0-1 probabilities with a softmax.
        embedding_file: if provided, will expand dictionary to use all
            available pretrained word vectors in this file.
        char_embedding_file: if provided, will expand the character
            dictionary to use all pretrained character vectors in this file.
        num_workers: number of CPU processes to use to preprocess batches.
            None or a positive int spawns a ProcessPool; any other value
            (e.g. 0) disables multiprocessing and tokenizes in-process.
    """
    logger.info('Initializing model...')
    self.model = DocReader.load(model, normalize=normalize)

    if embedding_file:
        logger.info('Expanding dictionary...')
        # BUG FIX: the return value was previously discarded, leaving
        # `words` undefined and raising NameError on the next line.
        words = utils.index_embedding_words(embedding_file)
        added_words = self.model.expand_dictionary(words)
        self.model.load_embeddings(added_words, embedding_file)

    if char_embedding_file:
        logger.info('Expanding dictionary...')
        chars = utils.index_embedding_chars(char_embedding_file)
        added_chars = self.model.expand_char_dictionary(chars)
        self.model.load_char_embeddings(added_chars, char_embedding_file)

    logger.info('Initializing tokenizer...')
    annotators = get_annotators_for_model(self.model)
    if num_workers is None or num_workers > 0:
        # Each worker process builds its own tokenizer via the module-level
        # init() initializer, passed the annotator set as its options dict.
        self.workers = ProcessPool(
            num_workers,
            initializer=init,
            initargs=({'annotators': annotators},),
        )
    else:
        self.workers = None
    # In-process tokenizer used when no worker pool is active.
    self.tokenizer = SpacyTokenizer(annotators=annotators)
def init(options):
    """Per-process initializer for the tokenizer worker pool.

    Builds a tokenizer from *options* (keyword arguments forwarded to
    SpacyTokenizer), stores it in the module-level TOK global, and
    registers a shutdown hook so it is cleaned up on process exit.
    """
    global TOK
    TOK = SpacyTokenizer(**options)
    # Run TOK.shutdown when the worker process finalizes.
    Finalize(TOK, TOK.shutdown, exitpriority=100)
def init():
    # No-argument per-process initializer: builds a tokenizer configured by
    # the module-level ANNTOTORS constant and stores it in the TOK global.
    # NOTE(review): `ANNTOTORS` looks like a typo of `ANNOTATORS` — verify
    # against where this constant is defined elsewhere in the file before
    # renaming, since the typo may exist at the definition site as well.
    # NOTE(review): this redefines `init`; another `init(options)` with the
    # same name exists nearby — confirm these live in different modules.
    global TOK
    TOK = SpacyTokenizer(annotators=ANNTOTORS)
    # Run TOK.shutdown when the worker process finalizes.
    Finalize(TOK, TOK.shutdown, exitpriority=100)