def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None):
    """Initialize trainer state: CDB, config, vocabulary and the spaCy pipeline.

    Args:
        config: Configuration object passed through to the pipeline.
        cdb: Optional concept database; stored as-is (may be None).
        vocab: Optional pre-built vocabulary; a fresh ``Vocab`` is created
            when none is supplied.
        word_tokenizer: Optional word-level tokenizer callable; falls back
            to the internal ``_tok`` method when omitted.
    """
    self.cdb = cdb
    self.config = config
    self.w2v = None
    # Fall back to an empty vocabulary when the caller does not provide one.
    self.vocab = vocab if vocab is not None else Vocab()

    # Build the required spacy pipeline
    self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])

    # Get the tokenizer
    self.tokenizer = word_tokenizer if word_tokenizer is not None else self._tok

    # Used for saving if the real path is not set
    self.vocab_path = "./tmp_vocab.dat"
def setUp(self) -> None:
    """Create a fresh Vocab under test and ensure the temp directory exists."""
    self.undertest = Vocab()
    # Keep temporary artifacts next to this test module, under "tmp".
    here = os.path.dirname(os.path.realpath(__file__))
    self.tmp_dir = os.path.join(here, "tmp")
    os.makedirs(self.tmp_dir, exist_ok=True)