def fit(self, docs):
    tokenizer = self._build_tokenizer()
    for raw in docs:
        tokens = tokenizer(raw)
        # register each distinct token once per document
        for token in set(tokens):
            self._add_token(token)
    # rebuild the id -> token mapping after the vocabulary has changed
    self.id2token = reverse_dict(self.token2id)

def __init__(self, token_pattern=" ", tokenizer=None, min_df=1, max_df=1.0,
             stop_words=None, filters=PUNC):
    self.id2token = {
        EOS_IDX: EOS,
        SOS_IDX: SOS,
        PAD_IDX: PAD,
        UNK_IDX: UNK
    }
    self.token2id = reverse_dict(self.id2token)
    self.dfs = {}
    self.n_docs = 0
    self._tokenizer = tokenizer
    self._token_pattern = token_pattern
    self._min_df = min_df
    self._max_df = max_df
    self._keep = set([EOS_IDX, SOS_IDX, UNK_IDX, PAD_IDX])
    self._stop_words = stop_words
    self._filters = filters
    self.max_df = max_df
    self.min_df = min_df

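# The snippets in this section assume module-level special-token constants
# (PAD, UNK, SOS, EOS, their *_IDX ids, and a PUNC filter string) that are not
# shown here. A minimal sketch of what they might look like; the exact values
# are an assumption, chosen only to match the insertion order [EOS, SOS, PAD,
# UNK] seen in one of the constructors below:
import string

EOS, SOS, PAD, UNK = "<eos>", "<sos>", "<pad>", "<unk>"
EOS_IDX, SOS_IDX, PAD_IDX, UNK_IDX = 0, 1, 2, 3
PUNC = string.punctuation
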
def __getitem__(self, tokenid):
    """Get token by provided `tokenid`"""
    if len(self.id2token) != len(self.token2id):
        # the word -> id mapping has changed (presumably via add_documents);
        # recompute id -> word accordingly
        self.id2token = reverse_dict(self.token2id)
    return self.id2token[tokenid]  # will throw for non-existent ids

def __init__(self, stop_words=None, filters='', ngram_range=(1, 1), min_df=1,
             max_df=1.0, lowercase=True, token_pattern=" ", tokenizer=None,
             pad=False, eos=False, sos=False, unk=False):
    self.id2token = {}
    self.token2id = reverse_dict(self.id2token)
    self.dfs = {}
    self.n_docs = 0
    self.stop_words = stop_words
    self.ngram_range = ngram_range
    self.min_df = min_df
    self.max_df = max_df
    self.tokenizer = tokenizer
    self.token_pattern = token_pattern
    self.lowercase = lowercase
    self.filters = re.compile(filters)
    self._pad = PAD if pad else None
    self._eos = EOS if eos else None
    self._sos = SOS if sos else None
    self._unk = UNK if unk else None
    self._init_dictionary()

def __init__(self, token_pattern=" ", tokenizer=None):
    self.token_pattern = token_pattern
    self.tokenizer = tokenizer
    self.token2id = {}
    self.id2token = {}
    for token in [EOS, SOS, PAD, UNK]:
        self._add_token(token)
    self.id2token = reverse_dict(self.token2id)

def _compact(self):
    """Assign new word ids to all words, shrinking gaps."""
    # build mapping from old id -> new id
    idmap = dict(
        zip(sorted(self.token2id.values()), range(len(self.token2id))))
    # reassign mappings to new ids
    self.token2id = {
        token: idmap[tokenid]
        for token, tokenid in self.token2id.items()
    }
    self.id2token = {}
    self.dfs = {idmap[tokenid]: freq for tokenid, freq in self.dfs.items()}
    self.id2token = reverse_dict(self.token2id)

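# A standalone illustration of the gap-shrinking remap performed by `_compact`
# above. The example data below is hypothetical and not part of the original
# class: old ids 0, 3, 7 keep their relative order but are renumbered 0, 1, 2.
old_token2id = {"pad": 0, "cat": 3, "dog": 7}
idmap = dict(zip(sorted(old_token2id.values()), range(len(old_token2id))))
new_token2id = {token: idmap[old_id] for token, old_id in old_token2id.items()}
assert new_token2id == {"pad": 0, "cat": 1, "dog": 2}
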
def __init__(self, token_pattern=" ", tokenizer=None, min_df=1, max_df=1.0,
             ngram_range=(1, 1)):
    self.id2token = {}
    self.token2id = reverse_dict(self.id2token)
    self.dfs = {}
    self.n_docs = 0
    self._tokenizer = tokenizer
    self._token_pattern = token_pattern
    self._min_df = min_df
    self._max_df = max_df
    self._ngram_range = ngram_range
    self._keep_ids = None
    self.max_df = max_df
    self.min_df = min_df

def test_revers_dict(self):
    o = {2: "a", "c": 3}
    t = {"a": 2, 3: "c"}
    assert reverse_dict(o) == t
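
# Every snippet above calls a `reverse_dict` helper that is not included in
# this section. A minimal sketch consistent with the test above, assuming the
# dictionary's values are hashable and unique:
def reverse_dict(d):
    """Return a new dict with keys and values swapped."""
    return {value: key for key, value in d.items()}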