Example #1
 def fit(self, docs):
     """Build the vocabulary from an iterable of raw documents."""
     tokenizer = self._build_tokenizer()
     for raw in docs:
         tokens = tokenizer(raw)
         # add each distinct token once per document
         for token in set(tokens):
             self._add_token(token)
     # keep the id -> token mapping in sync with the token -> id mapping
     self.id2token = reverse_dict(self.token2id)
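The helper `_add_token` is not part of this snippet. A minimal sketch, assuming it simply assigns the next free integer id to tokens it has not seen before (the `set(tokens)` call suggests each document contributes at most once per token, consistent with document-frequency counting, but that bookkeeping is omitted here):

 def _add_token(self, token):
     # hypothetical sketch: give unseen tokens the next unused integer id
     if token not in self.token2id:
         self.token2id[token] = len(self.token2id)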
Example #2
 def __init__(self,
              token_pattern=" ",
              tokenizer=None,
              min_df=1,
              max_df=1.0,
              stop_words=None,
              filters=PUNC):
     self.id2token = {
         EOS_IDX: EOS,
         SOS_IDX: SOS,
         PAD_IDX: PAD,
         UNK_IDX: UNK
     }
     self.token2id = reverse_dict(self.id2token)
     self.dfs = {}
     self.n_docs = 0
     self._tokenizer = tokenizer
     self._token_pattern = token_pattern
     self._min_df = min_df
     self._max_df = max_df
     self._keep = {EOS_IDX, SOS_IDX, UNK_IDX, PAD_IDX}  # ids of the special tokens
     self._stop_words = stop_words
     self._filters = filters
     self.max_df = max_df
     self.min_df = min_df
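Both the private `self._max_df`/`self._min_df` and the public `self.max_df`/`self.min_df` are assigned (the same pattern appears in Example #7 below). One plausible reading is that `max_df` and `min_df` are properties whose setters validate the value before storing it in the private attribute. The property definitions are not part of the snippet, so the sketch below is an assumption:

 @property
 def max_df(self):
     return self._max_df

 @max_df.setter
 def max_df(self, value):
     # hypothetical check: a float max_df is read as a proportion of documents
     if isinstance(value, float) and not 0.0 <= value <= 1.0:
         raise ValueError("max_df as a float must be in [0.0, 1.0]")
     self._max_df = value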
Example #3
 def __getitem__(self, tokenid):
     """Get token by provided `tokenid`"""
     if len(self.id2token) != len(self.token2id):
         # the word->id mapping has changed (presumably via add_documents);
         # recompute id->word accordingly
         self.id2token = reverse_dict(self.token2id)
     return self.id2token[tokenid]  # raises KeyError for ids not in the mapping
Example #4
 def __init__(self,
              stop_words=None,
              filters='',
              ngram_range=(1, 1),
              min_df=1,
              max_df=1.0,
              lowercase=True,
              token_pattern=" ",
              tokenizer=None,
              pad=False,
              eos=False,
              sos=False,
              unk=False):
     self.id2token = {}
     self.token2id = reverse_dict(self.id2token)
     self.dfs = {}
     self.n_docs = 0
     self.stop_words = stop_words
     self.ngram_range = ngram_range
     self.min_df = min_df
     self.max_df = max_df
     self.tokenizer = tokenizer
     self.token_pattern = token_pattern
     self.lowercase = lowercase
     self.filters = re.compile(filters)
     self._pad = PAD if pad else None
     self._eos = EOS if eos else None
     self._sos = SOS if sos else None
     self._unk = UNK if unk else None
     self._init_dictionary()
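`_init_dictionary` is not shown. Since `_pad`, `_sos`, `_eos`, and `_unk` hold either a token string or `None`, one plausible sketch is that it reserves ids for whichever special tokens are enabled before any documents are processed (assumed behaviour, not taken from the snippet):

 def _init_dictionary(self):
     # hypothetical sketch: reserve ids for the enabled special tokens
     for token in (self._pad, self._sos, self._eos, self._unk):
         if token is not None and token not in self.token2id:
             self.token2id[token] = len(self.token2id)
     self.id2token = reverse_dict(self.token2id)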
Example #5
 def __init__(self, token_pattern=" ", tokenizer=None):
     self.token_pattern = token_pattern
     self.tokenizer = tokenizer
     self.token2id = {}
     self.id2token = {}
     # reserve the first ids for the special tokens
     for token in [EOS, SOS, PAD, UNK]:
         self._add_token(token)
     self.id2token = reverse_dict(self.token2id)
Example #6
 def _compact(self):
     """Assign new word ids to all words, shrinking gaps."""
     # build mapping from old id -> new id
     idmap = dict(
         zip(sorted(self.token2id.values()), range(len(self.token2id))))
     # reassign mappings to new ids
     self.token2id = {
         token: idmap[tokenid]
         for token, tokenid in self.token2id.items()
     }
     self.id2token = {}
     self.dfs = {idmap[tokenid]: freq for tokenid, freq in self.dfs.items()}
     self.id2token = reverse_dict(self.token2id)
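To illustrate the effect, here is the same remapping logic run standalone on made-up data with gaps left by filtering:

 # standalone illustration (example data is made up)
 token2id = {"cat": 0, "dog": 5, "fish": 9}
 dfs = {0: 3, 5: 1, 9: 2}
 idmap = dict(zip(sorted(token2id.values()), range(len(token2id))))
 token2id = {token: idmap[tokenid] for token, tokenid in token2id.items()}
 dfs = {idmap[tokenid]: freq for tokenid, freq in dfs.items()}
 # token2id is now {"cat": 0, "dog": 1, "fish": 2}
 # dfs is now      {0: 3, 1: 1, 2: 2}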
Example #7
 def __init__(self,
              token_pattern=" ",
              tokenizer=None,
              min_df=1,
              max_df=1.0,
              ngram_range=(1, 1)):
     self.id2token = {}
     self.token2id = reverse_dict(self.id2token)
     self.dfs = {}
     self.n_docs = 0
     self._tokenizer = tokenizer
     self._token_pattern = token_pattern
     self._min_df = min_df
     self._max_df = max_df
     self._ngram_range = ngram_range
     self._keep_ids = None
     self.max_df = max_df
     self.min_df = min_df
Example #8
 def test_reverse_dict(self):
     original = {2: "a", "c": 3}
     expected = {"a": 2, 3: "c"}
     assert reverse_dict(original) == expected
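All of the examples rely on `reverse_dict`, whose definition is not included above. The test implies it simply swaps keys and values, so a minimal implementation consistent with it would be:

 def reverse_dict(d):
     """Return a new dict with keys and values swapped."""
     return {value: key for key, value in d.items()}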