Esempio n. 1
0
    def fit(self, X, y=None):
        """Work out which words should be filtered and remember them.

        Word occurrences are counted over every element that ``apply_map``
        hands to the counting callback. Two groups of words are then merged
        into ``self._filter_words``:

        * the ``self.n`` most frequent words, and
        * every word whose count is less than or equal to ``self.min_freq``.

        Args:
            X: Data to scan; it is passed to ``apply_map`` unchanged.
            y: Not used by this method.

        Returns:
            This instance, so that calls can be chained.
        """
        frequencies = Counter()

        def count_words(item):
            # ``token_to_words`` is expected to yield the words of one
            # element; Counter.update accumulates their occurrences.
            frequencies.update(self.token_to_words(item))

        apply_map(X, count_words)

        most_frequent = {
            term for term, _ in frequencies.most_common(self.n)
        }
        # Ordering is irrelevant for a set, so the raw items are enough here.
        infrequent = {
            term
            for term, count in frequencies.items()
            if count <= self.min_freq
        }

        self._filter_words = most_frequent | infrequent
        return self
Esempio n. 2
0
    def _test_apply_map(self, inplace):
        """Exercise ``apply_map`` on several container kinds.

        Two groups of samples are used: "value" samples hold scalars and are
        mapped with a doubling function, "array" samples hold lists and are
        mapped with ``sum``. For DataFrame, Series, dict and list samples the
        flattened result is compared with the function mapped over the
        flattened original.

        Args:
            inplace: Forwarded to ``apply_map``. When truthy, the (deep-copied)
                input object itself is inspected instead of the return value.
        """
        scalar_columns = {
            "column1": [1, 2, 3, 4, 5],
            "column2": [4, 5, 6, 7, 8],
        }
        list_columns = {
            "column1": [[0, 1], [2, 3, 4], [4, 5]],
            "column2": [[3, 4], [5, 6], [2, 1, 9]],
        }

        scalar_frame = pd.DataFrame(scalar_columns)
        list_frame = pd.DataFrame(list_columns)

        scalar_series = pd.Series(scalar_columns["column1"])
        list_series = pd.Series(list_columns["column1"])

        scalar_list = [1, 2, 3, 4, 5]
        nested_list = [[0, 1], [2, 3, 2], [4, 5, 1]]
        deep_nested_list = [[[3, 1], [2, 3, 1]], [[4, 5], [4, 5]]]

        def double(x):
            return x * 2

        def total(x):
            return sum(x)

        # (group label, function to apply, samples aligned with ``kinds``)
        groups = (
            ("value", double,
             [scalar_frame, scalar_series, scalar_columns, scalar_list, None]),
            ("array", total,
             [list_frame, list_series, list_columns, nested_list,
              deep_nested_list]),
        )
        kinds = ["DataFrame", "Series", "dict", "list", "multi-array"]

        for group, func, samples in groups:
            for kind, original in zip(kinds, samples):
                if original is None:
                    continue
                print("{}-{}".format(group, kind))

                # Work on a copy so ``original`` stays available as reference.
                working = copy.deepcopy(original)
                result = apply_map(working, func, inplace)
                if inplace:
                    result = working

                if kind == "dict":
                    for key in result:
                        self.assertEqual(
                            tuple(self.flatten(result[key])),
                            tuple(map(func, self.flatten(original[key]))))
                    continue

                if kind not in ("DataFrame", "Series", "list"):
                    # "multi-array" is only run through apply_map; no
                    # assertion is made on its result.
                    continue

                if kind == "DataFrame":
                    print(result)
                elif kind == "list":
                    print("{} => {}".format(original, result))

                self.assertEqual(tuple(self.flatten(result)),
                                 tuple(map(func, self.flatten(original))))
Esempio n. 3
0
    def fit(self, X, y=None):
        """Build the vocabulary from the words found in ``X``.

        Each element that ``apply_map`` passes to the callback is converted
        with ``self.token_to_words`` and counted. Terms are then selected in
        one of two ways:

        * ``self.vocab_size > 0``: the most frequent terms are taken until
          ``vocab_size`` terms have been collected.
        * otherwise: terms whose count lies within ``[min_df, max_df]`` are
          kept. An integral bound is used as an absolute count; any other
          number is multiplied by the number of samples.

        The configured reserved tokens (padding, unknown, begin/end of
        sequence) that are set and not already selected are placed in front
        of the selected terms, and the result is stored in ``self._vocab``.

        Args:
            X: Data to scan. For a dict, the number of samples is the length
                of its first value; otherwise it is ``len(X)``.
            y: Not used by this method.

        Returns:
            This instance, so that calls can be chained.
        """
        if isinstance(X, dict):
            # ``values`` has to be *called*; iterating the bound method itself
            # raises TypeError. An empty dict counts as zero samples.
            first_column = next(iter(X.values()), ())
            length = len(first_column)
        else:
            length = len(X)

        vocab = Counter()

        def update_vocab(element):
            words = self.token_to_words(element)
            if self.ignore_blank:
                # Drop tokens that consist of whitespace only.
                words = [w for w in words if w.strip()]
            vocab.update(words)

        apply_map(X, update_vocab)

        reserved = [self._padding, self._unknown,
                    self._begin_of_sequence, self._end_of_sequence]
        reserved = [r for r in reserved if r]  # skip tokens that are not set

        selected = []
        if self.vocab_size > 0:
            # most_common() yields terms from most to least frequent.
            for term, _ in vocab.most_common():
                if len(selected) >= self.vocab_size:
                    break
                selected.append(term)
        else:
            min_limit = (self.min_df
                         if isinstance(self.min_df, numbers.Integral)
                         else self.min_df * length)
            max_limit = (self.max_df
                         if isinstance(self.max_df, numbers.Integral)
                         else self.max_df * length)

            selected = [term for term, count in vocab.most_common()
                        if min_limit <= count <= max_limit]

        # Avoid listing a reserved token twice when it also occurs in the data.
        reserved = [r for r in reserved if r not in selected]
        self._vocab = reserved + selected
        return self
Esempio n. 4
0
 def transform(self, X):
     """Map ``self.apply`` over ``X`` via ``apply_map``.

     The operation is requested in place exactly when ``self.copy`` is
     falsy.

     Args:
         X: Data handed to ``apply_map``.

     Returns:
         Whatever ``apply_map`` returns.
     """
     modify_in_place = not self.copy
     return apply_map(X, self.apply, inplace=modify_in_place)
Esempio n. 5
0
 def inverse_transform(self, X):
     """Map ``self.inverse`` over ``X`` via ``apply_map``.

     Args:
         X: Data handed to ``apply_map``.

     Returns:
         Whatever ``apply_map`` returns.

     Raises:
         Exception: If ``self._vocab`` is empty.
     """
     vocabulary_is_empty = len(self._vocab) == 0
     if vocabulary_is_empty:
         raise Exception("Vocabulary has not made yet. Plase execute fit.")

     # NOTE(review): ``inplace`` receives ``self.copy`` as-is, i.e. the data
     # is modified in place when ``copy`` is truthy. If ``copy`` is meant to
     # protect the input this looks inverted - confirm the intended meaning.
     modify_in_place = self.copy
     return apply_map(X, self.inverse, inplace=modify_in_place)
Esempio n. 6
0
 def transform(self, X):
     """Run the tokenizer over ``X`` via ``apply_map``.

     Args:
         X: Data handed to ``apply_map``.

     Returns:
         Whatever ``apply_map`` returns.
     """
     tokenize = self.tokenizer.tokenize
     # ``self.copy`` is passed positionally as the third argument of
     # ``apply_map``. NOTE(review): presumably that parameter is ``inplace``
     # - confirm that passing ``copy`` here is intended.
     return apply_map(X, tokenize, self.copy)