Example #1
    def test_feed(self):
        path = os.path.join(os.path.dirname(__file__), "./")
        storage = Storage(path)
        df = storage.read("raw/corpus_multi.csv", delimiter="\t",
                          names=["label", "review", "comment"])

        dp = DatasetPreprocessor()
        dp.process("review")\
            .by(ct.text.UnicodeNormalizer())\
            .by(ct.Tokenizer("en"))\
            .by(ct.token.StopwordFilter("en"))\
            .by(ct.Vocabulary(min_df=0, max_df=1.0))\
            .by(ct.formatter.Padding(length=5))\
            .fit(df.loc[:, ["review", "comment"]])
        dp.process("label")\
            .by(ct.formatter.CategoricalLabel(),
                reference=dp.process("review"))

        adjusted = dp(df).preprocess().format().processed
        self.assertEqual(len(adjusted["label"][0]),
                         dp.process("review").preprocessor.vocabulary.count)

        # Iterate
        for batch in dp(df).preprocess().iterate(batch_size=1, epoch=1):
            self.assertEqual(len(batch), 3)
            self.assertEqual(len(batch["review"][0]), 5)

            inversed = dp.inverse(batch)
            self.assertEqual(inversed["label"][0], np.argmax(batch["label"]))
            self.assertLessEqual(len(inversed["review"][0]), 5)
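These snippets are shown without their import preamble. Based on the chariot package layout, the tests in this listing likely assume something close to the following (the module paths for DatasetPreprocessor and LanguageModelPreprocessor are best guesses from the class names used):

import os

import joblib
import numpy as np

import chariot.transformer as ct
from chariot.preprocessor import Preprocessor
from chariot.storage import Storage
# Module paths below are assumptions inferred from the class names:
from chariot.dataset_preprocessor import DatasetPreprocessor
from chariot.language_model_preprocessor import LanguageModelPreprocessor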
Example #2
    def test_dataframe(self):
        path = os.path.join(os.path.dirname(__file__), "./data")
        storage = Storage(path)
        df = storage.read("raw/corpus.csv",
                          delimiter="\t",
                          names=["summary", "text"])

        preprocessor = Preprocessor(
            tokenizer=ct.Tokenizer("ja"),
            text_transformers=[ct.text.UnicodeNormalizer()],
            vocabulary=ct.Vocabulary(vocab_size=50))

        preprocessor.fit(df[["summary", "text"]])
        joblib.dump(preprocessor, "test_preprocessor.pkl")

        preprocessor = joblib.load("test_preprocessor.pkl")
        transformed = preprocessor.transform(df)
        inversed = preprocessor.inverse_transform(transformed)

        for c in df.columns:
            for o, i in zip(df[c], inversed[c]):
                self.assertEqual(o, "".join(i))

        print(inversed)
        os.remove("test_preprocessor.pkl")
Example #3
def preprocess(X, y, preprocessor=None):
    if preprocessor is None:
        preprocessor = Preprocessor()
        preprocessor\
            .stack(ct.text.UnicodeNormalizer())\
            .stack(ct.Tokenizer("ja"))\
            .fit(X['article'])

    processed = preprocessor.transform(X['article'])
    dataset = [
        text_to_instance([token.surface for token in document], int(label))
        for document, label in zip(processed, y)
    ]
    return dataset, preprocessor
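A typical call pattern for this helper is to fit on the training split, then pass the fitted preprocessor back in for other splits so that every split shares one tokenization and vocabulary. A sketch with illustrative variable names:

# Fit once on training data, then reuse the fitted preprocessor so
# validation articles are normalized and tokenized identically.
train_dataset, preprocessor = preprocess(train_X, train_y)
valid_dataset, _ = preprocess(valid_X, valid_y, preprocessor=preprocessor)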
Example #4
    def test_series(self):
        path = os.path.join(os.path.dirname(__file__), "./")
        storage = Storage(path)
        df = storage.read("raw/corpus_multi.csv", delimiter="\t",
                          names=["label", "review", "comment"])

        preprocessor = Preprocessor(
                            tokenizer=ct.Tokenizer("en"),
                            text_transformers=[ct.text.UnicodeNormalizer()],
                            token_transformers=[ct.token.StopwordFilter("en")],
                            vocabulary=ct.Vocabulary(min_df=0, max_df=1.0))

        preprocessor.fit(df["review"])
        transformed = preprocessor.transform(df["comment"])
        self.assertEqual(len(transformed), 3)
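Because the preprocessor is fitted on "review" but applied to "comment", a quick way to inspect what transform produced is the same round trip Example #2 uses, decoding the indices back to tokens:

# Decode the indexed output back to surface tokens for inspection
# (inverse_transform is the round trip shown in Example #2).
inversed = preprocessor.inverse_transform(transformed)
print(inversed[0])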
Example #5
    def test_feed(self):
        df = self._make_corpus()
        dp = LanguageModelPreprocessor()

        dp.process("sentence")\
          .by(ct.text.UnicodeNormalizer())\
          .by(ct.Tokenizer("en"))\
          .by(ct.token.StopwordFilter("en"))\
          .by(ct.Vocabulary(vocab_size=30))\
          .by(ct.generator.ShiftedTarget())\
          .fit(df)

        # Iterate
        b_len = 2
        s_len = 6
        for d, t in dp(df).preprocess().iterate(batch_size=b_len,
                                                sequence_length=s_len,
                                                epoch=2):
            self.assertEqual(d.shape, (s_len, b_len))
            self.assertEqual(t.shape, (s_len, b_len))
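If ct.generator.ShiftedTarget produces targets that are simply the inputs shifted one step ahead in time (an assumption from the name; the batches here are time-major), the pairing can be spot-checked like this:

for d, t in dp(df).preprocess().iterate(batch_size=b_len,
                                        sequence_length=s_len, epoch=1):
    # Assumption: t is d shifted one timestep ahead, so within a window
    # the target at step i equals the input at step i + 1.
    np.testing.assert_array_equal(d[1:], t[:-1])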
Example #6
    def __init__(self,
                 root="",
                 lang=None,
                 min_df=5,
                 max_df=sys.maxsize,
                 unknown="<unk>",
                 preprocessor_name="preprocessor",
                 log_dir=""):
        default_root = os.path.join(os.path.dirname(__file__), "../../")
        _root = root if root else default_root

        self.storage = Storage(_root)
        self.preprocessor_name = preprocessor_name
        self._base_log_dir = log_dir
        self._built = False
        self.preprocessor = Preprocessor(
            text_transformers=[
                ct.text.UnicodeNormalizer(),
                ct.text.LowerNormalizer()
            ],
            tokenizer=ct.Tokenizer(lang=lang),
            vocabulary=ct.Vocabulary(min_df=min_df,
                                     max_df=max_df,
                                     unknown=unknown))
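The snippet above is only the constructor of a larger wrapper class; instantiating it might look like this (MyTrainer is a placeholder, since the real class name is not shown; only the constructor arguments come from the snippet):

# Hypothetical usage of the wrapper whose __init__ is shown above.
trainer = MyTrainer(root="data", lang="en", min_df=3)
print(trainer.preprocessor)  # the Preprocessor assembled in __init__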
Example #7
import re


class RegexReplacer():  # class name assumed; the snippet is truncated here
    def __init__(self, pattern, replacement):
        self.pattern = pattern
        self.replacement = replacement

    def apply(self, text):
        # Replace every substring matching pattern with replacement
        return re.sub(self.pattern, self.replacement, text)


import pandas as pd
import chariot.transformer as ct
# Module paths below are assumptions inferred from the class names used:
from chariot.dataset_preprocessor import DatasetPreprocessor
from chariot.transformer.formatter import Padding

df = pd.read_pickle('news.pkl')
df = df[:10]

pad_length = 300

dp = DatasetPreprocessor()
dp.process('news')\
    .by(ct.text.UnicodeNormalizer())\
    .by(ct.text.LowerNormalizer())\
    .by(ct.text.SymbolFilter())\
    .by(ct.Tokenizer('ja'))\
    .by(ct.token.StopwordFilter('ja'))\
    .by(ct.Vocabulary(min_df=2, max_df=0.8))\
    .by(Padding(length=pad_length))\
    .fit(df['news'])

dp.process('class')\
    .by(ct.formatter.CategoricalLabel(num_class=9))

preprocessed = dp.preprocess(df)
print(preprocessed)
print("end")