Example #1
0
class PredictorConvertible():
    """Bundle a DataFrameExtension with insights and turn them into a Predictor."""

    def __init__(self, df_or_dfe, insights):
        """Keep the data and insights for a later ``to_predictor`` call.

        Args:
            df_or_dfe: Either a ``DataFrameExtension`` (stored as is) or any
                other object, which is passed to ``DataFrameExtension(...)``.
            insights: Stored unchanged on ``self.insights``.
        """
        wrapped = df_or_dfe
        if not isinstance(wrapped, DataFrameExtension):
            wrapped = DataFrameExtension(wrapped)
        self.dfe = wrapped
        self.insights = insights
        # Tag sequence handed to Predictor.create by to_predictor().
        self._tag_order = [
            InsightIndex.COLUMN_CHECK_TAG,
            InsightIndex.ROW_CHECK_TAG,
            InsightIndex.PREPROCESSING,
            InsightIndex.FEATURE_AUGMENTATION,
            InsightIndex.LABEL_FORMAT,
            InsightIndex.FEATURE_SELECTION,
            InsightIndex.MODEL_SELECTION,
        ]

    def set_target(self, target):
        """Assign ``target`` to the wrapped extension's ``target`` attribute."""
        self.dfe.target = target

    def ignore(self, ignore_or_ignores):
        """Call ``self.dfe.drop`` once per given item.

        Args:
            ignore_or_ignores: A single item, or a list/tuple of items.
        """
        if isinstance(ignore_or_ignores, (list, tuple)):
            names = ignore_or_ignores
        else:
            names = [ignore_or_ignores]
        for name in names:
            self.dfe.drop(name)

    def to_predictor(self):
        """Return ``Predictor.create`` applied to the data, insights and tag order."""
        return Predictor.create(self.dfe, self.insights, self._tag_order)
Example #2
0
    def file_to_df(self, byte_str):
        """Parse a tab-separated byte string into a DataFrameExtension.

        The first line supplies the column names and the second line the
        feature-type name of each column, optionally followed by ``/PRED`` or
        ``/TGT``. Columns with an empty type name or the ``PRED`` attribute
        are dropped; the column marked ``TGT`` becomes the target.

        Args:
            byte_str: UTF-8 encoded, tab-separated file content.

        Returns:
            A ``DataFrameExtension`` built from the remaining columns, with
            ``target`` set to the ``TGT`` column name (``""`` if none).
        """
        stream = BytesIO(byte_str)
        columns = []
        ftype_names = []
        # Consume at most the two leading lines; the stream position is left
        # right after them so that read_csv continues from there.
        for line_no, raw in enumerate(stream):
            cells = [cell.strip() for cell in raw.decode("utf-8").split("\t")]
            if line_no == 0:
                columns = cells
            else:
                ftype_names = cells
                break

        # NOTE(review): read_csv is called with its default header handling,
        # so the first remaining line is presumably consumed as a header
        # before the names are overwritten below - confirm against the
        # expected file layout.
        df = pd.read_csv(stream, encoding="utf-8", sep="\t")
        df.columns = columns

        drop_positions = []
        name_by_column = {}
        target = ""
        for position, (column, spec) in enumerate(zip(columns, ftype_names)):
            if not spec:
                drop_positions.append(position)
                continue

            parts = spec.split("/")
            attr = parts[1] if len(parts) != 1 else ""
            if attr == "PRED":
                drop_positions.append(position)
                continue
            if attr == "TGT":
                target = column
            name_by_column[column] = parts[0]

        # Group the kept column names by feature type.
        ftypes = {}
        for ftype in FTypeNames:
            expected = FTypeNames[ftype]
            ftypes[ftype] = [
                column for column, label in name_by_column.items()
                if label == expected
            ]

        df.drop(df.columns[drop_positions], axis=1, inplace=True)
        dfe = DataFrameExtension(
            df,
            ftypes[FType.categorical],
            ftypes[FType.numerical],
            ftypes[FType.datetime],
            ftypes[FType.text],
            ftypes[FType.unique],
        )
        dfe.target = target
        return dfe
Example #3
0
 def __init__(self, df_or_dfe, insights):
     """Store the data as a DataFrameExtension together with the insights.

     Args:
         df_or_dfe: A ``DataFrameExtension`` is kept as is; anything else is
             wrapped with ``DataFrameExtension(...)``.
         insights: Stored unchanged on ``self.insights``.
     """
     is_extension = isinstance(df_or_dfe, DataFrameExtension)
     self.dfe = df_or_dfe if is_extension else DataFrameExtension(df_or_dfe)
     self.insights = insights
     # presumably the order in which tagged insights are handled downstream
     # - TODO confirm against the consumer of _tag_order
     self._tag_order = [
         InsightIndex.COLUMN_CHECK_TAG,
         InsightIndex.ROW_CHECK_TAG,
         InsightIndex.PREPROCESSING,
         InsightIndex.FEATURE_AUGMENTATION,
         InsightIndex.LABEL_FORMAT,
         InsightIndex.FEATURE_SELECTION,
         InsightIndex.MODEL_SELECTION,
     ]
Example #4
0
    def test_transform(self):
        """Check the datetime transformer restricted to two model features.

        Adopts ``DatetimeToCategoricalInsight`` on a frame with two datetime
        columns, limits the transformer to ``datetime_month`` and
        ``datetime2_day``, and verifies both the transformed column set and
        the month/day values stored on the adopted frame.
        """
        d = {
            "datetime": [
                datetime(2010, 1, 1),
                datetime(2015, 6, 30),
                datetime(2020, 3, 9)
            ],
            "datetime2": [
                datetime(2010, 1, 3),
                datetime(2015, 6, 15),
                datetime(2020, 3, 20)
            ]
        }
        df = pd.DataFrame(d)
        dfe = DataFrameExtension(df)

        di = DatetimeToCategoricalInsight()
        di.adopt(dfe)

        tf = di.get_transformer(dfe)
        tf.model_features = ["datetime_month", "datetime2_day"]
        transformed = tf.transform(pd.DataFrame(d))
        self.assertEqual(len(transformed.columns), 2)
        # assertTrue(value, other) uses `other` only as the failure message,
        # so the comparisons below must be assertEqual to check anything.
        # The column order is not part of the contract being tested, hence
        # the set comparison.
        self.assertEqual(set(transformed.columns.tolist()),
                         {"datetime_month", "datetime2_day"})
        self.assertEqual(dfe.df["datetime_month"].tolist(), [1, 6, 3])
        self.assertEqual(dfe.df["datetime2_day"].tolist(), [3, 15, 20])
    def test_insight(self):
        """Dummy-encode two categorical columns and check the new column names.

        After adoption the first column must be ``numericals`` and every
        column whose name starts with a source column name must be one of
        the ``<column>_<value>`` names built from that column's values.
        """
        category_columns = ["category1", "category2"]
        d = {
            "category1": pd.Series(["a", "b", "c", "b", "c", "a", "a", "b"]),
            "category2": pd.Series(["z", "z", "x", "y", "z", "z", "z", "x"]),
            "numericals": pd.Series([1, 2, 3, 2, 1, 2, 2, 1]),
        }
        dfe = DataFrameExtension(pd.DataFrame(d), categoricals=category_columns)

        insight = CategoricalToDummyInsight()
        targets = insight.get_insight_targets(dfe)
        self.assertEqual(len(category_columns), len(targets))

        insight.adopt(dfe)
        self.assertEqual("numericals", dfe.df.columns[0])
        for source in category_columns:
            levels = d[source].value_counts().index
            # default prefix
            expected = ["{}_{}".format(source, level) for level in levels]
            produced = [
                name for name in dfe.df.columns if name.startswith(source)
            ]
            for name in produced:
                self.assertTrue(name in expected)

        print(dfe.ftypes)
    def test_insight_classification(self):
        """Adopt ModelSelectionInsight on generated 3-class data; score must be positive."""
        X, y = make_classification(
            n_samples=1000, n_features=7, n_informative=4, n_classes=3)

        # Features c_0..c_{n-1} followed by the label as the last column.
        feature_names = ["c_{}".format(i) for i in range(X.shape[1])]
        values = np.hstack((X, y.reshape([-1, 1])))
        df = pd.DataFrame(values, columns=feature_names + ["target"])
        dfe = DataFrameExtension(df, categoricals=["target"], target="target")

        insight = ModelSelectionInsight()
        insight.adopt(dfe)

        self.assertTrue(insight.score > 0)
        print(insight.score)
    def test_insight(self):
        """RowCountInsight(max_count=50) must apply and leave max_count rows."""
        frame = pd.read_csv(self.FILE_NAME)
        dfe = DataFrameExtension(frame)
        insight = RowCountInsight(max_count=50)

        applicable = insight.is_applicable(dfe)
        self.assertTrue(applicable)
        description = insight.describe()
        self.assertTrue(description)

        insight.adopt(dfe)
        row_count = dfe.df.shape[0]
        self.assertEqual(row_count, insight.max_count)
        print(dfe.df.head(5))
    def test_insight_regression(self):
        """Adopt ModelSelectionInsight on generated regression data; score must be positive."""
        candidates = 4
        X, y = make_regression(n_samples=1000,
                               n_features=15,
                               n_informative=candidates,
                               n_targets=1)

        # Features c_0..c_{n-1} followed by the regression target.
        feature_names = ["c_{}".format(i) for i in range(X.shape[1])]
        values = np.hstack((X, y.reshape([-1, 1])))
        df = pd.DataFrame(values, columns=feature_names + ["target"])
        dfe = DataFrameExtension(df, numericals=["target"], target="target")

        insight = ModelSelectionInsight()
        insight.adopt(dfe)

        self.assertTrue(insight.score > 0)
        print(insight.score)
    def test_adopt(self):
        """Check NumericalScalingInsight via adopt() and via its transformer.

        The scaled target columns must have means that sum to roughly zero
        and an average standard deviation of roughly one, and the transformer
        applied to a freshly loaded frame must reproduce the adopted values.
        """
        dfe = DataFrameExtension.read_csv(self.FILE_NAME)
        insight = NumericalScalingInsight()

        targets = insight.get_insight_targets(dfe)

        insight.adopt(dfe)
        scaled1 = dfe.df[targets]
        # Compare absolute deviations: a signed comparison would also accept
        # arbitrarily large negative means or standard deviations far below 1.
        self.assertTrue(abs(scaled1.mean().sum()) < 1.0e-5)
        self.assertTrue(abs(scaled1.std().mean() - 1) < 0.1)

        dfe2 = DataFrameExtension.read_csv(self.FILE_NAME)
        transformer = insight.get_transformer(dfe2)
        scaled2 = transformer.transform(dfe2.df)
        scaled2 = scaled2[targets]
        self.assertTrue(abs(scaled2.mean().sum()) < 1.0e-5)
        self.assertTrue(abs(scaled2.std().mean() - 1) < 0.1)

        for c in scaled1.columns:
            self.assertEqual(0, (scaled1[c] != scaled2[c]).sum())
Example #10
0
    def test_insight(self):
        """NAFrequencyCheckInsight must apply and leave two of three columns.

        The frame has one column without missing values, one with 4 of 8
        values missing and one with 2 of 8 values missing.
        """
        d = {
            "category": pd.Series(
                ["a", "b", "c", "b", "c", "a", "a", "b"]),
            "with_50_na": pd.Series(
                ["a", None, "c", None, None, None, "a", "b"]),
            "with_20_na": pd.Series(
                ["a", "b", "c", "b", None, "a", None, "b"]),
        }
        dfe = DataFrameExtension(pd.DataFrame(d))

        insight = NAFrequencyCheckInsight()
        applicable = insight.is_applicable(dfe)
        self.assertTrue(applicable)
        insight.init_description()

        insight.adopt(dfe)
        remaining = len(dfe.df.columns)
        self.assertEqual(remaining, 2)
Example #11
0
    def test_insight_regression(self):
        """FeatureSelectionInsight on regression data keeps a bounded feature count.

        The number of remaining ftypes, excluding the target, must be at
        least ``candidates`` and below twice ``candidates``.
        """
        candidates = 4
        X, y = make_regression(
            n_samples=1000, n_features=15, n_informative=candidates,
            n_targets=1)

        feature_names = ["c_{}".format(i) for i in range(X.shape[1])]
        values = np.hstack((X, y.reshape([-1, 1])))
        df = pd.DataFrame(values, columns=feature_names + ["target"])
        dfe = DataFrameExtension(df, numericals=["target"], target="target")

        insight = FeatureSelectionInsight()
        insight.adopt(dfe)

        print("selected regressor features {}".format(dfe.ftypes.keys()))
        selected_count = len(dfe.ftypes) - 1  # -1 is target ftype
        self.assertTrue(candidates <= selected_count < candidates * 2)
Example #12
0
    def test_adopt_categorical(self):
        """LabelFormatInsight on a categorical target must be invertible.

        After adoption, the transformer's ``inverse_transform`` applied to
        the adopted target column must give back the original labels.
        """
        d = {
            "numerical": [0, 1, 2, 3],
            "categorical": ["a", "b", "c", "a"]
        }
        df = pd.DataFrame(d)
        # The trailing comma matters: ("categorical") is just a string, which
        # would be iterated character by character as column names.
        dfe = DataFrameExtension(df, categoricals=("categorical",),
                                 target="categorical")

        li = LabelFormatInsight()
        ts = li.get_insight_targets(dfe)
        self.assertEqual("categorical", ts[0])

        result = li.adopt(dfe)
        self.assertTrue(result)

        ts = li.get_transformer(dfe)
        inv = ts.inverse_transform(dfe.df["categorical"])
        self.assertEqual(inv.tolist(), d["categorical"])
Example #13
0
    def test_adopt_numerical(self):
        """LabelFormatInsight on a numerical target must be invertible.

        After adoption, the transformer's ``inverse_transform`` applied to
        the adopted target column must reproduce the original numbers within
        a small absolute tolerance.
        """
        d = {
            "numerical": [0, 1, 2, 3],
            "categorical": ["a", "b", "c", "a"]
        }
        df = pd.DataFrame(d)
        # The trailing comma matters: ("categorical") is just a string, which
        # would be iterated character by character as column names.
        dfe = DataFrameExtension(df, categoricals=("categorical",),
                                 target="numerical")

        li = LabelFormatInsight()
        ts = li.get_insight_targets(dfe)
        self.assertEqual("numerical", ts[0])

        result = li.adopt(dfe)
        self.assertTrue(result)

        ts = li.get_transformer(dfe)
        inv = ts.inverse_transform(np.array(dfe.df["numerical"]))
        # Sum absolute differences so that positive and negative errors cannot
        # cancel out and a large negative total cannot pass the check.
        diff = np.abs(inv.flatten() - np.array(d["numerical"])).sum()
        self.assertTrue(diff < 1e-10)
    def test_get_transformer(self):
        """The dummy transformer must only emit columns still present on the frame.

        Two of the eight dummy columns are dropped after adoption; the
        transformer built afterwards must produce the six remaining columns,
        each set to 1 in exactly one row.
        """
        d = {"category1": ["a", "b", "c", "d"], "category2": [1, 2, 3, 4]}
        source = pd.DataFrame.from_dict(d)
        dfe = DataFrameExtension(source,
                                 categoricals=("category1", "category2"))
        insight = CategoricalToDummyInsight()
        insight.adopt(dfe)

        self.assertEqual(8, len(dfe.ftypes))  # expand to dummy

        # drop 2 column (like useless feature)
        dropped = ["category1_a", "category2_3"]
        dfe.df.drop(dropped, axis=1, inplace=True)

        transformer = insight.get_transformer(dfe)
        df_t = transformer.transform(pd.DataFrame.from_dict(d))
        print(df_t.head())
        self.assertEqual(6, len(df_t.columns))

        kept = [
            "category1_b", "category1_c", "category1_d",
            "category2_1", "category2_2", "category2_4",
        ]
        for name in kept:
            self.assertTrue(name in df_t.columns)
            hits = df_t[df_t[name] == 1]
            self.assertEqual(1, len(hits))
Example #15
0
    def test_adopt(self):
        """Adopting DatetimeToCategoricalInsight splits a datetime into month/day.

        The single ``datetime`` column must be reported as the insight
        target, and after adoption the frame must hold two categorical
        columns with the expected month and day values.
        """
        d = {
            "datetime": [
                datetime(2010, 1, 1),
                datetime(2015, 6, 30),
                datetime(2020, 3, 9)
            ]
        }
        df = pd.DataFrame(d)
        dfe = DataFrameExtension(df)

        di = DatetimeToCategoricalInsight()
        ts = di.get_insight_targets(dfe)
        self.assertEqual("datetime", ts[0])

        result = di.adopt(dfe)
        self.assertTrue(result)
        self.assertEqual(len(dfe.df.columns), 2)
        # assertTrue(value, other) uses `other` only as the failure message,
        # so these comparisons must be assertEqual to check anything.
        self.assertEqual(dfe.ftypes["datetime_month"], FType.categorical)
        self.assertEqual(dfe.ftypes["datetime_day"], FType.categorical)
        self.assertEqual(dfe.df["datetime_month"].tolist(), [1, 6, 3])
        self.assertEqual(dfe.df["datetime_day"].tolist(), [1, 30, 9])
Example #16
0
    def test_insight_classification(self):
        """FeatureSelectionInsight on classification data keeps a bounded feature count.

        The number of remaining ftypes, excluding the target, must be at
        least ``candidates`` and below twice ``candidates``.
        """
        candidates = 3
        X, y = make_classification(
            n_samples=1000, n_features=25, n_informative=candidates,
            n_redundant=2, n_repeated=0, n_classes=5,
            n_clusters_per_class=1, random_state=0)

        feature_names = ["c_{}".format(i) for i in range(X.shape[1])]
        values = np.hstack((X, y.reshape([-1, 1])))
        df = pd.DataFrame(values, columns=feature_names + ["target"])
        dfe = DataFrameExtension(df, categoricals=["target"], target="target")

        insight = FeatureSelectionInsight()
        insight.adopt(dfe)

        print("selected classifier features {}".format(dfe.ftypes.keys()))
        selected_count = len(dfe.ftypes) - 1  # -1 is target ftype
        self.assertTrue(candidates <= selected_count < candidates * 2)
Example #17
0
    def load(self, app_id, query="", fields=(), target=""):
        """Fetch records of a kintone app and wrap them in a DataFrameExtension.

        Args:
            app_id: Passed to pykintone's ``app(...)`` and ``self.get_fields``.
            query: Optional query string; when non-empty, a space is appended
                before the ``limit`` clause is added.
            fields: Optional field codes. When non-empty, only the codes that
                exist in the app's field definitions are used, in this order.
            target: Optional field code whose label becomes ``dfe.target``.

        Returns:
            A ``DataFrameExtension`` whose columns are the field labels,
            grouped by each field's ``get_feature_type()``.
        """
        app = pykintone.login(self.env.domain, self.env.login_id,
                              self.env.password).app(app_id)
        fields_d = self.get_fields(app_id)
        if len(fields) > 0:
            fields_d = OrderedDict(
                (code, fields_d[code]) for code in fields if code in fields_d)

        prefix = query + " " if query else ""
        limit = self._kintone_limit
        field_codes = list(fields_d.keys())

        selected = app.select(query=prefix + "limit {}".format(limit),
                              fields=field_codes)
        records = selected.records
        if selected.total_count > limit:
            # NOTE(review): this issues floor(min(max_count, total) / limit)
            # follow-up requests after the first page, so the total fetched
            # can exceed max_count by up to one page.
            page_count = int(
                np.floor(min(self.max_count, selected.total_count) / limit))
            for page in range(1, page_count + 1):
                paged = app.select(
                    query=prefix +
                    "limit {} offset {}".format(limit, page * limit),
                    fields=field_codes)
                if len(paged.records) > 0:
                    records += paged.records

        # Only the first record decides which fields become columns.
        columns = [
            code for head in records[:1] for code in field_codes
            if code in head
        ]
        data = []
        if columns:
            data = [[record[code]["value"] for code in columns]
                    for record in records]

        fs = [fields_d[code] for code in columns]
        df = pd.DataFrame(np.array(data), columns=[f.label for f in fs])

        def labels_of(ftype):
            # Labels of the selected fields whose feature type equals `ftype`.
            return [f.label for f in fs if f.get_feature_type() == ftype]

        dfe = DataFrameExtension(df,
                                 labels_of(FType.categorical),
                                 labels_of(FType.numerical),
                                 labels_of(FType.datetime),
                                 labels_of(FType.text),
                                 labels_of(FType.unique))
        if target:
            dfe.target = fields_d[target].label
        return dfe
 def test_inference(self):
     """Print the frame's dtypes and the ftypes a DataFrameExtension derives."""
     frame = self.make_data_frame()
     print(frame.dtypes)
     extension = DataFrameExtension(frame)
     print(extension.ftypes)
    def test_insight_targets(self):
        """NumericalScalingInsight must report at least one target for the CSV."""
        extension = DataFrameExtension.read_csv(self.FILE_NAME)
        scaling = NumericalScalingInsight()

        found = scaling.get_insight_targets(extension)
        target_count = len(found)
        self.assertTrue(target_count > 0)