class PredictorConvertible:
    """Wrap a DataFrame (or DataFrameExtension) plus insights so the pair
    can be turned into a Predictor."""

    def __init__(self, df_or_dfe, insights):
        # Accept either a raw DataFrame or an already-wrapped extension.
        if isinstance(df_or_dfe, DataFrameExtension):
            self.dfe = df_or_dfe
        else:
            self.dfe = DataFrameExtension(df_or_dfe)
        self.insights = insights
        # Fixed order in which insight tags are applied when building a predictor.
        self._tag_order = [
            InsightIndex.COLUMN_CHECK_TAG,
            InsightIndex.ROW_CHECK_TAG,
            InsightIndex.PREPROCESSING,
            InsightIndex.FEATURE_AUGMENTATION,
            InsightIndex.LABEL_FORMAT,
            InsightIndex.FEATURE_SELECTION,
            InsightIndex.MODEL_SELECTION,
        ]

    def set_target(self, target):
        """Set the prediction target column on the wrapped extension."""
        self.dfe.target = target

    def ignore(self, ignore_or_ignores):
        """Drop one column, or every column in a list/tuple, from the data."""
        if isinstance(ignore_or_ignores, (list, tuple)):
            columns = ignore_or_ignores
        else:
            columns = [ignore_or_ignores]
        for column in columns:
            self.dfe.drop(column)

    def to_predictor(self):
        """Build a Predictor from the wrapped data, insights and tag order."""
        return Predictor.create(self.dfe, self.insights, self._tag_order)
def file_to_df(self, byte_str):
    """Parse a tab-separated byte string into a DataFrameExtension.

    Expected layout: line 1 holds column names, line 2 holds feature-type
    specs of the form "<ftype>" or "<ftype>/<ATTR>" (ATTR: "PRED" marks a
    prediction-output column to drop, "TGT" marks the target), and the
    remaining lines are the TSV data body.
    """
    stream = BytesIO(byte_str)
    columns = []
    ftype_names = []
    # Consume exactly the two header lines so read_csv starts at the body.
    for line_no, raw in enumerate(stream):
        cells = [cell.strip() for cell in raw.decode("utf-8").split("\t")]
        if line_no == 0:
            columns = cells
        else:
            ftype_names = cells
            break

    df = pd.read_csv(stream, encoding="utf-8", sep="\t")
    df.columns = columns

    ignored_columns = []  # positional indices of columns to drop
    _ftype_names = {}     # column name -> feature-type name
    target = ""
    for position, (column, spec) in enumerate(zip(columns, ftype_names)):
        if not spec:
            # No type spec at all: column is ignored.
            ignored_columns.append(position)
            continue
        parts = spec.split("/")
        type_name = parts[0]
        attr = parts[1] if len(parts) > 1 else ""
        if attr == "PRED":
            # Prediction-output columns are excluded from the features.
            ignored_columns.append(position)
        else:
            if attr == "TGT":
                target = column
            _ftype_names[column] = type_name

    # Group column names by feature type.
    ftypes = {}
    for ftype in FTypeNames:
        label = FTypeNames[ftype]
        ftypes[ftype] = [c for c, n in _ftype_names.items() if n == label]

    df.drop(df.columns[ignored_columns], axis=1, inplace=True)
    dfe = DataFrameExtension(df, ftypes[FType.categorical],
                             ftypes[FType.numerical], ftypes[FType.datetime],
                             ftypes[FType.text], ftypes[FType.unique])
    dfe.target = target
    return dfe
def __init__(self, df_or_dfe, insights):
    """Store the data (wrapping a raw DataFrame if needed), the insights,
    and the fixed order in which insight tags are applied."""
    is_extension = isinstance(df_or_dfe, DataFrameExtension)
    self.dfe = df_or_dfe if is_extension else DataFrameExtension(df_or_dfe)
    self.insights = insights
    self._tag_order = [
        InsightIndex.COLUMN_CHECK_TAG,
        InsightIndex.ROW_CHECK_TAG,
        InsightIndex.PREPROCESSING,
        InsightIndex.FEATURE_AUGMENTATION,
        InsightIndex.LABEL_FORMAT,
        InsightIndex.FEATURE_SELECTION,
        InsightIndex.MODEL_SELECTION,
    ]
def test_transform(self):
    """The transformer from DatetimeToCategoricalInsight should emit only
    the selected derived columns with the expected values."""
    d = {
        "datetime": [
            datetime(2010, 1, 1),
            datetime(2015, 6, 30),
            datetime(2020, 3, 9)
        ],
        "datetime2": [
            datetime(2010, 1, 3),
            datetime(2015, 6, 15),
            datetime(2020, 3, 20)
        ]
    }
    df = pd.DataFrame(d)
    dfe = DataFrameExtension(df)
    di = DatetimeToCategoricalInsight()
    di.adopt(dfe)
    tf = di.get_transformer(dfe)
    tf.model_features = ["datetime_month", "datetime2_day"]
    transformed = tf.transform(pd.DataFrame(d))
    self.assertEqual(len(transformed.columns), 2)
    # Bug fix: the original used assertTrue(x, msg) with an uncalled
    # ".tolist" attribute, so the checks always passed. Assert the actual
    # values with assertEqual instead.
    self.assertEqual(transformed.columns.tolist(),
                     ["datetime_month", "datetime2_day"])
    self.assertEqual(dfe.df["datetime_month"].tolist(), [1, 6, 3])
    self.assertEqual(dfe.df["datetime2_day"].tolist(), [3, 15, 20])
def test_insight(self):
    """CategoricalToDummyInsight should expand each categorical column
    into one dummy column per value."""
    data = {
        "category1": pd.Series(["a", "b", "c", "b", "c", "a", "a", "b"]),
        "category2": pd.Series(["z", "z", "x", "y", "z", "z", "z", "x"]),
        "numericals": pd.Series([1, 2, 3, 2, 1, 2, 2, 1]),
    }
    category_columns = ["category1", "category2"]
    dfe = DataFrameExtension(pd.DataFrame(data),
                             categoricals=category_columns)
    insight = CategoricalToDummyInsight()
    targets = insight.get_insight_targets(dfe)
    self.assertEqual(len(category_columns), len(targets))
    insight.adopt(dfe)
    # The untouched numerical column should come first after expansion.
    self.assertEqual("numericals", dfe.df.columns[0])
    for column in category_columns:
        values = data[column].value_counts().index
        # Default dummy naming convention is "<column>_<value>".
        expected = ["{}_{}".format(column, v) for v in values]
        generated = [c for c in dfe.df.columns if c.startswith(column)]
        for name in generated:
            self.assertTrue(name in expected)
    print(dfe.ftypes)
def test_insight_classification(self):
    """ModelSelectionInsight should select a classifier scoring above zero
    on synthetic classification data."""
    X, y = make_classification(n_samples=1000, n_features=7,
                               n_informative=4, n_classes=3)
    feature_names = ["c_{}".format(i) for i in range(X.shape[1])]
    df = pd.DataFrame(np.hstack((X, y.reshape([-1, 1]))),
                      columns=feature_names + ["target"])
    dfe = DataFrameExtension(df, categoricals=["target"], target="target")
    insight = ModelSelectionInsight()
    insight.adopt(dfe)
    self.assertTrue(insight.score > 0)
    print(insight.score)
def test_insight(self):
    """RowCountInsight should cap the frame at max_count rows."""
    frame = pd.read_csv(self.FILE_NAME)
    dfe = DataFrameExtension(frame)
    insight = RowCountInsight(max_count=50)
    self.assertTrue(insight.is_applicable(dfe))
    self.assertTrue(insight.describe())
    insight.adopt(dfe)
    self.assertEqual(dfe.df.shape[0], insight.max_count)
    print(dfe.df.head(5))
def test_insight_regression(self):
    """ModelSelectionInsight should select a regressor scoring above zero
    on synthetic regression data."""
    candidates = 4
    X, y = make_regression(n_samples=1000, n_features=15,
                           n_informative=candidates, n_targets=1)
    feature_names = ["c_{}".format(i) for i in range(X.shape[1])]
    df = pd.DataFrame(np.hstack((X, y.reshape([-1, 1]))),
                      columns=feature_names + ["target"])
    dfe = DataFrameExtension(df, numericals=["target"], target="target")
    insight = ModelSelectionInsight()
    insight.adopt(dfe)
    self.assertTrue(insight.score > 0)
    print(insight.score)
def test_adopt(self):
    """NumericalScalingInsight should standardize target columns, and its
    transformer should reproduce exactly the same scaling on fresh data."""
    dfe = DataFrameExtension.read_csv(self.FILE_NAME)
    insight = NumericalScalingInsight()
    targets = insight.get_insight_targets(dfe)
    insight.adopt(dfe)
    scaled1 = dfe.df[targets]
    # Bug fix: the original asserted "mean().sum() < eps" and
    # "std().mean() - 1 < 0.1", which pass trivially whenever the deviation
    # is negative. Compare absolute deviations instead.
    # (Also removed an unused pre-computed "scaled" local.)
    self.assertTrue(scaled1.mean().abs().sum() < 1.0e-5)
    self.assertTrue(abs(scaled1.std().mean() - 1) < 0.1)
    dfe2 = DataFrameExtension.read_csv(self.FILE_NAME)
    transformer = insight.get_transformer(dfe2)
    scaled2 = transformer.transform(dfe2.df)[targets]
    self.assertTrue(scaled2.mean().abs().sum() < 1.0e-5)
    self.assertTrue(abs(scaled2.std().mean() - 1) < 0.1)
    # Both paths must yield exactly the same values column by column.
    for c in scaled1.columns:
        self.assertEqual(0, (scaled1[c] != scaled2[c]).sum())
def test_insight(self):
    """NAFrequencyCheckInsight should drop columns whose NA ratio is too
    high, keeping the other two."""
    frame = pd.DataFrame({
        "category": pd.Series(["a", "b", "c", "b", "c", "a", "a", "b"]),
        "with_50_na": pd.Series(["a", None, "c", None, None, None, "a", "b"]),
        "with_20_na": pd.Series(["a", "b", "c", "b", None, "a", None, "b"]),
    })
    dfe = DataFrameExtension(frame)
    insight = NAFrequencyCheckInsight()
    self.assertTrue(insight.is_applicable(dfe))
    insight.init_description()
    insight.adopt(dfe)
    self.assertEqual(len(dfe.df.columns), 2)
def test_insight_regression(self):
    """FeatureSelectionInsight should keep roughly the informative features
    for a regression target."""
    candidates = 4
    X, y = make_regression(n_samples=1000, n_features=15,
                           n_informative=candidates, n_targets=1)
    feature_names = ["c_{}".format(i) for i in range(X.shape[1])]
    df = pd.DataFrame(np.hstack((X, y.reshape([-1, 1]))),
                      columns=feature_names + ["target"])
    dfe = DataFrameExtension(df, numericals=["target"], target="target")
    insight = FeatureSelectionInsight()
    insight.adopt(dfe)
    print("selected regressor features {}".format(dfe.ftypes.keys()))
    # -1 accounts for the target's own ftype entry.
    self.assertTrue(candidates <= len(dfe.ftypes) - 1 < candidates * 2)
def test_adopt_categorical(self):
    """LabelFormatInsight should encode a categorical target and invert
    the encoding back to the original labels."""
    d = {
        "numerical": [0, 1, 2, 3],
        "categorical": ["a", "b", "c", "a"]
    }
    df = pd.DataFrame(d)
    # Bug fix: ("categorical") is a plain string, not a tuple (missing
    # trailing comma) — pass a real list of column names.
    dfe = DataFrameExtension(df, categoricals=["categorical"],
                             target="categorical")
    li = LabelFormatInsight()
    ts = li.get_insight_targets(dfe)
    self.assertEqual("categorical", ts[0])
    result = li.adopt(dfe)
    self.assertTrue(result)
    transformer = li.get_transformer(dfe)
    inv = transformer.inverse_transform(dfe.df["categorical"])
    self.assertEqual(inv.tolist(), d["categorical"])
def test_adopt_numerical(self):
    """LabelFormatInsight should scale a numerical target and invert the
    scaling back to the original values."""
    d = {
        "numerical": [0, 1, 2, 3],
        "categorical": ["a", "b", "c", "a"]
    }
    df = pd.DataFrame(d)
    # Bug fix: ("categorical") is a plain string, not a tuple (missing
    # trailing comma) — pass a real list of column names.
    dfe = DataFrameExtension(df, categoricals=["categorical"],
                             target="numerical")
    li = LabelFormatInsight()
    ts = li.get_insight_targets(dfe)
    self.assertEqual("numerical", ts[0])
    result = li.adopt(dfe)
    self.assertTrue(result)
    transformer = li.get_transformer(dfe)
    inv = transformer.inverse_transform(np.array(dfe.df["numerical"]))
    # Bug fix: a signed sum of differences can cancel out or be negative,
    # making "diff < 1e-10" pass trivially; use the absolute error sum.
    diff = np.abs(inv.flatten() - np.array(d["numerical"])).sum()
    self.assertTrue(diff < 1e-10)
def test_get_transformer(self):
    """The dummy transformer should honor dummy columns that were dropped
    after adoption (e.g. by feature selection)."""
    source = {"category1": ["a", "b", "c", "d"], "category2": [1, 2, 3, 4]}
    dfe = DataFrameExtension(pd.DataFrame.from_dict(source),
                             categoricals=("category1", "category2"))
    insight = CategoricalToDummyInsight()
    insight.adopt(dfe)
    self.assertEqual(8, len(dfe.ftypes))  # every value became a dummy column
    # Simulate feature selection removing two useless dummy columns.
    dfe.df.drop(["category1_a", "category2_3"], axis=1, inplace=True)
    transformer = insight.get_transformer(dfe)
    df_t = transformer.transform(pd.DataFrame.from_dict(source))
    print(df_t.head())
    self.assertEqual(6, len(df_t.columns))
    surviving = ("category1_b", "category1_c", "category1_d",
                 "category2_1", "category2_2", "category2_4")
    for name in surviving:
        self.assertTrue(name in df_t.columns)
        # Each remaining dummy fires for exactly one row.
        self.assertEqual(1, len(df_t[df_t[name] == 1]))
def test_adopt(self):
    """DatetimeToCategoricalInsight should split a datetime column into
    month/day categorical columns with the expected values."""
    d = {
        "datetime": [
            datetime(2010, 1, 1),
            datetime(2015, 6, 30),
            datetime(2020, 3, 9)
        ]
    }
    df = pd.DataFrame(d)
    dfe = DataFrameExtension(df)
    di = DatetimeToCategoricalInsight()
    ts = di.get_insight_targets(dfe)
    self.assertEqual("datetime", ts[0])
    result = di.adopt(dfe)
    self.assertTrue(result)
    self.assertEqual(len(dfe.df.columns), 2)
    # Bug fix: the original used assertTrue(x, msg), which always passed
    # for non-empty values. Assert actual equality instead.
    self.assertEqual(dfe.ftypes["datetime_month"], FType.categorical)
    self.assertEqual(dfe.ftypes["datetime_day"], FType.categorical)
    self.assertEqual(dfe.df["datetime_month"].tolist(), [1, 6, 3])
    self.assertEqual(dfe.df["datetime_day"].tolist(), [1, 30, 9])
def test_insight_classification(self):
    """FeatureSelectionInsight should keep roughly the informative features
    for a classification target."""
    candidates = 3
    X, y = make_classification(n_samples=1000, n_features=25,
                               n_informative=candidates, n_redundant=2,
                               n_repeated=0, n_classes=5,
                               n_clusters_per_class=1, random_state=0)
    feature_names = ["c_{}".format(i) for i in range(X.shape[1])]
    df = pd.DataFrame(np.hstack((X, y.reshape([-1, 1]))),
                      columns=feature_names + ["target"])
    dfe = DataFrameExtension(df, categoricals=["target"], target="target")
    insight = FeatureSelectionInsight()
    insight.adopt(dfe)
    print("selected classifier features {}".format(dfe.ftypes.keys()))
    # -1 accounts for the target's own ftype entry.
    self.assertTrue(candidates <= len(dfe.ftypes) - 1 < candidates * 2)
def load(self, app_id, query="", fields=(), target=""):
    """Load records from a kintone app and wrap them as a DataFrameExtension.

    Args:
        app_id: kintone application id to read from.
        query: optional kintone query string prepended to the paging clauses.
        fields: optional field codes to restrict (and order) the columns.
        target: optional field code whose label becomes the prediction target.

    Returns:
        DataFrameExtension with columns typed by each field's feature type.
    """
    app = pykintone.login(self.env.domain, self.env.login_id,
                          self.env.password).app(app_id)
    fields_d = self.get_fields(app_id)
    if len(fields) > 0:
        # Restrict to the requested fields, preserving the caller's order;
        # unknown field codes are silently skipped.
        d = OrderedDict()
        for f in fields:
            if f in fields_d:
                d[f] = fields_d[f]
        fields_d = d
    q = query + " " if query else ""
    records = []
    _fields = list(fields_d.keys())
    # First page: kintone limits how many records one select may return.
    selected = app.select(query=q + "limit {}".format(self._kintone_limit),
                          fields=_fields)
    records = selected.records
    if selected.total_count > self._kintone_limit:
        # Page through the remainder with limit/offset, capped by
        # self.max_count total records.
        # NOTE(review): floor(min(max_count, total)/limit) extra pages —
        # presumably intentional rounding; verify the last partial page
        # is really meant to be skipped when total < max_count.
        repeat = np.floor(
            min(self.max_count, selected.total_count) / self._kintone_limit)
        for i in range(int(repeat)):
            selected = app.select(
                query=q + "limit {} offset {}".format(
                    self._kintone_limit, (i + 1) * self._kintone_limit),
                fields=_fields)
            if len(selected.records) > 0:
                records += selected.records
    data = []
    columns = []
    for i, r in enumerate(records):
        row = []
        if i == 0:
            # Column order is fixed by the fields present in the first
            # record; later records are assumed to share that shape.
            columns = [f for f in _fields if f in r]
        for f in columns:
            # kintone wraps each field value as {"value": ...}.
            v = r[f]["value"]
            row.append(v)
        if len(row) > 0:
            data.append(row)
    fs = [fields_d[c] for c in columns]
    # Use human-readable field labels as DataFrame column names.
    df = pd.DataFrame(np.array(data), columns=[f.label for f in fs])
    # Partition column labels by feature type for the extension wrapper.
    categoricals = [
        f.label for f in fs if f.get_feature_type() == FType.categorical
    ]
    numericals = [
        f.label for f in fs if f.get_feature_type() == FType.numerical
    ]
    datetimes = [
        f.label for f in fs if f.get_feature_type() == FType.datetime
    ]
    texts = [f.label for f in fs if f.get_feature_type() == FType.text]
    uniques = [f.label for f in fs if f.get_feature_type() == FType.unique]
    dfe = DataFrameExtension(df, categoricals, numericals, datetimes, texts,
                             uniques)
    if target:
        # target is a field code; the DataFrame column uses its label.
        dfe.target = fields_d[target].label
    return dfe
def test_inference(self):
    """Smoke test: feature types should be inferable from a raw DataFrame."""
    frame = self.make_data_frame()
    print(frame.dtypes)
    extension = DataFrameExtension(frame)
    print(extension.ftypes)
def test_insight_targets(self):
    """NumericalScalingInsight should find at least one scalable column."""
    extension = DataFrameExtension.read_csv(self.FILE_NAME)
    insight = NumericalScalingInsight()
    targets = insight.get_insight_targets(extension)
    self.assertTrue(len(targets) > 0)