def main(self, ds):
    """Fill missing values of ``self.column`` with that column's median.

    Reads the ``"titanic"`` entry of ``ds`` (its ``content`` is used as a
    pandas-style DataFrame -- presumably a ``DataFrameData``; confirm against
    the caller), replaces missing values in the configured column with the
    column median, and returns a new ``DataSet`` holding only that column
    under the key ``fillna_<column>``.

    Note: the column is also overwritten on the input frame itself.
    """
    frame = ds.get("titanic").content
    target = self.column

    # Compute the median first, then overwrite the column with the filled one.
    median_value = frame[target].median()
    frame[target] = frame[target].fillna(median_value)

    result = DataSet()
    result.put(f"fillna_{target}", DataFrameData(frame[target]))
    return result
def main(self, ds):
    """Encode the ``Sex`` column as integers (male -> 0, female -> 1).

    Operates on the ``content`` of the ``"titanic"`` entry of ``ds`` and
    returns a new ``DataSet`` holding only the encoded column under the key
    ``"sex_to_code"``.  The input frame's ``Sex`` column is modified in place.
    """
    frame = ds.get("titanic").content

    # Apply the replacements one label at a time, in the original order.
    for label, code in (("male", 0), ("female", 1)):
        frame.loc[frame["Sex"] == label, "Sex"] = code

    result = DataSet()
    result.put("sex_to_code", DataFrameData(frame["Sex"]))
    return result
def main(self, ds):
    """Load ``titanic.csv`` and return it in a fresh ``DataSet``.

    The incoming ``ds`` argument is not read; the CSV is located one
    directory above this module and stored under the key ``"titanic"``.
    """
    # Resolve the CSV relative to this source file, not the working directory.
    module_dir = Path(os.path.dirname(__file__))
    csv_repo = LocalFileRepository(module_dir / Path("../titanic.csv"))

    loaded = DataSet()
    loaded.put("titanic", DataFrameData.load(csv_repo))
    return loaded
def main(self, ds):
    """Encode the ``Sex`` column as integers (male -> 0, female -> 1).

    Operates on the ``content`` of the ``"titanic"`` entry of ``ds`` and
    returns a new ``DataSet`` that carries the whole updated frame under the
    same ``"titanic"`` key, so tasks can be chained.
    """
    frame = ds.get("titanic").content

    # Apply the replacements one label at a time, in the original order.
    for label, code in (("male", 0), ("female", 1)):
        frame.loc[frame["Sex"] == label, "Sex"] = code

    result = DataSet()
    result.put("titanic", DataFrameData(frame))
    return result
def main(self, ds):
    """Encode the ``Embarked`` column as integers (S -> 0, C -> 1, Q -> 2).

    Missing values are first replaced with ``"S"``.  Operates on the
    ``content`` of the ``"titanic"`` entry of ``ds`` and returns a new
    ``DataSet`` holding only the encoded column under the key
    ``"embarked_to_code"``.  The input frame's column is modified in place.
    """
    frame = ds.get("titanic").content

    # Missing ports are treated as "S" before encoding.
    frame["Embarked"] = frame["Embarked"].fillna("S")

    # Apply the replacements one port at a time, in the original order.
    for port, code in (("S", 0), ("C", 1), ("Q", 2)):
        frame.loc[frame["Embarked"] == port, "Embarked"] = code

    result = DataSet()
    result.put("embarked_to_code", DataFrameData(frame["Embarked"]))
    return result
def main(self, ds):
    """Encode the ``Sex`` column as integers (male -> 0, female -> 1).

    Operates on the ``content`` of the ``"titanic"`` entry of ``ds`` and
    returns a new ``DataSet`` holding only the encoded column under the key
    ``"sex_to_code"``.  Before returning, the task sleeps for a random
    3-10 seconds (presumably to simulate a slow task -- confirm with the
    example's author).
    """
    df = ds.get("titanic").content

    # Use a single .loc assignment instead of chained indexing
    # (df["Sex"][mask] = ...): chained assignment raises
    # SettingWithCopyWarning and does not update the frame at all when
    # pandas Copy-on-Write is enabled.
    df.loc[df["Sex"] == "male", "Sex"] = 0
    df.loc[df["Sex"] == "female", "Sex"] = 1

    rds = DataSet()
    rds.put("sex_to_code", DataFrameData(df["Sex"]))

    time.sleep(random.randint(3, 10))
    return rds
def main(self, ds):
    """Merge the separately processed columns back into the titanic frame.

    Drops the raw ``Age``, ``Sex`` and ``Embarked`` columns from the
    ``"titanic"`` entry of ``ds`` and joins the ``"fillna_Age"``,
    ``"sex_to_code"`` and ``"embarked_to_code"`` entries in their place.
    The combined frame is returned in a new ``DataSet`` under the key
    ``"titanic_result"``.
    """
    merged = ds.get("titanic").content.drop(["Age", "Sex", "Embarked"], axis=1)

    # Join the processed columns in the same order as the raw ones were handled.
    for key in ("fillna_Age", "sex_to_code", "embarked_to_code"):
        merged = merged.join(ds.get(key).content)

    result = DataSet()
    result.put("titanic_result", DataFrameData(merged))
    return result
def main(self, ds):
    """Encode the ``Embarked`` column as integers (S -> 0, C -> 1, Q -> 2).

    Missing values are first replaced with ``"S"``.  Operates on the
    ``content`` of the ``"titanic"`` entry of ``ds`` and returns a new
    ``DataSet`` holding only the encoded column under the key
    ``"embarked_to_code"``.  Before returning, the task sleeps for a random
    3-10 seconds (presumably to simulate a slow task -- confirm with the
    example's author).
    """
    df = ds.get("titanic").content
    df["Embarked"] = df["Embarked"].fillna("S")

    # Use a single .loc assignment instead of chained indexing
    # (df["Embarked"][mask] = ...): chained assignment raises
    # SettingWithCopyWarning and does not update the frame at all when
    # pandas Copy-on-Write is enabled.
    df.loc[df["Embarked"] == "S", "Embarked"] = 0
    df.loc[df["Embarked"] == "C", "Embarked"] = 1
    df.loc[df["Embarked"] == "Q", "Embarked"] = 2

    rds = DataSet()
    rds.put("embarked_to_code", DataFrameData(df["Embarked"]))

    time.sleep(random.randint(3, 10))
    return rds
df.loc[df["Embarked"] == "Q", "Embarked"] = 2 ds = DataSet() ds.put("titanic", DataFrameData(df)) return ds if __name__ == "__main__": basicConfig(level=DEBUG) # データセットの読み込み ds = DataSet() repo = LocalFileRepository( Path(os.path.dirname(__file__)) / Path("../titanic.csv")) titanic_data = DataFrameData.load(repo) ds.put("titanic", titanic_data) # print("## Original data") print(ds.get("titanic").content) # Graphで処理する # Age欠損埋め -> 性別のコード化 -> 乗船した港 のコード化 の順で処理 graph = Graph() fill_age = graph.append(FillNaMedian("Age")) sex_to_code = graph.append(SexToCode(), [fill_age]) graph.append(EmbarkedToCode(), [sex_to_code]) ds = graph.run(ds) print("## Processed data") print(ds.get("titanic").content)
repo_s = SqlAlchemyRepository(engine) md = SqlAlchemyModelData(repo_s, Titanic) md.update_dataframe(titanic_data.content) md.save() if __name__ == "__main__": basicConfig(level=DEBUG) # データセットの読み込み・DBの準備 engine = create_engine("sqlite:///example.sqlite3", echo=True) prepare_db(engine) repo = SqlAlchemyRepository(engine) d = SqlAlchemyModelData(repo, Titanic) d.query() passenger_ids = [m.PassengerId for m in d.content] # データを一行ずつ SQLAlchemy のモデルに取り出し、処理して書き戻す例 for passenger_id in passenger_ids: d.query(lambda x: x.filter(Titanic.PassengerId == passenger_id)) ds = DataSet() ds.put("titanic", d) ds = SexToCode().main(ds) ds = EmbarkedToCode().main(ds) ds.save_all()