def test_dataframe(self):
    # Read the local file and save it to S3
    repo = LocalFileRepository(
        Path(os.path.dirname(__file__)) / Path("titanic.csv"))
    data = DataFrameData.load(repo)
    repo_s3 = S3FileRepository(self.access_key_id,
                               self.secret_access_key,
                               self.s3file_url)
    data.repository = repo_s3
    data.save()

    # Read the file back from S3
    data2 = DataFrameData.load(repo_s3)
    self.assertTrue(len(data2.content) > 0)
def main(self, ds): df = ds.get("titanic").content df[self.column] = df[self.column].fillna(df[self.column].median()) rds = DataSet() rds.put(f"fillna_{self.column}", DataFrameData(df[self.column])) return rds
def main(self, ds): df = ds.get("titanic").content df.loc[df["Sex"] == "male", "Sex"] = 0 df.loc[df["Sex"] == "female", "Sex"] = 1 rds = DataSet() rds.put("sex_to_code", DataFrameData(df["Sex"])) return rds
def main(self, ds):
    repo = LocalFileRepository(
        Path(os.path.dirname(__file__)) / Path("../titanic.csv"))
    titanic_data = DataFrameData.load(repo)
    ds = DataSet()
    ds.put("titanic", titanic_data)
    return ds
def main(self, ds): df = ds.get("titanic").content df.loc[df["Sex"] == "male", "Sex"] = 0 df.loc[df["Sex"] == "female", "Sex"] = 1 ds = DataSet() ds.put("titanic", DataFrameData(df)) return ds
def main(self, ds): df = ds.get("titanic").content df["Embarked"] = df["Embarked"].fillna("S") df.loc[df["Embarked"] == "S", "Embarked"] = 0 df.loc[df["Embarked"] == "C", "Embarked"] = 1 df.loc[df["Embarked"] == "Q", "Embarked"] = 2 rds = DataSet() rds.put("embarked_to_code", DataFrameData(df["Embarked"])) return rds
def main(self, ds): df = ds.get("titanic").content df["Sex"][df["Sex"] == "male"] = 0 df["Sex"][df["Sex"] == "female"] = 1 rds = DataSet() rds.put("sex_to_code", DataFrameData(df["Sex"])) time.sleep(random.randint(3, 10)) return rds
def table_init(self):
    repo = LocalFileRepository(
        Path(os.path.dirname(__file__)) / Path("titanic.csv"))
    data = DataFrameData.load(repo)
    repo_s = SqlAlchemyRepository(self.engine)
    md = SqlAlchemyModelData(repo_s, Titanic)
    md.update_dataframe(data.content)
    md.save()
def main(self, ds): df = ds.get("titanic").content df = df.drop(["Age", "Sex", "Embarked"], axis=1) df = df.join(ds.get("fillna_Age").content) df = df.join(ds.get("sex_to_code").content) df = df.join(ds.get("embarked_to_code").content) rds = DataSet() rds.put("titanic_result", DataFrameData(df)) return rds
def main(self, ds): df = ds.get("titanic").content df["Embarked"] = df["Embarked"].fillna("S") df["Embarked"][df["Embarked"] == "S"] = 0 df["Embarked"][df["Embarked"] == "C"] = 1 df["Embarked"][df["Embarked"] == "Q"] = 2 rds = DataSet() rds.put("embarked_to_code", DataFrameData(df["Embarked"])) time.sleep(random.randint(3, 10)) return rds
def prepare_db(engine):
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)
    repo = LocalFileRepository(
        Path(os.path.dirname(__file__)) / Path("../titanic.csv"))
    titanic_data = DataFrameData.load(repo)
    repo_s = SqlAlchemyRepository(engine)
    md = SqlAlchemyModelData(repo_s, Titanic)
    md.update_dataframe(titanic_data.content)
    md.save()
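The Titanic model and Base referenced here are defined elsewhere in the project. As a rough sketch (assuming SQLAlchemy 1.4+), a declarative model along these lines would satisfy this code; the column set is an assumption based on the CSV columns used in this section.

from sqlalchemy import Column, Float, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Titanic(Base):
    __tablename__ = "titanic"

    # Hypothetical columns; the real model lives elsewhere in the project
    PassengerId = Column(Integer, primary_key=True)
    Survived = Column(Integer)
    Pclass = Column(Integer)
    Sex = Column(String)
    Age = Column(Float)
    Embarked = Column(String)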
def test_dataframe(self):
    repo = LocalFileRepository(
        Path(os.path.dirname(__file__)) / Path("titanic.csv"))
    data = DataFrameData.load(repo)
    tmp_path = Path(tempfile.gettempdir()) / Path(
        next(tempfile._get_candidate_names()))
    repo = LocalFileRepository(tmp_path)
    data.repository = repo
    data.save()
    self.assertTrue(tmp_path.exists())
    self.assertTrue(tmp_path.is_file())
def test_dataframe(self):
    repo = LocalFileRepository(
        Path(os.path.dirname(__file__)) / Path("titanic.csv"))
    data = DataFrameData.load(repo)
    db_repo = PandasDbRepository(self.engine, "titanic")
    dfd = DataFrameData(data.content, db_repo)
    dfd.save()
    dfd2 = DataFrameData.load(db_repo)
    self.assertIsNotNone(dfd2.content)
    self.assertTrue(data.content.equals(dfd2.content))
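This round trip is essentially what pandas itself offers through to_sql / read_sql_table; a minimal sketch with a throwaway in-memory SQLite engine is shown below (that PandasDbRepository wraps something similar is an assumption, not confirmed by the source).

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("sqlite://")  # in-memory SQLite, discarded on exit

df = pd.DataFrame({"PassengerId": [1, 2], "Survived": [0, 1]})
df.to_sql("titanic", engine, if_exists="replace", index=False)

df2 = pd.read_sql_table("titanic", engine)
assert df.equals(df2)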
df.loc[df["Embarked"] == "C", "Embarked"] = 1 df.loc[df["Embarked"] == "Q", "Embarked"] = 2 ds = DataSet() ds.put("titanic", DataFrameData(df)) return ds if __name__ == "__main__": basicConfig(level=DEBUG) # データセットの読み込み ds = DataSet() repo = LocalFileRepository( Path(os.path.dirname(__file__)) / Path("../titanic.csv")) titanic_data = DataFrameData.load(repo) ds.put("titanic", titanic_data) # print("## Original data") print(ds.get("titanic").content) # Graphで処理する # Age欠損埋め -> 性別のコード化 -> 乗船した港 のコード化 の順で処理 graph = Graph() fill_age = graph.append(FillNaMedian("Age")) sex_to_code = graph.append(SexToCode(), [fill_age]) graph.append(EmbarkedToCode(), [sex_to_code]) ds = graph.run(ds) print("## Processed data")