Exemple #1
0
    def test_dataframe(self):
        """Save the local ``titanic.csv`` to S3 and check it reads back non-empty."""
        # Read the local file and save it to S3.
        csv_path = Path(os.path.dirname(__file__), "titanic.csv")
        local_repo = LocalFileRepository(csv_path)
        uploaded = DataFrameData.load(local_repo)

        s3_repo = S3FileRepository(
            self.access_key_id, self.secret_access_key, self.s3file_url)
        uploaded.repository = s3_repo
        uploaded.save()

        # Load the file back from S3.
        downloaded = DataFrameData.load(s3_repo)
        loaded_size = len(downloaded.content)
        self.assertTrue(loaded_size > 0)
Exemple #2
0
    def main(self, ds):
        """Fill missing values of ``self.column`` with that column's median.

        Reads the "titanic" entry of ``ds`` and returns a new ``DataSet`` that
        holds only the filled column, stored under ``fillna_<column>``.

        NOTE(review): the filled values are assigned back into the frame
        obtained from ``ds``; if ``content`` is not a copy, the input data is
        modified as well -- confirm this is intended.
        """
        column = self.column
        frame = ds.get("titanic").content
        median_value = frame[column].median()
        frame[column] = frame[column].fillna(median_value)

        result = DataSet()
        result.put(f"fillna_{column}", DataFrameData(frame[column]))
        return result
Exemple #3
0
    def main(self, ds):
        """Encode the "Sex" column of the "titanic" frame as 0 (male) / 1 (female).

        Returns a new ``DataSet`` holding only the encoded column under
        "sex_to_code".
        """
        frame = ds.get("titanic").content
        # Apply the replacements in the same order: "male" first, then "female".
        for label, code in (("male", 0), ("female", 1)):
            frame.loc[frame["Sex"] == label, "Sex"] = code

        result = DataSet()
        result.put("sex_to_code", DataFrameData(frame["Sex"]))
        return result
Exemple #4
0
    def main(self, ds):
        """Load ``../titanic.csv`` (relative to this file) into a fresh ``DataSet``.

        The incoming ``ds`` is not read; the returned data set holds the loaded
        data under the key "titanic".
        """
        source_path = Path(os.path.dirname(__file__), "../titanic.csv")
        loaded = DataFrameData.load(LocalFileRepository(source_path))

        result = DataSet()
        result.put("titanic", loaded)
        return result
Exemple #5
0
    def main(self, ds):
        """Encode "Sex" as 0 (male) / 1 (female) and pass the whole frame on.

        Returns a new ``DataSet`` holding the full frame under "titanic".
        """
        frame = ds.get("titanic").content
        sex_codes = {"male": 0, "female": 1}
        # Dict order is insertion order, so "male" is replaced before "female".
        for label, code in sex_codes.items():
            frame.loc[frame["Sex"] == label, "Sex"] = code

        encoded = DataSet()
        encoded.put("titanic", DataFrameData(frame))
        return encoded
Exemple #6
0
    def main(self, ds):
        """Encode the "Embarked" column as S=0, C=1, Q=2.

        Missing values are filled with "S" before encoding. Returns a new
        ``DataSet`` holding only the encoded column under "embarked_to_code".
        """
        frame = ds.get("titanic").content
        frame["Embarked"] = frame["Embarked"].fillna("S")
        # Replace each port letter with its code, in the order S, C, Q.
        for port, code in (("S", 0), ("C", 1), ("Q", 2)):
            frame.loc[frame["Embarked"] == port, "Embarked"] = code

        result = DataSet()
        result.put("embarked_to_code", DataFrameData(frame["Embarked"]))
        return result
Exemple #7
0
    def main(self, ds):
        """Encode the "Sex" column of the "titanic" frame as 0 (male) / 1 (female).

        Returns a new ``DataSet`` holding only the encoded column under
        "sex_to_code". The method sleeps for a random 3-10 seconds before
        returning.
        """
        df = ds.get("titanic").content
        # Assign through a single .loc call: chained indexing
        # (df["Sex"][mask] = ...) may write to a temporary copy and does
        # nothing under pandas Copy-on-Write.
        df.loc[df["Sex"] == "male", "Sex"] = 0
        df.loc[df["Sex"] == "female", "Sex"] = 1

        rds = DataSet()
        rds.put("sex_to_code", DataFrameData(df["Sex"]))

        # NOTE(review): presumably simulates a long-running task -- confirm.
        time.sleep(random.randint(3, 10))
        return rds
    def table_init(self):
        """Load the local ``titanic.csv`` and save it through the ``Titanic`` model.

        The data is written via a ``SqlAlchemyModelData`` backed by a
        ``SqlAlchemyRepository`` on ``self.engine``.
        """
        csv_path = Path(os.path.dirname(__file__), "titanic.csv")
        source = DataFrameData.load(LocalFileRepository(csv_path))

        model_data = SqlAlchemyModelData(
            SqlAlchemyRepository(self.engine), Titanic)
        model_data.update_dataframe(source.content)
        model_data.save()
Exemple #9
0
    def main(self, ds):
        """Swap the raw Age/Sex/Embarked columns for their processed versions.

        Drops the three original columns from the "titanic" frame, joins the
        "fillna_Age", "sex_to_code" and "embarked_to_code" entries of ``ds``
        in that order, and returns the result under "titanic_result".
        """
        merged = ds.get("titanic").content.drop(
            columns=["Age", "Sex", "Embarked"])
        for key in ("fillna_Age", "sex_to_code", "embarked_to_code"):
            merged = merged.join(ds.get(key).content)

        result = DataSet()
        result.put("titanic_result", DataFrameData(merged))
        return result
Exemple #10
0
    def main(self, ds):
        """Encode the "Embarked" column as S=0, C=1, Q=2.

        Missing values are filled with "S" before encoding. Returns a new
        ``DataSet`` holding only the encoded column under "embarked_to_code".
        The method sleeps for a random 3-10 seconds before returning.
        """
        df = ds.get("titanic").content
        df["Embarked"] = df["Embarked"].fillna("S")
        # Assign through a single .loc call: chained indexing
        # (df["Embarked"][mask] = ...) may write to a temporary copy and does
        # nothing under pandas Copy-on-Write.
        df.loc[df["Embarked"] == "S", "Embarked"] = 0
        df.loc[df["Embarked"] == "C", "Embarked"] = 1
        df.loc[df["Embarked"] == "Q", "Embarked"] = 2

        rds = DataSet()
        rds.put("embarked_to_code", DataFrameData(df["Embarked"]))

        # NOTE(review): presumably simulates a long-running task -- confirm.
        time.sleep(random.randint(3, 10))
        return rds
Exemple #11
0
def prepare_db(engine):
    """Recreate the ``Base`` tables on ``engine`` and load the Titanic data.

    Drops and re-creates everything registered on ``Base.metadata``, then
    reads ``../titanic.csv`` (relative to this file) and saves its contents
    through a ``SqlAlchemyModelData`` for the ``Titanic`` model.
    """
    metadata = Base.metadata
    metadata.drop_all(engine)
    metadata.create_all(engine)

    csv_path = Path(os.path.dirname(__file__), "../titanic.csv")
    source = DataFrameData.load(LocalFileRepository(csv_path))

    model_data = SqlAlchemyModelData(SqlAlchemyRepository(engine), Titanic)
    model_data.update_dataframe(source.content)
    model_data.save()
Exemple #12
0
    def test_dataframe(self):
        """Load the local ``titanic.csv`` and check it can be saved to a new file."""
        repo = LocalFileRepository(
            Path(os.path.dirname(__file__)) / Path("titanic.csv"))

        data = DataFrameData.load(repo)

        # A dedicated temporary directory yields a unique, not-yet-existing
        # target path without tempfile's private _get_candidate_names(), and
        # the saved file is removed when the block exits.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = Path(tmp_dir) / "titanic_saved"
            data.repository = LocalFileRepository(tmp_path)
            data.save()

            self.assertTrue(tmp_path.exists())
            self.assertTrue(tmp_path.is_file())
Exemple #13
0
    def test_dataframe(self):
        """Save the Titanic data through a ``PandasDbRepository`` and reload it.

        The reloaded content must be present and equal to the original.
        """
        csv_path = Path(os.path.dirname(__file__), "titanic.csv")
        original = DataFrameData.load(LocalFileRepository(csv_path))

        table_repo = PandasDbRepository(self.engine, "titanic")
        DataFrameData(original.content, table_repo).save()

        reloaded = DataFrameData.load(table_repo)
        self.assertIsNotNone(reloaded.content)
        self.assertTrue(original.content.equals(reloaded.content))
Exemple #14
0
        df.loc[df["Embarked"] == "C", "Embarked"] = 1
        df.loc[df["Embarked"] == "Q", "Embarked"] = 2

        ds = DataSet()
        ds.put("titanic", DataFrameData(df))
        return ds


if __name__ == "__main__":
    # Script entry point: load the Titanic CSV, print it, run it through a
    # Graph of processing steps, then print the result.
    basicConfig(level=DEBUG)

    # Load the data set.
    ds = DataSet()
    repo = LocalFileRepository(
        Path(os.path.dirname(__file__)) / Path("../titanic.csv"))
    titanic_data = DataFrameData.load(repo)
    ds.put("titanic", titanic_data)

    # Show the data before processing.
    print("## Original data")
    print(ds.get("titanic").content)

    # Process the data with a Graph.
    # Order: fill missing Age -> encode Sex -> encode port of embarkation.
    graph = Graph()
    fill_age = graph.append(FillNaMedian("Age"))
    sex_to_code = graph.append(SexToCode(), [fill_age])
    graph.append(EmbarkedToCode(), [sex_to_code])
    ds = graph.run(ds)

    print("## Processed data")