Esempio n. 1
0
class MakeDatasets(Task):
    """Luigi task declaration whose output is a directory of parquet files.

    Only a constant, one parameter, the ``requires`` descriptor and the
    ``output`` declaration appear in this class body; no ``run`` method and
    no upstream ``Requirement`` are defined here.
    """

    # NOTE: despite "PERCENT" in the name, the value is a fraction (0.20,
    # i.e. 20%). It is not referenced inside this class body; presumably it is
    # the share of rows reserved for the test split -- TODO confirm with users.
    TEST_AS_PERCENT_OF_DATASET = 0.20

    # Base directory used to build the output location (defaults to "data").
    dir_path = luigi.Parameter(default="data")

    # Descriptor standing in for Luigi's requires(); presumably it gathers the
    # Requirement attributes of the class (none are declared here) -- verify
    # against the Requires implementation.
    requires = Requires()

    # Output resolves to "<dir_path>/<ClassName>/" and is wrapped in a
    # ParquetTarget constructed with glob="*.parquet".
    output = TargetOutput(
        file_pattern="{task.dir_path}/{task.__class__.__name__}/",
        target_class=ParquetTarget,
        glob="*.parquet",
    )
Esempio n. 2
0
class TransformData(Task):
    """Luigi task that runs ``transform_dataframe`` on the ExtractFeatures output.

    The result is written through a ``ParquetTarget`` located at
    ``<dir_path>/TransformData/``.
    """

    dir_path = luigi.Parameter(default="data")

    requires = Requires()
    source_data = Requirement(ExtractFeatures)

    output = TargetOutput(
        file_pattern="{task.dir_path}/{task.__class__.__name__}/",
        target_class=ParquetTarget,
        glob="*.parquet",
    )

    def run(self):
        """Read the upstream Dask dataframe, transform it and persist it."""
        upstream_target = self.input()["source_data"]
        transformed = transform_dataframe(upstream_target.read_dask())

        # The output is written with gzip compression.
        destination = self.output()
        destination.write_dask(transformed, compression="gzip")
Esempio n. 3
0
class CleanData(Task):
    """Luigi task that runs ``clean_datasets`` on the DownloadData output.

    The cleaned dataframe is written through a ``ParquetTarget`` located at
    ``<dir_path>/CleanData/``.
    """

    dir_path = luigi.Parameter(default="data")

    requires = Requires()
    source_data = Requirement(DownloadData)

    output = TargetOutput(
        file_pattern="{task.dir_path}/{task.__class__.__name__}/",
        target_class=ParquetTarget,
        glob="*.parquet",
    )

    def run(self):
        """Load the raw data as a Dask dataframe, clean it and persist it."""
        raw_ddf = self.input()["source_data"].read_dask()
        cleaned_ddf = clean_datasets(raw_ddf)

        # The output is written with gzip compression.
        self.output().write_dask(
            cleaned_ddf,
            compression="gzip",
        )
Esempio n. 4
0
class TrainModel(Task):
    """Luigi task that trains a model on the MakeTrainingSet output and pickles it.

    The pickled model is stored at ``model_path``.
    """

    dir_path = luigi.Parameter(default="data")
    model_path = luigi.Parameter(default="data/Model/model.pckl")

    requires = Requires()
    source_data = Requirement(MakeTrainingSet)

    def output(self):
        """Return the local file target that holds the pickled model."""
        return LocalTarget(self.model_path)

    def run(self):
        """Train the model and write it to the output target.

        The model is written to a temporary path that is moved to the final
        location when the ``temporary_path`` context exits without error, so
        a failed run does not leave a partial model file behind.
        """
        train_ddf = self.input()["source_data"].read_dask()

        model = train_model(train_ddf)

        target = self.output()
        target.makedirs()

        with target.temporary_path() as temp_output_path:
            # Close the handle explicitly (via ``with``) so the data is fully
            # flushed to disk before the temporary file is renamed.
            with open(temp_output_path, "wb") as model_file:
                pickle.dump(model, model_file)
Esempio n. 5
0
class VisualizePredictions(Task):
    """Luigi task that renders a figure from the test set and saved predictions.

    The figure returned by ``visualizepredictions`` is saved at
    ``prediction_visualization_path``.
    """

    dir_path = luigi.Parameter(default="data")
    prediction_visualization_path = luigi.Parameter(default="data/VisualizePredictions/predictions.png")

    requires = Requires()
    source_data_testset = Requirement(MakeTestSet)
    source_predictions = Requirement(EvaluateModel)

    def output(self):
        """Return the local file target for the rendered figure."""
        figure_target = LocalTarget(self.prediction_visualization_path)
        return figure_target

    def run(self):
        """Load the test set and predictions, build the figure and save it."""
        test_ddf = self.input()["source_data_testset"].read_dask()

        # NOTE(review): allow_pickle=True lets np.load unpickle objects; this
        # is only safe if the predictions file is trusted -- confirm.
        predictions_file = self.input()["source_predictions"].path
        y_predicted = np.load(predictions_file, allow_pickle=True)

        fig = visualizepredictions(y_predicted, test_ddf)

        # Make sure the parent directory exists before saving.
        self.output().makedirs()

        # Reference for this save pattern:
        # https://mattiacinelli.com/tutorial-on-luigi-part-3-pipeline-input-and-output/
        destination = self.output().path
        fig.savefig(destination)
Esempio n. 6
0
class EvaluateModel(Task):
    """Luigi task that evaluates the trained model on the test set.

    The values returned by ``evaluate_model`` are saved in NumPy ``.npy``
    format at ``predicted_values_path``.
    """

    dir_path = luigi.Parameter(default="data")
    predicted_values_path = luigi.Parameter(default="data/EvaluateModel/predicted.npy")

    requires = Requires()
    source_data_testset = Requirement(MakeTestSet)
    source_model = Requirement(TrainModel)

    def output(self):
        """Return the local file target that holds the saved predictions."""
        return LocalTarget(self.predicted_values_path)

    def run(self):
        """Load the model and test set, compute predictions and save them."""
        inputs = self.input()
        test_ddf = inputs["source_data_testset"].read_dask()

        # ``.path`` replaces the deprecated ``LocalTarget.fn`` alias.
        # NOTE: pickle.load can execute arbitrary code; the model file must
        # come from a trusted source (here, the upstream TrainModel task).
        with open(inputs["source_model"].path, "rb") as model_file:
            model = pickle.load(model_file)

        y_predicted = evaluate_model(model, test_ddf)

        target = self.output()
        target.makedirs()

        # Write via a temporary path so a failed run cannot leave a partial
        # file that Luigi would treat as a completed output.
        with target.temporary_path() as temp_output_path:
            # Passing an open file object (rather than a filename) stops
            # np.save from appending ".npy", so the data always lands exactly
            # at the configured path.
            with open(temp_output_path, "wb") as predictions_file:
                np.save(predictions_file, y_predicted)
Esempio n. 7
0
class VisualizeFeatureImportance(Task):
    """Luigi task that renders a figure from the trained model and the test set.

    The figure returned by ``visualizefeaturesignificance`` is saved at
    ``importance_path``.
    """

    dir_path = luigi.Parameter(default="data")
    importance_path = luigi.Parameter(default="data/VisualizeFeatureSignificance/featureimportance.png")

    requires = Requires()
    source_data_testset = Requirement(MakeTestSet)
    source_model = Requirement(TrainModel)

    def output(self):
        """Return the local file target for the rendered figure."""
        return LocalTarget(self.importance_path)

    def run(self):
        """Load the model and test set, build the figure and save it."""
        inputs = self.input()
        test_ddf = inputs["source_data_testset"].read_dask()

        # ``.path`` replaces the deprecated ``LocalTarget.fn`` alias.
        # NOTE: pickle.load can execute arbitrary code; the model file must
        # come from a trusted source (here, the upstream TrainModel task).
        with open(inputs["source_model"].path, "rb") as model_file:
            model = pickle.load(model_file)

        fig = visualizefeaturesignificance(model, test_ddf)

        target = self.output()
        target.makedirs()

        # Reference for this save pattern:
        # https://mattiacinelli.com/tutorial-on-luigi-part-3-pipeline-input-and-output/
        fig.savefig(target.path)