Ejemplo n.º 1
0
 def testNumericalParameter(self):
     """A config-backed integer NumericalParameter resolves to -3.

     The parameter is bound to config section ``foo``, option ``bar``.
     NOTE(review): the config value itself is presumably injected by a
     decorator or fixture outside this view -- confirm.
     """
     config_location = {"section": "foo", "name": "bar"}
     param = luigi.NumericalParameter(
         min_value=-3,
         max_value=7,
         var_type=int,
         config_path=config_location,
     )
     resolved = _value(param)
     self.assertEqual(-3, resolved)
Ejemplo n.º 2
0
 def test_float_max_value_exclusive(self):
     """Parsing the upper bound fails when ``right_op`` is strict (``lt``)."""
     param = luigi.NumericalParameter(
         var_type=float,
         min_value=-3,
         max_value=7,
         left_op=le,
         right_op=lt,
     )
     # 7 equals max_value, and the right-hand comparison is strict.
     with self.assertRaises(ValueError):
         param.parse(7)
Ejemplo n.º 3
0
class BarTask(ShellTask):
    """Shell task that writes a ``bar_<foo_num>.txt`` file and a success marker.

    Parameters:
        foo_path: only used in the error message raised by ``run``.
        foo_num: integer in the range configured below (min 0, max 10000).
        bar_directory: directory receiving the bar file and the marker.
    """

    foo_path = luigi.Parameter()
    foo_num = luigi.NumericalParameter(
        var_type=int,
        min_value=0,
        max_value=10000,
    )
    bar_directory = luigi.Parameter()

    def output(self):
        """Return the success-marker target for this ``foo_num``."""
        marker_name = "bar_%d_success.txt" % self.foo_num
        marker_path = os.path.join(self.bar_directory, marker_name)
        return luigi.LocalTarget(marker_path)

    def run(self):
        """Echo the bar line into the bar file, then write the marker.

        Raises:
            Exception: if the shell command reports a return code above zero.
        """
        bar_path = os.path.join(self.bar_directory,
                                "bar_%d.txt" % self.foo_num)
        with AtomicFilePointer(bar_path).open() as bar_file:
            command = "echo \"%d - bar\" > %s" % (self.foo_num,
                                                  bar_file.tmp_path)
            returncode, _stdout, _stderr = self.ex(command)

        if returncode > 0:
            details = (returncode, self.foo_path, self.bar_directory)
            raise Exception("Received error code %s: %s -> %s" % details)

        # The marker is only written once the command has succeeded.
        with self.output().open('w') as out_file:
            out_file.write("1")
Ejemplo n.º 4
0
 def test_float_max_value_inclusive(self):
     """Parsing the upper bound succeeds when ``right_op`` is ``le``."""
     param = luigi.NumericalParameter(
         var_type=float,
         min_value=-3,
         max_value=7,
         left_op=le,
         right_op=le,
     )
     parsed = param.parse(7)
     self.assertEqual(7, parsed)
Ejemplo n.º 5
0
 def test_int_min_value_inclusive(self):
     """Parsing the lower bound succeeds when ``left_op`` is ``le``."""
     param = luigi.NumericalParameter(
         var_type=int,
         min_value=-3,
         max_value=7,
         left_op=le,
         right_op=lt,
     )
     parsed = param.parse(-3)
     self.assertEqual(-3, parsed)
Ejemplo n.º 6
0
class Pull_data(lu.Task):
    """Download the StreetEasy CSV for one borough and store it locally.

    Parameters:
        v: float between the configured bounds (0 and 100); it only appears
            in the output file name.
        boro: one of ``Queens``, ``Brooklyn`` or ``Manhattan``.
        prod: selects the ``prod`` output folder when set, else ``staging``.
    """

    v = lu.NumericalParameter(
        default=0.1,
        var_type=float,
        min_value=0,
        max_value=100,
    )

    boro = lu.ChoiceParameter(
        default='Queens',
        var_type=str,
        choices=['Queens', 'Brooklyn', 'Manhattan'],
    )

    prod = lu.BoolParameter()

    def output(self):
        """Return the local CSV target under ``this_folder``."""
        if self.prod:
            prod_ = "prod"
        else:
            prod_ = 'staging'
        path = f'data/{prod_}/{self.boro}/raw_{self.v}.csv'
        return lu.LocalTarget(str(this_folder / path))

    def run(self):
        """Fetch the borough CSV over HTTP and write it to the output path."""
        source = f'https://raw.githubusercontent.com/Codecademy/datasets/master/streeteasy/{self.boro.lower()}.csv'
        data = pd.read_csv(source)

        # Create parent directories only after the download succeeded.
        target = self.output()
        target.makedirs()
        data.to_csv(target.path)
class Top10(luigi.Task):
    """Summarise one day of 311 complaints into a top-N report.

    Parameters:
        date: day whose ``Collect311`` output is analysed (defaults to the
            value of ``date.today()`` evaluated when the class is defined).
        N: number of complaint types to keep per area (1..100, default 10).
    """

    date = luigi.DateParameter(default=date.today())
    N = luigi.NumericalParameter(default=10,
                                 min_value=1,
                                 max_value=100,
                                 var_type=int)

    def requires(self):
        """Depend on the raw 311 collection for the same date."""
        return Collect311(date=self.date)

    def output(self):
        """Return the CSV target holding the report."""
        return luigi.LocalTarget(f"{folder}/311/top{self.N}.csv")

    @staticmethod
    def _analize(df, date, N=10):
        """Build the list of statistic rows for ``df``.

        For the whole frame (labelled ``NYC``) and then for every group of
        ``df.groupby("borough")``, emit one row with the total number of
        complaints followed by one row per top-``N`` ``complaint_type``.

        Args:
            df: DataFrame with ``complaint_type`` and ``borough`` columns.
            date: value stored in every row's ``date`` field.
            N: how many complaint types to keep per area.

        Returns:
            A list of independent dicts with the keys ``boro``, ``date``,
            ``metric`` and ``value``.
        """

        def _area_rows(boro, frame):
            # Every row is a fresh dict: the previous implementation reused
            # one mutable dict (so the first row was overwritten later) and
            # used ``dict_["balue"]: v`` -- a bare annotation that stores
            # nothing -- so the per-complaint counts were never recorded.
            rows = [{
                "boro": boro,
                "date": date,
                "metric": "complaints",
                "value": len(frame),
            }]
            top_n = frame["complaint_type"].value_counts().nlargest(
                N).to_dict()
            for complaint, count in top_n.items():
                rows.append({
                    "boro": boro,
                    "date": date,
                    "metric": complaint,
                    "value": count,
                })
            return rows

        stats = _area_rows("NYC", df)
        for boro, group in df.groupby("borough"):
            stats.extend(_area_rows(boro, group))

        return stats

    def run(self):
        """Read the collected CSV, analyse it and write the report."""
        df = pd.read_csv(self.input().path)

        data = pd.DataFrame(self._analize(df, date=self.date,
                                          N=self.N)).set_index("date")
        data.to_csv(self.output().path)
Ejemplo n.º 8
0
class FooTask(ShellTask):
    """Shell task that echoes ``foo_num`` into ``foo_<foo_num>.txt``.

    Parameters:
        foo_directory: directory that receives the output file.
        foo_num: integer in the range configured below (min 0, max 10000).
    """

    foo_directory = luigi.Parameter()
    foo_num = luigi.NumericalParameter(
        var_type=int,
        min_value=0,
        max_value=10000,
    )

    def output(self):
        """Return the target for this task's ``foo_<foo_num>.txt`` file."""
        file_name = "foo_%d.txt" % self.foo_num
        return luigi.LocalTarget(os.path.join(self.foo_directory, file_name))

    def run(self):
        """Run the echo command against the pointer's temporary path."""
        pointer = AtomicFilePointer(self.output().path)
        with pointer.open() as foo_file:
            command = "echo %d > %s" % (self.foo_num, foo_file.tmp_path)
            self.run_command(command)
Ejemplo n.º 9
0
class FooWorkflow(luigi.WrapperTask):
    """Wrapper that runs ``FooTask`` followed by ``BarTask`` for one number.

    Parameters:
        root_path: directory under which ``foo`` and ``bar`` sub-directories
            are created.
        foo_num: integer forwarded to both tasks (min 0, max 10000).
    """

    root_path = luigi.Parameter()
    foo_num = luigi.NumericalParameter(var_type=int,
                                       min_value=0,
                                       max_value=10000)

    def requires(self):
        """Create the working directories and return the sequenced tasks."""
        foo_dir = os.path.join(self.root_path, 'foo')
        bar_dir = os.path.join(self.root_path, 'bar')

        for directory in (foo_dir, bar_dir):
            if not os.path.isdir(directory):
                os.makedirs(directory)

        # FooTask declares ``foo_directory`` and BarTask ``bar_directory``
        # without defaults; previously the directories created above were
        # never handed to them. ``foo_path`` is now the formatted name of the
        # file FooTask writes rather than the raw "foo_%s.txt" template.
        foo_path = os.path.join(foo_dir, "foo_%d.txt" % self.foo_num)
        tasks = [
            FooTask(foo_directory=foo_dir, foo_num=self.foo_num),
            BarTask(foo_path=foo_path,
                    foo_num=self.foo_num,
                    bar_directory=bar_dir)
        ]

        return util.sequence_tasks(tasks)
Ejemplo n.º 10
0
class Top10(luigi.Task):
    """Summarise 311 complaints for every day between ``start`` and ``date``.

    Parameters:
        date: last day of the range (defaults to the value of
            ``date.today()`` evaluated when the class is defined).
        start: first day of the range.
        N: number of complaint types to keep per area (1..100, default 5).
    """

    date = luigi.DateParameter(default=date.today())
    start = luigi.DateParameter(default=datetime(2019, 1, 1))
    N = luigi.NumericalParameter(default=5,
                                 min_value=1,
                                 max_value=100,
                                 var_type=int)

    def requires(self):
        """Depend on one ``Collect311`` per day, keyed by ``YYYY-MM-DD``."""
        # Every day from ``start`` to ``date``, both ends included.
        delta = self.date - self.start
        dates = [self.start + timedelta(days=d) for d in range(delta.days + 1)]
        return {d.strftime('%Y-%m-%d'): Collect311(date=(d)) for d in dates}

    def output(self):
        """Return the report CSV target and the per-date completion flag."""
        return {
            'report':
            luigi.LocalTarget(f'{folder}/311/top{self.N}.csv'),
            'flag':
            luigi.LocalTarget(
                f'{folder}/311/_flags/{self.date:%Y/%m/%d}_{self.N}.flag')
        }

    @staticmethod
    def _analize(df, date, N=10):
        """Build the list of statistic rows for ``df``.

        For the whole frame (labelled ``NYC``) and then for every group of
        ``df.groupby('borough')``, emit one row with the total number of
        complaints followed by one row per top-``N`` ``complaint_type``.

        Args:
            df: DataFrame with ``complaint_type`` and ``borough`` columns.
            date: value stored in every row's ``date`` field.
            N: how many complaint types to keep per area.

        Returns:
            A list of independent dicts with the keys ``boro``, ``date``,
            ``metric`` and ``value``.
        """

        def _area_rows(boro, frame):
            # Every row is a fresh dict: the previous implementation reused
            # one mutable dict (so the first row was overwritten later) and
            # used ``dict_['balue']: v`` -- a bare annotation that stores
            # nothing -- so the per-complaint counts were never recorded.
            rows = [{
                'boro': boro,
                'date': date,
                'metric': 'complaints',
                'value': len(frame),
            }]
            top_n = frame["complaint_type"].value_counts().nlargest(
                N).to_dict()
            for complaint, count in top_n.items():
                rows.append({
                    'boro': boro,
                    'date': date,
                    'metric': complaint,
                    'value': count,
                })
            return rows

        stats = _area_rows('NYC', df)
        for boro, group in df.groupby('borough'):
            stats.extend(_area_rows(boro, group))

        return stats

    def run(self):
        """Analyse every daily input, write the report, then the flag.

        Days whose input cannot be read or analysed are skipped (best
        effort), but the failure is logged instead of being discarded.
        """
        import logging

        logger = logging.getLogger(__name__)

        data = []
        for day, target in self.input().items():
            try:
                df = pd.read_csv(target.path)
                stats = self._analize(df, date=day, N=self.N)
            except Exception:
                # Deliberately broad: one bad day must not abort the report.
                logger.warning("Skipping 311 data for %s", day, exc_info=True)
                continue
            data.extend(stats)

        data = pd.DataFrame(data)
        print(data.columns)
        data = data.set_index('date')

        data.to_csv(self.output()['report'].path)

        # The flag is written last so that it marks a finished report.
        with self.output()['flag'].open('w') as f:
            f.write('!')

    def complete(self):
        """The task counts as complete once the flag file exists."""
        return self.output()['flag'].exists()
Ejemplo n.º 11
0
 def test_var_type_parameter_exception(self):
     """Constructing a NumericalParameter without ``var_type`` must fail."""
     expected_error = luigi.parameter.ParameterException
     with self.assertRaises(expected_error):
         luigi.NumericalParameter(min_value=-3, max_value=7)
Ejemplo n.º 12
0
 def test_endpoint_default_exclusive(self):
     """With default operators, parsing ``max_value`` itself is rejected."""
     param = luigi.NumericalParameter(
         var_type=int,
         min_value=-3,
         max_value=7,
     )
     with self.assertRaises(ValueError):
         param.parse(7)
Ejemplo n.º 13
0
 def test_defaults_start_range(self):
     """With default operators, parsing ``min_value`` itself is accepted."""
     param = luigi.NumericalParameter(
         var_type=int,
         min_value=-3,
         max_value=7,
     )
     parsed = param.parse(-3)
     self.assertEqual(-3, parsed)
Ejemplo n.º 14
0
class Train_Model(lu.Task):
    """Train an XGBoost rent regressor on the data pulled by ``Pull_data``.

    Parameters:
        v: float between the configured bounds (0 and 100); it only appears
            in the output folder name.
        boro: one of ``Queens``, ``Brooklyn`` or ``Manhattan``.
        prod: selects the ``prod`` output folder when set, else ``staging``.
    """

    v = lu.NumericalParameter(default=0.1,
                              var_type=float,
                              min_value=0,
                              max_value=100)

    boro = lu.ChoiceParameter(default='Queens',
                              var_type=str,
                              choices=['Queens', 'Brooklyn', 'Manhattan'])

    prod = lu.BoolParameter()

    def output(self):
        """Return the metrics, predictions and pickled-model targets."""
        prod_ = "prod" if self.prod else 'staging'
        path = this_folder / f'data/{prod_}/{self.boro}/model_{self.v}'

        return {
            'metrics': lu.LocalTarget(str(path / 'metrics.json')),
            'predicted': lu.LocalTarget(str(path / 'predicted.csv')),
            'model': lu.LocalTarget(str(path / 'model.pkl'))
        }

    def requires(self):
        """Depend on the raw data for the same borough and environment."""
        # NOTE(review): the data version is pinned to 0.1 rather than
        # following ``self.v`` -- confirm this is intended.
        return Pull_data(boro=self.boro, prod=self.prod, v=0.1)

    def run(self):
        """Fit the model, evaluate it on a hold-out split, persist outputs."""
        df = pd.read_csv(self.input().path)

        y = df['rent']
        X = df[[
            'bedrooms', 'bathrooms', 'size_sqft', 'min_to_subway', 'floor',
            'building_age_yrs', 'no_fee'
        ]]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            train_size=.8,
                                                            random_state=2019)

        # Named once so the model and the reported metrics cannot drift apart.
        max_depth = 10
        n_estimators = 1000

        model = XGBRegressor(random_state=2019,
                             max_depth=max_depth,
                             n_estimators=n_estimators)
        model.fit(X_train, y_train)

        pred = model.predict(X_test)
        metrics = {
            'max_depth': max_depth,
            # NOTE(review): key is misspelled; kept unchanged in case existing
            # consumers of metrics.json read it -- confirm before renaming.
            'n_extimators': n_estimators,
            'test_mae': mean_absolute_error(y_test, pred)
        }

        # ``assign`` returns a new frame, so we never write into the frame
        # derived by train_test_split (avoids SettingWithCopyWarning).
        predicted = X_test.assign(predicted=pred)

        outputs = self.output()
        outputs['predicted'].makedirs()
        predicted.to_csv(outputs['predicted'].path)

        with open(outputs['metrics'].path, 'w') as f:
            json.dump(metrics, f)

        with open(outputs['model'].path, 'wb') as f:
            pickle.dump(model, f)