Esempio n. 1
0
    def band(self):
        """Wire one TComplicatedTask per period date and expose the outputs.

        For every date yielded by ``period_dates(self.task_target_date,
        self.period)`` a ``TDataSource`` is created and fed into a
        ``TComplicatedTask``.  The per-task ``simplest_output`` values are
        published combined (``combined_output``) and as a list
        (``list_output``); the task objects themselves are published as
        ``list_tasks_output``.  Outputs of the nested pipelines are then
        assigned to ``nested``, ``nested2`` and ``nested3``.
        """
        tasks = []
        config = SomeConfig()
        for target_date in period_dates(self.task_target_date, self.period):
            data_source = TDataSource(task_target_date=target_date)
            tasks.append(
                TComplicatedTask(
                    specific_input=data_source,
                    some_param=config.some_param,
                    task_input=data_source.logs,
                    # empty_input=None,
                    task_target_date=target_date,
                )
            )

        # The outputs are collected twice on purpose: the list handed to
        # data_combine and the one stored on self stay separate objects.
        self.combined_output = data_combine(
            [task.simplest_output for task in tasks]
        )
        self.list_output = [task.simplest_output for task in tasks]
        self.list_tasks_output = list(tasks)

        first_nested = TNestedPipeline1(task_name="MyNewPipe").some_output
        second_nested = TNestedPipeline1(
            task_name="custom_task_name"
        ).some_output
        self.nested = [first_nested, second_nested]

        self.nested2 = TNestedPipeline2().some_output

        super_nested = TSuperNestedPipeline(list_parameter=self.list_param)
        self.nested3 = super_nested.some_output
def fetch_data(task_target_date, period=datetime.timedelta(days=7)):
    """Call ``fetch_wine_quality`` for each period date and combine the results.

    Args:
        task_target_date: Date passed to ``period_dates`` together with
            ``period`` to obtain the dates to fetch.
        period: Length of the period; defaults to seven days.

    Returns:
        Whatever ``data_combine`` returns for the per-date results when
        called with ``sort=True``.
    """
    per_date_results = [
        fetch_wine_quality(task_target_date=target_date)
        for target_date in period_dates(task_target_date, period)
    ]
    return data_combine(per_date_results, sort=True)
Esempio n. 3
0
 def band(self):
     """Project the raw device logs of each period date and combine them.

     For every date yielded by ``period_dates(self.task_target_date,
     self.period)`` the ``logs`` of a ``RawDeviceLog`` are passed to a
     ``DeviceLogProjection``; the resulting ``projected_logs`` are combined
     with ``data_combine`` and stored in ``self.projected``.
     """
     per_date_projections = []
     for target_date in period_dates(self.task_target_date, self.period):
         device_logs = RawDeviceLog(task_target_date=target_date).logs
         projection = DeviceLogProjection(
             raw_logs=device_logs,
             task_target_date=target_date,
         )
         per_date_projections.append(projection.projected_logs)

     self.projected = data_combine(per_date_projections)
Esempio n. 4
0
def top_artists_report(task_target_date, period=timedelta(days=2)):
    """Build the top-artists result from one ``stream`` call per period date.

    Args:
        task_target_date: Date passed to ``period_dates`` together with
            ``period`` to obtain the dates to process.
        period: Length of the period; defaults to two days.

    Returns:
        The result of ``top_n_artists`` applied to the artists aggregated
        from the combined streams.
    """
    logging.info("top_artists_report")

    # Each stream gets a distinct task name derived from its position.
    per_date_streams = []
    for position, target_date in enumerate(
        period_dates(task_target_date, period)
    ):
        per_date_streams.append(
            stream(task_name="Stream_%s" % position,
                   task_target_date=target_date)
        )

    combined_stream = data_combine(per_date_streams)
    aggregated = aggregate_artists(stream=combined_stream)
    return top_n_artists(artists=aggregated)
Esempio n. 5
0
    def band(self):
        """Fetch ids and data for each period date and publish them combined.

        For every date yielded by ``period_dates(self.task_target_date,
        self.period)`` the ids come from ``FetchIds`` and the data from
        ``FetchData`` (fed with those ids).  Results are keyed by the date
        formatted as ``%Y-%m-%d``, so a repeated date keeps only its latest
        entry.  The collected values are combined with
        ``data_combine(..., sort=True)`` into ``self.ids`` and ``self.data``.
        """
        ids_by_date = {}
        data_by_date = {}
        for target_date in period_dates(self.task_target_date, self.period):
            # Disabled alternative source, kept as in the original:
            # if self.task_env == TaskEnv.prod and not self.run_on_prod:
            #     ids = cb_data_dump_path(task_target_date=d, name="ids")
            #     data = cb_data_dump_path(task_target_date=d, name="data")
            # else:
            fetched_ids = FetchIds(
                task_target_date=target_date, period=one_day
            ).ids
            fetched_data = FetchData(
                task_target_date=target_date, ids=fetched_ids
            ).data

            date_key = target_date.strftime("%Y-%m-%d")
            ids_by_date[date_key] = fetched_ids
            data_by_date[date_key] = fetched_data

        self.ids = data_combine(ids_by_date.values(), sort=True)
        self.data = data_combine(data_by_date.values(), sort=True)
Esempio n. 6
0
def fetch_partner_data(
    task_target_date,
    selected_partners: List[str],
    period=datetime.timedelta(days=7)
) -> List[pd.DataFrame]:
    """Ingest and combine per-date data for each selected partner.

    Args:
        task_target_date: Date passed to ``period_dates`` together with
            ``period`` to obtain the dates to ingest.
        selected_partners: Partner identifiers; ``"a"``, ``"b"`` and ``"c"``
            are supported.
        period: Length of the period; defaults to seven days.

    Returns:
        One combined result per entry of ``selected_partners``, in the same
        order, each produced by ``data_combine(..., sort=True)``.

    Raises:
        ValueError: If a partner identifier is not supported.  The check is
            made before any date of that partner is ingested, so it also
            fires when the period yields no dates.
    """
    partner_data = []
    for partner in selected_partners:
        # The ingest callable depends only on the partner, so resolve it
        # once here instead of re-deciding it for every date.
        ingest = _partner_ingestor(partner)
        per_date_data = [
            ingest(task_target_date=d)
            for d in period_dates(task_target_date, period)
        ]
        partner_data.append(data_combine(per_date_data, sort=True))
    return partner_data


def _partner_ingestor(partner):
    """Return the ingest callable for *partner*; raise ValueError if unknown."""
    if partner == "a":
        return ingest_partner_a
    if partner == "b":
        return ingest_partner_b
    if partner == "c":
        return ingest_partner_c
    # ValueError subclasses Exception, so existing ``except Exception``
    # handlers keep working; the message now names the offending value.
    raise ValueError(
        "Partner not found! Got %r, expected one of 'a', 'b', 'c'" % (partner,)
    )
Esempio n. 7
0
def generate_partner_data(
    seed: pd.DataFrame = demo_data_repo.seed,
    task_target_date=datetime.datetime.now().date(),
    period=datetime.timedelta(days=7),
    bad_labels_date=datetime.datetime.strptime("2018-01-01",
                                               "%Y-%m-%d").date(),
):
    """Generate partner data for each period date, a noisy date and customers.

    The seed is passed through ``rename_columns`` and
    ``calculate_target_variable``; the prepared data then feeds
    ``generate_for_date`` once per date yielded by
    ``period_dates(task_target_date, period)`` and once more, with
    ``noise=True``, for ``bad_labels_date``.  Finally
    ``create_customer_files`` is called on the prepared data.

    NOTE(review): the defaults of ``task_target_date`` and
    ``bad_labels_date`` are evaluated once, when the function is defined,
    so the "now" default is the definition-time date rather than the
    call-time date -- confirm this is intended.

    Returns:
        A dict mapping ``str(date)`` to the ``generate_for_date`` result for
        that date (including ``bad_labels_date``), plus a ``"customers"``
        entry holding the ``create_customer_files`` result.
    """

    def partner_outputs(target_date):
        # Output locations of the three partners for one date.
        return {
            "a_out": demo_data_repo.partner_a_file(target_date),
            "b_out": demo_data_repo.partner_b_file(target_date),
            "c_out": demo_data_repo.partner_c_file(target_date),
        }

    prepared = calculate_target_variable(data=rename_columns(data=seed))

    results = {
        str(target_date): generate_for_date(
            task_target_date=target_date,
            data=prepared,
            **partner_outputs(target_date)
        )
        for target_date in period_dates(task_target_date, period)
    }

    # The bad-labels date is generated with noise and stored under its own
    # key (overwriting a period entry if the dates coincide).
    results[str(bad_labels_date)] = generate_for_date(
        task_target_date=bad_labels_date,
        data=prepared,
        noise=True,
        **partner_outputs(bad_labels_date)
    )

    results["customers"] = create_customer_files(
        seed=prepared,
        a_out=partner_data_file("customer_a.csv"),
        b_out=partner_data_file("customer_b.csv"),
    )
    return results