Example 1
def table_summary(context: MLClientCtx,
                  dask_client: Union[DataItem, str],
                  dask_key: str = 'my_dask_dataframe',
                  target_path: str = '',
                  name: str = 'table_summary.csv',
                  key: str = 'table_summary') -> None:
    """Summarize a table
    
    :param context:         the function context
    :param dask_client:     path to the dask client scheduler json file, as
                            string or artifact
    :param dask_key:        key of dataframe in dask client 'datasets' attribute
    :param target_path:     destination folder for the table summary file
    :param name:            name of table summary file (with extension like .csv)
    :param key:             key of table summary in artifact store
    """
    print(context.__dict__)
    dask_client = Client(scheduler_file=str(dask_client))
    df = dask_client.get_dataset(dask_key)
    print(df.head())
    dscr = df.describe()

    filepath = os.path.join(target_path, name)
    dd.to_csv(dscr, filepath, single_file=True, index=False)
    context.log_artifact(key, target_path=filepath)
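
For context, a minimal usage sketch, assuming a running scheduler and a dataframe published under the key the function expects; the scheduler file name, sample dataframe, and paths below are illustrative, and the MLRun context is left out:

import dask.dataframe as dd
import pandas as pd
from distributed import Client

# Publish a dataframe under the key that table_summary() will look up
client = Client(scheduler_file='scheduler.json')  # hypothetical scheduler file
ddf = dd.from_pandas(pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}),
                     npartitions=1)
client.publish_dataset(my_dask_dataframe=ddf)

# table_summary(context, 'scheduler.json', dask_key='my_dask_dataframe',
#               target_path='/tmp', name='table_summary.csv')
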
Example 2
    def create_diffdiff():
        print("\ngenerating spreads vs spreads...")

        diff_diff = triu(dateless_df, True)

        ## merge abbreviations together, e.g. "XXXX (PBF) - XXXX (SSF)" to "XXXX - XXXX (PBF-SSF)"
        diff_diff.columns = [
            f'{h.split(" (")[0]}{h.split(" (")[1].split(")")[1]} ({h.split(" (")[1].split(")")[0]}-{h.split(" (")[2]}'
            for h in diff_diff.columns
        ]

        ## verbose version of the list comprehension above
        # new_headers = []
        # for header in diff_diff.columns:
        #     split_header = header.split(" (")
        #     product_A = split_header[0]
        #     split_section = split_header[1].split(")")
        #     product_B = split_section[1]
        #     new_headers.append(f"{product_A}{product_B} ({split_section[0]}-{split_header[2]}")
        # diff_diff.columns = new_headers

        diff_diff = triu(diff_diff, False)

        diff_diff = diff_diff.repartition(npartitions=200)
        diff_diff = diff_diff.reset_index(drop=True)

        dd_date_col = dd.from_array(date_col)
        dd_date_col = dd_date_col.repartition(npartitions=200)
        dd_date_col = dd_date_col.reset_index(drop=True)

        diff_diff = diff_diff.assign(date=dd_date_col)

        diff_diff = dd.melt(
            diff_diff,
            id_vars="date",
            var_name="product_diff",
            value_name="price_diff").dropna().reset_index(drop=True)

        diff_diff["product_diff"] = diff_diff["product_diff"].astype(
            "category")

        diff_diff["differential_A"] = diff_diff["product_diff"].str.partition(
            " - ")[0]
        diff_diff["differential_B"] = diff_diff["product_diff"].str.partition(
            " - ")[2]

        print(f"\nsaving file... ({round((time.time() - starttime), 2)}s)")
        dd.to_csv(df=diff_diff,
                  filename=os.path.join(os.getcwd(), "cleaned_data",
                                        "diff_diff.csv"),
                  index=False,
                  single_file=True,
                  encoding="utf-8-sig",
                  chunksize=10000)
        print(
            f"[diff_diff.csv] saved successfully... ({round((time.time() - starttime), 2)}s)"
        )
Example 3
def resample_at_path(s3_in_url,
                     s3_out_url,
                     s3_options,
                     group,
                     index_col,
                     out_file_prefix='out'):

    aggr_func: Callable
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']

    df = dd.read_parquet(path=s3_in_url,
                         storage_options=s3_options,
                         engine='fastparquet')

    # filter
    if filter_by_key == 'weekday':
        df = df.loc[df[index_col].dt.weekday == filter_by_val]

    if group['compute']:
        grouper_cols = group['by_cols']
        aggr_func = group['aggr_func']
        meta_cols = group['meta']
        cols = list(meta_cols.keys())
        print('meta_cols %s' % meta_cols)

        # resample using frequency and aggregate function specified
        df = df.groupby([pd.Grouper(key=index_col, freq=resample_freq)] + grouper_cols)[cols]. \
            apply(aggr_func, meta=meta_cols)
        # df = df.resample(resample_freq).sum()
        # print('after resampling')

    print('after grouping and resampling %s' % str(df.shape))

    # save in out bucket
    dd.to_csv(df=df,
              filename=s3_out_url,
              name_function=lambda i: out_file_prefix + '_' + str(i),
              storage_options=s3_options)

    # s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
    # dd.to_parquet(df=df,
    #              path=s3_out_url,
    #              engine='fastparquet',
    #              compute=True,
    #              compression='lz4',
    #              storage_options=s3_options)
    return
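
resample_at_path reads a module-level resample_map that is not shown in the snippet; a hypothetical configuration consistent with the lookups above would be:

# Hypothetical resample_map matching the keys accessed in resample_at_path
resample_map = {
    'filter_by': {'key': 'weekday', 'value': 5},  # e.g. keep only Saturdays (Monday=0)
    'freq': '1M',                                 # pandas offset alias used as the resample frequency
}
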
Example 4
    def persist(self, output):
        assert output is not None
        if type(output) == df.core.DataFrame:
            if self.ext == "csv":
                df.to_csv(output, self.loc, index=False, encoding="utf-8")
            elif self.ext == "json":
                df.to_json(output, self.loc, encoding="utf-8")
            else:
                raise Exception(self.ext + " not supported")
        else:
            if type(output) != db.core.Bag:
                logging.getLogger("system").warning("WARNING: converting to bag")
                assert isinstance(output, collections.abc.Iterable)
                output = db.from_sequence(output, npartitions=self.npartitions)
            output.map(json.dumps).to_textfiles(self.loc)
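
A standalone sketch of the bag fallback path at the end of persist(), assuming an iterable of JSON-serializable records; the file pattern and record contents are made up:

import json
import dask.bag as db

records = [{'id': i, 'value': i * i} for i in range(10)]
bag = db.from_sequence(records, npartitions=2)
# Writes one JSON-lines text file per partition; '*' is replaced by the partition number
bag.map(json.dumps).to_textfiles('out-*.json')
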
Example 5
    def transform(self, input_scores, calibrated_scores):
        """
        Calibrates a score

        Parameters
        ----------

           input_scores: list
              Input score files to be calibrated

           calibrated_scores: list
              Output score files

        """

        assert isinstance(input_scores, list) or isinstance(
            input_scores, tuple)
        assert isinstance(calibrated_scores, list) or isinstance(
            calibrated_scores, tuple)
        assert len(calibrated_scores) == len(input_scores)
        for file_name, output_file_name in zip(input_scores,
                                               calibrated_scores):
            # Fetching scores
            dataframe = dask.dataframe.read_csv(file_name)
            dataframe = dataframe.compute()
            X = dataframe["score"].to_numpy()

            calibrated_scores = np.vstack([
                fitter.predict_proba(X) for fitter in self._categorical_fitters
            ]).T
            calibrated_scores = self.reduction_function(calibrated_scores,
                                                        axis=1)
            dataframe["score"] = calibrated_scores

            dataframe.to_csv(output_file_name, index=False)

        return calibrated_scores
Example 6
    def _write(self, collection, path, **kwargs):
        """ This method implements CSV writing.
        If the parent directory is missing, the function creates it as well as all the ancestors directories
        first. This is to align with the behavior with ParquetTarget.
        Args:
            collection: dask dataframe, to be written to disk
            path: str, full path of the target
            **kwargs: dictionary, named arguments to be passed to to_parquet.
        Returns:
            filenames read or today list of task
        """
        if not self._exists(path):
            self.fs.mkdirs(path)
        new_path = super()._join(path, self.glob)

        return to_csv(collection, new_path, **kwargs)
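
For reference, a minimal sketch of how a glob-style path like the one produced by _join is consumed by dask's to_csv, writing one file per partition; the directory and column names are illustrative:

import os
import pandas as pd
import dask.dataframe as dd

os.makedirs('output_dir', exist_ok=True)
ddf = dd.from_pandas(pd.DataFrame({'a': range(10)}), npartitions=2)
# The '*' is replaced by the partition number: part-0.csv, part-1.csv
written = ddf.to_csv('output_dir/part-*.csv', index=False)
print(written)  # list of the file paths that were written
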
Example 7
    def _write(self, collection, path, **kwargs):
        if not self._exists(path):
            self.fs.mkdirs(path)
        full_path = super()._join(path, self.glob)

        return to_csv(collection, full_path, **kwargs)
Example 8
    def _write(cls, collection, path, **kwargs):
        return to_csv(collection, path, **kwargs)
Example 9
def perform_tsfare_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        calendar: cal.Calendar = cal.Calendar()
        for year in years:
            usecols = [
                'date', 'STATION', 'FF', 'SEN/DIS', '7-D AFAS UNL',
                '30-D AFAS/RMF UNL', 'JOINT RR TKT', '7-D UNL', '30-D UNL',
                '14-D RFM UNL', '1-D UNL', '14-D UNL', '7D-XBUS PASS', 'TCMC',
                'RF 2 TRIP', 'RR UNL NO TRADE', 'TCMC ANNUAL MC',
                'MR EZPAY EXP', 'MR EZPAY UNL', 'PATH 2-T', 'AIRTRAIN FF',
                'AIRTRAIN 30-D', 'AIRTRAIN 10-T', 'AIRTRAIN MTHLY', 'STUDENTS'
            ]
            url_part1: str = 's3://' + in_bucket + '/fares_'
            url_part2: str = ".csv"
            # urls for all saturdays in month range for year
            urls: List[str] = [
                url_part1 + year[2:] + prefix_zero(month) +
                prefix_zero(day_tuple[0]) + url_part2
                for month in range(month_st, month_end)
                for day_tuple in calendar.itermonthdays2(int(year), month)
                if day_tuple[0] in range(1, 32) and day_tuple[1] == 5
            ]

            #for url in urls:
            #    print(url)
            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=0,
                             usecols=usecols,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             parse_dates=['date'],
                             converters={
                                 'STATION': str.strip,
                                 'FF': row_ops.clean_num,
                                 'SEN/DIS': row_ops.clean_num,
                                 '7-D AFAS UNL': row_ops.clean_num,
                                 '30-D AFAS/RMF UNL': row_ops.clean_num,
                                 'JOINT RR TKT': row_ops.clean_num,
                                 '7-D UNL': row_ops.clean_num,
                                 '30-D UNL': row_ops.clean_num,
                                 '14-D RFM UNL': row_ops.clean_num,
                                 '1-D UNL': row_ops.clean_num,
                                 '14-D UNL': row_ops.clean_num,
                                 '7D-XBUS PASS': row_ops.clean_num,
                                 'TCMC': row_ops.clean_num,
                                 'RF 2 TRIP': row_ops.clean_num,
                                 'RR UNL NO TRADE': row_ops.clean_num,
                                 'TCMC ANNUAL MC': row_ops.clean_num,
                                 'MR EZPAY EXP': row_ops.clean_num,
                                 'MR EZPAY UNL': row_ops.clean_num,
                                 'PATH 2-T': row_ops.clean_num,
                                 'AIRTRAIN FF': row_ops.clean_num,
                                 'AIRTRAIN 30-D': row_ops.clean_num,
                                 'AIRTRAIN 10-T': row_ops.clean_num,
                                 'AIRTRAIN MTHLY': row_ops.clean_num,
                                 'STUDENTS': row_ops.clean_num
                             },
                             encoding='utf-8')
            #to_parquet(df=df, out_bucket=out_bucket, folder=year + '/', compute=True)
            dd.to_csv(
                df=df,
                filename='s3://' + out_bucket + '/' + year + '/',
                #name_function=lambda i: out_file_prefix + '_' + str(i),
                storage_options=s3_options)

    except Exception as err:
        raise err

    else:
        status = True
        return status
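
The commented-out name_function argument above controls how the per-partition output files are named; a small local sketch of the same idea, with a made-up directory, prefix, and data:

import os
import pandas as pd
import dask.dataframe as dd

os.makedirs('fares_out', exist_ok=True)
ddf = dd.from_pandas(pd.DataFrame({'fares': range(100)}), npartitions=4)
# name_function maps each partition index i to the name that replaces the '*'
dd.to_csv(ddf, 'fares_out/*.csv',
          name_function=lambda i: 'fares_{:02d}'.format(i),
          index=False)
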
Example 10
    def _write(cls, collection, path, storage_type, **kwargs):
        if storage_type == "parquet":
            return to_parquet(collection, path, engine="fastparquet", **kwargs)
        elif storage_type == "csv":
            path = "{}/export-*.{}".format(path, storage_type)
            return to_csv(collection, path, **kwargs)
Example 11
def perform_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    group: Dict = task_type_map['group']
    index_col: str = task_type_map['index']['col']

    aggr_func: Callable
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']

    s3_options: Dict = ps.fetch_s3_options()

    client: Client = dask.create_dask_client(num_workers=8)

    try:
        for year in years:
            s3_in_url: str = 's3://' + in_bucket + '/' + year + '/'
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/' \
                              + resample_freq + '/' + filter_by_key+str(filter_by_val) + '/'
            path: str = ''
            print('s3 url %s' % s3_in_url)
            if task_type in ['rs-gcabs', 'rs-ycabs']:
                if int(year) >= 2016:
                    path = '/special/'
                elif int(year) < 2016:
                    path = '/normal/'

            #resample_at_path(s3_in_url+path,
            #                 s3_out_url,
            #                 s3_options,
            #                 group,
            #                 index_col)

            df = dd.read_parquet(path=s3_in_url + path,
                                 storage_options=s3_options,
                                 engine='fastparquet')

            if task_type in ['rs-gcabs', 'rs-ycabs'] and int(year) == 2016:
                #resample_at_path(s3_in_url + '/normal/',
                #                 s3_out_url,
                #                 s3_options,
                #                 group,
                #                 index_col,
                #                 'out2')
                df_2 = dd.read_parquet(path=s3_in_url + '/normal/',
                                       storage_options=s3_options,
                                       engine='fastparquet')
                df = dd.concat([df, df_2], axis=0)

            partitions = df.npartitions
            if partitions < 5:
                print('repartitioning to 5')
                df = df.repartition(npartitions=5)
                client.persist(df)

            # filter
            if filter_by_key == 'weekday':
                df = df.loc[df[index_col].dt.weekday == filter_by_val]

            if group['compute']:
                grouper_cols = group['by_cols']
                aggr_func = group['aggr_func']
                meta_cols = group['meta']
                cols = [
                    col for col in meta_cols.keys()
                    if col not in grouper_cols + [index_col]
                ]
                meta_types = [
                    meta_cols[key] for key in meta_cols.keys()
                    if key not in grouper_cols + [index_col]
                ]
                print('meta_cols %s' % meta_cols)
                index = [index_col] + grouper_cols
                index_levels: List[List] = [[] for level in index]
                meta: pd.DataFrame = pd.DataFrame(columns=cols,
                                                  index=pd.MultiIndex(
                                                      index_levels,
                                                      index_levels,
                                                      names=index))

                # resample using frequency and aggregate function specified
                df = df.groupby([pd.Grouper(key=index_col, freq=resample_freq)] + grouper_cols)[cols]. \
                    apply(aggr_func, meta=meta).reset_index()
                # df = df.resample(resample_freq).sum()
                # print('after resampling')

            print('after grouping and resampling %s' % str(df.shape))

            # save in out bucket
            dd.to_csv(
                df=df,
                filename=s3_out_url,
                #name_function=lambda i: out_file_prefix + '_' + str(i),
                storage_options=s3_options)

            #dd.to_parquet(df=df,
            #              path=s3_out_url,
            #              engine='fastparquet',
            #compute=True,
            #write_index=True,
            #              compression='lz4',
            #              storage_options=s3_options)

    except Exception as err:
        print('error in perform_dask %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
Example 12
    # Use df_in.to_dask_array(lengths=True) to compute chunk sizes; otherwise df_in.values has
    # shape = (nan, 6)
    print(df_in.to_dask_array(lengths=True))
    # shape_in = (nbLine_into_File, nbColunm=6)
    shape_in = df_in.to_dask_array(lengths=True).shape

    ############################ Processing ###############################
    # Distribution of each line to launch_rk4_API function on Dask client
    futures = client.map(launch_rk4_API, df_in.to_dask_array(lengths=True))

    # Block until result, and add the column to initial input tables
    results = client.gather(futures)  # result is a list of dask array

    ########################### Write Output File ###########################
    # Create a dask array from the results (the first three values of each result)
    data = [results[i][0:3] for i in range(len(results))]

    da_output = da.concatenate(data, axis=0)
    da_output = da_output.reshape((shape_in[0], 3))

    # Convert to dataFrame
    df_out = dd.from_dask_array(da_output)

    # Write into output_file
    dd.to_csv(df_out,
              args.output_file,
              single_file=True,
              sep=" ",
              index=False,
              header=False)
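
A self-contained sketch of the same array-to-CSV path, with known chunk sizes; the shapes and column names are invented:

import dask.array as da
import dask.dataframe as dd

arr = da.random.random((100, 3), chunks=(25, 3))        # chunk sizes are known up front
ddf = dd.from_dask_array(arr, columns=['x', 'v', 't'])  # array -> dataframe
# single_file=True concatenates all partitions into one CSV
dd.to_csv(ddf, 'output.csv', single_file=True, sep=' ', index=False, header=False)
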
Example 13
        datasets["test"] = pre_proc.remove_outliers(
            data=datasets["test"],
            columns=config["COLUMNS"],
            threshold=config["HYPER_PARAMS"]["Z_THRESHOLD"])
        print("Outliers removed!")

        # Encode data
        print("\nEncoding data...")
        datasets["train"], train_encoding = pre_proc.encode_data(
            data=datasets["train"])
        datasets["test"], test_encoding = pre_proc.encode_data(
            data=datasets["test"], encoding=train_encoding, is_train=False)
        print("Data encoded!")

        # Save processed data
        dataframe.to_csv(datasets["train"].compute(num_workers=cpu_count()),
                         filename=config["PATH"]["PROCESSED"]["TRAIN"])
        dataframe.to_csv(datasets["test"].compute(num_workers=cpu_count()),
                         filename=config["PATH"]["PROCESSED"]["TEST"])

    # Split train data
    datasets["train"], datasets["validation"] = dataframe.DataFrame\
        .random_split(datasets["train"],
                      frac=[config["HYPER_PARAMS"]["TRAIN_SPLIT"],
                            1 - config["HYPER_PARAMS"]["TRAIN_SPLIT"]],
                      random_state=config["HYPER_PARAMS"]["SEED"],
                      shuffle=True)

    # Dump data to memory
    print("\nDumping data to memory...")
    datasets["train"] = datasets["train"].compute(num_workers=cpu_count())
    datasets["validation"] = datasets["validation"].compute(