Example #1
        dropnacols=["dhdt_slope"],
    )

# %%
# Read in Antarctic Drainage Basin Boundaries shapefile into a GeoDataFrame
ice_boundaries: gpd.GeoDataFrame = (
    deepicedrain.catalog.measures_antarctic_boundaries.read())
drainage_basins: gpd.GeoDataFrame = ice_boundaries.query(expr="TYPE == 'GR'")

# %% [markdown]
# ## Load in ICESat-2 data (x, y, dhdt) and do initial trimming

# %%
# Read in raw x, y, dhdt_slope and referencegroundtrack data into the GPU
cudf_raw: cudf.DataFrame = cudf.read_parquet(
    filepath_or_buffer="ATLXI/df_dhdt_antarctica.parquet",
    columns=["x", "y", "dhdt_slope", "referencegroundtrack"],
)
# Filter to points with dhdt that is less than -0.2 m/yr or more than +0.2 m/yr
cudf_many = cudf_raw.loc[abs(cudf_raw.dhdt_slope) > 0.2]
print(f"Trimmed {len(cudf_raw)} -> {len(cudf_many)}")

# %%
# Clip outlier values to 3 sigma (standard deviations) from mean
_mean = cudf_many.dhdt_slope.mean()
_std = cudf_many.dhdt_slope.std()
cudf_many.dhdt_slope.clip(lower=np.float32(_mean - 3 * _std),
                          upper=np.float32(_mean + 3 * _std),
                          inplace=True)

# %% [markdown]
# ## Label ICESat-2 points according to their drainage basin
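# %%
# Hedged sketch only, not from the original notebook (which may instead use a
# GPU point-in-polygon routine): one way to label each point with its drainage
# basin is a CPU-side spatial join via geopandas, reusing the drainage_basins
# GeoDataFrame loaded earlier.
points_gdf: gpd.GeoDataFrame = gpd.GeoDataFrame(
    data=cudf_many.to_pandas(),
    geometry=gpd.points_from_xy(
        x=cudf_many.x.to_pandas(), y=cudf_many.y.to_pandas()
    ),
    crs=drainage_basins.crs,
)
labelled_points: gpd.GeoDataFrame = gpd.sjoin(
    left_df=points_gdf, right_df=drainage_basins, how="inner"
)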
Example #2
def test_parquet_reader_local_filepath():
    fname = "~/TestLocalFile.parquet"
    if not os.path.isfile(fname):
        pytest.skip("Local .parquet file is not found")

    cudf.read_parquet(fname)
Example #3
def read_dists(dist_files: Generator, pcs: cudf.DataFrame,
               ndvi) -> cudf.DataFrame:
    # Combine the per-POI distance CSVs column-wise, one distance column per file.
    dfs = [
        cudf.read_csv(file).drop_duplicates("postcode").set_index("postcode").
        rename(columns={"distance": re.split(r"_|\.", file.name)[1]})
        for file in dist_files
    ]
    dfs = cudf.concat(dfs, axis=1).reset_index().pipe(fix_postcodes)

    return (dfs.set_index("postcode").join(ndvi).join(
        pcs).reset_index().groupby("lsoa11").median())


if __name__ == "__main__":
    dist_files = list(Path(Config.OUT_DATA).glob("distances_*.csv"))

    pcs = cudf.read_parquet(Config.PROCESSED_DATA /
                            "postcodes.parquet").set_index("postcode")

    gspassive = (cudf.read_csv(Config.RAW_DATA / "ndvi" /
                               "sentinel_postcode_ndvi_20210419.csv").rename(
                                   columns={
                                       "PCDS": "postcode",
                                       "NDVI_MEDIAN": "gspassive"
                                   })[["postcode",
                                       "gspassive"]].set_index("postcode"))
    pcs[pcs["lsoa11"] == "E01019077"]
    dists = read_dists(dist_files, pcs, gspassive)
    dists.to_csv(Config.OUT_DATA / "median_dists.csv")
Example #4
def test_hugectr(
    tmpdir, client, df, dataset, output_format, engine, op_columns, num_io_threads, use_client
):
    client = client if use_client else None

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_names
    )
    processor.add_feature(
        [
            ops.FillMissing(columns=op_columns),
            ops.Clip(min_value=0, columns=op_columns),
            ops.LogOp(),
        ]
    )
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # apply the workflow and write out the dataset
    processor.apply(
        dataset,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=None,
        num_io_threads=num_io_threads,
    )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    assert len(data["file_stats"]) == (
        nfiles if not client else nfiles * len(client.cluster.workers)
    )
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"

    data_files = [
        os.path.join(outdir, filename) for filename in os.listdir(outdir) if filename.endswith(ext)
    ]

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        df_check = cudf.read_parquet(os.path.join(outdir, data_files[0]))
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
Example #5
    dfs = (cudf.concat(
        [
            cudf.read_csv(file).set_index("postcode").rename(
                columns={"distance": re.split(r"_|\.", file.name)[1]})
            for file in dist_files
        ],
        axis=1,
    ).reset_index().pipe(fix_postcodes))
    dfs = dfs.merge(uprn)

    for poi in Config.POI_LIST + ["gspassive"]:
        dfs[poi] = dfs[poi] * dfs["uprn_count"]

    dfs = dfs.groupby("lsoa11").sum()

    for poi in Config.POI_LIST + ["gspassive"]:
        dfs[poi] = dfs[poi] / dfs["uprn_count"]
    return dfs.drop("uprn_count", axis=1)


if __name__ == "__main__":
    dist_files = list(Path(Config.OUT_DATA).glob("distances_*.csv"))

    uprn = (cudf.read_parquet(Config.PROCESSED_DATA /
                              "uprn_pcs.parquet", ).rename(columns={
                                  "lsoa11cd": "lsoa11"
                              }).drop(["oa11cd"], axis=1).pipe(fix_postcodes))
    uprn = uprn[~uprn["lsoa11"].str.contains(r"^\d.*")]
    dists = read_dists(dist_files, uprn)
    dists.to_csv(Config.OUT_DATA / "weighted_mean_dists.csv")
Example #6
def parquet_reader_test(parquet_buffer):
    pdf = pd.read_parquet(parquet_buffer)
    gdf = cudf.read_parquet(parquet_buffer)

    assert_eq(gdf, pdf)
Example #7
def test_gpu_dl(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, devices):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        devices=devices,
    )

    columns = mycols_pq
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    # works with iterator alone, needs to test inside torch dataloader

    for idx, chunk in enumerate(data_itr):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk
    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = torch_dataloader.DLDataLoader(
        data_itr, collate_fn=gen_col, pin_memory=False, num_workers=0
    )
    rows = 0
    for idx, chunk in enumerate(t_dl):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])

    if os.path.exists(output_train):
        shutil.rmtree(output_train)
Example #8
            self.distances = cudf.read_csv(self.log_file).append(pc_dist)
        else:
            self.distances = pc_dist[["vertex", "distance", "idx"]]

        self.distances = (self.distances.sort_values(
            "distance").drop_duplicates("vertex").reset_index()[[
                "vertex", "distance", "idx"
            ]])
        self.distances.to_csv(self.log_file, index=False)


if __name__ == "__main__":
    print("Starting Routing!")
    print("Reading graph and postcodes.")

    edges = cudf.read_parquet(Config.OS_GRAPH / "edges.parquet")
    nodes = cudf.read_parquet(Config.OS_GRAPH / "nodes.parquet")
    postcodes = cudf.read_parquet(Config.PROCESSED_DATA / "postcodes.parquet")
    print("Finished reading nodes, edges and postcodes.")

    print(f"Starting Routing for {Config.POI_LIST}.")
    for poi in Config.POI_LIST:
        df = pd.read_parquet(Config.PROCESSED_DATA / f"{poi}.parquet")
        OUT_FILE = Config.OUT_DATA / f"distances_{poi}.csv"

        if not OUT_FILE.exists():
            routing = Routing(
                name=poi,
                edges=edges,
                nodes=nodes,
                postcodes=postcodes,
Example #9
    def load_data(self, filename = 'dataset.orc', col_labels = None, y_label = 'ArrDelayBinary'):
        """
        Load the dataset from the given filename, keeping only the columns we are
        interested in. Also generates y_label from the 'ArrDelay' column to turn this
        into a binary classification problem.

        Parameters
        ----------
        filename : string
                   the path of the dataset to be loaded

        col_labels : list of strings
                     The input columns that we are interested in. None selects all the columns

        y_label : string
                  The column to use as the prediction target.

        Returns
        ----------
        dataset : dataframe (Pandas, cudf or dask-cudf)
                  Ingested dataset in the format of a dataframe

        col_labels : list of strings
                     The input columns selected

        y_label : string
                  The generated y_label name for binary classification

        duration : float
                   The time it took to execute the function
        """
        target_filename = filename
        self.log_to_file( f'\n> Loading dataset from {target_filename}')

        with PerfTimer() as ingestion_timer:
            if 'CPU' in self.compute_type:
                # CPU Reading options
                self.log_to_file(f'\n\tCPU read')

                if self.data_type == 'ORC':
                    with open( target_filename, mode='rb') as file:
                        dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
                elif self.data_type == 'CSV':
                    dataset = pd.read_csv( target_filename, names = col_labels )
                    
                elif self.data_type == 'Parquet':
                    
                    if 'single' in self.compute_type:
                        dataset = pd.read_parquet(target_filename)
                    
                    elif 'multi' in self.compute_type:
                        self.log_to_file(f'\n\tReading using dask dataframe')
                        dataset = dask.dataframe.read_parquet(target_filename, columns = col_labels)

            elif 'GPU' in self.compute_type:
                # GPU Reading Option

                self.log_to_file(f'\n\tGPU read')
                if self.data_type == 'ORC':
                    dataset = cudf.read_orc(target_filename)

                elif self.data_type == 'CSV':
                    dataset = cudf.read_csv(target_filename, names = col_labels)

                elif self.data_type == 'Parquet':

                    if 'single' in self.compute_type:
                        dataset = cudf.read_parquet(target_filename)

                    elif 'multi' in self.compute_type:
                        self.log_to_file(f'\n\tReading using dask_cudf')
                        dataset = dask_cudf.read_parquet(target_filename, columns = col_labels)

        # cast all columns to float32
        for col in dataset.columns:
            dataset[col] = dataset[col].astype(np.float32)  # needed for random forest

        # Adding y_label column if it is not present
        if y_label not in dataset.columns:
            dataset[y_label] = 1.0 * (
                    dataset["ArrDelay"] > 10
                )

        dataset[y_label] = dataset[y_label].astype(np.int32) # Needed for cuml RF
        
        dataset = dataset.fillna(0.0) # Filling the null values. Needed for dask-cudf

        self.log_to_file(f'\n\tIngestion completed in {ingestion_timer.duration}')
        self.log_to_file(f'\n\tDataset descriptors: {dataset.shape}\n\t{dataset.dtypes}')
        return dataset, col_labels, y_label, ingestion_timer.duration
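A minimal usage sketch for the load_data method above (hypothetical: the owning class is named RFBenchmark here, and its constructor plus the compute_type and data_type attributes are assumptions for illustration; only the call signature and return values follow from the docstring):

# Hypothetical harness; RFBenchmark and the column list are illustrative only.
bench = RFBenchmark(compute_type='single-GPU', data_type='Parquet')
dataset, col_labels, y_label, seconds = bench.load_data(
    filename='dataset.parquet',
    col_labels=['Year', 'Month', 'DayofMonth', 'DepTime', 'ArrDelay'],
    y_label='ArrDelayBinary',
)
print(f'Ingested {dataset.shape[0]} rows in {seconds:.2f}s')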
Example #10
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       partitions=(),
                       **kwargs):
        if columns is not None:
            columns = [c for c in columns]
        if isinstance(index, list):
            columns += index

        if isinstance(piece, str):
            # `piece` is a file-path string
            piece = pq.ParquetDatasetPiece(piece,
                                           open_file_func=partial(fs.open,
                                                                  mode="rb"))
        else:
            # `piece` = (path, row_group, partition_keys)
            (path, row_group, partition_keys) = piece
            piece = pq.ParquetDatasetPiece(
                path,
                row_group=row_group,
                partition_keys=partition_keys,
                open_file_func=partial(fs.open, mode="rb"),
            )

        strings_to_cats = kwargs.get("strings_to_categorical", False)
        if cudf.utils.ioutils._is_local_filesystem(fs):
            df = cudf.read_parquet(
                piece.path,
                engine="cudf",
                columns=columns,
                row_group=piece.row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )
        else:
            with fs.open(piece.path, mode="rb") as f:
                df = cudf.read_parquet(
                    f,
                    engine="cudf",
                    columns=columns,
                    row_group=piece.row_group,
                    strings_to_categorical=strings_to_cats,
                    **kwargs.get("read", {}),
                )

        if index and index[0] in df.columns:
            df = df.set_index(index[0])

        if len(piece.partition_keys) > 0:
            if partitions is None:
                raise ValueError("Must pass partition sets")
            for i, (name, index2) in enumerate(piece.partition_keys):
                categories = [
                    val.as_py() for val in partitions.levels[i].dictionary
                ]
                sr = cudf.Series(index2).astype(type(index2)).repeat(len(df))
                df[name] = build_categorical_column(
                    categories=categories,
                    codes=as_column(sr._column.base_data,
                                    dtype=sr._column.dtype),
                    size=sr._column.size,
                    offset=sr._column.offset,
                    ordered=False,
                )

        return df
Example #11
    def _read_paths(
        cls,
        paths,
        fs,
        columns=None,
        row_groups=None,
        strings_to_categorical=None,
        partitions=None,
        partitioning=None,
        partition_keys=None,
        open_file_options=None,
        **kwargs,
    ):

        # Simplify row_groups if all None
        if row_groups == [None for path in paths]:
            row_groups = None

        with ExitStack() as stack:

            # Non-local filesystem handling
            paths_or_fobs = paths
            if not _is_local_filesystem(fs):
                paths_or_fobs = _open_remote_files(
                    paths_or_fobs,
                    fs,
                    context_stack=stack,
                    **_default_open_file_options(open_file_options, columns,
                                                 row_groups),
                )

            # Use cudf to read in data
            df = cudf.read_parquet(
                paths_or_fobs,
                engine="cudf",
                columns=columns,
                row_groups=row_groups if row_groups else None,
                strings_to_categorical=strings_to_categorical,
                **kwargs,
            )

        if partitions and partition_keys is None:

            # Use `HivePartitioning` by default
            partitioning = partitioning or {"obj": pa_ds.HivePartitioning}
            ds = pa_ds.dataset(
                paths,
                filesystem=fs,
                format="parquet",
                partitioning=partitioning["obj"].discover(
                    *partitioning.get("args", []),
                    **partitioning.get("kwargs", {}),
                ),
            )
            frag = next(ds.get_fragments())
            if frag:
                # Extract hive-partition keys, and make sure they
                # are ordered the same as they are in `partitions`
                raw_keys = pa_ds._get_partition_keys(frag.partition_expression)
                partition_keys = [(hive_part.name, raw_keys[hive_part.name])
                                  for hive_part in partitions]

        if partition_keys:
            if partitions is None:
                raise ValueError("Must pass partition sets")

            for i, (name, index2) in enumerate(partition_keys):

                # Build the column from `codes` directly
                # (since the category is often a larger dtype)
                codes = as_column(
                    partitions[i].keys.index(index2),
                    length=len(df),
                )
                df[name] = build_categorical_column(
                    categories=partitions[i].keys,
                    codes=codes,
                    size=codes.size,
                    offset=codes.offset,
                    ordered=False,
                )

        return df
Example #12
def test_dask_workflow_api_dlrm(
    client,
    tmpdir,
    datasets,
    freq_threshold,
    part_mem_fraction,
    engine,
    cat_cache,
    on_host,
    shuffle,
    cpu,
):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = sorted(paths)
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)
    df0 = df0.to_pandas() if cpu else df0

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    cats = cat_names >> ops.Categorify(freq_threshold=freq_threshold,
                                       out_path=str(tmpdir),
                                       cat_cache=cat_cache,
                                       on_host=on_host)

    conts = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp()

    workflow = Workflow(cats + conts + label_name, client=client)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, cpu=cpu, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths,
                          cpu=cpu,
                          names=allcols_csv,
                          part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")

    transformed = workflow.fit_transform(dataset)
    transformed.to_parquet(output_path=output_path,
                           shuffle=shuffle,
                           out_files_per_proc=1)

    result = transformed.to_ddf().compute()
    assert len(df0) == len(result)
    assert result["x"].min() == 0.0
    assert result["x"].isna().sum() == 0
    assert result["y"].min() == 0.0
    assert result["y"].isna().sum() == 0

    # Check categories.  Need to sort first to make sure we are comparing
    # "apples to apples"
    expect = df0.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    dfm = expect.merge(got, on="index",
                       how="inner")[["name-string_x", "name-string_y"]]
    dfm_gb = dfm.groupby(["name-string_x", "name-string_y"]).agg({
        "name-string_x":
        "count",
        "name-string_y":
        "count"
    })
    if freq_threshold:
        dfm_gb = dfm_gb[dfm_gb["name-string_x"] >= freq_threshold]
    assert_eq(dfm_gb["name-string_x"],
              dfm_gb["name-string_y"],
              check_names=False)

    # Read back from disk
    if cpu:
        df_disk = dd_read_parquet(output_path).compute()
    else:
        df_disk = dask_cudf.read_parquet(output_path).compute()

    # we don't have a deterministic ordering here, especially when using
    # a dask client with multiple workers - so we need to sort the values here
    columns = ["label", "x", "y", "id"] + cat_names
    got = result.sort_values(columns).reset_index(drop=True)
    expect = df_disk.sort_values(columns).reset_index(drop=True)
    assert_eq(got, expect)
Example #13
def parquet_reader_test(parquet_buffer):
    pdf = pd.read_parquet(parquet_buffer)
    gdf = cudf.read_parquet(parquet_buffer)

    compare_dataframe(gdf, pdf)
Example #14
    def ETL(self, columns=None, label_column=None, random_seed=0):
        """ Perfom ETL given a set of target dataset to prepare for model training. 
            1. Ingest parquet compressed dataset
            2. Rebalance/Re-partition [ for multi-CPU and multi-GPU ]
            3. Drop samples with missing data [ predominantly cancelled flights ]
            4. Split dataset into train and test subsets 
        """
        with PerfTimer('ETL'):
            if 'single' in self.compute_type:
                if 'CPU' in self.compute_type:
                    from sklearn.model_selection import train_test_split
                    dataset = pandas.read_parquet(self.target_files,
                                                  columns=columns,
                                                  engine='pyarrow')
                    dataset = dataset.dropna()
                    X_train, X_test, y_train, y_test = train_test_split(
                        dataset.loc[:, dataset.columns != label_column],
                        dataset[label_column],
                        random_state=random_seed)
                elif 'GPU' in self.compute_type:
                    from cuml.preprocessing.model_selection import train_test_split
                    dataset = cudf.read_parquet(self.target_files,
                                                columns=columns)
                    dataset = dataset.dropna()
                    X_train, X_test, y_train, y_test = train_test_split(
                        dataset, label_column, random_state=random_seed)

                return X_train, X_test, y_train, y_test

            elif 'multi' in self.compute_type:
                from dask_ml.model_selection import train_test_split
                if 'CPU' in self.compute_type:
                    dataset = dask.dataframe.read_parquet(self.target_files,
                                                          columns=columns,
                                                          engine='pyarrow')
                elif 'GPU' in self.compute_type:
                    dataset = dask_cudf.read_parquet(self.target_files,
                                                     columns=columns)

                # drop missing values [ ~2.5% -- predominantly cancelled flights ]
                dataset = dataset.dropna()

                # repartition [ inplace ], rebalance ratio of workers & data partitions
                initial_npartitions = dataset.npartitions
                dataset = dataset.repartition(npartitions=self.n_workers)

                # split [ always runs, regardless of whether dataset is cached ]
                train, test = train_test_split(dataset,
                                               random_state=random_seed)

                # build X [ features ], y [ labels ] for the train and test subsets
                y_train = train[label_column].astype('int32')
                X_train = train.drop(label_column, axis=1).astype('float32')

                y_test = test[label_column].astype('int32')
                X_test = test.drop(label_column, axis=1).astype('float32')

                # return [ CPU/GPU ] dask dataframes
                return X_train, X_test, y_train, y_test

        return None
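A minimal usage sketch for the ETL method above (hypothetical: the owning class name AirlineDemo, its constructor, and the column list are assumptions for illustration; only the method signature and return values follow from the code):

# Hypothetical harness; AirlineDemo and its arguments are illustrative only.
demo = AirlineDemo(compute_type='single-GPU', target_files='airline.parquet')
X_train, X_test, y_train, y_test = demo.ETL(
    columns=['Year', 'Month', 'DayofMonth', 'DepTime', 'ArrDelayBinary'],
    label_column='ArrDelayBinary',
    random_seed=42,
)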
Example #15
def diff_time(train, valid):
    gf1 = cudf.from_pandas(
        train[["timestamp", "a_user_id", "b_user_id", "tweet_id",
               "no_tweet"]]).reset_index(drop=True)
    gf2 = cudf.from_pandas(
        valid[["timestamp", "a_user_id", "b_user_id", "tweet_id",
               "no_tweet"]]).reset_index(drop=True)
    gf = cudf.concat([gf1, gf2], axis=0)
    gf = dask_cudf.from_cudf(gf, npartitions=16)
    gf["timestamp"] = gf["timestamp"].astype("int64") / 1e9
    gf_unique = gf[["timestamp", "a_user_id", "tweet_id"]].drop_duplicates()
    gf_unique.columns = ["tmp_timestamp", "tmp_a_user_id", "tmp_tweet_id"]
    gf = gf[gf["no_tweet"] != 0]
    gf = gf.drop("no_tweet", axis=1)
    gf = gf.drop("a_user_id", axis=1)
    gf = gf.merge(gf_unique,
                  how="left",
                  left_on="b_user_id",
                  right_on="tmp_a_user_id")
    gf = gf[gf["tweet_id"] != gf["tmp_tweet_id"]]
    gf = gf[~gf["tmp_a_user_id"].isna()]

    gf["diff_timestamp_prev"] = gf["timestamp"] - gf["tmp_timestamp"]
    gf["diff_timestamp_after"] = gf["tmp_timestamp"] - gf["timestamp"]

    gf["diff_timestamp_after"] = gf.diff_timestamp_after.where(
        gf["diff_timestamp_after"] > 0, 15 * 24 * 3600)
    gf["diff_timestamp_prev"] = gf.diff_timestamp_prev.where(
        gf["diff_timestamp_prev"] > 0, 15 * 24 * 3600)

    gf = (gf[[
        "tweet_id", "b_user_id", "diff_timestamp_prev", "diff_timestamp_after"
    ]].groupby(["tweet_id", "b_user_id"]).min().reset_index())

    gf.to_parquet("/tmp/gf")
    del gf
    del gf_unique
    del gf1
    del gf2
    gc.collect()

    gf = cudf.read_parquet("/tmp/gf/part.0.parquet")
    gf1 = cudf.from_pandas(train[["b_user_id",
                                  "tweet_id"]]).reset_index(drop=True)
    gf1["idx"] = gf1.index
    gf1 = gf1.merge(
        gf,
        how="left",
        left_on=["tweet_id", "b_user_id"],
        right_on=["tweet_id", "b_user_id"],
    )
    gf1 = gf1.sort_values("idx")
    train["diff_timestamp_prev"] = (gf1["diff_timestamp_prev"].fillna(
        15 * 24 * 3600).astype("int32").to_array())
    train["diff_timestamp_after"] = (gf1["diff_timestamp_after"].fillna(
        15 * 24 * 3600).astype("int32").to_array())
    del gf1
    gc.collect()

    gf1 = cudf.from_pandas(valid[["b_user_id",
                                  "tweet_id"]]).reset_index(drop=True)
    gf1["idx"] = gf1.index
    gf1 = gf1.merge(
        gf,
        how="left",
        left_on=["tweet_id", "b_user_id"],
        right_on=["tweet_id", "b_user_id"],
    )
    gf1 = gf1.sort_values("idx")
    valid["diff_timestamp_prev"] = (gf1["diff_timestamp_prev"].fillna(
        15 * 24 * 3600).astype("int32").to_array())
    valid["diff_timestamp_after"] = (gf1["diff_timestamp_after"].fillna(
        15 * 24 * 3600).astype("int32").to_array())
Example #16
def test_hugectr(tmpdir, client, df, dataset, output_format, engine,
                 op_columns, num_io_threads, use_client):
    client = client if use_client else None

    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)
    outdir = str(outdir)

    conts = nvt.ColumnGroup(cont_names) >> ops.Normalize
    cats = nvt.ColumnGroup(cat_names) >> ops.Categorify

    workflow = nvt.Workflow(conts + cats + label_names)
    transformed = workflow.fit_transform(dataset)

    if output_format == "hugectr":
        transformed.to_hugectr(
            cats=cat_names,
            conts=cont_names,
            labels=label_names,
            output_path=outdir,
            out_files_per_proc=nfiles,
            num_threads=num_io_threads,
        )
    else:
        transformed.to_parquet(
            output_path=outdir,
            out_files_per_proc=nfiles,
            num_threads=num_io_threads,
        )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    assert len(data["file_stats"]) == (
        nfiles if not client else nfiles * len(client.cluster.workers))
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"

    data_files = [
        os.path.join(outdir, filename) for filename in os.listdir(outdir)
        if filename.endswith(ext)
    ]

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        df_check = cudf.read_parquet(os.path.join(outdir, data_files[0]))
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
Example #17
import cudf as dd
from feature_engineering_2 import (
    pos_cash, process_unified, process_bureau_and_balance, 
    process_previous_applications, installments_payments,
    credit_card_balance
    )

# Initialise memory management:
# this allows allocations to spill out of GPU RAM into host memory
dd.set_allocator("managed")

### Load datasets
print("loading data")
bureau_balance = dd.read_parquet('raw_data/bureau_balance.parquet')
bureau = dd.read_parquet('raw_data/bureau.parquet')
cc_balance = dd.read_parquet('raw_data/cc_balance.parquet')
payments = dd.read_parquet('raw_data/payments.parquet')
pc_balance = dd.read_parquet('raw_data/pc_balance.parquet')
prev = dd.read_parquet('raw_data/prev.parquet')
train = dd.read_parquet('raw_data/train.parquet')
test = dd.read_parquet('raw_data/test.parquet')

train_index = train.index
test_index = test.index

train_target = train['TARGET']
unified = dd.concat([train.drop('TARGET', axis=1), test])
print("starting processing")

unified_feat = process_unified(unified, dd)
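A hedged aside on the dd.set_allocator("managed") call above: newer RAPIDS releases typically enable the same spill-to-host behaviour through RMM before the first cudf allocation, roughly as sketched below (not part of the original script):

import cudf
import rmm

# Switch to CUDA managed (unified) memory so GPU allocations can be
# oversubscribed and spill to host RAM; must run before any cudf allocation.
rmm.reinitialize(managed_memory=True)

train = cudf.read_parquet("raw_data/train.parquet")  # path taken from the script above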
Example #18
        "buffer": distances["dist"].values
    })
    buffers = buffers.sort_values("buffer",
                                  ascending=False).drop_duplicates("node_id")
    buffers["buffer"] = buffers["buffer"].astype("int")

    # this will drop rows that did not appear in the KNN, i.e. unneeded POIs
    # BUG: Unsure this works, bluespace retains ~140,000 points
    return poi_nn.merge(buffers, on="node_id", how="left").dropna()


if __name__ == "__main__":
    print("Starting routing data processing...")
    print("Reading and cleaning data...")

    nodes = cudf.read_parquet(Config.OS_GRAPH / "nodes.parquet")

    pcs: cudf.DataFrame = cudf.read_parquet(
        Config.PROCESSED_DATA / "postcodes.parquet").set_index("postcode")
    retail: cudf.DataFrame = clean_retail(path=Config.RAW_DATA /
                                          "LDC_Secure_Snapshot_2020_01.csv",
                                          postcodes=pcs)
    fast_food: cudf.DataFrame = clean_fast_food(retail=retail)
    gambling: cudf.DataFrame = clean_gambling(retail=retail)
    offlicences: cudf.DataFrame = clean_offlicences(retail=retail)
    pubs: cudf.DataFrame = clean_pubs(retail=retail)
    tobacconists: cudf.DataFrame = clean_tobacconists(retail=retail)
    leisure: cudf.DataFrame = clean_leisure(retail=retail)

    print("Finding nearest node to postcodes...")
    postcodes = nearest_nodes(df=pcs.reset_index(), nodes=nodes)
Example #19
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    assert len(data["file_stats"]) == nfiles
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        df_check = cudf.read_parquet(os.path.join(outdir, "0.parquet"))
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
Example #20
        "buffer": distances["dist"].values
    })
    buffers = buffers.sort_values("buffer",
                                  ascending=False).drop_duplicates("node_id")
    buffers["buffer"] = buffers["buffer"].astype("int")

    # this will drop rows that did not appear in the KNN, i.e. unneeded POIs
    return (poi_nn.merge(buffers, on="node_id",
                         how="left").dropna().drop_duplicates("node_id"))


if __name__ == "__main__":
    logger.info("Starting routing data processing...")
    logger.debug("Reading and cleaning data...")

    nodes: cudf.DataFrame = cudf.read_parquet(Config.OS_GRAPH /
                                              "nodes.parquet")
    pcs: cudf.DataFrame = clean_postcodes(path=Config.RAW_DATA / "onspd" /
                                          "ONSPD_FEB_2022_UK.csv",
                                          current=True)

    all_pcs: cudf.DataFrame = clean_postcodes(path=Config.RAW_DATA / "onspd" /
                                              "ONSPD_FEB_2022_UK.csv",
                                              current=False).drop("lsoa11",
                                                                  axis=1)
    all_pcs.reset_index().to_parquet(Config.PROCESSED_DATA / "all_pcs.parquet")

    gpp: cudf.DataFrame = clean_gpp(
        england=Config.RAW_DATA / "nhs" / "epraccur.csv",
        scotland=Config.RAW_DATA / "nhs" / "scotland" / "gpp.csv",
        postcodes=all_pcs,
    )
Example #21
    def read_partition(
        fs, piece, columns, index, categories=(), partitions=(), **kwargs
    ):
        if columns is not None:
            columns = [c for c in columns]
        if isinstance(index, list):
            columns += index

        if isinstance(piece, str):
            path = piece
            row_group = None
            partition_keys = []
        else:
            (path, row_group, partition_keys) = piece

        strings_to_cats = kwargs.get("strings_to_categorical", False)
        if cudf.utils.ioutils._is_local_filesystem(fs):
            df = cudf.read_parquet(
                path,
                engine="cudf",
                columns=columns,
                row_groups=row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )
        else:
            with fs.open(path, mode="rb") as f:
                df = cudf.read_parquet(
                    f,
                    engine="cudf",
                    columns=columns,
                    row_groups=row_group,
                    strings_to_categorical=strings_to_cats,
                    **kwargs.get("read", {}),
                )

        if index and (index[0] in df.columns):
            df = df.set_index(index[0])
        elif index is False and set(df.index.names).issubset(columns):
            # If index=False, we need to make sure all of the
            # names in `columns` are actually in `df.columns`
            df.reset_index(inplace=True)

        if partition_keys:
            if partitions is None:
                raise ValueError("Must pass partition sets")
            for i, (name, index2) in enumerate(partition_keys):
                categories = [
                    val.as_py() for val in partitions.levels[i].dictionary
                ]

                col = as_column(index2).as_frame().repeat(len(df))._data[None]
                df[name] = build_categorical_column(
                    categories=categories,
                    codes=as_column(col.base_data, dtype=col.dtype),
                    size=col.size,
                    offset=col.offset,
                    ordered=False,
                )

        return df
Example #22
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(client=client,
                         cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name)

    processor.add_preprocess(
        ops.JoinGroupby(cont_names=cont_names,
                        stats=["count", "sum", "std", "min"],
                        out_path=str(tmpdir)))
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"
                ]].drop_duplicates().sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        result[[
            "name-string", "name-string_x_min"
        ]].drop_duplicates().sort_values("name-string")["name-string_x_min"],
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[[
            "name-string", "name-string_x_std"
        ]].drop_duplicates().sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
Example #23
def test_gpu_workflow_config(tmpdir, datasets, dump, gpu_memory_frac, engine,
                             replace):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False,
                            names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False,
                            names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    # add operators with dependencies
    config["FE"]["continuous"] = [[
        ops.FillMissing(replace=replace),
        ops.LogOp()
    ]]
    config["PP"]["continuous"] = [[
        ops.LogOp(replace=replace),
        ops.Normalize()
    ]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        to_cpu=False,
    )

    data_itr = nvt.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    processor.update_stats(data_itr)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log

    concat_ops = "_FillMissing_LogOp"
    if replace:
        concat_ops = ""
    assert math.isclose(
        get_norms(df.x).mean(),
        processor.stats["means"]["x" + concat_ops],
        rel_tol=1e-1,
    )
    assert math.isclose(
        get_norms(df.y).mean(),
        processor.stats["means"]["y" + concat_ops],
        rel_tol=1e-1,
    )

    assert math.isclose(
        get_norms(df.x).std(),
        processor.stats["stds"]["x" + concat_ops],
        rel_tol=1e-1,
    )
    assert math.isclose(
        get_norms(df.y).std(),
        processor.stats["stds"]["y" + concat_ops],
        rel_tol=1e-1,
    )

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_to_string()
        cats0 = processor.stats["encoders"]["name-cat"].get_cats(
        ).values_to_string()
        # adding the None entry as a string because of move from gpu
        assert cats0 == ["None"] + cats_expected0
    cats_expected1 = df["name-string"].unique().values_to_string()
    cats1 = processor.stats["encoders"]["name-string"].get_cats(
    ).values_to_string()
    # adding the None entry as a string because of move from gpu
    assert cats1 == ["None"] + cats_expected1

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               data_itr,
                               nfiles=10,
                               shuffle=True,
                               apply_ops=True)

    data_itr_2 = nvtabular.io.GPUDatasetIterator(
        glob.glob(str(tmpdir) + "/ds_part.*.parquet"),
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
    )

    df_pp = None
    for chunk in data_itr_2:
        df_pp = cudf.concat([df_pp, chunk], axis=0) if df_pp is not None else chunk

    if engine == "parquet":
        assert df_pp["name-cat"].dtype == "int64"
    assert df_pp["name-string"].dtype == "int64"

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
    return processor.ds_exports
Example #24
def test_dask_workflow_api_dlrm(client, tmpdir, datasets, freq_threshold,
                                part_mem_fraction, engine, cat_cache, on_host,
                                shuffle):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(client=client,
                         cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name)

    processor.add_feature(
        [ops.FillMissing(),
         ops.Clip(min_value=0),
         ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            freq_threshold=freq_threshold,
            out_path=str(tmpdir),
            cat_cache=cat_cache,
            on_host=on_host,
        ))
    processor.finalize()

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths,
                          names=allcols_csv,
                          part_mem_fraction=part_mem_fraction)
    output_path = os.path.join(tmpdir, "processed")
    processor.apply(dataset, output_path=output_path, shuffle=shuffle)

    # Can still access the final ddf if we didn't shuffle
    if not shuffle:
        result = processor.get_ddf().compute()
        assert len(df0) == len(result)
        assert result["x"].min() == 0.0
        assert result["x"].isna().sum() == 0
        assert result["y"].min() == 0.0
        assert result["y"].isna().sum() == 0

        # Check category counts
        cat_expect = df0.groupby("name-string").agg({
            "name-string": "count"
        }).reset_index(drop=True)
        cat_result = (result.groupby("name-string").agg({
            "name-string": "count"
        }).reset_index(drop=True))
        if freq_threshold:
            cat_expect = cat_expect[
                cat_expect["name-string"] >= freq_threshold]
            # Note that we may need to skip the 0th element in result (null mapping)
            assert_eq(
                cat_expect,
                cat_result.iloc[1:]
                if len(cat_result) > len(cat_expect) else cat_result,
                check_index=False,
            )
        else:
            assert_eq(cat_expect, cat_result)

        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        for col in df_disk:
            assert_eq(result[col], df_disk[col])

    else:
        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        assert len(df0) == len(df_disk)
Example #25
def test_parquet_reader_filenotfound(tmpdir):
    with pytest.raises(FileNotFoundError):
        cudf.read_parquet("TestMissingFile.parquet")

    with pytest.raises(FileNotFoundError):
        cudf.read_parquet(tmpdir.mkdir("cudf_parquet"))
Example #26
def _get_random_movielens_data(tmpdir, rows, dataset="movie", valid=None):
    if dataset == "movie":
        json_sample_movie = {
            "conts": {},
            "cats": {
                "genres": {
                    "dtype": None,
                    "cardinality": 50,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                    "multi_min": 2,
                    "multi_max": 4,
                    "multi_avg": 3,
                },
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
        }
        cols = datagen._get_cols_from_schema(json_sample_movie)
    if dataset == "ratings":
        json_sample_ratings = {
            "conts": {},
            "cats": {
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
                "userId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
            "labels": {
                "rating": {
                    "dtype": None,
                    "cardinality": 5
                }
            },
        }
        cols = datagen._get_cols_from_schema(json_sample_ratings)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.1)
    target_path = tmpdir
    df_gen.full_df_create(rows, cols, output=target_path)

    if dataset == "movie":
        movies_converted = cudf.read_parquet(
            os.path.join(tmpdir, "dataset_0.parquet"))
        movies_converted = movies_converted.drop_duplicates(["movieId"],
                                                            keep="first")
        movies_converted.to_parquet(
            os.path.join(tmpdir, "movies_converted.parquet"))

    elif dataset == "ratings" and not valid:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"),
                  os.path.join(tmpdir, "train.parquet"))
    else:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"),
                  os.path.join(tmpdir, "valid.parquet"))
Example #27
def test_parquet_reader_filepath_or_buffer(parquet_path_or_buf, src):
    expect = pd.read_parquet(parquet_path_or_buf("filepath"))
    got = cudf.read_parquet(parquet_path_or_buf(src))

    assert_eq(expect, got)
Example #28
def test_parquet_write_bytes_io(simple_gdf):
    output = BytesIO()
    simple_gdf.to_parquet(output)
    assert_eq(cudf.read_parquet(output), simple_gdf)