Example #1
    def _separate_list_columns(self, gdf):
        lists, scalars = [], []
        for col in gdf.columns:
            if is_list_dtype(gdf[col]):
                lists.append(col)
            else:
                scalars.append(col)
        return _get_embedding_order(scalars), _get_embedding_order(lists)
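Every example on this page funnels column names through _get_embedding_order. Its implementation is not shown here; judging from the "sort the columns" comments in Examples #2 and #5, and from Example #3 where continuous columns go through a plain sorted(), a minimal stand-in could look like the following (an assumption for illustration, not NVTabular's actual code):

# Hypothetical stand-in (assumption): the helper appears to fix a
# deterministic column order so tensor columns line up with their
# embedding tables.
def _get_embedding_order(cat_names):
    return sorted(cat_names)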
Example #2
    def __init__(
        self,
        paths_or_dataset,
        batch_size,
        label_names,
        feature_columns=None,
        cat_names=None,
        cont_names=None,
        engine=None,
        shuffle=True,
        seed_fn=None,
        buffer_size=0.1,
        device=None,
        parts_per_chunk=1,
        reader_kwargs=None,
        global_size=None,
        global_rank=None,
        drop_last=False,
        sparse_names=None,
        sparse_max=None,
        sparse_as_dense=False,
    ):
        dataset = _validate_dataset(
            paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs
        )
        cat_names, cont_names = _validate_schema(feature_columns, cat_names, cont_names)

        # sort the columns to avoid getting incorrect output
        # (https://github.com/NVIDIA/NVTabular/issues/412)
        cat_names = _get_embedding_order(cat_names)
        cont_names = _get_embedding_order(cont_names)

        device = device or 0
        DataLoader.__init__(
            self,
            dataset,
            cat_names,
            cont_names,
            label_names,
            batch_size,
            shuffle,
            seed_fn=seed_fn,
            parts_per_chunk=parts_per_chunk,
            device=device,
            global_size=global_size,
            global_rank=global_rank,
            drop_last=drop_last,
            sparse_names=sparse_names,
            sparse_max=sparse_max,
            sparse_as_dense=sparse_as_dense,
        )
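For context, here is one way this constructor might be called. The enclosing class is not shown in the snippet, so the name KerasSequenceLoader (NVTabular's TensorFlow loader, whose signature this matches) is an assumption, and the path and column names are invented:

# Hypothetical usage -- class name, path, and columns are assumptions
loader = KerasSequenceLoader(
    "data/train/*.parquet",
    batch_size=65536,
    label_names=["click"],
    cat_names=["user_id", "item_id"],
    cont_names=["price"],
    shuffle=True,
    parts_per_chunk=2,
)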
        self._map_fns = []
Example #3
def _get_final_cols(preproc):
    if "cols" not in preproc.columns_ctx["final"]:
        preproc.create_final_cols()
    cat_names = _get_embedding_order(
        preproc.columns_ctx["final"]["cols"]["categorical"])
    cont_names = sorted(preproc.columns_ctx["final"]["cols"]["continuous"])
    label_name = sorted(preproc.columns_ctx["final"]["cols"]["label"])
    return cat_names, cont_names, label_name
Example #4
def combine_tensors(cats, conts, label):
    cats_list = [cats[x] for x in _get_embedding_order(cats.keys())] if cats else None
    conts_list = [conts[x] for x in sorted(conts.keys())] if conts else None
    label_list = [label[x] for x in sorted(label.keys())] if label else None

    # Change cats, conts to dim=1 for column, dim=0 for df sub-section;
    # test the lists themselves so an empty group yields None instead of
    # raising on len(None)
    cats = torch.stack(cats_list, dim=1) if cats_list else None
    conts = torch.stack(conts_list, dim=1) if conts_list else None
    label = torch.cat(label_list, dim=0) if label_list else None
    return cats, conts, label
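A toy call sketching the resulting shapes; the column names and values are invented, and _get_embedding_order is assumed to behave like the sorted() stand-in shown under Example #1:

import torch

cats = {"user_id": torch.tensor([3, 4]), "item_id": torch.tensor([1, 2])}
conts = {"price": torch.tensor([0.5, 1.5])}
label = {"click": torch.tensor([0.0, 1.0])}

cats_t, conts_t, label_t = combine_tensors(cats, conts, label)
print(cats_t.shape)   # torch.Size([2, 2]) -- one column per categorical
print(conts_t.shape)  # torch.Size([2, 1])
print(label_t.shape)  # torch.Size([2]) -- labels are concatenated, not stacked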
Example #5
    def __init__(
        self,
        paths_or_dataset,
        batch_size,
        label_names,
        feature_columns=None,
        cat_names=None,
        cont_names=None,
        engine=None,
        shuffle=True,
        buffer_size=0.1,
        workflows=None,
        devices=None,
        parts_per_chunk=1,
        reader_kwargs=None,
    ):
        dataset = _validate_dataset(paths_or_dataset, batch_size, buffer_size,
                                    engine, reader_kwargs)
        cat_names, cont_names = _validate_schema(feature_columns, cat_names,
                                                 cont_names)

        # sort the columns to avoid getting incorrect output
        # (https://github.com/NVIDIA/NVTabular/issues/412)
        cat_names = _get_embedding_order(cat_names)
        cont_names = _get_embedding_order(cont_names)

        # TODO: figure out multi-gpu support
        assert devices is None or len(devices) == 1
        devices = devices or [0]
        DataLoader.__init__(
            self,
            dataset,
            cat_names,
            cont_names,
            label_names,
            batch_size,
            shuffle,
            parts_per_chunk=parts_per_chunk,
            workflows=workflows,
            devices=devices,
        )
Example #6
    def create_tensors(self, gdf, cat_names=None, cont_names=None, label_names=None):
        gdf_cats, gdf_conts, gdf_label = (
            gdf[_get_embedding_order(cat_names)],
            gdf[cont_names],
            gdf[label_names],
        )
        del gdf
        cats = self._to_tensor(gdf_cats, torch.long)
        conts = self._to_tensor(gdf_conts, torch.float32)
        label = self._to_tensor(gdf_label, torch.float32)
        del gdf_cats, gdf_conts, gdf_label
        return [cats, conts, label]
Example #7
def create_tensors(gdf, cat_names=None, cont_names=None, label_names=None):
    gdf_cats, gdf_conts, gdf_label = (
        gdf[_get_embedding_order(cat_names)],
        gdf[cont_names],
        gdf[label_names],
    )
    del gdf
    cats = conts = label = None
    if len(gdf_cats) > 0:
        cats = _to_tensor(gdf_cats, torch.long, to_cpu=False)
    if len(gdf_conts) > 0:
        conts = _to_tensor(gdf_conts, torch.float32, to_cpu=False)
    if len(gdf_label) > 0:
        label = _to_tensor(gdf_label, torch.float32, to_cpu=False)
    del gdf_cats, gdf_conts, gdf_label
    # fall back to None for any group that was empty, so the return
    # never touches an undefined name
    return [cats[0] if cats is not None else None,
            conts[0] if conts is not None else None,
            label[0] if label is not None else None]
Example #8
    def _create_tensors(self, gdf):
        """
        Breaks a dataframe down into the relevant
        categorical, continuous, and label tensors.
        Can be overridden.
        """
        # TODO: how will this work once we have multi-hots
        # also seems brittle to labels with mixed type
        gdf_cats, gdf_conts, gdf_label = (
            gdf[_get_embedding_order(self.cat_names)],
            gdf[self.cont_names],
            gdf[self.label_names],
        )
        del gdf
        cats = self._to_tensor(gdf_cats)
        conts = self._to_tensor(gdf_conts)
        label = self._to_tensor(gdf_label)

        del gdf_cats, gdf_conts, gdf_label
        return cats, conts, label