Example #1
def get_offsets(self, workflow, column_types):
    embeddings = get_embedding_sizes(workflow)
    if embeddings is None:
        raise Exception("embeddings cannot be None")
    else:
        offsets = dict()
        curr_offset = 0
        for name in column_types["cats"]:
            offsets[name] = curr_offset
            curr_offset += embeddings[name][0]
        return offsets
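# A minimal usage sketch (assumed, not from the original code): the offsets can
# be used to shift each categorical column's local ids into one global index
# space, e.g. before feeding a single shared embedding table:
#
#     column_types = {"cats": ["userId", "movieId"]}   # hypothetical columns
#     offsets = self.get_offsets(workflow, column_types)
#     for name in column_types["cats"]:
#         df[name] = df[name] + offsets[name]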
def create_ensemble(workflow_path, hugectr_model_path, ensemble_output_path,
                    ensemble_config_file):
    """
    Creates an ensemble of NVTabular and HugeCTR model.

    This enables preprocessing at the time of inference, allowing the
    user to send raw data directly to the inference server.
    """

    # Load the workflow object
    workflow = nvt.Workflow.load(workflow_path)

    # Verify that the workflow is loaded
    embeddings = get_embedding_sizes(workflow)
    logging.info(f"Embedding sizes for categorical features: {embeddings}")

    with open(ensemble_config_file, "r") as jsonfile:
        ensemble_config = json.load(jsonfile)

    hugectr_params = ensemble_config["hugectr_params"]

    # Override the config path in hugectr_params so that it points at this model version
    model_version = hugectr_model_path.split('/')[-2]
    logging.info(f"Model version: {model_version}")
    # e.g. "/model/models/dcn/1/dcn.json" -> ['', 'model', 'models', 'dcn', '1', 'dcn.json']
    model_json_path = hugectr_params["config"].split(os.sep)
    # Swap in the new version: [..., 'dcn', '1', 'dcn.json'] -> [..., 'dcn', '2', 'dcn.json']
    model_json_path[-2] = model_version
    # Rejoin: '/' + 'model/models/dcn/2/dcn.json'
    hugectr_params["config"] = os.sep + os.path.join(*model_json_path)

    logging.info(f"HugeCTR configs: {hugectr_params}")

    categorical_cols = ensemble_config["categorical_cols"]
    continuous_cols = ensemble_config["continuous_cols"]
    label_cols = ensemble_config["label_cols"]

    logging.info(f"Categorical Columns: {categorical_cols}")
    logging.info(f"Continuous Columns: {continuous_cols}")
    logging.info(f"Label Columns: {label_cols}")

    logging.info(
        f"Generating the ensemble at directory: {ensemble_output_path}")
    export_hugectr_ensemble(workflow=workflow,
                            hugectr_model_path=hugectr_model_path,
                            hugectr_params=hugectr_params,
                            name=ensemble_config["name"],
                            output_path=ensemble_output_path,
                            label_columns=label_cols,
                            cats=categorical_cols,
                            conts=continuous_cols,
                            max_batch_size=ensemble_config["max_batch_size"])
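# A hypothetical invocation (the paths and file names below are placeholders,
# not taken from the original example):
#
#     create_ensemble(
#         workflow_path="/model/workflow",
#         hugectr_model_path="/model/models/dcn/2/",
#         ensemble_output_path="/model/ensemble",
#         ensemble_config_file="ensemble_config.json",
#     )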
def save_model_size_config(workflow: Workflow, output_path: str):
    embeddings = {}
    for k, v in get_embedding_sizes(workflow).items():
        # Subtract one: the model expects the maximal id for each category
        embeddings[k] = v[0] - 1

    ordered_dict = OrderedDict()
    for k, v in sorted(list(embeddings.items()), key=lambda x: x[0]):
        ordered_dict[k] = v
    with open(os.path.join(output_path, "model_size.json"), 'w') as file:
        file.write(json.dumps(ordered_dict))
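For reference, a minimal sketch (an assumption, not part of the original code) of reading the saved file back; the path and column names are placeholders:

import json

with open("/model/output/model_size.json") as f:
    max_ids = json.load(f)  # e.g. {"movieId": 62423, "userId": 162541} (hypothetical)
# Recover the cardinalities by adding back the 1 that save_model_size_config subtracted
slot_sizes = [max_ids[name] + 1 for name in max_ids]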
Example #4
print(f"train preprocess time: {time() - start}")

start = time()
proc.apply(
    valids_ds,
    apply_offline=True,
    record_stats=False,
    shuffle=shuffle_arg,
    output_path=out_valid,
    out_files_per_proc=2,
)
print(f"valid preprocess time: {time() - start}")
print(proc.timings)

# TODO: Implement the get_embedding_size for dask-based workflow
embeddings = list(get_embedding_sizes(proc).values())
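# Each value returned by get_embedding_sizes is a (cardinality, embedding dimension)
# pair, so `embeddings` above is a list of such tuples, one per categorical column.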

print("Creating Iterators for dataloader")
start = time()

new_train_set = [
    os.path.join(out_train, x) for x in os.listdir(out_train)
    if x.endswith("parquet")
]
new_valid_set = [
    os.path.join(out_valid, x) for x in os.listdir(out_valid)
    if x.endswith("parquet")
]

if args.pool:
    # free up the cudf pool here so that we don't run out of memory training the model
Example #5
def train_pytorch(workflow, out_path, cats, conts, labels, batch_size,
                  parts_per_chunk):
    # Set paths and dataloaders
    train_paths = glob.glob(os.path.join(out_path, "train", "*.parquet"))
    valid_paths = glob.glob(os.path.join(out_path, "valid", "*.parquet"))

    train_data = nvt.Dataset(train_paths,
                             engine="parquet",
                             part_mem_fraction=0.04 / parts_per_chunk)
    valid_data = nvt.Dataset(valid_paths,
                             engine="parquet",
                             part_mem_fraction=0.04 / parts_per_chunk)

    train_data_itrs = TorchAsyncItr(
        train_data,
        batch_size=batch_size,
        cats=cats,
        conts=conts,
        labels=labels,
        parts_per_chunk=parts_per_chunk,
    )
    valid_data_itrs = TorchAsyncItr(
        valid_data,
        batch_size=batch_size,
        cats=cats,
        conts=conts,
        labels=labels,
        parts_per_chunk=parts_per_chunk,
    )

    train_dataloader = DLDataLoader(train_data_itrs,
                                    collate_fn=gen_col,
                                    batch_size=None,
                                    pin_memory=False,
                                    num_workers=0)
    valid_dataloader = DLDataLoader(valid_data_itrs,
                                    collate_fn=gen_col,
                                    batch_size=None,
                                    pin_memory=False,
                                    num_workers=0)
    databunch = TabularDataLoaders(train_dataloader, valid_dataloader)

    embeddings = list(get_embedding_sizes(workflow).values())
    # We limit the output dimension to 16
    embeddings = [[emb[0], min(16, emb[1])] for emb in embeddings]

    model = TabularModel(emb_szs=embeddings,
                         n_cont=len(conts),
                         out_sz=2,
                         layers=[512, 256]).cuda()
    learn = Learner(
        databunch,
        model,
        loss_func=torch.nn.CrossEntropyLoss(),
        metrics=[RocAucBinary(), APScoreBinary()],
    )

    learning_rate = 1.32e-2
    epochs = 1
    start = time()
    learn.fit(epochs, learning_rate)
    t_final = time() - start
    total_rows = train_data_itrs.num_rows_processed + valid_data_itrs.num_rows_processed
    print(f"run_time: {t_final} - rows: {total_rows} - epochs: " +
          "{epochs} - dl_thru: {total_rows / t_final}")
Example #6
def test_training():
    # Download & Convert data
    download_file(
        "http://files.grouplens.org/datasets/movielens/ml-25m.zip",
        os.path.join(DATA_DIR, "ml-25m.zip"),
    )

    ratings = cudf.read_csv(os.path.join(DATA_DIR, "ml-25m", "ratings.csv"))
    ratings["new_cat1"] = ratings["userId"] / ratings["movieId"]
    ratings["new_cat1"] = ratings["new_cat1"].astype("int64")
    ratings.head()

    ratings = ratings.drop("timestamp", axis=1)
    train, valid = train_test_split(ratings, test_size=0.2, random_state=42)

    train.to_parquet(DATA_DIR + "train.parquet")
    valid.to_parquet(DATA_DIR + "valid.parquet")

    del train
    del valid
    gc.collect()

    # Perform ETL with NVTabular
    cat_features = CATEGORICAL_COLUMNS >> nvt.ops.Categorify(cat_cache="device")
    ratings = nvt.ColumnSelector(["rating"]) >> nvt.ops.LambdaOp(
        lambda col: (col > 3).astype("int8")
    )
    output = cat_features + ratings

    workflow = nvt.Workflow(output)

    train_dataset = nvt.Dataset(DATA_DIR + "train.parquet", part_size="100MB")
    valid_dataset = nvt.Dataset(DATA_DIR + "valid.parquet", part_size="100MB")

    workflow.fit(train_dataset)

    dict_dtypes = {}

    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64

    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    if path.exists(DATA_DIR + "train"):
        shutil.rmtree(os.path.join(DATA_DIR, "train"))
    if path.exists(DATA_DIR + "valid"):
        shutil.rmtree(os.path.join(DATA_DIR, "valid"))

    workflow.transform(train_dataset).to_parquet(
        output_path=DATA_DIR + "train/",
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )
    workflow.transform(valid_dataset).to_parquet(
        output_path=DATA_DIR + "valid/",
        shuffle=False,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )

    # Train with HugeCTR
    embeddings = get_embedding_sizes(workflow)
    total_cardinality = 0
    slot_sizes = []
    for column in CATEGORICAL_COLUMNS:
        slot_sizes.append(embeddings[column][0])
        total_cardinality += embeddings[column][0]
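    # Illustration with hypothetical values: assuming CATEGORICAL_COLUMNS is
    # ["userId", "movieId", "new_cat1"] with cardinalities 162542, 62424 and 1000,
    # slot_sizes would be [162542, 62424, 1000] and total_cardinality 225966.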

    test_data_path = DATA_DIR + "test/"
    if path.exists(test_data_path):
        shutil.rmtree(test_data_path)

    os.mkdir(test_data_path)

    if path.exists(MODEL_DIR):
        shutil.rmtree(MODEL_DIR)

    os.makedirs(TRAIN_DIR)

    sample_data = cudf.read_parquet(DATA_DIR + "valid.parquet", num_rows=TEST_N_ROWS)
    sample_data.to_csv(test_data_path + "data.csv")

    sample_data_trans = nvt.workflow._transform_partition(sample_data, [workflow.output_node])

    dense_features, embedding_columns, row_ptrs = _convert(sample_data_trans, slot_sizes)

    _run_model(slot_sizes, total_cardinality)

    if path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)

    os.mkdir(TEMP_DIR)

    file_names = glob.iglob(os.path.join(os.getcwd(), "*.model"))
    for files in file_names:
        shutil.move(files, TEMP_DIR)

    hugectr_params = dict()
    hugectr_params["config"] = NETWORK_FILE
    hugectr_params["slots"] = len(slot_sizes)
    hugectr_params["max_nnz"] = len(slot_sizes)
    hugectr_params["embedding_vector_size"] = 16
    hugectr_params["n_outputs"] = 1

    export_hugectr_ensemble(
        workflow=workflow,
        hugectr_model_path=TEMP_DIR,
        hugectr_params=hugectr_params,
        name=MODEL_NAME,
        output_path=MODEL_DIR,
        label_columns=["rating"],
        cats=CATEGORICAL_COLUMNS,
        max_batch_size=64,
    )

    shutil.rmtree(TEMP_DIR)
    _predict(dense_features, embedding_columns, row_ptrs, hugectr_params["config"], MODEL_NAME)
def train_tensorflow(workflow, out_path, cats, conts, labels, batch_size):
    # Get embedding sizes from the workflow and cap the embedding dimension at 16
    embeddings = get_embedding_sizes(workflow)
    for key in embeddings:
        embeddings[key] = (
            embeddings[key][0],
            min(16, embeddings[key][1]),
        )

    # Set paths and dataloaders
    train_path = os.path.join(out_path, "train/")
    valid_path = os.path.join(out_path, "valid/")

    train_dataset_tf = KerasSequenceLoader(
        train_path,
        batch_size=batch_size,
        label_names=labels,
        cat_names=cats,
        cont_names=conts,
        engine="parquet",
        shuffle=True,
        buffer_size=0.06,
        parts_per_chunk=1,
    )

    valid_dataset_tf = KerasSequenceLoader(
        valid_path,
        batch_size=batch_size,
        label_names=labels,
        cat_names=cats,
        cont_names=conts,
        engine="parquet",
        shuffle=False,
        buffer_size=0.06,
        parts_per_chunk=1,
    )

    inputs = {}  # tf.keras.Input placeholders for each feature to be used
    emb_layers = []  # output of all embedding layers, which will be concatenated
    num_layers = []  # output of numerical layers

    for col in cats:
        inputs[col] = tf.keras.Input(name=col, dtype=tf.int32, shape=(1, ))

    for col in conts:
        inputs[col] = tf.keras.Input(name=col, dtype=tf.float32, shape=(1, ))

    for col in cats:
        emb_layers.append(
            tf.feature_column.embedding_column(
                tf.feature_column.categorical_column_with_identity(
                    col,
                    embeddings[col][0]  # Input dimension (vocab size)
                ),
                embeddings[col][1],  # Embedding output dimension
            ))

    for col in conts:
        num_layers.append(tf.feature_column.numeric_column(col))

    emb_layer = layers.DenseFeatures(emb_layers)
    x_emb_output = emb_layer(inputs)

    x = tf.keras.layers.Dense(128, activation="relu")(x_emb_output)
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(x)

    model = tf.keras.Model(inputs=inputs, outputs=x)
    model.compile("sgd", "binary_crossentropy")

    tf.keras.utils.plot_model(model)

    validation_callback = KerasSequenceValidater(valid_dataset_tf)

    model.fit(train_dataset_tf, callbacks=[validation_callback], epochs=1)

    model.save(os.path.join(out_path, "model.savedmodel"))
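As with the PyTorch variant, a hypothetical call with placeholder arguments:

workflow = nvt.Workflow.load("/data/processed/workflow")  # hypothetical path
train_tensorflow(
    workflow,
    out_path="/data/processed",   # must contain train/ and valid/ parquet dirs
    cats=["userId", "movieId"],   # illustrative columns
    conts=["age"],
    labels=["rating"],
    batch_size=16384,
)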
Example #8
def train_hugectr(workflow, devices, out_path):
    # Gets embeddings and devices
    embeddings = list(get_embedding_sizes(workflow).values())
    embeddings = [emb[0] for emb in embeddings]
    devices = [[int(d)] for d in list(devices)[0::2]]
    # Set solver and model
    solver = hugectr.solver_parser_helper(
        vvgpu=[[0]],
        max_iter=10000,
        max_eval_batches=100,
        batchsize_eval=2720,
        batchsize=2720,
        display=1000,
        eval_interval=3200,
        snapshot=3200,
        i64_input_key=True,
        use_mixed_precision=False,
        repeat_dataset=True,
    )
    optimizer = hugectr.optimizer.CreateOptimizer(
        optimizer_type=hugectr.Optimizer_t.SGD, use_mixed_precision=False)
    model = hugectr.Model(solver, optimizer)
    model.add(
        hugectr.Input(
            data_reader_type=hugectr.DataReaderType_t.Parquet,
            source=out_path + "/output/train/_file_list.txt",
            eval_source=out_path + "/output/valid/_file_list.txt",
            check_type=hugectr.Check_t.Non,
            label_dim=1,
            label_name="label",
            dense_dim=13,
            dense_name="dense",
            slot_size_array=embeddings,
            data_reader_sparse_param_array=[
                hugectr.DataReaderSparseParam(
                    hugectr.DataReaderSparse_t.Localized, 26, 1, 26)
            ],
            sparse_names=["data1"],
        ))
    model.add(
        hugectr.SparseEmbedding(
            embedding_type=hugectr.Embedding_t.LocalizedSlotSparseEmbeddingHash,
            max_vocabulary_size_per_gpu=15500000,
            embedding_vec_size=128,
            combiner=0,
            sparse_embedding_name="sparse_embedding1",
            bottom_name="data1",
        ))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["dense"],
            top_names=["fc1"],
            num_output=512,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc1"],
                           top_names=["relu1"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu1"],
            top_names=["fc2"],
            num_output=256,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc2"],
                           top_names=["relu2"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu2"],
            top_names=["fc3"],
            num_output=128,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc3"],
                           top_names=["relu3"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.Interaction,
            bottom_names=["relu3", "sparse_embedding1"],
            top_names=["interaction1"],
        ))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["interaction1"],
            top_names=["fc4"],
            num_output=1024,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc4"],
                           top_names=["relu4"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu4"],
            top_names=["fc5"],
            num_output=1024,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc5"],
                           top_names=["relu5"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu5"],
            top_names=["fc6"],
            num_output=512,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc6"],
                           top_names=["relu6"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu6"],
            top_names=["fc7"],
            num_output=256,
        ))
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                           bottom_names=["fc7"],
                           top_names=["relu7"]))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu7"],
            top_names=["fc8"],
            num_output=1,
        ))
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
            bottom_names=["fc8", "label"],
            top_names=["loss"],
        ))
    # Run training
    model.compile()
    model.summary()
    model.fit()
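Finally, a sketch of one possible invocation; the comma-separated devices string is an assumption inferred from the list(devices)[0::2] parsing above, and the paths are placeholders:

workflow = nvt.Workflow.load("/raid/criteo/workflow")  # hypothetical path
train_hugectr(workflow, devices="0,1", out_path="/raid/criteo")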