def get_offsets(self, workflow, column_types):
    embeddings = get_embedding_sizes(workflow)
    if embeddings is None:
        raise Exception("embeddings cannot be None")
    offsets = dict()
    curr_offset = 0
    for name in column_types["cats"]:
        offsets[name] = curr_offset
        curr_offset += embeddings[name][0]
    return offsets
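# Usage sketch for get_offsets (hypothetical column names and cardinalities, not
# taken from the repo): get_embedding_sizes is assumed to return
# {column: (cardinality, embedding_dim)}, so each column's offset is the running
# sum of the cardinalities of the columns before it.
example_embeddings = {"userId": (1000, 16), "movieId": (500, 16)}
example_offsets, running_total = {}, 0
for col in ["userId", "movieId"]:           # same order as column_types["cats"]
    example_offsets[col] = running_total    # column starts where the previous range ends
    running_total += example_embeddings[col][0]
# example_offsets == {"userId": 0, "movieId": 1000}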
def create_ensemble(workflow_path, hugectr_model_path, ensemble_output_path, ensemble_config_file):
    """
    Creates an ensemble of the NVTabular workflow and the HugeCTR model. This enables
    preprocessing at inference time, allowing the user to send raw data directly to the
    inference server.
    """
    # Load the workflow object
    workflow = nvt.Workflow.load(workflow_path)

    # Verify that the workflow is loaded
    embeddings = get_embedding_sizes(workflow)
    logging.info(f"Embedding sizes for categorical features: {embeddings}")

    with open(ensemble_config_file, "r") as jsonfile:
        ensemble_config = json.load(jsonfile)

    hugectr_params = ensemble_config["hugectr_params"]

    # We override the config param to update the model version.
    # Get the model version for updating the config accordingly.
    model_version = hugectr_model_path.split('/')[-2]
    logging.info(f"Model version: {model_version}")

    # "/model/models/dcn/1/dcn.json" -> ['', 'model', 'models', 'dcn', '1', 'dcn.json']
    model_json_path = hugectr_params["config"].split(os.sep)
    # ['', 'model', 'models', 'dcn', '1', 'dcn.json'] -> ['', 'model', 'models', 'dcn', '2', 'dcn.json']
    model_json_path[-2] = model_version
    # '/' + 'model/models/dcn/2/dcn.json'
    hugectr_params["config"] = os.sep + os.path.join(*model_json_path)
    logging.info(f"HugeCTR configs: {hugectr_params}")

    categorical_cols = ensemble_config["categorical_cols"]
    continuous_cols = ensemble_config["continuous_cols"]
    label_cols = ensemble_config["label_cols"]
    logging.info(f"Categorical Columns: {categorical_cols}")
    logging.info(f"Continuous Columns: {continuous_cols}")
    logging.info(f"Label Columns: {label_cols}")

    logging.info(f"Generating the ensemble at directory: {ensemble_output_path}")
    export_hugectr_ensemble(
        workflow=workflow,
        hugectr_model_path=hugectr_model_path,
        hugectr_params=hugectr_params,
        name=ensemble_config["name"],
        output_path=ensemble_output_path,
        label_columns=label_cols,
        cats=categorical_cols,
        conts=continuous_cols,
        max_batch_size=ensemble_config["max_batch_size"],
    )
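# Illustrative ensemble_config_file contents for create_ensemble. All values are
# hypothetical; only the keys the function actually reads are shown, and the
# hugectr_params keys mirror the ones used in test_training below. The
# hugectr_params["config"] path points at the HugeCTR network JSON inside the
# model repository; create_ensemble rewrites its version directory to match the
# version parsed from hugectr_model_path.
example_ensemble_config = {
    "name": "dcn_ens",
    "max_batch_size": 64,
    "label_cols": ["label"],
    "categorical_cols": ["userId", "movieId"],
    "continuous_cols": [],
    "hugectr_params": {
        "config": "/model/models/dcn/1/dcn.json",
        "slots": 2,
        "max_nnz": 2,
        "embedding_vector_size": 16,
        "n_outputs": 1,
    },
}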
def save_model_size_config(workflow: Workflow, output_path: str):
    embeddings = {}
    for k, v in get_embedding_sizes(workflow).items():
        # we have to subtract one, as the model expects to get the maximal id for each category
        embeddings[k] = v[0] - 1

    ordered_dict = OrderedDict()
    for k, v in sorted(list(embeddings.items()), key=lambda x: x[0]):
        ordered_dict[k] = v
    with open(os.path.join(output_path, "model_size.json"), 'w') as file:
        file.write(json.dumps(ordered_dict))
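# Sketch of what save_model_size_config writes (hypothetical columns and
# cardinalities): model_size.json maps each categorical column, sorted by name,
# to its maximal id, i.e. cardinality - 1.
example_sizes = {"userId": (1000, 16), "movieId": (500, 16)}  # {col: (cardinality, dim)}
example_model_size = {k: v[0] - 1 for k, v in sorted(example_sizes.items())}
# json.dumps(example_model_size) -> '{"movieId": 499, "userId": 999}'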
print(f"train preprocess time: {time() - start}") start = time() proc.apply( valids_ds, apply_offline=True, record_stats=False, shuffle=shuffle_arg, output_path=out_valid, out_files_per_proc=2, ) print(f"valid preprocess time: {time() - start}") print(proc.timings) # TODO: Implement the get_embedding_size for dask-based workflow embeddings = list(get_embedding_sizes(proc).values()) print("Creating Iterators for dataloader") start = time() new_train_set = [ os.path.join(out_train, x) for x in os.listdir(out_train) if x.endswith("parquet") ] new_valid_set = [ os.path.join(out_valid, x) for x in os.listdir(out_valid) if x.endswith("parquet") ] if args.pool: # free up the cudf pool here so that we don't run out of memory training the model
def train_pytorch(workflow, out_path, cats, conts, labels, batch_size, parts_per_chunk):
    # Set paths and dataloaders
    train_paths = glob.glob(os.path.join(out_path, "train", "*.parquet"))
    valid_paths = glob.glob(os.path.join(out_path, "valid", "*.parquet"))

    train_data = nvt.Dataset(train_paths, engine="parquet", part_mem_fraction=0.04 / parts_per_chunk)
    valid_data = nvt.Dataset(valid_paths, engine="parquet", part_mem_fraction=0.04 / parts_per_chunk)

    train_data_itrs = TorchAsyncItr(
        train_data,
        batch_size=batch_size,
        cats=cats,
        conts=conts,
        labels=labels,
        parts_per_chunk=parts_per_chunk,
    )
    valid_data_itrs = TorchAsyncItr(
        valid_data,
        batch_size=batch_size,
        cats=cats,
        conts=conts,
        labels=labels,
        parts_per_chunk=parts_per_chunk,
    )

    train_dataloader = DLDataLoader(
        train_data_itrs, collate_fn=gen_col, batch_size=None, pin_memory=False, num_workers=0
    )
    valid_dataloader = DLDataLoader(
        valid_data_itrs, collate_fn=gen_col, batch_size=None, pin_memory=False, num_workers=0
    )
    databunch = TabularDataLoaders(train_dataloader, valid_dataloader)

    embeddings = list(get_embedding_sizes(workflow).values())
    # We limit the output dimension to 16
    embeddings = [[emb[0], min(16, emb[1])] for emb in embeddings]

    model = TabularModel(emb_szs=embeddings, n_cont=len(conts), out_sz=2, layers=[512, 256]).cuda()
    learn = Learner(
        databunch,
        model,
        loss_func=torch.nn.CrossEntropyLoss(),
        metrics=[RocAucBinary(), APScoreBinary()],
    )

    learning_rate = 1.32e-2
    epochs = 1
    start = time()
    learn.fit(epochs, learning_rate)
    t_final = time() - start
    total_rows = train_data_itrs.num_rows_processed + valid_data_itrs.num_rows_processed
    print(
        f"run_time: {t_final} - rows: {total_rows} - epochs: {epochs} - dl_thru: {total_rows / t_final}"
    )
def test_training():
    # Download & Convert data
    download_file(
        "http://files.grouplens.org/datasets/movielens/ml-25m.zip",
        os.path.join(DATA_DIR, "ml-25m.zip"),
    )

    ratings = cudf.read_csv(os.path.join(DATA_DIR, "ml-25m", "ratings.csv"))
    ratings["new_cat1"] = ratings["userId"] / ratings["movieId"]
    ratings["new_cat1"] = ratings["new_cat1"].astype("int64")
    ratings.head()

    ratings = ratings.drop("timestamp", axis=1)
    train, valid = train_test_split(ratings, test_size=0.2, random_state=42)
    train.to_parquet(DATA_DIR + "train.parquet")
    valid.to_parquet(DATA_DIR + "valid.parquet")

    del train
    del valid
    gc.collect()

    # Perform ETL with NVTabular
    cat_features = CATEGORICAL_COLUMNS >> nvt.ops.Categorify(cat_cache="device")
    ratings = nvt.ColumnSelector(["rating"]) >> nvt.ops.LambdaOp(
        lambda col: (col > 3).astype("int8")
    )
    output = cat_features + ratings
    workflow = nvt.Workflow(output)

    train_dataset = nvt.Dataset(DATA_DIR + "train.parquet", part_size="100MB")
    valid_dataset = nvt.Dataset(DATA_DIR + "valid.parquet", part_size="100MB")

    workflow.fit(train_dataset)

    dict_dtypes = {}
    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    if path.exists(DATA_DIR + "train"):
        shutil.rmtree(os.path.join(DATA_DIR, "train"))
    if path.exists(DATA_DIR + "valid"):
        shutil.rmtree(os.path.join(DATA_DIR, "valid"))

    workflow.transform(train_dataset).to_parquet(
        output_path=DATA_DIR + "train/",
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )
    workflow.transform(valid_dataset).to_parquet(
        output_path=DATA_DIR + "valid/",
        shuffle=False,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )

    # Train with HugeCTR
    embeddings = get_embedding_sizes(workflow)
    total_cardinality = 0
    slot_sizes = []
    for column in CATEGORICAL_COLUMNS:
        slot_sizes.append(embeddings[column][0])
        total_cardinality += embeddings[column][0]

    test_data_path = DATA_DIR + "test/"
    if path.exists(test_data_path):
        shutil.rmtree(test_data_path)
    os.mkdir(test_data_path)

    if path.exists(MODEL_DIR):
        shutil.rmtree(MODEL_DIR)
    os.makedirs(TRAIN_DIR)

    sample_data = cudf.read_parquet(DATA_DIR + "valid.parquet", num_rows=TEST_N_ROWS)
    sample_data.to_csv(test_data_path + "data.csv")
    sample_data_trans = nvt.workflow._transform_partition(sample_data, [workflow.output_node])
    dense_features, embedding_columns, row_ptrs = _convert(sample_data_trans, slot_sizes)

    _run_model(slot_sizes, total_cardinality)

    if path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.mkdir(TEMP_DIR)

    file_names = glob.iglob(os.path.join(os.getcwd(), "*.model"))
    for files in file_names:
        shutil.move(files, TEMP_DIR)

    hugectr_params = dict()
    hugectr_params["config"] = NETWORK_FILE
    hugectr_params["slots"] = len(slot_sizes)
    hugectr_params["max_nnz"] = len(slot_sizes)
    hugectr_params["embedding_vector_size"] = 16
    hugectr_params["n_outputs"] = 1

    export_hugectr_ensemble(
        workflow=workflow,
        hugectr_model_path=TEMP_DIR,
        hugectr_params=hugectr_params,
        name=MODEL_NAME,
        output_path=MODEL_DIR,
        label_columns=["rating"],
        cats=CATEGORICAL_COLUMNS,
        max_batch_size=64,
    )
    shutil.rmtree(TEMP_DIR)

    _predict(dense_features, embedding_columns, row_ptrs, hugectr_params["config"], MODEL_NAME)
def train_tensorflow(workflow, out_path, cats, conts, labels, batch_size):
    # Get embeddings from workflow
    embeddings = get_embedding_sizes(workflow)
    for key in embeddings:
        embeddings[key] = (
            embeddings[key][0],
            min(16, embeddings[key][1]),
        )

    # Set paths and dataloaders
    train_path = os.path.join(out_path, "train/")
    valid_path = os.path.join(out_path, "valid/")

    train_dataset_tf = KerasSequenceLoader(
        train_path,
        batch_size=batch_size,
        label_names=labels,
        cat_names=cats,
        cont_names=conts,
        engine="parquet",
        shuffle=True,
        buffer_size=0.06,
        parts_per_chunk=1,
    )
    valid_dataset_tf = KerasSequenceLoader(
        valid_path,
        batch_size=batch_size,
        label_names=labels,
        cat_names=cats,
        cont_names=conts,
        engine="parquet",
        shuffle=False,
        buffer_size=0.06,
        parts_per_chunk=1,
    )

    inputs = {}  # tf.keras.Input placeholders for each feature to be used
    emb_layers = []  # output of all embedding layers, which will be concatenated
    num_layers = []  # output of numerical layers

    for col in cats:
        inputs[col] = tf.keras.Input(name=col, dtype=tf.int32, shape=(1,))
    for col in conts:
        inputs[col] = tf.keras.Input(name=col, dtype=tf.float32, shape=(1,))

    for col in cats:
        emb_layers.append(
            tf.feature_column.embedding_column(
                tf.feature_column.categorical_column_with_identity(
                    col, embeddings[col][0]  # Input dimension (vocabulary size)
                ),
                embeddings[col][1],  # Embedding output dimension
            )
        )
    for col in conts:
        num_layers.append(tf.feature_column.numeric_column(col))

    emb_layer = layers.DenseFeatures(emb_layers)
    x_emb_output = emb_layer(inputs)

    x = tf.keras.layers.Dense(128, activation="relu")(x_emb_output)
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(x)

    model = tf.keras.Model(inputs=inputs, outputs=x)
    model.compile("sgd", "binary_crossentropy")
    tf.keras.utils.plot_model(model)

    validation_callback = KerasSequenceValidater(valid_dataset_tf)
    model.fit(train_dataset_tf, callbacks=[validation_callback], epochs=1)

    model.save(os.path.join(out_path, "model.savedmodel"))
def train_hugectr(workflow, devices, out_path):
    # Gets embeddings and devices
    embeddings = list(get_embedding_sizes(workflow).values())
    embeddings = [emb[0] for emb in embeddings]
    devices = [[int(d)] for d in list(devices)[0::2]]

    # Set solver and model
    solver = hugectr.solver_parser_helper(
        vvgpu=[[0]],
        max_iter=10000,
        max_eval_batches=100,
        batchsize_eval=2720,
        batchsize=2720,
        display=1000,
        eval_interval=3200,
        snapshot=3200,
        i64_input_key=True,
        use_mixed_precision=False,
        repeat_dataset=True,
    )
    optimizer = hugectr.optimizer.CreateOptimizer(
        optimizer_type=hugectr.Optimizer_t.SGD, use_mixed_precision=False
    )
    model = hugectr.Model(solver, optimizer)
    model.add(
        hugectr.Input(
            data_reader_type=hugectr.DataReaderType_t.Parquet,
            source=out_path + "/output/train/_file_list.txt",
            eval_source=out_path + "/output/valid/_file_list.txt",
            check_type=hugectr.Check_t.Non,
            label_dim=1,
            label_name="label",
            dense_dim=13,
            dense_name="dense",
            slot_size_array=embeddings,
            data_reader_sparse_param_array=[
                hugectr.DataReaderSparseParam(hugectr.DataReaderSparse_t.Localized, 26, 1, 26)
            ],
            sparse_names=["data1"],
        )
    )
    model.add(
        hugectr.SparseEmbedding(
            embedding_type=hugectr.Embedding_t.LocalizedSlotSparseEmbeddingHash,
            max_vocabulary_size_per_gpu=15500000,
            embedding_vec_size=128,
            combiner=0,
            sparse_embedding_name="sparse_embedding1",
            bottom_name="data1",
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["dense"],
            top_names=["fc1"],
            num_output=512,
        )
    )
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc1"], top_names=["relu1"])
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu1"],
            top_names=["fc2"],
            num_output=256,
        )
    )
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc2"], top_names=["relu2"])
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu2"],
            top_names=["fc3"],
            num_output=128,
        )
    )
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc3"], top_names=["relu3"])
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.Interaction,
            bottom_names=["relu3", "sparse_embedding1"],
            top_names=["interaction1"],
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["interaction1"],
            top_names=["fc4"],
            num_output=1024,
        )
    )
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc4"], top_names=["relu4"])
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu4"],
            top_names=["fc5"],
            num_output=1024,
        )
    )
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc5"], top_names=["relu5"])
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu5"],
            top_names=["fc6"],
            num_output=512,
        )
    )
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc6"], top_names=["relu6"])
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu6"],
            top_names=["fc7"],
            num_output=256,
        )
    )
    model.add(
        hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc7"], top_names=["relu7"])
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["relu7"],
            top_names=["fc8"],
            num_output=1,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
            bottom_names=["fc8", "label"],
            top_names=["loss"],
        )
    )

    # Run training
    model.compile()
    model.summary()
    model.fit()