def main(argv):
    tf.random.set_seed(FLAGS.seed)

    number_of_entries = FLAGS.synthetic_dataset_num_entries
    batch_size = FLAGS.synthetic_dataset_batch_size
    number_of_batches = number_of_entries // batch_size

    if FLAGS.feature_spec is not None:
        fspec = FeatureSpec.from_yaml(FLAGS.feature_spec)
    else:
        cardinalities = [int(s) for s in FLAGS.synthetic_dataset_table_sizes]
        fspec = FeatureSpec.get_default_feature_spec(
            number_of_numerical_features=FLAGS.num_numerical_features,
            categorical_feature_cardinalities=cardinalities)

    fspec.base_directory = FLAGS.synthetic_dataset_dir
    fspec.check_feature_spec()

    number_of_numerical_features = fspec.get_number_of_numerical_features()
    categorical_feature_sizes = fspec.get_categorical_sizes()

    train_dataset = DummyDataset(batch_size=batch_size,
                                 num_numerical_features=number_of_numerical_features,
                                 categorical_feature_cardinalities=categorical_feature_sizes,
                                 num_batches=number_of_batches)

    test_dataset = DummyDataset(batch_size=batch_size,
                                num_numerical_features=number_of_numerical_features,
                                categorical_feature_cardinalities=categorical_feature_sizes,
                                num_batches=number_of_batches)

    write_dataset_to_disk(
        dataset_train=train_dataset,
        dataset_test=test_dataset,
        feature_spec=fspec
    )
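# A hedged sketch of how this entry point is typically wired up with absl, assuming the
# FLAGS accessed in main() are defined with absl.flags in the same module. The flag names
# below mirror the FLAGS.* accesses above; the defaults are illustrative assumptions only.
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_integer('seed', 1, 'Random seed for synthetic data generation')
flags.DEFINE_integer('synthetic_dataset_num_entries', 32768, 'Number of samples to generate')
flags.DEFINE_integer('synthetic_dataset_batch_size', 1024, 'Batch size used when writing the data')
flags.DEFINE_integer('num_numerical_features', 13, 'Number of numerical features (illustrative)')
flags.DEFINE_list('synthetic_dataset_table_sizes', ['100'] * 26,
                  'Cardinality of each categorical feature (illustrative)')
flags.DEFINE_string('feature_spec', None, 'Optional path to an input feature spec YAML')
flags.DEFINE_string('synthetic_dataset_dir', '/tmp/synthetic_dataset', 'Output directory')

if __name__ == '__main__':
    app.run(main)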
def save_feature_spec(user_cardinality,
                      item_cardinality,
                      dtypes,
                      test_negative_samples,
                      output_path,
                      user_feature_name='user',
                      item_feature_name='item',
                      label_feature_name='label'):
    feature_spec = {
        user_feature_name: {
            'dtype': dtypes[user_feature_name],
            'cardinality': int(user_cardinality)
        },
        item_feature_name: {
            'dtype': dtypes[item_feature_name],
            'cardinality': int(item_cardinality)
        },
        label_feature_name: {
            'dtype': dtypes[label_feature_name],
        }
    }
    metadata = {TEST_SAMPLES_PER_SERIES: test_negative_samples + 1}
    train_mapping = [
        {
            'type': 'torch_tensor',
            'features': [user_feature_name, item_feature_name],
            'files': [TRAIN_0]
        },
        {
            'type': 'torch_tensor',
            'features': [label_feature_name],
            'files': [TRAIN_1]
        }
    ]
    test_mapping = [
        {
            'type': 'torch_tensor',
            'features': [user_feature_name, item_feature_name],
            'files': [TEST_0],
        },
        {
            'type': 'torch_tensor',
            'features': [label_feature_name],
            'files': [TEST_1],
        }
    ]
    channel_spec = {
        USER_CHANNEL_NAME: [user_feature_name],
        ITEM_CHANNEL_NAME: [item_feature_name],
        LABEL_CHANNEL_NAME: [label_feature_name]
    }
    source_spec = {'train': train_mapping, 'test': test_mapping}
    feature_spec = FeatureSpec(feature_spec=feature_spec,
                               metadata=metadata,
                               source_spec=source_spec,
                               channel_spec=channel_spec,
                               base_directory="")
    feature_spec.to_yaml(output_path=output_path)
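# A minimal usage sketch for save_feature_spec, assuming it is imported from the module
# above. The cardinalities, dtype strings, and output path are illustrative placeholders,
# not values taken from any real dataset.
example_dtypes = {
    'user': 'torch.int64',
    'item': 'torch.int64',
    'label': 'torch.float32',
}
save_feature_spec(user_cardinality=1000,      # hypothetical number of users
                  item_cardinality=2000,      # hypothetical number of items
                  dtypes=example_dtypes,
                  test_negative_samples=99,   # yields 100 samples per test series
                  output_path='feature_spec.yaml')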
def main():
    # TODO: would be kind of nice to have an optional tag to be able to juggle
    # between sets of vectors corresponding to different user folds or sets
    # of features. (Though we don't want too many lying around. Vectorizing just
    # 1% of users can already produce files as big as a GB.)
    parser = argparse.ArgumentParser()
    parser.add_argument('user_fold')
    parser.add_argument(
        '--tag', action='store_true',
        help='Whether to tag the generated vectors by the user fold used')
    parser.add_argument('--lim', type=int,
                        help='Limit the number of users vectorized')
    args = parser.parse_args()

    # featspec = FeatureSpec.all_features_spec()
    featspec = FeatureSpec.basic_spec()
    # featspec.add_feature(features.PrevOrderPids)

    users = iterate_wrapped_users(args.user_fold)
    if args.tag:
        affix = '_' + args.user_fold
    else:
        affix = ''
    victor = Vectorizer(featspec, affix)
    n = victor.vectorize_users(users, limit=args.lim)
    print('Vectorized {} users from fold {}'.format(n, args.user_fold))
def main():
    args = parse_args()
    dataset_size = args.size
    fspec_in = FeatureSpec.from_yaml(args.feature_spec_in)
    fspec_in.base_directory = args.output

    cat_cardinalities = fspec_in.get_categorical_sizes()
    cat_names = fspec_in.get_categorical_feature_names()
    cardinalities = {name: cardinality for name, cardinality in zip(cat_names, cat_cardinalities)}

    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    numerical_names_set = set(fspec_in.channel_spec[NUMERICAL_CHANNEL])

    for mapping_name, mapping in fspec_in.source_spec.items():
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this generator"
            assert len(chunk['files']) == 1, "Only one file per chunk supported in this generator"
            path_to_save = os.path.join(fspec_in.base_directory, chunk['files'][0])
            data = []
            for name in chunk['features']:
                if name == input_label_feature_name:
                    # Binary labels: randint's upper bound is exclusive, so use 2 to get both 0s and 1s
                    data.append(np.random.randint(0, 2, size=dataset_size))
                elif name in numerical_names_set:
                    data.append(np.random.rand(dataset_size))
                else:
                    local_cardinality = cardinalities[name]
                    data.append(np.random.randint(0, local_cardinality, size=dataset_size))

            values = np.stack(data).T
            to_save = pd.DataFrame(values, columns=chunk['features'])
            os.makedirs(os.path.dirname(path_to_save), exist_ok=True)
            to_save.to_csv(path_to_save, index=False, header=False)
def test_samples_in_test_series(path):
    loaded_featurespec = FeatureSpec.from_yaml(path)
    series_length = loaded_featurespec.metadata[TEST_SAMPLES_PER_SERIES]
    dataset = TorchTensorDataset(loaded_featurespec, 'test', mock_args())
    for feature in dataset.features.values():
        assert len(feature) % series_length == 0
def split_dataset(dataset_dir: str, output_dir: str, batch_size: int, numerical_features: int):
    categorical_sizes_file = os.path.join(dataset_dir, "model_size.json")
    with open(categorical_sizes_file) as f:
        # model_size.json contains the max value of each feature rather than its cardinality.
        # The feature spec uses cardinalities for consistency and clarity, hence the +1.
        categorical_cardinalities = [int(v) + 1 for v in json.load(f).values()]

    train_file = os.path.join(dataset_dir, "train_data.bin")
    test_file = os.path.join(dataset_dir, "test_data.bin")
    val_file = os.path.join(dataset_dir, "validation_data.bin")

    target_train = os.path.join(output_dir, "train")
    target_test = os.path.join(output_dir, "test")
    target_val = os.path.join(output_dir, "validation")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(target_train, exist_ok=True)
    os.makedirs(target_test, exist_ok=True)
    os.makedirs(target_val, exist_ok=True)

    # The VALIDATION chunk is omitted from the feature spec on purpose
    feature_spec = FeatureSpec.get_default_feature_spec(
        number_of_numerical_features=numerical_features,
        categorical_feature_cardinalities=categorical_cardinalities)
    feature_spec.to_yaml(os.path.join(output_dir, 'feature_spec.yaml'))

    split_binary_file(test_file, target_test, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(train_file, target_train, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(val_file, target_val, categorical_cardinalities, numerical_features, batch_size)
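# A short usage sketch for split_dataset, assuming split_binary_file and FeatureSpec are
# available from the surrounding module. The paths, batch size, and numerical feature
# count below are illustrative assumptions only.
split_dataset(dataset_dir='/data/dlrm/binary_dataset',   # hypothetical preprocessed input
              output_dir='/data/dlrm/split_dataset',     # hypothetical output location
              batch_size=65536,
              numerical_features=13)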
def __init__(
    self,
    feature_spec: FeatureSpec,
    instance: str,
    local_categorical_feature_names: List[str],
    batch_size: int = 1,
    numerical_features_enabled: bool = False,
):
    self._feature_spec = feature_spec
    self._batch_size = batch_size
    self._instance = instance
    feature_spec.check_feature_spec()

    self._create_readers(feature_spec, local_categorical_feature_names,
                         numerical_features_enabled)

    self._categorical_types_tf = [
        fspec_type_to_tf_type[feature_spec.feature_spec[feature][DTYPE_SELECTOR]]
        for feature in local_categorical_feature_names
    ]
def test_cardinalities(path):
    loaded_featurespec = FeatureSpec.from_yaml(path)
    features = loaded_featurespec.feature_spec
    declared_cardinalities = {
        name: data['cardinality']
        for name, data in features.items() if 'cardinality' in data
    }
    source_spec = loaded_featurespec.source_spec
    for mapping_name, mapping in source_spec.items():
        dataset = TorchTensorDataset(loaded_featurespec, mapping_name, mock_args())
        for feature_name, cardinality in declared_cardinalities.items():
            feature_data = dataset.features[feature_name]
            biggest_num = feature_data.max().item()
            assert biggest_num < cardinality
def write_dataset_to_disk(dataset_train, dataset_test, feature_spec: FeatureSpec) -> None:
    feature_spec.check_feature_spec()  # We rely on the feature spec being properly formatted

    categorical_features_list = feature_spec.get_categorical_feature_names()
    categorical_features_types = [feature_spec.feature_spec[feature_name][DTYPE_SELECTOR]
                                  for feature_name in categorical_features_list]
    number_of_numerical_features = feature_spec.get_number_of_numerical_features()
    number_of_categorical_features = len(categorical_features_list)

    for mapping_name, dataset in zip((TRAIN_MAPPING, TEST_MAPPING), (dataset_train, dataset_test)):
        file_streams = []
        label_path, numerical_path, categorical_paths = feature_spec.get_mapping_paths(mapping_name)
        try:
            os.makedirs(os.path.dirname(numerical_path), exist_ok=True)
            numerical_f = open(numerical_path, "wb+")
            file_streams.append(numerical_f)

            os.makedirs(os.path.dirname(label_path), exist_ok=True)
            label_f = open(label_path, 'wb+')
            file_streams.append(label_f)

            categorical_fs = []
            for feature_name in categorical_features_list:
                local_path = categorical_paths[feature_name]
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                fs = open(local_path, 'wb+')
                categorical_fs.append(fs)
                file_streams.append(fs)

            pipe = iter(dataset.op())
            for _ in tqdm.tqdm(
                    range(len(dataset)), desc=mapping_name + " dataset saving"):
                (numerical, categorical), label = pipe.get_next()
                categoricals = tf.split(categorical, number_of_categorical_features, axis=1)

                assert (numerical.shape[-1] == number_of_numerical_features)
                assert (len(categoricals) == number_of_categorical_features)

                numerical_f.write(numerical.numpy().astype('float16').tobytes())  # numerical is always float16
                label_f.write(label.numpy().astype('bool').tobytes())  # label is always boolean
                for cat_type, cat_tensor, cat_file in zip(categorical_features_types, categoricals, categorical_fs):
                    cat_file.write(cat_tensor.numpy().astype(cat_type).tobytes())
        finally:
            for stream in file_streams:
                stream.close()
    feature_spec.to_yaml()
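# A hedged sketch of reading the numerical split written above back into memory. The
# path and feature count are caller-supplied assumptions; the float16 dtype matches what
# write_dataset_to_disk uses for the numerical stream.
import numpy as np

def load_numerical_split(path: str, number_of_numerical_features: int) -> np.ndarray:
    flat = np.fromfile(path, dtype=np.float16)
    # One row per sample, one column per numerical feature
    return flat.reshape(-1, number_of_numerical_features)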
def test_dtypes(path):
    loaded_featurespec = FeatureSpec.from_yaml(path)
    features = loaded_featurespec.feature_spec
    declared_dtypes = {name: data['dtype'] for name, data in features.items()}
    source_spec = loaded_featurespec.source_spec
    for mapping in source_spec.values():
        for chunk in mapping:
            chunk_dtype = None
            for present_feature in chunk['features']:
                assert present_feature in declared_dtypes, "unknown feature in mapping"
                # Check declared type
                feature_dtype = declared_dtypes[present_feature]
                if chunk_dtype is None:
                    chunk_dtype = feature_dtype
                else:
                    assert chunk_dtype == feature_dtype

            path_to_load = os.path.join(loaded_featurespec.base_directory, chunk['files'][0])
            loaded_data = torch.load(path_to_load)
            assert str(loaded_data.dtype) == chunk_dtype
def main():
    args = parse_args()
    args_output = args.output
    args_input = args.input
    args_feature_spec_in = args.feature_spec_in
    args_feature_spec_out = args.feature_spec_out
    batch_size = args.chunk_size

    fspec_in_path = os.path.join(args_input, args_feature_spec_in)
    fspec_in = FeatureSpec.from_yaml(fspec_in_path)

    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    input_numerical_features_list = fspec_in.channel_spec[NUMERICAL_CHANNEL]
    input_categorical_features_list = fspec_in.channel_spec[CATEGORICAL_CHANNEL]

    # Do a pass over the data to establish the cardinalities: they influence the dtype the dataset is saved as
    found_cardinalities = defaultdict(lambda: 0)
    for mapping_name, mapping in fspec_in.source_spec.items():
        df_iterators = []
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this transcoder"
            assert len(chunk['files']) == 1, "Only one file per chunk supported in this transcoder"
            path_to_load = os.path.join(fspec_in.base_directory, chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load, header=None,
                                         chunksize=batch_size, names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)
            for feature in input_categorical_features_list:
                mapping_cardinality = mapping_df[feature].max() + 1
                previous_cardinality = found_cardinalities[feature]
                found_cardinalities[feature] = max(previous_cardinality, mapping_cardinality)

    for feature in input_categorical_features_list:
        declared_cardinality = fspec_in.feature_spec[feature][CARDINALITY_SELECTOR]
        if declared_cardinality != 'auto':
            assert int(declared_cardinality) >= found_cardinalities[feature]
            found_cardinalities[feature] = int(declared_cardinality)

    categorical_cardinalities = [found_cardinalities[f] for f in input_categorical_features_list]
    number_of_numerical_features = fspec_in.get_number_of_numerical_features()

    fspec_out = FeatureSpec.get_default_feature_spec(
        number_of_numerical_features=number_of_numerical_features,
        categorical_feature_cardinalities=categorical_cardinalities)
    fspec_out.base_directory = args.output

    for mapping_name, mapping in fspec_in.source_spec.items():
        # Open the output files
        label_path, numerical_path, categorical_paths = fspec_out.get_mapping_paths(mapping_name)
        for path in [label_path, numerical_path, *categorical_paths.values()]:
            os.makedirs(os.path.dirname(path), exist_ok=True)

        output_categorical_features_list = fspec_out.get_categorical_feature_names()
        numerical_f = open(numerical_path, "ab+")
        label_f = open(label_path, "ab+")
        categorical_fs = [open(categorical_paths[name], "ab+") for name in output_categorical_features_list]
        categorical_feature_types = [get_categorical_feature_type(card)
                                     for card in categorical_cardinalities]

        df_iterators = []
        for chunk in mapping:
            # We checked earlier that each chunk is a single file
            path_to_load = os.path.join(fspec_in.base_directory, chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load, header=None,
                                         chunksize=batch_size, names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)  # This takes care of making sure feature names are unique

            # Choose the right columns
            numerical_df = mapping_df[input_numerical_features_list]
            categorical_df = mapping_df[input_categorical_features_list]
            label_df = mapping_df[[input_label_feature_name]]

            # Append them to the binary files
            numerical_f.write(numerical_df.values.astype(np.float16).tobytes())
            label_f.write(label_df.values.astype(bool).tobytes())  # np.bool was removed in recent NumPy; use the builtin bool
            categorical_arr = categorical_df.values
            for cat_idx, cat_feature_type in enumerate(categorical_feature_types):
                categorical_fs[cat_idx].write(categorical_arr[:, cat_idx].astype(cat_feature_type).tobytes())

        # Close the output files so everything is flushed to disk
        for f in [numerical_f, label_f, *categorical_fs]:
            f.close()

    feature_spec_save_path = os.path.join(args_output, args_feature_spec_out)
    fspec_out.to_yaml(output_path=feature_spec_save_path)
def test_matches_template(path, template_path):
    loaded_featurespec_string = FeatureSpec.from_yaml(path).to_string()
    loaded_template_string = FeatureSpec.from_yaml(template_path).to_string()
    assert loaded_template_string == loaded_featurespec_string
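# A hedged sketch of how the feature-spec tests above could be parametrized with pytest;
# the YAML paths are hypothetical placeholders, and the suite simply chains the checks
# defined earlier in this file.
import pytest

FEATURE_SPEC_PATHS = ['feature_spec.yaml']  # hypothetical list of specs under test

@pytest.mark.parametrize('path', FEATURE_SPEC_PATHS)
def test_feature_spec_suite(path):
    test_dtypes(path)
    test_cardinalities(path)
    test_samples_in_test_series(path)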
def main():
    args = parse_args()
    init_distributed(args)

    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.metadata('train_throughput', {"name": 'train_throughput', 'format': ":.3e"})
    dllogger.metadata('hr@10', {"name": 'hr@10', 'format': ":.5f"})
    dllogger.metadata('train_epoch_time', {"name": 'train_epoch_time', 'format': ":.3f"})
    dllogger.metadata('validation_epoch_time', {"name": 'validation_epoch_time', 'format': ":.3f"})
    dllogger.metadata('eval_throughput', {"name": 'eval_throughput', 'format': ":.3e"})

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        torch.manual_seed(args.seed)

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    feature_spec_path = os.path.join(args.data, args.feature_spec_file)
    feature_spec = FeatureSpec.from_yaml(feature_spec_path)
    trainset = dataloading.TorchTensorDataset(feature_spec, mapping_name='train', args=args)
    testset = dataloading.TorchTensorDataset(feature_spec, mapping_name='test', args=args)
    train_loader = dataloading.TrainDataloader(trainset, args)
    test_loader = dataloading.TestDataLoader(testset, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create the model
    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]
    model = NeuMF(nb_users=feature_spec.feature_spec[user_feature_name]['cardinality'],
                  nb_items=feature_spec.feature_spec[item_feature_name]['cardinality'],
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid copy to host

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                          keep_batchnorm_fp32=False, loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward, (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_loader, args.topk, distributed=args.distributed)
        val_time = time.time() - start
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={'best_eval_throughput': eval_throughput, 'hr@10': hr})
        return

    # max_hr should always be overridden whenever hr > 0. It is theoretically possible for
    # the hit rate to be zero in the first epoch, which would otherwise leave these
    # variables uninitialized when they are logged below.
    max_hr = 0
    best_epoch = 0
    best_model_timestamp = time.time()
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):
        begin = time.time()
        batch_dict_list = train_loader.get_epoch_data()
        num_batches = len(batch_dict_list)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                batch_dict = batch_dict_list[batch_idx]

                user_features = batch_dict[USER_CHANNEL_NAME]
                item_features = batch_dict[ITEM_CHANNEL_NAME]
                user_batch = user_features[user_feature_name]
                item_batch = item_features[item_feature_name]

                label_features = batch_dict[LABEL_CHANNEL_NAME]
                label_batch = label_features[label_feature_name]

                outputs = model(user_batch, item_batch)
                loss = traced_criterion(outputs, label_batch.view(-1, 1)).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
            optimizer.step()

            for p in model.parameters():
                p.grad = None

        del batch_dict_list
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = train_loader.length_after_augmentation
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model, test_loader, args.topk, distributed=args.distributed)

        val_time = time.time() - begin
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch,),
                     data={'train_throughput': train_throughput,
                           'hr@10': hr,
                           'train_epoch_time': train_time,
                           'validation_epoch_time': val_time,
                           'eval_throughput': eval_throughput})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        dllogger.log(data={'best_train_throughput': max(train_throughputs),
                           'best_eval_throughput': max(eval_throughputs),
                           'mean_train_throughput': np.mean(train_throughputs),
                           'mean_eval_throughput': np.mean(eval_throughputs),
                           'best_accuracy': max_hr,
                           'best_epoch': best_epoch,
                           'time_to_target': time.time() - main_start_time,
                           'time_to_best_model': best_model_timestamp - main_start_time},
                     step=tuple())
def create_input_pipelines(flags):
    if flags.dataset_type == 'synthetic' and not flags.synthetic_dataset_use_feature_spec:
        cardinalities = [int(d) for d in flags.synthetic_dataset_cardinalities]
        feature_spec = FeatureSpec.get_default_feature_spec(
            number_of_numerical_features=flags.synthetic_dataset_num_numerical_features,
            categorical_feature_cardinalities=cardinalities)
    else:  # synthetic based on feature spec, or raw
        fspec_path = os.path.join(flags.dataset_path, flags.feature_spec)
        feature_spec = FeatureSpec.from_yaml(fspec_path)

    dataset_metadata = DatasetMetadata(
        num_numerical_features=feature_spec.get_number_of_numerical_features(),
        categorical_cardinalities=feature_spec.get_categorical_sizes())

    if flags.columnwise_split and not flags.data_parallel_bottom_mlp and dataset_metadata.num_numerical_features > 0:
        raise ValueError('Currently when using the --columnwise_split option '
                         'you must either set --data_parallel_bottom_mlp or have no numerical features')

    multi_gpu_metadata = get_device_mapping(
        embedding_sizes=dataset_metadata.categorical_cardinalities,
        num_gpus=hvd.size(),
        data_parallel_bottom_mlp=flags.data_parallel_bottom_mlp,
        columnwise_split=flags.columnwise_split,
        num_numerical_features=dataset_metadata.num_numerical_features)

    local_tables = multi_gpu_metadata.rank_to_categorical_ids[hvd.rank()]
    local_numerical_features_enabled = hvd.rank() in multi_gpu_metadata.bottom_mlp_ranks
    local_numerical_features = dataset_metadata.num_numerical_features if local_numerical_features_enabled else 0

    if flags.dataset_type == 'synthetic':
        local_table_sizes = [dataset_metadata.categorical_cardinalities[i] for i in local_tables]
        train_dataset = DummyDataset(
            batch_size=flags.batch_size,
            num_numerical_features=local_numerical_features,
            categorical_feature_cardinalities=local_table_sizes,
            num_batches=flags.synthetic_dataset_train_batches)
        test_dataset = DummyDataset(
            batch_size=flags.valid_batch_size,
            num_numerical_features=local_numerical_features,
            categorical_feature_cardinalities=local_table_sizes,
            num_batches=flags.synthetic_dataset_valid_batches)
    elif flags.dataset_type == 'tf_raw':
        local_categorical_feature_names = feature_spec.cat_positions_to_names(local_tables)
        train_dataset = TfRawBinaryDataset(
            feature_spec=feature_spec,
            instance=TRAIN_MAPPING,
            batch_size=flags.batch_size,
            numerical_features_enabled=local_numerical_features_enabled,
            local_categorical_feature_names=local_categorical_feature_names)
        test_dataset = TfRawBinaryDataset(
            feature_spec=feature_spec,
            instance=TEST_MAPPING,
            batch_size=flags.valid_batch_size,
            numerical_features_enabled=local_numerical_features_enabled,
            local_categorical_feature_names=local_categorical_feature_names)
    else:
        raise ValueError(f'Unsupported dataset type: {flags.dataset_type}')

    return train_dataset, test_dataset, dataset_metadata, multi_gpu_metadata
def main():
    args = parse_args()
    args_output = args.output
    args_path = args.path
    args_feature_spec_in = args.feature_spec_in
    args_feature_spec_out = args.feature_spec_out

    feature_spec_path = os.path.join(args_path, args_feature_spec_in)
    feature_spec = FeatureSpec.from_yaml(feature_spec_path)

    # Only three features are transcoded - this is NCF specific
    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]

    categorical_features = [user_feature_name, item_feature_name]
    found_cardinalities = {f: 0 for f in categorical_features}

    new_source_spec = {}
    for mapping_name, mapping in feature_spec.source_spec.items():
        # Load all chunks and combine them into one dataframe
        chunk_dfs = []
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this transcoder"
            file_dfs = []
            for file in chunk['files']:
                path_to_load = os.path.join(feature_spec.base_directory, file)
                file_dfs.append(pd.read_csv(path_to_load, header=None))
            chunk_df = pd.concat(file_dfs, ignore_index=True)
            chunk_df.columns = chunk['features']
            chunk_df.reset_index(drop=True, inplace=True)
            chunk_dfs.append(chunk_df)
        mapping_df = pd.concat(chunk_dfs, axis=1)  # This takes care of making sure feature names are unique

        for feature in categorical_features:
            mapping_cardinality = mapping_df[feature].max() + 1
            previous_cardinality = found_cardinalities[feature]
            found_cardinalities[feature] = max(previous_cardinality, mapping_cardinality)

        # We group users and items together while keeping labels separate. This is because of the target dtypes:
        # ids are ints, while labels are floats used to compute the loss.
        ints_tensor = torch.from_numpy(mapping_df[[user_feature_name, item_feature_name]].values).long()
        ints_file = f"{mapping_name}_data_0.pt"
        ints_chunk = {
            "type": "torch_tensor",
            "features": [user_feature_name, item_feature_name],
            "files": [ints_file]
        }
        torch.save(ints_tensor, os.path.join(args_output, ints_file))

        floats_tensor = torch.from_numpy(mapping_df[[label_feature_name]].values).float()
        floats_file = f"{mapping_name}_data_1.pt"
        floats_chunk = {
            "type": "torch_tensor",
            "features": [label_feature_name],
            "files": [floats_file]
        }
        torch.save(floats_tensor, os.path.join(args_output, floats_file))

        new_source_spec[mapping_name] = [ints_chunk, floats_chunk]

    for feature in categorical_features:
        found_cardinality = found_cardinalities[feature]
        declared_cardinality = feature_spec.feature_spec[feature].get('cardinality', 'auto')
        if declared_cardinality != "auto":
            declared = int(declared_cardinality)
            assert declared >= found_cardinality, "Specified cardinality conflicts with the data"
            found_cardinalities[feature] = declared

    new_inner_feature_spec = {
        user_feature_name: {
            "dtype": "torch.int64",
            "cardinality": int(found_cardinalities[user_feature_name])
        },
        item_feature_name: {
            "dtype": "torch.int64",
            "cardinality": int(found_cardinalities[item_feature_name])
        },
        label_feature_name: {
            "dtype": "torch.float32"
        }
    }

    new_feature_spec = FeatureSpec(feature_spec=new_inner_feature_spec,
                                   source_spec=new_source_spec,
                                   channel_spec=feature_spec.channel_spec,
                                   metadata=feature_spec.metadata,
                                   base_directory="")

    feature_spec_save_path = os.path.join(args_output, args_feature_spec_out)
    new_feature_spec.to_yaml(output_path=feature_spec_save_path)
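# A hedged sketch of loading one of the torch_tensor chunks produced by the transcoder
# above. The output directory is an illustrative placeholder, and the 'train' file names
# assume a 'train' mapping exists in the source spec; the column order follows the
# chunk's 'features' list (user ids first, then item ids).
import os
import torch

output_dir = 'transcoded'                      # hypothetical transcoder output directory
ints_tensor = torch.load(os.path.join(output_dir, 'train_data_0.pt'))
user_ids = ints_tensor[:, 0]                   # first feature in the chunk
item_ids = ints_tensor[:, 1]                   # second feature in the chunk
labels = torch.load(os.path.join(output_dir, 'train_data_1.pt')).squeeze(1)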