Example 1
def main(argv):
    tf.random.set_seed(FLAGS.seed)

    number_of_entries = FLAGS.synthetic_dataset_num_entries
    batch_size = FLAGS.synthetic_dataset_batch_size
    number_of_batches = number_of_entries // batch_size

    if FLAGS.feature_spec is not None:
        fspec = FeatureSpec.from_yaml(FLAGS.feature_spec)
    else:
        cardinalities = [int(s) for s in FLAGS.synthetic_dataset_table_sizes]
        fspec = FeatureSpec.get_default_feature_spec(number_of_numerical_features=FLAGS.num_numerical_features,
                                                     categorical_feature_cardinalities=cardinalities)

    fspec.base_directory = FLAGS.synthetic_dataset_dir
    fspec.check_feature_spec()

    number_of_numerical_features = fspec.get_number_of_numerical_features()
    categorical_feature_sizes = fspec.get_categorical_sizes()

    train_dataset = DummyDataset(batch_size=batch_size, num_numerical_features=number_of_numerical_features,
                                 categorical_feature_cardinalities=categorical_feature_sizes,
                                 num_batches=number_of_batches)

    test_dataset = DummyDataset(batch_size=batch_size, num_numerical_features=number_of_numerical_features,
                                categorical_feature_cardinalities=categorical_feature_sizes,
                                num_batches=number_of_batches)

    write_dataset_to_disk(
        dataset_train=train_dataset,
        dataset_test=test_dataset,
        feature_spec=fspec
    )
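
The entry point above relies on command-line flags being defined elsewhere; a minimal, hypothetical driver is sketched below, assuming absl-style flags (the flag names are taken from the code above, the defaults are illustrative and imports for main()'s own dependencies are omitted):

from absl import app, flags

FLAGS = flags.FLAGS
# Hypothetical flag definitions matching the names referenced in main(); defaults are illustrative.
flags.DEFINE_integer('seed', 12345, 'Random seed')
flags.DEFINE_integer('synthetic_dataset_num_entries', 4096, 'Number of entries to generate')
flags.DEFINE_integer('synthetic_dataset_batch_size', 512, 'Batch size of the generated dataset')
flags.DEFINE_string('feature_spec', None, 'Optional path to an existing feature spec YAML')
flags.DEFINE_list('synthetic_dataset_table_sizes', ['100', '100'], 'Cardinalities of the categorical features')
flags.DEFINE_integer('num_numerical_features', 13, 'Number of numerical features')
flags.DEFINE_string('synthetic_dataset_dir', '/tmp/synthetic_dataset', 'Directory to write the dataset to')

if __name__ == '__main__':
    app.run(main)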
Example 2
def save_feature_spec(user_cardinality,
                      item_cardinality,
                      dtypes,
                      test_negative_samples,
                      output_path,
                      user_feature_name='user',
                      item_feature_name='item',
                      label_feature_name='label'):
    feature_spec = {
        user_feature_name: {
            'dtype': dtypes[user_feature_name],
            'cardinality': int(user_cardinality)
        },
        item_feature_name: {
            'dtype': dtypes[item_feature_name],
            'cardinality': int(item_cardinality)
        },
        label_feature_name: {
            'dtype': dtypes[label_feature_name],
        }
    }
    metadata = {TEST_SAMPLES_PER_SERIES: test_negative_samples + 1}
    train_mapping = [{
        'type': 'torch_tensor',
        'features': [user_feature_name, item_feature_name],
        'files': [TRAIN_0]
    }, {
        'type': 'torch_tensor',
        'features': [label_feature_name],
        'files': [TRAIN_1]
    }]
    test_mapping = [{
        'type': 'torch_tensor',
        'features': [user_feature_name, item_feature_name],
        'files': [TEST_0],
    }, {
        'type': 'torch_tensor',
        'features': [label_feature_name],
        'files': [TEST_1],
    }]
    channel_spec = {
        USER_CHANNEL_NAME: [user_feature_name],
        ITEM_CHANNEL_NAME: [item_feature_name],
        LABEL_CHANNEL_NAME: [label_feature_name]
    }
    source_spec = {'train': train_mapping, 'test': test_mapping}
    feature_spec = FeatureSpec(feature_spec=feature_spec,
                               metadata=metadata,
                               source_spec=source_spec,
                               channel_spec=channel_spec,
                               base_directory="")
    feature_spec.to_yaml(output_path=output_path)
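
A hedged usage sketch for the helper above; the cardinalities and output path are illustrative assumptions, and the dtype strings follow the torch_tensor convention used elsewhere in these examples:

# Illustrative call only; all concrete values are assumptions.
dtypes = {'user': 'torch.int64', 'item': 'torch.int64', 'label': 'torch.float32'}
save_feature_spec(user_cardinality=1000,
                  item_cardinality=500,
                  dtypes=dtypes,
                  test_negative_samples=99,
                  output_path='feature_spec.yaml')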
Example 3
def main():
    # TODO: would be kind of nice to have an optional tag to be able to juggle
    # between sets of vectors corresponding to different user folds or sets
    # of features. (Though don't want too many lying around. Vectorizing just
    # 1% of users can already produce files as big as a GB.)
    parser = argparse.ArgumentParser()
    parser.add_argument('user_fold')
    parser.add_argument(
        '--tag',
        action='store_true',
        help='Whether to tag the generated vectors by the user fold used')
    parser.add_argument('--lim',
                        type=int,
                        help='Limit number of users vectorized')
    args = parser.parse_args()

    #featspec = FeatureSpec.all_features_spec()
    featspec = FeatureSpec.basic_spec()
    #featspec.add_feature(features.PrevOrderPids)
    users = iterate_wrapped_users(args.user_fold)
    if args.tag:
        affix = '_' + args.user_fold
    else:
        affix = ''
    victor = Vectorizer(featspec, affix)
    n = victor.vectorize_users(users, limit=args.lim)
    print('Vectorized {} users from fold {}'.format(n, args.user_fold))
Example 4
def main():
    args = parse_args()
    dataset_size = args.size
    fspec_in = FeatureSpec.from_yaml(args.feature_spec_in)
    fspec_in.base_directory = args.output
    cat_cardinalities = fspec_in.get_categorical_sizes()
    cat_names = fspec_in.get_categorical_feature_names()
    cardinalities = {name: cardinality for name, cardinality in zip(cat_names, cat_cardinalities)}
    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    numerical_names_set = set(fspec_in.channel_spec[NUMERICAL_CHANNEL])
    for mapping_name, mapping in fspec_in.source_spec.items():
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this generator"
            assert len(chunk['files']) == 1, "Only one file per chunk supported in this transcoder"
            path_to_save = os.path.join(fspec_in.base_directory, chunk['files'][0])
            data = []
            for name in chunk['features']:
                if name == input_label_feature_name:
                    data.append(np.random.randint(0, 2, size=dataset_size))  # binary labels; randint's upper bound is exclusive
                elif name in numerical_names_set:
                    data.append(np.random.rand(dataset_size))
                else:
                    local_cardinality = cardinalities[name]
                    data.append(np.random.randint(0, local_cardinality, size=dataset_size))
            values = np.stack(data).T
            to_save = pd.DataFrame(values, columns=chunk['features'])
            os.makedirs(os.path.dirname(path_to_save), exist_ok=True)
            to_save.to_csv(path_to_save, index=False, header=False)
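
parse_args is not shown above; a minimal sketch of what it presumably provides, reconstructed from the attributes main() accesses (args.size, args.feature_spec_in, args.output), with assumed defaults:

import argparse

def parse_args():
    # Hypothetical reconstruction; only the arguments used in main() are defined.
    parser = argparse.ArgumentParser(description='Generate random CSV data matching a feature spec')
    parser.add_argument('--size', type=int, default=1000, help='Number of rows to generate per file')
    parser.add_argument('--feature_spec_in', type=str, default='feature_spec.yaml', help='Path to the input feature spec YAML')
    parser.add_argument('--output', type=str, default='/data', help='Directory to write the generated CSV files to')
    return parser.parse_args()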
Example 5
def test_samples_in_test_series(path):
    loaded_featurespec = FeatureSpec.from_yaml(path)

    series_length = loaded_featurespec.metadata[TEST_SAMPLES_PER_SERIES]
    dataset = TorchTensorDataset(loaded_featurespec, 'test', mock_args())
    for feature in dataset.features.values():
        assert len(feature) % series_length == 0
Example 6
def split_dataset(dataset_dir: str, output_dir: str, batch_size: int,
                  numerical_features: int):
    categorical_sizes_file = os.path.join(dataset_dir, "model_size.json")
    with open(categorical_sizes_file) as f:
        # model_size.json contains the max value of each feature instead of the cardinality.
        # For feature spec this is changed for consistency and clarity.
        categorical_cardinalities = [int(v) + 1 for v in json.load(f).values()]

    train_file = os.path.join(dataset_dir, "train_data.bin")
    test_file = os.path.join(dataset_dir, "test_data.bin")
    val_file = os.path.join(dataset_dir, "validation_data.bin")

    target_train = os.path.join(output_dir, "train")
    target_test = os.path.join(output_dir, "test")
    target_val = os.path.join(output_dir, "validation")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(target_train, exist_ok=True)
    os.makedirs(target_test, exist_ok=True)
    os.makedirs(target_val, exist_ok=True)

    # VALIDATION chunk is ignored in feature spec on purpose
    feature_spec = FeatureSpec.get_default_feature_spec(
        number_of_numerical_features=numerical_features,
        categorical_feature_cardinalities=categorical_cardinalities)
    feature_spec.to_yaml(os.path.join(output_dir, 'feature_spec.yaml'))
    split_binary_file(test_file, target_test, categorical_cardinalities,
                      numerical_features, batch_size)
    split_binary_file(train_file, target_train, categorical_cardinalities,
                      numerical_features, batch_size)
    split_binary_file(val_file, target_val, categorical_cardinalities,
                      numerical_features, batch_size)
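
For context, the loop above expects model_size.json to map each categorical feature name to its maximum value (not its cardinality); a tiny illustrative file could be produced as below (feature names and values are made up):

import json

# Illustrative only: split_dataset() recovers each cardinality as value + 1.
example_model_size = {"cat_0": 999, "cat_1": 14}
with open("model_size.json", "w") as f:
    json.dump(example_model_size, f)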
Example 7
    def __init__(
        self,
        feature_spec: FeatureSpec,
        instance: str,
        local_categorical_feature_names: List[str],
        batch_size: int = 1,
        numerical_features_enabled: bool = False,
    ):

        self._feature_spec = feature_spec
        self._batch_size = batch_size
        self._instance = instance
        feature_spec.check_feature_spec()
        self._create_readers(feature_spec, local_categorical_feature_names,
                             numerical_features_enabled)
        self._categorical_types_tf = [
            fspec_type_to_tf_type[feature_spec.feature_spec[feature][DTYPE_SELECTOR]]
            for feature in local_categorical_feature_names
        ]
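
fspec_type_to_tf_type is referenced but not defined in this fragment; a plausible sketch of such a lookup table is given below (the exact keys it covers are an assumption, not taken from the original module):

import tensorflow as tf

# Hypothetical mapping from feature-spec dtype strings to TensorFlow dtypes.
fspec_type_to_tf_type = {
    'int8': tf.int8,
    'int16': tf.int16,
    'int32': tf.int32,
}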
Example 8
def test_cardinalities(path):
    loaded_featurespec = FeatureSpec.from_yaml(path)
    features = loaded_featurespec.feature_spec
    declared_cardinalities = {
        name: data['cardinality']
        for name, data in features.items() if 'cardinality' in data
    }
    source_spec = loaded_featurespec.source_spec

    for mapping_name, mapping in source_spec.items():
        dataset = TorchTensorDataset(loaded_featurespec, mapping_name,
                                     mock_args())
        for feature_name, cardinality in declared_cardinalities.items():
            feature_data = dataset.features[feature_name]
            biggest_num = feature_data.max().item()
            assert biggest_num < cardinality
Example 9
def write_dataset_to_disk(dataset_train, dataset_test, feature_spec: FeatureSpec) -> None:

    feature_spec.check_feature_spec()  # We rely on the feature spec being properly formatted

    categorical_features_list = feature_spec.get_categorical_feature_names()
    categorical_features_types = [feature_spec.feature_spec[feature_name][DTYPE_SELECTOR]
                                  for feature_name in categorical_features_list]
    number_of_numerical_features = feature_spec.get_number_of_numerical_features()
    number_of_categorical_features = len(categorical_features_list)

    for mapping_name, dataset in zip((TRAIN_MAPPING, TEST_MAPPING),
                                     (dataset_train, dataset_test)):
        file_streams = []
        label_path, numerical_path, categorical_paths = feature_spec.get_mapping_paths(mapping_name)
        try:
            os.makedirs(os.path.dirname(numerical_path), exist_ok=True)
            numerical_f = open(numerical_path, "wb+")
            file_streams.append(numerical_f)

            os.makedirs(os.path.dirname(label_path), exist_ok=True)
            label_f = open(label_path, 'wb+')
            file_streams.append(label_f)

            categorical_fs = []
            for feature_name in categorical_features_list:
                local_path = categorical_paths[feature_name]
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                fs = open(local_path, 'wb+')
                categorical_fs.append(fs)
                file_streams.append(fs)

            pipe = iter(dataset.op())
            for _ in tqdm.tqdm(
                    range(len(dataset)), desc=mapping_name + " dataset saving"):
                (numerical, categorical), label = pipe.get_next()
                categoricals = tf.split(categorical, number_of_categorical_features, axis=1)
                assert (numerical.shape[-1] == number_of_numerical_features)
                assert (len(categoricals) == number_of_categorical_features)

                numerical_f.write(numerical.numpy().astype('float16').tobytes())  # numerical is always float16
                label_f.write(label.numpy().astype('bool').tobytes())  # label is always boolean
                for cat_type, cat_tensor, cat_file in zip(categorical_features_types, categoricals, categorical_fs):
                    cat_file.write(cat_tensor.numpy().astype(cat_type).tobytes())
        finally:
            for stream in file_streams:
                stream.close()
    feature_spec.to_yaml()
Example 10
def test_dtypes(path):
    loaded_featurespec = FeatureSpec.from_yaml(path)
    features = loaded_featurespec.feature_spec
    declared_dtypes = {name: data['dtype'] for name, data in features.items()}
    source_spec = loaded_featurespec.source_spec
    for mapping in source_spec.values():
        for chunk in mapping:
            chunk_dtype = None
            for present_feature in chunk['features']:
                assert present_feature in declared_dtypes, "unknown feature in mapping"
                # Check declared type
                feature_dtype = declared_dtypes[present_feature]
                if chunk_dtype is None:
                    chunk_dtype = feature_dtype
                else:
                    assert chunk_dtype == feature_dtype

            path_to_load = os.path.join(loaded_featurespec.base_directory,
                                        chunk['files'][0])
            loaded_data = torch.load(path_to_load)
            assert str(loaded_data.dtype) == chunk_dtype
Example 11
def main():
    args = parse_args()
    args_output = args.output
    args_input = args.input
    args_feature_spec_in = args.feature_spec_in
    args_feature_spec_out = args.feature_spec_out
    batch_size = args.chunk_size

    fspec_in_path = os.path.join(args_input, args_feature_spec_in)
    fspec_in = FeatureSpec.from_yaml(fspec_in_path)

    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    input_numerical_features_list = fspec_in.channel_spec[NUMERICAL_CHANNEL]
    input_categorical_features_list = fspec_in.channel_spec[CATEGORICAL_CHANNEL]

    # Do a pass to establish the cardinalities: they influence the type we save the dataset as
    found_cardinalities = defaultdict(lambda: 0)
    for mapping_name, mapping in fspec_in.source_spec.items():
        df_iterators = []
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this transcoder"
            assert len(chunk['files']) == 1, "Only one file per chunk supported in this transcoder"
            path_to_load = os.path.join(fspec_in.base_directory,
                                        chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load,
                                         header=None,
                                         chunksize=batch_size,
                                         names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)
            for feature in input_categorical_features_list:
                mapping_cardinality = mapping_df[feature].max() + 1
                previous_cardinality = found_cardinalities[feature]
                found_cardinalities[feature] = max(previous_cardinality,
                                                   mapping_cardinality)

    for feature in input_categorical_features_list:
        declared_cardinality = fspec_in.feature_spec[feature][CARDINALITY_SELECTOR]
        if declared_cardinality != 'auto':
            assert int(declared_cardinality) >= found_cardinalities[feature]
            found_cardinalities[feature] = int(declared_cardinality)

    categorical_cardinalities = [
        found_cardinalities[f] for f in input_categorical_features_list
    ]
    number_of_numerical_features = fspec_in.get_number_of_numerical_features()

    fspec_out = FeatureSpec.get_default_feature_spec(
        number_of_numerical_features=number_of_numerical_features,
        categorical_feature_cardinalities=categorical_cardinalities)
    fspec_out.base_directory = args.output

    for mapping_name, mapping in fspec_in.source_spec.items():

        # open files for outputting
        label_path, numerical_path, categorical_paths = fspec_out.get_mapping_paths(mapping_name)
        for path in [label_path, numerical_path, *categorical_paths.values()]:
            os.makedirs(os.path.dirname(path), exist_ok=True)
        output_categorical_features_list = fspec_out.get_categorical_feature_names()
        numerical_f = open(numerical_path, "ab+")
        label_f = open(label_path, "ab+")
        categorical_fs = [
            open(categorical_paths[name], "ab+")
            for name in output_categorical_features_list
        ]
        categorical_feature_types = [
            get_categorical_feature_type(card)
            for card in categorical_cardinalities
        ]

        df_iterators = []
        for chunk in mapping:
            # We checked earlier it's a single file chunk
            path_to_load = os.path.join(fspec_in.base_directory,
                                        chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load,
                                         header=None,
                                         chunksize=batch_size,
                                         names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(
                chunks, axis=1
            )  # This takes care of making sure feature names are unique

            # Choose the right columns
            numerical_df = mapping_df[input_numerical_features_list]
            categorical_df = mapping_df[input_categorical_features_list]
            label_df = mapping_df[[input_label_feature_name]]

            # Append them to the binary files
            numerical_f.write(numerical_df.values.astype(np.float16).tobytes())
            label_f.write(label_df.values.astype(np.bool_).tobytes())  # np.bool was removed from NumPy; use np.bool_

            categorical_arr = categorical_df.values
            for cat_idx, cat_feature_type in enumerate(categorical_feature_types):
                categorical_fs[cat_idx].write(categorical_arr[:, cat_idx].astype(cat_feature_type).tobytes())

        # Close the per-mapping output streams so the buffers are flushed before the next mapping is processed.
        for stream in [numerical_f, label_f, *categorical_fs]:
            stream.close()

    feature_spec_save_path = os.path.join(args_output, args_feature_spec_out)
    fspec_out.to_yaml(output_path=feature_spec_save_path)
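
get_categorical_feature_type is not shown above; a plausible sketch, consistent with the comment that cardinalities influence the type the dataset is saved as, is given below (this is an assumption, not the original helper):

import numpy as np

def get_categorical_feature_type(cardinality):
    # Hypothetical helper: pick the narrowest integer dtype able to hold all category ids (0 .. cardinality - 1).
    for numpy_type in (np.int8, np.int16, np.int32, np.int64):
        if cardinality - 1 <= np.iinfo(numpy_type).max:
            return numpy_type
    raise ValueError(f"Cardinality {cardinality} is too large to store as an integer type")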
Example 12
def test_matches_template(path, template_path):
    loaded_featurespec_string = FeatureSpec.from_yaml(path).to_string()
    loaded_template_string = FeatureSpec.from_yaml(template_path).to_string()
    assert loaded_template_string == loaded_featurespec_string
Example 13
def main():
    args = parse_args()
    init_distributed(args)

    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.metadata('train_throughput', {'name': 'train_throughput', 'format': ':.3e'})
    dllogger.metadata('hr@10', {'name': 'hr@10', 'format': ':.5f'})
    dllogger.metadata('train_epoch_time', {'name': 'train_epoch_time', 'format': ':.3f'})
    dllogger.metadata('validation_epoch_time', {'name': 'validation_epoch_time', 'format': ':.3f'})
    dllogger.metadata('eval_throughput', {'name': 'eval_throughput', 'format': ':.3e'})

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        torch.manual_seed(args.seed)

    if args.checkpoint_dir and not os.path.exists(args.checkpoint_dir):
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
    torch.cuda.synchronize()

    main_start_time = time.time()

    feature_spec_path = os.path.join(args.data, args.feature_spec_file)
    feature_spec = FeatureSpec.from_yaml(feature_spec_path)
    trainset = dataloading.TorchTensorDataset(feature_spec,
                                              mapping_name='train',
                                              args=args)
    testset = dataloading.TorchTensorDataset(feature_spec,
                                             mapping_name='test',
                                             args=args)
    train_loader = dataloading.TrainDataloader(trainset, args)
    test_loader = dataloading.TestDataLoader(testset, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]
    model = NeuMF(
        nb_users=feature_spec.feature_spec[user_feature_name]['cardinality'],
        nb_items=feature_spec.feature_spec[item_feature_name]['cardinality'],
        mf_dim=args.factors,
        mlp_layer_sizes=args.layers,
        dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(),
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(
        reduction='none'
    )  # use torch.mean() with dim later to avoid copy to host
    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {
            k.replace('module.', ''): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model,
                             test_loader,
                             args.topk,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={
                         'best_eval_throughput': eval_throughput,
                         'hr@10': hr
                     })
        return

    # max_hr starts at 0 and is overwritten whenever hr > 0.
    # It is theoretically possible for the hit rate to be zero in every epoch; without this
    # initialization, the final logging below would then refer to an uninitialized variable.
    max_hr = 0
    best_epoch = 0
    best_model_timestamp = time.time()
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):

        begin = time.time()
        batch_dict_list = train_loader.get_epoch_data()
        num_batches = len(batch_dict_list)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                batch_dict = batch_dict_list[batch_idx]

                user_features = batch_dict[USER_CHANNEL_NAME]
                item_features = batch_dict[ITEM_CHANNEL_NAME]

                user_batch = user_features[user_feature_name]
                item_batch = item_features[item_feature_name]

                label_features = batch_dict[LABEL_CHANNEL_NAME]
                label_batch = label_features[label_feature_name]

                outputs = model(user_batch, item_batch)
                loss = traced_criterion(outputs, label_batch.view(-1, 1)).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
            optimizer.step()

            for p in model.parameters():
                p.grad = None

        del batch_dict_list
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = train_loader.length_after_augmentation
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model,
                             test_loader,
                             args.topk,
                             distributed=args.distributed)

        val_time = time.time() - begin
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch, ),
                     data={
                         'train_throughput': train_throughput,
                         'hr@10': hr,
                         'train_epoch_time': train_time,
                         'validation_epoch_time': val_time,
                         'eval_throughput': eval_throughput
                     })

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                    'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        dllogger.log(step=tuple(),
                     data={
                         'best_train_throughput': max(train_throughputs),
                         'best_eval_throughput': max(eval_throughputs),
                         'mean_train_throughput': np.mean(train_throughputs),
                         'mean_eval_throughput': np.mean(eval_throughputs),
                         'best_accuracy': max_hr,
                         'best_epoch': best_epoch,
                         'time_to_target': time.time() - main_start_time,
                         'time_to_best_model': best_model_timestamp - main_start_time
                     })
Example 14
def create_input_pipelines(flags):
    if flags.dataset_type == 'synthetic' and not flags.synthetic_dataset_use_feature_spec:
        cardinalities = [int(d) for d in flags.synthetic_dataset_cardinalities]
        feature_spec = FeatureSpec.get_default_feature_spec(
            number_of_numerical_features=flags.synthetic_dataset_num_numerical_features,
            categorical_feature_cardinalities=cardinalities)
    else:  # synthetic based on feature spec, or raw
        fspec_path = os.path.join(flags.dataset_path, flags.feature_spec)
        feature_spec = FeatureSpec.from_yaml(fspec_path)

    dataset_metadata = DatasetMetadata(
        num_numerical_features=feature_spec.get_number_of_numerical_features(),
        categorical_cardinalities=feature_spec.get_categorical_sizes())

    if flags.columnwise_split and not flags.data_parallel_bottom_mlp and dataset_metadata.num_numerical_features > 0:
        raise ValueError(
            'Currently when using the --columnwise_split option '
            'you must either set --data_parallel_bottom_mlp or have no numerical features'
        )

    multi_gpu_metadata = get_device_mapping(
        embedding_sizes=dataset_metadata.categorical_cardinalities,
        num_gpus=hvd.size(),
        data_parallel_bottom_mlp=flags.data_parallel_bottom_mlp,
        columnwise_split=flags.columnwise_split,
        num_numerical_features=dataset_metadata.num_numerical_features)

    local_tables = multi_gpu_metadata.rank_to_categorical_ids[hvd.rank()]

    local_numerical_features_enabled = hvd.rank() in multi_gpu_metadata.bottom_mlp_ranks
    local_numerical_features = dataset_metadata.num_numerical_features if local_numerical_features_enabled else 0

    if flags.dataset_type == 'synthetic':
        local_table_sizes = [
            dataset_metadata.categorical_cardinalities[i] for i in local_tables
        ]
        train_dataset = DummyDataset(
            batch_size=flags.batch_size,
            num_numerical_features=local_numerical_features,
            categorical_feature_cardinalities=local_table_sizes,
            num_batches=flags.synthetic_dataset_train_batches)

        test_dataset = DummyDataset(
            batch_size=flags.valid_batch_size,
            num_numerical_features=local_numerical_features,
            categorical_feature_cardinalities=local_table_sizes,
            num_batches=flags.synthetic_dataset_valid_batches)

    elif flags.dataset_type == 'tf_raw':
        local_categorical_feature_names = feature_spec.cat_positions_to_names(
            local_tables)
        train_dataset = TfRawBinaryDataset(
            feature_spec=feature_spec,
            instance=TRAIN_MAPPING,
            batch_size=flags.batch_size,
            numerical_features_enabled=local_numerical_features_enabled,
            local_categorical_feature_names=local_categorical_feature_names)

        test_dataset = TfRawBinaryDataset(
            feature_spec=feature_spec,
            instance=TEST_MAPPING,
            batch_size=flags.valid_batch_size,
            numerical_features_enabled=local_numerical_features_enabled,
            local_categorical_feature_names=local_categorical_feature_names)

    else:
        raise ValueError(f'Unsupported dataset type: {flags.dataset_type}')

    return train_dataset, test_dataset, dataset_metadata, multi_gpu_metadata
Example 15
def main():
    args = parse_args()
    args_output = args.output
    args_path = args.path
    args_feature_spec_in = args.feature_spec_in
    args_feature_spec_out = args.feature_spec_out

    feature_spec_path = os.path.join(args_path, args_feature_spec_in)
    feature_spec = FeatureSpec.from_yaml(feature_spec_path)

    # Only three features are transcoded - this is NCF specific
    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]

    categorical_features = [user_feature_name, item_feature_name]

    found_cardinalities = {f: 0 for f in categorical_features}

    new_source_spec = {}
    for mapping_name, mapping in feature_spec.source_spec.items():
        # Load all chunks and link into one df
        chunk_dfs = []
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this transcoder"
            file_dfs = []
            for file in chunk['files']:
                path_to_load = os.path.join(feature_spec.base_directory, file)
                file_dfs.append(pd.read_csv(path_to_load, header=None))
            chunk_df = pd.concat(file_dfs, ignore_index=True)
            chunk_df.columns = chunk['features']
            chunk_df.reset_index(drop=True, inplace=True)
            chunk_dfs.append(chunk_df)
        mapping_df = pd.concat(
            chunk_dfs,
            axis=1)  # This takes care of making sure feature names are unique

        for feature in categorical_features:
            mapping_cardinality = mapping_df[feature].max() + 1
            previous_cardinality = found_cardinalities[feature]
            found_cardinalities[feature] = max(previous_cardinality,
                                               mapping_cardinality)

        # We group together users and items, while separating labels. This is because of the target dtypes: ids are int,
        # while labels are float to compute loss.
        ints_tensor = torch.from_numpy(
            mapping_df[[user_feature_name, item_feature_name]].values).long()
        ints_file = f"{mapping_name}_data_0.pt"
        ints_chunk = {
            "type": "torch_tensor",
            "features": [user_feature_name, item_feature_name],
            "files": [ints_file]
        }
        torch.save(ints_tensor, os.path.join(args_output, ints_file))

        floats_tensor = torch.from_numpy(mapping_df[[label_feature_name]].values).float()
        floats_file = f"{mapping_name}_data_1.pt"
        floats_chunk = {
            "type": "torch_tensor",
            "features": [label_feature_name],
            "files": [floats_file]
        }
        torch.save(floats_tensor, os.path.join(args_output, floats_file))

        new_source_spec[mapping_name] = [ints_chunk, floats_chunk]

    for feature in categorical_features:
        found_cardinality = found_cardinalities[feature]
        declared_cardinality = feature_spec.feature_spec[feature].get(
            'cardinality', 'auto')
        if declared_cardinality != "auto":
            declared = int(declared_cardinality)
            assert declared >= found_cardinality, "Specified cardinality conflicts with the data"
            found_cardinalities[feature] = declared

    new_inner_feature_spec = {
        user_feature_name: {
            "dtype": "torch.int64",
            "cardinality": int(found_cardinalities[user_feature_name])
        },
        item_feature_name: {
            "dtype": "torch.int64",
            "cardinality": int(found_cardinalities[item_feature_name])
        },
        label_feature_name: {
            "dtype": "torch.float32"
        }
    }

    new_feature_spec = FeatureSpec(feature_spec=new_inner_feature_spec,
                                   source_spec=new_source_spec,
                                   channel_spec=feature_spec.channel_spec,
                                   metadata=feature_spec.metadata,
                                   base_directory="")
    feature_spec_save_path = os.path.join(args_output, args_feature_spec_out)
    new_feature_spec.to_yaml(output_path=feature_spec_save_path)