Example #1
def split_dataset(dataset_dir: str, output_dir: str, batch_size: int, numerical_features: int):
    categorical_sizes_file = os.path.join(dataset_dir, "model_size.json")
    with open(categorical_sizes_file) as f:
        # model_size.json contains the max value of each feature instead of the cardinality.
        # For feature spec this is changed for consistency and clarity.
        categorical_cardinalities = [int(v)+1 for v in json.load(f).values()]

    train_file = os.path.join(dataset_dir, "train_data.bin")
    test_file = os.path.join(dataset_dir, "test_data.bin")
    val_file = os.path.join(dataset_dir, "validation_data.bin")

    target_train = os.path.join(output_dir, "train")
    target_test = os.path.join(output_dir, "test")
    target_val = os.path.join(output_dir, "validation")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(target_train, exist_ok=True)
    os.makedirs(target_test, exist_ok=True)
    os.makedirs(target_val, exist_ok=True)

    # VALIDATION chunk is ignored in feature spec on purpose
    feature_spec = FeatureSpec.get_default_feature_spec(number_of_numerical_features=numerical_features,
                                                        categorical_feature_cardinalities=categorical_cardinalities)
    feature_spec.to_yaml(os.path.join(output_dir, 'feature_spec.yaml'))
    split_binary_file(test_file, target_test, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(train_file, target_train, categorical_cardinalities, numerical_features, batch_size)
    split_binary_file(val_file, target_val, categorical_cardinalities, numerical_features, batch_size)
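
A minimal usage sketch for the splitter above. The paths, batch size and feature count are illustrative assumptions, not values from the repository; the function expects dataset_dir to contain model_size.json plus train_data.bin, test_data.bin and validation_data.bin, and writes the split chunks and a feature_spec.yaml under output_dir.

# Hypothetical call; adjust paths and sizes to the actual dataset.
split_dataset(dataset_dir="/data/criteo_preprocessed",
              output_dir="/data/criteo_split",
              batch_size=65536,
              numerical_features=13)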
Example #2
def load_feature_spec(flags):
    if flags.dataset_type == 'synthetic_gpu' and not flags.synthetic_dataset_use_feature_spec:
        num_numerical = flags.synthetic_dataset_numerical_features
        categorical_sizes = [int(s) for s in flags.synthetic_dataset_table_sizes]
        return FeatureSpec.get_default_feature_spec(number_of_numerical_features=num_numerical,
                                                    categorical_feature_cardinalities=categorical_sizes)
    fspec_path = os.path.join(flags.dataset, flags.feature_spec)
    return FeatureSpec.from_yaml(fspec_path)
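
A hedged sketch of the two code paths in load_feature_spec. The flag names match the attributes read in the function body; argparse.Namespace merely stands in for whatever flags object the caller builds, and all concrete values are illustrative.

from argparse import Namespace

# Synthetic-GPU path: a default feature spec is built from the flag values.
synthetic_flags = Namespace(dataset_type='synthetic_gpu',
                            synthetic_dataset_use_feature_spec=False,
                            synthetic_dataset_numerical_features=13,
                            synthetic_dataset_table_sizes=['1000'] * 26)
spec = load_feature_spec(synthetic_flags)

# Any other dataset type: the spec is read from <dataset>/<feature_spec> on disk.
disk_flags = Namespace(dataset_type='binary_dataset',
                       dataset='/data/criteo_split',
                       feature_spec='feature_spec.yaml')
spec = load_feature_spec(disk_flags)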
Example #3
def main():
    args = parse_args()
    args_output = args.output
    args_input = args.input
    args_feature_spec_in = args.feature_spec_in
    args_feature_spec_out = args.feature_spec_out
    batch_size = args.chunk_size

    fspec_in_path = os.path.join(args_input, args_feature_spec_in)
    fspec_in = FeatureSpec.from_yaml(fspec_in_path)

    input_label_feature_name = fspec_in.channel_spec[LABEL_CHANNEL][0]
    input_numerical_features_list = fspec_in.channel_spec[NUMERICAL_CHANNEL]
    input_categorical_features_list = fspec_in.channel_spec[CATEGORICAL_CHANNEL]

    # Do a pass to establish the cardinalities: they influence the type we save the dataset as
    found_cardinalities = defaultdict(lambda: 0)
    for mapping_name, mapping in fspec_in.source_spec.items():
        df_iterators = []
        for chunk in mapping:
            assert chunk['type'] == 'csv', "Only csv files supported in this transcoder"
            assert len(chunk['files']) == 1, "Only one file per chunk supported in this transcoder"
            path_to_load = os.path.join(fspec_in.base_directory,
                                        chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load,
                                         header=None,
                                         chunksize=batch_size,
                                         names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)
            for feature in input_categorical_features_list:
                mapping_cardinality = mapping_df[feature].max() + 1
                previous_cardinality = found_cardinalities[feature]
                found_cardinalities[feature] = max(previous_cardinality,
                                                   mapping_cardinality)

    for feature in input_categorical_features_list:
        declared_cardinality = fspec_in.feature_spec[feature][CARDINALITY_SELECTOR]
        if declared_cardinality != 'auto':
            assert int(declared_cardinality) >= found_cardinalities[feature]
            found_cardinalities[feature] = int(declared_cardinality)

    categorical_cardinalities = [
        found_cardinalities[f] for f in input_categorical_features_list
    ]
    number_of_numerical_features = fspec_in.get_number_of_numerical_features()

    fspec_out = FeatureSpec.get_default_feature_spec(
        number_of_numerical_features=number_of_numerical_features,
        categorical_feature_cardinalities=categorical_cardinalities)
    fspec_out.base_directory = args.output

    for mapping_name, mapping in fspec_in.source_spec.items():

        # open files for outputting
        label_path, numerical_path, categorical_paths = fspec_out.get_mapping_paths(mapping_name)
        for path in [label_path, numerical_path, *categorical_paths.values()]:
            os.makedirs(os.path.dirname(path), exist_ok=True)
        output_categorical_features_list = fspec_out.get_categorical_feature_names()
        numerical_f = open(numerical_path, "ab+")
        label_f = open(label_path, "ab+")
        categorical_fs = [
            open(categorical_paths[name], "ab+")
            for name in output_categorical_features_list
        ]
        categorical_feature_types = [
            get_categorical_feature_type(card)
            for card in categorical_cardinalities
        ]

        df_iterators = []
        for chunk in mapping:
            # We checked earlier it's a single file chunk
            path_to_load = os.path.join(fspec_in.base_directory,
                                        chunk['files'][0])
            chunk_iterator = pd.read_csv(path_to_load,
                                         header=None,
                                         chunksize=batch_size,
                                         names=chunk['features'])
            df_iterators.append(chunk_iterator)

        zipped = zip(*df_iterators)
        for chunks in zipped:
            mapping_df = pd.concat(chunks, axis=1)  # this takes care of making sure feature names are unique

            # Choose the right columns
            numerical_df = mapping_df[input_numerical_features_list]
            categorical_df = mapping_df[input_categorical_features_list]
            label_df = mapping_df[[input_label_feature_name]]

            numerical = torch.tensor(numerical_df.values)
            label = torch.tensor(label_df.values)
            categorical = torch.tensor(categorical_df.values)

            # Append them to the binary files
            numerical_f.write(numerical.to(torch.float16).cpu().numpy().tobytes())
            label_f.write(label.to(torch.bool).cpu().numpy().tobytes())
            for cat_idx, cat_feature_type in enumerate(categorical_feature_types):
                categorical_fs[cat_idx].write(
                    categorical[:, cat_idx].cpu().numpy().astype(cat_feature_type).tobytes())

    feature_spec_save_path = os.path.join(args_output, args_feature_spec_out)
    fspec_out.to_yaml(output_path=feature_spec_save_path)
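
The transcoder above relies on get_categorical_feature_type to pick the smallest integer dtype that can hold each categorical feature, which is what keeps the output binaries compact. Below is a minimal sketch of such a helper, assuming a plain numpy integer ladder; the actual helper in the repository may use different candidate dtypes or thresholds.

import numpy as np

def get_categorical_feature_type(cardinality: int):
    # Return the narrowest integer dtype that can index `cardinality` categories.
    # Illustrative only: the dtype choices here are an assumption, not the repository's code.
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        if cardinality <= np.iinfo(candidate).max:
            return candidate
    raise ValueError(f"cardinality {cardinality} is too large for the supported dtypes")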