Example #1
def transform(input_path, features_path, labels_path, metadata_path,
              scaler_path):
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    input_file = open(input_path, "r")

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        # the number of values (minus the label)
        # should match the number of variables
        assert len(values) - 1 == len(VARIABLES), str(
            (len(values) - 1, len(VARIABLES)))

        # assign each column with the right mapping
        for variable, value in zip(VARIABLES, values[:-1]):
            value = float(value)
            features[sample_index,
                     metadata["value_to_index"][variable]] = value

        # the last value is the label
        labels[sample_index] = int(values[-1])

        # next row
        sample_index += 1
        # next line
        line = input_file.readline()

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)
    validate_class_distribution(metadata, NUM_SAMPLES)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
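The scale_and_save_scaler helper used by most of these examples is not shown. The following is only a minimal sketch of what it presumably does, assuming a scikit-learn scaler persisted with joblib; the real helper may differ.

import joblib
from sklearn.preprocessing import MinMaxScaler


def scale_and_save_scaler(features, scaler_path):
    # fit a scaler on the feature matrix and rescale it
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)
    # persist the fitted scaler so the same transformation can be reapplied later
    joblib.dump(scaler, scaler_path)
    return features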
Example #2
def transform(input_path, features_path, labels_path, metadata_path,
              scaler_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    variables = set(reader.fieldnames)
    variables.remove("ID")
    variables.remove("default payment next month")

    metadata = create_metadata(variables, TYPES, VALUES, NUM_SAMPLES, CLASSES)

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    for row in reader:
        for variable in metadata["variables"]:
            value = row[variable]
            if TYPES[variable] == "numerical":
                value = float(value)
                features[sample_index,
                         metadata["value_to_index"][variable]] = value
            elif TYPES[variable] == "categorical":
                value = int(float(value))
                assert value in ORIGINAL_ENCODING_TO_VALUES[variable], \
                    "'{}' is not a valid value for '{}'".format(value, variable)
                value = ORIGINAL_ENCODING_TO_VALUES[variable][value]
                features[sample_index,
                         metadata["value_to_index"][variable][value]] = 1.0

        # the class needs to be transformed
        labels[sample_index] = int(row["default payment next month"].replace(
            ".0", ""))

        # next row
        sample_index += 1

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
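The examples index into metadata["value_to_index"], which is built by create_metadata and not shown here. The assumed shape, illustrated with hypothetical variable names: a numerical variable maps to a single column index, while a categorical variable maps each of its values to its own one-hot column.

# illustration only; the real mapping is built by create_metadata
value_to_index = {
    "age": 0,                                     # numerical variable -> one column
    "color": {"blue": 1, "green": 2, "red": 3},   # categorical variable -> one column per value
}

# numerical: write the value directly into its column
# features[i, value_to_index["age"]] = 37.0
# categorical: set the column of the observed value to 1.0 (one-hot)
# features[i, value_to_index["color"]["green"]] = 1.0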
Example #3
def transform(input_path, output_path, metadata_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    # read the variables from the header
    variables = sorted(reader.fieldnames)
    # but ignore the ID
    variables.remove("caseid")

    # first read everything to count the values per variable
    categorical_values = {}
    for variable in variables:
        categorical_values[variable] = set()

    for row_number, row in enumerate(reader):
        for variable in variables:
            value = row[variable]
            categorical_values[variable].add(value)

    # now create the metadata
    metadata = create_metadata(
        variables, create_one_type_dictionary("categorical", variables),
        categorical_values, NUM_SAMPLES)

    # go back to the beginning
    input_file.seek(0)
    # the reader needs to be re-initialized
    # using tell() after reading the header does not work while the reader is iterating
    reader = csv.DictReader(input_file)

    # now fill the feature matrix with the right encoding
    ones = []
    rows = []
    cols = []

    for row_number, row in enumerate(reader):
        for variable in variables:
            value = row[variable]
            feature_number = metadata["value_to_index"][variable][value]

            ones.append(1)
            rows.append(row_number)
            cols.append(feature_number)

    output = csr_matrix(
        (ones, (rows, cols)),
        shape=(metadata["num_samples"], metadata["num_features"]),
        dtype=np.uint8)

    save_npz(output_path, output)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
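A short usage sketch for the outputs written by this example (file names are placeholders): the sparse one-hot matrix can be reloaded with scipy and checked against the stored metadata.

import json

from scipy.sparse import load_npz

features = load_npz("features.npz")
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# the reloaded matrix should match the dimensions recorded in the metadata
assert features.shape == (metadata["num_samples"], metadata["num_features"])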
Example #4
def transform(input_path, features_path, labels_path, metadata_path,
              scaler_path):
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    input_file = open(input_path, "r")

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str(
            (len(values) - 1, len(VARIABLES)))

        # the remaining values (after the label) are the features
        for feature_index, value in enumerate(values[1:]):
            value = float(value)
            features[sample_index, feature_index] = value

        # the first value is the label
        labels[sample_index] = CLASS_TO_INDEX[values[0]]

        # next line
        line = input_file.readline()
        # next row
        sample_index += 1

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)
    validate_class_distribution(metadata, NUM_SAMPLES)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
Example #5
def transform(input_path, features_path, labels_path, metadata_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file, fieldnames=VARIABLES + ["class"])

    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("categorical", VARIABLES),
        VALUES, NUM_SAMPLES, CLASSES)

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.uint8)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    for row in reader:
        labels[sample_index] = CLASS_TO_INDEX[row["class"]]

        for variable in VARIABLES:
            value = row[variable]
            assert value in VALUES[
                variable], "'{}' is not a valid value for '{}'".format(
                    value, variable)
            feature_index = metadata["value_to_index"][variable][value]
            features[sample_index, feature_index] = 1

        # next row
        sample_index += 1

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
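The dense examples persist their outputs with np.save and json.dump; reloading them is symmetric (paths are placeholders).

import json

import numpy as np

features = np.load("features.npy")
labels = np.load("labels.npy")
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# features and labels are aligned row by row
assert features.shape[0] == labels.shape[0] == metadata["num_samples"]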
Example #6
def transform(train_path, test_path, features_path, labels_path, metadata_path,
              ignore_missing, scaler_path):

    num_samples_train = NUM_SAMPLES[ignore_missing]["train"]
    num_samples_test = NUM_SAMPLES[ignore_missing]["test"]

    metadata = create_metadata(VARIABLES, TYPES, VALUES,
                               num_samples_train + num_samples_test, CLASSES)

    # transform train
    train_file = open(train_path, "r")
    features_train, labels_train = adult_transform_file(
        train_file, num_samples_train, metadata["num_features"],
        metadata["value_to_index"], ignore_missing)
    train_file.close()

    # transform test
    test_file = open(test_path, "r")
    test_file.readline()  # has an extra first line
    features_test, labels_test = adult_transform_file(
        test_file, num_samples_test, metadata["num_features"],
        metadata["value_to_index"], ignore_missing)
    test_file.close()

    # concatenate train and test
    features = np.concatenate((features_train, features_test))
    labels = np.concatenate((labels_train, labels_test))

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # save
    np.save(features_path, features)
    np.save(labels_path, labels)

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
Example #7
def transform(input_path, features_path, labels_path, metadata_path,
              ignore_missing, scaler_path):
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES[ignore_missing]), CLASSES)

    input_file = open(input_path, "r")

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        missing_values = False
        for value in values:
            if value == "?" or value == "":
                missing_values = True
                break

        # if there are missing values and the ignore-missing flag is set, skip the row
        if not missing_values or not ignore_missing:
            # the number of values (minus the ID and the label)
            # should match the number of variables
            assert len(values) - 2 == len(VARIABLES), str(
                (len(values), len(VARIABLES)))

            # assign each column with the right mapping
            # (skip the ID and the label)
            for variable, value in zip(VARIABLES, values[1:-1]):
                if value == "?" or value == "":
                    value = np.nan
                else:
                    value = float(value)
                features[sample_index,
                         metadata["value_to_index"][variable]] = value

            # the last value is the label
            labels[sample_index] = CLASS_TO_INDEX[values[-1]]

            # next row
            sample_index += 1

        # next line
        line = input_file.readline()

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)
    validate_class_distribution(metadata, NUM_SAMPLES[ignore_missing])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
Example #8
def transform(input_path, features_path, labels_path, metadata_path, scaler_path):
    metadata = create_metadata(VARIABLES, TYPES, VALUES, sum(NUM_SAMPLES), CLASSES)

    # The raw data is already nicely one-hot encoded, but we need to follow the layout defined by our metadata.
    # The most important difference is that the metadata puts the categorical features at the beginning.
    # It also sorts the categorical values alphabetically.
    # Because of this, we need to map the old feature indices to the new ones.
    old_to_new_feature_indices = {}
    old_feature_index = 0
    for variable in VARIABLES:
        if TYPES[variable] == "numerical":
            old_to_new_feature_indices[old_feature_index] = metadata["value_to_index"][variable]
            old_feature_index += 1
        elif TYPES[variable] == "categorical":
            for value in VALUES[variable]:
                old_to_new_feature_indices[old_feature_index] = metadata["value_to_index"][variable][value]
                old_feature_index += 1
        else:
            raise Exception("Invalid type.")

    input_file = open(input_path, "r")

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        # transform the original 1-based class numbers into 0-based labels
        class_number = int(values[-1]) - 1

        if sample_index < metadata["num_samples"]:
            # the categorical variables are already one hot encoded
            for old_feature_index, value in enumerate(values[:-1]):
                new_feature_index = old_to_new_feature_indices[old_feature_index]
                value = float(value)
                features[sample_index, new_feature_index] = value

            # the class needs to be transformed
            labels[sample_index] = class_number

        # next line
        line = input_file.readline()
        # next row
        sample_index += 1

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)
    validate_class_distribution(metadata, NUM_SAMPLES)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
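The remapping built at the top of this example only reorders columns: each original column index points to the column that the same value occupies under the metadata layout. A tiny hypothetical illustration, assuming one numerical variable followed by a categorical one in the raw file, while the metadata puts the categorical columns first:

# raw column order:      [age, color=blue, color=green, color=red]
# metadata column order: [color=blue, color=green, color=red, age]
old_to_new_feature_indices = {
    0: 3,  # age moves to the last column
    1: 0,  # color=blue
    2: 1,  # color=green
    3: 2,  # color=red
}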
Example #9
def transform(input_path, features_path, labels_path, metadata_path,
              scaler_path):
    metadata = create_metadata(VARIABLES, TYPES, VALUES, NUM_SAMPLES)

    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    reader.fieldnames = [variable.strip() for variable in reader.fieldnames]

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.float32)

    # transform
    sample_index = 0
    for row in reader:
        for variable in metadata["variables"]:
            # numerical variable
            if TYPES[variable] == "numerical":
                value = float(row[variable])
                features[sample_index,
                         metadata["value_to_index"][variable]] = value

            # categorical variable
            elif TYPES[variable] == "categorical":
                value = None

                # check all possible values
                for possible_value in VALUES[variable]:
                    # skip the special "none" placeholder value
                    if possible_value == "none":
                        continue

                    # check whether the one-hot flag for this possible value is set
                    real_variable = "{}{}{}".format(
                        variable, ONE_HOT_ENCODING_SEPARATOR, possible_value)
                    if read_binary(row[real_variable]) == 1:
                        # if the categorical variable had no value set
                        if value is None:
                            value = possible_value
                        # the categorical variable already had a value set
                        else:
                            raise Exception(
                                "'{}' was already defined".format(variable))

                # if no binary flag was set for the categorical variable
                if value is None:
                    # if it is possible to have no value
                    if "none" in VALUES[variable]:
                        value = "none"
                    # if it is not possible to have no value but there is no value
                    else:
                        raise Exception(
                            "'{}' has no valid value".format(variable))

                # set the categorical variable flag in the mapped feature
                features[sample_index,
                         metadata["value_to_index"][variable][value]] = 1.0

            # binary variable
            elif TYPES[variable] == "binary":
                value = read_binary(row[variable])
                assert value in [
                    0, 1
                ], "'{}' is not a valid value for '{}'".format(
                    value, variable)
                features[sample_index,
                         metadata["value_to_index"][variable]] = value

            # unknown variable type
            else:
                raise Exception("Unknown variable type.")

        # the target is the number of shares
        labels[sample_index] = float(row["shares"])

        # next row
        sample_index += 1

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
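The read_binary helper used by the last example is also not shown. A minimal sketch of the assumed behaviour, tolerant of flags stored as floats; the real helper may differ.

def read_binary(raw_value):
    # accept values like "1", "1.0" or " 0 " and return an integer flag
    return int(float(raw_value.strip()))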