import csv
import json

import numpy as np
from scipy.sparse import csr_matrix, save_npz


def transform(input_path, features_path, labels_path, metadata_path, scaler_path):
    metadata = create_metadata(VARIABLES,
                               create_one_type_dictionary("numerical", VARIABLES),
                               {},
                               sum(NUM_SAMPLES),
                               CLASSES)

    input_file = open(input_path, "r")

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        # the number of values (minus the label)
        # should match the number of variables
        assert len(values) - 1 == len(VARIABLES), str((len(values) - 1, len(VARIABLES)))

        # assign each column with the right mapping
        for variable, value in zip(VARIABLES, values[:-1]):
            value = float(value)
            features[sample_index, metadata["value_to_index"][variable]] = value

        # the last value is the label
        labels[sample_index] = int(values[-1])

        # next row
        sample_index += 1

        # next line
        line = input_file.readline()

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)
    validate_class_distribution(metadata, NUM_SAMPLES)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
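# NOTE: the dataset-specific constants (VARIABLES, TYPES, VALUES, NUM_SAMPLES, CLASSES,
# CLASS_TO_INDEX, ...) and the shared helpers (create_metadata, create_one_type_dictionary,
# scale_and_save_scaler, update_feature_distributions, update_class_distribution,
# validate_num_samples, validate_class_distribution, ...) are assumed to be defined in the
# surrounding modules and are not shown here. Judging only from how these functions index
# the metadata dictionary, it presumably looks roughly like the sketch below; the variable
# names and numbers are made up for illustration and are not the repo's actual values.
example_metadata = {
    "variables": ["color", "age"],          # hypothetical variables
    "value_to_index": {
        "color": {"blue": 0, "red": 1},     # one column per categorical value
        "age": 2,                           # a single column for a numerical variable
    },
    "num_features": 3,
    "num_samples": 1000,
}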
def transform(input_path, features_path, labels_path, metadata_path, scaler_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    # keep a deterministic variable order
    # (drop the ID and the target column)
    variables = list(reader.fieldnames)
    variables.remove("ID")
    variables.remove("default payment next month")

    metadata = create_metadata(variables, TYPES, VALUES, NUM_SAMPLES, CLASSES)

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    for row in reader:
        for variable in metadata["variables"]:
            value = row[variable]

            if TYPES[variable] == "numerical":
                value = float(value)
                features[sample_index, metadata["value_to_index"][variable]] = value
            elif TYPES[variable] == "categorical":
                value = int(float(value))
                assert value in ORIGINAL_ENCODING_TO_VALUES[variable], \
                    "'{}' is not a valid value for '{}'".format(value, variable)
                value = ORIGINAL_ENCODING_TO_VALUES[variable][value]
                features[sample_index, metadata["value_to_index"][variable][value]] = 1.0

        # the class needs to be transformed
        labels[sample_index] = int(row["default payment next month"].replace(".0", ""))

        # next row
        sample_index += 1

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
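# ORIGINAL_ENCODING_TO_VALUES is defined elsewhere in the module; from its usage above it
# presumably maps the raw integer codes of each categorical column to readable value names,
# which are then used as keys into value_to_index for the one-hot columns. A hypothetical
# entry is sketched below; the real mapping in the repo may differ.
EXAMPLE_ORIGINAL_ENCODING_TO_VALUES = {
    "SEX": {1: "male", 2: "female"},
    "MARRIAGE": {1: "married", 2: "single", 3: "other"},
}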
def transform(input_path, output_path, metadata_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    # read the variables from the header
    variables = sorted(reader.fieldnames)
    # but ignore the ID
    variables.remove("caseid")

    # first read everything to count the values per variable
    categorical_values = {}
    for variable in variables:
        categorical_values[variable] = set()

    for row_number, row in enumerate(reader):
        for variable in variables:
            value = row[variable]
            categorical_values[variable].add(value)

    # now create the metadata
    metadata = create_metadata(variables,
                               create_one_type_dictionary("categorical", variables),
                               categorical_values,
                               NUM_SAMPLES)

    # go back to the beginning
    input_file.seek(0)

    # the reader needs to be re-initialized
    # using tell after reading the headers does not work during the reader iteration
    reader = csv.DictReader(input_file)

    # now fill the feature matrix with the right encoding
    ones = []
    rows = []
    cols = []
    for row_number, row in enumerate(reader):
        for variable in variables:
            value = row[variable]
            feature_number = metadata["value_to_index"][variable][value]
            ones.append(1)
            rows.append(row_number)
            cols.append(feature_number)

    output = csr_matrix((ones, (rows, cols)),
                        shape=(metadata["num_samples"], metadata["num_features"]),
                        dtype=np.uint8)

    save_npz(output_path, output)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
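# A minimal sketch of the sparse construction used above: scipy's csr_matrix accepts
# (data, (row_indices, column_indices)) triplets, so appending a single 1 per
# (row, one-hot column) pair builds the full one-hot matrix without a dense intermediate.
# The indices below are made up for illustration.
example_rows = [0, 0, 1, 1]   # two samples, two variables each
example_cols = [0, 3, 1, 2]   # one-hot column chosen per (variable, value)
example_ones = [1, 1, 1, 1]
example_output = csr_matrix((example_ones, (example_rows, example_cols)),
                            shape=(2, 4), dtype=np.uint8)
print(example_output.toarray())
# [[1 0 0 1]
#  [0 1 1 0]]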
def transform(input_path, features_path, labels_path, metadata_path, scaler_path):
    metadata = create_metadata(VARIABLES,
                               create_one_type_dictionary("numerical", VARIABLES),
                               {},
                               sum(NUM_SAMPLES),
                               CLASSES)

    input_file = open(input_path, "r")

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str((len(values) - 1, len(VARIABLES)))

        for feature_index, value in enumerate(values[1:]):
            value = float(value)
            features[sample_index, feature_index] = value

        labels[sample_index] = CLASS_TO_INDEX[values[0]]

        # next line
        line = input_file.readline()

        # next row
        sample_index += 1

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)
    validate_class_distribution(metadata, NUM_SAMPLES)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def transform(input_path, features_path, labels_path, metadata_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file, fieldnames=VARIABLES + ["class"])

    metadata = create_metadata(VARIABLES,
                               create_one_type_dictionary("categorical", VARIABLES),
                               VALUES,
                               NUM_SAMPLES,
                               CLASSES)

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.uint8)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    for row in reader:
        labels[sample_index] = CLASS_TO_INDEX[row["class"]]

        for variable in VARIABLES:
            value = row[variable]

            assert value in VALUES[variable], \
                "'{}' is not a valid value for '{}'".format(value, variable)

            feature_index = metadata["value_to_index"][variable][value]
            features[sample_index, feature_index] = 1

        # next row
        sample_index += 1

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def transform(train_path, test_path, features_path, labels_path, metadata_path,
              ignore_missing, scaler_path):
    num_samples_train = NUM_SAMPLES[ignore_missing]["train"]
    num_samples_test = NUM_SAMPLES[ignore_missing]["test"]

    metadata = create_metadata(VARIABLES, TYPES, VALUES,
                               num_samples_train + num_samples_test, CLASSES)

    # transform train
    train_file = open(train_path, "r")
    features_train, labels_train = adult_transform_file(
        train_file, num_samples_train, metadata["num_features"],
        metadata["value_to_index"], ignore_missing)
    train_file.close()

    # transform test
    test_file = open(test_path, "r")
    test_file.readline()  # has an extra first line
    features_test, labels_test = adult_transform_file(
        test_file, num_samples_test, metadata["num_features"],
        metadata["value_to_index"], ignore_missing)
    test_file.close()

    # concatenate train and test
    features = np.concatenate((features_train, features_test))
    labels = np.concatenate((labels_train, labels_test))

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # save
    np.save(features_path, features)
    np.save(labels_path, labels)

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def transform(input_path, features_path, labels_path, metadata_path, ignore_missing, scaler_path):
    metadata = create_metadata(VARIABLES,
                               create_one_type_dictionary("numerical", VARIABLES),
                               {},
                               sum(NUM_SAMPLES[ignore_missing]),
                               CLASSES)

    input_file = open(input_path, "r")

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        missing_values = False
        for value in values:
            if value == "?" or value == "":
                missing_values = True
                break

        # if there are missing values and the ignore missing flag is set, skip the row
        if not missing_values or not ignore_missing:
            # the number of values (minus the ID and the label)
            # should match the number of variables
            assert len(values) - 2 == len(VARIABLES), str((len(values), len(VARIABLES)))

            # assign each column with the right mapping
            # (skip the ID and the label)
            for variable, value in zip(VARIABLES, values[1:-1]):
                if value == "?" or value == "":
                    value = np.nan
                else:
                    value = float(value)
                features[sample_index, metadata["value_to_index"][variable]] = value

            # the last value is the label
            labels[sample_index] = CLASS_TO_INDEX[values[-1]]

            # next row
            sample_index += 1

        # next line
        line = input_file.readline()

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)
    validate_class_distribution(metadata, NUM_SAMPLES[ignore_missing])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
def transform(input_path, features_path, labels_path, metadata_path, scaler_path):
    metadata = create_metadata(VARIABLES, TYPES, VALUES, sum(NUM_SAMPLES), CLASSES)

    # The raw data is already one-hot encoded, but we need to follow the standard of our
    # metadata: the categorical features go at the beginning, and the categorical values
    # are sorted alphabetically. Because of this, we need to map the original feature
    # indices to the new ones.
    old_to_new_feature_indices = {}
    old_feature_index = 0
    for variable in VARIABLES:
        if TYPES[variable] == "numerical":
            old_to_new_feature_indices[old_feature_index] = metadata["value_to_index"][variable]
            old_feature_index += 1
        elif TYPES[variable] == "categorical":
            for value in VALUES[variable]:
                old_to_new_feature_indices[old_feature_index] = metadata["value_to_index"][variable][value]
                old_feature_index += 1
        else:
            raise Exception("Invalid type.")

    input_file = open(input_path, "r")

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    sample_index = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        # transform original class numbers to 0-indexed labels
        class_number = int(values[-1]) - 1

        if sample_index < metadata["num_samples"]:
            # the categorical variables are already one-hot encoded
            for old_feature_index, value in enumerate(values[:-1]):
                new_feature_index = old_to_new_feature_indices[old_feature_index]
                value = float(value)
                features[sample_index, new_feature_index] = value

            # the class needs to be transformed
            labels[sample_index] = class_number

        # next line
        line = input_file.readline()

        # next row
        sample_index += 1

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)
    update_class_distribution(metadata, labels)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)
    validate_class_distribution(metadata, NUM_SAMPLES)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
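# A tiny worked illustration of the remapping above, with made-up column names: if the raw
# columns were [elevation (numerical), soil=A, soil=B] and the metadata puts the categorical
# columns first (values sorted alphabetically), the mapping would come out as:
example_old_to_new_feature_indices = {
    0: 2,  # elevation: numerical, so it moves after the categorical block
    1: 0,  # soil=A
    2: 1,  # soil=B
}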
def transform(input_path, features_path, labels_path, metadata_path, scaler_path):
    metadata = create_metadata(VARIABLES, TYPES, VALUES, NUM_SAMPLES)

    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)
    reader.fieldnames = [variable.strip() for variable in reader.fieldnames]

    # initialize outputs
    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.float32)

    # transform
    sample_index = 0
    for row in reader:
        for variable in metadata["variables"]:
            # numerical variable
            if TYPES[variable] == "numerical":
                value = float(row[variable])
                features[sample_index, metadata["value_to_index"][variable]] = value

            # categorical variable
            elif TYPES[variable] == "categorical":
                value = None
                # check all possible values
                for possible_value in VALUES[variable]:
                    # skip the "none" possible value
                    if possible_value == "none":
                        continue
                    # if the possible value binary flag is set
                    real_variable = "{}{}{}".format(variable, ONE_HOT_ENCODING_SEPARATOR, possible_value)
                    if read_binary(row[real_variable]) == 1:
                        # if the categorical variable had no value set
                        if value is None:
                            value = possible_value
                        # the categorical variable already had a value set
                        else:
                            raise Exception("'{}' was already defined".format(variable))

                # if no binary flag was set for the categorical variable
                if value is None:
                    # if it is possible to have no value
                    if "none" in VALUES[variable]:
                        value = "none"
                    # if it is not possible to have no value but there is no value
                    else:
                        raise Exception("'{}' has no valid value".format(variable))

                # set the categorical variable flag in the mapped feature
                features[sample_index, metadata["value_to_index"][variable][value]] = 1.0

            # binary variable
            elif TYPES[variable] == "binary":
                value = read_binary(row[variable])
                assert value in [0, 1], "'{}' is not a valid value for '{}'".format(value, variable)
                features[sample_index, metadata["value_to_index"][variable]] = value

            # unknown variable type
            else:
                raise Exception("Unknown variable type.")

        # label
        labels[sample_index] = float(row["shares"])

        # next row
        sample_index += 1

    # scale
    if scaler_path is not None:
        features = scale_and_save_scaler(features, scaler_path)

    # add distributions to the metadata
    update_feature_distributions(metadata, features)

    # validate the known distributions
    validate_num_samples(metadata, sample_index)

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
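# read_binary is not shown in this excerpt; below is a minimal sketch of what it presumably
# does, assuming the raw CSV stores the one-hot and binary flags as float-formatted strings
# such as "0.0" / "1.0" (an assumption, not the repo's actual implementation):
def read_binary(raw_value):
    # strip surrounding whitespace and collapse "1.0"-style strings into 0/1 integers
    return int(float(raw_value.strip()))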