# The raw pickup/dropoff datetime columns are no longer needed: the granular
# time features (hour, minute, second) derived earlier are what the model
# trains on, so drop the originals here.
_datetime_cols = ["pickup_datetime", "dropoff_datetime"]
processed_df = transformed_features_df.drop_columns(columns=_datetime_cols)

# Let the dataflow's type-inference builder examine every column and propose
# a data type, then bake the learned conversions into a new dataflow.
# (The inferred types were inspected and look correct for this data set.)
builder = processed_df.builders.set_column_types()
builder.learn()
type_converted_df = builder.to_dataflow()

# Final data-quality filters before packaging the dataflow: records captured
# with a zero (or negative) cost or distance are incorrectly captured data
# points — major outliers that throw off prediction accuracy — so keep only
# rows where both variables are greater than zero.
final_df = (type_converted_df
            .filter(dprep.col("distance") > 0)
            .filter(dprep.col("cost") > 0))

# Persist the final dataflow as CSV for the training steps that follow.
# Skipped entirely when no output path was supplied on the command line.
if args.output_transform is not None:
    os.makedirs(args.output_transform, exist_ok=True)
    print("%s created" % args.output_transform)
    write_df = final_df.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_transform))
    # write_to_csv is lazy; run_local() actually executes the write here.
    write_df.run_local()
# --- Example 2: diabetes data-prep script ---
# Command-line interface for the data-prep step: where to read the raw data
# file and where to write the prepped output. Unrecognized arguments are
# tolerated (a pipeline runner may pass extras) but echoed for debugging.
parser = argparse.ArgumentParser("prep_data")
parser.add_argument("--input_file", type=str, help="input raw data file")
parser.add_argument("--output_path", type=str, help="output prepped data path")

args, unknown = parser.parse_known_args()
if unknown:
    print("Unknown args:")
    print(unknown)

print("Argument 1 (input training data file): %s" % args.input_file)
# Note: the original message was missing the ":" present in the line above.
print("Argument 2 (output prepped training data path): %s" % args.output_path)

# Load the raw CSV, then prepare it for training:
#   * drop 'skin' — same as thickness, just in another unit (inches/cm)
#   * map the textual 'diabetes' flag TRUE/FALSE to "1"/"0"
#   * convert the 'diabetes' column to an integer type
input_file = dprep.read_csv(args.input_file)

prepped_data = input_file.drop_columns(columns='skin')
prepped_data = prepped_data.replace(
    columns='diabetes', find="TRUE", replace_with="1")
prepped_data = prepped_data.replace(
    columns='diabetes', find="FALSE", replace_with="0")
prepped_data = prepped_data.set_column_types(
    type_conversions={
        'diabetes': dprep.TypeConverter(data_type=dprep.FieldType.INTEGER)
    })

# Persist the prepped dataflow as CSV when an output path was provided.
if args.output_path is not None:
    os.makedirs(args.output_path, exist_ok=True)
    print("%s created" % args.output_path)
    write_df = prepped_data.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_path))
    # write_to_csv is lazy; run_local() actually executes the write here.
    write_df.run_local()
# --- Example 3: featurization script (its opening lines are truncated) ---
                    help="input featurization")
# Remaining CLI arguments for the featurization step. NOTE(review): `parser`
# is created earlier in the original script, outside this excerpt.
parser.add_argument("--useful_columns", type=str, help="columns to use")
parser.add_argument("--output_featurization",
                    type=str,
                    help="output featurization")

args = parser.parse_args()

print("Argument 1(input training data path): %s" % args.input_featurization)
# useful_columns arrives as a stringified list whose entries are delimited by
# a literal backslash-semicolon sequence. Use a raw string r"\;" — the bare
# "\;" relied on Python preserving an invalid escape sequence, which emits a
# SyntaxWarning on modern interpreters; the runtime value is identical.
print("Argument 2(column features to use): %s" %
      str(args.useful_columns.strip("[]").split(r"\;")))
print("Argument 3:(output featurized training data path) %s" %
      args.output_featurization)

# Read every CSV part produced by the previous pipeline step.
dflow_prepared = dprep.read_csv(args.input_featurization + '/part-*')

# Keep only the caller-selected feature columns for training.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail
#
# useful_columns is a stringified list delimited by a literal backslash-
# semicolon sequence; strip the surrounding brackets, split on that delimiter,
# then strip whitespace and quotes from each name. r"\;" replaces the bare
# "\;", which is an invalid escape sequence (SyntaxWarning on modern Python)
# with the same runtime value.
useful_columns = [
    s.strip().strip("'") for s in args.useful_columns.strip("[]").split(r"\;")
]
dflow = dflow_prepared.keep_columns(useful_columns)

# Persist the featurized dataflow as CSV when an output path was provided.
if args.output_featurization is not None:
    os.makedirs(args.output_featurization, exist_ok=True)
    print("%s created" % args.output_featurization)
    write_df = dflow.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_featurization))
    # write_to_csv is lazy; run_local() actually executes the write here.
    write_df.run_local()
# Replace undefined/mis-captured values with meaningful defaults.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details

# store_forward: a captured "0" stands for "N", and nulls default to "N".
replaced_stfor_vals_df = (combined_converted_df
                          .replace(columns="store_forward",
                                   find="0",
                                   replace_with="N")
                          .fill_nulls("store_forward", "N"))

# distance: a captured ".00" stands for 0, nulls default to 0, and the
# column is then coerced to a numeric type.
replaced_distance_vals_df = (replaced_stfor_vals_df
                             .replace(columns="distance",
                                      find=".00",
                                      replace_with=0)
                             .fill_nulls("distance", 0)
                             .to_number(["distance"]))

# Split each pickup/dropoff datetime value into its date and time parts.
# split_column_by_example derives two new columns per source column
# (suffixed _1 and _2, as the rename mapping below shows).
time_split_df = replaced_distance_vals_df.split_column_by_example(
    source_column="pickup_datetime")
time_split_df = time_split_df.split_column_by_example(
    source_column="dropoff_datetime")

# Give the derived date/time halves meaningful names.
renamed_col_df = time_split_df.rename_columns(column_pairs={
    "pickup_datetime_1": "pickup_date",
    "pickup_datetime_2": "pickup_time",
    "dropoff_datetime_1": "dropoff_date",
    "dropoff_datetime_2": "dropoff_time",
})

# Persist the normalized dataflow as CSV when an output path was provided.
if args.output_normalize is not None:
    os.makedirs(args.output_normalize, exist_ok=True)
    print("%s created" % args.output_normalize)
    write_df = renamed_col_df.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_normalize))
    # write_to_csv is lazy; run_local() actually executes the write here.
    write_df.run_local()