print("Argument 1(input taxi data path): %s" % args.input_cleanse)
print("Argument 2(columns to keep): %s" % str(args.useful_columns.strip("[]").split(r"\;")))
print("Argument 3(columns renaming mapping): %s" % str(args.columns.strip("{}").split(r"\;")))
print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse)

raw_df = dprep.read_csv(path=args.input_cleanse, header=dprep.PromoteHeadersMode.GROUPED)

# These functions ensure that null data is removed from the data set,
# which will help increase machine learning model accuracy.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep
# for more details.
useful_columns = [
    s.strip().strip("'") for s in args.useful_columns.strip("[]").split(r"\;")
]
columns = get_dict(args.columns)

all_columns = dprep.ColumnSelector(term=".*", use_regex=True)
drop_if_all_null = [all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)]

new_df = (raw_df
          .replace_na(columns=all_columns)
          .drop_nulls(*drop_if_all_null)
          .rename_columns(column_pairs=columns)
          .keep_columns(columns=useful_columns))

if args.output_cleanse is not None:
    os.makedirs(args.output_cleanse, exist_ok=True)
    print("%s created" % args.output_cleanse)
    write_df = new_df.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_cleanse))
    write_df.run_local()
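# The cleanse step above calls a get_dict helper that is not shown in this
# section. Below is a minimal sketch of what it plausibly does, assuming
# args.columns arrives as a brace-wrapped, backslash-semicolon-separated
# mapping such as "{'vendor_id': 'vendor'\;'trip_distance': 'distance'}";
# the exact wire format is an assumption based on how useful_columns is
# parsed above.
def get_dict(dict_str):
    new_dict = {}
    for pair in dict_str.strip("{}").split(r"\;"):
        key, value = pair.split(":", 1)  # hypothetical 'key: value' layout
        new_dict[key.strip().strip("'")] = value.strip().strip("'")
    return new_dict

# Round trip for the same escaping, runnable standalone; the column names
# here are illustrative only:
raw_arg = "['vendor'\;'pickup_weekday'\;'cost']"
parsed = [s.strip().strip("'") for s in raw_arg.strip("[]").split(r"\;")]
assert parsed == ["vendor", "pickup_weekday", "cost"]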
# Convert the pickup and dropoff coordinates to decimal types so they can
# be compared against the numeric city bounds below.
decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL)
combined_df = combined_df.set_column_types(type_conversions={
    "pickup_longitude": decimal_type,
    "pickup_latitude": decimal_type,
    "dropoff_longitude": decimal_type,
    "dropoff_latitude": decimal_type
})

# Filter out coordinates for locations that are outside the city border.
# Chain the column filter commands within the filter() function
# and define the minimum and maximum bounds for each field.
latlong_filtered_df = (combined_df
                       .drop_nulls(
                           columns=["pickup_longitude", "pickup_latitude",
                                    "dropoff_longitude", "dropoff_latitude"],
                           column_relationship=dprep.ColumnRelationship(
                               dprep.ColumnRelationship.ANY))
                       .filter(dprep.f_and(
                           dprep.col("pickup_longitude") <= -73.72,
                           dprep.col("pickup_longitude") >= -74.09,
                           dprep.col("pickup_latitude") <= 40.88,
                           dprep.col("pickup_latitude") >= 40.53,
                           dprep.col("dropoff_longitude") <= -73.72,
                           dprep.col("dropoff_longitude") >= -74.09,
                           dprep.col("dropoff_latitude") <= 40.88,
                           dprep.col("dropoff_latitude") >= 40.53)))

if args.output_filter is not None:
    os.makedirs(args.output_filter, exist_ok=True)
    print("%s created" % args.output_filter)
    write_df = latlong_filtered_df.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_filter))
    write_df.run_local()
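# Neither script defines its argument parser in this section. Below is a
# minimal sketch for the filter step, assuming an --input_filter argument
# that mirrors --output_filter; only output_filter appears in the code
# above, so the input argument name is an assumption.
import argparse

parser = argparse.ArgumentParser("filter")
parser.add_argument("--input_filter", type=str,
                    help="combined taxi data to filter")
parser.add_argument("--output_filter", type=str,
                    help="directory for the filtered taxi data")
args = parser.parse_args()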
print("Argument 1(input taxi data path): %s" % args.input_cleanse) print("Argument 2(columns to keep): %s" % str(args.useful_columns.strip("[]").split("\;"))) print("Argument 3(columns renaming mapping): %s" % str(args.columns.strip("{}").split("\;"))) print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse) raw_df = dprep.read_csv(path=args.input_cleanse, header=dprep.PromoteHeadersMode.GROUPED) # These functions ensure that null data is removed from the data set, # which will help increase machine learning model accuracy. # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep # for more details useful_columns = [s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;")] columns = get_dict(args.columns) all_columns = dprep.ColumnSelector(term=".*", use_regex=True) drop_if_all_null = [all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)] new_df = (raw_df .replace_na(columns=all_columns) .drop_nulls(*drop_if_all_null) .rename_columns(column_pairs=columns) .keep_columns(columns=useful_columns)) if not (args.output_cleanse is None): os.makedirs(args.output_cleanse, exist_ok=True) print("%s created" % args.output_cleanse) write_df = new_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_cleanse)) write_df.run_local()