import argparse
import os

import azureml.dataprep as dprep

parser = argparse.ArgumentParser("prep_data")
parser.add_argument("--input_file", type=str, help="input raw data file")
parser.add_argument("--output_path", type=str, help="output prepped data path")
args, unknown = parser.parse_known_args()
if unknown:
    print("Unknown args:")
    print(unknown)

print("Argument 1 (input training data file): %s" % args.input_file)
print("Argument 2 (output prepped training data path): %s" % args.output_path)

input_file = dprep.read_csv(args.input_file)
prepped_data = (
    input_file
    .drop_columns(columns='skin')  # skin is the same as thickness in another unit (inches/cm)
    .replace(columns='diabetes', find="TRUE", replace_with="1")
    .replace(columns='diabetes', find="FALSE", replace_with="0")
    .set_column_types(type_conversions={
        'diabetes': dprep.TypeConverter(data_type=dprep.FieldType.INTEGER)
    }))

if args.output_path is not None:
    os.makedirs(args.output_path, exist_ok=True)
    print("%s created" % args.output_path)

write_df = prepped_data.write_to_csv(
    directory_path=dprep.LocalFileOutput(args.output_path))
write_df.run_local()
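As a quick sanity check after the script runs, the prepped output can be read back with the same dataprep API. The sketch below is not part of the original script; the file names and paths are placeholders.

import azureml.dataprep as dprep

# Hypothetical invocation assumed for this check:
#   python prep_data.py --input_file ./data/diabetes.csv --output_path ./outputs/prepped
prepped = dprep.read_csv('./outputs/prepped/part-*')
print(prepped.head(5))        # first rows; the 'skin' column should be gone
print(prepped.get_profile())  # 'diabetes' should now profile as an integer field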
parser.add_argument("--output_filter", type=str,
                    help="filter out out-of-city locations")
args = parser.parse_args()

print("Argument 1 (input taxi data path): %s" % args.input_filter)
print("Argument 2 (output filtered taxi data path): %s" % args.output_filter)

combined_df = dprep.read_csv(args.input_filter + '/part-*')

# These steps filter out coordinates for locations that are outside the city border.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details.

# Create a condensed view of the dataflow showing just the lat/long fields,
# which makes it easier to evaluate missing or out-of-scope coordinates.
decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL)
combined_df = combined_df.set_column_types(type_conversions={
    "pickup_longitude": decimal_type,
    "pickup_latitude": decimal_type,
    "dropoff_longitude": decimal_type,
    "dropoff_latitude": decimal_type
})

# Filter out coordinates for locations that are outside the city border.
# Chain the column filter commands within the filter() function
# and define the minimum and maximum bounds for each field.
latlong_filtered_df = (combined_df.drop_nulls(
    columns=[
        "pickup_longitude", "pickup_latitude",
        "dropoff_longitude", "dropoff_latitude"
    ]))
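The snippet above stops before the filter() call its comment describes. Under the same dataflow API, the chained bound checks might look like the sketch below; the numeric latitude/longitude bounds are illustrative placeholders for the city's bounding box, not values taken from the source.

# Sketch only: the bound values are placeholders, not from the original script.
latlong_filtered_df = latlong_filtered_df.filter(dprep.f_and(
    dprep.col("pickup_longitude") <= -73.72,
    dprep.col("pickup_longitude") >= -74.09,
    dprep.col("pickup_latitude") <= 40.88,
    dprep.col("pickup_latitude") >= 40.53,
    dprep.col("dropoff_longitude") <= -73.72,
    dprep.col("dropoff_longitude") >= -74.09,
    dprep.col("dropoff_latitude") <= 40.88,
    dprep.col("dropoff_latitude") >= 40.53))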
# Load the historical asset data, promoting grouped headers.
asset_data_path = 'data/AssetData_Historical.csv'
asset_data_df = dprep.read_csv(path=asset_data_path,
                               header=dprep.PromoteHeadersMode.GROUPED)
display(asset_data_df.head(5))

#%%
# Save the dataflow to a .dprep package so it can be reopened later.
dprep_path = os.path.join(os.getcwd(), 'dflows.dprep')
dflow_prepared = asset_data_df
package = dprep.Package([dflow_prepared])
package.save(dprep_path)

#%%
# Reopen the package and inspect a profile of the columns.
package_saved = dprep.Package.open(dprep_path)
dflow_prepared = package_saved.dataflows[0]
dflow_prepared.get_profile()

#%%
# Convert the label column to integer and the feature columns to numeric types.
int_type = dprep.TypeConverter(dprep.FieldType.INTEGER)
dflow_prepared = dflow_prepared.set_column_types(
    type_conversions={'Failure_NextHour': int_type})
dflow_prepared = dflow_prepared.to_number([
    'Density_Overload', 'Abnormal_Flow_Rate', 'Heat_Flow', 'Asset_Integrity',
    'Temperature_Differential', 'Volumetric_Flow_Rate', 'Tangential_Stress',
    'Duct_Lenghts_in_Units', 'Fault_in_last_Month', 'Avg_hours_in_Use',
    'Pressure_Alarm', 'Inclination_Angle', 'Operating_Pressure_above_Normal',
    'Compression_Ratio', 'Multiple_Connects', 'Water_Exposure_units',
    'Humidity_Factor', 'Cathodic_Protection', 'Pressure_Class'
])
display(dflow_prepared.head(5))

#%%
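Once the columns are numeric, a common next step is to materialize the dataflow as a pandas DataFrame for model training. The cell below is a sketch, not part of the original notebook: it assumes scikit-learn is available, treats Failure_NextHour as the label, and uses arbitrary split parameters.

from sklearn.model_selection import train_test_split

# Materialize the prepared dataflow as a pandas DataFrame.
df = dflow_prepared.to_pandas_dataframe()

# Hypothetical train/test split on the Failure_NextHour label.
X = df.drop(columns=['Failure_NextHour'])
y = df['Failure_NextHour']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)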