def capture_filedataset_layout(self, dataset_name: str, output_path: str):
    from azureml.dataprep.api.functions import get_portable_path
    from azureml.dataprep import col, get_stream_properties, SummaryColumnsValue, SummaryFunction

    dataset = self.__workspace.datasets[dataset_name]

    files_column = 'Path'
    PORTABLE_PATH = 'PortablePath'
    STREAM_PROPERTIES = 'StreamProperties'

    dataflow = dataset._dataflow \
        .add_column(get_portable_path(col(files_column), None), PORTABLE_PATH, files_column) \
        .add_column(get_stream_properties(col(files_column)), STREAM_PROPERTIES, PORTABLE_PATH) \
        .keep_columns([files_column, PORTABLE_PATH, STREAM_PROPERTIES])

    dataflow_to_execute = dataflow.add_step('Microsoft.DPrep.WritePreppyBlock', {
        'outputPath': {
            'target': 0,
            'resourceDetails': [{'path': str(output_path)}]
        },
        'profilingFields': ['Kinds', 'MissingAndEmpty']
    })
    dataflow_to_execute.run_local()

    df = dataflow.to_pandas_dataframe(extended_types=True)
    df = df.merge(pd.io.json.json_normalize(df.StreamProperties),
                  left_index=True, right_index=True)
    print(f'{len(df.index)} files found in the dataset, '
          f'totalling to a size of {(df.Size.sum() / (1024 * 1024)):,.2f} MB')
    return df
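# --- Usage sketch (not from the original source) ---------------------------------
# Illustrates how the method above might be called. It assumes the method lives on
# a hypothetical wrapper class ("DatasetHelper" is an invented name) that stores an
# azureml.core.Workspace as self.__workspace, and that pandas is imported as pd at
# module level.
import pandas as pd
from azureml.core import Workspace

workspace = Workspace.from_config()              # standard Azure ML config lookup
helper = DatasetHelper(workspace)                # hypothetical wrapper class
layout_df = helper.capture_filedataset_layout(
    dataset_name='my_file_dataset',              # assumed registered FileDataset name
    output_path='outputs/file_layout')           # assumed local output folder
print(layout_df[['Path', 'PortablePath']].head())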
"dropoff_longitude": decimal_type, "dropoff_latitude": decimal_type }) # Filter out coordinates for locations that are outside the city border. # Chain the column filter commands within the filter() function # and define the minimum and maximum bounds for each field latlong_filtered_df = (combined_df.drop_nulls( columns=[ "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude" ], column_relationship=dprep.ColumnRelationship( dprep.ColumnRelationship.ANY)).filter( dprep.f_and( dprep.col("pickup_longitude") <= -73.72, dprep.col("pickup_longitude") >= -74.09, dprep.col("pickup_latitude") <= 40.88, dprep.col("pickup_latitude") >= 40.53, dprep.col("dropoff_longitude") <= -73.72, dprep.col("dropoff_longitude") >= -74.09, dprep.col("dropoff_latitude") <= 40.88, dprep.col("dropoff_latitude") >= 40.53))) if not (args.output_filter is None): os.makedirs(args.output_filter, exist_ok=True) print("%s created" % args.output_filter) write_df = latlong_filtered_df.write_to_csv( directory_path=dprep.LocalFileOutput(args.output_filter)) write_df.run_local()
if columnCount != len(dataFlowColumns):
    # NOTE - this logic assumes that all unwanted columns are on the far right, this could be improved!
    # Fork a new data flow with rows that have data in the unexpected columns
    quarantinedDataFlow = dataFlow.drop_nulls(dataFlowColumns[columnCount:])
    print('{0}: created quarantined data with {1} rows'.format(dataName, quarantinedDataFlow.row_count))

    # Save the quarantined data flow so it can be inspected later
    fullPackagePath = savePackage(quarantinedDataFlow, dataName, '1', 'B')
    print('{0}: saved quarantined data to {1}'.format(dataName, fullPackagePath))

    # Filter out the quarantined rows from the main data set
    # NOTE : can't figure out a better way of doing this for now - see note below...
    for columnToCheck in dataFlowColumns[columnCount:]:
        # Don't know why the line of code below doesn't work!
        # dataFlow = dataFlow.filter(dataFlow[columnToCheck] != '')
        dataFlow = dataFlow.assert_value(columnToCheck, value != '', error_code='ShouldBeNone')
        dataFlow = dataFlow.filter(col(columnToCheck).is_error())
        print('{0}: filtered column {1}, row count now {2}'.format(dataName, columnToCheck, dataFlow.row_count))

    # Now drop the extra columns
    dataFlow = dataFlow.drop_columns(dataFlowColumns[columnCount:])
    print('{0}: dropped {1} unwanted columns'.format(dataName, len(dataFlowColumns[columnCount:])))

# Detect and apply column types
builder = dataFlow.builders.set_column_types()
builder.learn()
builder.ambiguous_date_conversions_keep_month_day()
dataFlow = builder.to_dataflow()

# Finally save the data flow so it can be used later
fullPackagePath = savePackage(dataFlow, dataName, '1', 'A')
print('{0}: saved package to {1}'.format(dataName, fullPackagePath))
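# Not part of the original snippet: an optional inspection sketch. The
# set_column_types builder keeps its learned candidates after learn(), so the
# inferred type conversions can be reviewed before or after they are applied
# (conversion_candidates is the attribute used for this in the Azure ML
# data-prep tutorial; treat its availability here as an assumption).
print('{0}: inferred column type candidates: {1}'.format(dataName, builder.conversion_candidates))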
combined_df.keep_columns(columns=[
    "pickup_longitude", "pickup_latitude", "dropoff_longitude",
    "dropoff_latitude"
]).get_profile()

# COMMAND ----------

tmp_df = (combined_df.drop_nulls(
    columns=[
        "pickup_longitude", "pickup_latitude", "dropoff_longitude",
        "dropoff_latitude"
    ],
    column_relationship=dprep.ColumnRelationship(
        dprep.ColumnRelationship.ANY)).filter(
            dprep.f_and(
                dprep.col("pickup_longitude") <= -73.72,
                dprep.col("pickup_longitude") >= -74.09,
                dprep.col("pickup_latitude") <= 40.88,
                dprep.col("pickup_latitude") >= 40.53,
                dprep.col("dropoff_longitude") <= -73.72,
                dprep.col("dropoff_longitude") >= -74.09,
                dprep.col("dropoff_latitude") <= 40.88,
                dprep.col("dropoff_latitude") >= 40.53)))

tmp_df.keep_columns(columns=[
    "pickup_longitude", "pickup_latitude", "dropoff_longitude",
    "dropoff_latitude"
]).get_profile()

# COMMAND ----------

combined_df = tmp_df
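# COMMAND ----------

# Not in the original notebook: a quick sanity-check sketch that reports the row
# count after the null drop and coordinate-bounds filter, using only the
# Dataflow.row_count property already exercised by the other snippets here.
print("rows remaining after lat/long filtering: {}".format(combined_df.row_count))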
# Drop the pickup_datetime and dropoff_datetime columns because they're
# no longer needed (granular time features like hour,
# minute and second are more useful for model training).
processed_df = transformed_features_df.drop_columns(
    columns=["pickup_datetime", "dropoff_datetime"])

# Use the type inference functionality to automatically check the data type of each field,
# and display the inference results.
type_infer = processed_df.builders.set_column_types()
type_infer.learn()

# The inference results look correct based on the data. Now apply the type conversions to the dataflow.
type_converted_df = type_infer.to_dataflow()

# Before you package the dataflow, run two final filters on the data set.
# To eliminate incorrectly captured data points, filter the dataflow on records
# where both the cost and distance variable values are greater than zero.
# This step significantly improves machine learning model accuracy, because
# data points with a zero cost or distance represent major outliers that throw off prediction accuracy.
final_df = type_converted_df.filter(dprep.col("distance") > 0)
final_df = final_df.filter(dprep.col("cost") > 0)

# Write the final dataflow out as CSV for use in the training steps that follow.
if args.output_transform is not None:
    os.makedirs(args.output_transform, exist_ok=True)
    print("%s created" % args.output_transform)
    write_df = final_df.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_transform))
    write_df.run_local()
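# Assumed upstream setup for the script fragment above (a sketch, not the
# original file): it only shows where args, os and dprep would come from.
# transformed_features_df is presumed to be produced by earlier feature-
# engineering steps that are not part of this fragment.
import argparse
import os

import azureml.dataprep as dprep

parser = argparse.ArgumentParser("transform")
parser.add_argument("--output_transform", type=str, default=None,
                    help="directory to write the filtered CSV output to")
args = parser.parse_args()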
def quarantineRows(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)

    if dataFlow:
        print('{0}: loaded package from path {1}'.format(dataName, fullPackagePath))

        # Now perform the operation on the dataFlow : ie quarantine any rows that spill into extra columns
        # First count the number of columns found
        dataFlowColumns = list(dataFlow.get_profile().columns.keys())
        numberOfColumnsFound = len(dataFlowColumns)

        # Now convert the operationFlag to an integer
        headerCount = int(operationFlag)

        # If we have more columns than expected, we quarantine rows which have values in the extra columns
        if numberOfColumnsFound > headerCount:
            # NOTE - this logic assumes that all unwanted columns are on the far right, this could be improved!
            # Fork a new data flow with rows that have data in the unexpected columns
            print('{0}: we have found {1} columns, expected {2} so will now quarantine any rows with data in them'
                  .format(dataName, numberOfColumnsFound, headerCount))
            quarantinedDataFlow = dataFlow.drop_nulls(dataFlowColumns[headerCount:])

            # There is a chance we have an extra column but no rows to quarantine, so check this first
            if quarantinedDataFlow.row_count is None:
                quarantinedRowCount = 0
                print('{0}: no rows to quarantine'.format(dataName))
            else:
                quarantinedRowCount = quarantinedDataFlow.row_count

            # Save the quarantined data flow so it can be inspected later
            fullPackagePath = saveDataFlowPackage(quarantinedDataFlow, dataName, thisStageNumber, 'B')
            print('{0}: quarantined {1} rows of data to {2}'.format(dataName, quarantinedRowCount, fullPackagePath))

            # Now filter out the quarantined rows from the main data set
            # NOTE : can't figure out a better way of doing this for now - see note below...
            for columnToCheck in dataFlowColumns[headerCount:]:
                # NOTE - don't know why the commented line of code below doesn't work!
                # dataFlow = dataFlow.filter(dataFlow[columnToCheck] != '')
                dataFlow = dataFlow.assert_value(columnToCheck, value != '', error_code='ShouldBeNone')
                dataFlow = dataFlow.filter(col(columnToCheck).is_error())
                print('{0}: filtered column {1}, row count now {2}'.format(dataName, columnToCheck, dataFlow.row_count))

            # Finally drop the extra columns
            dataFlow = dataFlow.drop_columns(dataFlowColumns[headerCount:])
            print('{0}: dropped {1} unwanted columns'.format(dataName, len(dataFlowColumns[headerCount:])))
        else:
            print('{0}: we have found {1} columns, expected {2} so not going to do anything'
                  .format(dataName, numberOfColumnsFound, headerCount))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components back to the main loop...
        return dataFlow, columnInventory, dataFlowInventory
    else:
        print('{0}: no package file found at location {1}'.format(dataName, fullPackagePath))
        return None, None, None
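# --- Hypothetical driver loop (not from the original source) ----------------------
# Shows how quarantineRows might be invoked for several data sets. The data set
# names, stage numbers, quality flag and operation flag values below are invented
# for illustration only.
dataFlowInventories = []
for dataName in ['People', 'Trades']:
    dataFlow, columnInventory, dataFlowInventory = quarantineRows(
        dataName,
        previousStageNumber='1',
        thisStageNumber='2',
        qualityFlag='A',
        operatorToUse='QuarantineRows',
        operationFlag='12')          # expected header/column count, as a string
    if dataFlowInventory is not None:
        dataFlowInventories.append(dataFlowInventory)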