def capture_filedataset_layout(self, dataset_name: str, output_path: str):
        import pandas as pd
        from azureml.dataprep.api.functions import get_portable_path
        from azureml.dataprep import col, get_stream_properties

        # Look up the registered FileDataset and extend its dataflow with each file's
        # portable path and stream properties (size, last-modified time, and so on).
        dataset = self.__workspace.datasets[dataset_name]
        files_column = 'Path'
        PORTABLE_PATH = 'PortablePath'
        STREAM_PROPERTIES = 'StreamProperties'
        dataflow = dataset._dataflow \
                .add_column(get_portable_path(col(files_column), None), PORTABLE_PATH, files_column) \
                .add_column(get_stream_properties(col(files_column)), STREAM_PROPERTIES, PORTABLE_PATH) \
                .keep_columns([files_column, PORTABLE_PATH, STREAM_PROPERTIES])

        # Persist the layout, plus basic profiling information, to output_path in Preppy format.
        dataflow_to_execute = dataflow.add_step('Microsoft.DPrep.WritePreppyBlock', {
            'outputPath': {
                'target': 0,
                'resourceDetails': [{'path': str(output_path)}]
            },
            'profilingFields': ['Kinds', 'MissingAndEmpty']
        })
        dataflow_to_execute.run_local()

        # Pull the file listing into pandas and flatten the stream properties into columns.
        df = dataflow.to_pandas_dataframe(extended_types=True)
        df = df.merge(pd.json_normalize(df.StreamProperties), left_index=True, right_index=True)
        print(f'{len(df.index)} files found in the dataset, with a total size of {(df.Size.sum() / (1024 * 1024)):,.2f} MB')

        return df
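A minimal usage sketch for the method above, assuming it is defined on a hypothetical WorkspaceExplorer class whose constructor stores an authenticated azureml.core.Workspace in self.__workspace, and that a FileDataset named 'sample_files' is registered in that workspace:

from azureml.core import Workspace

ws = Workspace.from_config()                  # load workspace details from config.json
explorer = WorkspaceExplorer(ws)              # hypothetical wrapper defining capture_filedataset_layout
layout_df = explorer.capture_filedataset_layout('sample_files', './file_layout')
print(layout_df[['Path', 'Size']].head())     # per-file paths and sizes from the stream properties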
Example #2
        "dropoff_longitude": decimal_type,
        "dropoff_latitude": decimal_type
    })

# Filter out coordinates for locations that are outside the city border.
# Chain the column filter commands within the filter() function
# and define the minimum and maximum bounds for each field
latlong_filtered_df = (combined_df.drop_nulls(
    columns=[
        "pickup_longitude", "pickup_latitude", "dropoff_longitude",
        "dropoff_latitude"
    ],
    column_relationship=dprep.ColumnRelationship(
        dprep.ColumnRelationship.ANY)).filter(
            dprep.f_and(
                dprep.col("pickup_longitude") <= -73.72,
                dprep.col("pickup_longitude") >= -74.09,
                dprep.col("pickup_latitude") <= 40.88,
                dprep.col("pickup_latitude") >= 40.53,
                dprep.col("dropoff_longitude") <= -73.72,
                dprep.col("dropoff_longitude") >= -74.09,
                dprep.col("dropoff_latitude") <= 40.88,
                dprep.col("dropoff_latitude") >= 40.53)))

# Write the filtered dataflow out as CSV if an output directory was supplied
if args.output_filter is not None:
    os.makedirs(args.output_filter, exist_ok=True)
    print("%s created" % args.output_filter)
    write_df = latlong_filtered_df.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_filter))
    write_df.run_local()
Example #3
    if columnCount != len(dataFlowColumns):
        # NOTE - this logic assumes that all unwanted columns are on the far right, this could be improved!
        # Fork a new data flow with rows that have data in the un-expected columns
        quarantinedDataFlow = dataFlow.drop_nulls(dataFlowColumns[columnCount:])
        print('{0}: created quarantined data with {1} rows'.format(dataName, quarantinedDataFlow.row_count))
        # Finally save the data flow so it can be used later
        fullPackagePath = savePackage(dataFlow, dataName, '1', 'B')
        print('{0}: saved quarantined data to {1}'.format(dataName, fullPackagePath))

    # Filter out the quarantined rows from the main data set
    # NOTE : can't figure out a better way of doing this for now - see note below...
    for columnToCheck in dataFlowColumns[columnCount:]:
        # Don't know why the line of code below doesn't work!
        # dataFlow = dataFlow.filter(dataFlow[columnToCheck] != '')
        dataFlow = dataFlow.assert_value(columnToCheck, value != '', error_code='ShouldBeNone')
        dataFlow = dataFlow.filter(col(columnToCheck).is_error())
        print('{0}: filtered column {1}, row count now {2}'.format(dataName, columnToCheck, dataFlow.row_count))
    
    # Now drop the extra columns
    dataFlow = dataFlow.drop_columns(dataFlowColumns[columnCount:])
    print('{0}: dropped {1} unwanted columns'.format(dataName, len(dataFlowColumns[columnCount:])))
    
    # Detect and apply column types
    builder = dataFlow.builders.set_column_types()
    builder.learn()
    builder.ambiguous_date_conversions_keep_month_day()
    dataFlow = builder.to_dataflow()
    
    # Finally save the data flow so it can be used later
    fullPackagePath = savePackage(dataFlow, dataName, '1', 'A')
    print('{0}: saved package to {1}'.format(dataName, fullPackagePath))
Example #4
combined_df.keep_columns(columns=[
    "pickup_longitude", "pickup_latitude", "dropoff_longitude",
    "dropoff_latitude"
]).get_profile()

# COMMAND ----------

tmp_df = (combined_df.drop_nulls(
    columns=[
        "pickup_longitude", "pickup_latitude", "dropoff_longitude",
        "dropoff_latitude"
    ],
    column_relationship=dprep.ColumnRelationship(
        dprep.ColumnRelationship.ANY)).filter(
            dprep.f_and(
                dprep.col("pickup_longitude") <= -73.72,
                dprep.col("pickup_longitude") >= -74.09,
                dprep.col("pickup_latitude") <= 40.88,
                dprep.col("pickup_latitude") >= 40.53,
                dprep.col("dropoff_longitude") <= -73.72,
                dprep.col("dropoff_longitude") >= -74.09,
                dprep.col("dropoff_latitude") <= 40.88,
                dprep.col("dropoff_latitude") >= 40.53)))
tmp_df.keep_columns(columns=[
    "pickup_longitude", "pickup_latitude", "dropoff_longitude",
    "dropoff_latitude"
]).get_profile()

# COMMAND ----------

combined_df = tmp_df

# transformed_features_df comes from the feature-engineering steps earlier in the full notebook.
# Drop the pickup_datetime and dropoff_datetime columns because they're
# no longer needed (granular time features like hour,
# minute and second are more useful for model training).
processed_df = transformed_features_df.drop_columns(
    columns=["pickup_datetime", "dropoff_datetime"])

# Use the type inference functionality to automatically check the data type of each field,
# and display the inference results.
type_infer = processed_df.builders.set_column_types()
type_infer.learn()
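# Display what was inferred before applying it (a minimal sketch; it assumes the builder's
# repr lists its conversion candidates, as in recent azureml-dataprep versions).
print(type_infer)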

# The inference results look correct based on the data. Now apply the type conversions to the dataflow.
type_converted_df = type_infer.to_dataflow()

# Before you package the dataflow, run two final filters on the data set.
# To eliminate incorrectly captured data points,
# filter the dataflow on records where both the cost and distance variable values are greater than zero.
# This step will significantly improve machine learning model accuracy,
# because data points with a zero cost or distance represent major outliers that throw off prediction accuracy.

final_df = type_converted_df.filter(dprep.col("distance") > 0)
final_df = final_df.filter(dprep.col("cost") > 0)

# Write the final dataflow out as CSV for use in the training steps that follow
if args.output_transform is not None:
    os.makedirs(args.output_transform, exist_ok=True)
    print("%s created" % args.output_transform)
    write_df = final_df.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_transform))
    write_df.run_local()
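A minimal sketch, not part of the original script, of how a later training step might read the transformed output back in; it assumes write_to_csv produced its usual part-* files in the output directory:

# Read the part files written above into a dataflow and materialise them for training
training_df = dprep.read_csv(path=args.output_transform + '/part-*').to_pandas_dataframe()
print('Loaded {0} rows for training'.format(len(training_df.index)))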
def quarantineRows(dataName, previousStageNumber, thisStageNumber, qualityFlag,
                   operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # Now perform the operation on the dataFlow : ie quarantine any rows that have data in unexpected extra columns, then drop those columns

        # First count the number of columns found
        dataFlowColumns = list(dataFlow.get_profile().columns.keys())
        numberOfColumnsFound = len(dataFlowColumns)

        # The operationFlag carries the expected number of columns (the header count), so convert it to an integer
        headerCount = int(operationFlag)

        # If we have more columns than expected, we quarantine rows which have values in the extra columns
        if numberOfColumnsFound > headerCount:
            # NOTE - this logic assumes that all unwanted columns are on the far right, this could be improved!
            # Fork a new data flow with rows that have data in the un-expected columns
            print(
                '{0}: we have found {1} columns, expected {2} so will now quarantine any rows with data in them '
                .format(dataName, numberOfColumnsFound, headerCount))
            quarantinedDataFlow = dataFlow.drop_nulls(
                dataFlowColumns[headerCount:])

            # There is a chance we have an extra column but no rows to quarantine, so check this first
            if quarantinedDataFlow.row_count is None:
                quarantinedRowCount = 0
                print('{0}: no rows to quarantine'.format(dataName))
            else:
                quarantinedRowCount = quarantinedDataFlow.row_count
                # Finally save the data flow so it can be used later
                fullPackagePath = saveDataFlowPackage(quarantinedDataFlow,
                                                      dataName,
                                                      thisStageNumber, 'B')
                print('{0}: quarantined {1} rows of data to {2}'.format(
                    dataName, quarantinedRowCount, fullPackagePath))

            # Now filter out the quarantined rows from the main data set
            # NOTE : can't figure out a better way of doing this for now - see note below...
            for columnToCheck in dataFlowColumns[headerCount:]:
                # NOTE - don't know why the commented line of code below doesn't work!
                # dataFlow = dataFlow.filter(dataFlow[columnToCheck] != '')
                # Workaround: the assertion below fails (producing an error value) exactly for
                # rows whose extra column is empty, and the filter then keeps only those error
                # rows - i.e. the rows with no data in the extra column - so the quarantined
                # rows are removed from the main flow.
                dataFlow = dataFlow.assert_value(columnToCheck,
                                                 value != '',
                                                 error_code='ShouldBeNone')
                dataFlow = dataFlow.filter(col(columnToCheck).is_error())
                print('{0}: filtered column {1}, row count now {2}'.format(
                    dataName, columnToCheck, dataFlow.row_count))

            # Finally drop the extra columns
            dataFlow = dataFlow.drop_columns(dataFlowColumns[headerCount:])
            print('{0}: dropped {1} unwanted columns'.format(
                dataName, len(dataFlowColumns[headerCount:])))
        else:
            print(
                '{0}: we have found {1} columns, expected {2} so not going to do anything'
                .format(dataName, numberOfColumnsFound, headerCount))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components back to the main loop...
        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
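A minimal usage sketch for quarantineRows (not from the source): the helpers it calls (openDataFlowPackage, saveDataFlowPackage, getColumnStats, getDataFlowStats) are defined elsewhere in the original project, and the stage numbers, quality flag and expected column counts below are illustrative assumptions only.

import pandas as pd

# Hypothetical driver: map each data set name to the number of columns its header should have
expectedColumns = {'PEOPLE': 12, 'ORDERS': 8}

columnInventories = []
dataFlowInventories = []

for name, headerCount in expectedColumns.items():
    dataFlow, columnInventory, dataFlowInventory = quarantineRows(
        dataName=name,
        previousStageNumber='10',      # stage that produced the input package
        thisStageNumber='20',          # stage number recorded against this operation
        qualityFlag='A',
        operatorToUse='quarantineRows',
        operationFlag=str(headerCount))
    if dataFlow is not None:
        columnInventories.append(columnInventory)
        dataFlowInventories.append(dataFlowInventory)

# Combine the per-data-set inventories for reporting (assumes the helpers return pandas DataFrames)
if columnInventories:
    columnInventoryReport = pd.concat(columnInventories, ignore_index=True)
    dataFlowInventoryReport = pd.concat(dataFlowInventories, ignore_index=True)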