def removeDuplicates(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)

    if dataFlow:
        print('{0}: loaded package from path {1}'.format(dataName, fullPackagePath))

        if operationFlag != '':
            # operationFlag carries the name of the column used to detect duplicate rows
            columnsToKeep = operationFlag
            numberOfRowsBefore = dataFlow.row_count
            dataFlow = dataFlow.distinct(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=False))
            print('{0}: removed duplicates based on column {1}, rows before {2}, rows after {3}'.format(
                dataName, operationFlag, numberOfRowsBefore, dataFlow.row_count))
        else:
            print('{0}: no duplicate processing required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory
    else:
        print('{0}: no package file found at location {1}'.format(dataName, fullPackagePath))
        return None, None, None
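A minimal usage sketch for this stage; the data set name, stage numbers, quality flag and key column below are illustrative values, not taken from the original pipeline configuration:

# Hypothetical invocation of the de-duplication stage.
# 'MEMBER', the stage numbers, 'A' and 'MemberID' are illustrative values only.
dataFlow, columnInventory, dataFlowInventory = removeDuplicates(
    dataName='MEMBER',
    previousStageNumber='2',
    thisStageNumber='3',
    qualityFlag='A',
    operatorToUse='removeDuplicates',
    operationFlag='MemberID')   # column used to detect duplicates; pass '' to skip the step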
import logging

import azureml.dataprep as dprep
from azureml.core import Workspace
from azureml.train.automl import AutoMLConfig

target = "utilization"

ws = Workspace(
    workspace_name=dbutils.secrets.get("azureml", "AML_WORKSPACE_NAME"),  # noqa
    subscription_id=dbutils.secrets.get("azureml", "AML_SUBSCRIPTION_ID"),  # noqa
    resource_group=dbutils.secrets.get("azureml", "AML_RESOURCE_GROUP"),  # noqa
)

ds = ws.get_default_datastore()

x = dprep.read_parquet_file(ds.path('model_data_x.parquet'))
y = dprep.read_parquet_file(ds.path('model_data_y.parquet')).to_long(
    dprep.ColumnSelector(term='.*', use_regex=True))

project_folder = './automl'

automl_config = AutoMLConfig(
    task="regression",
    iteration_timeout_minutes=10,
    iterations=10,
    primary_metric="r2_score",
    n_cross_validations=5,
    debug_log="automl.log",
    verbosity=logging.INFO,
    spark_context=sc,  # noqa
    whitelist_models=[
        "GradientBoosting",
        "DecisionTree",
        "RandomForest",
def createUPMDataflow(dataName, previousStageNumber, thisStageNumber, qualityFlag, operatorToUse, operationFlag):
    dataFlow, fullPackagePath = openDataFlowPackage(dataName, previousStageNumber, qualityFlag)

    if dataFlow:
        print('{0}: loaded package from path {1}'.format(dataName, fullPackagePath))

        if operationFlag != '':
            # operationFlag names a CSV mapping file with SourceTable, SourceColumn,
            # TargetColumn and TargetTable columns
            mappingConfig = dprep.read_csv('./Config/' + operationFlag).to_pandas_dataframe()

            targetDataFlow = dataFlow
            columnsToKeep = ''

            for sourceTable in mappingConfig[mappingConfig.SourceTable == dataName]['SourceTable'].unique():
                for sourceColumn, targetColumn in mappingConfig[
                        mappingConfig.SourceTable == sourceTable][['SourceColumn', 'TargetColumn']].values:
                    if columnsToKeep == '':
                        columnsToKeep = targetColumn
                    else:
                        columnsToKeep = columnsToKeep + '|' + targetColumn
                    targetDataFlow = targetDataFlow.rename_columns({sourceColumn: targetColumn})

            # Drop every column that was not mapped to a target column
            targetDataFlow = targetDataFlow.drop_columns(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=True))

            newPackageName = next(
                iter(mappingConfig[mappingConfig.SourceTable == dataName]['TargetTable'].unique()))
            createNewPackageDirectory(newPackageName)
            saveDataFlowPackage(targetDataFlow, newPackageName, thisStageNumber, 'A')
        else:
            print('{0}: no column mapping required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory
    else:
        print('{0}: no package file found at location {1}'.format(dataName, fullPackagePath))
        return None, None, None
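A hedged sketch of the mapping configuration and of how this stage might be called. Only the SourceTable, SourceColumn, TargetColumn and TargetTable headers are implied by the code above; the file name, table names and column names below are illustrative:

# ./Config/memberMapping.csv -- illustrative content only:
#
#   SourceTable,SourceColumn,TargetColumn,TargetTable
#   MEMBER,mbr_id,MemberId,UPM_MEMBER
#   MEMBER,mbr_name,MemberName,UPM_MEMBER
#
# Hypothetical invocation; the data set name, stage numbers and flags are illustrative.
dataFlow, columnInventory, dataFlowInventory = createUPMDataflow(
    dataName='MEMBER',
    previousStageNumber='3',
    thisStageNumber='4',
    qualityFlag='A',
    operatorToUse='createUPMDataflow',
    operationFlag='memberMapping.csv')   # mapping file read from ./Config/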
print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse) raw_df = dprep.read_csv(path=args.input_cleanse, header=dprep.PromoteHeadersMode.GROUPED) # These functions ensure that null data is removed from the data set, # which will help increase machine learning model accuracy. # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep # for more details useful_columns = [ s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;") ] columns = get_dict(args.columns) all_columns = dprep.ColumnSelector(term=".*", use_regex=True) drop_if_all_null = [ all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL) ] new_df = (raw_df.replace_na(columns=all_columns).drop_nulls( *drop_if_all_null).rename_columns(column_pairs=columns).keep_columns( columns=useful_columns)) if not (args.output_cleanse is None): os.makedirs(args.output_cleanse, exist_ok=True) print("%s created" % args.output_cleanse) write_df = new_df.write_to_csv( directory_path=dprep.LocalFileOutput(args.output_cleanse)) write_df.run_local()