def prepare_dataflows(csv_file_path, label_column='duration_minutes'):
    """Load the taxi-trip CSV into a typed dataflow and split it.

    Returns a pair of dataflows ``(labels, features)`` where ``labels``
    holds only *label_column* and ``features`` holds everything else.
    """
    import azureml.dataprep as dprep

    # Explicit column -> type mapping for the taxi dataset.
    field_types = {
        'taxi_type': dprep.FieldType.STRING,
        'store_and_fwd_flag': dprep.FieldType.BOOLEAN,
        'passenger_count': dprep.FieldType.INTEGER,
        'trip_distance': dprep.FieldType.DECIMAL,
        'vendor_abbreviation': dprep.FieldType.STRING,
        'rate_code_description': dprep.FieldType.STRING,
        'pickup_borough': dprep.FieldType.STRING,
        'pickup_zone': dprep.FieldType.STRING,
        'pickup_service_zone': dprep.FieldType.STRING,
        'dropoff_borough': dprep.FieldType.STRING,
        'dropoff_zone': dprep.FieldType.STRING,
        'dropoff_service_zone': dprep.FieldType.STRING,
        'pickup_year': dprep.FieldType.INTEGER,
        'pickup_month': dprep.FieldType.INTEGER,
        'pickup_day': dprep.FieldType.INTEGER,
        'pickup_hour': dprep.FieldType.INTEGER,
        'is_rush_hour_flag': dprep.FieldType.BOOLEAN,
        'is_weekend_flag': dprep.FieldType.BOOLEAN,
        'duration_minutes': dprep.FieldType.DECIMAL,
    }

    typed_flow = dprep.read_csv(csv_file_path).set_column_types(field_types)
    labels = typed_flow.keep_columns([label_column])
    features = typed_flow.drop_columns([label_column])
    return labels, features
def test_dprep_datastream(self):
    """Round-trip a small frame through DprepDataStream and verify the
    datetime column survives with its exact values."""
    import azureml.dataprep as dprep

    timestamps = ["2018-01-02 00:00:00", "2018-02-01 10:00:00"]
    frame = pd.DataFrame({
        'col1': timestamps,
        'col2': ['0', '1'],
        'label': np.repeat([0], 2)
    })

    pipeline = Pipeline(steps=[
        Handler(columns={'2': 'col2'},
                concat=False,
                impute_by_slot=True,
                replace_with='Mean')
    ])

    csv_path = get_temp_file('.csv')
    frame.to_csv(csv_path)
    dataflow = dprep.read_csv(csv_path, infer_column_types=True)
    result = pipeline.fit_transform(DprepDataStream(dataflow))

    self.assertEqual(result.loc[:, 'col1'].dtype, np.dtype('datetime64[ns]'))
    # (row index, year, month, day, hour) — minutes/seconds are 0 for both.
    for idx, year, month, day, hour in [(0, 2018, 1, 2, 0),
                                        (1, 2018, 2, 1, 10)]:
        stamp = result.loc[idx, 'col1']
        self.assertEqual(stamp.year, year)
        self.assertEqual(stamp.month, month)
        self.assertEqual(stamp.day, day)
        self.assertEqual(stamp.hour, hour)
        self.assertEqual(stamp.minute, 0)
        self.assertEqual(stamp.second, 0)
    os.remove(csv_path)
def dataFlowProcessingLoop(previousStageNumber, thisStageNumber, qualityFlag,
                           operatorToUse, functionToCall, **kwargs):
    """Run *functionToCall* over every dataflow listed in dataFlowController.csv.

    For each controller row, the callable receives the data name, stage
    numbers, quality flag and the operation flag read from the *operatorToUse*
    column.  Column- and dataflow-level inventories returned by the callable
    are accumulated and saved once all dataflows have been processed.

    Returns the accumulated dataflow inventory as a pandas DataFrame.
    """
    # Load the dataFlow controller file
    dataFlows = dprep.read_csv('dataFlowController.csv').to_pandas_dataframe()

    # Set up empty dataframes that we will use to build up inventories at both
    # dataFlow and column level
    dataFlowInventoryAll = pd.DataFrame()
    columnInventoryAll = pd.DataFrame()

    for index, row in dataFlows.iterrows():
        dataName = row["DataName"]
        operationFlag = row[operatorToUse]

        newDataFlow, columnInventory, dataFlowInventory = functionToCall(
            dataName, previousStageNumber, thisStageNumber, qualityFlag,
            operatorToUse, operationFlag, **kwargs)

        # functionToCall returns (None, None, None) when no package was found.
        if newDataFlow:
            # Capture the column inventory for the new dataflow.
            # FIX: DataFrame.append was deprecated in pandas 1.4 and removed
            # in 2.0 — pd.concat is the equivalent replacement.
            columnInventoryAll = pd.concat([columnInventoryAll,
                                            columnInventory])
            print('{0}: appended {1} rows to column inventory'.format(
                dataName, len(columnInventory)))

            # Capture the data flow inventory for the new data flow
            dataFlowInventoryAll = pd.concat([dataFlowInventoryAll,
                                              dataFlowInventory])
            print('{0}: appended {1} rows to data flow inventory'.format(
                dataName, len(dataFlowInventory)))

    # Once we have processed all dataflows, we save the inventories away
    saveColumnInventory(columnInventoryAll, thisStageNumber)
    saveDataFlowInventory(dataFlowInventoryAll, thisStageNumber)

    return dataFlowInventoryAll
    packageToSave = packageToSave.save(fullPackagePath)
    return fullPackagePath


# An open package helper function
def openPackage(packageName, stage, qualityFlag):
    # Re-derive the canonical package path, open it, and pull out the
    # dataflow stored under the package's own name.
    fullPackagePath = createFullPackagePath(packageName, stage, qualityFlag)
    packageToOpen = Package.open(fullPackagePath)
    dataFlow = packageToOpen[packageName]
    return dataFlow


#%% [markdown]
# ## Prepare for ingestion...

#%%
# Load in file names to be processed from the config.csv file
dataFiles = dprep.read_csv('dataFiles.csv').to_pandas_dataframe()

# Create a fully qualified path to the data files and append this to the dataFiles data frame
fullFilePaths = dataPath + '/' + dataFiles.FileName
fullFilePaths.name = "FullFilePath"
dataFiles = pd.concat([dataFiles, fullFilePaths], axis=1)

# now grab the number of headers in the first row of each file
# NOTE(review): ',\w' is a non-raw string; '\w' only works here because
# Python leaves unknown escapes intact — a raw string would be safer.
headerCount = []
for index, row in dataFiles.iterrows():
    # Only the first line of each file is read — we just count header fields.
    firstRow = open(row["FullFilePath"]).readline().strip()
    regexPattern = re.compile(',\w')
    patternCount = len(re.findall(regexPattern,firstRow))
    # Number of columns = number of ",<word>" separators + 1.
    headerCount.append(patternCount + 1)

columnCount = pd.DataFrame({'ColumnCount':headerCount})
dataFiles = pd.concat([dataFiles, columnCount], axis=1)
# Merge step: append the cleaned yellow taxi rows onto the green rows and
# write the combined dataflow out as CSV.
parser = argparse.ArgumentParser("merge")
parser.add_argument("--input_green_merge", type=str,
                    help="cleaned green taxi data directory")
parser.add_argument("--input_yellow_merge", type=str,
                    help="cleaned yellow taxi data directory")
parser.add_argument("--output_merge", type=str,
                    help="green and yellow taxi data merged")
args = parser.parse_args()

print("Argument 1(input green taxi data path): %s" % args.input_green_merge)
print("Argument 2(input yellow taxi data path): %s" % args.input_yellow_merge)
print("Argument 3(output merge taxi data path): %s" % args.output_merge)

green_df = dprep.read_csv(args.input_green_merge + '/part-*')
yellow_df = dprep.read_csv(args.input_yellow_merge + '/part-*')

# Appending yellow data to green data
combined_df = green_df.append_rows([yellow_df])

if args.output_merge is not None:
    os.makedirs(args.output_merge, exist_ok=True)
    print("%s created" % args.output_merge)
    output_sink = dprep.LocalFileOutput(args.output_merge)
    write_df = combined_df.write_to_csv(directory_path=output_sink)
    write_df.run_local()
help="cleaned taxi data directory") parser.add_argument("--useful_columns", type=str, help="useful columns to keep") parser.add_argument("--columns", type=str, help="rename column pattern") args = parser.parse_args() print("Argument 1(input taxi data path): %s" % args.input_cleanse) print("Argument 2(columns to keep): %s" % str(args.useful_columns.strip("[]").split("\;"))) print("Argument 3(columns renaming mapping): %s" % str(args.columns.strip("{}").split("\;"))) print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse) raw_df = dprep.read_csv(path=args.input_cleanse, header=dprep.PromoteHeadersMode.GROUPED) # These functions ensure that null data is removed from the data set, # which will help increase machine learning model accuracy. # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep # for more details useful_columns = [ s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;") ] columns = get_dict(args.columns) all_columns = dprep.ColumnSelector(term=".*", use_regex=True) drop_if_all_null = [ all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)
import os
import re
import collections

from azureml.dataprep import value
from azureml.dataprep import col
from azureml.dataprep import Dataflow

from commonInventoryCreation import getColumnStats, getDataFlowStats, saveColumnInventory, saveDataFlowInventory
from commonPackageHandling import saveDataFlowPackage

# Let's also set up global variables...
previousStageNumber = '00'
thisStageNumber = '10'

#%%
# Load in file names to be processed from the data file inventory
dataFileStats = dprep.read_csv('dataFileInventory_' + previousStageNumber +
                               '.csv').to_pandas_dataframe()

#%%
# First a quick pass through each file to grab the number of headers and count columns
# NOTE - this loop could improved such that there is less code the dataFileStats dataframe above
# FIX: compile the pattern once, outside the loop, and use a raw string so
# '\w' is a regex character class rather than an invalid string escape.
regexPattern = re.compile(r',\w')
headerCount = []
for index, row in dataFileStats.iterrows():
    # Only the first line matters — we count ",<word>" separators + 1 columns.
    firstRow = open(row["FullFilePath"]).readline().strip()
    patternCount = len(re.findall(regexPattern, firstRow))
    headerCount.append(patternCount + 1)
    print(firstRow)
    print(patternCount)

headerCountCol = pd.DataFrame({'HeaderCount': headerCount})
dataFileStats = pd.concat([dataFileStats, headerCountCol], axis=1)
args = parser.parse_args() print("Argument 1(input taxi data features path): %s" % args.input_split_features) print("Argument 2(input taxi data labels path): %s" % args.input_split_labels) print("Argument 3(output training features split path): %s" % args.output_split_train_x) print("Argument 4(output training labels split path): %s" % args.output_split_train_y) print("Argument 5(output test features split path): %s" % args.output_split_test_x) print("Argument 6(output test labels split path): %s" % args.output_split_test_y) x_df = dprep.read_csv( path=args.input_split_features, header=dprep.PromoteHeadersMode.GROUPED).to_pandas_dataframe() y_df = dprep.read_csv( path=args.input_split_labels, header=dprep.PromoteHeadersMode.GROUPED).to_pandas_dataframe() # These functions splits the input features and labels into test and train data # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223) if not (args.output_split_train_x is None and args.output_split_test_x is None and args.output_split_train_y is None
# Block until the AML compute target has finished provisioning.
# NOTE(review): min_node_count is documented as an int (minimum node count);
# passing True relies on bool coercing to 1 — confirm this is intentional.
compute_target.wait_for_completion(show_output=True,
                                   min_node_count=True,
                                   timeout_in_minutes=20)
print(compute_target.status.serialize())

#%%
os.makedirs('./data', exist_ok=True)
# INSERT DATA SOURCE HERE
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)
# ds.upload(src_dir='./data', target_path='AssetData', overwrite=True, show_progress=True)

#%%
# Read the historical asset data into a dataflow (headers promoted per group).
asset_data_path = 'data/AssetData_Historical.csv'
asset_data_df = dprep.read_csv(path=asset_data_path,
                               header=dprep.PromoteHeadersMode.GROUPED)
display(asset_data_df.head(5))

#%%
# Persist the (currently unmodified) dataflow as a .dprep package on disk.
# NOTE(review): dprep.Package is the legacy persistence API in this SDK.
dprep_path = os.path.join(os.getcwd(), 'dflows.dprep')
dflow_prepared = asset_data_df
package = dprep.Package([dflow_prepared])
package.save(dprep_path)

#%%
# Re-open the saved package and profile the first dataflow it contains.
package_saved = dprep.Package.open(dprep_path)
dflow_prepared = package_saved.dataflows[0]
dflow_prepared.get_profile()

#%%
# Force the label column to integer so downstream training sees 0/1.
int_type = dprep.TypeConverter(dprep.FieldType.INTEGER)
dflow_prepared = dflow_prepared.set_column_types(
    type_conversions={'Failure_NextHour': int_type})
# Drop bookkeeping columns from the Spark training frame, then round-trip the
# features and the label through DBFS CSVs so azureml.dataprep can read them.
training_sdf = trainingSDF
training_sdf = training_sdf.drop("Idx", "initialDebt")

# Features: everything except the label column.
training_sdf \
    .drop("SeriousDlqin2yrs") \
    .toPandas() \
    .to_csv("/dbfs/FileStore/tables/constant-scoring-training-vars.csv")

# Labels: the SeriousDlqin2yrs column alone.
training_sdf \
    .select("SeriousDlqin2yrs") \
    .toPandas() \
    .to_csv("/dbfs/FileStore/tables/constant-scoring-training-res.csv")

# Read both CSVs back as dataflows; "Column1" is the pandas index column
# written by to_csv, so drop it again.
X_train = dataprep.read_csv(
    path="/dbfs/FileStore/tables/constant-scoring-training-vars.csv",
    separator=',')
X_train = X_train.drop_columns("Column1")

Y_train = dataprep.read_csv(
    path="/dbfs/FileStore/tables/constant-scoring-training-res.csv",
    separator=',')
Y_train = Y_train.drop_columns("Column1")

# COMMAND ----------

# MAGIC %md
# MAGIC Checking to make sure we have data inside.

# COMMAND ----------
def createUPMDataflow(dataName, previousStageNumber, thisStageNumber,
                      qualityFlag, operatorToUse, operationFlag):
    """Re-map a dataflow's columns onto the UPM target model.

    Loads the package saved by the previous stage and, when *operationFlag*
    names a mapping-config CSV, renames the configured source columns to
    their target names, keeps only the mapped columns, and saves the result
    under the configured target table name.  The unmodified source dataflow
    is always profiled, inventoried and re-saved for the next stage.

    Returns (dataFlow, columnInventory, dataFlowInventory), or
    (None, None, None) when no package exists for *dataName*.
    """
    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)
    if dataFlow:
        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':
            mappingConfig = dprep.read_csv(
                './Config/' + operationFlag).to_pandas_dataframe()
            targetDataFlow = dataFlow
            columnsToKeep = ''
            for sourceTable in mappingConfig[mappingConfig.SourceTable ==
                                             dataName]['SourceTable'].unique():
                for sourceColumn, targetColumn in mappingConfig[
                        mappingConfig.SourceTable == sourceTable][[
                            'SourceColumn', 'TargetColumn'
                        ]].values:
                    # BUG FIX: original compared with "is ''" (identity, not
                    # equality) — undefined for run-time strings; use ==.
                    if columnsToKeep == '':
                        columnsToKeep = targetColumn
                    else:
                        # Build a |-separated regex of all target column names.
                        columnsToKeep = columnsToKeep + '|' + targetColumn
                    targetDataFlow = targetDataFlow.rename_columns(
                        {sourceColumn: targetColumn})

            # Drop everything NOT matching a mapped target column (invert=True).
            targetDataFlow = targetDataFlow.drop_columns(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=True))

            # Save the remapped dataflow under the configured target table name.
            newPackageName = next(
                iter(mappingConfig[mappingConfig.SourceTable == dataName]
                     ['TargetTable'].unique()))
            createNewPackageDirectory(newPackageName)
            saveDataFlowPackage(targetDataFlow, newPackageName,
                                thisStageNumber, 'A')
        else:
            print('{0}: no duplicate processing required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory
    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
"and define the minimum and maximum bounds for each field.") parser = argparse.ArgumentParser("filter") parser.add_argument("--input_filter", type=str, help="merged taxi data directory") parser.add_argument("--output_filter", type=str, help="filter out out of city locations") args = parser.parse_args() print("Argument 1(input taxi data path): %s" % args.input_filter) print("Argument 2(output filtered taxi data path): %s" % args.output_filter) combined_df = dprep.read_csv(args.input_filter + '/part-*') # These functions filter out coordinates for locations that are outside the city border. # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details # Create a condensed view of the dataflow to just show the lat/long fields, # which makes it easier to evaluate missing or out-of-scope coordinates decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL) combined_df = combined_df.set_column_types( type_conversions={ "pickup_longitude": decimal_type, "pickup_latitude": decimal_type, "dropoff_longitude": decimal_type, "dropoff_latitude": decimal_type })
# Wait for the cluster to complete, show the output log cpu_cluster.wait_for_completion(show_output=True) # COMMAND ---------- import azureml.dataprep as dprep # COMMAND ---------- dataset_root = "https://dprepdata.blob.core.windows.net/demo" green_path = "/".join([dataset_root, "green-small/*"]) yellow_path = "/".join([dataset_root, "yellow-small/*"]) green_df = dprep.read_csv(path=green_path, header=dprep.PromoteHeadersMode.GROUPED) # auto_read_file will automatically identify and parse the file type, and is useful if you don't know the file type yellow_df = dprep.auto_read_file(path=yellow_path) green_df.head(5) yellow_df.head(5) # COMMAND ---------- all_columns = dprep.ColumnSelector(term=".*", use_regex=True) drop_if_all_null = [ all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL) ] useful_columns = [ "cost", "distance", "dropoff_datetime", "dropoff_latitude",
print("Transforms the renamed taxi data to the required format") parser = argparse.ArgumentParser("transform") parser.add_argument("--input_transform", type=str, help="renamed taxi data") parser.add_argument("--output_transform", type=str, help="transformed taxi data") args = parser.parse_args() print("Argument 1(input taxi data path): %s" % args.input_transform) print("Argument 2(output final transformed taxi data): %s" % args.output_transform) renamed_df = dprep.read_csv(args.input_transform + '/part-*') # These functions transform the renamed data to be used finally for training. # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details # Split the pickup and dropoff date further into the day of the week, day of the month, and month values. # To get the day of the week value, use the derive_column_by_example() function. # The function takes an array parameter of example objects that define the input data, # and the preferred output. The function automatically determines your preferred transformation. # For the pickup and dropoff time columns, split the time into the hour, minute, and second by using # the split_column_by_example() function with no example parameter. After you generate the new features, # use the drop_columns() function to delete the original fields as the newly generated features are preferred. # Rename the rest of the fields to use meaningful descriptions. transformed_features_df = (renamed_df.derive_column_by_example( source_columns="pickup_date",
type=str, help="input featurization") parser.add_argument("--useful_columns", type=str, help="columns to use") parser.add_argument("--output_featurization", type=str, help="output featurization") args = parser.parse_args() print("Argument 1(input training data path): %s" % args.input_featurization) print("Argument 2(column features to use): %s" % str(args.useful_columns.strip("[]").split("\;"))) print("Argument 3:(output featurized training data path) %s" % args.output_featurization) dflow_prepared = dprep.read_csv(args.input_featurization + '/part-*') # These functions extracts useful features for training # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail useful_columns = [ s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;") ] dflow = dflow_prepared.keep_columns(useful_columns) if not (args.output_featurization is None): os.makedirs(args.output_featurization, exist_ok=True) print("%s created" % args.output_featurization) write_df = dflow.write_to_csv( directory_path=dprep.LocalFileOutput(args.output_featurization)) write_df.run_local()
print("Prepare data for training")

parser = argparse.ArgumentParser("prep_data")
parser.add_argument("--input_file", type=str, help="input raw data file")
parser.add_argument("--output_path", type=str,
                    help="output prepped data path")
args, unknown = parser.parse_known_args()
if unknown:
    print("Unknown args:")
    print(unknown)

print("Argument 1 (input training data file): %s" % args.input_file)
print("Argument 2 (output prepped training data path) %s" % args.output_path)

input_file = dprep.read_csv(args.input_file)

# skin is same as thickness with another unit (inches/cm), so drop it; then
# map the textual diabetes flag onto a numeric 0/1 column typed as integer.
prepped_data = input_file.drop_columns(columns='skin')
prepped_data = prepped_data.replace(columns='diabetes',
                                    find="TRUE", replace_with="1")
prepped_data = prepped_data.replace(columns='diabetes',
                                    find="FALSE", replace_with="0")
prepped_data = prepped_data.set_column_types(type_conversions={
    'diabetes': dprep.TypeConverter(data_type=dprep.FieldType.INTEGER)
})

if args.output_path is not None:
    os.makedirs(args.output_path, exist_ok=True)
    print("%s created" % args.output_path)
args, unknown = parser.parse_known_args()
if unknown:
    print("Unknown args:")
    print(unknown)

print("Argument 1 (input prepared data): %s" % args.input_prepared_data)
print("Argument 2 (output training features split path): %s" %
      args.output_split_train_x)
print("Argument 3 (output training labels split path): %s" %
      args.output_split_train_y)
print("Argument 4 (output test features split path): %s" %
      args.output_split_test_x)
print("Argument 5 (output test labels split path): %s" %
      args.output_split_test_y)

input_data = dprep.read_csv(args.input_prepared_data)

# Columns used as model inputs vs the single label column.
feature_names = [
    'num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin',
    'bmi', 'diab_pred', 'age'
]
label_names = ['diabetes']

print("Features:")
print(feature_names)
print("Labels:")
print(label_names)

x_df = input_data.keep_columns(feature_names).to_pandas_dataframe()
y_df = input_data.keep_columns(label_names).to_pandas_dataframe()
# Split step: chronological 80/20 split of the (date, value) series into
# training and validation frames.
parser = argparse.ArgumentParser("split")
parser.add_argument("--input_data_frame", type=str, help="input data frame")
parser.add_argument("--output_train_frame", type=str,
                    help="output train frame")
parser.add_argument("--output_val_frame", type=str,
                    help="output validation frame")
args = parser.parse_args()

print("Argument 1(input data frame path): {}".format(args.input_data_frame))
print("Argument 2(output training frame path): {}".format(
    args.output_train_frame))
print("Argument 3(output validation frame path): {}".format(
    args.output_val_frame))

input_df = dprep.read_csv(
    path=args.input_data_frame,
    header=dprep.PromoteHeadersMode.SAMEALLFILES).to_pandas_dataframe()

split_at = int(input_df.shape[0] * 0.8)  # index of the first 20%-tail row
input_df = input_df[['date', 'value']]
train_df = input_df[:split_at]
val_df = input_df[split_at:]

if args.output_train_frame is not None or args.output_val_frame is not None:
    write_output(train_df, args.output_train_frame)
    write_output(val_df, args.output_val_frame)
parser = argparse.ArgumentParser("normalize") parser.add_argument("--input_normalize", type=str, help="combined and converted taxi data") parser.add_argument("--output_normalize", type=str, help="replaced undefined values and renamed columns") args = parser.parse_args() print("Argument 1(input taxi data path): %s" % args.input_normalize) print("Argument 2(output normalized taxi data path): %s" % args.output_normalize) combined_converted_df = dprep.read_csv(args.input_normalize + '/part-*') # These functions replace undefined values and rename to use meaningful names. # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details replaced_stfor_vals_df = combined_converted_df.replace( columns="store_forward", find="0", replace_with="N").fill_nulls("store_forward", "N") replaced_distance_vals_df = replaced_stfor_vals_df.replace( columns="distance", find=".00", replace_with=0).fill_nulls("distance", 0) replaced_distance_vals_df = replaced_distance_vals_df.to_number(["distance"]) time_split_df = (replaced_distance_vals_df.split_column_by_example( source_column="pickup_datetime").split_column_by_example(
def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
    """Resolve AutoML input data (X, y, weights, CV splits) from a dataprep JSON payload.

    The payload is either a set of serialized dataflows (has an 'activities'
    key) or a dict of read_csv options naming a datastore path plus label and
    feature columns.  Returns a dict of fit-iteration parameters; failures are
    classified into user/client error types and re-raised as RuntimeError.
    """
    current_run = Run.get_submitted_run()
    parent_run_id = _get_parent_run_id(current_run._run_id)
    print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
    logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
    try:
        import azureml.train.automl._dataprep_utilities as dataprep_utilities
    except Exception as e:
        # Import failure is unexpected — tag as unclassified and propagate.
        e.error_type = ErrorTypes.Unclassified
        log_traceback(e, logger)
        logger.error(e)
        raise e
    fit_iteration_parameters_dict = dict()

    # Marker exception: the label column could not be materialized as a
    # numpy array (IndexError from the retrieval helper).
    class RetrieveNumpyArrayError(Exception):
        def __init__(self):
            super().__init__()

    try:
        print("Resolving Dataflows...")
        logger.info("Resolving Dataflows...")
        dataprep_json_obj = json.loads(dataprep_json)
        if 'activities' in dataprep_json_obj:
            # json is serialized dataflows
            dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                dataprep_json)
            for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']:
                fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k))
            for k in ['y', 'y_valid']:
                try:
                    fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k))
                except IndexError:
                    raise RetrieveNumpyArrayError()
            # Collect numbered cv_splits_indices_<i> dataflows until a gap.
            cv_splits_dataflows = []
            i = 0
            while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                cv_splits_dataflows.append(
                    dataflow_dict['cv_splits_indices_{0}'.format(i)])
                i = i + 1
            fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
        else:
            # json is dataprep options
            print('Creating Dataflow from options...\r\nOptions:')
            logger.info('Creating Dataflow from options...')
            print(dataprep_json_obj)
            datastore_name = dataprep_json_obj['datastoreName']  # mandatory
            data_path = dataprep_json_obj['dataPath']  # mandatory
            label_column = dataprep_json_obj['label']  # mandatory
            separator = dataprep_json_obj.get('columnSeparator', ',')
            header = dataprep_json_obj.get('promoteHeader', True)
            encoding = dataprep_json_obj.get('encoding', None)
            quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
            skip_rows = dataprep_json_obj.get('skipRows', 0)
            feature_columns = dataprep_json_obj.get('features', [])
            from azureml.core import Datastore
            import azureml.dataprep as dprep
            # Map the boolean option onto the dprep header-promotion enum.
            if header:
                header = dprep.PromoteHeadersMode.CONSTANTGROUPED
            else:
                header = dprep.PromoteHeadersMode.NONE
            # Fall back to UTF-8 for unknown/absent encodings.
            try:
                encoding = dprep.FileEncoding[encoding]
            except:
                encoding = dprep.FileEncoding.UTF8
            ws = Run.get_context().experiment.workspace
            datastore = Datastore(ws, datastore_name)
            dflow = dprep.read_csv(path=datastore.path(data_path),
                                   separator=separator,
                                   header=header,
                                   encoding=encoding,
                                   quoting=quoting,
                                   skip_rows=skip_rows)
            # Features: everything but the label unless explicitly listed.
            if len(feature_columns) == 0:
                X = dflow.drop_columns(label_column)
            else:
                X = dflow.keep_columns(feature_columns)
            print('Inferring types for feature columns...')
            logger.info('Inferring types for feature columns...')
            sct = X.builders.set_column_types()
            sct.learn()
            sct.ambiguous_date_conversions_drop()
            X = sct.to_dataflow()
            y = dflow.keep_columns(label_column)
            if automl_settings_obj.task_type.lower() == 'regression':
                y = y.to_number(label_column)
            print('X:')
            print(X)
            logger.info('X:')
            logger.info(X)
            print('y:')
            print(y)
            logger.info('y:')
            logger.info(y)
            # Newer SDKs expose an ADB-aware retrieval that also yields raw
            # column names; fall back to the plain retrieval when absent.
            try:
                from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                _X = try_retrieve_pandas_dataframe_adb(X)
                fit_iteration_parameters_dict['X'] = _X.values
                fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values
            except ImportError:
                logger.info("SDK version does not support column names extraction, fallback to old path")
                fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X)
            try:
                fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y)
            except IndexError:
                raise RetrieveNumpyArrayError()
        logger.info("Finish getting data using dataprep.")
        return fit_iteration_parameters_dict
    except Exception as e:
        print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
        logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
        # Classify the failure (user vs client) by exception type/message
        # before re-raising a uniform RuntimeError.
        if isinstance(e, RetrieveNumpyArrayError):
            logger.debug("Label column (y) does not exist in user's data.")
            e.error_type = ErrorTypes.User
        elif "The provided path is not valid." in str(e):
            logger.debug("User's data is not accessible from remote run.")
            e.error_type = ErrorTypes.User
        elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e):
            logger.debug("User should use Datastore to data that requires secrets.")
            e.error_type = ErrorTypes.User
        else:
            e.error_type = ErrorTypes.Client
        log_traceback(e, logger)
        raise RuntimeError("Error during extracting Dataflows")
def joinTables(dataName, previousStageNumber, thisStageNumber, qualityFlag,
               operatorToUse, operationFlag):
    """Join this stage's dataflow against others per a join-config CSV.

    When *operationFlag* names a config file, each row describes one join
    (left/right data names, key columns, join type).  An inner join is always
    built; for "LEFT" joins the unmatched left rows are appended to emulate a
    left outer join.  Each joined dataflow is profiled, inventoried and saved.
    The original source dataflow is also re-saved for the next stage.

    Returns (dataFlow, columnInventory, dataFlowInventory), or
    (None, None, None) when no package exists for *dataName*.

    NOTE(review): pandas DataFrame.append used below is removed in pandas 2.x;
    pd.concat would be the forward-compatible replacement.
    """
    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)
    if dataFlow:
        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # Set up empty intermediate dataframes that we will use to build up inventories at both dataFlow and column level
        dataFlowInventoryIntermediate = pd.DataFrame()
        columnInventoryIntermediate = pd.DataFrame()

        if operationFlag != '':
            # Load config file
            joinConfig = dprep.read_csv('./Config/' +
                                        operationFlag).to_pandas_dataframe()

            # For each config in the file...
            for index, row in joinConfig.iterrows():
                leftDataName = row['LeftDataName']
                leftDataFlowJoinColumn = row['LeftDataFlowJoinColumn']
                rightDataName = row['RightDataName']
                rightDataFlowJoinColumn = row['RightDataFlowJoinColumn']
                joinType = row['JoinType']
                print(
                    '{0}: ready to join {1} {2} -> {3} {4} using jointype {5}'.
                    format(dataName, leftDataName, leftDataFlowJoinColumn,
                           rightDataName, rightDataFlowJoinColumn, joinType))

                # Load right hand data flow
                rightDataFlow, fullPackagePath = openDataFlowPackage(
                    rightDataName, previousStageNumber, qualityFlag)
                print('{0}: loaded package from path {1}'.format(
                    rightDataName, fullPackagePath))

                # We always perform the inner "MATCH" stype join
                join_builder = dataFlow.builders.join(
                    right_dataflow=rightDataFlow,
                    left_column_prefix=dataName + '_',
                    right_column_prefix=rightDataName + '_')
                join_builder.detect_column_info()
                join_builder.join_key_pairs = [(leftDataFlowJoinColumn,
                                                rightDataFlowJoinColumn)]
                # Setting up join type:
                # NONE = 0
                # MATCH = 2
                # UNMATCHLEFT = 4
                # UNMATCHRIGHT = 8
                join_builder.join_type = 2
                innerDataFlow = join_builder.to_dataflow()
                print('{0} created inner dataflow : Columns : {1}, Rows : {2}'.
                      format(dataName,
                             len(innerDataFlow.get_profile().columns),
                             innerDataFlow.row_count))

                if joinType == "LEFT":
                    # Use the "UNMATCHLEFT" setting to grab the rows that haven't been joined from the left data flow
                    join_builder.join_type = 4
                    leftUnmatchedDataFlow = join_builder.to_dataflow()
                    print(
                        '{0} created left unmatched dataflow : Columns : {1}, Rows : {2}'
                        .format(
                            dataName,
                            len(leftUnmatchedDataFlow.get_profile().columns),
                            leftUnmatchedDataFlow.row_count))
                    # Now append this dataflow to the original inner join dataflow, to create a "left outer join"
                    newDataFlow = innerDataFlow.append_rows(
                        [leftUnmatchedDataFlow])
                else:
                    newDataFlow = innerDataFlow

                # Create a new name for this data flow based on concatenation of left dataflow and right
                newDataName = dataName + '_' + rightDataName

                # Output key stats
                print('{0} left table : {0}, Columns : {1}, Rows : {2}'.format(
                    leftDataName, len(dataFlow.get_profile().columns),
                    dataFlow.row_count))
                print(
                    '{0} right table : {0}, Columns : {1}, Rows : {2}'.format(
                        rightDataName,
                        len(rightDataFlow.get_profile().columns),
                        rightDataFlow.row_count))
                newDataProfile = newDataFlow.get_profile()
                print(
                    '{0} joined table : {0}, Columns : {1}, Rows : {2}'.format(
                        newDataName, len(newDataProfile.columns),
                        newDataFlow.row_count))

                # Now generate column and data flow inventories
                columnInventory = getColumnStats(newDataProfile, newDataName,
                                                 thisStageNumber,
                                                 operatorToUse, operationFlag)
                dataFlowInventory = getDataFlowStats(
                    newDataFlow, newDataProfile, newDataName, thisStageNumber,
                    operatorToUse, operationFlag)

                # Capture the column inventory for the new dataflow
                columnInventoryIntermediate = columnInventoryIntermediate.append(
                    columnInventory)

                # Capture the data flow inventory for the new data flow
                dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
                    dataFlowInventory)

                # Finally save the data flow so it can be passed onto the next stage of the process...
                targetPackagePath = saveDataFlowPackage(
                    newDataFlow, newDataName, thisStageNumber, 'A')
                print('{0}: saved package to {1}'.format(
                    newDataName, targetPackagePath))
        else:
            print('{0}: no joining of tables required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        columnInventoryIntermediate = columnInventoryIntermediate.append(
            columnInventory)

        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)
        dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
            dataFlowInventory)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved source package to {1}'.format(
            dataName, targetPackagePath))

        return dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate
    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None