def fetch_df(step, output_name):
    """Download a pipeline step's named output and load it as a dataprep dataflow."""
    output_data = step.get_output_data(output_name)
    download_path = './outputs/' + output_name
    output_data.download(download_path)
    df_path = get_download_path(download_path, output_name) + '/data'
    return dprep.auto_read_file(path=df_path)
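# Hypothetical usage (not from the original module): pull a named output from a
# completed pipeline step and preview it. `pipeline_run`, the step name, and the
# output name below are placeholders.
step_run = pipeline_run.find_step_run('prep-step')[0]
prepped_dflow = fetch_df(step_run, 'prepped_data')
print(prepped_dflow.head(5))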
def load_azureml_df(local_cache_path=None, file_split=Split.TRAIN, file_type="txt"):
    """Loads the SNLI dataset as an AzureML dataflow object.

    Downloads the dataset from
    "https://nlp.stanford.edu/projects/snli/snli_1.0.zip", unzips, and loads it.

    Args:
        local_cache_path (str): Path (directory or a zip file) to cache the
            downloaded zip file. If None, all the intermediate files will be
            stored in a temporary directory and removed after use.
        file_split (str): File split to load. One of (dev, test, train).
        file_type (str): File type to load. One of (txt, jsonl).

    Returns:
        AzureML dataflow: SNLI dataset
    """
    with download_path(local_cache_path) as path:
        filepath = os.path.join(path, "snli_1.0.zip")
        snlipath = _maybe_download_and_extract(filepath, file_split, file_type)

        # NOTE: this works for the txt format but not the jsonl format
        df = dprep.auto_read_file(snlipath)

    return df
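# Illustrative call (assumes the Split enum and helpers imported elsewhere in this
# module): load the train split, then materialize a few rows for inspection.
snli_dflow = load_azureml_df(local_cache_path="./snli_cache", file_split=Split.TRAIN)
print(snli_dflow.head(5))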
def prepareDataForMLTraining(testSize, dataset_root, shouldShuffle):
    # dataset_root path must be set to where YOU are storing the .csv file.
    # Creates a Dataflow object from the .csv file.
    dataFlow = dprep.auto_read_file(dataset_root, False)

    # Defines the input (X) columns and the output-to-predict (Y) columns from the .csv file.
    dataflow_X = dataFlow.keep_columns([
        'Soil_Name', 'MEAN_Eleva', 'V.A.T(F)', 'R.A.T(F)', 'M.A.T(F)',
        'V.PET(inch)', 'R.PET(inch)', 'M.PET(inch)', 'V.T.R(inch)',
        'R.T.R(inch)'
    ])
    dataflow_Y = dataFlow.keep_columns('NormalizedYield')

    # Converts the data into pandas DataFrames to simplify splitting into train/test sets.
    x_dataFlow_Pandas = dataflow_X.to_pandas_dataframe()
    y_dataFlow_Pandas = dataflow_Y.to_pandas_dataframe()

    # Splits the data into a train portion and a test portion.
    # User-defined parameters control how the dataset is split for training/testing:
    # testSize should be a float between 0 and 1; the train size is 1 - testSize.
    # shouldShuffle determines whether the data is shuffled before the split.
    X_toTrain, X_toTest, Y_toTrain, Y_toTest = train_test_split(
        x_dataFlow_Pandas, y_dataFlow_Pandas,
        test_size=testSize, random_state=173, shuffle=shouldShuffle)

    return (X_toTrain, X_toTest, Y_toTrain, Y_toTest)
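# Illustrative call (the CSV path is a placeholder, not from the original code):
# hold out 20% of the rows for testing and shuffle before splitting.
X_train, X_test, Y_train, Y_test = prepareDataForMLTraining(
    testSize=0.2,
    dataset_root='./data/yield_model_input.csv',  # placeholder path
    shouldShuffle=True)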
def test_fit_transform(self):
    import azureml.dataprep as dprep

    path = get_dataset('infert').as_filepath()
    dflow = dprep.auto_read_file(path=path)
    dprep_data = DprepDataStream(dflow)
    file_data = FileDataStream.read_csv(path)

    xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})
    pipe = Pipeline([xf])

    transformed_data = pipe.fit_transform(file_data)
    transformed_data1 = pipe.fit_transform(dprep_data)
    assert_array_equal(transformed_data.columns, transformed_data1.columns)
    assert_2d_array_equal(transformed_data.values, transformed_data1.values)
from azureml.core import Webservice
from azureml.core.webservice import AciWebservice

# setting the local env to handle missing packages
run_user_managed = RunConfiguration()
run_user_managed.environment.python.user_managed_dependencies = False

# Create a workspace object for an existing workspace and create an experiment
ws = Workspace.from_config('subscription.json')
print(ws.name, ws.location, ws.resource_group, ws.location, sep='\t')
experiment = Experiment(workspace=ws, name='experiment1')

# Full paths to the training and testing data
file_path1 = os.path.join(os.getcwd(), "cumodelwo2014.csv")
dflowtr = dprep.auto_read_file(path=file_path1)
file_path2 = os.path.join(os.getcwd(), "test2014.csv")
dflowte = dprep.auto_read_file(path=file_path2)

# Specify the x (causal) and y (response) attributes in the training data
dflowtr_x = dflowtr.keep_columns([
    'cell-ID', 'Soil_Name', 'MEAN_Yld_V', 'COUNT_Yld', 'MEAN_Eleva',
    'RANGE_Elev', 'Crop-Type', 'V.A.T(F)', 'R.A.T(F)', 'M.A.T(F)',
    'V.PET(inch)', 'R.PET(inch)', 'M.PET(inch)', 'V.T.R(inch)',
    'R.T.R(inch)', 'M.T.R(inch)'
])
dflowtr_y = dflowtr.keep_columns('NormalizedYield')

# Causal factors in the training dataframe
trainingx_df = dflowtr_x.to_pandas_dataframe()
# Response variable in the training dataframe
trainingy_df = dflowtr_y.to_pandas_dataframe()
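# A sketched continuation (assumed, not part of the original script): apply the
# same column selection to the held-out test dataflow so the train and test
# frames line up for evaluation.
causal_columns = [
    'cell-ID', 'Soil_Name', 'MEAN_Yld_V', 'COUNT_Yld', 'MEAN_Eleva',
    'RANGE_Elev', 'Crop-Type', 'V.A.T(F)', 'R.A.T(F)', 'M.A.T(F)',
    'V.PET(inch)', 'R.PET(inch)', 'M.PET(inch)', 'V.T.R(inch)',
    'R.T.R(inch)', 'M.T.R(inch)'
]
testingx_df = dflowte.keep_columns(causal_columns).to_pandas_dataframe()
testingy_df = dflowte.keep_columns('NormalizedYield').to_pandas_dataframe()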
# ***
# ## Part 2 - Data Ingestion

# %%
# Load the train and test files as Pandas dataframes...
df_train = pd.read_csv('./kaggle/input/train.csv')
df_train.shape

# %%
df_train.info()

# %%
df_test = pd.read_csv('./kaggle/input/test.csv')
df_test.shape

# %%
dataflow = dprep.auto_read_file('./kaggle/input/train.csv')
dataflow.head(10)

# %%
dataflow_test = dprep.auto_read_file('./kaggle/input/test.csv')
dataflow_test.head(10)

# %% [markdown]
# ***
# ## Part 3 - Feature Engineering I
#
# Feature engineering necessary to support exploratory data analysis (EDA) below.
#
# In this section a range of feature extraction actions are performed using Microsoft's ML Dataprep SDK for Python:
# 1. Append the test data set to the training data set so that feature engineering can be addressed collectively;
# 2. Fill in empty values in "Cabin" and "Fare" where simple rules can be applied;
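# %%
# A sketch of how step 1 above might look with the Dataprep SDK (assumed, not
# from the original notebook): append the test rows to the training dataflow so
# later feature engineering applies to both sets at once.
dataflow_all = dataflow.append_rows([dataflow_test])
dataflow_all.head(10)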
def main():
    # local compute
    run_user_managed = RunConfiguration()
    run_user_managed.environment.python.user_managed_dependencies = False

    # print to check azure sdk installation
    print(azureml.core.VERSION)

    # create workspace object to connect to omtest workspace in MLSERVICE
    ws = Workspace.from_config('./config.json')

    # default data store
    # ds = ws.get_default_datastore()
    # print(ds)

    # choose a name for the run history container in the workspace
    experiment_name = 'automated-ml-regression'
    # project folder
    project_folder = './automated-ml-regression'

    output = {}
    output['SDK version'] = azureml.core.VERSION
    output['Subscription ID'] = ws.subscription_id
    output['Workspace'] = ws.name
    output['Resource Group'] = ws.resource_group
    output['Location'] = ws.location
    output['Project Directory'] = project_folder
    pd.set_option('display.max_colwidth', -1)
    pd.DataFrame(data=output, index=['']).T

    # stats for all the columns
    dflow = dprep.auto_read_file(
        path='/Users/omprakashnekkanti/Desktop/Spring 2019/CS445-Capstone/automatedML/cuformodel.csv')
    print(type(dflow))
    dflow.get_profile()

    # filepath as a string
    file_path = os.path.join(os.getcwd(), 'cuformodel.csv')
    print(file_path)
    print(type(file_path))
    # dflow_prepared = dprep.Dataflow.open(file_path)
    # dflow_prepared.get_profile()

    dflow_X = dflow.keep_columns([
        'cell-ID', 'Soil_Name', 'MEAN_Yld_V', 'COUNT_Yld', 'MEAN_Eleva',
        'RANGE_Elev', 'Crop-Type', 'V.A.T(F)', 'R.A.T(F)', 'M.A.T(F)',
        'V.PET(inch)', 'R.PET(inch)', 'M.PET(inch)', 'V.T.R(inch)',
        'R.T.R(inch)', 'M.T.R(inch)'
    ])
    dflow_y = dflow.keep_columns('NormalizedYield')
    x_df = dflow_X.to_pandas_dataframe()
    y_df = dflow_y.to_pandas_dataframe()

    x_train, x_test, y_train, y_test = train_test_split(
        x_df, y_df, test_size=0.2, random_state=223)

    # flatten y_train to 1d array
    y_train.values.flatten()

    automl_settings = {
        "iteration_timeout_minutes": 20,
        "iterations": 40,
        "primary_metric": 'mean_absolute_error',
        "preprocess": False,
        "verbosity": logging.INFO,
        "n_cross_validations": 10
    }

    # local compute
    automated_ml_config = AutoMLConfig(
        task='regression',
        debug_log='automated_ml_errors.log',
        path=project_folder,
        X=x_train.values,
        y=y_train.values.flatten(),
        **automl_settings)

    experiment = Experiment(ws, experiment_name)
    local_run = experiment.submit(automated_ml_config, show_output=True)
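    # A possible follow-up (sketch, not part of the original script): once the
    # submitted run completes, retrieve the best child run and its fitted model.
    best_run, fitted_model = local_run.get_output()
    print(best_run)
    print(fitted_model)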
def to_dataflow(self) -> 'dprep.Dataflow':
    """Read the file behind this object's URL into an azureml.dataprep Dataflow."""
    import azureml.dataprep as dprep
    return dprep.auto_read_file(self.get_url())
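# Illustrative usage from client code (a sketch; `dataset` stands for an instance
# of the class that defines to_dataflow):
#     dflow = dataset.to_dataflow()
#     print(dflow.head(5))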
# COMMAND ----------

import azureml.dataprep as dprep

# COMMAND ----------

dataset_root = "https://dprepdata.blob.core.windows.net/demo"

green_path = "/".join([dataset_root, "green-small/*"])
yellow_path = "/".join([dataset_root, "yellow-small/*"])

green_df = dprep.read_csv(path=green_path, header=dprep.PromoteHeadersMode.GROUPED)
# auto_read_file will automatically identify and parse the file type, and is useful if you don't know the file type
yellow_df = dprep.auto_read_file(path=yellow_path)

green_df.head(5)
yellow_df.head(5)

# COMMAND ----------

all_columns = dprep.ColumnSelector(term=".*", use_regex=True)
drop_if_all_null = [
    all_columns,
    dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)
]
useful_columns = [
    "cost", "distance", "dropoff_datetime", "dropoff_latitude",
    "dropoff_longitude", "passengers", "pickup_datetime",
    "pickup_latitude", "pickup_longitude", "store_forward", "vendor"
]
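# COMMAND ----------

# Sketch of a typical next step (assumed continuation, not from the original
# notebook): treat empty strings as nulls, then drop rows in which every column
# is null. In the full walkthrough the raw columns are renamed before
# keep_columns(columns=useful_columns) is applied.
green_df_clean = (green_df
    .replace_na(columns=all_columns)
    .drop_nulls(*drop_if_all_null))
green_df_clean.head(5)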
parser.add_argument("--d_ts_2", type=str, help="d_ts_2") parser.add_argument("--output", type=str, help="output") args = parser.parse_args() # parse inputs h_ts_1_dr = args.h_ts_1 h_ts_2_dr = args.h_ts_2 h_ts_3_dr = args.h_ts_3 d_ts_1_dr = args.d_ts_1 d_ts_2_dr = args.d_ts_2 # ======================= # azureml-dataprep-sdk.py # ======================= h_ts_1_dflow = dprep.auto_read_file(h_ts_1_dr) h_ts_2_dflow = dprep.auto_read_file(h_ts_2_dr) h_ts_3_dflow = dprep.auto_read_file(h_ts_3_dr) d_ts_1_dflow = dprep.auto_read_file(d_ts_1_dr) d_ts_2_dflow = dprep.auto_read_file(d_ts_2_dr) # Pivot data h_ts_1_pivot_dflow = h_ts_1_dflow.pivot( ['NODE_ID'], 'MW', azureml.dataprep.api.engineapi.typedefinitions.SummaryFunction.MAX, ['MYDATE', 'HOUR']) h_ts_2_pivot_dflow = h_ts_2_dflow.pivot( ['NODE_ID'], 'MW', azureml.dataprep.api.engineapi.typedefinitions.SummaryFunction.MAX, ['MYDATE', 'HOUR'])