Example #1
    # Imports needed by this excerpt:
    import os
    from azureml.core import Workspace, ComputeTarget, Datastore
    from azureml.core.container_registry import ContainerRegistry
    from azureml.train.dnn import PyTorch

    # subscription_id = config.subscription_id
    # resource_group = config.resource_group
    # workspace_name = config.workspace_name
    ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)

    # cluster_name = config.cluster_name
    ct = ComputeTarget(workspace=ws, name=cluster_name)
    # datastore_name = config.datastore_name
    ds = Datastore(workspace=ws, name=datastore_name)

    workdir = os.path.realpath('.')[os.path.realpath('.').find('FixMatch-pytorch'):]
    workdir = workdir.replace('\\', '/')

    script_params = {
        "--workdir": ds.path('/projects/'+workdir).as_mount(), # REQUIRED !!!
        "--cxk_volna": ds.path('/').as_mount(),
        "--exp_name": workdir.split('/')[-1],
    }

    def make_container_registry(address, username, password):
        cr = ContainerRegistry()
        cr.address = address
        cr.username = username
        cr.password = password
        return cr


    estimator = PyTorch(source_directory='./',
                        script_params=script_params,
                        compute_target=ct,
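
The PyTorch(...) call above is truncated in the source listing. A minimal sketch of how such an estimator is typically completed and submitted; the entry script name, use_gpu flag, and experiment name are assumptions, not taken from the original project:

    # Sketch only -- 'train.py' and the experiment name are placeholders.
    from azureml.core import Experiment

    estimator = PyTorch(source_directory='./',
                        script_params=script_params,
                        compute_target=ct,
                        entry_script='train.py',
                        use_gpu=True)

    run = Experiment(ws, name='fixmatch-experiment').submit(estimator)
    run.wait_for_completion(show_output=True)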
Example #2
    def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
        current_run = Run.get_submitted_run()
        parent_run_id = _get_parent_run_id(current_run._run_id)
        print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        try:
            import azureml.train.automl._dataprep_utilities as dataprep_utilities
        except Exception as e:
            e.error_type = ErrorTypes.Unclassified
            log_traceback(e, logger)
            logger.error(e)
            raise e

        fit_iteration_parameters_dict = dict()

        class RetrieveNumpyArrayError(Exception):
            """Raised when the label column (y) cannot be resolved to a numpy array."""

        try:
            print("Resolving Dataflows...")
            logger.info("Resolving Dataflows...")
            dataprep_json_obj = json.loads(dataprep_json)
            if 'activities' in dataprep_json_obj: # json is serialized dataflows
                dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                    dataprep_json)
                for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']:
                    fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k))
                for k in ['y', 'y_valid']:
                    try:
                        fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k))
                    except IndexError:
                        raise RetrieveNumpyArrayError()

                cv_splits_dataflows = []
                i = 0
                while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                    cv_splits_dataflows.append(
                        dataflow_dict['cv_splits_indices_{0}'.format(i)])
                    i = i + 1
                fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                    else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
            else: # json is dataprep options
                print('Creating Dataflow from options...\r\nOptions:')
                logger.info('Creating Dataflow from options...')
                print(dataprep_json_obj)
                datastore_name = dataprep_json_obj['datastoreName'] # mandatory
                data_path = dataprep_json_obj['dataPath'] # mandatory
                label_column = dataprep_json_obj['label'] # mandatory
                separator = dataprep_json_obj.get('columnSeparator', ',')
                header = dataprep_json_obj.get('promoteHeader', True)
                encoding = dataprep_json_obj.get('encoding', None)
                quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
                skip_rows = dataprep_json_obj.get('skipRows', 0)
                feature_columns = dataprep_json_obj.get('features', [])

                from azureml.core import Datastore
                import azureml.dataprep as dprep
                if header:
                    header = dprep.PromoteHeadersMode.CONSTANTGROUPED
                else:
                    header = dprep.PromoteHeadersMode.NONE
                try:
                    encoding = dprep.FileEncoding[encoding]
                except Exception:
                    # fall back to UTF-8 when the requested encoding name is unknown
                    encoding = dprep.FileEncoding.UTF8

                ws = Run.get_context().experiment.workspace
                datastore = Datastore(ws, datastore_name)
                dflow = dprep.read_csv(path=datastore.path(data_path),
                                        separator=separator,
                                        header=header,
                                        encoding=encoding,
                                        quoting=quoting,
                                        skip_rows=skip_rows)

                if len(feature_columns) == 0:
                    X = dflow.drop_columns(label_column)
                else:
                    X = dflow.keep_columns(feature_columns)

                print('Inferring types for feature columns...')
                logger.info('Inferring types for feature columns...')
                sct = X.builders.set_column_types()
                sct.learn()
                sct.ambiguous_date_conversions_drop()
                X = sct.to_dataflow()

                y = dflow.keep_columns(label_column)
                if automl_settings_obj.task_type.lower() == 'regression':
                    y = y.to_number(label_column)

                print('X:')
                print(X)
                logger.info('X:')
                logger.info(X)

                print('y:')
                print(y)
                logger.info('y:')
                logger.info(y)

                try:
                    from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                    _X = try_retrieve_pandas_dataframe_adb(X)
                    fit_iteration_parameters_dict['X'] = _X.values
                    fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values
                except ImportError:
                    logger.info("SDK version does not support column names extraction, fallback to old path")
                    fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X)

                try:
                    fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y)
                except IndexError:
                    raise RetrieveNumpyArrayError()

            logger.info("Finish getting data using dataprep.")
            return fit_iteration_parameters_dict
        except Exception as e:
            print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            if isinstance(e, RetrieveNumpyArrayError):
                logger.debug("Label column (y) does not exist in user's data.")
                e.error_type = ErrorTypes.User
            elif "The provided path is not valid." in str(e):
                logger.debug("User's data is not accessible from remote run.")
                e.error_type = ErrorTypes.User
            elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e):
                logger.debug("User should use Datastore to data that requires secrets.")
                e.error_type = ErrorTypes.User
            else:
                e.error_type = ErrorTypes.Client
            log_traceback(e, logger)
            raise RuntimeError("Error while extracting Dataflows")
Example #3
#%% now you can upload that directory to blob storage
# I use the date to differentiate the different versions
# (assumes `ws` is the Workspace and `blob_store` is the Datastore obtained earlier)
from datetime import datetime
from azureml.core import Dataset, Model

blob_path = f"Campus_Recruitment/{datetime.now().strftime('%Y-%m-%d')}"  # if None, uploads to the datastore root
local_path = "./Upload/Data"

blob_store.upload(src_dir=local_path, 
                  target_path=blob_path,
                  overwrite=True, 
                  show_progress=True)

#%% 
# ** Register the data as a dataset **
# %% now that the data is up on the blob store we can register it as a dataset
# to keep track of its versions and make it easily accessible
dataset = Dataset.File.from_files(blob_store.path(blob_path + "/data.csv"))
dataset.register(ws, 
                 name="Campus_Recruitment_PCA_Training_Data",
                 create_new_version=True)
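
Once registered, the dataset can be retrieved by name in any later session; a minimal sketch, assuming the same workspace object `ws` and the name registered above:

#%%
from azureml.core import Dataset

# Pulls the latest registered version; pass `version=` to pin a specific one.
training_data = Dataset.get_by_name(ws, name="Campus_Recruitment_PCA_Training_Data")
training_data.download(target_path="./Download/Data", overwrite=True)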

#%% 
# ** Upload and register the model as a Model **
#%% 
model = Model.register(workspace=ws,
                       model_name='Campus_Recruitment_PCA',                # Name of the registered model in your workspace.
                       model_path='./Upload/Model/model.pkl',  # Local file to upload and register as a model.
                      
                       sample_input_dataset=dataset,
                       sample_output_dataset=None,
                      
                       description='PCA model for dimension reduction of the Campus Recruitment Dataset',
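
The Model.register call above is truncated in the source listing; once registration completes, the model can be looked up again by name. A small sketch, assuming the same workspace:

#%%
from azureml.core import Model

# Fetches the latest registered version of the model by name.
pca_model = Model(ws, name='Campus_Recruitment_PCA')
print(pca_model.name, pca_model.version)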
Example #4
File: run.py Project: hwuu/hello-tf
# Excerpt assumes `ws` (an azureml.core.Workspace) is created earlier in run.py.
from azureml.core import Datastore
from azureml.core.compute import ComputeTarget
from azureml.train.estimator import Estimator

ct = ComputeTarget(workspace=ws, name="cpucluster-II")
ds = Datastore(workspace=ws, name="hellotfstore")

#
# Create an estimator.
#

# Single node
est_1 = Estimator(
    compute_target=ct,  # the ComputeTarget looked up above
    use_gpu=False,
    node_count=1,
    pip_packages=['tensorflow==1.13.1'],
    source_directory="../",
    entry_script="mnist-mlp.py",
    script_params={"--data-dir": ds.path("data/mnist").as_mount()})

# Distributed with PS architecture
#est_2 = ...

# Distributed with Horovod
est_3 = Estimator(
    compute_target=ct,
    use_gpu=False,
    node_count=2,
    process_count_per_node=2,
    distributed_backend='mpi',
    pip_packages=['tensorflow==1.13.1', 'horovod'],
    source_directory="../",
    entry_script="mnist-mlp-dist-hvd.py",
    script_params={"--data-dir": ds.path("data/mnist").as_mount()})
Example #5
# In[ ]:


# Module
select_columns_in_dataset = Module.load(ws, namespace='azureml', name='Select Columns in Dataset')
clean_missing_data = Module.load(ws, namespace='azureml', name='Clean Missing Data')
split_data = Module.load(ws, namespace='azureml', name='Split Data')
join_data = Module.load(ws, namespace='azureml', name='Join Data')


# Dataset
try:
    dset = Dataset.get_by_name(ws, 'Automobile_price_data_(Raw)')
except Exception:
    global_datastore = Datastore(ws, name="azureml_globaldatasets")
    dset = Dataset.File.from_files(global_datastore.path('GenericCSV/Automobile_price_data_(Raw)'))
    dset.register(workspace=ws,
                  name='Automobile_price_data_(Raw)',
                  create_new_version=True)
blob_input_data = dset


# In[ ]:


# sub pipeline: TODO improve this experience
@dsl.pipeline(name='sub sub', description='sub')
def sub_sub_pipeline(minimum_missing_value_ratio):
    module1 = select_columns_in_dataset(
        dataset=blob_input_data,
        select_columns="{\"isFilter\":true,\"rules\":[{\"exclude\":false,\"ruleType\":\"AllColumns\"},"
Example #6
ejoin_module_func = Module.register(
    ws, os.path.join('modules', 'ejoin', 'amlmodule.yaml'))
eselect_module_func = Module.register(
    ws, os.path.join('modules', 'eselect', 'amlmodule.yaml'))

join_data_module_func = Module.load(ws, namespace='azureml', name='Join Data')
train_svd_recommender_module_func = Module.load(ws,
                                                namespace='azureml',
                                                name='Train SVD Recommender')

# datasets
input1 = Dataset.get_by_name(ws, 'query data (large)')
input2 = Dataset.get_by_name(ws, 'query data (small)')
global_datastore = Datastore(ws, name="azureml_globaldatasets")
movie_ratings_data = Dataset.File.from_files(
    global_datastore.path('GenericCSV/Movie_Ratings')).as_named_input(
        'Movie_Ratings')
imdb_movie_titles_data = Dataset.File.from_files(
    global_datastore.path('GenericCSV/IMDB_Movie_Titles')).as_named_input(
        'IMDB_Movie_Titles')

# In[ ]:

# steps
ejoin = ejoin_module_func().set_parameters(
    leftcolumns='m:query;querId',
    # missing 'rightcolumns' parameter
    leftkeys='m:query',
    rightkeys='m:Query',
    jointype='HashInner').set_inputs(left_input=input1, right_input=input2)

Example #7
def main():
    # Get our configs
    # (this excerpt assumes `import json`, `import shutil`, `from pathlib import Path`,
    # the azureml.core classes used below, Estimator from azureml.train.estimator,
    # and an argparse `args` object defined elsewhere in the file)
    with open("ptgnn/authentication.json") as jsonFile:
        authData = json.load(jsonFile)[args.auth_cluster]

    # Copy the convertCorpus script here. Done so we don't upload the corpus to Azure, or keep a copy of the script in here.
    # (It's weird, I know. It works and has a purpose though)
    convertCorpusLocation = Path("../convertCorpusForML.py")
    convertCorpusAzureLocation = Path("./convertCorpusForML.py")
    shutil.copy(convertCorpusLocation, convertCorpusAzureLocation)

    # Grab the authentication data from the JSON file
    subID = authData["subID"]  # Get from Azure Portal; used for billing
    resGroup = authData["resGroup"]  # Name for the resource group
    wsName = authData["wsName"]  # Name for the workspace, which is the collection of compute clusters + experiments
    computeName = authData["computeName"]  # Name for computer cluster
    datastoreName = authData["datastoreName"]

    # Get the workspace, the compute target and the datastore
    ws = Workspace.get(wsName, subscription_id=subID, resource_group=resGroup)
    computeTarget = ComputeTarget(ws, computeName)
    datastore = Datastore(ws, name=datastoreName)

    # Download the entire corpus to the compute target. Save the DataReference obj here
    # as_mount is also possible, but slows things down due to network opening of files
    corpus_location = datastore.path(args.aml_location).as_download()
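    # Alternative (sketch, not in the original script): mount the path instead of
    # downloading it, trading the up-front copy for slower, network-backed file access.
    # corpus_location = datastore.path(args.aml_location).as_mount()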
    output_location = "./"
    # The files that will be uploaded for usage by our script (everything in the azure folder)
    source_directory = "."

    # params for the script
    params = {
        "--corpus_location": corpus_location,
        "--output_folder": output_location,
        "--aml": "",
        "--training_percent": args.training_percent,
        "--validation_percent": args.validation_percent,
        "-c": ""
    }
    if args.log_num is not None:
        params["-l"] = args.log_num
        tags = {
            "logs": str(args.log_num)
        }
    else:
        tags = {
            "logs": "MAX"
        }
    if args.statement_generation:
        params["-s"] = ""
        tags["generationType"] = "Statement"
    else:
        tags["generationType"] = "Severity"
    # Set up the estimator object. Note the inputs element, it tells azure that corpus_location in params
    # will be a DataReference Object.
    est = Estimator(source_directory=source_directory,
                    compute_target=computeTarget,
                    entry_script='convertCorpusForML.py',
                    script_params=params,
                    inputs=[corpus_location],
                    conda_packages=["pip"],
                    pip_packages=["azureml-core", "tqdm", "numpy", "protobuf"],
                    use_docker=True,
                    use_gpu=False)
    # Start the experiment
    run = Experiment(ws, args.exp_name).submit(config=est, tags=tags)
    # remove the copy of convertCorpus (Remember, don't question this)
    convertCorpusAzureLocation.unlink()
    # print out the portal URL
    # print("Portal URL: ", run.get_portal_url())
    # this will stream everything that the compute target does.
    print("Experiment Started. Remember you can exit out of this program but the experiment will still run on Azure!")
    run.wait_for_completion(show_output=True)