def _setup_datastore(self, blob_dataset_name, output_path=None):
        """
        sets up the datastore in azureml. Either retrieves a pre-existing datastore
        or registers a new one in the workspace.

        :param str blob_dataset_name: [required] name of the datastore registered with the
                                 workspace. If the datastore does not yet exist, the
                                 name it will be registered under.
        :param str output_path: [optional] if registering a datastore for inferencing,
                                the output path for writing back predictions.
        """
        try:
            self.blob_ds = Datastore.get(self.ws, blob_dataset_name)
            print("Found Blob Datastore with name: %s" % blob_dataset_name)
        except HttpOperationError:
            self.blob_ds = Datastore.register_azure_blob_container(
                workspace=self.ws,
                datastore_name=blob_dataset_name,
                account_name=self.account_name,
                container_name=self.container_name,
                account_key=self.account_key,
                subscription_id=self.blob_sub_id,
            )

            print("Registered blob datastore with name: %s" %
                  blob_dataset_name)
        if output_path is not None:
            self.output_dir = PipelineData(
                name="output",
                datastore=self.ws.get_default_datastore(),
                output_path_on_compute=output_path)
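When output_path is given, the resulting PipelineData (self.output_dir above) is typically passed to a pipeline step as an output. A minimal sketch of that wiring, with hypothetical names (output_dir stands for that PipelineData; the compute target and scoring script are assumed to already exist):

# Hypothetical consumer of the PipelineData created by _setup_datastore.
from azureml.pipeline.steps import PythonScriptStep

score_step = PythonScriptStep(
    name="batch_score",
    script_name="score.py",                  # hypothetical entry script
    source_directory="./scoring",            # hypothetical folder
    arguments=["--output_dir", output_dir],  # PipelineData from _setup_datastore
    outputs=[output_dir],
    compute_target=compute_target,           # assumed to already exist
)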
Example #2
    def __init__(self):
        self._parser = argparse.ArgumentParser("train")
        self._parser.add_argument(
            "--release_id",
            type=str,
            help="The ID of the release triggering this pipeline run")
        self._parser.add_argument("--model_name",
                                  type=str,
                                  help="Name of the tf model")
        self._parser.add_argument("--ckpt_path",
                                  type=str,
                                  help="Chekpoint path",
                                  default="checkpoint/yolov3.ckpt")
        self._parser.add_argument("--datastore",
                                  type=str,
                                  help="Name of the datastore",
                                  default="epis_datastore")
        self._parser.add_argument("--storage_container",
                                  type=str,
                                  help="Name of the storage container",
                                  default="ppe")

        self._args = self._parser.parse_args()
        self._run = Run.get_context()
        self._exp = self._run.experiment
        self._ws = self._run.experiment.workspace
        self._tb = Tensorboard([self._run])
        self._datastore = Datastore.get(self._ws,
                                        datastore_name=self._args.datastore)
Example #3
    def __init__(self):
        self._parser = argparse.ArgumentParser("evaluate")
        self._parser.add_argument(
            "--release_id",
            type=str,
            help="The ID of the release triggering this pipeline run")
        self._parser.add_argument("--model_name",
                                  type=str,
                                  help="Name of the tf model")
        self._parser.add_argument("--ckpt_path",
                                  type=str,
                                  help="Chekpoint path",
                                  default="checkpoint/yolov3.ckpt")
        self._parser.add_argument("--datastore",
                                  type=str,
                                  help="Name of the datastore",
                                  default="epis_datastore")
        self._parser.add_argument("--storage_container",
                                  type=str,
                                  help="Name of the storage container",
                                  default="ppe")

        self._args = self._parser.parse_args()
        self._run = Run.get_context()
        self._exp = self._run.experiment
        self._ws = self._run.experiment.workspace
        self._datastore = Datastore.get(self._ws,
                                        datastore_name=self._args.datastore)

        self._INPUT_SIZE = 416
        self._NUM_CLASS = len(utils.read_class_names(cfg.YOLO.CLASSES))
        self._CLASSES = utils.read_class_names(cfg.YOLO.CLASSES)

        self._predicted_dir_path = 'mAP/predicted'
        self._ground_truth_dir_path = 'mAP/ground-truth'
Example #4
def update_dataset(ws, datastore_name, dataset, time_stamp):
    datastore = Datastore.get(ws, datastore_name)
    #datastore = adlsgen2_datastore

    if dataset["dataset_name"] in ws.datasets:
        print("Dataset " + dataset["dataset_name"] + " already created in " +
              ws.name + ", will update to new version...")
    else:
        print("Dataset " + dataset["dataset_name"] +
              " is new and will be created in " + ws.name + "...")

    # create a TabularDataset from the path in the datastore
    datastore_paths = [(datastore, dataset["dataset_path"])]
    retrieved_dataset = Dataset.Tabular.from_delimited_files(
        path=datastore_paths)

    # Register the dataset (creating a new version if needed).
    # The timestamp in the description makes it easier to see that the same dataset
    # was registered at the same time in different workspaces, if you want to filter on it.
    retrieved_dataset = retrieved_dataset.register(
        workspace=ws,
        name=dataset["dataset_name"],
        description='versioned data, timestamp: ' + time_stamp,
        create_new_version=True)
    print("Updated dataset " + dataset["dataset_name"] + " in workspace " +
          ws.name + " at timestamp " + time_stamp)
    return retrieved_dataset
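The dataset argument above is expected to be a dict carrying the "dataset_name" and "dataset_path" keys the function reads. A minimal usage sketch, assuming a workspace config file and a datastore named "my_datastore" (both hypothetical):

# Hypothetical driver for update_dataset.
from datetime import datetime, timezone
from azureml.core import Workspace

ws = Workspace.from_config()
dataset_config = {
    "dataset_name": "iris_versioned",   # hypothetical dataset name
    "dataset_path": "iris/iris.csv",    # hypothetical path inside the datastore
}
time_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
updated = update_dataset(ws, "my_datastore", dataset_config, time_stamp)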
Example #5
    def __init__(self):
        self.__parser = argparse.ArgumentParser("preprocessing")
        self.__parser.add_argument("--datastore",
                                   type=str,
                                   help="Name of the datastore",
                                   default="workspaceblobstore")
        self.__parser.add_argument("--dataset_name",
                                   type=str,
                                   help="Name of the dataset")
        self.__parser.add_argument("--dataset_preprocessed_name",
                                   type=str,
                                   help="Standard preprocessed dataset")
        self.__parser.add_argument("--output_preprocess_dataset",
                                   type=str,
                                   help="Name of the PipelineData reference")

        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun

        if self.__local_run:
            self.__ws = Workspace.from_config('../../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'exploratory_analysis')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

        self.__datastore = Datastore.get(self.__ws,
                                         datastore_name=self.__args.datastore)
def write_results(df, cols, output_datastore, output_path, model, run):

    ws = run.experiment.workspace
    datastore = Datastore.get(ws, output_datastore)
    output_folder = tempfile.TemporaryDirectory(dir="/tmp")
    filename = os.path.join(output_folder.name, os.path.basename(output_path))
    print("Output filename: {}".format(filename))

    try:
        os.remove(filename)
    except OSError:
        pass

    df["ScoredLabels"] = model.predict(df[cols].astype(int).values)
    print("resultLabels", df["ScoredLabels"].iloc[:10])
    df["ScoredProbabilities"] = model.predict_proba(
        df[cols].astype(int).values)[:, 1]
    print("resultProbabilities", df["ScoredProbabilities"].iloc[:10])

    # Set CustomerId as the index so to_csv does not write an extra unnamed index column
    df = df.set_index("CustomerId")

    directory_name = os.path.dirname(output_path)
    print("Extracting Directory {} from path {}".format(
        directory_name, output_path))

    df.to_csv(filename)

    # Datastore.upload() still works, but it is being deprecated in favor of Dataset.File.upload_directory()
    # datastore.upload(src_dir=output_folder.name, target_path=directory_name, overwrite=False, show_progress=True)
    # upload_directory can occasionally fail.
    output_dataset = Dataset.File.upload_directory(src_dir=output_folder.name,
                                                   target=(datastore,
                                                           directory_name))
    return df
Example #7
    def create_pipeline(self):
        '''
        IRIS Data training and Validation
        '''        
        self.datastore = Datastore.get(self.workspace, self.workspace.get_default_datastore().name)
        print("Received datastore")
        input_ds = self.get_files_from_datastore(self.args.container_name,self.args.input_csv)
        final_df = input_ds.to_pandas_dataframe()
        print("Input DF Info",final_df.info())
        print("Input DF Head",final_df.head())

        X = final_df[["SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm"]]
        y = final_df[["Species"]]

        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=1984)
        
        model = DecisionTreeClassifier()
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        print("Model Score : ", model.score(X_test,y_test))

        joblib.dump(model, self.args.model_path)

        self.validate(y_test, y_pred, X_test)

        match = re.search(r'([^/]*)$', self.args.model_path)
        # Upload Model to Run artifacts
        self.run.upload_file(name=self.args.artifact_loc + match.group(1),
                                path_or_stream=self.args.model_path)

        print("Run Files : ", self.run.get_file_names())
        self.run.complete()
def setup_azureml():
    """
    Get an Azure ML workspace from environment variables.
    Assumes the following are created outside of the code in this project:
      AML workspace
      AML datastore
      AML compute resource for training (can be blank for inferencing)
      AML compute resource for inferencing (can be blank for training)
    """
    subscription_id = os.environ['AML_SUBSCRIPTION']
    resource_group = os.environ['AML_RESOURCE_GROUP']
    workspace_name = os.environ['AML_WORKSPACE']
    datastore_name = os.environ['AML_DATASTORE']
    training_target_name = os.environ.get('AML_COMPUTE')
    inference_target_name = os.environ.get('AML_INFERENCE_COMPUTE')
    ws = Workspace(subscription_id, resource_group, workspace_name)
    ds = Datastore.get(ws, datastore_name=datastore_name)
    if training_target_name:
        training_target = ws.compute_targets[training_target_name]
    else:
        training_target = None
    if inference_target_name:
        inference_target = ws.compute_targets[inference_target_name]
    else:
        inference_target = None
    return ws, ds, training_target, inference_target
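setup_azureml resolves everything from environment variables. A minimal sketch of driving it, with hypothetical placeholder values (the AML resources themselves must already exist):

# Hypothetical environment setup for setup_azureml.
import os

os.environ["AML_SUBSCRIPTION"] = "<subscription-id>"
os.environ["AML_RESOURCE_GROUP"] = "<resource-group>"
os.environ["AML_WORKSPACE"] = "<workspace-name>"
os.environ["AML_DATASTORE"] = "<datastore-name>"
os.environ["AML_COMPUTE"] = "<training-cluster>"              # optional
os.environ["AML_INFERENCE_COMPUTE"] = "<inference-cluster>"   # optional

ws, ds, training_target, inference_target = setup_azureml()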
Example #9
def main():
    # Connect to your AMLS Workspace and set your Datastore
    ws = run.experiment.workspace
    datastoreName = args.datastore_name
    datastore = Datastore.get(ws, datastoreName)
    print('Datastore Set')

    # Set your Time Zone
    timeZone = pytz.timezone(args.pytz_time_zone)
    timeLocal = dt.datetime.now(timeZone).strftime('%Y-%m-%d')
    print('Time Zone Set')

    # Specify your File Names
    trainFile = timeLocal + '/' + args.train_file_name
    valFile = timeLocal + '/' + args.val_file_name
    print('File Names Set for Training and Validation Data.')

    # Set Tags and Description
    description = args.project_description
    trainTags = set_tags(['Project', 'Dataset Type', 'Date Created'],\
                         [args.project_name, 'Training', timeLocal])
    valTags = set_tags(['Project', 'Dataset Type', 'Date Created'],\
                       [args.project_name, 'Validation', timeLocal])
    print("Dataset Tags and Description Assigned")

    # Register your Training data as an Azure Tabular Dataset
    register_dataset(ws, datastore, args.datastore_path, trainFile,
                     args.train_dataset_name, description, trainTags)
    print('Training Data Registered')

    # Register your Validation data as an Azure Tabular Dataset
    register_dataset(ws, datastore, args.datastore_path, valFile,
                     args.val_dataset_name, description, valTags)
    print('Validation Data Registered')
Example #10
def main():
    # workspace
    ws = Workspace.from_config()

    #compute
    compute = AmlCompute(workspace=ws, name='gandalf')

    # datasource
    datastore = Datastore.get(ws, datastore_name='surfrider')

    # experiment
    script_params = {
        "--datastore": datastore.as_mount()
    }

    # Create and run experiment
    estimator = Estimator(source_directory='./',
                            script_params=script_params,
                            compute_target=compute,
                            entry_script='train.py',
                            use_gpu=True,
                            pip_packages=['opencv-python>=4.1',
                                            'tensorpack==0.9.8',
                                            'tensorflow-gpu>=1.3,<2.0',
                                            'tqdm>=4.36.1',
                                            'cython>=0.29.13',
                                            'scipy>=1.3.1',
                                            'ffmpeg-python',
                                            'wget'])

    
    exp = Experiment(ws, 'surfrider_rcnn')
    run = exp.submit(estimator)
Example #11
def convert_voc_annotation(ws,
                           ds,
                           data_type,
                           anno_path,
                           container_name,
                           use_difficult_bbox=True):
    classes = ['helmet', 'none']

    datastore = Datastore.get(ws, datastore_name=ds)
    voc_dataset_annotations = datastore.blob_service.list_blobs(
        container_name, prefix='VOC/Annotations')
    voc_dataset_images = datastore.blob_service.list_blobs(
        container_name, prefix='VOC/JPEGImages')
    voc_dataset_imagesets = datastore.blob_service.list_blobs(
        container_name, prefix=f'VOC/ImageSets/Main/{data_type}.txt')

    voc_list_annotations = list(voc_dataset_annotations)
    print("Successfully listed annotations")
    voc_list_images = list(voc_dataset_images)
    print("Successfully listed images")
    voc_list_imagesets = list(voc_dataset_imagesets)
    print("Successfully listed imagesets")

    txt = datastore.blob_service.get_blob_to_text(container_name,
                                                  voc_list_imagesets[0].name)
    txt_split = txt.content.splitlines()
    image_inds = [line.strip() for line in txt_split]
    with open(anno_path, 'a') as f:
        for image_ind in image_inds:
            image_path = datastore.blob_service.make_blob_url(
                container_name, 'VOC/JPEGImages/' + image_ind + '.jpg')
            annotation = image_path
            label_path = datastore.blob_service.get_blob_to_text(
                container_name,
                'VOC/Annotations/' + image_ind + '.xml').content
            root = ET.fromstring(label_path)
            objects = root.findall('object')
            for obj in objects:
                difficult = obj.find('difficult').text.strip()
                if (not use_difficult_bbox) and (int(difficult) == 1):
                    continue
                bbox = obj.find('bndbox')
                class_ind = classes.index(
                    obj.find('name').text.lower().strip())
                xmin = bbox.find('xmin').text.strip()
                xmax = bbox.find('xmax').text.strip()
                ymin = bbox.find('ymin').text.strip()
                ymax = bbox.find('ymax').text.strip()
                annotation += ' ' + ','.join(
                    [xmin, ymin, xmax, ymax,
                     str(class_ind)])
            print(annotation)
            f.write(annotation + "\n")
    datastore.blob_service.create_blob_from_path(
        container_name,
        anno_path,
        anno_path,
        content_settings=ContentSettings(
            content_type=__get_mime_type(anno_path)))
Example #12
def register_dataset(aml_workspace: Workspace, dataset_name: str,
                     datastore_name: str, file_path: str) -> Dataset:
    datastore = Datastore.get(aml_workspace, datastore_name)
    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path))
    dataset = dataset.register(workspace=aml_workspace,
                               name=dataset_name,
                               create_new_version=True)

    return dataset
Example #13
def register_dataset(workspace, datastore_name, dataset_name, file_path):
    datastore = Datastore.get(workspace=workspace,
                              datastore_name=datastore_name)
    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path))
    dataset = dataset.register(
        workspace=workspace, name=dataset_name, create_new_version=True
    )  # creates a new version if the dataset already exists

    return dataset
Example #14
def main(_):
    # Export the trained model
    if not os.path.exists(FLAGS.export_dir):
        os.makedirs(FLAGS.export_dir)

    run.log('accuracy', float(0.91))
    run.log('val_accuracy', float(0.901))

    datastore = Datastore.get(ws, 'mtcseattle')
    datastore.download(FLAGS.export_dir, prefix="model")
Example #15
def prepare_data(workspace):
    datastore = Datastore.get(workspace, TRAINING_DATASTORE)
    x_train = get_df_from_datastore_path(datastore, 'train/X_train.csv')
    y_train = get_df_from_datastore_path(datastore, 'train/y_train.csv')
    y_train = y_train['Target']
    x_test = get_df_from_datastore_path(datastore, 'test/X_test.csv')
    y_test = get_df_from_datastore_path(datastore, 'test/y_test.csv')
    y_test = y_test['Target']
    x_train = remove_collinear_cols(x_train)
    x_test = remove_collinear_cols(x_test)
    return x_train, y_train, x_test, y_test
def get_dataset(ws: Workspace) -> Dataset:
    if _dataset_name not in ws.datasets:
        datastore = Datastore.get(ws, _datastore_name)

        datastore_paths = [(datastore, f"{_dataset_name}/*.csv")]

        dataset = Dataset.Tabular.from_delimited_files(path=datastore_paths)
        dataset.register(workspace=ws,
                         name=_dataset_name,
                         description="Names with sentiment scores")
    else:
        dataset = ws.datasets[_dataset_name]
    return dataset
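get_dataset relies on two module-level names that are not shown in the snippet. A hedged sketch of how they might be defined and how the function could be called:

# Hypothetical module-level constants assumed by get_dataset above.
from azureml.core import Workspace

_dataset_name = "names_with_sentiment"    # hypothetical registered dataset name
_datastore_name = "training_blob"         # hypothetical datastore name

ws = Workspace.from_config()
names_ds = get_dataset(ws)                # registers on first call, reuses afterwards
print(names_ds.to_pandas_dataframe().head())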
Example #17
def get_data(workspace):
    # blob_storage_client = __get_blob_storage_client()
    # blob_client = blob_storage_client.get_blob_client("raw", "iris/year=2020/month=10/day=05/iris.csv")
    datastore = Datastore.get(workspace, "train")
    datastore_path = [(datastore, "iris/year=2020/month=10/day=05/iris.csv")]
    dataset = Dataset.Tabular.from_delimited_files(path=datastore_path)
    # content = blob_client.download_blob().readall().decode()
    # dataframe = pd.read_csv(StringIO(content))
    dataframe = dataset.to_pandas_dataframe()
    x = dataframe.values[:, 0:4]
    y = dataframe.values[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    return x_train, y_train, x_test, y_test, dataset
Example #18
    def __init__(self):
        self._parser = argparse.ArgumentParser("mAP")
        self._parser.add_argument(
            "--release_id",
            type=str,
            help="The ID of the release triggering this pipeline run")
        self._parser.add_argument("--model_name",
                                  type=str,
                                  help="Name of the tf model")
        self._parser.add_argument("--ckpt_path",
                                  type=str,
                                  help="Chekpoint path",
                                  default="checkpoint/yolov3.ckpt")
        self._parser.add_argument("--datastore",
                                  type=str,
                                  help="Name of the datastore",
                                  default="epis_datastore")
        self._parser.add_argument("--storage_container",
                                  type=str,
                                  help="Name of the storage container",
                                  default="ppe")
        self._parser.add_argument('-na',
                                  '--no-animation',
                                  help="no animation is shown.",
                                  action="store_true")
        self._parser.add_argument('-np',
                                  '--no-plot',
                                  help="no plot is shown.",
                                  action="store_true")
        self._parser.add_argument('-q',
                                  '--quiet',
                                  help="minimalistic console output.",
                                  action="store_true")
        self._parser.add_argument('-i',
                                  '--ignore',
                                  nargs='+',
                                  type=str,
                                  help="ignore a list of classes.")
        self._parser.add_argument('--set-class-iou',
                                  nargs='+',
                                  type=str,
                                  help="set IoU for a specific class.")

        self._args = self._parser.parse_args()
        self._run = Run.get_context()
        self._exp = self._run.experiment
        self._ws = self._run.experiment.workspace
        self._datastore = Datastore.get(self._ws,
                                        datastore_name=self._args.datastore)

        self._MINOVERLAP = 0.5
Example #19
def prepare_data(workspace):
    training_datastore = Datastore.get(workspace, TRAINING_DATASTORE)
    #validation_datastore = Datastore.get(workspace, SCORING_CONTAINER)
    x_train = get_df_from_datastore_path(training_datastore,
                                         'train/X_train.csv')
    y_train = get_df_from_datastore_path(training_datastore,
                                         'train/y_train.csv')
    y_train = y_train['class']
    x_test = get_df_from_datastore_path(training_datastore,
                                        'valid/X_valid.csv')
    y_test = get_df_from_datastore_path(training_datastore,
                                        'valid/y_valid.csv')
    y_test = y_test['class']
    return x_train, y_train, x_test, y_test
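Both prepare_data variants call a get_df_from_datastore_path helper that is not shown. A minimal sketch of one possible implementation, assuming each path points at a single delimited file in the datastore:

# Hypothetical helper: read one CSV from a datastore path into a DataFrame.
import pandas as pd
from azureml.core import Dataset


def get_df_from_datastore_path(datastore, file_path) -> pd.DataFrame:
    tabular = Dataset.Tabular.from_delimited_files(path=[(datastore, file_path)])
    return tabular.to_pandas_dataframe()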
Example #20
def load_image_files(dimension=(256, 256)):

    subscription_id = os.environ['AZURE_SUBSCRIPTION_ID']
    resource_group = 'ai-lab'
    workspace_name = 'ailabml'
    workspace = Workspace(subscription_id, resource_group, workspace_name)

    # get dataset (online run)
    #run = Run.get_context()
    #dataset = run.input_datasets['Light Bulbs-2019-12-08 00:35:33']

    # get dataset (offline run)
    ds = Dataset.get_by_name(workspace, name='Light Bulbs-2019-12-08 00:35:33')
    df = ds.to_pandas_dataframe()

    # Images
    descr = "Defect Detection Dataset"
    images = []
    flat_data = []
    target = []
    categories = set()
    for i in tqdm(range(df.shape[0])):
        si = df.loc[i].image_url.to_pod()
        if i == 0:
            datastore = Datastore.get(workspace,
                                      si['arguments']['datastoreName'])
        categories.add(df.loc[i].label[0])
        datastore.download(target_path='.',
                           prefix=si['resourceIdentifier'],
                           overwrite=True,
                           show_progress=False)
        img = imread(si['resourceIdentifier'], as_gray=True)
        img_resized = resize(img,
                             dimension,
                             anti_aliasing=True,
                             mode='reflect')
        flat_data.append(img_resized.flatten())
        images.append(img_resized)
        target.append(df.loc[i].label[0])

    categories = list(categories)
    flat_data = np.array(flat_data)
    target = np.array(target)
    images = np.array(images)

    return Bunch(data=flat_data,
                 target=target,
                 target_names=categories,
                 images=images,
                 DESCR=descr)
def register_dataset(aml_workspace: Workspace, dataset_name: str,
                     datastore_name: str, file_path: str) -> Dataset:
    if datastore_name:
        datastore = Datastore.get(aml_workspace, datastore_name)
    else:
        datastore = aml_workspace.get_default_datastore()
    # if the path is same as the latest version, no new version will be registered  # NOQA: E501
    # however, run.input_datasets['name'] = dataset will not log the dataset in the run  # NOQA: E501
    # in this case, the dataset returned from Dataset.get_by_name does get logged  # NOQA: E501
    dataset = Dataset.File.from_files(path=(datastore, file_path))
    dataset = dataset.register(workspace=aml_workspace,
                               name=dataset_name,
                               create_new_version=True)

    return Dataset.get_by_name(aml_workspace, dataset_name)
def create_dataset(ws, name, datastore, data_path):
    '''create the dataset object'''

    # get the datastore
    if datastore:
        datastore = Datastore.get(ws, datastore)
    else:
        datastore = ws.get_default_datastore()

    # define dataset
    dataset = Dataset.File.from_files(path=(datastore, data_path))

    # register the dataset for future use
    dataset = dataset.register(workspace=ws,
                               name=name,
                               create_new_version=True)

    return dataset
def register_dataset(
    aml_workspace: Workspace,
    dataset_name: str,
    datastore_name: str,
    file_path: str = "COVID19Articles.csv",
) -> Dataset:
    if (datastore_name):
        datastore = Datastore.get(aml_workspace, datastore_name)
    else:
        datastore = Datastore.get_default(aml_workspace)
    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path))
    dataset = dataset.register(workspace=aml_workspace,
                               name=dataset_name,
                               create_new_version=True)

    return dataset
def convert_voc_annotation(ws,
                           ds,
                           data_type,
                           container_name,
                           use_difficult_bbox=True):
    classes = ['yellow', 'white', 'blue', 'red', 'hat']

    datastore = Datastore.get(ws, datastore_name=ds)
    voc_dataset_annotations = datastore.blob_service.list_blobs(
        container_name, prefix='VOC/Annotations')
    voc_dataset_images = datastore.blob_service.list_blobs(
        container_name, prefix='VOC/JPEGImages')
    voc_dataset_imagesets = datastore.blob_service.list_blobs(
        container_name, prefix=f'VOC/ImageSets/Main/{data_type}.txt')

    voc_list_annotations = list(voc_dataset_annotations)
    print("Successfully listed annotations")
    voc_list_images = list(voc_dataset_images)
    print("Successfully listed images")
    voc_list_imagesets = list(voc_dataset_imagesets)
    print("Successfully listed imagesets")

    txt = datastore.blob_service.get_blob_to_text(container_name,
                                                  voc_list_imagesets[0].name)
    txt_split = txt.content.splitlines()
    image_inds = [line.strip() for line in txt_split]
    for image_ind in image_inds:
        image_path = datastore.blob_service.make_blob_url(
            container_name, 'VOC/JPEGImages/' + image_ind + '.jpg')
        annotation = image_path
        label_path = datastore.blob_service.get_blob_to_text(
            container_name, 'VOC/Annotations/' + image_ind + '.xml').content
        print(f'XML {image_ind}')
        with open(f"./Test/{image_ind}.xml", 'w') as f:
            root = ET.fromstring(label_path)
            root.set('verified', '')
            root.find('path').text = f'{image_ind}.xml'
            root.find('folder').text = ''
            objects = root.findall('object')
            for obj in objects:
                class_ind = obj.find('name').text.lower().strip()
                if (class_ind in classes):
                    obj.find('name').text = 'helmet'
                elif (class_ind == "person"):
                    obj.find('name').text = 'none'
            f.write(ET.tostring(root, encoding='unicode'))
Example #25
    def _resolve_path(self, item, is_query=False):
        if not is_query and isinstance(item, str):
            return item

        item_prop = self._prop_query if is_query else self._prop_path
        subitem_prop = self._prop_query if is_query else self._prop_relative_path
        datastoreName = self._json_utility.try_get_value(
            item, self._prop_datastore_name, None,
            lambda v: isinstance(v, str) and len(v) > 0,
            'Property "{}.{}" must be specified.'.format(
                item_prop, self._prop_datastore_name))
        subitem = self._json_utility.try_get_value(
            item, subitem_prop, None,
            lambda v: isinstance(v, str) and len(v) > 0,
            'Property "{}.{}" must be specified.'.format(
                item_prop, subitem_prop))
        return (Datastore.get(self._workspace, datastoreName), subitem)
Example #26
    def get_pipeline_data(self, config):
        pipeline_data = []

        for c in config:            
            if c["type"] == StepArgParser.ARG_TYPE_PIPELINE_DATA:
                pconfig = c["config"]
                pname = pconfig["name"]
                pds = pconfig.get("datastore") or "default"

                if pds == "default":
                    use_ds = self.workspace.get_default_datastore()
                else:
                    use_ds = Datastore.get(workspace=self.workspace, datastore_name=pds)

                pd = PipelineData(pname, datastore=use_ds)

                pipeline_data.append(pd)        

        return pipeline_data
Example #27
def get_datastore():
    env = EnvironmentVariables()
    datastore_name = env.datastore_name
    storage_account_name = env.storage_account_name
    storage_container_name = env.storage_container_name
    storage_account_key = env.storage_account_key
    workspace = get_workspace()

    try:
        datastore = Datastore.get(workspace=workspace, datastore_name=datastore_name)
    except HttpOperationError:
        datastore = Datastore.register_azure_blob_container(
            workspace=workspace,
            datastore_name=datastore_name,
            account_name=storage_account_name,
            container_name=storage_container_name,
            account_key=storage_account_key)

    return datastore
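get_datastore reads its settings from an EnvironmentVariables helper that is not shown. A hedged sketch of what such a wrapper might look like, with hypothetical environment variable names:

# Hypothetical EnvironmentVariables wrapper exposing the attributes used above.
import os
from dataclasses import dataclass, field


@dataclass
class EnvironmentVariables:
    datastore_name: str = field(default_factory=lambda: os.environ["DATASTORE_NAME"])
    storage_account_name: str = field(default_factory=lambda: os.environ["STORAGE_ACCOUNT_NAME"])
    storage_container_name: str = field(default_factory=lambda: os.environ["STORAGE_CONTAINER_NAME"])
    storage_account_key: str = field(default_factory=lambda: os.environ["STORAGE_ACCOUNT_KEY"])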
def run(input_path: str, output_path: str, datastore_name: str):
    """Run Function.

    Args:
        input_path (str): path to raw text files in the datastore
        output_path (str): path to the output directory
        datastore_name (str): name of the datastore
    """
    logger.info("PREPARATION")
    logger.info(f"input files path: {input_path}")
    logger.info(f"output directory path: {output_path}")

    Path(output_path).mkdir(parents=True, exist_ok=True)

    # Download input datasets
    run = Run.get_context()
    workspace = run.experiment.workspace

    shared_blob_store = Datastore.get(workspace, datastore_name)
    shared_blob_store.download(target_path=output_path, prefix=input_path)
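A minimal usage sketch for run, with hypothetical paths and datastore name:

# Hypothetical call: download the raw text files for local preparation.
run(input_path="raw_text/2021-01-01",
    output_path="./prepared",
    datastore_name="shared_blob_store")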
def create_sample_data_csv(aml_workspace: Workspace,
                           datastore_name: str,
                           file_name: str = "COVID19Articles.csv",
                           for_scoring: bool = False):

    url = \
        "https://solliancepublicdata.blob.core.windows.net" + \
        "/ai-in-a-day/lab-02/"
    df = pd.read_csv(url + file_name)
    if for_scoring:
        df = df.drop(columns=['cluster'])
    df.to_csv(file_name, index=False)

    if (datastore_name):
        datastore = Datastore.get(aml_workspace, datastore_name)
    else:
        datastore = Datastore.get_default(aml_workspace)
    datastore.upload_files(
        files=[file_name],
        overwrite=True,
        show_progress=False,
    )
    def __init__(self):
        self.__parser = argparse.ArgumentParser("preprocessing")
        self.__parser.add_argument("--datastore",
                                   type=str,
                                   help="Name of the datastore",
                                   default="workspaceblobstore")
        self.__parser.add_argument("--dataset_name",
                                   type=str,
                                   help="Name of the dataset")
        self.__parser.add_argument("--dataset_preprocessed_name",
                                   type=str,
                                   help="Standard preprocessed dataset")
        self.__parser.add_argument("--output_preprocess_dataset",
                                   type=str,
                                   help="Name of the PipelineData reference")
        self.__parser.add_argument(
            "--use_datadrift",
            type=distutils.util.strtobool,
            help="Use data drift (True/False). "
            "If true, we split the original dataset by sex")
        self.__parser.add_argument("--retrain_status",
                                   type=distutils.util.strtobool,
                                   help="Retrain status")

        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun

        if self.__local_run:
            self.__ws = Workspace.from_config('../../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'exploratory_analysis')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

        self.__datastore = Datastore.get(self.__ws,
                                         datastore_name=self.__args.datastore)