Code Example #1
import os

from azureml.core import Dataset, Run

def get_data():
    time_column_name = 'dtime'

    target_column_name = os.environ['FORECAST_FILE_PREFIX']
    granularity = os.environ['FORECAST_GRANULARITY']
    horizon = int(os.environ['FORECAST_HORIZON'])

    print('target:{}, granularity:{}, horizon:{}'.format(
        target_column_name, granularity, horizon))

    # Read the data from the registered dataset; equivalent to the local call
    # df = pd.read_csv(csvfile, header=0, index_col=0, parse_dates=True)
    run = Run.get_context()
    workspace = run.experiment.workspace
    dataset = Dataset.get(workspace, target_column_name)
    df = dataset.to_pandas_dataframe()
    df.index = df[time_column_name]
    df.drop(time_column_name, inplace=True, axis=1)

    min_time, max_time = df.index.min(), df.index.max()
    try_split = split_train_test_by_granularity(granularity, horizon, min_time,
                                                max_time)
    if try_split is None:
        raise Exception('can not train data', min_time, max_time)
    else:
        (delta, frequency, training_slice_begin, training_slice_end,
         test_slice_begin, test_slice_end) = try_split
        print('train between %s and %s, forecast between %s and %s' %
              (training_slice_begin, training_slice_end,
               test_slice_begin, test_slice_end))

    df = df.loc[training_slice_begin:test_slice_end]
    X_train, X_test, y_train, y_test = compute_train_test_for_automl(
        df, training_slice_begin, training_slice_end,
        test_slice_begin, test_slice_end, target_column_name, time_column_name)

    return X_train, y_train
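
The helpers split_train_test_by_granularity and compute_train_test_for_automl are not shown in this example. To make the 6-tuple contract above concrete, here is a hypothetical, simplified version of the split helper that holds out the last `horizon` periods for testing; the granularity-to-frequency mapping and the boundary handling are assumptions, not the original implementation.

import pandas as pd

def split_train_test_by_granularity(granularity, horizon, min_time, max_time):
    # Hypothetical simplification: map the granularity label to a pandas
    # frequency string and hold out the last `horizon` periods for testing.
    frequency = {'hourly': 'H', 'daily': 'D', 'weekly': 'W'}.get(granularity)
    if frequency is None:
        return None
    delta = pd.tseries.frequencies.to_offset(frequency) * horizon
    test_slice_begin = max_time - delta
    if test_slice_begin <= min_time:
        return None  # not enough history left to train on
    return (delta, frequency,
            min_time, test_slice_begin,      # training slice
            test_slice_begin, max_time)      # test slice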
Code Example #2
    def __init__(self,
                 workspace,
                 snapshot_name,
                 dataset_id,
                 definition_version=None,
                 time_stamp=None,
                 profile_action_id=None,
                 datastore_name=None,
                 relative_path=None,
                 dataset_name=None):
        """Dataset snapshot is a combination of Profile and an optional materialized copy of the data.

        To learn more about Dataset Snapshots, go to https://aka.ms/azureml/howto/createsnapshots

        :param workspace: The workspace the Dataset is registered in.
        :type workspace: azureml.core.Workspace
        :param snapshot_name: The name of the Dataset snapshot.
        :type snapshot_name: str
        :param dataset_id: The identifier of the Dataset.
        :type dataset_id: str
        :param definition_version: The definition version of the Dataset.
        :type definition_version: str
        :param time_stamp: The snapshot creation time.
        :type time_stamp: datetime
        :param profile_action_id: The snapshot profile action ID.
        :type profile_action_id: str
        :param datastore_name: The snapshot data store name.
        :type datastore_name: str
        :param relative_path: The relative path to the snapshot data.
        :type relative_path: str
        :param dataset_name: The name of the Dataset.
        :type dataset_name: str
        """
        from azureml.core import Dataset

        self._workspace = workspace
        self._name = snapshot_name
        self._dataset_id = dataset_id
        self._definition_version = definition_version
        self._time_stamp = time_stamp
        self._profile_action_id = profile_action_id
        self._datastore_name = datastore_name
        self._relative_path = relative_path

        # This is a hack, we should either return the dataset name in the DTO or remove the _dataset_name field.
        dataset = Dataset.get(workspace, id=dataset_id)
        self._dataset_name = dataset_name or dataset.name
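
This constructor is normally not called directly. A short usage sketch, assuming the snapshot helpers on the legacy Dataset class described in the linked how-to (create_snapshot / get_snapshot); the workspace, dataset and snapshot names below are placeholders:

from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
dataset = Dataset.get(ws, name='clean_Titanic_tutorial')

# Assumed API from the how-to above: create a snapshot (profile plus an
# optional materialized copy of the data) and retrieve it again by name.
snapshot = dataset.create_snapshot(snapshot_name='titanic-snapshot-1')
same_snapshot = dataset.get_snapshot(snapshot_name='titanic-snapshot-1')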
Code Example #3
import os
import datetime
import shutil

from azureml.core import Workspace, Datastore, Dataset, Experiment, Run
from sklearn.model_selection import train_test_split
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from sklearn.tree import DecisionTreeClassifier

run = Run.get_context()
workspace = run.experiment.workspace

dataset_name = 'clean_Titanic_tutorial'

dataset = Dataset.get(workspace=workspace, name=dataset_name)
df = dataset.to_pandas_dataframe()

x_col = ['Pclass', 'Sex', 'SibSp', 'Parch']
y_col = ['Survived']
x_df = df.loc[:, x_col]
y_df = df.loc[:, y_col]

x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223)

data = {"train": {"X": x_train, "y": y_train},
        "test": {"X": x_test, "y": y_test}}

clf = DecisionTreeClassifier().fit(data["train"]["X"], data["train"]["y"])
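
The held-out split in data["test"] is not used above; a natural follow-up, sketched here, is to score the tree on it and log the metric to the submitted run via run.log:

accuracy = clf.score(data["test"]["X"], data["test"]["y"])
run.log('accuracy', accuracy)
print('test accuracy: {:.3f}'.format(accuracy))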
Code Example #4
# Assumes ws, datastore_name, datastore_folder, compute_target (a name string),
# file_prefix, granularity and horizon were defined earlier in the notebook.
import azureml.core
from azureml.core import Dataset, Datastore
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

ds = Datastore.get(ws, datastore_name=datastore_name)
compute_target = ws.compute_targets[compute_target]
experiment_name = 'forecast_automl_' + file_prefix + '_' + granularity

# environment for get_data.py
time_column_name = 'dtime'
script_folder = './'  # where is get_data.py relative to current folder
script_env = {
    'FORECAST_FILE_PREFIX': file_prefix,
    'FORECAST_GRANULARITY': granularity,
    'FORECAST_HORIZON': horizon
}

# register dataset so get_data can access it
try:
    dataset = Dataset.get(ws, file_prefix)
    print('using existing dataset:{0}'.format(file_prefix))
except Exception:
    data_file = datastore_folder + file_prefix + '_' + granularity + '.csv'
    dataset = Dataset.from_delimited_files(ds.path(data_file))
    dataset = dataset.register(ws, file_prefix)
    print('registered dataset:{0}'.format(file_prefix))

# Setup run configuration
run_config = RunConfiguration(framework="python")
run_config.target = compute_target
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
run_config.environment.environment_variables = script_env
dependencies = CondaDependencies.create(
    pip_packages=["scikit-learn", "scipy", "numpy"])
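
The CondaDependencies object is created but never attached to run_config in this excerpt. A minimal continuation, assuming the configuration is then used to submit a script run (in the original flow it would more likely feed an AutoML forecasting configuration); 'train.py' is a placeholder entry-script name:

from azureml.core import Experiment, ScriptRunConfig

# Make scikit-learn, scipy and numpy available on the remote target.
run_config.environment.python.conda_dependencies = dependencies

src = ScriptRunConfig(source_directory=script_folder,
                      script='train.py',       # placeholder entry script
                      run_config=run_config)
remote_run = Experiment(ws, experiment_name).submit(src)
remote_run.wait_for_completion(show_output=True)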
Code Example #5
from azureml.core import Workspace, Datastore, Dataset
from azureml.core.experiment import Experiment
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep
ws = Workspace.from_config(path="./file-path/ws_config.json")
experiment = Experiment(workspace=ws, name='BrainStar')
def_blob_store = Datastore(ws, "workspaceblobstore")
compute_target = ws.compute_targets["BrainStar1"]
input_data = Dataset.get(ws, name="Absence data")
output_data1 = PipelineData("output_data1",
                            datastore=def_blob_store,
                            output_name="output_data1")
source_directory = './process'
step1 = PythonScriptStep(name="process_step",
                         script_name="process.py",
                         inputs=[input_data],
                         outputs=[output_data1],
                         compute_target=compute_target,
                         source_directory=source_directory,
                         allow_reuse=True)

steps = [step1]
pipeline1 = Pipeline(workspace=ws, steps=steps)
pipeline1.validate()
pipeline_run1 = Experiment(ws, 'Hello_World1').submit(pipeline1,
                                                      regenerate_outputs=False)
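
output_data1 is produced by process_step but consumed nowhere in this snippet. A typical follow-up chains it into a second step and waits for the pipeline run; 'aggregate.py' is a hypothetical consumer script:

step2 = PythonScriptStep(name="aggregate_step",
                         script_name="aggregate.py",   # hypothetical script
                         inputs=[output_data1],
                         compute_target=compute_target,
                         source_directory=source_directory,
                         allow_reuse=True)

pipeline2 = Pipeline(workspace=ws, steps=[step1, step2])
pipeline2.validate()
pipeline_run2 = experiment.submit(pipeline2, regenerate_outputs=False)
pipeline_run2.wait_for_completion(show_output=True)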