Example #1
def list(self):
    # Mark the currently selected dataset (if any) with an 'x' in the listing
    selected = self.ctx.config.get('dataset')
    datasets = Dataset.get_all(self._get_ws())
    n_datasets = len(datasets)
    for name in datasets.keys():
        self.ctx.log(('[%s] ' % ('x' if selected == name else ' ')) + name)
    self.ctx.log('%s dataset(s) listed' % n_datasets)
    return {'datasets': list(datasets.keys())}
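For context: this method belongs to a CLI-style command class, and the example does not show `self.ctx` or `self._get_ws()`. Below is a minimal sketch of the scaffolding it assumes; the `Context` class, its `config`/`log` members, and the `DatasetCommand` wrapper are hypothetical stand-ins for illustration, not part of the original codebase.

from azureml.core import Dataset, Workspace


class Context:
    """Hypothetical stand-in for the command's runtime context."""

    def __init__(self, config=None):
        self.config = config or {}   # e.g. {'dataset': 'COVID19Articles'}

    def log(self, message):
        print(message)               # the real context may log elsewhere


class DatasetCommand:
    def __init__(self, ctx):
        self.ctx = ctx

    def _get_ws(self):
        # Assumes an Azure ML workspace config file (config.json) is present
        return Workspace.from_config()

    # the list() method from Example #1 would be defined here

With this scaffolding in place, `DatasetCommand(Context()).list()` prints each registered dataset and returns their names.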
Example #2

import argparse
import os

import joblib
from azureml.core import Dataset, Run

# split_data, train_model, get_model_metrics, register_dataset and
# create_sample_data_csv are helper functions defined elsewhere in the
# source repository.


def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="COVID19Articles_model_github.pkl",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    datastore_name = os.environ.get("DATASTORE_NAME")
    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    # Get the dataset
    if dataset_name:
        # argparse leaves unset string arguments as None, so treat None and
        # the empty string the same
        if not data_file_path:
            if (dataset_name
                    in Dataset.get_all(run.experiment.workspace).keys()):
                dataset = Dataset.get_by_name(run.experiment.workspace,
                                              dataset_name,
                                              version=dataset_version)
            else:
                create_sample_data_csv(run.experiment.workspace,
                                       datastore_name)
                dataset = register_dataset(run.experiment.workspace,
                                           dataset_name, datastore_name)
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       datastore_name, data_file_path)
    else:
        if (data_file_path == ""):
            data_file_path = "COVID19Articles.csv"
            create_sample_data_csv(run.experiment.workspace, datastore_name)
        dataset_name = "COVID19Articles_Training_githubactions"
        dataset = register_dataset(run.experiment.workspace, dataset_name,
                                   datastore_name, data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    class_args = {"max_depth": 5}
    # Train the model
    model = train_model(data, class_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)

    # Files saved in the "outputs" folder are automatically uploaded into
    # the run history
    os.makedirs('outputs', exist_ok=True)
    model_file_name = "COVID19Articles_model.pkl"
    joblib.dump(model, os.path.join('outputs', model_file_name))
    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
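Example #2 calls split_data, train_model, and get_model_metrics without showing them. Below is a minimal sketch of what they might look like, given how main() uses them; the label column name ('label'), the decision-tree estimator, and the accuracy metric are illustrative assumptions, not the repository's actual implementation.

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def split_data(df: pd.DataFrame) -> dict:
    # Assumes the label column is named 'label'; the real dataset may differ
    X = df.drop(columns=['label'])
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    return {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}


def train_model(data: dict, class_args: dict) -> DecisionTreeClassifier:
    # class_args (e.g. {"max_depth": 5}) is forwarded to the estimator
    model = DecisionTreeClassifier(**class_args)
    model.fit(data["train"]["X"], data["train"]["y"])
    return model


def get_model_metrics(model, data: dict) -> dict:
    preds = model.predict(data["test"]["X"])
    return {"accuracy": accuracy_score(data["test"]["y"], preds)}

With definitions along these lines, the main() above runs end to end: split_data feeds train_model, and each key/value pair returned by get_model_metrics is logged to the run.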