def list(self):
    """Log every dataset in the workspace, marking the currently selected one.

    The dataset named in ``self.ctx.config['dataset']`` (if any) gets an
    ``[x]`` marker; all others get ``[ ]``. A summary count is logged last.

    Returns:
        dict: ``{'datasets': [<name>, ...]}`` — the names of all datasets.
    """
    active = self.ctx.config.get('dataset', None)
    all_datasets = Dataset.get_all(self._get_ws())
    for ds_name in all_datasets.keys():
        marker = 'x' if active == ds_name else ' '
        self.ctx.log(('[%s] ' % marker) + ds_name)
    self.ctx.log('%s DataSet(s) listed' % len(all_datasets))
    return {'datasets': [ds_name for ds_name in all_datasets.keys()]}
def main():
    """Entry point for the AML training step.

    Resolves (or registers) the training dataset, links it to the run,
    trains the model, logs metrics, and saves the model pickle under
    ``outputs/`` so AML run history picks it up.

    Reads CLI args (``--model_name``, ``--step_output``,
    ``--dataset_version``, ``--data_file_path``, ``--dataset_name``)
    and the ``DATASTORE_NAME`` environment variable.
    """
    print("Running train_aml.py")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="COVID19Articles_model_github.pkl",
    )
    parser.add_argument(
        "--step_output",
        type=str,
        help="output for passing data to next step",
    )
    parser.add_argument(
        "--dataset_version",
        type=str,
        help="dataset version",
    )
    parser.add_argument(
        "--data_file_path",
        type=str,
        help=("data file path, if specified, "
              "a new version of the dataset will be registered"),
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help=("Dataset name. Dataset must be passed by name "
              "to always get the desired dataset version "
              "rather than the one used while the pipeline creation"),
    )
    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    datastore_name = os.environ.get("DATASTORE_NAME")
    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()
    workspace = run.experiment.workspace

    # BUG FIX: argparse leaves --data_file_path (and the other args) as
    # None when omitted, but the original code only tested for "".  An
    # omitted argument therefore fell through to
    # register_dataset(..., data_file_path=None).  Treat None and "" the
    # same: "no file path was supplied".
    # Get the dataset
    if dataset_name:
        if not data_file_path:
            if dataset_name in Dataset.get_all(workspace).keys():
                dataset = Dataset.get_by_name(workspace,
                                              dataset_name,
                                              version=dataset_version)
            else:
                create_sample_data_csv(workspace, datastore_name)
                dataset = register_dataset(workspace,
                                           dataset_name,
                                           datastore_name)
        else:
            # A file path was supplied: register a new dataset version.
            dataset = register_dataset(workspace,
                                       dataset_name,
                                       datastore_name,
                                       data_file_path)
    else:
        if not data_file_path:
            data_file_path = "COVID19Articles.csv"
            create_sample_data_csv(workspace, datastore_name)
        dataset_name = "COVID19Articles_Training_githubactions"
        dataset = register_dataset(workspace,
                                   dataset_name,
                                   datastore_name,
                                   data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    class_args = {"max_depth": 5}

    # Train the model
    model = train_model(data, class_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)

    # Files saved in the "outputs" folder are automatically uploaded into
    # run history.  AML creates it on remote compute, but local runs may
    # not have it — create it defensively.
    os.makedirs('outputs', exist_ok=True)
    model_file_name = "COVID19Articles_model.pkl"
    joblib.dump(model, os.path.join('outputs', model_file_name))

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")
    run.complete()