def repackage_to_staging(output_path):
  """Repackage this package from its local installed location and copy it to GCS.

  Args:
    output_path: GCS path under which the staging tarball is placed.

  Returns:
    The GCS URL of the uploaded package tarball.
  """
  import google.datalab.ml as ml

  # Find the package root. __file__ is under [package_root]/mltoolbox/image/classification.
  package_root = os.path.join(os.path.dirname(__file__), '../../../')
  # We deploy setup.py in the same dir for repackaging purposes.
  setup_py = os.path.join(os.path.dirname(__file__), 'setup.py')
  staging_package_url = os.path.join(output_path, 'staging',
                                     'image_classification.tar.gz')
  ml.package_and_copy(package_root, setup_py, staging_package_url)
  return staging_package_url
def _package_to_staging(staging_package_url):
  """Repackage this package from local installed location and copy it to GCS.

  Args:
    staging_package_url: GCS path.
  """
  import google.datalab.ml as ml

  # Find the package root. __file__ is under [package_root]/mltoolbox/_structured_data/this_file.
  package_root = os.path.abspath(
      os.path.join(os.path.dirname(__file__), '../../'))
  setup_path = os.path.abspath(
      os.path.join(os.path.dirname(__file__), 'master_setup.py'))
  tar_gz_path = os.path.join(staging_package_url, 'staging', 'trainer.tar.gz')

  print('Building package and uploading to %s' % tar_gz_path)
  ml.package_and_copy(package_root, setup_path, tar_gz_path)

  return tar_gz_path
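# A minimal usage sketch (not from the original source) for the two repackaging
# helpers above: each takes a GCS path and returns the URL of the uploaded
# tarball. The bucket name below is a hypothetical placeholder.
#
#   staging_url = _package_to_staging('gs://my-bucket/model-output')
#   # staging_url == 'gs://my-bucket/model-output/staging/trainer.tar.gz'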
def _train(args, cell):
  env = google.datalab.utils.commands.notebook_environment()
  cell_data = google.datalab.utils.commands.parse_config(cell, env)

  required_keys = ['training_data', 'evaluation_data']
  if args['cloud']:
    required_keys.append('cloud')

  google.datalab.utils.commands.validate_config(cell_data,
                                                required_keys=required_keys,
                                                optional_keys=['model_args'])
  job_args = ['--job-dir', _abs_path(args['output_dir']),
              '--output-dir-from-analysis-step',
              _abs_path(args['output_dir_from_analysis_step'])]

  def _process_train_eval_data(data, arg_name, job_args):
    if isinstance(data, dict):
      if 'csv_file_pattern' in data:
        job_args.extend([arg_name, _abs_path(data['csv_file_pattern'])])
        if '--run-transforms' not in job_args:
          job_args.append('--run-transforms')
      elif 'transformed_file_pattern' in data:
        job_args.extend([arg_name, _abs_path(data['transformed_file_pattern'])])
      else:
        raise ValueError('Invalid training_data dict. '
                         'Requires either "csv_file_pattern" or '
                         '"transformed_file_pattern".')
    elif isinstance(data, google.datalab.ml.CsvDataSet):
      for file_name in data.input_files:
        job_args.append(arg_name + '=' + _abs_path(file_name))
    else:
      raise ValueError('Invalid training data. Requires either a dict, or '
                       'a google.datalab.ml.CsvDataSet')

  _process_train_eval_data(cell_data['training_data'], '--train-data-paths', job_args)
  _process_train_eval_data(cell_data['evaluation_data'], '--eval-data-paths', job_args)

  # TODO(brandondutra): document that any model_args that are file paths must
  # be given as an absolute path.
  if 'model_args' in cell_data:
    for k, v in six.iteritems(cell_data['model_args']):
      job_args.extend(['--' + k, str(v)])

  try:
    tmpdir = None
    if args['package']:
      tmpdir = tempfile.mkdtemp()
      code_path = os.path.join(tmpdir, 'package')
      _archive.extract_archive(args['package'], code_path)
    else:
      code_path = MLTOOLBOX_CODE_PATH

    if args['cloud']:
      cloud_config = cell_data['cloud']
      if not args['output_dir'].startswith('gs://'):
        raise ValueError('Cloud training requires a GCS (starting with "gs://") output_dir.')

      staging_tarball = os.path.join(args['output_dir'], 'staging', 'trainer.tar.gz')
      datalab_ml.package_and_copy(code_path,
                                  os.path.join(code_path, 'setup.py'),
                                  staging_tarball)
      job_request = {
          'package_uris': [staging_tarball],
          'python_module': 'trainer.task',
          'job_dir': args['output_dir'],
          'args': job_args,
      }
      job_request.update(cloud_config)
      job_id = cloud_config.get('job_id', None)
      job = datalab_ml.Job.submit_training(job_request, job_id)
      _show_job_link(job)
    else:
      cmd_args = ['python', '-m', 'trainer.task'] + job_args
      _shell_process.run_and_monitor(cmd_args, os.getpid(), cwd=code_path)
  finally:
    if tmpdir:
      shutil.rmtree(tmpdir)
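# A hedged sketch of the notebook cell config the variant above expects after
# google.datalab.utils.commands.parse_config runs: 'training_data' and
# 'evaluation_data' are required, 'cloud' is required only with --cloud, and
# 'model_args' keys are forwarded verbatim as '--key value' flags. All file
# patterns and cloud settings below are hypothetical placeholders.
#
#   training_data:
#     csv_file_pattern: /path/to/train-*.csv
#   evaluation_data:
#     csv_file_pattern: /path/to/eval-*.csv
#   model_args:
#     max_steps: 2000
#   cloud:
#     region: us-central1
#     scale_tier: STANDARD_1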
def _train(args, cell):
  if args['cloud_config'] and not args['cloud']:
    raise ValueError('"cloud_config" is provided but no "--cloud". '
                     'Do you want local run or cloud run?')

  job_args = ['--job-dir', _abs_path(args['output']),
              '--analysis', _abs_path(args['analysis'])]

  def _process_train_eval_data(data, arg_name, job_args):
    if isinstance(data, dict):
      if 'csv' in data:
        job_args.append(arg_name + '=' + _abs_path(data['csv']))
        if '--transform' not in job_args:
          job_args.append('--transform')
      elif 'transformed' in data:
        job_args.append(arg_name + '=' + _abs_path(data['transformed']))
      else:
        raise ValueError('Invalid training_data dict. '
                         'Requires either "csv" or "transformed".')
    elif isinstance(data, google.datalab.ml.CsvDataSet):
      for file_name in data.input_files:
        job_args.append(arg_name + '=' + _abs_path(file_name))
    else:
      raise ValueError('Invalid training data. Requires either a dict, or '
                       'a google.datalab.ml.CsvDataSet')

  _process_train_eval_data(args['training_data'], '--train', job_args)
  _process_train_eval_data(args['evaluation_data'], '--eval', job_args)

  # TODO(brandondutra): document that any model_args that are file paths must
  # be given as an absolute path.
  if args['model_args']:
    for k, v in six.iteritems(args['model_args']):
      job_args.extend(['--' + k, str(v)])

  try:
    tmpdir = None
    if args['package']:
      tmpdir = tempfile.mkdtemp()
      code_path = os.path.join(tmpdir, 'package')
      _archive.extract_archive(args['package'], code_path)
    else:
      code_path = MLTOOLBOX_CODE_PATH

    if args['cloud']:
      cloud_config = args['cloud_config']
      if not args['output'].startswith('gs://'):
        raise ValueError('Cloud training requires a GCS (starting with "gs://") output.')

      staging_tarball = os.path.join(args['output'], 'staging', 'trainer.tar.gz')
      datalab_ml.package_and_copy(code_path,
                                  os.path.join(code_path, 'setup.py'),
                                  staging_tarball)
      job_request = {
          'package_uris': [staging_tarball],
          'python_module': 'trainer.task',
          'job_dir': args['output'],
          'args': job_args,
      }
      job_request.update(cloud_config)
      job_id = cloud_config.get('job_id', None)
      job = datalab_ml.Job.submit_training(job_request, job_id)
      _show_job_link(job)
    else:
      cmd_args = ['python', '-m', 'trainer.task'] + job_args
      _shell_process.run_and_monitor(cmd_args, os.getpid(), cwd=code_path)
  finally:
    if tmpdir:
      shutil.rmtree(tmpdir)
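# A hedged sketch (not from the original source) of the args dict the
# argument-based variant above consumes for a local run; every path and value
# is a hypothetical placeholder, and model_args keys are simply forwarded as
# '--key value' flags to trainer.task.
#
#   _train({
#       'output': '/tmp/model_out',
#       'analysis': '/tmp/analysis_out',
#       'training_data': {'csv': '/tmp/train-*.csv'},
#       'evaluation_data': {'csv': '/tmp/eval-*.csv'},
#       'model_args': {'max_steps': 2000},
#       'package': None,
#       'cloud': False,
#       'cloud_config': None,
#   }, cell=None)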