def _run_hook(config, hook_name, submission_id):
    """Run the shell command registered for ``hook_name``, if any.

    Parameters
    ----------
    config : dict
        full configuration; hooks are read from the AWS section.
    hook_name : str
        name of the hook to run.
    submission_id : int
        id of the submission the hook relates to.

    Returns
    -------
    int or None
        exit status of the hook command, or None when no hooks section
        exists or no hook is registered under ``hook_name``.
    """
    conf = config[AWS_CONFIG_SECTION]
    hooks = conf.get(HOOKS_SECTION)
    if not hooks:
        return
    if hook_name in hooks:
        submission = get_submission_by_id(config, submission_id)
        submission_folder_name = _get_submission_folder_name(submission_id)
        submission_folder = os.path.join(
            conf[LOCAL_LOG_FOLDER_FIELD],
            submission_folder_name)
        # Expose submission metadata to the hook through its environment.
        env = {
            'RAMP_AWS_SUBMISSION_ID': str(submission_id),
            'RAMP_AWS_SUBMISSION_NAME': submission.name,
            'RAMP_AWS_EVENT': submission.event.name,
            'RAMP_AWS_TEAM': submission.team.name,
            'RAMP_AWS_HOOK': hook_name,
            'RAMP_AWS_SUBMISSION_FOLDER': submission_folder
        }
        # Inherit the current process environment (os.environ wins on
        # any key collision).
        env.update(os.environ)
        cmd = hooks[hook_name]
        # A hook may be declared as a list of commands; chain them.
        if isinstance(cmd, list):
            cmd = ';'.join(cmd)
        logger.info('Running "{}" for hook {}'.format(cmd, hook_name))
        return call(cmd, shell=True, env=env)
def train_on_existing_ec2_instance(config, instance_id, submission_id):
    """Train a submission on a ready ec2 instance.

    The steps followed by this function are the following:
        1) upload the submission code to the instance
        2) launch training in a screen
        3) wait until training is finished
        4) download the predictions
        5) download the log
        6) set the predictions in the database
        7) score the submission
    """
    conf_aws = config[AWS_CONFIG_SECTION]
    # NOTE: the helpers below take the full `config` (they index the
    # AWS section themselves, see launch_train / train_loop), not the
    # AWS sub-section.
    upload_submission(config, instance_id, submission_id)
    launch_train(config, instance_id, submission_id)
    set_submission_state(config, submission_id, 'training')
    _run_hook(config, HOOK_START_TRAINING, submission_id)
    _wait_until_train_finished(config, instance_id, submission_id)
    download_log(config, instance_id, submission_id)
    label = _get_submission_label_by_id(config, submission_id)
    # _training_successful fetches the expected number of folds itself.
    if _training_successful(config, instance_id, submission_id):
        logger.info('Training of "{}" was successful'.format(label))
        if conf_aws.get(MEMORY_PROFILING_FIELD):
            logger.info('Download max ram usage info of "{}"'.format(label))
            download_mprof_data(config, instance_id, submission_id)
            max_ram = _get_submission_max_ram(config, submission_id)
            logger.info('Max ram usage of "{}": {}MB'.format(label, max_ram))
            set_submission_max_ram(config, submission_id, max_ram)
        logger.info('Downloading predictions of : "{}"'.format(label))
        predictions_folder_path = download_predictions(
            config, instance_id, submission_id)
        set_predictions(config, submission_id, predictions_folder_path)
        set_time(config, submission_id, predictions_folder_path)
        set_scores(config, submission_id, predictions_folder_path)
        set_submission_state(config, submission_id, 'tested')
        logger.info('Scoring "{}"'.format(label))
        score_submission(config, submission_id)
        _run_hook(config, HOOK_SUCCESSFUL_TRAINING, submission_id)
    else:
        logger.info('Training of "{}" in "{}" failed'.format(
            label, instance_id))
        set_submission_state(config, submission_id, 'training_error')
        error_msg = _get_traceback(
            _get_log_content(config, submission_id))
        set_submission_error_msg(config, submission_id, error_msg)
        _run_hook(config, HOOK_FAILED_TRAINING, submission_id)
def launch_train(config, instance_id, submission_id):
    """Launch the training of a submission on an ec2 instance.

    A screen named after the submission folder is created, and inside
    that screen `ramp_test_submission` is launched.

    Parameters
    ----------
    instance_id : str
        instance id
    submission_id : int
        submission id
    """
    conf = config[AWS_CONFIG_SECTION]
    ramp_kit_folder = conf[REMOTE_RAMP_KIT_FOLDER_FIELD]
    submission_folder_name = _get_submission_folder_name(submission_id)
    submission = get_submission_by_id(config, submission_id)
    submission_folder = os.path.join(
        ramp_kit_folder, SUBMISSIONS_FOLDER, submission_folder_name)
    values = {
        'ramp_kit_folder': ramp_kit_folder,
        'submission': submission_folder_name,
        'submission_folder': submission_folder,
        'log': os.path.join(submission_folder, 'log'),
    }
    # python -u flushes stdout/stderr, so the log file can be followed
    # live during training instead of only once the process finishes.
    # "$" is escaped because the command string is interpreted locally
    # before being run remotely, which would otherwise substitute it
    # with an empty string.
    run_cmd = (r"python -u \$(which ramp_test_submission) "
               "--submission {submission} --save-y-preds ")
    if conf.get(MEMORY_PROFILING_FIELD):
        run_cmd = ("mprof run --output={submission_folder}/mprof.dat "
                   "--include-children " + run_cmd)
    cmd = "".join([
        "screen -dm -S {submission} sh -c '. ~/.profile;",
        "cd {ramp_kit_folder};",
        "rm -fr {submission_folder}/training_output;",
        "rm -f {submission_folder}/log;",
        "rm -f {submission_folder}/mprof.dat;",
        run_cmd,
        ">{log} 2>&1'",
    ]).format(**values)
    # tag the ec2 instance with info about the submission
    _tag_instance_by_submission(config, instance_id, submission)
    label = _get_submission_label(submission)
    logger.info('Launch training of {}..'.format(label))
    return _run(config, instance_id, cmd)
def _training_successful(config, instance_id, submission_id,
                         actual_nb_folds=None):
    """Return True if a finished submission has been trained successfully.

    If the folder training_output exists and each fold directory
    contains .npz prediction files we consider that the training was
    successful.

    Parameters
    ----------
    actual_nb_folds : int, optional
        expected number of folds. When None (the default) it is fetched
        from the database. Accepting it as an argument keeps callers
        that already know it (e.g. train_on_existing_ec2_instance,
        which passes it as a fourth argument) from raising a TypeError
        and spares them an extra database round-trip.
    """
    folder = _get_remote_training_output_folder(
        config, instance_id, submission_id)

    # Count fold directories and prediction files on the remote host.
    cmd = "ls -l {}|grep fold_|wc -l".format(folder)
    nb_folds = int(_run(config, instance_id, cmd, return_output=True))

    cmd = "find {}|egrep 'fold.*/y_pred_train.npz'|wc -l".format(folder)
    nb_train_files = int(_run(config, instance_id, cmd, return_output=True))

    cmd = "find {}|egrep 'fold.*/y_pred_test.npz'|wc -l".format(folder)
    nb_test_files = int(_run(config, instance_id, cmd, return_output=True))

    if actual_nb_folds is None:
        submission = get_submission_by_id(config, submission_id)
        actual_nb_folds = get_event_nb_folds(config, submission.event.name)
    # Successful iff every fold produced both prediction files.
    return nb_folds == nb_train_files == nb_test_files == actual_nb_folds
def _get_submission_label_by_id(config, submission_id):
    """Return the display label of the submission with id ``submission_id``."""
    return _get_submission_label(get_submission_by_id(config, submission_id))
def _get_submission_path(config, submission_id):
    """Return the local path of the submission with id ``submission_id``."""
    return get_submission_by_id(config, submission_id).path
def train_loop(config, event_name):
    """Start the (infinite) training loop for a given event.

    The loop waits for any submission with the state 'new', then
    creates an ec2 instance to train the submission on it, and
    processes running instances until training finishes.

    Parameters
    ----------

    event_name : str
        event name
    """
    conf = config[AWS_CONFIG_SECTION]
    secs = conf[TRAIN_LOOP_INTERVAL_SECS_FIELD]
    while True:
        # Launch new instances for new submissions
        submissions = get_submissions(config, event_name, 'new')
        for submission_id, _ in submissions:
            submission = get_submission_by_id(config, submission_id)
            if submission.is_sandbox:
                continue
            try:
                # one instance per submission
                instance, = launch_ec2_instances(config, nb=1)
            except botocore.exceptions.ClientError as ex:
                # best-effort: skip this submission and retry on the
                # next loop iteration
                logger.info('Exception when launching a new instance : "{}"'.format(ex))
                logger.info('Skipping...')
                continue
            # Bounded wait for the instance to reach 'running'.
            nb_trials = 0
            while nb_trials < conf.get('new_instance_nb_trials', 20):
                if instance.state.get('name') == 'running':
                    break
                nb_trials += 1
                time.sleep(conf.get('new_instance_check_interval', 6))
            # Tag the instance so later iterations can recognise it.
            _tag_instance_by_submission(config, instance.id, submission)
            _add_or_update_tag(config, instance.id, 'train_loop', '1')
            logger.info('Launched instance "{}" for submission "{}"'.format(
                instance.id, submission))
            set_submission_state(config, submission.id, 'sent_to_training')
        # Score tested submissions
        submissions = get_submissions(config, event_name, 'tested')
        for submission_id, _ in submissions:
            label = _get_submission_label_by_id(config, submission_id)
            logger.info('Scoring submission : {}'.format(label))
            score_submission(config, submission_id)
            _run_hook(config, HOOK_SUCCESSFUL_TRAINING, submission_id)
        # Get running instances and process events
        instance_ids = list_ec2_instance_ids(config)
        for instance_id in instance_ids:
            if not _is_ready(config, instance_id):
                continue
            tags = _get_tags(config, instance_id)
            # Filter instances that were not launched
            # by the training loop API
            if 'submission_id' not in tags:
                continue
            if tags.get('event_name') != event_name:
                continue
            if 'train_loop' not in tags:
                continue
            # Process each instance
            label = tags['Name']
            submission_id = int(tags['submission_id'])
            state = get_submission_state(config, submission_id)
            if state == 'sent_to_training':
                exit_status = upload_submission(
                    config, instance_id, submission_id)
                if exit_status != 0:
                    logger.error(
                        'Cannot upload submission "{}"'
                        ', an error occured'.format(label))
                    continue
                # start training HERE
                exit_status = launch_train(config, instance_id, submission_id)
                if exit_status != 0:
                    logger.error(
                        'Cannot start training of submission "{}"'
                        ', an error occured.'.format(label))
                    continue
                set_submission_state(config, submission_id, 'training')
                _run_hook(config, HOOK_START_TRAINING, submission_id)
            elif state == 'training':
                # in any case (successful training or not)
                # download the log
                download_log(config, instance_id, submission_id)
                if _training_finished(config, instance_id, submission_id):
                    logger.info(
                        'Training of "{}" finished, checking '
                        'if successful or not...'.format(label))
                    if _training_successful(
                            config, instance_id, submission_id):
                        logger.info('Training of "{}" was successful'.format(label))
                        if conf.get(MEMORY_PROFILING_FIELD):
                            logger.info('Download max ram usage info of "{}"'.format(label))
                            download_mprof_data(config, instance_id, submission_id)
                            max_ram = _get_submission_max_ram(config, submission_id)
                            logger.info('Max ram usage of "{}": {}MB'.format(label, max_ram))
                            set_submission_max_ram(config, submission_id, max_ram)
                        logger.info('Downloading the predictions of "{}"'.format(label))
                        path = download_predictions(
                            config, instance_id, submission_id)
                        set_predictions(config, submission_id, path)
                        set_time(config, submission_id, path)
                        set_scores(config, submission_id, path)
                        # scoring itself happens in the 'tested' pass
                        # at the top of the loop
                        set_submission_state(config, submission_id, 'tested')
                    else:
                        logger.info('Training of "{}" failed'.format(label))
                        set_submission_state(
                            config, submission_id, 'training_error')
                        error_msg = _get_traceback(
                            _get_log_content(config, submission_id)
                        )
                        set_submission_error_msg(
                            config, submission_id, error_msg)
                        _run_hook(config, HOOK_FAILED_TRAINING, submission_id)
                    # training finished, so terminate the instance
                    terminate_ec2_instance(config, instance_id)
        time.sleep(secs)