def test_terminate_trainingjob(kfp_client, experiment_id, region, sagemaker_client):
    test_file_dir = "resources/config/simple-mnist-training"
    download_dir = utils.mkdir(
        os.path.join(test_file_dir + "/generated_test_terminate"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))
    input_job_name = test_params["Arguments"]["job_name"] = (
        utils.generate_random_string(4) + "-terminate-job")

    run_id, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        60,
        "running",
    )

    print(f"Terminating run: {run_id} where Training job_name: {input_job_name}")
    kfp_client_utils.terminate_run(kfp_client, run_id)

    response = sagemaker_utils.describe_training_job(sagemaker_client,
                                                     input_job_name)
    assert response["TrainingJobStatus"] in ["Stopping", "Stopped"]

    utils.remove_dir(download_dir)
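# All of these snippets call small project-local filesystem helpers such as
# utils.mkdir and utils.remove_dir instead of the standard library directly.
# A minimal sketch of what such helpers might look like -- an assumption for
# illustration, not the actual utils module from these repos -- with both
# made idempotent so repeated cleanup calls are safe:
import os
import shutil


def mkdir(directory):
    """Create a directory (and any missing parents); return its path."""
    os.makedirs(directory, exist_ok=True)
    return directory


def remove_dir(directory):
    """Delete a directory tree, ignoring the case where it does not exist."""
    shutil.rmtree(directory, ignore_errors=True)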
def test_workteamjob(
    kfp_client, experiment_id, region, sagemaker_client, test_file_dir
):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    workteam_name, workflow_json = create_workteamjob(
        kfp_client, experiment_id, region, sagemaker_client, test_file_dir, download_dir
    )

    outputs = {"sagemaker-private-workforce": ["workteam_arn"]}

    try:
        output_files = minio_utils.artifact_download_iterator(
            workflow_json, outputs, download_dir
        )

        response = sagemaker_utils.describe_workteam(sagemaker_client, workteam_name)

        # Verify WorkTeam was created in SageMaker
        assert response["Workteam"]["CreateDate"] is not None
        assert response["Workteam"]["WorkteamName"] == workteam_name

        # Verify WorkTeam arn artifact was created in Minio and matches the one in SageMaker
        workteam_arn = utils.read_from_file_in_tar(
            output_files["sagemaker-private-workforce"]["workteam_arn"]
        )
        assert response["Workteam"]["WorkteamArn"] == workteam_arn
    finally:
        # Cleanup the SageMaker Resources
        sagemaker_utils.delete_workteam(sagemaker_client, workteam_name)

    # Delete generated files only if the test is successful
    utils.remove_dir(download_dir)
def test_transform_job(
    kfp_client,
    experiment_id,
    s3_client,
    sagemaker_client,
    s3_data_bucket,
    test_file_dir,
):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))

    # Generate a random prefix for the model and job names to avoid errors if
    # resources with the same names already exist
    test_params["Arguments"]["model_name"] = test_params["Arguments"][
        "job_name"] = input_job_name = (utils.generate_random_string(5) + "-" +
                                        test_params["Arguments"]["model_name"])
    print(f"running test with model/job name: {input_job_name}")

    # Generate a unique location for the output, since the output filename is
    # generated according to the content_type
    test_params["Arguments"]["output_location"] = os.path.join(
        test_params["Arguments"]["output_location"], input_job_name)

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    outputs = {"sagemaker-batch-transformation": ["output_location"]}
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir)

    # Verify the job was successful on SageMaker
    response = sagemaker_utils.describe_transform_job(sagemaker_client,
                                                      input_job_name)
    assert response["TransformJobStatus"] == "Completed"
    assert response["TransformJobName"] == input_job_name

    # Verify the output location from the pipeline matches the job output and
    # that the transformed file exists
    output_location = utils.read_from_file_in_tar(
        output_files["sagemaker-batch-transformation"]["output_location"])
    print(f"output location: {output_location}")
    assert output_location == response["TransformOutput"]["S3OutputPath"]

    # Get the relative path of the file in the S3 bucket.
    # The URI follows the format s3://<bucket_name>/relative/path/to/file;
    # the split below extracts the part after the bucket name.
    file_key = os.path.join("/".join(output_location.split("/")[3:]),
                            test_params["ExpectedOutputFile"])
    assert s3_utils.check_object_exists(s3_client, s3_data_bucket, file_key)

    utils.remove_dir(download_dir)
def remove_temp(self):
    try:
        utils.remove_dir(self.temp_dir)
        del self.temp_basename
        del self.temp_path
        del self.temp_dir
    except AttributeError:
        log.warning('%s file has no temporary version' % self.basename)
def test_create_endpoint(kfp_client, experiment_id, boto3_session,
                         sagemaker_client, test_file_dir):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))

    # Generate a random prefix for the model, endpoint config and endpoint
    # names to avoid errors if resources with the same names already exist
    test_params["Arguments"]["model_name"] = test_params["Arguments"][
        "endpoint_config_name"] = test_params["Arguments"][
            "endpoint_name"] = input_endpoint_name = (
                utils.generate_random_string(5) + "-" +
                test_params["Arguments"]["model_name"])
    print(f"running test with model/endpoint name: {input_endpoint_name}")

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    try:
        outputs = {"sagemaker-deploy-model": ["endpoint_name"]}
        output_files = minio_utils.artifact_download_iterator(
            workflow_json, outputs, download_dir)

        output_endpoint_name = utils.read_from_file_in_tar(
            output_files["sagemaker-deploy-model"]["endpoint_name"],
            "endpoint_name.txt")
        print(f"endpoint name: {output_endpoint_name}")

        # Verify the output from the pipeline is the endpoint name
        assert output_endpoint_name == input_endpoint_name

        # Verify the endpoint is running
        assert (sagemaker_utils.describe_endpoint(
            sagemaker_client,
            input_endpoint_name)["EndpointStatus"] == "InService")

        # Validate the model for use by running a prediction
        result = run_predict_mnist(boto3_session, input_endpoint_name,
                                   download_dir)
        print(f"prediction result: {result}")
        assert json.dumps(result, sort_keys=True) == json.dumps(
            test_params["ExpectedPrediction"], sort_keys=True)

        utils.remove_dir(download_dir)
    finally:
        # Delete the endpoint
        sagemaker_utils.delete_endpoint(sagemaker_client, input_endpoint_name)
def main():
    remove_dir("java_template")
    make_dirs("java_template")
    generate_template("original/java/train.4186.diff",
                      "java_template/train.4186.diff.new", "train")
    generate_template("original/java/test.436.diff",
                      "java_template/test.436.diff.new", "test")
    generate_template("original/java/valid.453.diff",
                      "java_template/valid.453.diff.new", "valid")
def test_trainingjob(kfp_client, experiment_id, region, sagemaker_client,
                     test_file_dir):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    outputs = {
        "sagemaker-training-job":
        ["job_name", "model_artifact_url", "training_image"]
    }
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir)

    # Verify Training job was successful on SageMaker
    training_job_name = utils.read_from_file_in_tar(
        output_files["sagemaker-training-job"]["job_name"])
    print(f"training job name: {training_job_name}")
    train_response = sagemaker_utils.describe_training_job(
        sagemaker_client, training_job_name)
    assert train_response["TrainingJobStatus"] == "Completed"

    # Verify model artifacts output was generated from this run
    model_artifact_url = utils.read_from_file_in_tar(
        output_files["sagemaker-training-job"]["model_artifact_url"])
    print(f"model_artifact_url: {model_artifact_url}")
    assert model_artifact_url == train_response["ModelArtifacts"][
        "S3ModelArtifacts"]
    assert training_job_name in model_artifact_url

    # Verify training image output is an ECR image
    training_image = utils.read_from_file_in_tar(
        output_files["sagemaker-training-job"]["training_image"])
    print(f"Training image used: {training_image}")
    if "ExpectedTrainingImage" in test_params.keys():
        assert test_params["ExpectedTrainingImage"] == training_image
    else:
        assert f"dkr.ecr.{region}.amazonaws.com" in training_image

    assert not argo_utils.error_in_cw_logs(
        workflow_json["metadata"]["name"]
    ), "Found the CloudWatch error message in the log output. Check SageMaker to see if the job has failed."

    utils.remove_dir(download_dir)
def test_workteamjob(
    kfp_client, experiment_id, region, sagemaker_client, test_file_dir
):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # Generate a random prefix for team_name to avoid errors if a workteam
    # with the same name already exists
    test_params["Arguments"]["team_name"] = workteam_name = (
        utils.generate_random_string(5) + "-" + test_params["Arguments"]["team_name"]
    )

    try:
        workflow_json = create_workteamjob(
            kfp_client,
            test_params,
            experiment_id,
            region,
            sagemaker_client,
            download_dir,
        )

        outputs = {"sagemaker-private-workforce": ["workteam_arn"]}
        output_files = minio_utils.artifact_download_iterator(
            workflow_json, outputs, download_dir
        )

        response = sagemaker_utils.describe_workteam(sagemaker_client, workteam_name)

        # Verify WorkTeam was created in SageMaker
        assert response["Workteam"]["CreateDate"] is not None
        assert response["Workteam"]["WorkteamName"] == workteam_name

        # Verify WorkTeam arn artifact was created in Minio and matches the one in SageMaker
        workteam_arn = utils.read_from_file_in_tar(
            output_files["sagemaker-private-workforce"]["workteam_arn"]
        )
        assert response["Workteam"]["WorkteamArn"] == workteam_arn
    finally:
        workteams = sagemaker_utils.list_workteams(sagemaker_client)["Workteams"]
        workteam_names = list(map((lambda x: x["WorkteamName"]), workteams))
        # Only delete the workteam if it was successfully created
        if workteam_name in workteam_names:
            sagemaker_utils.delete_workteam(sagemaker_client, workteam_name)

    # Delete generated files only if the test is successful
    utils.remove_dir(download_dir)
def create_dataset(file_type, folder, train_diffs, train_msgs, test_diffs,
                   test_msgs, valid_diffs, valid_msgs):
    (train_diffs, train_msgs, train_cnt, vocab_diffs,
     vocab_msgs) = get_dataset(file_type, train_diffs, train_msgs)
    test_diffs, test_msgs, test_cnt, _, _ = get_dataset(
        file_type, test_diffs, test_msgs)
    valid_diffs, valid_msgs, valid_cnt, _, _ = get_dataset(
        file_type, valid_diffs, valid_msgs)

    remove_dir(folder)
    make_dirs(folder)
    save_dataset(folder, "train." + str(train_cnt), train_diffs, train_msgs)
    save_dataset(folder, "test." + str(test_cnt), test_diffs, test_msgs)
    save_dataset(folder, "valid." + str(valid_cnt), valid_diffs, valid_msgs)
    save_vocab(folder, vocab_diffs, vocab_msgs)
def train_batch(symbols_file, data_path, export_dir):
    '''Prep data and train a model for each symbol.'''
    # read the symbols file (whitespace-separated ticker symbols)
    symbols = []
    with open(format_path(symbols_file), 'r') as data:
        read_data = data.read()
        symbols = str(read_data).split()

    for symbol in symbols:
        print('training neural network model for ' + symbol)
        train_data = pd.read_csv(format_path(data_path + '/train/' + symbol +
                                             '.csv'),
                                 index_col='date')
        test_data = pd.read_csv(format_path(data_path + '/test/' + symbol +
                                            '.csv'),
                                index_col='date')
        model_dir = format_path(export_dir + '/' + symbol)
        # remove any stale exported model before retraining
        remove_dir(model_dir)
        train(train_data, test_data, format_path(model_dir))
        print('training finished for ' + symbol)
def _create_deployment(repo_url, branch):
    # create a temp folder
    prefix = '{0}_'.format(time.strftime("%Y%m%d"))
    temp_build_folder = tempfile.mkdtemp(prefix=prefix)
    try:
        # clone the git repo
        logger.info('Cloning repo..')
        clone_folder = os.path.join(temp_build_folder, 'repo')
        repo_path = utils.clone_repo(repo_url,
                                     destination=clone_folder,
                                     branch=branch)
        faaspot_folder = os.path.join(repo_path, FAASPOT_FOLDER)
        faaspot_config = os.path.join(faaspot_folder, 'faaspot.yml')
        logger.debug('Repo cloned to: {0}'.format(clone_folder))

        # prepare the deployment folder
        logger.debug('Creating deployment folder..')
        deployment_folder = os.path.join(temp_build_folder, 'deploy')
        utils.makedir(deployment_folder)
        logger.debug('Deployment folder created: {0}'.format(deployment_folder))

        # copy modules from the faaspot folder to the deployment folder
        logger.info('Copying config files into deployment folder..')
        utils.copy_files(faaspot_folder, deployment_folder)

        # build the package into the deployment folder
        logger.info('Installing dependencies..')
        utils.install_libraries(repo_path, deployment_folder)

        # create a zip from the deployment folder
        logger.info('Packaging it..')
        deployment_zip = os.path.join(temp_build_folder, 'deploy.zip')
        utils.zip_dir(deployment_folder, deployment_zip)
        logger.info('Zip file created: {0}'.format(deployment_zip))

        yield Deployment(repo_path, faaspot_config, deployment_zip)
    finally:
        utils.remove_dir(temp_build_folder)
def test_createmodel(kfp_client, experiment_id, sagemaker_client,
                     test_file_dir):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))

    # Generate a random prefix for the model name to avoid errors if a model
    # with the same name already exists
    test_params["Arguments"]["model_name"] = input_model_name = (
        utils.generate_random_string(5) + "-" +
        test_params["Arguments"]["model_name"])
    print(f"running test with model_name: {input_model_name}")

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    outputs = {"sagemaker-create-model": ["model_name"]}
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir)

    output_model_name = utils.read_from_file_in_tar(
        output_files["sagemaker-create-model"]["model_name"])
    print(f"model_name: {output_model_name}")
    assert output_model_name == input_model_name

    assert (sagemaker_utils.describe_model(sagemaker_client, input_model_name)
            is not None)

    utils.remove_dir(download_dir)
def __download_submission(self, subm, path):
    user_id = subm['class'][0][4:]
    username = subm.find(class_='cell c2').a.contents[0]
    timestamp = self.__parse_timestamp(
        subm.find(class_='cell c7').contents[0])

    subm_path = os.path.join(path, 'user_' + user_id)
    subm_data = Submission(user_id, username, timestamp, subm_path)

    utils.remove_dir(subm_path)
    utils.make_dir(subm_path)

    for f in subm.find_all(class_='fileuploadsubmission'):
        name = f.a.contents[0]
        link = f.a['href']
        if not self.__download_file(link, os.path.join(subm_path, name)):
            self.__logger.warning('Can not download file `{}\', ' \
                'skip submission [user_id={}, username=`{}\', timestamp={}]'.format(
                    name, subm_data.user_id, subm_data.username,
                    subm_data.timestamp))
            return None
        else:
            self.__logger.info('Got file `{}\' ' \
                '[user_id={}, username=`{}\', timestamp={}]'.format(
                    name, subm_data.user_id, subm_data.username,
                    subm_data.timestamp))

    return subm_data
def clear_all():
    result = {"success": True, "message": "successfully cleared!"}
    try:
        utils.remove_dir(config.OUT_EVALS_DIR)
        utils.remove_dir(config.OUT_PLOT_DIR)
        utils.remove_dir(config.OUT_IOU_DIR)
        # call corresponding function in calc
        # result = getattr(calculate, section)(global_data)
        # utils.printFlaskMsg(str(result["message"]['norm']))
    except Exception:
        utils.printFlaskMsg("Unexpected error:")
        errorMsg = traceback.format_exc()
        utils.printFlaskMsg(errorMsg)
        return jsonify({"success": False, "message": errorMsg})
        # raise
    result["trace"] = getTraceBack()
    return jsonify(result)
track_selection = project_config["tracks"]
chataigne_base_path = project_config["chataigneProjectPath"]
start_time = project_config["startTime"]
all_configs.update({ltp_path: project_config})

time_shift = -start_time
tracks = create_tracks(ltp, time_shift)
tracks = filter_by_name(tracks, track_selection)
tracks = add_dmx_channels(tracks, track_selection)

if len(tracks) > 0:
    converter = ChataigneProject(tracks, chataigne_base_path)
    exported_json = converter.generate_projects_json()
    path = f"{output_path}/{project_name}.noisette"
    with open(path, 'w') as f:
        f.write(exported_json)
    print("Exported to " + path)

# save cache
save_cache(ltp_path, all_configs)

# cleanup
remove_dir(tmp_path)
print("All done.")
def test_processingjob(kfp_client, experiment_id, region, sagemaker_client,
                       test_file_dir):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))

    # Generate a random prefix for the job name to avoid errors if a job with
    # the same name already exists
    test_params["Arguments"]["job_name"] = input_job_name = (
        utils.generate_random_string(5) + "-" +
        test_params["Arguments"]["job_name"])
    print(f"running test with job_name: {input_job_name}")

    for index, output in enumerate(test_params["Arguments"]["output_config"]):
        if "S3Output" in output:
            test_params["Arguments"]["output_config"][index]["S3Output"][
                "S3Uri"] = os.path.join(output["S3Output"]["S3Uri"],
                                        input_job_name)

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    outputs = {"sagemaker-processing-job": ["job_name", "output_artifacts"]}
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir)

    # Verify the processing job was successful on SageMaker
    processing_job_name = utils.read_from_file_in_tar(
        output_files["sagemaker-processing-job"]["job_name"])
    print(f"processing job name: {processing_job_name}")
    process_response = sagemaker_utils.describe_processing_job(
        sagemaker_client, processing_job_name)
    assert process_response["ProcessingJobStatus"] == "Completed"
    assert process_response["ProcessingJobArn"].split("/")[1] == input_job_name

    # Verify the processing job produced the correct outputs
    processing_outputs = json.loads(
        utils.read_from_file_in_tar(
            output_files["sagemaker-processing-job"]["output_artifacts"]))
    print(f"processing job outputs: {json.dumps(processing_outputs, indent=2)}")
    assert processing_outputs is not None

    for output in process_response["ProcessingOutputConfig"]["Outputs"]:
        assert processing_outputs[
            output["OutputName"]] == output["S3Output"]["S3Uri"]

    assert not argo_utils.error_in_cw_logs(
        workflow_json["metadata"]["name"]
    ), "Found the CloudWatch error message in the log output. Check SageMaker to see if the job has failed."

    utils.remove_dir(download_dir)
def test_hyperparameter_tuning(kfp_client, experiment_id, region,
                               sagemaker_client, test_file_dir):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))

    test_params["Arguments"]["channels"] = json.dumps(
        test_params["Arguments"]["channels"])
    test_params["Arguments"]["static_parameters"] = json.dumps(
        test_params["Arguments"]["static_parameters"])
    test_params["Arguments"]["integer_parameters"] = json.dumps(
        test_params["Arguments"]["integer_parameters"])
    test_params["Arguments"]["categorical_parameters"] = json.dumps(
        test_params["Arguments"]["categorical_parameters"])

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    outputs = {
        "sagemaker-hyperparameter-tuning": [
            "best_hyperparameters",
            "best_job_name",
            "hpo_job_name",
            "model_artifact_url",
            "training_image",
        ]
    }
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir)

    # Verify HPO job was successful on SageMaker
    hpo_job_name = utils.read_from_file_in_tar(
        output_files["sagemaker-hyperparameter-tuning"]["hpo_job_name"],
        "hpo_job_name.txt",
    )
    print(f"HPO job name: {hpo_job_name}")
    hpo_response = sagemaker_utils.describe_hpo_job(sagemaker_client,
                                                    hpo_job_name)
    assert hpo_response["HyperParameterTuningJobStatus"] == "Completed"

    # Verify training image output is an ECR image
    training_image = utils.read_from_file_in_tar(
        output_files["sagemaker-hyperparameter-tuning"]["training_image"],
        "training_image.txt",
    )
    print(f"Training image used: {training_image}")
    if "ExpectedTrainingImage" in test_params.keys():
        assert test_params["ExpectedTrainingImage"] == training_image
    else:
        assert f"dkr.ecr.{region}.amazonaws.com" in training_image

    # Verify Training job was part of HPO job, returned as best and was successful
    best_training_job_name = utils.read_from_file_in_tar(
        output_files["sagemaker-hyperparameter-tuning"]["best_job_name"],
        "best_job_name.txt",
    )
    print(f"best training job name: {best_training_job_name}")
    train_response = sagemaker_utils.describe_training_job(
        sagemaker_client, best_training_job_name)
    assert train_response["TuningJobArn"] == hpo_response[
        "HyperParameterTuningJobArn"]
    assert (train_response["TrainingJobName"] ==
            hpo_response["BestTrainingJob"]["TrainingJobName"])
    assert train_response["TrainingJobStatus"] == "Completed"

    # Verify model artifacts output was generated from this run
    model_artifact_url = utils.read_from_file_in_tar(
        output_files["sagemaker-hyperparameter-tuning"]["model_artifact_url"],
        "model_artifact_url.txt",
    )
    print(f"model_artifact_url: {model_artifact_url}")
    assert model_artifact_url == train_response["ModelArtifacts"][
        "S3ModelArtifacts"]
    assert best_training_job_name in model_artifact_url

    # Verify hyper_parameters output is not empty
    hyper_parameters = json.loads(
        utils.read_from_file_in_tar(
            output_files["sagemaker-hyperparameter-tuning"]
            ["best_hyperparameters"],
            "best_hyperparameters.txt",
        ))
    print(f"HPO best hyperparameters: {json.dumps(hyper_parameters, indent=2)}")
    assert hyper_parameters is not None

    utils.remove_dir(download_dir)
def _cleanup_temp_files():
    utils.remove_dir(args.BACKUP_GIT_WORKING_DIR)
    utils.remove_dir(args.SECRET_GIT_WORKING_DIR)
    utils.remove_file(args.temp_ssh_file)
    utils.remove_file(args.temp_cert_file)
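# The cleanup above also relies on a utils.remove_file counterpart. A plausible
# sketch, assuming it should be idempotent like remove_dir so cleanup can run
# unconditionally (an illustration, not the actual helper from this repo):
import contextlib
import os


def remove_file(path):
    """Delete a file, ignoring the case where it does not exist."""
    with contextlib.suppress(FileNotFoundError):
        os.remove(path)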
def test_create_endpoint(kfp_client, experiment_id, boto3_session,
                         sagemaker_client, test_file_dir):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))

    # Generate a random prefix for the model, endpoint config and endpoint
    # names to avoid errors if resources with the same names already exist
    test_params["Arguments"]["model_name"] = test_params["Arguments"][
        "endpoint_config_name"] = test_params["Arguments"][
            "endpoint_name"] = input_endpoint_name = (
                utils.generate_random_string(5) + "-" +
                test_params["Arguments"]["model_name"])

    try:
        print(f"running test with model/endpoint name: {input_endpoint_name}")

        _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
            kfp_client,
            experiment_id,
            test_params["PipelineDefinition"],
            test_params["Arguments"],
            download_dir,
            test_params["TestName"],
            test_params["Timeout"],
        )

        outputs = {"sagemaker-deploy-model": ["endpoint_name"]}
        output_files = minio_utils.artifact_download_iterator(
            workflow_json, outputs, download_dir)

        output_endpoint_name = utils.read_from_file_in_tar(
            output_files["sagemaker-deploy-model"]["endpoint_name"])
        print(f"endpoint name: {output_endpoint_name}")

        # Verify the output from the pipeline is the endpoint name
        assert output_endpoint_name == input_endpoint_name

        # Verify the endpoint is running
        assert (sagemaker_utils.describe_endpoint(
            sagemaker_client,
            input_endpoint_name)["EndpointStatus"] == "InService")

        # Verify that the update was successful by checking that the
        # InstanceType changed
        if "ExpectedInstanceType" in test_params.keys():
            new_endpoint_config_name = sagemaker_utils.describe_endpoint(
                sagemaker_client, input_endpoint_name)["EndpointConfigName"]
            response = sagemaker_utils.describe_endpoint_config(
                sagemaker_client, new_endpoint_config_name)
            prod_variant = response["ProductionVariants"][0]
            print(f"Production Variant item: {prod_variant}")
            instance_type = prod_variant["InstanceType"]
            print(f"Production Variant item InstanceType: {instance_type}")
            assert instance_type == test_params["ExpectedInstanceType"]

        # Validate the model for use by running a prediction
        result = run_predict_mnist(boto3_session, input_endpoint_name,
                                   download_dir)
        print(f"prediction result: {result}")
        assert json.dumps(result, sort_keys=True) == json.dumps(
            test_params["ExpectedPrediction"], sort_keys=True)

        utils.remove_dir(download_dir)
    finally:
        endpoints = sagemaker_utils.list_endpoints(
            sagemaker_client,
            name_contains=input_endpoint_name)["Endpoints"]
        endpoint_names = list(map((lambda x: x["EndpointName"]), endpoints))
        # Only delete the endpoint if it was successfully created
        if input_endpoint_name in endpoint_names:
            sagemaker_utils.delete_endpoint(sagemaker_client,
                                            input_endpoint_name)
def test_groundtruth_labeling_job(kfp_client, experiment_id, region,
                                  sagemaker_client, test_file_dir):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))

    # Verify the GroundTruthJob was created in SageMaker and is InProgress.
    # TODO: Add a bot to complete the labeling job and check for completion instead.
    try:
        workteam_name, workteam_arn = create_initial_workteam(
            kfp_client,
            experiment_id,
            region,
            sagemaker_client,
            "resources/config/create-workteam",
            download_dir,
        )

        test_params["Arguments"]["workteam_arn"] = workteam_arn

        # Generate the ground_truth_train_job_name based on the workteam
        # which will be used for labeling.
        test_params["Arguments"][
            "ground_truth_train_job_name"] = ground_truth_train_job_name = (
                test_params["Arguments"]["ground_truth_train_job_name"] +
                "-by-" + workteam_name)

        run_id, _, _ = kfp_client_utils.compile_run_monitor_pipeline(
            kfp_client,
            experiment_id,
            test_params["PipelineDefinition"],
            test_params["Arguments"],
            download_dir,
            test_params["TestName"],
            test_params["Timeout"],
            test_params["StatusToCheck"],
        )

        response = sagemaker_utils.describe_labeling_job(
            sagemaker_client, ground_truth_train_job_name)
        assert response["LabelingJobStatus"] == "InProgress"

        # Verify that the workteam has the specified labeling job
        labeling_jobs = sagemaker_utils.list_labeling_jobs_for_workteam(
            sagemaker_client, workteam_arn)
        assert len(labeling_jobs["LabelingJobSummaryList"]) == 1
        assert (labeling_jobs["LabelingJobSummaryList"][0]["LabelingJobName"]
                == ground_truth_train_job_name)

        # Test terminate functionality
        print(
            f"Terminating run: {run_id} where GT job_name: {ground_truth_train_job_name}"
        )
        kfp_client_utils.terminate_run(kfp_client, run_id)
        response = sagemaker_utils.describe_labeling_job(
            sagemaker_client, ground_truth_train_job_name)
        assert response["LabelingJobStatus"] in ["Stopping", "Stopped"]
    finally:
        # Check if terminate failed, and stop the labeling job
        labeling_jobs = sagemaker_utils.list_labeling_jobs_for_workteam(
            sagemaker_client, workteam_arn)
        if len(labeling_jobs["LabelingJobSummaryList"]) > 0:
            sagemaker_utils.stop_labeling_job(sagemaker_client,
                                              ground_truth_train_job_name)

        # Cleanup the workteam
        workteams = sagemaker_utils.list_workteams(
            sagemaker_client)["Workteams"]
        workteam_names = list(map((lambda x: x["WorkteamName"]), workteams))
        if workteam_name in workteam_names:
            sagemaker_utils.delete_workteam(sagemaker_client, workteam_name)

    # Delete generated files
    utils.remove_dir(download_dir)
def test_ok():
    bin_path = '../build/bin/logs'
    logs_dir = '/tmp/log_dir'
    files_number = 10
    threads_number = 10
    result_dir = '/tmp/res_dir'
    test_result_dir = '/tmp/test_res'
    file_lines_number = 100000
    fact_names = ['fact_name' + str(i) for i in range(1, 100)]

    utils.make_dir(logs_dir)

    timer = monotonic()
    result = {}
    for i in range(file_lines_number):
        entry = utils.Entry(randrange(1600560000, 1601424000),
                            choice(fact_names), 111222,
                            [randrange(1, 1000) for i in range(10)])
        with open(logs_dir + '/file1.log', 'a') as file:
            file.write(str(entry) + '\n')
        dt = str(datetime.utcfromtimestamp(entry.ts_fact).date())
        if dt not in result:
            result[dt] = {}
        if entry.fact_name not in result[dt]:
            result[dt][entry.fact_name] = []
        result[dt][entry.fact_name].append({
            'props': entry.props,
            'count': files_number
        })
    for i in range(2, files_number + 1):
        copyfile(logs_dir + '/file1.log',
                 logs_dir + '/file' + str(i) + '.log')
    print('generation time:', monotonic() - timer)

    timer = monotonic()
    utils.make_dir(test_result_dir)
    with open(test_result_dir + '/agr.txt', 'w') as file:
        file.write(json.dumps(utils.order_dict(result)))
    print('result writing time:', monotonic() - timer)

    timer = monotonic()
    ret = subprocess.run([
        bin_path, logs_dir,
        str(files_number),
        str(threads_number), result_dir
    ])
    assert ret.returncode == 0, ret.stderr
    print('application running time:', monotonic() - timer)

    with open(result_dir + '/agr.txt') as file:
        res_content = file.read()
    with open(test_result_dir + '/agr.txt', 'r') as file:
        test_res_content = file.read()
    assert res_content == test_res_content

    utils.remove_dir(logs_dir)
    utils.remove_dir(result_dir)
    utils.remove_dir(test_result_dir)
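# test_ok compares JSON dumps byte-for-byte, so utils.order_dict presumably
# normalizes key order before serialization. A plausible sketch that
# recursively sorts dictionary keys (an assumption about the helper, not the
# actual implementation) -- with insertion-ordered dicts, json.dumps on the
# result is then deterministic:
def order_dict(obj):
    """Recursively sort dict keys so json.dumps output is deterministic."""
    if isinstance(obj, dict):
        return {key: order_dict(obj[key]) for key in sorted(obj)}
    if isinstance(obj, list):
        return [order_dict(value) for value in obj]
    return obj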
def run(min_pings_init=30, min_pings_split=20, min_dist=2.0):
    """
    Runs feature generation that allows the modeling stage to take place.
    Feature generation involves three main stages:
        - generating a sample to show the model
        - breaking the sample up into trajectories
        - computing quantitative features on each trajectory
        - writing an image of each trajectory to folders grouped by 'vessel_type'

    :param min_pings_init: int
        The minimum number of AIS data points that must appear in a
        trajectory for it to be included in the sample.
    :param min_pings_split: int
        Applied after splitting trajectories at the gap. Should be smaller
        than min_pings_init. Ensures that split trajectories also have more
        than a certain minimum number of pings.
    :param min_dist: float
        Minimum distance threshold passed to create_cnn_sample.
    :returns: None
    """
    start = time.time()

    # Set environment variables
    settings.load()
    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()
    base_dir = settings.get_base_dir()
    sql_dir = base_dir.joinpath('sql')
    data_dir = settings.get_data_dir()

    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')

    # Create a sql table with complete trajectories
    sample_switch = input(
        "Create new sample for Convolutional Neural Net? (Y/N)")
    if sample_switch in ['Y', 'y', '1', 'Yes']:
        print("Creating CNN sample.")
        create_cnn_sample(sql_dir,
                          engine,
                          min_pings_init=min_pings_init,
                          min_dist=min_dist)

    # Get data to process from postgres
    execute_sql('drop table if exists features.quants;',
                engine,
                read_file=False)
    if (data_dir / 'trajectories').is_dir():
        print("Removing old trajectories directory.")
        remove_dir(data_dir / 'trajectories')

    try:
        df = execute_sql("select * from features.cnn_sample",
                         engine,
                         read_file=False,
                         return_df=True)
        print("Grabbing trajectory data")
    except db.exc.ProgrammingError:
        print("The table features.cnn_sample doesn't exist. Please create one.")
        raise SystemExit

    # Set data types of several key columns
    df = df.rename(columns={'time_stamp': 't'})
    df['t'] = pd.to_datetime(df['t'])
    df['longitude'] = pd.to_numeric(df['longitude'])
    df['latitude'] = pd.to_numeric(df['latitude'])

    # Set df index
    df.index = df['t']
    df_geo = df_to_geodf(df)

    # Filter by date and mmsi
    df_group = df_geo.groupby([pd.Grouper(freq='D'), 'mmsi'])

    # Loop through the grouped dataframes
    counter = 0

    # Load basemap shape file
    # resolutions -- c: coarse, l: low, i: intermediate, h: high, f: full
    base_map = geopandas.read_file(
        '/Akamai/ais_project_data/GSHHS_shp/c/GSHHS_c_L1.shp')
    # Set CRS WGS 84
    base_map = base_map.to_crs(epsg=4326)

    for name, group in df_group:
        if len(group) < min_pings_init:
            continue
        trajectory = mp.Trajectory(name, group)

        # Split the trajectory at the gap
        split_trajectories = list(
            trajectory.split_by_observation_gap(timedelta(minutes=30)))

        ### CREATE TRAJECTORY IDs
        for split_index, trajectory in enumerate(split_trajectories):
            # create a universal trajectory ID
            # format is: mmsi-date-split_index
            trajectory.df['traj_id'] = (str(name[1]) + '-' +
                                        str(name[0].date()) + '-' +
                                        str(split_index))

        ### CREATE QUANT FEATURES AND WRITE IMAGES TO DISK
        for split in split_trajectories:
            # store the length of the split trajectory in km
            traj_length = split.get_length() / 1_000
            if (len(split.df) < min_pings_split) or (traj_length < .5):
                print(f"Dropping a trajectory with length: {str(traj_length)} km "
                      f"and {str(len(split.df))} pings.")
                continue
            else:
                try:
                    quants = compute_quants(split.df[['longitude', 'latitude']])
                    quants['traj_id'] = str(split.df['traj_id'].iloc[0])
                    quants['vessel_type'] = str(split.df['vessel_type'].iloc[0])
                    quants.to_sql('quants',
                                  engine,
                                  schema='features',
                                  if_exists='append',
                                  index=False)
                    ### WRITE IMAGES TO DISK
                    save_matplotlib_img(split, data_dir, base_map)
                    counter += 1
                except Exception:
                    print(f"An error occurred processing trajectory "
                          f"{split.df['traj_id'].iloc[0]}.")

    end = time.time()
    print(f"Generated features for {str(counter)} images in "
          f"{str(round(end - start))} seconds.")
    return
def test_groundtruth_labeling_job(kfp_client, experiment_id, region,
                                  sagemaker_client, test_file_dir):
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        ))

    # First create a workteam using a separate pipeline and get the name, arn
    # of the workteam created.
    workteam_name, _ = create_workteamjob(
        kfp_client,
        experiment_id,
        region,
        sagemaker_client,
        "resources/config/create-workteam",
        download_dir,
    )

    test_params["Arguments"][
        "workteam_arn"] = workteam_arn = sagemaker_utils.get_workteam_arn(
            sagemaker_client, workteam_name)

    # Generate the ground_truth_train_job_name based on the workteam which
    # will be used for labeling.
    test_params["Arguments"][
        "ground_truth_train_job_name"] = ground_truth_train_job_name = (
            test_params["Arguments"]["ground_truth_train_job_name"] + "-by-" +
            workteam_name)

    _ = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
        test_params["StatusToCheck"],
    )

    # Verify the GroundTruthJob was created in SageMaker and is InProgress.
    # TODO: Add a bot to complete the labeling job and check for completion instead.
    try:
        response = sagemaker_utils.describe_labeling_job(
            sagemaker_client, ground_truth_train_job_name)
        assert response["LabelingJobStatus"] == "InProgress"

        # Verify that the workteam has the specified labeling job
        labeling_jobs = sagemaker_utils.list_labeling_jobs_for_workteam(
            sagemaker_client, workteam_arn)
        assert len(labeling_jobs["LabelingJobSummaryList"]) == 1
        assert (labeling_jobs["LabelingJobSummaryList"][0]["LabelingJobName"]
                == ground_truth_train_job_name)
    finally:
        # Cleanup the SageMaker Resources
        sagemaker_utils.stop_labeling_job(sagemaker_client,
                                          ground_truth_train_job_name)
        sagemaker_utils.delete_workteam(sagemaker_client, workteam_name)

    # Delete generated files
    utils.remove_dir(download_dir)
def main():
    if main_repo_url is None or sub_repos is None:
        config_error()

    old_wd = os.getcwd()

    print utils.blue('Initialising merging dir ' + merger_dir)
    utils.remove_dir(merger_dir)
    utils.ensure_dir(merger_dir)
    os.chdir(merger_dir)

    main_repo_name = utils.get_repo_name(main_repo_url)
    main_repo_dir = os.path.join(merger_dir, main_repo_name)
    print utils.blue('\nCloning main repo ' + main_repo_name)
    call(['git', 'clone', main_repo_url])
    print ''

    for sub_repo in sub_repos:
        sub_repo_name = utils.get_repo_name(sub_repo['url'])
        sub_repo_dir = os.path.join(merger_dir, sub_repo_name)
        print utils.blue('Merging sub-repo ' + sub_repo_name + ' into main repo ' + main_repo_name)

        print utils.blue('>Cloning sub-repo ' + sub_repo_name)
        call(['git', 'clone', sub_repo['url']])
        os.chdir(sub_repo_dir)

        print utils.blue('>Looking for files to delete')
        if files_to_delete is not None and files_to_delete:
            files = os.listdir(sub_repo_dir)
            for f in files:
                for ftd in files_to_delete:
                    if (ftd.startswith('*') and f.endswith(ftd.split('*')[1])) or (not ftd.startswith('*') and f == ftd):
                        file_path = os.path.join(sub_repo_dir, f)
                        utils.remove_file(file_path)
                        print utils.blue('>>File ' + file_path + ' deleted')

        files = os.listdir(sub_repo_dir)
        files.remove('.git')
        destination_dir = os.path.join(sub_repo_dir, sub_repo['dir'])
        print utils.blue('>Directory ' + destination_dir + ' created')
        utils.ensure_dir(destination_dir)
        for f in files:
            call(['git', 'mv', f, sub_repo['dir']])
            print utils.blue('>>File/dir ' + f + ' moved into ' + sub_repo['dir'])

        call(['git', 'add', '-A'])
        call(['git', 'commit', '-m', 'Merging ' + sub_repo_name + ' into ' + main_repo_name])
        print utils.blue('>Changes committed in sub-repo')

        os.chdir(main_repo_dir)
        print utils.blue('>Adding remote ' + sub_repo_name)
        call(['git', 'remote', 'add', sub_repo_name, sub_repo_dir])
        print utils.blue('>Fetching remote ' + sub_repo_name)
        call(['git', 'fetch', sub_repo_name])
        print utils.blue('>Merging ' + sub_repo_name + ' into ' + main_repo_name)
        call(['git', 'merge', '--allow-unrelated-histories', '--no-edit', sub_repo_name + '/master'])
        print utils.blue('>Removing remote ' + sub_repo_name)
        call(['git', 'remote', 'remove', sub_repo_name])
        print utils.blue('>Sub-repo ' + sub_repo_name + ' merged into main repo ' + main_repo_name)

        os.chdir(merger_dir)
        print ''

    os.chdir(main_repo_dir)
    push = None
    while push is None:
        push = utils.yes_no_input('Do you want to push the merged main repo ' + main_repo_name + ' to remote', 'y')
    if push:
        call(['git', 'push'])

    remote_delete = None
    while remote_delete is None:
        remote_delete = utils.yes_no_input('Do you want to delete the remote sub-repos', 'n')
    if remote_delete:
        for sub_repo in sub_repos:
            sub_repo_name = utils.get_repo_name(sub_repo['url'])
            print '\tTODO: delete here remote repo ' + sub_repo_name + '. Not yet implemented, does nothing for now.'
            # TODO: see https://developer.github.com/v3/repos/#delete-a-repository

    clean = None
    while clean is None:
        clean = utils.yes_no_input('Do you want to delete the merging dir ' + merger_dir + ' and all its content', 'y')
    if clean:
        utils.remove_dir(merger_dir)

    os.chdir(old_wd)
# Note that the entire row is the value here
all_dupes = ais_bounded.map(lambda x: ((x[0], x[1]), x))

# Group by row values, dropping duplicates
ais_deduped = all_dupes.reduceByKey(lambda x, y: x) \
                       .map(lambda x: x[1])
# ais_deduped = all_dupes.distinct()

ais_with_dupes.unpersist()
print("Rows in deduped data: ", ais_deduped.count())


def toCSVLine(data):
    # join the fields with tabs (tab-separated, despite the CSV name)
    return '\t'.join(str(d) for d in data)


lines = ais_deduped.map(toCSVLine)

deduped_path = Path('/Akamai/ais_project_data/ais_deduped')
save_path = deduped_path.joinpath(monthly_dir.name)
# save_path = deduped_path.joinpath('2019Sep')
if save_path.is_dir():
    remove_dir(save_path)
print(save_path.parts)
lines.saveAsTextFile(str(save_path.resolve()))

end = datetime.datetime.now()
print("Runtime: ", end - start)
def save_subtitle_file(download_dir, temp_file_path):
    dir_path, name_list = unzip_file(temp_file_path)
    success, file_path = move_subtitle_to_download_dir(
        dir_path, name_list, download_dir, SUBTITLE_FILE_EXTENSION)
    remove_dir(dir_path)
    return success, file_path
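# save_subtitle_file assumes an unzip_file helper that extracts an archive to
# a scratch directory and returns that directory plus the member names, which
# is why remove_dir(dir_path) is needed afterwards. A minimal sketch of that
# contract using the standard library (an assumption, not the repo's actual
# implementation):
import tempfile
import zipfile


def unzip_file(archive_path):
    """Extract a zip archive into a fresh temp dir; return (dir, names)."""
    extract_dir = tempfile.mkdtemp(prefix='subtitle_')
    with zipfile.ZipFile(archive_path) as zf:
        zf.extractall(extract_dir)
        return extract_dir, zf.namelist()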