def docker_cloud_login(job_config):
    """
    Authenticate the local docker client against Docker Cloud.

    Credentials are read from the supplied job configuration.
    :param job_config: MorfJobConfig object.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    # NOTE(review): the password is interpolated into the command line, and the
    # command is passed to execute_and_log_output -- confirm that helper redacts
    # credentials before logging.
    login_cmd = "docker login --username={} --password={}".format(
        job_config.docker_cloud_username,
        job_config.docker_cloud_password)
    execute_and_log_output(login_cmd, logger)
    return
def docker_cloud_push(job_config, image_uuid):
    """
    Tag a local docker image with the job's morf_id and push it to the
    Docker Cloud repository named in job_config.
    :param job_config: MorfJobConfig object.
    :param image_uuid: Docker image uuid.
    :return: the fully-qualified "<repo>:<tag>" path the image was pushed to.
    """
    logger = set_logger_handlers(module_logger, job_config)
    repo_and_tag = "{}:{}".format(job_config.docker_cloud_repo,
                                  job_config.morf_id)
    # Tag the image first, then push the tagged name to the remote repository.
    for docker_cmd in ("docker tag {} {}".format(image_uuid, repo_and_tag),
                       "docker push {}".format(repo_and_tag)):
        execute_and_log_output(docker_cmd, logger)
    return repo_and_tag
def run_image(job_config, raw_data_bucket, course=None, session=None, level=None, label_type=None):
    """
    Run the job's docker image with the specified parameters, initializing any
    input data as necessary and archiving results to S3.

    The image referenced by job_config.docker_url is downloaded into a temporary
    working directory, fed the appropriate raw/train/test data for
    job_config.mode, executed, and removed; its output directory is archived and
    moved to the job's destination.

    :param job_config: MorfJobConfig object; mode is read from job_config.mode
        ({extract, extract-holdout, train, test}) and may be rewritten to
        "extract" when running in extract-holdout mode.
    :param raw_data_bucket: raw data bucket; specify multiple buckets only if
        level == "all".
    :param course: Coursera course slug or course shortname (string).
    :param session: 3-digit course session number (for trained model or
        extraction).
    :param level: level of aggregation of MORF API function;
        {session, course, all} (string).
    :param label_type: type of outcome label to use (required for model
        training and testing) (string).
    :return: None
    :raises Exception: re-raises any error encountered while downloading the
        docker image (continuing without it could only fail later, less clearly).
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3 = job_config.initialize_s3()
    # create local directory for processing on this instance
    with tempfile.TemporaryDirectory(dir=job_config.local_working_directory) as working_dir:
        try:
            fetch_file(s3, working_dir, job_config.docker_url, dest_filename="docker_image")
        except Exception:
            # BUG FIX: previously this only logged (without the traceback) and
            # fell through with no image on disk; log the full exception and
            # propagate it instead.
            logger.exception("[ERROR] Error downloading file {} to {}".format(
                job_config.docker_url, working_dir))
            raise
        input_dir, output_dir = initialize_input_output_dirs(working_dir)
        # fetch any data or models needed for this mode
        if "extract" in job_config.mode:
            # download raw data
            initialize_raw_course_data(job_config, raw_data_bucket=raw_data_bucket,
                                       mode=job_config.mode, course=course, session=session,
                                       level=level, input_dir=input_dir)
            # sets mode to "extract" in case of "extract-holdout"
            job_config.mode = "extract"
        # fetch training/testing data
        if job_config.mode in ["train", "test"]:
            sync_s3_job_cache(job_config)
            initialize_train_test_data(job_config, raw_data_bucket=raw_data_bucket,
                                       level=level, label_type=label_type, course=course,
                                       session=session, input_dir=input_dir)
        if job_config.mode == "test":
            # fetch models and untar
            download_models(job_config, course=course, session=session,
                            dest_dir=input_dir, level=level)
        image_uuid = load_docker_image(dir=working_dir, job_config=job_config, logger=logger)
        # build docker run command and execute the image
        cmd = make_docker_run_command(job_config, job_config.docker_exec, input_dir,
                                      output_dir, image_uuid, course, session,
                                      job_config.mode, client_args=job_config.client_args)
        execute_and_log_output(cmd, logger)
        # cleanup: remove the loaded image from the local docker daemon
        execute_and_log_output(
            "{} rmi --force {}".format(job_config.docker_exec, image_uuid), logger)
        # archive and write output
        archive_file = make_output_archive_file(output_dir, job_config,
                                                course=course, session=session)
        move_results_to_destination(archive_file, job_config,
                                    course=course, session=session)
    return
def execute_image_for_cv(job_config, raw_data_bucket, course, fold_num, docker_image_dir, label_type, raw_data_dir="morf-data/"):
    """
    Run the job's docker image in cross-validation ("cv") mode for one
    course/fold, then upload the fold's test predictions to S3.

    :param job_config: MorfJobConfig object.
    :param raw_data_bucket: S3 bucket containing the raw course data.
    :param course: Coursera course slug or course shortname (string).
    :param fold_num: cross-validation fold number.
    :param docker_image_dir: directory containing the docker image to load.
    :param label_type: type of outcome label to use (string).
    :param raw_data_dir: key prefix of the raw data within the bucket.
    :return: None
    """
    user_id_col = "userID"
    logger = set_logger_handlers(module_logger, job_config)
    with tempfile.TemporaryDirectory(dir=job_config.local_working_directory) as working_dir:
        input_dir, output_dir = initialize_input_output_dirs(working_dir)
        course_input_dir = os.path.join(input_dir, course)
        # get fold train data
        trainkey = make_s3_key_path(
            job_config, course, make_feature_csv_name(course, fold_num, "train"))
        train_data_path = download_from_s3(job_config.proc_data_bucket, trainkey,
                                           job_config.initialize_s3(),
                                           dir=course_input_dir, job_config=job_config)
        # get fold test data; only the download side effect is needed here,
        # the local path itself is not used
        testkey = make_s3_key_path(
            job_config, course, make_feature_csv_name(course, fold_num, "test"))
        download_from_s3(job_config.proc_data_bucket, testkey,
                         job_config.initialize_s3(),
                         dir=course_input_dir, job_config=job_config)
        # get labels for the training users (written into input_dir)
        train_users = pd.read_csv(train_data_path)[user_id_col]
        initialize_cv_labels(job_config, train_users, raw_data_bucket, course,
                             label_type, input_dir, raw_data_dir, fold_num,
                             "train", level="course")
        # run docker image with mode == "cv"
        # BUG FIX: the mode argument was the undefined name `mode`, which raised
        # NameError at runtime; the literal "cv" matches the stated intent.
        image_uuid = load_docker_image(docker_image_dir, job_config, logger)
        cmd = make_docker_run_command(
            job_config, job_config.docker_exec, input_dir, output_dir,
            image_uuid, course, None, "cv",
            job_config.client_args) + " --fold_num {}".format(fold_num)
        execute_and_log_output(cmd, logger)
        # upload results
        pred_csv = os.path.join(output_dir, "{}_{}_test.csv".format(course, fold_num))
        pred_key = make_s3_key_path(job_config, course,
                                    os.path.basename(pred_csv), mode="test")
        upload_file_to_s3(pred_csv, job_config.proc_data_bucket, pred_key,
                          job_config, remove_on_success=True)
    return
for course in fetch_complete_courses(job_config, raw_data_bucket): for session in fetch_sessions(job_config, raw_data_bucket, data_dir=MORF_DATA_DIR, course=course, fetch_all_sessions=True): with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir: print("[INFO] processing course {} session {}".format(course, session)) # download the data exports fetch_raw_course_data(job_config, raw_data_bucket, course, session, input_dir=working_dir) # download_raw_course_data(job_config, raw_data_bucket, course=course, session=session, input_dir=working_dir, # data_dir=MORF_DATA_DIR[:-1]) # drop trailing slash on data dir # create docker run command and load image image_uuid = load_docker_image(MYSQL_DOCKER_DIR, job_config, logger, image_name=MYSQL_DOCKER_IMG_NAME) cmd = make_docker_run_command(job_config.docker_exec, working_dir, OUTPUT_DIR, image_uuid, course=course, session=session, mode=None, client_args=None) # run the docker image, make sure to pass params for course and session execute_and_log_output(cmd, logger) # concatenate into single file in OUTPUT_DIR df_list = [] for f in os.listdir(OUTPUT_DIR): if f.endswith(".csv") and not f.startswith("."): # only use csv files, ignore system files print("[INFO] reading file {}".format(f)) # pull course name and session from filename regex = re.compile("^hash_mapping_(\S+)_([0-9\-]+).*") res = re.search(regex, f) course = res.group(1) session = res.group(2) # read dataframe and add columns for course and session df = pd.read_csv(os.path.join(OUTPUT_DIR, f)) df["course"] = course df["session"] = session