def extract_all(): """ Extract features using the docker image across all courses and all sessions except holdout. :return: """ mode = "extract" raw_data_buckets = fetch_data_buckets_from_config() # clear any preexisting data for this user/job/mode clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode) # only call job_runner once with --mode-extract and --level=all; this will load ALL data up and run the docker image run_job(docker_url, mode, course=None, user=user_id, job_id=job_id, session=None, level="all", raw_data_buckets=raw_data_buckets) result_file = collect_all_results(s3, raw_data_buckets, proc_data_bucket, mode, user_id, job_id) upload_key = make_s3_key_path(user_id, job_id, mode, course=None, filename=result_file) upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key) os.remove(result_file) send_email_alert(aws_access_key_id, aws_secret_access_key, job_id, user_id, status=mode, emailaddr_to=email_to) return
def test_all(label_type):
    """
    Test a single overall model on the entire dataset using the Docker image.
    :param label_type: label type provided by user.
    :return:
    """
    mode = "test"  # note: presumably a module-level constant in the original source; set here so the snippet is self-contained
    level = "all"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    run_image(job_config, job_config.raw_data_buckets, level=level, label_type=label_type)
    # fetch archived result file and push csv result back to s3, mimicking session- and course-level workflow
    result_file = collect_all_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=generate_archive_filename(job_config, extension="csv"))
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
def run_morf_job(client_config_url, server_config_url, email_to=None, no_cache=False):
    """
    Wrapper function to run complete MORF job.
    :param client_config_url: url to client.config file; should be located on local machine.
    :param server_config_url: url (local or s3) to server.config file.
    :param email_to: if provided, overrides the email address in the client.config file.
    :param no_cache: if True, do not cache the docker image and controller script in s3.
    :return:
    """
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    server_config_path = urlparse(server_config_url).path
    # read server.config and get those properties
    server_config = get_config_properties(server_config_path)
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(dir=server_config["local_working_directory"]) as working_dir:
        # save calling working directory; change directory into working_dir
        calling_dir = os.getcwd()
        os.chdir(working_dir)
        # download client.config into local_working_directory using AWS creds from server.config
        s3 = boto3.client("s3",
                          aws_access_key_id=server_config["aws_access_key_id"],
                          aws_secret_access_key=server_config["aws_secret_access_key"])
        fetch_file(s3, working_dir, client_config_url)
        local_client_config_path = os.path.join(os.getcwd(), "client.config")
        combine_config_files(server_config_path, local_client_config_path)
        config = get_config_properties()
        if email_to:  # if email_to was provided, it overrides the address in the config file -- allows users to easily run mwe
            print("[INFO] email address from submission {} overriding email address in config file {}"
                  .format(email_to, config["email_to"]))
            config["email_to"] = email_to
            update_config_fields_in_section("client", email_to=email_to)
        cache_job_file_in_s3(s3, config["user_id"], config["job_id"], config["proc_data_bucket"])
        # from client.config, fetch and download the following: docker image, controller script
        try:
            fetch_file(s3, working_dir, config["docker_url"], dest_filename=docker_image_name)
            fetch_file(s3, working_dir, config["controller_url"], dest_filename=controller_script_name)
            if not no_cache:  # cache job files in s3 unless no_cache parameter set to true
                cache_job_file_in_s3(s3, config["user_id"], config["job_id"], config["proc_data_bucket"],
                                     docker_image_name)
                cache_job_file_in_s3(s3, config["user_id"], config["job_id"], config["proc_data_bucket"],
                                     controller_script_name)
        except KeyError as e:
            cause = e.args[0]
            print("[Error]: field {} missing from client.config file.".format(cause))
            sys.exit(-1)
        # run controller script in the working directory, with notifications for initialization and completion
        send_email_alert(config["aws_access_key_id"], config["aws_secret_access_key"], config["job_id"],
                         config["user_id"], status="INITIALIZED", emailaddr_to=config["email_to"])
        subprocess.call("python3 {}".format(controller_script_name), shell=True)
        send_success_email(config["aws_access_key_id"], config["aws_secret_access_key"], config["proc_data_bucket"],
                           config["job_id"], config["user_id"], config["email_to"])
    return
def run_morf_job(job_config, no_cache=False):
    """
    Wrapper function to run complete MORF job.
    :param job_config: MorfJobConfig object for the job.
    :param no_cache: boolean, indicator whether docker image and controller script should be cached in s3.
    :return:
    """
    combined_config_filename = "config.properties"
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("running job id: {}".format(job_config.morf_id))
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    s3 = job_config.initialize_s3()
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(dir=job_config.local_working_directory) as working_dir:
        # copy config file into new directory
        shutil.copy(combined_config_filename, working_dir)
        os.chdir(working_dir)
        # from job_config, fetch and download the following: docker image, controller script, cached config file
        update_morf_job_cache(job_config)
        # from client.config, fetch and download the following: docker image, controller script
        try:
            fetch_file(s3, working_dir, job_config.docker_url, dest_filename=docker_image_name, job_config=job_config)
            fetch_file(s3, working_dir, job_config.controller_url, dest_filename=controller_script_name,
                       job_config=job_config)
            if not no_cache:  # cache job files in s3 unless no_cache parameter set to true
                cache_job_file_in_s3(job_config, filename=docker_image_name)
                cache_job_file_in_s3(job_config, filename=controller_script_name)
        except KeyError as e:
            cause = e.args[0]
            logger.error("[Error]: field {} missing from client.config file.".format(cause))
            sys.exit(-1)
        # run controller script in the working directory, with notifications for initialization and completion
        job_config.update_status("INITIALIZED")
        send_email_alert(job_config)
        subprocess.call("python3 {}".format(controller_script_name), shell=True)
        job_config.update_status("SUCCESS")
        send_success_email(job_config)
    return
def extract_holdout_course(raw_data_dir="morf-data/", multithread=True):
    """
    Extract features using the Docker image across each course of holdout data.
    :raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :multithread: whether to run job in parallel (multithread = false can be useful for debugging).
    :return:
    """
    mode = "extract-holdout"
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # call job_runner once per course with --mode=extract-holdout and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                holdout_session = fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                                 fetch_holdout_session_only=True)[0]  # only use holdout run; unlisted
                poolres = pool.apply_async(run_image,
                                           [job_config, raw_data_bucket, course, holdout_session, level, None])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=result_file)
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
def test_course(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    tests one model per course using the Docker image.
    :param label_type: label type provided by user.
    :raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :multithread: whether to run job in parallel (multithread = false can be useful for debugging).
    :return:
    """
    mode = "test"  # note: presumably a module-level constant in the original source; set here so the snippet is self-contained
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    ## for each bucket, call job_runner once per course with --mode=test and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("[INFO] processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                poolres = pool.apply_async(run_image,
                                           [job_config, raw_data_bucket, course, None, level, label_type])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=generate_archive_filename(job_config, extension="csv"))
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
def train_all(label_type):
    """
    Train a single overall model on the entire dataset using the Docker image.
    :param label_type: label type provided by user.
    :return: None
    """
    level = "all"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode("train")
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    run_image(job_config, raw_data_bucket=job_config.raw_data_buckets, level=level, label_type=label_type)
    send_email_alert(job_config)
    return
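# Illustrative usage sketch (not part of the original source): MORF jobs supply a
# controller script that chains these workflow functions. A minimal all-level controller
# might look like the function below, assuming the job_config-based versions of
# extract_all / train_all / test_all defined in this file (i.e. the test_all that accepts
# a label_type); the label type "dropout" is an assumed example value.
def example_all_level_controller():
    extract_all()                     # one feature set over all courses/sessions except holdout
    train_all(label_type="dropout")   # train a single overall model on those features
    test_all(label_type="dropout")    # test that model and upload results to the processed-data bucket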
def test_course(raw_data_dir="morf-data/"):
    """
    tests one model per course using the Docker image.
    :return:
    """
    raw_data_buckets = fetch_data_buckets_from_config()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode)
    ## for each bucket, call job_runner once per course with --mode=test and --level=course
    for raw_data_bucket in raw_data_buckets:
        print("[INFO] processing bucket {}".format(raw_data_bucket))
        with Pool() as pool:
            for course in fetch_complete_courses(s3, raw_data_bucket, raw_data_dir, n_train=1):
                pool.apply_async(run_job,
                                 [docker_url, mode, course, user_id, job_id, None, "course", raw_data_bucket])
            pool.close()
            pool.join()
    result_file = collect_course_results(s3, raw_data_buckets, proc_data_bucket, mode, user_id, job_id)
    upload_key = make_s3_key_path(user_id, job_id, mode, course=None,
                                  filename=generate_archive_filename(user_id=user_id, job_id=job_id, mode=mode,
                                                                     extension="csv"))
    upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(aws_access_key_id, aws_secret_access_key, job_id, user_id, status=mode, emailaddr_to=email_to)
    return
def extract_all(): """ Extract features using the docker image across all courses and all sessions except holdout. :return: """ mode = "extract" level = "all" job_config = MorfJobConfig(CONFIG_FILENAME) job_config.update_mode(mode) # clear any preexisting data for this user/job/mode clear_s3_subdirectory(job_config) # only call job_runner once with --mode-extract and --level=all; this will load ALL data up and run the docker image run_image(job_config, job_config.raw_data_buckets, level=level) result_file = collect_all_results(job_config) upload_key = make_s3_key_path(job_config, filename=result_file) upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key) os.remove(result_file) send_email_alert(job_config) return
def train_session(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Train one model per session of the course using the Docker image.
    :param label_type: label type provided by user.
    :raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :multithread: whether to run job in parallel (multithread = false can be useful for debugging).
    :return: None
    """
    mode = "train"  # note: presumably a module-level constant in the original source; set here so the snippet is self-contained
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # for each bucket, call job_runner once per session with --mode=train and --level=session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course):
                    poolres = pool.apply_async(run_image,
                                               [job_config, raw_data_bucket, course, session, level, label_type])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    send_email_alert(job_config)
    return
def test_all():
    """
    Test a single overall model on the entire dataset using the Docker image.
    :return:
    """
    raw_data_buckets = fetch_data_buckets_from_config()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode)
    run_job(docker_url, mode, None, user_id, job_id, None, "all", None, raw_data_buckets=raw_data_buckets)
    # fetch archived result file and push csv result back to s3, mimicking session- and course-level workflow
    result_file = collect_all_results(s3, proc_data_bucket, mode, user_id, job_id)
    upload_key = make_s3_key_path(user_id, job_id, mode, course=None,
                                  filename=generate_archive_filename(user_id=user_id, job_id=job_id, mode=mode,
                                                                     extension="csv"))
    upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(aws_access_key_id, aws_secret_access_key, job_id, user_id, status=mode, emailaddr_to=email_to)
    return
def extract_course(raw_data_dir="morf-data/"):
    """
    Extract features using the Docker image, building individual feature sets for each course.
    :return:
    """
    mode = "extract"
    raw_data_buckets = fetch_data_buckets_from_config()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode)
    # call job_runner once per course with --mode=extract and --level=course
    for raw_data_bucket in raw_data_buckets:
        print("[INFO] processing bucket {}".format(raw_data_bucket))
        with Pool() as pool:
            for course in fetch_courses(s3, raw_data_bucket, raw_data_dir):
                pool.apply_async(run_job,
                                 [docker_url, mode, course, user_id, job_id, None, "course", raw_data_bucket])
            pool.close()
            pool.join()
    result_file = collect_course_results(s3, raw_data_buckets, proc_data_bucket, mode, user_id, job_id)
    upload_key = make_s3_key_path(user_id, job_id, mode, course=None, filename=result_file)
    upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(aws_access_key_id, aws_secret_access_key, job_id, user_id, status=mode, emailaddr_to=email_to)
    return
def run_morf_job(job_config, no_cache=False, no_morf_cache=False):
    """
    Wrapper function to run complete MORF job.
    :param job_config: MorfJobConfig object
    :param no_cache: boolean, indicator whether docker_image should be cached in s3
    :param no_morf_cache: boolean, indicator for whether to cache morf data locally
    :return:
    """
    combined_config_filename = "config.properties"
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("running job id: {}".format(job_config.morf_id))
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    s3 = job_config.initialize_s3()
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(dir=job_config.local_working_directory) as working_dir:
        # copy config file into new directory
        shutil.copy(combined_config_filename, working_dir)
        os.chdir(working_dir)
        # from job_config, fetch and download the following: docker image, controller script, cached config file
        if not no_morf_cache:
            update_raw_data_cache(job_config)
        # from client.config, fetch and download the following: docker image, controller script
        try:
            fetch_file(s3, working_dir, job_config.docker_url, dest_filename=docker_image_name, job_config=job_config)
            fetch_file(s3, working_dir, job_config.controller_url, dest_filename=controller_script_name,
                       job_config=job_config)
            if not no_cache:  # cache job files in s3 unless no_cache parameter set to true
                cache_job_file_in_s3(job_config, filename=docker_image_name)
                cache_job_file_in_s3(job_config, filename=controller_script_name)
        except KeyError as e:
            cause = e.args[0]
            logger.error("[Error]: field {} missing from client.config file.".format(cause))
            sys.exit(-1)
        # run controller script in the working directory, with notifications for initialization and completion
        job_config.update_status("INITIALIZED")
        send_email_alert(job_config)
        subprocess.call("python3 {}".format(controller_script_name), shell=True)
        job_config.update_status("SUCCESS")
        # push image to docker cloud, create doi for job files in zenodo, and send success email
        docker_cloud_path = cache_to_docker_hub(job_config, working_dir, docker_image_name)
        setattr(job_config, "docker_cloud_path", docker_cloud_path)
        zenodo_deposition_id = upload_files_to_zenodo(job_config,
                                                      upload_files=(job_config.controller_url,
                                                                    job_config.client_config_url))
        setattr(job_config, "zenodo_deposition_id", zenodo_deposition_id)
        send_success_email(job_config)
    return
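# Illustrative usage sketch (not part of the original source): run_morf_job expects a
# MorfJobConfig built from the combined config file ("config.properties", matching
# combined_config_filename above). The MorfJobConfig import location is an assumption.
def example_submit_job():
    # from morf.utils.config import MorfJobConfig  # assumed import path
    job_config = MorfJobConfig("config.properties")
    run_morf_job(job_config, no_cache=False, no_morf_cache=False)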
def extract_holdout_session(labels=False, raw_data_dir="morf-data/", label_type="labels-train", multithread=True):
    """
    Extract features using the Docker image across each session of holdout data.
    :labels: flag for whether this is a job to generate output labels; if so, the collected result file is copied back into the raw data folder in s3 (as labels-test.csv).
    :return: None
    """
    mode = "extract-holdout"
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # call job_runner once per session with --mode=extract-holdout and --level=session
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("[INFO] processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                holdout_session = fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                                 fetch_holdout_session_only=True)[0]  # only use holdout run; unlisted
                poolres = pool.apply_async(run_image,
                                           [job_config, raw_data_bucket, course, holdout_session, level])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    if not labels:  # normal feature extraction job; collects features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(job_config, holdout=True)
        upload_key = "{}/{}/{}/{}".format(job_config.user_id, job_config.job_id, job_config.mode, result_file)
        upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    else:  # label extraction job; copy file into raw course data dir instead of proc_data_bucket, creating separate label files for each bucket
        for raw_data_bucket in job_config.raw_data_buckets:
            result_file = collect_session_results(job_config, raw_data_buckets=[raw_data_bucket])
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file, bucket=raw_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
def extract_session(labels=False, raw_data_dir="morf-data/", label_type="labels-train", multithread=True):
    """
    Extract features using the Docker image, building individual feature sets for each "session" or iteration of the course.
    :labels: flag for whether this is a job to generate output labels; if so, the collected result file is copied back into the raw data folder in s3 (as labels-train.csv).
    :raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :label_type: type of outcome label to use (string).
    :multithread: whether to run job in parallel (multithread = false can be useful for debugging).
    :return:
    """
    level = "session"
    mode = "extract"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode and set number of cores
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    ## for each bucket, call job_runner once per session with --mode=extract and --level=session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                              fetch_holdout_session_only=False):
                    poolres = pool.apply_async(run_image,
                                               [job_config, raw_data_bucket, course, session, level])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    if not labels:  # normal feature extraction job; collects features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(job_config)
        upload_key = "{}/{}/extract/{}".format(job_config.user_id, job_config.job_id, result_file)
        upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    else:  # label extraction job; copy file into raw course data dir instead of proc_data_bucket, creating separate label files for each bucket
        for raw_data_bucket in job_config.raw_data_buckets:
            result_file = collect_session_results(job_config, raw_data_buckets=[raw_data_bucket])
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file, bucket=raw_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
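# Illustrative usage sketch (not part of the original source): with labels=True,
# extract_session and extract_holdout_session act as label-generation jobs, writing the
# collected results back into each raw data bucket as <label_type>.csv instead of into the
# processed-data bucket. The label_type values below follow the defaults and docstrings above.
def example_generate_labels():
    extract_session(labels=True, label_type="labels-train")          # writes morf-data/labels-train.csv per bucket
    extract_holdout_session(labels=True, label_type="labels-test")   # writes morf-data/labels-test.csv per bucket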
def extract_holdout_session(labels=False, raw_data_dir="morf-data/", label_type="labels-train", multithread=True):
    """
    Extract features using the Docker image across each session of holdout data.
    :labels: flag for whether this is a job to generate output labels; if so, the collected result file is copied back into the raw data folder in s3 (as labels-test.csv).
    :return: None
    """
    mode = "extract-holdout"
    # call job_runner once per session with --mode=extract-holdout and --level=session
    raw_data_buckets = fetch_data_buckets_from_config()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode)
    for raw_data_bucket in raw_data_buckets:
        print("[INFO] processing bucket {}".format(raw_data_bucket))
        if multithread:
            with Pool() as pool:
                for course in fetch_courses(s3, raw_data_bucket, raw_data_dir):
                    holdout_run = fetch_sessions(s3, raw_data_bucket, raw_data_dir, course,
                                                 fetch_holdout_session_only=True)[0]  # only use holdout run; unlisted
                    pool.apply_async(run_job, [docker_url, mode, course, user_id, job_id, holdout_run, "session",
                                               raw_data_bucket])
                pool.close()
                pool.join()
        else:  # do job in serial; this is useful for debugging
            for course in fetch_courses(s3, raw_data_bucket, raw_data_dir):
                holdout_run = fetch_sessions(s3, raw_data_bucket, raw_data_dir, course,
                                             fetch_holdout_session_only=True)[0]  # only use holdout run; unlisted
                run_job(docker_url, mode, course, user_id, job_id, holdout_run, "session", raw_data_bucket)
    if not labels:  # normal feature extraction job; collects features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(s3, raw_data_buckets, proc_data_bucket, mode, user_id, job_id,
                                              holdout=True)
        upload_key = "{}/{}/{}/{}".format(user_id, job_id, mode, result_file)
        upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key)
    if labels:  # label extraction job; copy file into raw course data dir instead of proc_data_bucket, creating separate label files for each bucket
        for raw_data_bucket in raw_data_buckets:
            result_file = collect_session_results(s3, [raw_data_bucket], proc_data_bucket, mode, user_id, job_id,
                                                  holdout=True)
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file, bucket=raw_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(aws_access_key_id, aws_secret_access_key, job_id, user_id, status=mode, emailaddr_to=email_to)
    return