def create_course_folds(label_type, k=5, multithread=True):
    """
    From extract and extract-holdout data, create k randomized folds, pooling data by course (across sessions),
    and archive the results to s3.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available).
    :return: None
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("creating cross-validation folds")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                poolres = pool.apply_async(make_folds, [job_config, raw_data_bucket, course, k, label_type])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    return

def test_all(label_type):
    """
    Test a single overall model on the entire dataset using the Docker image.
    :param label_type: label type provided by user.
    :return: None
    """
    level = "all"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    run_image(job_config, job_config.raw_data_buckets, level=level, label_type=label_type)
    # fetch archived result file and push csv result back to s3, mimicking the session- and course-level workflow
    result_file = collect_all_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=generate_archive_filename(job_config, extension="csv"))
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return

def evaluate_cv_course(label_type, k=5, label_col="label_type", raw_data_dir="morf-data/", course_col="course",
                       fold_col="fold_num", pred_cols=("prob", "pred"), user_col="userID"):
    """
    Fetch metrics by first averaging over folds within each course, then return results by course.
    :param label_type: label type defined by user.
    :param k: number of cross-validation folds.
    :param label_col: column containing labels.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :param course_col: column containing course identifier.
    :param fold_col: column containing the fold number of each prediction.
    :param pred_cols: user-supplied prediction columns; these columns are checked for missing values and to ensure they contain values for every user in the course.
    :param user_col: column containing user ID for predictions.
    :return: None.
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    check_label_type(label_type)
    raw_data_buckets = job_config.raw_data_buckets
    proc_data_bucket = job_config.proc_data_bucket
    s3 = job_config.initialize_s3()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    course_data = []
    for raw_data_bucket in raw_data_buckets:
        pred_file = generate_archive_filename(job_config, mode="test", extension="csv")
        pred_key = make_s3_key_path(job_config, pred_file, mode="test")
        # download course prediction and label files, fetch classification metrics at course level
        with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
            pred_csv = download_from_s3(proc_data_bucket, pred_key, s3, working_dir, job_config=job_config)
            job_config.update_mode("cv")  # set mode to cv to fetch correct labels for sessions even if they are train/test sessions
            label_csv = initialize_labels(job_config, raw_data_bucket, None, None, label_type, working_dir,
                                          raw_data_dir, level="all")
            pred_df = pd.read_csv(pred_csv)
            lab_df = pd.read_csv(label_csv, dtype=object)
            pred_lab_df = pd.merge(lab_df, pred_df, how="left", on=[user_col, course_col])
            check_dataframe_complete(pred_lab_df, job_config, columns=list(pred_cols))
            for course in fetch_complete_courses(job_config, data_bucket=raw_data_bucket, data_dir=raw_data_dir,
                                                 n_train=1):
                fold_metrics_list = list()
                for fold_num in range(1, k + 1):
                    fold_metrics_df = fetch_binary_classification_metrics(
                        job_config, pred_lab_df[pred_lab_df[fold_col] == fold_num], course)
                    fold_metrics_list.append(fold_metrics_df)
                assert len(fold_metrics_list) == k, "number of folds doesn't match k; try running the job again from scratch"
                course_metrics_df = pd.concat(fold_metrics_list).mean()
                course_metrics_df[course_col] = course
                course_data.append(course_metrics_df)
    job_config.update_mode(mode)
    master_metrics_df = pd.concat(course_data, axis=1).T
    # reorder dataframe so the course name column comes first
    cols = list(master_metrics_df)
    cols.insert(0, cols.pop(cols.index(course_col)))
    master_metrics_df = master_metrics_df.loc[:, cols]  # .ix is deprecated; .loc is the label-based equivalent
    csv_fp = generate_archive_filename(job_config, extension="csv")
    master_metrics_df[course_col] = hash_df_column(master_metrics_df[course_col], job_config.user_id,
                                                   job_config.hash_secret)
    master_metrics_df.to_csv(csv_fp, index=False, header=True)
    upload_key = make_s3_key_path(job_config, mode="test", filename=csv_fp)
    upload_file_to_s3(csv_fp, bucket=proc_data_bucket, key=upload_key)
    os.remove(csv_fp)
    return

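# Hedged illustration of the fold-averaging step above, using toy data rather than real
# MORF output: each fold contributes a one-row frame of metrics, the frames are
# concatenated and averaged into a single per-course row, and the per-course rows are
# then stacked with pd.concat(..., axis=1).T as evaluate_cv_course() does. The metric
# names ("auc", "accuracy") and values here are made up.
def _example_fold_averaging():
    fold_metrics = [pd.DataFrame({"auc": [0.71 + 0.01 * i], "accuracy": [0.80 + 0.01 * i]}) for i in range(5)]
    course_metrics = pd.concat(fold_metrics).mean()  # Series of metric means across the k folds
    course_metrics["course"] = "example_course"
    master = pd.concat([course_metrics], axis=1).T  # one row per course
    return master
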
def evaluate_course(label_type, label_col="label_type", raw_data_dir="morf-data/", course_col="course",
                    pred_cols=("prob", "pred"), user_col="userID", labels_file="labels-test.csv"):
    """
    Fetch metrics by course.
    :param label_type: label type defined by user.
    :param label_col: column containing labels.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :param course_col: column containing course identifier.
    :param pred_cols: user-supplied prediction columns; these columns are checked for missing values and to ensure they contain values for every user in the course.
    :param user_col: column containing user ID for predictions.
    :param labels_file: name of csv file containing labels.
    :return: None.
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    check_label_type(label_type)
    raw_data_buckets = job_config.raw_data_buckets
    proc_data_bucket = job_config.proc_data_bucket
    s3 = job_config.initialize_s3()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    course_data = []
    for raw_data_bucket in raw_data_buckets:
        pred_file = generate_archive_filename(job_config, mode="test", extension="csv")
        pred_key = "{}/{}/{}/{}".format(job_config.user_id, job_config.job_id, "test", pred_file)
        label_key = raw_data_dir + labels_file
        # download course prediction and label files, fetch classification metrics at course level
        with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
            download_from_s3(proc_data_bucket, pred_key, s3, working_dir, job_config=job_config)
            download_from_s3(raw_data_bucket, label_key, s3, working_dir, job_config=job_config)
            pred_df = pd.read_csv("/".join([working_dir, pred_file]))
            lab_df = pd.read_csv("/".join([working_dir, labels_file]), dtype=object)
            lab_df = lab_df[lab_df[label_col] == label_type].copy()
            pred_lab_df = pd.merge(lab_df, pred_df, how="left", on=[user_col, course_col])
            check_dataframe_complete(pred_lab_df, job_config, columns=pred_cols)
            for course in fetch_complete_courses(job_config, data_bucket=raw_data_bucket, data_dir=raw_data_dir,
                                                 n_train=1):
                course_metrics_df = fetch_binary_classification_metrics(job_config, pred_lab_df, course)
                course_data.append(course_metrics_df)
    master_metrics_df = pd.concat(course_data).reset_index().rename(columns={"index": course_col})
    csv_fp = generate_archive_filename(job_config, extension="csv")
    master_metrics_df[course_col] = hash_df_column(master_metrics_df[course_col], job_config.user_id,
                                                   job_config.hash_secret)
    master_metrics_df.to_csv(csv_fp, index=False, header=True)
    upload_key = make_s3_key_path(job_config, mode="test", filename=csv_fp)
    upload_file_to_s3(csv_fp, bucket=proc_data_bucket, key=upload_key)
    os.remove(csv_fp)
    return

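# Hedged toy illustration of the merge step above: predictions are left-joined onto the
# filtered label frame on (userID, course), so any user with a label but no prediction
# surfaces as a NaN that check_dataframe_complete() can flag. The frames and values
# below are fabricated.
def _example_prediction_label_merge():
    lab_df = pd.DataFrame({"userID": ["u1", "u2"], "course": ["c1", "c1"], "label_value": ["1", "0"]})
    pred_df = pd.DataFrame({"userID": ["u1"], "course": ["c1"], "prob": [0.9], "pred": [1]})
    merged = pd.merge(lab_df, pred_df, how="left", on=["userID", "course"])
    return merged  # the row for "u2" has NaN prob/pred, indicating a missing prediction
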
def evaluate_prule_session():
    """
    Perform statistical testing for prule analysis.
    :return: None
    """
    raw_data_dir = "morf-data/"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    raw_data_buckets = job_config.raw_data_buckets
    proc_data_bucket = job_config.proc_data_bucket
    prule_file = job_config.prule_url
    s3 = job_config.initialize_s3()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
        input_dir, output_dir = initialize_input_output_dirs(working_dir)
        # pull extraction results from every course into working_dir
        for raw_data_bucket in raw_data_buckets:
            for course in fetch_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                              fetch_all_sessions=True):
                    if session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course):
                        # session is a non-holdout session
                        fetch_mode = "extract"
                    else:
                        fetch_mode = "extract-holdout"
                    feat_file = generate_archive_filename(job_config, course=course, session=session, mode=fetch_mode)
                    feat_key = make_s3_key_path(job_config, filename=feat_file, course=course, session=session,
                                                mode=fetch_mode)
                    feat_local_fp = download_from_s3(proc_data_bucket, feat_key, s3, input_dir, job_config=job_config)
                    unarchive_file(feat_local_fp, input_dir)
        docker_image_fp = urlparse(job_config.prule_evaluate_image).path
        docker_image_dir = os.path.dirname(docker_image_fp)
        docker_image_name = os.path.basename(docker_image_fp)
        image_uuid = load_docker_image(docker_image_dir, job_config, logger, image_name=docker_image_name)
        # create a directory for prule file and copy into it; this will be mounted to docker image
        prule_dir = os.path.join(working_dir, "prule")
        os.makedirs(prule_dir)
        shutil.copy(urlparse(prule_file).path, prule_dir)
        cmd = "{} run --network=\"none\" --rm=true --volume={}:/input --volume={}:/output --volume={}:/prule {} ".format(
            job_config.docker_exec, input_dir, output_dir, prule_dir, image_uuid)
        subprocess.call(cmd, shell=True)
        # rename result file and upload results to s3
        final_output_file = os.path.join(output_dir, "output.csv")
        final_output_archive_name = generate_archive_filename(job_config, extension="csv")
        final_output_archive_fp = os.path.join(output_dir, final_output_archive_name)
        os.rename(final_output_file, final_output_archive_fp)
        output_key = make_s3_key_path(job_config, filename=final_output_archive_name, mode="test")
        upload_file_to_s3(final_output_archive_fp, proc_data_bucket, output_key, job_config, remove_on_success=True)
    return

def extract_holdout_course(raw_data_dir="morf-data/", multithread=True):
    """
    Extract features using the Docker image for the holdout session of each course.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located.
    :param multithread: whether to run job in parallel.
    :return: None
    """
    mode = "extract-holdout"
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # call job_runner once per course with --mode=extract-holdout and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                # only the holdout session is used; take the single element out of the returned list
                holdout_session = fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                                 fetch_holdout_session_only=True)[0]
                poolres = pool.apply_async(run_image,
                                           [job_config, raw_data_bucket, course, holdout_session, level, None])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=result_file)
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return

def test_course(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Test one model per course using the Docker image.
    :param label_type: label type provided by user.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param multithread: whether to run job in parallel (multithread = False can be useful for debugging).
    :return: None
    """
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # for each bucket, call job_runner once per course with --mode=test and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                poolres = pool.apply_async(run_image, [job_config, raw_data_bucket, course, None, level, label_type])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=generate_archive_filename(job_config, extension="csv"))
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return

def train_all(label_type):
    """
    Train a single overall model on the entire dataset using the Docker image.
    :param label_type: label type provided by user.
    :return: None
    """
    level = "all"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode("train")
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    run_image(job_config, raw_data_bucket=job_config.raw_data_buckets, level=level, label_type=label_type)
    send_email_alert(job_config)
    return

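# Hedged sketch of the model-building stage a controller script might run once features
# and labels exist: train a single overall model, apply it to the holdout data, then
# score the predictions per course. The "dropout" label type and this particular
# sequence are illustrative assumptions, and the workflow functions are assumed to be
# importable in the controller's scope.
def _example_train_test_evaluate():
    train_all(label_type="dropout")
    test_all(label_type="dropout")
    evaluate_course(label_type="dropout")
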
def cross_validate_course(label_type, k=5, multithread=True):
    """
    Compute k-fold cross-validation across courses.
    :param label_type: label type provided by user.
    :param k: number of folds.
    :param multithread: whether to run job in parallel.
    :return: None
    """
    # todo: call to create_course_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    # clear previous test results
    clear_s3_subdirectory(job_config, mode="test")
    docker_image_dir = os.getcwd()  # directory the function is called from; should contain the docker image
    logger = set_logger_handlers(module_logger, job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("conducting cross validation")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for fold_num in range(1, k + 1):
                    poolres = pool.apply_async(execute_image_for_cv,
                                               [job_config, raw_data_bucket, course, fold_num, docker_image_dir,
                                                label_type])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    test_csv_fp = collect_course_cv_results(job_config)
    pred_key = make_s3_key_path(job_config, os.path.basename(test_csv_fp), mode="test")
    upload_file_to_s3(test_csv_fp, job_config.proc_data_bucket, pred_key, job_config, remove_on_success=True)
    return

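# Hedged sketch of a course-level cross-validation workflow: the todo inside
# cross_validate_course() notes that create_course_folds() should be invoked first, and
# evaluate_cv_course() consumes the fold-level predictions afterward. A controller
# script might therefore chain the three calls as below; the "dropout" label type is a
# hypothetical example and the functions are assumed to be in scope.
def _example_course_cv_workflow():
    create_course_folds(label_type="dropout", k=5)
    cross_validate_course(label_type="dropout", k=5)
    evaluate_cv_course(label_type="dropout", k=5)
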
def extract_all():
    """
    Extract features using the Docker image across all courses and all sessions except the holdout sessions.
    :return: None
    """
    mode = "extract"
    level = "all"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    # call job_runner only once with --mode=extract and --level=all; this loads ALL of the data and runs the docker image a single time
    run_image(job_config, job_config.raw_data_buckets, level=level)
    result_file = collect_all_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=result_file)
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return

def train_session(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Train one model per session of each course using the Docker image.
    :param label_type: label type provided by user.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param multithread: whether to run job in parallel (multithread = False can be useful for debugging).
    :return: None
    """
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # for each bucket, call job_runner once per session with --mode=train and --level=session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course):
                    poolres = pool.apply_async(run_image,
                                               [job_config, raw_data_bucket, course, session, level, label_type])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    send_email_alert(job_config)
    return

def cross_validate_session(label_type, k=5, multithread=True, raw_data_dir="morf-data/"):
    """
    Compute k-fold cross-validation across sessions.
    :param label_type: label type provided by user.
    :param k: number of folds.
    :param multithread: whether to run job in parallel.
    :param raw_data_dir: name of subfolder in s3 buckets containing raw data.
    :return: None
    """
    raise NotImplementedError  # this workflow is not implemented yet; the code below is a work in progress and is never reached
    # todo: call to create_session_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    # clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("conducting cross validation")
    with Pool(num_cores) as pool:
        for raw_data_bucket in job_config.raw_data_buckets:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config, raw_data_bucket, data_dir=raw_data_dir, course=course,
                                              fetch_all_sessions=True):
                    for fold_num in range(1, k + 1):
                        with tempfile.TemporaryDirectory(dir=job_config.local_working_directory) as working_dir:
                            # get fold train data
                            input_dir, output_dir = initialize_input_output_dirs(working_dir)
                            session_input_dir = os.path.join(input_dir, course, session)
                            session_output_dir = os.path.join(output_dir, course, session)
                            trainkey = make_s3_key_path(job_config, course,
                                                        make_feature_csv_name(course, session, fold_num, "train"),
                                                        session)
                            train_data_path = download_from_s3(job_config.proc_data_bucket, trainkey,
                                                               job_config.initialize_s3(), dir=session_input_dir,
                                                               job_config=job_config)
                            testkey = make_s3_key_path(job_config, course,
                                                       make_feature_csv_name(course, session, fold_num, "test"),
                                                       session)
                            test_data_path = download_from_s3(job_config.proc_data_bucket, testkey,
                                                              job_config.initialize_s3(), dir=session_input_dir,
                                                              job_config=job_config)
                            # get labels
                            initialize_labels(job_config, raw_data_bucket, course, session, label_type,
                                              session_input_dir, raw_data_dir)
                            # run docker image with mode == cv  # todo
                            # upload results  # todo
        pool.close()
        pool.join()
    return

def create_session_folds(label_type, k=5, multithread=True, raw_data_dir="morf-data/"):
    """
    From extract and extract-holdout data, create k randomized folds for each session and archive the results to s3.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available).
    :param raw_data_dir: name of subfolder in s3 buckets containing raw data.
    :return: None
    """
    user_id_col = "userID"
    label_col = "label_value"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("creating cross-validation folds")
    with Pool(num_cores) as pool:
        for raw_data_bucket in job_config.raw_data_buckets:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config, raw_data_bucket, data_dir=raw_data_dir, course=course,
                                              fetch_all_sessions=True):
                    # todo: call make_folds() here via apply_async(); currently this is not parallelized!
                    with tempfile.TemporaryDirectory(dir=job_config.local_working_directory) as working_dir:
                        input_dir, output_dir = initialize_input_output_dirs(working_dir)
                        # get the session feature and label data
                        download_train_test_data(job_config, raw_data_bucket, raw_data_dir, course, session, input_dir,
                                                 label_type=label_type)
                        feature_file = os.path.join(input_dir, course, session, make_feature_csv_name(course, session))
                        label_file = os.path.join(input_dir, course, session, make_label_csv_name(course, session))
                        feat_df = pd.read_csv(feature_file, dtype=object)
                        label_df = pd.read_csv(label_file, dtype=object)
                        # merge features and labels to ensure the splits line up
                        feat_label_df = pd.merge(feat_df, label_df, on=user_id_col)
                        assert feat_df.shape[0] == label_df.shape[0], \
                            "features and labels must contain same number of observations"
                        # create the folds
                        logger.info("creating cv splits with k = {} course {} session {}".format(k, course, session))
                        skf = StratifiedKFold(n_splits=k, shuffle=True)
                        folds = skf.split(np.zeros(feat_df.shape[0]), feat_label_df.label_value)
                        # write each fold's train/test data to csv and push to s3
                        for fold_num, train_test_indices in enumerate(folds, 1):
                            train_index, test_index = train_test_indices
                            train_df = feat_label_df.loc[train_index].drop(label_col, axis=1)
                            test_df = feat_label_df.loc[test_index].drop(label_col, axis=1)
                            train_df_name = os.path.join(working_dir,
                                                         make_feature_csv_name(course, session, fold_num, "train"))
                            test_df_name = os.path.join(working_dir,
                                                        make_feature_csv_name(course, session, fold_num, "test"))
                            train_df.to_csv(train_df_name, index=False)
                            test_df.to_csv(test_df_name, index=False)
                            # upload to s3
                            try:
                                train_key = make_s3_key_path(job_config, course, os.path.basename(train_df_name),
                                                             session)
                                upload_file_to_s3(train_df_name, job_config.proc_data_bucket, train_key, job_config,
                                                  remove_on_success=True)
                                test_key = make_s3_key_path(job_config, course, os.path.basename(test_df_name),
                                                            session)
                                upload_file_to_s3(test_df_name, job_config.proc_data_bucket, test_key, job_config,
                                                  remove_on_success=True)
                            except Exception as e:
                                logger.warning("exception occurred while uploading cv results: {}".format(e))
        pool.close()
        pool.join()
    return

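# Hedged toy version of the splitting logic above, runnable without any MORF
# infrastructure: stratify on a label column, then produce train/test frames with the
# label dropped, as create_session_folds() does per session. Column names and data are
# fabricated; the real job reads its features and labels from s3.
def _example_stratified_folds(k=2):
    feat_label_df = pd.DataFrame({
        "userID": ["u1", "u2", "u3", "u4"],
        "feature_1": [0.1, 0.2, 0.3, 0.4],
        "label_value": ["1", "0", "1", "0"],
    })
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    for fold_num, (train_index, test_index) in enumerate(skf.split(np.zeros(len(feat_label_df)),
                                                                   feat_label_df.label_value), 1):
        train_df = feat_label_df.loc[train_index].drop("label_value", axis=1)
        test_df = feat_label_df.loc[test_index].drop("label_value", axis=1)
        print("fold {}: {} train rows, {} test rows".format(fold_num, len(train_df), len(test_df)))
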
def run_morf_job(client_config_url, server_config_url, email_to=None, no_cache=False):
    """
    Wrapper function to run a complete MORF job.
    :param client_config_url: url to client.config file; should be located on the local machine.
    :param server_config_url: url (local or s3) to server.config file.
    :param email_to: optional email address; if provided, it overrides the address in the client config.
    :param no_cache: if True, do not cache the job files (docker image and controller script) in s3.
    :return: None
    """
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    config_filename = "config.properties"
    server_config_path = urlparse(server_config_url).path
    # read server.config and get its properties
    server_config = get_config_properties(server_config_path)
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(dir=server_config["local_working_directory"]) as working_dir:
        # save calling working directory; change directory into working_dir
        calling_dir = os.getcwd()
        os.chdir(working_dir)
        # download client.config into local_working_directory using AWS creds from server.config
        s3 = boto3.client("s3",
                          aws_access_key_id=server_config["aws_access_key_id"],
                          aws_secret_access_key=server_config["aws_secret_access_key"])
        fetch_file(s3, working_dir, client_config_url)
        local_client_config_path = os.path.join(os.getcwd(), "client.config")
        combine_config_files(server_config_path, local_client_config_path, outfile=config_filename)
        job_config = MorfJobConfig(config_filename)
        if email_to:
            # an email_to provided at submission overrides the address in the config file -- lets users easily run the MWE
            print("[INFO] email address from submission {} overriding email address in config file {}".format(
                email_to, job_config.email_to))
            job_config.email_to = email_to
            update_config_fields_in_section("client", email_to=email_to)
        cache_job_file_in_s3(s3, job_config.user_id, job_config.job_id, job_config.proc_data_bucket)
        # from client.config, fetch and download the following: docker image, controller script
        try:
            fetch_file(s3, working_dir, job_config.docker_url, dest_filename=docker_image_name)
            fetch_file(s3, working_dir, job_config.controller_url, dest_filename=controller_script_name)
            if not no_cache:
                # cache job files in s3 unless the no_cache parameter is set to True
                cache_job_file_in_s3(s3, job_config.user_id, job_config.job_id, job_config.proc_data_bucket,
                                     docker_image_name)
                cache_job_file_in_s3(s3, job_config.user_id, job_config.job_id, job_config.proc_data_bucket,
                                     controller_script_name)
        except KeyError as e:
            cause = e.args[0]
            print("[Error]: field {} missing from client.config file.".format(cause))
            sys.exit(-1)
        # run the controller script, sending notifications on initialization and completion
        job_config.update_status("INITIALIZED")
        send_email_alert(job_config)
        subprocess.call("python3 {}".format(controller_script_name), shell=True)
        job_config.update_status("SUCCESS")
        send_success_email(job_config)
    return

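# Hedged usage sketch: run_morf_job() is the entry point a job submission would call.
# The paths and address below are placeholders, not real MORF locations; client.config
# and server.config must already exist and contain the fields read above
# (local_working_directory, AWS credentials, docker_url, controller_url, etc.).
def _example_run_morf_job():
    run_morf_job(client_config_url="/path/to/client.config",
                 server_config_url="/path/to/server.config",
                 email_to="researcher@example.org",
                 no_cache=True)
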
def extract_session(labels=False, raw_data_dir="morf-data/", label_type="labels-train", multithread=True):
    """
    Extract features using the Docker image, building individual feature sets for each "session" or iteration of the course.
    :param labels: flag for whether this is a job to generate outcome labels; if so, the collected result file is copied back into the raw data folder in s3 (as labels-train.csv).
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param label_type: type of outcome label to use (string).
    :param multithread: whether to run job in parallel (multithread = False can be useful for debugging).
    :return: None
    """
    level = "session"
    mode = "extract"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode and set number of cores
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # for each bucket, call job_runner once per session with --mode=extract and --level=session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                              fetch_holdout_session_only=False):
                    poolres = pool.apply_async(run_image, [job_config, raw_data_bucket, course, session, level])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    if not labels:
        # normal feature extraction job; collect features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(job_config)
        upload_key = "{}/{}/extract/{}".format(job_config.user_id, job_config.job_id, result_file)
        upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    else:
        # label extraction job; copy the file into the raw course data dir instead of proc_data_bucket, creating a separate label file for each bucket
        for raw_data_bucket in job_config.raw_data_buckets:
            result_file = collect_session_results(job_config, raw_data_buckets=[raw_data_bucket])
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file, bucket=raw_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return

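# Hedged usage sketch: with the defaults, this runs one feature-extraction container per
# (course, session) pair and collects the per-session results into the processed-data
# bucket. multithread=False is shown only because single-core runs are easier to debug;
# the labels=True variant is illustrated after extract_holdout_session() below.
def _example_extract_session_features():
    extract_session(multithread=False)
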
import logging
import os
import tempfile
import re

import pandas as pd

# NOTE: this script also relies on MORF helpers (MorfJobConfig, set_logger_handlers,
# fetch_complete_courses, fetch_sessions, fetch_raw_course_data, load_docker_image,
# make_docker_run_command) being importable from the MORF package.

GENDER_CSV_FP = os.path.join(os.getcwd(), "data/names_for_josh.csv")  # docker doesn't like relative file paths
GENDER_VALUES_TO_KEEP = ("male", "female")
MORF_DATA_DIR = "morf-data/"
MYSQL_DOCKER_DIR = os.path.join(os.getcwd(), "docker")
MYSQL_DOCKER_IMG_NAME = "mysql-docker.tar"
OUTPUT_DIR = os.path.join(os.getcwd(), "data/hash-mapping-exports")
OUTPUT_FILENAME = "coursera_user_hash_gender_lookup.csv"
GENDER_COL_NAME = "gender"

module_logger = logging.getLogger(__name__)
job_config = MorfJobConfig("config.properties")
logger = set_logger_handlers(module_logger, job_config)

for raw_data_bucket in job_config.raw_data_buckets:
    for course in fetch_complete_courses(job_config, raw_data_bucket):
        for session in fetch_sessions(job_config, raw_data_bucket, data_dir=MORF_DATA_DIR, course=course,
                                      fetch_all_sessions=True):
            with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                print("[INFO] processing course {} session {}".format(course, session))
                # download the data exports
                fetch_raw_course_data(job_config, raw_data_bucket, course, session, input_dir=working_dir)
                # download_raw_course_data(job_config, raw_data_bucket, course=course, session=session,
                #                          input_dir=working_dir, data_dir=MORF_DATA_DIR[:-1])  # drop trailing slash on data dir
                # create docker run command and load image
                image_uuid = load_docker_image(MYSQL_DOCKER_DIR, job_config, logger, image_name=MYSQL_DOCKER_IMG_NAME)
                cmd = make_docker_run_command(job_config.docker_exec, working_dir, OUTPUT_DIR, image_uuid,
def fork_features(job_id_to_fork, raw_data_dir="morf-data/"):
    """
    Copy features from job_id_to_fork into the current job_id.
    :param job_id_to_fork: string, name of the job_id to copy from (must be from the same user).
    :param raw_data_dir: name of subfolder in s3 buckets containing raw data.
    :return: None.
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # todo: multithread this
    for mode in ["extract", "extract-holdout"]:
        job_config.update_mode(mode)
        clear_s3_subdirectory(job_config)
        for raw_data_bucket in job_config.raw_data_buckets:
            print("[INFO] forking features from bucket {} mode {}".format(raw_data_bucket, mode))
            courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
            for course in courses:
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                              fetch_holdout_session_only=mode == "extract-holdout"):
                    # get current location of the file, under the old job_id
                    prev_job_archive_filename = generate_archive_filename(job_config, course=course, session=session,
                                                                          mode=mode, job_id=job_id_to_fork)
                    # get location of the previous archive file in s3
                    prev_job_key = make_s3_key_path(job_config, filename=prev_job_archive_filename, course=course,
                                                    session=session, mode=mode, job_id=job_id_to_fork)
                    prev_job_s3_url = "s3://{}/{}".format(job_config.proc_data_bucket, prev_job_key)
                    # build the new location of the file, under the new job_id
                    current_job_archive_filename = generate_archive_filename(job_config, course=course,
                                                                             session=session, mode=mode)
                    current_job_key = make_s3_key_path(job_config, filename=current_job_archive_filename,
                                                       course=course, session=session, mode=mode)
                    current_job_s3_url = "s3://{}/{}".format(job_config.proc_data_bucket, current_job_key)
                    # copy from the previous location to the new location
                    copy_s3_file(job_config, sourceloc=prev_job_s3_url, destloc=current_job_s3_url)
        # after copying individual extraction results, copy the collected feature file
        result_file = collect_session_results(job_config, holdout=mode == "extract-holdout")
        upload_key = "{}/{}/{}/{}".format(job_config.user_id, job_config.job_id, job_config.mode, result_file)
        upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    return

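# Hedged usage sketch: forking lets a new job reuse features already extracted by one of
# the same user's previous jobs instead of re-running extraction, after which training
# and testing can proceed as usual. The job id and "dropout" label type below are
# placeholders.
def _example_fork_features_usage():
    fork_features("previous-job-id")
    train_all(label_type="dropout")
    test_all(label_type="dropout")
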
def extract_holdout_session(labels=False, raw_data_dir="morf-data/", label_type="labels-train", multithread=True):
    """
    Extract features using the Docker image across each session of holdout data.
    :param labels: flag for whether this is a job to generate outcome labels; if so, the collected result file is copied back into the raw data folder in s3 (as labels-test.csv).
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located.
    :param label_type: type of outcome label to use (string).
    :param multithread: whether to run job in parallel.
    :return: None
    """
    mode = "extract-holdout"
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # call job_runner once per session with --mode=extract-holdout and --level=session
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                # only the holdout session is used; take the single element out of the returned list
                holdout_session = fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                                 fetch_holdout_session_only=True)[0]
                poolres = pool.apply_async(run_image, [job_config, raw_data_bucket, course, holdout_session, level])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    if not labels:
        # normal feature extraction job; collect features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(job_config, holdout=True)
        upload_key = "{}/{}/{}/{}".format(job_config.user_id, job_config.job_id, job_config.mode, result_file)
        upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    else:
        # label extraction job; copy the file into the raw course data dir instead of proc_data_bucket, creating a separate label file for each bucket
        for raw_data_bucket in job_config.raw_data_buckets:
            result_file = collect_session_results(job_config, raw_data_buckets=[raw_data_bucket])
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file, bucket=raw_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return

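# Hedged sketch of a label-generation pass, based on the docstrings above: running the
# session-level and holdout extractors with labels=True writes labels-train.csv and
# labels-test.csv back into each raw data bucket, where later train/test jobs expect to
# find them. The pairing of label_type values is illustrative.
def _example_label_generation():
    extract_session(labels=True, label_type="labels-train")
    extract_holdout_session(labels=True, label_type="labels-test")
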