import logging
import os
import tempfile
from multiprocessing import Pool

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# The MORF helpers referenced below (MorfJobConfig, set_logger_handlers,
# clear_s3_subdirectory, fetch_complete_courses, fetch_sessions, make_folds,
# execute_image_for_cv, the S3 upload/download utilities, etc.) are assumed to be
# importable from the surrounding MORF package; their exact module paths are not
# shown in the original listing.

CONFIG_FILENAME = "config.properties"  # assumed module-level constant; matches the literal used in the script below
mode = "cv"  # assumed module-level constant; the original listing passes ``mode`` to job_config.update_mode() without defining it
module_logger = logging.getLogger(__name__)


def create_course_folds(label_type, k=5, multithread=True):
    """
    From extract and extract-holdout data, create k randomized folds, pooling data by course
    (across sessions), and archive the results to S3.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available).
    :return: None
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("creating cross-validation folds")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                poolres = pool.apply_async(make_folds,
                                           [job_config, raw_data_bucket, course, k, label_type])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    return
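
# Note on the pattern above: the AsyncResult objects collected in ``reslist`` are
# drained with .get() after the pool is joined, which both logs each worker's
# return value and re-raises any exception that occurred inside a worker.
# A minimal self-contained demo of the same pattern (toy worker and data, not
# part of MORF):
def _square(x):  # toy worker used only by the demo below
    return x * x


def _apply_async_pattern_demo():
    results = []
    with Pool(2) as pool:
        for i in range(4):
            results.append(pool.apply_async(_square, [i]))
        pool.close()
        pool.join()
    # .get() raises here if a worker failed, instead of failing silently
    return [r.get() for r in results]  # -> [0, 1, 4, 9]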
def cross_validate_course(label_type, k=5, multithread=True):
    """
    Compute k-fold cross-validation across courses.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available).
    :return: None
    """
    # todo: call to create_course_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    # clear previous test results
    clear_s3_subdirectory(job_config, mode="test")
    docker_image_dir = os.getcwd()  # directory the function is called from; should contain the docker image
    logger = set_logger_handlers(module_logger, job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("conducting cross validation")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for fold_num in range(1, k + 1):
                    poolres = pool.apply_async(
                        execute_image_for_cv,
                        [job_config, raw_data_bucket, course, fold_num, docker_image_dir, label_type])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    test_csv_fp = collect_course_cv_results(job_config)
    pred_key = make_s3_key_path(job_config, os.path.basename(test_csv_fp), mode="test")
    upload_file_to_s3(test_csv_fp, job_config.proc_data_bucket, pred_key, job_config,
                      remove_on_success=True)
    return
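
# A typical MORF driver script would chain the two steps above: build the folds,
# then cross-validate against them. Hypothetical usage (the function names are the
# ones defined in this module; "dropout" is an illustrative label_type, and valid
# values depend on the MORF job configuration):
#
#     create_course_folds(label_type="dropout", k=5, multithread=True)
#     cross_validate_course(label_type="dropout", k=5, multithread=True)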
def cross_validate_session(label_type, k=5, multithread=True, raw_data_dir="morf-data/"):
    """
    Compute k-fold cross-validation across sessions.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available).
    :param raw_data_dir: name of subfolder in s3 buckets containing raw data.
    :return: None
    """
    raise NotImplementedError  # this function is not implemented yet; the code below is a work in progress
    # todo: call to create_session_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    # clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("conducting cross validation")
    # note: nothing is dispatched to the pool yet; parallelization is pending along with the TODOs below
    with Pool(num_cores) as pool:
        for raw_data_bucket in job_config.raw_data_buckets:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config, raw_data_bucket, data_dir=raw_data_dir,
                                              course=course, fetch_all_sessions=True):
                    for fold_num in range(1, k + 1):
                        with tempfile.TemporaryDirectory(dir=job_config.local_working_directory) as working_dir:
                            # get fold train/test data
                            input_dir, output_dir = initialize_input_output_dirs(working_dir)
                            session_input_dir = os.path.join(input_dir, course, session)
                            session_output_dir = os.path.join(output_dir, course, session)
                            trainkey = make_s3_key_path(job_config, course,
                                                        make_feature_csv_name(course, session, fold_num, "train"),
                                                        session)
                            train_data_path = download_from_s3(job_config.proc_data_bucket, trainkey,
                                                               job_config.initialize_s3(),
                                                               dir=session_input_dir, job_config=job_config)
                            testkey = make_s3_key_path(job_config, course,
                                                       make_feature_csv_name(course, session, fold_num, "test"),
                                                       session)
                            test_data_path = download_from_s3(job_config.proc_data_bucket, testkey,
                                                              job_config.initialize_s3(),
                                                              dir=session_input_dir, job_config=job_config)
                            # get labels
                            initialize_labels(job_config, raw_data_bucket, course, session, label_type,
                                              session_input_dir, raw_data_dir)
                            # todo: run docker image with mode == cv
                            # todo: upload results
        pool.close()
        pool.join()
    return
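
# The two TODOs above would presumably mirror the per-course path: run the job's
# docker image over the fold's train/test split, then push the output to S3. A
# hypothetical sketch using helpers that appear elsewhere in this listing
# (load_docker_image, make_docker_run_command, execute_and_log_output); the
# argument details are assumptions, not MORF's actual call signatures here:
#
#     image_uuid = load_docker_image(docker_image_dir, job_config, logger)
#     cmd = make_docker_run_command(job_config.docker_exec, session_input_dir,
#                                   session_output_dir, image_uuid, course=course,
#                                   session=session, mode="cv", client_args=None)
#     execute_and_log_output(cmd, logger)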
def create_session_folds(label_type, k=5, multithread=True, raw_data_dir="morf-data/"):
    """
    From extract and extract-holdout data, create k randomized folds for each session and
    archive the results to S3.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available).
    :param raw_data_dir: name of subfolder in s3 buckets containing raw data.
    :return: None
    """
    user_id_col = "userID"
    label_col = "label_value"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("creating cross-validation folds")
    # todo: call make_folds() here via apply_async(); currently this is not parallelized!
    with Pool(num_cores) as pool:
        for raw_data_bucket in job_config.raw_data_buckets:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config, raw_data_bucket, data_dir=raw_data_dir,
                                              course=course, fetch_all_sessions=True):
                    with tempfile.TemporaryDirectory(dir=job_config.local_working_directory) as working_dir:
                        input_dir, output_dir = initialize_input_output_dirs(working_dir)
                        # get the session feature and label data
                        download_train_test_data(job_config, raw_data_bucket, raw_data_dir, course,
                                                 session, input_dir, label_type=label_type)
                        feature_file = os.path.join(input_dir, course, session,
                                                    make_feature_csv_name(course, session))
                        label_file = os.path.join(input_dir, course, session,
                                                  make_label_csv_name(course, session))
                        feat_df = pd.read_csv(feature_file, dtype=object)
                        label_df = pd.read_csv(label_file, dtype=object)
                        # merge features and labels so each user's label travels with their features in the splits
                        feat_label_df = pd.merge(feat_df, label_df, on=user_id_col)
                        assert feat_df.shape[0] == label_df.shape[0], \
                            "features and labels must contain same number of observations"
                        # create the folds
                        logger.info("creating cv splits with k = {} course {} session {}"
                                    .format(k, course, session))
                        skf = StratifiedKFold(n_splits=k, shuffle=True)
                        folds = skf.split(np.zeros(feat_df.shape[0]), feat_label_df[label_col])
                        # write each fold's train/test data to csv and push to s3
                        for fold_num, (train_index, test_index) in enumerate(folds, 1):
                            train_df = feat_label_df.loc[train_index].drop(label_col, axis=1)
                            test_df = feat_label_df.loc[test_index].drop(label_col, axis=1)
                            train_df_name = os.path.join(
                                working_dir, make_feature_csv_name(course, session, fold_num, "train"))
                            test_df_name = os.path.join(
                                working_dir, make_feature_csv_name(course, session, fold_num, "test"))
                            train_df.to_csv(train_df_name, index=False)
                            test_df.to_csv(test_df_name, index=False)
                            # upload to s3
                            try:
                                train_key = make_s3_key_path(job_config, course,
                                                             os.path.basename(train_df_name), session)
                                upload_file_to_s3(train_df_name, job_config.proc_data_bucket, train_key,
                                                  job_config, remove_on_success=True)
                                test_key = make_s3_key_path(job_config, course,
                                                            os.path.basename(test_df_name), session)
                                upload_file_to_s3(test_df_name, job_config.proc_data_bucket, test_key,
                                                  job_config, remove_on_success=True)
                            except Exception as e:
                                logger.warning("exception occurred while uploading cv results: {}".format(e))
        pool.close()
        pool.join()
    return
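
# StratifiedKFold is used above (rather than plain KFold) so that every train/test
# split preserves the session's label distribution. A self-contained toy example of
# the same call pattern, with synthetic labels (not MORF data); random_state is
# pinned here only to make the demo reproducible:
def _stratified_folds_demo():
    labels = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
    # as in create_session_folds(), only the labels drive the split, so a dummy
    # X of zeros is passed; fold numbering starts at 1 via enumerate(..., 1)
    for fold_num, (train_index, test_index) in enumerate(
            skf.split(np.zeros(len(labels)), labels), 1):
        print(fold_num, train_index, test_index)  # each test fold holds two 0s and two 1s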
# Standalone utility script (separate from the module above, as it redefines
# module_logger and builds its own job_config at import time): export a Coursera
# user-hash -> gender lookup by running a MySQL docker image over each course
# session's raw data export. Assumed imports for this script: logging, os,
# tempfile, plus the MORF helpers used below.

GENDER_CSV_FP = os.path.join(os.getcwd(), "data/names_for_josh.csv")  # docker doesn't like relative file paths
GENDER_VALUES_TO_KEEP = ("male", "female")
MORF_DATA_DIR = "morf-data/"
MYSQL_DOCKER_DIR = os.path.join(os.getcwd(), "docker")
MYSQL_DOCKER_IMG_NAME = "mysql-docker.tar"
OUTPUT_DIR = os.path.join(os.getcwd(), "data/hash-mapping-exports")
OUTPUT_FILENAME = "coursera_user_hash_gender_lookup.csv"
GENDER_COL_NAME = "gender"

module_logger = logging.getLogger(__name__)
job_config = MorfJobConfig("config.properties")
logger = set_logger_handlers(module_logger, job_config)

for raw_data_bucket in job_config.raw_data_buckets:
    for course in fetch_complete_courses(job_config, raw_data_bucket):
        for session in fetch_sessions(job_config, raw_data_bucket, data_dir=MORF_DATA_DIR,
                                      course=course, fetch_all_sessions=True):
            with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                print("[INFO] processing course {} session {}".format(course, session))
                # download the data exports
                fetch_raw_course_data(job_config, raw_data_bucket, course, session, input_dir=working_dir)
                # download_raw_course_data(job_config, raw_data_bucket, course=course, session=session,
                #                          input_dir=working_dir,
                #                          data_dir=MORF_DATA_DIR[:-1])  # drop trailing slash on data dir
                # load the mysql docker image and build the run command
                image_uuid = load_docker_image(MYSQL_DOCKER_DIR, job_config, logger,
                                               image_name=MYSQL_DOCKER_IMG_NAME)
                cmd = make_docker_run_command(job_config.docker_exec, working_dir, OUTPUT_DIR, image_uuid,
                                              course=course, session=session, mode=None, client_args=None)
                # run the docker image, passing params for course and session
                execute_and_log_output(cmd, logger)
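
# make_docker_run_command is not shown in this listing; conceptually it builds a
# ``docker run`` invocation that mounts the working and output directories and
# passes the course and session through to the container. A hypothetical shape of
# the command it might emit (mount points and flag names are assumptions, not
# MORF's actual API):
#
#     docker run --rm \
#         -v <working_dir>:/input \
#         -v <OUTPUT_DIR>:/output \
#         <image_uuid> --course <course> --session <session>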