Example 1
def create_course_folds(label_type, k=5, multithread=True):
    """
    From extract and extract-holdout data, create k randomized folds, pooling data by course (across sessions) and archive results to s3.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available).
    :return:
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)  # "mode" comes from module scope in the original source file
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("creating cross-validation folds")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                poolres = pool.apply_async(
                    make_folds,
                    [job_config, raw_data_bucket, course, k, label_type])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    return
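
The fan-out pattern above (one apply_async() task per course, then close/join the pool and call get() on each result so worker exceptions surface in the parent) is reused throughout these examples. A minimal self-contained sketch of just that pattern, with a toy work function and hypothetical course names standing in for make_folds() and the real bucket contents:

from multiprocessing import Pool


def do_work(course):
    # stand-in for make_folds(); returns a status string for the parent to log
    return "folds created for {}".format(course)


if __name__ == "__main__":
    courses = ["accounting-001", "biology-002", "chemistry-003"]  # hypothetical course ids
    reslist = []
    with Pool(2) as pool:
        for course in courses:
            reslist.append(pool.apply_async(do_work, [course]))
        pool.close()
        pool.join()
    for res in reslist:
        print(res.get())  # re-raises here if the worker raised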
Example 2
def cross_validate_course(label_type, k=5, multithread=True):
    """
    Compute k-fold cross-validation across courses.
    :return:
    """
    # todo: call to create_course_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    # clear previous test results
    clear_s3_subdirectory(job_config, mode="test")
    docker_image_dir = os.getcwd()  # directory the function is called from; should contain the docker image
    logger = set_logger_handlers(module_logger, job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("conducting cross validation")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for fold_num in range(1, k + 1):
                    poolres = pool.apply_async(execute_image_for_cv, [
                        job_config, raw_data_bucket, course, fold_num,
                        docker_image_dir, label_type
                    ])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    test_csv_fp = collect_course_cv_results(job_config)
    pred_key = make_s3_key_path(job_config,
                                os.path.basename(test_csv_fp),
                                mode="test")
    upload_file_to_s3(test_csv_fp,
                      job_config.proc_data_bucket,
                      pred_key,
                      job_config,
                      remove_on_success=True)
    return
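
Taken together, Examples 1 and 2 form a two-step workflow: build the per-course fold CSVs on s3, then run the docker image once per course and fold and collect the predictions. A minimal driver sketch; the "dropout" label value and the fold count are assumptions, and the imports of the two functions are omitted because the module path is not shown above:

# hypothetical driver script; label value and fold count are assumptions
LABEL_TYPE = "dropout"

if __name__ == "__main__":
    create_course_folds(LABEL_TYPE, k=5, multithread=True)    # Example 1: create and archive the fold CSVs
    cross_validate_course(LABEL_TYPE, k=5, multithread=True)  # Example 2: run the docker image per course/fold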
Example 3
def cross_validate_session(label_type,
                           k=5,
                           multithread=True,
                           raw_data_dir="morf-data/"):
    """
    Compute k-fold cross-validation across sessions.
    :return:
    """
    raise NotImplementedError  # session-level cross-validation is not implemented yet; the code below is a sketch
    # todo: call to create_session_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    # clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("conducting cross validation")
    with Pool(num_cores) as pool:
        for raw_data_bucket in job_config.raw_data_buckets:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config,
                                              raw_data_bucket,
                                              data_dir=raw_data_dir,
                                              course=course,
                                              fetch_all_sessions=True):
                    for fold_num in range(1, k + 1):
                        with tempfile.TemporaryDirectory(
                                dir=job_config.local_working_directory
                        ) as working_dir:
                            # get fold train data
                            input_dir, output_dir = initialize_input_output_dirs(
                                working_dir)
                            session_input_dir = os.path.join(
                                input_dir, course, session)
                            session_output_dir = os.path.join(
                                output_dir, course, session)
                            trainkey = make_s3_key_path(
                                job_config, course,
                                make_feature_csv_name(course, session,
                                                      fold_num, "train"),
                                session)
                            train_data_path = download_from_s3(
                                job_config.proc_data_bucket,
                                trainkey,
                                job_config.initialize_s3(),
                                dir=session_input_dir,
                                job_config=job_config)
                            testkey = make_s3_key_path(
                                job_config, course,
                                make_feature_csv_name(course, session,
                                                      fold_num, "test"),
                                session)
                            test_data_path = download_from_s3(
                                job_config.proc_data_bucket,
                                testkey,
                                job_config.initialize_s3(),
                                dir=session_input_dir,
                                job_config=job_config)
                            # get labels
                            initialize_labels(job_config, raw_data_bucket,
                                              course, session, label_type,
                                              session_input_dir, raw_data_dir)
                            # run docker image with mode == cv
                            #todo
                            # upload results
                            #todo
        pool.close()
        pool.join()
    return
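
Example 3 is an explicit stub (note the raise at the top), but the scratch-directory layout it sets up per course/session/fold stands on its own: a fresh temporary directory containing input and output subtrees keyed by course and session. A minimal standard-library sketch of that layout; the "input"/"output" directory names are assumptions mirroring what initialize_input_output_dirs() appears to produce:

import os
import tempfile


def prepare_fold_dirs(base_dir, course, session):
    # scratch layout: <tmp>/input/<course>/<session> and <tmp>/output/<course>/<session>
    working_dir = tempfile.mkdtemp(dir=base_dir)
    session_input_dir = os.path.join(working_dir, "input", course, session)
    session_output_dir = os.path.join(working_dir, "output", course, session)
    os.makedirs(session_input_dir)
    os.makedirs(session_output_dir)
    return working_dir, session_input_dir, session_output_dir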
Example 4
def create_session_folds(label_type,
                         k=5,
                         multithread=True,
                         raw_data_dir="morf-data/"):
    """
    From extract and extract-holdout data, create k randomized folds for each session and archive results to s3.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available)
    :param raw_data_dir: name of subfolder in s3 buckets containing raw data.
    :return:
    """
    user_id_col = "userID"
    label_col = "label_value"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("creating cross-validation folds")
    with Pool(num_cores) as pool:
        for raw_data_bucket in job_config.raw_data_buckets:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config,
                                              raw_data_bucket,
                                              data_dir=raw_data_dir,
                                              course=course,
                                              fetch_all_sessions=True):
                    with tempfile.TemporaryDirectory(
                            dir=job_config.local_working_directory
                    ) as working_dir:
                        # todo: call make_folds() here via apply_async(); currently this is not parallelized!
                        input_dir, output_dir = initialize_input_output_dirs(
                            working_dir)
                        # get the session feature and label data
                        download_train_test_data(job_config,
                                                 raw_data_bucket,
                                                 raw_data_dir,
                                                 course,
                                                 session,
                                                 input_dir,
                                                 label_type=label_type)
                        feature_file = os.path.join(
                            input_dir, course, session,
                            make_feature_csv_name(course, session))
                        label_file = os.path.join(
                            input_dir, course, session,
                            make_label_csv_name(course, session))
                        feat_df = pd.read_csv(feature_file, dtype=object)
                        label_df = pd.read_csv(label_file, dtype=object)
                        # merge features to ensure splits are correct
                        feat_label_df = pd.merge(feat_df,
                                                 label_df,
                                                 on=user_id_col)
                        assert feat_df.shape[0] == label_df.shape[0], \
                            "features and labels must contain same number of observations"
                        # create the folds
                        logger.info("creating cv splits with k = {} for course {} session {}"
                                    .format(k, course, session))
                        skf = StratifiedKFold(n_splits=k, shuffle=True)
                        folds = skf.split(np.zeros(feat_df.shape[0]),
                                          feat_label_df.label_value)
                        # write each fold's train/test data to csv and push to s3
                        for fold_num, (train_index, test_index) in enumerate(folds, 1):
                            train_df = feat_label_df.loc[train_index].drop(label_col, axis=1)
                            test_df = feat_label_df.loc[test_index].drop(label_col, axis=1)
                            train_df_name = os.path.join(
                                working_dir,
                                make_feature_csv_name(course, session,
                                                      fold_num, "train"))
                            test_df_name = os.path.join(
                                working_dir,
                                make_feature_csv_name(course, session,
                                                      fold_num, "test"))
                            train_df.to_csv(train_df_name, index=False)
                            test_df.to_csv(test_df_name, index=False)
                            # upload to s3
                            try:
                                train_key = make_s3_key_path(
                                    job_config, course,
                                    os.path.basename(train_df_name), session)
                                upload_file_to_s3(train_df_name,
                                                  job_config.proc_data_bucket,
                                                  train_key,
                                                  job_config,
                                                  remove_on_success=True)
                                test_key = make_s3_key_path(
                                    job_config, course,
                                    os.path.basename(test_df_name), session)
                                upload_file_to_s3(test_df_name,
                                                  job_config.proc_data_bucket,
                                                  test_key,
                                                  job_config,
                                                  remove_on_success=True)
                            except Exception as e:
                                logger.warning(
                                    "exception occurred while uploading cv results: {}"
                                    .format(e))
        pool.close()
        pool.join()
    return
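
The heart of Example 4 is the stratified k-fold split of the merged feature/label table and the per-fold train/test CSVs. A minimal self-contained sketch of just that step, using a toy data frame whose column names mirror user_id_col and label_col above:

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# toy merged feature/label frame; ten users, two balanced classes
feat_label_df = pd.DataFrame({
    "userID": ["u{}".format(i) for i in range(10)],
    "feature_1": np.random.rand(10),
    "label_value": [0, 1] * 5,
})

skf = StratifiedKFold(n_splits=5, shuffle=True)
folds = skf.split(np.zeros(len(feat_label_df)), feat_label_df.label_value)
for fold_num, (train_index, test_index) in enumerate(folds, 1):
    # drop the label column so the written CSVs contain features only, as above
    train_df = feat_label_df.iloc[train_index].drop("label_value", axis=1)
    test_df = feat_label_df.iloc[test_index].drop("label_value", axis=1)
    train_df.to_csv("fold_{}_train.csv".format(fold_num), index=False)
    test_df.to_csv("fold_{}_test.csv".format(fold_num), index=False)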
GENDER_CSV_FP = os.path.join(os.getcwd(), "data/names_for_josh.csv")  # docker doesn't like relative file paths
GENDER_VALUES_TO_KEEP = ("male", "female")
MORF_DATA_DIR = "morf-data/"
MYSQL_DOCKER_DIR = os.path.join(os.getcwd(), "docker")
MYSQL_DOCKER_IMG_NAME = "mysql-docker.tar"
OUTPUT_DIR = os.path.join(os.getcwd(), "data/hash-mapping-exports")
OUTPUT_FILENAME = "coursera_user_hash_gender_lookup.csv"
GENDER_COL_NAME = "gender"

module_logger = logging.getLogger(__name__)
job_config = MorfJobConfig("config.properties")
logger = set_logger_handlers(module_logger, job_config)

for raw_data_bucket in job_config.raw_data_buckets:
    for course in fetch_complete_courses(job_config, raw_data_bucket):
        for session in fetch_sessions(job_config, raw_data_bucket, data_dir=MORF_DATA_DIR, course=course,
                                      fetch_all_sessions=True):
            with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                print("[INFO] processing course {} session {}".format(course, session))
                # download the data exports
                fetch_raw_course_data(job_config, raw_data_bucket, course, session, input_dir=working_dir)
                # download_raw_course_data(job_config, raw_data_bucket, course=course, session=session, input_dir=working_dir,
                #                          data_dir=MORF_DATA_DIR[:-1]) # drop trailing slash on data dir
                # create docker run command and load image
                image_uuid = load_docker_image(MYSQL_DOCKER_DIR, job_config, logger, image_name=MYSQL_DOCKER_IMG_NAME)
                cmd = make_docker_run_command(job_config.docker_exec, working_dir, OUTPUT_DIR, image_uuid,
                                              course=course, session=session, mode=None,
                                              client_args=None)
                # run the docker image, make sure to pass params for course and session
                execute_and_log_output(cmd, logger)
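
load_docker_image() and make_docker_run_command() are MORF helpers; at bottom they wrap the ordinary docker CLI (docker load -i to import the tar, then docker run with the course exports mounted in and an output directory mounted out). A hedged sketch of the equivalent subprocess calls; the mount points and the --course/--session arguments passed to the image are assumptions about the image's entrypoint, not MORF's actual command:

import subprocess


def run_mysql_export_image(docker_exec, image_tar, working_dir, output_dir, course, session):
    # import the image from its tar archive; docker prints "Loaded image: <ref>", keep the last token
    load_out = subprocess.run([docker_exec, "load", "-i", image_tar],
                              capture_output=True, text=True, check=True)
    image_ref = load_out.stdout.strip().split()[-1]
    # run the image with the raw exports and an output mount (paths inside the container are assumed)
    cmd = [docker_exec, "run", "--rm",
           "-v", "{}:/input".format(working_dir),
           "-v", "{}:/output".format(output_dir),
           image_ref, "--course", course, "--session", session]
    subprocess.run(cmd, check=True)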