Esempio n. 1
0
def create_course_folds(label_type, k=5, multithread=True):
    """
    Create k cross-validation folds per course by dispatching make_folds to a worker pool.

    For every raw data bucket in the job configuration, one make_folds task is submitted
    per course returned by fetch_complete_courses; each task's result is logged once
    the pool has finished.
    :param label_type: type of outcome label to use; forwarded to make_folds.
    :param k: number of folds; forwarded to make_folds.
    :param multithread: logical indicating whether multiple cores should be used; when False a single worker is used.
    :return: None.
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # ``mode`` is not assigned in this function; it is read from the enclosing module scope
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # wipe whatever a previous run left behind for this user/job/mode
    clear_s3_subdirectory(job_config)
    num_cores = job_config.max_num_cores if multithread else 1
    logger.info("creating cross-validation folds")
    for raw_data_bucket in job_config.raw_data_buckets:
        with Pool(num_cores) as pool:
            pending = [
                pool.apply_async(
                    make_folds,
                    [job_config, raw_data_bucket, course, k, label_type])
                for course in fetch_complete_courses(job_config, raw_data_bucket)
            ]
            # no further submissions; block until every task has completed
            pool.close()
            pool.join()
        for task in pending:
            logger.info(task.get())
Esempio n. 2
0
def test_all(label_type):
    """
    Test a single overall model on the entire dataset using the Docker image.

    After the image has run, the collected result file is uploaded to the processed-data
    bucket under an archive-style csv key, the local copy is removed, and an email
    alert is sent.
    :param label_type: label type provided by user; validated with check_label_type.
    :return: None.
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # ``mode`` is not assigned in this function; it is read from the enclosing module scope
    job_config.update_mode(mode)
    check_label_type(label_type)
    # wipe whatever a previous run left behind for this user/job/mode
    clear_s3_subdirectory(job_config)
    run_image(job_config, job_config.raw_data_buckets, level="all", label_type=label_type)
    # push the csv result back to s3 so it looks like the session- and course-level workflows
    result_file = collect_all_results(job_config)
    archive_name = generate_archive_filename(job_config, extension="csv")
    upload_key = make_s3_key_path(job_config, filename=archive_name)
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
Esempio n. 3
0
def evaluate_cv_course(label_type, k=5, label_col = "label_type", raw_data_dir = "morf-data/",
                    course_col = "course", fold_col = "fold_num", pred_cols = ("prob", "pred"),
                    user_col = "userID"):
    """
    Fetch metrics by first averaging over folds within course, then returning results by course.

    The per-course averages are collected into one table, the course column is passed
    through hash_df_column, and the table is written to csv and uploaded to the
    processed-data bucket under mode "test".
    :param label_type: label type defined by user; validated with check_label_type.
    :param k: number of folds; metrics are computed for fold numbers 1 through k.
    :param label_col: column containing labels (kept for interface compatibility; not read by this function).
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :param course_col: column containing course identifier.
    :param fold_col: column of the merged label/prediction data holding the fold number.
    :param pred_cols: user-supplied prediction columns; handed to check_dataframe_complete as a list.
    :param user_col: column containing user ID for predictions.
    :return: None.
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # ``mode`` is not assigned in this function; it is read from the enclosing module scope
    job_config.update_mode(mode)
    check_label_type(label_type)
    raw_data_buckets = job_config.raw_data_buckets
    proc_data_bucket = job_config.proc_data_bucket
    s3 = job_config.initialize_s3()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    course_data = []
    for raw_data_bucket in raw_data_buckets:
        pred_file = generate_archive_filename(job_config, mode="test", extension="csv")
        pred_key = make_s3_key_path(job_config, pred_file, mode="test")
        # download course prediction and label files, fetch classification metrics at course level
        with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
            pred_csv = download_from_s3(proc_data_bucket, pred_key, s3, working_dir, job_config=job_config)
            job_config.update_mode("cv") # set mode to cv to fetch correct labels for sessions even if they are train/test sessions
            label_csv = initialize_labels(job_config, raw_data_bucket, None, None, label_type, working_dir, raw_data_dir, level="all")
            pred_df = pd.read_csv(pred_csv)
            lab_df = pd.read_csv(label_csv, dtype=object)
            # left join keeps every labelled user, so missing predictions surface as NaN
            pred_lab_df = pd.merge(lab_df, pred_df, how = "left", on = [user_col, course_col])
            check_dataframe_complete(pred_lab_df, job_config, columns = list(pred_cols))
            for course in fetch_complete_courses(job_config, data_bucket = raw_data_bucket, data_dir = raw_data_dir, n_train=1):
                fold_metrics_list = list()
                for fold_num in range(1, k+1):
                    fold_metrics_df = fetch_binary_classification_metrics(job_config, pred_lab_df[pred_lab_df[fold_col] == fold_num], course)
                    fold_metrics_list.append(fold_metrics_df)
                assert len(fold_metrics_list) == k, "something is wrong; number of folds doesn't match. Try running job again from scratch."
                # average each metric over the k folds, then tag the result with its course
                course_metrics_df = pd.concat(fold_metrics_list).mean()
                course_metrics_df[course_col] = course
                course_data.append(course_metrics_df)
    # restore the mode that was active before the label fetch switched it to "cv"
    job_config.update_mode(mode)
    master_metrics_df = pd.concat(course_data, axis = 1).T
    # reorder dataframe so course name is first
    cols = list(master_metrics_df)
    # move the column to head of list using index, pop and insert
    cols.insert(0, cols.pop(cols.index(course_col)))
    # DataFrame.ix was removed in pandas 1.0; .loc performs the same label-based column selection
    master_metrics_df = master_metrics_df.loc[:, cols]
    csv_fp = generate_archive_filename(job_config, extension="csv")
    master_metrics_df[course_col] = hash_df_column(master_metrics_df[course_col], job_config.user_id, job_config.hash_secret)
    master_metrics_df.to_csv(csv_fp, index = False, header = True)
    upload_key = make_s3_key_path(job_config, mode = "test", filename=csv_fp)
    upload_file_to_s3(csv_fp, bucket=proc_data_bucket, key=upload_key)
    os.remove(csv_fp)
    return
Esempio n. 4
0
def evaluate_course(label_type, label_col = "label_type", raw_data_dir = "morf-data/",
                    course_col = "course", pred_cols = ("prob", "pred"),
                    user_col = "userID", labels_file = "labels-test.csv"):
    """
    Fetch metrics by course.

    The per-course metrics are concatenated into one table, the course column is passed
    through hash_df_column, and the table is written to csv and uploaded to the
    processed-data bucket under mode "test".
    :param label_type: label type defined by user; only label rows whose label_col equals this value are kept.
    :param label_col: column of the labels file that names the label type.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories and the labels file.
    :param course_col: column containing course identifier.
    :param pred_cols: user-supplied prediction columns; handed to check_dataframe_complete as a list.
    :param user_col: column containing user ID for predictions.
    :param labels_file: name of csv file containing labels.
    :return: None.
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # ``mode`` is not assigned in this function; it is read from the enclosing module scope
    job_config.update_mode(mode)
    check_label_type(label_type)
    raw_data_buckets = job_config.raw_data_buckets
    proc_data_bucket = job_config.proc_data_bucket
    s3 = job_config.initialize_s3()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    course_data = []
    for raw_data_bucket in raw_data_buckets:
        pred_file = generate_archive_filename(job_config, mode="test", extension="csv")
        pred_key = "{}/{}/{}/{}".format(job_config.user_id, job_config.job_id, "test", pred_file)
        label_key = raw_data_dir + labels_file
        # download course prediction and label files, fetch classification metrics at course level
        with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
            download_from_s3(proc_data_bucket, pred_key, s3, working_dir, job_config=job_config)
            download_from_s3(raw_data_bucket, label_key, s3, working_dir, job_config=job_config)
            pred_df = pd.read_csv("/".join([working_dir, pred_file]))
            lab_df = pd.read_csv("/".join([working_dir, labels_file]), dtype=object)
            lab_df = lab_df[lab_df[label_col] == label_type].copy()
            # left join keeps every labelled user, so missing predictions surface as NaN
            pred_lab_df = pd.merge(lab_df, pred_df, how = "left", on = [user_col, course_col])
            # pass a list, matching evaluate_cv_course; a tuple would be read as a single
            # key if the helper indexes the dataframe with it
            check_dataframe_complete(pred_lab_df, job_config, columns = list(pred_cols))
            for course in fetch_complete_courses(job_config, data_bucket = raw_data_bucket, data_dir = raw_data_dir, n_train=1):
                course_metrics_df = fetch_binary_classification_metrics(job_config, pred_lab_df, course)
                course_data.append(course_metrics_df)
    # the index of the concatenated metrics becomes the course column
    master_metrics_df = pd.concat(course_data).reset_index().rename(columns={"index": course_col})
    csv_fp = generate_archive_filename(job_config, extension="csv")
    master_metrics_df[course_col] = hash_df_column(master_metrics_df[course_col], job_config.user_id, job_config.hash_secret)
    master_metrics_df.to_csv(csv_fp, index = False, header = True)
    upload_key = make_s3_key_path(job_config, mode = "test", filename=csv_fp)
    upload_file_to_s3(csv_fp, bucket=proc_data_bucket, key=upload_key)
    os.remove(csv_fp)
    return
Esempio n. 5
0
def evaluate_prule_session():
    """
    Perform statistical testing for prule analysis.

    Downloads and unarchives the extraction results of every session of every course
    into a temporary input directory, runs the prule evaluation Docker image with the
    input, output and prule directories mounted, then renames the image's output.csv
    to the job's archive name and uploads it under mode "test".
    :return: None
    """
    raw_data_dir = "morf-data/"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # ``mode`` is not assigned in this function; it is read from the enclosing module scope
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    raw_data_buckets = job_config.raw_data_buckets
    proc_data_bucket = job_config.proc_data_bucket
    prule_file = job_config.prule_url
    s3 = job_config.initialize_s3()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
        input_dir, output_dir = initialize_input_output_dirs(working_dir)
        # pull extraction results from every course into working_dir
        for raw_data_bucket in raw_data_buckets:
            for course in fetch_courses(job_config, raw_data_bucket):
                # the non-holdout listing does not depend on the session, so fetch it once
                # per course instead of once per session
                non_holdout_sessions = set(fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course))
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course, fetch_all_sessions=True):
                    # sessions missing from the default listing are treated as holdout sessions
                    fetch_mode = "extract" if session in non_holdout_sessions else "extract-holdout"
                    feat_file = generate_archive_filename(job_config, course=course, session=session, mode=fetch_mode)
                    feat_key = make_s3_key_path(job_config, filename=feat_file, course=course, session=session, mode=fetch_mode)
                    feat_local_fp = download_from_s3(proc_data_bucket, feat_key, s3, input_dir, job_config=job_config)
                    unarchive_file(feat_local_fp, input_dir)
        docker_image_fp = urlparse(job_config.prule_evaluate_image).path
        docker_image_dir = os.path.dirname(docker_image_fp)
        docker_image_name = os.path.basename(docker_image_fp)
        image_uuid = load_docker_image(docker_image_dir, job_config, logger, image_name=docker_image_name)
        # create a directory for prule file and copy into it; this will be mounted to docker image
        prule_dir = os.path.join(working_dir, "prule")
        os.makedirs(prule_dir)
        shutil.copy(urlparse(prule_file).path, prule_dir)
        # NOTE(review): the command is a shell string assembled from job configuration values
        cmd = "{} run --network=\"none\" --rm=true --volume={}:/input --volume={}:/output --volume={}:/prule {} ".format(job_config.docker_exec, input_dir, output_dir, prule_dir, image_uuid)
        exit_code = subprocess.call(cmd, shell=True)
        if exit_code != 0:
            # surface the failure; the rename below will fail if no output.csv was produced
            logger.warning("prule evaluation image exited with non-zero status {}".format(exit_code))
        # rename result file and upload results to s3
        final_output_file = os.path.join(output_dir, "output.csv")
        final_output_archive_name = generate_archive_filename(job_config, extension="csv")
        final_output_archive_fp = os.path.join(output_dir, final_output_archive_name)
        os.rename(final_output_file, final_output_archive_fp)
        output_key = make_s3_key_path(job_config, filename = final_output_archive_name, mode = "test")
        upload_file_to_s3(final_output_archive_fp, proc_data_bucket, output_key, job_config, remove_on_success=True)
        return
Esempio n. 6
0
def extract_holdout_course(raw_data_dir="morf-data/", multithread=True):
    """
    Extract features with the Docker image, once per course, on each course's holdout session.

    Results of all courses are then collected into one file, which is uploaded to the
    processed-data bucket and removed locally before an email alert is sent.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :param multithread: whether to run with job_config.max_num_cores workers (True) or a single worker (False).
    :return: None.
    """
    mode = "extract-holdout"
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # wipe whatever a previous run left behind for this user/job/mode
    clear_s3_subdirectory(job_config)
    num_cores = job_config.max_num_cores if multithread else 1
    # one run_image task per course, each restricted to that course's holdout session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        pending = []
        with Pool(num_cores) as pool:
            for course in courses:
                holdout_sessions = fetch_sessions(job_config,
                                                  raw_data_bucket,
                                                  raw_data_dir,
                                                  course,
                                                  fetch_holdout_session_only=True)
                # only the first listed holdout session is used
                task_args = [job_config, raw_data_bucket, course, holdout_sessions[0], level, None]
                pending.append(pool.apply_async(run_image, task_args))
            # no further submissions; block until every task has completed
            pool.close()
            pool.join()
        for task in pending:
            logger.info(task.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=result_file)
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
Esempio n. 7
0
def test_course(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Test one model per course using the Docker image.

    Results of all courses are then collected into one file, which is uploaded to the
    processed-data bucket under an archive-style csv key and removed locally before an
    email alert is sent.
    :param label_type: label type provided by user; validated with check_label_type.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param multithread: whether to run job in parallel (multithread = false can be useful for debugging).
    :return: None.
    """
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # ``mode`` is not assigned in this function; it is read from the enclosing module scope
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # wipe whatever a previous run left behind for this user/job/mode
    clear_s3_subdirectory(job_config)
    num_cores = job_config.max_num_cores if multithread else 1
    # within each bucket, submit one run_image task per course (no session is passed)
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("[INFO] processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket, raw_data_dir)
        with Pool(num_cores) as pool:
            pending = [
                pool.apply_async(
                    run_image,
                    [job_config, raw_data_bucket, course, None, level, label_type])
                for course in courses
            ]
            # no further submissions; block until every task has completed
            pool.close()
            pool.join()
        for task in pending:
            logger.info(task.get())
    result_file = collect_course_results(job_config)
    archive_name = generate_archive_filename(job_config, extension="csv")
    upload_key = make_s3_key_path(job_config, filename=archive_name)
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
Esempio n. 8
0
def train_all(label_type):
    """
    Train a single overall model on the entire dataset using the Docker image.
    :param label_type: label type provided by user; validated with check_label_type.
    :return: None
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode("train")
    check_label_type(label_type)
    # wipe whatever a previous run left behind for this user/job/mode
    clear_s3_subdirectory(job_config)
    # a single image run receives every raw data bucket at once
    run_image(job_config,
              raw_data_bucket=job_config.raw_data_buckets,
              level="all",
              label_type=label_type)
    send_email_alert(job_config)
Esempio n. 9
0
def cross_validate_course(label_type, k=5, multithread=True):
    """
    Run k-fold cross-validation for every course.

    One execute_image_for_cv task is submitted per (course, fold number) pair; once
    all buckets are processed the collected predictions csv is uploaded under mode "test".
    :param label_type: label type provided by user; forwarded to execute_image_for_cv.
    :param k: number of folds; fold numbers 1 through k are executed for each course.
    :param multithread: whether to run with job_config.max_num_cores workers (True) or a single worker (False).
    :return: None.
    """
    # todo: call to create_course_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # ``mode`` is not assigned in this function; it is read from the enclosing module scope
    job_config.update_mode(mode)
    # drop results left by earlier test runs
    clear_s3_subdirectory(job_config, mode="test")
    # the calling directory is expected to contain the docker image
    docker_image_dir = os.getcwd()
    logger = set_logger_handlers(module_logger, job_config)
    num_cores = job_config.max_num_cores if multithread else 1
    logger.info("conducting cross validation")
    for raw_data_bucket in job_config.raw_data_buckets:
        pending = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for fold_num in range(1, k + 1):
                    task_args = [job_config, raw_data_bucket, course, fold_num,
                                 docker_image_dir, label_type]
                    pending.append(pool.apply_async(execute_image_for_cv, task_args))
            # no further submissions; block until every task has completed
            pool.close()
            pool.join()
        for task in pending:
            logger.info(task.get())
    test_csv_fp = collect_course_cv_results(job_config)
    pred_key = make_s3_key_path(job_config, os.path.basename(test_csv_fp), mode="test")
    upload_file_to_s3(test_csv_fp,
                      job_config.proc_data_bucket,
                      pred_key,
                      job_config,
                      remove_on_success=True)
Esempio n. 10
0
def extract_all():
    """
    Extract features with a single run of the Docker image over all raw data buckets.

    The collected result file is uploaded to the processed-data bucket, removed locally,
    and an email alert is sent.
    :return: None.
    """
    mode = "extract"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    # wipe whatever a previous run left behind for this user/job/mode
    clear_s3_subdirectory(job_config)
    # a single image run at level "all" receives every raw data bucket at once
    run_image(job_config, job_config.raw_data_buckets, level="all")
    result_file = collect_all_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=result_file)
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
Esempio n. 11
0
def train_session(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Train one model per session of each course using the Docker image.
    :param label_type: label type provided by user; validated with check_label_type.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param multithread: whether to run job in parallel (multithread = false can be useful for debugging).
    :return: None
    """
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # ``mode`` is not assigned in this function; it is read from the enclosing module scope
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # wipe whatever a previous run left behind for this user/job/mode
    clear_s3_subdirectory(job_config)
    num_cores = job_config.max_num_cores if multithread else 1
    # within each bucket, submit one run_image task per (course, session) pair
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket, raw_data_dir)
        pending = []
        with Pool(num_cores) as pool:
            for course in courses:
                sessions = fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course)
                for session in sessions:
                    task_args = [job_config, raw_data_bucket, course, session, level,
                                 label_type]
                    pending.append(pool.apply_async(run_image, task_args))
            # no further submissions; block until every task has completed
            pool.close()
            pool.join()
        for task in pending:
            logger.info(task.get())
    send_email_alert(job_config)
Esempio n. 12
0
def cross_validate_session(label_type,
                           k=5,
                           multithread=True,
                           raw_data_dir="morf-data/"):
    """
    Compute k-fold cross-validation across sessions (not implemented).

    The function raises NotImplementedError as its first statement, so every statement
    after the raise is an unfinished draft that is never executed.
    :param label_type: label type; the draft passes it to initialize_labels.
    :param k: number of folds; the draft iterates fold numbers 1 through k.
    :param multithread: the draft uses job_config.max_num_cores workers when True, otherwise one.
    :param raw_data_dir: name of subfolder in s3 buckets containing raw data.
    :return: never returns normally.
    :raises NotImplementedError: always.
    """
    raise NotImplementedError  # this is not implemented!
    # ---- unreachable draft below; kept as a starting point for the implementation ----
    # todo: call to create_session_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    # clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("conducting cross validation")
    # NOTE(review): the pool is created but the draft never submits a task to it
    with Pool(num_cores) as pool:
        for raw_data_bucket in job_config.raw_data_buckets:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config,
                                              raw_data_bucket,
                                              data_dir=raw_data_dir,
                                              course=course,
                                              fetch_all_sessions=True):
                    for fold_num in range(1, k + 1):
                        with tempfile.TemporaryDirectory(
                                dir=job_config.local_working_directory
                        ) as working_dir:
                            # get fold train data
                            input_dir, output_dir = initialize_input_output_dirs(
                                working_dir)
                            session_input_dir = os.path.join(
                                input_dir, course, session)
                            # NOTE(review): session_output_dir is assigned but not used in the draft
                            session_output_dir = os.path.join(
                                output_dir, course, session)
                            trainkey = make_s3_key_path(
                                job_config, course,
                                make_feature_csv_name(course, session,
                                                      fold_num, "train"),
                                session)
                            # NOTE(review): train_data_path / test_data_path are assigned but not used in the draft
                            train_data_path = download_from_s3(
                                job_config.proc_data_bucket,
                                trainkey,
                                job_config.initialize_s3(),
                                dir=session_input_dir,
                                job_config=job_config)
                            # get fold test data
                            testkey = make_s3_key_path(
                                job_config, course,
                                make_feature_csv_name(course, session,
                                                      fold_num, "test"),
                                session)
                            test_data_path = download_from_s3(
                                job_config.proc_data_bucket,
                                testkey,
                                job_config.initialize_s3(),
                                dir=session_input_dir,
                                job_config=job_config)
                            # get labels
                            initialize_labels(job_config, raw_data_bucket,
                                              course, session, label_type,
                                              session_input_dir, raw_data_dir)
                            # run docker image with mode == cv
                            #todo
                            # upload results
                            #todo
        pool.close()
        pool.join()
    return
Esempio n. 13
0
def create_session_folds(label_type,
                         k=5,
                         multithread=True,
                         raw_data_dir="morf-data/"):
    """
    From extract and extract-holdout data, create k randomized folds for each session and archive results to s3.

    For every session of every complete course, the session's feature and label csv
    files are merged on the user ID column, split into k shuffled folds, and each
    fold's train and test rows (with the label column dropped) are written to csv and
    uploaded to the processed-data bucket. Upload failures are logged as warnings and
    do not stop the loop.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available); it only sizes the pool, which currently receives no tasks.
    :param raw_data_dir: name of subfolder in s3 buckets containing raw data.
    :return: None.
    """
    user_id_col = "userID"
    label_col = "label_value"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # ``mode`` is not assigned in this function; it is read from the enclosing module scope
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("creating cross-validation folds")
    # NOTE(review): the pool is opened but nothing is submitted to it; all work below
    # runs sequentially in the calling process (see the todo inside the loop)
    with Pool(num_cores) as pool:
        for raw_data_bucket in job_config.raw_data_buckets:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config,
                                              raw_data_bucket,
                                              data_dir=raw_data_dir,
                                              course=course,
                                              fetch_all_sessions=True):
                    # a fresh temporary directory per session; it is deleted when the block exits
                    with tempfile.TemporaryDirectory(
                            dir=job_config.local_working_directory
                    ) as working_dir:
                        # todo: call make_folds() here via apply_async(); currently this is not parallelized!
                        input_dir, output_dir = initialize_input_output_dirs(
                            working_dir)
                        # get the session feature and label data
                        download_train_test_data(job_config,
                                                 raw_data_bucket,
                                                 raw_data_dir,
                                                 course,
                                                 session,
                                                 input_dir,
                                                 label_type=label_type)
                        feature_file = os.path.join(
                            input_dir, course, session,
                            make_feature_csv_name(course, session))
                        label_file = os.path.join(
                            input_dir, course, session,
                            make_label_csv_name(course, session))
                        feat_df = pd.read_csv(feature_file, dtype=object)
                        label_df = pd.read_csv(label_file, dtype=object)
                        # merge features to ensure splits are correct
                        feat_label_df = pd.merge(feat_df,
                                                 label_df,
                                                 on=user_id_col)
                        # NOTE(review): this compares the two input frames, not the merged
                        # frame; the merged row count is not checked here
                        assert feat_df.shape[0] == label_df.shape[
                            0], "features and labels must contain same number of observations"
                        # create the folds
                        logger.info(
                            "creating cv splits with k = {} course {} session {}"
                            .format(k, course, session))
                        skf = StratifiedKFold(n_splits=k, shuffle=True)
                        # a zero array sized to feat_df stands in for the features; the split is
                        # driven by the merged frame's label_value column (hard-coded attribute,
                        # which matches label_col above)
                        folds = skf.split(np.zeros(feat_df.shape[0]),
                                          feat_label_df.label_value)
                        for fold_num, train_test_indices in enumerate(
                                folds, 1
                        ):  # write each fold train/test data to csv and push to s3
                            train_index, test_index = train_test_indices
                            # select each fold's rows and drop the label column before writing
                            train_df, test_df = feat_label_df.loc[
                                train_index, ].drop(
                                    label_col, axis=1), feat_label_df.loc[
                                        test_index, ].drop(label_col, axis=1)
                            train_df_name = os.path.join(
                                working_dir,
                                make_feature_csv_name(course, session,
                                                      fold_num, "train"))
                            test_df_name = os.path.join(
                                working_dir,
                                make_feature_csv_name(course, session,
                                                      fold_num, "test"))
                            train_df.to_csv(train_df_name, index=False)
                            test_df.to_csv(test_df_name, index=False)
                            # upload to s3
                            # any exception raised while building keys or uploading is logged
                            # as a warning and processing continues with the next fold
                            try:
                                train_key = make_s3_key_path(
                                    job_config, course,
                                    os.path.basename(train_df_name), session)
                                upload_file_to_s3(train_df_name,
                                                  job_config.proc_data_bucket,
                                                  train_key,
                                                  job_config,
                                                  remove_on_success=True)
                                test_key = make_s3_key_path(
                                    job_config, course,
                                    os.path.basename(test_df_name), session)
                                upload_file_to_s3(test_df_name,
                                                  job_config.proc_data_bucket,
                                                  test_key,
                                                  job_config,
                                                  remove_on_success=True)
                            except Exception as e:
                                logger.warning(
                                    "exception occurred while uploading cv results: {}"
                                    .format(e))
        pool.close()
        pool.join()
    return
Esempio n. 14
0
def run_morf_job(client_config_url,
                 server_config_url,
                 email_to=None,
                 no_cache=False):
    """
    Wrapper function to run complete MORF job.

    Combines server.config and client.config into a single config file inside a temporary working
    directory, fetches the docker image and controller script named in that config, runs the controller
    script with python3, and calls the email helpers at initialization and completion. The process
    working directory is switched to the temporary directory for the duration of the job and is always
    restored to the caller's directory afterwards.
    :param client_config_url: url to client.config file; should be located on local machine.
    :param server_config_url: url (local or s3) to server.config file.
    :param email_to: optional email address; when provided it replaces job_config.email_to and is
        written to the "client" config section via update_config_fields_in_section.
    :param no_cache: when True, the docker image and controller script are not cached in s3.
    :return: None. Calls sys.exit(-1) if a KeyError is raised while fetching/caching the job files.
    """
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    config_filename = "config.properties"
    server_config_path = urlparse(server_config_url).path
    # read server.config and get those properties
    server_config = get_config_properties(server_config_path)
    # save calling working directory so it can be restored when the job finishes or fails
    calling_dir = os.getcwd()
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(
            dir=server_config["local_working_directory"]) as working_dir:
        try:
            # change directory into working_dir; relative paths below resolve against it
            os.chdir(working_dir)
            # download client.config into local_working_directory using AWS creds from server.config
            s3 = boto3.client(
                "s3",
                aws_access_key_id=server_config["aws_access_key_id"],
                aws_secret_access_key=server_config["aws_secret_access_key"])
            fetch_file(s3, working_dir, client_config_url)
            local_client_config_path = os.path.join(os.getcwd(),
                                                    "client.config")
            combine_config_files(server_config_path,
                                 local_client_config_path,
                                 outfile=config_filename)
            job_config = MorfJobConfig(config_filename)
            if email_to:  # if email_to was provided by user, this overrides in config file -- allows users to easily run mwe
                print(
                    "[INFO] email address from submission {} overriding email address in config file {}"
                    .format(email_to, job_config.email_to))
                job_config.email_to = email_to
                update_config_fields_in_section("client", email_to=email_to)
            # NOTE(review): this call is made regardless of no_cache and passes no filename;
            # presumably it caches the config file via a default argument -- confirm against the helper.
            cache_job_file_in_s3(s3, job_config.user_id, job_config.job_id,
                                 job_config.proc_data_bucket)
            # from client.config, fetch and download the following: docker image, controller script
            try:
                fetch_file(s3,
                           working_dir,
                           job_config.docker_url,
                           dest_filename=docker_image_name)
                fetch_file(s3,
                           working_dir,
                           job_config.controller_url,
                           dest_filename=controller_script_name)
                if not no_cache:  # cache job files in s3 unless no_cache parameter set to true
                    cache_job_file_in_s3(s3, job_config.user_id,
                                         job_config.job_id,
                                         job_config.proc_data_bucket,
                                         docker_image_name)
                    cache_job_file_in_s3(s3, job_config.user_id,
                                         job_config.job_id,
                                         job_config.proc_data_bucket,
                                         controller_script_name)
            except KeyError as e:
                cause = e.args[0]
                print("[Error]: field {} missing from client.config file.".
                      format(cause))
                sys.exit(-1)
            # run controller script with notifications for initialization and completion
            job_config.update_status("INITIALIZED")
            send_email_alert(job_config)
            # argv list instead of a shell string: same command, no shell parsing involved
            # NOTE(review): the controller's exit code is not inspected, so the status below is set
            # to SUCCESS even if the script failed -- confirm whether that is intended.
            subprocess.call(["python3", controller_script_name])
            job_config.update_status("SUCCESS")
            send_success_email(job_config)
        finally:
            # leave the temporary directory before it is deleted; otherwise the process would be
            # left with a working directory that no longer exists
            os.chdir(calling_dir)
    return
Esempio n. 15
0
def extract_session(labels=False,
                    raw_data_dir="morf-data/",
                    label_type="labels-train",
                    multithread=True):
    """
    Extract features using the Docker image, building individual feature sets for each "session" or iteration of the course.

    run_image is dispatched once per (course, session) pair of every raw data bucket through a process
    pool; the per-session results are then collected and uploaded to s3, and the local collected file
    is removed after each upload.
    :labels: flag for whether this is a job to generate output labels; if so, one collected result file per raw data bucket is uploaded back into that bucket under raw_data_dir, named "<label_type>.csv".
    :raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :label_type: type of outcome label to use (string); only used to name the uploaded label file.
    :multithread: whether to run job in parallel (multithread = false can be useful for debugging).
    :return: None
    """
    level = "session"
    mode = "extract"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode and set number of cores
    clear_s3_subdirectory(job_config)
    num_cores = job_config.max_num_cores if multithread else 1
    # for each bucket, call job_runner once per session with --mode=extract and --level=session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                for session in fetch_sessions(
                        job_config,
                        raw_data_bucket,
                        raw_data_dir,
                        course,
                        fetch_holdout_session_only=False):
                    poolres = pool.apply_async(
                        run_image,
                        [job_config, raw_data_bucket, course, session, level])
                    reslist.append(poolres)
            # wait for every submitted task before leaving the pool context
            pool.close()
            pool.join()
        # get() re-raises any exception that occurred inside a worker
        for res in reslist:
            logger.info(res.get())
    if not labels:  # normal feature extraction job; collects features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(job_config)
        upload_key = "{}/{}/extract/{}".format(job_config.user_id,
                                               job_config.job_id, result_file)
        upload_file_to_s3(result_file,
                          bucket=job_config.proc_data_bucket,
                          key=upload_key)
        os.remove(result_file)
    else:  # label extraction job; copy file into raw course data dir instead of proc_data_bucket, creating separate label files for each bucket
        for raw_data_bucket in job_config.raw_data_buckets:
            result_file = collect_session_results(
                job_config, raw_data_buckets=[raw_data_bucket])
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file,
                              bucket=raw_data_bucket,
                              key=upload_key)
            # clean up per bucket: every collected file is removed, and nothing is
            # referenced after the loop if there are no buckets at all
            os.remove(result_file)
    send_email_alert(job_config)
    return
import os
import tempfile
import re
import pandas as pd

GENDER_CSV_FP = os.path.join(os.getcwd(), "data/names_for_josh.csv")  # docker doesn't like relative file paths
GENDER_VALUES_TO_KEEP = ("male", "female")
MORF_DATA_DIR = "morf-data/"
MYSQL_DOCKER_DIR = os.path.join(os.getcwd(), "docker")
MYSQL_DOCKER_IMG_NAME = "mysql-docker.tar"
OUTPUT_DIR = os.path.join(os.getcwd(), "data/hash-mapping-exports")
OUTPUT_FILENAME = "coursera_user_hash_gender_lookup.csv"
GENDER_COL_NAME = "gender"

module_logger = logging.getLogger(__name__)
job_config = MorfJobConfig("config.properties")
logger = set_logger_handlers(module_logger, job_config)

for raw_data_bucket in job_config.raw_data_buckets:
    for course in fetch_complete_courses(job_config, raw_data_bucket):
        for session in fetch_sessions(job_config, raw_data_bucket, data_dir=MORF_DATA_DIR, course=course,
                                      fetch_all_sessions=True):
            with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                print("[INFO] processing course {} session {}".format(course, session))
                # download the data exports
                fetch_raw_course_data(job_config, raw_data_bucket, course, session, input_dir=working_dir)
                # download_raw_course_data(job_config, raw_data_bucket, course=course, session=session, input_dir=working_dir,
                #                          data_dir=MORF_DATA_DIR[:-1]) # drop trailing slash on data dir
                # create docker run command and load image
                image_uuid = load_docker_image(MYSQL_DOCKER_DIR, job_config, logger, image_name=MYSQL_DOCKER_IMG_NAME)
                cmd = make_docker_run_command(job_config.docker_exec, working_dir, OUTPUT_DIR, image_uuid,
Esempio n. 17
0
def fork_features(job_id_to_fork, raw_data_dir="morf-data/"):
    """
    Fork the extracted features of a previous job into the current job.

    For each of the "extract" and "extract-holdout" modes, the current job's s3 subdirectory is cleared,
    every per-session archive of job_id_to_fork is copied within proc_data_bucket to the key used by
    the current job, and the collected session results are then uploaded for that mode.
    :param job_id_to_fork: string, name of job_id (must be from same user).
    :param raw_data_dir: directory passed to fetch_courses/fetch_sessions when listing each raw data bucket.
    :return: None.
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    s3_url_template = "s3://{}/{}"
    # todo: multithread this
    for mode in ("extract", "extract-holdout"):
        is_holdout = mode == "extract-holdout"
        job_config.update_mode(mode)
        clear_s3_subdirectory(job_config)
        for bucket in job_config.raw_data_buckets:
            print("[INFO] forking features from bucket {} mode {}".format(
                bucket, mode))
            for course in fetch_courses(job_config, bucket, raw_data_dir):
                sessions = fetch_sessions(
                    job_config,
                    bucket,
                    raw_data_dir,
                    course,
                    fetch_holdout_session_only=is_holdout)
                for session in sessions:
                    # keyword arguments shared by every filename/key lookup for this session
                    where = dict(course=course, session=session, mode=mode)
                    # source: archive as stored under the forked job id
                    source_name = generate_archive_filename(
                        job_config, job_id=job_id_to_fork, **where)
                    source_key = make_s3_key_path(job_config,
                                                  filename=source_name,
                                                  job_id=job_id_to_fork,
                                                  **where)
                    source_url = s3_url_template.format(
                        job_config.proc_data_bucket, source_key)
                    # destination: same archive under the current job id
                    dest_name = generate_archive_filename(job_config, **where)
                    dest_key = make_s3_key_path(job_config,
                                                filename=dest_name,
                                                **where)
                    dest_url = s3_url_template.format(
                        job_config.proc_data_bucket, dest_key)
                    # copy from the previous job's location to the current job's location
                    copy_s3_file(job_config,
                                 sourceloc=source_url,
                                 destloc=dest_url)
        # once the individual extraction results are copied, upload the collected feature file
        collected = collect_session_results(job_config, holdout=is_holdout)
        collected_key = "{}/{}/{}/{}".format(job_config.user_id,
                                             job_config.job_id,
                                             job_config.mode, collected)
        upload_file_to_s3(collected,
                          bucket=job_config.proc_data_bucket,
                          key=collected_key)
    return
Esempio n. 18
0
def extract_holdout_session(labels=False,
                            raw_data_dir="morf-data/",
                            label_type="labels-train",
                            multithread=True):
    """
    Extract features using the Docker image across each session of holdout data.

    run_image is dispatched once per course (for that course's holdout session) in every raw data
    bucket through a process pool; the results are then collected and uploaded to s3, and the local
    collected file is removed after each upload.
    :labels: flag for whether this is a job to generate output labels; if so, one collected result file per raw data bucket is uploaded back into that bucket under raw_data_dir, named "<label_type>.csv".
    :raw_data_dir: directory in each raw data bucket passed to fetch_courses/fetch_sessions; also the key prefix for uploaded label files.
    :label_type: type of outcome label (string); only used to name the uploaded label file.
    :multithread: whether to use job_config.max_num_cores worker processes instead of a single one.
    :return: None
    """
    mode = "extract-holdout"
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # call job_runner once per session with --mode=extract-holdout and --level=session
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    num_cores = job_config.max_num_cores if multithread else 1
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("[INFO] processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                # only use holdout run; fetch_sessions returns a sequence, so take its first entry
                # NOTE(review): raises IndexError if a course has no holdout session -- confirm
                # whether such courses can occur.
                holdout_session = fetch_sessions(
                    job_config,
                    raw_data_bucket,
                    raw_data_dir,
                    course,
                    fetch_holdout_session_only=True)[0]
                poolres = pool.apply_async(run_image, [
                    job_config, raw_data_bucket, course, holdout_session, level
                ])
                reslist.append(poolres)
            # wait for every submitted task before leaving the pool context
            pool.close()
            pool.join()
        # get() re-raises any exception that occurred inside a worker
        for res in reslist:
            logger.info(res.get())
    if not labels:  # normal feature extraction job; collects features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(job_config, holdout=True)
        upload_key = "{}/{}/{}/{}".format(job_config.user_id,
                                          job_config.job_id, job_config.mode,
                                          result_file)
        upload_file_to_s3(result_file,
                          bucket=job_config.proc_data_bucket,
                          key=upload_key)
        os.remove(result_file)
    else:  # label extraction job; copy file into raw course data dir instead of proc_data_bucket, creating separate label files for each bucket
        for raw_data_bucket in job_config.raw_data_buckets:
            # NOTE(review): unlike the feature branch above, holdout=True is not passed here --
            # confirm this is intended.
            result_file = collect_session_results(
                job_config, raw_data_buckets=[raw_data_bucket])
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file,
                              bucket=raw_data_bucket,
                              key=upload_key)
            # clean up per bucket: every collected file is removed, and nothing is
            # referenced after the loop if there are no buckets at all
            os.remove(result_file)
    send_email_alert(job_config)
    return