Example #1
def fetch_result_file(job_config, dir, course=None, session=None):
    """
    Download and untar result file for user_id, job_id, mode, and (optional) course and session from job_config.proc_data_bucket.
    :param job_config: MorfJobConfig object.
    :param dir: directory to download and unpack the result archive into.
    :param course: course shortname.
    :param session: session number.
    :return:  None.
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3 = job_config.initialize_s3()
    bucket = job_config.proc_data_bucket
    archive_file = generate_archive_filename(job_config, course, session)
    key = make_s3_key_path(job_config,
                           course=course,
                           session=session,
                           filename=archive_file)
    dest = os.path.join(dir, archive_file)
    logger.info("fetching s3://{}/{}".format(bucket, key))
    with open(dest, 'wb') as resource:
        try:
            s3.download_fileobj(bucket, key, resource)
        except Exception as e:
            logger.warning(
                "exception while fetching results for mode {} course {} session {}:{}"
                .format(job_config.mode, course, session, e))
    unarchive_file(dest, dir)
    os.remove(dest)
    return
Example #2
def make_output_archive_file(output_dir,
                             job_config,
                             course=None,
                             session=None):
    """
    Archive output_dir into archive file, and return name of archive file.
    :param output_dir: directory to compress into archive_file.
    :param job_config: MorfJobConfig object; provides user_id, job_id, and mode for the archive name.
    :param course: name of course for job (string) (optional).
    :param session: session number of course (string) (optional, only needed when mode == extract).
    :return: name of archive file (string).
    """
    logger = set_logger_handlers(module_logger, job_config)
    archive_file = generate_archive_filename(job_config, course, session)
    # archive results; only save directory structure relative to output_dir (NOT absolute directory structure)
    logger.info(" archiving results to {} as {}".format(
        output_dir, archive_file))
    # todo: use python tarfile here
    cmd = "tar -cvf {} -C {} .".format(archive_file, output_dir)
    subprocess.call(cmd,
                    shell=True,
                    stdout=open(os.devnull, "wb"),
                    stderr=open(os.devnull, "wb"))
    return archive_file
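The todo above suggests replacing the shell call with Python's standard-library tarfile module; a minimal sketch of that replacement (same relative-path behavior as tar -cvf ... -C output_dir ., function name is illustrative):

import tarfile

def archive_directory_with_tarfile(archive_file, output_dir):
    # store contents relative to output_dir, mirroring `tar -cf archive_file -C output_dir .`
    with tarfile.open(archive_file, "w") as tar:
        tar.add(output_dir, arcname=".")
    return archive_file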
Example #3
def sync_s3_bucket_cache(job_config, bucket):
    """
    Cache all data in an s3 bucket to job_config.cache_dir, creating a complete copy of files and directory structure.
    :param job_config: MorfJobConfig object.
    :param bucket: path to s3 bucket.
    :return:
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3bucket = "s3://{}".format(bucket)
    bucket_cache_dir = os.path.join(job_config.cache_dir, bucket)
    # create job_config.cache_dir directory if not exists
    if not os.path.exists(job_config.cache_dir):
        try:
            os.makedirs(job_config.cache_dir)
        except Exception as e:
            logger.error("error creating cache: {}".format(e))
            raise
    # execute s3 sync command
    cmd = "{} s3 sync {} {}".format(job_config.aws_exec, s3bucket,
                                    bucket_cache_dir)
    logger.info("running {}".format(cmd))
    try:
        subprocess.call(cmd, shell=True)
    except Exception as e:
        logger.warning("exception when executing sync: {}".format(e))
    return
Example #4
def create_course_folds(label_type, k=5, multithread=True):
    """
    From extract and extract-holdout data, create k randomized folds, pooling data by course (across sessions) and archive results to s3.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available)
    :return:
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("creating cross-validation folds")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                poolres = pool.apply_async(
                    make_folds,
                    [job_config, raw_data_bucket, course, k, label_type])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    return
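The fold-creation loop follows the apply_async / close / join / get pattern used throughout MORF; a stripped-down sketch of just that pattern, with a placeholder worker standing in for make_folds:

from multiprocessing import Pool

def _worker(course):
    # placeholder for make_folds(job_config, raw_data_bucket, course, k, label_type)
    return "created folds for {}".format(course)

def run_pool(courses, num_cores=2):
    with Pool(num_cores) as pool:
        async_results = [pool.apply_async(_worker, [c]) for c in courses]
        pool.close()
        pool.join()
        # .get() re-raises any exception raised inside the worker
        return [r.get() for r in async_results]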
Example #5
def check_dataframe_complete(df, job_config, columns):
    """
    Check columns for presence of NaN values; if any NaN values exist, throw message and raise exception.
    :param df: pd.DataFrame, containing columns.
    :param job_config: MorfJobConfig object; used for logging.
    :param columns: columns to check for NaN values.
    :return:
    """
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("[INFO] checking predictions")
    # filter to only include complete courses
    courses = [
        x[0] for x in fetch_all_complete_courses_and_sessions(job_config)
    ]
    df_to_check = df[df.course.isin(courses)]
    null_counts = df_to_check[columns].apply(lambda x: sum(x.isnull()), axis=0)
    if null_counts.sum() > 0:
        logger.error(
            "Null values detected in the following columns: {} \n Did you include predicted probabilities and labels for all users?"
            .format(null_counts.loc[null_counts > 0].index.tolist()))
        missing_courses = df_to_check[
            df_to_check.prob.isnull()]['course'].unique()
        logger.error("missing values detected in these courses: {}".format(
            missing_courses))
        raise ValueError("null values detected in prediction dataframe")
    else:
        return
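The null check above can be reproduced on a toy frame; a small illustrative sketch (column names mirror the MORF prediction format, data values are made up):

import numpy as np
import pandas as pd

toy_df = pd.DataFrame({"course": ["a", "a", "b"],
                       "prob": [0.2, np.nan, 0.9],
                       "label_value": [0, 1, 1]})
null_counts = toy_df[["prob", "label_value"]].apply(lambda x: sum(x.isnull()), axis=0)
print(null_counts.loc[null_counts > 0].index.tolist())  # ['prob']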
Example #6
def sync_s3_job_cache(job_config,
                      modes=("extract", "extract-holdout", "train", "test")):
    """
    Sync data in s3 just for this specific job (better for large buckets or when the entire bucket is not actually needed).
    :param job_config: MorfJobConfig object; job_config.proc_data_bucket is the bucket that is synced.
    :param modes: modes to update cache for; defaults to all modes.
    :return:
    """
    bucket = job_config.proc_data_bucket
    logger = set_logger_handlers(module_logger, job_config)
    s3bucket = "s3://{}".format(bucket)
    bucket_cache_dir = os.path.join(job_config.cache_dir, bucket)
    # create job_config.cache_dir directory if not exists
    if not os.path.exists(job_config.cache_dir):
        try:
            os.makedirs(job_config.cache_dir)
        except Exception as e:
            logger.error("error creating cache: {}".format(e))
            raise
    for m in modes:
        s3_prefix = make_s3_key_path(job_config, mode=m)
        mode_cache_dir = os.path.join(bucket_cache_dir, job_config.user_id,
                                      job_config.job_id, m)
        # execute s3 sync command
        cmd = "{} s3 sync {}/{} {}".format(job_config.aws_exec, s3bucket,
                                           s3_prefix, mode_cache_dir)
        logger.info("running {}".format(cmd))
        try:
            subprocess.call(cmd, shell=True)
        except Exception as e:
            logger.warning("exception when executing sync: {}".format(e))
    return
Example #7
def upload_files_to_zenodo(job_config, upload_files, deposition_id = None, publish = True):
    """
    Upload each file in upload_files to Zenodo, and optionally publish the deposition.
    :param job_config: MorfJobConfig object; must provide a zenodo_access_token.
    :param upload_files: an iterable of filenames to upload. These should be locally available.
    :param deposition_id: existing Zenodo deposition id; if None, a new empty deposition is created.
    :param publish: whether to publish the deposition after uploading.
    :return: deposition_id of Zenodo deposition
    """
    working_dir = os.getcwd()
    s3 = job_config.initialize_s3()
    logger = set_logger_handlers(module_logger, job_config)
    access_token = getattr(job_config, "zenodo_access_token")
    # check inputs
    assert isinstance(upload_files, collections.abc.Iterable), "param 'upload_files' must be an iterable"
    if not deposition_id: # create an empty upload and get its deposition id
        deposition_id = create_empty_zenodo_upload(access_token).json()['id']
    # upload each file
    for f in upload_files:
        fp = fetch_file(s3, working_dir, f, job_config=job_config)
        data = {'filename': fp}
        with open(fp, 'rb') as file_obj:
            r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id,
                              params={'access_token': access_token}, data=data,
                              files={'file': file_obj})
        logger.info(r.json())
    # generate metadata for the zenodo repo and publish it
    generate_zenodo_metadata(job_config, deposition_id)
    if publish:
        publish_zenodo_deposition(job_config, deposition_id)
    return deposition_id
Example #8
def fetch_binary_classification_metrics(job_config,
                                        df,
                                        course,
                                        pred_prob_col="prob",
                                        pred_col="pred",
                                        label_col="label_value",
                                        course_col="course"):
    """
    Fetch set of binary classification metrics for df.
    :param job_config: MorfJobConfig object.
    :param df: pd.DataFrame of predictions; must include columns with names matching pred_prob_col, pred_col, label_col, and course_col.
    :param course: course to compute metrics for; df is filtered to rows where course_col equals this value.
    :param pred_prob_col: column of predicted probability of a positive class label. Should be in interval [0,1].
    :param pred_col: column of predicted class label. Should be in {0, 1}.
    :param label_col: column of true class label. Should be in {0, 1}.
    :param course_col: column containing the course identifier.
    :return: pd.DataFrame with dimension [1 x n_metrics].
    """
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("fetching metrics for course {}".format(course))
    df = df[df[course_col] == course]
    metrics = {}
    y_pred = df[pred_col].values.astype(float)
    y_true = df[label_col].values.astype(float)
    y_score = df[pred_prob_col].values
    metrics["accuracy"] = sklearn.metrics.accuracy_score(y_true, y_pred)
    try:
        metrics["auc"] = sklearn.metrics.roc_auc_score(y_true, y_score)
        metrics["log_loss"] = sklearn.metrics.log_loss(y_true, y_score)
        metrics["precision"] = sklearn.metrics.precision_score(y_true,
                                                               y_pred)  #
        metrics["recall"] = sklearn.metrics.recall_score(
            y_true, y_pred)  # true positive rate, sensitivity
        metrics["f1_score"] = sklearn.metrics.f1_score(y_true, y_pred)
    except ValueError:
        logger.warning(
            "Only one class present in y_true for course {}. ROC AUC score, log_loss, precision, recall, F1 are undefined."
            .format(course))
        metrics["auc"] = np.nan
        metrics["log_loss"] = np.nan
        metrics["precision"] = np.nan
        metrics["recall"] = np.nan
        metrics["f1_score"] = np.nan
    metrics["cohen_kappa_score"] = sklearn.metrics.cohen_kappa_score(
        y_true, y_pred)
    metrics["N"] = df.shape[0]
    metrics["N_n"] = df[label_col].value_counts().get(0, 0)
    metrics["N_p"] = df[label_col].value_counts().get(1, 0)
    cm = sklearn.metrics.confusion_matrix(y_true, y_pred)
    try:
        spec = cm[0, 0] / float(cm[0, 0] + cm[0, 1])  # specificity = TN / (TN + FP)
    except Exception as e:
        logger.error(
            "error when computing specificity from confusion matrix: {}".format(e))
        logger.error("confusion matrix is: {}".format(cm))
        spec = np.nan
    metrics["specificity"] = spec
    metrics_df = pd.DataFrame(metrics, index=[course])
    return metrics_df
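For reference, sklearn's confusion_matrix puts true labels on the rows and predicted labels on the columns, so specificity is TN / (TN + FP); a short sketch with made-up labels:

import numpy as np
import sklearn.metrics

y_true = np.array([0, 0, 0, 1, 1])
y_pred = np.array([0, 1, 0, 1, 0])
tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
specificity = tn / float(tn + fp)  # 2 / (2 + 1) = 0.67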
Example #9
def download_train_test_data(job_config, raw_data_bucket, raw_data_dir, course,
                             session, input_dir, label_type):
    """
    Download pre-extracted train or test data (specified by mode) for course/session into input_dir.
    :param job_config: MorfJobConfig object.
    :param raw_data_bucket: bucket containing raw data.
    :param raw_data_dir: directory in raw_data_bucket containing course-level data.
    :param course: course to fetch data for.
    :param session: session to fetch data for.
    :param input_dir: /input directory to load data into. This should be same directory mounted to Docker image.
    :param label_type: valid label type to retain for 'label' column of MORF-provided labels.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3 = job_config.initialize_s3()
    aws_access_key_id = job_config.aws_access_key_id
    aws_secret_access_key = job_config.aws_secret_access_key
    proc_data_bucket = job_config.proc_data_bucket
    mode = job_config.mode
    user_id = job_config.user_id
    job_id = job_config.job_id
    if mode == "train":
        fetch_mode = "extract"
    if mode == "test":
        fetch_mode = "extract-holdout"
    logger.info(" fetching {} data for course {} session {}".format(
        fetch_mode, course, session))
    session_input_dir = os.path.join(input_dir, course, session)
    os.makedirs(session_input_dir)
    # download features file
    feature_csv = generate_archive_filename(job_config,
                                            mode=fetch_mode,
                                            extension="csv")
    key = "{}/{}/{}/{}".format(user_id, job_id, fetch_mode, feature_csv)
    download_from_s3(proc_data_bucket, key, s3, session_input_dir)
    # read features file and filter to only include specific course/session
    local_feature_csv = os.path.join(session_input_dir, feature_csv)
    temp_df = pd.read_csv(local_feature_csv, dtype=object)
    outfile = os.path.join(session_input_dir,
                           "{}_{}_features.csv".format(course, session))
    temp_df[(temp_df["course"] == course) & (temp_df["session"] == session)].drop(["course", "session"], axis = 1)\
        .to_csv(outfile, index = False)
    os.remove(local_feature_csv)
    if mode == "train":  #download labels only if training job; otherwise no labels needed
        initialize_labels(s3,
                          aws_access_key_id,
                          aws_secret_access_key,
                          raw_data_bucket,
                          course,
                          session,
                          mode,
                          label_type,
                          dest_dir=session_input_dir,
                          data_dir=raw_data_dir)
    return
Example #10
def docker_cloud_login(job_config):
    """
    Log into docker cloud using creds in job_config.
    :param job_config: MorfJobConfig object.
    :return: None
    """
    cmd = "docker login --username={} --password={}".format(
        job_config.docker_cloud_username, job_config.docker_cloud_password)
    logger = set_logger_handlers(module_logger, job_config)
    execute_and_log_output(cmd, logger)
    return
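Passing the password on the command line exposes it in the process list; a sketch of an alternative using docker login --password-stdin (assumes the same job_config fields; not the function MORF actually uses):

import subprocess

def docker_cloud_login_stdin(job_config):
    # send the password over stdin instead of embedding it in the command string
    cmd = ["docker", "login",
           "--username", job_config.docker_cloud_username,
           "--password-stdin"]
    subprocess.run(cmd, input=job_config.docker_cloud_password.encode("utf-8"), check=True)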
Example #11
def cache_to_docker_hub(job_config, dir, image_name):
    """
    Push image to MORF repo in Docker Hub.
    :param job_config: MorfJobConfig object.
    :param dir: directory containing the Docker image file.
    :param image_name: name of the Docker image file to load and push.
    :return: Docker Hub repo:tag path of the pushed image (string).
    """
    logger = set_logger_handlers(module_logger, job_config)
    image_uuid = load_docker_image(dir, job_config, logger, image_name)
    docker_cloud_login(job_config)
    docker_cloud_repo_and_tag_path = docker_cloud_push(job_config, image_uuid)
    return docker_cloud_repo_and_tag_path
Example #12
def run_morf_job(job_config, no_cache=False):
    """
    Wrapper function to run complete MORF job.
    :param job_config: MorfJobConfig object built from the combined client/server config.
    :param no_cache: if True, skip caching job files in s3.
    :return:
    """
    combined_config_filename = "config.properties"
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("running job id: {}".format(job_config.morf_id))
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    s3 = job_config.initialize_s3()
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(
            dir=job_config.local_working_directory) as working_dir:
        # copy config file into new directory
        shutil.copy(combined_config_filename, working_dir)
        os.chdir(working_dir)
        # from job_config, fetch and download the following: docker image, controller script, cached config file
        update_morf_job_cache(job_config)
        # from client.config, fetch and download the following: docker image, controller script
        try:
            fetch_file(s3,
                       working_dir,
                       job_config.docker_url,
                       dest_filename=docker_image_name,
                       job_config=job_config)
            fetch_file(s3,
                       working_dir,
                       job_config.controller_url,
                       dest_filename=controller_script_name,
                       job_config=job_config)
            if not no_cache:  # cache job files in s3 unless no_cache parameter set to true
                cache_job_file_in_s3(job_config, filename=docker_image_name)
                cache_job_file_in_s3(job_config,
                                     filename=controller_script_name)
        except KeyError as e:
            cause = e.args[0]
            logger.error(
                "[Error]: field {} missing from client.config file.".format(
                    cause))
            sys.exit(-1)
        # change working directory and run controller script with notifications for initialization and completion
        job_config.update_status("INITIALIZED")
        send_email_alert(job_config)
        subprocess.call("python3 {}".format(controller_script_name),
                        shell=True)
        job_config.update_status("SUCCESS")
        send_success_email(job_config)
        return
Example #13
def collect_session_results(job_config,
                            holdout=False,
                            raw_data_dir="morf-data/",
                            raw_data_buckets=None):
    """
    Iterate through course- and session-level directories in bucket, download individual files from [mode], add column for course and session, and concatenate into single 'master' csv.
    :param job_config: MorfJobConfig object; its mode and proc_data_bucket determine which results are collected.
    :param holdout: flag; fetch holdout run only (boolean; default False).
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :param raw_data_buckets: list of buckets containing raw data; used to fetch course names from each bucket. Defaults to job_config.raw_data_buckets.
    :return: path to csv.
    """
    logger = set_logger_handlers(module_logger, job_config)
    mode = job_config.mode
    if not raw_data_buckets:  # can utilize this parameter to override job_config buckets; used for label extraction
        raw_data_buckets = job_config.raw_data_buckets
    feat_df_list = list()
    for raw_data_bucket in raw_data_buckets:
        for course in fetch_courses(job_config, raw_data_bucket, raw_data_dir):
            for run in fetch_sessions(job_config,
                                      raw_data_bucket,
                                      raw_data_dir,
                                      course,
                                      fetch_holdout_session_only=holdout):
                with tempfile.TemporaryDirectory(
                        dir=os.getcwd()) as working_dir:
                    logger.info(
                        "[INFO] fetching extraction results for course {} run {}"
                        .format(course, run))
                    try:
                        fetch_result_file(job_config,
                                          course=course,
                                          session=run,
                                          dir=working_dir)
                        csv = fetch_result_csv_fp(working_dir)
                        feat_df = pd.read_csv(csv, dtype=object)
                        feat_df['course'] = course
                        feat_df['session'] = run
                        feat_df_list.append(feat_df)
                    except Exception as e:
                        logger.warning(
                            "exception while collecting session results for course {} session {} mode {}: {}"
                            .format(course, run, mode, e))
                        continue
    master_feat_df = pd.concat(feat_df_list)
    csv_fp = generate_archive_filename(job_config, extension='csv')
    master_feat_df.to_csv(csv_fp, index=False, header=True)
    return csv_fp
Example #14
def collect_course_cv_results(job_config, k=5, raw_data_dir="morf-data/"):
    """
    Iterate through course-level directories in bucket, download per-fold test predictions, add columns for course and fold number, and concatenate into a single 'master' csv.
    :param job_config: MorfJobConfig object; its proc_data_bucket and raw_data_buckets determine which results are collected.
    :param k: number of cross-validation folds.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :return: path to csv.
    """
    logger = set_logger_handlers(module_logger, job_config)
    raw_data_buckets = job_config.raw_data_buckets
    mode = job_config.mode
    pred_df_list = list()
    session = None
    for raw_data_bucket in raw_data_buckets:
        for course in fetch_complete_courses(job_config, raw_data_bucket):
            with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                for fold_num in range(1, k + 1):
                    logger.info(
                        "fetching {} results for course {} session {}".format(
                            mode, course, session))
                    try:
                        fold_csv_name = "{}_{}_test.csv".format(
                            course, fold_num)
                        key = make_s3_key_path(job_config,
                                               course,
                                               fold_csv_name,
                                               mode="test")
                        pred_fp = download_from_s3(job_config.proc_data_bucket,
                                                   key,
                                                   job_config.initialize_s3(),
                                                   working_dir,
                                                   dest_filename=fold_csv_name)
                        pred_df = pd.read_csv(pred_fp, dtype=object)
                        pred_df['course'] = course
                        pred_df['fold_num'] = str(fold_num)
                        pred_df_list.append(pred_df)
                    except Exception as e:
                        logger.warning("exception occurred: {} ".format(e))
                        continue
    master_feat_df = pd.concat(pred_df_list)
    csv_fp = generate_archive_filename(job_config,
                                       mode="test",
                                       extension='csv')
    master_feat_df.to_csv(csv_fp, index=False, header=True)
    return csv_fp
Example #15
def evaluate_prule_session():
    """
    Perform statistical testing for prule analysis.
    :return: None
    """
    raw_data_dir = "morf-data/"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    raw_data_buckets = job_config.raw_data_buckets
    proc_data_bucket = job_config.proc_data_bucket
    prule_file = job_config.prule_url
    s3 = job_config.initialize_s3()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
        input_dir, output_dir = initialize_input_output_dirs(working_dir)
        # pull extraction results from every course into working_dir
        for raw_data_bucket in raw_data_buckets:
            for course in fetch_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course, fetch_all_sessions=True):
                    if session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course):
                        ## session is a non-holdout session
                        fetch_mode = "extract"
                    else:
                        fetch_mode = "extract-holdout"
                    feat_file = generate_archive_filename(job_config, course=course, session=session, mode=fetch_mode)
                    feat_key = make_s3_key_path(job_config, filename=feat_file, course=course, session=session, mode=fetch_mode)
                    feat_local_fp = download_from_s3(proc_data_bucket, feat_key, s3, input_dir, job_config=job_config)
                    unarchive_file(feat_local_fp, input_dir)
        docker_image_fp = urlparse(job_config.prule_evaluate_image).path
        docker_image_dir = os.path.dirname(docker_image_fp)
        docker_image_name = os.path.basename(docker_image_fp)
        image_uuid = load_docker_image(docker_image_dir, job_config, logger, image_name=docker_image_name)
        # create a directory for prule file and copy into it; this will be mounted to docker image
        prule_dir = os.path.join(working_dir, "prule")
        os.makedirs(prule_dir)
        shutil.copy(urlparse(prule_file).path, prule_dir)
        cmd = "{} run --network=\"none\" --rm=true --volume={}:/input --volume={}:/output --volume={}:/prule {} ".format(job_config.docker_exec, input_dir, output_dir, prule_dir, image_uuid)
        subprocess.call(cmd, shell=True)
        # rename result file and upload results to s3
        final_output_file = os.path.join(output_dir, "output.csv")
        final_output_archive_name = generate_archive_filename(job_config, extension="csv")
        final_output_archive_fp = os.path.join(output_dir, final_output_archive_name)
        os.rename(final_output_file, final_output_archive_fp)
        output_key = make_s3_key_path(job_config, filename = final_output_archive_name, mode = "test")
        upload_file_to_s3(final_output_archive_fp, proc_data_bucket, output_key, job_config, remove_on_success=True)
        return
Example #16
def extract_holdout_course(raw_data_dir="morf-data/", multithread=True):
    """
    Extract features using the Docker image across each course of holdout data.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :param multithread: whether to run the job in parallel across available cores.
    :return: None
    """
    mode = "extract-holdout"
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # call job_runner once per course with --mode=extract and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                holdout_session = fetch_sessions(
                    job_config,
                    raw_data_bucket,
                    raw_data_dir,
                    course,
                    fetch_holdout_session_only=True)[
                        0]  # only use holdout run; unlisted
                poolres = pool.apply_async(run_image, [
                    job_config, raw_data_bucket, course, holdout_session,
                    level, None
                ])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=result_file)
    upload_file_to_s3(result_file,
                      bucket=job_config.proc_data_bucket,
                      key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
Example #17
def test_course(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Test one model per course using the Docker image.
    :param label_type: label type provided by user.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param multithread: whether to run job in parallel (multithread = False can be useful for debugging).
    :return:
    """
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    ## for each bucket, call job_runner once per course with --mode=test and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("[INFO] processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket,
                                         raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                poolres = pool.apply_async(run_image, [
                    job_config, raw_data_bucket, course, None, level,
                    label_type
                ])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config,
                                  filename=generate_archive_filename(
                                      job_config, extension="csv"))
    upload_file_to_s3(result_file,
                      bucket=job_config.proc_data_bucket,
                      key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
Example #18
def fetch_raw_course_data(job_config,
                          bucket,
                          course,
                          session,
                          input_dir,
                          data_dir="morf-data/"):
    """
    Fetch raw course data from job_config.cache_dir, if exists; otherwise fetch from s3.
    :param job_config: MorfJobConfig object
    :param bucket: bucket containing raw data.
    :param course: id of course to download data for.
    :param session: id of session to download data for.
    :param input_dir: input directory.
    :param data_dir: directory in bucket that contains course-level data.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    course_date_file = "coursera_course_dates.csv"
    course_session_cache_dir = os.path.join(job_config.cache_dir, bucket,
                                            data_dir, course, session)
    session_input_dir = os.path.join(input_dir, course, session)
    if job_config.cache_dir:
        try:
            logger.info("copying data from cached location {} to {}".format(
                course_session_cache_dir, session_input_dir))
            shutil.copytree(course_session_cache_dir, session_input_dir)
            course_date_file = os.path.join(job_config.cache_dir, bucket,
                                            data_dir, course_date_file)
            shutil.copy(course_date_file, session_input_dir)
        except Exception as e:
            logger.error(
                "exception while attempting to copy from cache: {}".format(e))
    else:
        download_raw_course_data(job_config,
                                 bucket=bucket,
                                 course=course,
                                 session=session,
                                 input_dir=input_dir,
                                 data_dir=data_dir)
    # unzip all of the sql files and remove any parens from filename
    for item in os.listdir(session_input_dir):
        if item.endswith(".sql.gz"):
            item_path = os.path.join(session_input_dir, item)
            unarchive_res = unarchive_file(item_path, session_input_dir)
            clean_filename(unarchive_res)
    return
Example #19
def collect_course_results(job_config, raw_data_dir="morf-data/"):
    """
    Iterate through course-level directories in bucket, download individual result files from [mode], add a column for course, and concatenate into a single 'master' csv.
    :param job_config: MorfJobConfig object; its mode, raw_data_buckets, and proc_data_bucket determine which results are collected.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :return: path to csv.
    """
    logger = set_logger_handlers(module_logger, job_config)
    raw_data_buckets = job_config.raw_data_buckets
    mode = job_config.mode
    feat_df_list = list()
    for raw_data_bucket in raw_data_buckets:
        for course in fetch_complete_courses(job_config, raw_data_bucket):
            if mode == "extract-holdout":  # results are stored in session-level directories in extract-holdout mode; get this session
                session = fetch_sessions(job_config,
                                         raw_data_bucket,
                                         raw_data_dir,
                                         course,
                                         fetch_holdout_session_only=True)[0]
            else:
                session = None
            with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                logger.info(
                    "fetching {} results for course {} session {}".format(
                        mode, course, session))
                try:
                    fetch_result_file(job_config,
                                      dir=working_dir,
                                      course=course,
                                      session=session)
                    csv = fetch_result_csv_fp(working_dir)
                    feat_df = pd.read_csv(csv, dtype=object)
                    feat_df['course'] = course
                    feat_df_list.append(feat_df)
                except Exception as e:
                    logger.warning("exception occurred: {} ".format(e))
                    continue
    master_feat_df = pd.concat(feat_df_list)
    csv_fp = generate_archive_filename(job_config, extension='csv')
    master_feat_df.to_csv(csv_fp, index=False, header=True)
    return csv_fp
Example #20
def docker_cloud_push(job_config, image_uuid):
    """
    Push image to the Docker Cloud repo in job_config, tagging the image with its morf_id.
    :param job_config: MorfJobConfig object
    :param image_uuid: Docker image uuid
    :return: Docker Cloud repo:tag path of the pushed image (string).
    """
    logger = set_logger_handlers(module_logger, job_config)
    docker_cloud_repo_and_tag_path = "{}:{}".format(
        job_config.docker_cloud_repo, job_config.morf_id)
    # tag the docker image using the morf_id
    tag_cmd = "docker tag {} {}".format(image_uuid,
                                        docker_cloud_repo_and_tag_path)
    execute_and_log_output(tag_cmd, logger)
    # push the image to docker cloud
    push_cmd = "docker push {}".format(docker_cloud_repo_and_tag_path)
    execute_and_log_output(push_cmd, logger)
    return docker_cloud_repo_and_tag_path
Example #21
def clear_s3_subdirectory(job_config, course=None, session=None):
    """
    Clear all files for user_id, job_id, and mode; used to wipe s3 subdirectory before uploading new files.
    :param job_config: MorfJobConfig object.
    :param course:
    :param session:
    :return:
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3_prefix = "/".join([
        x for x in [
            job_config.proc_data_bucket, job_config.user_id, job_config.job_id,
            job_config.mode, course, session
        ] if x is not None
    ]) + "/"
    logger.info(" clearing previous job data at s3://{}".format(s3_prefix))
    delete_s3_keys(job_config, prefix=s3_prefix)
    return
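delete_s3_keys is not shown in these examples; a hedged sketch of prefix-based deletion with boto3 (the helper name and signature are assumptions, not MORF's implementation):

import boto3

def delete_keys_with_prefix(bucket, prefix, aws_access_key_id, aws_secret_access_key):
    # delete every object under prefix in bucket; list_objects_v2 pages hold at most 1000 keys,
    # which matches the per-request limit of delete_objects
    client = boto3.client("s3",
                          aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access_key)
    paginator = client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        keys = [{"Key": obj["Key"]} for obj in page.get("Contents", [])]
        if keys:
            client.delete_objects(Bucket=bucket, Delete={"Objects": keys})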
Example #22
def download_models(job_config, course, dest_dir, level, session=None):
    """
    Download and untar archived file of pre-trained models for specified user_id/job_id/course.
    :param job_config: MorfJobConfig object.
    :param course: course slug for job (string).
    :param dest_dir: location to download models to; this should be /input directory mounted to Docker image.
    :param level: Level for job.
    :param session: Session id for session-level jobs.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    bucket = job_config.proc_data_bucket
    user_id = job_config.user_id
    aws_access_key_id = job_config.aws_access_key_id
    aws_secret_access_key = job_config.aws_secret_access_key
    job_id = job_config.job_id
    if level == "all":
        # just one model file
        mod_archive_file = generate_archive_filename(job_config, mode="train")
        key = make_s3_key_path(job_config,
                               mode="train",
                               filename=mod_archive_file)
        download_model_from_s3(job_config, bucket, key, dest_dir)
    elif level in [
            "course", "session"
    ]:  # model files might be in either course- or session-level directories
        train_files = [
            obj.key for obj in boto3.resource(
                "s3",
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key).Bucket(bucket).
            objects.filter(Prefix="/".join([user_id, job_id, "train"])) if
            ".tgz" in obj.key.split("/")[-1]  # fetch trained model files only
            and "train" in obj.key.split("/")[-1]
            and course in obj.key.split("/")[-1]
        ]
        for key in train_files:
            download_model_from_s3(job_config, bucket, key, dest_dir)
    else:
        msg = "the procedure for executing this job is unsupported in this version of MORF."
        logger.error(msg)
        raise NotImplementedError(msg)
    return
Example #23
def download_model_from_s3(job_config, bucket, key, dest_dir):
    """
    Download and untar a model file from S3, or log an error message if it doesn't exist.
    :param job_config: MorfJobConfig object.
    :param bucket: s3 bucket containing the model archive.
    :param key: s3 key of the model archive.
    :param dest_dir: directory to untar the model file into.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3 = job_config.initialize_s3()
    mod_url = 's3://{}/{}'.format(bucket, key)
    logger.info(
        " downloading compressed model file from bucket {} key {}".format(
            bucket, key))
    try:
        tar_path = initialize_tar(mod_url, s3=s3, dest_dir=dest_dir)
        unarchive_file(tar_path, dest_dir)
    except Exception as e:
        logger.error(
            "error downloading model file from s3 ({}); trained model(s) for this course may not exist. Skipping."
            .format(e))
    return
Example #24
def download_raw_course_data(
        job_config,
        bucket,
        course,
        session,
        input_dir,
        data_dir,
        course_date_file_name="coursera_course_dates.csv"):
    """
    Download all raw course files for course and session into input_dir.
    :param job_config: MorfJobConfig object.
    :param bucket: bucket containing raw data.
    :param course: id of course to download data for.
    :param session: id of session to download data for.
    :param input_dir: input directory.
    :param data_dir: directory in bucket that contains course-level data.
    :param course_date_file_name: name of csv file in bucket which contains course start/end dates.
    :return: None
    """
    s3 = job_config.initialize_s3()
    logger = set_logger_handlers(module_logger, job_config)
    course_date_file_url = "s3://{}/{}/{}".format(bucket, data_dir,
                                                  course_date_file_name)
    session_input_dir = os.path.join(input_dir, course, session)
    os.makedirs(session_input_dir)
    for obj in boto3.resource("s3", aws_access_key_id=job_config.aws_access_key_id, aws_secret_access_key=job_config.aws_secret_access_key)\
            .Bucket(bucket).objects.filter(Prefix="{}/{}/{}/".format(data_dir, course, session)):
        filename = obj.key.split("/")[-1]
        filename = re.sub(r'[\s\(\)":!&]', "", filename)
        filepath = os.path.join(session_input_dir, filename)
        try:
            with open(filepath, "wb") as resource:
                s3.download_fileobj(bucket, obj.key, resource)
        except Exception:
            logger.warning("skipping empty object in bucket {} key {}".format(
                bucket, obj.key))
            continue
    dates_bucket = get_bucket_from_url(course_date_file_url)
    dates_key = get_key_from_url(course_date_file_url)
    dates_file = dates_key.split("/")[-1]
    s3.download_file(dates_bucket, dates_key,
                     os.path.join(session_input_dir, dates_file))
    return
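The exception handler in the download loop mostly guards against zero-byte "folder" placeholder objects; a sketch of a variant that filters those keys up front instead (bucket and prefix names are illustrative):

import os
import boto3

def download_prefix(bucket_name, prefix, dest_dir, aws_access_key_id, aws_secret_access_key):
    s3 = boto3.resource("s3",
                        aws_access_key_id=aws_access_key_id,
                        aws_secret_access_key=aws_secret_access_key)
    for obj in s3.Bucket(bucket_name).objects.filter(Prefix=prefix):
        if obj.key.endswith("/") or obj.size == 0:  # skip directory placeholders and empty objects
            continue
        dest_path = os.path.join(dest_dir, os.path.basename(obj.key))
        s3.Bucket(bucket_name).download_file(obj.key, dest_path)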
Example #25
def cross_validate_course(label_type, k=5, multithread=True):
    """
    Compute k-fold cross-validation across courses.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: whether to run the job in parallel across available cores.
    :return: None
    """
    # todo: call to create_course_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    # clear previous test results
    clear_s3_subdirectory(job_config, mode="test")
    docker_image_dir = os.getcwd()  # directory the function is called from; should contain docker image
    logger = set_logger_handlers(module_logger, job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("conducting cross validation")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for fold_num in range(1, k + 1):
                    poolres = pool.apply_async(execute_image_for_cv, [
                        job_config, raw_data_bucket, course, fold_num,
                        docker_image_dir, label_type
                    ])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    test_csv_fp = collect_course_cv_results(job_config)
    pred_key = make_s3_key_path(job_config,
                                os.path.basename(test_csv_fp),
                                mode="test")
    upload_file_to_s3(test_csv_fp,
                      job_config.proc_data_bucket,
                      pred_key,
                      job_config,
                      remove_on_success=True)
    return
Example #26
def fetch_file(s3,
               dest_dir,
               remote_file_url,
               dest_filename=None,
               job_config=None):
    """

    :param s3: boto3.client object for s3 connection.
    :param dest_dir: directory to download file to (string).
    :param remote_file_url: url of remote file; must be in file://, s3://, or https:// format (string).
    :param dest_filename: base name of file to use (otherwise defaults to current file name) (string).
    :param job_config: MorfJobConfig object; used for logging.
    :return:
    """
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("retrieving file {} to {}".format(remote_file_url, dest_dir))
    try:
        if not dest_filename:
            dest_filename = os.path.basename(remote_file_url)
        url = urlparse(remote_file_url)
        if url.scheme == "file":
            shutil.copyfile(url.path, os.path.join(dest_dir, dest_filename))
        elif url.scheme == "s3":
            bucket = url.netloc
            key = url.path[1:]  # ignore initial /
            download_from_s3(bucket,
                             key,
                             s3,
                             dest_dir,
                             dest_filename=dest_filename)
        elif url.scheme == "https":
            urllib.request.urlretrieve(remote_file_url,
                                       os.path.join(dest_dir, dest_filename))
        else:
            logger.error(
                "unsupported URL scheme for file location {}; must be s3://, file://, or https://"
                .format(remote_file_url))
            sys.exit(-1)
    except Exception as e:
        logger.error("{} when attempting to fetch and copy file at {}".format(
            e, remote_file_url))
    return
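A brief usage sketch of the three supported URL schemes (bucket names and paths are hypothetical):

s3 = job_config.initialize_s3()
fetch_file(s3, "/tmp/morf", "file:///opt/morf/client.config", job_config=job_config)
fetch_file(s3, "/tmp/morf", "s3://my-proc-bucket/user/job/docker_image",
           dest_filename="docker_image", job_config=job_config)
fetch_file(s3, "/tmp/morf", "https://example.org/controller.py", job_config=job_config)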
Example #27
def upload_file_to_s3(file, bucket, key, job_config=None):
    """
    Upload file to bucket + key in S3.
    :param file: name or path to file.
    :param bucket: bucket to upload to.
    :param key: key to upload to in bucket.
    :param job_config: MorfJobConfig object; used for logging.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    session = boto3.Session()
    s3_client = session.client("s3")
    tc = boto3.s3.transfer.TransferConfig()
    t = boto3.s3.transfer.S3Transfer(client=s3_client, config=tc)
    logger.info("uploading {} to s3://{}/{}".format(file, bucket, key))
    try:
        t.upload_file(file, bucket, key)
    except Exception as e:
        logger.warning("error uploading {} to s3://{}/{}: {}".format(file, bucket, key, e))
    return
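boto3's TransferConfig accepts tuning parameters for large uploads; a sketch of a tuned transfer (the thresholds and object names are illustrative, not MORF defaults):

import boto3
import boto3.s3.transfer

s3_client = boto3.Session().client("s3")
tc = boto3.s3.transfer.TransferConfig(
    multipart_threshold=64 * 1024 * 1024,   # switch to multipart uploads above 64 MB
    multipart_chunksize=16 * 1024 * 1024,   # 16 MB parts
    max_concurrency=4,
    use_threads=True)
t = boto3.s3.transfer.S3Transfer(client=s3_client, config=tc)
t.upload_file("results.tgz", "my-proc-bucket", "user_id/job_id/results.tgz")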
Example #28
def train_session(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Train one model per session of the course using the Docker image.
    :param label_type:  label type provided by user.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param multithread: whether to run job in parallel (multithread = False can be useful for debugging).
    :return: None
    """
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # for each bucket, call job_runner once per session with --mode=train and --level=session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket,
                                         raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                for session in fetch_sessions(job_config, raw_data_bucket,
                                              raw_data_dir, course):
                    poolres = pool.apply_async(run_image, [
                        job_config, raw_data_bucket, course, session, level,
                        label_type
                    ])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    send_email_alert(job_config)
    return
Example #29
def generate_zenodo_metadata(job_config, deposition_id):
    """
    Create metadata for a MORF job.
    :param job_config:
    :param deposition_id:
    :return:
    """
    logger = set_logger_handlers(module_logger, job_config)
    data = {
        'metadata': {
            'title': 'MORF job id {}'.format(job_config.morf_id),
            'upload_type': 'software',
            'description': 'Job files for job id {} from the MOOC Replication Framework'.format(job_config.morf_id),
            'creators': [{'name': '{}'.format(job_config.user_id), 'affiliation': 'None'}]
        }
    }
    headers = {"Content-Type": "application/json"}
    r = requests.put('https://zenodo.org/api/deposit/depositions/%s' % deposition_id,
                     params = {'access_token': getattr(job_config, "zenodo_access_token")},
                     data = json.dumps(data), headers = headers)
    logger.info(r.json())
    return 
Example #30
def fetch_from_cache(job_config, cache_file_path, dest_dir):
    """
    Fetch a file from the cache for job_config into dest_dir, if it exists.
    :param job_config:
    :param cache_file_path: string, relative path to file in cache (this is identical to the directory path in s3; e.g. "/bucket/path/to/somefile.csv").
    :param dest_dir: absolute path of directory to fetch file into (will be created if not exists)
    :return: path to fetched file (string); return None if cache is not used.
    """
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("fetching file {} from cache".format(cache_file_path))
    cache_dir = getattr(job_config, "cache_dir", None)
    abs_cache_file_path = os.path.join(cache_dir, cache_file_path) if cache_dir else None
    if abs_cache_file_path and os.path.exists(abs_cache_file_path):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        dest_fp = shutil.copy(abs_cache_file_path, dest_dir)
    else:
        logger.warning(
            "file {} does not exist in cache".format(abs_cache_file_path))
        dest_fp = None
    return dest_fp