Code example #1
File: caching.py  Project: scarescrow/morf
def docker_cloud_login(job_config):
    """
    Log into docker cloud using creds in job_config.
    :param job_config: MorfJobConfig object.
    :return: None
    """
    cmd = "docker login --username={} --password={}".format(
        job_config.docker_cloud_username, job_config.docker_cloud_password)
    logger = set_logger_handlers(module_logger, job_config)
    execute_and_log_output(cmd, logger)
    return
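
A caveat worth noting: passing --password on the command line exposes the credential in the process table and shell history. Below is a hedged sketch of a variant that pipes the password via stdin instead; the direct subprocess.run call stands in for execute_and_log_output and is an assumption, not the project's helper.

import subprocess

def docker_cloud_login_stdin(job_config):
    # Sketch only: same login as above, but the password is fed on stdin
    # (docker's --password-stdin flag) so it never appears in argv.
    cmd = ["docker", "login",
           "--username", job_config.docker_cloud_username,
           "--password-stdin"]
    subprocess.run(cmd, input=job_config.docker_cloud_password.encode(),
                   check=True)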
Code example #2
File: caching.py  Project: scarescrow/morf
def docker_cloud_push(job_config, image_uuid):
    """
    Push image to the Docker Cloud repo in job_config, tagging the image with its morf_id.
    :param job_config: MorfJobConfig object.
    :param image_uuid: Docker image uuid.
    :return: the Docker Cloud repo:tag path the image was pushed to.
    """
    logger = set_logger_handlers(module_logger, job_config)
    docker_cloud_repo_and_tag_path = "{}:{}".format(
        job_config.docker_cloud_repo, job_config.morf_id)
    # tag the docker image using the morf_id
    tag_cmd = "docker tag {} {}".format(image_uuid,
                                        docker_cloud_repo_and_tag_path)
    execute_and_log_output(tag_cmd, logger)
    # push the image to docker cloud
    push_cmd = "docker push {}".format(docker_cloud_repo_and_tag_path)
    execute_and_log_output(push_cmd, logger)
    return docker_cloud_repo_and_tag_path
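
Read together with example #1, the intended flow is: log in, then tag and push. A minimal usage sketch, assuming a MorfJobConfig instance and a built image uuid are already in hand (their construction is not shown in these examples):

# Sketch: `job_config` and `image_uuid` are assumed to exist already.
docker_cloud_login(job_config)                        # docker login
repo_tag = docker_cloud_push(job_config, image_uuid)  # docker tag + docker push
print("[INFO] pushed image to {}".format(repo_tag))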
Code example #3
File: job_runner_utils.py  Project: scarescrow/morf
def run_image(job_config,
              raw_data_bucket,
              course=None,
              session=None,
              level=None,
              label_type=None):
    """
    Run a docker image with the specified parameters, initializing any data as necessary and archiving results to s3.
    :param docker_url: URL for a built and compressed (.tar) docker image
    :param user_id: unique user id (string).
    :param job_id: unique job id (string).
    :param mode: mode to run image in; {extract, extract-holdout, train, test} (string).
    :param raw_data_bucket: raw data bucket; specify multiple buckets only if level == all.
    :param course: Coursera course slug or course shortname (string).
    :param session: 3-digit course session number (for trained model or extraction).
    :param level: level of aggregation of MORF API function; {session, course, all} (string).
    :param label_type: type of outcome label to use (required for model training and testing) (string).
    :return:
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3 = job_config.initialize_s3()
    # create local directory for processing on this instance
    with tempfile.TemporaryDirectory(
            dir=job_config.local_working_directory) as working_dir:
        try:
            fetch_file(s3,
                       working_dir,
                       job_config.docker_url,
                       dest_filename="docker_image")
        except Exception as e:
            logger.error("[ERROR] Error downloading file {} to {}: {}".format(
                job_config.docker_url, working_dir, e))
        input_dir, output_dir = initialize_input_output_dirs(working_dir)
        # fetch any data or models needed
        if "extract" in job_config.mode:  # download raw data
            initialize_raw_course_data(job_config,
                                       raw_data_bucket=raw_data_bucket,
                                       mode=job_config.mode,
                                       course=course,
                                       session=session,
                                       level=level,
                                       input_dir=input_dir)
            job_config.mode = "extract"  # sets mode to "extract" in case of "extract-holdout"
        # fetch training/testing data
        if job_config.mode in ["train", "test"]:
            sync_s3_job_cache(job_config)
            initialize_train_test_data(job_config,
                                       raw_data_bucket=raw_data_bucket,
                                       level=level,
                                       label_type=label_type,
                                       course=course,
                                       session=session,
                                       input_dir=input_dir)
        if job_config.mode == "test":  # fetch models and untar
            download_models(job_config,
                            course=course,
                            session=session,
                            dest_dir=input_dir,
                            level=level)
        image_uuid = load_docker_image(dir=working_dir,
                                       job_config=job_config,
                                       logger=logger)
        # build docker run command and execute the image
        cmd = make_docker_run_command(job_config,
                                      job_config.docker_exec,
                                      input_dir,
                                      output_dir,
                                      image_uuid,
                                      course,
                                      session,
                                      job_config.mode,
                                      client_args=job_config.client_args)
        execute_and_log_output(cmd, logger)
        # cleanup
        execute_and_log_output(
            "{} rmi --force {}".format(job_config.docker_exec, image_uuid),
            logger)
        # archive and write output
        archive_file = make_output_archive_file(output_dir,
                                                job_config,
                                                course=course,
                                                session=session)
        move_results_to_destination(archive_file,
                                    job_config,
                                    course=course,
                                    session=session)
    return
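
For reference, a hedged usage sketch of a session-level training run; the bucket name, course slug, and label type below are placeholders, and only the call shape comes from the signature above:

# Hypothetical argument values; only the parameter names are from the source.
run_image(job_config,
          raw_data_bucket="my-raw-data-bucket",  # placeholder bucket name
          course="example-course",               # placeholder course slug
          session="001",                         # 3-digit session number
          level="session",
          label_type="dropout")                  # placeholder label type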
Code example #4
def execute_image_for_cv(job_config,
                         raw_data_bucket,
                         course,
                         fold_num,
                         docker_image_dir,
                         label_type,
                         raw_data_dir="morf-data/"):
    """

    :param job_config:
    :param raw_data_bucket:
    :param course:
    :param fold_num:
    :param docker_image_dir:
    :param label_type:
    :param raw_data_dir:
    :return:
    """
    user_id_col = "userID"
    logger = set_logger_handlers(module_logger, job_config)
    with tempfile.TemporaryDirectory(
            dir=job_config.local_working_directory) as working_dir:
        input_dir, output_dir = initialize_input_output_dirs(working_dir)
        # get fold train data
        course_input_dir = os.path.join(input_dir, course)
        trainkey = make_s3_key_path(
            job_config, course, make_feature_csv_name(course, fold_num,
                                                      "train"))
        train_data_path = download_from_s3(job_config.proc_data_bucket,
                                           trainkey,
                                           job_config.initialize_s3(),
                                           dir=course_input_dir,
                                           job_config=job_config)
        testkey = make_s3_key_path(
            job_config, course, make_feature_csv_name(course, fold_num,
                                                      "test"))
        test_data_path = download_from_s3(job_config.proc_data_bucket,
                                          testkey,
                                          job_config.initialize_s3(),
                                          dir=course_input_dir,
                                          job_config=job_config)
        # get labels
        train_users = pd.read_csv(train_data_path)[user_id_col]
        train_labels_path = initialize_cv_labels(job_config,
                                                 train_users,
                                                 raw_data_bucket,
                                                 course,
                                                 label_type,
                                                 input_dir,
                                                 raw_data_dir,
                                                 fold_num,
                                                 "train",
                                                 level="course")
        # run docker image with mode == cv
        image_uuid = load_docker_image(docker_image_dir, job_config, logger)
        cmd = make_docker_run_command(
            job_config, job_config.docker_exec, input_dir, output_dir,
            image_uuid, course, None, "cv",  # run the image in cross-validation mode
            job_config.client_args) + " --fold_num {}".format(fold_num)
        execute_and_log_output(cmd, logger)
        # upload results
        pred_csv = os.path.join(output_dir,
                                "{}_{}_test.csv".format(course, fold_num))
        pred_key = make_s3_key_path(job_config,
                                    course,
                                    os.path.basename(pred_csv),
                                    mode="test")
        upload_file_to_s3(pred_csv,
                          job_config.proc_data_bucket,
                          pred_key,
                          job_config,
                          remove_on_success=True)
    return
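
execute_image_for_cv handles a single fold, so the natural caller loops over folds per course. A minimal driver sketch under that assumption; the fold count is a placeholder, and only fetch_complete_courses is taken from these examples:

# Assumed driver loop; num_folds and the docker_image_dir/label_type bindings
# are placeholders, not taken from the project source.
num_folds = 5
for course in fetch_complete_courses(job_config, raw_data_bucket):
    for fold_num in range(1, num_folds + 1):
        execute_image_for_cv(job_config, raw_data_bucket, course, fold_num,
                             docker_image_dir, label_type)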
Code example #5
    for course in fetch_complete_courses(job_config, raw_data_bucket):
        for session in fetch_sessions(job_config, raw_data_bucket, data_dir=MORF_DATA_DIR, course=course,
                                      fetch_all_sessions=True):
            with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                print("[INFO] processing course {} session {}".format(course, session))
                # download the data exports
                fetch_raw_course_data(job_config, raw_data_bucket, course, session, input_dir=working_dir)
                # download_raw_course_data(job_config, raw_data_bucket, course=course, session=session, input_dir=working_dir,
                #                          data_dir=MORF_DATA_DIR[:-1]) # drop trailing slash on data dir
                # create docker run command and load image
                image_uuid = load_docker_image(MYSQL_DOCKER_DIR, job_config, logger, image_name=MYSQL_DOCKER_IMG_NAME)
                cmd = make_docker_run_command(job_config, job_config.docker_exec, working_dir, OUTPUT_DIR,
                                              image_uuid, course=course, session=session, mode=None,
                                              client_args=None)
                # run the docker image, make sure to pass params for course and session
                execute_and_log_output(cmd, logger)

# concatenate into single file in OUTPUT_DIR
df_list = []
for f in os.listdir(OUTPUT_DIR):
    if f.endswith(".csv") and not f.startswith("."):  # only use csv files, ignore system files
        print("[INFO] reading file {}".format(f))
        # pull course name and session from filename
        regex = re.compile(r"^hash_mapping_(\S+)_([0-9\-]+).*")
        res = re.search(regex, f)
        course = res.group(1)
        session = res.group(2)
        # read dataframe and add columns for course and session
        df = pd.read_csv(os.path.join(OUTPUT_DIR, f))
        df["course"] = course
        df["session"] = session