import os
import re
import pandas as pd
from multiprocessing import Pool


if __name__ == '__main__':
    # Script entry point: parse the command-line arguments and dispatch on
    # --mode to feature extraction, model training, or model testing.
    parser = argparse.ArgumentParser(description='execute feature extraction, training, or testing.')
    parser.add_argument('-c', '--course_id', required=True, help='an s3 pointer to a course')
    parser.add_argument('-r', '--run_number', required=False, help='3-digit course run number')
    parser.add_argument('-m', '--mode', required=True, help='mode to run image in; {extract, train, test}')
    args = parser.parse_args()
    # The modes are mutually exclusive, so a single if/elif chain is used;
    # an unrecognised mode falls through and does nothing.
    if args.mode == 'extract':
        # this block expects individual session-level data mounted by extract_session() and outputs one CSV file per session in /output
        # set up the mysql database
        extract_coursera_sql_data(args.course_id, args.run_number)
        extract_features(course_name=args.course_id, run_number=args.run_number)
    elif args.mode == 'train':
        # this block expects session-level data mounted by train_session() and outputs one model file per session in /output
        # The command is passed as an argument vector (no shell), so a course
        # id or run number containing whitespace or shell metacharacters
        # reaches Rscript as one literal argument instead of being interpreted
        # by the shell. --run_number is optional; when omitted, the text
        # "None" is forwarded, exactly as the formatted string did before.
        cmd = [
            "Rscript",
            "/modeling/train_model_morf_test_session.R",
            "--course",
            str(args.course_id),
            "--session",
            str(args.run_number),
        ]
        # The Rscript exit status is returned by call() but not inspected.
        subprocess.call(cmd)
    elif args.mode == 'test':
        # this block expects session-level data and models mounted by test_course() and outputs one csv of predictions per course in /output, using only data from most recent iteration of course.
        # Argument vector instead of a shell string; see the train branch.
        cmd = [
            "Rscript",
            "/modeling/test_model_morf_test_session.R",
            "--course",
            str(args.course_id),
        ]
        subprocess.call(cmd)





# Example #2
0
from feature_extraction import fetch_courses_and_sessions, aggregate_output_csvs

if __name__ == '__main__':
    # Script entry point: parse the command-line arguments and dispatch on
    # --mode to feature extraction, model training, or model testing.
    parser = argparse.ArgumentParser(
        description='execute feature extraction, training, or testing.')
    parser.add_argument('-m',
                        '--mode',
                        required=True,
                        help='mode to run image in; {extract, train, test}')
    parser.add_argument('--course_id', required=True)
    parser.add_argument('--run_number', required=False)
    args = parser.parse_args()
    # The modes are mutually exclusive, so a single if/elif chain is used;
    # an unrecognised mode falls through and does nothing.
    if args.mode == 'extract':
        # this block expects individual session data mounted by extract_session() and outputs one CSV file in /output
        # get list of courses and sessions from course-level directories in /input
        for course, session in fetch_courses_and_sessions():
            # set up the mysql database
            extract_coursera_sql_data(course, session)
            extract_features(course_name=course, run_number=session)
        # Runs once, after every (course, session) pair has been extracted.
        aggregate_output_csvs()
    elif args.mode == 'train':
        # this block expects all data mounted by train_course() and outputs one model in /output
        # The command is passed as an argument vector (no shell), so a course
        # id containing whitespace or shell metacharacters reaches Rscript as
        # one literal argument instead of being interpreted by the shell.
        cmd = [
            "Rscript",
            "/modeling/train_model_course.R",
            "--course",
            str(args.course_id),
        ]
        # The Rscript exit status is returned by call() but not inspected.
        subprocess.call(cmd)
    elif args.mode == 'test':
        # this block expects course-level data and models mounted by test_course() and outputs one csv of predictions for all courses in /output
        # Argument vector instead of a shell string; see the train branch.
        cmd = [
            "Rscript",
            "/modeling/test_model_course.R",
            "--course",
            str(args.course_id),
        ]
        subprocess.call(cmd)
# Example #3
0
Note that this script uses the --mode parameter from docker run to control the flow of extraction, training, and testing.

This script is structured to utilize the input and output contract of the extract_session() and train_course() functions from the MORF API.
"""

import argparse
import subprocess

from feature_extraction.mwe_feature_extractor import main as extract_features
from feature_extraction.sql_utils import extract_coursera_sql_data

if __name__ == "__main__":
    # Script entry point: parse the command-line arguments and dispatch on
    # --mode to feature extraction, model training, or model testing.
    # An unrecognised mode falls through and does nothing.
    parser = argparse.ArgumentParser(description="execute feature extraction, training, or testing.")
    parser.add_argument("-c", "--course", required=True, help="an s3 pointer to a course")
    parser.add_argument("-r", "--session", required=False, help="3-digit course run number")
    parser.add_argument("-m", "--mode", required=True, help="mode to run image in; {extract, train, test}")
    args = parser.parse_args()
    if args.mode == "extract":
        # this block expects individual session-level data mounted by extract_session() and outputs one CSV file per session in /output
        # set up the mysql database
        extract_coursera_sql_data(args.course, args.session)
        extract_features(course=args.course, session=args.session)
    elif args.mode == "train":
        # this block expects course-level data mounted by train_course() and outputs one model file per course in /output
        # The command is passed as an argument vector (no shell), so a course
        # value containing whitespace or shell metacharacters reaches Rscript
        # as one literal argument instead of being interpreted by the shell.
        cmd = [
            "Rscript",
            "/modeling/train_model_morf_mwe.R",
            "--course",
            str(args.course),
        ]
        # The Rscript exit status is returned by call() but not inspected.
        subprocess.call(cmd)
    elif args.mode == "test":
        # this block expects course-level data and models mounted by test_course() and outputs one csv of predictions per course in /output
        # Argument vector instead of a shell string; see the train branch.
        cmd = [
            "Rscript",
            "/modeling/test_model_morf_mwe.R",
            "--course",
            str(args.course),
        ]
        subprocess.call(cmd)