def cleanup_old_jobs():
    for file_path in Path(SUBMITTY_DATA_DIR, "autograding_DONE").glob("*"):
        file_path = str(file_path)
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="Remove autograding DONE file: " + file_path)
        try:
            os.remove(file_path)
        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, JOB_ID, trace=traceback.format_exc())
def read_autograding_worker_json():
    try:
        with open(ALL_WORKERS_JSON, 'r') as infile:
            name_and_stats = json.load(infile)
            #grab the key and the value. NOTE: For now there should only ever be one pair.
            name = list(name_and_stats.keys())[0]
            stats = name_and_stats[name]
    except Exception as e:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, trace=traceback.format_exc())
        raise SystemExit("ERROR loading autograding_worker.json file: {0}".format(e))
    return name, stats
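For reference, a minimal sketch of the single-entry JSON this function expects. The field names are inferred from how other functions in this listing use the entry (username, address, capabilities, num_autograding_workers, enabled, plus the fields added by add_fields_to_autograding_worker_json); the actual schema on a real installation may differ.
# Hypothetical contents of ALL_WORKERS_JSON (illustrative placeholders only).
example_worker_json = {
    "worker_name": {                       # exactly one name/stats pair is expected
        "username": "submitty",            # assumed placeholder
        "address": "192.168.1.10",         # assumed placeholder
        "capabilities": ["default"],
        "num_autograding_workers": 5,
        "enabled": True,
        # fields added later by add_fields_to_autograding_worker_json():
        "server_name": "primary.example.edu",
        "primary_commit": "abc123",
        "most_recent_tag": "v1.0.0",
    }
}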
Example 3
def add_fields_to_autograding_worker_json(autograding_worker_json, entry):

    submitty_config  = os.path.join(SUBMITTY_INSTALL_DIR, 'config', 'version.json')

    try:
        with open(submitty_config) as infile:
            submitty_details = json.load(infile)
            installed_commit = submitty_details['installed_commit']
            most_recent_tag  = submitty_details['most_recent_git_tag']
    except FileNotFoundError as e:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, trace=traceback.format_exc())
        raise SystemExit("ERROR, could not locate the submitty.json:", e)

    autograding_worker_json[entry]['server_name']     = socket.getfqdn()
    autograding_worker_json[entry]['primary_commit']  = installed_commit
    autograding_worker_json[entry]['most_recent_tag'] = most_recent_tag
    return autograding_worker_json
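A short, hedged usage sketch: the only facts assumed about config/version.json are the two keys read above (installed_commit and most_recent_git_tag); the entry name and values below are placeholders.
# Hypothetical usage (not meant to run at import time; placeholder values).
# config/version.json is assumed to contain at least:
#   {"installed_commit": "<git sha>", "most_recent_git_tag": "<tag>"}
entry_name = "primary"                                            # assumed worker name
worker_entry = {entry_name: {"address": "localhost", "username": ""}}
worker_entry = add_fields_to_autograding_worker_json(worker_entry, entry_name)
# worker_entry[entry_name] now also carries server_name, primary_commit, and most_recent_tag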
Example 4
def shipper_process(my_name, my_data, full_address, which_untrusted,
                    overall_lock):
    """
    Each shipper process spins in a loop, looking for a job that
    matches the capabilities of this machine, and then oversees the
    autograding of that job.  Interactive jobs are prioritized over
    batch (regrade) jobs.  If no jobs are available, the shipper waits
    for an event indicating that one of the queues has been modified.
    """

    which_machine = full_address
    my_capabilities = my_data[my_name]['capabilities']

    # ignore keyboard interrupts in the shipper processes
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    counter = 0
    while True:
        try:
            my_job = get_job(my_name, which_machine, my_capabilities,
                             which_untrusted, overall_lock)
            if not my_job == "":
                counter = 0
                grade_queue_file(my_name, which_machine, which_untrusted,
                                 os.path.join(INTERACTIVE_QUEUE, my_job))
                continue
            else:
                if counter == 0 or counter >= 10:
                    print("{0} {1}: no available job".format(
                        my_name, which_untrusted))
                    counter = 0
                counter += 1
                time.sleep(1)

        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH,
                                              job_id=JOB_ID,
                                              trace=traceback.format_exc())
            my_message = "ERROR in get_job {0} {1} {2}. For more details, see traces entry".format(
                which_machine, which_untrusted, str(e))
            print(my_message)
            autograding_utils.log_message(AUTOGRADING_LOG_PATH,
                                          JOB_ID,
                                          message=my_message)
            time.sleep(1)
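As a rough usage sketch, mirroring what launch_shippers() does later in this listing, a single shipper process might be started as follows; the machine name, data, and address are placeholders.
# Hypothetical launch of one shipper process (placeholder data; not run at import time).
# multiprocessing is already imported by this module (it is used in launch_shippers).
machine_name = "primary"                                          # assumed
machine_data = {machine_name: {"capabilities": ["default"]}}      # minimal placeholder entry
overall_lock = multiprocessing.Lock()
p = multiprocessing.Process(
    target=shipper_process,
    args=(machine_name, machine_data, "localhost", "untrusted00", overall_lock))
p.start()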
Example 5
def update_all_foreign_autograding_workers():
    success_map = dict()
    all_workers_json = os.path.join(SUBMITTY_INSTALL_DIR, 'config', "autograding_workers.json")

    try:
        with open(all_workers_json, 'r') as infile:
            autograding_workers = json.load(infile)
    except FileNotFoundError as e:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, trace=traceback.format_exc())
        raise SystemExit("ERROR, could not locate autograding_workers_json :", e)

    for key, value in autograding_workers.items():
        if not value['enabled']:
            continue
        formatted_entry = {key: value}
        formatted_entry = add_fields_to_autograding_worker_json(formatted_entry, key)
        success = update_worker_json(key, formatted_entry)
        success_map[key] = success
    return success_map
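A brief usage sketch: the returned map records, per enabled worker, whether the updated JSON was delivered, so callers can report failures.
# Hypothetical usage of the returned success map (not run at import time).
success_map = update_all_foreign_autograding_workers()
for worker_name, ok in success_map.items():
    if not ok:
        print("WARNING: failed to update autograding_worker.json on {0}".format(worker_name))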
Example 6
def grade_queue_file(my_name, which_machine, which_untrusted, queue_file):
    """
    Oversees the autograding of a single item from the queue

    :param queue_file: details of what to grade
    :param which_machine: name of machine to send this job to (might be "localhost")
    :param which_untrusted: specific untrusted user for this autograding job
    """

    my_dir, my_file = os.path.split(queue_file)
    pid = os.getpid()
    directory = os.path.dirname(os.path.realpath(queue_file))
    name = os.path.basename(os.path.realpath(queue_file))
    grading_file = os.path.join(directory, "GRADING_" + name)

    #TODO: break which_machine into id, address, and passphrase.
    
    try:
        # prepare the job; retry every 5 seconds until the zip files are shipped
        while not prepare_job(my_name, which_machine, which_untrusted, my_dir, queue_file):
            time.sleep(5)

        # then wait for grading to be completed
        shipper_counter = 0
        while not unpack_job(which_machine, which_untrusted, my_dir, queue_file):
            shipper_counter += 1
            time.sleep(1)
            if shipper_counter >= 10:
                print(my_name, which_untrusted, "shipper wait for grade: ", queue_file)
                shipper_counter = 0

    except Exception as e:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
        print (my_name, " ERROR attempting to grade item: ", queue_file, " exception=",str(e))
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message=str(my_name)+" ERROR attempting to grade item: " + queue_file + " exception " + repr(e))

    # note: not necessary to acquire lock for these statements, but
    # make sure you remove the queue file, then the grading file
    try:
        os.remove(queue_file)
    except Exception as e:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
        print (my_name, " ERROR attempting to remove queue file: ", queue_file, " exception=",str(e))
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message=str(my_name)+" ERROR attempting to remove queue file: " + queue_file + " exception=" + str(e))
    try:
        os.remove(grading_file)
    except Exception as e:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
        print (my_name, " ERROR attempting to remove grading file: ", grading_file, " exception=",str(e))
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message=str(my_name)+" ERROR attempting to remove grading file: " + grading_file + " exception=" + str(e))
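For context, a hedged sketch of what a queue file might contain, based only on the fields that prepare_job() and unpack_job() in this listing read or add; every value below is a placeholder.
# Hypothetical queue file payload (keys inferred from this listing; values are placeholders).
example_queue_obj = {
    "semester": "f23",
    "course": "csci1100",
    "gradeable": "hw01",
    "who": "student01",
    "version": 3,
    "regrade": False,                 # True marks a batch/regrade job
    # added by prepare_job() just before shipping:
    # "which_untrusted": "untrusted00",
    # "which_machine": "localhost",
    # "ship_time": "<submitty date string>",
}
# While a job is in flight, grade_queue_file() keeps a marker file named
# "GRADING_" + <queue file name> next to the queue file.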
Example 7
def establish_ssh_connection(my_name, user, host, only_try_once=False):
    """
    Returns a connected paramiko ssh session.
    Tries to connect until a connection is established, unless only_try_once
    is set to true. If only_try_once is true, raise whatever connection error is thrown.
    """
    connected = False
    ssh = None
    retry_delay = .1
    while not connected:
        ssh = paramiko.SSHClient()
        ssh.get_host_keys()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            ssh.connect(hostname = host, username = user, timeout=10)
            connected = True
        except Exception:
            if only_try_once:
                raise
            time.sleep(retry_delay)
            retry_delay = min(10, retry_delay * 2)
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message=f"{my_name} Could not establish connection with {user}@{host} going to re-try.")
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
    return ssh
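A brief usage sketch with a placeholder user and host; with only_try_once=True (as update_worker_json() uses it) a failed connection raises immediately instead of retrying.
# Hypothetical usage (placeholder user/host; not run at import time).
try:
    ssh = establish_ssh_connection("shipper", "submitty", "worker1.example.edu", only_try_once=True)
    sftp = ssh.open_sftp()
    # ... transfer files with sftp.put()/sftp.get() ...
    sftp.close()
    ssh.close()
except Exception as e:
    print("could not connect: {0}".format(e))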
Example 8
def update_worker_json(name, entry):

    fd, tmp_json_path = tempfile.mkstemp()
    foreign_json = os.path.join(SUBMITTY_DATA_DIR, "autograding_TODO", "autograding_worker.json")
    autograding_worker_to_ship = entry

    try:
        user = autograding_worker_to_ship[name]['username']
        host = autograding_worker_to_ship[name]['address']
    except Exception as e:
        print("ERROR: autograding_workers.json entry for {0} is malformatted. {1}".format(e, name))
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: autograding_workers.json entry for {0} is malformed. {1}".format(e, name))
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
        return False

    #create a new temporary json with only the entry for the current machine.
    with open(tmp_json_path, 'w') as outfile:
        json.dump(autograding_worker_to_ship, outfile, sort_keys=True, indent=4)
    #if we are updating the current machine, we can just move the new json to the appropriate spot (no ssh needed)
    if host == "localhost":
        try:
            shutil.move(tmp_json_path,foreign_json)
            print("Successfully updated local autograding_TODO/autograding_worker.json")
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="Successfully updated local autograding_TODO/autograding_worker.json")
            return True
        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: could not mv to local autograding_TODO/autograding_worker.json due to the following error: "+str(e))
            print("ERROR: could not mv to local autograding_worker.json due to the following error: {0}".format(e))
            return False
        finally:
            os.close(fd)
    #if we are updating a foreign machine, we must connect via ssh and use sftp to update it.
    else:
        #try to establish an ssh connection to the host
        try:
            ssh = establish_ssh_connection(None, user, host, only_try_once = True)
        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: could not ssh to {0}@{1} due to following error: {2}".format(user, host,str(e)))
            print("ERROR: could not ssh to {0}@{1} due to following error: {2}".format(user, host,str(e)))
            return False
        #try to copy the files over to the host
        sftp = None
        try:
            sftp = ssh.open_sftp()
            sftp.put(tmp_json_path, foreign_json)
            print("Successfully forwarded autograding_worker.json to {0}".format(name))
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="Successfully forwarded autograding_worker.json to {0}".format(name))
            success = True
        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: could not sftp to foreign autograding_TODO/autograding_worker.json due to the following error: "+str(e))
            print("ERROR: could not sftp to foreign autograding_TODO/autograding_worker.json due to the following error: {0}".format(e))
            success = False
        finally:
            os.close(fd)
            os.remove(tmp_json_path)
            if sftp:
                sftp.close()
            ssh.close()
            return success
Example 9
def launch_shippers(worker_status_map):
    # verify the DAEMON_USER is running this script
    if not int(os.getuid()) == int(DAEMON_UID):
        raise SystemExit("ERROR: the submitty_autograding_shipper.py script must be run by the DAEMON_USER")
    autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="grade_scheduler.py launched")

    # Clean up old files from previous shipping/autograding (any
    # partially completed work will be re-done)
    for file_path in Path(INTERACTIVE_QUEUE).glob("GRADING_*"):
        file_path = str(file_path)
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="Remove old queue file: " + file_path)
        os.remove(file_path)

    for file_path in Path(SUBMITTY_DATA_DIR, "autograding_TODO").glob("untrusted*"):
        file_path = str(file_path)
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="Remove autograding TODO file: " + file_path)
        os.remove(file_path)
    for file_path in Path(SUBMITTY_DATA_DIR, "autograding_DONE").glob("*"):
        file_path = str(file_path)
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="Remove autograding DONE file: " + file_path)
        os.remove(file_path)

    # this lock will be used to edit the queue or new job event
    overall_lock = multiprocessing.Lock()

    # The names of the worker machines, the capabilities of each
    # worker machine, and the number of workers per machine are stored
    # in the autograding_workers json.
    try:
        autograding_workers_path = os.path.join(SUBMITTY_INSTALL_DIR, 'config', "autograding_workers.json")
        with open(autograding_workers_path, 'r') as infile:
            autograding_workers = json.load(infile)
    except Exception as e:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
        raise SystemExit("ERROR: could not locate the autograding workers json: {0}".format(e))

    # There must always be a primary machine, it may or may not have
    # autograding workers.
    if not "primary" in autograding_workers:
        raise SystemExit("ERROR: autograding_workers.json contained no primary machine.")

    # One (or more) of the machines must accept "default" jobs.
    default_present = False
    for name, machine in autograding_workers.items():
        if "default" in machine["capabilities"]:
            default_present = True
            break
    if not default_present:
        raise SystemExit("ERROR: autograding_workers.json contained no machine with default capabilities")

    # Launch a shipper process for every worker on the primary machine and each worker machine
    total_num_workers = 0
    processes = list()
    for name, machine in autograding_workers.items():
        if not worker_status_map[name]:
            print("{0} could not be reached, so we are not spinning up shipper threads.".format(name))
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="{0} could not be reached, so we are not spinning up shipper threads.".format(name))
            continue
        if 'enabled' in machine and not machine['enabled']:
            print("{0} is disabled, so we are not spinning up shipper threads.".format(name))
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="{0} is disabled, so we are not spinning up shipper threads.".format(name))
            continue
        try:
            full_address = ""
            if machine["address"] != "localhost":
                if machine["username"] == "":
                    raise SystemExit("ERROR: empty username for worker machine {0} ".format(machine["address"]))
                full_address = "{0}@{1}".format(machine["username"], machine["address"])
            else:
                if not machine["username"] == "":
                    raise SystemExit('ERROR: username for primary (localhost) must be ""')
                full_address = machine['address']

            num_workers_on_machine = machine["num_autograding_workers"]
            if num_workers_on_machine < 0:
                raise SystemExit("ERROR: num_workers_on_machine for '{0}' must be non-negative.".format(machine))

            single_machine_data = {name : machine}
            single_machine_data = add_fields_to_autograding_worker_json(single_machine_data, name)
        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
            print("ERROR: autograding_workers.json entry for {0} contains an error: {1}. For more details, see trace entry.".format(name, e))
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: autograding_workers.json entry for {0} contains an error: {1} For more details, see trace entry.".format(name,e))
            continue
        # launch the shipper threads
        for i in range(0,num_workers_on_machine):
            u = "untrusted" + str(i).zfill(2)
            p = multiprocessing.Process(target=shipper_process,args=(name,single_machine_data,full_address, u,overall_lock))
            p.start()
            processes.append(p)
        total_num_workers += num_workers_on_machine

    # main monitoring loop
    try:
        while True:
            alive = 0
            for i in range(0, total_num_workers):
                if processes[i].is_alive():
                    alive = alive + 1
                else:
                    autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: process "+str(i)+" is not alive")
            if alive != total_num_workers:
                autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: #shippers="+str(total_num_workers)+" != #alive="+str(alive))
            #print ("shippers= ",total_num_workers,"  alive=",alive)
            time.sleep(1)

    except KeyboardInterrupt:
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="grade_scheduler.py keyboard interrupt")
        # just kill everything in this group id right now
        # NOTE:  this may be a bug if the grandchildren have a different group id and will not be killed
        os.kill(-os.getpid(), signal.SIGKILL)

        # run this to check if everything is dead
        #    ps  xao pid,ppid,pgid,sid,comm,user  | grep untrust

        # everything's dead, including the main process so the rest of this will be ignored
        # but this was mostly working...

        # terminate the jobs
        for i in range(0,total_num_workers):
            processes[i].terminate()
        # wait for them to join
        for i in range(0,total_num_workers):
            processes[i].join()

    autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="grade_scheduler.py terminated")
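Putting the validation above together, a minimal sketch of an autograding_workers.json that would satisfy this function: a "primary" entry must exist and at least one machine must advertise the "default" capability. The second worker entry and all values are placeholders.
# Hypothetical autograding_workers.json layout (placeholder values, not a real installation).
example_autograding_workers = {
    "primary": {
        "address": "localhost",
        "username": "",                  # must be empty for the localhost primary
        "capabilities": ["default"],     # at least one machine must accept "default" jobs
        "num_autograding_workers": 5,
        "enabled": True,
    },
    "worker1": {                         # assumed additional worker machine
        "address": "192.168.1.11",
        "username": "submitty",
        "capabilities": ["default", "cpp"],
        "num_autograding_workers": 5,
        "enabled": True,
    },
}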
Example 10
def unpack_job(which_machine,which_untrusted,next_directory,next_to_grade):

    # variables needed for logging
    obj = packer_unpacker.load_queue_file_obj(JOB_ID,next_directory,next_to_grade)
    if "generate_output" not in obj:
        partial_path = os.path.join(obj["gradeable"],obj["who"],str(obj["version"]))
        item_name = os.path.join(obj["semester"],obj["course"],"submissions",partial_path)
    elif obj["generate_output"]:
        item_name = os.path.join(obj["semester"],obj["course"],"generated_output")
    is_batch = "regrade" in obj and obj["regrade"]

    # verify the DAEMON_USER is running this script
    if not int(os.getuid()) == int(DAEMON_UID):
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: must be run by DAEMON_USER")
        raise SystemExit("ERROR: the submitty_autograding_shipper.py script must be run by the DAEMON_USER")

    if which_machine == 'localhost':
        address = which_machine
    else:
        address = which_machine.split('@')[1]

    fully_qualified_domain_name = socket.getfqdn()
    servername_workername = "{0}_{1}".format(fully_qualified_domain_name, address)
    target_results_zip = os.path.join(SUBMITTY_DATA_DIR,"autograding_DONE",servername_workername+"_"+which_untrusted+"_results.zip")
    target_done_queue_file = os.path.join(SUBMITTY_DATA_DIR,"autograding_DONE",servername_workername+"_"+which_untrusted+"_queue.json")

    if which_machine == "localhost":
        if not os.path.exists(target_done_queue_file):
            return False
        else:
          local_done_queue_file = target_done_queue_file
          local_results_zip = target_results_zip
    else:
        ssh = sftp = fd1 = fd2 = local_done_queue_file = local_results_zip = None
        try:
            user, host = which_machine.split("@")
            ssh = establish_ssh_connection(which_machine, user, host)
            sftp = ssh.open_sftp()
            fd1, local_done_queue_file = tempfile.mkstemp()
            fd2, local_results_zip     = tempfile.mkstemp()
            #remote path first, then local.
            sftp.get(target_done_queue_file, local_done_queue_file)
            sftp.get(target_results_zip, local_results_zip)
            #Because get works like cp rather than mv, we have to clean up.
            sftp.remove(target_done_queue_file)
            sftp.remove(target_results_zip)
            success = True
        #This is the normal case (still grading on the other end) so we don't need to print anything.
        except (socket.timeout, TimeoutError):
            success = False
        except FileNotFoundError:
            # Remove results files
            for var in [local_results_zip, local_done_queue_file]:
                if var:
                    with contextlib.suppress(FileNotFoundError):
                        os.remove(var)
            success = False
        #In this more general case, we do want to print what the error was.
        #TODO catch other types of exception as we identify them.
        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: Could not retrieve the file from the foreign machine "+str(e))
            print("ERROR: Could not retrieve the file from the foreign machine.\nERROR: {0}".format(e))

            # Remove results files
            for var in [local_results_zip, local_done_queue_file]:
                if var:
                    with contextlib.suppress(FileNotFoundError):
                        os.remove(var)

            success = False
        finally:
            # Close SSH connections
            for var in [sftp, ssh]:
                if var:
                    var.close()

            # Close file descriptors
            for var in [fd1, fd2]:
                if var:
                    try:
                        os.close(var)
                    except Exception:
                        pass

            if not success:
                return False
    # archive the results of grading
    try:
        success = packer_unpacker.unpack_grading_results_zip(which_machine,which_untrusted,local_results_zip)
    except Exception:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID,jobname=item_name,message="ERROR: Exception when unpacking zip. For more details, see traces entry.")
        with contextlib.suppress(FileNotFoundError):
            os.remove(local_results_zip)
        success = False

    with contextlib.suppress(FileNotFoundError):
        os.remove(local_done_queue_file)

    msg = "Unpacked job from " + which_machine if success else "ERROR: failure returned from worker machine"
    print(msg)
    autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, jobname=item_name, which_untrusted=which_untrusted, is_batch=is_batch, message=msg)
    return True
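The autograding_DONE file-naming convention used above can be summarized with a small helper. This is only an illustration of the names unpack_job() polls for, not a function from the real codebase.
# Hypothetical helper illustrating the autograding_DONE naming convention (illustrative only;
# os and socket are already imported by this module).
def done_file_paths(data_dir, worker_address, which_untrusted):
    """Build the results-zip and queue-json paths that unpack_job() looks for."""
    servername_workername = "{0}_{1}".format(socket.getfqdn(), worker_address)
    prefix = os.path.join(data_dir, "autograding_DONE", servername_workername + "_" + which_untrusted)
    return prefix + "_results.zip", prefix + "_queue.json"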
Example 11
def prepare_job(my_name,which_machine,which_untrusted,next_directory,next_to_grade):
    # verify the DAEMON_USER is running this script
    if not int(os.getuid()) == int(DAEMON_UID):
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: must be run by DAEMON_USER")
        raise SystemExit("ERROR: the submitty_autograding_shipper.py script must be run by the DAEMON_USER")

    if which_machine == 'localhost':
        address = which_machine
    else:
        address = which_machine.split('@')[1]

    # prepare the zip files
    try:
        autograding_zip_tmp,submission_zip_tmp = packer_unpacker.prepare_autograding_and_submission_zip(which_machine,which_untrusted,next_directory,next_to_grade)
        fully_qualified_domain_name = socket.getfqdn()
        servername_workername = "{0}_{1}".format(fully_qualified_domain_name, address)
        autograding_zip = os.path.join(SUBMITTY_DATA_DIR,"autograding_TODO",servername_workername+"_"+which_untrusted+"_autograding.zip")
        submission_zip = os.path.join(SUBMITTY_DATA_DIR,"autograding_TODO",servername_workername+"_"+which_untrusted+"_submission.zip")
        todo_queue_file = os.path.join(SUBMITTY_DATA_DIR,"autograding_TODO",servername_workername+"_"+which_untrusted+"_queue.json")

        with open(next_to_grade, 'r') as infile:
            queue_obj = json.load(infile)
            queue_obj["which_untrusted"] = which_untrusted
            queue_obj["which_machine"] = which_machine
            queue_obj["ship_time"] = dateutils.write_submitty_date(microseconds=True)
    except Exception as e:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: failed preparing submission zip or accessing next to grade "+str(e))
        print("ERROR: failed preparing submission zip or accessing next to grade ", e)
        return False

    if address == "localhost":
        try:
            shutil.move(autograding_zip_tmp,autograding_zip)
            shutil.move(submission_zip_tmp,submission_zip)
            with open(todo_queue_file, 'w') as outfile:
                json.dump(queue_obj, outfile, sort_keys=True, indent=4)
        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: could not move files due to the following error: "+str(e))
            print("ERROR: could not move files due to the following error: {0}".format(e))
            return False
    else:
        sftp = ssh = None
        try:
            user, host = which_machine.split("@")

            ssh = establish_ssh_connection(my_name, user, host)
            sftp = ssh.open_sftp()
            sftp.put(autograding_zip_tmp,autograding_zip)
            sftp.put(submission_zip_tmp,submission_zip)
            with open(todo_queue_file, 'w') as outfile:
                json.dump(queue_obj, outfile, sort_keys=True, indent=4)
            sftp.put(todo_queue_file, todo_queue_file)
            os.remove(todo_queue_file)
            print("Successfully forwarded files to {0}".format(my_name))
            success = True
        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: could not move files due to the following error: "+str(e))
            print("Could not move files due to the following error: {0}".format(e))
            success = False
        finally:
            if sftp:
                sftp.close()
            if ssh:
                ssh.close()
            os.remove(autograding_zip_tmp)
            os.remove(submission_zip_tmp)
            return success

    # log completion of job preparation
    obj = packer_unpacker.load_queue_file_obj(JOB_ID,next_directory,next_to_grade)
    if "generate_output" not in obj:
        partial_path = os.path.join(obj["gradeable"],obj["who"],str(obj["version"]))
        item_name = os.path.join(obj["semester"],obj["course"],"submissions",partial_path)
    elif obj["generate_output"]:
        item_name = os.path.join(obj["semester"],obj["course"],"generated_output",obj["gradeable"])
    is_batch = "regrade" in obj and obj["regrade"]
    autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, jobname=item_name, which_untrusted=which_untrusted,
                                    is_batch=is_batch, message="Prepared job for " + which_machine)
    return True
def worker_process(which_machine, address, which_untrusted, my_server):

    # verify the DAEMON_USER is running this script
    if not int(os.getuid()) == int(DAEMON_UID):
        autograding_utils.log_message(
            AUTOGRADING_LOG_PATH,
            JOB_ID,
            message="ERROR: must be run by DAEMON_USER")
        raise SystemExit(
            "ERROR: the submitty_autograding_worker.py script must be run by the DAEMON_USER"
        )

    # ignore keyboard interrupts in the worker processes
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    counter = 0

    # The full name of this worker
    worker_name = f"{my_server}_{address}_{which_untrusted}"

    # Set up key autograding_DONE directories
    done_dir = os.path.join(SUBMITTY_DATA_DIR, "autograding_DONE")
    done_queue_file = os.path.join(done_dir, f"{worker_name}_queue.json")
    results_zip = os.path.join(done_dir, f"{worker_name}_results.zip")

    # Set up key autograding_TODO directories
    todo_dir = os.path.join(SUBMITTY_DATA_DIR, "autograding_TODO")
    autograding_zip = os.path.join(todo_dir, f"{worker_name}_autograding.zip")
    submission_zip = os.path.join(todo_dir, f"{worker_name}_submission.zip")
    todo_queue_file = os.path.join(todo_dir, f"{worker_name}_queue.json")

    # Establish the directory in which we will do our work
    working_directory = os.path.join(SUBMITTY_DATA_DIR, 'autograding_tmp',
                                     which_untrusted, "tmp")

    while True:
        if os.path.exists(todo_queue_file):
            try:
                # Attempt to grade the submission. Get back the location of the results.
                results_zip_tmp = grade_item.grade_from_zip(
                    working_directory, which_untrusted, autograding_zip,
                    submission_zip)
                shutil.copyfile(results_zip_tmp, results_zip)
                os.remove(results_zip_tmp)
                # At this point, we will assume that grading has progressed successfully enough to
                # return a coherent answer, and will say as much in the done queue file
                response = {
                    'status': 'success',
                    'message': 'Grading completed successfully'
                }
            except Exception:
                # If we threw an error while grading, log it.
                autograding_utils.log_message(
                    AUTOGRADING_LOG_PATH,
                    JOB_ID,
                    message=
                    f"ERROR attempting to unzip graded item: {which_machine} "
                    f"{which_untrusted}. for more details, see traces entry.")
                autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH,
                                                  JOB_ID,
                                                  trace=traceback.format_exc())
                # TODO: It is possible that autograding failed after multiple steps.
                # In this case, we may be able to salvage a portion of the autograding_results
                # directory.

                # Because we failed grading, we will respond with an empty results zip.
                results_zip_tmp = zipfile.ZipFile(results_zip, 'w')
                results_zip_tmp.close()

                # We will also respond with a done_queue_file which contains a failure message.
                response = {
                    'status': 'fail',
                    'message': traceback.format_exc()
                }
            finally:
                # Regardless of if we succeeded or failed, create a done queue file to
                # send to the shipper.
                with open(todo_queue_file, 'r') as infile:
                    queue_obj = json.load(infile)
                    queue_obj["done_time"] = dateutils.write_submitty_date(
                        milliseconds=True)
                    queue_obj['autograding_status'] = response
                with open(done_queue_file, 'w') as outfile:
                    json.dump(queue_obj, outfile, sort_keys=True, indent=4)
                # Clean up temporary files.
                with contextlib.suppress(FileNotFoundError):
                    os.remove(autograding_zip)
                with contextlib.suppress(FileNotFoundError):
                    os.remove(submission_zip)
                with contextlib.suppress(FileNotFoundError):
                    os.remove(todo_queue_file)
            counter = 0
        else:
            if counter >= 10:
                print(which_machine, which_untrusted, "wait")
                counter = 0
            counter += 1
            time.sleep(1)
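For reference, a sketch of the status payload this worker writes into the done queue file; the two shapes below are taken directly from the success and failure branches above and are otherwise illustrative.
# Shape of the 'autograding_status' field added to the done queue file above (illustrative).
example_status_success = {"status": "success", "message": "Grading completed successfully"}
example_status_failure = {"status": "fail", "message": "<traceback text>"}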
def launch_shippers(worker_status_map):
    # verify the DAEMON_USER is running this script
    if not int(os.getuid()) == int(DAEMON_UID):
        raise SystemExit("ERROR: the submitty_autograding_shipper.py script must be run by the DAEMON_USER")
    autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="grade_scheduler.py launched")

    for file_path in Path(SUBMITTY_DATA_DIR, "autograding_TODO").glob("untrusted*"):
        file_path = str(file_path)
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="Remove autograding TODO file: " + file_path)
        os.remove(file_path)
    for file_path in Path(SUBMITTY_DATA_DIR, "autograding_DONE").glob("*"):
        file_path = str(file_path)
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="Remove autograding DONE file: " + file_path)
        os.remove(file_path)

    # The names of the worker machines, the capabilities of each
    # worker machine, and the number of workers per machine are stored
    # in the autograding_workers json.
    try:
        autograding_workers_path = os.path.join(SUBMITTY_INSTALL_DIR, 'config', "autograding_workers.json")
        with open(autograding_workers_path, 'r') as infile:
            autograding_workers = json.load(infile)
    except Exception as e:
        autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
        raise SystemExit("ERROR: could not locate the autograding workers json: {0}".format(e))

    # There must always be a primary machine, it may or may not have
    # autograding workers.
    if not "primary" in autograding_workers:
        raise SystemExit("ERROR: autograding_workers.json contained no primary machine.")

    # One (or more) of the machines must accept "default" jobs.
    default_present = False
    for name, machine in autograding_workers.items():
        if "default" in machine["capabilities"]:
            default_present = True
            break
    if not default_present:
        raise SystemExit("ERROR: autograding_workers.json contained no machine with default capabilities")

    # Launch a shipper process for every worker on the primary machine and each worker machine
    total_num_workers = 0
    processes = list()
    for name, machine in autograding_workers.items():
        thread_count = machine["num_autograding_workers"]
        
        # Cleanup previous in-progress submissions
        worker_folders = [worker_folder(f'{name}_{i}') for i in range(thread_count)]
        for folder in worker_folders:
            os.makedirs(folder, exist_ok=True)
            # Clear out in-progress files, as these will be re-done.
            for grading in Path(folder).glob('GRADING_*'):
                os.remove(grading)

        if not worker_status_map[name]:
            print("{0} could not be reached, so we are not spinning up shipper threads.".format(name))
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="{0} could not be reached, so we are not spinning up shipper threads.".format(name))
            continue
        if 'enabled' in machine and not machine['enabled']:
            print("{0} is disabled, so we are not spinning up shipper threads.".format(name))
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="{0} is disabled, so we are not spinning up shipper threads.".format(name))
            continue
        try:
            full_address = ""
            if machine["address"] != "localhost":
                if machine["username"] == "":
                    raise SystemExit("ERROR: empty username for worker machine {0} ".format(machine["address"]))
                full_address = "{0}@{1}".format(machine["username"], machine["address"])
            else:
                if not machine["username"] == "":
                    raise SystemExit('ERROR: username for primary (localhost) must be ""')
                full_address = machine['address']

            num_workers_on_machine = machine["num_autograding_workers"]
            if num_workers_on_machine < 0:
                raise SystemExit("ERROR: num_workers_on_machine for '{0}' must be non-negative.".format(machine))

            single_machine_data = {name : machine}
            single_machine_data = add_fields_to_autograding_worker_json(single_machine_data, name)
        except Exception as e:
            autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH, job_id=JOB_ID, trace=traceback.format_exc())
            print("ERROR: autograding_workers.json entry for {0} contains an error: {1}. For more details, see trace entry.".format(name, e))
            autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: autograding_workers.json entry for {0} contains an error: {1} For more details, see trace entry.".format(name,e))
            continue
        # launch the shipper threads
        for i in range(thread_count):
            thread_name = f'{name}_{i}'
            u = "untrusted" + str(i).zfill(2)
            p = multiprocessing.Process(target=shipper_process,args=(thread_name,single_machine_data[name],full_address, u))
            p.start()
            processes.append((thread_name, p))
        total_num_workers += num_workers_on_machine

    # main monitoring loop
    try:
        while True:
            alive = 0
            for name, p in processes:
                if p.is_alive():
                    alive = alive + 1
                else:
                    autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: process "+name+" is not alive")
            if alive != total_num_workers:
                autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="ERROR: #shippers="+str(total_num_workers)+" != #alive="+str(alive))

            # Find which workers are currently idle, as well as any autograding
            # jobs which need to be scheduled.
            workers = [name for (name, p) in processes if p.is_alive()]
            idle_workers = list(filter(
                lambda n: len(os.listdir(worker_folder(n))) == 0,
                workers))
            jobs = filter(os.path.isfile, 
                map(lambda f: os.path.join(INTERACTIVE_QUEUE, f), 
                    os.listdir(INTERACTIVE_QUEUE)))
            
            # Distribute available jobs randomly among workers currently idle.
            for job in jobs:
                if len(idle_workers) == 0:
                    break
                dest = random.choice(idle_workers)
                autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, 
                    message=f"Pushing job {os.path.basename(job)} to {dest}.")
                shutil.move(job, worker_folder(dest))
                idle_workers.remove(dest)

            time.sleep(1)

    except KeyboardInterrupt:
        autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="grade_scheduler.py keyboard interrupt")
        # just kill everything in this group id right now
        # NOTE:  this may be a bug if the grandchildren have a different group id and will not be killed
        os.kill(-os.getpid(), signal.SIGKILL)

        # run this to check if everything is dead
        #    ps  xao pid,ppid,pgid,sid,comm,user  | grep untrust

        # everything's dead, including the main process so the rest of this will be ignored
        # but this was mostly working...

        # terminate the jobs
        for name, p in processes:
            p.terminate()
        # wait for them to join
        for name, p in processes:
            p.join()

    autograding_utils.log_message(AUTOGRADING_LOG_PATH, JOB_ID, message="grade_scheduler.py terminated")
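worker_folder() is not shown in this listing. A minimal sketch of what such a helper could look like, assuming each shipper thread owns a per-worker job directory under the interactive queue; the layout is an assumption, not the real implementation.
# Hypothetical worker_folder helper (assumed layout, not from the real codebase).
def worker_folder(worker_name):
    """Illustrative only: map a shipper thread name to its private job directory."""
    # INTERACTIVE_QUEUE is assumed to be the module-level queue path used elsewhere in this listing.
    return os.path.join(INTERACTIVE_QUEUE, worker_name)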
def worker_process(which_machine, address, which_untrusted, my_server):

    # verify the DAEMON_USER is running this script
    if not int(os.getuid()) == int(DAEMON_UID):
        autograding_utils.log_message(
            AUTOGRADING_LOG_PATH,
            JOB_ID,
            message="ERROR: must be run by DAEMON_USER")
        raise SystemExit(
            "ERROR: the submitty_autograding_worker.py script must be run by the DAEMON_USER"
        )

    # ignore keyboard interrupts in the worker processes
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    counter = 0

    servername_workername = "{0}_{1}".format(my_server, address)
    autograding_zip = os.path.join(
        SUBMITTY_DATA_DIR, "autograding_TODO",
        servername_workername + "_" + which_untrusted + "_autograding.zip")
    submission_zip = os.path.join(
        SUBMITTY_DATA_DIR, "autograding_TODO",
        servername_workername + "_" + which_untrusted + "_submission.zip")
    todo_queue_file = os.path.join(
        SUBMITTY_DATA_DIR, "autograding_TODO",
        servername_workername + "_" + which_untrusted + "_queue.json")

    while True:
        if os.path.exists(todo_queue_file):
            try:
                working_directory = os.path.join(
                    "/var/local/submitty/autograding_tmp/", which_untrusted,
                    "tmp")
                results_zip_tmp = grade_item.grade_from_zip(
                    working_directory, which_untrusted, autograding_zip,
                    submission_zip)
                results_zip = os.path.join(
                    SUBMITTY_DATA_DIR, "autograding_DONE",
                    servername_workername + "_" + which_untrusted +
                    "_results.zip")
                done_queue_file = os.path.join(
                    SUBMITTY_DATA_DIR, "autograding_DONE",
                    servername_workername + "_" + which_untrusted +
                    "_queue.json")
                #move doesn't inherit the permissions of the destination directory. Copyfile does.
                shutil.copyfile(results_zip_tmp, results_zip)

                os.remove(results_zip_tmp)
                with open(todo_queue_file, 'r') as infile:
                    queue_obj = json.load(infile)
                    queue_obj["done_time"] = dateutils.write_submitty_date(
                        microseconds=True)
                with open(done_queue_file, 'w') as outfile:
                    json.dump(queue_obj, outfile, sort_keys=True, indent=4)
            except Exception as e:
                autograding_utils.log_message(
                    AUTOGRADING_LOG_PATH,
                    JOB_ID,
                    message="ERROR attempting to unzip graded item: " +
                    which_machine + " " + which_untrusted +
                    ". for more details, see traces entry.")
                autograding_utils.log_stack_trace(AUTOGRADING_STACKTRACE_PATH,
                                                  JOB_ID,
                                                  trace=traceback.format_exc())
                with contextlib.suppress(FileNotFoundError):
                    os.remove(autograding_zip)
                with contextlib.suppress(FileNotFoundError):
                    os.remove(submission_zip)

                #Respond with a failure zip file.
                results_zip = os.path.join(
                    SUBMITTY_DATA_DIR, "autograding_DONE",
                    servername_workername + "_" + which_untrusted +
                    "_results.zip")
                tmp_dir = tempfile.mkdtemp()
                with open(os.path.join(tmp_dir, 'failure.txt'),
                          'w') as outfile:
                    outfile.write("grading failed.\n")

                results_zip_tmp = zipfile.ZipFile(results_zip, 'w')
                results_zip_tmp.write(os.path.join(tmp_dir, 'failure.txt'))
                results_zip_tmp.close()

                shutil.rmtree(tmp_dir)
                done_queue_file = os.path.join(
                    SUBMITTY_DATA_DIR, "autograding_DONE",
                    servername_workername + "_" + which_untrusted +
                    "_queue.json")
                with open(todo_queue_file, 'r') as infile:
                    queue_obj = json.load(infile)
                    queue_obj["done_time"] = dateutils.write_submitty_date(
                        microseconds=True)
                with open(done_queue_file, 'w') as outfile:
                    json.dump(queue_obj, outfile, sort_keys=True, indent=4)
            finally:
                if os.path.exists(autograding_zip):
                    os.remove(autograding_zip)
                if os.path.exists(submission_zip):
                    os.remove(submission_zip)

            with contextlib.suppress(FileNotFoundError):
                os.remove(todo_queue_file)
            counter = 0
        else:
            if counter >= 10:
                print(which_machine, which_untrusted, "wait")
                counter = 0
            counter += 1
            time.sleep(1)