def finish_job(job_id, error=0):
    """
    Mark a job as finished and release resources it occupied.
    If mail notify is switched on, it will send e-mail.

    :param job_id: int, job id
    :param error: int, if error occurs, it should be 1
    :return: None
    """
    global DISK_POOL, JOB_TABLE, NEW_FILES, OUTPUTS, OUTPUT_DICT,\
        OUTPUT_SIZE, FOLDER_SIZE_BEFORE, CUMULATIVE_OUTPUT_SIZE,\
        LAST_OUTPUT_STRING, LAST_OUTPUT_SUFFIX, OUTPUT_DICT_SUFFIX
    if job_id in JOB_TABLE.keys():
        if error == 1:
            if settings['mail']['notify'] == 'on':
                try:
                    from notify import MailNotify
                    mail = MailNotify(JOB_TABLE[job_id]['user_id'], 2, job_id,
                                      JOB_TABLE[job_id]['protocol'],
                                      JOB_TABLE[job_id]['input_file'],
                                      JOB_TABLE[job_id]['parameter'])
                    mail.send_mail(
                        mail.get_user_mail_address(
                            JOB_TABLE[job_id]['user_id']))
                except Exception as e:
                    print(e)
        else:
            try:
                job = Queue.objects.get(id=job_id)
                job.status = -1
                job.save()
            except:
                pass
            baseDriver.del_output_dict(job_id)
            if settings['mail']['notify'] == 'on':
                try:
                    from notify import MailNotify
                    mail = MailNotify(JOB_TABLE[job_id]['user_id'], 1, job_id,
                                      JOB_TABLE[job_id]['protocol'],
                                      JOB_TABLE[job_id]['input_file'],
                                      JOB_TABLE[job_id]['parameter'])
                    mail.send_mail(
                        mail.get_user_mail_address(
                            JOB_TABLE[job_id]['user_id']))
                except Exception as e:
                    print(e)
    if job_id in JOB_TABLE.keys():
        resume = JOB_TABLE[job_id]['resume']
        res_key = str(job_id) + '_' + str(resume + 1)
        if res_key in RESOURCES.keys():
            RESOURCES.pop(res_key)
        DISK_POOL += CUMULATIVE_OUTPUT_SIZE[
            job_id] - baseDriver.get_folder_size(
                JOB_TABLE[job_id]['job_folder'])
        JOB_TABLE.pop(job_id)
    if job_id in OUTPUTS.keys():
        OUTPUTS.pop(job_id)
    if job_id in OUTPUT_DICT.keys():
        OUTPUT_DICT.pop(job_id)
    if job_id in LAST_OUTPUT.keys():
        LAST_OUTPUT.pop(job_id)
    if job_id in LAST_OUTPUT_STRING.keys():
        LAST_OUTPUT_STRING.pop(job_id)
    if job_id in CUMULATIVE_OUTPUT_SIZE.keys():
        CUMULATIVE_OUTPUT_SIZE.pop(job_id)
    if job_id in LAST_OUTPUT_SUFFIX.keys():
        LAST_OUTPUT_SUFFIX.pop(job_id)
    if job_id in OUTPUT_DICT_SUFFIX.keys():
        OUTPUT_DICT_SUFFIX.pop(job_id)

def finish_step(job_id, step_order, resources):
    """
    Mark a step as finished.

    :param job_id: int, job id
    :param step_order: int, step order
    :param resources: dict, resources required by the step
    :return: None
    """
    global JOB_TABLE, NEW_FILES, OUTPUTS, OUTPUT_DICT, OUTPUT_SIZE, FOLDER_SIZE_BEFORE,\
        CUMULATIVE_OUTPUT_SIZE, LAST_OUTPUT_STRING
    try:
        job = Queue.objects.get(id=job_id)
        job.resume = step_order
        job.status = -2
        job.save()
        JOB_TABLE[job_id]['status'] = -2
        JOB_TABLE[job_id]['resume'] = step_order
        this_output = baseDriver.get_folder_content(
            JOB_TABLE[job_id]['job_folder'])
        NEW_FILES[job_id] = sorted(
            list(set(this_output).difference(set(LAST_OUTPUT[job_id]))))
        NEW_FILES[job_id] = [
            os.path.join(JOB_TABLE[job_id]['job_folder'], file_name)
            for file_name in NEW_FILES[job_id]
        ]
    except Exception as e:
        print(e)
    if job_id in OUTPUTS.keys():
        OUTPUTS[job_id].extend(NEW_FILES[job_id])
    else:
        OUTPUTS[job_id] = NEW_FILES[job_id]
    suffix_dict = build_suffix_dict(NEW_FILES[job_id])
    if job_id in OUTPUT_DICT.keys():
        OUTPUT_DICT[job_id][step_order + 1] = NEW_FILES[job_id]
    else:
        OUTPUT_DICT[job_id] = {step_order + 1: NEW_FILES[job_id]}
    if job_id in OUTPUT_DICT_SUFFIX.keys():
        OUTPUT_DICT_SUFFIX[job_id][step_order + 1] = suffix_dict
    else:
        OUTPUT_DICT_SUFFIX[job_id] = {step_order + 1: suffix_dict}
    LAST_OUTPUT_SUFFIX[job_id] = suffix_dict
    LAST_OUTPUT_STRING[job_id] = ' '.join(NEW_FILES[job_id])
    OUTPUT_SIZE[job_id] = baseDriver.get_folder_size(
        JOB_TABLE[job_id]['job_folder']) - FOLDER_SIZE_BEFORE[job_id]
    CUMULATIVE_OUTPUT_SIZE[job_id] += OUTPUT_SIZE[job_id]
    if 'trace' in resources.keys():
        training_item = Training.objects.get(id=resources['trace'])
        if training_item.cpu != '-' and training_item.mem != '-' \
                and training_item.cpu != '' and training_item.mem != '':
            training_item.output = OUTPUT_SIZE[job_id]
            training_item.lock = 0
            training_item.save()
    if settings['cluster']['type'] == '':
        update_resource_pool(resources)

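# Illustrative shape of the per-job bookkeeping that finish_step() updates
# (values are placeholders, not real data; suffix_dict is whatever
# build_suffix_dict() returns for the files created by the step):
#
#     OUTPUTS[job_id]            -> ['/job/folder/out1.bam', '/job/folder/out2.bam', ...]
#     OUTPUT_DICT[job_id]        -> {step_order + 1: ['/job/folder/out1.bam', ...]}
#     OUTPUT_DICT_SUFFIX[job_id] -> {step_order + 1: suffix_dict}
#     LAST_OUTPUT_STRING[job_id] -> '/job/folder/out1.bam /job/folder/out2.bam'
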
def run_prepare(job_id, job, no_new_learn=0):
    """
    Parse the step's parameter string and predict the resources needed by the step.

    :param job_id: int, job id
    :param job: dict, job dict
    :param no_new_learn: int, 1 means refusing to create a new training item
    :return: None if no step is left, 'running' if the job is already running,
             otherwise a dict of predicted resources
    """
    global LAST_OUTPUT_STRING, OUTPUTS, OUTPUT_DICT, OUTPUT_DICT_SUFFIX, NEW_FILES, LAST_OUTPUT
    learning = 0
    outside_size = 0
    if job['status'] == -1 and job['resume'] != -1:
        # skip and resume: restore the cached output state of previous steps
        tmp_dict = baseDriver.load_output_dict(job_id)
        if 'LAST_OUTPUT_STRING' in tmp_dict.keys():
            LAST_OUTPUT_STRING[job_id] = tmp_dict['LAST_OUTPUT_STRING']
        if 'OUTPUTS' in tmp_dict.keys():
            OUTPUTS[job_id] = tmp_dict['OUTPUTS']
        if 'OUTPUT_DICT' in tmp_dict.keys():
            OUTPUT_DICT[job_id] = tmp_dict['OUTPUT_DICT']
        if 'OUTPUT_DICT_SUFFIX' in tmp_dict.keys():
            OUTPUT_DICT_SUFFIX[job_id] = tmp_dict['OUTPUT_DICT_SUFFIX']
        if 'NEW_FILES' in tmp_dict.keys():
            NEW_FILES[job_id] = tmp_dict['NEW_FILES']
        if 'LAST_OUTPUT' in tmp_dict.keys():
            LAST_OUTPUT[job_id] = tmp_dict['LAST_OUTPUT']
        if 'LAST_OUTPUT_SUFFIX' in tmp_dict.keys():
            LAST_OUTPUT_SUFFIX[job_id] = tmp_dict['LAST_OUTPUT_SUFFIX']
    if (job['resume'] + 1) == len(job['steps']):
        return None
    elif job['status'] > 0:
        return 'running'
    else:
        step = job['steps'][job['resume'] + 1]['parameter']
        step = step.replace('{Job}', str(job_id))
        step = step.replace('{JobName}', str(JOB_TABLE[job_id]['name']))
        if job_id in LAST_OUTPUT_STRING.keys():
            step = step.replace('{LastOutput}', LAST_OUTPUT_STRING[job_id])
        if job_id in OUTPUTS.keys():
            step = step.replace('{AllOutputBefore}', ' '.join(OUTPUTS[job_id]))
        if job_id in NEW_FILES.keys():
            step = parameterParser.last_output_map(step, NEW_FILES[job_id])
        if job_id in JOB_PARAMETERS.keys():
            step = parameterParser.special_parameter_map(step, JOB_PARAMETERS[job_id])
        if job_id in OUTPUT_DICT.keys():
            step = parameterParser.output_file_map(step, OUTPUT_DICT[job_id])
        if job_id in JOB_INPUT_FILES.keys():
            step, outside_size = parameterParser.input_file_map(
                step, JOB_INPUT_FILES[job_id], job['user_folder'])
        if job_id in LAST_OUTPUT_SUFFIX.keys() and job_id in OUTPUT_DICT_SUFFIX.keys():
            step = parameterParser.suffix_map(step, OUTPUT_DICT_SUFFIX[job_id],
                                              LAST_OUTPUT_SUFFIX[job_id])
        step = parameterParser.history_map(step, job['user_id'],
                                           job['user_folder'], Queue)
        step, outside_size_upload = parameterParser.upload_file_map(
            step, job['user_folder'])
        outside_size += outside_size_upload
        step = step.replace('{Workspace}', job['job_folder'])
        user_bin_dir = os.path.join(settings['env']['workspace'],
                                    job['user_id'], 'bin')
        if not os.path.exists(user_bin_dir):
            try:
                os.makedirs(user_bin_dir)
            except:
                pass
        step = step.replace('{UserBin}', user_bin_dir)
        if settings['cluster']['type']:
            if 'cpu' in settings['cluster'].keys() and settings['cluster']['cpu']:
                step = step.replace('{ThreadN}', str(settings['cluster']['cpu']))
            else:
                step = step.replace('{ThreadN}', str(settings['env']['cpu']))
        else:
            step = step.replace('{ThreadN}', str(settings['env']['cpu']))
        JOB_COMMAND[job_id] = parameterParser.parameter_string_to_list(step)
        LAST_OUTPUT[job_id] = baseDriver.get_folder_content(job['job_folder'])
        training_num = get_training_items(job['steps'][job['resume'] + 1]['hash'])
        if training_num < 10:
            learning = 1
        if INPUT_SIZE[job_id] == 0:
            INPUT_SIZE[job_id] = baseDriver.get_folder_size(job['job_folder'])
        else:
            if job_id in OUTPUT_SIZE.keys():
                INPUT_SIZE[job_id] = OUTPUT_SIZE[job_id]
            else:
                INPUT_SIZE[job_id] = 0
        FOLDER_SIZE_BEFORE[job_id] = baseDriver.get_folder_size(job['job_folder'])
        INPUT_SIZE[job_id] += outside_size
        resource_needed = checkPoint.predict_resource_needed(
            job['steps'][job['resume'] + 1]['hash'],
            INPUT_SIZE[job_id],
            training_num)
        if resource_needed['cpu'] > int(settings['env']['cpu']) * 100:
            resource_needed['cpu'] = int(settings['env']['cpu']) * 95
        # if resource_needed['mem'] >
        if learning == 1 and no_new_learn == 0:
            trace_id = create_machine_learning_item(
                job['steps'][job['resume'] + 1]['hash'], INPUT_SIZE[job_id])
            resource_needed['trace'] = trace_id
        return resource_needed
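
# Hypothetical call order (illustration only; the actual dispatcher lives
# elsewhere in this module, and `job` / `resources` are placeholders):
#
#     resources = run_prepare(job_id, job)
#     if resources == 'running':
#         pass                                    # step already running, check back later
#     elif resources is None:
#         finish_job(job_id)                      # no step left, release the job
#     else:
#         # ...launch JOB_COMMAND[job_id] under the predicted quota...
#         finish_step(job_id, job['resume'] + 1, resources)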