Example #1
def run_prepare(job_id, job, no_new_learn=0):
    """
    Parse the step's parameter string and predict the resources needed by the step
    :param job_id: int, job id
    :param job: dict, job dict
    :param no_new_learn: int, 1 means refuse to create a new training item
    :return: dict of predicted resources, 'running' if the job is still
             running, or None if every step has finished
    """
    learning = 0
    outside_size = 0

    if job['status'] == -1 and job['resume'] != -1:
        # skip and resume
        OUTPUT_DICT[job_id] = baseDriver.load_output_dict(job_id)

    if (job['resume'] + 1) == len(job['steps']):
        return None
    elif job['status'] > 0:
        return 'running'
    else:
        step = job['steps'][job['resume'] + 1]['parameter']

    step = step.replace('{Job}', str(job_id))

    if job_id in LAST_OUTPUT_STRING:
        step = step.replace('{LastOutput}', LAST_OUTPUT_STRING[job_id])
    if job_id in OUTPUTS:
        step = step.replace('{AllOutputBefore}', ' '.join(OUTPUTS[job_id]))
    if job_id in NEW_FILES:
        step = parameterParser.last_output_map(step, NEW_FILES[job_id])
    if job_id in JOB_PARAMETERS:
        step = parameterParser.special_parameter_map(step,
                                                     JOB_PARAMETERS[job_id])
    if job_id in OUTPUT_DICT:
        step = parameterParser.output_file_map(step, OUTPUT_DICT[job_id])
    if job_id in JOB_INPUT_FILES:
        step, outside_size = parameterParser.input_file_map(
            step, JOB_INPUT_FILES[job_id], job['user_folder'])
    if job_id in LAST_OUTPUT_SUFFIX and job_id in OUTPUT_DICT_SUFFIX:
        step = parameterParser.suffix_map(step, OUTPUT_DICT_SUFFIX[job_id],
                                          LAST_OUTPUT_SUFFIX[job_id])
    step, outside_size_upload = parameterParser.upload_file_map(
        step, job['user_folder'])
    outside_size += outside_size_upload
    step = step.replace('{Workspace}', job['job_folder'])
    step = step.replace('{ThreadN}', str(settings['env']['cpu']))
    JOB_COMMAND[job_id] = parameterParser.parameter_string_to_list(step)
    LAST_OUTPUT[job_id] = baseDriver.get_folder_content(job['job_folder'])
    training_num = get_training_items(job['steps'][job['resume'] + 1]['hash'])
    if training_num < 10:
        learning = 1

    if INPUT_SIZE[job_id] == 0:
        INPUT_SIZE[job_id] = baseDriver.get_folder_size(job['job_folder'])
    else:
        if job_id in OUTPUT_SIZE:
            INPUT_SIZE[job_id] = OUTPUT_SIZE[job_id]
        else:
            INPUT_SIZE[job_id] = 0
    FOLDER_SIZE_BEFORE[job_id] = baseDriver.get_folder_size(job['job_folder'])
    INPUT_SIZE[job_id] += outside_size

    resource_needed = checkPoint.predict_resource_needed(
        job['steps'][job['resume'] + 1]['hash'], INPUT_SIZE[job_id],
        training_num)

    if learning == 1 and no_new_learn == 0:
        trace_id = create_machine_learning_item(
            job['steps'][job['resume'] + 1]['hash'], INPUT_SIZE[job_id])
        resource_needed['trace'] = trace_id

    return resource_needed
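
At its core, run_prepare performs placeholder substitution on the step's parameter string and then splits the result into an argument list. A minimal, self-contained sketch of that idea (the tool name and values below are made up, and shlex.split stands in for parameterParser.parameter_string_to_list, whose real behavior may differ):

import shlex

def render_step(parameter, context):
    """Replace each {Placeholder} with its value, then split into argv."""
    for key, value in context.items():
        parameter = parameter.replace('{%s}' % key, str(value))
    return shlex.split(parameter)

# Hypothetical placeholder values, for illustration only
command = render_step(
    'aligner -t {ThreadN} -i {LastOutput} -o {Workspace}/step2.out',
    {'ThreadN': 4, 'LastOutput': 'step1.out', 'Workspace': '/tmp/job_7'})
print(command)
# ['aligner', '-t', '4', '-i', 'step1.out', '-o', '/tmp/job_7/step2.out']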
Example #2
                                      JOB_TABLE[job_id]['protocol'],
                                      JOB_TABLE[job_id]['input_file'],
                                      JOB_TABLE[job_id]['parameter'])
                    mail.send_mail(
                        mail.get_user_mail_address(
                            JOB_TABLE[job_id]['user_id']))
                except Exception as e:
                    print(e)

        if job_id in JOB_TABLE:
            resume = JOB_TABLE[job_id]['resume']
            res_key = str(job_id) + '_' + str(resume + 1)
            RESOURCES.pop(res_key, None)
        DISK_POOL += (CUMULATIVE_OUTPUT_SIZE[job_id] -
                      baseDriver.get_folder_size(
                          JOB_TABLE[job_id]['job_folder']))
        JOB_TABLE.pop(job_id)
        OUTPUTS.pop(job_id, None)
        OUTPUT_DICT.pop(job_id, None)
        LAST_OUTPUT.pop(job_id, None)
        LAST_OUTPUT_STRING.pop(job_id, None)
        CUMULATIVE_OUTPUT_SIZE.pop(job_id, None)
        LAST_OUTPUT_SUFFIX.pop(job_id, None)
        OUTPUT_DICT_SUFFIX.pop(job_id, None)
Example #3
def finish_job(job_id, error=0):
    """
    Mark a job as finished and release the resources it occupied
    If mail notification is switched on, an e-mail will be sent
    :param job_id: int, job id
    :param error: int, 1 if an error occurred, 0 otherwise
    :return: None
    """
    global DISK_POOL, JOB_TABLE, NEW_FILES, OUTPUTS, OUTPUT_DICT,\
        OUTPUT_SIZE, FOLDER_SIZE_BEFORE, CUMULATIVE_OUTPUT_SIZE,\
        LAST_OUTPUT_STRING, LAST_OUTPUT_SUFFIX, OUTPUT_DICT_SUFFIX
    if job_id in JOB_TABLE:
        if error == 1:
            if settings['mail']['notify'] == 'on':
                try:
                    from notify import MailNotify
                    mail = MailNotify(JOB_TABLE[job_id]['user_id'], 2, job_id,
                                      JOB_TABLE[job_id]['protocol'],
                                      JOB_TABLE[job_id]['input_file'],
                                      JOB_TABLE[job_id]['parameter'])
                    mail.send_mail(
                        mail.get_user_mail_address(
                            JOB_TABLE[job_id]['user_id']))
                except Exception as e:
                    print(e)
        else:
            try:
                job = Queue.objects.get(id=job_id)
                job.status = -1
                job.save()
            except Exception:
                pass  # the queue record may no longer exist
            baseDriver.del_output_dict(job_id)
            if settings['mail']['notify'] == 'on':
                try:
                    from notify import MailNotify
                    mail = MailNotify(JOB_TABLE[job_id]['user_id'], 1, job_id,
                                      JOB_TABLE[job_id]['protocol'],
                                      JOB_TABLE[job_id]['input_file'],
                                      JOB_TABLE[job_id]['parameter'])
                    mail.send_mail(
                        mail.get_user_mail_address(
                            JOB_TABLE[job_id]['user_id']))
                except Exception as e:
                    print(e)

        if job_id in JOB_TABLE:
            resume = JOB_TABLE[job_id]['resume']
            res_key = str(job_id) + '_' + str(resume + 1)
            RESOURCES.pop(res_key, None)
        DISK_POOL += (CUMULATIVE_OUTPUT_SIZE[job_id] -
                      baseDriver.get_folder_size(
                          JOB_TABLE[job_id]['job_folder']))
        JOB_TABLE.pop(job_id)
        OUTPUTS.pop(job_id, None)
        OUTPUT_DICT.pop(job_id, None)
        LAST_OUTPUT.pop(job_id, None)
        LAST_OUTPUT_STRING.pop(job_id, None)
        CUMULATIVE_OUTPUT_SIZE.pop(job_id, None)
        LAST_OUTPUT_SUFFIX.pop(job_id, None)
        OUTPUT_DICT_SUFFIX.pop(job_id, None)
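
finish_job settles disk accounting by comparing the job's cumulative output size against what is actually left in the job folder. baseDriver.get_folder_size is not shown in these examples; below is a plausible stand-in sketch (the real implementation may differ):

import os

def get_folder_size(folder):
    """Recursively sum file sizes in bytes; a hypothetical stand-in
    for baseDriver.get_folder_size."""
    total = 0
    for root, _dirs, files in os.walk(folder):
        for name in files:
            path = os.path.join(root, name)
            if os.path.isfile(path):  # skip broken symlinks
                total += os.path.getsize(path)
    return total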
Example #4
def finish_step(job_id, step_order, resources):
    """
    Mark a step as finished
    :param job_id: int, job id
    :param step_order: int, step order
    :param resources: dictionary, resources required by the step
    :return: None
    """
    global JOB_TABLE, NEW_FILES, OUTPUTS, OUTPUT_DICT, OUTPUT_SIZE, FOLDER_SIZE_BEFORE,\
        CUMULATIVE_OUTPUT_SIZE, LAST_OUTPUT_STRING
    try:
        job = Queue.objects.get(id=job_id)
        job.resume = step_order
        job.status = -2
        job.save()
        JOB_TABLE[job_id]['status'] = -2
        JOB_TABLE[job_id]['resume'] = step_order
        this_output = baseDriver.get_folder_content(
            JOB_TABLE[job_id]['job_folder'])
        NEW_FILES[job_id] = sorted(
            list(set(this_output).difference(set(LAST_OUTPUT[job_id]))))
        NEW_FILES[job_id] = [
            os.path.join(JOB_TABLE[job_id]['job_folder'], file_name)
            for file_name in NEW_FILES[job_id]
        ]
    except Exception as e:
        print(e)

    if job_id in OUTPUTS:
        OUTPUTS[job_id].extend(NEW_FILES[job_id])
    else:
        OUTPUTS[job_id] = NEW_FILES[job_id]

    suffix_dict = build_suffix_dict(NEW_FILES[job_id])

    if job_id in OUTPUT_DICT:
        OUTPUT_DICT[job_id][step_order + 1] = NEW_FILES[job_id]
    else:
        OUTPUT_DICT[job_id] = {step_order + 1: NEW_FILES[job_id]}

    if job_id in OUTPUT_DICT_SUFFIX:
        OUTPUT_DICT_SUFFIX[job_id][step_order + 1] = suffix_dict
    else:
        OUTPUT_DICT_SUFFIX[job_id] = {step_order + 1: suffix_dict}

    LAST_OUTPUT_SUFFIX[job_id] = suffix_dict
    LAST_OUTPUT_STRING[job_id] = ' '.join(NEW_FILES[job_id])
    OUTPUT_SIZE[job_id] = baseDriver.get_folder_size(
        JOB_TABLE[job_id]['job_folder']) - FOLDER_SIZE_BEFORE[job_id]
    CUMULATIVE_OUTPUT_SIZE[job_id] += OUTPUT_SIZE[job_id]

    if 'trace' in resources:
        training_item = Training.objects.get(id=resources['trace'])
        if training_item.cpu != '-' and training_item.mem != '-' \
                and training_item.cpu != '' and training_item.mem != '':
            training_item.output = OUTPUT_SIZE[job_id]
            training_item.lock = 0
            training_item.save()

    if settings['cluster']['type'] == '':
        update_resource_pool(resources)
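
The NEW_FILES bookkeeping works by snapshot-and-diff: run_prepare records the folder listing before the step runs, and finish_step subtracts that snapshot from the listing taken afterwards. A small self-contained demonstration, assuming get_folder_content is a flat directory listing:

import os
import tempfile

def get_folder_content(folder):
    """Assumed stand-in for baseDriver.get_folder_content."""
    return os.listdir(folder)

job_folder = tempfile.mkdtemp()
before = set(get_folder_content(job_folder))   # snapshot taken in run_prepare
open(os.path.join(job_folder, 'result.txt'), 'w').close()  # the step runs
after = set(get_folder_content(job_folder))    # listing taken in finish_step
new_files = sorted(after.difference(before))
new_files = [os.path.join(job_folder, name) for name in new_files]
print(new_files)  # ['/tmp/.../result.txt']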
Example #5
def run_prepare(job_id, job, no_new_learn=0):
    """
    Parse the step's parameter string and predict the resources needed by the step
    :param job_id: int, job id
    :param job: dict, job dict
    :param no_new_learn: int, 1 means refuse to create a new training item
    :return: dict of predicted resources, 'running' if the job is still
             running, or None if every step has finished
    """
    global LAST_OUTPUT_STRING, OUTPUTS, OUTPUT_DICT, OUTPUT_DICT_SUFFIX,\
        NEW_FILES, LAST_OUTPUT, LAST_OUTPUT_SUFFIX
    learning = 0
    outside_size = 0

    if job['status'] == -1 and job['resume'] != -1:
        # skip and resume
        tmp_dict = baseDriver.load_output_dict(job_id)
        if 'LAST_OUTPUT_STRING' in tmp_dict:
            LAST_OUTPUT_STRING[job_id] = tmp_dict['LAST_OUTPUT_STRING']
        if 'OUTPUTS' in tmp_dict:
            OUTPUTS[job_id] = tmp_dict['OUTPUTS']
        if 'OUTPUT_DICT' in tmp_dict:
            OUTPUT_DICT[job_id] = tmp_dict['OUTPUT_DICT']
        if 'OUTPUT_DICT_SUFFIX' in tmp_dict:
            OUTPUT_DICT_SUFFIX[job_id] = tmp_dict['OUTPUT_DICT_SUFFIX']
        if 'NEW_FILES' in tmp_dict:
            NEW_FILES[job_id] = tmp_dict['NEW_FILES']
        if 'LAST_OUTPUT' in tmp_dict:
            LAST_OUTPUT[job_id] = tmp_dict['LAST_OUTPUT']
        if 'LAST_OUTPUT_SUFFIX' in tmp_dict:
            LAST_OUTPUT_SUFFIX[job_id] = tmp_dict['LAST_OUTPUT_SUFFIX']

    if (job['resume'] + 1) == len(job['steps']):
        return None
    elif job['status'] > 0:
        return 'running'
    else:
        step = job['steps'][job['resume'] + 1]['parameter']

    step = step.replace('{Job}', str(job_id))
    step = step.replace('{JobName}', str(JOB_TABLE[job_id]['name']))

    if job_id in LAST_OUTPUT_STRING:
        step = step.replace('{LastOutput}', LAST_OUTPUT_STRING[job_id])
    if job_id in OUTPUTS:
        step = step.replace('{AllOutputBefore}', ' '.join(OUTPUTS[job_id]))
    if job_id in NEW_FILES:
        step = parameterParser.last_output_map(step, NEW_FILES[job_id])
    if job_id in JOB_PARAMETERS:
        step = parameterParser.special_parameter_map(step,
                                                     JOB_PARAMETERS[job_id])
    if job_id in OUTPUT_DICT:
        step = parameterParser.output_file_map(step, OUTPUT_DICT[job_id])
    if job_id in JOB_INPUT_FILES:
        step, outside_size = parameterParser.input_file_map(
            step, JOB_INPUT_FILES[job_id], job['user_folder'])
    if job_id in LAST_OUTPUT_SUFFIX and job_id in OUTPUT_DICT_SUFFIX:
        step = parameterParser.suffix_map(step, OUTPUT_DICT_SUFFIX[job_id],
                                          LAST_OUTPUT_SUFFIX[job_id])
    step = parameterParser.history_map(step, job['user_id'],
                                       job['user_folder'], Queue)

    step, outside_size_upload = parameterParser.upload_file_map(
        step, job['user_folder'])
    outside_size += outside_size_upload
    step = step.replace('{Workspace}', job['job_folder'])
    user_bin_dir = os.path.join(settings['env']['workspace'], job['user_id'],
                                'bin')
    if not os.path.exists(user_bin_dir):
        try:
            os.makedirs(user_bin_dir)
        except OSError:
            pass  # directory may already exist or be unwritable
    step = step.replace('{UserBin}', user_bin_dir)
    if settings['cluster']['type']:
        if settings['cluster'].get('cpu'):
            step = step.replace('{ThreadN}', str(settings['cluster']['cpu']))
        else:
            step = step.replace('{ThreadN}', str(settings['env']['cpu']))
    else:
        step = step.replace('{ThreadN}', str(settings['env']['cpu']))
    JOB_COMMAND[job_id] = parameterParser.parameter_string_to_list(step)
    LAST_OUTPUT[job_id] = baseDriver.get_folder_content(job['job_folder'])
    training_num = get_training_items(job['steps'][job['resume'] + 1]['hash'])
    if training_num < 10:
        learning = 1

    if INPUT_SIZE[job_id] == 0:
        INPUT_SIZE[job_id] = baseDriver.get_folder_size(job['job_folder'])
    else:
        if job_id in OUTPUT_SIZE:
            INPUT_SIZE[job_id] = OUTPUT_SIZE[job_id]
        else:
            INPUT_SIZE[job_id] = 0
    FOLDER_SIZE_BEFORE[job_id] = baseDriver.get_folder_size(job['job_folder'])
    INPUT_SIZE[job_id] += outside_size

    resource_needed = checkPoint.predict_resource_needed(
        job['steps'][job['resume'] + 1]['hash'], INPUT_SIZE[job_id],
        training_num)
    if resource_needed['cpu'] > int(settings['env']['cpu']) * 100:
        resource_needed['cpu'] = int(settings['env']['cpu']) * 95

    if learning == 1 and no_new_learn == 0:
        trace_id = create_machine_learning_item(
            job['steps'][job['resume'] + 1]['hash'], INPUT_SIZE[job_id])
        resource_needed['trace'] = trace_id

    return resource_needed
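
The CPU clamp near the end of this version guards against a runaway prediction: CPU is expressed in percent (100 per core), so a host with N cores can deliver at most N * 100, and predictions above that are capped at N * 95. A worked sketch with made-up numbers:

def clamp_cpu(predicted, cores):
    """Cap a predicted CPU figure (percent, 100 per core) at 95% of the host."""
    if predicted > int(cores) * 100:
        return int(cores) * 95
    return predicted

print(clamp_cpu(900, 4))  # 380: prediction exceeds 400, capped at 4 * 95
print(clamp_cpu(250, 4))  # 250: within the host limit, left unchanged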