Example #1
def _upload_variations_inputs(settings, source_url_initial, values_map):
    bdp_username = settings['bdp_username']
    logger.debug("source_url_initial=%s" % source_url_initial)
    encoded_s_url = storage.get_url_with_credentials(settings, source_url_initial)
    logger.debug("encoded_s_url=%s" % encoded_s_url)

    dest_url = _get_dest_bdp_url(settings)

    computation_platform_url = settings['comp_platform_url']
    bdp_username = settings['bdp_username']
    comp_pltf_settings = manage.get_platform_settings(
        computation_platform_url,
        bdp_username)
    settings.update(comp_pltf_settings)

    encoded_d_url = storage.get_url_with_credentials(settings,
        dest_url, is_relative_path=True, ip_address=settings['host'])

    storage.copy_directories(encoded_s_url, encoded_d_url)

    for content_fname, content in _instantiate_context(
            source_url_initial,
            settings,
            values_map).items():

        content_url = storage.get_url_with_credentials(
            settings,
            os.path.join(dest_url, content_fname),
            is_relative_path=True, ip_address=settings['host'])
        logger.debug("content_url=%s" % content_url)
        storage.put_file(content_url, content.encode('utf-8'))

    _save_values(settings, dest_url, values_map)

    logger.debug("done input upload")
Example #2
def _instantiate_context(source_url, settings, context):

    templ_pat = re.compile("(.*)_template")
    encoded_s_url = storage.get_url_with_credentials(settings,
        source_url, is_relative_path=False)

    logger.debug("encoded_s_url=%s" % encoded_s_url)
    fnames = storage.list_dirs(encoded_s_url, list_files=True)

    logger.debug("fnames=%s" % fnames)
    new_content = {}
    for fname in fnames:
        logger.debug("fname=%s" % fname)
        templ_mat = templ_pat.match(fname)
        if templ_mat:
            base_fname = templ_mat.group(1)
            basename_url_with_pkey = storage.get_url_with_credentials(
                settings,
                os.path.join(
                    source_url,
                    fname),
                is_relative_path=False)
            logger.debug("basename_url_with_pkey=%s" % basename_url_with_pkey)
            cont = storage.get_file(basename_url_with_pkey)
            try:
                t = Template(cont)
            except TemplateSyntaxError as e:
                logger.error(e)
                #FIXME: should detect this during submission of job,
                #as no sensible way to recover here.
                #TODO: signal error conditions in job status
                continue
            con = Context(context)
            logger.debug("context=%s" % context)
            new_content[base_fname] = t.render(con)
Example #3
        def copy_files_with_pattern(iter_out_fsys, source_path, dest_path,
                                    pattern, all_settings):
            """
            """
            output_prefix = '%s://%s@' % (all_settings['scheme'],
                                          all_settings['type'])

            logger.debug('source_path=%s, dest_path=%s' %
                         (source_path, dest_path))
            # (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(source_path)
            _, node_output_fnames = iter_out_fsys.listdir(source_path)
            ip_address = all_settings['ip_address']
            for f in node_output_fnames:
                if fnmatch.fnmatch(f, pattern):
                    source_url = get_url_with_credentials(
                        all_settings,
                        output_prefix +
                        os.path.join(ip_address, source_path, f),
                        is_relative_path=False)
                    dest_url = get_url_with_credentials(
                        all_settings,
                        output_prefix + os.path.join(ip_address, dest_path, f),
                        is_relative_path=False)
                    logger.debug('source_url=%s, dest_url=%s' %
                                 (source_url, dest_url))
                    content = storage.get_file(source_url)
                    storage.put_file(dest_url, content)
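copy_files_with_pattern filters filenames with fnmatch, i.e. shell-style wildcards rather than regular expressions. A quick standalone check of that matching behaviour (standard library only, file names chosen purely for illustration):

import fnmatch

print(fnmatch.fnmatch('grerr03.dat', 'grerr*.dat'))   # True
print(fnmatch.fnmatch('output.log', 'grerr*.dat'))    # False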
Example #4
    def copy_to_scratch_space(self, run_settings, local_settings):
        bdp_username = run_settings[
            'http://rmit.edu.au/schemas/bdp_userprofile']['username']
        output_storage_url = run_settings[
            'http://rmit.edu.au/schemas/platform/storage/output'][
                'platform_url']
        output_storage_settings = manage.get_platform_settings(
            output_storage_url, bdp_username)

        run_settings['http://rmit.edu.au/schemas/platform/storage/output'][
            'offset'] = self.output_loc_offset
        offset = run_settings[
            'http://rmit.edu.au/schemas/platform/storage/output']['offset']
        self.job_dir = manage.get_job_dir(output_storage_settings, offset)
        iter_inputdir = os.path.join(self.job_dir, "input_0")
        logger.debug("iter_inputdir=%s" % iter_inputdir)

        input_location = run_settings[RMIT_SCHEMA +
                                      '/input/system']['input_location']
        logger.debug("input_location=%s" % input_location)
        #todo: input location will eventually be replaced by the scratch space that was used by the sweep
        #todo: the sweep will indicate the location of the scratch space in the run_settings
        #todo: add scheme (ssh) to inputlocation
        source_url = get_url_with_credentials(local_settings, input_location)
        logger.debug("source_url=%s" % source_url)

        destination_url = get_url_with_credentials(
            output_storage_settings,
            '%s://%s@%s' % (output_storage_settings['scheme'],
                            output_storage_settings['type'], iter_inputdir),
            is_relative_path=False)
        logger.debug("destination_url=%s" % destination_url)
        storage.copy_directories(source_url, destination_url)
Example #5
    def _copy_previous_inputs(self, local_settings, output_storage_settings,
                              computation_platform_settings):
        output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                      output_storage_settings['type'])
        for proc in self.ready_processes:
            source_location = os.path.join(self.job_dir, "input_backup",
                                           proc['id'])
            source_files_url = get_url_with_credentials(
                output_storage_settings,
                output_prefix + source_location,
                is_relative_path=False)
            relative_path_suffix = self.get_relative_output_path(
                local_settings)
            #dest_files_location = computation_platform_settings['type'] + "@"\
            #                      + os.path.join(
            #    local_settings['payload_destination'],
            #    proc['id'], local_settings['payload_cloud_dirname'])
            dest_files_location = computation_platform_settings['type'] + "@"\
                                  + os.path.join(relative_path_suffix,
                proc['id'], local_settings['payload_cloud_dirname'])
            logger.debug('dest_files_location=%s' % dest_files_location)

            dest_files_url = get_url_with_credentials(
                computation_platform_settings,
                dest_files_location,
                is_relative_path=True,
                ip_address=proc['ip_address'])
            logger.debug('dest_files_url=%s' % dest_files_url)
            storage.copy_directories(source_files_url, dest_files_url)
Example #6
    def _get_output(self, local_settings, source_url):
        """
            Retrieve the output from the task on the node
        """
        logger.debug("get_output from %s" % source_url)

        computation_platform_url = local_settings['comp_platform_url']
        bdp_username = local_settings['bdp_username']
        comp_pltf_settings = manage.get_platform_settings(
            computation_platform_url, bdp_username)
        local_settings.update(comp_pltf_settings)

        encoded_s_url = storage.get_url_with_credentials(
            local_settings,
            source_url,
            is_relative_path=True,
            ip_address=local_settings['host'])

        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_s_url)
        make_path = os.path.join(query_settings['root_path'], mypath)
        logger.debug("make_path=%s" % make_path)

        output_storage_url = local_settings['storeout_platform_url']
        logger.debug("output_storage_url=%s" % output_storage_url)
        output_storage_settings = manage.get_platform_settings(
            output_storage_url, bdp_username)
        local_settings.update(output_storage_settings)
        logger.debug("output_storage_settings=%s" % output_storage_settings)

        dest_url = '%s://%s@%s/%s/make%s' % (
            output_storage_settings['scheme'], output_storage_settings['type'],
            output_storage_settings['host'],
            local_settings['storeout_platform_offset'],
            str(local_settings['contextid']))

        logger.debug("Transferring output from %s to %s" %
                     (source_url, dest_url))
        local_settings.update(output_storage_settings)
        encoded_d_url = storage.get_url_with_credentials(
            local_settings, dest_url)
        logger.debug("encoded_d_url=%s" % encoded_d_url)
        # FIXME: might want to turn on paramiko compress function
        #storage_files(encoded_d_url, exceptions=[])
        # to speed up this transfer
        try:
            storage.copy_directories(encoded_s_url, encoded_d_url)
        except SSHException as e:
            logger.error(e)
            # FIXME: Could just exit, but need to flag that this data has not
            # been transferred.
            raise
Example #7
def start_multi_bootstrap_task(settings, relative_path_suffix):
    """
    Run the package on each of the nodes in the group and grab
    any output as needed
    """
    nodes = get_registered_vms(settings)
    logger.debug("nodes=%s" % nodes)
    requested_nodes = 0
    maketarget_nodegroup_pair = {}

    # TODO: need testcases for following code
    if not maketarget_nodegroup_pair:
        EMPTY_MAKE_TARGET = ''
        requested_nodes = len(nodes)
        maketarget_nodegroup_pair[EMPTY_MAKE_TARGET] = requested_nodes
    else:
        for i in maketarget_nodegroup_pair.keys():
            requested_nodes += maketarget_nodegroup_pair[i]
        if requested_nodes > len(nodes):
            message = "Requested nodes %d; but available nodes %s " \
                % (requested_nodes, len(nodes))
            logger.exception(message)
            raise InsufficientResourceError(message)
    logger.info("Requested nodes %d: \nAvailable nodes %s " %
                (requested_nodes, len(nodes)))

    logger.debug('starting setup')
    for make_target in maketarget_nodegroup_pair:
        for i in range(0, maketarget_nodegroup_pair[make_target]):
            instance = nodes[0]
            node_ip = instance.ip_address
            if not node_ip:
                node_ip = instance.private_ip_address
            logger.debug("node_ip=%s" % node_ip)
            logger.debug('constructing source')
            source = get_url_with_credentials(settings,
                                              settings['payload_source'])
            logger.debug('source=%s' % source)
            #relative_path = '%s@%s' % (settings['type'], settings['payload_destination'])
            relative_path = '%s@%s' % (settings['type'], relative_path_suffix)
            destination = get_url_with_credentials(settings,
                                                   relative_path,
                                                   is_relative_path=True,
                                                   ip_address=node_ip)
            logger.debug("Source %s" % source)
            logger.debug("Destination %s" % destination)
            logger.debug("Relative path %s" % relative_path)
            _start_bootstrap(instance, node_ip, settings, source, destination)
            nodes.pop(0)
Example #8
    def run_task(self, ip_address, process_id, settings, run_settings):
        """
            Start the task on the instance, then hang and
            periodically check its state.
        """
        logger.debug("run_task %s" % ip_address)
        #ip = botocloudconnector.get_instance_ip(instance_id, settings)
        #ip = ip_address
        logger.debug("ip=%s" % ip_address)
        # curr_username = settings['username']
        #settings['username'] = '******'
        # ssh = sshconnector.open_connection(ip_address=ip,
        #                                    settings=settings)
        # settings['username'] = curr_username

        #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
        relative_path_suffix = self.get_relative_output_path(settings)
        relative_path = settings['type'] + '@' + os.path.join(relative_path_suffix, process_id)
        destination = get_url_with_credentials(settings,
            relative_path,
            is_relative_path=True,
            ip_address=ip_address)
        makefile_path = get_make_path(destination)
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command, errs = run_make(ssh, makefile_path, 'start_running_process')
            logger.debug('execute_command=%s' % command)
        finally:
            ssh.close()
Example #9
 def prepare_inputs(self, local_settings, output_storage_settings,
                     computation_platform_settings, mytardis_settings, run_settings):
     """
     Upload all input files for this run
     """
     logger.debug("preparing inputs")
     # TODO: to ensure reproducibility, may want to precalculate all random numbers and
     # store rather than rely on canonical execution of rest of this function.
     #processes = self.schedule_procs
     processes = [x for x in self.schedule_procs
                 if x['status'] == 'ready']
     self.node_ind = 0
     logger.debug("Iteration Input dir %s" % self.iter_inputdir)
     output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                             output_storage_settings['type'])
     url_with_pkey = get_url_with_credentials(
         output_storage_settings, output_prefix + self.iter_inputdir, is_relative_path=False)
     logger.debug("url_with_pkey=%s" % url_with_pkey)
     input_dirs = list_dirs(url_with_pkey)
     if not input_dirs:
         raise BadInputException("require an initial subdirectory of input directory")
     for input_dir in sorted(input_dirs):
         logger.debug("Input dir %s" % input_dir)
         self.upload_variation_inputs(
             run_settings, local_settings, self.generate_variations(
                 input_dir, local_settings, output_storage_settings, run_settings),
             processes, input_dir, output_storage_settings,
             computation_platform_settings, mytardis_settings)
Example #10
def start_round_robin_reschedule(nodes, procs_2b_rescheduled, current_procs,
                                 settings, output_storage_settings,
                                 relative_path_suffix):
    total_nodes = len(nodes)
    all_nodes = list(nodes)
    processes = len(procs_2b_rescheduled)
    if total_nodes > processes:
        total_nodes = processes
        all_nodes = nodes[:total_nodes]
    if total_nodes == 0:
        return
    proc_per_node = processes // total_nodes
    remaining_procs = processes % total_nodes
    index = 0
    new_processes = current_procs
    rescheduled_procs = list(procs_2b_rescheduled)
    for cur_node in all_nodes:
        logger.debug('Schedule here %s' % cur_node)
        ip_address = cur_node.ip_address
        if not ip_address:
            ip_address = cur_node.private_ip_address
        logger.debug('ip_address=%s' % ip_address)
        #relative_path = output_storage_settings['type'] + '@' + settings['payload_destination']
        relative_path = output_storage_settings[
            'type'] + '@' + relative_path_suffix
        procs_on_cur_node = proc_per_node
        if remaining_procs:
            procs_on_cur_node = proc_per_node + 1
            remaining_procs -= 1
        logger.debug('procs_cur_node=%d' % procs_on_cur_node)
        ids = get_procs_ids(procs_on_cur_node,
                            rescheduled_procs=rescheduled_procs)
        #index += len(ids)
        #logger.debug('index=%d' % index)
        put_proc_ids(relative_path, ids, ip_address, settings)
        new_processes = construct_lookup_table(ids,
                                               ip_address,
                                               new_processes,
                                               status='reschedule_ready',
                                               maximum_retry=int(
                                                   settings['maximum_retry']))
        destination = get_url_with_credentials(settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=ip_address)
        logger.debug('schedule destination=%s' % destination)
        makefile_path = get_make_path(destination)
        logger.debug('makefile_path=%s' % makefile_path)
        command = "cd %s; make %s" % (
            makefile_path, 'start_schedule PAYLOAD_NAME=%s IDS=%s' %
            (settings['payload_name'], settings['filename_for_PIDs']))
        command_out = ''
        errs = ''
        logger.debug("starting command for %s" % ip_address)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command_out, errs = run_command_with_status(ssh, command)
        except Exception as e:
            logger.error(e)
        finally:
            # close the connection whether or not the schedule command succeeded
            if ssh:
                ssh.close()
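The split of processes across nodes above is plain integer arithmetic: every node receives proc_per_node = processes // total_nodes process ids, and the first processes % total_nodes nodes receive one extra. A minimal sketch with made-up numbers:

processes, total_nodes = 7, 3
proc_per_node = processes // total_nodes    # 2
remaining_procs = processes % total_nodes   # 1
# the first node gets 3 process ids, the remaining two nodes get 2 each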
Example #11
 def get_total_templates(self, maps, **kwargs):
     run_settings = kwargs['run_settings']
     output_storage_settings = kwargs['output_storage_settings']
     job_dir = kwargs['job_dir']
     try:
         id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
     except (SettingNotFoundException, ValueError) as e:
         logger.debug(e)
         id = 0
     iter_inputdir = os.path.join(job_dir, "input_%s" % id)
     url_with_pkey = get_url_with_credentials(
         output_storage_settings,
         '%s://%s@%s' % (output_storage_settings['scheme'],
                         output_storage_settings['type'], iter_inputdir),
         is_relative_path=False)
     logger.debug(url_with_pkey)
     input_dirs = list_dirs(url_with_pkey)
     for iter, template_map in enumerate(maps):
         logger.debug("template_map=%s" % template_map)
         map_keys = template_map.keys()
         logger.debug("map_keys %s" % map_keys)
         map_ranges = [list(template_map[x]) for x in map_keys]
         product = 1
         for i in map_ranges:
             product = product * len(i)
         total_templates = product * len(input_dirs)
         logger.debug("total_templates=%d" % (total_templates))
     return total_templates
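The count above is the product of the sizes of all value ranges in a template map, multiplied by the number of input directories. A hypothetical map, just to illustrate the arithmetic:

template_map = {'a': [1, 2, 3], 'b': [0.1, 0.2]}   # 3 * 2 = 6 combinations
input_dirs = ['input_0/dir1', 'input_0/dir2']      # 2 input directories
# product = 6, so total_templates = 6 * 2 = 12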
Example #12
    def run_task(self, ip_address, process_id, settings, run_settings):
        """
            Start the task on the instance, then hang and
            periodically check its state.
        """
        logger.debug("run_task %s" % ip_address)
        #ip = botocloudconnector.get_instance_ip(instance_id, settings)
        #ip = ip_address
        logger.debug("ip=%s" % ip_address)
        # curr_username = settings['username']
        #settings['username'] = '******'
        # ssh = sshconnector.open_connection(ip_address=ip,
        #                                    settings=settings)
        # settings['username'] = curr_username

        #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
        relative_path_suffix = self.get_relative_output_path(settings)
        relative_path = settings['type'] + '@' + \
            os.path.join(relative_path_suffix, process_id)
        destination = get_url_with_credentials(settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=ip_address)
        makefile_path = get_make_path(destination)
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            logger.debug(settings['process_output_dirname'])
            try:
                self.hadoop_input = 'HADOOP_INPUT_%s' % self.contextid
                self.hadoop_output = 'HADOOP_OUTPUT_%s' % self.contextid
                hadoop = run_settings['%s/input/system/compplatform/hadoop' %
                                      django_settings.SCHEMA_PREFIX]
                sudo = False
                options = '%s %s  %s %s %s ' % (
                    settings['smart_connector_input'],
                    settings['process_output_dirname'],
                    settings['hadoop_home_path'], self.hadoop_input,
                    self.hadoop_output)
                logger.debug('options = %s ' % options)
                optional_args = self.get_optional_args(run_settings)
                if optional_args:
                    options += " %s" % optional_args
                logger.debug('options = %s ' % options)
                command, errs = run_make(ssh,
                                         makefile_path,
                                         'start_running_process  %s' % options,
                                         sudo=sudo)
            except KeyError:
                sudo = True
                command, errs = run_make(ssh,
                                         makefile_path,
                                         'start_running_process %s %s' %
                                         (settings['smart_connector_input'],
                                          settings['process_output_dirname']),
                                         sudo=sudo)
            logger.debug('execute_command=%s' % command)
        finally:
            ssh.close()
Example #13
    def copy_to_scratch_space(self, run_settings, local_settings, result_offset):
        bdp_username = run_settings['%s/bdp_userprofile' % django_settings.SCHEMA_PREFIX]['username']
        output_storage_url = run_settings['%s/platform/storage/output' % django_settings.SCHEMA_PREFIX]['platform_url']
        output_storage_settings = manage.get_platform_settings(output_storage_url, bdp_username)

        run_settings['%s/platform/storage/output' % django_settings.SCHEMA_PREFIX]['offset'] = self.output_loc_offset
        offset = run_settings['%s/platform/storage/output' % django_settings.SCHEMA_PREFIX]['offset']
        self.job_dir = manage.get_job_dir(output_storage_settings, offset)
        iter_inputdir = os.path.join(self.job_dir, result_offset)
        logger.debug("iter_inputdir=%s" % iter_inputdir)

        input_storage_settings = self.get_platform_settings(run_settings, '%s/platform/storage/input' % django_settings.SCHEMA_PREFIX)
        #input_location = run_settings[django_settings.SCHEMA_PREFIX + '/input/system']['input_location']

        try:
            input_location = getval(run_settings, django_settings.SCHEMA_PREFIX + '/input/system/input_location')
        except SettingNotFoundException:
            try:
                input_location = getval(run_settings, django_settings.SCHEMA_PREFIX + '/input/location/input_location')
            except:
                input_location = getval(run_settings, django_settings.SCHEMA_PREFIX + '/input/location/input/input_location')
        logger.debug("input_location=%s" % input_location)
        #todo: input location will eventually be replaced by the scratch space that was used by the sweep
        #todo: the sweep will indicate the location of the scratch space in the run_settings
        #todo: add scheme (ssh) to inputlocation

        #source_url = get_url_with_credentials(local_settings, input_location)

        input_offset = run_settings['%s/platform/storage/input' % django_settings.SCHEMA_PREFIX]['offset']
        input_url = "%s://%s@%s/%s" % (input_storage_settings['scheme'],
                                       input_storage_settings['type'],
                                       input_storage_settings['host'], input_offset)
        source_url = get_url_with_credentials(
            input_storage_settings, input_url, is_relative_path=False)

        logger.debug("source_url=%s" % source_url)

        destination_url = get_url_with_credentials(
            output_storage_settings,
            '%s://%s@%s' % (output_storage_settings['scheme'],
                             output_storage_settings['type'],
                             iter_inputdir),
            is_relative_path=False)
        logger.debug("destination_url=%s" % destination_url)
        storage.copy_directories(source_url, destination_url)
Example #14
    def get_output(self, ip_address, process_id, output_dir, local_settings,
                   computation_platform_settings, output_storage_settings,
                   run_settings):
        """
            Retrieve the output from the task on the node
        """

        logger.debug("get_output of process %s on %s" %
                     (process_id, ip_address))
        output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                      output_storage_settings['type'])
        #fixme: add call get_process_output_path
        #cloud_path = os.path.join(local_settings['payload_destination'],
        #                          #str(contextid), #fixme: uncomment
        #                          str(process_id),
        #                          local_settings['payload_cloud_dirname']
        #                          )
        relative_path_suffix = self.get_relative_output_path(local_settings)
        cloud_path = os.path.join(relative_path_suffix, str(process_id),
                                  local_settings['payload_cloud_dirname'])
        #cloud_path = self.get_process_output_path(run_settings, process_id)
        logger.debug("cloud_path=%s" % cloud_path)
        logger.debug("Transferring output from %s to %s" %
                     (cloud_path, output_dir))
        ip = ip_address  # botocloudconnector.get_instance_ip(instance_id, settings)
        #ssh = open_connection(ip_address=ip, settings=settings)
        source_files_location = "%s://%s@%s" % (
            computation_platform_settings['scheme'],
            computation_platform_settings['type'], os.path.join(
                ip, cloud_path))
        source_files_url = get_url_with_credentials(
            computation_platform_settings,
            source_files_location,
            is_relative_path=False)
        logger.debug('source_files_url=%s' % source_files_url)

        dest_files_url = get_url_with_credentials(
            output_storage_settings,
            output_prefix +
            os.path.join(self.job_dir, self.output_dir, process_id),
            is_relative_path=False)
        logger.debug('dest_files_url=%s' % dest_files_url)
        # FIXME: might want to turn on paramiko compress function
        # to speed up this transfer
        storage.copy_directories(source_files_url, dest_files_url)
Example #15
    def process(self, run_settings):
        self.experiment_id = 0
        local_settings = setup_settings(run_settings)
        self.experiment_id = local_settings['experiment_id']
        messages.info(run_settings, "1: waiting for completion")
        logger.debug("settings=%s" % local_settings)

        try:
            self.runs_left = ast.literal_eval(
                getval(run_settings, '%s/stages/make/runs_left' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            self.runs_left = []

        # if self._exists(run_settings,
        #     'http://rmit.edu.au/schemas/stages/make',
        #     u'runs_left'):
        #     self.runs_left = ast.literal_eval(
        #         run_settings['http://rmit.edu.au/schemas/stages/make'][u'runs_left'])
        # else:
        #     self.runs_left = []

        def _get_dest_bdp_url(local_settings):
            return "%s@%s" % ("nci",
                              os.path.join(
                                  local_settings['payload_destination'],
                                  str(local_settings['contextid'])))

        dest_url = _get_dest_bdp_url(local_settings)
        computation_platform_url = local_settings['comp_platform_url']
        bdp_username = local_settings['bdp_username']
        comp_pltf_settings = manage.get_platform_settings(
            computation_platform_url, bdp_username)
        local_settings.update(comp_pltf_settings)

        encoded_d_url = storage.get_url_with_credentials(
            local_settings,
            dest_url,
            is_relative_path=True,
            ip_address=local_settings['host'])

        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_d_url)

        if self.runs_left:
            job_finished = self._job_finished(settings=local_settings,
                                              remote_path=dest_url)

            if not job_finished:
                return

            self._get_output(local_settings, dest_url)
            self.runs_left -= 1

        if self.runs_left <= 0:
            messages.success(run_settings, "%s: finished" % (1))

        logger.debug("processing finished")
Example #16
def complete_bootstrap(bootstrap_class, local_settings):
    try:

        nodes = ast.literal_eval(local_settings['created_nodes'])
        logger.debug("nodes=%s" % nodes)

        running_created_nodes = [
            x for x in bootstrap_class.created_nodes if str(x[3]) == 'running'
        ]
        if len(nodes) < len(running_created_nodes):
            raise VMTerminatedError
    except NoRegisteredVMError as e:
        logger.debug('NoRegisteredVMError detected')
        ftmanager = FTManager()
        ftmanager.manage_failure(e,
                                 stage_class=bootstrap_class,
                                 settings=local_settings)
    except VMTerminatedError as e:
        logger.debug('VMTerminatedError detected')
        ftmanager = FTManager()
        ftmanager.manage_failure(e,
                                 stage_class=bootstrap_class,
                                 settings=local_settings)
    for node in nodes:
        node_ip = node[1]
        if (node_ip in [
                x[1] for x in bootstrap_class.bootstrapped_nodes
                if x[1] == node_ip
        ]):
            continue
        relative_path_suffix = bootstrap_class.get_relative_output_path(
            local_settings)
        relative_path = "%s@%s" % (local_settings['type'],
                                   relative_path_suffix)
        destination = get_url_with_credentials(local_settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=node_ip)
        logger.debug("Relative path %s" % relative_path)
        logger.debug("Destination %s" % destination)
        try:
            fin = _is_bootstrap_complete(node_ip, local_settings, destination)
        except IOError as e:
            logger.error(e)
            fin = False
        except Exception as e:
            logger.error(e)
            fin = False
            ftmanager = FTManager()
            ftmanager.manage_failure(e,
                                     stage_class=bootstrap_class,
                                     vm_ip=node_ip,
                                     vm_id=node[0],
                                     settings=local_settings)
Example #17
def start_round_robin_schedule(nodes, processes, schedule_index, settings, relative_path_suffix):
    total_nodes = len(nodes)
    all_nodes = list(nodes)
    if total_nodes > processes:
        total_nodes = processes
        all_nodes = nodes[:total_nodes]
    if total_nodes == 0:
        return
    proc_per_node = processes // total_nodes
    remaining_procs = processes % total_nodes
    index = schedule_index
    new_processes = []

    for cur_node in all_nodes:
        ip_address = cur_node[1]
        #relative_path = settings['type'] + '@' + settings['payload_destination']
        relative_path = settings['type'] + '@' + relative_path_suffix
        procs_on_cur_node = proc_per_node
        if remaining_procs:
            procs_on_cur_node = proc_per_node + 1
            remaining_procs -= 1
        logger.debug('procs_cur_node=%d' % procs_on_cur_node)
        ids = get_procs_ids(procs_on_cur_node, index=index)
        index += len(ids)
        logger.debug('index=%d' % index)
        put_proc_ids(relative_path, ids, ip_address, settings)
        new_processes = construct_lookup_table(
            ids, ip_address, new_processes,
            maximum_retry=int(settings['maximum_retry']))

        destination = get_url_with_credentials(
            settings,
            relative_path,
            is_relative_path=True,
            ip_address=ip_address)
        logger.debug('schedule destination=%s' % destination)
        makefile_path = get_make_path(destination)
        logger.debug('makefile_path=%s' % makefile_path)

        command = "cd %s; make %s" % (makefile_path,
            'start_schedule %s %s %s %s' % (
            settings['payload_name'], settings['filename_for_PIDs'], settings['process_output_dirname'],
            settings['smart_connector_input']))

        command_out = ''
        errs = ''
        logger.debug("starting command for %s" % ip_address)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command_out, errs = run_command_with_status(ssh, command)
        except Exception as e:
            logger.error(e)
        finally:
            # close the connection whether or not the schedule command succeeded
            if ssh:
                ssh.close()
Example #18
def _load_values_map(settings, url):
    values = {}
    try:
        enc_url = storage.get_url_with_credentials(
            settings,
            "%s/%s" % (url, VALUES_FNAME))
        logger.debug("values_file=%s" % enc_url)
        values_content = storage.get_file(enc_url)
    except IOError:
        logger.warn("no values file found")
    else:
        logger.debug("values_content = %s" % values_content)
        values = dict(json.loads(values_content))
    return values
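_load_values_map is the read half of the values-file handling; Example #1 calls a _save_values helper that is not shown on this page. A minimal sketch of what such a writer could look like, assuming the same VALUES_FNAME constant and the storage.get_url_with_credentials/storage.put_file calls used above (an illustration only, not the library's actual implementation):

def _save_values(settings, url, values_map):
    # serialise the values map as JSON next to the other inputs
    enc_url = storage.get_url_with_credentials(
        settings, "%s/%s" % (url, VALUES_FNAME))
    storage.put_file(enc_url, json.dumps(values_map).encode('utf-8'))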
Example #19
def put_proc_ids(relative_path, ids, ip, settings):
    relative_path = os.path.join(relative_path, settings['filename_for_PIDs'])
    logger.debug('put_proc_ids=%s' % relative_path)
    destination = get_url_with_credentials(settings,
                                           relative_path,
                                           is_relative_path=True,
                                           ip_address=ip)
    logger.debug('destination=%s' % destination)
    ids_str = [str(i) for i in ids]
    proc_ids = ("\n".join(ids_str)) + "\n"
    logger.debug('ids_str=%s' % ids_str)
    logger.debug('proc_ids=%s' % proc_ids)
    logger.debug('encoded=%s' % proc_ids.encode('utf-8'))
    put_file(destination, proc_ids.encode('utf-8'))
Example #20
                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)

                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)

                    source_url = get_url_with_credentials(
                        settings, os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError as e:
                        logger.warn("cannot read file %s" % e)
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
Example #21
                def put_dest_file(proc, fname, dest_file_location,
                                  resched_file_location, content):
                    dest_url = get_url_with_credentials(
                        computation_platform_settings,
                        os.path.join(dest_file_location, fname),
                        is_relative_path=True,
                        ip_address=proc['ip_address'])
                    logger.debug("writing to =%s" % dest_url)
                    #logger.debug("content=%s" % content)
                    storage.put_file(dest_url, content)
                    if self.reschedule_failed_procs:
                        logger.debug("resched=%s" % resched_file_location)
                        logger.debug("fname=%s" % fname)
                        logger.debug("output_storage_settings=%s" %
                                     output_storage_settings)

                        logger.debug("here")
                        test = "%s/%s" % (resched_file_location, fname)
                        logger.debug("test=%s" % test)
                        resched_url = get_url_with_credentials(
                            output_storage_settings, test)
                        logger.debug("writing backup to %s" % resched_url)
                        storage.put_file(resched_url, content)
                    logger.debug("done")
Example #22
        def _get_dataset_name_for_input(settings, url, path):
            logger.debug("path=%s" % path)
            source_url = get_url_with_credentials(
                output_storage_settings,
                output_prefix + os.path.join(output_host, path, self.VALUES_FNAME),
                is_relative_path=False)
            logger.debug("source_url=%s" % source_url)
            try:
                content = get_file(source_url)
            except IOError:
                return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

            logger.debug("content=%s" % content)
            try:
                values_map = dict(json.loads(str(content)))
            except Exception as e:
                logger.warn("cannot load %s: %s" % (content, e))
                return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
Example #23
def generate_rands(settings, start_range, end_range, num_required,
                   start_index):
    # FIXME: there must be a third party library that does this more
    # effectively.
    rand_nums = []
    num_url = get_url_with_credentials(settings,
                                       settings['random_numbers'],
                                       is_relative_path=False)
    random_content = get_file(num_url)
    # FIXME: this loads the entire file, which could be very large.
    numbers = random_content.split('\n')

    random_counter = start_index
    # FIXME: better handled with separate function
    if end_range < start_range:
        # special case, where we want rands in range of number of rands in file
        start_range = 0
        end_range = len(numbers)

    for i in range(0, num_required):

        raw_num = float(numbers[random_counter])
        num = int((raw_num * float(end_range - start_range)) + start_range)

        rand_nums.append(num)
        logger.debug("[0,1) %s -> [%s,%s) %s" %
                     (raw_num, start_range, end_range, num))

        random_counter += 1
        if random_counter >= len(numbers):
            random_counter = 0

    # for i, line in enumerate(random_content.split('\n')):
    #     if start_index <= i < (start_index + num_required):
    #         raw_num = float(line)
    #         num = int((raw_num * float(end_range - start_range)) + start_range)
    #         logger.debug("[0,1) %s -> [%s,%s) %s" % (raw_num, start_range, end_range, num))
    #         rand_nums.append(num)

    logger.debug(
        "Generated %s random numbers from %s in range [%s, %s): %s " %
        (num_required, num_url, start_range, end_range, pformat(rand_nums)))
    return rand_nums
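generate_rands maps pre-generated uniform numbers in [0, 1) onto the integer range [start_range, end_range). A usage sketch, assuming settings['random_numbers'] points at a newline-separated file of such floats:

# e.g. with a file containing the lines 0.11, 0.95, 0.50, ...
rands = generate_rands(settings, start_range=0, end_range=10,
                       num_required=3, start_index=0)
# 0.11 -> 1, 0.95 -> 9, 0.50 -> 5, so rands == [1, 9, 5]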
Example #24
    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
        all_settings):

        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)

        logger.debug("output_url=%s" % output_url)
        output_settings = self.get_platform_settings(run_settings, RMIT_SCHEMA + '/platform/storage/output')
        current_output_url = "%s://%s@%s/%s" % (scheme, output_settings['type'], host,
            os.path.join(mypath, '1/'))
        logger.debug('current-dest=%s' % current_output_url)
        outcar_url = storage.get_url_with_credentials(
            output_settings, current_output_url + self.OUTCAR_FILE, is_relative_path=False)
        logger.debug("outcar_url=%s" % outcar_url)

        try:
            outcar_content = storage.get_file(outcar_url)
        except IOError as e:
            logger.error(e)
            toten = None
Example #25
    def _job_finished(self, settings, remote_path):

        encoded_d_url = storage.get_url_with_credentials(
            settings=settings,
            url_or_relative_path=remote_path,
            is_relative_path=True,
            ip_address=settings['host'])

        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_d_url)
        stdout = ''
        stderr = ''

        try:
            ssh = open_connection(ip_address=host, settings=settings)
            (stdout, stderr) = compute.run_make(
                ssh, (os.path.join(query_settings['root_path'], mypath)),
                'running')
        except Exception as e:
            logger.error(e)
            raise
Example #26
 def compute_hrmc_criterion(self, number, node_output_dir, fs,
                            output_storage_settings):
     output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                   output_storage_settings['type'])
     grerr_file = 'grerr%s.dat' % str(number).zfill(2)
     logger.debug("grerr_file=%s " % grerr_file)
     grerr_url = get_url_with_credentials(
         output_storage_settings,
         output_prefix + os.path.join(self.output_dir, node_output_dir,
                                      'grerr%s.dat' % str(number).zfill(2)),
         is_relative_path=False)
     grerr_content = storage.get_file(
         grerr_url)  # FIXME: check that get_file can raise IOError
     if not grerr_content:
         logger.warn("no gerr content found")
     logger.debug("grerr_content=%s" % grerr_content)
     try:
         criterion = float(grerr_content.strip().split('\n')[-1].split()[1])
     except ValueError as e:
         logger.warn("invalid criteron found in grerr " +
                     "file for  %s/%s: %s" %
                     (self.output_dir, node_output_dir, e))
     logger.debug("criterion=%s" % criterion)
     return criterion
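The criterion is read as the second whitespace-separated column of the last line of the grerr file. A small illustration with made-up file content:

grerr_content = "1 0.90\n2 0.75\n3 0.42\n"
criterion = float(grerr_content.strip().split('\n')[-1].split()[1])  # 0.42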
Example #27
    def process(self, run_settings):
        settings = setup_settings(run_settings)
        messages.info(run_settings, "1: execute starting")

        def _get_dest_bdp_url(settings):
            return "%s@%s" % (
                    "nci",
                    os.path.join(settings['payload_destination'],
                                 str(settings['contextid'])))

        dest_url = _get_dest_bdp_url(settings)
        computation_platform_url = settings['comp_platform_url']
        bdp_username = settings['bdp_username']
        comp_pltf_settings = manage.get_platform_settings(
            computation_platform_url,
            bdp_username)
        logger.debug("comp_pltf_settings=%s" % pformat(comp_pltf_settings))
        settings.update(comp_pltf_settings)
        encoded_d_url = storage.get_url_with_credentials(
            settings,
            dest_url,
            is_relative_path=True,
            ip_address=settings['host'])
        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_d_url)
        stderr = ''
        try:
            ssh = open_connection(
                ip_address=settings['host'],
                settings=settings)
            (command_out, stderr) = compute.run_make(ssh, (os.path.join(
                    query_settings['root_path'],
                    mypath)), 'startrun')
        except Exception as e:
            logger.error(e)
            raise
        mytardis_settings = _get_mytardis_settings(local_settings,
                                                   bdp_username)
        logger.debug(mytardis_settings)

        if local_settings['curate_data']:
            if mytardis_settings['mytardis_host']:

                if directive == "vasp":

                    # TODO: this is very domain specific

                    OUTCAR_FILE = "OUTCAR"
                    VALUES_FILE = "values"

                    outcar_url = storage.get_url_with_credentials(
                        local_settings,
                        os.path.join(dest_url, OUTCAR_FILE),
                        is_relative_path=False)
                    logger.debug("outcar_url=%s" % outcar_url)

                    try:
                        outcar_content = storage.get_file(outcar_url)
                    except IOError as e:
                        logger.error(e)
                        toten = None
                    else:
                        toten = None
                        for line in outcar_content.split('\n'):
                            #logger.debug("line=%s" % line)
                            if 'e  en' in line:
                                logger.debug("found")
                                try:
Example #29
    def is_job_finished(self, wait_class, ip_address, process_id, retry_left,
                        settings, relative_path_suffix):
        """
            Return True if the package job on instance_id has finished
        """
        # TODO: maybe this should be a reusable library method?
        ip = ip_address
        logger.debug("ip=%s" % ip)
        curr_username = settings['username']
        # settings['username'] = '******'
        #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
        relative_path = settings['type'] + '@' + os.path.join(
            relative_path_suffix, process_id)
        destination = get_url_with_credentials(settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=ip)
        makefile_path = get_make_path(destination)
        ssh = None
        try:
            logger.debug('trying ssh')
            ssh = open_connection(ip_address=ip, settings=settings)
            logger.debug('successful ssh')
            (command_out, errs) = run_make(ssh, makefile_path,
                                           "process_running_done")
            ssh.close()
            logger.debug("command_out2=(%s, %s)" % (command_out, errs))
            if command_out:
                logger.debug("command_out = %s" % command_out)
                for line in command_out:
                    if "stopped" in line:
                        return True
        except Exception as e:

            # Failure detection and then management
            logger.debug('error is = %s' % e)
            process_failed = False
            node_failed = False
            logger.debug('Is there error? %s' %
                         wait_class.failure_detector.failed_ssh_connection(e))
            if wait_class.failure_detector.failed_ssh_connection(e):
                node = [
                    x for x in wait_class.created_nodes if x[1] == ip_address
                ]
                wait_class.failed_processes = wait_class.ftmanager.manage_failed_process(
                    settings, process_id, node[0], node[0][0], ip_address,
                    wait_class.failed_nodes, wait_class.executed_procs,
                    wait_class.current_processes, wait_class.all_processes,
                    wait_class.procs_2b_rescheduled)
                #wait_class.procs_2b_rescheduled.extend(rescheduled_prcs)
                '''
                if wait_class.failure_detector.node_terminated(settings, node[0][0]):
                    if not wait_class.failure_detector.recorded_failed_node(
                            wait_class.failed_nodes, ip_address):
                        wait_class.failed_nodes.append(node[0])
                    node_failed = True
                else:
                    if not retry_left:
                        process_failed = True
                    else:
                        process_lists = [wait_class.executed_procs, wait_class.current_processes,
                                         wait_class.all_processes]
                        wait_class.ftmanager.decrease_max_retry(
                            process_lists, ip_address, process_id)
                # Failure management
                if node_failed or process_failed:
                    process_lists = [wait_class.executed_procs,
                                     wait_class.current_processes, wait_class.all_processes]
                    if node_failed:
                        wait_class.ftmanager.flag_all_processes(process_lists, ip_address)
                    elif process_failed:
                        wait_class.ftmanager.flag_this_process(
                            process_lists, ip_address, process_id)
                    wait_class.failed_processes = wait_class.ftmanager.\
                        get_total_failed_processes(wait_class.executed_procs)
                    if wait_class.reschedule_failed_procs:
                        wait_class.ftmanager.collect_failed_processes(
                            wait_class.executed_procs, wait_class.procs_2b_rescheduled)

                '''
            else:
                raise
Example #30
    def curate_dataset(self, run_settings, experiment_id, base_url, output_url,
                       all_settings):
        '''
            Curates dataset
        '''
        # Retrieves process directories below the current output location
        iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        current_output_url = "%s%s" % (
            output_prefix,
            os.path.join(os.path.join(base_url, "output_%s" % iteration)))
        (scheme, host, current_output_path, location,
         query_settings) = storage.parse_bdpurl(output_url)
        output_fsys = storage.get_filesystem(output_url)
        process_output_dirs, _ = output_fsys.listdir(current_output_path)

        # Curates a dataset with metadata per process
        for i, process_output_dir in enumerate(process_output_dirs):
            # Expand the process output directory and add credentials for access
            process_output_url = '/'.join(
                [current_output_url, process_output_dir])
            process_output_url_with_cred = get_url_with_credentials(
                all_settings, process_output_url, is_relative_path=False)
            # Expand the process output file and add credentials for access
            output_file_url_with_cred = storage.get_url_with_credentials(
                all_settings,
                '/'.join([process_output_url, OUTPUT_FILE]),
                is_relative_path=False)
            try:
                output_content = storage.get_file(output_file_url_with_cred)
                val1, val2 = output_content.split()
            except (IndexError, IOError) as e:
                logger.warn(e)
                continue
            try:
                x = float(val1)
                y = float(val2)
            except (ValueError, IndexError) as e:
                logger.warn(e)
                continue

            # Returns the process id as MyTardis dataset name
            all_settings['graph_point_id'] = str(i)

            def _get_dataset_name(settings, url, path):
                return all_settings['graph_point_id']

            # Creates new dataset and adds to experiment
            # If experiment_id==0, creates new experiment
            experiment_id = mytardis.create_dataset(
                settings=all_settings,  # MyTardis credentials
                source_url=process_output_url_with_cred,
                exp_id=experiment_id,
                dataset_name=_get_dataset_name,  # the function that defines dataset name
                dataset_paramset=[
                    # a new blank parameter set conforming to schema 'remotemake/output'
                    mytardis.create_paramset("remotemake/output", []),
                    mytardis.create_graph_paramset(
                        "dsetgraph",  # name of schema
                        name="randdset",  # a unique dataset name
                        graph_info={},
                        value_dict={
                            "randdset/x": x,
                            "randdset/y": y
                        },  # values to be used in experiment graphs
                        value_keys=[]),
                ])
        return experiment_id