Example #1
def _instantiate_context(source_url, settings, context):

    templ_pat = re.compile("(.*)_template")
    encoded_s_url = storage.get_url_with_credentials(settings,
        source_url, is_relative_path=False)

    logger.debug("encoded_s_url=%s" % encoded_s_url)
    fnames = storage.list_dirs(encoded_s_url, list_files=True)

    logger.debug("fnames=%s" % fnames)
    new_content = {}
    for fname in fnames:
        logger.debug("fname=%s" % fname)
        templ_mat = templ_pat.match(fname)
        if templ_mat:
            base_fname = templ_mat.group(1)
            basename_url_with_pkey = storage.get_url_with_credentials(
                settings,
                os.path.join(
                    source_url,
                    fname),
                is_relative_path=False)
            logger.debug("basename_url_with_pkey=%s" % basename_url_with_pkey)
            cont = storage.get_file(basename_url_with_pkey)
            try:
                t = Template(cont)
            except TemplateSyntaxError as e:
                logger.error(e)
                #FIXME: should detect this during submission of job,
                #as no sensible way to recover here.
                #TODO: signal error conditions in job status
                continue
            con = Context(context)
            logger.debug("context=%s" % context)
            new_content[base_fname] = t.render(con)
        def copy_files_with_pattern(iter_out_fsys, source_path, dest_path,
                                    pattern, all_settings):
            """
            """
            output_prefix = '%s://%s@' % (all_settings['scheme'],
                                          all_settings['type'])

            logger.debug('source_path=%s, dest_path=%s' %
                         (source_path, dest_path))
            # (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(source_path)
            _, node_output_fnames = iter_out_fsys.listdir(source_path)
            ip_address = all_settings['ip_address']
            for f in node_output_fnames:
                if fnmatch.fnmatch(f, pattern):
                    source_url = get_url_with_credentials(
                        all_settings,
                        output_prefix +
                        os.path.join(ip_address, source_path, f),
                        is_relative_path=False)
                    dest_url = get_url_with_credentials(
                        all_settings,
                        output_prefix + os.path.join(ip_address, dest_path, f),
                        is_relative_path=False)
                    logger.debug('source_url=%s, dest_url=%s' %
                                 (source_url, dest_url))
                    content = storage.get_file(source_url)
                    storage.put_file(dest_url, content)
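
A minimal standalone sketch (not from the original codebase) of the fnmatch filtering that copy_files_with_pattern relies on; the file names and pattern below are invented for illustration.

import fnmatch

node_output_fnames = ['grerr01.dat', 'psd.dat', 'OUTCAR', 'grerr02.dat']  # hypothetical listing
pattern = 'grerr*.dat'
# keep only the names matching the shell-style glob, as the copy loop above does
matches = [f for f in node_output_fnames if fnmatch.fnmatch(f, pattern)]
print(matches)  # ['grerr01.dat', 'grerr02.dat']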
Example #3
def _load_values_map(settings, url):
    values = {}
    try:
        enc_url = storage.get_url_with_credentials(
            settings,
            "%s/%s" % (url, VALUES_FNAME))
        logger.debug("values_file=%s" % enc_url)
        values_content = storage.get_file(enc_url)
    except IOError:
        logger.warn("no values file found")
    else:
        logger.debug("values_content = %s" % values_content)
        values = dict(json.loads(values_content))
    return values
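
For illustration, a self-contained sketch of the JSON decoding step that _load_values_map performs once the values file has been fetched; the file content shown here is an assumption, not taken from a real job.

import json

values_content = '{"run_counter": "1", "iteration": "0"}'  # hypothetical values file content
values = dict(json.loads(values_content))
print(values['run_counter'])  # '1'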
Example #4
def get_dataset_name_for_output(settings, url, path):
    logger.debug("path=%s" % path)

    host = settings['host']
    prefix = 'ssh://%s@%s' % (settings['type'], host)
    source_url = storage.get_url_with_credentials(
        settings, os.path.join(prefix, path, VALUES_FNAME),
        is_relative_path=False)
    logger.debug("source_url=%s" % source_url)
    try:
        content = storage.get_file(source_url)
    except IOError as e:
        logger.warn("cannot read file %s" % e)
        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)

                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)

                    source_url = get_url_with_credentials(
                        settings, os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError as e:
                        logger.warn("cannot read file %s" % e)
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
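
A small sketch of the fallback naming used above when the values file cannot be read: the dataset name becomes the last EXP_DATASET_NAME_SPLIT components of the output path. EXP_DATASET_NAME_SPLIT is assumed to be 2 here, matching the value shown in the commented-out code near the end of this listing; the sample path is invented.

import os

EXP_DATASET_NAME_SPLIT = 2  # assumed value; defined elsewhere in the codebase
path = 'myuser/hrmc_job/output_1/node_3'  # hypothetical POSIX-style output path
dataset_name = str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
print(dataset_name)  # output_1/node_3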
    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
        all_settings):

        OUTCAR_FILE = "OUTCAR"
        VALUES_FILE = "values"

        logger.debug("output_url=%s" % output_url)

        outcar_url = storage.get_url_with_credentials(all_settings,
            os.path.join(output_url, OUTCAR_FILE), is_relative_path=False)
        logger.debug("outcar_url=%s" % outcar_url)

        try:
            outcar_content = storage.get_file(outcar_url)
        except IOError as e:
            logger.error(e)
            toten = None
        def copy_files_with_pattern(iter_out_fsys, source_path,
                                 dest_path, pattern, all_settings):
            """
            """
            output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])

            logger.debug('source_path=%s, dest_path=%s' % (source_path, dest_path))
            # (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(source_path)
            _, node_output_fnames = iter_out_fsys.listdir(source_path)
            ip_address = all_settings['ip_address']
            for f in node_output_fnames:
                if fnmatch.fnmatch(f, pattern):
                    source_url = get_url_with_credentials(all_settings, output_prefix + os.path.join(ip_address, source_path, f), is_relative_path=False)
                    dest_url = get_url_with_credentials(all_settings, output_prefix + os.path.join(ip_address, dest_path, f), is_relative_path=False)
                    logger.debug('source_url=%s, dest_url=%s' % (source_url, dest_url))
                    content = storage.get_file(source_url)
                    storage.put_file(dest_url, content)
Example #8
        def _get_dataset_name_for_input(settings, url, path):
            logger.debug("path=%s" % path)
            source_url = get_url_with_credentials(
                output_storage_settings,
                output_prefix + os.path.join(output_host, path, self.VALUES_FNAME),
                is_relative_path=False)
            logger.debug("source_url=%s" % source_url)
            try:
                content = get_file(source_url)
            except IOError:
                return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

            logger.debug("content=%s" % content)
            try:
                values_map = dict(json.loads(str(content)))
            except Exception as e:
                logger.warn("cannot load %s: %s" % (content, e))
                return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
Example #9
def generate_rands(settings, start_range, end_range, num_required,
                   start_index):
    # FIXME: there must be a third-party library that does this more
    # effectively.
    rand_nums = []
    num_url = get_url_with_credentials(settings,
                                       settings['random_numbers'],
                                       is_relative_path=False)
    random_content = get_file(num_url)
    # FIXME: this loads the entire file, which could be very large.
    numbers = random_content.split('\n')

    random_counter = start_index
    # FIXME: better handled with separate function
    if end_range < start_range:
        # special case: generate rands in the range [0, number of rands in the file)
        start_range = 0
        end_range = len(numbers)

    for i in range(0, num_required):

        raw_num = float(numbers[random_counter])
        num = int((raw_num * float(end_range - start_range)) + start_range)

        rand_nums.append(num)
        logger.debug("[0,1) %s -> [%s,%s) %s" %
                     (raw_num, start_range, end_range, num))

        random_counter += 1
        if random_counter >= len(numbers):
            random_counter = 0

    # for i, line in enumerate(random_content.split('\n')):
    #     if start_index <= i < (start_index + num_required):
    #         raw_num = float(line)
    #         num = int((raw_num * float(end_range - start_range)) + start_range)
    #         logger.debug("[0,1) %s -> [%s,%s) %s" % (raw_num, start_range, end_range, num))
    #         rand_nums.append(num)

    logger.debug(
        "Generated %s random numbers from %s in range [%s, %s): %s " %
        (num_required, num_url, start_range, end_range, pformat(rand_nums)))
    return rand_nums
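
A minimal sketch of the scaling step in generate_rands: each raw value in [0, 1) read from the random-numbers file is mapped to an integer in [start_range, end_range). The sample values are made up.

start_range, end_range = 10, 20  # hypothetical target range
for raw_num in (0.0, 0.25, 0.9999):  # values as they might appear in the file
    num = int((raw_num * float(end_range - start_range)) + start_range)
    print("[0,1) %s -> [%s,%s) %s" % (raw_num, start_range, end_range, num))  # 10, 12, 19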
Example #10
    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
        all_settings):

        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)

        logger.debug("output_url=%s" % output_url)
        output_settings = self.get_platform_settings(run_settings, RMIT_SCHEMA + '/platform/storage/output')
        current_output_url = "%s://%s@%s/%s" % (scheme, output_settings['type'], host,
            os.path.join(mypath, '1/'))
        logger.debug('current-dest=%s' % current_output_url)
        outcar_url = storage.get_url_with_credentials(
            output_settings, current_output_url + self.OUTCAR_FILE, is_relative_path=False)
        logger.debug("outcar_url=%s" % outcar_url)

        try:
            outcar_content = storage.get_file(outcar_url)
        except IOError as e:
            logger.error(e)
            toten = None
Example #11
def generate_rands(settings, start_range, end_range, num_required, start_index):
    # FIXME: there must be a third-party library that does this more
    # effectively.
    rand_nums = []
    num_url = get_url_with_credentials(settings, settings['random_numbers'],
        is_relative_path=False)
    random_content = get_file(num_url)
    # FIXME: this loads the entire file, which could be very large.
    numbers = random_content.split('\n')

    random_counter = start_index
    # FIXME: better handled with separate function
    if end_range < start_range:
        # special case: generate rands in the range [0, number of rands in the file)
        start_range = 0
        end_range = len(numbers)

    for i in range(0, num_required):

        raw_num = float(numbers[random_counter])
        num = int((raw_num * float(end_range - start_range)) + start_range)

        rand_nums.append(num)
        logger.debug("[0,1) %s -> [%s,%s) %s" % (raw_num, start_range, end_range, num))

        random_counter += 1
        if random_counter >= len(numbers):
            random_counter = 0

    # for i, line in enumerate(random_content.split('\n')):
    #     if start_index <= i < (start_index + num_required):
    #         raw_num = float(line)
    #         num = int((raw_num * float(end_range - start_range)) + start_range)
    #         logger.debug("[0,1) %s -> [%s,%s) %s" % (raw_num, start_range, end_range, num))
    #         rand_nums.append(num)

    logger.debug("Generated %s random numbers from %s in range [%s, %s): %s "
        % (num_required, num_url, start_range, end_range, pformat(rand_nums)))
    return rand_nums
 def compute_hrmc_criterion(self, number, node_output_dir, fs, output_storage_settings):
     output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                 output_storage_settings['type'])
     grerr_file = 'grerr%s.dat' % str(number).zfill(2)
     logger.debug("grerr_file=%s " % grerr_file)
     grerr_url = get_url_with_credentials(
         output_storage_settings,
                     output_prefix + os.path.join(self.output_dir,
                         node_output_dir, 'grerr%s.dat' % str(number).zfill(2)), is_relative_path=False)
     grerr_content = storage.get_file(grerr_url)  # FIXME: check that get_file can raise IOError
     if not grerr_content:
         logger.warn("no gerr content found")
     logger.debug("grerr_content=%s" % grerr_content)
     try:
         criterion = float(grerr_content.strip().split('\n')[-1]
         .split()[1])
     except ValueError as e:
         logger.warn("invalid criteron found in grerr "
                     + "file for  %s/%s: %s"
                     % (self.output_dir, node_output_dir, e))
     logger.debug("criterion=%s" % criterion)
     return criterion
 def compute_hrmc_criterion(self, number, node_output_dir, fs,
                            output_storage_settings):
     output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                   output_storage_settings['type'])
     grerr_file = 'grerr%s.dat' % str(number).zfill(2)
     logger.debug("grerr_file=%s " % grerr_file)
     grerr_url = get_url_with_credentials(
         output_storage_settings,
         output_prefix + os.path.join(self.output_dir, node_output_dir,
                                      'grerr%s.dat' % str(number).zfill(2)),
         is_relative_path=False)
     grerr_content = storage.get_file(
         grerr_url)  # FIXME: check that get_file can raise IOError
     if not grerr_content:
         logger.warn("no gerr content found")
     logger.debug("grerr_content=%s" % grerr_content)
     try:
         criterion = float(grerr_content.strip().split('\n')[-1].split()[1])
     except ValueError as e:
         logger.warn("invalid criteron found in grerr " +
                     "file for  %s/%s: %s" %
                     (self.output_dir, node_output_dir, e))
     logger.debug("criterion=%s" % criterion)
     return criterion
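
A short sketch of the criterion extraction performed above: the criterion is read as the second whitespace-separated column on the last line of a grerr file. The file content is invented for illustration.

grerr_content = "1 0.50312\n2 0.41277\n3 0.38455\n"  # hypothetical grerr file content
criterion = float(grerr_content.strip().split('\n')[-1].split()[1])
print(criterion)  # 0.38455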
Example #14
                    logger.debug("found")
                    try:
                        toten = float(line.rsplit(' ', 2)[-2])
                    except ValueError as e:
                        logger.error(e)
                        pass
                    break

        logger.debug("toten=%s" % toten)

        values_url = storage.get_url_with_credentials(all_settings,
            '%s%s' % (current_output_url, self.VALUES_FNAME), is_relative_path=False)
        logger.debug("values_url=%s" % values_url)

        try:
            values_content = storage.get_file(values_url)
        except IOError as e:
            logger.error(e)
            values = None
        else:
            values = None
            try:
                values = dict(json.loads(values_content))
            except Exception as e:
                logger.error(e)
                pass
        logger.debug("values=%s" % values)

        # FIXME: all values from map are strings initially, so need to know
        # type to coerce.
        num_kp = None
Example #15
    def _upload_input_dir_variations(self, processes, local_settings,
                                     computation_platform_settings,
                                     output_storage_settings,
                                     mytardis_settings,
                                     input_dir, run_settings):
        output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                      output_storage_settings['type'])
        input_url_with_credentials = get_url_with_credentials(
            output_storage_settings, output_prefix + os.path.join(
                self.iter_inputdir, input_dir),
            is_relative_path=False)
        logger.debug('input_url_with_credentials=%s' %
                     input_url_with_credentials)
        if local_settings['curate_data']:

            try:
                mytardis_platform = jobs.safe_import('chiminey.platform.mytardis.MyTardisPlatform', [], {})
                self.experiment_id = mytardis_platform.create_dataset_for_input(self.experiment_id,
                                                      run_settings, local_settings,
                                                      output_storage_settings,
                                                      mytardis_settings,
                                                      input_url_with_credentials)
            except ImproperlyConfigured as e:
                logger.error("Cannot load mytardis platform hook %s" % e)

        else:
            logger.warn('Data curation is off')

        # get run Map
        parent_stage = self.import_parent_stage(run_settings)
        run_map, self.rand_index = parent_stage.get_internal_sweep_map(local_settings,
                                                                       run_settings=run_settings)

        # load value_map
        values_url_with_pkey = get_url_with_credentials(
            output_storage_settings,
            output_prefix + os.path.join(self.iter_inputdir,
                                         input_dir,
                                         self.VALUES_FNAME),
            is_relative_path=False)
        logger.debug("initial values_file=%s" % values_url_with_pkey)
        values = {}
        try:
            values_content = storage.get_file(values_url_with_pkey)
        except IOError:
            logger.warn("no values file found")
        else:
            logger.debug("values_content = %s" % values_content)
            values = dict(json.loads(values_content))
        logger.debug("values=%s" % values)

        # generates a set of variations for the template fname
        logger.debug('self.initial_numbfile = %s ' % self.initial_numbfile)
        contexts = self._get_variation_contexts(
            [run_map], values,  self.initial_numbfile)
        self.initial_numbfile += len(contexts)
        logger.debug('contexts = %s ' % contexts)
        logger.debug('self.initial_numbfile = %s ' % self.initial_numbfile)

        # for each context, copy each file to dest and any
        # templates to be instantiated, then store in values.

        template_pat = re.compile("(.*)_template")
        relative_path_suffix = self.get_relative_output_path(local_settings)

        for context in contexts:
            logger.debug("context=%s" % context)
            # get list of all files in input_dir
            fname_url_with_pkey = get_url_with_credentials(
                output_storage_settings,
                output_prefix + os.path.join(self.iter_inputdir, input_dir),
                is_relative_path=False)
            input_files = storage.list_dirs(fname_url_with_pkey,
                                            list_files=True)

            # get process information
            run_counter = context['run_counter']
            logger.debug("run_counter=%s" % run_counter)
            proc = None
            for p in processes:
                # TODO: how to handle invalid run_counter
                pid = int(p['id'])
                logger.debug("pid=%s" % pid)
                if pid == run_counter:
                    proc = p
                    break
            else:
                logger.error("no process found matching run_counter")
                raise BadInputException()
            logger.debug("proc=%s" % pformat(proc))

            for fname in input_files:
                logger.debug("fname=%s" % fname)
                templ_mat = template_pat.match(fname)
                fname_url_with_credentials = storage.get_url_with_credentials(
                    output_storage_settings,
                    output_prefix +
                    os.path.join(self.iter_inputdir, input_dir, fname),
                    is_relative_path=False)
                logger.debug("fname_url_with_credentials=%s" %
                             fname_url_with_credentials)

                def put_dest_file(proc, fname,
                                  dest_file_location, resched_file_location,
                                  content):
                    dest_url = get_url_with_credentials(
                        computation_platform_settings, os.path.join(
                            dest_file_location, fname),
                        is_relative_path=True, ip_address=proc['ip_address'])
                    logger.debug("writing to =%s" % dest_url)
                    #logger.debug("content=%s" % content)
                    storage.put_file(dest_url, content)
                    if self.reschedule_failed_procs:
                        logger.debug("resched=%s" % resched_file_location)
                        logger.debug("fname=%s" % fname)
                        logger.debug("output_storage_settings=%s" %
                                     output_storage_settings)

                        logger.debug("here")
                        test = "%s/%s" % (resched_file_location, fname)
                        logger.debug("test=%s" % test)
                        resched_url = get_url_with_credentials(
                            output_storage_settings, test)
                        logger.debug("writing backup to %s" % resched_url)
                        storage.put_file(resched_url, content)
                    logger.debug("done")

                outputs = []
                if templ_mat:
                    base_fname = templ_mat.group(1)
                    template_content = storage.get_file(
                        fname_url_with_credentials)
                    try:
                        templ = Template(template_content)
                    except TemplateSyntaxError as e:
                        logger.error(e)
                        # FIXME: should detect this during submission of job,
                        # as no sensible way to recover here.
                        # TODO: signal error conditions in job status
                        continue
                    new_context = Context(context)
                    logger.debug("new_content=%s" % new_context)
                    render_output = templ.render(new_context)
                    render_output = render_output.encode('utf-8')
                    outputs.append((base_fname, render_output))
                    outputs.append((fname, template_content))

                else:
                    content = storage.get_file(fname_url_with_credentials)
                    outputs.append((fname, content))

                for (new_fname, content) in outputs:
                    dest_file_location = computation_platform_settings['type']\
                        + "@" + os.path.join(relative_path_suffix,
                                             proc['id'],
                                             local_settings['smart_connector_input'])
                    logger.debug("dest_file_location =%s" % dest_file_location)
                    resched_file_location = "%s%s" % (output_prefix, os.path.join(
                        self.job_dir, "input_backup", proc['id']))

                    logger.debug("resched_file_location=%s" %
                                 resched_file_location)
                    put_dest_file(proc, new_fname, dest_file_location,
                                  resched_file_location, content)

            # then copy context new values file
            logger.debug("writing values file")
            values_dest_location = computation_platform_settings['type']\
                + "@" + os.path.join(relative_path_suffix,
                                     proc['id'],
                                     local_settings['smart_connector_input'],
                                     self.VALUES_FNAME)
            logger.debug("values_dest_location =%s" % values_dest_location)

            values_dest_url = get_url_with_credentials(
                computation_platform_settings, values_dest_location,
                is_relative_path=True, ip_address=proc['ip_address'])

            storage.put_file(values_dest_url, json.dumps(context, indent=4))
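
For clarity, a sketch of how the '<type>@<relative path>' destination strings passed to put_dest_file above are assembled; all settings values here are assumptions for illustration.

import os

computation_platform_settings = {'type': 'ssh'}  # assumed value
relative_path_suffix = 'myjob/iteration_1'       # assumed value
proc = {'id': '3'}                               # assumed value
smart_connector_input = 'input_0'                # stands in for local_settings['smart_connector_input']
dest_file_location = computation_platform_settings['type'] \
    + "@" + os.path.join(relative_path_suffix, proc['id'], smart_connector_input)
print(dest_file_location)  # ssh@myjob/iteration_1/3/input_0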
Example #16
    def build_metadata_for_final_output(self, m, output_dir, **kwargs):
        #FIXME: this calculation should be done as in extract_psd_func
        # pulling directly from data_errors rather than passing in
        # through nested function.
        experiment_paramset = []
        dataset_paramset = []
        datafile_paramset = []
        dfile_extract_func = {}

        exp_value_keys = []
        legends = []
        for m, current_dir in enumerate(kwargs['output_dirs']):
            #node_path = os.path.join(iter_output_dir, node_dir)

            exp_value_keys.append(["hrmcdset%s/step" % m, "hrmcdset%s/err" % m])

            source_url = storage.get_url_with_credentials(\
            kwargs['storage_settings'], current_dir, is_relative_path=False)

            (source_scheme, source_location, source_path, source_location,
                query_settings) = storage.parse_bdpurl(source_url)
            logger.debug("source_url=%s" % source_url)
            legends.append(
                mytardis.get_dataset_name_for_output(
                    kwargs['storage_settings'], "", source_path))

        logger.debug("exp_value_keys=%s" % exp_value_keys)
        logger.debug("legends=%s" % legends)




        # for m, output_dir in enumerate(kwargs['output_dirs']):
        #node_path = os.path.join(iter_output_dir, output_dir)
        node_path = output_dir
        logger.debug("node_path=%s" % node_path)

        dataerrors_url = storage.get_url_with_credentials(kwargs['storage_settings'],
            os.path.join(node_path, self.DATA_ERRORS_FILE),
            is_relative_path=False)
        logger.debug("dataerrors_url=%s" % dataerrors_url)
        dataerrors_content = storage.get_file(dataerrors_url)
        xs = []
        ys = []
        re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
        for i, line in enumerate(dataerrors_content.splitlines()):
            if i == 0:
                continue
            columns = line.split()
            try:
                hrmc_step = int(columns[self.STEP_COLUMN_NUM])
            except ValueError:
                logger.warn("could not parse hrmc_step value on line %s" % i)
                continue
            # handle Fortran-style double-precision float format
            val = columns[self.ERRGR_COLUMN_NUM]
            val = re_dbl_fort.sub(r'\1E\2', val)
            logger.debug("val=%s" % val)
            try:
                hrmc_errgr = float(val)
            except ValueError:
                logger.warn("could not parse hrmc_errgr value on line %s" % i)
                continue
            xs.append(hrmc_step)
            ys.append(hrmc_errgr)

        logger.debug("xs=%s" % xs)
        logger.debug("ys=%s" % ys)

        crit_url = storage.get_url_with_credentials(kwargs['storage_settings'],
            os.path.join(node_path, "criterion.txt"), is_relative_path=False)
        try:
            crit = storage.get_file(crit_url)
        except ValueError:
            crit = None
        except IOError:
            crit = None
        # FIXME: can crit be zero?
        logger.debug("crit=%s" % crit)
        if crit:
            system_id = int(getval(kwargs['run_settings'], '%s/system/id' % \
            django_settings.SCHEMA_PREFIX))
            hrmcdset_val = {"hrmcdset/it": system_id, "hrmcdset/crit": crit}
        else:
            hrmcdset_val = {}

        # TODO: move into utility function for reuse
        def extract_psd_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(dataerrors_content.splitlines()):
                if i == 0:
                    continue
                columns = line.split()

                val = columns[self.STEP_COLUMN_NUM]
                val = re_dbl_fort.sub(r'\1E\2', val)
                logger.debug("val=%s" % val)
                try:
                    x = float(val)
                except ValueError:
                    logger.warn("could not parse value on line %s" % i)
                    continue

                val = columns[self.ERRGR_COLUMN_NUM]
                val = re_dbl_fort.sub(r'\1E\2', val)
                logger.debug("val=%s" % val)
                try:
                    y = float(val)
                except ValueError:
                    logger.warn("could not parse value on line %s" % i)
                    continue

                xs.append(x)
                ys.append(y)
            res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
            return res

        def extract_psdexp_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
            return res

        def extract_grfinal_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            #FIXME: len(xs) == len(ys) for this to work.
            #TODO: hack to handle when xs and ys are too
            # large to fit in Parameter with db_index.
            # solved by function call at destination
            cut_xs = [xs[i] for i, x in enumerate(xs)
                if (i % (len(xs) / 50) == 0)]
            cut_ys = [ys[i] for i, x in enumerate(ys)
                if (i % (len(ys) / 50) == 0)]

            res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
            return res

        def extract_inputgr_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            #FIXME: len(xs) == len(ys) for this to work.
            #TODO: hack to handle when xs and ys are too
            # large to fit in Parameter with db_index.
            # solved by function call at destination
            cut_xs = [xs[i] for i, x in enumerate(xs)
                if (i % (len(xs) / 50) == 0)]
            cut_ys = [ys[i] for i, x in enumerate(ys)
                if (i % (len(ys) / 50) == 0)]

            res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
            return res
        # TODO: replace self.boto_settings with mytardis_settings


        # Only save graph paramset for experiment once per experiment.
        if not self.final_graph_paramset:
            self.final_graph_paramset = [mytardis.create_graph_paramset("expgraph",
                name="hrmcexp2",
                graph_info={"axes": ["step", "ERRGr*wf"], "precision": [0, 2], "legends": legends},
                value_dict={},
                value_keys=exp_value_keys)]

            experiment_paramset = self.final_graph_paramset
        else:
            experiment_paramset = []

        dataset_paramset = [
            mytardis.create_paramset('hrmcdataset/output', []),
            mytardis.create_graph_paramset('dsetgraph',
                name="hrmcdset",
                graph_info={"axes":["r (Angstroms)", "PSD"],
                    "legends":["psd", "PSD_exp"],  "type":"line"},
                value_dict=hrmcdset_val,
                value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                    ["hrmcdfile/r2", "hrmcdfile/g2"]]),
            mytardis.create_graph_paramset('dsetgraph',
                name='hrmcdset2',
                graph_info={"axes":["r (Angstroms)", "g(r)"],
                    "legends":["data_grfinal", "input_gr"],
                    "type":"line"},
                value_dict={},
                value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                    ["hrmcdfile/r4", "hrmcdfile/g4"]]),
            mytardis.create_graph_paramset('dsetgraph',
                name='hrmcdset%s' % m,
                graph_info={},
                value_dict={"hrmcdset%s/step" % m: xs,
                    "hrmcdset%s/err" % m: ys},
                value_keys=[]),
            ]
        datafile_paramset = [
            mytardis.create_graph_paramset('dfilegraph',
                name="hrmcdfile",
                graph_info={},
                value_dict={},
                value_keys=[])
            ]
        dfile_extract_func = {
            'psd.dat': extract_psd_func,
            'PSD_exp.dat': extract_psdexp_func,
            'data_grfinal.dat': extract_grfinal_func,
            'input_gr.dat': extract_inputgr_func}
        logger.debug("experiment_paramset=%s" % experiment_paramset)
        logger.debug("dataset_paramset=%s" % dataset_paramset)
        logger.debug("datafile_paramset=%s" % datafile_paramset)
        logger.debug("dfile_extract_func=%s" % dfile_extract_func)

        return (experiment_paramset, dataset_paramset, datafile_paramset, dfile_extract_func)
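
A standalone sketch of the re_dbl_fort substitution used above, which rewrites Fortran-style double-precision literals so that float() can parse them; the sample strings are made up.

import re

re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
for val in ('0.3127D-02', '1.5d+01', '0.25'):
    converted = re_dbl_fort.sub(r'\1E\2', val)  # e.g. 0.3127D-02 -> 0.3127E-02
    print("%s -> %s" % (val, float(converted)))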
Example #17
    def generate_variations(self, input_dir, local_settings, output_storage_settings, run_settings):
        """
        For each templated file in input_dir, generate all variations
        """
        output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                    output_storage_settings['type'])
        template_pat = re.compile("(.*)_template")
        fname_url_with_pkey = get_url_with_credentials(
            output_storage_settings,
            output_prefix + os.path.join(self.iter_inputdir, input_dir),
            is_relative_path=False)
        input_files = storage.list_dirs(fname_url_with_pkey,
            list_files=True)

        variations = {}
        # TODO: only tested with single template file per input
        parent_stage = self.import_parent_stage(run_settings)

        for fname in input_files:
            logger.debug("trying %s/%s/%s" % (self.iter_inputdir, input_dir,
                                              fname))
            template_mat = template_pat.match(fname)
            if template_mat:
                # get the template
                basename_url_with_pkey = get_url_with_credentials(
                    output_storage_settings,
                    output_prefix + os.path.join(self.iter_inputdir, input_dir, fname),
                    is_relative_path=False)
                template = storage.get_file(basename_url_with_pkey)
                base_fname = template_mat.group(1)
                logger.debug("base_fname=%s" % base_fname)

                # find associated values file and generator_counter
                values_map = {}
                try:
                    values_url_with_pkey = get_url_with_credentials(
                        output_storage_settings,
                        output_prefix + os.path.join(self.iter_inputdir,
                            input_dir,
                            '%s_values' % base_fname),
                        is_relative_path=False)

                    logger.debug("values_file=%s" % values_url_with_pkey)
                    values_content = storage.get_file(values_url_with_pkey)
                except IOError:
                    logger.warn("no values file found")
                else:
                    logger.debug("values_content = %s" % values_content)
                    values_map = dict(json.loads(values_content))

                    # TODO: rather than loading up specific vars for info
                    # to send to next set of variations, pass whole values_map
                    # and then override with map.  This means we need no
                    # special variables here, could easily propagate values
                    # between iterations and we might also pass a list
                    # of values...

                map, self.rand_index = parent_stage.get_run_map(local_settings,
                                       run_settings=run_settings)

                if not template_mat.groups():
                    logger.info("found odd template matching file %s" % fname)
                else:

                    logger.debug("self.initial_numbfile=%s" % self.initial_numbfile)
                    # generates a set of variations for the template fname
                    variation_set = self._expand_variations(template,
                                                            [map], values_map,  self.initial_numbfile)
                    self.initial_numbfile += len(variation_set)
                    logger.debug('variation_set=%d' % len(variation_set))
                    logger.debug("self.initial_numbfile=%s" % self.initial_numbfile)
                    variations[base_fname] = variation_set
                logger.debug("map=%s" % map)
        else:
            # normal file
            pass
        logger.debug('Variations %s' % variations)
        logger.debug("Variations items %d" % len(variations.items()))
        return variations
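
A small sketch of the filename convention that generate_variations keys on: files named '<base>_template' are treated as templates and the captured base name becomes the output file, while anything else is passed through unchanged. The example filenames are invented.

import re

template_pat = re.compile("(.*)_template")
for fname in ('HRMC.inp_template', 'HRMC.inp_values', 'input_gr.dat'):
    mat = template_pat.match(fname)
    if mat:
        print("%s -> template for %s" % (fname, mat.group(1)))
    else:
        print("%s -> normal file" % fname)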
Example #18
    def upload_variation_inputs(self, run_settings, local_settings, variations, processes,
                                 input_dir, output_storage_settings,
                                 computation_platform_settings, mytardis_settings):
        '''
        Create input packages for each variation and upload them to the VMs
        '''
        logger.debug("upload_variation_inputs")
        output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                    output_storage_settings['type'])
        source_files_url = get_url_with_credentials(
            output_storage_settings, output_prefix + os.path.join(
                self.iter_inputdir, input_dir),
            is_relative_path=False)

        logger.debug('source_files_url=%s' % source_files_url)
        # Copy input directory to mytardis only after saving locally, so if
        # something goes wrong we still have the results
        if local_settings['curate_data']:
            self.experiment_id = self.curate_data(self.experiment_id, local_settings, output_storage_settings,
                             mytardis_settings, source_files_url)
        else:
            logger.warn('Data curation is off')
        #proc_ind = 0
        for var_fname in variations.keys():
            logger.debug("var_fname=%s" % var_fname)
            logger.debug('variations[var_fname]=%s' % variations[var_fname])
            for var_content, values in variations[var_fname]:
                #logger.debug("var_content = %s" % var_content)
                #logger.debug('proc_ind=%s' % proc_ind)
                logger.debug('processes=%s' % processes)
                run_counter = values['run_counter']
                logger.debug("run_counter=%s" % run_counter)
                proc = None
                for p in processes:
                    # TODO: how to handle invalid run_counter
                    pid = int(p['id'])
                    logger.debug("pid=%s" % pid)
                    if pid == run_counter:
                        proc = p
                        break
                else:
                    logger.error("no process found matching run_counter")
                    #smartconnectorscheduler.error(run_settings, "%s: wait" % (self.id + 1))
                    # TODO: catch this error and recover
                    raise BadInputException()

                logger.debug("proc=%s" % pformat(proc))

                #proc = processes[proc_ind]
                #proc_ind += 1
                #ip = botocloudconnector.get_instance_ip(var_node.id, local_settings)
                ip = proc['ip_address']

                #dest_files_location = computation_platform_settings['type'] + "@"\
                #                      + os.path.join(local_settings['payload_destination'],
                #                                     proc['id'],
                #                                     local_settings['payload_cloud_dirname']
                #                                     )
                relative_path_suffix = self.get_relative_output_path(local_settings)
                dest_files_location = computation_platform_settings['type'] + "@"\
                                      + os.path.join(relative_path_suffix,
                                                     proc['id'],
                                                     local_settings['payload_cloud_dirname']
                                                     )

                logger.debug('dest_files_location=%s' % dest_files_location)

                dest_files_url = get_url_with_credentials(
                    computation_platform_settings, dest_files_location,
                    is_relative_path=True, ip_address=ip)
                logger.debug('dest_files_url=%s' % dest_files_url)

                # FIXME: Cleanup any existing runs already there
                # FIXME: keep the compile exec from setup
                #FIXME: exceptions should be given as parameter
                # FIXME: we should not delete any file, since each process runs in its own directory
                exceptions = [local_settings['compile_file'], "..", ".",
                              'PSD', 'PSD.f', 'PSD_exp.dat', 'PSD.inp',
                              'Makefile', 'running.sh',
                              'process_scheduledone.sh', 'process_schedulestart.sh']
                storage.copy_directories(source_files_url, dest_files_url)

                if self.reschedule_failed_procs:
                    input_backup = os.path.join(self.job_dir, "input_backup", proc['id'])
                    backup_url = get_url_with_credentials(
                        output_storage_settings,
                        output_prefix + input_backup, is_relative_path=False)
                    storage.copy_directories(source_files_url, backup_url)

                # Why do we need to create a temporary file to make this copy?
                import uuid
                randsuffix = unicode(uuid.uuid4())  # should use some job id here

                var_url = get_url_with_credentials(local_settings, os.path.join("tmp%s" % randsuffix, "var"),
                    is_relative_path=True)
                logger.debug("var_url=%s" % var_url)
                storage.put_file(var_url, var_content.encode('utf-8'))

                value_url = get_url_with_credentials(local_settings, os.path.join("tmp%s" % randsuffix, "value"),
                    is_relative_path=True)
                logger.debug("value_url=%s" % value_url)
                storage.put_file(value_url, json.dumps(values))

                #local_settings['platform'] should be replaced
                # and overwrite on the remote
                #var_fname_remote = computation_platform_settings['type']\
                #    + "@" + os.path.join(local_settings['payload_destination'],
                #                         proc['id'],
                #                         local_settings['payload_cloud_dirname'],
                #                         var_fname)
                var_fname_remote = computation_platform_settings['type']\
                    + "@" + os.path.join(relative_path_suffix,
                                         proc['id'],
                                         local_settings['payload_cloud_dirname'],
                                         var_fname)

                var_fname_pkey = get_url_with_credentials(
                    computation_platform_settings, var_fname_remote,
                    is_relative_path=True, ip_address=ip)
                var_content = storage.get_file(var_url)
                storage.put_file(var_fname_pkey, var_content)

                logger.debug("var_fname_pkey=%s" % var_fname_pkey)
                values_fname_pkey = get_url_with_credentials(
                    computation_platform_settings,
                    os.path.join(dest_files_location,
                                 "%s_values" % var_fname),
                    is_relative_path=True, ip_address=ip)
                values_content = storage.get_file(value_url)
                storage.put_file(values_fname_pkey, values_content)
                logger.debug("values_fname_pkey=%s" % values_fname_pkey)

                #copying values and var_content to backup folder
                if self.reschedule_failed_procs:
                    value_url = get_url_with_credentials(
                        output_storage_settings,
                        output_prefix + os.path.join(input_backup, "%s_values" % var_fname),
                        is_relative_path=False)
                    logger.debug("value_url=%s" % value_url)
                    storage.put_file(value_url, json.dumps(values))

                    var_fname_pkey = get_url_with_credentials(
                        output_storage_settings,
                        output_prefix + os.path.join(input_backup, var_fname),
                        is_relative_path=False)
                    var_content = storage.get_file(var_url)
                    storage.put_file(var_fname_pkey, var_content)

                # cleanup
                tmp_url = get_url_with_credentials(local_settings, os.path.join("tmp%s" % randsuffix),
                    is_relative_path=True)
                logger.debug("deleting %s" % tmp_url)
Example #19
        if local_settings['curate_data']:
            if mytardis_settings['mytardis_host']:

                if directive == "vasp":

                    # TODO: this is very domain specific

                    OUTCAR_FILE = "OUTCAR"
                    VALUES_FILE = "values"

                    outcar_url = storage.get_url_with_credentials(local_settings,
                        os.path.join(dest_url, OUTCAR_FILE), is_relative_path=False)
                    logger.debug("outcar_url=%s" % outcar_url)

                    try:
                        outcar_content = storage.get_file(outcar_url)
                    except IOError as e:
                        logger.error(e)
                        toten = None
                    else:
                        toten = None
                        for line in outcar_content.split('\n'):
                            #logger.debug("line=%s" % line)
                            if 'e  en' in line:
                                logger.debug("found")
                                try:
                                    toten = float(line.rsplit(' ', 2)[-2])
                                except ValueError as e:
                                    logger.error(e)
                                    pass
                                break
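
A sketch of the OUTCAR energy extraction above, using an invented excerpt; the 'e  en' substring test and the rsplit slicing mirror the snippet.

outcar_content = "  free  energy   TOTEN  =      -123.45678 eV\n"  # hypothetical OUTCAR line
toten = None
for line in outcar_content.split('\n'):
    if 'e  en' in line:
        toten = float(line.rsplit(' ', 2)[-2])
        break
print(toten)  # -123.45678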
Example #20
    def build_metadata_for_final_output(self, m, output_dir, **kwargs):
        #FIXME: this calculation should be done as in extract_psd_func
        # pulling directly from data_errors rather than passing in
        # through nested function.
        experiment_paramset = []
        dataset_paramset = []
        datafile_paramset = []
        dfile_extract_func = {}

        exp_value_keys = []
        legends = []
        for m, current_dir in enumerate(kwargs['output_dirs']):
            #node_path = os.path.join(iter_output_dir, node_dir)

            exp_value_keys.append(
                ["hrmcdset%s/step" % m,
                 "hrmcdset%s/err" % m])

            source_url = storage.get_url_with_credentials(\
            kwargs['storage_settings'], current_dir, is_relative_path=False)

            (source_scheme, source_location, source_path, source_location,
             query_settings) = storage.parse_bdpurl(source_url)
            logger.debug("source_url=%s" % source_url)
            legends.append(
                mytardis.get_dataset_name_for_output(
                    kwargs['storage_settings'], "", source_path))

        logger.debug("exp_value_keys=%s" % exp_value_keys)
        logger.debug("legends=%s" % legends)

        # for m, output_dir in enumerate(kwargs['output_dirs']):
        #node_path = os.path.join(iter_output_dir, output_dir)
        node_path = output_dir
        logger.debug("node_path=%s" % node_path)

        dataerrors_url = storage.get_url_with_credentials(
            kwargs['storage_settings'],
            os.path.join(node_path, self.DATA_ERRORS_FILE),
            is_relative_path=False)
        logger.debug("dataerrors_url=%s" % dataerrors_url)
        dataerrors_content = storage.get_file(dataerrors_url)
        xs = []
        ys = []
        re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
        for i, line in enumerate(dataerrors_content.splitlines()):
            if i == 0:
                continue
            columns = line.split()
            try:
                hrmc_step = int(columns[self.STEP_COLUMN_NUM])
            except ValueError:
                logger.warn("could not parse hrmc_step value on line %s" % i)
                continue
            # handle Fortran-style double-precision float format
            val = columns[self.ERRGR_COLUMN_NUM]
            val = re_dbl_fort.sub(r'\1E\2', val)
            logger.debug("val=%s" % val)
            try:
                hrmc_errgr = float(val)
            except ValueError:
                logger.warn("could not parse hrmc_errgr value on line %s" % i)
                continue
            xs.append(hrmc_step)
            ys.append(hrmc_errgr)

        logger.debug("xs=%s" % xs)
        logger.debug("ys=%s" % ys)

        crit_url = storage.get_url_with_credentials(kwargs['storage_settings'],
                                                    os.path.join(
                                                        node_path,
                                                        "criterion.txt"),
                                                    is_relative_path=False)
        try:
            crit = storage.get_file(crit_url)
        except ValueError:
            crit = None
        except IOError:
            crit = None
        # FIXME: can crit be zero?
        logger.debug("crit=%s" % crit)
        if crit:
            system_id = int(getval(kwargs['run_settings'], '%s/system/id' % \
            django_settings.SCHEMA_PREFIX))
            hrmcdset_val = {"hrmcdset/it": system_id, "hrmcdset/crit": crit}
        else:
            hrmcdset_val = {}

        # TODO: move into utility function for reuse
        def extract_psd_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(dataerrors_content.splitlines()):
                if i == 0:
                    continue
                columns = line.split()

                val = columns[self.STEP_COLUMN_NUM]
                val = re_dbl_fort.sub(r'\1E\2', val)
                logger.debug("val=%s" % val)
                try:
                    x = float(val)
                except ValueError:
                    logger.warn("could not parse value on line %s" % i)
                    continue

                val = columns[self.ERRGR_COLUMN_NUM]
                val = re_dbl_fort.sub(r'\1E\2', val)
                logger.debug("val=%s" % val)
                try:
                    y = float(val)
                except ValueError:
                    logger.warn("could not parse value on line %s" % i)
                    continue

                xs.append(x)
                ys.append(y)
            res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
            return res

        def extract_psdexp_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
            return res

        def extract_grfinal_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            #FIXME: len(xs) == len(ys) for this to work.
            #TODO: hack to handle when xs and ys are too
            # large to fit in Parameter with db_index.
            # solved by function call at destination
            cut_xs = [
                xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 50) == 0)
            ]
            cut_ys = [
                ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 50) == 0)
            ]

            res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
            return res

        def extract_inputgr_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            #FIXME: len(xs) == len(ys) for this to work.
            #TODO: hack to handle when xs and ys are too
            # large to fit in Parameter with db_index.
            # solved by function call at destination
            cut_xs = [
                xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 50) == 0)
            ]
            cut_ys = [
                ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 50) == 0)
            ]

            res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
            return res

        # TODO: replace self.boto_settings with mytardis_settings

        # Only save graph paramset for experiment once per experiment.
        if not self.final_graph_paramset:
            self.final_graph_paramset = [
                mytardis.create_graph_paramset("expgraph",
                                               name="hrmcexp2",
                                               graph_info={
                                                   "axes":
                                                   ["step", "ERRGr*wf"],
                                                   "precision": [0, 2],
                                                   "legends": legends
                                               },
                                               value_dict={},
                                               value_keys=exp_value_keys)
            ]

            experiment_paramset = self.final_graph_paramset
        else:
            experiment_paramset = []

        dataset_paramset = [
            mytardis.create_paramset('hrmcdataset/output', []),
            mytardis.create_graph_paramset(
                'dsetgraph',
                name="hrmcdset",
                graph_info={
                    "axes": ["r (Angstroms)", "PSD"],
                    "legends": ["psd", "PSD_exp"],
                    "type": "line"
                },
                value_dict=hrmcdset_val,
                value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                            ["hrmcdfile/r2", "hrmcdfile/g2"]]),
            mytardis.create_graph_paramset(
                'dsetgraph',
                name='hrmcdset2',
                graph_info={
                    "axes": ["r (Angstroms)", "g(r)"],
                    "legends": ["data_grfinal", "input_gr"],
                    "type": "line"
                },
                value_dict={},
                value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                            ["hrmcdfile/r4", "hrmcdfile/g4"]]),
            mytardis.create_graph_paramset('dsetgraph',
                                           name='hrmcdset%s' % m,
                                           graph_info={},
                                           value_dict={
                                               "hrmcdset%s/step" % m: xs,
                                               "hrmcdset%s/err" % m: ys
                                           },
                                           value_keys=[]),
        ]
        datafile_paramset = [
            mytardis.create_graph_paramset('dfilegraph',
                                           name="hrmcdfile",
                                           graph_info={},
                                           value_dict={},
                                           value_keys=[])
        ]
        dfile_extract_func = {
            'psd.dat': extract_psd_func,
            'PSD_exp.dat': extract_psdexp_func,
            'data_grfinal.dat': extract_grfinal_func,
            'input_gr.dat': extract_inputgr_func
        }
        logger.debug("experiment_paramset=%s" % experiment_paramset)
        logger.debug("dataset_paramset=%s" % dataset_paramset)
        logger.debug("datafile_paramset=%s" % datafile_paramset)
        logger.debug("dfile_extract_func=%s" % dfile_extract_func)

        return (experiment_paramset, dataset_paramset, datafile_paramset,
                dfile_extract_func)
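
# The cut_xs / cut_ys downsampling used above divides by len(ys) / 50 (and
# elsewhere by len / 20), which raises ZeroDivisionError for short series.
# A minimal standalone sketch of a guarded variant (a hypothetical helper,
# not part of the connector code):

def downsample(xs, ys, max_points=50):
    # keep at most max_points evenly spaced (x, y) pairs;
    # assumes len(xs) == len(ys), as the FIXMEs in these snippets also note
    step = max(1, len(xs) // max_points)
    return xs[::step], ys[::step]

# e.g. downsample(range(1000), range(1000)) keeps every 20th pair
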
class HRMCConverge(Converge):

    def input_valid(self, settings_to_test):
        """ Return a tuple, where the first element is True settings_to_test
        are syntactically and semantically valid for this stage.  Otherwise,
        return False with the second element in the tuple describing the
        problem
        """
        error = []
        try:
            int(getval(settings_to_test, '%s/input/hrmc/max_iteration' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            error.append("Cannot load max_iteration")

        try:
            float(getval(settings_to_test, '%s/input/hrmc/error_threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            error.append("Cannot load error threshold")

        if error:
            return (False, '. '.join(error))
        return (True, "ok")

    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url, all_settings):
        logger.debug("curate_dataset")
        iter_output_dir = os.path.join(base_dir, "output")
        logger.debug("iter_output_dir=%s" % iter_output_dir)

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug("iter_output_dir=%s" % iter_output_dir)
        logger.debug("output_url=%s" % output_url)
        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        curate_data = (getval(run_settings, '%s/input/mytardis/curate_data' % RMIT_SCHEMA))
        if curate_data:
            if all_settings['mytardis_host']:

                EXP_DATASET_NAME_SPLIT = 2

                def get_exp_name_for_output(settings, url, path):
                    return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))

                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)

                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)

                    source_url = get_url_with_credentials(
                        settings, os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError, e:
                        logger.warn("cannot read file %s" % e)
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    logger.debug("content=%s" % content)
                    try:
                        values_map = dict(json.loads(str(content)))
                    except Exception, e:
                        logger.error("cannot load values_map %s: from %s.  Error=%s" % (content, source_url, e))
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    try:
                        iteration = str(path.split(os.sep)[-2:-1][0])
                    except Exception, e:
                        logger.error(e)
                        iteration = ""

                    if "_" in iteration:
                        iteration = iteration.split("_")[1]
                    else:
                        iteration = "final"

                    dataset_name = "%s_%s_%s" % (iteration,
                        values_map['generator_counter'],
                        values_map['run_counter'])
                    logger.debug("dataset_name=%s" % dataset_name)
                    return dataset_name

                re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
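                # e.g. re_dbl_fort.sub(r'\1E\2', '0.12345D+02') -> '0.12345E+02';
                # i.e. Fortran-style double precision exponents become E notation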

                exp_value_keys = []
                legends = []
                for m, node_dir in enumerate(node_output_dirnames):
                    node_path = os.path.join(iter_output_dir, node_dir)

                    exp_value_keys.append(["hrmcdset%s/step" % m, "hrmcdset%s/err" % m])

                    source_url = get_url_with_credentials(all_settings,
                                                   node_path, is_relative_path=False)

                    (source_scheme, source_host, source_path, source_location,
                        query_settings) = storage.parse_bdpurl(source_url)
                    logger.debug("source_url=%s" % source_url)
                    legends.append(
                        get_dataset_name_for_output(
                            all_settings, "", source_path))

                logger.debug("exp_value_keys=%s" % exp_value_keys)
                logger.debug("legends=%s" % legends)

                graph_paramset = [mytardis.create_graph_paramset("expgraph",
                    name="hrmcexp2",
                    graph_info={"axes": ["step", "ERRGr*wf"], "precision": [0, 2], "legends": legends},
                    value_dict={},
                    value_keys=exp_value_keys)]

                for m, node_dir in enumerate(node_output_dirnames):
                    node_path = os.path.join(iter_output_dir, node_dir)
                    logger.debug("node_path=%s" % node_path)

                    #FIXME: this calculation should be done as in extract_psd_func
                    # pulling directly from data_errors rather than passing in
                    # through nested function.
                    dataerrors_url = get_url_with_credentials(all_settings,
                        os.path.join(node_path, DATA_ERRORS_FILE),
                        is_relative_path=False)
                    logger.debug("dataerrors_url=%s" % dataerrors_url)
                    dataerrors_content = storage.get_file(dataerrors_url)
                    xs = []
                    ys = []
                    for i, line in enumerate(dataerrors_content.splitlines()):
                        if i == 0:
                            continue
                        columns = line.split()
                        try:
                            hrmc_step = int(columns[STEP_COLUMN_NUM])
                        except ValueError:
                            logger.warn("could not parse hrmc_step value on line %s" % i)
                            continue
                        # convert Fortran-style double precision exponents (D/d) to E notation
                        val = columns[ERRGR_COLUMN_NUM]
                        val = re_dbl_fort.sub(r'\1E\2', val)
                        logger.debug("val=%s" % val)
                        try:
                            hrmc_errgr = float(val)
                        except ValueError:
                            logger.warn("could not parse hrmc_errgr value on line %s" % i)
                            continue
                        xs.append(hrmc_step)
                        ys.append(hrmc_errgr)

                    logger.debug("xs=%s" % xs)
                    logger.debug("ys=%s" % ys)

                    crit_url = get_url_with_credentials(all_settings,
                        os.path.join(node_path, "criterion.txt"), is_relative_path=False)
                    try:
                        crit = storage.get_file(crit_url)
                    except (ValueError, IOError):
                        crit = None
                    # FIXME: can crit be zero?
                    if crit:
                        hrmcdset_val = {"hrmcdset/it": self.id, "hrmcdset/crit": crit}
                    else:
                        hrmcdset_val = {}

                    source_url = get_url_with_credentials(
                        all_settings, node_path, is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)

                    # TODO: move into utility function for reuse
                    def extract_psd_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(dataerrors_content.splitlines()):
                            if i == 0:
                                continue
                            columns = line.split()

                            val = columns[STEP_COLUMN_NUM]
                            val = re_dbl_fort.sub(r'\1E\2', val)
                            logger.debug("val=%s" % val)
                            try:
                                x = float(val)
                            except ValueError:
                                logger.warn("could not parse value on line %s" % i)
                                continue

                            val = columns[ERRGR_COLUMN_NUM]
                            val = re_dbl_fort.sub(r'\1E\2', val)
                            logger.debug("val=%s" % val)
                            try:
                                y = float(val)
                            except ValueError:
                                logger.warn("could not parse value on line %s" % i)
                                continue

                            xs.append(x)
                            ys.append(y)
                        res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
                        return res

                    def extract_psdexp_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
                        return res

                    def extract_grfinal_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        #FIXME: assumes len(xs) == len(ys); also raises
                        # ZeroDivisionError when len(xs) < 20.
                        #TODO: hack to handle when xs and ys are too
                        # large to fit in Parameter with db_index.
                        # solved by function call at destination
                        cut_xs = [xs[i] for i, x in enumerate(xs)
                            if (i % (len(xs) / 20) == 0)]
                        cut_ys = [ys[i] for i, x in enumerate(ys)
                            if (i % (len(ys) / 20) == 0)]

                        res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
                        return res

                    def extract_inputgr_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        #FIXME: assumes len(xs) == len(ys); also raises
                        # ZeroDivisionError when len(xs) < 20.
                        #TODO: hack to handle when xs and ys are too
                        # large to fit in Parameter with db_index.
                        # solved by function call at destination
                        cut_xs = [xs[i] for i, x in enumerate(xs)
                            if (i % (len(xs) / 20) == 0)]
                        cut_ys = [ys[i] for i, x in enumerate(ys)
                            if (i % (len(ys) / 20) == 0)]

                        res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
                        return res
                    #todo: replace self.boto_setttings with mytardis_settings

                    experiment_id = mytardis.create_dataset(
                        settings=all_settings,
                        source_url=source_url,
                        exp_name=get_exp_name_for_output,
                        dataset_name=get_dataset_name_for_output,
                        exp_id=experiment_id,
                        experiment_paramset=graph_paramset,
                        dataset_paramset=[
                            mytardis.create_paramset('hrmcdataset/output', []),
                            mytardis.create_graph_paramset('dsetgraph',
                                name="hrmcdset",
                                graph_info={"axes":["r (Angstroms)", "PSD"],
                                    "legends":["psd", "PSD_exp"],  "type":"line"},
                                value_dict=hrmcdset_val,
                                value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                                    ["hrmcdfile/r2", "hrmcdfile/g2"]]),
                            mytardis.create_graph_paramset('dsetgraph',
                                name='hrmcdset2',
                                graph_info={"axes":["r (Angstroms)", "g(r)"],
                                    "legends":["data_grfinal", "input_gr"],
                                    "type":"line"},
                                value_dict={},
                                value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                                    ["hrmcdfile/r4", "hrmcdfile/g4"]]),
                            mytardis.create_graph_paramset('dsetgraph',
                                name='hrmcdset%s' % m,
                                graph_info={},
                                value_dict={"hrmcdset%s/step" % m: xs,
                                    "hrmcdset%s/err" % m: ys},
                                value_keys=[]),
                            ],
                        datafile_paramset=[
                            mytardis.create_graph_paramset('dfilegraph',
                                name="hrmcdfile",
                                graph_info={},
                                value_dict={},
                                value_keys=[])
                            ],
                        dfile_extract_func={
                            'psd.dat': extract_psd_func,
                            'PSD_exp.dat': extract_psdexp_func,
                            'data_grfinal.dat': extract_grfinal_func,
                            'input_gr.dat': extract_inputgr_func})
                    graph_paramset = []
                if directive == "vasp":

                    # TODO: this is very domain specific

                    OUTCAR_FILE = "OUTCAR"
                    VALUES_FILE = "values"

                    outcar_url = storage.get_url_with_credentials(
                        local_settings,
                        os.path.join(dest_url, OUTCAR_FILE),
                        is_relative_path=False)
                    logger.debug("outcar_url=%s" % outcar_url)

                    try:
                        outcar_content = storage.get_file(outcar_url)
                    except IOError, e:
                        logger.error(e)
                        toten = None
                    else:
                        toten = None
                        for line in outcar_content.split('\n'):
                            #logger.debug("line=%s" % line)
                            if 'e  en' in line:
                                logger.debug("found")
                                try:
                                    toten = float(line.rsplit(' ', 2)[-2])
                                except ValueError, e:
                                    logger.error(e)
                                    pass
                                break
Exemple #23
0
    def get_output(self, ip_address, process_id, output_dir, local_settings,
                   computation_platform_settings, output_storage_settings,
                   run_settings):
        """
            Retrieve the output from the task on the node
        """

        logger.debug("get_output of process %s on %s" % (process_id, ip_address))
        output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                    output_storage_settings['type'])
        #fixme: add call get_process_output_path
        #cloud_path = os.path.join(local_settings['payload_destination'],
        #                          #str(contextid), #fixme: uncomment
        #                          str(process_id),
        #                          local_settings['process_output_dirname']
        #                          )
        relative_path_suffix = self.get_relative_output_path(local_settings)
        cloud_path = os.path.join(relative_path_suffix,
                                  str(process_id),
                                  local_settings['process_output_dirname']
                                  )
        #cloud_path = self.get_process_output_path(run_settings, process_id)
        logger.debug("cloud_path=%s" % cloud_path)
        logger.debug("Transferring output from %s to %s" % (cloud_path, output_dir))
        ip = ip_address  # botocloudconnector.get_instance_ip(instance_id, settings)
        #ssh = open_connection(ip_address=ip, settings=settings)
        source_files_location = "%s://%s@%s" % (computation_platform_settings['scheme'],
                                                computation_platform_settings['type'],
                                                 os.path.join(ip, cloud_path))
        source_files_url = get_url_with_credentials(
            computation_platform_settings, source_files_location,
            is_relative_path=False)
        logger.debug('source_files_url=%s' % source_files_url)

        dest_files_url = get_url_with_credentials(
            output_storage_settings,
            output_prefix + os.path.join(
                self.job_dir, self.output_dir, process_id),
            is_relative_path=False)
        logger.debug('dest_files_url=%s' % dest_files_url)
        # FIXME: might want to turn on paramiko compress function
        # to speed up this transfer
        storage.copy_directories(source_files_url, dest_files_url)


        #copying values file
        values_file_path = os.path.join(relative_path_suffix,
                                  str(process_id),
                                  local_settings['smart_connector_input'],
                                  django_settings.VALUES_FNAME
                                  )
        values_files_location = "%s://%s@%s" % (computation_platform_settings['scheme'],
                                                computation_platform_settings['type'],
                                                 os.path.join(ip, values_file_path))
        logger.debug("values_files_location=%s" % values_files_location)
        values_source_url = get_url_with_credentials(
            computation_platform_settings, values_files_location,
            is_relative_path=False)

        logger.debug("values_source_url=%s" % values_source_url)

        values_dest_url = get_url_with_credentials(
            output_storage_settings,
            output_prefix + os.path.join(
                self.job_dir, self.output_dir, process_id, django_settings.VALUES_FNAME),
            is_relative_path=False)
        logger.debug("values_dest_url=%s" % values_dest_url)
        try:
            logger.debug('reading %s' % values_source_url)
            content = storage.get_file(values_source_url)
        except IOError, e:
            content = {}
    def process_outputs(self, run_settings, base_dir, output_url, all_settings,
                        offset):

        # output_dir = 118.138.241.232/outptuersdfsd/sweep277/hrmc278/output_1
        # output_prefix = ssh://unix@
        # node_output_dir = 2

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])

        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(base_dir, "output_%s" % id)
        logger.debug('iter_output_dir=%s' % iter_output_dir)
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        logger.debug('output_prefix=%s' % output_prefix)
        #iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug('output_url=%s' % output_url)
        (scheme, host, iter_output_path, location,
         query_settings) = storage.parse_bdpurl(output_url)
        logger.debug("iter_output_path=%s" % iter_output_path)
        iter_out_fsys = storage.get_filesystem(output_url)
        logger.debug("iter_out_fsys=%s" % iter_out_fsys)
        node_output_dirnames, _ = iter_out_fsys.listdir(iter_output_path)
        logger.debug('node_output_dirnames=%s' % node_output_dirnames)
        self.audit = ""

        Node_info = namedtuple('Node_info', ['dirname', 'number', 'criterion'])

        BASE_FNAME = "HRMC.inp"

        # generate criteria
        self.outputs = []
        for node_output_dirname in node_output_dirnames:
            node_path = output_prefix + os.path.join(iter_output_dir,
                                                     node_output_dirname)
            criterion = self.compute_psd_criterion(all_settings, node_path)
            #criterion = self.compute_hrmc_criterion(values_map['run_counter'], node_output_dirname, fs,)
            logger.debug("criterion=%s" % criterion)

            try:
                values_url = get_url_with_credentials(
                    all_settings,
                    os.path.join(node_path, '%s_values' % BASE_FNAME),
                    is_relative_path=False)

                values_content = storage.get_file(values_url)

                logger.debug("values_file=%s" % values_url)
            except IOError:
                logger.warn("no values file found")
                values_map = {}
            else:
                values_map = dict(json.loads(values_content))
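                # the values file is assumed to be a flat JSON object, e.g.
                # {"run_counter": 1, "generator_counter": 20, ...}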

            self.outputs.append(
                Node_info(dirname=node_output_dirname,
                          number=values_map['run_counter'],
                          criterion=criterion))

        if not self.outputs:
            logger.error("no ouput found for this iteration")
            return

        self.outputs.sort(key=lambda x: int(x.criterion))
        logger.debug("self.outputs=%s" % self.outputs)

        try:
            # FIXME: need to validate this output to make sure list of int
            threshold = ast.literal_eval(
                getval(run_settings, '%s/input/hrmc/threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            logger.warn("no threshold found when expected")
            return False
        logger.debug("threshold = %s" % threshold)
        total_picks = 1
        if len(threshold) > 1:
            for t in threshold:
                total_picks *= t
        else:
            total_picks = threshold[0]

        def copy_files_with_pattern(iter_out_fsys, source_path, dest_path,
                                    pattern, all_settings):
            """
            """
            output_prefix = '%s://%s@' % (all_settings['scheme'],
                                          all_settings['type'])

            logger.debug('source_path=%s, dest_path=%s' %
                         (source_path, dest_path))
            # (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(source_path)
            _, node_output_fnames = iter_out_fsys.listdir(source_path)
            ip_address = all_settings['ip_address']
            for f in node_output_fnames:
                if fnmatch.fnmatch(f, pattern):
                    source_url = get_url_with_credentials(
                        all_settings,
                        output_prefix +
                        os.path.join(ip_address, source_path, f),
                        is_relative_path=False)
                    dest_url = get_url_with_credentials(
                        all_settings,
                        output_prefix + os.path.join(ip_address, dest_path, f),
                        is_relative_path=False)
                    logger.debug('source_url=%s, dest_url=%s' %
                                 (source_url, dest_url))
                    content = storage.get_file(source_url)
                    storage.put_file(dest_url, content)

        # Make new input dirs
        new_input_dir = os.path.join(base_dir, "input_%d" % (id + 1))
        for index in range(0, total_picks):
            node_info = self.outputs[index]
            logger.debug("node_info.dirname=%s" % node_info.dirname)
            logger.debug("node_info=%s" % str(node_info))

            new_input_path = os.path.join(new_input_dir, node_info.dirname)
            logger.debug("New input node dir %s" % new_input_path)

            old_output_path = os.path.join(iter_output_dir, node_info.dirname)

            # Move all existing domain input files unchanged to next input directory
            for f in DOMAIN_INPUT_FILES:
                source_url = get_url_with_credentials(
                    all_settings,
                    output_prefix + os.path.join(old_output_path, f),
                    is_relative_path=False)
                dest_url = get_url_with_credentials(
                    all_settings,
                    output_prefix + os.path.join(new_input_path, f),
                    is_relative_path=False)
                logger.debug('source_url=%s, dest_url=%s' %
                             (source_url, dest_url))

                content = storage.get_file(source_url)
                logger.debug('content collected')
                storage.put_file(dest_url, content)
                logger.debug('put successfully')

            logger.debug('put file successfully')
            pattern = "*_values"
            output_offset = os.path.join(offset, "output_%s" % id,
                                         node_info.dirname)
            input_offset = os.path.join(offset, "input_%s" % (id + 1),
                                        node_info.dirname)
            copy_files_with_pattern(iter_out_fsys, output_offset, input_offset,
                                    pattern, all_settings)

            pattern = "*_template"
            copy_files_with_pattern(iter_out_fsys, output_offset, input_offset,
                                    pattern, all_settings)

            # NB: Converge stage triggers based on criterion value from audit.
            logger.debug('starting audit')
            info = "Run %s preserved (error %s)\n" % (Node_info.number,
                                                      Node_info.criterion)
            audit_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(new_input_path, 'audit.txt'),
                is_relative_path=False)
            storage.put_file(audit_url, info)
            logger.debug("audit=%s" % info)
            logger.debug('1:audit_url=%s' % audit_url)
            self.audit += info

            # move xyz_final.xyz to initial.xyz
            source_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(old_output_path, "xyz_final.xyz"),
                is_relative_path=False)
            logger.debug('source_url=%s' % source_url)
            dest_url = get_url_with_credentials(
                all_settings,
                output_prefix +
                os.path.join(new_input_path, 'input_initial.xyz'),
                is_relative_path=False)
            logger.debug('dest_url=%s' % dest_url)
            content = storage.get_file(source_url)
            logger.debug('content=%s' % content)
            storage.put_file(dest_url, content)
            self.audit += "spawning diamond runs\n"

        logger.debug(
            "input_dir=%s" %
            (output_prefix + os.path.join(new_input_dir, 'audit.txt')))
        audit_url = get_url_with_credentials(
            all_settings,
            output_prefix + os.path.join(new_input_dir, 'audit.txt'),
            is_relative_path=False)
        logger.debug('audit_url=%s' % audit_url)
        storage.put_file(audit_url, self.audit)
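
# The audit line written above ("Run %s preserved (error %s)") is parsed again
# by the converge stage with a regular expression. A small standalone
# round-trip sketch, using the format string and pattern from these snippets:

import re

info = "Run %s preserved (error %s)\n" % (3, 0.0123)
pattern = re.compile(r"Run (\d+) preserved \(error[ \t]*([0-9\.]+)\)", re.MULTILINE)
match = pattern.search(info)
assert match is not None
best_numb = int(match.group(1))      # 3
criterion = float(match.group(2))    # 0.0123
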
Exemple #25
0
    def process_outputs(self, run_settings, base_dir, input_url, all_settings):

        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(base_dir, "input_%s" % (id + 1))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

        (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(input_url)
        iter_out_fsys = storage.get_filesystem(input_url)

        input_dirs, _ = iter_out_fsys.listdir(iter_output_path)

        # TODO: store all audit info in single file in input_X directory in transform,
        # so we do not have to load individual files within node directories here.
        min_crit = sys.float_info.max - 1.0
        min_crit_index = sys.maxint

        logger.debug("input_dirs=%s" % input_dirs)
        for input_dir in input_dirs:
            node_path = os.path.join(iter_output_dir, input_dir)
            logger.debug('node_path= %s' % node_path)

            # Retrieve audit file

            # audit_url = get_url_with_credentials(output_storage_settings,
            #     output_prefix + os.path.join(self.iter_inputdir, input_dir, 'audit.txt'), is_relative_path=False)
            audit_url = get_url_with_credentials(all_settings, os.path.join(node_path, "audit.txt"), is_relative_path=False)
            audit_content = storage.get_file(audit_url)
            logger.debug('audit_url=%s' % audit_url)

            # extract the best criterion error
            # FIXME: audit.txt is potentially debug file so format may not be fixed.
            p = re.compile("Run (\d+) preserved \(error[ \t]*([0-9\.]+)\)", re.MULTILINE)
            m = p.search(audit_content)
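            # e.g. "Run 3 preserved (error 0.0123)" -> best_numb=3, criterion=0.0123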
            criterion = None
            if m:
                criterion = float(m.group(2))
                best_numb = int(m.group(1))
                # NB: assumes that subdirs in new input_x have the same names as the output dirs that created them.
                best_node = input_dir
            else:
                message = "Cannot extract criterion from audit file for iteration %s" % (self.id + 1)
                logger.warn(message)
                raise IOError(message)

            if criterion < min_crit:
                min_crit = criterion
                min_crit_index = best_numb
                min_crit_node = best_node

        logger.debug("min_crit = %s at %s" % (min_crit, min_crit_index))

        if min_crit_index >= sys.maxint:
            raise BadInputException("Unable to find minimum criterion of input files")

        # get previous best criterion
        try:
            self.prev_criterion = float(getval(run_settings, '%s/converge/criterion' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            self.prev_criterion = sys.float_info.max - 1.0
            logger.warn("no previous criterion found")

        # check whether we are under the error threshold
        logger.debug("best_num=%s" % best_numb)
        logger.debug("prev_criterion = %f" % self.prev_criterion)
        logger.debug("min_crit = %f" % min_crit)
        logger.debug('Current min criterion: %f, Prev '
                     'criterion: %f' % (min_crit, self.prev_criterion))
        difference = self.prev_criterion - min_crit
        logger.debug("Difference %f" % difference)

        try:
            max_iteration = int(getval(run_settings, '%s/input/hrmc/max_iteration' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            raise BadInputException("unknown max_iteration")
        logger.debug("max_iteration=%s" % max_iteration)

        try:
            self.error_threshold = float(getval(run_settings, '%s/input/hrmc/error_threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            raise BadInputException("uknown error threshold")
        logger.debug("error_threshold=%s" % self.error_threshold)

        if self.id >= (max_iteration - 1):
            logger.debug("Max Iteration Reached %d " % self.id)
            return (True, min_crit)

        elif min_crit <= self.prev_criterion and difference <= self.error_threshold:
            logger.debug("Convergence reached %f" % difference)
            return (True, min_crit)

        else:
            if difference < 0:
                logger.debug("iteration diverged")
            logger.debug("iteration continues: %d iteration so far" % self.id)

        return (False, min_crit)
Exemple #26
0
                input_prefix = '%s://%s@' % (input_storage_settings['scheme'],
                                             input_storage_settings['type'])

                values_url = get_url_with_credentials(
                    input_storage_settings,
                    input_prefix + os.path.join(
                        input_storage_settings['ip_address'],
                        input_storage_offset, "initial", VALUES_MAP_FILE),
                    is_relative_path=False)
                logger.debug("values_url=%s" % values_url)

                values_e_url = get_url_with_credentials(local_settings,
                                                        values_url,
                                                        is_relative_path=False)
                logger.debug("values_url=%s" % values_e_url)
                values_content = get_file(values_e_url)
                logger.debug("values_content=%s" % values_content)
                starting_map = dict(json.loads(values_content))
            except IOError:
                logger.warn("no starting values file found")
            except ValueError:
                logger.error("problem parsing contents of %s" %
                             VALUES_MAP_FILE)
                pass
            logger.debug("starting_map after initial values=%s" %
                         pformat(starting_map))

        # Copy form input values into starting map
        # FIXME: could have name collisions between form inputs and
        # starting values.
        for ns in run_settings:
Exemple #27
0
class Sweep(Stage):

    def __init__(self, user_settings=None):
        self.numbfile = 0
        logger.debug("Sweep stage initialized")

    def is_triggered(self, run_settings):
        logger.debug('run_settings=%s' % run_settings)

        try:
            configure_done = int(getval(run_settings,
                '%s/stages/sweep/sweep_done' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            return True

        return not configure_done

    def _get_sweep_name(self, run_settings):
        try:
            sweep_name = getval(run_settings, '%s/directive_profile/sweep_name' % RMIT_SCHEMA)
        except SettingNotFoundException:
            sweep_name = 'unknown_sweep'
        return sweep_name

    def process(self, run_settings):
        logger.debug('run_settings=%s' % run_settings)

        # Need to make copy because we pass on run_settings to sub connector
        # so any changes we make here to run_settings WILL be inherited
        def make_local_settings(run_settings):
            from copy import deepcopy
            local_settings = deepcopy(getvals(run_settings, models.UserProfile.PROFILE_SCHEMA_NS))

            update(local_settings, run_settings,
                    RMIT_SCHEMA + '/system/platform',
                    # RMIT_SCHEMA + '/input/mytardis/experiment_id',
                    # RMIT_SCHEMA + '/system/random_numbers',
                   )
            local_settings['bdp_username'] = getval(
                run_settings, '%s/bdp_userprofile/username' % RMIT_SCHEMA)
            return local_settings

        local_settings = make_local_settings(run_settings)
        logger.debug('local_settings=%s' % local_settings)

        setval(run_settings,
               '%s/platform/computation/platform_url' % RMIT_SCHEMA,
               getval(run_settings,
                      '%s/input/system/compplatform/computation_platform'
                            % RMIT_SCHEMA))

        def _parse_output_location(run_settings, location):

            loc_list = location.split('/')
            name = loc_list[0]
            offset = ''
            if len(loc_list) > 1:
                offset = os.path.join(*loc_list[1:])
            logger.debug('offset=%s' % offset)
            return name, offset

        contextid = int(getval(run_settings, '%s/system/contextid' % RMIT_SCHEMA))
        logger.debug("contextid=%s" % contextid)
        sweep_name = self._get_sweep_name(run_settings)
        logger.debug("sweep_name=%s" % sweep_name)

        output_loc = self.output_exists(run_settings)
        location = ""
        if output_loc:
            location = getval(run_settings, output_loc)
            output_storage_name, output_storage_offset = \
                _parse_output_location(run_settings, location)
            setval(run_settings,
                   '%s/platform/storage/output/platform_url' % RMIT_SCHEMA,
                   output_storage_name)
            setval(run_settings, '%s/platform/storage/output/offset' % RMIT_SCHEMA,
                   os.path.join(output_storage_offset, '%s%s' % (sweep_name, contextid)))

        def _parse_input_location(run_settings, location):
            loc_list = location.split('/')
            name = loc_list[0]
            offset = ''
            if len(loc_list) > 1:
                offset = os.path.join(*loc_list[1:])
            logger.debug('offset=%s' % offset)
            return (name, offset)
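
        # e.g. _parse_input_location(run_settings, "local/sweep277/run1/input_0")
        #   -> ("local", "sweep277/run1/input_0")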

        input_loc = self.input_exists(run_settings)
        if input_loc:
            location = getval(run_settings, input_loc)
            input_storage_name, input_storage_offset = \
                _parse_input_location(run_settings, location)
            setval(run_settings, '%s/platform/storage/input/platform_url' % RMIT_SCHEMA,
                   input_storage_name)
            # store offsets
            setval(run_settings,
                   '%s/platform/storage/input/offset' % RMIT_SCHEMA,
                   input_storage_offset)

        # TODO: replace with scratch space computation platform space
        self.scratch_platform = '%s%s%s' % (
            manage.get_scratch_platform(), sweep_name,
            contextid)

        # mytardis

        if output_loc:
            try:
                self.experiment_id = int(getval(run_settings, '%s/input/mytardis/experiment_id' % RMIT_SCHEMA))
            except (KeyError, ValueError):
                self.experiment_id = 0
            try:
                curate_data = getval(run_settings, '%s/input/mytardis/curate_data' % RMIT_SCHEMA)
            except SettingNotFoundException:
                curate_data = False
            if curate_data:
                self.experiment_id = self.curate_data(run_settings, location, self.experiment_id)
            setval(run_settings,
                   '%s/input/mytardis/experiment_id' % RMIT_SCHEMA,
                   str(self.experiment_id))

        # generate all variations
        map_text = getval(run_settings, '%s/input/sweep/sweep_map' % RMIT_SCHEMA)
        # map_text = run_settings[RMIT_SCHEMA + '/input/sweep']['sweep_map']
        sweep_map = json.loads(map_text)
        logger.debug("sweep_map=%s" % pformat(sweep_map))
        runs = _expand_variations(maps=[sweep_map], values={})
        logger.debug("runs=%s" % runs)

        # Create random numbers if needed
        # TODO: move iseed out of hrmc into separate generic schema
        # to use on any sweepable connector and make this function
        # completely hrmc independent.

        rands = []

        try:
            self.rand_index = getval(run_settings, '%s/input/hrmc/iseed' % RMIT_SCHEMA)
            logger.debug("rand_index=%s" % self.rand_index)
        except SettingNotFoundException:
            pass
        else:
            # prep random seeds for each run based off original iseed
            # FIXME: inefficient for large random file
            # TODO, FIXME: this is potentially problematic if different
            # runs end up overlapping in the random numbers they utilise.
            # solution is to have separate random files per run or partition
            # big file up.

            try:
                num_url = getval(run_settings, "%s/system/random_numbers" % RMIT_SCHEMA)
                logger.debug('num_url=%s' % num_url)
            except SettingNotFoundException:
                pass
            else:
                try:
                    local_settings['random_numbers'] = num_url
                    rands = generate_rands(settings=local_settings,
                                           start_range=0,
                                           end_range=-1,
                                           num_required=len(runs),
                                           start_index=self.rand_index)
                    logger.debug("rands=%s" % rands)
                except Exception, e:
                    logger.debug('error')
                    logger.error(e)
                    raise
        # load the initial values map in the input directory, which
        # contains variables to use for all subdirectives
        starting_map = {}
        if input_loc:

            input_storage_settings = self.get_platform_settings(
                run_settings, 'http://rmit.edu.au/schemas/platform/storage/input')
            try:
                input_prefix = '%s://%s@' % (input_storage_settings['scheme'],
                                        input_storage_settings['type'])

                values_url = get_url_with_credentials(
                    input_storage_settings,
                    input_prefix + os.path.join(input_storage_settings['ip_address'],
                        input_storage_offset, "initial", VALUES_MAP_FILE),
                    is_relative_path=False)
                logger.debug("values_url=%s" % values_url)

                values_e_url = get_url_with_credentials(
                    local_settings,
                    values_url,
                    is_relative_path=False)
                logger.debug("values_url=%s" % values_e_url)
                values_content = get_file(values_e_url)
                logger.debug("values_content=%s" % values_content)
                starting_map = dict(json.loads(values_content))
            except IOError:
                logger.warn("no starting values file found")
            except ValueError:
                logger.error("problem parsing contents of %s" % VALUES_MAP_FILE)
                pass
            logger.debug("starting_map after initial values=%s"
                % pformat(starting_map))

        # Copy form input values into starting map
        # FIXME: could have name collisions between form inputs and
        # starting values.
        for ns in run_settings:
            if ns.startswith(RMIT_SCHEMA + "/input"):
                # for k, v in run_settings[ns].items():
                for k, v in getvals(run_settings, ns).items():
                    starting_map[k] = v
        logger.debug("starting_map after form=%s" % pformat(starting_map))

        # FIXME: we assume we will always have input directory

        # Get input_url directory
        input_url = ""
        if input_loc:
            input_prefix = '%s://%s@' % (input_storage_settings['scheme'],
                                    input_storage_settings['type'])
            input_url = get_url_with_credentials(input_storage_settings,
                input_prefix + os.path.join(input_storage_settings['ip_address'],
                    input_storage_offset),
            is_relative_path=False)
            logger.debug("input_url=%s" % input_url)

        current_context = models.Context.objects.get(id=contextid)
        user = current_context.owner.user.username

        # For each of the generated runs, copy across initial input
        # to individual input directories with variation values,
        # and then schedule subrun of sub directive
        logger.debug("run_settings=%s" % run_settings)
        for i, context in enumerate(runs):

            run_counter = int(context['run_counter'])
            logger.debug("run_counter=%s" % run_counter)
            run_inputdir = os.path.join(self.scratch_platform,
                SUBDIRECTIVE_DIR % {'run_counter': str(run_counter)},
                FIRST_ITERATION_DIR,)
            logger.debug("run_inputdir=%s" % run_inputdir)
            run_iter_url = get_url_with_credentials(local_settings,
                run_inputdir, is_relative_path=False)
            logger.debug("run_iter_url=%s" % run_iter_url)

            # Duplicate any input_directory into runX duplicates
            if input_loc:
                logger.debug("context=%s" % context)
                logger.debug("systemsettings=%s"
                         % pformat(getvals(run_settings, RMIT_SCHEMA + '/input/system')))
                copy_directories(input_url, run_iter_url)

            # Need to load up existing values, because original input_dir could
            # have contained values for the whole run
            # This code is deprecated in favour of single values file.
            self.error_detected = False


            try:
                template_name = getval(run_settings,
                                       '%s/stages/sweep/template_name'
                                            % RMIT_SCHEMA)
            except SettingNotFoundException:
                pass
            else:
                logger.debug("template_name=%s" % template_name)
                v_map = {}
                try:
                    values_url = get_url_with_credentials(
                        local_settings,
                        os.path.join(run_inputdir, "initial",
                             VALUES_MAP_TEMPLATE_FILE % {'template_name': template_name}),
                        is_relative_path=False)
                    logger.debug("values_url=%s" % values_url)
                    values_content = get_file(values_url)
                    logger.debug("values_content=%s" % values_content)
                    v_map = dict(json.loads(values_content))
                except IOError:
                    logger.warn("no values file found")
                except ValueError:
                    logger.error("problem parsing contents of %s" % VALUES_MAP_FILE)
                    pass
                v_map.update(starting_map)
                v_map.update(context)
                logger.debug("new v_map=%s" % v_map)
                put_file(values_url, json.dumps(v_map, indent=4))

            v_map = {}
            try:
                values_url = get_url_with_credentials(
                    local_settings,
                    os.path.join(run_inputdir, "initial",
                        VALUES_MAP_FILE),
                    is_relative_path=False)
                logger.debug("values_url=%s" % values_url)
                values_content = get_file(values_url)
                logger.debug("values_content=%s" % values_content)
                v_map = dict(json.loads(values_content))
            except IOError:
                logger.warn("no values file found")
            except ValueError:
                logger.error("problem parsing contents of %s" % VALUES_MAP_FILE)
                pass
            v_map.update(starting_map)
            v_map.update(context)
            logger.debug("new v_map=%s" % v_map)
            put_file(values_url, json.dumps(v_map, indent=4))

            # Set random numbers for subdirective
            logger.debug("run_settings=%s" % pformat(run_settings))
            if rands:
                setval(run_settings, '%s/input/hrmc/iseed' % RMIT_SCHEMA, rands[i])

            if input_loc:
                # Set revised input_location for subdirective
                setval(run_settings, input_loc,
                    "%s/%s/%s" % (self.scratch_platform,
                                    SUBDIRECTIVE_DIR
                                        % {'run_counter': str(run_counter)},
                                    FIRST_ITERATION_DIR))

            # Redirect input
            run_input_storage_name, run_input_storage_offset = \
                _parse_input_location(run_settings,
                    "local/sweep%s/run%s/input_0" % (contextid, run_counter))
            # setval(run_settings,
            #        '%s/platform/storage/input/platform_url' % RMIT_SCHEMA,
            #        run_input_storage_name)
            # setval(run_settings,
            #        '%s/platform/storage/input/offset' % RMIT_SCHEMA,
            #        run_input_storage_offset)

            logger.debug("run_settings=%s" % pformat(run_settings))
            try:
                _submit_subdirective("nectar", run_settings, user, current_context)
            except Exception, e:
                logger.error(e)
                raise
Exemple #28
0
    def curate_dataset(self, run_settings, experiment_id, base_url, output_url,
                       all_settings):
        '''
            Curates dataset
        '''
        # Retrieves process directories below the current output location
        iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        current_output_url = "%s%s" % (
            output_prefix,
            os.path.join(base_url, "output_%s" % iteration))
        (scheme, host, current_output_path, location,
         query_settings) = storage.parse_bdpurl(output_url)
        output_fsys = storage.get_filesystem(output_url)
        process_output_dirs, _ = output_fsys.listdir(current_output_path)

        # Curates a dataset with metadata per process
        for i, process_output_dir in enumerate(process_output_dirs):
            # Expand the process output directory and add credentials for access
            process_output_url = '/'.join(
                [current_output_url, process_output_dir])
            process_output_url_with_cred = get_url_with_credentials(
                all_settings, process_output_url, is_relative_path=False)
            # Expand the process output file and add credentials for access
            output_file_url_with_cred = storage.get_url_with_credentials(
                all_settings,
                '/'.join([process_output_url, OUTPUT_FILE]),
                is_relative_path=False)
            try:
                output_content = storage.get_file(output_file_url_with_cred)
                val1, val2 = output_content.split()
            except (IndexError, IOError) as e:
                logger.warn(e)
                continue
            try:
                x = float(val1)
                y = float(val2)
            except (ValueError, IndexError) as e:
                logger.warn(e)
                continue

            # Returns the process id as MyTardis dataset name
            all_settings['graph_point_id'] = str(i)

            def _get_dataset_name(settings, url, path):
                return all_settings['graph_point_id']

            # Creates new dataset and adds to experiment
            # If experiment_id==0, creates new experiment
            experiment_id = mytardis.create_dataset(
                settings=all_settings,  # MyTardis credentials
                source_url=process_output_url_with_cred,
                exp_id=experiment_id,
                dataset_name=_get_dataset_name,  # function that defines the dataset name
                dataset_paramset=[
                    # a new blank parameter set conforming to schema 'remotemake/output'
                    mytardis.create_paramset("remotemake/output", []),
                    mytardis.create_graph_paramset(
                        "dsetgraph",  # name of schema
                        name="randdset",  # a unique dataset name
                        graph_info={},
                        value_dict={
                            "randdset/x": x,
                            "randdset/y": y
                        },  # values to be used in experiment graphs
                        value_keys=[]),
                ])
        return experiment_id
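
    # NOTE (annotation, not part of the original source): mytardis.create_dataset
    # takes a callable for dataset_name; judging by _get_dataset_name's signature
    # it is called back with (settings, url, path) and its return value becomes
    # the dataset name. Here it ignores its arguments and returns the process
    # index stored in all_settings['graph_point_id'], so each process output
    # directory is curated as its own numbered dataset, with the x/y pair from
    # OUTPUT_FILE attached via the "dsetgraph"/"randdset" graph parameter set.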
    def process_outputs(self, run_settings, base_dir, output_url, all_settings, offset):

        # output_dir = 118.138.241.232/outptuersdfsd/sweep277/hrmc278/output_1
        # output_prefix = ssh://unix@
        # node_output_dir = 2

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])

        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(base_dir, "output_%s" % id)
        logger.debug('iter_output_dir=%s' % iter_output_dir)
        logger.debug('output_prefix=%s' % output_prefix)
        #iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug('output_url=%s' % output_url)
        (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(output_url)
        logger.debug("iter_output_path=%s" % iter_output_path)
        iter_out_fsys = storage.get_filesystem(output_url)
        logger.debug("iter_out_fsys=%s" % iter_out_fsys)
        node_output_dirnames, _ = iter_out_fsys.listdir(iter_output_path)
        logger.debug('node_output_dirnames=%s' % node_output_dirnames)
        self.audit = ""

        Node_info = namedtuple('Node_info',
            ['dirname', 'number', 'criterion'])

        BASE_FNAME = "HRMC.inp"

        # generate criteria for each node output directory
        self.outputs = []
        for node_output_dirname in node_output_dirnames:
            node_path = output_prefix + os.path.join(iter_output_dir, node_output_dirname)
            criterion = self.compute_psd_criterion(all_settings, node_path)
            #criterion = self.compute_hrmc_criterion(values_map['run_counter'], node_output_dirname, fs,)
            logger.debug("criterion=%s" % criterion)

            try:
                values_url = get_url_with_credentials(
                    all_settings, os.path.join(node_path,
                    '%s_values' % BASE_FNAME), is_relative_path=False)

                values_content = storage.get_file(values_url)

                logger.debug("values_file=%s" % values_url)
            except IOError:
                logger.warn("no values file found")
                values_map = {}
            else:
                values_map = dict(json.loads(values_content))

            # run_counter may be absent if the values file was missing; default to 0
            self.outputs.append(Node_info(dirname=node_output_dirname,
                                          number=values_map.get('run_counter', 0),
                                          criterion=criterion))

        if not self.outputs:
            logger.error("no ouput found for this iteration")
            return

        self.outputs.sort(key=lambda x: int(x.criterion))
        logger.debug("self.outputs=%s" % self.outputs)

        try:
            # FIXME: need to validate this output to make sure list of int
            threshold = ast.literal_eval(getval(run_settings, '%s/input/hrmc/threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            logger.warn("no threshold found when expected")
            return False
        logger.debug("threshold = %s" % threshold)
        total_picks = 1
        if len(threshold) > 1:
            # total number of picks is the product of the threshold entries
            for i in threshold:
                total_picks *= i
        else:
            total_picks = threshold[0]

        def copy_files_with_pattern(iter_out_fsys, source_path,
                                 dest_path, pattern, all_settings):
            """
            """
            output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])

            logger.debug('source_path=%s, dest_path=%s' % (source_path, dest_path))
            # (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(source_path)
            _, node_output_fnames = iter_out_fsys.listdir(source_path)
            ip_address = all_settings['ip_address']
            for f in node_output_fnames:
                if fnmatch.fnmatch(f, pattern):
                    source_url = get_url_with_credentials(all_settings, output_prefix + os.path.join(ip_address, source_path, f), is_relative_path=False)
                    dest_url = get_url_with_credentials(all_settings, output_prefix + os.path.join(ip_address, dest_path, f), is_relative_path=False)
                    logger.debug('source_url=%s, dest_url=%s' % (source_url, dest_url))
                    content = storage.get_file(source_url)
                    storage.put_file(dest_url, content)
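
        # (annotation) copy_files_with_pattern is used below to carry each
        # preserved node's "*_values" and "*_template" files from the
        # output_<id> tree into the next iteration's input_<id+1> tree.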

        # Make new input dirs
        new_input_dir = os.path.join(base_dir, "input_%d" % (id + 1))
        for index in range(0, total_picks):
            node_info = self.outputs[index]
            logger.debug("node_info.dirname=%s" % node_info.dirname)
            logger.debug("node_info=%s" % str(node_info))

            new_input_path = os.path.join(new_input_dir, node_info.dirname)
            logger.debug("New input node dir %s" % new_input_path)

            old_output_path = os.path.join(iter_output_dir, node_info.dirname)

            # Move all existing domain input files unchanged to next input directory
            for f in DOMAIN_INPUT_FILES:
                source_url = get_url_with_credentials(
                    all_settings, output_prefix + os.path.join(old_output_path, f), is_relative_path=False)
                dest_url = get_url_with_credentials(
                    all_settings, output_prefix + os.path.join(new_input_path, f),
                    is_relative_path=False)
                logger.debug('source_url=%s, dest_url=%s' % (source_url, dest_url))

                content = storage.get_file(source_url)
                logger.debug('content collected')
                storage.put_file(dest_url, content)
                logger.debug('put successfully')

            logger.debug('put file successfully')
            pattern = "*_values"
            output_offset = os.path.join(offset, "output_%s" % id, node_info.dirname)
            input_offset = os.path.join(offset, "input_%s" % (id + 1), node_info.dirname)
            copy_files_with_pattern(iter_out_fsys,
                output_offset,
                input_offset, pattern,
                all_settings)

            pattern = "*_template"
            copy_files_with_pattern(iter_out_fsys,
                output_offset,
                input_offset, pattern,
                all_settings)

            # NB: Converge stage triggers based on criterion value from audit.
            logger.debug('starting audit')
            info = "Run %s preserved (error %s)\n" % (node_info.number, node_info.criterion)
            audit_url = get_url_with_credentials(
                all_settings, output_prefix +
                os.path.join(new_input_path, 'audit.txt'), is_relative_path=False)
            storage.put_file(audit_url, info)
            logger.debug("audit=%s" % info)
            logger.debug('1:audit_url=%s' % audit_url)
            self.audit += info

            # move xyz_final.xyz to initial.xyz
            source_url = get_url_with_credentials(
                all_settings, output_prefix + os.path.join(old_output_path, "xyz_final.xyz"), is_relative_path=False)
            logger.debug('source_url=%s' % source_url)
            dest_url = get_url_with_credentials(
                all_settings, output_prefix + os.path.join(new_input_path, 'input_initial.xyz'), is_relative_path=False)
            logger.debug('dest_url=%s' % dest_url)
            content = storage.get_file(source_url)
            logger.debug('content=%s' % content)
            storage.put_file(dest_url, content)
            self.audit += "spawning diamond runs\n"

        logger.debug("input_dir=%s" % (output_prefix + os.path.join(new_input_dir, 'audit.txt')))
        audit_url = get_url_with_credentials(
            all_settings, output_prefix + os.path.join(new_input_dir, 'audit.txt'), is_relative_path=False)
        logger.debug('audit_url=%s' % audit_url)
        storage.put_file(audit_url, self.audit)
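
# --- Illustration (not part of the examples) ----------------------------------
# A minimal sketch of the audit.txt round trip: the transform stage above writes
# one "Run N preserved (error E)" line per preserved node, and the converge
# stage (next example) recovers N and E with a regex. Values are hypothetical.
import re

audit_line = "Run %s preserved (error %s)\n" % (4, 0.0325)
m = re.search(r"Run (\d+) preserved \(error[ \t]*([0-9\.]+)\)", audit_line)
if m:
    best_numb = int(m.group(1))    # 4
    criterion = float(m.group(2))  # 0.0325
    print(best_numb, criterion)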
Exemple #30
0
    def curate_dataset(self, run_settings, experiment_id,
                       base_url, output_url, all_settings):
        '''
            Curate a MyTardis dataset, with graph metadata, for each process
            output directory under the current iteration's output location.
        '''
        # Retrieves process directories below the current output location
        iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        current_output_url = "%s%s" % (
            output_prefix, os.path.join(base_url, "output_%s" % iteration))
        (scheme, host, current_output_path, location, query_settings) = storage.parse_bdpurl(output_url)
        output_fsys = storage.get_filesystem(output_url)
        process_output_dirs, _ = output_fsys.listdir(current_output_path)

        # Curates a dataset with metadata per process
        for i, process_output_dir in enumerate(process_output_dirs):
            # Expand the process output directory and add credentials for access
            process_output_url = '/'.join([current_output_url, process_output_dir])
            process_output_url_with_cred = get_url_with_credentials(
                    all_settings,
                    process_output_url,
                    is_relative_path=False)
            # Expand the process output file and add credentials for access
            output_file_url_with_cred = storage.get_url_with_credentials(
                all_settings, '/'.join([process_output_url, OUTPUT_FILE]),
                is_relative_path=False)
            try:
                output_content = storage.get_file(output_file_url_with_cred)
                # unpacking raises ValueError if the file does not contain exactly two values
                val1, val2 = output_content.split()
            except (ValueError, IOError) as e:
                logger.warn(e)
                continue
            try:
                x = float(val1)
                y = float(val2)
            except ValueError as e:
                logger.warn(e)
                continue

            # Use the process directory's index as the MyTardis dataset name
            all_settings['graph_point_id'] = str(i)

            def _get_dataset_name(settings, url, path):
                return all_settings['graph_point_id']

            # Creates new dataset and adds to experiment
            # If experiment_id==0, creates new experiment
            experiment_id = mytardis.create_dataset(
                settings=all_settings, # MyTardis credentials
                source_url=process_output_url_with_cred,
                exp_id=experiment_id,
                dataset_name=_get_dataset_name, # the function that defines dataset name
                dataset_paramset=[
                    # a new blank parameter set conforming to schema 'remotemake/output'
                    mytardis.create_paramset("remotemake/output", []),
                    mytardis.create_graph_paramset("dsetgraph", # name of schema
                        name="randdset", # a unique dataset name
                        graph_info={},
                        value_dict={"randdset/x": x, "randdset/y": y},  # values to be used in experiment graphs
                        value_keys=[]
                        ),
                    ]
                )
        return experiment_id

    def process_outputs(self, run_settings, base_dir, input_url, all_settings):

        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(base_dir, "input_%s" % (id + 1))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

        (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(input_url)
        iter_out_fsys = storage.get_filesystem(input_url)

        input_dirs, _ = iter_out_fsys.listdir(iter_output_path)

        # TODO: store all audit info in single file in input_X directory in transform,
        # so we do not have to load individual files within node directories here.
        min_crit = sys.float_info.max - 1.0
        min_crit_index = sys.maxint

        logger.debug("input_dirs=%s" % input_dirs)
        for input_dir in input_dirs:
            node_path = os.path.join(iter_output_dir, input_dir)
            logger.debug('node_path= %s' % node_path)

            # Retrieve audit file

            # audit_url = get_url_with_credentials(output_storage_settings,
            #     output_prefix + os.path.join(self.iter_inputdir, input_dir, 'audit.txt'), is_relative_path=False)
            audit_url = get_url_with_credentials(all_settings, os.path.join(node_path, "audit.txt"), is_relative_path=False)
            audit_content = storage.get_file(audit_url)
            logger.debug('audit_url=%s' % audit_url)

            # extract the best criterion error
            # FIXME: audit.txt is potentially debug file so format may not be fixed.
            p = re.compile(r"Run (\d+) preserved \(error[ \t]*([0-9\.]+)\)", re.MULTILINE)
            m = p.search(audit_content)
            criterion = None
            if m:
                criterion = float(m.group(2))
                best_numb = int(m.group(1))
                # NB: assumes that subdirectories in the new input_X have the same names as the output dirs that created them.
                best_node = input_dir
            else:
                message = "Cannot extract criterion from audit file for iteration %s" % (self.id + 1)
                logger.warn(message)
                raise IOError(message)

            if criterion < min_crit:
                min_crit = criterion
                min_crit_index = best_numb
                min_crit_node = best_node

        logger.debug("min_crit = %s at %s" % (min_crit, min_crit_index))

        if min_crit_index >= sys.maxint:
            raise BadInputException("Unable to find minimum criterion of input files")

        # get previous best criterion
        try:
            self.prev_criterion = float(getval(run_settings, '%s/converge/criterion' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            self.prev_criterion = sys.float_info.max - 1.0
            logger.warn("no previous criterion found")

        # check whether we are under the error threshold
        logger.debug("best_num=%s" % best_numb)
        logger.debug("prev_criterion = %f" % self.prev_criterion)
        logger.debug("min_crit = %f" % min_crit)
        logger.debug('Current min criterion: %f, Prev '
                     'criterion: %f' % (min_crit, self.prev_criterion))
        difference = self.prev_criterion - min_crit
        logger.debug("Difference %f" % difference)

        try:
            max_iteration = int(getval(run_settings, '%s/input/hrmc/max_iteration' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            raise BadInputException("unknown max_iteration")
        logger.debug("max_iteration=%s" % max_iteration)

        try:
            self.error_threshold = float(getval(run_settings, '%s/input/hrmc/error_threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            raise BadInputException("unknown error_threshold")
        logger.debug("error_threshold=%s" % self.error_threshold)

        if self.id >= (max_iteration - 1):
            logger.debug("Max Iteration Reached %d " % self.id)
            return (True, min_crit)

        elif min_crit <= self.prev_criterion and difference <= self.error_threshold:
            logger.debug("Convergence reached %f" % difference)
            return (True, min_crit)

        else:
            if difference < 0:
                logger.debug("iteration diverged")
            logger.debug("iteration continues: %d iteration so far" % self.id)

        return (False, min_crit)
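
# --- Illustration (not part of the examples) ----------------------------------
# A distilled, standalone sketch of the convergence test used above: stop when
# the iteration cap is reached, or when the criterion has not worsened and the
# improvement over the previous iteration is within the error threshold. The
# function name and example values are illustrative only.
def has_converged(iteration_id, max_iteration, prev_criterion, min_crit, error_threshold):
    if iteration_id >= (max_iteration - 1):
        return True   # iteration cap reached
    difference = prev_criterion - min_crit
    if min_crit <= prev_criterion and difference <= error_threshold:
        return True   # improvement small enough to stop
    return False      # keep iterating (a negative difference means divergence)

print(has_converged(2, 10, prev_criterion=0.040, min_crit=0.038, error_threshold=0.01))  # True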
Exemple #32
0
    def _upload_input_dir_variations(self, processes, local_settings,
                                     computation_platform_settings,
                                     output_storage_settings,
                                     mytardis_settings, input_dir,
                                     run_settings):
        output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                      output_storage_settings['type'])
        input_url_with_credentials = get_url_with_credentials(
            output_storage_settings,
            output_prefix + os.path.join(self.iter_inputdir, input_dir),
            is_relative_path=False)
        logger.debug('input_url_with_credentials=%s' %
                     input_url_with_credentials)
        if local_settings['curate_data']:
            self.experiment_id = self.curate_data(self.experiment_id,
                                                  local_settings,
                                                  output_storage_settings,
                                                  mytardis_settings,
                                                  input_url_with_credentials)
        else:
            logger.warn('Data curation is off')

        # get run Map
        parent_stage = self.import_parent_stage(run_settings)
        run_map, self.rand_index = parent_stage.get_run_map(
            local_settings, run_settings=run_settings)

        # load value_map
        values_url_with_pkey = get_url_with_credentials(
            output_storage_settings,
            output_prefix +
            os.path.join(self.iter_inputdir, input_dir, self.VALUES_FNAME),
            is_relative_path=False)
        logger.debug("initial values_file=%s" % values_url_with_pkey)
        values = {}
        try:
            values_content = storage.get_file(values_url_with_pkey)
        except IOError:
            logger.warn("no values file found")
        else:
            logger.debug("values_content = %s" % values_content)
            values = dict(json.loads(values_content))
        logger.debug("values=%s" % values)

        # generates a set of variations for the template fname
        contexts = self._get_variation_contexts([run_map], values,
                                                self.initial_numbfile)
        self.initial_numbfile += len(contexts)

        # for each context, copy each file to dest and any
        # templates to be instantiated, then store in values.

        template_pat = re.compile("(.*)_template")
        relative_path_suffix = self.get_relative_output_path(local_settings)

        for context in contexts:
            logger.debug("context=%s" % context)
            # get list of all files in input_dir
            fname_url_with_pkey = get_url_with_credentials(
                output_storage_settings,
                output_prefix + os.path.join(self.iter_inputdir, input_dir),
                is_relative_path=False)
            input_files = storage.list_dirs(fname_url_with_pkey,
                                            list_files=True)

            # get process information
            run_counter = context['run_counter']
            logger.debug("run_counter=%s" % run_counter)
            proc = None
            for p in processes:
                # TODO: how to handle invalid run_counter
                pid = int(p['id'])
                logger.debug("pid=%s" % pid)
                if pid == run_counter:
                    proc = p
                    break
            else:
                logger.error("no process found matching run_counter")
                raise BadInputException()
            logger.debug("proc=%s" % pformat(proc))

            for fname in input_files:
                logger.debug("fname=%s" % fname)
                templ_mat = template_pat.match(fname)
                fname_url_with_credentials = storage.get_url_with_credentials(
                    output_storage_settings,
                    output_prefix +
                    os.path.join(self.iter_inputdir, input_dir, fname),
                    is_relative_path=False)
                logger.debug("fname_url_with_credentials=%s" %
                             fname_url_with_credentials)

                def put_dest_file(proc, fname, dest_file_location,
                                  resched_file_location, content):
                    dest_url = get_url_with_credentials(
                        computation_platform_settings,
                        os.path.join(dest_file_location, fname),
                        is_relative_path=True,
                        ip_address=proc['ip_address'])
                    logger.debug("writing to =%s" % dest_url)
                    #logger.debug("content=%s" % content)
                    storage.put_file(dest_url, content)
                    if self.reschedule_failed_procs:
                        logger.debug("resched=%s" % resched_file_location)
                        logger.debug("fname=%s" % fname)
                        logger.debug("output_storage_settings=%s" %
                                     output_storage_settings)

                        logger.debug("here")
                        test = "%s/%s" % (resched_file_location, fname)
                        logger.debug("test=%s" % test)
                        resched_url = get_url_with_credentials(
                            output_storage_settings, test)
                        logger.debug("writing backup to %s" % resched_url)
                        storage.put_file(resched_url, content)
                    logger.debug("done")

                outputs = []
                if templ_mat:
                    base_fname = templ_mat.group(1)
                    template_content = storage.get_file(
                        fname_url_with_credentials)
                    try:
                        templ = Template(template_content)
                    except TemplateSyntaxError, e:
                        logger.error(e)
                        #FIXME: should detect this during submission of job,
                        #as no sensible way to recover here.
                        #TODO: signal error conditions in job status
                        continue
                    new_context = Context(context)
                    logger.debug("new_content=%s" % new_context)
                    render_output = templ.render(new_context)
                    render_output = render_output.encode('utf-8')
                    outputs.append((base_fname, render_output))
                    outputs.append((fname, template_content))

                else:
                    content = storage.get_file(fname_url_with_credentials)
                    outputs.append((fname, content))

                for (new_fname, content) in outputs:
                    dest_file_location = computation_platform_settings['type']\
                        + "@" + os.path.join(relative_path_suffix,
                                             proc['id'],
                                             local_settings['payload_cloud_dirname'])
                    logger.debug("dest_file_location =%s" % dest_file_location)
                    resched_file_location = "%s%s" % (
                        output_prefix,
                        os.path.join(self.job_dir, "input_backup", proc['id']))

                    logger.debug("resched_file_location=%s" %
                                 resched_file_location)
                    put_dest_file(proc, new_fname, dest_file_location,
                                  resched_file_location, content)

            # then copy context new values file
            logger.debug("writing values file")
            values_dest_location = computation_platform_settings['type']\
                + "@" + os.path.join(relative_path_suffix,
                                     proc['id'],
                                     local_settings['payload_cloud_dirname'],
                                     self.VALUES_FNAME)
            logger.debug("values_dest_location =%s" % values_dest_location)

            values_dest_url = get_url_with_credentials(
                computation_platform_settings,
                values_dest_location,
                is_relative_path=True,
                ip_address=proc['ip_address'])

            storage.put_file(values_dest_url, json.dumps(context, indent=4))
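
# --- Illustration (not part of the examples) ----------------------------------
# A minimal, standalone sketch of the "*_template" instantiation pattern used
# above. The real code renders with Django's Template/Context; here the standard
# library's string.Template stands in for it so the sketch runs without a
# configured Django environment. File names and context values are hypothetical.
import re
from string import Template  # stand-in for django.template.Template

template_pat = re.compile("(.*)_template")
context = {'run_counter': '3'}

fname = "HRMC.inp_template"                      # hypothetical input file name
templ_mat = template_pat.match(fname)
outputs = []
if templ_mat:
    base_fname = templ_mat.group(1)              # -> "HRMC.inp"
    template_content = "run_counter = $run_counter"
    rendered = Template(template_content).substitute(context)
    outputs.append((base_fname, rendered))       # instantiated file
    outputs.append((fname, template_content))    # the original template is kept too
else:
    outputs.append((fname, "raw file content"))  # non-templates are copied unchanged
print(outputs)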
Exemple #33
0
    def get_output(self, ip_address, process_id, output_dir, local_settings,
                   computation_platform_settings, output_storage_settings,
                   run_settings):
        """
            Retrieve the output from the task on the node
        """

        logger.debug("get_output of process %s on %s" %
                     (process_id, ip_address))
        output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                      output_storage_settings['type'])
        #fixme: add call get_process_output_path
        #cloud_path = os.path.join(local_settings['payload_destination'],
        #                          #str(contextid), #fixme: uncomment
        #                          str(process_id),
        #                          local_settings['process_output_dirname']
        #                          )
        relative_path_suffix = self.get_relative_output_path(local_settings)
        cloud_path = os.path.join(relative_path_suffix, str(process_id),
                                  local_settings['process_output_dirname'])
        #cloud_path = self.get_process_output_path(run_settings, process_id)
        logger.debug("cloud_path=%s" % cloud_path)
        logger.debug("Transferring output from %s to %s" %
                     (cloud_path, output_dir))
        ip = ip_address  # botocloudconnector.get_instance_ip(instance_id, settings)
        #ssh = open_connection(ip_address=ip, settings=settings)
        source_files_location = "%s://%s@%s" % (
            computation_platform_settings['scheme'],
            computation_platform_settings['type'], os.path.join(
                ip, cloud_path))
        source_files_url = get_url_with_credentials(
            computation_platform_settings,
            source_files_location,
            is_relative_path=False)
        logger.debug('source_files_url=%s' % source_files_url)

        dest_files_url = get_url_with_credentials(
            output_storage_settings,
            output_prefix +
            os.path.join(self.job_dir, self.output_dir, process_id),
            is_relative_path=False)
        logger.debug('dest_files_url=%s' % dest_files_url)
        # FIXME: might want to turn on paramiko compress function
        # to speed up this transfer
        storage.copy_directories(source_files_url, dest_files_url)

        #copying values file
        values_file_path = os.path.join(
            relative_path_suffix, str(process_id),
            local_settings['smart_connector_input'],
            django_settings.VALUES_FNAME)
        values_files_location = "%s://%s@%s" % (
            computation_platform_settings['scheme'],
            computation_platform_settings['type'],
            os.path.join(ip, values_file_path))
        logger.debug("values_files_location=%s" % values_files_location)
        values_source_url = get_url_with_credentials(
            computation_platform_settings,
            values_files_location,
            is_relative_path=False)

        logger.debug("values_source_url=%s" % values_source_url)

        values_dest_url = get_url_with_credentials(
            output_storage_settings,
            output_prefix +
            os.path.join(self.job_dir, self.output_dir, process_id,
                         django_settings.VALUES_FNAME),
            is_relative_path=False)
        logger.debug("values_dest_url=%s" % values_dest_url)
        try:
            logger.debug('reading %s' % values_source_url)
            content = storage.get_file(values_source_url)
        except IOError, e:
            content = {}