def get_total_templates(self, maps, **kwargs):
    run_settings = kwargs['run_settings']
    output_storage_settings = kwargs['output_storage_settings']
    job_dir = kwargs['job_dir']
    try:
        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
    except (SettingNotFoundException, ValueError) as e:
        logger.debug(e)
        id = 0
    iter_inputdir = os.path.join(job_dir, "input_%s" % id)
    url_with_pkey = get_url_with_credentials(
        output_storage_settings,
        '%s://%s@%s' % (output_storage_settings['scheme'],
                        output_storage_settings['type'],
                        iter_inputdir),
        is_relative_path=False)
    logger.debug(url_with_pkey)
    input_dirs = list_dirs(url_with_pkey)
    for iter, template_map in enumerate(maps):
        logger.debug("template_map=%s" % template_map)
        map_keys = template_map.keys()
        logger.debug("map_keys %s" % map_keys)
        map_ranges = [list(template_map[x]) for x in map_keys]
        product = 1
        for i in map_ranges:
            product = product * len(i)
        total_templates = product * len(input_dirs)
        logger.debug("total_templates=%d" % total_templates)
    return total_templates
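
# A minimal usage sketch with hypothetical values: each entry in ``maps``
# pairs template variable names with iterables of candidate values, so the
# count computed above is the size of their cartesian product multiplied by
# the number of input directories found.
#
#   maps = [{'temperature': [300, 400], 'pressure': [1, 2, 3]}]
#   # with 2 input directories: 2 * 3 * 2 = 12 total templates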
def prepare_inputs(self, local_settings, output_storage_settings,
                   computation_platform_settings, mytardis_settings,
                   run_settings):
    """
    Upload all input directories for this iteration
    """
    logger.debug("preparing inputs")
    # TODO: to ensure reproducibility, may want to precalculate all random
    # numbers and store them, rather than rely on canonical execution of
    # the rest of this function.
    processes = [x for x in self.schedule_procs
                 if x['status'] == 'ready']
    self.node_ind = 0
    logger.debug("Iteration Input dir %s" % self.iter_inputdir)
    output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                  output_storage_settings['type'])
    url_with_pkey = get_url_with_credentials(
        output_storage_settings,
        output_prefix + self.iter_inputdir,
        is_relative_path=False)
    input_dirs = list_dirs(url_with_pkey)
    if not input_dirs:
        raise BadInputException(
            "require an initial subdirectory of input directory")
    for input_dir in sorted(input_dirs):
        self._upload_input_dir_variations(
            processes, local_settings, computation_platform_settings,
            output_storage_settings, mytardis_settings, input_dir,
            run_settings)
def prepare_inputs(self, local_settings, output_storage_settings,
                   computation_platform_settings, mytardis_settings,
                   run_settings):
    """
    Upload all input files for this run
    """
    logger.debug("preparing inputs")
    # TODO: to ensure reproducibility, may want to precalculate all random
    # numbers and store them, rather than rely on canonical execution of
    # the rest of this function.
    processes = [x for x in self.schedule_procs
                 if x['status'] == 'ready']
    self.node_ind = 0
    logger.debug("Iteration Input dir %s" % self.iter_inputdir)
    output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                  output_storage_settings['type'])
    url_with_pkey = get_url_with_credentials(
        output_storage_settings,
        output_prefix + self.iter_inputdir,
        is_relative_path=False)
    logger.debug("url_with_pkey=%s" % url_with_pkey)
    input_dirs = list_dirs(url_with_pkey)
    if not input_dirs:
        raise BadInputException(
            "require an initial subdirectory of input directory")
    for input_dir in sorted(input_dirs):
        logger.debug("Input dir %s" % input_dir)
        self.upload_variation_inputs(
            run_settings, local_settings,
            self.generate_variations(
                input_dir, local_settings, output_storage_settings,
                run_settings),
            processes, input_dir, output_storage_settings,
            computation_platform_settings, mytardis_settings)
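
# Both prepare_inputs variants assume the same initial layout (paths here
# are illustrative, not taken from the source): iter_inputdir must contain
# at least one subdirectory, each holding the plain input files and any
# *_template files for one input set, e.g.
#
#   input_0/
#       initial/
#           sim.conf
#           sim.conf_template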
def _instantiate_context(source_url, settings, context):
    templ_pat = re.compile("(.*)_template")
    encoded_s_url = storage.get_url_with_credentials(
        settings, source_url, is_relative_path=False)
    logger.debug("encoded_s_url=%s" % encoded_s_url)
    fnames = storage.list_dirs(encoded_s_url, list_files=True)
    logger.debug("fnames=%s" % fnames)
    new_content = {}
    for fname in fnames:
        logger.debug("fname=%s" % fname)
        templ_mat = templ_pat.match(fname)
        if not templ_mat:
            continue
        base_fname = templ_mat.group(1)
        basename_url_with_pkey = storage.get_url_with_credentials(
            settings,
            os.path.join(source_url, fname),
            is_relative_path=False)
        logger.debug("basename_url_with_pkey=%s" % basename_url_with_pkey)
        cont = storage.get_file(basename_url_with_pkey)
        try:
            t = Template(cont)
        except TemplateSyntaxError as e:
            logger.error(e)
            # FIXME: should detect this during submission of the job,
            # as there is no sensible way to recover here.
            # TODO: signal error conditions in the job status
            continue
        con = Context(context)
        logger.debug("context=%s" % context)
        new_content[base_fname] = t.render(con)
    return new_content
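
# A minimal sketch of the rendering step above, using the same Django
# Template/Context classes; the template text and values are illustrative:
#
#   t = Template("temperature = {{ temperature }}")
#   print(t.render(Context({'temperature': 300})))
#   # -> "temperature = 300"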
def _upload_input_dir_variations(self, processes, local_settings,
                                 computation_platform_settings,
                                 output_storage_settings,
                                 mytardis_settings,
                                 input_dir, run_settings):
    output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                  output_storage_settings['type'])
    input_url_with_credentials = get_url_with_credentials(
        output_storage_settings,
        output_prefix + os.path.join(self.iter_inputdir, input_dir),
        is_relative_path=False)
    logger.debug('input_url_with_credentials=%s'
                 % input_url_with_credentials)
    if local_settings['curate_data']:
        try:
            mytardis_platform = jobs.safe_import(
                'chiminey.platform.mytardis.MyTardisPlatform', [], {})
            self.experiment_id = mytardis_platform.create_dataset_for_input(
                self.experiment_id, run_settings, local_settings,
                output_storage_settings, mytardis_settings,
                input_url_with_credentials)
        except ImproperlyConfigured as e:
            logger.error("Cannot load mytardis platform hook %s" % e)
    else:
        logger.warn('Data curation is off')

    # get run map
    parent_stage = self.import_parent_stage(run_settings)
    run_map, self.rand_index = parent_stage.get_internal_sweep_map(
        local_settings, run_settings=run_settings)

    # load value_map
    values_url_with_pkey = get_url_with_credentials(
        output_storage_settings,
        output_prefix + os.path.join(self.iter_inputdir, input_dir,
                                     self.VALUES_FNAME),
        is_relative_path=False)
    logger.debug("initial values_file=%s" % values_url_with_pkey)
    values = {}
    try:
        values_content = storage.get_file(values_url_with_pkey)
    except IOError:
        logger.warn("no values file found")
    else:
        logger.debug("values_content = %s" % values_content)
        values = dict(json.loads(values_content))
    logger.debug("values=%s" % values)

    # generate the set of variation contexts for this input directory
    logger.debug('self.initial_numbfile = %s' % self.initial_numbfile)
    contexts = self._get_variation_contexts(
        [run_map], values, self.initial_numbfile)
    self.initial_numbfile += len(contexts)
    logger.debug('contexts = %s' % contexts)
    logger.debug('self.initial_numbfile = %s' % self.initial_numbfile)

    # for each context, copy each file to dest and instantiate any
    # templates, then store the context in the values file.
    template_pat = re.compile("(.*)_template")
    relative_path_suffix = self.get_relative_output_path(local_settings)
    for context in contexts:
        logger.debug("context=%s" % context)
        # get list of all files in input_dir
        fname_url_with_pkey = get_url_with_credentials(
            output_storage_settings,
            output_prefix + os.path.join(self.iter_inputdir, input_dir),
            is_relative_path=False)
        input_files = storage.list_dirs(fname_url_with_pkey,
                                        list_files=True)

        # get process information
        run_counter = context['run_counter']
        logger.debug("run_counter=%s" % run_counter)
        proc = None
        for p in processes:
            # TODO: how to handle invalid run_counter
            pid = int(p['id'])
            logger.debug("pid=%s" % pid)
            if pid == run_counter:
                proc = p
                break
        else:
            logger.error("no process found matching run_counter")
            raise BadInputException()
        logger.debug("proc=%s" % pformat(proc))

        for fname in input_files:
            logger.debug("fname=%s" % fname)
            templ_mat = template_pat.match(fname)
            fname_url_with_credentials = storage.get_url_with_credentials(
                output_storage_settings,
                output_prefix + os.path.join(self.iter_inputdir,
                                             input_dir, fname),
                is_relative_path=False)
            logger.debug("fname_url_with_credentials=%s"
                         % fname_url_with_credentials)

            def put_dest_file(proc, fname, dest_file_location,
                              resched_file_location, content):
                dest_url = get_url_with_credentials(
                    computation_platform_settings,
                    os.path.join(dest_file_location, fname),
                    is_relative_path=True,
                    ip_address=proc['ip_address'])
                logger.debug("writing to %s" % dest_url)
                storage.put_file(dest_url, content)
                if self.reschedule_failed_procs:
                    backup_path = "%s/%s" % (resched_file_location, fname)
                    resched_url = get_url_with_credentials(
                        output_storage_settings, backup_path)
                    logger.debug("writing backup to %s" % resched_url)
                    storage.put_file(resched_url, content)

            outputs = []
            if templ_mat:
                base_fname = templ_mat.group(1)
                template_content = storage.get_file(
                    fname_url_with_credentials)
                try:
                    templ = Template(template_content)
                except TemplateSyntaxError as e:
                    logger.error(e)
                    # FIXME: should detect this during submission of the
                    # job, as there is no sensible way to recover here.
                    # TODO: signal error conditions in the job status
                    continue
                new_context = Context(context)
                logger.debug("new_context=%s" % new_context)
                render_output = templ.render(new_context)
                render_output = render_output.encode('utf-8')
                outputs.append((base_fname, render_output))
                outputs.append((fname, template_content))
            else:
                content = storage.get_file(fname_url_with_credentials)
                outputs.append((fname, content))

            for (new_fname, content) in outputs:
                dest_file_location = computation_platform_settings['type'] \
                    + "@" + os.path.join(
                        relative_path_suffix, proc['id'],
                        local_settings['smart_connector_input'])
                logger.debug("dest_file_location=%s" % dest_file_location)
                resched_file_location = "%s%s" % (
                    output_prefix,
                    os.path.join(self.job_dir, "input_backup", proc['id']))
                logger.debug("resched_file_location=%s"
                             % resched_file_location)
                put_dest_file(proc, new_fname, dest_file_location,
                              resched_file_location, content)

        # then copy the context as the new values file
        logger.debug("writing values file")
        values_dest_location = computation_platform_settings['type'] \
            + "@" + os.path.join(
                relative_path_suffix, proc['id'],
                local_settings['smart_connector_input'],
                self.VALUES_FNAME)
        logger.debug("values_dest_location=%s" % values_dest_location)
        values_dest_url = get_url_with_credentials(
            computation_platform_settings, values_dest_location,
            is_relative_path=True, ip_address=proc['ip_address'])
        storage.put_file(values_dest_url, json.dumps(context, indent=4))
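
# The values file written above is just the JSON-serialised context, so a
# hypothetical run with run_counter 3 might produce something like:
#
#   {
#       "temperature": 300,
#       "pressure": 2,
#       "run_counter": 3
#   }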
def generate_variations(self, input_dir, local_settings,
                        output_storage_settings, run_settings):
    """
    For each templated file in input_dir, generate all variations
    """
    output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                  output_storage_settings['type'])
    template_pat = re.compile("(.*)_template")
    fname_url_with_pkey = get_url_with_credentials(
        output_storage_settings,
        output_prefix + os.path.join(self.iter_inputdir, input_dir),
        is_relative_path=False)
    input_files = storage.list_dirs(fname_url_with_pkey,
                                    list_files=True)
    variations = {}
    # TODO: only tested with a single template file per input
    parent_stage = self.import_parent_stage(run_settings)
    for fname in input_files:
        logger.debug("trying %s/%s/%s" % (self.iter_inputdir,
                                          input_dir, fname))
        template_mat = template_pat.match(fname)
        if template_mat:
            # get the template
            basename_url_with_pkey = get_url_with_credentials(
                output_storage_settings,
                output_prefix + os.path.join(self.iter_inputdir,
                                             input_dir, fname),
                is_relative_path=False)
            template = storage.get_file(basename_url_with_pkey)
            base_fname = template_mat.group(1)
            logger.debug("base_fname=%s" % base_fname)

            # find the associated values file and generator_counter
            values_map = {}
            try:
                values_url_with_pkey = get_url_with_credentials(
                    output_storage_settings,
                    output_prefix + os.path.join(self.iter_inputdir,
                                                 input_dir,
                                                 '%s_values' % base_fname),
                    is_relative_path=False)
                logger.debug("values_file=%s" % values_url_with_pkey)
                values_content = storage.get_file(values_url_with_pkey)
            except IOError:
                logger.warn("no values file found")
            else:
                logger.debug("values_content = %s" % values_content)
                values_map = dict(json.loads(values_content))

            # TODO: rather than loading up specific vars with info to
            # send to the next set of variations, pass the whole
            # values_map and then override it with the map. This means
            # we need no special variables here, could easily propagate
            # values between iterations, and might also pass a list
            # of values.
            map, self.rand_index = parent_stage.get_run_map(
                local_settings, run_settings=run_settings)

            if not template_mat.groups():
                logger.info("found odd template matching file %s" % fname)
            else:
                logger.debug("self.initial_numbfile=%s"
                             % self.initial_numbfile)
                # generate a set of variations for the template fname
                variation_set = self._expand_variations(
                    template, [map], values_map, self.initial_numbfile)
                self.initial_numbfile += len(variation_set)
                logger.debug('variation_set=%d' % len(variation_set))
                logger.debug("self.initial_numbfile=%s"
                             % self.initial_numbfile)
                variations[base_fname] = variation_set
            logger.debug("map=%s" % map)
        else:
            # normal file, nothing to instantiate
            pass
    logger.debug('Variations %s' % variations)
    logger.debug("Variations items %d" % len(variations.items()))
    return variations
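
# Illustrative pairing assumed by generate_variations (file names and
# contents are hypothetical): a template file and its optional values file
# share a base name, and the rendered variations are keyed under that base.
#
#   sim.conf_template  contains  "steps = {{ steps }}"
#   sim.conf_values    contains  {"steps": 100}
#   variations['sim.conf'] = <variation set expanded from the run map>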
def _upload_input_dir_variations(self, processes, local_settings,
                                 computation_platform_settings,
                                 output_storage_settings,
                                 mytardis_settings,
                                 input_dir, run_settings):
    output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                  output_storage_settings['type'])
    input_url_with_credentials = get_url_with_credentials(
        output_storage_settings,
        output_prefix + os.path.join(self.iter_inputdir, input_dir),
        is_relative_path=False)
    logger.debug('input_url_with_credentials=%s'
                 % input_url_with_credentials)
    if local_settings['curate_data']:
        self.experiment_id = self.curate_data(
            self.experiment_id, local_settings, output_storage_settings,
            mytardis_settings, input_url_with_credentials)
    else:
        logger.warn('Data curation is off')

    # get run map
    parent_stage = self.import_parent_stage(run_settings)
    run_map, self.rand_index = parent_stage.get_run_map(
        local_settings, run_settings=run_settings)

    # load value_map
    values_url_with_pkey = get_url_with_credentials(
        output_storage_settings,
        output_prefix + os.path.join(self.iter_inputdir, input_dir,
                                     self.VALUES_FNAME),
        is_relative_path=False)
    logger.debug("initial values_file=%s" % values_url_with_pkey)
    values = {}
    try:
        values_content = storage.get_file(values_url_with_pkey)
    except IOError:
        logger.warn("no values file found")
    else:
        logger.debug("values_content = %s" % values_content)
        values = dict(json.loads(values_content))
    logger.debug("values=%s" % values)

    # generate the set of variation contexts for this input directory
    contexts = self._get_variation_contexts(
        [run_map], values, self.initial_numbfile)
    self.initial_numbfile += len(contexts)

    # for each context, copy each file to dest and instantiate any
    # templates, then store the context in the values file.
    template_pat = re.compile("(.*)_template")
    relative_path_suffix = self.get_relative_output_path(local_settings)
    for context in contexts:
        logger.debug("context=%s" % context)
        # get list of all files in input_dir
        fname_url_with_pkey = get_url_with_credentials(
            output_storage_settings,
            output_prefix + os.path.join(self.iter_inputdir, input_dir),
            is_relative_path=False)
        input_files = storage.list_dirs(fname_url_with_pkey,
                                        list_files=True)

        # get process information
        run_counter = context['run_counter']
        logger.debug("run_counter=%s" % run_counter)
        proc = None
        for p in processes:
            # TODO: how to handle invalid run_counter
            pid = int(p['id'])
            logger.debug("pid=%s" % pid)
            if pid == run_counter:
                proc = p
                break
        else:
            logger.error("no process found matching run_counter")
            raise BadInputException()
        logger.debug("proc=%s" % pformat(proc))

        for fname in input_files:
            logger.debug("fname=%s" % fname)
            templ_mat = template_pat.match(fname)
            fname_url_with_credentials = storage.get_url_with_credentials(
                output_storage_settings,
                output_prefix + os.path.join(self.iter_inputdir,
                                             input_dir, fname),
                is_relative_path=False)
            logger.debug("fname_url_with_credentials=%s"
                         % fname_url_with_credentials)

            def put_dest_file(proc, fname, dest_file_location,
                              resched_file_location, content):
                dest_url = get_url_with_credentials(
                    computation_platform_settings,
                    os.path.join(dest_file_location, fname),
                    is_relative_path=True,
                    ip_address=proc['ip_address'])
                logger.debug("writing to %s" % dest_url)
                storage.put_file(dest_url, content)
                if self.reschedule_failed_procs:
                    backup_path = "%s/%s" % (resched_file_location, fname)
                    resched_url = get_url_with_credentials(
                        output_storage_settings, backup_path)
                    logger.debug("writing backup to %s" % resched_url)
                    storage.put_file(resched_url, content)

            outputs = []
            if templ_mat:
                base_fname = templ_mat.group(1)
                template_content = storage.get_file(
                    fname_url_with_credentials)
                try:
                    templ = Template(template_content)
                except TemplateSyntaxError as e:
                    logger.error(e)
                    # FIXME: should detect this during submission of the
                    # job, as there is no sensible way to recover here.
                    # TODO: signal error conditions in the job status
                    continue
                new_context = Context(context)
                logger.debug("new_context=%s" % new_context)
                render_output = templ.render(new_context)
                render_output = render_output.encode('utf-8')
                outputs.append((base_fname, render_output))
                outputs.append((fname, template_content))
            else:
                content = storage.get_file(fname_url_with_credentials)
                outputs.append((fname, content))

            for (new_fname, content) in outputs:
                dest_file_location = computation_platform_settings['type'] \
                    + "@" + os.path.join(
                        relative_path_suffix, proc['id'],
                        local_settings['payload_cloud_dirname'])
                logger.debug("dest_file_location=%s" % dest_file_location)
                resched_file_location = "%s%s" % (
                    output_prefix,
                    os.path.join(self.job_dir, "input_backup", proc['id']))
                logger.debug("resched_file_location=%s"
                             % resched_file_location)
                put_dest_file(proc, new_fname, dest_file_location,
                              resched_file_location, content)

        # then copy the context as the new values file
        logger.debug("writing values file")
        values_dest_location = computation_platform_settings['type'] \
            + "@" + os.path.join(
                relative_path_suffix, proc['id'],
                local_settings['payload_cloud_dirname'],
                self.VALUES_FNAME)
        logger.debug("values_dest_location=%s" % values_dest_location)
        values_dest_url = get_url_with_credentials(
            computation_platform_settings, values_dest_location,
            is_relative_path=True, ip_address=proc['ip_address'])
        storage.put_file(values_dest_url, json.dumps(context, indent=4))
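
# _get_variation_contexts is called above but not shown here. A minimal
# standalone sketch of what it is assumed to do (not the actual Chiminey
# implementation): enumerate the cartesian product of each run map, merge
# every combination into the base values, and tag each resulting context
# with a unique run_counter starting at initial_numbfile.
def _get_variation_contexts_sketch(run_maps, values, initial_numbfile):
    import itertools
    contexts = []
    run_counter = initial_numbfile
    for run_map in run_maps:
        keys = list(run_map.keys())
        for combination in itertools.product(
                *[run_map[k] for k in keys]):
            # start from the shared values, then apply this combination
            context = dict(values)
            context.update(dict(zip(keys, combination)))
            context['run_counter'] = run_counter
            contexts.append(context)
            run_counter += 1
    return contexts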