def fetch_dataset(self, var, d_key):
    """Copy files to temporary directory.
    (GCP can't copy to home dir, so always copy to a temp dir)
    """
    tmpdir = core.TempDirManager().make_tempdir()
    self.log.debug("Created GCP fetch temp dir at %s.", tmpdir)
    (cp_command, smartsite) = self._get_fetch_method(self._fetch_method)

    paths = d_key.remote_data()
    if isinstance(paths, pd.Series):
        paths = paths.to_list()
    if not util.is_iterable(paths):
        paths = (paths, )
    local_paths = []
    for path in paths:
        # exceptions caught in parent loop in data_manager.DataSourceBase
        local_path = os.path.join(tmpdir, os.path.basename(path))
        self.log.info(f"\tFetching {path[len(self.attrs.CASE_ROOT_DIR):]}")
        util.run_command(
            cp_command + [
                smartsite + path,
                # gcp requires trailing slash, ln ignores it
                smartsite + tmpdir + os.sep
            ],
            timeout=self.timeout,
            dry_run=self.dry_run,
            log=self.log
        )
        local_paths.append(local_path)
    d_key.local_data = local_paths
def gcp_wrapper(source_path, dest_dir, timeout=None, dry_run=None):
    """Wrapper for file and recursive directory copying using the GFDL
    site-specific General Copy Program (`https://gitlab.gfdl.noaa.gov/gcp/gcp`__.)

    Assumes the GCP environment module has been loaded beforehand, and calls
    GCP in a subprocess.
    """
    modMgr = ModuleManager()
    modMgr.load('gcp')
    config = core.ConfigManager()
    if timeout is None:
        timeout = config.get('file_transfer_timeout', 0)
    if dry_run is None:
        dry_run = config.get('dry_run', False)

    source_path = os.path.normpath(source_path)
    dest_dir = os.path.normpath(dest_dir)
    # gcp requires trailing slash, ln ignores it
    if os.path.isdir(source_path):
        source = ['-r', 'gfdl:' + source_path + os.sep]
        # gcp /A/B/ /C/D/ will result in /C/D/B, so need to specify parent dir
        dest = ['gfdl:' + os.path.dirname(dest_dir) + os.sep]
    else:
        source = ['gfdl:' + source_path]
        dest = ['gfdl:' + dest_dir + os.sep]
    _log.info('\tGCP {} -> {}'.format(source[-1], dest[-1]))
    util.run_command(
        ['gcp', '--sync', '-v', '-cd'] + source + dest,
        timeout=timeout, dry_run=dry_run
    )
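# A minimal usage sketch of gcp_wrapper above (hypothetical paths; assumes a
# GFDL host where the 'gcp' environment module can be loaded). When timeout and
# dry_run are omitted, defaults are read from core.ConfigManager as shown in the
# function body.
def _example_gcp_fetch():
    # copy a single file into a staging directory
    gcp_wrapper('/archive/usr/model/run1/atmos.000101-000512.nc', '/work/tmp/fetch')
    # copy a directory tree recursively; because dest is taken as the parent of
    # dest_dir for directories, this lands the 'ts' directory under /work/tmp/fetch/
    gcp_wrapper('/archive/usr/model/run1/ts/', '/work/tmp/fetch/ts')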
def pre_fetch_hook(self, vars_to_fetch):
    """Issue dmget for all files we're about to fetch, if those files are
    on a tape filesystem.
    """
    if self.tape_filesystem:
        paths = set([])
        for var in vars_to_fetch:
            for d_key in var.iter_data_keys(status=core.ObjectStatus.ACTIVE):
                paths.update(d_key.remote_data())
        self.log.info(f"Start dmget of {len(paths)} files...")
        util.run_command(
            ['dmget', '-t', '-v'] + list(paths),
            timeout=len(paths) * self.timeout,
            dry_run=self.dry_run,
            log=self.log
        )
        self.log.info("Successful exit of dmget.")
def pre_fetch_hook(self, vars_to_fetch):
    """Issue dmget for all files we're about to fetch, if those files are
    on a tape filesystem.
    """
    if self.tape_filesystem:
        paths = set([])
        for var in vars_to_fetch:
            for data_key in self.iter_data_keys(var):
                paths.update(self.remote_data(data_key))
        _log.info(f"Start dmget of {len(paths)} files.")
        util.run_command(
            ['dmget', '-t', '-v'] + list(paths),
            timeout=len(paths) * self.timeout,
            dry_run=self.dry_run
        )
        _log.info("Successful exit of dmget.")
def gcp_wrapper(source_path, dest_dir, timeout=0, dry_run=False):
    """Wrapper for file and recursive directory copying via GFDL's General
    Copy Program (GCP); minimal variant that takes timeout/dry_run directly
    as arguments instead of reading defaults from the ConfigManager.
    """
    modMgr = ModuleManager()
    modMgr.load('gcp')
    source_path = os.path.normpath(source_path)
    dest_dir = os.path.normpath(dest_dir)
    # gcp requires trailing slash, ln ignores it
    if os.path.isdir(source_path):
        source = ['-r', 'gfdl:' + source_path + os.sep]
        # gcp /A/B/ /C/D/ will result in /C/D/B, so need to specify parent dir
        dest = ['gfdl:' + os.path.dirname(dest_dir) + os.sep]
    else:
        source = ['gfdl:' + source_path]
        dest = ['gfdl:' + dest_dir + os.sep]
    print('\tDEBUG: GCP {} -> {}'.format(source[-1], dest[-1]))
    util.run_command(
        ['gcp', '--sync', '-v', '-cd'] + source + dest,
        timeout=timeout, dry_run=dry_run
    )
def plan_data_fetch_hook(self):
    """Filter files on model component and chunk frequency.
    """
    d_to_u_dict = self._decide_allowed_components()
    for data_key in self.data_keys:
        u_key = d_to_u_dict[data_key]
        print("Selected {} for {} @ {}".format(
            u_key, data_key.name_in_model, data_key.date_freq))
        # check we didn't eliminate everything:
        assert self._component_map[u_key, data_key]
        self.data_files[data_key] = self._component_map[u_key, data_key]

    paths = set()
    for data_key in self.data_keys:
        for f in self.data_files[data_key]:
            paths.add(f._remote_data)
    if self.tape_filesystem:
        print("start dmget of {} files".format(len(paths)))
        util.run_command(
            ['dmget', '-t', '-v'] + list(paths),
            timeout=len(paths) * self.file_transfer_timeout,
            dry_run=self.dry_run
        )
        print("end dmget")
def ncdump_h(cls, in_file=None, cwd=None, dry_run=False):
    """Return header information for all variables in a file.
    """
    def _parse_xml_wrapper(xml_):
        # strips namespaces; https://stackoverflow.com/a/25920989
        it = ET.iterparse(io.StringIO(xml_))
        for _, el in it:
            _, _, el.tag = el.tag.rpartition('}')  # strip namespaces
            for at in el.attrib:  # strip namespaces of attributes too
                if '}' in at:
                    newat = at.split('}', 1)[1]
                    el.attrib[newat] = el.attrib[at]
                    del el.attrib[at]
        return it.root

    d = {'dimensions': dict(), 'variables': dict()}
    if dry_run:
        return d  # dummy answer
    # JSON output for -m is malformed in NCO <=4.5.4, verified OK for 4.7.6
    xml_out = util.run_command(['ncks', '--xml', '-m', in_file],
                               cwd=cwd, dry_run=dry_run)
    root = _parse_xml_wrapper('\n'.join(xml_out))
    for dim in root.iter('dimension'):
        d['dimensions'][dim.attrib['name']] = int(dim.attrib['length'])
    dv = d['variables']
    for var in root.iter('variable'):
        k = var.attrib['name']
        dv[k] = var.attrib.copy()
        del dv[k]['name']
        for att in var:
            if 'name' not in att.attrib or 'value' not in att.attrib:
                continue
            dv[k][att.attrib['name']] = att.attrib['value']
        if dv[k].get('shape', None):
            dv[k]['shape'] = dv[k]['shape'].split(' ')
    return d
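# A hedged usage sketch of ncdump_h above. The file name and the enclosing
# class name (here called NetcdfHelper) are assumptions for illustration;
# the only real requirements are that NCO's `ncks` is on PATH and that
# ncdump_h is exposed as a classmethod of the surrounding netCDF-helper class.
def _example_ncdump_h():
    header = NetcdfHelper.ncdump_h(in_file='atmos.000101-000512.nc', cwd='/work/tmp')
    # 'dimensions' maps dimension name -> length, e.g. {'time': 60, 'lat': 90}
    print(header['dimensions'])
    # 'variables' maps variable name -> dict of its attributes; 'shape' (if
    # present) is split into a list of dimension names
    print(header['variables'].get('tas', {}))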
def fetch_dataset(self, d_key, method='auto'):
    """Copy files to temporary directory and combine chunks.
    """
    # pylint: disable=maybe-no-member
    (cp_command, smartsite) = self._determine_fetch_method(method)
    dest_path = self.local_path(d_key)
    dest_dir = os.path.dirname(dest_path)
    # ncrcat will error instead of creating destination directories
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    # GCP can't copy to home dir, so always copy to temp
    tmpdirs = util_mdtf.TempDirManager()
    work_dir = tmpdirs.make_tempdir(hash_obj=d_key)
    remote_files = list(self.data_files[d_key])

    # copy remote files
    # TODO: Do something intelligent with logging, caught OSErrors
    for f in remote_files:
        print("\tcopying ...{} to {}".format(
            f._remote_data[len(self.root_dir):], work_dir))
        util.run_command(
            cp_command + [
                smartsite + f._remote_data,
                # gcp requires trailing slash, ln ignores it
                smartsite + work_dir + os.sep
            ],
            timeout=self.file_transfer_timeout,
            dry_run=self.dry_run
        )

    # ----------------------------------------
    # Processing of copied files: TODO: refactor individual steps into
    # separate functions

    # set axis names from header info
    # only look at first file; if other chunks for same var differ, NCO will
    # raise error when we try to concat them
    file_name = os.path.basename(remote_files[0]._remote_data)
    var_name = remote_files[0].name_in_model
    file_axes = self.nc_get_axes_attributes(
        var_name, in_file=file_name, cwd=work_dir, dry_run=self.dry_run
    )
    for fax, fax_attrs in iter(file_axes.items()):
        # update DataSets with axis info - need to loop since multiple PODs
        # may reference this file (warning will be repeated; TODO fix that)
        error_flag = 0
        for var in self.data_keys[d_key]:
            if fax in var.axes:
                # file's axis in list of case's axis names; check their
                # axis attributes match if they're both defined
                if 'axis' in fax_attrs and 'axis' in var.axes[fax] \
                        and fax_attrs['axis'].lower() != var.axes[fax]['axis'].lower() \
                        and error_flag != 1:
                    print(("\tWarning: unexpected axis attribute for {0} in "
                        "{1} (found {2}, {3} convention is {4})").format(
                            fax, file_name, fax_attrs['axis'],
                            self.convention, var.axes[fax]['axis']))
                    error_flag = 1
                var.axes[fax]['MDTF_set_from_axis'] = False
            else:
                # file has different axis name, try to match by attribute
                for vax, vax_attrs in iter(var.axes.items()):
                    if 'axis' not in fax_attrs or 'axis' not in vax_attrs:
                        continue
                    elif vax_attrs['axis'].lower() == fax_attrs['axis'].lower():
                        # matched axis attributes: log warning & reassign
                        if error_flag != 2:
                            print(("\tWarning: unexpected {0} axis name in {1} "
                                "(found {2}, {3} convention is {4})").format(
                                    fax_attrs['axis'], file_name, fax,
                                    self.convention, vax))
                            error_flag = 2
                        # only update so we don't overwrite the envvar name
                        var.axes[fax] = vax_attrs.copy()
                        var.axes[fax].update(fax_attrs)
                        var.axes[fax]['MDTF_set_from_axis'] = True
                        del var.axes[vax]
                        break
                else:
                    # get here if we didn't hit 'break' above -- give up
                    if error_flag != 3:
                        print(("\tWarning: unable to assign {0} axis "
                            "in {1}.").format(fax, file_name))
                        error_flag = 3

    # crop time axis to requested range
    # do this *before* combining chunks to reduce disk activity
    for vax, vax_attrs in iter(var.axes.items()):
        if 'axis' not in vax_attrs or vax_attrs['axis'].lower() != 't':
            continue
        else:
            time_var_name = vax
            break
    else:
        print("\tCan't determine time axis for {}.".format(file_name))
        time_var_name = 'time'  # will probably give KeyError
    trim_count = 0
    for f in remote_files:
        file_name = os.path.basename(f._remote_data)
        if f.date_range.is_static:
            # skip date trimming logic for time-independent files
            continue
        if not self.date_range.overlaps(f.date_range):
            print(("\tWarning: {} has dates {} outside of requested "
                "range {}.").format(file_name, f.date_range, self.date_range))
            continue
        if not self.date_range.contains(f.date_range):
            # file overlaps analysis range but is not strictly contained
            # in it means we need to trim either start or end or both
            trimmed_range = f.date_range.intersection(
                self.date_range,
                precision=f.date_range.precision
            )
            print("\ttrimming '{}' of {} from {} to {}".format(
                time_var_name, file_name, f.date_range, trimmed_range))
            trim_count = trim_count + 1
            self.nc_crop_time_axis(
                time_var_name, trimmed_range,
                in_file=file_name, cwd=work_dir, dry_run=self.dry_run
            )
    if trim_count > 2:
        print("trimmed {} files!".format(trim_count))
        raise AssertionError()

    # cat chunks to destination, if more than one
    if len(remote_files) > 1:
        # not running in shell, so can't use glob expansion.
        print("\tcatting {} chunks to {}".format(
            d_key.name_in_model, dest_path))
        chunks = [os.path.basename(f._remote_data) for f in remote_files]
        self.nc_cat_chunks(chunks, dest_path,
            cwd=work_dir, dry_run=self.dry_run)
    else:
        f = util.coerce_from_iter(remote_files)
        file_name = os.path.basename(f._remote_data)
        print("\tsymlinking {} to {}".format(d_key.name_in_model, dest_path))
        util.run_command(
            ['ln', '-fs', os.path.join(work_dir, file_name), dest_path],
            dry_run=self.dry_run
        )
def test_run_command_exitcode(self):
    input = ['exit', '1']
    with self.assertRaises(Exception):
        # I couldn't get this to catch CalledProcessError specifically,
        # maybe because it takes args?
        util.run_command(input)
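# A hedged sketch, not part of the original suite: unittest's assertRaises
# matches on the exception class regardless of its constructor arguments, so
# if util.run_command does raise subprocess.CalledProcessError (an assumption;
# it may instead wrap the failure in a custom exception type), a more specific
# version of the test above could look like this:
def test_run_command_exitcode_specific(self):
    import subprocess
    with self.assertRaises(subprocess.CalledProcessError):
        util.run_command(['exit', '1'])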
def test_run_command_stdout1(self):
    out = util.run_command(['echo', '"foo"'])
    self.assertEqual(len(out), 1)
    self.assertEqual(out[0], '"foo"')