def process(ffp, ctx):
    """
    process(ffp, ctx)

    File process that:

     * Builds dataset ID,
     * Retrieves file size,
     * Does checksums,
     * Deduces mapfile name,
     * Creates the output directory if it does not already exist,
     * Writes the corresponding line into it.

    :param str ffp: The file full path
    :param esgprep.mapfile.main.ProcessingContext ctx: The processing context
    :return: The output file full path
    :rtype: *str*

    """
    # Instantiate file handler
    fh = File(ffp)
    # Matching between directory_format and file full path
    fh.load_attributes(ctx)
    # Silently stop process if not in desired version scope
    if not in_version_scope(fh, ctx):
        return False
    # Deduce dataset_id
    dataset_id = fh.get_dataset_id(ctx)
    # Deduce dataset_version
    dataset_version = fh.get_dataset_version(ctx)
    # Build mapfile name depending on the --mapfile flag and appropriate tokens
    outfile = get_output_mapfile(fh.attributes, dataset_id, dataset_version, ctx)
    # Generate the corresponding mapfile entry/line
    line = generate_mapfile_entry(dataset_id,
                                  dataset_version,
                                  ffp,
                                  fh.size,
                                  fh.mtime,
                                  fh.checksum(ctx.checksum_type, ctx.checksum_client),
                                  ctx)
    insert_mapfile_entry(outfile, line, ffp)
    # Return mapfile name
    return outfile

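# A minimal driver sketch, not part of the original module: it shows one way the
# context-passing variant of process() above could be mapped over a list of file paths.
# The helper name run_mapfile_build, the `paths` argument and the pool size are
# hypothetical; only process(ffp, ctx) is assumed from the code above.
from functools import partial
from multiprocessing import Pool


def run_mapfile_build(paths, ctx, nprocs=4):
    """Map process() over file paths with a shared processing context (sketch)."""
    worker = partial(process, ctx=ctx)
    with Pool(processes=nprocs) as pool:
        # Keep only the mapfile paths actually produced (False means out of version scope).
        return sorted({outfile for outfile in pool.map(worker, paths) if outfile})
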
def process(ffp):
    """
    Process time axis checkup and rewriting if needed.

    :param str ffp: The file full path to process
    :returns: The file status (1 on error, 0 on success)
    :rtype: *int*

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(ffp=ffp,
                  pattern=pctx.pattern,
                  ref_units=pctx.ref_units,
                  ref_calendar=pctx.ref_calendar,
                  input_start_timestamp=pctx.ref_start,
                  input_end_timestamp=pctx.ref_end)
        # Check time axis correctness
        wrong_timesteps = list()
        # Rebuild a theoretical time axis with appropriate precision
        fh.time_axis_rebuilt = trunc(fh.build_time_axis(), NDECIMALS)
        if not np.array_equal(fh.time_axis_rebuilt, fh.time_axis):
            fh.status.append(ERROR_TIME_AXIS_KO)
            time_axis_diff = (fh.time_axis_rebuilt == fh.time_axis)
            wrong_timesteps = list(np.where(time_axis_diff == False)[0])
        # Check time boundaries correctness
        wrong_bounds = list()
        if fh.has_bounds:
            fh.time_bounds_rebuilt = trunc(fh.build_time_bounds(), NDECIMALS)
            if not np.array_equal(fh.time_bounds_rebuilt, fh.time_bounds):
                fh.status.append(ERROR_TIME_BOUNDS_KO)
                time_bounds_diff = (fh.time_bounds_rebuilt == fh.time_bounds)
                wrong_bounds = list(np.where(np.all(time_bounds_diff, axis=1) == False)[0])
        # Get last theoretical date
        fh.last_num = fh.time_axis_rebuilt[-1]
        fh.last_date = fh.date_axis_rebuilt[-1]
        fh.last_timestamp = truncated_timestamp(str2dates(fh.last_date), fh.timestamp_length)
        # Check consistency between start date infile and start date from filename
        if fh.start_date_infile != fh.start_date_filename:
            if fh.start_date_infile < fh.start_date_filename:
                fh.status.append(ERROR_START_DATE_IN_VS_NAME)
            else:
                fh.status.append(ERROR_START_DATE_NAME_VS_IN)
        # Check consistency between end date infile and end date from filename
        if not pctx.on_fly and fh.end_date_infile != fh.end_date_filename:
            if fh.end_date_infile < fh.end_date_filename:
                fh.status.append(ERROR_END_DATE_IN_VS_NAME)
            else:
                fh.status.append(ERROR_END_DATE_NAME_VS_IN)
        # Check consistency between rebuilt end date and end date from filename
        if not pctx.on_fly and fh.last_date != fh.end_date_filename:
            if fh.last_date < fh.end_date_filename:
                fh.status.append(ERROR_END_DATE_REF_VS_NAME)
            else:
                fh.status.append(ERROR_END_DATE_NAME_VS_REF)
        # Check consistency between rebuilt end date and end date infile
        if not pctx.on_fly and fh.last_date != fh.end_date_infile:
            if fh.last_date < fh.end_date_infile:
                fh.status.append(ERROR_END_DATE_REF_VS_IN)
            else:
                fh.status.append(ERROR_END_DATE_IN_VS_REF)
        # Check file consistency between instant time and time boundaries
        if fh.is_instant and fh.has_bounds:
            fh.status.append(ERROR_TIME_BOUNDS_INS)
        # Check file consistency between averaged time and time boundaries
        if not fh.is_instant and not fh.has_bounds:
            fh.status.append(ERROR_TIME_BOUNDS_AVE)
        # Check time units consistency between file and ref
        if pctx.ref_units != fh.tunits:
            fh.status.append(ERROR_TIME_UNITS)
        # Check calendar consistency between file and ref
        if pctx.ref_calendar != fh.calendar:
            fh.status.append(ERROR_TIME_CALENDAR)
        # Exclude codes to ignore from status codes
        # Before correction to avoid undesired operations
        fh.status = [code for code in fh.status if code not in pctx.ignore_codes]
        correction = False
        # Rename file depending on checking
        if (pctx.write and
            (({ERROR_END_DATE_NAME_VS_IN, ERROR_END_DATE_IN_VS_NAME}.intersection(set(fh.status)) and
              {ERROR_END_DATE_NAME_VS_REF, ERROR_END_DATE_REF_VS_NAME}.intersection(set(fh.status))) or
             ({ERROR_END_DATE_NAME_VS_REF, ERROR_END_DATE_REF_VS_NAME}.intersection(set(fh.status)) and
              {ERROR_END_DATE_IN_VS_REF, ERROR_END_DATE_REF_VS_IN}.intersection(set(fh.status))))) or pctx.force:
            # Change filename and file full path dynamically
            fh.nc_file_rename(new_filename=re.sub(fh.end_timestamp_filename, fh.last_timestamp, fh.filename))
            correction = True
        # Remove time boundaries depending on checking
        if pctx.write and ERROR_TIME_BOUNDS_INS in fh.status:
            # Delete time bounds and bounds attribute from file if write or force mode
            fh.nc_var_delete(variable=fh.tbnds)
            fh.nc_att_delete(attribute='bounds', variable='time')
            correction = True
        # Rewrite time units depending on checking
        if pctx.write and ERROR_TIME_UNITS in fh.status:
            fh.nc_att_overwrite('units', variable='time', data=pctx.ref_units)
        # Rewrite time calendar depending on checking
        if pctx.write and ERROR_TIME_CALENDAR in fh.status:
            fh.nc_att_overwrite('calendar', variable='time', data=pctx.ref_calendar)
        # Rewrite time axis depending on checking
        if (pctx.write and {ERROR_TIME_AXIS_KO, ERROR_TIME_BOUNDS_KO}.intersection(set(fh.status))) or pctx.force:
            fh.nc_var_overwrite('time', fh.time_axis_rebuilt)
            # Rewrite time boundaries if needed
            if fh.has_bounds:
                fh.nc_var_overwrite(fh.tbnds, fh.time_bounds_rebuilt)
            correction = True
        # Diagnostic display
        msgval = {}
        msgval['file'] = COLORS.HEADER(fh.filename)
        if ERROR_TIME_UNITS in fh.status:
            msgval['infile_units'] = COLORS.FAIL(fh.tunits)
            msgval['ref_units'] = COLORS.SUCCESS(pctx.ref_units)
        else:
            msgval['infile_units'] = COLORS.SUCCESS(fh.tunits)
            msgval['ref_units'] = COLOR('cyan').bold(pctx.ref_units)
        if ERROR_TIME_CALENDAR in fh.status:
            msgval['infile_calendar'] = COLORS.FAIL(fh.calendar)
            msgval['ref_calendar'] = COLORS.SUCCESS(pctx.ref_calendar)
        else:
            msgval['infile_calendar'] = COLORS.SUCCESS(fh.calendar)
            msgval['ref_calendar'] = COLOR('cyan').bold(pctx.ref_calendar)
        if {ERROR_START_DATE_IN_VS_NAME, ERROR_START_DATE_NAME_VS_IN}.intersection(set(fh.status)):
            msgval['infile_start_timestamp'] = COLORS.FAIL(fh.start_timestamp_infile)
            msgval['infile_start_date'] = COLORS.FAIL(fh.start_date_infile)
            msgval['infile_start_num'] = COLORS.FAIL(str(fh.start_num_infile))
            msgval['ref_start_timestamp'] = COLORS.SUCCESS(fh.start_timestamp_filename)
            msgval['ref_start_date'] = COLORS.SUCCESS(fh.start_date_filename)
            msgval['ref_start_num'] = COLORS.SUCCESS(str(fh.start_num_filename))
        else:
            msgval['infile_start_timestamp'] = COLORS.SUCCESS(fh.start_timestamp_infile)
            msgval['infile_start_date'] = COLORS.SUCCESS(fh.start_date_infile)
            msgval['infile_start_num'] = COLORS.SUCCESS(str(fh.start_num_infile))
            msgval['ref_start_timestamp'] = COLOR('cyan').bold(fh.start_timestamp_filename)
            msgval['ref_start_date'] = COLOR('cyan').bold(fh.start_date_filename)
            msgval['ref_start_num'] = COLOR('cyan').bold(str(fh.start_num_filename))
        if {ERROR_END_DATE_NAME_VS_REF, ERROR_END_DATE_REF_VS_NAME}.intersection(set(fh.status)):
            msgval['filename_end_timestamp'] = COLORS.FAIL(fh.end_timestamp_filename)
            msgval['filename_end_date'] = COLORS.FAIL(fh.end_date_filename)
            msgval['filename_end_num'] = COLORS.FAIL(str(fh.end_num_filename))
        else:
            msgval['filename_end_timestamp'] = COLORS.SUCCESS(fh.end_timestamp_filename)
            msgval['filename_end_date'] = COLORS.SUCCESS(fh.end_date_filename)
            msgval['filename_end_num'] = COLORS.SUCCESS(str(fh.end_num_filename))
        if {ERROR_END_DATE_IN_VS_REF, ERROR_END_DATE_REF_VS_IN}.intersection(set(fh.status)):
            msgval['infile_end_timestamp'] = COLORS.FAIL(fh.end_timestamp_infile)
            msgval['infile_end_date'] = COLORS.FAIL(fh.end_date_infile)
            msgval['infile_end_num'] = COLORS.FAIL(str(fh.end_num_infile))
        else:
            msgval['infile_end_timestamp'] = COLORS.SUCCESS(fh.end_timestamp_infile)
            msgval['infile_end_date'] = COLORS.SUCCESS(fh.end_date_infile)
            msgval['infile_end_num'] = COLORS.SUCCESS(str(fh.end_num_infile))
        msgval['ref_end_timestamp'] = COLOR('cyan').bold(fh.last_timestamp)
        msgval['ref_end_date'] = COLOR('cyan').bold(fh.last_date)
        msgval['ref_end_num'] = COLOR('cyan').bold(str(fh.last_num))
        msgval['len'] = fh.length
        msgval['table'] = fh.table
        msgval['freq'] = fh.frequency
        msgval['step'] = fh.step
        msgval['units'] = fh.step_units
        msgval['instant'] = fh.is_instant
        msgval['clim'] = fh.is_climatology
        msgval['bnds'] = fh.has_bounds
        msg = """{file}
    Units:
        IN FILE -- {infile_units}
        REF -- {ref_units}
    Calendar:
        IN FILE -- {infile_calendar}
        REF -- {ref_calendar}
    Start:
        IN FILE -- {infile_start_timestamp} = {infile_start_date} = {infile_start_num}
        REF -- {ref_start_timestamp} = {ref_start_date} = {ref_start_num}
    End:
        FILENAME -- {filename_end_timestamp} = {filename_end_date} = {filename_end_num}
        IN FILE -- {infile_end_timestamp} = {infile_end_date} = {infile_end_num}
        REBUILT -- {ref_end_timestamp} = {ref_end_date} = {ref_end_num}
    Length: {len}
    MIP table: {table}
    Frequency: {freq} = {step} {units}
    Is instant: {instant}
    Is climatology: {clim}
    Has bounds: {bnds}""".format(**msgval)
        # Add status message
        if fh.status:
            for s in fh.status:
                msg += """\n Status: {}""".format(COLORS.FAIL('Error {} -- {}'.format(s, STATUS[s])))
                if correction and s in ERROR_CORRECTED_SET:
                    msg += ' -- {}'.format(COLORS.SUCCESS('Corrected'))
        else:
            msg += """\n Status: {}""".format(COLORS.SUCCESS(STATUS[ERROR_TIME_AXIS_OK]))
        # Display wrong time steps and/or bounds
        timestep_limit = pctx.limit if pctx.limit else len(wrong_timesteps)
        for i, v in enumerate(wrong_timesteps):
            if (i + 1) <= timestep_limit:
                msg += """\n Wrong time step at index {}: IN FILE -- {} = {} vs. REBUILT -- {} = {}""".format(
                    COLORS.HEADER(str(v + 1)),
                    COLORS.FAIL(fh.date_axis[v]),
                    COLORS.FAIL(str(fh.time_axis[v]).ljust(10)),
                    COLORS.SUCCESS(fh.date_axis_rebuilt[v]),
                    COLORS.SUCCESS(str(fh.time_axis_rebuilt[v]).ljust(10)))
        bounds_limit = pctx.limit if pctx.limit else len(wrong_bounds)
        for i, v in enumerate(wrong_bounds):
            if (i + 1) <= bounds_limit:
                msg += """\n Wrong time bounds at index {}: IN FILE -- {} = {} vs. REBUILT -- {} = {}""".format(
                    COLORS.HEADER(str(v + 1)),
                    COLORS.FAIL('[{} {}]'.format(fh.date_bounds[v][0], fh.date_bounds[v][1])),
                    COLORS.FAIL(str(fh.time_bounds[v]).ljust(20)),
                    COLORS.SUCCESS('[{} {}]'.format(fh.date_bounds_rebuilt[v][0], fh.date_bounds_rebuilt[v][1])),
                    COLORS.SUCCESS(str(fh.time_bounds_rebuilt[v]).ljust(20)))
        # Acquire lock to print result
        with pctx.lock:
            if fh.status:
                Print.error(msg, buffer=True)
            else:
                Print.success(msg, buffer=True)
        # Return error if it is the case
        if fh.status:
            return 1
        else:
            return 0
    except KeyboardInterrupt:
        raise
    except Exception:
        exc = traceback.format_exc().splitlines()
        msg = COLORS.HEADER('{}'.format(os.path.basename(ffp)))
        msg += """\n Status: {}""".format(COLORS.FAIL('Skipped'))
        msg += """\n {}""".format(exc[0])
        msg += """\n """
        msg += """\n """.join(exc[1:])
        with pctx.lock:
            Print.error(msg, buffer=True)
        return None
    finally:
        # Print progress
        with pctx.lock:
            pctx.progress.value += 1
            percentage = int(pctx.progress.value * 100 / pctx.nbfiles)
            msg = COLORS.OKBLUE('\rProcess netCDF file(s): ')
            msg += '{}% | {}/{} files'.format(percentage, pctx.progress.value, pctx.nbfiles)
            Print.progress(msg)

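# The time-axis comparison above relies on a trunc(array, NDECIMALS) helper that is not shown
# in this excerpt. A plausible, minimal equivalent (an assumption, not the project's actual
# code) truncates -- rather than rounds -- every value to a fixed number of decimals, so the
# rebuilt and in-file axes can be compared with np.array_equal():
import numpy as np


def trunc_sketch(values, ndecimals):
    """Truncate an array to `ndecimals` decimal places (illustrative only)."""
    factor = 10 ** ndecimals
    return np.trunc(np.asarray(values) * factor) / factor


# Example: trunc_sketch([15.49999999, 15.5000001], 5) -> array([15.49999, 15.5])
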
def process(ffp):
    """
    Process time axis checkup and rewriting if needed.

    :param str ffp: The file full path to process
    :returns: The file status (1 on error, 0 on success)
    :rtype: *int*

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(ffp=ffp,
                  pattern=pctx.pattern,
                  ref_units=pctx.ref_units,
                  ref_calendar=pctx.ref_calendar,
                  input_start_timestamp=pctx.ref_start,
                  input_end_timestamp=pctx.ref_end)
        # Check time axis correctness
        wrong_timesteps = list()
        # Rebuild a theoretical time axis with appropriate precision
        fh.time_axis_rebuilt = trunc(fh.build_time_axis(), NDECIMALS)
        if not np.array_equal(fh.time_axis_rebuilt, fh.time_axis):
            fh.status.append(ERROR_TIME_AXIS_KO)
            time_axis_diff = (fh.time_axis_rebuilt == fh.time_axis)
            wrong_timesteps = list(np.where(time_axis_diff == False)[0])
        # Check time boundaries correctness
        wrong_bounds = list()
        if fh.has_bounds:
            fh.time_bounds_rebuilt = trunc(fh.build_time_bounds(), NDECIMALS)
            if not np.array_equal(fh.time_bounds_rebuilt, fh.time_bounds):
                fh.status.append(ERROR_TIME_BOUNDS_KO)
                time_bounds_diff = (fh.time_bounds_rebuilt == fh.time_bounds)
                wrong_bounds = list(np.where(np.all(time_bounds_diff, axis=1) == False)[0])
        # Get last theoretical date
        fh.last_num = fh.time_axis_rebuilt[-1]
        fh.last_date = fh.date_axis_rebuilt[-1]
        fh.last_timestamp = truncated_timestamp(str2dates(fh.last_date), fh.timestamp_length)
        # Check consistency between start date infile and start date from filename
        if fh.start_date_infile != fh.start_date_filename:
            if fh.start_date_infile < fh.start_date_filename:
                fh.status.append(ERROR_START_DATE_IN_VS_NAME)
            else:
                fh.status.append(ERROR_START_DATE_NAME_VS_IN)
        # Check consistency between end date infile and end date from filename
        if not pctx.on_fly and fh.end_date_infile != fh.end_date_filename:
            if fh.end_date_infile < fh.end_date_filename:
                fh.status.append(ERROR_END_DATE_IN_VS_NAME)
            else:
                fh.status.append(ERROR_END_DATE_NAME_VS_IN)
        # Check consistency between rebuilt end date and end date from filename
        if not pctx.on_fly and fh.last_date != fh.end_date_filename:
            if fh.last_date < fh.end_date_filename:
                fh.status.append(ERROR_END_DATE_REF_VS_NAME)
            else:
                fh.status.append(ERROR_END_DATE_NAME_VS_REF)
        # Check consistency between rebuilt end date and end date infile
        if not pctx.on_fly and fh.last_date != fh.end_date_infile:
            if fh.last_date < fh.end_date_infile:
                fh.status.append(ERROR_END_DATE_REF_VS_IN)
            else:
                fh.status.append(ERROR_END_DATE_IN_VS_REF)
        # Check file consistency between instant time and time boundaries
        if fh.is_instant and fh.has_bounds:
            fh.status.append(ERROR_TIME_BOUNDS_INS)
        # Check file consistency between averaged time and time boundaries
        if not fh.is_instant and not fh.has_bounds:
            fh.status.append(ERROR_TIME_BOUNDS_AVE)
        # Check time units consistency between file and ref
        if pctx.ref_units != fh.tunits:
            fh.status.append(ERROR_TIME_UNITS)
        # Check calendar consistency between file and ref
        if pctx.ref_calendar != fh.calendar:
            fh.status.append(ERROR_TIME_CALENDAR)
        # Exclude codes to ignore from status codes
        # Before correction to avoid undesired operations
        fh.status = [code for code in fh.status if code not in pctx.ignore_codes]
        correction = False
        # Rename file depending on checking
        if (pctx.write and
            (({ERROR_END_DATE_NAME_VS_IN, ERROR_END_DATE_IN_VS_NAME}.intersection(set(fh.status)) and
              {ERROR_END_DATE_NAME_VS_REF, ERROR_END_DATE_REF_VS_NAME}.intersection(set(fh.status))) or
             ({ERROR_END_DATE_NAME_VS_REF, ERROR_END_DATE_REF_VS_NAME}.intersection(set(fh.status)) and
              {ERROR_END_DATE_IN_VS_REF, ERROR_END_DATE_REF_VS_IN}.intersection(set(fh.status))))) or pctx.force:
            # Change filename and file full path dynamically
            fh.nc_file_rename(new_filename=re.sub(fh.orig_end_timestamp_filename, fh.last_timestamp, fh.filename))
            correction = True
        # Remove time boundaries depending on checking
        if pctx.write and ERROR_TIME_BOUNDS_INS in fh.status:
            # Delete time bounds and bounds attribute from file if write or force mode
            fh.nc_var_delete(variable=fh.tbnds)
            fh.nc_att_delete(attribute='bounds', variable='time')
            correction = True
        # Rewrite time units depending on checking
        if pctx.write and ERROR_TIME_UNITS in fh.status:
            fh.nc_att_overwrite('units', variable='time', data=pctx.ref_units)
        # Rewrite time calendar depending on checking
        if pctx.write and ERROR_TIME_CALENDAR in fh.status:
            fh.nc_att_overwrite('calendar', variable='time', data=pctx.ref_calendar)
        # Rewrite time axis depending on checking
        if (pctx.write and {ERROR_TIME_AXIS_KO, ERROR_TIME_BOUNDS_KO}.intersection(set(fh.status))) or pctx.force:
            fh.nc_var_overwrite('time', fh.time_axis_rebuilt)
            # Rewrite time boundaries if needed
            if fh.has_bounds:
                fh.nc_var_overwrite(fh.tbnds, fh.time_bounds_rebuilt)
            correction = True
        # Diagnostic display
        msgval = {}
        msgval['file'] = COLORS.HEADER(fh.filename)
        if ERROR_TIME_UNITS in fh.status:
            msgval['infile_units'] = COLORS.FAIL(fh.tunits)
            msgval['ref_units'] = COLORS.SUCCESS(pctx.ref_units)
        else:
            msgval['infile_units'] = COLORS.SUCCESS(fh.tunits)
            msgval['ref_units'] = COLOR('cyan').bold(pctx.ref_units)
        if ERROR_TIME_CALENDAR in fh.status:
            msgval['infile_calendar'] = COLORS.FAIL(fh.calendar)
            msgval['ref_calendar'] = COLORS.SUCCESS(pctx.ref_calendar)
        else:
            msgval['infile_calendar'] = COLORS.SUCCESS(fh.calendar)
            msgval['ref_calendar'] = COLOR('cyan').bold(pctx.ref_calendar)
        if {ERROR_START_DATE_IN_VS_NAME, ERROR_START_DATE_NAME_VS_IN}.intersection(set(fh.status)):
            msgval['infile_start_timestamp'] = COLORS.FAIL(fh.start_timestamp_infile)
            msgval['infile_start_date'] = COLORS.FAIL(fh.start_date_infile)
            msgval['infile_start_num'] = COLORS.FAIL(str(fh.start_num_infile))
            msgval['ref_start_timestamp'] = COLORS.SUCCESS(fh.start_timestamp_filename)
            msgval['ref_start_date'] = COLORS.SUCCESS(fh.start_date_filename)
            msgval['ref_start_num'] = COLORS.SUCCESS(str(fh.start_num_filename))
        else:
            msgval['infile_start_timestamp'] = COLORS.SUCCESS(fh.start_timestamp_infile)
            msgval['infile_start_date'] = COLORS.SUCCESS(fh.start_date_infile)
            msgval['infile_start_num'] = COLORS.SUCCESS(str(fh.start_num_infile))
            msgval['ref_start_timestamp'] = COLOR('cyan').bold(fh.start_timestamp_filename)
            msgval['ref_start_date'] = COLOR('cyan').bold(fh.start_date_filename)
            msgval['ref_start_num'] = COLOR('cyan').bold(str(fh.start_num_filename))
        if {ERROR_END_DATE_NAME_VS_REF, ERROR_END_DATE_REF_VS_NAME}.intersection(set(fh.status)):
            msgval['filename_end_timestamp'] = COLORS.FAIL(fh.end_timestamp_filename)
            msgval['filename_end_date'] = COLORS.FAIL(fh.end_date_filename)
            msgval['filename_end_num'] = COLORS.FAIL(str(fh.end_num_filename))
        else:
            msgval['filename_end_timestamp'] = COLORS.SUCCESS(fh.end_timestamp_filename)
            msgval['filename_end_date'] = COLORS.SUCCESS(fh.end_date_filename)
            msgval['filename_end_num'] = COLORS.SUCCESS(str(fh.end_num_filename))
        if {ERROR_END_DATE_IN_VS_REF, ERROR_END_DATE_REF_VS_IN}.intersection(set(fh.status)):
            msgval['infile_end_timestamp'] = COLORS.FAIL(fh.end_timestamp_infile)
            msgval['infile_end_date'] = COLORS.FAIL(fh.end_date_infile)
            msgval['infile_end_num'] = COLORS.FAIL(str(fh.end_num_infile))
        else:
            msgval['infile_end_timestamp'] = COLORS.SUCCESS(fh.end_timestamp_infile)
            msgval['infile_end_date'] = COLORS.SUCCESS(fh.end_date_infile)
            msgval['infile_end_num'] = COLORS.SUCCESS(str(fh.end_num_infile))
        msgval['ref_end_timestamp'] = COLOR('cyan').bold(fh.last_timestamp)
        msgval['ref_end_date'] = COLOR('cyan').bold(fh.last_date)
        msgval['ref_end_num'] = COLOR('cyan').bold(str(fh.last_num))
        msgval['len'] = fh.length
        msgval['table'] = fh.table
        msgval['freq'] = fh.frequency
        msgval['step'] = fh.step
        msgval['units'] = fh.step_units
        msgval['instant'] = fh.is_instant
        msgval['clim'] = fh.is_climatology
        msgval['bnds'] = fh.has_bounds
        msg = """{file}
    Units:
        IN FILE -- {infile_units}
        REF -- {ref_units}
    Calendar:
        IN FILE -- {infile_calendar}
        REF -- {ref_calendar}
    Start:
        IN FILE -- {infile_start_timestamp} = {infile_start_date} = {infile_start_num}
        REF -- {ref_start_timestamp} = {ref_start_date} = {ref_start_num}
    End:
        FILENAME -- {filename_end_timestamp} = {filename_end_date} = {filename_end_num}
        IN FILE -- {infile_end_timestamp} = {infile_end_date} = {infile_end_num}
        REBUILT -- {ref_end_timestamp} = {ref_end_date} = {ref_end_num}
    Length: {len}
    MIP table: {table}
    Frequency: {freq} = {step} {units}
    Is instant: {instant}
    Is climatology: {clim}
    Has bounds: {bnds}""".format(**msgval)
        # Add status message
        if fh.status:
            for s in fh.status:
                msg += """\n Status: {}""".format(COLORS.FAIL('Error {} -- {}'.format(s, STATUS[s])))
                if correction and s in ERROR_CORRECTED_SET:
                    msg += ' -- {}'.format(COLORS.SUCCESS('Corrected'))
        else:
            msg += """\n Status: {}""".format(COLORS.SUCCESS(STATUS[ERROR_TIME_AXIS_OK]))
        # Display wrong time steps and/or bounds
        timestep_limit = pctx.limit if pctx.limit else len(wrong_timesteps)
        for i, v in enumerate(wrong_timesteps):
            if (i + 1) <= timestep_limit:
                msg += """\n Wrong time step at index {}: IN FILE -- {} = {} vs. REBUILT -- {} = {}""".format(
                    COLORS.HEADER(str(v + 1)),
                    COLORS.FAIL(fh.date_axis[v]),
                    COLORS.FAIL(str(fh.time_axis[v]).ljust(10)),
                    COLORS.SUCCESS(fh.date_axis_rebuilt[v]),
                    COLORS.SUCCESS(str(fh.time_axis_rebuilt[v]).ljust(10)))
        bounds_limit = pctx.limit if pctx.limit else len(wrong_bounds)
        for i, v in enumerate(wrong_bounds):
            if (i + 1) <= bounds_limit:
                msg += """\n Wrong time bounds at index {}: IN FILE -- {} = {} vs. REBUILT -- {} = {}""".format(
                    COLORS.HEADER(str(v + 1)),
                    COLORS.FAIL('[{} {}]'.format(fh.date_bounds[v][0], fh.date_bounds[v][1])),
                    COLORS.FAIL(str(fh.time_bounds[v]).ljust(20)),
                    COLORS.SUCCESS('[{} {}]'.format(fh.date_bounds_rebuilt[v][0], fh.date_bounds_rebuilt[v][1])),
                    COLORS.SUCCESS(str(fh.time_bounds_rebuilt[v]).ljust(20)))
        # Acquire lock to print result
        with pctx.lock:
            if fh.status:
                Print.error(msg, buffer=True)
            else:
                Print.success(msg, buffer=True)
        # Return error if it is the case
        if fh.status:
            return 1
        else:
            return 0
    except KeyboardInterrupt:
        raise
    except Exception:
        exc = traceback.format_exc().splitlines()
        msg = COLORS.HEADER('{}'.format(os.path.basename(ffp)))
        msg += """\n Status: {}""".format(COLORS.FAIL('Skipped'))
        msg += """\n {}""".format(exc[0])
        msg += """\n """
        msg += """\n """.join(exc[1:])
        with pctx.lock:
            Print.error(msg, buffer=True)
        return None
    finally:
        # Print progress
        with pctx.lock:
            pctx.progress.value += 1
            percentage = int(pctx.progress.value * 100 / pctx.nbfiles)
            msg = COLORS.OKBLUE('\rProcess netCDF file(s): ')
            msg += '{}% | {}/{} files'.format(percentage, pctx.progress.value, pctx.nbfiles)
            Print.progress(msg)

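# Both copies of process(ffp) above fetch a processing context from the worker's global
# namespace instead of receiving it as an argument, hence the assert on 'pctx'. A minimal
# sketch of that pattern follows; initializer() and run_checks() are illustrative assumptions,
# not the project's own driver code.
from multiprocessing import Pool


def initializer(context):
    """Install the shared context as a module-level global inside each worker (sketch)."""
    global pctx
    pctx = context


def run_checks(paths, context, nprocs=4):
    # Each worker runs initializer(context) once, so process(ffp) finds 'pctx' in globals().
    with Pool(processes=nprocs, initializer=initializer, initargs=(context,)) as pool:
        return pool.map(process, paths)
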
def process(source):
    """
    process(source)

    File process that:

     * Handles files,
     * Deduces facet key/value pairs from file attributes,
     * Checks facet values against CV,
     * Applies the versioning,
     * Populates the DRS tree, creating the appropriate leaves,
     * Stores dataset statistics.

    :param str source: The file full path to process

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(source)
        # Ignore files from incoming
        if fh.filename in pctx.ignore_from_incoming:
            msg = TAGS.SKIP + COLORS.HEADER(source)
            with pctx.lock:
                Print.exception(msg, buffer=True)
            return None
        # Load attributes from filename, netCDF attributes, command-line
        fh.load_attributes(root=pctx.root,
                           pattern=pctx.pattern,
                           set_values=pctx.set_values)
        # Check the facet values provided by the loaded attributes
        fh.check_facets(facets=pctx.facets,
                        config=pctx.cfg,
                        set_keys=pctx.set_keys)
        # Get parts of DRS path
        parts = fh.get_drs_parts(pctx.facets)
        # Instantiate file DRS path handler
        fh.drs = DRSPath(parts)
        # Ensure that the called project section is ALWAYS part of the DRS path elements (case insensitive)
        if not fh.drs.path().lower().startswith(pctx.project.lower()):
            raise InconsistentDRSPath(pctx.project, fh.drs.path())
        # Compute file checksum
        if fh.drs.v_latest and not pctx.no_checksum:
            fh.checksum = checksum(fh.ffp, pctx.checksum_type)
        # Get file tracking id
        fh.tracking_id = get_tracking_id(fh.ffp, pctx.project)
        if fh.drs.v_latest:
            latest_file = os.path.join(fh.drs.path(latest=True, root=True), fh.filename)
            # Compute checksum of latest file version if it exists
            if os.path.exists(latest_file) and not pctx.no_checksum:
                fh.latest_checksum = checksum(latest_file, pctx.checksum_type)
            # Get tracking_id of latest file version if it exists
            if os.path.exists(latest_file):
                fh.latest_tracking_id = get_tracking_id(latest_file, pctx.project)
        msg = TAGS.SUCCESS + 'Processing {}'.format(COLORS.HEADER(fh.ffp))
        Print.info(msg)
        return fh
    except KeyboardInterrupt:
        raise
    except Exception:
        exc = traceback.format_exc().splitlines()
        msg = TAGS.SKIP + COLORS.HEADER(source) + '\n'
        msg += '\n'.join(exc)
        with pctx.lock:
            Print.exception(msg, buffer=True)
        return None
    finally:
        with pctx.lock:
            pctx.progress.value += 1
            percentage = int(pctx.progress.value * 100 / pctx.nbsources)
            msg = COLORS.OKBLUE('\rScanning incoming file(s): ')
            msg += '{}% | {}/{} file(s)'.format(percentage, pctx.progress.value, pctx.nbsources)
            Print.progress(msg)

def process(source):
    """
    File process that:

     * Handles file,
     * Harvests directory attributes,
     * Checks DRS attributes against CV,
     * Builds dataset ID,
     * Retrieves file size,
     * Does checksums,
     * Deduces mapfile name,
     * Writes the corresponding mapfile entry.

    Any error leads to skipping the file. It does not stop the process.

    :param str source: The source to process, either a file path or a dataset ID
    :returns: The output mapfile full path
    :rtype: *str*

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    # Block to avoid program stop if a thread fails
    try:
        if pctx.source_type == 'file':
            # Instantiate source handler as file
            sh = File(source)
        else:
            # Instantiate source handler as dataset
            sh = Dataset(source)
        # Matching between directory_format and file full path
        sh.load_attributes(pattern=pctx.pattern)
        # Deduce dataset_id
        dataset_id = pctx.dataset_name
        if not pctx.dataset_name:
            sh.check_facets(facets=pctx.facets, config=pctx.cfg)
            dataset_id = sh.get_dataset_id(pctx.cfg.get('dataset_id', raw=True))
        # Ensure that the first facet is ALWAYS the same as the called project section (case insensitive)
        if not dataset_id.lower().startswith(pctx.project.lower()):
            raise InconsistentDatasetID(pctx.project, dataset_id.lower())
        # Deduce dataset_version
        dataset_version = sh.get_dataset_version(pctx.no_version)
        # Build mapfile name depending on the --mapfile flag and appropriate tokens
        outfile = get_output_mapfile(outdir=pctx.outdir,
                                     attributes=sh.attributes,
                                     mapfile_name=pctx.mapfile_name,
                                     dataset_id=dataset_id,
                                     dataset_version=dataset_version,
                                     mapfile_drs=pctx.mapfile_drs,
                                     basename=pctx.basename)
        # Dry-run: don't write the mapfile, only show its path
        if pctx.action == 'make':
            # Generate the corresponding mapfile entry/line
            optional_attrs = dict()
            optional_attrs['mod_time'] = sh.mtime
            if not pctx.no_checksum:
                optional_attrs['checksum'] = get_checksum(sh.source, pctx.checksum_type, pctx.checksums_from)
                optional_attrs['checksum_type'] = pctx.checksum_type.upper()
            optional_attrs['dataset_tech_notes'] = pctx.notes_url
            optional_attrs['dataset_tech_notes_title'] = pctx.notes_title
            line = mapfile_entry(dataset_id=dataset_id,
                                 dataset_version=dataset_version,
                                 ffp=source,
                                 size=sh.size,
                                 optional_attrs=optional_attrs)
            write(outfile, line)
        msg = TAGS.SUCCESS
        msg += '{}'.format(os.path.splitext(os.path.basename(outfile))[0])
        msg += ' <-- ' + COLORS.HEADER(source)
        with pctx.lock:
            Print.info(msg)
        # Return mapfile name
        return outfile
    # Catch any exception into error log instead of stopping the run
    except KeyboardInterrupt:
        raise
    except Exception:
        exc = traceback.format_exc().splitlines()
        msg = TAGS.SKIP + COLORS.HEADER(source) + '\n'
        msg += '\n'.join(exc)
        with pctx.lock:
            Print.exception(msg, buffer=True)
        return None
    finally:
        with pctx.lock:
            pctx.progress.value += 1
            percentage = int(pctx.progress.value * 100 / pctx.nbsources)
            msg = COLORS.OKBLUE('\rMapfile(s) generation: ')
            msg += '{}% | {}/{} {}'.format(percentage, pctx.progress.value, pctx.nbsources,
                                           SOURCE_TYPE[pctx.source_type])
            Print.progress(msg)

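# For orientation, an ESGF mapfile line as written by mapfile_entry()/write() above typically
# takes the pipe-separated form sketched below: dataset ID plus version, file path, size in
# bytes, then optional key=value attributes. The helper name and the sample values are
# illustrative assumptions, not the project's exact implementation.
def mapfile_entry_sketch(dataset_id, dataset_version, ffp, size, optional_attrs):
    """Build one mapfile line (sketch of the usual ESGF layout)."""
    fields = [dataset_id + '#' + dataset_version.lstrip('v'), ffp, str(size)]
    fields += ['{}={}'.format(k, v) for k, v in optional_attrs.items() if v]
    return ' | '.join(fields) + '\n'


# Example output (made-up values):
# 'cmip5.output1.IPSL.IPSL-CM5A-LR.historical.mon.atmos.Amon.r1i1p1#20110406 | /data/tas_Amon_....nc | 4938 | mod_time=1299499118.0 | checksum=3af... | checksum_type=SHA256'
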
def process(source):
    """
    process(source)

    File process that:

     * Handles files,
     * Deduces facet key/value pairs from file attributes,
     * Checks facet values against CV,
     * Applies the versioning,
     * Populates the DRS tree, creating the appropriate leaves,
     * Stores dataset statistics.

    :param str source: The file full path to process

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(source)
        # Ignore files from incoming
        if fh.filename in pctx.ignore_from_incoming:
            msg = TAGS.SKIP + COLORS.HEADER(source)
            with pctx.lock:
                Print.exception(msg, buffer=True)
            return None
        # Load attributes from filename, netCDF attributes, command-line
        fh.load_attributes(root=pctx.root,
                           pattern=pctx.pattern,
                           set_values=pctx.set_values)
        # Check the facet values provided by the loaded attributes
        fh.check_facets(facets=pctx.facets,
                        config=pctx.cfg,
                        set_keys=pctx.set_keys)
        # Get parts of DRS path
        parts = fh.get_drs_parts(pctx.facets)
        # Instantiate file DRS path handler
        fh.drs = DRSPath(parts)
        # Ensure that the called project section is ALWAYS part of the DRS path elements (case insensitive)
        if not fh.drs.path().lower().startswith(pctx.project.lower()):
            raise InconsistentDRSPath(pctx.project, fh.drs.path())
        # Evaluate whether the processed file already exists in the latest dataset version (i.e., is a duplicate)
        # Default: fh.is_duplicate = False
        # 1. If a latest dataset version exists
        if fh.drs.v_latest:
            # Build corresponding latest file path
            latest_file = os.path.join(fh.drs.path(latest=True, root=True), fh.filename)
            # 2. Test if a file with the same filename exists in latest version
            if os.path.exists(latest_file):
                # Get tracking ID (None if not recorded into the file)
                fh.tracking_id = get_tracking_id(fh.ffp, pctx.project)
                latest_tracking_id = get_tracking_id(latest_file, pctx.project)
                # 3. Test if tracking IDs are identical (otherwise keep is_duplicate = False)
                if fh.tracking_id == latest_tracking_id:
                    latest_size = os.stat(latest_file).st_size
                    # 4. Test if file sizes are identical (otherwise keep is_duplicate = False)
                    if fh.size == latest_size and not pctx.no_checksum:
                        # Read or compute the checksums
                        fh.checksum = get_checksum(fh.ffp, pctx.checksum_type, pctx.checksums_from)
                        latest_checksum = get_checksum(latest_file, pctx.checksum_type, pctx.checksums_from)
                        # Store checksum comparison result
                        if fh.checksum == latest_checksum:
                            fh.is_duplicate = True
                        elif fh.tracking_id and latest_tracking_id:
                            # If the checksums differ, the tracking IDs must not be identical (when they exist).
                            # If there are no tracking IDs, keep is_duplicate = False
                            raise UnchangedTrackingID(latest_file, latest_tracking_id, fh.ffp, fh.tracking_id)
                    elif fh.tracking_id and latest_tracking_id:
                        # If the sizes differ, the tracking IDs must not be identical (when they exist).
                        # If there are no tracking IDs, keep is_duplicate = False
                        raise UnchangedTrackingID(latest_file, latest_tracking_id, fh.ffp, fh.tracking_id)
        msg = TAGS.SUCCESS + 'Processing {}'.format(COLORS.HEADER(fh.ffp))
        Print.info(msg)
        return fh
    except KeyboardInterrupt:
        raise
    except Exception:
        exc = traceback.format_exc().splitlines()
        msg = TAGS.SKIP + COLORS.HEADER(source) + '\n'
        msg += '\n'.join(exc)
        with pctx.lock:
            Print.exception(msg, buffer=True)
        return None
    finally:
        with pctx.lock:
            pctx.progress.value += 1
            percentage = int(pctx.progress.value * 100 / pctx.nbsources)
            msg = COLORS.OKBLUE('\rScanning incoming file(s): ')
            msg += '{}% | {}/{} file(s)'.format(percentage, pctx.progress.value, pctx.nbsources)
            Print.progress(msg)

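# The duplicate detection above boils down to a short decision chain: same filename in the
# latest version, then same tracking ID, same size, and only then a checksum comparison. The
# standalone sketch below mirrors the size/checksum part of that order with plain hashlib; it
# is an illustration (the tracking-ID comparison and the project's get_checksum() are omitted).
import hashlib
import os


def _sha256(path):
    with open(path, 'rb') as handle:
        return hashlib.sha256(handle.read()).hexdigest()


def is_duplicate_sketch(incoming, latest):
    """Return True when `incoming` matches `latest` by size and SHA-256 checksum (sketch)."""
    if not os.path.exists(latest):
        return False
    if os.stat(incoming).st_size != os.stat(latest).st_size:
        # A size mismatch is enough to rule out a duplicate without hashing.
        return False
    return _sha256(incoming) == _sha256(latest)
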
def process(collector_input):
    """
    File process that:

     * Handles file,
     * Harvests directory attributes,
     * Checks DRS attributes against CV,
     * Builds dataset ID,
     * Retrieves file size,
     * Does checksums,
     * Deduces mapfile name,
     * Writes the corresponding mapfile entry.

    Any error leads to skipping the file. It does not stop the process.

    :param tuple collector_input: A tuple with the file path and the processing context
    :returns: The output mapfile full path
    :rtype: *str*

    """
    # Deserialize inputs from collector
    source, ctx = collector_input
    # Block to avoid program stop if a thread fails
    try:
        if ctx.source_type == 'file':
            # Instantiate source handler as file
            sh = File(source)
        else:
            # Instantiate source handler as dataset
            sh = Dataset(source)
        # Matching between directory_format and file full path
        sh.load_attributes(pattern=ctx.pattern)
        # Deduce dataset_id
        dataset_id = ctx.dataset_name
        if not ctx.dataset_name:
            sh.check_facets(facets=ctx.facets, config=ctx.cfg)
            dataset_id = sh.get_dataset_id(ctx.cfg.get('dataset_id', raw=True))
        # Ensure that the first facet is ALWAYS the same as the called project section (case insensitive)
        assert dataset_id.lower().startswith(ctx.project.lower()), \
            'Inconsistent dataset identifier. ' \
            'Must start with "{}/" ' \
            '(case-insensitive)'.format(ctx.project)
        # Deduce dataset_version
        dataset_version = sh.get_dataset_version(ctx.no_version)
        # Build mapfile name depending on the --mapfile flag and appropriate tokens
        outfile = get_output_mapfile(outdir=ctx.outdir,
                                     attributes=sh.attributes,
                                     mapfile_name=ctx.mapfile_name,
                                     dataset_id=dataset_id,
                                     dataset_version=dataset_version,
                                     mapfile_drs=ctx.mapfile_drs,
                                     basename=ctx.basename)
        # Dry-run: don't write the mapfile, only show its path
        if ctx.action == 'make':
            # Generate the corresponding mapfile entry/line
            optional_attrs = dict()
            optional_attrs['mod_time'] = sh.mtime
            if not ctx.no_checksum:
                optional_attrs['checksum'] = sh.checksum(ctx.checksum_type)
                optional_attrs['checksum_type'] = ctx.checksum_type.upper()
            optional_attrs['dataset_tech_notes'] = ctx.notes_url
            optional_attrs['dataset_tech_notes_title'] = ctx.notes_title
            line = mapfile_entry(dataset_id=dataset_id,
                                 dataset_version=dataset_version,
                                 ffp=source,
                                 size=sh.size,
                                 optional_attrs=optional_attrs)
            write(outfile, line)
        logging.info('{} <-- {}'.format(os.path.splitext(os.path.basename(outfile))[0], source))
        # Return mapfile name
        return outfile
    # Catch any exception into error log instead of stopping the run
    except Exception as e:
        logging.error('{} skipped\n{}: {}'.format(source, e.__class__.__name__, str(e)))
        return None
    finally:
        if ctx.pbar:
            ctx.pbar.update()

def process(collector_input):
    """
    File process that:

     * Handles file,
     * Harvests directory attributes,
     * Checks DRS attributes against CV,
     * Builds dataset ID,
     * Retrieves file size,
     * Does checksums,
     * Deduces mapfile name,
     * Writes the corresponding mapfile entry.

    Any error leads to skipping the file. It does not stop the process.

    :param tuple collector_input: A tuple with the file path and the processing context
    :returns: The output mapfile full path
    :rtype: *str*

    """
    # Deserialize inputs from collector
    ffp, ctx = collector_input
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(ffp)
        # Matching between directory_format and file full path
        fh.load_attributes(pattern=ctx.pattern)
        # Apply proper case to each attribute
        for key in fh.attributes:
            # Try to get the appropriate facet case from "category_defaults"
            try:
                fh.attributes[key] = ctx.cfg.get_options_from_pairs('category_defaults', key)
            except NoConfigKey:
                # If not specified, keep the facet case from the local path; do nothing
                pass
        # Deduce dataset_id
        dataset_id = ctx.dataset
        if not ctx.dataset:
            fh.check_facets(facets=ctx.facets, config=ctx.cfg)
            dataset_id = fh.get_dataset_id(ctx.cfg.get('dataset_id', raw=True))
        # Deduce dataset_version
        dataset_version = fh.get_dataset_version(ctx.no_version)
        # Build mapfile name depending on the --mapfile flag and appropriate tokens
        outfile = get_output_mapfile(outdir=ctx.outdir,
                                     attributes=fh.attributes,
                                     mapfile_name=ctx.mapfile_name,
                                     dataset_id=dataset_id,
                                     dataset_version=dataset_version,
                                     mapfile_drs=ctx.mapfile_drs)
        # Generate the corresponding mapfile entry/line
        optional_attrs = dict()
        optional_attrs['mod_time'] = fh.mtime
        if not ctx.no_checksum:
            optional_attrs['checksum'] = fh.checksum(ctx.checksum_type)
            optional_attrs['checksum_type'] = ctx.checksum_type.upper()
        optional_attrs['dataset_tech_notes'] = ctx.notes_url
        optional_attrs['dataset_tech_notes_title'] = ctx.notes_title
        line = mapfile_entry(dataset_id=dataset_id,
                             dataset_version=dataset_version,
                             ffp=ffp,
                             size=fh.size,
                             optional_attrs=optional_attrs)
        write(outfile, line)
        logging.info('{} <-- {}'.format(os.path.splitext(os.path.basename(outfile))[0], ffp))
        # Return mapfile name
        return outfile
    # Catch any exception into error log instead of stopping the run
    except Exception as e:
        logging.error('{} skipped\n{}: {}'.format(ffp, e.__class__.__name__, str(e)))
        return None
    finally:
        ctx.pbar.update()

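# The collector-based variants above receive (path, context) tuples instead of relying on a
# worker global. A minimal sketch of feeding them from a directory walk follows; walk_nc() and
# run_sequential() are hypothetical helpers, only process(collector_input) is taken from the
# code above.
import os


def walk_nc(directory):
    """Yield the full path of every .nc file under `directory` (sketch)."""
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith('.nc'):
                yield os.path.join(root, filename)


def run_sequential(directory, ctx):
    # Pair each file path with the shared context, exactly as a collector would.
    return [process((ffp, ctx)) for ffp in walk_nc(directory)]
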
def process(collector_input):
    """
    process(collector_input)

    File process that:

     * Handles files,
     * Deduces facet key/value pairs from file attributes,
     * Checks facet values against CV,
     * Applies the versioning,
     * Populates the DRS tree, creating the appropriate leaves,
     * Stores dataset statistics.

    :param tuple collector_input: A tuple with the file path and the processing context
    :return: True on success
    :rtype: *boolean*

    """
    # Deserialize inputs from collector
    ffp, ctx = collector_input
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(ffp)
        # Load attributes from filename, netCDF attributes, command-line
        fh.load_attributes(root=ctx.root, pattern=ctx.pattern, set_values=ctx.set_values)
        # Apply proper case to each attribute
        for key in fh.attributes:
            # Try to get the appropriate facet case from "category_defaults"
            try:
                fh.attributes[key] = ctx.cfg.get_options_from_pairs('category_defaults', key)
            except NoConfigKey:
                # If not specified, keep the facet case from the local path; do nothing
                pass
        fh.check_facets(facets=ctx.facets, config=ctx.cfg, set_keys=ctx.set_keys)
        # Get parts of DRS path
        parts = fh.get_drs_parts(ctx.facets)
        # Instantiate file DRS path handler
        fph = DRSPath(parts)
        # If a latest version already exists, run some checks FIRST to reject files that should not be processed
        if fph.v_latest:
            # Latest version should be older than upgrade version
            if int(DRSPath.TREE_VERSION[1:]) <= int(fph.v_latest[1:]):
                raise OlderUpgrade(DRSPath.TREE_VERSION, fph.v_latest)
            # Walk through the latest dataset version to check its uniqueness with file checksums
            dset_nid = fph.path(f_part=False, latest=True, root=True)
            if dset_nid not in ctx.tree.hash.keys():
                ctx.tree.hash[dset_nid] = dict()
                ctx.tree.hash[dset_nid]['latest'] = dict()
                for root, _, filenames in os.walk(fph.path(f_part=False, latest=True, root=True)):
                    for filename in filenames:
                        ctx.tree.hash[dset_nid]['latest'][filename] = checksum(os.path.join(root, filename),
                                                                               ctx.checksum_type)
            # Pick up the latest file version
            latest_file = os.path.join(fph.path(latest=True, root=True), fh.filename)
            # Check latest file if it exists
            if os.path.exists(latest_file):
                latest_checksum = checksum(latest_file, ctx.checksum_type)
                current_checksum = checksum(fh.ffp, ctx.checksum_type)
                # Check if processed file is a duplicate in comparison with latest version
                if latest_checksum == current_checksum:
                    fh.is_duplicate = True
        # Start the tree generation
        if not fh.is_duplicate:
            # Add the processed file to the "vYYYYMMDD" node
            src = ['..'] * len(fph.items(d_part=False))
            src.extend(fph.items(d_part=False, file_folder=True))
            src.append(fh.filename)
            ctx.tree.create_leaf(nodes=fph.items(root=True),
                                 leaf=fh.filename,
                                 label='{}{}{}'.format(fh.filename, LINK_SEPARATOR, os.path.join(*src)),
                                 src=os.path.join(*src),
                                 mode='symlink',
                                 origin=fh.ffp)
            # Add the "latest" node for symlink
            ctx.tree.create_leaf(nodes=fph.items(f_part=False, version=False, root=True),
                                 leaf='latest',
                                 label='{}{}{}'.format('latest', LINK_SEPARATOR, fph.v_upgrade),
                                 src=fph.v_upgrade,
                                 mode='symlink')
            # Add the processed file to the "files" node
            ctx.tree.create_leaf(nodes=fph.items(file_folder=True, root=True),
                                 leaf=fh.filename,
                                 label=fh.filename,
                                 src=fh.ffp,
                                 mode=ctx.mode)
            if ctx.upgrade_from_latest:
                # Walk through the latest dataset version and create a symlink for each file with a different
                # filename than the processed one
                for root, _, filenames in os.walk(fph.path(f_part=False, latest=True, root=True)):
                    for filename in filenames:
                        # Add latest files as tree leaves with the version to upgrade instead of the latest version,
                        # i.e., copy latest dataset leaves to the Tree
                        if filename != fh.filename:
                            src = os.path.join(root, filename)
                            ctx.tree.create_leaf(nodes=fph.items(root=True),
                                                 leaf=filename,
                                                 label='{}{}{}'.format(filename, LINK_SEPARATOR, os.readlink(src)),
                                                 src=os.readlink(src),
                                                 mode='symlink',
                                                 origin=os.path.realpath(src))
        else:
            # Pick up the latest file version
            latest_file = os.path.join(fph.path(latest=True, root=True), fh.filename)
            if ctx.upgrade_from_latest:
                # If upgrade from latest is activated, raise the error: no duplicated files are allowed
                # because incoming must only contain modified/corrected files
                raise DuplicatedFile(latest_file, fh.ffp)
            else:
                # With the default behavior, the incoming contains all data for a new version.
                # In the case of a duplicated file, just pass to the expected symlink creation
                # and record the duplicated file for further removal only if the migration mode is the
                # default (i.e., moving files). In the case of --copy or --link, keep duplicates
                # in place in the incoming directory
                src = os.readlink(latest_file)
                ctx.tree.create_leaf(nodes=fph.items(root=True),
                                     leaf=fh.filename,
                                     label='{}{}{}'.format(fh.filename, LINK_SEPARATOR, src),
                                     src=src,
                                     mode='symlink',
                                     origin=fh.ffp)
                if ctx.mode == 'move':
                    ctx.tree.duplicates.append(fh.ffp)
        # Record entry for list()
        incoming = {'src': fh.ffp,
                    'dst': fph.path(root=True),
                    'filename': fh.filename,
                    'latest': fph.v_latest or 'Initial',
                    'size': fh.size}
        if fph.path(f_part=False) in ctx.tree.paths.keys():
            ctx.tree.paths[fph.path(f_part=False)].append(incoming)
        else:
            ctx.tree.paths[fph.path(f_part=False)] = [incoming]
        logging.info('{} <-- {}'.format(fph.path(f_part=False), fh.filename))
        return True
    except Exception as e:
        logging.error('{} skipped\n{}: {}'.format(ffp, e.__class__.__name__, str(e)))
        return None
    finally:
        ctx.pbar.update()

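# The create_leaf() calls above materialize a symlink-based dataset layout: physical files under
# a files/dYYYYMMDD folder, per-version directories holding relative symlinks into it, and a
# 'latest' symlink pointing at the newest vYYYYMMDD directory. The filesystem-level sketch below
# illustrates that shape only; the paths, the version value and the use of plain os.symlink are
# assumptions standing in for the project's Tree machinery.
import os


def sketch_drs_layout(dataset_dir, filename, version='v20240101'):
    """Create a toy version/files/latest layout for one file (illustrative only)."""
    files_dir = os.path.join(dataset_dir, 'files', 'd' + version[1:])
    version_dir = os.path.join(dataset_dir, version)
    os.makedirs(files_dir, exist_ok=True)
    os.makedirs(version_dir, exist_ok=True)
    # The version directory points at the physical copy through a relative symlink...
    os.symlink(os.path.join('..', 'files', 'd' + version[1:], filename),
               os.path.join(version_dir, filename))
    # ...and 'latest' points at the version directory.
    os.symlink(version, os.path.join(dataset_dir, 'latest'))
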