def update(self, amount):
    '''Add `amount` bytes to the processed total and redraw the progress bar.'''
    self.done += amount
    elapsed = time.perf_counter() - self.started
    estimate = elapsed * self.total / self.done
    remains = (estimate - elapsed) * 1.1  # overestimate by 10%
    progress = self.done / self.total
    sys.stdout.write('\r[{0}{1}]{2}% ~{3} left '.format(
        '#' * int(self.width * progress),
        ' ' * int(self.width * (1 - progress)),
        int(progress * 100),
        utils.time_diff(int(remains)))
    )
    sys.stdout.flush()
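# NOTE: main() below also calls ProgressBar.start() and .complete(), which fall
# outside this excerpt. A minimal sketch of what they presumably do, assuming
# the same attributes update() uses (started, done, total, width):
def start(self):
    '''Record the start time and draw an empty bar.'''
    self.started = time.perf_counter()
    self.done = 0
    sys.stdout.write('\r[{0}]0%'.format(' ' * self.width))
    sys.stdout.flush()

def complete(self):
    '''Draw a full bar and end the progress line.'''
    sys.stdout.write('\r[{0}]100%{1}\n'.format('#' * self.width, ' ' * 16))
    sys.stdout.flush()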
def main(argv):
    start = time.time()
    min_readers = 1
    max_readers = 20

    # NOTE: the defaults for begin time and end time
    #   Begin : 00010101
    #   End   : 99991231

    # Parse Args -------------------------------------------------------------
    parser = argparse.ArgumentParser(
        prog='ncreview',
        description='Compare netCDF files between two directories, or summarize a single directory.',
        epilog='''Note that if --begin and --end are unspecified when comparing datastreams,
the time span chosen will be the intersection of the time periods spanned by both datastreams.''')

    parser.add_argument('--version', '-v', action='version',
                        version='%(prog)s v' + ncr.__version__)
    parser.add_argument('old_dir', help='Old netCDF files directory')
    parser.add_argument('new_dir', nargs='?', default=None,
                        help='New netCDF files directory; omit to summarize a single directory instead.')
    parser.add_argument('--begin', '-b', default='00010101', metavar='YYYYMMDD',
                        help='Ignore files before YYYYMMDD')
    parser.add_argument('--end', '-e', default='99991231', metavar='YYYYMMDD',
                        help='Ignore files after YYYYMMDD')
    parser.add_argument('--sample_interval', '-t', default=None,
                        help='Time interval to average data over, in HH-MM-SS. If not provided, '
                             'defaults to 1 day if more than 10 days are being processed, '
                             'otherwise defaults to hourly samples.')
    parser.add_argument('--metadata_only', '-m', action='store_true', default=False,
                        help='Review only metadata, ignoring variable data. '
                             'Much faster than a standard review.')
    parser.add_argument('--write_dir', '-w', default=None, metavar='DIR',
                        help='Write output data files to the specified directory.')
    parser.add_argument('--name', '-n', default=None,
                        help='Custom name for the run. Used as the name of the directory where '
                             'the summary files ncreview creates are stored, as well as the URL suffix.')
    parser.add_argument('--readers', type=int, default=10,
                        help='Number of concurrent file readers, between %d and %d (inclusive).'
                             % (min_readers, max_readers))

    args = parser.parse_args(argv)

    # Get absolute directory paths. These matter for the webpage to know where
    # the datastreams came from.
    args.old_dir = os.path.abspath(args.old_dir)
    if args.new_dir:
        args.new_dir = os.path.abspath(args.new_dir)
    if args.write_dir:
        args.write_dir = os.path.abspath(args.write_dir)
        if not os.path.exists(os.path.dirname(args.write_dir)):
            raise ValueError('Error: write directory %s does not exist\n'
                             % os.path.dirname(args.write_dir))

    args.begin = dt.datetime.strptime(args.begin, '%Y%m%d')
    args.end = dt.datetime.strptime(args.end, '%Y%m%d')

    if args.readers < min_readers or args.readers > max_readers:
        raise ValueError('Error: number of readers must be between %d and %d (inclusive).'
                         % (min_readers, max_readers))
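    # Example invocation (hypothetical paths):
    #   ncreview /data/old/sgpmetE13.b1 /data/new/sgpmetE13.b1 -b 20200101 -e 20200131 -t 01-30-00
    # where -t 01-30-00 parses below to 1*3600 + 30*60 + 0 = 5400 seconds.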
    try:
        if args.sample_interval is not None:
            h, m, s = args.sample_interval.split('-')
            args.sample_interval = int(h) * 60 * 60 + int(m) * 60 + int(s)
        elif args.end - args.begin > dt.timedelta(days=10):
            # more than 10 days to process: average over daily samples
            args.sample_interval = 24 * 60 * 60
        else:
            # otherwise average over hourly samples
            args.sample_interval = 60 * 60
    except ValueError:
        raise ValueError('Error: sample interval %s is invalid.\n' % args.sample_interval)

    if args.sample_interval <= 0:
        raise ValueError('Error: sample interval must be a positive number, not '
                         + str(args.sample_interval))

    # Review Data ------------------------------------------------------------

    def is_valid(fname):
        t = utils.file_time(fname)
        return t is not None and args.begin <= t <= args.end

    jdata = None
    if args.new_dir:
        new_files = sorted(filter(is_valid, os.listdir(args.new_dir)))
        old_files = sorted(filter(is_valid, os.listdir(args.old_dir)))

        if not new_files:
            raise RuntimeError(
                args.new_dir + ' contains no netCDF files in the specified time period.')
        if not old_files:
            raise RuntimeError(
                args.old_dir + ' contains no netCDF files in the specified time period.')

        # Get the latest begin and earliest end
        new_times = list(map(utils.file_time, new_files))
        old_times = list(map(utils.file_time, old_files))

        # These strings are hardcoded to match the parsed default dates.
        # If the user passed in begin/end times, show the entire timeline at
        # those dates; otherwise, default to showing only the overlap.
        if str(args.begin) != '0001-01-01 00:00:00':
            args.begin = min(min(new_times), min(old_times)).replace(
                hour=0, minute=0, second=0, microsecond=0)
        else:
            args.begin = max(min(new_times), min(old_times)).replace(
                hour=0, minute=0, second=0, microsecond=0)
        if str(args.end) != '9999-12-31 00:00:00':
            args.end = max(max(new_times), max(old_times)).replace(
                hour=23, minute=59, second=59, microsecond=999999)
        else:
            args.end = min(max(new_times), max(old_times)).replace(
                hour=23, minute=59, second=59, microsecond=999999)

        # Re-filter the files with the new time bounds
        new_files = sorted(filter(is_valid, new_files))
        old_files = sorted(filter(is_valid, old_files))

        if not new_files or not old_files:
            raise RuntimeError('Old and new directories do not appear to have overlapping '
                               'measurement times in the specified time period. '
                               'Cannot determine a comparison interval.')
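        # Worked example with the default --begin/--end: if the old datastream
        # spans 2020-01-05..2020-03-20 and the new one spans
        # 2020-02-01..2020-04-15, the bounds above intersect to
        # 2020-02-01 00:00:00 through 2020-03-20 23:59:59.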
        print('Scanning directories...')
        total_size = 0
        for which, path, files in (('old', args.old_dir, old_files),
                                   ('new', args.new_dir, new_files)):
            for f in files:
                total_size += os.stat('%s/%s' % (path, f)).st_size
        progress_bar = ProgressBar(total_size)

        print('Reading data...')
        old_ds = Datastream(is_plottable(args.old_dir), args.sample_interval)
        new_ds = Datastream(is_plottable(args.new_dir), args.sample_interval)

        progress_bar.start()
        with ProcessPoolExecutor(max_workers=args.readers) as executor:
            for s in executor.map(summarize,
                                  map(lambda f: ('%s/%s' % (args.old_dir, f),
                                                 args.sample_interval,
                                                 args.metadata_only),
                                      old_files)):
                old_ds.add(s)
                progress_bar.update(os.stat(s['path']).st_size)
            for s in executor.map(summarize,
                                  map(lambda f: ('%s/%s' % (args.new_dir, f),
                                                 args.sample_interval,
                                                 args.metadata_only),
                                      new_files)):
                new_ds.add(s)
                progress_bar.update(os.stat(s['path']).st_size)
        progress_bar.complete()

        print('Comparing...')
        dsdiff = DatastreamDiff(old_ds, new_ds)
        jdata = dsdiff.jsonify()

    else:
        path = args.old_dir
        files = sorted(filter(is_valid, os.listdir(path)))
        if not files:
            raise RuntimeError(
                path + ' contains no netCDF files in the specified time period.')

        print('Scanning directory...')
        total_size = 0
        for f in files:
            total_size += os.stat('%s/%s' % (path, f)).st_size
        progress_bar = ProgressBar(total_size)

        print('Reading data...')
        ds = Datastream(is_plottable(path), args.sample_interval)
        progress_bar.start()
        with ProcessPoolExecutor(max_workers=args.readers) as executor:
            for s in executor.map(summarize,
                                  map(lambda f: ('%s/%s' % (path, f),
                                                 args.sample_interval,
                                                 args.metadata_only),
                                      files)):
                ds.add(s)
                progress_bar.update(os.stat(s['path']).st_size)
        progress_bar.complete()
        jdata = ds.jsonify()

    # Write out the data -----------------------------------------------------

    def unique_name(format_str, path):
        '''Produce a unique directory name at the specified path'''
        ID = 1
        while os.path.exists(path + '/' + format_str.format(ID)):
            ID += 1
        return format_str.format(ID)

    wpath = '/data/tmp/ncreview/'
    if args.write_dir is not None:
        wpath = args.write_dir
    if not os.path.exists(wpath):
        os.mkdir(wpath)

    format_str = ''
    if args.name:
        format_str = args.name
        if os.path.exists(wpath + '/' + args.name):
            # if the directory already exists, add a unique id
            format_str += '.{0}'
    elif args.write_dir:
        format_str = '.ncr.' + dt.datetime.now().strftime('%y%m%d.%H%M%S')
        if os.path.exists(wpath + '/' + format_str):
            # if the directory already exists, add a unique id
            format_str += '.{0}'
    else:
        format_str = '%s.%s.{0}' % (os.environ['USER'], os.environ['HOST'])
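    # For example, with --name myrun and an existing 'myrun' directory,
    # unique_name() below tries 'myrun.1', 'myrun.2', ... until one is free.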
    jdata_dir = unique_name(format_str, wpath)
    jdata_path = wpath + '/' + jdata_dir + '/'
    os.mkdir(jdata_path)

    def separate_data(obj, n=1):
        '''Move large data arrays out of the JSON and into numbered csv files.'''
        to_separate = []
        if obj['type'] in ['plot', 'timeline', 'fileTimeline', 'timelineDiff']:
            to_separate = ['data']
        elif obj['type'] in ['plotDiff', 'fileTimelineDiff']:
            to_separate = ['old_data', 'new_data']

        for key in to_separate:
            # Generate a unique csv file name
            while os.path.isfile(jdata_path + 'ncreview.{0}.csv'.format(n)):
                n += 1
            # Write out the data as csv
            with open(jdata_path + 'ncreview.{0}.csv'.format(n), 'w', newline='') as csvfile:
                writer = csv.writer(csvfile, quoting=csv.QUOTE_NONNUMERIC)
                for row in obj[key]:
                    writer.writerow(row)
            # Replace what was the data with a reference to the csv file
            obj[key] = n

        if 'contents' in obj:
            for c in obj['contents']:
                separate_data(c, n)

    separate_data(jdata)

    with open(jdata_path + 'ncreview.json', 'w') as jfile:
        jfile.write(json.dumps(jdata, default=utils.JEncoder))

    # If this script lives under /home/<user>/..., point the URL at that
    # user's development copy of the viewer.
    first_dir, user, *_ = os.path.realpath(__file__).split('/')[1:]
    location = '/~' + user + '/dsutil' if first_dir == 'home' else ''

    url_string = jdata_dir
    if args.write_dir:
        # if writing to a custom location, put the full path in the URL
        url_string = jdata_path

    # Disabled debugging snippet: reads a csv file and prints new/old values
    # that differ.
    """different_times = []
    # the .5.csv is the file that has
    file_path = '/data/tmp/ncreview/' + url_string + '/ncreview.5.csv'
    with open(file_path, newline='') as csvfile:
        reading = csv.reader(csvfile, delimiter=',')
        counter = 0
        for row in reading:
            old, new, begin, end = row  # don't need end
            if counter == 0:
                counter += 1
                continue
            if old != new:
                # date = time.ctime(int(begin))
                date = dt.datetime.fromtimestamp(int(begin)).strftime('%Y-%m-%d')
                print('date: {}\told: {}\tnew: {}\tdifference: {}'.format(
                    date, old, new, int(new) - int(old)))"""

    print("")
    print("Complete! Took %s." % utils.time_diff(time.time() - start))
    print("------------------------------------------------------------------------")
    print('https://engineering.arm.gov' + location + '/ncreview/?' + url_string)
    print("")
    return 0
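# Hypothetical entry point, assuming the module is run directly; the installed
# console script may wrap main() differently:
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))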