def save_log(self, path): """Return a pandas DataFrame for all the valid log entry lines in log_file The index of the DataFrame are the uniqufied timestamps of the log entries """ if path in self.history: return print 'Processing %s' % path, start = time.time() header, df = load_log(path, extra=self.extra) if df is None: print 'Could not process %s' % path return self.progress_store.put(LogSaver.normalize(path), df) load_time = time.time() - start self.history[path] = { 'start': df.index[0], 'end': df.index[-1], 'load_time': int(load_time), 'num': len(df), 'header': header } ObjectDirectory.save_object(self.history_path, self.history) del df print { k:v for k,v in self.history[path].items() if k != 'header' }, print '%d of %d' % (len(self.history), len(self.log_list))
def __init__(self, store_path, log_list, extra):
    """Prepare paths and conversion history for a batch of server.log files.

    store_path -- root path for the ObjectDirectory that holds all output
    log_list   -- iterable of server.log file paths to convert
    extra      -- True if log messages and thread ids are to be saved too
    """
    self.directory = ObjectDirectory(store_path)
    # Sort for a deterministic processing order.
    self.log_list = tuple(sorted(log_list))
    self.extra = extra
    # Temp files: per-run history and the per-file progress HDF5 store.
    self.history_path = self.directory.get_path(LogSaver.HISTORY, temp=True)
    self.progress_store_path = self.directory.get_path(LogSaver.PROGRESS, temp=True, is_df=True)
    # Final combined DataFrame destination; name varies with `extra`.
    self.store_path = self.directory.get_path(LogSaver.make_name(LogSaver.FINAL, extra), is_df=True)
    # Resume from any previously saved history ({} on first run).
    self.history = ObjectDirectory.load_object(self.history_path, {})
    self.saved = False
def check(self): history = ObjectDirectory.load_object(self.history_path, {}) sorted_keys = history.keys() sorted_keys.sort(key=lambda k: history[k]['start']) print '-' * 80 print 'Time range by log file' for i, path in enumerate(sorted_keys): hist = history[path] print '%2d: %s --- %s : %s' % (i, hist['start'], hist['end'], path) path0 = sorted_keys[0] for path1 in sorted_keys[1:]: hist0,hist1 = history[path0],history[path1] assert hist0['end'] < hist1['start'], ''' ----------- %s %s start: %s end : %s ----------- %s %s hist1['start'] start: %s end : %s ''' % ( path0, hist0, hist0['start'], hist0['end'], path1, hist1, hist1['start'], hist1['end'])
def process_dir(param): hdf_path, dir, n_files, n_entries = param print '=' * 80 print param print hdf_path, dir print '$' * 80 directory = ObjectDirectory(hdf_path) print directory.get_dir() if os.path.exists(directory.get_path('lfl_freq_corr.h5')): print '%s exists' %directory.get_path('lfl_freq_corr.h5') return True try: shutil.rmtree(directory.get_dir()) except: pass path_pattern = os.path.join(dir, 'server.log*') if load_logs.load_log_pattern(hdf_path, path_pattern, n_files=n_files): preprocess_logs.preprocess(directory, n_entries=n_entries) else: try: shutil.rmtree(directory.get_dir()) except: pass try: shutil.rmtree(directory.get_dir(temp=True)) except: pass try: os.remove(directory.get_path('logs.h5')) except: pass return True
class LogSaver:
    """Convert a set of server.log files into a single combined HDF5 DataFrame.

    The conversion is restartable: each file's DataFrame is written to a
    temporary "progress" HDFStore and the conversion history is persisted
    after every file, so an interrupted run resumes where it left off.

    self.directory : Directory structure for temp and saved files
    self.log_list : List of server.log files to process
    self.extra : True if log messages and thread ids are to be saved too
    self.history_path : History of server.log conversions saved here
    self.progress_store_path : HDF5 file that holds one DataFrame for each server.log file
    self.store_path : Final DataFrame of all server.log entries saved here
    self.history : History of server.log conversions
    """

    # Key names used inside the ObjectDirectory / HDF5 stores.
    FINAL = 'logs'
    PROGRESS = 'progress'
    HISTORY = 'history'

    @staticmethod
    def normalize(name):
        # HDFStore keys must be identifier-like: replace every
        # non-alphanumeric character with '_'.
        return re.sub(r'[^a-zA-Z0-9]', '_', name)

    @staticmethod
    def make_name(base_name, extra):
        # Outputs with extra columns get a distinct '.extra' suffix so the
        # two variants can coexist.
        if extra:
            return base_name + '.extra'
        else:
            return base_name

    #@staticmethod
    #def temp_name(log_list, extra):
    #    hsh = hash(log_list)
    #    sgn = 'n' if hsh < 0 else 'p'
    #    temp = 'temp_%s%08X' % (sgn, abs(hsh))
    #    return LogSaver.make_name(temp, extra)

    def __init__(self, store_path, log_list, extra):
        """Prepare paths and conversion history for a batch of server.log files.

        store_path -- root path for the ObjectDirectory that holds all output
        log_list   -- iterable of server.log file paths to convert
        extra      -- True if log messages and thread ids are to be saved too
        """
        self.directory = ObjectDirectory(store_path)
        # Sort for a deterministic processing order.
        self.log_list = tuple(sorted(log_list))
        self.extra = extra
        self.history_path = self.directory.get_path(LogSaver.HISTORY, temp=True)
        self.progress_store_path = self.directory.get_path(LogSaver.PROGRESS, temp=True, is_df=True)
        self.store_path = self.directory.get_path(LogSaver.make_name(LogSaver.FINAL, extra), is_df=True)
        # Resume from any previously saved history ({} on first run).
        self.history = ObjectDirectory.load_object(self.history_path, {})
        self.saved = False

    def __repr__(self):
        # One 'name: value' line per instance attribute.
        return '\n'.join('%s: %s' % (k, v) for k, v in self.__dict__.items())

    def __str__(self):
        return '\n'.join([repr(self), '%d log files' % len(self.log_list)])

    def save_all_logs(self, force=False):
        """Convert every file in self.log_list and combine them into one store.

        If the final store already exists, just print its keys and return.
        Unless `force`, refuse to run when a history file is present without
        a final store (a conversion appears to be in progress).
        """
        if os.path.exists(self.store_path):
            final_store = HDFStore(self.store_path)
            print 'Keys: %s' % final_store
            final_store.close()
            return
        if not force:
            assert not os.path.exists(self.history_path), '''
    %s exists but %s does not.
    There appears to be a conversion in progress.
    -f forces conversion to complete.
    ''' % (self.history_path, self.store_path)
        self.directory.make_dir_if_necessary(self.progress_store_path)
        # Per-file DataFrames accumulate in this temporary store.
        self.progress_store = HDFStore(self.progress_store_path)
        for path in self.log_list:
            self.save_log(path)
        # Validate that the converted files' time ranges do not overlap.
        self.check()
        print '--------'
        print 'All tables in %s' % self.progress_store_path
        print self.progress_store.keys()
        print '--------'

        def get_log(path):
            # Fetch one converted DataFrame; report which path failed
            # before re-raising so the bad file is identifiable.
            try:
                return self.progress_store.get(LogSaver.normalize(path))
            except Exception as e:
                print
                print path
                raise e

        df_list = [get_log(path) for path in self.log_list]
        self.progress_store.close()
        print 'Closed %s' % self.progress_store_path
        # Concatenate every per-file DataFrame into the final one.
        df_all = pd.concat(df_list)
        print 'Final list has %d entries' % len(df_all)
        final_store = HDFStore(self.store_path)
        final_store.put('logs', df_all)
        print 'Keys: %s' % final_store
        final_store.close()
        print 'Closed %s' % self.store_path
        # Save the history in a corresponding file
        self.directory.save('history', self.history)
        print 'Saved history'
        self.saved = True

    def test_store(self):
        """Print a summary of the final store to verify it was written."""
        final_store = HDFStore(self.store_path)
        print '----'
        print final_store.keys()
        print '-' * 80
        logs = final_store['/logs']
        print type(logs)
        print len(logs)
        print logs.columns
        final_store.close()

    def cleanup(self):
        """Remove the temporary progress store and history files."""
        os.remove(self.progress_store_path)
        os.remove(self.history_path)

    def delete(self):
        """Remove the final combined store."""
        os.remove(self.store_path)

    def save_log(self, path):
        """Return a pandas DataFrame for all the valid log entry lines in log_file

        The index of the DataFrame are the uniqufied timestamps of the log
        entries. Files already in self.history are skipped (restartability);
        the history is persisted after every successful file.
        """
        if path in self.history:
            return
        print 'Processing %s' % path,
        start = time.time()
        header, df = load_log(path, extra=self.extra)
        if df is None:
            print 'Could not process %s' % path
            return
        self.progress_store.put(LogSaver.normalize(path), df)
        load_time = time.time() - start
        self.history[path] = {
            'start': df.index[0],
            'end': df.index[-1],
            'load_time': int(load_time),
            'num': len(df),
            'header': header
        }
        # Persist after every file so an interrupted run can resume.
        ObjectDirectory.save_object(self.history_path, self.history)
        del df
        # Print the history entry minus the (potentially large) header.
        print { k:v for k,v in self.history[path].items() if k != 'header' },
        print '%d of %d' % (len(self.history), len(self.log_list))

    def check(self):
        """Print the time range of each converted file and assert no overlap.

        NOTE(review): the loop never advances path0, so it only compares the
        first file's end against each later file's start — overlaps between
        later consecutive pairs are not detected. Verify whether this is
        intentional.
        """
        history = ObjectDirectory.load_object(self.history_path, {})
        sorted_keys = history.keys()
        sorted_keys.sort(key=lambda k: history[k]['start'])
        print '-' * 80
        print 'Time range by log file'
        for i, path in enumerate(sorted_keys):
            hist = history[path]
            print '%2d: %s --- %s : %s' % (i, hist['start'], hist['end'], path)
        path0 = sorted_keys[0]
        for path1 in sorted_keys[1:]:
            hist0, hist1 = history[path0], history[path1]
            assert hist0['end'] < hist1['start'], '''
        -----------
        %s %s
            start: %s
            end  : %s
        -----------
        %s %s hist1['start']
            start: %s
            end  : %s
        ''' % (
            path0, hist0, hist0['start'], hist0['end'],
            path1, hist1, hist1['start'], hist1['end'])