Example #1
0
 def save_log(self, path):
     """Convert the server.log file at `path` to a DataFrame and record it.

         Skips files already present in self.history. On success the
         DataFrame is written to self.progress_store under the normalized
         path and a summary entry is appended to self.history, which is
         persisted to self.history_path so an interrupted run can resume.
         Returns None in all cases.
     """
     if path in self.history:
         return
     
     print 'Processing %s' % path,
     start = time.time()
     # load_log is defined elsewhere; returns (header, DataFrame or None).
     header, df = load_log(path, extra=self.extra)
     if df is None:
         print 'Could not process %s' % path
         return
     self.progress_store.put(LogSaver.normalize(path), df)
     load_time = time.time() - start
     
     self.history[path] = {
         'start': df.index[0],        # timestamp of first log entry
         'end': df.index[-1],         # timestamp of last log entry
         'load_time': int(load_time),
         'num': len(df),
         'header': header
     }
     # Persist history after every file so progress survives interruption.
     ObjectDirectory.save_object(self.history_path, self.history)
     del df
     print { k:v for k,v in self.history[path].items() if k != 'header' },
     print '%d of %d' % (len(self.history), len(self.log_list))
Example #2
0
    def __init__(self, store_path, log_list, extra):
        """Set up directory paths and load any existing conversion history.

        store_path: root path handed to ObjectDirectory for all output files
        log_list: iterable of server.log paths; stored sorted as a tuple
        extra: True if log messages and thread ids are to be saved too
        """
        self.directory = ObjectDirectory(store_path)
        self.log_list = tuple(sorted(log_list))
        self.extra = extra

        # Temp files record the progress of an in-flight conversion.
        self.history_path = self.directory.get_path(LogSaver.HISTORY, temp=True)
        self.progress_store_path = self.directory.get_path(LogSaver.PROGRESS, temp=True, is_df=True)
        # Final combined DataFrame destination (name depends on `extra`).
        self.store_path = self.directory.get_path(LogSaver.make_name(LogSaver.FINAL, extra), 
                            is_df=True)
        # Resume from a previous run's history if present; else start empty.
        self.history = ObjectDirectory.load_object(self.history_path, {})
        self.saved = False
Example #3
0
 def check(self):
     history = ObjectDirectory.load_object(self.history_path, {})
     sorted_keys = history.keys()
     sorted_keys.sort(key=lambda k: history[k]['start'])
     print '-' * 80
     print 'Time range by log file'
     for i, path in enumerate(sorted_keys):
         hist = history[path]
         print '%2d: %s  ---  %s : %s' % (i, hist['start'], hist['end'], path)
     
     path0 = sorted_keys[0]
     for path1 in sorted_keys[1:]:
         hist0,hist1 = history[path0],history[path1] 
         assert hist0['end'] < hist1['start'], '''
         -----------
         %s %s
         start: %s
         end  : %s
         -----------
         %s %s
         hist1['start']
         start: %s
         end  : %s
         ''' % (
             path0, hist0, hist0['start'],  hist0['end'],
             path1, hist1, hist1['start'],  hist1['end'])    
def process_dir(param):
    
    hdf_path, dir, n_files, n_entries = param
    print '=' * 80
    print param
    print hdf_path, dir
    print '$' * 80

    directory = ObjectDirectory(hdf_path)
    print directory.get_dir()

    if os.path.exists(directory.get_path('lfl_freq_corr.h5')):
        print '%s exists' %directory.get_path('lfl_freq_corr.h5')
        return True
        
    try:
        shutil.rmtree(directory.get_dir())
    except:
        pass

    path_pattern = os.path.join(dir, 'server.log*')

    if load_logs.load_log_pattern(hdf_path, path_pattern, n_files=n_files):
        preprocess_logs.preprocess(directory, n_entries=n_entries)  
    else:
        try:
            shutil.rmtree(directory.get_dir())
        except:
            pass

    try:
        shutil.rmtree(directory.get_dir(temp=True))
    except:
        pass

    try:    
        os.remove(directory.get_path('logs.h5'))
    except:
        pass

    return True
Example #5
0
class LogSaver:
    """
        self.directory : Directory structure for temp and saved files
        self.log_list : List of server.log files to process
        self.extra : True if log messages and thread ids are to be saved too
        self.history_path : History of server.log conversions saved here
        self.progress_store_path : HDF5 file that holds one DataFrame for each server.log file 
        self.store_path : Final DataFrame of all server.log entries saved here
        self.history : History of server.log conversions
    """

    FINAL = 'logs'
    PROGRESS = 'progress'
    HISTORY = 'history'

    @staticmethod
    def normalize(name):
        return re.sub(r'[^a-zA-Z0-9]', '_', name)
     
    @staticmethod
    def make_name(base_name, extra):
        if extra:
            return base_name + '.extra'
        else:
            return base_name
     
    #@staticmethod
    #def temp_name(log_list, extra):
    #    hsh = hash(log_list)
    #    sgn = 'n' if hsh < 0 else 'p'
    #    temp = 'temp_%s%08X' % (sgn, abs(hsh))
    #    return LogSaver.make_name(temp, extra)    

    def __init__(self, store_path, log_list, extra):
        self.directory = ObjectDirectory(store_path)
        self.log_list = tuple(sorted(log_list))
        self.extra = extra

        self.history_path = self.directory.get_path(LogSaver.HISTORY, temp=True)
        self.progress_store_path = self.directory.get_path(LogSaver.PROGRESS, temp=True, is_df=True)
        self.store_path = self.directory.get_path(LogSaver.make_name(LogSaver.FINAL, extra), 
                            is_df=True)
        self.history = ObjectDirectory.load_object(self.history_path, {})
        self.saved = False
        
    def __repr__(self):
        return '\n'.join('%s: %s' % (k,v) for k,v in self.__dict__.items())
        
    def __str__(self):
        return '\n'.join([repr(self), '%d log files' % len(self.log_list)])    

    def save_all_logs(self, force=False):
         
        if os.path.exists(self.store_path):
            final_store = HDFStore(self.store_path)
            print 'Keys: %s' % final_store
            final_store.close()
            return
        if not force:
            assert not os.path.exists(self.history_path), '''
                %s exists but %s does not.
                There appears to be a conversion in progress.
                -f forces conversion to complete.
            ''' % (self.history_path, self.store_path)
        
        self.directory.make_dir_if_necessary(self.progress_store_path)
        self.progress_store = HDFStore(self.progress_store_path)
        for path in self.log_list:
            self.save_log(path)
        
        self.check()    
        print '--------'
        print 'All tables in %s' % self.progress_store_path
        print self.progress_store.keys()
        print '--------'
        
        def get_log(path):
            try:
                return self.progress_store.get(LogSaver.normalize(path))
            except Exception as e:
                print
                print path
                raise e
               
        
        df_list = [get_log(path) for path in self.log_list]     
        self.progress_store.close()
        print 'Closed %s' % self.progress_store_path
        
        df_all = pd.concat(df_list)
        print 'Final list has %d entries' % len(df_all)
        final_store = HDFStore(self.store_path)
        final_store.put('logs', df_all)
        print 'Keys: %s' % final_store
        final_store.close()
        print 'Closed %s' % self.store_path
        
        # Save the history in a corresponding file
        self.directory.save('history', self.history)
        print 'Saved history'
        
        self.saved = True
        

    def test_store(self):    
        final_store = HDFStore(self.store_path)
        print '----'
        print final_store.keys()
        print '-' * 80
        logs = final_store['/logs']
        print type(logs)
        print len(logs)
        print logs.columns
        final_store.close()

    def cleanup(self): 
        os.remove(self.progress_store_path)
        os.remove(self.history_path)
        
    def delete(self):
        os.remove(self.store_path)

    def save_log(self, path):
        """Return a pandas DataFrame for all the valid log entry lines in log_file
            The index of the DataFrame are the uniqufied timestamps of the log entries
        """
        if path in self.history:
            return
        
        print 'Processing %s' % path,
        start = time.time()
        header, df = load_log(path, extra=self.extra)
        if df is None:
            print 'Could not process %s' % path
            return
        self.progress_store.put(LogSaver.normalize(path), df)
        load_time = time.time() - start
        
        self.history[path] = {
            'start': df.index[0],
            'end': df.index[-1],
            'load_time': int(load_time),
            'num': len(df),
            'header': header
        }
        ObjectDirectory.save_object(self.history_path, self.history)
        del df
        print { k:v for k,v in self.history[path].items() if k != 'header' },
        print '%d of %d' % (len(self.history), len(self.log_list))

    def check(self):
        history = ObjectDirectory.load_object(self.history_path, {})
        sorted_keys = history.keys()
        sorted_keys.sort(key=lambda k: history[k]['start'])
        print '-' * 80
        print 'Time range by log file'
        for i, path in enumerate(sorted_keys):
            hist = history[path]
            print '%2d: %s  ---  %s : %s' % (i, hist['start'], hist['end'], path)
        
        path0 = sorted_keys[0]
        for path1 in sorted_keys[1:]:
            hist0,hist1 = history[path0],history[path1] 
            assert hist0['end'] < hist1['start'], '''
            -----------
            %s %s
            start: %s
            end  : %s
            -----------
            %s %s
            hist1['start']
            start: %s
            end  : %s
            ''' % (
                path0, hist0, hist0['start'],  hist0['end'],
                path1, hist1, hist1['start'],  hist1['end'])