def split_log_wrapper(log):
    """Split one log file into one output file per user.

    The log's directory, date and machine are obtained from
    ``awazza.parse_log_name(log)``. Every line is handed to
    ``AwazzaLogRequest`` to extract its ``user``; the raw line is then
    written unchanged to ``<dirpath>/<date>.<machine>.<user>.user``.
    Each user file is opened in ``'w'`` mode the first time that user is
    seen, so an existing file of the same name is truncated.

    Lines that fail to parse are logged and skipped. Any other error raised
    while reading or writing is logged with its traceback and ends the
    split early; it is not re-raised. An exception from
    ``awazza.parse_log_name`` itself is not caught here.

    Args:
        log: Path of the log file to split.

    Returns:
        None.
    """
    dirpath, date, machine, _ = awazza.parse_log_name(log)
    logging.info('Splitting log file (pid=%s): %s\n\t\t\t\t\t\t\t\t\t(Date: %s, Machine: %s)',
                 os.getpid(), log, date, machine)

    # One open handle per user, kept for the duration of this log.
    user_to_fd = {}
    try:
        with open(log, 'r') as logf:
            for line in logf:
                # make an AwazzaLogRequest to handle parsing the user id
                try:
                    user = AwazzaLogRequest(line).user
                except Exception as e:
                    logging.error('Error parsing line: %s\n%s', e, line)
                    continue  # skip this line

                # if we need to open a file for this user, do it
                if user not in user_to_fd:
                    user_path = os.path.join(
                        dirpath, '%s.%s.%s.user' % (date, machine, user))
                    user_to_fd[user] = open(user_path, 'w')

                # write the record to the corresponding user's file
                user_to_fd[user].write(line)
    except Exception as e:
        logging.error('Error splitting log files: %s\n%s',
                      e, traceback.format_exc())
    finally:
        # Close every handle independently: a close() that fails (e.g. the
        # final flush hits a full disk) must not leave the rest open.
        for fd in user_to_fd.values():
            try:
                fd.close()
            except Exception as e:
                logging.error('Error closing user file: %s', e)
def parse_logs(logs):
    """Extract time, user and user agent from each log into per-user files.

    Logs are processed in sorted order. For each one, the directory, date
    and machine come from ``awazza.parse_log_name(log)``. Paths ending in
    ``.gz`` are opened with ``gzip.open``; all others with ``open``.

    Every line is parsed with ``AwazzaLogRequest``. Lines that fail to
    parse are logged and skipped, and requests whose ``response_code`` is
    greater than 400 are ignored. For each remaining request the line
    ``"<ts> <user> <user_agent>\\n"`` is appended to
    ``<dirpath>/<user>.user.fix``.

    Any other exception is logged with its traceback and stops processing
    of the remaining logs; it is not re-raised.

    Args:
        logs: Iterable of log file paths.

    Returns:
        None.
    """
    try:
        for log in sorted(logs):
            dirpath, date, machine, _ = awazza.parse_log_name(log)
            logging.info('Splitting log file (pid=%s): %s\n\t\t\t\t\t\t\t\t\t(Date: %s, Machine: %s)',
                         os.getpid(), log, date, machine)

            # Check if the files are gzip or not and uncompress if needed
            # NOTE(review): 'rb' yields bytes lines on Python 3 -- confirm
            # AwazzaLogRequest accepts them, or switch to text mode.
            if log.endswith('.gz'):
                logf = gzip.open(log, 'rb')
            else:
                logf = open(log, 'r')

            # The context manager guarantees the log handle is closed even
            # when an error inside the loop aborts processing.
            with logf:
                for line in logf:
                    # make an AwazzaLogRequest to handle parsing the user id
                    try:
                        alr = AwazzaLogRequest(line)
                    except Exception as e:
                        logging.error('Error parsing line: %s\n%s', e, line)
                        continue  # skip this line

                    # Ignore bad requests
                    # NOTE(review): '> 400' lets a 400 response through --
                    # confirm whether '>=' was intended.
                    if alr.response_code > 400:
                        continue

                    user = alr.user
                    user_path = os.path.join(dirpath, '%s.user.fix' % user)

                    # Repeatedly opening slows down the process, but prevents
                    # 'too many handles' type errors
                    # write the record to the corresponding user's file
                    with open(user_path, 'a') as userf:
                        # Output time, user, and user agent string only
                        userf.write(str(alr.ts) + ' ' + alr.user + ' ' + alr.user_agent + '\n')
    except Exception as e:
        logging.error('Error splitting log files: %s\n%s',
                      e, traceback.format_exc())