def main(argv):
  pywrapfile.File.Init()

  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  # Collect logs only if active.
  state = install_utilities.install_state(config.var('VERSION'))
  if not state in ['ACTIVE', 'SERVE']:
    sys.exit(0)

  # No collection for sitesearches.
  if config.var('SITESEARCH_INTERFACE'):
    sys.exit(0)

  # If I'm not a config replica I don't collect the logs.
  replicas = config.var('CONFIG_REPLICAS')
  crt_machine = E.getCrtHostName()
  if not crt_machine in replicas:
    logging.error('Not a replica')
    sys.exit(0)

  gws_log_dir = liblog.get_gws_log_dir(config)
  collect_dir = liblog.get_collect_dir(config)
  partition_dir = liblog.get_partition_dir(config)
  apache_dir = liblog.get_apache_dir(config)
  click_dir = liblog.get_click_dir(config)
  directory_map_file = liblog.get_directory_map_file(config)

  # In case the cron job starts before adminrunner.
  liblog.MakeDir(collect_dir)
  liblog.MakeDir(partition_dir)
  liblog.MakeDir(apache_dir)
  liblog.MakeDir(click_dir)

  # Collect logs from machines.
  all_machines = config.var('MACHINES')
  CollectLogs(all_machines, gws_log_dir, collect_dir)

  # Partition gws logs by collections and convert to apache logs.
  preprocess_logs.PartitionLogs(config)

  # Sanitize the collection directory map.
  coll_directory_map = liblog.CollectionDirectoryMap(directory_map_file)
  if coll_directory_map.sanitizeMap(partition_dir, apache_dir, click_dir):
    coll_directory_map.saveToDisk()

  # Send the OP logs to all machines.
  SyncOpLogs(all_machines, config.var('LOGDIR'))

  logging.info('Done')
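# CollectLogs and SyncOpLogs are helpers defined elsewhere in this script.
# As a rough sketch of the collection step only (an assumption, not the
# actual implementation), each gws node's logs end up under a per-machine
# subdirectory of collect_dir, roughly like this:
def collect_logs_sketch(machines, gws_log_dir, collect_dir):
  """Illustrative only: pull gws logs from each node into collect_dir/<node>."""
  import os
  for machine in machines:
    dest = os.path.join(collect_dir, machine)
    liblog.MakeDir(dest)
    # Hypothetical transport; the real helper's command and flags may differ.
    os.system('rsync -a %s:%s/ %s/' % (machine, gws_log_dir, dest))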
def __init__(self, cfg):
  self.cfg = cfg  # configurator object
  self.entConfig = cfg.globalParams

  # locks for updating the report lists
  self.logreplock = threading.RLock()

  self.logdir = self.cfg.getGlobalParam('LOGDIR')

  liblog.MakeDir(liblog.get_click_dir(self.entConfig))
  liblog.MakeDir(liblog.get_collect_dir(self.entConfig))
  liblog.MakeDir(liblog.get_apache_dir(self.entConfig))
  liblog.MakeDir(liblog.get_partition_dir(self.entConfig))
  liblog.MakeGoogleDir(self.entConfig, liblog.get_report_dir(self.entConfig))

  collection_dir_map_file = liblog.get_directory_map_file(self.entConfig)
  if not os.path.exists(collection_dir_map_file):
    open(collection_dir_map_file, 'w').close()  # a trick to touch a file

  self.reportCount = {
      liblog.RAW_REPORT: 0,
      liblog.SUMMARY_REPORT: 0,
  }
  self.sanitizeReportList(liblog.RAW_REPORT)
  self.sanitizeReportList(liblog.SUMMARY_REPORT)

  self.joblock = threading.Lock()
  self.runningJobs = {}
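# A minimal sketch (an assumption, not an actual method of this class) of how
# the locks created above are intended to be used: report-list and
# report-count updates happen under self.logreplock, and job bookkeeping
# happens under self.joblock.
def _updateReportCountSketch(self, reportType, delta):
  """Illustrative only: adjust the in-memory report count under the lock."""
  self.logreplock.acquire()
  try:
    self.reportCount[reportType] = self.reportCount.get(reportType, 0) + delta
  finally:
    self.logreplock.release()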
def babysit(self):
  """Mode under which the service runs when its cron process executes."""
  log_max_size = LOG_MAX_SIZE

  # Delete all converter tmpfiles older than 15 minutes, since we time out a
  # conversion after a few minutes anyway.  The directory is hardcoded like we
  # have done for data and log; long term we should consider running
  # logcontrol_service with additional parameters.
  self.control_files([(".*", 0, 15)], "/mnt/rtcache/converter-tmp")
  self.control_files([(".*", 0, 15)], "%s/converter-tmp" % self.tmpdir)

  # Control files in TMPDIR, /tmp and TMPDIR/oldlogs.
  self.control_files(FILES_IN_ROOTTMPDIR, "/tmp")
  self.control_files(FILES_IN_TMPDIR, self.tmpdir)
  self.control_files(FILES_IN_TMPDIR, "%s/logs/" % self.datadir)
  self.control_files(FILES_IN_TMPDIR, "%s/logs/" % self.ent_home)
  self.control_files(GWS_LOGS_IN_TMPDIR, self.tmpdir)
  self.control_files([(".*", 0, 15)], "%s/oldlogs/" % self.tmpdir)

  # Control the core dump files, which can fill up disks and can be dangerous.
  self.control_files(CORE_DUMP_FILES, "%s/" % self.datadir)
  self.control_files(CORE_DUMP_FILES, "%s/logs/" % self.datadir)
  self.control_files(CORE_DUMP_FILES, "%s/" % self.tmpdir)
  self.control_files(CORE_DUMP_FILES, "%s/logs/" % self.ent_home)
  self.control_files(CORE_DUMP_FILES, "%s/" % self.logdir)
  self.control_files(CORE_DUMP_FILES, "/tmp/")

  # Also control AdminRunner / gems / babysitter / configmgr output files.
  self.control_files(
      [("adminrunner\\.py\\..*", 30, 15),
       ("log_collector_main_", 5, 15),
       ("log_analyzer_alerter_", 5, 15),
       ("babysitter_out_*", 48, 15),      # 12 hrs worth (4/hr)
       ("configmgr_out_*", 20, 15),
       ("fixer_out_*", 20, 15),
       ("py.migration_bot", 20, 15),
       ("batch_crawler_", 5, 15),
       ("ar_profile_*", 100, 60),         # 100 files for 1 hour
       ("snmphelper_", 14, 15),           # 14 files, creates 1/day
      ], self.logdir)

  # There is one of these files per version; we don't want anything more than
  # the current version and the previous version (serve and test).
  self.control_files([("periodic_scriptOut", 2, 60),
                      ("cronLogCollectOut", 2, 60),
                      ("cronSyslogLogOut", 2, 60),
                     ], self.logdir)

  # Keep operator logs for 1 year (assuming 1 per day).
  self.control_files([("AdminRunner\\.OPERATOR\\..*", 365, 15),
                      ("AdminServer\\.ERROR\\..*", 5, 15),
                      ("AdminServer\\.INFO\\..*", 5, 15),
                      ("AdminServer\\.FATAL\\..*", 5, 15),
                     ], self.logdir)

  # Keep at most 50 successful config manager requests for 1 hour.
  self.control_files([("CONFIG_MANAGER_REQUEST_*", 50, 60)],
                     "%s/local/conf/cmr_working/success/" % self.ent_home)
  # Keep at most 50 successful config manager statusz for 1 hour.
  self.control_files([("CONFIG_MANAGER_REQUEST_*", 50, 60)],
                     "%s/local/conf/cmr_working/statusz/" % self.ent_home)
  # Keep at most 50 failed config manager requests for 8 hours.
  self.control_files([("CONFIG_MANAGER_REQUEST_*", 50, 60 * 8)],
                     "%s/local/conf/cmr_working/failed/" % self.ent_home)
  # Keep at most 50 .CONFIG_MANAGER_REQUEST files for 8 hours.
  self.control_files([("\\.CONFIG_MANAGER_REQUEST_*", 50, 60 * 8)],
                     "%s/local/conf/cmr/" % self.ent_home)
  # Keep at most 50 .CONFIG_MANAGER_REQUEST files for 8 hours.
  self.control_files([("\\.CONFIG_MANAGER_REQUEST_*", 50, 60 * 8)],
                     "%s/local/conf/fixer_cmr/" % self.ent_home)

  # Keep a maximum of 5 enterprise onebox log files.
  # Delete them if they are older than 16 days.
  self.control_files([("enterprise_onebox_log\\.from_.*", 5, 60 * 12 * 16)],
                     self.tmpdir)

  # Do not allow more than a maximum of 5 web_log.*.browse files (created when
  # users browse our logs).  Delete them if they are older than 1 hour.
  self.control_files([("web_log\\..*\\.browse", 5, 60)], self.logdir)

  # Do not allow more than a maximum of 5 weblog dump files created when users
  # browse our logs.  Delete them if they are older than 1 day.
  self.control_files([("weblog_dump_.*", 5, 1440)], self.logdir)

  # Do not allow more than a maximum of 5 feedlog browse files created when
  # users browse our logs.  Delete them if they are older than 1 hour.
  self.control_files([("feed_log\\..*\\.browse", 5, 60)], self.logdir)

  # Do not allow more than a maximum of 5 tomcat log files for the connector
  # manager (connectormgr).  Delete them if they are older than 5 days.
  self.control_files(
      FILES_FOR_CONNECTORMGR,
      "%s/local/google/bin/connectormgr-prod/logs" % self.ent_home)
  self.control_files(
      FILES_FOR_CONNECTORMGR,
      "%s/local/google/bin/connectormgr-test/logs" % self.ent_home)

  # Control some files of unknown origin that end up in /var/tmp.
  self.control_files(VAR_TMP_FILES, "/var/tmp/")

  # Control the collected and partitioned gws logs, and the apache logs.
  collect_dir = liblog.get_collect_dir(self.cp)
  partition_dir = liblog.get_partition_dir(self.cp)
  apache_dir = liblog.get_apache_dir(self.cp)
  click_dir = liblog.get_click_dir(self.cp)

  # Delete all bigfile remnants for bigfiles with locks older than one week.
  # (These result from interrupted attempts to delete bigfiles.)
  bigfile_data_dirs = ("/export/hda3/%s/data/enterprise" % self.version,
                       "/export/hdb3/%s/data/enterprise" % self.version)
  self.control_bigfile_locks("%s/data/enterprise-data" % self.ent_home,
                             bigfile_data_dirs, 60 * 24 * 7)

  machines = self.cp.var("MACHINES")
  for machine in machines:
    self.control_files(GWS_LOGS_IN_SAVEDDIR,
                       "%s/%s" % (collect_dir, machine))
    dirs = glob.glob("%s/*/%s" % (partition_dir, machine))
    for dir in dirs:
      self.control_files(GWS_LOGS_IN_SAVEDDIR, dir)
    dirs = glob.glob("%s/*/%s" % (apache_dir, machine))
    for dir in dirs:
      self.control_files(GWS_LOGS_IN_SAVEDDIR, dir)
    dirs = glob.glob("%s/*/%s" % (click_dir, machine))
    for dir in dirs:
      self.control_files(GWS_LOGS_IN_SAVEDDIR, dir)

  # Control the size of apache logs etc.
  self.control_log_files(LOGFILES_CONTROL)

  # Remove distribution packages left behind by vmanager.
  self.control_dirs(VMANAGER_DIRS)

  # Control all log files over MAX_ALLOWED_LOG_SIZE.  We first lower the size
  # limit if the disk is getting full.
  try:
    df_status, df_output = commands.getstatusoutput('df /export/hda3/'
                                                    '| tail -1 ')
    # The fifth df column is Use% (percentage of the disk in use).
    disk_free_percent = int(df_output.split()[4][:-1])
    if disk_free_percent > CRITICAL_DISK_FULL_PERCENT:
      log_max_size = map(lambda (x, y): (x, y / 2), log_max_size)
  except (ValueError, TypeError):
    pass  # we failed to get disk availability
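# The (pattern, count, minutes) triples passed to control_files above are
# handled by a helper defined elsewhere in this service.  A minimal sketch of
# the assumed semantics (keep the newest `count` matching files and delete
# older surplus files once they are at least `minutes` old) is given below;
# the real method may differ in sort order, age handling and logging.
def control_files_sketch(rules, directory):
  """Illustrative only; assumes (pattern, keep_count, min_age_minutes) rules."""
  import os, re, time
  now = time.time()
  for pattern, keep_count, min_age_minutes in rules:
    try:
      names = [n for n in os.listdir(directory) if re.match(pattern, n)]
    except OSError:
      continue  # the directory may not exist on this node
    paths = ['%s/%s' % (directory, n) for n in names]
    paths.sort(key=lambda p: os.path.getmtime(p), reverse=True)  # newest first
    for path in paths[keep_count:]:
      # Only remove surplus files that are also old enough.
      if now - os.path.getmtime(path) > min_age_minutes * 60:
        try:
          os.unlink(path)
        except OSError:
          pass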
def main(argv):
  argc = len(argv)
  if argc < 6:
    sys.exit(__doc__)

  config = entconfig.EntConfig(argv[0])
  if not config.Load():
    sys.exit(__doc__)

  pywrapbase.InitGoogleScript('', ['foo',
                                   '--gfs_aliases=%s' % config.var("GFS_ALIASES"),
                                   '--bnsresolver_use_svelte=false',
                                   '--logtostderr'], 0)
  gfile.Init()

  client = argv[1]
  date_arg = argv[2]
  html_file = argv[3]
  valid_file = argv[4]
  new_valid_file = argv[5]

  # Extract tag and date_range from command line args.
  date_fields = string.split(date_arg, '_')
  date_range = liblog.ParseDateRange(date_fields[0], date_fields[1:])
  if not date_range:
    sys.exit(__doc__)

  first_date, last_date, printable_date, file_date = date_range
  if last_date.as_int() < first_date.as_int():
    sys.exit(__doc__)

  gws_log_dir = liblog.get_gws_log_dir(config)
  click_dir = liblog.get_click_dir(config)
  collect_dir = liblog.get_collect_dir(config)
  apache_dir = liblog.get_apache_dir(config)
  directory_map_file = liblog.get_directory_map_file(config)

  # We need to collect logs from all gws nodes and preprocess them first to
  # make sure the logs are up to date.
  all_machines = config.var('MACHINES')
  collect_logs.CollectLogs(all_machines, gws_log_dir, collect_dir)
  preprocess_logs.PartitionLogs(config)

  # Make a vector of Log objects for all apache_logs and click_logs matching
  # the given date range and client.
  apache_logs = liblog.FindClientLogFiles(apache_dir, directory_map_file,
                                          client, first_date, last_date)
  click_logs = liblog.FindClientLogFiles(click_dir, directory_map_file,
                                         client, first_date, last_date)

  # If we have a valid file and a report file, check whether the data in
  # apache_dir has changed and whether the report is still valid.
  if (gfile.Exists(html_file) and gfile.Exists(valid_file) and
      liblog.checkValid(html_file, valid_file, apache_logs)):
    logging.info('%s still valid.' % html_file)
    sys.exit(liblog.STILL_VALID)

  # If there is no valid report, we create a new one.
  DumpApacheAndClickLogs(apache_logs, click_logs)
  if not liblog.makeValid(new_valid_file, apache_logs):
    logging.error('Error validating %s' % html_file)
    sys.exit(liblog.FAILURE)

  logging.info('done apache_log, new_valid_file: %s' % new_valid_file)
  sys.exit(liblog.SUCCESS)
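# liblog.makeValid and liblog.checkValid are not shown in this file.  As a
# rough sketch of the kind of bookkeeping one would expect (an assumption;
# the real helpers take Log objects and may record different fields), the
# valid file snapshots each input log's name and size, and a report stays
# valid only while none of those have changed.
def make_valid_sketch(valid_file, log_file_names):
  """Illustrative only: snapshot (name, size) pairs for the input log files."""
  import os
  try:
    f = open(valid_file, 'w')
    for name in log_file_names:
      f.write('%s %d\n' % (name, os.path.getsize(name)))
    f.close()
    return 1
  except (IOError, OSError):
    return 0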