def fsck_mirror(name, config, verbose=False, force=False): global logger logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Running grok-fsck for [%s]' % name) # Lock the tree to make sure we only run one instance logger.debug('Attempting to obtain lock on %s' % config['lock']) flockh = open(config['lock'], 'w') try: lockf(flockh, LOCK_EX | LOCK_NB) except IOError: logger.info('Could not obtain exclusive lock on %s' % config['lock']) logger.info('Assuming another process is running.') return 0 manifest = grokmirror.read_manifest(config['manifest']) if os.path.exists(config['statusfile']): logger.info('Reading status from %s' % config['statusfile']) stfh = open(config['statusfile'], 'r') try: # Format of the status file: # { # '/full/path/to/repository': { # 'lastcheck': 'YYYY-MM-DD' or 'never', # 'nextcheck': 'YYYY-MM-DD', # }, # ... # } status = json.load(stfh) except: # Huai le! logger.critical('Failed to parse %s' % config['statusfile']) lockf(flockh, LOCK_UN) flockh.close() return 1 else: status = {} frequency = int(config['frequency']) today = datetime.datetime.today() workdone = False # Go through the manifest and compare with status for gitdir in manifest.keys(): fullpath = os.path.join(config['toplevel'], gitdir.lstrip('/')) if fullpath not in status.keys(): # Newly added repository # Randomize next check between now and frequency delay = random.randint(0, frequency) nextdate = today + datetime.timedelta(days=delay) nextcheck = nextdate.strftime('%F') status[fullpath] = { 'lastcheck': 'never', 'nextcheck': nextcheck, } logger.info('Added new repository %s with next check on %s' % ( gitdir, nextcheck)) workdone = True # Go through status and queue checks for all the dirs that are due today # (unless --force, which is EVERYTHING) todayiso = today.strftime('%F') for fullpath in status.keys(): # Check to make sure it's still in the manifest gitdir = fullpath.replace(config['toplevel'], '', 1) gitdir = '/' + gitdir.lstrip('/') if gitdir not in manifest.keys(): del status[fullpath] logger.info('Removed %s which is no longer in manifest' % gitdir) continue # If nextcheck is before today, set it to today # XXX: If a system comes up after being in downtime for a while, this # may cause pain for them, so perhaps use randomization here? nextcheck = datetime.datetime.strptime(status[fullpath]['nextcheck'], '%Y-%m-%d') if force or nextcheck <= today: logger.debug('Queueing to check %s' % fullpath) # Calculate elapsed seconds startt = time.time() run_git_fsck(fullpath, config) endt = time.time() status[fullpath]['lastcheck'] = todayiso status[fullpath]['s_elapsed'] = round(endt - startt, 2) if force: # Use randomization for next check, again delay = random.randint(1, frequency) else: delay = frequency nextdate = today + datetime.timedelta(days=delay) status[fullpath]['nextcheck'] = nextdate.strftime('%F') workdone = True # Do quickie checks if 'quick_checks_max_min' in config.keys(): # Convert to seconds for ease of tracking max_time = int(config['quick_checks_max_min']) * 60 logger.debug('max_time=%s' % max_time) if max_time < 60: logger.warning('quick_checks_max_min must be at least 1 minute') max_time = 60 logger.info('Performing quick checks') # Find the smallest s_elapsed not yet checked today # and run a check on it until we either run out of time # or repos to check. total_elapsed_time = 0 quickies_checked = 0 while True: # use this var to track which repo is smallest on s_elapsed least_elapsed = None repo_to_check = None for fullpath in status.keys(): if status[fullpath]['lastcheck'] in ('never', todayiso): # never been checked or checked today, skip continue if 's_elapsed' not in status[fullpath].keys(): # something happened to the s_elapsed entry? continue prev_elapsed = status[fullpath]['s_elapsed'] if total_elapsed_time + prev_elapsed > max_time: # would take us too long to check it, skip continue if least_elapsed is None or prev_elapsed < least_elapsed: least_elapsed = prev_elapsed repo_to_check = fullpath if repo_to_check is None: if quickies_checked == 0: logger.info('No repos qualified for quick checks') else: logger.info('Quick-checked %s repos in %s seconds' % ( quickies_checked, total_elapsed_time)) break # check repo and record the necessary bits startt = time.time() run_git_fsck(repo_to_check, config) endt = time.time() # We don't adjust nextcheck, since it kinda becomes meaningless status[repo_to_check]['lastcheck'] = todayiso status[repo_to_check]['s_elapsed'] = round(endt - startt, 2) total_elapsed_time += status[repo_to_check]['s_elapsed'] quickies_checked += 1 workdone = True # Write out the new status if workdone: logger.info('Writing new status file in %s' % config['statusfile']) stfh = open(config['statusfile'], 'w') json.dump(status, stfh, indent=2) stfh.close() lockf(flockh, LOCK_UN) flockh.close()
def fsck_mirror(name, config, verbose=False, force=False): global logger logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Running grok-fsck for [%s]' % name) # Lock the tree to make sure we only run one instance logger.debug('Attempting to obtain lock on %s' % config['lock']) flockh = open(config['lock'], 'w') try: lockf(flockh, LOCK_EX | LOCK_NB) except IOError: logger.info('Could not obtain exclusive lock on %s' % config['lock']) logger.info('Assuming another process is running.') return 0 manifest = grokmirror.read_manifest(config['manifest']) if os.path.exists(config['statusfile']): logger.info('Reading status from %s' % config['statusfile']) stfh = open(config['statusfile'], 'r') try: # Format of the status file: # { # '/full/path/to/repository': { # 'lastcheck': 'YYYY-MM-DD' or 'never', # 'nextcheck': 'YYYY-MM-DD', # }, # ... # } status = json.load(stfh) except: # Huai le! logger.critical('Failed to parse %s' % config['statusfile']) lockf(flockh, LOCK_UN) flockh.close() return 1 else: status = {} frequency = int(config['frequency']) today = datetime.datetime.today() workdone = False # Go through the manifest and compare with status for gitdir in manifest.keys(): fullpath = os.path.join(config['toplevel'], gitdir.lstrip('/')) if fullpath not in status.keys(): # Newly added repository # Randomize next check between now and frequency delay = random.randint(0, frequency) nextdate = today + datetime.timedelta(days=delay) nextcheck = nextdate.strftime('%F') status[fullpath] = { 'lastcheck': 'never', 'nextcheck': nextcheck, } logger.info('Added new repository %s with next check on %s' % (gitdir, nextcheck)) workdone = True # Go through status and queue checks for all the dirs that are due today # (unless --force, which is EVERYTHING) todayiso = today.strftime('%F') for fullpath in status.keys(): # Check to make sure it's still in the manifest gitdir = fullpath.replace(config['toplevel'], '', 1) gitdir = '/' + gitdir.lstrip('/') if gitdir not in manifest.keys(): del status[fullpath] logger.info('Removed %s which is no longer in manifest' % gitdir) continue # If nextcheck is before today, set it to today # XXX: If a system comes up after being in downtime for a while, this # may cause pain for them, so perhaps use randomization here? nextcheck = datetime.datetime.strptime(status[fullpath]['nextcheck'], '%Y-%m-%d') if force or nextcheck <= today: logger.debug('Queueing to check %s' % fullpath) # Calculate elapsed seconds startt = time.time() run_git_fsck(fullpath, config) endt = time.time() status[fullpath]['lastcheck'] = todayiso status[fullpath]['s_elapsed'] = round(endt - startt, 2) if force: # Use randomization for next check, again delay = random.randint(1, frequency) else: delay = frequency nextdate = today + datetime.timedelta(days=delay) status[fullpath]['nextcheck'] = nextdate.strftime('%F') workdone = True # Do quickie checks if 'quick_checks_max_min' in config.keys(): # Convert to seconds for ease of tracking max_time = int(config['quick_checks_max_min']) * 60 logger.debug('max_time=%s' % max_time) if max_time < 60: logger.warning('quick_checks_max_min must be at least 1 minute') max_time = 60 logger.info('Performing quick checks') # Find the smallest s_elapsed not yet checked today # and run a check on it until we either run out of time # or repos to check. total_elapsed_time = 0 quickies_checked = 0 while True: # use this var to track which repo is smallest on s_elapsed least_elapsed = None repo_to_check = None for fullpath in status.keys(): if status[fullpath]['lastcheck'] in ('never', todayiso): # never been checked or checked today, skip continue if 's_elapsed' not in status[fullpath].keys(): # something happened to the s_elapsed entry? continue prev_elapsed = status[fullpath]['s_elapsed'] if total_elapsed_time + prev_elapsed > max_time: # would take us too long to check it, skip continue if least_elapsed is None or prev_elapsed < least_elapsed: least_elapsed = prev_elapsed repo_to_check = fullpath if repo_to_check is None: if quickies_checked == 0: logger.info('No repos qualified for quick checks') else: logger.info('Quick-checked %s repos in %s seconds' % (quickies_checked, total_elapsed_time)) break # check repo and record the necessary bits startt = time.time() run_git_fsck(repo_to_check, config) endt = time.time() # We don't adjust nextcheck, since it kinda becomes meaningless status[repo_to_check]['lastcheck'] = todayiso status[repo_to_check]['s_elapsed'] = round(endt - startt, 2) total_elapsed_time += status[repo_to_check]['s_elapsed'] quickies_checked += 1 workdone = True # Write out the new status if workdone: logger.info('Writing new status file in %s' % config['statusfile']) stfh = open(config['statusfile'], 'w') json.dump(status, stfh, indent=2) stfh.close() lockf(flockh, LOCK_UN) flockh.close()
if config['manifest'].find('.gz') > 0: fh = gzip.GzipFile(fileobj=StringIO(ufh.read())) else: fh = ufh jdata = fh.read() fh.close() manifest = anyjson.deserialize(jdata) except Exception, ex: logger.warning('Failed to parse %s' % config['manifest']) logger.warning('Error was: %s' % ex) return 1 mymanifest = grokmirror.read_manifest(mymanifest) culled = cull_manifest(manifest, config) to_clone = [] to_pull = [] existing = [] toplevel = config['toplevel'] if not os.access(toplevel, os.W_OK): logger.critical('Toplevel %s does not exist or is not writable' % toplevel) sys.exit(1) if 'pull_threads' in config.keys(): pull_threads = int(config['pull_threads'])
def pull_mirror(name, config, verbose=False, force=False, nomtime=False, verify=False, verify_subpath='*', noreuse=False, purge=False, pretty=False, forcepurge=False): global logger global lock_fails logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Checking [%s]' % name) mymanifest = config['mymanifest'] if verify: logger.info('Verifying mirror against %s' % config['manifest']) nomtime = True if config['manifest'].find('file:///') == 0: manifile = config['manifest'].replace('file://', '') if not os.path.exists(manifile): logger.critical('Remote manifest not found in %s! Quitting!' % config['manifest']) return 1 fstat = os.stat(manifile) last_modified = fstat[8] logger.debug('mtime on %s is: %s' % (manifile, fstat[8])) if os.path.exists(config['mymanifest']): fstat = os.stat(config['mymanifest']) my_last_modified = fstat[8] logger.debug('Our last-modified is: %s' % my_last_modified) if not (force or nomtime) and last_modified <= my_last_modified: logger.info('Manifest file unchanged. Quitting.') return 0 logger.info('Reading new manifest from %s' % manifile) manifest = grokmirror.read_manifest(manifile) # Don't accept empty manifests -- that indicates something is wrong if not len(manifest.keys()): logger.critical('Remote manifest empty or unparseable! Quitting.') return 1 else: # Load it from remote host using http and header magic logger.info('Fetching remote manifest from %s' % config['manifest']) # Do we have username:password@ in the URL? chunks = urlparse.urlparse(config['manifest']) if chunks.netloc.find('@') > 0: logger.debug('Taking username/password from the URL for basic auth') (upass, netloc) = chunks.netloc.split('@') if upass.find(':') > 0: (username, password) = upass.split(':') else: username = upass password = '' manifesturl = config['manifest'].replace(chunks.netloc, netloc) logger.debug('manifesturl=%s' % manifesturl) request = urllib2.Request(manifesturl) password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm() password_mgr.add_password(None, manifesturl, username, password) auth_handler = urllib2.HTTPBasicAuthHandler(password_mgr) opener = urllib2.build_opener(auth_handler) else: request = urllib2.Request(config['manifest']) opener = urllib2.build_opener() # Find out if we need to run at all first if not (force or nomtime) and os.path.exists(mymanifest): fstat = os.stat(mymanifest) mtime = fstat[8] logger.debug('mtime on %s is: %s' % (mymanifest, mtime)) my_last_modified = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(mtime)) logger.debug('Our last-modified is: %s' % my_last_modified) request.add_header('If-Modified-Since', my_last_modified) try: ufh = opener.open(request, timeout=30) except urllib2.HTTPError, ex: if ex.code == 304: logger.info('Server says we have the latest manifest. ' 'Quitting.') return 0 logger.warning('Could not fetch %s' % config['manifest']) logger.warning('Server returned: %s' % ex) return 1 except urllib2.URLError, ex: logger.warning('Could not fetch %s' % config['manifest']) logger.warning('Error was: %s' % ex) return 1
def pull_mirror(name, config, verbose=False, force=False, nomtime=False, verify=False, verify_subpath='*', noreuse=False, purge=False, pretty=False, forcepurge=False): global logger global lock_fails logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Checking [%s]' % name) mymanifest = config['mymanifest'] if verify: logger.info('Verifying mirror against %s' % config['manifest']) nomtime = True if config['manifest'].find('file:///') == 0: manifile = config['manifest'].replace('file://', '') if not os.path.exists(manifile): logger.critical('Remote manifest not found in %s! Quitting!' % config['manifest']) return 1 fstat = os.stat(manifile) last_modified = fstat[8] logger.debug('mtime on %s is: %s' % (manifile, fstat[8])) if os.path.exists(config['mymanifest']): fstat = os.stat(config['mymanifest']) my_last_modified = fstat[8] logger.debug('Our last-modified is: %s' % my_last_modified) if not (force or nomtime) and last_modified <= my_last_modified: logger.info('Manifest file unchanged. Quitting.') return 0 logger.info('Reading new manifest from %s' % manifile) manifest = grokmirror.read_manifest(manifile) # Don't accept empty manifests -- that indicates something is wrong if not len(manifest.keys()): logger.critical('Remote manifest empty or unparseable! Quitting.') return 1 else: # Load it from remote host using http and header magic logger.info('Fetching remote manifest from %s' % config['manifest']) # Do we have username:password@ in the URL? chunks = urlparse.urlparse(config['manifest']) if chunks.netloc.find('@') > 0: logger.debug( 'Taking username/password from the URL for basic auth') (upass, netloc) = chunks.netloc.split('@') if upass.find(':') > 0: (username, password) = upass.split(':') else: username = upass password = '' manifesturl = config['manifest'].replace(chunks.netloc, netloc) logger.debug('manifesturl=%s' % manifesturl) request = urllib2.Request(manifesturl) password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm() password_mgr.add_password(None, manifesturl, username, password) auth_handler = urllib2.HTTPBasicAuthHandler(password_mgr) opener = urllib2.build_opener(auth_handler) else: request = urllib2.Request(config['manifest']) opener = urllib2.build_opener() # Find out if we need to run at all first if not (force or nomtime) and os.path.exists(mymanifest): fstat = os.stat(mymanifest) mtime = fstat[8] logger.debug('mtime on %s is: %s' % (mymanifest, mtime)) my_last_modified = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(mtime)) logger.debug('Our last-modified is: %s' % my_last_modified) request.add_header('If-Modified-Since', my_last_modified) try: ufh = opener.open(request, timeout=30) except urllib2.HTTPError, ex: if ex.code == 304: logger.info('Server says we have the latest manifest. ' 'Quitting.') return 0 logger.warning('Could not fetch %s' % config['manifest']) logger.warning('Server returned: %s' % ex) return 1 except urllib2.URLError, ex: logger.warning('Could not fetch %s' % config['manifest']) logger.warning('Error was: %s' % ex) return 1
def fsck_mirror(name, config, verbose=False, force=False, repack_only=False, conn_only=False, repack_all_quick=False, repack_all_full=False): global logger logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) # noinspection PyTypeChecker em = enlighten.get_manager(series=' -=#') if 'log' in config: ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config: if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) em.enabled = False logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger if conn_only or repack_all_quick or repack_all_full: force = True logger.info('Running grok-fsck for [%s]', name) # Lock the tree to make sure we only run one instance logger.debug('Attempting to obtain lock on %s', config['lock']) flockh = open(config['lock'], 'w') try: lockf(flockh, LOCK_EX | LOCK_NB) except IOError: logger.info('Could not obtain exclusive lock on %s', config['lock']) logger.info('Assuming another process is running.') return 0 manifest = grokmirror.read_manifest(config['manifest']) if os.path.exists(config['statusfile']): logger.info('Reading status from %s', config['statusfile']) stfh = open(config['statusfile'], 'rb') # noinspection PyBroadException try: # Format of the status file: # { # '/full/path/to/repository': { # 'lastcheck': 'YYYY-MM-DD' or 'never', # 'nextcheck': 'YYYY-MM-DD', # 'lastrepack': 'YYYY-MM-DD', # 'fingerprint': 'sha-1', # 's_elapsed': seconds, # 'quick_repack_count': times, # }, # ... # } status = json.loads(stfh.read().decode('utf-8')) except: # Huai le! logger.critical('Failed to parse %s', config['statusfile']) lockf(flockh, LOCK_UN) flockh.close() return 1 else: status = {} if 'frequency' in config: frequency = int(config['frequency']) else: frequency = 30 today = datetime.datetime.today() todayiso = today.strftime('%F') if force: # Use randomization for next check, again checkdelay = random.randint(1, frequency) else: checkdelay = frequency if 'precious' not in config: config['precious'] = 'yes' if 'commitgraph' not in config: config['commitgraph'] = 'yes' # Is our git version new enough to support it? if config['commitgraph'] == 'yes' and not grokmirror.git_newer_than( '2.18.0'): logger.info('Git version too old to support commit graphs, disabling') config['commitgraph'] = 'no' # Go through the manifest and compare with status # noinspection PyTypeChecker e_find = em.counter(total=len(manifest), desc='Discovering:', unit='repos', leave=False) for gitdir in list(manifest): e_find.update() fullpath = os.path.join(config['toplevel'], gitdir.lstrip('/')) if fullpath not in status.keys(): # Newly added repository if not force: # Randomize next check between now and frequency delay = random.randint(0, frequency) nextdate = today + datetime.timedelta(days=delay) nextcheck = nextdate.strftime('%F') else: nextcheck = todayiso status[fullpath] = { 'lastcheck': 'never', 'nextcheck': nextcheck, } logger.info('%s:', fullpath) logger.info(' added : next check on %s', nextcheck) e_find.close() # record newly found repos in the status file logger.debug('Updating status file in %s', config['statusfile']) with open(config['statusfile'], 'wb') as stfh: stfh.write(json.dumps(status, indent=2).encode('utf-8')) # Go through status and find all repos that need work done on them. # This is a dictionary that contains: # full_path_to_repo: # repack: 0, 1, 2 (0-no, 1-needs quick repack, 2-needs full repack) # fsck: 0/1 to_process = {} total_checked = 0 total_elapsed = 0 # noinspection PyTypeChecker e_cmp = em.counter(total=len(status), desc='Analyzing:', unit='repos', leave=False) for fullpath in list(status): e_cmp.update() # Check to make sure it's still in the manifest gitdir = fullpath.replace(config['toplevel'], '', 1) gitdir = '/' + gitdir.lstrip('/') if gitdir not in manifest.keys(): del status[fullpath] logger.debug('%s is gone, no longer in manifest', gitdir) continue needs_repack = needs_prune = needs_fsck = 0 obj_info = get_repo_obj_info(fullpath) try: packs = int(obj_info['packs']) count_loose = int(obj_info['count']) except KeyError: logger.warning('Unable to count objects in %s, skipping' % fullpath) continue schedcheck = datetime.datetime.strptime(status[fullpath]['nextcheck'], '%Y-%m-%d') nextcheck = today + datetime.timedelta(days=checkdelay) if 'repack' not in config or config['repack'] != 'yes': # don't look at me if you turned off repack logger.debug('Not repacking because repack=no in config') needs_repack = 0 elif repack_all_full and (count_loose > 0 or packs > 1): logger.debug('needs_repack=2 due to repack_all_full') needs_repack = 2 elif repack_all_quick and count_loose > 0: logger.debug('needs_repack=1 due to repack_all_quick') needs_repack = 1 elif conn_only: # don't do any repacks if we're running forced connectivity checks, unless # you specifically passed --repack-all-foo logger.debug('needs_repack=0 due to --conn-only') needs_repack = 0 else: # for now, hardcode the maximum loose objects and packs # TODO: we can probably set this in git config values? # I don't think this makes sense as a global setting, because # optimal values will depend on the size of the repo as a whole max_loose_objects = 1200 max_packs = 20 pc_loose_objects = 10 pc_loose_size = 10 # first, compare against max values: if packs >= max_packs: logger.debug('Triggering full repack of %s because packs > %s', fullpath, max_packs) needs_repack = 2 elif count_loose >= max_loose_objects: logger.debug( 'Triggering quick repack of %s because loose objects > %s', fullpath, max_loose_objects) needs_repack = 1 else: # is the number of loose objects or their size more than 10% of # the overall total? in_pack = int(obj_info['in-pack']) size_loose = int(obj_info['size']) size_pack = int(obj_info['size-pack']) total_obj = count_loose + in_pack total_size = size_loose + size_pack # set some arbitrary "worth bothering" limits so we don't # continuously repack tiny repos. if total_obj > 500 and count_loose / total_obj * 100 >= pc_loose_objects: logger.debug( 'Triggering repack of %s because loose objects > %s%% of total', fullpath, pc_loose_objects) needs_repack = 1 elif total_size > 1024 and size_loose / total_size * 100 >= pc_loose_size: logger.debug( 'Triggering repack of %s because loose size > %s%% of total', fullpath, pc_loose_size) needs_repack = 1 if needs_repack > 0 and (config['precious'] == 'always' and check_precious_objects(fullpath)): # if we have preciousObjects, then we only repack based on the same # schedule as fsck. logger.debug('preciousObjects is set') # for repos with preciousObjects, we use the fsck schedule for repacking if schedcheck <= today: logger.debug( 'Time for a full periodic repack of a preciousObjects repo' ) status[fullpath]['nextcheck'] = nextcheck.strftime('%F') needs_repack = 2 else: logger.debug( 'Not repacking preciousObjects repo outside of schedule') needs_repack = 0 # Do we need to fsck it? if not (repack_all_quick or repack_all_full or repack_only): if schedcheck <= today or force: status[fullpath]['nextcheck'] = nextcheck.strftime('%F') needs_fsck = 1 if needs_repack or needs_fsck or needs_prune: # emit a warning if we find garbage in a repo # we do it here so we don't spam people nightly on every cron run, # but only do it when a repo needs actual work done on it if obj_info['garbage'] != '0': logger.warning( '%s:\n\tcontains %s garbage files (garbage-size: %s KiB)', fullpath, obj_info['garbage'], obj_info['size-garbage']) to_process[fullpath] = { 'repack': needs_repack, 'prune': needs_prune, 'fsck': needs_fsck, } e_cmp.close() if not len(to_process): logger.info('No repos need attention.') em.stop() return logger.info('Processing %s repositories', len(to_process)) # noinspection PyTypeChecker run = em.counter(total=len(to_process), desc='Processing:', unit='repos', leave=False) for fullpath, needs in to_process.items(): logger.info('%s:', fullpath) # Calculate elapsed seconds run.refresh() startt = time.time() # Wait till the repo is available and lock it for the duration of checks, # otherwise there may be false-positives if a mirrored repo is updated # in the middle of fsck or repack. grokmirror.lock_repo(fullpath, nonblocking=False) if needs['repack']: if run_git_repack(fullpath, config, needs['repack']): status[fullpath]['lastrepack'] = todayiso if needs['repack'] > 1: status[fullpath]['lastfullrepack'] = todayiso else: logger.warning( 'Repacking %s was unsuccessful, ' 'not running fsck.', fullpath) grokmirror.unlock_repo(fullpath) run.update() continue if needs['prune']: run_git_prune(fullpath, config) if needs['fsck']: run_git_fsck(fullpath, config, conn_only) endt = time.time() status[fullpath]['lastcheck'] = todayiso status[fullpath]['s_elapsed'] = int(endt - startt) logger.info(' done : %ss, next check on %s', status[fullpath]['s_elapsed'], status[fullpath]['nextcheck']) run.update() # We're done with the repo now grokmirror.unlock_repo(fullpath) total_checked += 1 total_elapsed += time.time() - startt # Write status file after each check, so if the process dies, we won't # have to recheck all the repos we've already checked logger.debug('Updating status file in %s', config['statusfile']) with open(config['statusfile'], 'wb') as stfh: stfh.write(json.dumps(status, indent=2).encode('utf-8')) run.close() em.stop() logger.info('Processed %s repos in %0.2fs', total_checked, total_elapsed) with open(config['statusfile'], 'wb') as stfh: stfh.write(json.dumps(status, indent=2).encode('utf-8')) lockf(flockh, LOCK_UN) flockh.close()
def fsck_mirror(name, config, verbose=False, force=False): global logger logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Running grok-fsck for [%s]' % name) # Lock the tree to make sure we only run one instance logger.debug('Attempting to obtain lock on %s' % config['lock']) flockh = open(config['lock'], 'w') try: lockf(flockh, LOCK_EX | LOCK_NB) except IOError: logger.info('Could not obtain exclusive lock on %s' % config['lock']) logger.info('Assuming another process is running.') return 0 manifest = grokmirror.read_manifest(config['manifest']) if os.path.exists(config['statusfile']): logger.info('Reading status from %s' % config['statusfile']) stfh = open(config['statusfile'], 'r') try: # Format of the status file: # { # '/full/path/to/repository': { # 'lastcheck': 'YYYY-MM-DD' or 'never', # 'nextcheck': 'YYYY-MM-DD', # 'lastrepack': 'YYYY-MM-DD', # 'fingerprint': 'sha-1', # 's_elapsed': seconds, # 'quick_repack_count': times, # }, # ... # } status = json.load(stfh) except: # Huai le! logger.critical('Failed to parse %s' % config['statusfile']) lockf(flockh, LOCK_UN) flockh.close() return 1 else: status = {} frequency = int(config['frequency']) today = datetime.datetime.today() # Go through the manifest and compare with status for gitdir in manifest.keys(): fullpath = os.path.join(config['toplevel'], gitdir.lstrip('/')) if fullpath not in status.keys(): # Newly added repository # Randomize next check between now and frequency delay = random.randint(0, frequency) nextdate = today + datetime.timedelta(days=delay) nextcheck = nextdate.strftime('%F') status[fullpath] = { 'lastcheck': 'never', 'nextcheck': nextcheck, } logger.info('Added new repository %s with next check on %s' % ( gitdir, nextcheck)) total_checked = 0 total_elapsed = 0 # Go through status and queue checks for all the dirs that are due today # (unless --force, which is EVERYTHING) todayiso = today.strftime('%F') for fullpath in status.keys(): # Check to make sure it's still in the manifest gitdir = fullpath.replace(config['toplevel'], '', 1) gitdir = '/' + gitdir.lstrip('/') if gitdir not in manifest.keys(): del status[fullpath] logger.info('Removed %s which is no longer in manifest' % gitdir) continue # If nextcheck is before today, set it to today # XXX: If a system comes up after being in downtime for a while, this # may cause pain for them, so perhaps use randomization here? nextcheck = datetime.datetime.strptime(status[fullpath]['nextcheck'], '%Y-%m-%d') if force or nextcheck <= today: logger.debug('Preparing to check %s' % fullpath) # Calculate elapsed seconds startt = time.time() run_git_fsck(fullpath, config) total_checked += 1 # Did the fingerprint change since last time we repacked? oldfpr = None if 'fingerprint' in status[fullpath].keys(): oldfpr = status[fullpath]['fingerprint'] fpr = grokmirror.get_repo_fingerprint(config['toplevel'], gitdir, force=True) if fpr != oldfpr or force: full_repack = False if not 'quick_repack_count' in status[fullpath].keys(): status[fullpath]['quick_repack_count'] = 0 quick_repack_count = status[fullpath]['quick_repack_count'] if 'full_repack_every' in config.keys(): # but did you set 'full_repack_flags' as well? if 'full_repack_flags' not in config.keys(): logger.critical('full_repack_every is set, but not full_repack_flags') else: full_repack_every = int(config['full_repack_every']) # is it anything insane? if full_repack_every < 2: full_repack_every = 2 logger.warning('full_repack_every is too low, forced to 2') # is it time to trigger full repack? # We -1 because if we want a repack every 10th time, then we need to trigger # when current repack count is 9. if quick_repack_count >= full_repack_every-1: logger.debug('Time to do full repack on %s' % fullpath) full_repack = True quick_repack_count = 0 status[fullpath]['lastfullrepack'] = todayiso else: logger.debug('Repack count for %s not yet reached full repack trigger' % fullpath) quick_repack_count += 1 run_git_repack(fullpath, config, full_repack) run_git_prune(fullpath, config, manifest) status[fullpath]['lastrepack'] = todayiso status[fullpath]['quick_repack_count'] = quick_repack_count else: logger.debug('No changes to %s since last run, not repacking' % gitdir) endt = time.time() total_elapsed += endt-startt status[fullpath]['fingerprint'] = fpr status[fullpath]['lastcheck'] = todayiso status[fullpath]['s_elapsed'] = int(endt - startt) if force: # Use randomization for next check, again delay = random.randint(1, frequency) else: delay = frequency nextdate = today + datetime.timedelta(days=delay) status[fullpath]['nextcheck'] = nextdate.strftime('%F') # Write status file after each check, so if the process dies, we won't # have to recheck all the repos we've already checked logger.debug('Updating status file in %s' % config['statusfile']) stfh = open(config['statusfile'], 'w') json.dump(status, stfh, indent=2) stfh.close() if not total_checked: logger.info('No new repos to check.') else: logger.info('Repos checked: %s' % total_checked) logger.info('Total running time: %s s' % int(total_elapsed)) lockf(flockh, LOCK_UN) flockh.close()
def pull_mirror(name, config, opts): global logger global lock_fails logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if opts.verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Checking [%s]' % name) mymanifest = config['mymanifest'] if opts.verify: logger.info('Verifying mirror against %s' % config['manifest']) opts.nomtime = True if config['manifest'].find('file:///') == 0: manifile = config['manifest'].replace('file://', '') if not os.path.exists(manifile): logger.critical('Remote manifest not found in %s! Quitting!' % config['manifest']) return 1 fstat = os.stat(manifile) last_modified = fstat[8] logger.debug('mtime on %s is: %s' % (manifile, fstat[8])) if os.path.exists(config['mymanifest']): fstat = os.stat(config['mymanifest']) my_last_modified = fstat[8] logger.debug('Our last-modified is: %s' % my_last_modified) if not (opts.force or opts.nomtime) and last_modified <= my_last_modified: logger.info('Manifest file unchanged. Quitting.') return 0 logger.info('Reading new manifest from %s' % manifile) manifest = grokmirror.read_manifest(manifile) # Don't accept empty manifests -- that indicates something is wrong if not len(manifest.keys()): logger.critical('Remote manifest empty or unparseable! Quitting.') return 1 else: # Load it from remote host using http and header magic logger.info('Fetching remote manifest from %s' % config['manifest']) request = urllib2.Request(config['manifest']) opener = urllib2.build_opener() # Find out if we need to run at all first if not (opts.force or opts.nomtime) and os.path.exists(mymanifest): fstat = os.stat(mymanifest) mtime = fstat[8] logger.debug('mtime on %s is: %s' % (mymanifest, mtime)) my_last_modified = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(mtime)) logger.debug('Our last-modified is: %s' % my_last_modified) request.add_header('If-Modified-Since', my_last_modified) try: ufh = opener.open(request, timeout=30) except urllib2.HTTPError, ex: if ex.code == 304: logger.info( 'Server says we have the latest manifest. Quitting.') return 0 logger.warning('Could not fetch %s' % config['manifest']) logger.warning('Server returned: %s' % ex) return 1 except urllib2.URLError, ex: logger.warning('Could not fetch %s' % config['manifest']) logger.warning('Error was: %s' % ex) return 1
logger.addHandler(ch) if opts.logfile is not None: ch = logging.FileHandler(opts.logfile) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) ch.setLevel(logging.DEBUG) logger.addHandler(ch) # push our logger into grokmirror to override the default grokmirror.logger = logger grokmirror.manifest_lock(opts.manifile) manifest = grokmirror.read_manifest(opts.manifile, wait=opts.wait) # If manifest is empty, don't use current timestamp if not len(manifest.keys()): opts.usenow = False if opts.remove and len(args): # Remove the repos as required, write new manfiest and exit for fullpath in args: repo = fullpath.replace(opts.toplevel, '', 1) if repo in manifest.keys(): del manifest[repo] logger.info('Repository %s removed from manifest' % repo) else: logger.info('Repository %s not in manifest' % repo)
def pull_mirror(name, config, verbose=False, force=False, nomtime=False, verify=False, verify_subpath='*', noreuse=False, purge=False, pretty=False, forcepurge=False): global logger global lock_fails # noinspection PyTypeChecker em = enlighten.get_manager(series=' -=#') logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) em.enabled = False logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Checking [%s]', name) mymanifest = config['mymanifest'] if verify: logger.info('Verifying mirror against %s', config['manifest']) nomtime = True if config['manifest'].find('file:///') == 0: manifile = config['manifest'].replace('file://', '') if not os.path.exists(manifile): logger.critical('Remote manifest not found in %s! Quitting!', config['manifest']) return 1 fstat = os.stat(manifile) last_modified = fstat[8] logger.debug('mtime on %s is: %s', manifile, fstat[8]) if os.path.exists(config['mymanifest']): fstat = os.stat(config['mymanifest']) my_last_modified = fstat[8] logger.debug('Our last-modified is: %s', my_last_modified) if not (force or nomtime) and last_modified <= my_last_modified: logger.info('Manifest file unchanged. Quitting.') return 0 logger.info('Reading new manifest from %s', manifile) manifest = grokmirror.read_manifest(manifile) # Don't accept empty manifests -- that indicates something is wrong if not len(manifest.keys()): logger.warning('Remote manifest empty or unparseable! Quitting.') return 1 else: # Load it from remote host using http and header magic logger.info('Fetching remote manifest from %s', config['manifest']) # Do we have username:password@ in the URL? chunks = urlparse(config['manifest']) if chunks.netloc.find('@') > 0: logger.debug( 'Taking username/password from the URL for basic auth') (upass, netloc) = chunks.netloc.split('@') if upass.find(':') > 0: (username, password) = upass.split(':') else: username = upass password = '' manifesturl = config['manifest'].replace(chunks.netloc, netloc) logger.debug('manifesturl=%s', manifesturl) request = urllib_request.Request(manifesturl) password_mgr = urllib_request.HTTPPasswordMgrWithDefaultRealm() password_mgr.add_password(None, manifesturl, username, password) auth_handler = urllib_request.HTTPBasicAuthHandler(password_mgr) opener = urllib_request.build_opener(auth_handler) else: request = urllib_request.Request(config['manifest']) opener = urllib_request.build_opener() # Find out if we need to run at all first if not (force or nomtime) and os.path.exists(mymanifest): fstat = os.stat(mymanifest) mtime = fstat[8] logger.debug('mtime on %s is: %s', mymanifest, mtime) my_last_modified = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(mtime)) logger.debug('Our last-modified is: %s', my_last_modified) request.add_header('If-Modified-Since', my_last_modified) try: ufh = opener.open(request, timeout=30) except HTTPError as ex: if ex.code == 304: logger.info('Server says we have the latest manifest. ' 'Quitting.') return 0 logger.warning('Could not fetch %s', config['manifest']) logger.warning('Server returned: %s', ex) return 1 except (URLError, ssl.SSLError, ssl.CertificateError) as ex: logger.warning('Could not fetch %s', config['manifest']) logger.warning('Error was: %s', ex) return 1 last_modified = ufh.headers.get('Last-Modified') last_modified = time.strptime(last_modified, '%a, %d %b %Y %H:%M:%S %Z') last_modified = calendar.timegm(last_modified) # We don't use read_manifest for the remote manifest, as it can be # anything, really. For now, blindly open it with gzipfile if it ends # with .gz. XXX: some http servers will auto-deflate such files. try: if config['manifest'].find('.gz') > 0: fh = gzip.GzipFile(fileobj=BytesIO(ufh.read())) else: fh = ufh jdata = fh.read().decode('utf-8') fh.close() manifest = anyjson.deserialize(jdata) except Exception as ex: logger.warning('Failed to parse %s', config['manifest']) logger.warning('Error was: %s', ex) return 1 mymanifest = grokmirror.read_manifest(mymanifest) culled = cull_manifest(manifest, config) to_clone = [] to_pull = [] existing = [] toplevel = config['toplevel'] if not os.access(toplevel, os.W_OK): logger.critical('Toplevel %s does not exist or is not writable', toplevel) sys.exit(1) if 'pull_threads' in config.keys(): pull_threads = int(config['pull_threads']) if pull_threads < 1: logger.info('pull_threads is less than 1, forcing to 1') pull_threads = 1 else: # be conservative logger.info('pull_threads is not set, consider setting it') pull_threads = 5 # noinspection PyTypeChecker e_cmp = em.counter(total=len(culled), desc='Comparing:', unit='repos', leave=False) for gitdir in list(culled): fullpath = os.path.join(toplevel, gitdir.lstrip('/')) e_cmp.update() # fingerprints were added in later versions, so deal if the upstream # manifest doesn't have a fingerprint if 'fingerprint' not in culled[gitdir]: culled[gitdir]['fingerprint'] = None # Attempt to lock the repo try: grokmirror.lock_repo(fullpath, nonblocking=True) except IOError: logger.info('Could not lock %s, skipping', gitdir) lock_fails.append(gitdir) # Force the fingerprint to what we have in mymanifest, # if we have it. culled[gitdir]['fingerprint'] = None if gitdir in mymanifest and 'fingerprint' in mymanifest[gitdir]: culled[gitdir]['fingerprint'] = mymanifest[gitdir][ 'fingerprint'] if len(lock_fails) >= pull_threads: logger.info('Too many repositories locked (%s). Exiting.', len(lock_fails)) return 0 continue if verify: if culled[gitdir]['fingerprint'] is None: logger.debug('No fingerprint for %s, not verifying', gitdir) grokmirror.unlock_repo(fullpath) continue if not fnmatch.fnmatch(gitdir, verify_subpath): grokmirror.unlock_repo(fullpath) continue logger.debug('Verifying %s', gitdir) if not os.path.exists(fullpath): verify_fails.append(gitdir) logger.info('Verify: %s ABSENT', gitdir) grokmirror.unlock_repo(fullpath) continue my_fingerprint = grokmirror.get_repo_fingerprint(toplevel, gitdir, force=force) if my_fingerprint == culled[gitdir]['fingerprint']: logger.info('Verify: %s OK', gitdir) else: logger.critical('Verify: %s FAILED', gitdir) verify_fails.append(gitdir) grokmirror.unlock_repo(fullpath) continue # Is the directory in place? if os.path.exists(fullpath): # Did grok-fsck request to reclone it? rfile = os.path.join(fullpath, 'grokmirror.reclone') if os.path.exists(rfile): logger.info('Reclone requested for %s:', gitdir) with open(rfile, 'r') as rfh: reason = rfh.read() logger.info(' %s', reason) to_clone.append(gitdir) grokmirror.unlock_repo(fullpath) continue # Fix owner and description, if necessary if gitdir in mymanifest.keys(): # This code is hurky and needs to be cleaned up desc = culled[gitdir].get('description') owner = culled[gitdir].get('owner') ref = None if config['ignore_repo_references'] != 'yes': ref = culled[gitdir].get('reference') # dirty hack to force on-disk owner/description checks # when we're called with -n, in case our manifest # differs from what is on disk for owner/description/alternates myref = None if nomtime: mydesc = None myowner = None else: mydesc = mymanifest[gitdir].get('description') myowner = mymanifest[gitdir].get('owner') if config['ignore_repo_references'] != 'yes': myref = mymanifest[gitdir].get('reference') if myowner is None: myowner = config['default_owner'] if owner is None: owner = config['default_owner'] if desc != mydesc or owner != myowner or ref != myref: # we can do this right away without waiting set_repo_params(toplevel, gitdir, owner, desc, ref) else: # It exists on disk, but not in my manifest? if noreuse: logger.critical('Found existing git repo in %s', fullpath) logger.critical('But you asked NOT to reuse repos') logger.critical('Skipping %s', gitdir) grokmirror.unlock_repo(fullpath) continue logger.info('Setting new origin for %s', gitdir) fix_remotes(gitdir, toplevel, config['site']) to_pull.append(gitdir) grokmirror.unlock_repo(fullpath) continue # fingerprints were added late, so if we don't have them # in the remote manifest, fall back on using timestamps changed = False if culled[gitdir]['fingerprint'] is not None: logger.debug('Will use fingerprints to compare %s', gitdir) my_fingerprint = grokmirror.get_repo_fingerprint(toplevel, gitdir, force=force) if my_fingerprint != culled[gitdir]['fingerprint']: logger.debug('No fingerprint match, will pull %s', gitdir) changed = True else: logger.debug('Fingerprints match, skipping %s', gitdir) else: logger.debug('Will use timestamps to compare %s', gitdir) if force: logger.debug('Will force-pull %s', gitdir) changed = True # set timestamp to 0 as well grokmirror.set_repo_timestamp(toplevel, gitdir, 0) else: ts = grokmirror.get_repo_timestamp(toplevel, gitdir) if ts < culled[gitdir]['modified']: changed = True if changed: to_pull.append(gitdir) grokmirror.unlock_repo(fullpath) continue else: logger.debug('Repo %s unchanged', gitdir) # if we don't have a fingerprint for it, add it now if culled[gitdir]['fingerprint'] is None: fpr = grokmirror.get_repo_fingerprint(toplevel, gitdir) culled[gitdir]['fingerprint'] = fpr existing.append(gitdir) grokmirror.unlock_repo(fullpath) continue else: # Newly incoming repo to_clone.append(gitdir) grokmirror.unlock_repo(fullpath) continue # If we got here, something is odd. # noinspection PyUnreachableCode logger.critical('Could not figure out what to do with %s', gitdir) grokmirror.unlock_repo(fullpath) logger.info('Compared new manifest against %s repositories in %0.2fs', len(culled), e_cmp.elapsed) e_cmp.close() if verify: if len(verify_fails): logger.critical('%s repos failed to verify', len(verify_fails)) return 1 else: logger.info('Verification successful') return 0 hookscript = config['post_update_hook'] if len(to_pull): if len(lock_fails) > 0: pull_threads -= len(lock_fails) # Don't spin up more threads than we need if pull_threads > len(to_pull): pull_threads = len(to_pull) # exit if we're ever at 0 pull_threads. Shouldn't happen, but some extra # precaution doesn't hurt if pull_threads <= 0: logger.info('Too many repositories locked. Exiting.') return 0 logger.info('Will use %d threads to pull repos', pull_threads) # noinspection PyTypeChecker e_pull = em.counter(total=len(to_pull), desc='Updating :', unit='repos', leave=False) logger.info('Updating %s repos from %s', len(to_pull), config['site']) in_queue = Queue() out_queue = Queue() for gitdir in to_pull: in_queue.put((gitdir, culled[gitdir]['fingerprint'], culled[gitdir]['modified'])) for i in range(pull_threads): logger.debug('Spun up thread %s', i) t = PullerThread(in_queue, out_queue, config, i, e_pull) t.setDaemon(True) t.start() # wait till it's all done in_queue.join() logger.info('All threads finished.') while not out_queue.empty(): # see if any of it failed (gitdir, my_fingerprint, status) = out_queue.get() # We always record our fingerprint in our manifest culled[gitdir]['fingerprint'] = my_fingerprint if not status: # To make sure we check this again during next run, # fudge the manifest accordingly. logger.debug('Will recheck %s during next run', gitdir) culled[gitdir] = mymanifest[gitdir] # this is rather hackish, but effective last_modified -= 1 logger.info('Updates completed in %0.2fs', e_pull.elapsed) e_pull.close() else: logger.info('No repositories need updating') # how many lockfiles have we seen? # If there are more lock_fails than there are # pull_threads configured, we skip cloning out of caution if len(to_clone) and len(lock_fails) > pull_threads: logger.info( 'Too many repositories locked. Skipping cloning new repos.') to_clone = [] if len(to_clone): # noinspection PyTypeChecker e_clone = em.counter(total=len(to_clone), desc='Cloning :', unit='repos', leave=False) logger.info('Cloning %s repos from %s', len(to_clone), config['site']) # we use "existing" to track which repos can be used as references existing.extend(to_pull) to_clone_sorted = [] clone_order(to_clone, manifest, to_clone_sorted, existing) for gitdir in to_clone_sorted: e_clone.refresh() fullpath = os.path.join(toplevel, gitdir.lstrip('/')) # Did grok-fsck request to reclone it? rfile = os.path.join(fullpath, 'grokmirror.reclone') if os.path.exists(rfile): logger.debug('Removing %s for reclone', gitdir) shutil.move(fullpath, '%s.reclone' % fullpath) shutil.rmtree('%s.reclone' % fullpath) # Do we still need to clone it, or has another process # already done this for us? ts = grokmirror.get_repo_timestamp(toplevel, gitdir) if ts > 0: logger.debug('Looks like %s already cloned, skipping', gitdir) e_clone.update() continue try: grokmirror.lock_repo(fullpath, nonblocking=True) except IOError: logger.info('Could not lock %s, skipping', gitdir) lock_fails.append(gitdir) e_clone.update() continue reference = None if config['ignore_repo_references'] != 'yes': reference = culled[gitdir]['reference'] if reference is not None and reference in existing: # Make sure we can lock the reference repo refrepo = os.path.join(toplevel, reference.lstrip('/')) try: grokmirror.lock_repo(refrepo, nonblocking=True) success = clone_repo(toplevel, gitdir, config['site'], reference=reference) grokmirror.unlock_repo(refrepo) except IOError: logger.info('Cannot lock reference repo %s, skipping %s', reference, gitdir) if reference not in lock_fails: lock_fails.append(reference) grokmirror.unlock_repo(fullpath) e_clone.update() continue else: success = clone_repo(toplevel, gitdir, config['site']) # check dir to make sure cloning succeeded and then add to existing if os.path.exists(fullpath) and success: logger.debug('Cloning of %s succeeded, adding to existing', gitdir) existing.append(gitdir) desc = culled[gitdir].get('description') owner = culled[gitdir].get('owner') ref = culled[gitdir].get('reference') if owner is None: owner = config['default_owner'] set_repo_params(toplevel, gitdir, owner, desc, ref) set_agefile(toplevel, gitdir, culled[gitdir]['modified']) my_fingerprint = grokmirror.set_repo_fingerprint( toplevel, gitdir) culled[gitdir]['fingerprint'] = my_fingerprint run_post_update_hook(hookscript, toplevel, gitdir) else: logger.warning('Was not able to clone %s', gitdir) # Remove it from our manifest so we can try re-cloning # next time grok-pull runs del culled[gitdir] git_fails.append(gitdir) grokmirror.unlock_repo(fullpath) e_clone.update() logger.info('Clones completed in %0.2fs' % e_clone.elapsed) e_clone.close() else: logger.info('No repositories need cloning') # loop through all entries and find any symlinks we need to set # We also collect all symlinks to do purging correctly symlinks = [] for gitdir in culled.keys(): if 'symlinks' in culled[gitdir].keys(): source = os.path.join(config['toplevel'], gitdir.lstrip('/')) for symlink in culled[gitdir]['symlinks']: if symlink not in symlinks: symlinks.append(symlink) target = os.path.join(config['toplevel'], symlink.lstrip('/')) if os.path.exists(source): if os.path.islink(target): # are you pointing to where we need you? if os.path.realpath(target) != source: # Remove symlink and recreate below logger.debug('Removed existing wrong symlink %s', target) os.unlink(target) elif os.path.exists(target): logger.warning('Deleted repo %s, because it is now' ' a symlink to %s' % (target, source)) shutil.rmtree(target) # Here we re-check if we still need to do anything if not os.path.exists(target): logger.info('Symlinking %s -> %s', target, source) # Make sure the leading dirs are in place if not os.path.exists(os.path.dirname(target)): os.makedirs(os.path.dirname(target)) os.symlink(source, target) manifile = config['mymanifest'] grokmirror.manifest_lock(manifile) # Is the local manifest newer than last_modified? That would indicate # that another process has run and "culled" is no longer the latest info if os.path.exists(manifile): fstat = os.stat(manifile) if fstat[8] > last_modified: logger.info('Local manifest is newer, not saving.') grokmirror.manifest_unlock(manifile) return 0 if purge: to_purge = [] found_repos = 0 for founddir in grokmirror.find_all_gitdirs(config['toplevel']): gitdir = founddir.replace(config['toplevel'], '') found_repos += 1 if gitdir not in culled.keys() and gitdir not in symlinks: to_purge.append(founddir) if len(to_purge): # Purge-protection engage try: purge_limit = int(config['purgeprotect']) assert 1 <= purge_limit <= 99 except (ValueError, AssertionError): logger.critical('Warning: "%s" is not valid for purgeprotect.', config['purgeprotect']) logger.critical('Please set to a number between 1 and 99.') logger.critical('Defaulting to purgeprotect=5.') purge_limit = 5 purge_pc = len(to_purge) * 100 / found_repos logger.debug('purgeprotect=%s', purge_limit) logger.debug('purge prercentage=%s', purge_pc) if not forcepurge and purge_pc >= purge_limit: logger.critical('Refusing to purge %s repos (%s%%)', len(to_purge), purge_pc) logger.critical('Set purgeprotect to a higher percentage, or' ' override with --force-purge.') logger.info('Not saving local manifest') return 1 else: # noinspection PyTypeChecker e_purge = em.counter(total=len(to_purge), desc='Purging :', unit='repos', leave=False) for founddir in to_purge: e_purge.refresh() if os.path.islink(founddir): logger.info('Removing unreferenced symlink %s', gitdir) os.unlink(founddir) else: # is anything using us for alternates? gitdir = '/' + os.path.relpath(founddir, toplevel).lstrip('/') if grokmirror.is_alt_repo(toplevel, gitdir): logger.info( 'Not purging %s because it is used by ' 'other repos via alternates', founddir) else: try: logger.info('Purging %s', founddir) grokmirror.lock_repo(founddir, nonblocking=True) shutil.rmtree(founddir) except IOError: lock_fails.append(gitdir) logger.info('%s is locked, not purging', gitdir) e_purge.update() logger.info('Purging completed in %0.2fs', e_purge.elapsed) e_purge.close() else: logger.info('No repositories need purging') # Done with progress bars em.stop() # Go through all repos in culled and get the latest local timestamps. for gitdir in culled: ts = grokmirror.get_repo_timestamp(toplevel, gitdir) culled[gitdir]['modified'] = ts # If there were any lock failures, we fudge last_modified to always # be older than the server, which will force the next grokmirror run. if len(lock_fails): logger.info('%s repos could not be locked. Forcing next run.', len(lock_fails)) last_modified -= 1 elif len(git_fails): logger.info('%s repos failed. Forcing next run.', len(git_fails)) last_modified -= 1 # Once we're done, save culled as our new manifest grokmirror.write_manifest(manifile, culled, mtime=last_modified, pretty=pretty) grokmirror.manifest_unlock(manifile) # write out projects.list, if asked to write_projects_list(culled, config) return 127
def fsck_mirror(name, config, verbose=False, force=False, repack_only=False, conn_only=False, repack_all_quick=False, repack_all_full=False): global logger logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) # noinspection PyTypeChecker em = enlighten.get_manager(series=' -=#') if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) em.enabled = False logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger if conn_only or repack_all_quick or repack_all_full: force = True logger.info('Running grok-fsck for [%s]', name) # Lock the tree to make sure we only run one instance logger.debug('Attempting to obtain lock on %s', config['lock']) flockh = open(config['lock'], 'w') try: lockf(flockh, LOCK_EX | LOCK_NB) except IOError: logger.info('Could not obtain exclusive lock on %s', config['lock']) logger.info('Assuming another process is running.') return 0 manifest = grokmirror.read_manifest(config['manifest']) if os.path.exists(config['statusfile']): logger.info('Reading status from %s', config['statusfile']) stfh = open(config['statusfile'], 'rb') # noinspection PyBroadException try: # Format of the status file: # { # '/full/path/to/repository': { # 'lastcheck': 'YYYY-MM-DD' or 'never', # 'nextcheck': 'YYYY-MM-DD', # 'lastrepack': 'YYYY-MM-DD', # 'fingerprint': 'sha-1', # 's_elapsed': seconds, # 'quick_repack_count': times, # }, # ... # } status = json.loads(stfh.read().decode('utf-8')) except: # Huai le! logger.critical('Failed to parse %s', config['statusfile']) lockf(flockh, LOCK_UN) flockh.close() return 1 else: status = {} if 'frequency' in config: frequency = int(config['frequency']) else: frequency = 30 today = datetime.datetime.today() todayiso = today.strftime('%F') if force: # Use randomization for next check, again checkdelay = random.randint(1, frequency) else: checkdelay = frequency # Go through the manifest and compare with status # noinspection PyTypeChecker e_find = em.counter(total=len(manifest), desc='Discovering:', unit='repos', leave=False) for gitdir in list(manifest): e_find.update() fullpath = os.path.join(config['toplevel'], gitdir.lstrip('/')) if fullpath not in status.keys(): # Newly added repository if not force: # Randomize next check between now and frequency delay = random.randint(0, frequency) nextdate = today + datetime.timedelta(days=delay) nextcheck = nextdate.strftime('%F') else: nextcheck = todayiso status[fullpath] = { 'lastcheck': 'never', 'nextcheck': nextcheck, } logger.info('%s:', fullpath) logger.info(' added : next check on %s', nextcheck) e_find.close() # record newly found repos in the status file logger.debug('Updating status file in %s', config['statusfile']) with open(config['statusfile'], 'wb') as stfh: stfh.write(json.dumps(status, indent=2).encode('utf-8')) # Go through status and find all repos that need work done on them. # This is a dictionary that contains: # full_path_to_repo: # repack: 0, 1, 2 (0-no, 1-needs quick repack, 2-needs full repack) # fsck: 0/1 to_process = {} total_checked = 0 total_elapsed = 0 # noinspection PyTypeChecker e_cmp = em.counter(total=len(status), desc='Analyzing:', unit='repos', leave=False) for fullpath in list(status): e_cmp.update() # Check to make sure it's still in the manifest gitdir = fullpath.replace(config['toplevel'], '', 1) gitdir = '/' + gitdir.lstrip('/') if gitdir not in manifest.keys(): del status[fullpath] logger.debug('%s is gone, no longer in manifest', gitdir) continue needs_repack = needs_prune = needs_fsck = 0 obj_info = get_repo_obj_info(fullpath) try: packs = int(obj_info['packs']) count_loose = int(obj_info['count']) except KeyError: logger.warning('Unable to count objects in %s, skipping' % fullpath) continue schedcheck = datetime.datetime.strptime(status[fullpath]['nextcheck'], '%Y-%m-%d') nextcheck = today + datetime.timedelta(days=checkdelay) if 'repack' not in config.keys() or config['repack'] != 'yes': # don't look at me if you turned off repack logger.debug('Not repacking because repack=no in config') needs_repack = 0 elif repack_all_full and (count_loose > 0 or packs > 1): logger.debug('needs_repack=2 due to repack_all_full') needs_repack = 2 elif repack_all_quick and count_loose > 0: logger.debug('needs_repack=1 due to repack_all_quick') needs_repack = 1 elif conn_only: # don't do any repacks if we're running forced connectivity checks, unless # you specifically passed --repack-all-foo logger.debug('needs_repack=0 due to --conn-only') needs_repack = 0 else: # for now, hardcode the maximum loose objects and packs # TODO: we can probably set this in git config values? # I don't think this makes sense as a global setting, because # optimal values will depend on the size of the repo as a whole max_loose_objects = 1200 max_packs = 20 pc_loose_objects = 10 pc_loose_size = 10 # first, compare against max values: if packs >= max_packs: logger.debug('Triggering full repack of %s because packs > %s', fullpath, max_packs) needs_repack = 2 elif count_loose >= max_loose_objects: logger.debug('Triggering quick repack of %s because loose objects > %s', fullpath, max_loose_objects) needs_repack = 1 else: # is the number of loose objects or their size more than 10% of # the overall total? in_pack = int(obj_info['in-pack']) size_loose = int(obj_info['size']) size_pack = int(obj_info['size-pack']) total_obj = count_loose + in_pack total_size = size_loose + size_pack # set some arbitrary "worth bothering" limits so we don't # continuously repack tiny repos. if total_obj > 500 and count_loose/total_obj*100 >= pc_loose_objects: logger.debug('Triggering repack of %s because loose objects > %s%% of total', fullpath, pc_loose_objects) needs_repack = 1 elif total_size > 1024 and size_loose/total_size*100 >= pc_loose_size: logger.debug('Triggering repack of %s because loose size > %s%% of total', fullpath, pc_loose_size) needs_repack = 1 if needs_repack > 0 and check_precious_objects(fullpath): # if we have preciousObjects, then we only repack based on the same # schedule as fsck. logger.debug('preciousObjects is set') # for repos with preciousObjects, we use the fsck schedule for repacking if schedcheck <= today: logger.debug('Time for a full periodic repack of a preciousObjects repo') status[fullpath]['nextcheck'] = nextcheck.strftime('%F') needs_repack = 2 else: logger.debug('Not repacking preciousObjects repo outside of schedule') needs_repack = 0 # Do we need to fsck it? if not (repack_all_quick or repack_all_full or repack_only): if schedcheck <= today or force: status[fullpath]['nextcheck'] = nextcheck.strftime('%F') needs_fsck = 1 if needs_repack or needs_fsck or needs_prune: # emit a warning if we find garbage in a repo # we do it here so we don't spam people nightly on every cron run, # but only do it when a repo needs actual work done on it if obj_info['garbage'] != '0': logger.warning('%s:\n\tcontains %s garbage files (garbage-size: %s KiB)', fullpath, obj_info['garbage'], obj_info['size-garbage']) to_process[fullpath] = { 'repack': needs_repack, 'prune': needs_prune, 'fsck': needs_fsck, } e_cmp.close() if not len(to_process): logger.info('No repos need attention.') em.stop() return logger.info('Processing %s repositories', len(to_process)) # noinspection PyTypeChecker run = em.counter(total=len(to_process), desc='Processing:', unit='repos', leave=False) for fullpath, needs in to_process.items(): logger.info('%s:', fullpath) # Calculate elapsed seconds run.refresh() startt = time.time() # Wait till the repo is available and lock it for the duration of checks, # otherwise there may be false-positives if a mirrored repo is updated # in the middle of fsck or repack. grokmirror.lock_repo(fullpath, nonblocking=False) if needs['repack']: if run_git_repack(fullpath, config, needs['repack']): status[fullpath]['lastrepack'] = todayiso if needs['repack'] > 1: status[fullpath]['lastfullrepack'] = todayiso else: logger.warning('Repacking %s was unsuccessful, ' 'not running fsck.', fullpath) grokmirror.unlock_repo(fullpath) continue if needs['prune']: run_git_prune(fullpath, config) if needs['fsck']: run_git_fsck(fullpath, config, conn_only) endt = time.time() status[fullpath]['lastcheck'] = todayiso status[fullpath]['s_elapsed'] = int(endt-startt) logger.info(' done : %ss, next check on %s', status[fullpath]['s_elapsed'], status[fullpath]['nextcheck']) run.update() # We're done with the repo now grokmirror.unlock_repo(fullpath) total_checked += 1 total_elapsed += time.time()-startt # Write status file after each check, so if the process dies, we won't # have to recheck all the repos we've already checked logger.debug('Updating status file in %s', config['statusfile']) with open(config['statusfile'], 'wb') as stfh: stfh.write(json.dumps(status, indent=2).encode('utf-8')) run.close() em.stop() logger.info('Processed %s repos in %0.2fs', total_checked, total_elapsed) with open(config['statusfile'], 'wb') as stfh: stfh.write(json.dumps(status, indent=2).encode('utf-8')) lockf(flockh, LOCK_UN) flockh.close()
# push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Running grok-fsck for [%s]' % name) # Lock the tree to make sure we only run one instance logger.debug('Attempting to obtain lock on %s' % config['lock']) flockh = open(config['lock'], 'w') try: lockf(flockh, LOCK_EX | LOCK_NB) except IOError, ex: logger.info('Could not obtain exclusive lock on %s' % config['lock']) logger.info('Assuming another process is running.') return 0 manifest = grokmirror.read_manifest(config['manifest']) if os.path.exists(config['statusfile']): logger.info('Reading status from %s' % config['statusfile']) stfh = open(config['statusfile'], 'r') try: # Format of the status file: # { # '/full/path/to/repository': { # 'lastcheck': 'YYYY-MM-DD' or 'never', # 'nextcheck': 'YYYY-MM-DD', # }, # ... # } status = json.load(stfh)
def pull_mirror(name, config, verbose=False, force=False, nomtime=False, verify=False, verify_subpath='*', noreuse=False, purge=False, pretty=False, forcepurge=False): global logger global lock_fails # noinspection PyTypeChecker em = enlighten.get_manager(series=' -=#') logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) em.enabled = False logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Checking [%s]', name) mymanifest = config['mymanifest'] if verify: logger.info('Verifying mirror against %s', config['manifest']) nomtime = True if config['manifest'].find('file:///') == 0: manifile = config['manifest'].replace('file://', '') if not os.path.exists(manifile): logger.critical('Remote manifest not found in %s! Quitting!', config['manifest']) return 1 fstat = os.stat(manifile) last_modified = fstat[8] logger.debug('mtime on %s is: %s', manifile, fstat[8]) if os.path.exists(config['mymanifest']): fstat = os.stat(config['mymanifest']) my_last_modified = fstat[8] logger.debug('Our last-modified is: %s', my_last_modified) if not (force or nomtime) and last_modified <= my_last_modified: logger.info('Manifest file unchanged. Quitting.') return 0 logger.info('Reading new manifest from %s', manifile) manifest = grokmirror.read_manifest(manifile) # Don't accept empty manifests -- that indicates something is wrong if not len(manifest.keys()): logger.warning('Remote manifest empty or unparseable! Quitting.') return 1 else: # Load it from remote host using http and header magic logger.info('Fetching remote manifest from %s', config['manifest']) # Do we have username:password@ in the URL? chunks = urlparse(config['manifest']) if chunks.netloc.find('@') > 0: logger.debug('Taking username/password from the URL for basic auth') (upass, netloc) = chunks.netloc.split('@') if upass.find(':') > 0: (username, password) = upass.split(':') else: username = upass password = '' manifesturl = config['manifest'].replace(chunks.netloc, netloc) logger.debug('manifesturl=%s', manifesturl) request = urllib_request.Request(manifesturl) password_mgr = urllib_request.HTTPPasswordMgrWithDefaultRealm() password_mgr.add_password(None, manifesturl, username, password) auth_handler = urllib_request.HTTPBasicAuthHandler(password_mgr) opener = urllib_request.build_opener(auth_handler) else: request = urllib_request.Request(config['manifest']) opener = urllib_request.build_opener() # Find out if we need to run at all first if not (force or nomtime) and os.path.exists(mymanifest): fstat = os.stat(mymanifest) mtime = fstat[8] logger.debug('mtime on %s is: %s', mymanifest, mtime) my_last_modified = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(mtime)) logger.debug('Our last-modified is: %s', my_last_modified) request.add_header('If-Modified-Since', my_last_modified) try: ufh = opener.open(request, timeout=30) except HTTPError as ex: if ex.code == 304: logger.info('Server says we have the latest manifest. ' 'Quitting.') return 0 logger.warning('Could not fetch %s', config['manifest']) logger.warning('Server returned: %s', ex) return 1 except (URLError, ssl.SSLError, ssl.CertificateError) as ex: logger.warning('Could not fetch %s', config['manifest']) logger.warning('Error was: %s', ex) return 1 last_modified = ufh.headers.get('Last-Modified') last_modified = time.strptime(last_modified, '%a, %d %b %Y %H:%M:%S %Z') last_modified = calendar.timegm(last_modified) # We don't use read_manifest for the remote manifest, as it can be # anything, really. For now, blindly open it with gzipfile if it ends # with .gz. XXX: some http servers will auto-deflate such files. try: if config['manifest'].find('.gz') > 0: fh = gzip.GzipFile(fileobj=BytesIO(ufh.read())) else: fh = ufh jdata = fh.read().decode('utf-8') fh.close() manifest = anyjson.deserialize(jdata) except Exception as ex: logger.warning('Failed to parse %s', config['manifest']) logger.warning('Error was: %s', ex) return 1 mymanifest = grokmirror.read_manifest(mymanifest) culled = cull_manifest(manifest, config) to_clone = [] to_pull = [] existing = [] toplevel = config['toplevel'] if not os.access(toplevel, os.W_OK): logger.critical('Toplevel %s does not exist or is not writable', toplevel) sys.exit(1) if 'pull_threads' in config.keys(): pull_threads = int(config['pull_threads']) if pull_threads < 1: logger.info('pull_threads is less than 1, forcing to 1') pull_threads = 1 else: # be conservative logger.info('pull_threads is not set, consider setting it') pull_threads = 5 # noinspection PyTypeChecker e_cmp = em.counter(total=len(culled), desc='Comparing:', unit='repos', leave=False) for gitdir in list(culled): fullpath = os.path.join(toplevel, gitdir.lstrip('/')) e_cmp.update() # fingerprints were added in later versions, so deal if the upstream # manifest doesn't have a fingerprint if 'fingerprint' not in culled[gitdir]: culled[gitdir]['fingerprint'] = None # Attempt to lock the repo try: grokmirror.lock_repo(fullpath, nonblocking=True) except IOError: logger.info('Could not lock %s, skipping', gitdir) lock_fails.append(gitdir) # Force the fingerprint to what we have in mymanifest, # if we have it. culled[gitdir]['fingerprint'] = None if gitdir in mymanifest and 'fingerprint' in mymanifest[gitdir]: culled[gitdir]['fingerprint'] = mymanifest[gitdir][ 'fingerprint'] if len(lock_fails) >= pull_threads: logger.info('Too many repositories locked (%s). Exiting.', len(lock_fails)) return 0 continue if verify: if culled[gitdir]['fingerprint'] is None: logger.debug('No fingerprint for %s, not verifying', gitdir) grokmirror.unlock_repo(fullpath) continue if not fnmatch.fnmatch(gitdir, verify_subpath): grokmirror.unlock_repo(fullpath) continue logger.debug('Verifying %s', gitdir) if not os.path.exists(fullpath): verify_fails.append(gitdir) logger.info('Verify: %s ABSENT', gitdir) grokmirror.unlock_repo(fullpath) continue my_fingerprint = grokmirror.get_repo_fingerprint( toplevel, gitdir, force=force) if my_fingerprint == culled[gitdir]['fingerprint']: logger.info('Verify: %s OK', gitdir) else: logger.critical('Verify: %s FAILED', gitdir) verify_fails.append(gitdir) grokmirror.unlock_repo(fullpath) continue # Is the directory in place? if os.path.exists(fullpath): # Did grok-fsck request to reclone it? rfile = os.path.join(fullpath, 'grokmirror.reclone') if os.path.exists(rfile): logger.info('Reclone requested for %s:', gitdir) with open(rfile, 'r') as rfh: reason = rfh.read() logger.info(' %s', reason) to_clone.append(gitdir) grokmirror.unlock_repo(fullpath) continue # Fix owner and description, if necessary if gitdir in mymanifest.keys(): # This code is hurky and needs to be cleaned up desc = culled[gitdir].get('description') owner = culled[gitdir].get('owner') ref = None if config['ignore_repo_references'] != 'yes': ref = culled[gitdir].get('reference') # dirty hack to force on-disk owner/description checks # when we're called with -n, in case our manifest # differs from what is on disk for owner/description/alternates myref = None if nomtime: mydesc = None myowner = None else: mydesc = mymanifest[gitdir].get('description') myowner = mymanifest[gitdir].get('owner') if config['ignore_repo_references'] != 'yes': myref = mymanifest[gitdir].get('reference') if myowner is None: myowner = config['default_owner'] if owner is None: owner = config['default_owner'] if desc != mydesc or owner != myowner or ref != myref: # we can do this right away without waiting set_repo_params(toplevel, gitdir, owner, desc, ref) else: # It exists on disk, but not in my manifest? if noreuse: logger.critical('Found existing git repo in %s', fullpath) logger.critical('But you asked NOT to reuse repos') logger.critical('Skipping %s', gitdir) grokmirror.unlock_repo(fullpath) continue logger.info('Setting new origin for %s', gitdir) fix_remotes(gitdir, toplevel, config['site']) to_pull.append(gitdir) grokmirror.unlock_repo(fullpath) continue # fingerprints were added late, so if we don't have them # in the remote manifest, fall back on using timestamps changed = False if culled[gitdir]['fingerprint'] is not None: logger.debug('Will use fingerprints to compare %s', gitdir) my_fingerprint = grokmirror.get_repo_fingerprint(toplevel, gitdir, force=force) if my_fingerprint != culled[gitdir]['fingerprint']: logger.debug('No fingerprint match, will pull %s', gitdir) changed = True else: logger.debug('Fingerprints match, skipping %s', gitdir) else: logger.debug('Will use timestamps to compare %s', gitdir) if force: logger.debug('Will force-pull %s', gitdir) changed = True # set timestamp to 0 as well grokmirror.set_repo_timestamp(toplevel, gitdir, 0) else: ts = grokmirror.get_repo_timestamp(toplevel, gitdir) if ts < culled[gitdir]['modified']: changed = True if changed: to_pull.append(gitdir) grokmirror.unlock_repo(fullpath) continue else: logger.debug('Repo %s unchanged', gitdir) # if we don't have a fingerprint for it, add it now if culled[gitdir]['fingerprint'] is None: fpr = grokmirror.get_repo_fingerprint(toplevel, gitdir) culled[gitdir]['fingerprint'] = fpr existing.append(gitdir) grokmirror.unlock_repo(fullpath) continue else: # Newly incoming repo to_clone.append(gitdir) grokmirror.unlock_repo(fullpath) continue # If we got here, something is odd. # noinspection PyUnreachableCode logger.critical('Could not figure out what to do with %s', gitdir) grokmirror.unlock_repo(fullpath) logger.info('Compared new manifest against %s repositories in %0.2fs', len(culled), e_cmp.elapsed) e_cmp.close() if verify: if len(verify_fails): logger.critical('%s repos failed to verify', len(verify_fails)) return 1 else: logger.info('Verification successful') return 0 hookscript = config['post_update_hook'] if len(to_pull): if len(lock_fails) > 0: pull_threads -= len(lock_fails) # Don't spin up more threads than we need if pull_threads > len(to_pull): pull_threads = len(to_pull) # exit if we're ever at 0 pull_threads. Shouldn't happen, but some extra # precaution doesn't hurt if pull_threads <= 0: logger.info('Too many repositories locked. Exiting.') return 0 logger.info('Will use %d threads to pull repos', pull_threads) # noinspection PyTypeChecker e_pull = em.counter(total=len(to_pull), desc='Updating :', unit='repos', leave=False) logger.info('Updating %s repos from %s', len(to_pull), config['site']) in_queue = Queue() out_queue = Queue() for gitdir in to_pull: in_queue.put((gitdir, culled[gitdir]['fingerprint'], culled[gitdir]['modified'])) for i in range(pull_threads): logger.debug('Spun up thread %s', i) t = PullerThread(in_queue, out_queue, config, i, e_pull) t.setDaemon(True) t.start() # wait till it's all done in_queue.join() logger.info('All threads finished.') while not out_queue.empty(): # see if any of it failed (gitdir, my_fingerprint, status) = out_queue.get() # We always record our fingerprint in our manifest culled[gitdir]['fingerprint'] = my_fingerprint if not status: # To make sure we check this again during next run, # fudge the manifest accordingly. logger.debug('Will recheck %s during next run', gitdir) culled[gitdir] = mymanifest[gitdir] # this is rather hackish, but effective last_modified -= 1 logger.info('Updates completed in %0.2fs', e_pull.elapsed) e_pull.close() else: logger.info('No repositories need updating') # how many lockfiles have we seen? # If there are more lock_fails than there are # pull_threads configured, we skip cloning out of caution if len(to_clone) and len(lock_fails) > pull_threads: logger.info('Too many repositories locked. Skipping cloning new repos.') to_clone = [] if len(to_clone): # noinspection PyTypeChecker e_clone = em.counter(total=len(to_clone), desc='Cloning :', unit='repos', leave=False) logger.info('Cloning %s repos from %s', len(to_clone), config['site']) # we use "existing" to track which repos can be used as references existing.extend(to_pull) to_clone_sorted = [] clone_order(to_clone, manifest, to_clone_sorted, existing) for gitdir in to_clone_sorted: e_clone.refresh() fullpath = os.path.join(toplevel, gitdir.lstrip('/')) # Did grok-fsck request to reclone it? rfile = os.path.join(fullpath, 'grokmirror.reclone') if os.path.exists(rfile): logger.debug('Removing %s for reclone', gitdir) shutil.move(fullpath, '%s.reclone' % fullpath) shutil.rmtree('%s.reclone' % fullpath) # Do we still need to clone it, or has another process # already done this for us? ts = grokmirror.get_repo_timestamp(toplevel, gitdir) if ts > 0: logger.debug('Looks like %s already cloned, skipping', gitdir) continue try: grokmirror.lock_repo(fullpath, nonblocking=True) except IOError: logger.info('Could not lock %s, skipping', gitdir) lock_fails.append(gitdir) e_clone.update() continue reference = None if config['ignore_repo_references'] != 'yes': reference = culled[gitdir]['reference'] if reference is not None and reference in existing: # Make sure we can lock the reference repo refrepo = os.path.join(toplevel, reference.lstrip('/')) try: grokmirror.lock_repo(refrepo, nonblocking=True) success = clone_repo(toplevel, gitdir, config['site'], reference=reference) grokmirror.unlock_repo(refrepo) except IOError: logger.info('Cannot lock reference repo %s, skipping %s', reference, gitdir) if reference not in lock_fails: lock_fails.append(reference) grokmirror.unlock_repo(fullpath) e_clone.update() continue else: success = clone_repo(toplevel, gitdir, config['site']) # check dir to make sure cloning succeeded and then add to existing if os.path.exists(fullpath) and success: logger.debug('Cloning of %s succeeded, adding to existing', gitdir) existing.append(gitdir) desc = culled[gitdir].get('description') owner = culled[gitdir].get('owner') ref = culled[gitdir].get('reference') if owner is None: owner = config['default_owner'] set_repo_params(toplevel, gitdir, owner, desc, ref) set_agefile(toplevel, gitdir, culled[gitdir]['modified']) my_fingerprint = grokmirror.set_repo_fingerprint(toplevel, gitdir) culled[gitdir]['fingerprint'] = my_fingerprint run_post_update_hook(hookscript, toplevel, gitdir) else: logger.warning('Was not able to clone %s', gitdir) # Remove it from our manifest so we can try re-cloning # next time grok-pull runs del culled[gitdir] git_fails.append(gitdir) grokmirror.unlock_repo(fullpath) e_clone.update() logger.info('Clones completed in %0.2fs' % e_clone.elapsed) e_clone.close() else: logger.info('No repositories need cloning') # loop through all entries and find any symlinks we need to set # We also collect all symlinks to do purging correctly symlinks = [] for gitdir in culled.keys(): if 'symlinks' in culled[gitdir].keys(): source = os.path.join(config['toplevel'], gitdir.lstrip('/')) for symlink in culled[gitdir]['symlinks']: if symlink not in symlinks: symlinks.append(symlink) target = os.path.join(config['toplevel'], symlink.lstrip('/')) if os.path.exists(source): if os.path.islink(target): # are you pointing to where we need you? if os.path.realpath(target) != source: # Remove symlink and recreate below logger.debug('Removed existing wrong symlink %s', target) os.unlink(target) elif os.path.exists(target): logger.warning('Deleted repo %s, because it is now' ' a symlink to %s' % (target, source)) shutil.rmtree(target) # Here we re-check if we still need to do anything if not os.path.exists(target): logger.info('Symlinking %s -> %s', target, source) # Make sure the leading dirs are in place if not os.path.exists(os.path.dirname(target)): os.makedirs(os.path.dirname(target)) os.symlink(source, target) manifile = config['mymanifest'] grokmirror.manifest_lock(manifile) # Is the local manifest newer than last_modified? That would indicate # that another process has run and "culled" is no longer the latest info if os.path.exists(manifile): fstat = os.stat(manifile) if fstat[8] > last_modified: logger.info('Local manifest is newer, not saving.') grokmirror.manifest_unlock(manifile) return 0 if purge: to_purge = [] found_repos = 0 for founddir in grokmirror.find_all_gitdirs(config['toplevel']): gitdir = founddir.replace(config['toplevel'], '') found_repos += 1 if gitdir not in culled.keys() and gitdir not in symlinks: to_purge.append(founddir) if len(to_purge): # Purge-protection engage try: purge_limit = int(config['purgeprotect']) assert 1 <= purge_limit <= 99 except (ValueError, AssertionError): logger.critical('Warning: "%s" is not valid for purgeprotect.', config['purgeprotect']) logger.critical('Please set to a number between 1 and 99.') logger.critical('Defaulting to purgeprotect=5.') purge_limit = 5 purge_pc = len(to_purge) * 100 / found_repos logger.debug('purgeprotect=%s', purge_limit) logger.debug('purge prercentage=%s', purge_pc) if not forcepurge and purge_pc >= purge_limit: logger.critical('Refusing to purge %s repos (%s%%)', len(to_purge), purge_pc) logger.critical('Set purgeprotect to a higher percentage, or' ' override with --force-purge.') logger.info('Not saving local manifest') return 1 else: # noinspection PyTypeChecker e_purge = em.counter(total=len(to_purge), desc='Purging :', unit='repos', leave=False) for founddir in to_purge: e_purge.refresh() if os.path.islink(founddir): logger.info('Removing unreferenced symlink %s', gitdir) os.unlink(founddir) else: # is anything using us for alternates? gitdir = '/' + os.path.relpath(founddir, toplevel).lstrip('/') if grokmirror.is_alt_repo(toplevel, gitdir): logger.info('Not purging %s because it is used by ' 'other repos via alternates', founddir) else: try: logger.info('Purging %s', founddir) grokmirror.lock_repo(founddir, nonblocking=True) shutil.rmtree(founddir) except IOError: lock_fails.append(gitdir) logger.info('%s is locked, not purging', gitdir) e_purge.update() logger.info('Purging completed in %0.2fs', e_purge.elapsed) e_purge.close() else: logger.info('No repositories need purging') # Done with progress bars em.stop() # Go through all repos in culled and get the latest local timestamps. for gitdir in culled: ts = grokmirror.get_repo_timestamp(toplevel, gitdir) culled[gitdir]['modified'] = ts # If there were any lock failures, we fudge last_modified to always # be older than the server, which will force the next grokmirror run. if len(lock_fails): logger.info('%s repos could not be locked. Forcing next run.', len(lock_fails)) last_modified -= 1 elif len(git_fails): logger.info('%s repos failed. Forcing next run.', len(git_fails)) last_modified -= 1 # Once we're done, save culled as our new manifest grokmirror.write_manifest(manifile, culled, mtime=last_modified, pretty=pretty) grokmirror.manifest_unlock(manifile) # write out projects.list, if asked to write_projects_list(culled, config) return 127
logger.addHandler(ch) if opts.logfile is not None: ch = logging.FileHandler(opts.logfile) formatter = logging.Formatter("[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) ch.setLevel(logging.DEBUG) logger.addHandler(ch) # push our logger into grokmirror to override the default grokmirror.logger = logger grokmirror.manifest_lock(opts.manifile) manifest = grokmirror.read_manifest(opts.manifile, wait=opts.wait) # If manifest is empty, don't use current timestamp if not len(manifest.keys()): opts.usenow = False if opts.remove and len(args): # Remove the repos as required, write new manfiest and exit for fullpath in args: repo = fullpath.replace(opts.toplevel, '', 1) if repo in manifest.keys(): del manifest[repo] logger.info('Repository %s removed from manifest' % repo) else: logger.info('Repository %s not in manifest' % repo)
def grok_manifest(manifile, toplevel, args=None, logfile=None, usenow=False, check_export_ok=False, purge=False, remove=False, pretty=False, ignore=None, wait=False, verbose=False): if args is None: args = [] if ignore is None: ignore = [] logger.setLevel(logging.DEBUG) # noinspection PyTypeChecker em = enlighten.get_manager(series=' -=#') ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) em.enabled = False logger.addHandler(ch) if logfile is not None: ch = logging.FileHandler(logfile) formatter = logging.Formatter( "[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) ch.setLevel(logging.DEBUG) logger.addHandler(ch) # push our logger into grokmirror to override the default grokmirror.logger = logger grokmirror.manifest_lock(manifile) manifest = grokmirror.read_manifest(manifile, wait=wait) # If manifest is empty, don't use current timestamp if not len(manifest.keys()): usenow = False if remove and len(args): # Remove the repos as required, write new manfiest and exit for fullpath in args: repo = fullpath.replace(toplevel, '', 1) if repo in manifest.keys(): del manifest[repo] logger.info('Repository %s removed from manifest', repo) else: logger.info('Repository %s not in manifest', repo) # XXX: need to add logic to make sure we don't break the world # by removing a repository used as a reference for others # also make sure we clean up any dangling symlinks grokmirror.write_manifest(manifile, manifest, pretty=pretty) grokmirror.manifest_unlock(manifile) return 0 gitdirs = [] if purge or not len(args) or not len(manifest.keys()): # We automatically purge when we do a full tree walk gitdirs = grokmirror.find_all_gitdirs(toplevel, ignore=ignore) purge_manifest(manifest, toplevel, gitdirs) if len(manifest) and len(args): # limit ourselves to passed dirs only when there is something # in the manifest. This precaution makes sure we regenerate the # whole file when there is nothing in it or it can't be parsed. gitdirs = args # Don't draw a progress bar for a single repo em.enabled = False symlinks = [] # noinspection PyTypeChecker run = em.counter(total=len(gitdirs), desc='Processing:', unit='repos', leave=False) for gitdir in gitdirs: run.update() # check to make sure this gitdir is ok to export if (check_export_ok and not os.path.exists(os.path.join(gitdir, 'git-daemon-export-ok'))): # is it curently in the manifest? repo = gitdir.replace(toplevel, '', 1) if repo in list(manifest): logger.info('Repository %s is no longer exported, ' 'removing from manifest', repo) del manifest[repo] # XXX: need to add logic to make sure we don't break the world # by removing a repository used as a reference for others # also make sure we clean up any dangling symlinks continue if os.path.islink(gitdir): symlinks.append(gitdir) else: update_manifest(manifest, toplevel, gitdir, usenow) logger.info('Updated %s records in %0.2fs', len(gitdirs), run.elapsed) run.close() em.stop() if len(symlinks): set_symlinks(manifest, toplevel, symlinks) grokmirror.write_manifest(manifile, manifest, pretty=pretty) grokmirror.manifest_unlock(manifile)
def pull_mirror(name, config, opts): global logger global lock_fails logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) if 'log' in config.keys(): ch = logging.FileHandler(config['log']) formatter = logging.Formatter("[%(process)d] %(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(formatter) loglevel = logging.INFO if 'loglevel' in config.keys(): if config['loglevel'] == 'debug': loglevel = logging.DEBUG ch.setLevel(loglevel) logger.addHandler(ch) ch = logging.StreamHandler() formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) if opts.verbose: ch.setLevel(logging.INFO) else: ch.setLevel(logging.CRITICAL) logger.addHandler(ch) # push it into grokmirror to override the default logger grokmirror.logger = logger logger.info('Checking [%s]' % name) mymanifest = config['mymanifest'] if opts.verify: logger.info('Verifying mirror against %s' % config['manifest']) opts.nomtime = True if config['manifest'].find('file:///') == 0: manifile = config['manifest'].replace('file://', '') if not os.path.exists(manifile): logger.critical('Remote manifest not found in %s! Quitting!' % config['manifest']) return 1 fstat = os.stat(manifile) last_modified = fstat[8] logger.debug('mtime on %s is: %s' % (manifile, fstat[8])) if os.path.exists(config['mymanifest']): fstat = os.stat(config['mymanifest']) my_last_modified = fstat[8] logger.debug('Our last-modified is: %s' % my_last_modified) if not (opts.force or opts.nomtime) and last_modified <= my_last_modified: logger.info('Manifest file unchanged. Quitting.') return 0 logger.info('Reading new manifest from %s' % manifile) manifest = grokmirror.read_manifest(manifile) # Don't accept empty manifests -- that indicates something is wrong if not len(manifest.keys()): logger.critical('Remote manifest empty or unparseable! Quitting.') return 1 else: # Load it from remote host using http and header magic logger.info('Fetching remote manifest from %s' % config['manifest']) request = urllib2.Request(config['manifest']) opener = urllib2.build_opener() # Find out if we need to run at all first if not (opts.force or opts.nomtime) and os.path.exists(mymanifest): fstat = os.stat(mymanifest) mtime = fstat[8] logger.debug('mtime on %s is: %s' % (mymanifest, mtime)) my_last_modified = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(mtime)) logger.debug('Our last-modified is: %s' % my_last_modified) request.add_header('If-Modified-Since', my_last_modified) try: ufh = opener.open(request, timeout=30) except urllib2.HTTPError, ex: if ex.code == 304: logger.info('Server says we have the latest manifest. Quitting.') return 0 logger.warning('Could not fetch %s' % config['manifest']) logger.warning('Server returned: %s' % ex) return 1 except urllib2.URLError, ex: logger.warning('Could not fetch %s' % config['manifest']) logger.warning('Error was: %s' % ex) return 1