Example #1
0
    def run(self):
        # XXX: This is not thread-safe, but okay for now,
        #      as we only use this for very blunt throttling
        global lock_fails
        global git_fails
        while True:
            (gitdir, fingerprint, modified) = self.in_queue.get()
            # Do we still need to update it, or has another process
            # already done this for us?
            todo = True
            my_fingerprint = grokmirror.get_repo_fingerprint(self.toplevel,
                    gitdir, force=True)
            if fingerprint is not None:
                if fingerprint == my_fingerprint:
                    logger.debug('Fingeprints match, not pulling %s' % gitdir)
                    todo = False
            else:
                ts = grokmirror.get_repo_timestamp(self.toplevel, gitdir)
                if ts >= modified:
                    logger.debug('TS same or newer, not pulling %s' % gitdir)
                    todo = False

            if not todo:
                logger.debug('Looks like %s already latest, skipping' % gitdir)
                # Update the fingerprint stored in-repo
                grokmirror.set_repo_fingerprint(self.toplevel, gitdir,
                        fingerprint=my_fingerprint)
                self.out_queue.put((gitdir, my_fingerprint, True))
                self.in_queue.task_done()
                continue

            fullpath = os.path.join(self.toplevel, gitdir.lstrip('/'))
            success  = False

            try:
                grokmirror.lock_repo(fullpath, nonblocking=True)

                logger.info('[Thread-%s] Updating %s' % (self.myname, gitdir))
                success = pull_repo(self.toplevel, gitdir)
                logger.debug('[Thread-%s] done pulling %s' %
                             (self.myname, gitdir))

                if success:
                    set_agefile(self.toplevel, gitdir, modified)
                    run_post_update_hook(self.hookscript, self.toplevel, gitdir)
                else:
                    logger.warning('Pulling unsuccessful, not setting agefile')
                    git_fails.append(gitdir)

                # Record our current fingerprint and return it
                my_fingerprint = grokmirror.set_repo_fingerprint(self.toplevel,
                        gitdir)

                grokmirror.unlock_repo(fullpath)
            except IOError, ex:
                logger.info('Could not lock %s, skipping' % gitdir)
                lock_fails.append(gitdir)

            self.out_queue.put((gitdir, my_fingerprint, success))
            self.in_queue.task_done()
Example #2
0
def run_git_fsck(fullpath, config):
    # Lock the git repository so no other grokmirror process attempts to
    # modify it while we're running git ops. If we miss this window, we
    # may not check the repo again for a long time, so block until the lock
    # is available.
    try:
        grokmirror.lock_repo(fullpath, nonblocking=False)
    except IOError, ex:
        logger.info('Could not obtain exclusive lock on %s' % fullpath)
        logger.info('Will run next time')
        return
Example #3
0
def run_git_prune(fullpath, config, manifest):
    if 'prune' not in config.keys() or config['prune'] != 'yes':
        return

    # Are any other repos using us in their objects/info/alternates?
    gitdir = '/' + os.path.relpath(fullpath, config['toplevel']).lstrip('/')
    repolist = grokmirror.find_all_alt_repos(gitdir, manifest)

    if len(repolist):
        logger.debug('Not pruning %s as other repos use it as alternates' % gitdir)
        return

    try:
        grokmirror.lock_repo(fullpath, nonblocking=True)
    except IOError:
        logger.info('Could not obtain exclusive lock on %s' % fullpath)
        logger.info('Will prune next time')
        return

    env = {'GIT_DIR': fullpath}
    args = ['/usr/bin/git', 'prune']
    logger.info('Pruning %s' % fullpath)

    logger.debug('Running: GIT_DIR=%s %s' % (env['GIT_DIR'], ' '.join(args)))

    (output, error) = subprocess.Popen(
        args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        env=env).communicate()

    error = error.strip()

    if error:
        # Put things we recognize as fairly benign into debug
        debug = []
        warn = []
        for line in error.split('\n'):
            ignored = False
            for estring in config['ignore_errors']:
                if line.find(estring) != -1:
                    ignored = True
                    debug.append(line)
                    break
            if not ignored:
                warn.append(line)

        if debug:
            logger.debug('Stderr: %s' % '\n'.join(debug))
        if warn:
            logger.critical('Pruning %s returned critical errors:' % fullpath)
            for entry in warn:
                logger.critical("\t%s" % entry)

    grokmirror.unlock_repo(fullpath)
Example #4
0
def run_git_fsck(fullpath, config):
    # Lock the git repository so no other grokmirror process attempts to
    # modify it while we're running git ops. If we miss this window, we
    # may not check the repo again for a long time, so block until the lock
    # is available.
    try:
        grokmirror.lock_repo(fullpath, nonblocking=False)
    except IOError:
        logger.info('Could not obtain exclusive lock on %s' % fullpath)
        logger.info('Will run next time')
        return

    env = {'GIT_DIR': fullpath}
    args = ['/usr/bin/git', 'fsck', '--full']
    logger.info('Checking %s' % fullpath)

    logger.debug('Running: GIT_DIR=%s %s' % (env['GIT_DIR'], ' '.join(args)))

    (output, error) = subprocess.Popen(
        args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        env=env).communicate()

    error = error.strip()

    if error:
        # Put things we recognize as fairly benign into debug
        debug = []
        warn = []
        for line in error.split('\n'):
            ignored = False
            for estring in config['ignore_errors']:
                if line.find(estring) != -1:
                    ignored = True
                    debug.append(line)
                    break
            if not ignored:
                warn.append(line)

        if debug:
            logger.debug('Stderr: %s' % '\n'.join(debug))
        if warn:
            logger.critical('%s has critical errors:' % fullpath)
            for entry in warn:
                logger.critical("\t%s" % entry)

    run_git_repack(fullpath, config)

    grokmirror.unlock_repo(fullpath)
Example #5
0
def dumb_pull_repo(gitdir, remotes, svn=False):
    # verify it's a git repo and fetch all remotes
    logger.debug('Will pull %s with following remotes: %s' % (gitdir, remotes))
    try:
        repo = Repo(gitdir)
        assert repo.bare == True
    except:
        logger.critical('Error opening %s.' % gitdir)
        logger.critical('Make sure it is a bare git repository.')
        sys.exit(1)

    try:
        grokmirror.lock_repo(gitdir, nonblocking=True)
    except IOError, ex:
        logger.info('Could not obtain exclusive lock on %s' % gitdir)
        logger.info('\tAssuming another process is running.')
        return False
Example #6
0
def dumb_pull_repo(gitdir, remotes, svn=False):
    # verify it's a git repo and fetch all remotes
    logger.debug('Will pull %s with following remotes: %s', gitdir, remotes)
    try:
        repo = Repo(gitdir)
        assert repo.bare is True
    except:
        logger.critical('Error opening %s.', gitdir)
        logger.critical('Make sure it is a bare git repository.')
        sys.exit(1)

    try:
        grokmirror.lock_repo(gitdir, nonblocking=True)
    except IOError:
        logger.info('Could not obtain exclusive lock on %s', gitdir)
        logger.info('\tAssuming another process is running.')
        return False

    old_revs = git_rev_parse_all(gitdir)

    if svn:
        logger.debug('Using git-svn for %s', gitdir)

        for remote in remotes:
            # arghie-argh-argh
            if remote == '*':
                remote = '--all'

            logger.info('Running git-svn fetch %s in %s', remote, gitdir)
            args = ['svn', 'fetch', remote]
            git_remote_update(args, gitdir)

    else:
        # Not an svn remote
        hasremotes = repo.git.remote()
        if not len(hasremotes.strip()):
            logger.info('Repository %s has no defined remotes!', gitdir)
            return False

        logger.debug('existing remotes: %s', hasremotes)
        for remote in remotes:
            remotefound = False
            for hasremote in hasremotes.split('\n'):
                if fnmatch.fnmatch(hasremote, remote):
                    remotefound = True
                    logger.debug('existing remote %s matches %s',
                                 hasremote, remote)
                    args = ['remote', 'update', hasremote]
                    logger.info('Updating remote %s in %s', hasremote, gitdir)

                    git_remote_update(args, gitdir)

            if not remotefound:
                logger.info('Could not find any remotes matching %s in %s',
                            remote, gitdir)

    new_revs = git_rev_parse_all(gitdir)
    grokmirror.unlock_repo(gitdir)

    if old_revs == new_revs:
        logger.debug('No new revs, no updates')
        return False

    logger.debug('New revs found -- new content pulled')
    return True
Example #7
0
    def run(self):
        # XXX: This is not thread-safe, but okay for now,
        #      as we only use this for very blunt throttling
        global lock_fails
        global git_fails
        while True:
            (gitdir, fingerprint, modified) = self.in_queue.get()
            # Do we still need to update it, or has another process
            # already done this for us?
            todo = True
            success = False
            logger.debug('[Thread-%s] gitdir=%s, figerprint=%s, modified=%s'
                         % (self.myname, gitdir, fingerprint, modified))

            fullpath = os.path.join(self.toplevel, gitdir.lstrip('/'))

            try:
                grokmirror.lock_repo(fullpath, nonblocking=True)
                # First, get fingerprint as reported in grokmirror.fingerprint
                my_fingerprint = grokmirror.get_repo_fingerprint(
                    self.toplevel, gitdir, force=False)

                # We never rely on timestamps if fingerprints are in play
                if fingerprint is None:
                    ts = grokmirror.get_repo_timestamp(self.toplevel, gitdir)
                    if ts >= modified:
                        logger.debug('[Thread-%s] TS same or newer, not pulling %s'
                                     % (self.myname, gitdir))
                        todo = False
                else:
                    # Recheck the real fingerprint to make sure there is no
                    # divergence between grokmirror.fingerprint and real repo
                    logger.debug('[Thread-%s] Rechecking fingerprint in %s' %
                                 (self.myname, gitdir))
                    my_fingerprint = grokmirror.get_repo_fingerprint(
                        self.toplevel, gitdir, force=True)

                    # Update the fingerprint stored in-repo
                    grokmirror.set_repo_fingerprint(
                        self.toplevel, gitdir, fingerprint=my_fingerprint)

                    if fingerprint == my_fingerprint:
                        logger.debug('[Thread-%s] FP match, not pulling %s'
                                     % (self.myname, gitdir))
                        todo = False

                if not todo:
                    logger.debug('[Thread-%s] %s already latest, skipping'
                                 % (self.myname, gitdir))
                    set_agefile(self.toplevel, gitdir, modified)
                    grokmirror.unlock_repo(fullpath)
                    self.out_queue.put((gitdir, my_fingerprint, True))
                    self.in_queue.task_done()
                    continue

                logger.info('[Thread-%s] updating %s' % (self.myname, gitdir))
                success = pull_repo(self.toplevel, gitdir, threadid=self.myname)
                logger.debug('[Thread-%s] done pulling %s' %
                             (self.myname, gitdir))

                if success:
                    set_agefile(self.toplevel, gitdir, modified)
                    run_post_update_hook(self.hookscript, self.toplevel, gitdir,
                                         threadid=self.myname)
                else:
                    logger.warning('[Thread-%s] pulling %s unsuccessful'
                                   % (self.myname, gitdir))
                    git_fails.append(gitdir)

                # Record our current fingerprint and return it
                my_fingerprint = grokmirror.set_repo_fingerprint(
                    self.toplevel, gitdir)

                grokmirror.unlock_repo(fullpath)
            except IOError:
                my_fingerprint = fingerprint
                logger.info('[Thread-%s] Could not lock %s, skipping'
                            % (self.myname, gitdir))
                lock_fails.append(gitdir)

            self.out_queue.put((gitdir, my_fingerprint, success))
            self.in_queue.task_done()
Example #8
0
        logger.info('pull_threads is not set, consider setting it')
        pull_threads = 5

    logger.info('Comparing repository info')

    for gitdir in culled.keys():
        fullpath = os.path.join(toplevel, gitdir.lstrip('/'))

        # fingerprints were added in later versions, so deal if the upstream
        # manifest doesn't have a fingerprint
        if 'fingerprint' not in culled[gitdir]:
            culled[gitdir]['fingerprint'] = None

        # Attempt to lock the repo
        try:
            grokmirror.lock_repo(fullpath, nonblocking=True)
        except IOError:
            logger.info('Could not lock %s, skipping' % gitdir)
            lock_fails.append(gitdir)
            # Force the fingerprint to what we have in mymanifest,
            # if we have it.
            culled[gitdir]['fingerprint'] = None
            if gitdir in mymanifest and 'fingerprint' in mymanifest[gitdir]:
                culled[gitdir]['fingerprint'] = mymanifest[gitdir][
                    'fingerprint']
            if len(lock_fails) >= pull_threads:
                logger.info('Too many repositories locked (%s). Exiting.'
                            % len(lock_fails))
                return 0
            continue
Example #9
0
def run_git_repack(fullpath, config, full_repack=False):
    if 'repack' not in config.keys() or config['repack'] != 'yes':
        return

    try:
        grokmirror.lock_repo(fullpath, nonblocking=True)
    except IOError:
        logger.info('Could not obtain exclusive lock on %s' % fullpath)
        logger.info('Will repack next time')
        return

    repack_flags = '-A -d -l -q'

    if full_repack and 'full_repack_flags' in config.keys():
        repack_flags = config['full_repack_flags']
        logger.info('Time to do a full repack of %s' % fullpath)

    elif 'repack_flags' in config.keys():
        repack_flags = config['repack_flags']

    flags = repack_flags.split()

    env = {'GIT_DIR': fullpath}
    args = ['/usr/bin/git', 'repack'] + flags
    logger.info('Repacking %s' % fullpath)

    logger.debug('Running: GIT_DIR=%s %s' % (env['GIT_DIR'], ' '.join(args)))

    (output, error) = subprocess.Popen(
        args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        env=env).communicate()

    error = error.strip()

    # With newer versions of git, repack may return warnings that are safe to ignore
    # so use the same strategy to weed out things we aren't interested in seeing
    if error:
        # Put things we recognize as fairly benign into debug
        debug = []
        warn = []
        for line in error.split('\n'):
            ignored = False
            for estring in config['ignore_errors']:
                if line.find(estring) != -1:
                    ignored = True
                    debug.append(line)
                    break
            if not ignored:
                warn.append(line)

        if debug:
            logger.debug('Stderr: %s' % '\n'.join(debug))
        if warn:
            logger.critical('Repacking %s returned critical errors:' % fullpath)
            for entry in warn:
                logger.critical("\t%s" % entry)

    # repacking refs requires a separate command, so run it now
    args = ['/usr/bin/git', 'pack-refs', '--all']
    logger.debug('Repacking refs in %s' % fullpath)

    logger.debug('Running: GIT_DIR=%s %s' % (env['GIT_DIR'], ' '.join(args)))

    (output, error) = subprocess.Popen(
        args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        env=env).communicate()

    # pack-refs shouldn't return anything, but use the same ignore_errors block
    # to weed out any future potential benign warnings
    if error:
        # Put things we recognize as fairly benign into debug
        debug = []
        warn = []
        for line in error.split('\n'):
            ignored = False
            for estring in config['ignore_errors']:
                if line.find(estring) != -1:
                    ignored = True
                    debug.append(line)
                    break
            if not ignored:
                warn.append(line)

        if debug:
            logger.debug('Stderr: %s' % '\n'.join(debug))
        if warn:
            logger.critical('Repacking refs %s returned critical errors:' % fullpath)
            for entry in warn:
                logger.critical("\t%s" % entry)

    grokmirror.unlock_repo(fullpath)
Example #10
0
def fsck_mirror(name, config, verbose=False, force=False, repack_only=False,
                conn_only=False, repack_all_quick=False, repack_all_full=False):
    global logger
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # noinspection PyTypeChecker
    em = enlighten.get_manager(series=' -=#')

    if 'log' in config.keys():
        ch = logging.FileHandler(config['log'])
        formatter = logging.Formatter(
            "[%(process)d] %(asctime)s - %(levelname)s - %(message)s")
        ch.setFormatter(formatter)
        loglevel = logging.INFO

        if 'loglevel' in config.keys():
            if config['loglevel'] == 'debug':
                loglevel = logging.DEBUG

        ch.setLevel(loglevel)
        logger.addHandler(ch)

    ch = logging.StreamHandler()
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)

    if verbose:
        ch.setLevel(logging.INFO)
    else:
        ch.setLevel(logging.CRITICAL)
        em.enabled = False

    logger.addHandler(ch)

    # push it into grokmirror to override the default logger
    grokmirror.logger = logger

    if conn_only or repack_all_quick or repack_all_full:
        force = True

    logger.info('Running grok-fsck for [%s]', name)

    # Lock the tree to make sure we only run one instance
    logger.debug('Attempting to obtain lock on %s', config['lock'])
    flockh = open(config['lock'], 'w')
    try:
        lockf(flockh, LOCK_EX | LOCK_NB)
    except IOError:
        logger.info('Could not obtain exclusive lock on %s', config['lock'])
        logger.info('Assuming another process is running.')
        return 0

    manifest = grokmirror.read_manifest(config['manifest'])

    if os.path.exists(config['statusfile']):
        logger.info('Reading status from %s', config['statusfile'])
        stfh = open(config['statusfile'], 'rb')
        # noinspection PyBroadException
        try:
            # Format of the status file:
            #  {
            #    '/full/path/to/repository': {
            #      'lastcheck': 'YYYY-MM-DD' or 'never',
            #      'nextcheck': 'YYYY-MM-DD',
            #      'lastrepack': 'YYYY-MM-DD',
            #      'fingerprint': 'sha-1',
            #      's_elapsed': seconds,
            #      'quick_repack_count': times,
            #    },
            #    ...
            #  }

            status = json.loads(stfh.read().decode('utf-8'))
        except:
            # Huai le!
            logger.critical('Failed to parse %s', config['statusfile'])
            lockf(flockh, LOCK_UN)
            flockh.close()
            return 1
    else:
        status = {}

    if 'frequency' in config:
        frequency = int(config['frequency'])
    else:
        frequency = 30

    today = datetime.datetime.today()
    todayiso = today.strftime('%F')

    if force:
        # Use randomization for next check, again
        checkdelay = random.randint(1, frequency)
    else:
        checkdelay = frequency

    # Go through the manifest and compare with status
    # noinspection PyTypeChecker
    e_find = em.counter(total=len(manifest), desc='Discovering:', unit='repos', leave=False)
    for gitdir in list(manifest):
        e_find.update()
        fullpath = os.path.join(config['toplevel'], gitdir.lstrip('/'))
        if fullpath not in status.keys():
            # Newly added repository
            if not force:
                # Randomize next check between now and frequency
                delay = random.randint(0, frequency)
                nextdate = today + datetime.timedelta(days=delay)
                nextcheck = nextdate.strftime('%F')
            else:
                nextcheck = todayiso

            status[fullpath] = {
                'lastcheck': 'never',
                'nextcheck': nextcheck,
            }
            logger.info('%s:', fullpath)
            logger.info('  added : next check on %s', nextcheck)

    e_find.close()

    # record newly found repos in the status file
    logger.debug('Updating status file in %s', config['statusfile'])
    with open(config['statusfile'], 'wb') as stfh:
        stfh.write(json.dumps(status, indent=2).encode('utf-8'))

    # Go through status and find all repos that need work done on them.
    # This is a dictionary that contains:
    # full_path_to_repo:
    #   repack: 0, 1, 2 (0-no, 1-needs quick repack, 2-needs full repack)
    #   fsck: 0/1

    to_process = {}

    total_checked = 0
    total_elapsed = 0

    # noinspection PyTypeChecker
    e_cmp = em.counter(total=len(status), desc='Analyzing:', unit='repos', leave=False)
    for fullpath in list(status):
        e_cmp.update()

        # Check to make sure it's still in the manifest
        gitdir = fullpath.replace(config['toplevel'], '', 1)
        gitdir = '/' + gitdir.lstrip('/')

        if gitdir not in manifest.keys():
            del status[fullpath]
            logger.debug('%s is gone, no longer in manifest', gitdir)
            continue

        needs_repack = needs_prune = needs_fsck = 0

        obj_info = get_repo_obj_info(fullpath)
        try:
            packs = int(obj_info['packs'])
            count_loose = int(obj_info['count'])
        except KeyError:
            logger.warning('Unable to count objects in %s, skipping' % fullpath)
            continue

        schedcheck = datetime.datetime.strptime(status[fullpath]['nextcheck'], '%Y-%m-%d')
        nextcheck = today + datetime.timedelta(days=checkdelay)

        if 'repack' not in config.keys() or config['repack'] != 'yes':
            # don't look at me if you turned off repack
            logger.debug('Not repacking because repack=no in config')
            needs_repack = 0
        elif repack_all_full and (count_loose > 0 or packs > 1):
            logger.debug('needs_repack=2 due to repack_all_full')
            needs_repack = 2
        elif repack_all_quick and count_loose > 0:
            logger.debug('needs_repack=1 due to repack_all_quick')
            needs_repack = 1
        elif conn_only:
            # don't do any repacks if we're running forced connectivity checks, unless
            # you specifically passed --repack-all-foo
            logger.debug('needs_repack=0 due to --conn-only')
            needs_repack = 0
        else:
            # for now, hardcode the maximum loose objects and packs
            # TODO: we can probably set this in git config values?
            #       I don't think this makes sense as a global setting, because
            #       optimal values will depend on the size of the repo as a whole
            max_loose_objects = 1200
            max_packs = 20
            pc_loose_objects = 10
            pc_loose_size = 10

            # first, compare against max values:
            if packs >= max_packs:
                logger.debug('Triggering full repack of %s because packs > %s', fullpath, max_packs)
                needs_repack = 2
            elif count_loose >= max_loose_objects:
                logger.debug('Triggering quick repack of %s because loose objects > %s', fullpath, max_loose_objects)
                needs_repack = 1
            else:
                # is the number of loose objects or their size more than 10% of
                # the overall total?
                in_pack = int(obj_info['in-pack'])
                size_loose = int(obj_info['size'])
                size_pack = int(obj_info['size-pack'])
                total_obj = count_loose + in_pack
                total_size = size_loose + size_pack
                # set some arbitrary "worth bothering" limits so we don't
                # continuously repack tiny repos.
                if total_obj > 500 and count_loose/total_obj*100 >= pc_loose_objects:
                    logger.debug('Triggering repack of %s because loose objects > %s%% of total',
                                 fullpath, pc_loose_objects)
                    needs_repack = 1
                elif total_size > 1024 and size_loose/total_size*100 >= pc_loose_size:
                    logger.debug('Triggering repack of %s because loose size > %s%% of total',
                                 fullpath, pc_loose_size)
                    needs_repack = 1

        if needs_repack > 0 and check_precious_objects(fullpath):
            # if we have preciousObjects, then we only repack based on the same
            # schedule as fsck.
            logger.debug('preciousObjects is set')
            # for repos with preciousObjects, we use the fsck schedule for repacking
            if schedcheck <= today:
                logger.debug('Time for a full periodic repack of a preciousObjects repo')
                status[fullpath]['nextcheck'] = nextcheck.strftime('%F')
                needs_repack = 2
            else:
                logger.debug('Not repacking preciousObjects repo outside of schedule')
                needs_repack = 0

        # Do we need to fsck it?
        if not (repack_all_quick or repack_all_full or repack_only):
            if schedcheck <= today or force:
                status[fullpath]['nextcheck'] = nextcheck.strftime('%F')
                needs_fsck = 1

        if needs_repack or needs_fsck or needs_prune:
            # emit a warning if we find garbage in a repo
            # we do it here so we don't spam people nightly on every cron run,
            # but only do it when a repo needs actual work done on it
            if obj_info['garbage'] != '0':
                logger.warning('%s:\n\tcontains %s garbage files (garbage-size: %s KiB)',
                               fullpath, obj_info['garbage'], obj_info['size-garbage'])

            to_process[fullpath] = {
                'repack': needs_repack,
                'prune': needs_prune,
                'fsck': needs_fsck,
            }

    e_cmp.close()

    if not len(to_process):
        logger.info('No repos need attention.')
        em.stop()
        return

    logger.info('Processing %s repositories', len(to_process))

    # noinspection PyTypeChecker
    run = em.counter(total=len(to_process), desc='Processing:', unit='repos', leave=False)
    for fullpath, needs in to_process.items():
        logger.info('%s:', fullpath)
        # Calculate elapsed seconds
        run.refresh()
        startt = time.time()

        # Wait till the repo is available and lock it for the duration of checks,
        # otherwise there may be false-positives if a mirrored repo is updated
        # in the middle of fsck or repack.
        grokmirror.lock_repo(fullpath, nonblocking=False)
        if needs['repack']:
            if run_git_repack(fullpath, config, needs['repack']):
                status[fullpath]['lastrepack'] = todayiso
                if needs['repack'] > 1:
                    status[fullpath]['lastfullrepack'] = todayiso
            else:
                logger.warning('Repacking %s was unsuccessful, '
                               'not running fsck.', fullpath)
                grokmirror.unlock_repo(fullpath)
                continue

        if needs['prune']:
            run_git_prune(fullpath, config)

        if needs['fsck']:
            run_git_fsck(fullpath, config, conn_only)
            endt = time.time()
            status[fullpath]['lastcheck'] = todayiso
            status[fullpath]['s_elapsed'] = int(endt-startt)

            logger.info('   done : %ss, next check on %s',
                        status[fullpath]['s_elapsed'],
                        status[fullpath]['nextcheck'])

        run.update()

        # We're done with the repo now
        grokmirror.unlock_repo(fullpath)
        total_checked += 1
        total_elapsed += time.time()-startt

        # Write status file after each check, so if the process dies, we won't
        # have to recheck all the repos we've already checked
        logger.debug('Updating status file in %s', config['statusfile'])
        with open(config['statusfile'], 'wb') as stfh:
            stfh.write(json.dumps(status, indent=2).encode('utf-8'))

    run.close()
    em.stop()
    logger.info('Processed %s repos in %0.2fs', total_checked, total_elapsed)

    with open(config['statusfile'], 'wb') as stfh:
        stfh.write(json.dumps(status, indent=2).encode('utf-8'))

    lockf(flockh, LOCK_UN)
    flockh.close()
Example #11
0
def pull_mirror(name, config, verbose=False, force=False, nomtime=False,
                verify=False, verify_subpath='*', noreuse=False,
                purge=False, pretty=False, forcepurge=False):
    global logger
    global lock_fails

    # noinspection PyTypeChecker
    em = enlighten.get_manager(series=' -=#')

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    if 'log' in config.keys():
        ch = logging.FileHandler(config['log'])
        formatter = logging.Formatter(
            "[%(process)d] %(asctime)s - %(levelname)s - %(message)s")
        ch.setFormatter(formatter)
        loglevel = logging.INFO

        if 'loglevel' in config.keys():
            if config['loglevel'] == 'debug':
                loglevel = logging.DEBUG

        ch.setLevel(loglevel)
        logger.addHandler(ch)

    ch = logging.StreamHandler()
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)

    if verbose:
        ch.setLevel(logging.INFO)
    else:
        ch.setLevel(logging.CRITICAL)
        em.enabled = False

    logger.addHandler(ch)

    # push it into grokmirror to override the default logger
    grokmirror.logger = logger

    logger.info('Checking [%s]', name)
    mymanifest = config['mymanifest']

    if verify:
        logger.info('Verifying mirror against %s', config['manifest'])
        nomtime = True

    if config['manifest'].find('file:///') == 0:
        manifile = config['manifest'].replace('file://', '')
        if not os.path.exists(manifile):
            logger.critical('Remote manifest not found in %s! Quitting!',
                            config['manifest'])
            return 1

        fstat = os.stat(manifile)
        last_modified = fstat[8]
        logger.debug('mtime on %s is: %s', manifile, fstat[8])

        if os.path.exists(config['mymanifest']):
            fstat = os.stat(config['mymanifest'])
            my_last_modified = fstat[8]
            logger.debug('Our last-modified is: %s', my_last_modified)
            if not (force or nomtime) and last_modified <= my_last_modified:
                logger.info('Manifest file unchanged. Quitting.')
                return 0

        logger.info('Reading new manifest from %s', manifile)
        manifest = grokmirror.read_manifest(manifile)
        # Don't accept empty manifests -- that indicates something is wrong
        if not len(manifest.keys()):
            logger.warning('Remote manifest empty or unparseable! Quitting.')
            return 1

    else:
        # Load it from remote host using http and header magic
        logger.info('Fetching remote manifest from %s', config['manifest'])

        # Do we have username:[email protected] in the URL?
        chunks = urlparse(config['manifest'])
        if chunks.netloc.find('@') > 0:
            logger.debug('Taking username/password from the URL for basic auth')
            (upass, netloc) = chunks.netloc.split('@')
            if upass.find(':') > 0:
                (username, password) = upass.split(':')
            else:
                username = upass
                password = ''

            manifesturl = config['manifest'].replace(chunks.netloc, netloc)
            logger.debug('manifesturl=%s', manifesturl)
            request = urllib_request.Request(manifesturl)

            password_mgr = urllib_request.HTTPPasswordMgrWithDefaultRealm()
            password_mgr.add_password(None, manifesturl, username, password)
            auth_handler = urllib_request.HTTPBasicAuthHandler(password_mgr)
            opener = urllib_request.build_opener(auth_handler)

        else:
            request = urllib_request.Request(config['manifest'])
            opener = urllib_request.build_opener()

        # Find out if we need to run at all first
        if not (force or nomtime) and os.path.exists(mymanifest):
            fstat = os.stat(mymanifest)
            mtime = fstat[8]
            logger.debug('mtime on %s is: %s', mymanifest, mtime)
            my_last_modified = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                             time.gmtime(mtime))
            logger.debug('Our last-modified is: %s', my_last_modified)
            request.add_header('If-Modified-Since', my_last_modified)

        try:
            ufh = opener.open(request, timeout=30)
        except HTTPError as ex:
            if ex.code == 304:
                logger.info('Server says we have the latest manifest. '
                            'Quitting.')
                return 0
            logger.warning('Could not fetch %s', config['manifest'])
            logger.warning('Server returned: %s', ex)
            return 1
        except (URLError, ssl.SSLError, ssl.CertificateError) as ex:
            logger.warning('Could not fetch %s', config['manifest'])
            logger.warning('Error was: %s', ex)
            return 1

        last_modified = ufh.headers.get('Last-Modified')
        last_modified = time.strptime(last_modified, '%a, %d %b %Y %H:%M:%S %Z')
        last_modified = calendar.timegm(last_modified)

        # We don't use read_manifest for the remote manifest, as it can be
        # anything, really. For now, blindly open it with gzipfile if it ends
        # with .gz. XXX: some http servers will auto-deflate such files.
        try:
            if config['manifest'].find('.gz') > 0:
                fh = gzip.GzipFile(fileobj=BytesIO(ufh.read()))
            else:
                fh = ufh

            jdata = fh.read().decode('utf-8')
            fh.close()

            manifest = anyjson.deserialize(jdata)

        except Exception as ex:
            logger.warning('Failed to parse %s', config['manifest'])
            logger.warning('Error was: %s', ex)
            return 1

    mymanifest = grokmirror.read_manifest(mymanifest)

    culled = cull_manifest(manifest, config)

    to_clone = []
    to_pull = []
    existing = []

    toplevel = config['toplevel']
    if not os.access(toplevel, os.W_OK):
        logger.critical('Toplevel %s does not exist or is not writable',
                        toplevel)
        sys.exit(1)

    if 'pull_threads' in config.keys():
        pull_threads = int(config['pull_threads'])
        if pull_threads < 1:
            logger.info('pull_threads is less than 1, forcing to 1')
            pull_threads = 1
    else:
        # be conservative
        logger.info('pull_threads is not set, consider setting it')
        pull_threads = 5

    # noinspection PyTypeChecker
    e_cmp = em.counter(total=len(culled), desc='Comparing:', unit='repos', leave=False)

    for gitdir in list(culled):
        fullpath = os.path.join(toplevel, gitdir.lstrip('/'))
        e_cmp.update()

        # fingerprints were added in later versions, so deal if the upstream
        # manifest doesn't have a fingerprint
        if 'fingerprint' not in culled[gitdir]:
            culled[gitdir]['fingerprint'] = None

        # Attempt to lock the repo
        try:
            grokmirror.lock_repo(fullpath, nonblocking=True)
        except IOError:
            logger.info('Could not lock %s, skipping', gitdir)
            lock_fails.append(gitdir)
            # Force the fingerprint to what we have in mymanifest,
            # if we have it.
            culled[gitdir]['fingerprint'] = None
            if gitdir in mymanifest and 'fingerprint' in mymanifest[gitdir]:
                culled[gitdir]['fingerprint'] = mymanifest[gitdir][
                    'fingerprint']
            if len(lock_fails) >= pull_threads:
                logger.info('Too many repositories locked (%s). Exiting.',
                            len(lock_fails))
                return 0
            continue

        if verify:
            if culled[gitdir]['fingerprint'] is None:
                logger.debug('No fingerprint for %s, not verifying', gitdir)
                grokmirror.unlock_repo(fullpath)
                continue

            if not fnmatch.fnmatch(gitdir, verify_subpath):
                grokmirror.unlock_repo(fullpath)
                continue

            logger.debug('Verifying %s', gitdir)
            if not os.path.exists(fullpath):
                verify_fails.append(gitdir)
                logger.info('Verify: %s ABSENT', gitdir)
                grokmirror.unlock_repo(fullpath)
                continue

            my_fingerprint = grokmirror.get_repo_fingerprint(
                toplevel, gitdir, force=force)

            if my_fingerprint == culled[gitdir]['fingerprint']:
                logger.info('Verify: %s OK', gitdir)
            else:
                logger.critical('Verify: %s FAILED', gitdir)
                verify_fails.append(gitdir)

            grokmirror.unlock_repo(fullpath)
            continue

        # Is the directory in place?
        if os.path.exists(fullpath):
            # Did grok-fsck request to reclone it?
            rfile = os.path.join(fullpath, 'grokmirror.reclone')
            if os.path.exists(rfile):
                logger.info('Reclone requested for %s:', gitdir)
                with open(rfile, 'r') as rfh:
                    reason = rfh.read()
                    logger.info('  %s', reason)

                to_clone.append(gitdir)
                grokmirror.unlock_repo(fullpath)
                continue

            # Fix owner and description, if necessary
            if gitdir in mymanifest.keys():
                # This code is hurky and needs to be cleaned up
                desc = culled[gitdir].get('description')
                owner = culled[gitdir].get('owner')
                ref = None
                if config['ignore_repo_references'] != 'yes':
                    ref = culled[gitdir].get('reference')

                # dirty hack to force on-disk owner/description checks
                # when we're called with -n, in case our manifest
                # differs from what is on disk for owner/description/alternates
                myref = None
                if nomtime:
                    mydesc = None
                    myowner = None
                else:
                    mydesc = mymanifest[gitdir].get('description')
                    myowner = mymanifest[gitdir].get('owner')

                    if config['ignore_repo_references'] != 'yes':
                        myref = mymanifest[gitdir].get('reference')

                    if myowner is None:
                        myowner = config['default_owner']

                if owner is None:
                    owner = config['default_owner']

                if desc != mydesc or owner != myowner or ref != myref:
                    # we can do this right away without waiting
                    set_repo_params(toplevel, gitdir, owner, desc, ref)

            else:
                # It exists on disk, but not in my manifest?
                if noreuse:
                    logger.critical('Found existing git repo in %s', fullpath)
                    logger.critical('But you asked NOT to reuse repos')
                    logger.critical('Skipping %s', gitdir)
                    grokmirror.unlock_repo(fullpath)
                    continue

                logger.info('Setting new origin for %s', gitdir)
                fix_remotes(gitdir, toplevel, config['site'])
                to_pull.append(gitdir)
                grokmirror.unlock_repo(fullpath)
                continue

            # fingerprints were added late, so if we don't have them
            # in the remote manifest, fall back on using timestamps
            changed = False
            if culled[gitdir]['fingerprint'] is not None:
                logger.debug('Will use fingerprints to compare %s', gitdir)
                my_fingerprint = grokmirror.get_repo_fingerprint(toplevel,
                                                                 gitdir,
                                                                 force=force)

                if my_fingerprint != culled[gitdir]['fingerprint']:
                    logger.debug('No fingerprint match, will pull %s', gitdir)
                    changed = True
                else:
                    logger.debug('Fingerprints match, skipping %s', gitdir)
            else:
                logger.debug('Will use timestamps to compare %s', gitdir)
                if force:
                    logger.debug('Will force-pull %s', gitdir)
                    changed = True
                    # set timestamp to 0 as well
                    grokmirror.set_repo_timestamp(toplevel, gitdir, 0)
                else:
                    ts = grokmirror.get_repo_timestamp(toplevel, gitdir)
                    if ts < culled[gitdir]['modified']:
                        changed = True

            if changed:
                to_pull.append(gitdir)
                grokmirror.unlock_repo(fullpath)
                continue
            else:
                logger.debug('Repo %s unchanged', gitdir)
                # if we don't have a fingerprint for it, add it now
                if culled[gitdir]['fingerprint'] is None:
                    fpr = grokmirror.get_repo_fingerprint(toplevel, gitdir)
                    culled[gitdir]['fingerprint'] = fpr
                existing.append(gitdir)
                grokmirror.unlock_repo(fullpath)
                continue

        else:
            # Newly incoming repo
            to_clone.append(gitdir)
            grokmirror.unlock_repo(fullpath)
            continue

        # If we got here, something is odd.
        # noinspection PyUnreachableCode
        logger.critical('Could not figure out what to do with %s', gitdir)
        grokmirror.unlock_repo(fullpath)

    logger.info('Compared new manifest against %s repositories in %0.2fs', len(culled), e_cmp.elapsed)
    e_cmp.close()

    if verify:
        if len(verify_fails):
            logger.critical('%s repos failed to verify', len(verify_fails))
            return 1
        else:
            logger.info('Verification successful')
            return 0

    hookscript = config['post_update_hook']

    if len(to_pull):

        if len(lock_fails) > 0:
            pull_threads -= len(lock_fails)

        # Don't spin up more threads than we need
        if pull_threads > len(to_pull):
            pull_threads = len(to_pull)

        # exit if we're ever at 0 pull_threads. Shouldn't happen, but some extra
        # precaution doesn't hurt
        if pull_threads <= 0:
            logger.info('Too many repositories locked. Exiting.')
            return 0

        logger.info('Will use %d threads to pull repos', pull_threads)

        # noinspection PyTypeChecker
        e_pull = em.counter(total=len(to_pull), desc='Updating :', unit='repos', leave=False)
        logger.info('Updating %s repos from %s', len(to_pull), config['site'])
        in_queue = Queue()
        out_queue = Queue()

        for gitdir in to_pull:
            in_queue.put((gitdir, culled[gitdir]['fingerprint'],
                          culled[gitdir]['modified']))

        for i in range(pull_threads):
            logger.debug('Spun up thread %s', i)
            t = PullerThread(in_queue, out_queue, config, i, e_pull)
            t.setDaemon(True)
            t.start()

        # wait till it's all done
        in_queue.join()
        logger.info('All threads finished.')

        while not out_queue.empty():
            # see if any of it failed
            (gitdir, my_fingerprint, status) = out_queue.get()
            # We always record our fingerprint in our manifest
            culled[gitdir]['fingerprint'] = my_fingerprint
            if not status:
                # To make sure we check this again during next run,
                # fudge the manifest accordingly.
                logger.debug('Will recheck %s during next run', gitdir)
                culled[gitdir] = mymanifest[gitdir]
                # this is rather hackish, but effective
                last_modified -= 1

        logger.info('Updates completed in %0.2fs', e_pull.elapsed)
        e_pull.close()
    else:
        logger.info('No repositories need updating')

    # how many lockfiles have we seen?
    # If there are more lock_fails than there are
    # pull_threads configured, we skip cloning out of caution
    if len(to_clone) and len(lock_fails) > pull_threads:
        logger.info('Too many repositories locked. Skipping cloning new repos.')
        to_clone = []

    if len(to_clone):
        # noinspection PyTypeChecker
        e_clone = em.counter(total=len(to_clone), desc='Cloning  :', unit='repos', leave=False)
        logger.info('Cloning %s repos from %s', len(to_clone), config['site'])
        # we use "existing" to track which repos can be used as references
        existing.extend(to_pull)

        to_clone_sorted = []
        clone_order(to_clone, manifest, to_clone_sorted, existing)

        for gitdir in to_clone_sorted:
            e_clone.refresh()

            fullpath = os.path.join(toplevel, gitdir.lstrip('/'))

            # Did grok-fsck request to reclone it?
            rfile = os.path.join(fullpath, 'grokmirror.reclone')
            if os.path.exists(rfile):
                logger.debug('Removing %s for reclone', gitdir)
                shutil.move(fullpath, '%s.reclone' % fullpath)
                shutil.rmtree('%s.reclone' % fullpath)

            # Do we still need to clone it, or has another process
            # already done this for us?
            ts = grokmirror.get_repo_timestamp(toplevel, gitdir)

            if ts > 0:
                logger.debug('Looks like %s already cloned, skipping', gitdir)
                continue

            try:
                grokmirror.lock_repo(fullpath, nonblocking=True)
            except IOError:
                logger.info('Could not lock %s, skipping', gitdir)
                lock_fails.append(gitdir)
                e_clone.update()
                continue

            reference = None
            if config['ignore_repo_references'] != 'yes':
                reference = culled[gitdir]['reference']

            if reference is not None and reference in existing:
                # Make sure we can lock the reference repo
                refrepo = os.path.join(toplevel, reference.lstrip('/'))
                try:
                    grokmirror.lock_repo(refrepo, nonblocking=True)
                    success = clone_repo(toplevel, gitdir, config['site'],
                                         reference=reference)
                    grokmirror.unlock_repo(refrepo)
                except IOError:
                    logger.info('Cannot lock reference repo %s, skipping %s',
                                reference, gitdir)
                    if reference not in lock_fails:
                        lock_fails.append(reference)

                    grokmirror.unlock_repo(fullpath)
                    e_clone.update()
                    continue
            else:
                success = clone_repo(toplevel, gitdir, config['site'])

            # check dir to make sure cloning succeeded and then add to existing
            if os.path.exists(fullpath) and success:
                logger.debug('Cloning of %s succeeded, adding to existing',
                             gitdir)
                existing.append(gitdir)

                desc = culled[gitdir].get('description')
                owner = culled[gitdir].get('owner')
                ref = culled[gitdir].get('reference')

                if owner is None:
                    owner = config['default_owner']
                set_repo_params(toplevel, gitdir, owner, desc, ref)
                set_agefile(toplevel, gitdir, culled[gitdir]['modified'])
                my_fingerprint = grokmirror.set_repo_fingerprint(toplevel,
                                                                 gitdir)
                culled[gitdir]['fingerprint'] = my_fingerprint
                run_post_update_hook(hookscript, toplevel, gitdir)
            else:
                logger.warning('Was not able to clone %s', gitdir)
                # Remove it from our manifest so we can try re-cloning
                # next time grok-pull runs
                del culled[gitdir]
                git_fails.append(gitdir)

            grokmirror.unlock_repo(fullpath)
            e_clone.update()

        logger.info('Clones completed in %0.2fs' % e_clone.elapsed)
        e_clone.close()

    else:
        logger.info('No repositories need cloning')

    # loop through all entries and find any symlinks we need to set
    # We also collect all symlinks to do purging correctly
    symlinks = []
    for gitdir in culled.keys():
        if 'symlinks' in culled[gitdir].keys():
            source = os.path.join(config['toplevel'], gitdir.lstrip('/'))
            for symlink in culled[gitdir]['symlinks']:
                if symlink not in symlinks:
                    symlinks.append(symlink)
                target = os.path.join(config['toplevel'], symlink.lstrip('/'))

                if os.path.exists(source):
                    if os.path.islink(target):
                        # are you pointing to where we need you?
                        if os.path.realpath(target) != source:
                            # Remove symlink and recreate below
                            logger.debug('Removed existing wrong symlink %s',
                                         target)
                            os.unlink(target)
                    elif os.path.exists(target):
                        logger.warning('Deleted repo %s, because it is now'
                                       ' a symlink to %s' % (target, source))
                        shutil.rmtree(target)

                    # Here we re-check if we still need to do anything
                    if not os.path.exists(target):
                        logger.info('Symlinking %s -> %s', target, source)
                        # Make sure the leading dirs are in place
                        if not os.path.exists(os.path.dirname(target)):
                            os.makedirs(os.path.dirname(target))
                        os.symlink(source, target)

    manifile = config['mymanifest']
    grokmirror.manifest_lock(manifile)

    # Is the local manifest newer than last_modified? That would indicate
    # that another process has run and "culled" is no longer the latest info
    if os.path.exists(manifile):
        fstat = os.stat(manifile)
        if fstat[8] > last_modified:
            logger.info('Local manifest is newer, not saving.')
            grokmirror.manifest_unlock(manifile)
            return 0

    if purge:
        to_purge = []
        found_repos = 0
        for founddir in grokmirror.find_all_gitdirs(config['toplevel']):
            gitdir = founddir.replace(config['toplevel'], '')
            found_repos += 1

            if gitdir not in culled.keys() and gitdir not in symlinks:
                to_purge.append(founddir)

        if len(to_purge):
            # Purge-protection engage
            try:
                purge_limit = int(config['purgeprotect'])
                assert 1 <= purge_limit <= 99
            except (ValueError, AssertionError):
                logger.critical('Warning: "%s" is not valid for purgeprotect.',
                                config['purgeprotect'])
                logger.critical('Please set to a number between 1 and 99.')
                logger.critical('Defaulting to purgeprotect=5.')
                purge_limit = 5

            purge_pc = len(to_purge) * 100 / found_repos
            logger.debug('purgeprotect=%s', purge_limit)
            logger.debug('purge prercentage=%s', purge_pc)

            if not forcepurge and purge_pc >= purge_limit:
                logger.critical('Refusing to purge %s repos (%s%%)',
                                len(to_purge), purge_pc)
                logger.critical('Set purgeprotect to a higher percentage, or'
                                ' override with --force-purge.')
                logger.info('Not saving local manifest')
                return 1
            else:
                # noinspection PyTypeChecker
                e_purge = em.counter(total=len(to_purge), desc='Purging  :', unit='repos', leave=False)
                for founddir in to_purge:
                    e_purge.refresh()
                    if os.path.islink(founddir):
                        logger.info('Removing unreferenced symlink %s', gitdir)
                        os.unlink(founddir)
                    else:
                        # is anything using us for alternates?
                        gitdir = '/' + os.path.relpath(founddir, toplevel).lstrip('/')
                        if grokmirror.is_alt_repo(toplevel, gitdir):
                            logger.info('Not purging %s because it is used by '
                                        'other repos via alternates', founddir)
                        else:
                            try:
                                logger.info('Purging %s', founddir)
                                grokmirror.lock_repo(founddir, nonblocking=True)
                                shutil.rmtree(founddir)
                            except IOError:
                                lock_fails.append(gitdir)
                                logger.info('%s is locked, not purging',
                                            gitdir)
                    e_purge.update()

                logger.info('Purging completed in %0.2fs', e_purge.elapsed)
                e_purge.close()

        else:
            logger.info('No repositories need purging')

    # Done with progress bars
    em.stop()

    # Go through all repos in culled and get the latest local timestamps.
    for gitdir in culled:
        ts = grokmirror.get_repo_timestamp(toplevel, gitdir)
        culled[gitdir]['modified'] = ts

    # If there were any lock failures, we fudge last_modified to always
    # be older than the server, which will force the next grokmirror run.
    if len(lock_fails):
        logger.info('%s repos could not be locked. Forcing next run.',
                    len(lock_fails))
        last_modified -= 1
    elif len(git_fails):
        logger.info('%s repos failed. Forcing next run.', len(git_fails))
        last_modified -= 1

    # Once we're done, save culled as our new manifest
    grokmirror.write_manifest(manifile, culled, mtime=last_modified,
                              pretty=pretty)

    grokmirror.manifest_unlock(manifile)

    # write out projects.list, if asked to
    write_projects_list(culled, config)

    return 127