Example #1
0
    def commits(self, pid=None, tid=None, cid=None, gid=None,
                start=0, limit=10,
                dfrom=None, dto=None, inc_merge_commit=None,
                inc_repos=None, metadata=None, exc_groups=None,
                inc_groups=None):
        """Return a page of commits matching the filters, ready for display.

        Every argument except ``start`` and ``limit`` is handed to
        ``utils.resolv_filters``; its result, completed with the
        ``start``/``limit`` pagination values, becomes the keyword
        arguments of ``Commits.get_commits``.

        Each commit dict found in the third element of the response is
        then modified in place:

        * ``metadata`` lists the commit keys that are not in ``PROPERTIES``;
        * ``repos`` loses its ``meta_ref: `` entries and is reduced to the
          last two ``:``-separated fields of each reference;
        * ``gitwebs`` and ``projects`` are computed from the references;
        * author/committer email and name are replaced by the values of
          the matching identity;
        * ``ttl`` (a number of seconds) becomes a readable duration;
        * ``author_gravatar``/``committer_gravatar`` (MD5 of the email) and
          ``cid``/``ccid`` (email encrypted with ``xorkey``) are added and
          the emails themselves are removed;
        * a ``commit_msg`` longer than 80 characters is cut to its first
          76 characters followed by ``...``.

        :return: the response of ``get_commits``, commits decorated.
        """
        # Function-scope import: the TTL conversion below needs timedelta
        # and the module-level imports are not visible from this method.
        from datetime import timedelta

        c = Commits(index.Connector())
        projects_index = Projects()
        idents = Contributors()

        query_kwargs = utils.resolv_filters(
            projects_index, idents, pid, tid, cid, gid,
            dfrom, dto, inc_repos, inc_merge_commit,
            metadata, exc_groups, inc_groups)
        query_kwargs.update(
            {'start': start, 'limit': limit})

        resp = c.get_commits(**query_kwargs)

        for cmt in resp[2]:
            # Get extra metadata keys (must run before keys are added below)
            extra = set(cmt.keys()) - set(PROPERTIES.keys())
            cmt['metadata'] = list(extra)
            cmt['repos'] = [r for r in cmt['repos']
                            if not r.startswith('meta_ref: ')]
            # Compute link to access commit diff based on the
            # URL template provided in projects.yaml
            cmt['gitwebs'] = [
                projects_index.get_gitweb_link(r) %
                {'sha': cmt['sha']} for r in cmt['repos']]
            cmt['projects'] = utils.get_projects_from_references(
                projects_index, cmt['repos'])
            # Also remove the URI part
            cmt['repos'] = [":".join(p.split(':')[-2:]) for
                            p in cmt['repos']]
            # Request the ident index to fetch author/committer name/email
            for elm in ('author', 'committer'):
                ident = list(idents.get_idents_by_emails(
                    cmt['%s_email' % elm]).values())[0]
                cmt['%s_email' % elm] = ident['default-email']
                if ident['name']:
                    cmt['%s_name' % elm] = ident['name']
            # Convert the TTL to something human readable. The duration is
            # built straight from the number of seconds: subtracting two
            # naive local datetimes is wrong by the UTC offset difference
            # whenever DST/zone rules differ between both instants.
            cmt['ttl'] = str(timedelta(seconds=cmt['ttl']))
            cmt['author_gravatar'] = \
                hashlib.md5(cmt['author_email'].encode(
                    errors='ignore')).hexdigest()
            cmt['committer_gravatar'] = \
                hashlib.md5(cmt['committer_email'].encode(
                    errors='ignore')).hexdigest()
            if len(cmt['commit_msg']) > 80:
                cmt['commit_msg'] = cmt['commit_msg'][0:76] + '...'
            # Add cid and ccid
            cmt['cid'] = utils.encrypt(xorkey, cmt['author_email'])
            cmt['ccid'] = utils.encrypt(xorkey, cmt['committer_email'])
            # Remove email details
            del cmt['author_email']
            del cmt['committer_email']
        return resp
Example #2
0
class RefsCleaner():
    """Remove from the index the commits of refs no longer configured.

    A ref is identified by the string ``<uri>:<repo id>:<branch>``. The
    refs already seen are read from a pickled set stored in the file
    ``conf.db_path/SEEN_REFS_CACHED``; the ones that the projects
    definition no longer mentions are the refs to clean.
    """

    def __init__(self, projects, con=None, config=None):
        """Prepare the cleaner.

        :param projects: object providing ``get_projects_raw()``
        :param con: index connector, ``index.Connector()`` when omitted
        :param config: optional value given to ``configuration.set_config``
        """
        if config:
            configuration.set_config(config)
        self.con = con if con else index.Connector()
        self.projects = projects
        self.c = Commits(self.con)
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)

    def _load_seen_refs(self):
        """Return the pickled set of seen refs, empty if absent/unreadable."""
        if not os.path.isfile(self.seen_refs_path):
            return set()
        try:
            # Binary mode is what pickle expects; 'with' closes the handle
            with open(self.seen_refs_path, 'rb') as fd:
                return cPickle.load(fd)
        except Exception:
            # Protect against corrupted file
            return set()

    def find_refs_to_clean(self):
        """Return the set of seen refs missing from the projects definition.

        As a side effect ``self.data`` is set to the seen refs loaded from
        the cache file (an empty set when the file is missing or corrupted).
        """
        refs_ids = set()
        for pdata in self.projects.get_projects_raw().values():
            for rid, repo in pdata['repos'].items():
                for branch in repo['branches']:
                    refs_ids.add('%s:%s:%s' % (repo['uri'], rid, branch))
        self.data = self._load_seen_refs()
        refs_to_clean = self.data - refs_ids
        logger.info("Found %s refs to clean." % len(refs_to_clean))
        return refs_to_clean

    def clean(self, refs):
        """Delete the commits attached to each ref, then forget the ref.

        :param refs: iterable of ref ids, usually the result of
            ``find_refs_to_clean`` (which also initialises ``self.data``)
        """
        # Do it by bulk of 10000 to not hurt memory
        bulk = 10000
        for ref in refs:
            # Find ref's Commits
            ids = [
                c['_id'] for c in self.c.get_commits(repos=[ref], scan=True)
            ]
            if ids:
                logger.info("Ref %s no longer referenced. Cleaning %s cmts." %
                            (ref, len(ids)))
                for i in range(0, len(ids), bulk):
                    delete_commits(self.c, ref, ids[i:i + bulk], ref)
            self.remove_from_seen_refs(ref)

    def remove_from_seen_refs(self, ref_id):
        """Drop ``ref_id`` from ``self.data`` and rewrite the cache file.

        Removing a ref that is not (or no longer) in the set is a no-op
        instead of a ``KeyError``; the file is rewritten in both cases.
        """
        self.data.discard(ref_id)
        with open(self.seen_refs_path, 'wb') as fd:
            cPickle.dump(self.data, fd)
Example #3
0
class RepoIndexer():
    """Index the commits and the tags of one Git repository.

    The repository ``uri`` is mirrored as a bare clone under
    ``conf.git_store/<name>/<uri with '/' replaced by '_'>``. The methods
    are meant to be chained; each one stores its result on the instance:

    * ``set_branch`` -> ``branch``, ``ref_id``
    * ``get_refs`` -> ``refs``; ``get_heads`` -> ``heads``;
      ``get_tags`` -> ``tags``
    * ``git_get_commit_obj`` -> ``commits``
    * ``get_current_commit_indexed`` -> ``already_indexed``
    * ``compute_to_index_to_delete`` -> ``to_index``, ``to_delete``
    """

    def __init__(self, name, uri, parsers=None, con=None, config=None):
        """Create the working directories and the index accessors.

        :param name: repository name
        :param uri: repository URI, used as the ``origin`` remote
        :param parsers: optional list of regex sources compiled by ``index``
        :param con: index connector, ``index.Connector()`` when omitted
        :param config: optional value given to ``configuration.set_config``
        """
        if config:
            configuration.set_config(config)
        self.con = con if con else index.Connector()
        self.c = Commits(self.con)
        self.t = Tags(self.con)
        if not os.path.isdir(conf.git_store):
            os.makedirs(conf.git_store)
        self.name = name
        self.uri = uri
        self.base_id = '%s:%s' % (self.uri, self.name)
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)
        self.parsers = parsers if parsers else []
        self.parsers_compiled = False
        self.local = os.path.join(conf.git_store, self.name,
                                  self.uri.replace('/', '_'))
        if not os.path.isdir(self.local):
            os.makedirs(self.local)
        self.credentials_helper_path = os.path.join(
            sys.prefix, 'bin', 'repoxplorer-git-credentials-helper')

    def __str__(self):
        # ref_id only exists once set_branch() has been called; fall back
        # on the repository id so printing the object never raises.
        return 'Git indexer of %s' % getattr(self, 'ref_id', self.base_id)

    def save_seen_ref_in_cache(self):
        """Add ``self.ref_id`` to the pickled set of seen refs."""
        # Keep a cache of each ref that has been indexed.
        # This is used later to discover seen refs no longer in
        # projects.yaml. In that case a removal from the backend will be
        # performed.
        logger.debug("Save ref %s into seen_refs file" % self.ref_id)
        data = set()
        if os.path.isfile(self.seen_refs_path):
            try:
                with open(self.seen_refs_path, 'rb') as fd:
                    data = cPickle.load(fd)
            except Exception:
                # Protect against corrupted file
                data = set()
        data.add(self.ref_id)
        with open(self.seen_refs_path, 'wb') as fd:
            cPickle.dump(data, fd)

    def set_branch(self, branch):
        """Select the branch to work on and record its ref as seen."""
        self.branch = branch
        self.ref_id = '%s:%s:%s' % (self.uri, self.name, self.branch)
        self.save_seen_ref_in_cache()

    def git_init(self):
        """Initialise the bare local clone and declare ``origin`` if needed."""
        logger.debug("Git init for %s:%s in %s" %
                     (self.uri, self.name, self.local))
        run(["git", "init", "--bare", "."], self.local)
        remotes = run(["git", "remote", "-v"], self.local)
        # Compare remote names exactly: a substring test on the whole
        # output also matches "origin" inside another remote name or URL.
        remote_names = [
            line.split()[0] for line in remotes.splitlines() if line.strip()
        ]
        if "origin" not in remote_names:
            run(["git", "remote", "add", "origin", self.uri], self.local)

    def git_fetch_branch(self):
        """Fetch ``self.branch`` from ``origin`` into the local clone."""
        logger.debug("Fetch %s %s:%s" % (self.name, self.uri, self.branch))
        run([
            "git", "-c",
            "credential.helper=%s" % self.credentials_helper_path, "fetch",
            "-nk", "origin",
            "+%s:%s" % (self.branch, self.branch)
        ], self.local)

    def get_refs(self):
        """Store in ``self.refs`` the tab-split lines of ``git ls-remote``."""
        refs = run([
            "git", "-c",
            "credential.helper=%s" % self.credentials_helper_path, "ls-remote",
            "origin"
        ], self.local).splitlines()
        self.refs = [r.split('\t') for r in refs]

    def get_heads(self):
        """Store in ``self.heads`` the refs located under ``refs/heads/``."""
        # A real list (not a lazy filter object) so it can be iterated
        # several times and measured with len().
        self.heads = [x for x in self.refs if x[1].startswith('refs/heads/')]

    def get_tags(self):
        """Store in ``self.tags`` the refs located under ``refs/tags/``."""
        self.tags = [x for x in self.refs if x[1].startswith('refs/tags/')]

    def git_get_commit_obj(self):
        """Store in ``self.commits`` the result of ``get_all_shas``."""
        self.commits = get_all_shas(self.local)

    def run_workers(self, shas, workers):
        """Process ``shas`` by chunks of 1000 in a pool of processes.

        :param shas: list of commit ids; it is not modified
        :param workers: pool size, 0 meaning CPU count minus one (min. 1)
        """
        BULK_CHUNK = 1000
        if workers == 0:
            # Default value (auto)
            workers = mp.cpu_count() - 1 or 1
        # An empty input still yields one (empty) chunk
        to_process = [
            shas[i:i + BULK_CHUNK] for i in range(0, len(shas), BULK_CHUNK)
        ] or [shas]
        options = [(self.local, self.ref_id, stp) for stp in to_process]
        worker_pool = mp.Pool(workers)
        try:
            worker_pool.map(process_commits, options)
        finally:
            # Release the worker processes even when a chunk failed
            worker_pool.terminate()
            worker_pool.join()

    def _branch_head(self):
        """Return the ``[sha, ref name]`` entry of the current branch.

        An exact match of the ref name is preferred so that branch ``x``
        is not confused with ``a-x``; the suffix match is kept as a
        fallback. Raises ``IndexError`` when no head matches.
        """
        wanted = (self.branch, 'refs/heads/%s' % self.branch)
        exact = [head for head in self.heads if head[1] in wanted]
        if exact:
            return exact[0]
        return [
            head for head in self.heads if head[1].endswith(self.branch)
        ][0]

    def is_branch_fully_indexed(self):
        """Tell whether the branch tip is indexed with this ref attached."""
        branch_tip_sha = self._branch_head()[0]
        cmt = self.c.get_commit(branch_tip_sha, silent=True)
        if cmt and self.ref_id in cmt['repos']:
            return True
        return False

    def get_current_commit_indexed(self):
        """ Fetch from the index commits mentionned for this repo
        and branch.
        """
        self.already_indexed = [
            c['_id']
            for c in self.c.get_commits(repos=[self.ref_id], scan=True)
        ]
        logger.debug(
            "%s: In the DB - repo history is composed of %s commits." %
            (self.name, len(self.already_indexed)))

    def compute_to_index_to_delete(self):
        """ Compute the list of commits (sha) to index and the
        list to delete from the index.
        """
        logger.debug("%s: Upstream - repo history is composed of %s commits." %
                     (self.name, len(self.commits)))
        indexed = set(self.already_indexed)
        upstream = set(self.commits)
        self.to_delete = indexed - upstream
        self.to_index = upstream - indexed
        logger.debug("%s: Indexer will reference %s commits." %
                     (self.name, len(self.to_index)))
        logger.debug("%s: Indexer will dereference %s commits." %
                     (self.name, len(self.to_delete)))

    def compute_to_create_to_update(self):
        """Split ``self.to_index`` between unknown and already known commits.

        :return: ``(to_create, to_update)``: the ids not found in the index
            and the ``_source`` documents of the ids that were found.
        """
        if not self.to_index:
            return [], []
        res = self.c.get_commits_by_id(list(self.to_index))
        to_update = [
            c['_source'] for c in res['docs'] if c['found'] is True
        ]
        to_create = [c['_id'] for c in res['docs'] if c['found'] is False]
        return to_create, to_update

    def index_tags(self):
        """Synchronise the indexed tags of the repo with ``self.tags``.

        Tags whose commit is found in the index and that are not yet
        referenced are added; referenced tags that no longer exist
        upstream are deleted.
        """
        def short_name(name):
            return name.replace('refs/tags/', '').replace('^{}', '')

        def c_tid(t):
            return "%s%s%s" % (t['sha'], t['name'].replace('refs/tags/',
                                                           ''), t['repo'])

        if not self.tags:
            logger.debug('%s: no tags detected for this repository' %
                         (self.name))
            return
        logger.debug('%s: %s tags exist upstream' %
                     (self.name, len(self.tags)))
        tags = self.t.get_tags([self.base_id])
        existing = dict((c_tid(t['_source']), t['_id']) for t in tags)
        logger.debug('%s: %s tags already referenced' %
                     (self.name, len(existing)))
        # Some commits may be not found because it is possible the branches
        # has not been indexed.
        commits = [
            c['_source']
            for c in self.c.get_commits_by_id([t[0]
                                               for t in self.tags])['docs']
            if c['found']
        ]
        lookup = dict((c['sha'], c['committer_date']) for c in commits)
        # Built once as a set instead of once per existing tag as a list
        upstream = set(
            "%s%s%s" % (sha, short_name(name), self.base_id)
            for sha, name in self.tags)
        to_delete = [v for k, v in existing.items() if k not in upstream]
        docs = []
        for sha, name in self.tags:
            if sha not in lookup:
                continue
            doc = {
                'name': short_name(name),
                'sha': sha,
                'date': lookup[sha],
                'repo': self.base_id,
            }
            if c_tid(doc) in existing:
                continue
            docs.append(doc)
        if docs:
            logger.info('%s: %s tags will be indexed' % (self.name, len(docs)))
            self.t.add_tags(docs)
        if to_delete:
            logger.info('%s: %s tags will be deleted' %
                        (self.name, len(to_delete)))
            self.t.del_tags(to_delete)

    def index(self, extract_workers=1):
        """Apply ``to_delete``/``to_index`` computed for the current ref.

        :param extract_workers: pool size given to ``run_workers``
        """
        # Compile the parsers (only once per instance)
        if self.parsers and not self.parsers_compiled:
            self.parsers = [re.compile(parser) for parser in self.parsers]
            logger.debug("%s: Prepared %s regex parsers for commit msgs" %
                         (self.name, len(self.parsers)))
            self.parsers_compiled = True

        # check whether a commit should be completly deleted or
        # updated by removing the repo from the repos field
        if self.to_delete:
            delete_commits(self.c, self.name, self.to_delete, self.ref_id)

        # check whether a commit should be created or
        # updated by adding the repo into the repos field
        if self.to_index:
            to_create, to_update = self.compute_to_create_to_update()

            if to_create:
                logger.info("%s: %s commits will be created ..." %
                            (self.name, len(to_create)))
                self.run_workers(to_create, extract_workers)

            if to_update:
                logger.info(
                    "%s: %s commits already indexed and need to be updated" %
                    (self.name, len(to_update)))
                for c in to_update:
                    c['repos'].append(self.ref_id)
                self.c.update_commits(to_update)
Example #4
0
class RefsCleaner():
    """Remove from the index what belongs to refs no longer configured.

    A ref id (``fullrid``) ends with ``:<branch>``; the part before that
    last field is the repository id (``shortrid``/``base_id``). Seen refs
    are read from the pickled set stored in
    ``conf.db_path/SEEN_REFS_CACHED``.
    """

    def __init__(self, projects, con=None):
        """Prepare the cleaner.

        :param projects: object providing ``get_projects(source=...)``
        :param con: index connector, ``index.Connector()`` when omitted
        """
        self.con = con if con else index.Connector()
        self.projects = projects
        self.c = Commits(self.con)
        self.t = Tags(
            index.Connector(index=self.con.index, index_suffix='tags'))
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)
        # Repository ids still configured, filled by find_refs_to_clean()
        self.current_base_ids = set()

    def _load_seen_refs(self):
        """Return the pickled set of seen refs, empty if absent/unreadable."""
        if not os.path.isfile(self.seen_refs_path):
            return set()
        try:
            with open(self.seen_refs_path, 'rb') as fd:
                return pickle.load(fd)
        except Exception:
            # Protect against corrupted file
            return set()

    def find_refs_to_clean(self):
        """Return the set of seen refs missing from the projects definition.

        Side effects: ``self.current_base_ids`` receives the ``shortrid``
        of every configured ref and ``self.data`` is set to the seen refs
        loaded from the cache file.
        """
        projects = self.projects.get_projects(source=['refs'])
        refs_ids = set()
        for project in projects.values():
            for ref in project['refs']:
                self.current_base_ids.add(ref['shortrid'])
                refs_ids.add(ref['fullrid'])
        self.data = self._load_seen_refs()
        refs_to_clean = self.data - refs_ids
        if refs_to_clean:
            logger.info("Found %s refs to clean." % len(refs_to_clean))
        return refs_to_clean

    def clean_tags(self, base_id):
        """Delete every indexed tag of the repository ``base_id``."""
        # Tags are indexed by repos (base_id) not by ref (ref_id)
        tags = self.t.get_tags([base_id])
        ids = [t['_id'] for t in tags]
        if ids:
            logger.info("Repo %s no longer referenced. Cleaning %s tags" %
                        (base_id, len(ids)))
            self.t.del_tags(ids)

    def clean_ref_cmts(self, ref):
        """Delete the commits attached to ``ref``.

        When the ref has no commit at all it is directly removed from the
        seen refs.
        """
        # Find ref's Commits
        ids = [c['_id'] for c in self.c.get_commits(repos=[ref], scan=True)]
        if not ids:
            self.remove_from_seen_refs(ref)
            return
        logger.info("Ref %s no longer referenced. Cleaning %s cmts." %
                    (ref, len(ids)))
        # Do it by bulk of 10000 to not hurt memory
        bulk = 10000
        for i in range(0, len(ids), bulk):
            delete_commits(self.c, ref, ids[i:i + bulk], ref)

    def clean(self, refs):
        """Clean commits of ``refs`` then tags of repos left without ref.

        :param refs: iterable of ref ids, usually the result of
            ``find_refs_to_clean`` (which also initialises ``self.data``)
        """
        base_ids = set()
        for ref in refs:
            self.clean_ref_cmts(ref)
            self.remove_from_seen_refs(ref)
            # Strip only the trailing ":<branch>" field. str.replace would
            # also eat an identical ":<name>" earlier in the id (for
            # instance a repository named like its branch).
            base_id = ref.rsplit(':', 1)[0]
            if base_id not in self.current_base_ids:
                base_ids.add(base_id)
        for base_id in base_ids:
            self.clean_tags(base_id)

    def remove_from_seen_refs(self, ref_id):
        """Drop ``ref_id`` from ``self.data`` and rewrite the cache file.

        The removal is idempotent: ``clean`` reaches this method twice for
        a ref without commits (once through ``clean_ref_cmts``), which
        raised ``KeyError`` with ``set.remove``.
        """
        # Remove from the struct to be dumped
        self.data.discard(ref_id)
        with open(self.seen_refs_path, 'wb') as fd:
            pickle.dump(self.data, fd)
Example #5
0
class RepoIndexer():
    """Index the commits and the tags of one Git repository.

    The repository ``uri`` is mirrored as a bare clone under
    ``conf.git_store/<name>/<uri with '/' replaced by '_'>``. The methods
    are meant to be chained; each one stores its result on the instance:

    * ``set_branch`` -> ``branch``, ``ref_id``
    * ``get_refs`` -> ``refs``; ``get_heads`` -> ``heads``;
      ``get_tags`` -> ``tags``
    * ``git_get_commit_obj`` -> ``commits``
    * ``get_current_commits_indexed`` -> ``already_indexed``
    * ``compute_to_index_to_delete`` -> ``to_index``, ``to_delete``
    """

    def __init__(self, name, uri, parsers=None,
                 con=None, meta_ref=None):
        """Create the working directories and the index accessors.

        :param name: repository name
        :param uri: repository URI, used as the ``origin`` remote
        :param parsers: optional list of regex sources compiled by ``index``
        :param con: index connector, ``index.Connector()`` when omitted
        :param meta_ref: optional value stored as ``meta_ref: <value>`` and
            passed to the workers next to the ref id
        """
        self.con = con if con else index.Connector()
        self.c = Commits(self.con)
        self.t = Tags(index.Connector(
            index=self.con.index, index_suffix='tags'))
        if not os.path.isdir(conf.git_store):
            os.makedirs(conf.git_store)
        self.name = name
        self.uri = uri
        self.base_id = '%s:%s' % (self.uri, self.name)
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)
        self.meta_ref = 'meta_ref: %s' % meta_ref if meta_ref else None

        self.parsers = parsers if parsers else []
        self.parsers_compiled = False
        self.local = os.path.join(conf.git_store,
                                  self.name,
                                  self.uri.replace('/', '_'))
        if not os.path.isdir(self.local):
            os.makedirs(self.local)

        # The configured helper is only accepted as an absolute path to
        # an existing file
        helper = getattr(conf, 'git_credential_helper_path', None)
        if not (helper and helper.startswith('/') and
                os.path.isfile(helper)):
            if helper:
                logger.warning(
                    'Configured git_credential_helper %s not found' % (
                        helper))
            helper = None
        # Look at the default installation pathes
        if not helper:
            helper = os.path.join(
                sys.prefix, 'bin', 'repoxplorer-git-credentials-helper')
            if not os.path.isfile(helper):
                helper = shutil.which(
                    'repoxplorer-git-credentials-helper')
            if not helper:
                logger.warning(
                    'Default repoxplorer-git-credential-helper command '
                    'not found')
        self.credentials_helper_path = helper

    def __str__(self):
        # ref_id only exists once set_branch() has been called; fall back
        # on the repository id so printing the object never raises.
        return 'Git indexer of %s' % getattr(self, 'ref_id', self.base_id)

    def _git_cmd(self, *args):
        """Return a git command line using the credential helper, if any.

        Without a usable helper the ``-c credential.helper=`` option is
        left out rather than set to the literal string ``None``.
        """
        cmd = ["git"]
        if self.credentials_helper_path:
            cmd.extend([
                "-c",
                "credential.helper=%s" % self.credentials_helper_path])
        cmd.extend(args)
        return cmd

    def save_seen_ref_in_cache(self):
        """Add ``self.ref_id`` to the pickled set of seen refs."""
        # Keep a cache of each ref that has been indexed.
        # This is used later to discover seen refs no longer in
        # projects.yaml. In that case a removal from the backend will be
        # performed.
        logger.debug("Save ref %s into seen_refs file" % self.ref_id)
        data = set()
        if os.path.isfile(self.seen_refs_path):
            try:
                with open(self.seen_refs_path, 'rb') as fd:
                    data = pickle.load(fd)
            except Exception:
                # Protect against corrupted file
                data = set()
        data.add(self.ref_id)
        with open(self.seen_refs_path, 'wb') as fd:
            pickle.dump(data, fd)

    def set_branch(self, branch):
        """Select the branch to work on and record its ref as seen."""
        self.branch = branch
        self.ref_id = '%s:%s:%s' % (self.uri, self.name, self.branch)
        self.save_seen_ref_in_cache()

    def git_init(self):
        """Initialise the bare local clone and declare ``origin`` if needed."""
        logger.debug("Git init for %s:%s in %s" % (
            self.uri, self.name, self.local))
        run(["git", "init", "--bare", "."], self.local)
        remotes = run(["git", "remote", "-v"], self.local)
        # Blank lines are skipped: split()[0] would fail on them
        remote_names = [
            line.split()[0] for line in remotes.splitlines() if line.strip()]
        if "origin" not in remote_names:
            run(["git", "remote", "add", "origin", self.uri], self.local)

    def git_fetch_branch(self):
        """Fetch ``self.branch`` from ``origin`` into the local clone."""
        logger.debug("Fetch %s %s:%s" % (self.name, self.uri,
                                         self.branch))
        run(self._git_cmd(
            "fetch", "-nk", "origin", "+%s:%s" % (self.branch, self.branch)),
            self.local)

    def get_refs(self):
        """Store in ``self.refs`` the tab-split lines of ``git ls-remote``."""
        refs = run(
            self._git_cmd("ls-remote", "origin"), self.local).splitlines()
        self.refs = [r.split('\t') for r in refs]

    def get_heads(self):
        """Store in ``self.heads`` the refs located under ``refs/heads/``."""
        self.heads = [x for x in self.refs if x[1].startswith('refs/heads/')]

    def get_tags(self):
        """Store in ``self.tags`` the refs located under ``refs/tags/``."""
        self.tags = [x for x in self.refs if x[1].startswith('refs/tags/')]

    def git_get_commit_obj(self):
        """Store in ``self.commits`` the result of ``get_all_shas``."""
        self.commits = get_all_shas(self.local)

    def run_workers(self, shas, workers):
        """Process ``shas`` by chunks of 1000 in a pool of processes.

        :param shas: list of commit ids; it is not modified
        :param workers: pool size, 0 meaning CPU count minus one (min. 1)
        """
        BULK_CHUNK = 1000
        if workers == 0:
            # Default value (auto)
            workers = mp.cpu_count() - 1 or 1
        # An empty input still yields one (empty) chunk
        to_process = [
            shas[i:i + BULK_CHUNK] for i in range(0, len(shas), BULK_CHUNK)
        ] or [shas]
        ref_ids = [self.ref_id]
        if self.meta_ref:
            ref_ids.append(self.meta_ref)
        options = [
            (self.local, ref_ids, stp) for stp in to_process]
        worker_pool = mp.Pool(workers)
        try:
            worker_pool.map(process_commits, options)
        finally:
            # Release the worker processes even when a chunk failed
            worker_pool.terminate()
            worker_pool.join()

    def _branch_head(self):
        """Return the ``[sha, ref name]`` entry of the current branch.

        An exact match of the ref name is preferred so that branch ``x``
        is not confused with ``a-x``; the suffix match is kept as a
        fallback. Raises ``IndexError`` when no head matches.
        """
        wanted = (self.branch, 'refs/heads/%s' % self.branch)
        exact = [head for head in self.heads if head[1] in wanted]
        if exact:
            return exact[0]
        return [head for head in self.heads if
                head[1].endswith(self.branch)][0]

    def is_branch_fully_indexed(self):
        """Tell whether the first commit returned for the ref is the tip.

        The index is asked for one commit of ``self.ref_id``; the branch
        is considered indexed when its sha equals the upstream branch tip.
        """
        branch_tip_sha = self._branch_head()[0]
        _, _, cmts_list = self.c.get_commits(repos=[self.ref_id], limit=1)
        if not cmts_list:
            return False
        return branch_tip_sha == cmts_list[0]['sha']

    def get_current_commits_indexed(self):
        """ Fetch from the index commits mentionned for this repo
        and branch.
        """
        self.already_indexed = [c['_id'] for c in
                                self.c.get_commits(repos=[self.ref_id],
                                                   scan=True)]
        logger.debug(
            "%s: In the DB - repo history is composed of %s commits." % (
                self.name, len(self.already_indexed)))

    def compute_to_index_to_delete(self):
        """ Compute the list of commits (sha) to index and the
        list to delete from the index.
        """
        logger.debug(
            "%s: Upstream - repo history is composed of %s commits." % (
                self.name, len(self.commits)))
        indexed = set(self.already_indexed)
        upstream = set(self.commits)
        self.to_delete = indexed - upstream
        self.to_index = upstream - indexed
        logger.debug(
            "%s: Indexer will reference %s commits." % (
                self.name,
                len(self.to_index)))
        logger.debug(
            "%s: Indexer will dereference %s commits." % (
                self.name,
                len(self.to_delete)))

    def compute_to_create_to_update(self):
        """Split ``self.to_index`` between unknown and already known commits.

        :return: ``(to_create, to_update)``: the ids not found in the index
            and the ``_source`` documents of the ids that were found.
        """
        if not self.to_index:
            return [], []
        res = self.c.get_commits_by_id(list(self.to_index))
        to_update = [c['_source'] for
                     c in res['docs'] if c['found'] is True]
        to_create = [c['_id'] for
                     c in res['docs'] if c['found'] is False]
        return to_create, to_update

    def index_tags(self):
        """Synchronise the indexed tags of the repo with ``self.tags``.

        Tags whose commit is found in the index and that are not yet
        referenced are added; referenced tags that no longer exist
        upstream are deleted.
        """
        def short_name(name):
            return name.replace('refs/tags/', '').replace('^{}', '')

        def c_tid(t):
            return "%s%s%s" % (t['sha'],
                               t['name'].replace('refs/tags/', ''),
                               t['repo'])
        if not self.tags:
            logger.debug('%s: no tags detected for this repository' % (
                         self.name))
            return
        logger.debug('%s: %s tags exist upstream' % (
                     self.name, len(self.tags)))
        tags = self.t.get_tags([self.base_id])
        existing = {c_tid(t['_source']): t['_id'] for t in tags}
        logger.debug('%s: %s tags already referenced' % (
                     self.name, len(existing)))
        # Some commits may be not found because it is possible the branches
        # has not been indexed.
        commits = [c['_source'] for c in self.c.get_commits_by_id(
                   [t[0] for t in self.tags])['docs'] if c['found']]
        lookup = {c['sha']: c['committer_date'] for c in commits}
        # Built once as a set instead of once per existing tag as a list
        upstream = {
            "%s%s%s" % (sha, short_name(name), self.base_id)
            for sha, name in self.tags}
        to_delete = [v for k, v in existing.items() if k not in upstream]
        docs = []
        for sha, name in self.tags:
            if sha not in lookup:
                continue
            doc = {
                'name': short_name(name),
                'sha': sha,
                'date': lookup[sha],
                'repo': self.base_id,
            }
            if c_tid(doc) in existing:
                continue
            docs.append(doc)
        if docs:
            logger.info('%s: %s tags will be indexed' % (
                        self.name, len(docs)))
            self.t.add_tags(docs)
        if to_delete:
            logger.info('%s: %s tags will be deleted' % (
                        self.name, len(to_delete)))
            self.t.del_tags(to_delete)

    def index(self, extract_workers=1):
        """Apply ``to_delete``/``to_index`` computed for the current ref.

        :param extract_workers: pool size given to ``run_workers``
        """
        # Compile the parsers (only once per instance)
        if self.parsers and not self.parsers_compiled:
            self.parsers = [re.compile(parser) for parser in self.parsers]
            logger.debug(
                "%s: Prepared %s regex parsers for commit msgs" % (
                    self.name, len(self.parsers)))
            self.parsers_compiled = True

        # check whether a commit should be completly deleted or
        # updated by removing the repo from the repos field
        if self.to_delete:
            delete_commits(self.c, self.name, self.to_delete, self.ref_id)

        # check whether a commit should be created or
        # updated by adding the repo into the repos field
        if self.to_index:
            to_create, to_update = self.compute_to_create_to_update()

            if to_create:
                logger.info("%s: %s commits will be created ..." % (
                    self.name, len(to_create)))
                self.run_workers(to_create, extract_workers)

            if to_update:
                logger.info(
                    "%s: %s commits already indexed and need to be updated" % (
                        self.name, len(to_update)))
                for c in to_update:
                    c['repos'].append(self.ref_id)
                self.c.update_commits(to_update)
Example #6
0
class RefsCleaner():
    """Remove from the index what belongs to refs no longer configured.

    A ref id (``fullrid``) ends with ``:<branch>``; the part before that
    last field is the repository id (``shortrid``/``base_id``). Seen refs
    are read from the pickled set stored in
    ``conf.db_path/SEEN_REFS_CACHED``.
    """

    def __init__(self, projects, con=None):
        """Prepare the cleaner.

        :param projects: object providing ``get_projects(source=...)``
        :param con: index connector, ``index.Connector()`` when omitted
        """
        self.con = con if con else index.Connector()
        self.projects = projects
        self.c = Commits(self.con)
        self.t = Tags(index.Connector(
            index=self.con.index, index_suffix='tags'))
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)
        # Repository ids still configured, filled by find_refs_to_clean()
        self.current_base_ids = set()

    def _read_seen_refs(self):
        """Return the pickled set of seen refs, empty if absent/unreadable."""
        if not os.path.isfile(self.seen_refs_path):
            return set()
        try:
            with open(self.seen_refs_path, 'rb') as cache:
                return pickle.load(cache)
        except Exception:
            # Protect against corrupted file
            return set()

    def find_refs_to_clean(self):
        """Return the set of seen refs missing from the projects definition.

        Side effects: ``self.current_base_ids`` receives the ``shortrid``
        of every configured ref and ``self.data`` is set to the seen refs
        loaded from the cache file.
        """
        projects = self.projects.get_projects(source=['refs'])
        refs_ids = set()
        for project in projects.values():
            for ref in project['refs']:
                self.current_base_ids.add(ref['shortrid'])
                refs_ids.add(ref['fullrid'])
        self.data = self._read_seen_refs()
        refs_to_clean = self.data - refs_ids
        if refs_to_clean:
            logger.info("Found %s refs to clean." % len(refs_to_clean))
        return refs_to_clean

    def clean_tags(self, base_id):
        """Delete every indexed tag of the repository ``base_id``."""
        # Tags are indexed by repos (base_id) not by ref (ref_id)
        ids = [t['_id'] for t in self.t.get_tags([base_id])]
        if ids:
            logger.info("Repo %s no longer referenced. Cleaning %s tags" % (
                base_id, len(ids)))
            self.t.del_tags(ids)

    def clean_ref_cmts(self, ref):
        """Delete the commits attached to ``ref``.

        When the ref has no commit at all it is directly removed from the
        seen refs.
        """
        # Find ref's Commits
        ids = [c['_id'] for c in
               self.c.get_commits(repos=[ref], scan=True)]
        if not ids:
            self.remove_from_seen_refs(ref)
            return
        logger.info("Ref %s no longer referenced. Cleaning %s cmts." %
                    (ref, len(ids)))
        # Do it by bulk of 10000 to not hurt memory
        bulk = 10000
        for start in range(0, len(ids), bulk):
            delete_commits(self.c, ref, ids[start:start + bulk], ref)

    def clean(self, refs):
        """Clean commits of ``refs`` then tags of repos left without ref.

        :param refs: iterable of ref ids, usually the result of
            ``find_refs_to_clean`` (which also initialises ``self.data``)
        """
        base_ids = set()
        for ref in refs:
            self.clean_ref_cmts(ref)
            self.remove_from_seen_refs(ref)
            # Only the trailing ":<branch>" field must go. str.replace
            # removed every occurrence of it, corrupting the id of a
            # repository whose name starts like the branch name.
            base_id = ref.rsplit(':', 1)[0]
            if base_id not in self.current_base_ids:
                base_ids.add(base_id)
        for base_id in base_ids:
            self.clean_tags(base_id)

    def remove_from_seen_refs(self, ref_id):
        """Drop ``ref_id`` from ``self.data`` and rewrite the cache file.

        The removal is idempotent: ``clean`` reaches this method twice for
        a ref without commits (once through ``clean_ref_cmts``), which
        raised ``KeyError`` with ``set.remove``.
        """
        # Remove from the struct to be dumped
        self.data.discard(ref_id)
        with open(self.seen_refs_path, 'wb') as cache:
            pickle.dump(self.data, cache)