def commits(self, pid=None, tid=None, cid=None, gid=None,
            start=0, limit=10, dfrom=None, dto=None,
            inc_merge_commit=None, inc_repos=None, metadata=None,
            exc_groups=None, inc_groups=None):
    c = Commits(index.Connector())
    projects_index = Projects()
    idents = Contributors()
    query_kwargs = utils.resolv_filters(
        projects_index, idents, pid, tid, cid, gid,
        dfrom, dto, inc_repos, inc_merge_commit,
        metadata, exc_groups, inc_groups)
    query_kwargs.update({'start': start, 'limit': limit})
    resp = c.get_commits(**query_kwargs)
    for cmt in resp[2]:
        # Get extra metadata keys
        extra = set(cmt.keys()) - set(PROPERTIES.keys())
        cmt['metadata'] = list(extra)
        cmt['repos'] = [
            r for r in cmt['repos'] if not r.startswith('meta_ref: ')]
        # Compute link to access commit diff based on the
        # URL template provided in projects.yaml
        cmt['gitwebs'] = [
            projects_index.get_gitweb_link(r) % {'sha': cmt['sha']}
            for r in cmt['repos']]
        cmt['projects'] = utils.get_projects_from_references(
            projects_index, cmt['repos'])
        # Also remove the URI part
        cmt['repos'] = [":".join(p.split(':')[-2:]) for p in cmt['repos']]
        # Request the ident index to fetch author/committer name/email
        for elm in ('author', 'committer'):
            ident = list(idents.get_idents_by_emails(
                cmt['%s_email' % elm]).values())[0]
            cmt['%s_email' % elm] = ident['default-email']
            if ident['name']:
                cmt['%s_name' % elm] = ident['name']
        # Convert the TTL to something human readable
        cmt['ttl'] = str(datetime.fromtimestamp(cmt['ttl']) -
                         datetime.fromtimestamp(0))
        cmt['author_gravatar'] = hashlib.md5(
            cmt['author_email'].encode(errors='ignore')).hexdigest()
        cmt['committer_gravatar'] = hashlib.md5(
            cmt['committer_email'].encode(errors='ignore')).hexdigest()
        if len(cmt['commit_msg']) > 80:
            cmt['commit_msg'] = cmt['commit_msg'][0:76] + '...'
        # Add cid and ccid
        cmt['cid'] = utils.encrypt(xorkey, cmt['author_email'])
        cmt['ccid'] = utils.encrypt(xorkey, cmt['committer_email'])
        # Remove email details
        del cmt['author_email']
        del cmt['committer_email']
    return resp
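
# Usage sketch (illustrative, with assumptions): commits() reads like a
# web controller endpoint ('self', pagination kwargs) returning the raw
# get_commits() response, whose third element is the list of enriched
# commit dicts consumed above. 'controller' stands for an instance of
# the enclosing controller class, which is not shown in this module.
def _example_list_commits(controller):
    resp = controller.commits(pid='myproject', start=0, limit=5)
    for cmt in resp[2]:
        print(cmt['sha'], cmt['commit_msg'], cmt['projects'])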
class RefsCleaner():
    def __init__(self, projects, con=None, config=None):
        if config:
            configuration.set_config(config)
        if not con:
            self.con = index.Connector()
        else:
            self.con = con
        self.projects = projects
        self.c = Commits(self.con)
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)

    def find_refs_to_clean(self):
        prjs = self.projects.get_projects_raw()
        refs_ids = set()
        for pid, pdata in prjs.items():
            for rid, repo in pdata['repos'].items():
                for branch in repo['branches']:
                    refs_ids.add('%s:%s:%s' % (repo['uri'], rid, branch))
        if not os.path.isfile(self.seen_refs_path):
            self.data = set()
        else:
            try:
                self.data = cPickle.load(file(self.seen_refs_path))
            except Exception:
                # Protect against a corrupted file
                self.data = set()
        refs_to_clean = self.data - refs_ids
        logger.info("Found %s refs to clean." % len(refs_to_clean))
        return refs_to_clean

    def clean(self, refs):
        for ref in refs:
            # Find the ref's commits
            ids = [c['_id'] for c in
                   self.c.get_commits(repos=[ref], scan=True)]
            if not ids:
                self.remove_from_seen_refs(ref)
                continue
            logger.info("Ref %s no longer referenced. Cleaning %s cmts." %
                        (ref, len(ids)))
            # Process by bulks of 10000 to limit memory usage
            bulk = 10000
            i = 0
            while True:
                _ids = ids[i:i + bulk]
                if not _ids:
                    break
                else:
                    delete_commits(self.c, ref, _ids, ref)
                    i += bulk
            self.remove_from_seen_refs(ref)

    def remove_from_seen_refs(self, ref_id):
        self.data.remove(ref_id)
        cPickle.dump(self.data, file(self.seen_refs_path, 'w'))
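
# Usage sketch (illustrative, with assumptions): a cleanup pass as an
# indexer run might perform it. 'Projects' is assumed to be the
# projects.yaml index class used elsewhere in this module; a reachable
# backend and a populated seen-refs cache file are required.
def _example_refs_cleanup():
    cleaner = RefsCleaner(Projects())
    refs_to_clean = cleaner.find_refs_to_clean()
    cleaner.clean(refs_to_clean)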
class RepoIndexer():
    def __init__(self, name, uri, parsers=None, con=None, config=None):
        if config:
            configuration.set_config(config)
        if not con:
            self.con = index.Connector()
        else:
            self.con = con
        self.c = Commits(self.con)
        self.t = Tags(self.con)
        if not os.path.isdir(conf.git_store):
            os.makedirs(conf.git_store)
        self.name = name
        self.uri = uri
        self.base_id = '%s:%s' % (self.uri, self.name)
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)
        if not parsers:
            self.parsers = []
        else:
            self.parsers = parsers
        self.parsers_compiled = False
        self.local = os.path.join(conf.git_store, self.name,
                                  self.uri.replace('/', '_'))
        if not os.path.isdir(self.local):
            os.makedirs(self.local)
        self.credentials_helper_path = os.path.join(
            sys.prefix, 'bin', 'repoxplorer-git-credentials-helper')

    def __str__(self):
        return 'Git indexer of %s' % self.ref_id

    def save_seen_ref_in_cache(self):
        # Keep a cache of each ref that has been indexed. It is used
        # later to discover seen refs that are no longer in
        # projects.yaml; in that case a removal from the backend will
        # be performed.
        logger.debug("Save ref %s into seen_refs file" % self.ref_id)
        if not os.path.isfile(self.seen_refs_path):
            data = set()
        else:
            try:
                data = cPickle.load(file(self.seen_refs_path))
            except Exception:
                # Protect against a corrupted file
                data = set()
        data.add(self.ref_id)
        cPickle.dump(data, file(self.seen_refs_path, 'w'))

    def set_branch(self, branch):
        self.branch = branch
        self.ref_id = '%s:%s:%s' % (self.uri, self.name, self.branch)
        self.save_seen_ref_in_cache()

    def git_init(self):
        logger.debug("Git init for %s:%s in %s" %
                     (self.uri, self.name, self.local))
        run(["git", "init", "--bare", "."], self.local)
        if "origin" not in run(["git", "remote", "-v"], self.local):
            run(["git", "remote", "add", "origin", self.uri], self.local)

    def git_fetch_branch(self):
        logger.debug("Fetch %s %s:%s" % (self.name, self.uri, self.branch))
        run(["git", "-c",
             "credential.helper=%s" % self.credentials_helper_path,
             "fetch", "-nk", "origin",
             "+%s:%s" % (self.branch, self.branch)], self.local)

    def get_refs(self):
        refs = run(["git", "-c",
                    "credential.helper=%s" % self.credentials_helper_path,
                    "ls-remote", "origin"], self.local).splitlines()
        self.refs = []
        for r in refs:
            self.refs.append(r.split('\t'))

    def get_heads(self):
        self.heads = filter(lambda x: x[1].startswith('refs/heads/'),
                            self.refs)

    def get_tags(self):
        self.tags = filter(lambda x: x[1].startswith('refs/tags/'),
                           self.refs)

    def git_get_commit_obj(self):
        self.commits = get_all_shas(self.local)

    def run_workers(self, shas, workers):
        BULK_CHUNK = 1000
        to_process = []
        if workers == 0:
            # Default value (auto)
            workers = mp.cpu_count() - 1 or 1
        while True:
            try:
                shas[BULK_CHUNK]
                to_process.append(shas[:BULK_CHUNK])
                del shas[:BULK_CHUNK]
            except IndexError:
                # Add the rest
                to_process.append(shas)
                break
        options = [(self.local, self.ref_id, stp) for stp in to_process]
        worker_pool = mp.Pool(workers)
        worker_pool.map(process_commits, options)
        worker_pool.terminate()
        worker_pool.join()

    def is_branch_fully_indexed(self):
        branch = [head for head in self.heads
                  if head[1].endswith(self.branch)][0]
        branch_tip_sha = branch[0]
        cmt = self.c.get_commit(branch_tip_sha, silent=True)
        if cmt and self.ref_id in cmt['repos']:
            return True
        return False

    def get_current_commit_indexed(self):
        """ Fetch from the index the commits mentioned for this
        repo and branch.
        """
        self.already_indexed = [c['_id'] for c in
                                self.c.get_commits(repos=[self.ref_id],
                                                   scan=True)]
        logger.debug(
            "%s: In the DB - repo history is composed of %s commits." %
            (self.name, len(self.already_indexed)))

    def compute_to_index_to_delete(self):
        """ Compute the list of commits (sha) to index and the
        list to delete from the index.
        """
        logger.debug(
            "%s: Upstream - repo history is composed of %s commits." %
            (self.name, len(self.commits)))
        self.to_delete = set(self.already_indexed) - set(self.commits)
        self.to_index = set(self.commits) - set(self.already_indexed)
        logger.debug("%s: Indexer will reference %s commits." %
                     (self.name, len(self.to_index)))
        logger.debug("%s: Indexer will dereference %s commits." %
                     (self.name, len(self.to_delete)))

    def compute_to_create_to_update(self):
        if self.to_index:
            res = self.c.get_commits_by_id(list(self.to_index))
            to_update = [c['_source'] for c in res['docs']
                         if c['found'] is True]
            to_create = [c['_id'] for c in res['docs']
                         if c['found'] is False]
            return to_create, to_update
        return [], []

    def index_tags(self):
        def c_tid(t):
            return "%s%s%s" % (t['sha'],
                               t['name'].replace('refs/tags/', ''),
                               t['repo'])
        if not self.tags:
            logger.debug('%s: no tags detected for this repository' %
                         self.name)
            return
        logger.debug('%s: %s tags exist upstream' %
                     (self.name, len(self.tags)))
        tags = self.t.get_tags([self.base_id])
        existing = dict([(c_tid(t['_source']), t['_id']) for t in tags])
        logger.debug('%s: %s tags already referenced' %
                     (self.name, len(existing)))
        # Some commits may not be found because the branches they
        # belong to may not have been indexed.
        commits = [c['_source'] for c in self.c.get_commits_by_id(
            [t[0] for t in self.tags])['docs'] if c['found']]
        lookup = dict([(c['sha'], c['committer_date']) for c in commits])
        to_delete = [
            v for k, v in existing.items()
            if k not in ["%s%s%s" % (sha,
                                     name.replace('refs/tags/', '')
                                         .replace('^{}', ''),
                                     self.base_id)
                         for sha, name in self.tags]]
        docs = []
        for sha, name in self.tags:
            if sha in lookup:
                doc = {}
                doc['name'] = name.replace(
                    'refs/tags/', '').replace('^{}', '')
                doc['sha'] = sha
                doc['date'] = lookup[sha]
                doc['repo'] = self.base_id
                if c_tid(doc) in existing:
                    continue
                docs.append(doc)
        if docs:
            logger.info('%s: %s tags will be indexed' %
                        (self.name, len(docs)))
            self.t.add_tags(docs)
        if to_delete:
            logger.info('%s: %s tags will be deleted' %
                        (self.name, len(to_delete)))
            self.t.del_tags(to_delete)

    def index(self, extract_workers=1):
        # Compile the parsers
        if self.parsers:
            if not self.parsers_compiled:
                raw_parsers = copy.deepcopy(self.parsers)
                self.parsers = []
                for parser in raw_parsers:
                    self.parsers.append(re.compile(parser))
                logger.debug(
                    "%s: Prepared %s regex parsers for commit msgs" %
                    (self.name, len(self.parsers)))
                self.parsers_compiled = True
        # Check whether a commit should be completely deleted or
        # updated by removing the repo from the repos field
        if self.to_delete:
            delete_commits(self.c, self.name, self.to_delete, self.ref_id)
        # Check whether a commit should be created or
        # updated by adding the repo into the repos field
        if self.to_index:
            to_create, to_update = self.compute_to_create_to_update()
            if to_create:
                logger.info("%s: %s commits will be created ..." %
                            (self.name, len(to_create)))
                self.run_workers(to_create, extract_workers)
            if to_update:
                logger.info(
                    "%s: %s commits already indexed and need to be updated" %
                    (self.name, len(to_update)))
                for c in to_update:
                    c['repos'].append(self.ref_id)
                self.c.update_commits(to_update)
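
# Illustrative helper (not part of the original module): the chunking
# idiom used by run_workers() above, shown in isolation. Indexing
# shas[bulk_chunk] raises IndexError once fewer than bulk_chunk + 1
# items remain, which terminates the loop with the remainder appended.
# Note that, like run_workers(), it consumes the input list in place.
def _example_bulk_chunking(shas, bulk_chunk=1000):
    to_process = []
    while True:
        try:
            shas[bulk_chunk]
            to_process.append(shas[:bulk_chunk])
            del shas[:bulk_chunk]
        except IndexError:
            # Add the rest
            to_process.append(shas)
            break
    return to_process

# e.g. _example_bulk_chunking(list(range(2500))) returns chunks of
# lengths [1000, 1000, 500].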
class RefsCleaner():
    def __init__(self, projects, con=None):
        if not con:
            self.con = index.Connector()
        else:
            self.con = con
        self.projects = projects
        self.c = Commits(self.con)
        self.t = Tags(index.Connector(
            index=self.con.index, index_suffix='tags'))
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)
        self.current_base_ids = set()

    def find_refs_to_clean(self):
        projects = self.projects.get_projects(source=['refs'])
        refs_ids = set()
        for project in projects.values():
            for ref in project['refs']:
                self.current_base_ids.add(ref['shortrid'])
                refs_ids.add(ref['fullrid'])
        if not os.path.isfile(self.seen_refs_path):
            self.data = set()
        else:
            try:
                self.data = pickle.load(open(self.seen_refs_path, 'rb'))
            except Exception:
                # Protect against a corrupted file
                self.data = set()
        refs_to_clean = self.data - refs_ids
        if len(refs_to_clean):
            logger.info("Found %s refs to clean." % len(refs_to_clean))
        return refs_to_clean

    def clean_tags(self, base_id):
        # Tags are indexed by repo (base_id), not by ref (ref_id)
        tags = self.t.get_tags([base_id])
        ids = [t['_id'] for t in tags]
        if ids:
            logger.info("Repo %s no longer referenced. Cleaning %s tags" %
                        (base_id, len(ids)))
            self.t.del_tags(ids)

    def clean_ref_cmts(self, ref):
        # Find the ref's commits
        ids = [c['_id'] for c in
               self.c.get_commits(repos=[ref], scan=True)]
        if not ids:
            self.remove_from_seen_refs(ref)
            return
        logger.info("Ref %s no longer referenced. Cleaning %s cmts." %
                    (ref, len(ids)))
        # Process by bulks of 10000 to limit memory usage
        bulk = 10000
        i = 0
        while True:
            _ids = ids[i:i + bulk]
            if not _ids:
                break
            else:
                delete_commits(self.c, ref, _ids, ref)
                i += bulk

    def clean(self, refs):
        base_ids = set()
        for ref in refs:
            self.clean_ref_cmts(ref)
            self.remove_from_seen_refs(ref)
            base_id = ref.replace(":%s" % ref.split(':')[-1], "")
            if base_id not in self.current_base_ids:
                base_ids.add(base_id)
        for base_id in base_ids:
            self.clean_tags(base_id)

    def remove_from_seen_refs(self, ref_id):
        # Remove from the struct to be dumped. Use discard() as the ref
        # may have already been removed by clean_ref_cmts() when the ref
        # had no commits left.
        self.data.discard(ref_id)
        pickle.dump(self.data, open(self.seen_refs_path, 'wb'))
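
# Illustrative helper (not part of the original module): how clean()
# above derives a repo base_id from a full ref id. A full ref id is
# '<uri>:<name>:<branch>' and the base_id is '<uri>:<name>'; stripping
# the trailing ':<branch>' segment recovers it.
def _example_base_id(ref):
    return ref.replace(":%s" % ref.split(':')[-1], "")

# e.g. _example_base_id('https://example.com/r:myrepo:master')
# returns 'https://example.com/r:myrepo'.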
class RepoIndexer():
    def __init__(self, name, uri, parsers=None, con=None, meta_ref=None):
        if not con:
            self.con = index.Connector()
        else:
            self.con = con
        self.c = Commits(self.con)
        self.t = Tags(index.Connector(
            index=self.con.index, index_suffix='tags'))
        if not os.path.isdir(conf.git_store):
            os.makedirs(conf.git_store)
        self.name = name
        self.uri = uri
        self.base_id = '%s:%s' % (self.uri, self.name)
        self.seen_refs_path = os.path.join(conf.db_path, SEEN_REFS_CACHED)
        if meta_ref:
            self.meta_ref = 'meta_ref: %s' % meta_ref
        else:
            self.meta_ref = None
        if not parsers:
            self.parsers = []
        else:
            self.parsers = parsers
        self.parsers_compiled = False
        self.local = os.path.join(conf.git_store, self.name,
                                  self.uri.replace('/', '_'))
        if not os.path.isdir(self.local):
            os.makedirs(self.local)
        self.credentials_helper_path = getattr(
            conf, 'git_credential_helper_path', None)
        if not (self.credentials_helper_path and
                self.credentials_helper_path.startswith('/') and
                os.path.isfile(self.credentials_helper_path)):
            if self.credentials_helper_path:
                logger.warning(
                    'Configured git_credential_helper %s not found' %
                    self.credentials_helper_path)
            self.credentials_helper_path = None
        # Look at the default installation paths
        if not self.credentials_helper_path:
            self.credentials_helper_path = os.path.join(
                sys.prefix, 'bin', 'repoxplorer-git-credentials-helper')
            if not os.path.isfile(self.credentials_helper_path):
                self.credentials_helper_path = shutil.which(
                    'repoxplorer-git-credentials-helper')
                if not self.credentials_helper_path:
                    logger.warning(
                        'Default repoxplorer-git-credential-helper command '
                        'not found')

    def __str__(self):
        return 'Git indexer of %s' % self.ref_id

    def save_seen_ref_in_cache(self):
        # Keep a cache of each ref that has been indexed. It is used
        # later to discover seen refs that are no longer in
        # projects.yaml; in that case a removal from the backend will
        # be performed.
        logger.debug("Save ref %s into seen_refs file" % self.ref_id)
        if not os.path.isfile(self.seen_refs_path):
            data = set()
        else:
            try:
                data = pickle.load(open(self.seen_refs_path, 'rb'))
            except Exception:
                # Protect against a corrupted file
                data = set()
        data.add(self.ref_id)
        pickle.dump(data, open(self.seen_refs_path, 'wb'))

    def set_branch(self, branch):
        self.branch = branch
        self.ref_id = '%s:%s:%s' % (self.uri, self.name, self.branch)
        self.save_seen_ref_in_cache()

    def git_init(self):
        logger.debug("Git init for %s:%s in %s" % (
            self.uri, self.name, self.local))
        run(["git", "init", "--bare", "."], self.local)
        remotes = run(["git", "remote", "-v"], self.local)
        remote_names = [line.split()[0] for line in remotes.splitlines()]
        if "origin" not in remote_names:
            run(["git", "remote", "add", "origin", self.uri], self.local)

    def git_fetch_branch(self):
        logger.debug("Fetch %s %s:%s" % (self.name, self.uri, self.branch))
        run(["git", "-c",
             "credential.helper=%s" % self.credentials_helper_path,
             "fetch", "-nk", "origin",
             "+%s:%s" % (self.branch, self.branch)], self.local)

    def get_refs(self):
        refs = run(["git", "-c",
                    "credential.helper=%s" % self.credentials_helper_path,
                    "ls-remote", "origin"], self.local).splitlines()
        self.refs = []
        for r in refs:
            self.refs.append(r.split('\t'))

    def get_heads(self):
        self.heads = [x for x in self.refs
                      if x[1].startswith('refs/heads/')]

    def get_tags(self):
        self.tags = [x for x in self.refs
                     if x[1].startswith('refs/tags/')]

    def git_get_commit_obj(self):
        self.commits = get_all_shas(self.local)

    def run_workers(self, shas, workers):
        BULK_CHUNK = 1000
        to_process = []
        if workers == 0:
            # Default value (auto)
            workers = mp.cpu_count() - 1 or 1
        while True:
            try:
                shas[BULK_CHUNK]
                to_process.append(shas[:BULK_CHUNK])
                del shas[:BULK_CHUNK]
            except IndexError:
                # Add the rest
                to_process.append(shas)
                break
        ref_ids = [self.ref_id]
        if self.meta_ref:
            ref_ids.append(self.meta_ref)
        options = [(self.local, ref_ids, stp) for stp in to_process]
        worker_pool = mp.Pool(workers)
        worker_pool.map(process_commits, options)
        worker_pool.terminate()
        worker_pool.join()

    def is_branch_fully_indexed(self):
        branch = [head for head in self.heads
                  if head[1].endswith(self.branch)][0]
        branch_tip_sha = branch[0]
        _, _, cmts_list = self.c.get_commits(repos=[self.ref_id], limit=1)
        if not cmts_list:
            return False
        cmt = cmts_list[0]
        if branch_tip_sha != cmt['sha']:
            return False
        return True

    def get_current_commits_indexed(self):
        """ Fetch from the index the commits mentioned for this
        repo and branch.
        """
        self.already_indexed = [c['_id'] for c in
                                self.c.get_commits(repos=[self.ref_id],
                                                   scan=True)]
        logger.debug(
            "%s: In the DB - repo history is composed of %s commits." %
            (self.name, len(self.already_indexed)))

    def compute_to_index_to_delete(self):
        """ Compute the list of commits (sha) to index and the
        list to delete from the index.
        """
        logger.debug(
            "%s: Upstream - repo history is composed of %s commits." %
            (self.name, len(self.commits)))
        self.to_delete = set(self.already_indexed) - set(self.commits)
        self.to_index = set(self.commits) - set(self.already_indexed)
        logger.debug("%s: Indexer will reference %s commits." %
                     (self.name, len(self.to_index)))
        logger.debug("%s: Indexer will dereference %s commits." %
                     (self.name, len(self.to_delete)))

    def compute_to_create_to_update(self):
        if self.to_index:
            res = self.c.get_commits_by_id(list(self.to_index))
            to_update = [c['_source'] for c in res['docs']
                         if c['found'] is True]
            to_create = [c['_id'] for c in res['docs']
                         if c['found'] is False]
            return to_create, to_update
        return [], []

    def index_tags(self):
        def c_tid(t):
            return "%s%s%s" % (t['sha'],
                               t['name'].replace('refs/tags/', ''),
                               t['repo'])
        if not self.tags:
            logger.debug('%s: no tags detected for this repository' %
                         self.name)
            return
        logger.debug('%s: %s tags exist upstream' %
                     (self.name, len(self.tags)))
        tags = self.t.get_tags([self.base_id])
        existing = dict([(c_tid(t['_source']), t['_id']) for t in tags])
        logger.debug('%s: %s tags already referenced' %
                     (self.name, len(existing)))
        # Some commits may not be found because the branches they
        # belong to may not have been indexed.
        commits = [c['_source'] for c in self.c.get_commits_by_id(
            [t[0] for t in self.tags])['docs'] if c['found']]
        lookup = dict([(c['sha'], c['committer_date']) for c in commits])
        to_delete = [
            v for k, v in existing.items()
            if k not in ["%s%s%s" % (sha,
                                     name.replace('refs/tags/', '')
                                         .replace('^{}', ''),
                                     self.base_id)
                         for sha, name in self.tags]]
        docs = []
        for sha, name in self.tags:
            if sha in lookup:
                doc = {}
                doc['name'] = name.replace(
                    'refs/tags/', '').replace('^{}', '')
                doc['sha'] = sha
                doc['date'] = lookup[sha]
                doc['repo'] = self.base_id
                if c_tid(doc) in existing:
                    continue
                docs.append(doc)
        if docs:
            logger.info('%s: %s tags will be indexed' %
                        (self.name, len(docs)))
            self.t.add_tags(docs)
        if to_delete:
            logger.info('%s: %s tags will be deleted' %
                        (self.name, len(to_delete)))
            self.t.del_tags(to_delete)

    def index(self, extract_workers=1):
        # Compile the parsers
        if self.parsers:
            if not self.parsers_compiled:
                raw_parsers = copy.deepcopy(self.parsers)
                self.parsers = []
                for parser in raw_parsers:
                    self.parsers.append(re.compile(parser))
                logger.debug(
                    "%s: Prepared %s regex parsers for commit msgs" %
                    (self.name, len(self.parsers)))
                self.parsers_compiled = True
        # Check whether a commit should be completely deleted or
        # updated by removing the repo from the repos field
        if self.to_delete:
            delete_commits(self.c, self.name, self.to_delete, self.ref_id)
        # Check whether a commit should be created or
        # updated by adding the repo into the repos field
        if self.to_index:
            to_create, to_update = self.compute_to_create_to_update()
            if to_create:
                logger.info("%s: %s commits will be created ..." %
                            (self.name, len(to_create)))
                self.run_workers(to_create, extract_workers)
            if to_update:
                logger.info(
                    "%s: %s commits already indexed and need to be updated" %
                    (self.name, len(to_update)))
                for c in to_update:
                    c['repos'].append(self.ref_id)
                self.c.update_commits(to_update)
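
# Usage sketch (illustrative, with assumptions): the per-branch
# indexing sequence implied by the methods above for this version of
# the class. Note get_current_commits_indexed (plural) and the optional
# meta_ref, which tags commits with an extra 'meta_ref: ...' repos
# entry that the commits() endpoint filters out of the displayed repos.
def _example_index_branch(name, uri, branch='master'):
    ri = RepoIndexer(name, uri)
    ri.git_init()
    ri.get_refs()
    ri.get_heads()
    ri.get_tags()
    ri.set_branch(branch)
    if not ri.is_branch_fully_indexed():
        ri.git_fetch_branch()
        ri.git_get_commit_obj()
        ri.get_current_commits_indexed()
        ri.compute_to_index_to_delete()
        ri.index()
    ri.index_tags()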