def execute(cls, options):
    """Reindex projects, optionally restricted by neighborhood / shortname.

    Builds a mongo query from ``options`` (``nbhd``, ``project``,
    ``project_regex``), then reindexes matching projects either inline
    (``add_projects``) or by posting background tasks (``options.tasks``).
    Honors ``options.dry_run`` (log only, change nothing).

    :return: error message string on bad neighborhood prefix, else None.
    """
    q_project = {}
    if options.nbhd:
        nbhd = M.Neighborhood.query.get(url_prefix=options.nbhd)
        if not nbhd:
            return "Invalid neighborhood url prefix."
        q_project['neighborhood_id'] = nbhd._id
    if options.project:
        q_project['shortname'] = options.project
    elif options.project_regex:
        q_project['shortname'] = {'$regex': options.project_regex}
    for chunk in chunked_find(M.Project, q_project):
        project_ids = []
        for p in chunk:
            log.info('Reindex project %s', p.shortname)
            if options.dry_run:
                continue
            c.project = p
            project_ids.append(p._id)
        try:
            # Re-chunk by max_chunk to keep each task/call bounded; use a
            # distinct name so we don't clobber the outer `chunk` variable.
            for id_chunk in chunked_list(project_ids, options.max_chunk):
                if options.tasks:
                    cls._post_add_projects(id_chunk)
                else:
                    add_projects(id_chunk)
        except CompoundError as err:  # modern syntax (works on py2.6+ and py3)
            log.exception('Error indexing projects:\n%r', err)
            log.error('%s', err.format_error())
        # Flush and clear the ORM session per batch to bound memory use.
        M.main_orm_session.flush()
        M.main_orm_session.clear()
def _chunked_add_artifacts(self, ref_ids):
    """Index the given artifact refs in size-bounded batches.

    ref_ids contains solr index ids which can easily be over 100 bytes.
    Here we allow for 160 bytes avg, plus room for other document
    overhead, by splitting into chunks of at most
    ``self.options.max_chunk`` ids each.
    """
    post_as_task = self.options.tasks
    for batch in utils.chunked_list(ref_ids, self.options.max_chunk):
        if post_as_task:
            self._post_add_artifacts(batch)
        else:
            add_artifacts(
                batch,
                update_solr=self.options.solr,
                update_refs=self.options.refs,
                **self.add_artifact_kwargs)
def _chunked_add_artifacts(self, ref_ids):
    """Index artifact refs, chunked so no single task/call grows too large.

    ref_ids contains solr index ids which can easily be over 100 bytes.
    Here we allow for 160 bytes avg, plus room for other document overhead.
    """
    for subset in utils.chunked_list(ref_ids, self.options.max_chunk):
        if self.options.tasks:
            # Defer the work to a background task.
            self._post_add_artifacts(subset)
            continue
        # Run the indexing inline.
        add_artifacts(subset,
                      update_solr=self.options.solr,
                      update_refs=self.options.refs,
                      **self.add_artifact_kwargs)
def flush(cls):
    """
    Creates indexing tasks for cached adds and deletes, and resets the caches.

    .. warning::

       This method is NOT called automatically when the parent session is
       flushed. It MUST be called explicitly.
    """
    # Post in chunks to avoid overflowing the max BSON document size when
    # the Monq task is created:
    #   cls.to_delete - solr index ids, easily over 100 bytes each; allow
    #                   for 160 bytes avg plus other document overhead.
    #   cls.to_add    - BSON ObjectIds, 12 bytes each, so we can easily
    #                   put 1m in a doc with room left over.
    pending = (
        (cls.to_delete, index_tasks.del_artifacts, 100 * 1000),
        (cls.to_add, index_tasks.add_artifacts, 1000 * 1000),
    )
    for cache, task, chunk_size in pending:
        if cache:
            for batch in chunked_list(list(cache), chunk_size):
                cls._post(task, batch)
    # Reset both caches regardless of whether anything was posted.
    cls.to_delete = set()
    cls.to_add = set()
def execute(cls, options):
    """Reindex all users, inline or via background tasks.

    Iterates users in batches, then reindexes each batch either inline
    (``add_users``) or by posting tasks (``options.tasks``).  Honors
    ``options.dry_run`` (log only, change nothing).
    """
    for chunk in chunked_find(M.User, {}):
        user_ids = []
        for u in chunk:
            log.info('Reindex user %s', u.username)
            if options.dry_run:
                continue
            user_ids.append(u._id)
        try:
            # Re-chunk by max_chunk to keep each task/call bounded; use a
            # distinct name so we don't clobber the outer `chunk` variable.
            for id_chunk in chunked_list(user_ids, options.max_chunk):
                if options.tasks:
                    cls._post_add_users(id_chunk)
                else:
                    add_users(id_chunk)
        except CompoundError as err:  # modern syntax (works on py2.6+ and py3)
            log.exception('Error indexing users:\n%r', err)
            log.error('%s', err.format_error())
        # Flush and clear the ORM session per batch to bound memory use.
        M.main_orm_session.flush()
        M.main_orm_session.clear()
def test_chunked_list(self):
    """chunked_list splits 10 items at size 3 into 4 order-preserving chunks."""
    data = range(10)
    chunks = list(utils.chunked_list(data, 3))
    self.assertEqual(len(chunks), 4)
    self.assertEqual(len(chunks[0]), 3)
    flattened = [item for sub in chunks for item in sub]
    self.assertEqual(flattened, data)
def _chunked_add_artifacts(self, ref_ids, max_chunk=100 * 1000):
    """Post add-artifacts tasks for *ref_ids* in size-bounded chunks.

    ref_ids contains solr index ids which can easily be over 100 bytes.
    Here we allow for 160 bytes avg, plus room for other document
    overhead.

    :param ref_ids: iterable of solr index id strings.
    :param max_chunk: maximum ids per posted task.  Previously a
        hard-coded ``100 * 1000``; now a defaulted parameter so callers
        can tune chunking without changing behavior for existing calls.
    """
    for chunk in utils.chunked_list(ref_ids, max_chunk):
        self._post_add_artifacts(chunk)
def execute(cls, options):
    # Refresh repository data for selected projects.
    #
    # Builds a mongo query from options (nbhd, project, project_regex),
    # then for every matching repo app:
    #   * optionally deletes cached commit data (--clean deletes all
    #     commits; --clean-after deletes commits after a given date),
    #   * re-runs the repo refresh, optionally under cProfile.
    # Honors options.dry_run (log only).  Returns an error string for a
    # bad neighborhood prefix, else None.
    q_project = {}
    if options.nbhd:
        nbhd = M.Neighborhood.query.get(url_prefix=options.nbhd)
        if not nbhd:
            return "Invalid neighborhood url prefix."
        q_project['neighborhood_id'] = nbhd._id
    if options.project:
        q_project['shortname'] = options.project
    elif options.project_regex:
        q_project['shortname'] = {'$regex': options.project_regex}
    log.info('Refreshing repositories')
    for chunk in chunked_find(M.Project, q_project):
        for p in chunk:
            log.info("Refreshing repos for project '%s'." % p.shortname)
            if options.dry_run:
                continue
            c.project = p
            if options.mount_point:
                mount_points = [options.mount_point]
            else:
                # All app configs in the project; non-repo apps are
                # filtered out below via hasattr(app, 'repo').
                mount_points = [ac.options.mount_point
                                for ac in M.AppConfig.query.find(dict(project_id=p._id))]
            for app in (p.app_instance(mp) for mp in mount_points):
                c.app = app
                if not hasattr(app, 'repo'):
                    continue
                if c.app.repo.tool.lower() not in options.repo_types:
                    log.info("Skipping %r: wrong type (%s)", c.app.repo,
                             c.app.repo.tool.lower())
                    continue
                # Commit ids whose cached mongo data should be purged
                # before refreshing.
                ci_ids = []
                if options.clean:
                    ci_ids = list(c.app.repo.all_commit_ids())
                elif options.clean_after:
                    for ci in M.repository.CommitDoc.m.find({'repo_ids': c.app.repo._id,
                                                             'committed.date': {'$gt': options.clean_after}}):
                        ci_ids.append(ci._id)
                if ci_ids:
                    log.info("Deleting mongo data for %i commits...", len(ci_ids))
                    # delete these in chunks, otherwise the query doc can
                    # exceed the max BSON size limit (16MB at the moment)
                    for ci_ids_chunk in chunked_list(ci_ids, 3000):
                        i = M.repository.CommitDoc.m.find(
                            {"_id": {"$in": ci_ids_chunk}}).count()
                        if i:
                            log.info("Deleting %i CommitDoc docs...", i)
                            M.repository.CommitDoc.m.remove(
                                {"_id": {"$in": ci_ids_chunk}})
                    # we used to have a TreesDoc (plural) collection to provide a mapping of commit_id to tree_id
                    # so that we could clear the relevant TreeDoc records
                    # its ok though, since they are created in refresh_tree_info() and overwrite existing records
                    for ci_ids_chunk in chunked_list(ci_ids, 3000):
                        # delete LastCommitDocs
                        i = M.repository.LastCommitDoc.m.find(
                            dict(commit_id={'$in': ci_ids_chunk})).count()
                        if i:
                            log.info(
                                "Deleting %i LastCommitDoc docs...", i)
                            M.repository.LastCommitDoc.m.remove(
                                dict(commit_id={'$in': ci_ids_chunk}))
                    # release the (possibly large) id list before the refresh
                    del ci_ids
                try:
                    if options.all:
                        log.info('Refreshing ALL commits in %r', c.app.repo)
                    else:
                        log.info('Refreshing NEW commits in %r', c.app.repo)
                    if options.profile:
                        import cProfile
                        # the string is exec'd by runctx with the locals below
                        cProfile.runctx(
                            'c.app.repo.refresh(options.all, notify=options.notify, '
                            ' commits_are_new=options.commits_are_new)',
                            globals(), locals(), 'refresh.profile')
                    else:
                        c.app.repo.refresh(
                            options.all, notify=options.notify,
                            commits_are_new=options.commits_are_new)
                except Exception:
                    # keep going: one broken repo should not stop the run
                    log.exception('Error refreshing %r', c.app.repo)
            ThreadLocalORMSession.flush_all()
def main(options):
    """Refresh repository data for selected projects (older script variant).

    Builds a mongo query from ``options`` (nbhd, project, project_regex),
    optionally wipes ALL cached repo docs (``--clean-all``), then for each
    matching repo app re-installs scm hooks, optionally purges that repo's
    cached commit/tree docs (``--clean``), and re-runs the refresh,
    optionally under cProfile.  Honors ``options.dry_run``.

    :return: error message string on bad neighborhood prefix, else None.

    Fix: the two bare ``except:`` clauses were narrowed to
    ``except Exception:`` so SystemExit/KeyboardInterrupt are no longer
    swallowed; logging behavior is unchanged.
    """
    q_project = {}
    if options.nbhd:
        nbhd = M.Neighborhood.query.get(url_prefix=options.nbhd)
        if not nbhd:
            return "Invalid neighborhood url prefix."
        q_project['neighborhood_id'] = nbhd._id
    if options.project:
        q_project['shortname'] = options.project
    elif options.project_regex:
        q_project['shortname'] = {'$regex': options.project_regex}
    log.info('Refreshing repositories')
    if options.clean_all:
        log.info('Removing all repository objects')
        M.repo.CommitDoc.m.remove({})
        M.repo.TreeDoc.m.remove({})
        M.repo.TreesDoc.m.remove({})
        M.repo.DiffInfoDoc.m.remove({})
        M.repo.CommitRunDoc.m.remove({})
    for chunk in chunked_find(M.Project, q_project):
        for p in chunk:
            log.info("Refreshing repos for project '%s'." % p.shortname)
            if options.dry_run:
                continue
            c.project = p
            if options.mount_point:
                mount_points = [options.mount_point]
            else:
                # All app configs in the project; non-repo apps are
                # filtered out below via hasattr(app, 'repo').
                mount_points = [
                    ac.options.mount_point
                    for ac in M.AppConfig.query.find(dict(project_id=p._id))
                ]
            for app in (p.app_instance(mp) for mp in mount_points):
                c.app = app
                if not hasattr(app, 'repo'):
                    continue
                if c.app.repo.tool.lower() not in options.repo_types:
                    log.info("Skipping %r: wrong type (%s)", c.app.repo,
                             c.app.repo.tool.lower())
                    continue
                try:
                    c.app.repo._impl._setup_hooks()
                except Exception:  # was a bare except; don't swallow SystemExit
                    log.exception('Error setting up hooks for %r', c.app.repo)
                if options.clean:
                    ci_ids = list(c.app.repo.all_commit_ids())
                    log.info("Deleting mongo data for %i commits...",
                             len(ci_ids))
                    # Map commit ids -> tree ids so TreeDoc/LastCommitDoc
                    # records can be deleted below.
                    tree_ids = [
                        tree_id
                        for doc in M.repo.TreesDoc.m.find(
                            {"_id": {"$in": ci_ids}}, {"tree_ids": 1})
                        for tree_id in doc.get("tree_ids", [])]
                    i = M.repo.CommitDoc.m.find({"_id": {"$in": ci_ids}}).count()
                    log.info("Deleting %i CommitDoc docs...", i)
                    M.repo.CommitDoc.m.remove({"_id": {"$in": ci_ids}})
                    # delete these in chunks, otherwise the query doc can
                    # exceed the max BSON size limit (16MB at the moment)
                    for tree_ids_chunk in chunked_list(tree_ids, 300000):
                        i = M.repo.TreeDoc.m.find(
                            {"_id": {"$in": tree_ids_chunk}}).count()
                        log.info("Deleting %i TreeDoc docs...", i)
                        M.repo.TreeDoc.m.remove(
                            {"_id": {"$in": tree_ids_chunk}})
                        i = M.repo.LastCommitDoc.m.find(
                            {"object_id": {"$in": tree_ids_chunk}}).count()
                        log.info("Deleting %i LastCommitDoc docs...", i)
                        M.repo.LastCommitDoc.m.remove(
                            {"object_id": {"$in": tree_ids_chunk}})
                    del tree_ids
                    # delete these after TreeDoc and LastCommitDoc so that if
                    # we crash, we don't lose the ability to delete those
                    i = M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids}}).count()
                    log.info("Deleting %i TreesDoc docs...", i)
                    M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids}})
                    # delete LastCommitDocs for non-trees
                    repo_lastcommit_re = re.compile(
                        "^{}:".format(c.app.repo._id))
                    i = M.repo.LastCommitDoc.m.find(
                        dict(_id=repo_lastcommit_re)).count()
                    log.info(
                        "Deleting %i remaining LastCommitDoc docs, by repo id...", i)
                    M.repo.LastCommitDoc.m.remove(dict(_id=repo_lastcommit_re))
                    i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids}}).count()
                    log.info("Deleting %i DiffInfoDoc docs...", i)
                    M.repo.DiffInfoDoc.m.remove({"_id": {"$in": ci_ids}})
                    i = M.repo.CommitRunDoc.m.find(
                        {"commit_ids": {"$in": ci_ids}}).count()
                    log.info("Deleting %i CommitRunDoc docs...", i)
                    M.repo.CommitRunDoc.m.remove(
                        {"commit_ids": {"$in": ci_ids}})
                    # release the (possibly large) id list before refreshing
                    del ci_ids
                try:
                    if options.all:
                        log.info('Refreshing ALL commits in %r', c.app.repo)
                    else:
                        log.info('Refreshing NEW commits in %r', c.app.repo)
                    if options.profile:
                        import cProfile
                        cProfile.runctx(
                            'c.app.repo.refresh(options.all, notify=options.notify)',
                            globals(), locals(), 'refresh.profile')
                    else:
                        c.app.repo.refresh(options.all, notify=options.notify)
                except Exception:  # was a bare except; don't swallow SystemExit
                    log.exception('Error refreshing %r', c.app.repo)
            ThreadLocalORMSession.flush_all()
            ThreadLocalORMSession.close_all()
def execute(cls, options):
    """Refresh repository data for selected projects (TreesDoc variant).

    Builds a mongo query from ``options`` (nbhd, project, project_regex),
    then for each matching repo app optionally purges its cached commit,
    tree, trees, last-commit, diff-info and commit-run docs (``--clean``)
    and re-runs the refresh, optionally under cProfile.  Honors
    ``options.dry_run``.

    :return: error message string on bad neighborhood prefix, else None.

    Fix: the bare ``except:`` around the refresh was narrowed to
    ``except Exception:`` so SystemExit/KeyboardInterrupt are no longer
    swallowed; logging behavior is unchanged.
    """
    q_project = {}
    if options.nbhd:
        nbhd = M.Neighborhood.query.get(url_prefix=options.nbhd)
        if not nbhd:
            return "Invalid neighborhood url prefix."
        q_project['neighborhood_id'] = nbhd._id
    if options.project:
        q_project['shortname'] = options.project
    elif options.project_regex:
        q_project['shortname'] = {'$regex': options.project_regex}
    log.info('Refreshing repositories')
    for chunk in chunked_find(M.Project, q_project):
        for p in chunk:
            log.info("Refreshing repos for project '%s'." % p.shortname)
            if options.dry_run:
                continue
            c.project = p
            if options.mount_point:
                mount_points = [options.mount_point]
            else:
                # All app configs in the project; non-repo apps are
                # filtered out below via hasattr(app, 'repo').
                mount_points = [ac.options.mount_point
                                for ac in M.AppConfig.query.find(dict(project_id=p._id))]
            for app in (p.app_instance(mp) for mp in mount_points):
                c.app = app
                if not hasattr(app, 'repo'):
                    continue
                if c.app.repo.tool.lower() not in options.repo_types:
                    log.info("Skipping %r: wrong type (%s)", c.app.repo,
                             c.app.repo.tool.lower())
                    continue
                if options.clean:
                    ci_ids = list(c.app.repo.all_commit_ids())
                    log.info("Deleting mongo data for %i commits...",
                             len(ci_ids))
                    # like the tree_ids themselves below, we need to process these in
                    # chunks to avoid hitting the BSON max size limit
                    tree_ids = []
                    for ci_ids_chunk in chunked_list(ci_ids, 3000):
                        tree_ids.extend([
                            tree_id
                            for doc in M.repo.TreesDoc.m.find(
                                {"_id": {"$in": ci_ids_chunk}},
                                {"tree_ids": 1})
                            for tree_id in doc.get("tree_ids", [])])
                        i = M.repo.CommitDoc.m.find(
                            {"_id": {"$in": ci_ids_chunk}}).count()
                        if i:
                            log.info("Deleting %i CommitDoc docs...", i)
                            M.repo.CommitDoc.m.remove(
                                {"_id": {"$in": ci_ids_chunk}})
                    # delete these in chunks, otherwise the query doc can
                    # exceed the max BSON size limit (16MB at the moment)
                    for tree_ids_chunk in chunked_list(tree_ids, 300000):
                        i = M.repo.TreeDoc.m.find(
                            {"_id": {"$in": tree_ids_chunk}}).count()
                        if i:
                            log.info("Deleting %i TreeDoc docs...", i)
                            M.repo.TreeDoc.m.remove(
                                {"_id": {"$in": tree_ids_chunk}})
                    del tree_ids
                    # delete these after TreeDoc and LastCommitDoc so that if
                    # we crash, we don't lose the ability to delete those
                    for ci_ids_chunk in chunked_list(ci_ids, 3000):
                        # delete TreesDocs
                        i = M.repo.TreesDoc.m.find(
                            {"_id": {"$in": ci_ids_chunk}}).count()
                        if i:
                            log.info("Deleting %i TreesDoc docs...", i)
                            M.repo.TreesDoc.m.remove(
                                {"_id": {"$in": ci_ids_chunk}})
                        # delete LastCommitDocs
                        i = M.repo.LastCommitDoc.m.find(
                            dict(commit_ids={'$in': ci_ids_chunk})).count()
                        if i:
                            log.info(
                                "Deleting %i remaining LastCommitDoc docs, by repo id...", i)
                            M.repo.LastCommitDoc.m.remove(
                                dict(commit_ids={'$in': ci_ids_chunk}))
                        i = M.repo.DiffInfoDoc.m.find(
                            {"_id": {"$in": ci_ids_chunk}}).count()
                        if i:
                            log.info("Deleting %i DiffInfoDoc docs...", i)
                            M.repo.DiffInfoDoc.m.remove(
                                {"_id": {"$in": ci_ids_chunk}})
                        i = M.repo.CommitRunDoc.m.find(
                            {"commit_ids": {"$in": ci_ids_chunk}}).count()
                        if i:
                            log.info("Deleting %i CommitRunDoc docs...", i)
                            M.repo.CommitRunDoc.m.remove(
                                {"commit_ids": {"$in": ci_ids_chunk}})
                    # release the (possibly large) id list before refreshing
                    del ci_ids
                try:
                    if options.all:
                        log.info('Refreshing ALL commits in %r', c.app.repo)
                    else:
                        log.info('Refreshing NEW commits in %r', c.app.repo)
                    if options.profile:
                        import cProfile
                        cProfile.runctx(
                            'c.app.repo.refresh(options.all, notify=options.notify)',
                            globals(), locals(), 'refresh.profile')
                    else:
                        c.app.repo.refresh(
                            options.all, notify=options.notify)
                except Exception:  # was a bare except; don't swallow SystemExit
                    log.exception('Error refreshing %r', c.app.repo)
            ThreadLocalORMSession.flush_all()
def execute(cls, options):
    """Refresh repository data for selected projects (CommitRunDoc variant).

    Builds a mongo query from ``options`` (nbhd, project, project_regex),
    then for each matching repo app optionally purges cached commit data
    (``--clean`` deletes all commits; ``--clean-after`` deletes commits
    after a given date) and re-runs the refresh, optionally under
    cProfile.  Honors ``options.dry_run``.

    :return: error message string on bad neighborhood prefix, else None.

    Fix: the bare ``except:`` around the refresh was narrowed to
    ``except Exception:`` so SystemExit/KeyboardInterrupt are no longer
    swallowed; logging behavior is unchanged.
    """
    q_project = {}
    if options.nbhd:
        nbhd = M.Neighborhood.query.get(url_prefix=options.nbhd)
        if not nbhd:
            return "Invalid neighborhood url prefix."
        q_project['neighborhood_id'] = nbhd._id
    if options.project:
        q_project['shortname'] = options.project
    elif options.project_regex:
        q_project['shortname'] = {'$regex': options.project_regex}
    log.info('Refreshing repositories')
    for chunk in chunked_find(M.Project, q_project):
        for p in chunk:
            log.info("Refreshing repos for project '%s'." % p.shortname)
            if options.dry_run:
                continue
            c.project = p
            if options.mount_point:
                mount_points = [options.mount_point]
            else:
                # All app configs in the project; non-repo apps are
                # filtered out below via hasattr(app, 'repo').
                mount_points = [ac.options.mount_point
                                for ac in M.AppConfig.query.find(dict(project_id=p._id))]
            for app in (p.app_instance(mp) for mp in mount_points):
                c.app = app
                if not hasattr(app, 'repo'):
                    continue
                if c.app.repo.tool.lower() not in options.repo_types:
                    log.info("Skipping %r: wrong type (%s)", c.app.repo,
                             c.app.repo.tool.lower())
                    continue
                # Commit ids whose cached mongo data should be purged
                # before refreshing.
                ci_ids = []
                if options.clean:
                    ci_ids = list(c.app.repo.all_commit_ids())
                elif options.clean_after:
                    for ci in M.repository.CommitDoc.m.find({'repo_ids': c.app.repo._id,
                                                             'committed.date': {'$gt': options.clean_after}}):
                        ci_ids.append(ci._id)
                if ci_ids:
                    log.info("Deleting mongo data for %i commits...",
                             len(ci_ids))
                    # delete these in chunks, otherwise the query doc can
                    # exceed the max BSON size limit (16MB at the moment)
                    for ci_ids_chunk in chunked_list(ci_ids, 3000):
                        i = M.repository.CommitDoc.m.find(
                            {"_id": {"$in": ci_ids_chunk}}).count()
                        if i:
                            log.info("Deleting %i CommitDoc docs...", i)
                            M.repository.CommitDoc.m.remove(
                                {"_id": {"$in": ci_ids_chunk}})
                    # we used to have a TreesDoc (plural) collection to provide a mapping of commit_id to tree_id
                    # so that we could clear the relevant TreeDoc records
                    # its ok though, since they are created in refresh_tree_info() and overwrite existing records
                    for ci_ids_chunk in chunked_list(ci_ids, 3000):
                        # delete LastCommitDocs
                        i = M.repository.LastCommitDoc.m.find(
                            dict(commit_id={'$in': ci_ids_chunk})).count()
                        if i:
                            log.info(
                                "Deleting %i LastCommitDoc docs...", i)
                            M.repository.LastCommitDoc.m.remove(
                                dict(commit_id={'$in': ci_ids_chunk}))
                        i = M.repository.CommitRunDoc.m.find(
                            {"commit_ids": {"$in": ci_ids_chunk}}).count()
                        if i:
                            log.info("Deleting %i CommitRunDoc docs...", i)
                            M.repository.CommitRunDoc.m.remove(
                                {"commit_ids": {"$in": ci_ids_chunk}})
                    # release the (possibly large) id list before refreshing
                    del ci_ids
                try:
                    if options.all:
                        log.info('Refreshing ALL commits in %r', c.app.repo)
                    else:
                        log.info('Refreshing NEW commits in %r', c.app.repo)
                    if options.profile:
                        import cProfile
                        cProfile.runctx(
                            'c.app.repo.refresh(options.all, notify=options.notify, '
                            ' commits_are_new=options.commits_are_new)',
                            globals(), locals(), 'refresh.profile')
                    else:
                        c.app.repo.refresh(
                            options.all, notify=options.notify,
                            commits_are_new=options.commits_are_new)
                except Exception:  # was a bare except; don't swallow SystemExit
                    log.exception('Error refreshing %r', c.app.repo)
            ThreadLocalORMSession.flush_all()