def run(self, working_dir, module_name):
    """Migrate GitModel records found in ``working_dir`` into ElasticGit.

    Each GitModel schema is aliased (``id`` -> ``uuid``), re-namespaced
    into ``module_name``, its records stored via a StorageManager, and
    the rewritten schema written out as ``<name>.avro.json``.

    :param str working_dir: Path to the Git repository to migrate.
    :param str module_name: Destination module / namespace name.
    :returns:
        Tuple of (last processed schema, list of stored model instances).
    """
    repo = Repo(working_dir)
    sm = StorageManager(repo)
    migrated = []
    destination = os.path.join(repo.working_dir, module_name)
    for directory, schema, data_records in self.inspect_repo(
            repo, destination):
        # GitModel uses ``id``, ElasticGit uses ``uuid`` so add an alias.
        schema = self.add_alias(schema, 'uuid', 'id')
        # Update the namespace for where we're migrating to.
        schema['namespace'] = module_name
        model_class = avro.deserialize(schema, module_name=module_name)
        for data in data_records:
            model = model_class(data)
            sm.store(
                model, 'Migrated %s.' % (model.uuid.encode('utf-8'),))
            migrated.append(model)
        # Save the (rewritten) schema in the new module's dir.
        schema_path = os.path.join(
            destination, '%s.avro.json' % (schema['name'],))
        with self.file_opener(schema_path, 'w') as stdout:
            json.dump(schema, fp=stdout, indent=2)
    return schema, migrated
def run(self, working_dir, module_name):
    """Migrate GitModel records in ``working_dir`` to ElasticGit models.

    :param str working_dir: Path to the Git repository to migrate.
    :param str module_name: Destination module / namespace name.
    :returns:
        Tuple of (last processed schema, list of stored model instances).
    """
    repo = Repo(working_dir)
    storage_manager = StorageManager(repo)
    records = []
    target_dir = os.path.join(repo.working_dir, module_name)
    # Yields (directory, schema, data_records) per discovered model type.
    gitmodel_info = self.inspect_repo(repo, target_dir)
    for directory, schema, data_records in gitmodel_info:
        # GitModel uses ``id``, ElasticGit uses ``uuid`` so add an alias.
        schema = self.add_alias(schema, 'uuid', 'id')
        # Update the namespace for where we're migrating to.
        schema['namespace'] = module_name
        model_class = avro.deserialize(schema, module_name=module_name)
        for data_record in data_records:
            record = model_class(data_record)
            storage_manager.store(record, 'Migrated %s.' % (
                record.uuid.encode('utf-8'),))
            records.append(record)
        # Save the schema in the new module's dir
        file_path = os.path.join(
            target_dir, '%s.avro.json' % (schema['name'],))
        with self.file_opener(file_path, 'w') as stdout:
            json.dump(schema, fp=stdout, indent=2)
    return schema, records
def post(self):
    """Pull upstream changes into a named repository and report the diff.

    The repository name comes from the route match; ``branch`` and
    ``remote`` are optional request parameters. After pulling, both a
    ``RepositoryUpdated`` and a ``WebhookEvent`` notification are fired.

    :returns: list, the formatted diff index of the pulled changes.
    """
    repo_name = self.request.matchdict['name']
    branch = self.request.params.get('branch', 'master')
    remote = self.request.params.get('remote')
    storage_path = self.config.get('repo.storage_path')
    repo = get_repository(os.path.join(storage_path, repo_name))
    changes = StorageManager(repo).pull(
        branch_name=branch, remote_name=remote)

    # Fire events so subscribers can react to the updated repository.
    self.request.registry.notify(RepositoryUpdated(
        config=self.config, repo=repo, changes=changes, branch=branch))
    self.request.registry.notify(WebhookEvent(
        owner=self.request.authenticated_userid,
        event_type='repo.push',
        payload={
            'repo': repo_name,
            'url': self.request.route_url(
                'repositoryresource', name=repo_name),
        }))

    return list(format_diffindex(changes))
def __init__(self, repo, es, index_prefix):
    """Wire a Git storage manager and an Elasticsearch index manager.

    :param git.Repo repo: The repository backing this workspace.
    :param dict es: Keyword arguments for ``get_es`` (ES connection).
    :param str index_prefix: Prefix used when generating ES index names.
    """
    self.repo = repo
    self.working_dir = repo.working_dir
    self.index_prefix = index_prefix
    self.es_settings = es
    self.sm = StorageManager(repo)
    self.im = ESManager(self.sm, get_es(**es), index_prefix)
def delete_content_type_object(repo, content_type, uuid):
    """
    Delete an object of a certain content type

    :param Repo repo: The git repository.
    :param str content_type: The content type the object belongs to.
    :param str uuid: The object's unique identifier.
    :returns: tuple of (commit, deleted model instance)
    """
    sm = StorageManager(repo)
    model = sm.get(load_model_class(repo, content_type), uuid)
    commit = sm.delete(model, 'Deleted via DELETE request.')
    return commit, model
def save_content_type_object(repo, schema, uuid, data):
    """
    Save an object as a certain content type

    :param Repo repo: The git repository.
    :param dict schema: Avro schema describing the content type.
    :param str uuid: The object's unique identifier.
        NOTE(review): unused here; presumably ``data`` already carries
        the uuid — kept for caller compatibility.
    :param dict data: The object's field values.
    :returns: tuple of (commit, stored model instance)
    """
    model_class = deserialize(schema, module_name=schema['namespace'])
    model = model_class(data)
    commit = StorageManager(repo).store(model, 'Updated via PUT request.')
    return commit, model
def initialize_repo_index(event):
    """(Re)build the Elasticsearch index for the repository in ``event``.

    Loads every content type's model class and mapping up front so that
    schema errors surface *before* any existing index is destroyed, then
    recreates the index and indexes all stored models. On any
    Elasticsearch failure the half-built index is destroyed and the
    exception re-raised.

    :param event: An event object exposing ``repo`` and ``config``.
    :raises ElasticsearchException: re-raised after cleanup on failure.
    """
    import time  # local import: only needed for the readiness poll below

    repo = event.repo
    # load models and mappings before creating index in case of errors
    model_mappings = [(load_model_class(repo, content_type),
                       get_mapping(repo, content_type))
                      for content_type in list_content_types(repo)]
    sm = StorageManager(repo)
    im = ESManager(storage_manager=sm,
                   es=get_es(event.config),
                   index_prefix=get_index_prefix(repo.working_dir))

    # Hoisted: the active branch name was previously re-queried on
    # every single use below.
    branch = sm.active_branch()
    if im.index_exists(branch):
        im.destroy_index(branch)
    im.create_index(branch)

    # Poll until the new index is ready. Sleep briefly between polls
    # instead of the original busy-wait so we don't spin a CPU core.
    while not im.index_ready(branch):
        time.sleep(0.05)

    try:
        for model_class, mapping in model_mappings:
            im.setup_custom_mapping(branch, model_class, mapping)
        for model_class, _ in model_mappings:
            for model in sm.iterate(model_class):
                im.index(model)
    except ElasticsearchException:
        # Don't leave a half-populated index behind.
        im.destroy_index(branch)
        raise
def pull_repository_files(repo, commit_id):
    """Describe what changed between ``commit_id`` and the repo HEAD.

    Added and modified model items are grouped per content type; renames
    and deletions are collected under ``"other"`` and the HEAD commit sha
    under ``"commit"``.

    :param Repo repo: The git repository.
    :param str commit_id: The older commit to diff against HEAD.
    :returns: dict mapping content types (plus ``other``/``commit``).
    :raises NotFound: when the commit cannot be resolved.
    """
    changed_files = dict(
        (name, []) for name in list_content_types(repo))
    try:
        diff = repo.commit(commit_id).diff(repo.head)
        sm = StorageManager(repo)

        # Added and modified blobs both map onto model items.
        for change_type in ('A', 'M'):
            for entry in diff.iter_change_type(change_type):
                add_model_item_to_pull_dict(
                    sm, entry.b_blob.path, changed_files)

        # Renames first, then deletions, matching the original ordering.
        renames = [format_diff_R(entry)
                   for entry in diff.iter_change_type('R')]
        deletions = [format_diff_D(entry)
                     for entry in diff.iter_change_type('D')]
        changed_files["other"] = renames + deletions
        changed_files["commit"] = repo.head.commit.hexsha
        return changed_files
    except (GitCommandError, BadName):
        raise NotFound("The git index does not exist")
def format_content_type(repo, content_type):
    """
    Return a list of all content objects for a given content type
    in a repository.

    :param Repo repo: The git repository.
    :param str content_type: The content type to list
    :returns: list
    """
    model_class = load_model_class(repo, content_type)
    sm = StorageManager(repo)
    return [dict(obj) for obj in sm.iterate(model_class)]
def format_content_type_object(repo, content_type, uuid):
    """
    Return a content object from a repository for a given
    content_type and uuid

    :param Repo repo: The git repository.
    :param str content_type: The content type to list
    :param str uuid: The object's unique identifier.
    :returns: dict
    :raises NotFound: when the object does not exist.
    """
    # Everything stays inside the try so any GitCommandError — from
    # loading the model class or fetching the object — maps to NotFound.
    try:
        sm = StorageManager(repo)
        model_class = load_model_class(repo, content_type)
        return dict(sm.get(model_class, uuid))
    except GitCommandError:
        raise NotFound('Object does not exist.')
def __init__(self, repo, es, index_prefix):
    """Bind Git storage and Elasticsearch index managers together.

    :param git.Repo repo: The repository backing this workspace.
    :param dict es: Keyword arguments for ``get_es`` (ES connection).
    :param str index_prefix: Prefix used when generating ES index names.
    """
    self.repo = repo
    # Git-backed storage layer for model instances.
    self.sm = StorageManager(repo)
    # Kept so S() can build search objects with the same connection.
    self.es_settings = es
    self.im = ESManager(
        self.sm, get_es(**self.es_settings), index_prefix)
    self.working_dir = self.repo.working_dir
    self.index_prefix = index_prefix
def test_clone_from(self):
    """A saved model should survive cloning the workspace's repository."""
    source_workspace = self.workspace
    person = TestPerson({'age': 1, 'name': 'Test Kees 1'})
    source_workspace.save(person, 'Saving a person')

    source_dir = source_workspace.working_dir
    clone_dir = '%s_clone' % (source_workspace.working_dir,)
    cloned_repo = EG.clone_repo(source_dir, clone_dir)

    cloned_workspace = EG.workspace(cloned_repo.working_dir)
    self.addCleanup(cloned_workspace.destroy)

    [cloned_person] = StorageManager(cloned_repo).iterate(TestPerson)
    self.assertEqual(person, cloned_person)
def index_content_type_object(event):
    """Index — or on delete, unindex — the model carried by ``event``.

    :param event:
        An event object exposing ``repo``, ``model``, ``config`` and
        ``change_type``.
    """
    im = ESManager(
        storage_manager=StorageManager(event.repo),
        es=get_es(event.config),
        index_prefix=get_index_prefix(event.repo.working_dir))
    if event.change_type == 'delete':
        im.unindex(event.model)
    else:
        im.index(event.model)
def pull_repo(self, env, repo):
    """Pull the repo's active branch and fire a webhook if it advanced.

    :param dict env: Environment mapping providing ``env['request']``.
    :param Repo repo: The git repository to pull.
    """
    sm = StorageManager(repo)
    branch = repo.active_branch
    tracking_branch = branch.tracking_branch()
    if tracking_branch:
        remote_name = tracking_branch.remote_name
    else:
        # NOTE(review): falls back to the first remote; raises IndexError
        # if the repository has no remotes at all — confirm callers
        # guarantee at least one remote.
        remote_name = repo.remotes[0].name

    original_commit = branch.commit
    sm.pull(branch_name=branch.name, remote_name=remote_name)
    last_commit = branch.commit

    # Only notify when the pull actually moved the branch head.
    if original_commit.hexsha != last_commit.hexsha:
        name = os.path.basename(repo.working_dir)
        request = env['request']
        self.notify(
            WebhookEvent(owner=None,
                         event_type='repo.push',
                         payload={
                             'repo': name,
                             'url': request.route_url(
                                 'repositoryresource', name=name)
                         }))
class Workspace(object):
    """
    The main API exposing a model interface to both a Git repository
    and an Elasticsearch index.

    :param git.Repo repo:
        A :py:class:`git.Repo` instance.
    :param dict es:
        A dictionary of values one would pass to elasticutils.get_es
        to get an Elasticsearch connection
    :param str index_prefix:
        The prefix to use when generating index names for Elasticsearch
    """

    def __init__(self, repo, es, index_prefix):
        self.repo = repo
        # Git-backed storage layer for model instances.
        self.sm = StorageManager(repo)
        # Kept so S() can build search objects with the same connection.
        self.es_settings = es
        self.im = ESManager(
            self.sm, get_es(**self.es_settings), index_prefix)
        self.working_dir = self.repo.working_dir
        self.index_prefix = index_prefix

    def setup(self, name, email):
        """
        Setup a Git repository & ES index if they do not yet exist.
        This is safe to run if already existing.

        :param str name:
            The name of the committer in this repository.
        :param str email:
            The email address of the committer in this repository.
        """
        if not self.sm.storage_exists():
            self.sm.create_storage()

        self.sm.write_config('user', {
            'name': name,
            'email': email,
        })
        if not self.im.index_exists(self.repo.active_branch.name):
            self.im.create_index(self.repo.active_branch.name)

    def exists(self):
        """
        Check if the Git repository or the ES index exists.
        Returns ``True`` if either of them exist.

        :returns: bool
        """
        if self.sm.storage_exists():
            branch = self.sm.repo.active_branch
            return self.im.index_exists(branch.name)
        return False

    def destroy(self):
        """
        Removes an ES index and a Git repository completely.
        Guaranteed to remove things completely, use with caution.
        """
        if self.sm.storage_exists():
            branch = self.sm.repo.active_branch
            if self.im.index_exists(branch.name):
                self.im.destroy_index(branch.name)
            self.sm.destroy_storage()

    def save(self, model, message, author=None, committer=None):
        """
        Save a :py:class:`elasticgit.models.Model` instance in Git and
        add it to the Elasticsearch index.

        :param elasticgit.models.Model model:
            The model instance
        :param str message:
            The commit message to write the model to Git with.
        :param tuple author:
            The author information (name, email address)
            Defaults repo default if unspecified.
        :param tuple committer:
            The committer information (name, email address).
            Defaults to the author if unspecified.
        """
        # Commit messages must be plain strings; transliterate unicode.
        if isinstance(message, unicode):
            message = unidecode(message)
        self.sm.store(model, message, author=author, committer=committer)
        self.im.index(model)

    def delete(self, model, message, author=None, committer=None):
        """
        Delete a :py:class`elasticgit.models.Model` instance from Git
        and the Elasticsearch index.

        :param elasticgit.models.Model model:
            The model instance
        :param str message:
            The commit message to remove the model from Git with.
        :param tuple author:
            The author information (name, email address)
            Defaults repo default if unspecified.
        :param tuple committer:
            The committer information (name, email address).
            Defaults to the author if unspecified.
        """
        # Commit messages must be plain strings; transliterate unicode.
        if isinstance(message, unicode):
            message = unidecode(message)
        self.sm.delete(model, message, author=author, committer=committer)
        self.im.unindex(model)

    def fast_forward(self, branch_name='master', remote_name='origin'):
        """Deprecated alias for :py:meth:`pull`."""
        warnings.warn('This method is deprecated, use pull() instead',
                      DeprecationWarning)
        return self.pull(branch_name=branch_name, remote_name=remote_name)

    def reindex_diff(self, diff_index):
        """Reindex every model class touched by ``diff_index``."""
        changed_model_set = set([])
        for diff in diff_index:
            if diff.new_file:
                path_info = self.sm.path_info(diff.b_blob.path)
                if path_info is not None:
                    changed_model_set.add(path_info[0])
            elif diff.renamed:
                path_info = self.sm.path_info(diff.a_blob.path)
                if path_info is not None:
                    changed_model_set.add(path_info[0])
            else:
                path_info = self.sm.path_info(diff.a_blob.path)
                if path_info is not None:
                    changed_model_set.add(path_info[0])

        for model_class in changed_model_set:
            self.reindex(model_class)

    def pull(self, branch_name='master', remote_name='origin'):
        """
        Fetch & Merge in an upstream's commits.

        :param str branch_name:
            The name of the branch to fast forward & merge in
        :param str remote_name:
            The name of the remote to fetch from.
        """
        changes = self.sm.pull(branch_name=branch_name,
                               remote_name=remote_name)

        # NOTE: This is probably more complicated than it needs to be
        #       If we have multiple remotes GitPython gets confused about
        #       deletes. It marks things as deletes because it may not
        #       exist on another remote.
        #
        #       Here we loop over all changes, track the models that've
        #       changed and then reindex fully to make sure we're in sync.
        if len(self.repo.remotes) > 1 and any(changes):
            return self.reindex_diff(changes)

        # NOTE: There's a very unlikely scenario where we're dealing with
        #       renames. This generally can only happen when a repository
        #       has been manually modififed. If that's the case then
        #       reindex everything as well
        if any(changes.iter_change_type('R')):
            return self.reindex_diff(changes)

        # unindex deleted blobs
        for diff in changes.iter_change_type('D'):
            path_info = self.sm.path_info(diff.a_blob.path)
            if path_info is None:
                continue
            self.im.raw_unindex(*path_info)

        # reindex added blobs
        for diff in changes.iter_change_type('A'):
            path_info = self.sm.path_info(diff.b_blob.path)
            if path_info is None:
                continue
            obj = self.sm.get(*path_info)
            self.im.index(obj)

        # reindex modified blobs
        for diff in changes.iter_change_type('M'):
            path_info = self.sm.path_info(diff.a_blob.path)
            if path_info is None:
                continue
            obj = self.sm.get(*path_info)
            self.im.index(obj)

    def reindex_iter(self, model_class, refresh_index=True):
        """
        Reindex everything that Git knows about in an iterator

        :param elasticgit.models.Model model_class:
        :param bool refresh_index:
            Whether or not to refresh the index after everything has
            been indexed. Defaults to ``True``
        """
        branch = self.repo.active_branch
        if not self.im.index_exists(branch.name):
            self.im.create_index(branch.name)
        iterator = self.sm.iterate(model_class)
        for model in iterator:
            yield self.im.index(model)
        if refresh_index:
            self.refresh_index()

    def reindex(self, model_class, refresh_index=True):
        """
        Same as :py:func:`reindex_iter` but returns a list instead
        of a generator.
        """
        return list(
            self.reindex_iter(model_class, refresh_index=refresh_index))

    def refresh_index(self):
        """
        Manually refresh the Elasticsearch index. In production this is
        not necessary but it is useful when running tests.
        """
        self.im.refresh_indices(self.repo.active_branch.name)

    def index_ready(self):
        """
        Check if the index is ready

        :returns: bool
        """
        return self.im.index_ready(self.repo.active_branch.name)

    def sync(self, model_class, refresh_index=True):
        """
        Resync a workspace, it assumes the Git repository is the source
        of truth and Elasticsearch is made to match. This involves two
        passes, first to index everything that Git knows about and
        unindexing everything that's in Elastisearch that Git does not
        know about.

        :param elasticgit.models.Model model_class:
            The model to resync
        :param bool refresh_index:
            Whether or not to refresh the index after indexing
            everything from Git
        """
        reindexed_uuids = set([])
        removed_uuids = set([])

        for model_obj in self.reindex_iter(model_class,
                                           refresh_index=refresh_index):
            reindexed_uuids.add(model_obj.uuid)

        for result in self.S(model_class).everything():
            if result.uuid not in reindexed_uuids:
                self.im.raw_unindex(model_class, result.uuid)
                removed_uuids.add(result.uuid)

        return reindexed_uuids, removed_uuids

    def setup_mapping(self, model_class):
        """
        Add a custom mapping for a model_class

        :param elasticgit.models.Model model_class:
        :returns: dict, the decoded dictionary from Elasticsearch
        """
        return self.im.setup_mapping(self.repo.active_branch.name,
                                     model_class)

    def setup_custom_mapping(self, model_class, mapping):
        """
        Add a custom mapping for a model class instead of accepting
        what the model_class defines.

        :param elasticgit.models.Model model_class:
        :param dict: the Elastisearch mapping definition
        :returns: dict, the decoded dictionary from Elasticsearch
        """
        return self.im.setup_custom_mapping(
            self.repo.active_branch.name, model_class, mapping)

    def get_mapping(self, model_class):
        """
        Get a mapping from Elasticsearch for a model_class

        :param elasticgit.models.Model model_class:
        :returns: dict
        """
        return self.im.get_mapping(self.repo.active_branch.name,
                                   model_class)

    def S(self, model_class):
        """
        Get a :py:class:`elasticutils.S` object for the given
        model class. Under the hood this dynamically generates a
        :py:class:`elasticutils.MappingType` and
        :py:class:`elasticutils.Indexable` subclass which maps the
        Elasticsearch results to :py:class:`elasticgit.models.Model`
        instances on the UUIDs.

        :param elasticgit.models.Model model_class:
            The class to provide a search interface for.
        """
        return S(
            self.im.get_mapping_type(model_class)).es(**self.es_settings)
class Workspace(object):
    """
    The main API exposing a model interface to both a Git repository
    and an Elasticsearch index.

    :param git.Repo repo:
        A :py:class:`git.Repo` instance.
    :param dict es:
        A dictionary of values one would pass to elasticutils.get_es
        to get an Elasticsearch connection
    :param str index_prefix:
        The prefix to use when generating index names for Elasticsearch
    """

    def __init__(self, repo, es, index_prefix):
        self.repo = repo
        # Git-backed storage layer for model instances.
        self.sm = StorageManager(repo)
        # Kept so S() can build search objects with the same connection.
        self.es_settings = es
        self.im = ESManager(self.sm, get_es(**self.es_settings),
                            index_prefix)
        self.working_dir = self.repo.working_dir
        self.index_prefix = index_prefix

    def setup(self, name, email):
        """
        Setup a Git repository & ES index if they do not yet exist.
        This is safe to run if already existing.

        :param str name:
            The name of the committer in this repository.
        :param str email:
            The email address of the committer in this repository.
        """
        if not self.sm.storage_exists():
            self.sm.create_storage()

        self.sm.write_config('user', {
            'name': name,
            'email': email,
        })
        if not self.im.index_exists(self.sm.active_branch()):
            self.im.create_index(self.sm.active_branch())

    def exists(self):
        """
        Check if the Git repository or the ES index exists.
        Returns ``True`` if either of them exist.

        :returns: bool
        """
        if self.sm.storage_exists():
            return self.im.index_exists(self.sm.active_branch())
        return False

    def destroy(self):
        """
        Removes an ES index and a Git repository completely.
        Guaranteed to remove things completely, use with caution.
        """
        if self.sm.storage_exists():
            if self.im.index_exists(self.sm.active_branch()):
                self.im.destroy_index(self.sm.active_branch())
            self.sm.destroy_storage()

    def save(self, model, message, author=None, committer=None):
        """
        Save a :py:class:`elasticgit.models.Model` instance in Git and
        add it to the Elasticsearch index.

        :param elasticgit.models.Model model:
            The model instance
        :param str message:
            The commit message to write the model to Git with.
        :param tuple author:
            The author information (name, email address)
            Defaults repo default if unspecified.
        :param tuple committer:
            The committer information (name, email address).
            Defaults to the author if unspecified.
        """
        # Commit messages must be plain strings; transliterate unicode.
        if isinstance(message, unicode):
            message = unidecode(message)
        self.sm.store(model, message, author=author, committer=committer)
        self.im.index(model)

    def delete(self, model, message, author=None, committer=None):
        """
        Delete a :py:class`elasticgit.models.Model` instance from Git
        and the Elasticsearch index.

        :param elasticgit.models.Model model:
            The model instance
        :param str message:
            The commit message to remove the model from Git with.
        :param tuple author:
            The author information (name, email address)
            Defaults repo default if unspecified.
        :param tuple committer:
            The committer information (name, email address).
            Defaults to the author if unspecified.
        """
        # Commit messages must be plain strings; transliterate unicode.
        if isinstance(message, unicode):
            message = unidecode(message)
        self.sm.delete(model, message, author=author, committer=committer)
        self.im.unindex(model)

    def fast_forward(self, branch_name='master', remote_name='origin'):
        """Deprecated alias for :py:meth:`pull`."""
        warnings.warn('This method is deprecated, use pull() instead',
                      DeprecationWarning)
        return self.pull(branch_name=branch_name, remote_name=remote_name)

    def index_diff(self, diff_index):
        """Apply a GitPython diff index to the Elasticsearch index."""
        # NOTE: This is probably more complicated than it needs to be
        #       If we have multiple remotes GitPython gets confused about
        #       deletes. It marks things as deletes because it may not
        #       exist on another remote.
        #
        #       Here we loop over all changes, track the models that've
        #       changed and then reindex fully to make sure we're in sync.
        if len(self.repo.remotes) > 1 and any(diff_index):
            return self.reindex_diff(diff_index)

        # NOTE: There's a very unlikely scenario where we're dealing with
        #       renames. This generally can only happen when a repository
        #       has been manually modififed. If that's the case then
        #       reindex everything as well
        if any(diff_index.iter_change_type('R')):
            return self.reindex_diff(diff_index)

        # unindex deleted blobs
        for diff in diff_index.iter_change_type('D'):
            path_info = self.sm.path_info(diff.a_blob.path)
            if path_info is None:
                continue
            self.im.raw_unindex(*path_info)

        # reindex added blobs
        for diff in diff_index.iter_change_type('A'):
            path_info = self.sm.path_info(diff.b_blob.path)
            if path_info is None:
                continue
            obj = self.sm.get(*path_info)
            self.im.index(obj)

        # reindex modified blobs
        for diff in diff_index.iter_change_type('M'):
            path_info = self.sm.path_info(diff.a_blob.path)
            if path_info is None:
                continue
            obj = self.sm.get(*path_info)
            self.im.index(obj)

    def reindex_diff(self, diff_index):
        """Reindex every model class touched by ``diff_index``."""
        changed_model_set = set([])
        for diff in diff_index:
            if diff.new_file:
                path_info = self.sm.path_info(diff.b_blob.path)
                if path_info is not None:
                    changed_model_set.add(path_info[0])
            elif diff.renamed:
                path_info = self.sm.path_info(diff.a_blob.path)
                if path_info is not None:
                    changed_model_set.add(path_info[0])
            else:
                path_info = self.sm.path_info(diff.a_blob.path)
                if path_info is not None:
                    changed_model_set.add(path_info[0])

        for model_class in changed_model_set:
            self.reindex(model_class)

    def pull(self, branch_name='master', remote_name='origin'):
        """
        Fetch & Merge in an upstream's commits.

        :param str branch_name:
            The name of the branch to fast forward & merge in
        :param str remote_name:
            The name of the remote to fetch from.
        """
        changes = self.sm.pull(branch_name=branch_name,
                               remote_name=remote_name)
        return self.index_diff(changes)

    def reindex_iter(self, model_class, refresh_index=True):
        """
        Reindex everything that Git knows about in an iterator

        :param elasticgit.models.Model model_class:
        :param bool refresh_index:
            Whether or not to refresh the index after everything has
            been indexed. Defaults to ``True``
        """
        if not self.im.index_exists(self.sm.active_branch()):
            self.im.create_index(self.sm.active_branch())
        iterator = self.sm.iterate(model_class)
        for model in iterator:
            yield self.im.index(model)
        if refresh_index:
            self.refresh_index()

    def reindex(self, model_class, refresh_index=True):
        """
        Same as :py:func:`reindex_iter` but returns a list instead
        of a generator.
        """
        return list(self.reindex_iter(model_class,
                                      refresh_index=refresh_index))

    def refresh_index(self):
        """
        Manually refresh the Elasticsearch index. In production this is
        not necessary but it is useful when running tests.
        """
        self.im.refresh_indices(self.sm.active_branch())

    def index_ready(self):
        """
        Check if the index is ready

        :returns: bool
        """
        return self.im.index_ready(self.sm.active_branch())

    def sync(self, model_class, refresh_index=True):
        """
        Resync a workspace, it assumes the Git repository is the source
        of truth and Elasticsearch is made to match. This involves two
        passes, first to index everything that Git knows about and
        unindexing everything that's in Elastisearch that Git does not
        know about.

        :param elasticgit.models.Model model_class:
            The model to resync
        :param bool refresh_index:
            Whether or not to refresh the index after indexing
            everything from Git
        """
        reindexed_uuids = set([])
        removed_uuids = set([])

        for model_obj in self.reindex_iter(model_class,
                                           refresh_index=refresh_index):
            reindexed_uuids.add(model_obj.uuid)

        for result in self.S(model_class).everything():
            if result.uuid not in reindexed_uuids:
                self.im.raw_unindex(model_class, result.uuid)
                removed_uuids.add(result.uuid)

        return reindexed_uuids, removed_uuids

    def setup_mapping(self, model_class):
        """
        Add a custom mapping for a model_class

        :param elasticgit.models.Model model_class:
        :returns: dict, the decoded dictionary from Elasticsearch
        """
        return self.im.setup_mapping(self.sm.active_branch(), model_class)

    def setup_custom_mapping(self, model_class, mapping):
        """
        Add a custom mapping for a model class instead of accepting
        what the model_class defines.

        :param elasticgit.models.Model model_class:
        :param dict: the Elastisearch mapping definition
        :returns: dict, the decoded dictionary from Elasticsearch
        """
        return self.im.setup_custom_mapping(self.sm.active_branch(),
                                            model_class, mapping)

    def get_mapping(self, model_class):
        """
        Get a mapping from Elasticsearch for a model_class

        :param elasticgit.models.Model model_class:
        :returns: dict
        """
        return self.im.get_mapping(self.sm.active_branch(), model_class)

    def S(self, model_class):
        """
        Get a :py:class:`elasticutils.S` object for the given
        model class. Under the hood this dynamically generates a
        :py:class:`elasticutils.MappingType` and
        :py:class:`elasticutils.Indexable` subclass which maps the
        Elasticsearch results to :py:class:`elasticgit.models.Model`
        instances on the UUIDs.

        :param elasticgit.models.Model model_class:
            The class to provide a search interface for.
        """
        return S(self.im.get_mapping_type(model_class)).es(
            **self.es_settings)