def test_files_import_internal_nohashes(tmpdir, collection, test_csv_dir, test_files_dir):
    """test CSV with hash columns (sha1/sha256/md5/size) removed
    """
    file_csv_path = os.path.join(
        test_csv_dir, 'ddrimport-files-import-internal-nohashes.csv')
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(test_files_dir, 'ddrimport-files-import-internal-nohashes.log')
    out = batch.Importer.import_files(
        file_csv_path,
        collection.identifier,
        VOCABS_URL,
        GIT_USER, GIT_MAIL, AGENT,
        log_path=log_path,
        tmp_dir=test_files_dir,
    )
    # save and commit
    repo = dvcs.repository(collection.path_abs)
    commit = repo.index.commit('test_files_import_internal_nohashes')
    # test hashes present
    check_file_hashes(collection.path_abs)
    # ensure no binaries in .git/objects
    print('log_path %s' % log_path)
    assert not find_binaries_in_git_objects(repo)
    assert not find_missing_annex_binaries(repo)
def test_files_import_external(tmpdir, collection, test_csv_dir, test_files_dir):
    """Test importing *external* files
    """
    print('collection_path %s' % collection.path_abs)
    file_csv_path = os.path.join(test_csv_dir, 'ddrimport-files-import-external.csv')
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(test_files_dir, 'ddrimport-files-import-external.log')
    out = batch.Importer.import_files(
        file_csv_path,
        collection.identifier,
        VOCABS_URL,
        GIT_USER, GIT_MAIL, AGENT,
        log_path=log_path,
        tmp_dir=test_files_dir,
    )
    # save and commit
    repo = dvcs.repository(collection.path_abs)
    commit = repo.index.commit('test_files_import_external')
    print('commit %s' % commit)
    # test hashes present
    check_file_hashes(collection.path_abs)
    # ensure no binaries in .git/objects
    print('log_path %s' % log_path)
    assert not find_binaries_in_git_objects(repo)
    assert not find_missing_annex_binaries(repo)
def missing_annex_files(self):
    """List File objects with missing binaries

    @returns: list of File objects
    """
    def just_id(oid):
        # some "file IDs" might have config.ACCESS_FILE_APPEND appended.
        # remove config.ACCESS_FILE_APPEND if present
        # NOTE: make sure we're not matching some other part of the ID
        # example: ddr-test-123-456-master-abc123-a
        #                                        ^^
        rindex = oid.rfind(config.ACCESS_FILE_APPEND)
        if rindex > 0:
            stem = oid[:rindex]
            suffix = oid[rindex:]
            if (len(oid) - len(stem)) \
            and (len(suffix) == len(config.ACCESS_FILE_APPEND)):
                return stem
        return oid

    def add_id_and_hash(item):
        item['hash'] = os.path.splitext(item['keyname'])[0]
        item['id'] = just_id(
            os.path.splitext(os.path.basename(item['file']))[0])
        return item

    return [
        add_id_and_hash(item)
        for item in dvcs.annex_missing_files(dvcs.repository(self.path))
    ]
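just_id only strips the access suffix when it terminates the ID; a substring match elsewhere is left alone. A standalone sketch of that check (assuming ACCESS_FILE_APPEND is '-a', as the example ID in the comments suggests):

def just_id(oid, access_file_append='-a'):
    # strip the access suffix only when the ID ends with it
    rindex = oid.rfind(access_file_append)
    if rindex > 0 and oid[rindex:] == access_file_append:
        return oid[:rindex]
    return oid

assert just_id('ddr-test-123-456-master-abc123-a') == 'ddr-test-123-456-master-abc123'
# '-a' also appears inside '-abc123' but not as the suffix, so nothing is stripped
assert just_id('ddr-test-123-456-master-abc123') == 'ddr-test-123-456-master-abc123'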
def clean(collection, remove):
    """TODO ddrimport cleanup subcommand docs
    """
    start = datetime.now()
    # ensure we have absolute paths (CWD+relpath)
    collection_path = os.path.abspath(os.path.normpath(collection))
    # Check args
    # (check existence first: isdir is False for nonexistent paths, so the
    # "does not exist" message would otherwise be unreachable)
    if not os.path.exists(collection_path):
        print('ddrimport: Collection does not exist.')
        sys.exit(1)
    if not os.path.isdir(collection_path):
        print('ddrimport: collection path must be a directory.')
        sys.exit(1)
    repo = dvcs.repository(collection_path)
    logging.debug('Resetting staged files')
    dvcs.reset(repo)
    logging.debug('Reverting modified files')
    dvcs.revert(repo)
    if remove:
        logging.debug('Removing untracked files')
        dvcs.remove_untracked(repo)
    status = dvcs.repo_status(repo)
    logging.debug('status\n%s' % status)
    finish = datetime.now()
    elapsed = finish - start
    logging.info('DONE - %s elapsed' % elapsed)
def ddrinfo(collection, json):
    """ddrinfo - Prints info about a repository.

    \b
    Example:
        ddr-info /PATH/TO/REPO
    """
    start = datetime.now()
    repo = dvcs.repository(collection)
    #logging.debug(repo)
    data = {}
    #logging.debug('Getting file info')
    data.update(file_info(repo))
    #logging.debug('Getting annex info')
    data.update(annex_info(repo))
    if json:
        print(simplejson.dumps(data))
    else:
        output(data)
    finish = datetime.now()
    elapsed = finish - start
def object_metadata(module, repo_path):
    """Metadata for the ddrlocal/ddrcmdln and models definitions used.

    @param module: collection, entity, files model definitions module
    @param repo_path: Absolute path to root of object's repo
    @returns: dict
    """
    if not config.APP_METADATA:
        repo = dvcs.repository(repo_path)
        config.APP_METADATA['git_version'] = '; '.join([
            dvcs.git_version(repo),
            dvcs.annex_version(repo)
        ])
        # ddr-cmdln
        url = 'https://github.com/densho/ddr-cmdln.git'
        config.APP_METADATA['application'] = url
        config.APP_METADATA['app_path'] = config.INSTALL_PATH
        config.APP_METADATA['app_commit'] = dvcs.latest_commit(
            config.INSTALL_PATH
        )
        config.APP_METADATA['app_release'] = VERSION
        # ddr-defs
        config.APP_METADATA['defs_path'] = modules.Module(module).path
        config.APP_METADATA['defs_commit'] = dvcs.latest_commit(
            modules.Module(module).path
        )
    return config.APP_METADATA
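This variant memoizes into config.APP_METADATA, so the relatively slow git/annex subprocess calls run once per process. The same pattern in isolation (a minimal sketch; names are illustrative, not from the DDR codebase):

import subprocess

_CACHE = {}

def app_metadata():
    # first call populates the dict; later calls return it unchanged
    if not _CACHE:
        _CACHE['git_version'] = subprocess.check_output(
            ['git', '--version']).decode().strip()
    return _CACHE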
def ddrinfo(collection, json):
    """ddrinfo - Prints info about a repository.

    \b
    Example:
        ddr-info /PATH/TO/REPO
    """
    start = datetime.now()
    repo = dvcs.repository(collection)
    #logging.debug(repo)
    data = {}
    #logging.debug('Getting file info')
    data.update(file_info(repo))
    #logging.debug('Getting annex info')
    data.update(annex_info(repo))
    if json:
        # the 'json' CLI flag shadows the stdlib json module,
        # so import the module under an alias here
        import json as jsonlib
        print(jsonlib.dumps(data))
    else:
        output(data)
    finish = datetime.now()
    elapsed = finish - start
def status(collection, short=False):
    """Command-line function for running git status on collection repository.

    @param collection: Collection
    @param short: Boolean Use short-format output
    @return: message ('ok' if successful)
    """
    return dvcs.repo_status(dvcs.repository(collection.path), short=short)
def fetch(collection):
    """Command-line function for fetching latest changes to git repo from origin/master.

    @param collection: Collection
    @return: message ('ok' if successful)
    """
    return dvcs.fetch(dvcs.repository(collection.path))
def annex_status(collection):
    """Command-line function for running git annex status on collection repository.

    @param collection: Collection
    @return: message ('ok' if successful)
    """
    return dvcs.annex_status(dvcs.repository(collection.path))
def cmp_model_definition_commits(self, document_commit, module_commit):
    """Indicate document's model defs are newer or older than module's.

    Prepares repository and document/module commits to be compared
    by DDR.dvcs.cmp_commits.  See that function for how to interpret
    the results.
    Note: if a document has no defs commit it is considered older
    than the module.

    NOTE: commit may not be found in log if definitions were on a
    branch at the time the document was committed.

    @param document: A Collection, Entity, or File object.
    @returns: dict See DDR.dvcs.cmp_commits
    """
    try:
        repo = dvcs.repository(self.path)
    except dvcs.git.InvalidGitRepositoryError:
        # GitPython doesn't understand git worktrees
        # return empty dict see dvcs.cmp_commits
        return {'a':'', 'b':'', 'op':'--'}
    return dvcs.cmp_commits(
        repo,
        document_commit,
        module_commit
    )
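dvcs.cmp_commits itself is not shown in this listing, but the worktree fallback above reveals the result's shape: keys 'a' and 'b' for the two commits and 'op' for the comparison, with '--' meaning no comparison was possible. A hedged consumer sketch built only on that shape:

def describe_cmp(result):
    # 'op' semantics assumed from the fallback dict above
    if result.get('op') == '--':
        return 'not comparable (e.g. git worktree)'
    return '%(a)s %(op)s %(b)s' % result

print(describe_cmp({'a': '', 'b': '', 'op': '--'}))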
def stage_files(entity, git_files, annex_files, new_files, log, show_staged=True):
    # TODO move to DDR.dvcs?
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)
    # These vars will be used to determine if stage operation is successful.
    # If called in batch operation there may already be staged files.
    # stage_planned   Files added/modified by this function call
    # stage_already   Files that were already staged
    # stage_predicted List of staged files that should result from this operation.
    # stage_new       Files that are being added.
    stage_planned = git_files + annex_files
    stage_already = dvcs.list_staged(repo)
    stage_predicted = predict_staged(stage_already, stage_planned)
    stage_new = [x for x in stage_planned if x not in stage_already]
    log.ok('| %s files to stage:' % len(stage_planned))
    for sp in stage_planned:
        log.ok('| %s' % sp)
    stage_ok = False
    staged = []
    try:
        log.ok('git stage')
        dvcs.stage(repo, git_files)
        log.ok('annex stage')
        dvcs.annex_stage(repo, annex_files)
        log.ok('ok')
        staged = dvcs.list_staged(repo)
    except:
        # FAILED! print traceback to addfile log
        log.not_ok(traceback.format_exc().strip())
    finally:
        if show_staged:
            log.ok('| %s files staged:' % len(staged))
            log.ok('show_staged %s' % show_staged)
            for sp in staged:
                log.ok('| %s' % sp)
    if len(staged) == len(stage_predicted):
        log.ok('| %s files staged (%s new, %s modified)' % (
            len(staged), len(stage_new), len(stage_already))
        )
        stage_ok = True
    else:
        log.not_ok('%s new files staged (should be %s)' % (
            len(staged), len(stage_predicted))
        )
    if not stage_ok:
        log.not_ok('File staging aborted. Cleaning up')
        # try to pick up the pieces
        # mv files back to tmp_dir
        # TODO Properly clean up git-annex-added files.
        # This clause moves the *symlinks* to annex files but leaves
        # the actual binaries in the .git/annex objects dir.
        for tmp,dest in new_files:
            if os.path.islink(dest):
                log.not_ok('| link (not moving) %s' % dest)
            else:
                log.not_ok('| mv %s %s' % (dest,tmp))
                os.rename(dest,tmp)
        log.not_ok('finished cleanup. good luck...')
        log.crash('Add file aborted, see log file for details: %s' % log.logpath)
    return repo
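predict_staged is not shown in this listing; from how its result is compared against dvcs.list_staged output, it plausibly returns the union of already-staged and about-to-be-staged paths. A hypothetical stand-in under that assumption:

def predict_staged(already, planned):
    # every path that is staged now, plus every path this call will stage,
    # counted once each
    return sorted(set(already) | set(planned))

assert predict_staged(['a.json'], ['a.json', 'b.jpg']) == ['a.json', 'b.jpg']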
def test_repository(tmpdir):
    """Tests that repository config values are set correctly"""
    # git_set_configs
    # annex_set_configs
    # repository
    path = str(tmpdir / 'ddr-test')
    repo = git.Repo.init(path)
    dvcs.repository(path=path, user_name=USER_NAME, user_mail=USER_MAIL)
    reader = repo.config_reader()
    reader.sections()
    core_items = {i[0]: i[1] for i in reader.items('core')}
    user_items = {i[0]: i[1] for i in reader.items('user')}
    annex_items = {i[0]: i[1] for i in reader.items('annex')}
    assert core_items.get('fileMode') == 'false'
    assert user_items.get('name') == USER_NAME
    assert user_items.get('email') == USER_MAIL
    assert annex_items.get('sshcaching') == 'false'
def set_repo_description(self):
    """Set COLLECTION/.git/description based on self.title
    """
    desc_path = os.path.join(self.git_path, 'description')
    if self.title and os.path.exists(self.git_path) and os.access(desc_path, os.W_OK):
        repo = dvcs.repository(self.path)
        repo.description = self.title
def repo_annex_status(self):
    """Get annex status of collection repo.
    """
    if not self._astatus and (os.path.exists(self.git_path)):
        astatus = dvcs.annex_status(dvcs.repository(self.path))
        if astatus:
            self._astatus = astatus
    return self._astatus
def file_destroy(user_name, user_mail, collection_path, entity_uid, rm_files, updated_files, agent=''):
    """Command-line function for deleting files from an entity.

    - check that paths exist, etc
    - instantiate collection, repo objects
    - remove the file(s)
    - update control and changelog
    - commit everything

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param rm_files: List of paths to files to delete (relative to entity files dir).
    @param updated_files: List of paths to updated file(s), relative to the entity.
    @param agent: (optional) Name of software making the change.
    @return: message ('ok' if successful)
    """
    collection = DDRCollection(collection_path)
    entity = DDREntity(collection.entity_path(entity_uid))
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    if not GIT_REMOTE_NAME in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    # updated file paths are relative to collection root
    git_files = [os.path.join('files', entity.uid, f) for f in updated_files]
    # Only list the original file in changelog
    # TODO use a models.File function to ID the original file
    changelog_files = [f for f in rm_files if ('-a.jpg' not in f) and ('.json' not in f)]
    # remove the files
    # NOTE: entity files must be removed at this point so the entity will be
    # properly removed from the control file
    git = repo.git
    for f in rm_files:
        git.rm('-rf', f)
    # update entity control
    econtrol = entity.control()
    econtrol.update_checksums(entity)
    econtrol.write()
    git_files.append(econtrol.path_rel)
    # update entity changelog
    changelog_messages = ['Deleted entity file {}'.format(f) for f in changelog_files]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    write_changelog_entry(entity.changelog_path, changelog_messages, user_name, user_mail)
    git_files.append(entity.changelog_path_rel)
    # add files and commit
    commit_message = dvcs.compose_commit_message('Deleted entity file(s)', agent=agent)
    repo = commit_files(repo, commit_message, git_files, [])
    return 0,'ok'
def merge( request, repo, org, cid ):
    """Decides how to merge the various files in a merge conflict.

    Sends user around to different editors and things until everything is merged.
    """
    collection_path = Collection.collection_path(request,repo,org,cid)
    repository = dvcs.repository(collection_path)
    collection = Collection.from_json(collection_path)
    task_id = collection.locked()
    status = commands.status(collection_path)
    ahead = collection.repo_ahead()
    behind = collection.repo_behind()
    diverged = collection.repo_diverged()
    conflicted = collection.repo_conflicted()
    unmerged = dvcs.list_conflicted(repository)
    staged = dvcs.list_staged(repository)
    if request.method == 'POST':
        form = MergeCommitForm(request.POST)
        if form.is_valid():
            which = form.cleaned_data['which']
            if which == 'merge':
                dvcs.merge_commit(repository)
                committed = 1
            elif which == 'commit':
                dvcs.diverge_commit(repository)
                committed = 1
            else:
                committed = 0
            if committed:
                if task_id:
                    collection.unlock(task_id)
                messages.error(request, 'Merge conflict has been resolved. Please sync to make your changes available to other users.')
                return HttpResponseRedirect( reverse('webui-collection', args=[repo,org,cid]) )
            return HttpResponseRedirect( reverse('webui-merge', args=[repo,org,cid]) )
    else:
        which = 'unknown'
        if conflicted and not unmerged:
            which = 'merge'
        elif diverged and staged:
            which = 'commit'
        form = MergeCommitForm({'path':collection_path, 'which':which,})
    return render_to_response(
        'webui/merge/index.html',
        {'repo': repo,
         'org': org,
         'cid': cid,
         'collection_path': collection_path,
         'collection': collection,
         'status': status,
         'conflicted': conflicted,
         'ahead': ahead,
         'behind': behind,
         'unmerged': unmerged,
         'diverged': diverged,
         'staged': staged,
         'form': form,},
        context_instance=RequestContext(request, processors=[])
    )
def file_destroy(user_name, user_mail, collection, entity, rm_files, updated_files, agent='', commit=True):
    """Remove file and metadata

    - check that paths exist, etc
    - instantiate collection, repo objects
    - remove the file(s)
    - update control and changelog
    - commit everything

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param entity: Entity
    @param rm_files: List of paths to files to delete (relative to entity files dir).
    @param updated_files: List of paths to updated file(s), relative to the entity.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: exit,message,touched_files ('ok' if successful)
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # updated file paths are relative to collection root
    git_files = [f for f in updated_files]
    # remove the files
    # NOTE: File must be removed from filesystem at this point
    # so the File will be properly removed from the control file
    for f in rm_files:
        repo.git.rm('-rf', f)
    # update entity changelog
    changelog_files = [
        # dont list access files in changelog
        # TODO use a models.File function to ID the original file
        f for f in rm_files
        if ('-a.jpg' not in f) and ('.json' not in f)
    ]
    changelog_messages = [
        'Deleted file {}'.format(os.path.basename(f))
        for f in changelog_files
    ]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    write_changelog_entry(entity.changelog_path, changelog_messages, user_name, user_mail)
    git_files.append(entity.changelog_path_rel)
    dvcs.stage(repo, git_files)
    if commit:
        commit_obj = dvcs.commit(repo, 'Deleted file(s)', agent)
    return 0, 'ok', git_files
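A call sketch for this version, assuming Collection/Entity objects like those used throughout these examples (all paths and names here are hypothetical):

code, msg, touched = file_destroy(
    'USER', 'user@example.com',
    collection, entity,
    rm_files=['files/ddr-test-123-456-master-abc123.tif'],
    updated_files=['files/ddr-test-123-456/entity.json'],
    agent='example-agent',
    commit=False,  # stage only; caller commits later
)
assert msg == 'ok'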
def repo_fetch(self):
    """Fetch latest changes to collection repo from origin/master.
    """
    result = '-1'
    if os.path.exists(self.git_path):
        result = dvcs.fetch(dvcs.repository(self.path))
    else:
        result = '%s is not a git repository' % self.path
    return result
def entity_destroy(user_name, user_mail, collection_path, entity_uid, agent=''):
    """Command-line function for destroying an entity and removing it from the collection.

    - check that paths exist, etc
    - instantiate collection, repo objects
    - remove entity dir
    - update control and changelog
    - commit everything

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param agent: (optional) Name of software making the change.
    @return: message ('ok' if successful)
    """
    entity_dir = os.path.join(collection_path, 'files', entity_uid)
    if not os.path.exists(collection_path):
        raise Exception('collection_path not found: %s' % collection_path)
    if not os.path.exists(entity_dir):
        raise Exception('entity not found: %s' % entity_dir)
    collection = DDRCollection(collection_path)
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    if not GIT_REMOTE_NAME in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    git_files = []
    # remove entity directory
    # NOTE: entity files must be removed at this point so the entity will be
    # properly removed from the control file
    git = repo.git
    git.rm('-rf', entity_dir)
    # update collection control
    ccontrol = collection.control()
    ccontrol.update_checksums(collection)
    ccontrol.write()
    git_files.append(ccontrol.path)
    # prep collection log entries
    changelog_messages = ['Deleted entity {}'.format(entity_uid),]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(changelog_messages[0], agent=agent)
    # collection changelog
    write_changelog_entry(collection.changelog_path, changelog_messages, user=user_name, email=user_mail)
    git_files.append(collection.changelog_path)
    # commit
    repo = commit_files(repo, commit_message, git_files)
    return 0,'ok'
def test_repository():
    """Tests that repository config values are set correctly"""
    # git_set_configs
    # annex_set_configs
    # repository
    path = os.path.join(
        TESTING_BASE_DIR,
        'ddr-test-%s' % datetime.now(config.TZ).strftime('%Y%m%d-%H%M%S'))
    user = '******'
    mail = '*****@*****.**'
    repo = git.Repo.init(path)
    dvcs.repository(path=path, user_name=user, user_mail=mail)
    reader = repo.config_reader()
    reader.sections()
    core_items = {i[0]: i[1] for i in reader.items('core')}
    user_items = {i[0]: i[1] for i in reader.items('user')}
    annex_items = {i[0]: i[1] for i in reader.items('annex')}
    assert core_items.get('fileMode') == 'false'
    assert user_items.get('name') == user
    assert user_items.get('email') == mail
    assert annex_items.get('sshcaching') == 'false'
def repo_status(self):
    """Get status of collection repo vis-a-vis origin/master.

    The repo_(synced,ahead,behind,diverged,conflicted) functions all use the
    result of this function so that git-status is only called once.
    """
    if not self._status and (os.path.exists(self.git_path)):
        status = dvcs.repo_status(dvcs.repository(self.path), short=True)
        if status:
            self._status = status
    return self._status
def entity_destroy(user_name, user_mail, entity, updated_files, agent='', commit=True):
    """Command-line function for destroying an entity and removing it from the collection.

    - check that paths exist, etc
    - instantiate collection, repo objects
    - remove entity dir
    - update control and changelog
    - commit everything

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param entity: Entity
    @param updated_files: List of paths to updated file(s), relative to the entity.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: message ('ok' if successful)
    """
    collection = entity.collection()
    parent = entity.identifier.parent().object()
    repo = dvcs.repository(collection.path_abs, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    git_files = updated_files
    # remove entity directory
    # NOTE: entity files must be removed at this point so the entity will be
    # properly removed from the control file
    repo.git.rm('-rf', entity.path_abs)
    # prep collection log entries
    changelog_messages = [
        'Deleted entity {}'.format(entity.id),
    ]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(changelog_messages[0], agent=agent)
    # collection changelog
    write_changelog_entry(parent.changelog_path, changelog_messages, user=user_name, email=user_mail)
    git_files.append(parent.changelog_path)
    dvcs.stage(repo, git_files)
    # commit
    if commit:
        repo = commit_files(repo, commit_message, git_files)
    return 0, 'ok'
def stage_files(entity, git_files, annex_files, log, show_staged=True):
    """Stage files; check before and after to ensure all files get staged

    @param entity: DDR.models.entities.Entity
    @param git_files: list
    @param annex_files: list
    @param log: AddFileLogger
    @param show_staged: bool
    @returns: repo
    """
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)
    # Remove any files in git_files that are in annex_files
    git_files = [
        path for path in git_files
        if path not in annex_files
    ]
    log.ok('| BEFORE staging')
    staged_before, modified_before, untracked_before = repo_status(repo, log)
    stage_these = sorted(list(set(git_files + annex_files)))
    log.ok('| staging %s files:' % len(stage_these))
    for path in stage_these:
        log.ok('| %s' % path)
    stage_ok = False
    staged = []
    try:
        log.ok('| annex stage')
        # Stage annex files (binaries) before non-binary git files
        # else binaries might end up in .git/objects/ which would be NOT GOOD
        dvcs.annex_stage(repo, annex_files)
        log.ok('| git stage')
        # If git_files contains binaries they are already staged by now.
        dvcs.stage(repo, git_files)
        log.ok('| ok')
    except:
        # FAILED! print traceback to addfile log
        log.not_ok(traceback.format_exc().strip())
    log.ok('| AFTER staging')
    staged_after, modified_after, untracked_after = repo_status(repo, log)
    # Crash if not staged
    still_modified = [path for path in stage_these if path in modified_after]
    if still_modified:
        log.not_ok('These files are still modified')
        for path in still_modified:
            log.not_ok('| %s' % path)
        log.crash('Add file aborted, see log file for details: %s' % log.logpath)
    return repo
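The repo_status helper called before and after staging is not included in this listing; given the (staged, modified, untracked) triple it returns, a plausible GitPython sketch (hypothetical, for illustration only):

def repo_status(repo, log):
    # staged: index vs HEAD; modified: working tree vs index
    staged = [d.a_path for d in repo.index.diff('HEAD')]
    modified = [d.a_path for d in repo.index.diff(None)]
    untracked = repo.untracked_files
    log.ok('| staged %s modified %s untracked %s' % (
        len(staged), len(modified), len(untracked)))
    return staged, modified, untracked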
def git_status( request, cid ):
    collection = Collection.from_identifier(Identifier(cid))
    alert_if_conflicted(request, collection)
    gitstatus = collection.gitstatus()
    remotes = dvcs.remotes(dvcs.repository(collection.path))
    return render(request, 'webui/collections/git-status.html', {
        'collection': collection,
        'status': gitstatus.get('status', 'git-status unavailable'),
        'astatus': gitstatus.get('annex_status', 'annex-status unavailable'),
        'timestamp': gitstatus.get('timestamp'),
        'remotes': remotes,
    })
def entity_destroy(user_name, user_mail, collection, entity, agent=''):
    """Command-line function for destroying an entity and removing it from the collection.

    - check that paths exist, etc
    - instantiate collection, repo objects
    - remove entity dir
    - update control and changelog
    - commit everything

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param entity: Entity
    @param agent: (optional) Name of software making the change.
    @return: message ('ok' if successful)
    """
    if not os.path.exists(collection.path_abs):
        raise Exception('collection_path not found: %s' % collection.path_abs)
    if not os.path.exists(entity.path_abs):
        raise Exception('entity not found: %s' % entity.path_abs)
    repo = dvcs.repository(collection.path_abs, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    git_files = []
    # remove entity directory
    # NOTE: entity files must be removed at this point so the entity will be
    # properly removed from the control file
    git = repo.git
    git.rm('-rf', entity.path_abs)
    # update collection control
    ccontrol = collection.control()
    ccontrol.update_checksums(collection)
    ccontrol.write()
    git_files.append(ccontrol.path)
    # prep collection log entries
    changelog_messages = ['Deleted entity {}'.format(entity.id),]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(changelog_messages[0], agent=agent)
    # collection changelog
    write_changelog_entry(collection.changelog_path, changelog_messages, user=user_name, email=user_mail)
    git_files.append(collection.changelog_path)
    # commit
    repo = commit_files(repo, commit_message, git_files)
    return 0,'ok'
def test_update_files(tmpdir, collection, test_csv_dir, test_files_dir):
    hashes_before = collect_hashes(collection.path_abs)
    file_csv_path = os.path.join(test_csv_dir, 'ddrimport-file-update.csv')
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(test_files_dir, 'ddrimport-file-update.log')
    out = batch.Importer.import_files(
        file_csv_path,
        collection.identifier,
        VOCABS_URL,
        GIT_USER, GIT_MAIL, AGENT,
        log_path=log_path,
        tmp_dir=test_files_dir,
    )
    repo = dvcs.repository(collection.path_abs)
    staged = sorted(dvcs.list_staged(repo))
    # test
    unstaged = []
    for path in EXPECTED_UPDATE_FILES:
        if path not in staged:
            unstaged.append(path)
    unstaged = sorted(unstaged)
    for n, path in enumerate(unstaged):
        print('UNSTAGED %s %s' % (n + 1, path))
    print(repo)
    print(log_path)
    assert not unstaged
    # save and commit
    repo = dvcs.repository(collection.path_abs)
    commit = repo.index.commit('test_update_files')
    # test hashes present
    check_file_hashes(collection.path_abs)
    # test hashes not modified
    hashes_after = collect_hashes(collection.path_abs)
    check_hashes(hashes_before, hashes_after)
    # ensure no binaries in .git/objects
    print('log_path %s' % log_path)
    assert not find_binaries_in_git_objects(repo)
    assert not find_missing_annex_binaries(repo)
def entity_update(user_name, user_mail, collection, entity, updated_files, agent='', commit=True):
    """Command-line function for committing changes to the specified entity file.

    NOTE: Does not push to the workbench server.
    Updates entity changelog but NOT in collection changelog.
    Makes an entry in git log.

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param entity: Entity
    @param updated_files: List of paths to updated file(s), relative to the entity.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: message ('ok' if successful)
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # entity file paths are relative to collection root
    git_files = []
    for f in updated_files:
        git_files.append(os.path.join('files', entity.id, str(f)))
    # entity changelog
    entity_changelog_messages = []
    for f in updated_files:
        p = os.path.join(entity.id, f)
        entity_changelog_messages.append('Updated entity file {}'.format(p))
    # prep log entries
    if agent:
        entity_changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message('Updated entity file(s)', agent=agent)
    write_changelog_entry(entity.changelog_path, entity_changelog_messages, user=user_name, email=user_mail)
    git_files.append(entity.changelog_path_rel)
    if commit:
        # add files and commit
        repo = commit_files(repo, commit_message, git_files, [])
    return 0, 'ok'
def sync(user_name, user_mail, collection_path):
    """Sync repo with bare clone on hub server; use instead of git-annex-sync.

    Git-annex has a "sync" command for communicating annex changes between
    repositories, but it is designed to be used between non-bare repositories.
    Normally Git does not support pushing to non-bare repositories, and
    git-annex does some trickery involving "synced/BRANCH" branches to make
    this work.
    Reference: http://git-annex.branchable.com/sync/

    When git-annex-sync is used between a non-bare repo and a bare repo
    (e.g. between a local repo and our hub server running Gitolite), the
    "synced/master" branches do not get merged in to master and syncing
    no longer works.  Therefore it is necessary to sync manually.

    If you think you want to use git-annex-sync, remember that we tried this
    in commit 1857a7aa3f and it did not work and we reverted to manual syncing.

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @return: message ('ok' if successful)
    """
    collection = DDRCollection(collection_path)
    repo = dvcs.repository(collection.path, user_name, user_mail)
    logging.debug('repo: %s' % repo)
    dvcs.set_annex_description(repo)
    if not GIT_REMOTE_NAME in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    # list remotes
    logging.debug('remotes')
    for remote in dvcs.remotes(collection_path):
        logging.debug('- %s %s' % (remote['name'], remote['url']))
    # pull
    logging.debug('git pull %s master' % GIT_REMOTE_NAME)
    repo.git.checkout('master')
    repo.git.pull(GIT_REMOTE_NAME, 'master')
    logging.debug('git pull %s git-annex' % GIT_REMOTE_NAME)
    repo.git.checkout('git-annex')
    repo.git.pull(GIT_REMOTE_NAME, 'git-annex')
    #logging.debug('OK')
    # push
    logging.debug('git push %s git-annex' % GIT_REMOTE_NAME)
    repo.git.checkout('git-annex')
    repo.git.push(GIT_REMOTE_NAME, 'git-annex')
    logging.debug('git push %s master' % GIT_REMOTE_NAME)
    repo.git.checkout('master')
    repo.git.push(GIT_REMOTE_NAME, 'master')
    logging.debug('OK')
    return 0,'ok'
def git_status( request, repo, org, cid ):
    collection = Collection.from_request(request)
    alert_if_conflicted(request, collection)
    gitstatus = collection.gitstatus()
    remotes = dvcs.remotes(dvcs.repository(collection.path))
    return render_to_response(
        'webui/collections/git-status.html',
        {'collection': collection,
         'status': gitstatus.get('status', 'git-status unavailable'),
         'astatus': gitstatus.get('annex_status', 'annex-status unavailable'),
         'timestamp': gitstatus.get('timestamp'),
         'remotes': remotes,
        },
        context_instance=RequestContext(request, processors=[])
    )
def annex_push(collection, file_path_rel):
    """Push a git-annex file to workbench.

    Example file_paths:
        ddr-densho-1-1/files/video1.mov
        ddr-densho-42-17/files/image35.jpg
        ddr-one-35-248/files/newspaper.pdf

    $ git annex copy PATH --to=REMOTE

    @param collection: Collection
    @param file_path_rel: Path to file relative to collection root
    @return: message ('ok' if successful)
    """
    file_path_abs = os.path.join(collection.path, file_path_rel)
    logging.debug(' collection.path {}'.format(collection.path))
    logging.debug(' file_path_rel {}'.format(file_path_rel))
    logging.debug(' file_path_abs {}'.format(file_path_abs))
    if not os.path.exists(collection.path):
        logging.error(' NO COLLECTION AT {}'.format(collection.path))
        return 1, 'no collection'
    if not os.path.exists(collection.annex_path):
        logging.error(' NO GIT ANNEX AT {}'.format(collection.annex_path))
        return 1, 'no annex'
    if not os.path.exists(file_path_abs):
        logging.error(' NO FILE AT {}'.format(file_path_abs))
        return 1, 'no file'
    # let's do this thing
    repo = dvcs.repository(collection.path)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    logging.debug(' git annex copy -t {} {}'.format(config.GIT_REMOTE_NAME, file_path_rel))
    stdout = repo.git.annex('copy', '-t', config.GIT_REMOTE_NAME, file_path_rel)
    logging.debug('\n{}'.format(stdout))
    # confirm that it worked
    whereis = dvcs.annex_whereis_file(repo, file_path_rel)
    if whereis['success']:
        remotes = [
            r['description'] for r in whereis['whereis'] if not r['here']
        ]
        logging.debug(' present in remotes {}'.format(remotes))
        logging.debug(
            ' it worked: {}'.format(config.GIT_REMOTE_NAME in remotes))
    logging.debug(' DONE')
    return 0, 'ok'
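A call sketch using one of the example paths from the docstring (the Collection object is assumed to come from the surrounding application):

code, msg = annex_push(collection, 'ddr-densho-1-1/files/video1.mov')
if code != 0:
    logging.error('annex push failed: %s' % msg)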
def entity_update(user_name, user_mail, collection_path, entity_uid, updated_files, agent=''):
    """Command-line function for committing changes to the specified entity file.

    NOTE: Does not push to the workbench server.
    Updates entity changelog but NOT in collection changelog.
    Makes an entry in git log.

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param updated_files: List of paths to updated file(s), relative to the entity.
    @param agent: (optional) Name of software making the change.
    @return: message ('ok' if successful)
    """
    collection = DDRCollection(collection_path)
    entity = DDREntity(collection.entity_path(entity_uid))
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    if not GIT_REMOTE_NAME in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    # entity file paths are relative to collection root
    git_files = []
    for f in updated_files:
        git_files.append(os.path.join('files', entity.uid, f))
    # entity changelog
    entity_changelog_messages = []
    for f in updated_files:
        p = os.path.join(entity.uid, f)
        entity_changelog_messages.append('Updated entity file {}'.format(p))
    # prep log entries
    if agent:
        entity_changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message('Updated entity file(s)', agent=agent)
    write_changelog_entry(entity.changelog_path, entity_changelog_messages, user=user_name, email=user_mail)
    git_files.append(entity.changelog_path_rel)
    # add files and commit
    repo = commit_files(repo, commit_message, git_files, [])
    return 0,'ok'
def update( base_dir, collection_path ):
    """Gets a bunch of status info for the collection and refreshes the cached copy

    timestamp, elapsed, status, annex_status, syncstatus

    @param base_dir: Absolute path to base directory
    @param collection_path: Absolute path to collection repo
    @returns: dict
    """
    start = datetime.now()
    repo = dvcs.repository(collection_path)
    status = dvcs.repo_status(repo, short=True)
    annex_status = dvcs.annex_status(repo)
    timestamp = datetime.now()
    syncstatus = sync_status(collection_path, git_status=status, timestamp=timestamp, force=True)
    elapsed = timestamp - start
    text = write(base_dir, collection_path, timestamp, elapsed, status, annex_status, syncstatus)
    return loads(text)
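A call sketch (both paths hypothetical); the returned dict is whatever write() serialized, so it should carry at least the values named in the docstring:

data = update('/var/lib/ddr', '/var/www/media/ddr/ddr-test-123')
for key, value in data.items():
    print(key, value)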
def object_metadata(module, repo_path):
    """Metadata for the ddrlocal/ddrcmdln and models definitions used.

    @param module: collection, entity, files model definitions module
    @param repo_path: Absolute path to root of object's repo
    @returns: dict
    """
    repo = dvcs.repository(repo_path)
    gitversion = '; '.join([dvcs.git_version(repo), dvcs.annex_version(repo)])
    data = {
        'application': 'https://github.com/densho/ddr-cmdln.git',
        'app_commit': dvcs.latest_commit(config.INSTALL_PATH),
        'app_release': VERSION,
        'defs_path': modules.Module(module).path,
        'models_commit': dvcs.latest_commit(modules.Module(module).path),
        'git_version': gitversion,
    }
    return data
def update(user_name, user_mail, collection, updated_files, agent='', commit=False):
    """Command-line function for committing changes to the specified file.

    NOTE: Does not push to the workbench server.

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param updated_files: List of relative paths to updated file(s).
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: message ('ok' if successful)
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    if repo:
        logging.debug(' git repo {}'.format(collection.path))
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # prep log entries
    changelog_messages = []
    for f in updated_files:
        changelog_messages.append('Updated collection file(s) {}'.format(f))
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message('Updated metadata file(s)', agent=agent)
    # write changelog
    write_changelog_entry(collection.changelog_path, changelog_messages, user_name, user_mail)
    if os.path.exists(collection.changelog_path):
        updated_files.append(collection.changelog_path)
    else:
        logging.error(' COULD NOT UPDATE changelog')
    if commit:
        # add files and commit
        repo = commit_files(repo, commit_message, updated_files, [])
    return 0, 'ok'
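A call sketch for the collection updater (user, files, and agent hypothetical):

code, msg = update(
    'USER', 'user@example.com',
    collection,
    updated_files=['collection.json'],
    agent='example-agent',
    commit=True,
)
assert msg == 'ok'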
def cmp_model_definition_commits(self, document_commit, module_commit):
    """Indicate document's model defs are newer or older than module's.

    Prepares repository and document/module commits to be compared
    by DDR.dvcs.cmp_commits.  See that function for how to interpret
    the results.
    Note: if a document has no defs commit it is considered older
    than the module.

    NOTE: commit may not be found in log if definitions were on a
    branch at the time the document was committed.

    @param document: A Collection, Entity, or File object.
    @returns: dict See DDR.dvcs.cmp_commits
    """
    return dvcs.cmp_commits(
        dvcs.repository(self.path),
        document_commit,
        module_commit
    )
def annex_push(collection_path, file_path_rel):
    """Push a git-annex file to workbench.

    Example file_paths:
        ddr-densho-1-1/files/video1.mov
        ddr-densho-42-17/files/image35.jpg
        ddr-one-35-248/files/newspaper.pdf

    $ git annex copy PATH --to=REMOTE

    @param collection_path: Absolute path to collection repo.
    @param file_path_rel: Path to file relative to collection root
    @return: message ('ok' if successful)
    """
    collection = DDRCollection(collection_path)
    file_path_abs = os.path.join(collection.path, file_path_rel)
    logging.debug(' collection.path {}'.format(collection.path))
    logging.debug(' file_path_rel {}'.format(file_path_rel))
    logging.debug(' file_path_abs {}'.format(file_path_abs))
    if not os.path.exists(collection.path):
        logging.error(' NO COLLECTION AT {}'.format(collection.path))
        return 1,'no collection'
    if not os.path.exists(collection.annex_path):
        logging.error(' NO GIT ANNEX AT {}'.format(collection.annex_path))
        return 1,'no annex'
    if not os.path.exists(file_path_abs):
        logging.error(' NO FILE AT {}'.format(file_path_abs))
        return 1,'no file'
    # let's do this thing
    repo = dvcs.repository(collection.path)
    repo.git.checkout('master')
    if not GIT_REMOTE_NAME in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    logging.debug(' git annex copy -t {} {}'.format(GIT_REMOTE_NAME, file_path_rel))
    stdout = repo.git.annex('copy', '-t', GIT_REMOTE_NAME, file_path_rel)
    logging.debug('\n{}'.format(stdout))
    # confirm that it worked
    remotes = dvcs.annex_whereis_file(repo, file_path_rel)
    logging.debug(' present in remotes {}'.format(remotes))
    logging.debug(' it worked: {}'.format(GIT_REMOTE_NAME in remotes))
    logging.debug(' DONE')
    return 0,'ok'
def test_import_entities(tmpdir, collection, test_csv_dir, test_files_dir):
    entity_csv_path = os.path.join(test_csv_dir, 'ddrimport-entity-new.csv')
    out = batch.Importer.import_entities(
        entity_csv_path,
        collection.identifier,
        VOCABS_URL,
        GIT_USER, GIT_MAIL, AGENT
    )
    print(out)
    out_ids = [o.id for o in out]
    assert out_ids == EXPECTED_ENTITY_IDS
    # save and commit
    git_files = []
    for o in out:
        exit, status, updated_files = o.save(
            'pytest', '*****@*****.**', 'pytest',
            collection=collection,
            commit=False
        )
        print(o, status)
        git_files += updated_files
    repo = dvcs.repository(collection.path_abs)
    dvcs.stage(repo, git_files)
    commit = repo.index.commit('test_import_entities')
def check_repository(cidentifier):
    """Load repository, check for staged or modified files

    Entity.add_files will not work properly if the repo contains staged
    or modified files.

    Results dict includes:
    - 'passed': boolean
    - 'repo': GitPython repository
    - 'staged': list of staged files
    - 'modified': list of modified files

    @param cidentifier: Identifier
    @returns: dict
    """
    logging.info('Checking repository')
    passed = False
    repo = dvcs.repository(cidentifier.path_abs())
    logging.info(repo)
    staged = dvcs.list_staged(repo)
    if staged:
        logging.error('*** Staged files in repo %s' % repo.working_dir)
        for f in staged:
            logging.error('*** %s' % f)
    modified = dvcs.list_modified(repo)
    if modified:
        logging.error('Modified files in repo: %s' % repo.working_dir)
        for f in modified:
            logging.error('*** %s' % f)
    if repo and (not (staged or modified)):
        passed = True
        logging.info('ok')
    else:
        logging.error('FAIL')
    return {
        'passed': passed,
        'repo': repo,
        'staged': staged,
        'modified': modified,
    }
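A pre-flight sketch using the results dict, e.g. before a batch import (the Identifier is assumed to come from the caller):

result = check_repository(cidentifier)
if not result['passed']:
    raise RuntimeError(
        'repo %s not clean: %s staged, %s modified' % (
            result['repo'].working_dir,
            len(result['staged']),
            len(result['modified'])))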
def update(user_name, user_mail, collection_path, updated_files, agent=''):
    """Command-line function for committing changes to the specified file.

    NOTE: Does not push to the workbench server.

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param updated_files: List of relative paths to updated file(s).
    @param agent: (optional) Name of software making the change.
    @return: message ('ok' if successful)
    """
    collection = DDRCollection(collection_path)
    repo = dvcs.repository(collection.path, user_name, user_mail)
    if repo:
        logging.debug(' git repo {}'.format(collection.path))
    repo.git.checkout('master')
    if not GIT_REMOTE_NAME in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    # prep log entries
    changelog_messages = []
    for f in updated_files:
        changelog_messages.append('Updated collection file(s) {}'.format(f))
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message('Updated metadata file(s)', agent=agent)
    # write changelog
    write_changelog_entry(collection.changelog_path, changelog_messages, user_name, user_mail)
    if os.path.exists(collection.changelog_path):
        updated_files.append(collection.changelog_path)
    else:
        logging.error(' COULD NOT UPDATE changelog')
    # add files and commit
    repo = commit_files(repo, commit_message, updated_files, [])
    return 0,'ok'
def edit_raw( request, repo, org, cid ):
    """
    """
    git_name = request.session.get('git_name')
    git_mail = request.session.get('git_mail')
    # require both name and mail (parenthesized to avoid the precedence bug
    # in "not git_name and git_mail")
    if not (git_name and git_mail):
        messages.error(request, WEBUI_MESSAGES['LOGIN_REQUIRED'])
    collection_path = Collection.collection_path(request,repo,org,cid)
    repository = dvcs.repository(collection_path)
    filename = ''
    if request.method == 'POST':
        filename = request.POST.get('filename', None)
    elif request.method == 'GET':
        filename = request.GET.get('filename', None)
    filepath = os.path.join(collection_path, filename)
    if request.method == 'POST':
        form = MergeRawForm(request.POST)
        if form.is_valid():
            text = form.cleaned_data['text']
            # TODO validate XML
            with open(filepath, 'w') as f:
                f.write(text)
            # git add file
            dvcs.merge_add(repository, filename)
            return HttpResponseRedirect( reverse('webui-merge', args=[repo,org,cid]) )
    else:
        with open(filepath, 'r') as f:
            text = f.read()
        form = MergeRawForm({'filename': filename, 'text': text,})
    return render_to_response(
        'webui/merge/edit-raw.html',
        {'repo': repo,
         'org': org,
         'cid': cid,
         'filename':filename,
         'form': form,},
        context_instance=RequestContext(request, processors=[])
    )
def edit_json( request, repo, org, cid ):
    """
    """
    collection_path = Collection.collection_path(request,repo,org,cid)
    repository = dvcs.repository(collection_path)
    filename = ''
    if request.method == 'POST':
        filename = request.POST.get('filename', None)
    elif request.method == 'GET':
        filename = request.GET.get('filename', None)
    fields = []
    if filename:
        path = os.path.join(collection_path, filename)
        with open(path, 'r') as f:
            txt = f.read()
        fields = dvcs.conflicting_fields(txt)
    if request.method == 'POST':
        #form = MergeJSONForm(request.POST)
        #if form.is_valid():
        #    text = form.cleaned_data['text']
        #    # TODO validate XML
        #    with open(filepath, 'w') as f:
        #        f.write(text)
        #    # git add file
        #    dvcs.merge_add(repository, filename)
        assert False
    elif request.method == 'GET':
        form = MergeJSONForm(fields=fields)
        return render_to_response(
            'webui/merge/edit-json.html',
            {'filename':filename,
             'fields':fields,
             'form':form,},
            context_instance=RequestContext(request, processors=[])
        )
    return HttpResponseRedirect( reverse('webui-merge', args=[repo,org,cid]) )
def annex_pull(collection, file_path_rel):
    """git-annex copy a file from workbench.

    Example file_paths:
        ddr-densho-1-1/files/video1.mov
        ddr-densho-42-17/files/image35.jpg
        ddr-one-35-248/files/newspaper.pdf

    @param collection: Collection
    @param file_path_rel: Path to file relative to collection root.
    @return: message ('ok' if successful)
    """
    file_path_abs = os.path.join(collection.path, file_path_rel)
    logging.debug(' collection.path {}'.format(collection.path))
    logging.debug(' file_path_rel {}'.format(file_path_rel))
    logging.debug(' file_path_abs {}'.format(file_path_abs))
    if not os.path.exists(collection.path):
        logging.error(' NO COLLECTION AT {}'.format(collection.path))
        return 1, 'no collection'
    if not os.path.exists(collection.annex_path):
        logging.error(' NO GIT ANNEX AT {}'.format(collection.annex_path))
        return 1, 'no annex'
    # let's do this thing
    repo = dvcs.repository(collection.path)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    logging.debug(' git annex copy -f {} {}'.format(config.GIT_REMOTE_NAME, file_path_rel))
    stdout = repo.git.annex('copy', '-f', config.GIT_REMOTE_NAME, file_path_rel)
    logging.debug('\n{}'.format(stdout))
    # confirm that it worked
    exists = os.path.exists(file_path_abs)
    lexists = os.path.lexists(file_path_abs)
    islink = os.path.islink(file_path_abs)
    itworked = (exists and lexists and islink)
    logging.debug(' it worked: {}'.format(itworked))
    logging.debug(' DONE')
    return 0, 'ok'
def sync(user_name, user_mail, collection):
    """Sync repo with bare clone on hub server; replaces git-annex-sync.

    Git-annex has a "sync" command for communicating annex changes between
    repositories, but it is designed to be used between non-bare repositories.
    Normally Git does not support pushing to non-bare repositories, and
    git-annex does some trickery involving "synced/BRANCH" branches to make
    this work.
    Reference: http://git-annex.branchable.com/sync/

    When git-annex-sync is used between a non-bare repo and a bare repo
    (e.g. between a local repo and our hub server running Gitolite), the
    "synced/master" branches do not get merged in to master and syncing
    no longer works.  Therefore it is necessary to sync manually.

    If you think you want to use git-annex-sync, remember that we tried this
    in commit 1857a7aa3f and it did not work and we reverted to manual syncing.

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @return: message ('ok' if successful)
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    logging.debug('repo: %s' % repo)
    drive_label = storage.drive_label(repo.working_dir)
    dvcs.annex_set_description(repo, dvcs.annex_status(repo), drive_label=drive_label)
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # list remotes
    logging.debug('remotes')
    for remote in dvcs.remotes(repo):
        logging.debug('- %s %s' % (remote['name'], remote['target']))
    # sync
    logging.debug('git annex sync')
    out = repo.git.annex('sync')
    logging.debug(out)
    return 0, 'ok'
def stage_files(entity, git_files, annex_files, new_files, log, show_staged=True):
    # TODO move to DDR.dvcs?
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)
    # These vars will be used to determine if stage operation is successful.
    # If called in batch operation there may already be staged files.
    # stage_planned   Files added/modified by this function call
    # stage_already   Files that were already staged
    # stage_predicted List of staged files that should result from this operation.
    # stage_new       Files that are being added.
    stage_planned = git_files + annex_files
    stage_already = dvcs.list_staged(repo)
    stage_predicted = predict_staged(stage_already, stage_planned)
    stage_new = [x for x in stage_planned if x not in stage_already]
    log.ok('| %s files to stage:' % len(stage_planned))
    for sp in stage_planned:
        log.ok('| %s' % sp)
    stage_ok = False
    staged = []
    try:
        log.ok('git stage')
        dvcs.stage(repo, git_files)
        log.ok('annex stage')
        dvcs.annex_stage(repo, annex_files)
        log.ok('ok')
        staged = dvcs.list_staged(repo)
    except:
        # FAILED! print traceback to addfile log
        log.not_ok(traceback.format_exc().strip())
    finally:
        if show_staged:
            log.ok('| %s files staged:' % len(staged))
            log.ok('show_staged %s' % show_staged)
            for sp in staged:
                log.ok('| %s' % sp)
    if len(staged) == len(stage_predicted):
        log.ok('| %s files staged (%s new, %s modified)' % (
            len(staged), len(stage_new), len(stage_already)))
        stage_ok = True
    else:
        log.not_ok('%s new files staged (should be %s)' % (
            len(staged), len(stage_predicted)))
    if not stage_ok:
        log.not_ok('File staging aborted. Cleaning up')
        # try to pick up the pieces
        # mv files back to tmp_dir
        # TODO Properly clean up git-annex-added files.
        # This clause moves the *symlinks* to annex files but leaves
        # the actual binaries in the .git/annex objects dir.
        for tmp, dest in new_files:
            if os.path.islink(dest):
                log.not_ok('| link (not moving) %s' % dest)
            else:
                log.not_ok('| mv %s %s' % (dest, tmp))
                shutil.move(dest, tmp)
        log.not_ok('finished cleanup. good luck...')
        log.crash('Add file aborted, see log file for details: %s' % log.logpath)
    return repo
def sync_group(groupfile, local_base, local_name, remote_base, remote_name):
    """
    """
    logging.debug('reading group file: %s' % groupfile)
    repos = read_group_file(groupfile)
    ACCESS_SUFFIX = ACCESS_FILE_APPEND + ACCESS_FILE_EXTENSION

    def logif(txt):
        t = txt.strip()
        if t:
            logging.debug(t)

    for r in repos:
        repo_path = os.path.join(local_base, r['id'])
        logging.debug('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
        logging.debug('repo_path: %s' % repo_path)
        # clone/update
        if os.path.exists(repo_path):
            logging.debug('updating %s' % repo_path)
            repo = dvcs.repository(repo_path)
            repo.git.fetch('origin')
            repo.git.checkout('master')
            repo.git.pull('origin', 'master')
            repo.git.checkout('git-annex')
            repo.git.pull('origin', 'git-annex')
            repo.git.checkout('master')
            logging.debug('ok')
        else:
            url = '%s:%s.git' % (GITOLITE, r['id'])
            logging.debug('cloning %s' % url)
            repo = git.Repo.clone_from(url, r['id'])
            repo.git.config('annex.sshcaching', 'false')
            logging.debug('ok')

        # add/update remotes
        def add_remote(repo_path, remote_name, remote_path):
            repo = git.Repo(repo_path)
            if remote_name in [rem.name for rem in repo.remotes]:
                logging.debug('remote exists: %s %s' % (remote_name, remote_path))
            else:
                logging.debug(repo_path)
                logging.debug('remote add %s %s' % (remote_name, remote_path))
                if not remote_name in [r.name for r in repo.remotes]:
                    repo.create_remote(remote_name, remote_path)
                logging.debug('ok')

        remote_path = os.path.join(remote_base, r['id'])
        add_remote(repo_path, remote_name, remote_path)  # local -> remote
        add_remote(remote_path, local_name, repo_path)   # remote -> local
        # annex sync
        logging.debug('annex sync')
        response = repo.git.annex('sync')
        logif(response)
        # annex get
        level = r['level']
        logging.debug('level: %s' % level)
        if level == 'access':
            for root, dirs, files in os.walk(repo_path):
                if '.git' in dirs:  # exclude .git dir
                    dirs.remove('.git')
                for f in files:
                    if f.endswith(ACCESS_SUFFIX):
                        path_rel = os.path.join(root, f).replace(repo_path, '')[1:]
                        response = repo.git.annex('get', path_rel)
                        logif(response)
        elif level == 'all':
            logging.debug('git annex get .')
            response = repo.git.annex('get', '.')
            logif(response)
    logging.debug('DONE')
    return 0,'ok'
def entity_annex_add(user_name, user_mail, collection_path, entity_uid,
                     updated_files, new_annex_files, agent='', entity=None):
    """Command-line function for git-annex-adding a file and updating metadata.

    All this function does is git annex add the file, update changelog and
    mets.xml, and commit.
    It does not copy the file into the entity dir.
    It does not mark the file as master/mezzanine/access/etc or edit any metadata.
    It does not perform any background processing on the file.

    TODO Refactor this when ddr-local models are moved into ddr-cmdln.

    WARNING - UGLY HACK!
    The 'entity' arg is intended to allow ddr-local to pass in Entity
    objects and use their checksums() method.

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param updated_files: List of paths to updated files (relative to collection repo).
    @param new_annex_files: List of paths to new files (relative to entity files dir).
    @param agent: (optional) Name of software making the change.
    @param entity: (optional) Entity object (see above).
    @return: message ('ok' if successful)
    """
    collection = DDRCollection(collection_path)
    if not entity:
        entity = DDREntity(collection.entity_path(entity_uid))
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    if GIT_REMOTE_NAME not in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    git_files = []
    annex_files = []
    if not os.path.exists(collection.annex_path):
        logging.error('    .git/annex IS MISSING!')
        return 1,'.git/annex IS MISSING!'
    if not os.path.exists(entity.path):
        logging.error('    Entity does not exist: {}'.format(entity.uid))
        return 1,'entity does not exist: {}'.format(entity.uid)
    if not os.path.exists(entity.files_path):
        logging.error('    Entity files_path does not exist: {}'.format(entity.uid))
        return 1,'entity files_path does not exist: {}'.format(entity.uid)
    # new annex files
    new_files_rel_entity = []
    for new_file in new_annex_files:
        # paths: absolute, relative to collection repo, relative to entity_dir
        new_file_abs = os.path.join(entity.files_path, new_file)
        if not os.path.exists(new_file_abs):
            logging.error('    File does not exist: {}'.format(new_file_abs))
            return 1,'File does not exist: {}'.format(new_file_abs)
        new_file_rel = os.path.join(entity.files_path_rel, new_file)
        new_file_rel_entity = new_file_abs.replace('{}/'.format(entity.path), '')
        new_files_rel_entity.append(new_file_rel_entity)
        annex_files.append(new_file_rel)
    # updated files
    git_files.extend(updated_files)
    # update entity control
    econtrol = entity.control()
    econtrol.update_checksums(entity)
    econtrol.write()
    git_files.append(econtrol.path_rel)
    # prep log entries
    changelog_messages = ['Added entity file {}'.format(f) for f in new_files_rel_entity]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message('Added entity file(s)', agent=agent)
    # update entity changelog
    write_changelog_entry(entity.changelog_path, changelog_messages, user_name, user_mail)
    git_files.append(entity.changelog_path_rel)
    # add files and commit
    repo = commit_files(repo, commit_message, git_files, annex_files)
    return 0,'ok'
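# Hypothetical invocation of entity_annex_add() (repo path, entity ID, and
# filenames below are invented for illustration). Note the path conventions
# the function expects: updated_files are relative to the collection repo,
# new_annex_files to the entity's files dir.
#
#   exit, msg = entity_annex_add(
#       'USER', 'user@example.org',
#       '/var/www/media/ddr/ddr-test-123',
#       'ddr-test-123-45',
#       updated_files=['files/ddr-test-123-45/entity.json'],
#       new_annex_files=['ddr-test-123-45-master-abc123.tif'],
#       agent='ddr-cmdln',
#   )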
def sync_group(groupfile, local_base, local_name, remote_base, remote_name):
    """Clone or update each repo in a group file, add remotes, and git-annex sync.
    """
    logging.debug('reading group file: %s' % groupfile)
    repos = read_group_file(groupfile)
    ACCESS_SUFFIX = config.ACCESS_FILE_APPEND + config.ACCESS_FILE_EXTENSION

    def logif(txt):
        t = txt.strip()
        if t:
            logging.debug(t)

    for r in repos:
        repo_path = os.path.join(local_base, r['id'])
        logging.debug('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
        logging.debug('repo_path: %s' % repo_path)
        # clone/update
        if os.path.exists(repo_path):
            logging.debug('updating %s' % repo_path)
            repo = dvcs.repository(repo_path)
            repo.git.fetch('origin')
            repo.git.checkout('master')
            repo.git.pull('origin', 'master')
            repo.git.checkout('git-annex')
            repo.git.pull('origin', 'git-annex')
            repo.git.checkout('master')
            logging.debug('ok')
        else:
            url = '%s:%s.git' % (config.GITOLITE, r['id'])
            logging.debug('cloning %s' % url)
            # clone into repo_path; cloning into bare r['id'] would land in CWD
            repo = git.Repo.clone_from(url, repo_path)
            repo.git.config('annex.sshcaching', 'false')
            logging.debug('ok')
        remote_path = os.path.join(remote_base, r['id'])
        # local -> remote
        dvcs.remote_add(
            git.Repo(repo_path, search_parent_directories=True),
            remote_path, remote_name)
        # remote -> local
        dvcs.remote_add(
            git.Repo(remote_path, search_parent_directories=True),
            repo_path, local_name)
        # annex sync
        logging.debug('annex sync')
        response = repo.git.annex('sync')
        logif(response)
        # annex get
        level = r['level']
        logging.debug('level: %s' % level)
        if level == 'access':
            for root, dirs, files in os.walk(repo_path):
                if '.git' in dirs:
                    # exclude .git dir
                    dirs.remove('.git')
                for f in files:
                    if f.endswith(ACCESS_SUFFIX):
                        path_rel = os.path.join(root, f).replace(repo_path, '')[1:]
                        response = repo.git.annex('get', path_rel)
                        logif(response)
        elif level == 'all':
            logging.debug('git annex get .')
            response = repo.git.annex('get', '.')
            logif(response)
    logging.debug('DONE')
    return 0,'ok'
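# `dvcs.remote_add` replaces the inline add_remote() helper used by the
# earlier version of sync_group above. A minimal sketch of the expected
# behavior, assuming GitPython's Repo.create_remote API (the real DDR.dvcs
# implementation may differ):

def remote_add_sketch(repo, remote_path, remote_name):
    """Add remote_path to repo under remote_name, unless it already exists.

    @param repo: git.Repo object
    @param remote_path: URL or path of the remote
    @param remote_name: name to register the remote under
    """
    if remote_name not in [rem.name for rem in repo.remotes]:
        repo.create_remote(remote_name, remote_path)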
def entity_create(user_name, user_mail, collection_path, entity_uid,
                  updated_files, templates, agent=''):
    """Command-line function for creating an entity and adding it to the collection.

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param updated_files: List of updated files (relative to collection root).
    @param templates: List of entity metadata templates (absolute paths).
    @param agent: (optional) Name of software making the change.
    @return: message ('ok' if successful)
    """
    collection = DDRCollection(collection_path)
    entity = DDREntity(collection.entity_path(entity_uid))
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    if GIT_REMOTE_NAME not in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    git_files = []
    # entity dir
    if not os.path.exists(entity.path):
        os.makedirs(entity.path)
    # copy template files to entity
    for src in templates:
        if os.path.exists(src):
            dst = os.path.join(entity.path, os.path.basename(src))
            logging.debug('cp %s %s' % (src, dst))
            shutil.copy(src, dst)
            if os.path.exists(dst):
                git_files.append(dst)
            else:
                logging.error('COULD NOT COPY %s' % src)
    # entity control
    econtrol = entity.control()
    if os.path.exists(econtrol.path):
        git_files.append(econtrol.path)
    else:
        logging.error('    COULD NOT CREATE control')
    # update collection control
    ccontrol = collection.control()
    ccontrol.update_checksums(collection)
    ccontrol.write()
    git_files.append(ccontrol.path)
    # prep ENTITY log entries
    entity_changelog_messages = ['Initialized entity {}'.format(entity.uid),]
    if agent:
        entity_changelog_messages.append('@agent: %s' % agent)
    # prep COLLECTION log entries
    changelog_messages = ['Initialized entity {}'.format(entity.uid),]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(changelog_messages[0], agent=agent)
    # ENTITY changelog
    write_changelog_entry(entity.changelog_path, entity_changelog_messages,
                          user=user_name, email=user_mail)
    if os.path.exists(entity.changelog_path):
        git_files.append(entity.changelog_path)
    else:
        logging.error('    COULD NOT CREATE changelog')
    # COLLECTION changelog
    write_changelog_entry(collection.changelog_path, changelog_messages,
                          user=user_name, email=user_mail)
    git_files.append(collection.changelog_path)
    # add updated collection files
    for src in updated_files:
        git_files.append(src)
    # add files and commit
    repo = commit_files(repo, commit_message, git_files, [])
    return 0,'ok'
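# Hypothetical invocation of entity_create() (paths and IDs invented for
# illustration): create a new entity from metadata templates, then record
# the addition in both the entity and collection changelogs.
#
#   exit, msg = entity_create(
#       'USER', 'user@example.org',
#       '/var/www/media/ddr/ddr-test-123',
#       'ddr-test-123-46',
#       updated_files=['collection.json'],
#       templates=['/etc/ddr/templates/entity.json'],
#       agent='ddr-cmdln',
#   )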
def import_files(csv_path, cidentifier, vocabs_path, git_name, git_mail,
                 agent, log_path=None, dryrun=False):
    """Adds or updates files from a CSV file

    TODO how to handle excluded fields like XMP???

    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param log_path: str Absolute path to addfile log for all files
    @param dryrun: boolean
    """
    logging.info('batch import files ----------------------------')

    # TODO hard-coded model name...
    model = 'file'

    csv_dir = os.path.dirname(csv_path)
    logging.debug('csv_dir %s' % csv_dir)

    # TODO this still knows too much about entities and files...
    entity_class = identifier.class_for_name(
        identifier.MODEL_CLASSES['entity']['module'],
        identifier.MODEL_CLASSES['entity']['class']
    )
    logging.debug('entity_class %s' % entity_class)

    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))

    # check for modified or uncommitted files in repo
    repository = dvcs.repository(cidentifier.path_abs())
    logging.debug(repository)

    fidentifiers = {
        rowd['id']: identifier.Identifier(
            id=rowd['id'],
            base_path=cidentifier.basepath
        )
        for rowd in rowds
    }
    fidentifier_parents = {
        fi.id: Importer._fidentifier_parent(fi)
        for fi in fidentifiers.values()
    }
    # eidentifiers, removing duplicates
    eidentifiers = list(set(fidentifier_parents.values()))
    entities = {}
    bad_entities = []
    for eidentifier in eidentifiers:
        if os.path.exists(eidentifier.path_abs()):
            entity = eidentifier.object()
            entities[eidentifier.id] = entity
        else:
            if eidentifier.id not in bad_entities:
                bad_entities.append(eidentifier.id)
    if bad_entities:
        for f in bad_entities:
            logging.error('    %s missing' % f)
        raise Exception(
            '%s entities could not be loaded! - IMPORT CANCELLED!' % len(bad_entities))

    # separate into new and existing lists
    rowds_new = []
    rowds_existing = []
    for n,rowd in enumerate(rowds):
        if Importer._file_is_new(fidentifiers[rowd['id']]):
            rowds_new.append(rowd)
        else:
            rowds_existing.append(rowd)

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Updating existing files')
    start_updates = datetime.now()
    git_files = []
    updated = []
    elapsed_rounds_updates = []
    staged = []
    obj_metadata = None
    for n,rowd in enumerate(rowds_existing):
        logging.info('+ %s/%s - %s (%s)' % (
            n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()

        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        file_ = fidentifier.object()
        modified = file_.load_csv(rowd)
        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                fidentifier.fields_module(),
                repository.working_dir
            )

        if dryrun:
            pass
        elif modified:
            logging.debug('    writing %s' % file_.json_path)
            file_.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(file_.json_path_rel)
            git_files.append(entity.changelog_path_rel)
            updated.append(file_)

        elapsed_round = datetime.now() - start_round
        elapsed_rounds_updates.append(elapsed_round)
        logging.debug('| %s (%s)' % (fidentifier, elapsed_round))

    elapsed_updates = datetime.now() - start_updates
    logging.debug('%s updated in %s' % (len(elapsed_rounds_updates), elapsed_updates))

    if dryrun:
        pass
    elif git_files:
        logging.info('Staging %s modified files' % len(git_files))
        start_stage = datetime.now()
        dvcs.stage(repository, git_files)
        staged = util.natural_sort(dvcs.list_staged(repository))
        for path in staged:
            if path in git_files:
                logging.debug('+ %s' % path)
            else:
                logging.debug('| %s' % path)
        elapsed_stage = datetime.now() - start_stage
        logging.debug('ok (%s)' % elapsed_stage)
        logging.debug('%s staged in %s' % (len(staged), elapsed_stage))

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Adding new files')
    start_adds = datetime.now()
    elapsed_rounds_adds = []
    logging.info('Checking source files')
    for rowd in rowds_new:
        rowd['src_path'] = os.path.join(csv_dir, rowd['basename_orig'])
        logging.debug('| %s' % rowd['src_path'])
        if not os.path.exists(rowd['src_path']):
            raise Exception('Missing file: %s' % rowd['src_path'])
    if log_path:
        logging.info('addfile logging to %s' % log_path)
    for n,rowd in enumerate(rowds_new):
        logging.info('+ %s/%s - %s (%s)' % (
            n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()

        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        logging.debug('| %s' % (entity))

        file_ = None  # avoid NameError in the log line below when dryrun
        if dryrun:
            pass
        elif Importer._file_is_new(fidentifier):
            # ingest
            # TODO make sure this updates entity.files
            file_,repo2,log2 = ingest.add_file(
                entity,
                rowd['src_path'],
                fidentifier.parts['role'],
                rowd,
                git_name, git_mail, agent,
                log_path=log_path,
                show_staged=False
            )

        elapsed_round = datetime.now() - start_round
        elapsed_rounds_adds.append(elapsed_round)
        logging.debug('| %s (%s)' % (file_, elapsed_round))

    elapsed_adds = datetime.now() - start_adds
    logging.debug('%s added in %s' % (len(elapsed_rounds_adds), elapsed_adds))
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')

    return git_files
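# Hypothetical invocation of import_files() (paths invented, and the
# Identifier constructor call is assumed rather than confirmed here). New
# files are handed to ingest.add_file() one at a time; modified files are
# only staged, so the caller is expected to commit the returned git_files.
#
#   cidentifier = identifier.Identifier(path='/var/www/media/ddr/ddr-test-123')
#   git_files = import_files(
#       '/tmp/ddr-test-123-files.csv',
#       cidentifier,
#       '/opt/ddr-vocab',
#       'USER', 'user@example.org', 'ddr-cmdln',
#       log_path='/tmp/addfile.log',
#   )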