def add_file_commit(entity, file_, repo, log, git_name, git_mail, agent):
    """Write a changelog entry for an added file, stage it, and commit.

    The commit only happens when the repository reports staged files and
    no modified files; any other state is logged and an Exception raised.

    @param entity: Entity; its path and changelog paths are read here.
    @param file_: File object; its path_abs is used for the changelog text.
    @param repo: Repository object handed to the dvcs helpers.
    @param log: Logger exposing ok() and not_ok().
    @param git_name: Name written to the changelog entry.
    @param git_mail: Email written to the changelog entry.
    @param agent: Name of software making the change (may be empty).
    @returns: file_,repo,log
    """
    log.ok('add_file_commit(%s, %s, %s, %s, %s, %s)' % (
        file_, repo, log, git_name, git_mail, agent
    ))
    staged = dvcs.list_staged(repo)
    modified = dvcs.list_modified(repo)

    # Refuse to commit unless something is staged and nothing is left modified.
    if modified or not staged:
        log.not_ok('%s files staged, %s files modified' % (len(staged),len(modified)))
        log.not_ok('staged %s' % staged)
        log.not_ok('modified %s' % modified)
        log.not_ok('Can not commit!')
        raise Exception(
            'Could not commit bc %s unstaged files: %s' % (len(modified), modified)
        )

    log.ok('All files staged.')
    log.ok('Updating changelog')
    # File path with the entity directory prefix stripped off
    entity_prefix = '{}/'.format(entity.path)
    path = file_.path_abs.replace(entity_prefix, '')
    messages = ['Added entity file {}'.format(path)]
    if agent:
        messages.append('@agent: %s' % agent)
    changelog.write_changelog_entry(
        entity.changelog_path, messages, git_name, git_mail
    )

    log.ok('git add %s' % entity.changelog_path_rel)
    dvcs.stage(repo, [entity.changelog_path_rel])

    log.ok('Committing')
    commit = dvcs.commit(repo, 'Added entity file(s)', agent)
    log.ok('commit: {}'.format(commit.hexsha))
    committed = dvcs.list_committed(repo, commit)
    committed.sort()
    log.ok('files committed:')
    for committed_path in committed:
        log.ok('| %s' % committed_path)

    return file_,repo,log
def stage_files(entity, git_files, annex_files, new_files, log, show_staged=True):
    """Stage git and annex files; roll back moved files if staging fails.

    Success is judged by comparing the number of files staged after the
    operation with the number predicted by predict_staged().  On failure
    the files in new_files are moved back to their temporary locations
    and log.crash() is called.

    @param entity: Entity; entity.collection_path locates the repository.
    @param git_files: list of paths to stage with git.
    @param annex_files: list of paths to stage with git-annex.
    @param new_files: list of (tmp,dest) path pairs; on failure each dest
        that is not a symlink is moved back to tmp.
    @param log: Logger exposing ok(), not_ok(), crash() and logpath.
    @param show_staged: bool Log each staged file when True.
    @returns: repo
    """
    # Local import so this block does not depend on a file-level import.
    import shutil

    # TODO move to DDR.dvcs?
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)
    # These vars will be used to determine if stage operation is successful.
    # If called in batch operation there may already be staged files.
    # stage_planned   Files added/modified by this function call
    # stage_already   Files that were already staged
    # stage_predicted List of staged files that should result from this operation.
    # stage_new       Files that are being added.
    stage_planned = git_files + annex_files
    stage_already = dvcs.list_staged(repo)
    stage_predicted = predict_staged(stage_already, stage_planned)
    stage_new = [x for x in stage_planned if x not in stage_already]
    log.ok('| %s files to stage:' % len(stage_planned))
    for sp in stage_planned:
        log.ok('| %s' % sp)
    stage_ok = False
    staged = []
    try:
        log.ok('git stage')
        dvcs.stage(repo, git_files)
        log.ok('annex stage')
        dvcs.annex_stage(repo, annex_files)
        log.ok('ok')
        staged = dvcs.list_staged(repo)
    except Exception:
        # FAILED! print traceback to addfile log
        # (KeyboardInterrupt/SystemExit are deliberately not swallowed; the
        # finally clause below still runs the cleanup for them.)
        log.not_ok(traceback.format_exc().strip())
    finally:
        if show_staged:
            log.ok('| %s files staged:' % len(staged))
            log.ok('show_staged %s' % show_staged)
            for sp in staged:
                log.ok('| %s' % sp)
        if len(staged) == len(stage_predicted):
            log.ok('| %s files staged (%s new, %s modified)' % (
                len(staged), len(stage_new), len(stage_already))
            )
            stage_ok = True
        else:
            log.not_ok('%s new files staged (should be %s)' % (
                len(staged), len(stage_predicted))
            )
        if not stage_ok:
            log.not_ok('File staging aborted. Cleaning up')
            # try to pick up the pieces
            # mv files back to tmp_dir
            # TODO Properly clean up git-annex-added files.
            #      This clause moves the *symlinks* to annex files but leaves
            #      the actual binaries in the .git/annex objects dir.
            for tmp,dest in new_files:
                if os.path.islink(dest):
                    log.not_ok('| link (not moving) %s' % dest)
                else:
                    log.not_ok('| mv %s %s' % (dest,tmp))
                    # shutil.move falls back to copy+delete when tmp and
                    # dest are on different filesystems, where os.rename
                    # can fail.
                    shutil.move(dest, tmp)
            log.not_ok('finished cleanup. good luck...')
            log.crash('Add file aborted, see log file for details: %s' % log.logpath)
    return repo
def file_destroy(user_name, user_mail, collection, entity, rm_files, updated_files, agent='', commit=True):
    """Remove files from the repository and record the removal.

    - open the collection repo and check out master
    - git-rm each path in rm_files
    - write an entity changelog entry
    - stage the updated files plus the changelog
    - optionally commit

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param entity: Entity
    @param rm_files: List of paths to files to delete (relative to entity files dir).
    @param updated_files: List of paths to updated file(s), relative to entitys.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: exit,message,touched_files ('ok' if successful)
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)

    # updated file paths are relative to collection root
    git_files = list(updated_files)

    # remove the files
    # NOTE: File must be removed from filesystem at this point
    # so the File will be properly removed from the control file
    for rm_path in rm_files:
        repo.git.rm('-rf', rm_path)

    # update entity changelog
    # dont list access files in changelog
    # TODO use a models.File function to ID the original file
    changelog_messages = []
    for rm_path in rm_files:
        if ('-a.jpg' in rm_path) or ('.json' in rm_path):
            continue
        changelog_messages.append(
            'Deleted file {}'.format(os.path.basename(rm_path))
        )
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    write_changelog_entry(
        entity.changelog_path, changelog_messages, user_name, user_mail
    )
    git_files.append(entity.changelog_path_rel)

    dvcs.stage(repo, git_files)
    if commit:
        dvcs.commit(repo, 'Deleted file(s)', agent)
    return 0, 'ok', git_files
def stage_files(entity, git_files, annex_files, log, show_staged=True):
    """Stage files; check before and after to ensure all files get staged

    Repository status is logged (via repo_status) before and after the
    staging attempt.  If any of the files that were to be staged are still
    listed as modified afterwards, they are logged and log.crash() is
    called.

    @param entity: DDR.models.entities.Entity
    @param git_files: list
    @param annex_files: list
    @param log: AddFileLogger
    @param show_staged: bool (not used by this function; kept so existing
        callers do not break)
    @returns: repo
    """
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)

    # Remove any files in git_files that are in annex_files
    git_files = [path for path in git_files if path not in annex_files]

    log.ok('| BEFORE staging')
    # Called for its logging only; the returned lists are not needed here.
    repo_status(repo, log)

    stage_these = sorted(set(git_files + annex_files))
    log.ok('| staging %s files:' % len(stage_these))
    for path in stage_these:
        log.ok('| %s' % path)

    try:
        log.ok('| annex stage')
        # Stage annex files (binaries) before non-binary git files
        # else binaries might end up in .git/objects/ which would be NOT GOOD
        dvcs.annex_stage(repo, annex_files)
        log.ok('| git stage')
        # If git_files contains binaries they are already staged by now.
        dvcs.stage(repo, git_files)
        log.ok('| ok')
    except Exception:
        # FAILED! print traceback to addfile log
        # Only ordinary errors are swallowed; KeyboardInterrupt and
        # SystemExit propagate to the caller.
        log.not_ok(traceback.format_exc().strip())

    log.ok('| AFTER staging')
    _staged_after, modified_after, _untracked_after = repo_status(repo, log)

    # Crash if not staged
    still_modified = [path for path in stage_these if path in modified_after]
    if still_modified:
        log.not_ok('These files are still modified')
        for path in still_modified:
            log.not_ok('| %s' % path)
        log.crash('Add file aborted, see log file for details: %s' % log.logpath)

    return repo
def entity_destroy(user_name, user_mail, entity, updated_files, agent='', commit=True):
    """Remove an entity from its collection repository.

    - instantiate collection, parent and repo objects
    - check out master and add the remote
    - git-rm the entity directory
    - write an entry to the parent's changelog
    - stage updated files plus the changelog
    - optionally commit

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param entity: Entity
    @param updated_files: List of paths to updated file(s), relative to entitys.
        The list is not modified by this function.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: exit,message (0,'ok' if successful)
    """
    collection = entity.collection()
    parent = entity.identifier.parent().object()
    repo = dvcs.repository(collection.path_abs, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # Work on a copy: appending the changelog path below must not alter
    # the list object owned by the caller.
    git_files = list(updated_files)

    # remove entity directory
    # NOTE: entity files must be removed at this point so the entity will be
    # properly removed from the control file
    repo.git.rm('-rf', entity.path_abs)

    # prep collection log entries
    changelog_messages = ['Deleted entity {}'.format(entity.id)]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(changelog_messages[0], agent=agent)

    # collection changelog
    write_changelog_entry(
        parent.changelog_path, changelog_messages,
        user=user_name, email=user_mail
    )
    git_files.append(parent.changelog_path)
    dvcs.stage(repo, git_files)

    # commit
    if commit:
        repo = commit_files(repo, commit_message, git_files)
    return 0, 'ok'
def entity_destroy(user_name, user_mail, entity, updated_files, agent='', commit=True):
    """Remove an entity from its collection repository.

    - instantiate collection, parent and repo objects
    - check out master and add the remote
    - git-rm the entity directory
    - write an entry to the parent's changelog
    - stage updated files plus the changelog
    - optionally commit

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param entity: Entity
    @param updated_files: List of paths to updated file(s), relative to entitys.
        The list is not modified by this function.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: exit,message (0,'ok' if successful)
    """
    collection = entity.collection()
    parent = entity.identifier.parent().object()
    repo = dvcs.repository(collection.path_abs, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # Work on a copy: appending the changelog path below must not alter
    # the list object owned by the caller.
    git_files = list(updated_files)

    # remove entity directory
    # NOTE: entity files must be removed at this point so the entity will be
    # properly removed from the control file
    repo.git.rm('-rf', entity.path_abs)

    # prep collection log entries
    changelog_messages = ['Deleted entity {}'.format(entity.id)]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(changelog_messages[0], agent=agent)

    # collection changelog
    write_changelog_entry(
        parent.changelog_path, changelog_messages,
        user=user_name, email=user_mail
    )
    git_files.append(parent.changelog_path)
    dvcs.stage(repo, git_files)

    # commit
    if commit:
        repo = commit_files(repo, commit_message, git_files)
    return 0, 'ok'
def test_import_entities(tmpdir, collection, test_csv_dir, test_files_dir):
    """Import entities from the new-entity CSV, then save, stage and commit."""
    entity_csv_path = os.path.join(test_csv_dir, 'ddrimport-entity-new.csv')
    out = batch.Importer.import_entities(
        entity_csv_path,
        collection.identifier,
        VOCABS_URL,
        GIT_USER,
        GIT_MAIL,
        AGENT
    )
    print(out)
    out_ids = [imported.id for imported in out]
    assert out_ids == EXPECTED_ENTITY_IDS

    # save and commit
    git_files = []
    for imported in out:
        _exit_code, status, updated_files = imported.save(
            'pytest', '*****@*****.**', 'pytest',
            collection=collection,
            commit=False
        )
        print(imported, status)
        git_files.extend(updated_files)

    repo = dvcs.repository(collection.path_abs)
    dvcs.stage(repo, git_files)
    repo.index.commit('test_import_entities')
def import_files(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, log_path=None, dryrun=False):
    """Adds or updates files from a CSV file

    TODO how to handle excluded fields like XMP???

    Rows are split into files that already exist and files that are new.
    Existing files have their JSON rewritten (if the CSV changed them) and
    are staged.  New files are ingested one at a time with ingest.add_file.
    With dryrun=True nothing is written, staged or ingested.

    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir (not used by this function)
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param log_path: str Absolute path to addfile log for all files
    @param dryrun: boolean
    @returns: list of relative paths staged for updated (existing) files
    @raises Exception: if a parent entity directory or a source file for a
        new row is missing.
    """
    logging.info('batch import files ----------------------------')

    csv_dir = os.path.dirname(csv_path)
    logging.debug('csv_dir %s' % csv_dir)

    # TODO this still knows too much about entities and files...
    entity_class = identifier.class_for_name(
        identifier.MODEL_CLASSES['entity']['module'],
        identifier.MODEL_CLASSES['entity']['class']
    )
    logging.debug('entity_class %s' % entity_class)

    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))

    # check for modified or uncommitted files in repo
    repository = dvcs.repository(cidentifier.path_abs())
    logging.debug(repository)

    fidentifiers = {
        rowd['id']: identifier.Identifier(
            id=rowd['id'],
            base_path=cidentifier.basepath
        )
        for rowd in rowds
    }
    # .values() rather than .itervalues() so this runs on Python 2 and 3
    fidentifier_parents = {
        fi.id: Importer._fidentifier_parent(fi)
        for fi in fidentifiers.values()
    }
    # eidentifiers, removing duplicates
    eidentifiers = list(set(fidentifier_parents.values()))
    entities = {}
    bad_entities = []
    for eidentifier in eidentifiers:
        if os.path.exists(eidentifier.path_abs()):
            entity = eidentifier.object()
            entities[eidentifier.id] = entity
        else:
            if eidentifier.id not in bad_entities:
                bad_entities.append(eidentifier.id)
    if bad_entities:
        for f in bad_entities:
            logging.error(' %s missing' % f)
        raise Exception(
            '%s entities could not be loaded! - IMPORT CANCELLED!'
            % len(bad_entities)
        )

    # separate into new and existing lists
    rowds_new = []
    rowds_existing = []
    for rowd in rowds:
        if Importer._file_is_new(fidentifiers[rowd['id']]):
            rowds_new.append(rowd)
        else:
            rowds_existing.append(rowd)

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Updating existing files')
    start_updates = datetime.now()
    git_files = []
    elapsed_rounds_updates = []
    staged = []
    obj_metadata = None
    for n,rowd in enumerate(rowds_existing):
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()

        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        file_ = fidentifier.object()
        modified = file_.load_csv(rowd)
        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                fidentifier.fields_module(),
                repository.working_dir
            )

        if dryrun:
            pass
        elif modified:
            logging.debug(' writing %s' % file_.json_path)
            file_.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(file_.json_path_rel)
            git_files.append(entity.changelog_path_rel)

        elapsed_round = datetime.now() - start_round
        elapsed_rounds_updates.append(elapsed_round)
        logging.debug('| %s (%s)' % (fidentifier, elapsed_round))

    elapsed_updates = datetime.now() - start_updates
    logging.debug('%s updated in %s' % (len(elapsed_rounds_updates), elapsed_updates))

    if dryrun:
        pass
    elif git_files:
        logging.info('Staging %s modified files' % len(git_files))
        start_stage = datetime.now()
        dvcs.stage(repository, git_files)
        staged = util.natural_sort(dvcs.list_staged(repository))
        for path in staged:
            # '+' marks files staged by this run, '|' files staged earlier
            if path in git_files:
                logging.debug('+ %s' % path)
            else:
                logging.debug('| %s' % path)
        elapsed_stage = datetime.now() - start_stage
        logging.debug('ok (%s)' % elapsed_stage)
        logging.debug('%s staged in %s' % (len(staged), elapsed_stage))

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Adding new files')
    start_adds = datetime.now()
    elapsed_rounds_adds = []
    logging.info('Checking source files')
    # Verify every source file up front so nothing is ingested if one is missing
    for rowd in rowds_new:
        rowd['src_path'] = os.path.join(csv_dir, rowd['basename_orig'])
        logging.debug('| %s' % rowd['src_path'])
        if not os.path.exists(rowd['src_path']):
            raise Exception('Missing file: %s' % rowd['src_path'])
    if log_path:
        logging.info('addfile logging to %s' % log_path)
    for n,rowd in enumerate(rowds_new):
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()

        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        logging.debug('| %s' % (entity))

        # Reset each round: on a dry run (or if the file turns out not to be
        # new) add_file is skipped, and the log line below must not hit an
        # unbound name or report a file left over from an earlier round.
        file_ = None
        if dryrun:
            pass
        elif Importer._file_is_new(fidentifier):
            # ingest
            # TODO make sure this updates entity.files
            file_,repo2,log2 = ingest.add_file(
                entity,
                rowd['src_path'],
                fidentifier.parts['role'],
                rowd,
                git_name, git_mail, agent,
                log_path=log_path,
                show_staged=False
            )

        elapsed_round = datetime.now() - start_round
        elapsed_rounds_adds.append(elapsed_round)
        logging.debug('| %s (%s)' % (file_, elapsed_round))

    elapsed_adds = datetime.now() - start_adds
    logging.debug('%s added in %s' % (len(elapsed_rounds_adds), elapsed_adds))
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')

    return git_files
def import_entities(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, dryrun=False):
    """Adds or updates entities from a CSV file

    Running function multiple times with the same CSV file is idempotent.
    After the initial pass, files will only be modified if the CSV data
    has been updated.

    This function writes and stages files but does not commit them!
    That is left to the user or to another function.

    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param dryrun: boolean
    @returns: list of updated entities
    """
    divider = '- - - - - - - - - - - - - - - - - - - - - - - -'
    logging.info('------------------------------------------------------------------------')
    logging.info('batch import entity')

    repository = dvcs.repository(cidentifier.path_abs())
    logging.info(repository)

    logging.info('Reading %s' % csv_path)
    _headers, rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    total = len(rowds)
    logging.info('%s rows' % total)

    logging.info(divider)
    logging.info('Importing')
    started = datetime.now()
    git_files = []
    updated = []
    elapsed_rounds = []
    obj_metadata = None

    if dryrun:
        logging.info('Dry run - no modifications')
    for position, rowd in enumerate(rowds, 1):
        logging.info('%s/%s - %s' % (position, total, rowd['id']))
        round_started = datetime.now()

        eidentifier = identifier.Identifier(
            id=rowd['id'], base_path=cidentifier.basepath
        )
        # if there is an existing object it will be loaded
        entity = eidentifier.object()
        if not entity:
            entity = models.Entity.create(eidentifier.path_abs(), eidentifier)
        modified = entity.load_csv(rowd)

        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                eidentifier.fields_module(),
                repository.working_dir
            )

        if modified and not dryrun:
            # write files
            if not os.path.exists(entity.path_abs):
                os.makedirs(entity.path_abs)
            logging.debug(' writing %s' % entity.json_path)
            entity.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            # TODO write all additions to changelog at one time
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(entity.json_path_rel)
            git_files.append(entity.changelog_path_rel)
            updated.append(entity)

        round_elapsed = datetime.now() - round_started
        elapsed_rounds.append(round_elapsed)
        logging.debug('| %s (%s)' % (eidentifier, round_elapsed))

    if dryrun:
        logging.info('Dry run - no modifications')
    elif updated:
        logging.info('Staging %s modified files' % len(git_files))
        stage_started = datetime.now()
        dvcs.stage(repository, git_files)
        for path in util.natural_sort(dvcs.list_staged(repository)):
            # '+' marks files staged by this run, '|' files staged earlier
            marker = '+ %s' if path in git_files else '| %s'
            logging.debug(marker % path)
        stage_elapsed = datetime.now() - stage_started
        logging.debug('ok (%s)' % stage_elapsed)

    total_elapsed = datetime.now() - started
    logging.debug('%s updated in %s' % (len(elapsed_rounds), total_elapsed))
    logging.info(divider)

    return updated
def file_destroy(user_name, user_mail, collection, entity, rm_files, updated_files, agent='', commit=True):
    """Remove files from the repository, then update control and changelog.

    - open the collection repo and check out master
    - git-rm each path in rm_files
    - refresh checksums in the entity control file and write it
    - write an entity changelog entry
    - stage updated files, control file and changelog
    - optionally commit

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param entity: Entity
    @param rm_files: List of paths to files to delete (relative to entity files dir).
    @param updated_files: List of paths to updated file(s), relative to entitys.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: exit,message,touched_files ('ok' if successful)
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)

    # updated file paths are relative to collection root
    git_files = []
    for updated_path in updated_files:
        git_files.append(os.path.join('files', entity.id, updated_path))

    # remove the files
    # NOTE: File must be removed from filesystem at this point
    # so the File will be properly removed from the control file
    for rm_path in rm_files:
        repo.git.rm('-rf', rm_path)

    # update entity control
    econtrol = entity.control()
    econtrol.update_checksums(entity)
    econtrol.write()
    git_files.append(econtrol.path_rel)

    # update entity changelog
    # dont list access files in changelog
    # TODO use a models.File function to ID the original file
    changelog_messages = []
    for rm_path in rm_files:
        if ('-a.jpg' in rm_path) or ('.json' in rm_path):
            continue
        changelog_messages.append(
            'Deleted file {}'.format(os.path.basename(rm_path))
        )
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    write_changelog_entry(
        entity.changelog_path, changelog_messages, user_name, user_mail
    )
    git_files.append(entity.changelog_path_rel)

    dvcs.stage(repo, git_files)
    if commit:
        dvcs.commit(repo, 'Deleted file(s)', agent)
    return 0, 'ok', git_files
def stage_files(entity, git_files, annex_files, new_files, log, show_staged=True):
    """Stage git and annex files; move new files back if staging fails.

    The number of files staged afterwards is compared with the number
    predicted by predict_staged(); a mismatch triggers cleanup of
    new_files followed by log.crash().

    @param entity: Entity; entity.collection_path locates the repository.
    @param git_files: list of paths to stage with git.
    @param annex_files: list of paths to stage with git-annex.
    @param new_files: list of (tmp,dest) path pairs used for cleanup.
    @param log: Logger exposing ok(), not_ok(), crash() and logpath.
    @param show_staged: bool Log each staged file when True.
    @returns: repo
    """
    # TODO move to DDR.dvcs?
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)

    # These lists decide whether the stage operation succeeded.
    # If called in batch operation there may already be staged files.
    # planned    Files added/modified by this function call
    # already    Files that were already staged
    # predicted  List of staged files that should result from this operation.
    # fresh      Files that are being added.
    planned = git_files + annex_files
    already = dvcs.list_staged(repo)
    predicted = predict_staged(already, planned)
    fresh = [path for path in planned if path not in already]

    log.ok('| %s files to stage:' % len(planned))
    for path in planned:
        log.ok('| %s' % path)

    staged = []
    try:
        log.ok('git stage')
        dvcs.stage(repo, git_files)
        log.ok('annex stage')
        dvcs.annex_stage(repo, annex_files)
        log.ok('ok')
        staged = dvcs.list_staged(repo)
    except:
        # FAILED! print traceback to addfile log
        log.not_ok(traceback.format_exc().strip())
    finally:
        if show_staged:
            log.ok('| %s files staged:' % len(staged))
            log.ok('show_staged %s' % show_staged)
            for path in staged:
                log.ok('| %s' % path)

        stage_ok = len(staged) == len(predicted)
        if stage_ok:
            log.ok('| %s files staged (%s new, %s modified)' % (len(staged), len(fresh), len(already)))
        else:
            log.not_ok('%s new files staged (should be %s)' % (len(staged), len(predicted)))
            log.not_ok('File staging aborted. Cleaning up')
            # try to pick up the pieces
            # mv files back to tmp_dir
            # TODO Properly clean up git-annex-added files.
            #      This clause moves the *symlinks* to annex files but leaves
            #      the actual binaries in the .git/annex objects dir.
            for tmp, dest in new_files:
                if not os.path.islink(dest):
                    log.not_ok('| mv %s %s' % (dest, tmp))
                    shutil.move(dest, tmp)
                else:
                    log.not_ok('| link (not moving) %s' % dest)
            log.not_ok('finished cleanup. good luck...')
            log.crash('Add file aborted, see log file for details: %s' % log.logpath)
    return repo