def test_files_import_internal_nohashes(tmpdir, collection, test_csv_dir,
                                        test_files_dir):
    """Import files from a CSV whose hash columns have been removed.

    The CSV is described as lacking the sha1/sha256/md5/size columns.
    After the import is committed, the test runs check_file_hashes on the
    collection and asserts that no binaries ended up in .git/objects and
    that no annex binaries are missing.
    """
    csv_name = 'ddrimport-files-import-internal-nohashes.csv'
    log_name = 'ddrimport-files-import-internal-nohashes.log'
    file_csv_path = os.path.join(test_csv_dir, csv_name)
    # Rewrite the CSV in place using the test files directory.
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(test_files_dir, log_name)

    # Run the batch import; the return value is not inspected.
    batch.Importer.import_files(
        file_csv_path, collection.identifier,
        VOCABS_URL, GIT_USER, GIT_MAIL, AGENT,
        log_path=log_path, tmp_dir=test_files_dir,
    )

    # Commit whatever the import staged.
    repo = dvcs.repository(collection.path_abs)
    repo.index.commit('test_files_import_internal_nohashes')

    # Hashes should have been filled in by the import.
    check_file_hashes(collection.path_abs)

    print('log_path %s' % log_path)
    # Binaries belong in the annex, never in .git/objects.
    binaries_in_git = find_binaries_in_git_objects(repo)
    assert not binaries_in_git
    missing_binaries = find_missing_annex_binaries(repo)
    assert not missing_binaries
def test_files_import_external(tmpdir, collection, test_csv_dir,
                               test_files_dir):
    """Import *external* files from a CSV and verify the resulting repo.

    After the import is committed, the test runs check_file_hashes on the
    collection and asserts that no binaries ended up in .git/objects and
    that no annex binaries are missing.
    """
    print('collection_path %s' % collection.path_abs)
    csv_name = 'ddrimport-files-import-external.csv'
    log_name = 'ddrimport-files-import-external.log'
    file_csv_path = os.path.join(test_csv_dir, csv_name)
    # Rewrite the CSV in place using the test files directory.
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(test_files_dir, log_name)

    # Run the batch import; the return value is not inspected.
    batch.Importer.import_files(
        file_csv_path, collection.identifier,
        VOCABS_URL, GIT_USER, GIT_MAIL, AGENT,
        log_path=log_path, tmp_dir=test_files_dir,
    )

    # Commit whatever the import staged.
    repo = dvcs.repository(collection.path_abs)
    new_commit = repo.index.commit('test_files_import_external')
    print('commit %s' % new_commit)

    # Hashes should be present after the import.
    check_file_hashes(collection.path_abs)

    print('log_path %s' % log_path)
    # Binaries belong in the annex, never in .git/objects.
    binaries_in_git = find_binaries_in_git_objects(repo)
    assert not binaries_in_git
    missing_binaries = find_missing_annex_binaries(repo)
    assert not missing_binaries
Example #3
0
    def missing_annex_files(self):
        """List annex entries whose binaries are missing.

        Each item yielded by dvcs.annex_missing_files for this object's
        repository is augmented in place with two keys:
        - 'hash': the item's 'keyname' with its extension removed
        - 'id':   the basename of the item's 'file' with its extension
                  and any trailing config.ACCESS_FILE_APPEND removed

        @returns: list of the augmented items
        """
        def just_id(oid):
            # Some "file IDs" carry config.ACCESS_FILE_APPEND as a suffix.
            # Strip it only when it sits at the very end of the ID, so that
            # an identical substring elsewhere in the ID is left alone.
            # example: ddr-test-123-456-master-abc123-a
            #                                 ^^
            marker = config.ACCESS_FILE_APPEND
            position = oid.rfind(marker)
            if position <= 0:
                return oid
            tail = oid[position:]
            if tail and len(tail) == len(marker):
                return oid[:position]
            return oid

        repo = dvcs.repository(self.path)
        missing = []
        for item in dvcs.annex_missing_files(repo):
            item['hash'] = os.path.splitext(item['keyname'])[0]
            filename = os.path.basename(item['file'])
            item['id'] = just_id(os.path.splitext(filename)[0])
            missing.append(item)
        return missing
Example #4
0
def clean(collection, remove):
    """Reset staged files and revert modified files in a collection repo.

    Prints a message and exits with status 1 if the collection path does
    not exist or is not a directory.

    @param collection: Path to the collection repository (may be relative)
    @param remove: If truthy, untracked files are removed as well
    """
    start = datetime.now()

    # ensure we have absolute paths (CWD+relpath)
    collection_path = os.path.abspath(os.path.normpath(collection))
    # Check args.  Existence must be tested first: os.path.isdir() is also
    # False for a missing path, which previously made the "does not exist"
    # message unreachable.
    if not os.path.exists(collection_path):
        print('ddrimport: Collection does not exist.')
        sys.exit(1)
    if not os.path.isdir(collection_path):
        print('ddrimport: collection path must be a directory.')
        sys.exit(1)

    repo = dvcs.repository(collection)
    logging.debug('Resetting staged files')
    dvcs.reset(repo)
    logging.debug('Reverting modified files')
    dvcs.revert(repo)
    if remove:
        logging.debug('Removing untracked files')
        dvcs.remove_untracked(repo)
    repo_state = dvcs.repo_status(repo)
    logging.debug('status\n%s', repo_state)

    elapsed = datetime.now() - start
    logging.info('DONE - %s elapsed', elapsed)
Example #5
0
def ddrinfo(collection, json):
    """ddrinfo - Prints info about a repository.
    
    \b
    Example:
        ddr-info /PATH/TO/REPO
    """
    # NOTE(review): the docstring is left untouched because it looks like
    # CLI help text (the \b marker) - confirm before rewording it.
    began = datetime.now()

    repository = dvcs.repository(collection)

    # Collect file info first, then layer annex info on top of it.
    info = {}
    info.update(file_info(repository))
    info.update(annex_info(repository))

    # `json` is the output-format flag here, not the json module.
    if not json:
        output(info)
    else:
        print(simplejson.dumps(info))

    # Elapsed time is computed but not reported.
    ended = datetime.now()
    elapsed = ended - began
Example #6
0
def object_metadata(module, repo_path):
    """Return metadata about the application and model definitions in use.

    The values are stored in config.APP_METADATA.  They are only computed
    when that dict is empty; otherwise the existing dict is returned as-is.

    @param module: collection, entity, files model definitions module
    @param repo_path: Absolute path to root of object's repo
    @returns: dict (config.APP_METADATA)
    """
    if config.APP_METADATA:
        # Already populated by an earlier call.
        return config.APP_METADATA

    metadata = config.APP_METADATA
    repo = dvcs.repository(repo_path)
    versions = [dvcs.git_version(repo), dvcs.annex_version(repo)]
    metadata['git_version'] = '; '.join(versions)

    # ddr-cmdln
    metadata['application'] = 'https://github.com/densho/ddr-cmdln.git'
    metadata['app_path'] = config.INSTALL_PATH
    metadata['app_commit'] = dvcs.latest_commit(config.INSTALL_PATH)
    metadata['app_release'] = VERSION

    # ddr-defs
    metadata['defs_path'] = modules.Module(module).path
    defs_path = modules.Module(module).path
    metadata['defs_commit'] = dvcs.latest_commit(defs_path)
    return config.APP_METADATA
Example #7
0
def ddrinfo(collection, json):
    """ddrinfo - Prints info about a repository.
    
    \b
    Example:
        ddr-info /PATH/TO/REPO

    @param collection: Path to the repository, passed to dvcs.repository
    @param json: If truthy, print the data as JSON instead of via output()
    """
    # The `json` flag shadows the json module inside this function, so
    # calling json.dumps() on it raised AttributeError.  Import the module
    # under a different local name instead.
    import json as json_module

    repo = dvcs.repository(collection)

    # Collect file info first, then layer annex info on top of it.
    data = {}
    data.update(file_info(repo))
    data.update(annex_info(repo))

    if json:
        print(json_module.dumps(data))
    else:
        output(data)
Example #8
0
def status(collection, short=False):
    """Command-line function for running git status on collection repository.

    @param collection: Collection
    @param short: Forwarded to dvcs.repo_status as its `short` argument.
        Previously this parameter was accepted but silently ignored.
    @return: whatever dvcs.repo_status returns for the collection's repo
    """
    repo = dvcs.repository(collection.path)
    return dvcs.repo_status(repo, short=short)
Example #9
0
def clean(collection, remove):
    """Reset staged files and revert modified files in a collection repo.

    Prints a message and exits with status 1 if the collection path does
    not exist or is not a directory.

    @param collection: Path to the collection repository (may be relative)
    @param remove: If truthy, untracked files are removed as well
    """
    start = datetime.now()

    # ensure we have absolute paths (CWD+relpath)
    collection_path = os.path.abspath(os.path.normpath(collection))
    # Check args.  Existence must be tested first: os.path.isdir() is also
    # False for a missing path, which previously made the "does not exist"
    # message unreachable.
    if not os.path.exists(collection_path):
        print('ddrimport: Collection does not exist.')
        sys.exit(1)
    if not os.path.isdir(collection_path):
        print('ddrimport: collection path must be a directory.')
        sys.exit(1)

    repo = dvcs.repository(collection)
    logging.debug('Resetting staged files')
    dvcs.reset(repo)
    logging.debug('Reverting modified files')
    dvcs.revert(repo)
    if remove:
        logging.debug('Removing untracked files')
        dvcs.remove_untracked(repo)
    repo_state = dvcs.repo_status(repo)
    logging.debug('status\n%s', repo_state)

    elapsed = datetime.now() - start
    logging.info('DONE - %s elapsed', elapsed)
Example #10
0
def fetch(collection):
    """Run dvcs.fetch on the collection's repository.

    @param collection: Collection
    @return: whatever dvcs.fetch returns for the collection's repo
    """
    repo = dvcs.repository(collection.path)
    result = dvcs.fetch(repo)
    return result
Example #11
0
def status(collection, short=False):
    """Command-line function for running git status on collection repository.

    @param collection: Collection
    @param short: Forwarded to dvcs.repo_status as its `short` argument.
        Previously this parameter was accepted but silently ignored.
    @return: whatever dvcs.repo_status returns for the collection's repo
    """
    repo = dvcs.repository(collection.path)
    return dvcs.repo_status(repo, short=short)
Example #12
0
def annex_status(collection):
    """Run dvcs.annex_status on the collection's repository.

    @param collection: Collection
    @return: whatever dvcs.annex_status returns for the collection's repo
    """
    repo = dvcs.repository(collection.path)
    result = dvcs.annex_status(repo)
    return result
Example #13
0
 def cmp_model_definition_commits(self, document_commit, module_commit):
     """Indicate document's model defs are newer or older than module's.

     Opens the repository at self.path and hands it, together with the
     two commits, to DDR.dvcs.cmp_commits.  See that function for how to
     interpret the results.
     Note: if a document has no defs commit it is considered older
     than the module.
     NOTE: commit may not be found in log if definitions were on a
     branch at the time the document was committed.

     @param document_commit: Commit recorded for the document's defs.
     @param module_commit: Commit of the module's defs.
     @returns: dict See DDR.dvcs.cmp_commits
     """
     try:
         repo = dvcs.repository(self.path)
     except dvcs.git.InvalidGitRepositoryError:
         # GitPython doesn't understand git worktrees, so fall back to
         # an "empty" result dict (see dvcs.cmp_commits).
         return {'a':'', 'b':'', 'op':'--'}
     else:
         return dvcs.cmp_commits(repo, document_commit, module_commit)
Example #14
0
def object_metadata(module, repo_path):
    """Return metadata about the application and model definitions in use.

    The values are stored in config.APP_METADATA.  They are only computed
    when that dict is empty; otherwise the existing dict is returned as-is.

    @param module: collection, entity, files model definitions module
    @param repo_path: Absolute path to root of object's repo
    @returns: dict (config.APP_METADATA)
    """
    if config.APP_METADATA:
        # Already populated by an earlier call.
        return config.APP_METADATA

    metadata = config.APP_METADATA
    repo = dvcs.repository(repo_path)
    versions = [dvcs.git_version(repo), dvcs.annex_version(repo)]
    metadata['git_version'] = '; '.join(versions)

    # ddr-cmdln
    metadata['application'] = 'https://github.com/densho/ddr-cmdln.git'
    metadata['app_path'] = config.INSTALL_PATH
    metadata['app_commit'] = dvcs.latest_commit(config.INSTALL_PATH)
    metadata['app_release'] = VERSION

    # ddr-defs
    metadata['defs_path'] = modules.Module(module).path
    defs_path = modules.Module(module).path
    metadata['defs_commit'] = dvcs.latest_commit(defs_path)
    return config.APP_METADATA
Example #15
0
def stage_files(entity, git_files, annex_files, new_files, log, show_staged=True):
    """Stage git and annex files, then check the number of staged files.

    If the number of staged files differs from the predicted number, files
    listed in new_files are moved back to their temporary locations
    (symlinks are left in place) and log.crash is called.

    @param entity: object with a collection_path attribute
    @param git_files: list of paths to stage with dvcs.stage
    @param annex_files: list of paths to stage with dvcs.annex_stage
    @param new_files: iterable of (tmp, dest) path pairs used for cleanup
    @param log: logger providing ok/not_ok/crash and a logpath attribute
    @param show_staged: if truthy, log the list of staged files
    @returns: repo
    """
    # TODO move to DDR.dvcs?
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)
    # Bookkeeping used to decide whether staging succeeded.  In a batch
    # operation there may already be staged files.
    #   planned    files added/modified by this call
    #   already    files that were staged before this call
    #   predicted  staged files that should result from this operation
    #   fresh      planned files that were not already staged
    planned = git_files + annex_files
    already = dvcs.list_staged(repo)
    predicted = predict_staged(already, planned)
    fresh = [path for path in planned if path not in already]
    log.ok('| %s files to stage:' % len(planned))
    for path in planned:
        log.ok('|   %s' % path)
    staged = []
    try:
        log.ok('git stage')
        dvcs.stage(repo, git_files)
        log.ok('annex stage')
        dvcs.annex_stage(repo, annex_files)
        log.ok('ok')
        staged = dvcs.list_staged(repo)
    except:
        # FAILED! print traceback to addfile log
        log.not_ok(traceback.format_exc().strip())
    finally:
        if show_staged:
            log.ok('| %s files staged:' % len(staged))
            log.ok('show_staged %s' % show_staged)
            for path in staged:
                log.ok('|   %s' % path)
        if len(staged) == len(predicted):
            log.ok('| %s files staged (%s new, %s modified)' % (
                len(staged), len(fresh), len(already))
            )
        else:
            log.not_ok('%s new files staged (should be %s)' % (
                len(staged), len(predicted))
            )
            log.not_ok('File staging aborted. Cleaning up')
            # Try to pick up the pieces: mv files back to tmp_dir.
            # TODO Properly clean up git-annex-added files.
            #      This clause moves the *symlinks* to annex files but leaves
            #      the actual binaries in the .git/annex objects dir.
            for tmp, dest in new_files:
                if not os.path.islink(dest):
                    log.not_ok('| mv %s %s' % (dest,tmp))
                    os.rename(dest, tmp)
                else:
                    log.not_ok('| link (not moving) %s' % dest)
            log.not_ok('finished cleanup. good luck...')
            log.crash('Add file aborted, see log file for details: %s' % log.logpath)
    return repo
Example #16
0
def annex_status(collection):
    """Run dvcs.annex_status on the collection's repository.

    @param collection: Collection
    @return: whatever dvcs.annex_status returns for the collection's repo
    """
    repo = dvcs.repository(collection.path)
    result = dvcs.annex_status(repo)
    return result
Example #17
0
def fetch(collection):
    """Run dvcs.fetch on the collection's repository.

    @param collection: Collection
    @return: whatever dvcs.fetch returns for the collection's repo
    """
    repo = dvcs.repository(collection.path)
    result = dvcs.fetch(repo)
    return result
Example #18
0
def test_repository(tmpdir):
    """Check the config values present after calling dvcs.repository.

    Covers: git_set_configs, annex_set_configs, repository.
    """
    repo_dir = str(tmpdir / 'ddr-test')
    repo = git.Repo.init(repo_dir)
    dvcs.repository(path=repo_dir, user_name=USER_NAME, user_mail=USER_MAIL)
    reader = repo.config_reader()
    reader.sections()

    def section(name):
        # Collapse a config section's items into a {key: value} dict.
        return {pair[0]: pair[1] for pair in reader.items(name)}

    core = section('core')
    user = section('user')
    annex = section('annex')
    assert core.get('fileMode') == 'false'
    assert user.get('name') == USER_NAME
    assert user.get('email') == USER_MAIL
    assert annex.get('sshcaching') == 'false'
Example #19
0
 def set_repo_description(self):
     """Set COLLECTION/.git/description based on self.title

     Does nothing unless self.title is set, self.git_path exists and the
     description file is writable.
     """
     desc_path = os.path.join(self.git_path, 'description')
     if not self.title:
         return
     if not os.path.exists(self.git_path):
         return
     if not os.access(desc_path, os.W_OK):
         return
     dvcs.repository(self.path).description = self.title
Example #20
0
 def repo_annex_status(self):
     """Get annex status of collection repo.

     The value is kept in self._astatus; dvcs.annex_status is only called
     when nothing is stored yet and self.git_path exists.
     """
     if self._astatus or not os.path.exists(self.git_path):
         return self._astatus
     fresh = dvcs.annex_status(dvcs.repository(self.path))
     if fresh:
         self._astatus = fresh
     return self._astatus
Example #21
0
def file_destroy(user_name, user_mail, collection_path, entity_uid, rm_files, updated_files, agent=''):
    """Command-line function for removing files from an entity.

    - instantiate collection, entity, repo objects
    - git-rm the files
    - update entity control and changelog
    - commit everything

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param rm_files: List of paths to files to delete (relative to entity files dir).
    @param updated_files: List of paths to updated file(s), relative to entitys.
    @param agent: (optional) Name of software making the change.
    @return: tuple (0, 'ok')
    """
    collection = DDRCollection(collection_path)
    entity = DDREntity(collection.entity_path(entity_uid))
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    remote_names = [remote.name for remote in repo.remotes]
    if GIT_REMOTE_NAME not in remote_names:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)

    # updated file paths are relative to collection root
    to_commit = []
    for relpath in updated_files:
        to_commit.append(os.path.join('files', entity.uid, relpath))

    # Only list the original file in changelog
    # TODO use a models.File function to ID the original file
    listed = []
    for path in rm_files:
        if ('-a.jpg' in path) or ('.json' in path):
            continue
        listed.append(path)

    # remove the files
    # NOTE: entity files must be removed at this point so the entity will be
    # properly removed from the control file
    git_cmd = repo.git
    for path in rm_files:
        git_cmd.rm('-rf', path)

    # update entity control
    control = entity.control()
    control.update_checksums(entity)
    control.write()
    to_commit.append(control.path_rel)

    # update entity changelog
    entries = ['Deleted entity file {}'.format(path) for path in listed]
    if agent:
        entries.append('@agent: %s' % agent)
    write_changelog_entry(entity.changelog_path, entries, user_name, user_mail)
    to_commit.append(entity.changelog_path_rel)

    # add files and commit
    message = dvcs.compose_commit_message('Deleted entity file(s)', agent=agent)
    repo = commit_files(repo, message, to_commit, [])
    return 0,'ok'
Example #22
0
def merge( request, repo, org, cid ):
    """
    Decides how to merge the various files in a merge conflict.
    Sends user around to different editors and things until everything is merged.

    On a valid POST the form's 'which' value selects dvcs.merge_commit or
    dvcs.diverge_commit and the user is redirected; otherwise the merge
    page is rendered with the current repository state.
    """
    collection_path = Collection.collection_path(request,repo,org,cid)
    repository = dvcs.repository(collection_path)
    collection = Collection.from_json(collection_path)
    task_id = collection.locked()
    status = commands.status(collection_path)
    ahead = collection.repo_ahead()
    behind = collection.repo_behind()
    diverged = collection.repo_diverged()
    conflicted = collection.repo_conflicted()
    unmerged = dvcs.list_conflicted(repository)
    staged = dvcs.list_staged(repository)

    if request.method != 'POST':
        # Pre-select the action that matches the current repo state.
        if conflicted and not unmerged:
            which = 'merge'
        elif diverged and staged:
            which = 'commit'
        else:
            which = 'unknown'
        form = MergeCommitForm({'path':collection_path, 'which':which,})
    else:
        form = MergeCommitForm(request.POST)
        if form.is_valid():
            which = form.cleaned_data['which']
            committed = True
            if which == 'merge':
                dvcs.merge_commit(repository)
            elif which == 'commit':
                dvcs.diverge_commit(repository)
            else:
                committed = False
            if not committed:
                # Nothing was committed: send the user back to this page.
                return HttpResponseRedirect( reverse('webui-merge', args=[repo,org,cid]) )
            if task_id:
                collection.unlock(task_id)
            messages.error(request, 'Merge conflict has been resolved. Please sync to make your changes available to other users.')
            return HttpResponseRedirect( reverse('webui-collection', args=[repo,org,cid]) )
        # An invalid form falls through and is re-rendered below.

    context = {
        'repo': repo,
        'org': org,
        'cid': cid,
        'collection_path': collection_path,
        'collection': collection,
        'status': status,
        'conflicted': conflicted,
        'ahead': ahead,
        'behind': behind,
        'unmerged': unmerged,
        'diverged': diverged,
        'staged': staged,
        'form': form,
    }
    return render_to_response(
        'webui/merge/index.html',
        context,
        context_instance=RequestContext(request, processors=[])
    )
Example #23
0
def file_destroy(user_name,
                 user_mail,
                 collection,
                 entity,
                 rm_files,
                 updated_files,
                 agent='',
                 commit=True):
    """Remove file and metadata

    - open the collection repo and check out master
    - git-rm the files
    - write an entity changelog entry
    - stage the updated files and changelog, optionally commit

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param entity: Entity
    @param rm_files: List of paths to files to delete (relative to entity files dir).
    @param updated_files: List of paths to updated file(s), relative to entitys.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: exit,message,touched_files ('ok' if successful)
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)

    # updated file paths are relative to collection root.
    # Work on a copy so the caller's list is not modified.
    touched = list(updated_files)

    # remove the files
    # NOTE: File must be removed from filesystem at this point
    # so the File will be properly removed from the control file
    for path in rm_files:
        repo.git.rm('-rf', path)

    # update entity changelog
    # dont list access files in changelog
    # TODO use a models.File function to ID the original file
    entries = []
    for path in rm_files:
        if ('-a.jpg' in path) or ('.json' in path):
            continue
        entries.append('Deleted file {}'.format(os.path.basename(path)))
    if agent:
        entries.append('@agent: %s' % agent)
    write_changelog_entry(entity.changelog_path, entries, user_name,
                          user_mail)

    touched.append(entity.changelog_path_rel)
    dvcs.stage(repo, touched)
    if commit:
        dvcs.commit(repo, 'Deleted file(s)', agent)
    return 0, 'ok', touched
Example #24
0
 def repo_fetch(self):
     """Fetch latest changes to collection repo from origin/master.

     Returns the result of dvcs.fetch, or a "not a git repository"
     message if self.git_path does not exist.
     """
     if not os.path.exists(self.git_path):
         return '%s is not a git repository' % self.path
     return dvcs.fetch(dvcs.repository(self.path))
Example #25
0
def entity_destroy(user_name, user_mail, collection_path, entity_uid, agent=''):
    """Command-line function for removing an entity from the collection.

    - check that paths exist
    - instantiate collection, repo objects
    - git-rm the entity dir
    - update collection control and changelog
    - commit everything

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param agent: (optional) Name of software making the change.
    @return: tuple (0, 'ok')
    """
    entity_dir = os.path.join(collection_path, 'files', entity_uid)

    # Fail early if either path is missing.
    for label, path in (('collection_path', collection_path),
                        ('entity', entity_dir)):
        if not os.path.exists(path):
            raise Exception('%s not found: %s' % (label, path))

    collection = DDRCollection(collection_path)
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    remote_names = [remote.name for remote in repo.remotes]
    if GIT_REMOTE_NAME not in remote_names:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)

    # remove entity directory
    # NOTE: entity files must be removed at this point so the entity will be
    # properly removed from the control file
    repo.git.rm('-rf', entity_dir)

    # update collection control
    control = collection.control()
    control.update_checksums(collection)
    control.write()
    to_commit = [control.path]

    # prep collection log entries
    headline = 'Deleted entity {}'.format(entity_uid)
    entries = [headline]
    if agent:
        entries.append('@agent: %s' % agent)
    message = dvcs.compose_commit_message(headline, agent=agent)

    # collection changelog
    write_changelog_entry(collection.changelog_path,
                          entries,
                          user=user_name, email=user_mail)
    to_commit.append(collection.changelog_path)

    # commit
    repo = commit_files(repo, message, to_commit)
    return 0,'ok'
Example #26
0
def test_repository():
    """Check the config values present after calling dvcs.repository.

    Covers: git_set_configs, annex_set_configs, repository.
    The repo is created in a timestamped directory under TESTING_BASE_DIR.
    """
    stamp = datetime.now(config.TZ).strftime('%Y%m%d-%H%M%S')
    repo_dir = os.path.join(TESTING_BASE_DIR, 'ddr-test-%s' % stamp)
    user = '******'
    mail = '*****@*****.**'
    repo = git.Repo.init(repo_dir)
    dvcs.repository(path=repo_dir, user_name=user, user_mail=mail)
    reader = repo.config_reader()
    reader.sections()

    def section(name):
        # Collapse a config section's items into a {key: value} dict.
        return {pair[0]: pair[1] for pair in reader.items(name)}

    core = section('core')
    identity = section('user')
    annex = section('annex')
    assert core.get('fileMode') == 'false'
    assert identity.get('name') == user
    assert identity.get('email') == mail
    assert annex.get('sshcaching') == 'false'
Example #27
0
 def repo_status(self):
     """Get status of collection repo vis-a-vis origin/master.

     The repo_(synced,ahead,behind,diverged,conflicted) functions all use
     the result of this function so that git-status is only called once.
     The value is kept in self._status; dvcs.repo_status is only called
     when nothing is stored yet and self.git_path exists.
     """
     if self._status or not os.path.exists(self.git_path):
         return self._status
     fresh = dvcs.repo_status(dvcs.repository(self.path), short=True)
     if fresh:
         self._status = fresh
     return self._status
Example #28
0
def entity_destroy(user_name,
                   user_mail,
                   entity,
                   updated_files,
                   agent='',
                   commit=True):
    """Command-line function for removing an entity from its collection.

    - instantiate collection, parent, repo objects
    - git-rm the entity dir
    - write a changelog entry to the entity's parent
    - stage the updated files and changelog, optionally commit

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param entity: Entity
    @param updated_files: List of paths to updated file(s), relative to entitys.
        The list is not modified.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: tuple (0, 'ok')
    """
    collection = entity.collection()
    parent = entity.identifier.parent().object()
    repo = dvcs.repository(collection.path_abs, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # Copy the list: appending to the caller's list would leak the
    # changelog path into the argument that was passed in.
    git_files = list(updated_files)

    # remove entity directory
    # NOTE: entity files must be removed at this point so the entity will be
    # properly removed from the control file
    repo.git.rm('-rf', entity.path_abs)

    # prep collection log entries
    changelog_messages = [
        'Deleted entity {}'.format(entity.id),
    ]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(changelog_messages[0],
                                                 agent=agent)

    # collection changelog
    write_changelog_entry(parent.changelog_path,
                          changelog_messages,
                          user=user_name,
                          email=user_mail)
    git_files.append(parent.changelog_path)
    dvcs.stage(repo, git_files)
    # commit
    if commit:
        repo = commit_files(repo, commit_message, git_files)
    return 0, 'ok'
Example #29
0
def stage_files(entity, git_files, annex_files, log, show_staged=True):
    """Stage files; check before and after to ensure all files get staged

    Annex files are staged before git files.  If any of the files is still
    reported as modified afterwards, log.crash is called.

    @param entity: DDR.models.entities.Entity
    @param git_files: list
    @param annex_files: list
    @param log: AddFileLogger
    @param show_staged: bool (not used by this function)
    @returns: repo
    """
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)

    # Files that are going to the annex must not also be staged via git.
    plain_files = []
    for candidate in git_files:
        if candidate not in annex_files:
            plain_files.append(candidate)

    log.ok('| BEFORE staging')
    staged_before, modified_before, untracked_before = repo_status(repo, log)

    targets = sorted(set(plain_files + annex_files))
    log.ok('| staging %s files:' % len(targets))
    for target in targets:
        log.ok('|   %s' % target)

    try:
        log.ok('| annex stage')
        # Stage annex files (binaries) before non-binary git files
        # else binaries might end up in .git/objects/ which would be NOT GOOD
        dvcs.annex_stage(repo, annex_files)
        log.ok('| git stage')
        # If git_files contains binaries they are already staged by now.
        dvcs.stage(repo, plain_files)
        log.ok('| ok')
    except:
        # FAILED! print traceback to addfile log
        log.not_ok(traceback.format_exc().strip())

    log.ok('| AFTER staging')
    staged_after, modified_after, untracked_after = repo_status(repo, log)

    # Crash if any target is still listed as modified.
    leftovers = []
    for target in targets:
        if target in modified_after:
            leftovers.append(target)
    if leftovers:
        log.not_ok('These files are still modified')
        for target in leftovers:
            log.not_ok('| %s' % target)
        log.crash('Add file aborted, see log file for details: %s' % log.logpath)

    return repo
Example #30
0
def git_status( request, cid ):
    """Render the git-status page for the collection identified by cid.

    The template context holds the collection, its stored git and annex
    status (with placeholder text when unavailable), the status timestamp
    and the repository's remotes.
    """
    collection = Collection.from_identifier(Identifier(cid))
    alert_if_conflicted(request, collection)
    gitstatus = collection.gitstatus()
    repo = dvcs.repository(collection.path)
    remotes = dvcs.remotes(repo)
    context = {
        'collection': collection,
        'status': gitstatus.get('status', 'git-status unavailable'),
        'astatus': gitstatus.get('annex_status', 'annex-status unavailable'),
        'timestamp': gitstatus.get('timestamp'),
        'remotes': remotes,
    }
    return render(request, 'webui/collections/git-status.html', context)
Example #31
0
def entity_destroy(user_name, user_mail, collection, entity, agent=''):
    """Command-line function for removing an entity from its collection.
    
    - check that the collection and entity paths exist
    - open the repo, check out master, add the remote
    - git-rm the entity directory
    - update collection control file and changelog
    - commit both files
    
    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param entity: Entity
    @param agent: (optional) Name of software making the change.
    @return: tuple (0, 'ok')
    @raises Exception: if collection.path_abs or entity.path_abs does not exist
    """
    if not os.path.exists(collection.path_abs):
        raise Exception('collection_path not found: %s' % collection.path_abs)
    if not os.path.exists(entity.path_abs):
        raise Exception('entity not found: %s' % entity.path_abs)
    
    repo = dvcs.repository(collection.path_abs, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    
    # The entity files have to be gone BEFORE the control file is rewritten,
    # otherwise the entity would still be listed in it.
    repo.git.rm('-rf', entity.path_abs)
    
    # collection control file
    control = collection.control()
    control.update_checksums(collection)
    control.write()
    
    # changelog lines; the first line doubles as the commit summary
    summary = 'Deleted entity {}'.format(entity.id)
    messages = [summary]
    if agent:
        messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(summary, agent=agent)
    
    # collection changelog
    write_changelog_entry(
        collection.changelog_path, messages,
        user=user_name, email=user_mail
    )
    
    # commit control file and changelog
    git_files = [control.path, collection.changelog_path]
    commit_files(repo, commit_message, git_files)
    return 0, 'ok'
Example #32
0
def test_update_files(tmpdir, collection, test_csv_dir, test_files_dir):
    """Import the file-update CSV, then check staging, hashes and annex state."""
    hashes_before = collect_hashes(collection.path_abs)
    csv_path = os.path.join(test_csv_dir, 'ddrimport-file-update.csv')
    rewrite_file_paths(csv_path, test_files_dir)
    log_path = os.path.join(test_files_dir, 'ddrimport-file-update.log')
    batch.Importer.import_files(
        csv_path, collection.identifier,
        VOCABS_URL, GIT_USER, GIT_MAIL, AGENT,
        log_path=log_path, tmp_dir=test_files_dir,
    )
    repo = dvcs.repository(collection.path_abs)
    staged = sorted(dvcs.list_staged(repo))
    # every expected file must have been staged by the import
    unstaged = sorted(
        path for path in EXPECTED_UPDATE_FILES if path not in staged
    )
    for n, path in enumerate(unstaged, start=1):
        print('UNSTAGED %s %s' % (n, path))
    print(repo)
    print(log_path)
    assert not unstaged
    # save and commit
    repo = dvcs.repository(collection.path_abs)
    repo.index.commit('test_update_files')
    # hashes must be present and must not have changed
    check_file_hashes(collection.path_abs)
    hashes_after = collect_hashes(collection.path_abs)
    check_hashes(hashes_before, hashes_after)
    # binaries belong in the annex, not in .git/objects
    print('log_path %s' % log_path)
    assert not find_binaries_in_git_objects(repo)
    assert not find_missing_annex_binaries(repo)
Example #33
0
def entity_update(user_name,
                  user_mail,
                  collection,
                  entity,
                  updated_files,
                  agent='',
                  commit=True):
    """Command-line function for recording changes to an entity's file(s).
    
    Writes an entry to the entity changelog (NOT the collection changelog)
    and, when `commit` is true, hands the files to commit_files().
    NOTE: Does not push to the workbench server.
    
    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param entity: Entity
    @param updated_files: List of paths to updated file(s), relative to entitys.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: tuple (0, 'ok')
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)

    # paths handed to git are relative to the collection root
    git_files = [
        os.path.join('files', entity.id, str(f)) for f in updated_files
    ]

    # one changelog line per updated file, plus the agent if given
    changelog_lines = [
        'Updated entity file {}'.format(os.path.join(entity.id, f))
        for f in updated_files
    ]
    if agent:
        changelog_lines.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(
        'Updated entity file(s)', agent=agent
    )

    write_changelog_entry(
        entity.changelog_path, changelog_lines,
        user=user_name, email=user_mail
    )
    git_files.append(entity.changelog_path_rel)
    if not commit:
        return 0, 'ok'
    # add files and commit
    commit_files(repo, commit_message, git_files, [])
    return 0, 'ok'
Example #34
0
def sync(user_name, user_mail, collection_path):
    """Sync repo with bare clone on hub server; use instead of git-annex-sync.
    
    Git-annex has a "sync" command for communicating annex changes between
    repositories, but it is designed to be used between non-bare repositories.
    Normally Git does not support pushing to non-bare repositories, and
    git-annex does some trickery involving "synced/BRANCH" branches to make
    this work.
    Reference: http://git-annex.branchable.com/sync/
    
    When git-annex-sync is used between a non-bare repo and a bare repo
    (e.g. between a local repo and our hub server running Gitolite),
    the "synced/master" branches do not get merged in to master and syncing
    no longer works.  Therefore it is necessary to sync manually.
    
    If you think you want to use git-annex-sync, remember that we tried this
    in commit 1857a7aa3f and it did not work and we reverted to manual syncing.
    
    The manual sync pulls master and git-annex from the remote, then pushes
    git-annex and master back.  master is handled last so that it is the
    checked-out branch when the function returns normally.
    
    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @return: tuple (0, 'ok')
    """
    collection = DDRCollection(collection_path)
    repo = dvcs.repository(collection.path, user_name, user_mail)
    logging.debug('repo: %s' % repo)
    dvcs.set_annex_description(repo)
    if GIT_REMOTE_NAME not in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    # list remotes
    logging.debug('remotes')
    for remote in dvcs.remotes(collection_path):
        logging.debug('- %s %s' % (remote['name'], remote['url']))
    # pull
    for branch in ('master', 'git-annex'):
        logging.debug('git pull %s %s' % (GIT_REMOTE_NAME, branch))
        repo.git.checkout(branch)
        repo.git.pull(GIT_REMOTE_NAME, branch)
    # push (log says "push": these steps used to be mislabelled as "pull")
    for branch in ('git-annex', 'master'):
        logging.debug('git push %s %s' % (GIT_REMOTE_NAME, branch))
        repo.git.checkout(branch)
        repo.git.push(GIT_REMOTE_NAME, branch)
    logging.debug('OK')
    return 0, 'ok'
Example #35
0
def git_status(request, repo, org, cid):
    """Render the git / git-annex status page for the requested collection.
    
    @param request: Request object; the collection is loaded from it
    @param repo: not used in the body
    @param org: not used in the body
    @param cid: not used in the body
    @returns: Response rendered from webui/collections/git-status.html
    """
    collection = Collection.from_request(request)
    alert_if_conflicted(request, collection)
    status_info = collection.gitstatus()
    git_repo = dvcs.repository(collection.path)
    remotes = dvcs.remotes(git_repo)
    # Fall back to placeholder text when a status entry is absent.
    context = {
        'collection': collection,
        'status': status_info.get('status', 'git-status unavailable'),
        'astatus': status_info.get('annex_status', 'annex-status unavailable'),
        'timestamp': status_info.get('timestamp'),
        'remotes': remotes,
    }
    return render_to_response(
        'webui/collections/git-status.html',
        context,
        context_instance=RequestContext(request, processors=[])
    )
Example #36
0
def annex_push(collection, file_path_rel, user_name=None, user_mail=None):
    """Push a git-annex file to workbench.

    Example file_paths:
        ddr-densho-1-1/files/video1.mov
        ddr-densho-42-17/files/image35.jpg
        ddr-one-35-248/files/newspaper.pdf
    
    $ git annex copy PATH --to=REMOTE
    
    The outcome of the copy is only logged (at debug level); the return
    value does not reflect whether the file actually reached the remote.
    
    @param collection: Collection
    @param file_path_rel: Path to file relative to collection root
    @param user_name: (optional) Username passed to dvcs.repository
    @param user_mail: (optional) User email passed to dvcs.repository
    @return: tuple (0, 'ok') after the copy was attempted, or
        (1, 'no collection' | 'no annex' | 'no file') if a path is missing
    """
    file_path_abs = os.path.join(collection.path, file_path_rel)
    logging.debug('    collection.path {}'.format(collection.path))
    logging.debug('    file_path_rel {}'.format(file_path_rel))
    logging.debug('    file_path_abs {}'.format(file_path_abs))
    if not os.path.exists(collection.path):
        logging.error('    NO COLLECTION AT {}'.format(collection.path))
        return 1, 'no collection'
    if not os.path.exists(collection.annex_path):
        logging.error('    NO GIT ANNEX AT {}'.format(collection.annex_path))
        return 1, 'no annex'
    if not os.path.exists(file_path_abs):
        logging.error('    NO FILE AT {}'.format(file_path_abs))
        return 1, 'no file'
    # let's do this thing
    # user_name/user_mail used to be referenced here without being defined
    # anywhere in this function; they are now optional arguments and are
    # only handed to dvcs.repository when both are supplied.
    if user_name and user_mail:
        repo = dvcs.repository(collection.path, user_name, user_mail)
    else:
        repo = dvcs.repository(collection.path)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    logging.debug('    git annex copy -t {} {}'.format(config.GIT_REMOTE_NAME,
                                                       file_path_rel))
    stdout = repo.git.annex('copy', '-t', config.GIT_REMOTE_NAME,
                            file_path_rel)
    logging.debug('\n{}'.format(stdout))
    # confirm that it worked
    whereis = dvcs.annex_whereis_file(repo, file_path_rel)
    if whereis['success']:
        # descriptions of every location other than this repo
        remotes = [
            r['description'] for r in whereis['whereis'] if not r['here']
        ]
        logging.debug('    present in remotes {}'.format(remotes))
        logging.debug(
            '    it worked: {}'.format(config.GIT_REMOTE_NAME in remotes))
    logging.debug('    DONE')
    return 0, 'ok'
Example #37
0
def entity_update(user_name, user_mail, collection_path, entity_uid, updated_files, agent=''):
    """Command-line function for committing changes to an entity's file(s).
    
    Writes an entry to the entity changelog (NOT the collection changelog)
    and commits the listed files together with that changelog.
    NOTE: Does not push to the workbench server.
    
    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param updated_files: List of paths to updated file(s), relative to entitys.
    @param agent: (optional) Name of software making the change.
    @return: tuple (0, 'ok')
    """
    collection = DDRCollection(collection_path)
    entity = DDREntity(collection.entity_path(entity_uid))
    
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    remote_names = [r.name for r in repo.remotes]
    if GIT_REMOTE_NAME not in remote_names:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    
    # paths handed to git are relative to the collection root
    git_files = [
        os.path.join('files', entity.uid, f) for f in updated_files
    ]
    
    # one changelog line per updated file, plus the agent if given
    changelog_lines = [
        'Updated entity file {}'.format(os.path.join(entity.uid, f))
        for f in updated_files
    ]
    if agent:
        changelog_lines.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(
        'Updated entity file(s)', agent=agent
    )
    
    write_changelog_entry(
        entity.changelog_path, changelog_lines,
        user=user_name, email=user_mail
    )
    git_files.append(entity.changelog_path_rel)
    # add files and commit
    commit_files(repo, commit_message, git_files, [])
    return 0, 'ok'
Example #38
0
def update(base_dir, collection_path):
    """Collect fresh status info for the collection, store it, and return it.
    
    Gathers: timestamp, elapsed, status, annex_status, syncstatus.
    The sync status is always requested with force=True.
    
    @param base_dir: Passed through to write()
    @param collection_path: Path of the collection repo
    @returns: Result of loads() on the text produced by write()
        (documented as a dict)
    """
    started = datetime.now()
    repo = dvcs.repository(collection_path)
    status = dvcs.repo_status(repo, short=True)
    annex_status = dvcs.annex_status(repo)
    timestamp = datetime.now()
    syncstatus = sync_status(
        collection_path,
        git_status=status, timestamp=timestamp, force=True
    )
    # elapsed covers the git/annex status calls only, not sync_status()
    elapsed = timestamp - started
    return loads(
        write(
            base_dir, collection_path,
            timestamp, elapsed, status, annex_status, syncstatus
        )
    )
Example #39
0
def entity_destroy(user_name, user_mail, entity, updated_files, agent='', commit=True):
    """Command-line function for removing an entity from its collection.
    
    - look up the entity's collection and parent object
    - open the repo, check out master, add the remote
    - git-rm the entity directory
    - write a changelog entry on the parent
    - stage everything and (optionally) commit
    
    NOTE: the parent's changelog path is appended to the `updated_files`
    list that the caller passed in.
    
    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param entity: Entity
    @param updated_files: List of paths to updated file(s), relative to entitys.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: tuple (0, 'ok')
    """
    collection = entity.collection()
    parent = entity.identifier.parent().object()
    repo = dvcs.repository(collection.path_abs, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # same list object as the argument, not a copy
    git_files = updated_files
    
    # The entity files have to be removed at this point so the entity is
    # properly removed from the control file.
    repo.git.rm('-rf', entity.path_abs)
    
    # changelog lines; the first line doubles as the commit summary
    summary = 'Deleted entity {}'.format(entity.id)
    messages = [summary]
    if agent:
        messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(summary, agent=agent)
    
    # changelog of the parent object
    write_changelog_entry(
        parent.changelog_path, messages,
        user=user_name, email=user_mail
    )
    git_files.append(parent.changelog_path)
    dvcs.stage(repo, git_files)
    if not commit:
        return 0, 'ok'
    # commit
    commit_files(repo, commit_message, git_files)
    return 0, 'ok'
Example #40
0
def object_metadata(module, repo_path):
    """Describe the application and model definitions used for an object.
    
    @param module: collection, entity, files model definitions module
    @param repo_path: Absolute path to root of object's repo
    @returns: dict with keys application, app_commit, app_release,
        defs_path, models_commit, git_version
    """
    repo = dvcs.repository(repo_path)
    # "<git version>; <git-annex version>"
    versions = [dvcs.git_version(repo), dvcs.annex_version(repo)]
    gitversion = '; '.join(versions)
    app_commit = dvcs.latest_commit(config.INSTALL_PATH)
    defs_path = modules.Module(module).path
    models_commit = dvcs.latest_commit(modules.Module(module).path)
    return {
        'application': 'https://github.com/densho/ddr-cmdln.git',
        'app_commit': app_commit,
        'app_release': VERSION,
        'defs_path': defs_path,
        'models_commit': models_commit,
        'git_version': gitversion,
    }
Example #41
0
def entity_update(user_name, user_mail, collection, entity, updated_files, agent='', commit=True):
    """Command-line function for recording changes to an entity's file(s).
    
    Writes an entry to the entity changelog (NOT the collection changelog)
    and, when `commit` is true, hands the files to commit_files().
    NOTE: Does not push to the workbench server.
    
    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param entity: Entity
    @param updated_files: List of paths to updated file(s), relative to entitys.
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: tuple (0, 'ok')
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    
    # paths handed to git are relative to the collection root
    git_files = [
        os.path.join('files', entity.id, str(f)) for f in updated_files
    ]
    
    # one changelog line per updated file, plus the agent if given
    changelog_lines = [
        'Updated entity file {}'.format(os.path.join(entity.id, f))
        for f in updated_files
    ]
    if agent:
        changelog_lines.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(
        'Updated entity file(s)', agent=agent
    )
    
    write_changelog_entry(
        entity.changelog_path, changelog_lines,
        user=user_name, email=user_mail
    )
    git_files.append(entity.changelog_path_rel)
    if not commit:
        return 0, 'ok'
    # add files and commit
    commit_files(repo, commit_message, git_files, [])
    return 0, 'ok'
Example #42
0
def update(user_name,
           user_mail,
           collection,
           updated_files,
           agent='',
           commit=False):
    """Command-line function for recording changes to collection file(s).
    
    Writes a collection changelog entry and, when `commit` is true, commits
    the files.  The changelog path is appended to the `updated_files` list
    passed in by the caller.
    NOTE: Does not push to the workbench server.
    
    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @param updated_files: List of relative paths to updated file(s).
    @param agent: (optional) Name of software making the change.
    @param commit: (optional) Commit files after staging them.
    @return: tuple (0, 'ok')
    """
    repo = dvcs.repository(collection.path, user_name, user_mail)
    if repo:
        logging.debug('    git repo {}'.format(collection.path))
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)

    # one changelog line per updated file, plus the agent if given
    changelog_lines = [
        'Updated collection file(s) {}'.format(f) for f in updated_files
    ]
    if agent:
        changelog_lines.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(
        'Updated metadata file(s)', agent=agent
    )

    # write changelog, then include it with the files if it exists
    write_changelog_entry(
        collection.changelog_path, changelog_lines, user_name, user_mail
    )
    if not os.path.exists(collection.changelog_path):
        logging.error('    COULD NOT UPDATE changelog')
    else:
        updated_files.append(collection.changelog_path)

    if not commit:
        return 0, 'ok'
    # add files and commit
    commit_files(repo, commit_message, updated_files, [])
    return 0, 'ok'
Example #43
0
 def cmp_model_definition_commits(self, document_commit, module_commit):
     """Tell whether a document's model defs are newer or older than the module's.
     
     Opens the repository at self.path and delegates the comparison to
     DDR.dvcs.cmp_commits; see that function for how to read the result.
     Note: if a document has no defs commit it is considered older
     than the module.
     NOTE: commit may not be found in log if definitions were on a
     branch at the time the document was committed.
     
     @param document_commit: Model definitions commit recorded in the document
     @param module_commit: Model definitions commit of the module
     @returns: dict See DDR.dvcs.cmp_commits
     """
     repo = dvcs.repository(self.path)
     return dvcs.cmp_commits(repo, document_commit, module_commit)
Example #44
0
def annex_push(collection_path, file_path_rel, user_name=None, user_mail=None):
    """Push a git-annex file to workbench.

    Example file_paths:
        ddr-densho-1-1/files/video1.mov
        ddr-densho-42-17/files/image35.jpg
        ddr-one-35-248/files/newspaper.pdf
    
    $ git annex copy PATH --to=REMOTE
    
    The outcome of the copy is only logged (at debug level); the return
    value does not reflect whether the file actually reached the remote.
    
    @param collection_path: Absolute path to collection repo.
    @param file_path_rel: Path to file relative to collection root
    @param user_name: (optional) Username passed to dvcs.repository
    @param user_mail: (optional) User email passed to dvcs.repository
    @return: tuple (0, 'ok') after the copy was attempted, or
        (1, 'no collection' | 'no annex' | 'no file') if a path is missing
    """
    collection = DDRCollection(collection_path)
    file_path_abs = os.path.join(collection.path, file_path_rel)
    logging.debug('    collection.path {}'.format(collection.path))
    logging.debug('    file_path_rel {}'.format(file_path_rel))
    logging.debug('    file_path_abs {}'.format(file_path_abs))
    if not os.path.exists(collection.path):
        logging.error('    NO COLLECTION AT {}'.format(collection.path))
        return 1, 'no collection'
    if not os.path.exists(collection.annex_path):
        logging.error('    NO GIT ANNEX AT {}'.format(collection.annex_path))
        return 1, 'no annex'
    if not os.path.exists(file_path_abs):
        logging.error('    NO FILE AT {}'.format(file_path_abs))
        return 1, 'no file'
    # let's do this thing
    # user_name/user_mail used to be referenced here without being defined
    # anywhere in this function; they are now optional arguments and are
    # only handed to dvcs.repository when both are supplied.
    if user_name and user_mail:
        repo = dvcs.repository(collection.path, user_name, user_mail)
    else:
        repo = dvcs.repository(collection.path)
    repo.git.checkout('master')
    if GIT_REMOTE_NAME not in [r.name for r in repo.remotes]:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    logging.debug('    git annex copy -t {} {}'.format(GIT_REMOTE_NAME, file_path_rel))
    stdout = repo.git.annex('copy', '-t', GIT_REMOTE_NAME, file_path_rel)
    logging.debug('\n{}'.format(stdout))
    # confirm that it worked
    remotes = dvcs.annex_whereis_file(repo, file_path_rel)
    logging.debug('    present in remotes {}'.format(remotes))
    logging.debug('    it worked: {}'.format(GIT_REMOTE_NAME in remotes))
    logging.debug('    DONE')
    return 0, 'ok'
Example #45
0
 def cmp_model_definition_commits(self, document_commit, module_commit):
     """Tell whether a document's model defs are newer or older than the module's.
     
     Opens the repository at self.path and delegates the comparison to
     DDR.dvcs.cmp_commits; see that function for how to read the result.
     If the repository cannot be opened (InvalidGitRepositoryError) an
     empty result dict is returned instead.
     Note: if a document has no defs commit it is considered older
     than the module.
     NOTE: commit may not be found in log if definitions were on a
     branch at the time the document was committed.
     
     @param document_commit: Model definitions commit recorded in the document
     @param module_commit: Model definitions commit of the module
     @returns: dict See DDR.dvcs.cmp_commits
     """
     try:
         repo = dvcs.repository(self.path)
     except dvcs.git.InvalidGitRepositoryError:
         # GitPython doesn't understand git worktrees, so hand back the
         # empty form of the dict (see dvcs.cmp_commits).
         outcome = {'a': '', 'b': '', 'op': '--'}
     else:
         outcome = dvcs.cmp_commits(repo, document_commit, module_commit)
     return outcome
Example #46
0
def test_import_entities(tmpdir, collection, test_csv_dir, test_files_dir):
    """Import entities from the new-entity CSV, then save, stage and commit them."""
    csv_path = os.path.join(test_csv_dir, 'ddrimport-entity-new.csv')
    out = batch.Importer.import_entities(
        csv_path, collection.identifier,
        VOCABS_URL, GIT_USER, GIT_MAIL, AGENT
    )
    print(out)
    assert [o.id for o in out] == EXPECTED_ENTITY_IDS
    # save each object without committing; gather the files it touched
    git_files = []
    for o in out:
        _code, status, touched = o.save(
            'pytest', '*****@*****.**', 'pytest',
            collection=collection, commit=False
        )
        print(o, status)
        git_files.extend(touched)
    # stage and commit everything in one go
    repo = dvcs.repository(collection.path_abs)
    dvcs.stage(repo, git_files)
    repo.index.commit('test_import_entities')
Example #47
0
 def check_repository(cidentifier):
     """Load the repository and check that it has no staged or modified files.
     
     Entity.add_files will not work properly if the repo contains staged
     or modified files.
     
     Results dict includes:
     - 'passed': boolean
     - 'repo': GitPython repository
     - 'staged': list of staged files
     - 'modified': list of modified files
     
     @param cidentifier: Identifier
     @returns: dict
     """
     logging.info('Checking repository')
     repo = dvcs.repository(cidentifier.path_abs())
     logging.info(repo)
     
     staged = dvcs.list_staged(repo)
     if staged:
         logging.error('*** Staged files in repo %s' % repo.working_dir)
         for path in staged:
             logging.error('*** %s' % path)
     
     modified = dvcs.list_modified(repo)
     if modified:
         logging.error('Modified files in repo: %s' % repo.working_dir)
         for path in modified:
             logging.error('*** %s' % path)
     
     # pass only with a usable repo and a completely clean working state
     passed = bool(repo and not (staged or modified))
     if passed:
         logging.info('ok')
     else:
         logging.error('FAIL')
     
     result = {}
     result['passed'] = passed
     result['repo'] = repo
     result['staged'] = staged
     result['modified'] = modified
     return result
Example #48
0
def update(user_name, user_mail, collection_path, updated_files, agent=''):
    """Command-line function for committing changes to collection file(s).
    
    Writes a collection changelog entry and commits the files.  The
    changelog path is appended to the `updated_files` list passed in by
    the caller.
    NOTE: Does not push to the workbench server.
    
    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param updated_files: List of relative paths to updated file(s).
    @param agent: (optional) Name of software making the change.
    @return: tuple (0, 'ok')
    """
    collection = DDRCollection(collection_path)
    
    repo = dvcs.repository(collection.path, user_name, user_mail)
    if repo:
        logging.debug('    git repo {}'.format(collection.path))
    repo.git.checkout('master')
    remote_names = [r.name for r in repo.remotes]
    if GIT_REMOTE_NAME not in remote_names:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    
    # one changelog line per updated file, plus the agent if given
    changelog_lines = [
        'Updated collection file(s) {}'.format(f) for f in updated_files
    ]
    if agent:
        changelog_lines.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message(
        'Updated metadata file(s)', agent=agent
    )
    
    # write changelog, then include it with the files if it exists
    write_changelog_entry(
        collection.changelog_path, changelog_lines, user_name, user_mail
    )
    if not os.path.exists(collection.changelog_path):
        logging.error('    COULD NOT UPDATE changelog')
    else:
        updated_files.append(collection.changelog_path)
    
    # add files and commit
    commit_files(repo, commit_message, updated_files, [])
    return 0, 'ok'
Example #49
0
def edit_raw( request, repo, org, cid ):
    """View for hand-editing the raw text of a file in the collection repo.
    
    GET: load the file named by the `filename` query argument and show its
    text in a MergeRawForm.
    POST: if the form validates, write the submitted text to the file, hand
    it to dvcs.merge_add, and redirect to the 'webui-merge' view; otherwise
    re-render the form.
    
    A missing filename, or one that resolves outside the collection
    directory, is rejected with an error message and a redirect to
    'webui-merge'.
    
    @param request: Request object
    @param repo: Passed to Collection.collection_path and the redirect URL
    @param org: Passed to Collection.collection_path and the redirect URL
    @param cid: Passed to Collection.collection_path and the redirect URL
    @returns: Rendered webui/merge/edit-raw.html or a redirect
    """
    git_name = request.session.get('git_name')
    git_mail = request.session.get('git_mail')
    # Both values are required.  This used to read `not git_name and git_mail`
    # which, by operator precedence, only fired when the name was missing
    # AND the mail was present.
    if not (git_name and git_mail):
        messages.error(request, WEBUI_MESSAGES['LOGIN_REQUIRED'])
    collection_path = Collection.collection_path(request,repo,org,cid)
    repository = dvcs.repository(collection_path)
    filename = ''
    if request.method == 'POST':
        filename = request.POST.get('filename') or ''
    elif request.method == 'GET':
        filename = request.GET.get('filename') or ''
    filepath = os.path.join(collection_path, filename)
    
    # filename comes straight from the request: never read or write anything
    # that is not a path inside the collection directory.
    collection_root = os.path.abspath(collection_path)
    inside_collection = os.path.abspath(filepath).startswith(
        collection_root + os.sep
    )
    if not (filename and inside_collection):
        messages.error(request, 'Invalid filename.')
        return HttpResponseRedirect( reverse('webui-merge', args=[repo,org,cid]) )
    
    if request.method == 'POST':
        form = MergeRawForm(request.POST)
        if form.is_valid():
            text = form.cleaned_data['text']
            # TODO validate XML
            with open(filepath, 'w') as f:
                f.write(text)
            # git add file
            dvcs.merge_add(repository, filename)
            return HttpResponseRedirect( reverse('webui-merge', args=[repo,org,cid]) )
    else:
        with open(filepath, 'r') as f:
            text = f.read()
        form = MergeRawForm({'filename': filename, 'text': text,})
    return render_to_response(
        'webui/merge/edit-raw.html',
        {'repo': repo,
         'org': org,
         'cid': cid,
         'filename':filename,
         'form': form,},
        context_instance=RequestContext(request, processors=[])
    )
Example #50
0
def edit_json(request, repo, org, cid):
    """View that lists the conflicting fields of a file in the collection repo.
    
    GET: read the file named by `filename` (if any), extract its fields with
    dvcs.conflicting_fields, and render them in a MergeJSONForm.
    POST: not implemented; hits an assertion.
    Any other method: redirect to the 'webui-merge' view.
    
    @param request: Request object
    @param repo: Passed to Collection.collection_path and the redirect URL
    @param org: Passed to Collection.collection_path and the redirect URL
    @param cid: Passed to Collection.collection_path and the redirect URL
    @returns: Rendered webui/merge/edit-json.html or a redirect
    """
    collection_path = Collection.collection_path(request,repo,org,cid)
    repository = dvcs.repository(collection_path)
    
    method = request.method
    if method == 'POST':
        filename = request.POST.get('filename', None)
    elif method == 'GET':
        filename = request.GET.get('filename', None)
    else:
        filename = ''
    
    if filename:
        with open(os.path.join(collection_path, filename), 'r') as f:
            fields = dvcs.conflicting_fields(f.read())
    else:
        fields = []
    
    if method == 'GET':
        form = MergeJSONForm(fields=fields)
        context = {
            'filename':filename,
            'fields':fields,
            'form':form,
        }
        return render_to_response(
            'webui/merge/edit-json.html',
            context,
            context_instance=RequestContext(request, processors=[])
        )
    # Saving is not implemented yet.  The intended flow (see edit_raw) is:
    # validate a MergeJSONForm, write its text to the file, then
    # dvcs.merge_add(repository, filename).
    assert method != 'POST'
    return HttpResponseRedirect( reverse('webui-merge', args=[repo,org,cid]) )
Example #51
0
def annex_pull(collection, file_path_rel, user_name=None, user_mail=None):
    """git-annex copy a file from workbench.

    Example file_paths:
        ddr-densho-1-1/files/video1.mov
        ddr-densho-42-17/files/image35.jpg
        ddr-one-35-248/files/newspaper.pdf
    
    $ git annex copy PATH --from=REMOTE
    
    The outcome of the copy is only logged (at debug level); the return
    value does not reflect whether the file actually arrived.
        
    @param collection: Collection
    @param file_path_rel: Path to file relative to collection root.
    @param user_name: (optional) Username passed to dvcs.repository
    @param user_mail: (optional) User email passed to dvcs.repository
    @return: tuple (0, 'ok') after the copy was attempted, or
        (1, 'no collection' | 'no annex') if a path is missing
    """
    file_path_abs = os.path.join(collection.path, file_path_rel)
    logging.debug('    collection.path {}'.format(collection.path))
    logging.debug('    file_path_rel {}'.format(file_path_rel))
    logging.debug('    file_path_abs {}'.format(file_path_abs))
    if not os.path.exists(collection.path):
        logging.error('    NO COLLECTION AT {}'.format(collection.path))
        return 1, 'no collection'
    if not os.path.exists(collection.annex_path):
        logging.error('    NO GIT ANNEX AT {}'.format(collection.annex_path))
        return 1, 'no annex'
    # let's do this thing
    # user_name/user_mail used to be referenced here without being defined
    # anywhere in this function; they are now optional arguments and are
    # only handed to dvcs.repository when both are supplied.
    if user_name and user_mail:
        repo = dvcs.repository(collection.path, user_name, user_mail)
    else:
        repo = dvcs.repository(collection.path)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # log the command that is actually run: "-f" (from), not "-t" (to)
    logging.debug('    git annex copy -f {} {}'.format(config.GIT_REMOTE_NAME,
                                                       file_path_rel))
    stdout = repo.git.annex('copy', '-f', config.GIT_REMOTE_NAME,
                            file_path_rel)
    logging.debug('\n{}'.format(stdout))
    # confirm that it worked: the path must be a symlink whose target exists
    exists = os.path.exists(file_path_abs)
    lexists = os.path.lexists(file_path_abs)
    islink = os.path.islink(file_path_abs)
    itworked = (exists and lexists and islink)
    logging.debug('    it worked: {}'.format(itworked))
    logging.debug('    DONE')
    return 0, 'ok'
Example #52
0
def sync(user_name, user_mail, collection):
    """Synchronize a collection repo with its remote on the hub server.

    Steps performed, in order:
    - open the repo with the given git identity,
    - set the annex description from the annex status and the drive label
      of the repo's working directory,
    - make sure the remote named config.GIT_REMOTE_NAME points at
      collection.git_url,
    - log every configured remote,
    - run ``git annex sync`` and log its output.

    Background from the original authors: git-annex-sync is designed for
    non-bare repositories and relies on "synced/BRANCH" branches; between a
    non-bare repo and a bare repo (e.g. the Gitolite hub) those branches were
    not merged into master (see commit 1857a7aa3f).
    Reference: http://git-annex.branchable.com/sync/

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @return: (0, 'ok')
    """
    repository = dvcs.repository(collection.path, user_name, user_mail)
    logging.debug('repo: %s' % repository)

    # describe this annex using the label of the drive it lives on
    label = storage.drive_label(repository.working_dir)
    status = dvcs.annex_status(repository)
    dvcs.annex_set_description(repository, status, drive_label=label)

    dvcs.remote_add(repository, collection.git_url, config.GIT_REMOTE_NAME)

    # list remotes
    logging.debug('remotes')
    for entry in dvcs.remotes(repository):
        name, target = entry['name'], entry['target']
        logging.debug('- %s %s' % (name, target))

    # sync
    logging.debug('git annex sync')
    sync_output = repository.git.annex('sync')
    logging.debug(sync_output)
    return 0, 'ok'
Example #53
0
def sync(user_name, user_mail, collection):
    """Run ``git annex sync`` for a collection against its hub remote.

    Before syncing, the annex description is refreshed (annex status plus
    the drive label of the working directory) and the remote named
    config.GIT_REMOTE_NAME is ensured to point at collection.git_url.
    All remotes and the sync output are written to the debug log.

    Historical note kept from the original: git-annex-sync targets non-bare
    repositories and uses "synced/BRANCH" branches; against a bare repo
    (e.g. the Gitolite hub) "synced/master" was not merged into master,
    which was tried and reverted in commit 1857a7aa3f.
    Reference: http://git-annex.branchable.com/sync/

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection: Collection
    @return: (0, 'ok')
    """
    result = (0, 'ok')
    repo = dvcs.repository(collection.path, user_name, user_mail)
    logging.debug('repo: %s' % repo)

    # annex description: drive label first, then current annex status
    working_drive = storage.drive_label(repo.working_dir)
    annex_state = dvcs.annex_status(repo)
    dvcs.annex_set_description(repo, annex_state, drive_label=working_drive)

    # make sure the hub remote is configured
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)

    # list remotes
    logging.debug('remotes')
    for rem in dvcs.remotes(repo):
        logging.debug('- %s %s' % (rem['name'], rem['target']))

    # sync
    logging.debug('git annex sync')
    logging.debug(repo.git.annex('sync'))
    return result
Example #54
0
def annex_pull(collection, file_path_rel):
    """git-annex copy a single file from the configured remote into a collection.

    Checks out 'master', makes sure the remote named config.GIT_REMOTE_NAME
    points at collection.git_url, then runs
    ``git annex copy -f <remote> <file_path_rel>`` in the collection repo.

    Example file_paths:
        ddr-densho-1-1/files/video1.mov
        ddr-densho-42-17/files/image35.jpg
        ddr-one-35-248/files/newspaper.pdf

    @param collection: Collection
    @param file_path_rel: Path to file relative to collection root.
    @return: (status, message) tuple: (0, 'ok') once the copy command has run,
        (1, 'no collection') or (1, 'no annex') if the repo is unusable.
    """
    file_path_abs = os.path.join(collection.path, file_path_rel)
    logging.debug('    collection.path {}'.format(collection.path))
    logging.debug('    file_path_rel {}'.format(file_path_rel))
    logging.debug('    file_path_abs {}'.format(file_path_abs))
    if not os.path.exists(collection.path):
        logging.error('    NO COLLECTION AT {}'.format(collection.path))
        return 1, 'no collection'
    if not os.path.exists(collection.annex_path):
        logging.error('    NO GIT ANNEX AT {}'.format(collection.annex_path))
        return 1, 'no annex'
    # let's do this thing
    # No user name/email is passed into this function, so open the repo
    # without them (undefined user_name/user_mail names were used here before).
    repo = dvcs.repository(collection.path)
    repo.git.checkout('master')
    dvcs.remote_add(repo, collection.git_url, config.GIT_REMOTE_NAME)
    # The logged command matches the one executed: '-f' copies FROM the remote.
    logging.debug('    git annex copy -f {} {}'.format(config.GIT_REMOTE_NAME, file_path_rel))
    stdout = repo.git.annex('copy', '-f', config.GIT_REMOTE_NAME, file_path_rel)
    logging.debug('\n{}'.format(stdout))
    # confirm that it worked: expect a symlink that resolves to existing content.
    # This check is informational only; the function returns 'ok' either way.
    exists = os.path.exists(file_path_abs)
    lexists = os.path.lexists(file_path_abs)
    islink = os.path.islink(file_path_abs)
    itworked = (exists and lexists and islink)
    logging.debug('    it worked: {}'.format(itworked))
    logging.debug('    DONE')
    return 0, 'ok'
Example #55
0
def stage_files(entity,
                git_files,
                annex_files,
                new_files,
                log,
                show_staged=True):
    """Stage git and git-annex files for an entity and verify the result.

    Stages git_files with dvcs.stage and annex_files with dvcs.annex_stage,
    then compares the number of staged files with the number predicted by
    predict_staged(). On a mismatch (including any exception while staging)
    the non-symlink destinations in new_files are moved back to their temp
    locations and log.crash() is called.

    @param entity: Object with a collection_path attribute.
    @param git_files: list of paths to stage with git.
    @param annex_files: list of paths to stage with git-annex.
    @param new_files: Iterable of (tmp, dest) path pairs used for cleanup.
    @param log: addfile log; ok(), not_ok(), crash() and logpath are used.
    @param show_staged: boolean; if true, list the staged files in the log.
    @return: repo
    """
    # TODO move to DDR.dvcs?
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)
    # These vars will be used to determine if stage operation is successful.
    # If called in batch operation there may already be staged files.
    # stage_planned   Files added/modified by this function call
    # stage_already   Files that were already staged
    # stage_predicted List of staged files that should result from this operation.
    # stage_new       Files that are being added.
    stage_planned = git_files + annex_files
    stage_already = dvcs.list_staged(repo)
    stage_predicted = predict_staged(stage_already, stage_planned)
    stage_new = [x for x in stage_planned if x not in stage_already]
    log.ok('| %s files to stage:' % len(stage_planned))
    for sp in stage_planned:
        log.ok('|   %s' % sp)
    stage_ok = False
    staged = []
    try:
        log.ok('git stage')
        dvcs.stage(repo, git_files)
        log.ok('annex stage')
        dvcs.annex_stage(repo, annex_files)
        log.ok('ok')
        staged = dvcs.list_staged(repo)
    except Exception:
        # FAILED! print traceback to addfile log
        # Only ordinary errors are swallowed here; KeyboardInterrupt and
        # SystemExit still propagate after the finally clause has run.
        log.not_ok(traceback.format_exc().strip())
    finally:
        if show_staged:
            log.ok('| %s files staged:' % len(staged))
            log.ok('show_staged %s' % show_staged)
            for sp in staged:
                log.ok('|   %s' % sp)
        # Success is judged purely by count: staged vs. predicted.
        if len(staged) == len(stage_predicted):
            log.ok('| %s files staged (%s new, %s modified)' %
                   (len(staged), len(stage_new), len(stage_already)))
            stage_ok = True
        else:
            log.not_ok('%s new files staged (should be %s)' %
                       (len(staged), len(stage_predicted)))
        if not stage_ok:
            log.not_ok('File staging aborted. Cleaning up')
            # try to pick up the pieces
            # mv files back to tmp_dir
            # TODO Properly clean up git-annex-added files.
            #      This clause moves the *symlinks* to annex files but leaves
            #      the actual binaries in the .git/annex objects dir.
            for tmp, dest in new_files:
                if os.path.islink(dest):
                    log.not_ok('| link (not moving) %s' % dest)
                else:
                    log.not_ok('| mv %s %s' % (dest, tmp))
                    shutil.move(dest, tmp)
            log.not_ok('finished cleanup. good luck...')
            # NOTE(review): log.crash presumably raises; confirm in the log class.
            log.crash('Add file aborted, see log file for details: %s' %
                      log.logpath)
    return repo
Example #56
0
def sync_group(groupfile, local_base, local_name, remote_base, remote_name):
    """Clone or update every repo in a group file, then annex sync/get it.

    For each entry returned by read_group_file(groupfile):
    - update the repo at <local_base>/<id> (fetch, pull master and git-annex),
      or clone it from GITOLITE into that same directory if it is missing;
    - add a remote named remote_name in the local repo pointing at
      <remote_base>/<id>, and a remote named local_name in that repo
      pointing back at the local one;
    - run ``git annex sync``;
    - run ``git annex get`` according to the entry's 'level': 'access' gets
      only files whose name ends with the access suffix, 'all' gets
      everything, any other value gets nothing.

    @param groupfile: Path handed to read_group_file.
    @param local_base: Directory containing the local repos.
    @param local_name: Name of the remote that points at the local repo.
    @param remote_base: Directory containing the counterpart repos.
    @param remote_name: Name of the remote that points at the counterpart repo.
    @return: (0, 'ok')
    """
    logging.debug('reading group file: %s' % groupfile)
    repos = read_group_file(groupfile)
    ACCESS_SUFFIX = ACCESS_FILE_APPEND + ACCESS_FILE_EXTENSION

    def logif(txt):
        # log command output only when it is not blank
        t = txt.strip()
        if t:
            logging.debug(t)

    def add_remote(path, name, target):
        # add remote `name` -> `target` to the repo at `path` unless present
        repo = git.Repo(path)
        if name in [rem.name for rem in repo.remotes]:
            logging.debug('remote exists: %s %s' % (name, target))
        else:
            logging.debug(path)
            logging.debug('remote add %s %s' % (name, target))
            repo.create_remote(name, target)
            logging.debug('ok')

    for r in repos:
        repo_path = os.path.join(local_base, r['id'])
        logging.debug('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
        logging.debug('repo_path: %s' % repo_path)

        # clone/update
        if os.path.exists(repo_path):
            logging.debug('updating %s' % repo_path)
            repo = dvcs.repository(repo_path)
            repo.git.fetch('origin')
            repo.git.checkout('master')
            repo.git.pull('origin', 'master')
            repo.git.checkout('git-annex')
            repo.git.pull('origin', 'git-annex')
            repo.git.checkout('master')
            logging.debug('ok')
        else:
            url = '%s:%s.git' % (GITOLITE, r['id'])
            logging.debug('cloning %s' % url)
            # Clone into repo_path (not the cwd-relative id) so the clone
            # lands where every later step of this loop looks for it.
            repo = git.Repo.clone_from(url, repo_path)
            repo.git.config('annex.sshcaching', 'false')
            logging.debug('ok')

        # add/update remotes
        remote_path = os.path.join(remote_base, r['id'])
        add_remote(repo_path, remote_name, remote_path) # local -> remote
        add_remote(remote_path, local_name, repo_path)  # remote -> local

        # annex sync
        logging.debug('annex sync')
        response = repo.git.annex('sync')
        logif(response)

        # annex get
        level = r['level']
        logging.debug('level: %s' % level)
        if level == 'access':
            for root, dirs, files in os.walk(repo_path):
                if '.git' in dirs: # exclude .git dir
                    dirs.remove('.git')
                for f in files:
                    if f.endswith(ACCESS_SUFFIX):
                        # path relative to the repo root, as git-annex expects
                        path_rel = os.path.relpath(os.path.join(root, f), repo_path)
                        response = repo.git.annex('get', path_rel)
                        logif(response)
        elif level == 'all':
            logging.debug('git annex get .')
            response = repo.git.annex('get', '.')
            logif(response)
        logging.debug('DONE')

    return 0,'ok'
Example #57
0
def entity_annex_add(user_name, user_mail, collection_path, entity_uid, updated_files, new_annex_files, agent='', entity=None):
    """git-annex-add files that already sit in an entity's files dir, and commit.

    What happens here: the listed new files are queued for git-annex, the
    entity control file's checksums are refreshed and written, a changelog
    entry is written, and everything is handed to commit_files().
    The files themselves are not copied anywhere by this function, and no
    file role or other metadata is edited.

    TODO Refactor this when ddr-local models moved into ddr-cmdln
    WARNING - UGLY HACK!
    The 'entity' arg lets ddr-local pass in its own Entity object, which is
    then handed to the control file's update_checksums().

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param updated_files: list of paths to updated files (relative to collection repo).
    @param new_annex_files: List of paths to new files (relative to entity files dir).
    @param agent: (optional) Name of software making the change.
    @param entity: (optional) Entity object (see above)
    @return: (status, message) tuple; (0, 'ok') on success, (1, <reason>)
        when the annex dir, the entity, its files dir or a new file is missing.
    """
    collection = DDRCollection(collection_path)
    if not entity:
        entity = DDREntity(collection.entity_path(entity_uid))

    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    existing_remotes = [remote.name for remote in repo.remotes]
    if GIT_REMOTE_NAME not in existing_remotes:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)

    # preconditions: annex dir, entity dir and entity files dir must exist
    if not os.path.exists(collection.annex_path):
        logging.error('    .git/annex IS MISSING!')
        return 1,'.git/annex IS MISSING!'
    if not os.path.exists(entity.path):
        logging.error('    Entity does not exist: {}'.format(entity.uid))
        return 1,'entity does not exist: {}'.format(entity.uid)
    if not os.path.exists(entity.files_path):
        logging.error('    Entity files_path does not exist: {}'.format(entity.uid))
        return 1,'entity files_path does not exist: {}'.format(entity.uid)

    # new annex files
    # Each file is tracked under two spellings: relative to the collection
    # repo (for git-annex) and relative to the entity dir (for the changelog).
    entity_prefix = '{}/'.format(entity.path)
    annex_files = []
    added_rel_entity = []
    for name in new_annex_files:
        abs_path = os.path.join(entity.files_path, name)
        if not os.path.exists(abs_path):
            logging.error('    File does not exist: {}'.format(abs_path))
            return 1,'File does not exist: {}'.format(abs_path)
        added_rel_entity.append(abs_path.replace(entity_prefix, ''))
        annex_files.append(os.path.join(entity.files_path_rel, name))

    # updated files come first in the list of plain-git files
    git_files = list(updated_files)

    # update entity control
    control = entity.control()
    control.update_checksums(entity)
    control.write()
    git_files.append(control.path_rel)

    # prep log entries
    changelog_messages = [
        'Added entity file {}'.format(path) for path in added_rel_entity
    ]
    if agent:
        changelog_messages.append('@agent: %s' % agent)
    commit_message = dvcs.compose_commit_message('Added entity file(s)', agent=agent)

    # update entity changelog
    write_changelog_entry(
        entity.changelog_path, changelog_messages, user_name, user_mail
    )
    git_files.append(entity.changelog_path_rel)

    # add files and commit
    commit_files(repo, commit_message, git_files, annex_files)
    return 0,'ok'
Example #58
0
def sync_group(groupfile, local_base, local_name, remote_base, remote_name):
    """Clone or update every repo in a group file, then annex sync/get it.

    For each entry returned by read_group_file(groupfile):
    - update the repo at <local_base>/<id> (fetch, pull master and git-annex),
      or clone it from config.GITOLITE into that same directory if missing;
    - register remote_name in the local repo pointing at <remote_base>/<id>,
      and local_name in that repo pointing back at the local one;
    - run ``git annex sync``;
    - run ``git annex get`` according to the entry's 'level': 'access' gets
      only files whose name ends with the access suffix, 'all' gets
      everything, any other value gets nothing.

    @param groupfile: Path handed to read_group_file.
    @param local_base: Directory containing the local repos.
    @param local_name: Name of the remote that points at the local repo.
    @param remote_base: Directory containing the counterpart repos.
    @param remote_name: Name of the remote that points at the counterpart repo.
    @return: (0, 'ok')
    """
    logging.debug('reading group file: %s' % groupfile)
    repos = read_group_file(groupfile)
    ACCESS_SUFFIX = config.ACCESS_FILE_APPEND + config.ACCESS_FILE_EXTENSION

    def logif(txt):
        # log command output only when it is not blank
        t = txt.strip()
        if t:
            logging.debug(t)

    for r in repos:
        repo_path = os.path.join(local_base, r['id'])
        logging.debug('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
        logging.debug('repo_path: %s' % repo_path)

        # clone/update
        if os.path.exists(repo_path):
            logging.debug('updating %s' % repo_path)
            repo = dvcs.repository(repo_path)
            repo.git.fetch('origin')
            repo.git.checkout('master')
            repo.git.pull('origin', 'master')
            repo.git.checkout('git-annex')
            repo.git.pull('origin', 'git-annex')
            repo.git.checkout('master')
            logging.debug('ok')
        else:
            url = '%s:%s.git' % (config.GITOLITE, r['id'])
            logging.debug('cloning %s' % url)
            # Clone into repo_path (not the cwd-relative id) so the clone
            # lands where every later step of this loop looks for it.
            repo = git.Repo.clone_from(url, repo_path)
            repo.git.config('annex.sshcaching', 'false')
            logging.debug('ok')

        remote_path = os.path.join(remote_base, r['id'])
        # local -> remote
        dvcs.remote_add(git.Repo(repo_path, search_parent_directories=True), remote_path, remote_name)
        # remote -> local
        dvcs.remote_add(git.Repo(remote_path, search_parent_directories=True), repo_path, local_name)

        # annex sync
        logging.debug('annex sync')
        response = repo.git.annex('sync')
        logif(response)

        # annex get
        level = r['level']
        logging.debug('level: %s' % level)
        if level == 'access':
            for root, dirs, files in os.walk(repo_path):
                if '.git' in dirs: # exclude .git dir
                    dirs.remove('.git')
                for f in files:
                    if f.endswith(ACCESS_SUFFIX):
                        # path relative to the repo root, as git-annex expects
                        path_rel = os.path.relpath(os.path.join(root, f), repo_path)
                        response = repo.git.annex('get', path_rel)
                        logif(response)
        elif level == 'all':
            logging.debug('git annex get .')
            response = repo.git.annex('get', '.')
            logif(response)
        logging.debug('DONE')

    return 0,'ok'
Example #59
0
def entity_create(user_name, user_mail, collection_path, entity_uid, updated_files, templates, agent=''):
    """Create an entity directory inside a collection and commit it.

    The entity dir is created if missing, template files are copied into it,
    the collection control file's checksums are refreshed and written, and
    changelog entries are written for both the entity and the collection.
    All touched files plus updated_files are then passed to commit_files().

    @param user_name: Username for use in changelog, git log
    @param user_mail: User email address for use in changelog, git log
    @param collection_path: Absolute path to collection repo.
    @param entity_uid: A valid DDR entity UID
    @param updated_files: List of updated files (relative to collection root).
    @param templates: List of entity metadata templates (absolute paths).
    @param agent: (optional) Name of software making the change.
    @return: (0, 'ok')
    """
    collection = DDRCollection(collection_path)
    entity = DDREntity(collection.entity_path(entity_uid))

    repo = dvcs.repository(collection.path, user_name, user_mail)
    repo.git.checkout('master')
    remote_names = [remote.name for remote in repo.remotes]
    if GIT_REMOTE_NAME not in remote_names:
        repo.create_remote(GIT_REMOTE_NAME, collection.git_url)
    to_commit = []

    # entity dir
    if not os.path.exists(entity.path):
        os.makedirs(entity.path)

    # copy template files to entity; templates that do not exist are skipped
    for template in templates:
        if not os.path.exists(template):
            continue
        target = os.path.join(entity.path, os.path.basename(template))
        logging.debug('cp %s, %s' % (template, target))
        shutil.copy(template, target)
        if os.path.exists(target):
            to_commit.append(target)
        else:
            logging.error('COULD NOT COPY %s' % template)

    # entity control
    entity_control = entity.control()
    if os.path.exists(entity_control.path):
        to_commit.append(entity_control.path)
    else:
        logging.error('    COULD NOT CREATE control')
    # update collection control
    collection_control = collection.control()
    collection_control.update_checksums(collection)
    collection_control.write()
    to_commit.append(collection_control.path)

    # prep log entries: entity and collection get the same lines,
    # but each changelog receives its own list
    headline = 'Initialized entity {}'.format(entity.uid)
    entity_changelog_messages = [headline]
    changelog_messages = [headline]
    if agent:
        agent_line = '@agent: %s' % agent
        entity_changelog_messages.append(agent_line)
        changelog_messages.append(agent_line)
    commit_message = dvcs.compose_commit_message(changelog_messages[0], agent=agent)

    # ENTITY changelog
    write_changelog_entry(
        entity.changelog_path,
        entity_changelog_messages,
        user=user_name, email=user_mail
    )
    if os.path.exists(entity.changelog_path):
        to_commit.append(entity.changelog_path)
    else:
        logging.error('    COULD NOT CREATE changelog')
    # COLLECTION changelog
    write_changelog_entry(
        collection.changelog_path,
        changelog_messages,
        user=user_name, email=user_mail
    )
    to_commit.append(collection.changelog_path)

    # add updated collection files
    to_commit.extend(updated_files)

    # add files and commit (no annex files for a new entity)
    commit_files(repo, commit_message, to_commit, [])
    return 0,'ok'
Example #60
0
 def import_files(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, log_path=None, dryrun=False):
     """Adds or updates files from a CSV file
     
     Rows are split with Importer._file_is_new(): rows that are not new are
     loaded into the existing file object and, if modified, its JSON and the
     parent entity's changelog are written and staged. Rows that are new are
     ingested with ingest.add_file(), using <csv dir>/<basename_orig> as the
     source. No commit is made directly by this function.
     
     TODO how to handle excluded fields like XMP???
     
     @param csv_path: Absolute path to CSV data file.
     @param cidentifier: Identifier
     @param vocabs_path: Absolute path to vocab dir
     @param git_name: str
     @param git_mail: str
     @param agent: str
     @param log_path: str Absolute path to addfile log for all files
     @param dryrun: boolean
     @return: list of repo-relative paths written for updated existing files
         (file JSON and entity changelog); empty on a dry run.
     @raise Exception: if a parent entity dir or a source file is missing.
     """
     logging.info('batch import files ----------------------------')
     
     # TODO hard-coded model name...
     model = 'file'
     
     csv_dir = os.path.dirname(csv_path)
     logging.debug('csv_dir %s' % csv_dir)
 
     # TODO this still knows too much about entities and files...
     entity_class = identifier.class_for_name(
         identifier.MODEL_CLASSES['entity']['module'],
         identifier.MODEL_CLASSES['entity']['class']
     )
     logging.debug('entity_class %s' % entity_class)
     
     logging.info('Reading %s' % csv_path)
     headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
     logging.info('%s rows' % len(rowds))
     
     # check for modified or uncommitted files in repo
     repository = dvcs.repository(cidentifier.path_abs())
     logging.debug(repository)
 
     # file Identifiers keyed by file ID
     fidentifiers = {
         rowd['id']: identifier.Identifier(
             id=rowd['id'],
             base_path=cidentifier.basepath
         )
         for rowd in rowds
     }
     # parent (entity) Identifier for each file ID
     # .values() rather than .itervalues() so this runs on Python 2 and 3
     fidentifier_parents = {
         fi.id: Importer._fidentifier_parent(fi)
         for fi in fidentifiers.values()
     }
     # eidentifiers, removing duplicates
     eidentifiers = list(set(fidentifier_parents.values()))
     entities = {}
     bad_entities = []
     for eidentifier in eidentifiers:
         if os.path.exists(eidentifier.path_abs()):
             entity = eidentifier.object()
             entities[eidentifier.id] = entity
         else:
             if eidentifier.id not in bad_entities:
                 bad_entities.append(eidentifier.id)
     if bad_entities:
         for f in bad_entities:
             logging.error('    %s missing' % f)
         raise Exception('%s entities could not be loaded! - IMPORT CANCELLED!' % len(bad_entities))
 
     # separate into new and existing lists
     rowds_new = []
     rowds_existing = []
     for rowd in rowds:
         if Importer._file_is_new(fidentifiers[rowd['id']]):
             rowds_new.append(rowd)
         else:
             rowds_existing.append(rowd)
     
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     logging.info('Updating existing files')
     start_updates = datetime.now()
     git_files = []
     updated = []
     elapsed_rounds_updates = []
     staged = []
     obj_metadata = None
     for n,rowd in enumerate(rowds_existing):
         logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
         start_round = datetime.now()
         
         fidentifier = fidentifiers[rowd['id']]
         eidentifier = fidentifier_parents[fidentifier.id]
         entity = entities[eidentifier.id]
         file_ = fidentifier.object()
         modified = file_.load_csv(rowd)
         # Getting obj_metadata takes about 1sec each time
         # TODO caching works as long as all objects have same metadata...
         if not obj_metadata:
             obj_metadata = models.object_metadata(
                 fidentifier.fields_module(),
                 repository.working_dir
             )
         
         if dryrun:
             pass
         elif modified:
             logging.debug('    writing %s' % file_.json_path)
             file_.write_json(obj_metadata=obj_metadata)
             # TODO better to write to collection changelog?
             Importer._write_entity_changelog(entity, git_name, git_mail, agent)
             # stage
             git_files.append(file_.json_path_rel)
             git_files.append(entity.changelog_path_rel)
             updated.append(file_)
         
         elapsed_round = datetime.now() - start_round
         elapsed_rounds_updates.append(elapsed_round)
         logging.debug('| %s (%s)' % (fidentifier, elapsed_round))
     
     elapsed_updates = datetime.now() - start_updates
     logging.debug('%s updated in %s' % (len(elapsed_rounds_updates), elapsed_updates))
             
     if dryrun:
         pass
     elif git_files:
         logging.info('Staging %s modified files' % len(git_files))
         start_stage = datetime.now()
         dvcs.stage(repository, git_files)
         staged = util.natural_sort(dvcs.list_staged(repository))
         # '+' marks files staged by this run, '|' files staged beforehand
         for path in staged:
             if path in git_files:
                 logging.debug('+ %s' % path)
             else:
                 logging.debug('| %s' % path)
         elapsed_stage = datetime.now() - start_stage
         logging.debug('ok (%s)' % elapsed_stage)
         logging.debug('%s staged in %s' % (len(staged), elapsed_stage))
     
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     logging.info('Adding new files')
     start_adds = datetime.now()
     elapsed_rounds_adds = []
     logging.info('Checking source files')
     # verify every source file up front so nothing is ingested if one is missing
     for rowd in rowds_new:
         rowd['src_path'] = os.path.join(csv_dir, rowd['basename_orig'])
         logging.debug('| %s' % rowd['src_path'])
         if not os.path.exists(rowd['src_path']):
             raise Exception('Missing file: %s' % rowd['src_path'])
     if log_path:
         logging.info('addfile logging to %s' % log_path)
     for n,rowd in enumerate(rowds_new):
         logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
         start_round = datetime.now()
         
         fidentifier = fidentifiers[rowd['id']]
         eidentifier = fidentifier_parents[fidentifier.id]
         entity = entities[eidentifier.id]
         logging.debug('| %s' % (entity))
 
         # Reset per row: on a dry run (or if the file is no longer new)
         # nothing is ingested, and the log line below must not hit an
         # unbound name or report a file object left over from another row.
         file_ = None
         if dryrun:
             pass
         elif Importer._file_is_new(fidentifier):
             # ingest
             # TODO make sure this updates entity.files
             file_,repo2,log2 = ingest.add_file(
                 entity,
                 rowd['src_path'],
                 fidentifier.parts['role'],
                 rowd,
                 git_name, git_mail, agent,
                 log_path=log_path,
                 show_staged=False
             )
         
         elapsed_round = datetime.now() - start_round
         elapsed_rounds_adds.append(elapsed_round)
         logging.debug('| %s (%s)' % (file_, elapsed_round))
     
     elapsed_adds = datetime.now() - start_adds
     logging.debug('%s added in %s' % (len(elapsed_rounds_adds), elapsed_adds))
     logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
     
     return git_files