Example 1
def stage_files(entity, git_files, annex_files, new_files, log, show_staged=True):
    # TODO move to DDR.dvcs?
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)
    # These vars will be used to determine if stage operation is successful.
    # If called in batch operation there may already be staged files.
    # stage_planned   Files added/modified by this function call
    # stage_already   Files that were already staged
    # stage_predicted List of staged files that should result from this operation.
    # stage_new       Files that are being added.
    stage_planned = git_files + annex_files
    stage_already = dvcs.list_staged(repo)
    stage_predicted = predict_staged(stage_already, stage_planned)
    stage_new = [x for x in stage_planned if x not in stage_already]
    log.ok('| %s files to stage:' % len(stage_planned))
    for sp in stage_planned:
        log.ok('|   %s' % sp)
    stage_ok = False
    staged = []
    try:
        log.ok('git stage')
        dvcs.stage(repo, git_files)
        log.ok('annex stage')
        dvcs.annex_stage(repo, annex_files)
        log.ok('ok')
        staged = dvcs.list_staged(repo)
    except Exception:
        # FAILED! print traceback to addfile log
        log.not_ok(traceback.format_exc().strip())
    finally:
        if show_staged:
            log.ok('| %s files staged:' % len(staged))
            log.ok('show_staged %s' % show_staged)
            for sp in staged:
                log.ok('|   %s' % sp)
        if len(staged) == len(stage_predicted):
            log.ok('| %s files staged (%s new, %s modified)' % (
                len(staged), len(stage_new), len(stage_already))
            )
            stage_ok = True
        else:
            log.not_ok('%s new files staged (should be %s)' % (
                len(staged), len(stage_predicted))
            )
        if not stage_ok:
            log.not_ok('File staging aborted. Cleaning up')
            # try to pick up the pieces
            # mv files back to tmp_dir
            # TODO Properly clean up git-annex-added files.
            #      This clause moves the *symlinks* to annex files but leaves
            #      the actual binaries in the .git/annex objects dir.
            for tmp,dest in new_files:
                if os.path.islink(dest):
                    log.not_ok('| link (not moving) %s' % dest)
                else:
                    log.not_ok('| mv %s %s' % (dest,tmp))
                    os.rename(dest,tmp)
            log.not_ok('finished cleanup. good luck...')
            log.crash('Add file aborted, see log file for details: %s' % log.logpath)
    return repo
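
predict_staged is referenced above but not defined in this example. A minimal sketch of what it might do, assuming it simply returns the union of the already-staged and planned file lists with duplicates removed (the behavior is inferred from how len(stage_predicted) is used, not confirmed by the source):

def predict_staged(already, planned):
    """Hypothetical helper: the list of files that should be staged after
    the operation -- files already staged plus files about to be staged,
    deduplicated.
    """
    return list(set(already + planned))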
Example 2
def commit_files(repo, message, git_files=[], annex_files=[]):
    """git-add and git-annex-add files and commit them
    
    @param repo: GitPython Repo object
    @param message: String
    @param git_files: List of filenames relative to repo root.
    @param annex_files: List of filenames relative to repo root.
    @return: GitPython Repo object
    """
    added = annex_files + git_files
    added.sort()
    logging.debug('    files added:         {}'.format(added))
    
    if annex_files:
        repo.git.annex('add', annex_files)
    if git_files:
        repo.index.add(git_files)
    
    staged = dvcs.list_staged(repo)
    staged.sort()
    logging.debug('    files staged:        {}'.format(staged))
    # TODO cancel commit if list of staged doesn't match list of files added?
    
    commit = repo.index.commit(message)
    logging.debug('    commit: {}'.format(commit.hexsha))
    
    committed = dvcs.list_committed(repo, commit)
    committed.sort()
    logging.debug('    files committed:     {}'.format(committed))
    # TODO complain if list of committed files doesn't match lists of added and staged files?
    
    return repo
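
A minimal usage sketch; the repository path and file names below are made up for illustration:

repo = dvcs.repository('/var/www/media/ddr/ddr-test-123')  # hypothetical path
repo = commit_files(
    repo,
    'Added entity file',
    git_files=['files/ddr-test-123-1/entity.json'],              # hypothetical
    annex_files=['files/ddr-test-123-1/files/master-abc123.tif'],  # hypothetical
)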
Example 3
def add_file_commit(entity, file_, repo, log, git_name, git_mail, agent):
    log.ok('add_file_commit(%s, %s, %s, %s, %s, %s)' % (file_, repo, log, git_name, git_mail, agent))
    staged = dvcs.list_staged(repo)
    modified = dvcs.list_modified(repo)
    if staged and not modified:
        log.ok('All files staged.')
        log.ok('Updating changelog')
        path = file_.path_abs.replace('{}/'.format(entity.path), '')
        changelog_messages = ['Added entity file {}'.format(path)]
        if agent:
            changelog_messages.append('@agent: %s' % agent)
        changelog.write_changelog_entry(
            entity.changelog_path, changelog_messages, git_name, git_mail)
        log.ok('git add %s' % entity.changelog_path_rel)
        git_files = [entity.changelog_path_rel]
        dvcs.stage(repo, git_files)
        
        log.ok('Committing')
        commit = dvcs.commit(repo, 'Added entity file(s)', agent)
        log.ok('commit: {}'.format(commit.hexsha))
        committed = dvcs.list_committed(repo, commit)
        committed.sort()
        log.ok('files committed:')
        for f in committed:
            log.ok('| %s' % f)
        
    else:
        log.not_ok('%s files staged, %s files modified' % (len(staged), len(modified)))
        log.not_ok('staged %s' % staged)
        log.not_ok('modified %s' % modified)
        log.not_ok('Cannot commit!')
        raise Exception('Could not commit because of %s unstaged files: %s' % (len(modified), modified))
    return file_, repo, log
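
Because the function raises when modified files remain, a caller might guard the commit step like this (a sketch; the surrounding ingest variables are assumed to exist):

try:
    file_, repo, log = add_file_commit(
        entity, file_, repo, log, git_name, git_mail, agent)
except Exception as err:
    log.not_ok('commit failed: %s' % err)
    raise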
Example 4
def merge(request, repo, org, cid):
    """
    Decides how to merge the various files in a merge conflict.
    Sends user around to different editors and things until everything is merged.
    """
    collection_path = Collection.collection_path(request,repo,org,cid)
    repository = dvcs.repository(collection_path)
    collection = Collection.from_json(collection_path)
    task_id = collection.locked()
    status = commands.status(collection_path)
    ahead = collection.repo_ahead()
    behind = collection.repo_behind()
    diverged = collection.repo_diverged()
    conflicted = collection.repo_conflicted()
    unmerged = dvcs.list_conflicted(repository)
    staged = dvcs.list_staged(repository)
    if request.method == 'POST':
        form = MergeCommitForm(request.POST)
        if form.is_valid():
            which = form.cleaned_data['which']
            if which == 'merge':
                dvcs.merge_commit(repository)
                committed = 1
            elif which == 'commit':
                dvcs.diverge_commit(repository)
                committed = 1
            else:
                committed = 0
            if committed:
                if task_id:
                    collection.unlock(task_id)
                messages.error(request, 'Merge conflict has been resolved. Please sync to make your changes available to other users.')
                return HttpResponseRedirect(reverse('webui-collection', args=[repo,org,cid]))
            return HttpResponseRedirect(reverse('webui-merge', args=[repo,org,cid]))
    else:
        which = 'unknown'
        if conflicted and not unmerged:
            which = 'merge'
        elif diverged and staged:
            which = 'commit'
        form = MergeCommitForm({'path':collection_path, 'which':which,})
    return render_to_response(
        'webui/merge/index.html',
        {'repo': repo,
         'org': org,
         'cid': cid,
         'collection_path': collection_path,
         'collection': collection,
         'status': status,
         'conflicted': conflicted,
         'ahead': ahead,
         'behind': behind,
         'unmerged': unmerged,
         'diverged': diverged,
         'staged': staged,
         'form': form,},
        context_instance=RequestContext(request, processors=[])
    )
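
MergeCommitForm is not shown in this example. A hypothetical minimal form covering just the two fields the view uses ('path' and 'which'); the real form may differ:

from django import forms

class MergeCommitForm(forms.Form):
    # Hypothetical: only the fields referenced by the merge view above
    path = forms.CharField(widget=forms.HiddenInput)
    which = forms.ChoiceField(
        choices=[('merge', 'merge'), ('commit', 'commit'), ('unknown', 'unknown')],
    )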
Example 5
def repo_status(repo, log):
    """Logs staged, modified, and untracked files and returns same
    
    @param repo: GitPython Repo object
    @param log: log object with ok() and not_ok() methods
    @returns: staged,modified,untracked
    """
    log.ok('| %s' % repo)
    staged = dvcs.list_staged(repo)
    modified = dvcs.list_modified(repo)
    untracked = dvcs.list_untracked(repo)
    log.ok('|   %s staged, %s modified, %s untracked' % (
        len(staged), len(modified), len(untracked),
    ))
    for path in staged:
        log.ok('|   staged: %s' % path)
    for path in modified:
        log.ok('|   modified: %s' % path)
    for path in untracked:
        log.ok('|   untracked: %s' % path)
    return staged, modified, untracked
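
A sketch of how the returned tuple might be used to bail out of a batch run (the abort behavior is an assumption, not from the source):

staged, modified, untracked = repo_status(repo, log)
if staged or modified:
    log.not_ok('repo is dirty; aborting batch operation')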
Example 6
def check_repository(cidentifier):
    """Load repository, check for staged or modified files
    
    Entity.add_files will not work properly if the repo contains staged
    or modified files.
    
    Results dict includes:
    - 'passed': boolean
    - 'repo': GitPython repository
    - 'staged': list of staged files
    - 'modified': list of modified files
    
    @param cidentifier: Identifier
    @returns: dict
    """
    logging.info('Checking repository')
    passed = False
    repo = dvcs.repository(cidentifier.path_abs())
    logging.info(repo)
    staged = dvcs.list_staged(repo)
    if staged:
        logging.error('*** Staged files in repo %s' % repo.working_dir)
        for f in staged:
            logging.error('*** %s' % f)
    modified = dvcs.list_modified(repo)
    if modified:
        logging.error('Modified files in repo: %s' % repo.working_dir)
        for f in modified:
            logging.error('*** %s' % f)
    if repo and (not (staged or modified)):
        passed = True
        logging.info('ok')
    else:
        logging.error('FAIL')
    return {
        'passed': passed,
        'repo': repo,
        'staged': staged,
        'modified': modified,
    }
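
A hypothetical pre-flight check before calling Entity.add_files, following the docstring:

result = check_repository(cidentifier)
if not result['passed']:
    raise Exception(
        'Uncommitted changes in %s: %s staged, %s modified' % (
            result['repo'].working_dir,
            len(result['staged']), len(result['modified'])))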
Example 7
def test_update_files(tmpdir, collection, test_csv_dir, test_files_dir):
    hashes_before = collect_hashes(collection.path_abs)
    file_csv_path = os.path.join(test_csv_dir, 'ddrimport-file-update.csv')
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(test_files_dir, 'ddrimport-file-update.log')
    out = batch.Importer.import_files(
        file_csv_path,
        collection.identifier,
        VOCABS_URL,
        GIT_USER,
        GIT_MAIL,
        AGENT,
        log_path=log_path,
        tmp_dir=test_files_dir,
    )
    repo = dvcs.repository(collection.path_abs)
    staged = sorted(dvcs.list_staged(repo))
    # test
    unstaged = []
    for path in EXPECTED_UPDATE_FILES:
        if path not in staged:
            unstaged.append(path)
    unstaged = sorted(unstaged)
    for n, path in enumerate(unstaged):
        print('UNSTAGED %s %s' % (n + 1, path))
    print(repo)
    print(log_path)
    assert not unstaged
    # save and commit
    repo = dvcs.repository(collection.path_abs)
    commit = repo.index.commit('test_update_files')
    # test hashes present
    check_file_hashes(collection.path_abs)
    # test hashes not modified
    hashes_after = collect_hashes(collection.path_abs)
    check_hashes(hashes_before, hashes_after)
    # ensure no binaries in .git/objects
    print('log_path %s' % log_path)
    assert not find_binaries_in_git_objects(repo)
    assert not find_missing_annex_binaries(repo)
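
collect_hashes and check_hashes are test helpers not shown here. A plausible sketch, assuming they verify that files present before the import are bit-identical afterwards (the names are from the source; the implementation is guessed):

import hashlib
import os

def collect_hashes(base_dir):
    """Map each regular file under base_dir to its SHA-1 (hypothetical)."""
    hashes = {}
    for root, dirs, files in os.walk(base_dir):
        if '.git' in dirs:
            dirs.remove('.git')  # skip git internals
        for name in files:
            path = os.path.join(root, name)
            if os.path.isfile(path) and not os.path.islink(path):
                with open(path, 'rb') as f:
                    hashes[path] = hashlib.sha1(f.read()).hexdigest()
    return hashes

def check_hashes(before, after):
    """Assert that no pre-existing file's content changed (hypothetical)."""
    for path, digest in before.items():
        assert after.get(path, digest) == digest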
Example 8
def import_files(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, log_path=None, dryrun=False):
    """Adds or updates files from a CSV file
    
    TODO how to handle excluded fields like XMP???
    
    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param log_path: str Absolute path to addfile log for all files
    @param dryrun: boolean
    @returns: list of staged files (relative paths)
    """
    logging.info('batch import files ----------------------------')
    
    # TODO hard-coded model name...
    model = 'file'
    
    csv_dir = os.path.dirname(csv_path)
    logging.debug('csv_dir %s' % csv_dir)

    # TODO this still knows too much about entities and files...
    entity_class = identifier.class_for_name(
        identifier.MODEL_CLASSES['entity']['module'],
        identifier.MODEL_CLASSES['entity']['class']
    )
    logging.debug('entity_class %s' % entity_class)
    
    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))
    
    # check for modified or uncommitted files in repo
    repository = dvcs.repository(cidentifier.path_abs())
    logging.debug(repository)

    fidentifiers = {
        rowd['id']: identifier.Identifier(
            id=rowd['id'],
            base_path=cidentifier.basepath
        )
        for rowd in rowds
    }
    fidentifier_parents = {
        fi.id: Importer._fidentifier_parent(fi)
        for fi in fidentifiers.values()
    }
    # eidentifiers, removing duplicates
    eidentifiers = list(set(fidentifier_parents.values()))
    entities = {}
    bad_entities = []
    for eidentifier in eidentifiers:
        if os.path.exists(eidentifier.path_abs()):
            entity = eidentifier.object()
            entities[eidentifier.id] = entity
        else:
            if eidentifier.id not in bad_entities:
                bad_entities.append(eidentifier.id)
    if bad_entities:
        for f in bad_entities:
            logging.error('    %s missing' % f)
        raise Exception('%s entities could not be loaded! - IMPORT CANCELLED!' % len(bad_entities))

    # separate into new and existing lists
    rowds_new = []
    rowds_existing = []
    for n,rowd in enumerate(rowds):
        if Importer._file_is_new(fidentifiers[rowd['id']]):
            rowds_new.append(rowd)
        else:
            rowds_existing.append(rowd)
    
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Updating existing files')
    start_updates = datetime.now()
    git_files = []
    updated = []
    elapsed_rounds_updates = []
    staged = []
    obj_metadata = None
    for n,rowd in enumerate(rowds_existing):
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()
        
        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        file_ = fidentifier.object()
        modified = file_.load_csv(rowd)
        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                fidentifier.fields_module(),
                repository.working_dir
            )
        
        if dryrun:
            pass
        elif modified:
            logging.debug('    writing %s' % file_.json_path)
            file_.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(file_.json_path_rel)
            git_files.append(entity.changelog_path_rel)
            updated.append(file_)
        
        elapsed_round = datetime.now() - start_round
        elapsed_rounds_updates.append(elapsed_round)
        logging.debug('| %s (%s)' % (fidentifier, elapsed_round))
    
    elapsed_updates = datetime.now() - start_updates
    logging.debug('%s updated in %s' % (len(elapsed_rounds_updates), elapsed_updates))
    
    if dryrun:
        pass
    elif git_files:
        logging.info('Staging %s modified files' % len(git_files))
        start_stage = datetime.now()
        dvcs.stage(repository, git_files)
        staged = util.natural_sort(dvcs.list_staged(repository))
        for path in staged:
            if path in git_files:
                logging.debug('+ %s' % path)
            else:
                logging.debug('| %s' % path)
        elapsed_stage = datetime.now() - start_stage
        logging.debug('ok (%s)' % elapsed_stage)
        logging.debug('%s staged in %s' % (len(staged), elapsed_stage))
    
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Adding new files')
    start_adds = datetime.now()
    elapsed_rounds_adds = []
    logging.info('Checking source files')
    for rowd in rowds_new:
        rowd['src_path'] = os.path.join(csv_dir, rowd['basename_orig'])
        logging.debug('| %s' % rowd['src_path'])
        if not os.path.exists(rowd['src_path']):
            raise Exception('Missing file: %s' % rowd['src_path'])
    if log_path:
        logging.info('addfile logging to %s' % log_path)
    file_ = None  # otherwise unbound in the debug line below when a row is skipped (e.g. dry run)
    for n,rowd in enumerate(rowds_new):
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()
        
        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        logging.debug('| %s' % (entity))

        if dryrun:
            pass
        elif Importer._file_is_new(fidentifier):
            # ingest
            # TODO make sure this updates entity.files
            file_,repo2,log2 = ingest.add_file(
                entity,
                rowd['src_path'],
                fidentifier.parts['role'],
                rowd,
                git_name, git_mail, agent,
                log_path=log_path,
                show_staged=False
            )
        
        elapsed_round = datetime.now() - start_round
        elapsed_rounds_adds.append(elapsed_round)
        logging.debug('| %s (%s)' % (file_, elapsed_round))
    
    elapsed_adds = datetime.now() - start_adds
    logging.debug('%s added in %s' % (len(elapsed_rounds_adds), elapsed_adds))
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    
    return git_files
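
A hypothetical invocation; the IDs, paths, and credentials are made up, and dryrun=True walks the CSV without writing anything:

cidentifier = identifier.Identifier(
    id='ddr-test-123', base_path='/var/www/media/ddr')  # hypothetical collection
staged = import_files(
    '/tmp/ddr-test-123-files.csv',   # hypothetical CSV path
    cidentifier,
    '/opt/ddr-vocabs',               # hypothetical vocabs dir
    'A User', 'auser@example.org', 'ddr-cmdln',
    dryrun=True,
)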
Example 9
def import_entities(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, dryrun=False):
    """Adds or updates entities from a CSV file
    
    Running function multiple times with the same CSV file is idempotent.
    After the initial pass, files will only be modified if the CSV data
    has been updated.
    
    This function writes and stages files but does not commit them!
    That is left to the user or to another function.
    
    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param dryrun: boolean
    @returns: list of updated entities
    """
    logging.info('------------------------------------------------------------------------')
    logging.info('batch import entity')
    model = 'entity'
    
    repository = dvcs.repository(cidentifier.path_abs())
    logging.info(repository)
    
    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))
    
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Importing')
    start_updates = datetime.now()
    git_files = []
    updated = []
    elapsed_rounds = []
    obj_metadata = None
    
    if dryrun:
        logging.info('Dry run - no modifications')
    for n,rowd in enumerate(rowds):
        logging.info('%s/%s - %s' % (n+1, len(rowds), rowd['id']))
        start_round = datetime.now()
        
        eidentifier = identifier.Identifier(id=rowd['id'], base_path=cidentifier.basepath)
        # if there is an existing object it will be loaded
        entity = eidentifier.object()
        if not entity:
            entity = models.Entity.create(eidentifier.path_abs(), eidentifier)
        modified = entity.load_csv(rowd)
        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                eidentifier.fields_module(),
                repository.working_dir
            )
        
        if dryrun:
            pass
        elif modified:
            # write files
            if not os.path.exists(entity.path_abs):
                os.makedirs(entity.path_abs)
            logging.debug('    writing %s' % entity.json_path)
            entity.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            # TODO write all additions to changelog at one time
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(entity.json_path_rel)
            git_files.append(entity.changelog_path_rel)
            updated.append(entity)
        
        elapsed_round = datetime.now() - start_round
        elapsed_rounds.append(elapsed_round)
        logging.debug('| %s (%s)' % (eidentifier, elapsed_round))

    if dryrun:
        logging.info('Dry run - no modifications')
    elif updated:
        logging.info('Staging %s modified files' % len(git_files))
        start_stage = datetime.now()
        dvcs.stage(repository, git_files)
        for path in util.natural_sort(dvcs.list_staged(repository)):
            if path in git_files:
                logging.debug('+ %s' % path)
            else:
                logging.debug('| %s' % path)
        elapsed_stage = datetime.now() - start_stage
        logging.debug('ok (%s)' % elapsed_stage)
    
    elapsed_updates = datetime.now() - start_updates
    logging.debug('%s updated in %s' % (len(elapsed_rounds), elapsed_updates))
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    
    return updated
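
Per the docstring, import_entities stages files but does not commit them. A sketch of the follow-up commit step a caller might run (the commit message is illustrative):

updated = import_entities(csv_path, cidentifier, vocabs_path,
                          git_name, git_mail, agent)
if updated:
    repo = dvcs.repository(cidentifier.path_abs())
    commit = repo.index.commit('Batch-imported %s entities' % len(updated))
    logging.info('commit %s' % commit.hexsha)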
Example 10
def stage_files(entity,
                git_files,
                annex_files,
                new_files,
                log,
                show_staged=True):
    # TODO move to DDR.dvcs?
    repo = dvcs.repository(entity.collection_path)
    log.ok('| repo %s' % repo)
    # These vars will be used to determine if stage operation is successful.
    # If called in batch operation there may already be staged files.
    # stage_planned   Files added/modified by this function call
    # stage_already   Files that were already staged
    # stage_predicted List of staged files that should result from this operation.
    # stage_new       Files that are being added.
    stage_planned = git_files + annex_files
    stage_already = dvcs.list_staged(repo)
    stage_predicted = predict_staged(stage_already, stage_planned)
    stage_new = [x for x in stage_planned if x not in stage_already]
    log.ok('| %s files to stage:' % len(stage_planned))
    for sp in stage_planned:
        log.ok('|   %s' % sp)
    stage_ok = False
    staged = []
    try:
        log.ok('git stage')
        dvcs.stage(repo, git_files)
        log.ok('annex stage')
        dvcs.annex_stage(repo, annex_files)
        log.ok('ok')
        staged = dvcs.list_staged(repo)
    except Exception:
        # FAILED! print traceback to addfile log
        log.not_ok(traceback.format_exc().strip())
    finally:
        if show_staged:
            log.ok('| %s files staged:' % len(staged))
            log.ok('show_staged %s' % show_staged)
            for sp in staged:
                log.ok('|   %s' % sp)
        if len(staged) == len(stage_predicted):
            log.ok('| %s files staged (%s new, %s modified)' %
                   (len(staged), len(stage_new), len(stage_already)))
            stage_ok = True
        else:
            log.not_ok('%s new files staged (should be %s)' %
                       (len(staged), len(stage_predicted)))
        if not stage_ok:
            log.not_ok('File staging aborted. Cleaning up')
            # try to pick up the pieces
            # mv files back to tmp_dir
            # TODO Properly clean up git-annex-added files.
            #      This clause moves the *symlinks* to annex files but leaves
            #      the actual binaries in the .git/annex objects dir.
            for tmp, dest in new_files:
                if os.path.islink(dest):
                    log.not_ok('| link (not moving) %s' % dest)
                else:
                    log.not_ok('| mv %s %s' % (dest, tmp))
                    shutil.move(dest, tmp)
            log.not_ok('finished cleanup. good luck...')
            log.crash('Add file aborted, see log file for details: %s' %
                      log.logpath)
    return repo
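
This variant differs from Example 1 only in formatting and in the cleanup step: it uses shutil.move() rather than os.rename(), which also works when the temp directory and the destination live on different filesystems (os.rename raises OSError across devices).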
Example 11
def test_files_import_external_nohashes_rename(tmpdir, collection,
                                               test_csv_dir, test_files_dir):
    """Test importing *external* files with *no* hash cols but binaries present
    
    If file is external, binary is present, and no hash cols, rename binary in place
    
    ddr-testing-123-1-master-684e15e967
    ddr-testing-123-2-master-b9773b9aef
    """
    print('collection_path %s' % collection.path_abs)
    file_csv_path = os.path.join(
        test_csv_dir, 'ddrimport-files-import-external-nohashes-rename.csv')
    print('file_csv_path %s' % file_csv_path)
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(
        test_files_dir, 'ddrimport-files-import-external-nohashes-rename.log')
    print('log_path %s' % log_path)

    print('test_files_dir %s' % test_files_dir)
    for path in os.listdir(test_files_dir):
        print(path)

    # copy test files so later tests don't crash
    # replace basename_orig in CSV with copied file
    # and rewrite CSV
    headers, rowds, csv_errs = csvfile.make_rowds(
        fileio.read_csv(file_csv_path))
    renamed_files = []
    copied_files = []
    ingested_files = []
    access_files = []
    for rowd in rowds:
        print(rowd)
        src_file = os.path.join(test_files_dir, rowd['basename_orig'])
        path, ext = os.path.splitext(src_file)
        dest_file = path + '-rename' + ext
        print('shutil.copy(%s, %s)' % (src_file, dest_file))
        shutil.copy(src_file, dest_file)
        if os.path.exists(dest_file):
            renamed_files.append(os.path.basename(dest_file))
        else:
            print('could not copy')
            assert False
        rowd['basename_orig'] = dest_file
        # figure out new file ID
        sha1 = util.file_hash(dest_file, 'sha1')[:10]
        idparts = rowd['id'].split('-') + [rowd['role']] + [sha1]
        final_file = '-'.join(idparts) + ext
        final_access = '-'.join(idparts + ['a.jpg'])
        copied_files.append(final_file)
        ingested_files.append(final_file)
        access_files.append(final_access)
    headers, rows = csvfile.make_rows(rowds)
    fileio.write_csv(file_csv_path, headers, rows)

    out = batch.Importer.import_files(
        file_csv_path,
        collection.identifier,
        VOCABS_URL,
        GIT_USER,
        GIT_MAIL,
        AGENT,
        log_path=log_path,
        tmp_dir=test_files_dir,
    )
    # save and commit
    repo = dvcs.repository(collection.path_abs)

    print('STAGED FILES')
    staged_files = sorted(dvcs.list_staged(repo))
    for path in staged_files:
        print('  %s' % path)

    # after import_files, we expect to see
    offenses = 0
    # assert final_file in os.listdir(test_files_dir)

    print('test_files_dir')
    test_files = os.listdir(test_files_dir)
    for path in copied_files:
        print(path)
        if path not in test_files:
            print('RENAMED SRC FILE NOT PRESENT %s' % path)
            offenses += 1
    # assert files not ingested
    # assert no access files created
    for path in staged_files:
        if os.path.basename(path) in ingested_files:
            print('ERROR %s HAS BEEN IMPORTED!!' % path)
            offenses += 1
        if os.path.basename(path) in access_files:
            print('ERROR %s ACCESS FILE GENERATED!!' % path)
            offenses += 1

    commit = repo.index.commit('test_files_import_external_nohashes_rename')
    print('commit %s' % commit)
    if offenses:
        assert False
    # test hashes present
    check_file_hashes(collection.path_abs)
    # ensure no binaries in .git/objects
    print('log_path %s' % log_path)
    assert not find_binaries_in_git_objects(repo)
    assert not find_missing_annex_binaries(repo)
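
find_binaries_in_git_objects and find_missing_annex_binaries are test helpers not shown in the example. A rough sketch of the first one, assuming "binary in .git/objects" means any loose object above a size threshold (the heuristic is an assumption; the real helper may inspect blob types instead):

import os

def find_binaries_in_git_objects(repo, threshold=100 * 1024):
    """Hypothetical: list loose objects in .git/objects larger than threshold."""
    binaries = []
    objects_dir = os.path.join(repo.git_dir, 'objects')
    for root, dirs, files in os.walk(objects_dir):
        for skip in ('pack', 'info'):
            if skip in dirs:
                dirs.remove(skip)  # only scan loose objects
        for name in files:
            path = os.path.join(root, name)
            if os.path.getsize(path) > threshold:
                binaries.append(path)
    return binaries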