Example #1
0
def checksums(src_path, log):
    """Calculate md5/sha1/sha256 checksums for a file, logging each one.

    Crashes via log.crash if any hash could not be computed.

    @param src_path: str Absolute path to the file to hash
    @param log: logger object with .ok() and .crash() methods
    @returns: (md5, sha1, sha256) tuple of str
    """
    # Loop instead of three semicolon-joined statement pairs; log output
    # ('| md5: ...' etc.) is byte-identical to the original.
    hashes = {}
    for algo in ('md5', 'sha1', 'sha256'):
        hashes[algo] = util.file_hash(src_path, algo)
        log.ok('| %s: %s' % (algo, hashes[algo]))
    if not all(hashes.values()):
        log.crash('Could not calculate checksums')
    return hashes['md5'], hashes['sha1'], hashes['sha256']
Example #2
0
def checksums(src_path, log):
    """Compute and log the md5, sha1, and sha256 hashes of src_path.

    @param src_path: str Absolute path to file
    @param log: logger with .ok() and .crash()
    @returns: (md5, sha1, sha256)
    """
    md5 = util.file_hash(src_path, 'md5')
    log.ok('| md5: %s' % md5)
    sha1 = util.file_hash(src_path, 'sha1')
    log.ok('| sha1: %s' % sha1)
    sha256 = util.file_hash(src_path, 'sha256')
    log.ok('| sha256: %s' % sha256)
    # Bail out hard if any of the three came back empty/None.
    if not (sha1 and md5 and sha256):
        log.crash('Could not calculate checksums')
    return md5, sha1, sha256
Example #3
0
def check_file(json_path, verbose=False):
    """Compare a File's recorded checksums against its binary on disk.

    Returns ['missing', path] if the binary is absent.  Otherwise returns a
    list of the names of any hash fields that do not match the file on disk
    (with json_path appended when there is at least one mismatch), or an
    empty list if everything matches.

    @param json_path: str Path to the file's .json metadata
    @param verbose: bool (currently unused)
    @returns: list of str
    """
    fi = Identifier(json_path)
    f = models.File.from_identifier(fi)

    if not os.path.exists(f.path_abs):
        result = ['missing', f.path_abs]
        print(result)
        return result

    mismatches = []
    # BUG FIX: original wrote mismatches.append['md5'] etc., which subscripts
    # the bound method and raises TypeError; .append must be *called*.
    md5 = util.file_hash(f.path_abs, 'md5')
    if md5 != f.md5:
        mismatches.append('md5')
    sha1 = util.file_hash(f.path_abs, 'sha1')
    if sha1 != f.sha1:
        mismatches.append('sha1')
    sha256 = util.file_hash(f.path_abs, 'sha256')
    if sha256 != f.sha256:
        mismatches.append('sha256')
    # SHA256 hash embedded in the git-annex symlink target's filename.
    # NOTE(review): assumes the annex key contains '--<sha256>' — confirm
    # the backend naming convention before relying on index [1].
    annex_sha256 = os.path.basename(
        os.path.realpath(f.path_abs)
    ).split('--')[1]
    if sha256 != annex_sha256:
        mismatches.append('annex_sha256')

    if mismatches:
        mismatches.append(json_path)
        print(mismatches)

    return mismatches
Example #4
0
def check_file(json_path, verbose=False):
    """Compare a File's recorded checksums against its binary on disk.

    Returns ['missing', path] if the binary is absent.  Otherwise returns a
    list of the names of any hash fields that do not match the file on disk
    (with json_path appended when there is at least one mismatch), or an
    empty list if everything matches.

    @param json_path: str Path to the file's .json metadata
    @param verbose: bool (currently unused)
    @returns: list of str
    """
    fi = Identifier(json_path)
    f = models.File.from_identifier(fi)

    if not os.path.exists(f.path_abs):
        result = ['missing', f.path_abs]
        # BUG FIX: Python 2 'print result' statement replaced with a call,
        # valid in both Python 2 and 3.
        print(result)
        return result

    mismatches = []
    # BUG FIX: original wrote mismatches.append['md5'] etc., which subscripts
    # the bound method and raises TypeError; .append must be *called*.
    md5 = util.file_hash(f.path_abs, 'md5')
    if md5 != f.md5:
        mismatches.append('md5')
    sha1 = util.file_hash(f.path_abs, 'sha1')
    if sha1 != f.sha1:
        mismatches.append('sha1')
    sha256 = util.file_hash(f.path_abs, 'sha256')
    if sha256 != f.sha256:
        mismatches.append('sha256')
    # SHA256 hash embedded in the git-annex symlink target's filename.
    # NOTE(review): assumes the annex key contains '--<sha256>' — confirm
    # the backend naming convention before relying on index [1].
    annex_sha256 = os.path.basename(
        os.path.realpath(f.path_abs)
    ).split('--')[1]
    if sha256 != annex_sha256:
        mismatches.append('annex_sha256')

    if mismatches:
        mismatches.append(json_path)
        print(mismatches)

    return mismatches
Example #5
0
def test_file_hash():
    """util.file_hash returns the expected sha1/sha256/md5 for known content.

    Expected digests are the standard hashes of the ASCII string 'hash'.
    """
    path = os.path.join(TESTING_BASE_DIR, 'test-hash-%s' % datetime.now(config.TZ).strftime('%Y%m%dT%H%M%S'))
    text = 'hash'
    sha1 = '2346ad27d7568ba9896f1b7da6b5991251debdf2'
    sha256 = 'd04b98f48e8f8bcc15c6ae5ac050801cd6dcfd428fb5f9e65c4e16e7807340fa'
    md5 = '0800fc577294c34e0b28ad2839435945'
    with open(path, 'w') as f:
        f.write(text)
    # FIX: remove the temp file even when an assertion fails; previously a
    # failing assert left the file behind in TESTING_BASE_DIR.
    try:
        assert util.file_hash(path, 'sha1') == sha1
        assert util.file_hash(path, 'sha256') == sha256
        assert util.file_hash(path, 'md5') == md5
    finally:
        os.remove(path)
Example #6
0
def test_file_hash(tmpdir):
    """util.file_hash returns the expected sha1/sha256/md5 for known content.

    Expected digests are the standard hashes of the ASCII string 'hash'.

    @param tmpdir: pytest tmpdir fixture providing a per-test directory
    """
    path = str(tmpdir / 'test-hash')
    text = 'hash'
    sha1 = '2346ad27d7568ba9896f1b7da6b5991251debdf2'
    sha256 = 'd04b98f48e8f8bcc15c6ae5ac050801cd6dcfd428fb5f9e65c4e16e7807340fa'
    md5 = '0800fc577294c34e0b28ad2839435945'
    with open(path, 'w') as f:
        f.write(text)
    # FIX: guarantee cleanup even when an assertion fails; previously a
    # failing assert skipped os.remove.  (tmpdir would eventually reap it,
    # but explicit removal keeps the fixture dir clean for later checks.)
    try:
        assert util.file_hash(path, 'sha1') == sha1
        assert util.file_hash(path, 'sha256') == sha256
        assert util.file_hash(path, 'md5') == md5
    finally:
        os.remove(path)
Example #7
0
 def checksums(self, algo, force_read=False):
     """Calculates hash checksums for the Entity's files.
     
     Gets hashes from FILE.json metadata if the file(s) are absent
     from the filesystem (i.e. git-annex file symlinks).
     Overrides DDR.models.Entity.checksums.
     
     @param algo: str
     @param force_read: bool Traverse filesystem if true.
     @returns: list of (checksum, filepath) tuples
     """
     checksums = []
     if algo not in self.checksum_algorithms():
         raise Error('BAD ALGORITHM CHOICE: {}'.format(algo))
     for f in self._file_paths():
         cs = None
         ext = None
         pathname = os.path.splitext(f)[0]
         # from metadata file
         json_path = os.path.join(self.files_path, f)
         for field in json.loads(fileio.read_text(json_path)):
             for k, v in field.items():
                 if k == algo:
                     cs = v
                 if k == 'basename_orig':
                     ext = os.path.splitext(v)[-1]
         # BUG FIX: if the metadata has no 'basename_orig' field, ext stays
         # None and (pathname + ext) raised TypeError; fall back to ''.
         if ext is None:
             ext = ''
         fpath = pathname + ext
         if force_read and os.path.exists(fpath):
             # from filesystem: git-annex file content is present,
             # so prefer a freshly computed hash over the metadata value
             cs = util.file_hash(fpath, algo)
         if cs:
             checksums.append((cs, os.path.basename(fpath)))
     return checksums
Example #8
0
 def file_name( entity, path_abs, role, sha1=None ):
     """Generate a new name for the specified file; Use only when ingesting a file!
     
     rename files to standard names on ingest:
     %{entity_id%}-%{role}-%{sha1}.%{ext}
     example: ddr-testing-56-101-master-fb73f9de29.jpg
     
     SHA1 is optional so it can be passed in by a calling process that has already
     generated it.
     
     @param entity
     @param path_abs: Absolute path to the file.
     @param role
     @param sha1: SHA1 hash (optional)
     @returns: str filename, or None if file is unreadable/unhashable
     """
     # BUG FIX: original tested the bare function object os.path.exists
     # (always truthy) instead of calling os.path.exists(path_abs).
     if os.path.exists(path_abs) and os.access(path_abs, os.R_OK):
         ext = os.path.splitext(path_abs)[1]
         if not sha1:
             sha1 = util.file_hash(path_abs, 'sha1')
         if sha1:
             # entity id parts + role + first 10 chars of the SHA1
             idparts = list(entity.idparts)
             idparts.append(role)
             idparts.append(sha1[:10])
             name = '{}{}'.format(Identifier(parts=idparts).id, ext)
             return name
     return None
Example #9
0
 def checksums(self, algo, force_read=False):
     """Calculates hash checksums for the Entity's files.
     
     Gets hashes from FILE.json metadata if the file(s) are absent
     from the filesystem (i.e. git-annex file symlinks).
     Overrides DDR.models.Entity.checksums.
     
     @param algo: str
     @param force_read: bool Traverse filesystem if true.
     @returns: list of (checksum, filepath) tuples
     """
     checksums = []
     if algo not in self.checksum_algorithms():
         raise Error('BAD ALGORITHM CHOICE: {}'.format(algo))
     for f in self._file_paths():
         cs = None
         ext = None
         pathname = os.path.splitext(f)[0]
         # from metadata file
         json_path = os.path.join(self.files_path, f)
         for field in json.loads(fileio.read_text(json_path)):
             # FIX: dict.iteritems() is Python 2 only; .items() works in both.
             for k, v in field.items():
                 if k == algo:
                     cs = v
                 if k == 'basename_orig':
                     ext = os.path.splitext(v)[-1]
         # BUG FIX: if the metadata has no 'basename_orig' field, ext stays
         # None and (pathname + ext) raised TypeError; fall back to ''.
         if ext is None:
             ext = ''
         fpath = pathname + ext
         if force_read and os.path.exists(fpath):
             # from filesystem: git-annex file content is present,
             # so prefer a freshly computed hash over the metadata value
             cs = util.file_hash(fpath, algo)
         if cs:
             checksums.append((cs, os.path.basename(fpath)))
     return checksums
Example #10
0
 def file_name(entity, path_abs, role, sha1=None):
     """Generate a new name for the specified file; Use only when ingesting a file!
     
     rename files to standard names on ingest:
     %{entity_id%}-%{role}-%{sha1}.%{ext}
     example: ddr-testing-56-101-master-fb73f9de29.jpg
     
     SHA1 is optional so it can be passed in by a calling process that has already
     generated it.
     
     @param entity
     @param path_abs: Absolute path to the file.
     @param role
     @param sha1: SHA1 hash (optional)
     @returns: str filename, or None if file is unreadable/unhashable
     """
     # BUG FIX: original tested the bare function object os.path.exists
     # (always truthy) instead of calling os.path.exists(path_abs).
     if os.path.exists(path_abs) and os.access(path_abs, os.R_OK):
         ext = os.path.splitext(path_abs)[1]
         if not sha1:
             sha1 = util.file_hash(path_abs, 'sha1')
         if sha1:
             # entity id parts + role + first 10 chars of the SHA1
             idparts = list(entity.idparts)
             idparts.append(role)
             idparts.append(sha1[:10])
             name = '{}{}'.format(Identifier(parts=idparts).id, ext)
             return name
     return None
Example #11
0
def add_access( entity, ddrfile, src_path, git_name, git_mail, agent='', log_path=None, show_staged=True ):
    """Generate new access file for entity
    
    This method breaks out of OOP and manipulates entity.json directly.
    Thus it needs to lock to prevent other edits while it does its thing.
    Writes a log to ${entity}/addfile.log, formatted in pseudo-TAP.
    This log is returned along with a File object.
    
    TODO Refactor this function! It is waaay too long!
    
    @param entity: Entity object
    @param ddrfile: File
    @param src_path: str Absolute path to the access file (ddrfile.path_abs)
    @param git_name: Username of git committer.
    @param git_mail: Email of git committer.
    @param agent: str (optional) Name of software making the change.
    @param log_path: str (optional) Absolute path to addfile log
    @param show_staged: boolean Log list of staged files
    @returns: file_,repo,log,next_op
    """
    # NOTE(review): f is never used in this function — looks vestigial;
    # confirm before removing.  git_name/git_mail/agent are also unused
    # here; presumably consumed by the later commit step — verify.
    f = None
    repo = None
    if log_path:
        log = addfile_logger(log_path=log_path)
    else:
        log = addfile_logger(identifier=entity.identifier)
    
    log.ok('------------------------------------------------------------------------')
    log.ok('DDR.models.Entity.add_access: START')
    log.ok('entity: %s' % entity.id)
    log.ok('ddrfile: %s' % ddrfile)
    
    log.ok('Checking files/dirs')
    check_dir('| src_path', src_path, log, mkdir=False, perm=os.R_OK)
    
    log.ok('Identifier')
    log.ok('| file_id %s' % ddrfile.id)
    log.ok('| basepath %s' % entity.identifier.basepath)
    fidentifier = identifier.Identifier(ddrfile.id, entity.identifier.basepath)
    log.ok('| identifier %s' % fidentifier)
    file_class = fidentifier.object_class()

    # Work happens in a temporary dir under MEDIA_BASE, then files are
    # moved into the collection repo only after everything succeeds.
    dest_path = destination_path(src_path, entity.files_path, fidentifier)
    tmp_path = temporary_path(src_path, config.MEDIA_BASE, fidentifier)
    tmp_path_renamed = temporary_path_renamed(tmp_path, dest_path)
    access_dest_path = access_path(file_class, tmp_path_renamed)
    dest_dir = os.path.dirname(dest_path)
    tmp_dir = os.path.dirname(tmp_path)
    # this is the final path of the access file
    access_final_path = ddrfile.identifier.path_abs('access')
    
    log.ok('Checking files/dirs')
    check_dir('| tmp_dir', tmp_dir, log, mkdir=True, perm=os.W_OK)
    check_dir('| dest_dir', dest_dir, log, mkdir=True, perm=os.W_OK)
    
    log.ok('Making access file')
    tmp_access_path = make_access_file(src_path, access_dest_path, log)
    
    log.ok('File object')
    file_ = ddrfile
    log.ok('| file_ %s' % file_)
    
    # if new tmp_access_path and access_dest_path are same, declare success and quit
    # Compare SHA1s of the existing and the freshly generated access file;
    # if identical there is nothing to stage and we return next_op='pass'.
    existing_sha1 = None
    tmp_sha1 = None
    if os.path.exists(access_final_path):
        # if src_path is an existing file, it's probably a git-annex symlink
        # we want to compare two actual files, not a file and a symlink
        access_final_path_real = os.path.realpath(access_final_path)
        existing_sha1 = util.file_hash(access_final_path_real, 'sha1')
        log.ok('| existing_sha1: %s' % existing_sha1)
    if os.path.exists(access_dest_path):
        tmp_sha1 = util.file_hash(access_dest_path, 'sha1')
        log.ok('| tmp_sha1:      %s' % tmp_sha1)
    if tmp_sha1 == existing_sha1:
        log.ok('New access file same as existing. Nothing to see here, move along.')
        return file_,repo,log,'pass'
    
    log.ok('Writing object metadata')
    tmp_file_json = write_object_metadata(file_, tmp_dir, log)
    #tmp_entity_json
    
    # WE ARE NOW MAKING CHANGES TO THE REPO ------------------------
    
    log.ok('Moving files to dest_dir')
    new_files = []
    if tmp_access_path and os.path.exists(tmp_access_path):
        new_files.append([tmp_access_path, file_.access_abs])
    mvnew_fails = move_files(new_files, log)
    if mvnew_fails:
        # roll back: put whatever moved back where it came from
        log.not_ok('Failed to place one or more new files to destination repo')
        move_new_files_back(new_files, mvnew_fails, log)
    else:
        log.ok('| all files moved')
    
    # file metadata will only be copied if everything else was moved
    log.ok('Moving file .json to dest_dir')
    existing_files = [
        (tmp_file_json, file_.json_path)
    ]
    mvold_fails = move_files(existing_files, log)
    if mvold_fails:
        log.not_ok('Failed to update metadata in destination repo')
        move_existing_files_back(existing_files, mvold_fails, log)
    else:
        log.ok('| all files moved')
    
    log.ok('Staging files')
    # metadata goes to plain git; the access binary goes to git-annex
    git_files = [
        file_.json_path_rel
    ]
    annex_files = [
        file_.access_rel
    ]
    repo = stage_files(
        entity=entity,
        git_files=git_files, annex_files=annex_files,
        log=log,
        show_staged=show_staged,
    )
    
    # IMPORTANT: Files are only staged! Be sure to commit!
    # IMPORTANT: changelog is not staged!
    return file_,repo,log,'continue'
Example #12
0
def add_access( entity, ddrfile, git_name, git_mail, agent='', log_path=None, show_staged=True ):
    """Generate new access file for entity
    
    This method breaks out of OOP and manipulates entity.json directly.
    Thus it needs to lock to prevent other edits while it does its thing.
    Writes a log to ${entity}/addfile.log, formatted in pseudo-TAP.
    This log is returned along with a File object.
    
    TODO Refactor this function! It is waaay too long!
    
    @param entity: Entity object
    @param ddrfile: File
    @param git_name: Username of git committer.
    @param git_mail: Email of git committer.
    @param agent: str (optional) Name of software making the change.
    @param log_path: str (optional) Absolute path to addfile log
    @param show_staged: boolean Log list of staged files
    @returns: file_,repo,log,next_op
    """
    # NOTE(review): f is never used in this function — looks vestigial;
    # confirm before removing.  git_name/git_mail/agent are also unused
    # here; presumably consumed by the later commit step — verify.
    f = None
    repo = None
    if log_path:
        log = addfile_logger(log_path=log_path)
    else:
        log = addfile_logger(identifier=entity.identifier)
    
    # Unlike the variant that takes src_path as a parameter, this version
    # derives the source from the File object itself.
    src_path = ddrfile.path_abs
    
    log.ok('------------------------------------------------------------------------')
    log.ok('DDR.models.Entity.add_access: START')
    log.ok('entity: %s' % entity.id)
    log.ok('ddrfile: %s' % ddrfile)
    
    log.ok('Checking files/dirs')
    check_dir('| src_path', src_path, log, mkdir=False, perm=os.R_OK)
    
    log.ok('Identifier')
    log.ok('| file_id %s' % ddrfile.id)
    log.ok('| basepath %s' % entity.identifier.basepath)
    fidentifier = identifier.Identifier(ddrfile.id, entity.identifier.basepath)
    log.ok('| identifier %s' % fidentifier)
    file_class = fidentifier.object_class()

    # Work happens in a temporary dir under MEDIA_BASE, then files are
    # moved into the collection repo only after everything succeeds.
    dest_path = destination_path(src_path, entity.files_path, fidentifier)
    tmp_path = temporary_path(src_path, config.MEDIA_BASE, fidentifier)
    tmp_path_renamed = temporary_path_renamed(tmp_path, dest_path)
    access_dest_path = access_path(file_class, tmp_path_renamed)
    dest_dir = os.path.dirname(dest_path)
    tmp_dir = os.path.dirname(tmp_path)
    # this is the final path of the access file
    access_final_path = ddrfile.identifier.path_abs('access')
    
    log.ok('Checking files/dirs')
    check_dir('| tmp_dir', tmp_dir, log, mkdir=True, perm=os.W_OK)
    check_dir('| dest_dir', dest_dir, log, mkdir=True, perm=os.W_OK)
    
    log.ok('Making access file')
    tmp_access_path = make_access_file(src_path, access_dest_path, log)
    
    log.ok('File object')
    file_ = ddrfile
    log.ok('| file_ %s' % file_)
    
    # if new tmp_access_path and access_dest_path are same, declare success and quit
    # Compare SHA1s of the existing and the freshly generated access file;
    # if identical there is nothing to stage and we return next_op='pass'.
    existing_sha1 = None
    tmp_sha1 = None
    if os.path.exists(access_final_path):
        # if src_path is an existing file, it's probably a git-annex symlink
        # we want to compare two actual files, not a file and a symlink
        access_final_path_real = os.path.realpath(access_final_path)
        existing_sha1 = util.file_hash(access_final_path_real, 'sha1')
        log.ok('| existing_sha1: %s' % existing_sha1)
    if os.path.exists(access_dest_path):
        tmp_sha1 = util.file_hash(access_dest_path, 'sha1')
        log.ok('| tmp_sha1:      %s' % tmp_sha1)
    if tmp_sha1 == existing_sha1:
        log.ok('New access file same as existing. Nothing to see here, move along.')
        return file_,repo,log,'pass'
    
    log.ok('Writing object metadata')
    tmp_file_json = write_object_metadata(file_, tmp_dir, log)
    #tmp_entity_json
    
    # WE ARE NOW MAKING CHANGES TO THE REPO ------------------------
    
    log.ok('Moving files to dest_dir')
    new_files = []
    if tmp_access_path and os.path.exists(tmp_access_path):
        new_files.append([tmp_access_path, file_.access_abs])
    mvnew_fails = move_files(new_files, log)
    if mvnew_fails:
        # roll back: put whatever moved back where it came from
        log.not_ok('Failed to place one or more new files to destination repo')
        move_new_files_back(new_files, mvnew_fails, log)
    else:
        log.ok('| all files moved')
    
    # file metadata will only be copied if everything else was moved
    log.ok('Moving file .json to dest_dir')
    existing_files = [
        (tmp_file_json, file_.json_path)
    ]
    mvold_fails = move_files(existing_files, log)
    if mvold_fails:
        log.not_ok('Failed to update metadata in destination repo')
        move_existing_files_back(existing_files, mvold_fails, log)
    else:
        log.ok('| all files moved')
    
    log.ok('Staging files')
    # metadata goes to plain git; the access binary goes to git-annex
    git_files = [
        file_.json_path_rel
    ]
    annex_files = [
        file_.access_rel
    ]
    # NOTE(review): this variant passes new_files positionally to
    # stage_files, unlike the keyword-arg variant elsewhere — confirm the
    # stage_files signature expected here.
    repo = stage_files(entity, git_files, annex_files, new_files, log, show_staged=show_staged)
    
    # IMPORTANT: Files are only staged! Be sure to commit!
    # IMPORTANT: changelog is not staged!
    return file_,repo,log,'continue'
Example #13
0
def test_files_import_external_nohashes_rename(tmpdir, collection,
                                               test_csv_dir, test_files_dir):
    """Test importing *external* files with *no* hash cols but binaries present
    
    If file is external, binary is present, and no hash cols, rename binary in place
    
    ddr-testing-123-1-master-684e15e967
    ddr-testing-123-2-master-b9773b9aef
    
    @param tmpdir: pytest tmpdir fixture (unused directly here)
    @param collection: fixture providing a test Collection
    @param test_csv_dir: fixture dir containing import CSVs
    @param test_files_dir: fixture dir containing test binaries
    """
    print('collection_path %s' % collection.path_abs)
    file_csv_path = os.path.join(
        test_csv_dir, 'ddrimport-files-import-external-nohashes-rename.csv')
    print('file_csv_path %s' % file_csv_path)
    # point the CSV's file paths at this test run's fixture dir
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(
        test_files_dir, 'ddrimport-files-import-external-nohashes-rename.log')
    print('log_path %s' % log_path)

    print('test_files_dir %s' % test_files_dir)
    for path in os.listdir(test_files_dir):
        print(path)

    # copy test files so later tests don't crash
    # replace basename_orig in CSV with copied file
    # and rewrite CSV
    headers, rowds, csv_errs = csvfile.make_rowds(
        fileio.read_csv(file_csv_path))
    renamed_files = []
    copied_files = []
    ingested_files = []
    access_files = []
    for rowd in rowds:
        print(rowd)
        src_file = os.path.join(test_files_dir, rowd['basename_orig'])
        path, ext = os.path.splitext(src_file)
        dest_file = path + '-rename' + ext
        print('shutil.copy(%s, %s)' % (src_file, dest_file))
        shutil.copy(src_file, dest_file)
        if os.path.exists(dest_file):
            renamed_files.append(os.path.basename(dest_file))
        else:
            print('could not copy')
            assert False
        rowd['basename_orig'] = dest_file
        # figure out new file ID
        # file IDs embed the first 10 chars of the binary's SHA1
        sha1 = util.file_hash(dest_file, 'sha1')[:10]
        idparts = rowd['id'].split('-') + [rowd['role']] + [sha1]
        final_file = '-'.join(idparts) + ext
        final_access = '-'.join(idparts + ['a.jpg'])
        copied_files.append(final_file)
        ingested_files.append(final_file)
        access_files.append(final_access)
    headers, rows = csvfile.make_rows(rowds)
    fileio.write_csv(file_csv_path, headers, rows)

    out = batch.Importer.import_files(
        file_csv_path,
        collection.identifier,
        VOCABS_URL,
        GIT_USER,
        GIT_MAIL,
        AGENT,
        log_path=log_path,
        tmp_dir=test_files_dir,
    )
    # save and commit
    repo = dvcs.repository(collection.path_abs)

    print('STAGED FILES')
    staged_files = sorted([path for path in dvcs.list_staged(repo)])
    for path in staged_files:
        print('  %s' % path)

    # after import_files, we expect to see
    # Offenses are accumulated (rather than asserted immediately) so every
    # problem is printed before the test fails at the end.
    offenses = 0
    # assert final_file in os.listdir(test_files_dir)

    print('test_files_dir')
    test_files = [path for path in os.listdir(test_files_dir)]
    for path in copied_files:
        print(path)
        if path not in test_files:
            print('RENAMED SRC FILE NOT PRESENT %s' % path)
            offenses += 1
    # assert files not ingested
    # assert no access files created
    for path in staged_files:
        if os.path.basename(path) in ingested_files:
            print('ERROR %s HAS BEEN IMPORTED!!' % path)
            offenses += 1
        if os.path.basename(path) in access_files:
            print('ERROR %s ACCESS FILE GENERATED!!' % path)
            offenses += 1

    # commit before failing so the repo state is inspectable afterwards
    commit = repo.index.commit('test_files_import_external_nohashes_rename')
    print('commit %s' % commit)
    if offenses:
        assert False
    # test hashes present
    check_file_hashes(collection.path_abs)
    # ensure no binaries in .git/objects
    print('log_path %s' % log_path)
    assert not find_binaries_in_git_objects(repo)
    assert not find_missing_annex_binaries(repo)