def checksums(src_path, log):
    """Calculate MD5, SHA1, and SHA256 checksums for a file, logging each one."""
    md5 = util.file_hash(src_path, 'md5')
    log.ok('| md5: %s' % md5)
    sha1 = util.file_hash(src_path, 'sha1')
    log.ok('| sha1: %s' % sha1)
    sha256 = util.file_hash(src_path, 'sha256')
    log.ok('| sha256: %s' % sha256)
    if not (sha1 and md5 and sha256):
        log.crash('Could not calculate checksums')
    return md5, sha1, sha256
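# util.file_hash itself is not shown in these examples. Below is a minimal
# sketch of what such a helper typically looks like: a chunked read so large
# binaries never have to fit in memory. The function name, chunk size, and
# exact behavior are assumptions for illustration, not the actual DDR.util
# implementation.
import hashlib

def file_hash_sketch(path, algo, blocksize=1024*1024):
    """Return the hex digest of a file, reading it in blocksize chunks."""
    h = hashlib.new(algo)  # algo is e.g. 'md5', 'sha1', 'sha256'
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(blocksize), b''):
            h.update(block)
    return h.hexdigest()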
def check_file(json_path, verbose=False):
    """Verify that a File's binary matches the checksums recorded in its .json."""
    fi = Identifier(json_path)
    f = models.File.from_identifier(fi)
    if not os.path.exists(f.path_abs):
        result = ['missing', f.path_abs]
        print(result)
        return result
    mismatches = []
    md5 = util.file_hash(f.path_abs, 'md5')
    if md5 != f.md5:
        mismatches.append('md5')
    sha1 = util.file_hash(f.path_abs, 'sha1')
    if sha1 != f.sha1:
        mismatches.append('sha1')
    sha256 = util.file_hash(f.path_abs, 'sha256')
    if sha256 != f.sha256:
        mismatches.append('sha256')
    # SHA256 hash from the git-annex filename
    annex_sha256 = os.path.basename(
        os.path.realpath(f.path_abs)
    ).split('--')[1]
    if sha256 != annex_sha256:
        mismatches.append('annex_sha256')
    if mismatches:
        mismatches.append(json_path)
        print(mismatches)
    return mismatches
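# check_file's annex_sha256 comparison relies on git-annex object naming: an
# annexed file is a symlink whose target's basename is the annex key, which for
# the plain SHA256 backend looks like 'SHA256-s<bytesize>--<sha256hex>'. A tiny
# illustration (the path and hash are hypothetical):
import os
annex_target = '/repo/.git/annex/objects/Xk/J9/SHA256-s1024--abc123/SHA256-s1024--abc123'
key = os.path.basename(annex_target)
assert key.split('--')[1] == 'abc123'
# NOTE: with the SHA256E backend the key keeps the file extension
# ('...--<hash>.jpg'), so the equality check above would need to strip it.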
def test_file_hash():
    path = os.path.join(
        TESTING_BASE_DIR,
        'test-hash-%s' % datetime.now(config.TZ).strftime('%Y%m%dT%H%M%S')
    )
    text = 'hash'
    sha1 = '2346ad27d7568ba9896f1b7da6b5991251debdf2'
    sha256 = 'd04b98f48e8f8bcc15c6ae5ac050801cd6dcfd428fb5f9e65c4e16e7807340fa'
    md5 = '0800fc577294c34e0b28ad2839435945'
    with open(path, 'w') as f:
        f.write(text)
    assert util.file_hash(path, 'sha1') == sha1
    assert util.file_hash(path, 'sha256') == sha256
    assert util.file_hash(path, 'md5') == md5
    os.remove(path)
def test_file_hash(tmpdir):
    path = str(tmpdir / 'test-hash')
    text = 'hash'
    sha1 = '2346ad27d7568ba9896f1b7da6b5991251debdf2'
    sha256 = 'd04b98f48e8f8bcc15c6ae5ac050801cd6dcfd428fb5f9e65c4e16e7807340fa'
    md5 = '0800fc577294c34e0b28ad2839435945'
    with open(path, 'w') as f:
        f.write(text)
    assert util.file_hash(path, 'sha1') == sha1
    assert util.file_hash(path, 'sha256') == sha256
    assert util.file_hash(path, 'md5') == md5
    os.remove(path)
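# The expected digests in both tests above are simply the standard digests of
# the byte string b'hash', assuming util.file_hash computes ordinary hashlib
# digests of the file's contents; they can be double-checked in a REPL:
import hashlib
assert hashlib.md5(b'hash').hexdigest() == '0800fc577294c34e0b28ad2839435945'
assert hashlib.sha1(b'hash').hexdigest() == '2346ad27d7568ba9896f1b7da6b5991251debdf2'
assert hashlib.sha256(b'hash').hexdigest() == 'd04b98f48e8f8bcc15c6ae5ac050801cd6dcfd428fb5f9e65c4e16e7807340fa'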
def checksums(self, algo, force_read=False):
    """Calculates hash checksums for the Entity's files.
    
    Gets hashes from FILE.json metadata if the file(s) are absent
    from the filesystem (i.e. git-annex file symlinks).
    Overrides DDR.models.Entity.checksums.
    
    @param algo: str
    @param force_read: bool Traverse filesystem if true.
    @returns: list of (checksum, filepath) tuples
    """
    checksums = []
    if algo not in self.checksum_algorithms():
        raise Error('BAD ALGORITHM CHOICE: {}'.format(algo))
    for f in self._file_paths():
        cs = None
        ext = None
        pathname = os.path.splitext(f)[0]
        # from metadata file
        json_path = os.path.join(self.files_path, f)
        for field in json.loads(fileio.read_text(json_path)):
            for k, v in field.items():
                if k == algo:
                    cs = v
                if k == 'basename_orig':
                    ext = os.path.splitext(v)[-1]
        fpath = pathname + ext
        if force_read:
            # from filesystem
            # git-annex files are present
            if os.path.exists(fpath):
                cs = util.file_hash(fpath, algo)
        if cs:
            checksums.append((cs, os.path.basename(fpath)))
    return checksums
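# Hypothetical usage of the checksums() method above (the entity and return
# values are invented for illustration; the filename pattern follows the
# examples elsewhere in this section):
#   entity.checksums('sha256')
#   -> [('d04b98f48e8f...', 'ddr-testing-123-1-master-684e15e967.jpg'), ...]
#   entity.checksums('crc32')  # raises Error('BAD ALGORITHM CHOICE: crc32')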
def file_name(entity, path_abs, role, sha1=None):
    """Generate a new name for the specified file; use only when ingesting a file!
    
    Files are renamed to standard names on ingest:
        {entity_id}-{role}-{sha1}.{ext}
    example: ddr-testing-56-101-master-fb73f9de29.jpg
    
    SHA1 is optional so it can be passed in by a calling process
    that has already generated it.
    
    @param entity
    @param path_abs: Absolute path to the file.
    @param role
    @param sha1: SHA1 hash (optional)
    """
    if os.path.exists(path_abs) and os.access(path_abs, os.R_OK):
        ext = os.path.splitext(path_abs)[1]
        if not sha1:
            sha1 = util.file_hash(path_abs, 'sha1')
        if sha1:
            idparts = [a for a in entity.idparts]
            idparts.append(role)
            idparts.append(sha1[:10])
            name = '{}{}'.format(Identifier(parts=idparts).id, ext)
            return name
    return None
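# Hypothetical usage of file_name() (the path and hash are invented; the
# expected result follows the docstring's own example):
#   entity.idparts  # ['ddr', 'testing', '56', '101']
#   file_name(entity, '/tmp/scan.jpg', 'master')
#   -> 'ddr-testing-56-101-master-fb73f9de29.jpg'
# The SHA1 is computed with util.file_hash and truncated to 10 hex characters.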
def add_access(entity, ddrfile, src_path, git_name, git_mail, agent='',
               log_path=None, show_staged=True):
    """Generate new access file for entity
    
    This method breaks out of OOP and manipulates entity.json directly.
    Thus it needs to lock to prevent other edits while it does its thing.
    Writes a log to ${entity}/addfile.log, formatted in pseudo-TAP.
    This log is returned along with a File object.
    
    TODO Refactor this function! It is waaay too long!
    
    @param entity: Entity object
    @param ddrfile: File
    @param src_path: str Absolute path to the access file (ddrfile.path_abs)
    @param git_name: Username of git committer.
    @param git_mail: Email of git committer.
    @param agent: str (optional) Name of software making the change.
    @param log_path: str (optional) Absolute path to addfile log
    @param show_staged: boolean Log list of staged files
    @returns: file_,repo,log,next_op
    """
    f = None
    repo = None
    if log_path:
        log = addfile_logger(log_path=log_path)
    else:
        log = addfile_logger(identifier=entity.identifier)
    log.ok('------------------------------------------------------------------------')
    log.ok('DDR.models.Entity.add_access: START')
    log.ok('entity: %s' % entity.id)
    log.ok('ddrfile: %s' % ddrfile)
    
    log.ok('Checking files/dirs')
    check_dir('| src_path', src_path, log, mkdir=False, perm=os.R_OK)
    
    log.ok('Identifier')
    log.ok('| file_id %s' % ddrfile.id)
    log.ok('| basepath %s' % entity.identifier.basepath)
    fidentifier = identifier.Identifier(ddrfile.id, entity.identifier.basepath)
    log.ok('| identifier %s' % fidentifier)
    file_class = fidentifier.object_class()
    
    dest_path = destination_path(src_path, entity.files_path, fidentifier)
    tmp_path = temporary_path(src_path, config.MEDIA_BASE, fidentifier)
    tmp_path_renamed = temporary_path_renamed(tmp_path, dest_path)
    access_dest_path = access_path(file_class, tmp_path_renamed)
    dest_dir = os.path.dirname(dest_path)
    tmp_dir = os.path.dirname(tmp_path)
    # this is the final path of the access file
    access_final_path = ddrfile.identifier.path_abs('access')
    
    log.ok('Checking files/dirs')
    check_dir('| tmp_dir', tmp_dir, log, mkdir=True, perm=os.W_OK)
    check_dir('| dest_dir', dest_dir, log, mkdir=True, perm=os.W_OK)
    
    log.ok('Making access file')
    tmp_access_path = make_access_file(src_path, access_dest_path, log)
    
    log.ok('File object')
    file_ = ddrfile
    log.ok('| file_ %s' % file_)
    
    # if new tmp_access_path and access_dest_path are same, declare success and quit
    existing_sha1 = None
    tmp_sha1 = None
    if os.path.exists(access_final_path):
        # if src_path is an existing file, it's probably a git-annex symlink
        # we want to compare two actual files, not a file and a symlink
        access_final_path_real = os.path.realpath(access_final_path)
        existing_sha1 = util.file_hash(access_final_path_real, 'sha1')
        log.ok('| existing_sha1: %s' % existing_sha1)
    if os.path.exists(access_dest_path):
        tmp_sha1 = util.file_hash(access_dest_path, 'sha1')
        log.ok('| tmp_sha1: %s' % tmp_sha1)
    if tmp_sha1 == existing_sha1:
        log.ok('New access file same as existing. Nothing to see here, move along.')
        return file_, repo, log, 'pass'
    
    log.ok('Writing object metadata')
    tmp_file_json = write_object_metadata(file_, tmp_dir, log)  #tmp_entity_json
    
    # WE ARE NOW MAKING CHANGES TO THE REPO ------------------------
    
    log.ok('Moving files to dest_dir')
    new_files = []
    if tmp_access_path and os.path.exists(tmp_access_path):
        new_files.append([tmp_access_path, file_.access_abs])
    mvnew_fails = move_files(new_files, log)
    if mvnew_fails:
        log.not_ok('Failed to place one or more new files to destination repo')
        move_new_files_back(new_files, mvnew_fails, log)
    else:
        log.ok('| all files moved')
    
    # file metadata will only be copied if everything else was moved
    log.ok('Moving file .json to dest_dir')
    existing_files = [(tmp_file_json, file_.json_path)]
    mvold_fails = move_files(existing_files, log)
    if mvold_fails:
        log.not_ok('Failed to update metadata in destination repo')
        move_existing_files_back(existing_files, mvold_fails, log)
    else:
        log.ok('| all files moved')
    
    log.ok('Staging files')
    git_files = [file_.json_path_rel]
    annex_files = [file_.access_rel]
    repo = stage_files(
        entity=entity,
        git_files=git_files,
        annex_files=annex_files,
        log=log,
        show_staged=show_staged,
    )
    
    # IMPORTANT: Files are only staged! Be sure to commit!
    # IMPORTANT: changelog is not staged!
    return file_, repo, log, 'continue'
def add_access(entity, ddrfile, git_name, git_mail, agent='',
               log_path=None, show_staged=True):
    """Generate new access file for entity
    
    This method breaks out of OOP and manipulates entity.json directly.
    Thus it needs to lock to prevent other edits while it does its thing.
    Writes a log to ${entity}/addfile.log, formatted in pseudo-TAP.
    This log is returned along with a File object.
    
    TODO Refactor this function! It is waaay too long!
    
    @param ddrfile: File
    @param git_name: Username of git committer.
    @param git_mail: Email of git committer.
    @param agent: str (optional) Name of software making the change.
    @param log_path: str (optional) Absolute path to addfile log
    @param show_staged: boolean Log list of staged files
    @returns: file_,repo,log,next_op
    """
    f = None
    repo = None
    if log_path:
        log = addfile_logger(log_path=log_path)
    else:
        log = addfile_logger(identifier=entity.identifier)
    src_path = ddrfile.path_abs
    log.ok('------------------------------------------------------------------------')
    log.ok('DDR.models.Entity.add_access: START')
    log.ok('entity: %s' % entity.id)
    log.ok('ddrfile: %s' % ddrfile)
    
    log.ok('Checking files/dirs')
    check_dir('| src_path', src_path, log, mkdir=False, perm=os.R_OK)
    
    log.ok('Identifier')
    log.ok('| file_id %s' % ddrfile.id)
    log.ok('| basepath %s' % entity.identifier.basepath)
    fidentifier = identifier.Identifier(ddrfile.id, entity.identifier.basepath)
    log.ok('| identifier %s' % fidentifier)
    file_class = fidentifier.object_class()
    
    dest_path = destination_path(src_path, entity.files_path, fidentifier)
    tmp_path = temporary_path(src_path, config.MEDIA_BASE, fidentifier)
    tmp_path_renamed = temporary_path_renamed(tmp_path, dest_path)
    access_dest_path = access_path(file_class, tmp_path_renamed)
    dest_dir = os.path.dirname(dest_path)
    tmp_dir = os.path.dirname(tmp_path)
    # this is the final path of the access file
    access_final_path = ddrfile.identifier.path_abs('access')
    
    log.ok('Checking files/dirs')
    check_dir('| tmp_dir', tmp_dir, log, mkdir=True, perm=os.W_OK)
    check_dir('| dest_dir', dest_dir, log, mkdir=True, perm=os.W_OK)
    
    log.ok('Making access file')
    tmp_access_path = make_access_file(src_path, access_dest_path, log)
    
    log.ok('File object')
    file_ = ddrfile
    log.ok('| file_ %s' % file_)
    
    # if new tmp_access_path and access_dest_path are same, declare success and quit
    existing_sha1 = None
    tmp_sha1 = None
    if os.path.exists(access_final_path):
        # if src_path is an existing file, it's probably a git-annex symlink
        # we want to compare two actual files, not a file and a symlink
        access_final_path_real = os.path.realpath(access_final_path)
        existing_sha1 = util.file_hash(access_final_path_real, 'sha1')
        log.ok('| existing_sha1: %s' % existing_sha1)
    if os.path.exists(access_dest_path):
        tmp_sha1 = util.file_hash(access_dest_path, 'sha1')
        log.ok('| tmp_sha1: %s' % tmp_sha1)
    if tmp_sha1 == existing_sha1:
        log.ok('New access file same as existing. Nothing to see here, move along.')
        return file_, repo, log, 'pass'
    
    log.ok('Writing object metadata')
    tmp_file_json = write_object_metadata(file_, tmp_dir, log)  #tmp_entity_json
    
    # WE ARE NOW MAKING CHANGES TO THE REPO ------------------------
    
    log.ok('Moving files to dest_dir')
    new_files = []
    if tmp_access_path and os.path.exists(tmp_access_path):
        new_files.append([tmp_access_path, file_.access_abs])
    mvnew_fails = move_files(new_files, log)
    if mvnew_fails:
        log.not_ok('Failed to place one or more new files to destination repo')
        move_new_files_back(new_files, mvnew_fails, log)
    else:
        log.ok('| all files moved')
    
    # file metadata will only be copied if everything else was moved
    log.ok('Moving file .json to dest_dir')
    existing_files = [(tmp_file_json, file_.json_path)]
    mvold_fails = move_files(existing_files, log)
    if mvold_fails:
        log.not_ok('Failed to update metadata in destination repo')
        move_existing_files_back(existing_files, mvold_fails, log)
    else:
        log.ok('| all files moved')
    
    log.ok('Staging files')
    git_files = [file_.json_path_rel]
    annex_files = [file_.access_rel]
    repo = stage_files(entity, git_files, annex_files, new_files, log,
                       show_staged=show_staged)
    
    # IMPORTANT: Files are only staged! Be sure to commit!
    # IMPORTANT: changelog is not staged!
    return file_, repo, log, 'continue'
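# Both add_access variants only *stage* files, per the IMPORTANT notes above.
# A minimal caller sketch (the commit message is an assumption; repo is the
# GitPython Repo returned by stage_files, committed the same way as in the
# test below):
file_, repo, log, op = add_access(entity, ddrfile, git_name, git_mail)
if op == 'continue' and repo:
    repo.index.commit('Added access file for %s' % file_.id)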
def test_files_import_external_nohashes_rename(tmpdir, collection, test_csv_dir,
                                               test_files_dir):
    """Test importing *external* files with *no* hash cols but binaries present
    
    If file is external, binary is present, and no hash cols,
    rename binary in place
    ddr-testing-123-1-master-684e15e967
    ddr-testing-123-2-master-b9773b9aef
    """
    print('collection_path %s' % collection.path_abs)
    file_csv_path = os.path.join(
        test_csv_dir, 'ddrimport-files-import-external-nohashes-rename.csv')
    print('file_csv_path %s' % file_csv_path)
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(
        test_files_dir, 'ddrimport-files-import-external-nohashes-rename.log')
    print('log_path %s' % log_path)
    print('test_files_dir %s' % test_files_dir)
    for path in os.listdir(test_files_dir):
        print(path)
    # copy test files so later tests don't crash
    # replace basename_orig in CSV with copied file and rewrite CSV
    headers, rowds, csv_errs = csvfile.make_rowds(fileio.read_csv(file_csv_path))
    renamed_files = []
    copied_files = []
    ingested_files = []
    access_files = []
    for rowd in rowds:
        print(rowd)
        src_file = os.path.join(test_files_dir, rowd['basename_orig'])
        path, ext = os.path.splitext(src_file)
        dest_file = path + '-rename' + ext
        print('shutil.copy(%s, %s)' % (src_file, dest_file))
        shutil.copy(src_file, dest_file)
        if os.path.exists(dest_file):
            renamed_files.append(os.path.basename(dest_file))
        else:
            print('could not copy')
            assert False
        rowd['basename_orig'] = dest_file
        # figure out new file ID
        sha1 = util.file_hash(dest_file, 'sha1')[:10]
        idparts = rowd['id'].split('-') + [rowd['role']] + [sha1]
        final_file = '-'.join(idparts) + ext
        final_access = '-'.join(idparts + ['a.jpg'])
        copied_files.append(final_file)
        ingested_files.append(final_file)
        access_files.append(final_access)
    headers, rows = csvfile.make_rows(rowds)
    fileio.write_csv(file_csv_path, headers, rows)
    out = batch.Importer.import_files(
        file_csv_path, collection.identifier, VOCABS_URL,
        GIT_USER, GIT_MAIL, AGENT,
        log_path=log_path, tmp_dir=test_files_dir,
    )
    # save and commit
    repo = dvcs.repository(collection.path_abs)
    print('STAGED FILES')
    staged_files = sorted([path for path in dvcs.list_staged(repo)])
    for path in staged_files:
        print(' %s' % path)
    # after import_files, we expect to see
    offenses = 0
    # assert final_file in os.listdir(test_files_dir)
    print('test_files_dir')
    test_files = [path for path in os.listdir(test_files_dir)]
    for path in copied_files:
        print(path)
        if path not in test_files:
            print('RENAMED SRC FILE NOT PRESENT %s' % path)
            offenses += 1
    # assert files not ingested
    # assert no access files created
    for path in staged_files:
        if os.path.basename(path) in ingested_files:
            print('ERROR %s HAS BEEN IMPORTED!!' % path)
            offenses += 1
        if os.path.basename(path) in access_files:
            print('ERROR %s ACCESS FILE GENERATED!!' % path)
            offenses += 1
    commit = repo.index.commit('test_files_import_external_nohashes_rename')
    print('commit %s' % commit)
    if offenses:
        assert False
    # test hashes present
    check_file_hashes(collection.path_abs)
    # ensure no binaries in .git/objects
    print('log_path %s' % log_path)
    assert not find_binaries_in_git_objects(repo)
    assert not find_missing_annex_binaries(repo)
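# check_file_hashes() is called above but not defined in these examples. A
# plausible sketch, assuming it walks the collection for file metadata .json
# records and reuses check_file() from earlier (the filename pattern and role
# names are assumptions):
import os

def check_file_hashes_sketch(collection_path):
    """Assert that every File's binary matches its recorded checksums."""
    for root, dirs, files in os.walk(collection_path):
        dirs[:] = [d for d in dirs if d != '.git']  # don't descend into git internals
        for name in files:
            # file metadata records look like ddr-testing-123-1-master-684e15e967.json
            if name.endswith('.json') and ('-master-' in name or '-mezzanine-' in name):
                mismatches = check_file(os.path.join(root, name))
                assert not mismatches  # empty list == all checksums match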