def filename(config, like, ilike, notlike, notilike, run):
    with self_contained_session(config.database, echo=config.database_echo) as session:
        query = session.query(Filename)
        for name in like:
            query = like_filter(query, name)
        for name in ilike:
            query = ilike_filter_escape(query, name)
        for name in notlike:
            query = like_filter(query, name, exclude=True)
        for name in notilike:
            query = ilike_filter_escape(query, name, exclude=True)
        #elif regex:  # broken for bytes
        #    query = session.query(Filename).filter(text('filename ~ :reg')).params(reg=name)
        #else:
        #    assert len(names) == 1
        #    query = session.query(Filename).filter(Filename.filename == names[0])
        for filename in query:
            for item in filename.filenames:
                print(item.file)
                if run:
                    command = run + b' -- ' + b'"' + item.file + b'"'
                    eprint(command)
def result_bool(results, verbose):
    # boolean existence check on an iterable: exit 0 if it yields anything
    # at all (the quit() fires on the first iteration), exit 1 otherwise
    for _result in results:
        if verbose:
            eprint(True)
        quit(0)
    if verbose:
        eprint(False)
    quit(1)
def dupes(infile, verbose):
    infile = bytes(infile, 'UTF8')
    if verbose:
        eprint(infile)
    infile = os.path.realpath(infile)
    with open(infile, 'rb') as fh:
        infilehash = hashlib.sha1(fh.read()).hexdigest()
    results = match_field(field='data_hash', term=infilehash, substring=False)
    yield from results
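# Hedged usage sketch tying dupes() and result_bool() together, assuming both
# are importable from this module; the function name and path below are
# hypothetical. The process exits 0 if the file has at least one duplicate in
# the database, 1 otherwise.
def _example_dupes_check(target='/tank/some/file'):
    result_bool(dupes(target, verbose=False), verbose=True)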
def get_domains_from_url(url, no_cache=False, cache_expire=CACHE_EXPIRE):
    unexpired_copy = get_cached_url_copy(url=url, cache_expire=cache_expire)
    if unexpired_copy:
        eprint("Using cached copy: %s", unexpired_copy, level=LOG['INFO'])
        unexpired_copy_bytes = read_file_bytes(unexpired_copy)
        assert isinstance(unexpired_copy_bytes, bytes)
        return extract_domain_set_from_hosts_format_bytes(unexpired_copy_bytes)
    return extract_domain_set_from_hosts_format_url(url, no_cache)
def ffmpeg_file_is_corrupt(file, write_verification=False):
    command = "/home/cfg/media/ffmpeg/exit_on_first_error_found"
    command += " "
    command += '"' + os.fsdecode(file) + '"'
    try:
        run_command(command, verbose=True)
        if write_verification:
            eprint("could write verification file now, need hash")
    except CalledProcessError:
        return True
    return False
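# The manual quoting above breaks on filenames that contain '"'. A minimal
# alternative sketch that passes an argv list (no shell, no quoting), assuming
# the same exit_on_first_error_found helper script; the function name here is
# hypothetical.
import subprocess

def ffmpeg_file_is_corrupt_argv(file):
    command = ["/home/cfg/media/ffmpeg/exit_on_first_error_found", os.fsdecode(file)]
    try:
        # check=True raises subprocess.CalledProcessError on a nonzero exit
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError:
        return True
    return False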
def getdigest(self, digest):
    realpath = self.digestpath(digest)
    if self.redis:
        if self.redis.zscore(self.rediskey, digest):
            if self.verbose:
                eprint("got cached digest from redis:", self.rediskey, digest.hex())
            return HashAddress(digest, self, realpath)
    if really_is_file(realpath):
        return HashAddress(digest, self, realpath)
    raise FileNotFoundError(realpath)
def factorize_worker(job_q, res_q, function):
    worker_start = time.time()
    process_id = os.getpid()
    eprint('process id:', process_id)
    while True:
        try:
            job = job_q.get_nowait()
        except queue.Empty:
            return
        out_dict = {}
        for n in job:
            job_start = time.time()
            out_dict[n] = {'result': function(n)}
            job_time = time.time() - job_start
            out_dict[n]['time'] = round(job_time, 5)
            out_dict[n]['pid'] = process_id
        res_q.put(out_dict)
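# Minimal driver sketch for factorize_worker(), assuming this module's
# queue-based protocol: each job on job_q is a list of inputs, and each worker
# puts one result dict per job on res_q. The trial-division factorizer, the
# driver function name, and the chunk size are hypothetical. Run it under an
# `if __name__ == '__main__':` guard so spawn-based platforms work.
import multiprocessing

def _trial_division(n):
    # simple example workload: prime factorization by trial division
    factors = []
    d = 2
    while d * d <= n:
        while n % d == 0:
            factors.append(d)
            n //= d
        d += 1
    if n > 1:
        factors.append(n)
    return factors

def run_factorize_workers(numbers, nprocs=4, chunksize=8):
    job_q = multiprocessing.Queue()
    res_q = multiprocessing.Queue()
    njobs = 0
    for index in range(0, len(numbers), chunksize):
        job_q.put(numbers[index:index + chunksize])  # each job is a list of n values
        njobs += 1
    # start workers only after all jobs are queued: the worker exits the first
    # time get_nowait() finds the queue empty (which can also happen briefly
    # while the feeder thread is still flushing; a sentinel-based shutdown is
    # more robust for large workloads)
    procs = [multiprocessing.Process(target=factorize_worker,
                                     args=(job_q, res_q, _trial_division))
             for _ in range(nprocs)]
    for p in procs:
        p.start()
    results = {}
    for _ in range(njobs):  # one result dict arrives per job
        results.update(res_q.get())
    for p in procs:
        p.join()
    return results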
def path(config, path, like, regex):
    with self_contained_session(config.database, echo=config.database_echo) as session:
        if like and regex:
            eprint("--like and --regex are mutually exclusive.")
            quit(1)
        if like:
            path_generator = session.query(Path).filter(
                Path.path.like(b'%' + path + b'%'))
        elif regex:
            path_generator = session.query(Path).filter(
                text('path ~ :reg')).params(reg=path)
        else:
            path_generator = session.query(Path).filter(Path.path == path)
        for result in path_generator:
            for item in result.filerecords:
                print(item)
def gethexdigest(self, hexdigest):
    realpath = self.hexdigestpath(hexdigest)
    digest = binascii.unhexlify(hexdigest)
    if self.redis:
        if self.redis.zscore(self.rediskey, digest):
            if self.verbose:
                eprint("got cached digest from redis:", self.rediskey, hexdigest)
            return HashAddress(digest, self, realpath)
        # a digest should not be added to redis here unless its on-disk hash
        # has been verified
    if really_is_file(realpath):
        return HashAddress(digest, self, realpath)
    raise FileNotFoundError(realpath)
def compare_files_by_size(source, destination, recommend_larger=True, skip_percent=False):
    assert path_is_file(source)
    assert path_is_file(destination)
    source_stat = os.stat(source)
    destination_stat = os.stat(destination)
    if source_stat.st_size == destination_stat.st_size:
        eprint("files are the same size")
        return destination
    eprint("files differ in size:")
    eprint("    source     :", source_stat.st_size)
    eprint("    destination:", destination_stat.st_size)
    if skip_percent:
        assert 0 < skip_percent < 100
        # expressed as a percentage (0-100) so it is comparable to skip_percent
        percent_difference = 100 * \
            abs((source_stat.st_size - destination_stat.st_size) /
                max(source_stat.st_size, destination_stat.st_size))
        if percent_difference < skip_percent:
            eprint("returning destination because percent_difference:",
                   percent_difference, "is < skip_percent:", skip_percent)
            return destination
    if recommend_larger:
        return source if source_stat.st_size > destination_stat.st_size else destination
    return destination if source_stat.st_size > destination_stat.st_size else source
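# Worked example of the skip_percent tolerance, with hypothetical sizes:
# source = 1,000,000 bytes and destination = 990,000 bytes gives
# percent_difference = 100 * 10,000 / 1,000,000 = 1.0, so skip_percent=5
# returns the destination early, while skip_percent=0.5 falls through to
# the recommend_larger comparison.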
def smartmove_file(source, destination, makedirs, verbose=False, skip_percent=False):
    eprint("source     :", source)
    eprint("destination:", destination)
    assert path_is_file(source)
    if path_is_file(destination):
        source_classification = classify(source)
        if source_classification == 'media':
            source_corrupt = ffmpeg_file_is_corrupt(source)
            destination_corrupt = ffmpeg_file_is_corrupt(destination)
            if source_corrupt and destination_corrupt:
                eprint("source and destination are corrupt according to ffmpeg, relying on file size instead")
                file_to_keep = compare_files_by_size(source=source,
                                                     destination=destination,
                                                     recommend_larger=True,
                                                     skip_percent=skip_percent)
                eprint("file_to_keep:", file_to_keep)
            elif source_corrupt:
                file_to_keep = destination
                eprint("source is corrupt, keeping destination")  # bug: what if destination is much smaller?
            elif destination_corrupt:
                file_to_keep = source
                eprint("destination is corrupt, keeping source")  # bug: what if source is much smaller?
            else:  # neither is corrupt
                file_to_keep = compare_files_by_size(source=source,
                                                     destination=destination,
                                                     recommend_larger=True,
                                                     skip_percent=skip_percent)
                eprint("did size comparison, file_to_keep:", file_to_keep)
        else:
            file_to_keep = compare_files_by_size(source=source,
                                                 destination=destination,
                                                 recommend_larger=True,
                                                 skip_percent=skip_percent)
            eprint("(non-media) did size comparison, file_to_keep:", file_to_keep)
        if empty_file(file_to_keep):
            assert empty_file(source)
            assert empty_file(destination)
        if file_to_keep == destination:
            eprint("the destination file is being kept, so the source must be deleted since it is not being moved")
            try:
                shutil.move(source, JUNK)  # https://bugs.python.org/issue26791
            except OSError:
                os.unlink(source)
                eprint("unlinked:", source)
        elif file_to_keep == source:
            eprint("the source:", source, "is being kept, so it is moved over the destination")
            shutil.move(source, destination)
        else:
            eprint("file_to_keep:", file_to_keep, "does not match the source or destination, that's a bug")
            exit(1)
        return False
    elif path_is_dir(destination):
        if destination.endswith('/'):  # should be fixed with a decorator
            destination = destination[:-1]
        source_filename = os.path.basename(source)
        destination = destination + '/' + source_filename  # hmmm, use a new var?
        assert not path_is_dir(destination)
        if path_is_file(destination):
            file_to_keep = compare_files_by_size(source=source,
                                                 destination=destination,
                                                 recommend_larger=True,
                                                 skip_percent=skip_percent)
            eprint("file_to_keep:", file_to_keep)
            if file_to_keep == destination:
                eprint("keeping destination file, no need to mv, just rm source")
                os.remove(source)
                return True
            elif file_to_keep == source:
                eprint("moving source to destination, need to rm destination first")
                os.remove(destination)
                shutil.move(source, destination)
                return True
        else:
            if verbose:
                eprint(source, "->", destination)
            shutil.move(source, destination)
            return True
    else:
        destination_folder = os.path.dirname(destination)
        if makedirs:
            os.makedirs(destination_folder, exist_ok=True)
            if verbose:
                eprint(source, "->", destination)
            shutil.move(source, destination)
        else:
            eprint("destination:", destination, "is not a file or directory, exiting.")
            raise FileNotFoundError(destination)
def check(self, path, skip_cached=False, quiet=False, debug=False):
    # todo: verify perms and attrs
    # todo: find broken latest_archive symlinks
    # todo: find empty metadata folders, or folders with a single broken latest_archive symlink
    assert path_is_parent(self.root, path)
    longest_path = 0
    for path in self.paths(path=path, return_symlinks=False, return_dirs=True):
        try:
            pathlen = len(path.absolute().as_posix())
            longest_path = max(longest_path, pathlen)
            pad = (longest_path - pathlen) + 4  # +4 to cover arrow presses like right arrow "^[[C"
            pad = pad * ' ' + '\r'
            if not self.verbose:
                if not quiet:
                    print(path, end=pad, file=sys.stderr, flush=True)
            assert path_is_parent(self.root, path)
            rel_root = path.relative_to(self.root)
            if debug:
                eprint("path:", path)
                eprint("rel_root:", rel_root)
            if not self.legacy:
                assert rel_root.parts[0] in (self.algorithm, self.tmp)
            if really_is_file(path):
                if hasattr(self, "tmproot"):
                    if path.parts[-2] == self.tmp:
                        continue
                if self.redis and skip_cached:
                    if self.redis.zscore(self.rediskey, binascii.unhexlify(path.name)):
                        if self.verbose:
                            print(path, "(redis)")
                        continue
                digest = hash_file(path, self.algorithm, tmp=None)
                hexdigest = digest.hex()
                if self.verbose:
                    print(path, "(hashed)")
                try:
                    assert len(hexdigest) == len(path.name)
                except AssertionError as e:
                    eprint("path:", path)
                    raise e
                expected_path = self.hexdigestpath(hexdigest)
                if expected_path != path:
                    yield (path, HashAddress(digest, self, expected_path))
                else:
                    if self.redis:
                        self._commit_redis(digest, filepath=path)
                    else:
                        assert path.lstat().st_size == 0
            elif really_is_dir(path):
                try:
                    if hasattr(self, "tmproot"):
                        assert (len(rel_root.parts) - 1) <= self.depth
                        if rel_root == Path(self.tmp):
                            continue
                    if self.legacy:
                        tree_path = rel_root
                    else:
                        tree_path = rel_root.relative_to(self.algorithm)
                    if tree_path.name:
                        if len(tree_path.parts) <= self.depth:
                            assert len(tree_path.name) == self.width
                            assert tree_path.name in self.ns  # bug for angryfiles to find
                        elif len(tree_path.parts) == self.depth + 1:
                            assert len(tree_path.name) == self.hexdigestlen
                            try:
                                assert tree_path.parts[0:-1] == self.shard(tree_path.name)
                            except AssertionError as e:
                                print(e)
                                import IPython
                                IPython.embed()
                        elif len(tree_path.parts) == self.depth + 2:
                            assert tree_path.name in ('archive', 'tags', 'strings')
                        elif len(tree_path.parts) == self.depth + 3:
                            assert float(tree_path.name)
                        elif len(tree_path.parts) == self.depth + 4:
                            assert tree_path.name in ('021_requests.plugin',)  # tuple, not a substring test
                        else:
                            assert False
                except AssertionError as e:
                    print("\n", path)
                    raise e
        except Exception as e:
            # broad except so the offending file is always printed before re-raising
            print("Exception on path:", path)
            raise e