Esempio n. 1
0
def filename(config, like, ilike, notlike, notilike, run):
    """Print every file attached to Filename records matching LIKE patterns.

    Args:
        config: object providing .database and .database_echo for the session.
        like: iterable of case-sensitive LIKE patterns to include.
        ilike: iterable of case-insensitive LIKE patterns to include
            (escaped via ilike_filter_escape).
        notlike: iterable of case-sensitive LIKE patterns to exclude.
        notilike: iterable of case-insensitive LIKE patterns to exclude.
        run: optional command prefix (bytes); when truthy, a shell-style
            invocation line is built per file and passed to eprint().
    """
    with self_contained_session(config.database,
                                echo=config.database_echo) as session:
        query = session.query(Filename)
        # Each filter is layered onto the same query, so all supplied
        # patterns are AND-composed.
        for name in like:
            query = like_filter(query, name)
        for name in ilike:
            query = ilike_filter_escape(query, name)
        for name in notlike:
            query = like_filter(query, name, exclude=True)
        for name in notilike:
            query = ilike_filter_escape(query, name, exclude=True)
        #elif regex:  # broken for bytes
        #    query = session.query(Filename).filter(text('filename ~ :reg')).params(reg=name)
        #else:
        #    assert len(names) == 1
        #    query = session.query(Filename).filter(Filename.filename == names[0])

        for filename in query:
            #print(filename)
            for item in filename.filenames:
                print(item.file)
                if run:
                    # NOTE(review): the command is built and eprint()ed but
                    # never executed — confirm whether execution was intended.
                    # Quoting with b'"' breaks for filenames containing '"'.
                    command = run + b' -- ' + b'"' + item.file + b'"'
                    eprint(command)
Esempio n. 2
0
def result_bool(results, verbose):
    """Exit the interpreter with 0 if *results* is non-empty, else 1.

    Consumes at most one item from *results* — enough to learn whether it
    yields anything. Never returns: raises SystemExit in both cases.

    Args:
        results: any iterable (typically a generator of query results).
        verbose: when truthy, eprint() the boolean outcome before exiting.
    """
    import sys
    for _ in results:
        if verbose:
            eprint(True)
        #yield True
        # BUG FIX: quit() comes from the `site` module and is missing under
        # `python -S` / frozen builds; sys.exit raises the same SystemExit.
        sys.exit(0)
    if verbose:
        eprint(False)
    #yield False
    sys.exit(1)
Esempio n. 3
0
def dupes(infile, verbose):
    """Yield records whose stored data hash matches the file at *infile*.

    The path is encoded to UTF-8 bytes, resolved to a real path, its
    content hashed with SHA-1, and the hex digest matched exactly
    (substring=False) against the 'data_hash' field.

    Args:
        infile: file path as str.
        verbose: when truthy, eprint() the encoded path.

    Yields:
        whatever match_field() yields for each matching record.
    """
    infile = bytes(infile, 'UTF8')
    if verbose:
        eprint(infile)
    infile = os.path.realpath(infile)
    with open(infile, 'rb') as fh:
        infilehash = hashlib.sha1(fh.read()).hexdigest()
    results = match_field(field='data_hash', term=infilehash, substring=False)
    # idiom fix: delegate to the sub-iterable instead of a manual loop
    yield from results
Esempio n. 4
0
def get_domains_from_url(url, no_cache=False, cache_expire=CACHE_EXPIRE):
    """Return the set of domains listed at *url* (hosts-file format).

    Prefers an unexpired on-disk cached copy of the URL; otherwise falls
    back to fetching and parsing the URL directly.

    Args:
        url: the hosts-format source URL.
        no_cache: forwarded to the direct-fetch path.
        cache_expire: maximum cache age accepted by get_cached_url_copy().
    """
    unexpired_copy = get_cached_url_copy(url=url, cache_expire=cache_expire)
    if unexpired_copy:
        # BUG FIX: eprint() is used print-style everywhere else in this
        # codebase, so the old "%s" placeholder was never interpolated and
        # printed literally; pass the path as a plain argument instead.
        eprint("Using cached copy:", unexpired_copy, level=LOG['INFO'])
        unexpired_copy_bytes = read_file_bytes(unexpired_copy)
        assert isinstance(unexpired_copy_bytes, bytes)
        return extract_domain_set_from_hosts_format_bytes(unexpired_copy_bytes)
    else:
        return extract_domain_set_from_hosts_format_url(url, no_cache)
Esempio n. 5
0
def ffmpeg_file_is_corrupt(file, write_verification=False):
    """Return True if the ffmpeg helper script reports an error decoding *file*.

    Args:
        file: path to a media file (any os.fsdecode-able path type).
        write_verification: when True, only eprint()s a reminder — the
            verification file itself is not yet implemented (needs a hash).

    Returns:
        True when run_command raises CalledProcessError, False otherwise.
    """
    command = "/home/cfg/media/ffmpeg/exit_on_first_error_found"
    command += " "
    # NOTE(review): naive quoting — a filename containing '"' breaks the
    # command line; shlex.quote() would be safer if run_command uses a shell.
    command += '"' + os.fsdecode(file) + '"'
    try:
        run_command(command, verbose=True)
        if write_verification:
            eprint("could write verification file now, need hash")
    except CalledProcessError:
        # the helper exits non-zero on the first decode error it finds
        return True
    return False
Esempio n. 6
0
    def getdigest(self, digest):
        """Resolve a raw *digest* to a HashAddress, consulting redis first.

        A redis zscore hit is trusted without touching the filesystem;
        otherwise the derived on-disk path is checked directly.

        Raises:
            FileNotFoundError: digest is neither cached nor on disk.
        """
        location = self.digestpath(digest)
        cached = self.redis and self.redis.zscore(self.rediskey, digest)
        if cached:
            if self.verbose:
                eprint("got cached digest from redis:", self.rediskey, digest.hex())
            return HashAddress(digest, self, location)
        if really_is_file(location):
            return HashAddress(digest, self, location)  # todo
        raise FileNotFoundError
Esempio n. 7
0
def factorize_worker(job_q, res_q, function):
    """Drain batches of inputs from job_q, apply *function*, and report.

    Loops until job_q is empty, then returns. For every batch taken from
    job_q, a dict is put on res_q mapping each input n to
    {'result': function(n), 'time': seconds (5 places), 'pid': worker pid}.
    """
    worker_start = time.time()
    pid = os.getpid()
    eprint('process id:', pid)
    while True:
        try:
            batch = job_q.get_nowait()
        except queue.Empty:
            # queue drained — this worker is done
            return
        timings = {}
        for n in batch:
            started = time.time()
            timings[n] = {'result': function(n)}
            timings[n]['time'] = round(time.time() - started, 5)
            timings[n]['pid'] = pid
        res_q.put(timings)
Esempio n. 8
0
def path(config, path, like, regex):
    """Print the file records of Path rows matching *path*.

    Matching is exact by default, substring LIKE with --like, or a
    PostgreSQL regex (~) with --regex; the two flags are mutually
    exclusive.
    """
    with self_contained_session(config.database,
                                echo=config.database_echo) as session:
        if like and regex:
            eprint("--like and --regex are mutually exclusive.")
            quit(1)

        if like:
            matches = session.query(Path).filter(
                Path.path.like(b'%' + path + b'%'))
        elif regex:
            matches = session.query(Path).filter(
                text('path ~ :reg')).params(reg=path)
        else:
            matches = session.query(Path).filter(Path.path == path)

        for matched in matches:
            for record in matched.filerecords:
                print(record)
Esempio n. 9
0
    def gethexdigest(self, hexdigest):
        """Resolve a hex digest string to a HashAddress, redis cache first.

        The hex string is unhexlified for the redis lookup and the
        returned HashAddress; a cache hit skips the filesystem entirely.

        Raises:
            FileNotFoundError: digest is neither cached nor on disk.
        """
        location = self.hexdigestpath(hexdigest)
        raw_digest = binascii.unhexlify(hexdigest)

        if self.redis and self.redis.zscore(self.rediskey, raw_digest):
            if self.verbose:
                eprint("got cached digest from redis:", self.rediskey, hexdigest)
            return HashAddress(raw_digest, self, location)

        # NOTE: entries shouldn't be added to redis here until the on-disk
        # hash has been verified, so a cache miss falls through to disk.
        if really_is_file(location):
            return HashAddress(raw_digest, self, location)  # todo
        raise FileNotFoundError
Esempio n. 10
0
def compare_files_by_size(source,
                          destination,
                          recommend_larger=True,
                          skip_percent=False):
    """Pick which of two existing files to keep, based on size alone.

    Args:
        source: path to an existing file.
        destination: path to an existing file.
        recommend_larger: when True recommend the larger file, else the
            smaller one.
        skip_percent: falsy to disable; otherwise a threshold — when the
            relative size difference is below it, *destination* is kept
            without consulting recommend_larger.

    Returns:
        *destination* when sizes are equal or within skip_percent,
        otherwise whichever file recommend_larger selects.
    """
    assert path_is_file(source)
    assert path_is_file(destination)
    source_stat = os.stat(source)
    destination_stat = os.stat(destination)
    if source_stat.st_size == destination_stat.st_size:
        eprint("files are the same size")
        return destination

    eprint("files differ in size:")
    eprint("  source     :", source_stat.st_size)
    eprint("  destination:", destination_stat.st_size)

    if skip_percent:
        assert skip_percent < 100
        assert skip_percent > 0

        # relative difference as a 0..1 fraction of the larger file
        # NOTE(review): skip_percent is permitted up to 100 yet compared
        # against a 0..1 fraction — confirm callers pass a fraction.
        percent_difference = \
            abs((source_stat.st_size - destination_stat.st_size) / max(source_stat.st_size, destination_stat.st_size))
        if percent_difference < skip_percent:
            eprint("returning destination because percent_difference: ",
                   percent_difference, "is < skip_percent: ", skip_percent)
            # BUG FIX: a leftover `assert False` here made this branch
            # always crash instead of returning destination as the
            # eprint above announces.
            return destination

    if recommend_larger:
        return source if source_stat.st_size > destination_stat.st_size else destination
    else:
        return destination if source_stat.st_size > destination_stat.st_size else source
Esempio n. 11
0
def smartmove_file(source,
                   destination,
                   makedirs,
                   verbose=False,
                   skip_percent=False):
    """Move *source* toward *destination*, resolving collisions by quality/size.

    Behavior by destination kind:
      * existing file: pick a winner (ffmpeg corruption check for media,
        otherwise size comparison) and keep only that one; returns False.
      * existing directory: move source into it, comparing sizes if a
        same-named file already exists there; returns True on a move.
      * neither: create parent dirs and move when makedirs is truthy,
        else raise FileNotFoundError.

    Args:
        source: path to an existing file.
        destination: target file path or directory.
        makedirs: create missing destination directories when truthy.
        verbose: eprint() each actual move.
        skip_percent: forwarded to compare_files_by_size().
    """
    #eprint("\n")
    eprint("source     :", source)
    eprint("destination:", destination)
    assert path_is_file(source)
    if path_is_file(destination):
        #assert not destination.as_posix().endswith('/')
        source_classification = classify(source)

        if source_classification == 'media':
            # for media, prefer the non-corrupt copy over the bigger one
            source_corrupt = ffmpeg_file_is_corrupt(source)
            destination_corrupt = ffmpeg_file_is_corrupt(destination)
            if source_corrupt and destination_corrupt:
                eprint(
                    "source and destination are corrupt according to ffmpeg, relying on file size instead"
                )
                file_to_keep = compare_files_by_size(source=source,
                                                     destination=destination,
                                                     recommend_larger=True,
                                                     skip_percent=skip_percent)
                eprint("file_to_keep:", file_to_keep)
            elif source_corrupt:
                file_to_keep = destination
                eprint("source is corrupt, keeping destination"
                       )  # bug what if destination is much smaller?
            elif destination_corrupt:
                file_to_keep = source
                eprint("destination is corrupt, keeping source"
                       )  # bug what if source is much smaller?
            else:  # neither are corrupt...
                file_to_keep = compare_files_by_size(source=source,
                                                     destination=destination,
                                                     recommend_larger=True,
                                                     skip_percent=skip_percent)
                eprint("did size comparison, file_to_keep:", file_to_keep)
        else:
            # non-media: size is the only criterion
            file_to_keep = compare_files_by_size(source=source,
                                                 destination=destination,
                                                 recommend_larger=True,
                                                 skip_percent=skip_percent)
            eprint("(non media) did size comparison, file_to_keep:",
                   file_to_keep)

        # sanity check: keeping an empty file only makes sense when both
        # candidates are empty
        if empty_file(file_to_keep):
            assert empty_file(source)
            assert empty_file(destination)

        if file_to_keep == destination:
            eprint(
                "the destination file is being kept, so need to delete the source since it's not being moved"
            )
            try:
                shutil.move(source, JUNK)  # https://bugs.python.org/issue26791
            except OSError:
                # fall back to unlinking when moving to JUNK fails
                os.unlink(source)
                eprint("unlinked:", source)
            #except IsADirectoryError:
            #    os.unlink(source)

        elif file_to_keep == source:
            eprint(
                "the source:", source,
                "is being kept, so need to move it to overwrite the destination"
            )
            shutil.move(source, destination)
        else:
            eprint("file_to_keep:", file_to_keep,
                   "does not match the source or destination, that's a bug")
            exit(1)

        return False

    elif path_is_dir(destination):
        # NOTE(review): this branch assumes str paths ('/'-concatenation,
        # .endswith) — confirm callers never pass bytes or Path here.
        if destination.endswith('/'):  # should be fixed with a decorator
            destination = destination[:-1]
        source_filename = os.path.basename(source)
        destination = destination + '/' + source_filename  # hmmm use a new var?
        assert not path_is_dir(destination)
        if path_is_file(destination):
            file_to_keep = compare_files_by_size(source=source,
                                                 destination=destination,
                                                 recommend_larger=True,
                                                 skip_percent=skip_percent)
            eprint("file_to_keep:", file_to_keep)
            if file_to_keep == destination:
                eprint(
                    "keeping destination file, no need to mv, just rm source")
                os.remove(source)
                return True
            elif file_to_keep == source:
                eprint(
                    "moving source to destination, need to rm destination first"
                )
                os.remove(destination)
                shutil.move(source, destination)
                return True
        else:
            if verbose:
                eprint(source, "->", destination)
            shutil.move(source, destination)
            return True
    else:
        # destination does not exist at all
        destination_folder = os.path.dirname(destination)
        if makedirs:
            os.makedirs(destination_folder, exist_ok=True)
            if verbose:
                eprint(source, "->", destination)
            shutil.move(source, destination)
        else:
            eprint("destination:", destination,
                   "is not a file or directory, exiting.")
            raise FileNotFoundError
Esempio n. 12
0
    def check(self, path, skip_cached=False, quiet=False, debug=False):  # todo verify perms and attrs
        """Walk *path* under the store root and verify layout invariants.

        Files are re-hashed (unless cached in redis and skip_cached is
        set) and directories are checked against the expected shard tree.

        Args:
            path: subtree to check; must lie inside self.root.
            skip_cached: trust redis and skip re-hashing known digests.
            quiet: suppress the progress line written to stderr.
            debug: eprint() each path and its root-relative form.

        Yields:
            (path, HashAddress) for each file whose on-disk location does
            not match the location derived from its content hash.
        """
        # todo find broken latest_archive symlinks
        # todo find empty metadata folders, or with 1 broken latest_archive symlink
        assert path_is_parent(self.root, path)
        longest_path = 0
        for path in self.paths(path=path, return_symlinks=False, return_dirs=True):
            try:
                pathlen = len(path.absolute().as_posix())
                longest_path = max(longest_path, pathlen)
                pad = (longest_path - pathlen) + 4  # +4 to cover arrow presses like right arrow "^[[C"
                pad = pad * ' ' + '\r'
                if not self.verbose:
                    if not quiet:
                        print(path, end=pad, file=sys.stderr, flush=True)
                assert path_is_parent(self.root, path)
                rel_root = path.relative_to(self.root)
                if debug:
                    eprint("path:", path)
                    eprint("rel_root:", rel_root)
                if not self.legacy:
                    assert rel_root.parts[0] in (self.algorithm, self.tmp)
                if really_is_file(path):
                    if hasattr(self, "tmproot"):
                        if path.parts[-2] == self.tmp:
                            continue
                        if self.redis and skip_cached:
                            if self.redis.zscore(self.rediskey, binascii.unhexlify(path.name)):
                                if self.verbose:
                                    print(path, "(redis)")
                                continue

                        digest = hash_file(path, self.algorithm, tmp=None)
                        hexdigest = digest.hex()
                        if self.verbose:
                            print(path, "(hashed)")
                        try:
                            assert len(hexdigest) == len(path.name)
                        except AssertionError as e:
                            eprint("path:", path)
                            raise e
                        expected_path = self.hexdigestpath(hexdigest)
                        if expected_path != path:
                            yield (path, HashAddress(digest, self, expected_path))
                        else:
                            if self.redis:
                                self._commit_redis(digest, filepath=path)
                    else:
                        assert path.lstat().st_size == 0
                elif really_is_dir(path):
                    try:
                        if hasattr(self, "tmproot"):
                            assert (len(rel_root.parts) - 1) <= self.depth
                        if rel_root == Path(self.tmp):
                            continue
                        if self.legacy:
                            tree_path = rel_root
                        else:
                            tree_path = rel_root.relative_to(self.algorithm)
                        if tree_path.name:
                            if len(tree_path.parts) <= self.depth:
                                assert len(tree_path.name) == self.width
                                assert tree_path.name in self.ns  # bug for angryfiles to find
                            elif len(tree_path.parts) == self.depth + 1:
                                assert len(tree_path.name) == self.hexdigestlen
                                try:
                                    assert tree_path.parts[0:-1] == self.shard(tree_path.name)
                                except AssertionError as e:
                                    # BUG FIX: removed a leftover interactive
                                    # `import IPython; IPython.embed()` debugging
                                    # session; report the shard mismatch and
                                    # continue, as the original flow did.
                                    print(e)
                            elif len(tree_path.parts) == self.depth + 2:
                                assert tree_path.name in ('archive', 'tags', 'strings')
                            elif len(tree_path.parts) == self.depth + 3:
                                assert float(tree_path.name)
                            elif len(tree_path.parts) == self.depth + 4:
                                # BUG FIX: was `in ('021_requests.plugin')` — a
                                # parenthesized string, so `in` did a substring
                                # test; a one-tuple gives real membership.
                                assert tree_path.name in ('021_requests.plugin',)
                            else:
                                assert False
                    except AssertionError as e:
                        print("\n", path)
                        raise e
            except Exception as e:  # bare exception to catch every case and always print the offending file
                print("Exception on path:", path)
                raise e