def main() -> None:
    """
    Command-line handler for the ``find_recovered_openxml`` tool.
    Use the ``--help`` option for help.
    """
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
                            description=f"""
Tool to recognize and rescue Microsoft Office OpenXML files, even if they have
garbage appended to them.        

- Rationale: when you have accidentally deleted files from an NTFS disk, and
  they really matter, you should (a) stop what you're doing; (b) clone the disk
  to an image file using "dd" under Linux; (c) perform all subsequent 
  operations on the cloned image (in read-only mode). Those steps might 
  include:
    - ntfsundelete, to find files that the filesystem is still aware of;
    - scalpel, to find files based on their contents.
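
  For example, a clone might be made like this (device name and image path are
  illustrative only; double-check them before running dd):

        dd if=/dev/sdX of=/recovery/disk.img bs=4M conv=noerror,sync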

- Scalpel is great at finding stuff efficiently, but it works best when files
  can be defined by both a start (header) signature and an end (footer)
  signature. However, the Microsoft Office OpenXML file format has a 
  recognizable header, but no standard footer. In these circumstances, Scalpel
  reads up to a certain limit that you specify in its configuration file. (To
  retrieve large PowerPoint files, this limit needs to be substantial, e.g.
  50 MB or more, depending on how you work with PowerPoint.)

- That means that files emerging from a Scalpel search for DOCX/PPTX/XLSX files
  may be
    - false positives, having nothing to do with Office;
    - clean Office files (the least likely category!);
    - Office files with garbage stuck on the end.
    
- The OpenXML file format is just a zip file. If you stick too much garbage on
  the end of a zip file, zip readers will see it as corrupt.  
        
- THIS TOOL detects (and optionally moves) potentially corrupted zipfiles based 
  on file contents, by unzipping the file and checking for "inner" files with
  names like:

        File type       Contents filename signature (regular expression)
        ----------------------------------------------------------------
        DOCX            {DOCX_CONTENTS_REGEX_STR}  
        PPTX            {PPTX_CONTENTS_REGEX_STR}
        XLSX            {XLSX_CONTENTS_REGEX_STR}

- WARNING: it's possible for an OpenXML file to contain more than one of these.
  If so, they may be mis-classified.

- If a file is not immediately readable as a zip, this tool uses the external
  "zip -FF" command to repair zip files with corrupted ends, and tries again.
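
  The manual equivalent is roughly (illustrative filenames):

        zip -FF candidate.docx --out repaired.docx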
  
- Having found valid-looking files, you can elect to move them elsewhere.

- As an additional and VERY DANGEROUS operation, you can elect to delete files
  that this tool doesn't recognize. (Why? Because a 450 GB disk might produce
  well in excess of 1.7 TB of candidate files; many will be false positives,
  and even the true positives will all be expanded to your file size limit,
  e.g. 50 MB. You may have a problem with available disk space, so running
  this tool regularly allows you to clear up the junk. Use the --run_repeatedly
  option to help with this.)
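
- Example of typical use (paths here are illustrative only):

        find_recovered_openxml.py --recursive --move_to /rescued /scalpel_out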

        """)
    parser.add_argument(
        "filename",
        nargs="+",
        help="File(s) to check. You can also specify directores if you use "
        "--recursive")
    parser.add_argument(
        "--recursive",
        action="store_true",
        help="Allow search to descend recursively into any directories "
        "encountered.")
    parser.add_argument(
        "--skip_files",
        nargs="*",
        default=[],
        help="File pattern(s) to skip. You can specify wildcards like '*.txt' "
        "(but you will have to enclose that pattern in quotes under "
        "UNIX-like operating systems). The basename of each file will be "
        "tested against these filenames/patterns. Consider including "
        "Scalpel's 'audit.txt'.")
    parser.add_argument("--filetypes",
                        nargs="+",
                        default=FILETYPES,
                        help=f"File types to check. Options: {FILETYPES}")
    parser.add_argument(
        "--move_to",
        help="If the file is recognized as one of the specified file types, "
        "move it to the directory specified here.")
    parser.add_argument(
        "--delete_if_not_specified_file_type",
        action="store_true",
        help="If a file is NOT recognized as one of the specified file types, "
        "delete it. VERY DANGEROUS.")
    parser.add_argument(
        "--run_repeatedly",
        type=int,
        help="Run the tool repeatedly with a pause of <run_repeatedly> "
        "seconds between runs. (For this to work well with the move/"
        "delete options, you should specify one or more DIRECTORIES in "
        "the 'filename' arguments, not files, and you will need the "
        "--recursive option.)")
    parser.add_argument(
        "--nprocesses",
        type=int,
        default=multiprocessing.cpu_count(),
        help="Specify the number of processes to run in parallel.")
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Verbose output")
    parser.add_argument("--show_zip_output",
                        action="store_true",
                        help="Verbose output from the external 'zip' tool")
    args = parser.parse_args()
    main_only_quicksetup_rootlogger(
        level=logging.DEBUG if args.verbose else logging.INFO,
        with_process_id=True)

    # Further argument checks
    if args.move_to and not os.path.isdir(args.move_to):
        raise ValueError(
            f"Destination directory {args.move_to!r} is not a directory")
    if not args.filetypes:
        raise ValueError("No file type to scan for")
    filetypes = [ft.lower() for ft in args.filetypes]
    if any(ft not in FILETYPES for ft in filetypes):
        raise ValueError(f"Invalid filetypes; choose from {FILETYPES}")
    assert shutil.which("zip"), "Need 'zip' tool!"

    # Repeated scanning loop
    while True:
        log.info("Starting scan.")
        log.info("- Looking for filetypes {}", filetypes)
        log.info("- Scanning files/directories {!r}{}", args.filename,
                 " recursively" if args.recursive else "")
        log.info("- Skipping files matching {!r}", args.skip_files)
        log.info("- Using {} simultaneous processes", args.nprocesses)
        if args.move_to:
            log.info("- Moving target files to " + args.move_to)
        if args.delete_if_not_specified_file_type:
            log.info("- Deleting non-target files.")

        # Iterate through files
        pool = multiprocessing.Pool(processes=args.nprocesses)
        for filename in gen_filenames(starting_filenames=args.filename,
                                      recursive=args.recursive):
            src_basename = os.path.basename(filename)
            if any(
                    fnmatch.fnmatch(src_basename, pattern)
                    for pattern in args.skip_files):
                log.info("Skipping file as ordered: " + filename)
                continue
            exists, locked = exists_locked(filename)
            if locked or not exists:
                log.info("Skipping currently inaccessible file: " + filename)
                continue
            kwargs = {
                'filename': filename,
                'filetypes': filetypes,
                'move_to': args.move_to,
                'delete_if_not_specified_file_type':
                args.delete_if_not_specified_file_type,
                'show_zip_output': args.show_zip_output,
            }
            # log.critical("start")
            pool.apply_async(process_file, [], kwargs)
            # result = pool.apply_async(process_file, [], kwargs)
            # result.get()  # will re-raise any child exceptions
            # ... but it waits for the process to complete! That's no help.
            # log.critical("next")
            # ... https://stackoverflow.com/questions/22094852/how-to-catch-exceptions-in-workers-in-multiprocessing  # noqa
        pool.close()
        pool.join()

        log.info("Finished scan.")
        if args.run_repeatedly is None:
            break
        log.info("Sleeping for {} s...", args.run_repeatedly)
        sleep(args.run_repeatedly)


def main() -> None:
    """
    Command-line handler for the ``grep_in_openxml`` tool.
    Use the ``--help`` option for help.
    """
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
                            description="""
Performs a grep (global-regular-expression-print) search of files in OpenXML
format, which is to say inside ZIP files.

Note that you can chain; for example, to search for OpenXML files containing
both "armadillo" and "bonobo", you can do:

    grep_in_openxml -l armadillo *.pptx | grep_in_openxml -x -l bonobo
                    ^^                                    ^^
                print filenames                       read filenames from stdin

""")
    parser.add_argument("pattern", help="Regular expression pattern to apply.")
    parser.add_argument(
        "filename",
        nargs="*",
        help="File(s) to check. You can also specify directores if you use "
        "--recursive")
    parser.add_argument(
        "--filenames_from_stdin",
        "-x",
        action="store_true",
        help="Take filenames from stdin instead, one line per filename "
        "(useful for chained grep).")
    parser.add_argument(
        "--recursive",
        action="store_true",
        help="Allow search to descend recursively into any directories "
        "encountered.")
    # Flag abbreviations to match grep:
    parser.add_argument("--ignore_case",
                        "-i",
                        action="store_true",
                        help="Ignore case")
    parser.add_argument("--invert_match",
                        "-v",
                        action="store_true",
                        help="Invert match")
    parser.add_argument("--files_with_matches",
                        "-l",
                        action="store_true",
                        help="Show filenames of files with matches")
    parser.add_argument("--files_without_match",
                        "-L",
                        action="store_true",
                        help="Show filenames of files with no match")
    parser.add_argument(
        "--grep_inner_file_name",
        action="store_true",
        help="Search the NAMES of the inner files, not their contents.")
    parser.add_argument(
        "--show_inner_file",
        action="store_true",
        help="For hits, show the filenames of inner files, within each ZIP.")
    parser.add_argument(
        "--nprocesses",
        type=int,
        default=multiprocessing.cpu_count(),
        help="Specify the number of processes to run in parallel.")
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Verbose output")
    args = parser.parse_args()
    main_only_quicksetup_rootlogger(
        level=logging.DEBUG if args.verbose else logging.INFO)
    if args.files_with_matches and args.files_without_match:
        raise ValueError("Can't specify both --files_with_matches (-l) and "
                         "--files_without_match (-L)!")
    if bool(args.filenames_from_stdin) == bool(args.filename):
        raise ValueError("Specify --filenames_from_stdin or filenames on the "
                         "command line, but not both")

    # Compile regular expression
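    # A pattern for inner-file NAMES is matched against str; a pattern for
    # inner-file CONTENTS is matched against bytes, so it must be encoded.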
    if args.grep_inner_file_name:
        final_pattern = args.pattern
    else:
        encoding = getdefaultencoding()
        final_pattern = args.pattern.encode(encoding)
    flags = re.IGNORECASE if args.ignore_case else 0
    log.debug("Using regular expression {!r} with flags {!r}", final_pattern,
              flags)
    regex = re.compile(final_pattern, flags)

    # Set up pool for parallel processing
    pool = multiprocessing.Pool(processes=args.nprocesses)

    # Iterate through files
    parse_kwargs = dict(regex=regex,
                        invert_match=args.invert_match,
                        files_with_matches=args.files_with_matches,
                        files_without_match=args.files_without_match,
                        grep_inner_file_name=args.grep_inner_file_name,
                        show_inner_file=args.show_inner_file)
    if args.filenames_from_stdin:
        for line in stdin.readlines():
            zipfilename = line.strip()
            parallel_kwargs = {'zipfilename': zipfilename}
            parallel_kwargs.update(**parse_kwargs)
            pool.apply_async(parse_zip, [], parallel_kwargs)
    else:
        for zipfilename in gen_filenames(starting_filenames=args.filename,
                                         recursive=args.recursive):
            parallel_kwargs = {'zipfilename': zipfilename}
            parallel_kwargs.update(**parse_kwargs)
            pool.apply_async(parse_zip, [], parallel_kwargs)
    pool.close()
    pool.join()


def deduplicate(directories: List[str], recursive: bool,
                dummy_run: bool) -> None:
    """
    De-duplicate files within one or more directories. Remove files
    that are identical to ones already considered.

    Args:
        directories: list of directories to process
        recursive: process subdirectories (recursively)?
        dummy_run: say what it'll do, but don't do it
    """
    # -------------------------------------------------------------------------
    # Catalogue files by their size
    # -------------------------------------------------------------------------
    # Maps file size to a list of filenames of that size.
    files_by_size = {}  # type: Dict[int, List[str]]
    num_considered = 0
    for filename in gen_filenames(directories, recursive=recursive):
        if not os.path.isfile(filename):
            continue
        size = os.stat(filename)[stat.ST_SIZE]
        a = files_by_size.setdefault(size, [])
        a.append(filename)
        num_considered += 1

    log.debug("files_by_size =\n{}", pformat(files_by_size))

    # -------------------------------------------------------------------------
    # By size, look for duplicates using a hash of the first part only
    # -------------------------------------------------------------------------
    log.info("Finding potential duplicates...")
    potential_duplicate_sets = []
    potential_count = 0
    sizes = list(files_by_size.keys())
    sizes.sort()
    for k in sizes:
        files_of_this_size = files_by_size[k]
        out_files = []  # type: List[str]
        # ... list of all files having >1 file per hash, for this size
        hashes = {}  # type: Dict[bytes, Union[bool, str]]
        # ... key is a hash; value is either True or a filename
        if len(files_of_this_size) == 1:
            continue
        log.info("Testing {} files of size {}...", len(files_of_this_size), k)
        for filename in files_of_this_size:
            if not os.path.isfile(filename):
                continue
            log.debug("Quick-scanning file: {}", filename)
            with open(filename, 'rb') as fd:
                hasher = md5()
                hasher.update(fd.read(INITIAL_HASH_SIZE))
                hash_value = hasher.digest()
                if hash_value in hashes:
                    # We have discovered the SECOND OR SUBSEQUENT hash match.
                    first_file_or_true = hashes[hash_value]
                    if first_file_or_true is not True:
                        # We have discovered the SECOND file;
                        # first_file_or_true contains the name of the FIRST.
                        out_files.append(first_file_or_true)
                        hashes[hash_value] = True
                    out_files.append(filename)
                else:
                    # We have discovered the FIRST file with this hash.
                    hashes[hash_value] = filename
        if out_files:
            potential_duplicate_sets.append(out_files)
            potential_count = potential_count + len(out_files)

    del files_by_size

    log.info(
        "Found {} potential duplicates in {} sets, based on hashing the "
        "first {} bytes of each...", potential_count,
        len(potential_duplicate_sets), INITIAL_HASH_SIZE)

    log.debug("potential_duplicate_sets =\n{}",
              pformat(potential_duplicate_sets))

    # -------------------------------------------------------------------------
    # Within each set, check for duplicates using a hash of the entire file
    # -------------------------------------------------------------------------
    log.info("Scanning for real duplicates...")

    num_scanned = 0
    num_to_scan = sum(len(one_set) for one_set in potential_duplicate_sets)
    duplicate_sets = []  # type: List[List[str]]
    for one_set in potential_duplicate_sets:
        out_files = []  # type: List[str]
        hashes = {}
        for filename in one_set:
            num_scanned += 1
            log.info("Scanning file [{}/{}]: {}", num_scanned, num_to_scan,
                     filename)
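            # Hash the entire file in fixed-size chunks, so memory use stays
            # bounded even for very large files.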
            with open(filename, 'rb') as fd:
                hasher = md5()
                while True:
                    r = fd.read(MAIN_READ_CHUNK_SIZE)
                    if len(r) == 0:
                        break
                    hasher.update(r)
            hash_value = hasher.digest()
            if hash_value in hashes:
                if not out_files:
                    out_files.append(hashes[hash_value])
                out_files.append(filename)
            else:
                hashes[hash_value] = filename
        if out_files:
            duplicate_sets.append(out_files)

    log.debug("duplicate_sets = \n{}", pformat(duplicate_sets))

    num_originals = 0
    num_deleted = 0
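    # Within each duplicate set, keep the first file as the original and
    # delete (or merely report, for a dummy run) the rest.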
    for d in duplicate_sets:
        print(f"Original is: {d[0]}")
        num_originals += 1
        for f in d[1:]:
            if dummy_run:
                print(f"Would delete: {f}")
            else:
                print(f"Deleting: {f}")
                os.remove(f)
            num_deleted += 1
        print()

    num_unique = num_considered - (num_originals + num_deleted)
    print(
        "{action} {d} duplicates, leaving {o} originals (and {u} unique files "
        "not touched; {c} files considered in total)".format(
            action="Would delete" if dummy_run else "Deleted",
            d=num_deleted,
            o=num_originals,
            u=num_unique,
            c=num_considered))


def main() -> None:
    """
    Command-line handler for the ``find_bad_openxml`` tool.
    Use the ``--help`` option for help.
    """
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
                            description="""
Tool to scan rescued Microsoft Office OpenXML files (produced by the
find_recovered_openxml.py tool in this kit; q.v.) and detect bad (corrupted)
ones.
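
For example (path is illustrative only):

    find_bad_openxml.py --recursive /rescued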
        """)
    parser.add_argument(
        "filename",
        nargs="*",
        help="File(s) to check. You can also specify directores if you use "
        "--recursive")
    parser.add_argument(
        "--filenames_from_stdin",
        "-x",
        action="store_true",
        help="Take filenames from stdin instead, one line per filename "
        "(useful for chained grep).")
    parser.add_argument(
        "--recursive",
        action="store_true",
        help="Allow search to descend recursively into any directories "
        "encountered.")
    parser.add_argument(
        "--skip_files",
        nargs="*",
        default=[],
        help="File pattern(s) to skip. You can specify wildcards like '*.txt' "
        "(but you will have to enclose that pattern in quotes under "
        "UNIX-like operating systems). The basename of each file will be "
        "tested against these filenames/patterns. Consider including "
        "Scalpel's 'audit.txt'.")
    parser.add_argument("--good",
                        action="store_true",
                        help="List good files, not bad")
    parser.add_argument(
        "--delete_if_bad",
        action="store_true",
        help="If a file is found to be bad, delete it. DANGEROUS.")
    parser.add_argument(
        "--run_repeatedly",
        type=int,
        help="Run the tool repeatedly with a pause of <run_repeatedly> "
        "seconds between runs. (For this to work well with the move/"
        "delete options, you should specify one or more DIRECTORIES in "
        "the 'filename' arguments, not files, and you will need the "
        "--recursive option.)")
    parser.add_argument(
        "--nprocesses",
        type=int,
        default=multiprocessing.cpu_count(),
        help="Specify the number of processes to run in parallel.")
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Verbose output")
    args = parser.parse_args()
    main_only_quicksetup_rootlogger(
        level=logging.DEBUG if args.verbose else logging.INFO,
        with_process_id=True)
    if bool(args.filenames_from_stdin) == bool(args.filename):
        raise ValueError("Specify --filenames_from_stdin or filenames on the "
                         "command line, but not both")
    if args.filenames_from_stdin and args.run_repeatedly:
        raise ValueError("Can't use both --filenames_from_stdin and "
                         "--run_repeatedly")

    # Repeated scanning loop
    while True:
        log.debug("Starting scan.")
        log.debug("- Scanning files/directories {!r}{}", args.filename,
                  " recursively" if args.recursive else "")
        log.debug("- Skipping files matching {!r}", args.skip_files)
        log.debug("- Using {} simultaneous processes", args.nprocesses)
        log.debug("- Reporting {} filenames", "good" if args.good else "bad")
        if args.delete_if_bad:
            log.warning("- Deleting bad OpenXML files.")

        # Iterate through files
        pool = multiprocessing.Pool(processes=args.nprocesses)

        if args.filenames_from_stdin:
            generator = gen_from_stdin()
        else:
            generator = gen_filenames(starting_filenames=args.filename,
                                      recursive=args.recursive)

        for filename in generator:
            src_basename = os.path.basename(filename)
            if any(
                    fnmatch.fnmatch(src_basename, pattern)
                    for pattern in args.skip_files):
                log.debug("Skipping file as ordered: " + filename)
                continue
            exists, locked = exists_locked(filename)
            if locked or not exists:
                log.debug("Skipping currently inaccessible file: " + filename)
                continue
            kwargs = {
                'filename': filename,
                'print_good': args.good,
                'delete_if_bad': args.delete_if_bad,
            }
            # log.critical("start")
            pool.apply_async(process_openxml_file, [], kwargs)
            # result = pool.apply_async(process_file, [], kwargs)
            # result.get()  # will re-raise any child exceptions
            # ... but it waits for the process to complete! That's no help.
            # log.critical("next")
            # ... https://stackoverflow.com/questions/22094852/how-to-catch-exceptions-in-workers-in-multiprocessing  # noqa
        pool.close()
        pool.join()

        log.debug("Finished scan.")
        if args.run_repeatedly is None:
            break
        log.info("Sleeping for {} s...", args.run_repeatedly)
        sleep(args.run_repeatedly)