def test_delete(self):
    """Files with remaining duplicates can be deleted; the last copy cannot.

    Builds four identical files, scans them, deletes all but one (verifying
    os.remove is invoked with the absolute path each time), then checks that
    deleting the final copy raises and does NOT touch the filesystem.
    """
    with DataGenerator() as test_scenario:
        duplicates = ('1/a.data', '2/a.data', '3/a.data', '4/a.data')
        test_scenario.create_duplicates(duplicates, size=10)

        connection_string = ':memory:'
        with connection_factory(connection_string) as conn, \
                repository(conn) as repo, patch('os.remove') as mock_remove:

            DupScanner(repo).scan((test_scenario.root_path,))

            # Every file but the last still has duplicates, so each delete
            # must succeed and call os.remove with the absolute path.
            for f in duplicates[:-1]:
                deleteable = test_scenario.abs_path(f)
                repo.delete_file(deleteable)
                mock_remove.assert_called_with(deleteable)
                mock_remove.reset_mock()

            # The sole remaining copy must be refused.
            # BUG FIX: the original put `assert False` inside the try, so a
            # missing exception was swallowed by the broad `except` and the
            # test failed on the repr comparison instead of the real cause.
            not_deleteable = test_scenario.abs_path(duplicates[-1])
            raised = None
            try:
                repo.delete_file(not_deleteable)
            except Exception as e:
                raised = e
            assert raised is not None, 'Should have raised an exception'
            expected = Exception('409 Can\'t delete a file without duplicates: {}'.format(not_deleteable))
            assert repr(expected) == repr(raised)

            assert not mock_remove.called, 'File should have not been deleted'
            # BUG FIX: was `mock_path.reset_mock` — attribute access without
            # the call, a silent no-op.
            mock_remove.reset_mock()
# Exemple #2 (scraper artifact; vote-count line removed)
    def test_delete(self):
        """Files with remaining duplicates can be deleted; the last copy cannot.

        Builds four identical files, scans them, deletes all but one (verifying
        os.remove is invoked with the absolute path each time), then checks
        that deleting the final copy raises and does NOT touch the filesystem.
        """
        with DataGenerator() as test_scenario:
            duplicates = ('1/a.data', '2/a.data', '3/a.data', '4/a.data')
            test_scenario.create_duplicates(duplicates, size=10)

            connection_string = ':memory:'
            with connection_factory(connection_string) as conn, \
                    repository(conn) as repo, patch('os.remove') as mock_remove:

                DupScanner(repo).scan((test_scenario.root_path, ))

                # Every file but the last still has duplicates, so each delete
                # must succeed and call os.remove with the absolute path.
                for f in duplicates[:-1]:
                    deleteable = test_scenario.abs_path(f)
                    repo.delete_file(deleteable)
                    mock_remove.assert_called_with(deleteable)
                    mock_remove.reset_mock()

                # The sole remaining copy must be refused.
                # BUG FIX: the original put `assert False` inside the try, so
                # a missing exception was swallowed by the broad `except` and
                # the test failed on the repr comparison instead.
                not_deleteable = test_scenario.abs_path(duplicates[-1])
                raised = None
                try:
                    repo.delete_file(not_deleteable)
                except Exception as e:
                    raised = e
                assert raised is not None, 'Should have raised an exception'
                expected = Exception(
                    '409 Can\'t delete a file without duplicates: {}'.
                    format(not_deleteable))
                assert repr(expected) == repr(raised)

                assert not mock_remove.called, 'File should have not been deleted'
                # BUG FIX: was `mock_path.reset_mock` — attribute access
                # without the call, a silent no-op.
                mock_remove.reset_mock()
    def test_false_duplicates_in_path(self):
        """Scanning a directory plus a symlink to it must report every file as unique.

        The same physical files are visible under two scan roots ('1' and
        'links/1'), yet none of them should be flagged as duplicates.
        """
        with DataGenerator() as scenario:
            duplicates_expected = set()

            uniques_expected = set()
            for name, size in (('1/a.data', 4097), ('1/b.data', 4096),
                               ('1/c.data', 512), ('1/e.data', 4069)):
                uniques_expected.add(scenario.create_file(name, size=size))
            for name in ('1/x.data', '1/xx.data'):
                uniques_expected.add(scenario.create_file(name, size=2048, readable=False))

            # TODO: This behavior is not consistent with don't follow links
            scenario.symlink('1/', 'links/1')
            for stem in ('a', 'b', 'c', 'e', 'x', 'xx'):
                uniques_expected.add(path.join(scenario.root_path, 'links/1/' + stem + '.data'))

            connection_string = ':memory:'
            with connection_factory(connection_string) as conn, repository(conn) as repo:
                # DupScanner(repo).scan((scenario.root_path,))
                DupScanner(repo).scan((
                    path.join(scenario.root_path, '1'),
                    path.join(scenario.root_path, 'links'),
                ))

                # Row layout: (hash, size, fullname, path, abspath); compare by fullname.
                duplicates_found = {row[2] for row in repo.findBy_duplicate_hash()}

                duplicates_missing = duplicates_expected - duplicates_found
                assert not duplicates_missing, 'Expected duplicate elements were not found: {}'.format(duplicates_missing)

                duplicates_unexpected = duplicates_found - duplicates_expected
                assert not duplicates_unexpected, 'Unexpected duplicate elements were found: {}'.format(duplicates_unexpected)

                assert duplicates_expected == duplicates_found, \
                    'Expected duplicate set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(duplicates_expected, duplicates_found)

                uniques_found = {row[2] for row in repo.findBy_unique_hash()}

                uniques_missing = uniques_expected - uniques_found
                assert not uniques_missing, 'Expected unique elements were not found: {}'.format(uniques_missing)

                uniques_unexpected = uniques_found - uniques_expected
                assert not uniques_unexpected, 'Unexpected unique elements were found: {}'.format(uniques_unexpected)

                assert uniques_expected == uniques_found, \
                    'Expected unique set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(uniques_expected, uniques_found)
    def test_dont_follow_links(self):
        """Symlinks to files are skipped: only the real files appear, all unique."""
        with DataGenerator() as scenario:
            duplicates_expected = set()

            uniques_expected = set()
            for name, size in (('1/a.data', 4097), ('2/b.data', 4096),
                               ('3/c.data', 512), ('3/e.data', 4069)):
                uniques_expected.add(scenario.create_file(name, size=size))
            for name in ('4/x.data', '4/xx.data'):
                uniques_expected.add(scenario.create_file(name, size=2048, readable=False))

            # Links to regular files; the scanner must ignore every one of them.
            ignored_links = set()
            for target, link in (('1/a.data', '2/lnk-a.data'),
                                 ('2/b.data', '1/lnk-b.data'),
                                 ('4/x.data', '4/lnk-x.data'),
                                 ('4/xx.data', '4/lnk-xx.data')):
                ignored_links.add(scenario.symlink(target, link))

            connection_string = ':memory:'
            with connection_factory(connection_string) as conn, repository(conn) as repo:
                DupScanner(repo).scan((scenario.root_path,))

                # Row layout: (hash, size, fullname, path, abspath); compare by fullname.
                duplicates_found = {row[2] for row in repo.findBy_duplicate_hash()}

                duplicates_missing = duplicates_expected - duplicates_found
                assert not duplicates_missing, 'Expected duplicate elements were not found: {}'.format(duplicates_missing)

                duplicates_unexpected = duplicates_found - duplicates_expected
                assert not duplicates_unexpected, 'Unexpected duplicate elements were found: {}'.format(duplicates_unexpected)

                assert duplicates_expected == duplicates_found, \
                    'Expected duplicate set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(duplicates_expected, duplicates_found)

                uniques_found = {row[2] for row in repo.findBy_unique_hash()}

                uniques_missing = uniques_expected - uniques_found
                assert not uniques_missing, 'Expected unique elements were not found: {}'.format(uniques_missing)

                uniques_unexpected = uniques_found - uniques_expected
                assert not uniques_unexpected, 'Unexpected unique elements were found: {}'.format(uniques_unexpected)

                assert uniques_expected == uniques_found, \
                    'Expected unique set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(uniques_expected, uniques_found)

                # No scanned entry may be one of the symlinks we created.
                links_unexpected = (uniques_found | duplicates_found) & ignored_links

                assert not links_unexpected, 'Unexpected links found'
    def test_happy_path_with_nested_dirs(self):
        """Overlapping scan roots (root plus a nested dir) still yield correct sets."""
        with DataGenerator() as scenario:
            duplicates_expected = set()
            for names, size in (
                    (('1/a.data', '2/a.data', '3/a.data', '4/a.data'), 4097),  # 4097 = block size + 1
                    (('2/aa.data', '4/aa.data'), 4096),
                    (('1/aaa.data', '4/aaa.data'), 1024),
                    (('1/aaaa.data', '3/aaaa.data'), 0)):
                duplicates_expected |= scenario.create_duplicates(names, size=size)

            uniques_expected = set()
            for name, size in (('1/b.data', 4097), ('2/c.data', 4096),
                               ('3/d.data', 512), ('1/e.data', 4097)):
                uniques_expected.add(scenario.create_file(name, size=size))
            for name in ('4/x.data', '4/xx.data'):
                uniques_expected.add(scenario.create_file(name, size=2048, readable=False))

            connection_string = ':memory:'
            with connection_factory(connection_string) as conn, repository(conn) as repo:
                # '1/' is nested inside root_path, so its files are reachable via both roots.
                DupScanner(repo).scan((scenario.root_path, scenario.abs_path('1/')))

                # Row layout: (hash, size, fullname, path, abspath); compare by abspath.
                duplicates_found = {row[4] for row in repo.findBy_duplicate_hash()}

                duplicates_missing = duplicates_expected - duplicates_found
                assert not duplicates_missing, 'Expected duplicate elements were not found: {}'.format(duplicates_missing)

                duplicates_unexpected = duplicates_found - duplicates_expected
                assert not duplicates_unexpected, 'Unexpected duplicate elements were found: {}'.format(duplicates_unexpected)

                assert duplicates_expected == duplicates_found, 'Expected duplicate set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(duplicates_expected, duplicates_found)

                uniques_found = {row[4] for row in repo.findBy_unique_hash()}

                uniques_missing = uniques_expected - uniques_found
                assert not uniques_missing, 'Expected unique elements were not found: {}'.format(uniques_missing)

                uniques_unexpected = uniques_found - uniques_expected
                assert not uniques_unexpected, 'Unexpected unique elements were found: {}'.format(uniques_unexpected)

                assert uniques_expected == uniques_found, 'Expected unique set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(uniques_expected, uniques_found)
# Exemple #6 (scraper artifact; vote-count line removed)
    def test_false_duplicates_in_path(self):
        """Scanning a directory plus a symlink to it must report every file as unique.

        NOTE(review): this appears to be a reformatted duplicate of the earlier
        test_false_duplicates_in_path — confirm whether both copies are needed.
        """
        with DataGenerator() as test_scenario:
            # No duplicates are expected at all; everything is a single physical file.
            duplicates_expected = set()

            uniques_expected = set()
            uniques_expected.add(
                test_scenario.create_file('1/a.data', size=4097))
            uniques_expected.add(
                test_scenario.create_file('1/b.data', size=4096))
            uniques_expected.add(
                test_scenario.create_file('1/c.data', size=512))
            uniques_expected.add(
                test_scenario.create_file('1/e.data', size=4069))
            uniques_expected.add(
                test_scenario.create_file('1/x.data',
                                          size=2048,
                                          readable=False))
            uniques_expected.add(
                test_scenario.create_file('1/xx.data',
                                          size=2048,
                                          readable=False))

            # TODO: This behavior is not consistent with don't follow links
            # The same files are also expected once more under the symlinked root.
            test_scenario.symlink('1/', 'links/1')
            uniques_expected.add(
                path.join(test_scenario.root_path, 'links/1/a.data'))
            uniques_expected.add(
                path.join(test_scenario.root_path, 'links/1/b.data'))
            uniques_expected.add(
                path.join(test_scenario.root_path, 'links/1/c.data'))
            uniques_expected.add(
                path.join(test_scenario.root_path, 'links/1/e.data'))
            uniques_expected.add(
                path.join(test_scenario.root_path, 'links/1/x.data'))
            uniques_expected.add(
                path.join(test_scenario.root_path, 'links/1/xx.data'))

            connection_string = ':memory:'
            with connection_factory(connection_string) as conn, repository(
                    conn) as repo:
                # DupScanner(repo).scan((test_scenario.root_path,))
                # Scan the real directory and the symlinked view of it.
                DupScanner(repo).scan((
                    path.join(test_scenario.root_path, '1'),
                    path.join(test_scenario.root_path, 'links'),
                ))

                # Rows are (hash, size, fullname, path, abspath); keep fullname.
                duplicates_found = {
                    fullname
                    for hash, size, fullname, path, abspath in
                    repo.findBy_duplicate_hash()
                }

                duplicates_missing = duplicates_expected - duplicates_found
                assert not duplicates_missing, 'Expected duplicate elements were not found: {}'.format(
                    duplicates_missing)

                duplicates_unexpected = duplicates_found - duplicates_expected
                assert not duplicates_unexpected, 'Unexpected duplicate elements were found: {}'.format(
                    duplicates_unexpected)

                assert duplicates_expected == duplicates_found, \
                       'Expected duplicate set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(duplicates_expected, duplicates_found)

                uniques_found = {
                    fullname
                    for hash, size, fullname, path, abspath in
                    repo.findBy_unique_hash()
                }

                uniques_missing = uniques_expected - uniques_found
                assert not uniques_missing, 'Expected unique elements were not found: {}'.format(
                    uniques_missing)

                uniques_unexpected = uniques_found - uniques_expected
                assert not uniques_unexpected, 'Unexpected unique elements were found: {}'.format(
                    uniques_unexpected)

                assert uniques_expected == uniques_found, \
                       'Expected unique set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(uniques_expected, uniques_found)
# Exemple #7 (scraper artifact; vote-count line removed)
    def test_dont_follow_links(self):
        """Symlinks to files are skipped: only the real files appear, all unique.

        NOTE(review): this appears to be a reformatted duplicate of the earlier
        test_dont_follow_links — confirm whether both copies are needed.
        """
        with DataGenerator() as test_scenario:
            # No duplicates are expected; every regular file has unique content.
            duplicates_expected = set()

            uniques_expected = set()
            uniques_expected.add(
                test_scenario.create_file('1/a.data', size=4097))
            uniques_expected.add(
                test_scenario.create_file('2/b.data', size=4096))
            uniques_expected.add(
                test_scenario.create_file('3/c.data', size=512))
            uniques_expected.add(
                test_scenario.create_file('3/e.data', size=4069))
            uniques_expected.add(
                test_scenario.create_file('4/x.data',
                                          size=2048,
                                          readable=False))
            uniques_expected.add(
                test_scenario.create_file('4/xx.data',
                                          size=2048,
                                          readable=False))

            # Links to regular files; the scanner must ignore every one of them.
            ignored_links = set()
            ignored_links.add(test_scenario.symlink('1/a.data',
                                                    '2/lnk-a.data'))
            ignored_links.add(test_scenario.symlink('2/b.data',
                                                    '1/lnk-b.data'))
            ignored_links.add(test_scenario.symlink('4/x.data',
                                                    '4/lnk-x.data'))
            ignored_links.add(
                test_scenario.symlink('4/xx.data', '4/lnk-xx.data'))

            connection_string = ':memory:'
            with connection_factory(connection_string) as conn, repository(
                    conn) as repo:
                DupScanner(repo).scan((test_scenario.root_path, ))

                # Rows are (hash, size, fullname, path, abspath); keep fullname.
                duplicates_found = {
                    fullname
                    for hash, size, fullname, path, abspath in
                    repo.findBy_duplicate_hash()
                }

                duplicates_missing = duplicates_expected - duplicates_found
                assert not duplicates_missing, 'Expected duplicate elements were not found: {}'.format(
                    duplicates_missing)

                duplicates_unexpected = duplicates_found - duplicates_expected
                assert not duplicates_unexpected, 'Unexpected duplicate elements were found: {}'.format(
                    duplicates_unexpected)

                assert duplicates_expected == duplicates_found, \
                       'Expected duplicate set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(duplicates_expected, duplicates_found)

                uniques_found = {
                    fullname
                    for hash, size, fullname, path, abspath in
                    repo.findBy_unique_hash()
                }

                uniques_missing = uniques_expected - uniques_found
                assert not uniques_missing, 'Expected unique elements were not found: {}'.format(
                    uniques_missing)

                uniques_unexpected = uniques_found - uniques_expected
                assert not uniques_unexpected, 'Unexpected unique elements were found: {}'.format(
                    uniques_unexpected)

                assert uniques_expected == uniques_found, \
                       'Expected unique set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(uniques_expected, uniques_found)

                # No scanned entry may be one of the symlinks created above.
                links_unexpected = (uniques_found
                                    | duplicates_found) & ignored_links

                assert not links_unexpected, 'Unexpected links found'
# Exemple #8 (scraper artifact; vote-count line removed)
    def test_happy_path_with_nested_dirs(self):
        """Overlapping scan roots (root plus a nested dir) still yield correct sets.

        NOTE(review): this appears to be a reformatted duplicate of the earlier
        test_happy_path_with_nested_dirs — confirm whether both copies are needed.
        """
        with DataGenerator() as test_scenario:
            duplicates_expected = test_scenario.create_duplicates(
                ('1/a.data', '2/a.data', '3/a.data', '4/a.data'),
                size=4097)  # 4097 = block size + 1
            duplicates_expected |= test_scenario.create_duplicates(
                ('2/aa.data', '4/aa.data'), size=4096)
            duplicates_expected |= test_scenario.create_duplicates(
                ('1/aaa.data', '4/aaa.data'), size=1024)
            duplicates_expected |= test_scenario.create_duplicates(
                ('1/aaaa.data', '3/aaaa.data'), size=0)

            uniques_expected = set()
            uniques_expected.add(
                test_scenario.create_file('1/b.data', size=4097))
            uniques_expected.add(
                test_scenario.create_file('2/c.data', size=4096))
            uniques_expected.add(
                test_scenario.create_file('3/d.data', size=512))
            uniques_expected.add(
                test_scenario.create_file('1/e.data', size=4097))
            uniques_expected.add(
                test_scenario.create_file('4/x.data',
                                          size=2048,
                                          readable=False))
            uniques_expected.add(
                test_scenario.create_file('4/xx.data',
                                          size=2048,
                                          readable=False))

            connection_string = ':memory:'
            with connection_factory(connection_string) as conn, repository(
                    conn) as repo:
                # '1/' is nested inside root_path, so its files are reachable
                # through both scan roots.
                DupScanner(repo).scan(
                    (test_scenario.root_path, test_scenario.abs_path('1/')))

                # Rows are (hash, size, fullname, path, abspath); keep abspath.
                duplicates_found = {
                    abspath
                    for hash, size, fullname, path, abspath in
                    repo.findBy_duplicate_hash()
                }

                duplicates_missing = duplicates_expected - duplicates_found
                assert not duplicates_missing, 'Expected duplicate elements were not found: {}'.format(
                    duplicates_missing)

                duplicates_unexpected = duplicates_found - duplicates_expected
                assert not duplicates_unexpected, 'Unexpected duplicate elements were found: {}'.format(
                    duplicates_unexpected)

                assert duplicates_expected == duplicates_found, 'Expected duplicate set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(
                    duplicates_expected, duplicates_found)

                uniques_found = {
                    abspath
                    for hash, size, fullname, path, abspath in
                    repo.findBy_unique_hash()
                }

                uniques_missing = uniques_expected - uniques_found
                assert not uniques_missing, 'Expected unique elements were not found: {}'.format(
                    uniques_missing)

                uniques_unexpected = uniques_found - uniques_expected
                assert not uniques_unexpected, 'Unexpected unique elements were found: {}'.format(
                    uniques_unexpected)

                assert uniques_expected == uniques_found, 'Expected unique set doesn\'t match found set. \n Expected: {}\n Found: {}'.format(
                    uniques_expected, uniques_found)
# Exemple #9 (scraper artifact; vote-count line removed)
 def setUp(self):
     """Create an in-memory SQLite connection and a repository with a fresh schema."""
     self.conn = sqlite3.connect(":memory:")
     repo = dupscanner.repository(self.conn)
     repo.create_schema()
     self.repo = repo
# Exemple #10 (scraper artifact; vote-count line removed)
 def setUp(self):
     """Open an in-memory SQLite database and initialize the repository schema."""
     connection = sqlite3.connect(":memory:")
     self.conn = connection
     self.repo = dupscanner.repository(connection)
     self.repo.create_schema()
# Exemple #11 (scraper artifact; vote-count line removed)
def main():
  """Command-line entry point: scan paths for duplicate (or unique) files.

  Parses arguments, configures logging, runs the scanner against a SQLite
  database (in-memory by default) and dispatches the results to exactly one
  of the mutually exclusive output modes (template print, evaluate, script,
  pretty print, or interactive server).
  """
  parser = argparse.ArgumentParser()
  parser.set_defaults(action='duplicates')
  parser.add_argument("path", help="Path where to look for duplicates", nargs='+')
  parser.add_argument("-d", "--database", help="Stores a temporary SQLite database in a file", default=":memory:")
  parser.add_argument("-lf", "--log-format", help="Logging format", default='%(message)s')
  parser.add_argument("-l", "--log", help="File to output the log messages")
  parser.add_argument("-u", "--unique", help="Find unique files", action="store_true")
  # BUG FIX: was `type=file('w', encoding='UTF-8')` — `file` does not exist in
  # Python 3; argparse.FileType is the factory intended here ('-' maps to stdout).
  parser.add_argument("-o", "--output-file", help="Output file (default: stdout)", default='-',
                      type=argparse.FileType('w', encoding='UTF-8'))
  g = parser.add_mutually_exclusive_group()
  g.add_argument(
    "-t",
    "--template",
    help="""Output template. Variables ${hash}, ${size}, ${fullname}, ${path}, ${abspath}, ${realpath} 
       will be replaced with the actual values""",
    default="${hash}\t${size}\t${fullname}"
  )
  g.add_argument(
    "-e",
    "--evaluate",
    help="""For each result, evaluate the given python code to process the output.
        Variables hash, size, filename and output_file will be bounded to the appropiate values"""
  )
  g.add_argument(
    "-x",
    "--execute-script",
    help="""Executes the given python script to process the results.
        Variables results and output_file will be bounded to an iterator and the appropiate output stream"""
  )
  g.add_argument(
    "-p",
    "--pretty-print",
    action="store_true",
    help="Groups the results by hash and file size and displays a pretty output"
  )
  g.add_argument(
    "-i",
    "--interactive",
    action="store_true",
    help="Interactive mode"
  )
  parser.add_argument(
    "-v",
    "--verbosity",
    help="Verbosity level (default: WARN)",
    default='WARN',
    choices=['DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL'],
    type=lambda level: level.upper()  # accept lowercase level names
  )
  args = parser.parse_args()

  logging.basicConfig(
    level=args.verbosity,
    format=args.log_format,
    filename=args.log
  )

  connection_string = args.database
  path = args.path
  template = args.template

  # The output file is a context manager too, so it is closed with the DB.
  with connection_factory(connection_string) as conn, repository(conn) as repo, args.output_file as output_file:
    dupscanner = DupScanner(repo)

    results = dupscanner.find_unique(path) if args.unique else dupscanner.find_duplicates(path)

    # Dispatch to exactly one output mode; the group guarantees exclusivity.
    # (The return values of these helpers were previously bound to an unused
    # `func` variable — they are called for their side effects only.)
    if args.execute_script:
      exec_script(args.execute_script, output_file, results, conn, repo)
    elif args.interactive:
      run_server(repo)
    elif args.evaluate:
      exec_command(args.evaluate, output_file, results)
    elif args.pretty_print:
      pretty_print(results, output_file)
    else:
      plain_print(template, results, output_file)