Exemple #1
0
 def get_dupe_groups(self, files, j=job.nulljob):
     """Scan ``files`` for duplicates and return the prioritized dupe groups.

     Matches come from ``self._getmatches``, are stripped of "false matches", are checked
     against ``self.ignore_list`` (when it is truthy) and are then grouped with
     ``engine.get_groups``. Groups that contain only reference files are dropped, and every
     remaining group is prioritized with ``self._key_func`` and ``self._tie_breaker``.

     :param files: files to scan. Any file lacking an ``is_ref`` attribute gets
         ``is_ref = False`` assigned.
     :param j: job object used for progress reporting.
     :returns: list of groups, each holding at least one non-ref file.

     Side effect: ``self.discarded_file_count`` is updated.
     """
     j = j.start_subjob([8, 2])
     for f in files:
         if not hasattr(f, 'is_ref'):
             f.is_ref = False
     files = remove_dupe_paths(files)
     logging.info("Getting matches. Scan type: %d", self.scan_type)
     matches = self._getmatches(files, j)
     logging.info('Found %d matches', len(matches))
     j.set_progress(100, tr("Removing false matches"))
     # "False matches" are removed in several passes:
     #   1. when scanning by folders, matches between folders that are themselves nested in an
     #      already-seen folder ("duplicated duplicates", if you will);
     #   2. matches mixing file kinds, unless that option is enabled;
     #   3. matches for which one of the two files no longer exists;
     #   4. matches where both files are reference files.
     if self.scan_type == ScanType.Folders and matches:
         allpath = {m.first.path for m in matches}
         allpath |= {m.second.path for m in matches}
         sortedpaths = sorted(allpath)
         toremove = set()
         # This walk relies on sorted order listing a folder before the folders it contains.
         last_parent_path = sortedpaths[0]
         for p in sortedpaths[1:]:
             # NOTE(review): assumes the path type's ``in`` means "p is located under
             # last_parent_path" -- confirm against the path class in use.
             if p in last_parent_path:
                 toremove.add(p)
             else:
                 last_parent_path = p
         # A match is dropped only when *both* of its paths were flagged.
         matches = [m for m in matches if m.first.path not in toremove or m.second.path not in toremove]
     if not self.mix_file_kind:
         matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
     matches = [m for m in matches if m.first.path.exists() and m.second.path.exists()]
     matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
     if self.ignore_list:
         j = j.start_subjob(2)
         iter_matches = j.iter_with_progress(matches, tr("Processed %d/%d matches against the ignore list"))
         matches = [
             m for m in iter_matches
             if not self.ignore_list.AreIgnored(str(m.first.path), str(m.second.path))
         ]
     logging.info('Grouping matches')
     groups = engine.get_groups(matches, j)
     if self.scan_type in {ScanType.Filename, ScanType.Fields, ScanType.FieldsNoOrder, ScanType.Tag}:
         # The deduplicated file list is only needed for the discarded count, so it is built
         # here rather than unconditionally.
         matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
         self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
     else:
         # Ticket #195
         # To speed up the scan, we don't bother comparing contents of files that are both ref
         # files. However, this messes up "discarded" counting because there's a missing match
         # in cases where we end up with a dupe group anyway (with a non-ref file). Because it's
         # impossible to have discarded matches in exact dupe scans, we simply set it at 0, thus
         # bypassing our tricky problem.
         # Also, although ScanType.FuzzyBlock is not always doing exact comparisons, we also
         # bypass ref comparison, thus messing up with our "discarded" count. So we're
         # effectively disabling the "discarded" feature in PE, but it's better than falsely
         # reporting discarded matches.
         self.discarded_file_count = 0
     # Keep only groups that contain at least one non-reference file.
     groups = [g for g in groups if any(not f.is_ref for f in g)]
     logging.info('Created %d groups', len(groups))
     j.set_progress(100, tr("Doing group prioritization"))
     for g in groups:
         g.prioritize(self._key_func, self._tie_breaker)
     return groups
Exemple #2
0
 def get_dupe_groups(self, files, ignore_list=None, j=job.nulljob):
     """Scan ``files`` for duplicates and return the prioritized dupe groups.

     :param files: files to scan. Any file lacking an ``is_ref`` attribute gets
         ``is_ref = False`` assigned.
     :param ignore_list: optional object whose ``are_ignored(path_a, path_b)`` method (called
         with both paths converted to ``str``) tells whether a pair must be skipped. A falsy
         value disables this filtering.
     :param j: job object used for progress reporting.
     :returns: list of groups, each holding at least one non-ref file, already prioritized
         with ``self._key_func`` and ``self._tie_breaker``.

     Side effect: ``self.discarded_file_count`` is updated.
     """
     for candidate in files:
         if not hasattr(candidate, "is_ref"):
             candidate.is_ref = False
     files = remove_dupe_paths(files)
     logging.info("Getting matches. Scan type: %d", self.scan_type)
     matches = self._getmatches(files, j)
     logging.info("Found %d matches" % len(matches))
     j.set_progress(100, tr("Almost done! Fiddling with results..."))

     # Pass 1 (folder scans only): discard matches whose two folders are both nested inside a
     # folder that is itself part of a match.
     if self.scan_type == ScanType.FOLDERS and matches:
         involved_paths = {m.first.path for m in matches} | {m.second.path for m in matches}
         ordered_paths = sorted(involved_paths)
         nested_paths = set()
         current_parent = ordered_paths[0]
         for path in ordered_paths[1:]:
             if path in current_parent:
                 nested_paths.add(path)
             else:
                 current_parent = path
         matches = [
             m for m in matches
             if not (m.first.path in nested_paths and m.second.path in nested_paths)
         ]

     # Pass 2: unless mixing kinds is allowed, both files must share the same extension.
     if not self.mix_file_kind:
         same_kind = []
         for m in matches:
             if get_file_ext(m.first.name) == get_file_ext(m.second.name):
                 same_kind.append(m)
         matches = same_kind

     # Pass 3: both files must still exist.
     matches = [m for m in matches if m.first.path.exists() and m.second.path.exists()]

     # Pass 4: a pair made of two reference files is of no interest.
     matches = [m for m in matches if not m.first.is_ref or not m.second.is_ref]

     # Pass 5: honor the caller-supplied ignore list.
     if ignore_list:
         kept = []
         for m in matches:
             if not ignore_list.are_ignored(str(m.first.path), str(m.second.path)):
                 kept.append(m)
         matches = kept

     logging.info("Grouping matches")
     groups = engine.get_groups(matches)

     name_based_scans = {
         ScanType.FILENAME,
         ScanType.FIELDS,
         ScanType.FIELDSNOORDER,
         ScanType.TAG,
     }
     if self.scan_type in name_based_scans:
         matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
         grouped_total = sum(map(len, groups))
         self.discarded_file_count = len(matched_files) - grouped_total
     else:
         # Ticket #195
         # Contents of two ref files are never compared (to speed up the scan), which leaves a
         # match missing whenever a group still forms around a non-ref file and so skews the
         # "discarded" count. Exact dupe scans cannot have discarded matches anyway, so the
         # count is simply forced to 0.
         # ScanType.FuzzyBlock is not always an exact comparison, but it bypasses ref
         # comparison too. This effectively disables the "discarded" feature in PE, which is
         # still better than falsely reporting discarded matches.
         self.discarded_file_count = 0

     # Drop groups made up exclusively of reference files.
     groups = [group for group in groups if not all(member.is_ref for member in group)]
     logging.info("Created %d groups" % len(groups))
     for group in groups:
         group.prioritize(self._key_func, self._tie_breaker)
     return groups
Exemple #3
0
 def can_handle(cls, path):
     """Tell whether ``path`` can be handled by this class.

     The generic ``fs.File.can_handle`` check must pass, and the extension of ``path.name``
     (as returned by ``get_file_ext``) must be listed in ``cls.HANDLED_EXTS``.
     """
     base_verdict = fs.File.can_handle(path)
     if not base_verdict:
         # Hand back the base check's own falsy result unchanged.
         return base_verdict
     return get_file_ext(path.name) in cls.HANDLED_EXTS
Exemple #4
0
 def extension(self):
     """Return the extension of ``self.name`` as computed by ``get_file_ext``."""
     filename = self.name
     return get_file_ext(filename)
Exemple #5
0
 def extension(self):
     """Return what ``get_file_ext`` yields for this object's ``name``."""
     own_name = self.name
     ext = get_file_ext(own_name)
     return ext
Exemple #6
0
 def can_handle(cls, path):
     """Tell whether ``path`` can be handled by this class.

     Returns ``False`` when ``fs.File.can_handle`` rejects ``path``; otherwise returns whether
     the extension of ``path.name`` is a key/member of ``auto.EXT2CLASS``.
     """
     return (get_file_ext(path.name) in auto.EXT2CLASS) if fs.File.can_handle(path) else False
Exemple #7
0
 def can_handle(cls, path):
     """Tell whether ``path`` can be handled by this class.

     Requires both the generic ``fs.File.can_handle`` check to pass and the extension of
     ``path.name`` to be listed in ``cls.HANDLED_EXTS``. When the generic check fails, its own
     (falsy) result is returned as-is.
     """
     generic_ok = fs.File.can_handle(path)
     return (get_file_ext(path.name) in cls.HANDLED_EXTS) if generic_ok else generic_ok
Exemple #8
0
 def can_handle(cls, path):
     """Tell whether ``path`` can be handled by this class.

     ``path`` qualifies when ``fs.File.can_handle`` accepts it and the extension of
     ``path.name`` is found in ``SUPPORTED_EXTS``; otherwise ``False`` is returned.
     """
     if fs.File.can_handle(path):
         return get_file_ext(path.name) in SUPPORTED_EXTS
     return False
Exemple #9
0
 def can_handle(cls, path):
     """Tell whether ``path`` can be handled by this class.

     The answer is ``False`` unless ``fs.File.can_handle`` accepts ``path``, in which case it
     is whether the extension of ``path.name`` is found in ``auto.EXT2CLASS``.
     """
     handled = False
     if fs.File.can_handle(path):
         handled = get_file_ext(path.name) in auto.EXT2CLASS
     return handled
Exemple #10
0
 def _create_sub_file(self, name, with_parent=True):
     parent = self if with_parent else None
     ext = get_file_ext(name)
     if ext in ASSOC:
         return ASSOC[ext](parent, name)