def search(file_list):
    """Find duplicate files in the provided list of files.

    :returns: a list of lists, where each inner list contains files
        with the same content; files without duplicates are dropped.

    Strategy: repeatedly take the first remaining file, collect every
    file matching its content into one group, and keep only groups
    with more than one member.
    """
    groups = []
    remaining = file_list
    while len(remaining) > 0:
        head = remaining[0]
        # One pass gathers the matches, a second pass keeps the rest,
        # so the loop is guaranteed to shrink `remaining` each round.
        matches = [f for f in remaining if compare(head, f)]
        if len(matches) > 1:
            groups.append(matches)
        remaining = [f for f in remaining if not compare(head, f)]
    return groups
def search(file_list):
    """Look for duplicate files in the provided list of files.

    :returns: a list of lists, where each inner list contains files
        with the same content.
    """
    result = []
    while file_list:
        first = file_list[0]
        # Everything matching the head file forms one candidate group.
        same = list(filter(lambda f: compare(first, f), file_list))
        # Non-matching files survive to the next round.
        file_list = list(filter(lambda f: not compare(first, f), file_list))
        if len(same) > 1:
            result.append(same)
    return result
def search(file_list):
    """Group the files in ``file_list`` by identical content.

    Each round pulls out all files matching the current head of the
    list; groups with at least two members are recorded.

    :returns: a list of lists of duplicate files.
    """
    list_of_lists = []
    while file_list:
        head = file_list[0]
        same_content = [f for f in file_list if compare(head, f)]
        if len(same_content) > 1:
            list_of_lists.append(same_content)
        # Shrink the working list so the loop terminates.
        file_list = [f for f in file_list if not compare(head, f)]
    return list_of_lists
def search(file_list):
    """Looking for duplicate files in the provided list of files

    :returns: a list of lists, where each list contains files with the
        same content.

    Basic search strategy:
    - until the provided list is empty, remove the 1st item
    - search the remaining list for its duplicates and put the item
      and all its duplicates into a new list
    - if that new list has more than one item, save it in the result
    """
    lol = []
    while 0 < len(file_list):
        h = file_list.pop(0)
        group = [h]
        rest = []
        # Scan the WHOLE remaining list: the original `break`-ed on the
        # first non-match, silently skipping any duplicate that appeared
        # after a non-duplicate in an unsorted list.
        for i in file_list:
            if compare(h, i):
                group.append(i)
            else:
                rest.append(i)
        # Drop the grouped files so they are not re-used as heads later.
        file_list = rest
        # Only real duplicate groups are reported (the original also
        # appended singleton groups, contradicting its own docstring).
        if 1 < len(group):
            lol.append(group)
    return lol
def search(file_list):
    """Look for duplicate files in the provided list of files.

    :returns: a list of lists, where each inner list contains files
        with the same content.

    Pops the first file off the list, then walks the remainder from
    the back popping out every file with matching content; groups of
    two or more files are collected into the result.
    """
    lol = []
    while 0 < len(file_list):
        duplicates = [file_list.pop(0)]
        idx = len(file_list) - 1
        # Walk backwards so pops never disturb indices yet to be visited.
        while idx >= 0:
            if compare(duplicates[0], file_list[idx]):
                duplicates.append(file_list.pop(idx))
            idx -= 1
        if 1 < len(duplicates):
            lol.append(duplicates)
    return lol
def faster_search(file_list):
    """Looking for duplicate files in the provided list of files

    :returns: a dict mapping one representative file to the list of
        the OTHER files sharing its content; files with no duplicate
        do not appear at all.

    compare() is expensive, so files are first bucketed by size:
    files of different sizes cannot possibly be duplicates, and
    compare() is only invoked inside a size bucket.

    The original implementation removed elements from ``lst1`` and
    ``lst2`` while iterating over them, which makes Python's list
    iterator skip elements; this rewrite never mutates a sequence it
    is iterating.
    """
    # Bucket by file size -- one cheap getsize() call per file.
    by_size = {}
    for path in file_list:
        by_size.setdefault(getsize(path), []).append(path)

    lol = {}
    for same_size in by_size.values():
        # Within a bucket, group by actual content.
        while same_size:
            head = same_size.pop(0)
            remaining = []
            for other in same_size:
                if compare(head, other):
                    lol.setdefault(head, []).append(other)
                else:
                    remaining.append(other)
            # Grouped files must not become heads themselves.
            same_size = remaining
    return lol
def faster_search(file_list):
    """Looking for duplicate files in the provided list of files.

    :returns: a list of lists, where each inner list contains files
        with the same content (groups of one are discarded).
    """
    lol = []
    while len(file_list) != 0:
        current = file_list.pop(0)
        matched = [current]
        keep = []
        # Visit the remainder back-to-front (mirrors the reverse-index
        # scan of the original), partitioning matches from survivors.
        for candidate in reversed(file_list):
            if compare(current, candidate):
                matched.append(candidate)
            else:
                keep.append(candidate)
        # Restore original order and mutate the list in place, exactly
        # as the pop()-based version did.
        file_list[:] = keep[::-1]
        if 1 < len(matched):
            lol.append(matched)
    return lol
def faster_search(file_list):
    """Find duplicate files, pre-filtering by file size.

    A file whose size is unique in the list cannot have a duplicate,
    so it is dropped before any (expensive) content comparison runs.

    :returns: a list of lists of files with identical content.
    """
    sizes = [getsize(f) for f in file_list]
    candidates = [f for f in file_list if sizes.count(getsize(f)) > 1]
    lol = []
    while candidates:
        group = [candidates.pop(0)]
        index = len(candidates) - 1
        # Backward scan so popping never shifts unvisited indices.
        while 0 <= index:
            if compare(group[0], candidates[index]):
                group.append(candidates.pop(index))
            index -= 1
        if len(group) > 1:
            lol.append(group)
    return lol
def search(file_list):
    """Group files with identical content.

    :returns: a list of lists; each inner list holds files whose
        content matches (groups of a single file are dropped).
    """
    lol = []
    while file_list:
        head = file_list[0]
        matched, unmatched = [], []
        # Single pass: each file is routed to exactly one bucket,
        # so compare() runs once per element per round.
        for item in file_list:
            (matched if compare(head, item) else unmatched).append(item)
        if 1 < len(matched):
            lol.append(matched)
        file_list = unmatched
    return lol
def faster_search(file_list):
    """Like ``search`` but discards files with a unique size first.

    Only same-sized files can share content, so the unique-size filter
    dramatically reduces how often the slow comparison runs.

    :returns: a list of lists of duplicate files.
    """
    all_sizes = list(map(getsize, file_list))
    pool = [f for f in file_list if all_sizes.count(getsize(f)) > 1]
    result = []
    while pool:
        base = pool.pop(0)
        group = [base]
        survivors = []
        # Reverse traversal matches the original back-to-front pops.
        for f in reversed(pool):
            if compare(base, f):
                group.append(f)
            else:
                survivors.append(f)
        pool[:] = survivors[::-1]
        if len(group) > 1:
            result.append(group)
    return result
def fasterSearch(file_list):
    """Looking for duplicate files.

    Buckets files by size so the (slow) content comparison is only
    attempted between files of equal size.

    :returns: a nested dict {file size: {index: [name, dup, ...]}};
        every input file ends up in exactly one inner list, and inner
        lists of length 1 are files with no duplicate found.
    """
    # compare Dictionary [key:value] = {size of file: {index : names}}
    cmpDict = { }
    for i in range(len(file_list)):
        tmp = os.path.getsize(file_list[i])
        # Tentatively register this file as a brand-new group at the
        # next free index of its size bucket.
        cmpDict.setdefault(tmp, {})[len(cmpDict[tmp])] = [
            file_list[i]
        ]  # put names in the next index of size
        # Compare against every PREVIOUSLY registered group of this size
        # (range excludes the entry just added).
        for k in range(len(cmpDict[tmp]) - 1):
            # NOTE(review): relies on p1utils.compare(a, b) being True
            # exactly when the two files' contents match -- confirm.
            if p1utils.compare(
                    file_list[i], cmpDict[tmp][k]
                [0]):  # compare every new element with the original file
                cmpDict[tmp][k].append(file_list[i])  # if True, append list
                # The tentative singleton group is now redundant: the
                # file joined group k, so discard the last-added entry.
                cmpDict[tmp].pop(len(cmpDict[tmp]) - 1)  # after append, pop()
    return cmpDict
def search(file_list):
    """Looking for duplicate files.

    :returns: a dict mapping file size -> list of groups; each group is
        a list whose first entry is the "original" file and whose
        remaining entries are its duplicates (groups may have length 1
        when a file of that size has no duplicate).
    """
    doS = {}  # dictionary of size [key:value] = {size of file: list of groups}
    for i in range(len(file_list)):
        tSize = os.path.getsize(file_list[i])
        if tSize not in doS:
            # First file seen with this size starts its own group.
            doS[tSize] = [[file_list[i]]]
        else:
            # `elif True:` in the original was a confusing spelling of
            # a plain `else:` -- same behavior, now idiomatic.
            for item in doS[tSize]:
                if p1utils.compare(file_list[i], item[0]):
                    item.append(file_list[i])  # same content: join the group
                    break
            else:
                # No existing group matched: add as another original.
                doS[tSize].append([file_list[i]])
    return doS
def faster_search(file_list):
    """Looking for duplicate files in the provided list of files.

    :returns: a list of lists, where each inner list contains files
        with the same content.

    compare() is slow, so any file whose size occurs only once in the
    input is eliminated up front -- it cannot have a duplicate.
    """
    sizes = list(map(getsize, file_list))
    remaining = [f for f in file_list if 1 < sizes.count(getsize(f))]
    lol = []
    while remaining:
        group = [remaining.pop(0)]
        pos = len(remaining) - 1
        # Descend through the indices so pops stay safe.
        while pos > -1:
            if compare(group[0], remaining[pos]):
                group.append(remaining.pop(pos))
            pos -= 1
        if 1 < len(group):
            lol.append(group)
    return lol
def faster_search(file_list):
    """Looking for duplicate files in the provided list of files

    :returns: a list of lists, where each list contains files with the
        same content.

    compare() is expensive, so it only runs for files whose sizes are
    equal -- a size mismatch already proves the contents differ.
    """
    lol = []
    while 0 < len(file_list):
        h = file_list.pop(0)
        h_size = getsize(h)  # hoisted: the original recomputed this per file
        group = [h]
        rest = []
        for i in file_list:
            # The original `else: break` aborted the scan at the FIRST
            # size mismatch even though the list is not sorted by size,
            # silently missing later duplicates.
            if h_size == getsize(i) and compare(h, i):
                group.append(i)
            else:
                rest.append(i)
        # Grouped files must not be revisited as heads.
        file_list = rest
        # Only report genuine duplicate groups (the original also kept
        # singletons).
        if 1 < len(group):
            lol.append(group)
    return lol
def faster_search(file_list):
    """Looking for duplicate files in the provided list of files

    :returns: a list of lists, where each list contains files with the
        same content.

    Files with a unique size are filtered out first, so the expensive
    compare() only runs on plausible duplicates.
    """
    lol = []
    file_sizes = list(map(getsize, file_list))
    candidates = [f for f in file_list if 1 < file_sizes.count(getsize(f))]
    while candidates:
        head = candidates[0]
        copies = [f for f in candidates if compare(head, f)]
        # Remove the whole group before continuing: the original looped
        # over EVERY member of `duplicates`, so a group of N identical
        # files was appended to the result N times.
        candidates = [f for f in candidates if not compare(head, f)]
        if 1 < len(copies):
            lol.append(copies)
    return lol
def search(file_list):
    """Looking for duplicate files in the provided list of files

    :returns: a dict; each key is a file that has at least one
        duplicate, and its value lists all the OTHER files with the
        same content. Files with no duplicates do not appear.
    """
    lol = {}
    while 0 < len(file_list):
        i = file_list.pop(0)
        remaining = []
        for k in file_list:
            if compare(i, k):
                # Create the key lazily, on the first duplicate found.
                if i not in lol:
                    lol[i] = []
                lol[i].append(k)
            else:
                remaining.append(k)
        # Drop the duplicates just claimed: the original left them in
        # file_list, so each later became a key of its own with a
        # redundant partial group.
        file_list = remaining
    return lol
def faster_search(file_list):
    """Looking for duplicate files in the provided list of files

    :returns: a list of lists, where each list contains files with the
        same content.

    Files whose size is unique are filtered out first, so compare()
    runs as rarely as possible.
    """
    lol = []
    flSize = list(map(getsize, file_list))
    file_list = list(filter(lambda x: 1 < flSize.count(getsize(x)), file_list))
    while 0 < len(file_list):
        tempLst = [file_list.pop(0)]
        # Plain loop instead of the original list comprehension, which
        # was executed purely for its pop/append side effects and whose
        # result list (all Nones) was thrown away.
        for i in range(len(file_list) - 1, -1, -1):
            if compare(tempLst[0], file_list[i]):
                tempLst.append(file_list.pop(i))
        if len(tempLst) > 1:
            lol.append(tempLst)
    return lol
def search(file_list):
    """Looking for duplicate files in the provided list of files.

    :returns: a list of lists, where each inner list contains files
        with the same content.

    Files with a unique size are removed before the content scan,
    since a unique size rules out any duplicate.
    """
    sizes = list(map(getsize, file_list))
    pool = list(filter(lambda x: 1 < sizes.count(getsize(x)), file_list))
    groups = []
    while pool:
        current = [pool.pop(0)]
        keep = []
        # Back-to-front traversal mirrors the reverse-index scan of
        # the original implementation.
        for candidate in reversed(pool):
            if compare(current[0], candidate):
                current.append(candidate)
            else:
                keep.append(candidate)
        pool[:] = keep[::-1]
        if len(current) > 1:
            groups.append(current)
    return groups
def search(file_list):
    """Collect groups of duplicate files.

    :returns: a list of lists; each inner list holds files sharing
        identical content, with the reference file appended LAST
        (matching the original implementation's ordering).
    """
    lol = []
    while file_list:
        probe = file_list.pop()
        matches = []
        idx = len(file_list) - 1
        # Walk backwards so each pop leaves unvisited indices intact.
        while idx >= 0:
            if compare(probe, file_list[idx]):
                matches.append(file_list.pop(idx))
            idx -= 1
        # Only record the group when at least one duplicate was found.
        if matches:
            matches.append(probe)
            lol.append(matches)
    return lol