from imagededup.methods import PHash
import os
import json

root = '/media/palm/62C0955EC09538ED/ptt/full_sized'
duplicates = []
for cls in os.listdir(root)[1:]:
    phasher = PHash()
    # Generate encodings for all images in an image directory
    encodings = phasher.encode_images(image_dir=os.path.join(root, cls))
    # Find duplicates using the generated encodings
    duplicate = phasher.find_duplicates(encoding_map=encodings, max_distance_threshold=1)
    with open('/home/palm/PycharmProjects/ptt/datastuffs/dups/' + cls + '.json', 'w') as write:
        json.dump([duplicate, encodings], write)
    duplicates.append(duplicate)
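# A minimal follow-up sketch (an addition, not part of the original script):
# load each per-class JSON written above and report how many images in that
# class have at least one duplicate. Relies on the os/json imports and the
# output directory used in the loop above.
dup_dir = '/home/palm/PycharmProjects/ptt/datastuffs/dups'
for fname in sorted(os.listdir(dup_dir)):
    with open(os.path.join(dup_dir, fname)) as f:
        duplicate, encodings = json.load(f)
    # duplicate maps each filename to the list of its near-duplicates
    n_dup = sum(1 for v in duplicate.values() if v)
    print('{}: {}/{} images have a duplicate'.format(fname[:-5], n_dup, len(encodings)))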
import argparse
import os
import shutil
from imagededup.methods import PHash

parser = argparse.ArgumentParser()
parser.add_argument('-p', '--path', type=str, required=True,
                    help="path of the folder to scan for duplicates to delete")
args = parser.parse_args()


def remove(path):
    """<path> can be either relative or absolute."""
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)  # remove the file
    elif os.path.isdir(path):
        shutil.rmtree(path)  # remove the dir and everything it contains
    else:
        raise ValueError("{} is not a file or dir.".format(path))


if __name__ == "__main__":
    path = args.path
    del_list = []
    phasher = PHash()
    encodings = phasher.encode_images(image_dir=path)
    duplicates = phasher.find_duplicates(encoding_map=encodings)
    # Keep one copy per duplicate group: queue an image's duplicates for
    # deletion only if the image itself has not already been queued.
    for k, v in duplicates.items():
        if len(v) and (k not in del_list):
            for fname in v:
                del_list.append(fname)
    print('Deleting Duplicates :\n{0}'.format(del_list))
    for dl in del_list:
        remove(os.path.join(path, dl))
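# Alternative sketch: imagededup also ships find_duplicates_to_remove(), which
# returns a flat list of file names that can be deleted while keeping one image
# per duplicate group, so the manual del_list pass above is not strictly needed.
# 'args.path' is reused from the parser above.
phasher = PHash()
to_remove = phasher.find_duplicates_to_remove(image_dir=args.path)
for fname in to_remove:
    os.remove(os.path.join(args.path, fname))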
import pandas as pd
from imagededup.methods import PHash


def convert_dict_to_df(duplicates):
    """Flatten the {filename: [(duplicate_filename, score), ...]} map into a
    DataFrame of unique image-ID pairs with their scores."""
    duplicates_list = []
    scores = []
    for key, dup_items in duplicates.items():
        image1 = key.split('_')[0]  # image ID is the part before the first underscore
        for item in dup_items:
            image2 = item[0].split('_')[0]
            if image1 != image2 and \
                    (image1, image2) not in duplicates_list and \
                    (image2, image1) not in duplicates_list:
                duplicates_list.append((image1, image2))
                scores.append(item[1])
    duplicates_df = pd.DataFrame(duplicates_list, columns=['image1', 'image2'])
    duplicates_df['score'] = scores
    return duplicates_df


# The max_distance_threshold parameter of phash.find_duplicates() specifies the
# Hamming distance below which retrieved duplicates are considered valid. We'll
# start with a max_distance_threshold of 8.

phash = PHash()
encodings = phash.encode_images(
    image_dir='../input/hpa-single-cell-image-classification/train')
encodings_public = phash.encode_images(image_dir='../input/publichpa_1024')
encodings.update(encodings_public)
duplicates = phash.find_duplicates(encoding_map=encodings, scores=True,
                                   max_distance_threshold=8)
duplicates_df = convert_dict_to_df(duplicates)
duplicates_df.to_csv('../input/duplicates.csv', index=False)
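# Optional visual check, a sketch not in the original notebook: imagededup's
# plot_duplicates() draws an image next to its retrieved duplicates (with
# scores when available). The key chosen below is whichever image has at
# least one duplicate; this assumes that image lives in the train directory,
# since the encodings above mix two source folders.
from imagededup.utils import plot_duplicates

some_key = next(k for k, v in duplicates.items() if v)
plot_duplicates(image_dir='../input/hpa-single-cell-image-classification/train',
                duplicate_map=duplicates,
                filename=some_key)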