Example 1
from imagededup.methods import PHash
import os
import json


root = '/media/palm/62C0955EC09538ED/ptt/full_sized'
duplicates = []
phasher = PHash()

# Iterate over the class sub-directories; note that [1:] skips the first
# entry returned by os.listdir (whose order is arbitrary).
for cls in os.listdir(root)[1:]:
    # Generate encodings for all images in this class directory
    encodings = phasher.encode_images(image_dir=os.path.join(root, cls))

    # Find near-exact duplicates (Hamming distance <= 1) from the encodings
    duplicate = phasher.find_duplicates(encoding_map=encodings, max_distance_threshold=1)

    # Persist the duplicate map and the raw encodings for this class
    with open('/home/palm/PycharmProjects/ptt/datastuffs/dups/' + cls + '.json', 'w') as write:
        json.dump([duplicate, encodings], write)
    duplicates.append(duplicate)
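
As a follow-up, here is a minimal sketch of how the per-class JSON dumps written above could be read back and summarized. The paths mirror the ones in the example; the summary logic itself is an illustrative assumption, not part of the original script:

import os
import json

dump_dir = '/home/palm/PycharmProjects/ptt/datastuffs/dups/'
for fname in os.listdir(dump_dir):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dump_dir, fname)) as f:
        duplicate, encodings = json.load(f)
    # Count images that have at least one near-duplicate in their class
    n_dups = sum(1 for v in duplicate.values() if v)
    print('{}: {} of {} images have near-duplicates'.format(
        fname[:-5], n_dups, len(encodings)))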
Example 2
import argparse
import os
import shutil

from imagededup.methods import PHash

parser = argparse.ArgumentParser()
parser.add_argument('-p', '--path', type=str, required=True,
                    help="path of the folder to scan and delete duplicates from")
args = parser.parse_args()

def remove(path):
    """ param <path> could either be relative or absolute. """
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)  # remove the file
    elif os.path.isdir(path):
        shutil.rmtree(path)  # remove dir and all contains
    else:
        raise ValueError("{} is not a file or a directory.".format(path))


if __name__ == "__main__":
    path = args.path
    del_list = []

    phasher = PHash()
    encodings = phasher.encode_images(image_dir=path)
    duplicates = phasher.find_duplicates(encoding_map=encodings)

    for k, v in duplicates.items():
        # Keep the key image; queue each of its duplicates exactly once
        if len(v) and (k not in del_list):
            for fname in v:
                if fname not in del_list:
                    del_list.append(fname)

    print('Deleting duplicates:\n{0}'.format(del_list))

    for dl in del_list:
        remove(os.path.join(path, dl))
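
The manual bookkeeping above can also be delegated to the library: imagededup's find_duplicates_to_remove returns a flat list of files to delete while retaining one image per duplicate group. A minimal sketch of that variant, reusing the remove() helper and assuming the library's default max_distance_threshold:

phasher = PHash()
to_remove = phasher.find_duplicates_to_remove(image_dir=path)
for fname in to_remove:
    remove(os.path.join(path, fname))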
Example 3
import pandas as pd
from imagededup.methods import PHash


def convert_dict_to_df(duplicates):
    """Flatten the duplicates dict into a DataFrame of unique image pairs."""
    duplicates_list = []
    scores = []
    for image, dup_items in duplicates.items():
        image1 = image.split('_')[0]
        for item in dup_items:
            image2 = item[0].split('_')[0]
            # Skip self-pairs and pairs already recorded in either order
            if image1 != image2 and (
                    (image1, image2) not in duplicates_list) and (
                    (image2, image1) not in duplicates_list):
                duplicates_list.append((image1, image2))
                scores.append(item[1])

    duplicates_df = pd.DataFrame(duplicates_list, columns=['image1', 'image2'])
    duplicates_df['score'] = scores
    return duplicates_df


# The max_distance_threshold parameter of phash.find_duplicates() specifies the maximum
# Hamming distance at which a retrieved image still counts as a duplicate. We'll start
# with a max_distance_threshold of 8.
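
# To make the threshold concrete, here's a minimal sketch of the Hamming
# distance being compared against it, assuming the encodings are 64-bit hex
# strings as returned by encode_images (this helper is illustrative, not
# part of the original notebook):

def hamming_distance(hash1, hash2):
    # XOR the two hashes as integers and count the differing bits
    return bin(int(hash1, 16) ^ int(hash2, 16)).count('1')

# e.g. hamming_distance('9fee256239984d71', '9fee256239984d79') == 1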

# In[5]:

phash = PHash()

encodings = phash.encode_images(
    image_dir='../input/hpa-single-cell-image-classification/train')
encodings_public = phash.encode_images(image_dir='../input/publichpa_1024')
encodings.update(encodings_public)

duplicates = phash.find_duplicates(encoding_map=encodings,
                                   scores=True,
                                   max_distance_threshold=8)

duplicates_df = convert_dict_to_df(duplicates)
duplicates_df.to_csv('../input/duplicates.csv', index=False)
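
Downstream, the saved pairs can be filtered by score; since the scores here are Hamming distances, 0 means an exact hash match. A short sketch reading the CSV back (the path mirrors the one written above):

import pandas as pd

duplicates_df = pd.read_csv('../input/duplicates.csv')
# Keep only exact perceptual-hash matches (Hamming distance 0)
exact_df = duplicates_df[duplicates_df['score'] == 0]
print('{} exact-match pairs out of {}'.format(len(exact_df), len(duplicates_df)))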