Example #1
from collections import Counter
from imagededup.methods import PHash

def get_duplicate_imgs_tuple(img_path):
    phasher = PHash()
    encodings = phasher.encode_images(image_dir=img_path)
    duplicates = phasher.find_duplicates(encoding_map=encodings)
    # Collect pairs of images that each duplicate exactly one other image
    duplicate_imgs_tuple = []
    img_checked = []
    for img, imgs in duplicates.items():
        if len(imgs) == 1 and img not in img_checked:
            temp = [img] + imgs
            duplicate_imgs_tuple.append(temp)
            # mark both images as handled so the mirrored pair is not added again
            for each in temp:
                img_checked.append(each)
    # convert filenames such as '123.jpg' into integer ids
    for i in range(len(duplicate_imgs_tuple)):
        duplicate_imgs_tuple[i] = [int(each.split('.')[0])
                                   for each in duplicate_imgs_tuple[i]]
    # Post-processing: earlier tests showed that phash can place three images
    # into two overlapping pairs, so drop any pair that shares an image with
    # another pair.
    flatten_tuple = [img_id for pair in duplicate_imgs_tuple for img_id in pair]
    counter_dict = Counter(flatten_tuple)  # occurrences of each id across all pairs
    error_keys = {key for key, value in counter_dict.items()
                  if value > 1}  # ids that appear in more than one pair
    treated_dup_imgs_tuple = [pair for pair in duplicate_imgs_tuple
                              if not set(pair) & error_keys]  # keep only non-overlapping pairs

    return treated_dup_imgs_tuple
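
# A minimal sketch (not part of the original snippet) of the same
# pair-extraction idea: storing each pair as a frozenset makes the
# "already seen" bookkeeping explicit, since (a, b) and (b, a) collapse
# into a single entry. Assumes `duplicates` is the dict returned by
# PHash().find_duplicates().
def unique_pairs(duplicates):
    pairs = set()
    for img, dups in duplicates.items():
        for dup in dups:
            pairs.add(frozenset((img, dup)))
    return [tuple(pair) for pair in pairs]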
Example #2
import json
import os

from imagededup.methods import PHash
from tqdm import tqdm

# BASE_DIR is assumed to be defined elsewhere as the root image directory

def main():
    # Collect every directory under BASE_DIR (os.walk yields BASE_DIR itself first)
    BASE_DIR_SUBDIRS = [item[0] for item in os.walk(BASE_DIR)]
    phasher = PHash()

    duplicate_image_mappings = []
    all_encodings = {}
    for pic_dir in tqdm(BASE_DIR_SUBDIRS):  # BASE_DIR is already included by os.walk

        # Generate encodings for all images in an image directory
        encodings = phasher.encode_images(image_dir=pic_dir)

        # Prefix each key with its directory (encode_images keys by bare
        # filename; the full path is wanted here)
        full_path_encodings = {
            f'{pic_dir}/{k}': v
            for k, v in encodings.items()
        }

        all_encodings.update(full_path_encodings)

    # Save encodings to disk
    with open(f"{BASE_DIR}/pictures_encodings.json", 'w') as f:
        json.dump(all_encodings, f)

    # Find duplicates using the generated encodings
    duplicates = phasher.find_duplicates(
        encoding_map=all_encodings,
        outfile=f"{BASE_DIR}/pictures_duplicates.json")
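
# A possible follow-up (not in the original): on a later run, reload the saved
# encodings instead of re-hashing every image. The hash encodings are plain hex
# strings, so they round-trip through JSON unchanged.
def load_encodings(base_dir):
    with open(f"{base_dir}/pictures_encodings.json") as f:
        return json.load(f)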
Example #3
import os
from imagededup.methods import PHash
from tqdm import tqdm

def remove_duplicates(data_dir, image_names):
    # Group images by exact perceptual hash, keeping one image per group
    phasher = PHash()
    hashed_images = dict()
    unique_images = set()
    for img in tqdm(image_names):
        img_hash = phasher.encode_image(os.path.join(data_dir, img))
        hashed_images.setdefault(img_hash, []).append(img)
    for imgs in hashed_images.values():
        unique_images.add(imgs[0])  # the first image of each hash group is kept
    return unique_images
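
# Exact-hash grouping (above) only merges images whose perceptual hashes are
# identical. find_duplicates instead thresholds on the Hamming distance between
# hashes; a pure-Python sketch of that distance for two hex hash strings:
def hamming(hash1: str, hash2: str) -> int:
    return bin(int(hash1, 16) ^ int(hash2, 16)).count("1")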
Example #4
def test_imagededup(img_dir):  # note: a perceptual hash is not well suited to exact matching
    from imagededup.methods import PHash
    phasher = PHash()

    # Generate encodings for all images in an image directory
    encodings = phasher.encode_images(image_dir=img_dir)

    # Find duplicates using the generated encodings
    duplicates = phasher.find_duplicates(encoding_map=encodings,
                                         scores=True,
                                         max_distance_threshold=0)
    for key in sorted(duplicates.keys()):
        if len(duplicates[key]) > 0:
            print(key, ':', duplicates[key])
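
# As the note above says, a perceptual hash is a poor exact-match check. For
# byte-for-byte duplicates, a content checksum is a simpler fit; a sketch using
# hashlib (img_dir is assumed to contain only regular image files):
import hashlib
import os

def md5_groups(img_dir):
    groups = {}
    for name in os.listdir(img_dir):
        with open(os.path.join(img_dir, name), "rb") as f:
            groups.setdefault(hashlib.md5(f.read()).hexdigest(), []).append(name)
    return {k: v for k, v in groups.items() if len(v) > 1}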
Example #5
    def __init__(self, songPath: str):
        self.songPath = songPath
        path, ext = os.path.splitext(self.songPath)
        self.head, self.fileName = os.path.split(self.songPath)
        if ext == ".wav":
            self.sampleRate, self.Data = wavfile.read(self.songPath)
            self.wavfile = self.songPath
        elif ext == ".mp3":
            # Convert mp3 to wav first so wavfile.read can load it
            self.Sound = AudioSegment.from_mp3(self.songPath)
            self.wavfile = 'database/' + self.fileName[:-4] + '.wav'
            self.Sound.export(self.wavfile, format="wav")
            self.sampleRate, self.Data = wavfile.read(self.wavfile)
        else:
            raise ValueError(f"Unsupported audio format: {ext}")

        # Down-mix stereo to mono by averaging the two channels
        if len(self.Data.shape) == 2 and self.Data.shape[1] == 2:
            self.Data = np.mean(self.Data, axis=1)

        # Keep only the first 60 seconds of samples
        self.TimeOfSampling = 1 / self.sampleRate
        self.NumberOfSample = int(60 / self.TimeOfSampling)
        self.Data = self.Data[0:self.NumberOfSample]
   
        # Perceptual hashers applied to the rendered spectrogram image
        self.HashFuncs = [AHash(), WHash(), PHash(), DHash()]
        self.HashFileNames = ["AHash", "WHash", "PHash", "DHash"]
        self.imageArray = None

        self.spectrogram(self.Data,self.sampleRate)
        self.SpectrogramFeatures()
        self.hashFunction()
Example #6
def get_duplicates(im_paths,
                   method="cnn",
                   encodings=None,
                   deduper=None,
                   **kwargs):
    from imagededup.methods import PHash, CNN
    if encodings is None:
        encodings, deduper = get_encodings(im_paths, method)
    if method == "cnn":
        deduper = deduper or CNN()
        assert isinstance(deduper, CNN)
        thresh = "min_similarity_threshold"
        if thresh not in kwargs:
            kwargs[thresh] = 0.9
    elif method == "phash":
        deduper = deduper or PHash()
        assert isinstance(deduper, PHash)
        thresh = "max_distance_threshold"
        if thresh not in kwargs:
            kwargs[thresh] = 10
    else:
        raise ValueError(f"Method {method} unknown")
    duplicates = deduper.find_duplicates(encoding_map=encodings,
                                         scores=True,
                                         **kwargs)
    return duplicates, encodings, deduper
Example #7
def get_encodings(im_paths, method="cnn"):
    import imagededup.utils.data_generator as data_generator
    from imagededup.methods import PHash, CNN
    if method == "cnn":
        # Monkey-patch the data generator so CNN encodes the given file list
        # instead of scanning a directory (get_data_gen is defined elsewhere)
        data_generator.DataGenerator = get_data_gen(im_paths)
        deduper = CNN()
        encodings = deduper.encode_images('/')
    elif method == "phash":
        deduper = PHash()
        # run_parallel (defined elsewhere) hashes the files concurrently
        encodings = run_parallel(deduper, im_paths)
    else:
        raise ValueError(f"Method {method} unknown")
    return encodings, deduper
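
# Hypothetical usage of the helpers above (im_paths is a list of image file
# paths; get_data_gen and run_parallel are assumed to be defined elsewhere in
# the same project):
# duplicates, encodings, deduper = get_duplicates(im_paths, method="phash",
#                                                 max_distance_threshold=5)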
Example #8
def remove_duplicates():
    phasher = PHash()

    # Generate encodings for all images in an image directory
    encodings = phasher.encode_images(image_dir=image_dir)

    # Find duplicates using the generated encodings
    duplicates = phasher.find_duplicates(encoding_map=encodings)

    # Remove duplicates, keeping one image per duplicate group
    duplicate_keys = [key for key in duplicates.keys() if len(duplicates[key]) != 0]
    removed = 0
    for key in duplicates.keys():
        if key in duplicate_keys:
            for dup in duplicates[key]:
                try:
                    os.remove(os.path.join(image_dir, dup))
                    removed += 1  # count only successful deletions
                except OSError:
                    pass  # the file may already be gone via another group
                if dup in duplicate_keys:
                    duplicate_keys.remove(dup)  # skip the deleted file's own entry later
    print(f'Removed {removed} duplicates')
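
# imagededup also provides find_duplicates_to_remove, which returns a flat
# list with one filename per duplicate group already selected for deletion; a
# sketch of the same cleanup using it (image_dir as above):
def remove_duplicates_simple():
    phasher = PHash()
    to_remove = phasher.find_duplicates_to_remove(image_dir=image_dir)
    for name in to_remove:
        os.remove(os.path.join(image_dir, name))
    print(f'Removed {len(to_remove)} duplicates')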
Example #9
    #print("Caught signal from %r, data %r" % (sender, kw))
    status.text(kw['message'])
    return 'received!'


@send_progress.connect
def receive_progress(sender, **kw):
    #print("Caught signal from %r, data %r" % (sender, kw))
    progress_bar.progress(int(round(kw['percent'])))
    #status.text(kw['message'])
    return 'received!'


if option == "pHash":
    try:
        phasher = PHash()
        duplicates = phasher.find_duplicates(image_dir, max_distance_threshold=max_distance,
                                             scores=True)
    except Exception as e:
        duplicates = {}
        st.write(str(e))
elif option == "CNN":
    try:
        cnn_encoder = CNN()
        duplicates = cnn_encoder.find_duplicates(image_dir=image_dir,
                                                 scores=True)
    except Exception as e:
        duplicates = {}
        st.write(str(e))
elif option == "aHash":
    try:
Example #10
    def SpectrogramFeatures(self):
        # Hash a spectral-bandwidth rendering of the audio with PHash
        featuresData = librosa.feature.spectral_bandwidth(y=np.array(self.Data, dtype=np.float32))
        featuresData = PHash().encode_image(image_array=featuresData)
        FileNameOfspectroFeatures = self.fileName[:-4] + ".SpectroFeatures"
        self.saveFeaturesData(FileNameOfspectroFeatures, featuresData)
Example #11
    ap.add_argument("-f",
                    "--folder",
                    required=True,
                    help="path to directory of images")
    args, unknown = ap.parse_known_args()

    folder = args.folder
    if os.path.isabs(folder):
        directory = folder
    else:
        directory = os.path.sep.join(["data", args.folder])

    deleted = directory + "_duplicates"

    if not os.path.exists(deleted):
        os.makedirs(deleted)

    phasher = PHash()
    duplicates = phasher.find_duplicates_to_remove(image_dir=directory,
                                                   max_distance_threshold=5)
    print("[INFO] removing {} duplicates".format(str(len(duplicates))))

    for duplicate in duplicates:

        pathFrom = os.path.join(directory, duplicate)
        pathTo = os.path.join(deleted, duplicate)
        os.rename(pathFrom, pathTo)

        # Move the matching annotation file alongside the image, if one exists
        noExtension = os.path.splitext(duplicate)[0]

        labelFile = os.path.join(directory, noExtension) + ".txt"
        if os.path.exists(labelFile):
            txtPath = os.path.join(deleted, noExtension) + ".txt"
            os.rename(labelFile, txtPath)
Example #12
import os
from imagededup.methods import PHash
image_dir = 'I://net hat//001'

if __name__ == '__main__':
    phasher = PHash()
    duplicates_list = phasher.find_duplicates_to_remove(image_dir)
    for name in duplicates_list:  # cover every flagged file, including the last one
        os.remove(os.path.join(image_dir, name))
Example #13
import os
import cv2
from shutil import copyfile
from tqdm import tqdm
from imagededup.methods import PHash

os.makedirs('clean_data', exist_ok=True)

files = os.listdir('data')

# Copy only files that OpenCV can actually decode
for i in tqdm(range(len(files))):
    try:
        im = cv2.imread('data/' + files[i], cv2.IMREAD_GRAYSCALE)
        im.shape  # raises AttributeError if imread returned None (unreadable file)
        copyfile('data/' + files[i], 'clean_data/' + files[i])
    except AttributeError:
        pass

phasher = PHash()

encodings = phasher.encode_images(image_dir='clean_data/')

duplicates = phasher.find_duplicates(encoding_map=encodings)

# Walk the key list, keeping one representative per duplicate group and
# dropping its duplicates from the remaining keys
data = []
dkeys = list(duplicates.keys())
ind_key = 0
while True:
    print(len(dkeys))
    data.append(dkeys[ind_key])
    for dup in duplicates[dkeys[ind_key]]:
        try:
            dkeys.remove(dup)
        except ValueError:
            pass  # already removed as part of an earlier group
    ind_key += 1
    if ind_key >= len(dkeys):
        break
Example #14
from imagededup.methods import PHash
import os
import json


root = '/media/palm/62C0955EC09538ED/ptt/full_sized'
duplicates = []
for cls in os.listdir(root)[1:]:  # note: [1:] skips one arbitrary entry, since os.listdir order is unspecified
    phasher = PHash()

    # Generate encodings for all images in an image directory
    encodings = phasher.encode_images(image_dir=os.path.join(root, cls))

    # Find duplicates using the generated encodings
    duplicate = phasher.find_duplicates(encoding_map=encodings, max_distance_threshold=1)
    with open('/home/palm/PycharmProjects/ptt/datastuffs/dups/'+cls+'.json', 'w') as write:
        json.dump([duplicate, encodings], write)
    duplicates.append(duplicate)
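
# A possible follow-up (not in the original): reload the per-class dumps and
# count how many images have at least one duplicate within their class:
import glob

total = 0
for path in glob.glob('/home/palm/PycharmProjects/ptt/datastuffs/dups/*.json'):
    with open(path) as f:
        duplicate, encodings = json.load(f)
    total += sum(1 for v in duplicate.values() if v)
print(total, 'images with duplicates')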
Example #15
PHash:  http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html
"""
import os
import matplotlib.pylab as plt
import imagededup
from imagededup.methods import PHash
from imagededup.utils import plot_duplicates

PATH = r'E:\DATASET\pku-autonomous-driving'  # raw string avoids backslash-escape issues

test_img_dir = os.path.join(PATH, 'test_images')
# Find similar images

# __Note:__ `max_distance_threshold` defines the maximum Hamming distance between two
# image hashes for the images to be considered similar;
# the higher the value, the more tolerant the matching is of differences.
#
# Below we list the first 15 images found to have similar content according to imagededup.
# To get the full list, display the contents of the variable `duplicates`.
phasher = PHash()
duplicates = phasher.find_duplicates(image_dir=test_img_dir,
                                     scores=True,
                                     max_distance_threshold=3)

print('There are', len([x for x in duplicates if duplicates[x] != []]),
      'images with similar images over', len(duplicates), 'images.')
# There are 429 images with similar images over 2021 images.

plt.figure(figsize=(20, 20))
plot_duplicates(image_dir=test_img_dir,
                duplicate_map=duplicates,
                filename='ID_5bf531cf3.jpg')
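
# A small addition (not in the original): flatten the scored duplicate map
# into (image, match, distance) rows and print the 15 closest pairs, matching
# the "first 15 images" mentioned above:
pairs = sorted(((img, m, d) for img, ms in duplicates.items() for m, d in ms),
               key=lambda row: row[2])
for img, match, dist in pairs[:15]:
    print(img, match, dist)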
Example #16
from pyqtgraph import PlotWidget
import pyqtgraph as pg
from PyQt5 import QtCore, QtGui, QtWidgets, QtMultimedia
from PyQt5.QtWidgets import QDialog, QApplication, QPushButton, QVBoxLayout, QTableWidgetItem
import logging
import numpy as np
import matplotlib.pyplot as plt
from numpy import genfromtxt
import pandas as pd
import librosa
import librosa.display
#from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from PIL import Image
from imagededup.methods import PHash
phasher = PHash()
import os
import csv
from math import floor
from pydub import AudioSegment
# Snapshot the working directory so the required folders are created only once
temp2 = os.listdir("./")
count = len(temp2)

if "tempDir" not in temp2:
    os.mkdir('tempDir')
if "back" not in temp2:
    os.mkdir('back')
Example #17
parser.add_argument('-p', '--path', type=str, required=True, help="path of folder, for duplicated trashing")
args = parser.parse_args()

def remove(path):
    """ param <path> could either be relative or absolute. """
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)  # remove the file
    elif os.path.isdir(path):
        shutil.rmtree(path)  # remove dir and all contains
    else:
        raise ValueError("file {} is not a file or dir.".format(path))


if __name__ == "__main__":
    path = args.path
    del_list = []

    phasher = PHash()
    encodings = phasher.encode_images(image_dir=path)
    duplicates = phasher.find_duplicates(encoding_map=encodings)

    # Keep each group's key image; only its listed duplicates are deleted
    for k, v in duplicates.items():
        if len(v) and (k not in del_list):
            for fname in v:
                del_list.append(fname)

    print('Deleting Duplicates :\n{0}'.format(del_list))

    for dl in del_list:
        remove(os.path.join(path, dl))
Example #18
from imagededup.methods import PHash
phasher = PHash()

# Generate encodings for all images in an image directory
encodings = phasher.encode_images(image_dir='../Testout')

# Find duplicates using the generated encodings
duplicates = phasher.find_duplicates(encoding_map=encodings)

# from imagededup.utils import plot_duplicates
# # plot duplicates obtained for a given file using the duplicates dictionary
# plot_duplicates(image_dir='path/to/image/directory', 
#                 duplicate_map=duplicates, 
#                 filename='ukbench00120.jpg')
Example #19
                         help='save encoding map (phash of images) as pkl',
                         action='store_true')
cache_group.add_argument('--load',
                         help='load encoding map (phash of images) from pkl',
                         type=str)
args = parser.parse_args()

dist_thresh = int(args.thresh)
assert 0 <= dist_thresh <= 64

root_dir = Path(args.directory)
assert root_dir.is_dir()

out_dir = root_dir.parent / 'Dups_thresh{}'.format(dist_thresh)

phasher = PHash()

if args.load is not None and Path(args.load).is_file():
    import pickle
    with open(args.load, 'rb') as f:
        encoding_map = pickle.load(f)
    print(f'Encoding map loaded from pickle file: {args.load}!')
else:
    tic = time.perf_counter()
    # note: `rglob` is not an argument of stock imagededup's encode_images;
    # this snippet appears to rely on a locally modified version of the library
    encoding_map = phasher.encode_images(image_dir=root_dir, rglob=True)
    toc = time.perf_counter()
    print(f'encoding duration: {toc-tic:.3f}s')
    if args.save:
        import pickle
        pickle_file = f"{root_dir.stem}_encoding_map.pkl"
        with open(pickle_file, "wb") as f:
            pickle.dump(encoding_map, f)
        print(f'Encoding map dumped as pickle at: {pickle_file}')
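
# The search step that would naturally follow (a sketch; the original snippet
# ends after caching the encodings, and out_dir created above is left unused):
duplicates = phasher.find_duplicates(encoding_map=encoding_map,
                                     max_distance_threshold=dist_thresh)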
Example #20
from imagededup.methods import PHash
phasher = PHash()

encodings = phasher.encode_images(image_dir='path/to/image/directory')

duplicates = phasher.find_duplicates(encoding_map=encodings)
Example #21
        for item in dup_items:
            # item is a (filename, distance) tuple; the image id is the part
            # of the filename before the first '_'
            image2 = item[0].split('_')[0]
            if image1 != image2 and (
                (image1, image2) not in duplicates_list) and (
                    (image2, image1) not in duplicates_list):
                duplicates_list.append((image1, image2))
                scores.append(item[1])

    duplicates_df = pd.DataFrame(duplicates_list, columns=['image1', 'image2'])
    duplicates_df['score'] = scores
    return duplicates_df


# The max_distance_threshold parameter of phash.find_duplicates() specifies the Hamming distance at or below which retrieved duplicates are considered valid.  We'll start with a max_distance_threshold of 8.


phash = PHash()

encodings = phash.encode_images(
    image_dir='../input/hpa-single-cell-image-classification/train')
encodings_public = phash.encode_images(image_dir='../input/publichpa_1024')
encodings.update(encodings_public)

duplicates = phash.find_duplicates(encoding_map=encodings,
                                   scores=True,
                                   max_distance_threshold=8)

duplicates_df = convert_dict_to_df(duplicates)
duplicates_df.to_csv('../input/duplicates.csv', index=False)
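
# A possible follow-up (not in the original): rows with a Hamming distance of
# zero are the hash-identical pairs.
exact_df = duplicates_df[duplicates_df['score'] == 0]
print(f'{len(exact_df)} hash-identical pairs out of {len(duplicates_df)} candidate pairs')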