def generateRepostsForAll(self,
                          count_per_post=1,
                          res=None,
                          rot=None,
                          asp=None,
                          crop=None,
                          uid=None,
                          seed=None):
    '''
    Generates reposts for every single non repost image known to the cache.

    Every cached image name that does not contain '_REPOST_' is treated as
    an original. For each original, generate_bad_repost() is asked for
    count_per_post altered copies; each copy is hashed, OCR-read and stored
    in the in-memory hash/text caches under its repost name. Originals whose
    reposts are already cached are skipped, so an interrupted run can be
    resumed. The cache is always written back before returning.

    Args:
        count_per_post: number of reposts wanted per original image.
        res, rot, asp, crop: forwarded unchanged to generate_bad_repost().
        uid: optional prefix put in front of '_REPOST_' in the repost name.
        seed: optional base seed. When given, image number i is generated
            with seed + i. When None, None is forwarded
            (NOTE(review): presumably generate_bad_repost() treats a None
            seed as "unseeded" - confirm against its implementation).

    Returns:
        True when the run finished, False when it was interrupted with
        Ctrl-C, and None when count_per_post is smaller than 1 and there
        was at least one original image to process.
    '''
    names = sorted(name for name in self.__imageToHash
                   if '_REPOST_' not in name)
    total = len(names)
    self.vPrint('generating ' + str(total) + ' reposts')
    interrupted = False
    try:
        for i, name in enumerate(names):
            repname = (str(uid) if uid else '') + '_REPOST_' + name

            # The name whose presence in both caches means "already done":
            # the plain repost name for a single repost, otherwise the name
            # carrying the highest index prefix (count_per_post - 1).
            if count_per_post == 1:
                done_marker = repname
            elif count_per_post > 1:
                done_marker = str(count_per_post - 1) + repname
            else:
                # Nothing sensible to generate; the finally clause below
                # still saves the cache before this returns None.
                return
            if (done_marker in self.__imageToHash
                    and done_marker in self.__imageToText):
                continue

            if i < 30 or i % 10 == 0:
                self.vPrint('partial: %5d/%d' % (i, total))

            # Adding an int to the default seed=None would raise TypeError,
            # so only derive a per-image seed when a base seed was supplied.
            item_seed = None if seed is None else seed + i
            try:
                target_path = join(self.img_dir, name)
                loc = join(self.img_dir, repname)
                bad_imgs = generate_bad_repost(target_path,
                                               count=count_per_post,
                                               res=res,
                                               rot=rot,
                                               asp=asp,
                                               crop=crop,
                                               save_loc=loc,
                                               seed=item_seed)
                # A non-list result is a single image; normalise it to the
                # list of (name, image) pairs the loop below expects.
                if not isinstance(bad_imgs, list):
                    bad_imgs = [(repname, bad_imgs)]
                for newrepname, bad_img in bad_imgs:
                    bad_img_hash = Hasher.hashImage(
                        bad_img, self.__imagehash_method)
                    bad_img_text = OCR.read2Normalized(bad_img)
                    self.__imageToHash[newrepname] = bad_img_hash
                    self.__imageToText[newrepname] = bad_img_text
            except FileNotFoundError as e:
                print(e)
                print("skipped an image that doesn't exist")
                continue
            except UnidentifiedImageError as e:
                print(e)
                print('skipped an unidentified image')
                continue
        self.vPrint('done!')
    except KeyboardInterrupt:
        self.vPrint('interrupted!')
        interrupted = True
    finally:
        # Persist whatever was generated, even on interrupt or error.
        self.saveProcessedDataToCache()
        self.vPrint('saved!')
    return not interrupted
def processData(self, only_cached_files=False, max_capacity=None):
    '''
    Processes all posts and returns two dictionaries in a tuple.
    The first maps image name to hash, and the second
    maps image name to OCR results.

    The results will also be cached in memory within the class and
    will be used in other methods for checking reposts

    Args:
        only_cached_files: when False, the candidate files are the
            non-hidden regular files in self.img_dir; when True, they are
            the names already present in the loaded cache.
        max_capacity: when not None, only the first max_capacity names
            (in sorted order) are considered.

    Returns:
        A tuple of two dictionaries, first one containing image name to
        hash mappings and second one containing image name to OCR readings.
    '''
    if only_cached_files:
        self.readProcessedDataFromCache()
        files = sorted(self.__imageToHash)
    else:
        files = sorted(
            f for f in listdir(self.img_dir)
            if isfile(join(self.img_dir, f)) and not f.startswith('.'))
        self.readProcessedDataFromCache()

    if max_capacity is not None:
        files = files[:max_capacity]

    # d and t alias the instance caches, so updates made below are
    # immediately visible through self as well.
    d = self.__imageToHash
    t = self.__imageToText
    total = len(files)

    self.vPrint("loading... " + str(total) + ' items')
    for i, file in enumerate(files):
        # Small batches report every file; larger ones roughly every 5%.
        # The short-circuit keeps total // 20 from ever being zero here.
        if total < 50 or i % (total // 20) == 0:
            self.vPrint('partial: %5d/%d' % (i, total))
        try:
            if file not in d or file not in t:
                # Context manager so the file handle is released as soon as
                # the image has been hashed and read.
                with Image.open(join(self.img_dir, file)) as img:
                    d[file] = Hasher.hashImage(img, self.__imagehash_method)
                    t[file] = OCR.read2Normalized(img)
        except KeyboardInterrupt:
            self.vPrint('skipped remaining files')
            # Drop the possibly half-written entry for the current file.
            d.pop(file, None)
            t.pop(file, None)
            break
        except UnidentifiedImageError:
            self.vPrint('skipped ' + file + ' (not an image)')
            d.pop(file, None)
            t.pop(file, None)

    self.vPrint('loaded: ' + str(len(d)) + ' items')
    self.saveProcessedDataToCache()
    return (d, t)
def checkRepostDetection(self,
                         img: str,
                         img_sim_min: float = 0.8,
                         text_sim_min: float = 0.7,
                         recheck_img: bool = True,
                         generate_repost: bool = False,
                         save_generated_repost: bool = True):
    '''
    Checks whether reposts can be detected correctly using a naive
    algorithm considering image hashes and ocr text.

    This assumes the dataset is correctly labelled such that a reposted
    image is the image name prefixed with _REPOST_.

    If an image is custom crafted and you don't want it to make a
    deduction of whether it's a true positive or otherwise, simply avoid
    using the standard format name of:
        <subreddit>_<postID>.<imgExtension>

    Args:
        img: file name (inside self.img_dir) of the image to check.
        img_sim_min: minimum image similarity; an entry counts as a repost
            only when its image difference is at most 1 - img_sim_min.
        text_sim_min: minimum text similarity ratio. When <= 0 the text
            comparison is skipped and the similarity is reported as 0.0.
        recheck_img: recompute the target's hash and text even when they
            are already cached. They are always computed when missing.
        generate_repost: also generate a '_REPOST_' + img dummy repost and
            add it to the caches before checking.
        save_generated_repost: write the generated dummy repost to
            self.img_dir.

    Returns:
        A dict mapping every other cached image name to a dict with the
        keys 'imgName', 'isRepost', 'validity' ('TP', 'FP', 'TN', 'FN' or
        '??' for non-standard names), 'imgDiff' and 'textSim'.
    '''
    # d and t alias the instance caches; in-place updates are shared.
    d = self.__imageToHash
    t = self.__imageToText
    target_check = img
    target_path = join(self.img_dir, target_check)

    self.vPrint('we\'ll process post : ' + target_check)

    # Compute the target's metadata when asked to, or when it is not fully
    # cached yet (looking up an uncached image would raise KeyError).
    if recheck_img or target_check not in d or target_check not in t:
        self.vPrint('computing target metadata')
        with Image.open(target_path) as target_img:
            target_hash = Hasher.hashImage(target_img,
                                           self.__imagehash_method)
            target_text = OCR.read2Normalized(target_img)
        d[target_check] = target_hash
        t[target_check] = target_text
    else:
        target_hash = d[target_check]
        target_text = t[target_check]

    bad_check = '_REPOST_' + target_check
    if generate_repost:
        self.vPrint('generating dummy repost : _REPOST_' + target_check)
        bad_img = generate_bad_repost(target_path)
        self.vPrint('computing target metadata')
        bad_img_hash = Hasher.hashImage(bad_img, self.__imagehash_method)
        bad_img_text = OCR.read2Normalized(bad_img)
        d[bad_check] = bad_img_hash
        t[bad_check] = bad_img_text
        if save_generated_repost:
            bad_img.save(join(self.img_dir, bad_check))

    if self.update_cache:
        self.saveProcessedDataToCache()

    self.vPrint('\nchecking...')

    # One (name, image difference, text similarity) triple per other image.
    distances = []
    for key, value in d.items():
        if key == target_check:
            continue
        img_diff = Hasher.diff(value, target_hash, 'IMAGE')
        if text_sim_min <= 0.0:
            text_sim = 0.0
        else:
            text_sim = Levenshtein.ratio(t[key], target_text)
        distances.append((key, img_diff, text_sim))

    def orderOfSort(x):
        '''dynamic sorting to prioritise text if image and text are both really close'''
        img_diff = x[1]
        txt_diff = 1 - x[2]
        if txt_diff <= 1 - text_sim_min and img_diff <= 1 - img_sim_min:
            # Shifting by -1 puts entries passing both thresholds ahead of
            # the rest (assuming non-negative differences), text first.
            return (txt_diff - 1, img_diff - 1)
        return (img_diff, txt_diff)

    distances.sort(key=orderOfSort)

    counter = 0
    results = {}
    FP = 0
    FN = 0

    self.vPrint('--- similar results ---')
    self.vPrint(' SAME? | IMG_SIM | TEXT_SIM | IMAGE')
    for a, b, c in distances:
        # Name with any '_REPOST_' prefix removed.
        base_name = a.split('_REPOST_')[-1]
        # Standard names look like <subreddit>_<postID>.<ext>.
        standardFormat = len(a.split('.')) == 2 and len(
            a.split('.')[0].split('_REPOST_')[-1].split('_')) == 2
        is_known_same = base_name == target_check.split('_REPOST_')[-1]
        is_repost = b <= 1 - img_sim_min and c >= text_sim_min

        if not standardFormat:
            validity = '??'
        elif is_known_same:
            if is_repost:
                validity = 'TP'
            else:
                validity = 'FN'
                FN += 1
        elif is_repost:
            validity = 'FP'
            FP += 1
        else:
            validity = 'TN'

        # Only the ten closest matches are printed; all are returned.
        if counter < 10:
            counter += 1
            if self.verbose:
                self.vPrint('%8s %7.3f %8.3f %-50s' %
                            (('YES, ' if is_repost else ' NO, ') + validity,
                             1 - b, c, a))
                if standardFormat:
                    subreddit = base_name.split('_')[0]
                    post_id = base_name.split('_')[-1].split('.')[0]
                    self.vPrint('reddit.com/r/' + subreddit +
                                '/comments/' + post_id + '/')
                else:
                    self.vPrint(
                        '• this image isn\'t from the standard dataset')
                # The target itself never appears in distances, so only the
                # "known same" note can apply here.
                if is_known_same:
                    self.vPrint(
                        '• this is a known to be the same as the chosen image'
                    )
                self.vPrint()

        results[a] = {
            'imgName': a,
            'isRepost': is_repost,
            'validity': validity,
            'imgDiff': b,
            'textSim': c
        }

    if FP or FN:
        self.vPrint('important notes:')
        self.vPrint(
            'we have %d known false positives and %d known false negatives for this\n'
            % (FP, FN))

    return results