def delete_from_db(condition=None):
    """Delete rows from the ``images`` table and return the delete result.

    Opens the database at ``settings.IMAGES_DB`` (with ``~`` expanded),
    forwards ``condition`` unchanged to ``DBManager.delete`` and commits.

    :param condition: passed straight through to ``DBManager.delete``.
        NOTE(review): presumably ``None`` means "no WHERE clause" -- confirm
        against DBManager before relying on it.
    :return: whatever ``DBManager.delete`` returns (named ``count`` here).
    """
    dbm = DBManager(expanduser(settings.IMAGES_DB))
    dbm.connect()
    try:
        count = dbm.delete('images', condition)
        dbm.commit()
    finally:
        # Release the connection even when delete/commit raises.
        dbm.disconnect()
    return count
def clear_duplicate_images(arg):
    """Remove duplicate image files of one job and flag them in the database.

    The job's directory is derived from the stored path of one of its images
    (everything up to and including the last occurrence of the job name).
    Every file below that directory is hashed with MD5; for each group of
    files sharing a hash the one with the oldest modification time is kept
    and the others are deleted.  The ``images`` row of every deleted file
    gets its ``path`` set to ``'#duplicate'``.

    NOTE(review): this module defines ``clear_duplicate_images`` more than
    once; only the last definition is bound at import time.  Consider
    removing the redundant copy.

    :param arg: name of the job whose images should be de-duplicated.
    :return: None.  Progress and the final count are printed.
    """
    jobname = arg
    dbm = DBManager(expanduser(settings.IMAGES_DB))
    dbm.connect()
    try:
        # The query helper is only handed a finished SQL string here, so
        # embedded single quotes are doubled (standard SQL escaping) to keep
        # odd job names or paths from breaking -- or altering -- the query.
        result = dbm.query(
            'SELECT path FROM images WHERE job = \'%s\' LIMIT 1'
            % jobname.replace("'", "''"))
        if not result:
            print('no such job')
            return

        imagepath = result[0]['path']
        job_pos = imagepath.rfind(jobname)
        if job_pos == -1:
            # Slicing with -1 would produce an unrelated directory (possibly
            # the filesystem root), so refuse to walk it.
            print('job name not found in image path: %s' % imagepath)
            return
        jobpath = imagepath[0:job_pos + len(jobname)]
        print(jobpath)

        # Group file paths by content hash, remembering first-seen order so
        # that output order stays deterministic on unordered dicts.
        paths_by_hash = {}
        hash_order = []
        for root, _dirs, files in os.walk(jobpath):
            for filename in files:
                filepath = joinpath(root, filename)
                md5hash = md5()
                with open(filepath, 'rb') as f:
                    # Hash in chunks instead of loading whole images.
                    for chunk in iter(lambda: f.read(1 << 20), b''):
                        md5hash.update(chunk)
                filehash = md5hash.hexdigest()
                if filehash not in paths_by_hash:
                    paths_by_hash[filehash] = []
                    hash_order.append(filehash)
                paths_by_hash[filehash].append(filepath)

        dups_total = 0
        for filehash in hash_order:
            same_files = paths_by_hash[filehash]
            if len(same_files) < 2:
                continue
            # min() returns the first minimum, i.e. on equal mtimes the file
            # encountered first during the walk is the one that is kept.
            keep = min(same_files, key=lambda p: os.stat(p).st_mtime)
            for path in same_files:
                if path == keep:
                    continue
                print('deleting %s' % path)
                try:
                    os.remove(path)
                except OSError as e:
                    # Print the exception itself: ``e.message`` is deprecated
                    # (PEP 352) and does not exist on Python 3.
                    print(e)
                    # The file is still on disk, so neither count it as
                    # deleted nor mark its row as a duplicate.
                    continue
                dups_total += 1
                dbm.query(
                    'UPDATE images SET path = \'#duplicate\' WHERE path = \'%s\''
                    % path.replace("'", "''"))
        dbm.commit()
    finally:
        # Always release the connection, including on early return or error.
        dbm.disconnect()
    print('%d duplicate images deleted.' % dups_total)
def clear_duplicate_images(arg):
    """Remove duplicate image files of one job and flag them in the database.

    The job's directory is derived from the stored path of one of its images
    (everything up to and including the last occurrence of the job name).
    Every file below that directory is hashed with MD5; for each group of
    files sharing a hash the one with the oldest modification time is kept
    and the others are deleted.  The ``images`` row of every deleted file
    gets its ``path`` set to ``'#duplicate'``.

    NOTE(review): an earlier definition of ``clear_duplicate_images`` exists
    in this module and is overridden by this one at import time.  Consider
    removing the redundant copy.

    :param arg: name of the job whose images should be de-duplicated.
    :return: None.  Progress and the final count are printed.
    """
    jobname = arg
    dbm = DBManager(expanduser(settings.IMAGES_DB))
    dbm.connect()
    try:
        # The query helper is only handed a finished SQL string here, so
        # embedded single quotes are doubled (standard SQL escaping) to keep
        # odd job names or paths from breaking -- or altering -- the query.
        result = dbm.query(
            'SELECT path FROM images WHERE job = \'%s\' LIMIT 1'
            % jobname.replace("'", "''"))
        if not result:
            print('no such job')
            return

        imagepath = result[0]['path']
        job_pos = imagepath.rfind(jobname)
        if job_pos == -1:
            # Slicing with -1 would produce an unrelated directory (possibly
            # the filesystem root), so refuse to walk it.
            print('job name not found in image path: %s' % imagepath)
            return
        jobpath = imagepath[0:job_pos + len(jobname)]
        print(jobpath)

        # Group file paths by content hash, remembering first-seen order so
        # that output order stays deterministic on unordered dicts.
        paths_by_hash = {}
        hash_order = []
        for root, _dirs, files in os.walk(jobpath):
            for filename in files:
                filepath = joinpath(root, filename)
                md5hash = md5()
                with open(filepath, 'rb') as f:
                    # Hash in chunks instead of loading whole images.
                    for chunk in iter(lambda: f.read(1 << 20), b''):
                        md5hash.update(chunk)
                filehash = md5hash.hexdigest()
                if filehash not in paths_by_hash:
                    paths_by_hash[filehash] = []
                    hash_order.append(filehash)
                paths_by_hash[filehash].append(filepath)

        dups_total = 0
        for filehash in hash_order:
            same_files = paths_by_hash[filehash]
            if len(same_files) < 2:
                continue
            # min() returns the first minimum, i.e. on equal mtimes the file
            # encountered first during the walk is the one that is kept.
            keep = min(same_files, key=lambda p: os.stat(p).st_mtime)
            for path in same_files:
                if path == keep:
                    continue
                print('deleting %s' % path)
                try:
                    os.remove(path)
                except OSError as e:
                    # Print the exception itself: ``e.message`` is deprecated
                    # (PEP 352) and does not exist on Python 3.
                    print(e)
                    # The file is still on disk, so neither count it as
                    # deleted nor mark its row as a duplicate.
                    continue
                dups_total += 1
                dbm.query(
                    'UPDATE images SET path = \'#duplicate\' WHERE path = \'%s\''
                    % path.replace("'", "''"))
        dbm.commit()
    finally:
        # Always release the connection, including on early return or error.
        dbm.disconnect()
    print('%d duplicate images deleted.' % dups_total)
class ImageStorePipeline(object):
    """Item pipeline that moves downloaded images into a per-job directory.

    For every image of an ``ImageItem`` the file is renamed from
    ``settings.IMAGES_STORE`` into ``settings.IMAGES_STORE_FINAL/<jobname>``
    under a name derived from its URL, the spider's monitor is updated and,
    when the images database exists, a row is inserted into ``images``.
    """

    def __init__(self):
        """Open the images database if its file exists, else run without it."""
        # Function-scope import so this class does not depend on the module
        # header; ``~`` is expanded like the other users of IMAGES_DB do.
        from os.path import expanduser

        db_path = expanduser(settings.IMAGES_DB)
        # Set safe defaults first so __del__ works even if connecting fails.
        self._dbm = None
        self._nodb = True
        if exists(db_path):
            self._dbm = DBManager(db_path)
            self._dbm.connect()
            self._nodb = False
            log.debug('opened db: %s' % settings.IMAGES_DB)
        else:
            log.debug('could not open db: %s' % settings.IMAGES_DB)

    def process_item(self, item, spider):
        """Move the item's images to their final location and record them.

        :param item: the scraped item; anything that is not an ``ImageItem``
            (or has no images) is passed through untouched.
        :param spider: must provide ``jobname`` and ``update_monitor(path)``.
        :return: the unchanged ``item``.
        """
        if not isinstance(item, ImageItem):
            return item
        images = item.get('images', None)
        if not images:
            return item

        final_storepath = joinpath(settings.IMAGES_STORE_FINAL, spider.jobname)
        for d in images:
            filebasename, ext = self.get_filename(d['url'])
            # No trailing dot when the URL carried no extension.
            suffix = '.' + ext if ext else ''
            final_path = joinpath(final_storepath, filebasename + suffix)
            # On a name clash append _00, _01, ... until the name is free.
            i = 0
            while exists(final_path):
                log.debug(final_path + ' exists')
                final_path = joinpath(
                    final_storepath, filebasename + '_%02d' % i + suffix)
                i += 1
            try:
                os.rename(joinpath(settings.IMAGES_STORE, d['path']),
                          final_path)
                log.debug('moved to: ' + final_path)
                spider.update_monitor(final_path)
                if not self._nodb:
                    self._dbm.insert(
                        'images',
                        (d['url'], final_path, spider.jobname, int(time())))
            except OSError as e:
                # Best effort: a failed move is logged and the next image
                # is still processed.
                log.error(e)
        if not self._nodb:
            self._dbm.commit()
        return item

    @staticmethod
    def _keep_word(word):
        """Tell whether a URL fragment is usable as part of a file name.

        Empty fragments are dropped, fragments of one or two characters are
        always kept, longer ones only when fewer than half (floor) of their
        characters are ASCII digits.
        """
        if not word:
            return False
        if len(word) <= 2:
            return True
        digits = sum(1 for c in word if '0' <= c <= '9')
        # Floor division spells out the integer division that ``/`` performs
        # on ints under Python 2.
        return digits < len(word) // 2

    def get_filename(self, url):
        """Derive a readable base name and the extension from an image URL.

        The last path segment is split at its final dot into stem and
        extension.  Starting with the stem and walking backwards through the
        remaining URL segments, each segment is split on ``-``, ``.``, ``+``
        and ``_`` and its usable words (see ``_keep_word``) are prepended,
        until at least one word longer than two characters has been
        collected or no segments are left.

        :param url: URL of the image.
        :return: ``(basename, ext)``; ``basename`` is the words joined with
            ``_`` or ``'image'`` when none were found, ``ext`` is the text
            after the last dot of the final segment (``''`` if there is no
            dot).
        """
        url_parts = url.split('/')
        # Take the file name first so URLs with fewer than three segments
        # do not fail, then drop what is left of the 'scheme:' and the empty
        # segment in front of the host.
        filename = url_parts.pop()
        del url_parts[0:2]

        stem, dot, ext = filename.rpartition('.')
        if not dot:
            # No extension at all: keep the whole name as stem.
            stem, ext = filename, ''
        url_parts.append(stem)

        words = []
        while url_parts and not any(len(w) > 2 for w in words):
            part = url_parts.pop()
            part = part.replace('-', '_').replace('.', '_').replace('+', '_')
            words = [w for w in part.split('_') if self._keep_word(w)] + words

        # The caller adds the '.' before the extension itself, so the
        # fallback name must not end in a dot.
        final = '_'.join(words) if words else 'image'
        return final, ext

    def __del__(self):
        """Close the database connection if one was opened."""
        # getattr guards against instances whose __init__ never completed.
        if not getattr(self, '_nodb', True):
            self._dbm.disconnect()