Example #1
0
 def delete_from_db(condition=None):
     """Delete rows from the ``images`` table and commit the change.

     Args:
         condition: Passed straight through to ``DBManager.delete`` as its
             second argument; ``None`` is forwarded as-is.
             NOTE(review): presumably ``None`` means "all rows" -- confirm
             against DBManager.

     Returns:
         Whatever ``DBManager.delete`` returns (used here as a row count).
     """
     dbm = DBManager(expanduser(settings.IMAGES_DB))
     dbm.connect()
     try:
         count = dbm.delete('images', condition)
         dbm.commit()
     finally:
         # Release the connection even when delete()/commit() raises.
         dbm.disconnect()
     return count
Example #2
0
	def delete_from_db(condition=None):
		"""Delete rows from the ``images`` table and commit the change.

		Args:
			condition: Passed straight through to ``DBManager.delete`` as its
				second argument; ``None`` is forwarded as-is.
				NOTE(review): presumably ``None`` means "all rows" -- confirm
				against DBManager.

		Returns:
			Whatever ``DBManager.delete`` returns (used here as a row count).
		"""
		dbm = DBManager(expanduser(settings.IMAGES_DB))
		dbm.connect()
		try:
			count = dbm.delete('images', condition)
			dbm.commit()
		finally:
			# Release the connection even when delete()/commit() raises.
			dbm.disconnect()
		return count
Example #3
0
def clear_duplicate_images(arg):
	"""Delete byte-identical duplicate images of one job, keeping the oldest copy.

	The job's folder is derived from the path of one of its rows in the
	``images`` table: everything up to and including the last occurrence of
	the job name. Every file below that folder is MD5-hashed; within each
	group of equal hashes the file with the smallest mtime is kept (the first
	one found wins a tie) and the others are removed from disk. The rows of
	removed files get their ``path`` set to ``#duplicate``.

	Args:
		arg: Name of the job (a string) whose images are de-duplicated.

	Returns:
		None. Progress and the final count are printed.
	"""
	jobname = arg
	dbm = DBManager(expanduser(settings.IMAGES_DB))
	dbm.connect()
	try:
		# query() is only known here to take a finished SQL string, so single
		# quotes are doubled (standard SQL escaping) to keep names/paths that
		# contain an apostrophe from breaking the statement.
		# NOTE(review): switch to bound parameters if DBManager supports them.
		result = dbm.query(
			"SELECT path FROM images WHERE job = '%s' LIMIT 1"
			% jobname.replace("'", "''"))
		if len(result) == 0:
			print('no such job')
			return

		imagepath = result[0]['path']
		job_pos = imagepath.rfind(jobname)
		if job_pos == -1:
			# Without this guard the slice below would produce an unrelated
			# prefix (possibly '/') that would then be walked and pruned.
			print('job name not found in image path: %s' % imagepath)
			return
		jobpath = imagepath[0:job_pos + len(jobname)]

		print(jobpath)

		# Group file paths by content hash, remembering first-seen order so
		# the processing order stays deterministic.
		paths_by_hash = {}
		hash_order = []
		for root, _dirs, files in os.walk(jobpath):
			for filename in files:
				filepath = joinpath(root, filename)
				md5hash = md5()
				with open(filepath, 'rb') as f:
					# Hash in chunks so large images are not loaded whole.
					for chunk in iter(lambda: f.read(1 << 20), b''):
						md5hash.update(chunk)
				filehash = md5hash.hexdigest()
				if filehash not in paths_by_hash:
					paths_by_hash[filehash] = []
					hash_order.append(filehash)
				paths_by_hash[filehash].append(filepath)

		dups_total = 0
		for filehash in hash_order:
			same_files = paths_by_hash[filehash]
			if len(same_files) < 2:
				continue

			mtimes = [os.stat(path).st_mtime for path in same_files]
			# index() returns the first minimum: the oldest file, with ties
			# going to the one found first.
			keep = mtimes.index(min(mtimes))

			for idx, path in enumerate(same_files):
				if idx == keep:
					continue
				dups_total += 1
				print('deleting %s' % path)
				try:
					os.remove(path)
				except OSError as e:
					print(e)

				# The row is marked even when removal failed, as before.
				dbm.query(
					"UPDATE images SET path = '#duplicate' WHERE path = '%s'"
					% path.replace("'", "''"))

		dbm.commit()
	finally:
		# Always release the connection, including on early return or error.
		dbm.disconnect()

	print('%d duplicate images deleted.' % dups_total)
Example #4
0
def clear_duplicate_images(arg):
    """Delete byte-identical duplicate images of one job, keeping the oldest copy.

    The job's folder is derived from the path of one of its rows in the
    ``images`` table: everything up to and including the last occurrence of
    the job name. Every file below that folder is MD5-hashed; within each
    group of equal hashes the file with the smallest mtime is kept (the first
    one found wins a tie) and the others are removed from disk. The rows of
    removed files get their ``path`` set to ``#duplicate``.

    Args:
        arg: Name of the job (a string) whose images are de-duplicated.

    Returns:
        None. Progress and the final count are printed.
    """
    jobname = arg
    dbm = DBManager(expanduser(settings.IMAGES_DB))
    dbm.connect()
    try:
        # query() is only known here to take a finished SQL string, so single
        # quotes are doubled (standard SQL escaping) to keep names/paths that
        # contain an apostrophe from breaking the statement.
        # NOTE(review): switch to bound parameters if DBManager supports them.
        result = dbm.query(
            "SELECT path FROM images WHERE job = '%s' LIMIT 1"
            % jobname.replace("'", "''"))
        if len(result) == 0:
            print('no such job')
            return

        imagepath = result[0]['path']
        job_pos = imagepath.rfind(jobname)
        if job_pos == -1:
            # Without this guard the slice below would produce an unrelated
            # prefix (possibly '/') that would then be walked and pruned.
            print('job name not found in image path: %s' % imagepath)
            return
        jobpath = imagepath[0:job_pos + len(jobname)]

        print(jobpath)

        # Group file paths by content hash, remembering first-seen order so
        # the processing order stays deterministic.
        paths_by_hash = {}
        hash_order = []
        for root, _dirs, files in os.walk(jobpath):
            for filename in files:
                filepath = joinpath(root, filename)
                md5hash = md5()
                with open(filepath, 'rb') as f:
                    # Hash in chunks so large images are not loaded whole.
                    for chunk in iter(lambda: f.read(1 << 20), b''):
                        md5hash.update(chunk)
                filehash = md5hash.hexdigest()
                if filehash not in paths_by_hash:
                    paths_by_hash[filehash] = []
                    hash_order.append(filehash)
                paths_by_hash[filehash].append(filepath)

        dups_total = 0
        for filehash in hash_order:
            same_files = paths_by_hash[filehash]
            if len(same_files) < 2:
                continue

            mtimes = [os.stat(path).st_mtime for path in same_files]
            # index() returns the first minimum: the oldest file, with ties
            # going to the one found first.
            keep = mtimes.index(min(mtimes))

            for idx, path in enumerate(same_files):
                if idx == keep:
                    continue
                dups_total += 1
                print('deleting %s' % path)
                try:
                    os.remove(path)
                except OSError as e:
                    print(e)

                # The row is marked even when removal failed, as before.
                dbm.query(
                    "UPDATE images SET path = '#duplicate' WHERE path = '%s'"
                    % path.replace("'", "''"))

        dbm.commit()
    finally:
        # Always release the connection, including on early return or error.
        dbm.disconnect()

    print('%d duplicate images deleted.' % dups_total)
Example #5
0
class ImageStorePipeline(object):
    """Pipeline that moves downloaded images into a per-job folder and records them.

    Each image of an ``ImageItem`` is renamed from ``settings.IMAGES_STORE``
    into ``settings.IMAGES_STORE_FINAL/<spider.jobname>`` under a readable
    name derived from its URL. When the database file named by
    ``settings.IMAGES_DB`` exists, a row per moved image is inserted into the
    ``images`` table.
    """

    def __init__(self):
        """Connect to the images database if its file exists, else run without it."""
        # Define the attribute up front so __del__ is safe in database-less mode.
        self._dbm = None
        # Expand a leading '~' so a home-relative setting resolves to the real file.
        db_path = os.path.expanduser(settings.IMAGES_DB)
        if exists(db_path):
            self._dbm = DBManager(db_path)
            self._dbm.connect()
            self._nodb = False
            log.debug('opened db: %s' % settings.IMAGES_DB)
        else:
            self._nodb = True
            log.debug('could not open db: %s' % settings.IMAGES_DB)

    def process_item(self, item, spider):
        """Move an item's images to their final location and record them.

        Items that are not ``ImageItem`` instances, or that carry no images,
        are passed through untouched. A failed move is logged and skipped.

        Args:
            item: The scraped item; its ``images`` entries are read for the
                ``'url'`` and ``'path'`` keys.
            spider: Supplies ``jobname`` and ``update_monitor(path)``.

        Returns:
            The item, unchanged.
        """
        if isinstance(item, ImageItem):
            images = item.get('images', None)
            final_storepath = joinpath(settings.IMAGES_STORE_FINAL,
                                       spider.jobname)

            for d in images or ():
                filebasename, ext = self.get_filename(d['url'])
                if not ext:
                    # The URL had no extension; fall back to the one of the
                    # already-downloaded file.
                    ext = os.path.splitext(d['path'])[1][1:]
                suffix = '.' + ext if ext else ''
                final_path = self._free_path(final_storepath, filebasename,
                                             suffix)

                try:
                    os.rename(joinpath(settings.IMAGES_STORE, d['path']),
                              final_path)
                    log.debug('moved to: ' + final_path)
                    spider.update_monitor(final_path)
                    if not self._nodb:
                        self._dbm.insert('images',
                                         (d['url'], final_path,
                                          spider.jobname, int(time())))
                except OSError as e:
                    log.error(e)

        if not self._nodb:
            self._dbm.commit()
        return item

    def _free_path(self, directory, basename, suffix):
        """Return a path in *directory* that does not exist yet.

        The plain ``basename + suffix`` is tried first; on a clash a
        two-digit counter (``_00``, ``_01``, ...) is inserted before the suffix.
        """
        candidate = joinpath(directory, basename + suffix)
        i = 0
        while exists(candidate):
            log.debug(candidate + ' exists')
            candidate = joinpath(directory, basename + '_%02d' % i + suffix)
            i += 1
        return candidate

    def get_filename(self, url):
        """Derive a readable base name and an extension from an image URL.

        The URL is split on ``/`` and walked from the last segment towards
        the host. Each segment is broken into words on ``-``, ``.``, ``+``
        and ``_``; words longer than two characters are dropped when too many
        of their characters are digits. Walking stops as soon as at least one
        word longer than two characters has been collected.

        Args:
            url: The image URL, expected in ``scheme://host/...`` form.

        Returns:
            A ``(basename, ext)`` tuple. ``basename`` is the kept words
            joined by ``_`` or ``'image'`` when none were kept; ``ext`` is
            the text after the last ``.`` of the final URL segment, or ``''``
            when that segment has no dot.
        """
        url_parts = url.split('/')
        # Drop the 'scheme:' piece and the empty string between the slashes.
        del url_parts[0:2]

        filename = url_parts.pop() if url_parts else ''
        stem, dot, ext = filename.rpartition('.')
        if not dot:
            # No extension: keep the whole name instead of chopping its last
            # character and mistaking the name for the extension.
            stem, ext = filename, ''
        url_parts.append(stem)

        words = []
        while not any(len(w) > 2 for w in words) and url_parts:
            part = url_parts.pop()
            part = part.replace('-', '_').replace('.', '_').replace('+', '_')
            kept = []
            for w in part.split('_'):
                if not w:
                    continue
                digit_count = len([c for c in w if '0' <= c <= '9'])
                # Short words always pass; longer ones only when fewer than
                # half of their characters are digits.
                # NOTE(review): `/` floors on Python 2 but not on Python 3,
                # so the threshold for odd-length words differs by version.
                if len(w) <= 2 or digit_count < len(w) / 2:
                    kept.append(w)
            words = kept + words

        # No trailing dot on the fallback: callers add the '.' themselves.
        final = '_'.join(words) if words else 'image'

        return final, ext

    def __del__(self):
        """Disconnect from the database if one was ever opened."""
        dbm = getattr(self, '_dbm', None)
        if dbm is not None:
            dbm.disconnect()