def main():
    """Rebuild the ``sharing``, ``duplicate`` and ``issue`` tables.

    Steps, all performed on the database file ``test.sqlite3``:

    1. Enable foreign key enforcement and empty the three tables.
    2. For every hash value that occurs more than once in ``hash``, fetch
       the matching content rows, record their content ids in
       ``duplicate`` and pass the grouped rows (``compute_pkgdict``) to
       ``process_pkgdict`` together with the write cursor.
    3. Insert ``issue`` rows for files named ``*.gz`` without a
       ``gzip_sha512`` hash, and for content with a ``png_sha512`` /
       ``gif_sha512`` hash whose lower-cased filename does not end in
       ``.png`` / ``.gif``.
    4. Commit.

    The connection is always closed, including when a step raises.
    """
    db = sqlite3.connect("test.sqlite3")
    try:
        cur = db.cursor()
        cur.execute("PRAGMA foreign_keys = ON;")
        cur.execute("DELETE FROM sharing;")
        cur.execute("DELETE FROM duplicate;")
        cur.execute("DELETE FROM issue;")

        # A separate cursor streams the duplicated hash values so that
        # ``cur`` stays free for the per-hash queries and inserts below.
        readcur = db.cursor()
        readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
        for hashvalue, in fetchiter(readcur):
            cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;", (hashvalue,))
            rows = cur.fetchall()
            print("processing hash %s with %d entries" % (hashvalue, len(rows)))
            pkgdict = compute_pkgdict(rows)
            # row[1] is content.id; OR IGNORE tolerates a content id that
            # was already recorded for another hash value.
            cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);", [(row[1],) for row in rows])
            process_pkgdict(cur, pkgdict)

        cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
        cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
        cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
        db.commit()
    finally:
        # Release the database handle on every path. close() does not
        # commit, so a failed run leaves no partial changes behind.
        db.close()
def compute_comparison(self, pid1, pid2):
    """Compute a sequence of comparison objects ordered by the size of the
    object in the first package. Each element of the sequence is a dict
    defining the following keys:
     * filenames: A set of filenames in package 1 (pid1) all referring to
       the same object.
     * size: Size of the object in bytes.
     * matches: A mapping from filenames in package 2 (pid2) to a mapping
       from hash function pairs to hash values.

    This is a generator. Only content of pid1 that is listed in the
    ``duplicate`` table and has a ``sha512`` hash is considered. Entries
    with fewer matching filenames than the threshold (2 when pid1 == pid2,
    otherwise 1) are skipped. Both cursors are closed even if the consumer
    stops iterating early or a query raises.
    """
    # NOTE(review): presumably a file always matches itself when a package
    # is compared with itself, hence the higher threshold -- confirm.
    minmatch = 2 if pid1 == pid2 else 1
    cursize = -1
    # sha512 hash -> entry, for the size group currently being processed.
    files = dict()

    cur = self.db.cursor()
    try:
        cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;", (pid1,))
        for cid, filename, size, hashvalue in fetchiter(cur):
            if cursize != size:
                # Rows arrive ordered by size, so a size change means the
                # previous group is complete and can be emitted.
                for entry in files.values():
                    if len(entry["matches"]) >= minmatch:
                        yield entry
                files.clear()
                cursize = size

            if hashvalue in files:
                # Same object under another name: only record the name.
                files[hashvalue]["filenames"].add(filename)
                continue

            entry = dict(filenames=set((filename,)), size=size, matches={})
            files[hashvalue] = entry

            cur2 = self.db.cursor()
            try:
                cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;", (cid, pid2))
                # Distinct names so the outer row's filename/hashvalue are
                # not rebound by this inner loop.
                for func1, matchhash, func2, matchname in fetchiter(cur2):
                    entry["matches"].setdefault(matchname, {})[func1, func2] = matchhash
            finally:
                cur2.close()
    finally:
        cur.close()

    # Emit the last size group, which no size change flushed.
    for entry in files.values():
        if len(entry["matches"]) >= minmatch:
            yield entry
def show_source(self, package):
    """Render the overview page for the source package *package*.

    Looks up all binary package names whose ``source`` is *package* and,
    for each of them, keeps the ``sharing`` row with the largest ``size``
    (the first one seen wins on ties). Binary packages without any
    ``sharing`` row keep the value ``None``.

    Raises NotFound when no binary package belongs to *package*.
    Returns the result of ``html_response`` applied to the rendered
    ``source_template``.
    """
    cursor = self.db.cursor()

    cursor.execute("SELECT name FROM package WHERE source = ?;", (package,))
    binpkgs = {}
    for (pkgname,) in fetchiter(cursor):
        binpkgs[pkgname] = None
    if not binpkgs:
        raise NotFound

    cursor.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;", (package,))
    for binary, otherbin, func1, func2, files, size in fetchiter(cursor):
        candidate = {
            "package": otherbin,
            "funccomb": function_combination(func1, func2),
            "duplicate": files,
            "savable": size,
        }
        best = binpkgs.get(binary)
        if best and best["savable"] >= size:
            # The entry already stored saves at least as much; keep it.
            continue
        binpkgs[binary] = candidate

    params = {"source": package, "packages": binpkgs, "urlroot": ".."}
    page = source_template.render(params)
    return html_response(page)
def compute_comparison(self, pid1, pid2):
    """Compute a sequence of comparison objects ordered by the size of the
    object in the first package. Each element of the sequence is a dict
    defining the following keys:
     * filenames: A set of filenames in package 1 (pid1) all referring to
       the same object.
     * size: Size of the object in bytes.
     * matches: A mapping from filenames in package 2 (pid2) to a mapping
       from hash function pairs to hash values.

    This is a generator. Only content of pid1 that is listed in the
    ``duplicate`` table and has a ``sha512`` hash is considered. Entries
    with fewer matching filenames than the threshold (2 when pid1 == pid2,
    otherwise 1) are skipped. Both cursors are closed even if the consumer
    stops iterating early or a query raises.
    """
    # NOTE(review): presumably a file always matches itself when a package
    # is compared with itself, hence the higher threshold -- confirm.
    minmatch = 2 if pid1 == pid2 else 1
    cursize = -1
    # sha512 hash -> entry, for the size group currently being processed.
    files = dict()

    cur = self.db.cursor()
    try:
        cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;", (pid1,))
        for cid, filename, size, hashvalue in fetchiter(cur):
            if cursize != size:
                # Rows arrive ordered by size, so a size change means the
                # previous group is complete and can be emitted.
                for entry in files.values():
                    if len(entry["matches"]) >= minmatch:
                        yield entry
                files.clear()
                cursize = size

            if hashvalue in files:
                # Same object under another name: only record the name.
                files[hashvalue]["filenames"].add(filename)
                continue

            entry = dict(filenames=set((filename,)), size=size, matches={})
            files[hashvalue] = entry

            cur2 = self.db.cursor()
            try:
                cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;", (cid, pid2))
                # Distinct names so the outer row's filename/hashvalue are
                # not rebound by this inner loop.
                for func1, matchhash, func2, matchname in fetchiter(cur2):
                    entry["matches"].setdefault(matchname, {})[func1, func2] = matchhash
            finally:
                cur2.close()
    finally:
        cur.close()

    # Emit the last size group, which no size change flushed.
    for entry in files.values():
        if len(entry["matches"]) >= minmatch:
            yield entry
def show_source(self, package):
    """Render the overview page for the source package *package*.

    Looks up all binary package names whose ``source`` is *package* and,
    for each of them, keeps the ``sharing`` row with the largest ``size``
    (the first one seen wins on ties). Binary packages without any
    ``sharing`` row keep the value ``None``.

    Raises NotFound when no binary package belongs to *package*.
    Returns the result of ``html_response`` applied to the rendered
    ``source_template``.
    """
    cursor = self.db.cursor()

    cursor.execute("SELECT name FROM package WHERE source = ?;", (package, ))
    binpkgs = {}
    for (pkgname,) in fetchiter(cursor):
        binpkgs[pkgname] = None
    if not binpkgs:
        raise NotFound

    cursor.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;", (package, ))
    for binary, otherbin, func1, func2, files, size in fetchiter(cursor):
        candidate = {
            "package": otherbin,
            "funccomb": function_combination(func1, func2),
            "duplicate": files,
            "savable": size,
        }
        best = binpkgs.get(binary)
        if best and best["savable"] >= size:
            # The entry already stored saves at least as much; keep it.
            continue
        binpkgs[binary] = candidate

    params = {"source": package, "packages": binpkgs, "urlroot": ".."}
    page = source_template.render(params)
    return html_response(page)
def show_hash(self, function, hashvalue):
    """Render the page listing every file that has the hash *hashvalue*.

    A row matches when its hash equals *hashvalue* and its hash function
    shares an ``eqclass`` with the function named *function*. Each match
    becomes a dict with the keys ``package``, ``filename``, ``size`` and
    ``function`` (the name of the function that produced the hash).

    Raises NotFound when nothing matches. Returns the result of
    ``html_response`` applied to the rendered ``hash_template``.
    """
    cursor = self.db.cursor()
    cursor.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", (function, hashvalue))

    entries = []
    for pkgname, fname, fsize, funcname in fetchiter(cursor):
        entries.append({
            "package": pkgname,
            "filename": fname,
            "size": fsize,
            "function": funcname,
        })
    if not entries:
        raise NotFound()

    params = {
        "function": function,
        "hashvalue": hashvalue,
        "entries": entries,
        "urlroot": "../..",
    }
    page = hash_template.render(params)
    return html_response(page)
def cached_sharedstats(self, pid):
    """Collect the ``sharing`` rows of package *pid* grouped by function pair.

    Only rows whose two hash functions belong to the same ``eqclass`` are
    read. The result maps ``function_combination(func1, func2)`` to a list
    of dicts with the keys ``package`` (name of the other package, or
    ``None`` when the other package is *pid* itself), ``duplicate`` (the
    row's ``files`` value) and ``savable`` (the row's ``size`` value).
    """
    cursor = self.db.cursor()
    cursor.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", (pid,))

    stats = {}
    for otherpid, othername, func1, func2, files, size in fetchiter(cursor):
        bucket = stats.setdefault(function_combination(func1, func2), [])
        # Sharing within the package itself is reported without a name.
        shown = None if otherpid == pid else othername
        bucket.append({"package": shown, "duplicate": files, "savable": size})
    return stats
def cached_sharedstats(self, pid):
    """Collect the ``sharing`` rows of package *pid* grouped by function pair.

    Only rows whose two hash functions belong to the same ``eqclass`` are
    read. The result maps ``function_combination(func1, func2)`` to a list
    of dicts with the keys ``package`` (name of the other package, or
    ``None`` when the other package is *pid* itself), ``duplicate`` (the
    row's ``files`` value) and ``savable`` (the row's ``size`` value).
    """
    cursor = self.db.cursor()
    cursor.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", (pid, ))

    stats = {}
    for otherpid, othername, func1, func2, files, size in fetchiter(cursor):
        bucket = stats.setdefault(function_combination(func1, func2), [])
        # Sharing within the package itself is reported without a name.
        shown = None if otherpid == pid else othername
        bucket.append({"package": shown, "duplicate": files, "savable": size})
    return stats
def show_hash(self, function, hashvalue):
    """Render the page listing every file that has the hash *hashvalue*.

    A row matches when its hash equals *hashvalue* and its hash function
    shares an ``eqclass`` with the function named *function*. Each match
    becomes a dict with the keys ``package``, ``filename``, ``size`` and
    ``function`` (the name of the function that produced the hash).

    Raises NotFound when nothing matches. Returns the result of
    ``html_response`` applied to the rendered ``hash_template``.
    """
    cursor = self.db.cursor()
    cursor.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", (function, hashvalue))

    entries = []
    for pkgname, fname, fsize, funcname in fetchiter(cursor):
        entries.append({
            "package": pkgname,
            "filename": fname,
            "size": fsize,
            "function": funcname,
        })
    if not entries:
        raise NotFound()

    params = {
        "function": function,
        "hashvalue": hashvalue,
        "entries": entries,
        "urlroot": "../..",
    }
    page = hash_template.render(params)
    return html_response(page)
def main():
    """Rebuild the ``sharing``, ``duplicate`` and ``issue`` tables.

    Steps, all performed on the database file ``test.sqlite3``:

    1. Enable foreign key enforcement and empty the three tables.
    2. For every hash value that occurs more than once in ``hash``, fetch
       the matching content rows, record their content ids in
       ``duplicate`` and pass the grouped rows (``compute_pkgdict``) to
       ``process_pkgdict`` together with the write cursor.
    3. Insert ``issue`` rows for files named ``*.gz`` without a
       ``gzip_sha512`` hash, and for content with a ``png_sha512`` /
       ``gif_sha512`` hash whose lower-cased filename does not end in
       ``.png`` / ``.gif``.
    4. Commit.

    The connection is always closed, including when a step raises.
    """
    db = sqlite3.connect("test.sqlite3")
    try:
        cur = db.cursor()
        cur.execute("PRAGMA foreign_keys = ON;")
        cur.execute("DELETE FROM sharing;")
        cur.execute("DELETE FROM duplicate;")
        cur.execute("DELETE FROM issue;")

        # A separate cursor streams the duplicated hash values so that
        # ``cur`` stays free for the per-hash queries and inserts below.
        readcur = db.cursor()
        readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
        for hashvalue, in fetchiter(readcur):
            cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;", (hashvalue,))
            rows = cur.fetchall()
            print("processing hash %s with %d entries" % (hashvalue, len(rows)))
            pkgdict = compute_pkgdict(rows)
            # row[1] is content.id; OR IGNORE tolerates a content id that
            # was already recorded for another hash value.
            cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);", [(row[1],) for row in rows])
            process_pkgdict(cur, pkgdict)

        cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
        cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
        cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
        db.commit()
    finally:
        # Release the database handle on every path. close() does not
        # commit, so a failed run leaves no partial changes behind.
        db.close()
def get_dependencies(self, pid):
    """Return the set of ``required`` values stored in ``dependency`` for *pid*.

    The set is empty when the table has no row for *pid*.
    """
    cursor = self.db.cursor()
    cursor.execute("SELECT required FROM dependency WHERE pid = ?;", (pid,))
    required = set()
    for row in fetchiter(cursor):
        required.add(row[0])
    return required
def get_dependencies(self, pid):
    """Return the set of ``required`` values stored in ``dependency`` for *pid*.

    The set is empty when the table has no row for *pid*.
    """
    cursor = self.db.cursor()
    cursor.execute("SELECT required FROM dependency WHERE pid = ?;", (pid, ))
    required = set()
    for row in fetchiter(cursor):
        required.add(row[0])
    return required