Esempio n. 1
0
def main():
    """Rebuild the ``sharing``, ``duplicate`` and ``issue`` tables.

    Opens ``test.sqlite3``, empties the three tables and then, for every hash
    value that occurs more than once in ``hash``, records the affected content
    ids in ``duplicate`` and passes the grouped rows to ``process_pkgdict``.
    Afterwards three kinds of ``issue`` rows are inserted and the work is
    committed.

    The connection is closed on every path. If any step raises, the final
    ``commit`` is not reached.
    """
    db = sqlite3.connect("test.sqlite3")
    try:
        cur = db.cursor()
        cur.execute("PRAGMA foreign_keys = ON;")
        cur.execute("DELETE FROM sharing;")
        cur.execute("DELETE FROM duplicate;")
        cur.execute("DELETE FROM issue;")

        # readcur is kept separate from cur, because cur is re-executed
        # inside the loop while readcur is still being iterated.
        readcur = db.cursor()
        readcur.execute(
            "SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
        for hashvalue, in fetchiter(readcur):
            cur.execute(
                "SELECT content.pid, content.id, content.filename, content.size, hash.fid "
                "FROM hash JOIN content ON hash.cid = content.id "
                "WHERE hash = ?;",
                (hashvalue,))
            rows = cur.fetchall()
            print("processing hash %s with %d entries" %
                  (hashvalue, len(rows)))
            pkgdict = compute_pkgdict(rows)
            # row[1] is content.id (second selected column).
            cur.executemany(
                "INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
                [(row[1],) for row in rows])
            process_pkgdict(cur, pkgdict)

        # Files named *.gz for which no gzip_sha512 hash exists.
        cur.execute(
            "INSERT INTO issue (cid, issue) "
            "SELECT content.id, 'file named something.gz is not a valid gzip file' "
            "FROM content WHERE content.filename LIKE '%.gz' "
            "AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id "
            "WHERE hash.cid = content.id AND function.name = 'gzip_sha512');"
        )
        # Content with a png_sha512 hash whose name does not end in .png.
        cur.execute(
            "INSERT INTO issue (cid, issue) "
            "SELECT content.id, 'png image not named something.png' "
            "FROM content JOIN hash ON content.id = hash.cid "
            "JOIN function ON hash.fid = function.id "
            "WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';"
        )
        # Content with a gif_sha512 hash whose name does not end in .gif.
        cur.execute(
            "INSERT INTO issue (cid, issue) "
            "SELECT content.id, 'gif image not named something.gif' "
            "FROM content JOIN hash ON content.id = hash.cid "
            "JOIN function ON hash.fid = function.id "
            "WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';"
        )
        db.commit()
    finally:
        db.close()
Esempio n. 2
0
    def compute_comparison(self, pid1, pid2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
           the same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package 2 (pid2) to a mapping
           from hash function pairs to hash values.

        This is a generator. Both database cursors are closed even when the
        consumer stops iterating early or a query raises.
        """
        # When a package is compared with itself two matches are required;
        # presumably because every file trivially matches itself.
        minmatch = 2 if pid1 == pid2 else 1
        cursize = -1
        # Entries of the size group currently being collected, keyed by
        # sha512 hash value.
        files = dict()
        cur = self.db.cursor()
        try:
            cur.execute(
                "SELECT content.id, content.filename, content.size, hash.hash "
                "FROM content "
                "JOIN hash ON content.id = hash.cid "
                "JOIN duplicate ON content.id = duplicate.cid "
                "JOIN function ON hash.fid = function.id "
                "WHERE pid = ? AND function.name = 'sha512' "
                "ORDER BY size DESC;",
                (pid1,))
            for cid, filename, size, hashvalue in fetchiter(cur):
                if cursize != size:
                    # Rows arrive ordered by size, so a size change ends the
                    # current group: emit it and start over.
                    for entry in files.values():
                        if len(entry["matches"]) >= minmatch:
                            yield entry
                    files.clear()
                    cursize = size

                if hashvalue in files:
                    # Same object under another name; matches are already
                    # known.
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size,
                             matches={})
                files[hashvalue] = entry

                cur2 = self.db.cursor()
                try:
                    cur2.execute(
                        "SELECT fa.name, ha.hash, fb.name, filename "
                        "FROM hash AS ha "
                        "JOIN hash AS hb ON ha.hash = hb.hash "
                        "JOIN content ON hb.cid = content.id "
                        "JOIN function AS fa ON ha.fid = fa.id "
                        "JOIN function AS fb ON hb.fid = fb.id "
                        "WHERE ha.cid = ? AND pid = ?;",
                        (cid, pid2))
                    # Distinct names avoid shadowing the outer loop's
                    # hashvalue/filename.
                    for func1, matchhash, func2, matchname in fetchiter(cur2):
                        entry["matches"].setdefault(
                            matchname, {})[func1, func2] = matchhash
                finally:
                    cur2.close()
        finally:
            cur.close()

        # Emit the last size group.
        for entry in files.values():
            if len(entry["matches"]) >= minmatch:
                yield entry
Esempio n. 3
0
 def show_source(self, package):
     """Render the overview page for the source package *package*.

     Every binary package whose ``source`` equals *package* becomes a key of
     the ``packages`` mapping handed to ``source_template``. Its value is
     the ``sharing`` row with the largest size for that binary, or None
     when no such row exists. On equal sizes the row seen first is kept.

     Raises NotFound when no package row has this source.
     """
     cur = self.db.cursor()
     cur.execute("SELECT name FROM package WHERE source = ?;", (package,))
     binpkgs = {}
     for (name,) in fetchiter(cur):
         binpkgs[name] = None
     if not binpkgs:
         raise NotFound

     sharing_query = (
         "SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size "
         "FROM sharing "
         "JOIN package AS p1 ON sharing.pid1 = p1.id "
         "JOIN package AS p2 ON sharing.pid2 = p2.id "
         "JOIN function AS f1 ON sharing.fid1 = f1.id "
         "JOIN function AS f2 ON sharing.fid2 = f2.id "
         "WHERE p1.source = ?;"
     )
     cur.execute(sharing_query, (package,))
     for row in fetchiter(cur):
         binary, otherbin, func1, func2, files, size = row
         candidate = {
             "package": otherbin,
             "funccomb": function_combination(func1, func2),
             "duplicate": files,
             "savable": size,
         }
         best = binpkgs.get(binary)
         if best and best["savable"] >= size:
             # An at-least-as-large entry is already recorded.
             continue
         binpkgs[binary] = candidate

     params = {"source": package, "packages": binpkgs, "urlroot": ".."}
     return html_response(source_template.render(params))
Esempio n. 4
0
    def compute_comparison(self, pid1, pid2):
        """Compute a sequence of comparison objects ordered by the size of the
        object in the first package. Each element of the sequence is a dict
        defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
           the same object.
         * size: Size of the object in bytes.
         * matches: A mapping from filenames in package 2 (pid2) to a mapping
           from hash function pairs to hash values.

        The method is a generator; entries with fewer than the required
        number of matches are skipped. The cursors it opens are closed in
        ``finally`` clauses, so they are released if iteration is abandoned
        or a query fails.
        """
        minmatch = 2 if pid1 == pid2 else 1

        def has_enough_matches(entry):
            # Threshold is 2 when pid1 == pid2, otherwise 1.
            return len(entry["matches"]) >= minmatch

        cursize = -1
        files = dict()  # sha512 hash value -> entry, for the current size
        cur = self.db.cursor()
        try:
            cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;",
                        (pid1,))
            for cid, filename, size, hashvalue in fetchiter(cur):
                if cursize != size:
                    # The query orders by size, so all entries of the
                    # previous size are complete at this point.
                    for entry in files.values():
                        if has_enough_matches(entry):
                            yield entry
                    files.clear()
                    cursize = size

                if hashvalue in files:
                    files[hashvalue]["filenames"].add(filename)
                    continue

                entry = dict(filenames=set((filename,)), size=size,
                             matches={})
                files[hashvalue] = entry

                cur2 = self.db.cursor()
                try:
                    cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;",
                                 (cid, pid2))
                    # Inner names differ from the outer loop variables so
                    # that hashvalue/filename are not rebound.
                    for func1, otherhash, func2, othername in fetchiter(cur2):
                        pairs = entry["matches"].setdefault(othername, {})
                        pairs[func1, func2] = otherhash
                finally:
                    cur2.close()
        finally:
            cur.close()

        # Flush whatever remains from the final size group.
        for entry in files.values():
            if has_enough_matches(entry):
                yield entry
Esempio n. 5
0
 def show_source(self, package):
     """Build the HTML response describing a source package.

     Collects the names of all binary packages built from *package* and
     associates each with its largest ``sharing`` row (by ``size``); binaries
     without any sharing row keep the value None. Raises NotFound if the
     source has no binary packages.
     """
     cursor = self.db.cursor()
     cursor.execute("SELECT name FROM package WHERE source = ?;", (package, ))
     binpkgs = {name: None for name, in fetchiter(cursor)}
     if not binpkgs:
         raise NotFound

     cursor.execute(
         "SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;",
         (package, ))
     for binary, otherbin, func1, func2, files, size in fetchiter(cursor):
         funccomb = function_combination(func1, func2)
         known = binpkgs.get(binary)
         # Replace only when nothing is stored yet or the stored entry is
         # strictly smaller than this row.
         if not known or not (known["savable"] >= size):
             binpkgs[binary] = dict(package=otherbin,
                                    funccomb=funccomb,
                                    duplicate=files,
                                    savable=size)

     rendered = source_template.render(
         dict(source=package, packages=binpkgs, urlroot=".."))
     return html_response(rendered)
Esempio n. 6
0
 def show_hash(self, function, hashvalue):
     """Render the page listing every file that carries *hashvalue*.

     A file is listed when it has the given hash under any function whose
     ``eqclass`` equals that of the function named *function*. Each entry is
     a dict with the keys ``package``, ``filename``, ``size`` and
     ``function``.

     Raises NotFound when the query yields no rows.
     """
     query = (
         "SELECT package.name, content.filename, content.size, f2.name "
         "FROM hash "
         "JOIN content ON hash.cid = content.id "
         "JOIN package ON content.pid = package.id "
         "JOIN function AS f2 ON hash.fid = f2.id "
         "JOIN function AS f1 ON f2.eqclass = f1.eqclass "
         "WHERE f1.name = ? AND hash = ?;"
     )
     cur = self.db.cursor()
     cur.execute(query, (function, hashvalue))

     entries = []
     for pkgname, filename, size, otherfunc in fetchiter(cur):
         entries.append({
             "package": pkgname,
             "filename": filename,
             "size": size,
             "function": otherfunc,
         })
     if not entries:
         raise NotFound()

     params = {
         "function": function,
         "hashvalue": hashvalue,
         "entries": entries,
         "urlroot": "../..",
     }
     return html_response(hash_template.render(params))
Esempio n. 7
0
 def cached_sharedstats(self, pid):
     """Collect the ``sharing`` statistics recorded for package *pid*.

     Only rows whose two functions share an ``eqclass`` are considered.
     Returns a dict mapping ``function_combination(func1, func2)`` to a list
     of dicts with the keys ``package``, ``duplicate`` and ``savable``.
     ``package`` is None when the other package is *pid* itself.
     """
     query = (
         "SELECT pid2, package.name, f1.name, f2.name, files, size "
         "FROM sharing "
         "JOIN package ON sharing.pid2 = package.id "
         "JOIN function AS f1 ON sharing.fid1 = f1.id "
         "JOIN function AS f2 ON sharing.fid2 = f2.id "
         "WHERE pid1 = ? AND f1.eqclass = f2.eqclass;"
     )
     cur = self.db.cursor()
     cur.execute(query, (pid,))

     stats_by_combination = {}
     for other_pid, other_name, func1, func2, files, size in fetchiter(cur):
         combination = function_combination(func1, func2)
         # Sharing within the package itself is reported without a name.
         shown_name = None if other_pid == pid else other_name
         record = {"package": shown_name, "duplicate": files, "savable": size}
         stats_by_combination.setdefault(combination, []).append(record)
     return stats_by_combination
Esempio n. 8
0
 def cached_sharedstats(self, pid):
     """Return the sharing statistics of package *pid*, grouped by function
     combination.

     The result maps each value of ``function_combination(func1, func2)`` to
     a list of ``dict(package=..., duplicate=..., savable=...)`` records, one
     per matching ``sharing`` row. A row that refers back to *pid* itself
     gets ``package=None``.
     """
     grouped = {}
     cursor = self.db.cursor()
     cursor.execute(
         "SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
         (pid, ))
     for row in fetchiter(cursor):
         pid2, package2, func1, func2, files, size = row
         key = function_combination(func1, func2)
         bucket = grouped.get(key)
         if bucket is None:
             bucket = grouped[key] = list()
         if pid2 == pid:
             # Self-sharing: no partner package name to show.
             partner = None
         else:
             partner = package2
         bucket.append(dict(package=partner, duplicate=files, savable=size))
     return grouped
Esempio n. 9
0
 def show_hash(self, function, hashvalue):
     """Produce the HTML response for one (function, hash value) pair.

     Queries all content rows having *hashvalue* under a function in the same
     ``eqclass`` as *function* and passes them to ``hash_template`` as a list
     of dicts keyed ``package``, ``filename``, ``size`` and ``function``.
     Raises NotFound if there is no such content.
     """
     cursor = self.db.cursor()
     cursor.execute(
         "SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
         (function, hashvalue))

     # Keys in the order of the selected columns.
     columns = ("package", "filename", "size", "function")
     entries = [dict(zip(columns, row)) for row in fetchiter(cursor)]
     if not entries:
         raise NotFound()

     params = dict(function=function, hashvalue=hashvalue)
     params["entries"] = entries
     params["urlroot"] = "../.."
     return html_response(hash_template.render(params))
Esempio n. 10
0
def main():
    """Recompute the derived tables of ``test.sqlite3``.

    Steps, all within one connection:
     * clear ``sharing``, ``duplicate`` and ``issue``;
     * for each hash value stored more than once, insert the content ids
       into ``duplicate`` and call ``process_pkgdict`` with the result of
       ``compute_pkgdict`` for the matching rows;
     * insert ``issue`` rows for misnamed gzip, png and gif content;
     * commit.

    The connection is closed in a ``finally`` clause, so it is released even
    if one of the steps raises (in which case the commit is skipped).
    """
    issue_queries = (
        "INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');",
        "INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';",
        "INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';",
    )

    db = sqlite3.connect("test.sqlite3")
    try:
        cur = db.cursor()
        cur.execute("PRAGMA foreign_keys = ON;")
        for table_reset in ("DELETE FROM sharing;",
                            "DELETE FROM duplicate;",
                            "DELETE FROM issue;"):
            cur.execute(table_reset)

        # The outer iteration uses its own cursor because cur is executed
        # again for every hash value.
        readcur = db.cursor()
        readcur.execute(
            "SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
        for hashvalue, in fetchiter(readcur):
            cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
                        (hashvalue,))
            rows = cur.fetchall()
            print("processing hash %s with %d entries" %
                  (hashvalue, len(rows)))
            pkgdict = compute_pkgdict(rows)
            # Second column of each row is content.id.
            cur.executemany(
                "INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
                [(row[1],) for row in rows])
            process_pkgdict(cur, pkgdict)

        for query in issue_queries:
            cur.execute(query)
        db.commit()
    finally:
        db.close()
Esempio n. 11
0
 def get_dependencies(self, pid):
     """Return the set of ``required`` values stored in ``dependency`` for
     package *pid*.
     """
     cursor = self.db.cursor()
     cursor.execute("SELECT required FROM dependency WHERE pid = ?;", (pid,))
     required = set()
     for row in fetchiter(cursor):
         required.add(row[0])
     return required
Esempio n. 12
0
 def get_dependencies(self, pid):
     """Look up the dependencies of package *pid*.

     Returns a set holding the first column (``required``) of every
     ``dependency`` row whose ``pid`` matches; empty if there are none.
     """
     query = "SELECT required FROM dependency WHERE pid = ?;"
     cur = self.db.cursor()
     cur.execute(query, (pid, ))
     return {row[0] for row in fetchiter(cur)}