Example #1
    def load_database(self):
        self.report_progress("Loading database...")
        self.db = url_database.ensure_database(self.args)

        cr = self.db.cursor()
        # Cache the status table in memory; it's reasonably small.
        self.report_progress("Loading database... (canon statuses)")
        cr.execute("SELECT id, status FROM canon_statuses;")
        self.canon_statuses = { row[1]: row[0]
                                for row in url_database.fetch_iter(cr) }
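
These examples lean on url_database.fetch_iter, whose definition is not shown here. A minimal sketch of what such a helper typically looks like, assuming it simply wraps cursor.fetchmany() to stream rows in bounded-memory batches (the name fetch_iter is from the examples; the batch size is an assumption):

    def fetch_iter(cursor, batch_size=1000):
        # Hypothetical stand-in for url_database.fetch_iter: stream rows
        # from a DB-API cursor without materializing the whole result set.
        while True:
            rows = cursor.fetchmany(batch_size)
            if not rows:
                break
            yield from rows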
Example #2
    def merge_ancillary(self, destdb, srcdbs):
        # The ancillary table merge currently assumes that we don't
        # need to do any ID fixups.  This is accurate for the present
        # data set just because each database-to-be-merged has only one
        # origin in it, but may become a problem later.

        writer = destdb.cursor()
        already_created = set(row[0] for row in writer.execute(
                "SELECT name FROM sqlite_master WHERE name NOT LIKE 'sqlite_%'"
                ).fetchall())

        for tag, sdb in srcdbs.items():
            self.mon.report_status("Merging ancillary tables ({})..."
                                   .format(tag))
            self.mon.maybe_pause_or_stop()
            scur = sdb.cursor()
            for name, sql in scur.execute(
                    "SELECT name, sql FROM sqlite_master "
                    "WHERE type = 'table' "
                    "AND name NOT LIKE 'sqlite_%'").fetchall():
                if name in self.tables_processed:
                    continue

                self.mon.report_status("Merging ancillary tables ({})... {}"
                                       .format(tag, name))
                self.mon.maybe_pause_or_stop()

                if name not in already_created:
                    writer.executescript(sql)
                    for isql in scur.execute(
                            "SELECT sql FROM sqlite_master "
                            "WHERE type = 'index' "
                            "AND tbl_name = ?", (name,)).fetchall():
                        writer.executescript(isql[0])
                    already_created.add(name)

                # This is the least bad available way to find out how
                # many columns the table has.  It doesn't work if the
                # table is empty, but if the table is empty, we don't
                # need to do anything.
                row = scur.execute(
                    "SELECT * FROM \""+name+"\" LIMIT 1").fetchone()
                if row is None:
                    continue

                cols = len(row)
                insertion = ("INSERT INTO \""+name+"\" VALUES(" +
                             ",".join(["?"]*cols) + ")")
                selection = "SELECT * FROM \""+name+"\""
                # sqlite3.Row objects cannot be passed directly to execute().
                # Feh. Feh, I say.
                for row in url_database.fetch_iter(scur.execute(selection)):
                    writer.execute(insertion, tuple(row))
                destdb.commit()
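
The single-row probe above cannot determine the column count of an empty table, which is harmless here since an empty table needs no copying. If that shortcut ever becomes a problem, SQLite's PRAGMA table_info reports one row per column even when the table is empty; a sketch of that alternative (the table name still has to be interpolated, since PRAGMA arguments cannot be bound as parameters):

    # Alternative column-count probe that also works on empty tables:
    # read the schema instead of sampling a row.
    cols = len(scur.execute('PRAGMA table_info("{}")'.format(name)).fetchall())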
Example #3
    def merge_url_strings(self, destdb, srcdbs):
        # It is expected to be more efficient to merge the strings in
        # memory.
        stringmap = defaultdict(dict)
        uidmap = {}

        for tag, sdb in srcdbs.items():
            self.mon.report_status("Reading URL strings ({})..."
                                   .format(tag))
            self.mon.maybe_pause_or_stop()
            scur = sdb.cursor()
            n = 0
            for row in url_database.fetch_iter(
                    scur.execute("SELECT * FROM url_strings")):
                stringmap[row['url']][tag] = row['id']
                n = max(n, row['id'])
                self.mon.report_status("Reading URL strings ({})... {}"
                                       .format(tag, n))
                self.mon.maybe_pause_or_stop()
            uidmap[tag] = [None]*(n+1)

        self.mon.report_status("Sorting URL strings...")
        self.mon.maybe_pause_or_stop()
        merged = sorted(stringmap.keys())
        self.mon.report_status("Writing URL strings...")
        self.mon.maybe_pause_or_stop()
        destdb.executemany("INSERT INTO url_strings VALUES(?,?)",
                           enumerate(merged, 1))
        self.mon.report_status("Writing URL strings (commit)...")
        self.mon.maybe_pause_or_stop()
        destdb.commit()
        self.mon.report_status("Constructing uidmap...")
        self.mon.maybe_pause_or_stop()
        for uid, url in enumerate(merged, 1):
            smap = stringmap[url]
            self.mon.report_status("Constructing uidmap... {}".format(uid))
            self.mon.maybe_pause_or_stop()
            for tag in srcdbs.keys():
                sid = smap.get(tag, None)
                if sid is not None:
                    uidmap[tag][sid] = uid

        self.tables_processed.add("url_strings")
        return uidmap
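
The deduplicate-and-remap pattern used here is easier to see on toy data. A self-contained sketch with invented values: two sources assign conflicting ids to overlapping strings, and the merge yields one sorted id space plus a per-source translation table:

    from collections import defaultdict

    sources = {"a": {1: "x", 2: "y"}, "b": {1: "y", 2: "z"}}
    stringmap = defaultdict(dict)
    for tag, rows in sources.items():
        for sid, s in rows.items():
            stringmap[s][tag] = sid
    merged = sorted(stringmap)                  # ['x', 'y', 'z']
    uidmap = {tag: {} for tag in sources}
    for uid, s in enumerate(merged, 1):
        for tag, sid in stringmap[s].items():
            uidmap[tag][sid] = uid
    # uidmap == {'a': {1: 1, 2: 2}, 'b': {1: 2, 2: 3}}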
Example #4
    def merge_urls(self, destdb, srcdbs):

        writer = destdb.cursor()
        write_batch = []

        for tag, sdb in srcdbs.items():
            self.mon.report_status("Merging URLs ({})..."
                                   .format(tag))
            self.mon.maybe_pause_or_stop()

            scur = sdb.cursor()
            omap = self.oidmap[tag]
            umap = self.uidmap[tag]
            for row in url_database.fetch_iter(
                    scur.execute("SELECT * FROM urls")):
                m_origin = omap[row['origin']]
                m_origin_id = row['origin_id']
                m_url = umap[row['url']]
                write_batch.append((m_origin, m_origin_id, m_url))
                self.mon.report_status("Merging URLs ({})... {}.{}"
                                       .format(tag, m_origin, m_origin_id))
                self.mon.maybe_pause_or_stop()

                if len(write_batch) > 10000:
                    self.mon.report_status("Merging URLs ({})... writing"
                                           .format(tag))
                    writer.executemany("INSERT INTO urls VALUES(?,?,?)",
                                       write_batch)
                    write_batch = []
                    destdb.commit()
                    self.mon.maybe_pause_or_stop()

        self.mon.report_status("Merging URLs ({})... writing"
                               .format(tag))
        writer.executemany("INSERT INTO urls VALUES(?,?,?)",
                           write_batch)
        destdb.commit()
        self.tables_processed.add("urls")
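
The accumulate-and-flush pattern above (buffer roughly 10000 rows, then executemany and commit) reappears in merge_anomalies below. A hedged sketch of how it could be factored out; BatchWriter and its parameters are inventions for illustration, not part of the original code:

    class BatchWriter:
        # Hypothetical helper: buffer rows and write them to the
        # destination in large executemany() batches.
        def __init__(self, db, sql, limit=10000):
            self.db = db
            self.sql = sql
            self.limit = limit
            self.batch = []

        def append(self, row):
            self.batch.append(row)
            if len(self.batch) >= self.limit:
                self.flush()

        def flush(self):
            if self.batch:
                self.db.cursor().executemany(self.sql, self.batch)
                self.db.commit()
                self.batch = []

merge_urls could then append each (m_origin, m_origin_id, m_url) tuple and call flush() once after the loops, which would also remove the final status message's reliance on the leaked loop variable.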
Example #5
    def load_database(self):
        self.report_progress("Loading database...")
        self.db = url_database.ensure_database(self.args)

        cr = self.db.cursor()
        # Cache the status table in memory; it's reasonably small.
        self.report_progress("Loading database... (canon statuses)")
        cr.execute("SELECT id, status FROM canon_statuses;")
        self.canon_statuses = { row[1]: row[0]
                                for row in url_database.fetch_iter(cr) }

        if self.args.work_queue:
            self.todo = open(self.args.work_queue, "rb")
        else:
            self.report_progress("Loading database... (work queue)")
            self.todo = tempfile.TemporaryFile("w+b")
            subprocess.check_call(["sqlite3", self.args.database,
                   "SELECT DISTINCT u.url, v.url"
                   "  FROM urls as u"
                   "  LEFT JOIN url_strings as v on u.url = v.id"
                   "  WHERE u.url NOT IN (SELECT url FROM canon_urls)"],
                                  stdout=self.todo)
            self.todo.seek(0)
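
Shelling out to the sqlite3 command-line tool streams the result set to the temporary file without holding it in Python memory. Where the CLI is unavailable, the same queue could be written through the already-open connection; a sketch, assuming the consumer of self.todo expects the CLI's default '|'-separated lines:

    cr = self.db.cursor()
    cr.execute("SELECT DISTINCT u.url, v.url"
               "  FROM urls as u"
               "  LEFT JOIN url_strings as v on u.url = v.id"
               "  WHERE u.url NOT IN (SELECT url FROM canon_urls)")
    for u, v in url_database.fetch_iter(cr):
        # Mimic the sqlite3 CLI's default output: columns joined
        # with '|', one row per line.
        self.todo.write("{}|{}\n".format(u, v).encode("utf-8"))
    self.todo.seek(0)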
Example #6
    def merge_origins(self, destdb, srcdbs):
        # The logic here is essentially the same as for URL strings,
        # but the origins lists are much, much shorter.

        stringmap = defaultdict(dict)
        oidmap = {}

        for tag, sdb in srcdbs.items():
            self.mon.report_status("Reading origin list ({})..."
                                   .format(tag))
            scur = sdb.cursor()
            n = 0
            for row in url_database.fetch_iter(
                    scur.execute("SELECT * FROM origins")):
                stringmap[row['label']][tag] = row['id']
                # Track the highest source id, as in merge_url_strings,
                # in case ids are not contiguous.
                n = max(n, row['id'])
                self.mon.report_status("Reading origin list ({})... {}"
                                       .format(tag, n))
            oidmap[tag] = [None]*(n+1)

        self.mon.report_status("Sorting origin list...")
        merged = sorted(stringmap.keys())
        self.mon.report_status("Writing origin list...")
        destdb.executemany("INSERT INTO origins VALUES(?,?)",
                           enumerate(merged, 1))
        self.mon.report_status("Writing origins (commit)...")
        destdb.commit()
        self.mon.report_status("Constructing oidmap...")
        for oid, label in enumerate(merged, 1):
            smap = stringmap[label]
            for tag in srcdbs.keys():
                sid = smap.get(tag, None)
                if sid is not None:
                    oidmap[tag][sid] = oid

        self.tables_processed.add("origins")
        return oidmap
Example #7
    def merge_anomalies(self, destdb, srcdbs):
        writer = destdb.cursor()
        write_batch = []

        for tag, sdb in srcdbs.items():
            self.mon.report_status("Merging anomalies ({})..."
                                   .format(tag))
            self.mon.maybe_pause_or_stop()

            scur = sdb.cursor()
            umap = self.uidmap[tag]
            cmap = self.cidmap[tag]
            for row in url_database.fetch_iter(
                    scur.execute("SELECT * FROM anomalies")):
                m_url = umap[row['url']]
                m_status = cmap[row['status']]
                write_batch.append((m_url, m_status, row['response']))
                self.mon.report_status("Merging anomalies ({})... {}.{}"
                                       .format(tag, m_url))
                self.mon.maybe_pause_or_stop()

                if len(write_batch) > 10000:
                    self.mon.report_status("Merging anomalies ({})... writing"
                                           .format(tag))
                    writer.executemany("INSERT INTO anomalies VALUES(?,?,?)",
                                       write_batch)
                    write_batch = []
                    destdb.commit()
                    self.mon.maybe_pause_or_stop()

        self.mon.report_status("Merging anomalies ({})... writing"
                               .format(tag))
        writer.executemany("INSERT INTO anomalies VALUES(?,?,?)",
                           write_batch)
        destdb.commit()
        self.tables_processed.add("anomalies")
Example #8
    def merge_canon_statuses(self, destdb, srcdbs):
        # Same string-merge logic as merge_origins, applied to the
        # (equally short) canon status table.

        stringmap = defaultdict(dict)
        cidmap = {}

        for tag, sdb in srcdbs.items():
            self.mon.report_status("Reading canon status list ({})..."
                                   .format(tag))
            scur = sdb.cursor()
            n = 0
            for row in url_database.fetch_iter(
                    scur.execute("SELECT * FROM canon_statuses")):
                stringmap[row['label']][tag] = row['id']
                # Track the highest source id, as in merge_url_strings,
                # in case ids are not contiguous.
                n = max(n, row['id'])
                self.mon.report_status("Reading canon status list ({})... {}"
                                       .format(tag, n))
            cidmap[tag] = [None]*(n+1)

        self.mon.report_status("Sorting canon status list...")
        merged = sorted(stringmap.keys())
        self.mon.report_status("Writing canon status list...")
        destdb.executemany("INSERT INTO canon_statuses VALUES(?,?)",
                           enumerate(merged, 1))
        self.mon.report_status("Writing canon_statuses (commit)...")
        destdb.commit()
        self.mon.report_status("Constructing cidmap...")
        for cid, label in enumerate(merged, 1):
            smap = stringmap[label]
            for tag in srcdbs.keys():
                sid = smap.get(tag, None)
                if sid is not None:
                    cidmap[tag][sid] = cid

        self.tables_processed.add("canon_statuses")
        return cidmap