def handle_complete_batch(self, cmd, serial, locale, raw_results):
    with self.db, self.db.cursor() as cr:
        results = []
        finished_urls = set()
        for r in raw_results:
            (url_id, surl) = url_database.add_url_string(cr, r['ourl'])

            redir_url_id = None
            if 'canon' in r:
                redir_url = r['canon']
                if redir_url == surl or redir_url == r['ourl']:
                    redir_url_id = url_id
                elif redir_url is not None:
                    (redir_url_id, _) = \
                        url_database.add_url_string(cr, r['canon'])

            detail_id = self.canon_statuses.get(r['detail'])
            if detail_id is None and r['detail'] is not None:
                cr.execute("INSERT INTO canon_statuses(id, detail) "
                           "  VALUES(DEFAULT, %s)"
                           "  RETURNING id",
                           (r['detail'],))
                detail_id = cr.fetchone()[0]
                self.canon_statuses[r['detail']] = detail_id

            (_, result) = url_database.categorize_result(r['status'],
                                                         url_id,
                                                         redir_url_id)

            results.append({
                "locale": locale,
                "url": url_id,
                "result": result,
                "detail": detail_id,
                "redir_url": redir_url_id,
                "html_content": r.get('content'),
                "screenshot": r.get('render')
            })
            finished_urls.add(url_id)

        cr.executemany("UPDATE captured_urls "
                       "   SET access_time = TIMESTAMP 'now',"
                       "       result = %(result)s,"
                       "       detail = %(detail)s,"
                       "       redir_url = %(redir_url)s,"
                       "       html_content = %(html_content)s,"
                       "       screenshot = %(screenshot)s "
                       " WHERE locale = %(locale)s "
                       "   AND url = %(url)s",
                       results)

        self.processing[locale] -= finished_urls

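# Illustrative sketch (not from the source): the shape of one entry in
# `raw_results` as implied by the keys handle_complete_batch reads.  Only the
# key names are grounded in the code above; the example values are invented.
example_raw_result = {
    "ourl":    "http://example.com/",        # URL as originally submitted
    "canon":   "https://www.example.com/",   # redirect target, may be absent
    "status":  "ok",                         # capture status string
    "detail":  None,                         # detail text, interned via canon_statuses
    "content": "<html>...</html>",           # page HTML, optional
    "render":  b"...",                       # screenshot bytes, optional
}
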
def record_canonized(self, result):
    try:
        self.processed += 1
        cr = self.db.cursor()
        status_id = self.canon_statuses.get(result.status)
        if status_id is None:
            cr.execute("INSERT INTO canon_statuses VALUES(NULL, ?)",
                       (result.status,))
            status_id = cr.lastrowid
            self.canon_statuses[result.status] = status_id

        if result.anomaly is not None:
            cr.execute("INSERT INTO anomalies VALUES(?, ?, ?)",
                       (result.original_uid, status_id, result.anomaly))
            self.anomalies += 1

        if result.canon_url is None:
            canon_id = None
            self.failures += 1
            self.report_result(result, result.status)
        else:
            (canon_id, curl) = \
                url_database.add_url_string(cr, result.canon_url)
            self.successes += 1
            self.report_result(result, curl)

        cr.execute("INSERT INTO canon_urls VALUES (?, ?, ?)",
                   (result.original_uid, canon_id, status_id))

        if self.processed % 1000 == 0:
            self.db.commit()

    except Exception as e:
        raise type(e)("Bogus result: "
                      "{{ status: {!r} canon: {!r} anomaly: {!r} }}"
                      .format(result.status, result.canon_url,
                              result.anomaly)) from e

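# Minimal sketch, assuming a simple record type: record_canonized only touches
# the four attributes accessed above.  The field names come from the code; the
# namedtuple container and the sample values are assumptions for illustration.
import collections

CanonResult = collections.namedtuple(
    "CanonResult", ["original_uid", "status", "canon_url", "anomaly"])

ok_result = CanonResult(original_uid=42, status="ok",
                        canon_url="https://example.com/", anomaly=None)
bad_result = CanonResult(original_uid=43, status="dns error",
                         canon_url=None, anomaly=None)
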
def process_urls(db, rd):
    batch = []
    for row in rd:
        (uid, _) = url_database.add_url_string(db, row['url'])
        batch.append((uid, row['result'], row['locales']))

    with db, db.cursor() as cur:
        batch_str = b",".join(cur.mogrify("(%s,%s,%s)", row)
                              for row in batch)
        cur.execute(b"INSERT INTO urls_rescan (url, result, locales) "
                    b"VALUES " + batch_str)

    db.commit()

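# Usage sketch.  Assumptions: a psycopg2-style connection `db` and a CSV file
# with `url`, `result`, and `locales` columns; neither the filename nor the
# wrapper function below appears in the source.
import csv

def rescan_from_csv(db, path):
    # Feed each CSV row to process_urls, which batches the INSERT itself.
    with open(path, newline="") as fp:
        process_urls(db, csv.DictReader(fp))
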
def load_urls(self, db, fp):
    to_insert = set()
    uname = self.args.user
    sys.stderr.write("Importing {} ({})... /"
                     .format(uname, self.args.file))
    sys.stderr.flush()
    spinner = "/-\\|"
    c = 0
    with db, db.cursor() as cur:
        for entry in json.load(fp):
            try:
                url = url_database.add_url_string(cur, entry['href'])[0]
                atime = entry['time']
                title = entry['description']
                annot = entry['extended']
                tags = entry['tags']
            except Exception as e:
                sys.stderr.write("\nInvalid entry: {}\n"
                                 .format(json.dumps(entry)))
                for l in traceback.format_exception_only(type(e), e):
                    sys.stderr.write(l)
                continue  # skip entries that could not be parsed

            to_insert.add(cur.mogrify("(%s,%s,TIMESTAMP %s,%s,%s,%s)",
                                      (uname, url, atime, title,
                                       annot, tags)))

            sys.stderr.write("\b" + spinner[c % 4])
            sys.stderr.flush()
            c += 1

        sys.stderr.write(" (insert)")
        sys.stderr.flush()
        cur.execute(b"INSERT INTO urls_pinboard"
                    b"(username, url, access_time, title, annotation, tags)"
                    b"VALUES" + b",".join(sorted(to_insert)))

        sys.stderr.write(" (commit)")
        sys.stderr.flush()

    sys.stderr.write("\n")

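# Illustrative sketch: one entry of the Pinboard JSON export as implied by the
# fields load_urls reads.  Key names are taken from the code above; the values
# are invented.
example_pinboard_entry = {
    "href":        "http://example.com/article",
    "time":        "2014-01-01T12:00:00Z",
    "description": "Example article",
    "extended":    "Longer annotation text",
    "tags":        "example reading",
}
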
def record_canonized(self, result):
    try:
        self.processed += 1
        cr = self.db.cursor()
        status_id = self.canon_statuses.get(result.status)
        if status_id is None:
            cr.execute("INSERT INTO canon_statuses VALUES(NULL, ?)",
                       (result.status,))
            status_id = cr.lastrowid
            self.canon_statuses[result.status] = status_id

        if result.anomaly is not None:
            cr.execute("INSERT INTO anomalies VALUES(?, ?, ?)",
                       (result.original_uid, status_id,
                        json.dumps(result.anomaly)))
            self.anomalies += 1

        if result.canon_url is None:
            canon_id = None
            self.failures += 1
            self.report_result(result, result.status)
        else:
            (canon_id, curl) = \
                url_database.add_url_string(cr, result.canon_url)
            self.successes += 1
            self.report_result(result, curl)

        cr.execute("INSERT OR REPLACE INTO canon_urls VALUES (?, ?, ?)",
                   (result.original_uid, canon_id, status_id))

        if self.processed % 1000 == 0:
            self.db.commit()

    except Exception as e:
        self.anomalies += 1
        self.report_result(result, "bogus")
        self.bogus_results.write("{}\n".format(json.dumps({
            "exception": repr(e),
            "canon": result.canon_url,
            "status": result.status,
            "anomaly": result.anomaly
        })))

def load_urls(self, db, fp):
    to_insert = set()
    uname = self.args.user
    sys.stderr.write("Importing {} ({})... /"
                     .format(uname, self.args.file))
    sys.stderr.flush()
    spinner = "/-\\|"
    c = 0
    with db, db.cursor() as cur:
        for entry in json.load(fp):
            try:
                url = url_database.add_url_string(cur, entry['href'])[0]
                atime = entry['time']
                title = entry['description']
                annot = entry['extended']
                tags = entry['tags']
            except Exception as e:
                sys.stderr.write("\nInvalid entry: {}\n"
                                 .format(json.dumps(entry)))
                for l in traceback.format_exception_only(type(e), e):
                    sys.stderr.write(l)
                continue  # skip entries that could not be parsed

            to_insert.add(cur.mogrify("(%s,%s,TIMESTAMP %s,%s,%s,%s)",
                                      (uname, url, atime, title,
                                       annot, tags)))

            sys.stderr.write("\b" + spinner[c % 4])
            sys.stderr.flush()
            c += 1

        sys.stderr.write(" (insert)")
        sys.stderr.flush()
        cur.execute(b"INSERT INTO urls_pinboard"
                    b"(username, url, access_time, title, annotation, tags)"
                    b"VALUES" + b",".join(sorted(to_insert)))

        sys.stderr.write(" (commit)")
        sys.stderr.flush()

    sys.stderr.write("\n")

def process_one_import(self, cur, datestamp, country_code, reader):
    sys.stderr.write("Importing {}...".format(country_code))
    sys.stderr.flush()

    values = []
    for row in reader:
        category_code = row["category_code"]
        uid, url = url_database.add_url_string(cur, row["url"])
        values.append(cur.mogrify("(%s,%s,%s,%s)",
                                  (uid, country_code, category_code,
                                   datestamp)))

    if not values:
        return

    sys.stderr.write(" (insert)")
    sys.stderr.flush()
    cur.execute(
        b"INSERT INTO urls_citizenlab "
        b"(url, country, category, retrieval_date) "
        b"VALUES " + b",".join(values)
    )

    sys.stderr.write(" (commit)")
    sys.stderr.flush()

def process_one_import(self, cur, datestamp, country_code, reader):
    sys.stderr.write("Importing {}...".format(country_code))
    sys.stderr.flush()

    values = []
    for row in reader:
        category_code = row['category_code']
        uid, url = url_database.add_url_string(cur, row['url'])
        values.append(cur.mogrify("(%s,%s,%s,%s)",
                                  (uid, country_code, category_code,
                                   datestamp)))

    if not values:
        return

    sys.stderr.write(" (insert)")
    sys.stderr.flush()
    cur.execute(b"INSERT INTO urls_citizenlab "
                b"(url, country, category, retrieval_date) "
                b"VALUES " + b",".join(values))

    sys.stderr.write(" (commit)")
    sys.stderr.flush()

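# Usage sketch.  Assumptions: the surrounding importer object, the CSV path,
# and today's date as the retrieval datestamp are all hypothetical; only the
# `url` and `category_code` columns are taken from the code above.
import csv
import datetime

def import_citizenlab_csv(importer, cur, path, country_code):
    datestamp = datetime.date.today()
    with open(path, newline="", encoding="utf-8") as fp:
        importer.process_one_import(cur, datestamp, country_code,
                                    csv.DictReader(fp))
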
def __call__(self, mon, thr):
    db, start_date, end_date = self.prepare_database()
    self.db = db
    cur = db.cursor()

    pageq = queue.Queue()
    mon.add_work_thread(HerdictReader(pageq, start_date, end_date))

    n_accessible = 0
    n_inaccessible = 0
    n_total = 0
    lo_timestamp = time.time() + 86400
    hi_timestamp = 0

    while True:
        page = pageq.get()
        if not page:
            break

        batch = []
        for row in page:
            if ("url" not in row or
                    "reportDate" not in row or
                    "reportType" not in row):
                continue

            timestamp = (datetime.datetime
                         .strptime(row["reportDate"],
                                   "%Y-%m-%dT%H:%M:%S %z")
                         .timestamp())
            lo_timestamp = min(timestamp, lo_timestamp)
            hi_timestamp = max(timestamp, hi_timestamp)

            url = row["url"]
            if "/" not in url:
                url = url + "/"
            if "protocol" not in row:
                url = "HTTP://" + url
            else:
                url = row["protocol"] + "://" + url

            # Herdict reports have several more keys than this,
            # but none of them appear to be terribly trustworthy.
            accessible = (row["reportType"] != "INACCESSIBLE")
            if "country" in row and "shortName" in row["country"]:
                country = row["country"]["shortName"]
            else:
                country = "??"

            (uid, url) = url_database.add_url_string(cur, url)
            batch.append((uid, timestamp, accessible, country))

            n_total += 1
            if accessible:
                n_accessible += 1
            else:
                n_inaccessible += 1

            mon.report_status("Processed {} URLs; "
                              "{} accessible, {} inaccessible"
                              .format(n_total, n_accessible,
                                      n_inaccessible))

        mon.report_status("Processed {} URLs; "
                          "{} accessible, {} inaccessible; checkpointing"
                          .format(n_total, n_accessible, n_inaccessible))
        cur.execute(b"INSERT INTO urls_herdict "
                    b"(url, \"timestamp\", accessible, country) VALUES " +
                    b",".join(cur.mogrify("(%s,%s,%s,%s)", row)
                              for row in batch))
        db.commit()
        mon.maybe_pause_or_stop()

    mon.report_status("Flushing duplicates...")
    # The urls_herdict table doesn't have any uniquifier.
    # Flush any duplicate rows that may have occurred.
    cur.execute(
        'DELETE FROM urls_herdict WHERE ctid IN (SELECT ctid FROM ('
        '  SELECT ctid, row_number() OVER ('
        '    PARTITION BY url,"timestamp",accessible,country'
        '    ORDER BY ctid) AS rnum FROM urls_herdict) t'
        '  WHERE t.rnum > 1)')
    db.commit()

    mon.report_status("Adding URLs to be canonicalized...")
    cur.execute("INSERT INTO canon_urls (url) "
                "  SELECT DISTINCT url FROM urls_herdict"
                "  EXCEPT SELECT url FROM canon_urls")
    db.commit()

    self.summary = (lo_timestamp, hi_timestamp,
                    n_total, n_accessible, n_inaccessible)

def __call__(self, mon, thr):
    db, start_date, end_date = self.prepare_database()
    self.db = db
    cur = db.cursor()

    pageq = queue.Queue()
    mon.add_work_thread(HerdictReader(pageq, start_date, end_date))

    n_accessible = 0
    n_inaccessible = 0
    n_total = 0
    lo_timestamp = time.time() + 86400
    hi_timestamp = 0

    while True:
        page = pageq.get()
        if not page:
            break

        batch = []
        for row in page:
            if ("url" not in row or
                    "reportDate" not in row or
                    "reportType" not in row):
                continue

            timestamp = (datetime.datetime
                         .strptime(row["reportDate"],
                                   "%Y-%m-%dT%H:%M:%S %z")
                         .timestamp())
            lo_timestamp = min(timestamp, lo_timestamp)
            hi_timestamp = max(timestamp, hi_timestamp)

            url = row["url"]
            if "/" not in url:
                url = url + "/"
            if "protocol" not in row:
                url = "HTTP://" + url
            else:
                url = row["protocol"] + "://" + url

            # Herdict reports have several more keys than this,
            # but none of them appear to be terribly trustworthy.
            accessible = (row["reportType"] != "INACCESSIBLE")
            if "country" in row and "shortName" in row["country"]:
                country = row["country"]["shortName"]
            else:
                country = "??"

            (uid, url) = url_database.add_url_string(cur, url)
            batch.append((uid, timestamp, accessible, country))

            n_total += 1
            if accessible:
                n_accessible += 1
            else:
                n_inaccessible += 1

            mon.report_status("Processed {} URLs; "
                              "{} accessible, {} inaccessible"
                              .format(n_total, n_accessible,
                                      n_inaccessible))

        mon.report_status("Processed {} URLs; "
                          "{} accessible, {} inaccessible; checkpointing"
                          .format(n_total, n_accessible, n_inaccessible))
        cur.execute(b"INSERT INTO urls_herdict "
                    b"(url, \"timestamp\", accessible, country) VALUES " +
                    b",".join(cur.mogrify("(%s,%s,%s,%s)", row)
                              for row in batch))
        db.commit()
        mon.maybe_pause_or_stop()

    self.summary = (lo_timestamp, hi_timestamp,
                    n_total, n_accessible, n_inaccessible)

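# Illustrative sketch: one Herdict report as implied by the keys the loops
# above inspect.  Only the key names and the strptime format are grounded in
# the code; the values, and the convention that HerdictReader signals
# end-of-input by queueing a falsy page (`if not page: break`), are inferred.
example_herdict_report = {
    "url":        "example.com/blocked/page",
    "protocol":   "http",                       # optional; "HTTP://" assumed if absent
    "reportDate": "2014-06-01T12:34:56 +0000",  # parsed with "%Y-%m-%dT%H:%M:%S %z"
    "reportType": "INACCESSIBLE",               # anything else counts as accessible
    "country":    {"shortName": "TR"},          # optional; "??" used if absent
}
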
def load_urls(self, db, fp):
    to_insert = set()
    sys.stderr.write("Importing {}...".format(self.source_label))
    sys.stderr.flush()

    with db, db.cursor() as cur:
        for line in fp:
            line = line.strip()
            self.lineno += 1
            if line == "" or line[0] == "#":
                continue

            if self._has_scheme.match(line):
                if (not line.startswith("http://") and
                        not line.startswith("https://")):
                    sys.stderr.write("{}:{}: non-HTTP(S) URL: {!r}\n"
                                     .format(self.args.file,
                                             self.lineno, line))
                    self.delayed_failure = True
                    continue

                try:
                    (url_id, _) = url_database.add_url_string(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n"
                                     .format(self.args.file,
                                             self.lineno, str(e)))
                    self.delayed_failure = True
                    continue

                to_insert.add(cur.mogrify("(%s, %s)",
                                          (url_id, self.import_id)))

            else:
                try:
                    urls = url_database.add_site(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n"
                                     .format(self.args.file,
                                             self.lineno, str(e)))
                    self.delayed_failure = True
                    continue

                for pair in urls:
                    to_insert.add(cur.mogrify("(%s, %s)",
                                              (pair[0], self.import_id)))

        if self.delayed_failure:
            raise SystemExit(1)

        sys.stderr.write(" (insert)")
        sys.stderr.flush()
        cur.execute(b"INSERT INTO urls_staticlist "
                    b"(url, listid) VALUES " +
                    b",".join(sorted(to_insert)))

        sys.stderr.write(" (commit)")
        sys.stderr.flush()

    sys.stderr.write("\n")

def load_urls(self, db, fp):
    to_insert = set()
    sys.stderr.write("Importing {}...".format(self.source_label))
    sys.stderr.flush()

    with db, db.cursor() as cur:
        for line in fp:
            line = line.strip()
            self.lineno += 1
            if line == "" or line[0] == "#":
                continue

            if self._has_scheme.match(line):
                if (not line.startswith("http://") and
                        not line.startswith("https://")):
                    sys.stderr.write(
                        "{}:{}: non-HTTP(S) URL: {!r}\n".format(
                            self.args.file, self.lineno, line))
                    self.delayed_failure = True
                    continue

                try:
                    (url_id, _) = url_database.add_url_string(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n".format(
                        self.args.file, self.lineno, str(e)))
                    self.delayed_failure = True
                    continue

                to_insert.add(
                    cur.mogrify("(%s, %s)", (url_id, self.import_id)))

            else:
                try:
                    urls = url_database.add_site(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n".format(
                        self.args.file, self.lineno, str(e)))
                    self.delayed_failure = True
                    continue

                for pair in urls:
                    to_insert.add(
                        cur.mogrify("(%s, %s)", (pair[0], self.import_id)))

        if self.delayed_failure:
            raise SystemExit(1)

        sys.stderr.write(" (insert)")
        sys.stderr.flush()
        cur.execute(b"INSERT INTO urls_staticlist "
                    b"(url, listid) VALUES " +
                    b",".join(sorted(to_insert)))

        sys.stderr.write(" (commit)")
        sys.stderr.flush()

    sys.stderr.write("\n")

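# The two loaders above rely on a `_has_scheme` class attribute that is not
# shown here.  A plausible definition, judging from how it is used (lines with
# a URL scheme go through add_url_string, bare site names through add_site),
# might look like the following; treat it as an assumption, not the original.
import re

_has_scheme = re.compile(r"^[A-Za-z][A-Za-z0-9+.-]*:")
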
def record_batch(self, loc, successes, failures):
    locale = loc.locale
    loc.n_workers -= 1

    for r in failures:
        loc.in_progress.remove(r[0])

    if not successes:
        return

    with self.db, self.db.cursor() as cr:
        for s in successes:
            url_id = s[0]
            r = s[1]
            loc.in_progress.remove(url_id)

            redir_url = None
            redir_url_id = None
            if r['canon']:
                redir_url = r['canon']
                if redir_url == r['ourl']:
                    redir_url_id = url_id
                elif redir_url is not None:
                    try:
                        (redir_url_id, _) = \
                            url_database.add_url_string(cr, redir_url)
                    except (ValueError, UnicodeError):
                        addendum = "invalid redir url: " + redir_url
                        if 'detail' not in r or r['detail'] is None:
                            r['detail'] = addendum
                        else:
                            r['detail'] += " | " + addendum

            detail_id = self.capture_detail.get(r['detail'])
            if detail_id is None:
                cr.execute("INSERT INTO capture_detail(id, detail) "
                           "  VALUES(DEFAULT, %s)"
                           "  RETURNING id",
                           (r['detail'],))
                detail_id = cr.fetchone()[0]
                self.capture_detail[r['detail']] = detail_id

            result = url_database.categorize_result(r['status'],
                                                    r['detail'],
                                                    url_id,
                                                    redir_url_id)

            to_insert = {
                "locale": locale,
                "url": url_id,
                "result": result,
                "detail": detail_id,
                "redir_url": redir_url_id,
                "log": r['log'],
                "html_content": r['content'],
                "screenshot": r['render']
            }
            cr.execute("INSERT INTO captured_pages"
                       "(locale, url, access_time, result, detail,"
                       " redir_url, capture_log, html_content,"
                       " screenshot) "
                       "VALUES ("
                       "  %(locale)s,"
                       "  %(url)s,"
                       "  TIMESTAMP 'now',"
                       "  %(result)s,"
                       "  %(detail)s,"
                       "  %(redir_url)s,"
                       "  %(log)s,"
                       "  %(html_content)s,"
                       "  %(screenshot)s)",
                       to_insert)
            cr.execute('UPDATE capture_progress SET "l_{0}" = TRUE '
                       '  WHERE url = {1}'.format(locale, url_id))
            loc.todo -= 1

def add_urls_from_site(cur, site, ordinal, oid, already_seen):
    # Subroutine of process_sitelist.
    #
    # Alexa's "site" list has two different kinds of
    # addresses on it: with and without a URL path.
    # Also, most but not all of the sites are second-level
    # domains: any third-level piece (such as "www.") has
    # been stripped.  In no case is there a scheme; in
    # particular we have no idea whether the site prefers
    # http: or https:.  So we expand each entry to four:
    #
    #   http://       site
    #   https://      site
    #   http://  www. site
    #   https:// www. site
    #
    # If there was a path, we include all of the above
    # both with and without the path.  This scheme won't
    # do us any good if the actual content people are
    # loading is neither at the name in the list nor at
    # www. the name in the list; for instance,
    # akamaihd.net is site #68, but neither akamaihd.net
    # nor www.akamaihd.net has any A records, because,
    # being a CDN, all of the actual content is on servers
    # named SOMETHINGELSE.akamaihd.net, and you're not
    # expected to notice that the domain even exists.
    # But there's nothing we can do about that.
    #
    # Because the database schema requires the ordinal+oid to be
    # unique, we shift the ordinal left three bits to make room
    # for a prefix index and an indication of whether or not there
    # was a path component.
    #
    # It does not make sense to prepend 'www.' if 'site' already
    # starts with 'www.' or if it is an IP address.
    parsed = url_database.canon_url_syntax(
        urllib.parse.urlsplit("http://" + site))

    assert parsed.path != ""
    if parsed.path != "/":
        root = to_siteroot(parsed)
        need_path = True
    else:
        root = parsed
        need_path = False

    urls = [(0, root.geturl()),
            (1, to_https(root).geturl())]

    host = root.hostname
    if no_www_re.match(host):
        need_www = False
    else:
        need_www = True
        with_www = add_www(root)
        urls.extend([(2, with_www.geturl()),
                     (3, to_https(with_www).geturl())])

    if need_path:
        urls.extend([(4, parsed.geturl()),
                     (5, to_https(parsed).geturl())])
        if need_www:
            with_www = add_www(parsed)
            urls.extend([(6, with_www.geturl()),
                         (7, to_https(with_www).geturl())])

    ordinal = int(ordinal) * 8

    nnew = 0
    for tag, url in urls:
        (uid, url) = url_database.add_url_string(cur, url)
        if url in already_seen:
            continue
        already_seen.add(url)
        # We want to add an url-table entry for this URL even if it's
        # already there from some other source; we only drop them if
        # they are redundant within this data set.  However, in case
        # the database-loading operation got interrupted midway,
        # do an INSERT OR IGNORE.
        cur.execute("INSERT OR IGNORE INTO urls VALUES(?, ?, ?)",
                    (oid, ordinal + tag, uid))
        nnew += 1

    return nnew

def add_urls_from_site(cur, site, rank, datestamp, batch, already_seen):
    # Subroutine of process_sitelist.
    #
    # Alexa's "site" list has two different kinds of
    # addresses on it: with and without a URL path.
    # Also, most but not all of the sites are second-level
    # domains: any third-level piece (such as "www.") has
    # been stripped.  In no case is there a scheme; in
    # particular we have no idea whether the site prefers
    # http: or https:.  So we expand each entry to four:
    #
    #   http://       site
    #   https://      site
    #   http://  www. site
    #   https:// www. site
    #
    # If there was a path, we include all of the above
    # both with and without the path.  This scheme won't
    # do us any good if the actual content people are
    # loading is neither at the name in the list nor at
    # www. the name in the list; for instance,
    # akamaihd.net is site #68, but neither akamaihd.net
    # nor www.akamaihd.net has any A records, because,
    # being a CDN, all of the actual content is on servers
    # named SOMETHINGELSE.akamaihd.net, and you're not
    # expected to notice that the domain even exists.
    # But there's nothing we can do about that.
    #
    # It does not make sense to prepend 'www.' if 'site' already
    # starts with 'www.' or if it is an IP address.
    parsed = url_database.canon_url_syntax("http://" + site,
                                           want_splitresult=True)

    assert parsed.path != ""
    if parsed.path != "/":
        root = to_siteroot(parsed)
        need_path = True
    else:
        root = parsed
        need_path = False

    urls = [root.geturl(),
            to_https(root).geturl()]

    host = root.hostname
    if no_www_re.match(host):
        need_www = False
    else:
        need_www = True
        with_www = add_www(root)
        urls.extend([with_www.geturl(),
                     to_https(with_www).geturl()])

    if need_path:
        urls.extend([parsed.geturl(),
                     to_https(parsed).geturl()])
        if need_www:
            with_www = add_www(parsed)
            urls.extend([with_www.geturl(),
                         to_https(with_www).geturl()])

    for url in urls:
        (uid, url) = url_database.add_url_string(cur, url)
        if url in already_seen:
            continue
        batch.append((uid, rank, datestamp))
        already_seen.add(url)

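# The two variants above reference helpers that are not shown: to_siteroot,
# to_https, add_www, and no_www_re.  The sketches below are assumptions based
# only on how they are used (all operate on urllib.parse.SplitResult values);
# they are not the original implementations.
import re
import urllib.parse

# Hostnames that should not get a "www." prefix: names that already start
# with "www.", dotted-quad IPv4 addresses, and bracketed IPv6 literals.
no_www_re = re.compile(r"^(?:www\.|\d{1,3}(?:\.\d{1,3}){3}$|\[)",
                       re.IGNORECASE)

def to_siteroot(parsed):
    # Drop path, query, and fragment, keeping only scheme://host/.
    return parsed._replace(path="/", query="", fragment="")

def to_https(parsed):
    # Same URL with the scheme forced to https.
    return parsed._replace(scheme="https")

def add_www(parsed):
    # Prepend "www." to the hostname, preserving any explicit port.
    netloc = "www." + parsed.hostname
    if parsed.port is not None:
        netloc += ":{}".format(parsed.port)
    return parsed._replace(netloc=netloc)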