Exemple #1
0
 def add_urls_from_site(self, cur, site, rank, datestamp, batch,
                        already_seen):
     """Queue a (uid, rank, datestamp) row for each new URL of `site`.

     Calls url_database.add_site with the http_only and www_only
     settings from self.args, and appends one tuple to `batch` for
     every returned (uid, url) pair whose url is not yet in
     `already_seen`.  Each url that gets queued is then added to
     `already_seen`, so later occurrences are skipped.
     """
     site_urls = url_database.add_site(cur, site,
                                       self.args.http_only,
                                       self.args.www_only)
     for uid, url in site_urls:
         if url not in already_seen:
             batch.append((uid, rank, datestamp))
             already_seen.add(url)
Exemple #2
0
 def add_urls_from_site(self, cur, site, rank, datestamp, batch,
                        already_seen):
     """Append (uid, rank, datestamp) to `batch` for each unseen URL.

     The (uid, url) pairs come from url_database.add_site, called with
     self.args.http_only and self.args.www_only.  A url already present
     in `already_seen` contributes nothing; any other url is recorded
     in `already_seen` right after its row is appended.
     """
     candidates = url_database.add_site(
         cur, site, self.args.http_only, self.args.www_only)
     for entry in candidates:
         (uid, url) = entry
         is_new = url not in already_seen
         if is_new:
             row = (uid, rank, datestamp)
             batch.append(row)
             already_seen.add(url)
Exemple #3
0
    def load_urls(self, db, fp):
        """Import the entries listed in `fp` into the urls_staticlist table.

        Each line of `fp` is stripped; blank lines and lines starting
        with "#" are skipped.  A line matched by self._has_scheme is
        treated as a full URL: it must start with "http://" or
        "https://" and is registered via url_database.add_url_string.
        Any other line is passed to url_database.add_site, and the
        first element of every pair it returns is used as the URL id.

        Problems with individual lines are reported on stderr as
        "<file>:<lineno>: <message>" and set self.delayed_failure;
        processing continues so that every bad line is reported.  Once
        all lines are read, SystemExit(1) is raised if
        self.delayed_failure is set, before anything is inserted into
        urls_staticlist.

        `db` is used as a context manager and must provide cursor();
        the cursor must provide mogrify() and execute()
        (NOTE(review): this looks like a psycopg2 connection -- confirm).
        """

        def report_failure(message):
            # Record the problem but keep going; the SystemExit is
            # deferred until the whole file has been read.
            sys.stderr.write("{}:{}: {}\n"
                             .format(self.args.file, self.lineno, message))
            self.delayed_failure = True

        to_insert = set()

        sys.stderr.write("Importing {}...".format(self.source_label))
        sys.stderr.flush()

        with db, db.cursor() as cur:
            for line in fp:
                line = line.strip()
                self.lineno += 1

                if not line or line.startswith("#"):
                    continue

                if self._has_scheme.match(line):
                    if not line.startswith(("http://", "https://")):
                        report_failure("non-HTTP(S) URL: {!r}".format(line))
                        continue

                    try:
                        (url_id, _) = url_database.add_url_string(cur, line)

                    except Exception as e:
                        report_failure(str(e))
                        continue

                    to_insert.add(cur.mogrify("(%s, %s)",
                                              (url_id, self.import_id)))

                else:
                    try:
                        urls = url_database.add_site(cur, line)

                    except Exception as e:
                        report_failure(str(e))
                        continue

                    for pair in urls:
                        to_insert.add(cur.mogrify("(%s, %s)",
                                                  (pair[0], self.import_id)))

            if self.delayed_failure:
                raise SystemExit(1)

            sys.stderr.write(" (insert)")
            sys.stderr.flush()
            # With no rows (empty or comment-only input) the statement
            # would end in a bare "VALUES", which is not valid SQL, so
            # the INSERT is skipped entirely in that case.
            if to_insert:
                cur.execute(b"INSERT INTO urls_staticlist "
                            b"(url, listid) VALUES "
                            + b",".join(sorted(to_insert)))

            sys.stderr.write(" (commit)")
            sys.stderr.flush()
        sys.stderr.write("\n")
Exemple #4
0
    def load_urls(self, db, fp):
        """Read URLs and site names from `fp` and add them to urls_staticlist.

        Lines are stripped before use.  Empty lines and lines beginning
        with "#" are ignored.  If self._has_scheme matches a line, the
        line is handled as a URL (only "http://" and "https://" are
        accepted) and registered with url_database.add_url_string;
        otherwise the line is handed to url_database.add_site and the
        first element of each returned pair is taken as a URL id.
        Every id is paired with self.import_id, and the distinct pairs
        are inserted with a single INSERT statement.

        A line that cannot be processed is reported on stderr with the
        file name and line number, and self.delayed_failure is set.
        After the whole file has been read, SystemExit(1) is raised if
        self.delayed_failure is set, so nothing is inserted into
        urls_staticlist when any line failed.

        `db` must work as a context manager and offer cursor(); the
        cursor must offer mogrify() and execute()
        (NOTE(review): presumably a psycopg2 connection -- confirm).
        """
        accepted_schemes = ("http://", "https://")
        failure_format = "{}:{}: {}\n"
        to_insert = set()

        sys.stderr.write("Importing {}...".format(self.source_label))
        sys.stderr.flush()

        with db, db.cursor() as cur:
            for line in fp:
                line = line.strip()
                self.lineno += 1

                if not line or line.startswith("#"):
                    continue

                if self._has_scheme.match(line):
                    if not line.startswith(accepted_schemes):
                        sys.stderr.write(
                            "{}:{}: non-HTTP(S) URL: {!r}\n".format(
                                self.args.file, self.lineno, line))
                        # Keep reading so every bad line gets reported.
                        self.delayed_failure = True
                        continue

                    try:
                        (url_id, _) = url_database.add_url_string(cur, line)

                    except Exception as e:
                        sys.stderr.write(failure_format.format(
                            self.args.file, self.lineno, str(e)))
                        self.delayed_failure = True
                        continue

                    to_insert.add(
                        cur.mogrify("(%s, %s)", (url_id, self.import_id)))

                else:
                    try:
                        urls = url_database.add_site(cur, line)

                    except Exception as e:
                        sys.stderr.write(failure_format.format(
                            self.args.file, self.lineno, str(e)))
                        self.delayed_failure = True
                        continue

                    for pair in urls:
                        to_insert.add(
                            cur.mogrify("(%s, %s)", (pair[0], self.import_id)))

            if self.delayed_failure:
                raise SystemExit(1)

            sys.stderr.write(" (insert)")
            sys.stderr.flush()
            # An input with no usable lines leaves to_insert empty; the
            # statement would then end in "VALUES" with no row, which is
            # a syntax error, so only run the INSERT when there are rows.
            if to_insert:
                cur.execute(b"INSERT INTO urls_staticlist "
                            b"(url, listid) VALUES " +
                            b",".join(sorted(to_insert)))

            sys.stderr.write(" (commit)")
            sys.stderr.flush()
        sys.stderr.write("\n")