def add_urls_from_site(self, cur, site, rank, datestamp, batch, already_seen):
    """Expand *site* into its URL variants and queue the new ones.

    Each variant produced by ``url_database.add_site`` (filtered by the
    ``--http-only`` / ``--www-only`` options) is appended to *batch* as a
    ``(uid, rank, datestamp)`` tuple, unless its URL text is already in
    *already_seen*.  Both *batch* and *already_seen* are mutated in place.
    """
    variants = url_database.add_site(cur, site,
                                     self.args.http_only,
                                     self.args.www_only)
    for record_id, url_text in variants:
        if url_text not in already_seen:
            # Record first, then remember — each URL is queued at most once.
            already_seen.add(url_text)
            batch.append((record_id, rank, datestamp))
def add_urls_from_site(self, cur, site, rank, datestamp, batch, already_seen):
    """Queue ranking rows for every not-yet-seen URL variant of *site*.

    ``url_database.add_site`` yields ``(uid, url)`` pairs for *site*,
    honouring the ``http_only`` / ``www_only`` command-line switches.
    For each pair whose URL has not been seen before, a
    ``(uid, rank, datestamp)`` tuple is appended to *batch* and the URL
    is added to *already_seen*.
    """
    pairs = url_database.add_site(
        cur, site, self.args.http_only, self.args.www_only)
    for uid, url in pairs:
        if url in already_seen:
            continue  # duplicate variant — already queued earlier
        batch.append((uid, rank, datestamp))
        already_seen.add(url)
def load_urls(self, db, fp):
    """Import URLs from the open file *fp* into the static-list tables.

    Each non-blank, non-``#``-comment line of *fp* is either a full URL
    (must be http/https) or a bare site name (expanded via
    ``url_database.add_site``).  All resulting ``(url_id, import_id)``
    pairs are inserted into ``urls_staticlist`` in one statement inside a
    single transaction on *db*.

    Malformed lines are reported to stderr and set
    ``self.delayed_failure``; after the whole file has been scanned, any
    recorded failure aborts with ``SystemExit(1)`` so every bad line is
    reported in one run.
    """
    to_insert = set()
    sys.stderr.write("Importing {}...".format(self.source_label))
    sys.stderr.flush()
    # `with db` commits on clean exit; the cursor is closed with it.
    with db, db.cursor() as cur:
        for line in fp:
            line = line.strip()
            self.lineno += 1
            # Skip blank lines and comment lines.
            if line == "" or line[0] == "#":
                continue
            if self._has_scheme.match(line):
                # Line carries an explicit scheme: only HTTP(S) is allowed.
                if (not line.startswith("http://")
                        and not line.startswith("https://")):
                    sys.stderr.write("{}:{}: non-HTTP(S) URL: {!r}\n"
                                     .format(self.args.file, self.lineno,
                                             line))
                    self.delayed_failure = True
                    continue
                try:
                    (url_id, _) = url_database.add_url_string(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n"
                                     .format(self.args.file, self.lineno,
                                             str(e)))
                    self.delayed_failure = True
                    continue
                to_insert.add(cur.mogrify("(%s, %s)",
                                          (url_id, self.import_id)))
            else:
                # Bare site name: expand to its canonical URL variants.
                try:
                    urls = url_database.add_site(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n"
                                     .format(self.args.file, self.lineno,
                                             str(e)))
                    self.delayed_failure = True
                    continue
                for pair in urls:
                    to_insert.add(cur.mogrify("(%s, %s)",
                                              (pair[0], self.import_id)))

        if self.delayed_failure:
            raise SystemExit(1)

        # Guard against an empty value list: "INSERT ... VALUES" followed
        # by nothing is a SQL syntax error, so an empty input file (or one
        # containing only comments) must skip the INSERT entirely.
        if to_insert:
            sys.stderr.write(" (insert)")
            sys.stderr.flush()
            cur.execute(b"INSERT INTO urls_staticlist "
                        b"(url, listid) VALUES " +
                        b",".join(sorted(to_insert)))
        sys.stderr.write(" (commit)")
        sys.stderr.flush()
    sys.stderr.write("\n")
def load_urls(self, db, fp):
    """Bulk-import a static URL list from *fp* into ``urls_staticlist``.

    Lines are stripped; empty lines and ``#`` comments are ignored.  A
    line matching ``self._has_scheme`` is treated as a full URL and must
    be http/https; any other line is treated as a site name and expanded
    with ``url_database.add_site``.  Each URL id is paired with
    ``self.import_id`` and all pairs are inserted in one statement, all
    within one transaction on *db* (committed when the ``with`` block
    exits cleanly).

    Bad lines are reported to ``self.args.file``:lineno on stderr and
    flagged via ``self.delayed_failure``; after scanning the entire file
    any flagged failure raises ``SystemExit(1)``.
    """
    to_insert = set()
    sys.stderr.write("Importing {}...".format(self.source_label))
    sys.stderr.flush()
    with db, db.cursor() as cur:
        for line in fp:
            line = line.strip()
            self.lineno += 1
            if line == "" or line[0] == "#":
                continue  # blank line or comment
            if self._has_scheme.match(line):
                # Explicit scheme present — reject anything but HTTP(S).
                if (not line.startswith("http://")
                        and not line.startswith("https://")):
                    sys.stderr.write(
                        "{}:{}: non-HTTP(S) URL: {!r}\n".format(
                            self.args.file, self.lineno, line))
                    self.delayed_failure = True
                    continue
                try:
                    (url_id, _) = url_database.add_url_string(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n".format(
                        self.args.file, self.lineno, str(e)))
                    self.delayed_failure = True
                    continue
                to_insert.add(
                    cur.mogrify("(%s, %s)", (url_id, self.import_id)))
            else:
                # No scheme — treat as a site name and expand it.
                try:
                    urls = url_database.add_site(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n".format(
                        self.args.file, self.lineno, str(e)))
                    self.delayed_failure = True
                    continue
                for pair in urls:
                    to_insert.add(
                        cur.mogrify("(%s, %s)", (pair[0], self.import_id)))

        if self.delayed_failure:
            raise SystemExit(1)

        # Skip the INSERT when nothing was collected: an empty VALUES
        # list ("INSERT ... VALUES " + "") would be a SQL syntax error.
        if to_insert:
            sys.stderr.write(" (insert)")
            sys.stderr.flush()
            cur.execute(b"INSERT INTO urls_staticlist "
                        b"(url, listid) VALUES " +
                        b",".join(sorted(to_insert)))
        sys.stderr.write(" (commit)")
        sys.stderr.flush()
    sys.stderr.write("\n")