def main_loop(self):
    """Drive the run: keep up to args.parallel CanonTask children alive,
    reap them with os.wait(), and re-queue network timeouts for retry."""
    self.report_overall_progress()
    # Loop until the todo queue is drained AND all children are reaped.
    while self.todo or self.in_progress:
        # Top up the worker pool from the front of the todo queue.
        while self.todo and len(self.in_progress) < self.args.parallel:
            uid, url, retries = self.todo.popleft()
            url = url_database.canon_url_syntax(url)
            idx = self.assign_display_index(url)
            task = CanonTask(uid, url, retries, idx)
            # Keyed by child pid so os.wait() results map back to tasks.
            self.in_progress[task.pid] = task
        try:
            # Retry os.wait() when a signal interrupts it.
            # NOTE(review): since PEP 475 (Python 3.5+) os.wait retries on
            # EINTR automatically, so this inner loop is belt-and-braces.
            while True:
                try:
                    (pid, status) = os.wait()
                    break
                except InterruptedError:
                    continue
        except ChildProcessError:
            continue # no children to wait for: keep going
        task = self.in_progress.pop(pid)
        task.pickup_results(status)
        # Retry network timeouts up to five times.
        if task.status == "Network timeout":
            if task.retries < 5:
                self.report_result(task, task.status)
                self.todo.append((task.original_uid,
                                  task.original_url,
                                  task.retries + 1))
                continue
        self.record_canonized(task)
def __init__(self, url, proxy):
    """Validate *url* and, if it survives canonicalization, launch a
    sandboxed phantomjs capture of it via *proxy*'s adjusted command.

    On an invalid URL, self.status/self.detail record the failure and
    self.proc stays None (no child process is started).
    """
    self.proc = None
    self.original_url = url
    self.canon_url = None
    self.status = None
    self.detail = None
    self.log = {}
    self.content = None
    self.render = None

    # Make sure the URL is not so mangled that phantomjs is just going
    # to give up and report nothing at all.
    try:
        self.original_url = \
            url_database.canon_url_syntax(url, want_splitresult = False)
    except ValueError as e:
        self.status = 'invalid URL'
        self.detail = str(e)
        return
    except UnicodeError as e:
        # Unwrap exception chaining to report the root encoding error.
        while e.__cause__ is not None:
            e = e.__cause__
        self.status = 'invalid URL'
        self.detail = 'invalid hostname: ' + str(e)
        return

    # We use a temporary file for the results, instead of a pipe,
    # so we don't have to worry about reading them until after the
    # child process exits.
    self.result_fd = tempfile.TemporaryFile("w+t", encoding="utf-8")
    self.errors_fd = tempfile.TemporaryFile("w+t", encoding="utf-8")
    self.proc = subprocess.Popen(
        proxy.adjust_command([
            "isolate",
            "ISOL_RL_MEM=unlimited",
            "ISOL_RL_STACK=8388608",
            "PHANTOMJS_DISABLE_CRASH_DUMPS=1",
            "MALLOC_CHECK_=0",
            "phantomjs",
            "--local-url-access=no",
            pj_trace_redir,
            "--capture",
            self.original_url
        ]),
        stdin=subprocess.DEVNULL,
        stdout=self.result_fd,
        stderr=self.errors_fd)
def __init__(self, url):
    """Initialize an empty result record for *url*.

    The URL's syntax is canonicalized up front; if it is too mangled to
    canonicalize, status/detail describe the problem and original_url
    keeps the raw input string.
    """
    # Empty defaults, to be filled in as results arrive.
    self.status = ""
    self.detail = ""
    self.log = {}
    self.canon_url = ""
    self.content = ""
    self.elapsed = 0.

    try:
        self.original_url = canon_url_syntax(url, want_splitresult=False)
    except ValueError as err:
        self.original_url = url
        self.status = 'invalid URL'
        self.detail = str(err)
    except UnicodeError as err:
        # Walk the exception chain down to the root encoding failure.
        root = err
        while root.__cause__ is not None:
            root = root.__cause__
        self.original_url = url
        self.status = 'invalid URL'
        self.detail = 'invalid hostname: ' + str(root)
def main_loop(self):
    """Drive the run from a line-oriented todo file: keep up to
    args.parallel CanonTask children alive, reap them with os.wait(),
    and record each result.

    Input lines have the form "uid|url" (ASCII).  Lines that fail to
    decode or parse are counted in self.anomalies and dumped as JSON to
    self.bogus_results instead of aborting the run.
    """
    self.report_overall_progress()
    all_read = False
    # Loop until the todo file is exhausted AND all children are reaped.
    while self.in_progress or not all_read:
        # Top up the worker pool from the todo file.
        while not all_read and len(self.in_progress) < self.args.parallel:
            # BUGFIX: pre-bind raw_line so the except clause below can
            # always repr() it; previously, if readline() itself raised,
            # the handler crashed with NameError on the unbound name.
            raw_line = None
            try:
                raw_line = self.todo.readline()
                line = raw_line.decode("ascii").strip()
            except Exception as e:
                self.anomalies += 1
                self.bogus_results.write("{}\n".format(json.dumps({
                    "exception": repr(e),
                    "raw_line": repr(raw_line)
                })))
                continue
            # An empty string (not a blank line) means end of file.
            if line == "":
                all_read = True
                break
            uid, url = line.split("|", 1)
            url = url_database.canon_url_syntax(url)
            idx = self.assign_display_index(url)
            task = CanonTask(uid, url, idx)
            # Keyed by child pid so os.wait() results map back to tasks.
            self.in_progress[task.pid] = task
        try:
            (pid, status) = os.wait()
        except ChildProcessError:
            continue # no children to wait for: keep going
        task = self.in_progress.pop(pid)
        task.pickup_results(status)
        self.record_canonized(task)
def add_urls_from_site(cur, site, ordinal, oid, already_seen):
    """Expand one Alexa "site" entry into up to eight candidate URLs and
    insert the new ones into the urls table.  Subroutine of
    process_sitelist.

    Alexa's list mixes bare hostnames and hostname+path entries; any
    third-level label (such as "www.") is usually stripped, and there is
    never a scheme, so we cannot tell whether the site prefers http: or
    https:.  Each entry is therefore expanded to http/https, with and
    without a "www." prefix, and (when a path is present) with and
    without that path.  This does no good when the real content lives on
    some other subdomain — e.g. akamaihd.net is site #68, but all its
    content is on SOMETHINGELSE.akamaihd.net — but nothing can be done
    about that.

    Because the database schema requires ordinal+oid to be unique, the
    ordinal is shifted left three bits to make room for a per-expansion
    tag (0-7).

    Returns the number of urls-table rows added for this entry.
    """
    parsed = url_database.canon_url_syntax(
        urllib.parse.urlsplit("http://" + site))
    assert parsed.path != ""

    # Separate the site root from any path component.
    need_path = parsed.path != "/"
    root = to_siteroot(parsed) if need_path else parsed

    candidates = [(0, root.geturl()),
                  (1, to_https(root).geturl())]

    # Prepending "www." makes no sense if the host already starts with
    # "www." or is an IP address.
    need_www = not no_www_re.match(root.hostname)
    if need_www:
        www_root = add_www(root)
        candidates.append((2, www_root.geturl()))
        candidates.append((3, to_https(www_root).geturl()))

    if need_path:
        candidates.append((4, parsed.geturl()))
        candidates.append((5, to_https(parsed).geturl()))
        if need_www:
            www_full = add_www(parsed)
            candidates.append((6, www_full.geturl()))
            candidates.append((7, to_https(www_full).geturl()))

    ordinal = int(ordinal) * 8
    nnew = 0
    for tag, candidate in candidates:
        (uid, canon) = url_database.add_url_string(cur, candidate)
        if canon in already_seen:
            continue
        already_seen.add(canon)
        # We want to add an url-table entry for this URL even if it's
        # already there from some other source; we only drop them if
        # they are redundant within this data set.  However, in case
        # the database-loading operation got interrupted midway,
        # do an INSERT OR IGNORE.
        cur.execute("INSERT OR IGNORE INTO urls VALUES(?, ?, ?)",
                    (oid, ordinal + tag, uid))
        nnew += 1
    return nnew
def add_urls_from_site(cur, site, rank, datestamp, batch, already_seen):
    """Expand one Alexa "site" entry into candidate URLs and append
    (uid, rank, datestamp) rows to *batch* for each not-yet-seen URL."""
    # Subroutine of process_sitelist.
    #
    # Alexa's "site" list has two different kinds of
    # addresses on it: with and without a URL path.
    # Also, most but not all of the sites are second-level
    # domains: any third-level piece (such as "www.") has
    # been stripped.  In no case is there a scheme; in
    # particular we have no idea whether the site prefers
    # http: or https:.  So we expand each entry to four:
    #
    #   http://       site
    #   https://      site
    #   http://  www. site
    #   https:// www. site
    #
    # If there was a path, we include all of the above
    # both with and without the path.  This scheme won't
    # do us any good if the actual content people are
    # loading is neither at the name in the list nor at
    # www. the name in the list; for instance,
    # akamaihd.net is site #68, but neither akamaihd.net
    # nor www.akamaihd.net has any A records, because,
    # being a CDN, all of the actual content is on servers
    # named SOMETHINGELSE.akamaihd.net, and you're not
    # expected to notice that the domain even exists.
    # But there's nothing we can do about that.
    #
    # It does not make sense to prepend 'www.' if 'site' already
    # starts with 'www.' or if it is an IP address.
    parsed = url_database.canon_url_syntax("http://" + site,
                                           want_splitresult=True)
    assert parsed.path != ""
    if parsed.path != "/":
        root = to_siteroot(parsed)
        need_path = True
    else:
        root = parsed
        need_path = False
    urls = [ root.geturl(),
             to_https(root).geturl() ]
    host = root.hostname
    if no_www_re.match(host):
        need_www = False
    else:
        need_www = True
        with_www = add_www(root)
        urls.extend([ with_www.geturl(),
                      to_https(with_www).geturl() ])
    if need_path:
        urls.extend([ parsed.geturl(),
                      to_https(parsed).geturl() ])
        # The www. variants of the full path only make sense when a
        # path is present, hence the nesting.
        if need_www:
            with_www = add_www(parsed)
            urls.extend([ with_www.geturl(),
                          to_https(with_www).geturl() ])
    for url in urls:
        # add_url_string both interns the URL and returns its
        # canonicalized form; dedup on the canonical string.
        (uid, url) = url_database.add_url_string(cur, url)
        if url in already_seen:
            continue
        batch.append( (uid, rank, datestamp) )
        already_seen.add(url)