Example 1
    def main_loop(self):
        self.report_overall_progress()

        while self.todo or self.in_progress:
            while self.todo and len(self.in_progress) < self.args.parallel:
                uid, url, retries = self.todo.popleft()
                url = url_database.canon_url_syntax(url)
                idx = self.assign_display_index(url)
                task = CanonTask(uid, url, retries, idx)
                self.in_progress[task.pid] = task

            try:
                # Retry the wait if a signal interrupts it.  (Python 3.5+
                # retries EINTR automatically per PEP 475, so the inner
                # loop only matters on older interpreters.)
                while True:
                    try:
                        (pid, status) = os.wait()
                        break
                    except InterruptedError:
                        continue
            except ChildProcessError:
                continue  # no children to wait for: keep going

            task = self.in_progress.pop(pid)
            task.pickup_results(status)

            # Retry network timeouts up to five times.
            if task.status == "Network timeout" and task.retries < 5:
                self.report_result(task, task.status)
                self.todo.append((task.original_uid,
                                  task.original_url,
                                  task.retries + 1))
                continue

            self.record_canonized(task)
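
The (pid, status) pair from os.wait() carries a raw wait status word, not a plain exit code; pickup_results presumably decodes it. A minimal sketch of that decoding using the standard os macros (the helper name and return shape are illustrative, not from the source):

    import os

    def decode_wait_status(status):
        # Illustrative helper, not from the source: os.wait() returns a
        # raw status word, which the W* macros pick apart.
        if os.WIFEXITED(status):
            return ("exited", os.WEXITSTATUS(status))
        if os.WIFSIGNALED(status):
            return ("killed by signal", os.WTERMSIG(status))
        return ("stopped or unknown", status)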
Example 2
    def __init__(self, url, proxy):
        self.proc         = None
        self.original_url = url
        self.canon_url    = None
        self.status       = None
        self.detail       = None
        self.log          = {}
        self.content      = None
        self.render       = None

        # Make sure the URL is not so mangled that phantomjs is just going
        # to give up and report nothing at all.
        try:
            self.original_url = url_database.canon_url_syntax(
                url, want_splitresult=False)

        except ValueError as e:
            self.status = 'invalid URL'
            self.detail = str(e)
            return

        except UnicodeError as e:
            # Unwrap chained exceptions to the root cause for a more
            # specific error message.
            while e.__cause__ is not None:
                e = e.__cause__
            self.status = 'invalid URL'
            self.detail = 'invalid hostname: ' + str(e)
            return

        # We use a temporary file for the results, instead of a pipe,
        # so we don't have to worry about reading them until after the
        # child process exits.
        self.result_fd = tempfile.TemporaryFile("w+t", encoding="utf-8")
        self.errors_fd = tempfile.TemporaryFile("w+t", encoding="utf-8")

        self.proc = subprocess.Popen(
            proxy.adjust_command([
                "isolate",
                "ISOL_RL_MEM=unlimited",
                "ISOL_RL_STACK=8388608",
                "PHANTOMJS_DISABLE_CRASH_DUMPS=1",
                "MALLOC_CHECK_=0",
                "phantomjs",
                "--local-url-access=no",
                pj_trace_redir,
                "--capture",
                self.original_url
            ]),
            stdin=subprocess.DEVNULL,
            stdout=self.result_fd,
            stderr=self.errors_fd)
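
As the comment in the constructor notes, temporary files stand in for pipes so the parent never has to drain output while the child runs. A sketch of the read-back side under that assumption (the function is hypothetical, not from the source):

    def read_child_output(task):
        # Only safe once os.wait() has reaped the child, since nothing
        # reads these files incrementally.
        task.result_fd.seek(0)
        task.errors_fd.seek(0)
        return task.result_fd.read(), task.errors_fd.read()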
Example 3
    def __init__(self, url):
        self.status       = ""
        self.detail       = ""
        self.log          = {}
        self.canon_url    = ""
        self.content      = ""
        self.elapsed      = 0.0

        # Make sure the URL is not so mangled that phantomjs is just going
        # to give up and report nothing at all.
        try:
            self.original_url = canon_url_syntax(url, want_splitresult=False)

        except ValueError as e:
            self.original_url = url
            self.status = 'invalid URL'
            self.detail = str(e)

        except UnicodeError as e:
            # Unwrap chained exceptions to the root cause, as above.
            while e.__cause__ is not None:
                e = e.__cause__
            self.original_url = url
            self.status = 'invalid URL'
            self.detail = 'invalid hostname: ' + str(e)
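
The inline __cause__ loop in Examples 2 and 3 walks a chained exception down to its root. Factored out, it is just this (an equivalent helper, not from the source):

    def root_cause(exc):
        # Follow explicit exception chaining ("raise X from Y") to the
        # innermost cause, which usually carries the most specific message.
        while exc.__cause__ is not None:
            exc = exc.__cause__
        return exc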
Example 4
    def main_loop(self):
        self.report_overall_progress()

        all_read = False

        while self.in_progress or not all_read:
            while not all_read and len(self.in_progress) < self.args.parallel:

                raw_line = b""
                try:
                    raw_line = self.todo.readline()
                    line = raw_line.decode("ascii").strip()
                except Exception as e:
                    # The placeholder above keeps raw_line bound even if
                    # readline() itself raised.
                    self.anomalies += 1
                    self.bogus_results.write("{}\n".format(json.dumps({
                        "exception": repr(e),
                        "raw_line": repr(raw_line)
                    })))
                    continue

                # readline() returns b"" at EOF; note that an interior
                # blank line also strips to "" and ends intake early.
                if line == "":
                    all_read = True
                    break

                # Each record is "uid|url"; only the first "|" separates.
                uid, url = line.split("|", 1)
                url = url_database.canon_url_syntax(url)
                idx = self.assign_display_index(url)
                task = CanonTask(uid, url, idx)
                self.in_progress[task.pid] = task

            try:
                (pid, status) = os.wait()
            except ChildProcessError:
                continue # no children to wait for: keep going

            task = self.in_progress.pop(pid)
            task.pickup_results(status)
            self.record_canonized(task)
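
This variant's intake loop consumes its work list as ASCII bytes, one "uid|url" record per line, with EOF signalled by an empty read. A minimal illustration of the expected format (the uids and urls are made up):

    import io

    todo = io.BytesIO(b"0001|example.com\n0002|example.org/path\n")
    line = todo.readline().decode("ascii").strip()
    uid, url = line.split("|", 1)   # -> ("0001", "example.com")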
Example 5
    def add_urls_from_site(cur, site, ordinal, oid, already_seen):
        # Subroutine of process_sitelist.
        #
        # Alexa's "site" list has two different kinds of
        # addresses on it: with and without a URL path.
        # Also, most but not all of the sites are second-level
        # domains: any third-level piece (such as "www.") has
        # been stripped.  In no case is there a scheme; in
        # particular we have no idea whether the site prefers
        # http: or https:.  So we expand each entry to four:
        #
        #   http://       site
        #   https://      site
        #   http://  www. site
        #   https:// www. site
        #
        # If there was a path, we include all of the above
        # both with and without the path.  This scheme won't
        # do us any good if the actual content people are
        # loading is neither at the name in the list nor at
        # www. the name in the list; for instance,
        # akamaihd.net is site #68, but neither akamaihd.net
        # nor www.akamaihd.net has any A records, because,
        # being a CDN, all of the actual content is on servers
        # named SOMETHINGELSE.akamaihd.net, and you're not
        # expected to notice that the domain even exists.
        # But there's nothing we can do about that.
        #
        # Because the database schema requires the ordinal+oid to be
        # unique, we shift the ordinal left three bits to make room
        # for a prefix index and an indication of whether or not there
        # was a path component.
        #
        # It does not make sense to prepend 'www.' if 'site' already
        # starts with 'www.' or if it is an IP address.

        parsed = url_database.canon_url_syntax(
            urllib.parse.urlsplit("http://" + site))

        assert parsed.path != ""
        if parsed.path != "/":
            root = to_siteroot(parsed)
            need_path = True
        else:
            root = parsed
            need_path = False

        urls = [ (0, root.geturl()),
                 (1, to_https(root).geturl()) ]

        host = root.hostname
        if no_www_re.match(host):
            need_www = False
        else:
            need_www = True
            with_www = add_www(root)
            urls.extend([ (2, with_www.geturl()),
                          (3, to_https(with_www).geturl()) ])

        if need_path:
            urls.extend([ (4, parsed.geturl()),
                          (5, to_https(parsed).geturl()) ])

            if need_www:
                with_www = add_www(parsed)
                urls.extend([ (6, with_www.geturl()),
                              (7, to_https(with_www).geturl()) ])

        ordinal = int(ordinal) * 8

        nnew = 0
        for tag, url in urls:
            (uid, url) = url_database.add_url_string(cur, url)
            if url in already_seen:
                continue
            already_seen.add(url)

            # We want to add an url-table entry for this URL even if it's
            # already there from some other source; we only drop them if
            # they are redundant within this data set.  However, in case
            # the database-loading operation got interrupted midway,
            # do an INSERT OR IGNORE.
            cur.execute("INSERT OR IGNORE INTO urls VALUES(?, ?, ?)",
                        (oid, ordinal + tag, uid))
            nnew += 1

        return nnew
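
To make the ordinal arithmetic concrete: multiplying the rank by 8 frees three low bits, so a site at Alexa rank 68 occupies slots 544 through 551, one per variant tag. A toy illustration (the URLs are placeholders, not real expansions):

    rank = 68
    variants = {
        0: "http://site/",           1: "https://site/",
        2: "http://www.site/",       3: "https://www.site/",
        4: "http://site/path",       5: "https://site/path",
        6: "http://www.site/path",   7: "https://www.site/path",
    }
    # Maps 544..551 to the eight expanded URLs.
    slots = {rank * 8 + tag: url for tag, url in variants.items()}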
Example 6
    def add_urls_from_site(cur, site, rank, datestamp, batch, already_seen):
        # Subroutine of process_sitelist.
        #
        # Alexa's "site" list has two different kinds of
        # addresses on it: with and without a URL path.
        # Also, most but not all of the sites are second-level
        # domains: any third-level piece (such as "www.") has
        # been stripped.  In no case is there a scheme; in
        # particular we have no idea whether the site prefers
        # http: or https:.  So we expand each entry to four:
        #
        #   http://       site
        #   https://      site
        #   http://  www. site
        #   https:// www. site
        #
        # If there was a path, we include all of the above
        # both with and without the path.  This scheme won't
        # do us any good if the actual content people are
        # loading is neither at the name in the list nor at
        # www. the name in the list; for instance,
        # akamaihd.net is site #68, but neither akamaihd.net
        # nor www.akamaihd.net has any A records, because,
        # being a CDN, all of the actual content is on servers
        # named SOMETHINGELSE.akamaihd.net, and you're not
        # expected to notice that the domain even exists.
        # But there's nothing we can do about that.
        #
        # It does not make sense to prepend 'www.' if 'site' already
        # starts with 'www.' or if it is an IP address.

        parsed = url_database.canon_url_syntax("http://" + site,
                                               want_splitresult=True)

        assert parsed.path != ""
        if parsed.path != "/":
            root = to_siteroot(parsed)
            need_path = True
        else:
            root = parsed
            need_path = False

        urls = [ root.geturl(),
                 to_https(root).geturl() ]

        host = root.hostname
        if no_www_re.match(host):
            need_www = False
        else:
            need_www = True
            with_www = add_www(root)
            urls.extend([ with_www.geturl(),
                          to_https(with_www).geturl() ])

        if need_path:
            urls.extend([ parsed.geturl(),
                          to_https(parsed).geturl() ])

            if need_www:
                with_www = add_www(parsed)
                urls.extend([ with_www.geturl(),
                              to_https(with_www).geturl() ])

        for url in urls:
            (uid, url) = url_database.add_url_string(cur, url)
            if url in already_seen:
                continue
            batch.append( (uid, rank, datestamp) )
            already_seen.add(url)
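
The helpers to_siteroot, to_https, add_www, and no_www_re are not shown in these examples. A plausible reconstruction on top of urllib.parse.SplitResult, offered as an assumption rather than the project's actual code:

    import re
    import urllib.parse

    # Assumed: hosts that already start with "www." or look like bare
    # IPv4 addresses, where prepending "www." makes no sense.
    no_www_re = re.compile(r"^(?:www\.|\d{1,3}(?:\.\d{1,3}){3}$)")

    def to_siteroot(parsed):
        # Keep scheme and host; drop path, query, and fragment.
        return parsed._replace(path="/", query="", fragment="")

    def to_https(parsed):
        return parsed._replace(scheme="https")

    def add_www(parsed):
        return parsed._replace(netloc="www." + parsed.netloc)

    root = to_siteroot(urllib.parse.urlsplit("http://example.com/a/b"))
    assert add_www(to_https(root)).geturl() == "https://www.example.com/"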