Example #1
    def process_item(self, item, spider, db=None):
        # Get domain and parsed URL info.
        domain = Domain.find_stub_by_url(item["url"], db)
        parsed = ParsedURL(item["url"])
        now = datetime.now()

        # Get or create file.
        file_row = db.query(File).filter(File.url == item["url"]).scalar()

        if not file_row:
            statement = insert(File).values(
                url=item["url"],
                domain_id=domain.id,
                last_crawl=now,
                size=item["size"],
                path=parsed.path).on_conflict_do_nothing(
                    index_elements=["url"])
            db.execute(statement)
            file_row = db.query(File).filter(File.url == item["url"]).scalar()

        # Update file information.
        file_store = HashedFile.from_data(item["content"], save=False)

        file_row.last_crawl = now

        if domain.blacklisted:
            # Overwrite the stored file with the blacklist placeholder before updating the row.
            file_store.write(BLACKLISTED_BLANK)
            file_row.content = BLACKLISTED_BLANK
        elif file_store.read() != item["content"]:
            file_row.content = item["content"]

        db.commit()

        return item
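
The `insert(...).on_conflict_do_nothing(...)` call is SQLAlchemy's PostgreSQL-dialect upsert, used here (and again in Example #7) as an "insert if missing, then re-select" pattern. A minimal standalone sketch of that pattern, assuming a hypothetical `files` table and session setup rather than the project's actual models:

    from datetime import datetime

    from sqlalchemy import Column, DateTime, Integer, String, create_engine
    from sqlalchemy.dialects.postgresql import insert
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()

    class File(Base):
        # Assumed schema, for illustration only.
        __tablename__ = "files"
        id = Column(Integer, primary_key=True)
        url = Column(String, unique=True, nullable=False)
        last_crawl = Column(DateTime)

    engine = create_engine("postgresql:///scraper")  # assumed DSN

    with Session(engine) as db:
        url = "http://expyuzz4wqqyqhjn.onion/"
        # INSERT ... ON CONFLICT (url) DO NOTHING lets concurrent crawls race
        # on the unique url column without raising IntegrityError.
        db.execute(
            insert(File)
            .values(url=url, last_crawl=datetime.now())
            .on_conflict_do_nothing(index_elements=["url"]))
        # Re-select so we end up with the persisted row whether or not this
        # transaction was the one that inserted it.
        file_row = db.query(File).filter(File.url == url).scalar()
        db.commit()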
Example #2
    def process_exception(self, response, exception, spider):
        parsed = ParsedURL(response.url)

        if isinstance(exception, TwistedTimeoutError):
            self.server.incr("timeouts:" + md5(parsed.host), 1)
            self.server.expire("timeouts:" + md5(parsed.host), 60 * 60 * 24)
        elif exception:
            self.logger.error("Caught unhandled exception in spider.")
            self.logger.error("".join(traceback.format_exception(
                type(exception), exception, exception.__traceback__)))
Example #3
    def process_request(self, request, spider):
        if not Domain.is_onion_url(request.url):
            return None

        parsed = ParsedURL(request.url)
        subdomains = parsed.host.count(".")
        if subdomains > 2:
            raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)

        return None
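
A hook like this only runs once the middleware class is listed in the crawler's settings. A sketch of the Scrapy wiring; the dotted path and priority are placeholders, not the project's real values:

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        # Hypothetical module path for the subdomain-limiting middleware above.
        "myproject.middlewares.SubdomainLimitMiddleware": 543,
    }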
Example #4
    def process_exception(self, request, exception, spider):
        parsed = ParsedURL(request.url)

        if isinstance(exception, TwistedTimeoutError):
            self.redis.incr("timeouts:" + md5(parsed.host), 1)
            self.redis.expire("timeouts:" + md5(parsed.host), 60 * 60 * 24)
        elif exception:
            spider.logger.error("Caught unhandled exception in handler.")
            spider.logger.error("".join(traceback.format_exception(
                type(exception), exception, exception.__traceback__)))

        return None
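
Both exception handlers (Examples #2 and #4) key their Redis counters on `md5(parsed.host)`, which implies a small project helper rather than `hashlib.md5` itself, since the standard constructor returns a hash object, not a string. A plausible definition, offered purely as an assumption:

    import hashlib

    def md5(value: str) -> str:
        # Hypothetical helper: hex digest of the UTF-8 encoded value, used to
        # keep Redis key names short and uniform.
        return hashlib.md5(value.encode("utf-8")).hexdigest()

    print(md5("expyuzz4wqqyqhjn.onion"))  # 32-character hex string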
Example #5
    def is_onion_url(url: str):
        url = url.strip()
        if not re.match(r"http[s]?://", url):
            return False

        try:
            parsed_url = ParsedURL(url)
            return bool(onion_regex.match(parsed_url.host))
        except TypeError:
            return False
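
`onion_regex` is defined elsewhere in the project. A rough stand-in that accepts v2 (16-character) and v3 (56-character) onion hostnames, with a few quick checks; the exact pattern is an assumption:

    import re

    # Assumed pattern: optional subdomain labels, then a base32 label of
    # 16 (v2) or 56 (v3) characters, ending in ".onion".
    onion_regex = re.compile(
        r"^(?:[a-z2-7]+\.)*[a-z2-7]{16}(?:[a-z2-7]{40})?\.onion$")

    print(bool(onion_regex.match("expyuzz4wqqyqhjn.onion")))      # True (v2 host)
    print(bool(onion_regex.match("www.expyuzz4wqqyqhjn.onion")))  # True (subdomain)
    print(bool(onion_regex.match("example.com")))                 # False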
Example #6
    def process_request(self, request, spider):
        # Allow requests if the max pages is disabled.
        if self.max_pages == -1:
            return None

        parsed = ParsedURL(request.url)
        page_count = self.pages_script(args=[parsed.host, self.max_pages])
        if page_count < self.max_pages:
            spider.logger.info('Page count is %d for %s' %
                               (page_count, parsed.host))
            return None
        else:
            raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' %
                                request.url)
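
`self.pages_script` looks like a Lua script registered with redis-py, and Example #10 suggests the counter lives in the `spider:pagecount` hash. A sketch of how such a script could be registered and called; the Lua body and key name are assumptions:

    import redis

    # Assumed behaviour: bump the per-host page counter unless the cap has
    # been reached, and return the current count either way.
    PAGES_LUA = """
    local count = tonumber(redis.call('HGET', 'spider:pagecount', ARGV[1]) or '0')
    if count < tonumber(ARGV[2]) then
        count = redis.call('HINCRBY', 'spider:pagecount', ARGV[1], 1)
    end
    return count
    """

    r = redis.Redis()
    pages_script = r.register_script(PAGES_LUA)

    # Mirrors the middleware call above: host and cap are passed as ARGV.
    page_count = pages_script(args=["expyuzz4wqqyqhjn.onion", 1000])
    print(page_count)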
Example #7
    def find_stub_by_url(cls, url: str, db):
        page = db.query(Page).filter(Page.url == url).scalar()

        if not page:
            domain = Domain.find_stub_by_url(url, db)
            parsed = ParsedURL(url)

            statement = insert(Page).values(
                url=url,
                domain_id=domain.id,
                path=parsed.path,
            ).on_conflict_do_nothing(index_elements=["url"])
            db.execute(statement)
            page = cls.find_stub_by_url(url, db)

        return page
Example #8
    def process_item(self, item, spider, db=None):
        # Sanity checks
        if not item:
            raise DropItem("Somehow got a blank item dict.")

        if not Domain.is_onion_url(item["url"]):
            raise DropItem(f"{item['url']} is not an onion.")

        now = datetime.now()
        parsed = ParsedURL(item["url"])

        # Get or create domain and update info.
        domain = Domain.find_stub_by_url(item["url"], db)
        domain.last_crawl = now
        domain.alive = item["status_code"] not in BAD_STATUS_CODES
        if item["frontpage"]:
            if not (domain.title != '' and item["title"] == ''):
                domain.title = item["title"]
        db.commit()

        # Get or create page.
        page = Page.find_stub_by_url(item["url"], db)

        # Update page information.
        page.status_code = item["status_code"]
        page.last_crawl = now
        page.header_server = item["server"]
        page.header_powered_by = item["powered_by"]
        page.title = item["title"]

        if page.is_frontpage != item["frontpage"]:
            page.is_frontpage = item["frontpage"]

        # Update links to.
        page.links_to = list(item["links_to"])

        db.commit()
        return item
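
Like Example #1, this is an item pipeline and has to be enabled in the project settings before `process_item` is called. A sketch of that wiring; the class paths and priorities are placeholders:

    # settings.py (sketch)
    ITEM_PIPELINES = {
        # Hypothetical paths for the page pipeline (this example) and the
        # file-content pipeline from Example #1.
        "myproject.pipelines.PagePipeline": 300,
        "myproject.pipelines.FilePipeline": 400,
    }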
Example #9
    def parse_page_info(self, response):
        """
            Parses the page meta information for the pipeline.

            Example return:
                {
                    "host": "someonionpagehostname.onion",
                    "url": "someonionpagehostname.onion/",
                    "status_code": 200,
                    "size": 420,
                    "server": "TotallyReal Server",
                    "powered_by": "IE 6.0",
                    "title": "Page title",
                    "frontpage": True,
                    "content": "<h1>Under Construction</h1>",
                    "links_to": set()
                }
        """

        page_metadata = {
            # HTTP headers
            "host": "",
            "url": response.url,
            "status_code": response.status,
            "size": 0,
            "server": "",
            "powered_by": "",

            # Parsed from page
            "title": "",
            "frontpage": False,
            "content": None,
            "links_to": set(),
            "other_links": set(),
        }

        # Attempt setting the content
        try:
            page_metadata["content"] = response.text
        except AttributeError:
            page_metadata["content"] = response.body

        # Grab the title of the page.
        try:
            page_metadata["title"] = response.css(
                'title::text').extract_first()
        except AttributeError:
            pass
        except scrapy.exceptions.NotSupported:
            self.logger.debug(f"Fetched non-text file {response.url}")

        # Get tor URL "hostname"
        parsed = ParsedURL(response.url)

        self.log('Got %s (%s)' % (response.url, page_metadata["title"]))
        page_metadata["frontpage"] = Page.is_frontpage_request(
            response.request)
        page_metadata["size"] = len(response.body)
        page_metadata["host"] = parsed.host

        got_server_response = response.status in GOOD_STATUS_CODES

        # Domain headers
        if got_server_response:
            if response.headers.get("Server"):
                page_metadata["server"] = response.headers.get(
                    "Server").decode("utf-8", "replace")
            if response.headers.get("X-Powered-By"):
                page_metadata["powered_by"] = response.headers.get(
                    "X-Powered-By").decode("utf-8", "replace")
            if response.headers.get("Powered-By"):
                page_metadata["powered_by"] = response.headers.get(
                    "Powered-By").decode("utf-8", "replace")

        is_text = False
        content_type = response.headers.get("Content-Type", b"").decode(
            "utf-8", "replace")
        if got_server_response and content_type and re.match(
                '^text/', content_type.strip()):
            is_text = True

        # Update links_to
        if parsed.host not in self.spider_exclude:
            try:
                for url in response.xpath('//a/@href').extract():
                    # Split the URL on ".onion" to clean out web-to-onion gateway prefixes if present.
                    fullurl_parts = response.urljoin(url).split(".onion", 1)

                    # Skip this URL if it has only one part. Onions should have two parts.
                    if len(fullurl_parts) == 1:
                        self.logger.debug(
                            f"Stage 1 dropping non-onion URL '{fullurl_parts[0]}'."
                        )
                        continue

                    # Some pages link to things like qwertyuiop.onion.onion/index.php;
                    # collapse the repeated ".onion" labels.
                    while fullurl_parts[1].startswith(".onion"):
                        fullurl_parts[1] = fullurl_parts[1][len(".onion"):]

                    # Merge the parts back together.
                    fullurl = urljoin(fullurl_parts[0] + ".onion",
                                      fullurl_parts[1])

                    # Post-merge sanity checks: skip links we should not record.
                    if not got_server_response:
                        self.logger.debug(
                            f"Did not get server response from '{fullurl}'.")
                        continue
                    elif not Domain.is_onion_url(fullurl):
                        self.logger.debug(
                            f"Stage 2 dropping non-onion URL '{fullurl}'.")
                        continue

                    # Parse the link and update the lists.
                    try:
                        parsed_link = ParsedURL(fullurl)
                        link_host = parsed_link.host
                    except Exception:
                        # Unparseable link; skip it.
                        continue

                    if parsed.host != link_host:
                        page_metadata["links_to"].add(fullurl)
                    else:
                        page_metadata["other_links"].add(fullurl)

                if len(page_metadata["links_to"]) <= 5:
                    self.logger.debug("link_to_list len %s %s" % (len(
                        page_metadata["links_to"]), page_metadata["links_to"]))
                else:
                    self.logger.debug("link_to_list len %s truncated" %
                                      (len(page_metadata["links_to"])))
            except (AttributeError, scrapy.exceptions.NotSupported):
                pass

        return page_metadata
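
The trickiest part of the link loop above is the href surgery: splitting on `.onion` to strip web-to-onion gateway prefixes and collapsing repeated `.onion.onion` labels. Here is the same normalization pulled out into a standalone function for clarity; the helper name and its handling of edge cases are assumptions, not project code:

    from typing import Optional
    from urllib.parse import urljoin

    def normalize_onion_link(href: str) -> Optional[str]:
        # Hypothetical helper mirroring the loop above: keep only the onion
        # part of a link, or return None if it is not an onion URL at all.
        parts = href.split(".onion", 1)
        if len(parts) == 1:
            return None  # no ".onion" anywhere in the URL

        # Collapse accidental ".onion.onion" repetitions seen in the wild.
        rest = parts[1]
        while rest.startswith(".onion"):
            rest = rest[len(".onion"):]

        return urljoin(parts[0] + ".onion", rest)

    print(normalize_onion_link("http://expyuzz4wqqyqhjn.onion/about"))
    # http://expyuzz4wqqyqhjn.onion/about
    print(normalize_onion_link("http://qwertyuiop.onion.onion/index.php"))
    # http://qwertyuiop.onion/index.php
    print(normalize_onion_link("https://example.com/"))
    # None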
Example #10
    def process_exception(self, request, exception, spider):
        parsed = ParsedURL(request.url)
        if exception:
            self.redis.hincrby("spider:pagecount", parsed.host, -1)

        return None
Example #11
    if page.id % 250 == 0:
        print(f"Currently at ID {page.id}.")

    title = f"{domains_by_id[page.domain_id].host}\n{domains_by_id[page.domain_id].title or 'No title.'}"
    if page.domain_id not in used_domains:
        nodes.append({
            "id": domains_by_id[page.domain_id].host + ":" +
                  str(domains_by_id[page.domain_id].port),
            "label": title,
        })
        used_domains.add(page.domain_id)

    for link in page.links_to:
        parsed = ParsedURL(link)

        if not onion_regex.match(parsed.host):
            continue

        if parsed.host not in domains_by_host:
            continue

        source = (domains_by_id[page.domain_id].host + ":" +
                  str(domains_by_id[page.domain_id].port))
        target = parsed.host + ":" + str(parsed.port)
        link_iters[source][target] += 1

print(f"{len(nodes)} nodes graphed.")

# Construct vis data.
for parent_node, child_link_list in link_iters.items():
Example #12
    def find_stub_by_url(cls, url, db):
        parsed = ParsedURL(url)
        return cls.find_stub(parsed.host, parsed.port, parsed.secure, db)
Example #13
    def is_frontpage_url(url):
        parsed = ParsedURL(url)
        return parsed.path == '/'