Example #1
0
    def crawl(self):
        """ Crawls queued links until the queue is empty or a limit is hit.

            Each node taken from ``self.link_queue`` is fetched with
            ``self.get_html``; nodes whose fetch returns ``None`` are skipped.
            Links extracted from the page are handed to ``self.add_new_link``
            together with the current node's depth, extracted queries are
            written to the log, and a per-page entry is logged before
            ``self.count`` is incremented.

            The whole crawl stops as soon as a dequeued node's depth reaches
            ``self.max_depth`` or ``self.count`` reaches ``self.max_size``.
            The log is always closed when the crawl ends, including when an
            exception propagates out of the loop.
        """

        try:
            while not self.link_queue.empty():
                # Retrieve the next link in the queue
                next_node = self.link_queue.get()
                node_url = next_node.get_url()
                node_depth = next_node.get_depth()

                # Stop the entire crawl (not just this node) once the
                # maximum depth or maximum page count has been reached.
                if node_depth >= self.max_depth or self.count >= self.max_size:
                    break

                html_response = self.get_html(node_url)
                if html_response is None:
                    # Nothing to parse for this page; it is not counted.
                    continue

                for link in extractor.extract_links(html_response):
                    self.add_new_link(link, node_depth)

                queries = extractor.extract_queries(html_response)
                if queries:
                    self.log.log_queries(queries, node_url)

                self.log.log_page(node_url, len(queries))
                self.count += 1
        finally:
            # Single exit point: guarantees the log is released even if a
            # fetch, extraction or logging call raises.
            self.log.close()
def test_generic_extraction():
    """Check that five queries are extracted from the sample_a fixture."""
    url = "https://fake.website"
    # Context manager closes the fixture file even if the read fails.
    with open("resources/sample_a.html", "r") as reader:
        mock_request = MockRequest(url, 200, reader.read())
    queries = extractor.extract_queries(mock_request)
    assert len(queries) == 5
def test_extract_queries():
    """Check that 103 queries are extracted from the saved BigQuery page."""
    url = "https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax"
    # Context manager closes the fixture file even if the read fails.
    with open("resources/googleCloudSite.html", "r") as reader:
        mock_request = MockRequest(url, 200, reader.read())
    assert len(extractor.extract_queries(mock_request)) == 103
Example #4
0
def test_generic_extraction_with_links():
    """Check link and query counts extracted from the sample_b fixture."""
    url = "https://fake.website"
    # Context manager closes the fixture file even if the read fails.
    with open("resources/sample_b.html", "r") as reader:
        mock_request = MockRequest(url, 200, reader.read())
    assert len(extractor.extract_links(mock_request)) == 2
    assert len(extractor.extract_queries(mock_request)) == 8