def crawl(self):
    """
    Begins the crawling process using variables set earlier. Extracts queries
    by locating website-specific HTML tags or searching for common expression
    patterns. Writes queries to output after finishing each site.
    """
    while not self.link_queue.empty():
        # Retrieve the next link in the queue
        next_node = self.link_queue.get()
        node_url = next_node.get_url()
        node_depth = next_node.get_depth()

        # Stop if the crawler has exceeded the maximum depth or maximum page count
        if node_depth >= self.max_depth or self.count >= self.max_size:
            self.log.close()
            return

        # Fetch the page; skip it if the request failed
        html_response = self.get_html(node_url)
        if html_response is None:
            continue

        # Queue every link found on the page for later crawling
        links = extractor.extract_links(html_response)
        for link in links:
            self.add_new_link(link, node_depth)

        # Record any queries found on the page
        queries = extractor.extract_queries(html_response)
        if queries:
            self.log.log_queries(queries, node_url)

        self.log.log_page(node_url, len(queries))
        self.count += 1

    self.log.close()
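The extractor module used by crawl() is not shown in this section. A minimal sketch of what extract_queries might look like follows; it assumes the response object exposes a requests-style .text attribute and that queries are pulled from <pre>/<code> blocks with a simple SQL-shaped regex. The tag names and pattern are illustrative assumptions, not the project's actual implementation.

# Hypothetical sketch of extractor.extract_queries, not the real module.
import re

from bs4 import BeautifulSoup

# Assumed heuristic: anything that looks like a SELECT ... FROM ... statement.
SQL_PATTERN = re.compile(r"\bSELECT\b.+?\bFROM\b.+", re.IGNORECASE | re.DOTALL)


def extract_queries(response):
    """Return SQL-like snippets found inside <pre> and <code> blocks."""
    soup = BeautifulSoup(response.text, "html.parser")
    queries = []
    for block in soup.find_all(["pre", "code"]):
        text = block.get_text().strip()
        if SQL_PATTERN.search(text):
            queries.append(text)
    return queries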
def test_generic_extraction():
    url = "https://fake.website"
    with open("resources/sample_a.html", "r") as reader:
        mock_request = MockRequest(url, 200, reader.read())
    queries = extractor.extract_queries(mock_request)
    assert len(queries) == 5
def test_extract_queries():
    url = "https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax"
    with open("resources/googleCloudSite.html", "r") as reader:
        mock_request = MockRequest(url, 200, reader.read())
    assert len(extractor.extract_queries(mock_request)) == 103
def test_generic_extraction_with_links():
    url = "https://fake.website"
    with open("resources/sample_b.html", "r") as reader:
        mock_request = MockRequest(url, 200, reader.read())
    assert len(extractor.extract_links(mock_request)) == 2
    assert len(extractor.extract_queries(mock_request)) == 8
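The tests construct MockRequest objects whose definition is not included here. A minimal stand-in, assuming the extractor only reads the URL, status code, and body text from it, could look like this:

# Hypothetical MockRequest used by the tests; the real helper is not shown.
class MockRequest:
    def __init__(self, url, status_code, text):
        self.url = url                  # page the fake response "came from"
        self.status_code = status_code  # HTTP status, e.g. 200
        self.text = text                # raw HTML body read from the fixture file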