def testTruncationWithKeepAlive(self):
    """Truncation applies per max_content_size rules across a reused fetcher.

    Two HTML fetches over the same fetcher must both be truncated to the
    policy's 1000-byte limit with identical content; a PNG fetch through the
    same fetcher must NOT be truncated to that limit.
    """
    fetcher_policy_yaml = self.getCustomFetcherPolicy(
        "fetcher_policy_sizes.yaml")
    resource = self.getResource("cloudysunny14.html")
    static_content = resource.read()
    static_content_length = len(static_content)
    self.setReturnValue(content=static_content,
                        headers={"Content-Length": static_content_length,
                                 "Content-Type": "text/html"})
    simple_http_fetcher = fetchers.SimpleHttpFetcher(
        1, fetcher_policy_yaml.fetcher_policy)
    url = "http://static_resource/cloudysunny14.html"
    result_left = simple_http_fetcher.get(url)
    result_right = simple_http_fetcher.get(url)
    self.assertEqual(1000, result_left.get("content_length"))
    self.assertEqual(1000, result_right.get("content_length"))
    # BUG FIX: map() is lazy on Python 3, so the original
    # map(self.assertLR, left, right) built an iterator that was never
    # consumed and ran zero assertions. Iterate explicitly so each
    # character pair is actually compared.
    for left_ch, right_ch in zip(result_left.get("content"),
                                 result_right.get("content")):
        self.assertLR(left_ch, right_ch)
    resource = self.getResource("mining.png")
    static_content = resource.read()
    static_content_length = len(static_content)
    self.setReturnValue(content=static_content,
                        headers={"Content-Length": static_content_length,
                                 "Content-Type": "image/png"})
    url = "http://static_resource/mining.png"
    result = simple_http_fetcher.get(url)
    # The image policy entry permits a larger size than the text one.
    self.assertTrue(result.get("content_length") > 1000)
def _robots_fetch_map(data):
    """Map function that fetches robots.txt for a domain.

    Fetches robots.txt from the given domain url; the fetched content is
    later parsed to score urls. On any fetch failure a conservative
    deny-all robots policy is substituted so the domain is skipped.

    Args:
        data: (key, value) pair where key is a position and value is the
            extracted domain url.

    Yields:
        (url, content): the domain url and the robots.txt body (or the
        deny-all fallback).
    """
    # NOTE(review): fetcher_policy_yaml is assumed to be a module-level
    # global initialized elsewhere in this file — confirm.
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    k, url = data
    logging.debug("data" + str(k) + ":" + str(url))
    content = ""
    try:
        result = fetcher.get("%s/robots.txt" % str(url))
        content = result.get("content")
    except Exception as e:
        # BUG FIX: e.message is Python-2-only and absent on many exception
        # types (AttributeError at the worst moment); str(e) is always safe.
        logging.warning("Robots.txt Fetch Error Occurs:" + str(e))
        content = "User-agent: *\nDisallow: /"
    yield (url, content)
def _fetchContentMap(binary_record):
    """Map function of fetch content.

    Fetches the target resource referenced by a page and stores the body
    in blobstore, recording a ContentDbDatum under the page's crawl datum.

    Arg:
        binary_record: serialized KeyValue proto; key is the url of the
            page, value is the url of the content to fetch.

    Yields:
        "<target_url>:<stored_url>" for the fetched content.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    page_url = proto.key()
    target_url = proto.value()
    # Start the datastore lookup asynchronously so it overlaps the HTTP fetch.
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # BUG FIX: e.message is Python-2-only; str(e) works for all
        # exceptions.
        # NOTE(review): if this except branch runs, crawl_db_datum_future is
        # never bound and the later _getCrawlDatum call raises NameError —
        # confirm intended failure behavior before restructuring.
        logging.warning("Failed create key, caused by invalid url:" +
                        page_url + ":" + str(e))
    #start fetch
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    stored_url = None
    if re.match("^/", target_url):
        # Relative target: resolve it against the page's extracted domain.
        crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
        target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)
    try:
        fetch_result = fetcher.get(target_url)
        if fetch_result:
            #Storing to blobstore
            blob_io = files.blobstore.create(
                mime_type=fetch_result.get("mime_type"),
                _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
            with files.open(blob_io, 'a') as f:
                f.write(fetch_result.get("content"))
            files.finalize(blob_io)
            blob_key = files.blobstore.get_blob_key(blob_io)
            stored_url = images.get_serving_url(str(blob_key))
    except Exception as e:
        # BUG FIX: e.message -> str(e) (see above).
        logging.warning("Fetch Error Occurs:" + str(e))
    #Put content to datastore.
    crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    if crawl_db_datum and stored_url is not None:
        entity = ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers")))
        entity.put()
    yield "%s:%s" % (target_url, stored_url)
def testRealFetch(self):
    """Smoke test: a real network fetch with the default policy succeeds."""
    default_policy = configuration.FetcherPolicyYaml.create_default_policy()
    fetcher = fetchers.SimpleHttpFetcher(1, default_policy.fetcher_policy)
    fetch_result = fetcher.get("http://cloudysunny14.blogspot.jp/")
    self.assertTrue(fetch_result is not None)
def testMimeTypeFilteringNoContentType(self):
    """A response with no Content-Type header passes the mime-type filter."""
    policy_yaml = self.getCustomFetcherPolicy("fetcher_policy.yaml")
    # Only Content-Length is advertised — Content-Type is deliberately absent.
    self.setReturnValue(headers={"Content-Length": 20000},
                        status_code=200,
                        final_url=self.redirectUrl)
    fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
    # Must not raise: the filter lets untyped responses through.
    fetcher.get("http://static_resource/simple-page.html")
def testMimeTypeFilteringWithCharset(self):
    """A Content-Type carrying a charset suffix still matches the filter."""
    policy_yaml = self.getCustomFetcherPolicy("fetcher_policy.yaml")
    self.setReturnValue(headers={"Content-Length": 20000,
                                 "Content-Type": "text/html; charset=UTF-8"},
                        status_code=200,
                        final_url=self.redirectUrl)
    fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
    # Must not raise: the charset parameter is ignored when matching.
    fetcher.get("http://cloudysunny14.blogspot.jp/")
def testRedirectPolicy(self):
    """With redirects disabled by policy, a 301 raises RedirectError."""
    policy_yaml = self.getCustomFetcherPolicy(
        "fetcher_policy_redirect_none.yaml")
    self.setReturnValue(headers={"Content-Length": 20000,
                                 "Content-Type": "text/html"},
                        status_code=301,
                        final_url=self.redirectUrl)
    fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
    self.assertRaises(errors.RedirectError,
                      fetcher.get,
                      "http://static_resource/base")
def testNotTerminatingSlowServer(self):
    """A slow transfer is NOT aborted when the minimum response rate is off."""
    # Simulated server: 5000 bytes delivered over a 0.25s duration.
    self.setReturnValue(headers={"Content-Length": 5000,
                                 "Content-Type": "text/html"},
                        duration=0.25)
    policy_yaml = configuration.FetcherPolicyYaml.create_default_policy()
    # Disable the response-rate check entirely.
    policy_yaml.fetcher_policy.min_response_rate = \
        configuration.NO_MIN_RESPONSE_RATE
    fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
    # Must complete without raising.
    fetcher.get("http://static_resource/simple-page.html")
def testMimeTypeFiltering(self):
    """A Content-Type outside the policy's allow-list aborts the fetch."""
    policy_yaml = self.getCustomFetcherPolicy("fetcher_policy.yaml")
    # text/xml is not an accepted mime type under this policy.
    self.setReturnValue(headers={"Content-Length": 20000,
                                 "Content-Type": "text/xml"},
                        status_code=200,
                        final_url=self.redirectUrl)
    fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
    self.assertRaises(errors.AbortedFetchError,
                      fetcher.get,
                      "http://static_resource/simple-page.html")
def testRedirectHandling(self):
    """Following a redirect reports the final url as fetched_url."""
    fetcher_policy_yaml = configuration.FetcherPolicyYaml.create_default_policy()
    self.setReturnValue(headers={"Content-Length": 20000,
                                 "Content-Type": "text/html"},
                        final_url=self.redirectUrl)
    url = "http://static_resource/base"
    simple_http_fetcher = fetchers.SimpleHttpFetcher(
        1, fetcher_policy_yaml.fetcher_policy)
    result = simple_http_fetcher.get(url)
    # BUG FIX: assertTrue(a, b) treats b as the failure *message*, so the
    # original assertion always passed regardless of fetched_url. Compare
    # against the redirect target actually configured above.
    self.assertEqual(self.redirectUrl, result.get("fetched_url"))
def testSlowServerTermination(self):
    """A transfer below the configured minimum response rate is aborted."""
    # 20000 bytes over 2 seconds = 10KBytes/sec, below the 20KBytes/sec
    # floor configured below.
    self.setReturnValue(headers={"Content-Length": 20000,
                                 "Content-Type": "text/html"},
                        duration=2)
    policy_yaml = configuration.FetcherPolicyYaml.create_default_policy()
    policy_yaml.fetcher_policy.min_response_rate = 20000
    fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
    self.assertRaises(errors.AbortedFetchError,
                      fetcher.get,
                      "http://static_resource/simple-page.html")
def testLargeContent(self):
    """Content exceeding max_content_size must come back truncated."""
    policy_yaml = configuration.FetcherPolicyYaml.create_default_policy()
    size_entry = policy_yaml.fetcher_policy.max_content_size[0]
    size_limit = int(size_entry.size)
    # Advertise twice the permitted size so truncation must kick in.
    self.setReturnValue(headers={"Content-Length": size_limit * 2,
                                 "Content-Type": "text/html"})
    fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
    fetch_result = fetcher.get("http://static_resource/simple-page.html")
    self.assertTrue(fetch_result.get("content_length") <= size_limit,
                    "Should be truncate")
def testAcceptLanguage(self):
    """The fetcher's Accept-Language policy selects the English variant."""
    fetcher_policy_yaml = self.getCustomFetcherPolicy("fetcher_policy.yaml")
    self.setReturnValue(headers={"Content-Length": 20000,
                                 "Content-Type": "text/html"},
                        status_code=200,
                        language_content={"en": "English",
                                          "ja": "Japanese"},
                        final_url=self.redirectUrl)
    url = "http://static_resource/simple-page.html"
    simple_http_fetcher = fetchers.SimpleHttpFetcher(
        1, fetcher_policy_yaml.fetcher_policy)
    result = simple_http_fetcher.get(url)
    # BUG FIX: assertTrue("English", x) treats x as the failure message and
    # always passed; assertEqual actually verifies the selected content.
    self.assertEqual("English", result.get("content"))
def testContentTypeHeader(self):
    """The Content-Type response header is preserved in the fetch result."""
    fetcher_policy_yaml = configuration.FetcherPolicyYaml.create_default_policy()
    resource = self.getResource("cloudysunny14.html")
    static_content = resource.read()
    static_content_length = len(static_content)
    self.setReturnValue(content=static_content,
                        headers={"Content-Length": static_content_length,
                                 "Content-Type": "text/html"})
    simple_http_fetcher = fetchers.SimpleHttpFetcher(
        1, fetcher_policy_yaml.fetcher_policy)
    url = "http://static_resource/cloudysunny14.html"
    result = simple_http_fetcher.get(url)
    header = result.get("headers")
    content_type = header["Content-Type"]
    # IDIOM FIX: use the dedicated None assertion instead of "!= None"
    # (equality with None should be an identity check), and assertEqual
    # instead of the deprecated assertEquals alias.
    self.assertIsNotNone(content_type)
    self.assertEqual("text/html", content_type)
def _fetchMap(binary_record):
    """Map function of create fetch result.

    Fetches the given url (if permitted) and stores a FetchedDbDatum under
    the url's CrawlDbDatum, then updates the crawl datums' last status.

    Arg:
        binary_record: serialized KeyValue proto; key is the url to fetch,
            value is a boolean string saying whether it may be fetched.

    Yields:
        "<url>\\n" when the fetch succeeded, otherwise the empty string.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    url = proto.key()
    could_fetch = _str2bool(proto.value())
    result = UNFETCHED
    fetched_url = ""
    fetch_date = None
    # BUG FIX: initialize the future so the status-update block below cannot
    # hit a NameError when the query construction itself fails.
    crawl_db_datum_future = None
    #Fetch to CrawlDbDatum
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # BUG FIX: e.message is Python-2-only; str(e) is always safe.
        logging.warning("Failed create key, caused by invalid url:" + url +
                        ":" + str(e))
        could_fetch = False
    if could_fetch:
        #start fetch
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(url)
            if fetch_result:
                #Storing to datastore
                crawl_db_datums = crawl_db_datum_future.get_result()
                fetche_datum = FetchedDbDatum(
                    parent=crawl_db_datums[0].key,
                    url=url,
                    fetched_url=fetch_result.get("fetched_url"),
                    fetch_time=fetch_result.get("time"),
                    fetched_content=fetch_result.get("content"),
                    content_type=fetch_result.get("mime_type"),
                    # BUG FIX: content_size was copy-pasted from the
                    # response-rate field; use the actual content length.
                    content_size=fetch_result.get("content_length"),
                    response_rate=fetch_result.get("read_rate"),
                    http_headers=str(fetch_result.get("headers")))
                fetche_datum.put()
                #update time of last fetched
                result = FETCHED
                fetch_date = datetime.datetime.now()
                fetched_url = ("%s\n" % url)
        except Exception as e:
            logging.warning("Fetch Page Error Occurs:" + str(e))
            result = FAILED
    else:
        result = FAILED
    #Update status to all datums.
    if crawl_db_datum_future is not None:
        crawl_db_datums = crawl_db_datum_future.get_result()
        for datum in crawl_db_datums:
            datum.last_status = result
            datum.last_fetched = fetch_date
        ndb.put_multi(crawl_db_datums)
    yield fetched_url