def getCustomFetcherPolicy(self, file_name):
    """Load and parse a fetcher policy YAML file from the "resource" directory.

    Args:
        file_name: Name of the YAML file, relative to the "resource"
            directory located next to this module.

    Returns:
        The result of configuration.parse_fetcher_policy_yaml for the file.
    """
    yaml_path = os.path.join(os.path.dirname(__file__), "resource", file_name)
    # Use a context manager so the handle is closed instead of leaked.
    # NOTE(review): assumes the parser consumes the stream before it
    # returns -- confirm against configuration.parse_fetcher_policy_yaml.
    with open(yaml_path) as yaml_file:
        return configuration.parse_fetcher_policy_yaml(yaml_file)
def testToDict(self):
    """Check FetcherPolicyYaml.to_dict on a parsed fetcher policy document.

    The expected result is a flat dict of the policy fields with string
    values, and max_content_size as a list of content_type/size dicts.
    """
    # The e-mail literal must be identical in the input and the expectation.
    # Each "size" line is indented under its "- content_type" entry so that
    # it belongs to that list item rather than to the enclosing mapping.
    fp_yaml = configuration.parse_fetcher_policy_yaml(
        "fetcher_policy:\n"
        " agent_name: test\n"
        " email_address: test@domain.com\n"
        " web_address: http://test.domain.com\n"
        " min_response_rate: 0\n"
        " max_content_size:\n"
        " - content_type: default\n"
        "   size: 1000\n"
        " - content_type: image/png\n"
        "   size: 5000\n"
        " crawl_end_time: 15\n"
        " crawl_delay: 0\n"
        " max_redirects: 20\n"
        " accept_language: en-us,en-gb,en;q=0.7,*;q=0.3\n"
        " valid_mime_types: text/html\n"
        " redirect_mode: follow_all\n"
        " request_timeout: 20\n")
    all_configs = configuration.FetcherPolicyYaml.to_dict(fp_yaml)
    expected = {
        'agent_name': "test",
        'email_address': "test@domain.com",
        'web_address': "http://test.domain.com",
        'min_response_rate': "0",
        'max_content_size': [
            {"content_type": "default", "size": "1000"},
            {"content_type": "image/png", "size": "5000"},
        ],
        'crawl_end_time': "15",
        'crawl_delay': "0",
        'max_redirects': "20",
        'accept_language': "en-us,en-gb,en;q=0.7,*;q=0.3",
        'valid_mime_types': "text/html",
        'redirect_mode': "follow_all",
        'request_timeout': "20",
    }
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, all_configs)
def testParse(self):
    """Parse a single fetcher_policy document and verify each parsed field."""
    # "size" is indented under its "- content_type" entry so that it belongs
    # to that list item rather than to the enclosing mapping.
    fetcher_policy_yaml = configuration.parse_fetcher_policy_yaml(
        "fetcher_policy:\n"
        " agent_name: test\n"
        " email_address: test@domain.com\n"
        " web_address: http://test.domain.com\n"
        " min_response_rate: 0\n"
        " max_content_size:\n"
        " - content_type: default\n"
        "   size: 1000\n"
        " crawl_end_time: 15000\n"
        " crawl_delay: 0\n"
        " max_redirects: 20\n"
        " accept_language: en-us,en-gb,en;q=0.7,*;q=0.3\n"
        " valid_mime_types: text/html\n"
        " redirect_mode: follow_all\n"
        " request_timeout: 20000\n")
    self.assertTrue(fetcher_policy_yaml)
    policy = fetcher_policy_yaml.fetcher_policy

    # These comparisons use assertEqual: assertTrue(expected, actual) only
    # checks that "expected" is truthy and uses "actual" as the failure
    # message, so it can never fail on a wrong value.
    self.assertEqual("test", policy.agent_name)
    self.assertEqual("test@domain.com", policy.email_address)
    self.assertEqual("http://test.domain.com", policy.web_address)
    self.assertEqual("0", policy.min_response_rate)

    self.assertTrue(policy.max_content_size)
    max_content_sizes = policy.max_content_size
    self.assertEqual(1, len(max_content_sizes))
    max_content_size = max_content_sizes[0]
    self.assertEqual("default", max_content_size.content_type)
    self.assertEqual("1000", max_content_size.size)

    self.assertEqual("15000", policy.crawl_end_time)
    self.assertEqual("0", policy.crawl_delay)
    self.assertEqual("20", policy.max_redirects)
    self.assertEqual("en-us,en-gb,en;q=0.7,*;q=0.3", policy.accept_language)
    self.assertEqual("text/html", policy.valid_mime_types)
    self.assertEqual("follow_all", policy.redirect_mode)
    self.assertEqual("20000", policy.request_timeout)
def testToDict(self):
    """Check FetcherPolicyYaml.to_dict on a parsed fetcher policy document.

    The expected result is a flat dict of the policy fields with string
    values, and max_content_size as a list of content_type/size dicts.
    """
    # The e-mail literal must be identical in the input and the expectation.
    # Each "size" line is indented under its "- content_type" entry so that
    # it belongs to that list item rather than to the enclosing mapping.
    fp_yaml = configuration.parse_fetcher_policy_yaml(
        "fetcher_policy:\n"
        " agent_name: test\n"
        " email_address: test@domain.com\n"
        " web_address: http://test.domain.com\n"
        " min_response_rate: 0\n"
        " max_content_size:\n"
        " - content_type: default\n"
        "   size: 1000\n"
        " - content_type: image/png\n"
        "   size: 5000\n"
        " crawl_end_time: 15\n"
        " crawl_delay: 0\n"
        " max_redirects: 20\n"
        " accept_language: en-us,en-gb,en;q=0.7,*;q=0.3\n"
        " valid_mime_types: text/html\n"
        " redirect_mode: follow_all\n"
        " request_timeout: 20\n")
    all_configs = configuration.FetcherPolicyYaml.to_dict(fp_yaml)
    expected = {
        'agent_name': "test",
        'email_address': "test@domain.com",
        'web_address': "http://test.domain.com",
        'min_response_rate': "0",
        'max_content_size': [
            {"content_type": "default", "size": "1000"},
            {"content_type": "image/png", "size": "5000"},
        ],
        'crawl_end_time': "15",
        'crawl_delay': "0",
        'max_redirects': "20",
        'accept_language': "en-us,en-gb,en;q=0.7,*;q=0.3",
        'valid_mime_types': "text/html",
        'redirect_mode': "follow_all",
        'request_timeout': "20",
    }
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, all_configs)
def createDefaultFetcherPolicy(cls):
    """Load and parse "resource/fetcher_policy.yaml" next to this module.

    Returns:
        The result of configuration.parse_fetcher_policy_yaml for the file.
    """
    path = os.path.join(os.path.dirname(__file__), "resource",
                        "fetcher_policy.yaml")
    # Use a context manager so the handle is closed instead of leaked.
    # NOTE(review): assumes the parser consumes the stream before it
    # returns -- confirm against configuration.parse_fetcher_policy_yaml.
    with open(path) as yaml_file:
        return configuration.parse_fetcher_policy_yaml(yaml_file)
def getCustomFetcherPolicy(self, file_name):
    """Load and parse a fetcher policy YAML file from the "resource" directory.

    Args:
        file_name: Name of the YAML file, relative to the "resource"
            directory located next to this module.

    Returns:
        The result of configuration.parse_fetcher_policy_yaml for the file.
    """
    yaml_path = os.path.join(os.path.dirname(__file__), "resource", file_name)
    # Use a context manager so the handle is closed instead of leaked.
    # NOTE(review): assumes the parser consumes the stream before it
    # returns -- confirm against configuration.parse_fetcher_policy_yaml.
    with open(yaml_path) as yaml_file:
        return configuration.parse_fetcher_policy_yaml(yaml_file)