Esempio n. 1
0
 def getCustomFetcherPolicy(self, file_name):
     """Load and parse a fetcher policy YAML file from the resource directory.

     Args:
       file_name: Name of the YAML file inside the "resource" directory
         located next to this module.

     Returns:
       The value returned by configuration.parse_fetcher_policy_yaml for
       the opened file.
     """
     yaml_path = os.path.join(os.path.dirname(__file__), "resource",
                              file_name)
     # Parse inside the with-block so the file handle is closed
     # deterministically instead of being left to the garbage collector.
     with open(yaml_path) as yaml_file:
         return configuration.parse_fetcher_policy_yaml(yaml_file)
 def testToDict(self):
     """Tests converting a parsed fetcher policy document to a dict.

     Parses an inline YAML document containing two max_content_size
     entries and checks that FetcherPolicyYaml.to_dict returns the
     matching dict, with every scalar value expected as a string.
     """
     fp_yaml = configuration.parse_fetcher_policy_yaml(
         "fetcher_policy:\n"
         "  agent_name: test\n"
         "  email_address: test@example.com\n"
         "  web_address: http://test.domain.com\n"
         "  min_response_rate: 0\n"
         "  max_content_size:\n"
         "  - content_type: default\n"
         "    size: 1000\n"
         "  - content_type: image/png\n"
         "    size: 5000\n"
         "  crawl_end_time: 15\n"
         "  crawl_delay: 0\n"
         "  max_redirects: 20\n"
         "  accept_language: en-us,en-gb,en;q=0.7,*;q=0.3\n"
         "  valid_mime_types: text/html\n"
         "  redirect_mode: follow_all\n"
         "  request_timeout: 20\n")
     all_configs = configuration.FetcherPolicyYaml.to_dict(fp_yaml)
     # The expected email must be the same address as in the YAML input
     # above; otherwise the comparison can never succeed.
     expected = {
         'agent_name': "test",
         'email_address': "test@example.com",
         'web_address': "http://test.domain.com",
         'min_response_rate': "0",
         'max_content_size': [
             {"content_type": "default", "size": "1000"},
             {"content_type": "image/png", "size": "5000"},
         ],
         'crawl_end_time': "15",
         'crawl_delay': "0",
         'max_redirects': "20",
         'accept_language': "en-us,en-gb,en;q=0.7,*;q=0.3",
         'valid_mime_types': "text/html",
         'redirect_mode': "follow_all",
         'request_timeout': "20",
     }
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(expected, all_configs)
    def testParse(self):
        """Tests parsing a single fetcher_policy YAML document.

        Parses an inline YAML document and checks each attribute of the
        resulting fetcher_policy object against the value in the input.
        Scalar values are expected as strings.
        """
        fetcher_policy_yaml = configuration.parse_fetcher_policy_yaml(
            "fetcher_policy:\n"
            "  agent_name: test\n"
            "  email_address: test@example.com\n"
            "  web_address: http://test.domain.com\n"
            "  min_response_rate: 0\n"
            "  max_content_size:\n"
            "  - content_type: default\n"
            "    size: 1000\n"
            "  crawl_end_time: 15000\n"
            "  crawl_delay: 0\n"
            "  max_redirects: 20\n"
            "  accept_language: en-us,en-gb,en;q=0.7,*;q=0.3\n"
            "  valid_mime_types: text/html\n"
            "  redirect_mode: follow_all\n"
            "  request_timeout: 20000\n")

        self.assertTrue(fetcher_policy_yaml)
        policy = fetcher_policy_yaml.fetcher_policy

        # These comparisons use assertEqual: assertTrue(expected, actual)
        # treats the second argument as the failure message and therefore
        # passes for any non-empty expected value.
        self.assertEqual("test", policy.agent_name)
        self.assertEqual("test@example.com", policy.email_address)
        self.assertEqual("http://test.domain.com", policy.web_address)
        self.assertEqual("0", policy.min_response_rate)

        self.assertTrue(policy.max_content_size)
        max_content_sizes = policy.max_content_size
        self.assertEqual(1, len(max_content_sizes))
        max_content_size = max_content_sizes[0]
        self.assertEqual("default", max_content_size.content_type)
        self.assertEqual("1000", max_content_size.size)

        self.assertEqual("15000", policy.crawl_end_time)
        self.assertEqual("0", policy.crawl_delay)
        self.assertEqual("20", policy.max_redirects)
        self.assertEqual("en-us,en-gb,en;q=0.7,*;q=0.3",
                         policy.accept_language)
        self.assertEqual("text/html", policy.valid_mime_types)
        self.assertEqual("follow_all", policy.redirect_mode)
        self.assertEqual("20000", policy.request_timeout)
Esempio n. 4
0
 def testToDict(self):
   """Tests converting a parsed fetcher policy document to a dict.

   Parses an inline YAML document containing two max_content_size entries
   and checks that FetcherPolicyYaml.to_dict returns the matching dict,
   with every scalar value expected as a string.
   """
   fp_yaml = configuration.parse_fetcher_policy_yaml(
       "fetcher_policy:\n"
       "  agent_name: test\n"
       "  email_address: test@example.com\n"
       "  web_address: http://test.domain.com\n"
       "  min_response_rate: 0\n"
       "  max_content_size:\n"
       "  - content_type: default\n"
       "    size: 1000\n"
       "  - content_type: image/png\n"
       "    size: 5000\n"
       "  crawl_end_time: 15\n"
       "  crawl_delay: 0\n"
       "  max_redirects: 20\n"
       "  accept_language: en-us,en-gb,en;q=0.7,*;q=0.3\n"
       "  valid_mime_types: text/html\n"
       "  redirect_mode: follow_all\n"
       "  request_timeout: 20\n")
   all_configs = configuration.FetcherPolicyYaml.to_dict(fp_yaml)
   # The expected email must be the same address as in the YAML input above;
   # otherwise the comparison can never succeed.
   expected = {
       'agent_name': "test",
       'email_address': "test@example.com",
       'web_address': "http://test.domain.com",
       'min_response_rate': "0",
       'max_content_size': [{"content_type": "default",
                             "size": "1000"},
                            {"content_type": "image/png",
                             "size": "5000"}],
       'crawl_end_time': "15",
       'crawl_delay': "0",
       'max_redirects': "20",
       'accept_language': "en-us,en-gb,en;q=0.7,*;q=0.3",
       'valid_mime_types': "text/html",
       'redirect_mode': "follow_all",
       'request_timeout': "20"
   }
   # assertEqual replaces the deprecated assertEquals alias.
   self.assertEqual(expected, all_configs)
Esempio n. 5
0
  def testParse(self):
    """Tests parsing a single fetcher_policy YAML document.

    Parses an inline YAML document and checks each attribute of the
    resulting fetcher_policy object against the value in the input.
    Scalar values are expected as strings.
    """
    fetcher_policy_yaml = configuration.parse_fetcher_policy_yaml(
        "fetcher_policy:\n"
        "  agent_name: test\n"
        "  email_address: test@example.com\n"
        "  web_address: http://test.domain.com\n"
        "  min_response_rate: 0\n"
        "  max_content_size:\n"
        "  - content_type: default\n"
        "    size: 1000\n"
        "  crawl_end_time: 15000\n"
        "  crawl_delay: 0\n"
        "  max_redirects: 20\n"
        "  accept_language: en-us,en-gb,en;q=0.7,*;q=0.3\n"
        "  valid_mime_types: text/html\n"
        "  redirect_mode: follow_all\n"
        "  request_timeout: 20000\n")

    self.assertTrue(fetcher_policy_yaml)
    policy = fetcher_policy_yaml.fetcher_policy

    # These comparisons use assertEqual: assertTrue(expected, actual) treats
    # the second argument as the failure message and therefore passes for
    # any non-empty expected value.
    self.assertEqual("test", policy.agent_name)
    self.assertEqual("test@example.com", policy.email_address)
    self.assertEqual("http://test.domain.com", policy.web_address)
    self.assertEqual("0", policy.min_response_rate)

    self.assertTrue(policy.max_content_size)
    max_content_sizes = policy.max_content_size
    self.assertEqual(1, len(max_content_sizes))
    max_content_size = max_content_sizes[0]
    self.assertEqual("default", max_content_size.content_type)
    self.assertEqual("1000", max_content_size.size)

    self.assertEqual("15000", policy.crawl_end_time)
    self.assertEqual("0", policy.crawl_delay)
    self.assertEqual("20", policy.max_redirects)
    self.assertEqual("en-us,en-gb,en;q=0.7,*;q=0.3", policy.accept_language)
    self.assertEqual("text/html", policy.valid_mime_types)
    self.assertEqual("follow_all", policy.redirect_mode)
    self.assertEqual("20000", policy.request_timeout)
Esempio n. 6
0
 def createDefaultFetcherPolicy(cls):
   """Load and parse the default fetcher policy YAML file.

   Reads "fetcher_policy.yaml" from the "resource" directory located next
   to this module.

   Returns:
     The value returned by configuration.parse_fetcher_policy_yaml for the
     opened file.
   """
   path = os.path.join(os.path.dirname(__file__), "resource",
                       "fetcher_policy.yaml")
   # Parse inside the with-block so the file handle is closed
   # deterministically instead of being left to the garbage collector.
   with open(path) as yaml_file:
     return configuration.parse_fetcher_policy_yaml(yaml_file)
Esempio n. 7
0
 def getCustomFetcherPolicy(self, file_name):
   """Load and parse a fetcher policy YAML file from the resource directory.

   Args:
     file_name: Name of the YAML file inside the "resource" directory
       located next to this module.

   Returns:
     The value returned by configuration.parse_fetcher_policy_yaml for the
     opened file.
   """
   yaml_path = os.path.join(os.path.dirname(__file__), "resource", file_name)
   # Parse inside the with-block so the file handle is closed
   # deterministically instead of being left to the garbage collector.
   with open(yaml_path) as yaml_file:
     return configuration.parse_fetcher_policy_yaml(yaml_file)