class ContextResolverTest(unittest.TestCase):
    """Unit tests for ``ContextResolver``.

    The fixture elements are looked up in the module-level ``html`` sample
    (defined elsewhere in this module) after it has been passed through
    ``ContentCleaner().clean_content``.
    """

    def setUp(self):
        """Create a resolver and locate the two fixture elements."""
        self.cr = ContextResolver()
        self.soup = ContentCleaner().clean_content(html)
        # The tests work on the parents of the matched <td> cells.
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        """No per-test cleanup is needed."""
        pass

    def test_get_context(self):
        """The context of ``element01`` maps u'Field 01:' to 1."""
        context = self.cr.get_context(self.element01)
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        """The context of ``element02`` maps u'Field 03' and u'33' to 1."""
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        """Merging adds the values of shared keys and keeps the others."""
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        # u'Field 01:' is in both inputs, so 1 + 3 == 4 is expected.
        self.assertEqual(merged, {
            u'Field 02:': 1,
            u'Field 01:': 4,
            u'Field 03:': 4,
        })

    def test_clean_context(self):
        """Only 'a' and 'b' survive cleaning of the sample context.

        NOTE(review): presumably 'c' is dropped for its low value and the
        long key for its length -- confirm against
        ``ContextResolver.clean_context``.
        """
        context = {
            'a': 2,
            'b': 3,
            'c': 1,
            'this string is quite long. yes indeed': 4,
        }
        result = self.cr.clean_context(context)
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        """``get_top_strings(context, 3)`` returns the 3 highest-valued keys.

        The keys are expected in descending order of their values.
        """
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        """Exercise ``check_context`` with three pairs of contexts."""
        # The two contexts share the key 'a': a truthy result is expected.
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}
        context02 = {'a': 1, 'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)

        # No key in common: a falsy result is expected.
        context02 = {'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertFalse(result)

        # An empty first context: a truthy result is expected.
        context01 = {}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
class ContextResolverTest(unittest.TestCase):
    """Unit tests for ``ContextResolver``.

    The fixture elements are looked up in the module-level ``html`` sample
    (defined elsewhere in this module) after it has been passed through
    ``ContentCleaner().clean_content``.
    """

    def setUp(self):
        """Create a resolver and locate the two fixture elements."""
        self.cr = ContextResolver()
        self.soup = ContentCleaner().clean_content(html)
        # The tests work on the parents of the matched <td> cells.
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        """No per-test cleanup is needed."""
        pass

    def test_get_context(self):
        """The context of ``element01`` maps u'Field 01:' to 1."""
        context = self.cr.get_context(self.element01)
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        """The context of ``element02`` maps u'Field 03' and u'33' to 1."""
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        """Merging adds the values of shared keys and keeps the others."""
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        # u'Field 01:' is in both inputs, so 1 + 3 == 4 is expected.
        expected = {u'Field 01:': 4, u'Field 02:': 1, u'Field 03:': 4}
        self.assertEqual(merged, expected)

    def test_clean_context(self):
        """Only 'a' and 'b' survive cleaning of the sample context.

        NOTE(review): presumably 'c' is dropped for its low value and the
        long key for its length -- confirm against
        ``ContextResolver.clean_context``.
        """
        context = {
            'a': 2,
            'b': 3,
            'c': 1,
            'this string is quite long. yes indeed': 4,
        }
        result = self.cr.clean_context(context)
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        """``get_top_strings(context, 3)`` returns the 3 highest-valued keys.

        The keys are expected in descending order of their values.
        """
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        """Exercise ``check_context`` with three pairs of contexts."""
        # The two contexts share the key 'a': a truthy result is expected.
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}
        context02 = {'a': 1, 'x': 3}
        self.assertTrue(self.cr.check_context(context01, context02))

        # No key in common: a falsy result is expected.
        context02 = {'x': 3}
        self.assertFalse(self.cr.check_context(context01, context02))

        # An empty first context: a truthy result is expected.
        context01 = {}
        self.assertTrue(self.cr.check_context(context01, context02))
Beispiel #3
0
 def _choose_element(self, elements):
     """Collect the text of the elements that pass both selection checks.

     An element is kept when ``check_context`` accepts its context against
     ``self.context`` and ``self.value_guide`` is found (via ``re.search``)
     in the element's joined text.

     Args:
         elements: iterable of elements exposing ``findAll``
             (presumably BeautifulSoup tags -- confirm with the caller).

     Returns:
         A list with the joined text of every accepted element, in the
         order the elements were given.
     """
     resolver = ContextResolver()
     accepted = []
     for candidate in elements:
         # First gate: the candidate's context must be accepted against
         # the context stored on this object.
         candidate_context = resolver.get_context(candidate)
         if resolver.check_context(self.context, candidate_context):
             # Second gate: the value guide pattern must occur somewhere
             # in the concatenated findAll() results.
             pieces = candidate.findAll(name=True, text=True)
             joined = ''.join(pieces)
             if re.search(self.value_guide, joined):
                 accepted.append(joined)
     return accepted