class ContextResolverTest(unittest.TestCase):
    """Unit tests for ``ContextResolver`` run against a cleaned HTML fixture.

    The assertions use ``assertEqual``/``assertTrue``/``assertFalse`` instead
    of the deprecated ``failUnless``/``failIf`` aliases, so a failure reports
    the values that were compared.
    """

    def setUp(self):
        """Create the resolver and look up the two fixture elements."""
        self.cr = ContextResolver()
        self.soup = ContentCleaner().clean_content(html)
        # Each element is the parent of the node matched for the given text.
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        """Nothing to release; kept as an explicit no-op."""
        pass

    def test_get_context(self):
        """The context of the first element counts 'Field 01:' once."""
        context = self.cr.get_context(self.element01)
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        """The context of the second element counts 'Field 03' and '33'."""
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        """Merging adds the counts of shared keys and keeps the others."""
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        self.assertEqual(merged, {
            u'Field 02:': 1,
            u'Field 01:': 4,
            u'Field 03:': 4,
        })

    def test_clean_context(self):
        """Cleaning is expected to drop the count-1 key and the long key."""
        context = {
            'a': 2,
            'b': 3,
            'c': 1,
            'this string is quite long. yes indeed': 4,
        }
        result = self.cr.clean_context(context)
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        """The top three strings come back ordered by descending count."""
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        """Exercise check_context with shared, disjoint and empty contexts."""
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}

        # A shared key ('a') is expected to pass the check.
        context02 = {'a': 1, 'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)

        # No shared key is expected to fail the check.
        context02 = {'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertFalse(result)

        # An empty first context is expected to pass the check.
        context01 = {}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
Esempio n. 2
0
class PathRuler(Ruler):
    """
    Creates a rule described by the path to locate some piece of information
    in an HTML document

    Content of the examples must be a BeautifulSoup object that describes an
    HTML document.
    """

    def __init__(self, value_guide='.*'):
        """
        Initialise the ruler with its own ContextResolver.

        value_guide is stored unchanged and later placed at the front of
        every rule pattern produced by rule().
        """
        super(PathRuler, self).__init__()
        self.context_resolver = ContextResolver()
        self.value_guide = value_guide

    def rule(self, training):
        """
        Return the rules built by the parent class for ``training``, with
        the value guide inserted at the start of each pattern.
        """
        rules = super(PathRuler, self).rule(training)
        for rule in rules:
            rule.pattern.insert(0, self.value_guide)
            # Clean context
            # After the insert above, index 1 holds what used to be the first
            # pattern entry (the context that _rule_element puts at index 0).
            rule.pattern[1] = self.context_resolver.clean_context(
                                                            rule.pattern[1])
        return rules

    def _rule_example(self, example):
        """
        Build one rule per content element found for ``example.value`` in
        ``example.content`` and return the result of merging them.

        Elements that _rule_element cannot rule (it returns None) are skipped.
        """
        log.debug('Ruling example with PathRuler. Value %s' % #@UndefinedVariable
                  str(example.value))
        rules = []
        element_rules = []
        for element in self._get_content_elements(example.value,
                                                  example.content):
            rule = self._rule_element(example, element)
            if rule:
                element_rules.append(rule)
        # _merge_rules receives the empty ``rules`` list, which is then
        # returned; presumably it fills that list in place.
        self._merge_rules(rules, element_rules)
        return rules

    def _rule_element(self, example, element):
        """
        Return a PathRule for the parent of ``element``, or None on failure.

        The pattern is the element path inside ``example.content`` with the
        parent's context inserted at the front. Any exception is logged as a
        warning and turned into a None result (best-effort behaviour).
        """
        try:
            pattern = self._get_element_path(example.content, element.parent)
            context = self.context_resolver.get_context(element.parent)
            pattern.insert(0, context)
            return PathRule(pattern)
        # 'as' form: valid on Python 2.6+ and Python 3, unlike 'Exception, e'.
        except Exception as e:
            log.warn('Path ruler cannot rule element %s: %s' #@UndefinedVariable 
                     % (str(element), e)) 
            return None
Esempio n. 3
0
 def _choose_element(self, elements):
     """
     Return the texts of the elements accepted by both filters.

     An element is kept when check_context accepts its context against
     ``self.context`` and its joined text matches ``self.value_guide``.
     The result lists the joined text of every kept element, in input order.
     """
     resolver = ContextResolver()
     accepted = []
     for candidate in elements:
         # First filter: the field context must be compatible.
         candidate_context = resolver.get_context(candidate)
         if resolver.check_context(self.context, candidate_context):
             # Second filter: the element text must match the value guide.
             joined = ''.join(candidate.findAll(name=True, text=True))
             if re.search(self.value_guide, joined):
                 accepted.append(joined)
     return accepted
class ContextResolverTest(unittest.TestCase):
    """Tests for ``ContextResolver`` based on a cleaned HTML fixture.

    Assertions rely on ``assertEqual``/``assertTrue``/``assertFalse`` rather
    than the deprecated ``failUnless``/``failIf`` aliases, so failures show
    the compared values.
    """

    def setUp(self):
        """Create the resolver and look up the two fixture elements."""
        self.cr = ContextResolver()
        self.soup = ContentCleaner().clean_content(html)
        # Each element is the parent of the node matched for the given text.
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        """No cleanup is required; kept as an explicit no-op."""
        pass

    def test_get_context(self):
        """'Field 01:' appears once in the context of the first element."""
        context = self.cr.get_context(self.element01)
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        """'Field 03' and '33' each appear once for the second element."""
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        """Shared keys have their counts added; other keys are kept."""
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        self.assertEqual(merged, {u'Field 02:': 1, u'Field 01:': 4,
                                  u'Field 03:': 4})

    def test_clean_context(self):
        """The count-1 key and the long key are expected to be removed."""
        context = {'a': 2, 'b': 3, 'c': 1,
                   'this string is quite long. yes indeed': 4}
        result = self.cr.clean_context(context)
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        """The three highest-count strings are returned, highest first."""
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        """Check shared-key, disjoint and empty-context cases in turn."""
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}

        # Contexts sharing the key 'a' are expected to pass.
        context02 = {'a': 1, 'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)

        # Contexts without a shared key are expected to fail.
        context02 = {'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertFalse(result)

        # An empty first context is expected to pass.
        context01 = {}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
Esempio n. 5
0
 def __init__(self, value_guide='.*'):
     """
     Run the parent initialiser, then store ``value_guide`` as given and
     create a ContextResolver for this instance.
     """
     super(PathRuler, self).__init__()
     self.value_guide = value_guide
     self.context_resolver = ContextResolver()
 def setUp(self):
     """Create the resolver, clean the HTML fixture and pick two elements."""
     self.cr = ContextResolver()

     cleaner = ContentCleaner()
     self.soup = cleaner.clean_content(html)

     # Each fixture element is the parent of the node matched for the text.
     first_match = self.soup.find('td', text='Value 01')
     self.element01 = first_match.parent
     second_match = self.soup.find('td', text='Value 03')
     self.element02 = second_match.parent
 def setUp(self):
     """Prepare the resolver, the cleaned document and two test elements."""
     self.cr = ContextResolver()

     document = ContentCleaner().clean_content(html)
     self.soup = document

     # Keep the parent of whatever node is matched for each value text.
     node_for_value_01 = document.find('td', text='Value 01')
     self.element01 = node_for_value_01.parent
     node_for_value_03 = document.find('td', text='Value 03')
     self.element02 = node_for_value_03.parent