Beispiel #1
0
 def _rule_element(self, example, element):
     """
     Build a PathRule for the parent of the given element.

     The pattern is the path returned by self._get_element_path for
     example.content and element.parent, with the context that
     self.context_resolver yields for that same parent inserted at
     position 0.

     Returns the new PathRule, or None (after logging a warning) when any
     of those steps raises an exception.
     """
     try:
         pattern = self._get_element_path(example.content, element.parent)
         context = self.context_resolver.get_context(element.parent)
         pattern.insert(0, context)
         return PathRule(pattern)
     except Exception as e:
         # 'except ... as e' is valid on Python 2.6+ and Python 3; the old
         # comma form is a syntax error on Python 3.
         # Best effort: a failure to rule one element is reported, not raised.
         log.warn('Path ruler cannot rule element %s: %s' #@UndefinedVariable
                  % (str(element), e))
         return None
 def _get_new_example_set(self, rule, example_set):
     """
     Return a list of examples with the same value attribute as example_set
     but where the content is the result of applying rule.
     """
     new_example_set = []
     for example in example_set:
         value = example.value
         content = rule.apply(example.content)
         if value and content:
             new_example_set.append(Example(value, content))
         else:
             log.warn('Example content is None after applying rule')  #@UndefinedVariable
     return new_example_set
 def _do_citeseerx(self, source, page):
     """
     Looks for the text of the reference inside a CiteSeerX page.

     The page is searched for a 'div' element of class 'content' whose text
     contains an '@' followed by word characters and an opening brace; all
     the text nodes found under the parent of that match are joined into
     the reference string.

     Intended to return a tuple with the full reference and its format.
     When the reference cannot be located, a warning is logged and
     (None, None) is returned.
     """
     log.info('Using CiteSeerX reference wrapper') #@UndefinedVariable
     ref = (None, None)

     try:
         # Raw string: same pattern as before, without relying on '\w'
         # being passed through as an unrecognised escape sequence.
         ref_element = page.find('div', {'class':'content'},
                                 text=re.compile(r'@\w*{'))
         ref_element = ref_element.parent.findAll(text=True)
         reference = ''.join(ref_element)
     except Exception as e:
         # 'except ... as e' is valid on Python 2.6+ and Python 3; the old
         # comma form is a syntax error on Python 3.
         log.warn('Could not find reference in citeseerx page: %s' % e) #@UndefinedVariable
         return ref
     # NOTE(review): as visible here, the success path falls off the end and
     # returns None rather than the (reference, format) tuple the docstring
     # promises; the tail of this method may be missing from this view.
     # Confirm before relying on the return value.
 def run(self):
     """
     Collect a formatted reference entry for each item in self.items.

     self.formatted_references is reset to an empty list and then filled,
     in item order, with either:
       - the item's existing reference_entry, when it already has one, or
       - the entry returned by self.reference_formatter.format_reference
         for the id of the item's first extracted reference; that entry is
         also stored on the item as reference_entry.

     Items with no extracted references are skipped with a warning, and
     nothing is appended when the formatter returns a falsy entry.
     """
     self.formatted_references = []
     for item in self.items:
         references = item.extraction.references
         if not references:
             log.warn('Item has no references') #@UndefinedVariable
             continue

         existing_entry = item.reference_entry
         if existing_entry:
             # Already formatted earlier: reuse it instead of formatting again.
             self.formatted_references.append(existing_entry)
             continue

         reference_id = references[0].id
         log.debug('Formatting reference with id %d' % reference_id) #@UndefinedVariable
         entry = self.reference_formatter.format_reference(reference_id)
         if not entry:
             continue
         log.debug('Reference with id %d formatted' % reference_id) #@UndefinedVariable
         self.formatted_references.append(entry)
         item.reference_entry = entry
 def train(self, examples):
     """
     Build one wrapper for every rule set that covers the given examples.

     The examples' content must match the input type expected by the first
     ruler: if that ruler works on strings, each example's content
     attribute has to be a string.

     Returns the list of generated (and evaluated, but not pruned) wrappers.
     When fewer than self.num_examples examples are supplied, a warning is
     logged and an empty list is returned.
     """
     if len(examples) < self.num_examples:
         log.warn('Too few examples. Could not train wrappers') #@UndefinedVariable
         return []

     trained = []
     # The rulers are passed as a fresh list, not the attribute itself.
     for rules in self._get_rule_sets(list(self.rulers), examples):
         candidate = Wrapper(rules=rules)
         self._evaluate_wrapper(candidate, examples)
         trained.append(candidate)

     log.debug('Trainer generated %d wrappers (not prunned)' % #@UndefinedVariable
               len(trained))
     return trained
Beispiel #6
0
 def _rule_example_content(self, value, content):
     try:
         text = content.strip()
     except Exception, e:
         log.warn('Error stripping %s: %s' % (str(content)[:40], e)) #@UndefinedVariable
         return None