def _rule_element(self, example, element):
    """
    Build a PathRule for the parent of element within the example content.

    The path pattern is obtained from
    self._get_element_path(example.content, element.parent). The context
    returned by self.context_resolver.get_context(element.parent) is then
    inserted at the front of that pattern, and the result is wrapped in a
    PathRule.

    Returns the new PathRule, or None when any of those steps raises. The
    failure is logged as a warning and is not propagated to the caller.
    """
    try:
        pattern = self._get_element_path(example.content, element.parent)
        context = self.context_resolver.get_context(element.parent)
        pattern.insert(0, context)
        return PathRule(pattern)
    except Exception as e:
        # 'as' binds the exception on Python 2.6+ and Python 3 alike; the
        # older comma form is a SyntaxError on Python 3.
        log.warn('Path ruler cannot rule element %s: %s' #@UndefinedVariable
                 % (str(element), e))
        return None
def _get_new_example_set(self, rule, example_set): """ Return a list of examples with the same value attribute as example_set but where the content is the result of applying rule. """ new_example_set = [] for example in example_set: value = example.value content = rule.apply(example.content) if value and content: new_example_set.append(Example(value, content)) else: log.warn('Example content is None after applying rule') #@UndefinedVariable return new_example_set
def _do_citeseerx(self, source, page):
    """
    Look for a reference embedded in a CiteSeerX page.

    The page is searched for a 'div' of class 'content' whose text matches
    the pattern '@\\w*{' (an '@', optional word characters, then '{'). All
    text nodes of the parent of the match are joined into one string.

    Returns a (reference, format) tuple. If the lookup raises, a warning is
    logged and (None, None) is returned.

    NOTE(review): in the code visible here the joined text is never
    returned, so the result is (None, None) on every path. The method
    looks truncated -- confirm against the full source.
    """
    log.info('Using CiteSeerX reference wrapper') #@UndefinedVariable
    ref = (None, None)
    try:
        # Raw string: '\w' must reach the regex engine untouched, and the
        # non-raw spelling is an invalid string escape on modern Python.
        ref_element = page.find('div', {'class':'content'},
                                text=re.compile(r'@\w*{'))
        ref_element = ref_element.parent.findAll(text=True)
        reference = ''.join(ref_element)
    except Exception as e:
        # 'as' works on Python 2.6+ and Python 3; the comma form is
        # Python 2 only.
        log.warn('Could not find reference in citeseerx page: %s' % e) #@UndefinedVariable
    return ref
def run(self):
    """
    Collect a formatted reference entry for every item in self.items.

    self.formatted_references is reset to an empty list and then filled,
    item by item:

    - an item without extraction references is skipped with a warning;
    - an item that already has a reference_entry contributes that entry;
    - otherwise the id of the item's first extraction reference is passed
      to self.reference_formatter.format_reference. A truthy result is
      appended and also stored on the item as reference_entry.
    """
    self.formatted_references = []
    for item in self.items:
        if not item.extraction.references:
            log.warn('Item has no references') #@UndefinedVariable
        elif item.reference_entry:
            # Already formatted earlier: reuse the stored entry.
            self.formatted_references.append(item.reference_entry)
        else:
            reference_id = item.extraction.references[0].id
            log.debug('Formatting reference with id %d' % reference_id) #@UndefinedVariable
            entry = self.reference_formatter.format_reference(reference_id)
            if entry:
                log.debug('Reference with id %d formatted' % reference_id) #@UndefinedVariable
                self.formatted_references.append(entry)
                item.reference_entry = entry
def train(self, examples):
    """
    Build one wrapper for each rule set that covers the given examples.

    The content of the examples has to match the input type of the first
    ruler: if that ruler expects a string, the content attribute of every
    example must be a string.

    Returns the list of evaluated wrappers, without any pruning. When
    fewer than self.num_examples examples are supplied, a warning is
    logged and an empty list is returned.
    """
    trained = []
    available = len(examples)
    if available < self.num_examples:
        log.warn('Too few examples. Could not train wrappers') #@UndefinedVariable
        return trained

    # The rulers are copied into a fresh list before being handed over.
    for rules in self._get_rule_sets(list(self.rulers), examples):
        candidate = Wrapper(rules=rules)
        self._evaluate_wrapper(candidate, examples)
        trained.append(candidate)

    log.debug('Trainer generated %d wrappers (not prunned)' % #@UndefinedVariable
              len(trained))
    return trained
def _rule_example_content(self, value, content): try: text = content.strip() except Exception, e: log.warn('Error stripping %s: %s' % (str(content)[:40], e)) #@UndefinedVariable return None