class TestElementsRegexRuler(object):  #(TestRuler):
    """Test cases for ``ElementsRegexRuler``.

    NOTE(review): the class derives from ``object`` and the ``TestRuler``
    base is commented out in the header, so ``setUp`` of a parent and the
    assertion helpers are not available as written. Presumably the suite
    was disabled on purpose -- confirm before restoring the base class.
    """

    def setUp(self):
        """Create the ruler under test and the two example fixtures."""
        self.ruler = ElementsRegexRuler()
        super(TestElementsRegexRuler, self).setUp()

        # Both fixtures use the same three name patterns; each pattern
        # accepts the two name parts in either order.
        name_patterns = [
            '(Alberto.*Angel|Angel.*Alberto)',
            '(Geurts.*Pierre|Pierre.*Geurts)',
            '(Damien.*Ernst|Ernst.*Damien)',
        ]

        # example06: some of the texts carry the extra words "The author".
        self.example06 = Example(
            list(name_patterns),
            ['The author Alberto Del Angel',
             'Pierre Geurts The author',
             'Damien Ernst'])

        # example07: the texts contain nothing but the names.
        self.example07 = Example(
            list(name_patterns),
            ['Alberto Del Angel', 'Pierre Geurts', 'Damien Ernst'])

    def test_rule_example(self):
        """The first rule built from example06 has the expected pattern."""
        result = self.ruler._rule_example(self.example06)
        # assertEqual replaces the deprecated failUnless(a == b) and shows
        # both values when the check fails.
        self.assertEqual(result[0].pattern, u'The\\ author\\ (.*)')

    def test_rule(self):
        """Ruling over both examples yields exactly two rules."""
        rules = self.ruler.rule([self.example06, self.example07])
        self.assertEqual(len(rules), 2)
class TestElementsRegexRuler(object):  #(TestRuler):
    """Exercises ``ElementsRegexRuler`` against two hand-written examples.

    NOTE(review): ``TestRuler`` is commented out of the bases, leaving only
    ``object``; the parent ``setUp`` and the assertion methods used below
    are therefore not inherited as written. Looks like the tests were
    switched off deliberately -- verify before re-enabling.
    """

    def setUp(self):
        """Instantiate the ruler and build ``example06`` / ``example07``."""
        self.ruler = ElementsRegexRuler()
        super(TestElementsRegexRuler, self).setUp()

        # Three name patterns (either word order) paired with texts in
        # which two of the names are accompanied by "The author".
        self.example06 = Example([
            '(Alberto.*Angel|Angel.*Alberto)',
            '(Geurts.*Pierre|Pierre.*Geurts)',
            '(Damien.*Ernst|Ernst.*Damien)',
        ], [
            'The author Alberto Del Angel',
            'Pierre Geurts The author',
            'Damien Ernst',
        ])

        # Same patterns, but the texts are the bare names.
        self.example07 = Example([
            '(Alberto.*Angel|Angel.*Alberto)',
            '(Geurts.*Pierre|Pierre.*Geurts)',
            '(Damien.*Ernst|Ernst.*Damien)',
        ], [
            'Alberto Del Angel',
            'Pierre Geurts',
            'Damien Ernst',
        ])

    def test_rule_example(self):
        """Check the pattern of the first rule derived from example06."""
        result = self.ruler._rule_example(self.example06)
        # failUnless(a == b) is a deprecated alias with an uninformative
        # failure message; assertEqual performs the same == comparison.
        self.assertEqual(result[0].pattern, u'The\\ author\\ (.*)')

    def test_rule(self):
        """Check that two rules come out of the two examples."""
        rules = self.ruler.rule([self.example06, self.example07])
        self.assertEqual(len(rules), 2)
def setUp(self):
    """Create the ruler under test plus the ``example06``/``example07`` fixtures."""
    self.ruler = ElementsRegexRuler()
    super(TestElementsRegexRuler, self).setUp()

    # The same three name patterns back both fixtures; every Example gets
    # its own copy of the list so the two objects share no state.
    name_patterns = [
        '(Alberto.*Angel|Angel.*Alberto)',
        '(Geurts.*Pierre|Pierre.*Geurts)',
        '(Damien.*Ernst|Ernst.*Damien)',
    ]

    # Texts where some names are surrounded by the words "The author".
    texts_with_context = [
        'The author Alberto Del Angel',
        'Pierre Geurts The author',
        'Damien Ernst',
    ]
    self.example06 = Example(list(name_patterns), texts_with_context)

    # Texts consisting of the names only.
    bare_texts = ['Alberto Del Angel', 'Pierre Geurts', 'Damien Ernst']
    self.example07 = Example(list(name_patterns), bare_texts)
def setUp(self):
    """Prepare the ruler and two example fixtures used by the tests."""
    self.ruler = ElementsRegexRuler()
    super(TestElementsRegexRuler, self).setUp()

    def name_patterns():
        # Return a fresh list on each call so the fixtures stay independent.
        return [
            '(Alberto.*Angel|Angel.*Alberto)',
            '(Geurts.*Pierre|Pierre.*Geurts)',
            '(Damien.*Ernst|Ernst.*Damien)',
        ]

    # example06: two of the three texts include the words "The author".
    self.example06 = Example(
        name_patterns(),
        [
            'The author Alberto Del Angel',
            'Pierre Geurts The author',
            'Damien Ernst',
        ],
    )

    # example07: the texts are just the names.
    self.example07 = Example(
        name_patterns(),
        ['Alberto Del Angel', 'Pierre Geurts', 'Damien Ernst'],
    )
def generate_wrappers(self, url):
    """Train wrappers for every example set of ``url`` and persist them.

    Examples are fetched through an ``ExampleGateway`` configured from this
    object's limits. For each returned set a ``WrapperTrainer`` is built
    with rulers chosen by set name, the trained wrappers are pruned with
    ``self._prune_wrappers`` and stored through a ``WrapperGateway``.

    A failure while training, pruning or persisting one set is logged and
    does not stop the remaining sets from being processed.

    :param url: passed to the example lookup and used when persisting the
        resulting wrappers.
    :return: None
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url,
                                                self.min_validity)

    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable

        if set_name == 'author' or set_name == 'editor':
            # These two sets get the multi-value / person oriented rulers.
            rulers = [
                MultiValuePathRuler(),
                SeparatorsRegexRuler(),
                ElementsRegexRuler(),
                PersonRuler(),
            ]
        else:
            # Fall back to a match-anything guide when none is configured
            # for this set.
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[set_name])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, set_name, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
        except Exception as e:
            # Deliberately broad: one failing set must not abort the others.
            log.error('Error training wrapper for set "%s": %s'
                      % (set_name, e)) #@UndefinedVariable