class TestMultiValuePathRuler(TestRuler):
    """Tests for MultiValuePathRuler, built on the TestRuler fixture."""

    def setUp(self):
        """Create the ruler under test, run the parent setup, build examples.

        Each Example is built from a list of two regex patterns plus a soup.
        Every pattern accepts the two name parts in either order.
        """
        self.ruler = MultiValuePathRuler()
        super(TestMultiValuePathRuler, self).setUp()
        # self.soup01..soup03 are presumably created by TestRuler.setUp --
        # TODO confirm.
        # NOTE(review): 'Solona' vs 'Solsona' differ between the two
        # alternatives of the second pattern; confirm whether this is
        # intentional before touching it (test_rule_example expects exactly
        # one rule from this example).
        self.example06 = Example(
            [
                r'.*(Botella.*P\.|P\..*Botella).*',
                r'.*(Solona.*B\.|B\..*Solsona).*',
            ],
            self.soup03)
        self.example07 = Example(
            [
                r'.*(Alberto.*Angel|Angel.*Alberto).*',
                r'.*(Geurts.*Pierre|Pierre.*Geurts).*',
            ],
            self.soup01)
        self.example08 = Example(
            [
                r'.*(Michael.*Sweredoski|Sweredoski.*Michael).*',
                r'.*(Pierre.*Baldi|Baldi.*Pierre).*',
            ],
            self.soup02)

    def test_rule_example(self):
        # example06 is expected to yield a single rule, example07 two.
        rules = self.ruler._rule_example(self.example06)
        self.assertEqual(len(rules), 1)
        rules = self.ruler._rule_example(self.example07)
        self.assertEqual(len(rules), 2)

    def test_rule(self):
        # Two examples together are expected to yield two rules; applying the
        # first rule to soup01 is expected to give five results.
        rules = self.ruler.rule([self.example07, self.example08])
        self.assertEqual(len(rules), 2)
        result = rules[0].apply(self.soup01)
        self.assertEqual(len(result), 5)
class TestMultiValuePathRuler(TestRuler):
    """Tests for MultiValuePathRuler, built on the TestRuler fixture."""

    def setUp(self):
        """Create the ruler under test, run the parent setup, build examples.

        Each Example is built from a list of two regex patterns plus a soup.
        Every pattern accepts the two name parts in either order.
        """
        self.ruler = MultiValuePathRuler()
        super(TestMultiValuePathRuler, self).setUp()
        # self.soup01..soup03 are presumably created by TestRuler.setUp --
        # TODO confirm.
        # NOTE(review): 'Solona' vs 'Solsona' differ between the two
        # alternatives of the second pattern; confirm whether this is
        # intentional before touching it (test_rule_example expects exactly
        # one rule from this example).
        self.example06 = Example(
            [
                r'.*(Botella.*P\.|P\..*Botella).*',
                r'.*(Solona.*B\.|B\..*Solsona).*',
            ],
            self.soup03)
        self.example07 = Example(
            [
                r'.*(Alberto.*Angel|Angel.*Alberto).*',
                r'.*(Geurts.*Pierre|Pierre.*Geurts).*',
            ],
            self.soup01)
        self.example08 = Example(
            [
                r'.*(Michael.*Sweredoski|Sweredoski.*Michael).*',
                r'.*(Pierre.*Baldi|Baldi.*Pierre).*',
            ],
            self.soup02)

    def test_rule_example(self):
        # example06 is expected to yield a single rule, example07 two.
        rules = self.ruler._rule_example(self.example06)
        self.assertEqual(len(rules), 1)
        rules = self.ruler._rule_example(self.example07)
        self.assertEqual(len(rules), 2)

    def test_rule(self):
        # Two examples together are expected to yield two rules; applying the
        # first rule to soup01 is expected to give five results.
        rules = self.ruler.rule([self.example07, self.example08])
        self.assertEqual(len(rules), 2)
        result = rules[0].apply(self.soup01)
        self.assertEqual(len(result), 5)
def setUp(self):
    """Create the ruler under test, run the parent setup, build examples.

    Each Example is built from a list of two regex patterns plus a soup.
    Every pattern accepts the two name parts in either order.
    """
    self.ruler = MultiValuePathRuler()
    super(TestMultiValuePathRuler, self).setUp()
    # self.soup01..soup03 are presumably created by the parent setUp --
    # TODO confirm.
    # NOTE(review): 'Solona' vs 'Solsona' differ between the two alternatives
    # of the second pattern; confirm whether this is intentional before
    # changing it, since tests may depend on the current matching behaviour.
    # Raw strings keep the regex escapes (\.) intact without relying on
    # Python passing unknown string escapes through unchanged.
    self.example06 = Example(
        [
            r'.*(Botella.*P\.|P\..*Botella).*',
            r'.*(Solona.*B\.|B\..*Solsona).*',
        ],
        self.soup03)
    self.example07 = Example(
        [
            r'.*(Alberto.*Angel|Angel.*Alberto).*',
            r'.*(Geurts.*Pierre|Pierre.*Geurts).*',
        ],
        self.soup01)
    self.example08 = Example(
        [
            r'.*(Michael.*Sweredoski|Sweredoski.*Michael).*',
            r'.*(Pierre.*Baldi|Baldi.*Pierre).*',
        ],
        self.soup02)
def setUp(self):
    """Create the ruler under test, run the parent setup, build examples.

    Each Example is built from a list of two regex patterns plus a soup.
    Every pattern accepts the two name parts in either order.
    """
    self.ruler = MultiValuePathRuler()
    super(TestMultiValuePathRuler, self).setUp()
    # self.soup01..soup03 are presumably created by the parent setUp --
    # TODO confirm.
    # NOTE(review): 'Solona' vs 'Solsona' differ between the two alternatives
    # of the second pattern; confirm whether this is intentional before
    # changing it, since tests may depend on the current matching behaviour.
    # Raw strings keep the regex escapes (\.) intact without relying on
    # Python passing unknown string escapes through unchanged.
    self.example06 = Example(
        [
            r'.*(Botella.*P\.|P\..*Botella).*',
            r'.*(Solona.*B\.|B\..*Solsona).*',
        ],
        self.soup03)
    self.example07 = Example(
        [
            r'.*(Alberto.*Angel|Angel.*Alberto).*',
            r'.*(Geurts.*Pierre|Pierre.*Geurts).*',
        ],
        self.soup01)
    self.example08 = Example(
        [
            r'.*(Michael.*Sweredoski|Sweredoski.*Michael).*',
            r'.*(Pierre.*Baldi|Baldi.*Pierre).*',
        ],
        self.soup02)
def generate_wrappers(self, url):
    """Train and persist wrappers for every example set obtained for `url`.

    Examples are fetched through an ExampleGateway configured from this
    object's settings. For each example set a WrapperTrainer is built with
    rulers chosen by set name, and the trained wrappers are pruned and then
    persisted through a WrapperGateway.

    Any exception raised while training, pruning or persisting one set is
    logged and the remaining sets are still processed.

    :param url: passed to the example gateway when fetching examples and to
        the wrapper gateway when persisting the resulting wrappers.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url,
                                                self.min_validity)

    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable

        if set_name in ('author', 'editor'):
            # These sets get the multi-value and person-oriented rulers.
            rulers = [
                MultiValuePathRuler(),
                SeparatorsRegexRuler(),
                ElementsRegexRuler(),
                PersonRuler(),
            ]
        else:
            # Fall back to a match-anything guide when none is configured
            # for this set.
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[set_name])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, set_name, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
        except Exception as e:
            # Deliberately broad: a failure in one set must not stop the
            # training of the others.
            log.error('Error training wrapper for set "%s": %s' % (set_name, e)) #@UndefinedVariable