class TestSeparatorsRegexRuler(object):  # (TestRuler):
    """Tests for ``SeparatorsRegexRuler`` separator discovery and rule building.

    NOTE(review): the base class is currently ``object``; the original
    ``TestRuler`` base is commented out in the class header. The
    ``super().setUp()`` call and the ``assert*`` helpers used below are not
    provided by ``object`` -- presumably this suite is disabled on purpose;
    confirm before re-enabling.
    """

    def setUp(self):
        """Create the ruler under test and two multi-author fixtures."""
        self.ruler = SeparatorsRegexRuler()
        super(TestSeparatorsRegexRuler, self).setUp()

        # Raw strings keep the regex backslashes literal without relying on
        # Python passing unknown escapes such as '\.' through unchanged.
        # NOTE(review): 'Solona' in the second pattern looks like a typo for
        # 'Solsona'; left untouched -- confirm.
        self.example06 = Example(
            [
                r'(Botella.*P\.|P\..*Botella)',
                r'(Solona.*B\.|B\..*Solsona)',
                r'(A\..*Martinez-Arias|Martinez-Arias.*A\.)',
                r'(J\.M\..*Nieto|Nieto.*J\.M\.)',
            ],
            [
                u'P. Botella1, B. Solsona1, '
                'A. Martinez-Arias2 and J.M. '
                'Lopez Nieto1'
            ])

        # NOTE(review): the second argument here is a plain string, while
        # example06 wraps its text in a list -- confirm both are intended.
        self.example07 = Example(
            [
                r'(Cabre.*L\.|L\..*Cabre)',
                r'(Mancebo.*J\.|J\..*Mancebo)',
                r'(J\..*Solsona|Solsona.*J\.)',
            ],
            u'L. Cabre1, J. Mancebo2, J. F. Solsona3, '
            ' and the Bioethics Working '
            'Group of the SEMICYUC')

    def test_find_separators(self):
        """``_find_separators`` returns the expected escaped separators."""
        pattern = '(.*)1, (.*)1, (.*)2 and (.*)1 '
        separators = self.ruler._find_separators(pattern)
        self.assertEqual(separators, ['1\\,\\ ', '2\\ and\\ '])

    def test_merge_separators(self):
        """``_merge_separators`` combines two separator lists as expected."""
        sep01 = [u'1, ', u'2, ', u'2 and ']
        sep02 = [u'1, ', u'3, ', u'2 and ']
        expected = [u'1\\,\\ ', '2\\,\\ ', u'2\\ and\\ ', u'3\\,\\ ']
        result = self.ruler._merge_separators(sep01, sep02)
        self.assertEqual(result, expected)

    def test_rule_example(self):
        """A single example yields one rule whose pattern has two items."""
        rules = self.ruler._rule_example(self.example06)
        self.assertEqual(len(rules), 1)
        self.assertEqual(len(rules[0].pattern), 2)

    def test_rule(self):
        """Ruling over both examples yields a first rule with three items."""
        rules = self.ruler.rule([self.example06, self.example07])
        self.assertEqual(len(rules[0].pattern), 3)
class TestSeparatorsRegexRuler(object):  # (TestRuler):
    """Tests for ``SeparatorsRegexRuler`` separator discovery and rule building.

    NOTE(review): the base class is currently ``object``; the original
    ``TestRuler`` base is commented out in the class header. The
    ``super().setUp()`` call and the ``assert*`` helpers used below are not
    provided by ``object`` -- presumably this suite is disabled on purpose;
    confirm before re-enabling.
    """

    def setUp(self):
        """Create the ruler under test and two multi-author fixtures."""
        self.ruler = SeparatorsRegexRuler()
        super(TestSeparatorsRegexRuler, self).setUp()

        # Raw strings keep the regex backslashes literal without relying on
        # Python passing unknown escapes such as '\.' through unchanged.
        # NOTE(review): 'Solona' in the second pattern looks like a typo for
        # 'Solsona'; left untouched -- confirm.
        self.example06 = Example(
            [
                r'(Botella.*P\.|P\..*Botella)',
                r'(Solona.*B\.|B\..*Solsona)',
                r'(A\..*Martinez-Arias|Martinez-Arias.*A\.)',
                r'(J\.M\..*Nieto|Nieto.*J\.M\.)',
            ],
            [
                u'P. Botella1, B. Solsona1, '
                'A. Martinez-Arias2 and J.M. '
                'Lopez Nieto1'
            ])

        # NOTE(review): the second argument here is a plain string, while
        # example06 wraps its text in a list -- confirm both are intended.
        self.example07 = Example(
            [
                r'(Cabre.*L\.|L\..*Cabre)',
                r'(Mancebo.*J\.|J\..*Mancebo)',
                r'(J\..*Solsona|Solsona.*J\.)',
            ],
            u'L. Cabre1, J. Mancebo2, J. F. Solsona3, '
            ' and the Bioethics Working '
            'Group of the SEMICYUC')

    def test_find_separators(self):
        """``_find_separators`` returns the expected escaped separators."""
        pattern = '(.*)1, (.*)1, (.*)2 and (.*)1 '
        separators = self.ruler._find_separators(pattern)
        self.assertEqual(separators, ['1\\,\\ ', '2\\ and\\ '])

    def test_merge_separators(self):
        """``_merge_separators`` combines two separator lists as expected."""
        sep01 = [u'1, ', u'2, ', u'2 and ']
        sep02 = [u'1, ', u'3, ', u'2 and ']
        expected = [u'1\\,\\ ', '2\\,\\ ', u'2\\ and\\ ', u'3\\,\\ ']
        result = self.ruler._merge_separators(sep01, sep02)
        self.assertEqual(result, expected)

    def test_rule_example(self):
        """A single example yields one rule whose pattern has two items."""
        rules = self.ruler._rule_example(self.example06)
        self.assertEqual(len(rules), 1)
        self.assertEqual(len(rules[0].pattern), 2)

    def test_rule(self):
        """Ruling over both examples yields a first rule with three items."""
        rules = self.ruler.rule([self.example06, self.example07])
        self.assertEqual(len(rules[0].pattern), 3)
def setUp(self):
    """Create the ruler under test and two multi-author fixtures.

    Sets ``self.ruler``, calls the parent ``setUp``, then builds
    ``self.example06`` and ``self.example07``. Each ``Example`` is given a
    list of regular expressions followed by the author text.
    """
    self.ruler = SeparatorsRegexRuler()
    super(TestSeparatorsRegexRuler, self).setUp()

    # Raw strings keep the regex backslashes literal without relying on
    # Python passing unknown escapes such as '\.' through unchanged.
    # NOTE(review): 'Solona' in the second pattern looks like a typo for
    # 'Solsona'; left untouched -- confirm.
    self.example06 = Example(
        [
            r'(Botella.*P\.|P\..*Botella)',
            r'(Solona.*B\.|B\..*Solsona)',
            r'(A\..*Martinez-Arias|Martinez-Arias.*A\.)',
            r'(J\.M\..*Nieto|Nieto.*J\.M\.)',
        ],
        [
            u'P. Botella1, B. Solsona1, '
            'A. Martinez-Arias2 and J.M. '
            'Lopez Nieto1'
        ])

    # NOTE(review): the second argument here is a plain string, while
    # example06 wraps its text in a list -- confirm both are intended.
    self.example07 = Example(
        [
            r'(Cabre.*L\.|L\..*Cabre)',
            r'(Mancebo.*J\.|J\..*Mancebo)',
            r'(J\..*Solsona|Solsona.*J\.)',
        ],
        u'L. Cabre1, J. Mancebo2, J. F. Solsona3, '
        ' and the Bioethics Working '
        'Group of the SEMICYUC')
def setUp(self):
    """Create the ruler under test and two multi-author fixtures.

    Sets ``self.ruler``, calls the parent ``setUp``, then builds
    ``self.example06`` and ``self.example07``. Each ``Example`` is given a
    list of regular expressions followed by the author text.
    """
    self.ruler = SeparatorsRegexRuler()
    super(TestSeparatorsRegexRuler, self).setUp()

    # Raw strings keep the regex backslashes literal without relying on
    # Python passing unknown escapes such as '\.' through unchanged.
    # NOTE(review): 'Solona' in the second pattern looks like a typo for
    # 'Solsona'; left untouched -- confirm.
    self.example06 = Example(
        [
            r'(Botella.*P\.|P\..*Botella)',
            r'(Solona.*B\.|B\..*Solsona)',
            r'(A\..*Martinez-Arias|Martinez-Arias.*A\.)',
            r'(J\.M\..*Nieto|Nieto.*J\.M\.)',
        ],
        [
            u'P. Botella1, B. Solsona1, '
            'A. Martinez-Arias2 and J.M. '
            'Lopez Nieto1'
        ])

    # NOTE(review): the second argument here is a plain string, while
    # example06 wraps its text in a list -- confirm both are intended.
    self.example07 = Example(
        [
            r'(Cabre.*L\.|L\..*Cabre)',
            r'(Mancebo.*J\.|J\..*Mancebo)',
            r'(J\..*Solsona|Solsona.*J\.)',
        ],
        u'L. Cabre1, J. Mancebo2, J. F. Solsona3, '
        ' and the Bioethics Working '
        'Group of the SEMICYUC')
def generate_wrappers(self, url):
    """Train and persist wrappers for every example set obtained for *url*.

    Examples are fetched through an ``ExampleGateway`` configured from this
    object's settings. For each returned set a list of rulers is chosen, a
    ``WrapperTrainer`` is run, the resulting wrappers are pruned with
    ``self._prune_wrappers`` and stored through a ``WrapperGateway``.

    Any exception raised while training, pruning or persisting one set is
    logged and the loop moves on to the next set. Nothing is returned.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url,
                                                self.min_validity)

    # `set_name` rather than `set`, so the builtin is not shadowed.
    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name)  #@UndefinedVariable

        if set_name in ('author', 'editor'):
            # These two sets get the multi-value / person-oriented rulers.
            rulers = [
                MultiValuePathRuler(),
                SeparatorsRegexRuler(),
                ElementsRegexRuler(),
                PersonRuler(),
            ]
        else:
            # Sets without a configured value guide fall back to match-all.
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[set_name])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, set_name, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers))  #@UndefinedVariable
        except Exception as e:
            # Deliberately broad: a failure in one set must not stop the
            # remaining sets from being trained.
            log.error('Error training wrapper for set "%s": %s' % (set_name, e))  #@UndefinedVariable