def generate_wrappers(self, url):
    """Train and persist wrappers for every example set obtained for *url*.

    Examples are requested from an ExampleGateway configured from this
    object's ``max_examples``, ``max_examples_from_db`` and
    ``secs_between_reqs`` attributes. For each example set a WrapperTrainer
    is built with a list of rulers chosen by the set name, the resulting
    wrappers are pruned with ``self._prune_wrappers`` and then stored through
    a WrapperGateway.

    A failure while training, pruning or persisting one set is logged and
    does not stop the remaining sets from being processed.

    url -- passed to ``get_examples`` and ``persist_wrappers``.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url,
                                                self.min_validity)

    # 'set_name' instead of 'set' so the builtin is not shadowed.
    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable

        if set_name == 'author' or set_name == 'editor':
            # These two sets get their own dedicated ruler line-up.
            rulers = [MultiValuePathRuler(),
                      SeparatorsRegexRuler(),
                      ElementsRegexRuler(),
                      PersonRuler()]
        else:
            # Fall back to a match-anything guide when none is configured.
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[set_name])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, set_name, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
        except Exception as e:
            # Deliberately broad: one failing set must not abort the others.
            log.error('Error training wrapper for set "%s": %s' % (set_name, e)) #@UndefinedVariable
def setUp(self):
    """Create the RegexRuler under test, then run the parent class setUp."""
    # The ruler is assigned first, exactly as before, in case the parent
    # fixture relies on self.ruler already being present.
    self.ruler = RegexRuler()
    parent = super(TestRegexRuler, self)
    parent.setUp()
class TestRegexRuler(object):#(TestRuler):
    """Test cases for RegexRuler.

    The TestRuler base is commented out in the class header, so the class
    currently derives from plain ``object``.
    NOTE(review): presumably this disables the suite on purpose -- confirm.
    """

    def setUp(self):
        """Create the RegexRuler under test, then run the parent setUp."""
        self.ruler = RegexRuler()
        parent = super(TestRegexRuler, self)
        parent.setUp()

    def test_rule_example(self):
        """The first rule built from one example has the expected pattern."""
        sample = Example('2007', 'Volume 31, Number 7 / July, 2007')
        produced = self.ruler._rule_example(sample)
        wanted = 'Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)'
        self.failUnless(produced[0].pattern == wanted)

    def test_should_merge(self):
        """_should_merge is False for the first rule pair, True for the second."""
        check = self.ruler._should_merge

        volume_rule = Rule('Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)')
        weekday_rule = Rule('Wednesday\,\ November\ 03\,\ (.*)')
        verdict = check(volume_rule, weekday_rule)
        self.failUnless(verdict == False)

        october_rule = Rule(u'\\ Volume\\ 70\\ \\,\\ \\;\\ '
                            'Issue\\ 16\\-18\\ \\ \\;\\(October\\ (.*)\\)')
        january_rule = Rule(u'\\ Volume\\ 22\\ \\,\\ \\;\\ '
                            'Issue\\ 21\\-23\\ \\ \\;\\(January\\ (.*)\\)')
        verdict = check(october_rule, january_rule)
        self.failUnless(verdict == True)

    def test_merge_patterns(self):
        """_merge_patterns output is compared against fixed expected strings."""
        merge = self.ruler._merge_patterns

        # Two short patterns that differ in a single character.
        base = 'aa3aaxxxx\(\)'
        other = 'aa1aaxxxx\(\)'
        merged = merge(base, other)
        wanted = 'aa(?:.*)aaxxxx\(\)'
        self.failUnless(merged == wanted)

        # Volume/issue patterns with a different month.
        base = (u'\\ Volume\\ 70\\ \\,\\ \\;\\ '
                'Issue\\ 16\\-18\\ \\ \\;\\(October\\ (.*)\\)')
        other = (u'\\ Volume\\ 22\\ \\,\\ \\;\\ '
                 'Issue\\ 21\\-23\\ \\ \\;\\(January\\ (.*)\\)')
        merged = merge(base, other)
        wanted = (u'\\ Volume\\ (?:.*)\\ \\,\\ \\;\\ '
                  'Issue\\ (?:.*)\\-(?:.*)\\ \\&nb'
                  'sp\\;\\((?:.*)\\ (.*)\\)')
        self.failUnless(merged == wanted)

        # Same general pattern merged with a "May" variant.
        other = (u'\\ Volume\\ 22\\ \\,\\ \\;\\ '
                 'Issue\\ 22\\-23\\ \\ \\;\\(May\\ (.*)\\)')
        merged = merge(base, other)
        wanted = (u'\\ Volume\\ (?:.*)\\ \\,\\ \\;\\ '
                  'Issue\\ (?:.*)\\-(?:.*)\\ \\ \\;'
                  '\\((?:.*)\\ (.*)\\)')
        self.failUnless(merged == wanted)

        # This last merge is only executed; its result is not asserted.
        base = '(.*)\ \/\ \(2007\)'
        other = '(.*)\ \(2010\)'
        merged = merge(base, other)

    def test_rule(self):
        """rule() is checked with two examples, then with a third one added."""
        build_rules = self.ruler.rule

        may_2007 = Example(u'2007', u' Volume 22 , '
                           'Issue 22-23 (May 2007)')
        may_2009 = Example(u'2009', u' Volume 11 , '
                           'Issue 16-25 (May 2009)')
        year_2008 = Example(u'2008', u' Year of publication: 2008')

        found = build_rules([may_2007, may_2009])
        self.failUnless(found[0].pattern == u'Volume\\ (?:.*)\\ \\,\\ '
                        '\\;\\ Issue\\ (?:.*)\\-2(?:.*)\\ \\ \\;\\(May\\ '
                        '(.*)\\)')

        found = build_rules([may_2007, may_2009, year_2008])
        self.failUnless(len(found) == 2)
        self.failUnless(found[0].pattern == u'Year\\ of\\ publication\\:'
                        '\\ \\;(.*)')

    def test_apply_heuristics(self):
        """_apply_heuristics returns three items for the sample strings."""
        matcher = difflib.SequenceMatcher(None, 'The 3rd House',
                                          'The 35th house')
        outcome = self.ruler._apply_heuristics('The 35th house',
                                               matcher.get_matching_blocks())
        self.failUnless(len(outcome) == 3)
class TestRegexRuler(object): #(TestRuler):
    """Test cases for RegexRuler.

    The TestRuler base is commented out in the class header, so the class
    currently derives from plain ``object``.
    NOTE(review): presumably this disables the suite on purpose -- confirm.
    """

    def setUp(self):
        """Create the RegexRuler under test, then run the parent setUp."""
        self.ruler = RegexRuler()
        parent = super(TestRegexRuler, self)
        parent.setUp()

    def test_rule_example(self):
        """The first rule built from one example has the expected pattern."""
        sample = Example('2007', 'Volume 31, Number 7 / July, 2007')
        produced = self.ruler._rule_example(sample)
        wanted = 'Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)'
        self.failUnless(produced[0].pattern == wanted)

    def test_should_merge(self):
        """_should_merge is False for the first rule pair, True for the second."""
        check = self.ruler._should_merge

        volume_rule = Rule('Volume\ 31\,\ Number\ 7\ \/\ July\,\ (.*)')
        weekday_rule = Rule('Wednesday\,\ November\ 03\,\ (.*)')
        verdict = check(volume_rule, weekday_rule)
        self.failUnless(verdict == False)

        october_rule = Rule(u'\\ Volume\\ 70\\ \\,\\ \\;\\ '
                            'Issue\\ 16\\-18\\ \\ \\;\\(October\\ (.*)\\)')
        january_rule = Rule(u'\\ Volume\\ 22\\ \\,\\ \\;\\ '
                            'Issue\\ 21\\-23\\ \\ \\;\\(January\\ (.*)\\)')
        verdict = check(october_rule, january_rule)
        self.failUnless(verdict == True)

    def test_merge_patterns(self):
        """_merge_patterns output is compared against fixed expected strings."""
        merge = self.ruler._merge_patterns

        # Two short patterns that differ in a single character.
        base = 'aa3aaxxxx\(\)'
        other = 'aa1aaxxxx\(\)'
        merged = merge(base, other)
        wanted = 'aa(?:.*)aaxxxx\(\)'
        self.failUnless(merged == wanted)

        # Volume/issue patterns with a different month.
        base = (u'\\ Volume\\ 70\\ \\,\\ \\;\\ '
                'Issue\\ 16\\-18\\ \\ \\;\\(October\\ (.*)\\)')
        other = (u'\\ Volume\\ 22\\ \\,\\ \\;\\ '
                 'Issue\\ 21\\-23\\ \\ \\;\\(January\\ (.*)\\)')
        merged = merge(base, other)
        wanted = (u'\\ Volume\\ (?:.*)\\ \\,\\ \\;\\ '
                  'Issue\\ (?:.*)\\-(?:.*)\\ \\&nb'
                  'sp\\;\\((?:.*)\\ (.*)\\)')
        self.failUnless(merged == wanted)

        # Same general pattern merged with a "May" variant.
        other = (u'\\ Volume\\ 22\\ \\,\\ \\;\\ '
                 'Issue\\ 22\\-23\\ \\ \\;\\(May\\ (.*)\\)')
        merged = merge(base, other)
        wanted = (u'\\ Volume\\ (?:.*)\\ \\,\\ \\;\\ '
                  'Issue\\ (?:.*)\\-(?:.*)\\ \\ \\;'
                  '\\((?:.*)\\ (.*)\\)')
        self.failUnless(merged == wanted)

        # This last merge is only executed; its result is not asserted.
        base = '(.*)\ \/\ \(2007\)'
        other = '(.*)\ \(2010\)'
        merged = merge(base, other)

    def test_rule(self):
        """rule() is checked with two examples, then with a third one added."""
        build_rules = self.ruler.rule

        may_2007 = Example(u'2007', u' Volume 22 , '
                           'Issue 22-23 (May 2007)')
        may_2009 = Example(u'2009', u' Volume 11 , '
                           'Issue 16-25 (May 2009)')
        year_2008 = Example(u'2008', u' Year of publication: 2008')

        found = build_rules([may_2007, may_2009])
        self.failUnless(found[0].pattern == u'Volume\\ (?:.*)\\ \\,\\ '
                        '\\;\\ Issue\\ (?:.*)\\-2(?:.*)\\ \\ \\;\\(May\\ '
                        '(.*)\\)')

        found = build_rules([may_2007, may_2009, year_2008])
        self.failUnless(len(found) == 2)
        self.failUnless(found[0].pattern == u'Year\\ of\\ publication\\:'
                        '\\ \\;(.*)')

    def test_apply_heuristics(self):
        """_apply_heuristics returns three items for the sample strings."""
        matcher = difflib.SequenceMatcher(None, 'The 3rd House',
                                          'The 35th house')
        outcome = self.ruler._apply_heuristics('The 35th house',
                                               matcher.get_matching_blocks())
        self.failUnless(len(outcome) == 3)