class WrapperManagerWizard(QtGui.QWizard):
    """Wizard with a single WrapperManagerPage used to manage wrappers.

    The wizard owns a WrapperGateway (``self.wrapper_gw``) that is flushed
    when the wizard finishes.
    """

    def __init__(self):
        super(WrapperManagerWizard, self).__init__()
        self.initialize()

    def initialize(self):
        """Set the wizard options and create its gateway and only page."""
        self.setOption(QtGui.QWizard.NoCancelButton, True)
        self.setOption(QtGui.QWizard.NoBackButtonOnStartPage, True)
        self.wrapper_gw = WrapperGateway()
        wizard_title = 'Manage Wrappers'
        self.page01 = WrapperManagerPage(wizard_title, self)
        self.addPage(self.page01)

    def show(self):
        """Rebuild the page and gateway from scratch, then show the wizard."""
        # NOTE(review): assumes the page added by initialize() has id 0.
        self.removePage(0)
        self.initialize()
        self.restart()
        super(WrapperManagerWizard, self).show()

    def done(self, status):
        """Save pending edits, flush the gateway and finish the wizard.

        :param status: result code handed over by Qt.
        """
        self.page01._update_collection()
        self.page01._update_wrapper()
        self.wrapper_gw.flush()
        # The base implementation is what closes the dialog and stores the
        # result code; without this call the wizard would never finish.
        super(WrapperManagerWizard, self).done(status)
def generate_wrappers(self, url):
    """
    Trains and persists wrappers for every example set available for url.

    Examples are requested from an ExampleGateway. For each returned set a
    WrapperTrainer is run with rulers chosen from the set name, the result
    is pruned with self._prune_wrappers and stored through a WrapperGateway.
    Any exception raised while training or persisting one set is logged and
    the remaining sets are still processed.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url, self.min_validity)

    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable

        if set_name in ('author', 'editor'):
            rulers = [MultiValuePathRuler(),
                      SeparatorsRegexRuler(),
                      ElementsRegexRuler(),
                      PersonRuler()]
        else:
            # Sets without a configured value guide accept any value
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[set_name])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, set_name, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
        except Exception as e:
            # Best effort: a failing set must not abort the other sets
            log.error('Error training wrapper for set "%s": %s' % (set_name, e)) #@UndefinedVariable
def initialize(self):
    """Apply the wizard options and build the gateway and the single page."""
    hidden_buttons = (QtGui.QWizard.NoCancelButton,
                      QtGui.QWizard.NoBackButtonOnStartPage)
    for option in hidden_buttons:
        self.setOption(option, True)

    self.wrapper_gw = WrapperGateway()
    self.page01 = WrapperManagerPage('Manage Wrappers', self)
    self.addPage(self.page01)
def _sort_results(self, results):
    """
    Sorts the results depending on the available wrappers.

    Returns a list with the results that have a wrapper available on top
    of it, and those with no wrapper are discarded. Results whose URL is
    black listed are discarded too. The list is ordered depending on the
    quality of the wrappers; results with the same priority keep the
    relative order they had in the input.
    """
    # Create a list with all the available wrappers ordered by priority
    # Reference wrapper will be at the very beginning of the priority queue
    reference_wrappers = ReferenceWrapper().get_available_wrappers()
    available_wrappers = list(reference_wrappers)
    field_wrappers = WrapperGateway().get_available_wrappers()
    available_wrappers.extend(list(field_wrappers))

    # Map every wrapper URL to the position of its first occurrence so the
    # lookup does not scan the whole list for each result
    priorities = {}
    for position, wrapper_url in enumerate(available_wrappers):
        priorities.setdefault(wrapper_url, position)

    ranked = []
    for result in results:
        base_url = result.base_url
        if self._in_black_list(result.url):
            continue
        if base_url not in priorities:
            continue
        # TODO: Remove this conditional
        if base_url.startswith('http://citeseerx'):
            priority = len(results) + 5
        else:
            priority = priorities[base_url]
        ranked.append((priority, result))

    # Sort on the priority only: comparing whole (priority, result) tuples
    # would fall back to comparing result objects whenever priorities tie
    ranked.sort(key=lambda entry: entry[0])
    return [result for _, result in ranked]
def _use_rule_wrappers(self, source, page, raw_text):
    """
    Look if there is any wrapper in the database for the given source.

    For every wrapper collection found for source, its wrappers are tried
    in the order returned by the gateway until one extracts information
    that passes the field validator (fields without a validator always
    pass). Every tried wrapper is upvoted or downvoted accordingly.

    Returns a list with a single Reference if at least one field was set,
    or an empty list otherwise.
    """
    log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
    reference = Reference()
    wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
    wrapper_field_collections = wrapper_manager.find_wrapper_collections(source)

    for collection in wrapper_field_collections:
        # Get the wrappers for the current collection
        url, field = collection.url, collection.field
        wrappers = wrapper_manager.get_wrappers(url, field)
        log.debug('Collection %s:%s has %d wrappers' % (url, field, len(wrappers))) #@UndefinedVariable

        # Get field validator
        try:
            validator = self.field_validation[field][1]
        except KeyError:
            validator = None

        # Only these fields are allowed to hold a list of values
        multi_valued = field in ('author', 'editor')

        # Extract information using the wrappers we have
        for wrapper in wrappers:
            info = wrapper.extract_info(page)
            # we expect 'info' to be a string
            if isinstance(info, list) and not multi_valued:
                continue
            log.debug('Info extracted by wrapper: %s' % info) #@UndefinedVariable

            valid = validator.validate(info, raw_text) if validator else True
            # Save the extracted info even if it's not correct. It will
            # be overwritten afterwards if necessary
            reference.set_field(field, info, valid)

            if not valid:
                log.debug('The extracted information is not valid. Downvoting wrapper.') #@UndefinedVariable
                wrapper.downvotes += 1
                wrapper_manager.update_wrapper(wrapper)
                continue

            log.debug('The extracted information is valid. Upvoting wrapper') #@UndefinedVariable
            wrapper.upvotes += 1
            wrapper_manager.update_wrapper(wrapper)
            # A valid value was found, no need to try further wrappers
            break

    if len(reference.fields) > 0:
        log.info('Extracted reference') #@UndefinedVariable
        return [reference]
    else:
        log.info('Could not extract reference using ruled wrappers') #@UndefinedVariable
        return []
def generate_wrappers(self, url):
    """
    Trains and persists wrappers for every example set available for url.

    Examples are requested from an ExampleGateway. For each returned set a
    WrapperTrainer is run with rulers chosen from the set name, the result
    is pruned with self._prune_wrappers and stored through a WrapperGateway.
    Any exception raised while training or persisting one set is logged and
    the remaining sets are still processed.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url, self.min_validity)

    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable

        if set_name in ('author', 'editor'):
            rulers = [MultiValuePathRuler(),
                      SeparatorsRegexRuler(),
                      ElementsRegexRuler(),
                      PersonRuler()]
        else:
            # Sets without a configured value guide accept any value
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[set_name])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, set_name, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
        except Exception as e:
            # Best effort: a failing set must not abort the other sets
            log.error('Error training wrapper for set "%s": %s' % (set_name, e)) #@UndefinedVariable
def initialize(self):
    """Register default field properties, set options and add the pages."""
    # (widget class name, property, change signal signature)
    default_properties = (
        ('FileChooser', 'path', 'pathChanged()'),
        ('QProgressBar', 'value', 'valueChanged(int)'),
    )
    for class_name, property_name, signature in default_properties:
        self.setDefaultProperty(class_name, property_name,
                                QtCore.SIGNAL(signature))

    wizard_options = (QtGui.QWizard.NoCancelButton,
                      QtGui.QWizard.NoBackButtonOnStartPage,
                      QtGui.QWizard.NoBackButtonOnLastPage)
    for option in wizard_options:
        self.setOption(option, True)

    self.wrapper_gw = WrapperGateway()

    self.page01 = URLChoosePage(self)
    self.page02 = ProgressPage(self)
    #self.page03 = FinishedPage(self)
    for page in (self.page01, self.page02):
        self.addPage(page)
class TestWrapperManager(unittest.TestCase):
    """Tests for finding, persisting and updating wrappers through a
    WrapperGateway created from the 'sqlite:///:memory:' URI.

    Methods prefixed with ``xtest_`` are not collected by unittest and are
    therefore disabled.
    """

    def setUp(self):
        self.wm = WrapperGateway(
            create_session(sql_uri='sqlite:///:memory:', debug=True))

    def test_find_collection(self):
        # Do not create
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field')
        self.assertFalse(collection1)

        # New collection
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field', True)
        self.assertTrue(collection1)
        self.assertTrue(type(collection1) == mappers.WrapperCollection)

        # Existent collection
        collection2 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field')
        self.assertTrue(collection2)
        self.assertTrue(collection1 is collection2)

    def test_find_collections(self):
        # Make sure the four collections exist before querying them
        for url, field in ((u'c01', u'f01'), (u'c01', u'f02'),
                           (u'c02', u'f01'), (u'c02', u'f02')):
            self.wm.find_wrapper_collection(url, field, True)

        collections = self.wm.find_wrapper_collections()
        self.assertTrue(collections.count() >= 4)

        collections = self.wm.find_wrapper_collections(url=u'c02')
        self.assertEqual(collections.count(), 2)

        collections = self.wm.find_wrapper_collections(field=u'f02')
        self.assertEqual(collections.count(), 2)

    def test_get_unavailable_wrappers(self):
        wrappers = self.wm.get_wrappers(u'non_existent_url', u'no_field')
        self.assertEqual(wrappers, [])

    def xtest_persist_wrapper_with_incorrect_rules(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(MockRule02(33)))
        self.assertRaises(TypeError, self.wm.persist_wrapper,
                          u'some_url', u'some_field', wrapper)

    def xtest_persist_and_get_wrapper(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(66))
        wrapper.add_rule(MockRule01(77))
        wrapper.add_rule(MockRule02([[2, 3, 4, 5], 4]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(11))
        wrapper.add_rule(MockRule01(22))
        wrapper.add_rule(MockRule01(33))
        self.wm.persist_wrapper(u'some_url', u'some_other_field', wrapper)

        # Get non-existent wrapper
        wrappers = self.wm.get_wrappers(u'some_url', u'non_existent_field')
        self.assertFalse(wrappers, 'Get non-existent wrapper')

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'some_url', u'some_field')
        self.assertEqual(len(wrappers), 2)

        wrappers = self.wm.get_wrappers(u'some_url', u'some_other_field')
        self.assertEqual(len(wrappers), 1)

    def test_persist_and_update_wrapper(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'concrete_url', u'concrete_field', wrapper)

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)

        # Update wrapper
        wrapper = wrappers[0]
        wrapper.upvotes += 1
        wrapper.downvotes -= 1
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].upvotes, 1)
        self.assertEqual(wrappers[0].downvotes, -1)

        # Update wrapper rules
        wrapper = wrappers[0]
        wrapper.rules[0].pattern = 223
        wrapper.rules[2].pattern = [1, 6]
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].rules[0].pattern, 223)
        self.assertEqual(wrappers[0].rules[2].pattern, [1, 6])

        # Add another wrapper rule
        wrapper = wrappers[0]
        wrapper.rules.append(MockRule02([2, 3, 4, 5]))
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(len(wrappers[0].rules), 4)
        self.assertEqual(wrappers[0].rules[3].pattern, [2, 3, 4, 5])
def setUp(self):
    """Give every test a gateway over a 'sqlite:///:memory:' session."""
    session = create_session(sql_uri='sqlite:///:memory:', debug=True)
    self.wm = WrapperGateway(session)
def _populate_db(self):
    """Create five wrapper collections and eight wrappers, then flush.

    All collections are created first, in the order listed below, and the
    wrappers afterwards. The third collection is left without wrappers.
    """
    gateway = WrapperGateway(self.session)

    # (field, url) of every collection
    collection_specs = (
        (u"a", u"url01"),
        (u"b", u"url01"),
        (u"a", u"url02"),
        (u"b", u"url02"),
        (u"a", u"url03"),
    )
    collections = []
    for field, url in collection_specs:
        collection = gateway.new_wrapper_collection()
        collection.field = field
        collection.url = url
        collections.append(collection)

    # (index of the owning collection, downvotes, upvotes, score)
    wrapper_specs = (
        (0, 0, 3, 1.0),
        (0, 0, 2, 1.0),
        (1, 1, 1, 0.5),
        (3, 0, 3, 1.0),
        (4, 0, 2, 1.0),
        (4, 1, 1, 0.8),
        (4, 0, 3, 1.0),
        (4, 1, 1, 0.2),
    )
    for owner, downvotes, upvotes, score in wrapper_specs:
        wrapper = gateway.new_wrapper()
        wrapper.downvotes = downvotes
        wrapper.upvotes = upvotes
        wrapper.score = score
        collections[owner].wrappers.append(wrapper)

    self.session.flush()
class TestWrapperManager(unittest.TestCase):
    """Tests for finding, persisting and updating wrappers through a
    WrapperGateway created from the 'sqlite:///:memory:' URI.

    Methods prefixed with ``xtest_`` are not collected by unittest and are
    therefore disabled.
    """

    def setUp(self):
        self.wm = WrapperGateway(
            create_session(sql_uri='sqlite:///:memory:', debug=True))

    def test_find_collection(self):
        # Do not create
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field')
        self.assertFalse(collection1)

        # New collection
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field', True)
        self.assertTrue(collection1)
        self.assertTrue(type(collection1) == mappers.WrapperCollection)

        # Existent collection
        collection2 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field')
        self.assertTrue(collection2)
        self.assertTrue(collection1 is collection2)

    def test_find_collections(self):
        # Make sure the four collections exist before querying them
        for url, field in ((u'c01', u'f01'), (u'c01', u'f02'),
                           (u'c02', u'f01'), (u'c02', u'f02')):
            self.wm.find_wrapper_collection(url, field, True)

        collections = self.wm.find_wrapper_collections()
        self.assertTrue(collections.count() >= 4)

        collections = self.wm.find_wrapper_collections(url=u'c02')
        self.assertEqual(collections.count(), 2)

        collections = self.wm.find_wrapper_collections(field=u'f02')
        self.assertEqual(collections.count(), 2)

    def test_get_unavailable_wrappers(self):
        wrappers = self.wm.get_wrappers(u'non_existent_url', u'no_field')
        self.assertEqual(wrappers, [])

    def xtest_persist_wrapper_with_incorrect_rules(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(MockRule02(33)))
        self.assertRaises(TypeError, self.wm.persist_wrapper,
                          u'some_url', u'some_field', wrapper)

    def xtest_persist_and_get_wrapper(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(66))
        wrapper.add_rule(MockRule01(77))
        wrapper.add_rule(MockRule02([[2, 3, 4, 5], 4]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(11))
        wrapper.add_rule(MockRule01(22))
        wrapper.add_rule(MockRule01(33))
        self.wm.persist_wrapper(u'some_url', u'some_other_field', wrapper)

        # Get non-existent wrapper
        wrappers = self.wm.get_wrappers(u'some_url', u'non_existent_field')
        self.assertFalse(wrappers, 'Get non-existent wrapper')

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'some_url', u'some_field')
        self.assertEqual(len(wrappers), 2)

        wrappers = self.wm.get_wrappers(u'some_url', u'some_other_field')
        self.assertEqual(len(wrappers), 1)

    def test_persist_and_update_wrapper(self):
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'concrete_url', u'concrete_field', wrapper)

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)

        # Update wrapper
        wrapper = wrappers[0]
        wrapper.upvotes += 1
        wrapper.downvotes -= 1
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].upvotes, 1)
        self.assertEqual(wrappers[0].downvotes, -1)

        # Update wrapper rules
        wrapper = wrappers[0]
        wrapper.rules[0].pattern = 223
        wrapper.rules[2].pattern = [1, 6]
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].rules[0].pattern, 223)
        self.assertEqual(wrappers[0].rules[2].pattern, [1, 6])

        # Add another wrapper rule
        wrapper = wrappers[0]
        wrapper.rules.append(MockRule02([2, 3, 4, 5]))
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(len(wrappers[0].rules), 4)
        self.assertEqual(wrappers[0].rules[3].pattern, [2, 3, 4, 5])
def setUp(self):
    """Open a 'sqlite:///:memory:' session and wrap it in a gateway."""
    session = create_session('sqlite:///:memory:', True)
    self.session = session
    self.wg = WrapperGateway(session=session)
class TestWrapperGateway(unittest.TestCase):
    """Tests WrapperGateway.get_available_wrappers against a session
    created from the 'sqlite:///:memory:' URI."""

    def setUp(self):
        self.session = create_session('sqlite:///:memory:', True)
        self.wg = WrapperGateway(session=self.session)

    def _populate_db(self):
        """Create five wrapper collections and eight wrappers, then flush.

        All collections are created first, in the order listed below, and
        the wrappers afterwards. The third collection gets no wrappers.
        """
        # (field, url) of every collection
        collection_specs = (
            (u'a', u'url01'),
            (u'b', u'url01'),
            (u'a', u'url02'),
            (u'b', u'url02'),
            (u'a', u'url03'),
        )
        collections = []
        for field, url in collection_specs:
            collection = self.wg.new_wrapper_collection()
            collection.field = field
            collection.url = url
            collections.append(collection)

        # (index of the owning collection, downvotes, upvotes, score)
        wrapper_specs = (
            (0, 0, 3, 1.0),
            (0, 0, 2, 1.0),
            (1, 1, 1, 0.5),
            (3, 0, 3, 1.0),
            (4, 0, 2, 1.0),
            (4, 1, 1, 0.8),
            (4, 0, 3, 1.0),
            (4, 1, 1, 0.2),
        )
        for owner, downvotes, upvotes, score in wrapper_specs:
            wrapper = self.wg.new_wrapper()
            wrapper.downvotes = downvotes
            wrapper.upvotes = upvotes
            wrapper.score = score
            collections[owner].wrappers.append(wrapper)

        self.session.flush()

    def test_get_available_wrappers(self):
        self._populate_db()
        wrappers = self.wg.get_available_wrappers()
        # assertEqual reports both lists when the ordering is wrong
        self.assertEqual(wrappers, [u'url03', u'url01', u'url02'])
def _use_rule_wrappers(self, source, page, raw_text):
    """
    Look if there is any wrapper in the database for the given source.

    For every wrapper collection found for source, its wrappers are tried
    in the order returned by the gateway until one extracts information
    that passes the field validator (fields without a validator always
    pass). Every tried wrapper is upvoted or downvoted accordingly.

    Returns a list with a single Reference if at least one field was set,
    or an empty list otherwise.
    """
    log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
    reference = Reference()
    wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
    wrapper_field_collections = wrapper_manager.find_wrapper_collections(source)

    for collection in wrapper_field_collections:
        # Get the wrappers for the current collection
        url, field = collection.url, collection.field
        wrappers = wrapper_manager.get_wrappers(url, field)
        log.debug('Collection %s:%s has %d wrappers' % (url, field, len(wrappers))) #@UndefinedVariable

        # Get field validator
        try:
            validator = self.field_validation[field][1]
        except KeyError:
            validator = None

        # Only these fields are allowed to hold a list of values
        multi_valued = field in ('author', 'editor')

        # Extract information using the wrappers we have
        for wrapper in wrappers:
            info = wrapper.extract_info(page)
            # we expect 'info' to be a string
            if isinstance(info, list) and not multi_valued:
                continue
            log.debug('Info extracted by wrapper: %s' % info) #@UndefinedVariable

            valid = validator.validate(info, raw_text) if validator else True
            # Save the extracted info even if it's not correct. It will
            # be overwritten afterwards if necessary
            reference.set_field(field, info, valid)

            if not valid:
                log.debug('The extracted information is not valid. Downvoting wrapper.') #@UndefinedVariable
                wrapper.downvotes += 1
                wrapper_manager.update_wrapper(wrapper)
                continue

            log.debug('The extracted information is valid. Upvoting wrapper') #@UndefinedVariable
            wrapper.upvotes += 1
            wrapper_manager.update_wrapper(wrapper)
            # A valid value was found, no need to try further wrappers
            break

    if len(reference.fields) > 0:
        log.info('Extracted reference') #@UndefinedVariable
        return [reference]
    else:
        log.info('Could not extract reference using ruled wrappers') #@UndefinedVariable
        return []
def setUp(self):
    """Give every test a gateway over a 'sqlite:///:memory:' session."""
    session = create_session(sql_uri='sqlite:///:memory:', debug=True)
    self.wm = WrapperGateway(session)