def _use_rule_wrappers(self, source, page, raw_text):
    """
    Try to extract a reference from `page` using the ruled wrappers that
    the wrapper gateway holds for `source`.

    For every wrapper collection found for `source`, its wrappers are
    applied to `page` in the order the gateway returns them. Each extracted
    value is stored in the reference together with its validity flag. An
    invalid value downvotes the wrapper and the next one is tried; the
    first valid value upvotes the wrapper and ends the search for that
    field.

    Parameters:
        source: value handed to ``find_wrapper_collections`` to look up
            the wrapper collections.
        page: object passed to each wrapper's ``extract_info``.
        raw_text: passed to the field validator along with the extracted
            value.

    Returns:
        A one-element list with the reference if at least one field was
        set, otherwise an empty list.
    """
    log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
    reference = Reference()
    wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
    wrapper_field_collections = wrapper_manager.find_wrapper_collections(source)

    for collection in wrapper_field_collections:
        # Get the wrappers for the current collection
        url, field = collection.url, collection.field
        wrappers = wrapper_manager.get_wrappers(url, field)
        log.debug('Collection %s:%s has %d wrappers' % (url, field, len(wrappers))) #@UndefinedVariable

        # Get field validator; fields without a registered validator are
        # always considered valid
        try:
            validator = self.field_validation[field][1]
        except KeyError:
            validator = None

        # Only these fields are allowed to yield a list of values
        accepts_list = field in ('author', 'editor')

        # Extract information using the wrappers we have
        for wrapper in wrappers:
            info = wrapper.extract_info(page)
            # we expect 'info' to be a string
            if isinstance(info, list) and not accepts_list:
                continue
            # Wrap in a 1-tuple so a tuple-valued 'info' cannot be
            # mistaken for the format arguments
            log.debug('Info extracted by wrapper: %s' % (info,)) #@UndefinedVariable

            valid = validator.validate(info, raw_text) if validator else True
            # Save the extracted info even if it's not correct. It will
            # be overwritten afterwards if necessary
            reference.set_field(field, info, valid)

            if valid:
                log.debug('The extracted information is valid. ' #@UndefinedVariable
                          'Upvoting wrapper')
                wrapper.upvotes += 1
            else:
                log.debug('The extracted information is not valid. ' #@UndefinedVariable
                          'Downvoting wrapper.')
                wrapper.downvotes += 1
            wrapper_manager.update_wrapper(wrapper)

            if valid:
                # First valid extraction wins; remaining wrappers of this
                # collection are not tried
                break

    # The type of Reference.fields is not visible here, so the explicit
    # length check is kept as is
    if len(reference.fields) > 0:
        log.info('Extracted reference') #@UndefinedVariable
        return [reference]

    log.info('Could not extract reference using ruled wrappers') #@UndefinedVariable
    return []
def _use_rule_wrappers(self, source, page, raw_text):
    """
    Look if there is any wrapper in the database for the given source and
    use it to build a reference.

    Every wrapper collection returned by the gateway for `source` is
    visited. Its wrappers are run against `page` one by one; each result
    is written to the reference with its validity flag, the wrapper's
    vote counters are updated and persisted, and the search for that
    field stops at the first valid result.

    Parameters:
        source: lookup key given to ``find_wrapper_collections``.
        page: object every wrapper extracts information from.
        raw_text: second argument of the field validator's ``validate``.

    Returns:
        ``[reference]`` when the reference ended up with at least one
        field, ``[]`` otherwise.
    """
    log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
    reference = Reference()
    wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)

    for collection in wrapper_manager.find_wrapper_collections(source):
        # Get the wrappers for the current collection
        url, field = collection.url, collection.field
        wrappers = wrapper_manager.get_wrappers(url, field)
        log.debug('Collection %s:%s has %d wrappers' % (url, field, len(wrappers))) #@UndefinedVariable

        # Get field validator (None means "accept anything")
        try:
            validator = self.field_validation[field][1]
        except KeyError:
            validator = None

        # Lists are only acceptable for author and editor
        list_allowed = field in ('author', 'editor')

        # Extract information using the wrappers we have
        for wrapper in wrappers:
            info = wrapper.extract_info(page)
            # we expect 'info' to be a string
            if isinstance(info, list) and not list_allowed:
                continue
            # The 1-tuple keeps the formatting safe if 'info' is a tuple
            log.debug('Info extracted by wrapper: %s' % (info,)) #@UndefinedVariable

            if validator:
                valid = validator.validate(info, raw_text)
            else:
                valid = True

            # Save the extracted info even if it's not correct. It will
            # be overwritten afterwards if necessary
            reference.set_field(field, info, valid)

            if not valid:
                log.debug('The extracted information is not valid. ' #@UndefinedVariable
                          'Downvoting wrapper.')
                wrapper.downvotes += 1
            else:
                log.debug('The extracted information is valid. ' #@UndefinedVariable
                          'Upvoting wrapper')
                wrapper.upvotes += 1

            # Persist the vote regardless of the outcome
            wrapper_manager.update_wrapper(wrapper)

            if valid:
                # A valid value was found; skip the remaining wrappers
                break

    # Explicit length check kept: the type of Reference.fields is not
    # visible from this function
    if len(reference.fields) > 0:
        log.info('Extracted reference') #@UndefinedVariable
        return [reference]

    log.info('Could not extract reference using ruled wrappers') #@UndefinedVariable
    return []
class TestWrapperManager(unittest.TestCase):
    """
    Tests for WrapperGateway using a session created for an in-memory
    SQLite URI.

    Methods named ``xtest_*`` do not match unittest's default ``test``
    method prefix, so they are not collected.
    """

    def setUp(self):
        """Create the gateway under test."""
        self.wm = WrapperGateway(create_session(sql_uri='sqlite:///:memory:',
                                                debug=True))

    def test_find_collection(self):
        """Look up a collection without creating, with creating, and again."""
        # Do not create
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field')
        self.assertFalse(collection1)

        # New collection
        collection1 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field', True)
        self.assertTrue(collection1)
        self.assertEqual(type(collection1), mappers.WrapperCollection)

        # Existent collection: the very same object must be returned
        collection2 = self.wm.find_wrapper_collection(u'some_url',
                                                      u'some_field')
        self.assertTrue(collection2)
        self.assertTrue(collection1 is collection2)

    def test_find_collections(self):
        """Filter the stored collections by url and by field."""
        # Create four collections; only their existence matters here
        for url, field in ((u'c01', u'f01'), (u'c01', u'f02'),
                           (u'c02', u'f01'), (u'c02', u'f02')):
            self.wm.find_wrapper_collection(url, field, True)

        collections = self.wm.find_wrapper_collections()
        self.assertTrue(collections.count() >= 4)

        collections = self.wm.find_wrapper_collections(url=u'c02')
        self.assertEqual(collections.count(), 2)

        collections = self.wm.find_wrapper_collections(field=u'f02')
        self.assertEqual(collections.count(), 2)

    def test_get_unavailable_wrappers(self):
        """An unknown url/field pair yields an empty list."""
        wrappers = self.wm.get_wrappers(u'non_existent_url', u'no_field')
        self.assertEqual(wrappers, [])

    def xtest_persist_wrapper_with_incorrect_rules(self):
        """Disabled: persisting a rule whose pattern is a rule raises."""
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(MockRule02(33)))
        self.assertRaises(TypeError, self.wm.persist_wrapper, u'some_url',
                          u'some_field', wrapper)

    def xtest_persist_and_get_wrapper(self):
        """Disabled: persist three wrappers and read them back by field."""
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(66))
        wrapper.add_rule(MockRule01(77))
        wrapper.add_rule(MockRule02([[2, 3, 4, 5], 4]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(11))
        wrapper.add_rule(MockRule01(22))
        wrapper.add_rule(MockRule01(33))
        self.wm.persist_wrapper(u'some_url', u'some_other_field', wrapper)

        # Get non-existent wrapper
        wrappers = self.wm.get_wrappers(u'some_url', u'non_existent_field')
        self.assertFalse(wrappers, 'Get non-existent wrapper')

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'some_url', u'some_field')
        self.assertEqual(len(wrappers), 2)
        wrappers = self.wm.get_wrappers(u'some_url', u'some_other_field')
        self.assertEqual(len(wrappers), 1)

    def test_persist_and_update_wrapper(self):
        """Persist a wrapper, then update its votes and its rules."""
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'concrete_url', u'concrete_field', wrapper)

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)

        # Update wrapper
        wrapper = wrappers[0]
        wrapper.upvotes += 1
        wrapper.downvotes -= 1
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].upvotes, 1)
        self.assertEqual(wrappers[0].downvotes, -1)

        # Update wrapper rules
        wrapper = wrappers[0]
        wrapper.rules[0].pattern = 223
        wrapper.rules[2].pattern = [1, 6]
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].rules[0].pattern, 223)
        self.assertEqual(wrappers[0].rules[2].pattern, [1, 6])

        # Add another wrapper rule
        wrapper = wrappers[0]
        wrapper.rules.append(MockRule02([2, 3, 4, 5]))
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(len(wrappers[0].rules), 4)
        self.assertEqual(wrappers[0].rules[3].pattern, [2, 3, 4, 5])
class TestWrapperManager(unittest.TestCase):
    """
    Exercises WrapperGateway: collection lookup, wrapper retrieval,
    persistence and update, against a session created for
    ``sqlite:///:memory:``.

    The ``xtest_*`` methods lack the ``test`` prefix unittest collects by
    default and are therefore skipped.
    """

    def setUp(self):
        """Build the gateway every test works with."""
        session = create_session(sql_uri='sqlite:///:memory:', debug=True)
        self.wm = WrapperGateway(session)

    def test_find_collection(self):
        """A collection is absent until created, then found as the same object."""
        # Do not create
        missing = self.wm.find_wrapper_collection(u'some_url', u'some_field')
        self.assertFalse(missing)

        # New collection
        created = self.wm.find_wrapper_collection(u'some_url', u'some_field',
                                                  True)
        self.assertTrue(created)
        self.assertEqual(type(created), mappers.WrapperCollection)

        # Existent collection
        found = self.wm.find_wrapper_collection(u'some_url', u'some_field')
        self.assertTrue(found)
        self.assertTrue(created is found)

    def test_find_collections(self):
        """Collections can be listed unfiltered, by url, or by field."""
        # Two urls times two fields, created in the same order as before
        pairs = [(u'c01', u'f01'), (u'c01', u'f02'),
                 (u'c02', u'f01'), (u'c02', u'f02')]
        for url, field in pairs:
            self.wm.find_wrapper_collection(url, field, True)

        everything = self.wm.find_wrapper_collections()
        self.assertTrue(everything.count() >= 4)

        by_url = self.wm.find_wrapper_collections(url=u'c02')
        self.assertEqual(by_url.count(), 2)

        by_field = self.wm.find_wrapper_collections(field=u'f02')
        self.assertEqual(by_field.count(), 2)

    def test_get_unavailable_wrappers(self):
        """Asking for wrappers of an unknown url/field returns []."""
        self.assertEqual(
            self.wm.get_wrappers(u'non_existent_url', u'no_field'), [])

    def xtest_persist_wrapper_with_incorrect_rules(self):
        """Disabled: a rule wrapping another rule must raise TypeError."""
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(MockRule02(33)))
        self.assertRaises(TypeError, self.wm.persist_wrapper, u'some_url',
                          u'some_field', wrapper)

    def xtest_persist_and_get_wrapper(self):
        """Disabled: wrappers are returned per url/field after persisting."""
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(66))
        wrapper.add_rule(MockRule01(77))
        wrapper.add_rule(MockRule02([[2, 3, 4, 5], 4]))
        self.wm.persist_wrapper(u'some_url', u'some_field', wrapper)

        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(11))
        wrapper.add_rule(MockRule01(22))
        wrapper.add_rule(MockRule01(33))
        self.wm.persist_wrapper(u'some_url', u'some_other_field', wrapper)

        # Get non-existent wrapper
        wrappers = self.wm.get_wrappers(u'some_url', u'non_existent_field')
        self.assertFalse(wrappers, 'Get non-existent wrapper')

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'some_url', u'some_field')
        self.assertEqual(len(wrappers), 2)
        wrappers = self.wm.get_wrappers(u'some_url', u'some_other_field')
        self.assertEqual(len(wrappers), 1)

    def test_persist_and_update_wrapper(self):
        """Vote and rule changes survive update_wrapper and a re-read."""
        wrapper = Wrapper()
        wrapper.add_rule(MockRule01(33))
        wrapper.add_rule(MockRule01(55))
        wrapper.add_rule(MockRule02([1, [2, 3, 4, 5], 6]))
        self.wm.persist_wrapper(u'concrete_url', u'concrete_field', wrapper)

        # Get wrappers
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)

        # Update wrapper
        wrapper = wrappers[0]
        wrapper.upvotes += 1
        wrapper.downvotes -= 1
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].upvotes, 1)
        self.assertEqual(wrappers[0].downvotes, -1)

        # Update wrapper rules
        wrapper = wrappers[0]
        wrapper.rules[0].pattern = 223
        wrapper.rules[2].pattern = [1, 6]
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(wrappers[0].rules[0].pattern, 223)
        self.assertEqual(wrappers[0].rules[2].pattern, [1, 6])

        # Add another wrapper rule
        wrapper = wrappers[0]
        wrapper.rules.append(MockRule02([2, 3, 4, 5]))
        self.wm.update_wrapper(wrapper)

        # Get the wrapper again
        wrappers = self.wm.get_wrappers(u'concrete_url', u'concrete_field')
        self.assertEqual(len(wrappers), 1)
        self.assertEqual(len(wrappers[0].rules), 4)
        self.assertEqual(wrappers[0].rules[3].pattern, [2, 3, 4, 5])