def _sort_results(self, results):
    """
    Sorts the results depending on the available wrappers.

    Results whose URL is black-listed, or whose base URL has no wrapper
    available, are discarded. The remaining results are returned as a list
    ordered by wrapper priority: reference wrappers come first, followed by
    the field wrappers, each group in the order reported by its gateway.
    Results that share the same priority keep the relative order they had
    in `results`.
    """
    # Reference wrappers are placed at the very beginning so that they get
    # the lowest (best) priority values.
    available_wrappers = list(ReferenceWrapper().get_available_wrappers())
    available_wrappers.extend(WrapperGateway().get_available_wrappers())

    # Map each base URL to the position of its first occurrence (the value
    # list.index() would return) so that the loop below does not rescan the
    # whole list for every result.
    # NOTE(review): assumes the wrapper URLs are hashable strings -- confirm.
    priorities = {}
    for position, wrapper_url in enumerate(available_wrappers):
        priorities.setdefault(wrapper_url, position)

    # TODO: Remove this special case. CiteSeerX results get a fixed priority
    # that depends on the number of results instead of on the wrapper list.
    citeseerx_priority = len(results) + 5

    ranked = []
    for order, result in enumerate(results):
        base_url = result.base_url
        if self._in_black_list(result.url):
            continue
        if base_url not in priorities:
            continue

        if base_url.startswith('http://citeseerx'):
            priority = citeseerx_priority
        else:
            priority = priorities[base_url]

        # 'order' is unique, so two entries with the same priority are
        # resolved before the result objects themselves are ever compared.
        ranked.append((priority, order, result))

    ranked.sort()
    return [entry[2] for entry in ranked]
def initialize(self):
    """
    Configures the wizard options, creates the wrapper gateway and adds
    the single 'Manage Wrappers' page.
    """
    # The wizard has neither a cancel button nor a back button on its
    # start page.
    for option in (QtGui.QWizard.NoCancelButton,
                   QtGui.QWizard.NoBackButtonOnStartPage):
        self.setOption(option, True)

    self.wrapper_gw = WrapperGateway()

    self.page01 = WrapperManagerPage('Manage Wrappers', self)
    self.addPage(self.page01)
def initialize(self):
    """
    Registers the default properties of the custom widgets, configures
    the wizard options, creates the wrapper gateway and adds the URL
    chooser and progress pages.
    """
    # (class name, property, change signal) triples registered as default
    # properties of the wizard.
    default_properties = (
        ('FileChooser', 'path', 'pathChanged()'),
        ('QProgressBar', 'value', 'valueChanged(int)'),
    )
    for class_name, prop, signature in default_properties:
        self.setDefaultProperty(class_name, prop, QtCore.SIGNAL(signature))

    for option in (QtGui.QWizard.NoCancelButton,
                   QtGui.QWizard.NoBackButtonOnStartPage,
                   QtGui.QWizard.NoBackButtonOnLastPage):
        self.setOption(option, True)

    self.wrapper_gw = WrapperGateway()

    # NOTE: a third page (FinishedPage) is currently disabled.
    self.page01 = URLChoosePage(self)
    self.page02 = ProgressPage(self)
    for page in (self.page01, self.page02):
        self.addPage(page)
def generate_wrappers(self, url):
    """
    Trains and persists wrappers for every example set available for `url`.

    The example sets are requested from an ExampleGateway configured with
    this object's limits. For each set a WrapperTrainer is created with the
    rulers that apply to that field; the wrappers it generates are pruned
    with `_prune_wrappers` and persisted through the WrapperGateway.

    An error while training or persisting one set is logged and does not
    prevent the remaining sets from being processed.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url,
                                                self.min_validity)

    for field in example_sets:
        log.info('Starting wrapper training for set "%s"' % field) #@UndefinedVariable

        if field in ('author', 'editor'):
            # These two fields get the multi-value and person rulers
            rulers = [MultiValuePathRuler(),
                      SeparatorsRegexRuler(),
                      ElementsRegexRuler(),
                      PersonRuler()]
        else:
            # Fields without a specific value guide accept any value
            try:
                value_guide = self.value_guides[field]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            wrappers = trainer.train(example_sets[field])
            wrappers = self._prune_wrappers(wrappers)
            wrapper_manager.persist_wrappers(url, field, wrappers)
            log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
        except Exception as e:
            # Best effort: keep going with the remaining sets
            log.error('Error training wrapper for set "%s": %s' % (field, e)) #@UndefinedVariable
def _use_rule_wrappers(self, source, page, raw_text):
    """
    Tries to extract a reference from `page` using the ruled wrappers
    stored for the given source.

    For every wrapper collection found for `source`, its wrappers are
    applied in turn until one of them yields valid information. The
    information is validated against `raw_text` with the validator found
    in `self.field_validation` for that field, if there is one; otherwise
    it is considered valid. Wrappers are upvoted or downvoted depending on
    the outcome, and the vote is stored through the wrapper gateway.

    Returns a list containing the extracted reference, or an empty list if
    no field could be set.
    """
    log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
    reference = Reference()
    wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
    wrapper_field_collections = wrapper_manager.find_wrapper_collections(
                                                                    source)

    for collection in wrapper_field_collections:
        # Get the wrappers for the current collection
        url, field = collection.url, collection.field
        wrappers = wrapper_manager.get_wrappers(url, field)
        log.debug('Collection %s:%s has %d wrappers' % (url, field, #@UndefinedVariable
                                                        len(wrappers)))

        # Get field validator
        try:
            validator = self.field_validation[field][1]
        except KeyError:
            validator = None

        # Only these fields are allowed to hold a list of values
        multi_valued = field in ('author', 'editor')

        # Extract information using the wrappers we have
        for wrapper in wrappers:
            info = wrapper.extract_info(page)

            # A list is only acceptable for multi-valued fields; any other
            # field expects 'info' to be a string.
            if isinstance(info, list) and not multi_valued:
                continue

            log.debug('Info extracted by wrapper: %s' % info) #@UndefinedVariable

            valid = validator.validate(info, raw_text) if validator else True

            # Save the extracted info even if it's not correct. It will
            # be overwritten afterwards if necessary
            reference.set_field(field, info, valid)

            if valid:
                log.debug('The extracted information is valid. ' #@UndefinedVariable
                          'Upvoting wrapper')
                wrapper.upvotes += 1
                wrapper_manager.update_wrapper(wrapper)
                # The field is done, move on to the next collection
                break

            log.debug('The extracted information is not valid. ' #@UndefinedVariable
                      'Downvoting wrapper.')
            wrapper.downvotes += 1
            wrapper_manager.update_wrapper(wrapper)

    if len(reference.fields) > 0:
        log.info('Extracted reference') #@UndefinedVariable
        return [reference]

    log.info('Could not extract reference using ruled wrappers') #@UndefinedVariable
    return []
def setUp(self):
    """
    Creates the wrapper gateway under test on top of an in-memory SQLite
    session created with debug enabled.
    """
    session = create_session(sql_uri='sqlite:///:memory:', debug=True)
    self.wm = WrapperGateway(session)
def setUp(self):
    """
    Creates an in-memory SQLite session and a wrapper gateway bound to it.
    Both are kept in the test case instance.
    """
    session = create_session('sqlite:///:memory:', True)
    self.session = session
    self.wg = WrapperGateway(session=session)