def __init__(self, query='', random_agent=False, debug=False):
    """Store the query and debug flag, create the browser and reset
    the paging state.  When random_agent is true, the browser is given
    a randomly chosen user-agent string."""
    self.query = query
    self.debug = debug
    self.browser = Browser(debug=debug)
    self.prepare()
    if random_agent:
        self.browser.set_random_user_agent()
def __init__(self, factory, target_format=ReferenceFormat.BIBTEX,
             max_wrappers=MAX_WRAPPERS, max_examples=MAX_EXAMPLES,
             max_examples_from_db=MAX_EXAMPLES_FROM_DB,
             min_validity=MIN_VALIDITY,
             secs_between_reqs=SECONDS_BETWEEN_REQUESTS,
             wrapper_gen_examples=WRAPPER_GEN_EXAMPLES):
    """Configure the controller: browser, target reference format,
    field validators and the tunable wrapper/example limits (all of
    which default to the module-level constants)."""
    super(IEController, self).__init__(factory)
    self.browser = Browser()
    self.format = target_format
    # Populated by _set_field_validation() right below.
    self.field_validation = {}
    self._set_field_validation()
    self.value_guides = configuration.wrapper_properties['value_guide']
    # Tunable limits controlling wrapper lookup and generation.
    self.max_wrappers = max_wrappers
    self.max_examples = max_examples
    self.max_examples_from_db = max_examples_from_db
    self.min_validity = min_validity
    self.secs_between_reqs = secs_between_reqs
    self.wrapper_gen_examples = wrapper_gen_examples
def _get_content(self, url):
    """
    This method looks for the content of an example's URL. In order
    not to overload the server, it sleeps between multiple calls so
    that at least ``self.seconds_between_requests`` seconds elapse
    between consecutive requests.

    Returns the cleaned page content, or None if the page could not
    be retrieved.

    NOTE(review): assumes ``self.last_request`` was initialised by the
    enclosing class (presumably in its __init__) -- confirm.
    """
    # Fix: use total_seconds() instead of the .seconds attribute;
    # .seconds only holds the sub-day remainder of the delta, so after
    # more than a day of inactivity the old code could sleep needlessly.
    elapsed = (datetime.now() - self.last_request).total_seconds()
    time_to_sleep = self.seconds_between_requests - elapsed
    if time_to_sleep > 0:
        sleep(time_to_sleep)
    content = None
    try:
        content = Browser().get_page(url)
        content = ContentCleaner().clean_content(content)
    except BrowserError as e:
        log.error('Error retrieving page %s: %s' % (url, e.error))
    self.last_request = datetime.now()
    return content
class IEController(Controller):
    """
    Controller that extracts bibliographic references from the pages
    behind a set of search results.

    Extraction is attempted first with hardcoded whole-entry wrappers
    (_use_reference_wrappers) and, failing that, with the learned
    per-field rule wrappers stored in the database (_use_rule_wrappers).
    """

    def __init__(self, factory, target_format=ReferenceFormat.BIBTEX,
                 max_wrappers=MAX_WRAPPERS,
                 max_examples=MAX_EXAMPLES,
                 max_examples_from_db=MAX_EXAMPLES_FROM_DB,
                 min_validity=MIN_VALIDITY,
                 secs_between_reqs=SECONDS_BETWEEN_REQUESTS,
                 wrapper_gen_examples=WRAPPER_GEN_EXAMPLES):
        super(IEController, self).__init__(factory)
        self.browser = Browser()
        self.format = target_format
        # Filled in by _set_field_validation().
        self.field_validation = {}
        self._set_field_validation()
        self.value_guides = configuration.wrapper_properties['value_guide']
        # Tunable limits for wrapper usage and generation.
        self.max_wrappers = max_wrappers
        self.max_examples = max_examples
        self.max_examples_from_db = max_examples_from_db
        self.min_validity = min_validity
        self.secs_between_reqs = secs_between_reqs
        self.wrapper_gen_examples = wrapper_gen_examples

    def extract_reference(self, top_results, raw_text):
        """
        Returns a tuple (references, result): a list of References if
        they can be extracted (empty list otherwise), and the search
        result whose page was used (None when top_results is empty).

        A single publication may need more than a reference (e.g:
        inproceedings and its proceedings)
        """
        log.info('Using %d top results' % len(top_results))
        page = None
        references = []
        # Fix: pre-bind 'result' so the final return does not raise
        # NameError when 'top_results' is empty.
        result = None
        for result in top_results:
            try:
                log.debug('Retrieving page for result %s' % result.url)
                page = self.browser.get_page(result.url)
            except BrowserError as e:
                log.error('Error retrieving page %s: %s' % (result.url,
                                                            e.error))
                continue
            page = ContentCleaner().clean_content(page)
            # Reference wrappers first, rule wrappers as fallback.
            references = self._use_reference_wrappers(result.base_url, page,
                                                      raw_text)
            if not references:
                references = self._use_rule_wrappers(result.base_url, page,
                                                     raw_text)
            if references:
                break
        # Convert to target format, if necessary
        for reference in references:
            self._format_reference(reference)
        # Return the extracted references and the result that has been used
        return (references, result)

    def _use_rule_wrappers(self, source, page, raw_text):
        """
        Look if there is any wrapper in the database for the given
        source.  Returns a one-element list with the extracted Reference
        or an empty list if no field could be extracted.
        """
        log.info('Attempting to extract reference with ruled wrappers')
        fields = {}
        reference = Reference()
        wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
        wrapper_field_collections = wrapper_manager.find_wrapper_collections(
            source)
        for collection in wrapper_field_collections:
            # Get the wrappers for the current collection
            url, field = collection.url, collection.field
            wrappers = wrapper_manager.get_wrappers(url, field)
            log.debug('Collection %s:%s has %d wrappers' % (url, field,
                                                            len(wrappers)))
            # Get field validator
            try:
                validator = self.field_validation[collection.field][1]
            except KeyError:
                validator = None
            # Extract information using the wrappers we have
            for wrapper in wrappers:
                info = wrapper.extract_info(page)
                # We expect 'info' to be a string except for the
                # multi-valued 'author' and 'editor' fields.
                # Fix: isinstance() instead of type() comparison.
                if isinstance(info, list) and not (
                        collection.field == 'author' or
                        collection.field == 'editor'):
                    continue
                log.debug('Info extracted by wrapper: %s' % info)
                valid = (validator.validate(info, raw_text) if validator
                         else True)
                # Save the extracted info even if it's not correct. It will
                # be overwritten afterwards if necessary
                reference.set_field(field, info, valid)
                if not valid:
                    log.debug('The extracted information is not valid. '
                              'Downvoting wrapper.')
                    wrapper.downvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                else:
                    log.debug('The extracted information is valid. '
                              'Upvoting wrapper')
                    wrapper.upvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                    # Stop once a valid value for this field was found.
                    fields[field] = info
                    break
        if len(reference.fields) > 0:
            log.info('Extracted reference')
            return [reference]
        else:
            log.info('Could not extract reference using ruled wrappers')
            return []

    def _use_reference_wrappers(self, source, page, raw_text):
        """
        Use a reference wrapper to get the reference from a given page.
        Returns a list of References with the full entry, format and a
        structure with the different fields.
        A single publication may need more than a reference (e.g:
        inproceedings and its proceedings)
        """
        log.info('Attempting to extract reference with a reference wrapper')
        references = []
        entry, format = ReferenceWrapper().extract_info(source, page)
        if not entry:
            log.debug('Could not find any entry using a reference wrapper')
            return references
        # Create a parser for the given reference format
        try:
            parser = self.util_factory.create_parser(format)
        except UtilCreationError as e:
            log.error('Could not create a parser for %s: %s' % (format,
                                                                e.args))
            return references
        if not parser.check_format(entry):
            log.error('Given entry is not in %s' % format)
            return references
        # There may be more than one entry for the same file.
        log.debug('Parsing extracted entries')
        try:
            entries = parser.split_source(entry)
            for entry in entries:
                fields = parser.parse_entry(entry)
                reference = Reference(fields, format, entry)
                self._validate_reference_fields(reference, raw_text)
                references.append(reference)
        except Exception as e:
            # Fix: Python-2-only 'except Exception, e' normalised to the
            # 'as' form already used elsewhere in this class.
            log.error('Error parsing extracted entry: %s ' % e)
        return references
class ReferenceWrapper(Wrapper):
    """
    Offers methods to extract complete references from some webpages.
    """
    # Maps a supported source URL to the suffix of the '_do_*' method
    # that scrapes it.
    _available_wrappers = {'http://portal.acm.org': 'portal_acm',
                           'http://citeseerx.ist.psu.edu': 'citeseerx'}
    _browser = Browser()

    def extract_info(self, source, page):
        """
        Extracts a reference from the given page.
        Returns a (reference, format) tuple, or (None, None) when no
        wrapper is available for the source.
        """
        # Fix: membership test directly on the dict instead of .keys().
        if source not in self._available_wrappers:
            log.debug('No reference wrapper available for source %s' % source)
            return (None, None)
        wrapper_method = getattr(self,
                                 '_do_' + self._available_wrappers[source])
        return wrapper_method(source, page)

    def get_available_wrappers(self):
        """Returns the source URLs for which a wrapper exists."""
        return self._available_wrappers.keys()

    def _do_portal_acm(self, source, page):
        """
        Searches the page for a link to the reference, and then
        retrieves the reference. Returns a tuple with the full
        reference and its format.
        """
        log.info('Using ACM Portal reference wrapper')
        ref = (None, None)
        # Fix: escape the dot so the pattern only matches the literal
        # 'popBibTex.cfm'.
        anchor = page.find('a', {'onclick': re.compile(r'popBibTex\.cfm')})
        if not anchor:
            return ref
        # The BibTeX URL is the first argument of the window.open()
        # call inside the anchor's onclick handler.
        # Fix: strip('()') -- the old '\(\)' char set also stripped
        # stray backslashes (str.strip takes characters, not a regex).
        jscript = anchor['onclick'].replace('window.open', '').strip('()')
        ref_url = jscript.split(',')[0].strip('\'')
        ref_url = source + '/' + ref_url
        try:
            page = BeautifulSoup(self._browser.get_page(ref_url))
        except BrowserError:
            log.error('Browse error while retrieving entry page')
            return ref
        pre = page.find('pre')
        if not pre:
            return ref
        # As the wrapper has been hardcoded, we already know what will be the
        # format of the reference
        return (pre.find(text=True).strip(), ReferenceFormat.BIBTEX)

    def _do_citeseerx(self, source, page):
        """
        Searches the page for a link to the reference, and then
        retrieves the reference. Returns a tuple with the full
        reference and its format.
        """
        log.info('Using CiteSeerX reference wrapper')
        ref = (None, None)
        try:
            ref_element = page.find('div', {'class': 'content'},
                                    text=re.compile(r'@\w*{'))
            ref_element = ref_element.parent.findAll(text=True)
            reference = ''.join(ref_element)
        except Exception as e:
            # Fix: Python-2-only 'except Exception, e' normalised to
            # the portable 'as' form.
            log.warn('Could not find reference in citeseerx page: %s' % e)
            return ref
        return (reference.strip(), ReferenceFormat.BIBTEX)
class IEController(Controller):
    """
    Information-extraction controller: turns search-result pages into
    bibliographic References.

    Hardcoded whole-entry wrappers are tried first; the database-backed
    per-field rule wrappers act as a fallback.
    """

    def __init__(self, factory, target_format=ReferenceFormat.BIBTEX,
                 max_wrappers=MAX_WRAPPERS,
                 max_examples=MAX_EXAMPLES,
                 max_examples_from_db=MAX_EXAMPLES_FROM_DB,
                 min_validity=MIN_VALIDITY,
                 secs_between_reqs=SECONDS_BETWEEN_REQUESTS,
                 wrapper_gen_examples=WRAPPER_GEN_EXAMPLES):
        super(IEController, self).__init__(factory)
        self.browser = Browser()
        self.format = target_format
        # Set up the per-field validators.
        self.field_validation = {}
        self._set_field_validation()
        self.value_guides = configuration.wrapper_properties['value_guide']
        # Tunable limits; defaults come from the module-level constants.
        self.max_wrappers = max_wrappers
        self.max_examples = max_examples
        self.max_examples_from_db = max_examples_from_db
        self.min_validity = min_validity
        self.secs_between_reqs = secs_between_reqs
        self.wrapper_gen_examples = wrapper_gen_examples

    def extract_reference(self, top_results, raw_text):
        """
        Returns (references, result): a list of References if any could
        be extracted (empty list otherwise) plus the result whose page
        was used, or None if top_results is empty.

        A single publication may need more than a reference (e.g:
        inproceedings and its proceedings)
        """
        log.info('Using %d top results' % len(top_results))
        page = None
        references = []
        # Fix: bind 'result' up front so an empty 'top_results' no
        # longer raises NameError at the final return.
        result = None
        for result in top_results:
            try:
                log.debug('Retrieving page for result %s' % result.url)
                page = self.browser.get_page(result.url)
            except BrowserError as e:
                log.error('Error retrieving page %s: %s' % (result.url,
                                                            e.error))
                continue
            page = ContentCleaner().clean_content(page)
            references = self._use_reference_wrappers(result.base_url, page,
                                                      raw_text)
            if not references:
                references = self._use_rule_wrappers(result.base_url, page,
                                                     raw_text)
            if references:
                break
        # Convert to target format, if necessary
        for reference in references:
            self._format_reference(reference)
        # Return the extracted references and the result that has been used
        return (references, result)

    def _use_rule_wrappers(self, source, page, raw_text):
        """
        Look if there is any wrapper in the database for the given
        source; apply them field by field.  Returns [reference] when at
        least one field was extracted, else [].
        """
        log.info('Attempting to extract reference with ruled wrappers')
        fields = {}
        reference = Reference()
        wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
        wrapper_field_collections = wrapper_manager.find_wrapper_collections(
            source)
        for collection in wrapper_field_collections:
            # Get the wrappers for the current collection
            url, field = collection.url, collection.field
            wrappers = wrapper_manager.get_wrappers(url, field)
            log.debug('Collection %s:%s has %d wrappers' % (url, field,
                                                            len(wrappers)))
            # Get field validator
            try:
                validator = self.field_validation[collection.field][1]
            except KeyError:
                validator = None
            # Extract information using the wrappers we have
            for wrapper in wrappers:
                info = wrapper.extract_info(page)
                # 'info' should be a string except for the multi-valued
                # 'author'/'editor' fields; isinstance() replaces the
                # old type() == list comparison.
                if isinstance(info, list) and not (
                        collection.field == 'author' or
                        collection.field == 'editor'):
                    continue
                log.debug('Info extracted by wrapper: %s' % info)
                valid = (validator.validate(info, raw_text) if validator
                         else True)
                # Save the extracted info even if it's not correct. It will
                # be overwritten afterwards if necessary
                reference.set_field(field, info, valid)
                if not valid:
                    log.debug('The extracted information is not valid. '
                              'Downvoting wrapper.')
                    wrapper.downvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                else:
                    log.debug('The extracted information is valid. '
                              'Upvoting wrapper')
                    wrapper.upvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                    # A valid value ends the search for this field.
                    fields[field] = info
                    break
        if len(reference.fields) > 0:
            log.info('Extracted reference')
            return [reference]
        else:
            log.info('Could not extract reference using ruled wrappers')
            return []

    def _use_reference_wrappers(self, source, page, raw_text):
        """
        Use a reference wrapper to get the reference from a given page.
        Returns a list of References with the full entry, format and a
        structure with the different fields.
        A single publication may need more than a reference (e.g:
        inproceedings and its proceedings)
        """
        log.info('Attempting to extract reference with a reference wrapper')
        references = []
        entry, format = ReferenceWrapper().extract_info(source, page)
        if not entry:
            log.debug('Could not find any entry using a reference wrapper')
            return references
        # Create a parser for the given reference format
        try:
            parser = self.util_factory.create_parser(format)
        except UtilCreationError as e:
            log.error('Could not create a parser for %s: %s' % (format,
                                                                e.args))
            return references
        if not parser.check_format(entry):
            log.error('Given entry is not in %s' % format)
            return references
        # There may be more than one entry for the same file.
        log.debug('Parsing extracted entries')
        try:
            entries = parser.split_source(entry)
            for entry in entries:
                fields = parser.parse_entry(entry)
                reference = Reference(fields, format, entry)
                self._validate_reference_fields(reference, raw_text)
                references.append(reference)
        except Exception as e:
            # Fix: 'except Exception, e' (Python-2-only) normalised to
            # the 'as' form used by the rest of the class.
            log.error('Error parsing extracted entry: %s ' % e)
        return references
class Searcher(object):
    """
    Base class for searching with a search engine.

    Subclasses must provide 'search_engine_url' plus the page-scraping
    hooks '_extract_info' and '_extract_results'.
    """
    # Engine identifiers.
    GOOGLE = 0
    SCHOLAR = 1
    BING = 2
    YAHOO = 3

    def __init__(self, query='', random_agent=False, debug=False):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self.prepare()
        if random_agent:
            self.browser.set_random_user_agent()

    def prepare(self):
        """Resets the pagination state; called when the query changes."""
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._results_per_page = 30
        self._last_from = 0

    def get_query(self):
        return self.__query

    def set_query(self, value):
        # Changing the query restarts pagination.
        self.__query = value
        self.prepare()

    query = property(get_query, set_query)

    @property
    def num_results(self):
        """Total results the engine reports for the query (fetches the
        first page on demand)."""
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    @property
    def search_engine_url(self):
        """URL template with %(query)s/%(start)d/%(num)d placeholders;
        must be supplied by subclasses."""
        raise NotImplementedError()

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        # Fix: renamed from '_set_results_par_page' (typo); the method
        # is private and only referenced by the property below.
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page,
                                _set_results_per_page)

    def get_results(self):
        """ Gets a page of results; returns [] once results are
        exhausted. """
        if self.eor:
            return []
        page = self._get_results_page()
        search_info = self._extract_info(page)
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        results = self._extract_results(page)
        if not results:
            self.eor = True
            return []
        # The engine repeating the same offset means we are looping.
        if self._page > 0 and search_info['from'] == self._last_from:
            self.eor = True
            return []
        if search_info['to'] == search_info['total']:
            self.eor = True
        self._page += 1
        self._last_from = search_info['from']
        return results

    def _maybe_raise(self, cls, *arg):
        # In debug mode scraping problems raise; otherwise they are
        # silently ignored.
        if self.debug:
            raise cls(*arg)

    def _get_safe_url(self):
        return self.search_engine_url % {
            'query': urllib.quote_plus(self.query),
            'start': self._page * self._results_per_page,
            'num': self._results_per_page}

    def _get_results_page(self):
        # NOTE: a randomized pre-search delay used to live here but was
        # already disabled (commented out); the dead code was removed.
        safe_url = self._get_safe_url()
        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            # Fix: Python-2-only 'except BrowserError, e' syntax
            # normalised to the portable 'as' form.
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page)
class Searcher(object):
    """
    Base class for searching with a search engine.

    Concrete engines define 'search_engine_url' and implement the
    scraping hooks '_extract_info' / '_extract_results'.
    """
    # Supported engine identifiers.
    GOOGLE = 0
    SCHOLAR = 1
    BING = 2
    YAHOO = 3

    def __init__(self, query='', random_agent=False, debug=False):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self.prepare()
        if random_agent:
            self.browser.set_random_user_agent()

    def prepare(self):
        """Resets paging bookkeeping for a fresh query."""
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._results_per_page = 30
        self._last_from = 0

    def get_query(self):
        return self.__query

    def set_query(self, value):
        # A new query invalidates any previous pagination state.
        self.__query = value
        self.prepare()

    query = property(get_query, set_query)

    @property
    def num_results(self):
        """Total result count reported by the engine; lazily fetches
        the first page."""
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    @property
    def search_engine_url(self):
        """Engine URL template; subclasses must override."""
        raise NotImplementedError()

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        # Fix: typo 'par' -> 'per' in this private setter's name; only
        # the property definition below references it.
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page,
                                _set_results_per_page)

    def get_results(self):
        """ Gets a page of results, or [] when exhausted. """
        if self.eor:
            return []
        page = self._get_results_page()
        search_info = self._extract_info(page)
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        results = self._extract_results(page)
        if not results:
            self.eor = True
            return []
        # Same offset twice in a row: the engine is looping, stop.
        if self._page > 0 and search_info['from'] == self._last_from:
            self.eor = True
            return []
        if search_info['to'] == search_info['total']:
            self.eor = True
        self._page += 1
        self._last_from = search_info['from']
        return results

    def _maybe_raise(self, cls, *arg):
        # Only raise scraping errors when running in debug mode.
        if self.debug:
            raise cls(*arg)

    def _get_safe_url(self):
        return self.search_engine_url % {
            'query': urllib.quote_plus(self.query),
            'start': self._page * self._results_per_page,
            'num': self._results_per_page}

    def _get_results_page(self):
        # NOTE: the randomized pre-search sleep that used to be here was
        # already commented out; the dead code has been dropped.
        safe_url = self._get_safe_url()
        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            # Fix: replaced Python-2-only 'except BrowserError, e' with
            # the portable 'as' form.
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page)