def search_async(self, search_string, callback, error_callback):
    '''
    Start a non-blocking search for `search_string`.

    The request is built by the subclass (`prepare_search_message`) and
    queued on `soup_session`; once the response arrives it is handed to
    `handle_response_received` together with `callback` and
    `error_callback`.

    The search results that finally reach the callback are a list with one
    dictionary per hit, holding whatever could be extracted from the
    webpage, for example::

        [{'title': 'A paper title',
          'authors': ['Author A', 'Author B']},
         {'title': 'Another paper',
          'authors': ['Author C'],
          'import_url': 'http://example.com/paper.pdf'}]

    A hit may additionally carry arbitrary provider-specific information
    under the key 'data', e.g. the complete HTML code of the search result,
    which can be useful when the paper is imported later on.

    Any exception raised while preparing or queueing the request is reported
    as ``error_callback(exception, search_string)`` instead of being
    propagated to the caller.
    '''
    def on_response(session, response, user_data):
        self.handle_response_received(response, callback, error_callback)

    try:
        # How the request looks like is decided by the subclass
        request = self.prepare_search_message(search_string)
        soup_session.queue_message(request, on_response, None)
    except Exception as error:
        error_callback(error, search_string)
def import_paper_after_search(self, paper, callback):
    '''
    Import a Google Scholar search result by fetching its BibTeX record.

    `paper.data` is searched for the first ``div`` of class ``gs_fl``; for
    every link in it whose target starts with ``/scholar.bib`` a GET request
    is queued on `soup_session`. Each response is forwarded to
    `self._got_bibtex` together with `callback` and the user data
    (`self.label`).

    Ordinary errors (e.g. a search result without a ``gs_fl`` block) are not
    raised: the traceback is printed and `callback` is not invoked.
    '''
    log_info('Trying to import google scholar citation')

    # Does not depend on the individual link, so it is defined only once
    def bibtex_callback(session, message, user_data):
        self._got_bibtex(message, callback, user_data)

    try:
        data = paper.data
        citations = data.findAll('div', {'class': 'gs_fl'})[0]
        log_debug('Citations: %s' % str(citations))
        for link in citations.findAll('a'):
            log_debug('Link: %s' % str(link))
            # An anchor without a target must not abort the scan before the
            # BibTeX link has been reached, so look the attribute up safely
            href = link.get('href')
            if not href or not href.startswith('/scholar.bib'):
                continue
            log_debug('Found BibTex link: %s' % href)
            message = Soup.Message.new(method='GET',
                                       uri_string=BASE_URL + href)
            # NOTE(review): presumably this cookie is what makes Google
            # Scholar serve BibTeX data -- confirm
            message.request_headers.append(
                'Cookie', 'GSP=ID=%s:CF=4' % self.google_id)
            soup_session.queue_message(message, bibtex_callback, self.label)
            #FIXME: Google scholar does not always seem to include the
            #       URL in the bibtex data -- in this case add a link
    except Exception:
        # Best effort: report the problem, but do not swallow
        # KeyboardInterrupt/SystemExit as a bare `except:` would
        traceback.print_exc()
def _ids_received(self, message, callback, error_callback):
    '''
    Handle the response to the initial Pubmed id query.

    If the status code of `message` is not OK, `error_callback` is called
    with a description of the error. Otherwise the response is parsed: when
    it reports zero hits the method simply returns (neither callback is
    called), else a second request for the summaries is queued, built from
    the query key and web environment found in the response. The answer to
    that request is passed on to `self._summaries_received` together with
    `callback` and `error_callback`.
    '''
    request_ok = message.status_code == Soup.KnownStatusCode.OK
    if not request_ok:
        error_callback('Pubmed replied with error code %d.' %
                       message.status_code)
        return

    raw_response = message.response_body.flatten().get_data()
    search_result = BeautifulSoup.BeautifulStoneSoup(raw_response).esearchresult

    # Without any hits there is nothing left to do
    hit_count = int(search_result.count.string)
    if hit_count == 0:
        return

    # The ids stay on the server; these two values identify them for the
    # follow-up request
    web_env = search_result.webenv.string
    query_key = search_result.querykey.string

    log_debug('Continuing Pubmed query (downloading summaries)')
    summary_url = BASE_URL + ESUMMARY_QUERY % (query_key, web_env)
    summary_request = Soup.Message.new(method='GET', uri_string=summary_url)

    def on_summaries(session, response, user_data):
        self._summaries_received(response, callback, error_callback)

    soup_session.queue_message(summary_request, on_summaries, None)
def import_paper_after_search(self, paper, callback):
    '''
    Request the record for a Pubmed search result.

    `paper.data` is used as the pubmed id. A GET request for that id is
    queued on `soup_session`; the response is passed to
    `self._paper_info_received` together with `callback` and the user data,
    which is the tuple ``(self.label, pubmed_id)``.
    '''
    pubmed_id = paper.data
    log_info('Trying to import pubmed citation with id %s' % pubmed_id)

    def on_record(session, response, user_data):
        self._paper_info_received(response, callback, user_data)

    fetch_request = Soup.Message.new(
        method='GET', uri_string=BASE_URL + EFETCH_QUERY % pubmed_id)
    soup_session.queue_message(fetch_request, on_record,
                               (self.label, pubmed_id))
def search_async(self, search_text, callback, error_callback):
    '''
    Start a Pubmed search for `search_text`.

    Nothing is returned: a request that only asks for the ids of the hits
    (which are saved on the server) is queued on `soup_session`, and its
    response is passed to `self._ids_received` together with `callback` and
    `error_callback`.
    '''
    log_debug('Starting Pubmed query for string "%s"' % search_text)

    def on_ids(session, response, user_data):
        self._ids_received(response, callback, error_callback)

    # The search text becomes part of the URL, so it has to be quoted
    quoted_text = urllib.quote_plus(search_text)
    id_request = Soup.Message.new(method='GET',
                                  uri_string=BASE_URL + ESEARCH_QUERY % quoted_text)
    soup_session.queue_message(id_request, on_ids, None)
def import_paper_after_search(self, paper_obj, callback):
    '''
    This method is called when a search result is requested to be imported.

    The given `paper_obj` is a :class:`VirtualPaper` which has all the
    information previously returned by the search as attributes, e.g.
    `paper_obj.doi` is its DOI. The special attribute `data` should be used
    for information that can be useful for importing the paper, in addition
    to the default paper attributes. For example,
    :class:`GoogleScholarSearch` saves the complete HTML code for a search
    result, which contains a link to BibTeX data and possibly to a PDF
    document.

    If this method is not overridden, it behaves as follows:

    * `paper_obj` has no (or an empty) `import_url`: `callback` is called
      right away as ``callback(paper_obj=paper_obj, user_data=self.label)``.
    * Otherwise the document at `import_url` is downloaded asynchronously
      with ``(self.label, paper_obj.import_url)`` as user data. On success
      `callback` receives `paper_obj`, the downloaded `paper_data` and
      `user_data`; on any other status the error is logged and `callback`
      receives only `paper_obj` and `user_data`.

    In case the search provider does not have any info to add to the initial
    search result, this is all that is needed. In cases where the search
    provider can add more information (e.g. the :class:`PubMedSearch` only
    requests summaries for the search, but when a specific paper is
    requested it gets the full record), this method should be overridden.
    '''
    import_url = getattr(paper_obj, 'import_url', None)
    if not import_url:
        # Nothing to download, hand back the search result as it is
        callback(paper_obj=paper_obj, user_data=self.label)
        return

    provider_name = self.__class__.__name__

    def mycallback(session, message, user_data):
        if message.status_code == Soup.KnownStatusCode.OK:
            paper_data = message.response_body.flatten().get_data()
            callback(paper_obj=paper_obj, paper_data=paper_data,
                     user_data=user_data)
        else:
            # The placeholder used to read "%:", which made the formatting
            # itself raise ValueError, so that the callback below was never
            # reached for a failed download
            log_error("%s: got status %s while trying to fetch PDF" %
                      (provider_name, message.status_code))
            callback(paper_obj=paper_obj, user_data=user_data)

    # in case the paper already had an import URL, download from this URL
    message = Soup.Message.new(method='GET', uri_string=import_url)
    log_debug("%s: trying to fetch %s" % (provider_name, import_url))
    soup_session.queue_message(message, mycallback,
                               (self.label, import_url))
def import_paper_after_search(self, paper, callback):
    '''
    Import a Google Scholar search result by fetching its BibTeX record.

    `paper.data` is searched for the first ``div`` of class ``gs_fl``; for
    every link in it whose target starts with ``/scholar.bib`` a GET request
    is queued on `soup_session`. Each response is forwarded to
    `self._got_bibtex` together with `callback` and the user data
    (`self.label`).

    Ordinary errors (e.g. a search result without a ``gs_fl`` block) are not
    raised: the traceback is printed and `callback` is not invoked.
    '''
    log_info('Trying to import google scholar citation')

    # Does not depend on the individual link, so it is defined only once
    def bibtex_callback(session, message, user_data):
        self._got_bibtex(message, callback, user_data)

    try:
        data = paper.data
        citations = data.findAll('div', {'class': 'gs_fl'})[0]
        log_debug('Citations: %s' % str(citations))
        for link in citations.findAll('a'):
            log_debug('Link: %s' % str(link))
            # An anchor without a target must not abort the scan before the
            # BibTeX link has been reached, so look the attribute up safely
            href = link.get('href')
            if not href or not href.startswith('/scholar.bib'):
                continue
            log_debug('Found BibTex link: %s' % href)
            message = Soup.Message.new(method='GET',
                                       uri_string=BASE_URL + href)
            # NOTE(review): presumably this cookie is what makes Google
            # Scholar serve BibTeX data -- confirm
            message.request_headers.append(
                'Cookie', 'GSP=ID=%s:CF=4' % self.google_id)
            soup_session.queue_message(message, bibtex_callback, self.label)
            #FIXME: Google scholar does not always seem to include the
            #       URL in the bibtex data -- in this case add a link
    except Exception:
        # Best effort: report the problem, but do not swallow
        # KeyboardInterrupt/SystemExit as a bare `except:` would
        traceback.print_exc()