def process_title_response(response):
    """Build a pipe-separated title string from a search response.

    Every entry under ``response['query']['search']`` contributes its
    ``'title'``. Pagination is not followed: only the titles present in this
    one response are used (presumably the first ten hits returned by the
    API -- confirm against the request parameters).

    Args:
        response: Decoded search response. Expected to be a mapping holding
            ``query`` -> ``search``, a list of mappings with a ``title``.

    Returns:
        The titles joined with ``'|'``.

    Raises:
        CannotCompleteRequestError: If expected keys are missing, a value has
            the wrong type, or the resulting title string is empty.
    """
    message = 'Cannot produce a valid title string'
    try:
        results = response['query']['search']
        # Join inside the try block: a missing or non-string 'title' only
        # surfaces while the titles are actually being consumed.
        title_string = '|'.join(entry['title'] for entry in results)
    except (KeyError, TypeError):
        # Malformed response: absent keys (KeyError) or a value that is not
        # the expected mapping/list/string (TypeError).
        raise CannotCompleteRequestError(message)
    if not title_string:
        # No hits at all, or a single empty title.
        raise CannotCompleteRequestError(message)
    return title_string
def process_pageid_response(response):
    """Yield each key of ``response['query']['pages']``.

    This is a generator function: the response is only inspected, and any
    error only raised, once iteration starts -- not when the function is
    called.

    NOTE(review): a second definition of ``process_pageid_response`` follows
    this one in the source. If both live in the same module the later one
    replaces this one at import time -- confirm which is intended.

    Args:
        response: Decoded response. Expected to be a mapping holding
            ``query`` -> ``pages``, itself a mapping (presumably keyed by
            page id -- verify against the API response format).

    Yields:
        Each key of the ``pages`` mapping.

    Raises:
        CannotCompleteRequestError: On first iteration, if expected keys are
            missing or a value does not have the expected type.
    """
    try:
        keys = response['query']['pages'].keys()
    except (KeyError, AttributeError, TypeError):
        # KeyError: a level is missing. AttributeError: 'pages' is not a
        # mapping. TypeError: 'response' or 'query' is not subscriptable by
        # string (e.g. None or a list).
        raise CannotCompleteRequestError(
            'Cannot produce pageid list from response')
    for key in keys:
        yield key
def process_pageid_response(response):
    """Retrieve pageids from response to query for title string, then yield each.

    This is a generator function: the response is only inspected, and any
    error only raised, once iteration starts -- not when the function is
    called.

    Args:
        response: Decoded response. Expected to be a mapping holding
            ``query`` -> ``pages``, itself a mapping (presumably keyed by
            page id -- verify against the API response format).

    Yields:
        Each key of the ``pages`` mapping.

    Raises:
        CannotCompleteRequestError: On first iteration, if expected keys are
            missing or a value does not have the expected type.
    """
    try:
        keys = response['query']['pages'].keys()
    except (KeyError, AttributeError, TypeError):
        # KeyError: a level is missing. AttributeError: 'pages' is not a
        # mapping. TypeError: 'response' or 'query' is not subscriptable by
        # string (e.g. None or a list).
        raise CannotCompleteRequestError(
            'Cannot generate pageids from response')
    for key in keys:
        yield key
def search_for_titles(self, query):
    """Run a text search for ``query`` and return the raw JSON response.

    The request is sent through ``self.get_JSON`` with ``list=search`` and
    ``srwhat=text``; the query is stripped of surrounding whitespace first.
    (The parameter names look like the MediaWiki search API -- confirm.)

    Args:
        query: Search string.

    Returns:
        Whatever ``self.get_JSON`` returned, unchanged, provided it reports
        at least one hit.

    Raises:
        CannotCompleteRequestError: If the hit count cannot be read from the
            response, or if it is below one.
    """
    search_params = dict(
        format='json',
        action='query',
        list='search',
        srwhat='text',
        srsearch=query.strip(),
    )
    response = self.get_JSON(search_params)
    try:
        total_hits = response['query']['searchinfo']['totalhits']
    except KeyError:
        raise CannotCompleteRequestError(
            'Cannot retrieve number of search hits')
    else:
        if total_hits < 1:
            raise CannotCompleteRequestError('Search produced no results')
    return response
def process_page_stream(stream):
    """Yield only the pages that ``process_page_data`` can turn into a result.

    Items for which ``process_page_data`` raises ``KeyError`` or returns
    ``None`` are dropped silently. Because this is a generator, the final
    check runs only after ``stream`` has been fully consumed.

    Args:
        stream: Iterable of raw page data items.

    Yields:
        Each non-``None`` value returned by ``process_page_data``.

    Raises:
        CannotCompleteRequestError: After the stream is exhausted, if not a
            single page was yielded.
    """
    yielded_any = False
    for raw_page in stream:
        try:
            parsed = process_page_data(raw_page)
        except KeyError:
            # Page lacks a required field; skip it and keep going.
            continue
        if parsed is None:
            continue
        yielded_any = True
        yield parsed
    if not yielded_any:
        raise CannotCompleteRequestError('All parsed pages invalid')
def query(self, query):
    """Return a generator of (title, text) pairs for a search query.

    Steps: search for titles, join them into a title string, request the
    page ids for that string, fetch those pages, then filter the results
    through ``process.process_page_stream`` and pass each text through
    ``process.plain_corpus``.

    The returned generator is lazy: ``process.plain_corpus`` runs as items
    are consumed, and errors raised by the page stream surface at that
    point rather than inside this call.

    Args:
        query: Search string.

    Returns:
        A generator of ``(title, processed_text)`` tuples.

    Raises:
        CannotCompleteRequestError: If any lookup step raises
            ``RequestError``; the original error's message is carried over.
    """
    try:
        title_data = self.search_for_titles(query)
        title_string = process.process_title_response(title_data)
        pageid_response = self.get_pageids_from_title_string(title_string)
        pageids = process.process_pageid_response(pageid_response)
        results = self.get_pages_by_id(pageids)
    except RequestError as e:
        # Can't complete, so bail. Retry from top level if necessary.
        # Forward the underlying message so the caller can tell which step
        # failed instead of receiving an empty exception.
        raise CannotCompleteRequestError(str(e))
    text_stream = process.process_page_stream(results)
    clean_stream = ((title, process.plain_corpus(text))
                    for title, text in text_stream)
    # SECURITY NOTE: despite the name, this stream is NOT sanitized. The
    # original author flagged an unfixed XSS risk from page content (e.g.
    # WikiHow); escape the text before rendering it as HTML.
    return clean_stream