Example #1
0
def process_title_response(response):
    """Build a '|'-separated title string from a search response.

    Only the titles present in this response are used (the first page
    of results; pagination is ignored).

    Raises:
        CannotCompleteRequestError: if the response is malformed or
            contains no titles.
    """
    try:
        results = response['query']['search']
        title_string = '|'.join(x['title'] for x in results)
    except KeyError:
        # Missing 'query'/'search' keys, or a result without 'title'.
        raise CannotCompleteRequestError('Cannot produce a valid title string')
    if not title_string:
        # An empty result list joins to '' -- treat that as a failure too.
        raise CannotCompleteRequestError('Cannot produce a valid title string')
    return title_string
Example #2
0
def process_pageid_response(response):
    """Yield each pageid found in a pageid query response.

    Raises:
        CannotCompleteRequestError: if the response lacks the expected
            keys or 'pages' is not a mapping (raised on first iteration,
            since this is a generator).
    """
    try:
        pages = response['query']['pages']
        keys = pages.keys()
    except (KeyError, AttributeError):
        # KeyError: 'query'/'pages' missing; AttributeError: 'pages' is
        # not a dict.  Both handlers were identical, so merge them.
        raise CannotCompleteRequestError(
            'Cannot produce pageid list from response')
    yield from keys
Example #3
0
def process_pageid_response(response):
    """Retrieve pageids from response to query for title string, then yield each.

    Raises:
        CannotCompleteRequestError: if the response lacks the expected
            keys or 'pages' is not a mapping (raised on first iteration,
            since this is a generator).
    """
    try:
        pages = response['query']['pages']
        keys = pages.keys()
    except (KeyError, AttributeError):
        # KeyError: 'query'/'pages' missing; AttributeError: 'pages' is
        # not a dict.  Both handlers were identical, so merge them.
        raise CannotCompleteRequestError(
            'Cannot generate pageids from response')
    yield from keys
Example #4
0
 def search_for_titles(self, query):
     """Run a full-text search for *query* and return the raw JSON response.

     Raises:
         CannotCompleteRequestError: if the hit count is missing from
             the response, or the search returned no results.
     """
     search_params = {
         'format': 'json',
         'action': 'query',
         'list': 'search',
         'srwhat': 'text',
         'srsearch': query.strip(),
     }
     response = self.get_JSON(search_params)
     try:
         total_hits = response['query']['searchinfo']['totalhits']
     except KeyError:
         raise CannotCompleteRequestError(
             'Cannot retrieve number of search hits')
     if total_hits < 1:
         raise CannotCompleteRequestError('Search produced no results')
     return response
Example #5
0
def process_page_stream(stream):
    """Yield only pages with the data needed to build corpus.

    Items whose parsing raises KeyError are skipped; items that parse to
    None are dropped.

    Raises:
        CannotCompleteRequestError: if no item in the stream produced a
            valid page -- raised only after the stream is exhausted.
    """
    found_one = False
    for data in stream:
        try:
            page = process_page_data(data)
        except KeyError:
            # Item is missing required fields -- skip it.
            continue
        if page is not None:
            found_one = True
            yield page
    # BUG FIX: this check used to sit inside the loop, so it raised as
    # soon as the FIRST item failed to parse, instead of only after the
    # whole stream proved invalid (as the error message claims).
    if not found_one:
        raise CannotCompleteRequestError('All parsed pages invalid')
Example #6
0
    def query(self, query):
        """Return generator of (title, text) pairs for a search query.

        Runs the full pipeline: title search -> title string -> pageids
        -> page contents -> cleaned plain-text stream.

        Raises:
            CannotCompleteRequestError: if any request in the pipeline
                fails; the underlying RequestError is chained as the
                cause.
        """
        try:
            title_data = self.search_for_titles(query)
            title_string = process.process_title_response(title_data)
            pageid_response = self.get_pageids_from_title_string(title_string)
            pageids = process.process_pageid_response(pageid_response)
            results = self.get_pages_by_id(pageids)
        except RequestError as e:
            # Can't complete, so bail.  Retry from top level if necessary.
            # Chain the cause so callers can see which request failed.
            raise CannotCompleteRequestError() from e

        text_stream = process.process_page_stream(results)
        # NOTE(review): per the original author's comment, plain_corpus
        # does NOT sanitize for HTML/XSS -- output is not safe to render
        # untrusted.  Confirm before serving this to a browser.
        clean_stream = ((title, process.plain_corpus(text))
                        for title, text in text_stream)
        return clean_stream