def process(self, method, data):
    """Fetch a document by URL or search term and build the response payload.

    Args:
        method: How to locate the document; must be 'url' or 'term'.
        data: The search term (when method == 'term') or a URL-encoded
            URL to fetch (when method == 'url').

    Returns:
        A dict with 'terms', 'cards', 'shmoop', and 'related' keys, or
        None after signalling a 500 when no document could be obtained.

    Raises:
        ValueError: If `method` is not 'url' or 'term'.
    """
    if method not in ['url', 'term']:
        raise ValueError("method must be one of 'url', 'term'")
    document = None
    if method == 'term':
        page_name = wikipedia.get_page_for_query(data)
        document = wikipedia.fetch_page((None, page_name, None))
    else:
        result = urlfetch.fetch(urllib.unquote(data), deadline=20)
        if result.status_code == 200:
            document = etree.fromstring(result.content, parser=self.parser)
    if document is None:
        self.error(500)
        # BUG FIX: previously fell through after setting the error status
        # and passed None to wikipedia.get_relevant_pages(); abort here.
        return None
    # document_content = document.cssselect('#mw-content-text').text()
    # TODO: Extract terms
    # TODO: Extract definitions
    relevant_pages = wikipedia.get_relevant_pages(document)
    data = {
        # TODO: populate 'terms', 'cards', and 'shmoop' once extraction
        # above is implemented.
        'terms': [],
        'cards': [],
        'shmoop': [],
        'related': [{'url': wikipedia.page_name_to_link(page),
                     'name': page[1]}
                    for page in relevant_pages]
    }
    return data
def get_definition(term, document=None):
    """Get a one-sentence definition for a term.

    Args:
        term: The term to define.
        document: Optional `lxml` document in which the term was first
            found (and thus where a definition may also be found). When
            provided it is used directly; previously this argument was
            silently ignored and a fresh page was always fetched.

    Returns:
        The first sentence of the article's lead paragraph (with
        parenthetical asides stripped), or None when no document could
        be obtained.
    """
    if document is None:
        # First stab: try to fetch a relevant article
        page_name = wikipedia.get_page_for_query(term)
        document = wikipedia.fetch_page((None, page_name, None))
    if document is None:
        # TODO: Other extraction method
        return
    content_el = document.xpath('//div[@id="mw-content-text"]/p[1]')[0]
    text_content = etree.tostring(content_el, method='text', encoding='utf-8')
    text_content = PARENTHETICAL_EXPRESSION.sub(r'\1', text_content)
    # Grab first sentence.
    # TODO: Better sentence segmentation
    sentence = text_content.split('.')[0]
    return sentence.strip() + '.'