def extract_fields_from_html(self, data):
    """Parse an HTML document and return the fields extracted from it.

    The text is taken from the first ``body > div`` element. Before the
    text is read, the ``#metaarea`` element, everything that follows a
    ``<section>`` up to its next ``<section>`` sibling, every ``.add``
    element and the first ``.hgroup`` element are dropped from the tree.

    Args:
        data: HTML markup handed to ``lxml.html.fromstring``.

    Returns:
        A dict with the keys ``content``, ``author``, ``heading`` (itself a
        dict with ``title``, ``division`` and ``subhead``) and ``boost``.
        When the document has no ``.hgroup`` element, or that element has
        no children, ``title`` and ``division`` are ``''`` and ``subhead``
        is ``[]``.

    Raises:
        ValueError: if the document has no ``body > div`` element.
    """
    root = lxml.html.fromstring(data, parser=self.htmlparser)
    text = root.find('body/div')
    if text is None:
        raise ValueError("Structure of html is not body > div")

    # NOTE(review): ``author`` stays an empty list when there is no
    # #metaarea but becomes a space-joined string when there is one. The
    # mixed type is kept so existing consumers see the same values.
    author = []
    metaarea = root.cssselect('#metaarea')
    if metaarea:
        # ``unique`` is defined elsewhere; presumably it removes repeated
        # author names -- confirm against its definition.
        author = ' '.join(
            unique(e.text_content()
                   for e in metaarea[0].cssselect('.author')))
        metaarea[0].drop_tree()

    # For every <section>, drop the siblings that follow it, stopping at
    # the next <section> sibling.
    for section in root.iter('section'):
        for sib in section.itersiblings():
            if sib.tag == 'section':
                break
            sib.drop_tree()

    # Prefix each paragraph's tail with a blank line so paragraphs stay
    # separated once the tree is flattened by text_content().
    for p in root.iter('p'):
        p.tail = '\n\n' + (p.tail or '')

    for e in root.cssselect('.add'):
        e.drop_tree()

    # The first child of the heading group is the division, the last one
    # is the title and anything in between is a sub-heading. A document
    # without a heading group (or with an empty one) yields empty heading
    # fields instead of failing with an IndexError.
    title = None
    division = None
    others = []
    hgroups = text.cssselect('.hgroup')
    if hgroups:
        hgroup = hgroups[0]
        if len(hgroup):
            division = hgroup[0]
            title = hgroup[-1]
            if title is division:
                # A single child is the title; there is no division.
                division = None
            others = hgroup[1:-1]
        # Remove the heading from the tree so it is not part of ``content``.
        hgroup.drop_tree()

    content = self.fix_text(text.text_content())
    title = self.fix_text(title.text_content()) if title is not None else ''
    division = (self.fix_text(division.text_content())
                if division is not None else '')

    return {
        'content': content,
        'author': author,
        'heading': {
            'title': title,
            'division': division,
            'subhead': [e.text_content().strip() for e in others],
        },
        'boost': self.boost_factor(content),
    }
def extract_fields_from_html(self, data):
    """Parse an HTML document and return the fields extracted from it.

    The text is taken from the first ``body > div`` element. Before the
    text is read, the ``#metaarea`` element, everything that follows a
    ``<section>`` up to its next ``<section>`` sibling, every ``.add``
    element and the first ``.hgroup`` element are dropped from the tree.

    Args:
        data: HTML markup handed to ``lxml.html.fromstring``.

    Returns:
        A dict with the keys ``content``, ``author``, ``heading`` (itself a
        dict with ``title``, ``division`` and ``subhead``) and ``boost``.
        When the document has no ``.hgroup`` element, or that element has
        no children, ``title`` and ``division`` are ``''`` and ``subhead``
        is ``[]``.

    Raises:
        ValueError: if the document has no ``body > div`` element.
    """
    root = lxml.html.fromstring(data, parser=self.htmlparser)
    text = root.find('body/div')
    if text is None:
        raise ValueError("Structure of html is not body > div")

    # NOTE(review): ``author`` stays an empty list when there is no
    # #metaarea but becomes a space-joined string when there is one. The
    # mixed type is kept so existing consumers see the same values.
    author = []
    metaarea = root.cssselect('#metaarea')
    if metaarea:
        # ``unique`` is defined elsewhere; presumably it removes repeated
        # author names -- confirm against its definition.
        author = ' '.join(
            unique(e.text_content()
                   for e in metaarea[0].cssselect('.author')))
        metaarea[0].drop_tree()

    # For every <section>, drop the siblings that follow it, stopping at
    # the next <section> sibling.
    for section in root.iter('section'):
        for sib in section.itersiblings():
            if sib.tag == 'section':
                break
            sib.drop_tree()

    # Prefix each paragraph's tail with a blank line so paragraphs stay
    # separated once the tree is flattened by text_content().
    for p in root.iter('p'):
        p.tail = '\n\n' + (p.tail or '')

    for e in root.cssselect('.add'):
        e.drop_tree()

    # The first child of the heading group is the division, the last one
    # is the title and anything in between is a sub-heading. A document
    # without a heading group (or with an empty one) yields empty heading
    # fields instead of failing with an IndexError.
    title = None
    division = None
    others = []
    hgroups = text.cssselect('.hgroup')
    if hgroups:
        hgroup = hgroups[0]
        if len(hgroup):
            division = hgroup[0]
            title = hgroup[-1]
            if title is division:
                # A single child is the title; there is no division.
                division = None
            others = hgroup[1:-1]
        # Remove the heading from the tree so it is not part of ``content``.
        hgroup.drop_tree()

    content = self.fix_text(text.text_content())
    title = self.fix_text(title.text_content()) if title is not None else ''
    division = (self.fix_text(division.text_content())
                if division is not None else '')

    return {
        'content': content,
        'author': author,
        'heading': {
            'title': title,
            'division': division,
            'subhead': [e.text_content().strip() for e in others],
        },
        'boost': self.boost_factor(content),
    }
def suggest_pi_dict(word):
    """Return dictionary suggestions for *word*, memoised in ``cache["pi"]``.

    On a cache miss, ``es.search`` is called with a term query matching
    ``asciify(word)`` against the ``term.folded`` field. Each hit's
    ``_source.term`` is title-cased when *word* itself is title-cased,
    terms containing a space are discarded, and the remainder is passed
    through ``unique`` (defined elsewhere).

    Returns:
        A list of ``(term, 1)`` tuples, as stored in the cache.
    """
    memo = cache["pi"]
    if word in memo:
        return memo[word]

    query = {"query": {"term": {"term.folded": asciify(word)}}}
    capitalised = word.istitle()
    response = es.search('en-dict', 'definition', query)
    found = [hit['_source']['term'] for hit in response['hits']['hits']]

    # Mirror the capitalisation of the input, then keep single words only.
    normalised = (t.title() if capitalised else t for t in found)
    single_words = (t for t in normalised if ' ' not in t)

    memo[word] = [(t, 1) for t in unique(single_words)]
    return memo[word]