def scrape_meaning(self, page):
    """Return the meaning text extracted from the page HTML."""
    raw = Utils.text_between(page, *TAG_MEANING, force_html=True)
    stripped = Utils.remove_tags(raw)
    return Utils.remove_spaces(stripped)
def first_synonym(self, html):
    """ Return the first synonym found and html without his marking. """
    open_tag, close_tag = TAG_SYNONYMS_DELIMITER
    raw = Utils.text_between(html, open_tag, close_tag, force_html=True)
    # Strip the consumed delimiter pair so the caller can scan for the next one.
    remainder = html.replace(open_tag, "", 1).replace(close_tag, "", 1)
    return Word(Utils.remove_spaces(raw)), remainder
def scrape_examples(self, page):
    """ Return a list of examples. """
    open_tag = TAG_PHRASE_DELIMITER[0]
    examples = []
    remaining = page
    position = remaining.find(open_tag)
    while position > -1:
        raw = Utils.text_between(remaining, *TAG_PHRASE_DELIMITER, force_html=True)
        examples.append(Utils.remove_spaces(Utils.remove_tags(raw)))
        # Advance past the opening tag we just consumed and look again.
        remaining = remaining[position + len(open_tag):]
        position = remaining.find(open_tag)
    return examples
def synonyms(self, page):
    """ Return list of synonyms. """
    open_tag, close_tag = TAG_SYNONYMS_DELIMITER
    found = []
    if page.find(TAG_SYNONYMS[0]) > -1:
        chunk = Utils.text_between(page, TAG_SYNONYMS[0], TAG_SYNONYMS[1], True)
        # Consume one delimiter pair per iteration until none remain.
        while chunk.find(open_tag) > -1:
            raw = Utils.text_between(chunk, open_tag, close_tag, True)
            found.append(Word(Utils.remove_spaces(raw)))
            chunk = chunk.replace(open_tag, "", 1).replace(close_tag, "", 1)
    return found
def __init__(self, word, meaning=None, synonyms=None, examples=None, extra=None):
    """Initialize a word entry.

    :param word: the word itself; stored stripped and lower-cased.
    :param meaning: optional meaning text.
    :param synonyms: optional list of synonym Words (defaults to a fresh list).
    :param examples: optional list of usage examples (defaults to a fresh list).
    :param extra: optional dict of extra info (defaults to a fresh dict).
    """
    self.word = word.strip().lower()
    self.url = BASE_URL.format(Utils.remove_accents(self.word))
    self.meaning = meaning
    # Mutable defaults must not be shared across instances: create
    # a fresh container per instance when the caller passes nothing.
    self.synonyms = [] if synonyms is None else synonyms
    self.extra = {} if extra is None else extra
    self.examples = [] if examples is None else examples
def scrape_extra(self, page):
    """ Return a dictionary of extra information.

    Rows that do not parse as "key: value" are skipped individually,
    instead of one bad row silently discarding every extra entry.
    """
    dict_extra = {}
    if page.find(TAG_EXTRA[0]) == -1:
        return dict_extra
    try:
        html = Utils.text_between(page, *TAG_EXTRA, force_html=True)
        extra_rows = Utils.split_html_tag(Utils.remove_spaces(html), TAG_EXTRA_SEP)
    except Exception:
        # Best-effort: malformed section yields no extras. Narrowed from a
        # bare except so KeyboardInterrupt/SystemExit still propagate.
        return dict_extra
    for row in extra_rows:
        text = Utils.remove_tags(row)
        # partition() tolerates values that themselves contain ':' —
        # the old split(':') unpack raised on those rows.
        key, sep, value = text.partition(":")
        if sep:
            dict_extra[Utils.remove_spaces(key)] = Utils.remove_spaces(value)
    return dict_extra
def scrape_synonyms(self, page):
    """ Return list of synonyms. """
    found = []
    if page.find(TAG_SYNONYMS[0]) == -1:
        return found
    chunk = Utils.text_between(page, *TAG_SYNONYMS, force_html=True)
    # first_synonym() strips each delimiter pair as it is consumed,
    # so the loop terminates when no opening delimiter remains.
    while chunk.find(TAG_SYNONYMS_DELIMITER[0]) > -1:
        word, chunk = self.first_synonym(chunk)
        found.append(word)
    return found
def search(self, word):
    """ Search for word.

    :param word: a single word; multi-word queries return None.
    :returns: a Word populated from the scraped page, or None when the
        lookup fails.
    """
    if len(word.split()) > 1:
        return None
    _word = Utils.remove_accents(word).strip().lower()
    try:
        with self.get(BASE_URL.format(_word)) as request:
            page = html.unescape(request.read().decode(CHARSET))
    except Exception:
        # Network/decoding failure means "not found". Narrowed from a bare
        # except so KeyboardInterrupt/SystemExit are not swallowed.
        return None
    return Word(
        Utils.text_between(page, "<h1", "</h1>", force_html=True).lower(),
        meaning=self.scrape_meaning(page),
        synonyms=self.scrape_synonyms(page),
        examples=self.scrape_examples(page),
        extra=self.scrape_extra(page),
    )
def scrape_meaning(self, page):
    """ Return meaning and etymology. """
    section = Utils.text_between(page, *TAG_MEANING, force_html=True)
    etymology_html = Utils.text_between(section, *TAG_ETYMOLOGY, force_html=True)
    etymology = Utils.remove_spaces(Utils.remove_tags(etymology_html))
    cleaned = [
        Utils.remove_spaces(Utils.remove_tags(part))
        for part in Utils.split_html_tag(section, 'br')
    ]
    # The etymology line sits inside the same section; keep only the
    # real meaning entries.
    meaning = [entry for entry in cleaned if entry != etymology]
    return meaning, etymology
def search(self, word):
    """ Search for word.

    :param word: a single word; multi-word queries return None.
    :returns: a Word with meaning, synonyms and extra filled from the
        scraped page, or None when the lookup fails.
    """
    if len(word.split()) > 1:
        return None
    _word = Utils.remove_accents(word).strip().lower()
    try:
        with self.get(BASE_URL.format(_word)) as request:
            page = html.unescape(request.read().decode(CHARSET))
    except Exception:
        # Network/decoding failure means "not found". Narrowed from a bare
        # except so KeyboardInterrupt/SystemExit are not swallowed.
        return None
    found = Word(word)
    found.meaning = self.scrape_meaning(page)
    found.synonyms = self.scrape_synonyms(page)
    found.extra = self.scrape_extra(page)
    return found
def extra(self, page):
    """ Return a dictionary of extra information.

    Rows that do not parse as "key: value" are skipped individually,
    instead of one bad row silently discarding every extra entry.
    """
    dic_extra = {}
    if page.find(TAG_EXTRA[0]) == -1:
        return dic_extra
    try:
        extra_html = Utils.text_between(page, TAG_EXTRA[0], TAG_EXTRA[1], True)
        extra_rows = Utils.split_html_tag(Utils.remove_spaces(extra_html), TAG_EXTRA_SEP)
    except Exception:
        # Best-effort: malformed section yields no extras. Narrowed from a
        # bare except so KeyboardInterrupt/SystemExit still propagate.
        return dic_extra
    for row in extra_rows:
        text = Utils.remove_tags(row)
        # partition() tolerates values that themselves contain ':' —
        # the old split(':') unpack raised on those rows.
        key, sep, value = text.partition(":")
        if sep:
            dic_extra[Utils.remove_spaces(key)] = Utils.remove_spaces(value)
    return dic_extra
def search(self, word):
    """ Search for word.

    :param word: a single word; multi-word queries return None.
    :returns: a Word with meaning, synonyms and extra filled from the
        scraped page, or None when the lookup fails or the site serves
        its "did you mean" (enchant) page.
    """
    if len(word.split()) > 1:
        return None
    _word = Utils.remove_accents(word).strip().lower()
    try:
        # Context manager closes the connection (the old code leaked it),
        # and reading/decoding inside the try is covered by the handler.
        with request.urlopen(BASE_URL.format(_word)) as response:
            page = html.unescape(response.read().decode(CHARSET))
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed.
        return None
    if page.find(TAG_ENCHANT[0]) > -1:
        return None
    found = Word(word)
    found.meaning = self.meaning(page)
    found.synonyms = self.synonyms(page)
    found.extra = self.extra(page)
    return found
def meaning(self, page):
    """ Return meaning. """
    raw = Utils.text_between(page, TAG_MEANING[0], TAG_MEANING[1], True)
    without_tags = Utils.remove_tags(raw)
    return Utils.remove_spaces(without_tags)
def __init__(self, word, meaning=None, synonyms=None, extra=None):
    """Initialize a word entry.

    :param word: the word itself; stored stripped and lower-cased.
    :param meaning: optional meaning text.
    :param synonyms: optional list of synonym Words (defaults to a fresh list).
    :param extra: optional dict of extra info (defaults to a fresh dict).
    """
    self.word = word.strip().lower()
    self.url = BASE_URL.format(Utils.remove_accents(word).strip().lower())
    self.meaning = meaning
    # Mutable defaults must not be shared across instances: create
    # a fresh container per instance when the caller passes nothing.
    self.synonyms = [] if synonyms is None else synonyms
    self.extra = {} if extra is None else extra