def collect_variants(text, term, replace="_TERM_"):
    """Find all spelling variants of ``term`` that occur in ``text``.

    The text is compared in a normalised form (``unidecode`` followed by
    lower-casing) against the ``squashed`` form of the term.  Up to one
    non-alphanumeric character is tolerated between consecutive characters
    of the squashed term, and an optional trailing plural "s" is matched.
    The plural "s" is stripped from the reported variant unless ``term``
    itself ends in "s".

    >>> text = "I had a Deja-vu, or Déjàvu"
    >>> collect_variants(text, "Déjà Vu")
    returns {"Deja-vu", "Déjàvu"}

    Args:
        text: str -- text in which to search for spelling variants
        term: str -- the term whose variants are wanted
        replace: str -- unused by this function; kept so that existing
            callers passing it keep working

    Returns:
        set -- the variants, spelled exactly as they appear in ``text``.
        Empty when ``term`` is empty or squashes to an empty string.

    *NB: This is used in the output JSON as an additional index within
    Wordnik #TODO
    """
    squashed_term = squashed(term) if term else ""
    if not squashed_term:
        # Nothing to search for (and there is no last character to anchor
        # the pattern on).
        return set()

    # Transliterate character by character and remember which original
    # character produced each normalised character.  unidecode may expand a
    # single character into several (or into nothing), so offsets into the
    # normalised text cannot be used to slice ``text`` directly.
    pieces = []
    origin = []  # origin[i] -> index in ``text`` that produced clean_text[i]
    for index, char in enumerate(text):
        plain = (char if ord(char) < 128 else unidecode(char)).lower()
        pieces.append(plain)
        origin.extend([index] * len(plain))
    clean_text = "".join(pieces)

    # This RE allows for up to one non-letter character between all letters.
    # Characters are escaped so they are always matched literally.
    fuzzy_term = "[^a-z0-9]?".join(re.escape(c) for c in squashed_term)
    term_re = r'\b({})s?\b'.format(fuzzy_term)  # s? for plurals

    strip_plural = not term.lower().endswith("s")
    collected = set()
    for m in re.finditer(term_re, clean_text):
        # Map the match span back onto the original text.  The match is never
        # empty because squashed_term has at least one character.
        variant = text[origin[m.start()]:origin[m.end() - 1] + 1]
        if strip_plural and variant.lower().endswith("s"):
            variant = variant[:-1]
        collected.add(variant)
    return collected
def get_html_features(self, html):
    """Record whether the search term is highlighted or quoted in ``html``.

    "Highlighted" means the squashed term is the sole content (apart from
    spaces and trailing ``,``/``:``) of an ``em``, ``i``, ``b``, ``strong``
    or ``span`` element.  "Quotes" uses the same shape of pattern, with the
    opening alternatives taken from ``self.OPENING_QUOTES`` and the closing
    alternatives from ``self.CLOSING_QUOTES``.

    The result is stored in ``self.features`` as a dict with the boolean
    keys ``"highlighted"`` and ``"quotes"``.  Nothing is stored when
    ``self.term`` is empty.

    Needs to be called after self.request_page.
    """
    if not self.term:
        return None

    # Reduce both sides to the same squashed form; the markup characters
    # needed by the patterns below are kept in the HTML.
    minimal_html = squashed(html, keep='<>/&;')
    minimal_term = squashed(self.term)

    def found(pattern):
        # Case-insensitive search anywhere in the squashed HTML.
        return re.search(pattern, minimal_html, re.IGNORECASE) is not None

    highlight_template = r"<(em|i|b|strong|span)[^>]*> *{}[ ,:]*</\1>"
    highlight_re = highlight_template.format(minimal_term)

    openers = "|".join(self.OPENING_QUOTES)
    closers = "|".join(self.CLOSING_QUOTES)
    quote_template = r"<({})[^>]*> *{}[ ,:]*</({})>"
    quote_re = quote_template.format(openers, minimal_term, closers)

    is_highlighted = found(highlight_re)
    is_quoted = found(quote_re)
    self.features = {
        "highlighted": is_highlighted,
        "quotes": is_quoted,
    }
def get_html_features(self, html):
    """Detect whether the search term is highlighted or quoted in ``html``.

    Sets ``self.features`` to a dict with two boolean entries:

    * ``"highlighted"`` -- the squashed term is wrapped directly in an
      ``em``, ``i``, ``b``, ``strong`` or ``span`` element.
    * ``"quotes"`` -- the squashed term matches the equivalent pattern built
      from ``self.OPENING_QUOTES`` and ``self.CLOSING_QUOTES``.

    Returns ``None`` in every case; when ``self.term`` is empty it returns
    without touching ``self.features``.

    Needs to be called after self.request_page.
    """
    if self.term:
        # Compare squashed forms, keeping the markup characters in the HTML.
        minimal_html = squashed(html, keep='<>/&;')
        minimal_term = squashed(self.term)

        patterns = (
            (
                "highlighted",
                r"<(em|i|b|strong|span)[^>]*> *{}[ ,:]*</\1>".format(
                    minimal_term),
            ),
            (
                "quotes",
                r"<({})[^>]*> *{}[ ,:]*</({})>".format(
                    "|".join(self.OPENING_QUOTES),
                    minimal_term,
                    "|".join(self.CLOSING_QUOTES)),
            ),
        )

        # One case-insensitive search per feature, in declaration order.
        self.features = {
            feature: bool(re.search(pattern, minimal_html, re.IGNORECASE))
            for feature, pattern in patterns
        }
    return None
def clean_and_qualify_wordlist(wordlist):
    """Generator yielding cleaned, de-duplicated terms from ``wordlist``.

    Each entry is passed through ``clean_and_qualify_term``; entries whose
    result is falsy are dropped (presumably the non-words -- see that
    function).  Two terms count as duplicates when their ``squashed`` forms
    are equal, in which case only the first one encountered is yielded.

    Args:
        wordlist: iterable of raw words

    Yields:
        The cleaned terms, in their original order.
    """
    seen_keys = set()
    for raw_word in wordlist:
        term = clean_and_qualify_term(raw_word)
        if not term:
            continue
        key = squashed(term)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        yield term