Ejemplo n.º 1
0
def collect_variants(text, term, replace="_TERM_"):
    """
    Find all spelling variants of ``term`` that occur in ``text``.

    Matching is done on an ASCII-folded, lower-cased copy of ``text``. Up to
    one non-alphanumeric character is tolerated between consecutive
    characters of the squashed term, and an optional trailing plural "s" is
    accepted. Variants are returned as they are spelled in the original
    ``text``.

    Example (assuming ``squashed("Déjà Vu")`` yields ``"dejavu"`` -- confirm
    against the helper)::

        text = "I had a Deja-vu, or Déjàvu"
        collect_variants(text, "Déjà Vu")   # -> {"Deja-vu", "Déjàvu"}

    Args:
        text: str -- text in which to search for spelling variants
        term: str -- term whose variants are collected
        replace: str -- not used by this function; kept in the signature so
            existing callers keep working
    Returns:
        set -- A set of all variants found (empty when the squashed term is
        empty).

    *NB: This is used in the output JSON as an additional index within Wordnik #TODO

    """
    squashed_term = squashed(term)
    if not squashed_term:
        # Nothing to search for; building the pattern would otherwise fail
        # on the empty term.
        return set()

    # Fold the text one character at a time and remember which original
    # character produced each folded character. Transliteration can change
    # the length (one character may become several, or none), so match
    # offsets in the folded text cannot be used on ``text`` directly.
    folded_pieces = []
    source_index = []
    for position, char in enumerate(text):
        folded = unidecode(char).lower()
        folded_pieces.append(folded)
        source_index.extend([position] * len(folded))
    clean_text = ''.join(folded_pieces)

    # This RE allows for up to one non-letter character between all letters.
    # Each character is escaped so it is always matched literally.
    fuzzy_term = '[^a-z0-9]?'.join(re.escape(c) for c in squashed_term)
    term_re = r'\b({})s?\b'.format(fuzzy_term)  # s? for plurals

    # A trailing "s" is only treated as a plural marker when the term itself
    # does not end in "s".
    strip_plural = not term.lower().endswith("s")

    collected = set()
    for m in re.finditer(term_re, clean_text):
        # The match is never empty (the pattern contains at least one
        # character), so m.end() - 1 is a valid folded index.
        start = source_index[m.start()]
        end = source_index[m.end() - 1] + 1
        variant = text[start:end]
        if strip_plural and variant.lower().endswith("s"):
            variant = variant[:-1]
        collected.add(variant)
    return collected
Ejemplo n.º 2
0
def collect_variants(text, term, replace="_TERM_"):
    """
    Find all spelling variants of ``term`` that occur in ``text``.

    Matching is done on an ASCII-folded, lower-cased copy of ``text``. Up to
    one non-alphanumeric character is tolerated between consecutive
    characters of the squashed term, and an optional trailing plural "s" is
    accepted. Variants are returned as they are spelled in the original
    ``text``.

    Example (assuming ``squashed("Déjà Vu")`` yields ``"dejavu"`` -- confirm
    against the helper)::

        text = "I had a Deja-vu, or Déjàvu"
        collect_variants(text, "Déjà Vu")   # -> {"Deja-vu", "Déjàvu"}

    Args:
        text: str -- text in which to search for spelling variants
        term: str -- term whose variants are collected
        replace: str -- not used by this function; kept in the signature so
            existing callers keep working
    Returns:
        set -- A set of all variants found (empty when the squashed term is
        empty).

    *NB: This is used in the output JSON as an additional index within Wordnik #TODO

    """
    squashed_term = squashed(term)
    if not squashed_term:
        # Nothing to search for; building the pattern would otherwise fail
        # on the empty term.
        return set()

    # Fold the text one character at a time and remember which original
    # character produced each folded character. Transliteration can change
    # the length (one character may become several, or none), so match
    # offsets in the folded text cannot be used on ``text`` directly.
    folded_pieces = []
    source_index = []
    for position, char in enumerate(text):
        folded = unidecode(char).lower()
        folded_pieces.append(folded)
        source_index.extend([position] * len(folded))
    clean_text = ''.join(folded_pieces)

    # This RE allows for up to one non-letter character between all letters.
    # Each character is escaped so it is always matched literally.
    fuzzy_term = '[^a-z0-9]?'.join(re.escape(c) for c in squashed_term)
    term_re = r'\b({})s?\b'.format(fuzzy_term)  # s? for plurals

    # A trailing "s" is only treated as a plural marker when the term itself
    # does not end in "s".
    strip_plural = not term.lower().endswith("s")

    collected = set()
    for m in re.finditer(term_re, clean_text):
        # The match is never empty (the pattern contains at least one
        # character), so m.end() - 1 is a valid folded index.
        start = source_index[m.start()]
        end = source_index[m.end() - 1] + 1
        variant = text[start:end]
        if strip_plural and variant.lower().endswith("s"):
            variant = variant[:-1]
        collected.add(variant)
    return collected
Ejemplo n.º 3
0
    def get_html_features(self, html):
        """Detect whether the search term is highlighted or in quotes.

        Both ``html`` and ``self.term`` are reduced with ``squashed`` before
        matching (the HTML keeps the characters ``<>/&;``). The result is
        stored in ``self.features`` as a dict with two boolean entries:

        * ``"highlighted"`` -- the term is the sole content of an ``em``,
          ``i``, ``b``, ``strong`` or ``span`` element (optionally followed
          by spaces, commas or colons).
        * ``"quotes"`` -- the term is enclosed by an opening tag whose name
          is one of ``self.OPENING_QUOTES`` and a closing tag whose name is
          one of ``self.CLOSING_QUOTES``.

        Per the original note, this needs to be called after
        ``self.request_page``.

        Args:
            html: markup to inspect; passed straight to ``squashed``.
        Returns:
            None. When ``self.term`` is falsy the method returns early and
            leaves ``self.features`` untouched.
        """
        if not self.term:
            return None

        minimal_html = squashed(html, keep='<>/&;')
        # Escape the term so any regex metacharacters that survive squashing
        # are matched literally instead of altering (or breaking) the pattern.
        minimal_term = re.escape(squashed(self.term))

        highlight_re = r"<(em|i|b|strong|span)[^>]*> *{}[ ,:]*</\1>".format(
            minimal_term)
        # NOTE(review): the quote alternatives are interpolated unescaped, as
        # in the original; confirm OPENING_QUOTES / CLOSING_QUOTES contain no
        # regex metacharacters.
        quote_re = r"<({})[^>]*> *{}[ ,:]*</({})>".format(
            "|".join(self.OPENING_QUOTES),
            minimal_term,
            "|".join(self.CLOSING_QUOTES))

        self.features = {
            "highlighted": bool(re.search(highlight_re, minimal_html, re.IGNORECASE)),
            "quotes": bool(re.search(quote_re, minimal_html, re.IGNORECASE)),
        }
Ejemplo n.º 4
0
    def get_html_features(self, html):
        """Detect whether the search term is highlighted or in quotes.

        Both ``html`` and ``self.term`` are reduced with ``squashed`` before
        matching (the HTML keeps the characters ``<>/&;``). The result is
        stored in ``self.features`` as a dict with two boolean entries:

        * ``"highlighted"`` -- the term is the sole content of an ``em``,
          ``i``, ``b``, ``strong`` or ``span`` element (optionally followed
          by spaces, commas or colons).
        * ``"quotes"`` -- the term is enclosed by an opening tag whose name
          is one of ``self.OPENING_QUOTES`` and a closing tag whose name is
          one of ``self.CLOSING_QUOTES``.

        Per the original note, this needs to be called after
        ``self.request_page``.

        Args:
            html: markup to inspect; passed straight to ``squashed``.
        Returns:
            None. When ``self.term`` is falsy the method returns early and
            leaves ``self.features`` untouched.
        """
        if not self.term:
            return None

        minimal_html = squashed(html, keep='<>/&;')
        # Escape the term so any regex metacharacters that survive squashing
        # are matched literally instead of altering (or breaking) the pattern.
        minimal_term = re.escape(squashed(self.term))

        highlight_re = r"<(em|i|b|strong|span)[^>]*> *{}[ ,:]*</\1>".format(
            minimal_term)
        # NOTE(review): the quote alternatives are interpolated unescaped, as
        # in the original; confirm OPENING_QUOTES / CLOSING_QUOTES contain no
        # regex metacharacters.
        quote_re = r"<({})[^>]*> *{}[ ,:]*</({})>".format(
            "|".join(self.OPENING_QUOTES),
            minimal_term,
            "|".join(self.CLOSING_QUOTES))

        self.features = {
            "highlighted":
            bool(re.search(highlight_re, minimal_html, re.IGNORECASE)),
            "quotes":
            bool(re.search(quote_re, minimal_html, re.IGNORECASE)),
        }
Ejemplo n.º 5
0
def clean_and_qualify_wordlist(wordlist):
    """Yield the cleaned, de-duplicated terms of a word list.

    Every entry is passed through ``clean_and_qualify_term``; entries whose
    cleaned form is falsy are dropped. Of the remaining terms, only the first
    one for each distinct ``squashed`` form is yielded, in input order.

    Args:
        wordlist: iterable of words to clean
    Yields:
        The cleaned terms, one at a time (this is a generator, not a list).
    """
    seen_keys = set()
    for raw_word in wordlist:
        candidate = clean_and_qualify_term(raw_word)
        if not candidate:
            # Cleaning rejected this entry.
            continue
        key = squashed(candidate)
        if key in seen_keys:
            # A term with the same squashed form was already yielded.
            continue
        seen_keys.add(key)
        yield candidate
Ejemplo n.º 6
0
def clean_and_qualify_wordlist(wordlist):
    """Yield the cleaned, de-duplicated terms of a word list.

    Every entry is passed through ``clean_and_qualify_term``; entries whose
    cleaned form is falsy are dropped. Of the remaining terms, only the first
    one for each distinct ``squashed`` form is yielded, in input order.

    Args:
        wordlist: iterable of words to clean
    Yields:
        The cleaned terms, one at a time (this is a generator, not a list).
    """
    seen_keys = set()
    for raw_word in wordlist:
        candidate = clean_and_qualify_term(raw_word)
        if not candidate:
            # Cleaning rejected this entry.
            continue
        key = squashed(candidate)
        if key in seen_keys:
            # A term with the same squashed form was already yielded.
            continue
        seen_keys.add(key)
        yield candidate