class HunSpellTest(unittest.TestCase):
    def setUp(self):
        self.hunspell = HunSpell("/usr/share/hunspell/en_US.dic",
                                 "/usr/share/hunspell/en_US.aff")

    def tearDown(self):
        try:
            del self.hunspell
        except AttributeError:
            pass

    def test_hunspell_spell(self):
        self.assertFalse(self.hunspell.spell('dpg'))
        self.assertTrue(self.hunspell.spell('dog'))
        self.assertFalse(self.hunspell.spell('spookie'))
        self.assertTrue(self.hunspell.spell('spooky'))

    def test_hunspell_suggest(self):
        self.assertEqual(self.hunspell.suggest('dpg'),
                         [b'dog', b'pg', b'deg', b'dig', b'dpt', b'dug',
                          b'mpg', b'd pg', b'GDP', b'DP', b'PG', b'DTP',
                          b'dip'])
        self.assertEqual(self.hunspell.suggest('spookie'),
                         [b'spookier', b'spookiness', b'spook', b'cookie',
                          b'bookie', b'Spokane', b'spoken'])

    def test_hunspell_stem(self):
        self.assertEqual(self.hunspell.stem('dog'), [b'dog'])
        self.assertEqual(self.hunspell.stem('permanently'), [b'permanent'])
        self.assertEqual(self.hunspell.stem('linked'), [b'linked', b'link'])

    def test_analyze(self):
        self.assertEqual(self.hunspell.analyze('linked'),
                         [b' st:linked', b' st:link fl:D'])
def stem(word):
    hunspell_object = HunSpell(DIC_FILE, AFF_FILE)
    stemmed_list = hunspell_object.stem(word)
    if len(stemmed_list) > 0:
        return stemmed_list[0]
    else:
        sys.exit(1)
class HunSpellGenerateTest(unittest.TestCase):
    def setUp(self):
        self.hunspell = HunSpell("/usr/share/hunspell/en_GB.dic",
                                 "/usr/share/hunspell/en_GB.aff")

    def test_generate(self):
        self.assertEqual(self.hunspell.generate('boy', 'girls'), [b'boys'])

    def test_generate2(self):
        self.assertEqual(self.hunspell.generate2('boy', 'is:Ns'), [b'boys'])
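The two generation APIs exercised above differ only in how the target morphology is specified. A minimal sketch, assuming the same en_GB dictionaries as the test fixture:

from hunspell import HunSpell

h = HunSpell("/usr/share/hunspell/en_GB.dic", "/usr/share/hunspell/en_GB.aff")
# generate() copies the morphology of an example word ('girls' is plural)...
print(h.generate('boy', 'girls'))   # [b'boys']
# ...while generate2() takes an explicit morphological description string.
print(h.generate2('boy', 'is:Ns'))  # [b'boys']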
def get_hunspell(prefix):
    try:
        from hunspell import HunSpell
        dic_fn = "{0}.dic".format(prefix)
        aff_fn = "{0}.aff".format(prefix)  # was "{0}.dic"; the affix file needs the .aff extension
        logging.info('loading hunspell dictionaries: {0} and {1}'.format(
            dic_fn, aff_fn))
        return HunSpell(dic_fn, aff_fn)
    except ImportError:
        logging.warning('hunspell is not present, using cache file only')
        return None
class Hunspell(object):
    name = 'hunspell'

    def __init__(self, nlp: Language, path: str, lang: str = 'en_US'):
        path = Path.cwd() / path
        # Was `if not any([nlp, isinstance(nlp, Language)])`, which passed
        # for any truthy nlp; check the type directly instead.
        if not isinstance(nlp, Language):
            raise ValueError('nlp must be a spaCy Language.') from None
        if not path.exists():
            raise NotADirectoryError('{} does not exist.'.format(path)) from None
        dic_path, aff_path = (
            path / '{}.dic'.format(lang),
            path / '{}.aff'.format(lang),
        )
        # HunSpell expects plain string paths, not Path objects
        self.hobj = HunSpell(str(dic_path), str(aff_path))
        Token.set_extension('hunspell_spell', default=None)
        Token.set_extension('hunspell_suggest', getter=self.get_suggestion)

    def __call__(self, doc):
        for token in doc:
            try:
                token._.hunspell_spell = self.hobj.spell(token.text)
            except UnicodeEncodeError:
                pass
        return doc

    def get_suggestion(self, token):
        # TODO: include a lower option?
        # TODO: include suggestion numbers?
        # TODO: include stemmer?
        try:
            suggestions = self.hobj.suggest(token.text)
        except UnicodeEncodeError:
            suggestions = []
        return suggestions
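A hedged wiring sketch for the component above, assuming spaCy v2's add_pipe(component) call and en_US dictionaries in a local dicts/ directory (both assumptions, not part of the original):

import spacy

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(Hunspell(nlp, 'dicts'))  # hypothetical: dicts/en_US.dic + dicts/en_US.aff

doc = nlp("I hvae a dog.")
for token in doc:
    if token._.hunspell_spell is False:  # None means the check never ran
        print(token.text, token._.hunspell_suggest)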
def stem_query(query):
    # returns list of stemmed words
    hunspell_object = HunSpell(DIC_FILE, AFF_FILE)
    stemmed_list = []
    tokens = tokenize(query)
    for word in tokens:
        if word not in dummy_words:
            stemmed_list.append(stem(hunspell_object, word))
    return stemmed_list
class spaCyHunSpell(object):
    name = 'hunspell'

    def __init__(self, nlp, path=HUNSPELL_PROFILE):
        if path in DEFAULT_DICTIONARY_PATHS:
            default_path = DEFAULT_DICTIONARY_PATHS[path]
            dic_path, aff_path = (
                os.path.join(default_path, 'en_US.dic'),
                os.path.join(default_path, 'en_US.aff'),
            )
        else:
            assert len(path) == 2, 'Include two paths: dic_path and aff_path'
            dic_path, aff_path = path
        self.hobj = HunSpell(dic_path, aff_path)
        Token.set_extension('hunspell_spell', default=None)
        Token.set_extension('hunspell_suggest', getter=self.get_suggestion)

    def __call__(self, doc):
        for token in doc:
            try:
                token._.hunspell_spell = self.hobj.spell(token.text)
            except UnicodeEncodeError:
                pass
        return doc

    def get_suggestion(self, token):
        # TODO: include a lower option?
        # TODO: include suggestion numbers?
        # TODO: include stemmer?
        try:
            suggestions = self.hobj.suggest(token.text)
        except UnicodeEncodeError:
            suggestions = []
        return suggestions
class HunSpellTest(unittest.TestCase):
    def setUp(self):
        self.hunspell = HunSpell("/usr/share/hunspell/en_US.dic",
                                 "/usr/share/hunspell/en_US.aff")

    def tearDown(self):
        try:
            del self.hunspell
        except AttributeError:
            pass

    def test_hunspell_spell(self):
        self.assertFalse(self.hunspell.spell('dpg'))
        self.assertTrue(self.hunspell.spell('dog'))
        self.assertFalse(self.hunspell.spell('spookie'))
        self.assertTrue(self.hunspell.spell('spooky'))

    def test_hunspell_suggest(self):
        self.assertEqual(self.hunspell.suggest('dpg'),
                         ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg',
                          'd pg', 'GDP', 'DP', 'PG', 'DTP', 'dip'])
        self.assertEqual(self.hunspell.suggest('spookie'),
                         ['spookier', 'spookiness', 'spook', 'cookie',
                          'bookie', 'Spokane', 'spoken'])
        self.assertEqual(self.hunspell.suggest('Eelysa'),
                         ['Elyssa', 'Elysees', 'Elysha', 'Elysia', 'Elissa',
                          'Elysée'])

    def test_hunspell_stem(self):
        self.assertEqual(self.hunspell.stem('dog'), [b'dog'])
        self.assertEqual(self.hunspell.stem('permanently'), [b'permanent'])
        self.assertEqual(self.hunspell.stem('linked'), [b'linked', b'link'])

    def test_analyze(self):
        self.assertEqual(self.hunspell.analyze('linked'),
                         [b' st:linked', b' st:link fl:D'])

    def test_add_remove(self):
        self.assertFalse(self.hunspell.spell('pipo'))
        self.hunspell.add('pipo')
        self.assertTrue(self.hunspell.spell('pipo'))
        self.hunspell.remove('pipo')
        self.assertFalse(self.hunspell.spell('pipo'))

    def test_add_dic(self):
        self.assertFalse(self.hunspell.spell("dictionnaire"))
        try:
            self.hunspell.add_dic("/usr/share/hunspell/fr.dic")
        except HunSpellError:
            raise ValueError("/usr/share/hunspell/fr.dic is not installed. "
                             "Please install hunspell-fr to validate this test.")
        self.assertTrue(self.hunspell.spell("dictionnaire"))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 22 16:28:51 2018

@author: Samuele Garda
"""

import logging
import argparse
from hunspell import HunSpell

DISCARD = ['\n', 'Twitter / Account gesperr\n']
DICS = ['./en_US.dic', './en_US.aff']
SPELLER = HunSpell(*DICS)
NO_SPELL = ['^', 'Z', 'L', 'M', '!', 'Y', '#', '@', '~', 'U', 'E', ',', 'G', 'S']

logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(module)s: %(message)s',
                    level='INFO')


def parse_arguments():
    parser = argparse.ArgumentParser(
        description='Utility for POS tagging tweets via TweetNLP ark-tweet-nlp '
                    'without using java. ONLY LINUX SUPPORTED. Strings discarded '
                    'when loading tweets: `\\n`, `Twitter / Account gesperr\\n`')
    parser.add_argument('-r', '--raw-tweets',
                        help='Path to the file where raw tweets are stored')
    parser.add_argument('-p', '--parsed-tweets',
                        help='Path to the file where parsed tweets are stored')
    parser.add_argument('-o', '--output', default='tweet_data_for_dp.txt',
                        help='Output path to write to')
    return parser.parse_args()
# Tail of a component's load() classmethod; the start of the signature is
# truncated in the source and left as-is.
    model_dir: Optional[Text] = None,
    model_metadata: Optional["Metadata"] = None,
    cached_component: Optional["Component"] = None,
    **kwargs: Any
) -> "Component":
    """Load this component from file."""
    if cached_component:
        return cached_component
    else:
        return cls(meta)


try:
    from hunspell import HunSpell
    # Load the Hunspell dictionary and affix files
    spell_checker = HunSpell('./pyhunspell/dictionaries/en-GB/index.dic',
                             './pyhunspell/dictionaries/en-GB/index.aff')
    encoding = spell_checker.get_dic_encoding()  # gets the dictionary encoding
    imported = True
except ModuleNotFoundError:
    print("Cannot import HunSpell")
    imported = False


def add_correct_Words(words=None):
    """
    Adds grammatically correct words to the Hunspell dictionary.

    Arguments:
        words: list of words
    Returns:
        None
    """
    if words is None:  # avoid a mutable default argument
        words = []
    if imported:
        # Body reconstructed from the docstring: HunSpell.add() registers
        # a word with the run-time dictionary.
        for word in words:
            spell_checker.add(word)
class HunSpelling:
    """
    Use the hunspell tool to detect isolated non-word spelling errors
    and to suggest candidate corrections.
    """

    def __init__(self, dic_file, aff_file, extra_dic=None):
        """
        Load the dictionary and affix files for spell checking.
        Allow adding an extra dictionary.
        """
        io_utils.check_file_readable(dic_file)
        io_utils.check_file_readable(aff_file)
        self.hunspell = HunSpell(dic_file, aff_file)
        if extra_dic:
            io_utils.check_file_readable(extra_dic)
            self.hunspell.add_dic(extra_dic)

    def is_misspelled(self, word):
        """Check if given word is misspelled"""
        return not self.hunspell.spell(word)

    def add_word(self, word):
        """Add new word into hunspell's dictionary"""
        if word:
            self.hunspell.add(word)

    def add_words(self, words):
        """Add new words into hunspell's dictionary"""
        if not isinstance(words, list):
            return
        for word in words:
            self.add_word(word)

    def add_extra_dictionary(self, dic_file):
        """Add an extra dictionary to the current instance"""
        io_utils.check_file_readable(dic_file)
        self.hunspell.add_dic(dic_file)

    def remove_word(self, word):
        """Remove word from hunspell's dictionary"""
        self.hunspell.remove(word)

    def remove_words(self, words):
        """Remove words from hunspell's dictionary"""
        if not isinstance(words, list):
            return
        for word in words:
            self.remove_word(word)

    def get_suggestions(self, word):
        """Return correction suggestions"""
        suggestions = []
        for sgt in self.hunspell.suggest(word):
            sgt = sgt.replace('-', ' ')
            if sgt not in suggestions:
                suggestions.append(sgt)
        return suggestions

    def correct(self, query, ignore=None, topn=None):
        """
        Return top candidate corrections for given query.
        The ignore flag can allow ignoring certain words (e.g. named entities)
        """
        if not isinstance(query, list):
            query = query.split()
        if ignore is None:
            ignore = [0] * len(query)
        solutions = []
        for i, token in enumerate(query):
            if token.isalpha() \
                    and not self.hunspell.spell(token) \
                    and not ignore[i]:
                suggestions = self.get_suggestions(token)
                if suggestions:
                    solutions.append(suggestions)
                else:
                    solutions.append([token])
            else:
                solutions.append([token])
        # merge solutions
        candidates = [' '.join(sol) for sol in product(*solutions)]
        return candidates[:topn]
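A quick usage sketch for HunSpelling. The dictionary paths are assumptions, and the class itself depends on an io_utils.check_file_readable helper and on itertools.product being importable in its module:

checker = HunSpelling('/usr/share/hunspell/en_US.dic',
                      '/usr/share/hunspell/en_US.aff')
print(checker.is_misspelled('spookie'))            # True
print(checker.get_suggestions('spookie')[:3])      # e.g. ['spookier', 'spookiness', 'spook']
print(checker.correct('the spookie dog', topn=2))  # e.g. ['the spookier dog', 'the spookiness dog']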
import math
import pickle
import os
from collections import Counter

from config import DIC_FILE, AFF_FILE
from hunspell import HunSpell

symbols = [',', '.', ';', '\'', '"', '{', '}', '[', ']', '(', ')',
           '?', ':', '*', '^', '-', '%', '\\', '/']
hunspell_object = HunSpell(DIC_FILE, AFF_FILE)


class GFG:
    def __init__(self):
        print("Done")  # was Python 2's `print "Done"`

    def stem(self, hunspell_object, word):
        stemmed_list = hunspell_object.stem(word)
        if len(stemmed_list) > 0:
            return str(stemmed_list[0])

    def index(self, words_file):
        open_file = open(words_file, 'r')
        words_list = []
        contents = open_file.readlines()
        for i in range(len(contents)):
            for s in symbols:
                # Snippet truncated here in the source; stripping the listed
                # symbols from each line is the evident intent.
                contents[i] = contents[i].replace(s, ' ')
#!/usr/bin/env python3

from lazydata import track
from hunspell import HunSpell

h = HunSpell(
    track(
        './data/en.dic',
        'https://cgit.freedesktop.org/libreoffice/dictionaries/plain/en/en_US.dic'
    ),
    track(
        './data/en.aff',
        'https://cgit.freedesktop.org/libreoffice/dictionaries/plain/en/en_US.aff'
    ))

print([x.encode(h.get_dic_encoding()) for x in h.suggest('hollo')])
class Deperson():
    def __init__(self, autocorrect=False, check_compound=False):
        self.autocorrect = autocorrect
        self.check_compound = check_compound
        self.masking_regexes = [
            (r'(?:van|aan|in)\s(?:der|den)\s\b\w+\b', 3),
            (re.escape(PLACEHOLDER) + r'\s(?:van|aan|in)\s(?:de|het|t)\s\b\w+\b', 4),
        ]
        self.email_regex = r'[^@\s]+@[^@\s]+\.[^@\s]+'
        self.email_placeholder = 'maskedemail'
        self.url_regex = r'(?:http:\/\/|https:\/\/)?(?:www\.)?[a-z]+\.[a-z]{2,5}[^\s]*'

        # Import blacklists
        blacklists_path = script_path + '/blacklists/'
        blacklists = []
        # Loop over available blacklists
        for filename in os.listdir(blacklists_path):
            if filename.endswith(".txt"):
                # Open assuming we may be dealing with unicode
                with codecs.open(blacklists_path + filename, encoding='utf-8') as f:
                    # Decode unicode into ASCII
                    words = {unidecode(x.strip()) for x in f.readlines()}
                # Store current blacklist
                blacklists.append(words)
            else:
                continue
        # Combine all blacklists into one
        self.blacklist = set.union(*blacklists)

        # Regex-based whitelists
        self.whitelist_regexes = [
            r'\d+gb$', r'\d+ghz$', r'\d+mb$', r'\d+mbit$', r'\d+gbit$',
            r'\d+(?:\.|,)?\d{0,2}euro?', r'v\d+$',
            r'0(?:8|9)00(?:[\s\-]{1}\d{4})?',
            u'\u20AC\d+(?:\.|,)?\d{0,2}', r'^\d$',
            r'^[a-zA-Z]{1,2}\d{1,3}$', r'\d{1,2}:\d{2}(?::\d{2})?',
            r'\d{1,2}-\d{1,2}(?:-\d{2,4})?'
        ] + [self.url_regex]

        self.protected_punctuation_regexes = [
            r'0(?:8|9)00[\s\-]{1}\d{4}',
            u'\u20AC\d+(?:\.|,)?\d{0,2}',
        ] + [self.url_regex, self.email_regex] + self.whitelist_regexes
        self.protected_punctuation_regex = '(' + \
            '|'.join(self.protected_punctuation_regexes) + ')'

        # Load in HunSpell files
        self.d = HunSpell(script_path + '/dict/Dutch.dic',
                          script_path + '/dict/Dutch.aff')
        self.clean_d = HunSpell(script_path + '/dict/Dutch_clean.dic',
                                script_path + '/dict/Dutch.aff')

        # Load in curated autocorrection list
        with open(script_path + '/spellcheck/autocorrect.csv') as f:
            reader = csv.reader(f, skipinitialspace=True)
            self.autocorrecter = dict(reader)

        # Import whitelists
        whitelists_path = script_path + '/whitelists/'
        whitelists = []
        # Loop over available whitelists
        for filename in os.listdir(whitelists_path):
            if filename.endswith(".txt"):
                # Open assuming we may be dealing with unicode
                with codecs.open(whitelists_path + filename, encoding='utf-8') as f:
                    # Decode unicode into ASCII
                    words = {unidecode(x.strip()) for x in f.readlines()}
                # Store current whitelist
                whitelists.append(words)
            else:
                continue
        # Combine all whitelists into one
        self.whitelist = set.union(*whitelists)
        self.whitelist = self.whitelist.union(set(self.autocorrecter.keys()))

        # Specific domain words whitelist
        with codecs.open(whitelists_path + 'domainwords.txt', encoding='utf-8') as f:
            self.domain_whitelist = {unidecode(x.strip()) for x in f.readlines()}

    def curated_autocorrect(self, word):
        """
        Autocorrect a word based on the curated list of autocorrection sets.
        """
        if word in self.autocorrecter:
            corr = self.autocorrecter[word]
        else:
            return word
        return corr

    def smart_suggest(self, word):
        """
        Autocorrect a word. (This is computationally expensive, use sparingly.)

        Keyword arguments:
        word -- a (potentially misspelled) Dutch word

        Return value:
        Corrected word or, if no suggestion could be found, the original word.
        """
        if word.isdigit():
            return word
        # Set of all suggestions provided by HunSpell for this word;
        # insensitive to the capitalization of the first letter
        suggestions = set(self.d.suggest(word)).union(
            set(self.d.suggest(word.capitalize())))
        # If we have no suggestions, just return the word
        if len(suggestions) == 0:
            return word
        # Otherwise, we want to return the closest match, where 'closest' is
        # defined by the Ratcliff/Obershelp pattern matching algorithm.
        worset, max_score = {}, 0  # renamed from `max`, which shadowed the builtin
        for sugg in suggestions:
            # Create a temporary SequenceMatcher instance (provides the
            # matching algorithm) that operates case-insensitively
            tmp = difflib.SequenceMatcher(None, word.lower(), sugg.lower(),
                                          autojunk=False).ratio()
            # Store the suggestion with its score
            worset[tmp] = sugg
            # Keep track of the suggestion with max score
            if tmp > max_score:
                max_score = tmp
        # Return best match (case insensitive)
        return worset[max_score].lower()

    def remove_punctuation(self, s):
        """Drop punctuation from a string"""
        # Punctuation but not the '|' character
        punct = string.punctuation.replace("|", "").replace("[", "").replace("]", "")
        # Translator that maps punctuation to spaces
        translator = str.maketrans(punct, ' ' * len(punct))
        # Split out protected patterns to preserve them
        split = re.split(self.protected_punctuation_regex, s)
        # Remove punctuation from even-index parts of the split
        cleaned_split = [
            part.translate(translator) if i % 2 == 0 else part
            for i, part in enumerate(split)
        ]
        # Put string back together
        t = ''.join(cleaned_split)
        # Mask out e-mail addresses
        t = re.sub(self.email_regex, self.email_placeholder, t)
        return t

    def apply_whitelist(self, text):
        """Filter text using whitelist"""
        # Split text into words
        words = self.remove_punctuation(text.lower()).split()
        # Correct text based on curated autocorrecter
        if self.autocorrect:
            for word in words:
                if word in self.autocorrecter:
                    idx = words.index(word)
                    # Replacement may consist of more than one word, so insert
                    # as a list
                    words[idx:idx + 1] = self.curated_autocorrect(word).split()
        # Calculate filter based on whitelist (TODO: parallelize?)
        filter = list(map(lambda word: (word, word in self.whitelist), words))
        # Update filter based on whitelist regexes
        filter = [
            (word, f) if f else
            (word, any(re.match(regex, word) for regex in self.whitelist_regexes))
            for (word, f) in filter
        ]
        # Update filter based on compound words
        if self.check_compound:
            for i in range(len(filter)):
                (word, f) = filter[i]
                if f or word.isdigit():
                    continue
                elif self.clean_d.spell(word):
                    filter[i] = (word, True)
        # Updating the filter via smart_suggest() autocorrection is currently
        # too slow, so it is deliberately left out here.
        # Put text back together without filtered words
        return ' '.join([
            word if f else PLACEHOLDER
            for (word, f) in filter
        ])

    def domain_guarded_replace(self, regex, text, repl):
        """Replace based on regex, but guarding domain-specific words."""
        # Loop over matches for regex
        for match in re.finditer(regex, text):
            # Pull out the matched substring
            g = match.group()
            # Replace it with placeholder only if none of the words in the
            # substring are in the domain whitelist
            if not any(word in self.domain_whitelist for word in g.split()):
                text = text.replace(g, repl)
        return text

    def apply_blacklist(self, text):
        """Apply all defined regex masks and the blacklist"""
        # Copy text
        output = text[:]
        # Mask out blacklisted strings
        for bad_word in self.blacklist:
            num_ph = len(bad_word.split())
            repl = ' '.join((PLACEHOLDER,) * num_ph)
            output = output.replace(bad_word, repl)
        # Mask out based on regexes
        for (regex, num_ph) in self.masking_regexes:
            repl = ' '.join((PLACEHOLDER,) * num_ph)
            output = self.domain_guarded_replace(regex, output, repl)
        return output

    def get_illegal_words(self, text):
        """Return words masked by apply_whitelist() from the given text."""
        # Get masked words
        cleaned = self.apply_blacklist(self.apply_whitelist(text)).split()
        # Original text
        original = self.remove_punctuation(text).split()
        # Pull out original words that were filtered out
        filtered = [
            original[i] for i in range(len(original))
            if cleaned[i] == PLACEHOLDER
        ]
        # Return as string
        return ' '.join(filtered)