Example #1
    def __init__(self,
                 lang=None,
                 text=None,
                 tokenize=None,
                 chunkers=None,
                 filters=None):
        """Constructor for the SpellChecker class.

        SpellChecker objects can be created in two ways, depending on
        the nature of the first argument.  If it is a string, it
        specifies a language tag from which a dictionary is created.
        Otherwise, it must be an enchant Dict object to be used.
        
        Optional keyword arguments are:
            
            * text:  to set the text to be checked at creation time
            * tokenize:  a custom tokenization function to use
            * chunkers:  a list of chunkers to apply during tokenization
            * filters:  a list of filters to apply during tokenization
        
        If <tokenize> is not given and the first argument is a Dict,
        its 'tag' attribute must be a language tag so that a tokenization
        function can be created automatically.  If this attribute is missing
        the user's default language will be used.
        """
        if lang is None:
            lang = get_default_language()
        if isinstance(lang, basestring):
            dict = enchant.Dict(lang)
        else:
            dict = lang
            try:
                lang = dict.tag
            except AttributeError:
                lang = get_default_language()
        if lang is None:
            raise DefaultLanguageNotFoundError
        self.lang = lang
        self.dict = dict
        if tokenize is None:
            try:
                tokenize = get_tokenizer(lang, chunkers, filters)
            except TokenizerNotFoundError:
                # Fall back to default tokenization if no match for 'lang'
                tokenize = get_tokenizer(None, chunkers, filters)
        self._tokenize = tokenize

        self.word = None
        self.wordpos = None
        self._ignore_words = {}
        self._replace_words = {}
        # Default to the empty string as the text to be checked
        self._text = array.array('u')
        self._use_tostring = False
        self._tokens = iter([])

        if text is not None:
            self.set_text(text)
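
A minimal usage sketch of the two construction modes the docstring describes (the sample text is invented):

import enchant
from enchant.checker import SpellChecker

chkr = SpellChecker("en_US")                 # from a language tag
chkr = SpellChecker(enchant.Dict("en_US"))   # from an existing Dict object
chkr.set_text("This sentense has an error")  # hypothetical input
for err in chkr:
    print(err.word)                          # -> sentense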
Example #3
    def setDict(self, sp_dict):
        """Sets the spelling dictionary to be used"""
        try:
            self.tokenizer = tokenize.get_tokenizer(sp_dict.tag,
                                                    chunkers=self._chunkers,
                                                    filters=self.token_filters)
        except TokenizerNotFoundError:
            # Fall back to the "good for most euro languages" English tokenizer
            self.tokenizer = tokenize.get_tokenizer(chunkers=self._chunkers,
                                                    filters=self.token_filters)
        self._sp_dict = sp_dict

        self.rehighlight()
Example #4
 def __init__(self, lang='en_US'):
     """
     Setup tokenizer.
     """
     self.lang = lang
     self._dict = enchant.Dict(self.lang)
     self._tk = get_tokenizer(self.lang, chunkers=(HTMLChunker,))
Example #5
def spellcheck():
    global SCORE, PHISHING_FLAGS
    dictionary = enchant.Dict("en_US")
    tokenizer = get_tokenizer("en_US")
    # has_misspellings = any((not dictionary.check(word)) and word[0].islower() for word in word_list)
    for word, pos in tokenizer(EMAIL_BODY):
        misspelled = not dictionary.check(word)
        if misspelled:
            print word
        weirdly_cased = not word[0].islower()

        if misspelled and not weirdly_cased:  # ONLY increment score when misspelled, not capitalized
            SCORE += MISSPELL_SCORE
            PHISHING_FLAGS.append('Misspelled word(s)')
            return
Example #6
def test_HTMLChunker():
    """Test filtering of URLs"""
    text = """hello<html><head><title>my title</title></head><body>this is a
              <b>simple</b> HTML document for <p> test<i>ing</i> purposes</p>.
            It < contains > various <-- special characters.
            """
    tkns = get_tokenizer("en_US", chunkers=(HTMLChunker, ))(text)
    out = [t for t in tkns]
    exp = [
        ("hello", 0),
        ("my", 24),
        ("title", 27),
        ("this", 53),
        ("is", 58),
        ("a", 61),
        ("simple", 80),
        ("HTML", 91),
        ("document", 96),
        ("for", 105),
        ("test", 113),
        ("ing", 120),
        ("purposes", 128),
        ("It", 154),
        ("contains", 159),
        ("various", 170),
        ("special", 182),
        ("characters", 190),
    ]
    assert out == exp
    for (word, pos) in out:
        assert text[pos:pos + len(word)] == word
Example #7
def test_EmailFilter(test_text):
    """Test filtering of email addresses"""
    tkns = get_tokenizer("en_US", filters=(EmailFilter, ))(test_text)
    out = [t for t in tkns]
    exp = [
        ("this", 0),
        ("text", 5),
        ("with", 10),
        ("http", 15),
        ("url", 22),
        ("com", 26),
        ("and", 30),
        ("SomeLinksLike", 34),
        ("ftp", 62),
        ("my", 68),
        ("site", 71),
        ("com", 76),
        ("au", 80),
        ("some", 83),
        ("file", 88),
        ("AndOthers", 93),
        ("not", 103),
        ("quite", 108),
        ("a", 114),
        ("url", 116),
        ("as", 157),
        ("well", 160),
    ]
    assert out == exp
Example #8
def test_WikiWordFilter(test_text):
    """Test filtering of WikiWords"""
    tkns = get_tokenizer("en_US", filters=(WikiWordFilter, ))(test_text)
    out = [t for t in tkns]
    exp = [
        ("this", 0),
        ("text", 5),
        ("with", 10),
        ("http", 15),
        ("url", 22),
        ("com", 26),
        ("and", 30),
        ("ftp", 62),
        ("my", 68),
        ("site", 71),
        ("com", 76),
        ("au", 80),
        ("some", 83),
        ("file", 88),
        ("not", 103),
        ("quite", 108),
        ("a", 114),
        ("url", 116),
        ("with", 134),
        ("an", 139),
        ("aemail", 142),
        ("address", 149),
        ("as", 157),
        ("well", 160),
    ]
    assert out == exp
Example #9
    def __call__(self, query):
        if EnchantTokenizer._SINGLETON_TOKENIZER is None:
            from enchant.tokenize import get_tokenizer
            EnchantTokenizer._SINGLETON_TOKENIZER = get_tokenizer('en_US')
            # XXX make language configurable

        return EnchantTokenizer._SINGLETON_TOKENIZER(query)
Example #10
 def __init__(self, language="en_US"):
     if not enchant.dict_exists(language):
         logging.warning("Spelling_Corrector: no dictionary for %s, please check it!", language)
         logging.warning("Available languages: %s", enchant.list_languages())
         language = "en_US"
     self.dict = enchant.Dict(language)
     self.check = SpellChecker(language)
     self.tokenizer = get_tokenizer(language)
Example #11
def custom_word_tokenize(text):
    tokenizer = get_tokenizer("en_US")
    words = []

    for w in tokenizer(text):
        words.append(w[0])

    return words
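
For illustration, the kind of output the en_US tokenizer would typically produce for this helper:

words = custom_word_tokenize("Hello, world!")  # -> ['Hello', 'world']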
Example #12
def filter_text_before_spell_check(language, text):

    """Ignore emails, URLs and HTML tags in the raw text and return the filtered text as a string."""

    tknzr = get_tokenizer(language, chunkers=(HTMLChunker,), filters=[EmailFilter, URLFilter, WikiWordFilter])
    filteredText = " ".join(w[0] for w in tknzr(text))

    return filteredText
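
A hedged usage sketch of the function above; the input string and the expected output are illustrative only:

filtered = filter_text_before_spell_check(
    "en_US", 'Mail <b>me</b> at foo@bar.com or see http://example.com')
print(filtered)  # expected: 'Mail me at or see' (tags, email and URL dropped)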
Example #13
def striptxt_pcap(pcap):
    tokenizer = get_tokenizer("en_US")
    a = rdpcap(pcap)
    sessions = a.sessions()
    packet_count = 0
    unencrypted_packet_count = 0
    encrypted_packet_count = 0
    encrypted_len = 0
    unencrypted_len = 0
    convs = {'Total Packets': 0, 'Plaintext Packets': 0, 'Encrypted Packets': 0, 'Plaintext Bytes': 0, 'Encrypted Bytes': 0, 'Plaintext Conversations':[], 'Encrypted Conversations':[]}
    for session in sessions:
        http_payload = b""
        encrypted = 'unknown'
        session_packets = 0
        for packet in sessions[session]:
            session_packets += 1
            packet_count += 1
            try:
                payload = bytes(packet[TCP].payload)
                payload = payload.decode('utf-8')
                word_tuple = [w for w in tokenizer(payload)]
                encrypted = 'Plaintext Conversations' if word_tuple else 'Encrypted Conversations'
                if encrypted == 'Plaintext Conversations':
                    unencrypted_len += len(packet[TCP].payload)
                else:
                    encrypted_len += len(packet[TCP].payload)
                convs[encrypted].append(f'{packet[IP].src}:{packet[TCP].sport},{packet[IP].dst}:{packet[TCP].dport}')
            except Exception as e:
                pass
            try:
                payload = bytes(packet[UDP].payload)
                payload = payload.decode('utf-8')
                word_tuple = [w for w in tokenizer(payload)]
                encrypted = 'Plaintext Conversations' if word_tuple else 'Encrypted Conversations'
                if encrypted == 'Plaintext Conversations':
                    unencrypted_len += len(packet[UDP].payload)
                else:
                    encrypted_len += len(packet[UDP].payload)
                convs[encrypted].append(f'{packet[IP].src}:{packet[UDP].sport},{packet[IP].dst}:{packet[UDP].dport}')
            except Exception as e:
                pass
        if encrypted == 'Plaintext Conversations':
            unencrypted_packet_count += session_packets
        elif encrypted == 'Encrypted Conversations':
            encrypted_packet_count += session_packets

    convs['Total Packets'] = packet_count
    convs['Plaintext Packets'] = unencrypted_packet_count
    convs['Encrypted Packets'] = encrypted_packet_count
    convs['Plaintext Bytes'] = unencrypted_len
    convs['Encrypted Bytes'] = encrypted_len
    convs['Plaintext Conversations'] = list(set(convs['Plaintext Conversations']))
    convs['Encrypted Conversations'] = list(set(convs['Encrypted Conversations']))
    results = {'convcontents': convs}
    print(results)
    return results
Example #14
    def __init__(self, lang='en_US'):
        """
        Setup tokenizer.

        Create a new tokenizer based on lang.
        This lets us skip the HTML and only care
        about our contents.
        """
        self.lang = lang
        self._dict = enchant.Dict(self.lang)
        self._tk = get_tokenizer(self.lang, chunkers=(HTMLChunker, ))
Example #15
 def __init__(self, scheduler=None, inq=None, outq=None):
     # multiprocessing.Process.__init__(self)
     super(Interpreter, self).__init__()
     self.tknzr = get_tokenizer("en_US")
     print "I:", self.name
     self.scheduler = scheduler
     self.inq = inq
     self.outq = outq
     self.daemon = True
     self.stop = threading.Event()
Example #16
 def __init__(self,
              lang,
              suggest,
              word_list_filename,
              tokenizer_lang='en_US',
              filters=[]):
     self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
     self.tokenizer = get_tokenizer(tokenizer_lang, filters=filters)
     self.original_tokenizer = self.tokenizer
     self.suggest = suggest
Example #17
    def open(self):
        self.initialized = False
        self.private_dict_file = None

        if enchant is None:
            return
        dict_name = self.linter.namespace.spelling_dict
        if not dict_name:
            return

        self.ignore_list = [
            w.strip() for w in self.linter.namespace.spelling_ignore_words.split(",")
        ]
        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        self.ignore_list.extend(["param", "pylint"])

        self.ignore_comment_directive_list = [
            w.strip()
            for w in self.linter.namespace.spelling_ignore_comment_directives.split(",")
        ]

        # Expand tilde to allow e.g. spelling-private-dict-file = ~/.pylintdict
        if self.linter.namespace.spelling_private_dict_file:
            self.linter.namespace.spelling_private_dict_file = os.path.expanduser(
                self.linter.namespace.spelling_private_dict_file
            )

        if self.linter.namespace.spelling_private_dict_file:
            self.spelling_dict = enchant.DictWithPWL(
                dict_name, self.linter.namespace.spelling_private_dict_file
            )
            self.private_dict_file = open(  # pylint: disable=consider-using-with
                self.linter.namespace.spelling_private_dict_file, "a", encoding="utf-8"
            )
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.linter.namespace.spelling_store_unknown_words:
            self.unknown_words = set()

        self.tokenizer = get_tokenizer(
            dict_name,
            chunkers=[ForwardSlashChunker],
            filters=[
                EmailFilter,
                URLFilter,
                WikiWordFilter,
                WordsWithDigitsFilter,
                WordsWithUnderscores,
                CamelCasedWord,
                SphinxDirectives,
            ],
        )
        self.initialized = True
 def __init__(self, lang="en_US"):
     """
     Setup tokenizer.
     
     Create a new tokenizer based on lang.
     This lets us skip the HTML and only
     care about our contents.
     """
     self.lang = lang
     self._dict = enchant.Dict(self.lang)
     self._tk = get_tokenizer(self.lang, chunkers=(HTMLChunker,))
Example #19
def check(s):
    chkr = SpellChecker("en_US")
    chkr.set_text(s)
    chkr_count = 0
    tknzr_count = 0
    for i in chkr:
        chkr_count = chkr_count + 1
    tknzr = get_tokenizer("en_US")
    for i in tknzr(s):
        tknzr_count = tknzr_count + 1
    if tknzr_count == 0:
        return 0.0  # avoid division by zero on empty input
    # fraction of tokens that are spelled correctly
    return 1 - float(chkr_count) / tknzr_count
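
For illustration, a rough call of check(), assuming the en_US dictionary flags 'helo':

print(check("helo world"))  # one of two tokens misspelled -> 0.5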
Example #20
def text2words(text, lang='en_US', min_length=3):

    dictionary = Dict(lang)
    tknzr = get_tokenizer(lang)

    # Processed text: punctuation removal (except '-');
    # 'regex' is a precompiled punctuation pattern defined elsewhere
    p_text = regex.sub('', text)
    tokens = [token for token, _ in tknzr(p_text)]
    words = [token for token in tokens if len(token) >= min_length]
    words = [word for word in words if dictionary.check(word)]
    return words
Example #21
 def __init__(self, fn):
     self.fn = fn
     self.content = open(fn).read().decode('utf8')
     self.lines = self.split_(self.content)
     self.errors = []
     self.spelld = enchant.Dict("en_UK")
     self.tknzr = get_tokenizer("en_UK")
     self.spellerrors = []
     self.latexterms = ("newpage", "clearpage", "textit", "textbf",
                        "textsc", "textwidth", "tabref", "figref",
                        "sectref", "emph")
Example #22
def pos_and_lemmatize(text, lemmatizer):
    sc = checker.SpellChecker('en_US', text)
    for err in sc:
        try:
            err.replace(err.suggest()[0])
        except IndexError:
            pass
    text = sc.get_text()  # pull the corrected text back out of the checker
    tokenizer = tokenize.get_tokenizer('en_US')
    words = [w[0].encode('ascii', 'ignore').decode('utf-8') for w in tokenizer(text)]
    pos = nltk.pos_tag(words)
    lemmas = [lemmatizer.lemmatize(w, pos=get_wordnet_pos(p)) for (w, p) in pos]
    return lemmas
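
A possible call site, assuming NLTK's WordNetLemmatizer and the get_wordnet_pos helper referenced above are available:

from nltk.stem import WordNetLemmatizer

lemmas = pos_and_lemmatize("The cats were runing quickly", WordNetLemmatizer())
# spelling is corrected first ('runing' -> 'running'), then the tokens are
# POS-tagged and lemmatized, e.g. 'cats' -> 'cat', 'running' -> 'run'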
Example #23
 def __init__(self):
     
     self.name = 'a sentence, that can be mutated'
     self.sentence = ''
     self.donothingrate = 0
     self.pointmutationrate = 0
     self.insertionrate = 0
     self.deletionrate = 0
     
     self.d = enchant.Dict("en_US")
     self.tknzr = get_tokenizer("en_US")
     
Example #24
 def process_text(self, text):
     """
     accepts: [String] text input
     returns: [List] tokens with URLs, @-mentions and retweet markers ('RT') filtered out
     """
     try:
         del self.result[:]
         to_check = []
         for (word, pos) in basic_tokenize(text):
             if '@' not in word and 'RT' not in word:
                 to_check.append(word)
         tknzr = get_tokenizer("en_US", filters=[URLFilter])
         return [word for (word, pos) in tknzr(' '.join(to_check))]
     except UnicodeEncodeError:
         pass
Example #25
 def __init__(self, lang, suggest, word_list_filename,
              tokenizer_lang='en_US', filters=None, context_line=False):
     if enchant_import_error is not None:
         raise RuntimeError(
             'Cannot instantiate SpellingChecker '
             'without PyEnchant installed',
         ) from enchant_import_error
     if filters is None:
         filters = []
     self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
     self.tokenizer = get_tokenizer(tokenizer_lang, filters=filters)
     self.original_tokenizer = self.tokenizer
     self.suggest = suggest
     self.context_line = context_line
Example #26
 def __init__(self, chatWindow, parent=None):
     QtGui.QTextEdit.__init__(self, parent)
     self.spChecker = enchant.DictWithPWL("en-US", "lolspeak.txt")
     self.spTokenizer = get_tokenizer("en-US", filters=[EmailFilter, URLFilter])
     self.font = None
     self.color = None
     self.charFormat = QtGui.QTextCharFormat()
     self.brush = QtGui.QBrush()
     self.setReadOnly(True)
     self.setAcceptRichText(True)
     self.isUserInput = False
     self.chatWindow = chatWindow
     self.wrongWordList = list()
     self.wrongWord = WrongWords()
Example #27
def tally_word_counts_in_column(table_column, word_list, output_len):
    '''
    Inputs:  'table_column' is a Pandas DataSeries where each element is a
        string (or list of strings) to be searched; 'word_list' is a list of
        words to search for; 'output_len' is the length of the output table
        'tallies'
    Outputs:  a table/DataFrame with each element in 'word_list' as a column
        name, each row corresponding to each element in 'table_column', and
        values of '0' or '1' indicating whether that column's 'word_list'
        element appeared in that row's 'table_column' element; '0' is 'No' and
        '1' is 'Yes'
    'word_list' elements that are single words must match a token from the
        'table_column' string to be considered a match; this prevents words from
        matching when they are only sub-words in the string (e.g., so that
        searching for 'hat' in 'that' does not yield a match);
    'word_list' elements that are multiple words or hyphenated are searched for
        in the string itself (which can produce the sub-word problem described
        above)
    '''

    import pandas as pd
    from numpy import arange
    from enchant.tokenize import get_tokenizer

    tallies = pd.DataFrame(0, index=arange(output_len), columns=word_list)
    tokenizer = get_tokenizer('en_US')
    message_interval = 1000

    for i, text in table_column.iteritems():

        print_intermittent_status_message_in_loop(i, message_interval,
                                                  output_len)

        if isinstance(text, list):  # if text stored in list instead of string
            if not text[0] and len(text) == 1:
                continue            # skips empty lists
            text = ' '.join(text)

        tokens = [w[0].lower() for w in tokenizer(text)]

        for j in range(len(word_list)):

            if (' ' in word_list[j]) or ('-' in word_list[j]):
                if word_list[j] in text:
                    tallies.iloc[i, j] = 1
            else:
                if word_list[j] in tokens:
                    tallies.iloc[i, j] = 1

    return tallies
Example #28
 def _get_contributors(self):
     logger.info('Scanning contributors')
     cmd = [
         'git', 'log', '--quiet', '--no-color',
         '--pretty=format:' + self._pretty_format,
     ]
     try:
         p = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
     except subprocess.CalledProcessError as err:
         logger.warning('Called: {}'.format(' '.join(cmd)))
         logger.warning('Failed to scan contributors: {}'.format(err))
         return set()
     output = p.stdout.decode('utf-8')
     tokenizer = get_tokenizer('en_US', filters=[])
     return set(word for word, pos in tokenizer(output))
Example #29
    def open(self):
        self.initialized = False
        self.private_dict_file = None

        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return

        self.ignore_list = [
            w.strip() for w in self.config.spelling_ignore_words.split(",")
        ]
        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        self.ignore_list.extend(["param", "pylint"])

        # Expand tilde to allow e.g. spelling-private-dict-file = ~/.pylintdict
        if self.config.spelling_private_dict_file:
            self.config.spelling_private_dict_file = os.path.expanduser(
                self.config.spelling_private_dict_file
            )

        if self.config.spelling_private_dict_file:
            self.spelling_dict = enchant.DictWithPWL(
                dict_name, self.config.spelling_private_dict_file
            )
            self.private_dict_file = open(self.config.spelling_private_dict_file, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()

        self.tokenizer = get_tokenizer(
            dict_name,
            chunkers=[ForwardSlashChunkder],
            filters=[
                EmailFilter,
                URLFilter,
                WikiWordFilter,
                WordsWithDigigtsFilter,
                WordsWithUnderscores,
                CamelCasedWord,
                SphinxDirectives,
            ],
        )
        self.initialized = True
Example #30
    def open(self):
        self.initialized = False
        self.private_dict_file = None

        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return

        self.ignore_list = [
            w.strip() for w in self.config.spelling_ignore_words.split(",")
        ]
        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        self.ignore_list.extend(["param", "pylint"])

        # Expand tilde to allow e.g. spelling-private-dict-file = ~/.pylintdict
        if self.config.spelling_private_dict_file:
            self.config.spelling_private_dict_file = os.path.expanduser(
                self.config.spelling_private_dict_file)

        if self.config.spelling_private_dict_file:
            self.spelling_dict = enchant.DictWithPWL(
                dict_name, self.config.spelling_private_dict_file)
            self.private_dict_file = open(
                self.config.spelling_private_dict_file, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()

        # Prepare regex for stripping punctuation signs from text.
        # ' and _ are treated in a special way.
        puncts = string.punctuation.replace("'", "").replace("_", "")
        self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
        self.tokenizer = get_tokenizer(dict_name,
                                       filters=[
                                           EmailFilter, URLFilter,
                                           WikiWordFilter,
                                           WordsWithDigigtsFilter,
                                           WordsWithUnderscores
                                       ])
        self.initialized = True
Example #31
def generate_post_statistics(
        text,
        spell_checking_locale="en_US",
        hyphenation_dictionary='/usr/share/myspell/dicts/hyph_en_US.dic'):
    """Creates a number of statistics of a forum post including:
	- a list of emails
	- a list of urls
	- number of misspelt works
	- Flesch-Kincaid readability score
	- a list of the spell checked text
	These results are only meaningful for (US) english text.  Two dictionaries are
	used, one for spell checking and one for hyphenation.  The first is provided
	as a locale (e.g. "en_US") that maps to a dictionary installed in enchant,
	the second as a filepath to the hyphenation dictionary that should be used
	for syllabul detection (e.g. /usr/share/myspell/dicts/hyph_en_US.dic).
	"""
    #spell checking first to get cleaner data for f-k
    #create dict and tokenizer at this level to save on reallocation
    dic = enchant.Dict(spell_checking_locale)
    tknzr = get_tokenizer(spell_checking_locale, (HTMLChunker, ),
                          (EmailFilter, URLFilter))

    sentances = __sentances_from_post(text)
    clean_sentances = []
    misspellings = 0
    for sentance in sentances:
        sentance_stats = __spell_check_sentance(sentance, dic, tknzr)
        corrected_string = sentance_stats["corrected_string"]
        if len(corrected_string) > 0:
            clean_sentances.append(corrected_string)
        misspellings += sentance_stats["misspelt_words"]
    clean_text = ". ".join(clean_sentances)

    #run f-k, from http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    scores = __readability_score_from_post(clean_text,
                                           locale=hyphenation_dictionary)
    scores["misspellings"] = misspellings
    scores["correct_post_text"] = clean_text

    #pull out emails and urls
    urls, emails = __urls_and_emails_from_post(text)
    scores["emails"] = " ".join(emails)
    scores["urls"] = " ".join(urls)
    return scores
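
A sketch of how the function might be called (the post text is made up; exact counts depend on the installed dictionaries):

stats = generate_post_statistics("I beleive this is fine. Mail me at a@b.com")
print(stats["misspellings"], stats["emails"], stats["urls"])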
Example #32
    def open(self) -> None:
        self.initialized = False
        if enchant is None:
            return
        dict_name = self.linter.config.spelling_dict
        if not dict_name:
            return

        self.ignore_list = [
            w.strip()
            for w in self.linter.config.spelling_ignore_words.split(",")
        ]
        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        self.ignore_list.extend(["param", "pylint"])

        self.ignore_comment_directive_list = [
            w.strip() for w in
            self.linter.config.spelling_ignore_comment_directives.split(",")
        ]

        if self.linter.config.spelling_private_dict_file:
            self.spelling_dict = enchant.DictWithPWL(
                dict_name, self.linter.config.spelling_private_dict_file)
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.linter.config.spelling_store_unknown_words:
            self.unknown_words: set[str] = set()

        self.tokenizer = get_tokenizer(
            dict_name,
            chunkers=[ForwardSlashChunker],
            filters=[
                EmailFilter,
                URLFilter,
                WikiWordFilter,
                WordsWithDigitsFilter,
                WordsWithUnderscores,
                CamelCasedWord,
                SphinxDirectives,
            ],
        )
        self.initialized = True
Example #33
def check_spelling(text):
    # TODO check if language is not English
    # TODO use dictionary with persons names
    valid_words = []
    invalid_words = []
    unchecked_words = []
    
    tknzr = get_tokenizer("en_GB", (URLFilter, HashFilter, MentionFilter))
    for (word, pos) in tknzr(text):
        try:
            valid = d.check(word) # check if word is valid
        except enchant.errors.Error as e:
            unchecked_words.append(word)
            #logger.debug("Unable to check if word is valid: '%s' reason: '%s'" % (word, e))
        else:
            target = valid_words if valid else invalid_words
            target.append(word)
        
    return {"valid": valid_words, "invalid": invalid_words, "unchecked": unchecked_words}
Example #34
def test_CombinedFilter(test_text):
    """Test several filters combined"""
    tkns = get_tokenizer("en_US",
                         filters=(URLFilter, WikiWordFilter,
                                  EmailFilter))(test_text)
    out = [t for t in tkns]
    exp = [
        ("this", 0),
        ("text", 5),
        ("with", 10),
        ("and", 30),
        ("not", 103),
        ("quite", 108),
        ("a", 114),
        ("url", 116),
        ("as", 157),
        ("well", 160),
    ]
    assert out == exp
Example #35
def check_file(fileName):
	dictionary = enchant.Dict(LANGUAGE)
	tknzr = get_tokenizer(LANGUAGE)
	vocabulary = read_file(fileName)

	line_counter = 1
	error_counter = 0
	printHeader = False
	for line in vocabulary:
		error_line = {"line": line_counter, "question": line['translation'], "words": []}
		line_counter = line_counter + 1
		for question in line['translation']:
			for (word, pos) in tknzr(question):
				if not dictionary.check(word):
					error_counter = error_counter + 1
					error_line["words"].append(word)
					if not printHeader:
						printHeader = True
						print(Color.RED + "\n============== FEHLER IN DATEI " + fileName + " ==============" + Color.END)
					print("Zeile %i : Begriff >%s< falsch oder unbekannt in: %s" % (error_line["line"], word, error_line["question"]))
	if error_counter == 0:
		print(Color.GREEN)
	else:
		print(Color.RED)
	print("--> " + str(error_counter) +
			" Rechtschreibfehler in der Vokabeldatei gefunden!\n" + Color.END, flush=True)
	return error_counter
Example #36
    def open(self):
        self.initialized = False
        self.private_dict_file = None

        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return

        self.ignore_list = [w.strip() for w in self.config.spelling_ignore_words.split(",")]
        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        self.ignore_list.extend(["param", "pylint"])

        # Expand tilde to allow e.g. spelling-private-dict-file = ~/.pylintdict
        if self.config.spelling_private_dict_file:
            self.config.spelling_private_dict_file = os.path.expanduser(
                self.config.spelling_private_dict_file)

        if self.config.spelling_private_dict_file:
            self.spelling_dict = enchant.DictWithPWL(
                dict_name, self.config.spelling_private_dict_file)
            self.private_dict_file = open(
                self.config.spelling_private_dict_file, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()

        # Prepare regex for stripping punctuation signs from text.
        # ' and _ are treated in a special way.
        puncts = string.punctuation.replace("'", "").replace("_", "")
        self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
        self.tokenizer = get_tokenizer(dict_name, filters=[EmailFilter,
                                                           URLFilter,
                                                           WikiWordFilter,
                                                           WordsWithDigigtsFilter,
                                                           WordsWithUnderscores])
        self.initialized = True
Example #37
    def __init__(self, document, parent=None):
        super(Spellcheck, self).__init__(parent)

        self.createUI()
        if document is None:
            return
        else:
            self.doc = document.toPlainText()
        # copy the document text and strip out HTML, URL's and Email addresses
        tokens = get_tokenizer("en_US", chunkers=(HTMLChunker,), filters=[EmailFilter, URLFilter])
        self.editDoc = [] # tuples go into this list
        for word in tokens(self.doc):
            self.editDoc.append(word)
        self.wordsToCheck = dict((t[0], i) for i, t in enumerate(self.editDoc))
        # >>> Output self.wordsToCheck , unit Test with 10 cases
        self.wordlist = enchant.request_dict("en_GB")
        self.misspeltList = []
        for key in self.wordsToCheck.keys():
            self.checkWord(key)
        # >>> Plonk a test here
        
        self.highlightMisspelt(self.misspeltList[Spellcheck.index:])
Example #38
def test_URLFilter(test_text):
    """Test filtering of URLs"""
    tkns = get_tokenizer("en_US", filters=(URLFilter, ))(test_text)
    out = [t for t in tkns]
    exp = [
        ("this", 0),
        ("text", 5),
        ("with", 10),
        ("and", 30),
        ("SomeLinksLike", 34),
        ("AndOthers", 93),
        ("not", 103),
        ("quite", 108),
        ("a", 114),
        ("url", 116),
        ("with", 134),
        ("an", 139),
        ("aemail", 142),
        ("address", 149),
        ("as", 157),
        ("well", 160),
    ]
    assert out == exp
Example #39
#!/usr/bin/env python
#coding=utf-8
from enchant.tokenize import get_tokenizer
tknzr = get_tokenizer("en_US")
x = """
qui est simple pour toutes les autres. Lorsqu‘elle le fait remarquer,
on lui rétorque que c‘est le système qui veut ça. .. « La bureaucratie
universitaire est si lourde », dit-elle avec ironie, en ajoutant: « Son
poids n’est pas le même pour tout le monde.»

QUELQUES aerouns

La situation de Joëlle — tous ceux qui ont eu l’occasion d‘évoluer
en organisation le savent — est d'une extrême banalité. Des person—
nes qui se trouvent mises sur la touche, sans avoir jamais démÉriuâ
et ayant même, en tendance, une implication plutôt plus élevée
que la moyenne, tout le monde en a rencontré, La fréquence de ces
situations banalise—t-elle leur violence? Le discours qui consiste à
s‘accommoder de ce constat est bien connu: « C‘est un peu injuste
bien sûr, mais la justice, n‘est-elle pas une utopie romantique? Et
puis, si toutes ses collègues ayant moins d’ancienneté ont réussi à
connaître une promotion, c’est quand même un signe. . . Il n’y a pas
de fumée sans feu ! » Le propos pourrait être prolongé car joelle,
tout le monde la connaît: elle est la, dans ce bureau de l‘université ;
ici, dans une caisse d’allocations familiales ; ailleurs, dans un secré-
"""
print [w for w in tknzr(x)]
Example #40
def test_acronym_unicode():
    text = u'a front-end for DBM-style databases'
    t = get_tokenizer('en_US', [])
    f = filters.AcronymFilter(t)
    words = [w[0] for w in f(text)]
    assert u'DBM' not in words, 'Failed to filter out acronym'
Example #41
"""
import csv
import string
import enchant
import textmining
import datetime

from termDocumentMatrix import TermDocumentMatrix
from enchant.checker import SpellChecker
from enchant.tokenize import get_tokenizer, HTMLChunker, EmailFilter, URLFilter, WikiWordFilter

TEST = ('test_with_solutions.csv', 'test.csv')
TRAIN = ('train.csv', 'train.csv')

dictionary = enchant.Dict("en_US")
tokenizer = get_tokenizer(tag="en_US", chunkers=[HTMLChunker], filters=[EmailFilter, URLFilter, WikiWordFilter])

def preprocessSpellCheck():
    """ Parses the comments from each csv file in DATA_FILES and creates a new
        corresponding file with comments that have been delimited into words
        that have been spell checked.
    """
    for dataFile in DATA_FILES:
        try:
            inputFile = open("../data/" + dataFile, 'rb')
            outputFile = open("../data/spellChecked_" + dataFile, 'wb')
            fileReader = csv.reader(inputFile, delimiter=',')
            fileWriter = csv.writer(outputFile, delimiter=',')
            fileReader.next() #Skip header labels

            for row in fileReader:
Example #42
def extract_features(tlc):
    """extract features from the text

    Args:
        tlc (dict[str]): all the attributes of a tlc

    Returns:
        [dict]: a dictionary of features extracted
    """
    text = clean_text(tlc['body'])
    fields = dict()
    # add features here #
    fields['Top_comment_word_count'] = len(text.split(' '))
    fields['Top_comment_text'] = text

    # Extract time-based features
    def get_day_of_week(text):
        return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S').weekday() + 1

    def get_day_of_month(text):
        return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S').day

    def get_time_of_day(text):
        return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S').hour
    time_local = time.localtime(tlc['created_utc'])
    time_local = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
    fields['Top_comment_day'] = get_day_of_month(time_local)
    fields['Top_comment_day_of_week'] = get_day_of_week(time_local)
    fields['Top_comment_hour'] = get_time_of_day(time_local)

    # Extract gender value
    gp = GenderPerformr()
    probs, _ = gp.predict(tlc['author'])
    # Rescale it from [0,1] to [-1,1]
    fields['Top_comment_author_gender_value'] = 2 * probs - 1

    # Extract percentage of misspellings
    check = SpellChecker("en_US")
    tokenizer = get_tokenizer("en_US")
    # Prevent the denominator from 0
    def weird_division(n, d):
        return n / d if d else 0

    def get_mispellings_percentage(text):
        mispelling_count = 0
        total_count = 0
        if text == 'nan':
            return total_count
        else:
            check.set_text(text)
            for err in check:
                mispelling_count = mispelling_count + 1
            for w in tokenizer(text):
                total_count = total_count + 1
            value = weird_division(mispelling_count, total_count)
            return value
    fields['Top_comment_mispellings'] = get_mispellings_percentage(text)

    # Get politeness, agreement, support scores, and rescale them from [1,5] to [-1,1]
    ar = Agreementr()
    pr = Politenessr()
    sr = Supportr()
    fields['Top_comment_agreement_value'] = 0.5*float(ar.predict([text]))-1.5
    fields['Top_comment_politeness_value'] = 0.5*float(pr.predict([text]))-1.5
    fields['Top_comment_support_value'] = 0.5*float(sr.predict([text]))-1.5

    # Get toxicity scores
    KEY = "yourkey.txt" # os.getenv("GOOGLE_API_KEY")
    service = discovery.build('commentanalyzer', 'v1alpha1', developerKey=KEY)

    def get_results(request_id, response, exception):
        toxicity_scores.append((request_id, response))

    toxicity_scores = []
    count = 0
    batch = service.new_batch_http_request(callback=get_results)
    analyze_request = {
        'comment': {'text': text},
        "requestedAttributes": {
            "TOXICITY": {},
            "SEVERE_TOXICITY": {},
            "ATTACK_ON_COMMENTER": {}
        }
    }
    batch.add(service.comments().analyze(body=analyze_request), request_id=str(count))
    batch.execute()
    toxic_score = toxicity_scores[0][1]['attributeScores']['TOXICITY']['summaryScore']['value']
    attack_score = toxicity_scores[0][1]['attributeScores']['ATTACK_ON_COMMENTER']['summaryScore']['value']
    if toxic_score > 0.5:
        fields['Top_comment_untuned_toxicity'] = 1
    else:
        fields['Top_comment_untuned_toxicity'] = 0
    if toxic_score > 0.8 and attack_score > 0.5:
        fields['Top_comment_tuned_toxicity'] = 1
    else:
        fields['Top_comment_tuned_toxicity'] = 0
    # end of feature extractions #
    return fields
Example #43
        print "word is error"
        print "word may be: ",
        print us_dict.suggest(word)


print "#### test combine dictionary ####"
# combine dictionary, add words in file to dictionary
combine_dict = enchant.DictWithPWL("en_US", "my_words.text")
if combine_dict.check(test_words[1]):
    print "combine dictionary has the word: %s" %(test_words[1])
else:
    print "combine dictionary doesn't have word: %s" %(test_words[1])

print "#### test SpellChecker ####"
test_text = "it's okay, tomorow is a god choise"
chkr = SpellChecker("en_US")
chkr.set_text(test_text)
# iterating the checker yields the misspelled words;
# 'god' (meant: 'good') is a valid word, so this real-word error is not flagged
for err in chkr:
    print "[ERROR]: %s " %(err.word)

print "#### test tokenizer ####"
test_tokenizer_text = "It rains dog and cat. What? Dog and cat?"
tknzr = get_tokenizer("en_US")
tknzr_rlt = tknzr(test_tokenizer_text)
# return is a tuple, first is word, second is position
for w in tknzr_rlt:
    print w

Example #44
 def __init__(self, lang, suggest, word_list_filename, filters=[]):
     self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
     self.tokenizer = get_tokenizer(lang, filters=filters)
     self.original_tokenizer = self.tokenizer
     self.suggest = suggest
Example #45
def check_collection(inpath, outpath, lang, wordFiles=[]):
    """
    Checks the orthography of the text in a collection. The expected input are plain text files.
    
    Arguments:
    inpath (string): path to the input files, including file name pattern
    outpath (string): path to the output file, including the output file's name
    lang (string): which dictionary to use, e.g. "es", "fr", "de"
    wordFiles (list): optional; list of strings; paths to files with lists of words which will not be treated as errors (e.g. named entities)
    """

    try:
        try:
            tknzr = get_tokenizer(lang)
        except enchant.errors.TokenizerNotFoundError:
            tknzr = get_tokenizer()
        chk = checker.SpellChecker(lang, tokenize=tknzr)

    except enchant.errors.DictNotFoundError:
        print("ERROR: The dictionary " + lang + " doesn't exist. Please choose another dictionary.")
        sys.exit(0)

    all_words = []
    all_num = []
    all_idnos = []

    print("...checking...")
    for file in glob.glob(inpath):
        idno = os.path.basename(file)[-10:-4]
        all_idnos.append(idno)
        
        err_words = []

        with open(file, "r", encoding="UTF-8") as fin:
            intext = fin.read().lower()
            chk.set_text(intext)

        if len(wordFiles) != 0:
            allCorrects = ""
            for wordFile in wordFiles:
                with open(wordFile, "r", encoding="UTF-8") as f:
                    corrects = f.read().lower()
                    allCorrects = allCorrects + corrects

        for err in chk:
            if not wordFiles or err.word not in allCorrects:
                err_words.append(err.word)
        all_words.append(err_words)

        err_num = collections.Counter(err_words)
        all_num.append(err_num)
        
        print("..." + str(len(err_num)) + " different errors found in " + idno)
        
    df = pd.DataFrame(all_num,index=all_idnos).T
    
    df = df.fillna(0)
    df = df.astype(int)
    
    df["sum"] = df.sum(axis=1)
    # df = df.sort("sum", ascending=False)
    df = df.sort_values(by="sum", ascending=False)
    
    df.to_csv(outpath)
    print("done")

Example #46
from enchant.tokenize import get_tokenizer, HTMLChunker
from enchant.checker import SpellChecker
import codecs

# Languages supported by PyEnchant by default:

 # en_GB: British English
 # en_US: American English
 # de_DE: German
 # fr_FR: French


chkr = SpellChecker("en_GB")
tknzr = get_tokenizer("en_GB",chunkers=(HTMLChunker,))

# HTMLChunker copes with XML as well (same syntax).

file = codecs.open("SpellX_test.txt", 'r',encoding='latin-1')
resu = open("test-result.txt", "w")
for f in file.readlines():
    a = [w for w in tknzr(f)]
    chkr.set_text(f)
    for err in chkr:
        resu.writelines(err.word + "\n")
        print("ERROR:", err.word)
        
resu.close()
    
Example #47
import interface
import urllib2
import json
import enchant
from enchant.tokenize import get_tokenizer, EmailFilter, URLFilter
from enchant.checker import SpellChecker
from stringsafety import *

d = enchant.Dict("en_UK")
tkn = get_tokenizer("en_UK",filters=[URLFilter,EmailFilter])

def SetYahooID(str):
    global id
    id = str

id=''

def Spell(word):

    if d.check(word):
        return word
    else:
        return d.suggest(word)
    '''
    url = "http://search.yahooapis.com/WebSearchService/V1/spellingSuggestion?appid={0}&output=json&query={1}".format(id,escapeurl(word))
    request = urllib2.Request(url,None,{'Referer':'http://spacerat.meteornet.net'})
    response={}
    data = None
    try:
        response = urllib2.urlopen(request)
Example #48
blog = pyblog.WordPress('http://prideout.net/blog/xmlrpc.php', 'admin', passwd)

# If this can't find the post, it'll throw an exception with a good error message.
# Since it goes uncaught, it aborts the program.  Which is fine.
post = blog.get_post(postid)

#print "Found post %d with the following keys:" % postid
#print '\n'.join(post.keys())

contents = open(filename, 'r').read()
#contents = filter(lambda c: c not in "\r", contents)
print "Slurped up '%s'" % filename

if spellCheck:
    tokenizer = get_tokenizer("en_US",chunkers=(HTMLChunker,))
    words = tokenizer(contents)
    dictionary = enchant.Dict("en_US") 
    misspelled = set()
    for word in words:
        if not dictionary.check(word[0]):
            misspelled.add(word[0])
    print colorama.Fore.CYAN + colorama.Back.BLACK
    for line in formatColumns(list(misspelled), 3):
        print line
    print colorama.Fore.RESET + colorama.Back.RESET
    
post['description'] = contents
publish = False
blog.edit_post(postid, post, publish)
Example #49
from __future__ import unicode_literals
from random import randint

from django.db import models
from django.contrib.auth.models import User

from enchant import Dict
from enchant.tokenize import get_tokenizer


DICTIONARY = Dict('en_US')
TOKENIZER = get_tokenizer('en_US')


def default_randomness():
	return randint(0, 10000)


class MotionFile(models.Model):
	MARKER_SET_KIT = 0  # do not change values, since they are stored in the DB!
	MARKER_SET_CMU = 1
	
	class Meta:
		unique_together = ('motion_db_id', 'motion_db_file_id')
	
	motion_db_id = models.PositiveIntegerField()
	motion_db_file_id = models.PositiveIntegerField()
	filename = models.CharField(max_length=255, unique=True)
	mean_perplexity = models.FloatField(default=0.)
	is_broken_confirmed = models.BooleanField(default=False)
	is_broken_reported = models.BooleanField(default=False)