def __init__(self, lang=None, text=None, tokenize=None, chunkers=None, filters=None):
    """Constructor for the SpellChecker class.

    SpellChecker objects can be created in two ways, depending on
    the nature of the first argument.  If it is a string, it
    specifies a language tag from which a dictionary is created.
    Otherwise, it must be an enchant Dict object to be used.

    Optional keyword arguments are:

        * text:  to set the text to be checked at creation time
        * tokenize:  a custom tokenization function to use
        * chunkers:  a list of chunkers to apply during tokenization
        * filters:  a list of filters to apply during tokenization

    If <tokenize> is not given and the first argument is a Dict,
    its 'tag' attribute must be a language tag so that a tokenization
    function can be created automatically.  If this attribute is missing
    the user's default language will be used.
    """
    if lang is None:
        lang = get_default_language()
    if isinstance(lang, basestring):
        dict = enchant.Dict(lang)
    else:
        dict = lang
        try:
            lang = dict.tag
        except AttributeError:
            lang = get_default_language()
    if lang is None:
        raise DefaultLanguageNotFoundError
    self.lang = lang
    self.dict = dict
    if tokenize is None:
        try:
            tokenize = get_tokenizer(lang, chunkers, filters)
        except TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tokenize = get_tokenizer(None, chunkers, filters)
    self._tokenize = tokenize
    self.word = None
    self.wordpos = None
    self._ignore_words = {}
    self._replace_words = {}
    # Default to the empty string as the text to be checked
    self._text = array.array('u')
    self._use_tostring = False
    self._tokens = iter([])
    if text is not None:
        self.set_text(text)
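# Usage sketch for the constructor above (not from the original source):
# a SpellChecker can be built either from a language tag or from an
# already-created enchant Dict.  The sample strings are invented.
import enchant
from enchant.checker import SpellChecker

chkr_from_tag = SpellChecker("en_US", text="Thiss is sample text.")
chkr_from_dict = SpellChecker(enchant.Dict("en_US"))
chkr_from_dict.set_text("Another peice of text.")
for err in chkr_from_tag:
    print(err.word)  # -> "Thiss"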
def setDict(self, sp_dict):
    """Sets the spelling dictionary to be used"""
    try:
        self.tokenizer = tokenize.get_tokenizer(sp_dict.tag,
                                                chunkers=self._chunkers,
                                                filters=self.token_filters)
    except TokenizerNotFoundError:
        # Fall back to the "good for most euro languages" English tokenizer
        self.tokenizer = tokenize.get_tokenizer(chunkers=self._chunkers,
                                                filters=self.token_filters)
    self._sp_dict = sp_dict
    self.rehighlight()
def __init__(self, lang='en_US'):
    """Setup tokenizer."""
    self.lang = lang
    self._dict = enchant.Dict(self.lang)
    self._tk = get_tokenizer(self.lang, chunkers=(HTMLChunker,))
def spellcheck():
    global SCORE
    global PHISHING_FLAGS
    dictionary = enchant.Dict("en_US")
    tokenizer = get_tokenizer("en_US")
    # has_mispellings = any((not dictionary.check(word)) and word[0].islower() for word in word_list)
    for word in tokenizer(EMAIL_BODY):
        word = word[0]
        if not dictionary.check(word):
            print word
            misspelled = True
        else:
            misspelled = False
        if word[0].islower():
            weirdly_cased = False
        else:
            weirdly_cased = True
        if misspelled and not weirdly_cased:
            # ONLY increment score when misspelled, not capitalized
            SCORE += MISSPELL_SCORE
            PHISHING_FLAGS.append('Misspelled word(s)')
            return
    return
def test_HTMLChunker():
    """Test chunking of HTML documents"""
    text = """hello<html><head><title>my title</title></head><body>this is a
              <b>simple</b> HTML document for <p> test<i>ing</i> purposes</p>.
            It < contains > various <-- special characters.
            """
    tkns = get_tokenizer("en_US", chunkers=(HTMLChunker,))(text)
    out = [t for t in tkns]
    exp = [
        ("hello", 0), ("my", 24), ("title", 27), ("this", 53), ("is", 58),
        ("a", 61), ("simple", 80), ("HTML", 91), ("document", 96), ("for", 105),
        ("test", 113), ("ing", 120), ("purposes", 128), ("It", 154),
        ("contains", 159), ("various", 170), ("special", 182), ("characters", 190),
    ]
    assert out == exp
    for (word, pos) in out:
        assert text[pos:pos + len(word)] == word
def test_EmailFilter(test_text):
    """Test filtering of email addresses"""
    tkns = get_tokenizer("en_US", filters=(EmailFilter,))(test_text)
    out = [t for t in tkns]
    exp = [
        ("this", 0), ("text", 5), ("with", 10), ("http", 15), ("url", 22),
        ("com", 26), ("and", 30), ("SomeLinksLike", 34), ("ftp", 62),
        ("my", 68), ("site", 71), ("com", 76), ("au", 80), ("some", 83),
        ("file", 88), ("AndOthers", 93), ("not", 103), ("quite", 108),
        ("a", 114), ("url", 116), ("as", 157), ("well", 160),
    ]
    assert out == exp
def test_WikiWordFilter(test_text):
    """Test filtering of WikiWords"""
    tkns = get_tokenizer("en_US", filters=(WikiWordFilter,))(test_text)
    out = [t for t in tkns]
    exp = [
        ("this", 0), ("text", 5), ("with", 10), ("http", 15), ("url", 22),
        ("com", 26), ("and", 30), ("ftp", 62), ("my", 68), ("site", 71),
        ("com", 76), ("au", 80), ("some", 83), ("file", 88), ("not", 103),
        ("quite", 108), ("a", 114), ("url", 116), ("with", 134), ("an", 139),
        ("aemail", 142), ("address", 149), ("as", 157), ("well", 160),
    ]
    assert out == exp
def __call__(self, query):
    if EnchantTokenizer._SINGLETON_TOKENIZER is None:
        from enchant.tokenize import get_tokenizer
        # XXX make language configurable
        EnchantTokenizer._SINGLETON_TOKENIZER = get_tokenizer('en_US')
    return EnchantTokenizer._SINGLETON_TOKENIZER(query)
def __init__(self, language="en_US"):
    if not enchant.dict_exists(language):
        # stdlib logging uses %-style interpolation, not str.format
        logging.warning("Spelling_Corrector: no dictionary for %s, please check it!", language)
        logging.warning("Available languages: %s", enchant.list_languages())
        language = "en_US"
    self.dict = enchant.Dict(language)
    self.check = SpellChecker(language)
    self.tokenizer = get_tokenizer(language)
def custom_word_tokenize(text):
    tokenizer = get_tokenizer("en_US")
    words = []
    for w in tokenizer(text):
        words.append(w[0])
    return words
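# Example call of custom_word_tokenize above; the input string is invented.
# The tokenizer yields (word, position) tuples, and the wrapper keeps only
# the words.
print(custom_word_tokenize("Hello there, world!"))  # -> ['Hello', 'there', 'world']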
def filter_text_before_spell_check(language, text):
    """Ignore emails, URLs and HTML tags in raw text; return the filtered text as a string."""
    tknzr = get_tokenizer(language, chunkers=(HTMLChunker,),
                          filters=[EmailFilter, URLFilter, WikiWordFilter])
    filteredText = "".join([w[0] + " " for w in tknzr(text)])
    return filteredText.strip()
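# Hypothetical call of filter_text_before_spell_check above, showing that
# markup, email addresses and URLs are dropped before spell checking; the
# input string is invented for illustration.
sample = '<p>Email admin@example.com or visit http://example.com today.</p>'
print(filter_text_before_spell_check("en_US", sample))
# -> "Email or visit today"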
def striptxt_pcap(pcap):
    tokenizer = get_tokenizer("en_US")
    a = rdpcap(pcap)
    sessions = a.sessions()
    packet_count = 0
    unencrypted_packet_count = 0
    encrypted_packet_count = 0
    encrypted_len = 0
    unencrypted_len = 0
    convs = {'Total Packets': 0, 'Plaintext Packets': 0, 'Encrypted Packets': 0,
             'Plaintext Bytes': 0, 'Encrypted Bytes': 0,
             'Plaintext Conversations': [], 'Encrypted Conversations': []}
    for session in sessions:
        encrypted = 'unknown'
        session_packets = 0
        for packet in sessions[session]:
            session_packets += 1
            packet_count += 1
            try:
                payload = bytes(packet[TCP].payload).decode('utf-8')
                # If the tokenizer finds any words, treat the payload as plaintext
                word_tuple = [w for w in tokenizer(payload)]
                encrypted = 'Plaintext Conversations' if word_tuple else 'Encrypted Conversations'
                if encrypted == 'Plaintext Conversations':
                    unencrypted_len += len(packet[TCP].payload)
                else:
                    encrypted_len += len(packet[TCP].payload)
                convs[encrypted].append(f'{packet[IP].src}:{packet[TCP].sport},{packet[IP].dst}:{packet[TCP].dport}')
            except Exception:
                pass
            try:
                payload = bytes(packet[UDP].payload).decode('utf-8')
                word_tuple = [w for w in tokenizer(payload)]
                encrypted = 'Plaintext Conversations' if word_tuple else 'Encrypted Conversations'
                if encrypted == 'Plaintext Conversations':
                    unencrypted_len += len(packet[UDP].payload)
                else:
                    encrypted_len += len(packet[UDP].payload)
                convs[encrypted].append(f'{packet[IP].src}:{packet[UDP].sport},{packet[IP].dst}:{packet[UDP].dport}')
            except Exception:
                pass
        if encrypted == 'Plaintext Conversations':
            unencrypted_packet_count += session_packets
        elif encrypted == 'Encrypted Conversations':
            encrypted_packet_count += session_packets
    convs['Total Packets'] = packet_count
    convs['Plaintext Packets'] = unencrypted_packet_count
    convs['Encrypted Packets'] = encrypted_packet_count
    convs['Plaintext Bytes'] = unencrypted_len
    convs['Encrypted Bytes'] = encrypted_len
    convs['Plaintext Conversations'] = list(set(convs['Plaintext Conversations']))
    convs['Encrypted Conversations'] = list(set(convs['Encrypted Conversations']))
    results = {'convcontents': convs}
    print(results)
    return results
def __init__(self, lang='en_US'):
    """Setup tokenizer.

    Create a new tokenizer based on lang.  This lets us skip the HTML
    and only care about our contents.
    """
    self.lang = lang
    self._dict = enchant.Dict(self.lang)
    self._tk = get_tokenizer(self.lang, chunkers=(HTMLChunker,))
def __init__(self, scheduler=None, inq=None, outq=None):
    # multiprocessing.Process.__init__(self)
    super(Interpreter, self).__init__()
    self.tknzr = get_tokenizer("en_US")
    print "I:", self.name
    self.scheduler = scheduler
    self.inq = inq
    self.outq = outq
    self.daemon = True
    self.stop = threading.Event()
def __init__(self, lang, suggest, word_list_filename, tokenizer_lang='en_US', filters=[]):
    self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
    # pass filters by keyword: in current pyenchant the second positional
    # argument of get_tokenizer is chunkers, not filters
    self.tokenizer = get_tokenizer(tokenizer_lang, filters=filters)
    self.original_tokenizer = self.tokenizer
    self.suggest = suggest
def open(self):
    self.initialized = False
    self.private_dict_file = None

    if enchant is None:
        return
    dict_name = self.linter.namespace.spelling_dict
    if not dict_name:
        return

    self.ignore_list = [
        w.strip() for w in self.linter.namespace.spelling_ignore_words.split(",")
    ]
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    self.ignore_list.extend(["param", "pylint"])

    self.ignore_comment_directive_list = [
        w.strip()
        for w in self.linter.namespace.spelling_ignore_comment_directives.split(",")
    ]

    # Expand tilde to allow e.g. spelling-private-dict-file = ~/.pylintdict
    if self.linter.namespace.spelling_private_dict_file:
        self.linter.namespace.spelling_private_dict_file = os.path.expanduser(
            self.linter.namespace.spelling_private_dict_file
        )

    if self.linter.namespace.spelling_private_dict_file:
        self.spelling_dict = enchant.DictWithPWL(
            dict_name, self.linter.namespace.spelling_private_dict_file
        )
        self.private_dict_file = open(  # pylint: disable=consider-using-with
            self.linter.namespace.spelling_private_dict_file, "a", encoding="utf-8"
        )
    else:
        self.spelling_dict = enchant.Dict(dict_name)

    if self.linter.namespace.spelling_store_unknown_words:
        self.unknown_words = set()

    self.tokenizer = get_tokenizer(
        dict_name,
        chunkers=[ForwardSlashChunker],
        filters=[
            EmailFilter,
            URLFilter,
            WikiWordFilter,
            WordsWithDigitsFilter,
            WordsWithUnderscores,
            CamelCasedWord,
            SphinxDirectives,
        ],
    )
    self.initialized = True
def __init__(self, lang="en_US"): """ Setup tokenizer. Create a new tokenizer based on lang. This lets us skip the HTML and only care about our contents. """ self.lang = lang self._dict = enchant.Dict(self.lang) self._tk = get_tokenizer(self.lang, chunkers=(HTMLChunker,))
def check(s):
    chkr = SpellChecker("en_US")
    chkr.set_text(s)
    chkr_count = 0
    thnzr_count = 0
    for i in chkr:
        chkr_count = chkr_count + 1
    thnzr = get_tokenizer("en_US")
    for i in thnzr(s):
        thnzr_count = thnzr_count + 1
    if thnzr_count == 0:
        return 1
    # Fraction of tokens the checker did not flag as errors
    return 1 - chkr_count / thnzr_count
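# Rough usage of check() above: the returned value is the fraction of
# tokens that are spelled correctly.  The example string is invented.
score = check("Only one wrd here is misspelled")
print(score)  # -> about 0.833 (1 flagged token out of 6)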
import re

from enchant import Dict
from enchant.tokenize import get_tokenizer

# Assumed definition for the module-level `regex` used below: strip
# punctuation except '-' (the original definition was not shown).
regex = re.compile(r"[^\w\s-]")


def text2words(text, lang='en_US', min_length=3):
    dict_en_US = Dict(lang)
    tknzr = get_tokenizer(lang)
    # Processed text: punctuation removal (except '-')
    p_text = regex.sub('', text)
    tokens = [token for token, _ in tknzr(p_text)]
    words = filter(lambda token: len(token) >= min_length, tokens)
    words = filter(dict_en_US.check, words)
    return words
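# Hypothetical use of text2words above; note it returns a lazy filter
# object, so it is wrapped in list() here.  The input text is invented.
print(list(text2words("Speling errors are removed; short and unknown tokens too.")))
# -> ['errors', 'are', 'removed', 'short', 'and', 'unknown', 'tokens', 'too']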
def __init__(self, fn):
    self.fn = fn
    self.content = open(fn).read().decode('utf8')
    self.lines = self.split_(self.content)
    self.errors = []
    self.spelld = enchant.Dict("en_UK")
    self.tknzr = get_tokenizer("en_UK")
    self.spellerrors = []
    self.latexterms = ("newpage", "clearpage", "textit", "textbf", "textsc",
                       "textwidth", "tabref", "figref", "sectref", "emph")
def pos_and_lemmatize(text, lemmatizer):
    sc = checker.SpellChecker('en_US', text)
    for err in sc:
        try:
            err.replace(err.suggest()[0])
        except IndexError:
            pass
    # Retrieve the corrected text from the checker before tokenizing,
    # so the replacements made above are actually used
    text = sc.get_text()
    tokenizer = tokenize.get_tokenizer('en_US')
    words = [w[0].encode('ascii', 'ignore').decode('utf-8') for w in tokenizer(text)]
    pos = nltk.pos_tag(words)
    lemmas = [lemmatizer.lemmatize(w, pos=get_wordnet_pos(p)) for (w, p) in pos]
    return lemmas
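# Hedged usage sketch for pos_and_lemmatize above, assuming NLTK's
# WordNetLemmatizer, downloaded tagger/wordnet data, and that
# get_wordnet_pos() is defined elsewhere in the original module.
from nltk.stem import WordNetLemmatizer

lemmas = pos_and_lemmatize("The dogs wer running", WordNetLemmatizer())
# misspellings are corrected first, then tokens are tagged and lemmatized,
# yielding something like ['The', 'dog', 'be', 'run']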
def __init__(self):
    self.name = 'a sentence, that can be mutated'
    self.sentence = ''
    self.donothingrate = 0
    self.pointmutationrate = 0
    self.insertionrate = 0
    self.deletionrate = 0
    self.d = enchant.Dict("en_US")
    self.tknzr = get_tokenizer("en_US")
def process_text(self, text):
    """
    accepts: [String] text input
    returns: [List] list of tokens with URLs, mentions and retweet markers filtered out
    """
    try:
        del self.result[:]
        to_check = []
        for (word, pos) in basic_tokenize(text):
            if '@' not in word and 'RT' not in word:
                to_check.append(word)
        tknzr = get_tokenizer("en_US", filters=[URLFilter])
        return [word for (word, pos) in tknzr(' '.join(to_check))]
    except UnicodeEncodeError:
        pass
def __init__(self, lang, suggest, word_list_filename,
             tokenizer_lang='en_US', filters=None, context_line=False):
    if enchant_import_error is not None:
        raise RuntimeError(
            'Cannot instantiate SpellingChecker '
            'without PyEnchant installed',
        ) from enchant_import_error
    if filters is None:
        filters = []
    self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
    self.tokenizer = get_tokenizer(tokenizer_lang, filters=filters)
    self.original_tokenizer = self.tokenizer
    self.suggest = suggest
    self.context_line = context_line
def __init__(self, chatWindow, parent=None):
    QtGui.QTextEdit.__init__(self, parent)
    self.spChecker = enchant.DictWithPWL("en-US", "lolspeak.txt")
    # pass filters by keyword: the second positional argument of
    # get_tokenizer is chunkers in current pyenchant
    self.spTokenizer = get_tokenizer("en-US", filters=[EmailFilter, URLFilter])
    self.font = None
    self.color = None
    self.charFormat = QtGui.QTextCharFormat()
    self.brush = QtGui.QBrush()
    self.setReadOnly(True)
    self.setAcceptRichText(True)
    self.isUserInput = False
    self.chatWindow = chatWindow
    self.wrongWordList = list()
    self.wrongWord = WrongWords()
def tally_word_counts_in_column(table_column, word_list, output_len):
    '''
    Inputs:
        'table_column' is a Pandas Series where each element is a string
            (or list of strings) to be searched;
        'word_list' is a list of words to search for;
        'output_len' is the length of the output table 'tallies'
    Outputs:
        a table/DataFrame with each element in 'word_list' as a column name,
        each row corresponding to each element in 'table_column', and values
        of '0' or '1' indicating whether that column's 'word_list' element
        appeared in that row's 'table_column' element; '0' is 'No' and '1' is 'Yes'

    'word_list' elements that are single words must match a token from the
    'table_column' string to be considered a match; this prevents words from
    matching when they are only sub-words in the string (e.g., so that
    searching for 'hat' in 'that' does not yield a match); 'word_list'
    elements that are multiple words or hyphenated are searched for in the
    string itself (which can produce the sub-word problem described above)
    '''
    import pandas as pd
    from numpy import arange
    from enchant.tokenize import get_tokenizer

    tallies = pd.DataFrame(0, index=arange(output_len), columns=word_list)
    tokenizer = get_tokenizer('en_US')
    message_interval = 1000
    for i, text in table_column.iteritems():
        print_intermittent_status_message_in_loop(i, message_interval, output_len)
        if isinstance(text, list):  # if text stored in list instead of string
            if not text[0] and len(text) == 1:
                continue  # skips empty lists
            text = ' '.join(text)
        tokens = [w[0].lower() for w in tokenizer(text)]
        for j in range(len(word_list)):
            if (' ' in word_list[j]) or ('-' in word_list[j]):
                if word_list[j] in text:
                    tallies.iloc[i, j] = 1
            else:
                if word_list[j] in tokens:
                    tallies.iloc[i, j] = 1
    return tallies
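# Small illustrative call of tally_word_counts_in_column above, using an
# invented two-row Series; 'hat' matches only whole tokens, while the
# hyphenated 'well-known' is searched as a substring.  A no-op stub stands
# in for the status-message helper the original module defines elsewhere.
import pandas as pd

def print_intermittent_status_message_in_loop(i, interval, total):
    pass  # stub for the original helper

col = pd.Series(["that hat fits", "a well-known fact"])
print(tally_word_counts_in_column(col, ["hat", "well-known"], len(col)))
#    hat  well-known
# 0    1           0
# 1    0           1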
def _get_contributors(self):
    logger.info('Scanning contributors')
    cmd = [
        'git', 'log',
        '--quiet',
        '--no-color',
        '--pretty=format:' + self._pretty_format,
    ]
    try:
        p = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
    except subprocess.CalledProcessError as err:
        logger.warning('Called: {}'.format(' '.join(cmd)))
        logger.warning('Failed to scan contributors: {}'.format(err))
        return set()
    output = p.stdout.decode('utf-8')
    tokenizer = get_tokenizer('en_US', filters=[])
    return set(word for word, pos in tokenizer(output))
def open(self):
    self.initialized = False
    self.private_dict_file = None

    if enchant is None:
        return
    dict_name = self.config.spelling_dict
    if not dict_name:
        return

    self.ignore_list = [
        w.strip() for w in self.config.spelling_ignore_words.split(",")
    ]
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    self.ignore_list.extend(["param", "pylint"])

    # Expand tilde to allow e.g. spelling-private-dict-file = ~/.pylintdict
    if self.config.spelling_private_dict_file:
        self.config.spelling_private_dict_file = os.path.expanduser(
            self.config.spelling_private_dict_file
        )

    if self.config.spelling_private_dict_file:
        self.spelling_dict = enchant.DictWithPWL(
            dict_name, self.config.spelling_private_dict_file
        )
        self.private_dict_file = open(self.config.spelling_private_dict_file, "a")
    else:
        self.spelling_dict = enchant.Dict(dict_name)

    if self.config.spelling_store_unknown_words:
        self.unknown_words = set()

    self.tokenizer = get_tokenizer(
        dict_name,
        chunkers=[ForwardSlashChunkder],
        filters=[
            EmailFilter,
            URLFilter,
            WikiWordFilter,
            WordsWithDigigtsFilter,
            WordsWithUnderscores,
            CamelCasedWord,
            SphinxDirectives,
        ],
    )
    self.initialized = True
def open(self):
    self.initialized = False
    self.private_dict_file = None

    if enchant is None:
        return
    dict_name = self.config.spelling_dict
    if not dict_name:
        return

    self.ignore_list = [
        w.strip() for w in self.config.spelling_ignore_words.split(",")
    ]
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    self.ignore_list.extend(["param", "pylint"])

    # Expand tilde to allow e.g. spelling-private-dict-file = ~/.pylintdict
    if self.config.spelling_private_dict_file:
        self.config.spelling_private_dict_file = os.path.expanduser(
            self.config.spelling_private_dict_file)

    if self.config.spelling_private_dict_file:
        self.spelling_dict = enchant.DictWithPWL(
            dict_name, self.config.spelling_private_dict_file)
        self.private_dict_file = open(
            self.config.spelling_private_dict_file, "a")
    else:
        self.spelling_dict = enchant.Dict(dict_name)

    if self.config.spelling_store_unknown_words:
        self.unknown_words = set()

    # Prepare regex for stripping punctuation signs from text.
    # ' and _ are treated in a special way.
    puncts = string.punctuation.replace("'", "").replace("_", "")
    self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))

    self.tokenizer = get_tokenizer(dict_name,
                                   filters=[EmailFilter,
                                            URLFilter,
                                            WikiWordFilter,
                                            WordsWithDigigtsFilter,
                                            WordsWithUnderscores])
    self.initialized = True
def generate_post_statistics(
        text,
        spell_checking_locale="en_US",
        hyphenation_dictionary='/usr/share/myspell/dicts/hyph_en_US.dic'):
    """Creates a number of statistics for a forum post including:

    - a list of emails
    - a list of urls
    - number of misspelt words
    - Flesch-Kincaid readability score
    - a list of the spell checked text

    These results are only meaningful for (US) English text.

    Two dictionaries are used, one for spell checking and one for hyphenation.
    The first is provided as a locale (e.g. "en_US") that maps to a dictionary
    installed in enchant, the second as a filepath to the hyphenation
    dictionary that should be used for syllable detection
    (e.g. /usr/share/myspell/dicts/hyph_en_US.dic).
    """
    # spell checking first to get cleaner data for f-k;
    # create dict and tokenizer at this level to save on reallocation
    dic = enchant.Dict(spell_checking_locale)
    tknzr = get_tokenizer(spell_checking_locale,
                          (HTMLChunker,),
                          (EmailFilter, URLFilter))
    sentances = __sentances_from_post(text)
    clean_sentances = []
    misspellings = 0
    for sentance in sentances:
        sentance_stats = __spell_check_sentance(sentance, dic, tknzr)
        corrected_string = sentance_stats["corrected_string"]
        if len(corrected_string) > 0:
            clean_sentances.append(corrected_string)
        misspellings += sentance_stats["misspelt_words"]
    clean_text = ". ".join(clean_sentances)

    # run f-k, from http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    scores = __readability_score_from_post(clean_text, locale=hyphenation_dictionary)
    scores["misspellings"] = misspellings
    scores["correct_post_text"] = clean_text

    # pull out emails and urls
    urls, emails = __urls_and_emails_from_post(text)
    scores["emails"] = " ".join(emails)
    scores["urls"] = " ".join(urls)
    return scores
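# Hypothetical call of generate_post_statistics above; the post text is
# invented, and the private helper functions (__sentances_from_post etc.)
# are assumed to be defined in the original module.
stats = generate_post_statistics(
    "<p>Contact me at someone@example.com. This sentense has a typo.</p>")
print(stats["misspellings"], stats["emails"], stats["urls"])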
def open(self) -> None:
    self.initialized = False
    if enchant is None:
        return
    dict_name = self.linter.config.spelling_dict
    if not dict_name:
        return

    self.ignore_list = [
        w.strip() for w in self.linter.config.spelling_ignore_words.split(",")
    ]
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    self.ignore_list.extend(["param", "pylint"])

    self.ignore_comment_directive_list = [
        w.strip()
        for w in self.linter.config.spelling_ignore_comment_directives.split(",")
    ]

    if self.linter.config.spelling_private_dict_file:
        self.spelling_dict = enchant.DictWithPWL(
            dict_name, self.linter.config.spelling_private_dict_file)
    else:
        self.spelling_dict = enchant.Dict(dict_name)

    if self.linter.config.spelling_store_unknown_words:
        self.unknown_words: set[str] = set()

    self.tokenizer = get_tokenizer(
        dict_name,
        chunkers=[ForwardSlashChunker],
        filters=[
            EmailFilter,
            URLFilter,
            WikiWordFilter,
            WordsWithDigitsFilter,
            WordsWithUnderscores,
            CamelCasedWord,
            SphinxDirectives,
        ],
    )
    self.initialized = True
def check_spelling(text):
    # TODO check if language is not English
    # TODO use dictionary with persons names
    valid_words = []
    invalid_words = []
    unchecked_words = []
    tknzr = get_tokenizer("en_GB", filters=(URLFilter, HashFilter, MentionFilter))
    for (word, pos) in tknzr(text):
        try:
            valid = d.check(word)  # check if word is valid
        except enchant.errors.Error as e:
            unchecked_words.append(word)
            # logger.debug("Unable to check if word is valid: '%s' reason: '%s'" % (word, e))
        else:
            l = valid_words if valid else invalid_words
            l.append(word)
    return {"valid": valid_words,
            "invalid": invalid_words,
            "unchecked": unchecked_words}
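# Hedged example for check_spelling above, assuming the module-level Dict
# `d` exists and that the custom HashFilter/MentionFilter classes drop
# hashtags and @-mentions as their names suggest.
result = check_spelling("Colour is spelt correctli #tags @users http://x.com")
print(result["invalid"])  # -> ['correctli']; the URL, hashtag and mention are filtered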
def test_CombinedFilter(test_text):
    """Test several filters combined"""
    tkns = get_tokenizer("en_US",
                         filters=(URLFilter, WikiWordFilter, EmailFilter))(test_text)
    out = [t for t in tkns]
    exp = [
        ("this", 0), ("text", 5), ("with", 10), ("and", 30), ("not", 103),
        ("quite", 108), ("a", 114), ("url", 116), ("as", 157), ("well", 160),
    ]
    assert out == exp
def check_file(fileName):
    dictionary = enchant.Dict(LANGUAGE)
    tknzr = get_tokenizer(LANGUAGE)
    vocabulary = read_file(fileName)
    line_counter = 1
    error_counter = 0
    printHeader = False
    for line in vocabulary:
        error_line = {"line": line_counter, "question": line['translation'], "words": []}
        line_counter = line_counter + 1
        for question in line['translation']:
            for (word, pos) in tknzr(question):
                if not dictionary.check(word):
                    error_counter = error_counter + 1
                    error_line["words"].append(word)
                    if not printHeader:
                        printHeader = True
                        print(Color.RED + "\n============== ERRORS IN FILE " + fileName + " ==============" + Color.END)
                    print("Line %i: term >%s< wrong or unknown in: %s"
                          % (error_line["line"], word, error_line["question"]))
    if error_counter == 0:
        print(Color.GREEN)
    else:
        print(Color.RED)
    print("--> " + str(error_counter) + " spelling errors found in the vocabulary file!\n" + Color.END, flush=True)
    return error_counter
def __init__(self, document, parent=None):
    super(Spellcheck, self).__init__(parent)
    self.createUI()
    if document is None:
        return
    else:
        self.doc = document.toPlainText()
    # copy the document text and strip out HTML, URLs and email addresses
    tokens = get_tokenizer("en_US", chunkers=(HTMLChunker,),
                           filters=[EmailFilter, URLFilter])
    self.editDoc = []  # tuples go into this list
    for word in tokens(self.doc):
        self.editDoc.append(word)
    self.wordsToCheck = dict((t[0], i) for i, t in enumerate(self.editDoc))
    # >>> Output self.wordsToCheck, unit test with 10 cases
    self.wordlist = enchant.request_dict("en_GB")
    self.misspeltList = []
    for key in self.wordsToCheck.keys():
        self.checkWord(key)
    # >>> Plonk a test here
    self.highlightMisspelt(self.misspeltList[Spellcheck.index:])
def test_URLFilter(test_text):
    """Test filtering of URLs"""
    tkns = get_tokenizer("en_US", filters=(URLFilter,))(test_text)
    out = [t for t in tkns]
    exp = [
        ("this", 0), ("text", 5), ("with", 10), ("and", 30),
        ("SomeLinksLike", 34), ("AndOthers", 93), ("not", 103),
        ("quite", 108), ("a", 114), ("url", 116), ("with", 134),
        ("an", 139), ("aemail", 142), ("address", 149),
        ("as", 157), ("well", 160),
    ]
    assert out == exp
#!/usr/bin/env python
# coding=utf-8
from enchant.tokenize import get_tokenizer

tknzr = get_tokenizer("en_US")

# Deliberately noisy, OCR-garbled French text used as tokenizer input;
# kept verbatim since the garbling is the point of the test.
x = """
qui est simple pour toutes les autres. Lorsqu‘elle le fait remarquer, on lui
rétorque que c‘est le système qui veut ça. .. « La bureaucratie universitaire
est si lourde », dit-elle avec ironie, en ajoutant: « Son poids n’est pas le
même pour tout le monde.»
QUELQUES aerouns
La situation de Joëlle — tous ceux qui ont eu l’occasion d‘évoluer en
organisation le savent — est d'une extrême banalité. Des person— nes qui se
trouvent mises sur la touche, sans avoir jamais démÉriuâ et ayant même, en
tendance, une implication plutôt plus élevée que la moyenne, tout le monde en
a rencontré, La fréquence de ces situations banalise—t-elle leur violence?
Le discours qui consiste à s‘accommoder de ce constat est bien connu:
« C‘est un peu injuste bien sûr, mais la justice, n‘est-elle pas une utopie
romantique? Et puis, si toutes ses collègues ayant moins d’ancienneté ont
réussi à connaître une promotion, c’est quand même un signe. . . Il n’y a pas
de fumée sans feu ! » Le propos pourrait être prolongé car joelle, tout le
monde la connaît: elle est la, dans ce bureau de l‘université ; ici, dans une
caisse d’allocations familiales ; ailleurs, dans un secré-
"""

print [w for w in tknzr(x)]
def test_acronym_unicode():
    text = u'a front-end for DBM-style databases'
    t = get_tokenizer('en_US', [])
    f = filters.AcronymFilter(t)
    words = [w[0] for w in f(text)]
    assert u'DBM' not in words, 'Failed to filter out acronym'
""" import csv import string import enchant import textmining import datetime from termDocumentMatrix import TermDocumentMatrix from enchant.checker import SpellChecker from enchant.tokenize import get_tokenizer, HTMLChunker, EmailFilter, URLFilter, WikiWordFilter TEST = ('test_with_solutions.csv', 'test.csv') TRAIN = ('train.csv', 'train.csv') dictionary = enchant.Dict("en_US") tokenizer = get_tokenizer(tag="en_US", chunkers=[HTMLChunker], filters=[EmailFilter, URLFilter, WikiWordFilter]) def preprocessSpellCheck(): """ Parses the comments from each csv file in DATA_FILES and creates a new correpsonding file with comments that have been delimited into words that have been spell checked. """ for dataFile in DATA_FILES: try: inputFile = open("../data/" + dataFile, 'rb') outputFile = open("../data/spellChecked_" + dataFile, 'wb') fileReader = csv.reader(inputFile, delimiter=',') fileWriter = csv.writer(outputFile, delimiter=',') fileReader.next() #Skip header labels for row in fileReader:
def extract_features(tlc):
    """extract features from the text

    Args:
        tlc (dict[str]): all the attributes of a tlc

    Returns:
        [dict]: a dictionary of features extracted
    """
    text = clean_text(tlc['body'])
    fields = dict()

    # add features here
    # fields['Top_comment_word_count'] = len(text.split(' '))
    fields['Top_comment_text'] = text

    # Extract time-based features
    def get_day_of_week(text):
        return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S').weekday() + 1

    def get_day_of_month(text):
        return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S').day

    def get_time_of_day(text):
        return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S').hour

    time_local = time.localtime(tlc['created_utc'])
    time_local = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
    fields['Top_comment_day'] = get_day_of_month(time_local)
    fields['Top_comment_day_of_week'] = get_day_of_week(time_local)
    fields['Top_comment_hour'] = get_time_of_day(time_local)

    # Extract gender value
    gp = GenderPerformr()
    probs, _ = gp.predict(tlc['author'])
    # Rescale it from [0,1] to [-1,1]
    fields['Top_comment_author_gender_value'] = 2 * probs - 1

    # Extract percentage of misspellings
    check = SpellChecker("en_US")
    tokenizer = get_tokenizer("en_US")

    # Prevent division by zero in the denominator
    def weird_division(n, d):
        return n / d if d else 0

    def get_mispellings_percentage(text):
        mispelling_count = 0
        total_count = 0
        if text == 'nan':
            return total_count
        else:
            check.set_text(text)
            for err in check:
                mispelling_count = mispelling_count + 1
            for w in tokenizer(text):
                total_count = total_count + 1
            value = weird_division(mispelling_count, total_count)
            return value

    fields['Top_comment_mispellings'] = get_mispellings_percentage(text)

    # Get politeness, agreement, support scores, and rescale them from [1,5] to [-1,1]
    ar = Agreementr()
    pr = Politenessr()
    sr = Supportr()
    fields['Top_comment_agreement_value'] = 0.5 * float(ar.predict([text])) - 1.5
    fields['Top_comment_politeness_value'] = 0.5 * float(pr.predict([text])) - 1.5
    fields['Top_comment_support_value'] = 0.5 * float(sr.predict([text])) - 1.5

    # Get toxicity scores
    KEY = "yourkey.txt"  # os.getenv("GOOGLE_API_KEY")
    service = discovery.build('commentanalyzer', 'v1alpha1', developerKey=KEY)

    def get_results(request_id, response, exception):
        toxicity_scores.append((request_id, response))

    toxicity_scores = []
    count = 0
    batch = service.new_batch_http_request(callback=get_results)
    analyze_request = {
        'comment': {'text': text},
        "requestedAttributes": {
            "TOXICITY": {},
            "SEVERE_TOXICITY": {},
            "ATTACK_ON_COMMENTER": {}
        }
    }
    batch.add(service.comments().analyze(body=analyze_request), request_id=str(count))
    batch.execute()
    toxic_score = toxicity_scores[0][1]['attributeScores']['TOXICITY']['summaryScore']['value']
    attack_score = toxicity_scores[0][1]['attributeScores']['ATTACK_ON_COMMENTER']['summaryScore']['value']
    if toxic_score > 0.5:
        fields['Top_comment_untuned_toxicity'] = 1
    else:
        fields['Top_comment_untuned_toxicity'] = 0
    if toxic_score > 0.8 and attack_score > 0.5:
        fields['Top_comment_tuned_toxicity'] = 1
    else:
        fields['Top_comment_tuned_toxicity'] = 0
    # end of feature extractions
    return fields
print "word is error" print "word may be: ", print us_dict.suggest(word) print "#### test combine dictionary ####" # combine dictionary, add words in file to dictionary combine_dict = enchant.DictWithPWL("en_US", "my_words.text") if combine_dict.check(test_words[1]): print "combine dictionary has the word: %s" %(test_words[1]) else: print "combine dictionary doesn't have word: %s" %(test_words[1]) print "#### test SpellChecker ####" test_text = "it's okay, tomorow is a god choise" chkr = SpellChecker("en_US") chkr.set_text(test_text) # return error words_list # god is an error, but it's a spell error for err in chkr: print "[ERROR]: %s " %(err.word) print "#### test tokenizer ####" test_tokenizer_text = "It rains dog and cat. What? Dog and cat?" tknzr = get_tokenizer("en_US") tknzr_rlt = tknzr(test_tokenizer_text) # return is a tuple, first is word, second is position for w in tknzr_rlt: print w
def __init__(self, lang, suggest, word_list_filename, filters=[]):
    self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
    # pass filters by keyword: in current pyenchant the second positional
    # argument of get_tokenizer is chunkers, not filters
    self.tokenizer = get_tokenizer(lang, filters=filters)
    self.original_tokenizer = self.tokenizer
    self.suggest = suggest
def check_collection(inpath, outpath, lang, wordFiles=[]):
    """
    Checks the orthography of the text in a collection.
    The expected input are plain text files.

    Arguments:
    inpath (string): path to the input files, including file name pattern
    outpath (string): path to the output file, including the output file's name
    lang (string): which dictionary to use, e.g. "es", "fr", "de"
    wordFiles (list): optional; list of strings; paths to files with lists of
        words which will not be treated as errors (e.g. named entities)
    """
    try:
        enchant.dict_exists(lang)
        try:
            tknzr = get_tokenizer(lang)
        except enchant.errors.TokenizerNotFoundError:
            tknzr = get_tokenizer()
        chk = checker.SpellChecker(lang, tokenize=tknzr)
    except enchant.errors.DictNotFoundError:
        print("ERROR: The dictionary " + lang + " doesn't exist. Please choose another dictionary.")
        sys.exit(0)

    all_words = []
    all_num = []
    all_idnos = []

    print("...checking...")
    for file in glob.glob(inpath):
        idno = os.path.basename(file)[-10:-4]
        all_idnos.append(idno)
        err_words = []
        with open(file, "r", encoding="UTF-8") as fin:
            intext = fin.read().lower()
            chk.set_text(intext)
            if len(wordFiles) != 0:
                allCorrects = ""
                for file in wordFiles:
                    with open(file, "r", encoding="UTF-8") as f:
                        corrects = f.read().lower()
                        allCorrects = allCorrects + corrects
            for err in chk:
                if not wordFiles or err.word not in allCorrects:
                    err_words.append(err.word)
            all_words.append(err_words)
            err_num = collections.Counter(err_words)
            all_num.append(err_num)
            print("..." + str(len(err_num)) + " different errors found in " + idno)

    df = pd.DataFrame(all_num, index=all_idnos).T
    df = df.fillna(0)
    df = df.astype(int)
    df["sum"] = df.sum(axis=1)
    # df = df.sort("sum", ascending=False)  # pandas < 0.20
    df = df.sort_values(by="sum", ascending=False)
    df.to_csv(outpath)
    print("done")
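# Hypothetical invocation of check_collection above; the paths and the
# named-entities word list are invented for illustration.
check_collection("texts/*.txt", "spellcheck-report.csv", "fr",
                 wordFiles=["named-entities.txt"])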
from enchant.tokenize import get_tokenizer, HTMLChunker
from enchant.checker import SpellChecker
import codecs

# By default PyEnchant supports:
# en_GB: British English
# en_US: American English
# de_DE: German
# fr_FR: French
chkr = SpellChecker("en_GB")
# HTMLChunker handles XML just as well (same syntax)
tknzr = get_tokenizer("en_GB", chunkers=(HTMLChunker,))

file = codecs.open("SpellX_test.txt", 'r', encoding='latin-1')
resu = open("test-result.txt", "w")
for f in file.readlines():
    a = [w for w in tknzr(f)]
    chkr.set_text(f)
    for err in chkr:
        resu.writelines(err.word + "\n")
        print("ERROR:", err.word)
resu.close()
import interface
import urllib2
import json
import enchant
from enchant.tokenize import get_tokenizer, EmailFilter, URLFilter
from enchant.checker import SpellChecker
from stringsafety import *

d = enchant.Dict("en_UK")
tkn = get_tokenizer("en_UK", filters=[URLFilter, EmailFilter])


def SetYahooID(str):
    global id
    id = str

id = ''


def Spell(word):
    if d.check(word):
        return word
    else:
        return d.suggest(word)

# Dead code from an earlier Yahoo-API-based implementation (truncated):
'''
    url = "http://search.yahooapis.com/WebSearchService/V1/spellingSuggestion?appid={0}&output=json&query={1}".format(id, escapeurl(word))
    request = urllib2.Request(url, None, {'Referer': 'http://spacerat.meteornet.net'})
    response = {}
    data = None
    try:
        response = urllib2.urlopen(request)
'''
blog = pyblog.WordPress('http://prideout.net/blog/xmlrpc.php', 'admin', passwd)

# If this can't find the post, it'll throw an exception with a good error message.
# Since it goes uncaught, it aborts the program.  Which is fine.
post = blog.get_post(postid)
#print "Found post %d with the following keys:" % postid
#print '\n'.join(post.keys())

contents = open(filename, 'r').read()
#contents = filter(lambda c: c not in "\r", contents)
print "Slurped up '%s'" % filename

if spellCheck:
    tokenizer = get_tokenizer("en_US", chunkers=(HTMLChunker,))
    words = tokenizer(contents)
    dictionary = enchant.Dict("en_US")
    misspelled = set()
    for word in words:
        if not dictionary.check(word[0]):
            misspelled.add(word[0])
    print colorama.Fore.CYAN + colorama.Back.BLACK
    for line in formatColumns(list(misspelled), 3):
        print line
    print colorama.Fore.RESET + colorama.Back.RESET

post['description'] = contents
publish = False
blog.edit_post(postid, post, publish)
from __future__ import unicode_literals

from random import randint

from django.db import models
from django.contrib.auth.models import User

from enchant import Dict
from enchant.tokenize import get_tokenizer

DICTIONARY = Dict('en_US')
TOKENIZER = get_tokenizer('en_US')


def default_randomness():
    return randint(0, 10000)


class MotionFile(models.Model):
    MARKER_SET_KIT = 0  # do not change values, since they are stored in the DB!
    MARKER_SET_CMU = 1

    class Meta:
        unique_together = ('motion_db_id', 'motion_db_file_id')

    motion_db_id = models.PositiveIntegerField()
    motion_db_file_id = models.PositiveIntegerField()
    filename = models.CharField(max_length=255, unique=True)
    mean_perplexity = models.FloatField(default=0.)
    is_broken_confirmed = models.BooleanField(default=False)
    is_broken_reported = models.BooleanField(default=False)