def test_DefaultLang(en_us_dict):
    """Test behaviour of default language selection."""
    defLang = get_default_language()
    if defLang is None:
        # If no default language, shouldn't work
        with pytest.raises(Error):
            Dict()
    else:
        # If there is a default language, should use it
        # Of course, no need for the dict to actually exist
        try:
            d = Dict()
            assert d.tag == defLang
        except DictNotFoundError:
            pass
def __init__(self, model_path=None, tag=None, broker=None):
    """XDict object constructor.

    XDict requires the pretrained model
    "GoogleNews-vectors-negative300.bin" in order to give smart
    suggestions for misspelled words. It is recommended to pass the
    model path when creating the XDict object. Otherwise XDict will
    try to download and search for the model at the following
    locations:

        1) /home/$USER/.enchantx/GoogleNews-vectors-negative300.bin
        2) the current working directory

    A dictionary belongs to a specific language, identified by the
    string <tag>. If the tag is not given or is None, an attempt to
    determine the language currently in use is made using the 'locale'
    module. If the current language cannot be determined, Error is
    raised.

    If <tag> is instead given the value of False, a 'dead' Dict object
    is created without any reference to a language. This is typically
    only useful within PyEnchant itself. Any other non-string value
    for <tag> raises Error.

    Each dictionary must also have an associated Broker object which
    obtains the dictionary information from the underlying system.
    This may be specified using <broker>. If not given, the default
    broker is used.
    """
    self._home_dir = os.path.expanduser("~/.enchantx")
    self.enchant_obj = Dict(tag=tag, broker=broker)
    self.enchantX = WORD2VEC(model_path)
    if model_path is None:
        self._create_home_dir_and_download_glove()
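# A minimal usage sketch for the constructor above (not from the original
# source): the model path below is a hypothetical example location, and only
# attributes set in __init__ are used.
xd = XDict(model_path="/data/GoogleNews-vectors-negative300.bin", tag="en_US")
# The wrapped enchant dictionary created in __init__ is available directly:
print(xd.enchant_obj.check("hello"))  # True for a correctly spelled word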
def babykangarooify(path, arg):
    head, tail = ntpath.split(path)
    d = Dict("en_US")
    doc = docx.Document(path)
    new_doc = docx.Document()
    for paragraph in doc.paragraphs:
        new_paragraph = []
        for word in paragraph.text.split():
            if 'joey' in word.lower():
                new_paragraph.append('baby kangaroo')
                continue
            syns = [l.name() for syn in wordnet.synsets(word)
                    for l in syn.lemmas() if d.check(l.name())]
            if arg.corporate and syns:
                synergize_words = open('buzzwords/corporate.txt').read().splitlines()
                possible = []
                for syn_word in synergize_words:
                    synergy_syns = [l.name() for syn in wordnet.synsets(syn_word)
                                    for l in syn.lemmas() if d.check(l.name())]
                    synergy_exists = [i for i in synergy_syns if i in syns]
                    if synergy_exists:
                        possible.append(syn_word)
                new_word = max(possible, key=lambda s: (len(s), s)) if possible else word
                new_paragraph.append(new_word)
            elif syns:
                new_word = max(syns, key=lambda s: (len(s), s))
                new_paragraph.append(new_word)
            else:
                new_paragraph.append(word)
        new_cap_paragraph = capitalize_sentences(new_paragraph)
        new_doc.add_paragraph(new_cap_paragraph)
    new_doc.save(head + '/bk_' + tail)
def unscrambler(*value):
    value = int(input("Enter a value: "))
    alp = {x + 1: y for x, y in enumerate(ascii_lowercase)}
    splitter = [int(x) for x in str(value)]
    pot_v = [alp[i] for i in splitter]
    for i in range(len(splitter) - 1):
        fin_val = int(str(splitter[i]) + str(splitter[i + 1]))
        if fin_val <= 26:
            pot_v.append(alp[fin_val])
    pot_v.sort()
    print(pot_v)
    str_pot_val = []
    d = Dict("en-US")
    f = open('unscrambled.txt', 'w')
    for i in range(len(pot_v)):
        for combination in set(map("".join, itertools.permutations(pot_v, i))):
            if len(combination) > 2 and d.check(combination):
                f.write(combination + '\n')
                print(combination)
    f.close()
def test_unicode_tag(broker):
    """Test that unicode language tags are accepted"""
    d1 = broker._request_dict_data("en_US")
    assert d1
    broker._free_dict_data(d1)
    d1 = Dict("en_US")
    assert d1
def test_pickling(en_us_dict):
    """Test that pickling doesn't corrupt internal state."""
    d1 = Dict("en")
    assert d1.check("hello")
    d2 = pickle.loads(pickle.dumps(d1))
    assert d1.check("hello")
    assert d2.check("hello")
    d1._free()
    assert d2.check("hello")
def random_word():
    length = randrange(2, 9)
    seed = ''.join(choice(string.ascii_lowercase) for x in range(length))
    dic = Dict('en_US')
    words = dic.suggest(seed)
    if len(words) == 0:
        return seed
    else:
        return choice(words)
def spellcheck_command(self, *args):
    dic = Dict("en_US")
    data = word_tokenize(self.textPad.get('1.0', 'end-1c'))
    for word in data:
        if not dic.check(word) and word.isalpha():
            suggestions_list = dic.suggest(word)
            suggestions_str = ""
            for w in suggestions_list:
                suggestions_str += w + " "
            showinfo("Suggestions for '" + word + "'\n", suggestions_str)
    showinfo("Spell Check", "Finished checking!")
def _getDict(self, lang, path):
    key = (lang, path)
    if key not in self._dictCache:
        broker = Broker()
        broker.set_param('enchant.myspell.dictionary.path', path)
        currentDict = Dict(lang, broker)
        self._dictCache[key] = currentDict
    else:
        currentDict = self._dictCache[key]
    return currentDict
def text2words(text, lang='en_US', min_length=3):
    dict_en_US = Dict(lang)
    tknzr = get_tokenizer(lang)
    # Processed text: punctuation removal (except '-')
    p_text = regex.sub('', text)
    tokens = [token for token, _ in tknzr(p_text)]
    words = filter(lambda token: len(token) >= min_length, tokens)
    words = filter(dict_en_US.check, words)
    return words
def spell_check(input_question):
    pattern = r"\W"
    prog = compile(pattern)
    input_question_word_list = input_question.split()
    en_dict = Dict("en_US")
    for word_index in range(len(input_question_word_list)):
        word = input_question_word_list[word_index]
        if not en_dict.check(word) and prog.match(word) is None:
            correct_word = spell(word)
            input_question_word_list[word_index] = correct_word
    return " ".join(input_question_word_list)
def get_all_sub_words(word: str, dictionary: enchant.Dict, min_length: int) -> Set[str]:
    """
    Get all words included in a larger word, including that larger word.

    Arguments:
        word (str): The word of which we extract all sub words.
        dictionary (enchant.Dict): The dictionary instance which checks if
            something is a word.
        min_length (int): The minimum length of a sub word before it is
            taken into account.
    """
    all_sub_words = {word}
    for i in range(len(word)):
        for j in range(i + min_length, len(word) + 1):
            sub_word = word[i:j]
            if dictionary.check(sub_word):
                all_sub_words.add(sub_word)
    return all_sub_words
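# A quick, illustrative call to get_all_sub_words (example word chosen
# arbitrarily); assumes an "en_US" enchant dictionary is installed.
import enchant

d = enchant.Dict("en_US")
print(get_all_sub_words("breakfast", d, min_length=3))
# Typically includes "breakfast", "break" and "fast", plus any other
# substrings of length >= 3 that the dictionary accepts.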
def set_spelling_options(self, spelling, dicts, pwl_files):
    """Set spelling options."""
    self.spelling = spelling
    self.dicts = dicts
    self.pwl = get_concatenated_files(pwl_files)

    # build extra checkers with dicts
    self.extra_checkers = []
    if dicts:
        if not ENCHANT_FOUND:
            raise ImportError('Enchant module not found (please install '
                              '"pyenchant")')
        for lang in dicts.split(','):
            try:
                _dict = Dict(lang)
                self.extra_checkers.append(SpellChecker(_dict))
            except DictNotFoundError:
                print('WARNING: enchant dictionary not found for '
                      'language "{0}"'.format(lang))
def initialise(self, sitecheck):
    super(Spelling, self).initialise(sitecheck)

    # Spell checker must be re-created when check is resumed
    global _enchant_available
    if _enchant_available:
        ddp = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dict.txt')
        cdp = os.path.join(self.sitecheck.session.root_path, 'dict.txt')

        if os.path.exists(cdp):
            self.dictionary = cdp
            d = DictWithPWL(self.language, cdp)
        elif os.path.exists(ddp):
            self.dictionary = ddp
            d = DictWithPWL(self.language, ddp)
        else:
            d = Dict(self.language)

        self.spell_checker = SpellChecker(d, filters=[EmailFilter, URLFilter])
def spell_corrections(bow_dict):
    speller = Dict('en_US')
    # Collect the vocabulary and the words the dictionary flags as misspelled.
    vocab = {w for words in bow_dict.values() for w in words}
    incorrect = {w for w in vocab if not speller.check(w)}
    corrected = dict()

    print('Corrections Started')
    for w in incorrect:
        corrections = speller.suggest(w)
        if len(corrections) > 0:
            corrected[w] = corrections[0]

    with connect('../Dumps/db.db') as con:
        cur = con.cursor()
        cur.execute('DROP TABLE IF EXISTS spell_corrections')
        cur.execute('''CREATE TABLE IF NOT EXISTS spell_corrections(
                       original TEXT NOT NULL,
                       corrected TEXT NOT NULL);''')
        cur.executemany(
            '''INSERT INTO spell_corrections(original, corrected)
               VALUES(?, ?)''',
            [(w, corrected[w]) for w in corrected.keys()])
def __init__(self, path, wl_dir, chunkers, filters):
    self.popath = path
    self.po = polib.pofile(path)
    self.lang = self.po.metadata["Language"]
    available_lang = Broker().list_languages()
    if self.lang not in available_lang:
        baselang = self.lang.split("_")[0]
        if baselang in available_lang:
            self.lang = baselang
        else:
            print("Dictionary for language '%s' could not be found." % self.lang)
            raise errors.DictNotFoundError
    wordlist = Check.get_wordlist(self.lang, wl_dir, path)
    try:
        check_dict = DictWithPWL(self.lang, pwl=wordlist)
    except errors.Error as e:
        check_dict = Dict(self.lang)
        print(e)
    self.checker = SpellChecker(check_dict, chunkers=chunkers, filters=filters)
def main():
    letters = 'rilsedxcu'
    dictionary = Dict('en-US')
    all_length_combinations = [
        combinations(letters, i + 1) for i in range(4, len(letters))
    ]
    all_words_list = []
    for single_length_combination in all_length_combinations:
        for combo in single_length_combination:
            all_words_list.extend([''.join(p) for p in permutations(combo)])
    pool = Pool(processes=4)
    in_dictionary = pool.map(dictionary.check, all_words_list)
    pool.close()
    pool.join()
    valid_word_list = [
        word for word, result in zip(all_words_list, in_dictionary) if result
    ]
    print(valid_word_list)
def __init__(self):
    self.DICT = Dict("en_US")
    self.suggested_advices = set()
    self.suggested_words = set()
def __init__(self):
    # type: () -> None
    """Initialize the dictionaries."""
    self._spell = Spell('en_US')
    self._dictionary = PyDictionary('html.parser')
    _log.debug('Initialized %s instance correctly', type(self).__name__)
class English:
    """English dictionary.

    Attributes:
        TypeMeanings: Type of the returned meanings from `meanings()`.
        TypeDefinition: Type of the returned definition from `define()`.
    """

    # https://mypy.readthedocs.io/en/latest/cheat_sheet.html
    TypeMeanings = Dict[str, List[str]]
    TypeDefinition = Dict[str, Union[List[str], TypeMeanings]]

    def __init__(self):
        # type: () -> None
        """Initialize the dictionaries."""
        self._spell = Spell('en_US')
        self._dictionary = PyDictionary('html.parser')
        _log.debug('Initialized %s instance correctly', type(self).__name__)

    def check(self, word):
        # type: (str) -> bool
        """Check if a word is in the English dictionary.

        Args:
            word: The word to check.

        Returns:
            True if it is and False otherwise.
        """
        out = self._spell.check(word)  # type: bool
        return out

    def suggest(self, misspelled_word):
        # type: (str) -> List[str]
        """Suggest corrections for a misspelled word.

        Args:
            misspelled_word: The word to use.

        Returns:
            A list of suggestions.
        """
        out = self._spell.suggest(misspelled_word)  # type: List[str]
        return out

    def meanings(self, word):
        # type: (str) -> English.TypeMeanings
        """Get the meanings of a word if they exist.

        Args:
            word: The word to use.

        Returns:
            A dict of meanings.
        """
        with CaptureStdStreams():
            out = self._dictionary.meaning(
                word)  # type: Optional[English.TypeMeanings]
        if out is None:
            _log.debug('Could not find any meaning to %s', word)
            return {}
        return out

    def synonyms(self, word):
        # type: (str) -> List[str]
        """Get the synonyms of a word if they exist.

        Args:
            word: The word to use.

        Returns:
            A list of synonyms.
        """
        with CaptureStdStreams():
            out = self._dictionary.synonym(word)  # type: Optional[List[str]]
        if out is None:
            _log.debug('Could not find any synonym to %s', word)
            return []
        return out

    def antonyms(self, word):
        # type: (str) -> List[str]
        """Get the antonyms of a word if they exist.

        Args:
            word: The word to use.

        Returns:
            A list of antonyms.
        """
        with CaptureStdStreams():
            out = self._dictionary.antonym(word)  # type: Optional[List[str]]
        if out is None:
            _log.debug('Could not find any antonym to %s', word)
            return []
        return out

    def define(self, word):
        # type: (str) -> English.TypeDefinition
        """Define a word and find its synonyms and antonyms.

        Args:
            word: The word to define.

        Returns:
            A dict of meanings, synonyms and antonyms.
        """
        out = {
            'Meanings': self.meanings(word),
            'Synonyms': self.synonyms(word),
            'Antonyms': self.antonyms(word),
        }  # type: English.TypeDefinition
        # We have to put the above type comment because mypy cannot
        # infer the type correctly. Instead, it infers
        # `Dict[str, Collection[str]]`. However, we can do
        # `return {...}` and it would infer it correctly.
        return out
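# A brief, hypothetical usage sketch of the English class above; it assumes
# the module's dependencies (Spell, PyDictionary, CaptureStdStreams, _log)
# are importable as in the original file, and the example words are arbitrary.
eng = English()
print(eng.check("recieve"))        # False: misspelled
print(eng.suggest("recieve")[:3])  # a few suggestions, e.g. starting with 'receive'
print(eng.define("receive"))       # {'Meanings': {...}, 'Synonyms': [...], 'Antonyms': [...]}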
def __init__(self):
    self.stopWords = stopwords.words('english')
    self.stemmer = SnowballStemmer('english')
    self.spellcheck = Dict()
from __future__ import unicode_literals

from random import randint

from django.db import models
from django.contrib.auth.models import User
from enchant import Dict
from enchant.tokenize import get_tokenizer

DICTIONARY = Dict('en_US')
TOKENIZER = get_tokenizer('en_US')


def default_randomness():
    return randint(0, 10000)


class MotionFile(models.Model):
    MARKER_SET_KIT = 0  # do not change values, since they are stored in the DB!
    MARKER_SET_CMU = 1

    class Meta:
        unique_together = ('motion_db_id', 'motion_db_file_id')

    motion_db_id = models.PositiveIntegerField()
    motion_db_file_id = models.PositiveIntegerField()
    filename = models.CharField(max_length=255, unique=True)
    mean_perplexity = models.FloatField(default=0.)
    is_broken_confirmed = models.BooleanField(default=False)
    is_broken_reported = models.BooleanField(default=False)
    marker_set = models.PositiveIntegerField(default=MARKER_SET_KIT)
import os
from re import match, sub

from nltk import word_tokenize
from enchant import Dict
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))
for word in ['rt', 'co', 'amp']:
    stopwords.add(word)

word_dict = Dict("en_US")
stemmer = PorterStemmer()

TOKEN_DIR = './tokenized_corpus/'
RAW_TWEET_DIR = './tweet_corpus/'

if __name__ == '__main__':
    if not os.path.exists(TOKEN_DIR):
        os.mkdir(TOKEN_DIR)
    for ticker in os.listdir(RAW_TWEET_DIR):
        tickerpath = RAW_TWEET_DIR + ticker + '/'
        ticker_token_file = TOKEN_DIR + ticker + '.dat'
        with open(ticker_token_file, 'w') as token_file:
            for filename in os.listdir(tickerpath):
                tweet_id = match(r"(.*)\.dat", filename).group(1)
                content = open(tickerpath + filename).read()
                content = content.lower()
                content = sub(r'\W+', ' ', content)
                tokens = word_tokenize(content)
                tokens = [
                    stemmer.stem(x) for x in tokens
                    if x not in stopwords and len(x) > 1 and word_dict.check(x)
                ]
def process_tokens(words, normalize_plurals=True):
    """Normalize cases and remove plurals.

    Each word is represented by the most common case.
    If a word appears with an "s" on the end and without an "s" on the end,
    the version with "s" is assumed to be a plural and merged with the
    version without "s" (except if the word ends with "ss").

    Parameters
    ----------
    words : iterable of strings
        Words to count.

    normalize_plurals : bool, default=True
        Whether to try and detect plurals and remove trailing "s".

    Returns
    -------
    counts : dict from string to int
        Counts for each unique word, with cases represented by the most
        common case, and plurals removed.

    standard_forms : dict from string to string
        For each lower-case word the standard capitalization.
    """
    # words can be either a list of unigrams or bigrams
    # d is a dict of dicts.
    # Keys of d are word.lower(). Values are dicts
    # counting frequency of each capitalization
    eng_d = Dict("en_US")
    d = defaultdict(dict)
    for word in words:
        word_lower = word.lower()
        # get dict of cases for word_lower
        case_dict = d[word_lower]
        # increase this case
        case_dict[word] = case_dict.get(word, 0) + 1

    if normalize_plurals:
        # merge plurals into the singular count (simple cases only)
        merged_plurals = {}
        for key in list(d.keys()):
            if key.endswith('s') and not key.endswith("ss"):
                key_singular = key[:-1]
                if eng_d.check(key_singular):
                    if key_singular in d:
                        dict_plural = d[key]
                        dict_singular = d[key_singular]
                        for word, count in dict_plural.items():
                            singular = word[:-1]
                            dict_singular[singular] = (
                                dict_singular.get(singular, 0) + count)
                        merged_plurals[key] = key_singular
                        del d[key]

    fused_cases = {}
    standard_cases = {}
    item1 = itemgetter(1)
    for word_lower, case_dict in d.items():
        # Get the most popular case.
        first = max(case_dict.items(), key=item1)[0]
        fused_cases[first] = sum(case_dict.values())
        standard_cases[word_lower] = first

    if normalize_plurals:
        # add plurals to fused cases:
        for plural, singular in merged_plurals.items():
            standard_cases[plural] = standard_cases[singular.lower()]

    return fused_cases, standard_cases
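# A small, hypothetical illustration of process_tokens (input words chosen
# arbitrarily); it assumes an "en_US" enchant dictionary is installed so the
# plural check succeeds. "apples" is merged into "apple", and the most common
# capitalization wins.
counts, standard = process_tokens(["Apple", "apple", "apples"])
print(counts)    # {'apple': 3}
print(standard)  # {'apple': 'apple', 'apples': 'apple'}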
def en_us_dict():
    res = Dict("en_US")
    yield res
    del res
# -*- coding: utf-8 -*-

# imports
import re
from enchant import Dict, tokenize

# spellchecking
dictionary = Dict('en_US')
tokenizer = tokenize.get_tokenizer('en_US')


def is_all_titlecased(string):
    '''checks if each word in a string is titlecased'''
    # enchant's tokenizer breaks a string into tokens of its constituent
    # words, organized as tuples of the form (word, start_index)
    # we check only the first character because all-caps strings are not
    # considered title-cased
    return all(token[0][0].istitle() for token in list(tokenizer(string)))


def filter_periods(word):
    '''filter out periods in acronyms/all-caps words'''
    return re.sub(r'\.', '', word)


def is_all_caps(word):
    '''checks if a word is all-caps'''
    # filter out '.' in case word is an acronym of the form 'U.S.A.' rather than 'USA'
    return all(char.istitle() for char in list(filter_periods(word)))


def try_to_fix_case(word):
    '''return top suggestion if it differs from word only in case,
    otherwise return original word
    '''
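# Illustrative calls to the helpers above (example strings are arbitrary,
# not from the original source):
print(is_all_titlecased("New York Times"))  # True
print(is_all_titlecased("New york times"))  # False
print(is_all_caps("U.S.A."))                # True
print(is_all_caps("Usa"))                   # False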
        cmp = lst[mid][:-1]
        freq = lst[mid][-1]
        if ele == cmp:
            return freq
        elif compare(ele, cmp):
            l = mid + 1
        else:
            u = mid - 1
    else:
        return 0


from enchant import Dict

d = Dict("en_GB")

start = time.time()
en_dirname = "/Users/keertankrishnan/Documents/Project Work/CDSAML/Old Laptop Files/CDSAML/news/"
hi_dirname = "/Users/keertankrishnan/Documents/Project Work/CDSAML/Old Laptop Files/CDSAML/news/"

hindi = []
english = []
hin = open(hi_dirname + "s8_" + "hindi(clean_no_handle_withfreq)_fast.bin", "rb")
eng = open(en_dirname + "s8_" + "eng(withfreq)_fast.bin", "rb")

# training data in the form of n-grams for english
for k in range(4, 0, -1):
    # the first time, 4-grams are loaded, the second time, 3-grams and so on
    hindi.append(p.load(hin))
    english.append(p.load(eng))
from string import ascii_uppercase
from enchant import Dict

Letters = list(ascii_uppercase)
cipher = "jslnsjjw"  # we know the right English word which matches it
d = Dict("en_US")

for K in range(1, 26):
    PT = ""
    for ch in cipher:
        PT += Letters[(Letters.index(ch.upper()) - K + 26) % 26]
    if d.check(PT):
        print(f"for key {K} the plain text is {PT}\n")
from enchant import Dict
from enchant.checker import SpellChecker
from enchant.tokenize import EmailFilter, URLFilter

from cloudbot import hook

locale = "en_US"
en_dict = Dict(locale)


@hook.command()
def spell(text):
    """<word/sentence> - Check spelling of a word or sentence."""
    if len(text.split(" ")) > 1:
        # input is a sentence
        checker = SpellChecker(en_dict, filters=[EmailFilter, URLFilter])
        checker.set_text(text)

        is_correct = True
        offset = 0
        for err in checker:
            is_correct = False
            # find the location of the incorrect word
            start = err.wordpos + offset
            finish = start + len(err.word)
            # get some suggestions for it
            suggestions = err.suggest()
            s_string = '/'.join(suggestions[:3])
            s_string = "[h1]{}[/h1]".format(s_string)
            # calculate the offset for the next word
            offset = (offset + len(s_string)) - len(err.word)
from difflib import SequenceMatcher
from statistics import median

import cv2
import numpy as np
import pytesseract
from enchant import Dict
from enchant.checker import SpellChecker
from thumbframes_dl import YouTubeFrames

# Never Tell Me the Odds - Star Noirs One-off opening crawl | Saving Throw | CC BY 3.0
VIDEO_URL = 'https://www.youtube.com/watch?v=kEVOHhFg_s4'
LANG = ('en', 'eng')

dictionary = Dict(LANG[0])
spellchecker = SpellChecker(LANG[0])


# pytesseract.image_to_string returns a nice string, but no confidence level,
# pytesseract.image_to_data dumps this whole mess, so you have to parse it
def parse_pytesseract_output(data):
    # this list is a mix of ints as numbers and ints as strings
    data['conf'] = [int(num) for num in data['conf']]

    # only return line if confidence is high enough
    def _get_line_if_confident(start_index, end_index=None):
        if len(data['conf'][start_index:end_index]) == 0:
            return
        if median(data['conf'][start_index:end_index]) >= 70: