Example #1
def test_DefaultLang(en_us_dict):
    """Test behaviour of default language selection."""
    defLang = get_default_language()
    if defLang is None:
        # If no default language, shouldn't work
        with pytest.raises(Error):
            Dict()
    else:
        # If there is a default language, should use it
        # Of course, no need for the dict to actually exist
        try:
            d = Dict()
            assert d.tag == defLang
        except DictNotFoundError:
            pass
Example #2
    def __init__(self, model_path=None, tag=None, broker=None):
        """XDict object constructor.

        XDict requires pretrained model "GoogleNews-vectors-negative300.bin"
        in order to give smart suggestions for misspelled words.

        It is recommended to provide the model path when creating an XDict
        object.  Otherwise, XDict will try to download the model or search
        for it in the following locations:
        1) /home/$USER/.enchantx/GoogleNews-vectors-negative300.bin
        2) In current working directory

        A dictionary belongs to a specific language, identified by the
        string <tag>.  If the tag is not given or is None, an attempt to
        determine the language currently in use is made using the 'locale'
        module.  If the current language cannot be determined, Error is raised.

        If <tag> is instead given the value of False, a 'dead' Dict object
        is created without any reference to a language.  This is typically
        only useful within PyEnchant itself.  Any other non-string value
        for <tag> raises Error.

        Each dictionary must also have an associated Broker object which
        obtains the dictionary information from the underlying system. This
        may be specified using <broker>.  If not given, the default broker
        is used.
        """
        self._home_dir = os.path.expanduser("~/.enchantx")
        self.enchant_obj = Dict(tag=tag, broker=broker)
        if model_path is None:
            # Fetch the model first so that WORD2VEC can find it in
            # ~/.enchantx or the current working directory.
            self._create_home_dir_and_download_glove()
        self.enchantX = WORD2VEC(model_path)
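
A minimal usage sketch (hedged: it relies only on the constructor documented above and on the enchant Dict stored in enchant_obj; the word checked is illustrative):

xd = XDict(tag="en_US")  # no model_path given: the download/search fallback applies
print(xd.enchant_obj.check("speling"))  # plain enchant spell check -> False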
Example #3
def babykangarooify(path, arg):
    head, tail = ntpath.split(path)
    d = Dict("en_US")
    doc = docx.Document(path)
    new_doc = docx.Document()
    # Read the buzzword list once up front rather than re-opening the file
    # (without closing it) for every word in the document.
    with open('buzzwords/corporate.txt') as f:
        synergize_words = f.read().splitlines()

    for paragraph in doc.paragraphs:
        new_paragraph = []
        for word in paragraph.text.split():
            if 'joey' in word.lower():
                new_paragraph.append('baby kangaroo')
                continue
            syns = [l.name() for syn in wordnet.synsets(word) for l in syn.lemmas() if d.check(l.name())]
            if arg.corporate and syns:
                possible = []
                for syn_word in synergize_words:
                    synergy_syns = [l.name() for syn in wordnet.synsets(syn_word) for l in syn.lemmas() if d.check(l.name())]
                    synergy_exists = [i for i in synergy_syns if i in syns]
                    if synergy_exists:
                        possible.append(syn_word)
                new_word = max(possible, key=lambda s: (len(s), s)) if possible else word
                new_paragraph.append(new_word)
            elif syns:
                new_word = max(syns, key=lambda s: (len(s), s))
                new_paragraph.append(new_word)
            else:
                new_paragraph.append(word)
        new_cap_paragraph = capitalize_sentences(new_paragraph)
        new_doc.add_paragraph(new_cap_paragraph)
    new_doc.save(head + '/bk_' + tail)
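
A hedged invocation sketch: the code above only reads arg.corporate, so an argparse namespace with a boolean flag suffices (the .docx filename is a placeholder):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--corporate', action='store_true')
babykangarooify('report.docx', parser.parse_args(['--corporate']))
# writes bk_report.docx next to the input file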
Example #4
def unscrambler():
    # The original signature took *value but immediately overwrote it with
    # user input, so the parameter was dead weight.
    value = int(input("Enter a value: "))
    alp = {x + 1: y for x, y in enumerate(ascii_lowercase)}
    splitter = [int(x) for x in str(value)]
    pot_v = [alp[i] for i in splitter]

    for i in range(len(splitter) - 1):
        fin_val = int(str(splitter[i]) + str(splitter[i + 1]))
        if fin_val <= 26:
            pot_v.append((alp[fin_val]))

    pot_v.sort()
    print(pot_v)

    d = Dict("en_US")  # underscore tag, as in the other examples

    # Permutation lengths must run up to len(pot_v) inclusive, otherwise the
    # full-length permutations are never generated; the unused str_pot_val
    # list has been dropped.
    with open('unscrambled.txt', 'w') as f:
        for i in range(1, len(pot_v) + 1):
            for combination in set(map("".join, itertools.permutations(pot_v, i))):
                if len(combination) > 2 and d.check(combination):
                    f.write(combination + '\n')
                print(combination)
Example #5
def test_unicode_tag(broker):
    """Test that unicode language tags are accepted"""
    d1 = broker._request_dict_data("en_US")
    assert d1
    broker._free_dict_data(d1)
    d1 = Dict("en_US")
    assert d1
Example #6
def test_pickling(en_us_dict):
    """Test that pickling doesn't corrupt internal state."""
    d1 = Dict("en")
    assert d1.check("hello")
    d2 = pickle.loads(pickle.dumps(d1))
    assert d1.check("hello")
    assert d2.check("hello")
    d1._free()
    assert d2.check("hello")
Example #7
def random_word():
    length = randrange(2, 9)
    seed = ''.join(choice(string.ascii_lowercase) for x in range(length))
    dic = Dict('en_US')
    words = dic.suggest(seed)
    if len(words) == 0:
        return seed
    else:
        return choice(words)
Example #8
    def spellcheck_command(self, *args):
        dic = Dict("en_US")
        data = word_tokenize(self.textPad.get('1.0', 'end-1c'))
        for word in data:
            if not dic.check(word) and word.isalpha():
                suggestions_list = dic.suggest(word)
                suggestions_str = " ".join(suggestions_list)
                showinfo("Suggestions for '" + word + "'\n", suggestions_str)
        showinfo("Spell Check", "Finished checking!")
Example #9
    def _getDict(self, lang, path):
        key = (lang, path)
        if key not in self._dictCache:
            broker = Broker()
            broker.set_param('enchant.myspell.dictionary.path', path)
            currentDict = Dict(lang, broker)
            self._dictCache[key] = currentDict
        else:
            currentDict = self._dictCache[key]

        return currentDict
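
For context, a standalone sketch of the broker trick used above: a private Broker can be pointed at a custom dictionary directory before requesting a Dict (the directory path is a placeholder):

from enchant import Broker, Dict

broker = Broker()
broker.set_param('enchant.myspell.dictionary.path', '/path/to/custom/dicts')
d = Dict('en_US', broker)  # resolved against the custom dictionary path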
Example #10
def text2words(text, lang='en_US', min_length=3):
    lang_dict = Dict(lang)
    tknzr = get_tokenizer(lang)

    # Processed text: punctuation removal (except '-').  The original module
    # used a precompiled `regex` object that is not shown in this snippet;
    # the pattern below is an assumed equivalent (requires `import re`).
    punctuation = re.compile(r"[^\w\s-]")
    p_text = punctuation.sub('', text)
    tokens = [token for token, _ in tknzr(p_text)]
    words = filter(lambda token: len(token) >= min_length, tokens)
    words = filter(lang_dict.check, words)
    return words
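
A usage note, with an illustrative input: the function returns a lazy filter object, so materialize it with list() before reuse:

words = list(text2words("Spell-checking short texts is straight-forward!"))
# tokens shorter than min_length or failing the dictionary check are dropped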
Example #11
def spell_check(input_question):

    pattern = "\W"
    prog = compile(pattern)

    input_question_word_list = input_question.split()
    en_dict = Dict("en_US")
    for word_index, word in enumerate(input_question_word_list):
        if not en_dict.check(word) and prog.match(word) is None:
            input_question_word_list[word_index] = spell(word)
    return " ".join(input_question_word_list)
Example #12
    def get_all_sub_words(word: str, dictionary: enchant.Dict,
                          min_length: int) -> Set[str]:
        """ Get all words included in a larger word, including that larger word.

        Arguments:
            word (str): The word of which we extract all sub words.
            dictionary (enchant.Dict): The dictionary instance which checks if something is a word.
            min_length (int): The minimum length of a sub word before it is taken into account.
        """
        all_sub_words = {word}
        for i in range(len(word)):
            for j in range(i + min_length, len(word) + 1):
                sub_word = word[i:j]
                if dictionary.check(sub_word):
                    all_sub_words.add(sub_word)
        return all_sub_words
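
A brief hedged usage example (the exact result depends on the installed en_US dictionary):

d = enchant.Dict("en_US")
get_all_sub_words("starts", d, min_length=4)
# typically {'starts', 'start', 'star', 'tart', 'tarts', 'arts'}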
Example #13
    def set_spelling_options(self, spelling, dicts, pwl_files):
        """Set spelling options."""
        self.spelling = spelling
        self.dicts = dicts
        self.pwl = get_concatenated_files(pwl_files)

        # build extra checkers with dicts
        self.extra_checkers = []
        if dicts:
            if not ENCHANT_FOUND:
                raise ImportError('Enchant module not found (please install '
                                  '"pyenchant")')
            for lang in dicts.split(','):
                try:
                    _dict = Dict(lang)
                    self.extra_checkers.append(SpellChecker(_dict))
                except DictNotFoundError:
                    print('WARNING: enchant dictionary not found for '
                          'language "{0}"'.format(lang))
Example #14
    def initialise(self, sitecheck):
        super(Spelling, self).initialise(sitecheck)

        # Spell checker must be re-created when check is resumed
        global _enchant_available
        if _enchant_available:
            # os.path.join supplies the missing path separator; plain string
            # concatenation produced paths like '.../modulesdict.txt'.
            ddp = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dict.txt')
            cdp = os.path.join(self.sitecheck.session.root_path, 'dict.txt')

            if os.path.exists(cdp):
                self.dictionary = cdp
                d = DictWithPWL(self.language, cdp)
            elif os.path.exists(ddp):
                self.dictionary = ddp
                d = DictWithPWL(self.language, ddp)
            else:
                d = Dict(self.language)

            self.spell_checker = SpellChecker(d,
                                              filters=[EmailFilter, URLFilter])
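
For reference, a hedged sketch of the DictWithPWL fallback used above: it behaves like a plain Dict but also accepts words listed in a personal-word-list file (the filename is a placeholder):

from enchant import DictWithPWL

d = DictWithPWL("en_US", "my_words.txt")  # PWL file: one word per line
d.check("pyenchant")  # True if in the language dictionary or in the PWL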
Example #15
def spell_corrections(bow_dict):
    speller = Dict('en_US')
    # Set comprehensions replace list comprehensions that were used purely
    # for their side effects.
    vocab = {w for q in bow_dict for w in bow_dict[q]}
    incorrect = {w for w in vocab if not speller.check(w)}
    corrected = dict()
    print('Corrections Started')
    for w in incorrect:
        corrections = speller.suggest(w)
        if len(corrections) > 0:
            corrected[w] = corrections[0]
    with connect('../Dumps/db.db') as con:
        cur = con.cursor()
        cur.execute('DROP TABLE IF EXISTS spell_corrections')
        cur.execute('''CREATE TABLE IF NOT EXISTS spell_corrections(
            original TEXT NOT NULL,
            corrected TEXT NOT NULL);''')
        cur.executemany(
            '''INSERT INTO spell_corrections(original, corrected) VALUES(? , ?)''',
            list(corrected.items()))
Example #16
    def __init__(self, path, wl_dir, chunkers, filters):
        self.popath = path
        self.po = polib.pofile(path)
        self.lang = self.po.metadata["Language"]

        available_lang = Broker().list_languages()
        if self.lang not in available_lang:
            baselang = self.lang.split("_")[0]
            if baselang in available_lang:
                self.lang = baselang
            else:
                print("Dictionary for language '%s' could not be found." % self.lang)
                raise errors.DictNotFoundError

        wordlist = Check.get_wordlist(self.lang, wl_dir, path)
        try:
            check_dict = DictWithPWL(self.lang, pwl=wordlist)
        except errors.Error as e:
            check_dict = Dict(self.lang)
            print(e)
        self.checker = SpellChecker(check_dict, chunkers=chunkers, filters=filters)
Example #17
def main():
    letters = 'rilsedxcu'
    dictionary = Dict('en_US')  # underscore tag matches the installed dictionaries

    all_length_combinations = [
        combinations(letters, i + 1) for i in range(4, len(letters))
    ]
    all_words_list = []
    for single_length_combination in all_length_combinations:
        for combo in single_length_combination:
            all_words_list.extend([''.join(p) for p in permutations(combo)])

    pool = Pool(processes=4)
    in_dictionary = pool.map(dictionary.check, all_words_list)
    pool.close()
    pool.join()

    valid_word_list = [
        word for word, result in zip(all_words_list, in_dictionary)
        if result
    ]

    print(valid_word_list)
Example #18
    def __init__(self):
        self.DICT = Dict("en_US")
        self.suggested_advices = set()
        self.suggested_words = set()
Example #19
    def __init__(self):
        # type: () -> None
        """Initialize the dictionaries."""
        self._spell = Spell('en_US')
        self._dictionary = PyDictionary('html.parser')
        _log.debug('Initialized %s instance correctly', type(self).__name__)
Example #20
class English:
    """English dictionary.

    Attributes:
        TypeMeanings: Type of the returned meanings from `meanings()`.
        TypeDefinition: Type of the returned definition from `define()`.

    """

    # https://mypy.readthedocs.io/en/latest/cheat_sheet.html
    TypeMeanings = Dict[str, List[str]]
    TypeDefinition = Dict[str, Union[List[str], TypeMeanings]]

    def __init__(self):
        # type: () -> None
        """Initialize the dictionaries."""
        self._spell = Spell('en_US')
        self._dictionary = PyDictionary('html.parser')
        _log.debug('Initialized %s instance correctly', type(self).__name__)

    def check(self, word):
        # type: (str) -> bool
        """Check if a word is in the English dictionary.

        Args:
            word: The word to check.

        Returns:
            True if it is and False otherwise.

        """
        out = self._spell.check(word)  # type: bool
        return out

    def suggest(self, misspelled_word):
        # type: (str) -> List[str]
        """Suggest corrections for a misspelled word.

        Args:
            misspelled_word: The word to use.

        Returns:
            A list of suggestions.

        """
        out = self._spell.suggest(misspelled_word)  # type: List[str]
        return out

    def meanings(self, word):
        # type: (str) -> English.TypeMeanings
        """Get the meanings of a word if they exists.

        Args:
            word: The word to use.

        Returns:
            A dict of meanings, empty if none are found.

        """
        with CaptureStdStreams():
            out = self._dictionary.meaning(
                word)  # type: Optional[English.TypeMeanings]
        if out is None:
            _log.debug('Could not find any meaning to %s', word)
            return {}
        return out

    def synonyms(self, word):
        # type: (str) -> List[str]
        """Get the synonyms of a word if they exists.

        Args:
            word: The word to use.

        Returns:
            A list of synonyms.

        """
        with CaptureStdStreams():
            out = self._dictionary.synonym(word)  # type: Optional[List[str]]
        if out is None:
            _log.debug('Could not find any synonym to %s', word)
            return []
        return out

    def antonyms(self, word):
        # type: (str) -> List[str]
        """Get the antonyms of a word if they exists.

        Args:
            word: The word to use.

        Returns:
            A list of antonyms.

        """
        with CaptureStdStreams():
            out = self._dictionary.antonym(word)  # type: Optional[List[str]]
        if out is None:
            _log.debug('Could not find any antonym to %s', word)
            return []
        return out

    def define(self, word):
        # type: (str) -> English.TypeDefinition
        """Define a word and find its synonyms and antonyms.

        Args:
            word: The word to define.

        Returns:
            A dict of meanings, synonyms and antonyms.

        """
        out = {
            'Meanings': self.meanings(word),
            'Synonyms': self.synonyms(word),
            'Antonyms': self.antonyms(word),
        }  # type: English.TypeDefinition
        # we have to put the above type comment because mypy cannot
        # infer the type correctly. Instead, it infers
        # `Dict[str, Collection[str]]`. However, we can do:
        # `return {...}` and it would infer it correctly.
        return out
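
A short hedged usage sketch of the English class above (the suggestion and definition values depend on the installed dictionaries, so they are illustrative only):

eng = English()
eng.check('hello')    # True
eng.suggest('helo')   # e.g. ['hello', 'halo', ...]
eng.define('hello')   # {'Meanings': {...}, 'Synonyms': [...], 'Antonyms': [...]}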
Example #21
    def __init__(self):
        self.stopWords = stopwords.words('english')
        self.stemmer = SnowballStemmer('english')
        self.spellcheck = Dict()  # no tag: falls back to the default language
Example #22
from __future__ import unicode_literals
from random import randint

from django.db import models
from django.contrib.auth.models import User

from enchant import Dict
from enchant.tokenize import get_tokenizer

DICTIONARY = Dict('en_US')
TOKENIZER = get_tokenizer('en_US')


def default_randomness():
    return randint(0, 10000)


class MotionFile(models.Model):
    MARKER_SET_KIT = 0  # do not change values, since they are stored in the DB!
    MARKER_SET_CMU = 1

    class Meta:
        unique_together = ('motion_db_id', 'motion_db_file_id')

    motion_db_id = models.PositiveIntegerField()
    motion_db_file_id = models.PositiveIntegerField()
    filename = models.CharField(max_length=255, unique=True)
    mean_perplexity = models.FloatField(default=0.)
    is_broken_confirmed = models.BooleanField(default=False)
    is_broken_reported = models.BooleanField(default=False)
    marker_set = models.PositiveIntegerField(default=MARKER_SET_KIT)
Example #23
import os
from re import match, sub
from nltk import word_tokenize
from enchant import Dict
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
for word in ['rt', 'co', 'amp']:
    stopwords.add(word)
word_dict = Dict("en_US")
stemmer = PorterStemmer()

TOKEN_DIR = './tokenized_corpus/'
RAW_TWEET_DIR = './tweet_corpus/'

if __name__ == '__main__':
    if not os.path.exists(TOKEN_DIR):
        os.mkdir(TOKEN_DIR)
    for ticker in os.listdir(RAW_TWEET_DIR):
        tickerpath = RAW_TWEET_DIR + ticker + '/'
        ticker_token_file = TOKEN_DIR + ticker + '.dat'
        with open(ticker_token_file, 'w') as token_file:
            for filename in os.listdir(tickerpath):
                tweet_id = match("(.*)\.dat", filename).group(1)
                content = open(tickerpath + filename).read()
                content = content.lower()
                content = sub(r'\W+', ' ', content)
                tokens = word_tokenize(content)
                tokens = [
                    stemmer.stem(x) for x in tokens
                    if x not in stopwords and len(x) > 1 and word_dict.check(x)
Example #24
def process_tokens(words, normalize_plurals=True):
    """Normalize cases and remove plurals.

    Each word is represented by the most common case.
    If a word appears with an "s" on the end and without an "s" on the end,
    the version with "s" is assumed to be a plural and merged with the
    version without "s" (except if the word ends with "ss").

    Parameters
    ----------
    words : iterable of strings
        Words to count.

    normalize_plurals : bool, default=True
        Whether to try and detect plurals and remove trailing "s".

    Returns
    -------
    counts : dict from string to int
        Counts for each unique word, with cases represented by the most common
        case, and plurals removed.

    standard_forms : dict from string to string
        For each lower-case word the standard capitalization.
    """
    # words can be either a list of unigrams or bigrams
    # d is a dict of dicts.
    # Keys of d are word.lower(). Values are dicts
    # counting frequency of each capitalization
    eng_d = Dict("en_US")
    d = defaultdict(dict)
    for word in words:
        word_lower = word.lower()
        # get dict of cases for word_lower
        case_dict = d[word_lower]
        # increase this case
        case_dict[word] = case_dict.get(word, 0) + 1
    if normalize_plurals:
        # merge plurals into the singular count (simple cases only)
        merged_plurals = {}
        for key in list(d.keys()):
            if key.endswith('s') and not key.endswith("ss"):
                key_singular = key[:-1]
                if eng_d.check(key_singular):
                    if key_singular in d:
                        dict_plural = d[key]
                        dict_singular = d[key_singular]
                        for word, count in dict_plural.items():
                            singular = word[:-1]
                            dict_singular[singular] = (
                                dict_singular.get(singular, 0) + count)
                        merged_plurals[key] = key_singular
                        del d[key]
    fused_cases = {}
    standard_cases = {}
    item1 = itemgetter(1)
    for word_lower, case_dict in d.items():
        # Get the most popular case.
        first = max(case_dict.items(), key=item1)[0]
        fused_cases[first] = sum(case_dict.values())
        standard_cases[word_lower] = first
    if normalize_plurals:
        # add plurals to fused cases:
        for plural, singular in merged_plurals.items():
            standard_cases[plural] = standard_cases[singular.lower()]
    return fused_cases, standard_cases
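
A worked example of the plural merging described in the docstring (hedged: assumes "dog" passes the en_US dictionary check, as it does in standard installs):

fused, standard = process_tokens(["Dog", "dog", "dog", "dogs", "dogs"])
# fused    -> {'dog': 5}                      counts merged into the most common case
# standard -> {'dog': 'dog', 'dogs': 'dog'}   plural mapped to its singular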
Example #25
@pytest.fixture
def en_us_dict():
    res = Dict("en_US")
    yield res
    del res
Example #26
# -*- coding: utf-8 -*-

# imports
import re
from enchant import Dict, tokenize

# spellchecking
dictionary = Dict('en_US')
tokenizer = tokenize.get_tokenizer('en_US')


def is_all_titlecased(string):
    '''checks if each word in a string is titlecased'''
    # enchant's tokenizer breaks a string into tokens of its constituent
    # words, organized as tuples of the form (word, start_index)
    # we check only the first character because all-caps strings are not considered title-cased
    return all(token[0][0].istitle() for token in list(tokenizer(string)))


def filter_periods(word):
    '''filter out periods in acronyms/all-caps words'''
    return re.sub(r'\.', '', word)


def is_all_caps(word):
    '''checks if a word is all-caps'''
    # filter out '.' in case word is an acronym of the form 'U.S.A.' rather than 'USA'
    return all(char.istitle() for char in list(filter_periods(word)))


def try_to_fix_case(word):
    '''return top suggestion if it differs from word only in case, otherwise return original word'''
Example #27
        cmp = lst[mid][:-1]
        freq = lst[mid][-1]
        if ele == cmp:
            return freq
        elif compare(ele, cmp):
            l = mid + 1
        else:
            u = mid - 1
    else:
        return 0


from enchant import Dict

d = Dict("en_GB")
start = time.time()
en_dirname = "/Users/keertankrishnan/Documents/Project Work/CDSAML/Old Laptop Files/CDSAML/news/"
hi_dirname = "/Users/keertankrishnan/Documents/Project Work/CDSAML/Old Laptop Files/CDSAML/news/"
hindi = []
english = []

hin = open(hi_dirname + "s8_" + "hindi(clean_no_handle_withfreq)_fast.bin",
           "rb")
eng = open(en_dirname + "s8_" + "eng(withfreq)_fast.bin",
           "rb")  #training data in the form of n-grams for english
for k in range(4, 0, -1):
    hindi.append(
        p.load(hin)
    )  #the first time, 4-grams are loaded, the second time, 3-grams and so on
    english.append(p.load(eng))
Example #28
from string import ascii_uppercase
from enchant import Dict

Letters = list(ascii_uppercase)

cipher = "jslnsjjw"  #know the right english word whih meet it

d = Dict("en_US")

for K in range(1, 26):
    PT = ""
    for ch in cipher:
        PT += Letters[(Letters.index(ch.upper()) - K + 26) % 26]

    if d.check(PT):
        print(f"for key {K} the plain text is {PT}\n")
Example #29
from enchant import Dict
from enchant.checker import SpellChecker
from enchant.tokenize import EmailFilter, URLFilter

from cloudbot import hook

locale = "en_US"
en_dict = Dict(locale)


@hook.command()
def spell(text):
    """<word/sentence> - Check spelling of a word or sentence."""
    if len(text.split(" ")) > 1:
        # input is a sentence
        checker = SpellChecker(en_dict, filters=[EmailFilter, URLFilter])
        checker.set_text(text)

        is_correct = True
        offset = 0
        for err in checker:
            is_correct = False
            # find the location of the incorrect word
            start = err.wordpos + offset
            finish = start + len(err.word)
            # get some suggestions for it
            suggestions = err.suggest()
            s_string = '/'.join(suggestions[:3])
            s_string = "[h1]{}[/h1]".format(s_string)
            # calculate the offset for the next word
            offset = (offset + len(s_string)) - len(err.word)
Example #30
from difflib import SequenceMatcher
from statistics import median

import cv2
import numpy as np
import pytesseract
from enchant import Dict
from enchant.checker import SpellChecker

from thumbframes_dl import YouTubeFrames

# Never Tell Me the Odds - Star Noirs One-off opening crawl | Saving Throw | CC BY 3.0
VIDEO_URL = 'https://www.youtube.com/watch?v=kEVOHhFg_s4'

LANG = ('en', 'eng')
dictionary = Dict(LANG[0])
spellchecker = SpellChecker(LANG[0])


# pytesseract.image_to_string returns a clean string but no confidence
# levels; pytesseract.image_to_data returns raw per-word data that has to
# be parsed by hand
def parse_pytesseract_output(data):

    # this list is a mix of ints as numbers and ints as strings
    data['conf'] = [int(num) for num in data['conf']]

    # only return line if confidence is high enough
    def _get_line_if_confident(start_index, end_index=None):
        if len(data['conf'][start_index:end_index]) == 0:
            return
        if median(data['conf'][start_index:end_index]) >= 70: