Code example #1
def test_DefaultLang(en_us_dict):
    """Test behaviour of default language selection."""
    defLang = get_default_language()
    if defLang is None:
        # If no default language, shouldn't work
        with pytest.raises(Error):
            Dict()
    else:
        # If there is a default language, should use it
        # Of course, no need for the dict to actually exist
        try:
            d = Dict()
            assert d.tag == defLang
        except DictNotFoundError:
            pass
Code example #2
def test_unicode_tag(broker):
    """Test that unicode language tags are accepted"""
    d1 = broker._request_dict_data("en_US")
    assert d1
    broker._free_dict_data(d1)
    d1 = Dict("en_US")
    assert d1
Code example #3
    def __init__(self, model_path=None, tag=None, broker=None):
        """XDict object constructor.

        XDict requires pretrained model "GoogleNews-vectors-negative300.bin"
        in order to give smart suggestions for misspelled words.

        It is recommended to give the model path when creating an XDict
        object.  Otherwise XDict will try to download the model and search
        for it in the following locations:
        1) /home/$USER/.enchantx/GoogleNews-vectors-negative300.bin
        2) the current working directory

        A dictionary belongs to a specific language, identified by the
        string <tag>.  If the tag is not given or is None, an attempt to
        determine the language currently in use is made using the 'locale'
        module.  If the current language cannot be determined, Error is raised.

        If <tag> is instead given the value of False, a 'dead' Dict object
        is created without any reference to a language.  This is typically
        only useful within PyEnchant itself.  Any other non-string value
        for <tag> raises Error.

        Each dictionary must also have an associated Broker object which
        obtains the dictionary information from the underlying system. This
        may be specified using <broker>.  If not given, the default broker
        is used.
        """
        self._home_dir = os.path.expanduser("~/.enchantx")
        self.enchant_obj = Dict(tag=tag, broker=broker)
        self.enchantX = WORD2VEC(model_path)
        if model_path is None:
            self._create_home_dir_and_download_glove()
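
A minimal usage sketch of the tag-selection behaviour described in the docstring above (hypothetical, not taken from the project; it assumes PyEnchant is installed and an en_US dictionary is available):

from enchant import Dict
from enchant.errors import Error

try:
    # tag=None: infer the language from the current locale
    d = Dict()
except Error:
    # No default language could be determined, or no dictionary
    # exists for it; fall back to an explicit tag.
    d = Dict("en_US")
print(d.tag)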
Code example #4
def babykangarooify(path, arg):
    head, tail = ntpath.split(path)
    d = Dict("en_US")
    doc = docx.Document(path)
    new_doc = docx.Document()

    # Read the corporate buzzword list once; it is reused for every word below.
    if arg.corporate:
        with open('buzzwords/corporate.txt') as buzzword_file:
            synergize_words = buzzword_file.read().splitlines()

    for paragraph in doc.paragraphs:
        new_paragraph = []
        for word in paragraph.text.split():
            if 'joey' in word.lower():
                new_paragraph.append('baby kangaroo')
                continue
            syns = [l.name() for syn in wordnet.synsets(word) for l in syn.lemmas() if d.check(l.name())]
            if arg.corporate and syns:
                possible = []
                for syn_word in synergize_words:
                    synergy_syns = [l.name() for syn in wordnet.synsets(syn_word) for l in syn.lemmas() if d.check(l.name())]
                    synergy_exists = [i for i in synergy_syns if i in syns]
                    if synergy_exists:
                        possible.append(syn_word)
                new_word = max(possible, key=lambda s: (len(s), s)) if possible else word
                new_paragraph.append(new_word)
            elif syns:
                new_word = max(syns, key=lambda s: (len(s), s))
                new_paragraph.append(new_word)
            else:
                new_paragraph.append(word)
        new_cap_paragraph = capitalize_sentences(new_paragraph)
        new_doc.add_paragraph(new_cap_paragraph)
    new_doc.save(head + '/bk_' + tail)
Code example #5
def unscrambler():
    value = int(input("Enter a value: "))
    alp = {x + 1: y for x, y in enumerate(ascii_lowercase)}
    splitter = [int(x) for x in str(value)]
    pot_v = [alp[i] for i in splitter]

    # Adjacent digit pairs may also encode a letter (e.g. "2" + "1" -> 21 -> "u").
    for i in range(len(splitter) - 1):
        fin_val = int(str(splitter[i]) + str(splitter[i + 1]))
        if fin_val <= 26:
            pot_v.append(alp[fin_val])

    pot_v.sort()
    print(pot_v)

    d = Dict("en_US")

    with open('unscrambled.txt', 'w') as f:
        for i in range(len(pot_v)):
            for combination in set(map("".join, itertools.permutations(pot_v, i))):
                if len(combination) > 2 and d.check(combination):
                    f.write(combination + '\n')
                print(combination)
Code example #6
File: search_insert.py Project: Lowgain/sigworld
def random_word():
    length = randrange(2, 9)
    seed = ''.join(choice(string.ascii_lowercase) for x in range(length))
    dic = Dict('en_US')
    words = dic.suggest(seed)
    if len(words) == 0:
        return seed
    else:
        return choice(words)
Code example #7
def test_pickling(en_us_dict):
    """Test that pickling doesn't corrupt internal state."""
    d1 = Dict("en")
    assert d1.check("hello")
    d2 = pickle.loads(pickle.dumps(d1))
    assert d1.check("hello")
    assert d2.check("hello")
    d1._free()
    assert d2.check("hello")
Code example #8
def text2words(text, lang='en_US', min_length=3):

    dict_en_US = Dict(lang)
    tknzr = get_tokenizer(lang)

    # Processed text: punctuation removal (except '-')
    p_text = regex.sub('', text)
    tokens = [token for token, _ in tknzr(p_text)]
    words = filter(lambda token: len(token) >= min_length, tokens)
    words = filter(dict_en_US.check, words)
    return words
Code example #9
File: enchantwrapper.py Project: sarutobi/outwiker
    def _getDict(self, lang, path):
        key = (lang, path)
        if key not in self._dictCache:
            broker = Broker()
            broker.set_param('enchant.myspell.dictionary.path', path)
            currentDict = Dict(lang, broker)
            self._dictCache[key] = currentDict
        else:
            currentDict = self._dictCache[key]

        return currentDict
Code example #10
File: TextEditor.py Project: Amith-Kumar-V/TextMate
    def spellcheck_command(self, *args):
        dic = Dict("en_US")
        data = word_tokenize(self.textPad.get('1.0', 'end-1c'))
        for word in data:
            if not dic.check(word) and word.isalpha():
                suggestions_list = dic.suggest(word)
                suggestions_str = " ".join(suggestions_list)
                showinfo("Suggestions for '" + word + "'\n", suggestions_str)
        showinfo("Spell Check", "Finished checking!")
Code example #11
def spell_check(input_question):

    pattern = "\W"
    prog = compile(pattern)

    input_question_word_list = input_question.split()
    en_dict = Dict("en_US")
    for word_index in range(len(input_question_word_list)):
        if not en_dict.check(input_question_word_list[word_index]) and prog.match(input_question_word_list[word_index]) is None:
            correct_word = spell(input_question_word_list[word_index])
            input_question_word_list[word_index] = correct_word
    return " ".join(input_question_word_list)
Code example #12
File: po.py Project: rimrul/msgcheck
    def set_spelling_options(self, spelling, dicts, pwl_files):
        """Set spelling options."""
        self.spelling = spelling
        self.dicts = dicts
        self.pwl = get_concatenated_files(pwl_files)

        # build extra checkers with dicts
        self.extra_checkers = []
        if dicts:
            if not ENCHANT_FOUND:
                raise ImportError('Enchant module not found (please install '
                                  '"pyenchant")')
            for lang in dicts.split(','):
                try:
                    _dict = Dict(lang)
                    self.extra_checkers.append(SpellChecker(_dict))
                except DictNotFoundError:
                    print('WARNING: enchant dictionary not found for '
                          'language "{0}"'.format(lang))
Code example #13
File: modules.py Project: cash2one/sitecheck
    def initialise(self, sitecheck):
        super(Spelling, self).initialise(sitecheck)

        # Spell checker must be re-created when check is resumed
        global _enchant_available
        if _enchant_available:
            ddp = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dict.txt')
            cdp = os.path.join(self.sitecheck.session.root_path, 'dict.txt')

            if os.path.exists(cdp):
                self.dictionary = cdp
                d = DictWithPWL(self.language, cdp)
            elif os.path.exists(ddp):
                self.dictionary = ddp
                d = DictWithPWL(self.language, ddp)
            else:
                d = Dict(self.language)

            self.spell_checker = SpellChecker(d,
                                              filters=[EmailFilter, URLFilter])
Code example #14
    def __init__(self, path, wl_dir, chunkers, filters):
        self.popath = path
        self.po = polib.pofile(path)
        self.lang = self.po.metadata["Language"]

        available_lang = Broker().list_languages()
        if self.lang not in available_lang:
            baselang = self.lang.split("_")[0]
            if baselang in available_lang:
                self.lang = baselang
            else:
                print("Dictionary for language '%s' could not be found." % self.lang)
                raise errors.DictNotFoundError()

        wordlist = Check.get_wordlist(self.lang, wl_dir, path)
        try:
            check_dict = DictWithPWL(self.lang, pwl=wordlist)
        except errors.Error as e:
            check_dict = Dict(self.lang)
            print(e)
        self.checker = SpellChecker(check_dict, chunkers=chunkers, filters=filters)
Code example #15
File: preprocessor.py Project: msingh27/nlp_group6
def spell_corrections(bow_dict):
    speller = Dict('en_US')
    vocab = {w for q in bow_dict for w in bow_dict[q]}
    incorrect = {w for w in vocab if not speller.check(w)}
    corrected = dict()
    print('Corrections Started')
    for w in incorrect:
        corrections = speller.suggest(w)
        if len(corrections) > 0:
            corrected[w] = corrections[0]
    with connect('../Dumps/db.db') as con:
        cur = con.cursor()
        cur.execute('DROP TABLE IF EXISTS spell_corrections')
        cur.execute('''CREATE TABLE IF NOT EXISTS spell_corrections(
            original TEXT NOT NULL,
            corrected TEXT NOT NULL);''')
        cur.executemany(
            '''INSERT INTO spell_corrections(original, corrected) VALUES(? , ?)''',
            [(w, corrected[w]) for w in corrected.keys()])
Code example #16
File: countdown.py Project: anirudhramesh/Countdown
def main():
    letters = 'rilsedxcu'
    dictionary = Dict('en_US')

    all_length_combinations = [
        combinations(letters, i + 1) for i in range(4, len(letters))
    ]
    all_words_list = []
    for single_length_combination in all_length_combinations:
        for combo in single_length_combination:
            all_words_list.extend([''.join(p) for p in permutations(combo)])

    pool = Pool(processes=4)
    in_dictionary = pool.map(dictionary.check, all_words_list)
    pool.close()
    pool.join()

    valid_word_list = [
        word for word, result in zip(all_words_list, in_dictionary)
        if result
    ]

    print(valid_word_list)
Code example #17
        cmp = lst[mid][:-1]
        freq = lst[mid][-1]
        if ele == cmp:
            return freq
        elif compare(ele, cmp):
            l = mid + 1
        else:
            u = mid - 1
    else:
        return 0


from enchant import Dict

d = Dict("en_GB")
start = time.time()
en_dirname = "/Users/keertankrishnan/Documents/Project Work/CDSAML/Old Laptop Files/CDSAML/news/"
hi_dirname = "/Users/keertankrishnan/Documents/Project Work/CDSAML/Old Laptop Files/CDSAML/news/"
hindi = []
english = []

hin = open(hi_dirname + "s8_" + "hindi(clean_no_handle_withfreq)_fast.bin",
           "rb")
eng = open(en_dirname + "s8_" + "eng(withfreq)_fast.bin",
           "rb")  #training data in the form of n-grams for english
for k in range(4, 0, -1):
    hindi.append(
        p.load(hin)
    )  #the first time, 4-grams are loaded, the second time, 3-grams and so on
    english.append(p.load(eng))
Code example #18
    def __init__(self):
        self.stopWords = stopwords.words('english')
        self.stemmer = SnowballStemmer('english')
        self.spellcheck = Dict()
Code example #19
@pytest.fixture
def en_us_dict():
    res = Dict("en_US")
    yield res
    del res
Code example #20
File: params.py Project: simochlieh/posos-challenge
    'min_df': 1,
    'max_features': None,
    'vocabulary': None,
    'binary': False,
    'norm': "l2",
    'use_idf': True,
    'smooth_idf': True,
    'sublinear_tf': True,
    'sparse': True,
    'verbose': 0
}

#################################################
# Cleaning and lemmatization parameters
#################################################
FR_DICT = Dict("fr_FR")
FR_DICT_BLACKLIST = ('aspirine', 'carlin', 'morphine')
DRUG_NAMES_BLACKLIST = ('\n', 'anti', 'santé')
RCP_ENCODING = 'ISO-8859-1'
UTF_8 = 'utf-8'
DRUG_NAME_COL = 'name'
DRUG_COMPLETE_NAME_COL = 'complete_name'
DRUG_ID_COL = 'id'
INPUT_TRAIN_FILENAME = './data/input_train.csv'
INPUT_TEST_FILENAME = './data/input_test.csv'

RCP_FILENAME = './data/rcp/CIS.txt'
SENTENCE_ID = 'id'
RAW_SENTENCE_COL = 'raw_sentence'
CORR_LEMM_SENTENCE_COL = 'corr_lemm_sentence'
DRUG_NAMES_COL = 'drug_names'
Code example #21
    def __init__(self):
        self.generator = Dict('en_US')
Code example #22
File: vienna.py Project: adyachok/piggy-bee
    def __init__(self):
        self.DICT = Dict("en_US")
        self.suggested_advices = set()
        self.suggested_words = set()
Code example #23
import pickle
import os
# import matplotlib.pyplot as plt

# cleaning utilities

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from html.parser import HTMLParser
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

from enchant import Dict
dictionary = Dict("en_US")

porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = list(set(stopwords.words('english')))
h = HTMLParser()


def lemmatize_and_stem(word):
    """
    Lemmatize and stem a word, keeping results that pass a dictionary check.
    """
    lemmatized = lemmatizer.lemmatize(word)
    if lemmatized != word:
        if dictionary.check(lemmatized):
            return lemmatized
Code example #24
from difflib import SequenceMatcher
from statistics import median

import cv2
import numpy as np
import pytesseract
from enchant import Dict
from enchant.checker import SpellChecker

from thumbframes_dl import YouTubeFrames

# Never Tell Me the Odds - Star Noirs One-off opening crawl | Saving Throw | CC BY 3.0
VIDEO_URL = 'https://www.youtube.com/watch?v=kEVOHhFg_s4'

LANG = ('en', 'eng')
dictionary = Dict(LANG[0])
spellchecker = SpellChecker(LANG[0])


# pytesseract.image_to_string returns a nice string, but no confidence level,
# pytesseract.image_to_data dumps this whole mess, so you have to parse it
def parse_pytesseract_output(data):

    # this list is a mix of ints as numbers and ints as strings
    data['conf'] = [int(num) for num in data['conf']]

    # only return line if confidence is high enough
    def _get_line_if_confident(start_index, end_index=None):
        if len(data['conf'][start_index:end_index]) == 0:
            return
        if median(data['conf'][start_index:end_index]) >= 70:
Code example #25
from string import ascii_uppercase
from enchant import Dict

Letters = list(ascii_uppercase)

cipher = "jslnsjjw"  #know the right english word whih meet it

d = Dict("en_US")

for K in range(1, 26):
    PT = ""
    for ch in cipher:
        PT += Letters[(Letters.index(ch.upper()) - K + 26) % 26]

    if d.check(PT):
        print(f"for key {K} the plain text is {PT}\n")
Code example #26
def process_tokens(words, normalize_plurals=True):
    """Normalize cases and remove plurals.

    Each word is represented by the most common case.
    If a word appears with an "s" on the end and without an "s" on the end,
    the version with "s" is assumed to be a plural and merged with the
    version without "s" (except if the word ends with "ss").

    Parameters
    ----------
    words : iterable of strings
        Words to count.

    normalize_plurals : bool, default=True
        Whether to try and detect plurals and remove trailing "s".

    Returns
    -------
    counts : dict from string to int
        Counts for each unique word, with cases represented by the most common
        case, and plurals removed.

    standard_forms : dict from string to string
        For each lower-case word the standard capitalization.
    """
    # words can be either a list of unigrams or bigrams
    # d is a dict of dicts.
    # Keys of d are word.lower(). Values are dicts
    # counting frequency of each capitalization
    eng_d = Dict("en_US")
    d = defaultdict(dict)
    for word in words:
        word_lower = word.lower()
        # get dict of cases for word_lower
        case_dict = d[word_lower]
        # increase this case
        case_dict[word] = case_dict.get(word, 0) + 1
    if normalize_plurals:
        # merge plurals into the singular count (simple cases only)
        merged_plurals = {}
        for key in list(d.keys()):
            if key.endswith('s') and not key.endswith("ss"):
                key_singular = key[:-1]
                if eng_d.check(key_singular):
                    if key_singular in d:
                        dict_plural = d[key]
                        dict_singular = d[key_singular]
                        for word, count in dict_plural.items():
                            singular = word[:-1]
                            dict_singular[singular] = (
                                dict_singular.get(singular, 0) + count)
                        merged_plurals[key] = key_singular
                        del d[key]
    fused_cases = {}
    standard_cases = {}
    item1 = itemgetter(1)
    for word_lower, case_dict in d.items():
        # Get the most popular case.
        first = max(case_dict.items(), key=item1)[0]
        fused_cases[first] = sum(case_dict.values())
        standard_cases[word_lower] = first
    if normalize_plurals:
        # add plurals to fused cases:
        for plural, singular in merged_plurals.items():
            standard_cases[plural] = standard_cases[singular.lower()]
    return fused_cases, standard_cases
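
A short usage sketch of the plural-merging behaviour above (hypothetical input; it assumes an en_US dictionary is available):

counts, standard = process_tokens(["Dog", "dog", "dogs", "glass"])
# "dogs" is merged into the singular, keeping the most common case:
#   counts == {"dog": 3, "glass": 1}
# "glass" is untouched because it ends with "ss"
assert standard["dogs"] == "dog"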
Code example #27
import os
from re import match, sub
from nltk import word_tokenize
from enchant import Dict
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
for word in ['rt', 'co', 'amp']:
    stopwords.add(word)
word_dict = Dict("en_US")
stemmer = PorterStemmer()

TOKEN_DIR = './tokenized_corpus/'
RAW_TWEET_DIR = './tweet_corpus/'

if __name__ == '__main__':
    if not os.path.exists(TOKEN_DIR):
        os.mkdir(TOKEN_DIR)
    for ticker in os.listdir(RAW_TWEET_DIR):
        tickerpath = RAW_TWEET_DIR + ticker + '/'
        ticker_token_file = TOKEN_DIR + ticker + '.dat'
        with open(ticker_token_file, 'w') as token_file:
            for filename in os.listdir(tickerpath):
                tweet_id = match(r"(.*)\.dat", filename).group(1)
                content = open(tickerpath + filename).read()
                content = content.lower()
                content = sub(r'\W+', ' ', content)
                tokens = word_tokenize(content)
                tokens = [
                    stemmer.stem(x) for x in tokens
                    if x not in stopwords and len(x) > 1 and word_dict.check(x)
Code example #28
from __future__ import unicode_literals
from random import randint

from django.db import models
from django.contrib.auth.models import User

from enchant import Dict
from enchant.tokenize import get_tokenizer

DICTIONARY = Dict('en_US')
TOKENIZER = get_tokenizer('en_US')


def default_randomness():
    return randint(0, 10000)


class MotionFile(models.Model):
    MARKER_SET_KIT = 0  # do not change values, since they are stored in the DB!
    MARKER_SET_CMU = 1

    class Meta:
        unique_together = ('motion_db_id', 'motion_db_file_id')

    motion_db_id = models.PositiveIntegerField()
    motion_db_file_id = models.PositiveIntegerField()
    filename = models.CharField(max_length=255, unique=True)
    mean_perplexity = models.FloatField(default=0.)
    is_broken_confirmed = models.BooleanField(default=False)
    is_broken_reported = models.BooleanField(default=False)
    marker_set = models.PositiveIntegerField(default=MARKER_SET_KIT)
Code example #29
from enchant import Dict
from enchant.checker import SpellChecker
from enchant.tokenize import EmailFilter, URLFilter

from cloudbot import hook

locale = "en_US"
en_dict = Dict(locale)


@hook.command()
def spell(text):
    """<word/sentence> - Check spelling of a word or sentence."""
    if len(text.split(" ")) > 1:
        # input is a sentence
        checker = SpellChecker(en_dict, filters=[EmailFilter, URLFilter])
        checker.set_text(text)

        is_correct = True
        offset = 0
        for err in checker:
            is_correct = False
            # find the location of the incorrect word
            start = err.wordpos + offset
            finish = start + len(err.word)
            # get some suggestions for it
            suggestions = err.suggest()
            s_string = '/'.join(suggestions[:3])
            s_string = "[h1]{}[/h1]".format(s_string)
            # calculate the offset for the next word
            offset = (offset + len(s_string)) - len(err.word)
Code example #30
# -*- coding: utf-8 -*-

# imports
import re
from enchant import Dict, tokenize

# spellchecking
dictionary = Dict('en_US')
tokenizer = tokenize.get_tokenizer('en_US')


def is_all_titlecased(string):
    '''checks if each word in a string is titlecased'''
    # enchant's tokenizer breaks a string into tokens of its constituent words, organized as tuples of the form (word, start_index)
    # we check only the first character because all-caps strings are not considered title-cased
    return all(token[0][0].istitle() for token in list(tokenizer(string)))


def filter_periods(word):
    '''filter out periods in acronyms/all-caps words'''
    return re.sub(r'\.', '', word)


def is_all_caps(word):
    '''checks if a word is all-caps'''
    # filter out '.' in case word is an acronym of the form 'U.S.A.' rather than 'USA'
    return all(char.istitle() for char in list(filter_periods(word)))


def try_to_fix_case(word):
    '''return top suggestion if it differs from word only in case, otherwise return original word '''