Code example #1
    def __init__(
        self,
        word_list=None,
        pwl_loc="/data/pwl",
    ):
        self.dict = None
        filepath = os.getcwd() + pwl_loc

        print(filepath)

        # load the dictionary for spell correction
        try:
            print("loading pwl")
            self.dict = request_pwl_dict(filepath)
            if not self.dict.check("sigmaaldrich"):
                print("going to build")
                raise IOError
            print("loaded pwl")
        except IOError:
            print("building pwl")
            f = open(filepath, "w+")
            f.write("SAMUELHELMSISAWESOME" + "\n")
            f.close()
            self.load_words(word_list, filepath)
            self.dict = request_pwl_dict(filepath)
Code example #2
def test_UnicodeFN(tmp_path):
    """Test that unicode PWL filenames are accepted."""
    unicode_path = tmp_path / "테스트"
    setPWLContents(unicode_path, ["Lozz"])
    d = request_pwl_dict(str(unicode_path))
    assert d.check("Lozz")
    assert d
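setPWLContents and getPWLContents used in the PWL tests above and below are helpers from the test suite, not part of the enchant API; a minimal sketch of what they might look like, inferred from how they are called:

def setPWLContents(path, contents):
    # Assumed helper: write one word per line into the PWL file
    with open(path, "w") as pwl_file:
        for word in contents:
            pwl_file.write(word + "\n")

def getPWLContents(path):
    # Assumed helper: read the PWL back as a list of words
    with open(path) as pwl_file:
        return [line.strip() for line in pwl_file]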
Code example #3
File: transliterator.py Project: tux4/ibus-lekhanee
 def __init__(self, lang, base_keymap, full_dictionary_file=None):
     self.__lang = lang
     self.__base_keymap = base_keymap
     self.__keymap = self.__generate_full_keymap(base_keymap)
     self.__full_dictionary_file = full_dictionary_file
     if full_dictionary_file:
         self.__dict = enchant.request_pwl_dict(full_dictionary_file)
Code example #4
    def test_checkline(self):
        checker = SpellChecker("en_GB", filters=[URLFilter, EmailFilter])
        pwl = enchant.request_pwl_dict('dict.txt')

        with open('test.txt', 'w+') as tfile:
            self.assertEqual(checkline('Lots of words that are spelt orrectly!',
                                       'filename.txt', False, checker, pwl, tfile, 0), 1)
Code example #5
def test_check(pwl_path):
    """Test that basic checking works for PWLs."""
    setPWLContents(pwl_path, ["Sazz", "Lozz"])
    d = request_pwl_dict(str(pwl_path))
    assert d.check("Sazz")
    assert d.check("Lozz")
    assert not d.check("hello")
Code example #6
File: corpusloader.py Project: vacuumv/AppCat
    def generate_personal_word_list(self, minimal_appear_count=100):
        """
        Generate enchant personal word list (pwl) from all comments
         if a word is not in english but appears
         more than 100 times in all comments
        :return:
        """

        self.dictionary = defaultdict(int)
        d = enchant.Dict('en_US')
        for doc in self.corpus:
            for token in doc:
                self.dictionary[token] += 1

        word_list = [item for item in self.dictionary.items() if not d.check(item[0])]
        word_list = [r for r in sorted(word_list, key=lambda item: item[1], reverse=True) if
                     r[1] > minimal_appear_count]
        dict_pwl = enchant.request_pwl_dict(self.pwl)
        added = 0
        for word in word_list:
            w = str(word[0])
            if not dict_pwl.check(w):
                dict_pwl.add(w)
                added += 1
                print(w)
        log.info("{} words not in English but appears in comments more than {} times.".format(len(word_list),
                                                                                              minimal_appear_count))
        log.info("Of those {} words, {} has already exists in pwl, the rest {} ones has been added to pwl".format(
            len(word_list), len(word_list) - added, added))
Code example #7
File: transliterator.py Project: tux4/libLekhanee
 def __init__(self, lang, base_keymap, full_dictionary_file=None):
     self.__lang = lang
     self.__base_keymap = base_keymap
     self.__keymap = self.__generate_full_keymap(base_keymap)
     self.__full_dictionary_file = full_dictionary_file
     if full_dictionary_file:
         self.__dict = enchant.request_pwl_dict(full_dictionary_file)
Code example #8
import enchant

def artistDict(aVocab):
    ## Creates an enchant dictionary from a list of words.
    ## enchant.request_pwl_dict() technically tries to open a file;
    ## ideally there would be a function that creates the dict alone
    ## (see the sketch after this example).
    dArtist = enchant.request_pwl_dict("_")
    for sWord in aVocab:
        dArtist.add(sWord)
    return dArtist
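Since request_pwl_dict() always needs a real path, one workaround is to hand it a throwaway temporary file and keep the words in the session only, as code example #23 below does with the same trick; the helper name make_word_dict here is hypothetical:

import os
import tempfile

import enchant

def make_word_dict(words):
    # request_pwl_dict() requires a file path, so create a temporary
    # file, build the dictionary from it, then remove the file.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    word_dict = enchant.request_pwl_dict(tmp.name)
    os.unlink(tmp.name)
    for word in words:
        # add_to_session() keeps the word in memory without writing
        # to the (now deleted) PWL file
        word_dict.add_to_session(word)
    return word_dict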
Code example #9
 def setUp(self):
     """Create a shared markspell instance to use for testing"""
     logging.basicConfig(level=logging.CRITICAL,
                         format='%(levelname)6s: %(message)s')
     self.logger = logging.getLogger('markdown-spellchecker')
     pwl = enchant.request_pwl_dict(
         join(dirname(realpath(__file__)), 'dict.txt'))
     self.markspell = MarkSpelling(pwl)
Code example #10
File: helpers.py Project: tsheasha/khadamatak
import os

import enchant

def spell_check( string ):
    """
    returns a list of spell-checked words matching the given string. The first element in the list is the best match
    """
    relative_path = os.path.realpath( os.path.dirname( __file__ ) )
    districts_path = os.path.join( relative_path, "districts_services.txt" )
    dictionary = enchant.request_pwl_dict( districts_path )
    matches = dictionary.suggest( str( string ).lower() )
    return matches
Code example #11
def test_add(pwl_path):
    """Test that adding words to a PWL works correctly."""
    d = request_pwl_dict(str(pwl_path))
    assert not d.check("Flagen")
    d.add("Esquilax")
    d.add("Esquilam")
    assert d.check("Esquilax")
    assert "Esquilax" in getPWLContents(pwl_path)
    assert d.is_added("Esquilax")
Code example #12
File: data.py Project: zl6y19/Data-Mining
def correctword(words):
    # call built-in dictionary and self-added dictionary
    pwl = enchant.request_pwl_dict(
        "/Users/lxy/PycharmProjects/data mining/enwiktionary.txt")
    d_gb = enchant.Dict("en_GB")
    d_g = enchant.DictWithPWL(
        "grc_GR", "/Users/lxy/PycharmProjects/data mining/enwiktionary.txt")

    return [word for word in words if d_gb.check(word) or d_g.check(word)]
Code example #13
def test_suggestions(pwl_path):
    """Test getting suggestions from a PWL."""
    setPWLContents(pwl_path, ["Sazz", "Lozz"])
    d = request_pwl_dict(str(pwl_path))
    assert "Sazz" in d.suggest("Saz")
    assert "Lozz" in d.suggest("laz")
    assert "Sazz" in d.suggest("laz")
    d.add("Flagen")
    assert "Flagen" in d.suggest("Flags")
    assert "sazz" not in d.suggest("Flags")
Code example #14
def loadpwl(filename):
    logger = logging.getLogger('markdown-spellchecker')
    if os.path.exists(filename):
        logger.debug('PWL file found')
        pwl = enchant.request_pwl_dict(filename)
        logger.debug('PWL file loaded')
        return pwl
    else:
        logger.error('PWL file "%s" does not exist', filename)
        sys.exit(1)
    return None
Code example #15
File: starterbot.py Project: cps847group12/A1
def handle_command(command, channel):
    """
        Executes bot command if the command is known
    """
    # Default response is help text for the user
    default_response = "Not sure what you mean. Try *{}* or *{} (city)*.".format(
        ECHO_COMMAND, WEATHER_COMMAND)

    # Finds and executes the given command, filling in response
    response = None

    if command.startswith(ECHO_COMMAND):
        if len(ECHO_COMMAND) == len(command):
            response = "Sure...I need some text to do that!"
        else:
            response = command[command.index(ECHO_COMMAND) +
                               len(ECHO_COMMAND) + 1:]

    elif command.startswith(WEATHER_COMMAND):
        if len(WEATHER_COMMAND) == len(command):
            response = "Sure...I need a city to do that!"
        else:
            dictionary = enchant.request_pwl_dict("cities.txt")
            suggestion = dictionary.suggest(
                command[command.index(WEATHER_COMMAND) + len(WEATHER_COMMAND) +
                        1:])
            requestget = requests.get(
                'http://api.openweathermap.org/data/2.5/weather?q=' +
                suggestion[0].replace(" ", "%20") +
                '&units=metric&appid=bbb393e2a17ca6ff2a90939e14b836e2')
            if requestget.status_code == 200:
                responsedata = requestget.json()
                response = ('Today\'s weather for: ' + responsedata['name'] +
                            ', ' + responsedata['sys']['country'] +
                            '\nDescription: ' + responsedata['weather'][0]['main'] +
                            ', ' + responsedata['weather'][0]['description'] +
                            '\nTemperature in Celsius: ' +
                            "{0:.2f}".format(responsedata['main']['temp']) +
                            '\nMinimum Temperature in Celsius: ' +
                            "{0:.2f}".format(responsedata['main']['temp_min']) +
                            '\nMaximum Temperature in Celsius: ' +
                            "{0:.2f}".format(responsedata['main']['temp_max']) +
                            '\nHumidity: ' + str(responsedata['main']['humidity']) +
                            '%\nWind: ' +
                            "{0:.2f}".format(responsedata['wind']['speed']) + ' meters/sec')
            else:
                response = "Unfortunately...I do not recognize the city"

    # Sends the response back to the channel
    slack_client.api_call("chat.postMessage",
                          channel=channel,
                          text=response or default_response)
Code example #16
File: f2s.py Project: daviskeene/TherAIpest
import random
import re

import enchant
from nltk import tokenize

def f2s(line):
    rep = {"'":"","I ": "you ", "am ": "are ", "my": "your",
           "we ": "they", " us": "you ","Because": "",
           "because": ""," me ": " you ","im":"you're","I'm":"you're",
           "and":"","but":"","for":"","if":"","or":"","when":"","My":"your"}

    #Replace substrings in line with those in rep
    rep = dict((re.escape(k), v) for k, v in rep.items())
    pattern = re.compile("|".join(rep.keys()))

    b = pattern.sub(lambda m: rep[re.escape(m.group(0))], line)
    b = random.choice(tokenize.sent_tokenize(b))

    #Split the sentence on commas ("because" was already removed by rep)
    if "," in b:
        split = b.split(",")

        #Spell check
        pwl = enchant.request_pwl_dict("mywords.txt")
        d2 = enchant.DictWithPWL("en_US","mywords.txt")
        #If there is a comma, choose the side with the most words to pass to the chat function.
        len_choice = max(split,key=len)
        #Strip the leading space left behind by the comma split.
        if len_choice.startswith(" "):
            len_choice = len_choice.replace(" ","",1)
        print(d2.check(b))
        print(len_choice)

        return len_choice.lower()
    else:
        # Spell check
        pwl = enchant.request_pwl_dict("mywords.txt")
        d2 = enchant.DictWithPWL("en_US", "mywords.txt")
        # No comma: pass the whole sentence to the chat function.
        return b.lower()
Code example #17
File: fabfile.py Project: edwinsteele/wordspeak.org
def spellchecker(is_interactive_deploy=True):
    """Spellcheck the Markdown and ReST files on the site"""

    spelling_errors_found = False

    # aspell is available on mac by default, and I don't want to manage custom
    #  word lists for both aspell and myspell so we'll just use aspell
    enchant._broker.set_ordering("en_GB", "aspell")
    pwl_dictionary = enchant.request_pwl_dict(SPELLCHECK_EXCEPTIONS)
    en_spellchecker = enchant.checker.SpellChecker(
        "en_GB", filters=[enchant.tokenize.EmailFilter, enchant.tokenize.URLFilter]
    )
    md_posts = glob.glob(os.path.join(SITE_BASE, "posts", "*.md"))
    md_pages = glob.glob(os.path.join(SITE_BASE, "stories", "*.md"))

    for file_to_check in md_pages + md_posts:
        with open(file_to_check, "r", encoding="utf-8") as f:
            lines = f.readlines()

        e = _get_spellcheck_exceptions(lines)
        list(map(pwl_dictionary.add_to_session, e))
        for line in _non_directive_lines(lines):
            en_spellchecker.set_text(strip_markdown_directives(line))
            for err in en_spellchecker:
                if not pwl_dictionary.check(err.word):
                    spelling_errors_found = True
                    spelling_error = "Not in dictionary: %s (file: %s " "line: %s). Suggestions: %s" % (
                        err.word,
                        os.path.basename(file_to_check),
                        lines.index(line) + 1,
                        ", ".join(en_spellchecker.suggest(err.word)),
                    )
                    print(spelling_error)
                    if is_interactive_deploy:
                        action = prompt(
                            "Add '%s' to dictionary [add] or " "replace [type replacement]?" % (err.word,),
                            default="add",
                        ).strip()
                        if action == "add":
                            _add_to_spellcheck_exceptions(file_to_check, err.word)
                            pwl_dictionary.add(err.word)
                        else:
                            _replace_in_file(file_to_check, err.word, action)
                    else:
                        _send_pushover_summary(spelling_error, "Spelling error: %s" % (err.word,))

    return spelling_errors_found
Code example #18
import enchant

def spellCorrect(str_input):
    str_input = removeRedundantWhiteSpaces(str_input)
    str_input = toLowerCase(str_input)
    str_input = removePunctuations(str_input).split(" ")
    out_array = []
    # build the PWL dictionary once, not on every loop iteration
    d = enchant.request_pwl_dict("words.txt")
    for word in str_input:
        isword = d.check(word)
        if isword:
            out_array.append(word)
        else:
            word_list = d.suggest(word)
            out = suggestions(word_list, word)
            out_array.append(out)
    output = ' '.join(out_array)

    return output
Code example #19
    def detectErrors(self, string):
        '''
        @brief Get all spelling errors in a given string.
        @return False if the string has no errors, otherwise a dict
                containing the error words under 'errorWord'.
        '''

        spellChecker = SpellChecker('es',
                                    filters=[
                                        EmailFilter, URLFilter,
                                        HtmlEntitiesFilter,
                                        sprintfParametersFilter
                                    ])
        spellChecker.set_text(string)

        PWL_es = enchant.request_pwl_dict(self.es_PWL_path)

        errors = False

        for err in spellChecker:
            # Skip words that are valid English; many words in the text
            # are English and would otherwise be flagged as errors
            if self.dict_en_WP_PWL.check(err.word) is True:
                continue

            # Skip words present in the Spanish PWL
            if PWL_es.check(err.word) is True:
                continue
            # convert errors to a dictionary only once
            if not isinstance(errors, dict):
                errors = {}
                errors['errorWord'] = list()

            errors['errorWord'].append(err.word)

            # Add the bad word to the list
            self.addNewBadWord(err.word)

        return errors
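sprintfParametersFilter passed to SpellChecker above is a project-specific filter that is not shown in this snippet; a plausible sketch following pyenchant's Filter API (the exact regex is an assumption):

import re

from enchant.tokenize import Filter

class sprintfParametersFilter(Filter):
    """Skip tokens that look like sprintf parameters such as %s or %1$s."""

    _pattern = re.compile(r"%(\d+\$)?[sdif]")

    def _skip(self, word):
        # Returning True tells the tokenizer to drop this token
        return bool(self._pattern.match(word))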
Code example #20
import re

import enchant

def ccchecker(dictionary):
    try:
        #the goal is to make sure all the course codes are correct
        pattern=re.compile(r'\d\w+')#pattern to get the course codes
        pwl=enchant.request_pwl_dict("/Users/damola/Desktop/  ")#coursecodes spellcheck
        for i in dictionary:#iterates over the dictionary
            count=0
            for j in dictionary[i]:#iterates over the dictionary values for the key i


                if re.search(pattern,j):#checks if the search is valid then allows the code below to run
                    pin=re.search(pattern,j)#assigns the variable pin to the coursecode
                    if len(pin.group())>=4:#checks if it has the appropriate amount of characters
                        dictionary[i][count]=j.replace(j[pin.span()[0]:pin.span()[1]],pwl.suggest(pin.group())[0])#replaces the coursecode with a corrected version
                count=count+1
        return dictionary
    except Exception:
        print('CCCHECKER did not work')
        return dictionary
Code example #21
import os
import re

import enchant

def timecheck(dictionary):#this checks the times
    try:
        pattern=re.compile(r'(\S+)([AP]M)')#pattern to find the first time in the textblock from the user(the times are gotten in the following format ('11:30','AM'))
        pattern1=re.compile(r'(\w+):(\w+)')#pattern to group the time codes from the textblock into  for example (11,30)
        pwl=enchant.request_pwl_dict(os.path.expanduser("~/Desktop/timecodes.txt"))#loads the timecode spellcheck
        for i in dictionary:#iterates over the dictionary
            count=0
            for j in dictionary[i]:#iterates over the dictionary values for the key i
                    if re.search(pattern,j):#checks if the search is valid then allows the code below to run
                            length=re.findall(pattern,j)#finds all the times in the textblock
                            if len(length)==1:#if the AM or PM are not spelt correctly this corrects it
                                AMPM=length[0][1]#AM or PM

                                listoftimes=re.findall(pattern1,j)#group the time codes from the textblock into  for example (11,30)
                                listoftimes1=listoftimes[0][1]
                                print(listoftimes1)
                                listoftimes2=listoftimes[1][1]
                                time1=pwl.suggest(listoftimes1)
                                print(time1)
                                time2=pwl.suggest(listoftimes2)
                                if len(time1)==2:
                                    for k in time1:
                                        if AMPM in k:
                                            index=time1.index(k)
                                    pin=re.search(pattern1,j)
                                    dictionary[i][count]=j.replace(j[pin.span()[0]:pin.span()[1]],'%s:%s'%(re.search(pattern1,j).groups()[0],pwl.suggest(re.search(pattern1,j).groups()[1])[index]))

                                elif len(time2)==2:
                                    for k in time2:
                                        if AMPM in k:
                                            index=time2.index(k)
                                    pin=re.search(pattern1,j)
                                    dictionary[i][count]=j.replace(j[pin.span()[0]:pin.span()[1]],'%s:%s'%(re.findall(pattern,j)[1][0],pwl.suggest(re.findall(pattern,j)[1][1])[index]))




                    count=count+1
        return dictionary
    except Exception:
        print('timecheck did not work')
        return dictionary
Code example #22
import enchant

def check_spelling(meme):
    """
    Spell checks the meme in the message and see if it matches any of the 
    available memes in our meme list.

    Return: If correct spelling is found or everything is spelled correctly, return True and word.
    Else, return False

    """

    word_list = enchant.request_pwl_dict("meme_list")
    meme_dict = enchant.DictWithPWL("en_US", "meme_list")
    suggestions = meme_dict.suggest(meme)
    max_ratio = 0.0
    highest_word = ""

    for suggestion in suggestions:
        temp_ratio = similarity(meme, suggestion)
        if temp_ratio > max_ratio:
            max_ratio = temp_ratio
            highest_word = suggestion

    return highest_word
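The similarity() helper is not shown in this snippet; a minimal sketch of what it might look like using difflib (an assumption, not the project's actual implementation):

from difflib import SequenceMatcher

def similarity(a, b):
    # Ratio in [0, 1]; higher values mean the strings are more alike
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()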
Code example #23
File: check_spell_ec.py Project: KDE/pology
def _create_checker(providers, langtag, words):

    try:
        import enchant
    except ImportError:
        pkgs = ["python-enchant"]
        raise PologyError(
            _("@info", "Python wrapper for Enchant not found, "
              "please install it (possible package names: "
              "%(pkglist)s).",
              pkglist=format_item_list(pkgs)))

    if langtag is not None:
        try:
            broker = enchant.Broker()
            if providers is not None:
                broker.set_ordering(langtag, providers)
            checker = broker.request_dict(langtag)
            checker.check(".")
        except:
            checker = None
    else:
        tmpf = tempfile.NamedTemporaryFile()
        tmpf.close()
        checker = enchant.request_pwl_dict(tmpf.name)
        os.unlink(tmpf.name)

    if checker:
        pname = checker.provider.name.split()[0].lower()
        need_upcasing = (pname in ("personal", "myspell"))
        for word in words or []:
            checker.add_to_session(word)
            if need_upcasing:
                checker.add_to_session(word[0].upper() + word[1:])
                checker.add_to_session(word.upper())
    return checker
Code example #24
File: pokedex.py Project: p-met95/Gorzon
def suggester(word, dct):
    dictio = enchant.request_pwl_dict(f"/app/bot/dictionaries/{dct}.txt")
    if dictio.check(word):
        return True
    else:
        return dictio.suggest(word)
Code example #25
File: noahcli.py Project: captflint/noah
import lzma
import pickle
import curses
import random
import enchant
from index import index
from doubleentries import doubleentries
with lzma.open('webster.txt.xz', 'rt') as infile:
    webster = infile.read()
wordlist = enchant.request_pwl_dict('wordlist.txt')

# initialize terminal screen
screen = curses.initscr()
curses.noecho()
curses.cbreak()
screen.keypad(True)
screen.refresh()

def display(textchunk):
    screen.clear()
    height = screen.getmaxyx()[0]
    command = 0
    offset = 0
    screen.scrollok(True)
    while chr(command) not in 'Qq':
        screen.clear()
        dispStr = ""
        lineCounter = 0
        for char in textchunk:
            if char == "\n":
                lineCounter += 1
Code example #26
 def __init__(self, dict_name='en_US', max_dist=2):
     if dict_name == 'en_US':
         self.spell_dict = enchant.Dict(dict_name)
     else:
         self.spell_dict = enchant.request_pwl_dict(dict_name)
     self.max_dist = max_dist
Code example #27
from gluon.html import *
import re, enchant
DIGITS = re.compile(r'\d')
INFRANK = re.compile(r'(\s+(ssp|subsp|var|forma|f)([.]?)(?=\s+))', re.I)
NAMES = 'applications/phylografter/static/names.txt'
d = enchant.request_pwl_dict(NAMES)

def check(s):
    try:
        return d.check(s)
    except Exception:
        print('spellcheck.check error', s)
        return None

def suggest(s):
    return d.suggest(s)

def process_label(db, otu):
    options = []
    s = otu.label.replace('_', ' ')
    if check(s):
        options = list(db(db.ott_name.name==s).select())
        return (True, options)
    v = suggest(s)
    if not v:
        words = s.replace('.',' ').split()
        if words[-1].lower() == 'sp':
            words = words[:-1]
        s = DIGITS.sub('', ' '.join(words))
        if not s: return (False, options)
        if check(s):
            options = list(db(db.ott_name.name==s).select())
Code example #28
File: rulegen.py Project: zard777/hate_crack
 def load_custom_wordlist(self, wordlist_file):
     self.enchant = enchant.request_pwl_dict(wordlist_file)
Code example #29
#control TOR and establish new identity, assigns user new IP
def newIdentity():
    socks.setdefaultproxy()
    s= socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(("127.0.0.1", 9051))
    s.send("AUTHENTICATE\r\n")
    response= s.recv(128)
    if response.startswith("250"):
        s.send("SIGNAL NEWNYM\r\n")
    s.close()
    connectTor()


flagged_queries= set()
flag_wan_dom= set()
pwl = enchant.request_pwl_dict("enchantAddList.txt")


from bulkwhois.shadowserver import BulkWhoisShadowserver

#retrieved ASN rankings by number of ASes in system from http://as-rank.caida.org/
def getASNRankings():
    totalNumASes= 44086
    asn_rank_dic= {}
    for line in open("ASNRankingsByAS.txt", "rU"):
        line.strip()
        line= line.split(",")
        rank= line[0]
        rank= int(rank)
        asn= line[1]
        num_ASes= line[2]
Code example #30
 def __init__(self, request, pwldir=None):
     SpellChecker.__init__(self, request, pwldir) 
     if self._haspwl():
         self.__pwldic = enchant.request_pwl_dict(self._pwlfilename())
Code example #31
import math

import enchant
import pandas
from nltk.util import everygrams

def ffeatures(domain):
    domain = cut_extend(domain)
    ts_bigrams = pandas.read_csv("all_bigrams.csv",
                                 converters={i: str
                                             for i in range(54872)})
    ts_trigrams = pandas.read_csv("all_trigrams.csv",
                                  converters={i: str
                                              for i in range(54872)})
    ds_bigrams = pandas.read_csv("bigrams_sorted.csv",
                                 converters={i: str
                                             for i in range(54872)})
    ds_trigrams = pandas.read_csv("trigrams_sorted.csv",
                                  converters={i: str
                                              for i in range(54872)})
    N = 1444
    M = 54872
    hexchars = "0123456789abcdefABCDEF"
    frequent = {
        "a": 9.35,
        "b": 2.27,
        "c": 3.87,
        "d": 3.26,
        "e": 9.69,
        "f": 1.67,
        "g": 2.4,
        "h": 2.56,
        "i": 7.4,
        "j": 0.55,
        "k": 1.9,
        "l": 4.65,
        "m": 3.37,
        "n": 6.12,
        "o": 7.28,
        "p": 2.91,
        "q": 0.21,
        "r": 6.44,
        "s": 6.48,
        "t": 6.13,
        "u": 3.23,
        "v": 1.37,
        "w": 1.2,
        "x": 0.67,
        "y": 1.67,
        "z": 0.68,
        "0": 0.18,
        "1": 0.24,
        "2": 0.23,
        "3": 0.15,
        "4": 0.16,
        "5": 0.1,
        "6": 0.09,
        "7": 0.09,
        "8": 0.1,
        "9": 0.08,
        ".": 0,
        "-": 1.26
    }
    domain = str(domain)
    domain = domain.lower()
    found_bi = []
    num_found_bi = []
    found_tri = []
    num_found_tri = []

    # find f1
    for j in range(len(domain) - 1):
        bi = domain[j:j + 2]
        if (bi not in found_bi):
            res = ds_bigrams[ds_bigrams["Bigrams"] == bi]
            if not res.empty:
                found_bi.append(bi)
                num_found_bi.append(1)
        else:
            pos = found_bi.index(bi)
            num_found_bi[pos] = num_found_bi[pos] + 1
    f1 = len(found_bi)

    # find f9
    for j in range(len(domain) - 2):
        tri = domain[j:j + 3]

        if (tri not in found_tri):
            res = ds_trigrams[ds_trigrams["Trigrams"] == tri]
            if not res.empty:
                found_tri.append(tri)
                num_found_tri.append(1)
        else:
            pos = found_tri.index(tri)
            num_found_tri[pos] = num_found_tri[pos] + 1
    f9 = len(found_tri)

    # find f2
    f2 = 0
    for i in range(f1):
        index = ts_bigrams.index[ts_bigrams["Bigrams"] == found_bi[i]][0] + 1
        f2 = f2 + (num_found_bi[i] * index)

    # find f10
    f10 = 0
    for i in range(f9):
        index = ts_trigrams.index[ts_trigrams["Trigrams"] ==
                                  found_tri[i]][0] + 1
        f10 = f10 + (num_found_tri[i] * index)

    #find f3
    f3 = 0
    for i in range(f1):
        vt = ds_bigrams.index[ds_bigrams["Bigrams"] == found_bi[i]][0] + 1
        f3 = f3 + (num_found_bi[i] * vt)
    f3 = f3 / f1

    #find f11
    f11 = 0
    for i in range(f9):
        vt = ds_trigrams.index[ds_trigrams["Trigrams"] == found_tri[i]][0] + 1
        f11 = f11 + (num_found_tri[i] * vt)
    f11 = f11 / f9

    #find f4
    f4 = f2 / len(domain)

    #find f12
    f12 = f10 / len(domain)

    #find f5
    f5 = f3 / len(domain)

    #find f13
    f13 = f11 / len(domain)

    #find f6
    f6 = f1 / len(domain)

    #find f14
    f14 = f9 / len(domain)

    #find f7
    f7 = 0
    for i in range(f1):
        f7 = f7 + num_found_bi[i]
    f7 = f7 / len(domain)

    #find f15
    f15 = 0
    for i in range(f9):
        f15 = f15 + num_found_tri[i]
    f15 = f15 / len(domain)

    #find f8
    f8 = 0
    for i in range(f1):
        vt = ds_bigrams.index[ds_bigrams["Bigrams"] == found_bi[i]][0] + 1
        f8 = f8 + ((vt / N) * math.log10(vt / N))
    f8 = -f8

    #find f16
    f16 = 0
    for i in range(f9):
        vt = ds_trigrams.index[ds_trigrams["Trigrams"] == found_tri[i]][0] + 1
        f16 = f16 + ((vt / M) * math.log10(vt / M))
    f16 = -f16

    #find f17
    f17 = 0
    vowels = ['a', 'e', 'i', 'o', 'u']
    for i in range(len(domain)):
        if (domain[i] in vowels):
            f17 = f17 + 1

    #find f18
    f18 = f17 / len(domain)

    #find f19
    f19 = 0
    if (len(domain) == 1):
        f19 = 1
    else:
        for i in range(len(domain)):
            if (domain[i] not in hexchars):
                if (i != 0):
                    f19 = 1
                    break

    #find f20
    unq_chars = ''.join(set(domain))
    f20 = 0
    for char in unq_chars:
        f20 = f20 + (domain.count(char) * frequent[char])
    f20 = f20 / len(domain)

    #find f21
    # d = enchant.Dict("en_US")
    d = enchant.request_pwl_dict("wordlist.txt")
    kq = [
        ''.join(_ngram) for _ngram in everygrams(domain)
        if d.check(''.join(_ngram)) and len(_ngram) > 1
    ]
    f21 = len(kq)

    #find f22
    f22 = 1 if len(domain) < 5 else 0

    dfObj = pandas.DataFrame(columns=[
        'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11',
        'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21',
        'f22'
    ])
    dfObj.loc[0] = [
        f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16,
        f17, f18, f19, f20, f21, f22
    ]
    return dfObj
Code example #32
File: tests.py Project: stfc/markdown-spellchecker
 def setUp(self):
     """Create a shared markspell instance to use for testing"""
     logging.basicConfig(level=logging.CRITICAL, format='%(levelname)6s: %(message)s')
     self.logger = logging.getLogger('markdown-spellchecker')
     pwl = enchant.request_pwl_dict(join(dirname(realpath(__file__)), 'dict.txt'))
     self.markspell = MarkSpelling(pwl)
Code example #33
         n = len(words)
         for word in words:
             if word:
                 if word in all_words_list:
                     all_words_list[word] = all_words_list[word] + 1
                 else:
                     all_words_list[word] = 1
     for word in all_words_list:
         idf_list[word] = idf(word, corpus)
 m = len(all_words_list)
 # print all_words_list
 all_words_list = sortDict(all_words_list)
 # newlist = sorted(newlist.items(), key=operator.itemgetter(1), reverse=True)
 # all_words_list = sorted(all_words_list.iterkeys())
 # print all_words_list
 pwl = enchant.request_pwl_dict("big.txt")
 print('enter query')
 a = input()
 a = queryCorrector(a, pwl)
 print(type(a))
 print(a)
 x = getRelativeDocs(a, all_words_list, idf_list)
 # x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)[:30]
 # newList = {}
 # for i in range(30):
 #     print str(x[i][0])+' '+str(x[i][1])
 # newList[x[i][0]] = popularity[x[i][0]]
 # newList = sorted(newList.items(), key=operator.itemgetter(1), reverse=True)
 # for i in range(30):
 #     flag = True
 #     arr = a.split()
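queryCorrector() is defined elsewhere in this project; a hedged sketch of the behaviour implied by its usage here (correcting each query word against the PWL):

def queryCorrector(query, pwl):
    # Assumed behaviour: replace each word the PWL doesn't recognise
    # with its top suggestion, falling back to the original word
    corrected = []
    for word in query.split():
        if pwl.check(word):
            corrected.append(word)
        else:
            suggestions = pwl.suggest(word)
            corrected.append(suggestions[0] if suggestions else word)
    return ' '.join(corrected)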
Code example #34
File: TT_main.py Project: memedum90/TurkuazTurchese-
#GLOBAL VARIABLES

tm = time.time()
# Global list of tweets: every element is a dictionary
archive_list = []

#List of conversations and user in conversations flames or not
flames = []
noflames = []
flamesU = []
noflamesU = []

# Add as dictionary the English language plus the bad words
pwl = enchant.request_pwl_dict('dicts/bad-words.txt')
Dict = enchant.DictWithPWL('en_US', 'dicts/bad-words.txt')
slg = mount_slang_dict()

# Measurements used in feature evaluation
fla_upp = 0.0
nfla_upp = 0.0
tmp_upp = 0.0
fla_mrk = 0.0
nfla_mrk = 0.0
tmp_mrk = 0.0
fla_gsm = 0.0
nfla_gsm = 0.0
tmp_gsm = 0.0
fla_bsm = 0.0
nfla_bsm = 0.0
Code example #35
unicode_chars = (chr(i) for i in range(sys.maxunicode))
control_unicode_chars = ''.join(c for c in unicode_chars
                                if unicodedata.category(c) == 'Cc')
control_char_re = re.compile('[%s]' % re.escape(control_unicode_chars))


def remove_control_chars(s):
    return control_char_re.sub('', s)


# ====================================================================================== #
# Delongation
us_dict = enchant.Dict("en_US")
uk_dict = enchant.Dict("en_GB")
au_dict = enchant.Dict("en_AU")
twitter_dict = enchant.request_pwl_dict(
    os.path.join(os.path.dirname(__file__), 'assets', 'twitter_jargon.txt'))


def is_known_word(word):
    return us_dict.check(word) \
           or uk_dict.check(word) \
           or au_dict.check(word) \
           or twitter_dict.check(word)


delongate_pattern = re.compile(r"(.)\1{2,}")


def delongate(text):
    return delongate_pattern.sub(r"\1\1", text)
Code example #36
    ("israel", "Israel"),
)

# List of dictionary objects to test
dicts = []
# Number of correct words missed by each dictionary
missed = []
# Number of corrections not suggested by each dictionary
incorrect = []
# Number of places to find correct suggestion, or -1 if not found
dists = []

# Create each dictionary object
for prov in providers:
    if prov == "pypwl":
        d = enchant.request_pwl_dict(wordsfile)
    else:
        b = enchant.Broker()
        b.set_ordering(lang, prov)
        d = b.request_dict(lang)
        if not d.provider.name == prov:
            raise RuntimeError("Provider '%s' has no dictionary for '%s'" %
                               (prov, lang))
        del b
    dicts.append(d)
    missed.append([])
    incorrect.append([])
    dists.append([])

# Actually run the tests
testcases = open(datafile, "r")
Code example #37
CONFIGFILE.read(CONFIGFILECOMPLETEPATH)
CONFIGFILE.read(os.path.join(DIRECTORY_TESTS, 'config.ini'))
DEFAULTCONFIGFILE = CONFIGFILE['DEFAULT']
DIRECTORY_ROOT = os.path.dirname(DIRECTORY_TESTS)
FILENAME_JSONSCORE = DEFAULTCONFIGFILE['Prevscore']
FILENAME_PWL = DEFAULTCONFIGFILE['PWL']
if not os.path.isabs(FILENAME_JSONSCORE):
    FILENAME_JSONSCORE = os.path.join(
        DIRECTORY_TESTS, DEFAULTCONFIGFILE['Prevscore'])
if not os.path.isabs(FILENAME_PWL):
    FILENAME_PWL = os.path.join(DIRECTORY_TESTS, DEFAULTCONFIGFILE['PWL'])

#print()
if os.path.exists(FILENAME_PWL):
    print("\033[1;36mPWL file exists\033[0m")
    pwl = enchant.request_pwl_dict(FILENAME_PWL)
    #print("Loaded PWL object: %s" % pwl)
    #print("Methods of object: %s" % dir(pwl))
else:
    print("\033[1;36mPWL file does not exist\033[0m")
    sys.exit(2)
# add words to the dictionary used to test for spelling errors
spellcheck = SpellChecker("en_GB", filters=[URLFilter, EmailFilter])
wordswrong = open(CONFIGFILE['DEFAULT']['Wordswrongfile'], "w+")
# creates/opens a file to save the words that were spelt wrong
filecheck = open(CONFIGFILE['DEFAULT']['Filecheck'], "w+")
# creates/opens a file to save the files that were checked


def main():
    parser = argparse.ArgumentParser(
Code example #38
import enchant

f = open("customWords.txt",'w')
f.close()

pwl = enchant.request_pwl_dict("customWords.txt")
d = enchant.Dict("en_US")
singleLetters = ['a','i']

def validateLeft(variable):
	return d.check(variable) or pwl.check(variable)

def compute(variable, parent):
	if variable == '': return True
	if len(variable) == 1: return []
	tempRes = []
	for i in reversed(range(1,len(variable)+1)):
		resLeft = validateLeft(variable[:i])
		if resLeft:

			resRight = compute(variable[i:],variable[:i])
			if(resRight == True and i!=1): tempRes.append([[variable[:i].lower()]]) # Don't add single letter entries
			elif(len(variable[:i]) == 1 and variable[:i].lower() not in singleLetters): pass # If single letter entries are not 'i' or 'a', move on
			else: # Recursively check for all words in right part of entries
				for y in resRight:
					innerRes = [variable[:i]]
					for entry in y:
						innerRes+=entry
					tempRes.append([innerRes])
	return tempRes
Code example #39
import os
import enchant
from enchant.checker import SpellChecker

#start using python -i enchant_console.py

# combine with the US dictionary
#pdict = enchant.DictWithPWL("en_US","wordlist.txt")

pdict = enchant.request_pwl_dict("wordlist.txt")
chkr = SpellChecker(pdict)

def SuggestSpelling(theword):
	return chkr.suggest(theword)

def CheckText(thetext):
	chkr.set_text(thetext)
	rsp = []
	for err in chkr:
		rsp.append(err.word)
	return rsp


'''
chkr.set_text("pelvi frecture")
for err in chkr:
	print "ERROR:", err.word

'''
Code example #40
  # count word frequency
  freqmap = {}
  for w in arr_word:
    if w not in freqmap:
      freqmap[w] = 0
    freqmap[w] += 1

  # print freqmap

  # build dict file
  dict_words = [w for w in freqmap if freqmap[w] >= DICT_THRESHOLD]

  # print 'Constructing dict..'
  # corpus_dict = enchant.Dict()
  corpus_dict = enchant.request_pwl_dict(TMP_PREFIX + docid)
  for w in dict_words: 
    corpus_dict.add_to_pwl(w)

  # print corpus_dict
  # print 'Done constructing dict.'

  # Data munging
  last_varid = arr_varid[0]
  last_candid = arr_candid[0]
  last_source = arr_source[0]
  data = []
  thisvar = []
  thiscand = []

  # GENERATE DATA with one pass
Code example #41
File: chat_bot_main.py Project: yuvapawan/Dr.RK_Bot
        return_list.append((classes[r[0]], r[1]))
    # return tuple of intent and probability
    return return_list

def output_message(intents, sentence, model, userID='123', show_details=False):
    results = classify(intents, sentence, model)
    # if we have a classification then find the matching intent tag
    if results:
        # loop as long as there are matches to process
        while results:
            for i in intents['intents']:
                # find a tag matching the first result
                if i['tag'] == results[0][0]:
                    # a random response from the intent
                    return print("Dr. RK -> ",random.choice(i['responses']))

            results.pop(0)
            
#d = enchant.Dict("en_GB")
d = enchant.request_pwl_dict("english_dict.txt")
model = trained_model(intents)

while True:
    sentence = input("")
    if sentence.lower().strip() == "bye":
        break
    else:
        output = output_message(intents, sentence, model)


Code example #42
    "ms": ["ms_MY"],
    "sge": [],
    "zh": []
    # "sge" and "zh" handled with personal word lists below
}
# --- Corresponding dictionaries ---
spelling_dictionaries = {}
for language in spelling_languages.keys():
    spelling_dictionaries[language] = {}
    for variant in spelling_languages[language]:
        spelling_dictionaries[language][variant] = enchant.Dict(variant)
# --- SgE word lists ---
spelling_dictionaries["sge"] = {}
sge_lists = sge_words + sge_chinese_derived_words + sge_malay_derived_words
for wordlist in sge_lists:
    spelling_dictionaries["sge"][wordlist] = enchant.request_pwl_dict(wordlist)
# --- Additional word list handling ---
# Count Chinese-derived words in SgE as Chinese
for wordlist in sge_chinese_derived_words:
    spelling_dictionaries["zh"][wordlist] = enchant.request_pwl_dict(wordlist)
for wordlist in sge_malay_derived_words:
    spelling_dictionaries["ms"][wordlist] = enchant.request_pwl_dict(wordlist)

def extract_features(sentence):
    tokenised = tokenise(sentence)
    tokenised_spellcheck = prep_tokens_for_spellcheck(tokenised)
    features = {}
    ## Primary features
    # Chinese
    features["has_zh_chars"] = has_zh_chars(sentence)
    features["has_pinyin"] = has_pinyin(tokenised_spellcheck)
Code example #43
def parse_options():
    parser = OptionParser()
    usage = 'usage: brxor.py [options] <file>'
    parser = OptionParser(usage=usage)
    parser.add_option('-k',
                      '--key',
                      action='store',
                      dest='key',
                      type='string',
                      help='Static XOR key to use')
    parser.add_option('-f',
                      '--full',
                      action='store_true',
                      dest='full',
                      help='XOR full file')
    parser.add_option(
        '-d',
        '--dict',
        action='store',
        dest='user_dict',
        help="User supplied dictionary, one word per line (Default: 'en_us')")
    parser.add_option('-l',
                      '--length',
                      action='store',
                      dest='length',
                      default=4,
                      help="Minimum word length to use (Default: 4)")
    parser.add_option('-v',
                      '--verbose',
                      action='store_true',
                      dest='verbose',
                      help='Increase verbosity of output.')
    (options, args) = parser.parse_args()

    global word_dict

    # Test for Args
    if len(sys.argv) < 2:
        parser.print_help()
        return
    # Test that the full option contains a XOR Key
    if options.full is not None and options.key is None:
        print('[ERROR] --FULL OPTION MUST INCLUDE XOR KEY')
        return
    # XOR the full file with key
    if options.full is not None and options.key is not None:
        sys.stdout.write(xor(f.read(), options.key))
        return
    if options.user_dict:
        # check for file
        if os.path.isfile(options.user_dict) and os.access(
                options.user_dict, os.R_OK):
            word_dict = enchant.request_pwl_dict(options.user_dict)
        else:
            print('[ERROR] FILE CAN NOT BE OPENED OR READ!\n')
            print(usage)
            sys.exit(1)
    else:
        word_dict = enchant.Dict('en_US')

    # Parse file for regular expressions
    return options
Code example #44
documents = prepare_data()

# Magic happening with term-document matrix
cv = CountVectorizer(
    lowercase=True,
    binary=True,
    token_pattern=r'(?u)\b\w+\b',
    ngram_range=(1, 3)
)  # 1gram: ngram_range=(1,1), 2gram: ngram_range=(2,2), from 1gram to 3gram: ngram_range=(1, 3) ... etc
sparse_matrix = cv.fit_transform(documents)
dense_matrix = sparse_matrix.todense()
td_matrix = dense_matrix.T
sparse_td_matrix = sparse_matrix.T.tocsr()
t2i = cv.vocabulary_  # dictionary of terms
terms = cv.get_feature_names()
terms_textfile = enchant.request_pwl_dict(
    "data100_wordlist_3gram.txt")  # Defining a personal wordlist (= pwl)
unknownword_list = []  # for similar word suggestions

# TF-IDF
tfv5 = TfidfVectorizer(lowercase=True,
                       sublinear_tf=True,
                       use_idf=True,
                       norm="l2",
                       token_pattern=r'(?u)\b\w+\b')
sparse_matrix = tfv5.fit_transform(documents).T.tocsr()

# Make a file of list of words
f = open("data100_wordlist_3gram.txt", "w")
for k, v in t2i.items():
    f.write(k + "\n")
f.close()
Code example #45
import enchant
import nltk
import numpy
import pickle
import re
import sys

from multiprocessing import Pool
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer


CHAT_WORDS = nltk.corpus.nps_chat.words() 
ENGLISH_WORDS = nltk.corpus.words.words()
PWL = enchant.request_pwl_dict('CHAT_WORDS')
DICT = enchant.DictWithPWL("en_US", 'chat_words')


# First removes punctuations and numbers and adds in a tag for each word.
# Then removes the list of stop words
# given from nltk. Then uses the lemmatizer then the Lancaster stemmer in
# order to get the root of each words. Lemmatizer forms actual words
# from the word list dictionary so the words it forms are actual words,
# but it doesn't get rid of every stems. However, the Lancaster removes
# more stems, but it doesn't check with a dictionary and many of the words
# changed are not actual words.
def remove_stems(file):
    new_file = []
    punctuation = re.compile(r'[.,"?!:;]')
    lemmatizer = WordNetLemmatizer()
Code example #46
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords

stuff_to_be_removed = list(stopwords.words("english")) + list(punctuation)
import enchant, difflib
import pandas as pd
import numpy as np
import re
import pickle
import random
import nltk

file_path = "words.txt"
dictionary = enchant.request_pwl_dict(file_path)

data = pd.read_excel("data/sampledata_v2.xlsx")
for i in data.columns:
    data[i] = data[i].str.lower()


class color:
    PURPLE = "\033[95m"
    CYAN = "\033[96m"
    DARKCYAN = "\033[36m"
    BLUE = "\033[94m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    RED = "\033[91m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
Code example #47
def test_UnicodeCharsInPath(tmp_path):
    """Test that unicode chars in PWL paths are accepted."""
    _fileName = r"test_\xe5\xe4\xf6_ing"
    path = tmp_path / _fileName
    d = request_pwl_dict(str(path))
    assert d
Code example #48
File: shootout.py Project: DKaman/pyenchant
# This is so we can use unmodified tests published by third parties
corrections = (("caesar","Caesar"),("confucianism","Confucianism"),("february","February"),("gandhi","Gandhi"),("muslims","Muslims"),("israel","Israel"))

# List of dictionary objects to test
dicts = []
# Number of correct words missed by each dictionary
missed = []
# Number of corrections not suggested by each dictionary
incorrect = []
# Number of places to find correct suggestion, or -1 if not found
dists = []

# Create each dictionary object
for prov in providers:
    if prov == "pypwl":
        d = enchant.request_pwl_dict(wordsfile)
    else:
        b = enchant.Broker()
        b.set_ordering(lang,prov)
        d = b.request_dict(lang)
        if not d.provider.name == prov:
          raise RuntimeError("Provider '%s' has no dictionary for '%s'"%(prov,lang))
        del b
    dicts.append(d)
    missed.append([])
    incorrect.append([])
    dists.append([])
    
# Actually run the tests
testcases = open(datafile, "r")
testnum = 0
Code example #49
File: rulegen.py Project: KorayAgaya/hate_crack
 def load_custom_wordlist(self, wordlist_file):
     self.enchant = enchant.request_pwl_dict(wordlist_file)