def change_a_word_5_ways_invalid(word):
    """Mutate ``word`` with a randomly chosen edit, retrying on dictionary hits.

    Edit codes and the helper each one calls:
        0 -> add_character, 1 -> delete_character, 2 -> change_character,
        3 -> permute_characters, 4 -> separate_characters.

    At most 10 candidates are generated. A candidate is accepted as soon as
    its entry in the mapping returned by ``loadDict()`` equals 0 (presumably
    meaning "not a known word" -- confirm against ``loadDict``). If no
    candidate is accepted, the last one generated is returned anyway.

    Returns:
        tuple: ``(mutated_word, edit_code)``.
    """
    known_words = loadDict()

    method = random.randint(0, 4)
    result = ('', -1)

    for _ in range(10):
        if method == 0:
            candidate = add_character(word)
        elif method == 1:
            candidate = delete_character(word)
        elif method == 2:
            candidate = change_character(word)
        elif method == 3:
            candidate = permute_characters(word)
        else:
            # Only reachable with method == 4 (first draw only).
            candidate = separate_characters(word)
        result = (candidate, method)

        if known_words[candidate] == 0:
            break

        # Retries draw from 0..3 only, so method 4 is never used more
        # than once.
        method = random.randint(0, 3)

    return result
def change_a_word_5_ways_invalid_v2(word):
    """Mutate ``word`` with one of four random edits, retrying on dictionary hits.

    Edit codes and the helper each one calls:
        0 -> add_character, 1 -> delete_character,
        2 -> change_character, 3 -> permute_characters.

    Up to 10 attempts are made. A candidate is accepted as soon as its entry
    in the mapping returned by ``loadDict()`` equals 0 (presumably meaning
    "not a known word" -- confirm against ``loadDict``). Deletion is only
    applied to words of length 8 or more; a rejected deletion still consumes
    one attempt.

    If every attempt is used up and the last candidate came from add (0) or
    change (2), the edit is redone with the literal character ``'*'``. For
    delete (1) or permute (3) the last candidate is returned unchanged.

    Returns:
        tuple: ``(mutated_word, edit_code)``.
    """
    cnt = loadDict()

    method = random.randint(0, 3)
    ret_word_and_method = ('', -1)

    for _ in range(10):
        # TODO: Add logging if ret_word_and_method[0] == ''
        if method == 1 and len(word) < 8:
            # Do not delete characters unless a word has length 8 or more.
            # Re-draw among the remaining valid methods only. The previous
            # choice list also contained 4, which is not a method in this
            # version: it fell through to the permute branch and made
            # permute twice as likely as add or change.
            method = random.choice([0, 2, 3])
            continue

        if method == 0:
            ret_word_and_method = (add_character(word), 0)
        elif method == 1:
            ret_word_and_method = (delete_character(word), 1)
        elif method == 2:
            ret_word_and_method = (change_character(word), 2)
        else:  # method == 3
            ret_word_and_method = (permute_characters(word), 3)

        if cnt[ret_word_and_method[0]] == 0:
            break
        method = random.randint(0, 3)
    else:
        # All 10 attempts were used without an accepted candidate: force a
        # '*' into the word for the add/change methods.
        if ret_word_and_method[1] == 0:
            ret_word_and_method = (add_character(word, char='*'), 0)
        elif ret_word_and_method[1] == 2:
            ret_word_and_method = (change_character(word, char='*'), 2)

    # Safety net for the case where no candidate was ever produced.
    if ret_word_and_method[1] == -1:
        if len(word) >= 8:
            ret_word_and_method = (delete_character(word), 1)
        else:
            ret_word_and_method = (add_character(word, char='*'), 0)

    # TODO: Log here also
    return ret_word_and_method
def change_a_word_5_ways_invalid_v2_force_method(word, method):
    """Mutate ``word`` using the caller-selected edit ``method``.

    Edit codes and the helper each one calls:
        0 -> add_character, 1 -> delete_character, 2 -> change_character,
        any other value -> permute_characters (reported as 3).

    The edit is retried up to 10 times until the candidate's entry in the
    mapping returned by ``loadDict()`` equals 0 (presumably meaning "not a
    known word" -- confirm against ``loadDict``). If all attempts are used
    up, add (0) and change (2) are redone with the literal character ``'*'``;
    for the other methods the last candidate is returned unchanged.

    Returns:
        tuple: ``(mutated_word, edit_code)``.

    Raises:
        ValueError: if ``method`` is 1 and ``word`` has fewer than 8
            characters.
    """
    known_words = loadDict()

    # The delete precondition depends only on the arguments, so check it once
    # up front instead of on every attempt.
    if method == 1 and len(word) < 8:
        raise ValueError(f'The word {word} is too short ({len(word)})'
                         ' to use delete method')

    result = ('', -1)
    for _ in range(10):
        # TODO: Add logging if result[0] == ''
        if method == 0:
            result = (add_character(word), 0)
        elif method == 1:
            result = (delete_character(word), 1)
        elif method == 2:
            result = (change_character(word), 2)
        else:  # method == 3
            result = (permute_characters(word), 3)

        if known_words[result[0]] == 0:
            break
    else:
        # No attempt was accepted: fall back to a '*' edit where supported.
        used = result[1]
        if used == 0:
            result = (add_character(word, char='*'), 0)
        elif used == 2:
            result = (change_character(word, char='*'), 2)

    # Safety net for the case where no candidate was ever produced.
    if result[1] == -1:
        if len(word) >= 8:
            result = (delete_character(word), 1)
        else:
            result = (add_character(word, char='*'), 0)

    # TODO: Log here also
    return result
def closest_words_edit_distance_pickle(word, cutoff=1):
    """Call ``closest_words_edit_distance`` with the dictionary from ``loadDict()``.

    Args:
        word: forwarded unchanged as the first argument.
        cutoff: forwarded unchanged as the third argument (default 1).

    Returns:
        Whatever ``closest_words_edit_distance`` returns.
    """
    dictionary = loadDict()
    return closest_words_edit_distance(word, dictionary, cutoff)
import os
import difflib
from nltk.corpus import words
#from context_based_selection.vocab import Vocab
from vocab import Vocab
#from context_based_selection.context_score import cosSim
import numpy as np
import sys
#from domain_corpus_generation.corpus_util import loadDict
from regular_check import rawCheckOnDist
import editdistance
from pyxdameraulevenshtein import damerau_levenshtein_distance as dist
sys.path.insert(0, '../domain_corpus_generation/')
from corpus_util import loadDict

# Main dictionary, loaded from dict_v1.pickle with freq_threshold=100
# (presumably a minimum word frequency -- confirm in corpus_util.loadDict).
corpus = loadDict(fn="../domain_corpus_generation/dict_v1.pickle",
                  freq_threshold=100)
# Alias of `corpus`: both names refer to the same object, not a copy.
small_corpus = corpus
# Second dictionary, loaded from a different pickle with freq_threshold=5.
train_corpus = loadDict(
    fn="../domain_corpus_generation/persective_train_dict.pickle",
    freq_threshold=5)

# Arguments passed to the Vocab constructor below.
vecDim = 300
embedding_directory = "../domain_corpus_generation/embeddings/"
vocabInputFile = "vocab.txt"
vectorInputFile = "vectors.bin"
isFunctional = 1
vocab = Vocab(vecDim, embedding_directory, vocabInputFile, vectorInputFile,
              isFunctional)  # !!
CAND_LIMIT = 4  # 8 - if more candidates are generated, this word might be a non-sense word
# NOTE(review): presumably the maximum edit distance for candidates -- confirm at the use site.
DIST_LIMIT = 1
    Revised_Words.append(List_Of_List[i][0])
    Distances.append(List_Of_List[i][1])
    Num_Candidates.append(int(List_Of_List[i][2]))
    Candidates.append(List_Of_List[i][3])
# Report how many entries were collected in Revised_Words.
print('number of valid revised words:',len(Revised_Words))
print('\n')



# Check whether the original word is in the candidates.

# Names of the three output files opened below.
out_file_name1 = 'original_word_not_in_candidate.txt'
out_file_name2 = 'original_word_is_correct_but_not_in_candidate.txt'
out_file_name3 = 'original_word_in_candidate.txt'
number_of_candidates_if_include_original_word = []
d = corpus_util.loadDict()
# NOTE(review): these handles are not closed in the visible part of the
# script -- confirm they are closed further down.
temp1 = codecs.open(out_file_name1, "w", "utf-8-sig")
temp2 = codecs.open(out_file_name2, "w", "utf-8-sig")
temp3 = codecs.open(out_file_name3, "w", "utf-8-sig")
# i: index in the list of valid data (sentences with unique identifiable
# revised & original words); indexes revised_words and original_words.
for i in range(len(revised_words)): 
    # j: index in the list of valid data (no I/O errors); indexes
    # Revised_Words, Candidates and Distances.
    for j in range(len(Revised_Words)): 
        if (Revised_Words[j] == revised_words[i]):
            if (original_words[i] not in Candidates[j]):
                # Original word missing from the candidates: print it and
                # write the original word, the revised word and the
                # distance to the first output file.
                print(original_words[i], 'not in candidates of ', revised_words[i])
                temp1.write(original_words[i])
                temp1.write('\n')
                temp1.write(revised_words[i])
                temp1.write('\n')
                temp1.write(str(Distances[j]))
Beispiel #7
0
"""
spell checker
"""
import argparse
import codecs
import os
import numpy as np
from jellyfish import levenshtein_distance as dist
from context_score import cosSim
from vocab import Vocab
from corpus_util import loadDict
from regular_check import rawCheckOnDist

# Dictionary loaded with loadDict()'s default arguments.
corpus = loadDict()
# small_corpus and train_corpus are aliases of `corpus` (same object, not
# copies).
small_corpus = corpus
train_corpus = corpus

# Arguments passed to the Vocab constructor below.
vecDim = 300
embedding_directory = "./"
vocabInputFile = "vocab.txt"
vectorInputFile = "vectors.bin"
isFunctional = 1
vocab = Vocab(vecDim, embedding_directory, vocabInputFile, vectorInputFile,
              isFunctional)  # !!
CAND_LIMIT = 4  # 8 - if more candidates are generated, this word might be a non-sense word
# NOTE(review): presumably the maximum edit distance for candidates -- confirm at the use site.
DIST_LIMIT = 1


def getCandFromDict(word):
    """
    use edit distance to generate candidates
Beispiel #8
0
def _random_lowercase_letter():
    """Return one uniformly random ASCII lowercase letter."""
    letters = string.ascii_lowercase
    return letters[random.randint(0, len(letters) - 1)]


def _insert_char_randomly(word, char=None):
    """Insert ``char`` (a random lowercase letter if None) at a random position."""
    # The position is drawn before the letter so that seeded runs produce
    # the same sequence as the previous inline implementation.
    pos = random.randint(0, len(word))
    if char is None:
        char = _random_lowercase_letter()
    return word[:pos] + char + word[pos:]


def _delete_char_randomly(word):
    """Remove the character at a random position of a non-empty ``word``."""
    pos = random.randint(0, len(word) - 1)
    return word[:pos] + word[pos + 1:]


def _replace_char_randomly(word, char=None):
    """Replace one random character with ``char``, or a different random letter if None."""
    pos = random.randint(0, len(word) - 1)
    if char is None:
        # Re-draw until the replacement differs from the original character.
        char = word[pos]
        while char == word[pos]:
            char = _random_lowercase_letter()
    return word[:pos] + char + word[pos + 1:]


def _swap_adjacent_randomly(word):
    """Swap a random adjacent pair; words shorter than 2 are returned as-is."""
    if len(word) <= 1:
        return word
    pos = random.randint(0, len(word) - 2)
    return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]


def change_a_word_5_ways_invalid_v2_force_method(word, method):
    """Mutate ``word`` using the caller-selected edit ``method``.

    Methods:
        0 - add a random lowercase letter at a random position
        1 - delete a random character (only for words of length >= 8)
        2 - replace a random character with a different lowercase letter
        3 - permute (swap two adjacent characters)

    The edit is retried up to 10 times until the candidate's entry in the
    mapping returned by ``loadDict()`` equals 0 (presumably meaning "not a
    known word" -- confirm against ``loadDict``). If all attempts are used
    up, add (0) and replace (2) are redone with the literal character
    ``'*'``; for delete and permute the last candidate is returned. If
    ``method`` is none of 0-3, no edit is made in the loop and the final
    fallback deletes a character (length >= 8) or adds ``'*'``.

    Fixed relative to the previous implementation: with ``method == 3`` and
    an empty ``word`` the debug output referenced position variables that
    were never assigned and raised ``UnboundLocalError``. Only the values
    that exist are printed now. Debug prints that could never trigger (add,
    delete and replace cannot produce an empty string) were removed.

    Returns:
        tuple: ``(mutated_word, method_code)``. When ``method`` is 1 and
        ``word`` is shorter than 8 characters, the message
        ``'Bug: used delete method for len(word)'`` is printed and returned
        in place of the word, with code 1.
    """
    cnt = loadDict()

    # The delete precondition depends only on the arguments; check it once.
    if method == 1 and len(word) < 8:
        print('Bug: used delete method for len(word)')
        return 'Bug: used delete method for len(word)', 1

    ret_word_and_method = ('', -1)

    for _ in range(10):
        if method == 0:
            ret_word_and_method = (_insert_char_randomly(word), 0)
        elif method == 1:
            ret_word_and_method = (_delete_char_randomly(word), 1)
        elif method == 2:
            ret_word_and_method = (_replace_char_randomly(word), 2)
        elif method == 3:
            ret_word_and_method = (_swap_adjacent_randomly(word), 3)
            if ret_word_and_method[0] == '':
                # Only possible when ``word`` itself is empty.
                print(word)
                print(ret_word_and_method[0])
                print(method)

        if cnt[ret_word_and_method[0]] == 0:
            break
    else:
        # No attempt was accepted: fall back to a '*' edit where supported.
        if ret_word_and_method[1] == 0:
            ret_word_and_method = (_insert_char_randomly(word, '*'), 0)
        elif ret_word_and_method[1] == 2:
            ret_word_and_method = (_replace_char_randomly(word, '*'), 2)

    # No edit was applied at all (``method`` outside 0-3).
    if ret_word_and_method[1] == -1:
        if len(word) >= 8:
            ret_word_and_method = (_delete_char_randomly(word), 1)
        else:
            ret_word_and_method = (_insert_char_randomly(word, '*'), 0)

    if ret_word_and_method[0] == '':
        print(word)
        print(ret_word_and_method[0])
        print(ret_word_and_method[1])

    return ret_word_and_method[0], ret_word_and_method[1]
Beispiel #9
0
def change_a_word_5_ways_invalid(word):
    """Mutate ``word`` with a randomly chosen edit, retrying on dictionary hits.

    Methods:
        0 - add a random character (lowercase letter or space)
        1 - delete a random character
        2 - replace a random character with a different one
        3 - permute (swap two adjacent characters)
        4 - separate (put a space between every pair of characters)

    The first method is drawn from 0-4 and every retry from 0-3. Up to 10
    candidates are generated; one is accepted as soon as its entry in the
    mapping returned by ``loadDict()`` equals 0 (presumably meaning "not a
    known word" -- confirm against ``loadDict``). If none is accepted, the
    last candidate is returned anyway.

    Returns:
        tuple: ``(mutated_word, method_code)``.
    """
    alphabet = list(string.ascii_lowercase) + [' ']
    dictionary = loadDict()

    method = random.randint(0, 4)
    mutated, used = '', -1

    for _ in range(10):
        if method == 0:
            idx = random.randint(0, len(word))
            letter = alphabet[random.randint(0, len(alphabet) - 1)]
            mutated, used = word[:idx] + letter + word[idx:], 0

        elif method == 1:
            idx = random.randint(0, len(word) - 1)
            mutated, used = word[:idx] + word[idx + 1:], 1

        elif method == 2:
            idx = random.randint(0, len(word) - 1)
            # Keep drawing until the replacement differs from the original.
            letter = word[idx]
            while letter == word[idx]:
                letter = alphabet[random.randint(0, len(alphabet) - 1)]
            mutated, used = word[:idx] + letter + word[idx + 1:], 2

        elif method == 3:
            if len(word) <= 1:
                # Nothing to swap; the word is returned unchanged.
                mutated, used = word, 3
            else:
                idx = random.randint(0, len(word) - 2)
                swapped = word[idx + 1] + word[idx]
                mutated, used = word[:idx] + swapped + word[idx + 2:], 3

        elif method == 4:
            mutated, used = ' '.join(word), 4

        if dictionary[mutated] == 0:
            break
        method = random.randint(0, 3)

    return mutated, used