def change_a_word_5_ways_invalid(word):
    """Corrupt *word* with a randomly chosen edit, retrying up to 10 times.

    Method codes, as paired with the helpers below:
    0 - add_character, 1 - delete_character, 2 - change_character,
    3 - permute_characters, 4 - separate_characters.

    An attempt is accepted as soon as the candidate's entry in the table
    returned by ``loadDict()`` equals 0 (presumably: the candidate is not a
    known dictionary word -- confirm against ``loadDict``).

    Returns:
        A ``(candidate, method)`` tuple for the last attempt made.  If all
        10 attempts produced candidates with a non-zero entry, the last such
        candidate is returned anyway.
    """
    cnt = loadDict()
    # Index in this tuple == method code.
    mutators = (
        add_character,
        delete_character,
        change_character,
        permute_characters,
        separate_characters,
    )
    method = random.randint(0, 4)
    outcome = ('', -1)
    for _ in range(10):
        outcome = (mutators[method](word), method)
        if cnt[outcome[0]] == 0:
            break
        # Do not use method = 4 more than once: retries only draw from 0..3.
        method = random.randint(0, 3)
    return outcome
def change_a_word_5_ways_invalid_v2(word):
    """Corrupt *word* with a random edit method, retrying up to 10 times.

    Method codes: 0 - add, 1 - delete, 2 - change, 3 - permute.  The delete
    method is skipped for words shorter than 8 characters; a skipped attempt
    still counts towards the limit of 10.

    An attempt is accepted as soon as the candidate's entry in the table
    returned by ``loadDict()`` equals 0 (presumably: the candidate is not a
    known dictionary word -- confirm against ``loadDict``).

    If no attempt is accepted and the last applied method was add (0) or
    change (2), the edit is redone with the character ``'*'``.  When the last
    applied method was delete (1) or permute (3) the last candidate is
    returned unchanged, even though its entry was non-zero.

    Returns:
        A ``(candidate, method)`` tuple.
    """
    cnt = loadDict()
    method = random.randint(0, 3)
    ret_word_and_method = ('', -1)
    count = 0
    ret_flag = False
    while not ret_flag and count < 10:
        count += 1
        # TODO: Add logging if ret_word_and_method[0] == ''
        if method == 0:
            ret_word_and_method = (add_character(word), 0)
        elif method == 1:
            if len(word) < 8:
                # Do not delete characters unless a word has length 8 or
                # more.  Only the methods this function implements are
                # eligible; the previous list also contained a stale 4,
                # which fell through to the permute branch and doubled its
                # weight.
                method = random.choice((0, 2, 3))
                continue
            ret_word_and_method = (delete_character(word), 1)
        elif method == 2:
            ret_word_and_method = (change_character(word), 2)
        else:  # method == 3
            ret_word_and_method = (permute_characters(word), 3)

        if cnt[ret_word_and_method[0]] == 0:
            ret_flag = True
        else:
            method = random.randint(0, 3)

    # The loop only exits without success once the attempt budget is spent.
    # A skipped delete is always followed by an attempt that sets a result,
    # so ret_word_and_method[1] can no longer be -1 here.
    if not ret_flag:
        if ret_word_and_method[1] == 0:
            ret_word_and_method = (add_character(word, char='*'), 0)
        elif ret_word_and_method[1] == 2:
            ret_word_and_method = (change_character(word, char='*'), 2)
        # TODO: Log here also
    return ret_word_and_method
def change_a_word_5_ways_invalid_v2_force_method(word, method):
    """Corrupt *word* with one fixed edit method, retrying up to 10 times.

    Args:
        word: The word to corrupt.
        method: 0 - add, 1 - delete, 2 - change; any other value is handled
            by the permute branch and reported as method 3.

    An attempt is accepted as soon as the candidate's entry in the table
    returned by ``loadDict()`` equals 0 (presumably: the candidate is not a
    known dictionary word -- confirm against ``loadDict``).  If no attempt
    is accepted and the method is add (0) or change (2), the edit is redone
    with the character ``'*'``; for delete and permute the last candidate is
    returned as is.

    Returns:
        A ``(candidate, method)`` tuple.

    Raises:
        ValueError: If ``method`` is 1 (delete) and ``word`` has fewer than
            8 characters.
    """
    # Loop-invariant precondition: check it once, before the dictionary is
    # loaded, instead of on every iteration.
    if method == 1 and len(word) < 8:
        raise ValueError(f'The word {word} is too short ({len(word)})'
                         ' to use delete method')

    cnt = loadDict()
    ret_word_and_method = ('', -1)
    count = 0
    ret_flag = False
    while not ret_flag and count < 10:
        count += 1
        # TODO: Add logging if ret_word_and_method[0] == ''
        if method == 0:
            ret_word_and_method = (add_character(word), 0)
        elif method == 1:
            ret_word_and_method = (delete_character(word), 1)
        elif method == 2:
            ret_word_and_method = (change_character(word), 2)
        else:  # method == 3
            ret_word_and_method = (permute_characters(word), 3)

        if cnt[ret_word_and_method[0]] == 0:
            ret_flag = True

    # Every loop iteration assigns a result, so the method code is never -1
    # here; the loop only exits without success after all 10 attempts.
    if not ret_flag:
        if ret_word_and_method[1] == 0:
            ret_word_and_method = (add_character(word, char='*'), 0)
        elif ret_word_and_method[1] == 2:
            ret_word_and_method = (change_character(word, char='*'), 2)
        # TODO: Log here also
    return ret_word_and_method
def closest_words_edit_distance_pickle(word, cutoff=1):
    """Run ``closest_words_edit_distance`` against the ``loadDict()`` table.

    Convenience wrapper: obtains the table from ``loadDict()`` (called with
    no arguments) and forwards ``word``, that table and ``cutoff``
    positionally.  The return value is whatever
    ``closest_words_edit_distance`` returns.
    """
    return closest_words_edit_distance(word, loadDict(), cutoff)
import os
import difflib
from nltk.corpus import words
#from context_based_selection.vocab import Vocab
from vocab import Vocab
#from context_based_selection.context_score import cosSim
import numpy as np
import sys
#from domain_corpus_generation.corpus_util import loadDict
from regular_check import rawCheckOnDist
import editdistance
from pyxdameraulevenshtein import damerau_levenshtein_distance as dist

# corpus_util is imported from a sibling directory, so that directory is put
# at the front of the import path first.  The relative path is resolved
# against the current working directory.
sys.path.insert(0, '../domain_corpus_generation/')
from corpus_util import loadDict

# Word tables loaded once at import time.  NOTE(review): freq_threshold
# presumably drops words seen fewer than that many times -- confirm against
# corpus_util.loadDict.
corpus = loadDict(fn="../domain_corpus_generation/dict_v1.pickle",
                  freq_threshold=100)
# Alias of `corpus` (same object, not a copy).
small_corpus = corpus
train_corpus = loadDict(
    fn="../domain_corpus_generation/persective_train_dict.pickle",
    freq_threshold=5)

# Arguments for the Vocab instance built below.
vecDim = 300
embedding_directory = "../domain_corpus_generation/embeddings/"
vocabInputFile = "vocab.txt"
vectorInputFile = "vectors.bin"
isFunctional = 1
vocab = Vocab(vecDim, embedding_directory, vocabInputFile, vectorInputFile,
              isFunctional)  # !!

# 8 - if more candidates are generated, this word might be a non-sense word
CAND_LIMIT = 4
DIST_LIMIT = 1
Revised_Words.append(List_Of_List[i][0]) Distances.append(List_Of_List[i][1]) Num_Candidates.append(int(List_Of_List[i][2])) Candidates.append(List_Of_List[i][3]) print('number of valid revised words:',len(Revised_Words)) print('\n') # check whether the original word is in the candidates out_file_name1 = 'original_word_not_in_candidate.txt' out_file_name2 = 'original_word_is_correct_but_not_in_candidate.txt' out_file_name3 = 'original_word_in_candidate.txt' number_of_candidates_if_include_original_word = [] d = corpus_util.loadDict() temp1 = codecs.open(out_file_name1, "w", "utf-8-sig") temp2 = codecs.open(out_file_name2, "w", "utf-8-sig") temp3 = codecs.open(out_file_name3, "w", "utf-8-sig") # index in the list of valid data (sentences with unique identifiable revised & original words) for i in range(len(revised_words)): # index in the list of valid data (no I/O errors) for j in range(len(Revised_Words)): if (Revised_Words[j] == revised_words[i]): if (original_words[i] not in Candidates[j]): print(original_words[i], 'not in candidates of ', revised_words[i]) temp1.write(original_words[i]) temp1.write('\n') temp1.write(revised_words[i]) temp1.write('\n') temp1.write(str(Distances[j]))
""" spell checker """ import argparse import codecs import os import numpy as np from jellyfish import levenshtein_distance as dist from context_score import cosSim from vocab import Vocab from corpus_util import loadDict from regular_check import rawCheckOnDist corpus = loadDict() small_corpus = corpus train_corpus = corpus vecDim = 300 embedding_directory = "./" vocabInputFile = "vocab.txt" vectorInputFile = "vectors.bin" isFunctional = 1 vocab = Vocab(vecDim, embedding_directory, vocabInputFile, vectorInputFile, isFunctional) # !! CAND_LIMIT = 4 # 8 - if more candiates are generated, this word might be a non-sense word DIST_LIMIT = 1 def getCandFromDict(word): """ use edit distance to generate candidates
def change_a_word_5_ways_invalid_v2_force_method(word, method):
    """Corrupt *word* with one fixed edit method, retrying up to 10 times.

    Args:
        word: The word to corrupt.
        method: 0 - add a random lowercase letter, 1 - delete a character,
            2 - replace a character with a different lowercase letter,
            3 - swap two adjacent characters.

    An attempt is accepted as soon as the candidate's entry in the table
    returned by ``loadDict()`` equals 0 (presumably: the candidate is not a
    known dictionary word -- confirm against ``loadDict``).  If no attempt
    is accepted and the method is add (0) or replace (2), the edit is redone
    with the character ``'*'``.  If no edit was applied at all (``method``
    is not one of 0..3), a delete is used for words of length >= 8 and a
    ``'*'`` insertion otherwise.

    Returns:
        A ``(candidate, method)`` tuple.  When ``method`` is 1 and the word
        is shorter than 8 characters, the tuple
        ``('Bug: used delete method for len(word)', 1)`` is returned instead.
    """
    alphabet = string.ascii_lowercase

    def insert_char(char=None):
        # The position is drawn before the letter.
        pos = random.randint(0, len(word))
        if char is None:
            char = random.choice(alphabet)
        return word[:pos] + char + word[pos:]

    def delete_char():
        pos = random.randint(0, len(word) - 1)
        return word[:pos] + word[pos + 1:]

    def replace_char(char=None):
        pos = random.randint(0, len(word) - 1)
        if char is None:
            # Redraw until the replacement differs from the original letter.
            char = word[pos]
            while char == word[pos]:
                char = random.choice(alphabet)
        return word[:pos] + char + word[pos + 1:]

    def swap_adjacent():
        # Words of length 0 or 1 have nothing to swap and come back as is.
        if len(word) <= 1:
            return word
        pos = random.randint(0, len(word) - 2)
        return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]

    # NOTE(review): this returns a diagnostic string in place of a word
    # rather than raising; kept for backward compatibility with callers.
    # The check is loop-invariant, so it runs once, before the dictionary is
    # loaded.
    if method == 1 and len(word) < 8:
        print('Bug: used delete method for len(word)')
        return 'Bug: used delete method for len(word)', 1

    cnt = loadDict()
    ret_word_and_method = ('', -1)
    count = 0
    ret_flag = False
    while not ret_flag and count < 10:
        count += 1
        if method == 0:
            ret_word_and_method = (insert_char(), 0)
        elif method == 1:
            ret_word_and_method = (delete_char(), 1)
        elif method == 2:
            ret_word_and_method = (replace_char(), 2)
        elif method == 3:
            ret_word_and_method = (swap_adjacent(), 3)

        if cnt[ret_word_and_method[0]] == 0:
            ret_flag = True

    # The loop only exits without success once all 10 attempts are spent.
    if not ret_flag:
        if ret_word_and_method[1] == 0:
            ret_word_and_method = (insert_char('*'), 0)
        elif ret_word_and_method[1] == 2:
            ret_word_and_method = (replace_char('*'), 2)

    # No edit was applied at all (method outside 0..3): fall back.
    if ret_word_and_method[1] == -1:
        if len(word) >= 8:
            ret_word_and_method = (delete_char(), 1)
        else:
            ret_word_and_method = (insert_char('*'), 0)

    # Diagnostic output for an empty result.
    if ret_word_and_method[0] == '':
        print(word)
        print(ret_word_and_method[0])
        print(ret_word_and_method[1])
    return ret_word_and_method[0], ret_word_and_method[1]
def change_a_word_5_ways_invalid(word):
    """Corrupt *word* with a randomly chosen edit, retrying up to 10 times.

    Method codes:
        0 - add a character at a random position
        1 - delete a random character
        2 - replace a random character with a different one
        3 - permute (swap) two adjacent characters
        4 - separate all characters with single spaces

    Added and replacement characters are drawn from the lowercase ASCII
    letters plus the space character.  An attempt is accepted as soon as the
    candidate's entry in the table returned by ``loadDict()`` equals 0;
    otherwise a new method is drawn from 0..3, so separation can only be the
    first attempt.

    Returns:
        A ``(candidate, method)`` tuple for the last attempt made, whether
        or not it was accepted.
    """
    alphabet = list(string.ascii_lowercase) + [' ']
    cnt = loadDict()

    method = random.randint(0, 4)
    candidate, used = '', -1
    for _ in range(10):
        if method == 0:
            at = random.randint(0, len(word))
            letter = random.choice(alphabet)
            candidate, used = word[:at] + letter + word[at:], 0
        elif method == 1:
            at = random.randint(0, len(word) - 1)
            candidate, used = word[:at] + word[at + 1:], 1
        elif method == 2:
            at = random.randint(0, len(word) - 1)
            # Redraw until the replacement differs from the original.
            letter = word[at]
            while letter == word[at]:
                letter = random.choice(alphabet)
            candidate, used = word[:at] + letter + word[at + 1:], 2
        elif method == 3:
            if len(word) > 1:
                at = random.randint(0, len(word) - 2)
                swapped = word[at + 1] + word[at]
                candidate = word[:at] + swapped + word[at + 2:]
            else:
                # Nothing to swap in a word of length 0 or 1.
                candidate = word
            used = 3
        elif method == 4:
            candidate, used = ' '.join(word), 4

        if cnt[candidate] == 0:
            break
        method = random.randint(0, 3)
    return candidate, used