def fix(self, replace=False, suggest=True):
    """
    Start fixing the files.

    Param: replace=False: Replace the wrong spelling with the correct spelling.
        It is not recommended to use this feature since it modifies the files
        directly; only enable it if you trust the spell checker completely.
        Note that the fixer is not case sensitive.
    Param: suggest=True: Print the wrong spelling on the console together with
        the suggested correct spelling.
    """
    files = self._scan()
    for file in files:
        self.current_file = file
        lines = self._lines(file)
        if suggest:
            print("====================", file, "====================")
        for line_index in range(0, len(lines)):
            words = util.words(lines[line_index])
            for w in words:
                if not is_word(w):
                    if suggest:
                        print("wrong:", w,
                              "\t@line:", line_index + 1,
                              "\tsuggest:", correct(w),
                              "\tsuggest list:", candidates(w, 5))
    if suggest:
        print("done")
def P(word):
    """
    Get the probability of a word. The probability is based on how frequently
    the word occurs in everyday text (the corpus in resources/big.txt).
    """
    global counter
    if counter is None:
        counter = Counter(util.words(open("resources/big.txt").read()))
    return counter[word] / sum(counter.values())
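# Usage sketch (illustrative only, not part of the original sources). It assumes
# P() lives in a module named corrector next to correct() and candidates(); that
# module name is an assumption. A word that is frequent in big.txt gets a
# comparatively high probability, while a word that never occurs gets 0.0
# because Counter returns 0 for missing keys.
from corrector import P

print(P("the"))    # common word -> relatively high probability
print(P("zzyzx"))  # unseen word -> 0.0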
def word_pairs(filenames):
    """Return a list of [word, filename] pairs collected from multiple files."""
    # list to collect resulting pairs
    result = []
    # go over all filenames
    for f in filenames:
        # go over all words in f
        for w in util.words(f):
            # add word+filename pair to result
            result.append([w, f])
    return result
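# Illustrative result shape (hypothetical file names and contents, not from the
# original repo):
#
#   word_pairs(["a.txt", "b.txt"])
#   -> [["hello", "a.txt"], ["world", "a.txt"], ["hello", "b.txt"], ...]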
def fix(text):
    """
    Fix the text: check all words, detect any wrong spelling and display the
    result on the console. This function differs from Fixer.fix() in that it
    works on a text string rather than on files.
    """
    words = util.words(text)
    for word in words:
        if not is_word(word):
            print("Wrong:", word,
                  "\tsuggest:", correct(word),
                  "\tsuggest list:", candidates(word, 5))
def count_to_density(data):
    res = data  # res is the result list
    filenames, filewords = [], []  # file names and the number of words in each file
    if len(data) > 0:
        i = 1
        while i < len(res):  # continue until the end of data
            j = 0
            while j < len(res[i][1]):  # continue until done with every filename
                word = res[i][1][j]  # ["filename", number of times this word occurs]
                if word[0] not in filenames:
                    # if the word count of this file is not yet registered, register it
                    filenames.append(word[0])
                    filewords.append(len(util.words(word[0])))
                index = filenames.index(word[0])
                # place the density instead of the counter and round the result
                word[1] = round(word[1] / filewords[index], 4)
                j += 1
            # sort all the filenames in decreasing order of density
            res[i][1] = counter_bubble_sort(res[i][1])
            i += 1
    return res
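# Illustrative transformation (hypothetical words, files and numbers, not from
# the original repo). Entry 0 is skipped; every per-file count is divided by the
# total number of words in that file and rounded to 4 decimals:
#
#   [["header"], ["spam", [["a.txt", 3], ["b.txt", 1]]]]
#   -> [["header"], ["spam", [["a.txt", 0.003], ["b.txt", 0.0005]]]]
#
# assuming a.txt contains 1000 words and b.txt contains 2000 words.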
def get_series(ses, spec, spec_ord):

    # parse helper
    def split(s, expect, err, full):
        m = re.split('([' + expect + err + '])', s, 1)
        s1, d, s2 = m if len(m) == 3 else (m[0], '$', '')
        if d in err:
            msg = 'expected %s at pos %d in %s, found %s' % (
                expect, len(full) - len(s) + 1, full, d)
            raise Exception(msg)
        return s1, d, s2

    # parse the spec
    left, d, s = split(spec, '(:=', ')', spec)
    if d == '=':  # has tag
        tag = left
        spec_name, d, s = split(s, '(:', ')=', spec)
    else:  # no tag
        tag = None
        spec_name = left
    params = {}
    if d == '(':  # has args
        while d != ')':  # consume args
            name, d, s = split(s, '=)', '(', spec)   # get arg name
            value, d, s = split(s, '(),', '', spec)  # bare value
            p = 0
            while d == '(' or p > 0:  # plus balanced parens
                value += d
                if d == '(':
                    p += 1
                elif d == ')':
                    p -= 1
                v, d, s = split(s, '(),', '', spec)
                value += v
            params[name] = value
    fn = s.lstrip(':')  # xxx canonicalize fn
    util.dbg(spec_name, params, fn)
    ses.add_title(fn)

    def detect_file_type(fn):
        if ftdc.is_ftdc_file_or_dir(fn):
            return 'ftdc'
        with open(fn) as f:
            for _ in range(10):
                try:
                    json.loads(f.next())
                    return 'json'
                except Exception as e:
                    util.dbg(e)
        return 'text'

    file_type = detect_file_type(fn)
    util.msg('detected type of', fn, 'as', file_type)

    # find matching descriptors
    scored = collections.defaultdict(list)
    spec_name_words = util.words(spec_name)
    for desc in descriptors.descriptors:
        if get(desc, 'file_type') != file_type:
            continue
        desc_name_words = util.words(desc['name'])
        last_i = -1
        beginning = matched = in_order = adjacent = 0
        for w, word in enumerate(spec_name_words):
            try:
                i = desc_name_words.index(word)
                if i == 0 and w == 0:
                    beginning = 1
                matched += 1
                if i == last_i + 1:
                    adjacent += 1
                elif i > last_i:
                    in_order += 1
                last_i = i
            except ValueError:
                pass
        score = (beginning, matched, adjacent, in_order)
        scored[score].append(desc)
    best_score = sorted(scored.keys())[-1]
    best_descs = scored[best_score] if best_score != (0, 0, 0, 0) else []
    series = [Series(spec, desc, params, fn, spec_ord, tag, ses.opt)
              for desc in best_descs]

    # no match?
    if not series:
        util.msg('no descriptors match', spec_name)

    return series
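# Illustrative spec strings for the parser above (hypothetical series names,
# parameters and file paths, not taken from the actual descriptors):
#
#   "cpu user:metrics.json"
#       -> tag=None, spec_name="cpu user", params={}, fn="metrics.json"
#   "mytag=cpu user(scale=100):metrics.json"
#       -> tag="mytag", spec_name="cpu user", params={"scale": "100"}, fn="metrics.json"
#
# The spec name is then matched word by word against descriptor names, and the
# best-scoring descriptors produce the returned Series objects.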
__author__ = 'Egbert'

from dup import remove_dups
from util import words

print("hacktest.txt contains {} words".format(len(remove_dups(words("hacktest.txt")))))
print("Grail.txt contains {} words".format(len(remove_dups(words("grail.txt")))))
print(fixer.candidates("incoerrct"))  # Output: incorrect

# Fix a string, results print on console
fixer.fix("Thsi is a sentnce taht full of mistakes")

# Initialize a new fixer
# rt1.txt and rt2.txt are files that contain some random text for testing
# Some words are spelled incorrectly in order to test the fixer
f = fixer.Fixer(
    dir,                   # The root dir
    recursive=True,        # Recursively check all the files
    fname=["rt1", "rt2"])  # Only certain files will be checked

# Start fixing files, results print on console
f.fix()

import util

# print all files under the resources folder
print(util.find(dir))
# print files with given filenames
print(util.find(dir, fname=["rt1"]))
# print files with given extensions
print(util.find(dir, suffix=[".txt"]))

print(util.trim("exam?ple!"))  # output: example

print(util.words("Separate words from sentence"))
# Output: ['separate', 'words', 'from', 'sentence']

# Find all the comments in a py file; here corrector.py is used as an example
examplepy = util.find(dir, fname=["corrector"])[0]
print(util.find_comment_py(examplepy))
__author__ = 'Egbert'

import util
import pearl

pearl.make_density_table(util.all_word_pairs())
print(len(util.words("brian.txt")))
import ordsearch
import sort
import dup
import util

hacktest = util.words("hacktest.txt")
hacktest = sort.merge(hacktest)
unihacktest = dup.remove_dup(hacktest)
print("unique elements in sorted hacktest.txt: ", len(unihacktest))
print()

grail = util.words("grail.txt")
grail = sort.merge(grail)
unigrail = dup.remove_dup(grail)
print("unique elements in sorted grail.txt: ", len(unigrail))
print()

hacktest = util.words("hacktest.txt")
notunihacktest = dup.remove_dup(hacktest)
print("unique elements in unsorted hacktest.txt: ", len(notunihacktest))
print()

grail = util.words("grail.txt")
notunigrail = dup.remove_dup(grail)
print("unique elements in unsorted grail.txt: ", len(notunigrail))
def make_density_table(data):
    woordenPerFile = []  # e.g. [["Brian.txt", 17562], ["huppel.txt", 127547]]
    pairs = merge_pairs(data)
    res = []
    zelfdeWoordAndereBestand = []
    fresh = ""
    if len(pairs) != 0:
        fresh = pairs[0][0]
        i = 1
        currentTextFile = pairs[0][1]
        countForHowManyInTextFile = 1
        while i < len(pairs):
            if pairs[i][0] == fresh:  # if the word equals the word we are comparing against
                if pairs[i][1] == currentTextFile:
                    # it comes from the same file, so increment the counter by 1
                    countForHowManyInTextFile += 1
                else:
                    # check if we already have the word count of this file in our array
                    indexOfTextFile = linearPairs(woordenPerFile, currentTextFile)
                    if indexOfTextFile != -1:
                        # if it is not -1 we can simply look it up
                        totalWordCount = woordenPerFile[indexOfTextFile][1]
                    else:
                        # if it is -1, count the total number of words in the file
                        totalWordCount = len(words(currentTextFile))
                        # and store it in the array
                        woordenPerFile.append([currentTextFile, totalWordCount])
                    # compute the frequency by dividing the counted occurrences by the total word count
                    freq = countForHowManyInTextFile / totalWordCount
                    # and then add it to the array
                    zelfdeWoordAndereBestand.append([currentTextFile, freq])
                    # set the new current text file we are now searching in
                    currentTextFile = pairs[i][1]
                    # one occurrence has already been counted, so the counter starts at 1
                    countForHowManyInTextFile = 1
            else:
                # a new word has been detected
                if len(zelfdeWoordAndereBestand) == 0:
                    # the word occurred in only one text file
                    indexOfTextFile = linearPairs(woordenPerFile, currentTextFile)
                    if indexOfTextFile != -1:
                        totalWordCount = woordenPerFile[indexOfTextFile][1]
                    else:
                        totalWordCount = len(words(currentTextFile))
                        woordenPerFile.append([currentTextFile, totalWordCount])
                    freq = countForHowManyInTextFile / totalWordCount
                    zelfdeWoordAndereBestand.append([currentTextFile, freq])
                # first sort the array of text files with their frequencies from high to low
                zelfdeWoordAndereBestand = sortVanGrootNaarKlein(zelfdeWoordAndereBestand)
                # then add the word plus the sorted list of frequencies to the result
                res.append([fresh, zelfdeWoordAndereBestand])
                # reset this array for new frequencies
                zelfdeWoordAndereBestand = []
                # set the new word to search for
                fresh = pairs[i][0]
                # set the new text file we are searching in
                currentTextFile = pairs[i][1]
                # since it has already been seen once, the counter starts at 1
                countForHowManyInTextFile = 1
            i += 1
        # finally, compute the frequency that is still pending for the last word:
        indexOfTextFile = linearPairs(woordenPerFile, currentTextFile)
        if indexOfTextFile != -1:
            totalWordCount = woordenPerFile[indexOfTextFile][1]
        else:
            totalWordCount = len(words(currentTextFile))
            woordenPerFile.append([currentTextFile, totalWordCount])
        freq = countForHowManyInTextFile / totalWordCount
        zelfdeWoordAndereBestand.append([currentTextFile, freq])
        # end of the run: sort and add the last word
        zelfdeWoordAndereBestand = sortVanGrootNaarKlein(zelfdeWoordAndereBestand)
        res.append([fresh, zelfdeWoordAndereBestand])
    return res
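# Illustrative shape of the returned density table (hypothetical words and
# numbers, not taken from the original repo). Each entry pairs a word with its
# per-file densities, sorted from high to low:
#
#   [["swallow", [["grail.txt", 0.0012], ["brian.txt", 0.0003]]],
#    ["spam",    [["brian.txt", 0.0045]]]]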
def text_to_pairs(text):
    dirs = util.words(text)
    steps = util.ints(text)
    pairs = list(zip(dirs, steps))
    return pairs
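# Illustrative usage (hypothetical input; it assumes util.words extracts the
# alphabetic words and util.ints the integers, each in order of appearance):
#
#   text_to_pairs("north 3 east 5")  ->  [('north', 3), ('east', 5)]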
__author__ = 'Egbert'

from sort import bubble
from sort import merge
from util import words
from ordsearch import binary

#print(bubble([3,253,223,52,6,23,6,2,6,8,3,5,34,8,3,433,76,83465,8,34,36,457,5478,3,45,2]))
#print(bubble(words("hacktest.txt")))
#print(bubble(words("Unabr.dict")))

#print(merge([3,253,223,52,6,23,6,2,6,8,3,5,34,8,3,433,76,83465,8,34,36,457,5478,3,45,2]))
#print(merge(words("hacktest.txt")))
#print(merge(words("Unabr.dict")))

print(binary(merge(words("grail.txt")), "swallow"))