Example #1
 def fix(self, replace=False, suggest=True):
     """
     Start fixing the files
     Param: repalce=False: Replace the wrong spelling to the
             correct spelling. It is not recommend to use this
             feature since it will modify the files directly. Unless you
             trust the spell checker completely. And also, the fixer
             is not case sensitive.
     Param: suggest=True: Show the wrong spelling on console
             and also the suggested correct spelling
     """
     files = self._scan()
     for file in files:
         self.current_file = file
         lines = self._lines(file)
         if suggest:
             print("====================", file, "====================")
         for line_index in range(0, len(lines)):
             words = util.words(lines[line_index])
             for w in words:
                 if not is_word(w):
                     if suggest:
                         print("wrong:", w, \
                              "\t@line:", line_index+1, \
                              "\tsuggest:", correct(w), \
                              "\tsuggest list:", candidates(w, 5))
         if suggest:
             print("done")
Example #2
def P(word):
    """
    Get the probability of word, estimated from how frequently the word
    occurs in everyday text (here, the resources/big.txt corpus).
    """
    global counter
    if counter is None:
        counter = Counter(util.words(open("resources/big.txt").read()))
    return counter[word] / sum(counter.values())
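
A hedged illustration of how P behaves, assuming resources/big.txt is present (as the function itself requires):

print(P("the"))     # common words get a relatively large probability
print(P("dragon"))  # rarer words get a much smaller one
print(P("qzxwv"))   # unseen words get 0.0, since Counter returns 0 for them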
Example #3
def word_pairs(filenames):
    """Return list of word + filename tuples from multiple files"""
    # list to collect resulting pairs
    result = []
    # go over all filenames
    for f in filenames:
        # go over all words in f
        for w in util.words(f):
            # add word+filename pair to result
            result.append([w, f])
    return result
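
A hedged usage sketch (the file names are hypothetical; util.words is assumed to read a file and return its words, as in the other examples):

pairs = word_pairs(["grail.txt", "brian.txt"])
# e.g. [['swallow', 'grail.txt'], ..., ['brian', 'brian.txt'], ...]
print(len(pairs))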
Example #4
def fix(text):
    """
    Check every word in the text, detect any wrong spellings, and report
    them on the console. Unlike Fixer.fix(), this function works on a
    text string rather than on files.
    """
    words = util.words(text)
    for word in words:
        if not is_word(word):
            print("Wrong:", word,\
                "\tsuggest:", correct(word),\
                "\tsuggest list:", candidates(word, 5))
Example #5
def count_to_density(data):
    res = data  # res is the result list
    # parallel lists: file names seen so far and their total word counts
    filenames, filewords = [], []
    if len(data) > 0:
        i = 1
        while i < len(res):  # continue until the end of the data
            j = 0
            while j < len(res[i][1]):  # continue until every filename is done
                word = res[i][1][j]  # ["filename", count of this word in that file]
                if word[0] not in filenames:
                    # if this file's word count is not yet registered, register it
                    filenames.append(word[0])
                    filewords.append(len(util.words(word[0])))
                index = filenames.index(word[0])
                # replace the count with a density, rounded to 4 decimals
                word[1] = round(word[1] / filewords[index], 4)
                j += 1
            # sort all the filenames in decreasing order of density
            res[i][1] = counter_bubble_sort(res[i][1])
            i += 1
    return res
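
For reference, a hedged sketch of the shape count_to_density appears to expect, inferred from the indexing above (note the loop starts at index 1, so row 0 is left untouched):

# hypothetical input; the shape is an inference, not a documented format
data = [
    ["header", []],                          # row 0 is skipped by the loop
    ["spam", [["a.txt", 3], ["b.txt", 1]]],  # word -> per-file counts
]
# after count_to_density(data), each count becomes
# round(count / len(util.words(filename)), 4), and each per-file list is
# sorted in decreasing order of density by counter_bubble_sort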
Example #6
def get_series(ses, spec, spec_ord):

    # parse helper
    def split(s, expect, err, full):
        m = re.split('([' + expect + err + '])', s, 1)
        s1, d, s2 = m if len(m) == 3 else (m[0], '$', '')
        if d in err:
            msg = 'expected %s at pos %d in %s, found %s' % (
                expect, len(full) - len(s) + 1, full, d)
            raise Exception(msg)
        return s1, d, s2

    # parse the spec
    left, d, s = split(spec, '(:=', ')', spec)
    if d == '=':  # has tag
        tag = left
        spec_name, d, s = split(s, '(:', ')=', spec)
    else:  # no tag
        tag = None
        spec_name = left
    params = {}
    if d == '(':  # has args
        while d != ')':  # consume args
            name, d, s = split(s, '=)', '(', spec)  # get arg name
            value, d, s = split(s, '(),', '', spec)  # bare value
            p = 0
            while d == '(' or p > 0:  # plus balanced parens
                value += d
                if d == '(': p += 1
                elif d == ')': p -= 1
                v, d, s = split(s, '(),', '', spec)
                value += v
            params[name] = value
    fn = s.lstrip(':')  # xxx canonicalize fn
    util.dbg(spec_name, params, fn)
    ses.add_title(fn)

    def detect_file_type(fn):
        if ftdc.is_ftdc_file_or_dir(fn):
            return 'ftdc'
        with open(fn) as f:
            for _ in range(10):
                try:
                    json.loads(next(f))
                    return 'json'
                except Exception as e:
                    util.dbg(e)
        return 'text'

    file_type = detect_file_type(fn)
    util.msg('detected type of', fn, 'as', file_type)

    # find matching descriptors
    scored = collections.defaultdict(list)
    spec_name_words = util.words(spec_name)
    for desc in descriptors.descriptors:
        if get(desc, 'file_type') != file_type:
            continue
        desc_name_words = util.words(desc['name'])
        last_i = -1
        beginning = matched = in_order = adjacent = 0
        for w, word in enumerate(spec_name_words):
            try:
                i = desc_name_words.index(word)
                if i == 0 and w == 0: beginning = 1
                matched += 1
                if i == last_i + 1: adjacent += 1
                elif i > last_i: in_order += 1
                last_i = i
            except ValueError:
                pass
        score = (beginning, matched, adjacent, in_order)
        scored[score].append(desc)
    best_score = sorted(scored.keys())[-1]
    best_descs = scored[best_score] if best_score != (0, 0, 0, 0) else []
    series = [
        Series(spec, desc, params, fn, spec_ord, tag, ses.opt)
        for desc in best_descs
    ]

    # no match?
    if not series:
        util.msg('no descriptors match', spec_name)

    return series
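
Reading the parser, a spec string appears to follow [tag=]name[(arg=value,...)][:file]. A minimal standalone sketch of the same split() helper applied to a hypothetical spec:

import re

def split(s, expect, err, full):
    # split s at the first delimiter in `expect`; a delimiter from `err` is an error
    m = re.split('([' + expect + err + '])', s, maxsplit=1)
    s1, d, s2 = m if len(m) == 3 else (m[0], '$', '')
    if d in err:
        raise Exception('expected %s in %s, found %s' % (expect, full, d))
    return s1, d, s2

spec = 'cpu=utilization(host=db1):metrics.json'  # hypothetical spec string
left, d, rest = split(spec, '(:=', ')', spec)
print(left, d, rest)  # -> cpu = utilization(host=db1):metrics.json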
Example #7
__author__ = 'Egbert'

from dup import remove_dups
from util import words


print("hacktest.txt bevat {} woorden".format(len(remove_dups(words("hacktest.txt")))))
print("Grail.txt bevat {} woorden".format(len(remove_dups(words("grail.txt")))))
Example #8
print(fixer.candidates("incoerrct"))  # Output: incorrect

# Fix a string; results are printed to the console
fixer.fix("Thsi is a sentnce taht full of mistakes")

# Initialize a new fixer
# rt1.txt and rt2.txt contain random text for testing;
# some words are deliberately misspelled to exercise the fixer
f = fixer.Fixer(
    dir,  # The root dir
    recursive=True,  # Recursively check all the files
    fname=["rt1", "rt2"])  # Only certain files will be checked
# Start fixing files; results are printed to the console
f.fix()

import util
# print all files under the resources folder
print(util.find(dir))
# print files with given filenames
print(util.find(dir, fname=["rt1"]))
# print files with given extensions
print(util.find(dir, suffix=[".txt"]))

print(util.trim("exam?ple!"))  # output: example
print(util.words("Separate words from sentence"))
# Output: ['separate', 'words', 'from', 'sentence']

# Find all the comments in a .py file, using corrector.py as an example
examplepy = util.find(dir, fname=["corrector"])[0]
print(util.find_comment_py(examplepy))
Example #9
__author__ = 'Egbert'

import util
import pearl


pearl.make_density_table(util.all_word_pairs())
print(len(util.words("brian.txt")))
Example #10
import ordsearch
import sort
import dup
import util

hacktest = util.words("hacktest.txt")
hacktest = sort.merge(hacktest)
unihacktest = dup.remove_dup(hacktest)
print("unique elements in sorted hacktest.txt: ",len(unihacktest))

print()

grail = util.words("grail.txt")
grail = sort.merge(grail)
unigrail = dup.remove_dup(grail)
print("unique elements in sorted grail.txt: ",len(unigrail))

print()

hacktest = util.words("hacktest.txt")
notunihacktest = dup.remove_dup(hacktest)
print("unique elements in unsorted hacktest.txt: ",len(notunihacktest))

print()

grail = util.words("grail.txt")
notunigrail = dup.remove_dup(grail)
print("unique elements in unsorted grail.txt: ",len(notunigrail))
Example #11
def make_density_table(data):
    woordenPerFile = []  # e.g. [["Brian.txt", 17562], ["huppel.txt", 127547]]

    pairs = merge_pairs(data)

    res = []
    zelfdeWoordAndereBestand = []
    fresh = ""

    if len(pairs) != 0:
        fresh = pairs[0][0]
        i = 1

        currentTextFile = pairs[0][1]
        countForHowManyInTextFile = 1

        while i < len(pairs):
            if pairs[i][0] == fresh:  # the word equals the word we are comparing against
                if pairs[i][1] == currentTextFile:
                    # same file, so bump the counter
                    countForHowManyInTextFile += 1
                else:
                    # check whether we already registered this file's word count
                    indexOfTextFile = linearPairs(woordenPerFile, currentTextFile)

                    if indexOfTextFile != -1:
                        # already registered, so simply look it up
                        totalWordCount = woordenPerFile[indexOfTextFile][1]
                    else:
                        # not registered yet: compute the total word count
                        totalWordCount = len(words(currentTextFile))
                        # and store it in the array
                        woordenPerFile.append([currentTextFile, totalWordCount])

                    # the frequency is the counted occurrences divided by the
                    # total word count of the file
                    freq = countForHowManyInTextFile / totalWordCount

                    # add it to the array for this word
                    zelfdeWoordAndereBestand.append([currentTextFile, freq])
                    # switch to the new current text file
                    currentTextFile = pairs[i][1]
                    # one occurrence has already been seen, so start at 1
                    countForHowManyInTextFile = 1

            else:
                # a new word has been detected: first finish the frequency for
                # the file we were still counting
                indexOfTextFile = linearPairs(woordenPerFile, currentTextFile)

                if indexOfTextFile != -1:
                    totalWordCount = woordenPerFile[indexOfTextFile][1]
                else:
                    totalWordCount = len(words(currentTextFile))
                    woordenPerFile.append([currentTextFile, totalWordCount])

                freq = countForHowManyInTextFile / totalWordCount
                zelfdeWoordAndereBestand.append([currentTextFile, freq])

                # sort the files with their frequencies from high to low
                zelfdeWoordAndereBestand = sortVanGrootNaarKlein(zelfdeWoordAndereBestand)

                # append the word plus its sorted frequency list to the result
                res.append([fresh, zelfdeWoordAndereBestand])

                zelfdeWoordAndereBestand = []  # reset for the next word's frequencies
                fresh = pairs[i][0]  # the new word to search for
                currentTextFile = pairs[i][1]  # the new text file we search in
                countForHowManyInTextFile = 1  # it has already been seen once

            i += 1

        # finally, compute the frequency for the last word:
        indexOfTextFile = linearPairs(woordenPerFile, currentTextFile)

        if indexOfTextFile != -1:
            totalWordCount = woordenPerFile[indexOfTextFile][1]
        else:
            totalWordCount = len(words(currentTextFile))
            woordenPerFile.append([currentTextFile, totalWordCount])

        freq = countForHowManyInTextFile / totalWordCount
        zelfdeWoordAndereBestand.append([currentTextFile, freq])

        # end of the run: sort and append the last word
        zelfdeWoordAndereBestand = sortVanGrootNaarKlein(zelfdeWoordAndereBestand)
        res.append([fresh, zelfdeWoordAndereBestand])

    return res
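
A hedged sketch of the expected flow, with hypothetical file names (merge_pairs, linearPairs, and sortVanGrootNaarKlein are assumed to sort the pairs, look up a file, and sort descending, as their call sites suggest):

# hypothetical input: one (word, filename) pair per occurrence
data = [["brian", "brian.txt"], ["brian", "brian.txt"], ["swallow", "grail.txt"]]
# make_density_table(data) then returns rows shaped like
#   ["brian",   [["brian.txt", 2 / len(words("brian.txt"))]]]
#   ["swallow", [["grail.txt", 1 / len(words("grail.txt"))]]]
# with each per-file list sorted from highest to lowest frequency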
Example #12
def text_to_pairs(text):
    dirs = util.words(text)
    steps = util.ints(text)
    pairs = list(zip(dirs, steps))
    return pairs
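
A hedged illustration of the pairing, using stand-in implementations of util.words and util.ints (the real helpers may tokenize differently):

import re

def words(text):  # stand-in for util.words: lowercase word tokens
    return re.findall(r'[a-z]+', text.lower())

def ints(text):  # stand-in for util.ints: integer tokens
    return [int(n) for n in re.findall(r'\d+', text)]

text = "forward 5 down 3 up 2"
print(list(zip(words(text), ints(text))))
# [('forward', 5), ('down', 3), ('up', 2)]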
Example #13
__author__ = 'Egbert'

from sort import bubble
from sort import merge
from util import words
from ordsearch import binary

#print(bubble([3,253,223,52,6,23,6,2,6,8,3,5,34,8,3,433,76,83465,8,34,36,457,5478,3,45,2]))
#print(bubble(words("hacktest.txt")))
#print(bubble(words("Unabr.dict")))

#print(merge([3,253,223,52,6,23,6,2,6,8,3,5,34,8,3,433,76,83465,8,34,36,457,5478,3,45,2]))
#print(merge(words("hacktest.txt")))
#print(merge(words("Unabr.dict")))

print(binary(merge(words("grail.txt")), "swallow"))