Example #1
import os

def all_words():
    # Return the cached word list if it exists; otherwise build it from
    # the two bundled word-form files and cache the result.
    # retrieve(), store() and to_unicode_or_bust() are defined elsewhere
    # in the module.
    data = retrieve('ordmyndalisti')
    if data is not None:
        return data
    here = os.path.dirname(__file__)
    with open(os.path.join(here, 'ordmyndalisti.txt'), 'r') as f1:
        data1 = to_unicode_or_bust(f1.read())
    with open(os.path.join(here, 'ordmyndalisti2.txt'), 'r') as f2:
        data2 = to_unicode_or_bust(f2.read())
    data = data1 + data2
    store('ordmyndalisti', data)
    return data
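Every example on this page leans on the same helper, `to_unicode_or_bust`, which none of the snippets define. It follows a well-known recipe (popularized in Kumar McMillan's "Unicode In Python, Completely Demystified"): decode byte strings to `unicode`, pass `unicode` objects through untouched, and raise instead of silently returning undecoded bytes. A minimal Python 2 sketch of that recipe follows; the projects above may differ in detail:

def to_unicode_or_bust(obj, encoding='utf-8'):
    # Decode byte strings to unicode; leave unicode and non-string
    # objects untouched.  Raises UnicodeDecodeError on undecodable
    # input instead of passing bytes through -- hence "or bust".
    if isinstance(obj, basestring) and not isinstance(obj, unicode):
        obj = unicode(obj, encoding)
    return obj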
Example #2
import logging

def perm_search(string):
    # Return every dictionary word that is a permutation (anagram) of
    # `string`.  `words` and `all_perms` come from the surrounding module.
    pmatched = []
    errors = []
    if len(string) > 10:
        # "For technical reasons, anagrams longer than 10 letters
        # cannot be searched."
        errors.append(u"Af tæknilegum ástæðum er ekki hægt að finna stafarugl sem er lengra en 10 stafir.")
    else:
        # earlier construction via itertools:
        # perms = list(set([u''.join(p) for p in permutations(list(string), len(string))]))
        perms = set(all_perms(to_unicode_or_bust(string)))

        # Set intersection is by far the fastest approach:
        # about 35 milliseconds for a 7-letter word.
        pmatched = list(perms.intersection(words))
        logging.info(pmatched)

        # Slower alternatives that were benchmarked:
        #   pmatched = [x for x in words if x in perms]
        #     -- 18 s for a 7-letter word, 1.8 s for a 6-letter word
        #   for word in words:
        #       for p in perms:
        #           if p == word:
        #               pmatched.append(word)
        #     -- 40 s for a 7-letter word, 4 s for a 6-letter word
        #   print set(words) & set(perms)
        #     -- worst: rebuilds both sets on every call

    return {'matches': pmatched, 'errors': errors}
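`all_perms` is not shown in the snippet. A minimal stand-in built on itertools.permutations (the name and behavior are assumptions inferred from the call site):

from itertools import permutations

def all_perms(string):
    # Yield every ordering of the characters in `string`; duplicates
    # are collapsed by the set() at the call site.  A 7-letter word
    # yields at most 7! = 5040 candidates, so hashing them into a set
    # and intersecting with `words` finishes in milliseconds.
    for p in permutations(string):
        yield u''.join(p)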
Example #3
def post(self):
    # JSON endpoint dispatching to a regex search or a permutation
    # (anagram) search, depending on the 'which' POST parameter.
    which = self.request.get('which')
    perms = self.request.get('perms', None)
    regex = self.request.get('regex', None)
    d = {}
    if which == 'regex':
        # Pad the pattern with whitespace anchors so it matches whole
        # words inside the word-list text.
        regex = ''.join([r'\s', to_unicode_or_bust(regex), r'\s'])
        rmatches = regex_search2(regex)
        d.update(regex=rmatches)
    if which == 'perms':
        pmatches = perm_search(perms)
        d.update(perms=pmatches)
    self.response.out.write(json.dumps(d))
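The `self.request.get` / `self.response.out.write` calls are Google App Engine's webapp/webapp2 handler interface. Wiring the method into an application might look like the sketch below (the handler class name and route are assumptions; only post() comes from the source):

import webapp2

app = webapp2.WSGIApplication([
    # Hypothetical handler class containing the post() method above.
    ('/search', SearchHandler),
])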
Example #4
def __insert_string_to_buffer(self, string, handle=None):
    # Decode `string` with the document's encoding, re-encode it as
    # UTF-8 and load it into the buffer without recording undo history.
    try:
        unicode_string = to_unicode_or_bust(string, self.__encoding)
        utf8_string = unicode_string.encode("utf-8")
        buf = self.__document.Buffer
        buf.begin_not_undoable_action()
        buf.set_text(utf8_string)
        buf.end_not_undoable_action()
        buf.set_modified(False)
    except (UnicodeDecodeError, ValueError):
        # Both failures mean the text could not be decoded.
        self._error(i.exception_unicode_decode)
    except UnicodeEncodeError:
        self._error(i.exception_unicode_encode)
    return
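The decode branch fires when the configured encoding does not match the bytes in `string`. Assuming the conventional `to_unicode_or_bust` sketched after Example #1, it is easy to trigger:

try:
    to_unicode_or_bust('\xc3\xa9', 'ascii')  # UTF-8 bytes for 'é'
except UnicodeDecodeError:
    print 'document encoding does not match its contents'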
Example #5
def save_file(self, document, filename=None, encoding='utf-8'):
    # TODO: Implement File Save operation
    buf = document.Buffer
    string = buf.get_text(buf.get_start_iter(), buf.get_end_iter())
    target = filename or document.filename
    unicode_string = to_unicode_or_bust(string, encoding)
    handler = open(target, 'w')
    # Encode before writing: the file is opened in byte mode.
    handler.write(unicode_string.encode(encoding))
    handler.flush()
    handler.close()
    document.last_modified_time = get_last_modification(target)
    # 1. Check for other programs' modifications by comparing the stored
    #    last-modified time with the file's current one
    # 2. Check for permission to save
    # 3. Encode file before writing to disk
    # 4. Write to a tmp file
    # 5. Copy tmp file over original file (for crash prevention)
    # 6. Delete old file
    return True
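Steps 4-6 of the TODO describe the standard crash-safe save pattern: write the new contents to a temporary file in the target directory, then rename it over the original. A minimal sketch (the helper name is mine, not from the source):

import os
import tempfile

def atomic_write(path, data):
    # os.rename is atomic on POSIX when source and destination are on
    # the same filesystem, so a crash mid-save leaves either the old
    # file or the new one, never a half-written mix.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
    try:
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(data)
        os.rename(tmp_path, path)
    except Exception:
        os.remove(tmp_path)
        raise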
Example #6
def read_file(filename):
    # Read a word list from the static directory, one entry per line,
    # decoding each line to unicode.
    path = os.path.join(os.path.dirname(__file__), 'static', filename)
    with open(path) as f:
        return [to_unicode_or_bust(line.strip()) for line in f]
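This is presumably how the `words` collection used by perm_search in Example #2 gets populated; storing it as a set is what makes the intersection there fast. A usage sketch (the set() conversion and the file name, taken from Example #1, are assumptions):

words = set(read_file('ordmyndalisti.txt'))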
Example #7
def create_corpus_from_wiki(self, corpus_root, filename, output_dir):
    # Mine spelling errors from successive wiki revisions: a sentence in
    # a later revision that differs from a first-revision sentence only
    # by small token edits is recorded as an error variant of it.
    create_error_corpus = False
    valid_word_pat = ur'(?u)^\w+$'
    sentences = utils.get_sentences_for_text(corpus_root, filename)
    if sentences is None:
        return
    top_rev = []
    try:
        for s_list in sentences:
            s = ' '.join(s_list)
            if s.startswith('[Revision timestamp:'):
                self.num_rev += 1
            elif self.num_rev == 1:
                # First revision: keep sentences that are long enough.
                if len(s_list) >= self.min_sen_len:
                    top_rev.append(RevisionSentence(s_list))
            elif self.num_rev > 1:
                # Later revisions: align with first-revision sentences of
                # equal length and compare them token by token.
                for r in top_rev:
                    if len(s_list) != len(r.orig_tokens):
                        continue
                    valid_errors = True
                    errors = False
                    for old_tok, new_tok in zip(r.orig_tokens, s_list):
                        dist = utils.levenshtein_distance(old_tok, new_tok)
                        if 0 < dist <= self.max_dist:
                            # The changed token must be a word.
                            orig_uni = utils.to_unicode_or_bust(old_tok)
                            if re.search(valid_word_pat, orig_uni):
                                errors = True
                        elif dist > self.max_dist:
                            # Too different: a rewrite, not a typo.
                            valid_errors = False
                            break
                    if errors and valid_errors:
                        print 'error sentence found'
                        r.add_err_sentence(s_list)
                        create_error_corpus = True
                        break
    except AssertionError:
        print 'Empty file'

    if create_error_corpus:
        with codecs.open(output_dir + '/' + filename, 'w', 'utf-8',
                         errors='ignore') as f:
            for r in top_rev:
                if r.contains_spelling_errors():
                    orig_sen = ' '.join(r.orig_tokens)
                    err_as_sen = map(lambda x: ' '.join(x), r.err_sen)
                    orig_err_sen = [orig_sen] + err_as_sen
                    to_write = '####'.join(orig_err_sen)
                    to_write_uni = unicode(to_write, encoding='utf-8',
                                           errors='ignore')
                    f.write(to_write_uni + u'\n')
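`utils.levenshtein_distance` is not shown. The standard dynamic-programming edit distance it presumably implements, as a self-contained sketch:

def levenshtein_distance(a, b):
    # Row-by-row DP: cell j of the current row holds the minimum number
    # of insertions, deletions and substitutions turning a[:i] into b[:j].
    prev = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1,          # deletion
                            curr[j - 1] + 1,      # insertion
                            prev[j - 1] + cost))  # substitution
        prev = curr
    return prev[-1]

For example, levenshtein_distance('cat', 'cut') is 1, so with max_dist >= 1 that token pair counts as a candidate spelling error.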