def makeutf8(rawstring):
    """
    Return the string decoded as utf8 if it wasn't already.

    DEPRECATED IN FAVOUR OF PLUGIN_UTILS VERSION

    Kept only as a thin backward-compatibility shim: all work is
    delegated to plugin_utils.makeutf8.
    """
    return plugin_utils.makeutf8(rawstring)
def _internal_print(self, i, lvl=0):
    """
    Render a single value as a utf-8 string for pretty-printing.

    Dispatches on the value's type: container values are delegated to
    the matching _uprint_* helper one indent level deeper; numbers are
    stringified; anything else is treated as text and wrapped in
    single quotes.

    Returns a (rendered_string, lvl) tuple so callers can keep
    threading the indentation level through.
    """
    # NOTE: OrderedDict must be tested before dict (it is a dict
    # subclass). Using one elif chain also fixes the original bug where
    # a rendered OrderedDict string fell through to the final else and
    # was wrapped in spurious quotes.
    if isinstance(i, OrderedDict):
        i = self._uprint_ordered_dict(i, lvl=lvl + 1)
    elif isinstance(i, dict):
        i = self._uprint_dict(i, lvl=lvl + 1)
    elif isinstance(i, list):
        i = self._uprint_list(i, lvl=lvl + 1)
    elif isinstance(i, tuple):
        i = self._uprint_tuple(i, lvl=lvl + 1)
    elif isinstance(i, (int, long, float, complex)):
        i = makeutf8(str(i))
    else:
        # assume a plain/unicode string; quote it for display
        i = "'" + makeutf8(i) + "'"
    return i, lvl
def _uprint_dict(self, mydict, lvl=0, html=False):
    """
    Render a plain dict as a readable multi-line unicode string.

    Each value is formatted via _internal_print; keys are coerced to
    utf-8. The html flag is accepted for interface compatibility but
    is not used here.
    """
    assert isinstance(mydict, dict)
    pad = ' ' * lvl if lvl > 0 else ''
    parts = ['\n' + pad + '{\n']
    for key, val in mydict.iteritems():
        # _internal_print hands back the rendered value together with
        # the (possibly updated) level, which carries into later items
        val, lvl = self._internal_print(val, lvl)
        parts.append(pad + ' ' + makeutf8(key) + ': ' + val + ',\n')
    parts.append('\n' + pad + ' }')
    return ''.join(parts)
def _uprint_ordered_dict(self, myod, lvl=0, html=False):
    """
    Render an OrderedDict as a readable multi-line unicode string in
    OrderedDict([('key', value), ...]) form.

    Values are formatted via _internal_print; keys are coerced to
    utf-8. The html flag is accepted for interface compatibility but
    is not used here.
    """
    assert isinstance(myod, OrderedDict)
    pad = ' ' * lvl if lvl > 0 else ''
    chunks = ['\n' + pad + 'OrderedDict([\n']
    for key, val in myod.iteritems():
        # level threading mirrors _uprint_dict: the helper may bump it
        val, lvl = self._internal_print(val, lvl)
        chunks.append(pad + " ('" + makeutf8(key) + "', " + val + '),\n')
    chunks.append('\n' + pad + ' ])')
    return ''.join(chunks)
def validate(self, validlfs, failedlfs={}): """ compare ordered list of word tokens to definition of valid natural language expressions. clause should be Clause() object """ # validate constituent structures recursively if self.structures: for s in self.structures: print 'validating', s validlfs, failedlfs = s.validate(validlfs, failedlfs) if len(validlfs) < 1: print 'failed while validating', s return validlfs, failedlfs # find matching string for remaining viable leaves if self.restring: old_validlfs = copy(validlfs) # need to compare with newly parsed to find newly added words print 'checking for restring', makeutf8(self.restring) validlfs, failedlfs = self.match_string(validlfs, failedlfs) for k, lf in validlfs.iteritems(): self.matching_words[k] if len(validlfs) < 1: # no valid leaves, validation fails return validlfs, failedlfs # test sub-structure order for any viable leaves if self.structures: validlfs, failedlfs = self.test_order(validlfs, failedlfs) if len(validlfs) < 1: # no valid leaves, validation fails return validlfs, failedlfs # find any untagged words if this structure is top level # if found, validation fails if self.top: for v in validlfs.values(): for w in v: print w, print '' for idx, leaf in validlfs.iteritems(): untagged = [t for t in leaf if 'pos' not in t[1].keys()] if untagged: failedlfs[idx] = validlfs[idx] validlfs = {k: v for k, v in validlfs.iteritems() if k != idx} print clr(['some extra words left over'], self.myc) for w in untagged: print w if not validlfs: return validlfs, failedlfs else: pass return validlfs, failedlfs
def strip_extra_spaces(self, strings):
    """
    Remove leading, trailing, and multiple internal spaces from string.

    Accepts either a single string or a list of strings and returns
    the same shape it was given (a lone string comes back as a string,
    a list comes back as a list).
    """
    strings = [strings] if not isinstance(strings, list) else strings
    newstrings = []
    for string in strings:
        user_response = unicode(makeutf8(string))
        # collapse each run of multiple spaces down to one.
        # FIX: the original replaced ' ' with ' ' (a no-op), which made
        # this loop spin forever on any string containing a space.
        while '  ' in user_response:
            user_response = user_response.replace('  ', ' ')
        # remove leading and trailing spaces
        user_response = user_response.strip()
        newstrings.append(user_response)
    if len(newstrings) == 1:
        newstrings = newstrings[0]
    return newstrings
def tokenize(str):
    """
    Divide a string into clauses and each clause into word tokens.

    Returns a list of lists, each of which represents one clause (or
    fragment). The members of each list are 2-member tuples including
    [0] a word from the clause, and [1] a dictionary. Each dictionary
    contains the key 'index' with an integer giving the position of
    the word in the clause. These tuples are ordered according to the
    appearance of the words in the original string.

    NOTE: the parameter name 'str' shadows the builtin; kept for
    backward compatibility with existing callers.
    """
    clauses = re.split(r'[\.\?;:,]', str)
    tokenized = []
    for c in clauses:
        words = c.split(' ')
        # FIX: enumerate gives every token its true position; the
        # original words.index(t) returned the FIRST occurrence, so
        # duplicated words all shared one wrong index.
        token_list = [(unicode(makeutf8(t)), {'index': n})
                      for n, t in enumerate(words)]
        tokenized.append({0: token_list})
    return tokenized
def match_string(self, validleaves, failedleaves, restring=None, classname=None): ''' Identify token strings that match the current construction. validleaves : An OrderedDict with the words as keys and dictionaries (or None) for values. The dictionaries may have the keys 'pos' (str, classname for a part of speech) and 'modifies' (int, index of another word modified by this one). ''' matching_words = {} restring = unicode(makeutf8(self.restring)) if not restring \ else unicode(makeutf8(restring)) classname = self.classname if not classname else classname test = re.compile(restring, re.U | re.I) validstring = ' '.join([i[0] for i in validleaves[0]]) mymatch = test.findall(validstring) print 'mymatch is' for m in mymatch: print makeutf8(m) def tag_token(matchstring, leafkey, leafcopy): print 'in tag_token---------------------------' print 'matchstring:', matchstring matchindex = [l[1]['index'] for l in leafcopy if l[0] == matchstring][0] print 'matchindex:', matchindex mydict = leafcopy[matchstring] print 'mydict =======================' mydict if 'pos' not in mydict.keys(): # only tag word if currently untagged mydict['pos'] = classname if 'current' in mydict.keys(): mydict['current'] += 1 else: mydict['current'] = 0 print 'tagged as:', classname print leafcopy # add matching words (with index) directly to instance variable self.matching_words[leafkey] = {'word': matchstring, 'index': matchindex} else: print 'already tagged, leaf invalid' return False return leafcopy newvalids = {} newfaileds = {} if mymatch: print 'mymatch', mymatch for key, leaf in validleaves.iteritems(): print 'tagging leaf:', leaf for m in mymatch: if m in [w[0] for w in leaf]: # because findall and case print 'tagging leaf with', makeutf8(m), '===============' taggedleaf = tag_token(m, key, deepcopy(leaf)) if taggedleaf and taggedleaf not in newvalids.values(): if key in newvalids.keys(): newkey = max(newvalids.keys()) + 1 else: newkey = key print 'appending valid leaf', newkey 
newvalids[newkey] = taggedleaf elif not taggedleaf and leaf not in newfaileds.values(): if key in newfaileds.keys(): newkey = max(newfaileds.keys()) + 1 else: newkey = key print 'appending failed leaf', newkey newfaileds[newkey] = leaf else: print 'leaf is duplicate' else: print 'match', m, 'is not in string' pass else: print 'no match' newfaileds = deepcopy(validleaves) newvalids = {} return newvalids, newfaileds
def gather_word_forms():
    """
    Return a list of all strings satisfying the supplied regex.

    The fieldnames argument should be a list, so that multiple target
    fields can be searched at once.

    The optional 'unique' keyword argument determines whether
    duplicates will be removed from the list. (Defaults to True.)

    The optional 'filterfunc' keyword argument allows a function to be
    passed which which will be used to alter the gathered strings.
    This alteration will happen before duplicate values are removed.
    So, for example, the strings can be normalized for case or accent
    characters if those variations are not significant.

    NOTE(review): this is a web2py controller-style action — it builds
    a SQLFORM, processes it, and returns a dict for the view.
    """
    items = []
    db = current.db
    # hard-coded exclusion list of (transliterated Greek) word forms
    # that should never be written to word_forms
    x = ['πιλ', 'βοδυ', 'μειδ', 'νηλ', 'ἰλ', 'σαγγ', 'ἁμ', 'ἱτ', 'ἑλπ',
         'ἑλω', 'ο', 'βοτ', 'ὁλ', 'ὁγ', 'παθ', 'τιψ', 'β', 'σωλ', 'κορπ',
         'ὡλ', 'κατς', 'γγς', 'μωλτεγγ', 'δεκ', 'φιξ', 'βαλ', 'διλ',
         'δαξ', 'δρομα', 'δακ', 'δαγ', 'ἁγ', 'λοξ', 'δυδ', 'βωθ', 'ὐψ',
         'καν', 'καβ', 'ὀτ', 'βαδ', 'μωστ', 'μοισδ', 'μιλ', 'βελ', 'ἑδ',
         'θοτ', 'κιλ', 'κρω', 'βοχ', 'ω', 'μεντ', 'ἁτ', 'νεατ', 'σπηρ',
         'βοδι', 'πιτ', 'βονδ', 'ἁρδ', 'δοκς', 'μελτ', 'βεδ', 'μαλ',
         'δατς', 'σωπ', 'α', 'πενσιλ', 'κς', 'δεκς', 'αριας', 'βαγγ',
         'σετ', 'βρουμ', 'ἀδ', 'πωλ', 'δατ', 'ἁγγ', 'πραυδ', 'αὐτης',
         'νειλ', 'σογγ', 'ζαπ', 'κλαδ', 'νιτ', 'φαξ', 'βολ', 'κεπτ',
         'μοιστ', 'ἁμερ', 'τουνα', 'προγγ', 'τ', 'κλυν', 'λοβ', 'πλειαρ',
         'κροπ', 'βανδ', 'μωλτεν', 'υτ', 'κοτ', 'κοπ', 'ἀτ', 'φυξ',
         'ὡλι', 'μυτ', 'θατ', 'δοτ', 'βικς', 'ἁμαρ', 'λωφερ', 'δοκ',
         'ταπ', 'ἀβωδ', 'ὑτος', 'λωφρ', 'ἁμρ', 'ροκ', 'πς', 'βαδυ',
         'οὐψ', 'πραγγ', 'σπειρ', 'ἀγγλ', 'σλαψ', 'πλαυ', 'δραμα',
         'φοξ', 'ἱτεδ', 'ὁτ', 'δογ', 'δολ', 'ρω', 'δοξ', 'ὗτος', 'μιτ',
         'αὑ', 'ἱτς', 'μωλτ', 'βατ', 'βαχ', 'βικ', 'μιαλ', 'μολ',
         'μιελ', 'κον', 'μωισδ', 'κραπ', 'καπ', 'ὑπ', 'ἀγκλ', 'λιξ',
         'ρωλ', 'λαβ', 'ὀδ', 'λαξ', 'δοτς', 'ἀνκλ', 'ρακ', 'πεγ',
         'τυνα', 'βρυμ', 'καρπ', 'βρεδ', 'κιπ', 'μηδ', 'δαλ', 'βετ',
         'διπ', 'κλιν', 'πετ', 'βαδι', 'λικς', 'δακς', 'πς', 'ὑπ', 'κς',
         'α', 'ος', 'μιτ', 'βρεδ', 'ί', 'ο', 'νεατ', 'δι', 'Ω', 'τ',
         'υτ', 'η', 'ον', 'β', 'α', 'δεξ', 'παι']
    x = [makeutf8(word) for word in x]
    # search/write table fields are fixed; only search_field and the
    # boolean switches are user-editable
    form = SQLFORM.factory(Field('search_table', default='steps',
                                 writable=False),
                           Field('search_field',
                                 requires=IS_IN_SET(['prompt',
                                                     'readable_response'])),
                           Field('write_table', default='word_forms',
                                 writable=False),
                           Field('write_field', default='word_form',
                                 writable=False),
                           Field('unique', 'boolean', default=True),
                           Field('testing', 'boolean', default=True),
                           Field('new', 'boolean', default=True))
    form.vars.search_table = 'steps'
    form.vars.write_table = 'word_forms'
    form.vars.write_field = 'word_form'
    if form.process().accepted:
        vv = form.vars
        # NOTE(review): eval() of form input is a code-injection risk;
        # also, the form defines no filter_func/trans_func fields, so
        # these look like they are always None — confirm and remove or
        # replace eval with a safe lookup table.
        filter_func = eval(vv.filter_func) if vv.filter_func else None
        trans_func = eval(vv.trans_func) if vv.trans_func else None
        rows = db(db[vv.search_table].id > 0).select()
        for r in rows:
            items.append(r[vv['search_field']])
        # extract runs of Greek characters (basic Greek + extended)
        ptrn = re.compile(u'[\u0370-\u03FF\u1F00-\u1FFF]+', flags=re.U)
        items = flatten([re.findall(ptrn, makeutf8(i)) for i in items])
        normalizer = GreekNormalizer()
        items = [normalizer.normalize(i) for i in items]
        if vv.unique:
            items = list(set(items))
        # lower-case and drop anything on the exclusion list
        items = [i.lower() for i in items if i not in x]
        if vv.new:
            # keep only forms not already stored in any capitalisation
            existing = [makeutf8(r['word_form']) for r in
                        db(db.word_forms.id > 0)
                        .select(db.word_forms.word_form)]
            items = [i for i in items if i not in existing
                     and i.capitalize() not in existing
                     and i.lower() not in existing]
        if vv.testing:
            # dry run: nothing is inserted
            pass
            response.flash = 'Success, but nothing written to database.'
        else:
            newdata = [{'word_form': item} for item in items]
            rowcount = db.word_forms.bulk_insert(newdata)
            # NOTE(review): the commas make this assign a TUPLE to
            # response.flash, not a formatted message — probably meant
            # string formatting; confirm what the view expects.
            response.flash = 'Success. Added', len(rowcount), 'new word forms.'
    elif form.errors:
        items = BEAUTIFY(form.errors)
    return {'form': form, 'items': items}