Ejemplo n.º 1
0
def makeutf8(rawstring):
    """
    Hand ``rawstring`` to ``plugin_utils.makeutf8`` and return its result.

    DEPRECATED: this is only a forwarding alias; prefer calling the
    plugin_utils version directly.
    """
    converted = plugin_utils.makeutf8(rawstring)
    return converted
Ejemplo n.º 2
0
 def _internal_print(self, i, lvl=0):
     """
     Render a single value as a printable string, dispatching on its type.

     OrderedDict, dict, list and tuple values are delegated to the matching
     self._uprint_* method, one nesting level deeper (lvl + 1). Numbers are
     converted with str() and passed through makeutf8. Anything else is
     passed through makeutf8 and wrapped in single quotes.

     Returns a 2-tuple: the rendered string and the unchanged lvl.
     """
     # OrderedDict is tested first because it is also a dict. The whole
     # dispatch must be one if/elif chain: once a container has been rendered
     # to a string it must not fall through to the quoting branch below.
     if isinstance(i, OrderedDict):
         i = self._uprint_ordered_dict(i, lvl=lvl + 1)
     elif isinstance(i, dict):
         i = self._uprint_dict(i, lvl=lvl + 1)
     elif isinstance(i, list):
         i = self._uprint_list(i, lvl=lvl + 1)
     elif isinstance(i, tuple):
         i = self._uprint_tuple(i, lvl=lvl + 1)
     elif isinstance(i, (int, long, float, complex)):
         i = makeutf8(str(i))
     else:
         i = "'" + makeutf8(i) + "'"
     return i, lvl
Ejemplo n.º 3
0
    def _uprint_dict(self, mydict, lvl=0, html=False):
        """
        Build a brace-delimited, one-entry-per-line rendering of a dict.

        Keys are passed through makeutf8 and values are rendered with
        self._internal_print. The ``html`` argument is accepted but not used
        in this method. Returns the assembled string.
        """
        assert isinstance(mydict, dict)
        pad = '    ' * lvl if lvl > 0 else ''
        pieces = ['\n' + pad + '{\n']
        for key, value in mydict.iteritems():
            # lvl is rebound to whatever _internal_print hands back; the
            # padding computed above is not recalculated.
            rendered, lvl = self._internal_print(value, lvl)
            pieces.append(pad + ' ' + makeutf8(key) + ': ' + rendered + ',\n')
        pieces.append('\n' + pad + ' }')
        return ''.join(pieces)
Ejemplo n.º 4
0
    def _uprint_ordered_dict(self, myod, lvl=0, html=False):
        """
        Build an ``OrderedDict([...])``-style rendering of an OrderedDict.

        Each entry is written as a ('key', value) pair on its own line, with
        keys passed through makeutf8 and values rendered by
        self._internal_print. The ``html`` argument is accepted but not used
        in this method. Returns the assembled string.
        """
        assert isinstance(myod, OrderedDict)
        pad = '    ' * lvl if lvl > 0 else ''
        pieces = ['\n' + pad + 'OrderedDict([\n']
        for key, value in myod.iteritems():
            # lvl is rebound to whatever _internal_print hands back; the
            # padding computed above is not recalculated.
            rendered, lvl = self._internal_print(value, lvl)
            pieces.append(pad + " ('" + makeutf8(key) + "', " + rendered + '),\n')
        pieces.append('\n' + pad + ' ])')
        return ''.join(pieces)
Ejemplo n.º 5
0
    def validate(self, validlfs, failedlfs={}):
        """
        compare ordered list of word tokens to definition of valid natural language expressions.

        clause should be Clause() object
        """
        # validate constituent structures recursively
        if self.structures:
            for s in self.structures:
                print 'validating', s
                validlfs, failedlfs = s.validate(validlfs, failedlfs)
                if len(validlfs) < 1:
                    print 'failed while validating', s
                    return validlfs, failedlfs

        # find matching string for remaining viable leaves
        if self.restring:
            old_validlfs = copy(validlfs)  # need to compare with newly parsed to find newly added words
            print 'checking for restring', makeutf8(self.restring)
            validlfs, failedlfs = self.match_string(validlfs, failedlfs)
            for k, lf in validlfs.iteritems():

                self.matching_words[k]
            if len(validlfs) < 1:  # no valid leaves, validation fails
                return validlfs, failedlfs

        # test sub-structure order for any viable leaves
        if self.structures:
            validlfs, failedlfs = self.test_order(validlfs, failedlfs)
            if len(validlfs) < 1:  # no valid leaves, validation fails
                return validlfs, failedlfs

        # find any untagged words if this structure is top level
        # if found, validation fails
        if self.top:
            for v in validlfs.values():
                for w in v: print w,
                print ''
            for idx, leaf in validlfs.iteritems():
                untagged = [t for t in leaf if 'pos' not in t[1].keys()]
                if untagged:
                    failedlfs[idx] = validlfs[idx]
                    validlfs = {k: v for k, v in validlfs.iteritems() if k != idx}
                    print clr(['some extra words left over'], self.myc)
                    for w in untagged: print w
            if not validlfs:
                return validlfs, failedlfs
            else:
                pass

        return validlfs, failedlfs
Ejemplo n.º 6
0
 def strip_extra_spaces(self, strings):
     """
     Collapse repeated spaces and trim surrounding whitespace.

     Accepts either one string or a list of strings. Each is passed through
     makeutf8 and unicode() before cleaning. Returns the list of cleaned
     strings, or the bare cleaned string when exactly one was processed.
     """
     batch = strings if isinstance(strings, list) else [strings]
     cleaned = []
     for raw in batch:
         text = unicode(makeutf8(raw))
         # Dropping the empty pieces of a split on ' ' and re-joining with a
         # single space collapses every run of spaces into one.
         text = ' '.join([piece for piece in text.split(' ') if piece])
         cleaned.append(text.strip())  # trim leading/trailing whitespace
     return cleaned[0] if len(cleaned) == 1 else cleaned
Ejemplo n.º 7
0
def tokenize(str):
    """
    Divide a string into clauses and each clause into word tokens.

    The string is split into clauses on any of the characters . ? ; : ,
    and each clause is split into words on single spaces.

    Returns a list with one dictionary per clause (or fragment). Each
    dictionary has the single key 0 whose value is a list of 2-member tuples:
    [0] a word from the clause (passed through makeutf8 and unicode()), and
    [1] a dictionary with the key 'index' giving the position of that word in
    the clause. The tuples are ordered as the words appear in the original
    string.

    Because the split is on single spaces, empty-string tokens are produced
    for leading, trailing or doubled spaces; they are kept and counted in the
    positions.
    """
    clauses = re.split(r'[\.\?;:,]', str)
    tokenized = []
    for c in clauses:
        words = c.split(' ')
        # enumerate gives every occurrence its own position; words.index(t)
        # would report the first occurrence for each repeat of a word.
        token_list = [(unicode(makeutf8(t)), {'index': position})
                      for position, t in enumerate(words)]
        tokenized.append({0: token_list})
    return tokenized
Ejemplo n.º 8
0
    def match_string(self, validleaves, failedleaves,
                     restring=None, classname=None):
        '''
        Identify token strings that match the current construction.

        validleaves
        : An OrderedDict with the words as keys and dictionaries (or None) for
        values. The dictionaries may have the keys 'pos' (str, classname for
        a part of speech) and 'modifies' (int, index of another word modified
        by this one).
        '''
        matching_words = {}
        restring = unicode(makeutf8(self.restring)) if not restring \
            else unicode(makeutf8(restring))
        classname = self.classname if not classname else classname

        test = re.compile(restring, re.U | re.I)

        validstring = ' '.join([i[0] for i in validleaves[0]])
        mymatch = test.findall(validstring)
        print 'mymatch is'
        for m in mymatch:
            print makeutf8(m)

        def tag_token(matchstring, leafkey, leafcopy):
            print 'in tag_token---------------------------'
            print 'matchstring:', matchstring
            matchindex = [l[1]['index'] for l in leafcopy
                          if l[0] == matchstring][0]
            print 'matchindex:', matchindex
            mydict = leafcopy[matchstring]
            print 'mydict ======================='
            mydict
            if 'pos' not in mydict.keys():  # only tag word if currently untagged
                mydict['pos'] = classname
                if 'current' in mydict.keys():
                    mydict['current'] += 1
                else:
                    mydict['current'] = 0
                print 'tagged as:', classname
                print leafcopy

                # add matching words (with index) directly to instance variable
                self.matching_words[leafkey] = {'word': matchstring,
                                                'index': matchindex}
            else:
                print 'already tagged, leaf invalid'
                return False


            return leafcopy

        newvalids = {}
        newfaileds = {}
        if mymatch:
            print 'mymatch', mymatch
            for key, leaf in validleaves.iteritems():
                print 'tagging leaf:', leaf
                for m in mymatch:
                    if m in [w[0] for w in leaf]:  # because findall and case
                        print 'tagging leaf with', makeutf8(m), '==============='
                        taggedleaf = tag_token(m, key, deepcopy(leaf))
                        if taggedleaf and taggedleaf not in newvalids.values():
                            if key in newvalids.keys():
                                newkey = max(newvalids.keys()) + 1
                            else:
                                newkey = key
                            print 'appending valid leaf', newkey
                            newvalids[newkey] = taggedleaf

                        elif not taggedleaf and leaf not in newfaileds.values():
                            if key in newfaileds.keys():
                                newkey = max(newfaileds.keys()) + 1
                            else:
                                newkey = key
                            print 'appending failed leaf', newkey
                            newfaileds[newkey] = leaf
                        else:
                            print 'leaf is duplicate'
                    else:
                        print 'match', m, 'is not in string'
                        pass
        else:
            print 'no match'
            newfaileds = deepcopy(validleaves)
            newvalids = {}

        return newvalids, newfaileds
Ejemplo n.º 9
0
def gather_word_forms():
    """
    Gather Greek word forms from stored text and optionally save new ones.

    Builds a form with SQLFORM.factory. Once the form is accepted:

    1. the selected 'search_field' is read from every row of the search
       table;
    2. every run of characters in the ranges U+0370-U+03FF and
       U+1F00-U+1FFF is extracted and passed through GreekNormalizer;
    3. forms found in the hard-coded exclusion list below are dropped and
       the rest are lower-cased;
    4. if 'unique' is checked, duplicates are removed;
    5. if 'new' is checked, forms already present in db.word_forms (as is,
       capitalized or lower-cased) are dropped;
    6. unless 'testing' is checked, the remaining forms are bulk-inserted
       into db.word_forms.word_form. The form's write_table / write_field
       values are not consulted for this.

    Returns a dict with 'form' (the form object) and 'items' (the gathered
    word forms, or a BEAUTIFY rendering of the form errors).
    """

    items = []
    db = current.db
    x = ['πιλ', 'βοδυ', 'μειδ', 'νηλ', 'ἰλ', 'σαγγ', 'ἁμ', 'ἱτ', 'ἑλπ', 'ἑλω', 'ο',
         'βοτ', 'ὁλ', 'ὁγ', 'παθ', 'τιψ', 'β', 'σωλ', 'κορπ', 'ὡλ', 'κατς', 'γγς',
         'μωλτεγγ', 'δεκ', 'φιξ', 'βαλ', 'διλ', 'δαξ', 'δρομα', 'δακ', 'δαγ', 'ἁγ',
         'λοξ', 'δυδ', 'βωθ', 'ὐψ', 'καν', 'καβ', 'ὀτ', 'βαδ', 'μωστ', 'μοισδ',
         'μιλ', 'βελ', 'ἑδ', 'θοτ', 'κιλ', 'κρω', 'βοχ', 'ω', 'μεντ', 'ἁτ', 'νεατ',
         'σπηρ', 'βοδι', 'πιτ', 'βονδ', 'ἁρδ', 'δοκς', 'μελτ', 'βεδ', 'μαλ', 'δατς',
         'σωπ', 'α', 'πενσιλ', 'κς', 'δεκς', 'αριας', 'βαγγ', 'σετ', 'βρουμ', 'ἀδ',
         'πωλ', 'δατ', 'ἁγγ', 'πραυδ', 'αὐτης', 'νειλ', 'σογγ', 'ζαπ', 'κλαδ',
         'νιτ', 'φαξ', 'βολ', 'κεπτ', 'μοιστ', 'ἁμερ', 'τουνα', 'προγγ', 'τ',
         'κλυν', 'λοβ', 'πλειαρ', 'κροπ', 'βανδ', 'μωλτεν', 'υτ', 'κοτ', 'κοπ',
         'ἀτ', 'φυξ', 'ὡλι', 'μυτ', 'θατ', 'δοτ', 'βικς', 'ἁμαρ', 'λωφερ', 'δοκ',
         'ταπ', 'ἀβωδ', 'ὑτος', 'λωφρ', 'ἁμρ', 'ροκ', 'πς', 'βαδυ', 'οὐψ', 'πραγγ',
         'σπειρ', 'ἀγγλ', 'σλαψ', 'πλαυ', 'δραμα', 'φοξ', 'ἱτεδ', 'ὁτ', 'δογ',
         'δολ', 'ρω', 'δοξ', 'ὗτος', 'μιτ', 'αὑ', 'ἱτς', 'μωλτ', 'βατ', 'βαχ',
         'βικ', 'μιαλ', 'μολ', 'μιελ', 'κον', 'μωισδ', 'κραπ', 'καπ', 'ὑπ', 'ἀγκλ',
         'λιξ', 'ρωλ', 'λαβ', 'ὀδ', 'λαξ', 'δοτς', 'ἀνκλ', 'ρακ', 'πεγ', 'τυνα',
         'βρυμ', 'καρπ', 'βρεδ', 'κιπ', 'μηδ', 'δαλ', 'βετ', 'διπ', 'κλιν', 'πετ',
         'βαδι', 'λικς', 'δακς', 'πς', 'ὑπ', 'κς', 'α', 'ος', 'μιτ', 'βρεδ', 'ί',
         'ο', 'νεατ', 'δι', 'Ω', 'τ', 'υτ', 'η', 'ον', 'β', 'α', 'δεξ', 'παι']
    x = [makeutf8(word) for word in x]

    form = SQLFORM.factory(Field('search_table', default='steps',
                                 writable=False),
                           Field('search_field',
                                 requires=IS_IN_SET(['prompt',
                                                     'readable_response'])),
                           Field('write_table', default='word_forms',
                                 writable=False),
                           Field('write_field', default='word_form',
                                 writable=False),
                           Field('unique', 'boolean', default=True),
                           Field('testing', 'boolean', default=True),
                           Field('new', 'boolean', default=True))
    form.vars.search_table = 'steps'
    form.vars.write_table = 'word_forms'
    form.vars.write_field = 'word_form'

    if form.process().accepted:
        vv = form.vars
        # NOTE(review): SECURITY -- eval() on values taken from form vars
        # executes arbitrary expressions. The form defines no filter_func or
        # trans_func field and neither result is used below; confirm nothing
        # relies on these lines and remove them, or replace eval with a
        # lookup in a whitelist of known functions.
        filter_func = eval(vv.filter_func) if vv.filter_func else None
        trans_func = eval(vv.trans_func) if vv.trans_func else None

        rows = db(db[vv.search_table].id > 0).select()
        items = [r[vv['search_field']] for r in rows]

        ptrn = re.compile(u'[\u0370-\u03FF\u1F00-\u1FFF]+', flags=re.U)
        items = flatten([re.findall(ptrn, makeutf8(i)) for i in items])
        normalizer = GreekNormalizer()
        items = [normalizer.normalize(i) for i in items]
        # The exclusion list is checked against the normalized form, before
        # lower-casing.
        items = [i.lower() for i in items if i not in x]
        if vv.unique:
            # Deduplicate after lower-casing so that forms differing only in
            # case do not survive as duplicates.
            items = list(set(items))

        if vv.new:
            existing = [makeutf8(r['word_form']) for r in
                        db(db.word_forms.id > 0).select(db.word_forms.word_form)]
            items = [i for i in items if i not in existing
                     and i.capitalize() not in existing
                     and i.lower() not in existing]
        if vv.testing:
            response.flash = 'Success, but nothing written to database.'
        else:
            newdata = [{'word_form': item} for item in items]
            rowcount = db.word_forms.bulk_insert(newdata)
            # Build one message string; a comma-separated expression here
            # would assign a tuple to response.flash.
            response.flash = ('Success. Added %s new word forms.'
                              % len(rowcount))

    elif form.errors:
        items = BEAUTIFY(form.errors)

    return {'form': form, 'items': items}