Python isDigit Beispiele

Programmiersprache: Python

Namespace / Paketname: ellyChar

Methode / Funktion: isDigit

Beispiele auf hotexamples.com: 38

Python isDigit – 38 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Funktion ellyChar.isDigit, die aus Open-Source-Projekten extrahiert wurden. Sie können die Beispiele bewerten, um ihre Qualität zu verbessern.

Beispiel #1

Datei anzeigen

    def getNext(self):
        """
        take the next token off the front of the buffer

        A leading run of digits is extracted together as one token;
        otherwise exactly one char is extracted.

        arguments:
            self

        returns:
            an EllyToken wrapping the extracted chars,
            or None if the buffer is empty

        exceptions:
            StemmingError (listed in the original notes; nothing in this
            method raises it directly - presumably from extract() or the
            token constructor, TODO confirm)
        """

        pending = self.buffer
        size = len(pending)
        if size == 0:
            return None

        # always take at least one char; extend only across a digit run
        count = 1
        if ellyChar.isDigit(pending[0]):
            while count < size and ellyChar.isDigit(pending[count]):
                count += 1

        return ellyToken.EllyToken(self.extract(count))

Beispiel #2

Datei anzeigen

Datei: simpleTransform.py Projekt: ivanjelinek/pyelly

    def get ( self , ts , n=N ):

        """
        collect a normalized lowercase substring for later comparisons

        The substring is stored in self.string. Scanning stops at a second
        consecutive PERIOD, at a COMMA that is not between two digits, or
        at any char that is neither alphanumeric nor in ALSO. Accepted
        PERIOD and COMMA chars are counted but not stored.

        arguments:
            self -
            ts   - list of chars to get substring from
            n    - limit on count of chars to get

        returns:
            count of chars scanned for substring, 0 if input is empty or
            the scan stopped directly in front of an alphanumeric char
        """

        total = len(ts)
        if total == 0:
            return 0                      # nothing to scan, self.string untouched

        limit = n if n <= total else total
        kept = [ ]                        # lowercased chars of the substring
        pos = 0
        cur = ''
        while pos < limit:
            prev , cur = cur , ts[pos]
            if cur == PERIOD:
                if prev == PERIOD:        # two periods in a row end the scan
                    break
            elif cur == COMMA:
                # a comma is accepted only between two digits
                inNumber = ( ellyChar.isDigit(prev) and
                             pos + 1 != limit and
                             ellyChar.isDigit(ts[pos + 1]) )
                if not inNumber:
                    break
            elif ellyChar.isLetterOrDigit(cur) or cur in ALSO:
                kept.append(cur.lower())
            else:
                break
            pos += 1

        # reject a substring that was cut off in front of an alphanumeric char
        if pos < limit and ellyChar.isLetterOrDigit(ts[pos]):
            return 0

        self.string = u''.join(kept)
        return pos

Beispiel #3

Datei anzeigen

def matchtoo(txt, pnc, ctx):
    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    The period is treated as a match (i.e. not a sentence stop) when the
    text ends in " A.M" or " P.M" (either case), whitespace follows, and
    the time in front is either a digit or a spelled-out hour from "one"
    to "twelve" that is not followed by an uppercase letter.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False

    # text must end in " A.M" or " P.M" (the final period is pnc itself)
    if (txt[-1] not in ('M', 'm') or txt[-2] != '.'
            or txt[-3] not in ('P', 'p', 'A', 'a') or txt[-4] != ' '):
        return False

    ch = txt[-5]
    if ellyChar.isDigit(ch):  # only 1 digit will be checked here!
        return True  # erring on the side of not to break sentence
    if not ellyChar.isLetter(ch):
        return False

    #
    #   the following code is needed only when number transforms are turned off
    #

    # scan backward over the word that ends at txt[-5];
    # afterward the word occupies txt[-(nn - 1):-4]
    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

    # "one" ... "eleven" are 3 to 6 letters long; the original code
    # compared the end-relative index nn itself against this range
    wordLength = nn - 5
    if wordLength < 3 or wordLength > 6:
        return False

    # any char before the word must be a separator; the original indexed
    # txt[-nn] only when nn > ln, which always raised IndexError
    if nn <= ln and txt[-nn] not in (' ', '-'):
        return False

    # take the word itself, not the text in front of it
    wd = ''.join(txt[-nn + 1:-4]).lower()

    if wd not in ('one', 'two', 'three', 'four', 'five', 'six', 'seven',
                  'eight', 'nine', 'ten', 'eleven', 'twelve'):
        return False

    # an uppercase letter after the whitespace suggests a new sentence
    return len(ctx) >= 2 and not ellyChar.isUpperCaseLetter(ctx[1])

Beispiel #4

Datei anzeigen

    def get(self, ts, n=N):
        """
        collect a normalized lowercase substring for later comparisons

        The substring is stored in self.string. A COMMA is accepted (and
        skipped) only when it follows a digit and is followed by three
        more digits within the scan limit; any other char must be
        alphanumeric or in ALSO, otherwise the scan stops.

        arguments:
            self -
            ts   - list of chars to get substring from
            n    - limit on count of chars to get

        returns:
            count of chars scanned for substring, 0 if input is empty or
            the scan stopped directly in front of an alphanumeric char
        """

        total = len(ts)
        if total == 0:
            return 0  # nothing to scan, self.string untouched

        limit = n if n <= total else total
        kept = []  # lowercased chars of the substring
        pos = 0
        cur = ''
        while pos < limit:
            prev, cur = cur, ts[pos]
            if cur == COMMA:
                # thousands separator: digit before, three digits after
                grouped = (ellyChar.isDigit(prev) and pos + 3 < limit
                           and ellyChar.isDigit(ts[pos + 1])
                           and ellyChar.isDigit(ts[pos + 2])
                           and ellyChar.isDigit(ts[pos + 3]))
                if not grouped:
                    break
            elif ellyChar.isLetterOrDigit(cur) or cur in ALSO:
                kept.append(cur.lower())
            else:
                break
            pos += 1

        # reject a substring that was cut off in front of an alphanumeric char
        if pos < limit and ellyChar.isLetterOrDigit(ts[pos]):
            return 0

        self.string = ''.join(kept)
        return pos

Beispiel #5

Datei anzeigen

Datei: stopExceptions.py Projekt: prohippo/pyelly

def matchtoo ( txt , pnc , ctx ):

    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    The period is treated as a match (i.e. not a sentence stop) when the
    text ends in " A.M" or " P.M" (either case), whitespace follows, and
    the time in front is either a digit or a spelled-out hour from "one"
    to "twelve" that is not followed by an uppercase letter.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False

    # text must end in " A.M" or " P.M" (the final period is pnc itself)
    if ( txt[-1] not in ['M','m'] or txt[-2] != '.' or
         txt[-3] not in ['P','p','A','a'] or txt[-4] != ' ' ):
        return False

    ch = txt[-5]
    if ellyChar.isDigit(ch):        # only 1 digit will be checked here!
        return True                 # erring on the side of not to break sentence
    elif not ellyChar.isLetter(ch):
        return False

#
#   the following code is needed only when number transforms are turned off
#

    # scan backward over the word ending at txt[-5];
    # afterward the word occupies txt[-(nn - 1):-4]
    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

    # hour names "one" ... "eleven" have 3 to 6 letters; the original
    # code range-checked the end-relative index nn instead of the length
    nlet = nn - 5
    if nlet < 3 or nlet > 6:
        return False

    # a char in front of the word, if any, must be a separator; the
    # original read txt[-nn] only when nn > ln, which raised IndexError
    if nn <= ln and not txt[-nn] in [ ' ' , '-' ]:
        return False

    # compare the word itself, not the text preceding it
    wd = ''.join(txt[-nn + 1:-4]).lower()

    if wd in [ 'one' , 'two' , 'three' , 'four' , 'five' , 'six' , 'seven' ,
               'eight' , 'nine' , 'ten' , 'eleven' , 'twelve' ]:
        # an uppercase letter after the whitespace suggests a new sentence
        if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
            return False
        else:
            return True
    else:
        return False

Beispiel #6

Datei anzeigen

def stateZip(buffr):
    """
    recognize U.S. state abbreviation and zip code

    Expected layout is two state letters, one space, five digits, and
    optionally a hyphen with four more digits. The zip code must begin
    with the digits registered for the state in ziprs and must not be
    followed directly by an alphanumeric char.

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success (8 for a 5-digit zip, 13 for a
        9-digit zip), 0 otherwise
    """

    if len(buffr) < 8 or buffr[2] != ' ':
        return 0
    st = ''.join(buffr[:2]).upper()  # expected 2-char state abbreviation
    if st not in ziprs:
        return 0  # if not known, quit
    zc = ziprs[st]  # get zip-code start
    b = buffr[3:]  # expected start of zipcode

    i = 0
    for c in zc:  # check starting digits of zipcode
        if c != b[i]:
            return 0
        i += 1
    while i < 5:  # check for digits in rest of zipcode
        if not ellyChar.isDigit(b[i]):
            return 0
        i += 1

    b = b[5:]  # look for proper termination
    if len(b) == 0:  # if end of input, success
        return 8  # success: 5-digit zip
    if ellyChar.isLetterOrDigit(b[0]):  # if next char is alphanumeric, failure
        return 0
    if b[0] != '-':
        return 8  # success: 5-digit zip

    # hyphen seen: require a full 4-digit extension. The original test
    # len(b) > 5 rejected a ZIP+4 at the very end of the buffer and then
    # fell off the end of the function, returning None instead of 0.
    if len(b) < 5:
        return 0
    for d in b[1:5]:
        if not ellyChar.isDigit(d):
            return 0  # check for 4 more digits
    b = b[5:]  # past hyphen and 4 digits
    if len(b) > 0 and ellyChar.isLetterOrDigit(b[0]):
        return 0  # termination check
    return 8 + 5  # success: 9-digit zip

Beispiel #7

Datei anzeigen

Datei: dateTransform.py Projekt: prohippo/pyelly

    def _aYear ( self , ts ):

        """
        parse a year

        Accepts a leading run of exactly 2 or 4 digits (a 4-digit year
        must start with '1' or '2'), stores the year in self._yr, and
        then looks for an epoch marker from Ep after an optional space.
        A 2-digit year is accepted only when an epoch marker follows.

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched, 0 on failure
        """

        lts = len(ts)
        if lts < 2: return 0    # year must be at least 2 digits

        k = 0
        while k < lts and ellyChar.isDigit(ts[k]):  # count leading digits
            k += 1

        if k != 2 and k != 4:   # simple check for year range (change this as needed)
            return 0
        if k == 4 and ts[0] != '1' and ts[0] != '2':
            return 0

        # slices are bounded by k so that chars after the year are not
        # spliced into self._yr (the original used open-ended slices)
        # assumes self._yr already holds at least 2 elements - TODO confirm
        self._yr[2:] = ts[k-2:k]  # save last 2 digits of year
        if k == 2:
            # century chosen by comparing the 2 digits against self.ycur
            ce = self.cent[0] if ts[k-2:k] > self.ycur else self.cent[1]
            self._yr[:2] = ce
        else:
            self._yr[:2] = ts[k-4:k-2]  # first 2 digits of 4-digit year

        t = ts[k:]              # look for what follows year

        ns = 0
        if len(t) > 0 and t[0] == ' ':
            t = t[1:]
            ns = 1

        # presumably self.get() leaves its result in self.string and
        # returns the count of chars scanned - verify against its definition
        lss = self.get(t)
        if self.string in Ep:
            self._ep = list(self.string)
            k += ns + lss
        elif k < 4:
            return 0            # 2-digit year without epoch is rejected

        return k if k > 3 else 0

Beispiel #8

Datei anzeigen

Datei: extractionProcedure.py Projekt: belkhir-nacim/pyelly

def stateZip ( buffr ):

    """
    recognize U.S. state abbreviation and zip code

    Expected layout is two state letters, one space, five digits, and
    optionally a hyphen with four more digits. The zip code must begin
    with the digits registered for the state in ziprs and must not be
    followed directly by an alphanumeric char.

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success (8 for a 5-digit zip, 13 for a
        9-digit zip), 0 otherwise
    """

    if len(buffr) < 8 or buffr[2] != ' ': return 0
    st = ''.join(buffr[:2]).upper()    # expected 2-char state abbreviation
    if not st in ziprs: return 0       # if not known, quit
    zc = ziprs[st]                     # get zip-code start
    b = buffr[3:]                      # expected start of zipcode
    i = 0
    for c in zc:                       # check starting digits of zipcode
        if c != b[i]: return 0
        i += 1
    while i < 5:                       # check for digits in rest of zipcode
        if not ellyChar.isDigit(b[i]): return 0
        i += 1
    b = b[5:]                          # look for proper termination
    if len(b) == 0:                    # if end of input, success
        return 8                       # success: 5-digit zip
    c = b[0]
    if ellyChar.isLetterOrDigit(c):    # if next char is alphanumeric, failure
        return 0
    elif c == '-':                     # look for possible 9-digit zip
        # need hyphen plus 4 chars; the original test len(b) > 5 rejected
        # a 9-digit zip at the very end of input and then returned None
        # by falling off the end of the function
        if len(b) < 5: return 0
        b = b[1:]
        for i in range(4):
            if not ellyChar.isDigit(b[i]): return 0 # check for 4 more digits
        b = b[4:]                                   # past end of 4 digits
        if len(b) > 0 and ellyChar.isLetterOrDigit(b[0]): return 0 # termination check
        return 8 + 5                                # success: 9-digit zip
    else:
        return 8                       # success: 5-digit zip

Beispiel #9

Datei anzeigen

    def _aYear(self, ts):
        """
        parse a year

        Accepts a leading run of exactly 2 or 4 digits (a 4-digit year
        must start with '1' or '2'), stores the year in self._yr, and
        then looks for an epoch marker from Ep after an optional space.
        A 2-digit year is accepted only when an epoch marker follows.

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched, 0 on failure
        """

        lts = len(ts)
        if lts < 2:
            return 0  # year must be at least 2 digits

        k = 0
        while k < lts:  # scan for digits in input list
            if not ellyChar.isDigit(ts[k]):
                break
            k += 1

        if k not in (2, 4):  # simple check for year range (change this as needed)
            return 0
        if k == 4 and ts[0] not in ('1', '2'):
            return 0

        # the original used open-ended slices here, which spliced every
        # char after the year into self._yr; bound them by k instead
        # assumes self._yr already holds at least 2 elements - TODO confirm
        self._yr[2:] = ts[k - 2:k]  # save last 2 digits of year
        if k == 2:
            # century chosen by comparing the 2 digits against self.ycur
            ce = self.cent[0] if ts[k - 2:k] > self.ycur else self.cent[1]
            self._yr[:2] = ce
        else:
            self._yr[:2] = ts[k - 4:k - 2]  # first 2 digits of 4-digit year

        t = ts[k:]  # look for what follows year

        ns = 0
        if len(t) > 0 and t[0] == ' ':
            t = t[1:]
            ns = 1

        # presumably self.get() leaves its result in self.string and
        # returns the count of chars scanned - verify against its definition
        lss = self.get(t)
        if self.string in Ep:
            self._ep = list(self.string)
            k += ns + lss
        elif k < 4:
            return 0  # 2-digit year without epoch is rejected

        return k if k > 3 else 0

Beispiel #10

Datei anzeigen

Datei: treeLogic.py Projekt: ivanjelinek/pyelly

    def __init__ ( self , tree , nsave , resto , recur=False , post='' ):
        """
        set up an action from its specification

        A single leading digit in post, if present, is taken as the
        drop count; whatever remains of post is kept as the affix
        modification string.

        arguments:
            self  -
            tree  - logic tree containing node with action
            nsave - how many matched chars to keep in root
            resto - how to restore root after keeping
            recur - recursive matching to look for more affixes?
            post  - how to define a removed affix
        """

        self.tree , self.nsave = tree , nsave
        self.resto , self.recur = resto , recur
        self.ndrop = 0      # default when post has no leading digit
        startsWithDigit = post != '' and ellyChar.isDigit(post[0])
        if startsWithDigit:
            self.ndrop = int(post[0]) # only a single digit is read here
            post = post[1:]           # rest of action string
        self.amod = post

Beispiel #11

Datei anzeigen

    def __init__(self, tree, nsave, resto, recur=False, post=''):
        """
        set up an action from its specification

        A single leading digit in post, if present, becomes the drop
        count; the remainder of post is stored as the affix modification.

        arguments:
            self  -
            tree  - logic tree containing node with action
            nsave - how many matched chars to keep in root
            resto - how to restore root after keeping
            recur - recursive matching to look for more affixes?
            post  - how to define a removed affix
        """

        self.tree = tree
        self.nsave = nsave
        self.resto = resto
        self.recur = recur
        self.ndrop = 0  # default when post has no leading digit
        if post != '' and ellyChar.isDigit(post[0]):
            head, post = post[0], post[1:]  # split off the single digit
            self.ndrop = int(head)
        self.amod = post

Beispiel #12

Datei anzeigen

def _rightside(stb, txt, sta):
    """
    build the action list for the right side of a single clause

    The clause is consumed left to right: an optional inheritance marker
    ("*l" or "*r"), an optional bracketed semantic feature specification,
    and an optional plausibility change written as a sign with a number
    or as repeated signs. A trailing space-separated word after the last
    ']' is taken as an explicit concept name.

    arguments:
        stb   - symbol table
        txt   - string input for single clause
        sta   - for status reporting

    returns:
        action list on success, None otherwise
    """

    actions = []
    delta = 0  # net plausibility change
    concept = ''  # default is no concept specified

    # a space after the last ']' separates off an explicit concept
    lastBracket = txt.rfind(']')
    lastSpace = txt.rfind(' ')
    if lastSpace > lastBracket:
        concept = txt[lastSpace:].strip().upper()
        txt = txt[:lastSpace]

    # inheritance from left or right phrase component
    if len(txt) > 1 and txt[0] == '*':
        side = txt[1]
        if side == 'l':
            actions.append([semanticCommand.Clhr])
            sta.lh = True
        elif side == 'r':
            actions.append([semanticCommand.Crhr])
            sta.rh = True
        else:
            return _err('bad inheritance')
        txt = txt[2:].strip()

    # semantic features to set or unset for phrase
    if len(txt) > 3 and txt[0] == '[':
        close = txt.find(']')
        if close < 3:
            return _err('incomplete semantic features to set or unset')
        spec = txt[:close + 1]
        try:
            f = featureSpecification.FeatureSpecification(stb,
                                                          spec,
                                                          semantic=True)
            sta.res = f
        except ellyException.FormatFailure:
            return _err('bad semantic features to set or unset')

        known = sta.id[sS]
        if known == None:
            sta.id[sS] = f.id
        elif f.id != known:
            _err('inconsistency: final features=' + spec)
            return None

        actions.append([semanticCommand.Csetf, f.positive])
        if not f.negative.zeroed():
            f.negative.complement()
            actions.append([semanticCommand.Crstf, f.negative])
        txt = txt[close + 1:]

    # plausibility change: "+", "-", "+n", "-n", "+++", "---"
    if len(txt) > 0:
        sign = txt[0]
        if sign not in ('+', '-'):
            if ellyChar.isDigit(sign):
                return _err('plausibility must begin with + or -')
            return _err('bad cognitive semantic action: ' + txt)

        if len(txt) == 1:
            delta = 1
        elif ellyChar.isDigit(txt[1]):
            try:
                delta = int(txt[1:])  # explicit numerical change
            except ValueError:
                return _err('bad cognitive plausibility: ' + txt)
        elif txt[1] == sign:  # alternate notation: one unit per sign char
            if any(xc != sign for xc in txt[2:]):
                return _err('must be all + or all -')
            delta = len(txt)
        else:
            return _err('cannot interpret clause: ' + txt)

        if sign == '-':
            delta = -delta  # get right sign

    if len(concept) > 0:
        actions.append([semanticCommand.Csetc, concept])

    if delta != 0:
        actions.append([semanticCommand.Cadd, delta])

    return actions

Beispiel #13

Datei anzeigen

    def getNext(self):
        """
        extract next sentence for Elly translation from input stream

        Chars are read one at a time from self.inp and appended to a
        sentence buffer until a newline, a char in Hards, or stop
        punctuation that passes all the exception checks below is seen.
        Quote chars are normalized to left or right marks only for the
        bracketing check; the original chars go into the sentence.

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

        #       print ( 'getNext' )

        self.resetBracketing()
        inBrkt = False

        nspc = 0  # set space count

        sent = []  # list buffer to fill

        # skip one leading space, then check for end of input
        x = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:  # EOF check
            return None

        c = END  # reset
        lc = END

        #       print ( 'x=' , '<' + x + '>' , ord(x) )
        self.inp.unread(x, SP)  # put first char back to restore input
        #       print ( '0  <<' , self.inp.buf )

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0  # alphanumeric count in sentence

        while True:

            x = self.inp.read()  # next input char

            if x == END:  # handle any EOF
                break

#           print ( 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>' )
#           print ( 'sent=' , sent , 'nspc=' , nspc )

# check for table delimiters in text

            if len(sent) == 0:
                #               print ( 'table' )
                #               print ( '1  <<' , self.inp.buf )

                if x == '.' or x == '-':  # look for multiple '.' or '-'
                    while True:  # scan up to end of current buffering
                        y = self.inp.read()  #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break  #
                    continue  # ignore everything seen so far

            ####################################################
            # accumulate chars and count alphanumeric and spaces
            ####################################################

            # lc = previous char, c = current char, nc = lookahead char
            lc = c
            c = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

            #           print ( 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>' )
            if lc == SP or lc == END:  # normalize chars for proper bracketing
                if x == SQuo:  #
                    x = LSQm  # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:  #
                    x = LDQm  # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END:  #
                if x == SQuo:  # a SQuo followed by a space becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by a space becomes RDQm
                    x = RDQm  #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:  # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm  #
            # NOTE(review): this branch is reached only when nc is alphanumeric,
            # so spaces are counted only in front of an alphanumeric char - confirm intended
            elif ellyChar.isWhiteSpace(c) and inBrkt:
                nspc += 1

            svBrkt = inBrkt
            inBrkt = self.checkBracketing(
                x)  # do bracketing check with modified chars
            if svBrkt and not inBrkt: nspc = 0  # space count restarts when bracketing closes

            #           print ( 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt )

            sent.append(c)  # put original char into sentence buffer
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue  # if alphanumeric, just add to sentence

            if c == SP:
                continue  # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()  # remove from sentence chars
                break

            # certain Unicode punctuation will always break

            if c in Hards:
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            cx = self.inp.preview()  # for context of match call

            #           print ( '0  <<' , self.inp.buf )

            #           print ( 'sent=' , sent[:-1] )
            #           print ( 'punc=' , '<' + c + '>' )
            #           print ( 'next=' , cx )
            if c in Stops and len(cx) > 0 and cx[0] == SP:
                if self.stpx.match(sent[:-1], c, cx):
                    #                   print ( 'stop exception MATCH' )
                    if self.drop:
                        sent.pop()  # remove punctuation char from sentence
                        lc = SP
                    continue

#           print ( 'no stop exception MATCH for' , c )

#           print ( '@1  <<' , self.inp.buf )

# handle any nonstandard punctuation

            exoticPunctuation.normalize(c, self.inp)

            #           print ( '@2  <<' , self.inp.buf )

            # check for dash

            if c == '-':
                d = self.inp.read()
                if d == '-':
                    #                   print ( 'dash' )
                    # consume the rest of a run of hyphens, keeping two in the sentence
                    while True:
                        d = self.inp.read()
                        if d != '-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print ( '@3  c=' , c , inBrkt )

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

                #               print ( 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent) )

                if not inBrkt:
                    #                   print ( sent , 'so far' )
                    z = self.inp.read()
                    if self.shortBracketing(sent, z):
                        break
                    self.inp.unread(z)
                    #                   print ( 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']' )
                    # NOTE(review): 'and' binds tighter than 'or', so this reads as
                    # z == END or (whitespace z and lc in Stops) - confirm intended
                    if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                        if nAN > 1:
                            break
                elif c in QUOs and lc in Stops:
                    #                   print ( 'stop+quote' )
                    z = self.inp.read()
                    if z in RBs:
                        sent.append(z)
                        y = self.inp.read()
                        if y in Stops:
                            sent.append(y)
                        elif not ellyChar.isWhiteSpace(y):
                            self.inp.unread(y)
                        inBrkt = False
                        break
                    elif z in QUOs:
                        #                       print ( 'stop+quote+quote' )
                        sent.append(z)
                        inBrkt = False
                        break
                    self.inp.unread(z)
#               print ( 'continue' )
                continue

            elif not c in Stops:
                continue

            else:
                # c is stop punctuation: d becomes the char that follows it
                #               print ( 'check stopping!' )
                d = self.inp.read()
                #               print ( '@3  <<' , self.inp.buf )

                if d == None: d = '!'
                #               print ( 'stop=' , '<' + c + '> <' + d + '>' )

                #               print ( 'ellipsis check' )
                if c == '.' and c == d:
                    if self.inp.peek() != c:  # look for third '.' in ellipsis
                        self.inp.unread(d)  # if none, keep only first '.'
                    else:
                        self.inp.skip()  # found ellipsis
                        sent.append(d)  # complete it in sentence buffer
                        sent.append(d)  #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(
                                SP
                            )  # if part of token, put in space as separator
                    continue

                if c == ELLP:
                    #                   print ( 'found Unicode ellipsis, d=' , d )
                    if ellyChar.isUpperCaseLetter(d):
                        self.inp.unread(
                            d)  # super special case of bad punctuation
                        self.inp.unread(' ')  # put in implied period and space
                        self.inp.unread('.')  #

                # special check for multiple stops

#               print ( 'next char d=' , d , ord(d) if d != END else 'NONE' )
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP  # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent, d): break
                    if d in self._cl and self._cl[d] == 1:
                        dn = self.inp.peek()
                        if ellyChar.isWhiteSpace(dn):
                            sent.append(d)
                            break
                    self.inp.unread(d)
                    #                   print ( 'no space after punc' )
                    continue

                # if no match for lookahead, put back

                elif d != END:
                    #                   print ( 'unread d=' , d )
                    self.inp.unread(d)

#               print ( 'possible stop' )

# check special case of number ending in decimal point

                if c == '.':
                    # scan backward over the digits just before the period
                    ixb = len(sent) - 2
                    ixn = ixb + 1
                    cxn = ''
                    #                   print ( 'sent=' , sent )
                    #                   print ( 'ixn=' ,ixn )
                    while ixn > 0:
                        ixn -= 1
                        cxn = sent[ixn]
                        #                       print ( 'cxn=' , cxn )
                        if not ellyChar.isDigit(cxn): break
#                   print ( 'break: ixn=' , ixn , 'ixb=' , ixb )
                    # at least one digit, preceded by space or sign
                    if ixn < ixb and cxn in [' ', '-', '+']:
                        prvw = self.inp.preview()
                        #                       print ( 'prvw=' , prvw )
                        if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(
                                prvw[1]):
                            continue

                # final check: is sentence long enough?

                if inBrkt:
                    #                   print ( 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview() )
                    #                   print ( 'nspc=' , nspc )
                    if c in [':', ';'] or nspc < 3:
                        sent.append(d)
                        #                       print ( 'add' , '<' + d + '> to sentence' )
                        #                       print ( 'sent=' , sent )
                        self.inp.skip()
                        nspc -= 1
                        continue

#               print ( '@4  <<' , self.inp.buf )
                cx = self.inp.peek()
                if cx == None: cx = '!!'
                #               print ( 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent )
                #               print ( 'nAN=' , nAN , 'inBrkt=' , inBrkt )
                # break only when at least 2 alphanumeric chars were collected
                if nAN > 1:
                    break

        if sent == ['\u2026']:  # special case of sentence
            return list("-.-")  # with lone ellipsis
        elif len(sent) > 0 or self.last != END:
            return sent
        else:
            return None

Beispiel #14

Datei anzeigen

def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    Matching alternates between two phases inside one loop: literal
    char-by-char comparison of pattern against text, and, on a mismatch,
    interpretation of the unmatched pattern element as a wildcard or other
    special element. When a special element cannot be satisfied, the most
    recent unwinding record is used to back up and retry; when no unwinding
    records remain, the match fails.

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching (defaults to the length of text)
        nsps  - number of spaces to match in pattern

    returns:
        on success, a list whose element 0 is the text offset reached by the
        match and whose remaining elements are [start, end] text offset pairs
        for wildcard bindings, with contiguous bindings merged;
        None if no match is possible
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard (also used for spanning wildcards)
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes formatted as a bracketed string
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    # (they only READ mbi, unj, offs, and mp through the closure; the caller
    # is responsible for advancing mbi and unj after each call)
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame, a list [start, end] of text offsets stored at
            index mbi of the binding stack
        """
#       print ( "binding:",offs,ns )
        os = offs - 1         # default start is the char before the current offset
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        appends a third element (None) to the binding frame at index mbi;
        the consolidation step at the end of match() keeps any frame with
        more than two elements as a separate binding and strips the tag

        arguments:
            none
        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame recording the current pattern index, text offset,
            binding index, and space count, with its backup count set to 1
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
#       print ( "text to span:",text[offs:] )
#       print ( "pat rest=" , patn[mp:] )
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print ( "text at",offs,"maximum wildcard match=",mx )

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print ( "maximum wildcard span=",nm )

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''        # text char most recently consumed ('' when text is exhausted)

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''
            # NOTE(review): reached only when offs < limt, so limt == 0 here
            # would need a negative offs; looks unreachable in normal use - confirm
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs )
#           print ( 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')' )
            if mc != last:
                if mc != last.lower():       # pattern char may also equal lowercased text char
                    # special case: a hyphen in the pattern can match the
                    # three text chars space, hyphen, space
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break

#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        # each branch below either succeeds and executes `continue` to resume
        # literal matching, or falls through to the unwinding code after the
        # if-elif chain

        if tc == cALL:      # a * wildcard?

#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1     # put back the char consumed by the literal loop

            # NOTE(review): _span() can return -1, which is not checked here;
            # offs would then move back one char and the unwinding count
            # would be negative - confirm whether this is intended
            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print ( "END $:",last )
            # succeeds at end of text or when the consumed char is one of the
            # listed break chars; in those cases the char is put back (offs -= 1)
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]         # char following the period or comma
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:     # ampersand is excluded from this wildcard
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1                # one fewer space left to match
                _bind(); _modify(); mbi += 1 # tagged so it is not merged with neighbors
                continue
#           print ( 'NO space' )

        elif tc == cAPO: # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1 # tagged so it is not merged with neighbors
                continue

        elif tc == cSOS: # start of optional pattern segment
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS: # end of optional pattern segment
#           print ( "EOS" )
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL: # spanning wildcards (one or more chars)
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':               # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)      # maximum match possible

#               print ( 'spanning=' , nm )
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':   # empty pattern element
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:               # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]          # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print ( "no unwinding" )
            break                   # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None     # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd             # old binding stack, consumed from the front below
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # tagged by _modify(): keep as its own binding
            bd.pop()       # strip the tag
            mbd.append(bd)
            lbd = bd
            lb = -1        # and do not merge the next binding into it
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd             # consolidated bindings plus new offset

Beispiel #15

Datei anzeigen

def _leftside(stb, txt, sta):
    """
    process conditions for a clause and store

    Each condition in txt starts with a one-char selector:
        n, p, c  - followed by '<' or '>' and a run of digits (a token
                   count); produces a [comparison op, int] predicate
        l, r     - followed by either '[...]' (semantic feature check) or
                   '(...)' (semantic concept check, comma-separated and
                   normalized to upper case) for the left or right side

    arguments:
        stb  - symbol table
        txt  - string input for left side of single clause
        sta  - for status reporting; sta.id[lS] / sta.id[rS] and
               sta.lft / sta.rht are updated by semantic feature checks

    returns:
        predicate list on success, None otherwise
        (every failure is reported through _err() and then returns None
        explicitly, independent of what _err() itself returns)
    """

    pred = []
    txt = txt.rstrip()

    while len(txt) > 0:
        txt = txt.lstrip()
        if len(txt) <= 1:  # need a selector plus at least one more char
            _err('malformed conditions for clause')
            return None
        side = txt[0]
        txt = txt[1:]

        if side in ('n', 'p', 'c'):  # count comparison?
            sns = txt[0]
            txt = txt[1:]
            if sns != '<' and sns != '>':
                _err('invalid comparison in clause condition=' + sns)
                return None
            greater = sns == '>'
            if side == 'n':
                op = semanticCommand.Cngt if greater else semanticCommand.Cnlt
            elif side == 'p':
                op = semanticCommand.Cpgt if greater else semanticCommand.Cplt
            else:
                op = semanticCommand.Ccgt if greater else semanticCommand.Cclt

            nd = 0  # length of leading run of digits
            lt = len(txt)
            while nd < lt and ellyChar.isDigit(txt[nd]):
                nd += 1
            if nd == 0:
                _err('no token count for condition')
                return None
            pred.append([op, int(txt[:nd])])
            txt = txt[nd:]
            continue

        if side not in ('l', 'r'):
            _err('invalid side for test=' + side)
            return None

        if txt[0] == '[':  # semantic feature check?
            k = txt.find(']')  # if so, look for closing bracket
            if k < 0:
                _err('incomplete semantic features to check')
                return None
            p = txt[:k + 1]  # get semantic feature string

            try:
                f = featureSpecification.FeatureSpecification(stb,
                                                              p,
                                                              semantic=True)
            except ellyException.FormatFailure:
                _err('bad semantic features to check')
                return None

            # all feature checks on the same side must share one feature id;
            # the first one seen fixes it
            slot = lS if side == 'l' else rS
            if sta.id[slot] is None:
                sta.id[slot] = f.id
            elif f.id != sta.id[slot]:
                which = 'left' if side == 'l' else 'right'
                _err('inconsistency: ' + which + ' features=' + p)
                return None

            if side == 'r':
                op = semanticCommand.Crhtf
                sta.rht = f
            else:
                op = semanticCommand.Clftf
                sta.lft = f
            test = ellyBits.join(f.positive, f.negative)
            pred.append([op, test])

        elif txt[0] == '(':  # semantic concept check?
            k = txt.find(')')  # if so, look for closing parenthesis
            if k < 0:
                _err('incomplete concept check')
                return None
            s = txt[1:k].strip().upper()  # normalize concepts
            p = s.split(',')  # allow for multiple disjunctive checks

            op = semanticCommand.Crhtc if side == 'r' else semanticCommand.Clftc
            pred.append([op, p])

        else:
            _err('unknown test in clause=' + side + txt)
            return None

        txt = txt[k + 1:].lstrip()  # advance to next predicate

    return pred

Beispiel #16

Datei anzeigen

    def match(self, txt, pnc, ctx):
        """
        compare a punctuation mark and its context with a pattern

        The stored exception patterns for pnc are tried in order. A pattern
        has a left part, matched against the trailing chars of txt, and a
        right part, whose first element is matched against the char that
        follows the separator after the punctuation.

        arguments:
            self  -
            txt   - list of text chars leading up to punctuation char
            pnc   - punctuation char
            ctx   - next chars after punctuation

        returns:
            True on match, False otherwise
        """

        if matchtoo(txt, pnc, ctx):  # exception by complex match?
            return True

        sep = ctx[0] if len(ctx) > 0 else ''
        if sep == ellyChar.THS:
            return True
        nxt = ctx[1] if len(ctx) > 1 else ''

        if pnc not in self.lstg:  # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

        # count the trailing chars of txt usable for pattern matching:
        # letters, digits, and embedded combining chars, scanning backward
        ltx = len(txt)  # current length of accumulated text so far
        nrg = 1
        while nrg <= ltx:
            c = txt[-nrg]
            if not ellyChar.isLetterOrDigit(
                    c) and not ellyChar.isEmbeddedCombining(c):
                break
            nrg += 1
        nrg -= 1  # end of range for all pattern matching

        # slice from the front so that nrg == 0 yields an EMPTY range;
        # txt[-0:] would instead keep all of txt
        txt = txt[ltx - nrg:]  # reset text to limit for matching
        ltx = len(txt)  # its new length

        for p in lp:  # try matching each listed exception pattern

            pat = p.left
            if pat is not None and len(pat) > 0:

                star = pat[-1] == ellyWildcard.cALL
                n = len(pat)  # each pattern element matches one sequence char
                if star:  # except for a final wildcard *
                    n -= 1
                    if ltx < n:
                        continue  # cannot match pattern properly
                    pat = pat[:-1]
                    t = txt[:n]  # anchor at start of range; * takes the rest
                else:
                    if ltx < n:
                        continue  # cannot match pattern properly
                    t = txt[-n:]  # anchor at end of range

                if not ellyWildcard.match(pat, t, 0):
                    continue

                k = ltx - n  # extra chars beyond any match
                if not star and k > 0:
                    if ellyChar.isLetterOrDigit(txt[-n]):
                        c = txt[-n - 1]
                        if ellyChar.isLetterOrDigit(c) or c == '&':
                            continue  # because break in text is required

            rp = p.right
            if rp == [] or rp[0] == ellyWildcard.cALL:
                return True
            pcx = rp[0]
            if pcx == nxt:  # check for specific char after possible stop
                return True
            elif pcx == ellyWildcard.cALF:  # check for alphabetic
                if ellyChar.isLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cDIG:  # check for numeric
                if ellyChar.isDigit(nxt):
                    return True
            elif pcx == ellyWildcard.cUPR:  # check for upper case
                if ellyChar.isUpperCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cLWR:  # check for lower case
                if ellyChar.isLowerCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cCAN:  # check for non-alphanumeric
                # same test as the cCAN wildcard in ellyWildcard.match():
                # no char at all or any non-alphanumeric char except '&'
                if nxt != '&':
                    if nxt == '' or not ellyChar.isLetterOrDigit(nxt):
                        return True

        return False

Beispiel #17

Datei anzeigen

Datei: vocabularyTable.py Projekt: belkhir-nacim/pyelly

def compile ( name , stb , defn ):

    """
    static method to create an Elly vocabulary database from text file input

    arguments:
        name  - for new SQLite database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary

    exceptions:
        TableFailure on error
    """

    global nerr
    nerr = 0
    cdb = None  # SQLite db connection
    cur = None  # SQLite db cursor

#   print 'compiled stb=' , stb

    if stb == None :
        print >> sys.stderr, 'no symbol table'
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb,'[$]',True).positive.hexadecimal(False)
    except ellyException.FormatFailure:              # should never need this
        print >> sys.stderr , 'unexpected failure with zero features'
        raise ellyException.TableFailure

#   print 'zfs=' , zfs               # hexadecimal for all features off

    tsave = ''                                       # original term
    dsave = ''                                       #          definition

    try:
        filn = name + vocabulary                     # where to put vocabulary database
        try:
            os.remove(filn)                          # delete the file if it exists
        except OSError:
            print >> sys.stderr , 'no' , filn        # if no such file, warn but proceed

#### SQLite
####
        try:
            cdb = dbs.connect(filn)                  # create new database
            cur = cdb.cursor()
            cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)")
            cdb.commit()
        except dbs.Error , e:
            print >> sys.stderr , e
            raise ellyException.TableFailure         # give up on any database failure

#       print 'creating' , filn
#
####

        r = None                                          # for error reporting

        while True:                                       # process vocabulary definition records

            try:                                          # for catching FormatFailure exception
#               print '------------'
                r = defn.readline()                       # next definition
                if len(r) == 0: break                     # stop on EOF
#               print type(r) , r

                k = r.find(':')                           # look for first ':'
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()                                # report error and quit entry

                t = r[:k].strip()                         # term to go into dictionary
                d = r[k+1:].strip()                       # its definition
                tsave = t                                 # save for any error reporting
                dsave = d                                 #

#               print ' tm=' , '<' + t + '>' , 'df=' , '<' + d + '>'
                if len(t) == 0 or len(d) == 0:
                    _err()                                # quit on missing parts
                c = t[0]
                if not ellyChar.isLetterOrDigit(c) and c != '.' and c != '"':
                    _err('bad term')

                n = delimitKey(t)                         # get part of term to index
                if n <= 0:
                    _err()                                # quit on bad term
                wky = toKey(t[:n])                        # key part of term to define
#               print '  SQLite key=' , wky

                ns = syntaxSpecification.scan(d)          # find extent of syntax info
#               print 'ns=' , ns
                if ns <= 0: _err('bad syntax specification')
#               print 'PoS=' , d[:ns]

                syn = d[:ns]                              # syntax info as string
                d = d[ns:].strip()                        # rest of definition

                try:
#                   print 'VT syn=' , syn
                    ss = SSpec(stb,syn)                   # decode syntax info
#                   print 'VT ss =' , ss
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                cat = str(ss.catg)                        #   syntax category
                syf = ss.synf.positive.hexadecimal(False) #   syntactic flags
#               print 'syf=' , syf

                smf = zfs                                 # initialize defaults for
                pb = '0'                                  #   cognitive semantics
                cn = conceptualHierarchy.NOname           #

#               print '0:d=[' + d + ']'
                if len(d) > 1:                            # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':                      # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                            d = d[2:].strip()             # skip over
                        else:
                            ns = featureSpecification.scan(d) # look for ']' of features
#                           print 'ns=' , ns
                            if ns < 0:
                                _err()
                            sem = d[:ns]                  # get semantic features
                            d = d[ns:].strip()            # skip over
                            try:
#                               print 'smf=' , smf
                                fs = FSpec(stb,sem,True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                            smf = fs.positive.hexadecimal(False) # convert to hex

#                       print '1:d=[' + d + ']'
                        ld = len(d)
#                       print 'ld=' , ld
                        if ld == 0:
                            _err('missing plausibility')
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1                       # take any plus or minus sign
                        while np < ld:                    # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
#                       print 'np=' , np
                        if np == 0:
                            _err('missing plausibility')
                        pb = d[:np]                       # plausibility bias
#                       print 'pb=' , pb
                        d = d[np:]
                        ld = len(d)
#                       print '2:d=[' + d + ']'
                        if ld > 1:                        # any more to process?
                            c = d[0]                      # get next char after bias
                            d = d[1:]                     # advance scan
                            ld -= 1
                            if c == '/':                  # check for explicit concept
#                               print 'getting concept'
                                np = 0
                                while np < ld:            # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                cn = d[:np]               # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()                    # signal bad format
                        elif ld > 0:
                            _err()                        # unidentifiable trailing text

                d = d.strip()                             # rest of definition
#               print 'rest of d=' , d
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')

                ld = [ ]                            # for normalizing definition

                k = 0                               # count spaces removed
                sd = ''                             # previous char seen
                for cd in d:                        # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':    # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                    if cd == ',':
                        if sd == '=':
                            _err('missing translation')
                        cd = '#'                    # format for PICK operation
                    elif cd == '=' and sd == '=':
                        print >> sys.stderr , '** WARNING \'=\' followed by \'=\''
                        print >> sys.stderr , '*  at [' , tsave , ']'

                    sd = cd
                    ld.append(cd)                   # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)                 # definition with spaces removed

#               print '3:d=[' + d + ']'

                vrc = [ t , ':' , cat , syf , smf ,
                        pb , cn ]                   # start data record
                vss = u' '.join(vrc)                # convert to string
                vss += u' ' + d                     # fill out record with rest of input
#               print 'type(vss)=' , type(vss)

#               print 'rec=' , vrc , 'tra=' , d
#               print '   =' , vss

            except ellyException.FormatFailure:
                print >> sys.stderr , '*  at [' , tsave ,
                if dsave != None:
                    print >> sys.stderr , ':' , dsave ,
                print >> sys.stderr , ']'
                continue                            # skip rest of processing

#### SQLite
####
            try:
                sql = "INSERT INTO Vocab VALUES(?,?)"
#               print type(wky) , wky , type(vss) , vss
                cur.execute(sql,(wky,vss))
            except dbs.Error , e:
                print >> sys.stderr , 'FATAL' , e
                sys.exit(1)

Beispiel #18

Datei anzeigen

def build(name, stb, defn):
    """
    static method to create an Elly vocabulary database from text file input

    Reads definition records from defn one line at a time until readline()
    returns an empty string. Each record is normalized, split at its first
    ' : ' into a term and a definition, checked, and stored as one row of a
    freshly created SQLite table Vocab(Keyx, Defn):

        Keyx - toKeyZH() of the first char of the term when the configured
               language is 'ZH', otherwise toKey() of the leading part of the
               term as delimited by delimitKey()
        Defn - 'TERM =: CAT SYNF SEMF BIAS CONCEPT TRANSLATION' joined by
               single spaces

    A record failing a check is reported on stderr and skipped; the inserts
    are committed only if the module-level error count nerr is still zero at
    the end of input.

    arguments:
        name  - for new SQLite database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary

    exceptions:
        TableFailure on error
    """

    global nerr
    nerr = 0    # error count is per build; presumably incremented by _err()
    cdb = None  # SQLite db connection
    cur = None  # SQLite db cursor

    if stb is None:
        print('no symbol table', file=sys.stderr)
        raise ellyException.TableFailure

    try:
        # hexadecimal for all features off, used as default semantic features
        zfs = FSpec(stb, '[$]', True).positive.hexadecimal(False)
    except ellyException.FormatFailure:  # should never need this
        print('unexpected failure with zero features', file=sys.stderr)
        raise ellyException.TableFailure

    tsave = ''  # original term
    dsave = ''  #          definition

    try:
        filn = name + vocabulary  # where to put vocabulary database
        try:
            os.remove(filn)  # delete the file if it exists
        except OSError:
            # if no such file, warn but proceed
            print('no', filn, file=sys.stderr)

        #### SQLite DB operations
        ####
        try:
            cdb = dbs.connect(filn)  # create new database
            cur = cdb.cursor()
            cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)")
            cdb.commit()
        except dbs.Error as e:
            print(e, file=sys.stderr)
            raise ellyException.TableFailure  # give up on any database failure

        r = None  # for error reporting

        while True:  # process vocabulary definition records

            try:  # for catching FormatFailure exception
                r = defn.readline()  # next definition
                if len(r) == 0: break  # stop on EOF
                r = definitionLine.normalize(r)

                k = r.find(' : ')  # look for first ' : '
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()  # report error and quit entry

                t = r[:k].strip()  # term to go into dictionary
                d = r[k + 2:].strip()  # its definition
                tsave = t  # save for any error reporting
                dsave = d  #

                if len(t) == 0 or len(d) == 0:
                    _err()  # quit on missing parts
                if ellyConfiguration.language == 'ZH':  # special key for Chinese
                    wky = toKeyZH(t[0])
                else:
                    c = t[0]
                    if not ellyChar.isLetterOrDigit(c) and not c in initChr:
                        _err('bad term')

                    n = delimitKey(t)  # get part of term to index
                    if n <= 0:
                        _err()  # quit on bad term
                    wky = toKey(t[:n])  # key part of term to define

                ns = syntaxSpecification.scan(d)  # find extent of syntax info
                if ns <= 0: _err('bad syntax specification')
                if not d[ns:] == '' and d[ns] != ' ':
                    _err('trailing chars in syntax specification')

                syn = d[:ns]  # syntax info as string
                d = d[ns:].strip()  # rest of definition

                try:
                    ss = SSpec(stb, syn)  # decode syntax info
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                cat = str(ss.catg)  #   syntax category
                cid = _smfchk[ss.catg]  #   associated semantic feature ID
                syf = ss.synf.positive.hexadecimal(False)  #   syntactic flags

                smf = zfs  # initialize defaults for
                pb = '0'  #   cognitive semantics
                cn = conceptualHierarchy.NOname  #

                if len(d) > 1:  # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':  # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                            d = d[2:].strip()  # skip over
                        else:
                            # look for ']' of features
                            ns = featureSpecification.scan(d)
                            if ns < 0:
                                _err()
                            sem = d[:ns]  # get semantic features
                            d = d[ns:].strip()  # skip over for subsequent processing

                            # all entries of one syntax category must use the
                            # same feature ID; first one seen is remembered
                            sid = sem[1]  # feature ID
                            if sid != cid:
                                if cid is not None:
                                    _err('inconsistent semantic feature id')
                                _smfchk[ss.catg] = sid

                            try:
                                fs = FSpec(stb, sem, True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                            smf = fs.positive.hexadecimal(False)  # convert to hex

                        ld = len(d)
                        if ld == 0:
                            _err('missing plausibility')
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1  # take any plus or minus sign
                        while np < ld:  # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
                        if np == 0:
                            _err('missing plausibility')
                        pb = d[:np]  # plausibility bias
                        d = d[np:]
                        ld = len(d)
                        if ld > 1:  # any more to process?
                            c = d[0]  # get next char after bias
                            d = d[1:]  # advance scan
                            ld -= 1
                            if c == '/':  # check for explicit concept
                                np = 0
                                while np < ld:  # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                cn = d[:np]  # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()  # signal bad format
                        elif ld > 0:
                            _err()  # unidentifiable trailing text
                    elif d[0] != '(':
                        # anything else must be an alphanumeric run followed
                        # by '='; the scan is bounded by the length of d so
                        # that a purely alphanumeric remainder is reported via
                        # _err() instead of raising IndexError
                        nd = 0
                        while nd < len(d) and ellyChar.isLetterOrDigit(d[nd]):
                            nd += 1
                        if nd == len(d) or d[nd] != '=':
                            _err()

                d = d.strip()  # rest of definition
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')

                ld = []  # for normalizing definition

                k = 0  # count spaces removed
                sd = ''  # previous char seen
                for cd in d:  # scan all chars in translation
                    if cd == ' ':
                        # drop a space following '=', ',' or another space
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':  # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                    if cd == ',':
                        if sd == '=':
                            _err('missing translation')
                        cd = '#'  # format for PICK operation
                    elif cd == '=' and sd == '=':
                        print('** WARNING \'=\' followed by \'=\'',
                              file=sys.stderr)
                        print('*  at [', tsave, ']', file=sys.stderr)

                    sd = cd
                    ld.append(cd)  # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)  # definition with spaces removed

                vrc = [t, '=:', cat, syf, smf, pb, cn]  # start data record
                vss = ' '.join(vrc)  # convert to string
                vss += ' ' + d  # fill out record with rest of input

            except ellyException.FormatFailure:  # will catch exceptions from _err()
                print('*  at [', tsave, end=' ', file=sys.stderr)
                if dsave is not None:
                    print(':', dsave, end=' ', file=sys.stderr)
                print(']', file=sys.stderr)
                continue  # skip rest of processing this rule

            #### SQLite DB operation
            ####
            try:
                sql = "INSERT INTO Vocab VALUES(?,?)"
                cur.execute(sql, (wky, vss))
            except dbs.Error as e:
                print('FATAL', e, file=sys.stderr)
                sys.exit(1)

        #### SQLite DB operations
        ####
        if nerr == 0:
            cdb.commit()  # keep inserted records only for an error-free input
        cdb.close()  # clean up

    # NOTE(review): `Error` is not bound anywhere in this function; confirm
    # that the module defines or imports it, otherwise this clause itself
    # raises NameError when an exception reaches it
    except Error as e:  # catch any other errors
        print('**', e, file=sys.stderr)
        print('*  at', r, file=sys.stderr)
        nerr += 1

    if nerr > 0:
        print('**', nerr, 'vocabulary table errors in all', file=sys.stderr)
        print('*  compilation FAILed', file=sys.stderr)
        if cdb is not None:  # connection may never have been opened
            cdb.close()  # discard any changes
        raise ellyException.TableFailure

Beispiel #19

Datei anzeigen

Datei: vocabularyTable.py Projekt: ivanjelinek/pyelly

def compile ( name , stb , defn , stem=None ):

    """
    static method to create an Elly vocabulary database from text file input

    Reads definition records from defn one line at a time until readline()
    returns an empty string, skipping lines that start with '#'. Each record
    is split at its first ':' into a term and a definition. The leading part
    of the term (as delimited by toIndex(), optionally reduced by
    stem.simplify(), then passed through lcAN()) becomes the database key;
    the stored value is the UTF-8 encoding of

        'TERM : CAT SYNF SEMF BIAS CONCEPT TRANSLATION'

    joined by single spaces. The database file is name + vocabulary; any
    existing file of that name is removed first.

    arguments:
        name  - for new BSDDB database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary
        stem  - optional stemmer for indexing

    exceptions:
        TableFailure on error

    NOTE(review): in the code visible here TableFailure is raised only for a
    missing symbol table, a missing db package, or a failure to build the
    zero feature set; per-record errors are only reported on stderr - confirm
    whether callers are expected to inspect the module-level nerr.
    """

    global nerr
    nerr = 0                                         # error count is per compilation

#   print >> sys.stderr , 'compiled stb=' , stb , 'stem=' , stem , 'db=' , db

    if stb == None :
        print >> sys.stderr, 'no symbol table'
        raise ellyException.TableFailure
    if db  == None :
        print >> sys.stderr, 'no Python db package'
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb,'[$]',True).positive.hexadecimal(False)
    except ellyException.FormatFailure:              # should never need this
        print >> sys.stderr , 'unexpected failure with zero features'
        raise ellyException.TableFailure

#   print >> sys.stderr , 'zfs=' , zfs               # hexadecimal for all features off

    tsave = ''                                       # original term
    dsave = ''                                       #          definition

    try:
        filn = name + vocabulary                     # where to put vocabulary database
        try:
            os.remove(filn)                          # delete the file if it exists
        except OSError:
            print >> sys.stderr , 'no' , filn        # nothing to delete, warn but proceed
        dbs = db.DB()                                # create new database
        dbs.set_flags(db.DB_DUP)                     # keys may identify multiple records
        dbs.open(filn,None,db.DB_HASH,db.DB_CREATE)  # open new database file
#       print >> sys.stderr , 'creating' , filn

        r = None                                          # for error reporting

        while True:                                       # process vocabulary records

            # NOTE(review): the except clause for this try expects _err() to
            # raise FormatFailure; if it does, the `continue` statements that
            # follow _err() calls below are never reached - confirm against
            # the definition of _err()
            try:
#               print >> sys.stderr , '------------'
                r = defn.readline()                       # next definition
                if len(r) == 0: break                     # stop on EOF
                if r[0] == '#': continue                  # skip comment line
#               print >> sys.stderr , 'def=' , r

                k = r.find(':')                           # look for first ':'
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()                                # report error and quit entry
                    continue

                t = r[:k].strip()                         # term to go into dictionary
                d = r[k+1:].strip()                       # its definition
                tsave = t                                 # save for any error reporting
                dsave = d                                 #

#               print >> sys.stderr , ' tm=' , '<' + t + '>' , 'df=' , '<' + d + '>'
                if len(t) == 0 or len(d) == 0:
                    _err()                                # quit on missing parts
                    continue
                c = t[0]                                  # term must start with alphanumeric, '.' or '"'
                if not ellyChar.isLetterOrDigit(c) and c != '.' and c != '"':
                    _err('bad term')
                    continue

                n = toIndex(t)                            # get part of term to index
                if n == 0:
                    _err()                                # quit on bad term
                    continue
                w = t[:n]                                 # first word of term to define  
                if stem != None:
                    try:
                        w = stem.simplify(w)              # reduce for lookup key
                    except ellyException.StemmingError:
                        _err('bad stemming logic')
                        continue
#               print >> sys.stderr , '  w=' , w
                lcw = lcAN(w)                             # convert to ASCII lower case
#               print >> sys.stderr , 'lcw=' , '"' + lcw + '"'

                ns = syntaxSpecification.scan(d)          # find extent of syntax info
#               print >> sys.stderr , 'ns=' , ns
                # NOTE(review): unlike the other checks, no `continue` follows
                # this _err() call
                if ns <= 0: _err('bad syntax specification')
#               print >> sys.stderr , 'PoS=' , d[:ns]

                syn = d[:ns]                              # syntax info as string
                d = d[ns:].strip()                        # rest of definition

                try:
#                   print >> sys.stderr , 'VT syn=' , syn
                    ss = SSpec(stb,syn)                   # decode syntax info to get
#                   print >> sys.stderr , 'VT ss =' , ss
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                    continue
                cat = str(ss.catg)                        #   syntax category
                syf = ss.synf.positive.hexadecimal(False) #   syntactic flags
#               print >> sys.stderr , 'syf=' , syf

                smf = zfs                                 # initialize defaults for
                pb = '0'                                  #   cognitive semantics
                cn = '-'                                  #

#               print >> sys.stderr , '0:d=[' + d + ']'
                if len(d) > 1:                            # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':                      # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                                continue
                            d = d[2:].strip()             # skip over
                        else:
                            ns = featureSpecification.scan(d) # look for ']' of features
#                           print >> sys.stderr , 'ns=' , ns
                            if ns < 0:
                                _err()
                                continue
                            sem = d[:ns]                  # get semantic features
                            d = d[ns:].strip()            # skip over
                            try:
#                               print >> sys.stderr , 'smf=' , smf
                                fs = FSpec(stb,sem,True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                                continue
                            smf = fs.positive.hexadecimal(False) # convert to hex

                        # plausibility bias is mandatory once semantic
                        # features (or their '0'/'-' placeholder) are given
#                       print >> sys.stderr , '1:d=[' + d + ']'
                        ld = len(d)
#                       print >> sys.stderr , 'ld=' , ld
                        if ld == 0:
                            _err('missing plausibility')
                            continue
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1                       # take any plus or minus sign
                        while np < ld:                    # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
#                       print >> sys.stderr , 'np=' , np
                        if np == 0:
                            _err('missing plausibility')
                            continue
                        pb = d[:np]                       # plausibility bias
#                       print >> sys.stderr , 'pb=' , pb
                        d = d[np:]
                        ld = len(d)
#                       print >> sys.stderr , '2:d=[' + d + ']'
                        if ld > 1:                        # any more to process?
                            c = d[0]                      # get next char after bias
                            d = d[1:]                     # advance scan
                            ld -= 1
                            if c == '/':                  # check for explicit concept
#                               print >> sys.stderr , 'getting concept'
                                np = 0
                                while np < ld:            # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                    continue
                                cn = d[:np]               # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()                    # signal bad format
                                continue
                        elif ld > 0:
                            _err()                        # unidentifiable trailing text
                            continue

                d = d.strip()                             # rest of definition
#               print 'rest of d=' , d
                # a trailing '=' is accepted only when the definition also starts with '='
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')
                        continue

                ld = [ ]                            # for normalizing definition

                k = 0                               # count spaces removed
                sd = ''                             # previous char seen
                for cd in d:                        # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1                  # drop space after '=', ',' or space
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':    # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                    if cd == ',':
                        if sd == '=':
                            _err('missing translation')
                        cd = '#'                    # format for PICK operation
                    elif cd == '=' and sd == '=':
                        print >> sys.stderr , '** WARNING \'=\' followed by \'=\''
                        print >> sys.stderr , '*  at [' , tsave , ']'

                    sd = cd
                    ld.append(cd)                   # add char to reformatted definition

                # NOTE(review): d is rebuilt from ld only when a space was
                # removed, so a ',' -> '#' substitution made without any space
                # removal is not carried into d - confirm this is intended
                if k > 0:
                    d = ''.join(ld)                 # definition with spaces removed

#               print >> sys.stderr , '3:d=[' + d + ']'

                vrc = [ t , ':' , cat , syf , smf ,
                        pb , cn ]                         # start BdB data record
                vss = u' '.join(vrc)                      # convert to string
                vss += u' ' + d                           # fill out record with rest of input
#               print >> sys.stderr , 'type(vss)=' , type(vss)
                rss = vss.encode('utf8')                  # convert to UTF-8

#               print >> sys.stderr , 'rec=' , vrc , 'tra=' , d
#               print >> sys.stderr , '   =' , rss

            except ellyException.FormatFailure:
                # report where the bad record was and go on to the next one
                print >> sys.stderr , '*  at [' , tsave ,
                if dsave != None:
                    print >> sys.stderr , ':' , dsave ,
                print >> sys.stderr , ']'
                continue

#           print >> sys.stderr , 'lcw=' , lcw
            dbs.put(lcw,rss)                          # save in database
#           print >> sys.stderr , 'saved'

#       print >> sys.stderr , 'DONE'
        dbs.close()                                   # clean up

    # NOTE(review): dbs is not closed when control arrives here
    except StandardError , e:                         # catch any other errors
        print >> sys.stderr , '**' , e
        print >> sys.stderr , '*  at' , r
        nerr += 1

Beispiel #20

Datei anzeigen

Datei: dateTransform.py Projekt: prohippo/pyelly

    def _aDay ( self , ts ):

        """
        parse a day number

        Accepts a one- or two-digit day no greater than 31 at the start of
        ts, optionally followed by a hyphen and a second one- or two-digit
        number, and optionally followed by a two-letter English ordinal
        suffix ('st', 'nd', 'rd', 'th'). If ts does not start with a digit,
        self.rewriteNumber(ts) is tried first (presumably it rewrites a
        spelled-out number in place - confirm against its definition).

        Accepted digits, and the hyphen of a range, are appended to self._dy
        as the scan proceeds; chars appended before a later check fails are
        not removed again here.

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched
        """

        if len(ts) == 0:
            return 0

        k = 0              # running match count
        x = ts[0]          # last (or only) digit of day number
        y = ''             # first digit when day number has two digits
        if not ellyChar.isDigit(x):
            if not self.rewriteNumber(ts):
                return 0
            else:
                x = ts[0]

        ls = len(ts)
        if ls == 1:
            if x == '0': return 0    # cannot have 0 as day
            self._dy.append(x)       # accept at end of input as possible date
            return 1
        elif not ellyChar.isDigit(ts[1]):
            k = 1
        elif x > '3':                # reject first digit bigger than '3'
            return 0
        else:
            y = x                    # save first digit
            x = ts[1]                # this will be second digit
            if y == '3' and x > '1': # reject day > 31
                return 0
            k = 2

        ls -= k                      # count of chars left after day digits
        if k == 2:
            self._dy.append(y)
        self._dy.append(x)
        if ls == 0:
            return k

        z = ts[k]
        if ellyChar.isDigit(z):
            return 0         # reject 3-digit day

        if z == '.' and ls > 1 and ellyChar.isDigit(ts[k+1]):
            return 0         # reject digit after decimal point

        if ls >= 2:          # at least 2 chars to check after day number
            if z == u'-':
                if ellyChar.isDigit(ts[k+1]):                     # hyphen, digit match
                    self._dy.append(z)
                    self._dy.append(ts[k+1])
                    if ls == 2:                                   # only 2 chars to check?
                        k += 2                                    # add hyphen, digit to day
                    elif ls == 3:                                 # only 3 chars to check?
                        if not ellyChar.isLetterOrDigit(ts[k+2]): #
                            k += 2                                # add hyphen, digit to day
                        elif ellyChar.isDigit(ts[k+2]):           # found second digit to add?
                            self._dy.append(ts[k+2])              # if so, add to day string
                            k += 3
                    elif not ellyChar.isLetterOrDigit(ts[k+2]):   # more than 3 chars to check?
                        k += 2                                    # if not, we are done
                    elif ellyChar.isDigit(ts[k+2]):               # check for second digit
                        if ls > 3 and ellyChar.isDigit(ts[k+3]):
                            return 0
                        if ts[k+1] > '3':                         # check for valid day
                            return 0
                        if ts[k+1] == '3' and ts[k+2] > '1':
                            return 0
                        self._dy.append(ts[k+2])
                        k += 3
                    else:
                        return 0                                  # no other hyphen allowed in day
                else:
                    return 0                                      #

        t = ts[k:]
        if len(t) == 0 or not ellyChar.isLetterOrDigit(t[0]):
            return k         # day number stands alone, no ordinal suffix

        if ellyChar.isDigit(t[0]) or len(t) < 2:
            return 0
        sx = t[0].lower() + t[1].lower()   # candidate ordinal suffix

        # the suffix is selected by the last digit, except that every day
        # with tens digit '1' (10 to 19) takes 'th': 11th, 12th, 13th

        if y == '1':
            if sx != 'th': return 0
        elif x == '1':
            if sx != 'st': return 0
        elif x == '2':
            if sx != 'nd': return 0
        elif x == '3':
            if sx != 'rd': return 0
        else:
            if sx != 'th': return 0

        k += 2               # count the ordinal suffix as matched

        if len(ts) == k: # check next char in stream
            return k     # if none, match succeeds
        elif ellyChar.isLetterOrDigit(ts[k]):
            return 0     # otherwise, match fails if next char is alphanumeric
        else:
            return k     # otherwise succeed

Beispiel #21

Datei anzeigen

Datei: ellyWildcard.py Projekt: belkhir-nacim/pyelly

def convert ( strg ):

    """
    convert wildcard and escaped chars in a string to coded chars

    a pattern may have one unnested bracketed segment at a time to mark
    an optional match; any wildcard or escape inside brackets is rejected

    arguments:
        strg  - the original string

    returns:
        the converted string in lower case on success, None otherwise

    NOTE(review): a left bracket still open at the end of the string
    is not reported as an error here - confirm whether that is intended
    """

    if strg is None: return None     # identity test is the proper check for None

    lng = len(strg)
    nlb = 0                          # bracket level, only 0 or 1 is accepted
    t = [ ]                          # converted output
    i = 0
    while i < lng:                   # i never gets past lng, so this ends cleanly
        wild = True                  # flag for wildcard char, True by default
        x = strg[i]

        if   x == wANY:              # check for wildcard
            t.append(cANY)
        elif x == wALF:
            t.append(cALF)
        elif x == wUPR:
            t.append(cUPR)
        elif x == wDIG:
            t.append(cDIG)
        elif x == wVWL:
            t.append(cVWL)
        elif x == wCNS:
            t.append(cCNS)
        elif x == wSPC:
            t.append(cSPC)
        elif x == wAPO:
            t.append(cAPO)
        elif x == wEND:
            t.append(cEND)
        elif x == wALL:
            if len(t) == 0 or t[-1] != cALL:
                t.append(cALL)       # collapse consecutive match-all wildcards
        elif x == wSPN:              # check for repetition of wildcard
            if i + 1 == lng:
                t.append(x)          # nothing follows, keep marker as is
            else:
                i += 1
                y = strg[i]
                if   y == wANY:      # only these wildcards can be repeated
                    op = cSAN
                elif y == wDIG:
                    op = cSDG
                elif y == wALF:
                    op = cSAL
                else:
                    continue         # drop marker and rescan y as an ordinary char
                t.append(op)
        elif x == ellyChar.LBR:
            if nlb != 0: return None # no nesting of optional segments
            nlb += 1
            t.append(cSOS)           # start of optional match in pattern
        elif x == ellyChar.RBR:
            if nlb != 1: return None # must close an open segment
            nlb -= 1
            t.append(cEOS)           # end   of optional match
        elif x == ellyChar.BSL:      # escape char
            if i + 1 == lng:         # nothing to escape?
                t.append(x)
            elif strg[i+1] == ' ':   # escaped space?
                t.append(ellyChar.NBS)
                i += 1
            else:                    # escaped non-space?
                z = strg[i+1]
                if ellyChar.isDigit(z):
                    t.append(x)      # if digit, preserve backslash to indicate substitution
                else:
                    t.append(z)      # otherwise, keep the next char literally
                    i += 1
        else:
            t.append(x)              # ordinary char is copied unchanged
            wild = False

        if wild and nlb > 0 and x != ellyChar.LBR:
            return None              # no wildcards allowed in optional segments

        i += 1

    return u''.join(t).lower() # converted string to match against

Beispiel #22

Datei anzeigen

Datei: ellyBuffer.py Projekt: ivanjelinek/pyelly

    def _getRaw ( self ):

        """
        obtain next raw token from buffer

        leading spaces are skipped, the length of the next token is
        worked out from its first char and the next separator, the
        chars are extracted, and trailing non-token chars are then
        pushed back into the buffer

        arguments:
            self

        returns:
            EllyToken on success, None otherwise
        """

        self.skipSpaces()
        ln = len(self.buffer)
        if ln == 0:
            return None

        ## get length of next token and if it has
        ## initial - or +, check for word fragment

        k = 0                   # number of chars for next token

        if self.match(MIN):     # check for hyphen
            if self.match(DSH): # it is a dash when doubled
                k = 2
            else:
                k = self.find(separators,1)
        elif self.match(PLS):   # check for elly prefix
            k = self.find(separators,1)
        elif self.match(DOT):   # check for period
            if self.match(ELP): # it is ellipsis when tripled
                k = 3
            else:
                k = 1
        elif not ellyChar.isCombining(self.buffer[0]):
            k = 1               # if next char cannot start a token, take it as a token
        else:
            k = self.find(separators)
            if k < 0:           # break a token at next separator
                k = ln
            while k < ln:       # look at separator if it exists
                x = self.buffer[k]
                if x != MIN and x != COM:
                    break       # a hyphen or comma is not absolute break
                if k + 1 >= ln or not ellyChar.isDigit(self.buffer[k+1]):
                    break       # accept hyphen or comma if NOT followed by digit;
                                # the length test keeps a separator that is the
                                # last char of the buffer from indexing past the end
                else:           # otherwise, look for another separator
                    k = self.find(separators,k+2)
                    if k < 0:
                        k = ln

        ## if token not delimited, take rest of buffer as
        ## will fit into token working area

        if k < 0: k = ln

        buf = self.extract(k) # get k characters

        ## special check for - next in buffer after extraction

        if self.match(MIN):                    # hyphen immediately following?
            self.skip()                        # if so, take it
            if self.atSpace():                 # when followed by space
                buf.append(MIN)                # append hyphen to candidate token
                k += 1
            else:
                if not self.match(MIN):        # when not followed by another hyphen
                    self.prepend(ellyChar.SPC) # put back a space
                else:
                    self.skip()                # double hyphen = dash
                    self.prepend(ellyChar.SPC) # put back space after dash
                    self.prepend(MIN)          # put back second hyphen
                self.prepend(MIN)              # put back first
                self.prepend(ellyChar.SPC)     # put extra space before hyphen or dash

        ## fill preallocated token for current position from working area

        to = ellyToken.EllyToken(u''.join(buf))

        ## strip off trailing non-token chars from token and put back in buffer

        km = k - 1
        while km > 0:                          # first char of token is always kept
            x = buf[km]
            if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
                break
            if x == APO and km > 0 and buf[km - 1] == 's':
                break                          # keep apostrophe right after an s
            self.prepend(x)
            km -= 1
        km += 1                                # km is now count of chars kept
        if km < k:
            to.shortenBy(k - km,both=True)

        return to

Beispiel #23

Datei anzeigen

Datei: treeLogic.py Projekt: prohippo/pyelly

    def build ( self , inp ):

        """
        build tree logic from definition reader input

        each input line is split on single spaces into an affix, a
        condition, a count to save, a main action, and an optional affix
        modifier; a malformed line is reported on stderr and skipped

        arguments:
            self  -
            inp   - definition text for logic

        exceptions:
            TableFailure on error

        NOTE(review): nerr is tallied below, but nothing in this method as
        seen here raises TableFailure from it - confirm against full source
        """

        if inp is None:
            return

        nerr = 0                   # error count

        # read in affixes and associated actions

        while True:

            line = inp.readline()  # next input line
            if line == u'':        # check for EOF
                break

            modf = ''
            elem = line.strip().lower().split(' ')
            le = len(elem)
            if le < 4:
                nerr += 1
                print >> sys.stderr , "** affix error: incomplete input"
                print >> sys.stderr , "*  at: [" , line , "]"
                continue                  # skip incomplete line
            if le > 4:                    # affix mod specified?
                modf = elem.pop()         # if so, get it
            do = elem.pop()               # note main action

            # get affix within definition line

            aff = list(elem.pop(0))       # affix as list of chars

            # check for proper form

            aff = self.sequence(aff)      # backward or forward  matching?

            c = aff[0]                    # get first char to compare with
            aff = aff[1:]

            if (not ellyChar.isLetter(c) and
                    c != '+'):            # affix must start with letter or '+'
                nerr += 1
                print >> sys.stderr , "** affix error: must start with letter or '+'"
                print >> sys.stderr , "*  at: [" , line , "]"
                continue                  # ignore line

            if c not in self.indx:        # node not already in tree index?
                self.indx[c] = Node()     # add new node

            node = self.indx[c]

            for a in aff:                 # now check each successive char in affix
                if a in node.contn:
                    node = node.contn[a]  # go to existing node if found
                else:
                    new = Node()          # otherwise make new node
                    node.contn[a] = new   # and insert into tree
                    node = new            # and move down

            # at final node in tree logic

            try:                          # both numeric fields are checked together
                                          # so that a bad condition is reported
                                          # instead of aborting the whole build
                node.condn = int(elem.pop(0)) # condition for match
                nsave = 0 if len(elem) == 0 else int(elem.pop())
            except ValueError as e:
                nerr += 1                 # count it like every other bad line
                print >> sys.stderr , e
                print >> sys.stderr , "*  at: [" , line , "]"
                continue                  # ignore line

            resto = [ Add ]               # set to defaults
            recur = False                 #

            mode  = do[-1]                # kind of recursion
            rest  = do[:-1]               # added chars to fill out root
            if mode == u'?':
                node.condn = 1
                resto = [ Fail ]          # will generate fatal error
            elif ellyChar.isDigit(mode):
                nerr += 1
                print >> sys.stderr , "* bad action mode=" , mode
                continue
            else:
                if mode == ',':           # allow recursion?
                    recur = True          # if so, change default
                if len(rest) == 1 and rest[0] == '&':
                    resto = [ RestorE ]
                else:
                    resto += list(rest)

            if self.addn is not None:
                resto.insert(1,self.addn) # insert AFTER first char of list

            # insert action

            node.actns = Action(self,nsave,resto,recur,modf)
            node.tag()

Beispiel #24

Datei anzeigen

    def _getRaw(self):
        """
        pull the next raw token off the front of the buffer

        arguments:
            self

        returns:
            EllyToken on success, None when nothing is left after skipping spaces
        """

        self.skipSpaces()
        avail = len(self.buffer)
        if avail == 0:
            return None

        ## work out how many chars the next token takes, with
        ## special handling when it starts with - or + or '

        first = self.buffer[0]  # buffer is known to be non-empty here

        if first in [MIN, PLS]:
            count = self.findSeparator(1)
        elif first == APO:
            trailingS = (avail > 2 and self.buffer[1].lower() == 's'
                         and self.buffer[2] in separators)
            count = 2 if trailingS else 1
        elif first in [COM, DOT, UELP]:
            count = 1  # these can be tokens by themselves
        else:
            count = self.findSeparator()
            if count < 0:
                count = avail  # no separator, go up to end of buffer
            elif count == 0:
                count = 1  # immediate break in scanning
            else:
                # a hyphen or comma directly followed by a digit
                # does not end the token, so keep scanning past it
                while count < avail:
                    sep = self.buffer[count]
                    if not (sep == MIN or sep == COM):
                        break
                    if count + 1 >= avail:
                        break
                    if not ellyChar.isDigit(self.buffer[count + 1]):
                        break
                    count = self.findSeparator(count + 2)
                    if count < 0:
                        count = avail

        ## an undelimited token takes the rest of the buffer

        if count < 0:
            count = avail

        chars = self.extract(count)

        ## special check for hyphen next in buffer after extraction

        if self.match(MIN):
            self.skip()
            if self.atSpace():
                chars.append(MIN)  # hyphen before space stays with token
                count += 1
            else:
                if self.match(MIN):
                    self.skip()  # double hyphen = dash
                    self.prepend(ellyChar.SPC)  # space after dash
                    self.prepend(MIN)  # second hyphen
                else:
                    self.prepend(ellyChar.SPC)  # space after lone hyphen
                self.prepend(MIN)  # first hyphen
                self.prepend(ellyChar.SPC)  # space before hyphen or dash

        ## make token from everything taken so far

        tok = ellyToken.EllyToken(''.join(chars))

        ## return trailing non-token chars to buffer and trim token to match

        last = count - 1
        while last > 0:
            ch = chars[last]
            if ellyChar.isLetterOrDigit(ch) or ch in [MIN, PLS, UNS]:
                break
            if (ch == APO or ch == APX) and chars[last - 1] == 's':
                break  # apostrophe right after an s stays with token
            self.prepend(ch)
            last -= 1
        kept = last + 1
        if kept < count:
            tok.shortenBy(count - kept, both=True)

        return tok

Beispiel #25

Datei anzeigen

    def _matchN ( self , ts ):

        """
        recognize a time written in digits at the start of a text stream

        accepted forms are an hour of one or two digits followed by
        AM or PM, or hour:minutes with optional :seconds; on a match
        the hour is left in self._hr, minutes in self._m, seconds in self._s

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched, 0 on no match
        """

        self._m = u'00'       # defaults when minutes
        self._s = u'00'       # or seconds are absent

        if len(ts) < 3:       # too short for any time expression
            return 0

        nh = 2 if ellyChar.isDigit(ts[1]) else 1  # digit count for hour

        if ts[nh] != ':':     # no colon, must be hour with AM or PM
            if nh == 2:
                hour = u''.join(ts[:nh])
                if hour > '12':
                    return 0  # 2-digit hour out of range
            else:
                hour = ts[0]
                if hour == '0':
                    return 0  # 1-digit hour cannot be zero
            nap = self._findAMorPM(ts[nh:])
            if nap == 0:
                return 0      # hour alone is not a time
            self._hr = int(hour)
            return nh + nap

        self._hr = int(u''.join(ts[:nh]))
        if self._hr >= 24:
            return 0

        k = nh + 1            # move past hour and colon
        rest = ts[k:]
        if len(rest) < 2:
            return 0
        if not (ellyChar.isDigit(rest[0]) and ellyChar.isDigit(rest[1])):
            return 0          # minutes must be two digits
        if rest[0] > '5':
            return 0
        self._m = u''.join(rest[:2])
        rest = rest[2:]
        k += 2

        if len(rest) > 2 and rest[0] == ':':      # seconds given?
            if not (ellyChar.isDigit(rest[1]) and ellyChar.isDigit(rest[2])):
                return 0
            if rest[1] > '5':
                return 0
            if len(rest) > 3 and ellyChar.isDigit(rest[3]):
                return 0      # no third digit allowed in seconds
            self._s = u''.join(rest[1:3])
            rest = rest[3:]
            k += 3

        if len(rest) > 0 and ellyChar.isDigit(rest[0]):
            return 0          # time cannot run straight into another digit
        return k

Beispiel #26

Datei anzeigen

Datei: ellySentenceReader.py Projekt: prohippo/pyelly

    def getNext ( self ):

        """
        extract next sentence for Elly translation from input stream

        chars are read one at a time from self.inp and collected until
        a newline, a stop punctuation that passes all the exception
        checks below, or the end of input

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
            (a sentence consisting only of a Unicode ellipsis is returned
            as the chars of "-.-")
        """

#       print 'getNext'

        self.resetBracketing()
        inBrkt = False     # whether currently inside bracketing

        nspc = 0           # set space count

        sent = [ ]         # list buffer to fill

        x  = self.inp.read()
        if x == SP:        # skip a single leading space
            x = self.inp.read()

        if x == END:       # EOF check
            return None

        c  = END           # reset
        lc = END

#       print 'x=' , '<' + x + '>' , ord(x)
        self.inp.unread(x,SP)       # put first char back to restore input
#       print '0  <<" , self.inp.buf

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0                     # alphanumeric count in sentence

        while True:

            x = self.inp.read()     # next input char

            if x == END:            # handle any EOF
                break

#           print 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>'
#           print 'sent=' , sent , 'nspc=' , nspc

            # check for table delimiters in text

            if len(sent) == 0:
#               print 'table'
#               print '1  <<' , self.inp.buf

                if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                    while True:                 # scan up to end of current buffering
                        y = self.inp.read()     #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break               #
                    continue                    # ignore everything seen so far

            ####################################################
            # accumulate chars and count alphanumeric and spaces
            ####################################################

            lc = c                  # last char
            c  = x                  # current char as read
            nc = self.inp.peek()    # next char, not yet consumed
            if ellyChar.isWhiteSpace(nc): nc = SP

#           print 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>'
            if lc == SP or lc == END: # normalize chars for proper bracketing
                if x == SQuo:         #
                    x = LSQm          # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:       #
                    x = LDQm          # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END: #
                if x == SQuo:         # a SQuo followed by a space becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by a space becomes RDQm
                    x = RDQm          #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:         # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm          #
            elif ellyChar.isWhiteSpace(c) and inBrkt:
                nspc += 1             # reached only when next char is alphanumeric:
                                      # count a space inside bracketing

            svBrkt = inBrkt
            inBrkt = self.checkBracketing(x)    # do bracketing check with modified chars
            if svBrkt and not inBrkt: nspc = 0  # restart count on leaving bracketing

#           print 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt

            sent.append(c)                      # put original char into sentence buffer
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue                        # if alphanumeric, just add to sentence

            if c == SP:
                continue                        # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()                      # remove from sentence chars
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            cx = self.inp.preview()  # for context of match call

#           print '0  <<' , self.inp.buf

#           print 'sent=' , sent[:-1]
#           print 'punc=' , '<' + c + '>'
#           print 'next=' , cx
            if c in Stops and len(cx) > 0 and cx[0] == SP:
                if self.stpx.match(sent[:-1],c,cx):
#                   print 'stop exception MATCH'
                    if self.drop:
                        sent.pop()   # remove punctuation char from sentence
                        lc = SP
                    continue

#           print 'no stop exception MATCH for' , c

#           print '@1  <<' , self.inp.buf

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c,self.inp)

#           print '@2  <<' , self.inp.buf

            # check for dash

            if c == u'-':
                d = self.inp.read()
                if d == u'-':
#                   print 'dash'
                    while True:                 # consume the whole run of hyphens
                        d = self.inp.read()
                        if d != u'-': break
                    sent.append(c)              # keep dash as two hyphens in sentence
                self.inp.unread(d)              # put back first char after hyphen(s)
                continue

            # check for sentence break on punctuation

#           print '@3  c=' , c , inBrkt

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

#               print 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent)

                if not inBrkt:
#                   print sent , 'so far'
                    z = self.inp.read()
                    if self.shortBracketing(sent,z):
                        break
                    self.inp.unread(z)
#                   print 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']'
                    # NOTE(review): 'and' binds tighter than 'or', so z == END
                    # passes this test whatever lc is - confirm that is intended
                    if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                        if nAN > 1:
                            break
                elif c in QUOs and lc in Stops:
#                   print 'stop+quote'
                    z = self.inp.read()
                    if z in RBs:
                        sent.append(z)
                        y = self.inp.read()
                        if y in Stops:
                            sent.append(y)
                        elif not ellyChar.isWhiteSpace(y):
                            self.inp.unread(y)
                        inBrkt = False
                        break
                    elif z in QUOs:
#                       print 'stop+quote+quote'
                        sent.append(z)
                        inBrkt = False
                        break
                    self.inp.unread(z)
#               print 'continue'
                continue

            elif not c in Stops:
                continue

            else:
#               print 'check stopping!'
                d = self.inp.read()             # lookahead char after stop punctuation
#               print '@3  <<' , self.inp.buf

                # NOTE(review): other EOF tests here compare with END;
                # presumably read() can also return None - confirm
                if d == None: d = u'!'
#               print 'stop=' , '<' + c + '> <' + d + '>'

#               print 'ellipsis check'
                if c == u'.' and c == d:
                    if self.inp.peek() != c: # look for third '.' in ellipsis
                        self.inp.unread(d)   # if none, keep only first '.'
                    else:
                        self.inp.skip()      # found ellipsis
                        sent.append(d)       # complete it in sentence buffer
                        sent.append(d)       #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(SP)  # if part of token, put in space as separator
                    continue

                if c == ELLP:
#                   print 'found Unicode ellipsis, d=' , d
                    if ellyChar.isUpperCaseLetter(d):
                        self.inp.unread(d)   # super special case of bad punctuation
                        self.inp.unread(' ') # put in implied period and space
                        self.inp.unread('.') #

                # special check for multiple stops

#               print 'next char d=' , d , ord(d) if d != END else 'NONE'
                if d in Stops:
                    while True:              # skip over the rest of the stops
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP               # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent,d): break
                    if d in self._cl and self._cl[d] == 1:
                        dn = self.inp.peek()
                        if ellyChar.isWhiteSpace(dn):
                            sent.append(d)
                            break
                    self.inp.unread(d)
#                   print 'no space after punc'
                    continue

                # if no match for lookahead, put back

                elif d != END:
#                   print 'unread d=' , d
                    self.inp.unread(d)

#               print 'possible stop'

                # check special case of number ending in decimal point

                if c == '.':
                    ixb = len(sent) - 2      # index of char just before the '.'
                    ixn = ixb + 1
                    cxn = ''
#                   print 'sent=' , sent
#                   print 'ixn=' ,ixn
                    while ixn > 0:           # scan back over digits before the '.'
                        ixn -= 1
                        cxn = sent[ixn]
#                       print 'cxn=' , cxn
                        if not ellyChar.isDigit(cxn): break
#                   print 'break: ixn=' , ixn , 'ixb=' , ixb
                    if ixn < ixb and cxn in [ ' ' , '-' , '+' ]:
                        prvw = self.inp.preview()
#                       print 'prvw=' , prvw
                        if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                            continue         # no uppercase follows, do not break here

                # final check: is sentence long enough?

                if inBrkt:
#                   print 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview()
#                   print 'nspc=' , nspc
                    if c in [ ':' , ';' ] or nspc < 3:
                        sent.append(d)
#                       print 'add' , '<' + d + '> to sentence'
#                       print 'sent=' , sent
                        self.inp.skip()
                        nspc -= 1
                        continue

#               print '@4  <<' , self.inp.buf
                cx = self.inp.peek()
                if cx == None: cx = u'!!'
#               print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
#               print 'nAN=' , nAN , 'inBrkt=' , inBrkt
                if nAN > 1:                  # need at least two alphanumerics to break
                    break

        if sent == [ u'\u2026' ]:  # special case of sentence
            return list("-.-")     # with lone ellipsis
        elif len(sent) > 0 or self.last != END:
            return sent
        else:
            return None

Beispiel #27

Datei anzeigen

    def rewrite ( self , ts ):

        """
        check for time at current text position and rewrite if found

        a matched time, with any AM or PM and any recognized time zone,
        is replaced in place at the start of ts by hh:mm:ss followed by
        the zone; the length of the replacement is left in self._rwl

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            True on any rewriting, False otherwise
        """

        lts = len(ts)
        if lts < Lm: return False

        tz = self._tz      # default

        self._xm = ''      # default

        self._m = u'00'    # defaults
        self._s = u'00'

        c = ts[0]          # first char
        if not ellyChar.isDigit(c):
            return False   # time can never start with a letter
                           # because of number transforms

        k = self._matchN(ts)

        if k == 0: return False

        k += self._findAMorPM(ts[k:])
                           # presumably this sets self._xm to 'a' or 'p'
                           # on a match - verify against _findAMorPM

        if   self._xm == 'p' and self._hr <  12: # convert to 24-hour time
            self._hr += 12
        elif self._xm == 'a' and self._hr == 12: #
            self._hr = 0

        t = ts[k:]                 # remainder of text
        dk = 0                     # skip count
        ns = 0                     # space count
        if len(t) > 0:             # look for time zone
            if t[0] == ' ':        # skip any initial space
                dk += 1
                ns = 1
            dk += self.get(t[dk:]) # extract next token from input
            ss = self.string       #
            if ss in Zn:           # match to known time zone?
                tz = ss
            elif ns == 0 and ss == u'z': # military ZULU time
                tz = u'gmt'        # translate
            else:
                dk = 0             # no match

        k += dk                    # update match count
        t = t[dk:]                 # advance scan

        if len(t) > 0 and ellyChar.isLetterOrDigit(t[0]): return False

        r  = str(self._hr).zfill(2) + u':' + self._m + u':' + self._s + tz

        # one slice assignment swaps the k matched chars for the rewritten
        # time in place, instead of popping and inserting one char at a time
        ts[:k] = list(r)

        self._rwl = len(r)
        return True

Beispiel #28

Datei anzeigen

Datei: dateTransform.py Projekt: belkhir-nacim/pyelly

    def _matchN ( self , ts ):

        """
        apply logic for numeric only date recognition

        a date is two or three runs of digits separated by '/' or '-';
        a first component that has four digits or starts with '0' is
        taken as a year, with month and day following it

        on a match, month and day are left as 2-char lists in self._mo
        and self._dy, and any year as a 4-char list in self._yr

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched, 0 on no match
        """

        lts = len(ts)
        if lts < Lm: return 0  # shortest date is 0/0
        if not ellyChar.isDigit(ts[0]): return 0

        n = Ln                 # scan limit
        if n > lts: n = lts

        ss = [ ]               # substring to compare
        ns = 0                 # slash count

        k = 0
        while k < n:
            c = ts[k]
            if c == '/':
                ns += 1
            elif c == '-':     # hyphen is treated the same as slash
                ns += 1
                c = '/'
            elif not ellyChar.isDigit(c):
                break
            ss.append(c)
            k += 1

        if k < Lm: return 0
        if ns != 1 and ns != 2: return 0

        if k < lts and ellyChar.isLetterOrDigit(ts[k]):
            return 0

        dt = ''.join(ss).split('/')
        if '' in dt: return 0         # empty component as in 12/ or 1//2
                                      # cannot be converted to a number

        dt0 = dt.pop(0)               # get first two date components
        dt1 = dt.pop(0)               #

        if len(dt0) == 4 or dt0[0] == '0':
            if ns == 1: return 0      # year needs both month and day after it
            dt.append(dt0)            # put first component at end if it looks like year
            dt0 = dt1                 # move month up
            dt1 = dt.pop(0)           # move date  up, taken from the front
                                      # so that the year stays at the end

        m = int(dt0)
        if m < 1 or m > 12: return 0  # check validity of month
        d = int(dt1)
        if d < 1 or d > 31: return 0  # check validity of day
        if ns == 2:
            y = dt.pop(0)             # if there is a year, process it also
            ly = len(y)
            if ly == 4:               # 4-digit year?
                s = y[0]
                if s != '1' and s != '2': return 0
                yls = list(y)
            elif ly == 2:             # 2-digit year gets a century prefix
                ix = 0 if y > self.ycur else 1
                yls = list(self.cent[ix] + y)
            else:
                return 0              # fail on any other number of year digits

            self._yr = yls            # handle year

        self._mo = list(dt0.zfill(2)) # handle month
        self._dy = list(dt1.zfill(2)) # handle day
        return k

Beispiel #29

Datei anzeigen

Datei: cognitiveDefiner.py Projekt: prohippo/pyelly

def _leftside ( stb , txt , sta ):

    """
    process conditions for a clause and store

    arguments:
        stb  - symbol table
        txt  - string input for left side of single clause
        sta   - for status reporting

    returns:
        predicate list on success, None otherwise
    """

#   print "left side"
    pred = [ ]
    txt = txt.rstrip()

    while len(txt) > 0:
        txt = txt.lstrip()
#       print 'clause=' , txt
        if len(txt) <= 1:
            _err('malformed conditions for clause')
            return None
        side = txt[0]
        txt = txt[1:]

        if side in [ 'n' , 'p' , 'c' ]:
            sns = txt[0]
            txt = txt[1:]
            if sns != '<' and sns != '>':
                _err('invalid comparison in clause condition=' + sns)
                return None
            if side == 'n':
                op = semanticCommand.Cngt if sns == '>' else semanticCommand.Cnlt
            elif side == 'p':
                op = semanticCommand.Cpgt if sns == '>' else semanticCommand.Cplt
            else:
                op = semanticCommand.Ccgt if sns == '>' else semanticCommand.Cclt
            nd = 0
            lt = len(txt)
            while nd < lt:
                if not ellyChar.isDigit(txt[nd]): break
                nd += 1
            if nd == 0:
                _err('no token count for condition')
                return None
            test = int(txt[:nd])
            txt = txt[nd:]
            pred.append([ op , test ])
            continue
        if not side in [ 'l' , 'r' ]:
            _err('invalid side for test=' + side)
            return None
        k = 0
        if txt[0] == '[':                   # semantic feature check?
            k = txt.find(']')               # if so, look for closing bracket
            if k < 0:
                return _err('incomplete semantic features to check')
            p = txt[:k+1]                   # get semantic feature string

#           print "side:" , side , "test:" , p

            try:
                f = featureSpecification.FeatureSpecification(stb,p,semantic=True)
            except ellyException.FormatFailure:
                return _err('bad semantic features to check')

            if side == 'l':
                if sta.id[lS] == None:
                    sta.id[lS] = f.id
                elif f.id != sta.id[lS]:
                    _err('inconsistency: left features=' + p)
                    return None
            else:
                if sta.id[rS] == None:
                    sta.id[rS] = f.id
                elif f.id != sta.id[rS]:
                    _err('inconsistency: right features=' + p)
                    return None

            op = semanticCommand.Crhtf if side == 'r' else semanticCommand.Clftf

            if side == 'r':
                sta.rht = f
            else:
                sta.lft = f
#           print 'test:' , f.positive.hexadecimal() , f.negative.hexadecimal()
            test = ellyBits.join(f.positive,f.negative)
#           print test
            pred.append([ op , test ])

        elif txt[0] == '(':                 # semantic concept check?
#           print "txt=\"" + txt +"\""
            k = txt.find(')')               # if so, look for closing parenthesis
            if k < 0:
                return _err('incomplete concept check')
            s = txt[1:k].strip().upper()    # normalize concepts
            p = s.split(',')                # allow for multiple disjunctive checks
#           print "p=\"" + p + "\""

            op = semanticCommand.Crhtc if side == 'r' else semanticCommand.Clftc
            pred.append([ op , p ])

        else:
            _err('unknown test in clause=' + side + txt)
            return None

        txt = txt[k+1:].lstrip()            # advance to next predicate

#       print "NEXT"

    return pred

Beispiel #30

Datei anzeigen

    def _getRaw ( self ):

        """
        obtain next raw token from buffer

        after calling skipSpaces(), the number of chars k for the next token
        is chosen from how the buffer starts and from findSeparator(); k chars
        are then extracted, a hyphen directly after them gets special handling,
        and trailing non-token chars are pushed back into the buffer

        arguments:
            self

        returns:
            EllyToken on success, None otherwise
        """

#       print '_getRaw() from' , len(self.buffer) , 'chars'
#       print unicode(self)
        self.skipSpaces()
        ln = len(self.buffer)
#       print "after skip=",ln
        if ln == 0:
            return None

        ## get length of next token and if it has
        ## initial - or +, check for word fragment

        bs = self.buffer[0]
#       print 'buffer start=' , bs

        k = 0                   # number of chars for next token

        if self.match(MIN):     # check for hyphen
            if self.match(DSH): # it is a dash when doubled
                k = 2
            else:               # otherwise, could be word fragment
                k = self.findSeparator(1)
        elif self.match(PLS):   # check for Elly prefix
            k = self.findSeparator(1)
        elif self.match(DOT):   # check for period
            if self.match(ELP): # it is ellipsis when tripled
                k = 3
            else:               # otherwise, single punctuation char
                k = 1
        elif bs == APO:
            k = 1
        else:
#           print 'full token extraction'
            k = self.findSeparator()
#           print 'k=' , k
            if k < 0:           # break multi-char token at next separator
                k = ln          # if no separator, go up to end of buffer
            elif k == 0:
                k = 1           # immediate break in scanning
            else:
                # a hyphen or comma followed by a digit does not end the token;
                # keep extending k to the next separator in that case
                while k < ln:       # look at any separator and following context
                    x = self.buffer[k]
                    if x != MIN and x != COM:
                        break       # no further check if separator not hyphen or comma
                    if k + 1 >= ln or not ellyChar.isDigit(self.buffer[k+1]):
                        break       # accept hyphen or comma if NOT followed by digit
                    else:           # otherwise, look for another separator
                        k = self.findSeparator(k+2)
                        if k < 0:   #
                            k = ln

        ## if token not delimited, take rest of buffer as
        ## will fit into token working area

        # covers the findSeparator(1) results above, which are not checked for < 0
        if k < 0: k = ln

#       print "take",k,"chars from",len(self.buffer),self.buffer

        buf = self.extract(k) # get k characters

        ## special check for hyphen next in buffer after extraction

        if self.match(MIN):                    # hyphen immediately following?
            self.skip()                        # if so, take it
            if self.atSpace():                 # when followed by space
                buf.append(MIN)                # append hyphen to candidate token
                k += 1
            else:
                # prepend() calls below are issued in reverse of the order
                # in which the chars end up at the front of the buffer
                if not self.match(MIN):        # when not followed by another hyphen
                    self.prepend(ellyChar.SPC) # put back a space
                else:
                    self.skip()                # double hyphen = dash
                    self.prepend(ellyChar.SPC) # put back space after dash
                    self.prepend(MIN)          # put back second hyphen
                self.prepend(MIN)              # put back first
                self.prepend(ellyChar.SPC)     # put extra space before hyphen or dash

        ## fill preallocated token for current position from working area

#       print "raw text buf=" , buf

        to = ellyToken.EllyToken(u''.join(buf))

#       print "EllyBuffer token before=" , unicode(to)

        ## strip off trailing non-token chars from token and put back in buffer

        # scan backward from the last char; the loop condition km > 0 means
        # the first char of buf is never examined or pushed back
        km = k - 1
        while km > 0:
            x = buf[km]
            if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
                break
            if x == APO or x == APX:
                # keep an apostrophe that directly follows an 's'
                if km > 0 and buf[km - 1] == 's':
                    break
            self.prepend(x)
            km -= 1
        km += 1                 # km is now the count of chars kept in token
        if km < k:
            to.shortenBy(k - km,both=True)

#       print "EllyBuffer token after =" , unicode(to)
        return to

Beispiel #31

Datei anzeigen

    def _aDay(self, ts):
        """
        parse a day number

        accepts a one- or two-digit day no greater than 31, optionally
        followed by a hyphen and a second day number, or by a two-letter
        ordinal suffix (st, nd, rd, th) that must agree with the last digit

        matched chars are appended to self._dy as they are recognized, so
        self._dy may already have been extended when 0 is returned

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched, 0 on no match
        """

        #       print 'aDay', ts

        if len(ts) == 0:
            return 0

        k = 0  # running match count
        x = ts[0]
        y = ''
        if not ellyChar.isDigit(x):
            # no leading digit: let rewriteNumber() try to change ts first
            # NOTE(review): presumably rewrites a spelled-out number in place - confirm
            if not self.rewriteNumber(ts):
                return 0
            else:
                x = ts[0]

#       print 'rewritten ts=' , ts

        ls = len(ts)
        if ls == 1:
            if x == '0': return 0  # cannot have 0 as day
            self._dy.append(x)  # accept at end of input as possible date
            return 1
        elif not ellyChar.isDigit(ts[1]):
            k = 1               # single-digit day
        elif x > '3':  # reject first digit bigger than '3'
            return 0
        else:
            y = x  # save first digit
            x = ts[1]  # this will be second digit
            if y == '3' and x > '1':  # reject day > 31
                return 0
            k = 2

        ls -= k                 # ls is now the count of chars after the day digits
        if k == 2:
            self._dy.append(y)
        self._dy.append(x)
        if ls == 0:
            return k

        z = ts[k]               # first char after the day digits
        if ellyChar.isDigit(z):
            return 0  # reject 3-digit day

        if z == '.' and ls > 1 and ellyChar.isDigit(ts[k + 1]):
            return 0  # reject digit after decimal point

        if ls >= 2:  # at least 2 chars to check after day number
            if z == u'-':
                #               print 'hypen ls=' , ls , 'k=' , k
                if ellyChar.isDigit(ts[k + 1]):  # hyphen, digit match
                    #                   print 'digit=' , ts[k+1]
                    self._dy.append(z)
                    self._dy.append(ts[k + 1])
                    if ls == 2:  # only 2 chars to check?
                        k += 2  # add hyphen, digit to day
                    elif ls == 3:  # only 3 chars to check?
                        #                       print 'ts[k]=' , ts[k:]
                        # NOTE(review): when ts[k + 2] is a letter neither branch
                        # below runs, so k is unchanged although the hyphen and
                        # digit were already appended to self._dy - confirm intent
                        if not ellyChar.isLetterOrDigit(ts[k + 2]):  #
                            k += 2  # add hyphen, digit to day
                        elif ellyChar.isDigit(
                                ts[k + 2]):  # found second digit to add?
                            self._dy.append(ts[k +
                                               2])  # if so, add to day string
                            k += 3
                    elif not ellyChar.isLetterOrDigit(
                            ts[k + 2]):  # more than 3 chars to check?
                        k += 2  # if not, we are done
                    elif ellyChar.isDigit(ts[k + 2]):  # check for second digit
                        #                       print 'k=' , k
                        if ls > 3 and ellyChar.isDigit(ts[k + 3]):
                            return 0    # reject 3-digit number after hyphen
                        if ts[k + 1] > '3':  # check for valid day
                            return 0
                        if ts[k + 1] == '3' and ts[k + 2] > '1':
                            return 0
                        self._dy.append(ts[k + 2])
                        k += 3
                    else:
                        return 0  # no other hyphen allowed in day
                else:
                    return 0  #

        t = ts[k:]              # remainder of stream after the day match so far
        #       print 'k=' , k , 't=' , t
        if len(t) == 0 or not ellyChar.isLetterOrDigit(t[0]):
            return k

        # a letter follows: only a two-letter ordinal suffix is acceptable
        if ellyChar.isDigit(t[0]) or len(t) < 2:
            return 0
        sx = t[0].lower() + t[1].lower()

        #       print 'y=' , y , 'x=' , x , 'sx=' , sx

        # x is the last digit of the first day number, y its first digit if any
        if x == '1':
            #           print 'end of day=' , y
            if y == '1':        # 11 takes th
                if sx != 'th': return 0
            elif sx != 'st': return 0
        elif x == '2':
            if sx != 'nd': return 0
        elif x == '3':
            if sx != 'rd': return 0
        else:
            #           print 'default ordinal indicator'
            if sx != 'th': return 0

#       print 'ord k=' , k
        # NOTE(review): t is not read again after this assignment
        t = t[2:]
        k += 2

        #       print 'k=' , k , 'len=' , len(ts)

        if len(ts) == k:  # check next char in stream
            return k  # if none, match succeeds
        elif ellyChar.isLetterOrDigit(ts[k]):
            #           print 'ts[k]=' , ts[k] , k
            return 0  # otherwise, match fails if next char is alphanumeric
        else:
            #           print 'return k=' , k
            return k  # otherwise succeed

Beispiel #32

Datei anzeigen

    def build(self, inp):
        """
        build tree logic from definition reader input

        each input line is stripped, lowercased, and split on single spaces
        into at least four fields, used in order as
            affix  condition  nsave  action  [modifier]
        where condition and nsave must be integers; the affix chars, as
        reordered by self.sequence(), select a path of nodes under self.indx
        and the final node gets the condition and an Action

        arguments:
            self  -
            inp   - definition text for logic, read with readline()

        exceptions:
            TableFailure on error
        """

        if inp is None:
            return

        nerr = 0  # error count

        # read in affixes and associated actions

        while True:

            line = inp.readline()  # next input line
            if line == '':  # check for EOF
                break

            modf = ''
            elem = line.strip().lower().split(' ')
            le = len(elem)
            if le < 4:
                nerr += 1
                print("** affix error: incomplete input", file=sys.stderr)
                print("*  at: [", line, "]", file=sys.stderr)
                continue  # skip incomplete line
            if le > 4:  # affix mod specified?
                modf = elem.pop()  # if so, get it
            do = elem.pop()  # note main action

            # get affix within definition line

            aff = list(elem.pop(0))  # affix as list of chars

            # check for proper form

            aff = self.sequence(aff)  # backward or forward  matching?

            c = aff[0]  # get first char to compare with
            aff = aff[1:]

            if (not ellyChar.isLetter(c)
                    and c != '+'):  # affix must start with letter or '+'
                nerr += 1
                print("** affix error: must start with letter or '+'",
                      file=sys.stderr)
                print("*  at: [", line, "]", file=sys.stderr)
                continue  # ignore line

            if not c in self.indx:  # node not already in tree index?
                self.indx[c] = Node()  # add new node

            node = self.indx[c]

            for a in aff:  # now check each successive char in affix
                if a in node.contn:
                    node = node.contn[a]  # go to existing node if found
                else:
                    new = Node()  # otherwise make new node
                    node.contn[a] = new  # and insert into tree
                    node = new  # and move down

            # at final node in tree logic

            # both numeric fields are parsed under one handler so that a
            # malformed condition is reported and the line ignored, exactly
            # as for a malformed nsave, instead of escaping as ValueError

            try:
                node.condn = int(elem.pop(0))  # condition for match
                nsave = 0 if len(elem) == 0 else int(elem.pop())
            except ValueError as e:
                print(e, file=sys.stderr)
                print("*  at: [", line, "]", file=sys.stderr)
                continue  # ignore line

            resto = [Add]  # set to defaults
            recur = False  #

            mode = do[-1]  # kind of recursion
            rest = do[:-1]  # added chars to fill out root
            if mode == '?':
                node.condn = 1
                resto = [Fail]  # will generate fatal error
            elif ellyChar.isDigit(mode):
                nerr += 1
                print("* bad action mode=", mode, file=sys.stderr)
                continue
            else:
                if mode == ',':  # allow recursion?
                    recur = True  # if so, change default
                if len(rest) == 1 and rest[0] == '&':
                    resto = [RestorE]
                else:
                    resto += list(rest)

            if self.addn is not None:
                resto.insert(1, self.addn)  # insert AFTER first char of list

            # insert action

            node.actns = Action(self, nsave, resto, recur, modf)
            node.tag()

        if nerr > 0:
            print("**", nerr, "affix errors in all", file=sys.stderr)
            raise ellyException.TableFailure

Beispiel #33

Datei anzeigen

    def _matchN(self, ts):
        """
        apply logic for numeric only date recognition

        scans up to Ln leading chars of digits separated by one or two
        slashes or hyphens and reads them as month/day, month/day/year, or,
        when the first component has 4 digits or a leading zero,
        year/month/day; on success the date is stored as lists of chars in
        self._mo and self._dy, and in self._yr when a year is present

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched, 0 on no match
        """

        lts = len(ts)
        if lts < Lm: return 0  # shortest date is 0/0
        if not ellyChar.isDigit(ts[0]): return 0

        n = min(Ln, lts)  # never scan past the longest possible date

        ss = []  # substring to compare
        ns = 0  # slash count

        k = 0
        while k < n:
            c = ts[k]
            if c == '/':
                ns += 1
            elif c == '-':  # hyphen is treated as a slash
                ns += 1
                c = '/'
            elif not ellyChar.isDigit(c):
                break
            ss.append(c)
            k += 1

        if k < Lm: return 0
        if ns != 1 and ns != 2: return 0

        if k < lts and ellyChar.isLetterOrDigit(ts[k]):
            return 0  # date must not run directly into more text

        dt = ''.join(ss).split('/')

        dt0 = dt.pop(0)  # get first two date components
        dt1 = dt.pop(0)  #

        if len(dt0) == 4 or dt0[0] == '0':
            if ns == 1: return 0  # year-first form needs all three components
            dt.append(dt0)  # put first component at end if it looks like year
            dt0 = dt1  # move month up
            # take the day from the FRONT of the list; a plain pop() would
            # hand back the year just appended, so no year-first date matched
            dt1 = dt.pop(0)  # move date  up

        # empty component comes from adjacent or trailing separators
        if dt0 == '' or dt1 == '': return 0

        m = int(dt0)
        if m < 1 or m > 12: return 0  # check validity of month
        d = int(dt1)
        if d < 1 or d > 31: return 0  # check validity of day
        if ns == 2:
            y = dt.pop(0)  # if there is a year, process it also
            ly = len(y)
            if ly == 4:  # 4-digit year?
                s = y[0]
                if s != '1' and s != '2': return 0
                yls = list(y)
            elif ly == 2:
                # pick century prefix by string comparison against self.ycur
                ix = 0 if y > self.ycur else 1
                yls = list(self.cent[ix] + y)
            else:
                return 0  # fail on any other number of year digits

            self._yr = yls  # handle year

        self._mo = list(dt0.zfill(2))  # handle month
        self._dy = list(dt1.zfill(2))  # handle day
        return k

Beispiel #34

Datei anzeigen

Datei: ellyWildcard.py Projekt: prohippo/pyelly

def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    literal pattern chars are compared one by one with the text; on a
    mismatch the pattern element is interpreted as a wildcard or control
    code, and on failure the match backs up through saved unwinding records

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching (defaults to length of text)
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise; the bindings are a
        list whose first element is the text offset reached by the match and
        whose other elements are [start, end] offset pairs
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  +
                     ',ns=' + unicode(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #
    # they only read mbi, unj, mp, and offs; the caller advances mbi or unj
    # itself after each call to _bind() or _mark()
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1         # offs has already moved past the char being bound
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        # a third element marks the frame so that the final consolidation
        # keeps it as a separate binding
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw)
#       print "_span: txt @",offs,"pat @",mp,"nsp=",nsp
#       print "text to span:",text[offs:]
#       print "pat rest=" , patn[mp:]
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print "exclude=",k,"chars from possible span for rest of pattern"

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print mx,"chars available to scan"
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print 'span c=' , c
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print 'starting match, limt=',limt,text[offs:limt],":",patn
#   print 'nsps=' , nsps

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''        # most recent text char taken, '' when text is used up

    while True:

        ## literally match as many next chars as possible

#       print '---- loop mp=' , mp , 'ml=' , ml
        while mp < ml:
            if offs >= limt:
#               print "offs=",offs,"limt=",limt
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print 'patn=' , patn
            mc = patn[mp]
#           print 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs
#           print 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')'
            if mc != last:
                if mc != last.lower():
                    # a pattern hyphen may also match ' - ' in the text
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print 'hyphen special matching, limt=', limt , 'offs=' , offs
#                       print 'text[offs:]=' , text[offs:]
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print 'no special matching of hyphen'
                        break

#           print 'matched @mp=' , mp
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat @',mp,"<",ml
#       print "txt @",offs,'<',limt,'last=',last
#       print '@',offs,text[offs:]

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",'{:04x}'.format(ord(tc)),deconvert(tc)

        # each branch below either continues the main loop on success or
        # falls through to the unwinding code on failure

        if tc == cALL:      # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print "offs=",offs,'nm=',nm
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print "ANY:",last,offs
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print 'at cCAN'
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print "UPR:",last,'@',offs
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print "LWR:",last,'@',offs
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:","["+last+"]"
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print 'NO space'

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print 'spanning wildcard, offs=' , offs , 'last=(' + last + ')'
            if last != '':               # still more to match?
                offs -= 1
#               print 'nsps=' , nsps
#               print '@' , offs , text
                nm = _span(tc,nsps)      # maximum match possible

#               print 'spanning=' , nm
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print 'offs=' , offs
                    last = text[offs] if offs < limt else ''
                    continue
#           print 'fail tc=' , deconvert(tc)

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print "fail - unwinding" , unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted
#       print 'cnt=' , uf.count , 'off=' , offs

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating consecutive bindings"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    # NOTE(review): every frame still in mbd is consolidated here, not only
    # those below mbi, so frames left over from unwound branches are
    # included - confirm this is intended

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # tagged by _modify(): keep separate, drop the tag
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    return mbd             # consolidated bindings plus new offset

Beispiel #35

Datei anzeigen

Datei: ellyWildcard.py Projekt: belkhir-nacim/pyelly

def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text

    literal pattern chars are compared against the lowercased text chars;
    special pattern chars (the c* wildcard codes) are handled by the branches
    of the main loop, with backtracking through a stack of Unwinding records

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching, defaults to len(text)

    returns:
        bindings if match is successful, None otherwise

        the bindings are a list whose first element is the text offset reached
        by the match, followed by [ start , end ] offset pairs for wildcards
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes formatted as a single bracketed string
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match()
    # (they only read offs, mbi, unj, and mp from the enclosing scope at call
    #  time; all updates to those variables are done by the main loop itself)
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame, a list [ start , end ] stored at mbd[mbi]
        """
#       print "binding:",offs,ns
        os = offs - 1         # main loop has already stepped past the char to bind
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record (may be a reused frame)
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        appends a None marker to the frame at mbd[mbi], making its length > 2;
        the consolidation pass at the end of match() strips the marker and
        keeps such a frame as a separate binding, never merged with neighbors
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
        returns:
            unwinding frame, recording the current pattern index, text offset,
            and binding index, with a backup count of 1
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])  # calculate min char count to match rest of pattern

#       print "exclude=",k,"@",offs

        # calculate maximum chars a wildcard can match
        # NOTE(review): findBreak() is presumably the char count from offs up to
        # the next break in text - confirm against ellyChar

        mx = ellyChar.findBreak(text,offs) - k # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # char type matching a wildcard

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit

#   print text[offs:limt],":",list(patn)

    while True:

        ## literally match as many next chars as possible
        ## (a text char is consumed, i.e. offs is advanced, even when it turns
        ##  out not to match; branches below undo this with offs -= 1 when the
        ##  pattern element does not itself take that char)

        while mp < ml:
            if offs >= limt:
                last = ''                  # '' means no more text to match
            else:
                last = text[offs].lower()
                offs += 1
#           print 'matching last=' , last , 'at' , offs
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat',mp,"<",ml
#       print "txt @",offs

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",ord(tc)

        if tc == cALL:   # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            # NOTE(review): _span() can return -1 here and the result is used
            # without a check, unlike the nm >= 1 test in the cSAN/cSDG/cSAL
            # branch below - confirm this is intended
            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1  # get new binding record
            bf[0] = offs              # bind from current offset
            offs += nm                # move offset past end of span
            bf[1] = offs              # bind to   new     offset
#           print "offs=",offs
            uf = _mark(1); unj += 1   # get new unwinding record
            uf.count = nm             # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            # each accepted case continues the main loop; anything else falls
            # through to the match failure handling below
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]      # char following the punctuation
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:"
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1   # tagged to stay a separate binding
                continue

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1   # tagged to stay a separate binding
                continue

        elif tc == cSOS: # start of optional part of pattern, ended by cEOS
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1   # dummy record to block
            mf[0] = -1                #   later binding consolidation
            if last != '':
                offs -= 1             # try for rematch
            m = mp                    # find corresponding EOS
            while m < ml:             #
                if patn[m] == cEOS: break
                m += 1
            else:                     # no EOS?
                m -= 1                # if so, pretend there is one anyway
            uf = _mark(0); unj += 1   # for unwinding on any later match failure
            uf.pats = m + 1           # i.e. one char past next EOS
            uf.txts = offs            # start of text before optional match
            continue

        elif tc == cEOS: # end of optional part reached while matching it
#           print "EOS"
            if last != '':
                offs -= 1             # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
            # wildcards spanning one or more chars of a type given by Matching[tc]
            if last != '':            # still more to match?
                offs -= 1
                nm = _span(tc)        # maximum match possible
#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
                    continue

        elif tc == '':                # empty pattern element
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ## (the inner break resumes the main loop from the restored state; the
        ##  else clause runs only when no unwinding record is left, and its
        ##  break leaves the main loop with matched still False)

#       print "fail - unwinding",unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions
    ## NOTE(review): this walks every frame ever allocated in mbd, not only
    ## those below the final mbi, so frames left from an unwound branch would
    ## be included - confirm whether that can happen in practice

#   print "BEFORE consolidating"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference: end offset of previous binding
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # frame tagged by _modify()?
            bd.pop()       # if so, strip the tag
            mbd.append(bd) #   and keep frame as its own binding
            lbd = bd
            lb = -1        # next binding will not be merged into this one
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    return mbd             # consolidated bindings plus new offset

Beispiel #36

Datei anzeigen

Datei: dateTransform.py Projekt: belkhir-nacim/pyelly

    def _aDay ( self , ts ):

        """
        parse a day number, optionally followed by an English ordinal suffix

        accepts one or two leading digits (first of two digits at most '3',
        nothing above 31) as in "7", "21", "3rd", or "11th"; the digits found
        are stored into self._dy as a side effect, even when a later check on
        the suffix makes the match fail

        arguments:
            self  -
            ts    - text stream as list of chars

        returns:
            total number of chars matched, 0 on no match
        """

        if len(ts) == 0:
            return 0

        x = ts[0]
        if not ellyChar.isDigit(x):
            # NOTE(review): rewriteNumber() presumably rewrites a spelled-out
            # number at the start of ts into digits in place - confirm
            if not self.rewriteNumber(ts):
                return 0
            x = ts[0]

        tens = None                  # first digit of a 2-digit day, if any

        if len(ts) == 1:
            # NOTE(review): a lone digit at end of input goes to _dy[0], but a
            # single digit followed by more text goes to _dy[1] - confirm
            self._dy[0] = x          # accept at end of input as possible date
            return 1
        elif not ellyChar.isDigit(ts[1]):
            k = 1                    # single-digit day
        elif x > '3':                # reject first digit bigger than '3'
            return 0
        else:
            tens = x                 # save first digit
            x = ts[1]                # this known to be second digit
            if tens == '3' and x > '1': # reject day > 31
                return 0

            lr = len(ts) - 2         # how many chars after possible date
            if lr > 0:
                z = ts[2]
                if ellyChar.isDigit(z):
                    return 0         # reject 3-digit date
                if z == '.' and lr > 1 and ellyChar.isDigit(ts[3]):
                    return 0         # reject 2 digits before decimal point
            self._dy[0] = tens
            k = 2

        self._dy[1] = x

        t = ts[k:]
        if len(t) == 0 or not ellyChar.isLetterOrDigit(t[0]):
            return k                 # bare number with no suffix

        if ellyChar.isDigit(t[0]) or len(t) < 2:
            return 0
        sx = t[0].lower() + t[1].lower()

        # the suffix follows the last digit, except that 11, 12, and 13
        # take 'th' instead of 'st', 'nd', and 'rd'

        if tens == '1':
            expected = 'th'
        elif x == '1':
            expected = 'st'
        elif x == '2':
            expected = 'nd'
        elif x == '3':
            expected = 'rd'
        else:
            expected = 'th'

        if sx != expected:
            return 0

        k += 2                       # count the two suffix chars

        if len(ts) == k: # check next char in stream
            return k     # if none, match succeeds
        elif ellyChar.isLetterOrDigit(ts[k]):
            return 0     # otherwise, match fails if next char is alphanumeric
        else:
            return k     # otherwise succeed

Beispiel #37

Datei anzeigen

Datei: cognitiveDefiner.py Projekt: ivanjelinek/pyelly

def _rightside ( stb , txt ):

    """
    turn the right side of a clause into a list of semantic actions

    the text may hold, in this order: an inheritance marker (*l or *r),
    a bracketed semantic feature specification, a plausibility change
    (+ or - followed by digits, or a run of the same sign), and a concept
    name after a final space

    arguments:
        stb   - symbol table
        txt   - string input for single clause

    returns:
        action list on success, None otherwise
    """

    actions = [ ]
    concept = ''                     # stays empty when no concept is given

    # a concept is whatever follows the last space, but only when that space
    # lies after any closing bracket of a semantic feature specification

    bracket = txt.rfind(']')
    space   = txt.rfind(' ')
    if space > bracket:
        concept = txt[space:].strip().upper()
        txt = txt[:space]

    # optional inheritance from a phrase component

    if len(txt) > 1 and txt[0] == '*':
        side = txt[1]
        if side == 'l':
            actions.append([ semanticCommand.Clhr ])
        elif side == 'r':
            actions.append([ semanticCommand.Crhr ])
        else:
            return _err('bad inheritance')
        txt = txt[2:].strip()

    # optional semantic features to set for phrase

    if len(txt) > 3 and txt[0] == '[':
        close = txt.find(']')
        if close < 3:
            return _err('incomplete semantic features to set')
        spec = txt[:close+1]
        try:
            fs = featureSpecification.FeatureSpecification(stb,spec,semantic=True)
        except ellyException.FormatFailure:
            return _err('bad semantic features')
        actions.append([ semanticCommand.Csetf , fs.positive ])
        txt = txt[close+1:]

    # optional plausibility change, zero when nothing is left of the text

    change = 0
    if len(txt) > 0:
        sign = txt[0]
        if sign not in ( '+' , '-' ):
            return _err('plausibility must begin with + or -')

        rest = txt[1:]
        if len(rest) == 0:           # lone sign means change of one
            change = 1
        elif ellyChar.isDigit(rest[0]):
            try:
                change = int(rest)   # explicit numerical change
            except ValueError:
                return _err('bad cognitive plausibility: ' + txt)
        elif rest[0] == sign:        # alternate notation: repeated sign
            if rest != sign*len(rest):
                return _err('must be all + or all -')
            change = len(txt)        # one for each sign char
        else:
            return _err('cannot interpret clause: ' + txt)

        if sign == '-':
            change = -change

    if len(concept) > 0:
        actions.append([ semanticCommand.Csetc , concept ])

    actions.append([ semanticCommand.Cadd , change ])
    return actions

Beispiel #38

Datei anzeigen

Datei: stopExceptions.py Projekt: prohippo/pyelly

    def match ( self , txt , pnc , ctx ):

        """
        compare a punctuation mark and its context with a pattern

        each stored pattern for the punctuation char has a left part, matched
        against the chars before the punctuation, and a right part, of which
        only the first element is checked, against the second context char

        arguments:
            self  -
            txt   - list of text chars leading up to punctuation char
            pnc   - punctuation char
            ctx   - next chars after punctuation

        returns:
            True on match, False otherwise
        """

        if matchtoo(txt,pnc,ctx):     # exception by complex match?
            return True

        sep = ctx[0] if len(ctx) > 0 else ''
        if sep == ellyChar.THS:       # this char right after punctuation always matches
            return True
        nxt = ctx[1] if len(ctx) > 1 else ''   # '' when there is no second char

        if pnc not in self.lstg:     # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

        ltx = len(txt)               # current length of accumulated text so far

        # count trailing alphanumeric chars

        ntr = 1
        while ntr <= ltx:
            if not ellyChar.isLetterOrDigit(txt[-ntr]):
                break
            ntr += 1
        nrg = ntr
        ntr -= 1                     # available trailing chars for  wildcard * match

        # extend the range backward over embedded combining chars as well

        while nrg <= ltx:
            c = txt[-nrg]
            if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
                break
            nrg += 1
        nrg -= 1                     # end of range for all pattern matching

        # NOTE(review): when nrg is 0, txt[-0:] keeps the whole text instead of
        # an empty range - confirm whether that is intended
        txt = txt[-nrg:]             # reset text to limit for matching
        ltx = len(txt)               # its new length

        for p in lp:                 # try matching each listed exception pattern

            if p.left is not None and len(p.left) > 0:

                pat = p.left
                star = pat[-1] == ellyWildcard.cALL
                n = len(pat)         # each pattern element matches one sequence char
                if star:             # except for a final wildcard *
                    n -= 1
                    if ltx < n:
                        continue     # cannot match pattern properly
                    pat = pat[:-1]
                    t = txt[:n]      # match from start of range, * takes the rest
                else:
                    if ltx < n:
                        continue     # cannot match pattern properly
                    t = txt[-n:]     # match against end of range

                if not ellyWildcard.match(pat,t,0):
                    continue

                k = ltx - n          # extra chars beyond any match
                if not star and k > 0:
                    if ellyChar.isLetterOrDigit(txt[-n]):
                        c = txt[-n-1]
                        if ellyChar.isLetterOrDigit(c) or c == '&':
                            continue # because break in text is required

            # left side is satisfied or absent; now check the right context

            rp = p.right
            if rp == [] or rp[0] == ellyWildcard.cALL:
                return True
            pcx = rp[0]
            if pcx == nxt:                     # check for specific char after possible stop
                return True
            elif pcx == ellyWildcard.cALF:     # check for alphabetic
                if ellyChar.isLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cDIG:     # check for numeric
                if ellyChar.isDigit(nxt):
                    return True
            elif pcx == ellyWildcard.cUPR:     # check for upper case
                if ellyChar.isUpperCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cLWR:     # check for lower case
                if ellyChar.isLowerCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cCAN:     # check for non-alphanumeric
                # this used to test isLetter(nxt), duplicating the cALF branch
                if not ellyChar.isLetterOrDigit(nxt):
                    return True

        return False