def scan(strg):
    """
    measure the extent of a possible syntax specification

    arguments:
        strg  - string of chars to scan

    returns:
        char count > 0 on finding possible syntax specification, 0 otherwise
    """

    size = len(strg)
    pos = 0
    while pos < size:                  # leading run of '.', letters, digits
        ch = strg[pos]
        if ch != '.' and not ellyChar.isLetterOrDigit(ch):
            break
        pos += 1

    if pos == size:                    # nothing follows the run
        return pos

    stop = strg[pos]                   # first char past the run
    if stop == ' ':                    # run ends cleanly at a space
        return pos
    if stop != '[':                    # anything but a feature list is bad
        return 0

    # a '[' must start a feature specification; add its extent to the run
    extent = featureSpecification.scan(strg[pos:])
    if extent > 0:
        return pos + extent
    return 0
def scan(strg):
    """
    check how far a possible syntax specification extends in a string

    arguments:
        strg  - string of chars to scan

    returns:
        char count > 0 on finding possible syntax specification, 0 otherwise
    """

    for n, c in enumerate(strg):
        # stop at the first char that is not '.', a letter, or a digit
        if not (c == '.' or ellyChar.isLetterOrDigit(c)):
            break
    else:
        return len(strg)               # every char accepted (0 for empty input)

    # here c is strg[n], the char that ended the run
    if c == ' ':
        return n
    if c == '[':
        k = featureSpecification.scan(strg[n:])   # extent of feature list
        if k > 0:
            return n + k
    return 0
def compile ( name , stb , defn ): """ static method to create an Elly vocabulary database from text file input arguments: name - for new SQLite database stb - Elly symbol table defn - Elly definition reader for vocabulary exceptions: TableFailure on error """ global nerr nerr = 0 cdb = None # SQLite db connection cur = None # SQLite db cursor # print 'compiled stb=' , stb if stb == None : print >> sys.stderr, 'no symbol table' raise ellyException.TableFailure try: zfs = FSpec(stb,'[$]',True).positive.hexadecimal(False) except ellyException.FormatFailure: # should never need this print >> sys.stderr , 'unexpected failure with zero features' raise ellyException.TableFailure # print 'zfs=' , zfs # hexadecimal for all features off tsave = '' # original term dsave = '' # definition try: filn = name + vocabulary # where to put vocabulary database try: os.remove(filn) # delete the file if it exists except OSError: print >> sys.stderr , 'no' , filn # if no such file, warn but proceed #### SQLite #### try: cdb = dbs.connect(filn) # create new database cur = cdb.cursor() cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)") cdb.commit() except dbs.Error , e: print >> sys.stderr , e raise ellyException.TableFailure # give up on any database failure # print 'creating' , filn # #### r = None # for error reporting while True: # process vocabulary definition records try: # for catching FormatFailure exception # print '------------' r = defn.readline() # next definition if len(r) == 0: break # stop on EOF # print type(r) , r k = r.find(':') # look for first ':' if k < 0: tsave = r dsave = None _err() # report error and quit entry t = r[:k].strip() # term to go into dictionary d = r[k+1:].strip() # its definition tsave = t # save for any error reporting dsave = d # # print ' tm=' , '<' + t + '>' , 'df=' , '<' + d + '>' if len(t) == 0 or len(d) == 0: _err() # quit on missing parts c = t[0] if not ellyChar.isLetterOrDigit(c) and c != '.' 
and c != '"': _err('bad term') n = delimitKey(t) # get part of term to index if n <= 0: _err() # quit on bad term wky = toKey(t[:n]) # key part of term to define # print ' SQLite key=' , wky ns = syntaxSpecification.scan(d) # find extent of syntax info # print 'ns=' , ns if ns <= 0: _err('bad syntax specification') # print 'PoS=' , d[:ns] syn = d[:ns] # syntax info as string d = d[ns:].strip() # rest of definition try: # print 'VT syn=' , syn ss = SSpec(stb,syn) # decode syntax info # print 'VT ss =' , ss except ellyException.FormatFailure: _err('malformed syntax specification') cat = str(ss.catg) # syntax category syf = ss.synf.positive.hexadecimal(False) # syntactic flags # print 'syf=' , syf smf = zfs # initialize defaults for pb = '0' # cognitive semantics cn = conceptualHierarchy.NOname # # print '0:d=[' + d + ']' if len(d) > 1: # check for cognitive semantics x = d[0] if x == '[' or x == '0' or x == '-': # semantic features? if x != '[': # a '0' or '-' means to take default if len(d) == 1 or d[1] != ' ': _err('missing semantic features') d = d[2:].strip() # skip over else: ns = featureSpecification.scan(d) # look for ']' of features # print 'ns=' , ns if ns < 0: _err() sem = d[:ns] # get semantic features d = d[ns:].strip() # skip over try: # print 'smf=' , smf fs = FSpec(stb,sem,True) except ellyException.FormatFailure: _err('bad semantic features') smf = fs.positive.hexadecimal(False) # convert to hex # print '1:d=[' + d + ']' ld = len(d) # print 'ld=' , ld if ld == 0: _err('missing plausibility') np = 0 x = d[np] if x == '+' or x == '-': np += 1 # take any plus or minus sign while np < ld: # and successive digits if ellyChar.isDigit(d[np]): np += 1 else: break # print 'np=' , np if np == 0: _err('missing plausibility') pb = d[:np] # plausibility bias # print 'pb=' , pb d = d[np:] ld = len(d) # print '2:d=[' + d + ']' if ld > 1: # any more to process? 
c = d[0] # get next char after bias d = d[1:] # advance scan ld -= 1 if c == '/': # check for explicit concept # print 'getting concept' np = 0 while np < ld: # get extent of concept if ellyChar.isWhiteSpace(d[np]): break np += 1 if np == 0: _err('missing concept for plausibility') cn = d[:np] # extract concept d = d[np:] elif c != ' ': _err() # signal bad format elif ld > 0: _err() # unidentifiable trailing text d = d.strip() # rest of definition # print 'rest of d=' , d if len(d) > 0 and d[-1] == '=': if len(d) == 1 or d[0] != '=': _err('incomplete definition') ld = [ ] # for normalizing definition k = 0 # count spaces removed sd = '' # previous char seen for cd in d: # scan all chars in translation if cd == ' ': if sd == '=' or sd == ',' or sd == ' ': k += 1 sd = cd continue elif cd == '=' or cd == ',': # no spaces before '=' or ',' if sd == ' ': k += 1 ld.pop() if cd == ',': if sd == '=': _err('missing translation') cd = '#' # format for PICK operation elif cd == '=' and sd == '=': print >> sys.stderr , '** WARNING \'=\' followed by \'=\'' print >> sys.stderr , '* at [' , tsave , ']' sd = cd ld.append(cd) # add char to reformatted definition if k > 0: d = ''.join(ld) # definition with spaces removed # print '3:d=[' + d + ']' vrc = [ t , ':' , cat , syf , smf , pb , cn ] # start data record vss = u' '.join(vrc) # convert to string vss += u' ' + d # fill out record with rest of input # print 'type(vss)=' , type(vss) # print 'rec=' , vrc , 'tra=' , d # print ' =' , vss except ellyException.FormatFailure: print >> sys.stderr , '* at [' , tsave , if dsave != None: print >> sys.stderr , ':' , dsave , print >> sys.stderr , ']' continue # skip rest of processing #### SQLite #### try: sql = "INSERT INTO Vocab VALUES(?,?)" # print type(wky) , wky , type(vss) , vss cur.execute(sql,(wky,vss)) except dbs.Error , e: print >> sys.stderr , 'FATAL' , e sys.exit(1)
def build(name, stb, defn):
    """
    static method to create an Elly vocabulary database from text file input

    Any existing file name + vocabulary is removed and a new SQLite database
    with a single table Vocab(Keyx, Defn) is created in its place. Records
    are then read from defn with readline() until it returns an empty
    result. Each record is normalized by definitionLine.normalize() and
    split at the first ' : ' into a term and its definition, which is
    parsed in order as

        syntax specification
        optional semantic features ('[...]', or '0' / '-' for the default)
            followed by a plausibility bias and an optional '/concept'
        remaining text, with spaces around '=' and ',' squeezed out

    A good record is inserted as ( key , 'term =: cat syf smf pb cn rest' ).
    A record with a format error is reported on stderr and skipped.
    Insertions are committed only if the module-level error count nerr is
    still zero at the end of input.

    arguments:
        name  - for new SQLite database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary

    exceptions:
        TableFailure on error
        (a failed INSERT ends the process with sys.exit(1) instead)
    """

    global nerr
    nerr = 0

    cdb = None                          # SQLite db connection
    cur = None                          # SQLite db cursor
    r = None                            # last record read; bound here so that the
                                        # outer handler can always report it

    if stb is None:
        print('no symbol table', file=sys.stderr)
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb, '[$]', True).positive.hexadecimal(False)
    except ellyException.FormatFailure:  # should never need this
        print('unexpected failure with zero features', file=sys.stderr)
        raise ellyException.TableFailure
                                        # zfs = hexadecimal for all features off

    tsave = ''                          # original term
    dsave = ''                          # definition

    try:
        filn = name + vocabulary        # where to put vocabulary database
        try:
            os.remove(filn)             # delete the file if it exists
        except OSError:
            print('no', filn, file=sys.stderr)  # if no such file, warn but proceed

        #### SQLite DB operations ####
        try:
            cdb = dbs.connect(filn)     # create new database
            cur = cdb.cursor()
            cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)")
            cdb.commit()
        except dbs.Error as e:
            print(e, file=sys.stderr)
            raise ellyException.TableFailure  # give up on any database failure
        ####

        while True:                     # process vocabulary definition records

            try:                        # for catching FormatFailure exception
                r = defn.readline()     # next definition
                if len(r) == 0:
                    break               # stop on EOF
                r = definitionLine.normalize(r)

                k = r.find(' : ')       # look for first ' : '
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()              # report error and quit entry

                t = r[:k].strip()       # term to go into dictionary
                d = r[k + 2:].strip()   # its definition
                tsave = t               # save for any error reporting
                dsave = d
                if len(t) == 0 or len(d) == 0:
                    _err()              # quit on missing parts

                if ellyConfiguration.language == 'ZH':  # special key for Chinese
                    wky = toKeyZH(t[0])
                else:
                    c = t[0]
                    if not ellyChar.isLetterOrDigit(c) and c not in initChr:
                        _err('bad term')
                    n = delimitKey(t)   # get part of term to index
                    if n <= 0:
                        _err()          # quit on bad term
                    wky = toKey(t[:n])  # key part of term to define

                ns = syntaxSpecification.scan(d)  # find extent of syntax info
                if ns <= 0:
                    _err('bad syntax specification')
                if ns < len(d) and d[ns] != ' ':
                    _err('trailing chars in syntax specification')

                syn = d[:ns]            # syntax info as string
                d = d[ns:].strip()      # rest of definition

                try:
                    ss = SSpec(stb, syn)  # decode syntax info
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                cat = str(ss.catg)      # syntax category
                cid = _smfchk[ss.catg]  # associated semantic feature ID
                syf = ss.synf.positive.hexadecimal(False)  # syntactic flags

                smf = zfs               # initialize defaults for
                pb = '0'                #   cognitive semantics
                cn = conceptualHierarchy.NOname

                if len(d) > 1:          # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':    # a '0' or '-' means to take default
                            if d[1] != ' ':  # len(d) > 1 is already known here
                                _err('missing semantic features')
                            d = d[2:].strip()  # skip over
                        else:
                            ns = featureSpecification.scan(d)  # look for ']' of features
                            # 0 is also a failure value: syntaxSpecification.scan
                            # treats any extent <= 0 from this call as "not found",
                            # and sem[1] below would fail on an empty slice
                            if ns <= 0:
                                _err()
                            sem = d[:ns]        # get semantic features
                            d = d[ns:].strip()  # skip over for subsequent processing
                            sid = sem[1]        # feature ID
                            if sid != cid:
                                if cid is not None:
                                    _err('inconsistent semantic feature id')
                                _smfchk[ss.catg] = sid  # first ID seen for category
                            try:
                                fs = FSpec(stb, sem, True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                            smf = fs.positive.hexadecimal(False)  # convert to hex

                        ld = len(d)
                        if ld == 0:
                            _err('missing plausibility')
                        pe = 0          # end of plausibility bias in d
                        x = d[pe]
                        if x == '+' or x == '-':
                            pe += 1     # take any plus or minus sign
                        while pe < ld and ellyChar.isDigit(d[pe]):
                            pe += 1     # and successive digits
                        if pe == 0:
                            _err('missing plausibility')
                        pb = d[:pe]     # plausibility bias
                        d = d[pe:]
                        ld = len(d)

                        if ld > 1:      # any more to process?
                            c = d[0]    # get next char after bias
                            d = d[1:]   # advance scan
                            ld -= 1
                            if c == '/':  # check for explicit concept
                                ce = 0
                                while ce < ld and not ellyChar.isWhiteSpace(d[ce]):
                                    ce += 1  # get extent of concept
                                if ce == 0:
                                    _err('missing concept for plausibility')
                                cn = d[:ce]  # extract concept
                                d = d[ce:]
                            elif c != ' ':
                                _err()  # signal bad format
                        elif ld > 0:
                            _err()      # unidentifiable trailing text

                    elif d[0] != '(':
                        # leading letters and digits must be followed by '=';
                        # the scan is bounded so that a definition made up only
                        # of letters and digits is reported through _err()
                        # instead of raising IndexError
                        i = 0
                        while i < len(d) and ellyChar.isLetterOrDigit(d[i]):
                            i += 1
                        if i == len(d) or d[i] != '=':
                            _err()

                d = d.strip()           # rest of definition
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')

                out = []                # for normalizing definition
                k = 0                   # count spaces removed
                sd = ''                 # previous char seen
                for cd in d:            # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1      # drop this space
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':  # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            # NOTE(review): if the preceding space was itself
                            # dropped above (e.g. '= ='), this removes a
                            # non-space char -- confirm that is intended
                            out.pop()
                        if cd == ',':
                            if sd == '=':
                                _err('missing translation')
                            cd = '#'    # format for PICK operation
                        elif cd == '=' and sd == '=':
                            print('** WARNING \'=\' followed by \'=\'', file=sys.stderr)
                            print('* at [', tsave, ']', file=sys.stderr)
                    sd = cd
                    out.append(cd)      # add char to reformatted definition

                if k > 0:
                    d = ''.join(out)    # definition with spaces removed

                vrc = [t, '=:', cat, syf, smf, pb, cn]  # start data record
                vss = ' '.join(vrc)     # convert to string
                vss += ' ' + d          # fill out record with rest of input

            except ellyException.FormatFailure:  # will catch exceptions from _err()
                print('* at [', tsave, end=' ', file=sys.stderr)
                if dsave is not None:
                    print(':', dsave, end=' ', file=sys.stderr)
                print(']', file=sys.stderr)
                continue                # skip rest of processing this rule

            #### SQLite DB operation ####
            try:
                sql = "INSERT INTO Vocab VALUES(?,?)"
                cur.execute(sql, (wky, vss))
            except dbs.Error as e:
                print('FATAL', e, file=sys.stderr)
                sys.exit(1)
            ####

        #### SQLite DB operations ####
        if nerr == 0:
            cdb.commit()
        cdb.close()                     # clean up
        ####

    # NOTE(review): no definition of Error is visible in this part of the
    # file -- confirm that it is in scope; Exception may have been intended
    except Error as e:                  # catch any other errors
        print('**', e, file=sys.stderr)
        print('* at', r, file=sys.stderr)
        nerr += 1

    if nerr > 0:
        print('**', nerr, 'vocabulary table errors in all', file=sys.stderr)
        print('* compilation FAILed', file=sys.stderr)
        if cdb is not None:             # failure may have come before connecting
            cdb.close()                 # discard any changes
        raise ellyException.TableFailure
def compile(name, stb, defn, stem=None):
    """
    static method to create an Elly vocabulary database from text file input

    Any existing file name + vocabulary is removed and a new hashed database
    allowing duplicate keys is opened in its place. Records are read from
    defn with readline() until it returns an empty result; a record starting
    with '#' is skipped as a comment. Each other record is split at the
    first ':' into a term and its definition, which is parsed in order as

        syntax specification
        optional semantic features ('[...]', or '0' / '-' for the default)
            followed by a plausibility bias and an optional '/concept'
        remaining text, with spaces around '=' and ',' squeezed out

    A good record is saved as 'term : cat syf smf pb cn rest', encoded as
    UTF-8, under the key lcAN(w), where w is the indexed part of the term,
    passed through stem.simplify() first when a stemmer is given. A record
    with a format error is reported on stderr and skipped. Any other
    exception is reported on stderr and added to the module-level count nerr.

    The statement syntax follows build() in this file (print function,
    'except ... as'), since the two forms cannot be parsed by one interpreter.

    arguments:
        name  - for new BSDDB database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary
        stem  - optional stemmer for indexing

    exceptions:
        TableFailure on error
    """

    global nerr
    nerr = 0

    if stb is None:
        print('no symbol table', file=sys.stderr)
        raise ellyException.TableFailure
    if db is None:
        print('no Python db package', file=sys.stderr)
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb, '[$]', True).positive.hexadecimal(False)
    except ellyException.FormatFailure:  # should never need this
        print('unexpected failure with zero features', file=sys.stderr)
        raise ellyException.TableFailure
                                        # zfs = hexadecimal for all features off

    tsave = ''                          # original term
    dsave = ''                          # definition
    r = None                            # last record read; bound here so that the
                                        # outer handler can always report it
    dbs = None                          # database handle, once created

    # NOTE(review): _err() presumably raises FormatFailure (the handler below
    # prints the context line for it); every top-level _err() call is still
    # followed by 'continue' so that a bad record is dropped either way

    try:
        filn = name + vocabulary        # where to put vocabulary database
        try:
            os.remove(filn)             # delete the file if it exists
        except OSError:
            print('no', filn, file=sys.stderr)

        dbs = db.DB()                   # create new database
        dbs.set_flags(db.DB_DUP)        # keys may identify multiple records
        dbs.open(filn, None, db.DB_HASH, db.DB_CREATE)  # open new database file

        while True:                     # process vocabulary records

            try:
                r = defn.readline()     # next definition
                if len(r) == 0:
                    break               # stop on EOF
                if r[0] == '#':
                    continue            # skip comment line

                k = r.find(':')         # look for first ':'
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()              # report error and quit entry
                    continue

                t = r[:k].strip()       # term to go into dictionary
                d = r[k + 1:].strip()   # its definition
                tsave = t               # save for any error reporting
                dsave = d
                if len(t) == 0 or len(d) == 0:
                    _err()              # quit on missing parts
                    continue

                c = t[0]
                if not ellyChar.isLetterOrDigit(c) and c != '.' and c != '"':
                    _err('bad term')
                    continue

                n = toIndex(t)          # get part of term to index
                if n == 0:
                    _err()              # quit on bad term
                    continue
                w = t[:n]               # first word of term to define
                if stem is not None:
                    try:
                        w = stem.simplify(w)  # reduce for lookup key
                    except ellyException.StemmingError:
                        _err('bad stemming logic')
                        continue
                lcw = lcAN(w)           # convert to ASCII lower case

                ns = syntaxSpecification.scan(d)  # find extent of syntax info
                if ns <= 0:
                    _err('bad syntax specification')
                    continue            # as on every other failure path here

                syn = d[:ns]            # syntax info as string
                d = d[ns:].strip()      # rest of definition

                try:
                    ss = SSpec(stb, syn)  # decode syntax info
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                    continue
                cat = str(ss.catg)      # syntax category
                syf = ss.synf.positive.hexadecimal(False)  # syntactic flags

                smf = zfs               # initialize defaults for
                pb = '0'                #   cognitive semantics
                cn = '-'

                if len(d) > 1:          # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':    # a '0' or '-' means to take default
                            if d[1] != ' ':  # len(d) > 1 is already known here
                                _err('missing semantic features')
                                continue
                            d = d[2:].strip()  # skip over
                        else:
                            ns = featureSpecification.scan(d)  # look for ']' of features
                            # 0 is also a failure value: syntaxSpecification.scan
                            # treats any extent <= 0 from this call as "not found"
                            if ns <= 0:
                                _err()
                                continue
                            sem = d[:ns]        # get semantic features
                            d = d[ns:].strip()  # skip over
                            try:
                                fs = FSpec(stb, sem, True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                                continue
                            smf = fs.positive.hexadecimal(False)  # convert to hex

                        ld = len(d)
                        if ld == 0:
                            _err('missing plausibility')
                            continue
                        pe = 0          # end of plausibility bias in d
                        x = d[pe]
                        if x == '+' or x == '-':
                            pe += 1     # take any plus or minus sign
                        while pe < ld and ellyChar.isDigit(d[pe]):
                            pe += 1     # and successive digits
                        if pe == 0:
                            _err('missing plausibility')
                            continue
                        pb = d[:pe]     # plausibility bias
                        d = d[pe:]
                        ld = len(d)

                        if ld > 1:      # any more to process?
                            c = d[0]    # get next char after bias
                            d = d[1:]   # advance scan
                            ld -= 1
                            if c == '/':  # check for explicit concept
                                ce = 0
                                while ce < ld and not ellyChar.isWhiteSpace(d[ce]):
                                    ce += 1  # get extent of concept
                                if ce == 0:
                                    _err('missing concept for plausibility')
                                    continue
                                cn = d[:ce]  # extract concept
                                d = d[ce:]
                            elif c != ' ':
                                _err()  # signal bad format
                                continue
                        elif ld > 0:
                            _err()      # unidentifiable trailing text
                            continue

                d = d.strip()           # rest of definition
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')
                        continue

                out = []                # for normalizing definition
                k = 0                   # count spaces removed
                sd = ''                 # previous char seen
                for cd in d:            # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1      # drop this space
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':  # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            out.pop()
                        if cd == ',':
                            if sd == '=':
                                _err('missing translation')
                            cd = '#'    # format for PICK operation
                        elif cd == '=' and sd == '=':
                            print('** WARNING \'=\' followed by \'=\'', file=sys.stderr)
                            print('* at [', tsave, ']', file=sys.stderr)
                    sd = cd
                    out.append(cd)      # add char to reformatted definition

                if k > 0:
                    d = ''.join(out)    # definition with spaces removed

                vrc = [t, ':', cat, syf, smf, pb, cn]  # start BdB data record
                vss = ' '.join(vrc)     # convert to string
                vss += ' ' + d          # fill out record with rest of input
                rss = vss.encode('utf8')  # convert to UTF-8

            except ellyException.FormatFailure:
                print('* at [', tsave, end=' ', file=sys.stderr)
                if dsave is not None:
                    print(':', dsave, end=' ', file=sys.stderr)
                print(']', file=sys.stderr)
                continue

            # NOTE(review): lcw is passed as returned by lcAN(); confirm the
            # key type the db package in use expects
            dbs.put(lcw, rss)           # save in database

    except Exception as e:              # catch any other errors
        print('**', e, file=sys.stderr)
        print('* at', r, file=sys.stderr)
        nerr += 1
    finally:
        if dbs is not None:
            dbs.close()                 # clean up, also when the build was cut short