def wrsnd(cur, workfiles):
    """Write every sound volume, sound file and sound clip row to its work file.

    The tables are walked hierarchically: for each "sndvol" row, its
    "sndfile" rows are read, and for each of those, its "snd" rows.

    Parameters:
      cur -- Open database cursor passed to jdb.dbread().
      workfiles -- Mapping with (at least) the keys 'sndvol', 'sndfile'
        and 'snd'; each value is handed to _wrrow() together with the
        row to be written.
    """
    vols = jdb.dbread(cur, "SELECT * FROM sndvol")
    for v in vols:
        # Write the row just read (previously an undefined name 'x'
        # was passed here and in the two calls below).
        _wrrow(v, workfiles['sndvol'])
        sels = jdb.dbread(cur, "SELECT * FROM sndfile s WHERE s.vol=%s", [v.id])
        for s in sels:
            _wrrow(s, workfiles['sndfile'])
            clips = jdb.dbread(cur, "SELECT * FROM snd c WHERE c.file=%s", [s.id])
            for c in clips:
                _wrrow(c, workfiles['snd'])
def labels_from_db(cur, filenum):
    """Return the path and clip rows for sound file number 'filenum'.

    Returns a 2-tuple (fname, clips) where 'fname' is the sound volume
    location joined with the sound file location, and 'clips' is the
    list of "snd" rows belonging to the file, ordered by (strt, leng).
    Returns (None, None) if no "sndfile" row has id 'filenum'.

    Raises RuntimeError if more than one row matches 'filenum'.
    """
    loc_sql = ("SELECT v.loc AS vloc, f.loc AS floc "
               "FROM sndfile f JOIN sndvol v ON f.vol=v.id "
               "WHERE f.id=%s")
    locs = jdb.dbread(cur, loc_sql, [filenum])
    if not locs:
        return None, None
    if len(locs) > 1:
        raise RuntimeError
    loc = locs[0]
    fname = os.path.join(loc.vloc, loc.floc)

    clip_sql = "SELECT * FROM snd s WHERE s.file=%s ORDER BY strt,leng"
    clips = jdb.dbread(cur, clip_sql, [filenum])
    return fname, clips
def main(args, opts):
    """CGI entry point: render the conjugation page for the requested entries.

    Entries are selected by the 'e' and 'q' form parameters.  Their
    conjugations are read from view "vinflxt" and the applicable notes
    from "vconotes", then rendered with the 'conj.jinja' template.  An
    error page is produced if no entries are found or none of them can
    be conjugated.
    """
    jdb.reset_encoding(sys.stdout, 'utf-8')
    errs = []
    try:
        form, svc, dbg, cur, sid, sess, parms, cfg = jmcgi.parseform()
    except Exception as e:
        jmcgi.err_page([str(e)])

    entries = jmcgi.get_entrs(cur, form.getlist('e'), form.getlist('q'), errs)
    if not entries:
        jmcgi.err_page("No entries found")
        return

    # Conjugations for all of the requested entries.
    entr_ids = tuple([e.id for e in entries])
    results = jdb.dbread(cur, "SELECT * FROM vinflxt WHERE id IN %s",
                         (entr_ids, ))

    # Part-of-speech keyword ids used by any sense of any entry, and
    # the corresponding keyword strings (for the error message below).
    poses = {p.kw for e in entries for s in e._sens for p in s._pos}
    poskws = sorted(jdb.KW.POS[p].kw for p in poses)

    if not results:
        if not poskws:
            msg = "Word does not have a part-of-speech tag."
        else:
            msg = "Unable to conjugate any of the following parts-of-speech: %s." % (
                ', '.join(poskws))
        jmcgi.err_page(msg)
        return

    notes_sql = "SELECT DISTINCT id,txt FROM vconotes WHERE pos IN %s ORDER BY id"
    notes = jdb.dbread(cur, notes_sql, (tuple(poses), ))
    cur.close()

    # Make notes links, replace '\n's with <br/>s.
    htmlify_conjs(results)
    # Divide the conjugations table up into sections, one for each
    # word (by id).
    sections = partition_conjs(results)
    # Make each note a link target.
    htmlify_notes(notes)

    if errs:
        jmcgi.err_page(errs)

    jmcgi.jinja_page('conj.jinja', sections=sections, notes=notes,
                     svc=svc, dbg=dbg, sid=sid, session=sess, cfg=cfg,
                     parms=parms, this_page='conj.py')
def find_similar(dbh, kanj, rdng, src, excl_seq=None):
    """Find entries sharing a kanji or reading text with the given items.

    Returns a list of "esum" view rows for entries (with stat<4) that
    have a kanji text equal to the .txt of any item in 'kanj', or a
    reading text equal to the .txt of any item in 'rdng'.

    Parameters:
      dbh -- Open database cursor passed to jdb.dbread().
      kanj -- List of objects with a .txt attribute (may be empty).
      rdng -- List of objects with a .txt attribute (may be empty).
      src -- The entr.src id number; the search is always limited to
        entries having this value.
      excl_seq -- If not None, entries with this seq number are
        excluded from the results.

    If both 'kanj' and 'rdng' are empty there is nothing to match and
    an empty list is returned without querying the database (the
    generated SQL would otherwise contain an invalid empty "IN ()").
    """
    if not rdng and not kanj:
        return []

    rwhr = " OR ".join(["txt=%s"] * len(rdng))
    kwhr = " OR ".join(["txt=%s"] * len(kanj))

    # Argument order must match placeholder order in the sql below:
    # src, [excl_seq], reading texts, kanji texts.
    args = [src]
    if excl_seq is not None:
        args.append(excl_seq)
    args.extend([x.txt for x in rdng])
    args.extend([x.txt for x in kanj])

    sql = "SELECT DISTINCT e.* " \
        + "FROM esum e " \
        + "WHERE e.src=%s AND e.stat<4 " \
        + ("" if excl_seq is None else "AND seq!=%s ") \
        + "AND e.id IN (" \
        + (("SELECT entr FROM rdng WHERE %s " % rwhr) if rwhr else "") \
        + ("UNION " if rwhr and kwhr else "") \
        + (("SELECT entr FROM kanj WHERE %s " % kwhr) if kwhr else "") \
        + ")"
    rs = jdb.dbread(dbh, sql, args)
    return rs
def main(args, opts):
    """CGI entry point: render the list of users.

    The "users" table is read (through a session-database connection)
    only when the requester is logged in with privilege 'A'; otherwise
    an empty list is given to the 'users.jinja' template.  The value of
    the 'result' form parameter is passed through to the template.
    """
    jdb.reset_encoding(sys.stdout, 'utf-8')
    try:
        form, svc, dbg, cur, sid, sess, parms, cfg = jmcgi.parseform()
    except Exception as e:
        jmcgi.err_page([str(e)])
    fv = form.getfirst

    if not sess or sess.priv != 'A':
        users = []
    else:
        sql = "SELECT * FROM users ORDER BY userid"
        sesscur = jdb.dbOpenSvc(cfg, svc, session=True, nokw=True)
        users = jdb.dbread(sesscur, sql)
        L('cgi.users').debug('read %d rows from table "user"' % (len(users), ))

    jmcgi.jinja_page("users.jinja", users=users, session=sess, cfg=cfg,
                     parms=parms, svc=svc, dbg=dbg, sid=sid,
                     this_page='user.py', result=fv('result'))
def find_xref(cur, typ, rtxt, ktxt, slist, seq, corp, corpcache={}, clearcache=False):
    """Resolve a cross-reference, reporting failure as an unresolved record.

    'corp' is either a corpus keyword string, which is looked up in
    table "kwsrc" to get its id number, or a value that is used as the
    corpus id as-is.  Looked-up ids are remembered in 'corpcache'; since
    that default dict is shared between calls it acts as a cache for
    the life of the process unless 'clearcache' is true, in which case
    it is emptied first.

    Returns a 2-tuple (xrfs, xunrs).  On success 'xrfs' is the result of
    jdb.resolv_xref() and 'xunrs' is None.  If jdb.resolv_xref() raises
    ValueError, 'xrfs' is an empty list and 'xunrs' is a jdb.Xrslv
    object whose .msg attribute is the exception's first argument.

    Raises ValueError if 'corp' is a string that does not match exactly
    one "kwsrc" row.
    """
    if clearcache:
        corpcache.clear()

    if not isinstance(corp, str):
        corpid = corp
    elif corpcache.get(corp, None):
        corpid = corpcache[corp]
    else:
        rows = jdb.dbread(cur, "SELECT id FROM kwsrc WHERE kw=%s", [corp])
        if len(rows) != 1:
            raise ValueError("Invalid corpus name: '%s'" % corp)
        corpid = rows[0][0]
        corpcache[corp] = corpid

    resolved, unresolved = [], None
    try:
        resolved = jdb.resolv_xref(cur, typ, rtxt, ktxt, slist, seq, corpid)
    except ValueError as e:
        reason = e.args[0]
        unresolved = jdb.Xrslv(typ=typ, ktxt=ktxt, rtxt=rtxt, tsens=None)
        unresolved.msg = reason
    return resolved, unresolved
def main(args, opts):
    """CGI entry point: render the list of entry groups.

    Reads one row per (group, corpus) combination from "kwgrp" joined
    with "grp", "entr" and "kwsrc", each with a row count, and renders
    them with the 'groups.jinja' template.
    """
    jdb.reset_encoding(sys.stdout, 'utf-8')
    try:
        form, svc, dbg, cur, sid, sess, parms, cfg = jmcgi.parseform()
    except Exception as e:
        jmcgi.err_page([str(e)])

    # 'orderby' is a fixed string, not user input, so interpolating it
    # directly into the sql is safe.
    orderby = "k.id,s.kw,e.src"
    sql = "SELECT k.id, k.kw, k.descr, s.kw AS corpus, count(*) AS cnt " \
          "FROM kwgrp k " \
          "LEFT JOIN grp g ON g.kw=k.id " \
          "LEFT JOIN entr e ON e.id=g.entr " \
          "LEFT JOIN kwsrc s ON s.id=e.src " \
          "GROUP BY k.id, k.kw, k.descr, e.src, s.kw " \
          "ORDER BY %s" % orderby
    rs = jdb.dbread(cur, sql)

    # 'this_page' was previously misspelled 'goups.py'.
    jmcgi.jinja_page("groups.jinja", results=rs, parms=parms, svc=svc,
                     dbg=dbg, sid=sid, session=sess, cfg=cfg,
                     this_page='groups.py')
def get_subtree(dbh, id):
    """Read the edit sub-tree at and below entry 'id' and link its rows.

    The rows are those returned by the database function
    get_subtree(id): the "entr" row with id 'id', all rows whose 'dfrm'
    attribute points to it, all rows pointing to those, and so on.  If
    'id' denotes an edit root row, that is the entire edit tree.

    Each row is given a '._dfrm' attribute: a list of the rows whose
    'dfrm' value equals that row's id, mirroring the tree structure in
    the database.

    Returns a 2-tuple of:
      1. The row having no 'dfrm' value (the root of the subtree), or
         None if there is no such row.
      2. A dict mapping id number to row, for quick lookup by id.

    Raises ValueError if 'id' is None or if more than one row without
    a 'dfrm' value is returned.
    """
    if id is None:
        raise ValueError(id)

    rows = jdb.dbread(dbh, "SELECT * FROM get_subtree(%s)", [id])

    # Index the rows by id and give each an (initially empty) list of
    # children.
    by_id = {}
    for row in rows:
        by_id[row.id] = row
        row._dfrm = []

    # Attach each row to its parent; the row without a parent is the
    # root.
    root = None
    for row in rows:
        if not row.dfrm:
            if root:
                raise ValueError("get_subtree: Multiple roots returned by get_subtree")
            root = row
            continue
        by_id[row.dfrm]._dfrm.append(row)
    return root, by_id
def get_user(uid, svc, cfg):
    """Return the "users" table row for userid 'uid', or None if absent.

    A session-database connection is opened with jdb.dbOpenSvc() using
    'cfg' and 'svc' to perform the lookup.
    """
    sesscur = jdb.dbOpenSvc(cfg, svc, session=True, nokw=True)
    matches = jdb.dbread(sesscur, "SELECT * FROM users WHERE userid=%s",
                         (uid, ))
    # 'userid' is the primary key of the users table so more than one
    # row should never come back.
    assert len(matches) <= 1, "Too many rows received"
    if not matches:
        return None
    return matches[0]
def main(args, opts):
    """Write the sound tables to stdout as a JMaudio XML document.

    Prints the "dtd-audio.xml" DTD (located via jdb.find_in_syspath()),
    then a <JMaudio> element containing each sound volume followed by
    its sound files, each of which is followed by its sound clips.
    """
    jdb.reset_encoding(sys.stdout, opts.encoding)

    dtd_name = "dtd-audio.xml"
    dtd_dir = jdb.find_in_syspath(dtd_name)
    print(jdb.get_dtd(dtd_dir + "/" + dtd_name, "JMaudio", opts.encoding))
    print("<JMaudio>")

    cur = jdb.dbOpen(opts.database, **jdb.dbopts(opts))
    for vol in jdb.dbread(cur, "SELECT * FROM sndvol"):
        print("\n".join(fmtxml.sndvols([vol])))
        files = jdb.dbread(cur, "SELECT * FROM sndfile s WHERE s.vol=%s",
                           [vol.id])
        for sndfile in files:
            print("\n".join(fmtxml.sndsels([sndfile])))
            clips = jdb.dbread(cur, "SELECT * FROM snd c WHERE c.file=%s",
                               [sndfile.id])
            for clip in clips:
                print("\n".join(fmtxml.sndclips([clip])))

    print('</JMaudio>')
def dblogin(cur, userid, password):
    """Authenticate 'userid'/'password' and return a session for the user.

    Returns a 2-tuple (sid, session record).  If the credentials are
    not accepted (unknown user, wrong password or disabled account)
    the result is ('', None), returned after a one second delay.

    If 'userid' already has a session younger than SESSION_TIMEOUT, the
    most recent one is reused (its timestamp is updated to now);
    otherwise a new session row is inserted and committed.  Reusing
    sessions helps prevent the proliferation of session rows.
    """
    log = L('cgi.jmcgi')

    # Check userid, password validity.
    auth_sql = "SELECT userid FROM users "\
               "WHERE userid=%s AND pw=crypt(%s, pw) AND NOT disabled"
    if not jdb.dbread(cur, auth_sql, (userid, password)):
        log.debug("login: pw fail for %s" % userid)
        time.sleep(1)
        return '', None

    # Look for an existing session (the most recent if more than one).
    # SESSION_TIMEOUT is interpolated into the sql text here; the
    # doubled "%%s" survives as the "%s" placeholder for 'userid'.
    find_sql = ("SELECT s.id,s.userid,s.ts,u.fullname,u.email,u.priv"
                " FROM sessions s JOIN users u ON u.userid=s.userid"
                " WHERE u.userid=%%s AND NOT u.disabled"
                " AND (NOW()-ts)<'%s'::INTERVAL"
                " ORDER BY ts DESC LIMIT 1") % SESSION_TIMEOUT
    found = jdb.dbread(cur, find_sql, (userid, ))
    log.debug("login: %s: %s sessions found" % (userid, len(found)))

    if len(found) != 1:
        # No existing session found, create a new session.
        cur.execute("INSERT INTO sessions(userid) VALUES(%s) RETURNING(id)",
                    (userid, ))
        sid = cur.fetchone()[0]
        cur.connection.commit()
        log.debug("login: %s: new session %s" % (userid, sid))
        return sid, db_validate_sid(cur, sid)

    existing = found[0]
    sid = existing[0]
    # Update the session timestamp to 'now'.
    db_update_sid_ts(cur, sid)
    log.debug("login: %s: using session: %s" % (userid, sid))
    return sid, existing
def get_edroot(dbh, id):
    """Return the id of the root of the edit tree containing entry 'id'.

    The edit tree of an entry is the set of entries from which that
    entry can be reached by following 'dfrm' links.  The lookup is done
    by the database function get_edroot().

    Returns None if the database function returns no rows.
    Raises ValueError if 'id' is None.
    """
    if id is None:
        raise ValueError(id)
    rows = jdb.dbread(dbh, "SELECT * FROM get_edroot(%s)", [id])
    return rows[0][0] if rows else None
def db_validate_sid(cur, sid):
    """Return the session record for session id 'sid', or None.

    A session is considered valid if it exists, its user is not
    disabled and its timestamp is less than SESSION_TIMEOUT old.
    """
    # SESSION_TIMEOUT is interpolated into the sql text; the doubled
    # "%%s" survives as the "%s" placeholder for 'sid'.
    sql = ("SELECT s.id,s.userid,s.ts,u.fullname,u.email,u.priv"
           " FROM sessions s JOIN users u ON u.userid=s.userid"
           " WHERE id=%%s AND NOT u.disabled"
           " AND (NOW()-ts)<'%s'::INTERVAL") % SESSION_TIMEOUT
    matches = jdb.dbread(cur, sql, (sid, ))
    L('cgi.jmcgi').debug("login: validating sid %s, result=%s" % (sid, len(matches)))
    return matches[0] if len(matches) != 0 else None
def get_xresolv_block(dbh, blksz, xref_src, read_xref=False):
    # Generator that reads and yields successive blocks of up to 'blksz'
    # rows from table "xresolv" (or, despite our name, table "xref" if
    # 'read_xref' is true).  Rows are ordered by (target) entr id, sens,
    # xref type and xref ord (or xref.xref for table "xref").
    #
    # 'xref_src', if true, is a 2-sequence (srcs, neg): the rows are
    # limited to entries whose .src is IN 'srcs', or NOT IN 'srcs' when
    # 'neg' is true.  If 'xref_src' is false no .src restriction is
    # applied.
    #
    # Because this is a generator, the "return None" below simply ends
    # the iteration when no more rows are available; callers never see
    # a None value.
    table = "xref" if read_xref else "xresolv"
    lastpos = 0, 0, 0, 0
    while True:
        e0, s0, t0, o0 = lastpos
        # Following sql will read 'blksz' xref rows, starting after
        # 'lastpos' (which is given as a 4-tuple of xresolv.entr,
        # .sens, .typ and .ord).  Note that the result set must be
        # ordered on exactly this same set of values in order to
        # step through them block-wise.
        sql_args = []
        if xref_src:
            srcs, neg = xref_src
            # "%%s" survives the formatting here (and the formatting of
            # 'sql' below) as a "%s" placeholder for the tuple of srcs.
            src_condition = "e.src %sIN %%s AND " % ('NOT ' if neg else '')
            sql_args.append(tuple(srcs))
        else:
            src_condition = ''
        # Only the table name, the src condition and the block size are
        # interpolated into the sql text; the position values are
        # passed as query arguments via the "%%s" placeholders.
        sql = "SELECT v.*,e.src,e.seq,e.stat,e.unap FROM %s v JOIN entr e ON v.entr=e.id " \
              "WHERE %s" \
              " (v.entr>%%s OR (v.entr=%%s " \
              "AND (v.sens>%%s OR (v.sens=%%s " \
              "AND (v.typ>%%s OR (v.typ=%%s " \
              "AND (v.ord>%%s))))))) " \
              "ORDER BY v.entr,v.sens,v.typ,v.ord " \
              "LIMIT %s" % (table, src_condition, blksz)
        if read_xref:
            # If reading the xref rather than the xresolv table make some
            # adjustments:
            sql = sql.replace('.ord', '.xref')  # The "ord" field is named "xref".
            t0, o0 = o0, t0  # The typ and xref (aka ord) fields are swapped in xref rows.
        # The argument order matches the placeholder order in 'sql':
        # entr, entr, sens, sens, typ, typ, ord.
        sql_args.extend([e0, e0, s0, s0, t0, t0, o0])
        rs = jdb.dbread(dbh, sql, sql_args)
        if len(rs) == 0:
            return None
        if Opts.debug & 0x04:
            print("Read %d %s rows from %s" % (len(rs), table, lastpos),
                  file=sys.stderr)
        # Slicing doesn't seem to currently work on DbRow objects or we could
        # write "lastpos = rs[-1][0:4]" below.
        lastpos = rs[-1][0], rs[-1][1], rs[-1][2], rs[-1][3]
        yield rs
    # NOTE(review): the loop above has no 'break' so this point is not
    # reachable; also "assert True" can never fail -- "assert False" was
    # presumably intended.
    assert True, "Unexpected break from loop"
    return
def get_langs(cur):
    """Return the kwlang rows (id, kw, descr) for every language that is
    referenced by a row in table "gloss" or table "lsrc".

    The rows are sorted by keyword, except that English ('eng') is
    placed first."""

    # The first "order by" term is false only for 'eng' and so sorts
    # english to the top of the list.
    sql = ("SELECT k.id,k.kw,k.descr FROM "
           "(SELECT lang FROM gloss "
           "UNION DISTINCT "
           "SELECT lang FROM lsrc) AS l "
           "JOIN kwlang k ON k.id=l.lang "
           "ORDER BY k.kw!='eng', k.kw ")
    return jdb.dbread(cur, sql)
def main(args, opts):
    # CGI entry point for the search results page.
    #
    # The search to run is given in one of three ways, checked in this
    # order:
    #   1. Individual search-form parameters (when neither 'sql' nor
    #      'soj' is given), which are collected into a
    #      jmcgi.SearchItems object.
    #   2. 'soj': a json-serialized SearchItems object from a previous
    #      invocation of this script (used for paging).
    #   3. 'sql': an arbitrary SQL statement, only honored when
    #      jmcgi.adv_srch_allowed() permits it.
    # The resulting query is wrapped to return "esum" rows for one page
    # of results.  If exactly one entry is found, a redirect to entr.py
    # is printed instead of a results page (unless 'srchres' is given).
    #
    # NOTE(review): this function continues after each call to
    # jmcgi.err_page() as though that call does not return (e.g. 'sql'
    # or 'rs' would otherwise be unbound) -- presumably err_page() exits
    # the script; confirm.
    jdb.reset_encoding(sys.stdout, 'utf-8')
    errs = []
    so = None
    stats = {}
    try:
        form, svc, dbg, cur, sid, sess, parms, cfg = jmcgi.parseform()
    except Exception as e:
        jmcgi.err_page([str(e)])

    cfg_web = d2o(cfg['web'])
    cfg_srch = d2o(cfg['search'])
    fv = form.getfirst
    fl = form.getlist
    force_srchres = fv(
        'srchres')  # Force display of srchres page even if only one result.
    sqlp = (fv('sql') or '')
    soj = (fv('soj') or '')
    # 'p1' is the offset of the first row of the page to show; 'pt' is
    # the total number of results if already known (-1 if not).
    pgoffset = int(fv('p1') or 0)
    pgtotal = int(fv('pt') or -1)
    # Requested page size ('ps', or the configured default), clamped to
    # the configured minimum and maximum.
    entrs_per_page = min(
        max(int(fv('ps') or cfg_web.DEF_ENTRIES_PER_PAGE),
            cfg_web.MIN_ENTRIES_PER_PAGE), cfg_web.MAX_ENTRIES_PER_PAGE)
    if not sqlp and not soj:
        so = jmcgi.SearchItems()
        so.idnum = fv('idval')
        so.idtyp = fv('idtyp')
        # Up to three text search criteria: parameters t1..t3 (text),
        # s1..s3 (what to search in) and y1..y3 (search type).
        tl = []
        for i in (1, 2, 3):
            txt = (fv('t' + str(i)) or '')
            if txt:
                tl.append(
                    jmcgi.SearchItemsTexts(srchtxt=txt,
                                           srchin=fv('s' + str(i)),
                                           srchtyp=fv('y' + str(i))))
        if tl:
            so.txts = tl
        so.pos = fl('pos')
        so.misc = fl('misc')
        so.fld = fl('fld')
        so.dial = fl('dial')
        so.rinf = fl('rinf')
        so.kinf = fl('kinf')
        so.freq = fl('freq')
        so.grp = grpsparse(fv('grp'))
        so.src = fl('src')
        so.stat = fl('stat')
        so.unap = fl('appr')
        so.nfval = fv('nfval')
        so.nfcmp = fv('nfcmp')
        # Search using gA freq criterion no longer supported.  See
        # the comments in jmcgi._freqcond() but code left here for
        # reference.
        so.gaval = fv('gaval')
        so.gacmp = fv('gacmp')
        #FIXME? use selection boxes for dates?  Or a JS calendar control?
        so.ts = dateparse(fv('ts0'), 0, errs), dateparse(fv('ts1'), 1, errs)
        so.smtr = (fv('smtr') or ''), fv('smtrm')
        so.mt = fv('mt')
        # Pack up all the search criteria in a json string that will
        # be given to the srchres form, which will in turn give it back
        # to us if the user wants to display the "next page".
        soj = serialize.so2js(so)

    elif soj:
        # 'soj' is a json string that encodes the so object (containing
        # the search criteria) that were used in previous invocation
        # of this script, which displayed the previous page.
        so = serialize.js2so(soj)

    elif sqlp:
        # 'sqlp' is a SQL statement string that allows an arbitrary search.
        # Because it can also do other things such as delete the database,
        # it should only be run as a user with read-only access to the
        # database and it is the job of jmcgi.adv_srch_allowed() to check
        # that.
        if not jmcgi.adv_srch_allowed(cfg, sess):
            jmcgi.err_page(["'sql' parameter is disallowed."])
        sql = sqlp.strip()
        # Drop a trailing ';' since 'sql' is embedded as a subquery
        # in 'sql2' (and 'sql3') below.
        if sql.endswith(';'):
            sql = sql[:-1]
        sql_args = []

    if so:
        try:
            condlist = jmcgi.so2conds(so)
        except ValueError as e:
            errs.append(str(e))
        else:
            # FIXME: [IS-115] Following will prevent kanjidic entries from
            #  appearing in results.  Obviously hardwiring id=4 is a hack.
            #condlist.append (('entr e', 'e.src!=4', []))
            sql, sql_args = jdb.build_search_sql(condlist)

    if errs:
        jmcgi.err_page(errs)

    # Wrap the search sql so that "esum" rows are returned in a
    # consistent order, one page at a time.  'pgoffset' and
    # 'entrs_per_page' are ints (see above) so interpolating them into
    # the sql text is safe.
    orderby = "ORDER BY __wrap__.kanj,__wrap__.rdng,__wrap__.seq,__wrap__.id"
    page = "OFFSET %s LIMIT %s" % (pgoffset, entrs_per_page)
    sql2 = "SELECT __wrap__.* FROM esum __wrap__ " \
           "JOIN (%s) AS __user__ ON __user__.id=__wrap__.id %s %s" \
           % (sql, orderby, page)
    stats['sql'] = sql
    stats['args'] = sql_args
    stats['orderby'] = orderby
    # Refuse to run queries whose estimated cost exceeds the configured
    # maximum.  A MAX_QUERY_COST of zero or less disables the check.
    if cfg_srch.MAX_QUERY_COST > 0:
        try:
            cost = jdb.get_query_cost(cur, sql2, sql_args)
        except Exception as e:
            jmcgi.err_page(errs=[str(e)],
                           cssclass="errormsg",
                           prolog="Database error (%s)" % e.__class__.__name__)
        stats['cost'] = cost
        if cost > cfg_srch.MAX_QUERY_COST:
            jmcgi.err_page([
                "The search request you made will likely take too "
                "long to execute. Please use your browser's \"back\" "
                "button to return to the search page and add more "
                "criteria to restrict your search more narrowly. "
                "(The estimated cost was %.1f, max allowed is %d.)" %
                (cost, cfg_srch.MAX_QUERY_COST)
            ])
    t0 = time.time()
    try:
        rs = jdb.dbread(cur, sql2, sql_args)
    except Exception as e:  #FIXME, what exception value(s)?
        jmcgi.err_page(errs=[str(e)],
                       cssclass="errormsg",
                       prolog="Database error (%s)" % e.__class__.__name__)
    stats['dbtime'] = time.time() - t0
    reccnt = len(rs)
    if pgtotal < 0:
        if reccnt >= entrs_per_page:
            # If there may be more than one page of entries (because
            # 'reccnt' is not less than the page size, 'entrs_per_page'),
            # then run another query to get the actual number of entries.
            # We only do this on the first page of results ('pgtotal' is
            # less than 0) and subsequently pass the value between pages
            # for performance reasons, even though the number of entries
            # may change before the user gets to the last page.
            sql3 = "SELECT COUNT(*) AS cnt FROM (%s) AS i " % sql
            cntrec = jdb.dbread(cur, sql3, sql_args)
            pgtotal = cntrec[0][0]  # Total number of entries.
        else:
            pgtotal = reccnt
    if reccnt == 1 and pgtotal == 1 and not force_srchres:
        # If there is only one entry, display it rather than a search
        # results page.  'force_srchres' allows suppressing this behavior
        # for debugging.
        svcstr = ("svc=%s&sid=%s&" % (svc, sid)) if svc else ''
        print("Location: entr.py?%se=%d\n" % (svcstr, rs[0].id))
    else:
        jmcgi.jinja_page("srchres.jinja",
                         results=rs,
                         pt=pgtotal,
                         p0=pgoffset,
                         p1=pgoffset + reccnt,
                         soj=soj,
                         sql=sqlp,
                         parms=parms,
                         svc=svc,
                         dbg=dbg,
                         sid=sid,
                         session=sess,
                         cfg=cfg,
                         stats=stats,
                         this_page='srchres.py')
def main(args, opts):
    # Write database entries as XML to the file named by args[0], or
    # to stdout if 'args' is empty.
    #
    # Unless "--nodtd" was given, the output is a complete XML file:
    # a DTD (chosen by "--compat"), an enclosing root element (named by
    # "--root") and the entries.  Otherwise only the entry elements are
    # written.  The entries to write are either those whose seq numbers
    # are listed in "--seqfile", or those of the requested corpora
    # ("--corpus") in (src, seq, id) order, optionally starting at seq
    # number "--begin" and limited to "--count" entries.  Entries are
    # read and written in blocks of "--blocksize".
    global Debug
    Debug = opts.debug
    # Open the database.  jdb.dbopts() extracts the db-related
    # options from the command line options in 'opts'.
    cur = jdb.dbOpen(opts.database, **jdb.dbopts(opts))

    # If no "--root" option was supplied, choose a default based
    # on the value of the "--compat" option.
    if not opts.root:
        if opts.compat in ('jmnedict', 'jmneold'):
            opts.root = 'JMnedict'
        else:
            opts.root = 'JMdict'

    outf = None
    if not opts.nodtd:
        # Choose a dtd to use based on the "--compat" option.
        # The dtd file is expected to be located somewhere in the
        # pythonpath (sys.path) directories.
        if opts.compat == 'jmdict':
            dtd = "dtd-jmdict.xml"
        elif opts.compat == 'jmdicthist':
            dtd = "dtd-jmdict.xml"
        elif opts.compat == 'jmnedict':
            dtd = "dtd-jmnedict.xml"
        elif opts.compat == 'jmneold':
            dtd = "dtd-jmneold.xml"
        else:
            dtd = "dtd-jmdict-ex.xml"
        dir = jdb.find_in_syspath(dtd)
        dtdfn = dir + "/" + dtd  # Fully qualified dtd file name.

        # jdb.get_dtd() reads the dtd text, and replaces the root
        # element name and encoding with the values supplied
        # in the arguments.
        dtdtxt = jdb.get_dtd(dtdfn, opts.root, opts.encoding)
        if len(args) == 0:
            outf = sys.stdout
        else:
            outf = open(args[0], "w")
        jdb.reset_encoding(outf, opts.encoding)
        outf.write(dtdtxt)

    if opts.seqfile:
        # "--seqfile -" means read the seq numbers from stdin.
        if opts.seqfile == '-':
            f = sys.stdin
        else:
            f = open(opts.seqfile)
        #FIXME: we should read these incrementally.
        entrlist = [int(x)
                    for x in f.read().split()]  # seq# separated by sp or nl.
        if f != sys.stdin:
            f.close()

    # Turn the "--corpus" option value into a string that can be
    # and'ed into a SQL WHERE clause to restrict the results to
    # the specified corpora.
    corp_terms = parse_corpus_opt(opts.corpus, 'e.src')

    # If the output file was not opened in the dtd section
    # above, open it now.  We postpone opening it until the
    # last possible moment to avoid creating it and then
    # bombing because there was a typo in the input or dtd
    # filename, etc.
    # FIXME: Should do a "write" function that opens the
    #  file just before writing.
    # NOTE(review): unlike the dtd section above, jdb.reset_encoding()
    #  is not applied to a file opened here -- confirm that is intended.
    if not outf:
        if len(args) == 0:
            outf = sys.stdout
        else:
            outf = open(args[0], "w")

    # If a "--compat" value was given, restrict the output to active,
    # approved entries; otherwise add no restriction.
    whr_act = " AND NOT unap AND stat=" + str(
        jdb.KW.STAT['A'].id) if opts.compat else ""
    if opts.begin:
        # If a "--begin" sequence number was given, we need to read
        # the entr record so we can get the src id number.  Complain
        # and exit if not found.  Complain if more than one entry
        # with the requested seq number exists.  More than one may be
        # found since the same sequence number may exist in different
        # corpora, or in the same corpus if an entry was edited.
        #
        #FIXME: no way to select from multiple entries with same seq
        #  number.  Might want just the stat="A" entries for example.
        sql = "SELECT id,seq,src FROM entr e WHERE seq=%s%s%s ORDER BY src" \
              % (int(opts.begin), corp_terms, whr_act)
        if Debug:
            print(sql, file=sys.stderr)
        start = time.time()
        rs = jdb.dbread(cur, sql)
        if Debug:
            print("Time: %s (init read)" % (time.time() - start),
                  file=sys.stderr)
        if not rs:
            print ("No entry with seq '%s' found" \
                   % opts.begin, file=sys.stderr)
            sys.exit(1)
        if len(rs) > 1:
            print ("Multiple entries having seq '%s' found, results " \
                   "may not be as expected. Consider using -s to " \
                   "restrict to a single corpus." % (opts.begin), file=sys.stderr)
        # Start position for the block-wise read loop below.
        lastsrc, lastseq, lastid = rs[0].src, rs[0].seq, rs[0].id
    if not opts.begin and not opts.seqfile:
        # If no "--begin" option, remove the " AND" from the front of
        # the 'corp_terms' string.  Read the first entry (by seq number)
        # in the requested corpora.
        cc = corp_terms[4:] if corp_terms else 'True'
        # If compat (jmdict or jmnedict), restrict the xml to Active
        # entries only.
        sql = "SELECT id,seq,src FROM entr e WHERE %s%s ORDER BY src,seq LIMIT 1" % (
            cc, whr_act)
        start = time.time()
        if Debug:
            print(sql, file=sys.stderr)
        rs = jdb.dbread(cur, sql)
        if Debug:
            print("Time: %s (init read)" % (time.time() - start),
                  file=sys.stderr)
        # NOTE(review): if the query returned no rows, rs[0] will raise
        #  an IndexError here.
        lastsrc, lastseq, lastid = rs[0].src, rs[0].seq, rs[0].id

    # Add an enclosing root element only if we are also including
    # a DTD (ie, producing a full XML file).  Otherwise, the file
    # generated will just be a list of <entr> elements.
    if not opts.nodtd:
        if opts.compat:  # Add a date comment...
            today = time.strftime("%Y-%m-%d", time.localtime())
            outf.write("<!-- %s created: %s -->\n" % (opts.root, today))
        outf.write('<%s>\n' % opts.root)

    entrlist_loc = 0      # Index in 'entrlist' of the next block of seq numbers.
    count = opts.count    # Number of entries still wanted (None: no limit).
    done = 0              # Number of entries written so far.
    blksize = opts.blocksize
    corpora = set()       # Passed to (and presumably updated by) write_entrs().

    while count is None or count > 0:
        if opts.seqfile:
            # Read the entries for the next 'blksize' seq numbers from
            # the seq file.
            seqnums = tuple(entrlist[entrlist_loc:entrlist_loc + blksize])
            if not seqnums:
                break
            entrlist_loc += blksize
            #FIXME: need detection of non-existent seq#s.
            sql = "SELECT id FROM entr e WHERE seq IN %s" + corp_terms + whr_act
            sql_args = [seqnums]
            if Debug:
                print(sql, sql_args, file=sys.stderr)
            start = time.time()
            tmptbl = jdb.entrFind(cur, sql, sql_args)
        else:
            # In this loop we read blocks of 'blksize' entries.  Each
            # block read is ordered by entr src (i.e. corpus), seq, and
            # id.  The block to read is specified in WHERE clause which
            # is effectively:
            #   WHERE ((e.src=lastsrc AND e.seq=lastseq AND e.id>=lastid)
            #           OR (e.src=lastsrc AND e.seq>lastseq)
            #           OR e.src>lastsrc)
            # and (lastsrc, lastseq, lastid) are from the last entry in
            # the last block read ('lastid' being that entry's id plus
            # one; see the end of this loop).
            whr = "WHERE ((e.src=%%s AND e.seq=%%s AND e.id>=%%s) " \
                  "OR (e.src=%%s AND e.seq>%%s) " \
                  "OR e.src>%%s) %s%s" % (corp_terms, whr_act)
            sql = "SELECT e.id FROM entr e" \
                  " %s ORDER BY src,seq,id LIMIT %d" \
                  % (whr, blksize if count is None else min (blksize, count))

            # The following args will be substituted for the "%%s" in
            # the sql above, in jdb.entrFind().
            sql_args = [lastsrc, lastseq, lastid, lastsrc, lastseq, lastsrc]

            # Create a temporary table of id numbers and give that to
            # jdb.entrList().  This is an order of magnitude faster than
            # giving the above sql directly to entrList().
            if Debug:
                print(sql, sql_args, file=sys.stderr)
            start = time.time()
            tmptbl = jdb.entrFind(cur, sql, sql_args)
        mid = time.time()
        entrs, raw = jdb.entrList(cur,
                                  tmptbl,
                                  None,
                                  ord="src,seq,id",
                                  ret_tuple=True)
        end = time.time()
        if Debug:
            print("read %d entries" % len(entrs), file=sys.stderr)
        if Debug:
            print("Time: %s (entrFind), %s (entrList)" %
                  (mid - start, end - mid),
                  file=sys.stderr)
        if not entrs:
            break
        write_entrs(cur, entrs, raw, corpora, opts, outf)

        # Update the 'last*' variables for the next time through
        # the loop.  Also, decrement 'count', if we are counting.
        lastsrc = entrs[-1].src
        lastseq = entrs[-1].seq
        lastid = entrs[-1].id + 1
        # NOTE(review): 'count' is reduced by the block size rather than
        #  by the number of entries actually read -- confirm intended.
        if count is not None:
            count -= blksize
        done += len(entrs)
        # Show progress: a dot per block, or a running total when
        # debugging.
        if not Debug:
            sys.stderr.write('.')
        else:
            print("%d entries written" % done, file=sys.stderr)
    if not opts.nodtd:
        # Close the root element opened above.
        outf.writelines('</%s>\n' % opts.root)
    if not Debug:
        sys.stderr.write('\n')
    print("Wrote %d entries" % done, file=sys.stderr)