def markall(e, trigs, labelab=lambda x: x):
    """Apply trigger-based computerized label assignments to entries.

    For every entry in ``e`` that is still missing at least one of the label
    classes mentioned in ``trigs``, match the trigger word disjunctions
    against the entry's title words and, where a disjunct matches, write the
    corresponding label into the entry's fields (annotated as a computerized
    assignment).

    Args:
        e: dict mapping entry key -> (entry_type, fields_dict).
        trigs: trigger mapping keyed by (class, label) pairs; indexed via
            ``bib.indextrigs`` into disjunct -> [(class, label), ...].
        labelab: optional transform applied to each label before writing.

    Returns:
        ``e`` with matched entries updated (matched entries get a fresh
        fields dict; unmatched entries are left as-is).
    """
    # NOTE: ported from Python 2 (iterkeys/iteritems/has_key/print stmt);
    # logic and all emitted strings are unchanged.
    clss = set(cls for (cls, _) in trigs)
    # Entries lacking at least one of the label classes -> candidates.
    ei = {k: (typ, fields)
          for (k, (typ, fields)) in e.items()
          if any(c not in fields for c in clss)}
    # Inverted index: title word -> {entry key: ...}.
    wk = {}
    for k, (typ, fields) in ei.items():
        for w in bib.wrds(fields.get('title', '')):
            bib.setd(wk, w, k)
    u = {}
    it = bib.indextrigs(trigs)
    for dj, clslabs in it.items():
        # Positive literals: entries whose title contains the word.
        mkst = [wk.get(w, {}).keys() for (stat, w) in dj if stat]
        # Negative literals: candidate entries whose title lacks the word.
        mksf = [set(ei).difference(wk.get(w, [])) for (stat, w) in dj if not stat]
        mks = intersectall(mkst + mksf)
        for k in mks:
            for cl in clslabs:
                bib.setd3(u, k, cl, dj)
    for k, cd in u.items():
        t, f = e[k]
        # Copy fields so the original dict object is not mutated in place.
        f2 = dict(f)
        for (cls, lab), ms in cd.items():
            a = ';'.join(
                ' and '.join(('' if stat else 'not ') + w for (stat, w) in m)
                for m in ms)
            f2[cls] = labelab(lab) + ' (computerized assignment from "' + a + '")'
        e[k] = (t, f2)
    print("trigs", len(trigs))
    print("trigger-disjuncts", len(it))
    print("label classes", len(clss))
    print("unlabeled refs", len(ei))
    print("updates", len(u))
    return e
def generate_hashes(conn):
    """Compute and store a hash for every entry in the database.

    First builds a frequency distribution of title words over all ``title``
    values, then walks the entries window by window, derives a hash for each
    entry from its field/value pairs via ``keyid`` (weighted by the word
    frequencies), and writes it back to ``entry.hash``.

    Args:
        conn: open sqlite3 connection to the monster database.

    NOTE: this definition is shadowed by a later, identical redefinition of
    ``generate_hashes`` in this file — consider removing one of the two.
    """
    from _libmonster import wrds, keyid
    words = collections.Counter()
    cursor = conn.execute('SELECT value FROM value WHERE field = ?', ('title', ))
    # Stream titles in batches to keep memory bounded.
    while True:
        rows = cursor.fetchmany(10000)
        if not rows:
            break
        for title, in rows:
            words.update(wrds(title))
    # TODO: consider dropping stop words/hapaxes from freq. distribution
    # BUG FIX: Counter has no .itervalues() on Python 3; use .values()
    # (consistent with the duplicate definition below).
    print('%d title words (from %d tokens)' % (len(words), sum(words.values())))
    get_bibkey = operator.itemgetter(0)
    for filename, first, last in windowed_entries(conn, 500):
        rows = conn.execute(
            'SELECT bibkey, field, value FROM value '
            'WHERE filename = ? AND bibkey BETWEEN ? AND ? '
            'AND field != ? ORDER BY bibkey',
            (filename, first, last, 'ENTRYTYPE'))
        conn.executemany(
            'UPDATE entry SET hash = ? WHERE filename = ? AND bibkey = ?',
            ((keyid({k: v for b, k, v in grp}, words), filename, bibkey)
             for bibkey, grp in itertools.groupby(rows, get_bibkey)))
def generate_hashes(conn):
    """Populate ``entry.hash`` for all entries in the database.

    Builds a title-word frequency distribution from every ``title`` value,
    then processes entries in windows of 500: each entry's field/value pairs
    are hashed with ``keyid`` (weighted by the word frequencies) and the
    result is written back to the ``entry`` table.

    Args:
        conn: open sqlite3 connection to the monster database.
    """
    from _libmonster import wrds, keyid
    word_freq = collections.Counter()
    title_cur = conn.execute('SELECT value FROM value WHERE field = ?', ('title',))
    # fetchmany returns [] once exhausted, which terminates the sentinel iter.
    for batch in iter(lambda: title_cur.fetchmany(10000), []):
        for (title,) in batch:
            word_freq.update(wrds(title))
    # TODO: consider dropping stop words/hapaxes from freq. distribution
    print('%d title words (from %d tokens)'
          % (len(word_freq), sum(word_freq.values())))
    by_bibkey = operator.itemgetter(0)
    for fname, lo, hi in windowed_entries(conn, 500):
        value_rows = conn.execute(
            'SELECT bibkey, field, value FROM value '
            'WHERE filename = ? AND bibkey BETWEEN ? AND ? '
            'AND field != ? ORDER BY bibkey',
            (fname, lo, hi, 'ENTRYTYPE'))
        grouped = itertools.groupby(value_rows, by_bibkey)
        conn.executemany(
            'UPDATE entry SET hash = ? WHERE filename = ? AND bibkey = ?',
            ((keyid({field: val for _, field, val in grp}, word_freq), fname, key)
             for key, grp in grouped))