Example #1
0
def markall(e, trigs, labelab=lambda x: x):
    """Add trigger-derived label fields to entries lacking a label class.

    Entries whose fields miss at least one class named in ``trigs`` are
    indexed by the words of their ``title`` field.  Every trigger disjunct
    returned by ``bib.indextrigs(trigs)`` is matched against that index, and
    each matching entry gets, per ``(cls, lab)`` pair attached to the
    disjunct, the field ``cls`` set to the label plus a note naming the
    triggering words.

    Args:
        e: mapping of entry key -> ``(typ, fields)``, where ``fields`` is a
            mapping of field name -> value.  Updated in place.
        trigs: mapping whose keys are ``(cls, lab)`` pairs; it is handed to
            ``bib.indextrigs`` unchanged.
        labelab: callable applied to each ``lab`` before it is written; its
            result is concatenated with a string.

    Returns:
        ``e`` itself, with updated entries replaced by ``(typ, new_fields)``
        tuples holding a shallow copy of the original fields.

    Side effects:
        Prints five summary counts to standard output.
    """
    clss = set(cls for (cls, _) in trigs)
    # Candidate entries: those missing at least one of the label classes.
    ei = dict((k, (typ, fields)) for (k, (typ, fields)) in e.items()
              if any(c not in fields for c in clss))

    # Title-word index over the candidates, built through bib.setd.
    wk = {}
    for (k, (_typ, fields)) in ei.items():
        for w in bib.wrds(fields.get('title', '')):
            bib.setd(wk, w, k)

    u = {}
    it = bib.indextrigs(trigs)
    all_keys = set(ei)  # loop-invariant; difference() below returns new sets
    for (dj, clslabs) in it.items():
        # Positive terms: keys of entries whose title contains the word.
        mkst = [iter(wk.get(w, {})) for (stat, w) in dj if stat]
        # Negated terms: keys of all candidates NOT containing the word.
        mksf = [all_keys.difference(wk.get(w, [])) for (stat, w) in dj if not stat]
        mks = intersectall(mkst + mksf)
        for k in mks:
            for cl in clslabs:
                bib.setd3(u, k, cl, dj)

    for (k, cd) in u.items():
        (t, f) = e[k]
        f2 = dict(f)  # shallow copy so the original fields mapping is untouched
        for ((cls, lab), ms) in cd.items():
            a = ';'.join(' and '.join(('' if stat else 'not ') + w for (stat, w) in m) for m in ms)
            # An existing value for a matched class is overwritten.
            f2[cls] = labelab(lab) + ' (computerized assignment from "' + a + '")'
        e[k] = (t, f2)

    # Single formatted strings print identically on Python 2 and Python 3.
    print("trigs %d" % len(trigs))
    print("trigger-disjuncts %d" % len(it))
    print("label classes %d" % len(clss))
    print("unlabeled refs %d" % len(ei))
    print("updates %d" % len(u))
    return e
Example #2
0
def generate_hashes(conn):
    """Compute a hash for each entry and store it in the ``entry`` table.

    First counts the words of all ``title`` values (via ``wrds``), then, for
    each window of entries yielded by ``windowed_entries(conn, 500)``, groups
    the non-``ENTRYTYPE`` field rows by bibkey and writes
    ``keyid(fields, words)`` into ``entry.hash``.

    Args:
        conn: database connection offering ``execute`` and ``executemany``
            with ``?`` placeholders (presumably a ``sqlite3`` connection;
            confirm against the caller).

    Side effects:
        Updates ``entry.hash`` and prints a word/token summary line.
    """
    from _libmonster import wrds, keyid

    words = collections.Counter()
    cursor = conn.execute('SELECT value FROM value WHERE field = ?',
                          ('title', ))
    # Fetch in batches rather than loading every title at once.
    while True:
        rows = cursor.fetchmany(10000)
        if not rows:
            break
        for title, in rows:
            words.update(wrds(title))
    # TODO: consider dropping stop words/hapaxes from freq. distribution
    # values() instead of the Python-2-only itervalues().
    print('%d title words (from %d tokens)' %
          (len(words), sum(words.values())))

    get_bibkey = operator.itemgetter(0)
    for filename, first, last in windowed_entries(conn, 500):
        # ORDER BY bibkey keeps each entry's rows adjacent, which
        # itertools.groupby needs in order to form one group per bibkey.
        rows = conn.execute(
            'SELECT bibkey, field, value FROM value '
            'WHERE filename = ? AND bibkey BETWEEN ? AND ? '
            'AND field != ? ORDER BY bibkey',
            (filename, first, last, 'ENTRYTYPE'))
        conn.executemany(
            'UPDATE entry SET hash = ? WHERE filename = ? AND bibkey = ?',
            ((keyid({k: v
                     for _, k, v in grp}, words), filename, bibkey)
             for bibkey, grp in itertools.groupby(rows, get_bibkey)))
Example #3
0
def generate_hashes(conn):
    """Fill ``entry.hash`` for all entries using title-word frequencies.

    Builds a counter over the words of every ``title`` value, prints its
    size, and then walks the windows from ``windowed_entries(conn, 500)``:
    each entry's non-``ENTRYTYPE`` fields are gathered into a dict and passed
    with the counter to ``keyid``, whose result is stored as the hash.

    Args:
        conn: database connection providing ``execute`` and ``executemany``
            with ``?``-style parameters.
    """
    from _libmonster import wrds, keyid

    title_counts = collections.Counter()
    titles = conn.execute('SELECT value FROM value WHERE field = ?', ('title',))
    # Primed loop: keep pulling batches until one comes back empty.
    batch = titles.fetchmany(10000)
    while batch:
        for (title,) in batch:
            title_counts.update(wrds(title))
        batch = titles.fetchmany(10000)
    # TODO: consider dropping stop words/hapaxes from freq. distribution
    n_tokens = sum(title_counts.values())
    print('%d title words (from %d tokens)' % (len(title_counts), n_tokens))

    select_fields = ('SELECT bibkey, field, value FROM value '
                     'WHERE filename = ? AND bibkey BETWEEN ? AND ? '
                     'AND field != ? ORDER BY bibkey')
    update_hash = 'UPDATE entry SET hash = ? WHERE filename = ? AND bibkey = ?'
    by_bibkey = operator.itemgetter(0)

    def hash_params(filename, field_rows):
        # Lazily yield one (hash, filename, bibkey) tuple per entry; the rows
        # arrive ordered by bibkey, so groupby forms one group per entry.
        for bibkey, group in itertools.groupby(field_rows, by_bibkey):
            fields = {field: value for _, field, value in group}
            yield keyid(fields, title_counts), filename, bibkey

    for filename, first, last in windowed_entries(conn, 500):
        field_rows = conn.execute(select_fields, (filename, first, last, 'ENTRYTYPE'))
        conn.executemany(update_hash, hash_params(filename, field_rows))