Example 1
def my_getNewCard( self, _old ):
    '''Continually call _getNewCard until we get one with a focusMorph we haven't
    seen before. Also skip bad vocab cards'''
    while True:
        C = partial( cfg, None, self.col.decks.active()[0] )
        if not C('next new card feature'):
            return _old( self )
        if not C('new card merged fill'):
            c = _old( self )
        else:   # pop from opposite direction and skip sibling spacing
            if not self._fillNew(): return
            ( id, due ) = self._newQueue.pop( 0 )
            c = self.col.getCard( id )

        if not c: return			# no more cards
        n = c.note()

        try: fm = focus( n )		# fm is either the focusMorph or empty
        except KeyError: return c	# card has no focusMorph field -> assume it's good

        # determine if this is a good vocab card, i.e. k+1;
        # default to whether it has a focus morph if the k+N field is missing or disabled
        try: goodVocab = n[ cfg( n.mid, None, 'k+N' ) ] == '1'
        except KeyError: goodVocab = fm

        if not goodVocab or fm in seenMorphs or n.hasTag( CN( n, 'tag_alreadyKnown' ) ):
            self.buryNote( c.note().id )
            continue
        break
    return c
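For context, a patched method like this is normally installed with Anki's `anki.hooks.wrap`, whose 'around' position passes the original method as the `_old` keyword argument. A minimal sketch of the registration (the add-on's actual wiring may differ):

from anki.hooks import wrap
from anki.sched import Scheduler

# 'around' calls my_getNewCard with _old bound to the original _getNewCard
Scheduler._getNewCard = wrap(Scheduler._getNewCard, my_getNewCard, 'around')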
Example 2
def my_getNewCard(self, _old):
    '''Continually call _getNewCard until we get one with a focusMorph we haven't
    seen before. Also skip bad vocab cards'''
    while True:
        C = partial(cfg, None, self.col.decks.active()[0])
        if not C('next new card feature'):
            return _old(self)
        if not C('new card merged fill'):
            c = _old(self)
        else:  # pop from opposite direction and skip sibling spacing
            if not self._fillNew(): return
            (id, due) = self._newQueue.pop(0)
            c = self.col.getCard(id)

        if not c: return  # no more cards
        n = c.note()

        try:
            fm = focus(n)  # fm is either the focusMorph or empty
        except KeyError:
            return c  # card has no focusMorph field -> assume it's good

        # determine if this is a good vocab card, i.e. k+1;
        # default to whether it has a focus morph if the k+N field is missing or disabled
        try:
            goodVocab = n[cfg(n.mid, None, 'k+N')] == '1'
        except KeyError:
            goodVocab = fm

        if not goodVocab or fm in seenMorphs or n.hasTag(
                CN(n, 'tag_alreadyKnown')):
            self.buryNote(c.note().id)
            continue
        break
    return c
Example 3
def mkAllDb( allDb=None ):
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False )   # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )
        if not C('enabled'): continue
        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        ts, alreadyKnownTag = TAG.split( tags ), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            mats += [ C('threshold_mature')+1 ]

        for fieldName in C('morph_fields'):
            try: # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField( fieldName, flds, mid )
            except KeyError: continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes( fieldValue )
                if ms: #TODO: this needed? should we change below too then?
                    #printf( '    .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf( '    .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf( '    .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes( fieldValue )
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms
    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb
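The maturity list above counts a learning card (ivl == 0, type == 1) as 0.5 and uses the review interval otherwise; the alreadyKnown tag then appends threshold_mature + 1 so every morph on the note registers as mature. A toy check of that comprehension, with hypothetical rows:

rows = [(0, 1), (3, 2), (21, 2)]  # (ivl, type) rows from the cards table
mats = [0.5 if ivl == 0 and ctype == 1 else ivl for ivl, ctype in rows]
assert mats == [0.5, 3, 21]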
Example 4
    def extra_setup(self):
        self.subfunc = partial(self.subfunc, parent=None)
        self.subclass = partial(self.subclass, parent=None)
        self.loadscopes = {
            'local': ('LOAD_NAME', self.names),
            'global': ('LOAD_GLOBAL', self.names)
        }
        self.storescopes = {
            'local': ('STORE_NAME', self.names),
            'global': ('STORE_GLOBAL', self.names)
        }
        self.delscopes = {
            'local': ('DELETE_NAME', self.names),
            'global': ('DELETE_GLOBAL', self.names)
        }
        self.localmacros = {}

        # XXX: ?
        # XXX: ?
        self.compiling_namespace = default_macro_namespace.copy()
Example 5
def my_fillNew( self, _old ):
    '''If 'new card merged fill' is enabled for the current deck, when we refill we
    pull from all child decks, sort combined pool of cards, then limit.
    If disabled, do the standard sequential fill method'''
    C = partial( cfg, None, self.col.decks.active()[0] )
    if not C('new card merged fill'): return _old( self )

    if self._newQueue:      return True
    if not self.newCount:   return False

    self._newQueue = self.col.db.all('''select id, due from cards where did in %s and queue = 0 and due >= ? order by due limit ?''' % self._deckLimit(), C('new card merged fill min due'), self.queueLimit )
    if self._newQueue:      return True
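The return contract mirrors Anki's own _fillNew: True when the queue holds cards, False when the new count is exhausted, and an implicit None when the merged query comes back empty. Note that the query mixes %-interpolation for the deck-id list (already SQL text from _deckLimit()) with ? placeholders for values; a hypothetical rendering, assuming two active decks:

# hypothetical: _deckLimit() -> '(1, 2)', min due 0, queueLimit 50
sql = ('select id, due from cards where did in %s and queue = 0 '
       'and due >= ? order by due limit ?') % '(1, 2)'
# executed as self.col.db.all(sql, 0, 50)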
Example 6
def my_getNewCard(self, _old):
    '''Continually call _getNewCard until we get one with a focusMorph we haven't
    seen before. Also skip bad vocab cards.

    :type self: anki.sched.Scheduler
    :type _old: Callable
    '''

    while True:
        C = partial(cfg, None, self.col.decks.active()[0])
        if not C('next new card feature'):
            return _old(self)
        if not C('new card merged fill'):
            c = _old(self)
            ''' :type c: anki.cards.Card '''
        else:  # pop from opposite direction and skip sibling spacing
            if not self._fillNew(): return
            (id, due) = self._newQueue.pop(0)
            c = self.col.getCard(id)
            self.newCount -= 1

        if not c: return  # no more cards
        n = c.note()

        try:
            fm = focus(n)  # fm is either the focusMorph or empty
        except KeyError:
            return c  # card has no focusMorph field -> assume it's good

        # determine if this is a good vocab card, i.e. k+1;
        # default to whether it has a focus morph if the k+N field is missing or disabled
        try:
            goodVocab = n[jcfg('Field_UnknownMorphCount')] == '1'
        except KeyError:
            goodVocab = fm

        # even if it is not a good vocabulary card, we have no choice when there are no other cards available
        if (not goodVocab and not n.hasTag(jcfg('Tag_NotReady'))) or n.hasTag(
                jcfg('Tag_AlreadyKnown')) or fm in seenMorphs:
            self.buryCards([c.id])
            self.newCount += 1  # the card was queried from the new queue, so restore the new count to its original value
            continue
        break

    return c
Example 7
valid_en, train_en = np.load(pform(P.data, "valid_en.npy")), np.load(pform(P.data, "train_en.npy"))
# valid_nl, train_nl = np.load(pform(P.data, "valid_nl.npy")), np.load(pform(P.data, "train_nl.npy"))
valid_de, train_de = np.load(pform(P.data, "valid_de.npy")), np.load(pform(P.data, "train_de.npy"))
# valid_da, train_da = np.load(pform(P.data, "valid_da.npy")), np.load(pform(P.data, "train_da.npy"))
valid_sv, train_sv = np.load(pform(P.data, "valid_sv.npy")), np.load(pform(P.data, "train_sv.npy"))

data_index =        0,        2,        4
data_valid = valid_en, valid_de, valid_sv
data_train = train_en, train_de, train_sv

def batch(arrs, size= C.batch_train, seed= C.seed):
    size //= len(arrs) * (len(arrs) - 1)
    for i in batch_sample(len(arrs[0]), size, seed):
        yield tuple(arr[i] for arr in arrs)

perm = comp(tuple, partial(permutations, r= 2))
data_index = perm(data_index)
data_valid = perm(data_valid)
data_train = perm(pipe(partial(batch, data_train), (tf.int32,)*len(data_train), prefetch= 16))

###############
# build model #
###############

model = Model.new(**select(C, *Model._new))
valid = tuple(model.data(i, j).valid() for i, j in data_index)
train = tuple(model.data(i, j, s, t).train(**T) for (i, j), (s, t) in zip(data_index, data_train))

model.lr   = train[0].lr
model.step = train[0].step
model.errt = train[0].errt
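batch_sample comes from the author's utilities and is not shown here; a minimal sketch of a compatible implementation (an assumption, not the original) is an endless, seeded index sampler:

import numpy as np

def batch_sample(n, size, seed):
    # yield arrays of `size` indices into range(n), reshuffling each epoch
    # with a fixed seed so runs are reproducible
    rng = np.random.RandomState(seed)
    while True:
        perm = rng.permutation(n)
        for i in range(0, n - size + 1, size):
            yield perm[i:i + size]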
Example 8
def mkAllDb(allDb=None):
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Prep work for all.db creation',
                      max=N_notes,
                      immediate=True)

    if not allDb: allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0: mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'): continue

        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            mats += [C('threshold_mature') + 1]

        for fieldName in C('morph_fields'):
            try:  # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(fieldValue)
                if ms:  #TODO: this needed? should we change below too then?
                    #printf( '    .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf('    .mats for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf('    .morphs for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(fieldValue)
                    locDb.pop(loc)
                    locDb[newLoc] = ms
    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' %
               (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
Example 9
def updateNotes(allDb):
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db

    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Calculating frequency information')
    pops = [len(locs) for locs in allDb.db.values()]
    pops = [n for n in pops if n > 1]

    mw.progress.update(label='Updating notes')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0: mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'): continue
        # Get all morphemes for note
        ms = set()
        for fieldName in C('morph_fields'):
            try:
                loc = fidDb[(nid, guid, fieldName)]
                ms.update(locDb[loc])
            except KeyError:
                continue
        ms = [m for m in ms if m.pos not in C('morph_blacklist')]

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for m in ms:
            if m not in seenDb.db: unseens.add(m)
            if m not in knownDb.db: unknowns.add(m)
            if m not in matureDb.db: unmatures.add(m)
            if m not in matureDb.db and m in knownDb.db:
                newKnowns.add(m)

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len(ms), len(unseens), len(unknowns), len(unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'): continue

        # average frequency of unknowns (i.e. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += len(allDb.db[focusMorph])
        F_k_avg = F_k / N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recently learned knowns (reinforce)
        for m in newKnowns:
            locs = allDb.db[m]
            if locs:
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C(
                    'reinforce new vocab weight'
                ) / ivl  # TODO: maybe average this so it doesn't favor long sentences

        if any(m.pos == u'動詞'
               for m in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        usefulness = 999 - min(999, usefulness)

        # difference from optimal length (too little context vs long sentence)
        lenDiff = max(0, min(9, abs(C('optimal sentence length') - N) - 2))
        tooLong = N > C('optimal sentence length')

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)
        # determine card type
        compTag, vocabTag, notReadyTag, alreadyKnownTag, priorityTag, badLengthTag, tooLongTag = tagNames = (
            C('tag_comprehension'), C('tag_vocab'), C('tag_notReady'),
            C('tag_alreadyKnown'), C('tag_priority'), C('tag_badLength'),
            C('tag_tooLong'))
        if N_m == 0:  # sentence comprehension card, m+0
            ts = [compTag
                  ] + [t for t in ts if t not in [vocabTag, notReadyTag]]
            setField(mid, fs, C('focusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = [vocabTag
                  ] + [t for t in ts if t not in [compTag, notReadyTag]]
            setField(mid, fs, C('focusMorph'), u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = [notReadyTag
                  ] + [t for t in ts if t not in [compTag, vocabTag]]

        # set type agnostic fields
        setField(mid, fs, C('k+N'), u'%d' % N_k)
        setField(mid, fs, C('m+N'), u'%d' % N_m)
        setField(mid, fs, C('morphManIndex'), u'%d' % mmi)
        setField(mid, fs, C('unknowns'), u', '.join(u.base for u in unknowns))
        setField(mid, fs, C('unmatures'),
                 u', '.join(u.base for u in unmatures))
        setField(mid, fs, C('unknownFreq'), u'%d' % F_k_avg)

        # other tags
        if priorityTag in ts: ts.remove(priorityTag)
        if isPriority: ts.append(priorityTag)

        if badLengthTag in ts: ts.remove(badLengthTag)
        if lenDiff: ts.append(badLengthTag)

        if tooLongTag in ts: ts.remove(tooLongTag)
        if tooLong: ts.append(tooLongTag)

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)
    TAG.register(tagNames)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []
    for (cid, nid,
         due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # otherwise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb
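Because the three MMI components occupy disjoint decimal ranges, sorting by MMI orders cards first by unknown count, then by length penalty, then by inverted frequency. A worked example with hypothetical values:

N_k, lenDiff, usefulness = 1, 2, 999 - min(999, 350)  # one unknown, slight length penalty
mmi = 10000 * N_k + 1000 * lenDiff + usefulness
assert mmi == 12649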
Example 10
def bleu(gold, pred):
    """-> float in [0, 1]; gold, pred : seq (sent : seq (word : str))"""
    from nltk.translate.bleu_score import corpus_bleu
    return corpus_bleu([[g] for g in gold], pred)


def parse_args():
    import argparse
    parser = argparse.ArgumentParser(description="corpus-level BLEU score.")
    parser.add_argument('gold', help="file for the gold-standard sentences")
    parser.add_argument('pred',
                        help="files for the predicted sentences",
                        nargs='+')
    parser.add_argument('--ignore-case',
                        action='store_true',
                        help="case insensitive")
    return parser.parse_args()


if '__main__' == __name__:
    args = parse_args()
    from util import comp, partial
    from util_io import load
    proc = str.split
    if args.ignore_case: proc = comp(proc, str.lower)
    load_corpus = comp(list, partial(map, proc), load)
    gold = load_corpus(args.gold)
    for pred in args.pred:
        print(pred, "{:.4f}".format(bleu(gold, load_corpus(pred))))
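Assumed invocation (file names hypothetical): `python bleu.py gold.txt pred_a.txt pred_b.txt --ignore-case`. The script prints one corpus-level BLEU score per prediction file, each measured against the same gold corpus.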
Example 11
def mkAllDb(allDb=None):
    import config
    reload(config)
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    N_enabled_notes = 0  # used to report an error if no notes were processed
    mw.progress.start(label='Prep work for all.db creation',
                      max=N_notes,
                      immediate=True)

    if not allDb: allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0: mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None: continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            mats += [C('threshold_mature') + 1]

        for fieldName in notecfg['Fields']:
            try:  # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = extractFieldData(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms:  #TODO: this needed? should we change below too then?
                    #printf( '    .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    #printf( '    .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    #printf( '    .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(morphemizer, fieldValue, ts)
                    locDb.pop(loc)
                    locDb[newLoc] = ms

    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(
            u'There are no cards that can be analyzed or moved. Add cards or (re-)check your configuration under "Tools -> MorphMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.'
        )
        return None

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' %
               (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
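Unlike the earlier per-model C('enabled') switch, this version matches each note against per-note filters from the add-on's configuration. A hypothetical shape for what getFilter(note) returns, limited to the keys the code above actually reads:

notecfg = {
    'Fields': ['Expression'],           # note fields to analyze
    'Morphemizer': 'MecabMorphemizer',  # parser resolved via getMorphemizerByName
    'Modify': True,                     # updateNotes skips the note when False
}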
Example 12
def updateNotes(allDb):
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # read tag names
    compTag, vocabTag, freshTag, notReadyTag, alreadyKnownTag, priorityTag, tooShortTag, tooLongTag = tagNames = (
        jcfg('Tag_Comprehension'), jcfg('Tag_Vocab'), jcfg('Tag_Fresh'),
        jcfg('Tag_NotReady'), jcfg('Tag_AlreadyKnown'), jcfg('Tag_Priority'),
        jcfg('Tag_TooShort'), jcfg('Tag_TooLong'))
    TAG.register(tagNames)
    badLengthTag = jcfg2().get('Tag_BadLength')

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db

    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Updating notes')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0: mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None or not notecfg['Modify']: continue

        # Get all morphemes for note
        morphemes = set()
        for fieldName in notecfg['Fields']:
            try:
                loc = fidDb[(nid, guid, fieldName)]
                morphemes.update(locDb[loc])
            except KeyError:
                continue

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for morpheme in morphemes:
            if morpheme not in seenDb.db: unseens.add(morpheme)
            if morpheme not in knownDb.db: unknowns.add(morpheme)
            if morpheme not in matureDb.db: unmatures.add(morpheme)
            if morpheme not in matureDb.db and morpheme in knownDb.db:
                newKnowns.add(morpheme)

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len(morphemes), len(unseens), len(unknowns), len(
            unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'): continue

        # average frequency of unknowns (i.e. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += allDb.frequency(focusMorph)
        F_k_avg = F_k // N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recently learned knowns (reinforce)
        for morpheme in newKnowns:
            locs = allDb.db[morpheme]
            if locs:
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C(
                    'reinforce new vocab weight'
                ) // ivl  # TODO: maybe average this so it doesn't favor long sentences

        if any(morpheme.pos == u'動詞'
               for morpheme in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        usefulness = 999 - min(999, usefulness)

        # difference from optimal length range (too little context vs long sentence)
        lenDiffRaw = min(N - C('min good sentence length'),
                         max(0, N - C('max good sentence length')))
        lenDiff = min(9, abs(lenDiffRaw))

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)

        # clear any 'special' tags; the appropriate ones will be set in the next few lines
        ts = [
            t for t in ts
            if t not in [notReadyTag, compTag, vocabTag, freshTag]
        ]

        # determine card type
        if N_m == 0:  # sentence comprehension card, m+0
            ts = ts + [compTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = ts + [vocabTag]
            setField(mid, fs, jcfg('Field_FocusMorph'),
                     u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = ts + [notReadyTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_m == 1:  # we have k+0 and m+1, so this card introduces no new vocabulary -> card for a newly learned morpheme
            ts = ts + [freshTag]
            setField(mid, fs, jcfg('Field_FocusMorph'),
                     u'%s' % list(unmatures)[0].base)
        else:  # only case left: k+0 but m+2 or higher, so this card introduces no new vocabulary -> card for newly learned morphemes
            ts = ts + [freshTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')

        # set type agnostic fields
        setField(mid, fs, jcfg('Field_UnknownMorphCount'), u'%d' % N_k)
        setField(mid, fs, jcfg('Field_UnmatureMorphCount'), u'%d' % N_m)
        setField(mid, fs, jcfg('Field_MorphManIndex'), u'%d' % mmi)
        setField(mid, fs, jcfg('Field_Unknowns'),
                 u', '.join(u.base for u in unknowns))
        setField(mid, fs, jcfg('Field_Unmatures'),
                 u', '.join(u.base for u in unmatures))
        setField(mid, fs, jcfg('Field_UnknownFreq'), u'%d' % F_k_avg)

        # remove deprecated tag
        if badLengthTag is not None and badLengthTag in ts:
            ts.remove(badLengthTag)

        # other tags
        if priorityTag in ts: ts.remove(priorityTag)
        if isPriority: ts.append(priorityTag)

        if tooShortTag in ts: ts.remove(tooShortTag)
        if lenDiffRaw < 0: ts.append(tooShortTag)

        if tooLongTag in ts: ts.remove(tooLongTag)
        if lenDiffRaw > 0: ts.append(tooLongTag)

        # remove unnecessary tags
        if not jcfg('Option_SetNotRequiredTags'):
            unnecessary = [priorityTag, tooShortTag, tooLongTag]
            ts = [tag for tag in ts if tag not in unnecessary]

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []

    # "type = 0": new cards
    # "type = 1": learning cards [is supposed to be learning: in my case no learning card had this type]
    # "type = 2": review cards
    for (cid, nid,
         due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # otherwise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb
Example 13
from util import partial, Record
from util_np import np
from util_tf import tf, placeholder, trim, get_shape

scope = partial(tf.variable_scope, reuse=tf.AUTO_REUSE)

init_bias = tf.zeros_initializer()
init_kern = tf.variance_scaling_initializer(1.0, 'fan_avg', 'uniform')
init_relu = tf.variance_scaling_initializer(2.0, 'fan_avg', 'uniform')

layer_nrm = tf.contrib.layers.layer_norm
layer_aff = partial(tf.layers.dense,
                    kernel_initializer=init_kern,
                    bias_initializer=init_bias)
layer_act = partial(tf.layers.dense,
                    kernel_initializer=init_relu,
                    bias_initializer=init_bias,
                    activation=tf.nn.relu)
layer_rnn = partial(tf.contrib.cudnn_rnn.CudnnGRU,
                    kernel_initializer=init_kern,
                    bias_initializer=init_bias)


def attention(query, value, mask, dim, head=8):
    """computes scaled dot-product attention

    query : tensor f32 (b, d_q, t)
    value : tensor f32 (b, d_v, s)
     mask : tensor f32 (b,   t, s)
         -> tensor f32 (b, dim, t)
    """
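    # A sketch of a body consistent with the docstring; the original
    # implementation is cut off here, so the following is an assumption.
    # `dim` must be divisible by `head`.
    split = lambda x: tf.stack(tf.split(x, head, axis=-1), axis=1)
    q = split(layer_aff(tf.transpose(query, (0, 2, 1)), dim, name='q'))  # b,h,t,d/h
    k = split(layer_aff(tf.transpose(value, (0, 2, 1)), dim, name='k'))  # b,h,s,d/h
    v = split(layer_aff(tf.transpose(value, (0, 2, 1)), dim, name='v'))  # b,h,s,d/h
    a = tf.nn.softmax(  # b,h,t,s ; mask is additive in log space
        tf.matmul(q, k, transpose_b=True) * ((dim // head) ** -0.5)
        + tf.expand_dims(mask, axis=1))
    x = tf.concat(tf.unstack(tf.matmul(a, v), axis=1), axis=-1)  # b,t,dim
    return tf.transpose(x, (0, 2, 1))  # b,dim,t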
Example 14
def mkAllDb( allDb=None ):
    import config; reload(config)
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    N_enabled_notes = 0 # used to report an error if no notes were processed
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False )   # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None: continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        if C('ignore maturity'):
            mats = [ 0 for mat in mats ]
        ts, alreadyKnownTag = TAG.split( tags ), jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            mats += [ C('threshold_mature')+1 ]

        for fieldName in notecfg['Fields']:
            try: # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = extractFieldData( fieldName, flds, mid )
            except KeyError: continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms: #TODO: this needed? should we change below too then?
                    #printf( '    .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    #printf( '    .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    #printf( '    .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes(morphemizer, fieldValue, ts)
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms

    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(u'There are no cards that can be analyzed or moved. Add cards or (re-)check your configuration under "Tools -> MorphMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.')
        return None

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb
Example 15
def updateNotes( allDb ):
    t_0, now, db, TAG   = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi         = [], {}
    N_notes             = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Updating data', max=N_notes, immediate=True )
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False ) # fidDb() already forces locDb recalc

    # read tag names
    compTag, vocabTag, freshTag, notReadyTag, alreadyKnownTag, priorityTag, tooShortTag, tooLongTag = tagNames = jcfg('Tag_Comprehension'), jcfg('Tag_Vocab'), jcfg('Tag_Fresh'), jcfg('Tag_NotReady'), jcfg('Tag_AlreadyKnown'), jcfg('Tag_Priority'), jcfg('Tag_TooShort'), jcfg('Tag_TooLong')
    TAG.register( tagNames )
    badLengthTag = jcfg2().get('Tag_BadLength')

    # handle secondary databases
    mw.progress.update( label='Creating seen/known/mature from all.db' )
    seenDb      = filterDbByMat( allDb, cfg1('threshold_seen') )
    knownDb     = filterDbByMat( allDb, cfg1('threshold_known') )
    matureDb    = filterDbByMat( allDb, cfg1('threshold_mature') )
    mw.progress.update( label='Loading priority.db' )
    priorityDb  = MorphDb( cfg1('path_priority'), ignoreErrors=True ).db

    if cfg1('saveDbs'):
        mw.progress.update( label='Saving seen/known/mature dbs' )
        seenDb.save( cfg1('path_seen') )
        knownDb.save( cfg1('path_known') )
        matureDb.save( cfg1('path_mature') )

    mw.progress.update( label='Updating notes' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None or not notecfg['Modify']: continue

        # Get all morphemes for note
        morphemes = set()
        for fieldName in notecfg['Fields']:
            try:
                loc = fidDb[ ( nid, guid, fieldName ) ]
                morphemes.update( locDb[ loc ] )
            except KeyError: continue

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for morpheme in morphemes:
            if morpheme not in seenDb.db:      unseens.add( morpheme )
            if morpheme not in knownDb.db:     unknowns.add( morpheme )
            if morpheme not in matureDb.db:    unmatures.add( morpheme )
            if morpheme not in matureDb.db and morpheme in knownDb.db:
                newKnowns.add( morpheme )

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len( morphemes ), len( unseens ), len( unknowns ), len( unmatures )

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'): continue

        # average frequency of unknowns (i.e. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns: # focusMorph used outside loop
            F_k += allDb.frequency(focusMorph)
        F_k_avg = F_k // N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recently learned knowns (reinforce)
        for morpheme in newKnowns:
            locs = allDb.db[ morpheme ]
            if locs:
                ivl = min( 1, max( loc.maturity for loc in locs ) )
                usefulness += C('reinforce new vocab weight') // ivl # TODO: maybe average this so it doesn't favor long sentences

        if any( morpheme.pos == u'動詞' for morpheme in unknowns ): #FIXME: this isn't working???
            usefulness += C('verb bonus')

        usefulness = 999 - min( 999, usefulness )

        # difference from optimal length range (too little context vs long sentence)
        lenDiffRaw = min(N - C('min good sentence length'),
                         max(0, N - C('max good sentence length')))
        lenDiff = min(9, abs(lenDiffRaw))

        # calculate mmi
        mmi = 10000*N_k + 1000*lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[ nid ] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split( tags ), splitFields( flds )

        # clear any 'special' tags; the appropriate ones will be set in the next few lines
        ts = [ t for t in ts if t not in [ notReadyTag, compTag, vocabTag, freshTag ] ]

        # determine card type
        if N_m == 0:    # sentence comprehension card, m+0
            ts = ts + [ compTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'' )
        elif N_k == 1:  # new vocab card, k+1
            ts = ts + [ vocabTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'%s' % focusMorph.base )
        elif N_k > 1:   # M+1+ and K+2+
            ts = ts + [ notReadyTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_m == 1: # we have k+0 and m+1, so this card introduces no new vocabulary -> card for a newly learned morpheme
            ts = ts + [ freshTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'%s' % list(unmatures)[0].base)
        else: # only case left: k+0 but m+2 or higher, so this card introduces no new vocabulary -> card for newly learned morphemes
            ts = ts + [ freshTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'')


        # set type agnostic fields
        setField( mid, fs, jcfg('Field_UnknownMorphCount'), u'%d' % N_k )
        setField( mid, fs, jcfg('Field_UnmatureMorphCount'), u'%d' % N_m )
        setField( mid, fs, jcfg('Field_MorphManIndex'), u'%d' % mmi )
        setField( mid, fs, jcfg('Field_Unknowns'), u', '.join( u.base for u in unknowns ) )
        setField( mid, fs, jcfg('Field_Unmatures'), u', '.join( u.base for u in unmatures ) )
        setField( mid, fs, jcfg('Field_UnknownFreq'), u'%d' % F_k_avg )

        # remove deprecated tag
        if badLengthTag is not None and badLengthTag in ts:
            ts.remove( badLengthTag )

        # other tags
        if priorityTag in ts:   ts.remove( priorityTag )
        if isPriority:          ts.append( priorityTag )

        if tooShortTag in ts:   ts.remove( tooShortTag )
        if lenDiffRaw < 0:      ts.append( tooShortTag )

        if tooLongTag in ts:    ts.remove( tooLongTag )
        if lenDiffRaw > 0:      ts.append( tooLongTag )

        # remove unnecessary tags
        if not jcfg('Option_SetNotRequiredTags'):
            unnecessary = [priorityTag, tooShortTag, tooLongTag]
            ts = [tag for tag in ts if tag not in unnecessary]

        # update sql db
        tags_ = TAG.join( TAG.canonify( ts ) )
        flds_ = joinFields( fs )
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum( fs[0] )
            sfld = stripHTML( fs[ getSortFieldIndex( mid ) ] )
            ds.append( { 'now':now, 'tags':tags_, 'flds':flds_, 'sfld':sfld, 'csum':csum, 'usn':mw.col.usn(), 'nid':nid } )

    mw.progress.update( value=i, label='Updating anki database...' )
    mw.col.db.executemany( 'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid', ds )

    # Now reorder new cards based on MMI
    mw.progress.update( value=i, label='Updating new card ordering...' )
    ds = []

    # "type = 0": new cards
    # "type = 1": learning cards [is supposed to be learning: in my case no learning card had this type]
    # "type = 2": review cards
    for ( cid, nid, due ) in db.execute( 'select id, nid, due from cards where type = 0' ):
        if nid in nid2mmi: # otherwise it was disabled
            due_ = nid2mmi[ nid ]
            if due != due_: # only update cards that have changed
                ds.append( { 'now':now, 'due':due_, 'usn':mw.col.usn(), 'cid':cid } )
    mw.col.db.executemany( 'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds )
    mw.reset()

    printf( 'Updated notes in %f sec' % ( time.time() - t_0 ) )
    mw.progress.finish()
    return knownDb
Example 16
train_nl = train_nl[:2**17].copy()
train_da = train_da[:2**17].copy()

data_index = 1, 3
data_valid = valid_nl, valid_da
data_train = train_nl, train_da


def batch(arrs, size=C.batch_train, seed=C.seed):
    size //= len(arrs) * (len(arrs) - 1)
    for i in batch_sample(len(arrs[0]), size, seed):
        yield tuple(arr[i] for arr in arrs)


perm = comp(tuple, partial(permutations, r=2))
data_index = perm(data_index)
data_valid = perm(data_valid)
data_train = perm(
    pipe(partial(batch, data_train), (tf.int32, ) * len(data_train),
         prefetch=16))

###############
# build model #
###############

model = Model.new(**select(C, *Model._new))
valid = tuple(model.data(i, j).valid() for i, j in data_index)
train = tuple(
    model.data(i, j, s, t).train(**T)
    for (i, j), (s, t) in zip(data_index, data_train))
Example 17
from util import Record, partial
import tensorflow as tf
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

scope = partial(tf.variable_scope, reuse=tf.AUTO_REUSE)


def profile(sess, wtr, run, feed_dict=None, prerun=3, tag='flow'):
    for _ in range(prerun):
        sess.run(run, feed_dict)
    meta = tf.RunMetadata()
    sess.run(run, feed_dict,
             tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), meta)
    wtr.add_run_metadata(meta, tag)


def pipe(gen_func,
         gen_types,
         map_func=None,
         map_types=None,
         para_map=4,
         prefetch=4,
         name='pipe'):
    """returns iterator tensors of `gen_types` from generator `gen_func`.
    see `tf.data.Dataset.from_generator`.

    when specified, `map_func` is called on the generator outputs (as
    numpy arrays) and tensors of `map_types` are returned instead.
    `para_map` number of calls are processed in parallel.  `map_func`
    must be stateless.  otherwise simply transform the data in
    `gen_func`.

    """
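    # A sketch of a body consistent with the docstring; the original is
    # cut off here, so the following is an assumption.
    with tf.variable_scope(name):
        ds = tf.data.Dataset.from_generator(gen_func, gen_types)
        if map_func is not None:
            ds = ds.map(
                lambda *args: tf.py_func(map_func, args, map_types),
                num_parallel_calls=para_map)
        return ds.prefetch(prefetch).make_one_shot_iterator().get_next()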
Example 18
def vAe(
        mode,
        src=None,
        tgt=None,
        # model spec
        dim_tgt=8192,
        dim_emb=512,
        dim_rep=1024,
        rnn_layers=3,
        bidirectional=True,
        bidir_stacked=True,
        attentive=False,
        logit_use_embed=True,
        # training spec
        accelerate=1e-4,
        learn_rate=1e-3,
        bos=2,
        eos=1):

    # dim_tgt : vocab size
    # dim_emb : model dimension
    # dim_rep : representation dimension
    #
    # unk=0 for word dropout

    assert mode in ('train', 'valid', 'infer')
    self = Record(bos=bos, eos=eos)

    with scope('step'):
        step = self.step = tf.train.get_or_create_global_step()
        rate = accelerate * tf.to_float(step)
        rate_keepwd = self.rate_keepwd = tf.sigmoid(rate)
        rate_anneal = self.rate_anneal = tf.tanh(rate)
        rate_update = self.rate_update = learn_rate / (tf.sqrt(rate) + 1.0)

    with scope('src'):
        src = self.src = placeholder(tf.int32, (None, None), src, 'src')
        src = tf.transpose(src)  # time major order
        src, msk_src, len_src = trim(src, eos)

    with scope('tgt'):
        tgt = self.tgt = placeholder(tf.int32, (None, None), tgt, 'tgt')
        tgt = tf.transpose(tgt)  # time major order
        tgt, msk_tgt, len_tgt = trim(tgt, eos)
        msk_tgt = tf.pad(msk_tgt, ((1, 0), (0, 0)), constant_values=True)
        # pads for decoder : lead=[bos]+tgt -> gold=tgt+[eos]
        lead, gold = tgt, tf.pad(tgt,
                                 paddings=((0, 1), (0, 0)),
                                 constant_values=eos)
        if 'train' == mode:
            lead *= tf.to_int32(
                tf.random_uniform(tf.shape(lead)) < rate_keepwd)
        lead = self.lead = tf.pad(lead,
                                  paddings=((1, 0), (0, 0)),
                                  constant_values=bos)

    # s : src length
    # t : tgt length plus one padding, either eos or bos
    # b : batch size
    #
    # len_src :  b  aka s
    # msk_src : sb  without padding
    # msk_tgt : tb  with eos
    #
    #    lead : tb  with bos
    #    gold : tb  with eos

    with scope('embed'):
        b = (6 / (dim_tgt / dim_emb + 1))**0.5
        embedding = tf.get_variable('embedding', (dim_tgt, dim_emb),
                                    initializer=tf.random_uniform_initializer(
                                        -b, b))
        emb_tgt = tf.gather(embedding, lead,
                            name='emb_tgt')  # (t, b) -> (t, b, dim_emb)
        emb_src = tf.gather(embedding, src,
                            name='emb_src')  # (s, b) -> (s, b, dim_emb)

    with scope('encode'):  # (s, b, dim_emb) -> (b, dim_emb)
        reverse = partial(tf.reverse_sequence,
                          seq_lengths=len_src,
                          seq_axis=0,
                          batch_axis=1)

        if bidirectional and bidir_stacked:
            for i in range(rnn_layers):
                with scope("rnn{}".format(i + 1)):
                    emb_fwd, _ = layer_rnn(1, dim_emb, name='fwd')(emb_src)
                    emb_bwd, _ = layer_rnn(1, dim_emb,
                                           name='bwd')(reverse(emb_src))
                    hs = emb_src = tf.concat((emb_fwd, reverse(emb_bwd)),
                                             axis=-1)

        elif bidirectional:
            with scope("rnn"):
                emb_fwd, _ = layer_rnn(rnn_layers, dim_emb,
                                       name='fwd')(emb_src)
                emb_bwd, _ = layer_rnn(rnn_layers, dim_emb,
                                       name='bwd')(reverse(emb_src))
            hs = tf.concat((emb_fwd, reverse(emb_bwd)), axis=-1)

        else:
            hs, _ = layer_rnn(rnn_layers, dim_emb, name='rnn')(emb_src)

        with scope('cata'):
            # extract the final states from the outputs: bd <- sbd, b2
            h = tf.gather_nd(
                hs,
                tf.stack(
                    (len_src - 1, tf.range(tf.size(len_src), dtype=tf.int32)),
                    axis=1))
            if attentive:  # todo fixme
                # the values are the outputs from all non-padding steps;
                # the queries are the final states;
                h = layer_nrm(h + tf.squeeze(  # bd <- bd1
                    attention(  # bd1 <- bd1, bds, b1s
                        tf.expand_dims(h, axis=2),  # query: bd1 <- bd
                        tf.transpose(hs, (1, 2, 0)),  # value: bds <- sbd
                        tf.log(
                            tf.to_float(  # -inf,0  mask: b1s <- sb <- bs
                                tf.expand_dims(tf.transpose(msk_src),
                                               axis=1))),
                        int(h.shape[-1])),
                    2))

    with scope('latent'):  # (b, dim_emb) -> (b, dim_rep) -> (b, dim_emb)
        # h = layer_aff(h, dim_emb, name='in')
        mu = self.mu = layer_aff(h, dim_rep, name='mu')
        lv = self.lv = layer_aff(h, dim_rep, name='lv')
        with scope('z'):
            h = mu
            if 'train' == mode:
                h += tf.exp(0.5 * lv) * tf.random_normal(shape=tf.shape(lv))
            self.z = h
        h = layer_aff(h, dim_emb, name='ex')

    with scope('decode'):  # (b, dim_emb) -> (t, b, dim_emb) -> (?, dim_emb)
        h = self.state_in = tf.stack((h, ) * rnn_layers)
        h, _ = _, (self.state_ex, ) = layer_rnn(rnn_layers,
                                                dim_emb,
                                                name='rnn')(
                                                    emb_tgt,
                                                    initial_state=(h, ))
        if 'infer' != mode: h = tf.boolean_mask(h, msk_tgt)
        h = layer_aff(h, dim_emb, name='out')

    with scope('logits'):  # (?, dim_emb) -> (?, dim_tgt)
        if logit_use_embed:
            logits = self.logits = tf.tensordot(h, (dim_emb**-0.5) *
                                                tf.transpose(embedding), 1)
        else:
            logits = self.logits = layer_aff(h, dim_tgt)

    with scope('prob'):
        prob = self.prob = tf.nn.softmax(logits)
    with scope('pred'):
        pred = self.pred = tf.argmax(logits, -1, output_type=tf.int32)

    if 'infer' != mode:
        labels = tf.boolean_mask(gold, msk_tgt, name='labels')
        with scope('errt'):
            errt_samp = self.errt_samp = tf.to_float(tf.not_equal(
                labels, pred))
            errt = self.errt = tf.reduce_mean(errt_samp)
        with scope('loss'):
            with scope('loss_gen'):
                loss_gen_samp = self.loss_gen_samp = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits)
                loss_gen = self.loss_gen = tf.reduce_mean(loss_gen_samp)
            with scope('loss_kld'):
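                # closed-form KL( N(mu, exp(lv)) || N(0, 1) ) per dimension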
                loss_kld_samp = self.loss_kld_samp = 0.5 * (
                    tf.square(mu) + tf.exp(lv) - lv - 1.0)
                loss_kld = self.loss_kld = tf.reduce_mean(loss_kld_samp)
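            # rate_anneal scales the KL term (KL annealing), typically warmed up
            # from zero during training to temper posterior collapse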
            loss = self.loss = rate_anneal * loss_kld + loss_gen

    if 'train' == mode:
        with scope('train'):
            train_step = self.train_step = tf.train.AdamOptimizer(
                rate_update).minimize(loss, step)

    return self
Example 20
# load sentencepiece model
vocab = sp.load_spm(path_vocab)

# Load the model
model = vAe('infer')
# Restore the session
sess = tf.InteractiveSession()
tf.train.Saver().restore(sess, path_ckpt)

################################
# deterministic representation #
################################

# encode text with sentence piece model
data = list(map(partial(sp.encode_capped, vocab), text))
data = vpack(data, (len(data), max(map(len, data))), vocab.eos_id(), np.int32)

# calculate z for the test data in batches
inpt = [model.z.eval({model.src: data[i:j]}) for i, j in partition(len(data), 128)]
inpt = np.concatenate(inpt, axis=0)

np.save(path_emb, inpt)
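# the saved matrix can be reloaded later with np.load(path_emb)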

#######################################################
# averaged representation with sentencepiece sampling #
#######################################################

def infer_avg(sent, samples=128):
    bat = [sp.encode_capped_sample(vocab, sent) for _ in range(samples)]
    bat = vpack(bat, (len(bat), max(map(len, bat))), vocab.eos_id(), np.int32)
    # assumed completion (the original is truncated here): average z over the samples
    return np.mean(model.z.eval({model.src: bat}), axis=0)
Example 21
def updateNotes( allDb ):
    t_0, now, db, TAG   = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi         = [], {}
    N_notes             = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Updating data', max=N_notes, immediate=True )
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False ) # fidDb() already forces locDb recalc

    # handle secondary databases
    mw.progress.update( label='Creating seen/known/mature from all.db' )
    seenDb      = filterDbByMat( allDb, cfg1('threshold_seen') )
    knownDb     = filterDbByMat( allDb, cfg1('threshold_known') )
    matureDb    = filterDbByMat( allDb, cfg1('threshold_mature') )
    mw.progress.update( label='Loading priority.db' )
    priorityDb  = MorphDb( cfg1('path_priority'), ignoreErrors=True ).db

    if cfg1('saveDbs'):
        mw.progress.update( label='Saving seen/known/mature dbs' )
        seenDb.save( cfg1('path_seen') )
        knownDb.save( cfg1('path_known') )
        matureDb.save( cfg1('path_mature') )

    mw.progress.update( label='Calculating frequency information' )
    pops = [ len( locs ) for locs in allDb.db.values() ]
    pops = [ n for n in pops if n > 1 ]

    mw.progress.update( label='Updating notes' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )
        if not C('enabled'): continue
        # Get all morphemes for note
        ms = set()
        for fieldName in C('morph_fields'):
            try:
                loc = fidDb[ ( nid, guid, fieldName ) ]
                ms.update( locDb[ loc ] )
            except KeyError: continue
        ms = [ m for m in ms if m.pos not in C('morph_blacklist') ]

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for m in ms:
            if m not in seenDb.db:      unseens.add( m )
            if m not in knownDb.db:     unknowns.add( m )
            if m not in matureDb.db:    unmatures.add( m )
            if m not in matureDb.db and m in knownDb.db:
                newKnowns.add( m )

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len( ms ), len( unseens ), len( unknowns ), len( unmatures )

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'): continue

            # average frequency of unknowns (i.e. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns: # focusMorph used outside loop
            F_k += len( allDb.db[ focusMorph ] )
        F_k_avg = F_k / N_k if N_k > 0 else F_k
        usefulness = F_k_avg

            # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

            # add bonus for studying recent learned knowns (reinforce)
        for m in newKnowns:
            locs = allDb.db[ m ]
            if locs:
                ivl = max( 1, max( loc.maturity for loc in locs ) ) # decay the bonus with maturity; clamp the divisor to at least 1
                usefulness += C('reinforce new vocab weight') / ivl #TODO: maybe average this so it doesn't favor long sentences

        if any( m.pos == u'動詞' for m in unknowns ): #FIXME: this isn't working???
            usefulness += C('verb bonus')

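        # invert and cap so that more useful morphs end up with a smaller, earlier-due score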
        usefulness = 999 - min( 999, usefulness )

            # difference from optimal length (too little context vs long sentence)
        lenDiff = max( 0, min( 9, abs( C('optimal sentence length') - N ) -2 ) )

            # calculate mmi
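            # k+N dominates, then distance from the optimal length; usefulness breaks ties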
        mmi = 10000*N_k + 1000*lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[ nid ] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split( tags ), splitFields( flds )
            # determine card type
        compTag, vocabTag, notReadyTag, alreadyKnownTag, priorityTag = tagNames = C('tag_comprehension'), C('tag_vocab'), C('tag_notReady'), C('tag_alreadyKnown'), C('tag_priority')
        if N_m == 0:    # sentence comprehension card, m+0
            ts = [ compTag ] + [ t for t in ts if t not in [ vocabTag, notReadyTag ] ]
            setField( mid, fs, C('focusMorph'), u'' )
        elif N_k == 1:  # new vocab card, k+1
            ts = [ vocabTag ] + [ t for t in ts if t not in [ compTag, notReadyTag ] ]
            setField( mid, fs, C('focusMorph'), u'%s' % focusMorph.base )
        elif N_k > 1:   # M+1+ and K+2+
            ts = [ notReadyTag ] + [ t for t in ts if t not in [ compTag, vocabTag ] ]

            # set type agnostic fields
        setField( mid, fs, C('k+N'), u'%d' % N_k )
        setField( mid, fs, C('m+N'), u'%d' % N_m )
        setField( mid, fs, C('morphManIndex'), u'%d' % mmi )
        setField( mid, fs, C('unknowns'), u', '.join( u.base for u in unknowns ) )
        setField( mid, fs, C('unmatures'), u', '.join( u.base for u in unmatures ) )
        setField( mid, fs, C('unknownFreq'), u'%d' % F_k_avg )

            # other tags
        if priorityTag in ts:   ts.remove( priorityTag )
        if isPriority:          ts.append( priorityTag )

            # update sql db
        tags_ = TAG.join( TAG.canonify( ts ) )
        flds_ = joinFields( fs )
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum( fs[0] )
            sfld = stripHTML( fs[ getSortFieldIndex( mid ) ] )
            ds.append( { 'now':now, 'tags':tags_, 'flds':flds_, 'sfld':sfld, 'csum':csum, 'usn':mw.col.usn(), 'nid':nid } )

    mw.progress.update( value=i, label='Updating anki database...' )
    mw.col.db.executemany( 'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid', ds )
    TAG.register( tagNames )

    # Now reorder new cards based on MMI
    mw.progress.update( value=i, label='Updating new card ordering...' )
    ds = []
    for ( cid, nid, due ) in db.execute( 'select id, nid, due from cards where type = 0' ):
        if nid in nid2mmi: # otherwise it was disabled
            due_ = nid2mmi[ nid ]
            if due != due_: # only update cards that have changed
                ds.append( { 'now':now, 'due':due_, 'usn':mw.col.usn(), 'cid':cid } )
    mw.col.db.executemany( 'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds )
    mw.reset()

    printf( 'Updated notes in %f sec' % ( time.time() - t_0 ) )
    mw.progress.finish()
Example 22
    def __init__(self, tree, filename, firstlineno=1, nopassmacros=None,
                 varnames=None, argcount=0, codename='*', parent=None,
                 linemap=None, codeflags=0, docstring=None):

        self.code = []
        self.filename = filename
        self.name = codename
        self.argcount = argcount
        self.var_translators = []

        ## Map of ids of tuples/symbols in input to tuples of
        # (filename, startline, endline)
        if linemap is None:
            linemap = {}
        self.linemap = linemap

        if varnames is None:
            varnames = []
        ## List of arguments that are either local or cell
        self.varnames = varnames

        ## List of arguments that are either global, local, or cell
        self.unknowns = []

        ## List of other names needed in the function
        self.names = []

        ## List of constants used in the code block
        self.constants = [docstring]
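        # the docstring occupies index 0, matching CPython's co_consts convention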

        ## Map of variable names to the variable-name list in which
        # they want to belong.
        self.declareds = {}

        ## List of variable names which are to be taken from the function's
        # closure (the func_closure attribute).
        #
        # when appending to freevars, ensure that the corresponding parent
        # block has its variable made into a cellvar.
        self.freevars = []

        ## List of variable names which are referenced by nested blocks.
        # They may also appear in varnames.
        self.cellvars = []

        ## Maximum depth the value stack might reach when executing this
        # code block.
        self.maxstacklevel = 0
        self.stacklevel = 0

        ## 0x2000: future division (standard for Noodle); 0x4: *arguments;
        # 0x8: **kwargs; 0x20: generator
        self.codeflags = 0x2000 | codeflags

        ## Line number table
        self.lnotab = []
        self.lnotab_last_address = 0
        self.lnotab_last_lineno = firstlineno
        self.curlineno = self.firstlineno = firstlineno

        if nopassmacros is None:
            nopassmacros = {}
        self.nopassmacros = nopassmacros
        self.parent = parent

        self.setup_array_control()

        self.subfunc = partial(NoodleFunction,
                               filename=self.filename,
                               linemap=self.linemap,
                               parent=self)
        self.subclass = partial(NoodleClass,
                                filename=self.filename,
                                linemap=self.linemap,
                                parent=self)

        self.loadscopes = {
            'local': ('LOAD_FAST', self.varnames),
            'cell': ('LOAD_DEREF', self.cellvars),
            'free': ('LOAD_DEREF', self.freevars),
            'global': ('LOAD_GLOBAL', self.names)
        }
        self.storescopes = {
            'local': ('STORE_FAST', self.varnames),
            'cell': ('STORE_DEREF', self.cellvars),
            'free': ('STORE_DEREF', self.freevars),
            'global': ('STORE_GLOBAL', self.names)
        }
        self.delscopes = {
            'local': ('DELETE_FAST', self.varnames),
            'global': ('DELETE_GLOBAL', self.names)
        }

        self.extra_setup()
        self.compile_piece(tree)
        self.close_block()
        self.compile_deferreds()
        self.codestring = self.arrange_code(
            optimizing.OptimizeBytecode(self, self.code)
        )
Example 23
#!/usr/bin/env python3

from util import comp, partial, PointedIndex
from util_io import path, load_meta, load
from util_np import np, vpack

names, texts = load_meta()

chars = {char for text in texts for char in text}
chars.remove("\n")
chars.remove(" ")
index = PointedIndex(" \n" + "".join(sorted(chars)))
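# space maps to index 0 and newline to index 1; newline doubles as the fill value below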
texts = vpack(
    map(comp(partial(np.fromiter, dtype=np.uint8), partial(map, index)),
        texts), index("\n"))

np.save("trial/data/index", index.vec)
np.save("trial/data/texts", texts)
np.save("trial/data/names", names)

for name in names:
    np.save("trial/data/grams/" + name, load(path(name)))
Example 24
#!/usr/bin/env python3

path = "trial/data"

from os.path import join
from util import partial, PointedIndex
from util_io import load, chartab, encode
from util_np import np, vpack

src = list(load(join(path, "train_src")))
tgt = list(load(join(path, "train_tgt")))

idx_src = PointedIndex(chartab(src))
idx_tgt = PointedIndex(chartab(tgt))
enc_src = partial(encode, idx_src)
enc_tgt = partial(encode, idx_tgt)

assert 1 == idx_src("\n") == idx_tgt("\n")
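# pad with index 1, which the assert above pins to the newline character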
pack = lambda txt: vpack(map(partial(np.array, dtype=np.uint8), txt), fill=1)

np.save(join(path, "index_src"), idx_src.vec)
np.save(join(path, "index_tgt"), idx_tgt.vec)
np.save(join(path, "train_src"), pack(map(enc_src, src)))
np.save(join(path, "train_tgt"), pack(map(enc_tgt, tgt)))
np.save(join(path, "valid_src"),
        pack(map(enc_src, load(join(path, "valid_src")))))
np.save(join(path, "valid_tgt"),
        pack(map(enc_tgt, load(join(path, "valid_tgt")))))
Example 25
def my_getNewCard( self, _old ):
    '''Continually call _getNewCard until we get one with a focusMorph we haven't
    seen before. Also skip bad vocab cards.

    :type self: anki.sched.Scheduler
    :type _old: Callable
    '''

    while True:
        C = partial( cfg, None, self.col.decks.active()[0] )
        if not C('next new card feature'):
            return _old( self )
        if not C('new card merged fill'):
            c = _old( self )
            ''' :type c: anki.cards.Card '''
        else:   # pop from opposite direction and skip sibling spacing
            if not self._fillNew(): return
            ( id, due ) = self._newQueue.pop( 0 )
            c = self.col.getCard( id )
            self.newCount -= 1

        if not c: return			# no more cards
        n = c.note()

        # find the right morphemizer for this note, so we can apply model-dependent options (modify off == disable skip feature)
        from morphemes import getMorphemes
        from util import getFilter
        notefilter = getFilter(n)
        if notefilter is None: return c # this note is not configured in any filter -> proceed like normal without MorphMan-plugin
        if not notefilter['Modify']: return c # the deck should not be modified -> the user probably doesn't want the 'skip mature' feature

        # get the focus morph
        try: focusMorph = focus( n )		# field contains either the focusMorph or is empty
        except KeyError:
            tooltip( _( 'Encountered card without the \'focus morph\' field configured in the preferences. Please check your MorphMan settings and note models.') )
            return c	# card has no focusMorph field -> undefined behavior -> just proceed like normal

        # evaluate all conditions, on which this card might be skipped/buried
        isVocabCard = n.hasTag(jcfg('Tag_Vocab'))
        isNotReady = n.hasTag(jcfg('Tag_NotReady'))
        isComprehensionCard = n.hasTag(jcfg('Tag_Comprehension'))
        isFreshVocab = n.hasTag(jcfg('Tag_Fresh'))
        isAlreadyKnown = n.hasTag( jcfg('Tag_AlreadyKnown') )

        skipComprehension = jcfg('Option_SkipComprehensionCards')
        skipFresh = jcfg('Option_SkipFreshVocabCards')
        skipFocusMorphSeenToday = jcfg('Option_SkipFocusMorphSeenToday')

        skipCondition1 = (isComprehensionCard and skipComprehension)
        skipCondition2 = (isFreshVocab and skipFresh)
        skipCondition3 = isAlreadyKnown # the user requested that the vocabulary does not have to be shown
        skipCondition4 = (focusMorph in seenMorphs and skipFocusMorphSeenToday) # we already learned that/saw that today
        #skipCondition5 = not (isVocabCard or isNotReady) # even if it is not a good vocabulary card, we have no choice when there are no other cards available

        # skip/bury card if any skip condition is true
        if skipCondition1 or skipCondition2 or skipCondition3 or skipCondition4:
            self.buryCards( [ c.id ] )
            self.newCount += 1 # the card was pulled from the "new queue", so restore the "new counter" to its original value
            continue
        break

    return c