def my_getNewCard( self, _old ):
    '''Return the next new card whose focusMorph has not been seen yet.

    Cards judged unsuitable (not a good vocab card, focusMorph already seen,
    or note tagged as already known) have their note buried and the search
    continues. Returns None when no card is available.'''
    while True:
        # Config is looked up again on every pass, exactly as each retry needs it.
        deckCfg = partial( cfg, None, self.col.decks.active()[0] )
        if not deckCfg('next new card feature'):
            return _old( self )

        if deckCfg('new card merged fill'):
            # Merged fill: take from the front of the queue (opposite
            # direction) and skip sibling spacing.
            if not self._fillNew():
                return
            cardId, _due = self._newQueue.pop( 0 )
            card = self.col.getCard( cardId )
        else:
            card = _old( self )

        if not card:
            return  # no more cards
        note = card.note()

        try:
            focusMorph = focus( note )  # either the focusMorph or empty
        except KeyError:
            return card  # no focusMorph field -> assume the card is good

        # A good vocab card is k+1. Without a k+N field (or when disabled),
        # fall back to whether the note has a focus morph.
        try:
            isGoodVocab = note[ cfg( note.mid, None, 'k+N' ) ] == '1'
        except KeyError:
            isGoodVocab = focusMorph

        skip = ( not isGoodVocab
                 or focusMorph in seenMorphs
                 or note.hasTag( CN( note, 'tag_alreadyKnown' ) ) )
        if not skip:
            return card
        self.buryNote( card.note().id )
def my_getNewCard(self, _old):
    '''Return the next new card whose focusMorph has not been seen yet.

    Unsuitable cards (not a good vocab card, focusMorph already seen, or note
    tagged as already known) get their note buried and the loop tries again.
    Returns None when there is nothing left to hand out.'''
    while True:
        deck_cfg = partial(cfg, None, self.col.decks.active()[0])
        if not deck_cfg('next new card feature'):
            return _old(self)

        if deck_cfg('new card merged fill'):
            # pop from the opposite direction and skip sibling spacing
            if not self._fillNew():
                return
            card_id, _due = self._newQueue.pop(0)
            card = self.col.getCard(card_id)
        else:
            card = _old(self)

        if not card:
            return  # no more cards
        note = card.note()

        try:
            focus_morph = focus(note)  # either the focusMorph or empty
        except KeyError:
            return card  # card has no focusMorph field -> assume it's good

        # Good vocab means k+1; if the k+N field is missing or disabled,
        # default to whether the note has a focus morph at all.
        try:
            good_vocab = note[cfg(note.mid, None, 'k+N')] == '1'
        except KeyError:
            good_vocab = focus_morph

        should_bury = (not good_vocab
                       or focus_morph in seenMorphs
                       or note.hasTag(CN(note, 'tag_alreadyKnown')))
        if not should_bury:
            return card
        self.buryNote(card.note().id)
def mkAllDb( allDb=None ):
    '''Build (or refresh) the all.db MorphDb from every enabled note.

    allDb -- existing MorphDb to update; a new one is created when falsy.

    For each note whose model config has 'enabled' set, the configured morph
    fields are analysed and stored in the location db keyed by an AnkiDeck
    location. Existing locations are replaced when their maturities or field
    value changed. Returns the rebuilt allDb (saved to disk when 'saveDbs'
    is set).'''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb( recalc=False )  # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    # `i` is read after the loop for the progress bar; bind it up front so an
    # empty notes table does not raise UnboundLocalError.
    i = 0
    for i, ( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:
            mw.progress.update( value=i )
        C = partial( cfg, mid, None )
        if not C('enabled'):
            continue

        # One maturity per card of the note: the interval, or 0.5 for cards
        # with ivl == 0 and type == 1.
        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl )
                 for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        ts, alreadyKnownTag = TAG.split( tags ), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            # Add a maturity just above the mature threshold.
            mats += [ C('threshold_mature')+1 ]

        for fieldName in C('morph_fields'):
            try:  # if the note doesn't have the field, continue
                fieldValue = getMecabField( fieldName, flds, mid )
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes( fieldValue )
                if ms:  #TODO: this needed? should we change below too then?
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf( ' .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf( ' .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes( fieldValue )
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb
def extra_setup(self):
    """Initialise the per-instance compile state.

    Pre-binds ``parent=None`` on ``subfunc``/``subclass``, builds the
    load/store/delete scope tables (opcode name paired with ``self.names``
    for both the 'local' and 'global' scope), and resets the macro tables.
    """
    for attr in ('subfunc', 'subclass'):
        setattr(self, attr, partial(getattr(self, attr), parent=None))

    # (attribute, local opcode, global opcode); both scopes share self.names.
    scope_specs = (
        ('loadscopes', 'LOAD_NAME', 'LOAD_GLOBAL'),
        ('storescopes', 'STORE_NAME', 'STORE_GLOBAL'),
        ('delscopes', 'DELETE_NAME', 'DELETE_GLOBAL'),
    )
    for attr, local_op, global_op in scope_specs:
        setattr(self, attr, {
            'local': (local_op, self.names),
            'global': (global_op, self.names),
        })

    self.localmacros = {}  # XXX: ?
    # XXX: ?
    self.compiling_namespace = default_macro_namespace.copy()
def my_fillNew( self, _old ):
    '''Refill the new-card queue.

    When 'new card merged fill' is enabled for the current deck, the queue is
    filled from a single query over the deck limit, ordered by due and capped
    at the queue limit. When disabled, the standard fill (_old) is used.
    Returns True when the queue holds cards, False when the new count is
    exhausted, and None when the merged query found nothing.'''
    deckCfg = partial( cfg, None, self.col.decks.active()[0] )
    if not deckCfg('new card merged fill'):
        return _old( self )

    if self._newQueue:
        return True
    if not self.newCount:
        return False

    query = ( 'select id, due from cards where did in %s and queue = 0 '
              'and due >= ? order by due limit ?' ) % self._deckLimit()
    minDue = deckCfg('new card merged fill min due')
    self._newQueue = self.col.db.all( query, minDue, self.queueLimit )
    if self._newQueue:
        return True
    return None
def my_getNewCard(self, _old):
    '''Return the next new card whose focusMorph has not been seen yet,
    burying unsuitable cards on the way. Returns None when nothing is left.

    :type self: anki.sched.Scheduler
    :type _old: Callable
    '''
    while True:
        deck_cfg = partial(cfg, None, self.col.decks.active()[0])
        if not deck_cfg('next new card feature'):
            return _old(self)

        if deck_cfg('new card merged fill'):
            # pop from the opposite direction and skip sibling spacing
            if not self._fillNew():
                return
            card_id, _due = self._newQueue.pop(0)
            card = self.col.getCard(card_id)
            self.newCount -= 1
        else:
            card = _old(self)  # :type card: anki.cards.Card

        if not card:
            return  # no more cards
        note = card.note()

        try:
            focus_morph = focus(note)  # either the focusMorph or empty
        except KeyError:
            return card  # card has no focusMorph field -> assume it's good

        # Good vocab means the unknown-morph count is exactly 1; without that
        # field, default to whether the note has a focus morph.
        try:
            good_vocab = note[jcfg('Field_UnknownMorphCount')] == '1'
        except KeyError:
            good_vocab = focus_morph

        # A card that is not good vocab is only rejected when it lacks the
        # not-ready tag: with no other cards available there is no choice.
        rejected_vocab = not good_vocab and not note.hasTag(jcfg('Tag_NotReady'))
        if not (rejected_vocab
                or note.hasTag(jcfg('Tag_AlreadyKnown'))
                or focus_morph in seenMorphs):
            return card

        self.buryCards([card.id])
        # The card was taken from the new queue, so restore the new counter
        # to its previous value.
        self.newCount += 1
valid_en, train_en = np.load(pform(P.data, "valid_en.npy")), np.load(pform(P.data, "train_en.npy")) # valid_nl, train_nl = np.load(pform(P.data, "valid_nl.npy")), np.load(pform(P.data, "train_nl.npy")) valid_de, train_de = np.load(pform(P.data, "valid_de.npy")), np.load(pform(P.data, "train_de.npy")) # valid_da, train_da = np.load(pform(P.data, "valid_da.npy")), np.load(pform(P.data, "train_da.npy")) valid_sv, train_sv = np.load(pform(P.data, "valid_sv.npy")), np.load(pform(P.data, "train_sv.npy")) data_index = 0, 2, 4 data_valid = valid_en, valid_de, valid_sv data_train = train_en, train_de, train_sv def batch(arrs, size= C.batch_train, seed= C.seed): size //= len(arrs) * (len(arrs) - 1) for i in batch_sample(len(arrs[0]), size, seed): yield tuple(arr[i] for arr in arrs) perm = comp(tuple, partial(permutations, r= 2)) data_index = perm(data_index) data_valid = perm(data_valid) data_train = perm(pipe(partial(batch, data_train), (tf.int32,)*len(data_train), prefetch= 16)) ############### # build model # ############### model = Model.new(**select(C, *Model._new)) valid = tuple(model.data(i, j).valid() for i, j in data_index) train = tuple(model.data(i, j, s, t).train(**T) for (i, j), (s, t) in zip(data_index, data_train)) model.lr = train[0].lr model.step = train[0].step model.errt = train[0].errt
def mkAllDb(allDb=None):
    '''Build (or refresh) the all.db MorphDb from every enabled note.

    allDb -- existing MorphDb to update; a new one is created when falsy.

    For each note whose model config has 'enabled' set, the configured morph
    fields are analysed and stored in the location db keyed by an AnkiDeck
    location. Existing locations are replaced when their maturities or field
    value changed. Returns the rebuilt allDb (saved to disk when 'saveDbs'
    is set).'''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Prep work for all.db creation', max=N_notes, immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    # `i` is read after the loop for the progress bar; bind it up front so an
    # empty notes table does not raise UnboundLocalError.
    i = 0
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'):
            continue

        # One maturity per card of the note: the interval, or 0.5 for cards
        # with ivl == 0 and type == 1.
        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0] * len(mats)
        ts, alreadyKnownTag = TAG.split(tags), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            # Add a maturity just above the mature threshold.
            mats += [C('threshold_mature') + 1]

        for fieldName in C('morph_fields'):
            try:  # if the note doesn't have the field, continue
                fieldValue = getMecabField(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(fieldValue)
                if ms:  #TODO: this needed? should we change below too then?
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf(' .mats for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf(' .morphs for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(fieldValue)
                    locDb.pop(loc)
                    locDb[newLoc] = ms

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
def updateNotes(allDb):
    '''Recompute MorphMan fields/tags for every enabled note and reorder new cards.

    allDb -- the MorphDb holding all morphemes and their locations.

    Derives the seen/known/mature dbs from allDb (saving them when 'saveDbs'
    is set), computes per note the unknown/unmature counts and the Morph Man
    Index (mmi = 10000 * N_k + 1000 * lenDiff + usefulness), writes the
    configured fields and tags for changed notes, and sets the due value of
    new cards (type = 0) to the note's mmi when 'set due based on mmi' is
    enabled. Returns the known db.'''
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db

    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Calculating frequency information')
    # NOTE(review): pops is computed but never read below.
    pops = [len(locs) for locs in allDb.db.values()]
    pops = [n for n in pops if n > 1]

    mw.progress.update(label='Updating notes')
    # `i` and `tagNames` are read after the loop but only assigned inside it.
    # Bind them first so an empty table, or a run in which every note is
    # skipped, does not end in a NameError.
    i = 0
    tagNames = None
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'):
            continue

        # Get all morphemes for note
        ms = set()
        for fieldName in C('morph_fields'):
            try:
                loc = fidDb[(nid, guid, fieldName)]
                ms.update(locDb[loc])
            except KeyError:
                continue
        ms = [m for m in ms if m.pos not in C('morph_blacklist')]

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for m in ms:
            if m not in seenDb.db:
                unseens.add(m)
            if m not in knownDb.db:
                unknowns.add(m)
            if m not in matureDb.db:
                unmatures.add(m)
            if m not in matureDb.db and m in knownDb.db:
                newKnowns.add(m)

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len(ms), len(unseens), len(unknowns), len(unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'):
            continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += len(allDb.db[focusMorph])
        F_k_avg = F_k / N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for m in newKnowns:
            locs = allDb.db[m]
            if locs:
                # NOTE(review): min() caps ivl at 1; confirm max() was not intended.
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C('reinforce new vocab weight') / ivl  #TODO: maybe average this so it doesnt favor long sentences

        if any(m.pos == u'動詞' for m in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        # Invert so that a more useful note gets a smaller value (0..999).
        usefulness = 999 - min(999, usefulness)

        # difference from optimal length (too little context vs long sentence)
        lenDiff = max(0, min(9, abs(C('optimal sentence length') - N) - 2))
        tooLong = N > C('optimal sentence length')

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)
        # determine card type
        compTag, vocabTag, notReadyTag, alreadyKnownTag, priorityTag, badLengthTag, tooLongTag = tagNames = (
            C('tag_comprehension'), C('tag_vocab'), C('tag_notReady'),
            C('tag_alreadyKnown'), C('tag_priority'), C('tag_badLength'),
            C('tag_tooLong'))
        if N_m == 0:  # sentence comprehension card, m+0
            ts = [compTag] + [t for t in ts if t not in [vocabTag, notReadyTag]]
            setField(mid, fs, C('focusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = [vocabTag] + [t for t in ts if t not in [compTag, notReadyTag]]
            # focusMorph is the single unknown, left over from the loops above
            setField(mid, fs, C('focusMorph'), u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = [notReadyTag] + [t for t in ts if t not in [compTag, vocabTag]]

        # set type agnostic fields
        setField(mid, fs, C('k+N'), u'%d' % N_k)
        setField(mid, fs, C('m+N'), u'%d' % N_m)
        setField(mid, fs, C('morphManIndex'), u'%d' % mmi)
        setField(mid, fs, C('unknowns'), u', '.join(u.base for u in unknowns))
        setField(mid, fs, C('unmatures'), u', '.join(u.base for u in unmatures))
        setField(mid, fs, C('unknownFreq'), u'%d' % F_k_avg)

        # other tags: drop each one, then re-add it when it applies
        if priorityTag in ts:
            ts.remove(priorityTag)
        if isPriority:
            ts.append(priorityTag)

        if badLengthTag in ts:
            ts.remove(badLengthTag)
        if lenDiff:
            ts.append(badLengthTag)

        if tooLongTag in ts:
            ts.remove(tooLongTag)
        if tooLong:
            ts.append(tooLongTag)

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)
    # tagNames holds the tag names of the last note that was fully processed;
    # it stays None when no note got that far.
    if tagNames is not None:
        TAG.register(tagNames)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []
    for (cid, nid, due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # owise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb
def bleu(gold, pred):
    """-> float in [0, 1]; gold, pred : seq (sent : seq (word : str))

    Corpus-level BLEU of `pred` against `gold`, where each predicted
    sentence has exactly one reference sentence.
    """
    from nltk.translate.bleu_score import corpus_bleu
    # corpus_bleu expects a list of references per hypothesis.
    return corpus_bleu([[g] for g in gold], pred)


def parse_args(argv=None):
    """Parse the command line.

    argv : optional list of argument strings; when None, argparse reads
    sys.argv[1:] as before.

    -> namespace with `gold` (str), `pred` (list of str), and
    `ignore_case` (bool).
    """
    import argparse
    parser = argparse.ArgumentParser(description="corpus-level BLEU score.")
    parser.add_argument('gold', help="file for the gold-standard sentences")
    parser.add_argument('pred', help="files for the predicted sentences", nargs='+')
    parser.add_argument('--ignore-case', action='store_true', help="case insensitive")
    return parser.parse_args(argv)


if '__main__' == __name__:
    args = parse_args()
    from util import comp, partial
    from util_io import load
    # Each line is split into words, optionally after lowercasing.
    proc = str.split
    if args.ignore_case:
        proc = comp(proc, str.lower)
    load_corpus = comp(list, partial(map, proc), load)
    gold = load_corpus(args.gold)
    for pred in args.pred:
        print(pred, "{:.4f}".format(bleu(gold, load_corpus(pred))))
def mkAllDb(allDb=None):
    '''Build (or refresh) the all.db MorphDb from every note matched by a filter.

    allDb -- existing MorphDb to update; a new one is created when falsy.

    The config module is reloaded first. Each note for which getFilter()
    returns a config has the filter's fields run through the filter's
    morphemizer and stored in the location db keyed by an AnkiDeck location;
    existing locations are replaced when their maturities or field value
    changed. Returns the rebuilt allDb (saved to disk when 'saveDbs' is
    set), or None after showing an error when no note matched any filter.'''
    import config
    reload(config)

    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    N_enabled_notes = 0  # for providing an error message if there is no note that is used for processing
    mw.progress.start(label='Prep work for all.db creation', max=N_notes, immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None:
            continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        # One maturity per card of the note: the interval, or 0.5 for cards
        # with ivl == 0 and type == 1.
        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            # Add a maturity just above the mature threshold.
            mats += [C('threshold_mature') + 1]

        for fieldName in notecfg['Fields']:
            try:  # if the note doesn't have the field, continue
                fieldValue = extractFieldData(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms:  #TODO: this needed? should we change below too then?
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(morphemizer, fieldValue, ts)
                    locDb.pop(loc)
                    locDb[newLoc] = ms

    if N_enabled_notes == 0:
        # Nothing matched a filter (this also covers an empty notes table, so
        # the loop variable `i` is never read unbound below).
        mw.progress.finish()
        errorMsg(
            u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorphMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.'
        )
        return None

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
def updateNotes(allDb):
    '''Recompute MorphMan fields/tags for every modifiable note and reorder new cards.

    allDb -- the MorphDb holding all morphemes and their locations.

    Derives the seen/known/mature dbs from allDb (saving them when 'saveDbs'
    is set), computes per note the unknown/unmature counts and the Morph Man
    Index (mmi = 10000 * N_k + 1000 * lenDiff + usefulness), writes the
    configured fields and tags for changed notes, and sets the due value of
    new cards (type = 0) to the note's mmi when 'set due based on mmi' is
    enabled. Notes without a filter, or whose filter has 'Modify' unset, are
    skipped. Returns the known db.'''
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # read tag names
    compTag, vocabTag, freshTag, notReadyTag, alreadyKnownTag, priorityTag, tooShortTag, tooLongTag = tagNames = (
        jcfg('Tag_Comprehension'), jcfg('Tag_Vocab'), jcfg('Tag_Fresh'),
        jcfg('Tag_NotReady'), jcfg('Tag_AlreadyKnown'), jcfg('Tag_Priority'),
        jcfg('Tag_TooShort'), jcfg('Tag_TooLong'))
    TAG.register(tagNames)
    badLengthTag = jcfg2().get('Tag_BadLength')

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db

    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Updating notes')
    # `i` is read after the loop for the progress bar; bind it up front so an
    # empty notes table does not raise UnboundLocalError.
    i = 0
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None or not notecfg['Modify']:
            continue

        # Get all morphemes for note
        morphemes = set()
        for fieldName in notecfg['Fields']:
            try:
                loc = fidDb[(nid, guid, fieldName)]
                morphemes.update(locDb[loc])
            except KeyError:
                continue

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for morpheme in morphemes:
            if morpheme not in seenDb.db:
                unseens.add(morpheme)
            if morpheme not in knownDb.db:
                unknowns.add(morpheme)
            if morpheme not in matureDb.db:
                unmatures.add(morpheme)
            if morpheme not in matureDb.db and morpheme in knownDb.db:
                newKnowns.add(morpheme)

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len(morphemes), len(unseens), len(unknowns), len(unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'):
            continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += allDb.frequency(focusMorph)
        F_k_avg = F_k // N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for morpheme in newKnowns:
            locs = allDb.db[morpheme]
            if locs:
                # NOTE(review): min() caps ivl at 1; confirm max() was not intended.
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C('reinforce new vocab weight') // ivl  #TODO: maybe average this so it doesnt favor long sentences

        if any(morpheme.pos == u'動詞' for morpheme in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        # Invert so that a more useful note gets a smaller value (0..999).
        usefulness = 999 - min(999, usefulness)

        # difference from optimal length range (too little context vs long sentence):
        # negative when shorter than the minimum, positive when longer than
        # the maximum, 0 inside the range
        lenDiffRaw = min(N - C('min good sentence length'),
                         max(0, N - C('max good sentence length')))
        lenDiff = min(9, abs(lenDiffRaw))

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)

        # clear any 'special' tags, the appropriate will be set in the next few lines
        ts = [t for t in ts if t not in [notReadyTag, compTag, vocabTag, freshTag]]

        # determine card type
        if N_m == 0:  # sentence comprehension card, m+0
            ts = ts + [compTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = ts + [vocabTag]
            # focusMorph is the single unknown, left over from the loops above
            setField(mid, fs, jcfg('Field_FocusMorph'), u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = ts + [notReadyTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_m == 1:  # we have k+0, and m+1, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [freshTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'%s' % list(unmatures)[0].base)
        else:  # only case left: we have k+0, but m+2 or higher, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [freshTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')

        # set type agnostic fields
        setField(mid, fs, jcfg('Field_UnknownMorphCount'), u'%d' % N_k)
        setField(mid, fs, jcfg('Field_UnmatureMorphCount'), u'%d' % N_m)
        setField(mid, fs, jcfg('Field_MorphManIndex'), u'%d' % mmi)
        setField(mid, fs, jcfg('Field_Unknowns'), u', '.join(u.base for u in unknowns))
        setField(mid, fs, jcfg('Field_Unmatures'), u', '.join(u.base for u in unmatures))
        setField(mid, fs, jcfg('Field_UnknownFreq'), u'%d' % F_k_avg)

        # remove deprecated tag
        if badLengthTag is not None and badLengthTag in ts:
            ts.remove(badLengthTag)

        # other tags: drop each one, then re-add it when it applies
        if priorityTag in ts:
            ts.remove(priorityTag)
        if isPriority:
            ts.append(priorityTag)

        if tooShortTag in ts:
            ts.remove(tooShortTag)
        if lenDiffRaw < 0:
            ts.append(tooShortTag)

        if tooLongTag in ts:
            ts.remove(tooLongTag)
        if lenDiffRaw > 0:
            ts.append(tooLongTag)

        # remove unnecessary tags
        if not jcfg('Option_SetNotRequiredTags'):
            unnecessary = [priorityTag, tooShortTag, tooLongTag]
            ts = [tag for tag in ts if tag not in unnecessary]

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []
    # "type = 0": new cards
    # "type = 1": learning cards [is supposed to be learning: in my case no learning card had this type]
    # "type = 2": review cards
    for (cid, nid, due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # owise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb
from util import partial, Record from util_np import np from util_tf import tf, placeholder, trim, get_shape scope = partial(tf.variable_scope, reuse=tf.AUTO_REUSE) init_bias = tf.zeros_initializer() init_kern = tf.variance_scaling_initializer(1.0, 'fan_avg', 'uniform') init_relu = tf.variance_scaling_initializer(2.0, 'fan_avg', 'uniform') layer_nrm = tf.contrib.layers.layer_norm layer_aff = partial(tf.layers.dense, kernel_initializer=init_kern, bias_initializer=init_bias) layer_act = partial(tf.layers.dense, kernel_initializer=init_relu, bias_initializer=init_bias, activation=tf.nn.relu) layer_rnn = partial(tf.contrib.cudnn_rnn.CudnnGRU, kernel_initializer=init_kern, bias_initializer=init_bias) def attention(query, value, mask, dim, head=8): """computes scaled dot-product attention query : tensor f32 (b, d_q, t) value : tensor f32 (b, d_v, s) mask : tensor f32 (b, t, s) -> tensor f32 (b, dim, t)
def mkAllDb( allDb=None ):
    '''Build (or refresh) the all.db MorphDb from every note matched by a filter.

    allDb -- existing MorphDb to update; a new one is created when falsy.

    The config module is reloaded first. Each note for which getFilter()
    returns a config has the filter's fields run through the filter's
    morphemizer and stored in the location db keyed by an AnkiDeck location;
    existing locations are replaced when their maturities or field value
    changed. Returns the rebuilt allDb (saved to disk when 'saveDbs' is
    set), or None after showing an error when no note matched any filter.'''
    import config
    reload(config)

    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    N_enabled_notes = 0  # for providing an error message if there is no note that is used for processing
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb( recalc=False )  # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    for i, ( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:
            mw.progress.update( value=i )
        C = partial( cfg, mid, None )

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None:
            continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        # One maturity per card of the note: the interval, or 0.5 for cards
        # with ivl == 0 and type == 1.
        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl )
                 for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        if C('ignore maturity'):
            mats = [ 0 for mat in mats ]
        ts, alreadyKnownTag = TAG.split( tags ), jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            # Add a maturity just above the mature threshold.
            mats += [ C('threshold_mature')+1 ]

        for fieldName in notecfg['Fields']:
            try:  # if the note doesn't have the field, continue
                fieldValue = extractFieldData( fieldName, flds, mid )
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms:  #TODO: this needed? should we change below too then?
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes(morphemizer, fieldValue, ts)
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms

    if N_enabled_notes == 0:
        # Nothing matched a filter (this also covers an empty notes table, so
        # the loop variable `i` is never read unbound below).
        mw.progress.finish()
        errorMsg(u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorphMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.')
        return None

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb
def updateNotes( allDb ):
    '''Recompute MorphMan tags and fields for every modifiable note and reorder new cards.

    For each note whose filter has 'Modify' set, the morphemes of its configured
    fields are compared with the seen/known/mature databases derived from allDb.
    From that the Morph Man Index (mmi) is computed, the card-type tag and the
    informational fields are rewritten, and changed notes are written back in one
    batch. New cards (type = 0) of notes with an mmi get that mmi as their due value.

    :param allDb: morpheme database (provides fidDb(), locDb(), db and frequency())
    :return: the database of known morphemes derived from allDb
    '''
    t_0, now, db, TAG   = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi         = [], {}
    N_notes             = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Updating data', max=N_notes, immediate=True )
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False ) # fidDb() already forces locDb recalc

    # read tag names
    compTag, vocabTag, freshTag, notReadyTag, alreadyKnownTag, priorityTag, tooShortTag, tooLongTag = tagNames = jcfg('Tag_Comprehension'), jcfg('Tag_Vocab'), jcfg('Tag_Fresh'), jcfg('Tag_NotReady'), jcfg('Tag_AlreadyKnown'), jcfg('Tag_Priority'), jcfg('Tag_TooShort'), jcfg('Tag_TooLong')
    TAG.register( tagNames )
    badLengthTag = jcfg2().get('Tag_BadLength')

    # handle secondary databases
    mw.progress.update( label='Creating seen/known/mature from all.db' )
    seenDb      = filterDbByMat( allDb, cfg1('threshold_seen') )
    knownDb     = filterDbByMat( allDb, cfg1('threshold_known') )
    matureDb    = filterDbByMat( allDb, cfg1('threshold_mature') )
    mw.progress.update( label='Loading priority.db' )
    priorityDb  = MorphDb( cfg1('path_priority'), ignoreErrors=True ).db

    if cfg1('saveDbs'):
        mw.progress.update( label='Saving seen/known/mature dbs' )
        seenDb.save( cfg1('path_seen') )
        knownDb.save( cfg1('path_known') )
        matureDb.save( cfg1('path_mature') )

    mw.progress.update( label='Updating notes' )
    i = 0 # keeps the progress updates after the loop valid when the notes table is empty
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None or not notecfg['Modify']: continue

        # Get all morphemes for note
        morphemes = set()
        for fieldName in notecfg['Fields']:
            try:
                loc = fidDb[ ( nid, guid, fieldName ) ]
                morphemes.update( locDb[ loc ] )
            except KeyError: continue

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for morpheme in morphemes:
            if morpheme not in seenDb.db:      unseens.add( morpheme )
            if morpheme not in knownDb.db:     unknowns.add( morpheme )
            if morpheme not in matureDb.db:    unmatures.add( morpheme )
            if morpheme not in matureDb.db and morpheme in knownDb.db:
                newKnowns.add( morpheme )

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len( morphemes ), len( unseens ), len( unknowns ), len( unmatures )

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'): continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = sum( allDb.frequency( morpheme ) for morpheme in unknowns )
        F_k_avg = F_k // N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for morpheme in unknowns:
            if morpheme in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for morpheme in newKnowns:
            locs = allDb.db[ morpheme ]
            if locs:
                # NOTE(review): min( 1, ... ) caps the interval at 1; max( 1, ... ) may have been intended - confirm
                ivl = min( 1, max( loc.maturity for loc in locs ) )
                if ivl != 0: # a zero maturity would otherwise abort the whole update with ZeroDivisionError
                    usefulness += C('reinforce new vocab weight') // ivl #TODO: maybe average this so it doesnt favor long sentences

        if any( morpheme.pos == u'動詞' for morpheme in unknowns ): #FIXME: this isn't working???
            usefulness += C('verb bonus')

        usefulness = 999 - min( 999, usefulness )

        # difference from optimal length range (too little context vs long sentence)
        # negative: shorter than the minimum, positive: longer than the maximum, 0: within range
        lenDiffRaw = min(N - C('min good sentence length'),
                         max(0, N - C('max good sentence length')))
        lenDiff = min(9, abs(lenDiffRaw))

        # calculate mmi
        mmi = 10000*N_k + 1000*lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[ nid ] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split( tags ), splitFields( flds )

        # clear any 'special' tags, the appropriate will be set in the next few lines
        ts = [ t for t in ts if t not in [ notReadyTag, compTag, vocabTag, freshTag ] ]

        # determine card type
        if N_m == 0:    # sentence comprehension card, m+0
            ts = ts + [ compTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'' )
        elif N_k == 1:  # new vocab card, k+1
            ts = ts + [ vocabTag ]
            focusMorph = next( iter( unknowns ) ) # the single unknown morpheme of this note
            setField( mid, fs, jcfg('Field_FocusMorph'), u'%s' % focusMorph.base )
        elif N_k > 1:   # M+1+ and K+2+
            ts = ts + [ notReadyTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_m == 1: # we have k+0, and m+1, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [ freshTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'%s' % list(unmatures)[0].base)
        else: # only case left: we have k+0, but m+2 or higher, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [ freshTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'')

        # set type agnostic fields
        setField( mid, fs, jcfg('Field_UnknownMorphCount'), u'%d' % N_k )
        setField( mid, fs, jcfg('Field_UnmatureMorphCount'), u'%d' % N_m )
        setField( mid, fs, jcfg('Field_MorphManIndex'), u'%d' % mmi )
        setField( mid, fs, jcfg('Field_Unknowns'), u', '.join( u.base for u in unknowns ) )
        setField( mid, fs, jcfg('Field_Unmatures'), u', '.join( u.base for u in unmatures ) )
        setField( mid, fs, jcfg('Field_UnknownFreq'), u'%d' % F_k_avg )

        # remove deprecated tag
        if badLengthTag is not None and badLengthTag in ts:
            ts.remove( badLengthTag )

        # other tags
        if priorityTag in ts:   ts.remove( priorityTag )
        if isPriority:          ts.append( priorityTag )

        if tooShortTag in ts:   ts.remove( tooShortTag )
        if lenDiffRaw < 0:      ts.append( tooShortTag )

        if tooLongTag in ts:    ts.remove( tooLongTag )
        if lenDiffRaw > 0:      ts.append( tooLongTag )

        # remove unnecessary tags
        if not jcfg('Option_SetNotRequiredTags'):
            unnecessary = [priorityTag, tooShortTag, tooLongTag]
            ts = [tag for tag in ts if tag not in unnecessary]

        # update sql db
        tags_ = TAG.join( TAG.canonify( ts ) )
        flds_ = joinFields( fs )
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum( fs[0] )
            sfld = stripHTML( fs[ getSortFieldIndex( mid ) ] )
            ds.append( { 'now':now, 'tags':tags_, 'flds':flds_, 'sfld':sfld, 'csum':csum, 'usn':mw.col.usn(), 'nid':nid } )

    mw.progress.update( value=i, label='Updating anki database...' )
    mw.col.db.executemany( 'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid', ds )

    # Now reorder new cards based on MMI
    mw.progress.update( value=i, label='Updating new card ordering...' )
    ds = []

    # "type = 0": new cards
    # "type = 1": learning cards [is supposed to be learning: in my case no learning card had this type]
    # "type = 2": review cards
    for ( cid, nid, due ) in db.execute( 'select id, nid, due from cards where type = 0' ):
        if nid in nid2mmi: # owise it was disabled
            due_ = nid2mmi[ nid ]
            if due != due_: # only update cards that have changed
                ds.append( { 'now':now, 'due':due_, 'usn':mw.col.usn(), 'cid':cid } )
    mw.col.db.executemany( 'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds )
    mw.reset()

    printf( 'Updated notes in %f sec' % ( time.time() - t_0 ) )
    mw.progress.finish()
    return knownDb
# keep only the first 2**17 rows of each training array (as independent copies)
train_nl = train_nl[:2**17].copy()
train_da = train_da[:2**17].copy()

# parallel tuples: entry k of each tuple belongs to the same data set
data_index =        1,        3
data_valid = valid_nl, valid_da
data_train = train_nl, train_da

def batch(arrs, size=C.batch_train, seed=C.seed):
    """Yield tuples holding the same sampled rows from every array in `arrs`.

    `size` is divided by len(arrs) * (len(arrs) - 1), the number of ordered
    pairs that `perm` below builds from the arrays.  The index selection is
    delegated to `batch_sample` (presumably an endless sampler of index
    batches; confirm against its definition).
    """
    size //= len(arrs) * (len(arrs) - 1)
    for i in batch_sample(len(arrs[0]), size, seed):
        yield tuple(arr[i] for arr in arrs)

# perm(xs): tuple of all ordered pairs of distinct positions of xs
perm = comp(tuple, partial(permutations, r=2))
data_index = perm(data_index)
data_valid = perm(data_valid)
# the training arrays are fed through an int32 input pipeline before pairing
data_train = perm(pipe(partial(batch, data_train), (tf.int32, ) * len(data_train), prefetch=16))

###############
# build model #
###############

model = Model.new(**select(C, *Model._new))
# one validation and one training graph per ordered pair of data sets
valid = tuple(model.data(i, j).valid() for i, j in data_index)
train = tuple(model.data(i, j, s, t).train(**T) for (i, j), (s, t) in zip(data_index, data_train))
from util import Record, partial
import tensorflow as tf

# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# variable scope factory with reuse=tf.AUTO_REUSE preset
scope = partial(tf.variable_scope, reuse=tf.AUTO_REUSE)


def profile(sess, wtr, run, feed_dict=None, prerun=3, tag='flow'):
    """Run `run` with full tracing and attach the run metadata to `wtr`.

    sess      : session used to execute `run`
    wtr       : object providing `add_run_metadata` (presumably a
                tf.summary.FileWriter; confirm at the call site)
    run       : fetches passed to `sess.run`
    feed_dict : optional feed passed to every `sess.run` call
    prerun    : number of untraced runs executed before the traced one
    tag       : tag under which the metadata is recorded
    """
    # untraced runs first, so the traced run is not the first execution
    for _ in range(prerun): sess.run(run, feed_dict)
    meta = tf.RunMetadata()
    sess.run(run, feed_dict, tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), meta)
    wtr.add_run_metadata(meta, tag)


def pipe(gen_func, gen_types, map_func=None, map_types=None, para_map=4, prefetch=4, name='pipe'):
    """returns iterator tensors of `gen_types` from generator `gen_func`.
    see `tf.data.Dataset.from_generator`.

    when specified, `map_func` is called on the generator outputs (as
    numpy arrays) and tensors of `map_types` are returned instead.
    `para_map` number of calls are processed in parallel.  `map_func`
    must be stateless.  otherwise simply transform the data in
def vAe(mode,
        src=None,
        tgt=None,
        # model spec
        dim_tgt=8192,
        dim_emb=512,
        dim_rep=1024,
        rnn_layers=3,
        bidirectional=True,
        bidir_stacked=True,
        attentive=False,
        logit_use_embed=True,
        # training spec
        accelerate=1e-4,
        learn_rate=1e-3,
        bos=2,
        eos=1):
    """Build the sequence autoencoder graph with a Gaussian latent and return it as a Record.

    mode selects what is built and must be 'train', 'valid' or 'infer':
    word dropout on the decoder input, latent sampling and the optimizer
    step exist only in 'train'; labels, error rate and losses are built
    for every mode except 'infer'.

    src, tgt are handed to `placeholder` together with an int32
    (None, None) spec (presumably optional tensors that replace the
    placeholders; confirm against `placeholder`).

    The returned Record carries bos, eos, step, rate_keepwd, rate_anneal,
    rate_update, src, tgt, lead, mu, lv, z, state_in, state_ex, logits,
    prob and pred; errt*/loss* are added unless mode is 'infer', and
    train_step only when mode is 'train'.
    """

    # dim_tgt : vocab size
    # dim_emb : model dimension
    # dim_rep : representation dimension
    #
    # unk=0 for word dropout

    assert mode in ('train', 'valid', 'infer')
    self = Record(bos=bos, eos=eos)

    with scope('step'):
        # all three rates are functions of the global step scaled by `accelerate`
        step = self.step = tf.train.get_or_create_global_step()
        rate = accelerate * tf.to_float(step)
        rate_keepwd = self.rate_keepwd = tf.sigmoid(rate)  # keep probability for word dropout
        rate_anneal = self.rate_anneal = tf.tanh(rate)  # weight of the kld loss
        rate_update = self.rate_update = learn_rate / (tf.sqrt(rate) + 1.0)  # learning rate

    with scope('src'):
        src = self.src = placeholder(tf.int32, (None, None), src, 'src')
        src = tf.transpose(src)  # time major order
        src, msk_src, len_src = trim(src, eos)

    with scope('tgt'):
        tgt = self.tgt = placeholder(tf.int32, (None, None), tgt, 'tgt')
        tgt = tf.transpose(tgt)  # time major order
        tgt, msk_tgt, len_tgt = trim(tgt, eos)
        msk_tgt = tf.pad(msk_tgt, ((1, 0), (0, 0)), constant_values=True)
        # pads for decoder : lead=[bos]+tgt -> gold=tgt+[eos]
        lead, gold = tgt, tf.pad(tgt, paddings=((0, 1), (0, 0)), constant_values=eos)
        if 'train' == mode:
            # word dropout: ids are multiplied by a 0/1 sample, so dropped words become id 0
            lead *= tf.to_int32(tf.random_uniform(tf.shape(lead)) < rate_keepwd)
        lead = self.lead = tf.pad(lead, paddings=((1, 0), (0, 0)), constant_values=bos)

    # s : src length
    # t : tgt length plus one padding, either eos or bos
    # b : batch size
    #
    # len_src :  b  aka s
    # msk_src : sb  without padding
    # msk_tgt : tb  with eos
    #
    # lead : tb  with bos
    # gold : tb  with eos

    with scope('embed'):
        # bound of the uniform initializer
        b = (6 / (dim_tgt / dim_emb + 1))**0.5
        embedding = tf.get_variable('embedding', (dim_tgt, dim_emb), initializer=tf.random_uniform_initializer(-b, b))
        emb_tgt = tf.gather(embedding, lead, name='emb_tgt')  # (t, b) -> (t, b, dim_emb)
        emb_src = tf.gather(embedding, src, name='emb_src')  # (s, b) -> (s, b, dim_emb)

    with scope('encode'):  # (s, b, dim_emb) -> (b, dim_emb)
        # reverses each sequence along time within its own length
        reverse = partial(tf.reverse_sequence, seq_lengths=len_src, seq_axis=0, batch_axis=1)

        if bidirectional and bidir_stacked:
            # every layer reads the concatenated fwd/bwd outputs of the previous layer
            for i in range(rnn_layers):
                with scope("rnn{}".format(i + 1)):
                    emb_fwd, _ = layer_rnn(1, dim_emb, name='fwd')(emb_src)
                    emb_bwd, _ = layer_rnn(1, dim_emb, name='bwd')(reverse(emb_src))
                    hs = emb_src = tf.concat((emb_fwd, reverse(emb_bwd)), axis=-1)
        elif bidirectional:
            # two independent multi-layer rnns, concatenated once at the end
            with scope("rnn"):
                emb_fwd, _ = layer_rnn(rnn_layers, dim_emb, name='fwd')(emb_src)
                emb_bwd, _ = layer_rnn(rnn_layers, dim_emb, name='bwd')(reverse(emb_src))
                hs = tf.concat((emb_fwd, reverse(emb_bwd)), axis=-1)
        else:
            hs, _ = layer_rnn(rnn_layers, dim_emb, name='rnn')(emb_src)

        with scope('cata'):
            # extract the final states from the outputs: bd <- sbd, b2
            h = tf.gather_nd(hs, tf.stack((len_src - 1, tf.range(tf.size(len_src), dtype=tf.int32)), axis=1))
            if attentive:  # todo fixme
                # the values are the outputs from all non-padding steps;
                # the queries are the final states;
                h = layer_nrm(h + tf.squeeze(  # bd <- bd1
                    attention(  # bd1 <- bd1, bds, b1s
                        tf.expand_dims(h, axis=2),  # query: bd1 <- bd
                        tf.transpose(hs, (1, 2, 0)),  # value: bds <- sbd
                        tf.log(
                            tf.to_float(  # -inf,0 mask: b1s <- sb <- bs
                                tf.expand_dims(tf.transpose(msk_src), axis=1))),
                        int(h.shape[-1])),
                    2))

    with scope('latent'):  # (b, dim_emb) -> (b, dim_rep) -> (b, dim_emb)
        # h = layer_aff(h, dim_emb, name='in')
        mu = self.mu = layer_aff(h, dim_rep, name='mu')
        lv = self.lv = layer_aff(h, dim_rep, name='lv')
        with scope('z'):
            h = mu
            if 'train' == mode:
                # sample around mu; lv is used as a log variance (std = exp(0.5 * lv))
                h += tf.exp(0.5 * lv) * tf.random_normal(shape=tf.shape(lv))
            self.z = h
        h = layer_aff(h, dim_emb, name='ex')

    with scope('decode'):  # (b, dim_emb) -> (t, b, dim_emb) -> (?, dim_emb)
        # the same vector initializes every rnn layer
        h = self.state_in = tf.stack((h, ) * rnn_layers)
        # chained assignment: h receives the outputs, self.state_ex the single final state
        h, _ = _, (self.state_ex, ) = layer_rnn(rnn_layers, dim_emb, name='rnn')(emb_tgt, initial_state=(h, ))
        # outside inference only the positions selected by msk_tgt are kept
        if 'infer' != mode: h = tf.boolean_mask(h, msk_tgt)
        h = layer_aff(h, dim_emb, name='out')

    with scope('logits'):  # (?, dim_emb) -> (?, dim_tgt)
        if logit_use_embed:
            # reuse the scaled, transposed embedding matrix as output projection
            logits = self.logits = tf.tensordot(h, (dim_emb**-0.5) * tf.transpose(embedding), 1)
        else:
            logits = self.logits = layer_aff(h, dim_tgt)

    with scope('prob'): prob = self.prob = tf.nn.softmax(logits)
    with scope('pred'): pred = self.pred = tf.argmax(logits, -1, output_type=tf.int32)

    if 'infer' != mode:
        labels = tf.boolean_mask(gold, msk_tgt, name='labels')
        with scope('errt'):
            # per-position prediction error and its mean
            errt_samp = self.errt_samp = tf.to_float(tf.not_equal(labels, pred))
            errt = self.errt = tf.reduce_mean(errt_samp)
        with scope('loss'):
            with scope('loss_gen'):
                loss_gen_samp = self.loss_gen_samp = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
                loss_gen = self.loss_gen = tf.reduce_mean(loss_gen_samp)
            with scope('loss_kld'):
                loss_kld_samp = self.loss_kld_samp = 0.5 * (tf.square(mu) + tf.exp(lv) - lv - 1.0)
                loss_kld = self.loss_kld = tf.reduce_mean(loss_kld_samp)
            # the kld term is weighted by the annealing rate
            loss = self.loss = rate_anneal * loss_kld + loss_gen

    if 'train' == mode:
        with scope('train'):
            # minimize() receives `step` as the global step to increment
            train_step = self.train_step = tf.train.AdamOptimizer(rate_update).minimize(loss, step)

    return self
# load sentencepiece model vocab = sp.load_spm(path_vocab) # Load the model model = vAe('infer') # Restore the session sess = tf.InteractiveSession() tf.train.Saver().restore(sess, path_ckpt) ################################ # deterministic representation # ################################ # encode text with sentence piece model data = list(map(partial(sp.encode_capped, vocab), text)) data = vpack(data, (len(data), max(map(len, data))), vocab.eos_id(), np.int32) # calculate z for the test data in batches inpt = [model.z.eval({model.src: data[i:j]}) for i, j in partition(len(data), 128)] inpt = np.concatenate(inpt, axis=0) np.save(path_emb, inpt) ####################################################### # averaged representation with sentencepiece sampling # ####################################################### def infer_avg(sent, samples=128): bat = [sp.encode_capped_sample(vocab, sent) for _ in range(samples)] bat = vpack(bat, (len(bat), max(map(len, bat))), vocab.eos_id(), np.int32)
def updateNotes( allDb ):
    '''Recompute MorphMan tags and fields for every enabled note and reorder new cards.

    For each note whose model has 'enabled' set, the non-blacklisted morphemes of
    its morph fields are compared with the seen/known/mature databases derived
    from allDb. From that the Morph Man Index (mmi) is computed, the card-type tag
    and the informational fields are rewritten, and changed notes are written back
    in one batch. New cards (type = 0) of notes with an mmi get that mmi as due.

    :param allDb: morpheme database (provides fidDb(), locDb() and db)
    '''
    t_0, now, db, TAG   = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi         = [], {}
    N_notes             = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Updating data', max=N_notes, immediate=True )
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False ) # fidDb() already forces locDb recalc

    # handle secondary databases
    mw.progress.update( label='Creating seen/known/mature from all.db' )
    seenDb      = filterDbByMat( allDb, cfg1('threshold_seen') )
    knownDb     = filterDbByMat( allDb, cfg1('threshold_known') )
    matureDb    = filterDbByMat( allDb, cfg1('threshold_mature') )
    mw.progress.update( label='Loading priority.db' )
    priorityDb  = MorphDb( cfg1('path_priority'), ignoreErrors=True ).db

    if cfg1('saveDbs'):
        mw.progress.update( label='Saving seen/known/mature dbs' )
        seenDb.save( cfg1('path_seen') )
        knownDb.save( cfg1('path_known') )
        matureDb.save( cfg1('path_mature') )

    mw.progress.update( label='Calculating frequency information' )

    mw.progress.update( label='Updating notes' )
    i = 0 # keeps the progress updates after the loop valid when the notes table is empty
    usedTagNames = set() # tag names of every processed model, registered once after the loop
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )
        if not C('enabled'): continue

        # Get all morphemes for note
        ms = set()
        for fieldName in C('morph_fields'):
            try:
                loc = fidDb[ ( nid, guid, fieldName ) ]
                ms.update( locDb[ loc ] )
            except KeyError: continue
        ms = [ m for m in ms if m.pos not in C('morph_blacklist') ]

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for m in ms:
            if m not in seenDb.db:      unseens.add( m )
            if m not in knownDb.db:     unknowns.add( m )
            if m not in matureDb.db:    unmatures.add( m )
            if m not in matureDb.db and m in knownDb.db:
                newKnowns.add( m )

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len( ms ), len( unseens ), len( unknowns ), len( unmatures )

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'): continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = sum( len( allDb.db[ m ] ) for m in unknowns )
        F_k_avg = F_k / N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for m in unknowns:
            if m in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for m in newKnowns:
            locs = allDb.db[ m ]
            if locs:
                # NOTE(review): min( 1, ... ) caps the interval at 1; max( 1, ... ) may have been intended - confirm
                ivl = min( 1, max( loc.maturity for loc in locs ) )
                if ivl != 0: # a zero maturity would otherwise abort the whole update with ZeroDivisionError
                    usefulness += C('reinforce new vocab weight') / ivl #TODO: maybe average this so it doesnt favor long sentences

        if any( m.pos == u'動詞' for m in unknowns ): #FIXME: this isn't working???
            usefulness += C('verb bonus')

        usefulness = 999 - min( 999, usefulness )

        # difference from optimal length (too little context vs long sentence)
        lenDiff = max( 0, min( 9, abs( C('optimal sentence length') - N ) -2 ) )

        # calculate mmi
        mmi = 10000*N_k + 1000*lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[ nid ] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split( tags ), splitFields( flds )
        # determine card type
        compTag, vocabTag, notReadyTag, alreadyKnownTag, priorityTag = tagNames = C('tag_comprehension'), C('tag_vocab'), C('tag_notReady'), C('tag_alreadyKnown'), C('tag_priority')
        usedTagNames.update( tagNames )
        if N_m == 0:    # sentence comprehension card, m+0
            ts = [ compTag ] + [ t for t in ts if t not in [ vocabTag, notReadyTag ] ]
            setField( mid, fs, C('focusMorph'), u'' )
        elif N_k == 1:  # new vocab card, k+1
            ts = [ vocabTag ] + [ t for t in ts if t not in [ compTag, notReadyTag ] ]
            focusMorph = next( iter( unknowns ) ) # the single unknown morpheme of this note
            setField( mid, fs, C('focusMorph'), u'%s' % focusMorph.base )
        elif N_k > 1:   # M+1+ and K+2+
            ts = [ notReadyTag ] + [ t for t in ts if t not in [ compTag, vocabTag ] ]

        # set type agnostic fields
        setField( mid, fs, C('k+N'), u'%d' % N_k )
        setField( mid, fs, C('m+N'), u'%d' % N_m )
        setField( mid, fs, C('morphManIndex'), u'%d' % mmi )
        setField( mid, fs, C('unknowns'), u', '.join( u.base for u in unknowns ) )
        setField( mid, fs, C('unmatures'), u', '.join( u.base for u in unmatures ) )
        setField( mid, fs, C('unknownFreq'), u'%d' % F_k_avg )

        # other tags
        if priorityTag in ts:   ts.remove( priorityTag )
        if isPriority:          ts.append( priorityTag )

        # update sql db
        tags_ = TAG.join( TAG.canonify( ts ) )
        flds_ = joinFields( fs )
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum( fs[0] )
            sfld = stripHTML( fs[ getSortFieldIndex( mid ) ] )
            ds.append( { 'now':now, 'tags':tags_, 'flds':flds_, 'sfld':sfld, 'csum':csum, 'usn':mw.col.usn(), 'nid':nid } )

    mw.progress.update( value=i, label='Updating anki database...' )
    mw.col.db.executemany( 'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid', ds )
    # nothing to register when no note got as far as reading its tag names
    if usedTagNames:
        TAG.register( list( usedTagNames ) )

    # Now reorder new cards based on MMI
    mw.progress.update( value=i, label='Updating new card ordering...' )
    ds = []
    for ( cid, nid, due ) in db.execute( 'select id, nid, due from cards where type = 0' ):
        if nid in nid2mmi: # owise it was disabled
            due_ = nid2mmi[ nid ]
            if due != due_: # only update cards that have changed
                ds.append( { 'now':now, 'due':due_, 'usn':mw.col.usn(), 'cid':cid } )
    mw.col.db.executemany( 'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds )
    mw.reset()

    printf( 'Updated notes in %f sec' % ( time.time() - t_0 ) )
    mw.progress.finish()
def __init__(self, tree, filename, firstlineno=1, nopassmacros=None,
             varnames=None, argcount=0, codename='*', parent=None,
             linemap=None, codeflags=0, docstring=None):
    """Set up the bookkeeping of a code block, then compile `tree` into it.

    Once all tables are initialised the tree is compiled, the block is
    closed, deferred pieces are compiled and the optimized code is
    arranged into `self.codestring`.
    """
    self.code = []
    self.filename = filename
    self.name = codename
    self.argcount = argcount
    self.var_translators = []

    ## Maps ids of tuples/symbols in the input to
    # (filename, startline, endline) tuples.
    self.linemap = {} if linemap is None else linemap

    ## Arguments that are either local or cell.
    self.varnames = [] if varnames is None else varnames

    ## Arguments that are either global, local, or cell.
    self.unknowns = []

    ## Other names needed in the function.
    self.names = []

    ## Constants used in the code block; the docstring comes first.
    self.constants = [docstring]

    ## Maps variable names to the variable-name list they want to
    # belong to.
    self.declareds = {}

    ## Variable names taken from the function's closure (the
    # func_closure attribute).
    #
    # Whenever a name is appended here, make sure the corresponding
    # parent block turns its variable into a cellvar.
    self.freevars = []

    ## Variable names referenced by nested blocks; they may appear in
    # varnames as well.
    self.cellvars = []

    ## Maximum depth the value stack might reach while this code block
    # executes.
    self.maxstacklevel = 0
    self.stacklevel = 0

    ## 0x2000: future division (standard for Noodle); 0x4: *arguments;
    # 0x8: **kwargs; 0x20: generator
    self.codeflags = 0x2000 | codeflags

    ## Line number table
    self.lnotab = []
    self.lnotab_last_address = 0
    self.lnotab_last_lineno = firstlineno
    self.curlineno = firstlineno
    self.firstlineno = firstlineno

    self.nopassmacros = {} if nopassmacros is None else nopassmacros
    self.parent = parent

    self.setup_array_control()

    # Nested functions and classes share this block's file name and line
    # map and get this block as their parent.
    nested = dict(filename=self.filename, linemap=self.linemap, parent=self)
    self.subfunc = partial(NoodleFunction, **nested)
    self.subclass = partial(NoodleClass, **nested)

    # scope name -> (opcode, name list the opcode indexes into)
    local_names, cell_names = self.varnames, self.cellvars
    free_names, global_names = self.freevars, self.names
    self.loadscopes = {
        'local': ('LOAD_FAST', local_names),
        'cell': ('LOAD_DEREF', cell_names),
        'free': ('LOAD_DEREF', free_names),
        'global': ('LOAD_GLOBAL', global_names),
    }
    self.storescopes = {
        'local': ('STORE_FAST', local_names),
        'cell': ('STORE_DEREF', cell_names),
        'free': ('STORE_DEREF', free_names),
        'global': ('STORE_GLOBAL', global_names),
    }
    self.delscopes = {
        'local': ('DELETE_FAST', local_names),
        'global': ('DELETE_GLOBAL', global_names),
    }

    self.extra_setup()
    self.compile_piece(tree)
    self.close_block()
    self.compile_deferreds()

    optimized = optimizing.OptimizeBytecode(self, self.code)
    self.codestring = self.arrange_code(optimized)
#!/usr/bin/env python3 from util import comp, partial, PointedIndex from util_io import path, load_meta, load from util_np import np, vpack names, texts = load_meta() chars = {char for text in texts for char in text} chars.remove("\n") chars.remove(" ") index = PointedIndex(" \n" + "".join(sorted(chars))) texts = vpack( map(comp(partial(np.fromiter, dtype=np.uint8), partial(map, index)), texts), index("\n")) np.save("trial/data/index", index.vec) np.save("trial/data/texts", texts) np.save("trial/data/names", names) for name in names: np.save("trial/data/grams/" + name, load(path(name)))
#!/usr/bin/env python3 path = "trial/data" from os.path import join from util import partial, PointedIndex from util_io import load, chartab, encode from util_np import np, vpack src = list(load(join(path, "train_src"))) tgt = list(load(join(path, "train_tgt"))) idx_src = PointedIndex(chartab(src)) idx_tgt = PointedIndex(chartab(tgt)) enc_src = partial(encode, idx_src) enc_tgt = partial(encode, idx_tgt) assert 1 == idx_src("\n") == idx_tgt("\n") pack = lambda txt: vpack(map(partial(np.array, dtype=np.uint8), txt), fill=1) np.save(join(path, "index_src"), idx_src.vec) np.save(join(path, "index_tgt"), idx_tgt.vec) np.save(join(path, "train_src"), pack(map(enc_src, src))) np.save(join(path, "train_tgt"), pack(map(enc_tgt, tgt))) np.save(join(path, "valid_src"), pack(map(enc_src, load(join(path, "valid_src"))))) np.save(join(path, "valid_tgt"), pack(map(enc_tgt, load(join(path, "valid_tgt")))))
def my_getNewCard( self, _old ):
    '''Continually call _getNewCard until we get one with a focusMorph we haven't
    seen before. Also skip bad vocab cards.

    Cards are returned untouched when the feature is disabled for the active
    deck, when the note matches no filter, when its filter has 'Modify' off, or
    when the focus morph field is missing. Otherwise a card is buried and the
    next one is tried as long as one of the configured skip conditions holds.

    :type self: anki.sched.Scheduler
    :type _old: Callable
    :return: the next new card, or None when there are no more cards
    '''

    while True:
        C = partial( cfg, None, self.col.decks.active()[0] )
        if not C('next new card feature'):
            return _old( self )
        if not C('new card merged fill'):
            c = _old( self )    # type: anki.cards.Card
        else:   # pop from opposite direction and skip sibling spacing
            if not self._fillNew(): return
            cardId, _due = self._newQueue.pop( 0 )
            c = self.col.getCard( cardId )
            self.newCount -= 1

        if not c: return    # no more cards
        n = c.note()

        # find the right filter for this note, so we can apply model-dependent options (modify off == disable skip feature)
        from util import getFilter
        notefilter = getFilter(n)
        if notefilter is None: return c # this note is not configured in any filter -> proceed like normal without MorphMan-plugin
        if not notefilter['Modify']: return c # the deck should not be modified -> the user probably doesn't want the 'skip mature' feature

        # get the focus morph
        try: focusMorph = focus( n )    # field contains either the focusMorph or is empty
        except KeyError:
            tooltip( _( 'Encountered card without the \'focus morph\' field configured in the preferences. Please check your MorphMan settings and note models.') )
            return c    # card has no focusMorph field -> undefined behavior -> just proceed like normal

        # evaluate all conditions, on which this card might be skipped/buried
        isComprehensionCard = n.hasTag(jcfg('Tag_Comprehension'))
        isFreshVocab = n.hasTag(jcfg('Tag_Fresh'))
        isAlreadyKnown = n.hasTag( jcfg('Tag_AlreadyKnown') )

        skipComprehension = jcfg('Option_SkipComprehensionCards')
        skipFresh = jcfg('Option_SkipFreshVocabCards')
        skipFocusMorphSeenToday = jcfg('Option_SkipFocusMorphSeenToday')

        skipCondition1 = (isComprehensionCard and skipComprehension)
        skipCondition2 = (isFreshVocab and skipFresh)
        skipCondition3 = isAlreadyKnown # the user requested that the vocabulary does not have to be shown
        skipCondition4 = (focusMorph in seenMorphs and skipFocusMorphSeenToday) # we already learned that/saw that today
        # cards that are neither vocab nor not-ready cards are deliberately not skipped:
        # even if it is not a good vocabulary card, we have no choice when there are no other cards available

        # skip/bury card if any skip condition is true
        if skipCondition1 or skipCondition2 or skipCondition3 or skipCondition4:
            self.buryCards( [ c.id ] )
            self.newCount += 1 # the card was queried from the "new queue" so we have to increase the "new counter" back to its original value
            continue

        return c