def main():
    """Full recalc: rebuild all.db, merge ext.db, then update notes and stats."""
    # load existing all.db (optional, controlled by the 'loadAllDb' config flag)
    mw.progress.start( label='Loading existing all.db', immediate=True )
    loadStart = time.time()
    existingDb = util.allDb() if cfg1('loadAllDb') else None
    printf( 'Loaded all.db in %f sec' % ( time.time() - loadStart ) )
    mw.progress.finish()

    # update all.db
    allDb = mkAllDb( existingDb )

    # merge in external.db
    mw.progress.start( label='Merging ext.db', immediate=True )
    externalDb = MorphDb( cfg1('path_ext'), ignoreErrors=True )
    allDb.merge( externalDb )
    mw.progress.finish()

    # update notes
    knownDb = updateNotes( allDb )

    # update stats and refresh display
    stats.updateStats( knownDb )
    mw.toolbar.draw()

    # set global allDb
    util._allDb = allDb
def highlight(txt, extra, fieldDict, field, mod_field):
    '''When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity'''
    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub(sub, repl, string):
        return u''.join(
            re.sub(sub, repl, s) if not s.startswith('<span') else s
            for s in re.split('(<span.*?</span>)', string))

    from morphemes import getMorphemes
    ms = getMorphemes(txt)

    for m in sorted(ms, key=lambda x: len(x.inflected), reverse=True):  # largest subs first
        locs = allDb().db.get(m, set())
        mat = max(loc.maturity for loc in locs) if locs else 0
        if mat >= cfg1('threshold_mature'):
            mtype = 'mature'
        elif mat >= cfg1('threshold_known'):
            mtype = 'known'
        elif mat >= cfg1('threshold_seen'):
            mtype = 'seen'
        else:
            mtype = 'unknown'
        repl = u'<span class="morphHighlight" mtype="{mtype}" mat="{mat}">{morph}</span>'.format(
            morph=m.inflected, mtype=mtype, mat=mat)
        # BUG FIX: escape the morph before using it as a regex pattern, and pass the
        # replacement as a callable so regex metacharacters / backslashes in the morph
        # or in the span markup are treated literally instead of crashing re.sub
        txt = nonSpanSub(re.escape(m.inflected), lambda match, repl=repl: repl, txt)
    return txt
def main():
    """Run a full recalc: rebuild all.db, merge ext.db, refresh notes and stats."""
    mw.progress.start(label='Loading existing all.db', immediate=True)
    started = time.time()
    prior = util.allDb() if cfg1('loadAllDb') else None
    printf('Loaded all.db in %f sec' % (time.time() - started))
    mw.progress.finish()

    allDb = mkAllDb(prior)  # update all.db

    # merge in external.db
    mw.progress.start(label='Merging ext.db', immediate=True)
    allDb.merge(MorphDb(cfg1('path_ext'), ignoreErrors=True))
    mw.progress.finish()

    knownDb = updateNotes(allDb)  # update notes

    # update stats and refresh display
    stats.updateStats(knownDb)
    mw.toolbar.draw()

    util._allDb = allDb  # set global allDb
def run( duelingSubsPath, outputSubsPath, morphemizer, matureFmt, knownFmt, unknownFmt ):
    '''Annotate a dual-language .ass subtitle file: each target/native dialogue pair
    is reformatted with matureFmt / knownFmt / unknownFmt depending on how many of
    its morphemes are unknown (N_k) or unmature (N_m), and written to outputSubsPath.'''
    # Load known/mature morpheme databases
    kdb = MorphDb( cfg1('path_known') )
    mdb = MorphDb( cfg1('path_mature') )
    # FIX: use a context manager so the input file handle is closed even on error
    with codecs.open( duelingSubsPath, 'r', 'utf-8' ) as subFile:
        subFileLines = subFile.readlines()

    # Get dueling subs: dialogue lines come in (target, native) pairs after the header
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        target, native = dialogueLines[i:i+2]
        target, native, pre = getText( target ), getText( native ), getPreText( target )

        # get unknowns
        ms = getMorphemes( morphemizer, target )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )
        d = { 'target':target, 'native':native, 'N_k':N_k, 'N_m':N_m,
              'unknowns':unknowns, 'unmatures':unmatures }

        if N_m == 0:    # fully mature line
            lines.append( pre + matureFmt % d )
        elif N_k == 0:  # known but not yet mature
            lines.append( pre + knownFmt % d )
        else:           # contains unknown morphemes
            lines.append( pre + unknownFmt % d )

    # FIX: context manager guarantees the output file is flushed and closed
    with codecs.open( outputSubsPath, 'w', 'utf-8' ) as outFile:
        outFile.write( u''.join( header ) )
        outFile.write( u'\n'.join( lines ) )
def updateStats( knownDb=None ):
    '''Recompute known-morpheme totals and per-goal progress, persist and return them.'''
    mw.progress.start( label='Updating stats', immediate=True )
    from morphemes import MorphDb

    statsDict = {}

    # Load known.db and get total morphemes known
    if knownDb is None:
        knownDb = MorphDb( cfg1('path_known'), ignoreErrors=True )
    statsDict['totalKnown'] = len( knownDb.db )

    # Load Goal.*.db dbs, get morphemes required, and compare vs known.db
    statsDict['goals'] = {}
    for path in glob.glob( os.path.join( cfg1('path_dbs'), 'Goal.*.db' ) ):
        name = os.path.basename( path )[5:-3]  # strip 'Goal.' prefix and '.db' suffix
        gdb = MorphDb( path )
        # track total unique morphemes + when weighted by frequency
        # NOTE: a morpheme may occur multiple times within the same sentence,
        # but this frequency is wrt note fields
        numUniqueReq = numUniqueKnown = numFreqReq = numFreqKnown = 0
        for m, locs in gdb.db.iteritems():
            numUniqueReq += 1
            numFreqReq += len( locs )
            if m in knownDb.db:
                numUniqueKnown += 1
                numFreqKnown += len( locs )
        statsDict['goals'][ name ] = { 'total':numUniqueReq, 'known':numUniqueKnown,
                                       'freqTotal':numFreqReq, 'freqKnown':numFreqKnown }

    saveStats( statsDict )
    mw.progress.finish()
    return statsDict
def run( duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt ):
    '''Annotate a dual-sub .ass file: each (jpn, eng) dialogue pair is rewritten with
    matureFmt / knownFmt / unknownFmt depending on unknown (N_k) and unmature (N_m)
    morpheme counts, then written to outputSubsPath.'''
    # Load known/mature morpheme databases
    kdb = MorphDb( cfg1('path_known') )
    mdb = MorphDb( cfg1('path_mature') )
    # FIX: use a context manager so the input file handle is closed even on error
    with codecs.open( duelingSubsPath, 'r', 'utf-8' ) as subFile:
        subFileLines = subFile.readlines()

    # Get dueling subs: dialogue lines come in (jpn, eng) pairs after the header
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        jpn, eng = dialogueLines[i:i+2]
        jpn, eng, pre = getText( jpn ), getText( eng ), getPreText( jpn )

        # get unknowns
        ms = getMorphemes( jpn, whitelist, blacklist )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )
        d = { 'jpn':jpn, 'eng':eng, 'N_k':N_k, 'N_m':N_m,
              'unknowns':unknowns, 'unmatures':unmatures }

        if N_m == 0:    # fully mature line
            lines.append( pre + matureFmt % d )
        elif N_k == 0:  # known but not yet mature
            lines.append( pre + knownFmt % d )
        else:           # contains unknown morphemes
            lines.append( pre + unknownFmt % d )

    # FIX: context manager guarantees the output file is flushed and closed
    with codecs.open( outputSubsPath, 'w', 'utf-8' ) as outFile:
        outFile.write( u''.join( header ) )
        outFile.write( u'\n'.join( lines ) )
def my_reviewer_keyHandler(self, evt):
    ''' :type self: aqt.reviewer.Reviewer '''
    # Map the pressed key to the configured MorphMan review shortcuts.
    key = unicode(evt.text())
    key_browse = cfg1('browse same focus key')
    key_skip = cfg1('set known and skip key')
    if key == key_skip:
        setKnownAndSkip(self)
    elif key == key_browse:
        browseSameFocus(self)
def mkAllDb( allDb=None ):
    '''Build (or incrementally refresh) the all.db MorphDb from every enabled note
    in the collection. Reuses cached locations from an existing allDb when the
    field text and card maturities are unchanged, re-analyzing only what moved.'''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb( recalc=False ) # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0: mw.progress.update( value=i )  # throttle UI updates
        C = partial( cfg, mid, None )
        if not C('enabled'): continue
        # maturity per card: new-but-learning cards (ivl == 0, type 1) count as 0.5
        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        ts, alreadyKnownTag = TAG.split( tags ), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            # manually-tagged notes are treated as beyond mature
            mats += [ C('threshold_mature')+1 ]

        for fieldName in C('morph_fields'):
            try: # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField( fieldName, flds, mid )
            except KeyError: continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                # never seen this note field before -> analyze and cache it
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes( fieldValue )
                if ms: #TODO: this needed? should we change below too then?
                    #printf( ' .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf( ' .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf( ' .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes( fieldValue )
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )

    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb
def per( st, n ):
    '''Collect every morpheme from the note's configured fields into st['morphDb'].'''
    mats = mw.col.db.list( 'select ivl from cards where nid = :nid', nid=n.id )
    for fieldName in cfg( n.mid, None, 'morph_fields' ):
        morphs = getMorphemes( n[ fieldName ], None, cfg1('morph_blacklist') )
        location = AnkiDeck( n.id, fieldName, n[ fieldName ], n.guid, mats )
        st['morphDb'].addMsL( morphs, location )
    return st
def getMorphemesMecab(e):
    '''Run mecab on expression `e` and return its morphemes, with blacklisted
    parts-of-speech removed and readings normalized.'''
    nodes = (tuple(line.split('\t')) for line in interact(e).split('\r'))
    # keep only well-formed mecab nodes; anything else is garbage
    morphs = [Morpheme(*fields) for fields in nodes if len(fields) == MECAB_NODE_LENGTH]
    blacklist = cfg1('mecab_blacklist')
    if blacklist:
        morphs = [m for m in morphs if m.pos not in blacklist]
    return [fixReading(m) for m in morphs]
def highlight( txt, extra, fieldDict, field, mod_field ):
    '''When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity'''
    from util import getFilterByTagsAndType
    from morphemizer import getMorphemizerByName
    from morphemes import getMorphemes

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub( sub, repl, string ):
        return u''.join( re.sub( sub, repl, s ) if not s.startswith('<span') else s
                         for s in re.split( '(<span.*?</span>)', string ) )

    # find morphemizer; because no note/card information is exposed through
    # arguments, we have to find the morphemizer based on tags alone
    tags = fieldDict['Tags'].split()
    noteFilter = getFilterByTagsAndType(fieldDict['Type'], tags)  # renamed: don't shadow builtin filter()
    if noteFilter is None: return txt
    morphemizer = getMorphemizerByName(noteFilter['Morphemizer'])
    if morphemizer is None: return txt

    ms = getMorphemes(morphemizer, txt, tags)
    for m in sorted( ms, key=lambda x: len(x.inflected), reverse=True ): # largest subs first
        locs = allDb().db.get( m, set() )
        mat = max( loc.maturity for loc in locs ) if locs else 0
        if mat >= cfg1( 'threshold_mature' ): mtype = 'mature'
        elif mat >= cfg1( 'threshold_known' ): mtype = 'known'
        elif mat >= cfg1( 'threshold_seen' ): mtype = 'seen'
        else: mtype = 'unknown'
        repl = u'<span class="morphHighlight" mtype="{mtype}" mat="{mat}">{morph}</span>'.format(
                morph = m.inflected, mtype = mtype, mat = mat )
        # BUG FIX: escape the morph before using it as a regex pattern, and pass the
        # replacement as a callable so metacharacters/backslashes stay literal
        txt = nonSpanSub( re.escape( m.inflected ), lambda match, repl=repl: repl, txt )
    return txt
def per( st, n ): # :: State -> Note -> State
    '''Add st['tags'] to the note if any configured field contains a morpheme
    that is present in st['db'].'''
    #n.delTag( st['tags'] ) # clear tags if they already exist?
    for field in cfg( n.mid, None, 'morph_fields' ):
        morphs = getMorphemes( n[ field ], None, cfg1('morph_blacklist') )
        if any( m in st['db'].db for m in morphs ):
            n.addTag( st['tags'] )
    n.flush()
    return st
def onExtractTxtFile( self ):
    '''Ask for a source text file and a destination, then extract a morpheme db
    using the morphemizer currently selected in the combo box.'''
    srcPath = QFileDialog.getOpenFileName( caption='Text file to extract from?', directory=dbsPath )
    if not srcPath:
        return
    destPath = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=dbsPath + os.sep + 'textFile.db' )
    if not destPath:
        return
    mat = cfg1('text file import maturity')
    morphemizer = getAllMorphemizers()[ self.morphemizerComboBox.currentIndex() ]
    db = MorphDb.mkFromFile( str(srcPath), morphemizer, mat )
    if db:
        db.save( str(destPath) )
        infoMsg( 'Extracted successfully' )
def onExtractTxtFile( self ):
    '''Ask for a source text file and a destination, then extract a morpheme db.'''
    srcPath = QFileDialog.getOpenFileName( caption='Text file to extract from?', directory=dbsPath )
    if not srcPath:
        return
    destPath = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=dbsPath + os.sep + 'textFile.db' )
    if not destPath:
        return
    mat = cfg1('text file import maturity')
    db = MorphDb.mkFromFile( str(srcPath), mat )
    if db:
        db.save( str(destPath) )
        infoMsg( 'Extracted successfully' )
def per(st, n):  # :: State -> Note -> State
    '''Tag the note with st['tags'] when any configured field contains a morpheme
    found in st['db'].'''
    #n.delTag( st['tags'] ) # clear tags if they already exist?
    for field in cfg(n.mid, None, 'morph_fields'):
        morphs = getMorphemes(n[field], None, cfg1('morph_blacklist'))
        if any(m in st['db'].db for m in morphs):
            n.addTag(st['tags'])
    n.flush()
    return st
def updateStats(knownDb=None):
    '''Recompute known-morpheme totals and per-goal progress (frequency-weighted),
    persist them via saveStats, and return the stats dict.'''
    mw.progress.start(label='Updating stats', immediate=True)
    from morphemes import MorphDb

    results = {}

    # Load known.db and get total morphemes known
    if knownDb is None:
        knownDb = MorphDb(cfg1('path_known'), ignoreErrors=True)
    results['totalKnown'] = len(knownDb.db)

    # Load Goal.*.db dbs, get morphemes required, and compare vs known.db
    results['goals'] = {}
    for path in glob.glob(os.path.join(cfg1('path_dbs'), 'Goal.*.db')):
        name = os.path.basename(path)[5:-3]  # strip 'Goal.' and '.db'
        gdb = MorphDb(path)
        # track total unique morphemes + when weighted by frequency
        # NOTE: a morpheme may occur multiple times within the same sentence,
        # but this frequency is wrt note fields
        numUniqueReq = numUniqueKnown = numFreqReq = numFreqKnown = 0
        for m in gdb.db.iterkeys():
            freq = gdb.db.frequency(m)
            numUniqueReq += 1
            numFreqReq += freq
            if m in knownDb.db:
                numUniqueKnown += 1
                numFreqKnown += freq
        results['goals'][name] = {
            'total': numUniqueReq,
            'known': numUniqueKnown,
            'freqTotal': numFreqReq,
            'freqKnown': numFreqKnown
        }

    saveStats(results)
    mw.progress.finish()
    return results
def per( st, n ):
    '''Re-tag a k+1 note when its focus morph is already present in st['db'].'''
    n.delTag( st['tags'] )
    # FIXME this special but commonly wanted logic must be a cfg option
    if n['k+N'] == '1':
        morphs = getMorphemes( n['focusMorph'], None, cfg1('morph_blacklist') )
        if any( m in st['db'].db for m in morphs ):
            n.addTag( st['tags'] )
    n.flush()
    return st
def markFocusSeen( self, n ):
    '''Mark a focusMorph as already seen so future new cards with the same focus
    will be skipped. Also prints number of cards to be skipped if enabled'''
    global seenMorphs
    try:
        focusValue = focus( n )
        if not focusValue:
            return
        query = u'%s:%s' % ( focusName( n ), focusValue )
    except KeyError:
        return
    seenMorphs.add( focus(n) )
    skipCount = len( self.mw.col.findNotes( query ) ) - 1
    if skipCount and cfg1('print number of alternatives skipped'):
        tooltip( _( '%d alternatives will be skipped' % skipCount ) )
def highlight( txt, extra, fieldDict, field, mod_field ):
    '''When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity'''
    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub( sub, repl, string ):
        return u''.join( re.sub( sub, repl, s ) if not s.startswith('<span') else s
                         for s in re.split( '(<span.*?</span>)', string ) )

    from morphemes import getMorphemes
    ms = getMorphemes( txt )

    for m in sorted( ms, key=lambda x: len(x.inflected), reverse=True ): # largest subs first
        locs = allDb().db.get( m, set() )
        mat = max( loc.maturity for loc in locs ) if locs else 0
        if mat >= cfg1( 'threshold_mature' ): mtype = 'mature'
        elif mat >= cfg1( 'threshold_known' ): mtype = 'known'
        elif mat >= cfg1( 'threshold_seen' ): mtype = 'seen'
        else: mtype = 'unknown'
        repl = u'<span class="morphHighlight" mtype="{mtype}" mat="{mat}">{morph}</span>'.format(
                morph = m.inflected, mtype = mtype, mat = mat )
        # BUG FIX: escape the morph before using it as a regex pattern, and pass the
        # replacement as a callable so metacharacters/backslashes stay literal
        txt = nonSpanSub( re.escape( m.inflected ), lambda match, repl=repl: repl, txt )
    return txt
def getStatsPath():
    # Location of the persisted stats file, taken from config.
    return cfg1('path_stats')

def loadStats():
def mkAllDb(allDb=None):
    '''Build (or incrementally refresh) the all.db MorphDb from every enabled note.
    Cached locations in an existing allDb are reused when field text and card
    maturities are unchanged; only changed fields are re-analyzed.'''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Prep work for all.db creation',
                      max=N_notes, immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)  # throttle UI updates
        C = partial(cfg, mid, None)
        if not C('enabled'):
            continue
        # maturity per card: new-but-learning cards (ivl == 0, type 1) count as 0.5
        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            # manually-tagged notes are treated as beyond mature
            mats += [C('threshold_mature') + 1]

        for fieldName in C('morph_fields'):
            try:  # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                # never seen this note field before -> analyze and cache it
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(fieldValue)
                if ms:  #TODO: this needed? should we change below too then?
                    #printf( ' .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf(' .mats for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf(' .morphs for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(fieldValue)
                    locDb.pop(loc)
                    locDb[newLoc] = ms

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))

    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' %
               (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
def updateNotes( allDb ):
    '''Recalculate every matching note against allDb: set focus-morph and count
    fields, assign comprehension/vocab/fresh/notReady tags, compute the Morph
    Man Index (MMI), write changed notes back to the collection DB, and reorder
    new cards by MMI. Returns the derived knownDb.'''
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Updating data', max=N_notes, immediate=True )
    fidDb = allDb.fidDb()
    locDb = allDb.locDb( recalc=False ) # fidDb() already forces locDb recalc

    # read tag names
    compTag, vocabTag, freshTag, notReadyTag, alreadyKnownTag, priorityTag, tooShortTag, tooLongTag = tagNames = jcfg('Tag_Comprehension'), jcfg('Tag_Vocab'), jcfg('Tag_Fresh'), jcfg('Tag_NotReady'), jcfg('Tag_AlreadyKnown'), jcfg('Tag_Priority'), jcfg('Tag_TooShort'), jcfg('Tag_TooLong')
    TAG.register( tagNames )
    badLengthTag = jcfg2().get('Tag_BadLength')

    # handle secondary databases
    mw.progress.update( label='Creating seen/known/mature from all.db' )
    seenDb = filterDbByMat( allDb, cfg1('threshold_seen') )
    knownDb = filterDbByMat( allDb, cfg1('threshold_known') )
    matureDb = filterDbByMat( allDb, cfg1('threshold_mature') )
    mw.progress.update( label='Loading priority.db' )
    priorityDb = MorphDb( cfg1('path_priority'), ignoreErrors=True ).db
    if cfg1('saveDbs'):
        mw.progress.update( label='Saving seen/known/mature dbs' )
        seenDb.save( cfg1('path_seen') )
        knownDb.save( cfg1('path_known') )
        matureDb.save( cfg1('path_mature') )

    mw.progress.update( label='Updating notes' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:
            mw.progress.update( value=i )  # throttle UI updates
        C = partial( cfg, mid, None )

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None or not notecfg['Modify']: continue

        # Get all morphemes for note
        morphemes = set()
        for fieldName in notecfg['Fields']:
            try:
                loc = fidDb[ ( nid, guid, fieldName ) ]
                morphemes.update( locDb[ loc ] )
            except KeyError: continue

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for morpheme in morphemes:
            if morpheme not in seenDb.db: unseens.add( morpheme )
            if morpheme not in knownDb.db: unknowns.add( morpheme )
            if morpheme not in matureDb.db: unmatures.add( morpheme )
            if morpheme not in matureDb.db and morpheme in knownDb.db:
                newKnowns.add( morpheme )

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len( morphemes ), len( unseens ), len( unknowns ), len( unmatures )

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'): continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns: # focusMorph used outside loop
            F_k += allDb.frequency(focusMorph)
        F_k_avg = F_k // N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for morpheme in newKnowns:
            locs = allDb.db[ morpheme ]
            if locs:
                ivl = min( 1, max( loc.maturity for loc in locs ) )
                usefulness += C('reinforce new vocab weight') // ivl #TODO: maybe average this so it doesnt favor long sentences

        if any( morpheme.pos == u'動詞' for morpheme in unknowns ): #FIXME: this isn't working???
            usefulness += C('verb bonus')

        # invert so that more useful sentences sort earlier (lower MMI)
        usefulness = 999 - min( 999, usefulness )

        # difference from optimal length range (too little context vs long sentence)
        lenDiffRaw = min(N - C('min good sentence length'),
                         max(0, N - C('max good sentence length')))
        lenDiff = min(9, abs(lenDiffRaw))

        # calculate mmi
        mmi = 10000*N_k + 1000*lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[ nid ] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split( tags ), splitFields( flds )

        # clear any 'special' tags, the appropriate will be set in the next few lines
        ts = [ t for t in ts if t not in [ notReadyTag, compTag, vocabTag, freshTag ] ]

        # determine card type
        if N_m == 0:   # sentence comprehension card, m+0
            ts = ts + [ compTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'' )
        elif N_k == 1: # new vocab card, k+1
            ts = ts + [ vocabTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'%s' % focusMorph.base )
        elif N_k > 1:  # M+1+ and K+2+
            ts = ts + [ notReadyTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_m == 1: # we have k+0, and m+1, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [ freshTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'%s' % list(unmatures)[0].base)
        else: # only case left: we have k+0, but m+2 or higher, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [ freshTag ]
            setField( mid, fs, jcfg('Field_FocusMorph'), u'')

        # set type agnostic fields
        setField( mid, fs, jcfg('Field_UnknownMorphCount'), u'%d' % N_k )
        setField( mid, fs, jcfg('Field_UnmatureMorphCount'), u'%d' % N_m )
        setField( mid, fs, jcfg('Field_MorphManIndex'), u'%d' % mmi )
        setField( mid, fs, jcfg('Field_Unknowns'), u', '.join( u.base for u in unknowns ) )
        setField( mid, fs, jcfg('Field_Unmatures'), u', '.join( u.base for u in unmatures ) )
        setField( mid, fs, jcfg('Field_UnknownFreq'), u'%d' % F_k_avg )

        # remove deprecated tag
        if badLengthTag is not None and badLengthTag in ts:
            ts.remove( badLengthTag )

        # other tags
        if priorityTag in ts: ts.remove( priorityTag )
        if isPriority: ts.append( priorityTag )
        if tooShortTag in ts: ts.remove( tooShortTag )
        if lenDiffRaw < 0: ts.append( tooShortTag )
        if tooLongTag in ts: ts.remove( tooLongTag )
        if lenDiffRaw > 0: ts.append( tooLongTag )

        # remove unnecessary tags
        if not jcfg('Option_SetNotRequiredTags'):
            unnecessary = [priorityTag, tooShortTag, tooLongTag]
            ts = [tag for tag in ts if tag not in unnecessary]

        # update sql db
        tags_ = TAG.join( TAG.canonify( ts ) )
        flds_ = joinFields( fs )
        if flds != flds_ or tags != tags_: # only update notes that have changed
            csum = fieldChecksum( fs[0] )
            sfld = stripHTML( fs[ getSortFieldIndex( mid ) ] )
            ds.append( { 'now':now, 'tags':tags_, 'flds':flds_, 'sfld':sfld, 'csum':csum, 'usn':mw.col.usn(), 'nid':nid } )

    mw.progress.update( value=i, label='Updating anki database...' )
    mw.col.db.executemany( 'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid', ds )

    # Now reorder new cards based on MMI
    mw.progress.update( value=i, label='Updating new card ordering...' )
    ds = []
    # "type = 0": new cards
    # "type = 1": learning cards [is supposed to be learning: in my case no learning card had this type]
    # "type = 2": review cards
    for ( cid, nid, due ) in db.execute( 'select id, nid, due from cards where type = 0' ):
        if nid in nid2mmi: # owise it was disabled
            due_ = nid2mmi[ nid ]
            if due != due_: # only update cards that have changed
                ds.append( { 'now':now, 'due':due_, 'usn':mw.col.usn(), 'cid':cid } )
    mw.col.db.executemany( 'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds )
    mw.reset()

    printf( 'Updated notes in %f sec' % ( time.time() - t_0 ) )
    mw.progress.finish()
    return knownDb
def updateNotes(allDb):
    '''Recalculate every enabled note against allDb: set focus-morph and count
    fields, assign comprehension/vocab/notReady tags, compute the Morph Man
    Index (MMI), write changed notes back to the collection DB, and reorder new
    cards by MMI. Returns the derived knownDb.'''
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db
    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Calculating frequency information')
    # NOTE(review): pops is computed but never used below — appears vestigial
    pops = [len(locs) for locs in allDb.db.values()]
    pops = [n for n in pops if n > 1]

    mw.progress.update(label='Updating notes')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)  # throttle UI updates
        C = partial(cfg, mid, None)
        if not C('enabled'):
            continue

        # Get all morphemes for note
        ms = set()
        for fieldName in C('morph_fields'):
            try:
                loc = fidDb[(nid, guid, fieldName)]
                ms.update(locDb[loc])
            except KeyError:
                continue
        ms = [m for m in ms if m.pos not in C('morph_blacklist')]

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for m in ms:
            if m not in seenDb.db:
                unseens.add(m)
            if m not in knownDb.db:
                unknowns.add(m)
            if m not in matureDb.db:
                unmatures.add(m)
            if m not in matureDb.db and m in knownDb.db:
                newKnowns.add(m)

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len(ms), len(unseens), len(unknowns), len(unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'):
            continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += len(allDb.db[focusMorph])
        F_k_avg = F_k / N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for m in newKnowns:
            locs = allDb.db[m]
            if locs:
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C(
                    'reinforce new vocab weight'
                ) / ivl  #TODO: maybe average this so it doesnt favor long sentences

        if any(m.pos == u'動詞' for m in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        # invert so that more useful sentences sort earlier (lower MMI)
        usefulness = 999 - min(999, usefulness)

        # difference from optimal length (too little context vs long sentence)
        lenDiff = max(0, min(9, abs(C('optimal sentence length') - N) - 2))
        tooLong = N > C('optimal sentence length')

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)

        # determine card type
        compTag, vocabTag, notReadyTag, alreadyKnownTag, priorityTag, badLengthTag, tooLongTag = tagNames = C(
            'tag_comprehension'), C('tag_vocab'), C('tag_notReady'), C(
                'tag_alreadyKnown'), C('tag_priority'), C('tag_badLength'), C(
                    'tag_tooLong')
        if N_m == 0:  # sentence comprehension card, m+0
            ts = [compTag
                  ] + [t for t in ts if t not in [vocabTag, notReadyTag]]
            setField(mid, fs, C('focusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = [vocabTag
                  ] + [t for t in ts if t not in [compTag, notReadyTag]]
            setField(mid, fs, C('focusMorph'), u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = [notReadyTag
                  ] + [t for t in ts if t not in [compTag, vocabTag]]

        # set type agnostic fields
        setField(mid, fs, C('k+N'), u'%d' % N_k)
        setField(mid, fs, C('m+N'), u'%d' % N_m)
        setField(mid, fs, C('morphManIndex'), u'%d' % mmi)
        setField(mid, fs, C('unknowns'), u', '.join(u.base for u in unknowns))
        setField(mid, fs, C('unmatures'), u', '.join(u.base
                                                     for u in unmatures))
        setField(mid, fs, C('unknownFreq'), u'%d' % F_k_avg)

        # other tags
        if priorityTag in ts:
            ts.remove(priorityTag)
        if isPriority:
            ts.append(priorityTag)
        if badLengthTag in ts:
            ts.remove(badLengthTag)
        if lenDiff:
            ts.append(badLengthTag)
        if tooLongTag in ts:
            ts.remove(tooLongTag)
        if tooLong:
            ts.append(tooLongTag)

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)
    # NOTE(review): registers the tagNames from the last processed note only
    TAG.register(tagNames)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []
    for (cid, nid,
         due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # owise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb
def mkAllDb(allDb=None):
    '''Build (or incrementally refresh) the all.db MorphDb from every note that
    matches a configured filter, using each filter's morphemizer. Returns the
    db, or None (with an error dialog) if no note matched any filter.'''
    import config
    reload(config)
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    # for providing an error message if there is no note that is used for processing
    N_enabled_notes = 0
    mw.progress.start(label='Prep work for all.db creation',
                      max=N_notes, immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)  # throttle UI updates
        C = partial(cfg, mid, None)

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None:
            continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        # maturity per card: new-but-learning cards (ivl == 0, type 1) count as 0.5
        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            # manually-tagged notes are treated as beyond mature
            mats += [C('threshold_mature') + 1]

        for fieldName in notecfg['Fields']:
            try:  # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = extractFieldData(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                # never seen this note field before -> analyze and cache it
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms:  #TODO: this needed? should we change below too then?
                    #printf( ' .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    #printf( ' .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    #printf( ' .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(morphemizer, fieldValue, ts)
                    locDb.pop(loc)
                    locDb[newLoc] = ms

    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(
            u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorhpMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.'
        )
        return None

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))

    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' %
               (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
def updateNotes(allDb):
    """Recompute per-note MorphMan data from *allDb* and write it back to Anki.

    For every enabled note: collects its morphemes, classifies them as
    un-seen/un-known/un-mature against secondary dbs derived from allDb,
    computes the Morph Man Index (MMI), fills the configured fields/tags,
    batch-updates the notes table, and reorders new cards by MMI.

    :param allDb: MorphDb of all morphemes in the collection.
    :return: the "known" MorphDb (morphemes at/above threshold_known),
             which callers use to update stats.

    Fix: the original fell off the end without ``return knownDb`` although
    the caller does ``knownDb = updateNotes(allDb)`` (see sibling variant).
    """
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db
    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Calculating frequency information')
    pops = [len(locs) for locs in allDb.db.values()]
    pops = [n for n in pops if n > 1]

    mw.progress.update(label='Updating notes')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:  # throttle progress updates
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'):
            continue

        # Get all morphemes for note
        ms = set()
        for fieldName in C('morph_fields'):
            try:
                loc = fidDb[(nid, guid, fieldName)]
                ms.update(locDb[loc])
            except KeyError:
                continue
        ms = [m for m in ms if m.pos not in C('morph_blacklist')]

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for m in ms:
            if m not in seenDb.db:
                unseens.add(m)
            if m not in knownDb.db:
                unknowns.add(m)
            if m not in matureDb.db:
                unmatures.add(m)
            if m not in matureDb.db and m in knownDb.db:
                newKnowns.add(m)

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len(ms), len(unseens), len(unknowns), len(unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'):
            continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += len(allDb.db[focusMorph])
        F_k_avg = F_k / N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for m in newKnowns:
            locs = allDb.db[m]
            if locs:
                # NOTE(review): min(1, ...) caps ivl at 1 rather than
                # flooring it -- pre-existing behavior, kept as-is
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C('reinforce new vocab weight') / ivl  #TODO: maybe average this so it doesnt favor long sentences

        if any(m.pos == u'動詞' for m in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        usefulness = 999 - min(999, usefulness)

        # difference from optimal length (too little context vs long sentence)
        lenDiff = max(0, min(9, abs(C('optimal sentence length') - N) - 2))

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)

        # determine card type
        compTag, vocabTag, notReadyTag, alreadyKnownTag, priorityTag = tagNames = C(
            'tag_comprehension'), C('tag_vocab'), C('tag_notReady'), C(
                'tag_alreadyKnown'), C('tag_priority')
        if N_m == 0:  # sentence comprehension card, m+0
            ts = [compTag] + [t for t in ts if t not in [vocabTag, notReadyTag]]
            setField(mid, fs, C('focusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = [vocabTag] + [t for t in ts if t not in [compTag, notReadyTag]]
            # focusMorph is the (single) unknown left over from the loop above
            setField(mid, fs, C('focusMorph'), u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = [notReadyTag] + [t for t in ts if t not in [compTag, vocabTag]]

        # set type agnostic fields
        setField(mid, fs, C('k+N'), u'%d' % N_k)
        setField(mid, fs, C('m+N'), u'%d' % N_m)
        setField(mid, fs, C('morphManIndex'), u'%d' % mmi)
        setField(mid, fs, C('unknowns'), u', '.join(u.base for u in unknowns))
        setField(mid, fs, C('unmatures'), u', '.join(u.base for u in unmatures))
        setField(mid, fs, C('unknownFreq'), u'%d' % F_k_avg)

        # other tags
        if priorityTag in ts:
            ts.remove(priorityTag)
        if isPriority:
            ts.append(priorityTag)

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    # NOTE(review): `i` and `tagNames` are unbound if the collection has zero
    # notes -- pre-existing behavior, left unchanged
    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)
    TAG.register(tagNames)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []
    for (cid, nid, due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # owise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    # fix: caller expects the known db back (knownDb = updateNotes(allDb))
    return knownDb
def mkAllDb( allDb=None ):
    """Build (or refresh) the all.db morpheme database from every enabled note.

    Older spaced-style variant of mkAllDb; same flow as its sibling:
    extract morphemes per configured field, store them in a location db
    keyed by (nid, guid, fieldName), and rebuild allDb from that.

    :param allDb: existing MorphDb to update in place; a fresh MorphDb is
                  created when None/falsy.
    :return: the populated MorphDb, or None if no note matched any filter.
    """
    import config; reload(config)  # pick up config.py edits without restart
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    N_enabled_notes = 0 # for providing an error message if there is no note that is used for processing
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False )   # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    # throttle progress updates; they are expensive
            mw.progress.update( value=i )
        C = partial( cfg, mid, None )

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None: continue    # note matches no configured filter
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        # maturity per card; a learning card (type 1) with ivl 0 counts as 0.5
        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        if C('ignore maturity'):
            mats = [ 0 for mat in mats ]
        ts, alreadyKnownTag = TAG.split( tags ), jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            # tagged "already known" -> force past the mature threshold
            mats += [ C('threshold_mature')+1 ]

        for fieldName in notecfg['Fields']:
            try: # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = extractFieldData( fieldName, flds, mid )
            except KeyError: continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms: #TODO: this needed? should we change below too then?
                    #printf( '    .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    #printf( '    .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    #printf( '    .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes(morphemizer, fieldValue, ts)
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms

    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorhpMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.')
        return None

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb
def post( st ): ms = getMorphemes( st['txt'], None, cfg1('morph_blacklist') ) s = ms2str( ms ) infoMsg( '----- All -----\n' + s )
def my_reviewer_keyHandler( self, evt ): key = unicode( evt.text() ) key_browse, key_skip = cfg1('browse same focus key'), cfg1('set known and skip key') if key == key_skip: setKnownAndSkip( self ) elif key == key_browse: browseSameFocus( self )
def getStatsPath(): return cfg1('path_stats')
def updateNotes(allDb):
    """Recompute per-note MorphMan data from *allDb* and write it back to Anki.

    For every note matching an enabled filter with 'Modify' set: collects its
    morphemes, classifies them against seen/known/mature dbs derived from
    allDb, computes the Morph Man Index (MMI), fills the configured fields
    and tags, batch-updates the notes table, and finally reorders new cards
    by MMI.

    :param allDb: MorphDb of all morphemes in the collection.
    :return: the "known" MorphDb (morphemes at/above threshold_known).
    """
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # read tag names
    compTag, vocabTag, freshTag, notReadyTag, alreadyKnownTag, priorityTag, tooShortTag, tooLongTag = tagNames = jcfg(
        'Tag_Comprehension'), jcfg('Tag_Vocab'), jcfg('Tag_Fresh'), jcfg(
            'Tag_NotReady'), jcfg('Tag_AlreadyKnown'), jcfg(
                'Tag_Priority'), jcfg('Tag_TooShort'), jcfg('Tag_TooLong')
    TAG.register(tagNames)
    badLengthTag = jcfg2().get('Tag_BadLength')  # deprecated; removed below

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db
    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Updating notes')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:  # throttle progress updates; they are expensive
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None or not notecfg['Modify']:
            continue

        # Get all morphemes for note
        morphemes = set()
        for fieldName in notecfg['Fields']:
            try:
                loc = fidDb[(nid, guid, fieldName)]
                morphemes.update(locDb[loc])
            except KeyError:
                continue

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for morpheme in morphemes:
            if morpheme not in seenDb.db:
                unseens.add(morpheme)
            if morpheme not in knownDb.db:
                unknowns.add(morpheme)
            if morpheme not in matureDb.db:
                unmatures.add(morpheme)
            if morpheme not in matureDb.db and morpheme in knownDb.db:
                newKnowns.add(morpheme)

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len(morphemes), len(unseens), len(unknowns), len(
            unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'):
            continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += allDb.frequency(focusMorph)
        F_k_avg = F_k // N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for morpheme in newKnowns:
            locs = allDb.db[morpheme]
            if locs:
                # NOTE(review): min(1, ...) caps ivl at 1 rather than
                # flooring it -- looks intentional upstream, confirm
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C(
                    'reinforce new vocab weight'
                ) // ivl  #TODO: maybe average this so it doesnt favor long sentences

        if any(morpheme.pos == u'動詞'
               for morpheme in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        # invert so that more useful -> smaller MMI component
        usefulness = 999 - min(999, usefulness)

        # difference from optimal length range (too little context vs long sentence)
        lenDiffRaw = min(N - C('min good sentence length'),
                         max(0, N - C('max good sentence length')))
        lenDiff = min(9, abs(lenDiffRaw))

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)

        # clear any 'special' tags, the appropriate will be set in the next few lines
        ts = [
            t for t in ts if t not in [notReadyTag, compTag, vocabTag, freshTag]
        ]

        # determine card type
        if N_m == 0:  # sentence comprehension card, m+0
            ts = ts + [compTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = ts + [vocabTag]
            # focusMorph is the single unknown left over from the loop above
            setField(mid, fs, jcfg('Field_FocusMorph'), u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = ts + [notReadyTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_m == 1:  # we have k+0, and m+1, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [freshTag]
            setField(mid, fs, jcfg('Field_FocusMorph'),
                     u'%s' % list(unmatures)[0].base)
        else:  # only case left: we have k+0, but m+2 or higher, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [freshTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')

        # set type agnostic fields
        setField(mid, fs, jcfg('Field_UnknownMorphCount'), u'%d' % N_k)
        setField(mid, fs, jcfg('Field_UnmatureMorphCount'), u'%d' % N_m)
        setField(mid, fs, jcfg('Field_MorphManIndex'), u'%d' % mmi)
        setField(mid, fs, jcfg('Field_Unknowns'),
                 u', '.join(u.base for u in unknowns))
        setField(mid, fs, jcfg('Field_Unmatures'),
                 u', '.join(u.base for u in unmatures))
        setField(mid, fs, jcfg('Field_UnknownFreq'), u'%d' % F_k_avg)

        # remove deprecated tag
        if badLengthTag is not None and badLengthTag in ts:
            ts.remove(badLengthTag)

        # other tags
        if priorityTag in ts:
            ts.remove(priorityTag)
        if isPriority:
            ts.append(priorityTag)

        if tooShortTag in ts:
            ts.remove(tooShortTag)
        if lenDiffRaw < 0:
            ts.append(tooShortTag)

        if tooLongTag in ts:
            ts.remove(tooLongTag)
        if lenDiffRaw > 0:
            ts.append(tooLongTag)

        # remove unnecessary tags
        if not jcfg('Option_SetNotRequiredTags'):
            unnecessary = [priorityTag, tooShortTag, tooLongTag]
            ts = [tag for tag in ts if tag not in unnecessary]

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []

    # "type = 0": new cards
    # "type = 1": learning cards [is supposed to be learning: in my case no learning card had this type]
    # "type = 2": review cards
    for (cid, nid, due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # owise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb