def mkAllDb( allDb=None ):
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb( recalc=False )   # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )
        if not C('enabled'): continue

        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        ts, alreadyKnownTag = TAG.split( tags ), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            mats += [ C('threshold_mature')+1 ]

        for fieldName in C('morph_fields'):
            try: # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField( fieldName, flds, mid )
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes( fieldValue )
                if ms: #TODO: this needed? should we change below too then?
                    #printf( ' .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf( ' .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf( ' .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes( fieldValue )
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb
def highlight(txt, extra, fieldDict, field, mod_field):
    '''When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity'''

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub(sub, repl, string):
        return u''.join(
            re.sub(sub, repl, s) if not s.startswith('<span') else s
            for s in re.split('(<span.*?</span>)', string))

    from morphemes import getMorphemes
    ms = getMorphemes(txt)

    for m in sorted(ms, key=lambda x: len(x.inflected), reverse=True):  # largest subs first
        locs = allDb().db.get(m, set())
        mat = max(loc.maturity for loc in locs) if locs else 0

        if mat >= cfg1('threshold_mature'):
            mtype = 'mature'
        elif mat >= cfg1('threshold_known'):
            mtype = 'known'
        elif mat >= cfg1('threshold_seen'):
            mtype = 'seen'
        else:
            mtype = 'unknown'
        repl = u'<span class="morphHighlight" mtype="{mtype}" mat="{mat}">{morph}</span>'.format(
            morph=m.inflected, mtype=mtype, mat=mat)
        txt = nonSpanSub(m.inflected, repl, txt)
    return txt
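# --- Hedged sketch (illustration only, not part of the add-on) --------------
# Shows why highlight() substitutes the longest inflected form first and why
# nonSpanSub() leaves anything already wrapped in <span> untouched. The morphs
# and maturity labels below are made-up example data.
def _demo_nonSpanSub():
    import re

    def nonSpanSub(sub, repl, string):
        # only substitute inside segments that are not already a <span>...</span>
        return u''.join(
            re.sub(sub, repl, s) if not s.startswith('<span') else s
            for s in re.split('(<span.*?</span>)', string))

    txt = u'食べました'
    # longest morph first...
    txt = nonSpanSub(u'食べ', u'<span mtype="known">食べ</span>', txt)
    # ...then the shorter one; the already-wrapped '食べ' is skipped, so the
    # shorter '食' does not get nested inside the existing span
    txt = nonSpanSub(u'食', u'<span mtype="mature">食</span>', txt)
    return txt  # u'<span mtype="known">食べ</span>ました'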
def per( st, f ):
    ms = m.getMorphemes( st['mp'], f[ 'Expression' ], bs=st['bs'] )
    us = []
    for x in ms:
        if not x in st['kdb']:
            us += [ x[0] ]
    f[ 'unknowns' ] = u','.join( us )
    return st
def run( duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt ):
    # Load files
    kdb = MorphDb( cfg1('path_known') )
    mdb = MorphDb( cfg1('path_mature') )
    subFileLines = codecs.open( duelingSubsPath, 'r', 'utf-8' ).readlines()

    # Get dueling subs
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        jpn, eng = dialogueLines[i:i+2]
        jpn, eng, pre = getText( jpn ), getText( eng ), getPreText( jpn )

        # get unknowns
        ms = getMorphemes( jpn, whitelist, blacklist )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )

        d = { 'jpn':jpn, 'eng':eng, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }
        if N_m == 0:
            lines.append( pre + matureFmt % d )
        elif N_k == 0:
            lines.append( pre + knownFmt % d )
        else:
            lines.append( pre + unknownFmt % d )

    outFile = codecs.open( outputSubsPath, 'w', 'utf-8' )
    outFile.write( u''.join( header ) )
    outFile.write( u'\n'.join( lines ) )
    outFile.close()
def per( st, f ):
    ms = m.getMorphemes( st['mp'], f[ 'Expression' ], bs=st['bs'] )
    N = 0
    for x in ms:
        if not x in st['kdb']:
            N += 1
    f[ 'iPlusN' ] = u'%d' % N
    return st
def run( duelingSubsPath, outputSubsPath, morphemizer, matureFmt, knownFmt, unknownFmt ):
    # Load files
    kdb = MorphDb( cfg1('path_known') )
    mdb = MorphDb( cfg1('path_mature') )
    subFileLines = codecs.open( duelingSubsPath, 'r', 'utf-8' ).readlines()

    # Get dueling subs
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        target, native = dialogueLines[i:i+2]
        target, native, pre = getText( target ), getText( native ), getPreText( target )

        # get unknowns
        ms = getMorphemes(morphemizer, target)
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )

        d = { 'target':target, 'native':native, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }
        if N_m == 0:
            lines.append( pre + matureFmt % d )
        elif N_k == 0:
            lines.append( pre + knownFmt % d )
        else:
            lines.append( pre + unknownFmt % d )

    outFile = codecs.open( outputSubsPath, 'w', 'utf-8' )
    outFile.write( u''.join( header ) )
    outFile.write( u'\n'.join( lines ) )
    outFile.close()
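# --- Hedged sketch (illustration only) ---------------------------------------
# The *Fmt arguments to run() are old-style '%'-format strings interpolated
# with the per-line dict built above. The exact format strings are supplied by
# the caller; the ones below are made-up examples, not the add-on's defaults.
def _demo_dueling_formats():
    matureFmt = u'%(target)s'                        # fully mature: target line only
    knownFmt = u'%(target)s\\N%(native)s'            # known but not mature: add native line
    unknownFmt = u'%(target)s [%(N_k)d unknown: %(unknowns)s]\\N%(native)s'
    d = { 'target': u'...', 'native': u'...', 'N_k': 1, 'N_m': 2,
          'unknowns': u'morph', 'unmatures': u'morph, other' }
    return unknownFmt % d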
def per( st, f ):
    d, fname = st['ed'].deck, st['fieldName']
    mats = [ c.interval for c in getCards( d, [f.id] ) ]
    ms = M.getMorphemes( st['mp'], f[ fname ] )
    loc = M.AnkiDeck( f.id, fname, f[ fname ], d.path, d.name(), mats )
    st['db'].addMsL( ms, loc )
    return st
def run( duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt ):
    # Load files
    kdb = M.MorphDb( knownDbPath )
    mdb = M.MorphDb( matureDbPath )
    subFileLines = codecs.open( duelingSubsPath, 'r', 'utf-8' ).readlines()

    # Start Mecab
    mp = M.mecab()

    # Get dueling subs
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        jpn, eng = dialogueLines[i:i+2]
        jpn, eng, pre = getText( jpn ), getText( eng ), getPreText( jpn )

        # get unknowns
        ms = M.getMorphemes( mp, jpn, ws=whitelist, bs=blacklist )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )

        d = { 'jpn':jpn, 'eng':eng, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }
        if N_m == 0:
            lines.append( pre + matureFmt % d )
        elif N_k == 0:
            lines.append( pre + knownFmt % d )
        else:
            lines.append( pre + unknownFmt % d )

    outFile = codecs.open( outputSubsPath, 'w', 'utf-8' )
    outFile.write( u''.join( header ) )
    outFile.write( u'\n'.join( lines ) )
    outFile.close()
def per( st, n ):
    mats = mw.col.db.list( 'select ivl from cards where nid = :nid', nid=n.id )
    for f in cfg( n.mid, None, 'morph_fields' ):
        ms = getMorphemes( n[ f ], None, cfg1('morph_blacklist') )
        loc = AnkiDeck( n.id, f, n[ f ], n.guid, mats )
        st['morphDb'].addMsL( ms, loc )
    return st
def per( st, f ):
    ms = M.getMorphemes( st['mp'], f['Expression'], bs=st['bs'] )
    for m in ms:
        if m in st['db'].db:
            st['ed'].deck.addTags( [f.id], st['tags'] )
            return st
    return st
def per(st, f):
    ms = M.getMorphemes(st['mp'], f['Expression'], bs=st['bs'])
    for m in ms:
        if m in st['db'].db:
            st['ed'].deck.addTags([f.id], st['tags'])
            return st
    return st
def per( st, n ):
    notecfg = getFilter(n)
    if notecfg is None: return st
    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for f in notecfg['Fields']:
        ms = getMorphemes(morphemizer, n[f], n.tags)
        st['morphemes'] += ms
    return st
def per( st, f ):
    ms = M.getMorphemes( st['mp'], f['Expression'], bs=st['bs'] )
    #QMessageBox( None, 'Note', 'Comparing %s to %s' % (str(ms), str(st['ms']) ) )
    for x in ms:
        if x in st['ms']:
            tags = addTags( st['tags'], f.tags )
            f.tags = canonifyTags( tags )
            # QMessageBox( None, 'Note', 'found morpheme in expr, set tag' )
    return st
def per(st, n):  # :: State -> Note -> State
    #n.delTag( st['tags'] ) # clear tags if they already exist?
    for field in cfg(n.mid, None, 'morph_fields'):
        for m in getMorphemes(getMorphemizerForNote(n), n[field]):
            if m in st['db'].db:
                n.addTag(st['tags'])
                break
    n.flush()
    return st
def per( st, n ): # :: State -> Note -> State
    #n.delTag( st['tags'] ) # clear tags if they already exist?
    for field in cfg( n.mid, None, 'morph_fields' ):
        for m in getMorphemes( n[ field ], None, cfg1('morph_blacklist') ):
            if m in st['db'].db:
                n.addTag( st['tags'] )
                break
    n.flush()
    return st
def per( st, n ):
    mats = mw.col.db.list( 'select ivl from cards where nid = :nid', nid=n.id )
    notecfg = getFilter(n)
    if notecfg is None: return st
    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for f in notecfg['Fields']:
        ms = getMorphemes(morphemizer, n[f], n.tags)
        loc = AnkiDeck(n.id, f, n[f], n.guid, mats)
        st['morphDb'].addMsl(ms, loc)
    return st
def per( st, n ):
    mats = mw.col.db.list( 'select ivl from cards where nid = :nid', nid=n.id )
    notecfg = getFilter(n)
    if notecfg is None: return st
    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for f in notecfg['Fields']:
        ms = getMorphemes(morphemizer, n[f])
        loc = AnkiDeck(n.id, f, n[f], n.guid, mats)
        st['morphDb'].addMsl(ms, loc)
    return st
def post( st ):
    ms = M.getMorphemes( st['mp'], st['txt'], bs=st['bs'] )
    util.killMecab( st )
    txt = M.ms2str( ms )

    kdb = M.MorphDb( util.knownDbPath )
    newMs = [ m for m in ms if m not in kdb.db ]
    newTxt = M.ms2str( newMs )

    txt = '-----All-----\n' + txt + '\n-----New-----\n' + newTxt
    QMessageBox.information( st['ed'], 'Morphemes', txt )
def per( st, n ):
    n.delTag( st['tags'] )
    if n['k+N'] == '1': # FIXME this special but commonly wanted logic must be a cfg option
        ms = getMorphemes( n['focusMorph'], None, cfg1('morph_blacklist') )
        for m in ms:
            if m in st['db'].db:
                n.addTag( st['tags'] )
                break
    n.flush()
    return st
def post( st ):
    import morphemes as m
    mp = m.mecab( None )
    ms = m.getMorphemes( mp, st['txt'], bs=st['bs'] )
    mp.kill()
    txt = m.ms2str( ms ).decode('utf-8')

    kdb = m.loadDb( util.knownDbPath )
    newMs = [ x for x in ms if x not in kdb ]
    newTxt = m.ms2str( newMs ).decode('utf-8')

    txt = '-----All-----\n' + txt + '\n-----New-----\n' + newTxt
    QMessageBox.information( st['ed'], 'Morphemes', txt )
def per(st, n):  # :: State -> Note -> State
    #n.delTag( st['tags'] ) # clear tags if they already exist?
    notecfg = getFilter(n)
    if notecfg is None: return st
    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for field in notecfg['Fields']:
        for m in getMorphemes(morphemizer, n[field]):
            if m in st['db'].db:
                n.addTag(st['tags'])
                break
    n.flush()
    return st
def per( st, n ): # :: State -> Note -> State
    #n.delTag( st['tags'] ) # clear tags if they already exist?
    notecfg = getFilter(n)
    if notecfg is None: return st
    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for field in notecfg['Fields']:
        for m in getMorphemes(morphemizer, n[ field ], n.tags):
            if m in st['db'].db:
                n.addTag(st['tags'])
                break
    n.flush()
    return st
def highlight( txt, extra, fieldDict, field, mod_field ):
    '''When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity'''
    from util import getFilterByTagsAndType
    from morphemizer import getMorphemizerByName
    from morphemes import getMorphemes

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub( sub, repl, string ):
        return u''.join( re.sub( sub, repl, s ) if not s.startswith('<span') else s for s in re.split( '(<span.*?</span>)', string ) )

    # find morphemizer; because no note/card information is exposed through arguments, we have to find morphemizer based on tags alone
    #from aqt.qt import debug; debug()
    #
    #if mw.reviewer.card is None: return txt
    #note = mw.reviewer.card.note()
    #if not isNoteSame(note, fieldDict): return txt
    #from aqt.qt import debug; debug()
    tags = fieldDict['Tags'].split()
    filter = getFilterByTagsAndType(fieldDict['Type'], tags)
    if filter is None: return txt
    morphemizer = getMorphemizerByName(filter['Morphemizer'])
    if morphemizer is None: return txt

    ms = getMorphemes(morphemizer, txt, tags)

    for m in sorted( ms, key=lambda x: len(x.inflected), reverse=True ): # largest subs first
        locs = allDb().db.get( m, set() )
        mat = max( loc.maturity for loc in locs ) if locs else 0

        if   mat >= cfg1( 'threshold_mature' ): mtype = 'mature'
        elif mat >= cfg1( 'threshold_known' ):  mtype = 'known'
        elif mat >= cfg1( 'threshold_seen' ):   mtype = 'seen'
        else:                                   mtype = 'unknown'
        repl = u'<span class="morphHighlight" mtype="{mtype}" mat="{mat}">{morph}</span>'.format(
                morph = m.inflected,
                mtype = mtype,
                mat = mat )
        txt = nonSpanSub( m.inflected, repl, txt )
    return txt
def parse_text(text):
    nonlocal i_count, known_count, seen_morphs, known_morphs, all_morphs
    nonlocal proper_noun_count, line_count, known_line_count, iplus1_line_count, line_morphs

    log_fp.write('=== parse_text ===\n' + text + '\n')
    # print('strip',stripHTML(text))
    parsed_morphs = getMorphemes(self.morphemizer, stripHTML(text))
    # parsed_morphs = getMorphemes(morphemizer, text)
    if len(parsed_morphs) == 0:
        return

    unknown_count = 0
    line_missing_morphs = set()
    for m in parsed_morphs:
        # Count morph for word report
        all_morphs[m] = all_morphs.get(m, 0) + 1
        seen_morphs[m] = seen_morphs.get(m, 0) + 1

        if m.isProperNoun():
            proper_noun_count += 1
            is_proper_noun = True
        else:
            is_proper_noun = False

        i_count += 1
        if known_db.matches( m ) or is_proper_noun:
            # Proper nouns are easy to learn, so assume they're known.
            known_morphs[m] = known_morphs.get(m, 0) + 1
            known_count += 1
        else:
            unknown_db.addMorph(m, 1)
            source_unknown_db.addMorph(m, 1)
            line_missing_morphs.add(m)
            unknown_count += 1

    line_count += 1
    if unknown_count == 0:
        known_line_count += 1
    elif unknown_count == 1:
        iplus1_line_count += 1
    line_morphs.append(line_missing_morphs)
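# --- Hedged sketch (illustration only) ---------------------------------------
# parse_text() classifies each line by how many of its morphs are still
# unknown: 0 unknowns -> fully known line, exactly 1 unknown -> an "i+1" line
# (one new morph in otherwise known context). The standalone helper below
# restates that rule on plain sets; the names are made up for the example.
def _classify_line(line_morphs_set, known_set):
    unknown = {m for m in line_morphs_set if m not in known_set}
    if len(unknown) == 0:
        return 'known'      # nothing new on this line
    if len(unknown) == 1:
        return 'i+1'        # exactly one new morph: ideal study material
    return 'unknown'        # too many new morphs at once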
def highlight( txt, extra, fieldDict, field, mod_field ):
    '''When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity'''

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub( sub, repl, string ):
        return u''.join( re.sub( sub, repl, s ) if not s.startswith('<span') else s for s in re.split( '(<span.*?</span>)', string ) )

    from morphemes import getMorphemes
    ms = getMorphemes( txt )

    for m in sorted( ms, key=lambda x: len(x.inflected), reverse=True ): # largest subs first
        locs = allDb().db.get( m, set() )
        mat = max( loc.maturity for loc in locs ) if locs else 0

        if   mat >= cfg1( 'threshold_mature' ): mtype = 'mature'
        elif mat >= cfg1( 'threshold_known' ):  mtype = 'known'
        elif mat >= cfg1( 'threshold_seen' ):   mtype = 'seen'
        else:                                   mtype = 'unknown'
        repl = u'<span class="morphHighlight" mtype="{mtype}" mat="{mat}">{morph}</span>'.format(
                morph = m.inflected,
                mtype = mtype,
                mat = mat )
        txt = nonSpanSub( m.inflected, repl, txt )
    return txt
def per( st, f ):
    st['ms'].extend( m.getMorphemes( st['mp'], f[ 'Expression' ] ) )
    return st
def mkAllDb(allDb=None):
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Prep work for all.db creation',
                      max=N_notes, immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'):
            continue

        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            mats += [C('threshold_mature') + 1]

        for fieldName in C('morph_fields'):
            try:  # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                         .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(fieldValue)
                if ms:  #TODO: this needed? should we change below too then?
                    #printf( ' .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf(' .mats for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf(' .morphs for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(fieldValue)
                    locDb.pop(loc)
                    locDb[newLoc] = ms

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
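# --- Hedged sketch (illustration only) ---------------------------------------
# How mkAllDb() turns card scheduling data into a per-note "maturities" list:
# a card's interval (ivl, in days) is used directly, except that cards still in
# the learning queue (ivl == 0, type == 1) count as 0.5 so they are not treated
# as completely unseen, and notes tagged as already-known get an extra entry
# above the mature threshold. The numbers below are made-up example data.
def _demo_maturities(cards, tags, threshold_mature=21, alreadyKnownTag='alreadyKnown'):
    # cards :: [(ivl, type)], e.g. [(0, 1), (3, 2), (30, 2)]
    mats = [(0.5 if ivl == 0 and ctype == 1 else ivl) for ivl, ctype in cards]
    if alreadyKnownTag in tags:
        mats += [threshold_mature + 1]  # force the note to count as mature
    return mats
# _demo_maturities([(0, 1), (3, 2), (30, 2)], ['alreadyKnown'])
# -> [0.5, 3, 30, 22]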
def post( st ):
    ms = getMorphemes( st['txt'], None, cfg1('morph_blacklist') )
    s = ms2str( ms )
    infoMsg( '----- All -----\n' + s )
def mkAllDb(allDb=None):
    import config
    reload(config)
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    N_enabled_notes = 0  # for providing an error message if there is no note that is used for processing
    mw.progress.start(label='Prep work for all.db creation',
                      max=N_notes, immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None:
            continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            mats += [C('threshold_mature') + 1]

        for fieldName in notecfg['Fields']:
            try:  # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = extractFieldData(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                         .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms:  #TODO: this needed? should we change below too then?
                    #printf( ' .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    #printf( ' .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    #printf( ' .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(morphemizer, fieldValue, ts)
                    locDb.pop(loc)
                    locDb[newLoc] = ms

    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorphMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.')
        return None

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
def post(st):
    ms = getMorphemes(st['morphemizer'], st['txt'])
    s = ms2str(ms)
    infoMsg('----- All -----\n' + s)
def per( st, f ):
    ms = M.getMorphemes( st['mp'], f['Expression'], bs=st['bs'] )
    for m in ms:
        if m not in st['mfmap']:
            st['mfmap'][m] = []
        if f not in st['mfmap'][m]:
            st['mfmap'][m].append( f )
    return st
def mkAll(self):  # IO ()
    log('Getting initial all.db...')
    if not hasattr(self, '_allDb'):
        try:
            self._allDb = M.MorphDb(self.allPath)
            debug(' * Updating existing all.db')
        except IOError:
            self._allDb = M.MorphDb()
            debug(' * Creating new all.db from scratch')
    allDb = self._allDb
    log('...done')

    mp = M.mecab()

    # pre-cache lookups
    fieldNames = self.cfg['morph fields']
    whitelist, blacklist = self.cfg['whitelist'], self.cfg['blacklist']
    fid2cardsDb = self.fid2cardsDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb()

    fs = self.getFacts()
    i, lfs = 0, len(fs)
    start = time.time()
    last = time.time()
    for f in fs:
        mats = [c.interval for c in fid2cardsDb[f.id]]
        for fieldName in fieldNames:
            try:
                fieldValue = normalizeFieldValue(f[fieldName])
            except KeyError:  # if fact doesn't have the field just skip it
                continue
            try:  # existing location
                loc = fidDb[(f.id, fieldName)]
                # new loc only; no morpheme change
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    debug(' .mats for %d[%s]' % (f.id, fieldName))
                    newLoc = M.AnkiDeck(f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats)
                    ms = locDb.pop(loc)
                    locDb[newLoc] = ms
                # new loc and new morphemes
                elif loc.fieldValue != fieldValue:
                    debug(' .morphs for %d[%s]' % (f.id, fieldName))
                    newLoc = M.AnkiDeck(f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats)
                    ms = M.getMorphemes(mp, fieldValue, ws=whitelist, bs=blacklist)
                    locDb.pop(loc)
                    locDb[newLoc] = ms
            except KeyError:  # new location
                loc = M.AnkiDeck(f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats)
                ms = M.getMorphemes(mp, fieldValue, ws=whitelist, bs=blacklist)
                if ms:
                    debug(' .loc for %d[%s]' % (f.id, fieldName))
                    locDb[loc] = ms
        i += 1
        if i % 100 == 0:
            log(' %d / %d = %d%% in %f sec' % (i, lfs, 100. * i / lfs, time.time() - last))
            last = time.time()
    log('Processed all facts in %f sec. Now saving...' % (time.time() - start))

    allDb.clear()
    allDb.addFromLocDb(locDb)
    allDb.save(self.allPath)
    self.cfg['last db update'][self.allPath] = time.time()
    self.cfg['last all.db update took'] = time.time() - start
    log('...done')
    sigterm(mp)
    return self._allDb
def mkAll( self ): # IO ()
    log( 'Getting initial all.db...' )
    if not hasattr( self, '_allDb' ):
        try:
            self._allDb = M.MorphDb( self.allPath )
            debug( ' * Updating existing all.db' )
        except IOError:
            self._allDb = M.MorphDb()
            debug( ' * Creating new all.db from scratch' )
    allDb = self._allDb
    log( '...done' )

    mp = M.mecab()

    # pre-cache lookups
    fieldNames = self.cfg['morph fields']
    whitelist, blacklist = parseWhitelist( self.cfg['whitelist'] ), parseWhitelist( self.cfg['blacklist'] )
    fid2cardsDb = self.fid2cardsDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb()

    fs = self.getFacts()
    i, lfs = 0, len( fs )
    start = time.time()
    last = time.time()
    for f in fs:
        mats = [ c.interval for c in fid2cardsDb[ f.id ] ]
        for fieldName in fieldNames:
            try:
                fieldValue = normalizeFieldValue( f[ fieldName ] )
            except KeyError: # if fact doesn't have the field just skip it
                continue
            try: # existing location
                loc = fidDb[ (f.id, fieldName) ]
                # new loc only; no morpheme change
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    debug(' .mats for %d[%s]' % ( f.id, fieldName ) )
                    newLoc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                    ms = locDb.pop( loc )
                    locDb[ newLoc ] = ms
                # new loc and new morphemes
                elif loc.fieldValue != fieldValue:
                    debug(' .morphs for %d[%s]' % ( f.id, fieldName ) )
                    newLoc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                    ms = M.getMorphemes( mp, fieldValue, ws=whitelist, bs=blacklist )
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms
            except KeyError: # new location
                loc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                ms = M.getMorphemes( mp, fieldValue, ws=whitelist, bs=blacklist )
                if ms:
                    debug(' .loc for %d[%s]' % ( f.id, fieldName ) )
                    locDb[ loc ] = ms
        i += 1
        if i % 100 == 0:
            log(' %d / %d = %d%% in %f sec' % ( i, lfs, 100.*i/lfs, time.time()-last ) )
            last = time.time()
    log( 'Processed all facts in %f sec. Now saving...' % ( time.time()-start ) )

    allDb.clear()
    allDb.addFromLocDb( locDb )
    allDb.save( self.allPath )
    self.cfg['last db update'][ self.allPath ] = time.time()
    self.cfg['last all.db update took'] = time.time() - start
    log( '...done' )
    sigterm( mp )
    return self._allDb
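# --- Hedged sketch (illustration only) ---------------------------------------
# The incremental update above distinguishes three cases per (fact, field)
# pair: unseen location -> parse morphemes and add; same text but changed card
# maturities -> reuse the cached morphemes under a fresh location key; changed
# text -> re-parse. The helper below restates that decision with hypothetical
# names; it is not the add-on's API.
def _demo_update_location(locDb, fidDb, key, newLoc, fieldValue, mats, parse):
    loc = fidDb.get(key)
    if loc is None:                                    # brand-new location
        locDb[newLoc] = parse(fieldValue)
    elif loc.fieldValue == fieldValue and loc.maturities != mats:
        locDb[newLoc] = locDb.pop(loc)                 # keep morphs, refresh maturities
    elif loc.fieldValue != fieldValue:
        locDb.pop(loc)
        locDb[newLoc] = parse(fieldValue)              # text changed: re-parse
    return locDb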
def per(st, f):
    ms = M.getMorphemes(st['mp'], f['Expression'], bs=st['bs'])
    for m in ms:
        if m not in st['mfmap']:
            st['mfmap'][m] = []
        if f not in st['mfmap'][m]:
            st['mfmap'][m].append(f)
    return st
def mkAllDb( allDb=None ):
    import config; reload(config)
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    N_enabled_notes = 0 # for providing an error message if there is no note that is used for processing
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb( recalc=False )   # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None: continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        if C('ignore maturity'):
            mats = [ 0 for mat in mats ]
        ts, alreadyKnownTag = TAG.split( tags ), jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            mats += [ C('threshold_mature')+1 ]

        for fieldName in notecfg['Fields']:
            try: # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = extractFieldData( fieldName, flds, mid )
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms: #TODO: this needed? should we change below too then?
                    #printf( ' .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    #printf( ' .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    #printf( ' .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes(morphemizer, fieldValue, ts)
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms

    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorphMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.')
        return None

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb