Esempio n. 1
0
def mkAllDb( allDb=None ):
    '''Rebuild all.db from every note for which cfg( mid, None, 'enabled' ) is truthy.

    For each such note a maturity list is derived from its cards, and one
    AnkiDeck location per configured morph field is created or refreshed in a
    location -> morphemes map. That map then replaces the contents of allDb.

    allDb   -- MorphDb to update incrementally; a new MorphDb is created when
               this is falsy.
    returns -- the updated MorphDb (also saved to cfg1('path_all') when
               cfg1('saveDbs') is set).
    raises  -- TypeError, re-raised after an error message has been shown,
               when a configured field cannot be extracted from a note.
    '''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False )   # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    i = 0   # the loop never binds i for an empty notes table, yet the progress updates below read it
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )
        if not C('enabled'): continue
        # one maturity per card: its interval, except 0.5 for cards with ivl 0 and type 1
        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        ts, alreadyKnownTag = TAG.split( tags ), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            # tagged notes get an extra maturity just above the mature threshold
            mats += [ C('threshold_mature')+1 ]

        for fieldName in C('morph_fields'):
            try: # if doesn't have field, continue
                fieldValue = getMecabField( fieldName, flds, mid )
            except KeyError: continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                # unseen location: only registered when it yields morphemes
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes( fieldValue )
                if ms: #TODO: this needed? should we change below too then?
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf( '    .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf( '    .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes( fieldValue )
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms
    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb
Esempio n. 2
0
def highlight(txt, extra, fieldDict, field, mod_field):
    '''When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity.

    Only `txt` is read here; the remaining arguments are accepted but unused.
    Returns `txt` with every morpheme found by getMorphemes wrapped in a
    <span class="morphHighlight"> that carries `mtype` and `mat` attributes.
    '''

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub(sub, repl, string):
        # The morpheme is literal text, not a regular expression: str.replace
        # keeps characters such as '?', '(' or a backslash from being read as
        # pattern / replacement syntax (which re.sub would do).
        return u''.join(
            s if s.startswith('<span') else s.replace(sub, repl)
            for s in re.split('(<span.*?</span>)', string))

    from morphemes import getMorphemes
    ms = getMorphemes(txt)
    for m in sorted(ms, key=lambda x: len(x.inflected),
                    reverse=True):  # largest subs first
        locs = allDb().db.get(m, set())
        # a morpheme is as mature as its most mature location; unseen ones count as 0
        mat = max(loc.maturity for loc in locs) if locs else 0

        if mat >= cfg1('threshold_mature'): mtype = 'mature'
        elif mat >= cfg1('threshold_known'): mtype = 'known'
        elif mat >= cfg1('threshold_seen'): mtype = 'seen'
        else: mtype = 'unknown'
        repl = u'<span class="morphHighlight" mtype="{mtype}" mat="{mat}">{morph}</span>'.format(
            morph=m.inflected, mtype=mtype, mat=mat)
        txt = nonSpanSub(m.inflected, repl, txt)
    return txt
Esempio n. 3
0
def per( st, f ):
    '''Store the fact's unknown morphemes, comma-joined, in its 'unknowns' field.

    st -- state dict; 'mp', 'bs' and 'kdb' are read.
    f  -- fact whose 'Expression' field is analysed and whose 'unknowns'
          field is overwritten.
    Returns st.
    '''
    morphs = m.getMorphemes( st['mp'], f[ 'Expression' ], bs=st['bs'] )
    # keep element 0 of each morpheme that is missing from the known db
    missing = [ x[0] for x in morphs if x not in st['kdb'] ]
    f[ 'unknowns' ] = u','.join( missing )
    return st
def run( duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt ):
    '''Rewrite a dueling-subtitles file, formatting each line pair by how well it is known.

    Lines starting with 'Dialogue' are consumed in pairs (first jpn, then eng).
    Each pair is rendered with matureFmt when no morpheme is missing from the
    mature db, with knownFmt when none is missing from the known db, and with
    unknownFmt otherwise. The format strings are applied with `%` to a dict
    holding 'jpn', 'eng', 'N_k', 'N_m', 'unknowns' and 'unmatures'.

    duelingSubsPath -- utf-8 subtitle file to read.
    outputSubsPath  -- utf-8 file to write (header followed by formatted lines).
    whitelist, blacklist -- passed through to getMorphemes.
    Raises AssertionError when the number of dialogue lines is odd.
    '''
    # Load files
    kdb = MorphDb( cfg1('path_known') )
    mdb = MorphDb( cfg1('path_mature') )
    # context manager so the handle is closed even if reading fails
    with codecs.open( duelingSubsPath, 'r', 'utf-8' ) as subFile:
        subFileLines = subFile.readlines()

    # Get dueling subs
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    if dialogueLines:
        header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    else:
        # no dialogue at all: the whole file is header (previously an IndexError)
        header = subFileLines
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        jpn, eng = dialogueLines[i:i+2]
        jpn, eng, pre = getText( jpn ), getText( eng ), getPreText( jpn )

        # get unknowns
        ms = getMorphemes( jpn, whitelist, blacklist )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )
        d = { 'jpn':jpn, 'eng':eng, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }

        if N_m == 0:
            lines.append( pre + matureFmt % d )
        elif N_k == 0:
            lines.append( pre + knownFmt % d )
        else:
            lines.append( pre + unknownFmt % d )

    # context manager so the output is flushed and closed even if a write fails
    with codecs.open( outputSubsPath, 'w', 'utf-8' ) as outFile:
        outFile.write( u''.join( header ) )
        outFile.write( u'\n'.join( lines ) )
Esempio n. 5
0
def per( st, f ):
    '''Write the number of the fact's morphemes missing from st['kdb'] into its 'iPlusN' field.

    st -- state dict; 'mp', 'bs' and 'kdb' are read.
    f  -- fact whose 'Expression' field is analysed.
    Returns st.
    '''
    morphs = m.getMorphemes( st['mp'], f[ 'Expression' ], bs=st['bs'] )
    unknownCount = sum( 1 for x in morphs if x not in st['kdb'] )
    f[ 'iPlusN' ] = u'%d' % unknownCount
    return st
Esempio n. 6
0
def run( duelingSubsPath, outputSubsPath, morphemizer, matureFmt, knownFmt, unknownFmt ):
    '''Rewrite a dueling-subtitles file, formatting each line pair by how well it is known.

    Lines starting with 'Dialogue' are consumed in pairs (first target, then
    native). Each pair is rendered with matureFmt when no morpheme is missing
    from the mature db, with knownFmt when none is missing from the known db,
    and with unknownFmt otherwise. The format strings are applied with `%` to
    a dict holding 'target', 'native', 'N_k', 'N_m', 'unknowns' and 'unmatures'.

    duelingSubsPath -- utf-8 subtitle file to read.
    outputSubsPath  -- utf-8 file to write (header followed by formatted lines).
    morphemizer     -- passed through to getMorphemes.
    Raises AssertionError when the number of dialogue lines is odd.
    '''
    # Load files
    kdb = MorphDb( cfg1('path_known') )
    mdb = MorphDb( cfg1('path_mature') )
    # context manager so the handle is closed even if reading fails
    with codecs.open( duelingSubsPath, 'r', 'utf-8' ) as subFile:
        subFileLines = subFile.readlines()

    # Get dueling subs
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    if dialogueLines:
        header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    else:
        # no dialogue at all: the whole file is header (previously an IndexError)
        header = subFileLines
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        target, native = dialogueLines[i:i+2]
        target, native, pre = getText( target ), getText( native ), getPreText( target )

        # get unknowns
        ms = getMorphemes(morphemizer, target)
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )
        d = { 'target':target, 'native':native, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }

        if N_m == 0:
            lines.append( pre + matureFmt % d )
        elif N_k == 0:
            lines.append( pre + knownFmt % d )
        else:
            lines.append( pre + unknownFmt % d )

    # context manager so the output is flushed and closed even if a write fails
    with codecs.open( outputSubsPath, 'w', 'utf-8' ) as outFile:
        outFile.write( u''.join( header ) )
        outFile.write( u'\n'.join( lines ) )
def per( st, f ):
    '''Add the morphemes of one fact field to st['db'] under a new AnkiDeck location.

    st -- state dict; 'ed', 'fieldName', 'mp' and 'db' are read.
    f  -- fact to analyse.
    Returns st.
    '''
    deck = st['ed'].deck
    fieldName = st['fieldName']
    # one maturity per card of this fact: the card's interval
    intervals = [ card.interval for card in getCards( deck, [f.id] ) ]
    morphs = M.getMorphemes( st['mp'], f[ fieldName ] )
    location = M.AnkiDeck( f.id, fieldName, f[ fieldName ], deck.path, deck.name(), intervals )
    st['db'].addMsL( morphs, location )
    return st
Esempio n. 8
0
def run( duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt ):
    '''Rewrite a dueling-subtitles file, formatting each line pair by how well it is known.

    The known and mature databases are loaded from the module-level
    knownDbPath / matureDbPath. Lines starting with 'Dialogue' are consumed in
    pairs (first jpn, then eng). Each pair is rendered with matureFmt when no
    morpheme is missing from the mature db, with knownFmt when none is missing
    from the known db, and with unknownFmt otherwise. The format strings are
    applied with `%` to a dict holding 'jpn', 'eng', 'N_k', 'N_m', 'unknowns'
    and 'unmatures'.

    duelingSubsPath -- utf-8 subtitle file to read.
    outputSubsPath  -- utf-8 file to write (header followed by formatted lines).
    whitelist, blacklist -- passed to M.getMorphemes as ws / bs.
    Raises AssertionError when the number of dialogue lines is odd.
    '''
    # Load files
    kdb = M.MorphDb( knownDbPath )
    mdb = M.MorphDb( matureDbPath )
    # context manager so the handle is closed even if reading fails
    with codecs.open( duelingSubsPath, 'r', 'utf-8' ) as subFile:
        subFileLines = subFile.readlines()
    # Start Mecab
    # NOTE(review): mp is never shut down in this function -- confirm whether the caller or M handles that
    mp = M.mecab()

    # Get dueling subs
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    if dialogueLines:
        header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    else:
        # no dialogue at all: the whole file is header (previously an IndexError)
        header = subFileLines
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        jpn, eng = dialogueLines[i:i+2]
        jpn, eng, pre = getText( jpn ), getText( eng ), getPreText( jpn )

        # get unknowns
        ms = M.getMorphemes( mp, jpn, ws=whitelist, bs=blacklist )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )
        d = { 'jpn':jpn, 'eng':eng, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }

        if N_m == 0:
            lines.append( pre + matureFmt % d )
        elif N_k == 0:
            lines.append( pre + knownFmt % d )
        else:
            lines.append( pre + unknownFmt % d )

    # context manager so the output is flushed and closed even if a write fails
    with codecs.open( outputSubsPath, 'w', 'utf-8' ) as outFile:
        outFile.write( u''.join( header ) )
        outFile.write( u'\n'.join( lines ) )
def per( st, n ):
    '''Add the morphemes of each configured morph field of note n to st['morphDb'].

    st -- state dict; 'morphDb' is read.
    n  -- note to analyse; its card intervals become the location's maturities.
    Returns st.
    '''
    intervals = mw.col.db.list( 'select ivl from cards where nid = :nid', nid=n.id )
    for fieldName in cfg( n.mid, None, 'morph_fields' ):
        fieldValue = n[ fieldName ]
        morphs = getMorphemes( fieldValue, None, cfg1('morph_blacklist') )
        location = AnkiDeck( n.id, fieldName, fieldValue, n.guid, intervals )
        st['morphDb'].addMsL( morphs, location )
    return st
Esempio n. 10
0
def per( st, f ):
    '''Tag the fact with st['tags'] when any morpheme of its Expression field is in st['db'].

    st -- state dict; 'mp', 'bs', 'db', 'ed' and 'tags' are read.
    f  -- fact to inspect.
    Returns st.
    '''
    morphs = M.getMorphemes( st['mp'], f['Expression'], bs=st['bs'] )
    # any() stops at the first hit, so the tags are added at most once
    if any( x in st['db'].db for x in morphs ):
        st['ed'].deck.addTags( [f.id], st['tags'] )
    return st
Esempio n. 11
0
def per(st, f):
    '''Add st['tags'] to the fact if its Expression field contains a morpheme from st['db'].

    st -- state dict; 'mp', 'bs', 'db', 'ed' and 'tags' are read.
    f  -- fact to inspect.
    Returns st.
    '''
    found = M.getMorphemes(st['mp'], f['Expression'], bs=st['bs'])
    matched = any(morph in st['db'].db for morph in found)  # short-circuits on the first match
    if matched:
        st['ed'].deck.addTags([f.id], st['tags'])
    return st
Esempio n. 12
0
def per( st, n ):
    '''Accumulate the morphemes of note n's filtered fields into st['morphemes'].

    Notes for which getFilter returns None are left out.
    Returns st.
    '''
    notecfg = getFilter(n)
    if notecfg is None:
        return st

    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for fieldName in notecfg['Fields']:
        found = getMorphemes(morphemizer, n[fieldName], n.tags)
        st['morphemes'] += found
    return st
Esempio n. 13
0
def per( st, f ):
    '''Merge st['tags'] into the fact's tags for every Expression morpheme found in st['ms'].

    st -- state dict; 'mp', 'bs', 'ms' and 'tags' are read.
    f  -- fact whose tags attribute is rewritten on each match.
    Returns st.
    '''
    morphs = M.getMorphemes( st['mp'], f['Expression'], bs=st['bs'] )
    for x in morphs:
        if x not in st['ms']:
            continue
        # re-applied for every matching morpheme, as there is no early exit
        f.tags = canonifyTags( addTags( st['tags'], f.tags ) )
    return st
Esempio n. 14
0
def per(st, n):  # :: State -> Note -> State
    '''Tag note n with st['tags'] for each morph field that holds a morpheme from st['db'].

    The note is flushed whether or not a tag was added. Returns st.
    '''
    for fieldName in cfg(n.mid, None, 'morph_fields'):
        morphs = getMorphemes(getMorphemizerForNote(n), n[fieldName])
        # any() stops scanning this field at the first hit
        if any(x in st['db'].db for x in morphs):
            n.addTag(st['tags'])
    n.flush()
    return st
Esempio n. 15
0
def per( st, n ): # :: State -> Note -> State
    '''Tag note n with st['tags'] for each morph field that holds a morpheme from st['db'].

    Morphemes are extracted with the configured 'morph_blacklist'. The note is
    flushed whether or not a tag was added. Returns st.
    '''
    for fieldName in cfg( n.mid, None, 'morph_fields' ):
        morphs = getMorphemes( n[ fieldName ], None, cfg1('morph_blacklist') )
        # any() stops scanning this field at the first hit
        if any( x in st['db'].db for x in morphs ):
            n.addTag( st['tags'] )
    n.flush()
    return st
Esempio n. 16
0
def per( st, n ):
    '''Add the morphemes of note n's filtered fields to st['morphDb'].

    Notes for which getFilter returns None are skipped. For the others, the
    intervals of the note's cards are used as the maturities of one AnkiDeck
    location per field.

    st -- state dict; 'morphDb' is read.
    n  -- note to analyse.
    Returns st.
    '''
    notecfg = getFilter(n)
    if notecfg is None: return st
    # query the card intervals only once the note is known to be processed
    mats = mw.col.db.list( 'select ivl from cards where nid = :nid', nid=n.id )
    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for f in notecfg['Fields']:
        ms = getMorphemes(morphemizer, n[f], n.tags)
        loc = AnkiDeck(n.id, f, n[f], n.guid, mats)
        # NOTE(review): other hooks in this code base call addMsL (capital L) -- confirm the method name
        st['morphDb'].addMsl(ms, loc)

    return st
Esempio n. 17
0
def per( st, n ):
    '''Add the morphemes of note n's filtered fields to st['morphDb'].

    Notes for which getFilter returns None are skipped. For the others, the
    intervals of the note's cards are used as the maturities of one AnkiDeck
    location per field.

    st -- state dict; 'morphDb' is read.
    n  -- note to analyse.
    Returns st.
    '''
    notecfg = getFilter(n)
    if notecfg is None: return st
    # query the card intervals only once the note is known to be processed
    mats = mw.col.db.list( 'select ivl from cards where nid = :nid', nid=n.id )
    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for f in notecfg['Fields']:
        ms = getMorphemes(morphemizer, n[f])
        loc = AnkiDeck(n.id, f, n[f], n.guid, mats)
        # NOTE(review): other hooks in this code base call addMsL (capital L) -- confirm the method name
        st['morphDb'].addMsl(ms, loc)

    return st
Esempio n. 18
0
def post( st ):
   '''Show all morphemes of st['txt'], and those missing from the known db, in a message box.

   st -- state dict; 'mp', 'txt', 'bs' and 'ed' are read.
   '''
   try:
      ms = M.getMorphemes( st['mp'], st['txt'], bs=st['bs'] )
   finally:
      # run the mecab cleanup even when morpheme extraction fails
      util.killMecab( st )
   txt = M.ms2str( ms )

   kdb = M.MorphDb( util.knownDbPath )
   newMs = [ m for m in ms if m not in kdb.db ]
   newTxt = M.ms2str( newMs )

   txt = '-----All-----\n' + txt + '\n-----New-----\n' + newTxt
   QMessageBox.information( st['ed'], 'Morphemes', txt )
Esempio n. 19
0
def post( st ):
   '''Show all morphemes of st['txt'], and those missing from the known db, in a message box.

   st -- state dict; 'mp', 'txt', 'bs' and 'ed' are read.
   '''
   try:
      ms = M.getMorphemes( st['mp'], st['txt'], bs=st['bs'] )
   finally:
      # run the mecab cleanup even when morpheme extraction fails
      util.killMecab( st )
   txt = M.ms2str( ms )

   kdb = M.MorphDb( util.knownDbPath )
   newMs = [ m for m in ms if m not in kdb.db ]
   newTxt = M.ms2str( newMs )

   txt = '-----All-----\n' + txt + '\n-----New-----\n' + newTxt
   QMessageBox.information( st['ed'], 'Morphemes', txt )
Esempio n. 20
0
def per( st, n ):
    '''Reset st['tags'] on note n, re-adding them when its focus morph is in st['db'].

    The tag is only reconsidered for notes whose 'k+N' field equals '1'.
    The note is always flushed. Returns st.
    '''
    n.delTag( st['tags'] )

    # FIXME this special but commonly wanted logic must be a cfg option
    if n['k+N'] == '1':
        focus = getMorphemes( n['focusMorph'], None, cfg1('morph_blacklist') )
        # any() stops at the first focus morpheme found in the db
        if any( x in st['db'].db for x in focus ):
            n.addTag( st['tags'] )

    n.flush()
    return st
Esempio n. 21
0
def post( st ):
   '''Show all morphemes of st['txt'], and those missing from the known db, in a message box.

   A mecab instance is started for the extraction and killed afterwards.
   st -- state dict; 'txt', 'bs' and 'ed' are read.
   '''
   import morphemes as m
   mp = m.mecab( None )
   try:
      ms = m.getMorphemes( mp, st['txt'], bs=st['bs'] )
   finally:
      # kill the mecab instance started above even when extraction fails
      mp.kill()
   txt = m.ms2str( ms ).decode('utf-8')

   kdb = m.loadDb( util.knownDbPath )
   newMs = [ x for x in ms if x not in kdb ]
   newTxt = m.ms2str( newMs ).decode('utf-8')

   txt = '-----All-----\n' + txt + '\n-----New-----\n' + newTxt
   QMessageBox.information( st['ed'], 'Morphemes', txt )
Esempio n. 22
0
def per(st, n):  # :: State -> Note -> State
    '''Tag note n with st['tags'] for each filtered field that holds a morpheme from st['db'].

    Notes for which getFilter returns None are returned untouched (and not
    flushed); all others are flushed. Returns st.
    '''
    notecfg = getFilter(n)
    if notecfg is None:
        return st

    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for fieldName in notecfg['Fields']:
        morphs = getMorphemes(morphemizer, n[fieldName])
        # any() stops scanning this field at the first hit
        if any(x in st['db'].db for x in morphs):
            n.addTag(st['tags'])

    n.flush()
    return st
Esempio n. 23
0
def per( st, n ): # :: State -> Note -> State
    '''Tag note n with st['tags'] for each filtered field that holds a morpheme from st['db'].

    The note's tags are passed to getMorphemes. Notes for which getFilter
    returns None are returned untouched (and not flushed); all others are
    flushed. Returns st.
    '''
    notecfg = getFilter(n)
    if notecfg is None:
        return st

    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
    for fieldName in notecfg['Fields']:
        morphs = getMorphemes(morphemizer, n[ fieldName ], n.tags)
        # any() stops scanning this field at the first hit
        if any(x in st['db'].db for x in morphs):
            n.addTag(st['tags'])

    n.flush()
    return st
Esempio n. 24
0
def highlight( txt, extra, fieldDict, field, mod_field ):
    '''When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity.

    txt       -- field text to format.
    fieldDict -- its 'Tags' and 'Type' entries select the filter / morphemizer.
    The remaining arguments are accepted but unused here.
    Returns txt unchanged when no filter or morphemizer matches; otherwise txt
    with every morpheme wrapped in a <span class="morphHighlight"> that
    carries `mtype` and `mat` attributes.
    '''
    from util import getFilterByTagsAndType
    from morphemizer import getMorphemizerByName
    from morphemes import getMorphemes

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub( sub, repl, string ):
        # The morpheme is literal text, not a regular expression: str.replace
        # keeps characters such as '?', '(' or a backslash from being read as
        # pattern / replacement syntax (which re.sub would do).
        return u''.join( s if s.startswith('<span') else s.replace( sub, repl ) for s in re.split( '(<span.*?</span>)', string ) )

    # find morphemizer; because no note/card information is exposed through arguments, we have to find morphemizer based on tags alone
    tags = fieldDict['Tags'].split()
    noteFilter = getFilterByTagsAndType(fieldDict['Type'], tags)   # renamed local: `filter` shadowed the builtin
    if noteFilter is None:
        return txt
    morphemizer = getMorphemizerByName(noteFilter['Morphemizer'])
    if morphemizer is None:
        return txt
    ms = getMorphemes(morphemizer, txt, tags)

    for m in sorted( ms, key=lambda x: len(x.inflected), reverse=True ): # largest subs first
        locs = allDb().db.get( m, set() )
        # a morpheme is as mature as its most mature location; unseen ones count as 0
        mat = max( loc.maturity for loc in locs ) if locs else 0

        if   mat >= cfg1( 'threshold_mature' ):  mtype = 'mature'
        elif mat >= cfg1( 'threshold_known' ):   mtype = 'known'
        elif mat >= cfg1( 'threshold_seen' ):    mtype = 'seen'
        else:                                    mtype = 'unknown'
        repl = u'<span class="morphHighlight" mtype="{mtype}" mat="{mat}">{morph}</span>'.format(
                morph = m.inflected,
                mtype = mtype,
                mat = mat
                )
        txt = nonSpanSub( m.inflected, repl, txt )
    return txt
Esempio n. 25
0
                def parse_text(text):
                    nonlocal i_count, known_count, seen_morphs, known_morphs, all_morphs
                    nonlocal proper_noun_count, line_count, known_line_count, iplus1_line_count, line_morphs

                    log_fp.write('=== parse_text ===\n' + text + '\n')
                    # print('strip',stripHTML(text))
                    parsed_morphs = getMorphemes(self.morphemizer,
                                                 stripHTML(text))
                    # parsed_morphs = getMorphemes(morphemizer, text)
                    if len(parsed_morphs) == 0:
                        return

                    unknown_count = 0
                    line_missing_morphs = set()
                    for m in parsed_morphs:
                        # Count morph for word report
                        all_morphs[m] = all_morphs.get(m, 0) + 1
                        seen_morphs[m] = seen_morphs.get(m, 0) + 1

                        if m.isProperNoun():
                            proper_noun_count += 1
                            is_proper_noun = True
                        else:
                            is_proper_noun = False

                        i_count += 1
                        if known_db.matches(
                                m
                        ) or is_proper_noun:  # Proper nouns are easy to learn, so assume they're known.
                            known_morphs[m] = known_morphs.get(m, 0) + 1
                            known_count += 1
                        else:
                            unknown_db.addMorph(m, 1)
                            source_unknown_db.addMorph(m, 1)
                            line_missing_morphs.add(m)
                            unknown_count += 1
                    line_count += 1
                    if unknown_count == 0:
                        known_line_count += 1
                    elif unknown_count == 1:
                        iplus1_line_count += 1
                    line_morphs.append(line_missing_morphs)
Esempio n. 26
0
def highlight( txt, extra, fieldDict, field, mod_field ):
    '''When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity.

    Only `txt` is read here; the remaining arguments are accepted but unused.
    Returns `txt` with every morpheme found by getMorphemes wrapped in a
    <span class="morphHighlight"> that carries `mtype` and `mat` attributes.
    '''
    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub( sub, repl, string ):
        # The morpheme is literal text, not a regular expression: str.replace
        # keeps characters such as '?', '(' or a backslash from being read as
        # pattern / replacement syntax (which re.sub would do).
        return u''.join( s if s.startswith('<span') else s.replace( sub, repl ) for s in re.split( '(<span.*?</span>)', string ) )
    from morphemes import getMorphemes
    ms = getMorphemes( txt )
    for m in sorted( ms, key=lambda x: len(x.inflected), reverse=True ): # largest subs first
        locs = allDb().db.get( m, set() )
        # a morpheme is as mature as its most mature location; unseen ones count as 0
        mat = max( loc.maturity for loc in locs ) if locs else 0

        if   mat >= cfg1( 'threshold_mature' ):  mtype = 'mature'
        elif mat >= cfg1( 'threshold_known' ):   mtype = 'known'
        elif mat >= cfg1( 'threshold_seen' ):    mtype = 'seen'
        else:                                    mtype = 'unknown'
        repl = u'<span class="morphHighlight" mtype="{mtype}" mat="{mat}">{morph}</span>'.format(
                morph = m.inflected,
                mtype = mtype,
                mat = mat
                )
        txt = nonSpanSub( m.inflected, repl, txt )
    return txt
Esempio n. 27
0
def per( st, f ):
    '''Append the morphemes of the fact's Expression field to st['ms'] and return st.'''
    collected = st['ms']
    collected.extend( m.getMorphemes( st['mp'], f[ 'Expression' ] ) )
    return st
Esempio n. 28
0
def mkAllDb(allDb=None):
    '''Rebuild all.db from every note for which cfg(mid, None, 'enabled') is truthy.

    For each such note a maturity list is derived from its cards (all zeros
    when the 'ignore maturity' option is set), and one AnkiDeck location per
    configured morph field is created or refreshed in a location -> morphemes
    map. That map then replaces the contents of allDb.

    allDb   -- MorphDb to update incrementally; a new MorphDb is created when
               this is falsy.
    returns -- the updated MorphDb (also saved to cfg1('path_all') when
               cfg1('saveDbs') is set).
    raises  -- TypeError, re-raised after an error message has been shown,
               when a configured field cannot be extracted from a note.
    '''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Prep work for all.db creation',
                      max=N_notes,
                      immediate=True)

    if not allDb: allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    i = 0  # the loop never binds i for an empty notes table, yet the progress updates below read it
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0: mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'): continue

        # one maturity per card: its interval, except 0.5 for cards with ivl 0 and type 1
        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            # tagged notes get an extra maturity just above the mature threshold
            mats += [C('threshold_mature') + 1]

        for fieldName in C('morph_fields'):
            try:  # if doesn't have field, continue
                fieldValue = getMecabField(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                # unseen location: only registered when it yields morphemes
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(fieldValue)
                if ms:  #TODO: this needed? should we change below too then?
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf('    .mats for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf('    .morphs for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(fieldValue)
                    locDb.pop(loc)
                    locDb[newLoc] = ms
    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' %
               (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
Esempio n. 29
0
def post( st ):
    '''Show every morpheme found in st['txt'] (using the configured blacklist) in an info message.'''
    morphs = getMorphemes( st['txt'], None, cfg1('morph_blacklist') )
    infoMsg( '----- All -----\n' + ms2str( morphs ) )
Esempio n. 30
0
def mkAllDb(allDb=None):
    '''Regenerate all.db from every note that getFilter accepts.

    The config module is reloaded first. For each accepted note a maturity
    list is derived from its cards (all zeros when 'ignore maturity' is set),
    and one AnkiDeck location per filter field is created or refreshed in a
    location -> morphemes map, which then replaces the contents of allDb.

    allDb   -- MorphDb to update incrementally; a new MorphDb is created when
               this is falsy.
    returns -- the updated MorphDb, or None (after an error message) when no
               note passed the filter. The db is saved to cfg1('path_all')
               when cfg1('saveDbs') is set.
    raises  -- TypeError, re-raised after an error message has been shown,
               when a field cannot be extracted from a note.
    '''
    import config
    reload(config)
    startTime, db, tagManager = time.time(), mw.col.db, mw.col.tags
    noteCount = db.scalar('select count() from notes')
    enabledCount = 0  # notes accepted by the filter; zero triggers the error message below
    mw.progress.start(label='Prep work for all.db creation',
                      max=noteCount,
                      immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    noteRows = db.execute('select id, mid, flds, guid, tags from notes')
    for i, (nid, mid, flds, guid, tags) in enumerate(noteRows):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        notecfg = getFilter(mw.col.getNote(nid))
        if notecfg is None:
            continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        enabledCount += 1

        # one maturity per card: its interval, except 0.5 for cards with ivl 0 and type 1
        mats = []
        for ivl, ctype in db.execute(
                'select ivl, type from cards where nid = :nid', nid=nid):
            if ivl == 0 and ctype == 1:
                mats.append(0.5)
            else:
                mats.append(ivl)
        if C('ignore maturity'):
            mats = [0] * len(mats)
        ts = tagManager.split(tags)
        alreadyKnownTag = jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            # tagged notes get an extra maturity just above the mature threshold
            mats.append(C('threshold_mature') + 1)

        for fieldName in notecfg['Fields']:
            try:  # notes lacking the field are skipped
                fieldValue = extractFieldData(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                modelName = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=modelName, field=fieldName))
                raise

            oldLoc = fidDb.get((nid, guid, fieldName), None)
            if not oldLoc:
                # unseen location: only registered when it yields morphemes
                freshLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms:  #TODO: this needed? should we change below too then?
                    locDb[freshLoc] = ms
            elif oldLoc.fieldValue == fieldValue and oldLoc.maturities != mats:
                # mats changed -> new loc (new mats), move morphs
                movedLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                locDb[movedLoc] = locDb.pop(oldLoc)
            elif oldLoc.fieldValue != fieldValue:
                # field changed -> new loc, new morphs
                changedLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                locDb.pop(oldLoc)
                locDb[changedLoc] = ms

    if enabledCount == 0:
        mw.progress.finish()
        errorMsg(
            u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorhpMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.'
        )
        return None

    elapsed = time.time() - startTime
    printf('Processed all %d notes in %f sec' % (noteCount, elapsed))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' %
               (noteCount, time.time() - startTime))
    mw.progress.finish()
    return allDb
Esempio n. 31
0
def post(st):
    '''Show every morpheme that st['morphemizer'] finds in st['txt'] in an info message.'''
    morphs = getMorphemes(st['morphemizer'], st['txt'])
    infoMsg('----- All -----\n' + ms2str(morphs))
Esempio n. 32
0
def per( st, f ):
    '''Record fact f under each of its Expression morphemes in st['mfmap'].

    st['mfmap'] maps a morpheme to the list of facts containing it; a fact is
    listed at most once per morpheme. Returns st.
    '''
    morphs = M.getMorphemes( st['mp'], f['Expression'], bs=st['bs'] )
    for morph in morphs:
        facts = st['mfmap'].setdefault( morph, [] )
        if f not in facts:
            facts.append( f )
    return st
Esempio n. 33
0
    def mkAll(self):  # IO ()
        '''Bring all.db up to date with the deck's facts, save it and return it.

        The db cached on self._allDb is reused; otherwise it is loaded from
        self.allPath, falling back to an empty MorphDb on IOError. For every
        fact and every field named in cfg['morph fields'], the location is
        added, moved (maturities changed) or re-analysed (field text changed).
        The result is saved to self.allPath and timing data is stored in
        self.cfg. The mecab instance started here is always passed to sigterm,
        even when the update fails part-way.
        '''
        log('Getting initial all.db...')
        if not hasattr(self, '_allDb'):
            try:
                self._allDb = M.MorphDb(self.allPath)
                debug('  * Updating existing all.db')
            except IOError:
                self._allDb = M.MorphDb()
                debug('  * Creating new all.db from scratch')
        allDb = self._allDb
        log('...done')

        mp = M.mecab()
        try:
            # pre-cache lookups
            fieldNames = self.cfg['morph fields']
            whitelist, blacklist = self.cfg['whitelist'], self.cfg['blacklist']
            fid2cardsDb = self.fid2cardsDb()
            fidDb = allDb.fidDb()
            locDb = allDb.locDb()
            fs = self.getFacts()

            i, lfs = 0, len(fs)
            start = time.time()
            last = time.time()
            for f in fs:
                # one maturity per card of this fact: the card's interval
                mats = [c.interval for c in fid2cardsDb[f.id]]
                for fieldName in fieldNames:
                    try:
                        fieldValue = normalizeFieldValue(f[fieldName])
                    except KeyError:  # if fact doesn't have the field just skip it
                        continue
                    # NOTE(review): a KeyError raised anywhere in this try body
                    # (e.g. by locDb.pop) also lands in the "new location" handler
                    try:  # existing location
                        loc = fidDb[(f.id, fieldName)]
                        # new loc only; no morpheme change
                        if loc.fieldValue == fieldValue and loc.maturities != mats:
                            debug('        .mats for %d[%s]' % (f.id, fieldName))
                            newLoc = M.AnkiDeck(f.id, fieldName, fieldValue,
                                                self.deckPath, self.deckName, mats)
                            ms = locDb.pop(loc)
                            locDb[newLoc] = ms
                        # new loc and new morphemes
                        elif loc.fieldValue != fieldValue:
                            debug('        .morphs for %d[%s]' % (f.id, fieldName))
                            newLoc = M.AnkiDeck(f.id, fieldName, fieldValue,
                                                self.deckPath, self.deckName, mats)
                            ms = M.getMorphemes(mp,
                                                fieldValue,
                                                ws=whitelist,
                                                bs=blacklist)
                            locDb.pop(loc)
                            locDb[newLoc] = ms
                    except KeyError:  # new location
                        loc = M.AnkiDeck(f.id, fieldName, fieldValue,
                                         self.deckPath, self.deckName, mats)
                        ms = M.getMorphemes(mp,
                                            fieldValue,
                                            ws=whitelist,
                                            bs=blacklist)
                        if ms:
                            debug('        .loc for %d[%s]' % (f.id, fieldName))
                            locDb[loc] = ms
                i += 1
                if i % 100 == 0:
                    log('    %d / %d = %d%% in %f sec' %
                        (i, lfs, 100. * i / lfs, time.time() - last))
                    last = time.time()
            log('Proccessed all facts in %f sec. Now saving...' %
                (time.time() - start))
            allDb.clear()
            allDb.addFromLocDb(locDb)
            allDb.save(self.allPath)
            self.cfg['last db update'][self.allPath] = time.time()
            self.cfg['last all.db update took'] = time.time() - start
            log('...done')
        finally:
            # previously skipped whenever anything above raised
            sigterm(mp)
        return self._allDb
Esempio n. 34
0
    def mkAll( self ): # IO ()
        '''Bring all.db up to date with the deck's facts, save it and return it.

        The db cached on self._allDb is reused; otherwise it is loaded from
        self.allPath, falling back to an empty MorphDb on IOError. For every
        fact and every field named in cfg['morph fields'], the location is
        added, moved (maturities changed) or re-analysed (field text changed).
        The result is saved to self.allPath and timing data is stored in
        self.cfg. The mecab instance started here is always passed to sigterm,
        even when the update fails part-way.
        '''
        log( 'Getting initial all.db...' )
        if not hasattr( self, '_allDb' ):
            try:
                self._allDb = M.MorphDb( self.allPath )
                debug( '  * Updating existing all.db' )
            except IOError:
                self._allDb = M.MorphDb()
                debug( '  * Creating new all.db from scratch' )
        allDb = self._allDb
        log( '...done' )

        mp = M.mecab()
        try:
            # pre-cache lookups
            fieldNames = self.cfg['morph fields']
            whitelist, blacklist = parseWhitelist( self.cfg['whitelist'] ), parseWhitelist( self.cfg['blacklist'] )
            fid2cardsDb = self.fid2cardsDb()
            fidDb = allDb.fidDb()
            locDb = allDb.locDb()
            fs = self.getFacts()

            i, lfs = 0, len( fs )
            start = time.time()
            last = time.time()
            for f in fs:
                # one maturity per card of this fact: the card's interval
                mats = [ c.interval for c in fid2cardsDb[ f.id ] ]
                for fieldName in fieldNames:
                    try:
                        fieldValue = normalizeFieldValue( f[ fieldName ] )
                    except KeyError: # if fact doesn't have the field just skip it
                        continue
                    # NOTE(review): a KeyError raised anywhere in this try body
                    # (e.g. by locDb.pop) also lands in the "new location" handler
                    try: # existing location
                        loc = fidDb[ (f.id, fieldName) ]
                        # new loc only; no morpheme change
                        if loc.fieldValue == fieldValue and loc.maturities != mats:
                            debug('        .mats for %d[%s]' % ( f.id, fieldName ) )
                            newLoc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                            ms = locDb.pop( loc )
                            locDb[ newLoc ] = ms
                        # new loc and new morphemes
                        elif loc.fieldValue != fieldValue:
                            debug('        .morphs for %d[%s]' % ( f.id, fieldName ) )
                            newLoc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                            ms = M.getMorphemes( mp, fieldValue, ws=whitelist, bs=blacklist )
                            locDb.pop( loc )
                            locDb[ newLoc ] = ms
                    except KeyError: # new location
                        loc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                        ms = M.getMorphemes( mp, fieldValue, ws=whitelist, bs=blacklist )
                        if ms:
                            debug('        .loc for %d[%s]' % ( f.id, fieldName ) )
                            locDb[ loc ] = ms
                i += 1
                if i % 100 == 0:
                    log('    %d / %d = %d%% in %f sec' % ( i, lfs, 100.*i/lfs, time.time()-last ) )
                    last = time.time()
            log( 'Proccessed all facts in %f sec. Now saving...' % ( time.time()-start ) )
            allDb.clear()
            allDb.addFromLocDb( locDb )
            allDb.save( self.allPath )
            self.cfg['last db update'][ self.allPath ] = time.time()
            self.cfg['last all.db update took'] = time.time() - start
            log( '...done' )
        finally:
            # previously skipped whenever anything above raised
            sigterm( mp )
        return self._allDb
Esempio n. 35
0
def per(st, f):
    """Index fact ``f`` under each morpheme found in its 'Expression' field.

    ``st`` is a state mapping; this function reads ``st['mp']`` and
    ``st['bs']`` (passed through to ``M.getMorphemes``) and updates
    ``st['mfmap']``, a mapping from morpheme to the list of facts that
    contain it.  A fact is listed at most once per morpheme.

    Returns the same ``st`` object, so the function can be used as a fold
    step.
    """
    morphemes = M.getMorphemes(st['mp'], f['Expression'], bs=st['bs'])
    for morpheme in morphemes:
        # Create the bucket on first sight of this morpheme, then add the
        # fact only if it is not already recorded there.
        facts = st['mfmap'].setdefault(morpheme, [])
        if f not in facts:
            facts.append(f)
    return st
Esempio n. 36
0
def mkAllDb( allDb=None ):
    """Rebuild the all.db morpheme database from the notes of the collection.

    Every note for which ``getFilter`` returns a configuration is scanned.
    For each field named in that configuration's 'Fields' entry, a location
    (``AnkiDeck``) keyed by ( note id, guid, field name ) is created or
    refreshed in the location map obtained from ``allDb``:

    * unknown location            -> added, but only if morphemes were found
    * same text, new maturities   -> location replaced, morphemes moved over
    * changed text                -> location replaced, morphemes recomputed

    ``allDb`` is then cleared, refilled from that map and, when the
    'saveDbs' setting is on, saved to the 'path_all' location.

    allDb -- database to update; a new ``MorphDb`` is created when falsy.

    Returns the updated database, or None (after showing an error message)
    when no note matched any filter.

    Raises TypeError (re-raised after showing an error message) when reading
    a configured field fails with that error.  The progress dialog is closed
    on every exit path, including exceptions.
    """
    import config; reload(config)
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    N_enabled_notes = 0 # used to report an error if no note gets processed
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    # Everything between start() and finish() lives in try/finally so that an
    # exception cannot leave the progress dialog open.
    try:
        if not allDb: allDb = MorphDb()
        fidDb   = allDb.fidDb()
        locDb   = allDb.locDb( recalc=False )   # fidDb() already forces locDb recalc

        mw.progress.update( label='Generating all.db data' )
        for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
            if i % 500 == 0:    mw.progress.update( value=i )
            C = partial( cfg, mid, None )

            note = mw.col.getNote( nid )
            notecfg = getFilter( note )
            if notecfg is None: continue    # no filter matches this note
            morphemizer = getMorphemizerByName( notecfg['Morphemizer'] )

            N_enabled_notes += 1

            # One maturity per card of the note; cards with interval 0 and
            # type 1 are given 0.5 instead of 0.
            mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
            if C('ignore maturity'):
                mats = [ 0 ] * len( mats )
            ts, alreadyKnownTag = TAG.split( tags ), jcfg('Tag_AlreadyKnown')
            if alreadyKnownTag in ts:
                # Tagged notes get an extra maturity just above the threshold.
                mats += [ C('threshold_mature')+1 ]

            for fieldName in notecfg['Fields']:
                try: # if doesn't have field, continue
                    fieldValue = extractFieldData( fieldName, flds, mid )
                except KeyError: continue
                except TypeError:
                    mname = mw.col.models.get( mid )[ 'name' ]
                    errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                    raise

                loc = fidDb.get( ( nid, guid, fieldName ), None )
                if not loc:
                    # new location; stored only when it yields morphemes
                    loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes( morphemizer, fieldValue, ts )
                    if ms: #TODO: this needed? should we change below too then?
                        locDb[ loc ] = ms
                else:
                    # mats changed -> new loc (new mats), move morphs
                    if loc.fieldValue == fieldValue and loc.maturities != mats:
                        newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                        locDb[ newLoc ] = locDb.pop( loc )
                    # field changed -> new loc, new morphs
                    elif loc.fieldValue != fieldValue:
                        newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                        ms = getMorphemes( morphemizer, fieldValue, ts )
                        locDb.pop( loc )
                        locDb[ newLoc ] = ms

        if N_enabled_notes:
            printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
            mw.progress.update( value=i, label='Creating all.db object' )
            allDb.clear()
            allDb.addFromLocDb( locDb )
            if cfg1('saveDbs'):
                mw.progress.update( value=i, label='Saving all.db to disk' )
                allDb.save( cfg1('path_all') )
                printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    finally:
        mw.progress.finish()

    if N_enabled_notes == 0:
        # Shown only after the progress dialog has been closed.
        errorMsg(u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorphMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.')
        return None

    return allDb