def test4():
    """Merge the lemma table from 'lemma.en.txt' into 'bnc-lemma.txt'.

    Every stem returned by ``lm2.dump('stem')`` is lowercased and its child
    words are added to the BNC table under that stem.  Along the way two
    counters are kept and printed at the end:

    * count1 -- stems from 'lemma.en.txt' that the BNC table did not contain;
    * count2 -- child words missing from the BNC entry of a stem it already
      had (each of these is also printed as ``stem -> word``).

    The merged table is written to 'lemma-bnc.txt'.  All file names are
    relative paths.

    Returns:
        0, unconditionally.
    """
    import stardict

    lm1 = stardict.LemmaDB()
    lm2 = stardict.LemmaDB()
    lm1.load('bnc-lemma.txt')
    lm2.load('lemma.en.txt')
    count1 = 0
    count2 = 0
    for raw_stem in lm2.dump('stem'):
        childs = lm2.get(raw_stem)
        # Skip short all-uppercase stems.  The check has to look at the
        # original spelling: once the stem is lowercased, isupper() can
        # never be true and the guard would silently do nothing.
        if len(raw_stem) <= 2 and raw_stem.isupper():
            continue
        stem = raw_stem.lower()
        if stem not in lm1:
            count1 += 1
        else:
            obj = lm1.get(stem)
            for word in childs:
                word = word.lower()
                if word not in obj:
                    print('%s -> %s' % (stem, word))
                    count2 += 1
        # NOTE(review): nesting reconstructed from a flattened source line;
        # the merge is assumed to apply to every stem, new or existing.
        for word in childs:
            lm1.add(stem, word.lower())
    # Single pre-formatted argument keeps the output identical whether
    # print is a statement (Python 2) or a function (Python 3).
    print('count %d %d' % (count1, count2))
    lm1.save('lemma-bnc.txt')
    return 0
def __init__(self, filepath, threshold=0):
    """Initialise dictionary handles, lemma data and the personal word list.

    Args:
        filepath: stored as ``self.filepath``; it is not opened here.
        threshold: stored as ``self.threshold`` (default 0).

    Besides storing the arguments this constructs a ``stardict.StarDict``
    on "stardict.db", loads 'lemma.en.txt' into a ``stardict.LemmaDB`` and
    reads "myvocab.txt".  All three names are relative paths.
    """
    self.dbname = "stardict.db"
    self.filepath = filepath
    self.sd = stardict.StarDict(self.dbname)
    self.lemma = stardict.LemmaDB()
    self.lemma.load('lemma.en.txt')
    self.worddict = OrderedDict()
    self.threshold = threshold
    self.find_uncapword_sent = False
    # Read the vocabulary file inside a context manager so the handle is
    # closed deterministically instead of being left to the garbage
    # collector.
    with open("myvocab.txt", 'r') as vocab_file:
        vocab_lines = vocab_file.read().split('\n')
    # Keep the first tab-separated field of every line that is not a
    # "#" comment.  Blank lines (including the one after a trailing
    # newline) yield an empty-string entry, as before.
    self.myvocab = [
        line.split("\t")[0]
        for line in vocab_lines
        if not line.startswith("#")
    ]
def lemma(self):
    """Return the lemma database, loading it on first access.

    When ``self._lemma`` is already set it is returned untouched.
    Otherwise a ``stardict.LemmaDB`` is created, stored on the instance,
    filled from the file that ``ccinit.path_home`` resolves for
    'share/dict/lemma.en.txt', and then returned.
    """
    if self._lemma is not None:
        return self._lemma
    lemma_path = ccinit.path_home('share/dict/lemma.en.txt')
    # Store the instance before loading, matching the original ordering.
    self._lemma = stardict.LemmaDB()
    self._lemma.load(lemma_path)
    return self._lemma