def findSeed(targetCID, useTfIdf=True, categoryName='', initialYMax=10, initialVMax=10): import vocab seedY = [] seedV = [] words = defaultdict(int) for d in document.iterDoc(targetCID): words[d[-1]] += 1 logging.log(logging.INFO, u'number of unique words:%d' % len(words)) vocab = vocab.build() if useTfIdf: sortfunc = lambda x: float(x[1])/log(vocab.get(x[0].origin,2)) else: sortfunc = lambda x: x[1] for word, cnt in sorted(words.iteritems(), key=sortfunc, reverse=True): g = vocab.get(word.origin,0) if 0 < g < 200000000: if word.surface == '(': #posid=36なので品詞で取り除けない continue #huristics logging.log(logging.INFO, (u'word:%s[%d], cnt:%d, google-cnt:%d' % (word, word.posid, cnt, g))) if word.origin in categoryName: continue #huristics if len(seedY) < initialYMax and word.willBeEntry(): seedY.append(word.get()) if len(seedV) < initialVMax and word.isAdj(): seedV.append(word.get()) if len(seedY) == initialYMax and len(seedV) == initialVMax: break print 'seedY:', repr(seedY).decode('unicode-escape') print 'seedV:', repr(seedV).decode('unicode-escape') return seedY, seedV
def createPhraseDict(targetCID):
    """Group the records of ``targetCID`` by the key of their last element.

    Each record yielded by ``document.iterDoc(targetCID)`` is split into its
    last element (a word object) and everything before it.  The leading
    part is stored as a tuple under the key ``word.get()``.

    Args:
        targetCID: Identifier handed unchanged to ``document.iterDoc``.

    Returns:
        A ``defaultdict(list)`` mapping each ``word.get()`` value to the list
        of leading-part tuples, in iteration order.
    """
    positionsByPhrase = defaultdict(list)
    for record in document.iterDoc(targetCID):
        location = tuple(record[:-1])
        lastWord = record[-1]
        key = lastWord.get()
        positionsByPhrase[key].append(location)
    return positionsByPhrase