Beispiel #1
0
def findSeed(targetCID, useTfIdf=True, categoryName='', initialYMax=10, initialVMax=10):
    import vocab
    seedY = []
    seedV = []
    words = defaultdict(int)
    for d in document.iterDoc(targetCID):
        words[d[-1]] += 1
    logging.log(logging.INFO, u'number of unique words:%d' % len(words))
    vocab = vocab.build()
    if useTfIdf: sortfunc = lambda x: float(x[1])/log(vocab.get(x[0].origin,2))
    else: sortfunc = lambda x: x[1]
    for word, cnt in sorted(words.iteritems(), key=sortfunc, reverse=True):
        g = vocab.get(word.origin,0)
        if 0 < g < 200000000:
            if word.surface == '(': #posid=36なので品詞で取り除けない
                continue #huristics
            logging.log(logging.INFO, (u'word:%s[%d], cnt:%d, google-cnt:%d' % (word, word.posid, cnt, g)))
            if word.origin in categoryName: continue #huristics
            if len(seedY) < initialYMax and word.willBeEntry(): seedY.append(word.get())
            if len(seedV) < initialVMax and word.isAdj(): seedV.append(word.get())
            if len(seedY) == initialYMax and len(seedV) == initialVMax: break
    print 'seedY:', repr(seedY).decode('unicode-escape')
    print 'seedV:', repr(seedV).decode('unicode-escape')
    return seedY, seedV
Beispiel #2
0
def createPhraseDict(targetCID):
    """Group the records of ``targetCID`` by the key of their last element.

    Each record yielded by ``document.iterDoc(targetCID)`` is split into its
    last element (a word object) and everything before it.  The leading
    elements, as a tuple, are appended to the list stored under
    ``word.get()``.

    Returns:
        A ``defaultdict(list)`` mapping ``word.get()`` to a list of tuples,
        in the order the records were yielded.
    """
    grouped = defaultdict(list)
    for record in document.iterDoc(targetCID):
        leading = tuple(record[:-1])
        lastItem = record[-1]
        grouped[lastItem.get()].append(leading)
    return grouped