Code example #1
def coocurWordFirstOrder():
    # Count first-order word co-occurrence: for every sentence, count pairs
    # of regular, non-noise words that appear within 10 tokens of each other.
    klog.msg('find word coocur')
    Hist = {}
    nNumSentences = 0
    for nFiles, text in enumerate(textFiles()):
        sentences = lang.getSentenceList(text)
        nNumSentences += len(sentences)
        if nFiles % 1000 == 1:
            print nFiles, " files: #.sentences", nNumSentences
        for ids, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            tokenlist = lang.regularwords(tokenlist)
            selected = [t for t in tokenlist if not lang.isNoise(t)]
            #kbay.countCooccur(selected, Hist)
            kbay.countCooccurNearest(selected, Hist, nearest=10)

        if nFiles % 1000 == 1:
            refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            print "hist size", h, hh, hh / h
        peek(nFiles + 1, 1000)
        if nFiles > 100000: break
    print "number of sentences:", nNumSentences
    return
    klog.msg("finished total %s files" % nFiles)
    refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)
    kbay.saveDF2D(Hist, SAVEDIR + SrcFileName + '_word_coocur.txt')
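The kbay module is not part of these listings, so its helpers can only be inferred from the call sites. A minimal sketch of what countCooccurNearest(selected, Hist, nearest=10) might do, assuming Hist is a two-level dict hist[w1][w2] -> count; the body below is an assumption, not the library's code:

def countCooccurNearest(tokens, hist, nearest=10):
    # Assumed behavior: count each token together with the tokens that
    # follow it within a window of `nearest` positions.
    # hist is a two-level dict: hist[w1][w2] -> count.
    for i, w in enumerate(tokens):
        for v in tokens[i + 1:i + 1 + nearest]:
            if v == w:
                continue  # skip self-pairs
            hist.setdefault(w, {})
            hist[w][v] = hist[w].get(v, 0) + 1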
Code example #2
File: mine_ieee.py  Project: kunlubrain/legacy
def findRelatedGrams():
    # Count n-grams that co-occur within the same paper, delegating
    # subgram handling to kbay.countCooccurConsideringSubgram.
    klog.msg('find co-exist terms')
    Hist = {}

    for idx_raw, text in enumerate(ieeePapers()):

        sentences = lang.getSentenceList(text)

        for idofs, s in enumerate(sentences):

            grams = ngramsOfSentence(s)
            kbay.countCooccurConsideringSubgram(grams,
                                                Hist,
                                                gramDF=GRAM_DF,
                                                debug=0)

        if idx_raw % 1000 == 0:
            mem.refreshMemory2D(Hist, steps=idx_raw, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            print "hist size", h, hh, hh / h

        peek(idx_raw + 1, 1000)
        if idx_raw > 200000: break

    kbay.filter2DHistByCount(Hist, 2, verbose=True)
    kbay.saveDF2D(Hist, 'tmp_ieee_occur_all.txt')
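The periodic refreshMemory2D call is what keeps Hist from growing without bound while streaming 200k papers, and sizeOf2DHist is evidently the total number of pair entries. A sketch of both, assuming the same dict-of-dicts layout and a fixed forgetting threshold; the threshold, and whether it scales with steps, are assumptions:

def sizeOf2DHist(hist):
    # Total number of (w1, w2) entries across all rows.
    return sum(len(row) for row in hist.values())

def refreshMemory2D(hist, steps, prefilter=True, mincount=2):
    # Assumed behavior: forget pairs whose count is still below
    # `mincount` after `steps` documents, then drop rows that
    # became empty, so memory stays bounded while streaming.
    if not prefilter:
        return
    for w1 in list(hist.keys()):
        row = hist[w1]
        for w2 in list(row.keys()):
            if row[w2] < mincount:
                del row[w2]
        if not row:
            del hist[w1]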
Code example #3
File: mine_wiki_de.py  Project: kunlubrain/legacy
def coocurWord(save2file, verbose=0):
    # Count word co-occurrence over Wikipedia data (loopWikiData), using a
    # tight window (nearest=2), and save the counts to save2file.
    klog.msg('find word coocur')
    Hist = {}
    for nFiles, (datafile, text) in enumerate(loopWikiData()):
        sentences = cleaner.getSentences_regex(text)
        for s in sentences:
            #selected = set(cleaner.capitalizedWordsOf(s))
            selected = cleaner.tokenize_simple(s)
            if verbose:
                print "sentence:", s
                print "selected:", selected
                pause()
            #kbay.countCooccurNearest(selected, Hist, nearest=10)
            kbay.countCooccurNearest(selected, Hist, nearest=2)
            #print "learn", selected
            #print Hist, len(Hist)
            #pause()
        if nFiles % 1000 == 0: print nFiles, " files done"
        if nFiles % 4000 == 0:
            print "before mem refresh:", len(Hist)
            memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            print "after mem refresh:", len(Hist), '\n'
        if nFiles > 40000: break

    klog.msg("finished total %s files" % nFiles)
    memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)

    fif.saveHist2D(Hist, save2file)

    return Hist
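fif.saveHist2D is the only remaining opaque call here (kbay.filter2DHistByCount is, by its name and arguments, the same kind of count-threshold pruning sketched after example #2, just with the threshold given explicitly). A sketch of the writer, assuming one tab-separated (w1, w2, count) line per pair; the file format is a guess:

def saveHist2D(hist, filename):
    # Assumed layout: "w1<TAB>w2<TAB>count", one line per pair.
    with open(filename, 'w') as f:
        for w1 in hist:
            for w2 in hist[w1]:
                f.write('%s\t%s\t%d\n' % (w1, w2, hist[w1][w2]))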
Code example #4
def rank():
    # For each sentence, score every selected word against the other
    # selected words via memory.sumscore and print the ranked result.
    klog.msg('rank words in text based on cross-info score')
    for nFiles, text in enumerate(textFiles()):
        sentences = lang.getSentenceList(text)
        wordprof = {}
        wordscores = []
        print "############ TEXT #############"
        print text
        for ids, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            tokenlist = lang.regularwords(tokenlist)
            selected = set([t for t in tokenlist if not lang.isNoise(t)])
            for w in selected:
                wordprof[w] = wordprof.get(w, 0) + 1
            print s
            for w in selected:
                score = memory.sumscore([ww for ww in selected if ww != w], w)
                wordscores.append((w, score))
            print sorted(wordscores, key=lambda x: x[-1], reverse=True)
            wordscores = []
            pause()
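memory.sumscore is not shown in any of these files. Cross-information scores of this kind are commonly a sum of pointwise mutual information between the target word and its context, so here is a sketch under that assumption; hist, unigram, and total are hypothetical stand-ins for the state the memory module would hold (the real signature takes only the context list and the word):

import math

def sumscore(context, w, hist, unigram, total):
    # Hypothetical: sum PMI(w, ww) over the context words, where
    # PMI(w, ww) = log(P(w, ww) / (P(w) * P(ww))).
    score = 0.0
    for ww in context:
        joint = hist.get(w, {}).get(ww, 0)
        if joint == 0:
            continue  # unseen pairs contribute nothing in this sketch
        p_joint = float(joint) / total
        p_w = float(unigram[w]) / total
        p_ww = float(unigram[ww]) / total
        score += math.log(p_joint / (p_w * p_ww))
    return score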
Code example #5
File: mine_ieee.py  Project: kunlubrain/legacy
def selectGrams():
    # Extract candidate n-grams from each paper, score each gram against
    # the paper's other grams, and keep the ones scoring above average.
    klog.msg('select grams')
    book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_words_4000.txt')
    #book=fif.readWordCoocur('tmp_ieee_coocur_abstractwide_word_bymemo.txt', filtervalue=2)
    CoHist = {}
    CoWord = {}
    klog.msg('looping files')
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist,
                                                     tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selecedngb = lang.filterAdj(ngb, s)
            selecedngb = lang.filterAdv(selecedngb, s)
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            #print s
            #print "\n>INITIAL grams:\n", ngb
            #print "\n>SELECTED grams:\n", selecedngb
            for g, l, r in selecedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1

            if 0:
                print text
                print "#.localgrams:", len(localGramHist)
                print localGramHist
                print "#.ngram:", len([1 for g in localGramHist if ' ' in g])
                pause()

        #kbay.countCooccur(localGramHist, CoHist)

        # calculate mutual information
        gramlist = localGramHist.keys()
        gramscore = []
        for g in gramlist:
            gramscore.append(relativeInfo(g, gramlist, book))
        print sorted(gramscore, key=lambda x: x[1])
        if gramscore:
            averageScore = sum([g[1] for g in gramscore]) / len(gramscore)
            print "above average:", averageScore
            print [g for g in gramscore if g[1] > averageScore]
        pause()

        wordset = set([w for g in localGramHist for w in g.split()])
        kbay.countCooccur(wordset, CoWord)

        peek(idx_raw + 1, 1000)

        if (idx_raw + 1) % 4000 == 0:
            #mem.refreshMemory2D(CoWord, steps=idx_raw, prefilter=True)
            #h, hh = len(CoWord), kbay.sizeOf2DHist(CoWord)
            #print "hist size", h, hh, hh/h
            #mem.refreshMemory2DFirstOrder(CoWord, steps=idx_raw)
            kbay.saveDF2D(
                CoWord,
                'tmp_ieee_coocur_abstractwide_words_%s.txt' % (idx_raw + 1))
            CoWord = {}  # reset
            break

        if 0:
            if (idx_raw + 1) % 40000 == 0:
                kbay.saveDF2D(
                    CoHist,
                    'tmp_ieee_coocur_abstractwide_%s.txt' % (idx_raw + 1))
                CoHist = {}  # reset
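relativeInfo is also undefined here; from its call site it takes a gram, the paper's gram list, and the word co-occurrence book, and returns a (gram, score) tuple that callers sort on index 1. A sketch, assuming book is the same dict-of-dicts word co-occurrence structure saved by the earlier functions; the scoring rule itself is an assumption:

def relativeInfo(g, gramlist, book):
    # Hypothetical scoring: how strongly the words of gram `g` co-occur
    # with the words of the other grams, according to `book`.
    # Returns (gram, score) so callers can sort by x[1].
    words = g.split()
    others = set(w for gg in gramlist if gg != g for w in gg.split())
    score = 0
    for w in words:
        row = book.get(w, {})
        score += sum(row.get(o, 0) for o in others)
    return (g, score)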