Example 1
def coocurWordFirstOrder():

    klog.msg('find word coocur')
    Hist = {}
    nNumSentences = 0
    for nFiles, text in enumerate(textFiles()):
        sentences = lang.getSentenceList(text)
        nNumSentences += len(sentences)
        if nFiles % 1000 == 1:
            print nFiles, " files: #.sentences", nNumSentences
        for ids, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            tokenlist = lang.regularwords(tokenlist)
            selected = [t for t in tokenlist if not lang.isNoise(t)]
            #kbay.countCooccur(selected, Hist)
            kbay.countCooccurNearest(selected, Hist, nearest=10)

        # prune the histogram periodically so it stays within memory
        if nFiles % 1000 == 1:
            refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            print "hist size", h, hh, hh / h
        peek(nFiles + 1, 1000)
        if nFiles > 100000: break

    print "number of sentences:", nNumSentences
    klog.msg("finished total %s files" % nFiles)
    refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)
    kbay.saveDF2D(Hist, SAVEDIR + SrcFileName + '_word_coocur.txt')
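Note: kbay.countCooccurNearest is defined elsewhere. As a reading aid, here is a minimal sketch of the windowed co-occurrence counting it presumably performs; the helper name and the symmetric-window semantics are assumptions, not the library's actual code:

def countCooccurNearestSketch(tokens, hist, nearest=10):
    # Count pair co-occurrences within a symmetric window of `nearest` tokens.
    # hist is a dict-of-dicts: hist[w1][w2] -> count.
    for i, w in enumerate(tokens):
        lo, hi = max(0, i - nearest), min(len(tokens), i + nearest + 1)
        for j in range(lo, hi):
            if j != i:
                hist.setdefault(w, {})
                hist[w][tokens[j]] = hist[w].get(tokens[j], 0) + 1

H = {}
countCooccurNearestSketch(['a', 'b', 'c'], H, nearest=1)
# H -> {'a': {'b': 1}, 'b': {'a': 1, 'c': 1}, 'c': {'b': 1}}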
Example 2
def findRelatedGrams():

    klog.msg('find co-exist terms')
    Hist = {}

    for idx_raw, text in enumerate(ieeePapers()):

        sentences = lang.getSentenceList(text)

        for idofs, s in enumerate(sentences):

            grams = ngramsOfSentence(s)
            kbay.countCooccurConsideringSubgram(grams,
                                                Hist,
                                                gramDF=GRAM_DF,
                                                debug=0)

        if idx_raw % 1000 == 0:
            mem.refreshMemory2D(Hist, steps=idx_raw, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            print "hist size", h, hh, hh / h

        peek(idx_raw + 1, 1000)
        if idx_raw > 200000: break

    kbay.filter2DHistByCount(Hist, 2, verbose=True)
    kbay.saveDF2D(Hist, 'tmp_ieee_occur_all.txt')
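Both examples above prune their histogram every ~1000 files via refreshMemory2D / mem.refreshMemory2D, whose source is not shown. A minimal sketch of the pruning it plausibly does (the mincount threshold is an assumption; steps could scale the threshold with corpus position but is ignored here):

def refreshMemory2DSketch(hist, steps=0, prefilter=True, mincount=2):
    # Drop cells below mincount, then drop empty rows, to bound memory use.
    for w1 in list(hist.keys()):
        row = hist[w1]
        if prefilter:
            for w2 in list(row.keys()):
                if row[w2] < mincount:
                    del row[w2]
        if not row:
            del hist[w1]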
Example 3
def selectTest():
    print "select test ..."
    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    print "book size:", len(book)

    for idx_raw, text in enumerate(ieeePapers()):
        print text
        sentences = lang.getSentenceList(text)
        localHist = {}
        scoreByLang = {}
        gramLeftRight = {}
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist,
                                                     tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selecedngb = lang.filterAdj(ngb, s)
            selecedngb = lang.filterAdv(selecedngb, s)
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            for g, l, r in selecedngb:
                localHist[g] = localHist.get(g, 0) + 1
                scoreByLang[g] = scoreByLang.get(g, 0) + linguisticScore(
                    g, l, r, tokenlist)
                if g not in gramLeftRight:
                    gramLeftRight[g] = []
                lefttoken = '<L>' + ('#BEGIN' if l == 0 else tokenlist[l - 1])
                righttoken = '<R>' + ('#END' if r >= (len(tokenlist) - 1)
                                      else tokenlist[r + 1])
                gramLeftRight[g].append((lefttoken, righttoken))

        # scores
        scoreByDF = {}
        totalDF = 0
        for g in localHist:
            scoreByDF[g] = book.get(g, 0)
            totalDF += scoreByDF[g]
        if not scoreByDF:
            continue
        averageDF = totalDF / len(scoreByDF)
        sortedByDF = sorted(scoreByDF.items(),
                            key=lambda x: x[1],
                            reverse=True)
        print sortedByDF
        print "average DF", averageDF
        print "gram with DF above average"
        print [(g, count) for (g, count) in sortedByDF if count > averageDF]
        print "gram with DF below average"
        print [(g, count) for (g, count) in sortedByDF if not count > averageDF]

        print "lang score:"
        print scoreByLang
        print "gram left right"
        print gramLeftRight
        pause()
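The (lefttoken, righttoken) pairs collected in gramLeftRight support a standard termhood cue, accessor variety: genuine terms occur next to many distinct neighbors. A small helper one could add (not part of the original code) to summarise the collected contexts:

def accessorVariety(gramLeftRight):
    # Distinct left/right neighbors per gram; higher variety suggests
    # the gram is a self-contained term rather than a fragment.
    av = {}
    for g, pairs in gramLeftRight.items():
        av[g] = (len(set(p[0] for p in pairs)),
                 len(set(p[1] for p in pairs)))
    return av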
Example 4
def testPos():
    for idx_raw, text in enumerate(ieeePapers()):
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            tokenstoplist = lang.markRawPos(tokenlist)
            #lang.markVerbs(tokenlist)
            pause()
Example 5
def recommendTerms():
    dfbook = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    cobook = fif.readCoocurWithFilterFunc(
        'tmp_ieee_coocur_abstractwide_grams.txt', dfbook)
    for idx_raw, text in enumerate(ieeePapers()):
        sentences = lang.getSentenceList(text)
        coHist = {}  # coocur_gram -> df for grams in abstract
        localHist = {}  # local gram -> occurrence count
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            for g in grams:
                localHist[g] = localHist.get(g, 0) + 1
        for g in localHist:
            cograms = cobook.get(g, [])
            for gg in cograms:
                coHist.setdefault(gg, []).append(g)
        # just by mention/occurrence
        score = {}
        for g in localHist:
            if g not in cobook: continue
            cograms = cobook[g]
            gcount = cobook[g][g]
            for gg in cograms:
                if gg == g: continue
                cocount = cobook[g][gg]

                # ignore those with only one degree of relevance for the moment
                if not len(coHist[gg]) > 1:
                    continue

                score[gg] = score.get(gg, 0) + float(cocount) / gcount

        fluxAndPosterior = {}
        for g, colist in coHist.items():
            if not len(g.split()) > 1: continue
            if len(colist) > 1:
                fluxAndPosterior[g] = (score[g], colist)

        print "grams of text:"
        print localHist.keys()

        print "cogram having influx > 2 ..."
        for g, colist in coHist.items():
            if len(colist) > 1:
                print g, colist

        print "select from coHist ..."
        print sorted(coHist.items(), key=lambda x: len(x[1]),
                     reverse=True)[:20]

        print "select from posterior..."
        print sorted(fluxAndPosterior.items(), key=lambda x: x[1][0])
        pause()
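A note on the mention score above: assuming cobook[g][g] holds the total count of g itself, each candidate gg accumulates score(gg) = sum over local grams g of count(g, gg) / count(g), i.e. the summed conditional co-occurrence rates P(gg | g). For example, if 'neural network' occurs in 100 documents and co-occurs with 'backpropagation' in 30 of them, that pair alone contributes 30/100 = 0.3 to score('backpropagation').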
Example 6
def findDFOfWords():
    DF = {}
    for nFiles, text in enumerate(ieeePapers()):
        sentences = lang.getSentenceList(text)
        words = set(sum([wordlist(s, regular=True) for s in sentences], []))
        for w in words:
            DF[w] = DF.get(w, 0) + 1
        if nFiles % 1000 == 0: print "finished", nFiles, "files"
    print "finished total", nFiles, "files"
    kbay.saveDF(DF,
                _SAVEDIR_ + SrcFileName + '_df.txt',
                sort=False,
                numDoc=nFiles + 1)  # enumerate is zero-based
Example 7
def df_words():

    print "find df of words..."
    DF = {}
    for nFiles, text in enumerate(textFiles()):

        sentences = lang.getSentenceList(text)
        words = set(sum([wordlist(s, regular=True) for s in sentences], []))
        kbay.count(words, DF)

        peek(nFiles, 1000)

    print "finished total", nFiles, "files"
    kbay.saveDF(DF, SAVEDIR + SrcFileName + '_df.txt', sort=False,
                numDoc=nFiles + 1)
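saveDF and fif.readWithFilter recur throughout these examples but are not shown. The call sites suggest a one-term-per-line count file that is filtered on read; a hypothetical minimal pair consistent with that usage (names and format are assumptions):

def saveDFSketch(df, path, sort=False, numDoc=None):
    # One "term<TAB>count" line per entry; numDoc kept as a header comment.
    items = sorted(df.items(), key=lambda x: x[1], reverse=True) if sort else df.items()
    with open(path, 'w') as f:
        if numDoc is not None:
            f.write('#numDoc\t%s\n' % numDoc)
        for term, count in items:
            f.write('%s\t%s\n' % (term, count))

def readWithFilterSketch(path, filtervalue=0):
    # Keep only entries whose count exceeds filtervalue.
    book = {}
    with open(path) as f:
        for line in f:
            if line.startswith('#'):
                continue
            term, count = line.rsplit('\t', 1)
            if int(count) > filtervalue:
                book[term] = int(count)
    return book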
Example 8
def findDFOfGrams():
    gramHist = {}
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist,
                                                     tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selecedngb = lang.filterAdj(ngb, s)
            selecedngb = lang.filterAdv(selecedngb, s)
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            for g, l, r in selecedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
                # also count embedded sub-grams (lengths 2..4) whose last
                # token is noun-like; poslist spans the whole sentence, so
                # the sub-gram's ending word sits at offset l + ii + sublen - 1
                words = g.split()
                for sublen in (2, 3, 4):
                    if len(words) < sublen + 1:
                        continue
                    for ii in range(len(words) - sublen + 1):
                        posEndingWord = poslist[l + ii + sublen - 1]
                        if "N" in posEndingWord or "X" in posEndingWord:
                            gg = " ".join(words[ii:ii + sublen])
                            localGramHist[gg] = localGramHist.get(gg, 0) + 1

        # save the local grams
        for g in localGramHist:
            gramHist[g] = gramHist.get(g, 0) + 1

        peek(idx_raw + 1, 2000)

    kbay.saveDF(gramHist, 'ieeeGramDF.txt', sort=False, numDoc=idx_raw + 1)
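To make the sub-gram counting concrete: for g = 'finite element method model' (4 words, so the length checks admit sub-grams of lengths 2 and 3), the loop additionally considers 'finite element', 'element method', 'method model', 'finite element method' and 'element method model', counting only those whose last token carries a noun-like POS tag ('N' or 'X').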
Example 9
def findPattern(DO_COOCUR=False, ROUND=1):

    for idx_raw, text in enumerate(ieeePapers()):
        idx = idx_raw + 1
        peek(idx, 1000)

        #if idx>10000: break

        ngram_local = {}
        sentences = lang.getSentenceList(text)

        for idofs, s in enumerate(sentences):

            tokenlist = lang.tokenize(s)

            seq.learnNewSequence(tokenlist)
            seq.blindMemorize(tokenlist)

            seq.showPatterns()
            pause()
Example 10
def rank():
    klog.msg('rank words in text based on cross-info score')
    for nFiles, text in enumerate(textFiles()):
        sentences = lang.getSentenceList(text)
        wordprof = {}
        wordscores = []
        print "############ TEXT #############"
        print text
        for ids, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            tokenlist = lang.regularwords(tokenlist)
            selected = set([t for t in tokenlist if not lang.isNoise(t)])
            for w in selected:
                wordprof[w] = wordprof.get(w, 0) + 1
            print s
            for w in selected:
                score = memory.sumscore([ww for ww in selected if ww != w], w)
                wordscores.append((w, score))
            print sorted(wordscores, key=lambda x: x[-1], reverse=True)
            wordscores = []
            pause()
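memory.sumscore is defined elsewhere. A plausible reading of a "cross-info score", sketched under the assumption that memory exposes pairwise counts (pair), single-word counts (single) and a corpus size (total), is a sum of pointwise mutual information between the word and its sentence context:

import math

def sumscoreSketch(context, w, pair, single, total):
    # PMI(w, c) = log(count(w, c) * total / (count(w) * count(c))),
    # summed over the context words that actually co-occur with w.
    score = 0.0
    for c in context:
        co = pair.get(w, {}).get(c, 0)
        if co and single.get(w) and single.get(c):
            score += math.log(float(co) * total / (single[w] * single[c]))
    return score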
Example 11
def memorizeCogram():

    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    memo = mem.Memory()
    memo.setInitialCapacity(200)

    for idx_raw, text in enumerate(ieeePapers()):
        #if idx_raw<220000: continue
        sentences = lang.getSentenceList(text)
        gramsPreviousSentence = set([])
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            if not grams: continue
            goodgrams = set([g for g in grams if g in book])
            memo.learnSymbList(goodgrams)
            # grams of previous sentence: learn grams of current sentence
            # grams of current  sentence: learn grams of previous sentence
            memo.crosslearn(gramsPreviousSentence, goodgrams, crossweight=1)
            if 0 and len(list(gramsPreviousSentence) + list(goodgrams)) == 1:
                print "only 1 gram in two sentences!!!"
                print "sentence:", s
                print "grams before filtering:", grams
                print "grams after filtering", goodgrams
                if idofs > 0:
                    print "previous sentence:", sentences[idofs - 1]
                    print "previous grams before filtering:", ngramsOfSentence(
                        sentences[idofs - 1])
                    print "previous grams after filtering:", gramsPreviousSentence
                pause()
            gramsPreviousSentence = goodgrams

        peek(idx_raw + 1, 2000)
        if (idx_raw + 1) % 2000 == 0:
            memo.refresh()
            memo.showsize()

        #if idx_raw>6000:
        #    break

    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_grams.txt')
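mem.Memory is opaque in this listing. From the surface used here and in the next example (setInitialCapacity, learnSymbList, crosslearn, refresh, showsize, and an LTM dict-of-dicts saved via saveDF2D), a minimal stand-in might look as follows; everything beyond the method names is an assumption:

class MemorySketch(object):
    def __init__(self):
        self.LTM = {}        # symb -> {symb -> weight}
        self.capacity = 0

    def setInitialCapacity(self, n):
        self.capacity = n

    def _inc(self, a, b, w):
        self.LTM.setdefault(a, {})
        self.LTM[a][b] = self.LTM[a].get(b, 0) + w

    def learnSymbList(self, symbs):
        # pairwise co-occurrence within one symbol list
        symbs = list(symbs)
        for a in symbs:
            for b in symbs:
                if a != b:
                    self._inc(a, b, 1)

    def crosslearn(self, prev, curr, crossweight=1):
        # symmetric learning across two adjacent sentences
        for a in prev:
            for b in curr:
                if a == b:
                    continue
                self._inc(a, b, crossweight)
                self._inc(b, a, crossweight)

    def refresh(self, minweight=2):
        # prune rare pairs, then empty rows, to bound memory
        for a in list(self.LTM.keys()):
            row = self.LTM[a]
            for b in list(row.keys()):
                if row[b] < minweight:
                    del row[b]
            if not row:
                del self.LTM[a]

    def showsize(self):
        print len(self.LTM), sum(len(r) for r in self.LTM.values())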
Example 12
def memorizeCoword():

    memo = mem.Memory()
    memo.setInitialCapacity(200)

    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)

    for idx_raw, text in enumerate(ieeePapers()):
        #if idx_raw<70000: continue
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            if not grams: continue
            words = set(' '.join(grams).split())
            words = [w for w in words if w in book]
            memo.learnSymbList(words)

        peek(idx_raw + 1, 2000)
        if (idx_raw + 1) % 2000 == 0:
            memo.refresh()
            memo.showsize()

    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_word_bymemo.txt')
Example 13
def findNgrams(DO_COOCUR=False, ROUND=1):

    NGRAMS_DOC = {}
    COOCUR_S = {}  # co-occurrence in a sentence
    NGRAM_LR = {}

    for idx, text in enumerate(textFiles()):
        peek(idx + 1, 1000)

        #if idx>1000: break

        ngram_local = {}
        sentences = lang.getSentenceList(text)

        for idofs, s in enumerate(sentences):

            tokenlist = lang.tokenize(s)
            poslist = lang.posnize(tokenlist)

            #print "\n-----tokens and poslist----"
            #print ["(%s, %s)"%(t, poslist[i]) for i, t in enumerate(tokenlist)]

            if not len(tokenlist) > 5:
                #print "Anormaly of sentence:", s
                #pause()
                continue

            tokenstoplist = lang.markStops(tokenlist)

            if 0:
                print "stops:"
                print tokenstoplist
                #pause()

            if len(tokenlist) > 80:
                # skip abnormally long sentences
                continue

            ngb = lang.ngrambounds(tokenstoplist)
            #print "gram with bounds:", ngb

            selecedngb = lang.filterSRS(ngb, tokenstoplist)
            #print "\nSRS-FIL gram with bounds:", selecedngb
            selecedngb = lang.filterAdj(selecedngb, s)
            #print "\nADJ-FIL gram with bounds:", selecedngb
            selecedngb = lang.filterAdv(selecedngb, s)
            #print "\nADV-FIL gram with bounds:", selecedngb
            #selecedngb = lang.filterVerb(selecedngb, s, verbose=0) #<--- "contrast", "field" incorrectly ignored
            #print "\nVERB-FIL gram with bounds:", selecedngb

            # do it again after pure pos-based filtering
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            #print "\nFINAL selected gram with bounds:", selecedngb

            if ROUND == 1:
                # in the 1st round, profile the next word after a gram
                for (gram, leftidx, rightidx) in selecedngb:
                    nextword = lang.nextword(rightidx, tokenlist)
                    prevword = lang.prevword(leftidx, tokenlist)
                    nextwordcode = lang.profilingCode(nextword)
                    prevwordcode = lang.profilingCode(prevword)

                    kbay.inc3d(gram, '_', '_', NGRAM_LR)  # '_' as itself
                    kbay.inc3d(gram, 'l', prevwordcode, NGRAM_LR)
                    kbay.inc3d(gram, 'r', nextwordcode, NGRAM_LR)

                    if lang.isSubject(leftidx, rightidx, tokenlist):
                        kbay.inc3d(gram, '_', 's', NGRAM_LR)
                        #print "subject:", gram
                        #pause()

            if ROUND == 2:
                # in the 2nd round, justify the gram
                for ngb in selecedngb:
                    print "check this:", ngb
                    sg = grammer.subgram(ngb[0], ngb[1], ngb[2],
                                         READIN_GRAM_LR, tokenlist, poslist)
                    if sg:
                        print "gram", ngb, "subgram", sg
                        raw_input()

            if 0:
                print "\n\n", s
                print "raw   ngb >", ngb
                print "final ngb >", selecedngb
                pause()

            ngrams = [t[0] for t in selecedngb]
            ngrams = [g for g in ngrams if len(g.split()) > 1]

            kbay.count(ngrams, ngram_local)

            if DO_COOCUR:
                for n1 in ngrams:
                    for n2 in ngrams:
                        kbay.inc2d(n1, n2, COOCUR_S)

        # doc.freq. - each gram counted only once
        kbay.count(ngram_local, NGRAMS_DOC)

    kbay.saveHist3D(NGRAM_LR, SAVEDIR + 'hist.txt')

    #print "filter df-doc"
    #filterHistByCount(NGRAMS_DOC, 2, verbose=False)
    #kbay.saveDF(NGRAMS_DOC,   SAVEDIR+SrcFileName+'_ngrams_df_doc.txt', sort=False, numDoc=idx)

    if DO_COOCUR:
        print "filter coocur"
        kbay.filter2DHistByCount(COOCUR_S, 2, verbose=True)
        kbay.saveDF2D(COOCUR_S, SAVEDIR + SrcFileName + '_ngrams_coocur.txt')

    print "DONE findNgrams"
Example 14
def selectGrams():
    klog.msg('select grams')
    book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_words_4000.txt')
    #book=fif.readWordCoocur('tmp_ieee_coocur_abstractwide_word_bymemo.txt', filtervalue=2)
    CoHist = {}
    CoWord = {}
    klog.msg('looping files')
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist,
                                                     tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selecedngb = lang.filterAdj(ngb, s)
            selecedngb = lang.filterAdv(selecedngb, s)
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            #print s
            #print "\n>INITIAL grams:\n", ngb
            #print "\n>SELECTED grams:\n", selecedngb
            for g, l, r in selecedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1

            if 0:
                print text
                print "#.localgrams:", len(localGramHist)
                print localGramHist
                print "#.ngram:", len([1 for g in localGramHist if ' ' in g])
                pause()

        #kbay.countCooccur(localGramHist, CoHist)

        # calculate mutual information
        gramlist = localGramHist.keys()
        gramscore = [relativeInfo(g, gramlist, book) for g in gramlist]
        if gramscore:
            print sorted(gramscore, key=lambda x: x[1])
            averageScore = sum([g[1] for g in gramscore]) / len(gramscore)
            print "above average:", averageScore
            print [g for g in gramscore if g[1] > averageScore]
            pause()

        wordset = set([w for g in localGramHist for w in g.split()])
        kbay.countCooccur(wordset, CoWord)

        peek(idx_raw + 1, 1000)

        if (idx_raw + 1) % 4000 == 0:
            #mem.refreshMemory2D(CoWord, steps=idx_raw, prefilter=True)
            #h, hh = len(CoWord), kbay.sizeOf2DHist(CoWord)
            #print "hist size", h, hh, hh/h
            #mem.refreshMemory2DFirstOrder(CoWord, steps=idx_raw)
            kbay.saveDF2D(
                CoWord,
                'tmp_ieee_coocur_abstractwide_words_%s.txt' % (idx_raw + 1))
            CoWord = {}  # reset
            break

        if 0:
            if (idx_raw + 1) % 40000 == 0:
                kbay.saveDF2D(
                    CoHist,
                    'tmp_ieee_coocur_abstractwide_%s.txt' % (idx_raw + 1))
                CoHist = {}  # reset
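relativeInfo is not shown. Given that book maps words to co-occurrence rows and the resulting tuples are sorted on x[1], one plausible shape (entirely an assumption) scores each gram by the co-occurrence support its words receive from the words of the other grams in the same abstract:

def relativeInfoSketch(g, gramlist, book):
    words = set(g.split())
    context = set(w for other in gramlist if other != g for w in other.split())
    support, total = 0.0, 0.0
    for w in words:
        row = book.get(w, {})
        support += sum(row.get(c, 0) for c in context)
        total += sum(row.values())
    # fraction of the words' co-occurrence mass that points into this abstract
    return (g, support / total if total else 0.0)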