def coocurWordFirstOrder(): klog.msg('find word coocur') Hist = {} nNumSentences = 0 for nFiles, text in enumerate(textFiles()): sentences = lang.getSentenceList(text) nNumSentences += len(sentences) if nFiles > 100000: break if nFiles % 1000 == 1: print nFiles, " files: #.sentences", nNumSentences continue for ids, s in enumerate(sentences): tokenlist = lang.tokenize(s) tokenlist = lang.regularwords(tokenlist) selected = [t for t in tokenlist if not lang.isNoise(t)] #kbay.countCooccur(selected, Hist) kbay.countCooccurNearest(selected, Hist, nearest=10) if nFiles % 1000 == 1: refreshMemory2D(Hist, steps=nFiles, prefilter=True) h, hh = len(Hist), kbay.sizeOf2DHist(Hist) print "hist size", h, hh, hh / h peek(nFiles + 1, 1000) if nFiles > 100000: break print "number of sentences:", nNumSentences return klog.msg("finished total %s files" % nFiles) refreshMemory2D(Hist, steps=nFiles, prefilter=True) kbay.filter2DHistByCount(Hist, 3, verbose=True) kbay.saveDF2D(Hist, SAVEDIR + SrcFileName + '_word_coocur.txt')
def findRelatedGrams(): klog.msg('find co-exist terms') Hist = {} for idx_raw, text in enumerate(ieeePapers()): sentences = lang.getSentenceList(text) for idofs, s in enumerate(sentences): grams = ngramsOfSentence(s) kbay.countCooccurConsideringSubgram(grams, Hist, gramDF=GRAM_DF, debug=0) if idx_raw % 1000 == 0: mem.refreshMemory2D(Hist, steps=idx_raw, prefilter=True) h, hh = len(Hist), kbay.sizeOf2DHist(Hist) print "hist size", h, hh, hh / h peek(idx_raw + 1, 1000) if idx_raw > 200000: break kbay.filter2DHistByCount(Hist, 2, verbose=True) kbay.saveDF2D(Hist, 'tmp_ieee_occur_all.txt')
def selectTest(): print "select test ..." book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4) print "book size:", len(book) for idx_raw, text in enumerate(ieeePapers()): print text sentences = lang.getSentenceList(text) localHist = {} scoreByLang = {} gramLeftRight = {} for idofs, s in enumerate(sentences): tokenlist = lang.tokenize(s) poslist = lang.posLookup(tokenlist) tokenstoplist = lang.markStops(tokenlist) tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False) tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist) tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList) tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList) ngb = lang.ngrambounds(tokenstoplist) selecedngb = lang.filterAdj(ngb, s) selecedngb = lang.filterAdv(selecedngb, s) selecedngb = lang.filterSRS(selecedngb, tokenstoplist) for g, l, r in selecedngb: localHist[g] = localHist.get(g, 0) + 1 scoreByLang[g] = scoreByLang.get(g, 0) + linguisticScore( g, l, r, tokenlist) if not g in gramLeftRight: gramLeftRight[g] = [] lefttoken = '<L>' + ('#BEGIN' if l == 0 else tokenlist[l - 1]) righttoken = '<R>' + ('#END' if r >= (len(tokenlist) - 1) else tokenlist[r + 1]) gramLeftRight[g].append((lefttoken, righttoken)) # scores scoreByDF = {} totalDF = 0 for g in localHist: scoreByDF[g] = book.get(g, 0) totalDF = scoreByDF[g] averageDF = totalDF / len(scoreByDF) sortedByDF = sorted(scoreByDF.items(), key=lambda x: x[1], reverse=True) print sortedByDF print "average DF", averageDF print "gram with DF above average" print[(g, count) for (g, count) in sortedByDF if count > averageDF] print "gram with DF below average" print[(g, count) for (g, count) in sortedByDF if not count > averageDF] print "lang score:" print scoreByLang print "gram left right" print gramLeftRight pause()
def testPos():
    """Smoke-test raw POS marking: tokenize every sentence of every paper,
    run markRawPos over the tokens, and pause after each paper."""
    for paper_no, paper_text in enumerate(ieeePapers()):
        for sent_no, sentence in enumerate(lang.getSentenceList(paper_text)):
            tokens = lang.tokenize(sentence)
            raw_pos_marks = lang.markRawPos(tokens)
            #lang.markVerbs(tokenlist)
        pause()
def recommendTerms(): dfbook = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4) cobook = fif.readCoocurWithFilterFunc( 'tmp_ieee_coocur_abstractwide_grams.txt', dfbook) for idx_raw, text in enumerate(ieeePapers()): sentences = lang.getSentenceList(text) coHist = {} # coocur_gram -> df for grams in abstract localHist = {} # local gram -> occurrence count for idofs, s in enumerate(sentences): grams = ngramsOfSentence(s) for g in grams: localHist[g] = localHist.get(g, 0) + 1 for g in localHist: cograms = cobook.get(g, []) for gg in cograms: coHist[gg] = coHist.get(gg, []) coHist[gg].append(g) # just by mention/occurrence score = {} for g in localHist: cograms = cobook.get(g, []) if not g in cobook: continue gcount = cobook[g][g] for gg in cograms: if gg == g: continue cocount = cobook[g][gg] # ignore those with only one-degree of relavance for the moment if not len(coHist[gg]) > 1: continue score[gg] = score.get(gg, 0) + float(cocount) / gcount fluxAndPosterior = {} for g, colist in coHist.items(): if not len(g.split()) > 1: continue if len(colist) > 1: fluxAndPosterior[g] = (score[g], colist) print "grams of text:" print localHist.keys() print "cogram having influx > 2 ..." for g, colist in coHist.items(): if len(colist) > 1: print g, colist print "select from coHist ..." print sorted(coHist.items(), key=lambda x: len(x[1]), reverse=True)[:20] print "select from posterior..." print sorted(fluxAndPosterior.items(), key=lambda x: x[1][0]) pause()
def findDFOfWords(): DF = {} for nFiles, text in enumerate(ieeePapers()): sentences = lang.getSentenceList(text) words = set(sum([wordlist(s, regular=True) for s in sentences], [])) for w in words: DF[w] = DF.get(w, 0) + 1 if nFiles % 1000 == 0: print "finished", nFiles, "files" print "finished total", nFiles, "files" kbay.saveDF(DF, _SAVEDIR_ + SrcFileName + '_df.txt', sort=False, numDoc=idx)
def df_words(): print "find df of words..." DF = {} for nFiles, text in enumerate(textFiles()): sentences = lang.getSentenceList(text) words = set(sum([wordlist(s, regular=True) for s in sentences], [])) kbay.count(words, DF) peek(nFiles, 1000) print "finished total", nFiles, "files" kbay.saveDF(DF, SAVEDIR+SrcFileName+'_df.txt', sort=False, numDoc=nFiles)
def findDFOfGrams():
    """Count the document frequency of selected grams — and of their
    noun-ending sub-grams of length 2..4 — across IEEE papers, then save.

    Each gram counts at most once per paper (document frequency).
    """
    gramHist = {}  # gram -> number of papers containing it
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count within the current paper
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            # stop-marker list refined by verb and non-ending marks
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selecedngb = lang.filterAdj(ngb, s)
            selecedngb = lang.filterAdv(selecedngb, s)
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            for g, l, r in selecedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
                words = g.split()
                # Also count sub-grams of length 2..4 that end on a noun-like
                # ("N"/"X") POS tag.  FIX: the POS of sub-gram ending word
                # words[ii+k-1] lives at sentence position l+ii+k-1 (the gram
                # spans tokens l..r — confirm with lang.ngrambounds); the
                # original indexed poslist with the gram-relative index and so
                # read the wrong token's POS whenever the gram did not start
                # at the beginning of the sentence.  The three near-identical
                # loops for lengths 2, 3 and 4 are unified here.
                for sublen in (2, 3, 4):
                    if len(words) < sublen + 1:
                        break  # longer sub-grams cannot fit either
                    for ii in range(len(words) - sublen + 1):
                        posEndingWord = poslist[l + ii + sublen - 1]
                        if "N" in posEndingWord or "X" in posEndingWord:
                            gg = " ".join(words[ii:ii + sublen])
                            localGramHist[gg] = localGramHist.get(gg, 0) + 1
        # save the local grams: fold into DF, each gram once per paper
        for g in localGramHist:
            gramHist[g] = gramHist.get(g, 0) + 1
        peek(idx_raw + 1, 2000)
    kbay.saveDF(gramHist, 'ieeeGramDF.txt', sort=False, numDoc=idx_raw)
def findPattern(DO_COOCUR=False, ROUND=1):
    """Feed every tokenized sentence of each paper into the sequence learner,
    then display the patterns memorized so far and pause per paper."""
    for raw_index, paper_text in enumerate(ieeePapers()):
        doc_number = raw_index + 1
        peek(doc_number, 1000)
        #if idx>10000: break
        local_ngrams = {}  # kept for parity with sibling functions; unused here
        for sent_no, sentence in enumerate(lang.getSentenceList(paper_text)):
            tokens = lang.tokenize(sentence)
            seq.learnNewSequence(tokens)
            seq.blindMemorize(tokens)
        seq.showPatterns()
        pause()
def rank(): klog.msg('rank words in text based on cross-info score') for nFiles, text in enumerate(textFiles()): sentences = lang.getSentenceList(text) wordprof = {} wordscores = [] print "############ TEXT #############" print text for ids, s in enumerate(sentences): tokenlist = lang.tokenize(s) tokenlist = lang.regularwords(tokenlist) selected = set([t for t in tokenlist if not lang.isNoise(t)]) for w in selected: wordprof[w] = wordprof.get(w, 0) + 1 print s for w in selected: s = memory.sumscore([ww for ww in selected if ww != w], w) wordscores.append((w, s)) print sorted(wordscores, key=lambda x: x[-1], reverse=True) wordscores = [] pause()
def memorizeCogram():
    """Learn gram co-occurrence across neighboring sentences with the Memory
    model and dump its long-term memory (LTM) to disk.

    Grams are filtered against the DF book; each sentence's surviving grams
    are learned together, and consecutive sentences' gram sets are
    cross-learned in both directions.
    """
    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    memo = mem.Memory()
    memo.setInitialCapacity(200)
    for idx_raw, text in enumerate(ieeePapers()):
        #if idx_raw<220000: continue
        sentences = lang.getSentenceList(text)
        gramsPreviousSentence = set([])
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            if not grams:
                continue
            goodgrams = set([g for g in grams if g in book])
            memo.learnSymbList(goodgrams)
            # grams of previous sentence: learn grams of current sentence
            # grams of current sentence: learn grams of previous sentence
            memo.crosslearn(gramsPreviousSentence, goodgrams, crossweight=1)
            # FIX: removed an unreachable `if 0 and ...:` debugging block here
            # that printed the raw/filtered grams of this and the previous
            # sentence — dead code, never executed.
            gramsPreviousSentence = goodgrams
        peek(idx_raw + 1, 2000)
        if (idx_raw + 1) % 2000 == 0:
            # periodic decay/compaction of the memory model
            memo.refresh()
            memo.showsize()
        #if idx_raw>6000:
        #    break
    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_grams.txt')
def memorizeCoword():
    """Learn abstract-wide word co-occurrence through the Memory model and
    dump its long-term memory (LTM) to disk."""
    memo = mem.Memory()
    memo.setInitialCapacity(200)
    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    for paper_no, paper_text in enumerate(ieeePapers()):
        #if idx_raw<70000: continue
        for sent_no, sentence in enumerate(lang.getSentenceList(paper_text)):
            grams = ngramsOfSentence(sentence)
            if not grams:
                continue
            # unique words drawn from all grams, kept only if in the DF book
            vocabulary = set(' '.join(grams).split())
            known_words = [w for w in vocabulary if w in book]
            memo.learnSymbList(known_words)
        peek(paper_no + 1, 2000)
        if (paper_no + 1) % 2000 == 0:
            memo.refresh()
            memo.showsize()
    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_word_bymemo.txt')
def findNgrams(DO_COOCUR=False, ROUND=1):
    """Extract candidate n-grams from every text file.

    ROUND == 1: profile each selected gram's neighbors (previous/next word
    codes and subject position) into the 3D histogram NGRAM_LR, saved at the
    end.  ROUND == 2: interactively justify grams against READIN_GRAM_LR
    (presumably a profile loaded at module level — confirm; it is not defined
    here).  With DO_COOCUR, also count sentence-level gram co-occurrence.
    """
    NGRAMS_DOC = {}  # gram -> document frequency
    COOCUR_S = {}  # co-occurrence in a sentence
    NGRAM_LR = {}  # gram -> {'_'/'l'/'r' -> {code -> count}}
    for idx, text in enumerate(textFiles()):
        peek(idx + 1, 1000)
        #if idx>1000: break
        ngram_local = {}  # grams of the current document (counted once each)
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posnize(tokenlist)
            #print "\n-----tokens and poslist----"
            #print ["(%s, %s)"%(t, poslist[i]) for i, t in enumerate(tokenlist)]
            # skip very short sentences
            if not len(tokenlist) > 5:
                #print "Anormaly of sentence:", s
                #pause()
                continue
            tokenstoplist = lang.markStops(tokenlist)
            if 0:  # disabled debug output
                print "stops:"
                print tokenstoplist
                #pause()
            # skip very long sentences
            if len(tokenlist) > 80:
                continue
                # NOTE(review): the prints below sit after `continue` and are
                # unreachable — apparently disabled debug output; indentation
                # reconstructed from a collapsed original, confirm placement.
                print "###### text ######", idx
                print text
                print tokenlist, len(tokenlist)
                #pause()
            # candidate grams (with token bounds) and POS-based filtering
            ngb = lang.ngrambounds(tokenstoplist)
            #print "gram with bounds:", ngb
            selecedngb = lang.filterSRS(ngb, tokenstoplist)
            #print "\nSRS-FIL gram with bounds:", selecedngb
            selecedngb = lang.filterAdj(selecedngb, s)
            #print "\nADJ-FIL gram with bounds:", selecedngb
            selecedngb = lang.filterAdv(selecedngb, s)
            #print "\nADV-FIL gram with bounds:", selecedngb
            #selecedngb = lang.filterVerb(selecedngb, s, verbose=0) #<--- "contrast", "field" incorrectly ignored
            #print "\nVERB-FIL gram with bounds:", selecedngb
            # do it again after pure pos-based filtering
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            #print "\nFINAL selected gram with bounds:", selecedngb
            if ROUND == 1:
                # in the 1st round, profile the next word after a gram
                for (gram, leftidx, rightidx) in selecedngb:
                    nextword = lang.nextword(rightidx, tokenlist)
                    prevword = lang.prevword(leftidx, tokenlist)
                    nextwordcode = lang.profilingCode(nextword)
                    prevwordcode = lang.profilingCode(prevword)
                    kbay.inc3d(gram, '_', '_', NGRAM_LR)  # '_' as itself
                    kbay.inc3d(gram, 'l', prevwordcode, NGRAM_LR)
                    kbay.inc3d(gram, 'r', nextwordcode, NGRAM_LR)
                    if lang.isSubject(leftidx, rightidx, tokenlist):
                        kbay.inc3d(gram, '_', 's', NGRAM_LR)
                        #print "subject:", gram
                        #pause()
            if ROUND == 2:
                # in the 2nd round, justify the gram interactively
                for ngb in selecedngb:
                    print "check this:", ngb
                    sg = grammer.subgram(ngb[0], ngb[1], ngb[2],
                                         READIN_GRAM_LR, tokenlist, poslist)
                    if sg:
                        print "gram", ngb, "subgram", sg
                        raw_input()
            if 0:  # disabled debug output
                print "\n\n", s
                print "raw ngb >", ngb
                print "final ngb >", selecedngb
                pause()
            # keep only multi-word grams for counting
            ngrams = [t[0] for t in selecedngb]
            ngrams = [g for g in ngrams if len(g.split()) > 1]
            kbay.count(ngrams, ngram_local)
            if DO_COOCUR:
                # pairwise (including self-pairs) sentence co-occurrence
                for n1 in ngrams:
                    for n2 in ngrams:
                        kbay.inc2d(n1, n2, COOCUR_S)
        # doc.freq. - each gram counted only once
        kbay.count(ngram_local, NGRAMS_DOC)
    kbay.saveHist3D(NGRAM_LR, SAVEDIR + 'hist.txt')
    #print "filter df-doc"
    #filterHistByCount(NGRAMS_DOC, 2, verbose=False)
    #kbay.saveDF(NGRAMS_DOC, SAVEDIR+SrcFileName+'_ngrams_df_doc.txt', sort=False, numDoc=idx)
    if DO_COOCUR:
        print "filter coocur"
        kbay.filter2DHistByCount(COOCUR_S, 2, verbose=True)
        kbay.saveDF2D(COOCUR_S, SAVEDIR + SrcFileName + '_ngrams_coocur.txt')
    print "DONE findNgrams"
def selectGrams():
    """Select grams per paper by relative-information score against a
    pre-computed word co-occurrence book, printing results interactively.

    Also accumulates a word co-occurrence histogram (CoWord), saved once
    after the first 4000 papers — note the `break` below deliberately stops
    the loop at that point (matching the _4000 file this function reads).
    """
    klog.msg('select grams')
    book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_words_4000.txt')
    #book=fif.readWordCoocur('tmp_ieee_coocur_abstractwide_word_bymemo.txt', filtervalue=2)
    CoHist = {}  # gram co-occurrence — never populated (see disabled call below)
    CoWord = {}  # word -> {word -> co-occurrence count}
    klog.msg('looping files')
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            # stop-marker list refined by verb and non-ending marks
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            # candidate grams with bounds, filtered by POS heuristics
            ngb = lang.ngrambounds(tokenstoplist)
            selecedngb = lang.filterAdj(ngb, s)
            selecedngb = lang.filterAdv(selecedngb, s)
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            #print s
            #print "\n>INITIAL grams:\n", ngb
            #print "\n>SELECTED grams:\n", selecedngb
            for g, l, r in selecedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
        if 0:  # disabled debug output
            print text
            print "#.localgrams:", len(localGramHist)
            print localGramHist
            print "#.ngram:", len([1 for g in localGramHist if ' ' in g])
            pause()
        #kbay.countCooccur(localGramHist, CoHist)
        # calculate mutual information
        gramlist = localGramHist.keys()
        gramscore = []
        for g in gramlist:
            gramscore.append(relativeInfo(g, gramlist, book))
        print sorted(gramscore, key=lambda x: x[1])
        # NOTE(review): divides by len(gramscore) — a paper with no grams
        # would raise ZeroDivisionError; left as-is, confirm inputs.
        averageScore = sum([g[1] for g in gramscore]) / len(gramscore)
        print "above average:", averageScore
        print [g for g in gramscore if g[1] > averageScore]
        pause()
        # accumulate word-level co-occurrence from this paper's gram words
        wordset = set([w for g in localGramHist for w in g.split()])
        kbay.countCooccur(wordset, CoWord)
        peek(idx_raw + 1, 1000)
        if (idx_raw + 1) % 4000 == 0:
            #mem.refreshMemory2D(CoWord, steps=idx_raw, prefilter=True)
            #h, hh = len(CoWord), kbay.sizeOf2DHist(CoWord)
            #print "hist size", h, hh, hh/h
            #mem.refreshMemory2DFirstOrder(CoWord, steps=idx_raw)
            kbay.saveDF2D(
                CoWord,
                'tmp_ieee_coocur_abstractwide_words_%s.txt' % (idx_raw + 1))
            CoWord = {}  # reset
            break
        if 0:  # disabled: would periodically dump CoHist (which stays empty)
            if (idx_raw + 1) % 40000 == 0:
                kbay.saveDF2D(
                    CoHist,
                    'tmp_ieee_coocur_abstractwide_%s.txt' % (idx_raw + 1))
                CoHist = {}  # reset