def coocurWordFirstOrder():
    # Count first-order word co-occurrence over the corpus and save the
    # filtered 2-D histogram.
    klog.msg('find word coocur')
    Hist = {}
    nNumSentences = 0
    for nFiles, text in enumerate(textFiles()):
        sentences = lang.getSentenceList(text)
        nNumSentences += len(sentences)
        if nFiles % 1000 == 1:
            print nFiles, " files: #.sentences", nNumSentences
        for ids, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            tokenlist = lang.regularwords(tokenlist)
            selected = [t for t in tokenlist if not lang.isNoise(t)]
            #kbay.countCooccur(selected, Hist)
            kbay.countCooccurNearest(selected, Hist, nearest=10)
        if nFiles % 1000 == 1:
            refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            print "hist size", h, hh, hh / h
        peek(nFiles + 1, 1000)
        if nFiles > 100000:
            break
    print "number of sentences:", nNumSentences
    klog.msg("finished total %s files" % nFiles)
    refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)
    kbay.saveDF2D(Hist, SAVEDIR + SrcFileName + '_word_coocur.txt')
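# kbay.countCooccurNearest is not defined in this file. A minimal sketch of
# the assumed behaviour (hypothetical re-implementation, not the real kbay
# code): pair every token with the tokens that follow it within a window of
# `nearest` positions, accumulating symmetric counts in a dict-of-dicts.
def _countCooccurNearestSketch(tokens, hist2d, nearest=10):
    for i, w in enumerate(tokens):
        for v in tokens[i + 1:i + 1 + nearest]:
            if v == w:
                continue
            row = hist2d.setdefault(w, {})
            row[v] = row.get(v, 0) + 1
            row = hist2d.setdefault(v, {})
            row[w] = row.get(w, 0) + 1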
def findRelatedGrams():
    klog.msg('find co-exist terms')
    Hist = {}
    for idx_raw, text in enumerate(ieeePapers()):
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            kbay.countCooccurConsideringSubgram(grams, Hist, gramDF=GRAM_DF, debug=0)
        if idx_raw % 1000 == 0:
            mem.refreshMemory2D(Hist, steps=idx_raw, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            print "hist size", h, hh, hh / h
        peek(idx_raw + 1, 1000)
        if idx_raw > 200000:
            break
    kbay.filter2DHistByCount(Hist, 2, verbose=True)
    kbay.saveDF2D(Hist, 'tmp_ieee_occur_all.txt')
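# The "hist size" line above prints (#rows, #pairs, pairs-per-row). A sketch
# of the assumed kbay.sizeOf2DHist (hypothetical): the total number of stored
# (key, key) pairs in the dict-of-dicts histogram.
def _sizeOf2DHistSketch(hist2d):
    return sum(len(row) for row in hist2d.itervalues())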
def coocurWord(save2file, verbose=0):
    # Same co-occurrence pass as above, but over Wikipedia data and with a
    # tight nearest=2 window.
    klog.msg('find word coocur')
    Hist = {}
    for nFiles, (datafile, text) in enumerate(loopWikiData()):
        sentences = cleaner.getSentences_regex(text)
        for s in sentences:
            #selected = set(cleaner.capitalizedWordsOf(s))
            selected = cleaner.tokenize_simple(s)
            if verbose:
                print "sentence:", s
                print "selected:", selected
                pause()
            #kbay.countCooccurNearest(selected, Hist, nearest=10)
            kbay.countCooccurNearest(selected, Hist, nearest=2)
            #print "learn", selected
            #print Hist, len(Hist)
            #pause()
        if nFiles % 1000 == 0:
            print nFiles, " files done"
        if nFiles % 4000 == 0:
            print "before mem refresh:", len(Hist)
            memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            print "after mem refresh:", len(Hist), '\n'
        if nFiles > 40000:
            break
    klog.msg("finished total %s files" % nFiles)
    memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)
    fif.saveHist2D(Hist, save2file)
    return Hist
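# refreshMemory2D (mem / memoryRefresh above) is what bounds the histogram's
# memory footprint between checkpoints. A minimal sketch of the assumed
# pruning (hypothetical; the real module may scale its threshold with
# `steps`): drop pairs whose count is still below a floor, then drop rows
# that became empty.
def _refreshMemory2DSketch(hist2d, mincount=2):
    for w in hist2d.keys():  # .keys() returns a list in py2, so deleting while looping is safe
        row = hist2d[w]
        for v in row.keys():
            if row[v] < mincount:
                del row[v]
        if not row:
            del hist2d[w]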
def rank():
    klog.msg('rank words in text based on cross-info score')
    for nFiles, text in enumerate(textFiles()):
        sentences = lang.getSentenceList(text)
        wordprof = {}  # per-text word frequency profile
        wordscores = []
        print "############ TEXT #############"
        print text
        for ids, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            tokenlist = lang.regularwords(tokenlist)
            selected = set([t for t in tokenlist if not lang.isNoise(t)])
            for w in selected:
                wordprof[w] = wordprof.get(w, 0) + 1
            print s
            for w in selected:
                # score w against the other words of the same sentence
                score = memory.sumscore([ww for ww in selected if ww != w], w)
                wordscores.append((w, score))
            print sorted(wordscores, key=lambda x: x[-1], reverse=True)
            wordscores = []
            pause()
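# memory.sumscore is the "cross-info score" used above. A minimal sketch of
# the assumed scoring (hypothetical; `paircount`, `wordcount` and `total`
# stand for the co-occurrence and unigram statistics learned by the passes
# above): sum a PMI-like term between w and each context word.
import math

def _sumscoreSketch(contextwords, w, paircount, wordcount, total):
    score = 0.0
    for c in contextwords:
        nwc = paircount.get(w, {}).get(c, 0)
        if nwc and wordcount.get(w) and wordcount.get(c):
            score += math.log(float(nwc) * total / (wordcount[w] * wordcount[c]))
    return score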
def selectGrams():
    klog.msg('select grams')
    book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_words_4000.txt')
    #book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_word_bymemo.txt', filtervalue=2)
    CoHist = {}
    CoWord = {}
    klog.msg('looping files')
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selecedngb = lang.filterAdj(ngb, s)
            selecedngb = lang.filterAdv(selecedngb, s)
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            #print s
            #print "\n>INITIAL grams:\n", ngb
            #print "\n>SELECTED grams:\n", selecedngb
            for g, l, r in selecedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
        if 0:
            print text
            print "#.localgrams:", len(localGramHist)
            print localGramHist
            print "#.ngram:", len([1 for g in localGramHist if ' ' in g])
            pause()
        #kbay.countCooccur(localGramHist, CoHist)
        # score each local gram by its mutual information with the others
        gramlist = localGramHist.keys()
        gramscore = []
        for g in gramlist:
            gramscore.append(relativeInfo(g, gramlist, book))
        print sorted(gramscore, key=lambda x: x[1])
        if gramscore:  # guard against papers that yield no grams
            averageScore = sum([g[1] for g in gramscore]) / len(gramscore)
            print "above average:", averageScore
            print [g for g in gramscore if g[1] > averageScore]
        pause()
        wordset = set([w for g in localGramHist for w in g.split()])
        kbay.countCooccur(wordset, CoWord)
        peek(idx_raw + 1, 1000)
        if (idx_raw + 1) % 4000 == 0:
            #mem.refreshMemory2D(CoWord, steps=idx_raw, prefilter=True)
            #h, hh = len(CoWord), kbay.sizeOf2DHist(CoWord)
            #print "hist size", h, hh, hh/h
            #mem.refreshMemory2DFirstOrder(CoWord, steps=idx_raw)
            kbay.saveDF2D(CoWord, 'tmp_ieee_coocur_abstractwide_words_%s.txt' % (idx_raw + 1))
            CoWord = {}  # reset
            break
        if 0:
            if (idx_raw + 1) % 40000 == 0:
                kbay.saveDF2D(CoHist, 'tmp_ieee_coocur_abstractwide_%s.txt' % (idx_raw + 1))
                CoHist = {}  # reset
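# relativeInfo (used in selectGrams) is not defined in this excerpt. A sketch
# of the assumed contract (hypothetical): return (gram, score), where score
# averages the co-occurrence evidence in `book` -- the word co-occurrence
# table loaded at the top of selectGrams -- between the gram's words and the
# words of the other grams from the same paper.
def _relativeInfoSketch(g, gramlist, book):
    gwords = g.split()
    others = set(w for gg in gramlist if gg != g for w in gg.split())
    if not others:
        return (g, 0.0)
    hits = sum(book.get(w, {}).get(o, 0) for w in gwords for o in others)
    return (g, float(hits) / (len(gwords) * len(others)))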