def forgetOnFile(ctffile, memcapacity, save2file, normalized=True):
    """Read a co-occurrence histogram from file, apply forgetting, and save the result."""
    cohist = fif.readWordCoocur(ctffile)
    cohist = forget(cohist, termfreq=None, forgetby='average',
                    memcapacity=memcapacity)  # pass the caller's capacity through to forget()
    if normalized:
        h = normalize(cohist)
    else:
        h = cohist
    fif.saveHist2D(h, save2file, splitter=',')
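# A minimal sketch of what 'average'-based forgetting plausibly does,
# assuming forget(..., forgetby='average') drops each word's co-occurrence
# counts that fall below that word's mean count (hypothetical stand-in;
# the repo's actual forget() may differ):
def forget_by_average_sketch(cohist):
    pruned = {}
    for w, hist in cohist.items():
        if not hist:
            continue
        avg = float(sum(hist.values())) / len(hist)
        kept = dict((w2, n) for w2, n in hist.items() if n >= avg)
        if kept:
            pruned[w] = kept
    return pruned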
import kbay
import fif

#co = fif.readWordCoocur('stats/pmed/pmed_word_coocur.txt')
co = fif.readWordCoocur('stats/wiki_word_coocur.txt')


def infoscore(w1, w2):
    """Lift of w2 given w1: P(w2|w1) / P(w2)."""
    try:
        n_w1 = co[w1][w1]               # occurrences of w1 (diagonal entry)
        n_w2_on_w1 = co[w1].get(w2, 0)  # co-occurrences of w2 with w1
        n_w2 = co[w2][w2]               # occurrences of w2
        n_total = 2607000               # corpus word total
        r2 = float(n_w2) / n_total
        r2on1 = float(n_w2_on_w1) / n_w1
        return r2on1 / r2
    except (KeyError, ZeroDivisionError):
        return 0


def sumscore(wordlist, wref):
    return sum([infoscore(w, wref) for w in wordlist])


if 0:
    for w1, hist in co.items():
        if not w1 in ['red', 'blue', 'black', 'green', 'orange', 'white', 'color']:
            continue
        print ">>> checking word", w1
        n_w1 = hist.get(w1)
        scores = []
        for w2, n_w2_on_w1 in hist.items():
            try:
                # the source breaks off here; scoring each neighbor the same
                # way infoscore() does is the assumed continuation
                score = (float(n_w2_on_w1) / n_w1) / (float(co[w2][w2]) / 2607000)
                scores.append((w2, score))
            except (KeyError, TypeError, ZeroDivisionError):
                continue
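# Worked toy example of the lift score P(w2|w1) / P(w2) that infoscore()
# computes (made-up numbers, illustration only):
#   co = {'red':  {'red': 100, 'blue': 20},
#         'blue': {'blue': 500}}
#   P(blue|red) = 20 / 100      = 0.2
#   P(blue)     = 500 / 2607000 ~ 1.9e-4
#   score       = 0.2 / 1.9e-4  ~ 1043   (>> 1: 'blue' is strongly tied to 'red')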
def selectGrams():
    klog.msg('select grams')
    book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_words_4000.txt')
    #book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_word_bymemo.txt', filtervalue=2)
    CoHist = {}
    CoWord = {}
    klog.msg('looping files')
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selectedngb = lang.filterAdj(ngb, s)
            selectedngb = lang.filterAdv(selectedngb, s)
            selectedngb = lang.filterSRS(selectedngb, tokenstoplist)
            #print s
            #print "\n>INITIAL grams:\n", ngb
            #print "\n>SELECTED grams:\n", selectedngb
            for g, l, r in selectedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
        if 0:
            print text
            print "#.localgrams:", len(localGramHist)
            print localGramHist
            print "#.ngram:", len([1 for g in localGramHist if ' ' in g])
            pause()
        #kbay.countCooccur(localGramHist, CoHist)
        # calculate mutual information
        gramlist = localGramHist.keys()
        gramscore = []
        for g in gramlist:
            gramscore.append(relativeInfo(g, gramlist, book))
        print sorted(gramscore, key=lambda x: x[1])
        if gramscore:  # guard against papers that yield no grams
            averageScore = sum([g[1] for g in gramscore]) / len(gramscore)
            print "above average:", averageScore
            print [g for g in gramscore if g[1] > averageScore]
        pause()
        wordset = set([w for g in localGramHist for w in g.split()])
        kbay.countCooccur(wordset, CoWord)
        peek(idx_raw + 1, 1000)
        if (idx_raw + 1) % 4000 == 0:
            #mem.refreshMemory2D(CoWord, steps=idx_raw, prefilter=True)
            #h, hh = len(CoWord), kbay.sizeOf2DHist(CoWord)
            #print "hist size", h, hh, hh/h
            #mem.refreshMemory2DFirstOrder(CoWord, steps=idx_raw)
            kbay.saveDF2D(CoWord, 'tmp_ieee_coocur_abstractwide_words_%s.txt' % (idx_raw + 1))
            CoWord = {}  # reset
            break  # stop after the first batch save
        if 0:
            if (idx_raw + 1) % 40000 == 0:
                kbay.saveDF2D(CoHist, 'tmp_ieee_coocur_abstractwide_%s.txt' % (idx_raw + 1))
                CoHist = {}  # reset
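# A minimal sketch of a relativeInfo-style scorer, assuming it rates a gram
# by the average lift of its words against the words of the other grams in
# the same abstract, using the word co-occurrence book (assumption; the
# repo's relativeInfo may differ). It returns a (gram, score) pair, which
# matches how gramscore is sorted and filtered above.
def relative_info_sketch(g, gramlist, book, n_total=2607000):
    others = set(w for gg in gramlist if gg != g for w in gg.split())
    total, npairs = 0.0, 0
    for w in g.split():
        n_w = book.get(w, {}).get(w, 0)
        if not n_w:
            continue
        for w2 in others:
            n_w2 = book.get(w2, {}).get(w2, 0)
            if not n_w2:
                continue
            lift = (float(book[w].get(w2, 0)) / n_w) / (float(n_w2) / n_total)
            total += lift
            npairs += 1
    return (g, total / npairs if npairs else 0.0)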
    while depth < max_depth:
        new_front = []
        for new in front_nodes:
            if not new in visited:
                # visit this node
                new_front = new_front + __expand(co, new, max_width)
                # after the visit
                visited.append(new)
        # finish one depth
        depth += 1
        front_nodes = new_front  # update the front
        # logging
        print "depth", depth, "new front", set(front_nodes)
    # final result: all found nodes
    return set(visited)


if __name__ == '__main__':
    #print "nodes", nodes
    #print "#. nodes", len(nodes)
    hist_coocur = fif.readWordCoocur(coocurfile)
    selected = expand_network(hist_coocur, "red", max_depth=12, max_width=5)
    #trans_csv_selected(hist_coocur, ["red"])
    trans_csv_selected(hist_coocur, selected, max_width=5)
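# A minimal sketch of the __expand step used in the loop above, assuming it
# returns the (up to) max_width neighbors of a node with the highest
# co-occurrence counts (hypothetical; the actual helper may rank or filter
# differently):
def __expand_sketch(co, node, max_width):
    neighbors = [(w, n) for w, n in co.get(node, {}).items() if w != node]
    neighbors.sort(key=lambda x: -x[1])
    return [w for w, n in neighbors[:max_width]]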
if 0:
    #TODO - a way to register the nbatch
    read_gutefrage(FIN, TF, debug=0)
    raw_input('-----')
    fif.saveTF(TF, totalWordCount=1, fname=FOUT_TF)
    raw_input('--- tf done ---')

if 1:
    print "\n --- coocur ctf ---"
    tf = fif.readTF(FOUT_TF, {}, threshold=5)
    print "selected tf size", len(tf)
    raw_input('-----')
    do_ctf(FIN, tf, refreshperiod=2 * 1e4, denoise_ratio=0.0019, cutoff=2, ctf=CTF)
    fif.saveHist2D(CTF, FOUT_CTF)
    raw_input('--- ctf done ---')

if 1:
    print "\n --- semantic ---"
    ctf = fif.readWordCoocur(FOUT_CTF, 1)
    print "> initial ctf size", mem.histsize(ctf)
    ctf3 = do_semantic(FIN, ctf, refresh_period=1e4, ctf3={})
    print "> final size", histsize_3d(ctf3)
    fif.saveHist3D(ctf3, FOUT_CTF3)
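# A minimal sketch of histsize_3d, assuming ctf3 is a three-level dict of
# counts {w1: {w2: {w3: n}}} (hypothetical helper; the repo's own version
# may differ):
def histsize_3d_sketch(hist3):
    return sum(len(h2) for h1 in hist3.values() for h2 in h1.values())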
                 grammode='ng', denoise_ratio=0.003)
    # save the 2D conditional tf
    fif.saveHist2D(ctf, FOUT_CTF)

if TEST_MEMORY:
    mem.forgetOnFile(ctffile=fname_ctf, memcapacity=100, save2file=fname_ctf_mem)
    # it does not finish
    # size before forget: 100201 : 34424319
    # ==> filter when reading the file!!!

if RUN_SEMANTIC:
    ctf = fif.readWordCoocur(fname_ctf_mem, 10)
    print "> initial ctf size", mem.histsize(ctf)
    ctf3 = semantic(ctf, grammode=GRAM_MODE, ctf3={})
    print "> final size", histsize_3d(ctf3)
    fif.saveHist3D(ctf3, fname_semet3)


def semantic4(significants, ctf={}):
    for nbatch, terms in enumerate(termsOfBatch()):
        for t in terms:
            if not t in significants:
                continue
            related = significants[t]
            for tt in terms:
                if tt == t:
                    continue
                if not tt in related:
                    continue
                # the source breaks off here; counting the (t, tt) pair in
                # the conditional histogram is the assumed continuation
                ctf.setdefault(t, {})
                ctf[t][tt] = ctf[t].get(tt, 0) + 1
    return ctf
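# Data-shape note: ctf is a 2D conditional-frequency histogram
# {w1: {w2: count}}, with the diagonal ctf[w][w] holding w's own count
# (as infoscore() relies on); ctf3 is presumably the 3D analogue
# {w1: {w2: {w3: count}}} saved by fif.saveHist3D.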