def findDFOfWords(): DF = {} for nFiles, text in enumerate(ieeePapers()): sentences = lang.getSentenceList(text) words = set(sum([wordlist(s, regular=True) for s in sentences], [])) for w in words: DF[w] = DF.get(w, 0) + 1 if nFiles % 1000 == 0: print "finished", nFiles, "files" print "finished total", nFiles, "files" kbay.saveDF(DF, _SAVEDIR_ + SrcFileName + '_df.txt', sort=False, numDoc=idx)
def df_words(): print "find df of words..." DF = {} for nFiles, text in enumerate(textFiles()): sentences = lang.getSentenceList(text) words = set(sum([wordlist(s, regular=True) for s in sentences], [])) kbay.count(words, DF) peek(nFiles, 1000) print "finished total", nFiles, "files" kbay.saveDF(DF, SAVEDIR+SrcFileName+'_df.txt', sort=False, numDoc=nFiles)
def findDFOfGrams():
    """Compute the document frequency of n-grams over ieeePapers() and save it.

    For every text, each sentence is tokenized, stop positions are marked and
    then re-marked from the verb marks and the non-ending marks, and the
    n-gram bounds derived from them are filtered by lang.filterAdj,
    lang.filterAdv and lang.filterSRS.  Every surviving gram is recorded,
    and for grams longer than the sub-gram size, so are its 2-, 3- and
    4-word sub-grams whose looked-up POS tag contains "N" or "X".  A gram
    counts once per text no matter how often it occurs there.  The table is
    written to 'ieeeGramDF.txt' with kbay.saveDF together with the number of
    texts processed.
    """
    gramHist = {}
    # Real document count: defined even when ieeePapers() yields nothing, and
    # one more than the last zero-based enumerate index.
    numDocs = 0
    for idx_raw, text in enumerate(ieeePapers()):
        numDocs = idx_raw + 1
        localGramHist = {}  # gram -> count within this text

        for s in lang.getSentenceList(text):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            # Computed from the stop list *before* it is redone below.
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)

            selectedngb = lang.ngrambounds(tokenstoplist)
            selectedngb = lang.filterAdj(selectedngb, s)
            selectedngb = lang.filterAdv(selectedngb, s)
            selectedngb = lang.filterSRS(selectedngb, tokenstoplist)

            for g, l, r in selectedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
                words = g.split()

                # Sub-grams of 2, 3 and 4 words, taken only from grams that
                # are strictly longer than the sub-gram itself.
                for size in (2, 3, 4):
                    if len(words) <= size:
                        continue
                    for ii in range(len(words) - size + 1):
                        # NOTE(review): poslist covers the whole sentence but
                        # is indexed here by the offset inside the gram; the
                        # bounds l and r are never used.  Presumably the index
                        # should be relative to the gram's position in the
                        # sentence -- confirm against lang.ngrambounds.
                        posEndingWord = poslist[ii + size - 1]
                        if "N" in posEndingWord or "X" in posEndingWord:
                            gg = " ".join(words[ii:ii + size])
                            localGramHist[gg] = localGramHist.get(gg, 0) + 1

        # save the local grams: one increment per text containing the gram
        for g in localGramHist:
            gramHist[g] = gramHist.get(g, 0) + 1

        peek(numDocs, 2000)

    kbay.saveDF(gramHist, 'ieeeGramDF.txt', sort=False, numDoc=numDocs)
def filterGramDF(numDoc=460035):
    """Re-save the gram DF table keeping only entries that pass the filter.

    Reads 'ieeeGramDF.txt' through fif.readWithFilter with filtervalue=3 and
    writes the result to 'ieeeGramDF_above3.txt' with kbay.saveDF.

    numDoc -- document count stored with the filtered table.  The default is
              the value that used to be hard-coded here (presumably the size
              of the corpus the input file was built from -- confirm); pass
              the real count when filtering a table built from another
              corpus.
    """
    minDF = 3  # threshold handed to fif.readWithFilter
    book = fif.readWithFilter('ieeeGramDF.txt', filtervalue=minDF)
    kbay.saveDF(book, 'ieeeGramDF_above3.txt', sort=False, numDoc=numDoc)