from sklearn.feature_extraction.text import CountVectorizer

from common import readTextFromFile
from common import getTextFromHTML
# Other helpers used below (dumpJsonToFile, getKwordWindows, getKwordWindowsOpt,
# searchKwordWindowsOpt, getHTMLFilename, getWikiOutlinks) are assumed to be
# project-local utilities importable alongside common.

def getHeapsData(filenames):
    """Accumulate vocabulary size vs. running word count (Heaps' law data)."""
    outfile = open('vocabWordCount.csv', 'w')
    outfile.write('Vocab,WordCount\n')

    totalVocab = set()
    wordCount = 0
    for i in range(len(filenames)):
        f = filenames[i].strip()
        html = readTextFromFile(f)
        text = getTextFromHTML(html)
        if len(text) == 0:
            continue

        countVectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 2))
        termFreqMat = countVectorizer.fit_transform([text])

        # grow the global vocabulary and the running token count
        totalVocab = totalVocab.union(set(countVectorizer.vocabulary_.keys()))
        wordCount += termFreqMat.todense().sum()
        outfile.write(str(len(totalVocab)) + ',' + str(wordCount) + '\n')

        if i % 100 == 0:
            print(i, 'of', len(filenames))

    outfile.close()
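# Usage sketch for getHeapsData (an assumption, not part of the original code):
# the article pathnames are listed one per line in a file; 'article-paths.txt'
# is a hypothetical name. The resulting vocabWordCount.csv can be plotted to
# inspect Heaps' law growth.
with open('article-paths.txt') as infile:
    articlePaths = infile.readlines()
getHeapsData(articlePaths)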
def getVocabFreqDict(filenames, stop, ngramTup=(1, 1)):
    """Map each term to the list of files it appears in; stop after `stop` files."""
    vocabDict = {}
    for i in range(len(filenames)):
        f = filenames[i].strip()
        html = readTextFromFile(f)
        text = getTextFromHTML(html)
        #writeTextToFile(f + '.txt', text)
        if len(text) == 0:
            continue

        countVectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=ngramTup)
        termFreqMat = countVectorizer.fit_transform([text])

        for term in list(countVectorizer.vocabulary_.keys()):
            vocabDict.setdefault(term, {'f': []})
            vocabDict[term]['f'].append(f)

        if i % 100 == 0:
            print(i, 'of', len(filenames))

        if i > stop:
            break

    return vocabDict
def getVocabFreqDict(filenames):
    """Document-frequency variant: count how many files each 1- or 2-gram appears in,
    then dump the counts to JSON."""
    vocabDict = {}
    for i in range(len(filenames)):
        f = filenames[i].strip()
        html = readTextFromFile(f)
        text = getTextFromHTML(html)
        if len(text) == 0:
            continue

        countVectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 2))
        termFreqMat = countVectorizer.fit_transform([text])

        for term in list(countVectorizer.vocabulary_.keys()):
            vocabDict.setdefault(term, 0)
            vocabDict[term] += 1

        if i % 100 == 0:
            print(i, 'of', len(filenames))

    dumpJsonToFile('1-2-gram.json', vocabDict)
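# Usage sketch: load the dumped document-frequency dictionary and print the most
# common 1- and 2-grams. Assumes only that '1-2-gram.json' was written by the
# function above; the cutoff of 20 terms is arbitrary.
import json

with open('1-2-gram.json') as jsonFile:
    freqDict = json.load(jsonFile)

topTerms = sorted(freqDict.items(), key=lambda kv: kv[1], reverse=True)[:20]
for term, docFreq in topTerms:
    print(term, docFreq)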
def getAssociationForPair(vocabDict, pair, windowSize):
    """Estimate the association of terms a and b from co-occurrence counts in
    k-word windows over the files each term appears in."""
    a, b = pair
    Na = 0
    Nb = 0
    Nab = 0

    if a in vocabDict and b in vocabDict:
        for f in vocabDict[a]['f']:
            text = readTextFromFile(f + '.txt')
            counts = searchKwordWindowsOpt(text, windowSize, a, b)
            Na += counts['left']
            Nab += counts['both']

        for f in vocabDict[b]['f']:
            text = readTextFromFile(f + '.txt')
            counts = searchKwordWindowsOpt(text, windowSize, b, a, True)
            Nb += counts['left']

    if Nab != 0:
        return Nab / (Na + Nb)
    else:
        return -1
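# Usage sketch for getAssociationForPair. Assumptions: vocabDict is built with the
# file-list variant of getVocabFreqDict (the one returning {'f': [...]} per term),
# the per-article .txt dumps it reads actually exist, and the pair ('new', 'york'),
# the stop value of 500, and the window size of 5 are illustrative only.
with open('article-paths.txt') as infile:
    articlePaths = infile.readlines()

vocabDict = getVocabFreqDict(articlePaths, 500)
score = getAssociationForPair(vocabDict, ('new', 'york'), 5)
print('association:', score)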
def transformDocToWindow(vocabDict, vocab):
    if vocab not in vocabDict:
        print('term:', vocab, 'not in vocab')
        return

    allWindows = []
    for i in range(len(vocabDict[vocab]['f'])):
        f = vocabDict[vocab]['f'][i] + '.txt'
        text = readTextFromFile(f)
        allWindows += getKwordWindows(text, 5)

    vocabDict[vocab]['f'] = allWindows
def transformDocToWindowOpt(vocabDict, vocab):
    """Like transformDocToWindow, but records the total window count and keeps
    only the windows that actually contain the term."""
    if vocab not in vocabDict:
        print('term:', vocab, 'not in vocab')
        return

    allWindows = {'tot': 0, 'windows': []}
    for i in range(len(vocabDict[vocab]['f'])):
        f = vocabDict[vocab]['f'][i] + '.txt'
        text = readTextFromFile(f)
        allWindows['windows'] += getKwordWindowsOpt(text, 5)

    allWindows['tot'] = len(allWindows['windows'])

    # keep only the windows containing the term
    windowsWithVocab = []
    for win in allWindows['windows']:
        if vocab in win:
            windowsWithVocab.append(win)

    allWindows['windows'] = windowsWithVocab
    vocabDict[vocab]['f'] = allWindows
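# Usage sketch for transformDocToWindowOpt. Assumes vocabDict was built as in the
# sketch above and that the per-article .txt dumps exist; the term 'dorothy' is
# an illustrative choice.
transformDocToWindowOpt(vocabDict, 'dorothy')
windowInfo = vocabDict['dorothy']['f']
print('total windows:', windowInfo['tot'], 'containing term:', len(windowInfo['windows']))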
def getTopKPages(pathnames, filenames):
    if len(pathnames) == 0:
        return []

    outlinksDict = {}
    for i in range(len(pathnames)):
        wiki = pathnames[i].strip()
        html = readTextFromFile(wiki)

        if i % 100 == 0:
            print(i, 'of', len(pathnames), 'wiki file:', wiki)
            print('\tlen:', len(outlinksDict))

        sourcewiki = getHTMLFilename(wiki)
        getWikiOutlinks(sourcewiki, html, outlinksDict)
        #if i == 3:
        #    break

    dumpJsonToFile('./outlinksDict.json', outlinksDict)
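# Usage sketch: rank pages by in-link count from the dumped outlinks dictionary.
# This assumes outlinksDict maps each source page to an iterable of outlinked pages,
# which is an assumption about getWikiOutlinks' output format; k=10 is arbitrary.
import json
from collections import Counter

with open('./outlinksDict.json') as jsonFile:
    outlinksDict = json.load(jsonFile)

inlinkCounts = Counter()
for source, outlinks in outlinksDict.items():
    inlinkCounts.update(outlinks)

for page, count in inlinkCounts.most_common(10):
    print(page, count)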
from Porter import PorterStemmer
from krovetzstemmer import Stemmer
from common import readTextFromFile
from common import getTextFromHTML

# Compare Porter and Krovetz stemmer output on a single extracted article.
krov = Stemmer()
f = 'en/articles/d/o/r/Dorothy_Block_a8f8.html'
text = getTextFromHTML(readTextFromFile(f))

print('ori:\n', text, '\n')
print('porter:\n', PorterStemmer.useStemer(text), '\n')
print('krov:\n', krov.stem(text), '\n')
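# A hedged sketch: the Krovetz stemmer operates per token, so a word-by-word
# comparison may be more informative than stemming the whole text at once.
# Whitespace tokenization and the 20-token cutoff are assumptions.
for word in text.split()[:20]:
    print(word, '->', krov.stem(word))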