def getHeapsData(filenames):
    """Write cumulative vocabulary size vs. term count to 'vocabWordCount.csv'.

    Each entry of *filenames* is stripped of surrounding whitespace, read with
    ``readTextFromFile`` and passed through ``getTextFromHTML``.  The resulting
    text is vectorized on its own with a ``CountVectorizer`` configured for
    English stop words and 1- and 2-grams.  After every non-empty document one
    CSV row is appended holding:

    * the number of distinct terms seen so far across all documents, and
    * the running sum of all term counts seen so far.

    Documents whose extracted text is empty produce no row.  Progress is
    printed for every 100th index that is actually processed.

    Args:
        filenames: sequence of file paths (surrounding whitespace is ignored).

    Returns:
        None.  The result is the file 'vocabWordCount.csv' written in the
        current working directory.
    """
    totalVocab = set()
    wordCount = 0
    numFiles = len(filenames)

    # The context manager guarantees the CSV is flushed and closed even when
    # reading or vectorizing one of the inputs raises.
    with open('vocabWordCount.csv', 'w') as outfile:
        outfile.write('Vocab,WordCount\n')

        for i, filename in enumerate(filenames):
            f = filename.strip()
            html = readTextFromFile(f)
            text = getTextFromHTML(html)
            if not text:
                # Nothing to count for this document.
                continue

            countVectorizer = CountVectorizer(
                min_df=1, stop_words='english', ngram_range=(1, 2))
            termFreqMat = countVectorizer.fit_transform([text])

            # vocabulary_ maps term -> column index; only the terms matter here.
            totalVocab.update(countVectorizer.vocabulary_)
            # Sum the sparse matrix directly instead of densifying it first.
            wordCount += termFreqMat.sum()

            outfile.write(str(len(totalVocab)) + ', ' + str(wordCount) + '\n')

            if i % 100 == 0:
                print(i, 'of', numFiles)
def getVocabFreqDict(filenames, stop, ngramTup=(1, 1)):
    """Map every term to the list of files whose text contains it.

    Each entry of *filenames* is stripped of surrounding whitespace, read with
    ``readTextFromFile`` and passed through ``getTextFromHTML``.  The text is
    vectorized on its own with a ``CountVectorizer`` (English stop words,
    n-gram range *ngramTup*), and the stripped file name is appended to the
    ``'f'`` list of every term in that document's vocabulary.

    Args:
        filenames: sequence of file paths (surrounding whitespace is ignored).
        stop: index threshold.  The loop ends right after processing the first
            non-empty document whose index is strictly greater than *stop*;
            that document is still included in the result.
        ngramTup: ``(min_n, max_n)`` tuple forwarded as ``ngram_range``.

    Returns:
        dict of the form ``{term: {'f': [file, ...]}}``.
    """
    vocabDict = {}
    numFiles = len(filenames)

    for i, filename in enumerate(filenames):
        f = filename.strip()
        text = getTextFromHTML(readTextFromFile(f))
        if not text:
            # Empty documents are skipped entirely; this also skips the
            # progress print and the stop check for this index.
            continue

        countVectorizer = CountVectorizer(
            min_df=1, stop_words='english', ngram_range=ngramTup)
        # Only the learned vocabulary is needed, not the count matrix.
        countVectorizer.fit([text])

        for term in countVectorizer.vocabulary_:
            vocabDict.setdefault(term, {'f': []})['f'].append(f)

        if i % 100 == 0:
            print(i, 'of', numFiles)
        if i > stop:
            break

    return vocabDict
def getVocabFreqDict(filenames):
    """Count, per term, how many documents contain it and dump the result as JSON.

    Each entry of *filenames* is stripped of surrounding whitespace, read with
    ``readTextFromFile`` and passed through ``getTextFromHTML``.  The text is
    vectorized on its own with a ``CountVectorizer`` (English stop words,
    1- and 2-grams).  Every term in a document's vocabulary has its counter
    increased by one, so the final value of a term is the number of non-empty
    documents it occurred in.

    Once all files are processed the mapping is handed to
    ``dumpJsonToFile('1-2-gram.json', ...)``.

    NOTE(review): another function with this same name but a different
    signature appears nearby; if both live in one module the later definition
    shadows the earlier one -- confirm they belong to separate files.

    Args:
        filenames: sequence of file paths (surrounding whitespace is ignored).

    Returns:
        None.
    """
    vocabDict = {}
    numFiles = len(filenames)

    for i, filename in enumerate(filenames):
        f = filename.strip()
        text = getTextFromHTML(readTextFromFile(f))
        if not text:
            # Nothing to count for this document.
            continue

        countVectorizer = CountVectorizer(
            min_df=1, stop_words='english', ngram_range=(1, 2))
        # Only the learned vocabulary is needed, not the count matrix.
        countVectorizer.fit([text])

        # vocabulary_ holds each distinct term once, so this adds exactly one
        # per document to every term that document contains.
        for term in countVectorizer.vocabulary_:
            vocabDict[term] = vocabDict.get(term, 0) + 1

        if i % 100 == 0:
            print(i, 'of', numFiles)

    dumpJsonToFile('1-2-gram.json', vocabDict)
# Demo script: print the text extracted from one article, followed by the
# output of the Porter and Krovetz stemmers for that same text.
from Porter import PorterStemmer
from krovetzstemmer import Stemmer
from common import readTextFromFile
from common import getTextFromHTML

krov = Stemmer()
f = 'en/articles/d/o/r/Dorothy_Block_a8f8.html'
text = getTextFromHTML(readTextFromFile(f))

# Each print takes a single parenthesized argument so the script parses under
# Python 3 (the old `print x, y` statement form is a SyntaxError there) while
# still producing the same layout when run under Python 2.
print('ori:\n%s \n' % (text,))

# NOTE(review): PorterStemmer.useStemer is a project helper not visible here;
# it is presumably able to take a whole text -- confirm.
print('porter:\n%s \n' % (PorterStemmer.useStemer(text),))

# krovetzstemmer documents Stemmer.stem() for a single word, so the text is
# stemmed token by token (split on whitespace) rather than as one string.
krovStemmed = ' '.join(krov.stem(word) for word in text.split())
print('krov:\n%s \n' % (krovStemmed,))