# Imports assumed from the rest of this listing: CountVectorizer comes from
# scikit-learn, and the file/HTML helpers plus dumpJsonToFile are presumed to
# live in the local 'common' module used in the stemming example at the end.
from sklearn.feature_extraction.text import CountVectorizer

from common import readTextFromFile
from common import getTextFromHTML
from common import dumpJsonToFile


def getHeapsData(filenames):
    # Collect data for a Heaps' law curve: after each document, write the
    # cumulative vocabulary size and cumulative word count to a CSV file.
    outfile = open('vocabWordCount.csv', 'w')
    outfile.write('Vocab,WordCount\n')

    totalVocab = set()
    wordCount = 0
    for i in range(len(filenames)):
        f = filenames[i].strip()
        html = readTextFromFile(f)
        text = getTextFromHTML(html)

        if (len(text) == 0):
            continue

        countVectorizer = CountVectorizer(min_df=1,
                                          stop_words='english',
                                          ngram_range=(1, 2))
        termFreqMat = countVectorizer.fit_transform([text])

        totalVocab = totalVocab.union(set(countVectorizer.vocabulary_.keys()))
        wordCount += termFreqMat.todense().sum()

        outfile.write(str(len(totalVocab)) + ',' + str(wordCount) + '\n')

        if (i % 100 == 0):
            print(i, 'of', len(filenames))

    outfile.close()
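
# Minimal usage sketch (assumptions: a hypothetical 'filenames.txt' listing one
# HTML path per line, and matplotlib available for the plot). It runs
# getHeapsData, then plots cumulative vocabulary size against word count from
# the CSV the function writes.
import csv
import matplotlib.pyplot as plt

with open('filenames.txt') as infile:
    getHeapsData(infile.readlines())

vocabSizes, wordCounts = [], []
with open('vocabWordCount.csv') as csvfile:
    for row in csv.DictReader(csvfile):
        vocabSizes.append(int(row['Vocab']))
        wordCounts.append(int(row['WordCount']))

plt.plot(wordCounts, vocabSizes)
plt.xlabel('WordCount')
plt.ylabel('Vocab')
plt.show()
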
def getVocabFreqDict(filenames, stop, ngramTup=(1, 1)):
    # For each term, record the list of files it appears in under
    # vocabDict[term]['f']; processing stops after the first 'stop' files.
    vocabDict = {}

    for i in range(len(filenames)):
        f = filenames[i].strip()

        html = readTextFromFile(f)
        text = getTextFromHTML(html)
        #writeTextToFile(f + '.txt', text)

        if (len(text) == 0):
            continue

        countVectorizer = CountVectorizer(min_df=1,
                                          stop_words='english',
                                          ngram_range=ngramTup)
        termFreqMat = countVectorizer.fit_transform([text])

        for term in list(countVectorizer.vocabulary_.keys()):
            vocabDict.setdefault(term, {'f': []})
            vocabDict[term]['f'].append(f)

        if (i % 100 == 0):
            print(i, 'of', len(filenames))

        if (i > stop):
            break

    return vocabDict
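
# Usage sketch (assumption: the same hypothetical 'filenames.txt' as above).
# Index the first 500 documents with unigrams and bigrams, then report how many
# distinct terms were collected.
with open('filenames.txt') as infile:
    filenames = infile.readlines()

vocabDict = getVocabFreqDict(filenames, stop=500, ngramTup=(1, 2))
print(len(vocabDict), 'distinct terms indexed')
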
def getVocabFreqDict(filenames):
    # Variant of getVocabFreqDict above: instead of tracking which files each
    # term appears in, count how many documents contain each 1- and 2-gram and
    # dump the counts to JSON. (Kept in one module, this definition shadows the
    # one above.)
    vocabDict = {}

    for i in range(len(filenames)):
        f = filenames[i].strip()
        html = readTextFromFile(f)
        text = getTextFromHTML(html)

        if (len(text) == 0):
            continue

        countVectorizer = CountVectorizer(min_df=1,
                                          stop_words='english',
                                          ngram_range=(1, 2))
        termFreqMat = countVectorizer.fit_transform([text])

        for term in list(countVectorizer.vocabulary_.keys()):
            vocabDict.setdefault(term, 0)
            vocabDict[term] += 1

        if (i % 100 == 0):
            print(i, 'of', len(filenames))

    dumpJsonToFile('1-2-gram.json', vocabDict)
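
# Usage sketch (assumption: 'filenames.txt' as above). Build the document
# frequencies, then print the ten most common 1- and 2-grams from the dumped
# JSON file.
import json

with open('filenames.txt') as infile:
    getVocabFreqDict(infile.readlines())

with open('1-2-gram.json') as jsonfile:
    docFreqs = json.load(jsonfile)

for term, freq in sorted(docFreqs.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(term, freq)
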
Example #4
def getAssociationForPair(vocabDict, pair, windowSize):
    # Score the association between the two terms in 'pair' as Nab / (Na + Nb):
    # their co-occurrence count inside k-word windows over the counts of each
    # term alone. Returns -1 if the terms never co-occur.
    a, b = pair

    Na = 0
    Nb = 0
    Nab = 0

    if (a in vocabDict and b in vocabDict):

        for f in vocabDict[a]['f']:
            f = f + '.txt'
            f = readTextFromFile(f)

            counts = searchKwordWindowsOpt(f, windowSize, a, b)

            Na += counts['left']
            Nab += counts['both']

        for f in vocabDict[b]['f']:
            counts = searchKwordWindowsOpt(f, windowSize, b, a, True)
            Nb += counts['left']

    if (Nab != 0):
        return Nab / (Na + Nb)
    else:
        return -1
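
# Usage sketch (assumptions: vocabDict is the term -> file-list index built by
# the three-argument getVocabFreqDict above, plain-text copies of the HTML
# files exist as '<path>.txt', and the two query terms are just illustrative).
score = getAssociationForPair(vocabDict, ('neural', 'network'), 10)
print('association(neural, network) =', score)
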
def transformDocToWindow(vocabDict, vocab):
    # Replace a term's file list in vocabDict with every 5-word window extracted
    # from the plain-text ('.txt') copies of those files.
    if (vocab not in vocabDict):
        print('term:', vocab, 'not in vocab')
        return

    allWindows = []
    for i in range(len(vocabDict[vocab]['f'])):
        f = vocabDict[vocab]['f'][i] + '.txt'
        f = readTextFromFile(f)

        allWindows += getKwordWindows(f, 5)

    vocabDict[vocab]['f'] = allWindows
def transformDocToWindowOpt(vocabDict, vocab):
    # Like transformDocToWindow, but also record the total number of windows and
    # keep only the windows that actually contain the term.
    if (vocab not in vocabDict):
        print('term:', vocab, 'not in vocab')
        return

    allWindows = {'tot': 0, 'windows': []}
    for i in range(len(vocabDict[vocab]['f'])):
        f = vocabDict[vocab]['f'][i] + '.txt'
        f = readTextFromFile(f)

        allWindows['windows'] += getKwordWindowsOpt(f, 5)

    allWindows['tot'] = len(allWindows['windows'])
    windowsWithVocab = []

    for win in allWindows['windows']:
        if (vocab in win):
            windowsWithVocab.append(win)

    allWindows['windows'] = windowsWithVocab
    vocabDict[vocab]['f'] = allWindows
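
# Usage sketch (same assumptions as above; 'network' is an illustrative term).
# After the transform, the term's entry holds the total window count and only
# the 5-word windows that contain the term.
transformDocToWindowOpt(vocabDict, 'network')
entry = vocabDict['network']['f']
print(entry['tot'], 'windows scanned,', len(entry['windows']), 'contain the term')
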
Example #7
def getTopKPages(pathnames, filenames):
    # Build the wiki outlink graph and dump it to JSON; 'filenames' is unused
    # here, and the top-k ranking itself is presumably done elsewhere from
    # outlinksDict.json.
    if (len(pathnames) == 0):
        return []

    outlinksDict = {}

    for i in range(len(pathnames)):
        wiki = pathnames[i]
        wiki = wiki.strip()
        html = readTextFromFile(wiki)

        if (i % 100 == 0):
            print(i, 'of', len(pathnames), 'wiki file:', wiki)
            print('\tlen:', len(outlinksDict))

        sourcewiki = getHTMLFilename(wiki)
        getWikiOutlinks(sourcewiki, html, outlinksDict)

        #if (i == 3):
        #    break

    dumpJsonToFile('./outlinksDict.json', outlinksDict)
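
# Usage sketch (assumption: a hypothetical 'wikifiles.txt' listing one wiki HTML
# path per line). Builds the outlink graph and writes it to outlinksDict.json;
# the second argument is unused by the function as written.
with open('wikifiles.txt') as infile:
    getTopKPages(infile.readlines(), [])
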
Example #8
from Porter import PorterStemmer
from krovetzstemmer import Stemmer

from common import readTextFromFile
from common import getTextFromHTML

krov = Stemmer()

# Compare Porter and Krovetz stemming on the text of a single wiki article.
f = 'en/articles/d/o/r/Dorothy_Block_a8f8.html'
text = getTextFromHTML(readTextFromFile(f))

print('ori:\n', text, '\n')
print('porter:\n', PorterStemmer.useStemer(text), '\n')
print('krov:\n', krov.stem(text), '\n')