def getVocabFreqDict(filenames):
    """Build a document-frequency dict of 1- and 2-grams across HTML files.

    Parameters
    ----------
    filenames : list of str
        Paths to HTML files; each entry is stripped of surrounding whitespace.

    Side effects
    ------------
    Writes the resulting {term: document-frequency} dict to '1-2-gram.json'
    via dumpJsonToFile. Returns nothing.
    """
    vocabDict = {}
    total = len(filenames)

    # One vectorizer suffices for all documents: fit_transform() re-fits the
    # vocabulary on every call, so re-constructing it per file was wasted work.
    countVectorizer = CountVectorizer(min_df=1,
                                      stop_words='english',
                                      ngram_range=(1, 2))

    for i, name in enumerate(filenames):
        html = readTextFromFile(name.strip())
        text = getTextFromHTML(html)

        if not text:
            continue

        # The term-frequency matrix itself is unused; we only need the
        # fitted vocabulary_ for this single document.
        countVectorizer.fit_transform([text])

        # Document frequency: each term counts once per document it occurs in.
        for term in countVectorizer.vocabulary_:
            vocabDict[term] = vocabDict.get(term, 0) + 1

        # Progress heartbeat every 100 files.
        if i % 100 == 0:
            print(i, 'of', total)

    dumpJsonToFile('1-2-gram.json', vocabDict)
# Beispiel #2 (scrape-artifact separator; commented out so it is not parsed as code)
# 0
def getStemclasses():
    """Group vocabulary terms by their Porter stem.

    Reads the vocabulary from 'wiki-small-vocab.json', maps each term to its
    stem with PorterStemmer.useStemer, and writes the resulting
    {stem: [terms]} mapping to 'wiki-small-vocab-stem-classes.json'.
    Returns nothing.
    """
    stemClasses = {}
    vocabDict = getDictFromFile('wiki-small-vocab.json')

    # Only the keys (terms) are needed; the per-term value dicts returned by
    # getDictFromFile are irrelevant here, so iterate the dict directly.
    for counter, voc in enumerate(vocabDict):
        stem = PorterStemmer.useStemer(voc)
        # setdefault returns the (possibly fresh) list, so append in one step.
        stemClasses.setdefault(stem, []).append(voc)

        # Progress heartbeat every 10k terms.
        if counter % 10000 == 0:
            print('\t', counter, voc)

    dumpJsonToFile('wiki-small-vocab-stem-classes.json', stemClasses, False)
# Beispiel #3 (scrape-artifact separator; commented out so it is not parsed as code)
# 0
def getTopKPages(pathnames, filenames):
	"""Collect outlinks from a set of wiki HTML files.

	Parameters
	----------
	pathnames : list of str
		Paths to the wiki HTML files to scan.
	filenames : list
		Unused in this body; kept so the signature stays compatible with
		existing callers.

	Returns
	-------
	list or None
		[] when pathnames is empty; otherwise None — the accumulated
		outlinks are written to './outlinksDict.json' as a side effect.
	"""
	if not pathnames:
		return []

	outlinksDict = {}
	total = len(pathnames)

	for i, wiki in enumerate(pathnames):
		wiki = wiki.strip()
		html = readTextFromFile(wiki)

		# Progress heartbeat every 100 files, with running dict size.
		if i % 100 == 0:
			print(i, 'of', total, 'wiki file:', wiki)
			print('\tlen:', len(outlinksDict))

		sourcewiki = getHTMLFilename(wiki)
		# Accumulates this page's outlinks into outlinksDict in place.
		getWikiOutlinks(sourcewiki, html, outlinksDict)

	dumpJsonToFile('./outlinksDict.json', outlinksDict)
            vocabDict.setdefault(term, {'f': []})
            vocabDict[term]['f'].append(f)

        if (i % 100 == 0):
            print(i, 'of', len(filenames))

        if (i > stop):
            break

    return vocabDict


# --- Script driver: build a size-capped vocabulary dump, then run an
# --- association-measures query for a single word.
stop = 500
filenames = getHTMLPaths()
# NOTE(review): called with (filenames, stop), but the getVocabFreqDict
# defined earlier in this file takes a single argument — these look like
# fragments from different versions; confirm which signature is intended.
vocabDict = getVocabFreqDict(filenames, stop)
dumpJsonToFile('wiki-small-vocab-' + str(stop) + '.json', vocabDict, False)

word = 'hospital'
N = 6042  # presumably the corpus document count — TODO confirm
k = 20  # top-k results requested
getAssocMeasuresDocs(word, N, k)
'''
#command line: python A3.P1.py
if( len(sys.argv) > 1 ):
	filename = 'wiki-small-vocab.json'
	word = sys.argv[1]
	N = 15103
	k = 20
	getAssocMeasuresWindow(word, N, filename, k)
'''