# Imports assumed from the rest of this listing: CountVectorizer comes from
# scikit-learn, and the file/HTML helpers plus dumpJsonToFile are presumed to
# live in the local 'common' module used in the stemming example at the end.
from sklearn.feature_extraction.text import CountVectorizer

from common import readTextFromFile
from common import getTextFromHTML
from common import dumpJsonToFile


def getHeapsData(filenames):
    # Collect data for a Heaps' law curve: after each document, write the
    # cumulative vocabulary size and cumulative word count to a CSV file.
    outfile = open('vocabWordCount.csv', 'w')
    outfile.write('Vocab,WordCount\n')

    totalVocab = set()
    wordCount = 0
    for i in range(len(filenames)):
        f = filenames[i].strip()
        html = readTextFromFile(f)
        text = getTextFromHTML(html)

        if (len(text) == 0):
            continue

        countVectorizer = CountVectorizer(min_df=1,
                                          stop_words='english',
                                          ngram_range=(1, 2))
        termFreqMat = countVectorizer.fit_transform([text])

        totalVocab = totalVocab.union(set(countVectorizer.vocabulary_.keys()))
        wordCount += termFreqMat.todense().sum()

        outfile.write(str(len(totalVocab)) + ',' + str(wordCount) + '\n')

        if (i % 100 == 0):
            print(i, 'of', len(filenames))

    outfile.close()
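
# Minimal usage sketch (assumptions: a hypothetical 'filenames.txt' listing one
# HTML path per line, and matplotlib available for the plot). It runs
# getHeapsData, then plots cumulative vocabulary size against word count from
# the CSV the function writes.
import csv
import matplotlib.pyplot as plt

with open('filenames.txt') as infile:
    getHeapsData(infile.readlines())

vocabSizes, wordCounts = [], []
with open('vocabWordCount.csv') as csvfile:
    for row in csv.DictReader(csvfile):
        vocabSizes.append(int(row['Vocab']))
        wordCounts.append(int(row['WordCount']))

plt.plot(wordCounts, vocabSizes)
plt.xlabel('WordCount')
plt.ylabel('Vocab')
plt.show()
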
def getVocabFreqDict(filenames, stop, ngramTup=(1, 1)):
    # For each term, record the list of files it appears in under
    # vocabDict[term]['f']; processing stops after the first 'stop' files.
    vocabDict = {}

    for i in range(len(filenames)):
        f = filenames[i].strip()

        html = readTextFromFile(f)
        text = getTextFromHTML(html)
        #writeTextToFile(f + '.txt', text)

        if (len(text) == 0):
            continue

        countVectorizer = CountVectorizer(min_df=1,
                                          stop_words='english',
                                          ngram_range=ngramTup)
        termFreqMat = countVectorizer.fit_transform([text])

        for term in list(countVectorizer.vocabulary_.keys()):
            vocabDict.setdefault(term, {'f': []})
            vocabDict[term]['f'].append(f)

        if (i % 100 == 0):
            print(i, 'of', len(filenames))

        if (i > stop):
            break

    return vocabDict
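
# Usage sketch (assumption: the same hypothetical 'filenames.txt' as above).
# Index the first 500 documents with unigrams and bigrams, then report how many
# distinct terms were collected.
with open('filenames.txt') as infile:
    filenames = infile.readlines()

vocabDict = getVocabFreqDict(filenames, stop=500, ngramTup=(1, 2))
print(len(vocabDict), 'distinct terms indexed')
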
def getVocabFreqDict(filenames):
    # Variant of getVocabFreqDict above: instead of tracking which files each
    # term appears in, count how many documents contain each 1- and 2-gram and
    # dump the counts to JSON. (Kept in one module, this definition shadows the
    # one above.)
    vocabDict = {}

    for i in range(len(filenames)):
        f = filenames[i].strip()
        html = readTextFromFile(f)
        text = getTextFromHTML(html)

        if (len(text) == 0):
            continue

        countVectorizer = CountVectorizer(min_df=1,
                                          stop_words='english',
                                          ngram_range=(1, 2))
        termFreqMat = countVectorizer.fit_transform([text])

        for term in list(countVectorizer.vocabulary_.keys()):
            vocabDict.setdefault(term, 0)
            vocabDict[term] += 1

        if (i % 100 == 0):
            print(i, 'of', len(filenames))

    dumpJsonToFile('1-2-gram.json', vocabDict)
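
# Usage sketch (assumption: 'filenames.txt' as above). Build the document
# frequencies, then print the ten most common 1- and 2-grams from the dumped
# JSON file.
import json

with open('filenames.txt') as infile:
    getVocabFreqDict(infile.readlines())

with open('1-2-gram.json') as jsonfile:
    docFreqs = json.load(jsonfile)

for term, freq in sorted(docFreqs.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(term, freq)
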
Example #4
def getAssociationForPair(vocabDict, pair, windowSize):
    # Score the association between the two terms in 'pair' as Nab / (Na + Nb):
    # their co-occurrence count inside k-word windows over the counts of each
    # term alone. Returns -1 if the terms never co-occur.
    a, b = pair

    Na = 0
    Nb = 0
    Nab = 0

    if (a in vocabDict and b in vocabDict):

        for f in vocabDict[a]['f']:
            f = f + '.txt'
            f = readTextFromFile(f)

            counts = searchKwordWindowsOpt(f, windowSize, a, b)

            Na += counts['left']
            Nab += counts['both']

        for f in vocabDict[b]['f']:
            counts = searchKwordWindowsOpt(f, windowSize, b, a, True)
            Nb += counts['left']

    if (Nab != 0):
        return Nab / (Na + Nb)
    else:
        return -1
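
# Usage sketch (assumptions: vocabDict is the term -> file-list index built by
# the three-argument getVocabFreqDict above, plain-text copies of the HTML
# files exist as '<path>.txt', and the two query terms are just illustrative).
score = getAssociationForPair(vocabDict, ('neural', 'network'), 10)
print('association(neural, network) =', score)
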
def transformDocToWindow(vocabDict, vocab):
    # Replace a term's file list in vocabDict with every 5-word window extracted
    # from the plain-text ('.txt') copies of those files.
    if (vocab not in vocabDict):
        print('term:', vocab, 'not in vocab')
        return

    allWindows = []
    for i in range(len(vocabDict[vocab]['f'])):
        f = vocabDict[vocab]['f'][i] + '.txt'
        f = readTextFromFile(f)

        allWindows += getKwordWindows(f, 5)

    vocabDict[vocab]['f'] = allWindows
def transformDocToWindowOpt(vocabDict, vocab):
    # Like transformDocToWindow, but also record the total number of windows and
    # keep only the windows that actually contain the term.
    if (vocab not in vocabDict):
        print('term:', vocab, 'not in vocab')
        return

    allWindows = {'tot': 0, 'windows': []}
    for i in range(len(vocabDict[vocab]['f'])):
        f = vocabDict[vocab]['f'][i] + '.txt'
        f = readTextFromFile(f)

        allWindows['windows'] += getKwordWindowsOpt(f, 5)

    allWindows['tot'] = len(allWindows['windows'])
    windowsWithVocab = []

    for win in allWindows['windows']:
        if (vocab in win):
            windowsWithVocab.append(win)

    allWindows['windows'] = windowsWithVocab
    vocabDict[vocab]['f'] = allWindows
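
# Usage sketch (same assumptions as above; 'network' is an illustrative term).
# After the transform, the term's entry holds the total window count and only
# the 5-word windows that contain the term.
transformDocToWindowOpt(vocabDict, 'network')
entry = vocabDict['network']['f']
print(entry['tot'], 'windows scanned,', len(entry['windows']), 'contain the term')
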
Example #7
def getTopKPages(pathnames, filenames):
    # Build the wiki outlink graph and dump it to JSON; 'filenames' is unused
    # here, and the top-k ranking itself is presumably done elsewhere from
    # outlinksDict.json.
    if (len(pathnames) == 0):
        return []

    outlinksDict = {}

    for i in range(len(pathnames)):
        wiki = pathnames[i]
        wiki = wiki.strip()
        html = readTextFromFile(wiki)

        if (i % 100 == 0):
            print(i, 'of', len(pathnames), 'wiki file:', wiki)
            print('\tlen:', len(outlinksDict))

        sourcewiki = getHTMLFilename(wiki)
        getWikiOutlinks(sourcewiki, html, outlinksDict)

        #if (i == 3):
        #    break

    dumpJsonToFile('./outlinksDict.json', outlinksDict)
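
# Usage sketch (assumption: a hypothetical 'wikifiles.txt' listing one wiki HTML
# path per line). Builds the outlink graph and writes it to outlinksDict.json;
# the second argument is unused by the function as written.
with open('wikifiles.txt') as infile:
    getTopKPages(infile.readlines(), [])
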
Example #8
from Porter import PorterStemmer
from krovetzstemmer import Stemmer

from common import readTextFromFile
from common import getTextFromHTML

krov = Stemmer()

# Compare Porter and Krovetz stemming on the text of a single wiki article.
f = 'en/articles/d/o/r/Dorothy_Block_a8f8.html'
text = getTextFromHTML(readTextFromFile(f))

print('ori:\n', text, '\n')
print('porter:\n', PorterStemmer.useStemer(text), '\n')
print('krov:\n', krov.stem(text), '\n')