Esempi in Python per getTextFromHTML

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: common

Metodo/funzione: getTextFromHTML

Esempi su hotexamples.com: 4

getTextFromHTML in Python: 4 esempi trovati. Questi sono i migliori esempi reali in Python per common.getTextFromHTML, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Esempio n. 1

Mostra file

File: P4.py Progetto: UdochukwuNweke/cs834-f17-Information-Retrieval

def getHeapsData(filenames):
    """Write cumulative vocabulary-size / word-count rows to a CSV file.

    Each entry of ``filenames`` is stripped of surrounding whitespace, read
    with ``readTextFromFile`` and passed through ``getTextFromHTML``. The
    resulting text is vectorized into unigrams and bigrams with English stop
    words removed. After every document that yields non-empty text, one row
    with the running number of distinct terms and the running total of term
    occurrences is appended to ``vocabWordCount.csv`` in the current
    working directory.

    Args:
        filenames: Sized iterable of path strings (surrounding whitespace,
            e.g. trailing newlines, is stripped before use).

    Returns:
        None. The result is the CSV file written as a side effect; progress
        is printed for every 100th index that produced text.
    """
    totalVocab = set()
    wordCount = 0
    total = len(filenames)

    # The context manager closes (and flushes) the CSV even when reading or
    # vectorizing one of the documents raises part-way through the run.
    with open('vocabWordCount.csv', 'w') as outfile:
        outfile.write('Vocab,WordCount\n')

        for i, filename in enumerate(filenames):
            f = filename.strip()
            text = getTextFromHTML(readTextFromFile(f))

            # Documents without any extracted text contribute no row.
            if not text:
                continue

            countVectorizer = CountVectorizer(min_df=1,
                                              stop_words='english',
                                              ngram_range=(1, 2))
            termFreqMat = countVectorizer.fit_transform([text])

            # In-place update: avoids rebuilding the whole cumulative set for
            # every document, which set.union() did.
            totalVocab.update(countVectorizer.vocabulary_)
            # Summing the matrix directly gives the same total as summing its
            # dense copy, without materializing that copy.
            wordCount += termFreqMat.sum()

            outfile.write(str(len(totalVocab)) + ', ' + str(wordCount) + '\n')

            if i % 100 == 0:
                print(i, 'of', total)

Esempio n. 2

Mostra file

File: A3.P1.py Progetto: UdochukwuNweke/cs834-f17-Information-Retrieval

def getVocabFreqDict(filenames, stop, ngramTup=(1, 1)):
    """Build a term -> list-of-files index over HTML documents.

    Every entry of ``filenames`` is stripped, read with ``readTextFromFile``
    and converted with ``getTextFromHTML``. Non-empty text is vectorized
    (English stop words removed, n-gram range ``ngramTup``) and the stripped
    file name is appended to the ``'f'`` list of each term found in it.

    Args:
        filenames: Sized iterable of path strings.
        stop: Index threshold; the loop ends after processing a document
            whose index is greater than this value. Documents with empty
            text skip this check.
        ngramTup: ``(min_n, max_n)`` passed as ``ngram_range``.

    Returns:
        dict mapping each term to ``{'f': [file, ...]}``.
    """
    postings = {}
    total = len(filenames)

    for idx, rawName in enumerate(filenames):
        path = rawName.strip()
        text = getTextFromHTML(readTextFromFile(path))

        # Nothing to index for documents that yield no text.
        if not len(text):
            continue

        vectorizer = CountVectorizer(min_df=1,
                                     stop_words='english',
                                     ngram_range=ngramTup)
        # Only the fitted vocabulary is needed; the count matrix is discarded.
        vectorizer.fit_transform([text])

        for term in vectorizer.vocabulary_:
            postings.setdefault(term, {'f': []})['f'].append(path)

        if idx % 100 == 0:
            print(idx, 'of', total)

        if idx > stop:
            break

    return postings

Esempio n. 3

Mostra file

File: P3.py Progetto: UdochukwuNweke/cs834-f17-Information-Retrieval

def getVocabFreqDict(filenames):
    """Count, per term, how many documents contain it and dump the result.

    Every entry of ``filenames`` is stripped, read with ``readTextFromFile``
    and converted with ``getTextFromHTML``. Non-empty text is vectorized into
    unigrams and bigrams with English stop words removed, and the counter of
    each term present in the document is incremented by one. The final
    mapping is handed to ``dumpJsonToFile`` under the name ``1-2-gram.json``.

    Args:
        filenames: Sized iterable of path strings.

    Returns:
        None.
    """
    docFreq = {}
    total = len(filenames)

    for idx, rawName in enumerate(filenames):
        path = rawName.strip()
        text = getTextFromHTML(readTextFromFile(path))

        # Documents that yield no text do not affect the counts.
        if not len(text):
            continue

        vectorizer = CountVectorizer(min_df=1,
                                     stop_words='english',
                                     ngram_range=(1, 2))
        # Only the fitted vocabulary is needed; the count matrix is discarded.
        vectorizer.fit_transform([text])

        # Each term is counted once per document, however often it occurs.
        for term in vectorizer.vocabulary_:
            docFreq[term] = docFreq.get(term, 0) + 1

        if idx % 100 == 0:
            print(idx, 'of', total)

    dumpJsonToFile('1-2-gram.json', docFreq)

Esempio n. 4

Mostra file

from Porter import PorterStemmer
from krovetzstemmer import Stemmer

from common import readTextFromFile
from common import getTextFromHTML

# Demo script: extract the text of one article and print it unchanged, then
# as returned by PorterStemmer.useStemer and by the krovetzstemmer Stemmer.
krov = Stemmer()

f = 'en/articles/d/o/r/Dorothy_Block_a8f8.html'
text = getTextFromHTML(readTextFromFile(f))

# Single-argument print(...) calls parse under both Python 2 and Python 3,
# whereas the former Python 2 print statements are a SyntaxError on Python 3.
# The format strings reproduce the layout those statements emitted
# (label, newline, value, space, blank line).
# NOTE(review): the whole text is passed to each stemmer in one call --
# confirm both accept multi-word input rather than a single token.
print('ori:\n%s \n' % (text,))
print('porter:\n%s \n' % (PorterStemmer.useStemer(text),))
print('krov:\n%s \n' % (krov.stem(text),))