def main(rootDirectory, words, confidenceLevel, functionType):
    corpus = Corpus(rootDirectory, toLowercase=TO_LOWERCASE, filters=FILTERS)
    # sampler = Sampler(SAMPLE_SIZE, sampleLength=SAMPLE_LENGTH)
    sampler = Sampler(SAMPLE_SIZE, sampleLengthPercentage=SAMPLE_LENGTH_PERCENTAGE)
    documentSamples = {}
    for documentTitle in corpus.documents:
        documentSample = sampler.sample(corpus.documents[documentTitle], usePercentage=True)
        documentSamples[documentTitle] = documentSample
    wordCounter = WordCounter(words, SAMPLE_SIZE)
    wordCounter.countOccurrences(documentSamples)
    dataLabels = sorted(wordCounter.occurrences.keys())
    dataSets = []
    for dataLabel in dataLabels:
        # dataSet = wordCounter.occurrences[dataLabel]
        dataSet = wordCounter.occurrencesPerMillionWords[dataLabel]
        dataSets.append(dataSet)
    # pass the caller-supplied confidence level instead of the module constant
    statisticsPlotter = StatisticsPlotter(dataLabels, dataSets, confidenceLevel, words)
    statisticsPlotter.plotStatistics(functionType=functionType)

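# A minimal sketch of the per-million-words normalization selected above;
# WordCounter's internals are not shown, so the helper name and signature
# here are assumptions for illustration only.
def occurrences_per_million(raw_count, total_words):
    # Scale a raw occurrence count to a frequency per million words,
    # which makes samples of different lengths comparable.
    return raw_count * 1_000_000 / total_words

# e.g. 42 hits in a 150,000-word sample -> 280.0 occurrences per million words
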
def test_counts_a_real_java_sourceline(self):
    sample = """
        private void addOrderEntryLibJars(Tag set) {
            for (String jarName : libJars())
                if (isJunit(jarName))
                    addLibraryOrderEntry(set, "junit");
                else
                    addLibraryOrderEntry(set, jarName.split(".jar")[0]);
        }"""
    counter = WordCounter()
    counter.count(sample)
    expected = [('jar', 4), ('set', 3), ('name', 3), ('add', 3)]
    self.assertEqual(expected, counter.mostUsedWords(4))

def main():
    # raw string avoids the invalid backslash escapes in the original mixed-separator path
    rootdir = r'C:\Users\pouya\python-projects\truth-goggles\congressionalrecord\output'
    wordStems = {}  # maps each speaker to the stemmed words they said

    # dictionary of legislators, used to cross-reference party and chamber
    legislators = PoliticiansDatabase()

    # if we want to re-parse/clean the congressional record, this builds a
    # dictionary where the key is a speaker and the value is a list of all of
    # their speeches
    speakerDict = buildWriterFile(rootdir)

    numberOfSpeechesDems = 0  # number of speeches given by Democrats
    numberOfSpeechesGop = 0   # number of speeches given by Republicans
    words = WordCounter()

    for speaker in speakerDict:  # iterate over the speakers of all speeches
        # list of [name of speaker (+ " of " + their state), chamber]
        speakerArr = speaker.split(";")
        speakerName = speakerArr[0]  # speaker's name
        if speaker not in wordStems:
            wordStems[speaker] = []  # add speaker to wordStems if not already present
        # speaker's name without prefix, plus their chamber
        speakerDictKey = speaker.split(".")[-1].lower()[1:]
        countedSpeeches = False  # ensures each speaker's speech total is added only once
        for tup in speakerDict[speaker]:
            year = tup[0]    # year the speech was given
            speech = tup[1]  # text of the speech
            # party of the speaker; "not found" if the speaker cannot be resolved
            speakerParty = legislators.getSpeakerParty(year, speakerDictKey)
            # add the speaker's speech count to the appropriate party's total
            if not countedSpeeches:
                if speakerParty == "democrat":
                    numberOfSpeechesDems += len(speakerDict[speaker])
                elif speakerParty == "republican":
                    numberOfSpeechesGop += len(speakerDict[speaker])
                countedSpeeches = True
            words.addSingleWords(speech, speakerParty)

    words.calculateFrequencies(minimumOccs=125)
    words.printTopFrequencies(howMany=1000, whichParty='gop')
    words.printTopFrequencies(howMany=1000, whichParty='dem')

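# A minimal sketch of the per-party tallying that WordCounter.addSingleWords
# could perform; the real implementation is not shown, so the tokenization and
# attribute names here are assumptions for illustration.
from collections import Counter

class PartyWordTally:
    def __init__(self):
        # one Counter per party label, created on first use
        self.countsByParty = {}

    def addSingleWords(self, speech, party):
        # lowercase and split on whitespace; the real version may also stem words
        tokens = speech.lower().split()
        self.countsByParty.setdefault(party, Counter()).update(tokens)
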
def main(indir='wc_input', outdir='wc_output', word_count_report='wc_result.txt',
         running_median_report='med_result.txt', running_median_method='hashtracker'):
    # create the output directory if it does not already exist
    os.makedirs(outdir, exist_ok=True)

    # initialize data structures
    wc = WordCounter(report_filename=os.path.join(outdir, word_count_report))
    rm = RunningMedian.factory(running_median_method,
                               report_filename=os.path.join(outdir, running_median_report))

    for file in os.listdir(indir):
        if not file.endswith('.txt'):
            continue
        with open(os.path.join(indir, file), 'r') as f:
            for line in f:
                words = line.split()
                # count each word
                for w in words:
                    wc.update(w)
                # track the number of words per line for the running median
                rm.update(len(words))
    rm.report()
    # report word count after everything is done
    wc.report()

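# RunningMedian.factory('hashtracker') is not shown here; for reference, this
# is a minimal two-heap running median with a similar update()-style interface.
# It is a sketch of the classic technique, not the project's 'hashtracker'
# implementation.
import heapq

class TwoHeapRunningMedian:
    def __init__(self):
        self.lo = []  # max-heap (stored negated) for the lower half
        self.hi = []  # min-heap for the upper half

    def update(self, x):
        # push x through the upper half, moving its minimum to the lower half
        heapq.heappush(self.lo, -heapq.heappushpop(self.hi, x))
        # rebalance so len(lo) is len(hi) or len(hi) + 1
        if len(self.lo) > len(self.hi) + 1:
            heapq.heappush(self.hi, -heapq.heappop(self.lo))

    def median(self):
        if len(self.lo) > len(self.hi):
            return -self.lo[0]
        return (-self.lo[0] + self.hi[0]) / 2.0
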
def test_acronym_in_middle(self):
    counter = WordCounter()
    result = counter._uncamelCase('embedHTMLHere')
    self.assertEqual(['embed', 'HTML', 'Here'], list(result))

def test_joined_leadingUpper(self):
    counter = WordCounter()
    result = counter._uncamelCase('LeadingUpper')
    self.assertEqual(['Leading', 'Upper'], list(result))

def test_capitalized(self):
    counter = WordCounter()
    result = counter._uncamelCase('This')
    self.assertEqual(['This'], list(result))

def test_failFast(self):
    sample = "public Album"
    counter = WordCounter()
    counter.count(sample)
    self.assertEqual(1, counter.timesOccurred('public'))

def test_differentCase(self):
    counter = WordCounter()
    counter.count("Album album ALBUM")
    self.assertEqual(3, counter.timesOccurred('album'))

def test_all_upper(self):
    counter = WordCounter()
    result = counter._uncamelCase('THIS')
    self.assertEqual(['THIS'], list(result))

def test_all_lower(self):
    counter = WordCounter()
    result = counter._uncamelCase('this')
    self.assertEqual(['this'], list(result))

def test_counts_words_despite_punctuation(self):
    counter = WordCounter()
    counter.count("Album album = new ALBUM()")
    self.assertEqual(3, counter.timesOccurred('album'))

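# The counting behavior exercised above (case folding plus punctuation
# stripping) can be satisfied by a sketch like this; the real
# WordCounter.count is not shown, so treat this as an illustration.
import re
from collections import Counter

class SimpleWordCounter:
    def __init__(self):
        self._counts = Counter()

    def count(self, text):
        # extract alphabetic runs, ignoring punctuation, and fold case
        for token in re.findall(r'[A-Za-z]+', text):
            self._counts[token.lower()] += 1

    def timesOccurred(self, word):
        return self._counts[word.lower()]
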
from HelperMethods import *
from WordCounter import WordCounter
import random

# Create five WordCounters with nonsense words and random word counts
for _ in range(5):
    wc = WordCounter(makeNonsenseWord(), random.randint(1, 45))
    print(wc)

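# HelperMethods.makeNonsenseWord is imported above but not shown; a plausible
# sketch (the length range and alphabet are assumptions):
import random
import string

def makeNonsenseWord(min_len=3, max_len=10):
    # build a random lowercase string of random length
    length = random.randint(min_len, max_len)
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))
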
import argparse

from InputFileValidator import InputFileValidator
from WordCounter import WordCounter

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='This is a Python version of the grep function.')
    parser.add_argument('name_of_file', type=str,
                        help='the name of the file to check')
    args = parser.parse_args()
    name_of_file = args.name_of_file

    word_searcher = InputFileValidator(name_of_file)
    if word_searcher.validate():
        word_counter = WordCounter(word_searcher.get_list_of_files())
        word_counter.returning_count_of_all_chars_words_lines_provided()

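# Example invocation of the script above (the script filename here is an
# assumption):
#   python word_counter_cli.py input.txt
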
def test_joined_leadingLower(self):
    counter = WordCounter()
    result = counter._uncamelCase('thisTime')
    self.assertEqual(['this', 'Time'], list(result))

def test_end_with_acronym(self):
    counter = WordCounter()
    result = counter._uncamelCase('endWithHTML')
    self.assertEqual(['end', 'With', 'HTML'], list(result))

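# The _uncamelCase behavior pinned down by the tests above (split camelCase,
# keep embedded and trailing acronyms intact, leave all-caps and all-lower
# tokens alone) can be reproduced with a single regular expression; the real
# implementation is not shown, so this is an illustrative sketch.
import re

_CAMEL = re.compile(r'[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+')

def uncamel_case(identifier):
    # 'embedHTMLHere' -> ['embed', 'HTML', 'Here']
    # 'endWithHTML'   -> ['end', 'With', 'HTML']
    # 'THIS'          -> ['THIS']
    return _CAMEL.findall(identifier)
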
def test_word_count(self):
    word_list = ['test', 'this', 'this', 'that', 'is', 'test',
                 'is', 'test', 'is', 'of', 'yes']
    word_counter = WordCounter(words=word_list)
    self.assertEqual(word_counter.wordcount,
                     [('test', 3), ('is', 3), ('this', 2),
                      ('that', 1), ('of', 1), ('yes', 1)])

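# The expected ordering above (descending count, first-seen order for ties,
# since most_common sorts stably) matches collections.Counter.most_common, so
# wordcount could plausibly be a one-liner; the actual WordCounter is not shown.
from collections import Counter

def word_count(words):
    return Counter(words).most_common()

# word_count(['test', 'this', 'this', 'that', 'is', 'test',
#             'is', 'test', 'is', 'of', 'yes'])
# -> [('test', 3), ('is', 3), ('this', 2), ('that', 1), ('of', 1), ('yes', 1)]
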
import os

from WordCounter import WordCounter, FileWordExtractor

FILEPATH = os.path.join(os.path.dirname(__file__), 'input.txt')

try:
    if os.path.exists(FILEPATH):
        text_file = FileWordExtractor(FILEPATH)
        words = text_file.words
        word_counter = WordCounter(words=words)   # init WordCounter class
        word_count_dict = word_counter.wordcount  # get word count
        text_file.display_total_words()
        word_counter.display_word_counts()
    else:
        print('File not found')
except Exception as error:
    # report the failure instead of silently swallowing it with a bare except
    print(f'Application Error: {error}')

Results
-------
- All output is written to the output file (output.txt).

Notes
-----
- The program treats the input file as case-insensitive.
- All special characters in the input file are filtered out.
- This program calls functions from WordCounter.py
"""
import sys
import os

from WordCounter import WordCounter

# validate the number of input arguments
if len(sys.argv) < 2:
    print("filename needed as command argument")
    sys.exit(1)
filename = sys.argv[1]

# validate the input file
if not os.path.isfile(filename):
    print(f"Error: {filename} is not a file.")
    print("Exit Program")
    sys.exit(1)

wc = WordCounter(filename)
wc.generateHistogram()

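# WordCounter.generateHistogram is not shown; a minimal sketch consistent with
# the notes above (case folding, special characters stripped, results written
# to output.txt) might look like this. The function name and output format are
# assumptions.
import re
from collections import Counter

def generate_histogram(filename, outfile='output.txt'):
    with open(filename) as f:
        # lowercase everything and keep only alphabetic runs
        tokens = re.findall(r'[a-z]+', f.read().lower())
    counts = Counter(tokens)
    with open(outfile, 'w') as out:
        for word, n in counts.most_common():
            out.write(f'{word} {n}\n')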