def main(rootDirectory, words, confidenceLevel, functionType):
    corpus = Corpus(rootDirectory, toLowercase=TO_LOWERCASE, filters=FILTERS)
    # sampler = Sampler(SAMPLE_SIZE, sampleLength=SAMPLE_LENGTH)
    sampler = Sampler(SAMPLE_SIZE, sampleLengthPercentage=SAMPLE_LENGTH_PERCENTAGE)
    documentSamples = {}
    for documentTitle in corpus.documents:
        documentSample = sampler.sample(corpus.documents[documentTitle], usePercentage=True)
        documentSamples[documentTitle] = documentSample
    wordCounter = WordCounter(words, SAMPLE_SIZE)
    wordCounter.countOccurrences(documentSamples)
    dataLabels = sorted(wordCounter.occurrences.keys())
    dataSets = []
    for dataLabel in dataLabels:
        # dataSet = wordCounter.occurrences[dataLabel]
        dataSet = wordCounter.occurrencesPerMillionWords[dataLabel]
        dataSets.append(dataSet)
    # pass the caller-supplied confidence level instead of the module constant
    statisticsPlotter = StatisticsPlotter(dataLabels, dataSets, confidenceLevel, words)
    statisticsPlotter.plotStatistics(functionType=functionType)

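# A minimal sketch of the per-million-words normalization selected above;
# WordCounter's internals are not shown, so the helper name and signature
# here are assumptions for illustration only.
def occurrences_per_million(raw_count, total_words):
    # Scale a raw occurrence count to a frequency per million words,
    # which makes samples of different lengths comparable.
    return raw_count * 1_000_000 / total_words

# e.g. 42 hits in a 150,000-word sample -> 280.0 occurrences per million words
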
def test_counts_a_real_java_sourceline(self):
    sample = """
        private void addOrderEntryLibJars(Tag set) {
            for (String jarName : libJars())
                if (isJunit(jarName))
                    addLibraryOrderEntry(set, "junit");
                else
                    addLibraryOrderEntry(set, jarName.split(".jar")[0]);
        }"""
    counter = WordCounter()
    counter.count(sample)
    expected = [('jar', 4), ('set', 3), ('name', 3), ('add', 3)]
    self.assertEqual(expected, counter.mostUsedWords(4))

def main():
    # raw string avoids the invalid backslash escapes in the original mixed-separator path
    rootdir = r'C:\Users\pouya\python-projects\truth-goggles\congressionalrecord\output'
    wordStems = {}  # maps each speaker to the stemmed words they said

    # dictionary of legislators, used to cross-reference party and chamber
    legislators = PoliticiansDatabase()

    # if we want to re-parse/clean the congressional record, this builds a
    # dictionary where the key is a speaker and the value is a list of all of
    # their speeches
    speakerDict = buildWriterFile(rootdir)

    numberOfSpeechesDems = 0  # number of speeches given by Democrats
    numberOfSpeechesGop = 0   # number of speeches given by Republicans
    words = WordCounter()

    for speaker in speakerDict:  # iterate over the speakers of all speeches
        # list of [name of speaker (+ " of " + their state), chamber]
        speakerArr = speaker.split(";")
        speakerName = speakerArr[0]  # speaker's name
        if speaker not in wordStems:
            wordStems[speaker] = []  # add speaker to wordStems if not already present
        # speaker's name without prefix, plus their chamber
        speakerDictKey = speaker.split(".")[-1].lower()[1:]
        countedSpeeches = False  # ensures each speaker's speech total is added only once
        for tup in speakerDict[speaker]:
            year = tup[0]    # year the speech was given
            speech = tup[1]  # text of the speech
            # party of the speaker; "not found" if the speaker cannot be resolved
            speakerParty = legislators.getSpeakerParty(year, speakerDictKey)
            # add the speaker's speech count to the appropriate party's total
            if not countedSpeeches:
                if speakerParty == "democrat":
                    numberOfSpeechesDems += len(speakerDict[speaker])
                elif speakerParty == "republican":
                    numberOfSpeechesGop += len(speakerDict[speaker])
                countedSpeeches = True
            words.addSingleWords(speech, speakerParty)

    words.calculateFrequencies(minimumOccs=125)
    words.printTopFrequencies(howMany=1000, whichParty='gop')
    words.printTopFrequencies(howMany=1000, whichParty='dem')

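# A minimal sketch of the per-party tallying that WordCounter.addSingleWords
# could perform; the real implementation is not shown, so the tokenization and
# attribute names here are assumptions for illustration.
from collections import Counter

class PartyWordTally:
    def __init__(self):
        # one Counter per party label, created on first use
        self.countsByParty = {}

    def addSingleWords(self, speech, party):
        # lowercase and split on whitespace; the real version may also stem words
        tokens = speech.lower().split()
        self.countsByParty.setdefault(party, Counter()).update(tokens)
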
def main(indir='wc_input', outdir='wc_output', word_count_report='wc_result.txt',
         running_median_report='med_result.txt', running_median_method='hashtracker'):
    # create the output directory if it does not already exist
    os.makedirs(outdir, exist_ok=True)

    # initialize data structures
    wc = WordCounter(report_filename=os.path.join(outdir, word_count_report))
    rm = RunningMedian.factory(running_median_method,
                               report_filename=os.path.join(outdir, running_median_report))

    for file in os.listdir(indir):
        if not file.endswith('.txt'):
            continue
        with open(os.path.join(indir, file), 'r') as f:
            for line in f:
                words = line.split()
                # count each word
                for w in words:
                    wc.update(w)
                # track the number of words per line for the running median
                rm.update(len(words))
    rm.report()
    # report word count after everything is done
    wc.report()

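# RunningMedian.factory('hashtracker') is not shown here; for reference, this
# is a minimal two-heap running median with a similar update()-style interface.
# It is a sketch of the classic technique, not the project's 'hashtracker'
# implementation.
import heapq

class TwoHeapRunningMedian:
    def __init__(self):
        self.lo = []  # max-heap (stored negated) for the lower half
        self.hi = []  # min-heap for the upper half

    def update(self, x):
        # push x through the upper half, moving its minimum to the lower half
        heapq.heappush(self.lo, -heapq.heappushpop(self.hi, x))
        # rebalance so len(lo) is len(hi) or len(hi) + 1
        if len(self.lo) > len(self.hi) + 1:
            heapq.heappush(self.hi, -heapq.heappop(self.lo))

    def median(self):
        if len(self.lo) > len(self.hi):
            return -self.lo[0]
        return (-self.lo[0] + self.hi[0]) / 2.0
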
def test_acronym_in_middle(self):
    counter = WordCounter()
    result = counter._uncamelCase('embedHTMLHere')
    self.assertEqual(['embed', 'HTML', 'Here'], list(result))

def test_joined_leadingUpper(self):
    counter = WordCounter()
    result = counter._uncamelCase('LeadingUpper')
    self.assertEqual(['Leading', 'Upper'], list(result))

def test_capitalized(self):
    counter = WordCounter()
    result = counter._uncamelCase('This')
    self.assertEqual(['This'], list(result))

def test_failFast(self):
    sample = "public Album"
    counter = WordCounter()
    counter.count(sample)
    self.assertEqual(1, counter.timesOccurred('public'))

def test_differentCase(self):
    counter = WordCounter()
    counter.count("Album album ALBUM")
    self.assertEqual(3, counter.timesOccurred('album'))

def test_all_upper(self):
    counter = WordCounter()
    result = counter._uncamelCase('THIS')
    self.assertEqual(['THIS'], list(result))

def test_all_lower(self):
    counter = WordCounter()
    result = counter._uncamelCase('this')
    self.assertEqual(['this'], list(result))

def test_counts_words_despite_punctuation(self):
    counter = WordCounter()
    counter.count("Album album = new ALBUM()")
    self.assertEqual(3, counter.timesOccurred('album'))

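# The counting behavior exercised above (case folding plus punctuation
# stripping) can be satisfied by a sketch like this; the real
# WordCounter.count is not shown, so treat this as an illustration.
import re
from collections import Counter

class SimpleWordCounter:
    def __init__(self):
        self._counts = Counter()

    def count(self, text):
        # extract alphabetic runs, ignoring punctuation, and fold case
        for token in re.findall(r'[A-Za-z]+', text):
            self._counts[token.lower()] += 1

    def timesOccurred(self, word):
        return self._counts[word.lower()]
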
from HelperMethods import *
from WordCounter import WordCounter
import random

# Create five WordCounters with nonsense words and random word counts
for _ in range(5):
    wc = WordCounter(makeNonsenseWord(), random.randint(1, 45))
    print(wc)

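# HelperMethods.makeNonsenseWord is imported above but not shown; a plausible
# sketch (the length range and alphabet are assumptions):
import random
import string

def makeNonsenseWord(min_len=3, max_len=10):
    # build a random lowercase string of random length
    length = random.randint(min_len, max_len)
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))
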
import argparse

from InputFileValidator import InputFileValidator
from WordCounter import WordCounter

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='This is a Python version of the grep function.')
    parser.add_argument('name_of_file', type=str,
                        help='the name of the file to check')
    args = parser.parse_args()
    name_of_file = args.name_of_file

    word_searcher = InputFileValidator(name_of_file)
    if word_searcher.validate():
        word_counter = WordCounter(word_searcher.get_list_of_files())
        word_counter.returning_count_of_all_chars_words_lines_provided()

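# Example invocation of the script above (the script filename here is an
# assumption):
#   python word_counter_cli.py input.txt
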
def test_joined_leadingLower(self):
    counter = WordCounter()
    result = counter._uncamelCase('thisTime')
    self.assertEqual(['this', 'Time'], list(result))

def test_end_with_acronym(self):
    counter = WordCounter()
    result = counter._uncamelCase('endWithHTML')
    self.assertEqual(['end', 'With', 'HTML'], list(result))

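# The _uncamelCase behavior pinned down by the tests above (split camelCase,
# keep embedded and trailing acronyms intact, leave all-caps and all-lower
# tokens alone) can be reproduced with a single regular expression; the real
# implementation is not shown, so this is an illustrative sketch.
import re

_CAMEL = re.compile(r'[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+')

def uncamel_case(identifier):
    # 'embedHTMLHere' -> ['embed', 'HTML', 'Here']
    # 'endWithHTML'   -> ['end', 'With', 'HTML']
    # 'THIS'          -> ['THIS']
    return _CAMEL.findall(identifier)
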
def test_word_count(self):
    word_list = ['test', 'this', 'this', 'that', 'is', 'test',
                 'is', 'test', 'is', 'of', 'yes']
    word_counter = WordCounter(words=word_list)
    self.assertEqual(word_counter.wordcount,
                     [('test', 3), ('is', 3), ('this', 2),
                      ('that', 1), ('of', 1), ('yes', 1)])

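# The expected ordering above (descending count, first-seen order for ties,
# since most_common sorts stably) matches collections.Counter.most_common, so
# wordcount could plausibly be a one-liner; the actual WordCounter is not shown.
from collections import Counter

def word_count(words):
    return Counter(words).most_common()

# word_count(['test', 'this', 'this', 'that', 'is', 'test',
#             'is', 'test', 'is', 'of', 'yes'])
# -> [('test', 3), ('is', 3), ('this', 2), ('that', 1), ('of', 1), ('yes', 1)]
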
import os

from WordCounter import WordCounter, FileWordExtractor

FILEPATH = os.path.join(os.path.dirname(__file__), 'input.txt')

try:
    if os.path.exists(FILEPATH):
        text_file = FileWordExtractor(FILEPATH)
        words = text_file.words
        word_counter = WordCounter(words=words)   # init WordCounter class
        word_count_dict = word_counter.wordcount  # get word count
        text_file.display_total_words()
        word_counter.display_word_counts()
    else:
        print('File not found')
except Exception as error:
    # report the failure instead of silently swallowing it with a bare except
    print(f'Application Error: {error}')

Results
-------
- All output is written to the output file (output.txt).

Notes
-----
- The program treats the input file as case-insensitive.
- All special characters in the input file are filtered out.
- This program calls functions from WordCounter.py
"""
import sys
import os

from WordCounter import WordCounter

# validate the number of input arguments
if len(sys.argv) < 2:
    print("filename needed as command argument")
    sys.exit(1)
filename = sys.argv[1]

# validate the input file
if not os.path.isfile(filename):
    print(f"Error: {filename} is not a file.")
    print("Exit Program")
    sys.exit(1)

wc = WordCounter(filename)
wc.generateHistogram()

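# WordCounter.generateHistogram is not shown; a minimal sketch consistent with
# the notes above (case folding, special characters stripped, results written
# to output.txt) might look like this. The function name and output format are
# assumptions.
import re
from collections import Counter

def generate_histogram(filename, outfile='output.txt'):
    with open(filename) as f:
        # lowercase everything and keep only alphabetic runs
        tokens = re.findall(r'[a-z]+', f.read().lower())
    counts = Counter(tokens)
    with open(outfile, 'w') as out:
        for word, n in counts.most_common():
            out.write(f'{word} {n}\n')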