def crawlerWordFrequencies():
    '''Accumulate word frequencies across every non-empty file in data/content.

    Each file is read, stripped to its visible text with BeautifulSoup,
    tokenized via Utilities.tokenizeFile, and counted with
    wordFrequencyCount; the 500 most frequent words (ties broken
    alphabetically) are appended to CommonWords.txt.

    Fixes over the previous version: the bare ``except:`` (which swallowed
    even KeyboardInterrupt) is narrowed to ``Exception``; the unused
    ``parsedCount`` counter is removed; the output file is opened once
    instead of once per written word; ``os.stat(...).st_size`` replaces the
    magic tuple index 6.
    '''
    def freqSort(entry):
        # Order by descending frequency, then alphabetically for ties.
        return (-entry[1], entry[0])

    wordFreqs = dict()
    for fileName in os.listdir("data/content"):
        path = "data/content/" + fileName
        if os.stat(path).st_size > 0:  # skip empty files
            try:
                with open(path) as mUp:
                    text = BeautifulSoup(mUp.read()).get_text()
                # NOTE(review): dict.update REPLACES counts for words already
                # seen in earlier files rather than summing them — confirm
                # that wordFrequencyCount's return makes this the intent.
                wordFreqs.update(wordFrequencyCount(Utilities.tokenizeFile(text)))
            except Exception:
                # Best effort: a single unreadable/unparsable file must not
                # abort the whole accumulation run.
                continue

    # Write the top 500 entries with a single open of the output file.
    ranked = sorted(wordFreqs.items(), key=freqSort)
    with open('CommonWords.txt', 'a') as f:
        for word, count in ranked[:500]:
            f.write('{0:<20} {1}\n'.format(str(word), str(count)))
    print('Completed Word Frequency Accumulation')
def main(filename: 'str') -> None:
    '''Tokenize the input, count palindrome occurrences, and print
    each palindrome together with its frequency.
    '''
    # Single pipeline: tokenize -> count palindromes -> print.
    Utilities.printFrequencies(
        computePalindromeFrequencies(Utilities.tokenizeFile(filename)))
def main(filename: 'str') -> None:
    '''Compute adjacent two-word combinations (2-grams) from the input
    and print each one alongside its frequency.
    '''
    # Single pipeline: tokenize -> count 2-grams -> print.
    Utilities.printFrequencies(
        computeTwoGramFrequencies(Utilities.tokenizeFile(filename)))
def main(filename: 'str') -> None:
    '''Tokenize the input into words and print each word alongside its
    frequency.
    '''
    # Single pipeline: tokenize -> count words -> print.
    # (Input is expected alongside this script, per the original notes.)
    Utilities.printFrequencies(
        computeWordFrequencies(Utilities.tokenizeFile(filename)))
def crawlerWordFrequencies():
    '''Accumulate word frequencies across every non-empty file in
    data/content, record the single longest extracted page text to
    longestPage.txt, and append the 500 most frequent words to
    CommonWords.txt (ordered by frequency, ties alphabetical).

    Fixes over the previous version: the longest-page tracking used to run
    in a ``finally`` block and raised NameError when the very first file
    failed before ``text`` was assigned; ``longestTextFileName`` was unbound
    at write time if no file ever parsed; the bare ``except:`` is narrowed
    to ``Exception``; CommonWords.txt is opened once instead of per word.
    '''
    def freqSort(entry):
        # Order by descending frequency, then alphabetically for ties.
        return (-entry[1], entry[0])

    longestText = ''
    longestTextFileName = ''  # safe default in case nothing parses
    wordFreqs = dict()
    for fileName in os.listdir("data/content"):
        path = "data/content/" + fileName
        if os.stat(path).st_size > 0:  # skip empty files
            try:
                with open(path) as mUp:
                    text = BeautifulSoup(mUp.read()).get_text()
                wordFreqs.update(wordFrequencyCount(Utilities.tokenizeFile(text)))
            except Exception:
                # A single bad file must not abort the accumulation run.
                continue
            # Track the longest page only after a successful parse, so
            # `text` is always bound here.
            if len(text) > len(longestText):
                longestText = text
                longestTextFileName = fileName
            print(fileName)

    with open("longestPage.txt", 'a') as lPage:
        lPage.write(longestTextFileName + '\n')
        lPage.write(longestText)

    # Emit the top 500 words with a single open of the output file.
    ranked = sorted(wordFreqs.items(), key=freqSort)
    with open('CommonWords.txt', 'a') as f:
        for word, count in ranked[:500]:
            print(count)  # progress/diagnostic output, as before
            f.write(word + '\n')
    print('Completed Word Frequency Accumulation')
if not tokens: #check if list is empty, [] == False return [] tempFreq = defaultdict(int) palindromeAccumulator = '' '' #The variable accumulates tokens into a string until a palindrome is form tokens.append('addOne') #For the algorithm to work it is necessary that the last token ends a palindrome for i in range(len(tokens)): ''' iterates for the length of the tokens list. Each iteration checks palindromAccumulator and the reverse of palindrome accumulator twice. if checks for non-matches and catches the empty string(entry case). When true, appends token[i] to palindromeAccumulator else there is a palindrome. tempFreq is incremented and palindromeAccumulator is reset to token[i] ''' if palindromeAccumulator != palindromeAccumulator[::-1] or palindromeAccumulator == '': # compare pal and revesed pal; check for pal palindromeAccumulator += tokens[i] #concatenate tokens[i] to palindromeAccumulator else: tempFreq[palindromeAccumulator] += 1 # assign value to dict palindromeAccumulator = tokens[i] #reset to palindromeAccumulator to tokens[i] return Utilities.collateFrequencies(tempFreq) if __name__ == '__main__': #createTestFile() Utilities.printFrequencies(palindromeFrequencyCount(Utilities.tokenizeFile(open('test.txt').read())))
'''
TwoGramFrequencyCounter.py Python 34
'''
import Utilities
from Frequency import Frequency
from collections import defaultdict


def twoGramFrequencyCount(tokens : [str]) -> [Frequency]:
    '''Count how often each adjacent word pair (2-gram) appears in
    *tokens*, returning the collated Frequency list.

    An empty token list yields an empty list; empty-string tokens are
    discarded before pairing.
    '''
    if not tokens:  # [] is falsy — nothing to count
        return []
    # Remove empty-string tokens so they cannot appear inside a 2-gram.
    words = [token for token in tokens if token != '']
    counts = defaultdict(int)
    # Pair every word with its immediate successor to form the 2-grams,
    # keyed as 'first second' with the occurrence count as the value.
    for first, second in zip(words, words[1:]):
        counts[first + ' ' + second] += 1
    return Utilities.collateFrequencies(counts)


if __name__ == '__main__':
    tokens = Utilities.tokenizeFile(open('xyz.txt').read())
    Utilities.printFrequencies(twoGramFrequencyCount(tokens))