def compressionRatio(self, encodingWordUsage, compressionUsage, tree=None):
    """Return the ratio of Huffman-coded size to fixed-width (block) coded size.

    Arguments:
        encodingWordUsage: a speech.Speech or a dictionary of word -> frequency.
            It is only used to build the Huffman tree when ``tree`` is not
            supplied.
        compressionUsage: a speech.Speech or a dictionary of word -> frequency
            describing the text whose encoded size is measured.
        tree: optional pre-built coding tree exposing ``bitsNeededDictionary``
            (word -> number of bits of its codeword). Passing it avoids
            rebuilding the tree on every call.

    Returns:
        float(bitsInHuffmanCoding) / float(bitsInBlockCoding), or 0 when the
        block coding would need no bits (empty usage, a single distinct word,
        or no counted occurrences).

    A word of compressionUsage that is missing from
    ``tree.bitsNeededDictionary`` makes the lookup fail (KeyError for a plain
    dictionary).
    """
    # Either usage argument may be a Speech; unwrap it to its dictionary.
    if isinstance(encodingWordUsage, speech.Speech):
        encodingWordUsage = encodingWordUsage.wordUsage
    if isinstance(compressionUsage, speech.Speech):
        compressionUsage = compressionUsage.wordUsage

    # Nothing to encode. The previous implementation evaluated math.log(0, 2)
    # here and raised ValueError instead of returning 0.
    if not compressionUsage:
        return 0

    if tree is None:
        tree = huffmanCoding.huffmanCodingTree(encodingWordUsage)

    # bitsNeededDictionary caches the codeword length of every word; looking
    # each word up in the tree itself was too slow for the number of lookups.
    bitsNeeded = tree.bitsNeededDictionary

    wordCount = 0
    bitsInHuffmanCoding = 0
    # items() rather than iteritems() so the loop runs on Python 2 and 3.
    for word, storedCount in compressionUsage.items():
        # Per the original note, additive smoothing is not used for the ratio:
        # one is subtracted from every stored count (presumably the +1 that
        # smoothing added -- confirm against where wordUsage is built).
        occurrences = storedCount - 1
        bitsInHuffmanCoding += occurrences * bitsNeeded[word]
        wordCount += occurrences

    # Fixed-width code: ceil(log2(distinct words)) bits per word. Computed with
    # integer arithmetic because int(math.ceil(math.log(n, 2))) can be one too
    # large when n is an exact power of two (floating-point rounding).
    bitsPerBlockWord = (len(compressionUsage) - 1).bit_length()
    bitsInBlockCoding = wordCount * bitsPerBlockWord

    if bitsInBlockCoding > 0:
        return float(bitsInHuffmanCoding) / float(bitsInBlockCoding)
    return 0
if(len(sys.argv) >= 3): print "finding speech set" set = speechSet.SpeechSet(sys.argv[1]) reverse = False if(len(sys.argv) == 4): reverse = True print "finding all filenames" if(reverse):#note that reversing the list puts it in chronological non-reverse listRecentFilenames = sorted(glob.glob(set.directoryPath+"/"+set.fileType),reverse = False) else: listRecentFilenames = sorted(glob.glob(set.directoryPath+"/"+set.fileType),reverse = True) print "finding word usage" wordUsage = determineWordUsage(set.wordUsage,int(sys.argv[2])) wordsSorted = sorted(wordUsage.iteritems(),key = operator.itemgetter(1)) print "start finding coding tree "+str(time.clock()) tree = huffmanCoding.huffmanCodingTree(wordUsage) print "bits of the longest codeword:"+str(wordsSorted[0])+", "+str(tree.bitsNeededDictionary[wordsSorted[0][0]]) print "bits of the shortest codeword:"+str(wordsSorted[len(wordsSorted)-1])+", "+str(tree.bitsNeededDictionary[wordsSorted[len(wordsSorted)-1][0]]) #print "bit the "+ str(tree.bitsNeededDictionary["the"]) #print "bit a " + str(tree.bitsNeededDictionary["a"]) #print "bit for " + str(tree.bitsNeededDictionary["for"]) print "completed finding coding tree "+str(time.clock()) file = None fileForWordCount = open("WordCount.txt","w+") if(reverse): file = open(sys.argv[2]+"LeastRecentSpeech.txt","w+") else: file = open(sys.argv[2]+"MostRecentSpeech.txt","w+") print "starting compression ratio, "+str(time.clock()) for filename in listRecentFilenames: