def __init__(self, streets_file, towns_file, country_file):
    super(AddressDetector, self).__init__('Detect addresses')
    self.streets = WLTStorage(streets_file)
    self.towns = WLTStorage(towns_file)
    self.countries = WordSet(country_file)
import sqlite3 as sql           # assumed import: the script calls sql.connect()
from string import punctuation  # assumed import: source of the `punctuation` sequence
#from nltk.stem.lancaster import LancasterStemmer


# word: The string in which the word being modified is stored.
# punctuation: Tuple containing punctuation that is to be removed.
def removePunctuation(word, punctuation):
    # Strip trailing punctuation characters.
    while word and word[-1] in punctuation:
        word = word[:-1]
    return word


# Main
#st = LancasterStemmer()
con = sql.connect("UselessWords.db")
cur = con.cursor()

# Word_List stores words from the input paragraph.
Word_List = WordSet()
paragraph = input("Enter the text to be summarized: ")

# tokens: list of words in the input paragraph.
tokens = paragraph.split()

# Removes formatting from each word, then adds it to Word_List.
for item in tokens:
    item = item.strip()
    item = removePunctuation(item, punctuation)
    #item = st.stem(item)
    if not useless(cur, item):
        Word_List.add(Word(item, None, 1))

# output stores the Word object with the highest count.
output = Word_List.max()
print(output.word)
con.close()
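# The script depends on a useless(cur, word) helper that is not shown above.
# A minimal sketch of what it might look like, assuming UselessWords.db holds
# a single stop-word table; the table and column names (`words`, `word`) are
# hypothetical, since the real schema does not appear in the source.
def useless(cur, word):
    """Return True if `word` is recorded as a stop word in UselessWords.db."""
    cur.execute("SELECT 1 FROM words WHERE word = ? LIMIT 1", (word.lower(),))
    return cur.fetchone() is not None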
def __init__(self, ignore_list):
    super(AbbrDetector, self).__init__('Detect abbreviations')
    self.ignored_words = WordSet(ignore_list, filt=lambda x: x.lower())
    self.cache = set()
def generateOutputFile(developmentSetFilename, testSetFilename, inputWord, outputFilename):
    print "Started with:"
    print "\tDevelopment set filename: %s" % developmentSetFilename
    print "\tTest set filename: %s" % testSetFilename
    print "\tInput word: %s" % inputWord
    print "\tOutput filename: %s" % outputFilename

    vocabularySize = 300000
    file = open(outputFilename, "w+")
    file.write("#Students:\tSaar Arbel (315681775), Boaz Berman (311504401)\n")
    file.write("Output1: " + developmentSetFilename + "\n")
    file.write("Output2: " + testSetFilename + "\n")
    file.write("Output3: " + inputWord + "\n")
    file.write("Output4: " + outputFilename + "\n")
    file.write("Output5: " + str(vocabularySize) + "\n")
    file.write("Output6: " + str(calcPuniform(vocabularySize)) + "\n")

    # Lidstone model
    with open(developmentSetFilename, 'rb') as input_file:
        input_file_data = input_file.read()
    words = parse_file_data(input_file_data)

    # Split 90/10 into training and validation sets.
    cuttingIndex = int(round(len(words) * 0.9))
    trainingSet, validationSet = words[:cuttingIndex], words[cuttingIndex:]
    trainingWordSet, validationWordSet = WordSet(trainingSet, vocabularySize), WordSet(validationSet, vocabularySize)

    file.write("Output7: " + str(len(words)) + "\n")
    file.write("Output8: " + str(validationWordSet.length) + "\n")
    file.write("Output9: " + str(trainingWordSet.length) + "\n")
    file.write("Output10: " + str(trainingWordSet.distinctLength) + "\n")
    file.write("Output11: " + str(trainingWordSet.countAppearances(inputWord)) + "\n")
    file.write("Output12: " + str(trainingWordSet.pMaximumLikelihoodEstimate(inputWord)) + "\n")
    file.write("Output13: " + str(trainingWordSet.pMaximumLikelihoodEstimate("unseen-word")) + "\n")
    print "Lidstone validation: " + str(validateLidstone(validationWordSet, 0.1))
    file.write("Output14: " + str(trainingWordSet.pLidstone(inputWord, 0.1)) + "\n")
    file.write("Output15: " + str(trainingWordSet.pLidstone("unseen-word", 0.1)) + "\n")
    file.write("Output16: " + str(lidstonPerplexity(trainingWordSet, validationWordSet, 0.01)) + "\n")
    file.write("Output17: " + str(lidstonPerplexity(trainingWordSet, validationWordSet, 0.1)) + "\n")
    file.write("Output18: " + str(lidstonPerplexity(trainingWordSet, validationWordSet, 1.0)) + "\n")
    minperplexity, minlamda = minimumPerplexityZeroToTwo(trainingWordSet, validationWordSet)
    file.write("Output19: " + str(minlamda) + "\n")
    file.write("Output20: " + str(minperplexity) + "\n")

    # HeldOut model: split 50/50 into training and held-out sets.
    cuttingHeldOutIndex = int(round(len(words) * 0.5))
    heldOutTrainingSet, heldOutSet = words[:cuttingHeldOutIndex], words[cuttingHeldOutIndex:]
    heldOutTrainingWordSet, heldOutWordSet = WordSet(heldOutTrainingSet, vocabularySize), WordSet(heldOutSet, vocabularySize)
    heldOut = HeldOutWordSet(heldOutTrainingWordSet, heldOutWordSet)
    file.write("Output21: " + str(len(heldOutTrainingSet)) + "\n")
    file.write("Output22: " + str(len(heldOutSet)) + "\n")
    file.write("Output23: " + str(heldOut.pHeldOut(inputWord)) + "\n")
    file.write("Output24: " + str(heldOut.pHeldOut("unseen-word")) + "\n")
    print "Held Out validation: " + str(heldOut.validateHeldOut(heldOutTrainingWordSet))

    with open(testSetFilename, 'rb') as input_file_test:
        input_file_data_test = input_file_test.read()
    testWords = parse_file_data(input_file_data_test)
    testTrainingSet = WordSet(testWords, vocabularySize)
    file.write("Output25: " + str(len(testWords)) + "\n")

    # TODO: confirm whether the perplexity should be computed on testTrainingSet
    # against the original trainingWordSet that was used to find minLamda.
    lidstonPerplexityVar = lidstonPerplexity(trainingWordSet, testTrainingSet, minlamda)
    heldOutPerplexityVar = heldOutPerplexity(heldOut, testTrainingSet)
    file.write("Output26: " + str(lidstonPerplexityVar) + "\n")
    file.write("Output27: " + str(heldOutPerplexityVar) + "\n")
    file.write("Output28: " + ('L' if lidstonPerplexityVar < heldOutPerplexityVar else 'H') + "\n")
    file.write("Output29:")
    file.write(printTable(heldOut, trainingWordSet, minlamda))
    file.close()
    print "Ended"
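# For reference, a minimal sketch of what pLidstone and lidstonPerplexity
# plausibly compute, assuming the standard Lidstone estimator; the real
# WordSet implementation is not shown, and all names below are illustrative.
import math

def p_lidstone(count, set_size, vocab_size, lam):
    # Lidstone estimate: add lam to every count and renormalize over the
    # full vocabulary (here vocab_size = 300000).
    return (count + lam) / float(set_size + lam * vocab_size)

def lidstone_perplexity(counts, train_size, vocab_size, lam, test_tokens):
    # Perplexity = 2 ** (-average log2 probability over the test tokens).
    log_sum = 0.0
    for w in test_tokens:
        p = p_lidstone(counts.get(w, 0), train_size, vocab_size, lam)
        log_sum += math.log(p, 2)
    return 2 ** (-log_sum / len(test_tokens))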
class FrequencyStrategy:
    """Guesses letters based on letter frequency across the possible words."""

    #---
    # CLASS CONSTANTS
    #---

    # don't pre-seed cache for word sets smaller than this
    SEED_MIN = 10

    #-----------------------------------------------------------------------------
    # ctor
    #-----------------------------------------------------------------------------
    def __init__(self, filepath):
        """Initialize FrequencyStrategy"""
        self.possible = WordSet()

        # Cache of possible words given current game state.
        # Initially just all the possible words, divided out based on length.
        # As games progress, more WordSets-possible-given-game-state-X are added.
        self.wordCache = collections.defaultdict(WordSet)

        # process dictionary file
        self.parseWordsFile(filepath)
        self.seedCache()

        self.newGame()

    #-----------------------------------------------------------------------------
    # pick a strategy
    #-----------------------------------------------------------------------------
    def nextGuess(self, game):
        """Updates possible words based on previous guess, then decides whether
        to guess a word or a letter, and returns the GuessWord/GuessLetter."""
        with util.Timer() as upTime:
            self.updatePossibleWords(game)
        util.DBG("Update took %.09f sec." % upTime.interval, TIMING)

        # pick a strategy
        # if we can guess all the possible words and not lose, go for it.
        if len(self.possible) <= game.numWrongGuessesRemaining():
            return GuessWord(self.wordStrategy(game))
        else:
            # Pick a letter.
            # Any letter.
            # Not that letter.
            return GuessLetter(self.letterStrategy(self.possible,
                                                   game.getAllGuessedLetters(),
                                                   game.numWrongGuessesRemaining()))

    #-----------------------------------------------------------------------------
    # pick-a-word strategy
    #-----------------------------------------------------------------------------
    def wordStrategy(self, game):
        """Guess a word, based on possible words
        return - the word to be guessed (string)"""
        # sort the word set for stable word scores
        for word in sorted(self.possible.words):
            if word not in game.getIncorrectlyGuessedWords():
                util.DBG("GUESS: " + word, DEBUG)
                return word

    #-----------------------------------------------------------------------------
    # pick-a-letter strategy
    #-----------------------------------------------------------------------------
    def letterStrategy(self, wordSet, letterSet, guessesLeft):
        """Guess a letter, based on letter frequency in possible words
        return - the letter to be guessed (string)"""
        # Pick the first letter that hasn't been guessed.
        # Sort letterFreq for stable word scores by ensuring that 11 a's always
        # get guessed after 11 b's. Turns out a-z gets worse scores than z-a
        # in the test dictionary.
        for letter, _ in sorted(wordSet.letterFreq.most_common(),
                                key=lambda lc: (lc[1], lc[0]),
                                reverse=True):  # z-a
                                #key=lambda lc: (-lc[1], lc[0])):  # a-z
            if letter not in letterSet:
                util.DBG("GUESS: " + letter, DEBUG)
                return letter

    #-----------------------------------------------------------------------------
    # find possibilities
    #-----------------------------------------------------------------------------
    def updatePossibleWords(self, game):
        """Uses the dictionary and current guess to narrow down the possible words.
        return - Nothing. Updates self vars."""
        # check the cache before doing any work
        if self.key(game) in self.wordCache:
            # It's there. Use it.
            self.possible = self.wordCache[self.key(game)].copy()
            return

        # Set to use in the regex. Either any caps letter,
        # or not the incorrect letters.
        wrongLetters = game.getIncorrectlyGuessedLetters()
        if wrongLetters:
            notWrongLetters = "[^" + "".join(wrongLetters) + "]{"
        else:
            notWrongLetters = "[A-Z]{"

        # turn guessedSoFar into a regex using notWrongLetters
        current = re.compile("(" + HangmanGame.MYSTERY_LETTER + "+|[A-Z]+)").findall(game.getGuessedSoFar())
        for i in range(len(current)):
            if current[i][0] == HangmanGame.MYSTERY_LETTER:
                current[i] = notWrongLetters + str(len(current[i])) + "}"

        # match() only matches at start of string so add a final "$" to make sure
        # it's a whole match and we're not saying "c-t" matches "catapult"
        current.append("$")
        guessRegex = re.compile("".join(current))

        # need a (different) set to iterate over whilst I remove words
        # from the possible set
        tempPossibles = self.possible.words.copy()

        # test each word in the possibilities set
        for word in tempPossibles:
            # purge words that can't match current guess
            if guessRegex.match(word) is None:
                self.possible.words.remove(word)

        # inform the WordSet that it's been updated
        self.possible.updated()

        # cache results
        self.cache(game)

    #-----------------------------------------------------------------------------
    # The cache is locked, apparently.
    #-----------------------------------------------------------------------------
    def key(self, game):
        """Returns the key to be used for the words cache."""
        # make dict key
        return "".join(list(game.getGuessedSoFar())
                       + ['!']
                       + sorted(game.getIncorrectlyGuessedLetters()))

    #-----------------------------------------------------------------------------
    # Caching!
    #-----------------------------------------------------------------------------
    def cache(self, game):
        """Saves current self.possible to cache."""
        # save to dict
        self.wordCache[self.key(game)] = self.possible.copy()

    #-----------------------------------------------------------------------------
    # reset for new game
    #-----------------------------------------------------------------------------
    def newGame(self):
        """Resets class variables to be ready for another game."""
        self.firstRun = True

    #-----------------------------------------------------------------------------
    # Read words file
    #-----------------------------------------------------------------------------
    def parseWordsFile(self, filepath):
        """Reads the dictionary and places each word into the words cache.
        Dictionary file must be one word per line. Does not verify the words
        (i.e. does not check that they've only got letters in them).
        exception - IOError if file can't be found/opened/read"""
        with open(filepath, 'r') as dictionary:
            util.DBG(util.pretty_size(os.path.getsize(filepath)), DEBUG)

            # read words file into set
            for line in dictionary:
                word = line.strip()
                if word != "":  # avoid empty lines
                    key = HangmanGame.MYSTERY_LETTER * len(word) + "!"
                    self.wordCache[key].words.add(word.upper())

        # Everything read in. Generate the WordSets' letter freqs.
        for k in self.wordCache:
            self.wordCache[k].updated()

    #-----------------------------------------------------------------------------
    # seed cache
    #-----------------------------------------------------------------------------
    def seedCache(self):
        """Pre-compute misses for common letters."""
        emptySet = set()  # used to get letterStrategy's first guess

        # can't add to what we're iterating over, so iterate a copy
        temp = self.wordCache.copy()
        for k in temp:
            # don't bother for the sets that are tiny
            if len(self.wordCache[k]) > self.SEED_MIN:
                # determine first guess letter
                letter = self.letterStrategy(self.wordCache[k], emptySet, 1000)

                # weed down to just failures
                noLetter = self.wordCache[k].copy()
                for word in self.wordCache[k].words:
                    if letter in word:
                        noLetter.words.remove(word)
                noLetter.updated()

                # save to cache with new key
                # (k is already MYSTERY_LETTER * word length + "!")
                key = k + letter
                self.wordCache[key] = noLetter
                util.DBG("pre-cached: " + key, DEBUG)
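# A minimal sketch of how FrequencyStrategy might drive a game. Only
# nextGuess() and the accessors used above appear in the source; the driver
# below and guess.makeGuess(game) are hypothetical names for illustration.
def play_one_game(strategy, game):
    strategy.newGame()
    # keep guessing while there are unrevealed letters and wrong guesses left
    while (game.numWrongGuessesRemaining() > 0
           and HangmanGame.MYSTERY_LETTER in game.getGuessedSoFar()):
        guess = strategy.nextGuess(game)  # GuessLetter or GuessWord
        guess.makeGuess(game)             # hypothetical: apply guess to game
    return game.getGuessedSoFar()

strategy = FrequencyStrategy("words.txt")  # dictionary file, one word per line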
def generateOutputFile(developmentSetFilename, testSetFilename, firstInputWord,
                       secondInputWord, outputFilename):
    print "Started with: "
    print "\tDevelopment set filename: %s" % developmentSetFilename
    print "\tTest set filename: %s" % testSetFilename
    print "\tInput word: %s" % firstInputWord
    print "\tInput word2: %s" % secondInputWord
    print "\tOutput filename: %s" % outputFilename

    vocabularySize = 300000
    file = open(outputFilename, "w+")
    file.write("#Students:\tSaar Arbel\tBoaz Berman\t315681775\t311504401\n")
    file.write("Output1: " + developmentSetFilename + "\n")
    file.write("Output2: " + testSetFilename + "\n")
    file.write("Output3: " + firstInputWord + " " + secondInputWord + "\n")
    file.write("Output4: " + outputFilename + "\n")
    file.write("Output5: " + str(vocabularySize) + "\n")

    with open(developmentSetFilename, 'rb') as input_file:
        input_file_data = input_file.read()
    words = parse_file_data(input_file_data)

    # Split 90/10 into training and validation sets.
    cuttingIndex = int(round(len(words) * 0.9))
    trainingSet, validationSet = words[:cuttingIndex], words[cuttingIndex:]
    trainingWordSet, validationWordSet = WordSet(trainingSet, vocabularySize), WordSet(validationSet, vocabularySize)

    file.write("Output6: " + str(len(words)) + "\n")
    file.write("Output7: " + str(validationWordSet.length) + "\n")
    file.write("Output8: " + str(trainingWordSet.length) + "\n")
    file.write("Output9: " + str(trainingWordSet.distinctLength) + "\n")
    file.write("Output10: " + str(trainingWordSet.countAppearances(firstInputWord)) + "\n")

    trainingBigramWordSet = BigramWordSet(trainingSet, vocabularySize, trainingWordSet)
    file.write("Output11: " + str(trainingBigramWordSet.countAppearances(firstInputWord, secondInputWord)) + "\n")

    validationBigramWordSet = BigramWordSet(validationSet, vocabularySize, validationWordSet)
    backOffTrainingModel = BackOffModel(trainingBigramWordSet, trainingWordSet)
    backOffValidationModel = BackOffModel(validationBigramWordSet, validationWordSet)

    print str(backOffTrainingModel.bigramWordSet.pLidstone(("bank", "economist"), 0.001)) + " boaz"
    print backOffTrainingModel.pBackOff("bank", "economist", 0.1)
    print "Debug %f" % backOffTrainingModel.debug()

    file.write('Output12: ' + str(backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.0001)) + "\n")
    print "finished 12"
    file.write('Output13: ' + str(backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.001)) + "\n")
    print "finished 13"
    file.write('Output14: ' + str(backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.1)) + "\n")
    print "finished 14"

    minperplexity, minlamda = minimumPerplexity(backOffTrainingModel, backOffValidationModel)
    file.write('Output15: ' + str(minlamda) + "\n")
    print "finished 15"
    file.write('Output16: ' + str(minperplexity) + "\n")
    print "finished 16"

    with open(testSetFilename, 'rb') as input_file2:
        input_file_data2 = input_file2.read()
    words2 = parse_file_data(input_file_data2)
    trainingWordSet2 = WordSet(words2, vocabularySize)
    trainingBigramWordSet2 = BigramWordSet(words2, vocabularySize, trainingWordSet2)
    backOffTrainingModel2 = BackOffModel(trainingBigramWordSet2, trainingWordSet2)

    file.write('Output17: ' + str(backOffPerplexity(backOffTrainingModel, backOffTrainingModel2, 0.0003)) + "\n")
    print "finished 17"
    file.write('Output18: ' + str(printTable(backOffTrainingModel, 0.001, firstInputWord)))
    file.close()
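# The BackOffModel implementation is not shown. As a reference point, here is
# a minimal sketch of a back-off bigram estimate in the spirit of pBackOff,
# built from Lidstone-smoothed components; it is a hypothetical reconstruction
# (the real model's discounting/normalization may differ) and every name below
# is illustrative.
def p_backoff(w1, w2, bigram_counts, unigram_counts, n_tokens, vocab_size, lam):
    def p_unigram(w):
        # Lidstone-smoothed unigram probability.
        return (unigram_counts.get(w, 0) + lam) / float(n_tokens + lam * vocab_size)

    c1 = unigram_counts.get(w1, 0)
    c12 = bigram_counts.get((w1, w2), 0)
    if c12 > 0 and c1 > 0:
        # Lidstone-smoothed conditional probability for a seen bigram.
        return (c12 + lam) / float(c1 + lam * vocab_size)
    # Unseen bigram: back off to the unigram estimate. A fully normalized
    # Katz back-off would also scale this term by a left-over-mass weight
    # alpha(w1); that is omitted here for brevity.
    return p_unigram(w2)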