Example #1
0
    def __init__(self, filepath):
        """Initialize FrequencyStrategy.

        filepath - path to the dictionary file handed to parseWordsFile.
        """

        # Words still considered possible for the game in progress.
        self.possible = WordSet()

        # Cache of possible words given current game state.
        # Initially just all the possible words, divided out based on length.
        # As games progress, more WordSets-possible-given-game-state-X are added.
        self.wordCache = collections.defaultdict(WordSet)

        # process dictionary file
        self.parseWordsFile(filepath)

        # Pre-populate the cache before the first game starts.
        self.seedCache()

        self.newGame()
Example #2
0
 def __init__(self, streets_file, towns_file, country_file):
     """Initialize AddressDetector with its word sources.

     streets_file - source for the street-name WLTStorage.
     towns_file - source for the town-name WLTStorage.
     country_file - source for the country-name WordSet."""
     # Base class is constructed with this detector's display name.
     super(AddressDetector, self).__init__('Detect addresses')
     # NOTE(review): WLTStorage vs. WordSet choice presumably reflects the
     # lookup structure each data set needs — confirm against those classes.
     self.streets = WLTStorage(streets_file)
     self.towns = WLTStorage(towns_file)
     self.countries = WordSet(country_file)
Example #3
0
#   word: The string in which the word being modified is stored.
#   punctuation: Tuple containing punctuation that is to be removed.
def removePunctuation(word, punctuation):
    """Strip punctuation characters from the end of word.

    word - the string to clean.
    punctuation - iterable of single-character strings to remove.
    return - word with all trailing punctuation characters removed."""
    # BUG FIX: the original indexed word[len(word) - 1] without a guard
    # (IndexError on ""), and removed at most one character per punctuation
    # item in iteration order, so e.g. "end.)" with (".", ")") kept the ".".
    # Loop until the last character is clean instead.
    while word and word[-1] in punctuation:
        word = word[:-1]
    return word


#Main

#st = LancasterStemmer()

# Database of words that should be ignored when summarizing.
con = sql.connect("UselessWords.db")
cur = con.cursor()

# Word_List collects the Word objects built from the input paragraph.
Word_List = WordSet()
paragraph = input("Enter the text to be summarized: ")
# tokens: the whitespace-separated words of the input paragraph.
tokens = paragraph.split()

# Clean each token, then record it unless the database marks it useless.
for token in tokens:
    cleaned = removePunctuation(token.strip(), punctuation)
    #cleaned = st.stem(cleaned)
    if not useless(cur, cleaned):
        Word_List.add(Word(cleaned, None, 1))

# output: the Word object with the highest count.
output = Word_List.max()
print(output.word)
con.close()
Example #4
0
 def __init__(self, ignore_list):
     """Initialize AbbrDetector.

     ignore_list - source of words to exclude from abbreviation detection;
     entries are lower-cased on load so later checks can be case-insensitive."""
     # Base class is constructed with this detector's display name.
     super(AbbrDetector, self).__init__('Detect abbreviations')
     # filt lower-cases every word as the WordSet is built.
     self.ignored_words = WordSet(ignore_list, filt=lambda x: x.lower())
     # NOTE(review): appears to memoize already-processed words — confirm
     # against the detector's run/detect method, which is not visible here.
     self.cache = set()
Example #5
0
def generateOutputFile(developmentSetFilename, testSetFilename, inputWord, outputFilename):
    print "Started with:"
    print "\tDevelopment set filename: %s" % developmentSetFilename
    print "\tTest set filename: %s" % testSetFilename
    print "\tInput word: %s" % inputWord
    print "\tOutput filename: %s" % outputFilename

    vocabularySize = 300000

    file = open(outputFilename, "w+")
    file.write("#Students:\tSaar Arbel (315681775), Boaz Berman (311504401)\n")
    file.write("Output1: " + developmentSetFilename + "\n")
    file.write("Output2: " + testSetFilename + "\n")
    file.write("Output3: " + inputWord + "\n")
    file.write("Output4: " + outputFilename + "\n")
    file.write("Output5: " + str(vocabularySize) + "\n")
    file.write("Output6: " + str(calcPuniform(vocabularySize)) + "\n")

    # Lidstone model
    with open(developmentSetFilename, 'rb') as input_file:
        input_file_data = input_file.read()
    words = parse_file_data(input_file_data)

    cuttingIndex = int(round(len(words) * 0.9))
    trainingSet, validationSet = words[:cuttingIndex], words[cuttingIndex:]
    trainingWordSet, validationWordSet = WordSet(trainingSet, vocabularySize), WordSet(validationSet, vocabularySize)

    file.write("Output7: " + str(len(words)) + "\n")
    file.write("Output8: " + str(validationWordSet.length) + "\n")
    file.write("Output9: " + str(trainingWordSet.length) + "\n")
    file.write("Output10: " + str(trainingWordSet.distinctLength) + "\n")
    file.write("Output11: " + str(trainingWordSet.countAppearances(inputWord)) + "\n")
    file.write("Output12: " + str(trainingWordSet.pMaximumLikelihoodEstimate(inputWord)) + "\n")
    file.write("Output13: " + str(trainingWordSet.pMaximumLikelihoodEstimate("unseen-word")) + "\n")

    print "Lidstone validation: " + str(validateLidstone(validationWordSet, 0.1))

    file.write("Output14: " + str(trainingWordSet.pLidstone(inputWord, 0.1)) + "\n")
    file.write("Output15: " + str(trainingWordSet.pLidstone("unseen-word", 0.1)) + "\n")
    file.write("Output16: " + str(lidstonPerplexity(trainingWordSet, validationWordSet, 0.01)) + "\n")
    file.write("Output17: " + str(lidstonPerplexity(trainingWordSet, validationWordSet, 0.1)) + "\n")
    file.write("Output18: " + str(lidstonPerplexity(trainingWordSet, validationWordSet, 1.0)) + "\n")

    minperplexity, minlamda = minimumPerplexityZeroToTwo(trainingWordSet, validationWordSet)

    file.write("Output19: " + str(minlamda) + "\n")
    file.write("Output20: " + str(minperplexity) + "\n")

    # HeldOut model
    cuttingHeldOutIndex = int(round(len(words) * 0.5))
    heldOutTrainingSet, heldOutSet = words[:cuttingHeldOutIndex], words[cuttingHeldOutIndex:]
    heldOutTrainingWordSet, heldOutWordSet = WordSet(heldOutTrainingSet, vocabularySize), WordSet(heldOutSet,
                                                                                                  vocabularySize)
    heldOut = HeldOutWordSet(heldOutTrainingWordSet, heldOutWordSet)
    file.write("Output21: " + str(len(heldOutTrainingSet)) + "\n")
    file.write("Output22: " + str(len(heldOutSet)) + "\n")
    file.write("Output23: " + str(heldOut.pHeldOut(inputWord)) + "\n")
    file.write("Output24: " + str(heldOut.pHeldOut("unseen-word")) + "\n")
    print "Held Out validation: " + str(heldOut.validateHeldOut(heldOutTrainingWordSet))

    with open(testSetFilename, 'rb') as input_file_test:
        input_file_data_test = input_file_test.read()
    testWords = parse_file_data(input_file_data_test)

    testTrainingSet = WordSet(testWords, vocabularySize)
    file.write("Output25: " + str(len(testWords)) + "\n")
    # find out if the perplexity should be done on testTrainingSet with the old trainingWordSet that we
    # calculate with him the minLamda
    lidstonPerplexityVar = lidstonPerplexity(trainingWordSet, testTrainingSet, minlamda)
    heldOutPerplexityVar = heldOutPerplexity(heldOut, testTrainingSet)
    file.write("Output26: " + str(lidstonPerplexityVar) + "\n")
    file.write("Output27: " + str(heldOutPerplexityVar) + "\n")
    file.write("Output28: " + ('L' if lidstonPerplexityVar < heldOutPerplexityVar else 'H') + "\n")
    file.write("Output29:")
    file.write(printTable(heldOut, trainingWordSet , minlamda))

    file.close

    print "Ended"
Example #6
0
class FrequencyStrategy:
    """Guesses the letter occurring in the most still-possible words, or
    guesses whole words once few enough remain to try them all safely."""

    #---
    # CLASS CONSTANTS
    #---

    # don't pre-seed cache for word sets smaller than this
    SEED_MIN = 10

    #-----------------------------------------------------------------------------
    # ctor
    #-----------------------------------------------------------------------------
    def __init__(self, filepath):
        """Initialize FrequencyStrategy.

        filepath - dictionary file, one word per line.
        exception - IOError if the file can't be found/opened/read."""

        # Words still considered possible for the game in progress.
        self.possible = WordSet()

        # Cache of possible words given current game state.
        # Initially just all the possible words, divided out based on length.
        # As games progress, more WordSets-possible-given-game-state-X are added.
        self.wordCache = collections.defaultdict(WordSet)

        # process dictionary file
        self.parseWordsFile(filepath)

        self.seedCache()

        self.newGame()

    #-----------------------------------------------------------------------------
    # pick a strategy
    #-----------------------------------------------------------------------------
    def nextGuess(self, game):
        """Updates possible words based on previous guess, then decides whether
      to guess a word or a letter, and returns the GuessWord/GuessLetter."""

        with util.Timer() as upTime:
            self.updatePossibleWords(game)
        util.DBG("Update took %.09f sec." % upTime.interval, TIMING)

        # pick a strategy
        # if we can guess all the possible words and not lose, go for it.
        if len(self.possible) <= game.numWrongGuessesRemaining():
            return GuessWord(self.wordStrategy(game))
        else:
            # Pick a letter.
            # Any letter.
            # Not that letter.
            return GuessLetter(
                self.letterStrategy(self.possible, game.getAllGuessedLetters(),
                                    game.numWrongGuessesRemaining()))

    #-----------------------------------------------------------------------------
    # pick-a-word strategy
    #-----------------------------------------------------------------------------
    def wordStrategy(self, game):
        """Guess a word, based on possible words
      return - the word to be guessed (string)"""

        # sorted the word set for stable word scores
        for word in sorted(self.possible.words):
            if word not in game.getIncorrectlyGuessedWords():
                util.DBG("GUESS: " + word, DEBUG)
                return word

    #-----------------------------------------------------------------------------
    # pick-a-letter strategy
    #-----------------------------------------------------------------------------
    def letterStrategy(self, wordSet, letterSet, guessesLeft):
        """Guess a letter, based on letter frequency in possible words
      return - the letter to be guessed (string)"""

        # Pick the first letter that hasn't been guessed.
        # Sort letterFreq for stable word scores by ensuring that 11 a's always
        # get guessed after 11 b's. Turns out a-z gets worse scores than z-a
        # in the test dictionary.
        for letter, _ in sorted(wordSet.letterFreq.most_common(),
                                key=lambda lc: (lc[1], lc[0]),
                                reverse=True):  # z-a
            #key=lambda lc: (-lc[1], lc[0])): # a-z
            if letter not in letterSet:
                util.DBG("GUESS: " + letter, DEBUG)
                return letter

    #-----------------------------------------------------------------------------
    # find possibilities
    #-----------------------------------------------------------------------------
    def updatePossibleWords(self, game):
        """Uses the dictionary and current guess to narrow down the possible words.
      return - Nothing. Updates self vars."""

        # check the cache before doing any work
        if self.key(game) in self.wordCache:
            # It's there. Use it.
            self.possible = self.wordCache[self.key(game)].copy()
            return

        # Set to use in the regex. Either any caps letter,
        # or not the incorrect letters.
        wrongLetters = game.getIncorrectlyGuessedLetters()
        if wrongLetters:
            notWrongLetters = "[^" + "".join(wrongLetters) + "]{"
        else:
            notWrongLetters = "[A-Z]{"

        # turn guessedSoFar into a regex using notWrongLetters
        current = re.compile("(" + HangmanGame.MYSTERY_LETTER +
                             "+|[A-Z]+)").findall(game.getGuessedSoFar())
        # Replace each run of mystery letters with a {length}-bounded class.
        for i, chunk in enumerate(current):
            if chunk[0] == HangmanGame.MYSTERY_LETTER:
                current[i] = notWrongLetters + str(len(chunk)) + "}"

        # match() only matches at start of string so add a final "$" to make sure
        # it's a whole match and we're not saying "c-t" matches "catapult"
        current.append("$")
        guessRegex = re.compile("".join(current))

        # need a (different) set to iterate over whist I remove words
        # from the possible set
        tempPossibles = self.possible.words.copy()

        # test each word in the possibilites set
        for word in tempPossibles:
            # purge words that can't match current guess
            # (idiom fix: "is None" instead of "== None")
            if guessRegex.match(word) is None:
                self.possible.words.remove(word)

        # inform the WordSet that it's been updated
        self.possible.updated()

        # cache results
        self.cache(game)

    #-----------------------------------------------------------------------------
    # The cache is locked, apparently.
    #-----------------------------------------------------------------------------
    def key(self, game):
        """Returns the key to be used for the words cache."""

        # make dict key: guessed-so-far + "!" + sorted wrong letters
        return "".join(
            list(game.getGuessedSoFar()) + ['!'] +
            sorted(game.getIncorrectlyGuessedLetters()))

    #-----------------------------------------------------------------------------
    # Caching!
    #-----------------------------------------------------------------------------
    def cache(self, game):
        """Saves current self.possible to cache."""

        # save to dict
        self.wordCache[self.key(game)] = self.possible.copy()

    #-----------------------------------------------------------------------------
    # reset for new game
    #-----------------------------------------------------------------------------
    def newGame(self):
        """Resets class variables to be ready for another game."""

        self.firstRun = True

    #-----------------------------------------------------------------------------
    # Read words file
    #-----------------------------------------------------------------------------
    def parseWordsFile(self, filepath):
        """Reads the dictionary and places each word into the words cache.
      Dictionary file must be one word per line.
      Does not verify the words (i.e. does not check that they've only got 
      letters in them).
      exception - IOError if file can't be found/opened/read"""

        with open(filepath, 'r') as dictionary:
            util.DBG(util.pretty_size(os.path.getsize(filepath)), DEBUG)

            # read words file into set
            for line in dictionary:
                word = line.strip()
                if word != "":  # avoid empty lines
                    # key is one mystery letter per character, plus "!"
                    key = HangmanGame.MYSTERY_LETTER * len(word) + "!"
                    self.wordCache[key].words.add(word.upper())

        # Everything read in. Generate the WordSets' letter freqs.
        for k in self.wordCache:
            self.wordCache[k].updated()

    #-----------------------------------------------------------------------------
    # seed cache
    #-----------------------------------------------------------------------------
    def seedCache(self):
        """Pre-compute misses for common letters."""

        emptySet = set()  # used to get letterStrategy's first guess
        temp = self.wordCache.copy(
        )  # can't add to what we're iterating over, so temp
        for k in temp:
            # don't bother for the sets that are tiny
            if len(self.wordCache[k]) > self.SEED_MIN:
                # determine first guess letter
                letter = self.letterStrategy(self.wordCache[k], emptySet, 1000)

                # weed down to just failures
                noLetter = self.wordCache[k].copy()
                for word in self.wordCache[k].words:
                    if letter in word:
                        noLetter.words.remove(word)
                noLetter.updated()

                # save to cache with new key
                # BUG FIX: was MYSTERY_LETTER * len(word) + "!" + letter, which
                # relied on the leaked loop variable "word"; k is already
                # MYSTERY_LETTER * wordlength + "!", so just append the letter.
                key = k + letter
                self.wordCache[key] = noLetter
                util.DBG("pre-cached: " + key, DEBUG)
Example #7
0
def generateOutputFile(developmentSetFilename, testSetFilename, firstInputWord,
                       secondInputWord, outputFilename):
    print "Started with: "
    print "\tDevelopment set filename: %s" % developmentSetFilename
    print "\tTest set filename: %s" % testSetFilename
    print "\tInput word: %s" % firstInputWord
    print "\tInput word2: %s" % secondInputWord
    print "\tOutput filename: %s" % outputFilename
    vocabularySize = 300000

    file = open(outputFilename, "w+")
    file.write("#Students:\tSaar Arbel\tBoaz Berman\t315681775\t311504401\n")
    file.write("Output1: " + developmentSetFilename + "\n")
    file.write("Output2: " + testSetFilename + "\n")
    file.write("Output3: " + firstInputWord + " " + secondInputWord + "\n")
    file.write("Output4: " + outputFilename + "\n")
    file.write("Output5: " + str(vocabularySize) + "\n")

    with open(developmentSetFilename, 'rb') as input_file:
        input_file_data = input_file.read()
    words = parse_file_data(input_file_data)

    cuttingIndex = int(round(len(words) * 0.9))
    trainingSet, validationSet = words[:cuttingIndex], words[cuttingIndex:]
    trainingWordSet, validationWordSet = WordSet(
        trainingSet, vocabularySize), WordSet(validationSet, vocabularySize)
    file.write("Output6: " + str(len(words)) + "\n")
    file.write("Output7: " + str(validationWordSet.length) + "\n")
    file.write("Output8: " + str(trainingWordSet.length) + "\n")
    file.write("Output9: " + str(trainingWordSet.distinctLength) + "\n")
    file.write("Output10: " +
               str(trainingWordSet.countAppearances(firstInputWord)) + "\n")

    trainingBigramWordSet = BigramWordSet(trainingSet, vocabularySize,
                                          trainingWordSet)
    file.write("Output11: " + str(
        trainingBigramWordSet.countAppearances(firstInputWord,
                                               secondInputWord)) + "\n")

    validationBigramWordSet = BigramWordSet(validationSet, vocabularySize,
                                            validationWordSet)
    backOffTrainingModel = BackOffModel(trainingBigramWordSet, trainingWordSet)
    backOffValidationModel = BackOffModel(validationBigramWordSet,
                                          validationWordSet)

    print str(
        backOffTrainingModel.bigramWordSet.pLidstone(
            ("bank", "economist"), 0.001)) + " boaz"
    print backOffTrainingModel.pBackOff("bank", "economist", 0.1)
    print "Debug %f" % backOffTrainingModel.debug()

    file.write('Output12: ' + str(
        backOffPerplexity(backOffTrainingModel, backOffValidationModel,
                          0.0001)) + "\n")
    print "finished 12"
    file.write('Output13: ' + str(
        backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.001))
               + "\n")
    print "finished 13"
    file.write('Output14: ' + str(
        backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.1)) +
               "\n")
    print "finished 14"
    minperplexity, minlamda = minimumPerplexity(backOffTrainingModel,
                                                backOffValidationModel)
    file.write('Output15: ' + str(minlamda) + "\n")
    print "finished 15"
    file.write('Output16: ' + str(minperplexity) + "\n")
    print "finished 16"

    with open(testSetFilename, 'rb') as input_file2:
        input_file_data2 = input_file2.read()
    words2 = parse_file_data(input_file_data2)
    trainingWordSet2 = WordSet(words2, vocabularySize)
    trainingBigramWordSet2 = BigramWordSet(words2, vocabularySize,
                                           trainingWordSet2)
    backOffTrainingModel2 = BackOffModel(trainingBigramWordSet2,
                                         trainingWordSet2)

    file.write('Output17: ' + str(
        backOffPerplexity(backOffTrainingModel, backOffTrainingModel2, 0.0003))
               + "\n")
    print "finished 17"

    file.write('Output18: ' +
               str(printTable(backOffTrainingModel, 0.001, firstInputWord)))
Example #8
0
def generateOutputFile(developmentSetFilename, testSetFilename, firstInputWord, secondInputWord, outputFilename):
    print "Started with: "
    print "\tDevelopment set filename: %s" % developmentSetFilename
    print "\tTest set filename: %s" % testSetFilename
    print "\tInput word: %s" % firstInputWord
    print "\tInput word2: %s" % secondInputWord
    print "\tOutput filename: %s" % outputFilename
    vocabularySize = 300000

    file = open(outputFilename, "w+")
    file.write("#Students:\tSaar Arbel\tBoaz Berman\t315681775\t311504401\n")
    file.write("Output1: " + developmentSetFilename + "\n")
    file.write("Output2: " + testSetFilename + "\n")
    file.write("Output3: " + firstInputWord + " " + secondInputWord + "\n")
    file.write("Output4: " + outputFilename + "\n")
    file.write("Output5: " + str(vocabularySize) + "\n")

    with open(developmentSetFilename, 'rb') as input_file:
        input_file_data = input_file.read()
    words = parse_file_data(input_file_data)

    cuttingIndex = int(round(len(words) * 0.9))
    trainingSet, validationSet = words[:cuttingIndex], words[cuttingIndex:]
    trainingWordSet, validationWordSet = WordSet(trainingSet, vocabularySize), WordSet(validationSet, vocabularySize)
    file.write("Output6: " + str(len(words)) + "\n")
    file.write("Output7: " + str(validationWordSet.length) + "\n")
    file.write("Output8: " + str(trainingWordSet.length) + "\n")
    file.write("Output9: " + str(trainingWordSet.distinctLength) + "\n")
    file.write("Output10: " + str(trainingWordSet.countAppearances(firstInputWord)) + "\n")

    trainingBigramWordSet = BigramWordSet(trainingSet, vocabularySize, trainingWordSet)
    file.write("Output11: " + str(trainingBigramWordSet.countAppearances(firstInputWord, secondInputWord)) + "\n")

    validationBigramWordSet = BigramWordSet(validationSet, vocabularySize, validationWordSet)
    backOffTrainingModel = BackOffModel(trainingBigramWordSet,trainingWordSet)
    backOffValidationModel = BackOffModel(validationBigramWordSet, validationWordSet)

    print str(backOffTrainingModel.bigramWordSet.pLidstone(("bank", "economist"), 0.001)) + " boaz"
    print backOffTrainingModel.pBackOff("bank", "economist",0.1)
    print "Debug %f" % backOffTrainingModel.debug()

    file.write('Output12: ' + str(backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.0001)) + "\n")
    print "finished 12"
    file.write('Output13: ' + str(backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.001)) + "\n")
    print "finished 13"
    file.write('Output14: ' + str(backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.1)) + "\n")
    print "finished 14"
    minperplexity, minlamda = minimumPerplexity(backOffTrainingModel, backOffValidationModel)
    file.write('Output15: ' + str(minlamda) + "\n")
    print "finished 15"
    file.write('Output16: ' + str(minperplexity) + "\n")
    print "finished 16"

    with open(testSetFilename, 'rb') as input_file2:
        input_file_data2 = input_file2.read()
    words2 = parse_file_data(input_file_data2)
    trainingWordSet2 = WordSet(words2,vocabularySize)
    trainingBigramWordSet2 = BigramWordSet(words2, vocabularySize, trainingWordSet2)
    backOffTrainingModel2 = BackOffModel(trainingBigramWordSet2,trainingWordSet2)

    file.write('Output17: ' + str(backOffPerplexity(backOffTrainingModel, backOffTrainingModel2, 0.0003)) + "\n")
    print "finished 17"

    file.write('Output18: ' + str(printTable(backOffTrainingModel,0.001,firstInputWord)))