Example #1
    def __init__(self, filepath):
        """Initialize FrequencyStrategy"""

        # Words still possible in the current game.
        self.possible = WordSet()

        # Cache of possible words given current game state.
        # Initially just all the possible words, divided out based on length.
        # As games progress, a WordSet of the words still possible under each
        # observed game state is added.
        self.wordCache = collections.defaultdict(WordSet)

        # process dictionary file
        self.parseWordsFile(filepath)

        # Pre-populate the cache with the initial by-length WordSets.
        self.seedCache()

        # Start a fresh game.
        self.newGame()
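
A minimal construction sketch; the dictionary path here is hypothetical, and the class assumes the collections module is imported at module level:

strategy = FrequencyStrategy("dictionary.txt")  # parses the file, seeds the cache, starts a game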
Example #2
    def __init__(self, streets_file, towns_file, country_file):
        super(AddressDetector, self).__init__('Detect addresses')
        # Lookup tables used during detection: street and town names are
        # held in WLTStorage, country names in a plain WordSet.
        self.streets = WLTStorage(streets_file)
        self.towns = WLTStorage(towns_file)
        self.countries = WordSet(country_file)
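
A hedged construction sketch; the three word-list paths are hypothetical:

detector = AddressDetector("streets.txt", "towns.txt", "countries.txt")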
Example #3
import sqlite3 as sql           # assumed: sql is the sqlite3 module (UselessWords.db below)
from string import punctuation  # assumed source of the punctuation characters


# Strips trailing punctuation from a word.
#   word: The word being modified.
#   punctuation: The punctuation characters to remove from the end of the word.
def removePunctuation(word, punctuation):
    # Strip every trailing punctuation character; empty strings are safe.
    while word and word[-1] in punctuation:
        word = word[:-1]
    return word


# Main

# st = LancasterStemmer()
con = sql.connect("UselessWords.db")
cur = con.cursor()
# Word_List stores words from the input paragraph.
Word_List = WordSet()
paragraph = input("Enter the text to be summarized: ")
# tokens: list of words in the input paragraph.
tokens = paragraph.split()
# Remove formatting from each word, then add it to Word_List.
for item in tokens:
    item = item.strip()
    item = removePunctuation(item, punctuation)
    # item = st.stem(item)
    if not useless(cur, item):
        Word_List.add(Word(item, None, 1))
# output is the Word object with the highest count.
output = Word_List.max()
print(output.word)
con.close()
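
A quick sanity check of removePunctuation as defined above, reusing string.punctuation as the character set:

print(removePunctuation("ends!?.", punctuation))  # -> "ends"
print(removePunctuation("", punctuation))         # an empty string stays empty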
Example #4
    def __init__(self, ignore_list):
        super(AbbrDetector, self).__init__('Detect abbreviations')
        # Words to skip, lower-cased on load via the filt callable.
        self.ignored_words = WordSet(ignore_list, filt=lambda x: x.lower())
        # Cache built up while detecting abbreviations.
        self.cache = set()
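
A hedged construction sketch; the ignore list is illustrative (entries are lower-cased by the filt callable shown above):

detector = AbbrDetector(["Dr", "Mr", "etc"])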
Example #5
def generateOutputFile(developmentSetFilename, testSetFilename, firstInputWord,
                       secondInputWord, outputFilename):
    print "Started with: "
    print "\tDevelopment set filename: %s" % developmentSetFilename
    print "\tTest set filename: %s" % testSetFilename
    print "\tInput word: %s" % firstInputWord
    print "\tInput word2: %s" % secondInputWord
    print "\tOutput filename: %s" % outputFilename
    vocabularySize = 300000

    file = open(outputFilename, "w+")
    file.write("#Students:\tSaar Arbel\tBoaz Berman\t315681775\t311504401\n")
    file.write("Output1: " + developmentSetFilename + "\n")
    file.write("Output2: " + testSetFilename + "\n")
    file.write("Output3: " + firstInputWord + " " + secondInputWord + "\n")
    file.write("Output4: " + outputFilename + "\n")
    file.write("Output5: " + str(vocabularySize) + "\n")

    with open(developmentSetFilename, 'rb') as input_file:
        input_file_data = input_file.read()
    words = parse_file_data(input_file_data)

    # Hold out the last 10% of the development data for validation.
    cuttingIndex = int(round(len(words) * 0.9))
    trainingSet, validationSet = words[:cuttingIndex], words[cuttingIndex:]
    trainingWordSet, validationWordSet = WordSet(
        trainingSet, vocabularySize), WordSet(validationSet, vocabularySize)
    file.write("Output6: " + str(len(words)) + "\n")
    file.write("Output7: " + str(validationWordSet.length) + "\n")
    file.write("Output8: " + str(trainingWordSet.length) + "\n")
    file.write("Output9: " + str(trainingWordSet.distinctLength) + "\n")
    file.write("Output10: " +
               str(trainingWordSet.countAppearances(firstInputWord)) + "\n")

    # Bigram counts are built over the same training data and vocabulary.
    trainingBigramWordSet = BigramWordSet(trainingSet, vocabularySize,
                                          trainingWordSet)
    file.write("Output11: " + str(
        trainingBigramWordSet.countAppearances(firstInputWord,
                                               secondInputWord)) + "\n")

    validationBigramWordSet = BigramWordSet(validationSet, vocabularySize,
                                            validationWordSet)
    # Back-off language models for the training and validation sets.
    backOffTrainingModel = BackOffModel(trainingBigramWordSet, trainingWordSet)
    backOffValidationModel = BackOffModel(validationBigramWordSet,
                                          validationWordSet)

    # Debug prints.
    print str(
        backOffTrainingModel.bigramWordSet.pLidstone(
            ("bank", "economist"), 0.001)) + " boaz"
    print backOffTrainingModel.pBackOff("bank", "economist", 0.1)
    print "Debug %f" % backOffTrainingModel.debug()

    file.write('Output12: ' + str(
        backOffPerplexity(backOffTrainingModel, backOffValidationModel,
                          0.0001)) + "\n")
    print "finished 12"
    file.write('Output13: ' + str(
        backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.001))
               + "\n")
    print "finished 13"
    file.write('Output14: ' + str(
        backOffPerplexity(backOffTrainingModel, backOffValidationModel, 0.1)) +
               "\n")
    print "finished 14"
    minPerplexity, minLambda = minimumPerplexity(backOffTrainingModel,
                                                 backOffValidationModel)
    file.write('Output15: ' + str(minLambda) + "\n")
    print "finished 15"
    file.write('Output16: ' + str(minPerplexity) + "\n")
    print "finished 16"

    # Build matching models over the held-out test set.
    with open(testSetFilename, 'rb') as input_file2:
        input_file_data2 = input_file2.read()
    words2 = parse_file_data(input_file_data2)
    testWordSet = WordSet(words2, vocabularySize)
    testBigramWordSet = BigramWordSet(words2, vocabularySize, testWordSet)
    backOffTestModel = BackOffModel(testBigramWordSet, testWordSet)

    file.write('Output17: ' + str(
        backOffPerplexity(backOffTrainingModel, backOffTestModel, 0.0003)) +
               "\n")
    print "finished 17"

    file.write('Output18: ' +
               str(printTable(backOffTrainingModel, 0.001, firstInputWord)))
    file.close()
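
A hedged invocation sketch; the file names are hypothetical, while the probe words mirror the ("bank", "economist") pair used in the debug prints above:

generateOutputFile("develop.txt", "test.txt", "bank", "economist", "results.txt")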