Beispiel #1
0
def makeWordFreqList(filename):
    """Count how often each non-stop-word token occurs in a corpus file.

    The file is split into sentences with ``sensplit.sen_splitter`` and each
    sentence into whitespace-separated tokens. Every token is lowercased and
    passed through ``removePunctuation``. Tokens flagged by ``isStopWord``,
    or that are empty / whitespace-only after cleaning, are skipped.

    Returns a dict mapping each remaining token to its occurrence count.
    """
    counts = {}

    for sentence in sensplit.sen_splitter(filename):
        for raw_word in sentence.split():
            # Normalise first so the stop-word check and the dict key both
            # see the lowercased, cleaned form of the token.
            word = removePunctuation(raw_word.lower())

            # Guard clause: drop stop words and tokens left blank by cleaning.
            if isStopWord(word) or not word.strip():
                continue

            counts[word] = counts.get(word, 0) + 1

    return counts
Beispiel #2
0
def makeWordFreqList(filename):
    """Build a token -> occurrence-count dict for the given corpus file.

    Sentences come from ``sensplit.sen_splitter(filename)``; each one is split
    on whitespace. A token is counted under its lowercased form after
    ``removePunctuation`` has been applied, provided ``isStopWord`` does not
    flag it and it is not blank once cleaned.
    """
    frequencies = {}
    sentences = sensplit.sen_splitter(filename)

    for sentence in sentences:
        for word in sentence.split():
            # Lowercase before cleaning so comparisons are case-insensitive.
            cleaned = removePunctuation(word.lower())

            # Keep only non-stop-words that still have visible content.
            worth_counting = (not isStopWord(cleaned)
                              and len(cleaned.strip()) > 0)
            if worth_counting:
                frequencies[cleaned] = frequencies.get(cleaned, 0) + 1

    return frequencies
Beispiel #3
0
def makeBigramFreqList(filename):
    """Build a bigram frequency table for a corpus file and smooth it.

    Bigrams are formed between consecutive kept tokens of the same sentence
    (they never cross a sentence boundary). A token is kept when, after
    lowercasing and ``removePunctuation``, it is not flagged by ``isStopWord``
    and is not blank. Every ordered pair of words from ``makeWordFreqList`` is
    seeded with a count of 0 so that unseen bigrams take part in smoothing.

    The raw counts are then replaced using the Good-Turing discount formula

        c* = (c + 1) * N(c + 1) / N(c)

    where N(c) is read from ``queryBigramStats``. When N(c) or N(c + 1) is 0
    the adjusted value for c becomes 0, so very small corpora lose most of
    their counts.
    Source: http://www.ee.ucla.edu/~weichu/htkbook/node214_mn.html

    Returns an empty list when the sentence splitter yields no sentences,
    otherwise a dict mapping "word1 word2" to its adjusted frequency.
    """
    sentences = sensplit.sen_splitter(filename)

    # Nothing to do when the splitter produced no sentences (the original
    # author notes this is what happens when the file is not found).
    if len(sentences) == 0:
        return []

    vocabulary = makeWordFreqList(filename)

    # Seed every ordered word pair with zero; unseen pairs are smoothed later.
    bigram_counts = {
        first + " " + second: 0
        for first in vocabulary
        for second in vocabulary
    }

    for sentence in sentences:
        # Last kept token of this sentence; None until the first one is seen.
        previous = None

        for raw_word in sentence.split():
            word = removePunctuation(raw_word.lower())

            # Stop words and blank tokens neither start nor end a bigram.
            if isStopWord(word) or not word.strip():
                continue

            if previous is not None:
                key = previous + ' ' + word
                bigram_counts[key] = bigram_counts.get(key, 0) + 1

            # The current token becomes the first word of the next bigram.
            previous = word

    # Per the formula above, this is used as N(c) indexed by raw count c.
    # NOTE(review): exact return shape of queryBigramStats is not visible here.
    count_of_counts = queryBigramStats(bigram_counts)

    # Start from a copy so the original buckets stay intact while adjusting.
    adjusted = copy.deepcopy(count_of_counts)

    # Stop one short of the end because each step reads bucket c + 1.
    # NOTE(review): the final slot is therefore never adjusted and keeps the
    # value copied from queryBigramStats - confirm this is intended.
    for c in range(len(count_of_counts) - 1):
        adjusted[c] = (
            float((c + 1) * count_of_counts[c + 1]) / count_of_counts[c]
            if count_of_counts[c] != 0  # avoid division by zero
            else 0
        )

    # Swap each raw count for its adjusted value; the raw count is the index.
    for bigram in bigram_counts:
        bigram_counts[bigram] = adjusted[bigram_counts[bigram]]

    return bigram_counts
Beispiel #4
0
def makeBigramFreqList(filename):
    """Return Good-Turing-adjusted bigram frequencies for a corpus file.

    Steps:
      1. Split the file into sentences via ``sensplit.sen_splitter``; if there
         are none, return an empty list.
      2. Seed a table with every ordered pair of words known to
         ``makeWordFreqList``, each at count 0.
      3. For each sentence, lowercase and clean every token, drop stop words
         and blank tokens, and count each pair of neighbouring kept tokens
         under the key "word1 word2". Pairs never span two sentences.
      4. Replace every raw count c with
         c* = (c + 1) * N(c + 1) / N(c), using the table returned by
         ``queryBigramStats`` as N. If N(c) is 0, c* is set to 0; if
         N(c + 1) is 0 the formula itself yields 0, which is why tiny data
         sets smooth badly.
         Source: http://www.ee.ucla.edu/~weichu/htkbook/node214_mn.html

    Returns a dict mapping each bigram key to its adjusted frequency (or an
    empty list, see step 1).
    """
    frequency_table = {}

    corpus_sentences = sensplit.sen_splitter(filename)
    if len(corpus_sentences) == 0:
        # No sentences (e.g. missing file, per the original author's note).
        return []

    known_words = makeWordFreqList(filename)

    # Every ordered combination starts at zero so unseen bigrams exist too.
    for left_word in known_words:
        for right_word in known_words:
            frequency_table[" ".join((left_word, right_word))] = 0

    for sentence in corpus_sentences:
        # First pass: gather the tokens of this sentence that survive
        # cleaning and stop-word filtering, in order.
        kept_tokens = []
        for raw_token in sentence.split():
            cleaned = removePunctuation(raw_token.lower())
            if not isStopWord(cleaned) and len(cleaned.strip()) > 0:
                kept_tokens.append(cleaned)

        # Second pass: each kept token pairs with the kept token after it.
        for left_word, right_word in zip(kept_tokens, kept_tokens[1:]):
            pair_key = left_word + ' ' + right_word
            frequency_table[pair_key] = frequency_table.get(pair_key, 0) + 1

    # Used below as N(c): looked up by raw count c.
    # NOTE(review): queryBigramStats is defined elsewhere - confirm its shape.
    stats = queryBigramStats(frequency_table)

    # Work on a copy so reads of stats[c] / stats[c + 1] see original values.
    smoothed = copy.deepcopy(stats)

    # Range ends at len - 1 because the formula looks one bucket ahead.
    # NOTE(review): this leaves the last entry of `smoothed` equal to the
    # copied stats value rather than an adjusted count - verify intent.
    last_adjustable = len(stats) - 1
    for c in range(0, last_adjustable):
        if stats[c] == 0:
            # Guard against division by zero.
            smoothed[c] = 0
            continue
        smoothed[c] = float((c + 1) * stats[c + 1]) / stats[c]

    # The old (raw) count doubles as the index into the smoothed table.
    for bigram_key in frequency_table:
        raw_count = frequency_table[bigram_key]
        frequency_table[bigram_key] = smoothed[raw_count]

    return frequency_table