# Example #1
# 0
def main():
    """Compute per-subreddit tf-idf scores and emit the results.

    Pipeline: load the regionalisms and the subreddit list, build one word
    frequency distribution per subreddit, discard words whose total count
    across all corpora is below the number of corpora (i.e. words occurring
    on average less than once per corpus), compute IDF for the surviving
    words, then hand each (subreddit name, tf-idf scores) pair to
    outputResults().
    """
    print("doing stuff")
    readRegionalisms()
    # Sets have no stable ordering, so materialise the subreddit names as a
    # list: frequencies[] below stays index-aligned with toProcess[], and
    # calculateIDF expects bare freqdists, not (name, freqdist) tuples.
    toProcess = list(readSubredditSet())
    # TODO: figure out how to also do bigrams.
    frequencies = [getFrequency(subredditname) for subredditname in toProcess]

    # Combined counts over every corpus.  update() mutates in place, avoiding
    # the quadratic cost of building a fresh FreqDist on every iteration.
    totalfreq = FreqDist()
    for c_freqdist in frequencies:
        totalfreq.update(c_freqdist)

    # Keep only words occurring at least N times in total, i.e. on average at
    # least once per corpus.
    # Based on http://www.nltk.org/_modules/nltk/probability.html#FreqDist.hapaxes
    N = len(frequencies)
    all_words = {word for word, count in totalfreq.items() if count >= N}

    # r_Nr() maps frequency r -> number of words with that frequency, so
    # summing over r in [0, N) counts exactly the words filtered out above.
    rnrdict = totalfreq.r_Nr()
    numremoved = sum(rnrdict.get(r, 0) for r in range(N))
    print("removed " + str(numremoved) +
          " words from the set of words processed due to low frequency.")

    # IDF score for every surviving word; N is the number of corpora.
    idfdict = {word: calculateIDF(frequencies, word, N) for word in all_words}

    # Frequencies live in the per-corpus freqdists, IDF in idfdict; emit the
    # tf-idf results per subreddit.
    for subredditname, c_freqdist in zip(toProcess, frequencies):
        outputResults((subredditname, calcTfidf(c_freqdist, all_words, idfdict)))