def main():
    """Compute and output tf-idf scores for words across a set of subreddit corpora.

    Pipeline: load regionalism data, read the subreddit list, build one word
    frequency distribution per subreddit, prune words that are globally rare,
    compute an IDF score for each surviving word, then emit per-subreddit
    tf-idf results via outputResults().
    """
    print("doing stuff")
    readRegionalisms()

    # Materialize the subreddit set as a list: sets aren't in a stable
    # ordering, and calculateIDF expects just a freqdist, not a
    # (sub name, freqdist) tuple — so we keep names and dists in two
    # parallel, order-aligned sequences.
    subreddits = list(readSubredditSet())

    # One frequency distribution per subreddit, same order as `subreddits`.
    # TODO: figure out how to also do bigrams.
    frequencies = [getFrequency(name) for name in subreddits]

    # Fold the per-subreddit distributions into one global distribution.
    totalfreq = FreqDist()
    for freqdist in frequencies:
        totalfreq = freqdist + totalfreq

    # Candidate vocabulary: every word seen in any corpus.
    num_corpora = len(frequencies)
    all_words = set(totalfreq.keys())

    # Remove all words that occur, on average, less than once per corpus
    # (total count below the number of corpora). Based on
    # http://www.nltk.org/_modules/nltk/probability.html#FreqDist.hapaxes
    all_words -= {word for word in totalfreq.keys() if totalfreq[word] < num_corpora}

    # Report how many words the pruning dropped: sum the number of words
    # whose total frequency r fell in [0, num_corpora).
    rnrdict = totalfreq.r_Nr()
    numremoved = 0
    for r in range(num_corpora):
        numremoved += rnrdict.get(r, 0)
    print(f"removed {numremoved} words from the set of words processed due to low frequency.")
    del rnrdict, numremoved

    # IDF score for every surviving word.
    idfdict = {
        word: calculateIDF(frequencies, word, len(subreddits))
        for word in all_words
    }

    # Frequencies live in the freqdists, each word's idf in idfdict —
    # compute per-subreddit tf-idf and stream each result straight out.
    for name, freqdist in zip(subreddits, frequencies):
        outputResults((name, calcTfidf(freqdist, all_words, idfdict)))