def getUniqueWords(subredditname):
    """Return the set of unique (stripped, lowercased) words for a subreddit.

    Results are cached in <datadirectory>/ProcessedData/<name>_words.txt:
    on a cache miss the word list is built from the subreddit's filtered
    text files and written out; on a hit it is simply read back.
    """
    wordfile_path = datadirectory + "/ProcessedData/" + subredditname + "_words" + ".txt"
    set_of_words = set()
    if path.exists(wordfile_path):
        # Cache hit: read the previously written word list, line by line.
        with open(wordfile_path, "r") as wordfile:
            print("reading " + wordfile_path)
            for word in wordfile:
                set_of_words.add(word.strip().lower())
        return set_of_words

    # Cache miss: accumulate frequency data across all text files.
    freq_subreddit = FreqDist()
    for datafile in getTextFileNames(subredditname):
        if path.exists(datafile):
            print("reading " + datafile)
            freq_subreddit = collectFreqData(datafile) + freq_subreddit
        else:
            print("no data for " + datafile)
    for i in freq_subreddit.most_common(20):
        print(i)
    # Normalize first, then write each distinct word exactly once.
    # BUG FIX: the original wrote the normalized form of every raw key,
    # so case variants like "Word" and "word" produced duplicate lines
    # in the cache file.
    for word in freq_subreddit.keys():
        set_of_words.add(word.strip().lower())
    with open(wordfile_path, "a+") as wordfile:
        for word in set_of_words:
            wordfile.write(word + "\n")
    return set_of_words
def main():
    """Download, unpack, and preprocess every subreddit in CurrentSubredditSet."""
    # Initialize module globals before any processing.
    readRegionalisms()
    addStopWords()
    # To select subreddits to process, add their name to the CurrentSubredditSet file.
    toProcess = readSubredditSet()
    for corpusname in toProcess:
        print("doing, " + corpusname)
        downloads_dir = datadirectory + "/DataDownloads"
        processed_dir = datadirectory + "/ProcessedData/" + corpusname
        raw_zip = downloads_dir + "/subreddit-" + corpusname + ".zip"
        corpus_zip = downloads_dir + "/" + corpusname + ".corpus.zip"

        download("subreddit-" + corpusname, data_dir=downloads_dir)

        # Create the per-subreddit output directory on first run.
        if not os.path.exists(processed_dir):
            os.makedirs(processed_dir)

        # Rename the freshly downloaded archive exactly once, so reruns
        # skip straight to extraction.
        if os.path.exists(raw_zip) and not os.path.exists(corpus_zip):
            os.rename(raw_zip, corpus_zip)
        print(corpus_zip)

        with ZipFile(corpus_zip, mode="r") as corpuszip:
            if not os.path.exists(processed_dir + "/utterances.jsonl"):
                corpuszip.extract("utterances.jsonl", path=processed_dir + "/")

        # Make the unfiltered text files (skip when any already exist).
        if any(os.path.exists(f) for f in getTextFileNames(corpusname, filtered=False)):
            print(corpusname + " has already been converted to unfiltered text files, moving on")
        else:
            convertToText(corpusname)

        # Remove stopwords (skip when filtered files already exist).
        if any(os.path.exists(f) for f in getTextFileNames(corpusname)):
            print(corpusname + " has already had its text files filtered")
        else:
            removeStopwordsFromConverted(corpusname)
def recordAudienceData(corpusname, csvwritter):
    """Write one CSV row for *corpusname*: for each filtered text file, the
    fraction (rounded to 5 places) of comments containing a mention, or 0
    when the file is missing or empty.

    Args:
        corpusname: subreddit/corpus name used to locate its text files.
        csvwritter: an open csv.writer to append the row to.
    """
    towrite = [corpusname]
    for file in getTextFileNames(corpusname, filtered=True):
        print("recording the file: " + file)
        if path.exists(file):
            comment_with, numcomments = collectAudienceFreqData(file)
            print("num comment: " + str(numcomments))
            print("num w/ thing: " + str(comment_with))
            # BUG FIX: an existing-but-empty file yields numcomments == 0;
            # guard against ZeroDivisionError and record 0 instead.
            freq = round(comment_with / numcomments, 5) if numcomments else 0
            towrite.append(freq)
        else:
            print("no such file: " + file)
            towrite.append(0)
    csvwritter.writerow(towrite)
def recordCountData(corpusname, csvwritter):
    """Write one CSV row of raw regionalism counts for *corpusname*."""
    combined = FreqDist()
    for filename in getTextFileNames(corpusname):
        print("recording the file: " + filename)
        if path.exists(filename):
            combined = collectFreqData(filename) + combined
    # The comment sentinel is bookkeeping, not a real word — zero it out.
    combined["<end_comment>"] = 0
    row = dict()
    row["Subreddit"] = corpusname
    for word in getRegionalisms():
        row[word] = combined[word]
    csvwritter.writerow(row)
def main(prefix=""):
    """Compute TF-IDF vectors over the subreddits in CurrentSubredditSet and
    record per-subreddit regionalism scores.

    Args:
        prefix: accepted for interface compatibility; currently unused here.
    """
    readRegionalisms()
    # To select subreddits to process, add their name to the CurrentSubredditSet file.
    # NOTE: readSubredditSet actually returns a list.
    toAnalyze = readSubredditSet()
    print(toAnalyze)

    # Build the full vocabulary: regionalisms plus every unique word seen.
    allwords = getRegionalisms().copy()
    for subreddit in toAnalyze:
        numprevwords = len(allwords)
        print("num unique words: " + str(numprevwords))
        allwords = allwords.union(getUniqueWords(subreddit))
        print(subreddit + " added " + str(len(allwords) - numprevwords) + " words.")
    print("in total, there are " + str(len(allwords)) + " words in the vector.")

    # One FreqDist per subreddit, in toAnalyze order.
    frequenceylist = list()
    for subreddit in toAnalyze:
        currentfreq = initalizeFreqDistWithKeys(allwords)
        print("num keys in frequency: " + str(len(currentfreq.keys())))
        for file in getTextFileNames(subreddit):
            # BUG FIX: skip missing files, consistent with the existence
            # checks done everywhere else in this module.
            if path.exists(file):
                currentfreq = collectFreqData(file) + currentfreq
        frequenceylist.append(currentfreq)
        # BUG FIX: added the missing space before "unique" in the message.
        print("currently, there are " + str(len(currentfreq.keys())) + " unique words in " + subreddit)

    idf_scores = computeIDF(frequenceylist, allwords)
    # tfidf_scores[i] corresponds to toAnalyze[i] (same iteration order).
    tfidf_scores = list()
    for frequency in frequenceylist:
        tfidf_scores.append(computeTFIDF(frequency, idf_scores, allwords))

    csv_keys = getRegionalismsOnlyCSVKeys()
    #csv_keys = getAllWordCSVKeys(allwords)
    for i in range(len(toAnalyze)):
        recordRegionalismTFIDF(tfidf_scores[i], toAnalyze[i], csv_keys)
def recordFrequencyData(corpusname, csvwritter, useLogFreq=False):
    """Write one CSV row of relative regionalism frequencies for *corpusname*.

    When useLogFreq is True, the natural log of the relative frequency is
    written instead; words with zero count always record 0.
    """
    combined = FreqDist()
    for filename in getTextFileNames(corpusname):
        print("recording the file: " + filename)
        if path.exists(filename):
            combined = collectFreqData(filename) + combined
    row = dict()
    row["Subreddit"] = corpusname
    total = combined.N()
    for word in getRegionalisms():
        count = combined[word]
        if count == 0:
            row[word] = 0
        elif useLogFreq:
            row[word] = math.log(count / total)
        else:
            row[word] = count / total
    csvwritter.writerow(row)
def unigramFreqFile(subreddit):
    """Build a unigram count file for *subreddit* from its filtered text files.

    Output format: the total token count (N) on the first line, then one
    "<word> <count>" line per distinct word.
    """
    # Get filtered files.
    filenames = getTextFileNames(subreddit)
    countFileName = getCountFileName(subreddit)
    with open(countFileName, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countFileName)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for word in line.split():
                        word = word.strip()
                        # Skip URLs and pure numbers; cap pathological token lengths.
                        if word.startswith("http") or word.isnumeric():
                            continue
                        if 0 < len(word) < 23:
                            frequencies[word] = frequencies.get(word, 0) + 1
        # The comment sentinel must not count as a word.
        frequencies["<end_comment>"] = 0
        # Write total number of words.
        # BUG FIX: newline added — the original fused N with the first word record.
        countVectorFile.write(str(frequencies.N()) + "\n")
        for word in frequencies:
            countVectorFile.write(word + " " + str(frequencies[word]) + "\n")
def bigramFreqFile(subreddit):
    """Build a bigram count file for *subreddit* from its filtered text files.

    Output format: the total bigram count (N) on the first line, then one
    "<w1> <w2> <count>" line per distinct bigram.
    """
    # Get filtered files.
    filenames = getTextFileNames(subreddit)
    countfilename = getCountFileName(subreddit, unigram=False)
    with open(countfilename, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()
        # Good candidate for multithreading: one FreqDist per file, merged after all finish.
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countfilename)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    # (list() wrapper removed — bigrams() is iterated directly.)
                    for bigram in bigrams(line.split()):
                        okayrange = 0 < len(bigram[0]) < 23 and 0 < len(bigram[1]) < 23
                        # Don't count pairs that span a comment boundary.
                        if okayrange and bigram[1] != "<end_comment>":
                            frequencies[bigram] = frequencies.get(bigram, 0) + 1
        # BUG FIX: newlines added — the original wrote N and every bigram
        # record onto a single unparseable line.
        countVectorFile.write(str(frequencies.N()) + "\n")
        # NOTE: another good improvement — organize this for faster searching.
        for bigram in frequencies:
            countVectorFile.write(" ".join(bigram) + " " + str(frequencies[bigram]) + "\n")
def analyzeAudienceData(corpuslist, outfilename="results_audience", prefix=""):
    """Append one audience-frequency row per corpus to a results CSV,
    writing the header only when the file is first created.

    Args:
        corpuslist: iterable of subreddit/corpus names.
        outfilename: base name of the results file (without extension).
        prefix: optional filename prefix.
    """
    if len(prefix) > 0:
        datafilepath = datadirectory + "/results/" + prefix + outfilename + ".csv"
    else:
        datafilepath = datadirectory + "/results/" + outfilename + ".csv"
    write_header = not path.exists(datafilepath)
    if write_header:
        # Create an empty file.
        open(datafilepath, "x").close()
    # IMPROVEMENT: a context manager guarantees the handle is closed even on
    # error, and the duplicated open()/writer setup is collapsed into one.
    with open(datafilepath, "a", newline='') as csvfile:
        csvwriter = csv.writer(csvfile, dialect='excel')
        if write_header:
            # See getUnfilteredTextFilename for order.
            csvwriter.writerow([
                "Subreddit", "Post", "Post with Mention", "Comment ",
                "Comment with Mention"
            ])
        for corpus_name in corpuslist:
            # (The original fetched getTextFileNames here but never used it.)
            recordAudienceData(corpus_name, csvwriter)
def removeStopwordsFromConverted(corpusname):
    """Run stopword removal over each existing unfiltered text file of *corpusname*."""
    unfiltered_files = getTextFileNames(corpusname, filtered=False)
    for textfile in unfiltered_files:
        if path.exists(textfile):
            removestopwords(textfile)
def recordStatsData(corpusname, csvwritter):
    """Write one CSV row of corpus-level statistics for *corpusname*:
    token totals (N), vocabulary sizes (B), and utterance counts — overall
    and broken down by file category.

    Args:
        corpusname: subreddit/corpus name used to locate its text files.
        csvwritter: an open csv.DictWriter to append the row to.
    """
    # Per-category CSV key suffixes, in the order returned by
    # getTextFileNames(..., filtered=False): post-no-mention, post-mention,
    # comment-no-mention, comment-mention. The legacy keys are preserved
    # byte-for-byte, including their inconsistent spacing.
    categories = [
        ("N-Post", "B-Post", "Num Utterences - Post NM"),
        ("N-Post with Mention", "B-Post with Mention", "Num Utterences - Post M"),
        ("N -Comment", "B -Comment", "Num Utterences - Comment"),
        ("N -Comment with Mention", "B -Comment with Mention", "Num Utterences - Comment M"),
    ]
    processed_corpus_texts = getTextFileNames(corpusname, filtered=False)
    totalFQ = FreqDist()
    towrite = dict()
    total_utterances = 0
    # IMPROVEMENT: the four copy-pasted stanzas of the original are collapsed
    # into one loop. This also fixes a latent NameError: the original
    # re-checked path.exists at write time, so a file removed mid-run would
    # reference an unbound freqs_* variable.
    for filename, (n_key, b_key, utt_key) in zip(processed_corpus_texts, categories):
        if path.exists(filename):
            print("reading: " + filename)
            freqs = collectFreqData(filename)
            totalFQ = totalFQ + freqs
            junk, numcomments = collectAudienceFreqData(filename)
            towrite[n_key] = freqs.N()
            towrite[b_key] = freqs.B()
        else:
            numcomments = 0
            towrite[n_key] = 0
            towrite[b_key] = 0
        towrite[utt_key] = numcomments
        total_utterances += numcomments
    print("writing")
    towrite["Subreddit"] = corpusname
    towrite["N"] = totalFQ.N()
    towrite["B"] = totalFQ.B()
    towrite["Num Utterences"] = total_utterances
    csvwritter.writerow(towrite)