import csv
import json
import math
import os

import nltk
import tqdm

import common  # project-local helpers: build_dict_of_lists, determine_year, etc.


def build_years_tally(directory, year_list, yrange_min, yrange_max):
    """Count the number of volumes that fall within each period in year_list."""
    years_tally = {}
    for y in year_list:
        years_tally[y] = 0
    print("Counting number of volumes per period.")
    for subdir, dirs, files in os.walk(directory):
        for jsondoc in tqdm.tqdm(files):
            # skip hidden files
            if jsondoc[0] != ".":
                with open(directory + "/" + jsondoc, 'r', encoding='utf-8') as in_file:
                    jsondata = json.load(in_file)
                    try:
                        year = int(jsondata["Year Published"])
                    except KeyError:
                        year = int(jsondata["Date"])
                    # check to make sure it's within range specified by user
                    if yrange_min <= year < yrange_max:
                        target = common.determine_year(year, year_list)
                        try:
                            years_tally[target] += 1
                        except KeyError:
                            pass
    return years_tally
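
# A minimal sketch of the contract assumed for `common.determine_year` as it is
# used throughout this file: map a year to the start of the period in year_list
# that contains it. The name `_determine_year_sketch` is illustrative only; the
# real implementation lives in the shared `common` module.
def _determine_year_sketch(year, year_list):
    # year_list is assumed to hold ascending period start years, e.g. [1800, 1810, 1820]
    for start, end in zip(year_list, year_list[1:]):
        if start <= year < end:
            return start
    return year_list[-1]  # last, open-ended period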

# NOTE: yrange_min and yrange_max were used here without being passed in; they
# are added to the signature so the range check below is well-defined.
def populate_overall_sentiment(directory, overall_list, year_list, afinn,
                               yrange_min, yrange_max):
    overall_sent = common.build_dict_of_lists(year_list, overall_list)
    print("Calculating sentiment across entire corpus.")
    for subdir, dirs, files in os.walk(directory):
        for jsondoc in tqdm.tqdm(files):
            if jsondoc[0] != ".":
                with open(directory + "/" + jsondoc, 'r', encoding='utf8') as inpt:
                    sentiment = 0
                    jsondata = json.load(inpt)
                    text = jsondata["Filtered Text"]
                    year = int(jsondata["Year Published"])
                    # check to make sure it's within range specified by user
                    if yrange_min <= year < yrange_max:
                        # determine which period it falls within
                        target = common.determine_year(year, year_list)
                        for i in range(len(text)):
                            sentiment += afinn.score(text[i])
                        # average sentiment over the text's chunks; even though
                        # overall_list only has one keyword, this looks better
                        # than just hard-coding "all" within the method
                        truncated_sentiment = float(sentiment / len(text))
                        for keyword in overall_list:
                            # append entry as a (document, score) tuple rather
                            # than just the sentiment score so sent_calcs can
                            # compute an average later
                            overall_sent[target][keyword].append(
                                (jsondoc, truncated_sentiment))
    return overall_sent
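
# The `afinn` argument above is expected to behave like the Afinn scorer from
# the `afinn` PyPI package: score(text) returns the summed AFINN valence of a
# string as a float. A small self-contained sketch of that assumption:
def _afinn_score_example():
    from afinn import Afinn  # assumed dependency of this module's callers
    afinn = Afinn()
    # positive return values indicate positive sentiment, negative the reverse
    return afinn.score("This is utterly excellent!")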

def init_sent_doc_dict(input_dir, key_list, year_list, stopwords, yrange_min,
                       yrange_max, text_type):
    doc_dict = common.build_dict_of_lists(year_list, key_list)
    print("Building volumes dictionary.")
    for dirs, subdirs, files in os.walk(input_dir):
        # each subdirectory corresponds to a keyword
        for subdir in tqdm.tqdm(subdirs):
            for folders, subfolders, file_list in os.walk(dirs + "/" + subdir):
                for jsondoc in file_list:
                    if jsondoc[0] != ".":
                        with open(dirs + "/" + subdir + "/" + jsondoc, 'r',
                                  encoding='utf8') as inpt:
                            jsondata = json.load(inpt)
                            text = jsondata[text_type]
                            # iterate backwards so deletions don't shift the
                            # indices still to be visited; drop stopwords and
                            # empty or one-character strings
                            for i in range(len(text) - 1, -1, -1):
                                if text[i] in stopwords or len(text[i]) < 2:
                                    del text[i]
                            year = int(jsondata["Year Published"])
                            # check to make sure it's within range specified by user
                            if yrange_min <= year < yrange_max:
                                target = common.determine_year(year, year_list)
                                try:
                                    doc_dict[target][subdir].append(text)
                                except KeyError:
                                    pass
    return doc_dict
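
# The backwards deletion loop above mutates `text` in place. An equivalent and
# arguably clearer alternative (a sketch; it rebinds rather than mutates, which
# is fine here because `text` is only used locally afterwards):
def _filter_tokens(text, stopwords):
    return [w for w in text if w not in stopwords and len(w) >= 2]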

# NOTE: text_type and bigrams were used here without being defined; they are
# added to the signature to match how the other counting functions use them.
def calculate_idf_results(keywords, year_list, years_tally, directory,
                          yrange_min, yrange_max, text_type, bigrams):
    idf_results = common.build_dict_of_nums(year_list, keywords)
    print("Calculating IDF scores.")
    for subdir, dirs, files in os.walk(directory):
        for jsondoc in tqdm.tqdm(files):
            if jsondoc[0] != ".":
                with open(directory + "/" + jsondoc, 'r', encoding='utf8') as in_file:
                    jsondata = json.load(in_file)
                    text = jsondata[text_type]
                    if bigrams:
                        text = nltk.bigrams(text)
                    try:
                        year = int(jsondata["Year Published"])
                    except KeyError:
                        year = int(jsondata["Date"])
                    # check to make sure it's within range specified by user
                    if yrange_min <= year < yrange_max:
                        target = common.determine_year(year, year_list)
                        # create word frequency distribution
                        fdist = nltk.FreqDist(text)
                        for keyword in keywords:
                            if not bigrams:
                                # grouped keyword variants are separated by "/"
                                words = keyword.split("/")
                                for w in words:
                                    # count the document once if any variant occurs
                                    if fdist[w] > 0:
                                        try:
                                            idf_results[target][keyword] += 1
                                            break
                                        except KeyError:
                                            pass
                            else:
                                for i in range(len(keyword)):
                                    if fdist[keyword[i]] > 0:
                                        try:
                                            idf_results[target][keyword] += 1
                                            break
                                        except KeyError:
                                            pass
    for y in year_list:
        for keyword in keywords:
            try:
                # Add 1 before the logarithm to ensure the IDF is nonzero,
                # unless the word doesn't occur at all for the period, in
                # which case its IDF score is 0.
                if idf_results[y][keyword] > 0:
                    idf_results[y][keyword] = 1 + round(
                        math.log(years_tally[y] / idf_results[y][keyword], 10), 4)
                else:
                    idf_results[y][keyword] = 0
            except KeyError:
                pass
    return idf_results
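
# A worked example of the IDF formula above, assuming a period with
# years_tally[y] = 200 volumes in which a keyword occurs in 20 of them:
#     idf = 1 + round(math.log(200 / 20, 10), 4) = 1 + 1.0 = 2.0
# The +1 keeps a word that occurs in every volume (log10(1) == 0) from
# zeroing out all of its TF-IDF scores; 0 is reserved for words that never
# occur in the period at all.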

# NOTE: text_type and bigrams were used here without being defined; they are
# added to the signature as above.
def keyword_and_word_count(year_list, directory, yrange_min, yrange_max,
                           keywords, text_type, bigrams):
    word_totals = common.build_simple_dict_of_nums(year_list)
    word_count_dict = common.build_nested_dict_of_nums(year_list, keywords)
    frequency_list = common.build_dict_of_lists(year_list, keywords)
    print("Taking word counts")
    for subdir, dirs, files in os.walk(directory):
        for jsondoc in tqdm.tqdm(files):
            if jsondoc[0] != ".":
                with open(directory + "/" + jsondoc, 'r', encoding='utf8') as in_file:
                    jsondata = json.load(in_file)
                    text = jsondata[text_type]
                    if bigrams:
                        # materialize the generator so it can be both measured
                        # here and fed to FreqDist below; nltk.bigrams returns
                        # a one-shot generator
                        text = list(nltk.bigrams(text))
                    num_words = len(text)
                    try:
                        year = int(jsondata["Year Published"])
                    except KeyError:
                        year = int(jsondata["Date"])
                    # check to make sure it's within range specified by user
                    if yrange_min <= year < yrange_max:
                        target = common.determine_year(year, year_list)
                        fdist = nltk.FreqDist(text)
                        try:
                            # add this document's length to the period total
                            # once, not once per keyword (used for frequency
                            # as a percentage of total words)
                            word_totals[target] += num_words
                        except KeyError:
                            # decade out of range
                            pass
                        for keyword in keywords:
                            word_count = 0  # keeping this here for bigrams
                            # update keyword count for period/keyword pair
                            if not bigrams:
                                keys = keyword.split("/")
                                for k in keys:
                                    word_count += fdist[k]
                                    word_count_dict[target][keyword][k] += fdist[k]
                            else:
                                # TODO: implement same functionality as above for bigrams
                                # TODO: pretty much everything for bigrams is not functional
                                for i in range(len(keyword)):
                                    word_count += fdist[keyword[i]]
                            try:
                                word_count_dict[target][keyword]["TOTAL"] += word_count
                                # append word count to frequency list (for
                                # mean & variance of samples)
                                frequency_list[target][keyword].append(word_count)
                            except KeyError:
                                # decade out of range
                                pass
    return [word_count_dict, frequency_list, word_totals]
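
# Grouped keywords: a single entry in `keywords` may hold several surface
# variants separated by "/", which are tallied together. For example (an
# illustrative input, not taken from the source):
#     keywords = ["color/colour", "railway/railroad"]
# "color/colour" splits into ["color", "colour"]; both variants add to the
# same per-period "TOTAL" count while each also keeps its own sub-count in
# word_count_dict[period]["color/colour"]["color"] and [..]["colour"].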

# NOTE: text_type and bigrams were used here without being defined; they are
# added to the signature as above.
def calculate_tfidf_results(year_list, keywords, directory, idf_results,
                            yrange_min, yrange_max, text_type, bigrams):
    tf_idf_results = common.build_dict_of_lists(year_list, keywords)
    print("Calculating TF-IDF scores.")
    for subdir, dirs, files in os.walk(directory):
        for jsondoc in tqdm.tqdm(files):
            if jsondoc[0] != ".":
                with open(directory + "/" + jsondoc, 'r', encoding='utf8') as inpt:
                    jsondata = json.load(inpt)
                    text = jsondata[text_type]
                    if bigrams:
                        text = nltk.bigrams(text)
                    try:
                        year = int(jsondata["Year Published"])
                    except KeyError:
                        year = int(jsondata["Date"])
                    # check to make sure it's within range specified by user
                    if yrange_min <= year < yrange_max:
                        target = common.determine_year(year, year_list)
                        # create word frequency distribution
                        fdist = nltk.FreqDist(text)
                        # calculate tf and tf-idf for each keyword
                        for keyword in keywords:
                            # single-word keywords can be grouped together,
                            # separated by a "/" character
                            temp = 0
                            if not bigrams:
                                words = keyword.split("/")
                                for w in words:
                                    temp += calculate_tf(fdist, w)
                            else:
                                for i in range(len(keyword)):
                                    temp += calculate_tf(fdist, keyword[i])
                            try:
                                idf = idf_results[target][keyword]
                                tf_idf = calculate_tfidf(idf, temp)
                                # append tuple of document/tf-idf score pair
                                tf_idf_results[target][keyword].append(
                                    (jsondoc, tf_idf))
                            except KeyError:
                                pass
    # sort each period/keyword list by ascending tf-idf score
    for year in year_list:
        for keyword in keywords:
            tf_idf_results[year][keyword] = sorted(
                tf_idf_results[year][keyword], key=lambda x: x[1])
    return tf_idf_results
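
# calculate_tf and calculate_tfidf are defined elsewhere in these sources. A
# minimal sketch of the contract assumed here; the length normalization in the
# TF term is an assumption, not the confirmed original:
def _calculate_tf_sketch(fdist, word):
    # raw count normalized by document length; fdist.N() is the total number
    # of samples in the nltk.FreqDist
    return fdist[word] / fdist.N() if fdist.N() else 0.0


def _calculate_tfidf_sketch(idf, tf):
    # standard multiplicative combination of the two scores
    return tf * idf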

def build_samples(csv_inpt, year_list, yrange_min, yrange_max):
    # set up observation and sample size dicts
    p = common.build_simple_dict_of_nums(year_list)
    n = common.build_simple_dict_of_nums(year_list)
    with open(csv_inpt, 'r') as csv_file:
        read_csv = csv.reader(csv_file, delimiter=',')
        row1 = next(read_csv)
        # the last column is "total words" only if the csv file stores word
        # frequencies; otherwise the file stores binary occurrence flags
        if row1[-1] == "total words":
            binary = False
        else:
            binary = True
        print("Building a set of samples")
        for row in tqdm.tqdm(read_csv):
            if row[0] != "filename":
                year = int(row[1])
                # check to make sure it's within range specified by user
                if yrange_min <= year < yrange_max:
                    # determine which period it falls within
                    target = common.determine_year(year, year_list)
                    try:
                        if binary:
                            # one more volume to sample size w/r/t year period
                            n[target] += 1
                        else:
                            # add total words to sample size w/r/t year period
                            n[target] += int(row[-1])
                    except KeyError:
                        pass
                    for cell in row[2:-1]:
                        if binary:
                            if cell == "1":
                                try:
                                    # add one to observation dict and break
                                    p[target] += 1
                                    break
                                except KeyError:
                                    pass
                        else:
                            try:
                                # add frequency in this cell to observation dict
                                p[target] += int(cell)
                            except KeyError:
                                pass
    return [p, n]
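
# A sketch of the two CSV layouts this function distinguishes (column names
# besides "filename" and "total words" are illustrative; only those two are
# checked in the code):
#
#   frequency layout:  filename, year, kw1, kw2, ..., total words
#                      doc1.json, 1854, 3, 0, ..., 81234
#
#   binary layout:     filename, year, kw1, kw2, ...
#                      doc1.json, 1854, 1, 0, ...
#
# In the binary case n counts volumes and p counts volumes containing any
# keyword; in the frequency case n sums total words and p sums keyword
# occurrences.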

# NOTE: yrange_min and yrange_max were used here without being passed in; they
# are added to the signature so the range check below is well-defined.
def populate_sent_dict(directory, key_list, year_list, afinn, yrange_min,
                       yrange_max):
    sent_dict = common.build_dict_of_lists(year_list, key_list)
    for dirs, subdirs, files in os.walk(directory):
        # each subdirectory corresponds to a keyword
        for subdir in subdirs:
            for folders, subfolders, file_list in os.walk(dirs + "/" + subdir):
                for jsondoc in file_list:
                    if jsondoc[0] != ".":
                        with open(dirs + "/" + subdir + "/" + jsondoc, 'r',
                                  encoding='utf8') as inpt:
                            sentiment = 0
                            jsondata = json.load(inpt)
                            text = jsondata["Text"]
                            year = int(jsondata["Year Published"])
                            # check to make sure it's within range specified by user
                            if yrange_min <= year < yrange_max:
                                target = common.determine_year(year, year_list)
                                sentiment += afinn.score(text)
                                sent_dict[target][subdir].append((jsondoc, sentiment))
    sent_dict_sorted = sort_sent_dict(year_list, key_list, sent_dict)
    return sent_dict_sorted
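
# sort_sent_dict is defined elsewhere in these sources. A minimal sketch of
# the behavior assumed here, mirroring the tf-idf sorting above (an
# assumption, not the confirmed original):
def _sort_sent_dict_sketch(year_list, key_list, sent_dict):
    for year in year_list:
        for keyword in key_list:
            # order each (document, score) list by ascending sentiment score
            sent_dict[year][keyword] = sorted(
                sent_dict[year][keyword], key=lambda x: x[1])
    return sent_dict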

# NOTE: text_type and bigrams were used here without being defined; they are
# added to the signature as above.
def calculate_n_words(year_list, directory, num, yrange_min, yrange_max,
                      text_type, bigrams):
    fdist_dict = common.build_simple_dict_of_nums(year_list)
    text_lengths = common.build_simple_dict_of_nums(year_list)
    n_dict = common.build_simple_dict_of_lists(year_list)
    print("Calculating top {0} words per period".format(str(num)))
    for subdir, dirs, files in os.walk(directory):
        for jsondoc in files:
            if jsondoc[0] != ".":
                with open(directory + "/" + jsondoc, 'r', encoding='utf8') as in_file:
                    jsondata = json.load(in_file)
                    try:
                        year = int(jsondata["Year Published"])
                    except KeyError:
                        year = int(jsondata["Date"])
                    text = jsondata[text_type]
                    if bigrams:
                        # materialize the generator; len() cannot be taken of
                        # the generator nltk.bigrams returns
                        text = list(nltk.bigrams(text))
                    text_len = len(text)
                    fdist = nltk.FreqDist(text)
                    if yrange_min <= year < yrange_max:
                        target = common.determine_year(year, year_list)
                        text_lengths[target] += text_len
                        if fdist_dict[target] == 0:
                            fdist_dict[target] = fdist
                        else:
                            # FreqDist inherits from Counter, so |= keeps the
                            # max count per word across documents
                            fdist_dict[target] |= fdist
    for year in year_list:
        # never ask for more words than the period's distribution holds
        top = min(num, len(fdist_dict[year]))
        n_dict[year].extend(
            obtain_n_words(fdist_dict[year], top, text_lengths[year]))
    return n_dict
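
# A small self-contained demonstration of the Counter/FreqDist union used
# above: |= keeps the maximum count per key, it does not sum across documents.
def _freqdist_union_example():
    a = nltk.FreqDist(["whale", "whale", "sea"])
    b = nltk.FreqDist(["whale", "ship", "ship", "ship"])
    a |= b
    # a is now {"whale": 2, "ship": 3, "sea": 1} -- max per word, not 2 + 1
    return a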