def populate_overall_sentiment(directory, overall_list, year_list, afinn):
    """Compute the mean AFINN sentiment of every document in a corpus.

    Walks ``directory`` (documents are opened relative to ``directory``
    itself), scores the "Filtered Text" tokens of each JSON volume, and
    buckets the per-document mean score by publication period.

    :param directory: path to a directory of JSON volume files
    :param overall_list: keyword list for the result dict; expected to
        hold a single catch-all entry (e.g. "all")
    :param year_list: period boundaries used to bucket documents
    :param afinn: scorer whose ``score(token)`` returns a numeric value
    :return: dict of {period: {keyword: [(filename, mean_sentiment), ...]}}

    NOTE(review): ``yrange_min`` / ``yrange_max`` are read from module
    scope rather than passed in -- confirm they are set before calling.
    """
    overall_sent = common.build_dict_of_lists(year_list, overall_list)
    for subdir, dirs, files in os.walk(directory):
        print("Calculating sentiment across entire corpus.")
        for jsondoc in tqdm.tqdm(files):
            # skip hidden files such as .DS_Store
            if jsondoc[0] != ".":
                with open(directory + "/" + jsondoc, 'r',
                          encoding='utf8') as inpt:
                    jsondata = json.load(inpt)
                text = jsondata["Filtered Text"]
                year = int(jsondata["Year Published"])
                # check to make sure it's within range specified by user
                if yrange_min <= year < yrange_max:
                    # determine which period it falls within
                    target = common.determine_year(year, year_list)
                    sentiment = sum(afinn.score(token) for token in text)
                    # guard against empty documents -- the original
                    # division raised ZeroDivisionError here
                    truncated_sentiment = (
                        float(sentiment / len(text)) if text else 0.0)
                    # even though overall_list only has one keyword, this
                    # looks better than hard-coding "all" within the method
                    for keyword in overall_list:
                        # append entry as tuple rather than just sentiment
                        # score so sent_calcs can compute an average
                        overall_sent[target][keyword].append(
                            (jsondoc, truncated_sentiment))
    return overall_sent
def init_sent_doc_dict(input_dir, key_list, year_list, stopwords, yrange_min,
                       yrange_max, text_type):
    """Build a {period: {keyword: [token_list, ...]}} volumes dictionary.

    Each immediate subdirectory of ``input_dir`` is treated as a keyword;
    every JSON volume inside it contributes its ``text_type`` token list
    (minus stopwords and tokens shorter than two characters), bucketed by
    the period its "Year Published" value falls into.

    :param input_dir: corpus root with one subdirectory per keyword
    :param key_list: keywords expected as subdirectory names
    :param year_list: period boundaries used to bucket documents
    :param stopwords: set of tokens to drop from each document
    :param yrange_min: inclusive lower bound on publication year
    :param yrange_max: exclusive upper bound on publication year
    :param text_type: JSON field holding the token list to analyze
    :return: nested dict of token lists keyed by period then keyword
    """
    doc_dict = common.build_dict_of_lists(year_list, key_list)
    for dirs, subdirs, files in os.walk(input_dir):
        # 'subdir' corresponds to each keyword
        print("Building volumes dictionary.")
        for subdir in tqdm.tqdm(subdirs):
            for folders, subfolders, file in os.walk(dirs + "/" + subdir):
                for jsondoc in file:
                    # skip hidden files such as .DS_Store
                    if jsondoc[0] != ".":
                        with open(dirs + "/" + subdir + "/" + jsondoc, 'r',
                                  encoding='utf8') as inpt:
                            jsondata = json.load(inpt)
                        # drop stopwords and empty/one-character strings in
                        # a single pass (replaces the reverse-index del loop)
                        text = [w for w in jsondata[text_type]
                                if w not in stopwords and len(w) >= 2]
                        year = int(jsondata["Year Published"])
                        # check to make sure it's within range specified by user
                        if yrange_min <= year < yrange_max:
                            target = common.determine_year(year, year_list)
                            try:
                                doc_dict[target][subdir].append(text)
                            except KeyError:
                                # subdir is not a tracked keyword (or the
                                # period is missing) -- skip deliberately
                                pass
    return doc_dict
def keyword_and_word_count(year_list, directory, yrange_min, yrange_max,
                           keywords):
    """Tally total words and per-keyword occurrences for each period.

    :param year_list: period boundaries used to bucket documents
    :param directory: path to a directory of JSON volume files
    :param yrange_min: inclusive lower bound on publication year
    :param yrange_max: exclusive upper bound on publication year
    :param keywords: keywords to count; "/"-joined groups of single words,
        or sequences of word pairs in bigram mode
    :return: [word_count_dict, frequency_list, word_totals]

    NOTE(review): ``text_type`` and ``bigrams`` are read from module
    scope rather than passed in -- confirm they are set before calling.
    """
    word_totals = common.build_simple_dict_of_nums(year_list)
    word_count_dict = common.build_nested_dict_of_nums(year_list, keywords)
    frequency_list = common.build_dict_of_lists(year_list, keywords)
    for subdir, dirs, files in os.walk(directory):
        print("Taking word counts")
        for jsondoc in tqdm.tqdm(files):
            # skip hidden files such as .DS_Store
            if jsondoc[0] != ".":
                with open(directory + "/" + jsondoc, 'r',
                          encoding='utf8') as in_file:
                    jsondata = json.load(in_file)
                text = jsondata[text_type]
                if bigrams:
                    # BUG FIX: nltk.bigrams returns a generator; the old
                    # len(list(text)) exhausted it, so the FreqDist below
                    # was built from an empty iterator. Materialize once.
                    text = list(nltk.bigrams(text))
                num_words = len(text)
                try:
                    year = int(jsondata["Year Published"])
                except KeyError:
                    # some volumes store their date under a different key
                    year = int(jsondata["Date"])
                # check to make sure it's within range specified by user
                if yrange_min <= year < yrange_max:
                    target = common.determine_year(year, year_list)
                    fdist = nltk.FreqDist(text)
                    for keyword in keywords:
                        # keeping this here for bigrams
                        word_count = 0
                        # update keyword count for period/keyword pair
                        if not bigrams:
                            keys = keyword.split("/")
                            for k in keys:
                                word_count += fdist[k]
                                word_count_dict[target][keyword][k] += fdist[k]
                        else:
                            # TODO: implement same functionality above for bigrams
                            # TODO: pretty much everything for bigrams is not functional
                            for i in range(len(keyword)):
                                word_count += fdist[keyword[i]]
                        try:
                            # NOTE(review): num_words is added once per
                            # keyword, so word_totals scales with
                            # len(keywords) -- verify downstream consumers
                            # expect that before changing it.
                            word_totals[target] += num_words
                            word_count_dict[target][keyword][
                                "TOTAL"] += word_count
                            frequency_list[target][keyword].append(
                                word_count)
                        except KeyError:
                            # decade out of range
                            pass
    return [word_count_dict, frequency_list, word_totals]
def calculate_tfidf_results(year_list, keywords, directory, idf_results,
                            yrange_min, yrange_max):
    """Score every document against every keyword with TF-IDF.

    Walks ``directory``, computes a term-frequency distribution per JSON
    volume, multiplies by the precomputed IDF for each period/keyword
    pair, and returns the (document, score) pairs sorted ascending.

    :param year_list: period boundaries used to bucket documents
    :param keywords: keywords; "/"-joined single-word groups, or word-pair
        sequences in bigram mode
    :param directory: path to a directory of JSON volume files
    :param idf_results: {period: {keyword: idf}} lookup
    :param yrange_min: inclusive lower bound on publication year
    :param yrange_max: exclusive upper bound on publication year
    :return: {period: {keyword: [(filename, tfidf), ...]}} sorted by score
    """
    tf_idf_results = common.build_dict_of_lists(year_list, keywords)
    for subdir, dirs, files in os.walk(directory):
        print("Calculating TF-IDF scores.")
        for jsondoc in tqdm.tqdm(files):
            # hidden files (e.g. .DS_Store) are not volumes
            if jsondoc[0] == ".":
                continue
            with open(directory + "/" + jsondoc, 'r',
                      encoding='utf8') as inpt:
                jsondata = json.load(inpt)
            text = jsondata[text_type]
            if bigrams:
                text = nltk.bigrams(text)
            try:
                year = int(jsondata["Year Published"])
            except KeyError:
                # fall back to the alternate date field
                year = int(jsondata["Date"])
            # skip documents outside the user-specified year range
            if not (yrange_min <= year < yrange_max):
                continue
            target = common.determine_year(year, year_list)
            # word frequency distribution for this document
            fdist = nltk.FreqDist(text)
            for keyword in keywords:
                # single-word keywords may be grouped together with a "/"
                # separator; bigram keywords are sequences of word pairs
                if not bigrams:
                    tf = sum(calculate_tf(fdist, w)
                             for w in keyword.split("/"))
                else:
                    tf = sum(calculate_tf(fdist, pair) for pair in keyword)
                try:
                    idf = idf_results[target][keyword]
                    # store document/score pairs so callers can rank files
                    tf_idf_results[target][keyword].append(
                        (jsondoc, calculate_tfidf(idf, tf)))
                except KeyError:
                    pass
    # order each period/keyword list by ascending score
    for year in year_list:
        for keyword in keywords:
            tf_idf_results[year][keyword].sort(key=lambda pair: pair[1])
    return tf_idf_results
def populate_sent_dict(directory, key_list, year_list, afinn):
    """Score each keyword subdirectory's volumes with AFINN sentiment.

    Each immediate subdirectory of ``directory`` corresponds to one
    keyword; every JSON volume inside it contributes an
    (filename, sentiment) pair, bucketed by publication period, and the
    result is returned sorted via ``sort_sent_dict``.

    :param directory: corpus root with one subdirectory per keyword
    :param key_list: keywords expected as subdirectory names
    :param year_list: period boundaries used to bucket documents
    :param afinn: scorer whose ``score(text)`` returns a numeric value
    :return: sorted {period: {keyword: [(filename, sentiment), ...]}}

    NOTE(review): ``yrange_min`` / ``yrange_max`` are read from module
    scope rather than passed in -- confirm they are set before calling.
    """
    sent_dict = common.build_dict_of_lists(year_list, key_list)
    for root, keyword_dirs, _files in os.walk(directory):
        # each subdirectory corresponds to a keyword
        for keyword_dir in keyword_dirs:
            for _base, _folders, filenames in os.walk(
                    root + "/" + keyword_dir):
                for jsondoc in filenames:
                    # hidden files (e.g. .DS_Store) are not volumes
                    if jsondoc[0] == ".":
                        continue
                    path = root + "/" + keyword_dir + "/" + jsondoc
                    with open(path, 'r', encoding='utf8') as inpt:
                        jsondata = json.load(inpt)
                    text = jsondata["Text"]
                    year = int(jsondata["Year Published"])
                    # keep only volumes inside the user-specified range
                    if yrange_min <= year < yrange_max:
                        target = common.determine_year(year, year_list)
                        sent_dict[target][keyword_dir].append(
                            (jsondoc, afinn.score(text)))
    return sort_sent_dict(year_list, key_list, sent_dict)
def main():
    """CLI driver: build per-period/per-keyword LDA or LSI topic models
    over a keyword-foldered corpus and write the topics to a text file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", metavar='in-directory', action="store",
                        help="input directory argument")
    parser.add_argument("-txt", help="output text file argument",
                        action="store")
    parser.add_argument("-num_topics", action="store",
                        help="number of topics to display")
    parser.add_argument("-num_words", action="store",
                        help="number of words per topic")
    parser.add_argument("-weights", action="store_true",
                        help="display topic weights with topics")
    parser.add_argument("-lang", action="store", help="language")
    parser.add_argument("-type", action="store",
                        help="json field to analyze")
    parser.add_argument("-ignore", action="store",
                        help="path to ignored list json file")
    parser.add_argument(
        "-p",
        help=
        "boolean to analyze by different periods rather than a fixed increment value",
        action="store_true")
    parser.add_argument(
        "-y",
        help="min/max for year range and increment value, surround with quotes",
        action="store")
    parser.add_argument("-lsi", help="Topic modeling vida LSI",
                        action="store_true")
    parser.add_argument("-include_keys",
                        help="don't filter keywords from topics",
                        action="store_true")
    parser.add_argument("-passes", help="number of passes on corpus",
                        action="store")
    parser.add_argument("-seed",
                        help="generator seed for deterministic(ish) modeling",
                        action="store")
    try:
        args = parser.parse_args()
    except IOError as msg:
        print(parser.error(str(msg)))
    # resolve defaults for optional numeric/string flags
    if args.num_topics is None:
        num_topics = 10
    else:
        num_topics = int(args.num_topics)
    if args.num_words is None:
        num_words = 10
    else:
        num_words = int(args.num_words)
    if args.type is not None:
        text_type = args.type
    else:
        text_type = "Words"
    if args.passes is None:
        passes = 1
    else:
        passes = int(args.passes)
    # presence of -seed switches LDA into deterministic(ish) mode
    if args.seed is None:
        deterministic = False
    else:
        deterministic = True
        seed = int(args.seed)
    # -lsi selects LSI; otherwise LDA is used (mutually exclusive here)
    if args.lsi:
        lda = False
        lsi = True
    else:
        lda = True
        lsi = False
    weights = args.weights
    periods = args.p
    include_keys = args.include_keys
    language = args.lang
    range_years = args.y.split()
    year_params = common.year_params(range_years, periods)
    increment, yrange_min, yrange_max = year_params[0], year_params[
        1], year_params[2]
    # initialize list of years and dict to keep track of
    # how many books between each year range
    year_list = common.build_year_list(increment, range_years, periods,
                                       yrange_max, yrange_min)
    # build list of keywords that we'll be making topic models for
    key_list = build_key_list(args.i)
    # add words in json file to stopwords set, if filepath is given
    if args.ignore is not None:
        stopwords = build_ignore_list(args.ignore, language)
    else:
        stopwords = set(nltk.corpus.stopwords.words(language))
    # add keywords in stopwords set if include_keys is not set
    if not include_keys:
        for key in key_list:
            sub_keys = key.split("_")
            for wd in sub_keys:
                stopwords.add(wd)
    doc_dict = init_sent_doc_dict(args.i, key_list, year_list, stopwords,
                                  yrange_min, yrange_max, text_type)
    dictionary_dict = build_frequency_dict(doc_dict, key_list, year_list)
    corpus_dict = common.build_dict_of_lists(year_list, key_list)
    if lda:
        lda_dict = common.build_dict_of_lists(year_list, key_list)
    if deterministic:
        # generator seed
        rands = numpy.random.RandomState(seed)
    if lsi:
        tfidf_dict = common.build_dict_of_lists(year_list, key_list)
        lsi_dict = common.build_dict_of_lists(year_list, key_list)
    print("Building topic models.")
    for year in tqdm.tqdm(year_list):
        for key in key_list:
            # convert each tokenized document to a bag-of-words vector
            corpus_dict[year][key] = \
                [dictionary_dict[year][key].doc2bow(doc)
                 for doc in doc_dict[year][key]]
            numdocs = len(corpus_dict[year][key])
            if lda:
                try:
                    if not deterministic:
                        # stochastic
                        lda_dict[year][key] = (gensim.models.ldamodel.LdaModel(
                            corpus=corpus_dict[year][key],
                            id2word=dictionary_dict[year][key],
                            num_topics=num_topics,
                            passes=passes), numdocs)
                    else:
                        # deterministic (ish)
                        lda_dict[year][key] = (gensim.models.ldamodel.LdaModel(
                            corpus=corpus_dict[year][key],
                            id2word=dictionary_dict[year][key],
                            num_topics=num_topics,
                            random_state=rands,
                            passes=passes), numdocs)
                except ValueError:
                    # gensim raises ValueError on an empty corpus
                    lda_dict[year][key] = "No Documents for this period."
            if lsi:
                try:
                    tfidf_dict[year][key] = gensim.models.TfidfModel(
                        corpus_dict[year][key])
                    tfidf = tfidf_dict[year][key]
                    lsi_dict[year][key] = gensim.models.LsiModel(
                        corpus=tfidf[corpus_dict[year][key]],
                        id2word=dictionary_dict[year][key],
                        num_topics=200)
                except ValueError:
                    lsi_dict[year][key] = "No Documents for this period."
    # write one section per period, one sub-section per keyword
    with open(args.txt + '.txt', 'w', encoding='utf8') as txt_out:
        txt_out.write("Topics per period / keyword pair: " + "\n")
        for i in range(len(year_list) - 1):
            txt_out.write("Period: {0} - {1}".format(str(
                year_list[i]), str(year_list[i + 1])) + "\n")
            for key in key_list:
                txt_out.write("For extracted documents around {0}:".format(
                    str(key).replace("_", "/")) + "\n")
                # NOTE(review): lda_dict is referenced here even when -lsi
                # was passed (lda_dict is then never assigned) -- this
                # looks like it raises NameError in LSI mode; confirm.
                txt_out.write(
                    "Number of documents for this period / keyword pair: {0}".
                    format(str(lda_dict[year_list[i]][key][1])) + "\n")
                try:
                    if lda:
                        topics = lda_dict[year_list[i]][key][0].show_topics(
                            num_topics=num_topics, num_words=num_words)
                    if lsi:
                        try:
                            topics = lsi_dict[year_list[i]][key].show_topics(
                                num_topics=num_topics, num_words=num_words)
                        except TypeError:
                            topics = [
                                "There were no documents for this period."
                            ]
                    j = 1
                    for topic in topics:
                        if weights:
                            # keep the "weight*word" pairs and print both
                            topic = str(topic[1])
                            filtered = topic.split('+')
                            # iterate in reverse so del doesn't skip items
                            for k in range(len(filtered) - 1, -1, -1):
                                if filtered[k] == "" or filtered[k] == "None":
                                    del filtered[k]
                                else:
                                    filtered[k] = filtered[k].split('*')
                            for k in range(len(filtered)):
                                if k == 0:
                                    txt_out.write(
                                        "Topic {0}: {1} ({2}), ".format(
                                            str(j), filtered[k][1].strip(),
                                            filtered[k][0].strip()))
                                elif k == len(filtered) - 1:
                                    txt_out.write("{0} ({1})".format(
                                        filtered[k][1].strip(),
                                        filtered[k][0].strip()))
                                else:
                                    txt_out.write("{0} ({1}), ".format(
                                        filtered[k][1].strip(),
                                        filtered[k][0].strip()))
                        else:
                            # strip the numeric weights, keep words only
                            topic = str(topic)
                            filtered = re.split('\W[0-9]*', topic)
                            for k in range(len(filtered) - 1, -1, -1):
                                if filtered[k] == "" or filtered[k] == "None":
                                    del filtered[k]
                                else:
                                    filtered[k] = filtered[k].lower()
                            txt_out.write("Topic {0}: {1}".format(
                                str(j), ", ".join(filtered)))
                        j += 1
                        txt_out.write("\n")
                    txt_out.write("\n")
                except AttributeError:
                    # the slot holds the "No Documents" placeholder string
                    txt_out.write(lda_dict[year_list[i]][key])
                    txt_out.write("\n")