import matplotlib.pyplot as plt
import pandas as pd

import helpers  # project-local utilities (load_dataset, path_checker, dataframe_to_csv, ...)
import settings as ds  # hypothetical module name; provides ds.dataset, ds.output_data, ds.all_datasets


def year_graph_generator(df, col_name, path):
    """Plot the 10 most frequent keywords/terms for a given year column."""
    col_name = str(col_name)
    year = col_name
    filename = path + year + ".png"
    helpers.path_checker(path)
    plt.style.use('fivethirtyeight')
    df = df.sort_values(col_name, ascending=False)
    # Keep up to `limit` rows with a non-zero count for this year.
    df2 = pd.DataFrame(columns=['keyword', col_name])
    limit = 10
    counter = 0
    for index, row in df.iterrows():
        if row[col_name] != 0:
            df2 = pd.concat([df2, df[df.index == index]])
            counter += 1
        if counter == limit:
            break
    df2.set_index('keyword', drop=True, inplace=True)
    if len(df2) > 0:
        df2[col_name].plot.bar(figsize=(20, 12.75))
        plt.xlabel("Keyword/Term")
        plt.ylabel("Number of Tweets")
        plt.title("10 most frequent keywords/terms for " + year)
        plt.subplots_adjust(bottom=0.3)
        plt.savefig(filename)
        plt.close()
    return filename


def month_year_graph_generator(df, col_name, path):
    """Plot the 10 most frequent keywords/terms for a "<month>_<year>" column."""
    col_name = str(col_name)
    month_string, year = col_name.split("_")
    filename = path + col_name + ".png"
    helpers.path_checker(path)
    month_names = {"1": "January", "2": "February", "3": "March",
                   "4": "April", "5": "May", "6": "June",
                   "7": "July", "8": "August", "9": "September",
                   "10": "October", "11": "November", "12": "December"}
    month_name = month_names[month_string]
    plt.style.use('fivethirtyeight')
    df = df.sort_values(col_name, ascending=False)
    # Keep up to `limit` rows with a non-zero count for this month/year.
    df2 = pd.DataFrame(columns=['keyword', col_name])
    limit = 10
    counter = 0
    for index, row in df.iterrows():
        if row[col_name] != 0:
            df2 = pd.concat([df2, df[df.index == index]])
            counter += 1
        if counter == limit:
            break
    df2.set_index('keyword', drop=True, inplace=True)
    if len(df2) > 0:
        df2[col_name].plot.bar(figsize=(20, 12.75))
        plt.xlabel("Keyword/Term")
        plt.ylabel("Number of Tweets")
        plt.title("10 most frequent keywords/terms for " + month_name + " " + year)
        plt.subplots_adjust(bottom=0.35)
        plt.savefig(filename)
        plt.close()
    return filename


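# Both generators above pick the same "top 10 non-zero rows" before plotting.
# A sketch of that selection factored into a shared helper (hypothetical name
# `top_nonzero`; not part of the original module):
def top_nonzero(df, col_name, limit=10):
    """Return up to `limit` rows with a non-zero count in `col_name`."""
    nonzero = df[df[col_name] != 0]
    return nonzero.sort_values(col_name, ascending=False).head(limit)

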
def processing():
    keyword_list = helpers.load_dataset(ds.output_data + "keywords/keywords_single_list.csv")
    store = {}
    keyword_list = list_creator(keyword_list)
    file_paths = []
    for df in ds.all_datasets:
        print(" - Processing", df)
        f_name = df
        store[f_name] = {}
        df = helpers.load_dataset(ds.dataset + df)
        df = df[df.tweet_language == "en"]
        for index, row in df.iterrows():
            matches = check_keyword(clean_tweet(row.tweet_text), keyword_list)
            if len(matches) != 0:
                store[f_name][row.tweetid] = matches
    # storage
    matches_counter = 0
    for f_name in store:
        data_list = []
        dataset, filename = f_name.split("/")
        path = ds.output_data + "individual_keyword_matches/"
        dataset_path = path + dataset + "/"
        helpers.path_checker(dataset_path)
        file_path = dataset_path + filename
        for item in store[f_name]:
            data_list.append([item, store[f_name][item]])
            matches_counter += 1
        helpers.data_to_file_two_values(data_list, '"tweet_id","matches"', file_path)
        file_paths.append(file_path)
    return file_paths


def single_list_generator():
    df = helpers.load_dataset("original_keywords.csv")
    list_of_terms = df.keywords.tolist()
    individual_terms = word_extractor(list_of_terms)
    list_df = pd.DataFrame(individual_terms, columns=["keyword"])
    output_dir = ds.output_data + "keywords/"
    helpers.path_checker(output_dir)
    output_file = output_dir + "keywords_single_list.csv"
    helpers.dataframe_to_csv(list_df, output_file)
    return output_file


def tagged_keywords_generator():
    df = helpers.load_dataset(ds.output_data + "keywords/original_keywords.csv")
    # Collect the first column (the keyword) from each row.
    list_of_terms = []
    for item in df.iterrows():
        list_of_terms.append(item[1][0])
    tagged_terms = term_tagger(list_of_terms)
    output_dir = ds.output_data + "keywords/"
    helpers.path_checker(output_dir)
    output_file = output_dir + "keywords_tagged.csv"
    helpers.data_to_file_two_values(tagged_terms, '"term","tag"', output_file)
    return output_file


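# Assumed call order for the keyword-preparation steps (a sketch; the actual
# orchestration lives elsewhere in the repo):
#   single_list_generator()      -> keywords/keywords_single_list.csv
#   tagged_keywords_generator()  -> keywords/keywords_tagged.csv
#   processing()                 -> individual_keyword_matches/<dataset>/<file>

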
def merge(dataset_type):
    print(" - Processing " + dataset_type + " files:")
    for file in ds.all_datasets:
        print(" - " + file)
        file_path = file.split("/")
        f_name = (ds.output_data + "first_dataset_extraction/" + dataset_type
                  + "/" + file_path[0] + "/" + file_path[1])
        df = helpers.load_dataset(f_name)
        if file == ds.all_datasets[0]:
            merge_hold = df
        else:
            merge_hold = pd.concat([merge_hold, df], sort=False)
    output_path = ds.output_data + "merged_dataset_extraction/"
    helpers.path_checker(output_path)
    file_name = dataset_type + ".csv"
    helpers.dataframe_to_csv(merge_hold, output_path + file_name)
    return output_path + file_name


def tweet_extractor():
    files_created_generic = []
    files_created_specific = []
    for file in ds.all_datasets:
        generic_df = helpers.load_dataset(ds.output_data + "actual_keyword_matches/generic/" + file)
        specific_df = helpers.load_dataset(ds.output_data + "actual_keyword_matches/specific/" + file)
        print(" - loading data", file)
        df = helpers.load_dataset(ds.dataset + file)
        df = df[df.tweet_language == "en"]
        columns = list(df.columns) + ["matches", "source_file", "month", "year"]
        df["matches"] = ""
        df["source_file"] = ""
        # Derive month/year columns from the tweet timestamp.
        df["tweet_time"] = pd.to_datetime(df["tweet_time"])
        df["month"] = df["tweet_time"].dt.month
        df["year"] = df["tweet_time"].dt.year
        specific_tweets = pd.DataFrame(columns=columns)
        generic_tweets = pd.DataFrame(columns=columns)
        specific_tweets = match_extractor(specific_df, df, specific_tweets, file, "specific")
        generic_tweets = match_extractor(generic_df, df, generic_tweets, file, "generic")
        output_data_path = ds.output_data + "first_dataset_extraction/"
        dataset, filename = file.split("/")
        specific_path = output_data_path + "specific/" + dataset + "/"
        helpers.path_checker(specific_path)
        helpers.dataframe_to_csv(specific_tweets, specific_path + filename)
        files_created_specific.append(specific_path + filename)
        generic_path = output_data_path + "generic/" + dataset + "/"
        helpers.path_checker(generic_path)
        helpers.dataframe_to_csv(generic_tweets, generic_path + filename)
        files_created_generic.append(generic_path + filename)
    return files_created_generic, files_created_specific


def total_frequency_graph_generator(df, path):
    col_name = "total"
    filename = path + "total_freq.png"
    helpers.path_checker(path)
    plt.style.use('fivethirtyeight')
    df = df.sort_values(col_name, ascending=False)
    zero = df[df[col_name] == 0]
    df = df[df[col_name] != 0]
    helpers.dataframe_to_csv(zero, ds.output_data + "statistics/zero_matches.csv")
    df.set_index('keyword', drop=True, inplace=True)
    if len(df) > 0:
        df[col_name].plot.bar(figsize=(20, 12.75))
        plt.xlabel("Keyword/Term")
        plt.ylabel("Number of Tweets")
        plt.title("Keyword frequency")
        plt.subplots_adjust(bottom=0.3)
        plt.savefig(filename)
        plt.close()
    return [filename, ds.output_data + "statistics/zero_matches.csv"]


def date_selection():
    output_files = []
    path = ds.output_data + "merged_dataset_extraction/"
    files = helpers.path_fetcher(path)
    for file in files:
        df = helpers.load_dataset(path + file)
        # Keep August-December 2013 plus all of 2014-2018.
        df_2013 = df[(df.year == 2013) & (df.month >= 8)]
        df = df[(df.year >= 2014) & (df.year <= 2018)]
        df = pd.concat([df_2013, df])
        storage_path = ds.output_data + "time_filtered_dataset_extraction/"
        helpers.path_checker(storage_path)
        helpers.dataframe_to_csv(df, storage_path + file)
        output_files.append(storage_path + file)
    return output_files


def processing():
    create_storage_dataframes()
    create_freq_matrix()
    global generic_list, specific_list, year_freq_df, month_year_freq_df, generic_specific_freq_df
    created_files = []
    file_path = ds.output_data + "time_filtered_dataset_extraction/"
    generic_tweets = file_path + "generic.csv"
    specific_tweets = file_path + "specific.csv"
    all_tweets_df = pd.concat([
        helpers.load_dataset(specific_tweets),
        helpers.load_dataset(generic_tweets)
    ])
    all_tweets_df.reset_index(inplace=True, drop=True)
    for index, row in all_tweets_df.iterrows():
        if index % 100 == 0:
            print(" -", str(index), "/", str(len(all_tweets_df)))
        generic_matches = []
        specific_matches = []
        # preprocessing: parse the stringified list stored in the CSV, e.g. "['a', 'b']"
        row.matches = row.matches.strip("''][").split("', '")
        for match in row.matches:
            generic_check = keyword_checker(match, generic_list)
            generic_matches.append(generic_check)
            specific_check = keyword_checker(match, specific_list)
            specific_matches.append(specific_check)
            if generic_check or specific_check:
                year_freq_update(match, str(row.year))
                month_year_freq_update(match, str(row.month), str(row.year))
        # Classify the tweet's matches as generic, specific, or both.
        if True in generic_matches:
            if True in specific_matches:
                for match in row.matches:
                    generic_specific_freq_update(match, "generic_specific")
            else:
                for match in row.matches:
                    generic_specific_freq_update(match, "generic")
        else:
            for match in row.matches:
                generic_specific_freq_update(match, "specific")
        freq_matrix_update(row.matches)
    # Store dataframes
    storage_path = ds.output_data + "statistics/"
    helpers.path_checker(storage_path)
    # Store year frequency
    helpers.dataframe_to_csv(year_freq_df, storage_path + "year_frequency.csv")
    created_files.append(storage_path + "year_frequency.csv")
    # Store month year frequency
    helpers.dataframe_to_csv(month_year_freq_df, storage_path + "month_year_frequency.csv")
    created_files.append(storage_path + "month_year_frequency.csv")
    # Store generic specific frequency
    helpers.dataframe_to_csv(generic_specific_freq_df, storage_path + "generic_specific_frequency.csv")
    created_files.append(storage_path + "generic_specific_frequency.csv")
    # Store frequency matrix
    helpers.dataframe_to_csv(freq_matrix, storage_path + "frequency_matrix.csv")
    created_files.append(storage_path + "frequency_matrix.csv")
    created_files.extend(preprocess_month_year_graph(month_year_freq_df))
    created_files.extend(preprocess_year_graph(year_freq_df))
    created_files.extend(preprocess_frequency_total_graph(generic_specific_freq_df))
    return created_files


def store_dataset(df):
    helpers.path_checker(ds.output_data)
    helpers.dataframe_to_csv(df, ds.output_data + "sentiwordnet_labelled.csv")


def processing():
    tagged_keywords = load_tagged_keywords()
    match_files = find_match_files()
    files_created = []
    for dataset in match_files:
        for file in match_files[dataset]:
            print(" - " + file)
            generic_matches = {}
            specific_matches = {}
            matches_df = load_match_file(file)
            for match_index, match_row in matches_df.iterrows():
                generic_counter = 0
                specific_counter = 0
                tempstore = []
                # Parse the stringified match list and drop duplicate terms.
                match_row.matches = match_row.matches.strip("''][").split("', '")
                match_row.matches = remove_duplicates(match_row.matches)
                matches_df.at[match_index, "matches"] = match_row.matches
                for keyword_index, keyword_row in tagged_keywords.iterrows():
                    if comparison(match_row.matches, keyword_row['split']):
                        if keyword_row.tag == "generic":
                            generic_counter += 1
                        if keyword_row.tag == "specific":
                            specific_counter += 1
                        tempstore.append(keyword_row.term)
                # A single specific keyword outweighs any generic matches.
                if specific_counter != 0:
                    specific_matches[match_row.tweet_id] = tempstore
                elif generic_counter != 0:
                    generic_matches[match_row.tweet_id] = tempstore
            generic_data_list = []
            specific_data_list = []
            for tweet_id in generic_matches:
                generic_data_list.append([tweet_id, generic_matches[tweet_id]])
            for tweet_id in specific_matches:
                specific_data_list.append([tweet_id, specific_matches[tweet_id]])
            generic_file_path = ds.output_data + "actual_keyword_matches/generic/" + file.split("/")[-2] + "/"
            helpers.path_checker(generic_file_path)
            generic_file_name = generic_file_path + file.split("/")[-1]
            helpers.data_to_file_two_values(generic_data_list, '"tweet_id","matches"', generic_file_name)
            files_created.append(generic_file_name)
            specific_file_path = ds.output_data + "actual_keyword_matches/specific/" + file.split("/")[-2] + "/"
            helpers.path_checker(specific_file_path)
            specific_file_name = specific_file_path + file.split("/")[-1]
            helpers.data_to_file_two_values(specific_data_list, '"tweet_id","matches"', specific_file_name)
            files_created.append(specific_file_name)
            file_path = ds.output_data + "single_keyword_matches_dup_removed/" + file.split("/")[-2] + "/"
            helpers.path_checker(file_path)
            file_name = file_path + file.split("/")[-1]
            helpers.dataframe_to_csv(matches_df, file_name)
            files_created.append(file_name)
    return files_created
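

# A minimal sketch of the `helpers` interface the functions above rely on
# (hypothetical implementations for illustration; the real module lives
# elsewhere in the repo):
import csv
import os


def path_checker(path):
    """Create `path` (and any missing parents) if it does not exist yet."""
    os.makedirs(path, exist_ok=True)


def load_dataset(file_path):
    """Read a CSV file into a DataFrame."""
    return pd.read_csv(file_path)


def dataframe_to_csv(df, file_path):
    """Write a DataFrame to CSV without the index column."""
    df.to_csv(file_path, index=False)


def data_to_file_two_values(data_list, header, file_path):
    """Write a pre-quoted `header` line, then two-column rows from `data_list`."""
    with open(file_path, "w", newline="") as f:
        f.write(header + "\n")
        csv.writer(f, quoting=csv.QUOTE_ALL).writerows(data_list)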