Example #1
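# Shared imports assumed by the examples below; `helpers` and `ds` are
# project-local modules (path/IO utilities and dataset configuration),
# not third-party packages.
import pandas as pd
import matplotlib.pyplot as plt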
def year_graph_generator(df, col_name, path):
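    """Plot the ten most frequent keywords for one year.

    Draws a bar chart from the year column ``col_name``, saves it to
    ``path``/<year>.png and returns that filename.
    """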
    col_name = str(col_name)
    year = col_name
    filename = path + year + ".png"
    helpers.path_checker(path)
    plt.style.use('fivethirtyeight')
    df = df.sort_values(col_name, ascending=False)
    df2 = pd.DataFrame(columns=['keyword', col_name])
    limit = 10
    counter = 0
    for index, row in df.iterrows():
        if row[col_name] != 0:
            df2 = pd.concat([df2, df[df.index == index]])
            counter += 1
        if counter == limit:
            break
    df2.set_index('keyword', drop=True, inplace=True)
    if len(df2) > 0:
        ax = df2[col_name].plot.bar(figsize=(20, 12.75))
        plt.xlabel("Keyword/Term")
        plt.ylabel("Number of Tweets")
        plt.title("10 most frequent keywords/terms for " + year)
        plt.subplots_adjust(bottom=0.3)
        plt.savefig(filename)
        plt.close()
    return filename
Example #2
def month_year_graph_generator(df, col_name, path):
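    """Plot the ten most frequent keywords for one month of a year.

    ``col_name`` is expected in "<month>_<year>" form; the chart is
    saved to ``path``/<col_name>.png and the filename is returned.
    """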
    col_name = str(col_name)
    month_string = col_name.split("_")[0]
    year = col_name.split("_")[1]
    filename = path + col_name + ".png"
    helpers.path_checker(path)
    month_map = {"1": "January", "2": "February", "3": "March",
                 "4": "April", "5": "May", "6": "June", "7": "July",
                 "8": "August", "9": "September", "10": "October",
                 "11": "November", "12": "December"}
    month_name = month_map[month_string]
    plt.style.use('fivethirtyeight')
    df = df.sort_values(col_name, ascending=False)
    df2 = pd.DataFrame(columns=['keyword', col_name])
    limit = 10
    counter = 0
    for index, row in df.iterrows():
        if row[col_name] != 0:
            df2 = pd.concat([df2, df[df.index == index]])
            counter += 1
        if counter == limit:
            break
    df2.set_index('keyword', drop=True, inplace=True)
    if len(df2) > 0:
        ax = df2[col_name].plot.bar(figsize=(20, 12.75))
        plt.xlabel("Keyword/Term")
        plt.ylabel("Number of Tweets")
        plt.title("10 most frequent keywords/terms for " + month_name + " " +
                  year)
        plt.subplots_adjust(bottom=0.35)
        plt.savefig(filename)
        plt.close()
    return filename
Example #3
def processing():
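    """Match every English tweet in each dataset against the keyword list.

    Tweet ids and their matches are written to one CSV per dataset; the
    list of created file paths is returned.
    """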
    keyword_list = helpers.load_dataset(ds.output_data +
                                        "keywords/keywords_single_list.csv")
    store = {}
    keyword_list = list_creator(keyword_list)
    file_paths = []

    for df in ds.all_datasets:
        print("    - Processing", df)
        f_name = df
        store[f_name] = {}
        df = helpers.load_dataset(ds.dataset + df)
        df = df[df.tweet_language == "en"]
        for index, row in df.iterrows():
            matches = check_keyword(clean_tweet(row.tweet_text), keyword_list)
            if len(matches) != 0:
                store[f_name][row.tweetid] = matches
    # store the collected matches to per-dataset files
    matches_counter = 0
    for f_name in store:
        data_list = []
        filename = f_name.split("/")
        dataset = filename[0]
        filename = filename[1]
        path = ds.output_data + "individual_keyword_matches/"
        dataset_path = path + dataset + "/"
        helpers.path_checker(dataset_path)
        file_path = dataset_path + filename
        for item in store[f_name]:
            data_list.append([item, store[f_name][item]])
            matches_counter += 1
        helpers.data_to_file_two_values(data_list, '"tweet_id","matches"',
                                        file_path)
        file_paths.append(file_path)
    return file_paths
Example #4
def single_list_generator():
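    """Extract the individual terms from the original keywords and write
    them to keywords_single_list.csv, returning the output path."""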
    df = helpers.load_dataset("original_keywords.csv")
    list_of_terms = df.keywords.tolist()
    individual_terms = word_extractor(list_of_terms)
    list_df = pd.DataFrame(individual_terms, columns=["keyword"])
    output_dir = ds.output_data + "keywords/"
    helpers.path_checker(output_dir)
    output_file = output_dir + "keywords_single_list.csv"
    helpers.dataframe_to_csv(list_df, output_file)
    return output_file
Example #5
def tagged_keywords_generator():
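    """Tag each original keyword via term_tagger and write the term/tag
    pairs to keywords_tagged.csv, returning the output path."""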
    df = helpers.load_dataset(ds.output_data +
                              "keywords/original_keywords.csv")
    list_of_terms = []
    for item in df.iterrows():
        list_of_terms.append(item[1][0])
    tagged_terms = term_tagger(list_of_terms)
    output_dir = ds.output_data + "keywords/"
    helpers.path_checker(output_dir)
    output_file = output_dir + "keywords_tagged.csv"
    helpers.data_to_file_two_values(tagged_terms, '"term","tag"', output_file)
    return output_file
Example #6
def merge(dataset_type):
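    """Concatenate the per-dataset extraction files of one type
    (e.g. "generic" or "specific") into a single merged CSV and
    return its path."""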
    print("      - Processing " + dataset_type + " files:")
    for file in ds.all_datasets:
        print("        - " + file)
        file_path = file.split("/")
        f_name = (ds.output_data + "first_dataset_extraction/" +
                  dataset_type + "/" + file_path[0] + "/" + file_path[1])
        df = helpers.load_dataset(f_name)
        if file == ds.all_datasets[0]:
            merge_hold = df
        else:
            merge_hold = pd.concat([merge_hold, df], sort=False)
    output_path = ds.output_data + "merged_dataset_extraction/"
    helpers.path_checker(output_path)
    file_name = dataset_type + ".csv"
    helpers.dataframe_to_csv(merge_hold, output_path + file_name)
    return output_path + file_name
Example #7
def tweet_extractor():
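    """Join the generic and specific keyword matches onto the English
    tweets of every dataset, add month/year columns, and write one
    specific and one generic CSV per dataset.
    """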
    files_created_generic = []
    files_created_specific = []
    for file in ds.all_datasets:
        generic_df = helpers.load_dataset(ds.output_data +
                                          "actual_keyword_matches/generic/" +
                                          file)
        specific_df = helpers.load_dataset(ds.output_data +
                                           "actual_keyword_matches/specific/" +
                                           file)
        print("      - loading data", file)
        df = helpers.load_dataset(ds.dataset + file)
        df = df[df.tweet_language == "en"]
        columns = list(df.columns)
        columns.append("matches")
        columns.append("source_file")
        columns.append("month")
        columns.append("year")
        df["matches"] = ""
        df["source_file"] = ""
        df["tweet_time"] = df["tweet_time"].astype("datetime64")
        df["month"] = df["tweet_time"].dt.month
        df["year"] = df["tweet_time"].dt.year
        specific_tweets = pd.DataFrame(columns=columns)
        generic_tweets = pd.DataFrame(columns=columns)
        specific_tweets = match_extractor(specific_df, df, specific_tweets,
                                          file, "specific")
        generic_tweets = match_extractor(generic_df, df, generic_tweets, file,
                                         "generic")
        output_data_path = ds.output_data + "first_dataset_extraction/"
        dataset = file.split("/")[0]
        filename = file.split("/")[1]

        specific_path = output_data_path + "specific/" + dataset + "/"
        helpers.path_checker(specific_path)
        helpers.dataframe_to_csv(specific_tweets, specific_path + filename)
        files_created_specific.append(specific_path + filename)

        generic_path = output_data_path + "generic/" + dataset + "/"
        helpers.path_checker(generic_path)
        helpers.dataframe_to_csv(generic_tweets, generic_path + filename)
        files_created_generic.append(generic_path + filename)
    return files_created_generic, files_created_specific
Example #8
def total_frequency_graph_generator(df, path):
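    """Plot the total match frequency of every keyword as a bar chart.

    Keywords that never matched are written to zero_matches.csv; the
    paths of both output files are returned.
    """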
    col_name = "total"
    filename = path + "total_freq.png"
    helpers.path_checker(path)
    plt.style.use('fivethirtyeight')
    df = df.sort_values(col_name, ascending=False)
    zero = df[df[col_name] == 0]
    df = df[df[col_name] != 0]
    helpers.dataframe_to_csv(zero,
                             ds.output_data + "statistics/zero_matches.csv")
    df.set_index('keyword', drop=True, inplace=True)
    if len(df) > 0:
        ax = df[col_name].plot.bar(figsize=(20, 12.75))
        plt.xlabel("Keyword/Term")
        plt.ylabel("Number of Tweets")
        plt.title("Keyword frequency")
        plt.subplots_adjust(bottom=0.3)
        plt.savefig(filename)
        plt.close()
    return [filename, ds.output_data + "statistics/zero_matches.csv"]
Example #9
def date_selection():
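    """Filter the merged extraction files down to tweets posted between
    August 2013 and the end of 2018 and store the filtered copies."""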
    output_files = []
    path = ds.output_data + "merged_dataset_extraction/"
    files = helpers.path_fetcher(path)
    for file in files:
        df = helpers.load_dataset(path + file)
        df_2013 = df[df.year == 2013]
        df_2013_8 = df_2013[df_2013.month == 8]
        df_2013_9 = df_2013[df_2013.month == 9]
        df_2013_10 = df_2013[df_2013.month == 10]
        df_2013_11 = df_2013[df_2013.month == 11]
        df_2013_12 = df_2013[df_2013.month == 12]
        df = df[df.year.isin([2014, 2015, 2016, 2017, 2018])]
        df = pd.concat(
            [df_2013_8, df_2013_9, df_2013_10, df_2013_11, df_2013_12, df])
        storage_path = ds.output_data + "time_filtered_dataset_extraction/"
        helpers.path_checker(storage_path)
        helpers.dataframe_to_csv(df, storage_path + file)
        output_files.append(storage_path + file)
    return output_files
Example #10
def processing():
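    """Build year, month/year, generic/specific and co-occurrence
    frequency tables from the time-filtered tweets, store them as CSVs
    and generate the corresponding graphs.
    """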
    create_storage_dataframes()
    create_freq_matrix()
    global generic_list, specific_list, year_freq_df, month_year_freq_df, generic_specific_freq_df
    created_files = []
    file_path = ds.output_data + "time_filtered_dataset_extraction/"
    generic_tweets = file_path + "generic.csv"
    specific_tweets = file_path + "specific.csv"
    all_tweets_df = pd.concat([
        helpers.load_dataset(specific_tweets),
        helpers.load_dataset(generic_tweets)
    ])
    all_tweets_df.reset_index(inplace=True, drop=True)
    for index, row in all_tweets_df.iterrows():
        if index % 100 == 0:
            print("      -", str(index), "/", str(len(all_tweets_df)))
        generic_matches = []
        specific_matches = []
        # parse the stringified list of matches back into a Python list
        row.matches = row.matches.strip("''][").split("', '")
        for match in row.matches:
            generic_check = keyword_checker(match, generic_list)
            generic_matches.append(generic_check)
            specific_check = keyword_checker(match, specific_list)
            specific_matches.append(specific_check)
            if generic_check or specific_check:
                year_freq_update(match, str(row.year))
                month_year_freq_update(match, str(row.month), str(row.year))
        if True in generic_matches:
            if True in specific_matches:
                for match in row.matches:
                    generic_specific_freq_update(match, "generic_specific")
            else:
                for match in row.matches:
                    generic_specific_freq_update(match, "generic")
        else:
            for match in row.matches:
                generic_specific_freq_update(match, "specific")
        freq_matrix_update(row.matches)

    # store the frequency dataframes
    storage_path = ds.output_data + "statistics/"
    helpers.path_checker(storage_path)
    # Store year frequency
    helpers.dataframe_to_csv(year_freq_df, storage_path + "year_frequency.csv")
    created_files.append(storage_path + "year_frequency.csv")
    # Store month year frequency
    helpers.dataframe_to_csv(month_year_freq_df,
                             storage_path + "month_year_frequency.csv")
    created_files.append(storage_path + "month_year_frequency.csv")
    # Store generic specific frequency
    helpers.dataframe_to_csv(generic_specific_freq_df,
                             storage_path + "generic_specific_frequency.csv")
    created_files.append(storage_path + "generic_specific_frequency.csv")
    # Store frequency matrix
    helpers.dataframe_to_csv(freq_matrix,
                             storage_path + "frequency_matrix.csv")
    created_files.append(storage_path + "frequency_matrix.csv")
    month_year_freq_output_files = preprocess_month_year_graph(
        month_year_freq_df)
    year_freq_output_files = preprocess_year_graph(year_freq_df)
    frequency_total_output_files = preprocess_frequency_total_graph(
        generic_specific_freq_df)
    for file in month_year_freq_output_files:
        created_files.append(file)
    for file in year_freq_output_files:
        created_files.append(file)
    for file in frequency_total_output_files:
        created_files.append(file)
    return created_files
Example #11
def store_dataset(df):
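    """Write the labelled dataframe to sentiwordnet_labelled.csv inside
    the output directory."""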
    helpers.path_checker(ds.output_data)
    helpers.dataframe_to_csv(df, ds.output_data + "sentiwordnet_labelled.csv")
def processing():
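    """Classify each tweet's keyword matches as generic or specific using
    the tagged keyword list and write the results to per-dataset files.
    """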
    tagged_keywords = load_tagged_keywords()
    match_files = find_match_files()
    files_created = []
    for dataset in match_files:
        for file in match_files[dataset]:
            print("      - " + file)
            new_store = {}
            generic_matches = {}
            specific_matches = {}
            matches_df = load_match_file(file)
            for match_index, match_row in matches_df.iterrows():
                generic_counter = 0
                specific_counter = 0
                tempstore = []
                match_row.matches = match_row.matches.strip("''][").split(
                    "', '")
                match_row.matches = remove_duplicates(match_row.matches)
                matches_df.at[match_index, "matches"] = match_row.matches
                for keyword_index, keyword_row in tagged_keywords.iterrows():
                    if comparison(match_row.matches, keyword_row['split']):
                        if keyword_row.tag == "generic":
                            generic_counter += 1
                        if keyword_row.tag == "specific":
                            specific_counter += 1
                        tempstore.append(keyword_row.term)

                if specific_counter != 0:
                    specific_matches[match_row.tweet_id] = tempstore
                elif generic_counter != 0:
                    generic_matches[match_row.tweet_id] = tempstore
            generic_data_list = []
            specific_data_list = []

            for tweet_id in generic_matches:
                generic_data_list.append([tweet_id, generic_matches[tweet_id]])
            for tweet_id in specific_matches:
                specific_data_list.append(
                    [tweet_id, specific_matches[tweet_id]])

            generic_file_path = ds.output_data + "actual_keyword_matches/generic/" + file.split(
                "/")[-2] + "/"
            helpers.path_checker(generic_file_path)
            generic_file_name = generic_file_path + file.split("/")[-1]
            helpers.data_to_file_two_values(generic_data_list,
                                            '"tweet_id","matches"',
                                            generic_file_name)
            files_created.append(generic_file_name)

            specific_file_path = ds.output_data + "actual_keyword_matches/specific/" + file.split(
                "/")[-2] + "/"
            helpers.path_checker(specific_file_path)
            specific_file_name = specific_file_path + file.split("/")[-1]
            helpers.data_to_file_two_values(specific_data_list,
                                            '"tweet_id","matches"',
                                            specific_file_name)
            files_created.append(specific_file_name)
            file_path = ds.output_data + "single_keyword_matches_dup_removed/" + file.split(
                "/")[-2] + "/"
            helpers.path_checker(file_path)
            file_name = file_path + file.split("/")[-1]
            helpers.dataframe_to_csv(matches_df, file_name)
            files_created.append(file_name)
    return files_created