def format_data(raw_result_file, team_file, team):
    """
    Add a shortened opponent name and a reformatted date to a team's season
    results and save the table as a csv file.

    The data from the website has dates in the form "Sun 10, 2019"; each date
    is passed through _change_date, which is meant to produce MM/DD/YY. Two
    columns are added to raw_result_file in place:
        "Opponent Shortened": the last whitespace-separated word of "Opponent"
        "New Date": the converted value of "Date"
    The updated table is then written to the team's
    csv_data/2019-2020_scores.csv file.

    Parameter raw_result_file: the season results for the given team (games
    played, scores, results).
    Precondition: must be a DataFrame from the pandas module with the columns
    "Date", "G" and "Opponent".

    Parameter team_file: a table with the names of the teams in the NBA. It is
    only validated here; it is not otherwise read.
    Precondition: must satisfy assertions.assert_team_file_format.

    Parameter team: the team that raw_result_file is about.
    Precondition: must be a string.
    """
    assertions.assert_raw_result_file_format(raw_result_file)
    assertions.assert_team_file_format(team_file)
    assertions.assert_team(team)

    date = raw_result_file["Date"]
    game_count = len(raw_result_file["G"])

    # One pass over the games, producing (short opponent, new date) pairs.
    converted = [
        (raw_result_file["Opponent"][row].split()[-1], _change_date(date[row]))
        for row in range(game_count)
    ]

    raw_result_file["Opponent Shortened"] = [short for short, _ in converted]
    raw_result_file["New Date"] = [new_date for _, new_date in converted]

    output_path = (r'/home/sebastianguo/Documents/Research/Teams/' + team
                   + '/csv_data/2019-2020_scores.csv')
    raw_result_file.to_csv(output_path, index=False)
def _assertion_win_or_lose(game_thread_info_file, result_file, team_str,
                           global_ID, team):
    """
    Check the preconditions of win_or_lose().

    Every argument is handed to its checker in the assertions module, in the
    order listed below; what happens on a failed check is defined there.

    Parameter game_thread_info_file: checked with
    assert_game_thread_info_file_format.

    Parameter result_file: checked with assert_season_result_file_format.

    Parameter team_str: checked with assert_team (the same check as team).

    Parameter global_ID: checked with assert_global_ID.

    Parameter team: checked with assert_team.
    """
    # Input files first.
    assertions.assert_game_thread_info_file_format(game_thread_info_file)
    assertions.assert_season_result_file_format(result_file)

    # Then the scalar arguments.
    assertions.assert_team(team_str)
    assertions.assert_global_ID(global_ID)
    assertions.assert_team(team)
def _assertion_roster_mentions(cmt_data_list, roster_file, roster_list, team):
    """
    Check the preconditions of roster_mentions().

    Every argument is handed to its checker in the assertions module, in the
    order listed below; what happens on a failed check is defined there.

    Parameter cmt_data_list: checked with assert_cmt_data_list.

    Parameter roster_file: checked with assert_roster_file_format.

    Parameter roster_list: checked with assert_str_list.

    Parameter team: checked with assert_team.
    """
    assertions.assert_cmt_data_list(cmt_data_list)
    assertions.assert_roster_file_format(roster_file)
    assertions.assert_str_list(roster_list)
    assertions.assert_team(team)
def _assertion_mgmt_cmt_sent(classifier, cmt_lvl_rost_ment_reader, global_ID,
                             roster_list, mgmt_list, team):
    """
    Check the preconditions of manager_cmt_sentiment().

    Every argument is handed to its checker in the assertions module, in the
    order listed below; what happens on a failed check is defined there.

    Parameter classifier: checked with assert_classifier.

    Parameter cmt_lvl_rost_ment_reader: checked, together with roster_list,
    by assert_cmt_lvl_ment_file_format.

    Parameter global_ID: checked with assert_global_ID.

    Parameter roster_list: checked with assert_str_list. Note that it is
    already used by the file-format check before this check runs.

    Parameter mgmt_list: checked with assert_str_list.

    Parameter team: checked with assert_team.
    """
    assertions.assert_classifier(classifier)
    assertions.assert_cmt_lvl_ment_file_format(cmt_lvl_rost_ment_reader,
                                               roster_list)
    assertions.assert_global_ID(global_ID)

    # Both name lists go through the same string-list check.
    assertions.assert_str_list(roster_list)
    assertions.assert_str_list(mgmt_list)

    assertions.assert_team(team)
def _assertion_comment_roster_glob(cmt_lvl_ment_file, roster_list, global_ID,
                                   team):
    """
    Check the preconditions of a comment-level roster function.

    NOTE(review): the previous docstring named coach_mentions_glob() as the
    function being guarded; going by this helper's name it is presumably
    comment_roster_glob() instead. Confirm against the caller.

    Every argument is handed to its checker in the assertions module, in the
    order listed below; what happens on a failed check is defined there.

    Parameter cmt_lvl_ment_file: checked, together with roster_list, by
    assert_cmt_lvl_ment_file_format.

    Parameter roster_list: checked with assert_str_list.

    Parameter global_ID: checked with assert_global_ID.

    Parameter team: checked with assert_team.
    """
    assertions.assert_cmt_lvl_ment_file_format(cmt_lvl_ment_file, roster_list)
    assertions.assert_str_list(roster_list)
    assertions.assert_global_ID(global_ID)
    assertions.assert_team(team)
def compare_files(machine_code_file, ground_truth_file, roster_list, team):
    """
    Compare machine-coded player mentions against a ground-truth coding and
    save precision/recall statistics as a csv file.

    Both files are turned into matrices with _create_matrix. Two combinations
    of those matrices are then computed:
        sum (machine + truth): an entry of 2 is a true positive.
        difference (truth - machine): an entry of -1 is a false positive (the
        machine code has the mention, the ground truth does not) and an entry
        of +1 is a false negative (the ground truth has the mention, the
        machine code does not).
    _add_values fills a table from those matrices; its rows are labelled
    "True Positives", "False Positives", "False Negatives", "Precision",
    "Recall" and "Num. of comments", and it starts with an empty "Total"
    column. According to the original notes, a precision/recall of "null"
    means the value could not be calculated because of a division by zero.
    The table is written to the team's precision_and_recall.csv file.

    Parameter machine_code_file: individual comments with marks for whether
    each comment mentions each player, as produced by the machine coding.
    Precondition: must be a DataFrame object from the pandas module.

    Parameter ground_truth_file: the same comments coded by hand.
    Precondition: must be a DataFrame object from the pandas module. The
    columns global_ID, local_ID and comment must match machine_code_file and
    it must contain the same player headers.

    Parameter roster_list: the players whose mention columns are compared.
    Precondition: must be a list with string entries.

    Parameter team: the basketball team the code is run on.
    Precondition: must be a string.
    """
    assertions.assert_compare_machine_hand(machine_code_file,
                                           ground_truth_file)
    assertions.assert_str_list(roster_list)
    assertions.assert_team(team)

    # Both seed arrays are sized from the machine-coded file.
    comment_count = len(machine_code_file)
    machine_matrix = _create_matrix(
        numpy.empty((0, comment_count), int), machine_code_file, roster_list
    )
    truth_matrix = _create_matrix(
        numpy.empty((0, comment_count), int), ground_truth_file, roster_list
    )

    # truth - machine: -1 is a false positive, +1 is a false negative.
    difference = numpy.subtract(truth_matrix, machine_matrix)
    # machine + truth: 2 is a true positive.
    combined = numpy.add(machine_matrix, truth_matrix)

    row_labels = [
        "True Positives",
        "False Positives",
        "False Negatives",
        "Precision",
        "Recall",
        "Num. of comments",
    ]
    stats = {"Calculations": row_labels, "Total": []}
    _add_values(stats, combined, difference, roster_list)

    output_path = (r'/home/sebastianguo/Documents/Research/Teams/' + team
                   + '/precision_and_recall.csv')
    pandas.DataFrame(stats).to_csv(output_path, index=False)
def _assertion_coach_mentions_glob(global_ID, agg_rost_ment_file, roster_file,
                                   mgmt_list, sent_dict, result, team):
    """
    Check the preconditions of coach_mentions_glob().

    Every checked argument is handed to its checker in the assertions module,
    in the order listed below; what happens on a failed check is defined
    there.

    Parameter global_ID: checked with assert_global_ID.

    Parameter agg_rost_ment_file: checked, together with roster_file, by
    assert_agg_roster_ment_file_format.

    Parameter roster_file: checked with assert_roster_file_format.

    Parameter mgmt_list: checked with assert_str_list.

    Parameter sent_dict: accepted but not checked by this helper.

    Parameter result: checked with assert_result.

    Parameter team: checked with assert_team.
    """
    assertions.assert_global_ID(global_ID)
    assertions.assert_agg_roster_ment_file_format(agg_rost_ment_file,
                                                  roster_file)
    assertions.assert_roster_file_format(roster_file)
    assertions.assert_str_list(mgmt_list)
    # NOTE(review): sent_dict has no check here -- confirm whether one is
    # missing or whether it is validated elsewhere.
    assertions.assert_result(result)
    assertions.assert_team(team)
def _assertion_roster_mentions_glob(global_ID, rost_ment_file, roster_file,
                                    roster_list, team):
    """
    Check the preconditions of roster_mentions_glob().

    Every argument is handed to its checker in the assertions module, in the
    order listed below; what happens on a failed check is defined there.

    Parameter global_ID: checked with assert_global_ID.

    Parameter rost_ment_file: checked with
    assert_roster_ment_by_game_file_format.

    Parameter roster_file: checked with assert_roster_file_format.

    Parameter roster_list: checked with assert_str_list.

    Parameter team: checked with assert_team.
    """
    assertions.assert_global_ID(global_ID)

    # The two input files.
    assertions.assert_roster_ment_by_game_file_format(rost_ment_file)
    assertions.assert_roster_file_format(roster_file)

    assertions.assert_str_list(roster_list)
    assertions.assert_team(team)
def _assertion_comment_roster(raw_data_file, roster_file, roster_list,
                              cmt_data_list, team):
    """
    Check the preconditions of comment_roster().

    Every argument is handed to its checker in the assertions module, in the
    order listed below; what happens on a failed check is defined there.

    Parameter raw_data_file: checked with assert_raw_data_file_format.

    Parameter roster_file: checked with assert_roster_file_format.

    Parameter roster_list: checked with assert_str_list.

    Parameter cmt_data_list: checked with assert_cmt_data_list.

    Parameter team: checked with assert_team.
    """
    # The two input files.
    assertions.assert_raw_data_file_format(raw_data_file)
    assertions.assert_roster_file_format(roster_file)

    # The two lists.
    assertions.assert_str_list(roster_list)
    assertions.assert_cmt_data_list(cmt_data_list)

    assertions.assert_team(team)
def make_team_str(team_file, team):
    """
    Returns a regular expression string of alternatives ("A|B|C") holding
    every entry of the "Team" column except the team passed in.

    The entries are joined with "|" in file order and are not escaped. If no
    entry is left, the empty string is returned. This regex is used in
    function win_or_lose().

    Parameter team_file: a table containing all 30 basketball teams in the
    NBA.
    Precondition: must contain the header "Team".

    Parameter team: the team that the regex skips.
    Precondition: must be a string.
    """
    assertions.assert_team_file_format(team_file)
    assertions.assert_team(team)

    teams = team_file["Team"]
    other_teams = [
        teams[row]
        for row in range(team_file.shape[0])
        if teams[row] != team
    ]
    return "|".join(other_teams)
def create_data_frame(global_ID, cmt_data_list, team):
    """
    Writes a csv file with the named entities of one Reddit thread.

    A DataFrame with the four columns global_ID, local_ID, name and category
    is built from the entries of cmt_data_list whose global ID equals
    global_ID, so every row carries the same global ID. The table is saved in
    the team's roster_mentions_by_game folder as "<global_ID>.csv". Nothing is
    returned.

    Parameter global_ID: the global ID corresponding to a Reddit thread. Only
    the data for this global ID is extracted from cmt_data_list.
    Precondition: must be an integer greater than zero.

    Parameter cmt_data_list: a two-dimensional list whose entries have one of
    these two forms:
        1) [global ID, local ID, name, category]
        2) [global ID, local ID] - if the comment corresponding to the
           global/local ID has no named entities.
    Entries of any other length are skipped.

    Parameter team: the basketball team the code is running on.
    Precondition: must be a string.
    """
    assertions.assert_global_ID(global_ID)
    assertions.assert_cmt_data_list(cmt_data_list)
    assertions.assert_team(team)

    columns = ("global_ID", "local_ID", "name", "category")
    table = {column: [] for column in columns}

    for entry in cmt_data_list:
        entry_len = len(entry)
        # The length is tested first so entry[0] is only read from entries
        # that have one of the two supported shapes.
        if entry_len not in (2, 4) or not (global_ID == entry[0]):
            continue

        # A two-item entry has no named entity: keep the IDs and leave the
        # name and category blank.
        if entry_len == 4:
            name, category = entry[2], entry[3]
        else:
            name, category = "", ""

        row = (global_ID, entry[1], name, category)
        for column, value in zip(columns, row):
            table[column].append(value)

    output_path = (r'/home/sebastianguo/Documents/Research/Teams/' + team
                   + '/roster_mentions_by_game/' + str(global_ID) + ".csv")
    pandas.DataFrame(table).to_csv(output_path, index=False)
def _assertion_calc_mgmt_stats(global_ID_list, mgmt_list, roster_file, team):
    """
    Check the preconditions of calc_mgmt_stats().

    Every argument is handed to its checker in the assertions module, in the
    order listed below; what happens on a failed check is defined there.

    Parameter global_ID_list: checked with assert_int_list.

    Parameter mgmt_list: checked with assert_str_list.

    Parameter roster_file: checked with assert_roster_file_format.

    Parameter team: checked with assert_team.
    """
    # The two lists.
    assertions.assert_int_list(global_ID_list)
    assertions.assert_str_list(mgmt_list)

    assertions.assert_roster_file_format(roster_file)
    assertions.assert_team(team)
def extract_col_data(raw_data_file, roster_file, word_file_reader, team):
    """
    Returns a two dimensional list of the named entities found in the
    comments of raw_data_file.

    Each inner list is produced by _extract_entities and corresponds to a
    named entity and its category (person, place, nickname) together with the
    global and local ID of the comment it came from. All global_IDs and
    local_IDs must be integers. Comments can be of any type; they are cast to
    strings before processing. The function also checks if the comments have
    nicknames as named entities.

    Rows with a null comment are skipped. Every comment is unique in its
    global/local ID pair, so a pair that has already been processed is
    skipped as well. The order of the columns in raw_data_file does not
    matter because they are looked up by header.

    Since regexes are used to find full names, nicknames and first/last
    names, words that contain these names as substrings (the team's stop
    words) are removed from each comment before extraction, both as written
    and in their string.capwords form. For shortened first/last names a
    different method is used, so substrings are not relevant for those.

    Parameter raw_data_file: the table with the comments to extract the data
    from.
    Precondition: must be a DataFrame object created from the pandas module
    with the headers global_ID, local_ID and comment. The terms in the global
    and local ID columns must be integers.

    Parameter roster_file: the table containing the names and nicknames to
    check for in the comments.
    Precondition: must be a DataFrame object created from the pandas module
    with the headers "Player", "First", "Last", "Nicknames", "First Short"
    and "Last Short".

    Parameter word_file_reader: the table of words to strip from the comments
    before extraction.
    Precondition: must be a DataFrame object with a column named after team
    whose entries are strings.

    Parameter team: the basketball team the code is running on.
    Precondition: must be a string.

    Raises: Exception if the stop-word removal or the entity extraction fails
    for any comment. The original error is attached as the cause.
    """
    assertions.assert_raw_data_file_format(raw_data_file)
    assertions.assert_roster_file_format(roster_file)
    assertions.assert_team(team)
    assertions.assert_word_removal_file_format(word_file_reader, team)

    glob_ID = raw_data_file["global_ID"]
    loc_ID = raw_data_file["local_ID"]
    comm = raw_data_file["comment"]
    cmt_data_list = []

    # For column "Player", "Nicknames", "First", and "Last", these are the
    # potential substrings that could be mistaken for names.
    stop_words = word_file_reader[team].tolist()

    # Every comment is unique in its glob/loc ID. Remember the processed
    # pairs in a set (constant-time lookup) to skip duplicate comments.
    seen_ids = set()

    # Build the roster lookups once, outside the loop, to prevent redundancy.
    short_f = _create_list(roster_file, "First Short")
    short_l = _create_list(roster_file, "Last Short")
    name_str = _make_name_str(roster_file["Player"], roster_file["First"],
                              roster_file["Last"])
    nickname_str = _make_nickname_str(_create_list(roster_file, "Nicknames"))

    for index in range(raw_data_file.shape[0]):
        comment_key = (glob_ID[index], loc_ID[index])
        if comment_key in seen_ids or pandas.isnull(comm[index]):
            continue

        # Non-string comments (e.g. a comment that is only a number) are cast
        # so the string operations below apply to them too.
        new_comm = str(comm[index])
        try:
            for word in stop_words:
                new_comm = new_comm.replace(word, "")
                new_comm = new_comm.replace(string.capwords(word), "")
            _extract_entities(cmt_data_list, glob_ID[index], loc_ID[index],
                              new_comm, name_str, nickname_str, short_f,
                              short_l)
        except Exception as err:
            # Chain the cause so the real failure stays visible; interrupts
            # and interpreter exits are no longer swallowed.
            raise Exception(
                "Failed to create 2D list of named entities.") from err

        # A pair is only recorded once its comment has been processed, so a
        # null row does not block a later row with the same IDs.
        seen_ids.add(comment_key)

    return cmt_data_list