コード例 #1
0
def format_data(raw_result_file, team_file, team):
    """
    Add a shortened opponent column and a reformatted date column to a team's
    season results, then save the results as a csv file.

    Two columns are added to raw_result_file IN PLACE (the caller's DataFrame
    is modified):
    - "Opponent Shortened": the last whitespace-separated word of each entry in
      the "Opponent" column.
    - "New Date": the value returned by _change_date() for each entry in the
      "Date" column (MM/DD/YY according to the original documentation).

    The frame is then written, without its index, to
    Teams/<team>/csv_data/2019-2020_scores.csv under the hard-coded research
    directory. Nothing is returned.

    Rows are read by position rather than by index label, so the frame does not
    need to carry a default 0..n-1 index (e.g. it may have been filtered).

    Parameter raw_result_file: the season data for the team in the parameter.
    It contains information about the games the team played, the scores, and
    the result.
    Precondition: must be a pandas DataFrame accepted by
    assertions.assert_raw_result_file_format and contain the columns
    "Opponent" and "Date".

    Parameter team_file: a DataFrame that contains the names of the teams in
    the NBA. It is only validated here; its contents are not otherwise used.
    Precondition: must be accepted by assertions.assert_team_file_format.

    Parameter team: the team that the file raw_result_file is about. It is
    used as a directory name in the output path.
    Precondition: must be a string.
    """
    assertions.assert_raw_result_file_format(raw_result_file)
    assertions.assert_team_file_format(team_file)
    assertions.assert_team(team)

    shortened_opponents = []
    new_dates = []
    # Iterating the columns walks the values positionally; looking rows up
    # with series[i] would treat i as an index label instead.
    for opponent, game_date in zip(raw_result_file["Opponent"],
                                   raw_result_file["Date"]):
        shortened_opponents.append(opponent.split()[-1])
        new_dates.append(_change_date(game_date))

    raw_result_file["Opponent Shortened"] = shortened_opponents
    raw_result_file["New Date"] = new_dates
    raw_result_file.to_csv(r'/home/sebastianguo/Documents/Research/Teams/' +
                           team + '/csv_data/2019-2020_scores.csv',
                           index=False)
コード例 #2
0
def _assertion_win_or_lose(game_thread_info_file, result_file, team_str,
                           global_ID, team):
    """
    Validate the arguments of win_or_lose().

    Every argument is passed to its checker from the assertions module, in the
    order listed in the table below. Nothing is caught here, so whatever a
    checker raises propagates to the caller.
    """
    checks = (
        (assertions.assert_game_thread_info_file_format,
         game_thread_info_file),
        (assertions.assert_season_result_file_format, result_file),
        (assertions.assert_team, team_str),
        (assertions.assert_global_ID, global_ID),
        (assertions.assert_team, team),
    )
    for check, argument in checks:
        check(argument)
コード例 #3
0
def _assertion_roster_mentions(cmt_data_list, roster_file, roster_list, team):
    """
    Validate the arguments of roster_mentions().

    The checkers from the assertions module run in the order listed in the
    table below; anything they raise propagates to the caller.
    """
    checks = (
        (assertions.assert_cmt_data_list, cmt_data_list),
        (assertions.assert_roster_file_format, roster_file),
        (assertions.assert_str_list, roster_list),
        (assertions.assert_team, team),
    )
    for check, argument in checks:
        check(argument)
コード例 #4
0
def _assertion_mgmt_cmt_sent(classifier, cmt_lvl_rost_ment_reader, global_ID,
                             roster_list, mgmt_list, team):
    """
    Validate the arguments of manager_cmt_sentiment().

    Each row of the table below pairs a checker from the assertions module
    with the positional arguments it receives. The checkers run top to bottom
    and anything they raise propagates to the caller. The comment-level
    mention file is checked together with roster_list.
    """
    checks = (
        (assertions.assert_classifier, (classifier,)),
        (assertions.assert_cmt_lvl_ment_file_format,
         (cmt_lvl_rost_ment_reader, roster_list)),
        (assertions.assert_global_ID, (global_ID,)),
        (assertions.assert_str_list, (roster_list,)),
        (assertions.assert_str_list, (mgmt_list,)),
        (assertions.assert_team, (team,)),
    )
    for check, arguments in checks:
        check(*arguments)
コード例 #5
0
def _assertion_comment_roster_glob(cmt_lvl_ment_file, roster_list, global_ID,
                                   team):
    """
    Validate a comment-level mention file, a roster list, a global ID and a
    team name.

    NOTE(review): this helper's name points at comment_roster_glob(), but it
    has been documented as serving coach_mentions_glob() -- confirm which
    function actually calls it.

    Each row of the table below pairs a checker from the assertions module
    with the positional arguments it receives. The checkers run top to bottom
    and anything they raise propagates to the caller.
    """
    checks = (
        (assertions.assert_cmt_lvl_ment_file_format,
         (cmt_lvl_ment_file, roster_list)),
        (assertions.assert_str_list, (roster_list,)),
        (assertions.assert_global_ID, (global_ID,)),
        (assertions.assert_team, (team,)),
    )
    for check, arguments in checks:
        check(*arguments)
コード例 #6
0
def compare_files(machine_code_file, ground_truth_file, roster_list, team):
    """
    Compare machine-coded player mentions against a hand-coded ground truth
    and save the resulting accuracy figures as a csv file.

    A matrix is built from each file with _create_matrix(). The two matrices
    are combined in two ways:
    - sum (machine + ground truth): assuming the matrices hold 0/1 mention
      flags, an entry of 2 marks a true positive.
    - difference (ground truth - machine): under the same assumption, -1 marks
      a false positive (the machine code has the mention, the ground truth
      does not) and +1 marks a false negative (the ground truth has the
      mention, the machine code does not).

    _add_values() then fills a table whose "Calculations" column lists true
    positives, false positives, false negatives, precision, recall and the
    number of comments. According to the original documentation, a precision
    or recall of "null" means the value could not be calculated because of a
    division by zero. The table is written, without its index, to
    Teams/<team>/precision_and_recall.csv under the hard-coded research
    directory. Nothing is returned.

    Parameter machine_code_file: individual comments with marks for whether or
    not each comment contains a mention of a player.
    Precondition: must be a DataFrame object from the pandas module.

    Parameter ground_truth_file: the hand-coded counterpart of
    machine_code_file.
    Precondition: must be a DataFrame object from the pandas module that
    passes assertions.assert_compare_machine_hand together with
    machine_code_file.

    Parameter roster_list: the players to compare the two files on.
    Precondition: roster_list must be a list with string entries.

    Parameter team: the basketball team the code is run on. It is used as a
    directory name in the output path.
    Precondition: team is of type string
    """
    assertions.assert_compare_machine_hand(machine_code_file,
                                           ground_truth_file)
    assertions.assert_str_list(roster_list)
    assertions.assert_team(team)

    num_rows = len(machine_code_file)

    def build_matrix(frame):
        # Both matrices start from an empty (0, num_rows) integer array.
        seed = numpy.empty((0, num_rows), int)
        return _create_matrix(seed, frame, roster_list)

    machine_matrix = build_matrix(machine_code_file)
    truth_matrix = build_matrix(ground_truth_file)
    difference = numpy.subtract(truth_matrix, machine_matrix)
    total = numpy.add(machine_matrix, truth_matrix)

    row_labels = [
        "True Positives", "False Positives", "False Negatives",
        "Precision", "Recall", "Num. of comments"
    ]
    summary = {"Calculations": row_labels, "Total": []}
    # _add_values fills the summary dictionary in place.
    _add_values(summary, total, difference, roster_list)

    out_path = (r'/home/sebastianguo/Documents/Research/Teams/' + team +
                '/precision_and_recall.csv')
    pandas.DataFrame(summary).to_csv(out_path, index=False)
コード例 #7
0
def _assertion_coach_mentions_glob(global_ID, agg_rost_ment_file, roster_file,
                                   mgmt_list, sent_dict, result, team):
    """
    Validate the arguments of coach_mentions_glob().

    Each row of the table below pairs a checker from the assertions module
    with the positional arguments it receives. The checkers run top to bottom
    and anything they raise propagates to the caller. The aggregate mention
    file is checked together with roster_file. sent_dict is accepted but is
    not checked here.
    """
    checks = (
        (assertions.assert_global_ID, (global_ID,)),
        (assertions.assert_agg_roster_ment_file_format,
         (agg_rost_ment_file, roster_file)),
        (assertions.assert_roster_file_format, (roster_file,)),
        (assertions.assert_str_list, (mgmt_list,)),
        (assertions.assert_result, (result,)),
        (assertions.assert_team, (team,)),
    )
    for check, arguments in checks:
        check(*arguments)
コード例 #8
0
def _assertion_roster_mentions_glob(global_ID, rost_ment_file, roster_file,
                                    roster_list, team):
    """
    Validate the arguments of roster_mentions_glob().

    The checkers from the assertions module run in the order listed in the
    table below; anything they raise propagates to the caller.
    """
    checks = (
        (assertions.assert_global_ID, global_ID),
        (assertions.assert_roster_ment_by_game_file_format, rost_ment_file),
        (assertions.assert_roster_file_format, roster_file),
        (assertions.assert_str_list, roster_list),
        (assertions.assert_team, team),
    )
    for check, argument in checks:
        check(argument)
コード例 #9
0
def _assertion_comment_roster(raw_data_file, roster_file, roster_list,
                              cmt_data_list, team):
    """
    Validate the arguments of comment_roster().

    The checkers from the assertions module run in the order listed in the
    table below; anything they raise propagates to the caller.
    """
    checks = (
        (assertions.assert_raw_data_file_format, raw_data_file),
        (assertions.assert_roster_file_format, roster_file),
        (assertions.assert_str_list, roster_list),
        (assertions.assert_cmt_data_list, cmt_data_list),
        (assertions.assert_team, team),
    )
    for check, argument in checks:
        check(argument)
コード例 #10
0
def make_team_str(team_file, team):
    """
    Return a regular expression alternation ("A|B|C") of every entry in the
    "Team" column of team_file except the one equal to team. According to the
    original documentation this regex is used in function win_or_lose().

    The names are joined in the order they appear in the column. If no name
    remains, the empty string is returned. The names are inserted as-is and
    are not regex-escaped.

    The column is read by position rather than by index label, so team_file
    does not need to carry a default 0..n-1 index.

    Parameter team_file: a file containing all 30 basketball teams in the NBA.
    Precondition: must be a pandas DataFrame containing the header "Team".

    Parameter team: the team that the regex skips.
    Precondition: must be a string.
    """
    assertions.assert_team_file_format(team_file)
    assertions.assert_team(team)
    # Iterating the column yields its values in row order; joining avoids
    # building the string piecewise and trimming a trailing "|".
    other_teams = [name for name in team_file["Team"] if name != team]
    return "|".join(other_teams)
コード例 #11
0
def create_data_frame(global_ID, cmt_data_list, team):
    """
    Collect the entries of cmt_data_list that belong to one global ID into a
    DataFrame with the columns global_ID, local_ID, name and category, and
    save it as a csv file. Every row of the saved file carries the same
    global ID.

    The file is written, without its index, to
    Teams/<team>/roster_mentions_by_game/<global_ID>.csv under the hard-coded
    research directory. Nothing is returned.

    Parameter global_ID: the global ID corresponding to a Reddit thread. Only
    entries of cmt_data_list whose first element equals this ID are kept.
    Precondition: must be an integer greater than zero.

    Parameter cmt_data_list: a two-dimensional list whose entries have one of
    these two forms:
    1) [global ID, local ID, name, category]
    2) [global ID, local ID] - if the comment corresponding to a global/local
    ID has no named entities. Such an entry is saved with an empty name and an
    empty category.
    Entries of any other length are ignored.

    Parameter team: the basketball team the code is running on. It is used as
    a directory name in the output path.
    Precondition: team is a string
    """
    assertions.assert_global_ID(global_ID)
    assertions.assert_cmt_data_list(cmt_data_list)
    assertions.assert_team(team)

    columns = {"global_ID": [], "local_ID": [], "name": [], "category": []}
    for entry in cmt_data_list:
        size = len(entry)
        # Check the length first so entry[0] is only read from entries that
        # have one of the two supported shapes.
        if size not in (2, 4):
            continue
        if not (global_ID == entry[0]):
            continue
        if size == 4:
            name, category = entry[2], entry[3]
        else:
            # No named entity for this comment: keep the IDs, blank the rest.
            name, category = "", ""
        columns["global_ID"].append(global_ID)
        columns["local_ID"].append(entry[1])
        columns["name"].append(name)
        columns["category"].append(category)

    out_path = (r'/home/sebastianguo/Documents/Research/Teams/' + team +
                '/roster_mentions_by_game/' + str(global_ID) + ".csv")
    pandas.DataFrame(columns).to_csv(out_path, index=False)
コード例 #12
0
def _assertion_calc_mgmt_stats(global_ID_list, mgmt_list, roster_file, team):
    """
    Validate the arguments of calc_mgmt_stats().

    The checkers from the assertions module run in the order listed in the
    table below; anything they raise propagates to the caller.
    """
    checks = (
        (assertions.assert_int_list, global_ID_list),
        (assertions.assert_str_list, mgmt_list),
        (assertions.assert_roster_file_format, roster_file),
        (assertions.assert_team, team),
    )
    for check, argument in checks:
        check(argument)
コード例 #13
0
def extract_col_data(raw_data_file, roster_file, word_file_reader, team):
    """
    Returns the two dimensional list that _extract_entities() fills while
    walking the comments of raw_data_file. According to the original
    documentation each inner list corresponds to a named entity and its
    category (person, place, nickname) with its associated global and local
    ID.

    Rows are processed in file order and read by position, so raw_data_file
    does not need to carry a default 0..n-1 index. For every row:
    - If its (global_ID, local_ID) pair has been seen on an earlier row, the
      row is skipped. A pair counts as seen even when that earlier row had a
      missing comment.
    - If its comment is missing (null), the row is skipped.
    - Otherwise the comment is cast to a string, every stop word for the team
      (and its string.capwords() form) is removed from it, and the cleaned
      comment is handed to _extract_entities().

    Stop words are removed so that words containing a player's name, nickname
    or first/last name as a substring are not mistaken for a mention by the
    regex-based matching.

    If cleaning or entity extraction fails for any comment, an Exception with
    the message "Failed to create 2D list of named entities." is raised, with
    the underlying error attached as its cause.

    Parameter raw_data_file: the data that you want to extract the named
    entities from.
    Precondition: must be a DataFrame object created from the pandas module
    with headers global_ID, local_ID and comment. The terms in the global and
    local ID columns must be integers.

    Parameter roster_file: the roster used to build the name and nickname
    patterns to check for in the comments.
    Precondition: must be a DataFrame object created from the pandas module
    and contain the headers Player, First, Last, Nicknames, First Short and
    Last Short.

    Parameter word_file_reader: the stop words to remove from the comments.
    Precondition: must be a DataFrame object created from the pandas module
    with a column named after team.

    Parameter team: the basketball team the code is running on.
    Precondition: must be a string.
    """
    assertions.assert_raw_data_file_format(raw_data_file)
    assertions.assert_roster_file_format(roster_file)
    assertions.assert_team(team)
    assertions.assert_word_removal_file_format(word_file_reader, team)
    glob_ID = raw_data_file["global_ID"]
    loc_ID = raw_data_file["local_ID"]
    comm = raw_data_file["comment"]
    cmt_data_list = []
    # For column "Player", "Nicknames", "First", and "Last", include potential
    # substrings in this list that could be mistaken for names.
    stop_words = word_file_reader[team].tolist()
    # Every comment is unique in its glob/loc ID. Remember the pairs already
    # met (in a set, for constant-time lookups) to skip duplicate comments.
    seen_ids = set()
    # Build the roster-derived inputs once, outside the loop.
    short_f = _create_list(roster_file, "First Short")
    short_l = _create_list(roster_file, "Last Short")
    name_str = _make_name_str(roster_file["Player"], roster_file["First"],
                              roster_file["Last"])
    nickname_str = _make_nickname_str(_create_list(roster_file, "Nicknames"))
    for row in range(raw_data_file.shape[0]):
        # .iloc reads by position; plain [] would treat row as an index label.
        global_id = glob_ID.iloc[row]
        local_id = loc_ID.iloc[row]
        id_pair = (global_id, local_id)
        if id_pair in seen_ids:
            continue
        # Mark the pair as seen before the null check: a missing comment still
        # blocks later rows that repeat the same IDs.
        seen_ids.add(id_pair)
        comment = comm.iloc[row]
        if pandas.isnull(comment):
            continue
        try:
            # Comments may be of any type (e.g. a purely numeric comment is
            # parsed as a number); cast so the string clean-up below works.
            new_comm = str(comment)
            for word in stop_words:
                new_comm = new_comm.replace(word, "")
                new_comm = new_comm.replace(string.capwords(word), "")
            _extract_entities(cmt_data_list, global_id, local_id, new_comm,
                              name_str, nickname_str, short_f, short_l)
        except Exception as err:
            # Keep the generic Exception callers may already catch, but chain
            # the real cause and let KeyboardInterrupt/SystemExit through.
            raise Exception(
                "Failed to create 2D list of named entities.") from err
    return cmt_data_list