Example #1
def EmolexRes():
    # Read as DF
    os.chdir(paths.READ_PATH_GAME_INFO)
    dfEmolex = useful_methods.csv_dic_df("Predict-Emolex-63min-NonRT.csv")

    # Convert numeric strings to int and float
    dfEmolex['score_ht_home'] = [int(st) for st in dfEmolex['score_ht_home']]
    dfEmolex['score_ht_away'] = [int(st) for st in dfEmolex['score_ht_away']]
    dfEmolex['score_ft_home'] = [int(st) for st in dfEmolex['score_ft_home']]
    dfEmolex['score_ft_away'] = [int(st) for st in dfEmolex['score_ft_away']]

    dfEmolex['pn_home'] = [float(st) for st in dfEmolex['pn_home']]
    dfEmolex['pn_away'] = [float(st) for st in dfEmolex['pn_away']]

    # Keep only games that have Twitter data
    dfEmolex = dfEmolex[(dfEmolex['pn_home'] != -1)].copy().reset_index(
        drop=True)

    # Show results
    EmolexAnalyze(dfEmolex)

    return dfEmolex
Example #2
def ReviewRes():
    # Read as DF
    os.chdir(paths.READ_PATH_GAME_INFO)
    dfNB = useful_methods.csv_dic_df("Predict-NB-63min.csv")

    # Convert numeric strings to int and float
    dfNB['score_ht_home'] = [int(st) for st in dfNB['score_ht_home']]
    dfNB['score_ht_away'] = [int(st) for st in dfNB['score_ht_away']]
    dfNB['score_ft_home'] = [int(st) for st in dfNB['score_ft_home']]
    dfNB['score_ft_away'] = [int(st) for st in dfNB['score_ft_away']]

    dfNB['nb_pos_home'] = [float(st) for st in dfNB['nb_pos_home']]
    dfNB['nb_neg_home'] = [float(st) for st in dfNB['nb_neg_home']]
    dfNB['nb_pos_away'] = [float(st) for st in dfNB['nb_pos_away']]
    dfNB['nb_neg_away'] = [float(st) for st in dfNB['nb_neg_away']]

    # Keep only games that have Twitter data
    dfNB = dfNB[(dfNB['nb_neg_home'] != -1)].copy().reset_index(drop=True)

    # Print results
    ReviewAnalyze(dfNB)

    return dfNB
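Both functions above convert columns one element at a time with list comprehensions. If the CSV cells are guaranteed to be clean numeric strings, a vectorized pandas equivalent is shorter; a minimal sketch against a toy frame standing in for dfNB (the same pattern applies to dfEmolex):

import pandas as pd

# Toy frame standing in for dfNB; assumes every cell is a clean numeric string
dfNB = pd.DataFrame({'score_ht_home': ['1'], 'score_ht_away': ['0'],
                     'nb_pos_home': ['0.7'], 'nb_neg_home': ['0.3']})

dfNB[['score_ht_home', 'score_ht_away']] = \
    dfNB[['score_ht_home', 'score_ht_away']].astype(int)
dfNB[['nb_pos_home', 'nb_neg_home']] = \
    dfNB[['nb_pos_home', 'nb_neg_home']].astype(float)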
Example #3
# *******************************************************
# *******************************************************

# Limitations
TIME_LIMIT = 63
RETWEET_STATUS = False
FILTER_STATUS = True
START_TIME = 1
END_TIME = 63

# *******************************************************
# *******************************************************

# Game Infos
os.chdir(paths.READ_PATH_GAME_INFO)
dfGameInfos = useful_methods.csv_dic_df('game_infos.csv')
dfGameInfos = useful_methods.DropNanGames(dfGameInfos)

# Which game week to analyze (read from stdin)
WEEK_NUM = input()
dfGameInfos = dfGameInfos[dfGameInfos.GW == int(WEEK_NUM)].copy().reset_index(
    drop=True)

# Convert number strings to integers
dfGameInfos['GW'] = [int(GW) for GW in dfGameInfos['GW']]
dfGameInfos['score_ht_home'] = [
    int(number) for number in dfGameInfos['score_ht_home']
]
dfGameInfos['score_ht_away'] = [
    int(number) for number in dfGameInfos['score_ht_away']
]
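useful_methods.DropNanGames is not shown in these examples. A hypothetical stand-in, assuming it only removes games with missing fields and reindexes the frame:

def DropNanGames(df):
    # Hypothetical: drop rows with any missing value and reset the
    # index, matching how the cleaned frame is used afterwards
    return df.dropna().reset_index(drop=True)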
# *******************************************************

Example #4
# Limitations
TIME_LIMIT = 63
RETWEET_STATUS = False
FILTER_STATUS = True
START_TIME = 1
END_TIME = 63


# *******************************************************
# *******************************************************

# Game Infos
os.chdir(paths.READ_PATH_GAME_INFO)
dfGameInfos = useful_methods.csv_dic_df('game_infos.csv')
dfGameInfos = useful_methods.DropNanGames(dfGameInfos)


# Convert number strings to integers
dfGameInfos['GW'] = [int(GW) for GW in dfGameInfos['GW']]
dfGameInfos['score_ht_home'] = [int(number) for number in dfGameInfos['score_ht_home']]
dfGameInfos['score_ht_away'] = [int(number) for number in dfGameInfos['score_ht_away']]
dfGameInfos['score_ft_home'] = [int(number) for number in dfGameInfos['score_ft_home']]
dfGameInfos['score_ft_away'] = [int(number) for number in dfGameInfos['score_ft_away']]


# Read Emotion-Lexicon-Soccer as Dictionary
dic_emolex_soccer = emolex.EmolexSoccerDic()
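emolex.EmolexSoccerDic() is also project-specific. Judging by how Example #6 queries it, the lexicon likely maps each word to a dict of emotion scores; a hypothetical entry (the words and scores here are made up):

# Hypothetical shape of dic_emolex_soccer, inferred from Example #6's
# `value > 0` lookup over dic_emolex_soccer[emo].items()
dic_emolex_soccer = {
    'goal': {'positive': 1, 'negative': 0, 'joy': 1},
    'dive': {'positive': 0, 'negative': 1, 'anger': 1},
}

# Collect the active sentiments for a word, as Example #6 does
print(','.join(k for k, v in dic_emolex_soccer['goal'].items() if v > 0))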

Example #5
def ClassifierTrain(save=False):
    date_now = strftime("%Y-%m-%d %H:%M:%S", gmtime()).replace(" ", "_")

    # ***************************************************
    # [Step 1]: Data Load
    # ***************************************************

    # Read Hashtags in Emotion Words Tweets
    df = useful_methods.csv_dic_df(paths.DATA_HOME + "TweetsPN/tweet_hash_emolex_pn.csv")

    # positive: 1, negative: 0
    df['label'] = [int(label) for label in df['label']]

    # ***************************************************
    # [Step 2]: Data Split (train=0.8, test=0.2)
    # ***************************************************

    # Split data Train and Test data
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'],
        df['label'],
        test_size=0.2
    )

    print(
        "\n\n### DATA ##################################\n",
        "\n\tTrain data: \t", len(X_train),
        "\n\tTest data: \t", len(X_test),
        "\n\tAll data: \t", len(y_train) + len(y_test)
    )

    # ***************************************************
    # [Step 3]: Define Classifier
    # ***************************************************

    grid_search = SVM(y_train)

    # ***************************************************
    # [Step 4]: Compute Classifier
    # ***************************************************

    start_time = time()

    # fitting training sets to classifier
    grid_search.fit(X_train, y_train)

    # ***************************************************
    # [Step 5]: Print Classifier Details
    # ***************************************************

    # print trained parameters
    DetecterParams(grid_search, title="SVM")

    # print computed time
    print("\n\n### COMPUTED TIME #########################\n")
    taken_time = time() - start_time
    print("[Started Time]: ", date_now)
    print("\n[Taken Time]: ", str(datetime.timedelta(seconds=taken_time)))

    # print classifier test results
    DetecterMetrics(X_test, y_test, grid_search, title="Test")

    # ***************************************************
    # [Step 6]: Save Classifier Details
    # ***************************************************

    if save:
        filename = "dtr_hash_svn_" + date_now + ".pkl"
        with open(paths.DETECTER_HOME + filename, 'wb') as fout:
            pickle.dump(grid_search, fout)
            print("\n\n[Saved in]: ", paths.DETECTER_HOME + filename)
Example #6
def HashEmolexAllCreate():
    # Emolex dic
    dic_emolex_soccer, dic_emolex_stemmed_soccer = emolex.EmolexSoccerDic()

    # Define: All Hash Emolex DF
    dfNewHashEmolex = pd.DataFrame(
        columns=['text', 'hash_emolex_word', 'sentiments'])
    texts = []
    hash_emolexs = []
    sentiments = []

    start_time = time.time()

    # Loop over every game
    for index in range(len(weeks)):
        try:
            # Read Single Game as DF
            filename = home_teams[index] + '_vs_' + away_teams[index] + '.csv'
            os.chdir(paths.READ_PATH_EXTRACTED_CSV + 'GW' + str(weeks[index]) +
                     '/SingleGames/')
            df = useful_methods.csv_dic_df(filename)

            ###########################################################################
            # Filter DF: remove tweets of stream, bots ...
            dfFilter = useful_methods.FilterDF(df[df.status != 'retweet'])

            # Add column: hash_emolex
            dfFilter['hash_emolex'] = [
                tokenizer.HashEmolex(tags, dic_emolex_soccer)
                for tags in dfFilter.tags
            ]

            # Emolex in Hashtags
            dfHashEmolex = dfFilter[(dfFilter.hash_emolex != '')]

            # Check through each Tweet
            for i in range(len(dfHashEmolex)):
                # hash tags
                hash_emolex = dfHashEmolex.iloc[i]['hash_emolex'].split(',')

                # each tag
                for emo in set(hash_emolex):
                    if emo in dic_emolex_soccer:
                        text = dfHashEmolex.iloc[i]['text']
                        if text not in texts:
                            # append the tweet, its tag, and the tag's sentiments
                            texts.append(text)
                            hash_emolexs.append(emo)
                            sentiment = ",".join([
                                key for key, value in
                                dic_emolex_soccer[emo].items() if (value > 0)
                            ])
                            sentiments.append(sentiment)
        except Exception:
            # skip games whose CSV is missing or otherwise unreadable
            continue

    print("[Done]: %.2f" % (time.time() - start_time))

    # Result
    dfNewHashEmolex['text'] = texts
    dfNewHashEmolex['hash_emolex_word'] = hash_emolexs
    dfNewHashEmolex['sentiments'] = sentiments

    # Save
    useful_methods.DFtoCSV(dfNewHashEmolex,
                           "/Users/Bya/Dropbox/Research/datas/TweetsPN/",
                           "hash_emolex_all",
                           index=False)
    print(
        "[Saved in]: /Users/Bya/Dropbox/Research/datas/TweetsPN/hash_emolex_all.csv"
    )
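tokenizer.HashEmolex is another unshown helper. From how its output is consumed above (split on ',' and membership-tested against the lexicon), a plausible stand-in, assuming tags is a comma-separated hashtag string:

def HashEmolex(tags, dic_emolex):
    # Hypothetical: keep only the hashtags that appear in the emotion
    # lexicon, returned as a comma-joined string ('' if none match)
    return ','.join(tag for tag in tags.split(',') if tag in dic_emolex)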
Example #7
def TweetPnEqualRead():
    os.chdir("/Users/Bya/Dropbox/Research/datas/TweetsPN/")
    df = useful_methods.csv_dic_df('tweets_pn_eq.csv')
    return df
Example #8
def HashEmolexAllRead():
    os.chdir("/Users/Bya/Dropbox/Research/datas/TweetsPN/")
    df = useful_methods.csv_dic_df('hash_emolex_all.csv')
    return df
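Every example funnels CSVs through useful_methods.csv_dic_df, which is never shown. Given that numeric columns arrive as strings and need manual int()/float() conversion, it plausibly reads every cell as text; a minimal stand-in:

import pandas as pd

def csv_dic_df(filename):
    # Hypothetical: load the CSV with all cells as strings, which would
    # explain the manual conversions in the examples above
    return pd.read_csv(filename, dtype=str)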