# Code example #1
# 0
def run_experiment_with_rake():
    """Run the aspect-inference experiment with RAKE keyword extraction.

    Pipeline: extract the top-scored RAKE keyword per tweet, infer the
    nearest aspect via the embedding-based aspect mapping, dump the result
    to CSV, and print accuracy / average precision / average recall.

    Reads and mutates the module-level ``tweets_rake`` DataFrame; also
    relies on module-level ``asp`` (aspect mapping) and ``emb`` (embedding).
    """
    print("\nBegin experiment using RAKE algorithm...")
    # RAKE: predict a keyword per tweet, keeping the highest-scored words.
    rake = RakeKeywordExtractor()
    tweets_rake['keyword'] = tweets_rake.apply(
        lambda t: rake.extract_keyword(
            rake.extract_candidates(t['text'], incl_scores=True)),
        axis=1)

    # Infer the aspect with the largest similarity. Compute the
    # (aspect, keyword) pair ONCE per row — the original ran
    # find_nearest_inferred_aspect twice per tweet (two apply passes).
    inferred = tweets_rake['keyword'].apply(
        lambda kw: asp.find_nearest_inferred_aspect(kw, emb))
    tweets_rake['selected_keyword'] = inferred.apply(lambda pair: pair[1])
    tweets_rake['inferred_aspect'] = inferred.apply(lambda pair: pair[0])
    tweets_rake['gold_aspect'] = tweets_rake['inferred_aspect'].apply(
        lambda aspect: asp.INVERTED_ASPECTS[aspect])

    tweets_rake.to_csv('dump/result_rake.csv', encoding='utf-8', index=False)

    # Evaluate with accuracy / precision / recall.
    eva_rake = Evaluation(tweets_rake)
    conf_matrix = eva_rake.build_confusion_matrix(tweets_rake)
    print("Confusion matrix:")
    print(conf_matrix)
    print("Accuracy using RAKE algorithm: {}".format(eva_rake.accuracy()))
    print("Average Precision using RAKE algorithm: {}".format(
        eva_rake.average_precision()))
    print("Average Recall using RAKE algorithm: {}".format(
        eva_rake.average_recall()))
# Code example #2
# 0
def run_experiment_with_tfidf(tweets_tfidf):
    """Run the aspect-inference experiment with TF-IDF keyword extraction.

    Pipeline: pick the highest-weighted single-word keyword per tweet from
    the TF-IDF matrix, merge it back onto the tweets, infer the nearest
    aspect via the embedding-based aspect mapping, dump intermediate and
    final CSVs, and print accuracy / average precision / average recall.

    :param tweets_tfidf: DataFrame with at least a ``text`` column.
        Relies on module-level ``pd``, ``asp`` and ``emb``.
    """
    print("\nBegin experiment using TF-IDF weighting algorithm...")
    # TF-IDF: the keyword is the column with the highest weight per row.
    tfidf = TfidfKeywordExtractor()
    tfidf_weight = tfidf.fit_transform(tweets_tfidf)
    tfidf_weight['keyword'] = tfidf_weight.idxmax(axis=1)

    # MUST BE after extracting keyword
    # OTHERWISE, the keyword will be "tweet_no" for all tweets
    tfidf_weight = tfidf_weight.reset_index().rename(
        columns={'index': 'tweet_no'})
    tfidf_weight['tweet_no'] = tfidf_weight['tweet_no'] + 1
    tfidf_weight = tfidf_weight[['tweet_no', 'keyword']]
    tfidf_weight.to_csv('tfidf_keyword.csv', encoding='utf-8', index=False)

    tweets_tfidf = tweets_tfidf.reset_index().rename(
        columns={'index': 'tweet_no'})
    tweets_tfidf['tweet_no'] = tweets_tfidf['tweet_no'] + 1
    tweets_tfidf.to_csv('tweets_tfidf.csv', encoding='utf-8', index=False)

    tweets_tfidf = pd.merge(tweets_tfidf,
                            tfidf_weight,
                            how='left',
                            on='tweet_no')
    tweets_tfidf.to_csv('tweets_tfidf_after_merge.csv',
                        encoding='utf-8',
                        index=False)

    # Infer the aspect with the largest similarity. Compute the
    # (aspect, keyword) pair ONCE per row — the original ran
    # find_nearest_inferred_aspect twice per tweet (two apply passes).
    inferred = tweets_tfidf['keyword'].apply(
        lambda kw: asp.find_nearest_inferred_aspect(kw, emb))
    tweets_tfidf['selected_keyword'] = inferred.apply(lambda pair: pair[1])
    tweets_tfidf['inferred_aspect'] = inferred.apply(lambda pair: pair[0])
    tweets_tfidf['gold_aspect'] = tweets_tfidf['inferred_aspect'].apply(
        lambda aspect: asp.INVERTED_ASPECTS[aspect])

    tweets_tfidf.to_csv('dump/result_tfidf.csv', encoding='utf-8', index=False)

    # TF-IDF: evaluate with accuracy / precision / recall.
    # (The original comment said "RAKE" here — copy-paste slip.)
    eva_tfidf = Evaluation(tweets_tfidf)
    conf_matrix = eva_tfidf.build_confusion_matrix(tweets_tfidf)
    print("Confusion matrix:")
    print(conf_matrix)
    print("Accuracy using TF-IDF weighting algorithm: {}".format(
        eva_tfidf.accuracy()))
    print("Average Precision using TF-IDF weighting algorithm: {}".format(
        eva_tfidf.average_precision()))
    print("Average Recall using TF-IDF weighting algorithm: {}".format(
        eva_tfidf.average_recall()))
# Code example #3
# 0
 def calculate_score_and_show_result(self, w2v_150_socres, w2v_300_scores,
                                     bert_768_scores):
     """Compute precision/NDCG for three embedding models and save plots.

     For each model's raw scores, compute the average precision (at
     threshold 2) and average NDCG via ``Evaluation``, print the values,
     and render two comparison charts (``image/precision.png`` and
     ``image/ndcg.png``), one curve per model over k = 1..10.

     :param w2v_150_socres: scores for the 150-dim word2vec model.
         (NOTE: parameter name keeps the original's "socres" typo so
         keyword callers are not broken.)
     :param w2v_300_scores: scores for the 300-dim word2vec model.
     :param bert_768_scores: scores for the 768-dim BERT model.
     """
     e = Evaluation()
     w2v_precision_150 = e.average_precision(w2v_150_socres, 2)
     w2v_ndcg_150 = e.average_ndcg(w2v_150_socres)
     w2v_precision_300 = e.average_precision(w2v_300_scores, 2)
     w2v_ndcg_300 = e.average_ndcg(w2v_300_scores)
     bert_precision_768 = e.average_precision(bert_768_scores, 2)
     bert_ndcg_768 = e.average_ndcg(bert_768_scores)
     print(w2v_precision_150)
     print(w2v_ndcg_150)
     print(w2v_precision_300)
     print(w2v_ndcg_300)
     print(bert_precision_768)
     print(bert_ndcg_768)

     def _plot_metric(ylabel, series, out_path):
         # Plot average values (平均數值) for each model: one labelled
         # curve per (values, color, label) triple, then save and close.
         # Factored out of two near-identical copy-pasted chart sections.
         plt.title("Semantic Prediction in Frequent Emotions and Events")
         plt.xlabel(
             "The number of predict semantic for each emotions or events")
         plt.ylabel(ylabel)
         for values, color, label in series:
             plt.plot(range(1, 11), values, "-o", color=color, label=label)
         plt.legend(loc="best")
         # save image
         plt.savefig(out_path)
         plt.close()

     _plot_metric("Average Precision",
                  [(w2v_precision_150, 'r', "e2v-w2v-sg 150 dimension"),
                   (w2v_precision_300, 'b', "e2v-w2v-sg 300 dimension"),
                   (bert_precision_768, 'g', "e2v-bert 768 dimension")],
                  'image/precision.png')
     _plot_metric("Average NDCG",
                  [(w2v_ndcg_150, 'r', "e2v-w2v-sg 150 dimension"),
                   (w2v_ndcg_300, 'b', "e2v-w2v-sg 300 dimension"),
                   (bert_ndcg_768, 'g', "e2v-bert 768 dimension")],
                  'image/ndcg.png')