def create_by_information_gain(raw_rdd: RDD, threshold: float) -> None:
    """Extract stop words based on information gain (IG).

    For each category ck, the information gain of each word tj is defined as

        IG(tj, ck) = p(tj, ck)(log(p(tj, ck)) - log(p(tj)) - log(p(ck)))
                   + p(!tj, ck)(log(p(!tj, ck)) - log(p(!tj)) - log(p(ck)))

    The portion of the function visible here only groups the input rows by
    category and builds an RDD holding the number of rows in each category.

    Args:
        raw_rdd: RDD of rows; each row is indexed by the 'category' key.
            Per the original inline note a row looks like
            (category, words) -- TODO confirm the exact row schema.
        threshold: Not used in the visible portion of the function;
            presumably the IG cutoff for treating a word as a stop word
            -- TODO confirm against the rest of the implementation.
    """
    # rdd(key, rows): one (category, rows) pair per distinct category.
    category_group = raw_rdd.groupBy(
        # row(category, words)
        lambda row: row['category']
    )
    # Number of rows in each category group. The lambda previously read
    # the misspelled name `key_and_ros`, which raised NameError as soon as
    # the RDD was evaluated.
    category_hist = category_group.map(
        lambda key_and_rows: len(key_and_rows[1])  # length of rows
    )