-
Notifications
You must be signed in to change notification settings - Fork 1
/
cluster_analysis.py
102 lines (90 loc) · 4.35 KB
/
cluster_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: utf-8 -*-
import csv
from typing import List, Dict, Any
import itertools
import logging
import statistics
import json
import jsonlines
import tqdm
from DocumentFeatureSelection import interface
# Configure the root logger to emit messages at INFO level and above.
logging.basicConfig(level=logging.INFO)
def load_leaf_table(path_cluster_leaf_table: str) -> List[Any]:
    """Load the cluster leaf table from a tab-separated file.

    The first row is treated as a header and skipped; every remaining row
    is returned as a list of column strings.

    Args:
        path_cluster_leaf_table: Path of the TSV file to read.

    Returns:
        The data rows in file order. An empty list is returned when the
        file is empty or holds nothing but the header row.
    """
    # The context manager closes the file even if parsing raises, and
    # newline='' lets the csv module handle line endings itself.
    with open(path_cluster_leaf_table, 'r', newline='') as f_obj:
        reader = csv.reader(f_obj, delimiter='\t')
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        return list(reader)
def load_preprocessed_record(path_preprocessed_jsonl: str) -> Dict[str, List[str]]:
    """Read the preprocessed JSON Lines file into a file-name lookup.

    Each record must provide the keys ``file_name`` and ``morphs``. When a
    file name occurs more than once, the last record wins.

    Args:
        path_preprocessed_jsonl: Path of the JSON Lines file to read.

    Returns:
        A dict mapping each record's ``file_name`` to its ``morphs`` value.
    """
    morphs_by_file = {}
    with jsonlines.open(path_preprocessed_jsonl) as reader:
        # tqdm only wraps the iteration to display progress.
        for record in tqdm.tqdm(reader):
            file_name = record['file_name']
            morphs_by_file[file_name] = record['morphs']
    return morphs_by_file
# Preprocessed data in JSON Lines format
path_preprocessed_jsonl = './text/processed_data.jsonl'
# Table holding the clustering result
path_cluster_leaf_table = 'analysis_data/leaf_information.tsv'

cluster_leaf_table = load_leaf_table(path_cluster_leaf_table)

# Number of clusters: count the distinct values in column 2 of the table
n_cluster = len({row[2] for row in cluster_leaf_table})
logging.info(f'クラスタ数 -> {n_cluster}')
# Cluster size distribution.
# Rows are sorted by column 2 (the cluster id) first because groupby only
# merges adjacent rows; each group collects the column-1 values of its rows
# (presumably the leaf identifier -- TODO confirm against the TSV header).
cluster_distribution = [
    [row[1] for row in members]
    for _, members in itertools.groupby(
        sorted(cluster_leaf_table, key=lambda row: row[2]),
        key=lambda row: row[2],
    )
]
# Size of each cluster = number of distinct column-1 values in it.
# NOTE: `clsuter_distribution` (sic) is a different name from
# `cluster_distribution` above; the spelling is kept unchanged.
clsuter_distribution = [len(set(members)) for members in cluster_distribution]

median_cluster_per_cluster = statistics.median(clsuter_distribution)
avg_cluster_per_cluster = statistics.mean(clsuter_distribution)
max_cluster_per_cluster = max(clsuter_distribution)
min_cluster_per_cluster = min(clsuter_distribution)
logging.info(
    f'クラスタの統計 最小値:{min_cluster_per_cluster} 最大値:{max_cluster_per_cluster} '
    f'平均:{avg_cluster_per_cluster} 中央値:{median_cluster_per_cluster}'
)
# Livedoor label distribution per cluster.
# Rows are sorted by column 2 (the cluster id) first because groupby only
# merges adjacent rows; each group collects the column-3 values (the labels).
livedoor_label_distribution = [
    [row[3] for row in members]
    for _, members in itertools.groupby(
        sorted(cluster_leaf_table, key=lambda row: row[2]),
        key=lambda row: row[2],
    )
]
logging.info(f'ライブドアラベル分布のリスト: {livedoor_label_distribution[:10]}')

# Quantify how mixed each cluster is: the number of distinct labels it holds.
numeric_livedoor_label_distribution = [
    len(set(labels)) for labels in livedoor_label_distribution
]

median_label_per_cluster = statistics.median(numeric_livedoor_label_distribution)
avg_label_per_cluster = statistics.mean(numeric_livedoor_label_distribution)
max_label_per_cluster = max(numeric_livedoor_label_distribution)
min_label_per_cluster = min(numeric_livedoor_label_distribution)
logging.info(
    f'ライブドアラベル分布の統計 最小値:{min_label_per_cluster} 最大値:{max_label_per_cluster} '
    f'平均:{avg_label_per_cluster} 中央値:{median_label_per_cluster}'
)
# Explore the characteristic words of each cluster (feature weighting).
# The data should be reshaped into an input of this form:
# {label: [list of words for one document, ...]}
sample_input_dict = dict(
    label_a=[
        ['I', 'aa', 'aa', 'aa', 'aa', 'aa'],
        ['bb', 'aa', 'aa', 'aa', 'aa', 'aa'],
        ['I', 'aa', 'hero', 'some', 'ok', 'aa'],
    ],
    label_b=[
        ['bb', 'bb', 'bb'],
        ['bb', 'bb', 'bb'],
        ['hero', 'ok', 'bb'],
        ['hero', 'cc', 'bb'],
    ],
    label_c=[
        ['cc', 'cc', 'cc'],
        ['cc', 'cc', 'bb'],
        ['xx', 'xx', 'cc'],
        ['aa', 'xx', 'cc'],
    ],
)
# Build the {file_name: [morphs]} dict from the preprocessed data
filename2morphs = load_preprocessed_record(path_preprocessed_jsonl)

# From the table rows, build [(cluster id, dict parsed from the JSON in column 4)]
arg_information = [(row[2], json.loads(row[4])) for row in cluster_leaf_table]

# Build [(cluster id, [words])]; the parsed dict's 'file_name' selects the
# morphs, and element 0 of each morph entry is used as the word
# (presumably a (surface, POS) pair -- TODO confirm against the jsonl).
cluster_word = [
    (cluster_no, [morph[0] for morph in filename2morphs[meta['file_name']]])
    for cluster_no, meta in arg_information
]

# Shape the input as {cluster id: [[words of one document], ...]};
# sorting first is required because groupby only merges adjacent items.
input_dict = {
    cluster_no: [words for _, words in group]
    for cluster_no, group in itertools.groupby(
        sorted(cluster_word, key=lambda pair: pair[0]),
        key=lambda pair: pair[0],
    )
}
feature_selection_result = (
    interface
    .run_feature_selection(input_dict, method='tf_idf', use_cython=True)
    .convert_score_matrix2score_record()
)

# Write the weighting result to a file
import pandas
df_feature_selection = pandas.DataFrame(feature_selection_result)
df_feature_selection.to_csv(
    './analysis_data/feature_selection.csv',
    index_label=False,
    index=False,
)