-
Notifications
You must be signed in to change notification settings - Fork 1
/
rating_stats.py
154 lines (136 loc) · 5.47 KB
/
rating_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# -*- coding: utf-8 -*-
from __future__ import division, print_function
import cPickle as pickle
import graph_tool.all as gt
import io
import numpy as np
import os
import pandas as pd
import pdb
def get_stats(dataset, dataset_id2rating_count, dataset_id2title, rec_type):
    """Collect bow-tie rating statistics for all graphs of one recommender.

    For every ``N`` in the module-level list ``Ns`` the graph stored at
    ``data/<dataset>/graphs/<rec_type>_<N>.gt`` is loaded. Its vertices are
    grouped by their ``bowtie`` vertex property, using the ``name`` vertex
    property as the key into the two lookup dicts.

    NOTE: this function reads the module-level names ``Ns`` and
    ``bt_labels``, which are only assigned in the ``__main__`` section of
    this file.

    Args:
        dataset: name of the dataset directory below ``data``.
        dataset_id2rating_count: dict mapping a vertex ``name`` value to the
            rating count of that item.
        dataset_id2title: dict mapping a vertex ``name`` value to the title
            of that item.
        rec_type: prefix of the graph file names.

    Returns:
        A tuple ``(res, nodes, stats, links_to_scc)`` of dicts keyed by N:
        ``res[N]`` maps a bow-tie label to ``[mean, median]`` of the rating
        counts in that group (labels without usable data are omitted),
        ``nodes[N]`` maps every bow-tie label to the list of titles in that
        group, ``stats[N]`` is the result of ``graph_stats`` and
        ``links_to_scc[N]`` the result of ``get_links_to_scc``.
    """
    folder = os.path.join('data', dataset, 'graphs')
    res = {}
    nodes = {}
    stats = {}
    links_to_scc = {}
    for N in Ns:
        print(' ', N)
        gt_file = os.path.join(folder, '%s_%d.gt' % (rec_type, N))
        graph = gt.load_graph(gt_file, fmt='gt')

        names = graph.vp['name']
        bowtie = graph.vp['bowtie']
        dataset_id2bow_tie = {names[v]: bowtie[v] for v in graph.vertices()}

        bt2ratings = {label: [] for label in bt_labels}
        bt2nodes = {label: [] for label in bt_labels}
        for did, bt in dataset_id2bow_tie.items():
            bt2ratings[bt].append(dataset_id2rating_count[did])
            bt2nodes[bt].append(dataset_id2title[did])

        summary = {}
        for label, ratings in bt2ratings.items():
            if not ratings:
                # np.mean / np.median of an empty list emit a RuntimeWarning
                # and return NaN; such groups are simply left out.
                continue
            mean = np.mean(ratings)
            if np.isnan(mean):
                # A NaN rating count makes the mean meaningless; drop the
                # group, as an explicit NaN filter on the mean would.
                continue
            summary[label] = [mean, np.median(ratings)]
        res[N] = summary

        stats[N] = graph_stats(graph)
        links_to_scc[N] = get_links_to_scc(graph)
        nodes[N] = bt2nodes
    return res, nodes, stats, links_to_scc
def graph_stats(graph):
    """Return ``[average clustering value, number of components]``.

    The clustering value of a vertex with ``k >= 2`` distinct out-neighbours
    is the number of links that lead from one of its out-neighbours to
    another of its out-neighbours, divided by ``k * (k - 1)``. Vertices with
    fewer than two out-neighbours contribute nothing. The sum is divided by
    the total number of vertices.

    The second element is the length of the histogram returned by
    ``gt.label_components``.
    """
    # Out-neighbour sets keyed by integer vertex index.
    out_sets = {}
    for vertex in graph.vertices():
        out_sets[int(vertex)] = set(int(nb) for nb in vertex.out_neighbours())

    cc_total = 0
    for vertex in graph.vertices():
        own = out_sets[int(vertex)]
        degree = len(own)
        if degree >= 2:
            links = 0
            for nb in own:
                # Links from this neighbour into the same neighbourhood.
                links += len(out_sets[nb] & own)
            cc_total += links / (degree * (degree - 1))

    _, histogram = gt.label_components(graph)
    return [cc_total / graph.num_vertices(), len(histogram)]
def get_links_to_scc(graph):
    """Return the fraction of out-links of ``IN`` vertices that hit the SCC.

    Only vertices whose ``bowtie`` vertex property equals ``'IN'`` are
    considered. Every out-neighbour of such a vertex counts as one link; a
    link counts as an SCC link when the neighbour's ``bowtie`` value is
    ``'SCC'``.

    Returns:
        ``scc_links / total_links``, or ``-1`` when the ``IN`` vertices have
        no out-links at all (including the case of no ``IN`` vertices).
    """
    bowtie = graph.vp['bowtie']
    total_links = 0
    scc_links = 0
    for vertex in graph.vertices():
        if bowtie[vertex] != 'IN':
            continue
        targets = [bowtie[nb] for nb in vertex.out_neighbours()]
        total_links += len(targets)
        scc_links += targets.count('SCC')
    if not total_links:
        return -1
    return scc_links / total_links
if __name__ == '__main__':
    # Configuration. ``Ns`` and ``bt_labels`` are also read as globals by
    # get_stats(), so their names must not change.
    datasets = ['movielens', 'bookcrossing', 'imdb']
    Ns = [5, 20]  # the value 2 is currently disabled
    rec_types = ['rbar', 'rb', 'rbiw', 'rbmf']
    bt_labels = ['IN', 'SCC', 'OUT', 'TL_IN', 'TL_OUT', 'TUBE', 'OTHER']

    # Result containers, indexed as [dataset][rec_type][N].
    results = {}
    result_nodes = {}
    result_stats = {}
    result_links_to_scc = {}

    for dataset in datasets:
        print(dataset)
        for container in (results, result_nodes, result_stats,
                          result_links_to_scc):
            container[dataset] = {}

        df = pd.read_pickle(os.path.join('data', dataset, 'item_stats.obj'))
        # Build both lookup tables in a single pass over the rows.
        dataset_id2rating_count = {}
        dataset_id2title = {}
        for _, row in df.iterrows():
            dataset_id2rating_count[row['dataset_id']] = row['rating_count']
            dataset_id2title[row['dataset_id']] = row['original_title']

        for rec_type in rec_types:
            print(' ', rec_type)
            rating_summary, titles_by_label, g_stats, scc_share = get_stats(
                dataset, dataset_id2rating_count, dataset_id2title, rec_type)
            results[dataset][rec_type] = rating_summary
            result_nodes[dataset][rec_type] = titles_by_label
            result_stats[dataset][rec_type] = g_stats
            result_links_to_scc[dataset][rec_type] = scc_share

    out_folder = os.path.join('data', 'rating_stats')
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    # Summary file: one section per dataset / recommender / N, restricted
    # to the three core bow-tie labels.
    core_labels = ['IN', 'SCC', 'OUT']
    counts_path = os.path.join(out_folder, 'counts.txt')
    with io.open(counts_path, 'w', encoding='utf-8') as outfile:
        for dataset in datasets:
            outfile.write(u'%s\n' % (dataset))
            for rec_type in rec_types:
                outfile.write(u' %s\n' % (rec_type))
                for N in Ns:
                    stat = result_stats[dataset][rec_type][N]
                    outfile.write(
                        u' %d (%d components, %.2f cc, %.2f links to scc)\n' % (
                            N, stat[1], stat[0],
                            result_links_to_scc[dataset][rec_type][N]))
                    per_label = results[dataset][rec_type][N]
                    for label in per_label:
                        if label in core_labels:
                            outfile.write(u' %.2f\t%.2f\t%s\n' % (
                                per_label[label][0],
                                per_label[label][1],
                                label
                            ))

    # One file per dataset / recommender / N listing the sorted titles of
    # every bow-tie group.
    for dataset in datasets:
        for rec_type in rec_types:
            for N in Ns:
                file_name = '%s_%s_%d.txt' % (dataset, rec_type, N)
                outpath = os.path.join(out_folder, file_name)
                with io.open(outpath, 'w', encoding='utf-8') as outfile:
                    groups = result_nodes[dataset][rec_type][N]
                    for label, titles in groups.items():
                        outfile.write(u'%s %s\n' % (label, u'%' * 64))
                        for title in sorted(titles):
                            outfile.write(u' %s\t[%s]\n' % (title, label))