-
Notifications
You must be signed in to change notification settings - Fork 0
/
cf_stat.py
219 lines (208 loc) · 8.2 KB
/
cf_stat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#-*- coding:utf-8 -*-
import os
import datetime
import sys
from fim import fpgrowth
import build_candidate_ar as bca
import rs_utils as utils
from scipy import sparse
from scipy import io
import numpy as np
import build_candidate_cf as bccf
"""
计算在交易集trans中的item的一个分布情况
"""
def compute_item_distribution_in_trans(fin_str):
item_distribution_dict = dict()
print fin_str
for fre_item in fpgrowth(bca.iter_trans_data(fin_str).values(),supp=0,zmax=1,report='[a'):
print fre_item[0][0],fre_item[1][0]
base = 5
key = int(fre_item[1][0])/base
item_distribution_dict.setdefault(key,0)
item_distribution_dict[key] +=1
for key in item_distribution_dict:
print >> sys.stdout,'[%d,%d) itemnums is %d' %(key*base , (key+1) *base, item_distribution_dict[key])
"""
每天的购买记录 与 resource/item 求交集
fbuy_str:某一天的购买记录
fitem_str:resource/item 给定的item集合
fout_str:过滤之后的输出文件data_xxxx_buy_filter
"""
def filter_buy_records_by_selected_item(fbuy_str,fitem_str,fout_str):
item_dict = dict()
with open(fitem_str) as fin:
for line in fin:
#itemid,position,cate
cols = line.strip().split(',')
if cols[0] not in item_dict:
item_dict[cols[0]] = 0
fout = open(fout_str,'w')
with open(fbuy_str) as fin:
for line in fin:
#user_id,item_id,behavior_type,postion,cate,time
cols = line.strip().split(',')
if cols[1] in item_dict:
print >> fout,line.strip()
fout.close()
def build_buy_dict(fbuy_str, fbefore_str):
    """Index one day's purchases and each pair's strongest earlier behaviour.

    Both files hold comma-separated records laid out as
    user_id,item_id,behavior_type,postion,cate,time.

    fbuy_str: path of the purchase records of one day.
    fbefore_str: path of the behaviour records preceding that day.

    Returns a tuple ``(buy_dict, buy_num_dict)`` keyed by ``"user#item"``:
    ``buy_dict`` maps every purchased pair to the largest integer
    ``behavior_type`` found for it in ``fbefore_str`` (0 when the pair has no
    earlier record), ``buy_num_dict`` maps it to the number of purchase
    records of that pair.
    """
    buy_dict = dict()
    buy_num_dict = dict()
    with open(fbuy_str) as buy_file:
        for record in buy_file:
            fields = record.strip().split(',')
            pair = "%s#%s" % (fields[0], fields[1])
            buy_dict.setdefault(pair, 0)
            buy_num_dict[pair] = buy_num_dict.get(pair, 0) + 1
    with open(fbefore_str) as history_file:
        for record in history_file:
            fields = record.strip().split(',')
            pair = "%s#%s" % (fields[0], fields[1])
            if pair not in buy_dict:
                # Only pairs purchased on the day are of interest.
                continue
            behavior = int(fields[2])
            if behavior > buy_dict[pair]:
                buy_dict[pair] = behavior
    return buy_dict, buy_num_dict
"""
fbuy_str:某一天的购买记录
fbefore_str:这天之前的行为记录
fout_str:输出结果 日期,buy:num,cart:num:...none:num
(cart :num 表示历史行为最高为加购物车行为的pair的数量)
is_append:是否以追加的形式打开fout
"""
def compute_purchase_compose(fbuy_str,fbefore_str,fout_str,is_append=False):
buy_dict,buy_num_dict = build_buy_dict(fbuy_str,fbefore_str)
#fbuy_str的为data_xxx_buy
date = fbuy_str.strip().split('_')[-2]
print >> sys.stdout,"去重后%s的购买对:%d" %(date,len(buy_dict))
#统计各种行为的总次数
num_dict=dict()
for key in buy_dict:
val = buy_dict[key]
num_dict.setdefault(val,0)
num_dict[val] += buy_num_dict[key]
#输出
if is_append:
fout = open(fout_str,'a')
else:
fout = open(fout_str,'w')
num_list = ["%s:%s"%(k,v) for k,v in num_dict.iteritems()]
print >> fout,"%s,%s"%(date,','.join(num_list))
fout.close()
"""
在一天的购买记录中,购买的物品用户之前完全对其没有行为的user
并非是新用户(以前完全没行为的user,而且对其购买的item 完全没行为的user
"""
def user_no_history_in_purchase(fbuy_str,fbefore_str,fout):
buy_dict = build_buy_dict(fbuy_str,fbefore_str)
user_dict = dict()
for key in buy_dict:
user,item = key.split('#')
user_dict.setdefault(user,[0,0])
if buy_dict[key] == 0:
user_dict[user][0] +=1
else:
user_dict[user][1] +=1
user_history_dict = get_user_history(fbefore_str)
fout = open(fout_str,'w')
num_dict = dict()
base = 1000
for user in user_dict:
sum_0,sum_else = user_dict[user]
if sum_0 > 0 and sum_else == 0 and user in user_history_dict:
print >> fout,'%s,%d,%s' %(user,len(user_history_dict[user]),'\002'.join(user_history_dict[user]))
key = len(user_history_dict[user])/base
num_dict.setdefault(key,0)
num_dict[key] +=1
for num in num_dict:
print >> sys.stdout,'[%d,%d] %d' %(num*base,(num+1)*base,num_dict[num])
fout.close()
"""
购买记录里,在以前从没有过行为的item 冷启动的item
"""
def get_new_items_in_purchase(frate_str,fbuy_str):
rate_matrix = io.mmread('data')
rate_matrix = rate_matrix.tocsc()
user_ids_list,item_ids_list,user_ids_dict,item_ids_dict = bccf.compute_user_item_list(frate_str)
item_set = set()
new_item_num = 0
new_item_n = 0
with open(fbuy_str) as fin:
for line in fin:
cols = line.strip().split(',')
item_set.add(cols[1])
count = 0
print item_set
for item in item_set:
count += 1
"""
print 'count'+str(count)
print item
"""
if item not in item_ids_dict:
new_item_n +=1
else:
i_ix = item_ids_dict[item]
"""
print i_ix
print np.count_nonzero(rate_matrix[:,i_ix])
"""
"""
if np.count_nonzero(rate_matrix[:,i_ix]) == 0:
new_item_num+=1
"""
print 'hah'
print >> sys.stdout, 'new items is %d ' %(new_item_num)
print >> sys.stdout, 'new items is %d ' %(new_item_n)
print >> sys.stdout, 'total items (rm dup) %d' %(len(item_set))
def compute_user_bhr_dis(fbefore_str):
    """Print the distribution of users by number of behaviour records.

    fbefore_str: path of the behaviour records, read through
        ``get_user_history``.

    Users are grouped into buckets of 1000 records and one line
    ``[low,high] user_count`` is written to stdout per non-empty bucket.
    """
    base = 1000
    num_dict = dict()
    for records in get_user_history(fbefore_str).values():
        # Floor division maps the record count onto its bucket index.
        bucket = len(records) // base
        num_dict[bucket] = num_dict.get(bucket, 0) + 1
    for bucket, user_count in num_dict.items():
        summary = '[%d,%d] %d' % (bucket * base, (bucket + 1) * base, user_count)
        sys.stdout.write(summary + '\n')
def get_user_history(fbefore_str):
    """Group the records of a behaviour file by user.

    fbefore_str: path of a comma-separated file whose first column is the
        user id.

    Returns a dict mapping each user id to the list of that user's records in
    file order; every record is the remaining columns joined by ``'\\001'``.
    """
    user_dict = dict()
    with open(fbefore_str) as history_file:
        for record in history_file:
            fields = record.strip().split(',')
            user, rest = fields[0], fields[1:]
            user_dict.setdefault(user, []).append('\001'.join(rest))
    return user_dict
if __name__ == '__main__':
    # Behaviour window [begin_date, split_date]; the statistics are computed
    # for the purchases of the day that follows the window.
    begin_date = datetime.datetime(2014, 11, 18)
    split_date = datetime.datetime(2014, 12, 17)
    td = datetime.timedelta(1)
    next_date = split_date + td

    # Month/day tags used in every file name below.
    begin_tag = begin_date.strftime('%m%d')
    split_tag = split_date.strftime('%m%d')
    next_tag = next_date.strftime('%m%d')

    # Input files.
    data_dir = utils.get_data_dir(utils.FLAG_TRAIN_TEST)
    cf_dir = utils.get_data_dir(utils.FLAG_CF)
    frate_str = '%s/rate_%s_%s' % (cf_dir, begin_tag, split_tag)
    fbefore_str = '%s/data_%s_%s' % (data_dir, begin_tag, split_tag)
    fbuy_str = '%s/data_buy_%s' % (data_dir, next_tag)

    # Output files.
    candidate_dir = utils.get_data_dir(utils.FLAG_STAT)
    fout_str = '%s/user_no_behavior_in_buy_%s' % (candidate_dir, next_tag)
    fpurchase_str = '%s/purchase_compose_%s' % (candidate_dir, next_tag)

    # Only the purchase breakdown is enabled; the other statistics are kept
    # here so they can be switched on by hand.
    #get_new_items_in_purchase(frate_str,fbuy_str)
    compute_purchase_compose(fbuy_str, fbefore_str, fpurchase_str)
    #user_no_history_in_purchase(fbuy_str,fbefore_str,fout_str)
    #compute_user_bhr_dis(fbefore_str)

    # Former driver, disabled.  It refers to a ``parent_dir`` variable that
    # is not defined in this block.
    #
    # is_filter = False
    # if is_filter:
    #     fbuy_str = "%s/data/train_test/data_%s_buy_filter"%(parent_dir,next_date.strftime('%m%d'))
    #     fout= "%s/data/candidate/purchase_compose_filter"%(parent_dir)
    #     filter_buy_records_by_selected_item("%s/data/train_test/data_%s_buy"%(parent_dir,next_date.strftime('%m%d')),"%s/resource/item"%(parent_dir),fbuy_str)
    # else:
    #     fbuy_str = "%s/data/train_test/data_%s_buy"%(parent_dir,next_date.strftime('%m%d'))
    #     fout= "%s/data/candidate/purchase_compose"%(parent_dir)
    # compute_purchase_compose(fbuy_str,'%s/data/train_test/data_1118_%s'%(parent_dir,split_date.strftime('%m%d')),fout,True)
    # compute_item_distribution_in_trans('%s/data/ar/ar_cate/trans_1208_%s'%(parent_dir,split_date.strftime('%m%d')))
    # #item_distribution('%s/data/ar/trans_1118_%s'%(parent_dir,split_date.strftime('%m%d')))