def __init__(self, data_dir='E:\\gitshell\\tianchi2'):
    """Allocate the lookup tables used by the matching model.

    Args:
        data_dir: Directory holding the input/output text files. Defaults to
            the path that used to be hard-coded, so existing callers are
            unaffected.
    """
    self.data_dir = data_dir

    # ---- words -------------------------------------------------------
    self.word_num = 0                      # number of distinct words seen
    self.dict_word = {}                    # word_id -> row index in word_M
    self.top_k_word = 20000                # most frequent words modelled individually
    # column 0: word_id, column 1: log-probability of the word
    self.word_M = np.zeros((1000000, 2))
    self.word_item_array = [""] * 1000000  # per word: the items that use it
    self.word_word = np.zeros((3, 3))      # placeholder, replaced once statistics exist

    # ---- words of the items that must be predicted --------------------
    self.r_word_num = 0
    self.r_dict_word = {}
    self.r_word_M = np.zeros((80000, 2))
    self.test_item = []

    # ---- items --------------------------------------------------------
    self.dict_item = {}                    # item_id -> row index in item_M
    self.item_M = np.zeros((600000, 2), int)   # column 0: item_id, column 1: category id
    self.item_word_array = [""] * 600000   # per item: comma separated word ids
    self.item_num = 0

    # ---- categories ---------------------------------------------------
    # column 0: category id, column 1: item count / log-probability
    self.class_M = np.zeros((3000000, 2))
    self.dict_class = {}
    self.class_num = 0
    self.class_class = np.zeros((2, 2))    # placeholder, replaced once statistics exist

    # Hand-curated ("human experience") matches; loaded eagerly.
    self.exp_peo = exp_of_people()
    self.exp_peo.read_jingyan()

    # Probability estimation helper.
    self.pro_guji = Pro_estimate()

    # Only the `item_top_k` hottest items are scored as candidates.
    self.item_top_k = 60000

    # Prior probability that two arbitrary items match.
    self.p_match = 0.0006
def __init__(self, data_dir='E:\\gitshell\\tianchi3'):
    """Allocate the lookup tables used by the matching model.

    Args:
        data_dir: Directory holding the input/output text files. Defaults to
            the path that used to be hard-coded, so existing callers are
            unaffected.
    """
    self.data_dir = data_dir

    # ---- words -------------------------------------------------------
    self.word_num = 0                      # number of distinct words seen
    self.dict_word = {}                    # word_id -> row index in word_M
    self.top_k_word = 15000                # most frequent words modelled individually
    # column 0: word_id, column 1: log-probability of the word
    self.word_M = np.zeros((1000000, 2))
    self.word_item_array = [""] * 1000000  # per word: the items that use it
    self.word_word = np.zeros((3, 3))      # placeholder, replaced once statistics exist

    # ---- words of the items that must be predicted --------------------
    self.r_word_num = 0
    self.r_dict_word = {}
    self.r_word_M = np.zeros((80000, 2))
    self.test_item = []

    # ---- items --------------------------------------------------------
    self.dict_item = {}                    # item_id -> row index in item_M
    self.item_M = np.zeros((600000, 2), int)   # column 0: item_id, column 1: category id
    self.item_word_array = [""] * 600000   # per item: comma separated word ids
    self.item_num = 0

    # ---- categories ---------------------------------------------------
    # column 0: category id, column 1: item count / log-probability
    self.class_M = np.zeros((3000000, 2))
    self.dict_class = {}
    self.class_num = 0
    self.class_class = np.zeros((2, 2))    # placeholder, replaced once statistics exist

    # Probability estimation helper.
    self.pro_guji = Pro_estimate()

    # Only the `item_top_k` hottest items are scored as candidates.
    self.item_top_k = 100000

    # Prior probability that two arbitrary items match.
    self.p_match = 0.0006
def __init__(self, data_dir="E:\\gitshell\\tianchi3"):
    """Allocate the lookup tables used by the word-based matching model.

    Args:
        data_dir: Directory holding the input/output text files. Defaults to
            the path that used to be hard-coded, so existing callers are
            unaffected.
    """
    self.data_dir = data_dir
    # Word -> item log-probabilities (placeholder until computed).
    self.my_matrix = np.zeros((2, 2))

    # ---- words -------------------------------------------------------
    self.word_num = 0                      # number of distinct words seen
    self.dict_word = {}                    # word_id -> row index in word_M
    self.top_k_word = 15000                # most frequent words modelled individually
    # column 0: word_id, column 1: probability of the word
    self.word_M = np.zeros((1000000, 2))
    self.word_item_array = [""] * 1000000  # per word: the items that use it
    self.word_word = np.zeros((3, 3))      # holds plain (non-log) probabilities

    # ---- words of the items that must be predicted --------------------
    self.r_word_num = 0
    self.r_dict_word = {}
    self.r_word_M = np.zeros((80000, 2))   # column 0: word_id, column 1: occurrence count
    self.test_item = []

    # ---- items --------------------------------------------------------
    self.dict_item = {}                    # item_id -> row index in item_M
    self.item_M = np.zeros((600000, 2), int)   # column 0: item_id, column 1: category id
    self.item_word_array = [""] * 600000   # per item: comma separated word ids
    self.item_num = 0

    # Probability estimation helper.
    self.pro_guji = Pro_estimate()

    # Only the `item_top_k` hottest items are scored as candidates.
    self.item_top_k = 100000

    # Prior probability that two arbitrary items match.
    self.p_match = 0.0006
    self.num_word2 = 800
class most_like():
    """Item-matching scorer built from word and category co-occurrence statistics.

    Typical flow (as far as this class shows): ``read_txt`` loads the item
    table, ``read_item_hot`` re-ranks items by popularity, ``result_word``
    collects the words of the items to predict, ``my_tongji2`` /
    ``my_tongji3`` (or ``read_word_word``) build the conditional probability
    tables and ``da_pei2`` writes the match probabilities.

    Progress messages use the single-argument ``print(...)`` form, which
    behaves the same under Python 2 and Python 3.
    """

    def __init__(self, data_dir='E:\\gitshell\\tianchi3'):
        """Allocate the lookup tables.

        Args:
            data_dir: Directory holding the input/output text files. Defaults
                to the previously hard-coded path.
        """
        self.data_dir = data_dir

        # ---- words ---------------------------------------------------
        self.word_num = 0                      # number of distinct words seen
        self.dict_word = {}                    # word_id -> row index in word_M
        self.top_k_word = 15000                # most frequent words modelled individually
        # column 0: word_id, column 1: count, later log-probability
        self.word_M = np.zeros((1000000, 2))
        self.word_item_array = [""] * 1000000  # per word: the items that use it
        self.word_word = np.zeros((3, 3))      # placeholder for P(word2 | word1)

        # ---- words of the items that must be predicted ----------------
        self.r_word_num = 0
        self.r_dict_word = {}                  # word_id -> row index in r_word_M / word_word
        self.r_word_M = np.zeros((80000, 2))   # column 0: word_id, column 1: count
        self.test_item = []

        # ---- items ----------------------------------------------------
        self.dict_item = {}                    # item_id -> row index in item_M
        self.item_M = np.zeros((600000, 2), int)   # column 0: item_id, column 1: category id
        self.item_word_array = [""] * 600000   # per item: comma separated word ids
        self.item_num = 0

        # ---- categories -----------------------------------------------
        # column 0: category id, column 1: item count, later log-probability
        self.class_M = np.zeros((3000000, 2))
        self.dict_class = {}
        self.class_num = 0
        self.class_class = np.zeros((2, 2))    # placeholder for P(class2 | class1)

        # Probability estimation helper.
        self.pro_guji = Pro_estimate()

        # Only the `item_top_k` hottest items are scored as candidates.
        self.item_top_k = 100000

        # Prior probability that two arbitrary items match.
        self.p_match = 0.0006

    def read_txt(self, filename="dim_items.txt"):
        """Load the item table (``item_id category_id word,word,...`` per line).

        Fills the item, word and category tables, re-ranks words by
        descending frequency and converts the word/category counts in
        column 1 of ``word_M`` / ``class_M`` into log-probabilities.
        """
        r_path = os.path.join(self.data_dir, filename)
        self.item_num = 0
        with open(r_path, 'r') as r_stream:
            for line_i in r_stream:
                if self.item_num % 100000 == 0:
                    print("%s %s" % (self.item_num, time.time()))

                # Register the item.
                my_str = line_i.strip('\n').split(" ")
                item_id = int(my_str[0])
                class_id = int(my_str[1])
                self.dict_item[item_id] = self.item_num
                self.item_M[self.item_num, :] = [item_id, class_id]
                self.item_word_array[self.item_num] = my_str[2]
                self.item_num += 1

                # Count every word of the item.
                for x_word in my_str[2].split(','):
                    try:
                        word_id = int(x_word)
                    except ValueError:
                        continue  # empty / malformed token
                    word_ind = self.dict_word.get(word_id, -1)
                    if word_ind == -1:
                        self.dict_word[word_id] = self.word_num
                        self.word_M[self.word_num, :] = [word_id, 1]
                        self.word_num += 1
                    else:
                        self.word_M[word_ind, 1] += 1

                # Count the category.
                class_ind = self.dict_class.get(class_id, -1)
                if class_ind == -1:
                    self.dict_class[class_id] = self.class_num
                    self.class_M[self.class_num, :] = [class_id, 1]
                    self.class_num += 1
                else:
                    self.class_M[class_ind, 1] += 1

        # Drop the unused tail of the pre-allocated tables.
        self.class_M = self.class_M[0:self.class_num, :]
        self.word_M = self.word_M[0:self.word_num, :]
        self.item_M = self.item_M[0:self.item_num, :]
        self.item_word_array = self.item_word_array[0:self.item_num]

        # Re-rank words by descending frequency and refresh the index map.
        order = np.argsort(-self.word_M[:, 1])
        self.word_M = self.word_M[order, :]
        for x in range(self.word_num):
            self.dict_word[int(self.word_M[x, 0])] = x

        # Counts -> log-probabilities.
        self.word_M[:, 1] = np.log(self.word_M[:, 1] / self.word_M[:, 1].sum())
        self.class_M[:, 1] = np.log(self.class_M[:, 1] / self.class_M[:, 1].sum())

    def result_word(self, file_name='test_set.txt'):
        """Collect the words of the items that must be predicted.

        Each line's last tab-separated field is an item id. Every id is
        appended to ``test_item``; ids missing from ``dict_item`` contribute
        no words. Afterwards words are ranked by descending count and only
        those seen more than once keep their own row; ``r_word_num`` becomes
        that row count.

        ``r_dict_word`` is re-indexed to the sorted order (the original left
        it in insertion order, so the cut-off was applied to the wrong
        words). Word-word tables cached with the old indexing must be
        regenerated.
        """
        file_name = os.path.join(self.data_dir, file_name)
        with open(file_name, 'r') as r_stream:
            for line_i in r_stream:
                item_id = int(line_i.strip().split('\t')[-1])
                self.test_item.append(item_id)
                item_ind = self.dict_item.get(item_id, -1)
                if item_ind == -1:
                    continue
                for token in self.item_word_array[item_ind].split(','):
                    try:
                        word_id = int(token)
                    except ValueError:
                        continue  # same tolerance as read_txt
                    word_ind = self.r_dict_word.get(word_id, -1)
                    if word_ind == -1:
                        self.r_dict_word[word_id] = self.r_word_num
                        self.r_word_M[self.r_word_num, :] = [word_id, 1]
                        self.r_word_num += 1
                    else:
                        self.r_word_M[word_ind, 1] += 1

        order = np.argsort(-self.r_word_M[0:self.r_word_num, 1])
        self.r_word_M = self.r_word_M[order, :]
        # Keep the id -> row map in step with the sorted table.
        for rank in range(self.r_word_num):
            self.r_dict_word[int(self.r_word_M[rank, 0])] = rank

        # First row whose count is 1: everything from there on is "rare".
        i_record = self.r_word_num
        for i in range(self.r_word_num):
            if self.r_word_M[i, 1] == 1:
                i_record = i
                print(i_record)
                break
        self.r_word_num = i_record
        self.r_word_M = self.r_word_M[0:i_record, :]

    def my_tongji2(self):
        """Build P(category2 | category1) from ``class_class.txt``.

        The input holds ``class_id1<TAB>class_id2<TAB>count`` lines. Raw
        counts go to ``class_class1.txt``; a debug trace for row 1 goes to
        ``ceshi_class_class2.txt``. Each count is replaced by
        ``pro_guji.get_pro_r(prior, count, row_total)`` (presumably a
        smoothed conditional probability -- see ``Pro_estimate``).
        """
        r_path = os.path.join(self.data_dir, "class_class.txt")
        self.class_class = np.zeros((self.class_num, self.class_num))
        with open(r_path, 'r') as r_stream:
            for line in r_stream:
                my_str = line.strip().split('\t')
                class_ind1 = self.dict_class[int(my_str[0])]
                class_ind2 = self.dict_class[int(my_str[1])]
                self.class_class[class_ind1, class_ind2] += int(my_str[2])
        row_sum = self.class_class.sum(1)

        raw_path = os.path.join(self.data_dir, "class_class1.txt")
        debug_path = os.path.join(self.data_dir, "ceshi_class_class2.txt")
        with open(raw_path, 'w') as w_path1, open(debug_path, 'w') as w_path2:
            for ind1 in range(self.class_num):
                # Dump the raw counts of the row before they are overwritten.
                if self.class_num:
                    w_path1.write(
                        '\t'.join(str(c) for c in self.class_class[ind1, :]) + '\n')
                for ind2 in range(self.class_num):
                    p_pre = np.exp(self.class_M[ind2, 1])  # prior of category ind2
                    a = int(self.class_class[ind1, ind2])
                    self.class_class[ind1, ind2] = self.pro_guji.get_pro_r(
                        p_pre, self.class_class[ind1, ind2], row_sum[ind1])
                    if ind1 == 1:
                        w_path2.write(
                            str(p_pre) + '\t' + str(self.class_class[ind1, ind2]) +
                            '\t' + str(a) + '\t' + str(row_sum[ind1]) + '\n')

    def my_tongji3(self):
        """Build P(word2 | word1) from ``learn_wordstr_wordstr0.txt``.

        Input lines are ``words1<TAB>words2<TAB>count`` with comma separated
        word ids. Rows are predicted words (last row: all rare ones),
        columns the ``top_k_word`` most frequent words (last column: the
        rest). Log-probabilities are stored in ``word_word_pro0.txt``;
        ``self.word_word`` keeps the plain probabilities.
        """
        temp_array = np.zeros((self.r_word_num + 1, self.top_k_word + 1))
        # Prior mass of all words outside the top-k.
        p_remain = np.exp(self.word_M[self.top_k_word:, 1]).sum()
        r_path = os.path.join(self.data_dir, "learn_wordstr_wordstr0.txt")
        with open(r_path, 'r') as r_stream:
            for wor_str in r_stream:
                my_str = wor_str.strip().split('\t')
                if len(my_str) < 3 or my_str[0] == '' or my_str[1] == '':
                    continue
                word_s = my_str[1].split(',')
                num = int(my_str[2])
                for word_id1 in my_str[0].split(','):
                    word_ind1 = self.r_dict_word.get(int(word_id1), -1)
                    if word_ind1 == -1:
                        continue  # not a word we need to predict from
                    word_ind1 = min(word_ind1, self.r_word_num)
                    for word_id2 in word_s:
                        word_ind2 = self.dict_word.get(int(word_id2), -1)
                        if word_ind2 == -1:
                            continue  # unknown word
                        word_ind2 = min(word_ind2, self.top_k_word)
                        temp_array[word_ind1, word_ind2] += num

        # Counts -> estimated conditional probabilities.
        temp_array_sum = temp_array.sum(1)
        (row_num, col_num) = temp_array.shape
        debug_path = os.path.join(self.data_dir, "ceshi_tongji3.txt")
        with open(debug_path, 'w') as o_stream:
            for i_col in range(col_num):
                if i_col % 200 == 0:
                    print("%s %s" % (i_col, time.time()))
                if i_col == self.top_k_word:
                    p_pre = p_remain
                else:
                    p_pre = np.exp(self.word_M[i_col, 1])
                for i_row in range(row_num):
                    a = temp_array[i_row, i_col]
                    temp_array[i_row, i_col] = self.pro_guji.get_pro_r(
                        p_pre, temp_array[i_row, i_col], temp_array_sum[i_row])
                    if i_row == 1:
                        o_stream.write(
                            str(p_pre) + '\t' + str(temp_array[i_row, i_col]) +
                            '\t' + str(a) + '\t' + str(temp_array_sum[i_row]) + '\n')

        # Persist as log-probabilities, one row per line.
        w_file = os.path.join(self.data_dir, "word_word_pro" + str(0) + '.txt')
        with open(w_file, 'w') as w_stream:
            for i_row in range(row_num):
                w_stream.write(
                    ','.join(str(math.log(v)) for v in temp_array[i_row, :]) + '\n')
        self.word_word = temp_array

    def read_word_word(self):
        """Reload the table written by ``my_tongji3`` (as plain probabilities)."""
        n_col = self.top_k_word + 1
        self.word_word = np.zeros((self.r_word_num + 1, n_col))
        i_line = 0
        with open(os.path.join(self.data_dir, "word_word_pro0.txt"), 'r') as o_stream:
            for line in o_stream:
                my_str = line.strip().split(',')
                self.word_word[i_line, :] = [
                    math.exp(float(my_str[x])) for x in range(n_col)]
                i_line += 1
        if i_line == (self.r_word_num + 1):
            print("%s %s" % (time.time(), "good"))

    def read_item_hot(self, write=True):
        """Re-rank the items by descending popularity.

        Reads ``my_item_hot.txt`` (``item_id<TAB>...<TAB>count``), reorders
        ``item_M``, ``item_word_array`` and ``dict_item`` accordingly and,
        when ``write`` is true, rewrites the file in ranked order.

        ``item_word_array`` is now reordered as well; the original left it
        in load order, so after this call an item index paired the category
        of one item with the words of another.
        """
        path = os.path.join(self.data_dir, 'my_item_hot.txt')
        nums_array = np.zeros(self.item_num, dtype=int)
        with open(path, 'r') as r_stream:
            for line_i in r_stream:
                my_str = line_i.strip().split('\t')
                nums_array[self.dict_item[int(my_str[0])]] = int(my_str[-1])

        a = np.argsort(-nums_array)  # descending popularity
        self.item_M = self.item_M[a, :]
        self.item_word_array = [self.item_word_array[i] for i in a]
        for x in range(self.item_num):
            self.dict_item[int(self.item_M[x, 0])] = x

        if write:
            nums_array = nums_array[a]
            with open(os.path.join(self.data_dir, 'my_item_hot.txt'), 'w') as w_stream:
                for x in range(self.item_num):
                    w_stream.write(
                        str(int(self.item_M[x, 0])) + '\t' + str(nums_array[x]) + '\n')

    def _match_log_odds(self, item_ind):
        """Return an ``(item_top_k, 2)`` array of log-odds for one query item.

        Column 0 is the word evidence, column 1 the category evidence; each
        is ``log P(. | match) - log P(. | no match)`` for the candidate.
        """
        scores = np.zeros((self.item_top_k, 2))
        class_ind = self.dict_class[int(self.item_M[item_ind, 1])]
        class_pro = np.log(self.class_class[class_ind, :])   # given a match
        class_pro2 = self.class_M[:, 1]                       # without a match

        # Summed log P(word | query word) over all usable query words.
        temp_word_pro = np.zeros(self.top_k_word + 1)
        for word_id in self.item_word_array[item_ind].split(','):
            try:
                word_id2i = int(word_id)
            except ValueError:
                continue
            word_ind1 = self.r_dict_word.get(word_id2i, -1)
            if word_ind1 == -1:
                continue  # not a modelled word
            word_ind1 = min(word_ind1, self.r_word_num)
            temp_word_pro += np.log(self.word_word[word_ind1, :])

        # NOTE(review): for the "rest" bucket (index top_k_word) this uses
        # the log-probability of the single word at that rank, whereas
        # my_tongji3 uses the summed remainder mass as prior -- confirm
        # which is intended.
        temp_word_pro2 = self.word_M[:, 1]

        for cand_ind in range(self.item_top_k):
            cand_words = self.item_word_array[cand_ind]
            cand_class = self.dict_class[int(self.item_M[cand_ind, 1])]
            scores[cand_ind, 1] = class_pro[cand_class] - class_pro2[cand_class]
            if cand_words == "":
                continue
            for word_id2 in cand_words.split(','):
                try:
                    word_ind2 = self.dict_word.get(int(word_id2), -1)
                except ValueError:
                    continue  # same tolerance as read_txt
                if word_ind2 == -1:
                    continue
                word_ind2 = min(word_ind2, self.top_k_word)
                scores[cand_ind, 0] += temp_word_pro[word_ind2] - temp_word_pro2[word_ind2]
        return scores

    def da_pei2(self):
        """Write match probabilities of every test item against the hottest items.

        Three tab-separated files are produced, one line per test item
        (item id followed by ``item_top_k`` values rounded to 9 decimals):
        ``fm_submissions2_tag_m.txt`` (posterior using the ``p_match``
        prior), ``fm_submissions2_tag_w.txt`` (word likelihood ratio) and
        ``fm_submissions2_tag_c.txt`` (category likelihood ratio).

        Test items missing from ``dict_item`` are skipped (``result_word``
        tolerates them, the original crashed here with ``KeyError``). All
        three files are closed reliably; the original closed only the first.
        """
        file_name = os.path.join(self.data_dir, 'fm_submissions2_tag_m.txt')
        file_name2 = os.path.join(self.data_dir, 'fm_submissions2_tag_c.txt')  # category view
        file_name1 = os.path.join(self.data_dir, 'fm_submissions2_tag_w.txt')  # word view
        with open(file_name, 'w') as w_stream, \
                open(file_name1, 'w') as w_stream1, \
                open(file_name2, 'w') as w_stream2:
            t_b = time.time()
            for iii, item_id in enumerate(self.test_item):
                t_n = time.time()
                if iii % 100 == 0 or t_n > t_b + 100:
                    t_b = t_n
                    print("%s %s" % (t_n, iii))
                item_ind = self.dict_item.get(item_id, -1)
                if item_ind == -1:
                    continue

                scores = self._match_log_odds(item_ind)
                a = scores[:, 0] + scores[:, 1]
                # Bayes: posterior match probability from prior and likelihood ratio.
                ratio = np.exp(a)
                pro_a = self.p_match * ratio / (self.p_match * ratio + (1 - self.p_match) * 1)
                word_ratio = np.exp(scores[:, 0])
                class_ratio = np.exp(scores[:, 1])

                prefix = str(item_id) + '\t'
                w_stream.write(
                    prefix + '\t'.join(str(round(v, 9)) for v in pro_a) + '\n')
                w_stream1.write(
                    prefix + '\t'.join(str(round(v, 9)) for v in word_ratio) + '\n')
                w_stream2.write(
                    prefix + '\t'.join(str(round(v, 9)) for v in class_ratio) + '\n')
class most_like():
    """Item-matching scorer built from word and category co-occurrence statistics.

    Typical flow (as far as this class shows): ``read_txt`` loads the item
    table, ``read_item_hot`` re-ranks items by popularity, ``result_word``
    collects the words of the items to predict, ``my_tongji1`` /
    ``my_tongji2`` / ``my_tongji3`` (or ``read_word_word``) build the
    conditional probability tables and ``da_pei`` / ``da_pei2`` write the
    results.

    Progress messages use the single-argument ``print(...)`` form, which
    behaves the same under Python 2 and Python 3.
    """

    def __init__(self, data_dir='E:\\gitshell\\tianchi2'):
        """Allocate the lookup tables and load the hand-curated matches.

        Args:
            data_dir: Directory holding the input/output text files. Defaults
                to the previously hard-coded path.
        """
        self.data_dir = data_dir

        # ---- words ---------------------------------------------------
        self.word_num = 0                      # number of distinct words seen
        self.dict_word = {}                    # word_id -> row index in word_M
        self.top_k_word = 20000                # most frequent words modelled individually
        # column 0: word_id, column 1: count, later log-probability
        self.word_M = np.zeros((1000000, 2))
        self.word_item_array = [""] * 1000000  # per word: the items that use it
        self.word_word = np.zeros((3, 3))      # placeholder for P(word2 | word1)

        # ---- words of the items that must be predicted ----------------
        self.r_word_num = 0
        self.r_dict_word = {}                  # word_id -> row index in r_word_M / word_word
        self.r_word_M = np.zeros((80000, 2))   # column 0: word_id, column 1: count
        self.test_item = []

        # ---- items ----------------------------------------------------
        self.dict_item = {}                    # item_id -> row index in item_M
        self.item_M = np.zeros((600000, 2), int)   # column 0: item_id, column 1: category id
        self.item_word_array = [""] * 600000   # per item: comma separated word ids
        self.item_num = 0

        # ---- categories -----------------------------------------------
        # column 0: category id, column 1: item count, later log-probability
        self.class_M = np.zeros((3000000, 2))
        self.dict_class = {}
        self.class_num = 0
        self.class_class = np.zeros((2, 2))    # placeholder for P(class2 | class1)

        # Hand-curated ("human experience") matches; loaded eagerly.
        self.exp_peo = exp_of_people()
        self.exp_peo.read_jingyan()

        # Probability estimation helper.
        self.pro_guji = Pro_estimate()

        # Only the `item_top_k` hottest items are scored as candidates.
        self.item_top_k = 60000

        # Prior probability that two arbitrary items match.
        self.p_match = 0.0006

    def read_txt(self, filename="dim_items.txt"):
        """Load the item table (``item_id category_id word,word,...`` per line).

        Fills the item, word and category tables, re-ranks words by
        descending frequency and converts the word/category counts in
        column 1 of ``word_M`` / ``class_M`` into log-probabilities.
        """
        r_path = os.path.join(self.data_dir, filename)
        self.item_num = 0
        with open(r_path, 'r') as r_stream:
            for line_i in r_stream:
                if self.item_num % 100000 == 0:
                    print("%s %s" % (self.item_num, time.time()))

                # Register the item.
                my_str = line_i.strip('\n').split(" ")
                item_id = int(my_str[0])
                class_id = int(my_str[1])
                self.dict_item[item_id] = self.item_num
                self.item_M[self.item_num, :] = [item_id, class_id]
                self.item_word_array[self.item_num] = my_str[2]
                self.item_num += 1

                # Count every word of the item.
                for x_word in my_str[2].split(','):
                    try:
                        word_id = int(x_word)
                    except ValueError:
                        continue  # empty / malformed token
                    word_ind = self.dict_word.get(word_id, -1)
                    if word_ind == -1:
                        self.dict_word[word_id] = self.word_num
                        self.word_M[self.word_num, :] = [word_id, 1]
                        self.word_num += 1
                    else:
                        self.word_M[word_ind, 1] += 1

                # Count the category.
                class_ind = self.dict_class.get(class_id, -1)
                if class_ind == -1:
                    self.dict_class[class_id] = self.class_num
                    self.class_M[self.class_num, :] = [class_id, 1]
                    self.class_num += 1
                else:
                    self.class_M[class_ind, 1] += 1

        # Drop the unused tail of the pre-allocated tables.
        self.class_M = self.class_M[0:self.class_num, :]
        self.word_M = self.word_M[0:self.word_num, :]
        self.item_M = self.item_M[0:self.item_num, :]
        self.item_word_array = self.item_word_array[0:self.item_num]

        # Re-rank words by descending frequency and refresh the index map.
        order = np.argsort(-self.word_M[:, 1])
        self.word_M = self.word_M[order, :]
        for x in range(self.word_num):
            self.dict_word[int(self.word_M[x, 0])] = x

        # Counts -> log-probabilities.
        self.word_M[:, 1] = np.log(self.word_M[:, 1] / self.word_M[:, 1].sum())
        self.class_M[:, 1] = np.log(self.class_M[:, 1] / self.class_M[:, 1].sum())

    def result_word(self, file_name='test_items2.txt'):
        """Collect the words of the items that must be predicted.

        Each line holds one item id. Every id is appended to ``test_item``;
        ids missing from ``dict_item`` contribute no words. Afterwards words
        are ranked by descending count and only those seen more than once
        keep their own row; ``r_word_num`` becomes that row count.

        ``r_dict_word`` is re-indexed to the sorted order (the original left
        it in insertion order, so the cut-off was applied to the wrong
        words). Word-word tables cached with the old indexing must be
        regenerated.
        """
        file_name = os.path.join(self.data_dir, file_name)
        with open(file_name, 'r') as r_stream:
            for line_i in r_stream:
                item_id = int(line_i.strip())
                self.test_item.append(item_id)
                item_ind = self.dict_item.get(item_id, -1)
                if item_ind == -1:
                    continue
                for token in self.item_word_array[item_ind].split(','):
                    try:
                        word_id = int(token)
                    except ValueError:
                        continue  # same tolerance as read_txt
                    word_ind = self.r_dict_word.get(word_id, -1)
                    if word_ind == -1:
                        self.r_dict_word[word_id] = self.r_word_num
                        self.r_word_M[self.r_word_num, :] = [word_id, 1]
                        self.r_word_num += 1
                    else:
                        self.r_word_M[word_ind, 1] += 1

        order = np.argsort(-self.r_word_M[0:self.r_word_num, 1])
        self.r_word_M = self.r_word_M[order, :]
        # Keep the id -> row map in step with the sorted table.
        for rank in range(self.r_word_num):
            self.r_dict_word[int(self.r_word_M[rank, 0])] = rank

        # First row whose count is 1: everything from there on is "rare".
        i_record = self.r_word_num
        for i in range(self.r_word_num):
            if self.r_word_M[i, 1] == 1:
                i_record = i
                break
        self.r_word_num = i_record
        self.r_word_M = self.r_word_M[0:i_record, :]

    def get_item_array(self, word_id):
        """Return the item ids recorded for ``word_id`` in ``word_item_array``.

        Unknown words and words with an empty item list yield ``[]`` (the
        original raised ``ValueError`` on the empty string).
        """
        word_ind = self.dict_word.get(word_id, -1)
        if word_ind == -1:
            return []
        return [int(item)
                for item in self.word_item_array[word_ind].split(',')
                if item != '']

    def my_tongji1(self):
        """Build word-word statistics from the hand-curated item matches.

        For every predicted word, every item using it and every item the
        curated data associates with that item, the words of the associated
        item are counted. Counts are turned into probabilities with
        ``pro_guji.get_pro_r`` and their logs written to
        ``word_word_pro0.txt``.

        Fixes over the original: priors are passed as probabilities (not
        log-probabilities), matching ``my_tongji3``; the file index is an
        integer so the name matches what ``read_word_word`` opens; the table
        is also written when there is exactly one predicted word.

        NOTE(review): the table has ``r_word_num`` rows, one fewer than
        ``read_word_word`` allocates -- confirm before mixing the two.
        """
        temp_array = np.zeros((self.r_word_num, self.top_k_word + 1))
        # Prior mass of all words outside the top-k.
        p_remain = np.exp(self.word_M[self.top_k_word:, 1]).sum()

        for word_ind in range(self.r_word_num):
            word_id = int(self.r_word_M[word_ind, 0])
            for item_id in self.get_item_array(word_id):
                item_array2 = self.exp_peo.associated_items(int(item_id))
                if len(item_array2) == 0:
                    continue
                for item_id2 in item_array2:
                    item_ind = self.dict_item.get(item_id2, -1)
                    if item_ind == -1:
                        continue
                    word_str = self.item_word_array[item_ind]
                    if word_str == '':
                        continue
                    for word_id2 in word_str.split(','):
                        try:
                            word_ind2 = self.dict_word[int(word_id2)]
                        except ValueError:
                            continue  # same tolerance as read_txt
                        word_ind2 = min(word_ind2, self.top_k_word)
                        temp_array[word_ind, word_ind2] += 1

        if self.r_word_num == 0:
            return

        # Counts -> estimated conditional probabilities.
        temp_array_sum = temp_array.sum(1)
        (row_num, col_num) = temp_array.shape
        for i_col in range(col_num):
            if i_col % 200 == 0:
                print("%s %s" % (i_col, time.time()))
            if i_col == self.top_k_word:
                p_pre = p_remain
            else:
                p_pre = np.exp(self.word_M[i_col, 1])
            for i_row in range(row_num):
                temp_array[i_row, i_col] = self.pro_guji.get_pro_r(
                    p_pre, temp_array[i_row, i_col], temp_array_sum[i_row])

        # Persist as log-probabilities, one row per line.
        w_file = os.path.join(self.data_dir, "word_word_pro" + str(0) + '.txt')
        with open(w_file, 'w') as w_stream:
            for i_row in range(row_num):
                w_stream.write(
                    ','.join(str(math.log(v)) for v in temp_array[i_row, :]) + '\n')

    def my_tongji2(self):
        """Build P(category2 | category1) from ``class_class.txt``.

        The input holds ``class_id1<TAB>class_id2<TAB>count`` lines. Raw
        counts go to ``class_class1.txt``, estimated probabilities to
        ``class_class2.txt`` and the category priors to
        ``class_class3.txt``. Each count is replaced by
        ``pro_guji.get_pro_r(prior, count, row_total)`` (presumably a
        smoothed conditional probability -- see ``Pro_estimate``).
        """
        r_path = os.path.join(self.data_dir, "class_class.txt")
        self.class_class = np.zeros((self.class_num, self.class_num))
        with open(r_path, 'r') as r_stream:
            for line in r_stream:
                my_str = line.strip().split('\t')
                class_ind1 = self.dict_class[int(my_str[0])]
                class_ind2 = self.dict_class[int(my_str[1])]
                self.class_class[class_ind1, class_ind2] += int(my_str[2])
        row_sum = self.class_class.sum(1)

        with open(os.path.join(self.data_dir, "class_class1.txt"), 'w') as w_path1, \
                open(os.path.join(self.data_dir, "class_class2.txt"), 'w') as w_path2, \
                open(os.path.join(self.data_dir, "class_class3.txt"), 'w') as w_path3:
            for ind1 in range(self.class_num):
                w_path3.write(str(np.exp(self.class_M[ind1, 1])) + '\t')
                # Raw counts of the row, before they are overwritten.
                w_path1.write(
                    '\t'.join(str(c) for c in self.class_class[ind1, :]) + '\n')
                for ind2 in range(self.class_num):
                    p_pre = np.exp(self.class_M[ind2, 1])  # prior of category ind2
                    self.class_class[ind1, ind2] = self.pro_guji.get_pro_r(
                        p_pre, self.class_class[ind1, ind2], row_sum[ind1])
                # Estimated P(ind2 | ind1) for the whole row.
                w_path2.write(
                    '\t'.join(str(p) for p in self.class_class[ind1, :]) + '\n')

    def my_tongji3(self):
        """Build P(word2 | word1) from ``wordstr_wordstr.txt``.

        Input lines are ``words1<TAB>words2<TAB>count`` with comma separated
        word ids. Rows are predicted words (last row: all rare ones),
        columns the ``top_k_word`` most frequent words (last column: the
        rest). Log-probabilities are stored in ``word_word_pro0.txt``;
        ``self.word_word`` keeps the plain probabilities.
        """
        temp_array = np.zeros((self.r_word_num + 1, self.top_k_word + 1))
        # Prior mass of all words outside the top-k.
        p_remain = np.exp(self.word_M[self.top_k_word:, 1]).sum()
        r_path = os.path.join(self.data_dir, "wordstr_wordstr.txt")
        with open(r_path, 'r') as r_stream:
            for wor_str in r_stream:
                my_str = wor_str.strip().split('\t')
                if len(my_str) < 3 or my_str[0] == '' or my_str[1] == '':
                    continue
                word_s = my_str[1].split(',')
                num = int(my_str[2])
                for word_id1 in my_str[0].split(','):
                    word_ind1 = self.r_dict_word.get(int(word_id1), -1)
                    if word_ind1 == -1:
                        continue  # not a word we need to predict from
                    word_ind1 = min(word_ind1, self.r_word_num)
                    for word_id2 in word_s:
                        word_ind2 = self.dict_word.get(int(word_id2), -1)
                        if word_ind2 == -1:
                            continue  # unknown word
                        word_ind2 = min(word_ind2, self.top_k_word)
                        temp_array[word_ind1, word_ind2] += num

        # Counts -> estimated conditional probabilities.
        temp_array_sum = temp_array.sum(1)
        (row_num, col_num) = temp_array.shape
        for i_col in range(col_num):
            if i_col % 200 == 0:
                print("%s %s" % (i_col, time.time()))
            if i_col == self.top_k_word:
                p_pre = p_remain
            else:
                p_pre = np.exp(self.word_M[i_col, 1])
            for i_row in range(row_num):
                temp_array[i_row, i_col] = self.pro_guji.get_pro_r(
                    p_pre, temp_array[i_row, i_col], temp_array_sum[i_row])

        # Persist as log-probabilities, one row per line.
        w_file = os.path.join(self.data_dir, "word_word_pro" + str(0) + '.txt')
        with open(w_file, 'w') as w_stream:
            for i_row in range(row_num):
                w_stream.write(
                    ','.join(str(math.log(v)) for v in temp_array[i_row, :]) + '\n')
        self.word_word = temp_array

    def read_word_word(self):
        """Reload the table written by ``my_tongji3`` (as plain probabilities)."""
        n_col = self.top_k_word + 1
        self.word_word = np.zeros((self.r_word_num + 1, n_col))
        i_line = 0
        with open(os.path.join(self.data_dir, "word_word_pro0.txt"), 'r') as o_stream:
            for line in o_stream:
                my_str = line.strip().split(',')
                self.word_word[i_line, :] = [
                    math.exp(float(my_str[x])) for x in range(n_col)]
                i_line += 1
        if i_line == (self.r_word_num + 1):
            print("%s %s" % (time.time(), "good"))

    def read_item_hot(self, write=True):
        """Re-rank the items by descending popularity.

        Reads ``dim_items2.txt`` (space separated, popularity count in the
        fourth field), reorders ``item_M``, ``item_word_array`` and
        ``dict_item`` accordingly and, when ``write`` is true, stores the
        ranking in ``my_item_hot.txt``.

        ``item_word_array`` is now reordered as well; the original left it
        in load order, so after this call an item index paired the category
        of one item with the words of another.
        """
        path = os.path.join(self.data_dir, 'dim_items2.txt')
        nums_array = np.zeros(self.item_num, dtype=int)
        with open(path, 'r') as r_stream:
            for line_i in r_stream:
                my_str = line_i.strip().split(' ')
                nums_array[self.dict_item[int(my_str[0])]] = int(my_str[3])

        a = np.argsort(-nums_array)  # descending popularity
        self.item_M = self.item_M[a, :]
        self.item_word_array = [self.item_word_array[i] for i in a]
        for x in range(self.item_num):
            self.dict_item[int(self.item_M[x, 0])] = x

        if write:
            nums_array = nums_array[a]
            with open(os.path.join(self.data_dir, 'my_item_hot.txt'), 'w') as w_stream:
                for x in range(self.item_num):
                    w_stream.write(
                        str(int(self.item_M[x, 0])) + '\t' + str(nums_array[x]) + '\n')

    def _match_log_odds(self, item_ind):
        """Return an ``(item_top_k, 2)`` array of log-odds for one query item.

        Column 0 is the word evidence averaged over the query's and the
        candidate's words, column 1 the category evidence; each is
        ``log P(. | match) - log P(. | no match)`` for the candidate.
        """
        scores = np.zeros((self.item_top_k, 2))
        class_ind = self.dict_class[int(self.item_M[item_ind, 1])]
        class_pro = np.log(self.class_class[class_ind, :])   # given a match
        class_pro2 = self.class_M[:, 1]                       # without a match

        # Mean log P(word | query word) over all usable query words.
        temp_word_pro = np.zeros(self.top_k_word + 1)
        word_num = 0
        for word_id in self.item_word_array[item_ind].split(','):
            try:
                word_id2i = int(word_id)
            except ValueError:
                continue
            word_ind1 = self.r_dict_word.get(word_id2i, -1)
            if word_ind1 == -1:
                continue  # not a modelled word
            word_ind1 = min(word_ind1, self.r_word_num)
            temp_word_pro += np.log(self.word_word[word_ind1, :])
            word_num += 1
        if word_num == 0:
            # No evidence: fall back to the prior so the word term cancels.
            temp_word_pro = self.word_M[:, 1]
        else:
            temp_word_pro *= (1.0 / word_num)

        # NOTE(review): for the "rest" bucket (index top_k_word) this uses
        # the log-probability of the single word at that rank, whereas
        # my_tongji3 uses the summed remainder mass as prior -- confirm
        # which is intended.
        temp_word_pro2 = self.word_M[:, 1]

        for cand_ind in range(self.item_top_k):
            cand_words = self.item_word_array[cand_ind]
            cand_class = self.dict_class[int(self.item_M[cand_ind, 1])]
            scores[cand_ind, 1] = class_pro[cand_class] - class_pro2[cand_class]
            if cand_words == "":
                continue
            word_num2 = 0
            for word_id2 in cand_words.split(','):
                try:
                    word_ind2 = self.dict_word.get(int(word_id2), -1)
                except ValueError:
                    continue  # same tolerance as read_txt
                if word_ind2 == -1:
                    continue
                word_ind2 = min(word_ind2, self.top_k_word)
                scores[cand_ind, 0] += temp_word_pro[word_ind2] - temp_word_pro2[word_ind2]
                word_num2 += 1
            if word_num2 == 0:
                scores[cand_ind, 0] = 0
            else:
                scores[cand_ind, 0] *= (1.0 / word_num2)
        return scores

    def da_pei(self):
        """Write the final recommendation list to ``fm_submissions2_tag.txt``.

        For every test item the 400 best-scoring candidates among the
        ``item_top_k`` hottest items are short-listed, ordered by popularity
        and the 200 hottest written as ``item_id id1,id2,...``.

        Fixes over the original: the output file is closed; only scored
        candidates are ranked (the original also ranked the never-scored
        tail, whose score of 0 could beat real negative scores); test items
        missing from ``dict_item`` are skipped instead of raising
        ``KeyError``; fewer than 400/200 candidates no longer crash.
        """
        top_top_k = 400   # size of the score-based short list
        out_num = 200     # number of ids written per test item
        file_name = os.path.join(self.data_dir, 'fm_submissions2_tag.txt')
        with open(file_name, 'w') as w_stream:
            for iii, item_id in enumerate(self.test_item):
                if iii % 100 == 0:
                    print("%s %s" % (time.time(), iii))
                item_ind = self.dict_item.get(item_id, -1)
                if item_ind == -1:
                    continue

                scores = self._match_log_odds(item_ind)
                a = scores[:, 0] + scores[:, 1]
                shortlist = np.argsort(-a)[:top_top_k]
                # Item indices are popularity ranks, so ascending index
                # means hottest first.
                shortlist = np.sort(shortlist)[:out_num]
                w_stream.write(
                    str(item_id) + ' ' +
                    ','.join(str(int(self.item_M[i, 0])) for i in shortlist) + '\n')

    def da_pei2(self):
        """Write match probabilities of every test item against the hottest items.

        ``fm_submissions2_tag_m.txt`` gets one tab-separated line per test
        item: the item id followed by ``item_top_k`` posterior probabilities
        (using the ``p_match`` prior) rounded to 9 decimals.

        Test items missing from ``dict_item`` are skipped (``result_word``
        tolerates them, the original crashed here with ``KeyError``).
        """
        file_name = os.path.join(self.data_dir, 'fm_submissions2_tag_m.txt')
        with open(file_name, 'w') as w_stream:
            for iii, item_id in enumerate(self.test_item):
                if iii % 100 == 0:
                    print("%s %s" % (time.time(), iii))
                item_ind = self.dict_item.get(item_id, -1)
                if item_ind == -1:
                    continue

                scores = self._match_log_odds(item_ind)
                a = scores[:, 0] + scores[:, 1]
                # Bayes: posterior match probability from prior and likelihood ratio.
                ratio = np.exp(a)
                pro_a = self.p_match * ratio / (self.p_match * ratio + (1 - self.p_match) * 1)
                w_stream.write(
                    str(item_id) + '\t' +
                    '\t'.join(str(round(v, 9)) for v in pro_a) + '\n')
class most_like():
    """Item collocation recommender built on word and category evidence.

    For a query item every candidate item receives a score that is the sum
    of two log-ratios ("matched" versus "unmatched"): one averaged over the
    candidate's words and one for the candidate's category.  The tables are
    filled by ``read_txt``, ``result_word``, ``my_tongji2`` and
    ``my_tongji3`` (or reloaded by ``read_word_word``); ``da_pei`` and
    ``da_pei2`` write the predictions.

    Every file name is resolved relative to ``self.data_dir``.
    """

    def __init__(self):
        self.data_dir = 'E:\\gitshell\\tianchi2'
        # Words.
        self.word_num = 0
        self.dict_word = {}  # word_id -> row of word_M
        # Words ranked at or beyond top_k_word share one extra bucket.
        self.top_k_word = 20000
        # Column 0: word_id.  Column 1: occurrence count, replaced by the
        # log-probability at the end of read_txt.
        self.word_M = np.zeros((1000000, 2))
        # Comma-separated item ids per word (read by get_item_array; the
        # visible loaders never fill it).
        self.word_item_array = [""] * 1000000
        # Row: query word, column: candidate word (true probabilities).
        self.word_word = np.zeros((3, 3))
        # Words of the items that must be predicted.
        self.r_word_num = 0
        self.r_dict_word = {}  # word_id -> row of r_word_M / word_word
        self.r_word_M = np.zeros((80000, 2))  # word_id, occurrence count
        self.test_item = []
        # Items.
        self.dict_item = {}  # item_id -> row of item_M
        self.item_M = np.zeros((600000, 2), int)  # item_id, category id
        self.item_word_array = [""] * 600000  # comma-separated word ids
        self.item_num = 0
        # Categories.
        # Column 0: category id.  Column 1: item count, replaced by the
        # log-probability at the end of read_txt.
        self.class_M = np.zeros((3000000, 2))
        self.dict_class = {}  # category id -> row of class_M
        self.class_num = 0
        self.class_class = np.zeros((2, 2))
        # Hand-made expert knowledge (project helper).
        self.exp_peo = exp_of_people()
        self.exp_peo.read_jingyan()
        # Probability estimation helper (project helper).
        self.pro_guji = Pro_estimate()
        # Only the first item_top_k rows of item_M are scored as candidates.
        self.item_top_k = 60000
        # Prior probability that two random items match.
        self.p_match = 0.0006

    def read_txt(self, filename="dim_items.txt"):
        """Load the item table and build word and category statistics.

        Each line is split on single spaces into item id, category id and a
        comma-separated list of word ids.  Afterwards the tables are cut to
        their used size, words are re-indexed by descending frequency and
        both count columns are converted to log-probabilities.

        :param filename: name of the item file inside ``data_dir``.
        """
        r_path = os.path.join(self.data_dir, filename)
        self.item_num = 0
        with open(r_path, 'r') as r_stream:
            for line_i in r_stream:
                if self.item_num % 100000 == 0:
                    print("%s %s" % (self.item_num, time.time()))
                my_str = line_i.strip('\n').split(" ")
                item_id = int(my_str[0])
                class_id = int(my_str[1])
                # Register the item.
                self.dict_item[item_id] = self.item_num
                self.item_M[self.item_num, :] = [item_id, class_id]
                self.item_word_array[self.item_num] = my_str[2]
                self.item_num += 1
                # Count every word of the item.
                for x_word in my_str[2].split(','):
                    try:
                        word_id = int(x_word)
                    except ValueError:
                        continue  # empty or malformed token
                    word_ind = self.dict_word.get(word_id, -1)
                    if word_ind == -1:
                        self.dict_word[word_id] = self.word_num
                        self.word_M[self.word_num, :] = [word_id, 1]
                        self.word_num += 1
                    else:
                        self.word_M[word_ind, 1] += 1
                # Count the category.
                class_ind = self.dict_class.get(class_id, -1)
                if class_ind == -1:
                    self.dict_class[class_id] = self.class_num
                    self.class_M[self.class_num, :] = [class_id, 1]
                    self.class_num += 1
                else:
                    self.class_M[class_ind, 1] += 1

        # Drop the unused pre-allocated rows.
        self.class_M = self.class_M[0:self.class_num, :]
        self.word_M = self.word_M[0:self.word_num, :]
        self.item_M = self.item_M[0:self.item_num, :]
        self.item_word_array = self.item_word_array[0:self.item_num]

        # Re-index the words by descending frequency.
        order = np.argsort(-self.word_M[:, 1])
        self.word_M = self.word_M[order, :]
        for x in range(self.word_num):
            self.dict_word[int(self.word_M[x, 0])] = x

        # Counts -> log-probabilities.
        self.word_M[:, 1] = np.log(self.word_M[:, 1] / np.sum(self.word_M[:, 1]))
        self.class_M[:, 1] = np.log(self.class_M[:, 1] / np.sum(self.class_M[:, 1]))

    def result_word(self, file_name='test_items2.txt'):
        """Collect the items to predict and the words they use.

        One item id per line is expected.  Every id is appended to
        ``test_item``; ids missing from ``dict_item`` contribute no words.
        Words are then ordered by descending count, ``r_dict_word`` is
        re-mapped to that order and ``r_word_num`` / ``r_word_M`` are cut at
        the first word that occurs exactly once.  Words past the cut keep an
        index >= ``r_word_num`` so that ``min(index, r_word_num)`` sends all
        of them to the shared last row of ``word_word``.

        :param file_name: name of the test-item file inside ``data_dir``.
        """
        file_name = os.path.join(self.data_dir, file_name)
        with open(file_name, 'r') as r_stream:
            for line_i in r_stream:
                line_i = line_i.strip()
                if not line_i:
                    continue  # tolerate blank lines
                item_id = int(line_i)
                self.test_item.append(item_id)
                item_ind = self.dict_item.get(item_id, -1)
                if item_ind == -1:
                    continue
                for token in self.item_word_array[item_ind].split(','):
                    try:
                        word_id = int(token)
                    except ValueError:
                        continue  # item without words, or malformed token
                    word_ind = self.r_dict_word.get(word_id, -1)
                    if word_ind == -1:
                        self.r_dict_word[word_id] = self.r_word_num
                        self.r_word_M[self.r_word_num, :] = [word_id, 1]
                        self.r_word_num += 1
                    else:
                        self.r_word_M[word_ind, 1] += 1

        order = np.argsort(-self.r_word_M[0:self.r_word_num, 1])
        self.r_word_M = self.r_word_M[order, :]
        # Keep the lookup table in step with the sorted matrix, exactly as
        # read_txt does for dict_word.
        for x in range(self.r_word_num):
            self.r_dict_word[int(self.r_word_M[x, 0])] = x

        i_record = self.r_word_num
        for i in range(self.r_word_num):
            if self.r_word_M[i, 1] == 1:
                i_record = i
                break
        self.r_word_num = i_record
        self.r_word_M = self.r_word_M[0:i_record, :]

    def get_item_array(self, word_id):
        """Return the ids of all items recorded for ``word_id``.

        :param word_id: integer word id.
        :returns: list of integer item ids; empty when the word is unknown
            or has no recorded items.
        """
        word_ind = self.dict_word.get(word_id, -1)
        if word_ind == -1:
            return []
        tokens = self.word_item_array[word_ind].split(',')
        return [int(item) for item in tokens if item]

    def _dump_word_word(self, temp_array, p_remain, i_file):
        """Turn co-occurrence counts into probabilities and store their logs.

        ``temp_array`` is modified in place: every count is replaced by the
        value returned by ``self.pro_guji.get_pro_r(prior, count, row_sum)``
        (project helper, presumably a smoothed probability estimate).  The
        logarithms are written to ``word_word_pro<i_file>.txt``, one row per
        line, comma separated.

        :param temp_array: count matrix with ``top_k_word + 1`` columns.
        :param p_remain: prior probability of the shared tail-word column.
        :param i_file: integer suffix of the output file name.
        """
        temp_array_sum = temp_array.sum(1)  # row totals
        (row_num, col_num) = temp_array.shape
        for i_col in range(col_num):
            if i_col % 200 == 0:
                print("%s %s" % (i_col, time.time()))
            if i_col == self.top_k_word:
                p_pre = p_remain
            else:
                # word_M holds log-probabilities once read_txt has finished.
                p_pre = np.exp(self.word_M[i_col, 1])
            for i_row in range(row_num):
                temp_array[i_row, i_col] = self.pro_guji.get_pro_r(
                    p_pre, temp_array[i_row, i_col], temp_array_sum[i_row])

        w_file = os.path.join(self.data_dir,
                              "word_word_pro" + str(i_file) + '.txt')
        with open(w_file, 'w') as w_stream:
            for i_row in range(row_num):
                logs = [str(math.log(value)) for value in temp_array[i_row, :]]
                w_stream.write(','.join(logs) + '\n')

    def my_tongji1(self):
        """Count word/word relations through the expert item associations.

        For every query word (row of ``r_word_M``) the items using it are
        looked up, their associated items are fetched from ``self.exp_peo``
        and the words of those items are counted.  The estimate is written
        to ``word_word_pro0.txt``.

        NOTE(review): this writes ``r_word_num`` rows while
        ``read_word_word`` checks for ``r_word_num + 1``; ``my_tongji3``
        looks like the maintained replacement.
        """
        if self.r_word_num == 0:
            return
        temp_array = np.zeros((self.r_word_num, self.top_k_word + 1))
        # Prior mass of all words folded into the last column.
        p_remain = np.sum(np.exp(self.word_M[self.top_k_word:, 1]))
        for word_ind in range(self.r_word_num):
            word_id = int(self.r_word_M[word_ind, 0])
            for item_id in self.get_item_array(word_id):
                item_array2 = self.exp_peo.associated_items(int(item_id))
                if len(item_array2) == 0:
                    continue
                for item_id2 in item_array2:
                    item_ind = self.dict_item.get(item_id2, -1)
                    if item_ind == -1:
                        continue
                    word_str = self.item_word_array[item_ind]
                    if word_str == '':
                        continue
                    for word_id2 in word_str.split(','):
                        word_ind2 = min(self.dict_word[int(word_id2)],
                                        self.top_k_word)
                        temp_array[word_ind, word_ind2] += 1
        self._dump_word_word(temp_array, p_remain, 0)

    def my_tongji2(self):
        """Estimate category/category match probabilities.

        Reads tab-separated ``class_class.txt`` (category id, category id,
        count), then writes the raw counts to ``class_class1.txt``, the
        estimated probabilities to ``class_class2.txt`` and the category
        priors to ``class_class3.txt``.  ``self.class_class`` ends up
        holding the estimated probabilities.
        """
        r_path = os.path.join(self.data_dir, "class_class.txt")
        with open(r_path, 'r') as r_stream:
            self.class_class = np.zeros((self.class_num, self.class_num))
            for line in r_stream:
                my_str = line.strip().split('\t')
                class_ind1 = self.dict_class[int(my_str[0])]
                class_ind2 = self.dict_class[int(my_str[1])]
                self.class_class[class_ind1, class_ind2] += int(my_str[2])
        row_sum = self.class_class.sum(1)
        last = self.class_num - 1

        path1 = os.path.join(self.data_dir, "class_class1.txt")
        path2 = os.path.join(self.data_dir, "class_class2.txt")
        path3 = os.path.join(self.data_dir, "class_class3.txt")
        with open(path1, 'w') as w_raw, open(path2, 'w') as w_pro, \
                open(path3, 'w') as w_prior:
            for ind1 in range(self.class_num):
                w_prior.write(str(np.exp(self.class_M[ind1, 1])) + '\t')
                for ind2 in range(self.class_num):
                    # Null hypothesis: prior probability of category ind2.
                    p_pre = np.exp(self.class_M[ind2, 1])
                    sep = '\n' if ind2 == last else '\t'
                    w_raw.write(str(self.class_class[ind1, ind2]) + sep)
                    self.class_class[ind1, ind2] = self.pro_guji.get_pro_r(
                        p_pre, self.class_class[ind1, ind2], row_sum[ind1])
                    w_pro.write(str(self.class_class[ind1, ind2]) + sep)

    def my_tongji3(self):
        """Estimate word/word match probabilities from ``wordstr_wordstr.txt``.

        Each line holds two comma-separated word lists and a count,
        separated by tabs.  The count is added for every pair (query word,
        candidate word); rare query words share the last row and rare
        candidate words share the last column.  The result is written to
        ``word_word_pro0.txt`` and kept in ``self.word_word``.
        """
        temp_array = np.zeros((self.r_word_num + 1, self.top_k_word + 1))
        # Prior mass of all words folded into the last column.
        p_remain = np.sum(np.exp(self.word_M[self.top_k_word:, 1]))
        r_path = os.path.join(self.data_dir, "wordstr_wordstr.txt")
        with open(r_path, 'r') as r_stream:
            for wor_str in r_stream:
                my_str = wor_str.strip().split('\t')
                if my_str[0] == '' or my_str[1] == '':
                    continue
                word_s = my_str[1].split(',')
                num = int(my_str[2])
                for word_id1 in my_str[0].split(','):
                    word_ind1 = self.r_dict_word.get(int(word_id1), -1)
                    if word_ind1 == -1:
                        continue  # not a word of any test item
                    word_ind1 = min(word_ind1, self.r_word_num)
                    for word_id2 in word_s:
                        word_ind2 = self.dict_word.get(int(word_id2), -1)
                        if word_ind2 == -1:
                            continue  # word never seen by read_txt
                        word_ind2 = min(word_ind2, self.top_k_word)
                        temp_array[word_ind1, word_ind2] += num
        self._dump_word_word(temp_array, p_remain, 0)
        self.word_word = temp_array

    def read_word_word(self):
        """Reload ``self.word_word`` from ``word_word_pro0.txt``.

        The file stores log-probabilities; they are exponentiated while
        loading.  Prints a timestamp and "good" when the number of lines
        equals ``r_word_num + 1``.
        """
        self.word_word = np.zeros((self.r_word_num + 1, self.top_k_word + 1))
        i_line = 0
        path = os.path.join(self.data_dir, "word_word_pro0.txt")
        with open(path, 'r') as o_stream:
            for line in o_stream:
                my_str = line.strip().split(',')
                for x in range(self.top_k_word + 1):
                    self.word_word[i_line, x] = math.exp(float(my_str[x]))
                i_line += 1
        if i_line == (self.r_word_num + 1):
            print("%s %s" % (time.time(), "good"))

    def read_item_hot(self, write=True):
        """Re-order the item tables by descending popularity.

        Popularity is the fourth space-separated field of ``dim_items2.txt``.
        ``item_M``, ``item_word_array`` and ``dict_item`` are permuted
        together so that an item index keeps addressing the same item in all
        three.

        :param write: when true, also write ``my_item_hot.txt`` with one
            "item_id<TAB>popularity" line per item.
        """
        path = os.path.join(self.data_dir, 'dim_items2.txt')
        nums_array = np.array([0] * self.item_num)
        with open(path, 'r') as r_stream:
            for line_i in r_stream:
                my_str = line_i.strip().split(' ')
                item_ind = self.dict_item[int(my_str[0])]
                nums_array[item_ind] = int(my_str[3])

        a = np.argsort(-nums_array)
        self.item_M = self.item_M[a, :]
        # The word lists are addressed by the same index as item_M.
        self.item_word_array = [self.item_word_array[i] for i in a]
        for x in range(self.item_num):
            self.dict_item[int(self.item_M[x, 0])] = x

        if write:
            nums_array = nums_array[a]
            w_path = os.path.join(self.data_dir, 'my_item_hot.txt')
            with open(w_path, 'w') as w_stream:
                for x in range(self.item_num):
                    w_stream.write(str(int(self.item_M[x, 0])) + '\t' +
                                   str(nums_array[x]) + '\n')

    def _unmatched_word_log_prob(self):
        """Return the log-probability of each word column when unmatched.

        Entry ``top_k_word`` is the log of the summed probability of every
        word folded into the shared last column, which is the prior that
        ``my_tongji3`` uses for that column.
        """
        baseline = np.array(self.word_M[:self.top_k_word + 1, 1], dtype=float)
        if self.word_M.shape[0] > self.top_k_word:
            tail = np.exp(self.word_M[self.top_k_word:, 1])
            baseline[self.top_k_word] = np.log(np.sum(tail))
        return baseline

    def _score_candidates(self, item_id, temp_word_pro2):
        """Score the hottest items as partners of ``item_id``.

        :param item_id: id of the query item; must be a key of ``dict_item``.
        :param temp_word_pro2: unmatched word log-probabilities as returned
            by ``_unmatched_word_log_prob``.
        :returns: 1-D array with one log-ratio per candidate (word opinion
            plus category opinion) for the first
            ``min(item_top_k, item_num)`` items.
        :raises KeyError: when the item or its category is unknown.
        """
        item_ind = self.dict_item[item_id]
        word_str = self.item_word_array[item_ind]
        class_ind = self.dict_class[int(self.item_M[item_ind, 1])]

        # Category probabilities when matched / log-probabilities when not.
        class_pro = np.log(self.class_class[class_ind, :])
        class_pro2 = self.class_M[:, 1]

        # Average log-probability of every word column given the query words.
        temp_word_pro = np.zeros(self.top_k_word + 1)
        word_num = 0
        for word_id in word_str.split(','):
            try:
                word_id2i = int(word_id)
            except ValueError:
                continue
            word_ind1 = self.r_dict_word.get(word_id2i, -1)
            if word_ind1 == -1:
                continue  # word without statistics
            word_ind1 = min(word_ind1, self.r_word_num)
            temp_word_pro += np.log(self.word_word[word_ind1, :])
            word_num += 1
        if word_num == 0:
            # No usable query word: the word opinion becomes neutral.
            temp_word_pro = temp_word_pro2
        else:
            temp_word_pro *= (1.0 / word_num)

        n_candidates = min(self.item_top_k, self.item_num)
        # Column 0: word opinion, column 1: category opinion.
        result = np.zeros((n_candidates, 2))
        for cand in range(n_candidates):
            class_ind00 = self.dict_class[int(self.item_M[cand, 1])]
            result[cand, 1] = class_pro[class_ind00] - class_pro2[class_ind00]
            cand_words = self.item_word_array[cand]
            if cand_words == "":
                continue
            word_num2 = 0
            for word_id2 in cand_words.split(','):
                try:
                    word_ind2 = self.dict_word.get(int(word_id2), -1)
                except ValueError:
                    continue
                if word_ind2 == -1:
                    continue
                word_ind2 = min(word_ind2, self.top_k_word)
                result[cand, 0] += (temp_word_pro[word_ind2] -
                                    temp_word_pro2[word_ind2])
                word_num2 += 1
            if word_num2 != 0:
                result[cand, 0] *= (1.0 / word_num2)
        return result[:, 0] + result[:, 1]

    def da_pei(self):
        """Write the final recommendation file ``fm_submissions2_tag.txt``.

        For every test item the 400 best scoring candidates are selected,
        re-ordered by item index (presumably popularity rank after
        ``read_item_hot``) and the first 200 ids are written as
        "item_id id1,id2,...".

        :raises KeyError: when a test item is missing from ``dict_item``.
        """
        top_top_k = 400  # size of the score-based short list
        n_written = 200  # ids written per test item
        temp_word_pro2 = self._unmatched_word_log_prob()
        file_name = os.path.join(self.data_dir, 'fm_submissions2_tag.txt')
        with open(file_name, 'w') as w_stream:
            for iii, item_id in enumerate(self.test_item):
                if iii % 100 == 0:
                    print("%s %s" % (time.time(), iii))
                a = self._score_candidates(item_id, temp_word_pro2)
                short_list = np.argsort(-a)[:top_top_k]  # best scores first
                by_index = np.sort(short_list)[:n_written]
                ids = [str(int(self.item_M[i, 0])) for i in by_index]
                w_stream.write(str(item_id) + ' ' + ','.join(ids) + '\n')

    def da_pei2(self):
        """Write match probabilities to ``fm_submissions2_tag_m.txt``.

        Each line holds the test item id followed by the posterior match
        probability (prior ``p_match``, rounded to 9 digits) of every scored
        candidate, tab separated.

        :raises KeyError: when a test item is missing from ``dict_item``.
        """
        temp_word_pro2 = self._unmatched_word_log_prob()
        file_name = os.path.join(self.data_dir, 'fm_submissions2_tag_m.txt')
        with open(file_name, 'w') as w_stream:
            for iii, item_id in enumerate(self.test_item):
                if iii % 100 == 0:
                    print("%s %s" % (time.time(), iii))
                a = self._score_candidates(item_id, temp_word_pro2)
                # exp(a) is the likelihood ratio matched / unmatched.
                pro_a = self.p_match * np.exp(a) / (
                    self.p_match * np.exp(a) + (1 - self.p_match) * 1)
                fields = [str(round(p, 9)) for p in pro_a]
                w_stream.write(str(item_id) + '\t' + '\t'.join(fields) + '\n')
class WordOpinion:
    """Word-only collocation scorer.

    For a query item, ``get`` returns one relative score per candidate item
    (the first ``item_top_k`` rows of the item table), computed from the
    average log-ratio between "matched" and "unmatched" probabilities of the
    candidate's words.  ``write_result`` stores those scores for every test
    item.

    Unlike the count columns of other loaders in this file, ``word_M[:, 1]``
    holds plain probabilities here (``read_txt`` divides the counts by their
    total and takes no logarithm).  Every file name is resolved relative to
    ``self.data_dir``.
    """

    def __init__(self):
        self.data_dir = "E:\\gitshell\\tianchi3"
        # Cached per-word opinions: row = query word, column = candidate.
        self.my_matrix = np.zeros((2, 2))
        # Words.
        self.word_num = 0
        self.dict_word = {}  # word_id -> row of word_M
        # Words ranked at or beyond top_k_word share one extra bucket.
        self.top_k_word = 15000
        self.word_M = np.zeros((1000000, 2))  # word_id, count / probability
        self.word_item_array = [""] * 1000000  # not filled by read_txt
        self.word_word = np.zeros((3, 3))  # true probabilities
        # Words of the items that must be predicted.
        self.r_word_num = 0
        self.r_dict_word = {}  # word_id -> row of r_word_M / word_word
        self.r_word_M = np.zeros((80000, 2))  # word_id, occurrence count
        self.test_item = []
        # Items.
        self.dict_item = {}  # item_id -> row of item_M
        self.item_M = np.zeros((600000, 2), int)  # item_id, category id
        self.item_word_array = [""] * 600000  # comma-separated word ids
        self.item_num = 0
        # Probability estimation helper (project helper).
        self.pro_guji = Pro_estimate()
        # Only the first item_top_k rows of the item table are scored.
        self.item_top_k = 100000
        # Prior probability that two random items match.
        self.p_match = 0.0006
        # Number of query words whose opinions dapei() pre-computes.
        self.num_word2 = 800

    def read_txt(self, filename="dim_items.txt"):
        """Load the item table and build the word statistics.

        Each line is split on single spaces into item id, category id and a
        comma-separated list of word ids.  Afterwards the tables are cut to
        their used size, words are re-indexed by descending frequency and
        the count column becomes a probability.

        :param filename: name of the item file inside ``data_dir``.
        """
        r_path = os.path.join(self.data_dir, filename)
        self.item_num = 0
        with open(r_path, "r") as r_stream:
            for line_i in r_stream:
                if self.item_num % 100000 == 0:
                    print("%s %s" % (self.item_num, time.time()))
                my_str = line_i.strip("\n").split(" ")
                item_id = int(my_str[0])
                # Register the item.
                self.dict_item[item_id] = self.item_num
                self.item_M[self.item_num, :] = [item_id, int(my_str[1])]
                self.item_word_array[self.item_num] = my_str[2]
                self.item_num += 1
                # Count every word of the item.
                for x_word in my_str[2].split(","):
                    try:
                        word_id = int(x_word)
                    except ValueError:
                        continue  # empty or malformed token
                    word_ind = self.dict_word.get(word_id, -1)
                    if word_ind == -1:
                        self.dict_word[word_id] = self.word_num
                        self.word_M[self.word_num, :] = [word_id, 1]
                        self.word_num += 1
                    else:
                        self.word_M[word_ind, 1] += 1

        # Drop the unused pre-allocated rows.
        self.word_M = self.word_M[0:self.word_num, :]
        self.item_M = self.item_M[0:self.item_num, :]
        self.item_word_array = self.item_word_array[0:self.item_num]

        # Re-index the words by descending frequency.
        order = np.argsort(-self.word_M[:, 1])
        self.word_M = self.word_M[order, :]
        for x in range(self.word_num):
            self.dict_word[int(self.word_M[x, 0])] = x

        # Counts -> probabilities.
        self.word_M[:, 1] = self.word_M[:, 1] / np.sum(self.word_M[:, 1])

    def result_word(self, file_name="test_set.txt"):
        """Collect the items to predict and the words they use.

        The item id is the last tab-separated field of each line.  Every id
        is appended to ``test_item``; ids missing from ``dict_item``
        contribute no words.  Words are then ordered by descending count,
        ``r_dict_word`` is re-mapped to that order and ``r_word_num`` /
        ``r_word_M`` are cut at the first word that occurs exactly once (its
        position is printed).  Words past the cut keep an index >=
        ``r_word_num`` so that ``min(index, r_word_num)`` sends all of them
        to the shared last row of ``word_word``.

        :param file_name: name of the test file inside ``data_dir``.
        """
        file_name = os.path.join(self.data_dir, file_name)
        with open(file_name, "r") as r_stream:
            for line_i in r_stream:
                line_i = line_i.strip()
                if not line_i:
                    continue  # tolerate blank lines
                item_id = int(line_i.split("\t")[-1])
                self.test_item.append(item_id)
                item_ind = self.dict_item.get(item_id, -1)
                if item_ind == -1:
                    continue
                for token in self.item_word_array[item_ind].split(","):
                    try:
                        word_id = int(token)
                    except ValueError:
                        continue  # item without words, or malformed token
                    word_ind = self.r_dict_word.get(word_id, -1)
                    if word_ind == -1:
                        self.r_dict_word[word_id] = self.r_word_num
                        self.r_word_M[self.r_word_num, :] = [word_id, 1]
                        self.r_word_num += 1
                    else:
                        self.r_word_M[word_ind, 1] += 1

        order = np.argsort(-self.r_word_M[0:self.r_word_num, 1])
        self.r_word_M = self.r_word_M[order, :]
        # Keep the lookup table in step with the sorted matrix, exactly as
        # read_txt does for dict_word.
        for x in range(self.r_word_num):
            self.r_dict_word[int(self.r_word_M[x, 0])] = x

        i_record = self.r_word_num
        for i in range(self.r_word_num):
            if self.r_word_M[i, 1] == 1:
                i_record = i
                print(i_record)
                break
        self.r_word_num = i_record
        self.r_word_M = self.r_word_M[0:i_record, :]

    def my_tongji3(self):
        """Estimate word/word match probabilities.

        Reads ``learn_wordstr_wordstr0.txt`` (two comma-separated word lists
        and a count, separated by tabs) and adds the count for every pair
        (query word, candidate word); rare query words share the last row
        and rare candidate words share the last column.  The prior
        probability of each column is added to every cell so that no count
        is zero, then each cell is replaced by
        ``self.pro_guji.get_pro_r(prior, count, row_sum)`` (project helper,
        presumably a smoothed probability estimate).

        Side effects: writes a trace of row 1 to ``ceshi_tongji3.txt``,
        writes the log-probabilities to ``word_word_pro0.txt`` and stores
        the probabilities in ``self.word_word``.
        """
        temp_array = np.zeros((self.r_word_num + 1, self.top_k_word + 1))
        # Prior mass of all words folded into the last column.
        p_remain = np.sum(self.word_M[self.top_k_word:, 1])
        i_file = 0
        file_name = "word_word_pro"

        r_path = os.path.join(self.data_dir, "learn_wordstr_wordstr0.txt")
        with open(r_path, "r") as r_stream:
            for wor_str in r_stream:
                my_str = wor_str.strip().split("\t")
                if my_str[0] == "" or my_str[1] == "":
                    continue
                word_s = my_str[1].split(",")
                num = int(my_str[2])
                for word_id1 in my_str[0].split(","):
                    word_ind1 = self.r_dict_word.get(int(word_id1), -1)
                    if word_ind1 == -1:
                        continue  # not a word of any test item
                    word_ind1 = min(word_ind1, self.r_word_num)
                    for word_id2 in word_s:
                        word_ind2 = self.dict_word.get(int(word_id2), -1)
                        if word_ind2 == -1:
                            continue  # word never seen by read_txt
                        word_ind2 = min(word_ind2, self.top_k_word)
                        temp_array[word_ind1, word_ind2] += num

        # Prior probability of every column; adding it to each row acts as
        # one extra "unknown" observation, so no count stays at zero.
        prior = np.empty(self.top_k_word + 1)
        prior[:self.top_k_word] = self.word_M[:self.top_k_word, 1]
        prior[self.top_k_word] = p_remain
        temp_array += prior

        temp_array_sum = temp_array.sum(1)  # row totals
        (row_num, col_num) = temp_array.shape
        trace_path = os.path.join(self.data_dir, "ceshi_tongji3.txt")
        with open(trace_path, "w") as o_stream:
            for i_col in range(col_num):
                if i_col % 200 == 0:
                    print("%s %s" % (i_col, time.time()))
                p_pre = prior[i_col]
                for i_row in range(row_num):
                    a = temp_array[i_row, i_col]
                    temp_array[i_row, i_col] = self.pro_guji.get_pro_r(
                        p_pre, a, temp_array_sum[i_row])
                    if i_row == 1:
                        o_stream.write(
                            str(p_pre) + "\t" +
                            str(temp_array[i_row, i_col]) + "\t" +
                            str(a) + "\t" +
                            str(temp_array_sum[i_row]) + "\n")

        w_file = os.path.join(self.data_dir, file_name + str(i_file) + ".txt")
        with open(w_file, "w") as w_stream:
            for i_row in range(row_num):
                logs = [str(math.log(value)) for value in temp_array[i_row, :]]
                w_stream.write(",".join(logs) + "\n")
        self.word_word = temp_array

    def read_word_word(self):
        """Reload ``self.word_word`` from ``word_word_pro0.txt``.

        The file stores log-probabilities; they are exponentiated while
        loading.  Prints a timestamp and "good" when the number of lines
        equals ``r_word_num + 1``.
        """
        self.word_word = np.zeros((self.r_word_num + 1, self.top_k_word + 1))
        i_line = 0
        path = os.path.join(self.data_dir, "word_word_pro0.txt")
        with open(path, "r") as o_stream:
            for line in o_stream:
                my_str = line.strip().split(",")
                for x in range(self.top_k_word + 1):
                    self.word_word[i_line, x] = math.exp(float(my_str[x]))
                i_line += 1
        if i_line == (self.r_word_num + 1):
            print("%s %s" % (time.time(), "good"))

    def read_item_hot(self, write=True):
        """Re-order the item tables by descending popularity.

        Popularity is the last tab-separated field of ``my_item_hot.txt``.
        ``item_M``, ``item_word_array`` and ``dict_item`` are permuted
        together so that an item index keeps addressing the same item in all
        three.

        :param write: when true, rewrite ``my_item_hot.txt`` in the new
            order, one "item_id<TAB>popularity" line per item.
        """
        path = os.path.join(self.data_dir, "my_item_hot.txt")
        nums_array = np.array([0] * self.item_num)
        with open(path, "r") as r_stream:
            for line_i in r_stream:
                my_str = line_i.strip().split("\t")
                item_ind = self.dict_item[int(my_str[0])]
                nums_array[item_ind] = int(my_str[-1])

        a = np.argsort(-nums_array)  # descending popularity
        self.item_M = self.item_M[a, :]
        # The word lists are addressed by the same index as item_M.
        self.item_word_array = [self.item_word_array[i] for i in a]
        for x in range(self.item_num):
            self.dict_item[int(self.item_M[x, 0])] = x

        if write:
            nums_array = nums_array[a]
            w_path = os.path.join(self.data_dir, "my_item_hot.txt")
            with open(w_path, "w") as w_stream:
                for x in range(self.item_num):
                    w_stream.write(str(int(self.item_M[x, 0])) + "\t" +
                                   str(nums_array[x]) + "\n")

    def dapei(self):
        """Pre-compute the opinions of the first ``num_word2`` query words.

        Fills ``self.my_matrix`` (``num_word2`` x ``item_top_k``); rows
        without a matching row in ``word_word`` stay zero.
        """
        self.my_matrix = np.zeros((self.num_word2, self.item_top_k))
        n_rows = min(self.num_word2, self.word_word.shape[0])
        for word_ind1 in range(n_rows):
            print(word_ind1)
            self.my_matrix[word_ind1,] = self.get_pro_1(word_ind1, False)

    def get_pro_1(self, word_ind1, ab=True):
        """Return the opinion of one query word about every candidate item.

        The opinion about a candidate is the mean, over the candidate's
        known words, of ``log P(word | query word, matched) - log P(word)``.
        Candidates without any known word get 0.

        :param word_ind1: row of ``word_word`` for the query word.
        :param ab: when true and ``word_ind1 < num_word2``, the row cached
            by ``dapei`` is returned instead of being recomputed.
        :returns: 1-D array of length ``item_top_k``.
        """
        if word_ind1 < self.num_word2 and ab:
            return self.my_matrix[word_ind1,]

        array0 = np.zeros(self.item_top_k)
        # Unmatched log-probability of every word column.  The last column
        # stands for all remaining words, so it takes the log of their
        # summed probability.
        n_known = min(self.top_k_word, self.word_M.shape[0])
        temp_word_pro2 = np.zeros(self.top_k_word + 1)
        temp_word_pro2[:n_known] = np.log(self.word_M[:n_known, 1])
        if self.word_M.shape[0] > self.top_k_word:
            temp_word_pro2[self.top_k_word] = np.log(
                np.sum(self.word_M[self.top_k_word:, 1]))
        temp_word_pro = np.log(self.word_word[word_ind1,])

        n_candidates = min(self.item_top_k, len(self.item_word_array))
        for item_ind in range(n_candidates):
            word_str = self.item_word_array[item_ind]
            if word_str == "":
                continue  # candidate without words
            total = 0.0
            word_num2 = 0
            for word_id2 in word_str.split(","):
                try:
                    word_ind2 = self.dict_word.get(int(word_id2), -1)
                except ValueError:
                    continue
                if word_ind2 == -1:
                    continue
                word_ind2 = min(word_ind2, self.top_k_word)
                total += temp_word_pro[word_ind2] - temp_word_pro2[word_ind2]
                word_num2 += 1
            if word_num2 != 0:
                # Guarded: dividing by zero here produced NaN scores.
                array0[item_ind] = total / word_num2
        return array0

    def get(self, item_id):
        """Return the relative match score of every candidate for one item.

        The per-word opinions are summed, divided by (number of used
        words + 1), shifted so that the best candidate is at 0 and
        exponentiated, which makes the best candidate score 1.

        NOTE(review): the divisor starts at 1, so the mean is damped by one
        extra count; kept as in the original, confirm whether intended.

        :param item_id: id of the query item; must be a key of ``dict_item``.
        :returns: 1-D array of length ``item_top_k``; all zeros when the
            item has no words.
        :raises KeyError: when the item is unknown.
        """
        yyy = np.zeros(self.item_top_k)
        item_ind = self.dict_item[item_id]
        word_str = self.item_word_array[item_ind]
        if len(word_str) == 0:
            return yyy
        aaa = 1
        for word_id in word_str.split(","):
            try:
                word_id2i = int(word_id)
            except ValueError:
                continue
            word_ind1 = self.r_dict_word.get(word_id2i, -1)
            if word_ind1 == -1:
                continue  # word without statistics
            word_ind1 = min(word_ind1, self.r_word_num)
            yyy += self.get_pro_1(word_ind1)
            aaa += 1
        yyy = yyy / aaa
        return np.exp(yyy - np.max(yyy))  # best candidate becomes 1

    def write_result(self):
        """Write the word opinion of every test item.

        Output file ``fm_submissions2_tag_w.txt``: one line per test item
        with the item id followed by ``item_top_k`` tab-separated scores.
        Progress is printed every 100 items or every 100 seconds.
        """
        file_name1 = os.path.join(self.data_dir, "fm_submissions2_tag_w.txt")
        with open(file_name1, "w") as w_stream1:
            t_b = time.time()
            for iii, item_id in enumerate(self.test_item):
                if iii % 100 == 0 or time.time() - t_b > 100:
                    t_b = time.time()
                    print("%s %s" % (iii, t_b))
                pro_gailv = self.get(item_id)
                fields = [str(p) for p in pro_gailv[:self.item_top_k]]
                w_stream1.write(str(item_id) + "\t" +
                                "\t".join(fields) + "\n")