import os
import time
from operator import itemgetter

import numpy as np
from nltk import cluster

# Helper functions such as quick_write_list_to_text, write_matrix_to_text,
# SKLD, get_cluster_number, spectral_cluster, spectral_cluster2,
# co_training_spectral_cluster, compute_similarity, merge_space,
# merge_all_center and get_text_to_single_list are assumed to be imported
# from elsewhere in this repository.


def batch_em_cluster(read_directory, write_directory1, write_directory2):
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    cluster_number = 8
    init_mu = 0.1
    init_sigma = 1.0
    for i in range(file_number):
        vsm = np.loadtxt(read_directory + '/' + str(i + 1) + '.txt')
        data_dimension = vsm.shape[1]
        # Draw random Gaussian initial means for the EM clusterer.
        init_means = []
        for j in range(cluster_number):
            init_means.append(init_sigma * np.random.randn(data_dimension) + init_mu)
        cluster_model = cluster.EMClusterer(init_means, bias=0.1)
        cluster_tag = cluster_model.cluster(vsm, True, trace=False)
        cluster_tag_to_string = [str(x) for x in cluster_tag]
        # Cluster centers learned by EM (private NLTK attribute).
        center_data = cluster_model._means
        quick_write_list_to_text(cluster_tag_to_string, write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_data, write_directory2 + '/' + str(i + 1) + '.txt')
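
# A usage sketch for batch_em_cluster. The directory names are hypothetical;
# the input directory is assumed to hold whitespace-delimited matrices named
# 1.txt, 2.txt, ..., one document vector per row, as np.loadtxt expects above:
#
#   batch_em_cluster('data/vsm', 'data/em_tags', 'data/em_centers')
#
# Each segment then yields one cluster-label file and one cluster-center file.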
def SP_CT_LDA(read_directory1, read_directory2, write_directory1, write_directory2, write_directory3):
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    for i in range(file_number):
        THETA = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        PHAI = np.loadtxt(read_directory2 + '/' + str(i + 1) + '.txt')
        # View 1: similarity between latent topics, computed from their word distributions.
        W1 = np.zeros((len(PHAI), len(PHAI)))
        for j in range(len(PHAI)):
            for k in range(j, len(PHAI)):
                W1[j, k] = 1.0 / (SKLD(PHAI[j], PHAI[k]) + 1.0)
                W1[k, j] = W1[j, k]
        # Estimate the number of clusters.
        cluster_number = get_cluster_number(W1)
        print cluster_number
        cluster_tag = spectral_cluster2(W1, cluster_number)
        # Cluster analysis.
        center_topic = np.zeros((cluster_number, len(PHAI[0])))
        each_cluster_number = np.zeros(cluster_number, int)
        weibo_topic_similarity = np.zeros((cluster_number, len(THETA)))
        THETA = THETA.transpose()
        for j in range(len(cluster_tag)):
            center_topic[cluster_tag[j]] += PHAI[j]
            each_cluster_number[cluster_tag[j]] += 1
            weibo_topic_similarity[cluster_tag[j]] += THETA[j]
        for j in range(cluster_number):
            center_topic[j] = center_topic[j] / each_cluster_number[j]
            #weibo_topic_similarity[j] = weibo_topic_similarity[j] / each_cluster_number[j]
        weibo_topic_similarity = weibo_topic_similarity.transpose()
        ecn_to_string = [str(x) for x in each_cluster_number]
        write_matrix_to_text(weibo_topic_similarity, write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_topic, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(ecn_to_string, write_directory3 + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
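
# SKLD is defined elsewhere in this repo; a minimal sketch, assuming it is the
# symmetric Kullback-Leibler divergence between two discrete distributions.
# The epsilon smoothing is an assumption, since the project's own handling of
# zero probabilities is not shown here:
def _skld_sketch(p, q, eps=1e-12):
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p /= p.sum()
    q /= q.sum()
    kl_pq = np.sum(p * np.log(p / q))
    kl_qp = np.sum(q * np.log(q / p))
    return kl_pq + kl_qp
# The affinity used above, 1.0 / (SKLD(a, b) + 1.0), maps a divergence of 0
# to similarity 1 and decays toward 0 as the topics diverge.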
def pattern_cluster(read_filename1, read_filename2, read_filename3, write_filename1, write_filename2):
    # Read the patterns: whitespace-separated tokens, one pattern per line,
    # skipping lines with fewer than two tokens.
    pattern_list = []
    f = open(read_filename1, 'r')
    for line in f:
        if len(line.split()) > 1:
            pattern_list.append(line.split())
    f.close()
    # Read the word weights: one "word weight" pair per line.
    word_weight_dict = {}
    f = open(read_filename2, 'r')
    for line in f:
        if line.split():
            word_weight_dict[line.split()[0]] = float(line.split()[1])
    f.close()
    # compute_similarity builds the similarity matrix and estimates the cluster number.
    similarity_matrix, cluster_number = compute_similarity(pattern_list, read_filename3, word_weight_dict)
    write_matrix_to_text(similarity_matrix, write_filename1)
    quick_write_list_to_text([str(cluster_number)], write_filename2)
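
# Expected input formats for pattern_cluster, inferred from the parsing code
# above (the file names in this call are hypothetical):
#
#   patterns file      whitespace-separated tokens, one pattern per line;
#                      lines with fewer than two tokens are skipped
#   word-weight file   "word weight" per line, e.g. "apple 0.37"
#
#   pattern_cluster('patterns.txt', 'word_weights.txt', 'aux.txt',
#                   'similarity_matrix.txt', 'cluster_number.txt')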
def CT_LDA(read_directory1, read_directory2, write_directory1, write_directory2, write_directory3, write_filename):
    gamma = 0.1
    run_time = []
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    for i in range(file_number):
        start = time.clock()
        THETA = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        PHAI = np.loadtxt(read_directory2 + '/' + str(i + 1) + '.txt')
        # View 1: similarity between latent topics, computed from their word distributions.
        W1 = np.zeros((len(PHAI), len(PHAI)))
        for j in range(len(PHAI)):
            for k in range(j, len(PHAI)):
                W1[j, k] = 1.0 / (SKLD(PHAI[j], PHAI[k]) + 1.0)
                W1[k, j] = W1[j, k]
        # View 2: similarity between latent topics, computed from their sets of related posts.
        W2 = np.zeros((len(PHAI), len(PHAI)))
        related_weibo_list = []
        for j in range(len(PHAI)):
            related_weibo_list.append([])
        for j in range(len(THETA)):
            for k in range(len(THETA[0])):
                if THETA[j, k] >= gamma:
                    related_weibo_list[k].append(j)
        for j in range(len(PHAI)):
            for k in range(j, len(PHAI)):
                numerator = len(set(related_weibo_list[j]) & set(related_weibo_list[k]))
                denominator = len(set(related_weibo_list[j]) | set(related_weibo_list[k]))
                if j == k:
                    W2[j, k] = 1.0
                    W2[k, j] = 1.0
                elif denominator == 0:
                    W2[j, k] = 0.0
                    W2[k, j] = 0.0
                else:
                    W2[j, k] = np.true_divide(numerator, denominator)
                    W2[k, j] = W2[j, k]
        # Estimate the number of clusters.
        cluster_number = get_cluster_number(W1)
        max_iter = 3
        print cluster_number
        cluster_tag = co_training_spectral_cluster(W1, W2, cluster_number, iter=max_iter)
        # Cluster analysis.
        center_topic = np.zeros((cluster_number, len(PHAI[0])))
        each_cluster_number = np.zeros(cluster_number, int)
        weibo_topic_similarity = np.zeros((cluster_number, len(THETA)))
        THETA = THETA.transpose()
        for j in range(len(cluster_tag)):
            center_topic[cluster_tag[j]] += PHAI[j]
            each_cluster_number[cluster_tag[j]] += 1
            weibo_topic_similarity[cluster_tag[j]] += THETA[j]
        for j in range(cluster_number):
            center_topic[j] = center_topic[j] / each_cluster_number[j]
            #weibo_topic_similarity[j] = weibo_topic_similarity[j] / each_cluster_number[j]
        weibo_topic_similarity = weibo_topic_similarity.transpose()
        ecn_to_string = [str(x) for x in each_cluster_number]
        run_time.append(str(time.clock() - start))
        print "This time:", str(time.clock() - start)
        write_matrix_to_text(weibo_topic_similarity, write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_topic, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(ecn_to_string, write_directory3 + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
    quick_write_list_to_text(run_time, write_filename)
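
# View 2 above is a Jaccard similarity between the sets of posts related
# (THETA weight >= gamma) to each pair of topics. A minimal standalone sketch
# of that affinity (the helper name is hypothetical):
def _jaccard_sketch(docs_a, docs_b):
    a, b = set(docs_a), set(docs_b)
    union = a | b
    if not union:
        return 0.0
    return np.true_divide(len(a & b), len(union))
# e.g. _jaccard_sketch([1, 2, 3], [2, 3, 4]) -> 0.5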
def stream_CT_LDA(read_directory1, read_directory2, read_directory3, write_directory1, write_directory2, write_directory3, write_filename):
    gamma = 0.1
    s_lambda = 0.7
    # Time window size, in data segments.
    q = 4
    ct_window = []
    ct_num_window = []
    ct_wordlist_window = []
    run_time = []
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    for i in range(file_number):
        THETA = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        PHAI = np.loadtxt(read_directory2 + '/' + str(i + 1) + '.txt')
        # View 1: similarity between latent topics, computed from their word distributions.
        W1 = np.zeros((len(PHAI), len(PHAI)))
        for j in range(len(PHAI)):
            for k in range(j, len(PHAI)):
                W1[j, k] = 1.0 / (SKLD(PHAI[j], PHAI[k]) + 1.0)
                W1[k, j] = W1[j, k]
        # Estimate the number of clusters.
        cluster_number = get_cluster_number(W1)
        print cluster_number
        # Word list of this data segment.
        this_word_list = []
        f1 = open(read_directory3 + '/' + str(i + 1) + '.txt', 'rb')
        for line in f1:
            if line.split():
                this_word_list.append(line.split()[0])
        f1.close()
        start = time.clock()
        if i < q or np.mod(i, q) == 0:
            # View 2: similarity between latent topics, computed from their sets of related posts.
            W2 = np.zeros((len(PHAI), len(PHAI)))
            related_weibo_list = []
            for j in range(len(PHAI)):
                related_weibo_list.append([])
            for j in range(len(THETA)):
                for k in range(len(THETA[0])):
                    if THETA[j, k] >= gamma:
                        related_weibo_list[k].append(j)
            for j in range(len(PHAI)):
                for k in range(j, len(PHAI)):
                    numerator = len(set(related_weibo_list[j]) & set(related_weibo_list[k]))
                    denominator = len(set(related_weibo_list[j]) | set(related_weibo_list[k]))
                    if j == k:
                        W2[j, k] = 1.0
                        W2[k, j] = 1.0
                    elif denominator == 0:
                        W2[j, k] = 0.0
                        W2[k, j] = 0.0
                    else:
                        W2[j, k] = np.true_divide(numerator, denominator)
                        W2[k, j] = W2[j, k]
            max_iter = 20
            cluster_tag = co_training_spectral_cluster(W1, W2, cluster_number, max_iter)
            # Cluster analysis.
            center_topic = np.zeros((cluster_number, len(PHAI[0])))
            each_cluster_number = np.zeros(cluster_number, int)
            weibo_topic_similarity = np.zeros((cluster_number, len(THETA)))
            THETA = THETA.transpose()
            for j in range(len(cluster_tag)):
                center_topic[cluster_tag[j]] += PHAI[j]
                each_cluster_number[cluster_tag[j]] += 1
                weibo_topic_similarity[cluster_tag[j]] += THETA[j]
            for j in range(cluster_number):
                center_topic[j] = center_topic[j] / each_cluster_number[j]
            weibo_topic_similarity = weibo_topic_similarity.transpose()
        else:
            # Backtrack one data segment: take its most populated center topics.
            temp_ct = np.zeros((cluster_number, len(PHAI[0])))
            if len(ct_window[-1]) >= cluster_number:
                idx = ct_num_window[-1].argsort()
                idx = idx[::-1]
                temp_ct = ct_window[-1][idx][0:cluster_number, :]
            else:
                temp_ct[0:len(ct_window[-1]), :] = ct_window[-1]
            # Merge the two vector spaces.
            new_temp_ct, new_this_lt, new_word_list = merge_space(ct_wordlist_window[-1], this_word_list, temp_ct, PHAI)
            # Similarity between this segment's latent topics and the previous segment's center topics.
            lt_ct_similarity = np.zeros((len(new_this_lt), len(new_temp_ct)))
            for j in range(len(new_this_lt)):
                for k in range(len(new_temp_ct)):
                    lt_ct_similarity[j, k] = 1.0 / (SKLD(new_this_lt[j], new_temp_ct[k]) + 1.0)
            cluster_tag = []
            # Latent topics of this segment that are new, in the original space.
            new_part_lt = []
            # Latent topics (original space) close to a center topic of the
            # previous segment; one list per center.
            last_part_lt = []
            for j in range(len(new_temp_ct)):
                last_part_lt.append([])
            for j in range(len(new_this_lt)):
                if np.max(lt_ct_similarity[j]) < s_lambda:
                    new_part_lt.append(PHAI[j])
                    cluster_tag.append(-1)  # Label for a new cluster.
                else:
                    max_index = np.argmax(lt_ct_similarity[j])
                    last_part_lt[max_index].append(PHAI[j])
                    cluster_tag.append(max_index)
            empty_count = 0
            this_last_ct = []
            this_last_ct_count = []
            for j in range(len(last_part_lt)):
                if len(last_part_lt[j]) == 0:
                    empty_count += 1
                else:
                    temp_this_ct = np.zeros(len(PHAI[0]))
                    temp_this_ct_count = 0
                    for k in range(len(last_part_lt[j])):
                        temp_this_ct += last_part_lt[j][k]
                        temp_this_ct_count += 1
                    this_last_ct.append(temp_this_ct / temp_this_ct_count)
                    this_last_ct_count.append(temp_this_ct_count)
            center_topic = np.zeros((cluster_number, len(PHAI[0])))
            each_cluster_number = np.zeros(cluster_number, int)
            print "empty_count", empty_count
            # Update the center topics, case by case.
            new_part_it_number = cluster_number - (len(last_part_lt) - empty_count)
            print "new_part_it_number", new_part_it_number
            if new_part_it_number == 0 and len(new_part_lt) == 0:
                # Carry the previous segment's topics over directly,
                # averaging within each cluster.
                for j in range(len(this_last_ct)):
                    center_topic[j] = this_last_ct[j]
                    each_cluster_number[j] = this_last_ct_count[j]
            elif new_part_it_number > 0 and len(new_part_lt) == 0:
                # This case should rarely occur; if it does, s_lambda is set too small.
                # Carry the previous segment's topics over directly.
                for j in range(len(this_last_ct)):
                    center_topic[j] = this_last_ct[j]
                    each_cluster_number[j] = this_last_ct_count[j]
                center_topic = center_topic[0:len(this_last_ct), :]
                each_cluster_number = each_cluster_number[0:len(this_last_ct)]
                cluster_tag = cluster_tag[0:len(this_last_ct)]
            elif new_part_it_number == 0 and len(new_part_lt) > 0:
                # Replace one center topic with the averaged new topic.
                new_part_ct = np.zeros(len(PHAI[0]))
                for j in range(len(new_part_lt)):
                    new_part_ct += new_part_lt[j]
                new_part_ct = new_part_ct / len(new_part_lt)
                min_index = np.argmin(this_last_ct_count)
                # Find the center topic most similar to the one being dropped.
                merge_si = np.zeros(len(this_last_ct), float)
                for j in range(len(this_last_ct)):
                    if j == min_index:
                        merge_si[j] = -1
                    else:
                        merge_si[j] = 1.0 / (SKLD(this_last_ct[min_index], this_last_ct[j]) + 1.0)
                merge_des = np.argmax(merge_si)
                # Merge the dropped topic into its nearest neighbour first,
                # adding up the cluster element counts, then install the new
                # topic at min_index.
                this_last_ct[merge_des] = (this_last_ct[merge_des] + this_last_ct[min_index]) / 2.0
                this_last_ct_count[merge_des] = this_last_ct_count[merge_des] + this_last_ct_count[min_index]
                this_last_ct[min_index] = new_part_ct
                this_last_ct_count[min_index] = len(new_part_lt)
                for j in range(len(this_last_ct)):
                    center_topic[j] = this_last_ct[j]
                    each_cluster_number[j] = this_last_ct_count[j]
                # Relabel: -1 becomes min_index, min_index becomes merge_des.
                for j in range(len(cluster_tag)):
                    if cluster_tag[j] == -1:
                        cluster_tag[j] = min_index
                    elif cluster_tag[j] == min_index:
                        cluster_tag[j] = merge_des
            else:
                # Update the carried-over part.
                for j in range(len(this_last_ct)):
                    center_topic[j] = this_last_ct[j]
                    each_cluster_number[j] = this_last_ct_count[j]
                if new_part_it_number == 1:
                    # Add one new topic.
                    new_part_ct = np.zeros(len(PHAI[0]))
                    for j in range(len(new_part_lt)):
                        new_part_ct += new_part_lt[j]
                    new_part_ct = new_part_ct / len(new_part_lt)
                    center_topic[-1] = new_part_ct
                    each_cluster_number[-1] = len(new_part_lt)
                    for j in range(len(cluster_tag)):
                        if cluster_tag[j] == -1:
                            cluster_tag[j] = cluster_number - 1
                else:
                    # Add several new topics via spectral clustering.
                    # Note: this may raise when new_part_lt holds too few topics.
                    sp_label = spectral_cluster(new_part_lt, new_part_it_number)
                    new_part_ct = np.zeros((new_part_it_number, len(PHAI[0])))
                    new_part_ct_number = np.zeros(new_part_it_number, int)
                    for j in range(len(sp_label)):
                        new_part_ct[sp_label[j]] += new_part_lt[j]
                        new_part_ct_number[sp_label[j]] += 1
                    for j in range(new_part_it_number):
                        new_part_ct[j] = new_part_ct[j] / new_part_ct_number[j]
                        center_topic[len(this_last_ct) + j] = new_part_ct[j]
                        each_cluster_number[len(this_last_ct) + j] = new_part_ct_number[j]
                    new_count = 0
                    for j in range(len(cluster_tag)):
                        if cluster_tag[j] == -1:
                            cluster_tag[j] = cluster_number - new_part_it_number + sp_label[new_count]
                            new_count += 1
            # Compute document-topic similarity.
            weibo_topic_similarity = np.zeros((cluster_number, len(THETA)))
            THETA = THETA.transpose()
            for j in range(len(cluster_tag)):
                weibo_topic_similarity[cluster_tag[j]] += THETA[j]
            weibo_topic_similarity = weibo_topic_similarity.transpose()
        run_time.append(str(time.clock() - start))
        print "This time:", str(time.clock() - start)
        # Common part: push this segment's results into the time window.
        ecn_to_string = [str(x) for x in each_cluster_number]
        ct_window.append(center_topic)
        ct_num_window.append(each_cluster_number)
        ct_wordlist_window.append(this_word_list)
        # Drop the oldest data once the window exceeds q segments.
        if len(ct_window) > q:
            ct_window.pop(0)
            ct_num_window.pop(0)
            ct_wordlist_window.pop(0)
        write_matrix_to_text(weibo_topic_similarity, write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_topic, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(ecn_to_string, write_directory3 + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
    quick_write_list_to_text(run_time, write_filename)
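
# The streaming branch above assigns each new latent topic to the most similar
# previous center topic when that similarity reaches s_lambda, and marks it as
# new (-1) otherwise. A minimal standalone sketch of that rule (the function
# and argument names are hypothetical):
def _match_topic_sketch(similarity_row, s_lambda=0.7):
    """Return the matched center index, or -1 for a brand-new topic."""
    if np.max(similarity_row) < s_lambda:
        return -1
    return int(np.argmax(similarity_row))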
def topic_life(read_directory1, read_directory2, read_directory3, write_directory1):
    gamma = 0.65
    delta = 0.80
    q = 4
    start_batch = 46
    interval = 7
    end_batch = start_batch + interval
    all_topic_batch, new_word_list, all_count = merge_all_center(read_directory1, read_directory2, start_batch, end_batch)
    evolution_matrix = np.zeros((all_count, all_count), int)
    previous_topics = []
    previous_num = []
    previous_intensity = []
    start_index = 0
    end_index = 0
    for i in range(len(all_topic_batch)):
        this_topic_intensity = []
        get_text_to_single_list(this_topic_intensity, read_directory3 + '/' + str(start_batch + i) + '.txt')
        this_topic_intensity = [int(x) for x in this_topic_intensity]
        print this_topic_intensity
        if i == 0:
            for j in range(len(all_topic_batch[i])):
                evolution_matrix[j, j] = 1
                previous_topics.append(all_topic_batch[i][j])
                previous_intensity.append(this_topic_intensity[j])
            start_index = 0
            end_index += len(all_topic_batch[i])
            previous_num.append(len(all_topic_batch[i]))
        else:
            kl_matrix = np.zeros((len(all_topic_batch[i]), len(previous_topics)))
            for j in range(len(all_topic_batch[i])):
                for k in range(len(previous_topics)):
                    kl_matrix[j, k] = 1.0 / (SKLD(all_topic_batch[i][j], previous_topics[k]) + 1.0)
            # Detect emergence.
            for j in range(len(kl_matrix)):
                evolution_matrix[end_index + j, end_index + j] = 1
            # Detect disappearance.
            for j in range(len(kl_matrix[0])):
                if np.max(kl_matrix[:, j]) < gamma:
                    evolution_matrix[start_index + j, start_index + j] = -1
            # Detect continuation.
            for j in range(len(kl_matrix)):
                for k in range(len(kl_matrix[j])):
                    if kl_matrix[j][k] >= delta:
                        evolution_matrix[start_index + k, end_index + j] = 2
                        evolution_matrix[end_index + j, start_index + k] = 2
            # Detect merging.
            for j in range(len(kl_matrix)):
                latent_merge_index = []
                si_value = []
                for k in range(len(kl_matrix[j])):
                    if kl_matrix[j][k] >= gamma and kl_matrix[j][k] < delta:
                        latent_merge_index.append(k)
                        si_value.append(kl_matrix[j][k])
                if len(latent_merge_index) >= 2:
                    # Keep at most the three most similar candidate parents.
                    sl = zip(latent_merge_index, si_value)
                    sl = sorted(sl, key=itemgetter(1), reverse=True)
                    latent_merge_index = []
                    m_count = 0
                    for each in sl:
                        latent_merge_index.append(each[0])
                        m_count += 1
                        if m_count >= 3:
                            break
                    # Intensity-weighted mixture of the candidate parents.
                    Z = np.zeros(len(all_topic_batch[i][0]))
                    all_intensity = 0
                    for each in latent_merge_index:
                        Z += previous_topics[each] * previous_intensity[each]
                        all_intensity += previous_intensity[each]
                    Z = Z / all_intensity
                    related = 1.0 / (SKLD(all_topic_batch[i][j], Z) + 1.0)
                    if related > delta:
                        for each in latent_merge_index:
                            evolution_matrix[start_index + each, end_index + j] = 3
                            evolution_matrix[end_index + j, start_index + each] = 3
            # Detect splitting.
            if len(kl_matrix) > 1:
                for j in range(len(kl_matrix[0])):
                    latent_split_index = []
                    for k in range(len(kl_matrix)):
                        if kl_matrix[k][j] >= gamma and kl_matrix[k][j] < delta:
                            latent_split_index.append(k)
                    if len(latent_split_index) >= 2:
                        # Intensity-weighted mixture of the candidate children.
                        Z = np.zeros(len(all_topic_batch[i][0]))
                        all_intensity = 0
                        for each in latent_split_index:
                            Z += all_topic_batch[i][each] * this_topic_intensity[each]
                            all_intensity += this_topic_intensity[each]
                        Z = Z / all_intensity
                        related = 1.0 / (SKLD(previous_topics[j], Z) + 1.0)
                        if related > delta:
                            for each in latent_split_index:
                                evolution_matrix[start_index + j, end_index + each] = 4
                                evolution_matrix[end_index + each, start_index + j] = 4
            for j in range(len(all_topic_batch[i])):
                previous_topics.append(all_topic_batch[i][j])
                previous_intensity.append(this_topic_intensity[j])
            previous_num.append(len(all_topic_batch[i]))
            # Slide the window of previous segments.
            if len(previous_num) > q:
                start_index += previous_num[0]
                for l in range(previous_num[0]):
                    previous_topics.pop(0)
                    previous_intensity.pop(0)
                previous_num.pop(0)
            end_index += len(all_topic_batch[i])
        write_matrix_to_text(evolution_matrix, write_directory1 + '/' + str(i + 1) + '.txt')
        print "Evolution %d Completed." % (i + 1)
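
# Legend for evolution_matrix, as written by topic_life above:
#    1  on the diagonal: the topic emerges in its batch
#   -1  on the diagonal: the topic disappears (no new topic reaches gamma)
#    2  off-diagonal: continuation (similarity >= delta)
#    3  off-diagonal: several previous topics merge into one new topic
#    4  off-diagonal: one previous topic splits into several new topics
# A usage sketch (the directory names are hypothetical):
#
#   topic_life('data/theta', 'data/phai', 'data/intensity', 'data/evolution')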