# Shared dependencies for the functions below. The text I/O helpers
# (get_text_to_single_list, get_text_to_complex_list, get_text_to_nparray,
# quick_write_list_to_text, write_matrix_to_text) and the analysis helpers
# (reflect_vsm_to_wordlist, query, prf, SKLD, merge_all_center) are assumed
# to come from this project's own utility modules.
import os
import time
from operator import itemgetter

import numpy as np


def compute_em_weights(read_filename1, read_filename2, write_filename):
    '''
    Linear fusion: combine the three feature columns of each line with the
    given coefficients.
    :param read_filename1: file with three feature values per line
    :param read_filename2: file with the three fusion coefficients
    :param write_filename: output file, one fused weight per line
    '''
    em_weights = []
    coefficients_string = []
    get_text_to_single_list(coefficients_string, read_filename2)
    coefficients = [float(x) for x in coefficients_string]

    f = open(read_filename1, 'r')
    line = f.readline()
    while line:
        each_line = line.split()
        em_weights.append(float(each_line[0]) * coefficients[0]
                          + float(each_line[1]) * coefficients[1]
                          + float(each_line[2]) * coefficients[2])
        line = f.readline()
    f.close()

    em_weights_to_string = [str(x) for x in em_weights]
    quick_write_list_to_text(em_weights_to_string, write_filename)
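# A minimal sketch of the fusion rule above in isolation: the fused weight of
# a row is the dot product of its feature values with the coefficient vector.
# fuse_row is a hypothetical helper, not part of the original module; it also
# generalizes to any number of columns (compute_em_weights hard-codes three).
def fuse_row(feature_values, coefficients):
    return sum(v * c for v, c in zip(feature_values, coefficients))

assert abs(fuse_row([1.0, 2.0, 3.0], [0.5, 0.3, 0.2]) - 1.7) < 1e-9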
def select_top_N_words(read_directory1, read_directory2, write_directory):
    N = 1000
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    # Weight assigned to each (Chinese) part-of-speech tag
    score_dict = {
        "nr": 1.0, "nr1": 0.5, "nr2": 0.75, "nrt": 1.0, "nrf": 1.0, "ns": 1.0,
        "nsf": 1.0, "nt": 1.0, "nz": 1.0, "nl": 0.5, "ng": 0.5, "n": 0.9,
        "t": 0.5, "tg": 0.5, "s": 0.3, "f": 0.3, "j": 0.5,
        "v": 0.7, "vd": 0.6, "vn": 0.9, "vshi": 0.0, "vyou": 0.0, "vf": 0.3,
        "vx": 0.3, "vi": 0.7, "vl": 0.3, "vg": 0.5,
        "a": 0.6, "ad": 0.3, "an": 0.9, "ag": 0.5, "al": 0.3, "b": 0.3, "bl": 0.2,
        "z": 0.9, "zg": 0.3, "r": 0.3, "rr": 0.3, "rz": 0.3, "rzt": 0.3,
        "rzs": 0.3, "rzv": 0.3, "ry": 0.2, "ryt": 0.2, "rys": 0.2, "ryv": 0.2,
        "rg": 0.2, "m": 0.6, "mq": 0.5, "q": 0.6, "qv": 0.7, "qt": 0.7,
        "d": 0.4, "p": 0.0, "pba": 0.0, "pbei": 0.0, "c": 0.0, "cc": 0.0,
        "u": 0.0, "ug": 0.0, "e": 0.0, "y": 0.0, "o": 0.0, "h": 0.0, "k": 0.0,
        "x": 0.0, "xx": 0.0, "xu": 0.9, "w": 0.0, "l": 0.6, "i": 0.6,
        "g": 0.0, "vq": 0.0, "nrfg": 0.75, "dg": 0.0, "mg": 0.2, "yg": 0.0}

    for i in range(file_number):
        each_word_tf = []
        key_words = []
        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # drop the header row; each item is a [word/tag, tf] pair

        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in key_words:
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))
            else:
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 0.80)
                except KeyError:
                    word_score.append(float(0.0))

        # Sort by score in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
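# The scoring rule above in isolation: a word's score is its term frequency
# scaled by its part-of-speech weight, with an extra 0.8 penalty when the
# word is not in the keyword list (unknown tags score 0). A minimal sketch;
# pos_weighted_score is a hypothetical helper, not part of the module.
def pos_weighted_score(token, tf, pos_weights, key_words):
    word, tag = token.split('/')
    factor = 1.0 if word in key_words else 0.8
    return tf * pos_weights.get(tag, 0.0) * factor

# pos_weighted_score('beijing/ns', 12.0, {"ns": 1.0}, []) -> 9.6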
def get_final_center(read_filename1, read_filename2, write_filename):
    result = []
    word_list = []
    get_text_to_single_list(word_list, read_filename2)

    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
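# reflect_vsm_to_wordlist is defined elsewhere in the project; from its use
# above it maps one column of the (transposed) vector space model back to
# vocabulary words. A plausible sketch under that assumption -- keep the
# words with nonzero weight, strongest first -- not the project's actual code:
def reflect_vsm_to_wordlist_sketch(vector, word_list):
    pairs = [(w, v) for w, v in zip(word_list, vector) if v > 0]
    pairs.sort(key=lambda p: p[1], reverse=True)
    return [w for w, _ in pairs]

# reflect_vsm_to_wordlist_sketch([0.1, 0.0, 0.7], ['a', 'b', 'c']) -> ['c', 'a']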
def kmeans_evaluate(read_filename1, read_filename2, write_directory):
    # string values
    real_tag = []
    get_text_to_single_list(real_tag, read_filename1)

    cluster_tag = []
    get_text_to_single_list(cluster_tag, read_filename2)

    real_tag = real_tag[0:len(cluster_tag)]

    # List index + 1 is the cluster id; each entry holds the matching
    # ground-truth label ids (e.g. cluster 1 corresponds to labels 6 and 8).
    reflect_tag = [['6', '8'], ['4'], ['5'], ['7'], ['3'], ['2'], ['6', '8'], ['1']]

    cluster_partion = []
    for i in range(len(reflect_tag)):
        cluster_partion.append([])
    for i in range(len(cluster_tag)):
        cluster_partion[int(cluster_tag[i]) - 1].append(str(i))

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))

        correct = len(set(cluster_partion[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct, len(set(cluster_partion[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure

    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
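# Worked example of the per-cluster metrics above: precision is the overlap
# over the cluster size, recall is the overlap over the true class size, and
# F-measure is their harmonic mean. prf_example is illustrative only.
def prf_example():
    cluster = set(['0', '1', '2', '3'])     # items assigned to one cluster
    truth = set(['1', '2', '3', '4', '5'])  # items truly in the matched class
    correct = len(cluster & truth)          # 3
    p = np.true_divide(correct, len(cluster))  # 0.75
    r = np.true_divide(correct, len(truth))    # 0.6
    f = np.true_divide(2.0 * p * r, p + r)     # ~0.667
    return p, r, f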
def count_word_tf(read_directory1, read_directory2, write_directory):
    '''
    Count the term frequency of every word in each data slice.
    :param read_directory1: directory of text files
    :param read_directory2: directory of vocabulary files
    :param write_directory: output directory
    '''
    # Total number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        # Segmentation result of each text
        each_text_segment = []
        # All words in this data slice
        all_text_word = []

        get_text_to_complex_list(each_text_segment, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_text_word, read_directory2 + '/' + str(i + 1) + '.txt')

        # Term-frequency (TF) dictionary
        tf_dict = {}
        for key in all_text_word:
            tf_dict[key] = 0

        for row in range(len(each_text_segment)):
            for j in range(len(each_text_segment[row])):
                try:
                    tf_dict[each_text_segment[row][j]] += 1
                except KeyError:
                    # Words outside the vocabulary never reach the output
                    tf_dict[each_text_segment[row][j]] = 0

        # Term-frequency list
        value_list = []
        for key in all_text_word:
            value_list.append(tf_dict[key])

        # Sort by term frequency in descending order
        va = zip(all_text_word, value_list)
        va = sorted(va, key=itemgetter(1), reverse=True)

        result_all = ['-Word- -TF-']
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))

        # Write to file
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
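# The counting loop above is equivalent to collections.Counter restricted to
# a fixed vocabulary; a compact sketch (words outside the vocabulary are
# dropped, matching the written output of count_word_tf):
from collections import Counter

def count_tf_sketch(segmented_texts, vocabulary):
    counts = Counter(w for text in segmented_texts for w in text)
    return sorted(((w, counts.get(w, 0)) for w in vocabulary),
                  key=lambda p: p[1], reverse=True)

# count_tf_sketch([['a', 'b', 'a']], ['a', 'b', 'c']) -> [('a', 2), ('b', 1), ('c', 0)]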
def kmeans_evaluate(read_filename1, read_filename2, write_directory):
    # string values
    real_tag = []
    get_text_to_single_list(real_tag, read_filename1)

    cluster_tag = []
    get_text_to_single_list(cluster_tag, read_filename2)

    real_tag = real_tag[0:len(cluster_tag)]

    # List index + 1 is the cluster id; each entry holds the matching
    # ground-truth label ids, i.e. cluster 1 corresponds to label 5, and
    # cluster 4 groups labels 3, 6 and 8.
    reflect_tag = [['5'], ['7'], ['2'], ['3', '6', '8'], ['7'], ['2'], ['4'], ['1']]

    cluster_partion = []
    for i in range(len(reflect_tag)):
        cluster_partion.append([])
    for i in range(len(cluster_tag)):
        cluster_partion[int(cluster_tag[i]) - 1].append(str(i))

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))

        correct = len(set(cluster_partion[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct, len(set(cluster_partion[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure

    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
def batch_count_tf(read_directory1, read_directory2, write_directory):
    '''
    Count term frequencies for each batch.
    :param read_directory1: directory of segmented-text files
    :param read_directory2: directory of vocabulary files
    :param write_directory: output directory
    '''
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_weibo_fenci, read_directory2 + '/' + str(i + 1) + '.txt')

        # Term-frequency (TF) dictionary
        tf_dict = {}
        for key in all_weibo_fenci:
            tf_dict[key] = 0

        for row in range(len(each_weibo_fenci)):
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j]] = 0

        # Term-frequency list
        value_list = []
        for key in all_weibo_fenci:
            value_list.append(tf_dict[key])

        # Sort by term frequency in descending order
        va = zip(all_weibo_fenci, value_list)
        va = sorted(va, key=itemgetter(1), reverse=True)

        result_all = []
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
def classification_evaluate(read_filename1, read_filename2, write_directory):
    # 2-D list of strings
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)

    # string values
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)

    # Must be entered manually
    class_tag = ['5', '6', '3', '8', '2', '4', '1', '7']

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(class_tag)):
        real_classification = []
        for j in range(len(real_tag)):
            if real_tag[j] == class_tag[i]:
                real_classification.append(str(j))

        correct = len(set(classification_result[i]) & set(real_classification))
        this_precision = np.true_divide(correct, len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_classification)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure

    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
def classification_evaluate(read_filename1, read_filename2, write_directory):
    # 2-D list of strings
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)

    # string values
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)

    # Must be entered manually
    class_tag = ['2', '3', '6', '1', '5', '7', '4']
    class_tag2 = ['2', '3', '8', '1', '5', '7', '4']

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(class_tag)):
        real_classification = []
        for j in range(len(real_tag)):
            # Labels 6 and 8 are retrieved as one class
            if real_tag[j] == class_tag[i] or real_tag[j] == class_tag2[i]:
                real_classification.append(str(j))

        correct = len(set(classification_result[i]) & set(real_classification))
        this_precision = np.true_divide(correct, len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_classification)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure

    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
def spct_prf(read_filename1, read_filename2, write_filename):
    cluster_tag = []
    real_tag = []
    get_text_to_single_list(cluster_tag, read_filename1)
    get_text_to_single_list(real_tag, read_filename2)

    cluster_tag = [int(x) for x in cluster_tag]
    real_tag = [int(x) for x in real_tag]

    reflect = [20, 21, 20]
    p, r, f = prf(cluster_tag, real_tag, reflect)

    print p
    print r
    print f

    quick_write_list_to_text([str(p), str(r), str(f)], write_filename)
def count_word_tf(read_filename1, read_filename2, write_filename):
    '''
    Count the term frequency of every word in the data.
    :param read_filename1: segmented-text file
    :param read_filename2: vocabulary file
    :param write_filename: output file
    '''
    each_weibo_fenci = []
    all_weibo_fenci = []
    get_text_to_complex_list(each_weibo_fenci, read_filename1, 0)
    get_text_to_single_list(all_weibo_fenci, read_filename2)

    # Term-frequency (TF) dictionary
    tf_dict = {}
    for key in all_weibo_fenci:
        tf_dict[key] = 0

    for row in range(len(each_weibo_fenci)):
        for j in range(len(each_weibo_fenci[row])):
            try:
                tf_dict[each_weibo_fenci[row][j]] += 1
            except KeyError:
                tf_dict[each_weibo_fenci[row][j]] = 0

    # Term-frequency list
    value_list = []
    for key in all_weibo_fenci:
        value_list.append(tf_dict[key])

    # Sort by term frequency in descending order
    va = zip(all_weibo_fenci, value_list)
    va = sorted(va, key=itemgetter(1), reverse=True)

    result_all = []
    for each in va:
        result_all.append(each[0] + " " + str(each[1]))

    quick_write_list_to_text(result_all, write_filename)
def generate_high_quality_data(read_directory1, read_directory2, read_directory3, write_directory):
    '''
    Linear fusion: score each post with the fused EM weight, keep the top-K
    distinct long texts, then restore chronological order.
    :param read_directory1: directory of fusion-coefficient files
    :param read_directory2: directory of per-post feature files
    :param read_directory3: directory of raw post files
    :param write_directory: output directory
    '''
    K = 3000
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        em_weights = []
        coefficients_string = []
        get_text_to_single_list(coefficients_string, read_directory1 + '/' + str(i + 1) + '.txt')
        coefficients = [float(x) for x in coefficients_string]

        f = open(read_directory2 + '/' + str(i + 1) + '.txt', 'r')
        line = f.readline()
        while line:
            each_line = line.split()
            this_em = 0.0
            for j in range(len(coefficients)):
                this_em += float(each_line[j]) * coefficients[j]
            em_weights.append(this_em)
            line = f.readline()
        f.close()

        this_weibo = []
        time_series = []
        this_text = []
        #get_text_to_single_list(this_weibo, read_directory3 + '/' + str(i + 1) + '.txt')
        f = open(read_directory3 + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            this_time = time.mktime(time.strptime(line.strip().split('\t')[2], '%Y/%m/%d %H:%M'))
            time_series.append(this_time)
            this_weibo.append(line.strip())
            try:
                this_text.append(line.strip().split('\t')[6])
            except IndexError:
                this_text.append(" ")
            line = f.readline()
        f.close()

        # Sort by EM weight in descending order
        ttte = zip(this_weibo, time_series, this_text, em_weights)
        ttte1 = sorted(ttte, key=itemgetter(3), reverse=True)

        this_weibo = []
        time_series = []
        this_text = []
        em_weights = []
        line_count = 0
        for each in ttte1:
            # Keep only unseen texts of at least 150 characters
            if each[2] not in this_text and len(each[2]) >= 150:
                this_weibo.append(each[0] + '\t' + str(each[3]))
                time_series.append(each[1])
                this_text.append(each[2])
                line_count += 1
                if line_count >= K:
                    break

        # Then sort by time in ascending order
        twts = zip(this_weibo, time_series)
        twts1 = sorted(twts, key=itemgetter(1))

        this_weibo = []
        for each in twts1:
            this_weibo.append(each[0])

        quick_write_list_to_text(this_weibo, write_directory + '/' + str(i + 1) + '.txt')
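# The timestamp handling above in isolation: time.strptime parses the
# 'YYYY/MM/DD HH:MM' field and time.mktime converts it to seconds since the
# epoch (local time), which sorts chronologically. parse_post_time is a
# hypothetical helper and the date below is illustrative only.
def parse_post_time(field):
    return time.mktime(time.strptime(field, '%Y/%m/%d %H:%M'))

# parse_post_time('2014/07/01 09:30') -> epoch seconds, sortable as a float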
def merge_batch(read_directory1, read_directory2, read_directory3, read_directory4,
                read_filename, write_directory1, write_directory2):
    all_batch_index = []
    f = open(read_filename)
    line = f.readline()
    while line:
        all_batch_index.append(line.split())
        line = f.readline()
    f.close()

    for i in range(len(all_batch_index)):
        # Merged vocabulary for this batch group
        this_word_list = []
        f1 = open(read_directory2 + '/' + str(i + 1) + '.txt', 'rb')
        line = f1.readline()
        while line:
            this_word_list.append(line.strip())
            line = f1.readline()
        f1.close()

        result = []
        result_id_time = []
        for j in range(len(all_batch_index[i])):
            word_list = []
            f2 = open(read_directory3 + '/' + all_batch_index[i][j] + '.txt', 'rb')
            line = f2.readline()
            while line:
                word_list.append(line.split()[0])
                line = f2.readline()
            f2.close()

            vsm_nparray = get_text_to_nparray(read_directory1 + '/' + all_batch_index[i][j] + '.txt', 'int')

            id_time = []
            get_text_to_single_list(id_time, read_directory4 + '/' + all_batch_index[i][j] + '.txt')
            for each2 in id_time:
                result_id_time.append(each2)

            for each in vsm_nparray:
                tf_dict = {}
                for k in range(len(each)):
                    if each[k] > 0.0001:
                        tf_dict[word_list[k]] = each[k]

                tf_dict2 = {}
                for each1 in this_word_list:
                    if each1 in tf_dict:
                        tf_dict2[each1] = tf_dict[each1]
                    else:
                        tf_dict2[each1] = 0

                this_line = []
                for key in this_word_list:
                    this_line.append(str(tf_dict2[key]))

                # Join each row into one string so it can be written directly
                result.append(" ".join(this_line))

        quick_write_list_to_text(result, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(result_id_time, write_directory2 + '/' + str(i + 1) + '.txt')
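# Core of merge_batch in isolation: re-express one batch's frequency vector
# over the merged vocabulary, filling zeros for words that batch never saw.
# align_to_vocabulary is a hypothetical helper for illustration.
def align_to_vocabulary(tf_dict, merged_vocabulary):
    return [tf_dict.get(word, 0) for word in merged_vocabulary]

# align_to_vocabulary({'a': 3, 'c': 1}, ['a', 'b', 'c']) -> [3, 0, 1]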
def topic_life(read_directory1, read_directory2, read_directory3, write_directory1):
    gamma = 0.65
    delta = 0.80
    #file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    q = 4  # number of past batches kept in the comparison window
    start_batch = 46
    interval = 7
    end_batch = start_batch + interval

    all_topic_batch, new_word_list, all_count = merge_all_center(
        read_directory1, read_directory2, start_batch, end_batch)
    evolution_matrix = np.zeros((all_count, all_count), int)

    previous_topics = []
    previous_num = []
    previous_intensity = []
    start_index = 0
    end_index = 0

    for i in range(len(all_topic_batch)):
        this_topic_intensity = []
        get_text_to_single_list(this_topic_intensity, read_directory3 + '/' + str(start_batch + i) + '.txt')
        this_topic_intensity = [int(x) for x in this_topic_intensity]
        print this_topic_intensity

        if i == 0:
            for j in range(len(all_topic_batch[i])):
                evolution_matrix[j, j] = 1
                previous_topics.append(all_topic_batch[i][j])
                previous_intensity.append(this_topic_intensity[j])
            start_index = 0
            end_index += len(all_topic_batch[i])
            previous_num.append(len(all_topic_batch[i]))
        else:
            # Similarity between each current topic and each previous topic
            kl_matrix = np.zeros((len(all_topic_batch[i]), len(previous_topics)))
            for j in range(len(all_topic_batch[i])):
                for k in range(len(previous_topics)):
                    kl_matrix[j, k] = 1.0 / (SKLD(all_topic_batch[i][j], previous_topics[k]) + 1.0)

            # Detect emergence
            for j in range(len(kl_matrix)):
                #if np.max(kl_matrix[j]) < gamma:
                evolution_matrix[end_index + j, end_index + j] = 1

            # Detect disappearance
            for j in range(len(kl_matrix[0])):
                if np.max(kl_matrix[:, j]) < gamma:
                    evolution_matrix[start_index + j, start_index + j] = -1

            # Detect continuation
            for j in range(len(kl_matrix)):
                for k in range(len(kl_matrix[j])):
                    if kl_matrix[j][k] >= delta:
                        evolution_matrix[start_index + k, end_index + j] = 2
                        evolution_matrix[end_index + j, start_index + k] = 2

            # Detect merging
            for j in range(len(kl_matrix)):
                latent_merge_index = []
                si_value = []
                for k in range(len(kl_matrix[j])):
                    if kl_matrix[j][k] >= gamma and kl_matrix[j][k] < delta:
                        latent_merge_index.append(k)
                        si_value.append(kl_matrix[j][k])
                if len(latent_merge_index) >= 2:
                    # Keep at most the 3 most similar candidates
                    sl = zip(latent_merge_index, si_value)
                    sl = sorted(sl, key=itemgetter(1), reverse=True)
                    latent_merge_index = []
                    m_count = 0
                    for each in sl:
                        latent_merge_index.append(each[0])
                        m_count += 1
                        if m_count >= 3:
                            break

                    # Intensity-weighted mixture of the candidate topics
                    Z = np.zeros(len(all_topic_batch[i][0]))
                    all_intensity = 0
                    for each in latent_merge_index:
                        Z += previous_topics[each] * previous_intensity[each]
                        all_intensity += previous_intensity[each]
                    Z = Z / all_intensity

                    related = 1.0 / (SKLD(all_topic_batch[i][j], Z) + 1.0)
                    if related > delta:
                        for each in latent_merge_index:
                            evolution_matrix[start_index + each, end_index + j] = 3
                            evolution_matrix[end_index + j, start_index + each] = 3

            # Detect splitting
            if len(kl_matrix) > 1:
                for j in range(len(kl_matrix[0])):
                    latent_split_index = []
                    for k in range(len(kl_matrix)):
                        if kl_matrix[k][j] >= gamma and kl_matrix[k][j] < delta:
                            latent_split_index.append(k)
                    if len(latent_split_index) >= 2:
                        Z = np.zeros(len(all_topic_batch[i][0]))
                        all_intensity = 0
                        for each in latent_split_index:
                            Z += all_topic_batch[i][each] * this_topic_intensity[each]
                            all_intensity += this_topic_intensity[each]
                        Z = Z / all_intensity

                        related = 1.0 / (SKLD(previous_topics[j], Z) + 1.0)
                        if related > delta:
                            for each in latent_split_index:
                                evolution_matrix[start_index + j, end_index + each] = 4
                                evolution_matrix[end_index + each, start_index + j] = 4

            for j in range(len(all_topic_batch[i])):
                previous_topics.append(all_topic_batch[i][j])
                previous_intensity.append(this_topic_intensity[j])
            previous_num.append(len(all_topic_batch[i]))

            # Slide the window: drop the oldest batch once more than q are kept
            if len(previous_num) > q:
                start_index += previous_num[0]
                for l in range(previous_num[0]):
                    previous_topics.pop(0)
                    previous_intensity.pop(0)
                previous_num.pop(0)
            end_index += len(all_topic_batch[i])

        write_matrix_to_text(evolution_matrix, write_directory1 + '/' + str(i + 1) + '.txt')
        print "Evolution %d Completed." % (i + 1)
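# SKLD is defined elsewhere in the project; from its use above it appears to
# be a symmetric Kullback-Leibler divergence between two topic-word
# distributions (similarity is then 1 / (SKLD + 1)). A plausible sketch under
# that assumption, with smoothing to avoid log(0), not the project's code:
def skld_sketch(p, q, eps=1e-10):
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p /= p.sum()
    q /= q.sum()
    kl_pq = np.sum(p * np.log(p / q))
    kl_qp = np.sum(q * np.log(q / p))
    return 0.5 * (kl_pq + kl_qp)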
def compute_similarity(pattern_list, read_filename, word_weight_dict):
    search_texts = []
    get_text_to_single_list(search_texts, read_filename)

    query_result_list = []
    for i in range(len(pattern_list)):
        query_result_list.append(query(pattern_list[i], search_texts, word_weight_dict))

    similarity_matrix = np.zeros([len(pattern_list), len(pattern_list)])
    tag = []
    for i in range(len(pattern_list)):
        tag.append(0)
        for j in range(i, len(pattern_list)):
            # Each frequent itemset is mapped to the set of texts its query
            # matches; the Jaccard similarity between those text sets measures
            # the similarity between itemsets. See TextQuery.py.
            numerator = len(set(query_result_list[i]) & set(query_result_list[j]))
            denominator = len(set(query_result_list[i]) | set(query_result_list[j]))
            similarity_matrix[i, j] = np.true_divide(numerator, denominator)
            similarity_matrix[j, i] = similarity_matrix[i, j]

    # Partition in stages to determine the number of cluster centers
    class_partion = []
    for i in range(len(pattern_list)):
        if tag[i] == 0:
            temp_class_partion = []
            for j in range(i, len(pattern_list)):
                if similarity_matrix[i, j] > 0.2:
                    temp_class_partion.append(j)
                    tag[j] = 1
            class_partion.append(temp_class_partion)

    partion_length = []
    for each in class_partion:
        partion_length.append(len(each))

    # Sort partitions by length in descending order
    cl = zip(class_partion, partion_length)
    cl = sorted(cl, key=itemgetter(1), reverse=True)
    class_partion = []
    partion_length = []
    for each in cl:
        class_partion.append(each[0])
        partion_length.append(each[1])

    length_sum = np.sum(partion_length)
    temp_sum = 0
    cluster_number = 0
    for i in range(len(partion_length)):
        temp_sum += partion_length[i]
        cluster_number += 1
        # Hard cutoff at 75% of all frequent itemsets: the number of
        # partitions before the cutoff is taken as the cluster count
        if np.true_divide(temp_sum, length_sum) > 0.75:
            break

    class_partion_to_string = []
    for i in range(cluster_number):
        class_partion_to_string.append(" ".join([str(x) for x in class_partion[i]]))
    print cluster_number

    query_result_list_string = []
    for each in query_result_list:
        query_result_list_string.append(" ".join([str(x) for x in each]))

    # if possible
    #quick_write_list_to_text(class_partion_to_string, 'D:/partion2.txt')

    return similarity_matrix, cluster_number
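# The Jaccard similarity used above, in isolation: |A & B| / |A | B|.
# jaccard is a hypothetical helper for illustration; the empty-set guard is
# an addition the original loop does not need.
def jaccard(a, b):
    a, b = set(a), set(b)
    if not a and not b:
        return 0.0
    return np.true_divide(len(a & b), len(a | b))

# jaccard([1, 2, 3], [2, 3, 4]) -> 0.5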
def select_top_N_words(read_directory1, read_directory2, write_directory):
    '''
    Select the top N words.
    :param read_directory1: directory of word-TF files
    :param read_directory2: directory of keyword files
    :param write_directory: output directory
    '''
    # Number of words to select
    N = 2000
    # Number of files in the directory
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    # Weight dictionary, assigned by (Penn Treebank) part-of-speech tag
    score_dict = {
        "CC": 0.0, "CD": 0.0, "DT": 0.2, "EX": 0.0, "FW": 0.3, "IN": 0.0,
        "JJ": 0.7, "JJR": 0.75, "JJS": 0.75, "LS": 0.0, "MD": 0.5,
        "NN": 0.9, "NNS": 0.9, "NNP": 1.0, "NNPS": 1.0,
        "PDT": 0.0, "POS": 0.0, "PRP": 0.1, "PRP$": 0.1,
        "RB": 0.3, "RBR": 0.35, "RBS": 0.4, "RP": 0.5, "SYM": 0.0,
        "TO": 0.0, "UH": 0.0,
        "VB": 0.7, "VBD": 0.7, "VBG": 0.7, "VBN": 0.75, "VBP": 0.7, "VBZ": 0.7,
        "WDT": 0.0, "WP": 0.3, "WP$": 0.3, "WRB": 0.0, ":": 0.0}

    for i in range(file_number):
        each_word_tf = []
        key_words = []
        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # drop the header row; each item is a [word, tf] pair

        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            #word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split(',')[1]  # tokens are 'word,TAG' pairs
            if each_word_tf[j][0] in key_words:
                select_word.append(each_word_tf[j][0])
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))
            else:
                select_word.append(each_word_tf[j][0])
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 0.80)
                except KeyError:
                    word_score.append(float(0.0))

        # Sort by score in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
def pre_text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    # Display 5 words per cluster
    # (3 words are selected at query time)
    select_number = 5

    # Cluster labels from the frequent-itemset clustering, strings starting at '1'
    class_tag = []
    get_text_to_single_list(class_tag, read_filename1)

    # Number of clusters
    cluster_number = len(set(class_tag))

    # Frequent itemsets, 2-D list of strings
    pattern_all = []
    get_text_to_complex_list(pattern_all, read_filename2, 0)
    pattern_all = pattern_all[0:len(class_tag)]

    # Partition of the frequent itemsets by cluster, 2-D list of ints
    class_partion = []
    for i in range(cluster_number):
        class_partion.append([])
    for i in range(len(class_tag)):
        for j in range(cluster_number):
            if class_tag[i] == str(j + 1):
                class_partion[j].append(i)

    # Load the global word weights
    word_weight_dict = {}
    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    # Collect all distinct words across the itemsets
    all_word_list = []
    for each in pattern_all:
        for word in set(each).difference(all_word_list):
            all_word_list.append(word)

    # Number of itemsets containing each word
    I_dict = {}
    for each in all_word_list:
        I_dict[each] = 0
        for each1 in pattern_all:
            if each in each1:
                I_dict[each] += 1

    # Number of clusters containing each word
    C_dict = {}
    for each in all_word_list:
        C_dict[each] = 0
        for i in range(len(class_partion)):
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    C_dict[each] += 1
                    break

    cluster_word_list = []
    for i in range(len(class_partion)):
        # Collect the distinct words within this cluster
        this_word_list = []
        for j in range(len(class_partion[i])):
            for each in pattern_all[class_partion[i][j]]:
                if each not in this_word_list:
                    this_word_list.append(each)

        # Support of each word within the cluster
        sup_dict = {}
        for each in this_word_list:
            sup_dict[each] = 0
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    sup_dict[each] += 1

        # Score each word in the cluster; the scores drive query-based classification
        word_score_list = []
        for each in this_word_list:
            global_weight = np.true_divide(len(pattern_all) * cluster_number,
                                           (I_dict[each] * C_dict[each]))
            word_score = word_weight_dict[each] * sup_dict[each] * np.log(global_weight + 1.0)
            word_score_list.append(word_score)

        # Sort by score in descending order
        tw = zip(this_word_list, word_score_list)
        tw = sorted(tw, key=itemgetter(1), reverse=True)

        this_word_list = []
        word_score_list = []
        count = 0
        for each in tw:
            this_word_list.append(each[0])
            count += 1
            if count >= select_number:
                break

        cluster_word_list.append(" ".join(this_word_list))

    quick_write_list_to_text(cluster_word_list, write_filename)
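# The cluster-word score above, in isolation: a word's global weight times
# its in-cluster support, scaled by a log-inverse-frequency factor over
# itemsets (I) and clusters (C):
#   score = w * sup * log(|patterns| * n_clusters / (I * C) + 1)
# cluster_word_score is a hypothetical helper for illustration.
def cluster_word_score(weight, support, n_patterns, n_clusters, i_count, c_count):
    global_weight = np.true_divide(n_patterns * n_clusters, i_count * c_count)
    return weight * support * np.log(global_weight + 1.0)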
def select_top_N_words(read_directory1, read_directory2, read_filename, write_directory):
    '''
    Select the top N words as high-quality feature words.
    :param read_directory1: directory of word-TF files
    :param read_directory2: directory of keyword files
    :param read_filename: user-dictionary file
    :param write_directory: output directory
    '''
    N = 500

    # Assign weights by part-of-speech tag
    score_dict = {
        "nr": 1.0, "nr1": 0.5, "nr2": 0.75, "nrt": 1.0, "nrf": 1.0, "ns": 1.0,
        "nsf": 1.0, "nt": 1.0, "nz": 1.0, "nl": 0.5, "ng": 0.5, "n": 0.9,
        "t": 0.5, "tg": 0.5, "s": 0.3, "f": 0.3, "j": 0.5,
        "v": 0.7, "vd": 0.6, "vn": 0.9, "vshi": 0.0, "vyou": 0.0, "vf": 0.3,
        "vx": 0.3, "vi": 0.7, "vl": 0.3, "vg": 0.5,
        "a": 0.6, "ad": 0.3, "an": 0.9, "ag": 0.5, "al": 0.3, "b": 0.3, "bl": 0.2,
        "z": 0.9, "zg": 0.3, "r": 0.3, "rr": 0.3, "rz": 0.3, "rzt": 0.3,
        "rzs": 0.3, "rzv": 0.3, "ry": 0.2, "ryt": 0.2, "rys": 0.2, "ryv": 0.2,
        "rg": 0.2, "m": 0.2, "mq": 0.5, "q": 0.6, "qv": 0.7, "qt": 0.7,
        "d": 0.4, "p": 0.0, "pba": 0.0, "pbei": 0.0, "c": 0.0, "cc": 0.0,
        "u": 0.0, "ug": 0.0, "e": 0.0, "y": 0.0, "o": 0.0, "h": 0.0, "k": 0.0,
        "x": 0.1, "xx": 0.0, "xu": 0.9, "w": 0.0, "l": 0.6, "i": 0.6,
        "g": 0.0, "vq": 0.0, "nrfg": 0.75, "dg": 0.0, "mg": 0.2, "yg": 0.0,
        "eng": 0.1}

    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    # Read the user dictionary once, outside the per-file loop
    user_dict = []
    f = open(read_filename, 'r')
    line = f.readline()
    while line:
        user_dict.append(line.split()[0])
        line = f.readline()
    f.close()

    for i in range(file_number):
        each_word_tf = []
        key_words = []
        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in user_dict:
                # Words in the user dictionary get full weight
                select_word.append(word_entity)
                word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
            elif word_entity in key_words and word_tag != 'eng':
                # Keywords also get full weight (English tokens excluded)
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))
            else:
                # Remaining words are down-weighted (x0.5)
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 0.50)
                except KeyError:
                    word_score.append(float(0.0))

        # Sort by score in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
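# The tiered scoring used in the select_top_N_words variants, in isolation:
# log-scaled term frequency times a part-of-speech weight, with full weight
# for user-dictionary words and keywords and a penalty factor otherwise.
# tiered_word_score is a hypothetical helper; the penalty is 0.5 or 0.6
# depending on the variant.
def tiered_word_score(word, tag, tf, pos_weights, user_dict, key_words, penalty=0.5):
    if word in user_dict:
        return np.log(tf)
    factor = 1.0 if word in key_words else penalty
    return np.log(tf) * pos_weights.get(tag, 0.0) * factor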
def select_top_N_words(read_directory1, read_directory2, read_filename3, write_directory):
    N = 500
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    # Assign weights by part-of-speech tag
    score_dict = {
        "nr": 1.0, "nr1": 0.5, "nr2": 0.75, "nrt": 1.0, "nrf": 1.0, "ns": 1.0,
        "nsf": 1.0, "nt": 1.0, "nz": 1.0, "nl": 0.5, "ng": 0.5, "n": 0.9,
        "t": 0.5, "tg": 0.5, "s": 0.3, "f": 0.3, "j": 0.5,
        "v": 0.7, "vd": 0.6, "vn": 0.9, "vshi": 0.0, "vyou": 0.0, "vf": 0.3,
        "vx": 0.3, "vi": 0.7, "vl": 0.3, "vg": 0.5,
        "a": 0.6, "ad": 0.3, "an": 0.9, "ag": 0.5, "al": 0.3, "b": 0.3, "bl": 0.2,
        "z": 0.9, "zg": 0.3, "r": 0.3, "rr": 0.3, "rz": 0.3, "rzt": 0.3,
        "rzs": 0.3, "rzv": 0.3, "ry": 0.2, "ryt": 0.2, "rys": 0.2, "ryv": 0.2,
        "rg": 0.2, "m": 0.6, "mq": 0.5, "q": 0.6, "qv": 0.7, "qt": 0.7,
        "d": 0.4, "p": 0.0, "pba": 0.0, "pbei": 0.0, "c": 0.0, "cc": 0.0,
        "u": 0.0, "ug": 0.0, "e": 0.0, "y": 0.0, "o": 0.0, "h": 0.0, "k": 0.0,
        "x": 0.0, "xx": 0.0, "xu": 0.9, "w": 0.0, "l": 0.6, "i": 0.6,
        "g": 0.0, "vq": 0.0, "nrfg": 0.75, "dg": 0.0, "mg": 0.2, "yg": 0.0}

    # Read the user dictionary once
    user_dict = []
    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        user_dict.append(line.split()[0])
        line = f.readline()
    f.close()

    for i in range(file_number):
        each_word_tf = []
        key_words = []
        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in user_dict:
                # Words in the user dictionary get full weight
                select_word.append(word_entity)
                word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
            elif word_entity in key_words:
                # Keywords also get full weight
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))
            else:
                # Remaining words are down-weighted (x0.6)
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 0.60)
                except KeyError:
                    word_score.append(float(0.0))

        # Sort by score in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
def select_top_N_words(read_filename1, read_filename2, read_filename3, write_filename):
    '''
    Select the top N words as high-quality feature words.
    :param read_filename1: word-TF file
    :param read_filename2: keyword file
    :param read_filename3: user-dictionary file
    :param write_filename: output file
    '''
    N = 3000

    # Assign weights by part-of-speech tag
    score_dict = {
        "nr": 1.0, "nr1": 0.5, "nr2": 0.75, "nrt": 1.0, "nrf": 1.0, "ns": 1.0,
        "nsf": 1.0, "nt": 1.0, "nz": 1.0, "nl": 0.5, "ng": 0.5, "n": 0.9,
        "t": 0.5, "tg": 0.5, "s": 0.3, "f": 0.3, "j": 0.5,
        "v": 0.7, "vd": 0.6, "vn": 0.9, "vshi": 0.0, "vyou": 0.0, "vf": 0.3,
        "vx": 0.3, "vi": 0.7, "vl": 0.3, "vg": 0.5,
        "a": 0.6, "ad": 0.3, "an": 0.9, "ag": 0.5, "al": 0.3, "b": 0.3, "bl": 0.2,
        "z": 0.9, "zg": 0.3, "r": 0.3, "rr": 0.3, "rz": 0.3, "rzt": 0.3,
        "rzs": 0.3, "rzv": 0.3, "ry": 0.2, "ryt": 0.2, "rys": 0.2, "ryv": 0.2,
        "rg": 0.2, "m": 0.6, "mq": 0.5, "q": 0.6, "qv": 0.7, "qt": 0.7,
        "d": 0.4, "p": 0.0, "pba": 0.0, "pbei": 0.0, "c": 0.0, "cc": 0.0,
        "u": 0.0, "ug": 0.0, "e": 0.0, "y": 0.0, "o": 0.0, "h": 0.0, "k": 0.0,
        "x": 0.1, "xx": 0.0, "xu": 0.9, "w": 0.0, "l": 0.6, "i": 0.6,
        "g": 0.0, "vq": 0.0, "nrfg": 0.75, "dg": 0.0, "mg": 0.2, "yg": 0.0}

    each_word_tf = []
    key_words = []
    select_word = []
    word_score = []
    user_dict = []

    get_text_to_complex_list(each_word_tf, read_filename1, 0)
    get_text_to_single_list(key_words, read_filename2)

    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        user_dict.append(line.split()[0])
        line = f.readline()
    f.close()

    for j in range(len(each_word_tf)):
        word_entity = each_word_tf[j][0].split('/')[0]
        word_tag = each_word_tf[j][0].split('/')[1]
        if word_entity in user_dict:
            # Words in the user dictionary get full weight
            select_word.append(word_entity)
            word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
        elif word_entity in key_words:
            # Keywords also get full weight
            select_word.append(word_entity)
            try:
                word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 1.0)
            except KeyError:
                word_score.append(float(0.0))
        else:
            # Remaining words are down-weighted (x0.6)
            select_word.append(word_entity)
            try:
                word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 0.60)
            except KeyError:
                word_score.append(float(0.0))

    # Sort by score in descending order
    sw = zip(select_word, word_score)
    sw = sorted(sw, key=itemgetter(1), reverse=True)

    result_all = []
    count_number = 1
    for each in sw:
        result_all.append(each[0] + " " + str(each[1]))
        count_number += 1
        if count_number > N:
            break

    quick_write_list_to_text(result_all, write_filename)