# -*- coding: utf-8 -*-
import os
from operator import itemgetter

import jieba
import jieba.analyse
import numpy as np
from gensim import corpora, models


def text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    """
    Classify the queries
    :param read_filename1: query pattern file
    :param read_filename2: word weight file
    :param read_filename3: file with the texts to search
    :param write_filename: result file
    """
    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    # Load the word -> weight mapping
    word_weight_dict = {}
    f = open(read_filename2, "r")
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    # Load the texts to be searched
    search_texts = []
    f1 = open(read_filename3, "r")
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    # For each query pattern, record the indices of the matching texts
    result = []
    for i in range(len(query_pattern)):
        this_result = query(query_pattern[i], search_texts, word_weight_dict)
        result.append(" ".join([str(x) for x in this_result]))

    quick_write_list_to_text(result, write_filename)
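# ---------------------------------------------------------------------------
# NOTE: the file helpers used throughout this module
# (get_text_to_complex_list, get_text_to_single_list,
# quick_write_list_to_text) are defined elsewhere. Below is a minimal sketch
# reconstructed from the call sites, assuming whitespace-delimited text
# files. In particular, the third argument of get_text_to_complex_list is
# ASSUMED here to be the index of the first field to keep on each line; the
# original semantics may differ.
# ---------------------------------------------------------------------------

def get_text_to_complex_list(result, filename, mode):
    # Append each non-empty line of `filename` to `result` as a token list
    f = open(filename, 'r')
    for line in f:
        tokens = line.strip().split()
        if tokens:
            result.append(tokens[mode:])
    f.close()


def get_text_to_single_list(result, filename):
    # Append each non-empty stripped line of `filename` to `result`
    f = open(filename, 'r')
    for line in f:
        if line.strip():
            result.append(line.strip())
    f.close()


def quick_write_list_to_text(str_list, filename):
    # Write one string per line to `filename`
    f = open(filename, 'w')
    f.write("\n".join(str_list))
    f.close()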
def get_key_words(read_filename, write_filename1, write_filename2):
    '''
    Extract keywords using jieba segmentation
    :param read_filename:
    :param write_filename1:
    :param write_filename2:
    '''
    each_weibo_fenci = []
    get_text_to_complex_list(each_weibo_fenci, read_filename, 0)

    key_words = []
    all_key_words = []
    for row in range(len(each_weibo_fenci)):
        # Strip the POS tag from each "word/tag" token
        word_entity = []
        for each in each_weibo_fenci[row]:
            word_entity.append(each.split('/')[0])

        # Top-3 keywords of this text
        tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
        key_words.append(" ".join(tags))

        for word in " ".join(tags).split():
            if word not in all_key_words:
                all_key_words.append(word)

    quick_write_list_to_text(key_words, write_filename1)
    quick_write_list_to_text(all_key_words, write_filename2)
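# ---------------------------------------------------------------------------
# For reference, jieba.analyse.extract_tags(text, K) returns the top-K words
# of `text` ranked by TF-IDF against jieba's built-in IDF table. A quick
# standalone check (the exact output depends on the installed jieba version
# and its default dictionary; the demo input is a toy, space-joined string
# like the one built above):
# ---------------------------------------------------------------------------

def _demo_extract_tags():
    demo_text = u"今天 北京 暴雨 城市 内涝 严重"  # toy input, for illustration only
    print " ".join(jieba.analyse.extract_tags(demo_text, 3))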
def text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    # Load the word -> weight mapping
    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    # Load the texts to be searched
    search_texts = []
    f1 = open(read_filename3, 'r')
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    # One bucket per search text; each bucket collects the indices of the
    # query patterns that query2() assigns to that text
    result = []
    for i in range(len(search_texts)):
        result.append([])
    for i in range(len(query_pattern)):
        this_result = query2(query_pattern[i], search_texts, word_weight_dict)
        result[this_result].append(str(i))

    result_to_string = []
    for each in result:
        result_to_string.append(" ".join(each))

    quick_write_list_to_text(result_to_string, write_filename)
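# ---------------------------------------------------------------------------
# NOTE: query() and query2() are defined elsewhere. The sketch below is a
# plausible reconstruction from the two call sites above, NOT the original
# implementation: both score a text by the summed weights of the pattern
# words it contains; query() is assumed to return the indices of all
# matching texts (best first), query2() the index of the single best match.
# ---------------------------------------------------------------------------

def _pattern_score(pattern_words, text, word_weight_dict):
    # Summed weight of the pattern words present in the text (assumption)
    tokens = text.split()
    score = 0.0
    for w in pattern_words:
        if w in tokens:
            score += word_weight_dict.get(w, 0.0)
    return score


def query(pattern_words, search_texts, word_weight_dict):
    # Indices of texts with a positive score, highest first (assumption)
    scored = []
    for i in range(len(search_texts)):
        s = _pattern_score(pattern_words, search_texts[i], word_weight_dict)
        if s > 0.0:
            scored.append((i, s))
    scored.sort(key=itemgetter(1), reverse=True)
    return [pair[0] for pair in scored]


def query2(pattern_words, search_texts, word_weight_dict):
    # Index of the single best-matching text (assumption)
    scores = [_pattern_score(pattern_words, t, word_weight_dict) for t in search_texts]
    return scores.index(max(scores))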
def get_key_words(read_directory, write_directory1, write_directory2):
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        each_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci, read_directory + '/' + str(i + 1) + '.txt', 2)

        key_words = []
        all_key_words = []
        for row in range(len(each_weibo_fenci)):
            word_entity = []
            for each in each_weibo_fenci[row]:
                word_entity.append(each.split('/')[0])

            tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
            key_words.append(" ".join(tags))
            for word in " ".join(tags).split():
                if word not in all_key_words:
                    all_key_words.append(word)

        quick_write_list_to_text(key_words, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_key_words, write_directory2 + '/' + str(i + 1) + '.txt')
def get_key_words(read_directory, write_directory1, write_directory2):
    '''
    Extract keywords with jieba for every data segment
    :param read_directory:
    :param write_directory1:
    :param write_directory2:
    '''
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        each_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci, read_directory + '/' + str(i + 1) + '.txt', 0)

        key_words = []
        all_key_words = []
        for row in range(len(each_weibo_fenci)):
            # Strip the POS tag from each "word/tag" token
            word_entity = []
            for each in each_weibo_fenci[row]:
                word_entity.append(each.split('/')[0])

            # Top-3 keywords of this text
            tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
            key_words.append(" ".join(tags))
            for word in " ".join(tags).split():
                if word not in all_key_words:
                    all_key_words.append(word)

        quick_write_list_to_text(key_words, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_key_words, write_directory2 + '/' + str(i + 1) + '.txt')

        print "Segment %d Completed." % (i + 1)
def select_top_N_words(read_directory1, read_directory2, write_directory):
    N = 1000
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    # Weight assigned to each POS tag
    score_dict = {
        "nr": 1.0, "nr1": 0.5, "nr2": 0.75, "nrt": 1.0, "nrf": 1.0, "ns": 1.0,
        "nsf": 1.0, "nt": 1.0, "nz": 1.0, "nl": 0.5, "ng": 0.5, "n": 0.9,
        "t": 0.5, "tg": 0.5, "s": 0.3, "f": 0.3, "j": 0.5,
        "v": 0.7, "vd": 0.6, "vn": 0.9, "vshi": 0.0, "vyou": 0.0, "vf": 0.3,
        "vx": 0.3, "vi": 0.7, "vl": 0.3, "vg": 0.5,
        "a": 0.6, "ad": 0.3, "an": 0.9, "ag": 0.5, "al": 0.3, "b": 0.3, "bl": 0.2,
        "z": 0.9, "zg": 0.3, "r": 0.3, "rr": 0.3, "rz": 0.3, "rzt": 0.3,
        "rzs": 0.3, "rzv": 0.3, "ry": 0.2, "ryt": 0.2, "rys": 0.2, "ryv": 0.2,
        "rg": 0.2, "m": 0.6, "mq": 0.5, "q": 0.6, "qv": 0.7, "qt": 0.7,
        "d": 0.4, "p": 0.0, "pba": 0.0, "pbei": 0.0, "c": 0.0, "cc": 0.0,
        "u": 0.0, "ug": 0.0, "e": 0.0, "y": 0.0, "o": 0.0, "h": 0.0, "k": 0.0,
        "x": 0.0, "xx": 0.0, "xu": 0.9, "w": 0.0, "l": 0.6, "i": 0.6,
        "g": 0.0, "vq": 0.0, "nrfg": 0.75, "dg": 0.0, "mg": 0.2, "yg": 0.0}

    for i in range(file_number):
        each_word_tf = []
        key_words = []
        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # skip the header row; entries are [word/tag, tf] pairs
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]

            if word_entity in key_words:
                # Keywords keep their full POS weight
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(0.0)
            else:
                # All other words are down-weighted by 0.8
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 0.80)
                except KeyError:
                    word_score.append(0.0)

        # Sort by weight, descending
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
def online_lda(read_directory1, read_directory2, write_directory1, write_directory2, write_directory3):
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    latent_topic_number = 50

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_word = []
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)

        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_word.append([line.strip().split()[0]])
            line = f.readline()
        f.close()

        # Bag-of-words corpus over the segment vocabulary
        dictionary = corpora.Dictionary(all_weibo_word)
        tf_corpus = [dictionary.doc2bow(text) for text in each_weibo_fenci]

        tf_corpus_to_string = []
        for each in tf_corpus:
            ss = [str(x) for x in each]
            tf_corpus_to_string.append("+".join(ss))

        lda = models.ldamodel.LdaModel(tf_corpus, num_topics=latent_topic_number)

        # Document-topic distribution matrix
        THETA = []
        for j in range(len(tf_corpus)):
            this_line = np.zeros(latent_topic_number)
            for each1 in lda[tf_corpus[j]]:
                # each1 is a (topic_id, probability) tuple
                this_line[each1[0]] = each1[1]
            THETA.append(" ".join([str(x) for x in this_line]))

        # Topic-word distribution matrix
        PHAI = []
        raw_topics = lda.show_topics(topics=latent_topic_number, formatted=False)
        for j in range(latent_topic_number):
            this_line = np.zeros(len(all_weibo_word))
            for each2 in raw_topics[j]:
                # each2 is a (probability, word id as string) tuple
                this_line[int(each2[1])] = each2[0]
            PHAI.append(" ".join([str(x) for x in this_line]))

        quick_write_list_to_text(tf_corpus_to_string, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(THETA, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(PHAI, write_directory3 + '/' + str(i + 1) + '.txt')

        print "Segment %d Completed." % (i + 1)
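# ---------------------------------------------------------------------------
# For orientation: gensim represents each document as a sparse bag-of-words
# list, and lda[bow] as a sparse (topic_id, probability) list, which is why
# the dense THETA rows above are filled in from zero vectors. A minimal toy
# sketch (assumes gensim is installed; LDA is randomly initialized, so the
# printed output varies between runs):
# ---------------------------------------------------------------------------

def _demo_lda():
    texts = [['storm', 'rain'], ['rain', 'flood'], ['match', 'goal']]
    demo_dictionary = corpora.Dictionary(texts)
    demo_corpus = [demo_dictionary.doc2bow(t) for t in texts]  # e.g. [(0, 1), (1, 1)]
    demo_lda = models.ldamodel.LdaModel(demo_corpus, id2word=demo_dictionary, num_topics=2)
    print demo_lda[demo_corpus[0]]  # sparse [(topic_id, probability), ...]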
def count_word_tf(read_directory1, read_directory2, write_directory):
    '''
    Count the term frequency of every word in each data segment
    :param read_directory1: directory of the text files
    :param read_directory2: directory of the vocabulary files
    :param write_directory: output directory
    '''
    # Total number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        # Segmentation result of every text
        each_text_segment = []
        # All words in this data segment
        all_text_word = []
        get_text_to_complex_list(each_text_segment, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_text_word, read_directory2 + '/' + str(i + 1) + '.txt')

        # Term-frequency dictionary
        tf_dict = {}
        for key in all_text_word:
            tf_dict[key] = 0
        for row in range(len(each_text_segment)):
            for j in range(len(each_text_segment[row])):
                try:
                    tf_dict[each_text_segment[row][j]] += 1
                except KeyError:
                    # Word not in the vocabulary; it is never written out below
                    tf_dict[each_text_segment[row][j]] = 0

        # Frequency list, aligned with the vocabulary
        value_list = []
        for key in all_text_word:
            value_list.append(tf_dict[key])

        # Sort by frequency, descending
        va = zip(all_text_word, value_list)
        va = sorted(va, key=itemgetter(1), reverse=True)

        result_all = ['-Word- -TF-']
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))

        # Write to file
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
def top_N_words_tfidf_vsm_process(read_directory1, read_directory2, write_directory):
    '''
    Build the vector space model of the weibo texts; values are raw TF
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 2)

        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_fenci.append(line.strip().split()[0])
            line = f.readline()
        f.close()

        result = []
        for row in range(len(each_weibo_fenci)):
            # Term-frequency dictionary over the selected vocabulary
            tf_dict = {}
            for key in all_weibo_fenci:
                tf_dict[key] = 0
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] += 1
                except KeyError:
                    # Word outside the vocabulary; not part of the vector below
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] = 0

            this_line = []
            for key in all_weibo_fenci:
                this_line.append(str(tf_dict[key]))

            # Join each row into a single string for writing
            result.append(" ".join(this_line))

        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')

    print "VSM Complete!!!"
def top_N_words_tfidf_vsm_process(read_directory1, read_directory2, write_directory):
    '''
    Build the vector space model of the weibo texts; values are raw TF
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)

        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_fenci.append(line.strip().split()[0])
            line = f.readline()
        f.close()

        result = []
        for row in range(len(each_weibo_fenci)):
            # Term-frequency dictionary over the selected vocabulary
            tf_dict = {}
            for key in all_weibo_fenci:
                tf_dict[key] = 0
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] += 1
                except KeyError:
                    # Word outside the vocabulary; not part of the vector below
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] = 0

            this_line = []
            for key in all_weibo_fenci:
                this_line.append(str(tf_dict[key]))

            # Join each row into a single string for writing
            result.append(" ".join(this_line))

        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')

    print "VSM Complete!!!"
def batch_count_tf(read_directory1, read_directory2, write_directory):
    '''
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_weibo_fenci, read_directory2 + '/' + str(i + 1) + '.txt')

        # Term-frequency dictionary
        tf_dict = {}
        for key in all_weibo_fenci:
            tf_dict[key] = 0
        for row in range(len(each_weibo_fenci)):
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j]] = 0

        # Frequency list, aligned with the vocabulary
        value_list = []
        for key in all_weibo_fenci:
            value_list.append(tf_dict[key])

        # Sort by frequency, descending
        va = zip(all_weibo_fenci, value_list)
        va = sorted(va, key=itemgetter(1), reverse=True)

        result_all = []
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')

        print "Segment %d Completed." % (i + 1)
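# ---------------------------------------------------------------------------
# The counting loops above use the classic dict-plus-KeyError idiom. An
# equivalent sketch with collections.Counter (only the counting step is
# shown; the helper name count_tf is hypothetical):
# ---------------------------------------------------------------------------

from collections import Counter

def count_tf(texts, vocabulary):
    # texts: list of token lists; vocabulary: words to report on
    counts = Counter()
    for tokens in texts:
        counts.update(tokens)
    # Report vocabulary words only, most frequent first
    pairs = [(w, counts[w]) for w in vocabulary]
    return sorted(pairs, key=itemgetter(1), reverse=True)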
def em_evaluate(read_filename1, read_filename2, write_directory):
    # 2-D list of strings: one row of item indices per cluster
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)

    # List of strings: ground-truth label of each item
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)

    # List index + 1 is the cluster id; the entries are the ground-truth
    # label ids that cluster maps to (a cluster may map to several labels)
    reflect_tag = [['7'], ['1'], ['5'], ['4'], ['3', '2'], ['1'], ['1'], ['1', '2']]

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))

        correct = len(set(classification_result[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct, len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall, (this_precision + this_recall))
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure

    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
def classification_evaluate(read_filename1, read_filename2, write_directory):
    # 2-D list of strings: one row of item indices per predicted class
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)

    # List of strings: ground-truth label of each item
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)

    # Entered manually: class_tag[i] is the ground-truth label of class i
    class_tag = ['5', '6', '3', '8', '2', '4', '1', '7']

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(class_tag)):
        real_classification = []
        for j in range(len(real_tag)):
            if real_tag[j] == class_tag[i]:
                real_classification.append(str(j))

        correct = len(set(classification_result[i]) & set(real_classification))
        this_precision = np.true_divide(correct, len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_classification)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall, (this_precision + this_recall))
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure

    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
def classification_evaluate(read_filename1, read_filename2, write_directory):
    # 2-D list of strings: one row of item indices per predicted class
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)

    # List of strings: ground-truth label of each item
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)

    # Entered manually: two label lists, because labels 6 and 8 are
    # retrieved as a single class
    class_tag = ['2', '3', '6', '1', '5', '7', '4']
    class_tag2 = ['2', '3', '8', '1', '5', '7', '4']

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(class_tag)):
        real_classification = []
        for j in range(len(real_tag)):
            # Labels 6 and 8 count as one class
            if real_tag[j] == class_tag[i] or real_tag[j] == class_tag2[i]:
                real_classification.append(str(j))

        correct = len(set(classification_result[i]) & set(real_classification))
        this_precision = np.true_divide(correct, len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_classification)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall, (this_precision + this_recall))
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure

    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
def kmeans_evaluate(read_filename1, read_filename2, write_directory):
    # 2-D list of strings: one row of item indices per cluster
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)

    # List of strings: ground-truth label of each item
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)

    # List index + 1 is the cluster id; the entries are the ground-truth
    # label ids that cluster maps to ('x' presumably marks a cluster with
    # no matching label)
    reflect_tag = [['1'], ['4'], ['5'], ['7'], ['6', '8'], ['2'], ['3'], ['x']]

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))

        correct = len(set(classification_result[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct, len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall, (this_precision + this_recall))
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure

    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
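# ---------------------------------------------------------------------------
# Sanity check for the formulas used by the three evaluate functions above:
# if a cluster contains 10 items, 6 of which carry the mapped label, and the
# label covers 12 items overall, then
#     precision = 6/10 = 0.6
#     recall    = 6/12 = 0.5
#     F-measure = 2 * 0.6 * 0.5 / (0.6 + 0.5) ~= 0.545
# ---------------------------------------------------------------------------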
def sample_vsm(read_filename1, read_filename2, write_filename):
    weibo_content = []
    all_word_list = []
    select_number = 1000

    get_text_to_complex_list(weibo_content, read_filename1, 0)

    f = open(read_filename2)
    line = f.readline()
    while line:
        all_word_list.append(line.strip().split()[0])
        line = f.readline()
    f.close()

    # Keep only the top-ranked words
    all_word_list = all_word_list[0:select_number]

    vsm = []
    for row in range(len(weibo_content)):
        # Term-frequency dictionary over the selected vocabulary
        tf_dict = {}
        for key in all_word_list:
            tf_dict[key] = 0
        for j in range(len(weibo_content[row])):
            try:
                tf_dict[weibo_content[row][j].split('/')[0]] += 1
            except KeyError:
                tf_dict[weibo_content[row][j].split('/')[0]] = 0

        this_line = []
        for key in all_word_list:
            this_line.append(str(tf_dict[key]))

        # Join each row into a single string for writing
        vsm.append(" ".join(this_line))

    quick_write_list_to_text(vsm, write_filename)
def count_word_tf(read_filename1, read_filename2, write_filename):
    '''
    Count the term frequency of every word in the data
    :param read_filename1:
    :param read_filename2:
    :param write_filename:
    '''
    each_weibo_fenci = []
    all_weibo_fenci = []
    get_text_to_complex_list(each_weibo_fenci, read_filename1, 0)
    get_text_to_single_list(all_weibo_fenci, read_filename2)

    # Term-frequency dictionary
    tf_dict = {}
    for key in all_weibo_fenci:
        tf_dict[key] = 0
    for row in range(len(each_weibo_fenci)):
        for j in range(len(each_weibo_fenci[row])):
            try:
                tf_dict[each_weibo_fenci[row][j]] += 1
            except KeyError:
                tf_dict[each_weibo_fenci[row][j]] = 0

    # Frequency list, aligned with the vocabulary
    value_list = []
    for key in all_weibo_fenci:
        value_list.append(tf_dict[key])

    # Sort by frequency, descending
    va = zip(all_weibo_fenci, value_list)
    va = sorted(va, key=itemgetter(1), reverse=True)

    result_all = []
    for each in va:
        result_all.append(each[0] + " " + str(each[1]))

    quick_write_list_to_text(result_all, write_filename)
def select_top_N_words(read_filename1, read_filename2, read_filename3, write_filename):
    '''
    Select the top N words as high-quality feature words
    :param read_filename1:
    :param read_filename2:
    :param read_filename3:
    :param write_filename:
    '''
    N = 3000

    # Weight assigned to each POS tag
    score_dict = {
        "nr": 1.0, "nr1": 0.5, "nr2": 0.75, "nrt": 1.0, "nrf": 1.0, "ns": 1.0,
        "nsf": 1.0, "nt": 1.0, "nz": 1.0, "nl": 0.5, "ng": 0.5, "n": 0.9,
        "t": 0.5, "tg": 0.5, "s": 0.3, "f": 0.3, "j": 0.5,
        "v": 0.7, "vd": 0.6, "vn": 0.9, "vshi": 0.0, "vyou": 0.0, "vf": 0.3,
        "vx": 0.3, "vi": 0.7, "vl": 0.3, "vg": 0.5,
        "a": 0.6, "ad": 0.3, "an": 0.9, "ag": 0.5, "al": 0.3, "b": 0.3, "bl": 0.2,
        "z": 0.9, "zg": 0.3, "r": 0.3, "rr": 0.3, "rz": 0.3, "rzt": 0.3,
        "rzs": 0.3, "rzv": 0.3, "ry": 0.2, "ryt": 0.2, "rys": 0.2, "ryv": 0.2,
        "rg": 0.2, "m": 0.6, "mq": 0.5, "q": 0.6, "qv": 0.7, "qt": 0.7,
        "d": 0.4, "p": 0.0, "pba": 0.0, "pbei": 0.0, "c": 0.0, "cc": 0.0,
        "u": 0.0, "ug": 0.0, "e": 0.0, "y": 0.0, "o": 0.0, "h": 0.0, "k": 0.0,
        "x": 0.1, "xx": 0.0, "xu": 0.9, "w": 0.0, "l": 0.6, "i": 0.6,
        "g": 0.0, "vq": 0.0, "nrfg": 0.75, "dg": 0.0, "mg": 0.2, "yg": 0.0}

    each_word_tf = []
    key_words = []
    select_word = []
    word_score = []
    user_dict = []

    get_text_to_complex_list(each_word_tf, read_filename1, 0)
    get_text_to_single_list(key_words, read_filename2)

    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        user_dict.append(line.split()[0])
        line = f.readline()
    f.close()

    for j in range(len(each_word_tf)):
        word_entity = each_word_tf[j][0].split('/')[0]
        word_tag = each_word_tf[j][0].split('/')[1]

        if word_entity in user_dict:
            # Words from the user dictionary get full weight
            select_word.append(word_entity)
            word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
        elif word_entity in key_words:
            # Keywords also get their full POS weight
            select_word.append(word_entity)
            try:
                word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 1.0)
            except KeyError:
                word_score.append(0.0)
        else:
            # All other words are down-weighted by 0.6
            select_word.append(word_entity)
            try:
                word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 0.60)
            except KeyError:
                word_score.append(0.0)

    # Sort by weight, descending
    sw = zip(select_word, word_score)
    sw = sorted(sw, key=itemgetter(1), reverse=True)

    result_all = []
    count_number = 1
    for each in sw:
        result_all.append(each[0] + " " + str(each[1]))
        count_number += 1
        if count_number > N:
            break

    quick_write_list_to_text(result_all, write_filename)
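# ---------------------------------------------------------------------------
# Worked example of the scoring rule above: a word tagged ns (place name,
# POS weight 1.0) with TF 100 scores
#     ln(100) * 1.0 * 1.0  ~= 4.61  if it is in the user dictionary,
#     ln(100) * 1.0 * 0.6  ~= 2.76  if it is in neither the dictionary
#                                   nor the keyword list.
# ---------------------------------------------------------------------------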
def cquery(keyword_list, mode, time_interval, select, read_directory1, read_directory2, write_filename):
    '''
    :param keyword_list:
    :param mode: "AND" or "OR"
    :param time_interval: [start, end]
    :param select: number of results to return
    :param read_directory1: root data directory
    :param read_directory2: index directory
    :param write_filename:
    '''
    if len(time_interval) != 2:
        print "Set Time Error!"
        return
    if (mode != "AND") and (mode != "OR"):
        print "Mode Error!"
        return

    start = time_interval[0]
    end = time_interval[1]

    # Number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory2)])

    query_result = []
    entropy_result = []

    # Search segment by segment
    for i in range(file_number):
        # Read the timestamp info
        f = open(read_directory1 + '/update_id_time/' + str(i + 1) + '.txt')
        time_lines = f.readlines()
        f.close()

        # If the latest time in this segment is before the query start, skip it
        if float(time_lines[-1].strip().split()[-1]) < start:
            pass
        # If this segment already extends past the query end, stop
        elif float(time_lines[-1].strip().split()[-1]) > end:
            break
        else:
            # Indices mapping the compressed items back to the original data
            f1 = open(read_directory2 + '/' + str(i + 1) + '.txt')
            data_index = f1.readlines()
            data_index = [int(x) for x in data_index]
            f1.close()

            # VSM vectors of the data
            each_weibo_vsm = []
            get_text_to_complex_list(each_weibo_vsm, u'D:/Local/DataStreamMining/dataset/non_orthogonal/topics_data1/重构数据/' + str(i + 1) + '.txt', 0)

            # Vocabulary corresponding to the VSM columns
            word_list = []
            f4 = open(read_directory1 + '/top_n_word/' + str(i + 1) + '.txt')
            word_lines = f4.readlines()
            f4.close()
            for each in word_lines:
                word_list.append(each.strip().split()[0])

            # Entropy value of each item
            f5 = open(read_directory1 + '/entropy/' + str(i + 1) + '.txt')
            entropy_list = f5.readlines()
            f5.close()
            entropy_list = [float(x.strip()) for x in entropy_list]

            # Walk through the rows of this segment
            for j in range(len(time_lines)):
                # Timestamp of the current row
                now_t = float(time_lines[j].strip().split()[-1])
                if (now_t >= start) and (now_t <= end) and (j in data_index):
                    if mode == "OR":
                        # Any keyword hit qualifies the row
                        flag = 0
                        for each1 in keyword_list:
                            for k in range(len(word_list)):
                                if (each1 in word_list[k]) and (float(each_weibo_vsm[j][k]) > 0.000001):
                                    this_message = " ".join(vsm_map_word(each_weibo_vsm[j], word_list))
                                    if this_message not in query_result:
                                        query_result.append(this_message)
                                        entropy_result.append(entropy_list[j])
                                    flag = 1
                                    break
                            if flag == 1:
                                break
                    else:
                        # AND mode: every keyword must hit
                        flag = 0
                        for each1 in keyword_list:
                            for k in range(len(word_list)):
                                if (each1 in word_list[k]) and (float(each_weibo_vsm[j][k]) > 0.000001):
                                    flag += 1
                                    break
                        if flag == len(keyword_list):
                            this_message = " ".join(vsm_map_word(each_weibo_vsm[j], word_list))
                            if this_message not in query_result:
                                query_result.append(this_message)
                                entropy_result.append(entropy_list[j])

    # Sort by entropy, descending
    el = zip(entropy_result, query_result)
    el1 = sorted(el, key=itemgetter(0), reverse=True)

    # Keep the top `select` results
    query_result2 = []
    count_number = 1
    for each in el1:
        query_result2.append(each[1])
        count_number += 1
        if count_number > select:
            break

    quick_write_list_to_text(query_result2, write_filename)
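# ---------------------------------------------------------------------------
# NOTE: vsm_map_word() is defined elsewhere. From the call sites above it
# maps one VSM row back to readable words. A plausible sketch (assumption:
# it returns the vocabulary words whose vector component is non-zero):
# ---------------------------------------------------------------------------

def vsm_map_word(vsm_row, word_list):
    # Words whose vector component is (numerically) non-zero
    words = []
    for k in range(len(word_list)):
        if float(vsm_row[k]) > 0.000001:
            words.append(word_list[k])
    return words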
def select_top_N_words(read_directory1, read_directory2, read_filename, write_directory):
    '''
    Select the top N words as high-quality feature words
    :param read_directory1:
    :param read_directory2:
    :param read_filename:
    :param write_directory:
    '''
    N = 500

    # Weight assigned to each POS tag
    score_dict = {
        "nr": 1.0, "nr1": 0.5, "nr2": 0.75, "nrt": 1.0, "nrf": 1.0, "ns": 1.0,
        "nsf": 1.0, "nt": 1.0, "nz": 1.0, "nl": 0.5, "ng": 0.5, "n": 0.9,
        "t": 0.5, "tg": 0.5, "s": 0.3, "f": 0.3, "j": 0.5,
        "v": 0.7, "vd": 0.6, "vn": 0.9, "vshi": 0.0, "vyou": 0.0, "vf": 0.3,
        "vx": 0.3, "vi": 0.7, "vl": 0.3, "vg": 0.5,
        "a": 0.6, "ad": 0.3, "an": 0.9, "ag": 0.5, "al": 0.3, "b": 0.3, "bl": 0.2,
        "z": 0.9, "zg": 0.3, "r": 0.3, "rr": 0.3, "rz": 0.3, "rzt": 0.3,
        "rzs": 0.3, "rzv": 0.3, "ry": 0.2, "ryt": 0.2, "rys": 0.2, "ryv": 0.2,
        "rg": 0.2, "m": 0.2, "mq": 0.5, "q": 0.6, "qv": 0.7, "qt": 0.7,
        "d": 0.4, "p": 0.0, "pba": 0.0, "pbei": 0.0, "c": 0.0, "cc": 0.0,
        "u": 0.0, "ug": 0.0, "e": 0.0, "y": 0.0, "o": 0.0, "h": 0.0, "k": 0.0,
        "x": 0.1, "xx": 0.0, "xu": 0.9, "w": 0.0, "l": 0.6, "i": 0.6,
        "g": 0.0, "vq": 0.0, "nrfg": 0.75, "dg": 0.0, "mg": 0.2, "yg": 0.0,
        "eng": 0.1}

    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_word_tf = []
        key_words = []
        select_word = []
        word_score = []
        user_dict = []

        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')

        f = open(read_filename, 'r')
        line = f.readline()
        while line:
            user_dict.append(line.split()[0])
            line = f.readline()
        f.close()

        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]

            if word_entity in user_dict:
                # Words from the user dictionary get full weight
                select_word.append(word_entity)
                word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
            elif word_entity in key_words and word_tag != 'eng':
                # Non-English keywords also get their full POS weight
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(0.0)
            else:
                # All other words are down-weighted by 0.5
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 0.50)
                except KeyError:
                    word_score.append(0.0)

        # Sort by weight, descending
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')

        print "Segment %d Completed." % (i + 1)
def select_top_N_words(read_directory1, read_directory2, write_directory):
    '''
    Select the top N words
    :param read_directory1: directory of the word-TF files
    :param read_directory2: directory of the keyword files
    :param write_directory: output directory
    '''
    # Number of words to keep
    N = 2000

    # Number of files in the directory
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    # Weight dictionary, by Penn Treebank POS tag
    score_dict = {
        "CC": 0.0, "CD": 0.0, "DT": 0.2, "EX": 0.0, "FW": 0.3, "IN": 0.0,
        "JJ": 0.7, "JJR": 0.75, "JJS": 0.75, "LS": 0.0, "MD": 0.5,
        "NN": 0.9, "NNS": 0.9, "NNP": 1.0, "NNPS": 1.0,
        "PDT": 0.0, "POS": 0.0, "PRP": 0.1, "PRP$": 0.1,
        "RB": 0.3, "RBR": 0.35, "RBS": 0.4, "RP": 0.5, "SYM": 0.0,
        "TO": 0.0, "UH": 0.0,
        "VB": 0.7, "VBD": 0.7, "VBG": 0.7, "VBN": 0.75, "VBP": 0.7, "VBZ": 0.7,
        "WDT": 0.0, "WP": 0.3, "WP$": 0.3, "WRB": 0.0, ":": 0.0}

    for i in range(file_number):
        each_word_tf = []
        key_words = []
        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # skip the header row; entries are [word, tf] pairs
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            # Tokens look like "word,TAG"; take the POS tag
            word_tag = each_word_tf[j][0].split(',')[1]

            if each_word_tf[j][0] in key_words:
                # Keywords keep their full POS weight
                select_word.append(each_word_tf[j][0])
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(0.0)
            else:
                # All other words are down-weighted by 0.8
                select_word.append(each_word_tf[j][0])
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 0.80)
                except KeyError:
                    word_score.append(0.0)

        # Sort by weight, descending
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
def select_top_N_words(read_directory1, read_directory2, read_filename3, write_directory):
    N = 500
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    # Weight assigned to each POS tag
    score_dict = {
        "nr": 1.0, "nr1": 0.5, "nr2": 0.75, "nrt": 1.0, "nrf": 1.0, "ns": 1.0,
        "nsf": 1.0, "nt": 1.0, "nz": 1.0, "nl": 0.5, "ng": 0.5, "n": 0.9,
        "t": 0.5, "tg": 0.5, "s": 0.3, "f": 0.3, "j": 0.5,
        "v": 0.7, "vd": 0.6, "vn": 0.9, "vshi": 0.0, "vyou": 0.0, "vf": 0.3,
        "vx": 0.3, "vi": 0.7, "vl": 0.3, "vg": 0.5,
        "a": 0.6, "ad": 0.3, "an": 0.9, "ag": 0.5, "al": 0.3, "b": 0.3, "bl": 0.2,
        "z": 0.9, "zg": 0.3, "r": 0.3, "rr": 0.3, "rz": 0.3, "rzt": 0.3,
        "rzs": 0.3, "rzv": 0.3, "ry": 0.2, "ryt": 0.2, "rys": 0.2, "ryv": 0.2,
        "rg": 0.2, "m": 0.6, "mq": 0.5, "q": 0.6, "qv": 0.7, "qt": 0.7,
        "d": 0.4, "p": 0.0, "pba": 0.0, "pbei": 0.0, "c": 0.0, "cc": 0.0,
        "u": 0.0, "ug": 0.0, "e": 0.0, "y": 0.0, "o": 0.0, "h": 0.0, "k": 0.0,
        "x": 0.0, "xx": 0.0, "xu": 0.9, "w": 0.0, "l": 0.6, "i": 0.6,
        "g": 0.0, "vq": 0.0, "nrfg": 0.75, "dg": 0.0, "mg": 0.2, "yg": 0.0}

    # The user dictionary is shared by all segments, so it is read once
    user_dict = []
    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        user_dict.append(line.split()[0])
        line = f.readline()
    f.close()

    for i in range(file_number):
        each_word_tf = []
        key_words = []
        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]

            if word_entity in user_dict:
                # Words from the user dictionary get full weight
                select_word.append(word_entity)
                word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
            elif word_entity in key_words:
                # Keywords also get their full POS weight
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(0.0)
            else:
                # All other words are down-weighted by 0.6
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 0.60)
                except KeyError:
                    word_score.append(0.0)

        # Sort by weight, descending
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
def pre_text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    # Display 5 words per cluster; 3 of them are picked at query time
    select_number = 5

    # Cluster labels of the frequent-itemset clustering; strings starting at "1"
    class_tag = []
    get_text_to_single_list(class_tag, read_filename1)

    # Number of clusters
    cluster_number = len(set(class_tag))

    # Frequent itemsets, 2-D list of strings
    pattern_all = []
    get_text_to_complex_list(pattern_all, read_filename2, 0)
    pattern_all = pattern_all[0:len(class_tag)]

    # Partition of the itemsets by cluster, 2-D list of ints
    class_partion = []
    for i in range(cluster_number):
        class_partion.append([])
    for i in range(len(class_tag)):
        for j in range(cluster_number):
            if class_tag[i] == str(j + 1):
                class_partion[j].append(i)

    # Global word weights
    word_weight_dict = {}
    f = open(read_filename3, "r")
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    # All distinct words over the frequent itemsets
    all_word_list = []
    for each in pattern_all:
        for word in set(each).difference(all_word_list):
            all_word_list.append(word)

    # Number of itemsets containing each word
    I_dict = {}
    for each in all_word_list:
        I_dict[each] = 0
        for each1 in pattern_all:
            if each in each1:
                I_dict[each] += 1

    # Number of clusters containing each word
    C_dict = {}
    for each in all_word_list:
        C_dict[each] = 0
        for i in range(len(class_partion)):
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    C_dict[each] += 1
                    break

    cluster_word_list = []
    for i in range(len(class_partion)):
        # All distinct words in this cluster
        this_word_list = []
        for j in range(len(class_partion[i])):
            for each in pattern_all[class_partion[i][j]]:
                if each not in this_word_list:
                    this_word_list.append(each)

        # Support of each word within the cluster
        sup_dict = {}
        for each in this_word_list:
            sup_dict[each] = 0
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    sup_dict[each] += 1

        # Weight of each word in the cluster, used to classify queries
        word_score_list = []
        for each in this_word_list:
            global_weight = np.true_divide(len(pattern_all) * cluster_number, (I_dict[each] * C_dict[each]))
            word_score = word_weight_dict[each] * sup_dict[each] * np.log(global_weight + 1.0)
            word_score_list.append(word_score)

        # Sort by weight, descending
        tw = zip(this_word_list, word_score_list)
        tw = sorted(tw, key=itemgetter(1), reverse=True)

        this_word_list = []
        word_score_list = []
        count = 0
        for each in tw:
            this_word_list.append(each[0])
            count += 1
            if count >= select_number:
                break

        cluster_word_list.append(" ".join(this_word_list))

    quick_write_list_to_text(cluster_word_list, write_filename)
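# ---------------------------------------------------------------------------
# Worked example of the cluster-word weighting above: with 200 itemsets and
# 8 clusters, a word occurring in I = 10 itemsets and C = 2 clusters has
#     global_weight = 200 * 8 / (10 * 2) = 80
# and, with a global word weight of 0.9 and in-cluster support 5,
#     word_score = 0.9 * 5 * ln(80 + 1) ~= 19.8
# ---------------------------------------------------------------------------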
def oquery(keyword_list, mode, time_interval, select, read_directory, write_filename):
    '''
    :param keyword_list:
    :param mode: "AND" or "OR"
    :param time_interval: [start, end]
    :param select: number of results to return
    :param read_directory: root data directory
    :param write_filename:
    '''
    if len(time_interval) != 2:
        print "Set Time Error!"
        return
    if (mode != "AND") and (mode != "OR"):
        print "Mode Error!"
        return

    start = time_interval[0]
    end = time_interval[1]

    # Number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory + '/update_vsm')])

    query_result = []
    entropy_result = []

    # Search segment by segment
    for i in range(file_number):
        # Read the timestamp info
        f = open(read_directory + '/update_id_time/' + str(i + 1) + '.txt')
        time_lines = f.readlines()
        f.close()

        # If the latest time in this segment is before the query start, skip it
        if float(time_lines[-1].strip().split()[-1]) < start:
            pass
        # If this segment already extends past the query end, stop
        elif float(time_lines[-1].strip().split()[-1]) > end:
            break
        else:
            # VSM vectors of the data
            each_weibo_vsm = []
            get_text_to_complex_list(each_weibo_vsm, read_directory + '/update_vsm/' + str(i + 1) + '.txt', 0)

            # Vocabulary corresponding to the VSM columns
            word_list = []
            f4 = open(read_directory + '/top_n_word/' + str(i + 1) + '.txt')
            word_lines = f4.readlines()
            f4.close()
            for each in word_lines:
                word_list.append(each.strip().split()[0])

            # Entropy value of each item
            f5 = open(read_directory + '/entropy/' + str(i + 1) + '.txt')
            entropy_list = f5.readlines()
            f5.close()
            entropy_list = [float(x.strip()) for x in entropy_list]

            # Walk through the rows of this segment
            for j in range(len(time_lines)):
                # Timestamp of the current row
                now_t = float(time_lines[j].strip().split()[-1])
                if (now_t >= start) and (now_t <= end):
                    if mode == "OR":
                        # Any keyword hit qualifies the row
                        flag = 0
                        for each1 in keyword_list:
                            for k in range(len(word_list)):
                                if (each1 in word_list[k]) and (float(each_weibo_vsm[j][k]) > 0.000001):
                                    this_message = " ".join(vsm_map_word(each_weibo_vsm[j], word_list))
                                    if this_message not in query_result:
                                        query_result.append(this_message)
                                        entropy_result.append(entropy_list[j])
                                    flag = 1
                                    break
                            if flag == 1:
                                break
                    else:
                        # AND mode: every keyword must hit
                        flag = 0
                        for each1 in keyword_list:
                            for k in range(len(word_list)):
                                if (each1 in word_list[k]) and (float(each_weibo_vsm[j][k]) > 0.000001):
                                    flag += 1
                                    break
                        if flag == len(keyword_list):
                            this_message = " ".join(vsm_map_word(each_weibo_vsm[j], word_list))
                            if this_message not in query_result:
                                query_result.append(this_message)
                                entropy_result.append(entropy_list[j])

    # Sort by entropy, descending
    el = zip(entropy_result, query_result)
    el1 = sorted(el, key=itemgetter(0), reverse=True)

    # Keep the top `select` results
    query_result2 = []
    count_number = 1
    for each in el1:
        query_result2.append(each[1])
        count_number += 1
        if count_number > select:
            break

    quick_write_list_to_text(query_result2, write_filename)
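# ---------------------------------------------------------------------------
# Usage sketch for oquery(): all paths, keywords and timestamps below are
# hypothetical and only illustrate the expected argument shapes (keyword
# list, "AND"/"OR" mode, [start, end] as numeric timestamps, and the number
# of results to keep):
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    oquery([u"keyword1", u"keyword2"], "OR", [1370000000.0, 1380000000.0], 20,
           u"D:/Local/DataStreamMining/dataset",
           u"D:/Local/DataStreamMining/result/query_result.txt")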