def get_key_words(read_filename, write_filename1, write_filename2):
    '''
    Extract key words with jieba.
    :param read_filename: segmented text, one weibo per line
    :param write_filename1: top key words per weibo
    :param write_filename2: de-duplicated key word vocabulary
    '''
    each_weibo_fenci = []
    get_text_to_complex_list(each_weibo_fenci, read_filename, 0)

    key_words = []
    all_key_words = []
    for row in range(len(each_weibo_fenci)):
        word_entity = []
        for each in each_weibo_fenci[row]:
            word_entity.append(each.split('/')[0])  # strip the POS tag

        tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
        key_words.append(" ".join(tags))
        for word in tags:
            if word not in all_key_words:
                all_key_words.append(word)

    quick_write_list_to_text(key_words, write_filename1)
    quick_write_list_to_text(all_key_words, write_filename2)
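# A minimal, self-contained sketch (not part of the original pipeline) of the
# jieba.analyse.extract_tags call used above: it ranks the words of a text by
# TF-IDF weight and returns the topK of them. The sample text is made up.
def _demo_extract_tags():
    import jieba.analyse
    sample_text = u"微博 话题 检测 微博 文本 聚类"
    tags = jieba.analyse.extract_tags(sample_text, 3)  # top-3 key words
    print " ".join(tags)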
def batch_em_cluster(read_directory, write_directory1, write_directory2):
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    cluster_number = 8
    init_mu = 0.1
    init_sigma = 1.0

    for i in range(file_number):
        vsm = np.loadtxt(read_directory + '/' + str(i + 1) + '.txt')
        data_dimension = vsm.shape[1]

        # random Gaussian initialization of the cluster means
        init_means = []
        for j in range(cluster_number):
            init_means.append(init_sigma * np.random.randn(data_dimension) + init_mu)

        cluster_model = cluster.EMClusterer(init_means, bias=0.1)
        cluster_tag = cluster_model.cluster(vsm, True, trace=False)

        cluster_tag_to_string = [str(x) for x in cluster_tag]
        center_data = cluster_model._means

        quick_write_list_to_text(cluster_tag_to_string, write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_data, write_directory2 + '/' + str(i + 1) + '.txt')
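# The `cluster` module above is assumed to be nltk.cluster: the
# EMClusterer(initial_means, bias=...) constructor, the
# cluster(vectors, assign_clusters, trace) call and the learned `_means`
# attribute all match that API. A hedged sketch on toy 2-D data:
def _demo_em_cluster():
    import numpy as np
    from nltk import cluster
    vectors = [np.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1.0, 3.0]]]
    init_means = [[4.0, 2.0], [4.0, 2.01]]
    clusterer = cluster.EMClusterer(init_means, bias=0.1)
    tags = clusterer.cluster(vectors, True, trace=False)
    print tags               # cluster index assigned to each vector
    print clusterer._means   # the learned cluster means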
def text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    """
    Query-based classification.
    :param read_filename1: query patterns, one per line
    :param read_filename2: word-weight table, "word weight" per line
    :param read_filename3: texts to be searched
    :param write_filename: per-query result lists
    """
    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    word_weight_dict = {}
    f = open(read_filename2, "r")
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    search_texts = []
    f1 = open(read_filename3, "r")
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    result = []
    for i in range(len(query_pattern)):
        this_result = query(query_pattern[i], search_texts, word_weight_dict)
        result.append(" ".join([str(x) for x in this_result]))

    quick_write_list_to_text(result, write_filename)
def global_sort_by_time(update_item_index, read_directory, write_directory):
    print "Begin sorting."
    print "May take a long time, Please Wait..."

    read_file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    segment = 50000
    total_length = len(update_item_index)
    # integer division: any remainder shorter than one segment is dropped
    segment_number = total_length / segment

    print "Total Segment %d ." % segment_number

    for i in range(segment_number):
        print "Segment %d ." % (i + 1)
        content_result = []
        for k in range(segment):
            content_result.append(" ")

        for j in range(read_file_number):
            f1 = open(read_directory + "/" + str(j + 1) + ".txt", "rb")
            this_text_file = f1.readlines()
            f1.close()

            for l in range(segment):
                if update_item_index[segment * i + l][0] == str(j + 1):
                    content_result[l] = this_text_file[int(update_item_index[segment * i + l][1]) - 1].strip()

        quick_write_list_to_text(content_result, write_directory + "/" + str(i + 1) + ".txt")

    print "Global Sort Complete!!!"
def vsm_update(read_directory1, read_directory2, write_directory1, write_directory2):
    '''
    Remove the all-zero rows.
    :param read_directory1: VSM file directory
    :param read_directory2: id/time file directory
    :param write_directory1: updated VSM output directory
    :param write_directory2: updated id/time output directory
    '''
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        update_vsm = []
        update_id_time = []

        f1 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        each_weibo_vsm = f1.readlines()
        f1.close()

        id_time = []
        get_text_to_complex_list2(id_time, read_directory2 + '/' + str(i + 1) + '.txt', 0, 2)

        for j in range(len(each_weibo_vsm)):
            int_each_weibo_vsm = [int(x) for x in each_weibo_vsm[j].split()]
            # keep only the rows that are not all zeros
            if np.sum(int_each_weibo_vsm) > 0.1:
                update_vsm.append(each_weibo_vsm[j])
                update_id_time.append(" ".join(id_time[j]))

        quick_write_list_to_text2(update_vsm, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(update_id_time, write_directory2 + '/' + str(i + 1) + '.txt')

    print "VSM Update Complete!!!"
def get_key_words(read_directory, write_directory1, write_directory2):
    '''
    Extract key words with jieba for every data segment.
    :param read_directory: segmented-text file directory
    :param write_directory1: per-weibo key word output directory
    :param write_directory2: per-segment key word vocabulary output directory
    '''
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        each_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci, read_directory + '/' + str(i + 1) + '.txt', 0)

        key_words = []
        all_key_words = []
        for row in range(len(each_weibo_fenci)):
            word_entity = []
            for each in each_weibo_fenci[row]:
                word_entity.append(each.split('/')[0])  # strip the POS tag

            tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
            key_words.append(" ".join(tags))
            for word in tags:
                if word not in all_key_words:
                    all_key_words.append(word)

        quick_write_list_to_text(key_words, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_key_words, write_directory2 + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
def compute_em_weights(read_filename1, read_filename2, write_filename):
    '''
    Linear fusion of three per-item scores with fixed coefficients.
    :param read_filename1: three whitespace-separated scores per line
    :param read_filename2: the three fusion coefficients, one per line
    :param write_filename: fused weight per line
    '''
    em_weights = []
    coefficients_string = []
    get_text_to_single_list(coefficients_string, read_filename2)
    coefficients = [float(x) for x in coefficients_string]

    f = open(read_filename1, 'r')
    line = f.readline()
    while line:
        each_line = line.split()
        em_weights.append(float(each_line[0]) * coefficients[0]
                          + float(each_line[1]) * coefficients[1]
                          + float(each_line[2]) * coefficients[2])
        line = f.readline()
    f.close()

    em_weights_to_string = [str(x) for x in em_weights]
    quick_write_list_to_text(em_weights_to_string, write_filename)
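# The fusion above is a fixed three-term weighted sum per line. A hedged,
# vectorized equivalent on made-up numbers (one score triple per item):
def _demo_linear_fusion():
    import numpy as np
    scores = np.array([[0.2, 0.5, 0.3],
                       [0.9, 0.0, 0.1]])        # one row of three scores per item
    coefficients = np.array([0.5, 0.3, 0.2])
    print np.dot(scores, coefficients)          # fused weight per item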
def text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    '''
    Inverse variant of the query classification: query2 returns, for each
    query pattern, the index of the matching text; the output groups the
    query ids per text.
    '''
    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    search_texts = []
    f1 = open(read_filename3, 'r')
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    result = []
    for i in range(len(search_texts)):
        result.append([])

    for i in range(len(query_pattern)):
        this_result = query2(query_pattern[i], search_texts, word_weight_dict)
        result[this_result].append(str(i))

    result_to_string = []
    for each in result:
        result_to_string.append(" ".join(each))

    quick_write_list_to_text(result_to_string, write_filename)
def map_word_list(read_directory1, read_directory2, write_filename):
    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    result = []
    for i in range(file_number):
        word_list = []
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            word_list.append(line.strip())
            line = f.readline()
        f.close()

        vsm = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        vsm = vsm.T
        for each in vsm:
            result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
def hqd_word_segment(read_directory, write_directory1, write_directory2, write_directory3,
                     write_directory4, write_directory5):
    '''
    Segment the raw weibo files and split out each field.
    :param read_directory: raw files, one tab-separated record per line
    :param write_directory1: time stamps
    :param write_directory2: segmented text with POS tags
    :param write_directory3: segmented text without POS tags
    :param write_directory4: class tags
    :param write_directory5: per-segment vocabulary (with POS tags)
    '''
    stopwords_list1 = get_stopwords1()
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        time_series = []
        class_tag = []
        content_with_tag = []
        content_without_tag = []
        all_weibo_word = []

        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            this_line = line.strip().split('\t')
            #this_id = this_line[0]
            this_time = time.mktime(time.strptime(this_line[2], '%Y/%m/%d %H:%M'))
            time_series.append(str(this_time))
            class_tag.append(this_line[5])
            try:
                this_text = this_line[6]
            except IndexError:
                this_text = " "

            wd_with_tag = word_segment(this_text, stopwords_list1)
            wd_without_tag = [x.split('/')[0] for x in wd_with_tag]

            # the vocabulary here keeps the POS tags
            for word in set(wd_with_tag).difference(all_weibo_word):
                all_weibo_word.append(word)

            content_with_tag.append(" ".join(wd_with_tag))
            content_without_tag.append(" ".join(wd_without_tag))
            line = f.readline()
        f.close()

        quick_write_list_to_text(time_series, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_with_tag, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_without_tag, write_directory3 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(class_tag, write_directory4 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_weibo_word, write_directory5 + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
def get_key_words(read_directory, write_directory1, write_directory2):
    # variant of the function above: the segmented text is read with column offset 2
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        each_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci, read_directory + '/' + str(i + 1) + '.txt', 2)

        key_words = []
        all_key_words = []
        for row in range(len(each_weibo_fenci)):
            word_entity = []
            for each in each_weibo_fenci[row]:
                word_entity.append(each.split('/')[0])

            tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
            key_words.append(" ".join(tags))
            for word in tags:
                if word not in all_key_words:
                    all_key_words.append(word)

        quick_write_list_to_text(key_words, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_key_words, write_directory2 + '/' + str(i + 1) + '.txt')
def topics_count(read_directory1, read_filename, write_directory):
    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows

    for i in range(file_number):
        id_series = []
        time_series = []

        f1 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        line = f1.readline()
        while line:
            id_series.append(line.split('\x7f')[0])
            #try:
            time_series.append(float(line.split('\x7f')[1]))
            #except:
            #    time_series.append(41275.0)
            line = f1.readline()
        f1.close()

        all_tag = []
        topic_dict = {}
        j = 1
        k = 0
        while j < weibo_row:
            weibo_id = str(weibo_sheet.cell(j, 0).value).split('.')[0]
            weibo_time = weibo_sheet.cell(j, 2).value
            weibo_time = time_convert(weibo_time)
            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            #if weibo_id == id_series[k] and weibo_time >= 41538 and weibo_time < 41548:
            if weibo_id == id_series[k] and np.abs(weibo_time - time_series[k]) < 0.01:
                if weibo_tag in all_tag:
                    topic_dict[weibo_tag] += 1
                else:
                    all_tag.append(weibo_tag)
                    topic_dict[weibo_tag] = 1
                k += 1
            j += 1
            if k >= len(id_series):
                break

        result = []
        for each in all_tag:
            result.append(each + ' ' + str(topic_dict[each]))

        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')
def select_top_N_words(read_directory1, read_directory2, write_directory):
    N = 1000
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    # POS-tag score table: nouns and proper names score high, function words score 0
    score_dict = {"nr": 1.0, "nr1": 0.5, "nr2": 0.75, "nrt": 1.0, "nrf": 1.0, "ns": 1.0,
                  "nsf": 1.0, "nt": 1.0, "nz": 1.0, "nl": 0.5, "ng": 0.5, "n": 0.9,
                  "t": 0.5, "tg": 0.5, "s": 0.3, "f": 0.3, "j": 0.5,
                  "v": 0.7, "vd": 0.6, "vn": 0.9, "vshi": 0.0, "vyou": 0.0, "vf": 0.3,
                  "vx": 0.3, "vi": 0.7, "vl": 0.3, "vg": 0.5,
                  "a": 0.6, "ad": 0.3, "an": 0.9, "ag": 0.5, "al": 0.3, "b": 0.3, "bl": 0.2,
                  "z": 0.9, "zg": 0.3, "r": 0.3, "rr": 0.3, "rz": 0.3, "rzt": 0.3, "rzs": 0.3,
                  "rzv": 0.3, "ry": 0.2, "ryt": 0.2, "rys": 0.2, "ryv": 0.2, "rg": 0.2,
                  "m": 0.6, "mq": 0.5, "q": 0.6, "qv": 0.7, "qt": 0.7,
                  "d": 0.4, "p": 0.0, "pba": 0.0, "pbei": 0.0, "c": 0.0, "cc": 0.0,
                  "u": 0.0, "ug": 0.0, "e": 0.0, "y": 0.0, "o": 0.0, "h": 0.0, "k": 0.0,
                  "x": 0.0, "xx": 0.0, "xu": 0.9, "w": 0.0, "l": 0.6, "i": 0.6,
                  "g": 0.0, "vq": 0.0, "nrfg": 0.75, "dg": 0.0, "mg": 0.2, "yg": 0.0}

    for i in range(file_number):
        each_word_tf = []
        key_words = []
        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # skip the "-Word- -TF-" header line
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            # score = TF * POS score, boosted for key words (1.0) vs. other words (0.8)
            boost = 1.0 if word_entity in key_words else 0.80
            select_word.append(word_entity)
            try:
                word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * boost)
            except KeyError:
                word_score.append(0.0)

        # sort by score, descending
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
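# Hedged sketch of the scoring rule above: score = TF * POS weight * boost,
# where the boost is 1.0 for words that also appear in the key word list and
# 0.8 for all other words. The data below is made up.
def _demo_word_score():
    pos_score = {"n": 0.9, "v": 0.7}
    key_words = [u"话题"]
    for word_tag, tf in [(u"话题/n", 12), (u"检测/v", 7)]:
        word, tag = word_tag.split('/')
        boost = 1.0 if word in key_words else 0.8
        print word, tf * pos_score.get(tag, 0.0) * boost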
def generate_vsm_for_trans(read_filename):
    now_directory = os.getcwd()
    root_directory = os.path.dirname(now_directory) + '/'
    write_directory = root_directory + u'dataset'
    if not os.path.exists(write_directory):
        os.mkdir(write_directory)
    write_filename = write_directory + u'/vsm.txt'

    pattern_list = []
    all_word_list = []
    f = open(read_filename, 'r')
    line = f.readline()
    while line:
        if len(line.split()) > 1:
            pattern_list.append(line.split())
            for each in line.split():
                if each not in all_word_list:
                    all_word_list.append(each)
        line = f.readline()
    f.close()

    vsm = []
    for i in range(len(pattern_list)):
        # presence/absence vector over the global word list
        tf_dict = {}
        for key in all_word_list:
            tf_dict[key] = 0
        for each in pattern_list[i]:
            tf_dict[each] = 1  # every pattern word is in all_word_list by construction

        this_line = []
        for key in all_word_list:
            this_line.append(tf_dict[key])
        vsm.append(this_line)

    vsm_to_string = []
    for each in vsm:
        vsm_to_string.append(" ".join([str(x) for x in each]))

    # note: the extra brackets yield a 3-D array of shape (1, patterns, words)
    np_vsm = np.array([vsm])

    quick_write_list_to_text(vsm_to_string, write_filename)
    return np_vsm
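# Hedged sketch of the presence/absence VSM built above: each pattern becomes
# a 0/1 vector over the global word list (made-up data).
def _demo_presence_vsm():
    patterns = [["a", "b"], ["b", "c"]]
    vocabulary = ["a", "b", "c"]
    for pattern in patterns:
        print [1 if word in pattern else 0 for word in vocabulary]
    # -> [1, 1, 0] and [0, 1, 1]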
def get_new_wordlist(read_directory1, read_directory2, write_directory, write_filename):
    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    file_count = 1
    # 41538 / 41548 are apparently Excel-style serial day numbers bounding the target window
    this_start_time = 41538
    new_word_list = []
    all_batch_id = []
    this_batch_id = []

    for i in range(file_number):
        time_series = []
        f = open(read_directory1 + "/" + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            time_series.append(float(line.split()[1]))
            line = f.readline()
        f.close()

        if time_series[0] >= 41548:
            break
        elif time_series[-1] < 41538:
            pass
        else:
            word_list = []
            f1 = open(read_directory2 + '/' + str(i + 1) + '.txt', 'rb')
            line = f1.readline()
            while line:
                word_list.append(line.split()[0])
                line = f1.readline()
            f1.close()

            if time_series[-1] - this_start_time < 2:
                # still inside the current two-day batch
                for word in set(word_list).difference(new_word_list):
                    new_word_list.append(word)
                this_batch_id.append(str(i + 1))
            else:
                quick_write_list_to_text(new_word_list, write_directory + '/' + str(file_count) + '.txt')
                all_batch_id.append(" ".join(this_batch_id))
                new_word_list = []
                this_start_time = this_start_time + 2
                this_batch_id = []
                file_count = file_count + 1

    quick_write_list_to_text(all_batch_id, write_filename)
def get_final_center(read_filename1, read_filename2, write_filename):
    result = []
    word_list = []
    get_text_to_single_list(word_list, read_filename2)

    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
def count_word_tf(read_directory1, read_directory2, write_directory):
    '''
    Count the term frequency of every word in each data segment.
    :param read_directory1: segmented-text file directory
    :param read_directory2: vocabulary file directory
    :param write_directory: output directory
    '''
    # total number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        # word segmentation result of each text
        each_text_segment = []
        # vocabulary of this data segment
        all_text_word = []

        get_text_to_complex_list(each_text_segment, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_text_word, read_directory2 + '/' + str(i + 1) + '.txt')

        # term-frequency dictionary
        tf_dict = {}
        for key in all_text_word:
            tf_dict[key] = 0
        for row in range(len(each_text_segment)):
            for j in range(len(each_text_segment[row])):
                try:
                    tf_dict[each_text_segment[row][j]] += 1
                except KeyError:
                    # words outside the vocabulary are kept with count 0
                    tf_dict[each_text_segment[row][j]] = 0

        # term-frequency list, aligned with the vocabulary
        value_list = []
        for key in all_text_word:
            value_list.append(tf_dict[key])

        # sort by TF, descending
        va = zip(all_text_word, value_list)
        va = sorted(va, key=itemgetter(1), reverse=True)

        result_all = ['-Word- -TF-']
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))

        # write to file
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
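# The counting loop above can also be written with collections.Counter; a
# hedged sketch of the equivalent logic (a word outside the vocabulary keeps
# a count of 0, mirroring the KeyError branch):
def _demo_count_tf():
    from collections import Counter
    vocabulary = ["a", "b", "c"]
    counts = Counter(["a", "b", "a", "d"])
    print [(word, counts[word]) for word in vocabulary]
    # -> [('a', 2), ('b', 1), ('c', 0)]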
def top_N_words_tfidf_vsm_process(read_directory1, read_directory2, write_directory):
    '''
    Build the vector space model of the weibo texts; the values are TFs.
    :param read_directory1: segmented-text file directory
    :param read_directory2: top-N word file directory
    :param write_directory: output directory
    '''
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_fenci = []

        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 2)

        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_fenci.append(line.strip().split()[0])
            line = f.readline()
        f.close()

        result = []
        for row in range(len(each_weibo_fenci)):
            # term-frequency dictionary
            tf_dict = {}
            for key in all_weibo_fenci:
                tf_dict[key] = 0
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] = 0

            this_line = []
            for key in all_weibo_fenci:
                this_line.append(str(tf_dict[key]))

            # join each row into a single string for writing
            result.append(" ".join(this_line))

        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')

    print "VSM Complete!!!"
def hq_text_clustering(read_directory1, read_directory2, read_directory3,
                       write_directory1, write_directory2):
    gamma = 0.01
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        THETA = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        PHAI = np.loadtxt(read_directory2 + '/' + str(i + 1) + '.txt')

        # word list of this data segment
        this_word_list = []
        f1 = open(read_directory3 + '/' + str(i + 1) + '.txt', 'rb')
        line = f1.readline()
        while line:
            this_word_list.append(line.split()[0])
            line = f1.readline()
        f1.close()

        # np.loadtxt returns a 1-D array for a single-row file; a length >= 200
        # appears to mean we got one topic's word distribution, so re-wrap it
        if len(PHAI) >= 200:
            PHAI = np.array([PHAI])

        # assign each text to its most probable topic
        cluster_tag = []
        for j in range(len(THETA)):
            cluster_tag.append(str(np.argmax(THETA[j])))

        # keep, per topic, the words whose probability exceeds gamma,
        # sorted by probability in descending order
        real_topics = []
        for j in range(len(PHAI)):
            this_topic = []
            this_topic_weight = []
            for k in range(len(PHAI[j])):
                if PHAI[j][k] > gamma:
                    this_topic.append(this_word_list[k])
                    this_topic_weight.append(PHAI[j][k])

            tt = zip(this_topic, this_topic_weight)
            tt = sorted(tt, key=itemgetter(1), reverse=True)

            this_topic = []
            for each in tt:
                this_topic.append(each[0])
            real_topics.append(" ".join(this_topic))

        quick_write_list_to_text(cluster_tag, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(real_topics, write_directory2 + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
def compute_distance(read_directory1, read_directory2, read_directory3,
                     write_filename, write_directory):
    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    center_d = []
    for i in range(file_number):
        center = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        center = center.T

        # symmetrized KL distance between the two cluster centers
        kl1 = KL_distance(center[0], center[1])
        kl2 = KL_distance(center[1], center[0])
        center_d.append(str(np.max([kl1, kl2])))

        cluster_data = []
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        cluster_da = f.readlines()
        f.close()
        for each in cluster_da:
            cluster_data.append([float(x) for x in each.split()])
        #cluster_data = get_text_to_nparray(read_directory2 + '/' + str(i + 1) + '.txt', 'float')

        f = open(read_directory3 + '/' + str(i + 1) + '.txt')
        cluster_tag = f.readlines()
        f.close()

        # average the symmetrized KL distance of each member to its own center
        final_distance = []
        distance1 = 0.0
        distance2 = 0.0
        count1 = 0
        count2 = 0
        for j in range(len(cluster_tag)):
            if cluster_tag[j].strip() == '1':
                kl1 = KL_distance(center[0], cluster_data[j])
                kl2 = KL_distance(cluster_data[j], center[0])
                distance1 += np.max([kl1, kl2])
                count1 += 1
            if cluster_tag[j].strip() == '2':
                kl1 = KL_distance(center[1], cluster_data[j])
                kl2 = KL_distance(cluster_data[j], center[1])
                distance2 += np.max([kl1, kl2])
                count2 += 1

        final_distance.append(str(np.true_divide(distance1, count1)))
        final_distance.append(str(np.true_divide(distance2, count2)))

        quick_write_list_to_text(final_distance, write_directory + '/' + str(i + 1) + '.txt')

    quick_write_list_to_text(center_d, write_filename)
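# KL_distance is a project helper whose definition is not shown here. A hedged
# sketch of the symmetrized distance used above, assuming it is the usual
# Kullback-Leibler divergence over (smoothed, normalized) distributions:
def _demo_symmetric_kl():
    import numpy as np
    def kl(p, q):
        p = np.asarray(p, dtype=float) + 1e-12
        q = np.asarray(q, dtype=float) + 1e-12
        p, q = p / p.sum(), q / q.sum()
        return np.sum(p * np.log(p / q))
    p, q = [0.7, 0.2, 0.1], [0.3, 0.4, 0.3]
    print max(kl(p, q), kl(q, p))  # symmetrize by taking the max, as above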
def SP_CT_LDA(read_directory1, read_directory2, write_directory1, write_directory2, write_directory3):
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        THETA = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        PHAI = np.loadtxt(read_directory2 + '/' + str(i + 1) + '.txt')

        # view 1: similarity between latent topics based on their word distributions
        W1 = np.zeros((len(PHAI), len(PHAI)))
        for j in range(len(PHAI)):
            for k in range(j, len(PHAI)):
                W1[j, k] = 1.0 / (SKLD(PHAI[j], PHAI[k]) + 1.0)
                W1[k, j] = W1[j, k]

        # estimate the number of clusters
        cluster_number = get_cluster_number(W1)
        print cluster_number

        # cluster analysis
        cluster_tag = spectral_cluster2(W1, cluster_number)

        center_topic = np.zeros((cluster_number, len(PHAI[0])))
        each_cluster_number = np.zeros(cluster_number, int)
        weibo_topic_similarity = np.zeros((cluster_number, len(THETA)))

        THETA = THETA.transpose()
        for j in range(len(cluster_tag)):
            center_topic[cluster_tag[j]] += PHAI[j]
            each_cluster_number[cluster_tag[j]] += 1
            weibo_topic_similarity[cluster_tag[j]] += THETA[j]

        for j in range(cluster_number):
            center_topic[j] = center_topic[j] / each_cluster_number[j]
            #weibo_topic_similarity[j] = weibo_topic_similarity[j] / each_cluster_number[j]

        weibo_topic_similarity = weibo_topic_similarity.transpose()
        ecn_to_string = [str(x) for x in each_cluster_number]

        write_matrix_to_text(weibo_topic_similarity, write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_topic, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(ecn_to_string, write_directory3 + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
def compute_distance(read_directory1, read_directory2, read_directory3,
                     write_filename, write_directory):
    # variant of the function above: the member vectors are loaded with np.loadtxt
    # and transposed
    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    center_d = []
    for i in range(file_number):
        center = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        center = center.T

        kl1 = KL_distance(center[0], center[1])
        kl2 = KL_distance(center[1], center[0])
        center_d.append(str(np.max([kl1, kl2])))

        cluster_data = np.loadtxt(read_directory2 + '/' + str(i + 1) + '.txt')
        cluster_data = cluster_data.T

        f = open(read_directory3 + '/' + str(i + 1) + '.txt')
        cluster_tag = f.readlines()
        f.close()

        final_distance = []
        distance1 = 0.0
        distance2 = 0.0
        count1 = 0
        count2 = 0
        for j in range(len(cluster_tag)):
            if cluster_tag[j].strip() == '1':
                kl1 = KL_distance(center[0], cluster_data[j])
                kl2 = KL_distance(cluster_data[j], center[0])
                distance1 += np.max([kl1, kl2])
                count1 += 1
            if cluster_tag[j].strip() == '2':
                kl1 = KL_distance(center[1], cluster_data[j])
                kl2 = KL_distance(cluster_data[j], center[1])
                distance2 += np.max([kl1, kl2])
                count2 += 1

        final_distance.append(str(np.true_divide(distance1, count1)))
        final_distance.append(str(np.true_divide(distance2, count2)))

        quick_write_list_to_text(final_distance, write_directory + '/' + str(i + 1) + '.txt')

    quick_write_list_to_text(center_d, write_filename)
def count_word_tf(read_directory, write_directory):
    # note: despite the name, this collects the distinct words of each file
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        review_keywords = []
        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            for word in set(line.split()).difference(review_keywords):
                review_keywords.append(word)
            line = f.readline()
        f.close()

        quick_write_list_to_text(review_keywords, write_directory + '/' + str(i + 1) + '.txt')
def data_segment(read_filename, write_directory):
    weibo_sheet = open_sheet(read_filename)
    weibo_column = weibo_sheet.ncols
    weibo_row = weibo_sheet.nrows
    print 'Number of the Weibo row: %d' % weibo_row

    stopwords_list = get_stopwords()

    all_weibo_word = []
    each_weibo_fenci = []
    file_number = 1
    piece = 3000

    if weibo_row < piece:
        print "Exception: Data is too small!!!"
    else:
        for i in range(1, weibo_row):
            weibo_id = str(int(weibo_sheet.cell(i, 0).value))
            weibo_time = weibo_sheet.cell(i, 2).value
            weibo_time = time_convert(weibo_time)
            weibo_content = str(weibo_sheet.cell(i, weibo_column - 1).value)

            fenci_result = word_segment(weibo_content, stopwords_list)
            each_weibo_fenci.append(weibo_id.strip() + " " + str(weibo_time) + " " + " ".join(fenci_result))
            for word in set(fenci_result).difference(all_weibo_word):
                all_weibo_word.append(word)

            if i % piece == 0:
                quick_write_list_to_text(each_weibo_fenci,
                                         write_directory + u'/each_weibo_fenci/' + str(file_number) + '.txt')
                quick_write_list_to_text(all_weibo_word,
                                         write_directory + u'/all_weibo_word/' + str(file_number) + '.txt')
                file_number = file_number + 1
                each_weibo_fenci = []
                all_weibo_word = []
                # drop the final piece if it is shorter than a full segment
                if weibo_row - i < piece:
                    break

    print "Data Segmentation Complete!!!"
    print "Total Segments: %d" % (file_number - 1)
def get_word_list(read_directory, write_directory):
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        word_list = []
        f = open(read_directory + '/' + str(i + 1) + '.txt', 'r')
        line = f.readline()
        while line:
            for each in line.split():
                if each not in word_list:
                    word_list.append(each)
            line = f.readline()
        f.close()

        quick_write_list_to_text(word_list, write_directory + '/' + str(i + 1) + '.txt')
def top_N_words_tfidf_vsm_process(read_directory1, read_directory2, write_directory):
    '''
    Build the vector space model of the weibo texts; the values are TFs.
    (Same as the variant above, but the segmented text is read with column offset 0.)
    :param read_directory1: segmented-text file directory
    :param read_directory2: top-N word file directory
    :param write_directory: output directory
    '''
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_fenci = []

        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)

        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_fenci.append(line.strip().split()[0])
            line = f.readline()
        f.close()

        result = []
        for row in range(len(each_weibo_fenci)):
            # term-frequency dictionary
            tf_dict = {}
            for key in all_weibo_fenci:
                tf_dict[key] = 0
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] = 0

            this_line = []
            for key in all_weibo_fenci:
                this_line.append(str(tf_dict[key]))

            # join each row into a single string for writing
            result.append(" ".join(this_line))

        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')

    print "VSM Complete!!!"
def batch_count_tf(read_directory1, read_directory2, write_directory):
    '''
    Count term frequencies for every data segment and sort them in descending order.
    :param read_directory1: segmented-text file directory
    :param read_directory2: vocabulary file directory
    :param write_directory: output directory
    '''
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_fenci = []

        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_weibo_fenci, read_directory2 + '/' + str(i + 1) + '.txt')

        # term-frequency dictionary
        tf_dict = {}
        for key in all_weibo_fenci:
            tf_dict[key] = 0
        for row in range(len(each_weibo_fenci)):
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j]] = 0

        # term-frequency list, aligned with the vocabulary
        value_list = []
        for key in all_weibo_fenci:
            value_list.append(tf_dict[key])

        # sort by TF, descending
        va = zip(all_weibo_fenci, value_list)
        va = sorted(va, key=itemgetter(1), reverse=True)

        result_all = []
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
def data_segment(read_filename, write_directory):
    '''
    Split the data into pieces of 5000 lines.
    :param read_filename: input file
    :param write_directory: output directory
    '''
    # number of the first output file
    file_number = 1

    print "Begin data segmentation!!!"
    print "May take a long time, Please Wait..."

    # content of each weibo in the current piece
    weibo_content_segment = []
    # line id of each weibo in the current piece
    weibo_id_segment = []

    line_count = 0
    fr = open(read_filename)
    line = fr.readline()
    while line:
        weibo_content_segment.append(line.strip())
        weibo_id_segment.append(str(line_count))
        line_count += 1

        if line_count % 5000 == 0:
            # write to file
            quick_write_list_to_text(weibo_content_segment,
                                     write_directory + u'/weibo_segment/' + str(file_number) + '.txt')
            quick_write_list_to_text(weibo_id_segment,
                                     write_directory + u'/weibo_id/' + str(file_number) + '.txt')
            file_number += 1
            weibo_content_segment = []
            weibo_id_segment = []

        line = fr.readline()
    fr.close()

    print "Data Segmentation Complete!!!"
    print "Total Segments: %d" % (file_number - 1)
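# Hypothetical usage (the paths are made up): split a large one-weibo-per-line
# file into 5000-line pieces under write_directory/weibo_segment, with the
# matching line ids under write_directory/weibo_id.
#
# data_segment(u'./data/weibo_all.txt', u'./output')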
def map_word_list(read_filename1, read_filename2, write_filename):
    word_list = []
    f = open(read_filename2, 'rb')
    line = f.readline()
    while line:
        word_list.append(line.strip().split(',')[0])
        line = f.readline()
    f.close()

    word_result = []
    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        word_result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(word_result, write_filename)
def sample_real_center(read_filename1, read_filename2, write_filename):
    result = []
    word_list = []
    f = open(read_filename2)
    line = f.readline()
    while line:
        word_list.append(line.strip().split()[0])
        line = f.readline()
    f.close()
    word_list = word_list[0:1000]

    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
def spct_prf(read_filename1, read_filename2, write_filename):
    cluster_tag = []
    real_tag = []
    get_text_to_single_list(cluster_tag, read_filename1)
    get_text_to_single_list(real_tag, read_filename2)

    cluster_tag = [int(x) for x in cluster_tag]
    real_tag = [int(x) for x in real_tag]

    reflect = [20, 21, 20]
    p, r, f = prf(cluster_tag, real_tag, reflect)
    print p
    print r
    print f

    quick_write_list_to_text([str(p), str(r), str(f)], write_filename)
def get_weibo_entropy(read_directory1, read_directory2, write_directory):
    '''
    Compute the "entropy" of each text: the dot product of its term-frequency
    vector with the log2 weights of the top-N words.
    :param read_directory1: term-frequency vector file directory
    :param read_directory2: top-N word file directory
    :param write_directory: output directory
    '''
    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        word_weight = []

        # read the term-frequency vectors
        f0 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        each_vsm = f0.readlines()
        f0.close()

        # read the word weights into a list
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            word_weight.append(float(line.split()[1]))
            line = f.readline()
        f.close()

        # np.log2 turns the list into an array of log weights
        word_weight = np.log2(word_weight)

        entropy_all = []
        for each in each_vsm:
            # entropy value: sum_j tf_j * log2(weight_j)
            each_line_vsm = np.array([float(x) for x in each.split()])
            entropy_all.append(str(np.dot(word_weight, each_line_vsm)))

        # write to file
        quick_write_list_to_text(entropy_all, write_directory + '/' + str(i + 1) + '.txt')

    print "Compute Entropy Complete!!!"
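# Hedged sketch of the "entropy" computed above on made-up numbers: the dot
# product of the per-word log2 weights with a text's term-frequency vector,
# i.e. sum_j tf_j * log2(weight_j).
def _demo_weibo_entropy():
    import numpy as np
    word_weight = np.log2([8.0, 4.0, 2.0])  # log2 weights -> [3., 2., 1.]
    tf_vector = np.array([1.0, 0.0, 2.0])   # term frequencies of one text
    print np.dot(word_weight, tf_vector)    # 1*3 + 0*2 + 2*1 = 5.0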
def pre_text_classify(read_filename1, read_filename2, write_filename):
    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    select_number = 3

    word_list = []
    word_weight = []
    #word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_list.append(line.split()[0])
        word_weight.append(line.split()[1])
        line = f.readline()
    f.close()

    word_list = word_list[0:1000]
    word_weight = word_weight[0:1000]

    total_result = []
    for i in range(len(vsm)):
        weight = []
        for j in range(len(word_list)):
            weight.append(vsm[i, j])

        # keep the select_number highest-weighted words of this row
        ww = zip(word_list, weight)
        ww = sorted(ww, key=itemgetter(1), reverse=True)

        word_result = []
        count_number = 1
        for each in ww:
            word_result.append(each[0])
            count_number += 1
            if count_number > select_number:
                break

        total_result.append(" ".join(word_result))

    quick_write_list_to_text(total_result, write_filename)
def sample_vsm(read_filename1, read_filename2, write_filename):
    weibo_content = []
    all_word_list = []
    select_number = 1000

    get_text_to_complex_list(weibo_content, read_filename1, 0)

    f = open(read_filename2)
    line = f.readline()
    while line:
        all_word_list.append(line.strip().split()[0])
        line = f.readline()
    f.close()
    all_word_list = all_word_list[0:select_number]

    vsm = []
    for row in range(len(weibo_content)):
        # term-frequency dictionary
        tf_dict = {}
        for key in all_word_list:
            tf_dict[key] = 0
        for j in range(len(weibo_content[row])):
            try:
                tf_dict[weibo_content[row][j].split('/')[0]] += 1
            except KeyError:
                tf_dict[weibo_content[row][j].split('/')[0]] = 0

        this_line = []
        for key in all_word_list:
            this_line.append(str(tf_dict[key]))

        # join each row into a single string for writing
        vsm.append(" ".join(this_line))

    quick_write_list_to_text(vsm, write_filename)
def count_word_tf(read_filename1, read_filename2, write_filename):
    '''
    Count the term frequency of every word in the data.
    :param read_filename1: segmented-text file
    :param read_filename2: vocabulary file
    :param write_filename: output file
    '''
    each_weibo_fenci = []
    all_weibo_fenci = []

    get_text_to_complex_list(each_weibo_fenci, read_filename1, 0)
    get_text_to_single_list(all_weibo_fenci, read_filename2)

    # term-frequency dictionary
    tf_dict = {}
    for key in all_weibo_fenci:
        tf_dict[key] = 0
    for row in range(len(each_weibo_fenci)):
        for j in range(len(each_weibo_fenci[row])):
            try:
                tf_dict[each_weibo_fenci[row][j]] += 1
            except KeyError:
                tf_dict[each_weibo_fenci[row][j]] = 0

    # term-frequency list, aligned with the vocabulary
    value_list = []
    for key in all_weibo_fenci:
        value_list.append(tf_dict[key])

    # sort by TF, descending
    va = zip(all_weibo_fenci, value_list)
    va = sorted(va, key=itemgetter(1), reverse=True)

    result_all = []
    for each in va:
        result_all.append(each[0] + " " + str(each[1]))

    quick_write_list_to_text(result_all, write_filename)
def global_sort_by_time(update_item_index, read_directory, write_directory):
    print "Begin sorting."
    print "May take a long time, Please Wait..."

    line_count = 1
    file_count = 880  # number of the first output file
    review_result = []

    for i in range(len(update_item_index)):
        f1 = open(read_directory + "/" + update_item_index[i][0] + ".txt", "rb")
        each_review_text = f1.readlines()
        f1.close()

        # copy the 8-line block around the indexed line, then a blank separator
        #try:
        time_index = int(update_item_index[i][1])
        review_result.append(each_review_text[time_index - 6].strip())
        review_result.append(each_review_text[time_index - 5].strip())
        review_result.append(each_review_text[time_index - 4].strip())
        review_result.append(each_review_text[time_index - 3].strip())
        review_result.append(each_review_text[time_index - 2].strip())
        review_result.append(each_review_text[time_index - 1].strip())
        review_result.append(each_review_text[time_index].strip())
        review_result.append(each_review_text[time_index + 1].strip())
        review_result.append("")
        #except IndexError:
        #    review_result.append("\n")

        line_count += 1
        if line_count > 5000:
            quick_write_list_to_text(review_result, write_directory + "/" + str(file_count) + ".txt")
            review_result = []
            line_count = 1
            file_count += 1

    # note: a final block shorter than 5000 items is never flushed here
    print "Sort Complete!!!"
def pattern_cluster(read_filename1, read_filename2, read_filename3,
                    write_filename1, write_filename2):
    pattern_list = []
    f = open(read_filename1, 'r')
    line = f.readline()
    while line:
        if len(line.split()) > 1:
            pattern_list.append(line.split())
        line = f.readline()
    f.close()

    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    # compute_similarity returns the similarity matrix and the cluster number
    similarity_matrix, cluster_number = compute_similarity(pattern_list, read_filename3, word_weight_dict)

    write_matrix_to_text(similarity_matrix, write_filename1)
    quick_write_list_to_text([str(cluster_number)], write_filename2)
def kmeans_evaluate(read_filename1, read_filename2, write_directory):
    # tags are read as strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename1)
    cluster_tag = []
    get_text_to_single_list(cluster_tag, read_filename2)
    real_tag = real_tag[0:len(cluster_tag)]

    # list index + 1 is the cluster id; each entry lists the matching
    # ground-truth tags, e.g. cluster 1 corresponds to real tags 6 and 8
    reflect_tag = [['6', '8'], ['4'], ['5'], ['7'], ['3'], ['2'], ['6', '8'], ['1']]

    cluster_partion = []
    for i in range(len(reflect_tag)):
        cluster_partion.append([])
    for i in range(len(cluster_tag)):
        cluster_partion[int(cluster_tag[i]) - 1].append(str(i))

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))

        correct = len(set(cluster_partion[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct, len(set(cluster_partion[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure

    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
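# Hedged sketch of the per-cluster metrics above: with C the item ids placed
# in a cluster and R the item ids of its mapped real class(es),
# precision = |C & R| / |C|, recall = |C & R| / |R|, F = 2PR / (P + R).
def _demo_prf():
    cluster_items = set(['0', '1', '2'])
    real_items = set(['1', '2', '3'])
    correct = len(cluster_items & real_items)
    p = float(correct) / len(cluster_items)
    r = float(correct) / len(real_items)
    print p, r, 2.0 * p * r / (p + r)  # -> 0.667 0.667 0.667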
def global_segment(read_filename, write_directory):
    '''
    Segment all weibo texts and collect each weibo's metadata.
    :param read_filename: Excel sheet with the raw weibo data
    :param write_directory: output directory
    '''
    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    global_id = []
    global_time = []
    global_tag = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print 'Number of the Weibo row: %d' % weibo_row

    f1 = open(write_directory + '/weibo_content.txt', 'w')
    f2 = open(write_directory + '/weibo_content2.txt', 'w')
    for j in range(1, weibo_row):
        weibo_id = str(int(weibo_sheet.cell(j, 0).value))
        weibo_time = weibo_sheet.cell(j, 2).value
        weibo_time = time_convert(weibo_time)
        weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

        global_id.append(weibo_id)
        global_time.append(str(weibo_time))

        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
        f1.write(" ".join(fenci_result))
        f1.write("\n")

        fenci_without_tag = [x.split('/')[0] for x in fenci_result]
        f2.write(" ".join(fenci_without_tag))
        f2.write("\n")

        global_tag.append(weibo_tag)
    f1.close()
    f2.close()

    quick_write_list_to_text(global_id, write_directory + '/weibo_id.txt')
    quick_write_list_to_text(global_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(global_tag, write_directory + '/weibo_class_tag.txt')
def data_sample(read_directory, write_directory1, write_directory2):
    '''
    Random-projection sampling of each VSM segment.
    :param read_directory: VSM file directory
    :param write_directory1: sampled-matrix output directory
    :param write_directory2: timing / compression-ratio output directory
    '''
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory)])

    sample_size = 250
    sample_time = []
    ratio = []
    for i in range(file_number):
        vsm_matrix = get_text_to_nparray(read_directory + '/' + str(i + 1) + '.txt', 'int')
        vsm_matrix = vsm_matrix.T
        print 'Batch: %d' % (i + 1)

        start = time.clock()
        data_dimension = vsm_matrix.shape[0]

        # random projection matrix with entries ~ N(1, sqrt(1/sqrt(sample_size)))
        Q = np.zeros((sample_size, data_dimension))
        for k in range(Q.shape[0]):
            for j in range(Q.shape[1]):
                Q[k, j] = random.gauss(1, np.sqrt(np.true_divide(1, np.sqrt(sample_size))))

        sample_result = np.dot(Q, vsm_matrix)

        this_ratio = np.true_divide(sample_size, data_dimension) * 8.0 / 4.0
        ratio.append(str(this_ratio))

        interval = time.clock() - start
        print 'Time: %f' % interval
        sample_time.append(str(interval))

        write_result = []
        for each in sample_result:
            write_result.append(" ".join([str(x) for x in each]))

        quick_write_list_to_text(write_result, write_directory1 + '/' + str(i + 1) + '.txt')

    quick_write_list_to_text(sample_time, write_directory2 + '/sample_time.txt')
    quick_write_list_to_text(ratio, write_directory2 + '/ratio.txt')
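# Hedged sketch of the sampling step above: a random matrix Q of shape
# (sample_size, data_dimension), with entries of mean 1 and standard deviation
# sqrt(1/sqrt(sample_size)), projects the (dimension x items) VSM down to
# sample_size rows, in the spirit of random-projection sketching.
def _demo_random_projection():
    import numpy as np
    sample_size, data_dimension = 4, 10
    sigma = np.sqrt(np.true_divide(1, np.sqrt(sample_size)))
    Q = np.random.normal(1.0, sigma, (sample_size, data_dimension))
    data = np.random.rand(data_dimension, 25)  # made-up data matrix
    print np.dot(Q, data).shape                # -> (4, 25)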