def topics_count(read_directory1, read_filename, write_directory):
    '''
    Count class-tag frequencies for the weibos listed in each numbered
    id/time file under read_directory1; write one "<tag> <count>" line
    per tag into a same-numbered file in write_directory.
    :param read_directory1: directory of 1.txt..N.txt, one "<id>\x7f<time>" record per line
    :param read_filename: Excel workbook readable by open_sheet()
    :param write_directory: output directory for the per-file tag counts
    '''
    # Total number of input files (files are named 1.txt .. N.txt).
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    for i in range(file_number):
        id_series = []
        time_series = []
        f1 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        try:
            for line in f1:
                # Each record is "<id>\x7f<float time>".
                parts = line.split('\x7f')
                id_series.append(parts[0])
                time_series.append(float(parts[1]))
        finally:
            f1.close()
        # Guard: an empty input file previously raised IndexError at
        # id_series[0]; emit an empty count file instead.
        if not id_series:
            quick_write_list_to_text([], write_directory + '/' + str(i + 1) + '.txt')
            continue
        all_tag = []        # tags in first-seen order (output order)
        topic_dict = {}     # tag -> occurrence count
        j = 1               # sheet row cursor (row 0 is the header)
        k = 0               # cursor into id_series/time_series
        while j < weibo_row:
            weibo_id = str(weibo_sheet.cell(j, 0).value).split('.')[0]
            weibo_time = time_convert(weibo_sheet.cell(j, 2).value)
            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))
            # Match the sheet row to the current record by id and fuzzy time.
            if weibo_id == id_series[k] and np.abs(weibo_time - time_series[k]) < 0.01:
                # Dict membership is O(1); the original scanned all_tag (a list).
                if weibo_tag in topic_dict:
                    topic_dict[weibo_tag] += 1
                else:
                    all_tag.append(weibo_tag)
                    topic_dict[weibo_tag] = 1
                k += 1
            j += 1
            if k >= len(id_series):
                break
        result = [each + ' ' + str(topic_dict[each]) for each in all_tag]
        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')
def topics_count(read_directory1, read_filename, write_directory):
    '''
    For every numbered record file in read_directory1, tally how often each
    class tag occurs among the matching rows of the Excel sheet and write
    the "<tag> <count>" pairs to the same-numbered output file.
    :param read_directory1: directory holding 1.txt..N.txt ("<id>\x7f<time>" per line)
    :param read_filename: Excel workbook readable by open_sheet()
    :param write_directory: destination directory for the count files
    '''
    # Number of record files to process (named 1.txt .. N.txt).
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    for i in range(file_number):
        id_series = []
        time_series = []
        f1 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        try:
            for line in f1:
                fields = line.split('\x7f')
                id_series.append(fields[0])
                time_series.append(float(fields[1]))
        finally:
            f1.close()
        # Fix: an empty record file made id_series[0] raise IndexError;
        # write an empty result and move on instead.
        if not id_series:
            quick_write_list_to_text([], write_directory + '/' + str(i + 1) + '.txt')
            continue
        all_tag = []        # insertion-ordered tag list for stable output
        topic_dict = {}     # tag -> count
        j = 1               # sheet row (row 0 holds column headers)
        k = 0               # index of the next unmatched record
        while j < weibo_row:
            weibo_id = str(weibo_sheet.cell(j, 0).value).split('.')[0]
            weibo_time = time_convert(weibo_sheet.cell(j, 2).value)
            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))
            if weibo_id == id_series[k] and np.abs(weibo_time - time_series[k]) < 0.01:
                if weibo_tag in topic_dict:
                    topic_dict[weibo_tag] += 1
                else:
                    all_tag.append(weibo_tag)
                    topic_dict[weibo_tag] = 1
                k += 1
            j += 1
            if k >= len(id_series):
                break
        result = [tag + ' ' + str(topic_dict[tag]) for tag in all_tag]
        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')
def global_segment(read_filename, write_directory): ''' 所有微博文本分词,并获取微博的各项信息 :param read_filename: :param write_directory: ''' stopwords_list1 = get_stopwords1() stopwords_list2 = get_stopwords2() global_id = [] global_time = [] global_tag = [] weibo_sheet = open_sheet(read_filename) weibo_row = weibo_sheet.nrows print 'Number of the Weibo row: %d' % weibo_row f1 = open(write_directory + '/weibo_content.txt', 'w') f2 = open(write_directory + '/weibo_content2.txt', 'w') for j in range(1, weibo_row): weibo_id = str(int(weibo_sheet.cell(j, 0).value)) weibo_time = weibo_sheet.cell(j, 2).value weibo_time = time_convert(weibo_time) weibo_tag = str(int(weibo_sheet.cell(j, 5).value)) global_id.append(weibo_id) global_time.append(str(weibo_time)) weibo_content = str(weibo_sheet.cell(j, 6).value) fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2) f1.write(" ".join(fenci_result)) f1.write("\n") fenci_without_tag = [x.split('/')[0] for x in fenci_result] f2.write(" ".join(fenci_without_tag)) f2.write("\n") global_tag.append(weibo_tag) f1.close() f2.close() quick_write_list_to_text(global_id, write_directory + '/weibo_id.txt') quick_write_list_to_text(global_time, write_directory + '/weibo_time.txt') quick_write_list_to_text(global_tag, write_directory + '/weibo_class_tag.txt')
def data_segment(read_filename, write_directory): weibo_sheet = open_sheet(read_filename) weibo_column = weibo_sheet.ncols weibo_row = weibo_sheet.nrows print 'Number of the Weibo row: %d' % weibo_row stopwords_list = get_stopwords() all_weibo_word = [] each_weibo_fenci = [] file_number = 1 piece = 3000 if weibo_row < piece: print "Exception:Data is too small!!!" else: for i in range(1, weibo_row): weibo_id = str(int(weibo_sheet.cell(i, 0).value)) weibo_time = weibo_sheet.cell(i, 2).value weibo_time = time_convert(weibo_time) weibo_content = str(weibo_sheet.cell(i, weibo_column - 1).value) fenci_result = word_segment(weibo_content, stopwords_list) each_weibo_fenci.append(weibo_id.strip() + " " + str(weibo_time) + " " + " ".join(fenci_result)) for word in set(fenci_result).difference(all_weibo_word): all_weibo_word.append(word) if i % piece == 0: quick_write_list_to_text( each_weibo_fenci, write_directory + u'/each_weibo_fenci/' + str(file_number) + '.txt') quick_write_list_to_text( all_weibo_word, write_directory + u'/all_weibo_word/' + str(file_number) + '.txt') file_number = file_number + 1 each_weibo_fenci = [] all_weibo_word = [] if weibo_row - i < piece: break print "Data Segmentation Complete!!!" print "Total Segments: %d" % (file_number - 1)
def data_segment(read_filename, write_directory): weibo_sheet = open_sheet(read_filename) weibo_column = weibo_sheet.ncols weibo_row = weibo_sheet.nrows print 'Number of the Weibo row: %d' % weibo_row stopwords_list = get_stopwords() all_weibo_word = [] each_weibo_fenci = [] file_number = 1 piece = 3000 if weibo_row < piece: print "Exception:Data is too small!!!" else: for i in range(1, weibo_row): weibo_id = str(int(weibo_sheet.cell(i, 0).value)) weibo_time = weibo_sheet.cell(i, 2).value weibo_time = time_convert(weibo_time) weibo_content = str(weibo_sheet.cell(i, weibo_column - 1).value) fenci_result = word_segment(weibo_content, stopwords_list) each_weibo_fenci.append(weibo_id.strip() + " " + str(weibo_time) + " " + " ".join(fenci_result)) for word in set(fenci_result).difference(all_weibo_word): all_weibo_word.append(word) if i % piece == 0: quick_write_list_to_text(each_weibo_fenci, write_directory + u'/each_weibo_fenci/' + str(file_number) + '.txt') quick_write_list_to_text(all_weibo_word, write_directory + u'/all_weibo_word/' + str(file_number) + '.txt') file_number = file_number + 1 each_weibo_fenci = [] all_weibo_word = [] if weibo_row - i < piece: break; print "Data Segmentation Complete!!!" print "Total Segments: %d" % (file_number - 1)
def compute_purity(read_filename1, read_filename2, read_filename3, write_filename):
    '''
    Compare a 2-way clustering against the ground-truth tags in the sheet
    and write the resulting purity (fraction of consistently grouped items).
    :param read_filename1: text file of cluster labels (1 or 2), one per line
    :param read_filename2: text file of "<id> <time>" records, one per line
    :param read_filename3: Excel workbook readable by open_sheet()
    :param write_filename: output file receiving the purity value
    '''
    # NOTE(review): the original computed an unused file_number from the
    # undefined name read_directory1, which raised NameError on every call;
    # that line has been removed.
    weibo_sheet = open_sheet(read_filename3)
    weibo_row = weibo_sheet.nrows
    id_series = []
    time_series = []
    f1 = open(read_filename2)
    try:
        for line in f1:
            fields = line.split()
            id_series.append(fields[0])
            time_series.append(float(fields[1]))
    finally:
        f1.close()
    cluster_tag = []
    f2 = open(read_filename1)
    try:
        for line in f2:
            cluster_tag.append(int(line.strip()))
    finally:
        f2.close()
    # Guard: no cluster labels means id_series[0] and the final division
    # would both fail; report zero purity instead.
    if not cluster_tag:
        quick_write_list_to_text(['0.0'], write_filename)
        return
    # Only the clustered prefix of the records participates.
    id_series = id_series[0:len(cluster_tag)]
    time_series = time_series[0:len(cluster_tag)]
    i = 1   # sheet row cursor (row 0 is the header)
    k = 0   # cursor into cluster_tag/id_series/time_series
    # Cluster number of the first element (1 or 2) and the other cluster.
    tag1 = 0
    tag2 = 0
    # Ground-truth tag of the first element.
    tag3 = 0
    correct = 0
    while i < weibo_row:
        weibo_id = str(weibo_sheet.cell(i, 0).value).split('.')[0]
        weibo_time = time_convert(weibo_sheet.cell(i, 2).value)
        weibo_tag = int(weibo_sheet.cell(i, 5).value)
        if weibo_id == id_series[k] and np.abs(weibo_time - time_series[k]) < 0.01:
            if k == 0:
                # Anchor the cluster<->tag correspondence on the first match.
                tag1 = cluster_tag[0]
                tag2 = 2 if tag1 == 1 else 1
                tag3 = weibo_tag
                correct += 1
            else:
                # Consistent iff same cluster implies same ground-truth group.
                if (cluster_tag[k] == tag1 and weibo_tag == tag3) or (cluster_tag[k] == tag2 and weibo_tag != tag3):
                    correct += 1
            k += 1
            if k >= len(cluster_tag):
                break
        i += 1
    purity = str(np.true_divide(correct, len(cluster_tag)))
    quick_write_list_to_text([purity], write_filename)
def compute_purity(read_filename1, read_filename2, read_filename3, write_filename):
    '''
    Evaluate a 2-way clustering against the sheet's ground-truth tags and
    write the purity score (share of items grouped consistently with the
    first matched element).
    :param read_filename1: text file of cluster labels (1 or 2), one per line
    :param read_filename2: text file of "<id>\x7f<time>" records, one per line
    :param read_filename3: Excel workbook readable by open_sheet()
    :param write_filename: output file receiving the purity value
    '''
    # NOTE(review): removed the original dead file_number computation -- it
    # referenced read_directory1, which is not defined in this function, so
    # every call raised NameError before doing any work.
    weibo_sheet = open_sheet(read_filename3)
    weibo_row = weibo_sheet.nrows
    id_series = []
    time_series = []
    f1 = open(read_filename2)
    try:
        for line in f1:
            fields = line.split('\x7f')
            id_series.append(fields[0])
            time_series.append(float(fields[1]))
    finally:
        f1.close()
    cluster_tag = []
    f2 = open(read_filename1)
    try:
        for line in f2:
            cluster_tag.append(int(line.strip()))
    finally:
        f2.close()
    # Guard: empty inputs would crash at id_series[0] / divide by zero.
    if not cluster_tag or not id_series:
        quick_write_list_to_text(['0.0'], write_filename)
        return
    i = 1   # sheet row cursor (row 0 is the header)
    k = 0   # cursor into cluster_tag/id_series/time_series
    # Cluster number of the first matched element (1 or 2) and its opposite.
    tag1 = 0
    tag2 = 0
    # Ground-truth tag of the first matched element.
    tag3 = 0
    correct = 0
    while i < weibo_row:
        weibo_id = str(weibo_sheet.cell(i, 0).value).split('.')[0]
        weibo_time = time_convert(weibo_sheet.cell(i, 2).value)
        weibo_tag = int(weibo_sheet.cell(i, 5).value)
        if weibo_id == id_series[k] and np.abs(weibo_time - time_series[k]) < 0.01:
            if k == 0:
                # The first match fixes which cluster maps to which tag group.
                tag1 = cluster_tag[0]
                tag2 = 2 if tag1 == 1 else 1
                tag3 = weibo_tag
                correct += 1
            else:
                if (cluster_tag[k] == tag1 and weibo_tag == tag3) or (cluster_tag[k] == tag2 and weibo_tag != tag3):
                    correct += 1
            k += 1
            if k >= len(cluster_tag):
                break
        i += 1
    purity = str(np.true_divide(correct, len(cluster_tag)))
    quick_write_list_to_text([purity], write_filename)
def microblog_extract(read_filename, write_directory): ''' 选出前8000条微博作为高质量文本并分词,获取微博的各项信息 :param read_filename: :param write_directory: ''' #file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)]) select_number = 8000 stopwords_list1 = get_stopwords1() stopwords_list2 = get_stopwords2() #for i in range(file_number): high_quality_weibo = [] high_quality_weibo2 = [] high_quality_id = [] high_quality_time = [] high_quality_tag = [] all_weibo_word = [] weibo_sheet = open_sheet(read_filename) weibo_row = weibo_sheet.nrows print 'Number of the Weibo row: %d' % weibo_row count = 0 for j in range(1, weibo_row): weibo_content = str(weibo_sheet.cell(j, 6).value) fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2) if len(fenci_result) > 5 and (" ".join(fenci_result) not in high_quality_weibo): count = count + 1 weibo_id = str(int(weibo_sheet.cell(j, 0).value)) weibo_time = weibo_sheet.cell(j, 2).value weibo_time = time_convert(weibo_time) weibo_tag = str(int(weibo_sheet.cell(j, 5).value)) high_quality_id.append(weibo_id) high_quality_time.append(weibo_time) high_quality_weibo.append(" ".join(fenci_result)) fenci_without_tag = [x.split('/')[0] for x in fenci_result] high_quality_weibo2.append(" ".join(fenci_without_tag)) high_quality_tag.append(weibo_tag) for word in set(fenci_result).difference(all_weibo_word): if word not in all_weibo_word: all_weibo_word.append(word) if count >= select_number: break print len(high_quality_weibo) # 按时间排序 itw = zip(high_quality_id, high_quality_time, high_quality_weibo, high_quality_weibo2, high_quality_tag) itw1 = sorted(itw, key = itemgetter(1)) high_quality_weibo = [] high_quality_weibo2 = [] high_quality_id = [] high_quality_time = [] high_quality_tag = [] for each in itw1: high_quality_id.append(each[0]) high_quality_time.append(str(each[1])) high_quality_weibo.append(each[2]) high_quality_weibo2.append(each[3]) high_quality_tag.append(each[4]) quick_write_list_to_text(high_quality_id, 
write_directory + '/weibo_id.txt') quick_write_list_to_text(high_quality_time, write_directory + '/weibo_time.txt') quick_write_list_to_text(high_quality_weibo, write_directory + '/weibo_content.txt') quick_write_list_to_text(high_quality_weibo2, write_directory + '/weibo_content2.txt') quick_write_list_to_text(high_quality_tag, write_directory + '/weibo_class_tag.txt') quick_write_list_to_text(all_weibo_word, write_directory + '/weibo_word.txt')
def microblog_extract(read_filename, write_directory): ''' 选出前8000条微博作为高质量文本并分词,获取微博的各项信息 :param read_filename: :param write_directory: ''' #file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)]) select_number = 8000 stopwords_list1 = get_stopwords1() stopwords_list2 = get_stopwords2() #for i in range(file_number): high_quality_weibo = [] high_quality_weibo2 = [] high_quality_id = [] high_quality_time = [] high_quality_tag = [] all_weibo_word = [] weibo_sheet = open_sheet(read_filename) weibo_row = weibo_sheet.nrows print 'Number of the Weibo row: %d' % weibo_row count = 0 for j in range(1, weibo_row): weibo_content = str(weibo_sheet.cell(j, 6).value) fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2) if len(fenci_result) > 5 and (" ".join(fenci_result) not in high_quality_weibo): count = count + 1 weibo_id = str(int(weibo_sheet.cell(j, 0).value)) weibo_time = weibo_sheet.cell(j, 2).value weibo_time = time_convert(weibo_time) weibo_tag = str(int(weibo_sheet.cell(j, 5).value)) high_quality_id.append(weibo_id) high_quality_time.append(weibo_time) high_quality_weibo.append(" ".join(fenci_result)) fenci_without_tag = [x.split('/')[0] for x in fenci_result] high_quality_weibo2.append(" ".join(fenci_without_tag)) high_quality_tag.append(weibo_tag) for word in set(fenci_result).difference(all_weibo_word): if word not in all_weibo_word: all_weibo_word.append(word) if count >= select_number: break print len(high_quality_weibo) # 按时间排序 itw = zip(high_quality_id, high_quality_time, high_quality_weibo, high_quality_weibo2, high_quality_tag) itw1 = sorted(itw, key=itemgetter(1)) high_quality_weibo = [] high_quality_weibo2 = [] high_quality_id = [] high_quality_time = [] high_quality_tag = [] for each in itw1: high_quality_id.append(each[0]) high_quality_time.append(str(each[1])) high_quality_weibo.append(each[2]) high_quality_weibo2.append(each[3]) high_quality_tag.append(each[4]) quick_write_list_to_text(high_quality_id, 
write_directory + '/weibo_id.txt') quick_write_list_to_text(high_quality_time, write_directory + '/weibo_time.txt') quick_write_list_to_text(high_quality_weibo, write_directory + '/weibo_content.txt') quick_write_list_to_text(high_quality_weibo2, write_directory + '/weibo_content2.txt') quick_write_list_to_text(high_quality_tag, write_directory + '/weibo_class_tag.txt') quick_write_list_to_text(all_weibo_word, write_directory + '/weibo_word.txt')