def global_segment(read_filename, write_directory):
    """Segment every Weibo post in a sheet and export its per-row metadata.

    Rows ``1 .. nrows - 1`` of the sheet returned by
    ``open_sheet(read_filename)`` are processed in order; row 0 is skipped
    (presumably a header row -- confirm against the input files).  For each
    row the function reads column 0 (id), column 2 (time), column 5 (class
    tag) and column 6 (content).

    Files written under ``write_directory``:

    * ``weibo_content.txt``   -- one line per row: the tokens returned by
      ``word_segment`` joined with single spaces.
    * ``weibo_content2.txt``  -- the same tokens, each truncated at its first
      ``'/'`` (the suffix looks like a part-of-speech tag -- confirm against
      ``word_segment``).
    * ``weibo_id.txt``, ``weibo_time.txt``, ``weibo_class_tag.txt`` -- the
      collected ids, converted times and tags, written with
      ``quick_write_list_to_text``.

    :param read_filename: path handed to ``open_sheet``.
    :param write_directory: existing directory that receives the output files.
    """
    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    global_id = []
    global_time = []
    global_tag = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    # Single-argument parenthesised form prints the same text on Python 2 and 3.
    print('Number of the Weibo row: %d' % weibo_row)

    # Context managers guarantee both files are flushed and closed even when a
    # malformed row raises part-way through the loop.
    with open(write_directory + '/weibo_content.txt', 'w') as f1:
        with open(write_directory + '/weibo_content2.txt', 'w') as f2:
            for j in range(1, weibo_row):
                # Ids and tags go through int() first, then str().
                weibo_id = str(int(weibo_sheet.cell(j, 0).value))
                weibo_time = time_convert(weibo_sheet.cell(j, 2).value)
                weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

                global_id.append(weibo_id)
                global_time.append(str(weibo_time))

                weibo_content = str(weibo_sheet.cell(j, 6).value)
                fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)

                f1.write(" ".join(fenci_result))
                f1.write("\n")

                # Keep only the part of each token before its first '/'.
                fenci_without_tag = [x.split('/')[0] for x in fenci_result]
                f2.write(" ".join(fenci_without_tag))
                f2.write("\n")

                global_tag.append(weibo_tag)

    quick_write_list_to_text(global_id, write_directory + '/weibo_id.txt')
    quick_write_list_to_text(global_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(global_tag, write_directory + '/weibo_class_tag.txt')
def microblog_extract(read_filename, write_directory, select_number=8000):
    """Select the first ``select_number`` usable posts, segment them and export them.

    Rows ``1 .. nrows - 1`` of the sheet returned by
    ``open_sheet(read_filename)`` are scanned in order (row 0 is skipped,
    presumably a header -- confirm).  A row is kept when ``word_segment``
    yields more than 5 tokens for its column-6 content and the space-joined
    token string has not been kept already.  Scanning stops once
    ``select_number`` rows have been kept.  The kept rows are then sorted by
    their converted time (stable sort, so ties keep sheet order) and written
    out.

    Files written under ``write_directory`` via ``quick_write_list_to_text``:
    ``weibo_id.txt``, ``weibo_time.txt``, ``weibo_content.txt`` (tokens as
    returned by ``word_segment``), ``weibo_content2.txt`` (each token cut at
    its first ``'/'``), ``weibo_class_tag.txt`` and ``weibo_word.txt`` (every
    distinct token seen in the kept rows).

    NOTE(review): this file defines ``microblog_extract`` twice; at import
    time the later definition replaces the earlier one.

    :param read_filename: path handed to ``open_sheet``.
    :param write_directory: existing directory that receives the output files.
    :param select_number: maximum number of posts to keep (default 8000, the
        value that used to be hard-coded).
    """
    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    # One tuple per kept post: (id, time, tagged text, untagged text, class tag).
    selected = []
    # Set companion of the kept texts so the duplicate test is a hash lookup
    # instead of a linear scan over up to ``select_number`` strings per row.
    seen_texts = set()
    all_weibo_word = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    # Single-argument parenthesised form prints the same text on Python 2 and 3.
    print('Number of the Weibo row: %d' % weibo_row)

    for j in range(1, weibo_row):
        if len(selected) >= select_number:
            break

        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
        joined = " ".join(fenci_result)

        # Skip short posts and exact duplicates of an already kept post.
        if len(fenci_result) <= 5 or joined in seen_texts:
            continue
        seen_texts.add(joined)

        weibo_id = str(int(weibo_sheet.cell(j, 0).value))
        weibo_time = time_convert(weibo_sheet.cell(j, 2).value)
        weibo_tag = str(int(weibo_sheet.cell(j, 5).value))
        # Keep only the part of each token before its first '/'.
        joined_without_tag = " ".join([x.split('/')[0] for x in fenci_result])

        selected.append((weibo_id, weibo_time, joined, joined_without_tag, weibo_tag))

        # The difference already excludes known words and a set has no
        # duplicates, so no further membership test is needed before adding.
        all_weibo_word.extend(set(fenci_result).difference(all_weibo_word))

    print(len(selected))

    # Sort by time; the time is stringified only after sorting so the
    # ordering uses whatever comparable value time_convert returned.
    ordered = sorted(selected, key=itemgetter(1))

    quick_write_list_to_text([each[0] for each in ordered], write_directory + '/weibo_id.txt')
    quick_write_list_to_text([str(each[1]) for each in ordered], write_directory + '/weibo_time.txt')
    quick_write_list_to_text([each[2] for each in ordered], write_directory + '/weibo_content.txt')
    quick_write_list_to_text([each[3] for each in ordered], write_directory + '/weibo_content2.txt')
    quick_write_list_to_text([each[4] for each in ordered], write_directory + '/weibo_class_tag.txt')
    quick_write_list_to_text(all_weibo_word, write_directory + '/weibo_word.txt')
def microblog_extract(read_filename, write_directory, select_number=8000):
    """Select the first ``select_number`` usable posts, segment them and export them.

    Rows ``1 .. nrows - 1`` of the sheet returned by
    ``open_sheet(read_filename)`` are scanned in order (row 0 is skipped,
    presumably a header -- confirm).  A row is kept when ``word_segment``
    yields more than 5 tokens for its column-6 content and the space-joined
    token string has not been kept already.  Scanning stops once
    ``select_number`` rows have been kept.  The kept rows are then sorted by
    their converted time (stable sort, so ties keep sheet order) and written
    out.

    Files written under ``write_directory`` via ``quick_write_list_to_text``:
    ``weibo_id.txt``, ``weibo_time.txt``, ``weibo_content.txt`` (tokens as
    returned by ``word_segment``), ``weibo_content2.txt`` (each token cut at
    its first ``'/'``), ``weibo_class_tag.txt`` and ``weibo_word.txt`` (every
    distinct token seen in the kept rows).

    NOTE(review): this file defines ``microblog_extract`` twice; at import
    time the later definition replaces the earlier one.

    :param read_filename: path handed to ``open_sheet``.
    :param write_directory: existing directory that receives the output files.
    :param select_number: maximum number of posts to keep (default 8000, the
        value that used to be hard-coded).
    """
    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    # One tuple per kept post: (id, time, tagged text, untagged text, class tag).
    kept_posts = []
    # Hash-based mirror of the kept texts: makes the duplicate test O(1)
    # rather than a scan over every previously kept string.
    kept_texts = set()
    all_weibo_word = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    # Single-argument parenthesised form prints the same text on Python 2 and 3.
    print('Number of the Weibo row: %d' % weibo_row)

    for j in range(1, weibo_row):
        if len(kept_posts) >= select_number:
            break

        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
        tagged_text = " ".join(fenci_result)

        # Skip short posts and exact duplicates of an already kept post.
        if len(fenci_result) <= 5 or tagged_text in kept_texts:
            continue
        kept_texts.add(tagged_text)

        weibo_id = str(int(weibo_sheet.cell(j, 0).value))
        weibo_time = time_convert(weibo_sheet.cell(j, 2).value)
        weibo_tag = str(int(weibo_sheet.cell(j, 5).value))
        # Keep only the part of each token before its first '/'.
        untagged_text = " ".join([x.split('/')[0] for x in fenci_result])

        kept_posts.append((weibo_id, weibo_time, tagged_text, untagged_text, weibo_tag))

        # The difference already excludes known words and a set has no
        # duplicates, so no further membership test is needed before adding.
        all_weibo_word.extend(set(fenci_result).difference(all_weibo_word))

    print(len(kept_posts))

    # Sort by time; the time is stringified only after sorting so the
    # ordering uses whatever comparable value time_convert returned.
    by_time = sorted(kept_posts, key=itemgetter(1))

    quick_write_list_to_text([post[0] for post in by_time], write_directory + '/weibo_id.txt')
    quick_write_list_to_text([str(post[1]) for post in by_time], write_directory + '/weibo_time.txt')
    quick_write_list_to_text([post[2] for post in by_time], write_directory + '/weibo_content.txt')
    quick_write_list_to_text([post[3] for post in by_time], write_directory + '/weibo_content2.txt')
    quick_write_list_to_text([post[4] for post in by_time], write_directory + '/weibo_class_tag.txt')
    quick_write_list_to_text(all_weibo_word, write_directory + '/weibo_word.txt')