import os
import time
from operator import itemgetter

# Project helpers -- get_stopwords(), get_stopwords1(), get_stopwords2(),
# word_segment(), open_sheet(), time_convert() and quick_write_list_to_text()
# -- are assumed to be defined in (or imported from) other modules of this
# project.


def hqd_word_segment(read_directory, write_directory1, write_directory2, write_directory3, write_directory4, write_directory5):
    '''
    Segment the high-quality Weibo data file by file and extract each post's fields.
    :param read_directory: directory of numbered input files (1.txt, 2.txt, ...), one tab-separated post per line
    :param write_directory1: output directory for the timestamps
    :param write_directory2: output directory for segmented text with POS tags
    :param write_directory3: output directory for segmented text without POS tags
    :param write_directory4: output directory for the class tags
    :param write_directory5: output directory for the per-file vocabulary
    '''
    stopwords_list1 = get_stopwords1()
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        time_series = []
        class_tag = []
        content_with_tag = []
        content_without_tag = []
        all_weibo_word = []

        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            this_line = line.strip().split('\t')
            # this_line[0] is the post id (unused here)
            this_time = time.mktime(time.strptime(this_line[2], '%Y/%m/%d %H:%M'))
            time_series.append(str(this_time))
            class_tag.append(this_line[5])
            try:
                this_text = this_line[6]
            except IndexError:
                this_text = " "

            # the segmented words carry POS tags, e.g. "word/tag"
            wd_with_tag = word_segment(this_text, stopwords_list1)
            wd_without_tag = [x.split('/')[0] for x in wd_with_tag]

            # set.difference already excludes previously collected words,
            # so no second membership test is needed
            for word in set(wd_with_tag).difference(all_weibo_word):
                all_weibo_word.append(word)

            content_with_tag.append(" ".join(wd_with_tag))
            content_without_tag.append(" ".join(wd_without_tag))
            line = f.readline()
        f.close()

        quick_write_list_to_text(time_series, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_with_tag, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_without_tag, write_directory3 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(class_tag, write_directory4 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_weibo_word, write_directory5 + '/' + str(i + 1) + '.txt')
        print "Segment %d Completed." % (i + 1)
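
# A minimal usage sketch for hqd_word_segment; all paths are hypothetical and
# the five output directories must exist beforehand. Each input file i.txt is
# expected to hold one post per line, tab-separated, with the post id in
# column 0, a 'YYYY/MM/DD HH:MM' timestamp in column 2, the class tag in
# column 5 and the text in column 6, matching the parsing loop above.
#
# hqd_word_segment('data/hqd',
#                  'result/time', 'result/content_tag', 'result/content',
#                  'result/class_tag', 'result/word')
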
def global_segment(read_filename, write_directory):
    '''
    Segment every Weibo text and extract each post's fields.
    :param read_filename: path to the Excel workbook of posts
    :param write_directory: output directory
    '''
    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    global_id = []
    global_time = []
    global_tag = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print 'Number of Weibo rows: %d' % weibo_row

    f1 = open(write_directory + '/weibo_content.txt', 'w')
    f2 = open(write_directory + '/weibo_content2.txt', 'w')

    for j in range(1, weibo_row):
        weibo_id = str(int(weibo_sheet.cell(j, 0).value))
        weibo_time = time_convert(weibo_sheet.cell(j, 2).value)
        weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

        global_id.append(weibo_id)
        global_time.append(str(weibo_time))

        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
        f1.write(" ".join(fenci_result))
        f1.write("\n")

        # strip the POS tags ("word/tag" -> "word") for the second output
        fenci_without_tag = [x.split('/')[0] for x in fenci_result]
        f2.write(" ".join(fenci_without_tag))
        f2.write("\n")

        global_tag.append(weibo_tag)

    f1.close()
    f2.close()

    quick_write_list_to_text(global_id, write_directory + '/weibo_id.txt')
    quick_write_list_to_text(global_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(global_tag, write_directory + '/weibo_class_tag.txt')
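
# Usage sketch for global_segment (the .xls path and output directory are
# hypothetical; the output directory must exist beforehand). The workbook is
# expected to carry the id in column 0, the time in column 2, the class tag
# in column 5 and the text in column 6, with a header in row 0.
#
# global_segment('data/weibo.xls', 'result/global')
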
def data_segment(read_filename, write_directory):
    '''
    Segment the Weibo texts in batches of 3000 posts, writing each batch and
    its vocabulary to a numbered file.
    :param read_filename: path to the Excel workbook of posts
    :param write_directory: output directory
    '''
    weibo_sheet = open_sheet(read_filename)
    weibo_column = weibo_sheet.ncols
    weibo_row = weibo_sheet.nrows
    print 'Number of Weibo rows: %d' % weibo_row

    stopwords_list = get_stopwords()

    all_weibo_word = []
    each_weibo_fenci = []
    file_number = 1
    piece = 3000

    if weibo_row < piece:
        print "Exception: the data set is too small!!!"
    else:
        for i in range(1, weibo_row):
            weibo_id = str(int(weibo_sheet.cell(i, 0).value))
            weibo_time = time_convert(weibo_sheet.cell(i, 2).value)
            weibo_content = str(weibo_sheet.cell(i, weibo_column - 1).value)

            fenci_result = word_segment(weibo_content, stopwords_list)
            each_weibo_fenci.append(weibo_id.strip() + " " + str(weibo_time) + " " + " ".join(fenci_result))

            for word in set(fenci_result).difference(all_weibo_word):
                all_weibo_word.append(word)

            if i % piece == 0:
                quick_write_list_to_text(each_weibo_fenci, write_directory + u'/each_weibo_fenci/' + str(file_number) + '.txt')
                quick_write_list_to_text(all_weibo_word, write_directory + u'/all_weibo_word/' + str(file_number) + '.txt')
                file_number += 1
                each_weibo_fenci = []
                all_weibo_word = []
                # stop once the remaining rows can no longer fill a batch
                if weibo_row - i < piece:
                    break

        print "Data Segmentation Complete!!!"
        print "Total Segments: %d" % (file_number - 1)
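
# Batching behaviour of data_segment: with piece = 3000, one pair of files is
# written per full batch and the tail that does not fill a batch is dropped.
# For example, a sheet with 10001 rows (1 header + 10000 posts) yields three
# numbered files and the last 1000 posts are skipped. Example call with
# hypothetical paths (the each_weibo_fenci/ and all_weibo_word/
# subdirectories must already exist):
#
# data_segment('data/weibo.xls', u'result/batches')
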
def microblog_extract(read_filename, write_directory):
    '''
    Select the first 8000 Weibo posts as high-quality texts, segment them,
    and extract each post's fields.
    :param read_filename: path to the Excel workbook of posts
    :param write_directory: output directory
    '''
    select_number = 8000
    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    high_quality_weibo = []
    high_quality_weibo2 = []
    high_quality_id = []
    high_quality_time = []
    high_quality_tag = []
    all_weibo_word = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print 'Number of Weibo rows: %d' % weibo_row

    count = 0
    for j in range(1, weibo_row):
        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)

        # keep only posts that segment into more than 5 tokens, skipping duplicates
        if len(fenci_result) > 5 and (" ".join(fenci_result) not in high_quality_weibo):
            count += 1
            weibo_id = str(int(weibo_sheet.cell(j, 0).value))
            weibo_time = time_convert(weibo_sheet.cell(j, 2).value)
            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            high_quality_id.append(weibo_id)
            high_quality_time.append(weibo_time)
            high_quality_weibo.append(" ".join(fenci_result))

            fenci_without_tag = [x.split('/')[0] for x in fenci_result]
            high_quality_weibo2.append(" ".join(fenci_without_tag))
            high_quality_tag.append(weibo_tag)

            # set.difference already excludes previously collected words
            for word in set(fenci_result).difference(all_weibo_word):
                all_weibo_word.append(word)

        if count >= select_number:
            break

    print len(high_quality_weibo)

    # sort all fields together by post time
    itw = zip(high_quality_id, high_quality_time, high_quality_weibo, high_quality_weibo2, high_quality_tag)
    itw1 = sorted(itw, key=itemgetter(1))

    high_quality_id = [each[0] for each in itw1]
    high_quality_time = [str(each[1]) for each in itw1]
    high_quality_weibo = [each[2] for each in itw1]
    high_quality_weibo2 = [each[3] for each in itw1]
    high_quality_tag = [each[4] for each in itw1]

    quick_write_list_to_text(high_quality_id, write_directory + '/weibo_id.txt')
    quick_write_list_to_text(high_quality_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(high_quality_weibo, write_directory + '/weibo_content.txt')
    quick_write_list_to_text(high_quality_weibo2, write_directory + '/weibo_content2.txt')
    quick_write_list_to_text(high_quality_tag, write_directory + '/weibo_class_tag.txt')
    quick_write_list_to_text(all_weibo_word, write_directory + '/weibo_word.txt')
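
# Usage sketch for microblog_extract (hypothetical paths; write_directory
# must exist beforehand). It scans the sheet top to bottom, keeps the first
# 8000 distinct posts that segment into more than 5 tokens, re-sorts them by
# time, and writes six aligned files into write_directory.
#
# microblog_extract('data/weibo.xls', 'result/high_quality')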