Example #1
0
def hqd_word_segment(read_directory, write_directory1, write_directory2, write_directory3, write_directory4, write_directory5):
    '''
    Segment every numbered weibo file in read_directory and write one
    output file (same numeric name) per category of result.

    Input files are named 1.txt .. N.txt; each line is tab-separated with
    the post time at index 2, the class tag at index 5 and the text at
    index 6 (the text field may be missing).

    :param read_directory: directory of numbered tab-separated input files
    :param write_directory1: output directory for the timestamp series
    :param write_directory2: output directory for segmented content with POS tags
    :param write_directory3: output directory for segmented content without POS tags
    :param write_directory4: output directory for the class tags
    :param write_directory5: output directory for the vocabulary (unique tagged words)
    '''

    stopwords_list1 = get_stopwords1()

    # One iteration per input file; files are named 1.txt .. N.txt.
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    for i in range(file_number):
        time_series = []
        class_tag = []

        content_with_tag = []
        content_without_tag = []

        all_weibo_word = []

        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        try:
            for line in f:
                this_line = line.strip().split('\t')

                # Field 2 holds the post time, e.g. "2014/05/01 12:30";
                # store it as a POSIX timestamp string.
                this_time = time.mktime(time.strptime(this_line[2], '%Y/%m/%d %H:%M'))
                time_series.append(str(this_time))

                class_tag.append(this_line[5])

                try:
                    this_text = this_line[6]
                except IndexError:
                    # Line has no text field; use a blank placeholder so the
                    # per-line outputs stay aligned.
                    this_text = " "

                wd_with_tag = word_segment(this_text, stopwords_list1)
                wd_without_tag = [x.split('/')[0] for x in wd_with_tag]

                # Collect the vocabulary; words here carry POS tags.
                # set.difference already excludes known words, so no
                # redundant membership re-check is needed.
                for word in set(wd_with_tag).difference(all_weibo_word):
                    all_weibo_word.append(word)

                content_with_tag.append(" ".join(wd_with_tag))
                content_without_tag.append(" ".join(wd_without_tag))
        finally:
            # Close the input even if a malformed line raises above.
            f.close()

        quick_write_list_to_text(time_series, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_with_tag, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_without_tag, write_directory3 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(class_tag, write_directory4 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_weibo_word, write_directory5 + '/' + str(i + 1) + '.txt')

        print("Segment %d Completed." % (i + 1))
Example #2
0
def global_segment(read_filename, write_directory):
    '''
    Segment all weibo texts from a spreadsheet and extract per-post metadata.

    Reads a sheet where each data row holds: id (col 0), time (col 2),
    class tag (col 5) and text (col 6).  Writes the segmented content with
    POS tags (weibo_content.txt) and without (weibo_content2.txt), plus the
    id/time/tag lists, into write_directory.

    :param read_filename: path of the workbook to read
    :param write_directory: directory receiving the output text files
    '''

    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    global_id = []
    global_time = []
    global_tag = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print('Number of the Weibo row: %d' % weibo_row)

    f1 = open(write_directory + '/weibo_content.txt', 'w')
    f2 = open(write_directory + '/weibo_content2.txt', 'w')
    try:
        # Row 0 is the header; data rows start at 1.
        for j in range(1, weibo_row):
            weibo_id = str(int(weibo_sheet.cell(j, 0).value))
            weibo_time = weibo_sheet.cell(j, 2).value
            weibo_time = time_convert(weibo_time)

            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            global_id.append(weibo_id)
            global_time.append(str(weibo_time))

            weibo_content = str(weibo_sheet.cell(j, 6).value)
            fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
            f1.write(" ".join(fenci_result))
            f1.write("\n")

            # Strip the "/POS" suffix to get the plain word form.
            fenci_without_tag = [x.split('/')[0] for x in fenci_result]
            f2.write(" ".join(fenci_without_tag))
            f2.write("\n")
            global_tag.append(weibo_tag)
    finally:
        # Always release the handles, even if a row fails to parse.
        f1.close()
        f2.close()

    quick_write_list_to_text(global_id, write_directory + '/weibo_id.txt')
    quick_write_list_to_text(global_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(global_tag, write_directory + '/weibo_class_tag.txt')
Example #3
0
def microblog_extract(read_filename, write_directory):
    '''
    Select up to the first 8000 "high quality" weibo posts (more than five
    segmented words, no duplicate content), sort them by post time, and
    write ids/times/contents/tags plus the vocabulary to write_directory.

    Each sheet row holds: id (col 0), time (col 2), class tag (col 5) and
    text (col 6); row 0 is the header.

    :param read_filename: path of the workbook to read
    :param write_directory: directory receiving the six output text files
    '''

    select_number = 8000

    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    high_quality_weibo = []
    high_quality_weibo2 = []
    high_quality_id = []
    high_quality_time = []
    high_quality_tag = []

    all_weibo_word = []
    # Parallel sets give O(1) duplicate checks instead of rescanning the
    # (up to 8000-entry) lists on every row.
    seen_content = set()
    seen_word = set()

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print('Number of the Weibo row: %d' % weibo_row)

    count = 0
    for j in range(1, weibo_row):
        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
        joined = " ".join(fenci_result)
        # Keep only sufficiently long, previously unseen posts.
        if len(fenci_result) > 5 and joined not in seen_content:
            count = count + 1

            weibo_id = str(int(weibo_sheet.cell(j, 0).value))
            weibo_time = weibo_sheet.cell(j, 2).value
            weibo_time = time_convert(weibo_time)

            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            high_quality_id.append(weibo_id)
            high_quality_time.append(weibo_time)
            high_quality_weibo.append(joined)
            seen_content.add(joined)
            fenci_without_tag = [x.split('/')[0] for x in fenci_result]
            high_quality_weibo2.append(" ".join(fenci_without_tag))
            high_quality_tag.append(weibo_tag)

            # Accumulate the vocabulary of new (still POS-tagged) words.
            for word in set(fenci_result).difference(seen_word):
                all_weibo_word.append(word)
                seen_word.add(word)

            if count >= select_number:
                break
    print(len(high_quality_weibo))

    # Sort all parallel lists together by post time (field 1 of each tuple).
    itw = zip(high_quality_id, high_quality_time, high_quality_weibo, high_quality_weibo2, high_quality_tag)
    itw1 = sorted(itw, key=itemgetter(1))

    high_quality_id = [each[0] for each in itw1]
    high_quality_time = [str(each[1]) for each in itw1]
    high_quality_weibo = [each[2] for each in itw1]
    high_quality_weibo2 = [each[3] for each in itw1]
    high_quality_tag = [each[4] for each in itw1]

    quick_write_list_to_text(high_quality_id, write_directory + '/weibo_id.txt')
    quick_write_list_to_text(high_quality_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(high_quality_weibo, write_directory + '/weibo_content.txt')
    quick_write_list_to_text(high_quality_weibo2, write_directory + '/weibo_content2.txt')
    quick_write_list_to_text(high_quality_tag, write_directory + '/weibo_class_tag.txt')
    quick_write_list_to_text(all_weibo_word, write_directory + '/weibo_word.txt')
Example #4
0
def microblog_extract(read_filename, write_directory):
    '''
    Select up to the first 8000 "high quality" weibo posts (more than five
    segmented words, no duplicate content), sort them by post time, and
    write ids/times/contents/tags plus the vocabulary to write_directory.

    Each sheet row holds: id (col 0), time (col 2), class tag (col 5) and
    text (col 6); row 0 is the header.

    :param read_filename: path of the workbook to read
    :param write_directory: directory receiving the six output text files
    '''

    select_number = 8000

    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    high_quality_weibo = []
    high_quality_weibo2 = []
    high_quality_id = []
    high_quality_time = []
    high_quality_tag = []

    all_weibo_word = []
    # Parallel sets provide O(1) membership tests; checking the growing
    # lists directly would make the selection loop quadratic.
    seen_content = set()
    seen_word = set()

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print('Number of the Weibo row: %d' % weibo_row)

    count = 0
    for j in range(1, weibo_row):

        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1,
                                    stopwords_list2)
        joined = " ".join(fenci_result)
        # Keep only sufficiently long, previously unseen posts.
        if len(fenci_result) > 5 and joined not in seen_content:
            count = count + 1

            weibo_id = str(int(weibo_sheet.cell(j, 0).value))
            weibo_time = weibo_sheet.cell(j, 2).value
            weibo_time = time_convert(weibo_time)

            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            high_quality_id.append(weibo_id)
            high_quality_time.append(weibo_time)
            high_quality_weibo.append(joined)
            seen_content.add(joined)
            fenci_without_tag = [x.split('/')[0] for x in fenci_result]
            high_quality_weibo2.append(" ".join(fenci_without_tag))
            high_quality_tag.append(weibo_tag)

            # Accumulate the vocabulary of new (still POS-tagged) words.
            for word in set(fenci_result).difference(seen_word):
                all_weibo_word.append(word)
                seen_word.add(word)

            if count >= select_number:
                break
    print(len(high_quality_weibo))

    # Sort all parallel lists together by post time (field 1 of each tuple).
    itw = zip(high_quality_id, high_quality_time, high_quality_weibo,
              high_quality_weibo2, high_quality_tag)
    itw1 = sorted(itw, key=itemgetter(1))

    high_quality_id = [each[0] for each in itw1]
    high_quality_time = [str(each[1]) for each in itw1]
    high_quality_weibo = [each[2] for each in itw1]
    high_quality_weibo2 = [each[3] for each in itw1]
    high_quality_tag = [each[4] for each in itw1]

    quick_write_list_to_text(high_quality_id,
                             write_directory + '/weibo_id.txt')
    quick_write_list_to_text(high_quality_time,
                             write_directory + '/weibo_time.txt')
    quick_write_list_to_text(high_quality_weibo,
                             write_directory + '/weibo_content.txt')
    quick_write_list_to_text(high_quality_weibo2,
                             write_directory + '/weibo_content2.txt')
    quick_write_list_to_text(high_quality_tag,
                             write_directory + '/weibo_class_tag.txt')
    quick_write_list_to_text(all_weibo_word,
                             write_directory + '/weibo_word.txt')