Example #1
0
def hqd_word_segment(read_directory, write_directory1, write_directory2, write_directory3, write_directory4, write_directory5):
    '''
    Segment every weibo file in read_directory and write the per-file
    results (timestamps, tagged/untagged tokens, class labels, vocabulary)
    into the five output directories, one numbered file per input file.

    :param read_directory: directory of input files named 1.txt, 2.txt, ...
    :param write_directory1: output dir for per-weibo timestamps
    :param write_directory2: output dir for segmented text with POS tags
    :param write_directory3: output dir for segmented text without POS tags
    :param write_directory4: output dir for class tags
    :param write_directory5: output dir for the per-file vocabulary
    '''

    stopwords_list1 = get_stopwords1()

    # Count the input files; they are assumed to be named 1.txt .. N.txt.
    file_number = sum(len(files) for root, dirs, files in os.walk(read_directory))
    for i in range(file_number):
        time_series = []
        class_tag = []

        content_with_tag = []
        content_without_tag = []

        all_weibo_word = []  # vocabulary in order of first appearance
        seen_words = set()   # O(1) membership companion to all_weibo_word

        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        try:
            for line in f:
                this_line = line.strip().split('\t')

                # Column 2 holds the post time as 'YYYY/MM/DD HH:MM';
                # store it as an epoch timestamp string.
                this_time = time.mktime(time.strptime(this_line[2], '%Y/%m/%d %H:%M'))
                time_series.append(str(this_time))

                class_tag.append(this_line[5])

                # Some rows lack the text column; fall back to a blank text
                # (narrowed from a bare except, which hid every error).
                try:
                    this_text = this_line[6]
                except IndexError:
                    this_text = " "

                wd_with_tag = word_segment(this_text, stopwords_list1)
                # Tokens carry a '/POS' suffix; strip it for the tag-free view.
                wd_without_tag = [x.split('/')[0] for x in wd_with_tag]

                # Collect new words in order of first appearance
                # (deterministic, unlike iterating a set).
                for word in wd_with_tag:
                    if word not in seen_words:
                        seen_words.add(word)
                        all_weibo_word.append(word)

                content_with_tag.append(" ".join(wd_with_tag))
                content_without_tag.append(" ".join(wd_without_tag))
        finally:
            # Close the input file even if a malformed row raises.
            f.close()

        quick_write_list_to_text(time_series, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_with_tag, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_without_tag, write_directory3 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(class_tag, write_directory4 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_weibo_word, write_directory5 + '/' + str(i + 1) + '.txt')

        print("Segment %d Completed." % (i + 1))
Example #2
0
def global_segment(read_filename, write_directory):
    '''
    Segment every weibo text in the spreadsheet and extract per-weibo
    metadata (id, time, class tag), writing all results under
    write_directory.

    :param read_filename: path of the spreadsheet with the weibo data
    :param write_directory: directory for the output text files
    '''

    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    global_id = []
    global_time = []
    global_tag = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print('Number of the Weibo row: %d' % weibo_row)

    f1 = open(write_directory + '/weibo_content.txt', 'w')
    f2 = open(write_directory + '/weibo_content2.txt', 'w')
    try:
        # Row 0 is the header, so start from row 1.
        for j in range(1, weibo_row):
            weibo_id = str(int(weibo_sheet.cell(j, 0).value))
            weibo_time = time_convert(weibo_sheet.cell(j, 2).value)

            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            global_id.append(weibo_id)
            global_time.append(str(weibo_time))

            weibo_content = str(weibo_sheet.cell(j, 6).value)
            fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
            f1.write(" ".join(fenci_result))
            f1.write("\n")

            # Tokens carry a '/POS' suffix; strip it for the tag-free file.
            fenci_without_tag = [x.split('/')[0] for x in fenci_result]
            f2.write(" ".join(fenci_without_tag))
            f2.write("\n")
            global_tag.append(weibo_tag)
    finally:
        # Close both output files even if a row raises mid-loop.
        f1.close()
        f2.close()

    quick_write_list_to_text(global_id, write_directory + '/weibo_id.txt')
    quick_write_list_to_text(global_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(global_tag, write_directory + '/weibo_class_tag.txt')
Example #3
0
def data_segment(read_filename, write_directory):
    '''
    Segment the weibo spreadsheet into fixed-size pieces of 3000 rows,
    writing each piece's segmented lines and vocabulary as numbered files
    under write_directory.

    :param read_filename: path of the spreadsheet with the weibo data
    :param write_directory: root directory for the numbered output files
    '''
    weibo_sheet = open_sheet(read_filename)

    weibo_column = weibo_sheet.ncols
    weibo_row = weibo_sheet.nrows
    print('Number of the Weibo row: %d' % weibo_row)

    stopwords_list = get_stopwords()

    all_weibo_word = []  # per-piece vocabulary in order of first appearance
    seen_words = set()   # O(1) membership companion to all_weibo_word
    each_weibo_fenci = []
    file_number = 1

    piece = 3000
    if weibo_row < piece:
        print("Exception:Data is too small!!!")
    else:
        # Row 0 is the header, so start from row 1.
        for i in range(1, weibo_row):
            weibo_id = str(int(weibo_sheet.cell(i, 0).value))

            weibo_time = time_convert(weibo_sheet.cell(i, 2).value)

            # The weibo text lives in the last column of the sheet.
            weibo_content = str(weibo_sheet.cell(i, weibo_column - 1).value)
            fenci_result = word_segment(weibo_content, stopwords_list)
            each_weibo_fenci.append(weibo_id.strip() + " " + str(weibo_time) +
                                    " " + " ".join(fenci_result))

            # Collect new words in order of first appearance; the set makes
            # the membership test O(1) instead of scanning the list per row.
            for word in fenci_result:
                if word not in seen_words:
                    seen_words.add(word)
                    all_weibo_word.append(word)

            if i % piece == 0:
                quick_write_list_to_text(
                    each_weibo_fenci, write_directory + u'/each_weibo_fenci/' +
                    str(file_number) + '.txt')
                quick_write_list_to_text(
                    all_weibo_word, write_directory + u'/all_weibo_word/' +
                    str(file_number) + '.txt')
                file_number = file_number + 1
                each_weibo_fenci = []
                all_weibo_word = []
                seen_words = set()
                # NOTE(review): rows after the last full piece are dropped,
                # never segmented or written — confirm this is intended.
                if weibo_row - i < piece:
                    break

    print("Data Segmentation Complete!!!")
    print("Total Segments: %d" % (file_number - 1))
def data_segment(read_filename, write_directory):
    '''
    Split the weibo spreadsheet into 3000-row pieces, writing each piece's
    segmented lines and vocabulary as numbered files under write_directory.

    NOTE(review): this duplicates (and shadows) the data_segment defined
    earlier in this file — confirm which copy is meant to survive.

    :param read_filename: path of the spreadsheet with the weibo data
    :param write_directory: root directory for the numbered output files
    '''
    weibo_sheet = open_sheet(read_filename)

    weibo_column = weibo_sheet.ncols
    weibo_row = weibo_sheet.nrows
    print('Number of the Weibo row: %d' % weibo_row)

    stopwords_list = get_stopwords()

    all_weibo_word = []  # per-piece vocabulary in order of first appearance
    seen_words = set()   # O(1) membership companion to all_weibo_word
    each_weibo_fenci = []
    file_number = 1

    piece = 3000
    if weibo_row < piece:
        print("Exception:Data is too small!!!")
    else:
        # Row 0 is the header, so start from row 1.
        for i in range(1, weibo_row):
            weibo_id = str(int(weibo_sheet.cell(i, 0).value))

            weibo_time = time_convert(weibo_sheet.cell(i, 2).value)

            # The weibo text lives in the last column of the sheet.
            weibo_content = str(weibo_sheet.cell(i, weibo_column - 1).value)
            fenci_result = word_segment(weibo_content, stopwords_list)
            each_weibo_fenci.append(weibo_id.strip() + " " + str(weibo_time) +
                                    " " + " ".join(fenci_result))

            # Collect new words in order of first appearance; the set makes
            # the membership test O(1) instead of scanning the list per row.
            for word in fenci_result:
                if word not in seen_words:
                    seen_words.add(word)
                    all_weibo_word.append(word)

            if i % piece == 0:
                quick_write_list_to_text(
                    each_weibo_fenci, write_directory + u'/each_weibo_fenci/' +
                    str(file_number) + '.txt')
                quick_write_list_to_text(
                    all_weibo_word, write_directory + u'/all_weibo_word/' +
                    str(file_number) + '.txt')
                file_number = file_number + 1
                each_weibo_fenci = []
                all_weibo_word = []
                seen_words = set()
                # NOTE(review): rows after the last full piece are dropped,
                # never segmented or written — confirm this is intended.
                if weibo_row - i < piece:
                    break

    print("Data Segmentation Complete!!!")
    print("Total Segments: %d" % (file_number - 1))
def microblog_extract(read_filename, write_directory):
    '''
    Select the first 8000 high-quality weibos (more than 5 tokens and no
    duplicate segmented text), sort them by time, and write the id / time /
    content / class-tag / vocabulary files under write_directory.

    :param read_filename: path of the spreadsheet with the weibo data
    :param write_directory: directory for the output text files
    '''

    select_number = 8000

    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    high_quality_weibo = []
    high_quality_weibo2 = []
    high_quality_id = []
    high_quality_tim = []
    high_quality_tag = []
    # O(1) duplicate test; the original list scan was O(n) per row,
    # i.e. quadratic over up to 8000 selected weibos.
    seen_weibo = set()

    all_weibo_word = []  # vocabulary in order of first appearance
    seen_words = set()   # O(1) membership companion to all_weibo_word

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print('Number of the Weibo row: %d' % weibo_row)

    count = 0
    # Row 0 is the header, so start from row 1.
    for j in range(1, weibo_row):

        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
        joined = " ".join(fenci_result)
        # Keep only texts with more than 5 tokens not seen before.
        if len(fenci_result) > 5 and joined not in seen_weibo:
            count = count + 1

            weibo_id = str(int(weibo_sheet.cell(j, 0).value))
            weibo_time = time_convert(weibo_sheet.cell(j, 2).value)

            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            high_quality_id.append(weibo_id)
            high_quality_tim.append(weibo_time)
            high_quality_weibo.append(joined)
            seen_weibo.add(joined)
            # Tokens carry a '/POS' suffix; strip it for the tag-free view.
            fenci_without_tag = [x.split('/')[0] for x in fenci_result]
            high_quality_weibo2.append(" ".join(fenci_without_tag))
            high_quality_tag.append(weibo_tag)

            # Collect new words in order of first appearance.
            for word in fenci_result:
                if word not in seen_words:
                    seen_words.add(word)
                    all_weibo_word.append(word)

            if count >= select_number:
                break
    print(len(high_quality_weibo))

    # Sort all parallel lists together by time (field 1 of each tuple).
    itw1 = sorted(zip(high_quality_id, high_quality_tim, high_quality_weibo,
                      high_quality_weibo2, high_quality_tag),
                  key=itemgetter(1))

    high_quality_id = [each[0] for each in itw1]
    high_quality_time = [str(each[1]) for each in itw1]
    high_quality_weibo = [each[2] for each in itw1]
    high_quality_weibo2 = [each[3] for each in itw1]
    high_quality_tag = [each[4] for each in itw1]

    quick_write_list_to_text(high_quality_id, write_directory + '/weibo_id.txt')
    quick_write_list_to_text(high_quality_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(high_quality_weibo, write_directory + '/weibo_content.txt')
    quick_write_list_to_text(high_quality_weibo2, write_directory + '/weibo_content2.txt')
    quick_write_list_to_text(high_quality_tag, write_directory + '/weibo_class_tag.txt')
    quick_write_list_to_text(all_weibo_word, write_directory + '/weibo_word.txt')
def microblog_extract(read_filename, write_directory):
    '''
    Select the first 8000 high-quality weibos (more than 5 tokens and no
    duplicate segmented text), sort them by time, and write the id / time /
    content / class-tag / vocabulary files under write_directory.

    NOTE(review): this duplicates (and shadows) the microblog_extract
    defined earlier in this file — confirm which copy is meant to survive.

    :param read_filename: path of the spreadsheet with the weibo data
    :param write_directory: directory for the output text files
    '''

    select_number = 8000

    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()

    high_quality_weibo = []
    high_quality_weibo2 = []
    high_quality_id = []
    high_quality_tim = []
    high_quality_tag = []
    # O(1) duplicate test; the original list scan was O(n) per row,
    # i.e. quadratic over up to 8000 selected weibos.
    seen_weibo = set()

    all_weibo_word = []  # vocabulary in order of first appearance
    seen_words = set()   # O(1) membership companion to all_weibo_word

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print('Number of the Weibo row: %d' % weibo_row)

    count = 0
    # Row 0 is the header, so start from row 1.
    for j in range(1, weibo_row):

        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1,
                                    stopwords_list2)
        joined = " ".join(fenci_result)
        # Keep only texts with more than 5 tokens not seen before.
        if len(fenci_result) > 5 and joined not in seen_weibo:
            count = count + 1

            weibo_id = str(int(weibo_sheet.cell(j, 0).value))
            weibo_time = time_convert(weibo_sheet.cell(j, 2).value)

            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            high_quality_id.append(weibo_id)
            high_quality_tim.append(weibo_time)
            high_quality_weibo.append(joined)
            seen_weibo.add(joined)
            # Tokens carry a '/POS' suffix; strip it for the tag-free view.
            fenci_without_tag = [x.split('/')[0] for x in fenci_result]
            high_quality_weibo2.append(" ".join(fenci_without_tag))
            high_quality_tag.append(weibo_tag)

            # Collect new words in order of first appearance.
            for word in fenci_result:
                if word not in seen_words:
                    seen_words.add(word)
                    all_weibo_word.append(word)

            if count >= select_number:
                break
    print(len(high_quality_weibo))

    # Sort all parallel lists together by time (field 1 of each tuple).
    itw1 = sorted(zip(high_quality_id, high_quality_tim, high_quality_weibo,
                      high_quality_weibo2, high_quality_tag),
                  key=itemgetter(1))

    high_quality_id = [each[0] for each in itw1]
    high_quality_time = [str(each[1]) for each in itw1]
    high_quality_weibo = [each[2] for each in itw1]
    high_quality_weibo2 = [each[3] for each in itw1]
    high_quality_tag = [each[4] for each in itw1]

    quick_write_list_to_text(high_quality_id,
                             write_directory + '/weibo_id.txt')
    quick_write_list_to_text(high_quality_time,
                             write_directory + '/weibo_time.txt')
    quick_write_list_to_text(high_quality_weibo,
                             write_directory + '/weibo_content.txt')
    quick_write_list_to_text(high_quality_weibo2,
                             write_directory + '/weibo_content2.txt')
    quick_write_list_to_text(high_quality_tag,
                             write_directory + '/weibo_class_tag.txt')
    quick_write_list_to_text(all_weibo_word,
                             write_directory + '/weibo_word.txt')