# -*- coding: utf-8 -*-
# Python 2 snippets for Weibo (microblog) preprocessing and evaluation.
# open_sheet, time_convert, word_segment, get_stopwords*, and
# quick_write_list_to_text are project-local helpers defined elsewhere.
import os
from operator import itemgetter

import numpy as np


def topics_count(read_directory1, read_filename, write_directory):

    # Total number of files in the input directory
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows

    for i in range(file_number):
        id_series = []
        time_series = []

        f1 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        line = f1.readline()
        while line:
            # Each line is "<id>\x7f<timestamp>" (DEL-delimited)
            id_series.append(line.split('\x7f')[0])
            time_series.append(float(line.split('\x7f')[1]))
            line = f1.readline()
        f1.close()

        all_tag = []
        topic_dict = {}
        j = 1
        k = 0

        while j < weibo_row:
            weibo_id = str(weibo_sheet.cell(j, 0).value).split('.')[0]
            weibo_time = weibo_sheet.cell(j, 2).value
            weibo_time = time_convert(weibo_time)
            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            # Match by id and (near-)identical timestamp
            if weibo_id == id_series[k] and np.abs(weibo_time -
                                                   time_series[k]) < 0.01:
                if weibo_tag in all_tag:
                    topic_dict[weibo_tag] += 1
                else:
                    all_tag.append(weibo_tag)
                    topic_dict[weibo_tag] = 1

                k += 1

            j += 1
            if k >= len(id_series):
                break

        result = []
        for each in all_tag:
            result.append(each + ' ' + str(topic_dict[each]))

        quick_write_list_to_text(result,
                                 write_directory + '/' + str(i + 1) + '.txt')
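

# The helpers used above are not shown on this page. A minimal sketch of
# what they might look like, assuming xlrd-backed .xls input and Excel
# serial-day timestamps (41275.0 == 2013-01-01); treat these as
# placeholders, not the project's actual implementations:
def open_sheet(read_filename):
    """Open the first worksheet of an .xls workbook (assumes xlrd)."""
    import xlrd
    return xlrd.open_workbook(read_filename).sheet_by_index(0)


def time_convert(cell_value):
    """Normalize a time cell to a float day number (assumed behavior)."""
    return float(cell_value)


def quick_write_list_to_text(lines, write_filename):
    """Write one list element per line to a text file."""
    f = open(write_filename, 'w')
    for each in lines:
        f.write(each + '\n')
    f.close()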
def global_segment(read_filename, write_directory):
    '''
    Segment all Weibo texts and extract each post's fields
    (id, timestamp, class tag).
    :param read_filename: path to the source .xls spreadsheet
    :param write_directory: output directory for the generated text files
    '''

    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()
        
    global_id = []
    global_time = []
    global_tag = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print 'Number of Weibo rows: %d' % weibo_row
    
    f1 = open(write_directory + '/weibo_content.txt', 'w')
    f2 = open(write_directory + '/weibo_content2.txt', 'w')

    for j in range(1, weibo_row):
       
        weibo_id = str(int(weibo_sheet.cell(j, 0).value))
        weibo_time = weibo_sheet.cell(j, 2).value
        weibo_time = time_convert(weibo_time)
            
        weibo_tag = str(int(weibo_sheet.cell(j, 5).value))
                
        global_id.append(weibo_id)
        global_time.append(str(weibo_time))
        
        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
        f1.write(" ".join(fenci_result))
        f1.write("\n")
        
        fenci_without_tag = [x.split('/')[0] for x in fenci_result]
        f2.write(" ".join(fenci_without_tag))
        f2.write("\n")
        global_tag.append(weibo_tag)
    
    f1.close()
    f2.close()

    quick_write_list_to_text(global_id, write_directory + '/weibo_id.txt')  
    quick_write_list_to_text(global_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(global_tag, write_directory + '/weibo_class_tag.txt')
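

# word_segment is defined elsewhere; the x.split('/')[0] pattern above
# implies it returns "word/POS-tag" strings. A minimal sketch, assuming
# jieba's part-of-speech tokenizer, with stopwords_list1 holding surface
# forms and stopwords_list2 holding POS tags to drop (data_segment below
# uses a one-list variant of the same idea):
def word_segment(weibo_content, stopwords_list1, stopwords_list2):
    import jieba.posseg as pseg
    result = []
    for word, flag in pseg.cut(weibo_content):
        if word not in stopwords_list1 and flag not in stopwords_list2:
            result.append(word + '/' + flag)
    return result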
def data_segment(read_filename, write_directory):
    weibo_sheet = open_sheet(read_filename)

    weibo_column = weibo_sheet.ncols
    weibo_row = weibo_sheet.nrows
    print 'Number of Weibo rows: %d' % weibo_row

    stopwords_list = get_stopwords()

    all_weibo_word = []
    each_weibo_fenci = []
    file_number = 1

    piece = 3000
    if weibo_row < piece:
        print "Exception:Data is too small!!!"
    else:
        for i in range(1, weibo_row):
            weibo_id = str(int(weibo_sheet.cell(i, 0).value))

            weibo_time = weibo_sheet.cell(i, 2).value
            weibo_time = time_convert(weibo_time)

            weibo_content = str(weibo_sheet.cell(i, weibo_column - 1).value)
            fenci_result = word_segment(weibo_content, stopwords_list)
            each_weibo_fenci.append(weibo_id.strip() + " " + str(weibo_time) +
                                    " " + " ".join(fenci_result))

            for word in set(fenci_result).difference(all_weibo_word):
                all_weibo_word.append(word)

            if i % piece == 0:
                quick_write_list_to_text(
                    each_weibo_fenci, write_directory + u'/each_weibo_fenci/' +
                    str(file_number) + '.txt')
                quick_write_list_to_text(
                    all_weibo_word, write_directory + u'/all_weibo_word/' +
                    str(file_number) + '.txt')
                file_number = file_number + 1
                each_weibo_fenci = []
                all_weibo_word = []
                if weibo_row - i < piece:
                    break

    print "Data Segmentation Complete!!!"
    print "Total Segments: %d" % (file_number - 1)
def compute_purity(read_filename1, read_filename2, read_filename3, write_filename):
    
    weibo_sheet = open_sheet(read_filename3)
    weibo_row = weibo_sheet.nrows
        
    id_series = []
    time_series = []
    f1 = open(read_filename2)
    line = f1.readline()
    while line:
        id_series.append(line.split()[0])
        time_series.append(float(line.split()[1]))
        line = f1.readline()
    f1.close()
    
    cluster_tag = []
    f2 = open(read_filename1)
    line = f2.readline()
    while line:
        cluster_tag.append(int(line.strip()))
        line = f2.readline()
    f2.close()
    
    id_series = id_series[0 : len(cluster_tag)]
    time_series = time_series[0 : len(cluster_tag)]
        
        
    i = 1
    k = 0
    # Cluster id assigned to the first matched element: 1 or 2
    tag1 = 0
    tag2 = 0
            
    # Ground-truth class tag of the first matched element: an integer
    tag3 = 0
        
    correct = 0
        
    while i < weibo_row:
        weibo_id = str(weibo_sheet.cell(i, 0).value).split('.')[0]
        weibo_time = weibo_sheet.cell(i, 2).value
        weibo_time = time_convert(weibo_time)
        weibo_tag = int(weibo_sheet.cell(i, 5).value)
      
        if weibo_id == id_series[k] and np.abs(weibo_time - time_series[k]) < 0.01:
            if k == 0:
                tag1 = cluster_tag[0]
                if tag1 == 1:
                    tag2 = 2
                else:
                    tag2 = 1
                
                tag3 = weibo_tag

                correct += 1
            else:
                if (cluster_tag[k] == tag1 and weibo_tag == tag3) or (cluster_tag[k] == tag2 and weibo_tag != tag3):
                    correct += 1
                        
            k += 1
            
        if k >= len(cluster_tag):
            break
            
        i += 1
        
    purity = str(np.true_divide(correct, len(cluster_tag)))
    
    quick_write_list_to_text([purity], write_filename)
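

# The matching above anchors on the first matched post: its cluster id
# (tag1) is paired with its ground-truth tag (tag3), and the other cluster
# (tag2) with everything else. The same two-cluster rule restated as a
# standalone function (hypothetical helper, not part of the original code):
def two_cluster_purity(cluster_tag, true_tag):
    tag1 = cluster_tag[0]
    tag2 = 2 if tag1 == 1 else 1
    tag3 = true_tag[0]
    correct = 0
    for c, t in zip(cluster_tag, true_tag):
        if (c == tag1 and t == tag3) or (c == tag2 and t != tag3):
            correct += 1
    return np.true_divide(correct, len(cluster_tag))

# e.g. two_cluster_purity([1, 1, 2, 2], [7, 7, 9, 9]) == 1.0
#      two_cluster_purity([1, 2, 2, 2], [7, 7, 9, 9]) == 0.75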
def microblog_extract(read_filename, write_directory):
    '''
    Select the first 8000 qualifying Weibo posts as high-quality texts,
    segment them, and extract each post's fields (id, timestamp, class tag).
    :param read_filename: path to the source .xls spreadsheet
    :param write_directory: output directory for the generated text files
    '''
    
    select_number = 8000

    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()
    
    high_quality_weibo = []
    high_quality_weibo2 = []
    high_quality_id = []
    high_quality_time = []
    high_quality_tag = []
    
    all_weibo_word = []
        
    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print 'Number of Weibo rows: %d' % weibo_row

    count = 0
    for j in range(1, weibo_row):
            
        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
        if len(fenci_result) > 5 and (" ".join(fenci_result) not in high_quality_weibo):
            count = count + 1
                
            weibo_id = str(int(weibo_sheet.cell(j, 0).value))
            weibo_time = weibo_sheet.cell(j, 2).value
            weibo_time = time_convert(weibo_time)
            
            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))
                
            high_quality_id.append(weibo_id)
            high_quality_time.append(weibo_time)
            high_quality_weibo.append(" ".join(fenci_result))
            fenci_without_tag = [x.split('/')[0] for x in fenci_result]
            high_quality_weibo2.append(" ".join(fenci_without_tag))
            high_quality_tag.append(weibo_tag)
      
            # set.difference already excludes words present in all_weibo_word
            for word in set(fenci_result).difference(all_weibo_word):
                all_weibo_word.append(word)
                        
            if count >= select_number:
                break
    print 'Selected high-quality Weibo posts: %d' % len(high_quality_weibo)
    
    # Sort by timestamp
    itw = zip(high_quality_id, high_quality_time, high_quality_weibo,
              high_quality_weibo2, high_quality_tag)
    itw1 = sorted(itw, key=itemgetter(1))
    
    high_quality_weibo = []
    high_quality_weibo2 = []
    high_quality_id = []
    high_quality_time = []
    high_quality_tag = []
    
    for each in itw1:
        high_quality_id.append(each[0])
        high_quality_time.append(str(each[1]))
        high_quality_weibo.append(each[2])
        high_quality_weibo2.append(each[3])
        high_quality_tag.append(each[4])

    quick_write_list_to_text(high_quality_id, write_directory + '/weibo_id.txt')  
    quick_write_list_to_text(high_quality_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(high_quality_weibo, write_directory + '/weibo_content.txt')
    quick_write_list_to_text(high_quality_weibo2, write_directory + '/weibo_content2.txt')
    quick_write_list_to_text(high_quality_tag, write_directory + '/weibo_class_tag.txt')
    quick_write_list_to_text(all_weibo_word, write_directory + '/weibo_word.txt')
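

# Example calls (all paths hypothetical; output directories must exist
# beforehand). Each function reads the same source spreadsheet and writes
# plain-text results; compute_purity expects whitespace-delimited
# "<id> <time>" lines in its second input file:
#
#   global_segment('data/weibo.xls', 'output/corpus')
#   microblog_extract('data/weibo.xls', 'output/corpus8000')
#   compute_purity('output/cluster_tags.txt', 'output/id_time.txt',
#                  'data/weibo.xls', 'output/purity.txt')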