Example No. 1
def get_key_words(read_filename, write_filename1, write_filename2):
    '''
    Extract keywords using jieba segmentation
    :param read_filename:
    :param write_filename1:
    :param write_filename2:
    '''

    each_weibo_fenci = []
    get_text_to_complex_list(each_weibo_fenci, read_filename, 0)

    key_words = []
    all_key_words = []
    for row in range(len(each_weibo_fenci)):
        word_entity = []

        for each in each_weibo_fenci[row]:
            word_entity.append(each.split('/')[0])

        tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
        key_words.append(" ".join(tags))

        for word in " ".join(tags).split():
            if word not in all_key_words:
                all_key_words.append(word)

    quick_write_list_to_text(key_words, write_filename1)
    quick_write_list_to_text(all_key_words, write_filename2)
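The key call in this example is jieba.analyse.extract_tags, which takes a raw string and a topK count and returns the highest-scoring TF-IDF keywords. A minimal, self-contained sketch (the sample sentence is made up for illustration):

# Hedged sketch of the keyword-extraction call used above; the text is illustrative only.
import jieba.analyse

sample_text = "我 今天 在 北京 参加 自然 语言 处理 会议"
tags = jieba.analyse.extract_tags(sample_text, 3)  # top 3 keywords
print(" ".join(tags))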
Example No. 2
def batch_em_cluster(read_directory, write_directory1, write_directory2):

    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory)])

    cluster_number = 8
    init_mu = 0.1
    init_sigma = 1.0

    for i in range(file_number):
        vsm = np.loadtxt(read_directory + '/' + str(i + 1) + '.txt')
        data_dimension = vsm.shape[1]

        init_means = []
        for j in range(cluster_number):
            init_means.append(init_sigma * np.random.randn(data_dimension) +
                              init_mu)

        cluster_model = cluster.EMClusterer(init_means, bias=0.1)

        cluster_tag = cluster_model.cluster(vsm, True, trace=False)

        cluster_tag_to_string = [str(x) for x in cluster_tag]
        center_data = cluster_model._means

        quick_write_list_to_text(cluster_tag_to_string,
                                 write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_data,
                             write_directory2 + '/' + str(i + 1) + '.txt')
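The cluster.EMClusterer call above appears to match the EM clusterer shipped with NLTK. Assuming that is the module being imported (this listing does not show the import), a minimal sketch on toy 2-D data:

# Hedged sketch: assumes "cluster" refers to nltk.cluster, which is not confirmed by this listing.
import numpy as np
from nltk.cluster import EMClusterer

vectors = [np.array(v) for v in [[3, 3], [1, 2], [4, 2], [4, 0]]]
init_means = [np.array([4.0, 2.0]), np.array([4.0, 2.01])]

clusterer = EMClusterer(init_means, bias=0.1)
labels = clusterer.cluster(vectors, True, trace=False)  # True => return a cluster label per vector
print(labels)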
Example No. 3
def text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    """
    Query classification
    :param read_filename1:
    :param read_filename2:
    :param read_filename3:
    :param write_filename:
    """

    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    word_weight_dict = {}
    f = open(read_filename2, "r")
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    search_texts = []
    f1 = open(read_filename3, "r")
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    result = []
    for i in range(len(query_pattern)):
        this_result = query(query_pattern[i], search_texts, word_weight_dict)
        result.append(" ".join([str(x) for x in this_result]))

    quick_write_list_to_text(result, write_filename)
Example No. 4
def global_sort_by_time(update_item_index, read_directory, write_directory):
    
    print "Begin sorting." 
    print "May take a long time, Please Wait..."
    
    read_file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    
    segment = 50000

    total_length = len(update_item_index)
    segment_number = total_length / segment
    
    print "Total Segment %d ." % segment_number
    
    for i in range(segment_number):
        
        print "Segment %d ." % (i + 1)
        
        content_result = []
        for k in range(segment):
            content_result.append(" ")
        
        for j in range(read_file_number):
            f1 = open(read_directory + "/" +  str(j + 1) + ".txt", "rb")
            this_text_file = f1.readlines()
            f1.close()
            
            for l in range(segment):
                if update_item_index[segment * i + l][0] == str(j + 1):
                    content_result[l] = this_text_file[int(update_item_index[segment * i + l][1]) - 1].strip()
            
        quick_write_list_to_text(content_result, write_directory + "/" + str(i + 1) + ".txt")

    print "Global Sort Complete!!!"
Example No. 5
def vsm_update(read_directory1, read_directory2, write_directory1, write_directory2):
    '''
    Remove all-zero rows
    :param read_directory1:
    :param read_directory2:
    :param write_directory1:
    :param write_directory2:
    '''
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        update_vsm = []
        update_id_time = [] 
        
        f1 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        each_weibo_vsm = f1.readlines()
        f1.close()
        
        id_time = []
        
        get_text_to_complex_list2(id_time, read_directory2 + '/' + str(i + 1) + '.txt', 0, 2)
        
        for j in range(len(each_weibo_vsm)):
            int_each_weibo_vsm = [int(x) for x in each_weibo_vsm[j].split()]
            # drop all-zero rows
            if np.sum(int_each_weibo_vsm) > 0.1:
                update_vsm.append(each_weibo_vsm[j])
                update_id_time.append(" ".join(id_time[j]))
        
        quick_write_list_to_text2(update_vsm, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(update_id_time, write_directory2 + '/' + str(i + 1) + '.txt')
    
    print "VSM Update Complete!!!"
Example No. 6
def get_key_words(read_directory, write_directory1, write_directory2):
    '''
    
    :param read_directory:
    :param write_directory1:
    :param write_directory2:
    '''

    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    
    for i in range(file_number):
        each_weibo_fenci = []        
        get_text_to_complex_list(each_weibo_fenci, read_directory + '/' + str(i + 1) + '.txt', 0)
        
        key_words = []
        all_key_words =  []
        for row in range(len(each_weibo_fenci)):
            word_entity = []

            for each in each_weibo_fenci[row]:
                word_entity.append(each.split('/')[0])

            tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
            key_words.append(" ".join(tags))
            
            for word in " ".join(tags).split():
                if word not in all_key_words:
                    all_key_words.append(word)
        
        quick_write_list_to_text(key_words, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_key_words, write_directory2 + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example No. 7
def compute_em_weights(read_filename1, read_filename2, write_filename):
    '''
    Linear fusion
    :param read_filename1:
    :param read_filename2:
    :param write_filename:
    '''

    em_weights = []

    coefficients_string = []
    get_text_to_single_list(coefficients_string, read_filename2)
    coefficients = [float(x) for x in coefficients_string]

    f = open(read_filename1, 'r')
    line = f.readline()
    while line:
        each_line = line.split()
        em_weights.append(
            float(each_line[0]) * coefficients[0] +
            float(each_line[1]) * coefficients[1] +
            float(each_line[2]) * coefficients[2])

        line = f.readline()
    f.close()

    em_weights_to_string = [str(x) for x in em_weights]
    quick_write_list_to_text(em_weights_to_string, write_filename)
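The fusion step is simply a dot product of each line's three scores with the three coefficients. An equivalent numpy sketch (the file names are placeholders, not from the original project):

# Hedged sketch of the same linear fusion done with numpy; paths are placeholders.
import numpy as np

scores = np.loadtxt("scores.txt")        # shape (n, 3): three scores per line
coeffs = np.loadtxt("coefficients.txt")  # shape (3,): one coefficient per score
em_weights = scores.dot(coeffs)          # weighted sum per line
np.savetxt("em_weights.txt", em_weights)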
Example No. 8
def text_classify(read_filename1, read_filename2, read_filename3,
                  write_filename):
    '''
    Query classification
    :param read_filename1:
    :param read_filename2:
    :param read_filename3:
    :param write_filename:
    '''

    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    search_texts = []
    f1 = open(read_filename3, 'r')
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    result = []
    for i in range(len(query_pattern)):
        this_result = query(query_pattern[i], search_texts, word_weight_dict)
        result.append(" ".join([str(x) for x in this_result]))

    quick_write_list_to_text(result, write_filename)
Example No. 9
def text_classify(read_filename1, read_filename2, read_filename3,
                  write_filename):

    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)

    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    search_texts = []
    f1 = open(read_filename3, 'r')
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()
    f1.close()

    result = []
    for i in range(len(search_texts)):
        result.append([])

    for i in range(len(query_pattern)):
        this_result = query2(query_pattern[i], search_texts, word_weight_dict)
        result[this_result].append(str(i))

    result_to_string = []
    for each in result:
        result_to_string.append(" ".join(each))

    quick_write_list_to_text(result_to_string, write_filename)
Example No. 10
def map_word_list(read_directory1, read_directory2, write_filename):

    # total number of files
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    result = []

    for i in range(file_number):

        word_list = []
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            word_list.append(line.strip())
            line = f.readline()

        f.close()

        vsm = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        vsm = vsm.T
        for each in vsm:
            result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
Example No. 11
def get_key_words(read_filename, write_filename1, write_filename2):
    '''
    Extract keywords using jieba segmentation
    :param read_filename:
    :param write_filename1:
    :param write_filename2:
    '''
    
    each_weibo_fenci = []        
    get_text_to_complex_list(each_weibo_fenci, read_filename, 0)
        
    key_words = []
    all_key_words =  []
    for row in range(len(each_weibo_fenci)):
        word_entity = []

        for each in each_weibo_fenci[row]:
            word_entity.append(each.split('/')[0])

        tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
        key_words.append(" ".join(tags))
            
        for word in " ".join(tags).split():
            if word not in all_key_words:
                all_key_words.append(word)
        
    quick_write_list_to_text(key_words, write_filename1)
    quick_write_list_to_text(all_key_words, write_filename2)
Example No. 12
def batch_em_cluster(read_directory, write_directory1, write_directory2):
    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    
    cluster_number = 8
    init_mu = 0.1
    init_sigma = 1.0
    
    for i in range(file_number):
        vsm = np.loadtxt(read_directory + '/' + str(i + 1) + '.txt')
        data_dimension = vsm.shape[1]
        
        init_means = []
        for j in range(cluster_number):
            init_means.append(init_sigma * np.random.randn(data_dimension) + init_mu)
        
        cluster_model = cluster.EMClusterer(init_means, bias=0.1)
        
        cluster_tag = cluster_model.cluster(vsm, True, trace=False)
        
        cluster_tag_to_string = [str(x) for x in cluster_tag]
        center_data = cluster_model._means
        
        quick_write_list_to_text(cluster_tag_to_string, write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_data, write_directory2 + '/' + str(i + 1) + '.txt')
Example No. 13
def hqd_word_segment(read_directory, write_directory1, write_directory2, write_directory3, write_directory4, write_directory5):
    '''
    
    :param read_directory:
    :param write_directory1:
    :param write_directory2:
    :param write_directory3:
    :param write_directory4:
    :param write_directory5:
    '''

    stopwords_list1 = get_stopwords1()
    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    for i in range(file_number):
        time_series = []
        class_tag = []
        
        content_with_tag = []
        content_without_tag = []
        
        all_weibo_word = []
        
        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            this_line = line.strip().split('\t')
            
            #this_id = this_line[0]
            this_time = time.mktime(time.strptime(this_line[2], '%Y/%m/%d %H:%M'))
            time_series.append(str(this_time))
            
            class_tag.append(this_line[5])

            try:
                this_text = this_line[6]
            except IndexError:
                this_text = " "
            
            wd_with_tag = word_segment(this_text, stopwords_list1)
            wd_without_tag = [x.split('/')[0] for x in wd_with_tag]
            
            # the words here carry POS tags
            for word in set(wd_with_tag).difference(all_weibo_word):
                if word not in all_weibo_word:
                    all_weibo_word.append(word)
            
            content_with_tag.append(" ".join(wd_with_tag))
            content_without_tag.append(" ".join(wd_without_tag))
            
            line = f.readline()
        f.close()
        
        quick_write_list_to_text(time_series, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_with_tag, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(content_without_tag, write_directory3 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(class_tag, write_directory4 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_weibo_word, write_directory5 + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
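The timestamp handling above combines time.strptime and time.mktime to turn a 'YYYY/MM/DD HH:MM' string into epoch seconds. A small self-contained sketch (the sample value is made up):

# Hedged sketch of the time conversion used above; the date string is illustrative only.
import time

raw = "2013/09/21 14:30"
seconds = time.mktime(time.strptime(raw, '%Y/%m/%d %H:%M'))  # float epoch seconds
print(seconds)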
Example No. 14
def text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    
    query_pattern = []
    get_text_to_complex_list(query_pattern, read_filename1, 0)
    
    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()
    
    search_texts = []
    f1 = open(read_filename3, 'r')
    line = f1.readline()
    while line:
        search_texts.append(line.strip())
        line = f1.readline()  
    f1.close()
    
    result = []
    for i in range(len(search_texts)):
        result.append([])
        
    for i in range(len(query_pattern)):
        this_result = query2(query_pattern[i], search_texts, word_weight_dict)
        result[this_result].append(str(i))
    
    result_to_string = []
    for each in result:
        result_to_string.append(" ".join(each))
    
    quick_write_list_to_text(result_to_string, write_filename)
Example No. 15
def get_key_words(read_directory, write_directory1, write_directory2):
    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        each_weibo_fenci = []
        get_text_to_complex_list(each_weibo_fenci,
                                 read_directory + '/' + str(i + 1) + '.txt', 2)

        key_words = []
        all_key_words = []
        for row in range(len(each_weibo_fenci)):
            word_entity = []

            for each in each_weibo_fenci[row]:
                word_entity.append(each.split('/')[0])

            tags = jieba.analyse.extract_tags(" ".join(word_entity), 3)
            key_words.append(" ".join(tags))

            for word in " ".join(tags).split():
                if word not in all_key_words:
                    all_key_words.append(word)

        quick_write_list_to_text(key_words,
                                 write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(all_key_words,
                                 write_directory2 + '/' + str(i + 1) + '.txt')
Example No. 16
def compute_em_weights(read_filename1, read_filename2, write_filename):
    '''
    Linear fusion
    :param read_filename1:
    :param read_filename2:
    :param write_filename:
    '''

    em_weights = []
    
    coefficients_string = [] 
    get_text_to_single_list(coefficients_string, read_filename2)   
    coefficients = [float(x) for x in coefficients_string]
        
    f = open(read_filename1, 'r')
    line = f.readline()
    while line:
        each_line = line.split()
        em_weights.append(float(each_line[0]) * coefficients[0] + float(each_line[1]) * coefficients[1] + float(each_line[2]) * coefficients[2])
            
        line = f.readline()
    f.close()
    
    em_weights_to_string = [str(x) for x in em_weights]
    quick_write_list_to_text(em_weights_to_string, write_filename)
Example No. 17
def topics_count(read_directory1, read_filename, write_directory):

    # total number of files
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows

    for i in range(file_number):
        id_series = []
        time_series = []

        f1 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        line = f1.readline()
        while line:
            id_series.append(line.split('\x7f')[0])
            #try:
            time_series.append(float(line.split('\x7f')[1]))
            #except:
            #time_series.append(41275.0)
            line = f1.readline()
        f1.close()

        all_tag = []
        topic_dict = {}
        j = 1
        k = 0

        while j < weibo_row:
            weibo_id = str(weibo_sheet.cell(j, 0).value).split('.')[0]
            weibo_time = weibo_sheet.cell(j, 2).value
            weibo_time = time_convert(weibo_time)
            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))

            if weibo_id == id_series[k] and np.abs(weibo_time -
                                                   time_series[k]) < 0.01:
                #if weibo_id == id_series[k] and weibo_time >= 41538 and weibo_time < 41548:
                if weibo_tag in all_tag:
                    topic_dict[weibo_tag] += 1
                else:
                    all_tag.append(weibo_tag)
                    topic_dict[weibo_tag] = 1

                k += 1

            j += 1
            if k >= len(id_series):
                break

        result = []
        for each in all_tag:
            result.append(each + ' ' + str(topic_dict[each]))

        quick_write_list_to_text(result,
                                 write_directory + '/' + str(i + 1) + '.txt')
Example No. 18
def select_top_N_words(read_directory1, read_directory2, write_directory):
    N = 1000
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    score_dict = {"nr":1.0, "nr1":0.5, "nr2":0.75, "nrt":1.0, "nrf":1.0, "ns":1.0, "nsf":1.0, "nt":1.0, \
                   "nz":1.0, "nl":0.5, "ng":0.5, "n":0.9, "t":0.5, "tg":0.5, "s":0.3, "f":0.3, "j":0.5, \
                   "v":0.7, "vd":0.6, "vn":0.9, "vshi":0.0, "vyou":0.0, "vf":0.3, "vx":0.3, "vi":0.7, \
                   "vl":0.3, "vg":0.5, "a":0.6, "ad":0.3, "an":0.9, "ag":0.5, "al":0.3, "b":0.3, "bl":0.2, \
                    "z":0.9, "zg":0.3, "r":0.3, "rr":0.3, "rz":0.3, "rzt":0.3, "rzs":0.3, "rzv":0.3, "ry":0.2, \
                    "ryt":0.2, "rys":0.2, "ryv":0.2, "rg":0.2, "m":0.6, "mq":0.5, "q":0.6, "qv":0.7, "qt":0.7, \
                    "d":0.4, "p":0.0, "pba":0.0, "pbei":0.0, "c":0.0, "cc":0.0, "u":0.0, "ug":0.0, "e":0.0, \
                    "y":0.0, "o":0.0, "h":0.0, "k":0.0, "x":0.0, "xx":0.0, "xu":0.9, "w":0.0, "l":0.6, "i":0.6, \
                    "g":0.0, "vq":0.0, "nrfg":0.75, "dg":0.0, "mg":0.2, "yg":0.0}
    
    for i in range(file_number):
        each_word_tf = [] 
        key_words = []
        
        select_word = []
        word_score = []
        
        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # nested list, 2 fields per row
        
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')
        
        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in key_words:
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))  
            else:
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 0.80)
                except KeyError:
                    word_score.append(float(0.0))
        
        # sort by weight, descending
        sw = zip(select_word, word_score)
        sw = sorted(sw, key = itemgetter(1), reverse = True)    
        
        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break
        
        
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
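The top-N selection above (zip the words with their scores, sort descending, stop after N) can also be written with heapq.nlargest. A small equivalent sketch on toy data:

# Hedged sketch: the same top-N selection via heapq.nlargest; data is illustrative only.
import heapq

select_word = ["a", "b", "c", "d"]
word_score = [0.2, 0.9, 0.5, 0.7]

top = heapq.nlargest(3, zip(select_word, word_score), key=lambda pair: pair[1])
result_all = [w + " " + str(s) for w, s in top]
print(result_all)  # ['b 0.9', 'd 0.7', 'c 0.5']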
Example No. 19
def topics_count(read_directory1, read_filename, write_directory):
    
    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    
    for i in range(file_number):    
        id_series = []
        time_series = []
        
        f1 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        line = f1.readline()
        while line:
            id_series.append(line.split('\x7f')[0])
            #try:
            time_series.append(float(line.split('\x7f')[1]))
            #except:
                #time_series.append(41275.0)
            line = f1.readline()
        f1.close()
      
        all_tag = []
        topic_dict = {}
        j = 1
        k = 0

        while j < weibo_row:
            weibo_id = str(weibo_sheet.cell(j, 0).value).split('.')[0]
            weibo_time = weibo_sheet.cell(j, 2).value
            weibo_time = time_convert(weibo_time)
            weibo_tag = str(int(weibo_sheet.cell(j, 5).value))
            
            if weibo_id == id_series[k] and np.abs(weibo_time - time_series[k]) < 0.01:
            #if weibo_id == id_series[k] and weibo_time >= 41538 and weibo_time < 41548:
                if weibo_tag in all_tag:
                    topic_dict[weibo_tag] += 1
                else:
                    all_tag.append(weibo_tag)
                    topic_dict[weibo_tag] = 1
                
                k += 1
            
            j += 1
            if k >= len(id_series):
                break
             
        
        result = []
        for each in all_tag:
            result.append(each + ' ' + str(topic_dict[each]))
    
        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')
Example No. 20
def generate_vsm_for_trans(read_filename):
    now_directory = os.getcwd()
    root_directory = os.path.dirname(now_directory) + '/'

    write_directory = root_directory + u'dataset'

    if (not (os.path.exists(write_directory))):
        os.mkdir(write_directory)

    write_filename = write_directory + u'/vsm.txt'

    pattern_list = []
    all_word_list = []

    f = open(read_filename, 'r')
    line = f.readline()
    while line:
        if len(line.split()) > 1:
            pattern_list.append(line.split())
            for each in line.split():
                if each not in all_word_list:
                    all_word_list.append(each)
        line = f.readline()
    f.close()

    vsm = []

    for i in range(len(pattern_list)):
        tf_dict = {}  # term-frequency (TF) dictionary
        for key in all_word_list:
            tf_dict[key] = 0

        for each in pattern_list[i]:
            try:
                tf_dict[each] = 1
            except KeyError:
                tf_dict[each] = 0

        this_line = []
        for key in all_word_list:
            this_line.append(tf_dict[key])

        vsm.append(this_line)

    vsm_to_string = []
    for each in vsm:
        vsm_to_string.append(" ".join([str(x) for x in each]))

    np_vsm = np.array([vsm])

    quick_write_list_to_text(vsm_to_string, write_filename)
    return np_vsm
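The inner loops above build a binary presence vector for each pattern over the global word list. The same construction written more compactly with a set lookup (toy data):

# Hedged sketch: binary presence vectors via set membership; data is illustrative only.
all_word_list = ["apple", "banana", "cherry"]
pattern_list = [["apple", "cherry"], ["banana"]]

vsm = []
for pattern in pattern_list:
    present = set(pattern)
    vsm.append([1 if word in present else 0 for word in all_word_list])
print(vsm)  # [[1, 0, 1], [0, 1, 0]]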
Example No. 21
def get_new_wordlist(read_directory1, read_directory2, write_directory,
                     write_filename):
    # total number of files
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])
    file_count = 1
    this_start_time = 41538
    new_word_list = []
    all_batch_id = []
    this_batch_id = []

    for i in range(file_number):
        time_series = []
        f = open(read_directory1 + "/" + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            time_series.append(float(line.split()[1]))
            line = f.readline()

        f.close()

        if time_series[0] >= 41548:
            break
        elif (time_series[-1] < 41538):
            pass
        else:
            word_list = []
            f1 = open(read_directory2 + '/' + str(i + 1) + '.txt', 'rb')
            line = f1.readline()
            while line:
                word_list.append(line.split()[0])
                line = f1.readline()
            f1.close()

            if (time_series[-1] - this_start_time < 2):
                for word in set(word_list).difference(new_word_list):
                    new_word_list.append(word)

                this_batch_id.append(str(i + 1))

            else:
                quick_write_list_to_text(
                    new_word_list,
                    write_directory + '/' + str(file_count) + '.txt')
                all_batch_id.append(" ".join(this_batch_id))

                new_word_list = []
                this_start_time = this_start_time + 2
                this_batch_id = []
                file_count = file_count + 1

    quick_write_list_to_text(all_batch_id, write_filename)
Example No. 22
def generate_vsm_for_trans(read_filename):
    now_directory = os.getcwd()
    root_directory = os.path.dirname(now_directory) + '/'
    
    write_directory = root_directory + u'dataset'
    
    if (not(os.path.exists(write_directory))):
        os.mkdir(write_directory)
    
    write_filename = write_directory + u'/vsm.txt'
    
    pattern_list = []
    all_word_list = []
    
    f = open(read_filename, 'r')
    line = f.readline()
    while line:
        if len(line.split()) > 1:
            pattern_list.append(line.split())
            for each in line.split():
                if each not in all_word_list:
                    all_word_list.append(each)
        line = f.readline()
    f.close()
    
    vsm = []
    
    for i in range(len(pattern_list)):
        tf_dict = {}  # term-frequency (TF) dictionary
        for key in all_word_list:
            tf_dict[key] = 0
            
        for each in pattern_list[i]:
            try:
                tf_dict[each] = 1
            except KeyError:
                tf_dict[each] = 0
            
        this_line = []
        for key in all_word_list:
            this_line.append(tf_dict[key])
        
        vsm.append(this_line)
    
    vsm_to_string = []
    for each in vsm:
        vsm_to_string.append(" ".join([str(x) for x in each]))
    
    np_vsm = np.array([vsm])
    
    quick_write_list_to_text(vsm_to_string, write_filename)
    return np_vsm
Example No. 23
def get_final_center(read_filename1, read_filename2, write_filename):

    result = []

    word_list = []
    get_text_to_single_list(word_list, read_filename2)

    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
Example No. 24
def count_word_tf(read_directory1, read_directory2, write_directory):
    '''
    Compute the term frequency of every word in each data slice
    :param read_directory1: directory of text files
    :param read_directory2: directory of all-word list files
    :param write_directory: output directory
    '''

    # total number of files
    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        # segmentation result of each text
        each_text_segment = []
        # all words in this data slice
        all_text_word = []

        get_text_to_complex_list(each_text_segment,
                                 read_directory1 + '/' + str(i + 1) + '.txt',
                                 0)
        get_text_to_single_list(all_text_word,
                                read_directory2 + '/' + str(i + 1) + '.txt')

        tf_dict = {}  # term-frequency (TF) dictionary
        for key in all_text_word:
            tf_dict[key] = 0

        for row in range(len(each_text_segment)):
            for j in range(len(each_text_segment[row])):
                try:
                    tf_dict[each_text_segment[row][j]] += 1
                except KeyError:
                    tf_dict[each_text_segment[row][j]] = 0

        # list of term frequencies
        value_list = []
        for key in all_text_word:
            value_list.append(tf_dict[key])

        # sort by term frequency, descending
        va = zip(all_text_word, value_list)
        va = sorted(va, key=itemgetter(1), reverse=True)

        result_all = ['-Word- -TF-']
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))

        # write to file
        quick_write_list_to_text(result_all,
                                 write_directory + '/' + str(i + 1) + '.txt')
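The counting loop above can be expressed more compactly with collections.Counter; only the words in all_text_word are read back out, so the result is the same. A sketch on toy data:

# Hedged sketch: the same TF counting with collections.Counter; data is illustrative only.
from collections import Counter

all_text_word = ["a", "b", "c"]
each_text_segment = [["a", "b", "a"], ["c", "a"]]

tf = Counter(word for row in each_text_segment for word in row)
value_list = [tf.get(key, 0) for key in all_text_word]
print(value_list)  # [3, 1, 1]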
Example No. 25
def top_N_words_tfidf_vsm_process(read_directory1, read_directory2,
                                  write_directory):
    '''
    Build the vector space of the Weibo texts; the values are term frequencies (TF)
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''

    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        each_weibo_fenci = []
        all_weibo_fenci = []

        get_text_to_complex_list(each_weibo_fenci,
                                 read_directory1 + '/' + str(i + 1) + '.txt',
                                 2)
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_fenci.append(line.strip().split()[0])
            line = f.readline()
        f.close()

        result = []

        for row in range(len(each_weibo_fenci)):

            tf_dict = {}  # term-frequency (TF) dictionary
            for key in all_weibo_fenci:
                tf_dict[key] = 0

            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] = 0

            this_line = []
            for key in all_weibo_fenci:
                this_line.append(str(tf_dict[key]))

            # join each row into a string for easy writing
            result.append(" ".join(this_line))

        quick_write_list_to_text(result,
                                 write_directory + '/' + str(i + 1) + '.txt')

    print "VSM Complete!!!"
Example No. 26
def hq_text_clustering(read_directory1, read_directory2, read_directory3, write_directory1, write_directory2):
    
    gamma = 0.01
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        
        THETA = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        PHAI = np.loadtxt(read_directory2 + '/' + str(i + 1) + '.txt')
        
        # word list of this data slice
        this_word_list = []
        f1 = open(read_directory3 + '/' + str(i + 1) + '.txt', 'rb')
        line = f1.readline()
        while line:
            this_word_list.append(line.split()[0])
            line = f1.readline()
        
        f1.close()
        
        if len(PHAI) >= 200:
            PHAI = np.array([PHAI])
        
        cluster_tag = []
        
        for j in range(len(THETA)):
            cluster_tag.append(str(np.argmax(THETA[j])))
        
        real_topics = []
        for j in range(len(PHAI)):
            this_topic = []
            this_topic_weight = []

            for k in range(len(PHAI[j])):
                if PHAI[j][k] > gamma:
                    this_topic.append(this_word_list[k])
                    this_topic_weight.append(PHAI[j][k])
            
            tt = zip(this_topic, this_topic_weight)
            tt = sorted(tt, key = itemgetter(1), reverse=True)
            this_topic = []
            for each in tt:
                this_topic.append(each[0])
            
            real_topics.append(" ".join(this_topic))

        quick_write_list_to_text(cluster_tag, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(real_topics, write_directory2 + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example No. 27
def get_final_center(read_filename1, read_filename2, write_filename):

    result = []

        
    word_list = []
    get_text_to_single_list(word_list, read_filename2)
        
    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))
    
    quick_write_list_to_text(result, write_filename)
Example No. 28
def compute_distance(read_directory1, read_directory2, read_directory3, write_filename, write_directory):
    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    center_d = []
    
    for i in range(file_number):
        center = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        center = center.T
        
        kl1 = KL_distance(center[0], center[1])
        kl2 = KL_distance(center[1], center[0])
        
        center_d.append(str(np.max([kl1, kl2])))
        
        cluster_data = []
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        cluster_da = f.readlines()
        f.close()
        for each in cluster_da:
            cluster_data.append([float(x) for x in each.split()])
        #cluster_data = get_text_to_nparray(read_directory2 + '/' + str(i + 1) + '.txt', 'float')
        
        f = open(read_directory3 + '/' + str(i + 1) + '.txt')
        cluster_tag = f.readlines()
        f.close()
        
        final_distance = []
        distance1 = 0.0
        distance2 = 0.0
        count1 = 0
        count2 = 0
        for j in range(len(cluster_tag)):
            if cluster_tag[j].strip() == '1':
                kl1 = KL_distance(center[0], cluster_data[j])
                kl2 = KL_distance(cluster_data[j], center[0])
                distance1 += np.max([kl1, kl2])
                count1 += 1
            if cluster_tag[j].strip() == '2':
                kl1 = KL_distance(center[1], cluster_data[j])
                kl2 = KL_distance(cluster_data[j], center[1])
                distance2 += np.max([kl1, kl2])
                count2 += 1
                
        final_distance.append(str(np.true_divide(distance1, count1)))
        final_distance.append(str(np.true_divide(distance2, count2)))
        quick_write_list_to_text(final_distance, write_directory + '/' + str(i + 1) + '.txt')
    
    quick_write_list_to_text(center_d, write_filename)
Example No. 29
def SP_CT_LDA(read_directory1, read_directory2, write_directory1, write_directory2, write_directory3):
    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        
        THETA = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        PHAI = np.loadtxt(read_directory2 + '/' + str(i + 1) + '.txt')
        
        # view 1: similarity between latent topics computed from their word distributions
        W1 = np.zeros((len(PHAI), len(PHAI)))
        for j in range(len(PHAI)):
            for k in range(j, len(PHAI)):
                W1[j, k] = 1.0 / (SKLD(PHAI[j], PHAI[k]) + 1.0)
                W1[k, j] = W1[j, k]

        # estimate the number of clusters
        cluster_number = get_cluster_number(W1)
        
        print cluster_number
        cluster_tag = spectral_cluster2(W1, cluster_number)
        
        # cluster analysis
        center_topic = np.zeros((cluster_number, len(PHAI[0])))
        each_cluster_number = np.zeros(cluster_number, int)
        
        weibo_topic_similarity = np.zeros((cluster_number, len(THETA)))
        THETA = THETA.transpose()
        
        for j in range(len(cluster_tag)):
            center_topic[cluster_tag[j]] += PHAI[j]
            each_cluster_number[cluster_tag[j]] += 1
            
            weibo_topic_similarity[cluster_tag[j]] += THETA[j]
        
        #
        for j in range(cluster_number):
            center_topic[j] = center_topic[j] / each_cluster_number[j]
            #weibo_topic_similarity[j] = weibo_topic_similarity[j] / each_cluster_number[j]
        
        weibo_topic_similarity = weibo_topic_similarity.transpose()
        
        ecn_to_string = [str(x) for x in each_cluster_number]
        
        write_matrix_to_text(weibo_topic_similarity, write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_topic, write_directory2 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(ecn_to_string, write_directory3 + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example No. 30
def get_new_wordlist(read_directory1, read_directory2, write_directory, write_filename):
    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    file_count = 1
    this_start_time = 41538
    new_word_list = []
    all_batch_id = []
    this_batch_id = []
    
    for i in range(file_number):
        time_series = []
        f = open(read_directory1 + "/" + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            time_series.append(float(line.split()[1]))
            line = f.readline()
        
        f.close()
        
        if time_series[0] >= 41548:
            break
        elif (time_series[-1] < 41538):
            pass
        else:
            word_list = []
            f1 = open(read_directory2 + '/' + str(i + 1) + '.txt', 'rb')
            line = f1.readline()
            while line:
                word_list.append(line.split()[0])
                line = f1.readline()
            f1.close()
            
            if (time_series[-1] - this_start_time < 2):
                for word in set(word_list).difference(new_word_list):
                    new_word_list.append(word)
                
                this_batch_id.append(str(i + 1))
                
            else:
                quick_write_list_to_text(new_word_list, write_directory + '/' + str(file_count) + '.txt')
                all_batch_id.append(" ".join(this_batch_id))
                
                new_word_list = []
                this_start_time = this_start_time + 2
                this_batch_id = []
                file_count = file_count + 1
    
    quick_write_list_to_text(all_batch_id, write_filename)
Example No. 31
def compute_distance(read_directory1, read_directory2, read_directory3,
                     write_filename, write_directory):
    # total number of files
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    center_d = []

    for i in range(file_number):
        center = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        center = center.T

        kl1 = KL_distance(center[0], center[1])
        kl2 = KL_distance(center[1], center[0])

        center_d.append(str(np.max([kl1, kl2])))

        cluster_data = np.loadtxt(read_directory2 + '/' + str(i + 1) + '.txt')
        cluster_data = cluster_data.T

        f = open(read_directory3 + '/' + str(i + 1) + '.txt')
        cluster_tag = f.readlines()
        f.close()

        final_distance = []
        distance1 = 0.0
        distance2 = 0.0
        count1 = 0
        count2 = 0
        for j in range(len(cluster_tag)):
            if cluster_tag[j].strip() == '1':
                kl1 = KL_distance(center[0], cluster_data[j])
                kl2 = KL_distance(cluster_data[j], center[0])
                distance1 += np.max([kl1, kl2])
                count1 += 1
            if cluster_tag[j].strip() == '2':
                kl1 = KL_distance(center[1], cluster_data[j])
                kl2 = KL_distance(cluster_data[j], center[1])
                distance2 += np.max([kl1, kl2])
                count2 += 1

        final_distance.append(str(np.true_divide(distance1, count1)))
        final_distance.append(str(np.true_divide(distance2, count2)))
        quick_write_list_to_text(final_distance,
                                 write_directory + '/' + str(i + 1) + '.txt')

    quick_write_list_to_text(center_d, write_filename)
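KL_distance is a project helper that is not shown in this listing. Assuming it computes the Kullback-Leibler divergence between two distributions, the symmetric max-of-both-directions distance used above can be sketched with scipy:

# Hedged sketch: symmetric KL distance, assuming KL_distance is plain KL divergence.
import numpy as np
from scipy.stats import entropy  # entropy(p, q) computes KL(p || q)

p = np.array([0.5, 0.3, 0.2])
q = np.array([0.4, 0.4, 0.2])

symmetric_kl = max(entropy(p, q), entropy(q, p))
print(symmetric_kl)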
Example No. 32
def count_word_tf(read_directory, write_directory):
       
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    
    for i in range(file_number):
        review_keywords = []
        
        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            for word in set(line.split()).difference(review_keywords):
                review_keywords.append(word)
                             
            line = f.readline()
        f.close()
        
        quick_write_list_to_text(review_keywords, write_directory + '/' + str(i + 1) + '.txt')
Example No. 33
def data_segment(read_filename, write_directory):
    weibo_sheet = open_sheet(read_filename)

    weibo_column = weibo_sheet.ncols
    weibo_row = weibo_sheet.nrows
    print 'Number of the Weibo row: %d' % weibo_row

    stopwords_list = get_stopwords()

    all_weibo_word = []
    each_weibo_fenci = []
    file_number = 1

    piece = 3000
    if weibo_row < piece:
        print "Exception:Data is too small!!!"
    else:
        for i in range(1, weibo_row):
            weibo_id = str(int(weibo_sheet.cell(i, 0).value))

            weibo_time = weibo_sheet.cell(i, 2).value
            weibo_time = time_convert(weibo_time)

            weibo_content = str(weibo_sheet.cell(i, weibo_column - 1).value)
            fenci_result = word_segment(weibo_content, stopwords_list)
            each_weibo_fenci.append(weibo_id.strip() + " " + str(weibo_time) +
                                    " " + " ".join(fenci_result))

            for word in set(fenci_result).difference(all_weibo_word):
                all_weibo_word.append(word)

            if i % piece == 0:
                quick_write_list_to_text(
                    each_weibo_fenci, write_directory + u'/each_weibo_fenci/' +
                    str(file_number) + '.txt')
                quick_write_list_to_text(
                    all_weibo_word, write_directory + u'/all_weibo_word/' +
                    str(file_number) + '.txt')
                file_number = file_number + 1
                each_weibo_fenci = []
                all_weibo_word = []
                if weibo_row - i < piece:
                    break

    print "Data Segmentation Complete!!!"
    print "Total Segments: %d" % (file_number - 1)
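open_sheet, cell(...).value, nrows and ncols follow the xlrd API for reading .xls workbooks. Assuming open_sheet is a thin wrapper around xlrd (not shown in this listing), a minimal sketch of the underlying calls; the path is a placeholder:

# Hedged sketch: assumes open_sheet wraps xlrd; the workbook path is a placeholder.
import xlrd

book = xlrd.open_workbook("weibo.xls")
sheet = book.sheet_by_index(0)
print(sheet.nrows)             # number of rows
print(sheet.ncols)             # number of columns
print(sheet.cell(0, 0).value)  # value of the first cell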
Example No. 34
def get_word_list(read_directory, write_directory):
    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    
    for i in range(file_number):
        word_list = []
        
        f = open(read_directory + '/' + str(i + 1) + '.txt', 'r')
        line = f.readline()
        while line:
            for each in line.split():
                if each not in word_list:
                    word_list.append(each)
            line = f.readline()
        f.close()
        
        quick_write_list_to_text(word_list, write_directory + '/' + str(i + 1) + '.txt')
Example No. 35
def count_word_tf(read_directory1, read_directory2, write_directory):
    '''
    Compute the term frequency of every word in each data slice
    :param read_directory1: directory of text files
    :param read_directory2: directory of all-word list files
    :param write_directory: output directory
    '''
    
    # total number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        # segmentation result of each text
        each_text_segment = [] 
        # all words in this data slice
        all_text_word = []
        
        get_text_to_complex_list(each_text_segment, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_text_word, read_directory2 + '/'+ str(i + 1) + '.txt')
        
        tf_dict = {}  # term-frequency (TF) dictionary
        for key in all_text_word:
            tf_dict[key] = 0
            
        for row in range(len(each_text_segment)):
            for j in range(len(each_text_segment[row])):
                try:
                    tf_dict[each_text_segment[row][j]] += 1
                except KeyError:
                    tf_dict[each_text_segment[row][j]] = 0
        
        # list of term frequencies
        value_list = []
        for key in all_text_word:
            value_list.append(tf_dict[key])
        
        # sort by term frequency, descending
        va = zip(all_text_word, value_list)
        va = sorted(va, key = itemgetter(1), reverse = True)    
        
        result_all = ['-Word- -TF-']
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))
        
        # write to file
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
Example No. 36
def top_N_words_tfidf_vsm_process(read_directory1, read_directory2, write_directory):
    '''
    Build the vector space of the Weibo texts; the values are term frequencies (TF)
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''
    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        each_weibo_fenci = [] 
        all_weibo_fenci = []
        
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            all_weibo_fenci.append(line.strip().split()[0])
            line = f.readline()  
        f.close()
        
        result = []
        
        for row in range(len(each_weibo_fenci)):
            
            tf_dict = {}  # term-frequency (TF) dictionary
            for key in all_weibo_fenci:
                tf_dict[key] = 0
            
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j].split('/')[0]] = 0
            
            this_line = []
            for key in all_weibo_fenci:
                this_line.append(str(tf_dict[key]))
            
            # join each row into a string for easy writing
            result.append(" ".join(this_line))
        
        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')
    
    print "VSM Complete!!!"
Example No. 37
def batch_count_tf(read_directory1, read_directory2, write_directory):
    '''
    
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''

    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        each_weibo_fenci = [] 
        all_weibo_fenci = []
        
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_weibo_fenci, read_directory2 + '/' + str(i + 1) + '.txt')
        
        tf_dict = {}  # term-frequency (TF) dictionary
        for key in all_weibo_fenci:
            tf_dict[key] = 0
            
        for row in range(len(each_weibo_fenci)):
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j]] = 0
        
        # list of term frequencies
        value_list = []
        for key in all_weibo_fenci:
            value_list.append(tf_dict[key])
        
        # sort by term frequency, descending
        va = zip(all_weibo_fenci, value_list)
        va = sorted(va, key = itemgetter(1), reverse = True)    
        
        result_all = []
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))
        
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example No. 38
def data_segment(read_filename, write_directory):
    '''
    Split the data into segments
    :param read_filename: input file
    :param write_directory: output directory
    '''

    # starting file number
    file_number = 1

    print "Begin data segmentation!!!"
    print "May take a long time, Please Wait..."

    # segmented words of each Weibo post
    weibo_content_segment = []

    # ID (line index) of each post
    weibo_id_segment = []

    line_count = 0

    fr = open(read_filename)
    line = fr.readline()
    while line:
        weibo_content_segment.append(line.strip())
        weibo_id_segment.append(str(line_count))
        line_count += 1

        if line_count % 5000 == 0:
            # write to file
            quick_write_list_to_text(
                weibo_content_segment, write_directory + u'/weibo_segment/' +
                str(file_number) + '.txt')
            quick_write_list_to_text(
                weibo_id_segment,
                write_directory + u'/weibo_id/' + str(file_number) + '.txt')
            file_number += 1
            weibo_content_segment = []
            weibo_id_segment = []

        line = fr.readline()
    fr.close()

    print "Data Segmentation Complete!!!"
    print "Total Segments: %d" % (file_number - 1)
Example No. 39
def map_word_list(read_filename1, read_filename2, write_filename):
    
    word_list = []
    f = open(read_filename2, 'rb')
    line = f.readline()
    while line:
        word_list.append(line.strip().split(',')[0])
        line = f.readline()
    
    f.close()
    
    word_result = []
    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        word_result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))
    
    quick_write_list_to_text(word_result, write_filename)
Example No. 40
def count_word_tf(read_directory, write_directory):

    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        review_keywords = []

        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            for word in set(line.split()).difference(review_keywords):
                review_keywords.append(word)

            line = f.readline()
        f.close()

        quick_write_list_to_text(review_keywords,
                                 write_directory + '/' + str(i + 1) + '.txt')
Example No. 41
def get_word_list(read_directory, write_directory):

    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory)])

    for i in range(file_number):
        word_list = []

        f = open(read_directory + '/' + str(i + 1) + '.txt', 'r')
        line = f.readline()
        while line:
            for each in line.split():
                if each not in word_list:
                    word_list.append(each)
            line = f.readline()
        f.close()

        quick_write_list_to_text(word_list,
                                 write_directory + '/' + str(i + 1) + '.txt')
Example No. 42
def sample_real_center(read_filename1, read_filename2, write_filename):

    result = []

    word_list = []
    f = open(read_filename2)
    line = f.readline()
    while line:
        word_list.append(line.strip().split()[0])
        line = f.readline()
    f.close()

    word_list = word_list[0:1000]

    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
Example No. 43
def spct_prf(read_filename1, read_filename2, write_filename):
    
    cluster_tag = []
    real_tag = []
    
    get_text_to_single_list(cluster_tag, read_filename1)
    get_text_to_single_list(real_tag, read_filename2)

    cluster_tag = [int(x) for x in cluster_tag]
    real_tag = [int(x) for x in real_tag]
    
    reflect = [20, 21, 20]
    
    p, r, f = prf(cluster_tag, real_tag, reflect)
    print p
    print r
    print f
    

    quick_write_list_to_text([str(p), str(r), str(f)], write_filename)
Example No. 44
def data_segment(read_filename, write_directory):
    weibo_sheet = open_sheet(read_filename)
    
    weibo_column = weibo_sheet.ncols
    weibo_row = weibo_sheet.nrows
    print 'Number of the Weibo row: %d' % weibo_row
    
    stopwords_list = get_stopwords()
    
    all_weibo_word = []
    each_weibo_fenci = []
    file_number = 1
    
    piece = 3000
    if weibo_row < piece:
        print "Exception:Data is too small!!!"
    else:
        for i in range(1, weibo_row):
            weibo_id = str(int(weibo_sheet.cell(i, 0).value))
               
            weibo_time = weibo_sheet.cell(i, 2).value
            weibo_time = time_convert(weibo_time)
        
            weibo_content = str(weibo_sheet.cell(i, weibo_column - 1).value)
            fenci_result = word_segment(weibo_content, stopwords_list)
            each_weibo_fenci.append(weibo_id.strip() + " " + str(weibo_time) + " " + " ".join(fenci_result))
            
            for word in set(fenci_result).difference(all_weibo_word):
                all_weibo_word.append(word)     
            
            if i % piece == 0:
                quick_write_list_to_text(each_weibo_fenci, write_directory + u'/each_weibo_fenci/' + str(file_number) + '.txt')
                quick_write_list_to_text(all_weibo_word, write_directory + u'/all_weibo_word/' + str(file_number) + '.txt')
                file_number = file_number + 1
                each_weibo_fenci = []
                all_weibo_word = []
                if weibo_row - i < piece:
                    break
        
    print "Data Segmentation Complete!!!"
    print "Total Segments: %d" % (file_number - 1)    
Example No. 45
def get_weibo_entropy(read_directory1, read_directory2, write_directory):
    '''
    Compute the information entropy of each text
    :param read_directory1: directory of term-frequency vector files
    :param read_directory2: directory of top-N word files
    :param write_directory: output directory
    '''

    # total number of files
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        word_weight = []
        
        # load the term-frequency vectors
        f0 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        each_vsm = f0.readlines()
        f0.close()
        
        # load the word weights into a list
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            word_weight.append(float(line.split()[1]))
            line = f.readline()  
        f.close()
        
        # the resulting word_weight is a numpy array
        word_weight = np.log2(word_weight)
        
        entropy_all = []
        
        for each in each_vsm:
            # compute the entropy value
            each_line_vsm = np.array([float(x) for x in each.split()])
            entropy_all.append(str(np.dot(word_weight, each_line_vsm)))
        
        # write to file
        quick_write_list_to_text(entropy_all, write_directory + '/' + str(i + 1) + '.txt')
    
    print "Compute Entropy Complete!!!"
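Each entropy value above is a dot product between the log2 word weights and one term-frequency row, so the whole loop can be vectorised. A small equivalent sketch on toy data:

# Hedged sketch: the same entropy computation vectorised over all rows; data is illustrative only.
import numpy as np

word_weight = np.array([2.0, 4.0, 8.0])      # raw weights, one per word
vsm = np.array([[1, 0, 2], [0, 3, 1]])       # term-frequency rows
entropy_all = vsm.dot(np.log2(word_weight))  # one entropy value per row
print(entropy_all)  # [ 7.  9.]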
Example No. 46
def pre_text_classify(read_filename1, read_filename2, write_filename):
    
    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    
    select_number = 3
    
    word_list = []
    word_weight = []
    #word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_list.append(line.split()[0])
        word_weight.append(line.split()[1])

        line = f.readline()
    f.close()
    
    word_list = word_list[0:1000]
    word_weight = word_weight[0:1000]
    
    total_result = []
    for i in range(len(vsm)):
        weight = []
        for j in range(len(word_list)):
            weight.append(vsm[i, j])
        
        ww = zip(word_list, weight)
        ww = sorted(ww, key = itemgetter(1), reverse = True)
        
        word_result = []
        count_number = 1
        for each in ww:
            word_result.append(each[0])
            count_number += 1
            if count_number > select_number:
                break
        total_result.append(" ".join(word_result))
    
    quick_write_list_to_text(total_result, write_filename)
Ejemplo n.º 47
0
def data_segment(read_filename, write_directory):
    '''
    Split the data into fixed-size segments.
    :param read_filename: input file
    :param write_directory: output directory
    '''
    
    # number of the first output file
    file_number = 1
    
    print "Begin data segmentation!!!" 
    print "May take a long time, Please Wait..."
        
    # segmented content lines in the current chunk
    weibo_content_segment = []
    
    # sequential line ids for the current chunk
    weibo_id_segment = []

    line_count = 0
    
    fr = open(read_filename)
    line = fr.readline()
    while line:
        weibo_content_segment.append(line.strip())
        weibo_id_segment.append(str(line_count))
        line_count += 1
        
        if line_count % 5000 == 0:
            # flush the current 5000-line chunk to numbered files
            quick_write_list_to_text(weibo_content_segment, write_directory + u'/weibo_segment/' + str(file_number) + '.txt')
            quick_write_list_to_text(weibo_id_segment, write_directory + u'/weibo_id/' + str(file_number) + '.txt')
            file_number += 1
            weibo_content_segment = []
            weibo_id_segment = []
            
        line = fr.readline()     
    fr.close()

    print "Data Segmentation Complete!!!"
    print "Total Segments: %d" % (file_number - 1)    
Ejemplo n.º 48
0
def pre_text_classify(read_filename1, read_filename2, write_filename):

    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T

    select_number = 3

    word_list = []
    word_weight = []  # collected alongside word_list but not used in the selection below
    #word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_list.append(line.split()[0])
        word_weight.append(line.split()[1])

        line = f.readline()
    f.close()

    word_list = word_list[0:1000]
    word_weight = word_weight[0:1000]

    total_result = []
    for i in range(len(vsm)):
        weight = []
        for j in range(len(word_list)):
            weight.append(vsm[i, j])

        ww = zip(word_list, weight)
        ww = sorted(ww, key=itemgetter(1), reverse=True)

        word_result = []
        count_number = 1
        for each in ww:
            word_result.append(each[0])
            count_number += 1
            if count_number > select_number:
                break
        total_result.append(" ".join(word_result))

    quick_write_list_to_text(total_result, write_filename)
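The inner loop above keeps the select_number highest-weighted words for each row. Under the same assumptions, heapq.nlargest expresses that top-k selection more directly (a sketch, not the project's code):

import heapq
from operator import itemgetter

def top_k_words(word_list, weights, k=3):
    # pair each word with its weight and keep the k largest by weight
    return [w for w, _ in heapq.nlargest(k, zip(word_list, weights), key=itemgetter(1))]

# top_k_words(['a', 'b', 'c', 'd'], [0.1, 0.9, 0.4, 0.7]) -> ['b', 'd', 'c']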
Ejemplo n.º 49
0
def sample_vsm(read_filename1, read_filename2, write_filename):
    
    weibo_content = []
    all_word_list = []
    
    select_number = 1000
    
    get_text_to_complex_list(weibo_content, read_filename1, 0)
    
    f = open(read_filename2)
    line = f.readline()
    while line:
        all_word_list.append(line.strip().split()[0])
        line = f.readline()  
    f.close()
    
    all_word_list = all_word_list[0 : select_number]
    
    vsm = []
        
    for row in range(len(weibo_content)):
            
        tf_dict = {}  # term-frequency (TF) dictionary for this row
        for key in all_word_list:
            tf_dict[key] = 0
            
        for j in range(len(weibo_content[row])):
            try:
                tf_dict[weibo_content[row][j].split('/')[0]] += 1
            except KeyError:
                # word outside the selected vocabulary; it never reaches the output
                tf_dict[weibo_content[row][j].split('/')[0]] = 0
            
        this_line = []
        for key in all_word_list:
            this_line.append(str(tf_dict[key]))
            
        # join each row into a single string for writing
        vsm.append(" ".join(this_line))
        
    quick_write_list_to_text(vsm, write_filename)
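The per-row term-frequency dictionary can also be built with collections.Counter restricted to the selected vocabulary; a sketch under the same assumption that tokens are '/'-delimited word/POS pairs:

from collections import Counter

def row_tf(tokens, vocabulary):
    # tokens look like 'word/pos'; words outside the vocabulary are ignored
    counts = Counter(t.split('/')[0] for t in tokens)
    return [str(counts.get(word, 0)) for word in vocabulary]

# row_tf(['rain/n', 'rain/n', 'sun/n'], ['rain', 'wind']) -> ['2', '0']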
Ejemplo n.º 51
0
def count_word_tf(read_filename1, read_filename2, write_filename):
    '''
    Count the term frequency of every word in the data.
    :param read_filename1: file of segmented texts
    :param read_filename2: file of the full word list
    :param write_filename: output file
    '''
    
    each_weibo_fenci = [] 
    all_weibo_fenci = []
        
    get_text_to_complex_list(each_weibo_fenci, read_filename1, 0)
    get_text_to_single_list(all_weibo_fenci, read_filename2)
        
    tf_dict = {}  # term-frequency (TF) dictionary
    for key in all_weibo_fenci:
        tf_dict[key] = 0
            
    for row in range(len(each_weibo_fenci)):
        for j in range(len(each_weibo_fenci[row])):
            try:
                tf_dict[each_weibo_fenci[row][j]] += 1
            except KeyError:
                # word missing from all_weibo_fenci; it is skipped in the output
                tf_dict[each_weibo_fenci[row][j]] = 0
        
    # frequency list aligned with all_weibo_fenci
    value_list = []
    for key in all_weibo_fenci:
        value_list.append(tf_dict[key])
        
    # sort by frequency in descending order
    va = zip(all_weibo_fenci, value_list)
    va = sorted(va, key=itemgetter(1), reverse=True)
        
    result_all = []
    for each in va:
        result_all.append(each[0] + " " + str(each[1]))
       
    quick_write_list_to_text(result_all, write_filename)
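For the corpus-wide counts above, collections.Counter.most_common yields the same frequency-sorted lines in a few statements; a sketch that, like the original, only reports words from all_weibo_fenci:

from collections import Counter

def count_word_tf_sketch(each_weibo_fenci, all_weibo_fenci):
    vocabulary = set(all_weibo_fenci)
    counts = Counter(w for row in each_weibo_fenci for w in row if w in vocabulary)
    # frequency-descending "word count" lines, with zero-count vocabulary words appended
    return ["%s %d" % (w, c) for w, c in counts.most_common()] + \
           ["%s 0" % w for w in all_weibo_fenci if w not in counts]

# count_word_tf_sketch([['a', 'b', 'a']], ['a', 'b', 'c']) -> ['a 2', 'b 1', 'c 0']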
Ejemplo n.º 52
0
def vsm_update(read_directory1, read_directory2, write_directory1,
               write_directory2):
    '''
    Remove all-zero rows from the VSM files.
    :param read_directory1: directory of VSM files
    :param read_directory2: directory of id/time files
    :param write_directory1: output directory for the filtered VSM files
    :param write_directory2: output directory for the filtered id/time files
    '''
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        update_vsm = []
        update_id_time = []

        f1 = open(read_directory1 + '/' + str(i + 1) + '.txt')
        each_weibo_vsm = f1.readlines()
        f1.close()

        id_time = []

        get_text_to_complex_list2(id_time,
                                  read_directory2 + '/' + str(i + 1) + '.txt',
                                  0, 2)

        for j in range(len(each_weibo_vsm)):
            int_each_weibo_vsm = [int(x) for x in each_weibo_vsm[j].split()]
            # keep only rows with at least one nonzero count
            if np.sum(int_each_weibo_vsm) > 0.1:
                update_vsm.append(each_weibo_vsm[j])
                update_id_time.append(" ".join(id_time[j]))

        quick_write_list_to_text2(update_vsm,
                                  write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(update_id_time,
                                 write_directory2 + '/' + str(i + 1) + '.txt')

    print "VSM Update Complete!!!"
Ejemplo n.º 53
0
def global_sort_by_time(update_item_index, read_directory, write_directory):
    
    print "Begin sorting." 
    print "May take a long time, Please Wait..."
    line_count = 1
    file_count = 880
    review_result = []
    
    for i in range(len(update_item_index)):
        
        f1 = open(read_directory + "/" + update_item_index[i][0] + ".txt", "rb")
        each_review_text = f1.readlines()
        f1.close()
        
        #try:
        time_index = int(update_item_index[i][1])
        # a fixed window of lines around time_index: six before, the line itself, one after
        for offset in range(-6, 2):
            review_result.append(each_review_text[time_index + offset].strip())
        review_result.append("")
        
        #except IndexError:
            #review_result.append("\n")
               
        line_count += 1
        
        if line_count > 5000:
            quick_write_list_to_text(review_result, write_directory + "/" + str(file_count) + ".txt")
            
            review_result = []
            line_count = 1
            file_count += 1
            
    print "Sort Complete!!!"
Ejemplo n.º 54
0
def pattern_cluster(read_filename1, read_filename2, read_filename3, write_filename1, write_filename2):
    pattern_list = []
    f = open(read_filename1, 'r')
    line = f.readline()
    while line:
        if len(line.split()) > 1:
            pattern_list.append(line.split())
        line = f.readline()
    f.close()
    
    word_weight_dict = {}
    f = open(read_filename2, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()
    
    # call compute_similarity to build the similarity matrix and determine the number of clusters
    similarity_matrix, cluster_number = compute_similarity(pattern_list, read_filename3, word_weight_dict)
    
    write_matrix_to_text(similarity_matrix, write_filename1)
    quick_write_list_to_text([str(cluster_number)], write_filename2)
Ejemplo n.º 55
0
def kmeans_evaluate(read_filename1, read_filename2, write_directory):

    # both tag lists are read as strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename1)

    cluster_tag = []
    get_text_to_single_list(cluster_tag, read_filename2)

    real_tag = real_tag[0:len(cluster_tag)]

    # cluster id = list index + 1; each entry lists the ground-truth label(s) mapped to that cluster
    reflect_tag = [['6', '8'], ['4'], ['5'], ['7'], ['3'], ['2'], ['6', '8'],
                   ['1']]

    cluster_partion = []
    for i in range(len(reflect_tag)):
        cluster_partion.append([])

    for i in range(len(cluster_tag)):
        cluster_partion[int(cluster_tag[i]) - 1].append(str(i))

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))

        correct = len(set(cluster_partion[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct, len(set(cluster_partion[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))

        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure
    quick_write_list_to_text(precision_list,
                             write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
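For each cluster, the evaluation intersects the predicted member set with the ground-truth member set and derives precision, recall, and F-measure. A tiny worked example with made-up ids:

import numpy as np

predicted = set(['0', '1', '2', '3'])      # ids assigned to one cluster
truth = set(['1', '2', '3', '4', '5'])     # ids carrying the mapped ground-truth labels

correct = len(predicted & truth)                        # 3
precision = np.true_divide(correct, len(predicted))     # 3/4 = 0.75
recall = np.true_divide(correct, len(truth))            # 3/5 = 0.6
fmeasure = np.true_divide(2.0 * precision * recall,
                          precision + recall)           # 0.9 / 1.35 = 0.666...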
Ejemplo n.º 56
0
def global_segment(read_filename, write_directory):
    '''
    Segment every Weibo text and collect each Weibo's id, time, and class tag.
    :param read_filename: input spreadsheet file
    :param write_directory: output directory
    '''

    stopwords_list1 = get_stopwords1()
    stopwords_list2 = get_stopwords2()
        
    global_id = []
    global_time = []
    global_tag = []

    weibo_sheet = open_sheet(read_filename)
    weibo_row = weibo_sheet.nrows
    print 'Number of Weibo rows: %d' % weibo_row
    
    f1 = open(write_directory + '/weibo_content.txt', 'w')
    f2 = open(write_directory + '/weibo_content2.txt', 'w')

    for j in range(1, weibo_row):
       
        weibo_id = str(int(weibo_sheet.cell(j, 0).value))
        weibo_time = weibo_sheet.cell(j, 2).value
        weibo_time = time_convert(weibo_time)
            
        weibo_tag = str(int(weibo_sheet.cell(j, 5).value))
                
        global_id.append(weibo_id)
        global_time.append(str(weibo_time))
        
        weibo_content = str(weibo_sheet.cell(j, 6).value)
        fenci_result = word_segment(weibo_content, stopwords_list1, stopwords_list2)
        f1.write(" ".join(fenci_result))
        f1.write("\n")
        
        fenci_without_tag = [x.split('/')[0] for x in fenci_result]
        f2.write(" ".join(fenci_without_tag))
        f2.write("\n")
        global_tag.append(weibo_tag)
    
    f1.close()
    f2.close()

    quick_write_list_to_text(global_id, write_directory + '/weibo_id.txt')  
    quick_write_list_to_text(global_time, write_directory + '/weibo_time.txt')
    quick_write_list_to_text(global_tag, write_directory + '/weibo_class_tag.txt')
Ejemplo n.º 57
0
def data_sample(read_directory, write_directory1, write_directory2):
    '''
    Compress each term-frequency matrix with a random Gaussian projection.
    :param read_directory: directory of VSM files
    :param write_directory1: output directory for the projected matrices
    :param write_directory2: output directory for the timing and ratio logs
    '''
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory)])
    sample_size = 250
    sample_time = []
    ratio = []

    for i in range(file_number):
        vsm_matrix = get_text_to_nparray(
            read_directory + '/' + str(i + 1) + '.txt', 'int')
        vsm_matrix = vsm_matrix.T

        print 'Batch: %d' % (i + 1)
        start = time.clock()

        data_dimension = vsm_matrix.shape[0]

        Q = np.zeros((sample_size, data_dimension))
        for k in range(Q.shape[0]):
            for j in range(Q.shape[1]):
                Q[k, j] = random.gauss(
                    1, np.sqrt(np.true_divide(1, np.sqrt(sample_size))))

        sample_result = np.dot(Q, vsm_matrix)

        this_ratio = np.true_divide(sample_size, data_dimension) * 8.0 / 4.0
        ratio.append(str(this_ratio))

        interval = time.clock() - start
        print 'Time: %f' % interval
        sample_time.append(str(interval))

        write_result = []
        for each in sample_result:
            write_result.append(" ".join([str(x) for x in each]))
        quick_write_list_to_text(write_result,
                                 write_directory1 + '/' + str(i + 1) + '.txt')

    quick_write_list_to_text(sample_time,
                             write_directory2 + '/sample_time.txt')
    quick_write_list_to_text(ratio, write_directory2 + '/ratio.txt')
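The nested loops above fill Q entry-by-entry with Gaussian draws of mean 1 and standard deviation sqrt(1/sqrt(sample_size)); np.random.normal builds the same matrix in one call. A sketch with a made-up count matrix standing in for the loaded VSM:

import numpy as np

sample_size = 250
vsm_matrix = np.random.randint(0, 3, size=(1000, 40))   # made-up (dimension x documents) counts

sigma = np.sqrt(np.true_divide(1, np.sqrt(sample_size)))
Q = np.random.normal(1.0, sigma, size=(sample_size, vsm_matrix.shape[0]))

sample_result = np.dot(Q, vsm_matrix)    # (sample_size x documents), as in data_sample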