Example No. 1
def compute_em_weights(read_filename1, read_filename2, write_filename):
    '''
    Linear fusion
    :param read_filename1:
    :param read_filename2:
    :param write_filename:
    '''

    em_weights = []

    coefficients_string = []
    get_text_to_single_list(coefficients_string, read_filename2)
    coefficients = [float(x) for x in coefficients_string]

    f = open(read_filename1, 'r')
    line = f.readline()
    while line:
        each_line = line.split()
        em_weights.append(
            float(each_line[0]) * coefficients[0] +
            float(each_line[1]) * coefficients[1] +
            float(each_line[2]) * coefficients[2])

        line = f.readline()
    f.close()

    em_weights_to_string = [str(x) for x in em_weights]
    quick_write_list_to_text(em_weights_to_string, write_filename)
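All of these snippets rely on helper functions (get_text_to_single_list, get_text_to_complex_list, quick_write_list_to_text) and imports (os, time, numpy as np, operator.itemgetter) defined elsewhere in the project and not shown here. The following is only a minimal sketch of how the helpers appear to behave, inferred from their call sites; the real implementations may differ (in particular, the integer argument of get_text_to_complex_list is assumed here to be a starting column offset).

# Hypothetical reimplementations of the unshown I/O helpers (inferred from usage).
def get_text_to_single_list(target_list, filename):
    # Append one stripped line per element to the caller's list.
    f = open(filename, 'r')
    for line in f:
        if line.strip():
            target_list.append(line.strip())
    f.close()

def get_text_to_complex_list(target_list, filename, start_column):
    # Append one list of whitespace-split tokens per line,
    # keeping tokens from start_column onward (assumed meaning of the third argument).
    f = open(filename, 'r')
    for line in f:
        tokens = line.split()
        if tokens:
            target_list.append(tokens[start_column:])
    f.close()

def quick_write_list_to_text(string_list, filename):
    # Write each string on its own line.
    f = open(filename, 'w')
    f.write("\n".join(string_list) + "\n")
    f.close()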
Example No. 2
def compute_em_weights(read_filename1, read_filename2, write_filename):
    '''
    Linear fusion
    :param read_filename1:
    :param read_filename2:
    :param write_filename:
    '''

    em_weights = []
    
    coefficients_string = [] 
    get_text_to_single_list(coefficients_string, read_filename2)   
    coefficients = [float(x) for x in coefficients_string]
        
    f = open(read_filename1, 'r')
    line = f.readline()
    while line:
        each_line = line.split()
        em_weights.append(float(each_line[0]) * coefficients[0] + float(each_line[1]) * coefficients[1] + float(each_line[2]) * coefficients[2])
            
        line = f.readline()
    f.close()
    
    em_weights_to_string = [str(x) for x in em_weights]
    quick_write_list_to_text(em_weights_to_string, write_filename)
def select_top_N_words(read_directory1, read_directory2, write_directory):
    N = 1000
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    score_dict = {"nr":1.0, "nr1":0.5, "nr2":0.75, "nrt":1.0, "nrf":1.0, "ns":1.0, "nsf":1.0, "nt":1.0, \
                   "nz":1.0, "nl":0.5, "ng":0.5, "n":0.9, "t":0.5, "tg":0.5, "s":0.3, "f":0.3, "j":0.5, \
                   "v":0.7, "vd":0.6, "vn":0.9, "vshi":0.0, "vyou":0.0, "vf":0.3, "vx":0.3, "vi":0.7, \
                   "vl":0.3, "vg":0.5, "a":0.6, "ad":0.3, "an":0.9, "ag":0.5, "al":0.3, "b":0.3, "bl":0.2, \
                    "z":0.9, "zg":0.3, "r":0.3, "rr":0.3, "rz":0.3, "rzt":0.3, "rzs":0.3, "rzv":0.3, "ry":0.2, \
                    "ryt":0.2, "rys":0.2, "ryv":0.2, "rg":0.2, "m":0.6, "mq":0.5, "q":0.6, "qv":0.7, "qt":0.7, \
                    "d":0.4, "p":0.0, "pba":0.0, "pbei":0.0, "c":0.0, "cc":0.0, "u":0.0, "ug":0.0, "e":0.0, \
                    "y":0.0, "o":0.0, "h":0.0, "k":0.0, "x":0.0, "xx":0.0, "xu":0.9, "w":0.0, "l":0.6, "i":0.6, \
                    "g":0.0, "vq":0.0, "nrfg":0.75, "dg":0.0, "mg":0.2, "yg":0.0}
    
    for i in range(file_number):
        each_word_tf = [] 
        key_words = []
        
        select_word = []
        word_score = []
        
        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # 2-D list; each inner list has 2 elements (header row dropped)
        
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')
        
        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in key_words:
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))  
            else:
                select_word.append(word_entity)
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 0.80)
                except KeyError:
                    word_score.append(float(0.0))
        
        # sort by weight in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key = itemgetter(1), reverse = True)    
        
        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break
        
        
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
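In select_top_N_words above, each entry of each_word_tf is a "<word>/<POS-tag>" token plus its term frequency; the score is the TF times the tag weight from score_dict, with an extra 0.80 factor when the word is not in the keyword list. A worked example with a hypothetical entry:

# Hypothetical [word/tag, tf] entry; the "ns" (place name) tag has weight 1.0 in score_dict.
entry = ["Beijing/ns", "12"]
word_entity = entry[0].split('/')[0]   # "Beijing"
word_tag = entry[0].split('/')[1]      # "ns"
tf = float(entry[1])                   # 12.0

score_if_keyword = tf * 1.0 * 1.0      # 12.0  (word appears in key_words)
score_otherwise = tf * 1.0 * 0.80      # 9.6   (word not in key_words)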
Example No. 4
def get_final_center(read_filename1, read_filename2, write_filename):

    result = []

    word_list = []
    get_text_to_single_list(word_list, read_filename2)

    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
Example No. 5
def kmeans_evaluate(read_filename1, read_filename2, write_directory):

    # list of strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename1)

    cluster_tag = []
    get_text_to_single_list(cluster_tag, read_filename2)

    real_tag = real_tag[0:len(cluster_tag)]

    # list index + 1 is the cluster number; each entry lists the corresponding ground-truth label IDs
    reflect_tag = [['6', '8'], ['4'], ['5'], ['7'], ['3'], ['2'], ['6', '8'],
                   ['1']]

    cluster_partion = []
    for i in range(len(reflect_tag)):
        cluster_partion.append([])

    for i in range(len(cluster_tag)):
        cluster_partion[int(cluster_tag[i]) - 1].append(str(i))

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))

        correct = len(set(cluster_partion[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct, len(set(cluster_partion[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))

        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure
    quick_write_list_to_text(precision_list,
                             write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
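For reference, the per-cluster precision, recall and F-measure printed above use the standard set-overlap definitions. A tiny worked example on hypothetical member sets (document indices stored as strings, as in the code):

import numpy as np

# Hypothetical partitions: documents assigned to one cluster vs. its ground-truth class.
cluster_members = set(['0', '1', '2', '5'])
real_members = set(['0', '2', '3'])

correct = len(cluster_members & real_members)                    # 2
this_precision = np.true_divide(correct, len(cluster_members))   # 0.5
this_recall = np.true_divide(correct, len(real_members))         # ~0.667
this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                               this_precision + this_recall)     # ~0.571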
Example No. 6
def count_word_tf(read_directory1, read_directory2, write_directory):
    '''
    Count the term frequency of every word in each data slice.
    :param read_directory1: directory of text files
    :param read_directory2: directory of vocabulary (all-words) files
    :param write_directory: output directory
    '''

    # total number of files
    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    for i in range(file_number):
        # word segmentation result of each text
        each_text_segment = []
        # all words in this data slice
        all_text_word = []

        get_text_to_complex_list(each_text_segment,
                                 read_directory1 + '/' + str(i + 1) + '.txt',
                                 0)
        get_text_to_single_list(all_text_word,
                                read_directory2 + '/' + str(i + 1) + '.txt')

        tf_dict = {}  # term-frequency (TF) dictionary
        for key in all_text_word:
            tf_dict[key] = 0

        for row in range(len(each_text_segment)):
            for j in range(len(each_text_segment[row])):
                try:
                    tf_dict[each_text_segment[row][j]] += 1
                except KeyError:
                    tf_dict[each_text_segment[row][j]] = 0

        # term-frequency list
        value_list = []
        for key in all_text_word:
            value_list.append(tf_dict[key])

        # sort by term frequency in descending order
        va = zip(all_text_word, value_list)
        va = sorted(va, key=itemgetter(1), reverse=True)

        result_all = ['-Word- -TF-']
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))

        # write to file
        quick_write_list_to_text(result_all,
                                 write_directory + '/' + str(i + 1) + '.txt')
Example No. 7
def get_final_center(read_filename1, read_filename2, write_filename):

    result = []

        
    word_list = []
    get_text_to_single_list(word_list, read_filename2)
        
    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))
    
    quick_write_list_to_text(result, write_filename)
Example No. 8
def kmeans_evaluate(read_filename1, read_filename2, write_directory):
    
    # list of strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename1)
    
    cluster_tag = []
    get_text_to_single_list(cluster_tag, read_filename2)
    
    real_tag = real_tag[0 : len(cluster_tag)]
    
    # list index + 1 is the cluster number; each entry lists the corresponding ground-truth label IDs, i.e. cluster 1 maps to label 5, ...
    reflect_tag = [['5'], ['7'], ['2'], ['3', '6', '8'], ['7'], ['2'], ['4'], ['1']]
    
    cluster_partion = []
    for i in range(len(reflect_tag)):
        cluster_partion.append([])
    
    for i in range(len(cluster_tag)):
        cluster_partion[int(cluster_tag[i]) - 1].append(str(i))
    
    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(reflect_tag)):
        real_cluster_partion = []
        for j in range(len(real_tag)):
            if real_tag[j] in reflect_tag[i]:
                real_cluster_partion.append(str(j))
        
        correct = len(set(cluster_partion[i]) & set(real_cluster_partion))
        this_precision = np.true_divide(correct, len(set(cluster_partion[i])))
        this_recall = np.true_divide(correct, len(set(real_cluster_partion)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall, (this_precision + this_recall))
        
        print this_precision, this_recall, this_fmeasure
        
        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))
    
    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure
    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
def count_word_tf(read_directory1, read_directory2, write_directory):
    '''
    Count the term frequency of every word in each data slice.
    :param read_directory1: directory of text files
    :param read_directory2: directory of vocabulary (all-words) files
    :param write_directory: output directory
    '''
    
    # total number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        # word segmentation result of each text
        each_text_segment = []
        # all words in this data slice
        all_text_word = []
        
        get_text_to_complex_list(each_text_segment, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_text_word, read_directory2 + '/'+ str(i + 1) + '.txt')
        
        tf_dict = {}  # term-frequency (TF) dictionary
        for key in all_text_word:
            tf_dict[key] = 0
            
        for row in range(len(each_text_segment)):
            for j in range(len(each_text_segment[row])):
                try:
                    tf_dict[each_text_segment[row][j]] += 1
                except KeyError:
                    tf_dict[each_text_segment[row][j]] = 0
        
        # term-frequency list
        value_list = []
        for key in all_text_word:
            value_list.append(tf_dict[key])
        
        # sort by term frequency in descending order
        va = zip(all_text_word, value_list)
        va = sorted(va, key = itemgetter(1), reverse = True)    
        
        result_all = ['-Word- -TF-']
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))
        
        # write to file
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
Example No. 10
def batch_count_tf(read_directory1, read_directory2, write_directory):
    '''
    Count the term frequency of every word in each batch.
    :param read_directory1:
    :param read_directory2:
    :param write_directory:
    '''

    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        each_weibo_fenci = [] 
        all_weibo_fenci = []
        
        get_text_to_complex_list(each_weibo_fenci, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        get_text_to_single_list(all_weibo_fenci, read_directory2 + '/' + str(i + 1) + '.txt')
        
        tf_dict = {}  # term-frequency (TF) dictionary
        for key in all_weibo_fenci:
            tf_dict[key] = 0
            
        for row in range(len(each_weibo_fenci)):
            for j in range(len(each_weibo_fenci[row])):
                try:
                    tf_dict[each_weibo_fenci[row][j]] += 1
                except KeyError:
                    tf_dict[each_weibo_fenci[row][j]] = 0
        
        # term-frequency list
        value_list = []
        for key in all_weibo_fenci:
            value_list.append(tf_dict[key])
        
        # sort by term frequency in descending order
        va = zip(all_weibo_fenci, value_list)
        va = sorted(va, key = itemgetter(1), reverse = True)    
        
        result_all = []
        for each in va:
            result_all.append(each[0] + " " + str(each[1]))
        
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example No. 11
def classification_evaluate(read_filename1, read_filename2, write_directory):

    # 2-D list of strings
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)

    # list of strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)

    # must be filled in manually
    class_tag = ['5', '6', '3', '8', '2', '4', '1', '7']

    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(class_tag)):
        real_classification = []
        for j in range(len(real_tag)):
            if real_tag[j] == class_tag[i]:
                real_classification.append(str(j))

        correct = len(set(classification_result[i]) & set(real_classification))
        this_precision = np.true_divide(correct,
                                        len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_classification)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall,
                                       (this_precision + this_recall))

        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))

    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure
    quick_write_list_to_text(precision_list,
                             write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
Example No. 12
def classification_evaluate(read_filename1, read_filename2, write_directory):
    
    # 2-D list of strings
    classification_result = []
    get_text_to_complex_list(classification_result, read_filename1, 0)
    
    # list of strings
    real_tag = []
    get_text_to_single_list(real_tag, read_filename2)
    
    # must be filled in manually
    class_tag = ['2', '3', '6', '1', '5', '7', '4']
    class_tag2 = ['2', '3', '8', '1', '5', '7', '4']
    
    precision_list = []
    recall_list = []
    fmeasure_list = []
    for i in range(len(class_tag)):
        real_classification = []
        for j in range(len(real_tag)):
            # treat labels 6 and 8 as one class during retrieval
            if real_tag[j] == class_tag[i] or real_tag[j] == class_tag2[i]:
                real_classification.append(str(j))
        
        correct = len(set(classification_result[i]) & set(real_classification))
        this_precision = np.true_divide(correct, len(set(classification_result[i])))
        this_recall = np.true_divide(correct, len(set(real_classification)))
        this_fmeasure = np.true_divide(2.0 * this_precision * this_recall, (this_precision + this_recall))
        
        print this_precision, this_recall, this_fmeasure

        precision_list.append(str(this_precision))
        recall_list.append(str(this_recall))
        fmeasure_list.append(str(this_fmeasure))
    
    average_precision = np.average([float(x) for x in precision_list])
    average_recall = np.average([float(x) for x in recall_list])
    average_fmeasure = np.average([float(x) for x in fmeasure_list])
    print 'Average:', average_precision, average_recall, average_fmeasure
    quick_write_list_to_text(precision_list, write_directory + u'/precision.txt')
    quick_write_list_to_text(recall_list, write_directory + u'/recall.txt')
    quick_write_list_to_text(fmeasure_list, write_directory + u'/fmeasure.txt')
Example No. 13
def spct_prf(read_filename1, read_filename2, write_filename):
    
    cluster_tag = []
    real_tag = []
    
    get_text_to_single_list(cluster_tag, read_filename1)
    get_text_to_single_list(real_tag, read_filename2)

    cluster_tag = [int(x) for x in cluster_tag]
    real_tag = [int(x) for x in real_tag]
    
    reflect = [20, 21, 20]
    
    p, r, f = prf(cluster_tag, real_tag, reflect)
    print p
    print r
    print f
    

    quick_write_list_to_text([str(p), str(r), str(f)], write_filename)
Example No. 14
def count_word_tf(read_filename1, read_filename2, write_filename):
    '''
    Count the term frequency of every word in the data.
    :param read_filename1:
    :param read_filename2:
    :param write_filename:
    '''
    
    each_weibo_fenci = [] 
    all_weibo_fenci = []
        
    get_text_to_complex_list(each_weibo_fenci, read_filename1, 0)
    get_text_to_single_list(all_weibo_fenci, read_filename2)
        
    tf_dict = {}  # term-frequency (TF) dictionary
    for key in all_weibo_fenci:
        tf_dict[key] = 0
            
    for row in range(len(each_weibo_fenci)):
        for j in range(len(each_weibo_fenci[row])):
            try:
                tf_dict[each_weibo_fenci[row][j]] += 1
            except KeyError:
                tf_dict[each_weibo_fenci[row][j]] = 0
        
    # term-frequency list
    value_list = []
    for key in all_weibo_fenci:
        value_list.append(tf_dict[key])
        
    # sort by term frequency in descending order
    va = zip(all_weibo_fenci, value_list)
    va = sorted(va, key = itemgetter(1), reverse = True)    
        
    result_all = []
    for each in va:
        result_all.append(each[0] + " " + str(each[1]))
       
    quick_write_list_to_text(result_all, write_filename)
Example No. 15
def generate_high_quality_data(read_directory1, read_directory2, read_directory3, write_directory):
    '''
    Linear fusion
    :param read_directory1:
    :param read_directory2:
    :param read_directory3:
    :param write_directory:
    '''
    K = 3000

    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    for i in range(file_number):
        em_weights = []
    
        coefficients_string = [] 
        get_text_to_single_list(coefficients_string, read_directory1 + '/' + str(i + 1) + '.txt')   
        coefficients = [float(x) for x in coefficients_string]
        
        f = open(read_directory2 + '/' + str(i + 1) + '.txt', 'r')
        line = f.readline()
        while line:
            each_line = line.split()
            this_em = 0.0
            for j in range(len(coefficients)):
                this_em += float(each_line[j]) * coefficients[j]
            
            em_weights.append(this_em)
            
            line = f.readline()
        f.close()
        
        this_weibo = []
        time_series = []
        this_text = []
        #get_text_to_single_list(this_weibo, read_directory3 + '/' + str(i + 1) + '.txt')
        
        f = open(read_directory3 + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            this_time = time.mktime(time.strptime(line.strip().split('\t')[2], '%Y/%m/%d %H:%M'))
            time_series.append(this_time)
            this_weibo.append(line.strip())
            try:
                this_text.append(line.strip().split('\t')[6])
            except:
                this_text.append(" ")
            
            line = f.readline()
        f.close()
        
        # sort by EM weight (descending)
        ttte = zip(this_weibo, time_series, this_text, em_weights)
        ttte1 = sorted(ttte, key = itemgetter(3), reverse = True)
        
        this_weibo = []
        time_series = []
        this_text = []
        em_weights = []
        
        line_count = 0
        for each in ttte1:
            if each[2] not in this_text and len(each[2]) >= 150:
                this_weibo.append(each[0]+'\t'+str(each[3]))
                time_series.append(each[1])
                this_text.append(each[2])
                line_count += 1
                
                if line_count >= K:
                    break
        
        # then sort by time in ascending order
        twts = zip(this_weibo, time_series)
        twts1 = sorted(twts, key = itemgetter(1))
        
        this_weibo = []
        time_series = []
        this_text = []
        
        for each in twts1:
            this_weibo.append(each[0])

        quick_write_list_to_text(this_weibo, write_directory + '/' + str(i + 1) + '.txt')
def merge_batch(read_directory1, read_directory2, read_directory3,
                read_directory4, read_filename, write_directory1,
                write_directory2):

    all_batch_index = []
    f = open(read_filename)
    line = f.readline()
    while line:
        all_batch_index.append(line.split())
        line = f.readline()

    f.close()

    for i in range(len(all_batch_index)):
        this_word_list = []
        f1 = open(read_directory2 + '/' + str(i + 1) + '.txt', 'rb')
        line = f1.readline()
        while line:
            this_word_list.append(line.strip())
            line = f1.readline()

        f1.close()

        result = []
        result_id_time = []

        for j in range(len(all_batch_index[i])):

            word_list = []
            f2 = open(read_directory3 + '/' + all_batch_index[i][j] + '.txt',
                      'rb')
            line = f2.readline()
            while line:
                word_list.append(line.split()[0])
                line = f2.readline()

            f2.close()

            vsm_nparray = get_text_to_nparray(
                read_directory1 + '/' + all_batch_index[i][j] + '.txt', 'int')

            id_time = []
            get_text_to_single_list(
                id_time,
                read_directory4 + '/' + all_batch_index[i][j] + '.txt')

            for each2 in id_time:
                result_id_time.append(each2)

            for each in vsm_nparray:
                tf_dict = {}
                for k in range(len(each)):
                    if each[k] > 0.0001:
                        tf_dict[word_list[k]] = each[k]

                tf_dict2 = {}
                for each1 in this_word_list:
                    if each1 in tf_dict.keys():
                        tf_dict2[each1] = tf_dict[each1]
                    else:
                        tf_dict2[each1] = 0

                this_line = []
                for key in this_word_list:
                    this_line.append(str(tf_dict2[key]))

                # join each row into a single string for easy writing
                result.append(" ".join(this_line))

        quick_write_list_to_text(result,
                                 write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(result_id_time,
                                 write_directory2 + '/' + str(i + 1) + '.txt')
Example No. 17
def topic_life(read_directory1, read_directory2, read_directory3, write_directory1):
    
    gamma = 0.65
    delta = 0.80
    
    #file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    q = 4
    start_batch = 46
    interval = 7
    end_batch = start_batch + interval
    
    all_topic_batch, new_word_list, all_count = merge_all_center(read_directory1, read_directory2, start_batch, end_batch)
    
    evolution_matrix = np.zeros((all_count, all_count), int)
    
    previous_topics = []
    previous_num = []
    previous_intensity = []
    
    start_index = 0
    end_index = 0
    
    for i in range(len(all_topic_batch)):
        this_topic_intensity = []
        get_text_to_single_list(this_topic_intensity, read_directory3 + '/' + str(start_batch + i) + '.txt')
        this_topic_intensity = [int(x) for x in this_topic_intensity]
        print this_topic_intensity
        
        if i == 0:
            for j in range(len(all_topic_batch[i])):
                evolution_matrix[j, j] = 1
                previous_topics.append(all_topic_batch[i][j])
                previous_intensity.append(this_topic_intensity[j])
            
            start_index = 0
            end_index += len(all_topic_batch[i])
            
            previous_num.append(len(all_topic_batch[i]))

        else:
            kl_matrix = np.zeros((len(all_topic_batch[i]), len(previous_topics)))
            
            for j in range(len(all_topic_batch[i])):
                for k in range(len(previous_topics)):
                    kl_matrix[j, k] = 1.0 / (SKLD(all_topic_batch[i][j], previous_topics[k]) + 1.0)
            
            # detect topic emergence
            for j in range(len(kl_matrix)):
                #if np.max(kl_matrix[j]) < gamma:
                evolution_matrix[end_index + j, end_index + j] = 1
            
            # detect topic disappearance
            for j in range(len(kl_matrix[0])):
                if np.max(kl_matrix[:, j]) < gamma:
                    evolution_matrix[start_index + j, start_index + j] = -1
            
            # detect topic continuation
            for j in range(len(kl_matrix)):
                for k in range(len(kl_matrix[j])):
                    if kl_matrix[j][k] >= delta:
                        evolution_matrix[start_index + k, end_index + j] = 2
                        evolution_matrix[end_index + j, start_index + k] = 2
            
            # detect topic merging
            for j in range(len(kl_matrix)):
                latent_merge_index = []
                si_value = []
                for k in range(len(kl_matrix[j])):
                    if kl_matrix[j][k] >= gamma and kl_matrix[j][k] < delta:
                        latent_merge_index.append(k)
                        si_value.append(kl_matrix[j][k])
                
                
                
                if len(latent_merge_index) >= 2:
                    sl = zip(latent_merge_index, si_value)
                    sl = sorted(sl, key = itemgetter(1), reverse=True)
                    latent_merge_index = []
                
                    m_count = 0
                    for each in sl:
                        latent_merge_index.append(each[0])
                        m_count += 1
                    
                        if m_count >= 3:
                            break
                    
                    Z = np.zeros(len(all_topic_batch[i][0]))
                    all_intensity = 0
                    for each in latent_merge_index:
                        Z += previous_topics[each] * previous_intensity[each]
                        all_intensity += previous_intensity[each]
                    
                    Z = Z / all_intensity
                    related = 1.0 / (SKLD(all_topic_batch[i][j], Z) + 1.0)
                    
                    if related > delta:
                        for each in latent_merge_index:
                            evolution_matrix[start_index + each, end_index + j] = 3
                            evolution_matrix[end_index + j, start_index + each] = 3
            # detect topic splitting
            if len(kl_matrix) > 1: 
                for j in range(len(kl_matrix[0])):
                    latent_split_index = []
                    for k in range(len(kl_matrix)):
                        if kl_matrix[k][j] >= gamma and kl_matrix[k][j] < delta:
                            latent_split_index.append(k)
                
                    if len(latent_split_index) >= 2:
                        Z = np.zeros(len(all_topic_batch[i][0]))
                        all_intensity = 0
                        for each in latent_split_index:
                            Z += all_topic_batch[i][each] * this_topic_intensity[each]
                            all_intensity += this_topic_intensity[each]
                    
                        Z = Z / all_intensity
                        related = 1.0 / (SKLD(previous_topics[j], Z) + 1.0)
                    
                        if related > delta:
                            for each in latent_split_index:
                                evolution_matrix[start_index + j, end_index + each] = 4
                                evolution_matrix[end_index + each, start_index + j] = 4     
            
            for j in range(len(all_topic_batch[i])):
                previous_topics.append(all_topic_batch[i][j])
                previous_intensity.append(this_topic_intensity[j])
            
            previous_num.append(len(all_topic_batch[i]))
            
            if len(previous_num) > q:
                start_index += previous_num[0]
                for l in range(previous_num[0]):
                    previous_topics.remove(previous_topics[0])
                    previous_intensity.remove(previous_intensity[0])
                
                previous_num.remove(previous_num[0])
                
            
            end_index += len(all_topic_batch[i])
        
        write_matrix_to_text(evolution_matrix, write_directory1 + '/' + str(i + 1) + '.txt')        
        print "Evolution %d Completed." % (i + 1)
Example No. 18
def compute_similarity(pattern_list, read_filename, word_weight_dict):

    search_texts = []
    get_text_to_single_list(search_texts, read_filename)
    
    query_result_list = []
    for i in range(len(pattern_list)):
        query_result_list.append(query(pattern_list[i], search_texts, word_weight_dict))
    
    similarity_matrix = np.zeros([len(pattern_list), len(pattern_list)])
    tag = []
    for i in range(len(pattern_list)):
        tag.append(0)
        for j in range(i, len(pattern_list)):
            '''
            For each frequent itemset, collect the set of texts matched by its query;
            the similarity between two frequent itemsets is then the Jaccard similarity
            of their matched text sets. See TextQuery.py.
            '''
            numerator = len(set(query_result_list[i]) & set(query_result_list[j]))
            denominator = len(set(query_result_list[i]) | set(query_result_list[j]))
            
            similarity_matrix[i, j] = np.true_divide(numerator, denominator)
            similarity_matrix[j, i] = similarity_matrix[i, j]
    
    '''
    Partition step used to determine the number of cluster centers.
    '''       
    class_partion = []        
    for i in range(len(pattern_list)):
        if tag[i] == 0:
            temp_class_partion = []
            for j in range(i, len(pattern_list)):
                if similarity_matrix[i, j] > 0.2:
                    temp_class_partion.append(j)
                    tag[j] = 1
            class_partion.append(temp_class_partion)
    
    partion_length = []
    for each in class_partion:
        partion_length.append(len(each))
    
    # sort by partition length in descending order
    cl = zip(class_partion, partion_length)
    cl = sorted(cl, key = itemgetter(1), reverse = True)
    
    class_partion = []
    partion_length = []
    
    for each in cl:
        class_partion.append(each[0])
        partion_length.append(each[1])
    
    length_sum = np.sum(partion_length)
    temp_sum = 0
    cluster_number = 0
    for i in range(len(partion_length)):
        temp_sum += partion_length[i]
        cluster_number += 1
        
        # take 75% of all frequent itemsets as a hard cutoff; the number of partitions up to the cutoff is the number of clusters
        if np.true_divide(temp_sum, length_sum) > 0.75:
            break

    class_partion_to_string = []
    for i in range(cluster_number):
        class_partion_to_string.append(" ".join([str(x) for x in class_partion[i]]))
        
    print cluster_number
    
    query_result_list_string = []
    for each in query_result_list:
        query_result_list_string.append(" ".join([str(x) for x in each]))       
    
    # if possible  
    #quick_write_list_to_text(class_partion_to_string, 'D:/partion2.txt')
    
    return similarity_matrix, cluster_number
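The similarity used in compute_similarity is the Jaccard coefficient |A ∩ B| / |A ∪ B| between the text sets matched by two frequent-itemset queries. A small illustration with hypothetical result sets:

import numpy as np

# Hypothetical query results: indices of texts matched by two frequent itemsets.
result_i = set([1, 2, 3, 4])
result_j = set([3, 4, 5])

numerator = len(result_i & result_j)                  # 2
denominator = len(result_i | result_j)                # 5
similarity = np.true_divide(numerator, denominator)   # 0.4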
def merge_batch(read_directory1, read_directory2, read_directory3, read_directory4, read_filename, write_directory1, write_directory2):
    
    all_batch_index = []
    f = open(read_filename)
    line = f.readline()
    while line:
        all_batch_index.append(line.split())
        line = f.readline()
        
    f.close()
    
    for i in range(len(all_batch_index)):
        this_word_list = []
        f1 = open(read_directory2 + '/' + str(i + 1) + '.txt', 'rb')
        line = f1.readline()
        while line:
            this_word_list.append(line.strip())
            line = f1.readline()
        
        f1.close()
        
        result = []
        result_id_time = []
        
        for j in range(len(all_batch_index[i])):
            
            word_list = []
            f2 = open(read_directory3 + '/' + all_batch_index[i][j] + '.txt', 'rb')
            line = f2.readline()
            while line:
                word_list.append(line.split()[0])
                line = f2.readline()
        
            f2.close()
            
            
            vsm_nparray = get_text_to_nparray(read_directory1 + '/' + all_batch_index[i][j] + '.txt', 'int')
            
            id_time = []
            get_text_to_single_list(id_time, read_directory4 + '/' + all_batch_index[i][j] + '.txt')
            
            for each2 in id_time:
                result_id_time.append(each2)
            
            for each in vsm_nparray:
                tf_dict = {}
                for k in range(len(each)):
                    if each[k] > 0.0001:
                        tf_dict[word_list[k]] = each[k]
                
                tf_dict2 = {}
                for each1 in this_word_list:
                    if each1 in tf_dict.keys():
                        tf_dict2[each1] = tf_dict[each1]
                    else:
                        tf_dict2[each1] = 0
            
                this_line = []
                for key in this_word_list:
                    this_line.append(str(tf_dict2[key]))
            
                # join each row into a single string for easy writing
                result.append(" ".join(this_line))
        
        quick_write_list_to_text(result, write_directory1 + '/' + str(i + 1) + '.txt')
        quick_write_list_to_text(result_id_time, write_directory2 + '/' + str(i + 1) + '.txt')
def select_top_N_words(read_directory1, read_directory2, write_directory):
    '''
    Select the top N words.
    :param read_directory1: directory of per-word TF files
    :param read_directory2: directory of keyword files
    :param write_directory: output directory
    '''
    # number of words to select
    N = 2000
    
    # number of files in the directory
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    # weight dictionary, assigned by part-of-speech tag
    score_dict = {"CC":0.0, "CD":0.0, "DT":0.2, "EX":0.0, "FW":0.3, "IN":0.0, "JJ":0.7, \
                  "JJR":0.75, "JJS":0.75, "LS":0.0, "MD":0.5, "NN":0.9, "NNS":0.9, "NNP":1.0, \
                  "NNPS":1.0, "PDT":0.0, "POS":0.0, "PRP":0.1, "PRP$":0.1, \
                  "RB":0.3, "RBR":0.35, "RBS":0.4, "RP":0.5, "SYM":0.0, "TO":0.0, "UH":0.0, \
                  "VB":0.7, "VBD":0.7, "VBG":0.7, "VBN":0.75, "VBP":0.7, "VBZ":0.7, \
                  "WDT":0.0, "WP":0.3, "WP$":0.3, "WRB":0.0, ":":0.0}
    
    for i in range(file_number):
        each_word_tf = [] 
        key_words = []
        
        select_word = []
        word_score = []
        
        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        each_word_tf = each_word_tf[1:]  # 2-D list; each inner list has 2 elements (header row dropped)
        
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')
        
        for j in range(len(each_word_tf)):
            #word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split(',')[1]
            if each_word_tf[j][0] in key_words:
                select_word.append(each_word_tf[j][0])
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))  
            else:
                select_word.append(each_word_tf[j][0])
                try:
                    word_score.append(float(each_word_tf[j][1]) * score_dict[word_tag] * 0.80)
                except KeyError:
                    word_score.append(float(0.0))
        
        # sort by weight in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key = itemgetter(1), reverse = True)    
        
        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
Example No. 21
def pre_text_classify(read_filename1, read_filename2, read_filename3, write_filename):
    
    # display 5 words
    # select 3 of them when querying
    select_number = 5
    
    # cluster labels from frequent-itemset clustering; strings, starting at 1
    class_tag = []
    get_text_to_single_list(class_tag, read_filename1)

    # number of clusters
    cluster_number = len(set(class_tag))
    
    # frequent itemsets, a 2-D list of strings
    pattern_all = []
    get_text_to_complex_list(pattern_all, read_filename2, 0)
    pattern_all = pattern_all[0: len(class_tag)]
    
    # partition of frequent itemsets according to the clustering result, 2-D list of ints
    class_partion = []
    for i in range(cluster_number):
        class_partion.append([])
        
    for i in range(len(class_tag)):
        for j in range(cluster_number):
            if class_tag[i] == str(j + 1):
                class_partion[j].append(i)
    
    # load the global word weights
    word_weight_dict = {}
    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()
    
    # collect all distinct words appearing in the frequent itemsets
    all_word_list = []
    for each in pattern_all:
        for word in set(each).difference(all_word_list):
            all_word_list.append(word)
    
    # number of frequent itemsets containing each word (over all words)
    I_dict = {}
    for each in all_word_list:
        I_dict[each] = 0
        for each1 in pattern_all:
            if each in each1:
                I_dict[each] += 1
    
    # number of clusters containing each word (over all words)
    C_dict = {}
    for each in all_word_list:
        C_dict[each] = 0
        for i in range(len(class_partion)):
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    C_dict[each] += 1
                    break
    
    cluster_word_list = []   
    for i in range(len(class_partion)):
        # collect all distinct words in this cluster
        this_word_list = []
        for j in range(len(class_partion[i])):
            for each in pattern_all[class_partion[i][j]]:
                if each not in this_word_list:
                    this_word_list.append(each)
        
        # compute each word's support within the cluster
        sup_dict = {}
        
        for each in this_word_list:
            sup_dict[each] = 0
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    sup_dict[each] += 1
        
        word_score_list = []
        # compute each word's weight within the cluster, used as the basis for query classification
        for each in this_word_list:
            global_weight = np.true_divide(len(pattern_all) * cluster_number, (I_dict[each] * C_dict[each])) 
            word_score = word_weight_dict[each] * sup_dict[each] * np.log(global_weight + 1.0)
            word_score_list.append(word_score)
        
        # sort by weight in descending order
        tw = zip(this_word_list, word_score_list)
        tw = sorted(tw, key = itemgetter(1), reverse = True)
        
        this_word_list = []
        word_score_list = []
        
        count = 0
        for each in tw:
            this_word_list.append(each[0])
            count += 1
            if count >= select_number:
                break
        
        cluster_word_list.append(" ".join(this_word_list))
    
    quick_write_list_to_text(cluster_word_list, write_filename)
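The word score computed in pre_text_classify combines the global word weight, the word's support inside the cluster, and a log-scaled IDF/ICF-style factor that penalizes words occurring in many itemsets (I_dict) and many clusters (C_dict). A worked example with hypothetical counts:

import numpy as np

# Hypothetical counts for a single word.
pattern_count = 40     # len(pattern_all): total number of frequent itemsets
cluster_number = 8     # number of clusters
I_word = 5             # itemsets containing the word
C_word = 2             # clusters containing the word
word_weight = 0.9      # global weight from word_weight_dict
support = 6            # itemsets of the current cluster containing the word

global_weight = np.true_divide(pattern_count * cluster_number, I_word * C_word)  # 32.0
word_score = word_weight * support * np.log(global_weight + 1.0)                 # ~18.9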
Example No. 22
def pre_text_classify(read_filename1, read_filename2, read_filename3, write_filename):

    # display 5 words
    # select 3 of them when querying
    select_number = 5

    # cluster labels from frequent-itemset clustering; strings, starting at 1
    class_tag = []
    get_text_to_single_list(class_tag, read_filename1)

    # number of clusters
    cluster_number = len(set(class_tag))

    # frequent itemsets, a 2-D list of strings
    pattern_all = []
    get_text_to_complex_list(pattern_all, read_filename2, 0)
    pattern_all = pattern_all[0 : len(class_tag)]

    # partition of frequent itemsets according to the clustering result, 2-D list of ints
    class_partion = []
    for i in range(cluster_number):
        class_partion.append([])

    for i in range(len(class_tag)):
        for j in range(cluster_number):
            if class_tag[i] == str(j + 1):
                class_partion[j].append(i)

    # load the global word weights
    word_weight_dict = {}
    f = open(read_filename3, "r")
    line = f.readline()
    while line:
        word_weight_dict[line.split()[0]] = float(line.split()[1])
        line = f.readline()
    f.close()

    # collect all distinct words appearing in the frequent itemsets
    all_word_list = []
    for each in pattern_all:
        for word in set(each).difference(all_word_list):
            all_word_list.append(word)

    # number of frequent itemsets containing each word (over all words)
    I_dict = {}
    for each in all_word_list:
        I_dict[each] = 0
        for each1 in pattern_all:
            if each in each1:
                I_dict[each] += 1

    # number of clusters containing each word (over all words)
    C_dict = {}
    for each in all_word_list:
        C_dict[each] = 0
        for i in range(len(class_partion)):
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    C_dict[each] += 1
                    break

    cluster_word_list = []
    for i in range(len(class_partion)):
        # collect all distinct words in this cluster
        this_word_list = []
        for j in range(len(class_partion[i])):
            for each in pattern_all[class_partion[i][j]]:
                if each not in this_word_list:
                    this_word_list.append(each)

        # compute each word's support within the cluster
        sup_dict = {}

        for each in this_word_list:
            sup_dict[each] = 0
            for j in range(len(class_partion[i])):
                if each in pattern_all[class_partion[i][j]]:
                    sup_dict[each] += 1

        word_score_list = []
        # compute each word's weight within the cluster, used as the basis for query classification
        for each in this_word_list:
            global_weight = np.true_divide(len(pattern_all) * cluster_number, (I_dict[each] * C_dict[each]))
            word_score = word_weight_dict[each] * sup_dict[each] * np.log(global_weight + 1.0)
            word_score_list.append(word_score)

        # sort by weight in descending order
        tw = zip(this_word_list, word_score_list)
        tw = sorted(tw, key=itemgetter(1), reverse=True)

        this_word_list = []
        word_score_list = []

        count = 0
        for each in tw:
            this_word_list.append(each[0])
            count += 1
            if count >= select_number:
                break

        cluster_word_list.append(" ".join(this_word_list))

    quick_write_list_to_text(cluster_word_list, write_filename)
Example No. 23
def select_top_N_words(read_directory1, read_directory2, read_filename, write_directory):
    '''
    Select the top N words as high-quality feature words.
    :param read_directory1:
    :param read_directory2:
    :param read_filename:
    :param write_directory:
    '''
    N = 500
    
    # assign weights by part-of-speech tag
    score_dict = {"nr":1.0, "nr1":0.5, "nr2":0.75, "nrt":1.0, "nrf":1.0, "ns":1.0, "nsf":1.0, "nt":1.0, \
                   "nz":1.0, "nl":0.5, "ng":0.5, "n":0.9, "t":0.5, "tg":0.5, "s":0.3, "f":0.3, "j":0.5, \
                   "v":0.7, "vd":0.6, "vn":0.9, "vshi":0.0, "vyou":0.0, "vf":0.3, "vx":0.3, "vi":0.7, \
                   "vl":0.3, "vg":0.5, "a":0.6, "ad":0.3, "an":0.9, "ag":0.5, "al":0.3, "b":0.3, "bl":0.2, \
                    "z":0.9, "zg":0.3, "r":0.3, "rr":0.3, "rz":0.3, "rzt":0.3, "rzs":0.3, "rzv":0.3, "ry":0.2, \
                    "ryt":0.2, "rys":0.2, "ryv":0.2, "rg":0.2, "m":0.2, "mq":0.5, "q":0.6, "qv":0.7, "qt":0.7, \
                    "d":0.4, "p":0.0, "pba":0.0, "pbei":0.0, "c":0.0, "cc":0.0, "u":0.0, "ug":0.0, "e":0.0, \
                    "y":0.0, "o":0.0, "h":0.0, "k":0.0, "x":0.1, "xx":0.0, "xu":0.9, "w":0.0, "l":0.6, "i":0.6, \
                    "g":0.0, "vq":0.0, "nrfg":0.75, "dg":0.0, "mg":0.2, "yg":0.0, "eng":0.1}
    
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    for i in range(file_number):
        each_word_tf = [] 
        key_words = []
        
        select_word = []
        word_score = []
    
        user_dict = []
        
        get_text_to_complex_list(each_word_tf, read_directory1 + '/' + str(i + 1) + '.txt', 0)
        
        get_text_to_single_list(key_words, read_directory2 + '/' + str(i + 1) + '.txt')
    
        f = open(read_filename, 'r')
        line = f.readline()
        while line:
            user_dict.append(line.split()[0])
            line = f.readline()
        f.close()
        
        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in user_dict:
                # words from the user dictionary get a high weight
                select_word.append(word_entity)
                word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
            elif word_entity in key_words and word_tag != 'eng':
                # keywords also get a high weight
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))
                    
            else:
                # all remaining words are down-weighted (factor 0.50 here)
                select_word.append(word_entity)
                try:
                    word_score.append(np.log(float(each_word_tf[j][1])) * score_dict[word_tag] * 0.50)
                except KeyError:
                    word_score.append(float(0.0))
        
        # sort by weight in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key = itemgetter(1), reverse = True)    
        
        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break
        
        quick_write_list_to_text(result_all, write_directory + '/' + str(i + 1) + '.txt')
        
        print "Segment %d Completed." % (i + 1)
Example No. 24
def select_top_N_words(read_directory1, read_directory2, read_filename3,
                       write_directory):
    N = 500
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    score_dict = {"nr":1.0, "nr1":0.5, "nr2":0.75, "nrt":1.0, "nrf":1.0, "ns":1.0, "nsf":1.0, "nt":1.0, \
                   "nz":1.0, "nl":0.5, "ng":0.5, "n":0.9, "t":0.5, "tg":0.5, "s":0.3, "f":0.3, "j":0.5, \
                   "v":0.7, "vd":0.6, "vn":0.9, "vshi":0.0, "vyou":0.0, "vf":0.3, "vx":0.3, "vi":0.7, \
                   "vl":0.3, "vg":0.5, "a":0.6, "ad":0.3, "an":0.9, "ag":0.5, "al":0.3, "b":0.3, "bl":0.2, \
                    "z":0.9, "zg":0.3, "r":0.3, "rr":0.3, "rz":0.3, "rzt":0.3, "rzs":0.3, "rzv":0.3, "ry":0.2, \
                    "ryt":0.2, "rys":0.2, "ryv":0.2, "rg":0.2, "m":0.6, "mq":0.5, "q":0.6, "qv":0.7, "qt":0.7, \
                    "d":0.4, "p":0.0, "pba":0.0, "pbei":0.0, "c":0.0, "cc":0.0, "u":0.0, "ug":0.0, "e":0.0, \
                    "y":0.0, "o":0.0, "h":0.0, "k":0.0, "x":0.0, "xx":0.0, "xu":0.9, "w":0.0, "l":0.6, "i":0.6, \
                    "g":0.0, "vq":0.0, "nrfg":0.75, "dg":0.0, "mg":0.2, "yg":0.0}
    user_dict = []

    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        user_dict.append(line.split()[0])
        line = f.readline()
    f.close()

    for i in range(file_number):
        each_word_tf = []
        key_words = []

        select_word = []
        word_score = []

        get_text_to_complex_list(each_word_tf,
                                 read_directory1 + '/' + str(i + 1) + '.txt',
                                 0)

        get_text_to_single_list(key_words,
                                read_directory2 + '/' + str(i + 1) + '.txt')

        for j in range(len(each_word_tf)):
            word_entity = each_word_tf[j][0].split('/')[0]
            word_tag = each_word_tf[j][0].split('/')[1]
            if word_entity in user_dict:
                select_word.append(word_entity)
                word_score.append(
                    np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
            elif word_entity in key_words:
                select_word.append(word_entity)
                try:
                    word_score.append(
                        np.log(float(each_word_tf[j][1])) *
                        score_dict[word_tag] * 1.0)
                except KeyError:
                    word_score.append(float(0.0))
            else:
                select_word.append(word_entity)
                try:
                    word_score.append(
                        np.log(float(each_word_tf[j][1])) *
                        score_dict[word_tag] * 0.60)
                except KeyError:
                    word_score.append(float(0.0))

        # sort by weight in descending order
        sw = zip(select_word, word_score)
        sw = sorted(sw, key=itemgetter(1), reverse=True)

        result_all = []
        count_number = 1
        for each in sw:
            result_all.append(each[0] + " " + str(each[1]))
            count_number += 1
            if count_number > N:
                break

        quick_write_list_to_text(result_all,
                                 write_directory + '/' + str(i + 1) + '.txt')
Example No. 25
def compute_similarity(pattern_list, read_filename, word_weight_dict):

    search_texts = []
    get_text_to_single_list(search_texts, read_filename)

    query_result_list = []
    for i in range(len(pattern_list)):
        query_result_list.append(
            query(pattern_list[i], search_texts, word_weight_dict))

    similarity_matrix = np.zeros([len(pattern_list), len(pattern_list)])
    tag = []
    for i in range(len(pattern_list)):
        tag.append(0)
        for j in range(i, len(pattern_list)):
            '''
            For each frequent itemset, collect the set of texts matched by its query;
            the similarity between two frequent itemsets is then the Jaccard similarity
            of their matched text sets. See TextQuery.py.
            '''
            numerator = len(
                set(query_result_list[i]) & set(query_result_list[j]))
            denominator = len(
                set(query_result_list[i]) | set(query_result_list[j]))

            similarity_matrix[i, j] = np.true_divide(numerator, denominator)
            similarity_matrix[j, i] = similarity_matrix[i, j]
    '''
    Partition step used to determine the number of cluster centers.
    '''
    class_partion = []
    for i in range(len(pattern_list)):
        if tag[i] == 0:
            temp_class_partion = []
            for j in range(i, len(pattern_list)):
                if similarity_matrix[i, j] > 0.2:
                    temp_class_partion.append(j)
                    tag[j] = 1
            class_partion.append(temp_class_partion)

    partion_length = []
    for each in class_partion:
        partion_length.append(len(each))

    # sort by partition length in descending order
    cl = zip(class_partion, partion_length)
    cl = sorted(cl, key=itemgetter(1), reverse=True)

    class_partion = []
    partion_length = []

    for each in cl:
        class_partion.append(each[0])
        partion_length.append(each[1])

    length_sum = np.sum(partion_length)
    temp_sum = 0
    cluster_number = 0
    for i in range(len(partion_length)):
        temp_sum += partion_length[i]
        cluster_number += 1

        # take 75% of all frequent itemsets as a hard cutoff; the number of partitions up to the cutoff is the number of clusters
        if np.true_divide(temp_sum, length_sum) > 0.75:
            break

    class_partion_to_string = []
    for i in range(cluster_number):
        class_partion_to_string.append(" ".join(
            [str(x) for x in class_partion[i]]))

    print cluster_number

    query_result_list_string = []
    for each in query_result_list:
        query_result_list_string.append(" ".join([str(x) for x in each]))

    # if possible
    #quick_write_list_to_text(class_partion_to_string, 'D:/partion2.txt')

    return similarity_matrix, cluster_number
Example No. 26
def select_top_N_words(read_filename1, read_filename2, read_filename3,
                       write_filename):
    '''
    Select the top N words as high-quality feature words.
    :param read_filename1:
    :param read_filename2:
    :param read_filename3:
    :param write_filename:
    '''
    N = 3000

    # assign weights by part-of-speech tag
    score_dict = {"nr":1.0, "nr1":0.5, "nr2":0.75, "nrt":1.0, "nrf":1.0, "ns":1.0, "nsf":1.0, "nt":1.0, \
                   "nz":1.0, "nl":0.5, "ng":0.5, "n":0.9, "t":0.5, "tg":0.5, "s":0.3, "f":0.3, "j":0.5, \
                   "v":0.7, "vd":0.6, "vn":0.9, "vshi":0.0, "vyou":0.0, "vf":0.3, "vx":0.3, "vi":0.7, \
                   "vl":0.3, "vg":0.5, "a":0.6, "ad":0.3, "an":0.9, "ag":0.5, "al":0.3, "b":0.3, "bl":0.2, \
                    "z":0.9, "zg":0.3, "r":0.3, "rr":0.3, "rz":0.3, "rzt":0.3, "rzs":0.3, "rzv":0.3, "ry":0.2, \
                    "ryt":0.2, "rys":0.2, "ryv":0.2, "rg":0.2, "m":0.6, "mq":0.5, "q":0.6, "qv":0.7, "qt":0.7, \
                    "d":0.4, "p":0.0, "pba":0.0, "pbei":0.0, "c":0.0, "cc":0.0, "u":0.0, "ug":0.0, "e":0.0, \
                    "y":0.0, "o":0.0, "h":0.0, "k":0.0, "x":0.1, "xx":0.0, "xu":0.9, "w":0.0, "l":0.6, "i":0.6, \
                    "g":0.0, "vq":0.0, "nrfg":0.75, "dg":0.0, "mg":0.2, "yg":0.0}

    each_word_tf = []
    key_words = []

    select_word = []
    word_score = []

    user_dict = []

    get_text_to_complex_list(each_word_tf, read_filename1, 0)

    get_text_to_single_list(key_words, read_filename2)

    f = open(read_filename3, 'r')
    line = f.readline()
    while line:
        user_dict.append(line.split()[0])
        line = f.readline()
    f.close()

    for j in range(len(each_word_tf)):
        word_entity = each_word_tf[j][0].split('/')[0]
        word_tag = each_word_tf[j][0].split('/')[1]
        if word_entity in user_dict:
            # words from the user dictionary get a high weight
            select_word.append(word_entity)
            word_score.append(np.log(float(each_word_tf[j][1])) * 1.0 * 1.0)
        elif word_entity in key_words:
            # keywords also get a high weight
            select_word.append(word_entity)
            try:
                word_score.append(
                    np.log(float(each_word_tf[j][1])) * score_dict[word_tag] *
                    1.0)
            except KeyError:
                word_score.append(float(0.0))
        else:
            # all remaining words are multiplied by 0.6
            select_word.append(word_entity)
            try:
                word_score.append(
                    np.log(float(each_word_tf[j][1])) * score_dict[word_tag] *
                    0.60)
            except KeyError:
                word_score.append(float(0.0))

    # sort by weight in descending order
    sw = zip(select_word, word_score)
    sw = sorted(sw, key=itemgetter(1), reverse=True)

    result_all = []
    count_number = 1
    for each in sw:
        result_all.append(each[0] + " " + str(each[1]))
        count_number += 1
        if count_number > N:
            break

    quick_write_list_to_text(result_all, write_filename)