Example #1
import os
from operator import itemgetter


def generate_sort_index(read_directory, write_directory):
    '''
    Generate the sort index.
    :param read_directory: directory containing the numbered input .txt files
    :param write_directory: directory where update_item_index.txt is written
    '''
    time_series = []
    item_index = []
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    
    for i in range(file_number):
        line_count = 1
        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            # every record spans 9 lines; the 6th line carries the time value
            # after a 13-character label prefix
            if line_count % 9 == 6:
                time_series.append(float(line.strip()[13:]))
                item_index.append([str(i + 1), str(line_count)])  # file and line indices are both 1-based
            line = f.readline()
            line_count += 1
        f.close()
    
    # sort in ascending order by time
    tsi = zip(time_series, item_index)
    tsi1 = sorted(tsi, key = itemgetter(0))
            
    # collect the corresponding (file, line) indices
    update_item_index = []
    for each in tsi1:
        update_item_index.append(each[1])
    
    write_list_to_text_by_row(update_item_index, write_directory + u'/update_item_index.txt')
    
    return update_item_index
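# This example (and Example #2 below) writes its result through a project helper,
# write_list_to_text_by_row, that is not shown on this page. The sketch below is
# only an illustration of what such a helper could look like; the tab-separated
# row format is an assumption, not the project's actual implementation.
def write_list_to_text_by_row(row_list, file_path):
    # write one row per line, joining the fields of each row with tabs
    with open(file_path, 'w') as out_file:
        for row in row_list:
            out_file.write('\t'.join(row) + '\n')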
Example #2
import os
import time
from operator import itemgetter


def generate_sort_index(read_directory, write_directory):
    '''
    Generate the sort index.
    :param read_directory: directory containing the numbered input .txt files
    :param write_directory: directory where update_item_index.txt is written
    '''
    time_series = []
    item_index = []
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    
    for i in range(file_number):
        line_count = 1
        f = open(read_directory + '/' + str(i + 1) + '.txt', 'rb')
        line = f.readline()
        while line:
            #this_time = line.strip().split('\t')[2]
            # parse the time field (3rd tab-separated column) into epoch seconds
            this_time = time.mktime(time.strptime(line.strip().split('\t')[2], '%Y/%m/%d %H:%M'))
            time_series.append(this_time)
            item_index.append([str(i + 1), str(line_count)])  # file and line indices are both 1-based
            
            line = f.readline()
            line_count += 1
        f.close()
    
    # sort in ascending order by time
    tsi = zip(time_series, item_index)
    tsi1 = sorted(tsi, key = itemgetter(0))
            
    # collect the corresponding (file, line) indices
    update_item_index = []
    for each in tsi1:
        update_item_index.append(each[1])
    
    write_list_to_text_by_row(update_item_index, write_directory + u'/update_item_index.txt')
    
    return update_item_index
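# For reference, a tiny self-contained illustration of the time handling used in
# Example #2: time.strptime parses the 'YYYY/MM/DD HH:MM' field into a struct_time,
# time.mktime turns it into epoch seconds, and sorting the zipped (time, index)
# pairs by the first element orders the indices chronologically. The sample values
# below are made up for the illustration.
import time
from operator import itemgetter

times = [time.mktime(time.strptime(s, '%Y/%m/%d %H:%M'))
         for s in ['2012/08/16 09:30', '2012/08/12 08:00', '2012/08/14 23:59']]
index = [['1', '1'], ['1', '2'], ['2', '1']]
ordered = [idx for _, idx in sorted(zip(times, index), key=itemgetter(0))]
# ordered == [['1', '2'], ['2', '1'], ['1', '1']]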
import os
import time
from operator import itemgetter

import numpy as np
from fp_growth import find_frequent_itemsets  # https://github.com/enaeseth/python-fp-growth


def spur(read_directory, write_directory1, write_directory2):
    '''
    SPUR compression
    Summarization via Pattern Utility and Ranking
    Summarize a batch of transactions with low compression ratio and high quality.

    Xintian Yang, Amol Ghoting, Yiye Ruan, "A Framework for Summarizing and Analyzing Twitter Feeds", KDD '12, August 12-16, 2012, Beijing, China.

    :param read_directory: directory of VSM files
    :param write_directory1: directory for the compressed results
    :param write_directory2: directory for the compression-ratio files
    '''

    # minimum support for frequent itemset mining
    minimun_support = 60

    # false positive rate
    f = 0.1

    # compression ratios
    ratio = []

    # compression times
    compress_time = []

    # total number of files
    file_number = sum(
        [len(files) for root, dirs, files in os.walk(read_directory)])

    # iterate over all VSM files
    for i in range(file_number):
        print 'Batch: %d' % (i + 1)
        start = time.clock()
        # Mine the frequent itemsets and their supports.
        # Frequent itemset mining uses the FP-Growth algorithm,
        # see https://github.com/enaeseth/python-fp-growth
        o_trans, trans_size = generate_transactions(read_directory + '/' +
                                                    str(i + 1) + '.txt')

        # Compression budget: the upper bound on the compressed size
        # (originally a fraction of the total item count in the original
        # transactions, here hard-coded)
        #M = 0.7 * trans_size
        M = 85000
        # frequent patterns with their corresponding lengths and supports
        frequent_patterns = []  # 2-D list of ints
        length_all = []
        support_all = []

        # find_frequent_itemsets returns a generator
        for each, support in find_frequent_itemsets(o_trans,
                                                    minimun_support,
                                                    include_support=True):
            each.sort()
            frequent_patterns.append(each)
            length_all.append(len(each))
            support_all.append(support)

        print len(frequent_patterns)
        # sort the frequent patterns by length, longest first
        fl = zip(frequent_patterns, length_all, support_all)
        fl1 = sorted(fl, key=itemgetter(1), reverse=True)
        # To encode the original transactions, each frequent pattern is given a
        # string id of the form "p*", where * is an integer starting from 0.

        # pattern id -> items of the pattern; type: 'id': [int item]
        id_pattern_dict = {}
        # pattern id -> length of the pattern; type: 'id': int
        pattern_length_dict = {}
        # pattern id -> support of the pattern; type: 'id': int
        pattern_support_dict = {}

        id1 = 0
        for each in fl1:
            id_pattern_dict['p' + str(id1)] = each[0]
            pattern_length_dict['p' + str(id1)] = each[1]
            pattern_support_dict['p' + str(id1)] = each[2]
            id1 += 1

        # pattern id -> indices of the transactions containing the pattern
        # (transaction indices follow the original transaction order, int)
        # type: 'id': [int trans]
        pattern_trans_dict = {}
        for each in id_pattern_dict.keys():
            value_list = []
            for j in range(len(o_trans)):
                if set(id_pattern_dict[each]).issubset(o_trans[j]):  # the transaction itself need not be converted to a set
                    value_list.append(j)

            pattern_trans_dict[each] = value_list

        # sub-patterns of each frequent pattern, excluding the pattern itself
        # type: 'id': [str id]
        sub_pattern_dict = {}
        # super-patterns of each frequent pattern, excluding the pattern itself
        # type: 'id': [str id]
        super_pattern_dict = {}
        # patterns that overlap a pattern but fall into neither of the two cases above
        # type: 'id': [str id]
        overlap_pattern_dict = {}
        for each in id_pattern_dict.keys():
            value_list1 = []
            value_list2 = []
            value_list3 = []
            for each1 in id_pattern_dict.keys():
                if each != each1:
                    intersection = set(id_pattern_dict[each1]) & set(
                        id_pattern_dict[each])
                    if intersection == set():
                        pass
                    elif set(id_pattern_dict[each1]) == intersection:
                        value_list1.append(each1)
                    elif set(id_pattern_dict[each]) == intersection:
                        value_list2.append(each1)
                    else:
                        value_list3.append(each1)
                else:
                    pass

            sub_pattern_dict[each] = value_list1
            super_pattern_dict[each] = value_list2
            overlap_pattern_dict[each] = value_list3
        # Initialize the utility values.
        # Returns a dict mapping pattern id -> utility value
        # and a dict mapping pattern id -> list of transactions covered by the pattern.
        pattern_utility, pattern_coverage_set = utility_f(
            id_pattern_dict, pattern_trans_dict, pattern_support_dict,
            sub_pattern_dict, f)

        # pattern with the largest utility value
        max_index = np.argmax(pattern_utility.values())
        Q_top = pattern_utility.keys()[max_index]

        # copy of pattern_utility
        Q_utility = pattern_utility.copy()
        # Rewrite the original transactions with the selected patterns in order
        # of utility, updating the remaining utility values as we go.
        #current_size = trans_size
        current_size = 0
        iter_count = 0

        while current_size < M:
            # currently selected pattern
            this_pattern = Q_top

            if Q_utility[this_pattern] >= 0.0:
                # Represent the original transactions with the current frequent
                # pattern; this_pattern is the pattern's key, a string.
                replace_trans_with_pattern(o_trans, this_pattern,
                                           id_pattern_dict[this_pattern],
                                           pattern_coverage_set[this_pattern])
                # o_trans has now been modified; note that from here on it
                # contains both ints and strings.
                # After applying the current pattern, update the utilities of
                # the remaining patterns.
                for each1 in super_pattern_dict[this_pattern]:
                    covered_set = set(pattern_coverage_set[each1]) & set(
                        pattern_coverage_set[this_pattern])
                    pattern_utility[each1] = pattern_utility[each1] - len(
                        id_pattern_dict[this_pattern]) * len(covered_set)
                    if each1 in Q_utility.keys():
                        Q_utility[each1] = pattern_utility[each1]

                for each2 in sub_pattern_dict[this_pattern]:
                    covered_set = set(pattern_coverage_set[each2]) & set(
                        pattern_coverage_set[this_pattern])
                    pattern_utility[each2] = pattern_utility[each2] - (
                        len(id_pattern_dict[each2]) - 1) * len(covered_set)

                    if each2 in Q_utility.keys():
                        Q_utility[each2] = pattern_utility[each2]

                    pattern_coverage_set[each2] = [
                        x for x in pattern_coverage_set[each2]
                        if x not in covered_set
                    ]
                    if (len(pattern_coverage_set[each2])
                            == 0) and (each2 in Q_utility.keys()):
                        del Q_utility[each2]

                for each3 in overlap_pattern_dict[this_pattern]:
                    covered_set = set(pattern_coverage_set[each3]) & set(
                        pattern_coverage_set[this_pattern])
                    pattern_utility[each3] = pattern_utility[each3] - len(
                        covered_set) * len(
                            set(id_pattern_dict[each3])
                            & set(id_pattern_dict[this_pattern]))
                    if each3 in Q_utility.keys():
                        Q_utility[each3] = pattern_utility[each3]

                #if len(pattern_coverage_set[this_pattern]) == 0:
                #flag += 1
                #else:
                #flag = 0

                current_size = current_size + len(
                    pattern_coverage_set[this_pattern])
                iter_count += 1
                if iter_count >= 50000:
                    break

                #if flag == 3:
                #break
                #current_size = np.sum([len(x) for x in o_trans])
                #print current_size

                # the current pattern has been applied; remove it from the queue
                del Q_utility[this_pattern]

                # pick the remaining pattern with the largest utility value
                if Q_utility != {}:
                    max_index = np.argmax(Q_utility.values())
                    Q_top = Q_utility.keys()[max_index]
                else:
                    break

            else:
                break

        #final_size = np.sum([len(x) for x in o_trans])

        final_size = current_size
        print 'Final size: ', final_size
        this_ratio = np.true_divide(final_size, trans_size)
        print 'Ratio: ', this_ratio

        ratio.append(str(this_ratio))

        interval = time.clock() - start
        print 'Time: %f' % interval
        compress_time.append(str(interval))

        write_list_to_text_by_row(o_trans,
                                  write_directory1 + '/' + str(i + 1) + '.txt')

    quick_write_list_to_text(ratio, write_directory2 + '/ratio.txt')
    quick_write_list_to_text(compress_time,
                             write_directory2 + '/compress_time.txt')