Example #1
def EstimateDistribution( filename, epsilon, noise ):
    '''
    @summary: perturb the sequence-length counts with Laplace noise

    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param noise: use Laplace Mechanism or not

    @return: noisySeqLengthRatioList
    @rtype: list

    '''
    dplog.info( "Get l_opt from file (%s)..." % (filename) )

    file = open(filename)
    total = 0
    dic = {}
    sequences = []

    for line in file:
        if line.startswith('#') or line.startswith('//') or line.startswith('%'):
            continue
        
        line = line.strip().split(' ')
        total += 1
        sequences.append( len(line) )

        if len(line) not in dic:
            dic[ len(line) ] = 1
        else:
            dic[ len(line) ] += 1

    file.close()

    sortlist = sorted( dic.items(), key=lambda k:k[0] )
    NoisySeqLengthList = []

    if str(noise) == 'True':
        # dplog.debug( '=== before Laplace Mechanism ===' )
        # dplog.debug( 'sortlist:\n %s' % (str(sortlist)) )

        # clamp at zero to avoid negative noisy counts
        NoisySeqLengthList = map(lambda x: (x[0], max(0, x[1] + laplace(1/float(epsilon)))), sortlist )
        
        dplog.debug( '=== Laplace Mechanism ===' )
        dplog.debug( 'epsilon: (%f)' % (epsilon) )
        dplog.debug( 'scale parameter: (%f)' % (1/float(epsilon)) )
        # dplog.debug( '=== after Laplace Mechanism ===')
        # dplog.debug( 'Noisy-sortlist :\n%s' % (str(NoisySeqLengthList)) )

        NoisySeqLengthRatioList = map(lambda x: (x[0], x[1] / float(total)), NoisySeqLengthList )
        # dplog.debug( 'Noisy-sortlist-Ratio :\n%s' % (str(NoisySeqLengthRatioList)) )

        return NoisySeqLengthRatioList

    elif str(noise) == 'False':
        return sequences

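The Laplace branch above boils down to: histogram the record lengths, add Laplace noise with scale 1/epsilon to every count, clamp at zero, and normalise by the true total. A minimal, self-contained sketch of that idea follows; it uses numpy.random.laplace as a stand-in for the laplace() helper, which is not shown in these examples, and the function name is illustrative only.

from collections import Counter

import numpy as np

def noisy_length_ratios(lengths, epsilon):
    # histogram of record lengths, sorted by length
    hist = sorted(Counter(lengths).items())
    total = float(len(lengths))
    scale = 1.0 / epsilon                       # one record changes one count by 1
    return [(l, max(0.0, c + np.random.laplace(0.0, scale)) / total)
            for l, c in hist]

print(noisy_length_ratios([1, 2, 2, 3, 3, 3], epsilon=1.0))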
Example #2
def ExponentialMechanism(sequences, fraction, epsilon):
    '''
    @brief use the exponential mechanism to pick the optimal sequence length

    @param sequences: true lengths of the records in the sequence database
    @param fraction: empirical quantile, 0.85 by default
    @param epsilon: privacy budget

    @return: optimal sequence length l_opt
    @rtype: float

    '''
    target = len(sequences) * fraction

    sequences.sort()

    dplog.info("sort sequences length: %d" % (len(sequences)))
    dplog.info("sequences head median tail : %d %d %d" %
               (sequences[0], sequences[int(target) - 1], sequences[-1]))

    previous = 0
    counter = 0
    l_opt = 0.0
    tally = 0.0

    fraction = fraction - 0.05
    counter = int(len(sequences) * fraction - 1)
    previous = sequences[counter]
    sequences = sequences[counter + 1:]

    # a custom aggregator that reservoir samples from the sorted list
    for value in sequences:
        counter += 1

        # Python has no C-style ?: ternary operator; a conditional expression is used below in place of:
        # sample = (random() > tally / (tally + (value-previous))*math.exp(-epsilon*abs(target-counter))) ? (value-previous)*random() + previous : sample
        # dplog.info( "exponential tally : %f %f" % (tally, (value-previous)*math.exp(-epsilon*abs(target-counter))) )
        # dplog.info( "value : %d\tprevious : %d\ttarget: %d\tcounter : %d" % (value, previous, target, counter) )
        # 1E-1000 underflows to 0.0, so this skips iterations whose weight
        # increment is exactly zero and would break the division below
        if math.fabs(tally + (value - previous) *
                     math.exp(-epsilon * abs(target - counter)) -
                     0.0) <= 1E-1000:
            continue
        else:
            l_opt = ((value - previous) * random() + previous) if (
                random() > tally /
                (tally + (value - previous) *
                 math.exp(-epsilon * abs(target - counter)))) else l_opt
        tally = tally + (value - previous) * math.exp(
            -epsilon * abs(target - counter))
        previous = value

    return l_opt
Example #3
def ExponentialMechanism( sequences, fraction, epsilon ):
    '''
    @brief use the exponential mechanism to pick the optimal sequence length

    @param sequences: true lengths of the records in the sequence database
    @param fraction: empirical quantile, 0.85 by default
    @param epsilon: privacy budget

    @return: optimal sequence length l_opt
    @rtype: float

    '''
    target = len( sequences ) * fraction

    sequences.sort()
    
    dplog.info( "sort sequences length: %d" % (len(sequences)) )
    dplog.info( "sequences head median tail : %d %d %d" % (sequences[0], sequences[int(target)-1], sequences[-1]) )

    previous = 0 
    counter = 0
    l_opt = 0.0
    tally = 0.0
    
    fraction = fraction - 0.05
    counter =  int( len(sequences)*fraction - 1 )
    previous = sequences[counter]
    sequences = sequences[counter+1:]

    # a custom aggregator that reservoir samples from the sorted list
    for value in sequences:
        counter += 1
        
        # Python has no C-style ?: ternary operator; a conditional expression is used below in place of:
        # sample = (random() > tally / (tally + (value-previous))*math.exp(-epsilon*abs(target-counter))) ? (value-previous)*random() + previous : sample
        # dplog.info( "exponential tally : %f %f" % (tally, (value-previous)*math.exp(-epsilon*abs(target-counter))) )
        # dplog.info( "value : %d\tprevious : %d\ttarget: %d\tcounter : %d" % (value, previous, target, counter) )
        if math.fabs( tally + (value-previous)*math.exp(-epsilon*abs(target-counter)) -0.0 ) <= 1E-1000:
            continue
        else:    
            l_opt = ((value-previous)*random() + previous) if (random() > tally / (tally + (value-previous)*math.exp(-epsilon*abs(target-counter)))) else l_opt
        tally = tally + (value-previous)*math.exp(-epsilon*abs(target-counter)) 
        previous = value
    
    return l_opt
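The loop above is an online, reservoir-style sampler over the tail of the sorted lengths. The same kind of choice can be made in one pass by weighting every candidate rank with exp(-epsilon * |target - rank|) and drawing a single sample, as sketched below; this mirrors the weights used above but skips the interpolation between neighbouring lengths, and the textbook exponential mechanism would additionally divide the exponent by twice the utility sensitivity. Names here are illustrative.

import math
import random

def pick_length(sorted_lengths, fraction, epsilon):
    target = len(sorted_lengths) * fraction
    weights = [math.exp(-epsilon * abs(target - rank))
               for rank in range(1, len(sorted_lengths) + 1)]
    r = random.uniform(0.0, sum(weights))
    acc = 0.0
    for length, w in zip(sorted_lengths, weights):
        acc += w
        if acc >= r:
            return length
    return sorted_lengths[-1]

print(pick_length(sorted([3, 5, 5, 6, 7, 9, 12, 20]), fraction=0.85, epsilon=0.5))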
Example #4
def GetOptSeqLength( filename, epsilon, mechanism="Exponential" ):
    '''
    @summary: obtain the optimal sequence length

    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param mechanism: which noise mechanism
        - Laplace Mechanism
        - Exponential Mechanism ( default )
    
    @return: l_opt
    @rtype: int

    '''
    dplog.info( " === Phase 1.1: GetOptSequenceLength Begin ===" )
    
    if mechanism == "Laplace":
        seqLengthList = EstimateDistribution( filename, epsilon, "True" )
        total = 0.0
        # default l_opt to the maximum sequence length, in case the noisy ratio sum never reaches RATIO_VALUE
        l_opt = seqLengthList[-1][0] 

        for item in seqLengthList:
            total += item[1]

            if total >= RATIO_VALUE:
                l_opt = item[0]
                break
    
    elif mechanism == "Exponential":
        seqLengthList = EstimateDistribution( filename, 0.0, "False" )    
        l_opt = ExponentialMechanism( seqLengthList, RATIO_VALUE, epsilon )

    dplog.debug( "Empirical ratio value is : (%s)" % (str(RATIO_VALUE)) )
    dplog.info( "l_opt = %d" % (math.ceil(l_opt)) )
    dplog.info( " === Phase 1.1: GetOptSequenceLength End ===" )

    return math.ceil(l_opt)
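In the Laplace branch, l_opt is simply the smallest length whose cumulative noisy ratio reaches the empirical threshold, with the maximum length as a fallback. A toy illustration of that scan, using 0.85 as the threshold and a made-up noisy distribution (the helper name is illustrative):

def pick_l_opt(ratio_list, threshold=0.85):
    l_opt = ratio_list[-1][0]           # fallback: the maximum length
    total = 0.0
    for length, ratio in ratio_list:
        total += ratio
        if total >= threshold:
            return length
    return l_opt

print(pick_l_opt([(1, 0.30), (2, 0.40), (3, 0.20), (4, 0.10)]))   # -> 3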
Example #5
def GetOptSeqLength(filename, epsilon, mechanism="Exponential"):
    '''
    @summary: obtain the optimal sequence length

    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param mechanism: which noise mechanism
        - Laplace Mechanism
        - Exponential Mechanism ( default )
    
    @return: l_opt
    @rtype: int

    '''
    dplog.info(" === Phase 1.1: GetOptSequenceLength Begin ===")

    if mechanism == "Laplace":
        seqLengthList = EstimateDistribution(filename, epsilon, "True")
        total = 0.0
        # default l_opt to the maximum sequence length, in case the noisy ratio sum never reaches RATIO_VALUE
        l_opt = seqLengthList[-1][0]

        for item in seqLengthList:
            total += item[1]

            if total >= RATIO_VALUE:
                l_opt = item[0]
                break

    elif mechanism == "Exponential":
        seqLengthList = EstimateDistribution(filename, 0.0, "False")
        l_opt = ExponentialMechanism(seqLengthList, RATIO_VALUE, epsilon)

    dplog.debug("Empirical ratio value is : (%s)" % (str(RATIO_VALUE)))
    dplog.info("l_opt = %d" % (math.ceil(l_opt)))
    dplog.info(" === Phase 1.1: GetOptSequenceLength End ===")

    return math.ceil(l_opt)
Example #6
    def parse_sequences(self, filename):
        '''
        @summary: truncate sequences
            - truncate the original sequence database to the optimal sequence length l_opt
            - Counter.update & ngram.NGram store every n_max-gram pattern together with its support count

        @param filename: sequential database

        @return: the n_max-grams of the original sequence database and their counts
        @rtype: Counter

        @note: assume that locations are numbered from 1 .. max

        '''
        dplog.info( " === Phase 1.2: Truncating Sequence file (%s, l_opt=%d) ===" % ( filename, self.max_len) )
        
        file = open(filename)

        self.all_record_num = 0
        lines = []

        # First check the alphabet
        for line in file:
            if line.startswith('#') or line.startswith('//') or line.startswith('%'):
                continue
            
            # self.lines.append(line.strip().split()[:self.max_len])
            lines.append( line.strip().split()[:self.max_len] )
            
            ''' 
            max_item: the max-value item in one max_len sequence
            
            Prerequisite: items are numbered starting from 1 and run consecutively up to max
            >>> map(int, '234')
            [2, 3, 4]
            
            '''
            max_item = max(map(int, lines[-1]))

            if self.alphabet_size < max_item:
                self.alphabet_size = max_item

            self.all_record_num += 1

        # use alphabet_size as the end-of-sequence marker (strToSeq shifts items down by 1, so this code is never used by a real item)
        self.TERM = self.alphabet_size
        dplog.debug( "Alphabet size : %s" % (str(self.alphabet_size)) )
        dplog.debug( "Termination code : %s" % (str(self.TERM)) )
        dplog.debug( "Number of sequences : (%s)" % (self.all_record_num) )

        pbar = MyProgressBar('Parsing', self.all_record_num + 1)
        
        # the lines below build the NGram counts
        for (record, line) in enumerate(lines):
            ''' 
            >>> strToSeq('234', dec=1)
            u'\x01\x02\x03'
            
            >>> line = '123'
            >>> self.TERM = 3
            >>> strToSeq(line, dec=1) + unichr(self.TERM)
            u'\x00\x01\x02\x03'

            '''
            # append the termination symbol self.TERM to the sequence record
            seq = strToSeq(line, dec=1) + unichr(self.TERM)
            for i in range(1, self.N_max+1):
                '''
                (N=i) defines N-gram
                
                Example:
                >>> G = ngram.NGram(N=3)
                >>> a = G.ngrams([u'\x01', u'\x02', u'\x03', u'\x04', u'\x05'])
                >>> print list(a)
                [[u'\x01', u'\x02', u'\x03'], [u'\x02', u'\x03', u'\x04'], [u'\x03', u'\x04', u'\x05']]
                    
                '''
                G = ngram.NGram(N=i)
                
                # count every gram of length 1..N_max (including the termination symbol)
                self.update(G.ngrams(seq))

            pbar.update(record + 1)
        pbar.finish()

        file.close()
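What the loop above accumulates is every n-gram (1 <= n <= N_max) of each truncated record, with a termination symbol appended, all counted in one Counter. A self-contained sketch of the same bookkeeping, using plain tuples instead of the third-party ngram package and the project's strToSeq encoding (names are illustrative):

from collections import Counter

def count_ngrams(sequences, n_max, term='$'):
    counts = Counter()
    for seq in sequences:
        seq = tuple(seq) + (term,)                  # append the terminator
        for n in range(1, n_max + 1):
            counts.update(seq[i:i + n] for i in range(len(seq) - n + 1))
    return counts

print(count_ngrams([['1', '2', '3'], ['2', '3']], n_max=2))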
Example #7
def main():
    '''
    @summary: main entry

    '''
    
    init()
    
    logstr = "+"*8 + "    Start Diff-FSPM Algorithm    " + "+"*8
    dplog.info("")
    dplog.info("+" * len(logstr))
    dplog.info(logstr)
    dplog.info("+" * len(logstr))
    dplog.info("")

    dplog.debug("original sequence database : (%s)"%(conf.dataset))
    dplog.debug("differential privacy budget : (%s)"%(conf.epsilon))
    dplog.debug("minimum support value : (%s)"%(conf.min_sup))
    
    Diff_FSPM()

    logstr = "+"*8 + "     End Diff-FSPM Algorithm     " + "+"*8
    dplog.info("")
    dplog.info("+" * len(logstr))
    dplog.success(logstr)
    dplog.info("+" * len(logstr))
    dplog.info("")
Example #8
def Diff_FSPM():
    '''
    The Diff-FSPM algorithm consists of the following 3 steps:
        - locally transform the original sequence database
            - obtain the optimal sequence length l_opt        ok
            - truncate the original sequence database         ok
        - build the perturbed closed prefix-sequence tree by level-order traversal
            - pruning under the min_sup constraint
            - pruning via the closed-equivalence relation
            - predicted counts vs. noisy counts
        - nominally this mines the FSP tree; in practice the result set is output directly
            
    @summary: Diff-FSPM algorithm
    '''
    dplog.info( " === Phase 1: Decomposing input sequence dataset to n-grams (%d<=n<=%d) Begin ===" % (1, conf.l_opt) )
    conf.l_opt = GetOptSeqLength(conf.dataset, conf.epsilon, mechanism="Exponential")
    ngram_set = NGramSet( int(conf.l_opt), N_max=int(conf.n_max) )
    ngram_set.load_dataset( conf.dataset, conf.dataset_ngrams % (conf.l_opt) )
    dplog.info( " === Phase 1: Decomposing input sequence dataset to n-grams (%d<=n<=%d) End ===" % (1, conf.l_opt) )

    dplog.info( " === Phase 2: Sanitizing n-grams to build noisy frequent sequential patterns Tree Begin ===" )
    ngram_set = Sanitizer.ngram( ngram_set, conf.n_max, conf.epsilon, conf.l_opt, conf.min_sup)
    ngram_set.dump( conf.dataset_noisy % (conf.l_opt, conf.epsilon))
    dplog.info( " === Phase 2: Sanitizing n-grams to build noisy frequent sequential patterns Tree End ===" )
    
    dplog.info( " === Phase 3: Synthetic frequent sequential patterns from sanitized n-grams Begin ===" )
    factory = Reconstruction( ngram_set, conf.min_sup )
    factory.extend()
    factory.ngramset.dump( conf.dataset_result % (conf.l_opt, conf.epsilon))
    dplog.info( " === Phase 3: Synthetic frequent sequential patterns from sanitized n-grams End ===" )
Example #9
    def parse_sequences(self, filename):
        '''
        @summary: truncate sequences
            - truncate the original sequence database to the optimal sequence length l_opt
            - Counter.update & ngram.NGram store every n_max-gram pattern together with its support count

        @param filename: sequential database

        @return: the n_max-grams of the original sequence database and their counts
        @rtype: Counter

        @note: assume that locations are numbered from 1 .. max

        '''
        dplog.info(
            " === Phase 1.2: Truncating Sequence file (%s, l_opt=%d) ===" %
            (filename, self.max_len))

        file = open(filename)

        self.all_record_num = 0
        lines = []

        # First check the alphabet
        for line in file:
            if line.startswith('#') or line.startswith(
                    '//') or line.startswith('%'):
                continue

            # self.lines.append(line.strip().split()[:self.max_len])
            lines.append(line.strip().split()[:self.max_len])
            ''' 
            max_item: the max-value item in one max_len sequence
            
            Prerequisite: items are numbered starting from 1 and run consecutively up to max
            >>> map(int, '234')
            [2, 3, 4]
            
            '''
            max_item = max(map(int, lines[-1]))

            if self.alphabet_size < max_item:
                self.alphabet_size = max_item

            self.all_record_num += 1

        # use alphabet_size as the end-of-sequence marker (strToSeq shifts items down by 1, so this code is never used by a real item)
        self.TERM = self.alphabet_size
        dplog.debug("Alphabet size : %s" % (str(self.alphabet_size)))
        dplog.debug("Termination code : %s" % (str(self.TERM)))
        dplog.debug("Number of sequences : (%s)" % (self.all_record_num))

        pbar = MyProgressBar('Parsing', self.all_record_num + 1)

        # the lines below build the NGram counts
        for (record, line) in enumerate(lines):
            ''' 
            >>> strToSeq('234', dec=1)
            u'\x01\x02\x03'
            
            >>> line = '123'
            >>> self.TERM = 3
            >>> strToSeq(line, dec=1) + unichr(self.TERM)
            u'\x00\x01\x02\x03'

            '''
            # append the termination symbol self.TERM to the sequence record
            seq = strToSeq(line, dec=1) + unichr(self.TERM)
            for i in range(1, self.N_max + 1):
                '''
                (N=i) defines N-gram
                
                Example:
                >>> G = ngram.NGram(N=3)
                >>> a = G.ngrams([u'\x01', u'\x02', u'\x03', u'\x04', u'\x05'])
                >>> print list(a)
                [[u'\x01', u'\x02', u'\x03'], [u'\x02', u'\x03', u'\x04'], [u'\x03', u'\x04', u'\x05']]
                    
                '''
                G = ngram.NGram(N=i)

                # count every gram of length 1..N_max (including the termination symbol)
                self.update(G.ngrams(seq))

            pbar.update(record + 1)
        pbar.finish()

        file.close()
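The pre-processing half of parse_sequences, reading the file, skipping comment lines, truncating each record to l_opt items and tracking the largest item code, can be sketched on its own as below. The items are assumed to be positive integers, as the @note above states, and the function name and file name are illustrative.

def truncate_dataset(filename, l_opt):
    records, alphabet_size = [], 0
    with open(filename) as f:
        for line in f:
            if line.startswith(('#', '//', '%')):
                continue
            items = line.strip().split()[:l_opt]
            if not items:
                continue
            records.append(items)
            alphabet_size = max(alphabet_size, max(map(int, items)))
    return records, alphabet_size

# records, alphabet_size = truncate_dataset('dataset.dat', l_opt=5)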
Example #10
def EstimateDistribution(filename, epsilon, noise):
    '''
    @summary: perturb the sequence-length counts with Laplace noise

    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param noise: use Laplace Mechanism or not

    @return: noisySeqLengthRatioList
    @rtype: list

    '''
    dplog.info("Get l_opt from file (%s)..." % (filename))

    file = open(filename)
    total = 0
    dic = {}
    sequences = []

    for line in file:
        if line.startswith('#') or line.startswith('//') or line.startswith(
                '%'):
            continue

        line = line.strip().split(' ')
        total += 1
        sequences.append(len(line))

        if len(line) not in dic:
            dic[len(line)] = 1
        else:
            dic[len(line)] += 1

    file.close()

    sortlist = sorted(dic.items(), key=lambda k: k[0])
    NoisySeqLengthList = []

    if str(noise) == 'True':
        # dplog.debug( '=== before Laplace Mechanism ===' )
        # dplog.debug( 'sortlist:\n %s' % (str(sortlist)) )

        # clamp at zero to avoid negative noisy counts
        NoisySeqLengthList = map(
            lambda x: (x[0], max(0, x[1] + laplace(1 / float(epsilon)))),
            sortlist)

        dplog.debug('=== Laplace Mechanism ===')
        dplog.debug('epsilon: (%f)' % (epsilon))
        dplog.debug('scale parameter: (%f)' % (1 / float(epsilon)))
        # dplog.debug( '=== after Laplace Mechanism ===')
        # dplog.debug( 'Noisy-sortlist :\n%s' % (str(NoisySeqLengthList)) )

        NoisySeqLengthRatioList = map(lambda x: (x[0], x[1] / float(total)),
                                      NoisySeqLengthList)
        # dplog.debug( 'Noisy-sortlist-Ratio :\n%s' % (str(NoisySeqLengthRatioList)) )

        return NoisySeqLengthRatioList

    elif str(noise) == 'False':
        return sequences

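The laplace() helper called above is not shown in these examples. One standard way to draw Laplace(0, scale) noise with nothing but the standard library is to take the difference of two independent Exp(1) draws and multiply by the scale parameter; the sketch below is an assumption about what such a helper could look like, not the project's actual implementation.

import random

def laplace_noise(scale):
    # Laplace(0, scale) equals scale times the difference of two i.i.d. Exp(1) draws
    return scale * (random.expovariate(1.0) - random.expovariate(1.0))

print(laplace_noise(1.0 / 0.1))                    # scale = 1/epsilon, as above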