def EstimateDistribution(filename, epsilon, noise):
    '''
    @summary: Perturb the per-length sequence counts with Laplace noise.
    @param filename: Sequence DB (one space-separated sequence per line;
                     lines starting with '#', '//' or '%' are comments)
    @param epsilon: privacy budget (Laplace scale is 1/epsilon)
    @param noise: "True" to apply the Laplace Mechanism, "False" to return
                  the raw sequence lengths
    @return: list of (length, noisy_ratio) pairs when noise == "True",
             otherwise the list of raw sequence lengths
    @rtype: list
    '''
    dplog.info("Get l_opt from file (%s)..." % (filename))
    total = 0
    dic = {}
    sequences = []
    # BUG FIX: the original called file.close() *after* the return
    # statements, so it was unreachable and the handle was never closed;
    # `with` guarantees the file is closed on every path.
    with open(filename) as db:
        for line in db:
            # skip comment lines
            if line.startswith('#') or line.startswith('//') or line.startswith('%'):
                continue
            items = line.strip().split(' ')
            total += 1
            sequences.append(len(items))
            # dict.get replaces the Python-2-only dict.has_key()
            dic[len(items)] = dic.get(len(items), 0) + 1
    # (length, count) pairs sorted by ascending length
    # (.items() replaces the Python-2-only .iteritems())
    sortlist = sorted(dic.items(), key=lambda kv: kv[0])
    if str(noise) == 'True':
        # clamp at 0 so noisy counts never go negative
        # (the original had a redundant duplicate 0 in max(0, 0, ...))
        NoisySeqLengthList = [(length, max(0, count + laplace(1 / float(epsilon))))
                              for (length, count) in sortlist]
        dplog.debug('=== Laplace Mechanism ===')
        dplog.debug('epsilon: (%f)' % (epsilon))
        dplog.debug('scale parameter: (%f)' % (1 / float(epsilon)))
        NoisySeqLengthRatioList = [(length, count / float(total))
                                   for (length, count) in NoisySeqLengthList]
        return NoisySeqLengthRatioList
    elif str(noise) == 'False':
        return sequences
def ExponentialMechanism(sequences, fraction, epsilon):
    '''
    @brief Choose the optimal sequence length via the Exponential Mechanism.
    @param sequences: list of raw sequence lengths from the dataset
                      (sorted in place by this function)
    @param fraction: empirical target percentile, default 0.85
    @param epsilon: privacy budget (utility weight exp(-epsilon*|target-rank|))
    @return: optimal sequence length l_opt
    @rtype: int
    '''
    # target rank: the index whose length covers `fraction` of all records
    target = len(sequences) * fraction
    sequences.sort()
    dplog.info("sort sequences length: %d" % (len(sequences)))
    dplog.info("sequences head median tail : %d %d %d" %
               (sequences[0], sequences[int(target) - 1], sequences[-1]))
    previous = 0
    counter = 0
    l_opt = 0.0
    tally = 0.0
    # start scanning 5 percentage points below the target percentile
    fraction = fraction - 0.05
    counter = int(len(sequences) * fraction - 1)
    previous = sequences[counter]
    sequences = sequences[counter + 1:]
    # a custom aggregator that reservoir samples from the sorted list:
    # each candidate interval (previous, value] carries weight
    # (value - previous) * exp(-epsilon * |target - counter|)
    for value in sequences:
        counter += 1
        # Python has no C-style `?:`; the conditional expression below
        # replaces the commented-out ternary from the C-style pseudocode:
        # sample = (random() > tally / (tally + (value-previous))*math.exp(-epsilon*abs(target-counter))) ? (value-previous)*random() + previous : sample
        # dplog.info( "exponential tally : %f %f" % (tally, (value-previous)*math.exp(-epsilon*abs(target-counter))) )
        # dplog.info( "value : %d\tprevious : %d\ttarget: %d\tcounter : %d" % (value, previous, target, counter) )
        # NOTE(review): 1E-1000 underflows to 0.0, so this effectively tests
        # whether the accumulated weight is exactly zero — confirm intended.
        if math.fabs(tally + (value - previous) * math.exp(-epsilon * abs(target - counter)) - 0.0) <= 1E-1000:
            continue
        else:
            # with probability w / (tally + w), resample l_opt uniformly
            # from the interval (previous, value] — classic reservoir step
            l_opt = ((value - previous) * random() + previous) if (
                random() > tally /
                (tally + (value - previous) * math.exp(-epsilon * abs(target - counter)))) else l_opt
        tally = tally + (value - previous) * math.exp(
            -epsilon * abs(target - counter))
        previous = value
    return l_opt
def ExponentialMechanism( sequences, fraction, epsilon ):
    '''
    @brief Choose the optimal sequence length via the Exponential Mechanism.
    @param sequences: list of raw sequence lengths from the dataset
                      (sorted in place by this function)
    @param fraction: empirical target percentile, default 0.85
    @param epsilon: privacy budget (utility weight exp(-epsilon*|target-rank|))
    @return: optimal sequence length l_opt
    @rtype: int
    '''
    # target rank covering `fraction` of all records
    target = len( sequences ) * fraction
    sequences.sort()
    dplog.info( "sort sequences length: %d" % (len(sequences)) )
    dplog.info( "sequences head median tail : %d %d %d" % (sequences[0], sequences[int(target)-1], sequences[-1]) )
    previous = 0
    counter = 0
    l_opt = 0.0
    tally = 0.0
    # begin scanning 5 percentage points below the target percentile
    fraction = fraction - 0.05
    counter = int( len(sequences)*fraction - 1 )
    previous = sequences[counter]
    sequences = sequences[counter+1:]
    # a custom aggregator that reservoir samples from the sorted list:
    # interval (previous, value] has weight (value-previous)*exp(-eps*|target-counter|)
    for value in sequences:
        counter += 1
        # Python has no C-style `?:`; the conditional expression below stands
        # in for the commented-out ternary pseudocode:
        # sample = (random() > tally / (tally + (value-previous))*math.exp(-epsilon*abs(target-counter))) ? (value-previous)*random() + previous : sample
        # dplog.info( "exponential tally : %f %f" % (tally, (value-previous)*math.exp(-epsilon*abs(target-counter))) )
        # dplog.info( "value : %d\tprevious : %d\ttarget: %d\tcounter : %d" % (value, previous, target, counter) )
        # NOTE(review): 1E-1000 underflows to 0.0, so this effectively checks
        # the accumulated weight for exact zero — confirm that is intended.
        if math.fabs( tally + (value-previous)*math.exp(-epsilon*abs(target-counter)) -0.0 ) <= 1E-1000:
            continue
        else:
            # with probability w/(tally+w) resample l_opt uniformly from (previous, value]
            l_opt = ((value-previous)*random() + previous) if (random() > tally / (tally + (value-previous)*math.exp(-epsilon*abs(target-counter)))) else l_opt
        tally = tally + (value-previous)*math.exp(-epsilon*abs(target-counter))
        previous = value
    return l_opt
def GetOptSeqLength( filename, epsilon, mechanism="Exponential" ):
    '''
    @summary: Obtain the optimal (truncation) sequence length l_opt.
    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param mechanism: which noise mechanism
        - Laplace Mechanism
        - Exponential Mechanism ( default )
    @return: l_opt
    @rtype: int
    @raise ValueError: if mechanism is neither "Laplace" nor "Exponential"
    '''
    dplog.info( " === Phase 1.1: GetOptSequenceLength Begin ===" )
    if mechanism == "Laplace":
        seqLengthList = EstimateDistribution( filename, epsilon, "True" )
        total = 0.0
        # default l_opt to the maximum length, in case the noisy ratios
        # never accumulate up to RATIO_VALUE
        l_opt = seqLengthList[-1][0]
        for item in seqLengthList:
            total += item[1]
            if total >= RATIO_VALUE:
                l_opt = item[0]
                break
    elif mechanism == "Exponential":
        seqLengthList = EstimateDistribution( filename, 0.0, "False" )
        l_opt = ExponentialMechanism( seqLengthList, RATIO_VALUE, epsilon )
    else:
        # BUG FIX: an unknown mechanism previously fell through and crashed
        # with UnboundLocalError on l_opt; fail fast with a clear message.
        raise ValueError( "unknown mechanism: %s" % (mechanism,) )
    dplog.debug( "Empirical ratio value is : (%s)" % (str(RATIO_VALUE)) )
    dplog.info( "l_opt = %d" % (math.ceil(l_opt)) )
    dplog.info( " === Phase 1.1: GetOptSequenceLength End ===" )
    # BUG FIX: the docstring promises an int but Python 2 math.ceil returns
    # a float; cast so callers get the documented type (int(x) on the old
    # float value is unchanged, so this is backward compatible).
    return int(math.ceil(l_opt))
def GetOptSeqLength(filename, epsilon, mechanism="Exponential"):
    '''
    @summary: Obtain the optimal (truncation) sequence length l_opt.
    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param mechanism: which noise mechanism
        - Laplace Mechanism
        - Exponential Mechanism ( default )
    @return: l_opt
    @rtype: int
    '''
    dplog.info(" === Phase 1.1: GetOptSequenceLength Begin ===")
    if mechanism == "Laplace":
        ratio_pairs = EstimateDistribution(filename, epsilon, "True")
        # fall back to the largest observed length, in case the noisy
        # ratios never accumulate up to RATIO_VALUE
        l_opt = ratio_pairs[-1][0]
        cumulative = 0.0
        for length, ratio in ratio_pairs:
            cumulative += ratio
            if cumulative >= RATIO_VALUE:
                l_opt = length
                break
    elif mechanism == "Exponential":
        raw_lengths = EstimateDistribution(filename, 0.0, "False")
        l_opt = ExponentialMechanism(raw_lengths, RATIO_VALUE, epsilon)
    dplog.debug("Empirical ratio value is : (%s)" % (str(RATIO_VALUE)))
    dplog.info("l_opt = %d" % (math.ceil(l_opt)))
    dplog.info(" === Phase 1.1: GetOptSequenceLength End ===")
    return math.ceil(l_opt)
def parse_sequences(self, filename):
    '''
    @summary: Truncate sequences and count their n-grams.
        - truncate every record of the raw dataset to the optimal
          sequence length l_opt (self.max_len)
        - use Counter.update & ngram.NGram to store every n-gram
          (1 <= n <= N_max) of the dataset together with its support count
    @param filename: sequential database (one space-separated record per
        line; lines starting with '#', '//' or '%' are comments)
    @return: the n-grams of the dataset and their counts, accumulated
        into self (a Counter subclass — see self.update below)
    @rtype: Counter
    @note: assumes items are numbered consecutively from 1 .. max
    '''
    dplog.info( " === Phase 1.2: Truncating Sequence file (%s, l_opt=%d) ===" % ( filename, self.max_len) )
    file = open(filename)
    self.all_record_num = 0
    lines = []
    # First check the alphabet
    for line in file:
        if line.startswith('#') or line.startswith('//') or line.startswith('%'):
            continue
        # self.lines.append(line.strip().split()[:self.max_len])
        lines.append( line.strip().split()[:self.max_len] )
        '''
        max_item: the largest item value in this truncated record.
        Precondition: items are numbered from 1, consecutively up to max.
        >>> map(int, '234')
        [2, 3, 4]
        '''
        max_item = max(map(int, lines[-1]))
        if self.alphabet_size < max_item:
            self.alphabet_size = max_item
        self.all_record_num += 1
    # the terminator symbol marking the end of a record
    self.TERM = self.alphabet_size
    dplog.debug( "Alphabet size : %s" % (str(self.alphabet_size)) )
    dplog.debug( "Termination code : %s" % (str(self.TERM)) )
    dplog.debug( "Number of sequences : (%s)" % (self.all_record_num) )
    pbar = MyProgressBar('Parsing', self.all_record_num + 1)
    # the following lines perform the NGram counting
    for (record, line) in enumerate(lines):
        '''
        >>> strToSeq('234', dec=1)
        u'\x01\x02\x03'
        >>> line = '123'
        >>> self.TERM = 3
        u'\x00\x01\x02\x03'
        '''
        # append the terminator self.TERM to the record
        seq = strToSeq(line, dec=1) + unichr(self.TERM)
        for i in range(1, self.N_max+1):
            '''
            (N=i) defines N-gram
            Example:
            >>> G = ngram.NGram(N=3)
            >>> a = G.ngrams([u'\x01', u'\x02', u'\x03', u'\x04', u'\x05'])
            >>> print list(a)
            [[u'\x01', u'\x02', u'\x03'], [u'\x02', u'\x03', u'\x04'], [u'\x03', u'\x04', u'\x05']]
            '''
            G = ngram.NGram(N=i)
            # count every gram from 1 .. N_max (terminator included)
            self.update(G.ngrams(seq))
        pbar.update(record + 1)
    pbar.finish()
    file.close()
def main():
    '''
    @summary: main entry
    '''
    def _banner(text, emit):
        # Frame `text` with '+' rulers and blank lines; `emit` logs the
        # center line (info for start, success for end).
        frame = "+" * len(text)
        dplog.info("")
        dplog.info(frame)
        emit(text)
        dplog.info(frame)
        dplog.info("")

    init()
    _banner("+" * 8 + " Start Diff-FSPM Algorithm " + "+" * 8, dplog.info)
    for label, value in (("original sequence database", conf.dataset),
                         ("differential privacy budget", conf.epsilon),
                         ("minmum support value", conf.min_sup)):
        dplog.debug("%s : (%s)" % (label, value))
    Diff_FSPM()
    _banner("+" * 8 + " End Diff-FSPM Algorithm " + "+" * 8, dplog.success)
def Diff_FSPM():
    '''
    @summary: Diff-FSPM algorithm, in three phases:
        1. locally transform the input sequence dataset
           - obtain the optimal sequence length l_opt
           - truncate the raw dataset and decompose it into n-grams
        2. build the perturbed closed-prefix sequence tree level by level
           - prune by the min_sup constraint
           - prune by the closed-equivalence relation
           - predicted counts vs. noisy counts
        3. nominally mine the FSP tree; in practice the result set is
           emitted directly
    '''
    # BUG FIX: the original logged conf.l_opt in the "Phase 1 Begin" banner
    # *before* GetOptSeqLength() assigned it, so the banner showed a stale
    # (or undefined) value.  Compute l_opt first, then log.
    conf.l_opt = GetOptSeqLength(conf.dataset, conf.epsilon, mechanism="Exponential")
    dplog.info( " === Phase 1: Decomposing input sequence dataset to n-grams (%d<=n<=%d) Begin ===" % (1, conf.l_opt) )
    ngram_set = NGramSet( int(conf.l_opt), N_max=int(conf.n_max) )
    ngram_set.load_dataset( conf.dataset, conf.dataset_ngrams % (conf.l_opt) )
    dplog.info( " === Phase 1: Decomposing input sequence dataset to n-grams (%d<=n<=%d) End ===" % (1, conf.l_opt) )
    dplog.info( " === Phase 2: Sanitizing n-grams to build noisy frequent sequential patterns Tree Begin ===" )
    ngram_set = Sanitizer.ngram( ngram_set, conf.n_max, conf.epsilon, conf.l_opt, conf.min_sup)
    ngram_set.dump( conf.dataset_noisy % (conf.l_opt, conf.epsilon))
    dplog.info( " === Phase 2: Sanitizing n-grams to build noisy frequent sequential patterns Tree End ===" )
    dplog.info( " === Phase 3: Synthetic frequent sequential patterns from santized n-grams Begin ===" )
    factory = Reconstruction( ngram_set, conf.min_sup )
    factory.extend()
    factory.ngramset.dump( conf.dataset_result % (conf.l_opt, conf.epsilon))
    dplog.info( " === Phase 3: Synthetic frequent sequential patterns from santized n-grams End ===" )
def parse_sequences(self, filename):
    '''
    @summary: Truncate sequences and count their n-grams.
        - truncate every record of the raw dataset to the optimal
          sequence length l_opt (self.max_len)
        - use Counter.update & ngram.NGram to store every n-gram
          (1 <= n <= N_max) of the dataset together with its support count
    @param filename: sequential database (one space-separated record per
        line; lines starting with '#', '//' or '%' are comments)
    @return: the n-grams of the dataset and their counts, accumulated
        into self (a Counter subclass — see self.update below)
    @rtype: Counter
    @note: assumes items are numbered consecutively from 1 .. max
    '''
    dplog.info(" === Phase 1.2: Truncating Sequence file (%s, l_opt=%d) ===" %
               (filename, self.max_len))
    self.all_record_num = 0
    lines = []
    # First pass: truncate records and determine the alphabet size.
    # BUG FIX: the original only closed the file at the very end, leaking
    # the handle if parsing raised; `with` closes it on every path.
    with open(filename) as seq_file:
        for line in seq_file:
            if line.startswith('#') or line.startswith(
                    '//') or line.startswith('%'):
                continue
            tokens = line.strip().split()[:self.max_len]
            # BUG FIX: a blank data line yields an empty token list, and
            # max() over it would raise ValueError — skip such lines.
            if not tokens:
                continue
            lines.append(tokens)
            # max_item: the largest item value in this truncated record.
            # Precondition: items are numbered from 1, consecutively to max.
            max_item = max(map(int, tokens))
            if self.alphabet_size < max_item:
                self.alphabet_size = max_item
            self.all_record_num += 1
    # the terminator symbol marking the end of a record
    self.TERM = self.alphabet_size
    dplog.debug("Alphabet size : %s" % (str(self.alphabet_size)))
    dplog.debug("Termination code : %s" % (str(self.TERM)))
    dplog.debug("Number of sequences : (%s)" % (self.all_record_num))
    pbar = MyProgressBar('Parsing', self.all_record_num + 1)
    # Second pass: count every n-gram (1 <= n <= N_max) of each record,
    # with the terminator appended, e.g. strToSeq('234', dec=1) gives
    # u'\x01\x02\x03' and the terminator unichr(self.TERM) is appended.
    for (record, line) in enumerate(lines):
        seq = strToSeq(line, dec=1) + unichr(self.TERM)
        for i in range(1, self.N_max + 1):
            # (N=i) defines an i-gram window, e.g.
            # ngram.NGram(N=3).ngrams([1,2,3,4,5]) -> [1,2,3],[2,3,4],[3,4,5]
            G = ngram.NGram(N=i)
            # accumulate every gram from 1 .. N_max (terminator included)
            self.update(G.ngrams(seq))
        pbar.update(record + 1)
    pbar.finish()
def EstimateDistribution(filename, epsilon, noise):
    '''
    @summary: Perturb the per-length sequence counts with Laplace noise.
    @param filename: Sequence DB (one space-separated sequence per line;
                     lines starting with '#', '//' or '%' are comments)
    @param epsilon: privacy budget (Laplace scale is 1/epsilon)
    @param noise: "True" to apply the Laplace Mechanism, "False" to return
                  the raw sequence lengths
    @return: list of (length, noisy_ratio) pairs when noise == "True",
             otherwise the list of raw sequence lengths
    @rtype: list
    '''
    dplog.info("Get l_opt from file (%s)..." % (filename))
    total = 0
    dic = {}
    sequences = []
    fh = open(filename)
    try:
        for line in fh:
            # comment lines are ignored
            if line.startswith('#') or line.startswith('//') or line.startswith(
                    '%'):
                continue
            parts = line.strip().split(' ')
            total += 1
            seq_len = len(parts)
            sequences.append(seq_len)
            # dict.get replaces the Python-2-only dict.has_key()
            dic[seq_len] = dic.get(seq_len, 0) + 1
    finally:
        # BUG FIX: the original placed file.close() after the return
        # statements, making it unreachable; close deterministically here.
        fh.close()
    # (length, count) pairs sorted by ascending length
    # (.items() replaces the Python-2-only .iteritems())
    sortlist = sorted(dic.items(), key=lambda kv: kv[0])
    if str(noise) == 'True':
        # clamp at 0 so noisy counts never go negative
        # (the original had a redundant duplicate 0 in max(0, 0, ...))
        NoisySeqLengthList = [
            (seq_len, max(0, count + laplace(1 / float(epsilon))))
            for (seq_len, count) in sortlist]
        dplog.debug('=== Laplace Mechanism ===')
        dplog.debug('epsilon: (%f)' % (epsilon))
        dplog.debug('scale parameter: (%f)' % (1 / float(epsilon)))
        NoisySeqLengthRatioList = [(seq_len, count / float(total))
                                   for (seq_len, count) in NoisySeqLengthList]
        return NoisySeqLengthRatioList
    elif str(noise) == 'False':
        return sequences