Ejemplo n.º 1
0
    def dump(self, filename):
        """Write all n-grams and their counts to *filename*.

        The first line is the total number of n-grams; each following line
        is '<item codes> : <count>', where item codes are written as
        ord(char)+1 so the output stays compatible with the 1-based input
        format read back by load_dump().

        @param filename: path of the n-gram dump file to create
        """
        dplog.debug("Creating ngram file (%s, N=%d)..." %
                    (filename, self.N_max))

        # Context manager closes the handle even if a write fails (the
        # original leaked the handle on an exception) and avoids shadowing
        # the builtin name 'file'.
        with open(filename, 'w') as out:
            # First line: total number of n-grams stored in this Counter.
            out.write(str(len(self)) + '\n')

            pbar = MyProgressBar('Dumping', len(self))

            for i, gram in enumerate(
                    sorted(self.keys(), key=cmp_to_key(compare_grams)), 1):
                # NOTE: ord(x)+1 keeps item codes compatible with the
                # 1-based input format (items were shifted down on load).
                out.write(
                    "%s : %f\n" %
                    (" ".join(map(lambda x: str(ord(x) + 1), gram)),
                     self[gram]))
                pbar.update(i)

            pbar.finish()
Ejemplo n.º 2
0
    def dump(self, filename):
        """Write all n-grams and their counts to *filename*.

        The first line is the total number of n-grams; each following line
        is '<item codes> : <count>', with item codes written as ord(x)+1 to
        stay compatible with the 1-based input format.
        """
        dplog.debug( "Creating ngram file (%s, N=%d)..." % (filename, self.N_max) )

        file = open(filename, 'w')

        #file.write( 'ngrams : ' + str(len(self)) + '\n' )  # len(self) is the total number of ngrams
        file.write( str(len(self)) + '\n' )
        #file.write( 'alphabet_size : ' + str(self.alphabet_size) + '\n' )

        pbar = MyProgressBar('Dumping', len(self))

        i = 0
        for gram in sorted(self.keys(), key=cmp_to_key(compare_grams)):
            # NOTE: ord(x)+1 should be in order to remain compatible with the input format
            file.write("%s : %f\n" % (" ".join(map(lambda x: str(ord(x)+1), gram)),self[gram]))
            # file.write( "%s : %f\n" % ((seqToStr(gram, inc=1), self[gram])) )
            i += 1
            pbar.update(i)

        pbar.finish()

        '''
        pbar = MyProgressBar('Dumping', len(self.lines))
        i = 0
        for line in self.lines:
            file.write( "%s\n" % (" ".join(line)) )
            i += 1
            pbar.update(i)
        pbar.finish()
        '''

        file.close()
Ejemplo n.º 3
0
    def load_dataset(self, in_file, dump_file):
        '''
        @summary: Load the truncated dataset: reuse an existing dump file
            when present, otherwise parse the raw sequences and create it.

        @param in_file: raw sequence dataset file
        @param dump_file: truncated/reduced representation file

        '''
        if os.path.isfile(dump_file):
            # A dump already exists -- load it directly.
            self.load_dump(dump_file)
        else:
            dplog.debug( "File (%s) does not exist!"%(dump_file) )
            dplog.debug( "Creating File (%s)" % (dump_file) )

            # Parse the raw sequences, then persist the reduced form.
            self.parse_sequences(in_file)
            self.dump(dump_file)
Ejemplo n.º 4
0
    def load_dataset(self, in_file, dump_file):
        '''
        @summary: Load the truncated dataset: reuse an existing dump file
            when present, otherwise parse the raw sequences and create it.

        @param in_file: raw sequence dataset file
        @param dump_file: truncated/reduced representation file

        '''
        if not os.path.isfile(dump_file):
            dplog.debug("File (%s) does not exist!" % (dump_file))
            dplog.debug("Creating File (%s)" % (dump_file))

            # Parse the raw sequences and persist the reduced representation.
            self.parse_sequences(in_file)
            self.dump(dump_file)

        else:
            # The dump already exists -- load it directly.
            self.load_dump(dump_file)
Ejemplo n.º 5
0
def main():
    '''
    @summary: Program entry point: log an opening banner, run the
        Diff-FSPM algorithm, then log a closing banner.

    '''
    init()

    def _banner(text, success=False):
        # Emit a framed banner to the log; the closing banner is logged
        # at 'success' level.
        dplog.info("")
        dplog.info("+" * len(text))
        if success:
            dplog.success(text)
        else:
            dplog.info(text)
        dplog.info("+" * len(text))
        dplog.info("")

    _banner("+"*8 + "    Start Diff-FSPM Algorithm    " + "+"*8)

    dplog.debug("original sequence database : (%s)"%(conf.dataset))
    dplog.debug("differential privacy budget : (%s)"%(conf.epsilon))
    dplog.debug("minmum support value : (%s)"%(conf.min_sup))

    Diff_FSPM()

    _banner("+"*8 + "     End Diff-FSPM Algorithm     " + "+"*8, success=True)
Ejemplo n.º 6
0
def GetOptSeqLength( filename, epsilon, mechanism="Exponential" ):
    '''
    @summary: Determine the optimal (truncation) sequence length.

    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param mechanism: which noise mechanism
        - "Laplace": perturb the length histogram, pick the shortest
          length whose cumulative ratio reaches RATIO_VALUE
        - "Exponential" ( default ): select the length via the
          exponential mechanism

    @return: l_opt, the optimal sequence length rounded up
    @rtype: int

    @raise ValueError: if *mechanism* is neither "Laplace" nor
        "Exponential" (the original would crash later with an unbound
        local NameError).
    '''
    dplog.info( " === Phase 1.1: GetOptSequenceLength Begin ===" )

    if mechanism == "Laplace":
        seqLengthList = EstimateDistribution( filename, epsilon, "True" )
        total = 0.0
        # Default l_opt to the maximum sequence length so it is defined
        # even when the noisy cumulative ratio never reaches RATIO_VALUE.
        l_opt = seqLengthList[-1][0]

        for item in seqLengthList:
            total += item[1]

            if total >= RATIO_VALUE:
                l_opt = item[0]
                break

    elif mechanism == "Exponential":
        seqLengthList = EstimateDistribution( filename, 0.0, "False" )
        l_opt = ExponentialMechanism( seqLengthList, RATIO_VALUE, epsilon )

    else:
        # Fail fast instead of hitting NameError on l_opt below.
        raise ValueError("unknown mechanism: %r" % (mechanism,))

    dplog.debug( "Empirical ratio value is : (%s)" % (str(RATIO_VALUE)) )
    dplog.info( "l_opt = %d" % (math.ceil(l_opt)) )
    dplog.info( " === Phase 1.1: GetOptSequenceLength End ===" )

    return math.ceil(l_opt)
Ejemplo n.º 7
0
def GetOptSeqLength(filename, epsilon, mechanism="Exponential"):
    '''
    @summary: Determine the optimal (truncation) sequence length.

    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param mechanism: which noise mechanism
        - Laplace Mechanism
        - Exponential Mechanism ( default )

    @return: l_opt
    @rtype: int

    @note: NOTE(review) -- if *mechanism* is neither of the two supported
        names, l_opt is never bound and the logging below raises
        NameError; confirm callers only pass "Laplace" or "Exponential".
    '''
    dplog.info(" === Phase 1.1: GetOptSequenceLength Begin ===")

    if mechanism == "Laplace":
        seqLengthList = EstimateDistribution(filename, epsilon, "True")
        total = 0.0
        # Default l_opt to the maximum sequence length, in case the noisy
        # cumulative ratio never reaches RATIO_VALUE.
        l_opt = seqLengthList[-1][0]

        for item in seqLengthList:
            total += item[1]

            if total >= RATIO_VALUE:
                l_opt = item[0]
                break

    elif mechanism == "Exponential":
        seqLengthList = EstimateDistribution(filename, 0.0, "False")
        l_opt = ExponentialMechanism(seqLengthList, RATIO_VALUE, epsilon)

    dplog.debug("Empirical ratio value is : (%s)" % (str(RATIO_VALUE)))
    dplog.info("l_opt = %d" % (math.ceil(l_opt)))
    dplog.info(" === Phase 1.1: GetOptSequenceLength End ===")

    return math.ceil(l_opt)
Ejemplo n.º 8
0
def EstimateDistribution( filename, epsilon, noise ):
    '''
    @summary: Perturb the sequence-length counts with Laplace noise.

    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param noise: "True" to apply the Laplace mechanism to the length
        histogram, "False" to return the raw list of sequence lengths

    @return: list of (length, noisy ratio) pairs when noise == "True";
        the raw sequence-length list when noise == "False";
        None for any other flag value (original behavior kept)
    @rtype: list
    '''
    dplog.info( "Get l_opt from file (%s)..." % (filename) )

    total = 0
    length_counts = {}
    sequences = []

    # The original put file.close() after both return statements, so it
    # never ran and the handle leaked; a context manager fixes that.
    with open(filename) as db:
        for raw in db:
            # Skip comment lines.
            if raw.startswith('#') or raw.startswith('//') or raw.startswith('%'):
                continue

            record = raw.strip().split(' ')
            total += 1
            sequences.append( len(record) )

            # 'in' replaces dict.has_key(), which was removed in Python 3.
            if len(record) not in length_counts:
                length_counts[ len(record) ] = 1
            else:
                length_counts[ len(record) ] += 1

    if str(noise) == 'True':
        # (length, count) pairs sorted by length; dict.items() works on
        # both Python 2 and 3, unlike iteritems().
        sortlist = sorted( length_counts.items(), key=lambda k: k[0] )

        # Clamp at 0 so perturbed counts never go negative.
        NoisySeqLengthList = [
            (length, max(0, count + laplace(1 / float(epsilon))))
            for (length, count) in sortlist
        ]

        dplog.debug( '=== Laplace Mechanism ===' )
        dplog.debug( 'epsilon: (%f)' % (epsilon) )
        dplog.debug( 'scale parameter: (%f)' % (1 / float(epsilon)) )

        # Normalize the noisy counts into ratios of the record total.
        NoisySeqLengthRatioList = [
            (length, count / float(total))
            for (length, count) in NoisySeqLengthList
        ]

        return NoisySeqLengthRatioList

    elif str(noise) == 'False':
        return sequences
Ejemplo n.º 9
0
    def load_dump(self, filename):
        '''
        @summary: Load n-grams and their counts from a dump file
            previously written by dump().

        @param filename: dump file to read

        The first line holds the total number of n-grams; each remaining
        line is '<item codes> : <count>'. Item codes are 1-based in the
        file and shifted down by one on load (dec=1). alphabet_size is
        raised to the largest 0-based item seen and mirrored into TERM.
        '''
        # Context manager guarantees the handle is closed; the original
        # never closed this file.
        with open(filename) as dump_file:
            # First line of the dump is the total number of ngrams.
            ngram_num = int(dump_file.readline().strip())

            dplog.debug("ngrams total : %d" % (ngram_num))
            dplog.debug("Loading ngrams file (%s, l_opt=%d)..." %
                        (filename, self.N_max))

            pbar = MyProgressBar('Loading dump', ngram_num)

            for (line_num, line) in enumerate(dump_file):
                # str.partition vs str.split:
                # 'lu:123:qin'.partition(':') -> ('lu', ':', '123:qin')
                # 'lu:123:qin'.split(':')     -> ['lu', '123', 'qin']
                # partition keeps everything after the first ':' intact.
                parts = line.strip().partition(':')
                tokens = parts[0].strip().split()
                self[strToSeq(tokens, dec=1)] = float(parts[2].strip())

                # Track the largest 0-based item code seen so far.
                max_item = max(map(int, tokens)) - 1
                if self.alphabet_size < max_item:
                    self.alphabet_size = max_item

                pbar.update(line_num + 1)

            pbar.finish()

        self.TERM = self.alphabet_size

        dplog.debug("Alphabet size : %d" % (self.alphabet_size))
Ejemplo n.º 10
0
    def load_dump(self, filename):
        '''
        @summary: Load n-grams and their counts from a dump file
            previously written by dump().

        @param filename: dump file to read

        '''
        # NOTE(review): this handle is never closed -- consider a 'with'.
        file = open(filename)

        ngram_num = int(file.readline().strip())    # first line of the dump is the total number of ngrams
        #self.alphabet_size = int(file.readline().strip())

        dplog.debug( "ngrams total : %d" % (ngram_num) )
        dplog.debug( "Loading ngrams file (%s, l_opt=%d)..." % (filename, self.N_max) )

        pbar = MyProgressBar('Loading dump', ngram_num)

        for (line_num, line) in enumerate(file):

            # str.partition vs str.split:
            # line = 'lu:123:qin'
            # 1. line.partition(':')    >>> ('lu', ':', '123:qin')
            # 2. line.split(':')        >>> ['lu', '123', 'qin']
            parts = line.strip().partition(':')
            tokens = parts[0].strip().split()
            self[strToSeq(tokens, dec=1)] = float(parts[2].strip()) 

            # Track the largest 0-based item code seen so far.
            max_item = max(map(int, tokens)) - 1
            if self.alphabet_size < max_item:
                self.alphabet_size = max_item

            pbar.update(line_num + 1)

        pbar.finish()

        # Mirror the alphabet size into the terminator code.
        self.TERM = self.alphabet_size 

        dplog.debug( "Alphabet size : %d" % (self.alphabet_size) )
Ejemplo n.º 11
0
    def parse_sequences(self, filename):
        '''
        @summary: truncate sequences
            - truncates every record of the raw dataset to the optimal
              sequence length self.max_len
            - uses Counter.update with ngram.NGram to accumulate support
              counts for every 1..N_max-gram

        @param filename: sequential database

        @return: n-gram counts of the (truncated) dataset, stored in self
        @rtype: Counter

        @note: assume that locations are numbered from 1 .. max

        '''
        dplog.info( " === Phase 1.2: Truncating Sequence file (%s, l_opt=%d) ===" % ( filename, self.max_len) )

        self.all_record_num = 0
        lines = []

        # First pass: read truncated records and determine the alphabet.
        # The context manager closes the handle even on an exception
        # (the original closed it only on the happy path, at the end).
        with open(filename) as db:
            for line in db:
                # Skip comment lines.
                if line.startswith('#') or line.startswith('//') or line.startswith('%'):
                    continue

                # Truncate each record to at most max_len items.
                lines.append( line.strip().split()[:self.max_len] )

                # Largest item code in this record; relies on items being
                # numbered consecutively from 1, e.g. map(int, '234')
                # -> [2, 3, 4].
                max_item = max(map(int, lines[-1]))

                if self.alphabet_size < max_item:
                    self.alphabet_size = max_item

                self.all_record_num += 1

        # The terminator code equals the alphabet size (one past the
        # largest 0-based item code after the dec=1 shift below).
        self.TERM = self.alphabet_size
        dplog.debug( "Alphabet size : %s" % (str(self.alphabet_size)) )
        dplog.debug( "Termination code : %s" % (str(self.TERM)) )
        dplog.debug( "Number of sequences : (%s)" % (self.all_record_num) )

        pbar = MyProgressBar('Parsing', self.all_record_num + 1)

        # Second pass: count every 1..N_max-gram of each record.
        for (record, line) in enumerate(lines):
            # Encode the record as a string with items shifted down by
            # one (strToSeq(..., dec=1)) and append the terminator
            # symbol self.TERM, e.g. '234' -> u'\x01\x02\x03' + TERM.
            seq = strToSeq(line, dec=1) + unichr(self.TERM)
            for n in range(1, self.N_max+1):
                # ngram.NGram(N=n).ngrams(seq) yields every contiguous
                # window of n items from seq.
                G = ngram.NGram(N=n)

                # Accumulate counts for every gram of size 1..N_max
                # (terminator included).
                self.update(G.ngrams(seq))

            pbar.update(record + 1)
        pbar.finish()
0
    def parse_sequences(self, filename):
        '''
        @summary: truncate sequences
            - truncates the raw dataset to the optimal length self.max_len
            - uses Counter.update & ngram.NGram to store every
              n_max-gram pattern together with its support count

        @param filename: sequential database

        @return: the n_max-grams of the raw dataset and their counts
        @rtype: Counter

        @note: assume that locations are numbered from 1 .. max

        '''
        dplog.info(
            " === Phase 1.2: Truncating Sequence file (%s, l_opt=%d) ===" %
            (filename, self.max_len))

        file = open(filename)

        self.all_record_num = 0
        lines = []

        # First check the alphabet
        for line in file:
            # Skip comment lines.
            if line.startswith('#') or line.startswith(
                    '//') or line.startswith('%'):
                continue

            # Truncate each record to at most max_len items.
            # self.lines.append(line.strip().split()[:self.max_len])
            lines.append(line.strip().split()[:self.max_len])
            ''' 
            max_item: the max-value item in one max_len sequence
            
            需要有前提: 项集编号从1开始,且连续到max
            >>> map(int, '234')
            [2, 3, 4]
            
            '''
            max_item = max(map(int, lines[-1]))

            if self.alphabet_size < max_item:
                self.alphabet_size = max_item

            self.all_record_num += 1

        # be the end point
        self.TERM = self.alphabet_size
        dplog.debug("Alphabet size : %s" % (str(self.alphabet_size)))
        dplog.debug("Termination code : %s" % (str(self.TERM)))
        dplog.debug("Number of sequences : (%s)" % (self.all_record_num))

        pbar = MyProgressBar('Parsing', self.all_record_num + 1)

        # The lines below perform the NGram counting.
        for (record, line) in enumerate(lines):
            ''' 
            >>> strToSeq('234', dec=1)
            u'\x01\x02\x03'
            
            >>> line = '123'
            >>> self.TERM = 3
            u'\x00\x01\x02\x03'

            '''
            # Append the terminator symbol self.TERM to the record.
            seq = strToSeq(line, dec=1) + unichr(self.TERM)
            for i in range(1, self.N_max + 1):
                '''
                (N=i) defines N-gram
                
                Example:
                >>> G = ngram.NGram(N=3)
                >>> a = G.ngrams([u'\x01', u'\x02', u'\x03', u'\x04', u'\x05'])
                >>> print list(a)
                [[u'\x01', u'\x02', u'\x03'], [u'\x02', u'\x03', u'\x04'], [u'\x03', u'\x04', u'\x05']]
                    
                '''
                G = ngram.NGram(N=i)

                # Count every gram of size 1..N_max (terminator included).
                self.update(G.ngrams(seq))

            pbar.update(record + 1)
        pbar.finish()

        file.close()
Ejemplo n.º 13
0
def EstimateDistribution(filename, epsilon, noise):
    '''
    @summary: Perturb the sequence-length counts with Laplace noise.

    @param filename: Sequence DB
    @param epsilon: privacy budget
    @param noise: use Laplace Mechanism or not ("True"/"False")

    @return: noisySeqLengthRatioList when noise == "True", the raw
        sequence-length list when noise == "False"
    @rtype: list
    '''
    dplog.info("Get l_opt from file (%s)..." % (filename))

    file = open(filename)
    total = 0
    dic = {}
    sequences = []

    for line in file:
        # Skip comment lines.
        if line.startswith('#') or line.startswith('//') or line.startswith(
                '%'):
            continue

        line = line.strip().split(' ')
        total += 1
        sequences.append(len(line))

        # NOTE(review): dict.has_key / dict.iteritems below are
        # Python-2-only APIs (removed in Python 3).
        if not dic.has_key(len(line)):
            dic[len(line)] = 1
        else:
            dic[len(line)] += 1

    # (length, count) pairs sorted by length.
    sortlist = sorted(dic.iteritems(), key=lambda k: k[0])
    NoisySeqLengthList = []

    if str(noise) == 'True':
        # dplog.debug( '=== before Laplace Mechanism ===' )
        # dplog.debug( 'sortlist:\n %s' % (str(sortlist)) )

        # Clamp at 0 so noisy counts never go negative.
        NoisySeqLengthList = map(
            lambda x: (x[0], max(0, 0, x[1] + laplace(1 / float(epsilon)))),
            sortlist)

        dplog.debug('=== Laplace Mechanism ===')
        dplog.debug('epsilon: (%f)' % (epsilon))
        dplog.debug('scale parameter: (%f)' % (1 / float(epsilon)))
        # dplog.debug( '=== after Laplace Mechanism ===')
        # dplog.debug( 'Noisy-sortlist :\n%s' % (str(NoisySeqLengthList)) )

        # Normalize the noisy counts into ratios of the record total.
        NoisySeqLengthRatioList = map(lambda x: (x[0], x[1] / float(total)),
                                      NoisySeqLengthList)
        # dplog.debug( 'Noisy-sortlist-Ratio :\n%s' % (str(NoisySeqLengthRatioList)) )

        return NoisySeqLengthRatioList

    elif str(noise) == 'False':
        return sequences

    # NOTE(review): unreachable -- both branches above return first, so
    # the file handle is never closed.
    file.close()