Beispiel #1
0
 def __init__(self, inPutPath, outPutPath, isPaired, readNum,
              bucketIndexLen, lossless, verbose):
     """Initialise decoder state and open the arithmetic-coded input files.

     Stores the constructor arguments, builds the nucleotide lookup tables
     and the adaptive frequency tables, opens ``<inPutPath>.bifurL`` and
     ``<inPutPath>.bifurR`` for binary reading, calls ``openFileLeft()`` /
     ``openFileRight()`` and finally ``openOutFile()``.

     Args:
         inPutPath: path prefix of the compressed input files.
         outPutPath: path prefix from which the output file names are built.
         isPaired: stored as ``self.paired``; presumably marks paired-end
             data -- TODO confirm against the callers.
         readNum: stored as ``self.readNum``.
         bucketIndexLen: used for both ``self.kmerLen`` and ``self.indexLen``.
         lossless: stored as ``self.lossless``.
         verbose: stored as ``self.verbose``.
     """
     self.mutiDict = {}  # k-mers for buckets
     self.sequenceTable = []  # sequences collected for output
     self.kmerLen = bucketIndexLen
     self.indexLen = bucketIndexLen
     self.bucketDict = {}  # nested_dict(2, int)
     self.encodeBucketPath = {}
     # Counters, all starting at zero.
     self.newNodeNum = 0
     self.simpleNodeNum = 0
     self.tipNodeNum = 0
     self.bifurNodeNum = 0
     self.deleteBifurRatio = 0.2
     self.inPutPath = inPutPath
     self.outPutPath = outPutPath
     # Nucleotide lookup tables: base <-> number, base -> complement,
     # base -> 2-bit literal.
     self.dna2num = {"A": 0, "C": 1, "G": 2, "T": 3}
     self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
     self.recdna = {"A": "T", "C": "G", "G": "C", "T": "A"}
     self.dna2bit = {"A": '0b00', "C": '0b01', "G": '0b10', "T": '0b11'}
     # NOTE(review): num2dna is assigned a second time with the same mapping.
     self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
     self.firstSeq = BitStream()
     # NOTE(review): numFlag is overwritten with sream() further down.
     self.numFlag = BitStream()
     # Adaptive frequency tables seeded from flat 3- and 4-symbol tables.
     self.freq3 = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(3))
     self.freq4 = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(4))
     self.freqs = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(4))
     # Despite the "bitout" names these are *input* bit streams; the
     # underlying files are opened here and not closed in this method.
     self.bitoutL = arithmeticcoding.BitInputStream(
         open(self.inPutPath + ".bifurL", "rb"))
     self.bitoutR = arithmeticcoding.BitInputStream(
         open(self.inPutPath + ".bifurR", "rb"))
     # These helpers are defined elsewhere in the class; they run after
     # inPutPath/outPutPath have been set -- keep this ordering.
     self.decodeSeqPathL = self.openFileLeft()
     self.decodeSeqPathR = self.openFileRight()
     self.outFileName = outPutPath + ".dna"
     self.paired = isPaired
     self.seqLen = 0  # length of the current read
     self.outPairFileName = [outPutPath + "_1.dna", outPutPath + "_2.dna"]
     self.outFile = None
     self.outPairFile = None
     self.readNum = readNum
     self.seqence = ""  # encoded sequence
     self.bucketIndex = []  # bucket index
     self.bucketCov = []  # number of reads in each bucket
     self.readIndexPos = []  # index positions in each read
     self.readLen = []
     self.readrc = sream()  # read direction: forward or backward
     self.readN = {
         "flag": sream(),
         "pos": [],
         "l": []
     }  # N bases in reads: indicator flag, positions and lengths
     self.numFlag = sream()  # new-node indicator (replaces the BitStream above)
     self.lossless = lossless
     self.verbose = verbose
     self.openOutFile()  # prepare output file; runs last, after all state is set
 def compress(quantized, output_file):
     """
     Pickle an object and write it to a file with adaptive arithmetic coding.

     The pickled bytes are encoded one at a time against a 257-symbol
     adaptive frequency table (byte values 0-255 plus symbol 256, which is
     written once as the end-of-stream marker).

     Input:
     quantized : picklable object to compress
     output_file : path of the file to create (overwritten if it exists)

     Output:
     None; the compressed stream is written to output_file
     """
     data = pickle.dumps(quantized)
     with open(output_file, "wb") as file:
         bitout = arithmeticcoding.BitOutputStream(file)
         initfreqs = arithmeticcoding.FlatFrequencyTable(257)
         freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
         enc = arithmeticcoding.ArithmeticEncoder(32, bitout)
         # Iterating a bytes object yields ints, which are the symbols.
         for symbol in data:
             enc.write(freqs, symbol)
             freqs.increment(symbol)
         enc.write(freqs, 256)  # EOF
         enc.finish()  # Flush remaining code bits
         # The bit stream buffers bits until a full byte is available;
         # closing it emits the final partial byte. Without this the tail
         # of the code stream never reaches the file. Closing the file a
         # second time when the `with` block exits is harmless.
         bitout.close()
Beispiel #3
0
def decomparess(inputfile, outfile, model):
    """Decode a file written by ``comparess`` back into text.

    The stream is a sequence of (guess count, literal) pairs: ``guess
    count`` characters are regenerated with ``predict`` and then one
    literal character is read. ``MAGIC_EOF`` may appear in either position
    and ends decoding.

    Relies on the module-level names ``AE_SIZE``, ``MAGIC_EOF``, ``maxlen``,
    ``indices_char`` and ``predict``.

    Args:
        inputfile: path of the compressed file.
        outfile: path of the text file to write.
        model: passed through unchanged to ``predict``.
    """
    bitin = arithmeticcoding.BitInputStream(open(inputfile, "rb"))
    try:
        initfreqs = arithmeticcoding.FlatFrequencyTable(AE_SIZE)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        dec = arithmeticcoding.ArithmeticDecoder(bitin)
        prev_chars = []
        i = 0

        def remember(char):
            # Keep only the last `maxlen` characters as prediction context.
            prev_chars.append(char)
            if len(prev_chars) > maxlen:
                prev_chars.pop(0)

        with open(outfile, "w") as out:
            while True:
                guesses = dec.read(freqs)
                if guesses == MAGIC_EOF:
                    break

                print('guesses', guesses)
                freqs.increment(guesses)
                for _ in range(guesses):
                    char = predict(prev_chars, model, indices_char)
                    out.write(char)
                    # The encoder predicts from the real preceding text,
                    # which includes correctly guessed characters, so they
                    # must enter the context here as well.
                    remember(char)

                print("i", i)
                literal = dec.read(freqs)
                if literal == MAGIC_EOF:
                    # The encoder emits a final guess count directly
                    # followed by EOF when the input ends on a run of
                    # correct guesses; EOF must not be written as text.
                    break
                print('lit', chr(literal))
                out.write(chr(literal))
                freqs.increment(literal)
                remember(chr(literal))
                i = i + 1
    finally:
        # Release the input file even if decoding fails part-way.
        bitin.close()
Beispiel #4
0
def get_frequencies(filepath):
    """Count how often each byte value occurs in the file at ``filepath``.

    Returns a 257-entry ``SimpleFrequencyTable``; entries 0-255 hold the byte
    counts and entry 256 is left at 0.
    """
    table = arithmeticcoding.SimpleFrequencyTable([0] * 257)
    with open(filepath, "rb") as src:
        # Read in blocks; iterating a bytes block yields the int byte values.
        for block in iter(lambda: src.read(65536), b""):
            for byte_value in block:
                table.increment(byte_value)
    return table
Beispiel #5
0
	def get_frequencies(self, inp, frequencies, num_symbols):
		"""Wrap ``frequencies`` in a ``SimpleFrequencyTable`` and return it.

		``inp`` and ``num_symbols`` are accepted but not used.
		"""
		return arithmeticcoding.SimpleFrequencyTable(frequencies)
Beispiel #6
0
    def compressTree(self, node, overall_freqs, N):
        """Arithmetic-code a tree breadth-first and return the encoder output.

        Nodes are visited in queue order starting from ``node``:

        * a node with ``v > 1`` writes each non-``None`` child's ``v`` using a
          frequency table from ``ec().binomial_encoder_frequencies`` built on
          the remaining value and ``overall_freqs`` sliced from the number of
          children already written;
        * a node with ``v == 1`` writes the ``c`` of each child whose ``v`` is
          1, using ``overall_freqs`` directly.

        Args:
            node: root node; nodes expose ``v``, ``c`` and ``childNodes``.
            overall_freqs: sequence of frequencies.
            N: accepted but not used here.

        Returns:
            Whatever ``ArithmeticEncoder.finish()`` returns.
        """
        enc = arithmeticcoding.ArithmeticEncoder()
        pending = deque([node])
        while pending:
            current = pending.popleft()
            if current.v > 1:
                remaining = current.v
                written = 0  # number of children encoded so far
                for child in current.childNodes:
                    # Skip absent children and stop encoding once the
                    # parent's value has been fully distributed.
                    if child is None or remaining <= 0:
                        continue
                    pending.append(child)
                    binomial_frequencies = ec().binomial_encoder_frequencies(
                        overall_freqs[written:], remaining)
                    freqs = arithmeticcoding.SimpleFrequencyTable(
                        binomial_frequencies)
                    enc.write(freqs, child.v)
                    remaining -= child.v
                    written += 1
            elif current.v == 1:
                for child in current.childNodes:
                    if child is not None and child.v == 1:
                        pending.append(child)
                        freqs = arithmeticcoding.SimpleFrequencyTable(
                            overall_freqs)
                        enc.write(freqs, child.c)

        return enc.finish()
Beispiel #7
0
def read_frequencies(bitin):
	"""Read 256 big-endian 32-bit counts from ``bitin`` into a frequency table.

	A 257th entry with frequency 1 is appended for the EOF symbol.
	"""
	def next_uint32():
		# Assemble the value most-significant bit first.
		value = 0
		for _ in range(32):
			value = (value << 1) | bitin.read_no_eof()
		return value

	counts = []
	for _ in range(256):
		counts.append(next_uint32())
	counts.append(1)  # EOF symbol
	return arithmeticcoding.SimpleFrequencyTable(counts)
Beispiel #8
0
def decompress(bitin, out):
    """Decode adaptively arithmetic-coded bytes from ``bitin`` into ``out``.

    Symbols are read against a 257-entry adaptive table until symbol 256
    (EOF) is seen; every other symbol is written to ``out`` as one byte.
    """
    model = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(257))
    decoder = arithmeticcoding.ArithmeticDecoder(32, bitin)
    # iter(callable, sentinel) stops as soon as the EOF symbol is decoded.
    for byte_value in iter(lambda: decoder.read(model), 256):
        out.write(bytes((byte_value, )))
        model.increment(byte_value)
Beispiel #9
0
def generate_freqs(pro, first_step=False, resolution=1e9):
    """Build a frequency table over ``characters`` plus one trailing symbol.

    Every symbol starts with frequency 1. Unless ``first_step`` is exactly
    ``False``-not-set (i.e. it is anything other than ``False``), nothing
    else happens; otherwise each ``pro[i] * resolution`` that exceeds 1 after
    conversion to ``np.int64`` replaces the frequency of symbol ``i``.
    """
    n_chars = len(characters)
    table = arithmeticcoding.SimpleFrequencyTable([0] * (1 + n_chars))
    for idx in range(n_chars):
        table.set(idx, 1)
    if first_step is False:
        for idx in range(pro.shape[0]):
            scaled = (pro[idx] * resolution).astype(np.int64)
            if scaled > 1:
                table.set(idx, scaled)
    table.set(n_chars, 1)  # \n
    return table
def compress(inp, bitout):
    """Arithmetic-code every byte of ``inp`` into ``bitout``.

    Uses a 257-entry adaptive table (bytes 0-255 plus 256 as EOF), writes the
    EOF symbol after the last byte and finishes the encoder. ``bitout`` is
    not closed here.
    """
    model = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(257))
    encoder = arithmeticcoding.ArithmeticEncoder(32, bitout)
    chunk = inp.read(1)
    while len(chunk) != 0:
        byte_value = chunk[0]
        encoder.write(model, byte_value)
        model.increment(byte_value)
        chunk = inp.read(1)
    encoder.write(model, 256)  # EOF
    encoder.finish()  # Flush remaining code bits
Beispiel #11
0
def comparess(file1, model, indices_char):
    """Compress the text file ``file1`` into ``file1 + '.comp'``.

    The output is a sequence of (guess count, literal) pairs. The first
    ``maxlen`` characters are each written as a zero count plus the literal.
    After that ``predict`` is asked for the next character; correct guesses
    are only counted (up to 255), and a miss writes the count followed by
    the actual character. A non-zero count left over at the end is written
    before ``MAGIC_EOF``.

    Relies on the module-level names ``AE_SIZE``, ``MAGIC_EOF``, ``maxlen``
    and ``predict``.

    Original author's note: this is painfully slow, on the order of .02
    seconds per character guess (~16 minutes for a 50k character file); if
    possible it should be revised to run mostly on the GPU.

    Args:
        file1: path of the text file to compress.
        model: passed through unchanged to ``predict``.
        indices_char: passed through unchanged to ``predict``.
    """
    with open(file1, 'r') as src:
        f1 = src.read()

    bitout = arithmeticcoding.BitOutputStream(open(file1 + '.comp', "wb"))
    try:
        initfreqs = arithmeticcoding.FlatFrequencyTable(AE_SIZE)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        enc = arithmeticcoding.ArithmeticEncoder(bitout)
        guesses_right = 0

        for i, actual in enumerate(f1):
            current = ord(actual)
            if i < maxlen:
                # Always 'guessing' zero correctly before maxlen
                enc.write(freqs, 0)
                freqs.increment(0)
                enc.write(freqs, current)
                freqs.increment(current)
            else:
                guess = predict(f1[(i - maxlen):i], model, indices_char)
                if actual == guess and guesses_right < 255:
                    guesses_right += 1
                    print("Guessed", actual, "correctly")
                else:
                    enc.write(freqs, guesses_right)
                    print("Wrong guess. Outputing", guesses_right,
                          "correct guesses")
                    freqs.increment(guesses_right)
                    print(i, "Outputing char", current)
                    enc.write(freqs, current)
                    freqs.increment(current)
                    guesses_right = 0

            if i % 100 == 0:
                print("i:", i)

        if guesses_right > 0:
            enc.write(freqs, guesses_right)
            # The decoder updates its model after every guess count it
            # reads, so the encoder must do the same here or the EOF symbol
            # is coded against a different table.
            freqs.increment(guesses_right)
        enc.write(freqs, MAGIC_EOF)
        print("out eof sanity check")
        enc.finish()
    finally:
        # Always release the output file, including on failure.
        bitout.close()
 def decompress(input_file):
     """Decode an arithmetic-coded file and unpickle the recovered bytes.

     Symbols are read against a 257-entry adaptive table until symbol 256
     (EOF); the collected bytes are passed to ``pickle.loads``.
     """
     payload = bytearray()
     with open(input_file, "rb") as inp:
         bitin = arithmeticcoding.BitInputStream(inp)
         model = arithmeticcoding.SimpleFrequencyTable(
             arithmeticcoding.FlatFrequencyTable(257))
         dec = arithmeticcoding.ArithmeticDecoder(32, bitin)
         symbol = dec.read(model)
         while symbol != 256:  # 256 is the EOF symbol
             payload.append(symbol)
             model.increment(symbol)
             symbol = dec.read(model)
     # NOTE(review): pickle.loads can execute arbitrary code; only call this
     # on files from a trusted source.
     return pickle.loads(payload)
Beispiel #13
0
 def __init__(self, path, ispaired, kmerLen, verbose, sequenceTable):
     """Initialise encoder state and open the arithmetic-coded output files.

     Stores the constructor arguments, builds the nucleotide lookup tables
     and the adaptive frequency tables, opens ``<path>.bifurL`` and
     ``<path>.bifurR`` for binary writing, calls ``openFileLeft()`` /
     ``openFileRight()`` and finally ``removeOutputFile()``.

     Args:
         path: output path prefix, stored as ``self.outPutPath``.
         ispaired: stored as ``self.paired``; presumably marks paired-end
             data -- TODO confirm against the callers.
         kmerLen: used for both ``self.kmerLen`` and ``self.indexLen``.
         verbose: stored as ``self.verbose``.
         sequenceTable: stored as ``self.sequenceTable`` (not copied).
     """
     self.mutiDict = {}  # k-mers for buckets
     self.sequenceTable = sequenceTable
     self.kmerLen = kmerLen
     self.indexLen = kmerLen
     self.paired = ispaired
     self.seqLen = 0
     #self.bucketDict = defaultdict(lambda : defaultdict(dict))
     self.bucketDict = {}  # nested_dict(2, int)
     self.encodeBucketPath = {}
     # Counters, all starting at zero.
     self.newNodeNum = 0
     self.simpleNodeNum = 0
     self.tipNodeNum = 0
     self.bifurNodeNum = 0
     self.deleteBifurRatio = 0.2
     self.outPutPath = path
     # Nucleotide lookup tables: base <-> number, base -> 2-bit literal.
     self.dna2num = {"A": 0, "C": 1, "G": 2, "T": 3}
     self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
     self.dna2bit = {"A": '0b00', "C": '0b01', "G": '0b10', "T": '0b11'}
     # NOTE(review): num2dna is assigned a second time with the same mapping.
     self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
     self.firstSeq = BitStream()
     self.numFlag = BitStream()
     # Adaptive frequency tables seeded from flat 3- and 4-symbol tables.
     self.freq3 = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(3))
     self.freq4 = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(4))
     self.freqs = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(4))
     # Output bit streams; the underlying files are opened (and truncated)
     # here and are not closed in this method.
     self.bitoutL = arithmeticcoding.BitOutputStream(
         open(self.outPutPath + ".bifurL", "wb"))
     self.bitoutR = arithmeticcoding.BitOutputStream(
         open(self.outPutPath + ".bifurR", "wb"))
     # These helpers are defined elsewhere in the class; they run after
     # outPutPath has been set -- keep this ordering.
     self.encodeSeqPathL = self.openFileLeft()
     self.encodeSeqPathR = self.openFileRight()
     self.verbose = verbose
     self.removeOutputFile()  # defined elsewhere; runs last, after the files above are opened
    def decompress_next(self, new_freq_table_256):
        """Decode one symbol, optionally installing a new frequency table first.

        When ``new_freq_table_256`` is a list or set, its entries plus one
        extra entry of frequency 1 become ``self.freqsTable``; any other value
        leaves the current table in place. The decoded symbol is written to
        ``self.out`` as a single byte when it is below 256, and is returned
        in every case.
        """
        if isinstance(new_freq_table_256, (list, set)):
            extended = [*new_freq_table_256, 1]
            self.freqsTable = arithmeticcoding.SimpleFrequencyTable(extended)

        decoded = self.decoder.read(self.freqsTable)
        if decoded < 256:
            self.out.write(bytes((decoded, )))
        return decoded
Beispiel #15
0
def compress(snp, numsymbol):
	"""Arithmetic-code a 3-D symbol array and return ``enc.bit_nums``.

	``snp`` is squeezed and must then have exactly three axes. Symbols are
	visited with the last axis outermost, then the first, then the second,
	against an adaptive table of ``numsymbol`` entries. ``numsymbol - 1`` is
	written last as the EOF symbol.
	"""
	model = arithmeticcoding.SimpleFrequencyTable(
		arithmeticcoding.FlatFrequencyTable(numsymbol))
	enc = arithmeticcoding.ArithmeticEncoder(32)

	grid = np.squeeze(snp)
	height, width, depth = grid.shape

	for c in range(depth):
		plane = grid[:, :, c]
		for i in range(height):
			for j in range(width):
				symbol = plane[i, j]
				enc.write(model, symbol)
				model.increment(symbol)
	enc.write(model, numsymbol - 1)  # EOF
	enc.finish()  # Flush remaining code bits
	return enc.bit_nums
def read_frequencies(bitin):
    """Read 256 counts via ``read_int(bitin, 32)`` and build a frequency table.

    A 257th entry with frequency 1 is appended for the EOF symbol.
    """
    counts = [read_int(bitin, 32) for _ in range(256)]
    counts.append(1)  # EOF symbol
    return arithmeticcoding.SimpleFrequencyTable(counts)
Beispiel #17
0
		def __init__(self, symbols, hassubctx):
			"""Create a zeroed frequency table and, optionally, subcontext slots.

			``frequencies`` gets ``symbols`` entries, all zero. ``subcontexts``
			is a list of ``symbols`` ``None`` values when ``hassubctx`` is
			truthy, otherwise ``None``.
			"""
			zero_counts = [0] * symbols
			self.frequencies = arithmeticcoding.SimpleFrequencyTable(zero_counts)
			if hassubctx:
				self.subcontexts = [None] * symbols
			else:
				self.subcontexts = None
Beispiel #18
0
    def inferenceNN(self, x, M, N, overall_freqs, L, activationFunction):
        """Decode a compressed tree from ``L`` and accumulate the output ``y``.

        Values are decoded breadth-first with ``ArithmeticDecoder(L)``; every
        decoded value is also re-encoded with a fresh ``ArithmeticEncoder``.
        Each decoded child adds ``uc().index_to_weight(c) * x[level]`` to the
        matching entries of ``y``. A running per-entry size estimate
        (``floor(2 * log2(v + 1) + 1)``) of the queue is tracked to report
        its average and maximum.

        Original author's note: N is the number of hidden nodes, the weights
        are of dimension MxN.

        Args:
            x: input values, indexed by ``level``.
            M: decoding stops once ``level`` reaches ``M``.
            N: length of ``y`` and the initial queue entry.
            overall_freqs: frequencies used to build the decoding tables.
            L: passed to ``ArithmeticDecoder``.
            activationFunction: ``'ReLU'`` or ``'sigmoid'`` applies the
                matching ``uc()`` method; any other value leaves ``y`` as is.

        Returns:
            ``(y, avg_queue_length, max_queue_length)`` with ``y`` as a NumPy
            array.
        """
        y = [0] * N
        enc = arithmeticcoding.ArithmeticEncoder()
        dec = arithmeticcoding.ArithmeticDecoder(L)
        q = deque([N])
        self.w = 0
        initial_length = floor(2 * log2(N + 1) + 1)
        tot_queue_length = initial_length
        max_queue_length = initial_length
        current_queue_length = initial_length
        j = 0
        level = 0
        flag = 0
        flagp = 0
        k = len(overall_freqs)
        print('M:', M, 'N:', N)
        while len(q) != 0 and level < M:
            currentNodeValue = q.popleft()
            current_queue_length -= floor(2 * log2(currentNodeValue + 1) + 1)

            if flagp == 0:
                print('current_queue_length', current_queue_length)
                flagp = 1
            if currentNodeValue > 1:
                c = 0  # colour initialized with 0
                # kth colour need not be encoded
                while c <= k - 1 and currentNodeValue > 0:
                    binomial_frequencies = ec().binomial_encoder_frequencies(
                        overall_freqs[c:], currentNodeValue)
                    freqs = arithmeticcoding.SimpleFrequencyTable(
                        binomial_frequencies)
                    childNodeValue = dec.read(freqs)
                    enc.write(freqs, childNodeValue)
                    currentNodeValue -= childNodeValue
                    q.append(childNodeValue)
                    current_queue_length += floor(
                        2 * log2(childNodeValue + 1) + 1)
                    max_queue_length = max(max_queue_length,
                                           current_queue_length)
                    tot_queue_length += current_queue_length
                    self.w += 1
                    if childNodeValue > 0:
                        flag = 1
                    for i in range(childNodeValue):
                        y[j + i] += uc().index_to_weight(c) * x[level]
                    c = c + 1
                    j = (j + childNodeValue) % N
                    if j == 0 and flag:
                        level = level + 1
                        flag = 0
            elif currentNodeValue == 1:
                freqs = arithmeticcoding.SimpleFrequencyTable(overall_freqs)
                c = dec.read(freqs)
                enc.write(freqs, c)
                q.append(1)
                current_queue_length += 3
                max_queue_length = max(max_queue_length, current_queue_length)
                tot_queue_length += current_queue_length

                self.w += 1
                # A value-1 node covers exactly one output slot, position j.
                # The previous `y[j + i]` relied on a loop variable left over
                # from the branch above (or undefined when that branch never
                # ran, e.g. N == 1).
                y[j] += uc().index_to_weight(c) * x[level]
                j = (j + 1) % N
                if j == 0:
                    level += 1

        # Nothing is decoded when M or N is 0; avoid dividing by zero then.
        avg_queue_length = tot_queue_length / self.w if self.w else 0.0

        enc.finish()  # result (re-encoded stream) is not used here
        y = np.array(y)
        if activationFunction == 'ReLU':
            y = uc().ReLU(y)
        elif activationFunction == 'sigmoid':
            y = uc().sigmoid(y)
        return y, avg_queue_length, max_queue_length
    def read_frequencies(self, frequencies):
        """Wrap ``frequencies`` in a ``SimpleFrequencyTable`` and return it."""
        table = arithmeticcoding.SimpleFrequencyTable(frequencies)
        return table
    def compress_next(self, new_freq_table_256, symbol_number):
        """Encode one symbol, optionally installing a new frequency table first.

        When ``new_freq_table_256`` is a list or set, its entries plus one
        extra entry of frequency 1 become ``self.freqsTable``; any other value
        leaves the current table in place. ``symbol_number`` is then written
        with ``self.encoder`` against ``self.freqsTable``.
        """
        if isinstance(new_freq_table_256, (list, set)):
            extended = [*new_freq_table_256, 1]
            self.freqsTable = arithmeticcoding.SimpleFrequencyTable(extended)

        self.encoder.write(self.freqsTable, symbol_number)


## Returns a frequency table based on the bytes in the given file.
## Also contains an extra entry for symbol 256, whose frequency is set to 0.
# def get_frequencies(self, filepath):
#    freqs = arithmeticcoding.SimpleFrequencyTable([0] * 257)
#    with open(filepath, "rb") as input:
#        while True:
#            b = input.read(1)
#            if len(b) == 0:
#                break
#            freqs.increment(b[0])
#    return freqs

# def write_frequencies(self, bitout, freqs):
#    for i in range(256):
#        write_int(bitout, 32, freqs.get(i))

# def compress(self, freqs, inp, bitout):
#    enc = arithmeticcoding.ArithmeticEncoder(32, bitout)
#    while True:
#        symbol = inp.read(1)
#        if len(symbol) == 0:
#            break
#        enc.write(freqs, symbol[0])
#    enc.write(freqs, 256)  # EOF
#    enc.finish()  # Flush remaining code bits

## Writes an unsigned integer of the given bit width to the given stream.
#def write_int(bitout, numbits, value):
#    for i in reversed(range(numbits)):
#        bitout.write((value >> i) & 1)  # Big endian
#
#
## Command line main application function.
#def main(args):
#    # Handle command line arguments
#    if len(args) != 2:
#        sys.exit("Usage: python arithmeticcompress.py InputFile OutputFile")
#    inputfile, outputfile = args
#
#    # Read input file once to compute symbol frequencies
#    freqs = get_frequencies(inputfile)
#    freqs.increment(256)  # EOF symbol gets a frequency of 1
#
#    # Read input file again, compress with arithmetic coding, and write output file
#    with open(inputfile, "rb") as inp, \
#            contextlib.closing(arithmeticcoding.BitOutputStream(open(outputfile, "wb"))) as bitout:
#        write_frequencies(bitout, freqs)
#        compress(freqs, inp, bitout)
#
#
## Main launcher
#if __name__ == "__main__":
#    main(sys.argv[1:])