Example #1
0
 def __init__(self, inPutPath, outPutPath, isPaired, readNum,
              bucketIndexLen, lossless, verbose):
     """Initialise decoder state and open the encoded input streams.

     Opens ``inPutPath + ".bifurL"`` and ``inPutPath + ".bifurR"`` for
     binary reading, wraps each in an arithmetic-coding bit input stream,
     calls ``self.openFileLeft()`` / ``self.openFileRight()``, and finishes
     by calling ``self.openOutFile()``.

     Args:
         inPutPath: Path prefix of the encoded input files.
         outPutPath: Path prefix used to build the ``.dna`` output names.
         isPaired: Stored as ``self.paired``.
         readNum: Stored as ``self.readNum``.
         bucketIndexLen: Stored as both ``self.kmerLen`` and
             ``self.indexLen``.
         lossless: Stored as ``self.lossless``.
         verbose: Stored as ``self.verbose``.
     """
     self.mutiDict = {}  # k-mers for buckets
     self.sequenceTable = []  # store sequence for output
     # Both lengths are initialised from the same constructor argument.
     self.kmerLen = bucketIndexLen
     self.indexLen = bucketIndexLen
     self.bucketDict = {}  #nested_dict(2, int)
     self.encodeBucketPath = {}
     # Node counters, all starting at zero.
     self.newNodeNum = 0
     self.simpleNodeNum = 0
     self.tipNodeNum = 0
     self.bifurNodeNum = 0
     self.deleteBifurRatio = 0.2
     self.inPutPath = inPutPath
     self.outPutPath = outPutPath
     # Lookup tables between nucleotide letters, integers, complements
     # and two-bit string codes.
     self.dna2num = {"A": 0, "C": 1, "G": 2, "T": 3}
     self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
     self.recdna = {"A": "T", "C": "G", "G": "C", "T": "A"}
     self.dna2bit = {"A": '0b00', "C": '0b01', "G": '0b10', "T": '0b11'}
     # NOTE(review): num2dna is assigned a second time with the same value.
     self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
     self.firstSeq = BitStream()
     # NOTE(review): numFlag is reassigned further down with sream().
     self.numFlag = BitStream()
     # Adaptive frequency tables seeded from flat tables of 3 and 4 symbols.
     self.freq3 = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(3))
     self.freq4 = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(4))
     self.freqs = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(4))
     # Despite the "bitout" names these are bit INPUT streams over the
     # ".bifurL" / ".bifurR" files; the file objects are not closed here.
     self.bitoutL = arithmeticcoding.BitInputStream(
         open(self.inPutPath + ".bifurL", "rb"))
     self.bitoutR = arithmeticcoding.BitInputStream(
         open(self.inPutPath + ".bifurR", "rb"))
     self.decodeSeqPathL = self.openFileLeft()
     self.decodeSeqPathR = self.openFileRight()
     self.outFileName = outPutPath + ".dna"
     self.paired = isPaired
     self.seqLen = 0  # length for current read
     self.outPairFileName = [outPutPath + "_1.dna", outPutPath + "_2.dna"]
     self.outFile = None
     self.outPairFile = None
     self.readNum = readNum
     self.seqence = ""  # encoded sequence
     self.bucketIndex = []  # bucket index
     self.bucketCov = []  # reads number in bucket
     self.readIndexPos = []  # index positions in each read
     self.readLen = []
     # NOTE(review): `sream` is not defined in the visible source; it may be
     # a helper or a typo for a stream class -- confirm.
     self.readrc = sream()  # read in forward or backward
     self.readN = {
         "flag": sream(),
         "pos": [],
         "l": []
     }  # N in read: indicator flag, positions and lengths
     self.numFlag = sream()  # new nodes indicator (replaces the BitStream above)
     self.lossless = lossless
     self.verbose = verbose
     self.openOutFile()  # prepare output file
 def compress(quantized, output_file):
     """
     Pickle an object and write it to a file with adaptive arithmetic coding.

     The pickled bytes are encoded one at a time against a 257-symbol
     adaptive frequency table: byte values 0-255 plus symbol 256, which is
     written last as the end-of-stream marker.

     Input:
     quantized : picklable object to compress
     output_file : path of the file to create (or overwrite)

     Output:
     None; the encoded data is written to output_file
     """
     data = pickle.dumps(quantized)
     with open(output_file, "wb") as file:
         bitout = arithmeticcoding.BitOutputStream(file)
         initfreqs = arithmeticcoding.FlatFrequencyTable(257)
         freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
         enc = arithmeticcoding.ArithmeticEncoder(32, bitout)
         for symbol in data:
             # Encode one byte, then let the model learn it.
             enc.write(freqs, symbol)
             freqs.increment(symbol)
         enc.write(freqs, 256)  # EOF
         enc.finish()  # Flush remaining code bits
         # The bit stream buffers a partially filled byte; closing it pads
         # and writes that byte. Without this the tail of the code is lost.
         # Closing also closes `file`; the enclosing `with` then closes it
         # again, which is a harmless no-op.
         bitout.close()
Example #3
0
def decomparess(inputfile, outfile, model):
    """Decode a file produced by the matching model-assisted encoder.

    The stream alternates between a count of correctly predicted characters
    and one literal character code, and ends with ``MAGIC_EOF``. For each
    count, that many characters are regenerated by calling ``predict`` on
    the most recent ``maxlen`` output characters.

    Relies on the module-level names ``AE_SIZE``, ``MAGIC_EOF``, ``maxlen``,
    ``indices_char`` and ``predict``.

    Args:
        inputfile: Path of the compressed file to read.
        outfile: Path of the text file to write.
        model: Passed through to ``predict``.
    """
    bitin = arithmeticcoding.BitInputStream(open(inputfile, "rb"))
    try:
        initfreqs = arithmeticcoding.FlatFrequencyTable(AE_SIZE)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        dec = arithmeticcoding.ArithmeticDecoder(bitin)
        prev_chars = []
        i = 0

        with open(outfile, "w") as out:
            while True:
                guesses = dec.read(freqs)
                if guesses == MAGIC_EOF:
                    break

                print('guesses', guesses)
                freqs.increment(guesses)
                for _ in range(guesses):
                    char = predict(prev_chars, model, indices_char)
                    out.write(char)
                    # The encoder predicts from the true preceding text,
                    # which includes correctly guessed characters, so they
                    # must enter the context window here as well.
                    prev_chars.append(char)
                    if len(prev_chars) > maxlen:
                        prev_chars.pop(0)

                print("i", i)
                literal = dec.read(freqs)
                if literal == MAGIC_EOF:
                    # A trailing run of correct guesses is followed directly
                    # by the EOF marker instead of a literal.
                    break
                print('lit', chr(literal))
                out.write(chr(literal))
                freqs.increment(literal)
                prev_chars.append(chr(literal))
                if len(prev_chars) > maxlen:
                    prev_chars.pop(0)
                i = i + 1
    finally:
        # Release the input file even if decoding or prediction fails.
        bitin.close()
 def start(self, dictionary_size=256):
     """Open the output file and set up the arithmetic encoder.

     Stores ``dictionary_size``, opens ``self.outputfile`` for binary
     writing behind a bit output stream, and creates a flat frequency
     table with ``dictionary_size + 1`` symbols plus a 32-bit encoder.
     """
     self.dictionary_size = dictionary_size
     sink = open(self.outputfile, "wb")
     self.bitout = arithmeticcoding.BitOutputStream(sink)
     # A non-uniform SimpleFrequencyTable seed was tried here previously:
     #   SimpleFrequencyTable([float(i % 8 + 1) for i in range(self.dictionary_size + 1)])
     symbol_count = self.dictionary_size + 1
     self.freqsTable = arithmeticcoding.FlatFrequencyTable(symbol_count)
     state_bits = 32
     self.encoder = arithmeticcoding.ArithmeticEncoder(state_bits, self.bitout)
 def start(self, dictionary_size=256):
     """Open the input/output files and set up the arithmetic decoder.

     Stores ``dictionary_size``, opens ``self.inputfile`` (binary read)
     and ``self.outputfile`` (binary write), wraps the input in a bit
     input stream, and creates a flat frequency table with
     ``dictionary_size + 1`` symbols plus a 32-bit decoder.
     """
     self.dictionary_size = dictionary_size
     self.inp = open(self.inputfile, "rb")
     self.out = open(self.outputfile, "wb")
     self.bitin = arithmeticcoding.BitInputStream(self.inp)
     # A non-uniform SimpleFrequencyTable seed was tried here previously:
     #   SimpleFrequencyTable([float(i % 8 + 1) for i in range(self.dictionary_size + 1)])
     symbol_count = self.dictionary_size + 1
     self.freqsTable = arithmeticcoding.FlatFrequencyTable(symbol_count)
     state_bits = 32
     self.decoder = arithmeticcoding.ArithmeticDecoder(state_bits, self.bitin)
Example #6
0
def decompress(bitin, out):
    """Decode an adaptive arithmetic-coded stream into raw bytes.

    Symbols are read from ``bitin`` against a 257-entry adaptive table;
    each decoded byte is written to ``out`` and then counted in the table.
    Decoding stops at symbol 256, the end-of-stream marker. Neither
    stream is closed here.
    """
    eof_symbol = 256
    freqs = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(eof_symbol + 1))
    dec = arithmeticcoding.ArithmeticDecoder(32, bitin)

    symbol = dec.read(freqs)
    while symbol != eof_symbol:
        out.write(bytes((symbol, )))
        freqs.increment(symbol)  # learn the byte before decoding the next
        symbol = dec.read(freqs)
Example #7
0
	def __init__(self, order, symbollimit, escapesymbol):
		"""Set up the model for the given order and symbol alphabet.

		Raises ValueError unless ``order >= -1``, ``symbollimit > 0`` and
		``0 <= escapesymbol < symbollimit``.

		For ``order >= 0`` a root context is created (with ``order >= 1``
		passed as its second argument) and the escape symbol is counted
		once in its frequencies; for order -1 there is no root context.
		A flat frequency table over ``symbollimit`` symbols is always
		created as ``order_minus1_freqs``.
		"""
		# Checks run in the same order as a single short-circuit test would.
		if order < -1:
			raise ValueError()
		if symbollimit <= 0:
			raise ValueError()
		if not (0 <= escapesymbol < symbollimit):
			raise ValueError()

		self.model_order = order
		self.symbol_limit = symbollimit
		self.escape_symbol = escapesymbol

		if order < 0:
			self.root_context = None
		else:
			self.root_context = PpmModel.Context(symbollimit, order >= 1)
			self.root_context.frequencies.increment(escapesymbol)

		flat_table = arithmeticcoding.FlatFrequencyTable(symbollimit)
		self.order_minus1_freqs = flat_table
def compress(inp, bitout):
    """Arithmetic-code everything readable from ``inp`` into ``bitout``.

    Bytes are read one at a time, encoded against a 257-entry adaptive
    frequency table, and then counted in that table. Symbol 256 is written
    as the end-of-stream marker before the encoder is finished. Neither
    stream is closed here.
    """
    eof_symbol = 256
    freqs = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(eof_symbol + 1))
    enc = arithmeticcoding.ArithmeticEncoder(32, bitout)

    chunk = inp.read(1)
    while len(chunk) != 0:
        byte = chunk[0]
        enc.write(freqs, byte)
        freqs.increment(byte)
        chunk = inp.read(1)

    enc.write(freqs, eof_symbol)  # end-of-stream marker
    enc.finish()  # emit the remaining code bits
Example #9
0
def comparess(file1, model, indices_char):
    """Compress a text file to ``file1 + '.comp'`` using model predictions.

    The output stream alternates between a count of consecutive correct
    predictions (capped at 255) and the code of the next literal character,
    and ends with ``MAGIC_EOF``. For the first ``maxlen`` characters no
    prediction is made, so a count of 0 precedes each literal.

    Relies on the module-level names ``AE_SIZE``, ``MAGIC_EOF``, ``maxlen``
    and ``predict``.

    Args:
        file1: Path of the text file to compress.
        model: Passed through to ``predict``.
        indices_char: Passed through to ``predict``.

    Returns:
        None.
    """
    # This is painfully slow: on the order of .02 seconds per character
    # guess, i.e. ~16 minutes for a 50k character file. If at all possible
    # it should be revised so that it mostly runs on the GPU.

    with open(file1, 'r') as src:
        f1 = src.read()

    bitout = arithmeticcoding.BitOutputStream(open(file1 + '.comp', "wb"))
    try:
        initfreqs = arithmeticcoding.FlatFrequencyTable(AE_SIZE)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        enc = arithmeticcoding.ArithmeticEncoder(bitout)
        guesses_right = 0

        for i, char in enumerate(f1):
            current = ord(char)
            if i < maxlen:
                # Always 'guessing' zero correctly before maxlen
                enc.write(freqs, 0)
                freqs.increment(0)
                enc.write(freqs, current)
                freqs.increment(current)
            else:
                guess = predict(f1[(i - maxlen):i], model, indices_char)
                if (char == guess and guesses_right < 255):
                    guesses_right += 1
                    print("Guessed", char, "correctly")
                else:
                    enc.write(freqs, guesses_right)
                    print("Wrong guess. Outputing", guesses_right,
                          "correct guesses")
                    freqs.increment(guesses_right)
                    print(i, "Outputing char", current)
                    enc.write(freqs, current)
                    freqs.increment(current)
                    guesses_right = 0

            if (i % 100 == 0): print("i:", i)

        if guesses_right > 0:
            enc.write(freqs, guesses_right)
            # The decoder updates its table after every guess count, so the
            # final count must be counted here too; otherwise MAGIC_EOF is
            # coded against a table the decoder does not share.
            freqs.increment(guesses_right)
        enc.write(freqs, MAGIC_EOF)
        print("out eof sanity check")
        enc.finish()
    finally:
        # Flush and release the output file even if prediction fails.
        bitout.close()
    return None
 def decompress(input_file):
     """Decode an arithmetic-coded file and unpickle its contents.

     Bytes are decoded against a 257-entry adaptive frequency table until
     symbol 256 (end of stream) is read; the collected bytes are then
     passed to ``pickle.loads`` and the resulting object is returned.

     NOTE(review): ``pickle.loads`` can execute arbitrary code, so this
     should only be used on files from a trusted source.
     """
     payload = bytearray()
     with open(input_file, "rb") as inp:
         bitin = arithmeticcoding.BitInputStream(inp)
         freqs = arithmeticcoding.SimpleFrequencyTable(
             arithmeticcoding.FlatFrequencyTable(257))
         dec = arithmeticcoding.ArithmeticDecoder(32, bitin)

         symbol = dec.read(freqs)
         while symbol != 256:  # 256 marks the end of the stream
             payload.append(symbol)
             freqs.increment(symbol)
             symbol = dec.read(freqs)
     return pickle.loads(payload)
Example #11
0
 def __init__(self, path, ispaired, kmerLen, verbose, sequenceTable):
     """Initialise encoder state and open the encoded output streams.

     Opens ``path + ".bifurL"`` and ``path + ".bifurR"`` for binary
     writing, wraps each in an arithmetic-coding bit output stream, calls
     ``self.openFileLeft()`` / ``self.openFileRight()``, and finishes by
     calling ``self.removeOutputFile()``.

     Args:
         path: Output path prefix, stored as ``self.outPutPath``.
         ispaired: Stored as ``self.paired``.
         kmerLen: Stored as both ``self.kmerLen`` and ``self.indexLen``.
         verbose: Stored as ``self.verbose``.
         sequenceTable: Stored as ``self.sequenceTable``.
     """
     self.mutiDict = {}  # k-mers for buckets
     self.sequenceTable = sequenceTable
     # Both lengths are initialised from the same constructor argument.
     self.kmerLen = kmerLen
     self.indexLen = kmerLen
     self.paired = ispaired
     self.seqLen = 0
     #self.bucketDict = defaultdict(lambda : defaultdict(dict))
     self.bucketDict = {}  #nested_dict(2, int)
     self.encodeBucketPath = {}
     # Node counters, all starting at zero.
     self.newNodeNum = 0
     self.simpleNodeNum = 0
     self.tipNodeNum = 0
     self.bifurNodeNum = 0
     self.deleteBifurRatio = 0.2
     self.outPutPath = path
     # Lookup tables between nucleotide letters, integers and two-bit
     # string codes.
     self.dna2num = {"A": 0, "C": 1, "G": 2, "T": 3}
     self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
     self.dna2bit = {"A": '0b00', "C": '0b01', "G": '0b10', "T": '0b11'}
     # NOTE(review): num2dna is assigned a second time with the same value.
     self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
     self.firstSeq = BitStream()
     self.numFlag = BitStream()
     # Adaptive frequency tables seeded from flat tables of 3 and 4 symbols.
     self.freq3 = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(3))
     self.freq4 = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(4))
     self.freqs = arithmeticcoding.SimpleFrequencyTable(
         arithmeticcoding.FlatFrequencyTable(4))
     # Bit output streams over the ".bifurL" / ".bifurR" files; opening in
     # "wb" mode truncates existing files. The files are not closed here.
     self.bitoutL = arithmeticcoding.BitOutputStream(
         open(self.outPutPath + ".bifurL", "wb"))
     self.bitoutR = arithmeticcoding.BitOutputStream(
         open(self.outPutPath + ".bifurR", "wb"))
     self.encodeSeqPathL = self.openFileLeft()
     self.encodeSeqPathR = self.openFileRight()
     self.verbose = verbose
     # NOTE(review): runs after the output files above were opened; which
     # files it removes is not visible here -- confirm it leaves them intact.
     self.removeOutputFile()
Example #12
0
def compress(snp, numsymbol):
	"""Arithmetic-code a 3-D symbol array and return the encoder's bit count.

	``snp`` is squeezed and must then have exactly three dimensions
	(rows, cols, channels). Symbols are encoded channel by channel, each
	channel in row-major order, against an adaptive table of ``numsymbol``
	entries. Symbol ``numsymbol - 1`` is written last as the end marker.

	Returns ``enc.bit_nums`` as reported by the encoder.
	"""
	freqs = arithmeticcoding.SimpleFrequencyTable(
		arithmeticcoding.FlatFrequencyTable(numsymbol))
	enc = arithmeticcoding.ArithmeticEncoder(32)

	snp = np.squeeze(snp)
	# Unpacking also enforces that the squeezed array is exactly 3-D.
	_rows, _cols, channel = snp.shape

	for c in range(channel):
		# .flat walks the 2-D channel slice row by row.
		for symbol in snp[:, :, c].flat:
			enc.write(freqs, symbol)
			freqs.increment(symbol)

	eof_symbol = numsymbol - 1
	enc.write(freqs, eof_symbol)  # end marker
	enc.finish()  # emit the remaining code bits
	return enc.bit_nums
Example #13
0
    def __init__(self, order, symbollimit, escapesymbol):
        """Set up the model for the given order and symbol alphabet.

        Raises ValueError unless ``order >= -1``, ``symbollimit > 0`` and
        ``0 <= escapesymbol < symbollimit``.

        For ``order >= 0`` a root context is created (with ``order >= 1``
        passed as its second argument) and the escape symbol is counted
        once in its frequencies; for order -1 there is no root context.
        A flat frequency table over ``symbollimit`` symbols is always
        created as ``order_minus1_freqs``.
        """
        # Checks run in the same order as a single short-circuit test would.
        if order < -1:
            raise ValueError()
        if symbollimit <= 0:
            raise ValueError()
        if not (0 <= escapesymbol < symbollimit):
            raise ValueError()

        self.model_order = order
        self.symbol_limit = symbollimit
        self.escape_symbol = escapesymbol

        # Build the root frequency context only when contexts are in use.
        if order < 0:
            self.root_context = None
        else:
            self.root_context = PpmModel.Context(symbollimit, order >= 1)
            self.root_context.frequencies.increment(escapesymbol)

        flat_table = arithmeticcoding.FlatFrequencyTable(symbollimit)
        self.order_minus1_freqs = flat_table