def encode(self, outfile):
    """Huffman-encode self.text and write the packed bits to *outfile*.

    Side effects: fills self.frequencies, self.encodings (via
    self._get_encodings) and self.decodings.  Returns the BitArray
    that was written.
    """
    # Tally how often each character occurs.
    for ch in self.text:
        self.frequencies[ch] += 1
    # Seed the priority queue with one leaf per symbol, then repeatedly
    # merge the two lightest nodes until a single tree root remains.
    for symbol, count in self.frequencies.items():
        self.pq.put(Leaf(symbol, count))
    while self.pq.qsize() > 1:
        lighter = self.pq.get()
        heavier = self.pq.get()
        self.pq.put(Branch(lighter, heavier))
    tree = self.pq.get()
    # Build the char -> code table, and its inverse for decoding.
    self._get_encodings(tree)
    self.decodings = {str(code.bin): symbol
                      for symbol, code in self.encodings.items()}
    # Concatenate the per-character codes into one bit stream.
    bits_out = BitArray(bin="")
    for ch in self.text:
        bits_out.append(self.encodings[ch])
    with open(outfile, 'wb') as f:
        bits_out.tofile(f)
    return bits_out
def print_binary_map(taint_dict, binary_map_and_size):
    """
    The function creates and saves the binary map of bytes that are tainted
    from the source.  The bit[x] means that byte[x] from input file is
    tainted.
    :param taint_dict: Taint dictionary (offset -> tainted run length)
    :param binary_map_and_size: Binary file and size - name:size
    :return: None
    """
    assert taint_dict
    if not binary_map_and_size:
        return
    file_name, size = binary_map_and_size.split(':')
    if int(size) < 0:
        # BUG FIX: was logger.warinig(...) which raises AttributeError.
        logger.warning("The size value is negative")
        return
    # Renamed from `map` to avoid shadowing the builtin.
    bit_map = BitArray(int(size))
    bit_map.set(0)
    # Mark every byte covered by a tainted run as 1.
    for key in taint_dict:
        for value in range(taint_dict[key]):
            bit_map.set(1, key + value)
    with open(file_name, "wb") as binary_map:
        bit_map.tofile(binary_map)
def main():
    """LZ78-style bit compressor: reads sys.argv[1], writes sys.argv[2]."""
    source = open(sys.argv[1], "rb")
    stream = ConstBitStream(source)
    phrase = BitArray()
    dictionary = {Bits(''): 0}
    out_bits = BitArray()
    next_index = 1
    code_width = 0
    while True:
        # Pull one bit at a time until the stream is exhausted.
        try:
            phrase.append(stream.read(1))
        except ReadError:
            break
        # If the current phrase is not in the dictionary yet, emit it.
        if Bits(phrase) not in dictionary:
            # phrase = y + b: a known prefix y plus one new bit b.
            known_prefix = phrase[:-1]
            new_bit = phrase[-1:]
            prefix_index = dictionary[Bits(known_prefix)]
            # ceil(log2(|dictionary|)) bits address any existing entry.
            code_width = ceil(log2(len(dictionary)))
            if code_width != 0:
                prefix = Bits(uint=int(prefix_index), length=code_width)
            else:
                prefix = Bits('')
            out_bits.append(Bits('0b' + str(prefix.bin) + str(new_bit.bin)))
            dictionary[Bits(phrase)] = next_index
            next_index += 1
            phrase.clear()
    # Add padding: a run of the complement bit whose length equals the
    # number of leftover bits, so the decompressor can strip them later.
    if phrase[:1].bin == Bits(1):
        pad = Bits('0b' + '0' * len(phrase))
    else:
        pad = Bits('0b' + '1' * len(phrase))
    phrase.reverse()
    out_bits.reverse()
    out_bits.append(phrase)
    out_bits.append(pad)
    out_bits.reverse()
    with open(sys.argv[2], 'wb') as f_out:
        out_bits.tofile(f_out)
    source.close()
def output(tree: list, file_name: str):
    """Takes the tree values and uses them to create the output.bin file.

    Layout: per-symbol header (char byte + 8-bit code length), a
    2x 0xFF + newline section terminator, the concatenated code table,
    a newline, then the Huffman-encoded file contents.

    Fixed: both files are now opened with context managers so the handles
    are closed even if an exception is raised mid-write.
    """
    with open(file_name[:-3] + "bin", "wb") as file2:
        codes = ""
        for item in tree:
            # Stores char then code length; multi-byte chars become '?'.
            if len(item['character'].encode('utf8')) > 1:
                file2.write("?".encode('utf8'))
            else:
                file2.write(item['character'].encode('utf8'))
            length = BitArray(bin=format(len(item['binary code']), '08b'))
            length.tofile(file2)
            # Collect all the codes in one long line.
            codes += item['binary code']
        # 2 bytes of all 1's followed by a new line mark the section end.
        clear = BitArray(bin='11111111')
        for _ in range(2):
            clear.tofile(file2)
        file2.write("\n".encode('utf8'))
        # Writes all the characters' codes to the file in one long string.
        BitArray(bin=codes).tofile(file2)
        # Newline character begins the next section.
        file2.write("\n".encode('utf8'))
        # Writes the actual data in the huffman format.
        with open(file_name, "r", encoding='utf-8') as file1:
            value = ""
            for line in file1:
                for char in line:
                    pos = binary_search(tree, char, 0, len(tree) - 1,
                                        "character")
                    value += tree[pos]['binary code']
                    # Only flush whole multiples of 8 bits so tofile's
                    # per-call padding never lands mid-stream.
                    if len(value) % 8 == 0:
                        BitArray(bin=value).tofile(file2)
                        value = ""
            BitArray(bin=value).tofile(file2)
def compression():
    """Compress the configured text file with a Huffman code.

    Reads config_json["filepath_text"], builds the code table via
    count_frequencies / creation_of_huffmantree, translates the text
    through str.maketrans/translate, pads it with padding_text and
    writes the packed bits next to the source file as <name>.bin.
    Returns the output path.
    """
    path = config_json["filepath_text"]
    # splitext: everything before the extension becomes the output stem.
    root, _ext = os.path.splitext(path)
    output_path = root + ".bin"
    with open(path, "r", encoding="utf8") as file, \
            open(output_path, "wb") as output:
        # Whole file, trailing whitespace stripped.
        text = file.read().rstrip()
        frequency = count_frequencies(text)
        huffman_pairs = creation_of_huffmantree(frequency)
        # char -> code mapping used to drive the translation table.
        code_map = {el[0]: el[1] for el in huffman_pairs}
        table = text.maketrans(code_map)
        encoded_text = text.translate(table)
        padded_encoded_text = padding_text(encoded_text)
        # Pack the '0'/'1' string into real bytes.
        BitArray(bin=padded_encoded_text).tofile(output)
    return output_path
def cipher(file_in, file_out, key, init_vector):
    """PCBC-mode encryption.

    Encrypts *file_in* block by block (128-bit blocks) with the Camellia
    encryption function and *key*, chaining with *init_vector*, and
    writes the result to *file_out*.  The IV is also persisted to
    "vector.txt" for the decryption step.

    Fixed: the IV and output file handles are now managed with context
    managers (the original leaked an anonymous handle for vector.txt).
    """
    init_vector = BitArray(init_vector)
    print("vector init : ", init_vector)
    with open("vector.txt", "wb+") as vector_file:
        init_vector.tofile(vector_file)
    with open(file_in, 'rb') as f:
        message = BitArray(bytes=f.read())
        # Pad the message up to a whole number of 128-bit blocks.
        mod = len(message) % 128
        if mod != 0:
            message.append(128 - mod)
        print("message_cipher :", message)
        with open(file_out, 'wb+') as sortie:
            temp = int(len(message) / 128)
            last_bytes = init_vector
            if temp != 1:
                x = 0
                sort = BitArray()
                while x != temp:
                    print("x:", x)
                    chunk = message[128 * x:(128 * (x + 1))]
                    print("chunk:", chunk)
                    # PCBC: XOR plaintext with previous (plain ^ cipher).
                    chunk_xor = chunk ^ last_bytes
                    print("encrypt:", chunk_xor)
                    last_bytes = cam.encryption(chunk_xor, key)
                    print("function encryption, cipher:", last_bytes)
                    sort.append(last_bytes)
                    last_bytes = chunk ^ last_bytes
                    x = x + 1
                sort.tofile(sortie)
            else:
                print("There is no enough blocks to apply PCBC mode.")
def encode():
    """Build a canonical Huffman code for the input file and save the
    encoded result (decode key + payload) to the .hc output file."""
    print('encoding...')
    # Load the whole input file as one string.
    with open(txtType, "r") as src:
        data = src.read()
    # Character frequency table.
    frequencies = defaultdict(int)
    for character in data:
        frequencies[character] += 1
    # Canonical tree yields (char, code) pairs for canonical decoding.
    encoded = canonical(createTree(frequencies))
    # Key section: 8-bit code length then 8-bit character, per symbol.
    header_bits = []
    for pair in encoded:
        header_bits.append(bin(len(pair[1]))[2:].zfill(8))
        header_bits.append(bin(ord(pair[0]))[2:].zfill(8))
    binary = ''.join(header_bits)
    # '}' marks the end of the key.
    binary += bin(ord('}'))[2:].zfill(8)
    # Encoded payload follows.
    binary += createCode(encoded, data)
    # Count of padding zeroes the decoder must strip from the end
    # (output is written in 8-bit chunks).
    dif = len(binary) % 8
    if dif == 0:
        dif = 8
    zeroes = 8 - dif
    binary = bin(ord(str(zeroes)))[2:].zfill(8) + binary
    output = BitArray(bin=binary)
    with open(hcType, "wb") as dst:
        output.tofile(dst)
    print('encoded time: ', clock())
def decipher(file_in, file_out, key, init_vector):
    """PCBC-mode decryption.

    Decrypts *file_in* block by block (128-bit blocks) with the Camellia
    decryption function, *key* (Camellia key object) and the IV stored in
    the file named by *init_vector*, writing plaintext to *file_out*.

    Fixed: the IV and output file handles are now closed via context
    managers (the original never closed the IV handle).
    """
    with open(init_vector, 'rb') as vector:
        init_vector = BitArray(vector)
    print("vector d'initialisation :", init_vector)
    if len(init_vector) != 128:
        raise ValueError("init_vector must be 128 bits.")
    with open(file_in, 'rb') as f:
        message = BitArray(bytes=f.read())
        last_chunk = init_vector
        with open(file_out, 'wb+') as sortie:
            temp = int(len(message) / 128)
            if temp != 1:
                x = 0
                sort = BitArray()
                while x != temp:
                    print("x:", x)
                    chunk = message[128 * x:(128 * (x + 1))]
                    print("chunk:", chunk)
                    chunk_deciph = cam.decryption(chunk, key, True)
                    print("decript:", chunk_deciph)
                    # PCBC: XOR with previous (plain ^ cipher) pair.
                    chunk_deciph ^= last_chunk
                    print("function decryption, decipher:", chunk_deciph)
                    sort.append(chunk_deciph)
                    print("sortie :", sort)
                    last_chunk = chunk ^ chunk_deciph
                    x += 1
                sort.tofile(sortie)
            else:
                print("There is no enough blocks to apply PCBC mode.")
def createCompressText(self, filename, root) -> None:
    """
    Encode books/<filename>.txt with the per-character codes and write
    the pickled tree *root* followed by the packed bits to
    output/<filename>.bin.
    :return: VOID
    """
    # NOTE(review): getCodes() presumably populates self.codes as a side
    # effect, so the call is kept even though its result is unused here.
    code = self.getCodes()
    with open("books/" + filename + ".txt", "r", encoding="utf-8") as text:
        converted_file = text.read()
    # One code string per input character, joined into a single bit string.
    pieces = [self.codes[char] for char in converted_file]
    binary_string = BitArray(bin="".join(pieces))
    with open("output/" + filename + ".bin", "wb") as newFile:
        pickle.dump(root, newFile)
        binary_string.tofile(newFile)
def ECB(function, file_in, file_out, key):
    """ECB mode: encrypt/decrypt *file_in* with *function* and *key*,
    writing the result to *file_out*.

    *function* is either cam.encryption or cam.decryption; the file is
    processed in independent 128-bit blocks (zero-padded at the end).
    """
    with open(file_in, 'rb') as f:
        message = BitArray(bytes=f.read())
        # Pad the message up to a whole number of 128-bit blocks.
        mod = len(message) % 128
        if mod != 0:
            message.append(128 - mod)
        sortie = open(file_out, 'wb+')
        temp = int(len(message) / 128)
        if temp != 1:
            sort = BitArray()
            for x in range(temp):
                print("x:", x)
                chunk = message[128 * x:(128 * (x + 1))]
                print("chunk:", chunk)
                # Decryption takes an extra flag argument.
                if function == cam.decryption:
                    cipher = function(chunk, key, True)
                    print("function decryption, decipher:", cipher)
                else:
                    cipher = function(chunk, key)
                    print("function encryption, cipher:", cipher)
                sort.append(cipher)
                print("sortie :", sort)
            sort.tofile(sortie)
        else:
            print("There is no enough blocks to apply ECB mode.")
        sortie.close()
from PIL import Image from bitstring import BitArray import sys im = Image.open(sys.argv[1]) pix = im.load() w = im.size[0] h = im.size[1] s = BitArray(bytearray([w, h])) bits = [] for j in range(h): for i in range(w): if len(bits) == 8: s += bits bits = [] if pix[i, j] == 1: bits.insert(0, 0) else: bits.insert(0, 1) # pad row to 8 bits dif = 8 - len(bits) for x in range(0, dif - 1): bits.insert(0, 0) f = open('output.rgf', 'wb') s.tofile(f) print s.hex
# index of minumum packet size in File Proprties header i_min_data_pkt_size = index[0] + 736 print "[*] Original Minimum Data Packet Size: %s" % fb[ i_min_data_pkt_size:i_min_data_pkt_size + 32].hex print "[*] Original Maximum Data Packet Size: %s" % fb[i_min_data_pkt_size + 32:i_min_data_pkt_size + 64].hex # Accroding to ASF standarad the minimum data size and the maximum data size should be equal print "[*] Changing Miniumum and Maximum Data packet size to 0" # changing the data packets in bit array fb[i_min_data_pkt_size:i_min_data_pkt_size + 8] = 0x00 fb[i_min_data_pkt_size + 8:i_min_data_pkt_size + 16] = 0x00 fb[i_min_data_pkt_size + 16:i_min_data_pkt_size + 24] = 0x00 fb[i_min_data_pkt_size + 24:i_min_data_pkt_size + 32] = 0x00 fb[i_min_data_pkt_size + 32:i_min_data_pkt_size + 40] = 0x00 fb[i_min_data_pkt_size + 40:i_min_data_pkt_size + 48] = 0x00 fb[i_min_data_pkt_size + 48:i_min_data_pkt_size + 56] = 0x00 fb[i_min_data_pkt_size + 56:i_min_data_pkt_size + 64] = 0x00 print "[*] POC File Created poc.asf" of = open('poc.asf', 'w+b') fb.tofile(of) of.close() f.close()
# -*- coding: utf-8 -*- # # Python 2.7.1 # from bitstring import BitArray fname = 'image.jpg' with open(fname, 'r+b') as fh: byte_map = [ord(b) for b in fh.read(4)] byte_list = [byte_map[0], byte_map[1], byte_map[2], byte_map[3]] print 'retrieved', len(byte_list), 'from file', fname offset = 0 for ascii_val in byte_list: bin_val = BitArray(hex(ascii_val)) print bin_val.bin BitArray.reverse(bin_val) print bin_val.bin fh.seek(offset) bin_val.tofile(fh) print 'writing offset', offset, 'of file', fname offset += 1 fh.close()
# Report each symbol's count, probability and Huffman code.
for i in range(0, len(mycharset)):
    if countset[i] != 0:  # suppress the zero appearance charecters
        print(mycharset[i], " has ", '{0:04d}'.format(countset[i]),
              " times appeared. " + "Probability = ",
              '{:.10f}'.format(probabilityset[i]) + " Huffman: "
              + mycodebook[str(mycharset[i])])

# Build the full bit string: one Huffman code per input character.
onesandzeros = "".join(mycodebook[str(allofthefile[i])]
                       for i in range(0, len(allofthefile)))

# Pack the bit string into the compressed file, 8 bits per byte.
binary_file = open('compressed_foo.bin', 'wb')
for i in range(0, len(onesandzeros), 8):
    BitArray(bin=onesandzeros[i:i + 8]).tofile(binary_file)
binary_file.close()

# Read the compressed bytes back for decompression.
binary_file = open('compressed_foo.bin', "rb")
allofthebinaryfile = binary_file.read()
binary_file.close()

# Expand every byte back into its 8-character bit representation.
newonesandzeros = "".join(str(bin(allofthebinaryfile[i])[2:].zfill(8))
                          for i in range(0, len(allofthebinaryfile)))
# Character accumulator for the decompression step (continues below).
mynewfile = ""
def SaveBinStrData(binStr, fileName):
    """Pack the '0'/'1' string *binStr* into bytes and write it to
    *fileName*; warn via GlobalMsg on I/O failure.

    Fixed: the file handle is now closed via a context manager (the
    original leaked an anonymous handle), and the bare ``except:`` is
    narrowed to OSError — the failure actually reported by the message.
    """
    binData = BitArray(bin=binStr)
    try:
        with open(fileName, 'wb') as outFile:
            binData.tofile(outFile)
    except OSError:
        GlobalMsg.warn('unable to open file [' + fileName + ']')
print 'Analyzing file' charWeights = countChars(sourceFile) print '\n' #Create symbol table based on file analasys symbolTable = PrefixTree(charWeights).getEncodeTable() #Initialize output bits with count of chars outputBits = BitArray(int=len(charWeights), length=8) #Add char weights to binary output for char in charWeights.keys(): outputBits.append(BitArray(int=ord(char), length=32)) outputBits.append(BitArray(int=charWeights[char], length=32)) #Add code words print 'Encoding file' line = 0 sourceFile = open(sys.argv[1], 'r') for s_line in sourceFile: line += 1 update_progress(line, lineCount) for s_char in s_line: outputBits.append(BitArray('0b{}'.format(symbolTable[s_char]))) print '\n' #Write compressed binary to file outFile = open('{}.hc'.format(sys.argv[1]), 'wb') outputBits.tofile(outFile) print 'Done.'
def main():
    """LZ78-style bit decompressor: reads sys.argv[1], writes sys.argv[2]."""
    source = open(sys.argv[1], "rb")
    stream = ConstBitStream(source)
    buf = BitArray()
    flipped = BitArray()
    dictionary = ['']
    decoded = ''
    buf.append(stream.read(1))
    flipped = buf.copy()
    flipped.invert(0)
    count = 0
    # Find the padding size: count bits until the complement bit appears.
    while (buf.bin) != flipped.bin:
        try:
            count += 1
            buf.clear()
            buf.append(stream.read(1))
        except ReadError:
            break
    padding = BitArray()
    padding.append(buf)
    buf = BitArray()
    # With that size known, read the matching padding bits.
    padding.append(stream.read(count - 1))
    while True:
        # Each token is ceil(log2(|dict|)) index bits plus one literal bit.
        code_width = ceil(log2(len(dictionary)))
        try:
            buf.append(stream.read(code_width + 1))
        except ReadError:
            break
        prefix_bits = buf[:-1]
        new_bit = buf[-1:]
        if Bits(prefix_bits) == Bits(''):
            pos = 0
        else:
            pos = prefix_bits.uint
        decoded = decoded + (str(dictionary[pos]) + new_bit.bin)
        dictionary.append(str(dictionary[pos]) + str(Bits(new_bit).bin))
        buf = BitArray()
    output = BitArray('0b' + decoded)
    output.append(padding)
    with open(sys.argv[2], 'wb') as f_out:
        output.tofile(f_out)
    source.close()
def writeB(scene, b_name): #Sanitization checks #if no image size, error if scene.n_films == 0: print "Error: Scene needs a film." return #if no camera, error: if scene.n_cameras == 0: print "Error: Scene needs a camera." return #if no bounding box, error if scene.n_boundboxes == 0: print "Error: Scene needs a bounding box." return #Create bitstring s = BitArray() #Film film = scene.films[0] t = BitArray() t = bitstring.pack("3*int:32", 0, film['width'], film['height']) print t.unpack("3*int:32") s = s + t #Camera camera = scene.cameras[0] t = BitArray() t = bitstring.pack("int:32, 12*float:32", 1, camera['point'][0], camera['point'][1], camera['point'][2], camera['fieldOfView'], camera['toPoint'][0], camera['toPoint'][1], camera['toPoint'][2], camera['up'][0], camera['up'][1], camera['up'][2], camera['lensRadius'], camera['focalDepth']) print t.unpack("int:32, 12*float:32") s = s + t #Lights for i in range(scene.n_lights): light = scene.lights[i] t = BitArray() t = bitstring.pack("2*int:32, 6*float:32", 2, light['type'], light['point'][0], light['point'][1], light['point'][2], light['color'][0], light['color'][1], light['color'][2]) print t.unpack("2*int:32, 6*float:32") s = s + t #Materials for i in range(scene.n_materials): mat = scene.materials[i] t = BitArray() t = bitstring.pack("int:32, 3*float:32, 2*int:32, 4*float:32", 3, mat['color'][0], mat['color'][1], mat['color'][2], mat['type'], mat['metal'], mat['specular'], mat['lambert'], mat['ambient'], mat['exponent']) print t.unpack("int:32, 3*float:32, 2*int:32, 4*float:32") s = s + t #Spheres for i in range(scene.n_spheres): sphere = scene.spheres[i] t = BitArray() t = bitstring.pack("int:32, 4*float:32, int:32", 4, sphere['point'][0], sphere['point'][1], sphere['point'][2], sphere['radius'], sphere['materialIndex']) print t.unpack("int:32, 4*float:32, int:32") s = s + t #Triangles for i in range(scene.n_triangles): tri = scene.triangles[i] t = BitArray() t = bitstring.pack("int:32, 9*float:32, int:32", 5, 
tri['point1'][0], tri['point1'][1], tri['point1'][2], tri['point2'][0], tri['point2'][1], tri['point2'][2], tri['point3'][0], tri['point3'][1], tri['point3'][2], tri['materialIndex']) print t.unpack("int:32, 9*float:32, int:32") s = s + t #Bounding Box box = scene.boundboxes[0] t = BitArray() t = bitstring.pack("int:32, 6*float:32", 6, box['min'][0], box['min'][1], box['min'][2], box['max'][0], box['max'][1], box['max'][2]) print t.unpack("int:32, 6*float:32") s = s + t #Send end code t = BitArray() t = bitstring.pack("int:32", 7) s = s + t #Write to file with open(b_name, "wb") as f: s.tofile(f)
# Create the container for the hashbools.
hashTable = BitArray(1000000)
modValue = 1000000
# Read each line, split to remove extra chars.
for line in file.readlines():
    data = line.split()
    for line in data:
        # Hash with the md5 hashfunction.
        hash = hashlib.md5(line.lower().encode('utf-8')).digest()
        # Four independent indexes from slices of the digest, reduced
        # into the array's index span.
        indexes = (
            int(hash[0:3].encode("hex"), 16) % modValue,
            int(hash[4:7].encode("hex"), 16) % modValue,
            int(hash[8:11].encode("hex"), 16) % modValue,
            int(hash[12:15].encode("hex"), 16) % modValue,
        )
        # Set the bits in the array. Use the set function for speed.
        for idx in indexes:
            hashTable.set(True, idx)
file.close()
# Open file as binary file and write to it.
with open('data', 'wb') as outfile:
    hashTable.tofile(outfile)
# Print execution time. ~20 seconds on my system.
end = time.time()
print(end - start)
class Ps2Iso:
    """A PS2 ISO image loaded fully into memory as a BitArray.

    Exposes path-table/TOC lookups and in-place replacement of file
    payloads (growing a file is not supported yet).
    """

    def __init__(self, filename):
        self._set_logger()
        self.log.info(f"Loading (unknown), this may take a while...")
        #self.data = Bits(filename=filename)
        self.data = BitArray(filename=filename)
        self.pvd = PVD(self.data)
        self.block_size = self.pvd.logical_block_size
        # Sanity-check the PVD; warn (don't fail) on anything unexpected.
        if self.pvd.system_identifier != "PLAYSTATION":
            self.log.warning(
                (f"system_identifier: '{self.pvd.system_identifier}', "
                 "should be 'PLAYSTATION'"))
            self.log.warning(f"(unknown) may not be a PS2 ISO file")
        if self.block_size != 2048:
            self.log.warning((f"logical_block_size: {self.block_size}, "
                              "should be 2048"))
            self.log.warning(f"(unknown) may not be a PS2 ISO file")
        self.path_tables = PathTables(self.data, self.pvd)
        self.tree = self.path_tables.get_path_tree()

    def get_object(self, path):
        """Walk the path tree and return the node at *path*."""
        components = path.split("/")
        if components[0] == "":
            components.pop(0)
        node = self.tree
        for component in components:
            node = node.get_child(component)
        return node

    def get_blocks_allocated(self, path):
        """Blocks between this object's LBA and the next object's LBA."""
        obj = self.get_object(path)
        lba_list = self.get_lba_list()
        obj_idx = next(idx for idx, entry in enumerate(lba_list)
                       if entry[1] == path)
        lba = lba_list[obj_idx][0]
        next_lba = lba_list[obj_idx + 1][0]
        return next_lba - lba

    def get_lba(self, path):
        """LBA (logical block address) of the object at *path*."""
        return self.get_object(path).lba

    def replace_files(self, replacements, allow_move=False):
        """Replace the payload of each (path, bits) pair in *replacements*.

        Only same-place replacement is implemented; a file that needs
        more blocks than it currently owns requires allow_move=True,
        which is not supported yet.
        """
        paths = [path for path, _ in replacements]
        bins = [b for _, b in replacements]
        sizes = [len(b) // 8 for b in bins]
        blocks_required = [ceil(len(b) / 8 / self.block_size) for b in bins]
        curr_lba = [self.get_lba(p) for p in paths]
        curr_blocks_allocated = [self.get_blocks_allocated(p) for p in paths]
        items = [{
            "path": p,
            "bin": b,
            "size": s,
            "blocks_required": br,
            "curr_lba": cl,
            "curr_blocks_allocated": cb
        } for p, b, s, br, cl, cb in zip(paths, bins, sizes,
                                         blocks_required, curr_lba,
                                         curr_blocks_allocated)]
        # Collect replacements that no longer fit in their current blocks.
        overflows = [i for i in items
                     if i["blocks_required"] > i["curr_blocks_allocated"]]
        for o in overflows:
            self.log.warning((f"{o['path']} (size: {o['size']} "
                              f"requires {o['blocks_required']} blocks, "
                              f"{o['curr_blocks_allocated']} available"))
        if overflows and not allow_move:
            raise ValueError("allow_move must be true to increase file sizes")
        # Zero out the blocks currently occupied by every target file.
        for i in items:
            self.clear_blocks(i["curr_lba"], i["curr_blocks_allocated"])
        if not allow_move:
            # Overwrite each file in place at its existing LBA.
            for i in items:
                i["new_lba"] = i["curr_lba"]
                offset = i["curr_lba"] * self.block_size * 8
                self.data.overwrite(i["bin"], offset)
        else:
            raise NotImplementedError("Moving files is not supported yet")
        for i in items:
            self.update_toc(i["path"], i["new_lba"], i["size"])

    def update_toc(self, path, lba, size):
        """Delegate TOC rewrite to the tree node for *path*."""
        self.get_object(path).update_toc(lba, size)

    def write(self, filename):
        """Dump the (possibly modified) image back to disk."""
        with open(filename, "wb") as f:
            self.data.tofile(f)

    def clear_blocks(self, start_block, num_blocks):
        """Zero *num_blocks* blocks starting at *start_block*."""
        start_addr = start_block * self.block_size * 8
        end_addr = start_addr + num_blocks * self.block_size * 8
        self.data.set(0, range(start_addr, end_addr))

    def get_lba_list(self):
        """All (lba, path) pairs in the image, deduplicated, LBA order."""
        entries = self._get_lba_list(self.tree)
        return sorted(set(entries), key=lambda entry: entry[0])

    def _get_lba_list(self, item, lba_list=None):
        # Depth-first accumulation of (lba, path) for item and children.
        if lba_list is None:
            lba_list = []
        lba_list.append((item.lba, item.path))
        if isinstance(item, TreeFolder):
            for child in item.children:
                self._get_lba_list(child, lba_list=lba_list)
        return lba_list

    def _get_blocks(self, lba, blocks=None, size=None):
        # NOTE(review): appears unfinished — validates its arguments but
        # returns nothing; behaviour preserved as-is.
        if blocks and size is None:
            size = blocks * self.block_size
        if size is None:
            raise ValueError("blocks/size must be set")

    def _set_logger(self):
        # Stream logger with a simple name/level/message format.
        self.log = logging.getLogger("Ps2Iso")
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(name)s - %(levelname)s - %(message)s")
        handler.setFormatter(formatter)
        self.log.addHandler(handler)
        self.log.setLevel(logging.INFO)
modValue = 1000000
# Read each line, split to remove extra chars.
for line in file.readlines():
    data = line.split()
    for line in data:
        # Hash with the md5 hashfunction.
        hash = hashlib.md5(line.lower().encode('utf-8')).digest()
        # Extract parts of the hash, convert each to an integer inside
        # the array index span, and set the corresponding bit.
        for lo, hi in ((0, 3), (4, 7), (8, 11), (12, 15)):
            hashTable.set(True, int(hash[lo:hi].encode("hex"), 16) % modValue)
file.close()
# Open file as binary file and write to it.
with open('data', 'wb') as outfile:
    hashTable.tofile(outfile)
# Print execution time. ~20 seconds on my system.
end = time.time()
print(end - start)
class TinyCompressor: """This class holds data and methods for a data compressor using LEC Algorithm""" table = {} decode_table = {} eof = 0 __first_data = [] __previous_data = [] __data = [] __data_ns = [] __decimal_places = [] __strings_table = {} __compressed_data = BitArray() __compressed_data_string = "" __data_length = 0 data_frequencies = {} __d_values = [] codec = 0 def __init__(self, decimal_places): self.__decimal_places = decimal_places[:] def set_strings_table(self, new_strings_table): self.__strings_table = new_strings_table def get_n(self, d_value): if d_value == 0: return 0 return int(floor(log(abs(d_value),2))) + 1 def get_a(self,d_value,n_value): if d_value == 0: return "" if d_value < 0: return BitArray(int=d_value-1, length=20).bin[(-1)*n_value:] if d_value > 0: return BitArray(int=d_value, length=20).bin[(-1)*n_value:] def generate_data_list(self,inputfilename): first = True self.__first_data = [] self.__previous_data = [] self.__data_ns = [] self.__data = [] self.data_frequencies = {} with open(inputfilename) as inputfile: for line in inputfile: linedata = line.split(",") self.__data_length = len(linedata) if (len(linedata) != len(self.__decimal_places)): print "Length of decimal places different than length of data" return #Should return an exception if first: for i in range(len(linedata)): self.__first_data.append(float(linedata[i])) self.__previous_data = self.__first_data[:] first = False else: for i in range(len(linedata)): value = (int(float(linedata[i]) * 10**self.__decimal_places[i]) - int(float(self.__previous_data[i]) * 10**self.__decimal_places[i])) """if (i == 2): print "Value =", value""" self.__data.append(value) self.__data_ns.append(self.get_n(value)) self.__previous_data = linedata[:] print "Data len =", len(self.__data) print "adding range MAX_DATA_N" self.__data_ns += range(MAX_DATA_N) def generate_table(self,inputfilename): self.generate_data_list(inputfilename) self.codec = HuffmanCodec.from_data(self.__data_ns) self.table 
= self.codec.get_code_table() self.__strings_table = {} for symbol in self.table.keys(): if not type(symbol) is int: self.eof = symbol bitsize, value = self.table[symbol] self.__strings_table[symbol] = bin(value)[2:].rjust(bitsize, '0') def encode_data(self, inputfilename, outputfilename): self.generate_data_list(inputfilename) self.__compressed_data_string = "" try: for i in range(len(self.__data)): self.__compressed_data_string += \ self.__strings_table[self.__data_ns[i]] + \ self.get_a(self.__data[i], self.__data_ns[i]) except KeyError: print "Not possible to encode data[{}] = {}".format(i, self.__data[i]) return #Add EOF self.__compressed_data_string += self.__strings_table[self.eof] self.__compressed_data = BitArray(bin=self.__compressed_data_string) #print "Compressed data to file:", self.__compressed_data.bin def to_file(self): with open(outputfilename, 'wb') as outputfile: self.__compressed_data.tofile(outputfile) def build_values(self,inputfilename): print "Building values from", inputfilename compressed_bitarray = 0 with open(inputfilename, 'rb') as compressedfile: compressed_bitarray = BitArray(compressedfile) #print "Compressed data from file:", compressed_bitarray.bin for k in self.__strings_table.keys(): if (type(k) is int): self.decode_table[self.__strings_table[k]] = k possible_codes = set(self.decode_table.keys()) #print "Decode table =", self.decode_table self.__d_values = [] time_to_stop = False iteration = 0 start_s = 0 end_s = 1 start_a = end_s end_a = 3 n = 0 s = 0 a = 0 while( not time_to_stop): if compressed_bitarray[start_s:end_s].bin in possible_codes: s = compressed_bitarray[start_s:end_s] n = self.decode_table[s.bin] start_a = end_s end_a = start_a + n # +1 ? 
if n == 0: #a = 0 self.__d_values.append(0) else: a = compressed_bitarray[start_a:end_a] if a[0]: self.__d_values.append((OFFSET_ZERO+ a).int) else: self.__d_values.append((OFFSET_ONE+ a).int +1) start_s = end_a else: end_s += 1 if end_s >= len(compressed_bitarray.bin): time_to_stop = True def decode_data(self,first_values, inputfilename, outputfilename): self.build_values(inputfilename) self.__values = [] accumulator = first_values[:] print "len __d_values =", len(self.__d_values) """print "Data encoded =", self.__data print "Data decoded =", self.__d_values print "First values =", first_values""" """for i in range(len(self.__d_values)/len(accumulator)): self.__values.append(accumulator[:]) for j in range(i, i*len(accumulator)+ len(accumulator)): print "(i,j) =",j-i*len(accumulator),j accumulator[j-i*len(accumulator)] += self.__d_values[j]""" self.__values.append(accumulator[:]) for i in range(len(self.__d_values)): """if (i == 2): print "Value =", self.__d_values[i]""" """if((i%len(accumulator) == 1)): print self.__d_values[i]""" if self.__decimal_places[i%len(accumulator)] == 0: accumulator[i%len(accumulator)] += self.__d_values[i] else: accumulator[i%len(accumulator)] += float(self.__d_values[i]) \ / 10**self.__decimal_places[i%len(accumulator)] if ((i%len(accumulator)) == (len(accumulator)-1)): self.__values.append(accumulator[:]) with open(outputfilename, 'wb') as outputfile: for value in self.__values: line = ",".join( [("{:."+str(self.__decimal_places[i]) +"f}").format(float(value[i])) for i in range(len(value))]) outputfile.write(line + '\n')
index = fb.find('0xa1dcab8c47a9cf118ee400c00c205365',bytealigned=True) print "[*] found file properties GUID" print "[*] File properties GUID: %s" % fb[index[0]:(index[0]+128)] # index of minumum packet size in File Proprties header i_min_data_pkt_size = index[0] + 736 print "[*] Original Minimum Data Packet Size: %s" % fb[i_min_data_pkt_size:i_min_data_pkt_size+32].hex print "[*] Original Maximum Data Packet Size: %s" % fb[i_min_data_pkt_size+32:i_min_data_pkt_size+64].hex # Accroding to ASF standarad the minimum data size and the maximum data size should be equal print "[*] Changing Miniumum and Maximum Data packet size to 0" # changing the data packets in bit array fb[i_min_data_pkt_size:i_min_data_pkt_size+8] = 0x00 fb[i_min_data_pkt_size+8:i_min_data_pkt_size+16] = 0x00 fb[i_min_data_pkt_size+16:i_min_data_pkt_size+24] = 0x00 fb[i_min_data_pkt_size+24:i_min_data_pkt_size+32] = 0x00 fb[i_min_data_pkt_size+32:i_min_data_pkt_size+40] = 0x00 fb[i_min_data_pkt_size+40:i_min_data_pkt_size+48] = 0x00 fb[i_min_data_pkt_size+48:i_min_data_pkt_size+56] = 0x00 fb[i_min_data_pkt_size+56:i_min_data_pkt_size+64] = 0x00 print "[*] POC File Created poc.asf" of = open('poc.asf','w+b') fb.tofile(of) of.close() f.close()
def save_best(data: BitArray):
    """Write *data* to the current_best file.

    Fixed: the handle is now managed by a context manager, so it is
    closed even if tofile() raises (the original leaked it on error).
    """
    with open(file=current_best, mode='wb') as file:
        data.tofile(file)
def compress(originalfile, treefile='treefile.json', compressedfile='compressedfile.bin'):
    """
    Huffman-compress ``originalfile`` (read as UTF-8 text).

    The character->bitstring code table is dumped to ``treefile`` as JSON and
    the packed bits are written to ``compressedfile``.

    NOTE(review): bitstring's tofile() zero-pads the final byte, so a decoder
    must tolerate trailing pad bits — confirm against the matching
    decompressor.  Indentation below is reconstructed from a
    whitespace-mangled source.
    """
    # Frequency table and (eventually) the code table.
    thecharacters = {}
    tree = {}
    # Open the text file with utf8 encoding as f and count character
    # frequencies one character at a time.
    with open(originalfile, encoding='utf8') as f:
        # Loop until the file has ended.
        while True:
            # Read a single character.
            c = f.read(1)
            # Empty read means end of file.
            if not c:
                # Exit the loop; the file is finished.
                break
            # Seen before: bump its frequency counter.
            if c in thecharacters:
                thecharacters[c] += 1
            # First occurrence: add it with a count of 1.
            if c not in thecharacters:
                thecharacters[c] = 1
        f.close()
    # Start every character with an empty code string.
    for key in thecharacters:
        tree[key[0]] = ''
    """
    create the tree
    smallest value gets assigned as 0 and 1
    old dictionary altered to reflect the combined values
    new dictionary created with the single elements and binary didgts for each character
    assign the new bits to the end of the dictionary value
    repeat until length of old dictionary is 1
    """
    # Classic Huffman merge loop: repeatedly combine the two rarest entries.
    # Merged groups are stored as one key with '㋡' as an internal separator,
    # so '㋡' must never appear in the input text.
    while len(thecharacters) > 1:
        smallest2 = sorted(thecharacters.items(), key=itemgetter(1))[:2]
        del thecharacters[smallest2[0][0]]
        del thecharacters[smallest2[1][0]]
        thecharacters[smallest2[0][0] + '㋡' + smallest2[1][0]] = int(
            smallest2[0][1]) + int(smallest2[1][1])
        # Every character in the first group gains a '0', the second a '1'.
        for i in smallest2[0][0].split('㋡'):
            tree[i] += '0'
        for i in smallest2[1][0].split('㋡'):
            tree[i] += '1'
    """
    When tree is created, need to flip the binary values back to front to give
    the actual values
    """
    # Bits were appended leaf-to-root, so reverse each code.
    for i in tree:
        tree[i] = tree[i][::-1]
    # Persist the code table as JSON.
    json.dump(tree, open(treefile, 'w'))
    """
    read the file
    compare the character to the dictionary
    find the binary
    write it to file
    """
    binstring = ''
    file = open(compressedfile, "wb")
    with open(originalfile, encoding='utf8') as f:
        # Second pass: translate each character to its code and, at EOF,
        # pack the accumulated bit string and write it out.
        while True:
            c = f.read(1)
            # Character appears in the code table: append its bits.
            if c in tree:
                binstring += tree[c]
            if not c:
                a = BitArray(bin=binstring)
                a.tofile(file)
                file.close()
                f.close()
                break
    # Size report (labels say "bits" but st_size is in bytes — see output).
    original = int(os.stat(originalfile).st_size)
    final = int(os.stat(compressedfile).st_size)
    treesize = int(os.stat(treefile).st_size)
    print(originalfile)
    print("Original file size: ", os.stat(originalfile).st_size, "bits")
    print("Compresd file size: ", os.stat(compressedfile).st_size, "bits")
    print("The tree file size: ", os.stat(treefile).st_size, "bits")
    print("The totl file size: ", final + treesize, "bits")
    print((((final + treesize) - original) / original) * 100, "% change")
# Script fragment: writes the (symbol, code) table to the already-open `file`
# handle, then substitutes each symbol in `filecon` with its code and packs
# the result into a binary file.
# NOTE(review): `Lis` (list of [symbol, code] pairs), `filecon` (the source
# text) and the initial `file` handle are created above this excerpt.
# Indentation is reconstructed from a whitespace-mangled source.
for i in Lis:
    file.write(i[0])
    file.write(i[1])
    file.write(" ")
file.close()
file = open(r"C:\Users\akash\OneDrive\Desktop\Images\compressed.txt",
            "wb")  #Location of the compressed file
#dec=file.read()
temp_string = filecon
check_st = ""
# The symbols "0" and "1" collide with the code alphabet, so "0" is first
# replaced by the sentinel "^$^" and substituted back after "1" is handled.
for i in Lis:
    if (i[0] == "0"):
        temp_string = temp_string.replace(i[0], "^$^")
        check_st = i[1]
for i in Lis:
    if (i[0] == "1"):
        temp_string = temp_string.replace(i[0], i[1])
temp_string = temp_string.replace("^$^", check_st)
print(temp_string)
# All remaining (non-"0"/"1") symbols are safe to replace directly.
for i in Lis:
    if (i[0] != "0" and i[0] != "1"):
        temp_string = temp_string.replace(i[0], i[1])
print(temp_string)
temp_string = "0b" + temp_string  #Addition of "0b" is required by BitString to convert it
print("length", len(temp_string))
b = BitArray(temp_string)
b.tofile(file)
file.close()
length_tempst = len(temp_string) - 2  #This variable will be passed to the Huffmantextdecom.py file
# Fragment: build a `size`-bit BitArray with (approximately) `fill_perc`
# percent of its bits set, then gzip it to data/.
# NOTE(review): `fill_perc`, `size`, `index_bits`, `inv_count` and
# `fill_count` come from an enclosing scope above this excerpt (presumably a
# loop over fill percentages — confirm).  Indentation is reconstructed.
if fill_perc == 50:
    # 50%: raw random bytes are already ~half ones; invert if over target.
    registry = BitArray(os.urandom(size // 8))
    filled = registry.count(1)
    if filled > fill_count:
        registry.invert()
        filled = size - filled
else:
    # Other percentages: start empty and set `inv_count` bits.  Bulk random
    # choices (which may repeat positions, hence the recount) get close to
    # the target; the final while-loop tops up one unique bit at a time.
    filled = 0
    registry = BitArray(length=size)
    for _ in range(index_bits):
        registry.set(1, random.choices(range(0, size), k=(inv_count - filled)))
        filled = registry.count(1)
        if inv_count - filled < 10:
            break
    while filled < inv_count:
        pos = random.randrange(size)
        if not registry[pos]:
            registry.set(1, pos)
            filled += 1
# For >50% the complement was built; invert to get the dense pattern.
if fill_perc > 50:
    registry.invert()
filename = f"data/{index_bits}bits_{fill_perc}pc_random.gz"
with gzip.open(filename, "wb") as fp:
    registry.tofile(fp)
print(filename)
def calc():
    """Covert-timing-channel receiver loop (multi-channel variant).

    Continuously converts inter-packet timestamp gaps (shared ``data`` lists,
    one per channel) into bits: a gap near ``longBreak`` is a 1, a gap near
    ``longBreak * factor`` is a 0, and a gap near ``breakbetween`` marks a
    file boundary.  On a boundary the last 8 bits are an 8-bit Pearson hash;
    if it matches, the decoded bits are written to ``filename`` and the
    thread returns.

    NOTE(review): relies on module globals ``data``, ``mutex``, ``codedata``,
    ``breakbetween``, ``longBreak``, ``factor``, ``b/sTolerance``,
    ``threadBreak``, ``filename``, ``hash8``, ``table`` defined elsewhere.
    Indentation below is reconstructed from a whitespace-mangled source; the
    nesting of the trailing ``index``/``dif`` statements is a best guess.
    """
    # Acceptance windows around each nominal gap duration.
    bStartTolerance = breakbetween + (breakbetween * bTolerance)
    sStartTolerance = breakbetween - (breakbetween * sTolerance)
    bBigBreakTolerance = longBreak + (longBreak * bTolerance)
    sBigBreakTolerance = longBreak - (longBreak * bTolerance)
    bSmallBreakTolerance = (longBreak * factor) + (longBreak * factor * bTolerance)
    sSmallBreakTolerance = (longBreak * factor) - (longBreak * factor * sTolerance)
    #bSyncTolerance = 0.01+(0.1*bTolerance)
    #sSyncTolerance = 0.01-(0.1*sTolerance)
    global codedata
    global result
    # Per-channel "transmission started" flags.
    write = []
    index = 0
    partialError = 0
    dataIndex = 0
    dif = []
    while 1:
        mutex.acquire()
        # Grow the per-channel bit buffers to match the number of channels.
        while True:
            if len(data) > len(codedata):
                codedata.append([])
            else:
                break
        while True:
            if len(data) > len(write):
                write.append(False)
            else:
                break
        # start new index circualation (round-robin over channels)
        if dataIndex == len(data) - 1:
            dataIndex = -1
        #
        if dataIndex < len(data) - 1:
            dataIndex += 1
        # Nothing captured yet: release the lock and idle.
        if len(data) == 0:
            mutex.release()
            time.sleep(threadBreak)
            continue
        if len(data[dataIndex]) != 0:
            # Convert consecutive timestamp pairs into gap durations.
            while len(data[dataIndex]) > 2:  # enouth to compare?
                d1 = datetime.strptime(data[dataIndex][0], "%H:%M:%S.%f")
                d2 = datetime.strptime(data[dataIndex][1], "%H:%M:%S.%f")
                d1 = d2 - d1  # calculate the time between paket
                dif.append(float(d1.total_seconds()))
                data[dataIndex].pop(0)
            for f1 in dif:
                if sStartTolerance < f1 < bStartTolerance:  # searching the file start/end
                    index = 0
                    print(str(f1) + " \t=> Start of File")
                    if codedata[dataIndex] != []:
                        # get the Hash from the end of the data
                        hashFromServer = codedata[dataIndex][-8:]
                        del codedata[dataIndex][-8:]  # remove hash from data
                        hashFromServer = int(
                            ''.join(str(e) for e in hashFromServer), 2)
                        print("Hash from serer: " + str(hashFromServer))
                        # data from List to String
                        dataString = ''.join(str(e) for e in codedata[dataIndex])
                        # generating 8 bit Pearson Hash
                        hashFromClient = hash8(codedata[dataIndex], table)[0]
                        print("Hash from client: " + str(hashFromClient))
                        if hashFromServer != hashFromClient:
                            print(
                                "Mistake in data transfer... Hashes are not the same!"
                            )
                        print("")
                        print("Data: " + dataString)
                        print("Data Length: " + str(len(dataString)))
                        print("")
                        # making bitArray without Char encoding
                        b = BitArray(bin=dataString)
                        if hashFromClient == hashFromServer:  # successfully transfered
                            f = open('./' + filename, 'wb')  # open file
                            b.tofile(f)  # write to file
                            f.flush()
                            f.close()
                            return
                        # Hash mismatch: drop this channel's bits and retry.
                        codedata[dataIndex] = []
                    # false at the beginning as long the file hasnt started
                    if write[dataIndex] == False:
                        write[dataIndex] = True
                else:
                    if write[dataIndex] == True:
                        if sBigBreakTolerance < f1 < bBigBreakTolerance:  # time range for a 1
                            codedata[dataIndex].append("1")
                            # print result and distance to the range borders
                            print(str(index) + "\t" + str(f1) + " \t=> 1 ")
                        else:
                            if sSmallBreakTolerance < f1 < bSmallBreakTolerance:  # time range for 0
                                codedata[dataIndex].append("0")
                                print(str(index) + "\t" + str(f1) + " \t=> 0")
                            else:
                                # Gap fits no window: count as a partial error.
                                partialError += 1
                                print(
                                    str(index) + "\t" + str(f1) +
                                    " \t=> undefind: will be ignored")
                        index += 1
            dif = []
        mutex.release()
        time.sleep(threadBreak)
def calc():
    """Covert-timing-channel receiver loop (single-channel variant with
    error statistics and sync-gap skipping).

    Same bit-decoding scheme as the multi-channel ``calc``: gaps near
    ``longBreak`` are 1s, gaps near ``longBreak * factor`` are 0s, gaps near
    ``breakbetween`` delimit files.  The first ~10 gaps of a transmission are
    treated as sync and discarded.  On a verified Pearson-hash match the bits
    are written to ``filename`` and the function returns.

    NOTE(review): relies on module globals ``data``, ``mutex``, ``codedata``,
    timing constants, ``threadBreak``, ``filename``, ``hash8``, ``table``.
    The first-packet-timing print references a name ``times`` that is not
    defined anywhere visible — that branch would raise NameError; confirm.
    Indentation is reconstructed from a whitespace-mangled source.
    """
    # Acceptance windows around each nominal gap duration.
    bStartTolerance = breakbetween + (breakbetween * bTolerance)
    sStartTolerance = breakbetween - (breakbetween * sTolerance)
    bBigBreakTolerance = longBreak + (longBreak * bTolerance)
    sBigBreakTolerance = longBreak - (longBreak * bTolerance)
    bSmallBreakTolerance = (longBreak * factor) + (longBreak * factor * bTolerance)
    sSmallBreakTolerance = (longBreak * factor) - (longBreak * factor * sTolerance)
    #bSyncTolerance = 0.01+(0.1*bTolerance)
    #sSyncTolerance = 0.01-(0.1*sTolerance)
    global codedata
    global result
    write = False
    index = 0
    totalError = 0
    partialError = 0
    totalData = 0
    count = 0
    correctTransfert = 0
    startTime = []
    firstCorrectReceveTime = []
    passFirstTime = 0
    while 1:
        mutex.acquire()
        dif = list()
        # Convert consecutive timestamp pairs into gap durations.
        while len(data) > 2:  # enouth to compare?
            d1 = datetime.strptime(data[0], "%H:%M:%S.%f")
            d2 = datetime.strptime(data[1], "%H:%M:%S.%f")
            d1 = d2 - d1  # calculate the time between paket
            dif.append(float(d1.total_seconds()))
            data.pop(0)
        for f1 in dif:
            if sStartTolerance < f1 < bStartTolerance:  # searching the file start/end
                index = 0
                print(str(f1) + " \t=> Start of File")
                if codedata != []:
                    # get the Hash from the end of the data
                    hashFromServer = codedata[-8:]
                    del codedata[-8:]  # remove hash from data
                    hashFromServer = int(
                        ''.join(str(e) for e in hashFromServer), 2)
                    print("Hash from serer: " + str(hashFromServer))
                    # data from List to String
                    dataString = ''.join(str(e) for e in codedata)
                    # generating 8 bit Pearson Hash
                    hashFromClient = hash8(codedata, table)[0]
                    print("Hash from client: " + str(hashFromClient))
                    if hashFromServer != hashFromClient:
                        print(
                            "Mistake in data transfer... Hashes are not the same!"
                        )
                    print("")
                    print("Data: " + dataString)
                    print("Data Length: " + str(len(dataString)))
                    print("")
                    # Running error-rate statistics over all transmissions.
                    totalData += len(dataString)
                    totalError += partialError
                    partialError = 0
                    print("Fehlerrate gesamt: " +
                          str((totalError / totalData) * 100) + "%")
                    count += 1
                    # making bitArray without Char encoding
                    b = BitArray(bin=dataString)
                    if hashFromClient == hashFromServer:
                        f = open('./' + filename, 'wb')  # open file
                        b.tofile(f)  # write to file
                        f.flush()
                        f.close()
                        if correctTransfert == 0:
                            firstCorrectReceveTime = time.time()
                        correctTransfert += 1
                        print("Korrekt Übertragen: " + str(correctTransfert))
                        if startTime != [] and firstCorrectReceveTime != []:
                            # NOTE(review): `times` is undefined here — this
                            # print would raise NameError if reached; confirm.
                            print("Zeit bis zum ersten koreketn Paket" +
                                  (times + " " +
                                   str(firstCorrectReceveTime - startTime)))
                        return
                    if count == 20:
                        print("Korrekt Übertragen: " + str(correctTransfert))
                        if startTime != [] and firstCorrectReceveTime != []:
                            print("Zeit bis zum ersten koreketn Paket" +
                                  str(startTime - firstCorrectReceveTime))
                        #return
                    codedata = []
                # false at the beginning as long the file hasnt started
                if write == False:
                    startTime = time.time()
                    write = True
            else:
                if write == True:
                    if sBigBreakTolerance < f1 < bBigBreakTolerance:  # time range for a 1
                        # First gaps of a transmission are sync, not data.
                        if index <= 10:
                            print(str(index) + "\t" + str(f1) +
                                  " \t estimate sync")
                        else:
                            codedata.append("1")
                            # print result and distance to the range borders
                            print(str(index) + "\t" + str(f1) + " \t=> 1 ")
                    else:
                        if sSmallBreakTolerance < f1 < bSmallBreakTolerance:  # time range for 0
                            if index <= 10:
                                print(str(index) + "\t" + str(f1) +
                                      " \t=> estimate sync")
                            else:
                                codedata.append("0")
                                print(str(index) + "\t" + str(f1) + " \t=> 0")
                        else:
                            if index < 16:
                                print(str(index) + "\t" + str(f1) +
                                      " \t=> sync")
                            else:
                                # Gap fits no window: count as partial error.
                                partialError += 1
                                print(
                                    str(index) + "\t" + str(f1) +
                                    " \t=> undefind: will be ignored")
            index += 1
        mutex.release()
        time.sleep(threadBreak)
from bitstring import BitArray
from tkinter.filedialog import askopenfilename

# Ask the user for a "chicken" esolang source file (*.ch).
fn = askopenfilename(filetypes=(("chicken File", "*.ch"), ("All Files", "*.*")),
                     title="Select a chicken file")

# Read the whole source.  The original opened the file without ever closing
# it; the context manager fixes that handle leak.
with open(fn, 'r') as f:
    in_string = f.read()

# Encode: each literal 'chicken' token becomes a 1, each newline a 0, and
# spaces are dropped — yielding one bit per token plus a line terminator.
in_string = ((in_string.replace('chicken', '1')).replace(' ', '')).replace('\n', '0')
bit_array = BitArray(bin=in_string)
#print(bit_array.bin)

# Write the packed bits next to the input as <name>.cbit (fn[:-2] strips the
# 'ch' extension).  The with-block also guarantees the output is closed.
with open(fn[:-2] + 'cbit', 'wb') as out_file:
    bit_array.tofile(out_file)
def compress(file):
    """Huffman-compress the UTF-8 text file ``file`` into output_file.bin.

    Prints the tree, the bit string, and size statistics along the way.

    NOTE(review): appends into the module-level lists ``letter_binary`` and
    ``size`` and calls a module-level ``Sort()`` — all defined elsewhere in
    this file.  Indentation is reconstructed from a whitespace-mangled
    source.
    """
    print("Compressing...")
    print("")
    # Opens and reads the file, with UTF-8 encoding
    with codecs.open(file, 'r', encoding='utf8') as f:
        text = f.read()
    # Iterates through the characters in the file, adding each unique
    # character to letter_frequency and letters arrays.  letter_frequency is
    # a flat interleaved list [count, char, count, char, ...]; membership
    # testing against it works because counts are ints and chars are strs.
    letters = []
    letter_frequency = []
    for letter in text:
        if letter not in letter_frequency:
            frequency = text.count(letter)
            letter_frequency.append(frequency)
            letter_frequency.append(letter)
            letters.append(letter)
    # Creates the initial nodes for the Huffman Tree: split the flat list
    # into [count, char] pairs and sort by count.
    nodes = []
    while len(letter_frequency) > 0:
        nodes.append(letter_frequency[0:2])
        letter_frequency = letter_frequency[2:]
    nodes.sort()
    tree = []
    tree.append(nodes)
    # Iterates through characters, allocating each one a 1 or a zero, based
    # on whether the character is present in the node before.  Each round
    # merges the two lightest nodes into one and records the tree level.
    while len(nodes) > 1:
        x = 0
        new_node = []
        nodes.sort()
        nodes[x].append("0")
        nodes[x + 1].append("1")
        first_node = (nodes[x][0] + nodes[x + 1][0])
        second_node = (nodes[x][1] + nodes[x + 1][1])
        new_node.append(first_node)
        new_node.append(second_node)
        new_nodes = []
        new_nodes.append(new_node)
        new_nodes = new_nodes + nodes[2:]
        nodes = new_nodes
        tree.append(nodes)
    tree.sort(reverse=True)
    # Removes all duplicate items in the Huffman Tree.
    # NOTE(review): removing from `level` while iterating it can skip the
    # element after each removal — confirm intended behaviour.
    unique_nodes = []
    for level in tree:
        for node in level:
            if node not in unique_nodes:
                unique_nodes.append(node)
            else:
                level.remove(node)
    # Builds the unique binary code for each character based on its path in
    # the Huffman Tree (single-character files get the degenerate code "0").
    if len(letters) == 1:
        letter_code = [letters[0], "0"]
        letter_binary.append(letter_code * len(text))
    else:
        for letter in letters:
            lettercode = ""
            for node in unique_nodes:
                if len(node) > 2 and letter in node[1]:
                    lettercode = lettercode + node[2]
            letter_code = [letter, lettercode]
            letter_binary.append(letter_code)
    # Creates new array, containing only the character and binary code for
    # each character in the Huffman Tree.
    tree_levels = []
    tree_level = []
    for letter in letter_binary:
        tree_level.append(letter[0])
        tree_level.append(letter[1])
        tree_levels.append(tree_level)
        tree_level = []
    # Sorts and prints the Huffman Tree using the Sort() function.
    print("Huffman Tree of File:")
    print(Sort(tree_levels))
    print("")
    # Creates bitstring of the text in the file, using binary codes of each
    # character.
    binary_string = ""
    for character in text:
        for item in letter_binary:
            if character in item:
                binary_string = binary_string + item[1]
    # Prints the binary representation of the file.
    print("Compressed File:")
    print(binary_string)
    print("")
    # Writes the bitstring and the Huffman Tree to a bin file (compressed
    # file).  tofile() zero-pads the last byte before pickle data follows.
    a = BitArray(bin=binary_string)
    with open('output_file.bin', 'wb') as f:
        a.tofile(f)
        pk.dump(tree, f)
    # Calculates the size of the original file, the compressed file and the
    # size reduction.
    uncompressed_file_size = Path(file).stat().st_size
    compressed_file_size_bytes = Path('output_file.bin').stat().st_size
    compressed_file_size = len(binary_string)
    size.append(compressed_file_size)
    print("Original file size: ", uncompressed_file_size, " bytes")
    print("Compressed file size: ", compressed_file_size_bytes, " bytes")
    print("This is a reduction of ",
          (1 - (compressed_file_size_bytes / uncompressed_file_size)) * 100,
          "%")
def compression(target_file):
    """Huffman-compress ``target_file`` (UTF-8-with-BOM text) into
    ``<name>.bin``.

    The output bit stream is: ASCII-binary of the JSON frequency dictionary,
    then the ASCII-binary marker "catchphrase", then the Huffman-coded text.
    Prints timing for the table-building and compression phases.

    NOTE(review): indentation is reconstructed from a whitespace-mangled
    source; assumes ``target_file`` ends in a 4-character extension (.txt) —
    see ``filename[:-4]`` below.
    """
    start_dict = float(time.process_time())  # measure time from here
    # frequency dictionary
    letter_freq = {}
    # collect all characters in a text file
    opened_file = target_file
    filename = opened_file
    with open(opened_file, encoding="utf_8_sig") as f:
        for line in f:
            for letter in line:
                # print("'" + letter + "' found!")  # test code
                try:
                    letter_freq[letter] += 1  # dict take every characters
                except:
                    # NOTE(review): bare except; the lone `KeyError` below is
                    # a no-op expression — `except KeyError:` was probably
                    # intended.  Behaviour is the same for missing keys.
                    KeyError  # ignore other characters not in the dictionary
                    letter_freq.update({letter: 1})  # add new character to dict
    # print("letter_freq", letter_freq)

    # Class for creating node objects in a tree
    class Node(object):
        """An internal Huffman-tree node holding two daughter entries."""

        # constructor
        def __init__(self, left_node=None, right_node=None):
            """
            In a binary tree, each node can only have up to 2 daughters.
            The variables 'left_node' and 'right_node' store the daughter
            nodes when initialised.  A daughter can be either a character
            (leaf) or another Node object.
            """
            self.left_node = left_node  # left-hand node
            self.right_node = right_node  # right-hand node

        # getter method for left and right item
        def daughters(self):
            """Return the (left, right) daughters of this node."""
            return self.left_node, self.right_node

    # An ascending sorted list of (node, weight) tuples; each node is either
    # a character (leaf) or a Node object.  Built from letter_freq.
    list_nodes_objects = sorted(letter_freq.items(),
                                key=lambda x: x[1],
                                reverse=False)
    # print(list_nodes_objects)  # test code
    # this while loop iterates
    while len(list_nodes_objects) > 1:
        """
        This while loop iterates the list of node tuples until one item is
        left.  The goal of this loop is to generate 1 node, which contains
        all the combined nodes.
        """
        # take the 2 least weight nodes
        (character_1, freq_1) = list_nodes_objects[0]  # tuple (character, freq)
        (character_2, freq_2) = list_nodes_objects[1]
        # list of nodes updated after 2 nodes are taken out
        list_nodes_objects = list_nodes_objects[2:]
        # print("list_nodes_objects ", list_nodes_objects)  # test code
        # new node, contains the 2 combined smallest nodes in an object
        new_node = Node(character_1, character_2)
        # put the combined node back with the summed weight:
        # (combined node, sum of weight)
        list_nodes_objects.append((new_node, freq_1 + freq_2))
        # sort the array in order after a new node is appended
        list_nodes_objects = sorted(list_nodes_objects,
                                    key=lambda x: x[1],
                                    reverse=False)
        # print("sorted list_nodes_objects ", list_nodes_objects)  # test code

    # Huffman dictionary generator
    def huffman_dictionary(node_object, binary=''):
        """
        Recursively build the huffman code mapping (hash table) for each
        character.  ``node_object`` is either a Node or a character;
        ``binary`` accumulates the path bits ('0' left, '1' right) in every
        recursion.
        """
        # if a leaf (a string, not an object) is reached:
        if type(node_object) is str:
            # insert a new mapping to the dictionary
            return {node_object: binary}
        # extract (left_item, right_item) from the node's daughters
        (left_item, right_item) = node_object.daughters()
        encoding_dict = {}  # dictionary contains all the encodings
        # print("1 ", encoding_dict)  # test code
        # depth-first recursion until every leaf has been mapped
        encoding_dict.update(huffman_dictionary(left_item, binary + '0'))
        encoding_dict.update(huffman_dictionary(right_item, binary + '1'))
        # print("2 ", encoding_dict)  # test code
        return encoding_dict  # output the dictionary

    # Trigger 'huffman_dictionary' on the [first object] of the [first tuple]
    # remaining in the node list (the tree root built by the loop above).
    huffman_encoding = huffman_dictionary(list_nodes_objects[0][0])
    # print(huffman_encoding)  # test code
    # end measure time in this line to check processing time for this section
    end_dict = float(time.process_time())
    print(
        str(end_dict - start_dict) +
        " second taken to create a unique huffman hash table!")

    # add letter frequency to the compressed file:
    # convert the dictionary object into a JSON string
    string_dict = json.dumps(letter_freq)
    # print("string dict: ", string_dict)  # test code
    # convert dictionary to ascii binary
    byte_array = string_dict.encode()
    binary_int = int.from_bytes(byte_array, "big")
    binary_string = bin(binary_int)
    dict_binary = binary_string[2:]  # strip the '0b' prefix
    # print("dict bin: ", dict_binary)  # test code

    # add binary identifier (a catchphrase) for the dictionary section;
    # this separates the hash table from the huffman code
    byte_array1 = "catchphrase".encode()
    binary_int1 = int.from_bytes(byte_array1, "big")
    binary_catchphrase = bin(binary_int1)[2:]
    # print("catch bin: ", binary_catchphrase)  # test code
    # print("catch bin len: ", len(binary_catchphrase))  # test code: 87

    # ------
    huffman_binary = ''
    # insert binary encoding to a new compression file (second pass over the
    # input, translating every character through the code table)
    opened_file = open(opened_file, encoding="utf_8_sig")
    with opened_file as f:
        for line in f:
            for letter in line:
                # print(huffman_encoding[letter])  # test code
                huffman_binary += huffman_encoding[letter]
                # print(letter, huffman_encoding[letter])  # test code
    # print("huff: ", huffman_binary)  # test code

    # compress file with hash table into binary
    encodingString = ""  # the encoded content
    # insert decoding hash table and the catchphrase before the payload
    encodingString += dict_binary + binary_catchphrase + huffman_binary
    # print("binary: ", encodingString)  # test code

    # extract original file name
    file_name = filename[:-4]  # assuming the file to be compressed is .txt
    # print(file_name)  # test code
    # write 'encodingString' into a binary file
    compressed_file = BitArray(bin=encodingString)
    open_file = file_name + '.bin'
    with open(open_file, 'wb') as f:
        compressed_file.tofile(f)
    print("Compression completed!")
    # end measure time in this line to check processing time for this section
    end_compress = float(time.process_time())
    print(str(end_compress - end_dict) + " second taken to compress file!")


# Script entry point: `main()` is defined elsewhere in this file.
main()