def classify_list_of_videos(videos):
    LAST_EDGE_LAYER = FLAGS.layer_index
    NUM_BINS = FLAGS.num_bins
    DELTA_VALUE = FLAGS.delta_value
    codec_path = 'huffman_encoding_config/' + 'layer' + str(LAST_EDGE_LAYER) + '/' + 'num_bins_' + str(NUM_BINS)
    delta_hist = analysis.load_huff_dictionary(codec_path + '/delta_hist')
    delta_codec = HuffmanCodec.from_frequencies(delta_hist)
    frame_one_hist = analysis.load_huff_dictionary(codec_path + '/frame_one_hist')
    frame_one_codec = HuffmanCodec.from_frequencies(frame_one_hist)
    codecs = delta_codec, frame_one_codec
    for i in range(len(videos)):
        print('Classifying: ', videos[i])
        MSE = classify_video(videos[i], codecs, write=False)
        send(RESET)
        results_path = ('Results' + '/layer' + str(LAST_EDGE_LAYER) + '/num_bins_' + str(NUM_BINS)
                        + '/delta_value' + str(DELTA_VALUE))
        avg_error = sum(MSE) / len(MSE)
        result = 'file: ' + videos[i] + ', AVG_MSE: ' + str(avg_error) + '\n'
        if not os.path.isdir(results_path):
            try:
                os.makedirs(results_path)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
        with open(results_path + '/MSE.txt', 'a') as myfile:
            myfile.write(result)
def main():
    logging.basicConfig(level=logging.INFO)

    # Shakespeare Complete Work from Project Gutenberg
    url = 'http://www.gutenberg.org/files/100/100-0.txt'
    path = download(url, 'shakespeare.txt')
    with path.open('r', encoding='utf-8-sig') as f:
        raw = f.read()

    _log.info('Building codec from raw data')
    frequencies = Counter(raw)
    _log.info(f'Frequencies {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare-raw.pickle", metadata={"frequencies": frequencies})

    _log.info('Doing white space clean up')
    clean = raw
    clean = re.sub(r'\s*\n+\s*', '\n', clean)
    clean = re.sub(r' +', ' ', clean)
    frequencies = Counter(clean)
    _log.info(f'Frequencies {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare.pickle", metadata={"frequencies": frequencies})

    _log.info('Only handling lower case')
    lower = clean.lower()
    frequencies = Counter(lower)
    _log.info(f'Frequencies {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare-lower.pickle", metadata={"frequencies": frequencies})
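# Illustrative addition (not in the original script): codecs saved with
# codec.save(...) can be restored later with HuffmanCodec.load(...) and used
# directly, without rebuilding the frequency table. The path assumes the same
# CODECS directory as above.
codec = HuffmanCodec.load(CODECS / "shakespeare-lower.pickle")
phrase = 'to be or not to be'
encoded = codec.encode(phrase)
assert codec.decode(encoded) == phrase
print(f'{len(phrase)} chars -> {len(encoded)} bytes')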
def main():
    logging.basicConfig(level=logging.INFO)

    # JSON data sets from https://www.data.gov/
    urls = [
        "https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.json",
        "https://data.cdc.gov/api/views/bi63-dtpu/rows.json",
        "https://data.cdc.gov/api/views/cjae-szjv/rows.json",
        "https://data.cityofnewyork.us/api/views/25th-nujf/rows.json",
        "https://data.ct.gov/api/views/kbxi-4ia7/rows.json",
        "https://data.cityofchicago.org/api/views/pfsx-4n4m/rows.json",
        "https://www.chapelhillopendata.org/api/v2/catalog/datasets/bicycle-crash-data-chapel-hill-region/exports/json",
        "https://data.cdc.gov/api/views/6vp6-wxuq/rows.json",
        "https://www.sba.gov/sites/default/files/data.json",
        "https://data.cdc.gov/api/views/e6fc-ccez/rows.json",
        "https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.json",
        "https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.json",
        "https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.json",
        "https://data.cityofnewyork.us/api/views/5t4n-d72c/rows.json",
        "https://data.cdc.gov/api/views/6rkc-nb2q/rows.json",
        "https://data.sfgov.org/api/views/j4sj-j2nf/rows.json",
        "https://data.kingcounty.gov/api/views/gmen-63jm/rows.json",
        "https://data.mo.gov/api/views/vpge-tj3s/rows.json",
    ]

    _log.info('Building frequency tables')
    frequencies_raw = Counter()
    frequencies_compact = Counter()
    for url in urls:
        path = download(url, 'json-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json')
        with path.open('r') as f:
            raw = f.read()
        # Only take first N bytes.
        # Large files probably have a lot of structural repetition, which skews the frequencies
        frequencies_raw.update(raw[:100000])
        # Parse and re-encode to compact JSON
        compact = json.dumps(json.loads(raw), separators=(',', ':'))
        frequencies_compact.update(compact[:100000])

    # TODO add more metadata
    _log.info(f'Frequencies raw {len(frequencies_raw)}: {frequencies_raw}')
    codec = HuffmanCodec.from_frequencies(frequencies_raw)
    codec.save(CODECS / "json.pickle", metadata={"frequencies": frequencies_raw})

    _log.info(f'Frequencies compact {len(frequencies_compact)}: {frequencies_compact}')
    codec = HuffmanCodec.from_frequencies(frequencies_compact)
    codec.save(CODECS / "json-compact.pickle", metadata={"frequencies": frequencies_compact})
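# Illustrative addition (not in the original script; could be appended at the
# end of main()): a quick check of how the shared table performs on a small
# compact-JSON sample. Symbols absent from frequencies_compact have no code
# and would fail to encode, so the sample sticks to characters that certainly
# occur in the crawled data.
sample = '{"name":"example","values":[1,2,3],"ok":true}'
codec = HuffmanCodec.from_frequencies(frequencies_compact)
print(f'{len(sample)} chars raw -> {len(codec.encode(sample))} bytes encoded')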
def test_eof_cut_off():
    # Using frequency table that should give this encoding
    # A   -> 0
    # B   -> 11
    # C   -> 101
    # EOF -> 100
    codec = HuffmanCodec.from_frequencies({
        'A': 5,
        'B': 4,
        'C': 2,
    })
    cases = {
        # Straightforward cases
        '': 0,
        'A': 1,
        'AB': 1,
        'ABB': 1,
        'CCC': 2,
        # Cases where EOF cut-off saves one output byte
        'ACC': 1,
        'CC': 1,
        'CCCCC': 2,
    }
    for data, expected_length in cases.items():
        encoded = codec.encode(data)
        assert len(encoded) == expected_length
        assert data == codec.decode(encoded)
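# Worked reading of one "cut-off" case above, using the code table from the
# test's own comment (interpretation, not taken from the source):
#   'ACC' -> A=0, C=101, C=101 = 7 bits, leaving 1 free bit in the byte.
# Emitting the full 3-bit EOF code (100) would spill into a second byte;
# cutting it off after its first bit still leaves the decoder with an
# incomplete codeword at end-of-data, which reads as EOF. Hence the expected
# size of 1 byte rather than 2.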
def huffman_encode_block(zigzagged_block):
    # Build a Huffman codec from this block's own symbol frequencies and return
    # the encoded bytes together with the frequency table needed for decoding.
    frequencies = collections.Counter(zigzagged_block)
    huffman_codec = HuffmanCodec.from_frequencies(frequencies)
    return huffman_codec.encode(zigzagged_block), dict(frequencies)
def test_print_code_table():
    codec = HuffmanCodec.from_frequencies({'a': 2, 'b': 4, 'c': 8})
    out = StringIO()
    codec.print_code_table(out=out)
    dump = out.getvalue()
    assert re.search(r"1\s+1\s+.*'c'", dump)
    assert re.search(r"2\s+01\s+.*'b'", dump)
    assert re.search(r"3\s+001\s+.*'a'", dump)
    assert re.search(r"3\s+000\s+.*_EOF", dump)
def test_trailing_zero_handling():
    """
    Just two symbols ('a' and 'b'): without end-of-file handling, each would
    take only 1 bit (e.g. a=0 and b=1), so 'abba' would be the 4 bits '0110',
    padded with zeros to fill a byte: '01100000', which is indistinguishable
    from the result of input 'abbaaaaa'. With proper end-of-file handling,
    trailing bits are ignored properly.
    """
    codec = HuffmanCodec.from_frequencies({'a': 1, 'b': 1})
    decoded = codec.decode(codec.encode('abba'))
    assert decoded == 'abba'
def test_custom_eof_in_frequencies():
    codec = HuffmanCodec.from_frequencies({
        'A': 5,
        'B': 3,
        'C': 2,
        'Z': 8
    }, eof="Z")
    encoded = codec.encode("ABCACBZABAB")
    assert codec.decode(encoded) == "ABCACB"
def encode(data, fps=10):
    assert len(data.shape) == 4
    num_frames = data.shape[0]
    # Binarize: keep only the sign of each value, mapping zeros to +1
    flattened = torch.flatten(data).sign()
    print(flattened.shape)
    flattened[flattened == 0] = 1
    data = flattened.int().tolist()
    freqs = {}
    freqs[1] = torch.sum(flattened == 1).item()
    freqs[-1] = torch.sum(flattened == -1).item()
    assert freqs[1] + freqs[-1] == len(data)
    codec = HuffmanCodec.from_frequencies(freqs)
    encoded = codec.encode(data)
    added_bitrate = (len(encoded) + (len(freqs) * (32 * 2 * 8))) / (num_frames * (2 ** 20)) * fps
    return freqs, encoded, added_bitrate
def main():
    logging.basicConfig(level=logging.INFO)

    # XML data sets from https://www.data.gov/
    urls = [
        "https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.xml",
        "https://data.cdc.gov/api/views/bi63-dtpu/rows.xml",
        "https://data.cdc.gov/api/views/cjae-szjv/rows.xml",
        "https://data.cityofnewyork.us/api/views/25th-nujf/rows.xml",
        "https://data.ct.gov/api/views/kbxi-4ia7/rows.xml",
        "https://data.cityofchicago.org/api/views/pfsx-4n4m/rows.xml",
        "https://data.cdc.gov/api/views/6vp6-wxuq/rows.xml",
        "https://www.sba.gov/sites/default/files/data.xml",
        "https://data.cdc.gov/api/views/e6fc-ccez/rows.xml",
        "https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.xml",
        "https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.xml",
        "https://gisdata.nd.gov/Metadata/ISO/xml/metadata_Roads_MileMarkers.xml",
        "https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.xml",
        "https://data.cityofnewyork.us/api/views/5t4n-d72c/rows.xml",
        "https://data.cdc.gov/api/views/6rkc-nb2q/rows.xml",
        "https://gisdata.nd.gov/Metadata/ISO/xml/metadata_Airports.xml",
        "https://data.sfgov.org/api/views/j4sj-j2nf/rows.xml",
        "https://data.kingcounty.gov/api/views/gmen-63jm/rows.xml",
        "https://data.mo.gov/api/views/vpge-tj3s/rows.xml",
    ]

    _log.info('Building frequency tables')
    frequencies = Counter()
    for url in urls:
        path = download(url, 'xml-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.xml')
        with path.open('r') as f:
            # Only take first N bytes.
            # Large files probably have a lot of structural repetition, which skews the frequencies
            raw = f.read(100000)
        frequencies.update(raw)

    # TODO add more metadata
    _log.info(f'Frequencies raw {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "xml.pickle", metadata={"frequencies": frequencies})
def decode(data):
    # Decode binary data carrying the special header written by the compression step
    real_size = int.from_bytes(data[:POS_NUM_OF_CHARS], byteorder='little')  # original payload size; unused below
    num_of_chars = int.from_bytes(data[POS_NUM_OF_CHARS:POS_NUM_OF_CHARS + 2], byteorder='big')
    dummy_list = []
    print("num_of_chars ", num_of_chars)
    # Each alphabet entry is 4 bytes: 1 symbol byte + 3-byte big-endian frequency
    for i in range(num_of_chars):
        c_ch = data[POS_ALPHABET + i * 4]
        c_ch_freq = int.from_bytes(
            data[POS_ALPHABET + 1 + i * 4:POS_ALPHABET + 1 + i * 4 + 3],
            byteorder='big')
        dummy_list.append((c_ch, c_ch_freq))
    alphabet_frequencies = dict(dummy_list)
    print("alphabet_frequencies", alphabet_frequencies)
    codec = HuffmanCodec.from_frequencies(alphabet_frequencies)
    codec.print_code_table()
    return codec.decode(data[POS_ALPHABET + num_of_chars * 4:])
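# Hypothetical counterpart (not from the original source). The header layout
# implied by decode() above is:
#   data[:POS_NUM_OF_CHARS]                    original payload size, little-endian
#   data[POS_NUM_OF_CHARS:POS_NUM_OF_CHARS+2]  number of distinct symbols, big-endian
#   from POS_ALPHABET on: per symbol, 1 byte for the symbol itself,
#                         then its frequency in 3 big-endian bytes
#   after the table: the Huffman-encoded payload
# The sketch assumes POS_ALPHABET == POS_NUM_OF_CHARS + 2, which the slicing in
# decode() is consistent with but does not pin down.
def encode(raw):
    frequencies = collections.Counter(raw)  # raw is bytes, so symbols are ints
    header = len(raw).to_bytes(POS_NUM_OF_CHARS, byteorder='little')
    header += len(frequencies).to_bytes(2, byteorder='big')
    for ch, freq in frequencies.items():
        header += bytes([ch]) + freq.to_bytes(3, byteorder='big')
    codec = HuffmanCodec.from_frequencies(dict(frequencies))
    return header + codec.encode(raw)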
def main():
    # Create a TCP/IP socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    # Bind the socket to the port
    server_address = ('localhost', 10000)
    print('starting up on %s port %s' % server_address, file=sys.stderr)
    sock.bind(server_address)

    # Listen for incoming connections
    sock.listen(1)
    connect = True
    failCount = 0
    passCount = 0
    INCEPT = torchvision.models.inception_v3(pretrained=True)
    PREVIOUS_ARRAY = None
    USE_DELTA = True
    LAST_EDGE_LAYER = FLAGS.layer_index
    NUM_BINS = FLAGS.num_bins
    DELTA_VALUE = FLAGS.delta_value
    save_results = FLAGS.save_results
    analyse_fc_results = FLAGS.compare_fc
    print('Number of bins: ', NUM_BINS)
    print('DELTA VALUE: ', DELTA_VALUE)
    print('Last edge index: ', LAST_EDGE_LAYER)
    print('Saving Results: ', save_results)
    if LAST_EDGE_LAYER == 7:
        RESHAPE_ARRAY_DIMENSIONS = [192, 35, 35]
    elif LAST_EDGE_LAYER == 11:
        RESHAPE_ARRAY_DIMENSIONS = [768, 17, 17]
    elif LAST_EDGE_LAYER == 6:
        RESHAPE_ARRAY_DIMENSIONS = [192, 71, 71]
    else:
        RESHAPE_ARRAY_DIMENSIONS = None
        print("Reshape dimensions not defined for layer being partitioned")
    codec_path = 'huffman_encoding_config/' + 'layer' + str(LAST_EDGE_LAYER) + '/' + 'num_bins_' + str(NUM_BINS)
    delta_hist = load_huff_dictionary(codec_path + '/delta_hist')
    delta_codec = HuffmanCodec.from_frequencies(delta_hist)
    frame_one_hist = load_huff_dictionary(codec_path + '/frame_one_hist')
    frame_one_codec = HuffmanCodec.from_frequencies(frame_one_hist)
    sizes = []
    videos = [
        "videos/n01443537/goldfish_1.mp4", "videos/n01443537/goldfish_2.mp4", "videos/n01443537/goldfish_3.mp4",
        "videos/n01882714/koala_1.mp4", "videos/n01882714/koala_2.mp4", "videos/n01882714/koala_3.mp4",
        "videos/n02085620/dog_1.mp4", "videos/n02099601/golden_retriever_1.mp4", "videos/n02099712/golden_retriever_1.mp4",
        "videos/n02110958/pug_1.mp4", "videos/n02110958/pug_3.mp4", "videos/n02110958/pug_4.mp4",
        "videos/n02206856/bee_1.mp4", "videos/n02391049/zebra_1.mp4", "videos/n02391049/zebra_2.mp4",
        "videos/n02391049/zebra_3.mp4", "videos/n02510455/panda_1.mp4", "videos/n02510455/panda_2.mp4",
        "videos/n02510455/panda_3.mp4", "videos/n02510455/panda_4.mp4", "videos/n02510455/panda_5.mp4",
        "videos/n02676566/guitar_1.mp4", "videos/n02676566/guitar_2.mp4", "videos/n02676566/guitar_3.mp4",
        "videos/n02676566/guitar_4.mp4", "videos/n02676566/guitar_6.mp4", "videos/n02787622/banjo_1.mp4",
        "videos/n02787622/banjo_2.mp4", "videos/n02787622/banjo_3.mp4", "videos/n02787622/banjo_5.mp4",
        "videos/n03452741/piano_1.mp4", "videos/n03452741/piano_2.mp4", "videos/n03495258/harp_1.mp4",
        "videos/n03495258/harp_2.mp4", "videos/n03495258/harp_3.mp4", "videos/n03584254/ipod_1.mp4",
        "videos/n03584254/ipod_2.mp4", "videos/n03967562/plough_1.mp4", "videos/n04536866/violin_3.mp4",
        "videos/n04536866/violin_4.mp4", "videos/n06596364/comic_1.mp4", "videos/n01910747/jelly_fish_1.mp4",
        "videos/n01910747/jelly_fish_2.mp4", "videos/n02134084/polar_bear_1.mp4", "videos/n02134084/polar_bear_3.mp4",
        "videos/n02342885/hamster_1.mp4", "videos/n02342885/hamster_2.mp4", "videos/n02342885/hamster_4.mp4",
        "videos/n02342885/hamster_5.mp4", "videos/n02364673/guinea_pig_1.mp4", "videos/n02364673/guinea_pig_2.mp4"
    ]
    # For analysing fc output
    if analyse_fc_results is True:
        test_videos = [
            "videos/n01882714/koala_1.mp4",
            "videos/n02510455/panda_1.mp4",
            "videos/n02676566/guitar_2.mp4",
            "videos/n02133161/bear_1.mp4",
            "videos/n02110958/pug_3.mp4"
        ]
        videos = test_videos
    vid_num = 0
    frame_number = 0
    cats = json.load(open('config/categories.json'))
    class_id = videos[vid_num].split('/')[1]
    for j in range(len(cats)):
        if cats[j]['id'] == class_id:
            index = cats[j]['index']

    while True:
        # Wait for a connection
        received = ''
        arr = bytearray()
        byte_size = 0
        print('waiting for a connection', file=sys.stderr)
        connection, client_address = sock.accept()
        try:
            print('connection from', client_address, file=sys.stderr)
            # Receive the data in small chunks and retransmit it
            while True:
                data = connection.recv(1024)
                byte_size = byte_size + len(data)
                arr.extend(data)
                # print('received "%s"' % data, file=sys.stderr)
                if data:
                    # print('sending data back to the client', file=sys.stderr)
                    connection.sendall(data)
                else:
                    print('no more data from', client_address, file=sys.stderr)
                    connect = False
                    break
        finally:
            # Clean up the connection
            connection.close()
        print('Size of data received: ', byte_size)

        # Code for receiving reset to frame one
        if byte_size == 1:
            print('Received reset')
            avg_byte_size = sum(sizes) / len(sizes)
            passRate = (passCount / (failCount + passCount)) * 100
            print('percentage of passed: ', passRate)
            result = ('file: ' + videos[vid_num] + ', %Passed: ' + str(passRate)
                      + ', avg_byte_size: ' + str(avg_byte_size)
                      + ', layer: ' + str(LAST_EDGE_LAYER)
                      + ', num_bins_used: ' + str(NUM_BINS)
                      + ', Delta Value: ' + str(DELTA_VALUE) + '\n')
            results_path = ('Results' + '/layer' + str(LAST_EDGE_LAYER)
                            + '/num_bins_' + str(NUM_BINS)
                            + '/delta_value' + str(DELTA_VALUE))
            if save_results is True:
                if not os.path.isdir(results_path):
                    try:
                        os.makedirs(results_path)
                    except OSError as e:
                        if e.errno != errno.EEXIST:
                            raise
                with open(results_path + "/results.txt", "a") as myfile:
                    myfile.write(result)
            # Resetting variables
            PREVIOUS_ARRAY = None
            sizes = []
            passCount = 0
            failCount = 0
            vid_num += 1
            frame_number = 0
            cats = json.load(open('config/categories.json'))
            class_id = videos[vid_num].split('/')[1]
            for j in range(len(cats)):
                if cats[j]['id'] == class_id:
                    index = cats[j]['index']
        elif PREVIOUS_ARRAY is not None and USE_DELTA is True:
            decoded = delta_codec.decode(arr)
            arr = np.reshape(decoded, RESHAPE_ARRAY_DIMENSIONS)
            decoded_arr = decode(arr, NUM_BINS, max_num=8, min_num=-8)
            delta_decoded_arr = decode_delta(PREVIOUS_ARRAY, decoded_arr)
            PREVIOUS_ARRAY = delta_decoded_arr
            fc_out = server_run(torch.Tensor(delta_decoded_arr), LAST_EDGE_LAYER, INCEPT)
            result = classify_server_run(fc_out, class_label=index)
        else:
            decoded = frame_one_codec.decode(arr)
            arr = np.reshape(decoded, RESHAPE_ARRAY_DIMENSIONS)
            decoded_arr = decode(arr, NUM_BINS, max_num=8, min_num=-8)
            PREVIOUS_ARRAY = decoded_arr
            fc_out = server_run(torch.Tensor(decoded_arr), LAST_EDGE_LAYER, INCEPT)
            result = classify_server_run(fc_out, class_label=index)

        # '0' for false, '1' for true; str so it can be written to file and totals calculated easily
        top_five_is_the_same = '0'
        top_one_is_the_same = '0'
        if analyse_fc_results is True:
            video_file = videos[vid_num].split('/')[2]
            video = video_file.split('.')[0]
            saved_fc_dir = 'Results/fc_results/' + class_id
            path_to_saved_fc = saved_fc_dir + '/' + video + '_' + str(frame_number) + '.npy'
            print('looking for: ', path_to_saved_fc)
            if os.path.isdir(saved_fc_dir):
                print('path exists')
                unencoded_fc = np.load(path_to_saved_fc)
                unencoded_top_five = unencoded_fc.argsort()[0][-1:-6:-1]  # Get top 5 classifications.
                encoded_top_five = fc_out.data.numpy().argsort()[0][-1:-6:-1]
                unencoded_top = unencoded_fc.argsort()[0][-1]
                encoded_top = fc_out.data.numpy().argsort()[0][-1]
                if np.array_equal(unencoded_top_five, encoded_top_five):
                    top_five_is_the_same = '1'
                if np.array_equal(unencoded_top, encoded_top):
                    top_one_is_the_same = '1'
                path_to_results = ('Results/fc_results/comparison_results/layer_' + str(LAST_EDGE_LAYER)
                                   + '/num_bins_' + str(NUM_BINS)
                                   + '/delta_value_' + str(DELTA_VALUE))
                fc_analysis_result = (videos[vid_num] + ' ,same top five predictions ,' + top_five_is_the_same
                                      + ' ,same top one prediction ,' + top_one_is_the_same + '\n')
                if not os.path.isdir(path_to_results):
                    os.makedirs(path_to_results)
                with open(path_to_results + '/fc_results.txt', 'a') as myfile:
                    myfile.write(fc_analysis_result)
            frame_number += 1

        if result:
            passCount += 1
        else:
            failCount += 1
        sizes.append(byte_size)
        print('Total checked: ', passCount + failCount)
        print('Number of correct classifications: ', passCount)
        print('Number Failed: ', failCount)
def decode(encoded, freqs, shape):
    codec = HuffmanCodec.from_frequencies(freqs)
    decoded = torch.Tensor(codec.decode(encoded))
    return torch.reshape(decoded, shape)
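# Round-trip sketch (illustrative; pairs this decode() with the sign-based
# encode() shown earlier in this collection). The frequency table and the
# original shape are the side information that must travel with the bytes.
data = torch.randn(2, 3, 4, 4)
freqs, encoded, _ = encode(data)
restored = decode(encoded, freqs, data.shape)
expected = data.sign()
expected[expected == 0] = 1
assert torch.equal(restored, expected)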
def __init__(self, dataset):
    c = collections.Counter()
    for row in dataset:
        c.update(row)
    self.collection = c
    self.codec = HuffmanCodec.from_frequencies(self.collection)
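# Usage sketch (illustrative; DatasetCodec is a hypothetical name for the
# class whose __init__ is shown above): build one shared codec over every
# symbol occurring anywhere in the dataset, then encode rows individually.
dc = DatasetCodec(['hello world', 'hold the door'])
packed = dc.codec.encode('hello world')
assert dc.codec.decode(packed) == 'hello world'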
def huffman_decode_block(encoded_block, block_frequencies):
    huffman_codec = HuffmanCodec.from_frequencies(block_frequencies)
    decoded_block = huffman_codec.decode(encoded_block)
    return decoded_block
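# Round-trip sketch (illustrative): huffman_decode_block rebuilds an identical
# codec from the per-block frequency table returned by huffman_encode_block
# (shown earlier in this collection), so that table is the only side
# information a decoder needs.
block = [0, 0, 0, 1, 0, -1, 2, 0, 0, 0]
encoded_block, block_frequencies = huffman_encode_block(block)
assert huffman_decode_block(encoded_block, block_frequencies) == block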
def __init__(self, method):
    try:
        self.freqs = frequencies[method]
        self.codec = HuffmanCodec.from_frequencies(self.freqs)
    except KeyError:
        raise KeyError('Invalid Huffman dictionary.')
ext = tldextract.extract(url)
other.extend(ext.domain + parts.path + parts.params + '?' + parts.query + '#' + parts.fragment)
suffix.append(ext.suffix)

counter = Counter(other)
freq = defaultdict(int)
freq.update(counter)
# Make sure that all allowed url characters have an entry
allowed_characters = "!#$&'()*+,/:;=?@[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~"
for char in allowed_characters:
    freq[char] = freq[char]
other_codec = HuffmanCodec.from_frequencies(freq)
other_codec.save('other.codec')

counter = Counter(suffix)
freq = defaultdict(int)
freq.update(counter)
# Make sure that all tlds have an entry
# TODO
all_tlds = []
for char in all_tlds:
    freq[char] = freq[char]
suffix_codec = HuffmanCodec.from_frequencies(freq)
suffix_codec.save('suffix.codec')
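# Usage sketch (illustrative): the saved codec files can be restored later and
# reused, e.g. to compress the non-suffix part of a URL with the shared table.
# Every character below is in allowed_characters, so it is guaranteed a code.
from dahuffman import HuffmanCodec

other_codec = HuffmanCodec.load('other.codec')
packed = other_codec.encode('example/path?q=1')
assert other_codec.decode(packed) == 'example/path?q=1'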