def classify_list_of_videos(videos):
    LAST_EDGE_LAYER = FLAGS.layer_index
    NUM_BINS = FLAGS.num_bins
    DELTA_VALUE = FLAGS.delta_value
    codec_path = 'huffman_encoding_config/' + 'layer' + str(
        LAST_EDGE_LAYER) + '/' + 'num_bins_' + str(NUM_BINS)
    delta_hist = analysis.load_huff_dictionary(codec_path + '/delta_hist')
    delta_codec = HuffmanCodec.from_frequencies(delta_hist)
    frame_one_hist = analysis.load_huff_dictionary(codec_path +
                                                   '/frame_one_hist')
    frame_one_codec = HuffmanCodec.from_frequencies(frame_one_hist)
    codecs = delta_codec, frame_one_codec

    for video in videos:
        print('Classifying: ', video)
        MSE = classify_video(video, codecs, write=False)
        send(RESET)
        results_path = "Results" + '/layer' + str(LAST_EDGE_LAYER) + '/num_bins_' + str(NUM_BINS) + '/delta_value' \
                       + str(DELTA_VALUE)
        avg_error = sum(MSE) / len(MSE)
        result = 'file: ' + video + ', AVG_MSE: ' + str(avg_error) + '\n'
        if not os.path.isdir(results_path):
            try:
                os.makedirs(results_path)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
        with open(results_path + '/MSE.txt', 'a') as myfile:
            myfile.write(result)
Code example #2
File: shakespeare.py  Project: mcognetta/dahuffman
def main():
    logging.basicConfig(level=logging.INFO)
    # Shakespeare Complete Work from Project Gutenberg
    url = 'http://www.gutenberg.org/files/100/100-0.txt'
    path = download(url, 'shakespeare.txt')

    with path.open('r', encoding='utf-8-sig') as f:
        raw = f.read()

    _log.info('Building codec from raw data')
    frequencies = Counter(raw)
    _log.info(f'Frequencies {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare-raw.pickle",
               metadata={"frequencies": frequencies})

    _log.info('Doing white space clean up')
    clean = raw
    clean = re.sub(r'\s*\n+\s*', '\n', clean)
    clean = re.sub(r' +', ' ', clean)
    frequencies = Counter(clean)
    _log.info(f'Frequencies {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare.pickle",
               metadata={"frequencies": frequencies})

    _log.info('Only handling lower case')
    lower = clean.lower()
    frequencies = Counter(lower)
    _log.info(f'Frequencies {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare-lower.pickle",
               metadata={"frequencies": frequencies})
Code example #3
File: json-data.py  Project: mcognetta/dahuffman
def main():
    logging.basicConfig(level=logging.INFO)

    # JSON data sets from https://www.data.gov/
    urls = [
        "https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.json",
        "https://data.cdc.gov/api/views/bi63-dtpu/rows.json",
        "https://data.cdc.gov/api/views/cjae-szjv/rows.json",
        "https://data.cityofnewyork.us/api/views/25th-nujf/rows.json",
        "https://data.ct.gov/api/views/kbxi-4ia7/rows.json",
        "https://data.cityofchicago.org/api/views/pfsx-4n4m/rows.json",
        "https://www.chapelhillopendata.org/api/v2/catalog/datasets/bicycle-crash-data-chapel-hill-region/exports/json",
        "https://data.cdc.gov/api/views/6vp6-wxuq/rows.json",
        "https://www.sba.gov/sites/default/files/data.json",
        "https://data.cdc.gov/api/views/e6fc-ccez/rows.json",
        "https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.json",
        "https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.json",
        "https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.json",
        "https://data.cityofnewyork.us/api/views/5t4n-d72c/rows.json",
        "https://data.cdc.gov/api/views/6rkc-nb2q/rows.json",
        "https://data.sfgov.org/api/views/j4sj-j2nf/rows.json",
        "https://data.kingcounty.gov/api/views/gmen-63jm/rows.json",
        "https://data.mo.gov/api/views/vpge-tj3s/rows.json",
    ]

    _log.info('Building frequency tables')
    frequencies_raw = Counter()
    frequencies_compact = Counter()
    for url in urls:
        path = download(
            url, 'json-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() +
            '.json')
        with path.open('r') as f:
            raw = f.read()
        # Only take the first 100,000 characters.
        # Large files probably have a lot of structural repetition, which would skew the frequencies.
        frequencies_raw.update(raw[:100000])

        # Parse and re-encode to compact JSON
        compact = json.dumps(json.loads(raw), separators=(',', ':'))
        frequencies_compact.update(compact[:100000])

    # TODO add more metadata
    _log.info(f'Frequencies raw {len(frequencies_raw)}: {frequencies_raw}')
    codec = HuffmanCodec.from_frequencies(frequencies_raw)
    codec.save(CODECS / "json.pickle",
               metadata={"frequencies": frequencies_raw})

    _log.info(
        f'Frequencies compact {len(frequencies_compact)}: {frequencies_compact}'
    )
    codec = HuffmanCodec.from_frequencies(frequencies_compact)
    codec.save(CODECS / "json-compact.pickle",
               metadata={"frequencies": frequencies_compact})
Code example #4
File: test_dahuffman.py  Project: mcognetta/dahuffman
def test_eof_cut_off():
    # Using frequency table that should give this encoding
    # A   -> 0
    # B   -> 11
    # C   -> 101
    # EOF -> 100
    codec = HuffmanCodec.from_frequencies({
        'A': 5,
        'B': 4,
        'C': 2,
    })
    cases = {
        # Straightforward cases
        '': 0,
        'A': 1,
        'AB': 1,
        'ABB': 1,
        'CCC': 2,
        # Cases where EOF cut-off saves one output byte
        'ACC': 1,
        'CC': 1,
        'CCCCC': 2,
    }
    for data, expected_length in cases.items():
        encoded = codec.encode(data)
        assert len(encoded) == expected_length
        assert data == codec.decode(encoded)
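The byte counts in the table above can be reproduced by hand from the code lengths quoted in the comment (A is 1 bit, B is 2, C is 3): the numbers are consistent with emitting only the part of the EOF code needed to finish the last byte, so the encoded size is simply the payload bits rounded up to whole bytes. A minimal back-of-the-envelope check (not part of the test suite):

import math

code_lengths = {'A': 1, 'B': 2, 'C': 3}   # bit lengths from the comment above

def expected_bytes(data):
    payload_bits = sum(code_lengths[s] for s in data)
    # Payload bits rounded up to whole bytes; the EOF cut-off keeps the rest from
    # spilling into an extra byte.
    return math.ceil(payload_bits / 8)

assert all(expected_bytes(s) == n for s, n in {
    '': 0, 'A': 1, 'AB': 1, 'ABB': 1, 'CCC': 2, 'ACC': 1, 'CC': 1, 'CCCCC': 2,
}.items())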
Code example #5
def huffman_encode_block(zigzagged_block):
    # Build a per-block frequency table, Huffman-encode the zig-zagged coefficients,
    # and return the frequencies alongside the payload so the block can be decoded later.
    frequencies = collections.Counter(zigzagged_block)
    huffman_codec = HuffmanCodec.from_frequencies(frequencies)
    return huffman_codec.encode(zigzagged_block), dict(frequencies)
Code example #6
File: test_dahuffman.py  Project: mcognetta/dahuffman
def test_print_code_table():
    codec = HuffmanCodec.from_frequencies({'a': 2, 'b': 4, 'c': 8})
    out = StringIO()
    codec.print_code_table(out=out)
    dump = out.getvalue()
    assert re.search(r"1\s+1\s+.*'c'", dump)
    assert re.search(r"2\s+01\s+.*'b'", dump)
    assert re.search(r"3\s+001\s+.*'a'", dump)
    assert re.search(r"3\s+000\s+.*_EOF", dump)
Code example #7
File: test_dahuffman.py  Project: mcognetta/dahuffman
def test_trailing_zero_handling():
    """
    Just two symbols ('a' and 'b'): without end-of-file handling, each would only take 1 bit (e.g. a=0 and b=1)
    so 'abba' would be 4 bits '0110', trailed with zeros to fill a byte: '01100000', which is indistinguishable
    from the result of input 'abbaaaaa'. With proper end-of-file handling, trailing bits are ignored properly.
    """
    codec = HuffmanCodec.from_frequencies({'a': 1, 'b': 1})
    decoded = codec.decode(codec.encode('abba'))
    assert decoded == 'abba'
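For contrast with the scenario described in the docstring, a naive 1-bit-per-symbol code without an EOF symbol really does misread its own padding; a tiny illustrative sketch (not dahuffman):

naive = {'a': '0', 'b': '1'}                               # hypothetical EOF-less code
bits = ''.join(naive[s] for s in 'abba').ljust(8, '0')     # '0110' padded to '01100000'
assert ''.join('ab'[int(b)] for b in bits) == 'abbaaaaa'   # padding decoded as extra 'a's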
Code example #8
File: test_dahuffman.py  Project: mcognetta/dahuffman
def test_custom_eof_in_frequencies():
    codec = HuffmanCodec.from_frequencies(
        {'A': 5, 'B': 3, 'C': 2, 'Z': 8}, eof="Z")
    encoded = codec.encode("ABCACBZABAB")
    assert codec.decode(encoded) == "ABCACB"
Code example #9
File: huffman.py  Project: devloop0/cs348k-project
def encode(data, fps=10):
    assert len(data.shape) == 4
    num_frames = data.shape[0]
    flattened = torch.flatten(data).sign()
    print(flattened.shape)
    flattened[flattened == 0] = 1
    data = flattened.int().tolist()

    freqs = {}
    freqs[1] = torch.sum(flattened == 1).item()
    freqs[-1] = torch.sum(flattened == -1).item()
    assert freqs[1] + freqs[-1] == len(data)

    codec = HuffmanCodec.from_frequencies(freqs)
    encoded = codec.encode(data)
    added_bitrate = (len(encoded) + (len(freqs) * (32 * 2 * 8))) / (num_frames * (2 ** 20)) * fps
    return freqs, encoded, added_bitrate
Code example #10
def main():
    logging.basicConfig(level=logging.INFO)

    # XML data sets from https://www.data.gov/
    urls = [
        "https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.xml",
        "https://data.cdc.gov/api/views/bi63-dtpu/rows.xml",
        "https://data.cdc.gov/api/views/cjae-szjv/rows.xml",
        "https://data.cityofnewyork.us/api/views/25th-nujf/rows.xml",
        "https://data.ct.gov/api/views/kbxi-4ia7/rows.xml",
        "https://data.cityofchicago.org/api/views/pfsx-4n4m/rows.xml",
        "https://data.cdc.gov/api/views/6vp6-wxuq/rows.xml",
        "https://www.sba.gov/sites/default/files/data.xml",
        "https://data.cdc.gov/api/views/e6fc-ccez/rows.xml",
        "https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.xml",
        "https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.xml",
        "https://gisdata.nd.gov/Metadata/ISO/xml/metadata_Roads_MileMarkers.xml",
        "https://data.cityofchicago.org/api/views/kn9c-c2s2/rows.xml",
        "https://data.cityofnewyork.us/api/views/5t4n-d72c/rows.xml",
        "https://data.cdc.gov/api/views/6rkc-nb2q/rows.xml",
        "https://gisdata.nd.gov/Metadata/ISO/xml/metadata_Airports.xml",
        "https://data.sfgov.org/api/views/j4sj-j2nf/rows.xml",
        "https://data.kingcounty.gov/api/views/gmen-63jm/rows.xml",
        "https://data.mo.gov/api/views/vpge-tj3s/rows.xml",
    ]

    _log.info('Building frequency tables')
    frequencies = Counter()
    for url in urls:
        path = download(
            url, 'xml-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() +
            '.xml')
        with path.open('r') as f:
            # Only read the first 100,000 characters.
            # Large files probably have a lot of structural repetition, which would skew the frequencies.
            raw = f.read(100000)
        frequencies.update(raw)

    # TODO add more metadata
    _log.info(f'Frequencies raw {len(frequencies)}: {frequencies}')
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "xml.pickle", metadata={"frequencies": frequencies})
Code example #11
def decode(data):
    # Decode binary data that carries its own header: the original payload size, the
    # alphabet size and the per-symbol frequencies needed to rebuild the Huffman codec.
    real_size = int.from_bytes(data[:POS_NUM_OF_CHARS], byteorder='little')
    num_of_chars = int.from_bytes(data[POS_NUM_OF_CHARS:POS_NUM_OF_CHARS + 2],
                                  byteorder='big')

    dummy_list = []

    print("num_of_chars ", num_of_chars)
    for i in range(num_of_chars):
        # Each alphabet entry is 4 bytes: one byte for the symbol followed by a
        # 3-byte big-endian frequency count.
        c_ch = data[POS_ALPHABET + i * 4]
        c_ch_freq = int.from_bytes(data[POS_ALPHABET + 1 + i * 4:POS_ALPHABET + 1 + i * 4 + 3],
                                   byteorder='big')
        dummy_list.append((c_ch, c_ch_freq))

    alphabet_frequencies = dict(dummy_list)
    print("alphabet_frequencies", alphabet_frequencies)
    codec = HuffmanCodec.from_frequencies(alphabet_frequencies)
    codec.print_code_table()
    return codec.decode(data[POS_ALPHABET + num_of_chars * 4:])
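decode() relies on POS_NUM_OF_CHARS and POS_ALPHABET, which are defined elsewhere in the project. A hypothetical matching encoder, assuming a 4-byte little-endian payload size at offset 0, a 2-byte big-endian symbol count at offset 4, and 4-byte alphabet entries (one symbol byte plus a 3-byte big-endian count) starting at offset 6, might look like this sketch:

import collections
from dahuffman import HuffmanCodec

POS_NUM_OF_CHARS = 4   # assumed: payload size occupies bytes 0-3 (little-endian)
POS_ALPHABET = 6       # assumed: symbol count occupies bytes 4-5 (big-endian)

def encode(text: bytes) -> bytes:
    frequencies = collections.Counter(text)
    codec = HuffmanCodec.from_frequencies(dict(frequencies))
    header = len(text).to_bytes(POS_NUM_OF_CHARS, byteorder='little')
    header += len(frequencies).to_bytes(2, byteorder='big')
    for symbol, count in frequencies.items():
        # one byte for the symbol, three big-endian bytes for its count
        header += bytes([symbol]) + count.to_bytes(3, byteorder='big')
    return header + codec.encode(text)
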
def main():
    # Create a TCP/IP socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    # Bind the socket to the port
    server_address = ('localhost', 10000)
    print('starting up on %s port %s' % server_address, file=sys.stderr)
    sock.bind(server_address)

    # Listen for incoming connections
    sock.listen(1)
    connect = True
    failCount = 0
    passCount = 0

    INCEPT = torchvision.models.inception_v3(pretrained=True)
    PREVIOUS_ARRAY = None
    USE_DELTA = True
    LAST_EDGE_LAYER = FLAGS.layer_index
    NUM_BINS = FLAGS.num_bins
    DELTA_VALUE = FLAGS.delta_value
    save_results = FLAGS.save_results
    analyse_fc_results = FLAGS.compare_fc

    print('Number of bins: ', NUM_BINS)
    print('DELTA VALUE: ', DELTA_VALUE)
    print('Last edge index: ', LAST_EDGE_LAYER)
    print('Saving Results: ', save_results)

    if (LAST_EDGE_LAYER == 7):
        RESHAPE_ARRAY_DIMENSIONS = [192, 35, 35]
    elif (LAST_EDGE_LAYER == 11):
        RESHAPE_ARRAY_DIMENSIONS = [768, 17, 17]
    elif (LAST_EDGE_LAYER == 6):
        RESHAPE_ARRAY_DIMENSIONS = [192, 71, 71]
    else:
        RESHAPE_ARRAY_DIMENSIONS = None
        print("Reshape dimensions not defined for layer being partitioned")

    codec_path = 'huffman_encoding_config/' + 'layer' + str(LAST_EDGE_LAYER) + '/' + 'num_bins_' + str(NUM_BINS)
    delta_hist = load_huff_dictionary(codec_path + '/delta_hist')
    delta_codec = HuffmanCodec.from_frequencies(delta_hist)
    frame_one_hist = load_huff_dictionary(codec_path + '/frame_one_hist')
    frame_one_codec = HuffmanCodec.from_frequencies(frame_one_hist)

    sizes = []
    videos = [
        "videos/n01443537/goldfish_1.mp4",
        "videos/n01443537/goldfish_2.mp4",
        "videos/n01443537/goldfish_3.mp4",

        "videos/n01882714/koala_1.mp4",
        "videos/n01882714/koala_2.mp4",
        "videos/n01882714/koala_3.mp4",

        "videos/n02085620/dog_1.mp4",

        "videos/n02099601/golden_retriever_1.mp4",

        "videos/n02099712/golden_retriever_1.mp4",

        "videos/n02110958/pug_1.mp4",
        "videos/n02110958/pug_3.mp4",
        "videos/n02110958/pug_4.mp4",

        "videos/n02206856/bee_1.mp4",

        "videos/n02391049/zebra_1.mp4",
        "videos/n02391049/zebra_2.mp4",
        "videos/n02391049/zebra_3.mp4",

        "videos/n02510455/panda_1.mp4",
        "videos/n02510455/panda_2.mp4",
        "videos/n02510455/panda_3.mp4",
        "videos/n02510455/panda_4.mp4",
        "videos/n02510455/panda_5.mp4",

        "videos/n02676566/guitar_1.mp4",
        "videos/n02676566/guitar_2.mp4",
        "videos/n02676566/guitar_3.mp4",
        "videos/n02676566/guitar_4.mp4",
        "videos/n02676566/guitar_6.mp4",

        "videos/n02787622/banjo_1.mp4",
        "videos/n02787622/banjo_2.mp4",
        "videos/n02787622/banjo_3.mp4",
        "videos/n02787622/banjo_5.mp4",

        "videos/n03452741/piano_1.mp4",
        "videos/n03452741/piano_2.mp4",

        "videos/n03495258/harp_1.mp4",
        "videos/n03495258/harp_2.mp4",
        "videos/n03495258/harp_3.mp4",

        "videos/n03584254/ipod_1.mp4",
        "videos/n03584254/ipod_2.mp4",

        "videos/n03967562/plough_1.mp4",

        "videos/n04536866/violin_3.mp4",
        "videos/n04536866/violin_4.mp4",

        "videos/n06596364/comic_1.mp4",

        "videos/n01910747/jelly_fish_1.mp4",
        "videos/n01910747/jelly_fish_2.mp4",

        "videos/n02134084/polar_bear_1.mp4",
        "videos/n02134084/polar_bear_3.mp4",

        "videos/n02342885/hamster_1.mp4",
        "videos/n02342885/hamster_2.mp4",
        "videos/n02342885/hamster_4.mp4",
        "videos/n02342885/hamster_5.mp4",

        "videos/n02364673/guinea_pig_1.mp4",
        "videos/n02364673/guinea_pig_2.mp4"
    ]

    # for analysing fc output
    if analyse_fc_results is True:
        test_videos = [
            "videos/n01882714/koala_1.mp4",
            "videos/n02510455/panda_1.mp4",
            "videos/n02676566/guitar_2.mp4",
            "videos/n02133161/bear_1.mp4",
            "videos/n02110958/pug_3.mp4"
        ]
        videos = test_videos

    vid_num = 0
    frame_number = 0

    cats = json.load(open('config/categories.json'))
    class_id = videos[vid_num].split('/')[1]
    for j in range(len(cats)):
        if cats[j]['id'] == class_id:
            index = cats[j]['index']

    while True:
        # Wait for a connection
        received = ''
        arr = bytearray()
        byte_size = 0

        print('waiting for a connection', file=sys.stderr)
        connection, client_address = sock.accept()

        try:
            print('connection from', client_address, file=sys.stderr)

            # Receive the data in small chunks and retransmit it
            while True:
                data = connection.recv(1024)
                byte_size = byte_size + len(data)

                arr.extend(data)
                # print(sys.stderr, 'received "%s"' % data)
                if data:
                    # print(sys.stderr, 'sending data back to the client')
                    connection.sendall(data)
                else:
                    print('no more data from', client_address, file=sys.stderr)
                    connect = False
                    break

        finally:
            # Clean up the connection
            connection.close()
            print('Size of data received: ', byte_size)

            # code for receiving reset to frame one
            if byte_size == 1:
                print('Received reset')
                avg_byte_size = sum(sizes) / len(sizes)
                passRate = (passCount / (failCount + passCount)) * 100
                print('percentage of passed: ', passRate)
                result = 'file: ' + videos[vid_num] + ', %Passed: ' + str(passRate) + ', avg_byte_size: ' \
                         + str(avg_byte_size) + ', layer: ' + str(LAST_EDGE_LAYER) + ', num_bins_used: ' + str(
                    NUM_BINS) + \
                         ', Delta Value: ' + str(DELTA_VALUE) + '\n'
                results_path = "Results" + '/layer' + str(LAST_EDGE_LAYER) + '/num_bins_' + str(
                    NUM_BINS) + '/delta_value' \
                               + str(DELTA_VALUE)

                if save_results is True:
                    if not os.path.isdir(results_path):
                        try:
                            os.makedirs(results_path)
                        except OSError as e:
                            if e.errno != errno.EEXIST:
                                raise
                    with open(results_path + "/results.txt", "a") as myfile:
                        myfile.write(result)

                # Resetting variables
                PREVIOUS_ARRAY = None
                sizes = []
                passCount = 0
                failCount = 0
                vid_num += 1
                frame_number = 0

                cats = json.load(open('config/categories.json'))
                class_id = videos[vid_num].split('/')[1]
                for j in range(len(cats)):
                    if cats[j]['id'] == class_id:
                        index = cats[j]['index']

            elif PREVIOUS_ARRAY is not None and USE_DELTA is True:
                decoded = delta_codec.decode(arr)
                arr = np.reshape(decoded, RESHAPE_ARRAY_DIMENSIONS)

                decoded_arr = decode(arr, NUM_BINS, max_num=8, min_num=-8)
                delta_decoded_arr = decode_delta(PREVIOUS_ARRAY, decoded_arr)
                PREVIOUS_ARRAY = delta_decoded_arr
                fc_out = server_run(torch.Tensor(delta_decoded_arr), LAST_EDGE_LAYER, INCEPT)
                result = classify_server_run(fc_out, class_label=index)
            else:
                decoded = frame_one_codec.decode(arr)
                arr = np.reshape(decoded, RESHAPE_ARRAY_DIMENSIONS)

                decoded_arr = decode(arr, NUM_BINS, max_num=8, min_num=-8)
                PREVIOUS_ARRAY = decoded_arr
                fc_out = server_run(torch.Tensor(decoded_arr), LAST_EDGE_LAYER, INCEPT)
                result = classify_server_run(fc_out, class_label=index)

            # '0' for false, '1' for true; kept as strings so they can be written to file and totalled easily
            top_five_is_the_same = '0'
            top_one_is_the_same = '0'
            if analyse_fc_results is True:
                video_file = videos[vid_num].split('/')[2]
                video = video_file.split('.')[0]
                saved_fc_dir = 'Results/fc_results/' + class_id
                path_to_saved_fc = saved_fc_dir + '/' + video + '_' + str(frame_number) + '.npy'
                print('looking for: ', path_to_saved_fc)
                if os.path.isdir(saved_fc_dir):
                    print('path exists')
                    unencoded_fc = np.load(path_to_saved_fc)
                    unencoded_top_five = unencoded_fc.argsort()[0][-1:-6:-1]  # Get top 5 classifications.
                    encoded_top_five = fc_out.data.numpy().argsort()[0][-1:-6:-1]
                    unencoded_top = unencoded_fc.argsort()[0][-1]
                    encoded_top = fc_out.data.numpy().argsort()[0][-1]
                    if (np.array_equal(unencoded_top_five, encoded_top_five)):
                        top_five_is_the_same = '1'
                    if (np.array_equal(unencoded_top, encoded_top)):
                        top_one_is_the_same = '1'

                    path_to_results = 'Results/fc_results/comparison_results/layer_' + str(LAST_EDGE_LAYER) + \
                                      '/num_bins_' + str(NUM_BINS) + \
                                      '/delta_value_' + str(DELTA_VALUE)
                    fc_analysis_result = videos[vid_num] + ' ,same top five predictions ,' + top_five_is_the_same + \
                                         ' ,same top one prediction ,' + top_one_is_the_same  + '\n'
                    if not (os.path.isdir(path_to_results)):
                        os.makedirs(path_to_results)
                    with open(path_to_results + '/fc_results.txt', 'a') as myfile:
                        myfile.write(fc_analysis_result)

            frame_number += 1
            if result:
                passCount += 1
            else:
                failCount += 1
            sizes.append(byte_size)
            print('Total checked: ', passCount + failCount)
            print('Number of correct classifications: ', passCount)
            print('Number Failed: ', failCount)
Code example #13
File: huffman.py  Project: devloop0/cs348k-project
def decode(encoded, freqs, shape):
    codec = HuffmanCodec.from_frequencies(freqs)
    decoded = torch.Tensor(codec.decode(encoded))
    return torch.reshape(decoded, shape)
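Tying this together with the sign-only encode() from example #9, a hedged round-trip sketch (the tensor shape and values are purely illustrative):

import torch

data = torch.randn(2, 3, 8, 8)                  # hypothetical (frames, C, H, W) activations
freqs, encoded, added_bitrate = encode(data)    # keeps only the sign of every value
restored = decode(encoded, freqs, data.shape)   # tensor of +1/-1 with the original shape
assert restored.shape == data.shape and bool(restored.abs().eq(1).all())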
Code example #14
 def __init__(self, dataset):
     c = collections.Counter()
     for row in dataset: 
         c.update(row)
     self.collection = c
     self.codec = HuffmanCodec.from_frequencies(self.collection)
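Only the constructor of this wrapper is shown; a hypothetical usage, assuming the class is named DatasetCodec and the dataset rows are strings:

rows = ["hello world", "hello there"]   # hypothetical dataset
wrapper = DatasetCodec(rows)            # class name assumed, see note above
packed = wrapper.codec.encode(rows[0])
assert wrapper.codec.decode(packed) == rows[0]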
Code example #15
def huffman_decode_block(encoded_block, block_frequencies):

    huffman_codec = HuffmanCodec.from_frequencies(block_frequencies)
    decoded_block = huffman_codec.decode(encoded_block)
    return decoded_block
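A minimal round trip pairing this with huffman_encode_block from example #5 above (the coefficient values below are made up for illustration):

block = [16, 11, 0, 0, -3, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]   # hypothetical zig-zagged coefficients
encoded, freqs = huffman_encode_block(block)
assert huffman_decode_block(encoded, freqs) == block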
Code example #16
File: huffman.py  Project: argoneuscze/AudioNMF
 def __init__(self, method):
     try:
         self.freqs = frequencies[method]
         self.codec = HuffmanCodec.from_frequencies(self.freqs)
     except KeyError:
         raise KeyError('Invalid Huffman dictionary.')
Code example #17
            # Fragment from a loop over URLs: `parts` is assumed to be a urllib.parse.urlparse()
            # result, while `other` and `suffix` collect characters and TLD suffixes respectively.
            ext = tldextract.extract(url)

            other.extend(ext.domain + parts.path + parts.params + '?' +
                         parts.query + '#' + parts.fragment)
            suffix.append(ext.suffix)

counter = Counter(other)
freq = defaultdict(int)
freq.update(counter)

# Make sure that all allowed url characters have an entry
allowed_characters = "!#$&'()*+,/:;=?@[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~"
for char in allowed_characters:
    freq[char] = freq[char]  # defaultdict lookup inserts a zero count for unseen characters

other_codec = HuffmanCodec.from_frequencies(freq)
other_codec.save('other.codec')

counter = Counter(suffix)
freq = defaultdict(int)
freq.update(counter)

# Make sure that all tlds have an entry
# TODO
all_tlds = []
for tld in all_tlds:
    freq[tld] = freq[tld]  # same defaultdict trick: give every TLD an entry, even with zero count

suffix_codec = HuffmanCodec.from_frequencies(freq)
suffix_codec.save('suffix.codec')
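A hedged usage sketch of the other_codec trained above (the sample path and query string are illustrative; every character in it is covered by allowed_characters, so it gets a code even with a zero training count):

sample = "/search?q=huffman&page=2"   # hypothetical path + query string
packed = other_codec.encode(sample)
assert other_codec.decode(packed) == sample
print(len(sample), "characters ->", len(packed), "bytes")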