Example #1
 def compressFunc(self):
     path = self.name.get()
     global h 
     h = HuffmanCoding(path)
     global output_path 
     output_path = h.compress()
     print("Compressed file path: " + output_path)
Example #2
    def decode(self):
        output = np.zeros(self.img.shape)

        cnum = self.img.shape[1] + self.cpad
        horizontal_block_count = cnum // self.block_size

        h_tmp = HuffmanCoding([])
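        # reuse a single HuffmanCoding instance; each block's reverse mapping is swapped in before decompressing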

        for channel in range(self.img.shape[2]):
            for index, (encoded,
                        rev_map) in enumerate(self.encoded_blocks[channel]):

                i = index // horizontal_block_count
                j = index % horizontal_block_count

                r_min = i * self.block_size
                r_max = min((i + 1) * self.block_size, self.img.shape[0])
                row_diff = r_max - r_min

                c_min = j * self.block_size
                c_max = min((j + 1) * self.block_size, self.img.shape[1])
                col_diff = c_max - c_min

                h_tmp.reverse_mapping = rev_map
                zigzag = h_tmp.decompress(encoded)
                coeffs = fillDiagonal(zigzag, self.block_size)
                is_Y = (channel == 0)
                block = inv_dct(coeffs, mult=self.mult, is_Y=is_Y)

                output[r_min:r_max, c_min:c_max,
                       channel] = block[:row_diff, :col_diff]

        return output
Example #3
def main():

    """ In this function we read the folder healthy_cow where are store the csv files that contains the matrix of pixels of the images and we selected an specific file.
    At first, we applied a compression with FFT and it returns a csv file, which will be compress again using the huffman method and will return a binary file.
    Followed by this, we decompress the binary file with the huffman method and it returns a csv file that will be decompress with FFT.
    Finally, using the library PIL, we show the original imagen and the decompress image.

    :raises: file not found

    :rtype: Image in PNG format
    """
    directory_sick_cow = "/Users/isabella/Documents/Segundo Semestre/Estructura de Datos /Proyecto/ganado_enfermo_csv"
    directory_healthy_cow = "/Users/isabella/Documents/Segundo Semestre/Estructura de Datos /Proyecto/Entrega 3/Codigo/huffman-coding-master/Vacas_Enferma"
     
    directory = directory_healthy_cow
    cont = os.listdir(directory)

    matriz_csv_var = load_img(directory+'/'+cont[0])

    fft_compression(matriz_csv_var, 0.05)
    h = HuffmanCoding("compressFFT.csv")
    output_path = h.compress()
    print("Compressed file path: " + output_path)


    decom_path = h.decompress(output_path)
    print("Decompressed file path: " + decom_path)
    img_fft_descompress = fft_descompression(decom_path)


    show_img(matriz_csv_var)
    show_img(img_fft_descompress)


    savetxt('dataff.csv', matriz_csv_var, delimiter=',')
Example #4
def main():
    from huffman import HuffmanCoding
    import sys

    inputFilePath = "sample.txt"
    handle = HuffmanCoding(inputFilePath)
    output_path = handle.compress()
    print("Compressed file path: " + output_path)
    decom_path = handle.decompress(output_path)
    print("Decompressed file path: " + decom_path)
Example #5
 def test_decoded_msg_given_the_original_msg(self):
     txt = "The bird is the word"
     huffman_coding = HuffmanCoding(txt)
     print("The size of the data is: {}\n".format(sys.getsizeof(txt)))
     print("The content of the data is: {}\n".format(txt))
     encoded_data, tree = huffman_coding.huffman_encoding()
     print("The size of the encoded data is: {}\n".format(
         sys.getsizeof(int(encoded_data, base=2))))
     print("The content of the encoded data is: {}\n".format(encoded_data))
     self.assertEqual(txt,
                      huffman_coding.huffman_decoding(encoded_data, tree))
Example #6
 def choice(self):
     filename = askopenfilename()
     lable = Label(self, text="", relief=RAISED)
     lable.configure(text=filename)
     lable.grid(column=0, row=2)
     lable1 = Label(self, text="")
     lable1.configure(text="Size " + str(os.path.getsize(filename)) +
                      " bytes")
     lable1.grid(column=0, row=3)
     self.path = filename
     self.h = HuffmanCoding(filename)
Example #7
def algorithm(path):
    h = HuffmanCoding(path)
    first1 = time.time()
    output_path = h.compress()
    second1 = time.time()
    delta_time1 = second1 - first1
    first2 = time.time()
    decom_path = h.decompress(output_path)
    second2 = time.time()
    delta_time2 = second2 - first2
    return output_path, delta_time1, delta_time2, decom_path
Example #8
 def test_when_msg_is_single_char(self):
     txt = "T"
     huffman_coding = HuffmanCoding(txt)
     print("The size of the data is: {}\n".format(sys.getsizeof(txt)))
     print("The content of the data is: {}\n".format(txt))
     encoded_data, tree = huffman_coding.huffman_encoding()
     print(encoded_data)
     print("The size of the encoded data is: {}\n".format(
         sys.getsizeof(int(encoded_data, base=2))))
     print("The content of the encoded data is: {}\n".format(encoded_data))
     decoded_data = huffman_coding.huffman_decoding(encoded_data, tree)
     print(decoded_data)
     self.assertEqual(txt, decoded_data)
Example #9
def main(argv):
    filepath = argv[1]
    read_bit_size = 8

    if len(argv) > 2:
        read_bit_size = int(argv[2])
        print(read_bit_size)

    h = HuffmanCoding(filepath, read_bit_size)

    output_path = h.compress()
    print("Compressed file path: " + output_path)

    decom_path = h.decompress(output_path)
    print("Decompressed file path: " + decom_path)
Example #10
def mask_compression(mask):
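    # run-length encode the binary mask, then estimate its size after Huffman coding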
    prev = 1
    rl = 0
    result = []
    for e in mask:
        if e == prev:
            rl += 1
        else:
            result += [rl]
            rl = 1
        prev = e
    if rl > 0:
        result += [rl]
    huffman = HuffmanCoding()
    size = len(huffman.compress(result)) * 4
    return size
Example #11
def decode_huffman(model, enc, text, context, bits_per_word, device='cpu'):
    # inp is a list of token indices
    # context is a list of token indices
    inp = enc.encode(text)
    i = 0
    while i < len(inp):
        if inp[i] == 628:
            inp[i] = 198
            inp[i + 1:i + 1] = [198]
            i += 2
        else:
            i += 1

    context = torch.tensor(context[-1022:], device=device, dtype=torch.long)
    prev = context
    past = None

    message = []
    with torch.no_grad():
        i = 0
        while i < len(inp):
            if past and past[0].shape[3] >= 1023:
                raise RuntimeError

            logits, past = model(prev.unsqueeze(0), past=past)
            past = limit_past(past)
            logits[0, -1, -1] = -1e10  # endoftext can't happen
            logits[0, -1, 628] = -1e10  # 2 newlines can't happen
            logits, indices = logits[0, -1, :].sort(descending=True)

            # Get the top 2**bits options
            indices = indices[:2**bits_per_word]
            log_probs = F.log_softmax(logits, dim=-1)[:2**bits_per_word]
            probs = torch.exp(log_probs)

            if inp[i] not in indices:
                true_token_text = enc.decoder[inp[i]]
                for rank_idx in range(2**bits_per_word):
                    prop_token_text = enc.decoder[indices[rank_idx].item()]
                    # common case that is not caught
                    if inp[i] == 128 and indices[rank_idx] == 198:
                        rank = rank_idx
                        inp[i] = indices[rank_idx].item()
                        break

                    # Is there a more likely prefix token that could be the actual token generated?
                    if len(prop_token_text) <= len(true_token_text) and \
                            prop_token_text == true_token_text[:len(prop_token_text)]:
                        rank = rank_idx
                        suffix = true_token_text[len(prop_token_text):]
                        suffix_tokens = enc.encode(suffix)  # a list
                        inp[i] = indices[rank_idx].item()
                        inp[i + 1:i +
                            1] = suffix_tokens  # insert suffix tokens into list
                        break

                    # Is there a more likely longer token that could be the actual token generated?
                    elif len(prop_token_text) > len(true_token_text) and \
                              true_token_text == prop_token_text[:len(true_token_text)]:
                        whole_text = true_token_text
                        num_extra = 1
                        while len(whole_text) < len(prop_token_text):
                            whole_text += enc.decoder[inp[i + num_extra]]
                            num_extra += 1
                        if prop_token_text == whole_text[:len(prop_token_text
                                                              )]:
                            rank = rank_idx
                            inp[i] = indices[rank_idx].item()
                            for j in range(1, num_extra):
                                del inp[i + j]

                            if len(whole_text) > len(prop_token_text):
                                suffix = whole_text[len(prop_token_text):]
                                suffix_tokens = enc.encode(suffix)  # a list
                                inp[i + 1:i +
                                    1] = suffix_tokens  # insert suffix tokens into list
                            break
                else:
                    print(
                        'Unable to fix BPE error: token received: %s=%d, text: %s'
                        % (true_token_text, inp[i], text))
                    rank = 0
            else:
                rank = (indices == inp[i]).nonzero().item()

            probs_array = probs.cpu().numpy()
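            # rebuild the Huffman code over the top-2**bits_per_word probabilities and emit the code bits assigned to the observed token's rank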
            coding = HuffmanCoding()
            coding.make_heap_from_array(probs_array)
            coding.merge_nodes()
            coding.make_codes()

            tokens_t = map(int, coding.codes[rank])

            message.extend(tokens_t)
            prev = torch.tensor([inp[i]], device=device, dtype=torch.long)
            i += 1

    return message
Example #12
def main():
    parser = argparse.ArgumentParser(description='Word2vec')
    parser.add_argument('mode',
                        metavar='mode',
                        type=str,
                        help='"SG" for skipgram, "CBOW" for CBOW')
    parser.add_argument(
        'part',
        metavar='partition',
        type=str,
        help=
        '"part" if you want to train on a part of corpus, "full" if you want to train on full corpus'
    )
    parser.add_argument(
        'mode2',
        metavar='mode2',
        type=str,
        help=
        "0 for Hierarchical Softmax, 1 or more for Negative Sampling, 'None' for None of two"
    )
    parser.add_argument(
        'use_subsampling',
        metavar='subsample',
        type=str,
        help="0 for not using subsampling, 1 for using subsampling")
    args = parser.parse_args()
    mode = args.mode
    part = args.part
    mode2 = args.mode2
    subsample = args.use_subsampling

    #Load and tokenize corpus
    print("loading...")
    if part == "part":
        text = open('text8', mode='r').readlines(
        )[0][:1000000]  #Load a part of corpus for debugging
    elif part == "full":
        text = open('text8',
                    mode='r').readlines()[0]  #Load full corpus for submission
    else:
        print("Unknown argument : " + part)
        exit()

    print("tokenizing...")
    corpus = text.split()
    frequency = Counter(corpus)
    processed = []

    #Discard rare words
    for word in corpus:
        if frequency[word] > 4:
            processed.append(word)

    vocabulary = set(processed)

    #Assign an index number to a word
    word2ind = {}
    word2ind[" "] = 0
    i = 1
    for word in vocabulary:
        word2ind[word] = i
        i += 1
    ind2word = {}
    for k, v in word2ind.items():
        ind2word[v] = k

    print("Vocabulary size")
    print(len(word2ind))

    # Create Huffman Coding
    freq = dict()
    freq[0] = 0

    total_freq = 0

    for word in vocabulary:
        freq[word2ind[word]] = frequency[word]
        total_freq += frequency[word]

    # subsampling
    if subsample == "1":

        freq_subsampling = {}
        for word in vocabulary:
            freq_subsampling[word] = frequency[word] / total_freq

        # calculate subsampling_probability
        prob_subsampling = {}

        for word in vocabulary:
            prob_subsampling[word] = max(
                0, 1 - math.sqrt(0.001 / freq_subsampling[word]))

        # print(prob_subsampling)
        # exit()

        subsampled_corpus = []
        discard = 0

        for word in processed:
            prob = prob_subsampling[word]
            random_prob = np.random.rand()
            if random_prob > prob:
                subsampled_corpus.append(word)
            else:
                discard += 1

        print(len(processed))
        print("Discard : " + str(discard))

        processed = subsampled_corpus

    huffmanCode = HuffmanCoding()
    codes, nonleaf_ind = huffmanCode.build(freq)

    # negative sampling
    freqtable = [0, 0, 0]
    for k, v in frequency.items():
        f = int(v**0.75)
        for _ in range(f):
            if k in word2ind.keys():
                freqtable.append(word2ind[k])

    #Training section
    emb, _ = word2vec_trainer(processed,
                              word2ind,
                              codes=codes,
                              freqtable=freqtable,
                              nonleaf_ind=nonleaf_ind,
                              mode=mode,
                              mode2=mode2,
                              use_subsample=subsample,
                              dimension=64,
                              learning_rate=0.05,
                              iteration=50000)

    Analogical_Reasoning_Task(emb, word2ind, ind2word)
Example #13
def encode_huffman(model,
                   enc,
                   message,
                   context,
                   bits_per_word,
                   finish_sent=False,
                   device='cpu'):
    length = len(message)

    context = torch.tensor(context[-1022:], device=device, dtype=torch.long)

    prev = context
    output = context
    past = None

    total_num = 0
    total_num_for_stats = 0
    total_log_probs = 0
    total_kl = 0  # in bits
    total_num_sents = 0

    with torch.no_grad():
        i = 0
        sent_finish = False
        while i < length or (finish_sent and not sent_finish):
            logits, past = model(prev.unsqueeze(0), past=past)
            past = limit_past(past)
            logits[0, -1, -1] = -1e10  # endoftext can't happen
            logits[0, -1, 628] = -1e10  # 2 newlines can't happen
            logits, indices = logits[0, -1, :].sort(descending=True)

            # Get the top 2**bits options
            indices = indices[:2**bits_per_word]
            log_probs = F.log_softmax(logits, dim=-1)[:2**bits_per_word]
            probs = torch.exp(log_probs)

            if i >= length:
                selection = 0
                sent_finish = is_sent_finish(indices[0].item(), enc)
            else:
                probs_array = probs.cpu().numpy()
                coding = HuffmanCoding()
                coding.make_heap_from_array(probs_array)
                coding.merge_nodes()
                root = coding.make_codes()

                #print(message[i:i+10])
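                # walk the Huffman tree bit by bit: a 0 in the message goes left, a 1 goes right, until a leaf token is reached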
                while root.token is None:
                    if i >= length or message[i] == 0:
                        root = root.left
                    else:
                        root = root.right
                    i += 1
                selection = root.token

                logq = torch.tensor([
                    -len(coding.codes[idx]) for idx in range(len(probs_array))
                ],
                                    dtype=torch.float,
                                    device=device)  # in bits
                logq = logq * 0.69315  # in nats
                q = torch.exp(logq)
                total_kl += kl(q, logq, log_probs)
                total_log_probs += log_probs[selection].item()
                total_num_for_stats += 1

            total_num += 1

            prev = indices[selection].view(1)
            output = torch.cat((output, prev))

    avg_NLL = -total_log_probs / total_num_for_stats
    avg_KL = total_kl / total_num_for_stats
    words_per_bit = total_num_for_stats / i

    return output[len(context):].tolist(), avg_NLL, avg_KL, words_per_bit
Example #14
def main():
    parser = argparse.ArgumentParser(description='Word2vec')
    parser.add_argument('mode', metavar='mode', type=str,
                        help='"SG" for skipgram, "CBOW" for CBOW')
    parser.add_argument('ns', metavar='negative_samples', type=int,
                        help='0 for hierarchical softmax, the other numbers would be the number of negative samples')
    parser.add_argument('part', metavar='partition', type=str,
                        help='"part" if you want to train on a part of corpus, "full" if you want to train on full corpus')
    args = parser.parse_args()
    mode = args.mode
    part = args.part
    ns = args.ns

    #Load and preprocess corpus
    print("loading...")
    if part=="part":
        text = open('text8.txt',mode='r').readlines()[0][:1000000] #Load a part of corpus for debugging
    elif part=="full":
        text = open('text8.txt',mode='r').readlines()[0] #Load full corpus for submission
    else:
        print("Unknown argument : " + part)
        exit()

    print("preprocessing...")
    corpus = text.split()
    stats = Counter(corpus)
    words = []

    #Discard rare words
    for word in corpus:
        if stats[word]>4:
            words.append(word)
    vocab = set(words)

    #Give an index number to a word
    w2i = {}
    w2i[" "]=0
    i = 1
    for word in vocab:
        w2i[word] = i
        i+=1
    i2w = {}
    for k,v in w2i.items():
        i2w[v]=k


    #Code dict for hierarchical softmax
    freqdict={}
    freqdict[0]=10
    for word in vocab:
        freqdict[w2i[word]]=stats[word]
    codedict, full_tree= HuffmanCoding().build(freqdict)

    subsampled_dic=subsampling_table(freqdict)

    #Frequency table for negative sampling
    freqtable = [0,0,0]
    for k,v in stats.items():
        f = int(v**0.75)
        for _ in range(f):
            if k in w2i.keys():
                freqtable.append(w2i[k])

    #Make training set
    print("build training set...")
    train_set = []
    input_set=[]
    target_set=[]
    window_size = 5
    if mode=="CBOW":
        for j in range(len(words)):
            #sampling_index=random.choice(subsampled_dic[w2i[words[j]]])
            sampling_index=1
            if sampling_index == 1:
                if j<window_size:
                    input_set.append([0 for _ in range(window_size-j)] + [w2i[words[k]] for k in range(j)] + [w2i[words[j+k+1]] for k in range(window_size)])
                    target_set.append(w2i[words[j]])
                elif j>=len(words)-window_size:
                    input_set.append([w2i[words[j-k-1]] for k in range(window_size)] + [w2i[words[len(words)-k-1]] for k in range(len(words)-j-1)] + [0 for _ in range(j+window_size-len(words)+1)])
                    target_set.append(w2i[words[j]])
                else:
                    input_set.append([w2i[words[j-k-1]] for k in range(window_size)] + [w2i[words[j+k+1]] for k in range(window_size)])
                    target_set.append(w2i[words[j]])
    if mode=="SG":
        for j in range(len(words)):
            #sampling_index=random.choice(subsampled_dic[w2i[words[j]]])
            sampling_index = 1
            if sampling_index == 1:
                if j<window_size:
                    input_set += [w2i[words[j]] for _ in range(window_size*2)]
                    target_set += [0 for _ in range(window_size-j)] + [w2i[words[k]] for k in range(j)] + [w2i[words[j+k+1]] for k in range(window_size)]
                elif j>=len(words)-window_size:
                    input_set += [w2i[words[j]] for _ in range(window_size*2)]
                    target_set += [w2i[words[j-k-1]] for k in range(window_size)] + [w2i[words[len(words)-k-1]] for k in range(len(words)-j-1)] + [0 for _ in range(j+window_size-len(words)+1)]
                else:
                    input_set += [w2i[words[j]] for _ in range(window_size*2)]
                    target_set += [w2i[words[j-k-1]] for k in range(window_size)] + [w2i[words[j+k+1]] for k in range(window_size)]

    print("Vocabulary size")
    print(len(w2i))
    print()
    print(input_set[:100])
    #Training section
    emb,_ = word2vec_trainer(input_set, target_set, len(w2i), codedict, full_tree ,freqtable, mode=mode, NS=ns, dimension=64, epoch=1, learning_rate=0.01)

    Analogical_Reasoning_Task(emb, w2i, i2w)

    def save(file_name):
        f = open(file_name, 'w')
        for word in list(w2i.keys()):
            word_index = w2i[word]
            vector_str = ' '.join([str(s.item()) for s in emb[word_index]])
            f.write('%s %s\n' % (word, vector_str))

        f.close()
        print("저장 완료!!!")


    if mode=='SG':
        name='skip-gram'
    else:
        name='CBOW'

    if ns==0:
        name+='_hierarchical-softmax'
    else:
        name+='_negative-sampling'

    if part=='part':
        name+='_part'
    else:
        name+="_full"

    save(name+'_subsampling')
Example #15
def decompress(fromdir):
    h = HuffmanCoding(fromdir)
    return h.decompress(fromdir)
Example #16
def upload_file():
    f = request.files['file']
    tag = request.form['tag']
    data = bytes(f.read())

    input_file_size = len(data)
    filename, file_extension = os.path.splitext(f.filename)

    if (tag == "huffman"):
        h = HuffmanCoding(data)
        huffman_file_size = h.compress(f.filename)
        return jsonify({
            'success': True,
            'fileSize': input_file_size,
            'HuffmanEncoding': {
                'compressionRatio': huffman_file_size / input_file_size,
                'compressionFactor': input_file_size / huffman_file_size,
                'savingPercentage':
                (input_file_size - huffman_file_size) / input_file_size,
                'fileSize': huffman_file_size
            },
        })
    if (tag == "shannon"):
        ShannonCompress(data, f.filename)
        shf_file_size = os.path.getsize(filename + ".shf")
        os.remove(filename + ".shf")
        return jsonify({
            'success': True,
            'fileSize': input_file_size,
            'ShannonFano': {
                'compressionRatio': shf_file_size / input_file_size,
                'compressionFactor': input_file_size / shf_file_size,
                'savingPercentage':
                (input_file_size - shf_file_size) / input_file_size,
                'fileSize': shf_file_size
            },
        })
    if (tag == "lempel"):
        LempelZivWelch(data, f.filename, 8)
        lzw_file_size = os.path.getsize(filename + ".lzw")
        os.remove(filename + ".lzw")
        return jsonify({
            'success': True,
            'fileSize': input_file_size,
            'LempelZivWelch': {
                'compressionRatio': lzw_file_size / input_file_size,
                'compressionFactor': input_file_size / lzw_file_size,
                'savingPercentage':
                (input_file_size - lzw_file_size) / input_file_size,
                'fileSize': lzw_file_size
            }
        })
    if (tag == "rle"):
        RunLengthEncoding(data, f.filename)
        rle_file_size = os.path.getsize(filename + ".rle")
        os.remove(filename + ".rle")
        return jsonify({
            'success': True,
            'fileSize': input_file_size,
            'RunLengthEncoding': {
                'compressionRatio': rle_file_size / input_file_size,
                'compressionFactor': input_file_size / rle_file_size,
                'savingPercentage':
                (input_file_size - rle_file_size) / input_file_size,
                'fileSize': rle_file_size
            },
        })

    return jsonify({'success': False, "message": "Please pass a valid tag"})
Example #17
from huffman import HuffmanCoding
from evaluator import *

#imagePath = "Color/Image01.bmp"
#imagePath = "Color/Image02.bmp"
#imagePath = "Gray/Image03.bmp"
imagePath = "Gray/Image04.bmp"
h = HuffmanCoding(imagePath)

output_path = h.compress()
print("Compressed file path: " + output_path)
decom_path = h.decompress(output_path)
print("Decompressed file path: " + decom_path)

print "Compression Rate: " + str(
    round(CompressionRate(imagePath, output_path), 4))
print "SNR: " + str(round(SNR(imagePath, decom_path), 4))
# # cv2.resizeWindow("deltaback", 1000, 1000)
# # cv2.imshow("deltaback", img)

# print "Redecoded entropy: "
# print shannon_entropy(img)

filenames = glob.glob("images/*.png")

images = [cv2.imread(img) for img in filenames]

sum_ratio = 0

for img in images:
    img = img[:, :, 0]
    img = delta_encode(img)
    h = HuffmanCoding(img, os.getcwd() + "/test")
    h.compress()
    img = h.decompress(os.getcwd() + "/test.bin")
    img = delta_decode(img)

    rawsize = os.stat('raw.bin')
    testsize = os.stat('test.bin')

    ratio = float(float(testsize.st_size) / float(rawsize.st_size))
    sum_ratio += ratio

    print "Redecoded entropy: "
    print shannon_entropy(img)

    print "Compression ratio: "
    print ratio
Example #19
def use_huffman(filename, wordlength=14):
    h = HuffmanCoding(filename, wordlength)
    output_path = h.compress()
    #h.decompress(output_path)
    return output_path
Example #20
from huffman import HuffmanCoding
import sys

if __name__ == "__main__":
    path = sys.argv[1]  #arg[1] = file name
    h = HuffmanCoding(path)  # path ,code, heap and reverse mapping creating
    print("Compressing...")
    output_path = h.compress()
    print(f"Compressed file:  {output_path}\n")
    print("Decompressing...")
    #print('otput_com',output_path)
    output_path = h.decompress(output_path)  #pass as argument output compressed file
    print(f"Decompressed file:  {output_path}")
Example #21
import base64
import json
from os import path as path_  # assumption: path_ refers to os.path
from huffman import HuffmanCoding


#Taking a json
path = input("Give json file...\n")
while not path_.exists(path):
    print("No such file exists")
    path = input("Give json file...\n")
deb_input = input("Run on Debug ? \n")
debug = deb_input == '1'


with open(path) as json_file:
    data = json.load(json_file)

h = HuffmanCoding(path,debug)
h.set_reverse_mapping(data["reverse_mapping"])  # the statistics from the Huffman compression

received_message = data["Messagebase64"]
received_message = bytes(received_message[2:-1],'utf-8') #Make it back to base64 bytes

income_message = str(base64.b64decode(received_message))
income_message = income_message[2:-1] #We take only what we need


dec_out = ''
i = 0
k = 7
while k <= len(income_message):
    input = income_message[i:k] # Do hamilton for input = income_message[i:k]
    lista = [] #make a list so i can make an numpy array
from datetime import datetime
from time import sleep
import os
import pickle
import socket

clientSocket = socket.socket()
host = socket.gethostname()
port = 9001

clientSocket.connect((host, port))

filePath = input("Enter the path of the file: ")
fileName = os.path.basename(filePath)

start = datetime.now()

clientSocket.send(bytes(fileName, "utf-8"))

huffman = HuffmanCoding(filePath)
compressedFilePath = huffman.compress()
sleep(1)

clientSocket.send(pickle.dumps(huffman))
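# the pickled HuffmanCoding object is sent first, presumably so the receiver can rebuild the codes and decompress the file that follows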

with open(compressedFilePath, "rb") as fp:
    data = fp.read(1024)
    while data:
        clientSocket.send(data)
        data = fp.read(1024)

end = datetime.now()

duration = end - start
compressionRatio = os.path.getsize(filePath) / os.path.getsize(
Example #23
    def forward(self, x, ss_map=None):
        # sample from input
        if self.use_subsampling:
            x, thresh = x
            self.sizes[0] += x.view(-1).size(0) * 8
            # feature
            feat_1 = self.ctx(x)
            feat_1_ = self.unpool(feat_1)
        else:
            self.sizes[0] += x.view(-1).size(0) * 8
        x = self.sample(x)
        # after CNN
        self.sizes[1] += x.view(-1).size(0) * 8

        if ss_map is not None:
            ss_map = self.unpool(ss_map) > 0.5
            unpooled = self.unpool(self.pool(x))
            x = torch.where(ss_map, unpooled, x)

        # subsampling
        # data to be sent: mask + actual data
        B, C, H, W = x.size()
        if self.use_subsampling:
            th_1 = thresh
            # sub-sample
            ss_1 = self.unpool(self.pool1(x))
            # conditions
            cond_1 = feat_1_ < th_1
            mask_1 = feat_1 < th_1
            # subsampled data in different areas
            data_1 = self.pool1(x)[mask_1]
            cond_0 = torch.logical_not(cond_1)
            data_0 = x[cond_0]
            comp_data = torch.cat((data_0, data_1), 0)
            # after RAF
            self.sizes[2] += comp_data.size(0) * 8
            # affected data in the original shape
            if not self.training:
                x = torch.where(cond_1, ss_1, x)
            else:
                x = torch.mul(x, feat_1_) + torch.mul(ss_1, 1 - feat_1_)

        # quantization
        xsize = list(x.size())
        x = x.view(*(xsize + [1]))
        quant_dist = torch.pow(x - self.centers, 2)
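        # soft quantization: softmax-weighted average of the learned centers (hardout is computed but unused; the straight-through version is commented out)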
        softout = torch.sum(self.centers *
                            nn.functional.softmax(-quant_dist, dim=-1),
                            dim=-1)
        minval, index = torch.min(quant_dist, dim=-1, keepdim=True)
        hardout = torch.sum(self.centers * (minval == quant_dist), dim=-1)
        x = softout
        # x = softout + (hardout - softout).detach()
        if self.use_subsampling:
            comp_data = comp_data.view(*(list(comp_data.size()) + [1]))
            quant_dist = torch.pow(comp_data - self.centers, 2)
            index2 = torch.min(quant_dist, dim=-1, keepdim=True)[1]
            # after Q
            self.sizes[3] += index2.view(-1).size(0) * 3
            # run-length coding on the bitmap
            huffman = HuffmanCoding()
            real_size = len(huffman.compress(
                index2.view(-1).cpu().numpy())) * 4  # bit
            rle_len1 = mask_compression(mask_1.view(-1).cpu().numpy())
            real_size += rle_len1
            # after lossless
            self.sizes[4] += real_size
            filter_loss = torch.mean(feat_1)
            real_cr = 1 / 16. * real_size / (H * W * C * B * 8)
            softmax_dist = nn.functional.softmax(-quant_dist, dim=-1)
            soft_prob = torch.mean(softmax_dist, dim=0)
            entropy = -torch.sum(torch.mul(soft_prob, torch.log(soft_prob)))
            return x, (filter_loss, real_cr, entropy)
        else:
            self.sizes[2] += index.view(-1).size(0) * 3
            huffman = HuffmanCoding()
            real_size = len(huffman.compress(index.view(-1).cpu().numpy())) * 4
            self.sizes[3] += real_size
            real_cr = 1 / 16. * real_size / (H * W * C * B * 8)
            return x, real_cr
Example #24
#!/usr/bin/env python3
from huffman import HuffmanCoding
import pickle

# generates a huffman tree from Pride and Prejudice corpus from Project Gutenberg: https://www.gutenberg.org/ebooks/1342
# https://www.gutenberg.org/files/1342/1342-0.txt
import urllib.request

corpus_url = "https://www.gutenberg.org/files/1342/1342-0.txt"

h = HuffmanCoding()

txt = urllib.request.urlopen(corpus_url).read().decode('utf-8')
#print(txt)
txt = txt.replace("“", "\"")
txt = txt.replace("”", "\"")
output_path = h.generate_tree(txt)

# save tree to file

tree_loc = "tree.bin"

with open(tree_loc, 'wb') as binary_file:
    pickle.dump(h, binary_file)

print("Tree generated at {}".format(tree_loc))
Example #25
def testing(text, test_number, path, test_name):

    ratio = []
    timing = []

    print(f"test number: {test_number}")
    output = open(path + f"/test_{test_number}.txt", 'w')
    output.write(text)
    output.close()
    original_size = os.path.getsize(path + f"/test_{test_number}.txt")

    # Huffman
    print("Compressing with Huffman...")
    h = HuffmanCoding(output.name)

    start = time.time()
    compressed = h.compress()
    timing.append((time.time() - start) * 1000)

    h.decompress(compressed)
    ratio.append(os.path.getsize(compressed) / original_size * 100)
    print("Compressing with Huffman finished")

    # RLE
    print("Compressing with RLE...")
    rle = RLE()
    output = open(path + f"/test_{test_number}_rle.rle", 'w')

    start = time.time()
    output.write(rle.encode(text))
    timing.append((time.time() - start) * 1000)
    output.close()

    ratio.append(
        os.path.getsize(path + f"/test_{test_number}_rle.rle") /
        original_size * 100)
    print("Compressing with RLE finished")

    # LZW
    print("Compressing with LZW...")

    start = time.time()
    lzw3Compressor.LZWCompressor().compress(
        path + f"/test_{test_number}.txt",
        path + f"/test_{test_number}_lzw.lzw")
    timing.append((time.time() - start) * 1000)

    # lzw3Decompressor.LZWDecompressor().decompress(path + f"/test_{test_number}_lzw.lzw", path + f"/test_{test_number}_lzw_decompressed.txt")
    ratio.append(
        os.path.getsize(path + f"/test_{test_number}_lzw.lzw") /
        original_size * 100)
    print("Compressing with LZW finished")

    # LZ78
    print("Compressing with LZ78...")
    output = open(path + f"/test_{test_number}_lz78.lz78", 'w')

    start = time.time()
    output.write(lz78_compress(text))
    timing.append((time.time() - start) * 1000)
    output.close()

    ratio.append(
        os.path.getsize(path + f"/test_{test_number}_lz78.lz78") /
        original_size * 100)
    print("Compressing with LZ78 finished")

    # PPM
    print("compression with PPM...")

    start = time.time()
    ppm_compression(path + f"/test_{test_number}.txt",
                    path + f"/test_{test_number}_ppm.ppm")
    timing.append((time.time() - start) * 1000)

    # ppm_decompression(path + f"/test_{test_number}_ppm.ppm", path + f"/test_{test_number}_ppm_decompresed.txt")
    ratio.append(
        os.path.getsize(path + f"/test_{test_number}_ppm.ppm") /
        original_size * 100)
    print("compressing with PPM finished")

    save_bar_graph(
        ratio, timing,
        f"{test_name} N°{test_number}\nOriginal Size: {original_size} bytes",
        f"graphs/{test_name} {test_number}.svg")

    tick_label = ['Huffman', 'RLE', 'LZW', 'LZ78', 'PPM']

    with open(os.getcwd() + f"/data.txt", 'a') as records:
        records.write(f"\nOriginal Size: {original_size} bytes\n")
        records.write(f"\t\t\tSize\t\tCompression Ratio\t\t\tTime\n")
        for i in range(5):
            spacing = [
                "\t" if i != 0 else "", "\t" if int(ratio[i]) < 100 else "",
                "\t" if int(ratio[i] / 100 * original_size) < 100000 else ""
            ]
            records.write(
                f"{tick_label[i]}:\t{spacing[0]}{int(ratio[i]/100*original_size)} bytes{spacing[2]}\t{ratio[i]}%\t\t{timing[i]} ms\n"
            )

    return ratio, timing
Example #26
def main():
    h = HuffmanCoding(sample_data)
    output_path = h.compress()
    h.decompress(output_path)
Example #27
def main():
    parser = argparse.ArgumentParser(description='Word2vec')
    parser.add_argument('mode',
                        metavar='mode',
                        type=str,
                        help='"SG" for skipgram, "CBOW" for CBOW')
    parser.add_argument(
        'ns',
        metavar='negative_samples',
        type=int,
        help=
        '0 for hierarchical softmax, the other numbers would be the number of negative samples'
    )
    parser.add_argument(
        'part',
        metavar='partition',
        type=str,
        help=
        '"part" if you want to train on a part of corpus, "full" if you want to train on full corpus'
    )
    args = parser.parse_args()
    mode = args.mode
    part = args.part
    ns = args.ns

    #Load and preprocess corpus
    print("loading...")
    if part == "part":
        text = open('text8', mode='r').readlines(
        )[0][:1000000]  #Load a part of corpus for debugging
    elif part == "full":
        text = open('text8',
                    mode='r').readlines()[0]  #Load full corpus for submission
    else:
        print("Unknown argument : " + part)
        exit()

    print("preprocessing...")

    #subsampling of frequent words
    corpus = text.split()
    stats = Counter(corpus)
    words = []

    #Discard rare words
    for word in corpus:
        if stats[word] > 4:
            words.append(word)
    vocab = set(words)

    #Give an index number to a word
    w2i = {}
    w2i[" "] = 0
    i = 1
    for word in vocab:
        w2i[word] = i
        i += 1
    i2w = {}
    for k, v in w2i.items():
        i2w[v] = k

    #Code dict for hierarchical softmax
    freqdict = {}
    for word in vocab:
        freqdict[w2i[word]] = stats[word]
    codedict = HuffmanCoding().build(freqdict)

    #Frequency table for negative sampling
    freqtable = [0, 0, 0]
    for k, v in stats.items():
        f = int(v**0.75)
        for _ in range(f):
            if k in w2i.keys():
                freqtable.append(w2i[k])

    #Make training set
    print("build training set...")
    input_set = []
    target_set = []
    window_size = 5
    if mode == "CBOW":
        for j in range(len(words)):
            if j < window_size:
                input_set.append(
                    [0 for _ in range(window_size - j)] +
                    [w2i[words[k]] for k in range(j)] +
                    [w2i[words[j + k + 1]] for k in range(window_size)])
                target_set.append(w2i[words[j]])
            elif j >= len(words) - window_size:
                input_set.append(
                    [w2i[words[j - k - 1]] for k in range(window_size)] + [
                        w2i[words[len(words) - k - 1]]
                        for k in range(len(words) - j - 1)
                    ] + [0 for _ in range(j + window_size - len(words) + 1)])
                target_set.append(w2i[words[j]])
            else:
                input_set.append(
                    [w2i[words[j - k - 1]] for k in range(window_size)] +
                    [w2i[words[j + k + 1]] for k in range(window_size)])
                target_set.append(w2i[words[j]])
    if mode == "SG":
        for j in range(len(words)):
            if j < window_size:
                input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                target_set += [0 for _ in range(window_size - j)] + [
                    w2i[words[k]] for k in range(j)
                ] + [w2i[words[j + k + 1]] for k in range(window_size)]
            elif j >= len(words) - window_size:
                input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                target_set += [
                    w2i[words[j - k - 1]] for k in range(window_size)
                ] + [
                    w2i[words[len(words) - k - 1]]
                    for k in range(len(words) - j - 1)
                ] + [0 for _ in range(j + window_size - len(words) + 1)]
            else:
                input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                target_set += [
                    w2i[words[j - k - 1]] for k in range(window_size)
                ] + [w2i[words[j + k + 1]] for k in range(window_size)]

    print("Vocabulary size")
    print(len(w2i))
    print()

    #Training section
    emb, _ = word2vec_trainer(input_set,
                              target_set,
                              len(w2i),
                              codedict,
                              freqtable,
                              mode=mode,
                              NS=ns,
                              dimension=300,
                              epoch=1,
                              learning_rate=0.01)
    Analogical_Reasoning_Task(emb, w2i, i2w, mode, part, ns)
Example #28
        print("Invalid Switch/Usage " + sys.argv[1])
    print("Usage :\n")
    print("To compress : \npython " + sys.argv[0] +
          " -c filename.txt [dictfile.dict]\n")
    print("To decompress : \npython " + sys.argv[0] +
          " -x filename.bin [dictfile.dict]")
    print(
        "filename.dict is optional, to be used if the dictionary was saved under a different name."
    )
    exit(0)

if sys.argv[1] == '-c':
    print()
    pathf = sys.argv[2]
    dictf = ''
    if len(sys.argv) > 3:
        dictf = sys.argv[3]
    h = HuffmanCoding(pathf)
    out = h.compress()
    h.save_codes(dictf)
    h.get_code()
    h.get_freq()
elif sys.argv[1] == '-x':
    print()
    pathf = sys.argv[2]
    dictf = ''
    if len(sys.argv) > 3:
        dictf = sys.argv[3]
    h = HuffmanCoding(pathf, dictf)
    h.decompress()
Example #29
from huffman import HuffmanCoding
import sys

path = "textfile.txt"

h = HuffmanCoding(path)

output_path = h.compress()
print("Compressed file path: " + output_path)

decom_path = h.decompress(output_path)
print("Decompressed file path: " + decom_path)
Example #30
def main():
    parser = argparse.ArgumentParser(description='Word2vec')
    parser.add_argument('mode', metavar='mode', type=str,
                        help='"SG" for skipgram, "CBOW" for CBOW')
    parser.add_argument('ns', metavar='negative_samples', type=int,
                        help='0 for hierarchical softmax, the other numbers would be the number of negative samples')
    parser.add_argument('part', metavar='partition', type=str,
                        help='"part" if you want to train on a part of corpus, "full" if you want to train on full corpus')
    args = parser.parse_args()
    mode = args.mode
    part = args.part
    ns = args.ns

    print("loading...")
    if part=="part":
        # number of characters to load
        text = open('text8',mode='r').readlines()[0][:1000000]
    elif part=="full":
        text = open('text8',mode='r').readlines()[0]
    else:
        print("Unknown argument : " + part)
        exit()

    print("preprocessing...")
    corpus = text.split()
    stats = Counter(corpus)
    words = []

    for word in corpus:
        if stats[word]>4:
            words.append(word)
    vocab = set(words)

    w2i = {}
    w2i[" "]=0
    i = 1
    for word in vocab:
        w2i[word] = i
        i+=1
    i2w = {}
    for k,v in w2i.items():
        i2w[v]=k

    freqdict={}
    freqdict[0]=10
    for word in vocab:
        freqdict[w2i[word]]=stats[word]
    codedict = HuffmanCoding().build(freqdict)

    freqtable = [0,0,0]
    for k,v in stats.items():
        f = int(v**0.75)
        for _ in range(f):
            if k in w2i.keys():
                freqtable.append(w2i[k])

    print("build training set...")
    input_set = []
    target_set = []
    window_size = 5
    if mode=="CBOW":
        for j in range(len(words)):
            if j<window_size:
                input_set.append([0 for _ in range(window_size-j)] + [w2i[words[k]] for k in range(j)] + [w2i[words[j+k+1]] for k in range(window_size)])
                target_set.append(w2i[words[j]])
            elif j>=len(words)-window_size:
                input_set.append([w2i[words[j-k-1]] for k in range(window_size)] + [w2i[words[len(words)-k-1]] for k in range(len(words)-j-1)] + [0 for _ in range(j+window_size-len(words)+1)])
                target_set.append(w2i[words[j]])
            else:
                input_set.append([w2i[words[j-k-1]] for k in range(window_size)] + [w2i[words[j+k+1]] for k in range(window_size)])
                target_set.append(w2i[words[j]])
    if mode=="SG":
        for j in range(len(words)):
            if j<window_size:
                input_set += [w2i[words[j]] for _ in range(window_size*2)]
                target_set += [0 for _ in range(window_size-j)] + [w2i[words[k]] for k in range(j)] + [w2i[words[j+k+1]] for k in range(window_size)]
            elif j>=len(words)-window_size:
                input_set += [w2i[words[j]] for _ in range(window_size*2)]
                target_set += [w2i[words[j-k-1]] for k in range(window_size)] + [w2i[words[len(words)-k-1]] for k in range(len(words)-j-1)] + [0 for _ in range(j+window_size-len(words)+1)]
            else:
                input_set += [w2i[words[j]] for _ in range(window_size*2)]
                target_set += [w2i[words[j-k-1]] for k in range(window_size)] + [w2i[words[j+k+1]] for k in range(window_size)]

    print("Vocabulary size")
    print(len(w2i))
    print()

    numwords = len(w2i)
    use_subsample = True
    W_in ,_ = word2vec_trainer(input_set, target_set, numwords, codedict, freqtable, mode=mode, NS=ns, dimension=64, epoch=1, learning_rate=0.01, do_subsampling=use_subsample)

    emb = {}
    for index in range(numwords):
        emb[i2w[index]] = W_in[index]

    Analogical_Reasoning_Task(emb, output_name=f"{mode} {ns} {use_subsample} dim=64.txt")