Example #1
def main(args):
    # Initial process
    args = vars(args)
    unicode_enc = args['unicode_enc']  # selects the message encoding scheme
    mode = args['mode']  # selects the steganography algorithm
    block_size = args['block_size']  # steganography parameter BLOCK_SIZE (for huffman and bins)
    temp = args['temp']  # steganography parameter TEMPERATURE; avoid creating a new temp variable below
    precision = args['precision']  # steganography parameter
    topk = args['topk']  # text-generation parameter
    device = args['device']  # text-generation parameter: GPU/CPU selection, default 'cuda'
    finish_sent = args['finish_sent']  # steganography parameter
    nucleus = args['nucleus']  # saac-specific steganography parameter
    delta = args['delta']  # saac-specific steganography parameter
    model_name = args['language_model']  # text-generation model
    context_file = args['context_file']  # path to the context file
    message_str = args['name']
    # sample_tokens = 100               # test-only variable

    # PARAMETERS: default secret message (a person's name) for the first pass
    # message_str = "Chhenl"              # string to be hidden.

    # VALIDATE PARAMETERS: check the steganography algorithm
    if mode not in ['arithmetic', 'huffman', 'bins', 'saac']:
        raise NotImplementedError

    # print the secret message (the name)
    print("Default plain_text is ", message_str)

    # read the context
    with open(context_file, 'r', encoding='utf-8') as f:
        context = f.read()
    print("sample context is ",
          context)  # related to the text generation procedure.

    # load the text-generation model
    print("loading GPT-2 LM to GPU")
    enc, model = get_model(model_name=model_name)
    print("finish loading !")

    print("implication of {}".format(mode))

    # setup for the bins algorithm
    if mode == 'bins':
        bin2words, words2bin = get_bins(len(enc.encoder), block_size)
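        # get_bins presumably partitions the vocabulary into 2**block_size bins,
        # so each generated token can carry block_size bits (its bin index)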

    # setup for the saac algorithm
    if delta and mode == "saac":
        nucleus = 2**(-1.0 * delta)
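        # e.g. delta = 1.0 gives nucleus = 0.5; delta = 0.01 gives nucleus = 2**-0.01 ≈ 0.993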

    # the comments below are leftovers from earlier debugging
    # sanity check: directly encode the text.
    # print("directly encode the plain txt:\n", enc.encode(message_str))
    # print("Decode back:\n", enc.decode(enc.encode(message_str)))

    # confirmed the problem arises in the arithmetic_decode as well as the arithmetic_encode function.

    # ----------------------start test----------------------------
    # test_str = "hello world."
    # print("test_str = ", test_str)
    # out = enc.encode(test_str)
    # print("out = ", out)
    # decode_str = enc.decode(out)
    # print("decode_str = ", decode_str)
    # print("enc.encode(decode_str) = ", enc.encode(decode_str))
    # ----------------------stop test-----------------------------

    # Archive Basic Initialization----------------------------------
    # print("plain_text is {}".format(message_str))
    # unicode_enc = False
    # mode = 'huffman'
    # block_size = 3 # for huffman and bins
    # temp = 0.9 # for arithmetic
    # precision = 26 # for arithmetic
    # sample_tokens = 100 # for sample, delete sample
    # topk = 300
    # device = 'cuda'
    # finish_sent=False # whether or not to force finish sent. If so, stats displayed will be for non-finished sentence
    # nucleus = 0.95
    # Archive Basic Initialization----------------------------------

    first_flag = 1  # flag marking the default first pass below
    context_tokens = encode_context(context, enc)  # tokenize the context with the language model's encoder

    while True:
        # --- inside this loop, keep waiting for a new secret message (name) to embed ---------
        # ------------------------------------------------------------------------------------
        # list_for_bpw = [] # used to compute the bits/word metric
        # list_for_DKL = [] # used to compute the KL metric
        # list_for_seq = [] # used for indexing

        if first_flag == 0:
            message_str = input("Please reenter a new plaintext:")
            # output_amount = len(message_str)

        # generate all upper/lower-case variants of the secret name (see the dfs sketch after this example)
        message_str = message_str.upper()
        arr = list(message_str)
        generated_array = dfs(arr, 0, [])

        first_flag = 0
        covertext_list = []

        for temp_count in range(0, len(generated_array)):
            # First encode message to uniform bits, without any context
            # (not essential this is arithmetic vs ascii, but it's more efficient when the message is natural language)

            # if temp_count > 10:
            #     break                 # fix before release: caps output at 10 covertexts for testing

            print("=" * 80)
            print("Altering the #{} msg_str:".format(temp_count), message_str)
            message_str = generated_array[temp_count]  # pick one case variant (e.g. KiErAn)

            # build message, i.e. the bit stream described above
            if unicode_enc:
                ba = bitarray.bitarray()
                ba.frombytes(message_str.encode('utf-8'))
                message = ba.tolist()
            else:
                message_ctx = [enc.encoder['<|endoftext|>']]
                message_str += '<eos>'
                message = decode_arithmetic(model,
                                            enc,
                                            message_str,
                                            message_ctx,
                                            precision=40,
                                            topk=60000)
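                # note the inversion: decode_arithmetic in effect treats the plaintext
                # as if it were model output and recovers the near-uniform bit sequence
                # that would have generated it; encode_arithmetic later maps recovered
                # bits back to text the same way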

            # print("First encode the text to a bit sequence!")
            # print(message)  # the binary stream. text--arithmetic-->binary stream
            # print("the length is {}".format(len(message)))

            # Next encode bits into cover text, using arbitrary context

            # the chosen steganography algorithm embeds the bit stream into generated text; decoding out with the GPT-2 tokenizer yields the COVERTEXT
            Hq = 0
            if mode == 'arithmetic':
                out, nll, kl, words_per_bit, Hq = encode_arithmetic(
                    model,
                    enc,
                    message,
                    context_tokens,
                    temp=temp,
                    finish_sent=finish_sent,
                    precision=precision,
                    topk=topk)
            elif mode == 'huffman':
                out, nll, kl, words_per_bit = encode_huffman(
                    model,
                    enc,
                    message,
                    context_tokens,
                    block_size,
                    finish_sent=finish_sent)
            elif mode == 'bins':
                out, nll, kl, words_per_bit = encode_block(
                    model,
                    enc,
                    message,
                    context_tokens,
                    block_size,
                    bin2words,
                    words2bin,
                    finish_sent=finish_sent)
            elif mode == 'saac':
                out, nll, kl, words_per_bit, Hq, topk_list, case_studies = encode_saac(
                    model,
                    enc,
                    message,
                    context_tokens,
                    device=device,
                    temp=temp,
                    precision=precision,
                    topk=topk,
                    nucleus=nucleus)
            #     note: typical defaults are device='cuda', temp=1.0, precision=26, topk=50, nucleus=0.95.
            covertext = enc.decode(out)
            covertext_list.append(covertext)  # collect every COVERTEXT in one list for later use

            # list_for_bpw.append(1/words_per_bit)      # metric bookkeeping
            # list_for_DKL.append(kl)                   # metric bookkeeping
            # list_for_seq.append(temp_count)
            # print("="*40 + " Encoding " + "="*40)

            # print the result; the covertext can be extracted from here
            print(
                '#{} generated covertext:\n'.format(temp_count), covertext
            )  # covertext. generated covertext that contains secret information.
            print(
                'ppl: %0.2f, kl: %0.3f, words/bit: %0.2f, bits/word: %0.2f, entropy: %.2f'
                % (math.exp(nll), kl, words_per_bit, 1 / words_per_bit,
                   Hq / 0.69315))
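            # Hq is measured in nats; dividing by ln 2 ≈ 0.69315 converts the entropy to bits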

            # -----------------------------------------------------------------------------------
            # extraction stage: apply the matching steganography algorithm to the covertext to recover the bit stream MESSAGE_REC
            # Decode binary message from bits using the same arbitrary context

            # may be used later; commented out for now: the receiver enters their own name and the covertext for verification.
            # input_name = input("Please input ur name:")
            # input_covertext = input("Please input the covertext:")
            # covertext = input_covertext

            if mode == 'arithmetic':
                message_rec = decode_arithmetic(model,
                                                enc,
                                                covertext,
                                                context_tokens,
                                                temp=temp,
                                                precision=precision,
                                                topk=topk)
            elif mode == 'huffman':
                message_rec = decode_huffman(model, enc, covertext,
                                             context_tokens, block_size)
            elif mode == 'bins':
                message_rec = decode_block(model, enc, covertext,
                                           context_tokens, block_size,
                                           bin2words, words2bin)
            elif mode == 'saac':
                message_rec = decode_saac(model,
                                          enc,
                                          covertext,
                                          context_tokens,
                                          device=device,
                                          temp=temp,
                                          precision=precision,
                                          topk=topk,
                                          nucleus=nucleus)

            # print("="*40 + " Recovered Message " + "="*40)
            # print(message_rec)  # binary stream extracted from stego_text.
            # print("=" * 80)
            # Finally map message bits back to original text

            # decode the bit stream; the resulting reconst is the extracted plaintext, normally the name.
            if unicode_enc:
                message_rec = [bool(item) for item in message_rec]
                ba = bitarray.bitarray(message_rec)
                reconst = ba.tobytes().decode('utf-8', 'ignore')
            else:
                reconst = encode_arithmetic(model,
                                            enc,
                                            message_rec,
                                            message_ctx,
                                            precision=40,
                                            topk=60000)
                # reconst = encode_arithmetic(model, enc, message_rec, message_ctx, temp=temp, precision=precision, topk=topk)
                # print("reconst[0] is", format(reconst[0]))
                reconst = enc.decode(reconst[0])
            print("The decode text is ")
            print(reconst[0:-5]
                  )  # Decoded text. message_rec --arithmetic decode--> reconst
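
The dfs helper called above (and again in Examples #3 and #4) is not shown in these listings. A minimal sketch of what it appears to do, assuming it enumerates every upper/lower-case variant of the name (the original implementation may differ):

def dfs(arr, index, results):
    # enumerate all case variants of arr into the shared results list,
    # e.g. dfs(list("AB"), 0, []) -> ["AB", "Ab", "aB", "ab"]
    # (non-letters would produce duplicates; the real helper may handle them differently)
    if index == len(arr):
        results.append(''.join(arr))
        return results
    arr[index] = arr[index].upper()
    dfs(list(arr), index + 1, results)
    arr[index] = arr[index].lower()
    dfs(list(arr), index + 1, results)
    return results
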
Example #2
def main(args):
    enc, model = get_model(model_name=args.lm)

    ## PARAMETERS
    message_str = (args.message
                   if args.message != "" else "This is a very secret message!")

    unicode_enc = False
    mode = args.mode
    block_size = 3  # for huffman and bins
    temp = 0.9  # for arithmetic
    precision = 26  # for arithmetic
    sample_tokens = 100  # for sample
    topk = 300
    finish_sent = False  # whether or not to force finish sent. If so, stats displayed will be for non-finished sentence

    ## VALIDATE PARAMETERS
    if mode not in ['arithmetic', 'huffman', 'bins', 'sample']:
        raise NotImplementedError

    if mode == 'bins':
        bin2words, words2bin = get_bins(len(enc.encoder), block_size)

    context = args.context if args.context != "" else \
"""Washington received his initial military training and command with the Virginia Regiment during the French and Indian War. He was later elected to the Virginia House of Burgesses and was named a delegate to the Continental Congress, where he was appointed Commanding General of the nation's Continental Army. Washington led American forces, allied with France, in the defeat of the British at Yorktown. Once victory for the United States was in hand in 1783, Washington resigned his commission.


"""

    context_tokens = encode_context(context, enc)

    # ------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------

    # First encode message to uniform bits, without any context
    # (not essential this is arithmetic vs ascii, but it's more efficient when the message is natural language)
    if unicode_enc:
        ba = bitarray.bitarray()
        ba.frombytes(message_str.encode('utf-8'))
        message = ba.tolist()
    else:
        message_ctx = [enc.encoder['<|endoftext|>']]
        message_str += '<eos>'
        message = decode_arithmetic(model,
                                    enc,
                                    message_str,
                                    message_ctx,
                                    precision=40,
                                    topk=60000)

    # Next encode bits into cover text, using arbitrary context
    Hq = 0
    if mode == 'arithmetic':
        out, nll, kl, words_per_bit, Hq = encode_arithmetic(
            model,
            enc,
            message,
            context_tokens,
            temp=temp,
            finish_sent=finish_sent,
            precision=precision,
            topk=topk)
    elif mode == 'huffman':
        out, nll, kl, words_per_bit = encode_huffman(model,
                                                     enc,
                                                     message,
                                                     context_tokens,
                                                     block_size,
                                                     finish_sent=finish_sent)
    elif mode == 'bins':
        out, nll, kl, words_per_bit = encode_block(model,
                                                   enc,
                                                   message,
                                                   context_tokens,
                                                   block_size,
                                                   bin2words,
                                                   words2bin,
                                                   finish_sent=finish_sent)
    elif mode == 'sample':
        out, nll, kl, Hq = sample(model,
                                  enc,
                                  sample_tokens,
                                  context_tokens,
                                  temperature=temp,
                                  topk=topk)
        words_per_bit = 1
    text = enc.decode(out)

    print(message)
    print(len(message))
    print("=" * 40 + " Encoding " + "=" * 40)
    print(text)
    print(
        'ppl: %0.2f, kl: %0.3f, words/bit: %0.2f, bits/word: %0.2f, entropy: %.2f'
        % (math.exp(nll), kl, words_per_bit, 1 / words_per_bit, Hq / 0.69315))

    # Decode binary message from bits using the same arbitrary context
    if mode != 'sample':
        if mode == 'arithmetic':
            message_rec = decode_arithmetic(model,
                                            enc,
                                            text,
                                            context_tokens,
                                            temp=temp,
                                            precision=precision,
                                            topk=topk)
        elif mode == 'huffman':
            message_rec = decode_huffman(model, enc, text, context_tokens,
                                         block_size)
        elif mode == 'bins':
            message_rec = decode_block(model, enc, text, context_tokens,
                                       block_size, bin2words, words2bin)

        print("=" * 40 + " Recovered Message " + "=" * 40)
        print(message_rec)
        print("=" * 80)
        # Finally map message bits back to original text
        if unicode_enc:
            message_rec = [bool(item) for item in message_rec]
            ba = bitarray.bitarray(message_rec)
            reconst = ba.tobytes().decode('utf-8', 'ignore')
        else:
            reconst = encode_arithmetic(model,
                                        enc,
                                        message_rec,
                                        message_ctx,
                                        precision=40,
                                        topk=60000)
            reconst = enc.decode(reconst[0])
        print(reconst)
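
This main() reads args.lm, args.message, args.mode, and args.context, but the CLI wrapper is not shown. A plausible sketch with flag names inferred from those attributes (assumptions, not the original script's interface):

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--lm', default='gpt2', help='language model name')
    parser.add_argument('--mode', default='arithmetic',
                        choices=['arithmetic', 'huffman', 'bins', 'sample'])
    parser.add_argument('--message', default='', help='plaintext to hide')
    parser.add_argument('--context', default='', help='generation context')
    main(parser.parse_args())
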
Example #3
def encrypt(unicode_enc, mode, block_size, temp, precision, topk, device,
            finish_sent, model_name, delta, context, message_str):
    print("loading GPT-2 LM to GPU")
    enc, model = get_model(model_name=model_name)
    print("finish loading !")

    print("implication of {}".format(mode))
    if mode == 'bins':
        bin2words, words2bin = get_bins(len(enc.encoder), block_size)

    if delta and mode == "saac":
        nucleus = 2**(-1.0 * delta)

    first_flag = 1
    context_tokens = encode_context(context, enc)
    while True:
        sentence_assemble = []
        if first_flag == 0:
            message_str = input("Please reenter a new plaintext:")
            # output_amount = len(message_str)
        message_str = message_str.upper()
        arr = list(message_str)
        generated_array = dfs(arr, 0, [])
        first_flag = 0
        for temp_count in range(0, len(generated_array)):
            # First encode message to uniform bits, without any context
            # (not essential this is arithmetic vs ascii, but it's more efficient when the message is natural language)

            # if temp_count > 10:  # guard against running too many times
            #     break

            print("=" * 80)
            print("Altering the #{} msg_str:".format(temp_count), message_str)
            message_str = generated_array[temp_count]

            if unicode_enc:
                ba = bitarray.bitarray()
                ba.frombytes(message_str.encode('utf-8'))
                message = ba.tolist()
            else:
                message_ctx = [enc.encoder['<|endoftext|>']]
                message_str += '<eos>'
                message = decode_arithmetic(model,
                                            enc,
                                            message_str,
                                            message_ctx,
                                            precision=40,
                                            topk=60000)
                # message = decode_arithmetic(model, enc, message_str, message_ctx, precision=precision, topk=topk, temp=temp)

            Hq = 0
            if mode == 'arithmetic':
                out, nll, kl, words_per_bit, Hq = encode_arithmetic(
                    model,
                    enc,
                    message,
                    context_tokens,
                    temp=temp,
                    finish_sent=finish_sent,
                    precision=precision,
                    topk=topk)
            elif mode == 'huffman':
                out, nll, kl, words_per_bit = encode_huffman(
                    model,
                    enc,
                    message,
                    context_tokens,
                    block_size,
                    finish_sent=finish_sent)
            elif mode == 'bins':
                out, nll, kl, words_per_bit = encode_block(
                    model,
                    enc,
                    message,
                    context_tokens,
                    block_size,
                    bin2words,
                    words2bin,
                    finish_sent=finish_sent)
                words_per_bit = 1
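                # note: this overwrites the words_per_bit returned by encode_block
                # above; likely a leftover from the sample mode in other variants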
            elif mode == 'saac':
                out, nll, kl, words_per_bit, Hq, topk_list, case_studies = encode_saac(
                    model,
                    enc,
                    message,
                    context_tokens,
                    device=device,
                    temp=temp,
                    precision=precision,
                    topk=topk,
                    nucleus=nucleus)
            #     note: typical defaults are device='cuda', temp=1.0, precision=26, topk=50, nucleus=0.95.
            text = enc.decode(out)
            # print("="*40 + " Encoding " + "="*40)
            print(
                '#{} generated covertext:\n'.format(temp_count), text
            )  # covertext. generated text that contains secret information.
            # print('ppl: %0.2f, kl: %0.3f, words/bit: %0.2f, bits/word: %0.2f, entropy: %.2f' % (math.exp(nll), kl, words_per_bit, 1/words_per_bit, Hq/0.69315))
            sentence_assemble.append(text)
        dataframe = pd.DataFrame({'Sentences': sentence_assemble})
        dataframe.to_csv("User_{}_Name_{}_Amount_{}.csv".format(
            random.randint(1, 10000),
            message_str.upper()[0:-5], len(generated_array)),
                         index=False,
                         sep=',')
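
A hedged invocation sketch for encrypt(); the values mirror the defaults used elsewhere in these examples, and the context string is a truncated placeholder. Note that encrypt() loops forever, prompting for a new plaintext after each pass, so it never returns:

encrypt(unicode_enc=False, mode='arithmetic', block_size=3, temp=0.9,
        precision=26, topk=300, device='cuda', finish_sent=False,
        model_name='gpt2', delta=0.01,
        context="Washington received his initial military training ...",  # placeholder
        message_str='Chhenl')
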
Example #4
def embed(unicode_enc=False, mode='saac', block_size=1, temp=0.9, precision=26, topk=300, device='cuda',
          finish_sent=False, nucleus=0.95, delta=0.01, model_name='gpt2',
          context_file='D:/OneDrive - whu.edu.cn/桌面/NeuralSteganography-master1/context.txt', name='Gogo'):
    # Example: embed(mode='saac', name='Chhenl', temp=0.9)
    # covertext_list: list holding the generated covertexts

    temp = float(temp)
    message_str = name
    # VALIDATE PARAMETERS: check the steganography algorithm
    if mode not in ['arithmetic', 'huffman', 'bins', 'saac']:
        raise NotImplementedError

    # print the secret message (the name)
    print("Plain_text is ", message_str)

    # read the context
    with open(context_file, 'r', encoding='utf-8') as f:
        context = f.read()
    print("sample context is ", context)  # related to the text generation procedure.

    # load the text-generation model
    print("loading GPT-2 LM to GPU")
    enc, model = get_model(model_name=model_name)
    print("finish loading !")

    print("implication of {}".format(mode))

    # setup for the bins algorithm
    if mode == 'bins':
        bin2words, words2bin = get_bins(len(enc.encoder), block_size)

    # setup for the saac algorithm
    if delta and mode == "saac":
        nucleus = 2 ** (-1.0 * delta)

    # first_flag = 1  # flag marking the default first pass
    context_tokens = encode_context(context, enc)  # tokenize the context with the language model's encoder

    # generate all upper/lower-case variants of the secret name (see the dfs sketch after Example #1)
    message_str = message_str.upper()
    arr = list(message_str)
    generated_array = dfs(arr, 0, [])

    # first_flag = 0
    covertext_list = []

    for temp_count in range(0, len(generated_array)):
        # First encode message to uniform bits, without any context
        # (not essential this is arithmetic vs ascii, but it's more efficient when the message is natural language)

        if temp_count > 1:
            break                 # fix before release: for testing this caps output at the first 2 covertexts

        print("=" * 80)
        print("Altering the #{} msg_str:".format(temp_count), message_str)
        message_str = generated_array[temp_count]  # pick one case variant (e.g. KiErAn)

        # build message, i.e. the bit stream described above
        if unicode_enc:
            ba = bitarray.bitarray()
            ba.frombytes(message_str.encode('utf-8'))
            message = ba.tolist()
        else:
            message_ctx = [enc.encoder['<|endoftext|>']]
            message_str += '<eos>'
            message = decode_arithmetic(model, enc, message_str, message_ctx, precision=40, topk=60000)

        # Next encode bits into cover text, using arbitrary context

        # the chosen steganography algorithm embeds the bit stream into generated text; decoding out with the GPT-2 tokenizer yields the COVERTEXT
        Hq = 0
        if mode == 'arithmetic':
            out, nll, kl, words_per_bit, Hq = encode_arithmetic(model, enc, message, context_tokens, temp=temp,
                                                                finish_sent=finish_sent, precision=precision,
                                                                topk=topk)
        elif mode == 'huffman':
            out, nll, kl, words_per_bit = encode_huffman(model, enc, message, context_tokens, block_size,
                                                         finish_sent=finish_sent)
        elif mode == 'bins':
            out, nll, kl, words_per_bit = encode_block(model, enc, message, context_tokens, block_size, bin2words,
                                                       words2bin, finish_sent=finish_sent)
        elif mode == 'saac':
            out, nll, kl, words_per_bit, Hq, topk_list, case_studies = encode_saac(model, enc, message,
                                                                                   context_tokens, device=device,
                                                                                   temp=temp, precision=precision,
                                                                                   topk=topk, nucleus=nucleus)
        covertext = enc.decode(out)
        covertext_list.append(covertext)  # collect every COVERTEXT in one list for later use

        # print the result; the covertext can be extracted from here
        print('#{} generated covertext:\n'.format(temp_count),
              covertext)  # covertext. generated covertext that contains secret information.

    return covertext_list
Example #5
def extract(unicode_enc=False, mode='saac', block_size=1, temp=0.9, precision=26,
            topk=300, device='cuda', finish_sent=False, nucleus=0.95, delta=0.01,
            model_name='gpt2',
            context_file='D:/OneDrive - whu.edu.cn/桌面/NeuralSteganography-master1/context.txt',
            covertext='Hello', name='Gogo'):
    # Example: extract(mode='saac', name='Chhenl', temp=0.9, covertext='Temp')

    temp = float(temp)
    # VALIDATE PARAMETERS: check the steganography algorithm
    if mode not in ['arithmetic', 'huffman', 'bins', 'saac']:
        raise NotImplementedError


    # print the covertext and the target name
    print("Cover_text is ", covertext)
    print("Target is ", name)
    # read the context
    with open(context_file, 'r', encoding='utf-8') as f:
        context = f.read()
    print("sample context is ", context)  # related to the text generation procedure.

    # load the text-generation model
    print("loading GPT-2 LM to GPU")
    enc, model = get_model(model_name=model_name)
    print("finish loading !")
    message_ctx = [enc.encoder['<|endoftext|>']]
    print("implication of {}".format(mode))

    # setup for the bins algorithm
    if mode == 'bins':
        bin2words, words2bin = get_bins(len(enc.encoder), block_size)

    # setup for the saac algorithm
    if delta and mode == "saac":
        nucleus = 2 ** (-1.0 * delta)

    context_tokens = encode_context(context, enc)  # tokenize the context with the language model's encoder

    # -----------------------------------------------------------------------------------
    # extraction stage: apply the matching steganography algorithm to the covertext to recover the bit stream MESSAGE_REC
    # Decode binary message from bits using the same arbitrary context

    # may be used later; commented out for now: the receiver enters their own name and the covertext for verification.
    # input_name = input("Please input ur name:")
    # input_covertext = input("Please input the covertext:")
    # covertext = input_covertext

    if mode == 'arithmetic':
        message_rec = decode_arithmetic(model, enc, covertext, context_tokens, temp=temp, precision=precision,
                                        topk=topk)
    elif mode == 'huffman':
        message_rec = decode_huffman(model, enc, covertext, context_tokens, block_size)
    elif mode == 'bins':
        message_rec = decode_block(model, enc, covertext, context_tokens, block_size, bin2words, words2bin)
    elif mode == 'saac':
        message_rec = decode_saac(model, enc, covertext, context_tokens, device=device, temp=temp,
                                  precision=precision, topk=topk, nucleus=nucleus)

    # print("="*40 + " Recovered Message " + "="*40)
    # print(message_rec)  # binary stream extracted from stego_text.
    # print("=" * 80)
    # Finally map message bits back to original text

    # decode the bit stream; the resulting reconst is the extracted plaintext, normally the name.
    if unicode_enc:
        message_rec = [bool(item) for item in message_rec]
        ba = bitarray.bitarray(message_rec)
        reconst = ba.tobytes().decode('utf-8', 'ignore')
    else:
        reconst = encode_arithmetic(model, enc, message_rec, message_ctx, precision=40, topk=60000)
        # reconst = encode_arithmetic(model, enc, message_rec, message_ctx, temp=temp, precision=precision, topk=topk)
        # print("reconst[0] is", format(reconst[0]))
        reconst = enc.decode(reconst[0])
    print("The decode text is ")
    print(reconst[0:-5])  # Decoded text. message_rec --arithmetic decode--> reconst

    # basic check: does this covertext correspond to the given name (matching the input setup)?
    extracted_name = reconst[0:-5].upper()
    # print("input name is ", name)
    # print("extracted name is ", extracted_name)
    if extracted_name == name.upper():
        print("YOU ARE THE ONE! (^..^)")
        return 1  # success
    else:
        print("PITY. ('..') ")
        return 0  # failure
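
A hedged round-trip check built from the two functions above: embed() returns a list of covertexts and extract() returns 1 on a match, 0 otherwise. The name and parameters mirror the comment examples:

covertexts = embed(mode='saac', name='Chhenl', temp=0.9)
for c in covertexts:
    assert extract(mode='saac', name='Chhenl', temp=0.9, covertext=c) == 1
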
Example #6
def main(args):
    # process hyperparameters
    args = vars(args)
    dataset = args['dataset']
    dataset_path = args['dataset_path']
    lm_model = args['lm']
    device = args['device']
    encryption_method = args["encrypt"]
    use_cached_encryption_results = (encryption_method == "cached")
    steganography_method = args["encode"]
    precision = args["precision"]
    temp = args["temp"]
    topK = args["topK"]
    block_size = args["block_size"]
    nucleus = args["nucleus"]
    delta = args["delta"]
    if delta:
        nucleus = 2**(-1.0 * delta)
    print("Loading large LM to GPU, please wait for a few seconds...")
    enc, model, device = get_model(model_name=lm_model, device_id=device)

    # load plaintext dataset
    if dataset != "random":
        with open(f"{dataset_path}/plaintext.txt", "r") as fin:
            plaintexts = [
                line.strip() for line in fin.readlines() if line.strip() != ""
            ]
        print(f"Encoding {len(plaintexts)} plaintexts")
    bin2words, words2bin = get_bins(len(enc.encoder), block_size)
    args["bin2words"] = bin2words
    args["words2bin"] = words2bin

    # encryption
    print(f"Encryption Algorithm: {encryption_method}")
    if use_cached_encryption_results:
        print("Load existing encrypted messages")
        encryption_infos = []
        messages = []
        with open(f"{dataset_path}/message_bits.txt", "r") as fin:
            for line in fin:
                line = line.strip()
                if line:
                    messages.append(eval(line))
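                    # eval() parses the stored list literal;
                    # ast.literal_eval would be a safer drop-in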
    else:
        encryption_infos = []
        encryption_context = ""
        messages = []
        for plaintext in tqdm(plaintexts, desc="encrypting"):
            message, info = plaintext2bits(plaintext, encryption_context,
                                           model, enc, lm_model,
                                           encryption_method, device)
            messages.append(message)
            encryption_infos.append(info)
        with open(f"{dataset_path}/message_bits.txt", "w") as fout:
            for message in messages:
                fout.write(str(message))
                fout.write("\n")

    # steganography encoding
    encoding_infos = []
    encoding_context = "Washington received his initial military training and command with the Virginia Regiment during the French and Indian War. He was later elected to the Virginia House of Burgesses and was named a delegate to the Continental Congress, where he was appointed Commanding General of the nation's Continental Army. Washington led American forces, allied with France, in the defeat of the British at Yorktown. Once victory for the United States was in hand in 1783, Washington resigned his commission."
    covertexts = []
    print(f"Steganography Encoding Algorithm: {steganography_method}")
    start = time.time()
    for message in tqdm(messages[:100], desc="encoding"):
        covertext, info = bits2covertext(message,
                                         encoding_context,
                                         model,
                                         enc,
                                         lm_model,
                                         steganography_method,
                                         device,
                                         bin2words=bin2words,
                                         words2bin=words2bin,
                                         precision=precision,
                                         temp=temp,
                                         topK=topK,
                                         block_size=block_size,
                                         nucleus=nucleus)
        covertexts.append(covertext)
        encoding_infos.append(info)
    end = time.time()
    efficiency = (end - start) / 100  # seconds per example (the 100 messages encoded above)
    print(f"Average {efficiency:.3f} s per example")

    results = {
        "encrpytion_infos": encryption_infos,
        "encoding_infos": encoding_infos,
        "covertexts": covertexts
    }
    output_name = get_output_file_name(args)
    with open(output_name, "w") as fout:
        json.dump(results,
                  fout,
                  indent=4,
                  sort_keys=True,
                  separators=(',', ': '))
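
The get_output_file_name helper is not shown. A minimal hypothetical stand-in, assuming it builds a JSON path from the main hyperparameters (the real helper's fields and format may differ):

def get_output_file_name(args):
    # hypothetical naming scheme; args is the vars(args) dict built in main()
    return "results_{}_{}_{}_t{}_p{}.json".format(
        args["dataset"], args["encrypt"], args["encode"],
        args["temp"], args["precision"])
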
Example #7
def main(args):
    # get model hyperparameters
    args = vars(args)
    lm_model = args['lm']
    device = args['device']
    encryption_method = args["encrypt"]
    steganography_method = args["encode"]
    precision = args["precision"]
    temp = args["temp"]
    topk = args["topK"]
    block_size = args["block_size"]
    nucleus = args["nucleus"]
    delta = args["delta"]
    if delta:
        nucleus = 2**(-1.0 * delta)

    # get plaintext
    if args["plaintext"] == "":
        plaintext = "molly ultra caps capped at 180mgs will have you flying for hrs clean come down 99 of the time ."
    else:
        plaintext = args["plaintext"]

    # get steganography encoding context (fall back to the Washington passage when none is given;
    # the original condition was inverted and assigned the empty string)
    if args["context"] != "":
        context = args["context"]
    else:
        context = "Washington received his initial military training and command with the Virginia Regiment during the French and Indian War. He was later elected to the Virginia House of Burgesses and was named a delegate to the Continental Congress, where he was appointed Commanding General of the nation's Continental Army. Washington led American forces, allied with France, in the defeat of the British at Yorktown. Once victory for the United States was in hand in 1783, Washington resigned his commission."

    # start steganography pipeline
    print("Loading large LM to GPU, please wait for a few seconds...")
    enc, model, device = get_model(model_name=lm_model, device_id=device)

    # Encryption: encrypt secret plaintext to message bits
    print(f"Plaintext: {plaintext}")
    print(f"Encryption method: {encryption_method}")
    if encryption_method == "utf8":
        ba = bitarray.bitarray()
        ba.frombytes(plaintext.encode('utf-8'))
        message = ba.tolist()
    elif encryption_method == "arithmetic":
        message_ctx = [enc.encoder['<|endoftext|>']]
        plaintext += '<eos>'
        message = decode_arithmetic(model,
                                    enc,
                                    plaintext,
                                    message_ctx,
                                    device=device,
                                    precision=40,
                                    topk=60000)
    print(f"Encrypted message bits: {message}")

    # Steganography Encoding: encode message bits to covertext
    print(f"Steganography encoding method: {steganography_method}")
    context_tokens = encode_context(context, enc)
    if steganography_method == 'bins':
        bin2words, words2bin = get_bins(len(enc.encoder), block_size)
        out, nll, kl, words_per_bit = encode_block(model,
                                                   enc,
                                                   message,
                                                   context_tokens,
                                                   block_size,
                                                   bin2words,
                                                   words2bin,
                                                   device=device)
    elif steganography_method == 'huffman':
        out, nll, kl, words_per_bit = encode_huffman(model,
                                                     enc,
                                                     message,
                                                     context_tokens,
                                                     block_size,
                                                     device=device)
    elif steganography_method == 'arithmetic':
        out, nll, kl, words_per_bit, Hq, kl_list = encode_arithmetic(
            model,
            enc,
            message,
            context_tokens,
            device=device,
            temp=temp,
            precision=precision,
            topk=topk)
    elif steganography_method == 'saac':
        out, nll, kl, words_per_bit, Hq, topk_list, case_studies = encode_saac(
            model,
            enc,
            message,
            context_tokens,
            device=device,
            temp=temp,
            precision=precision,
            topk=topk,
            nucleus=nucleus)
    covertext = enc.decode(out)
    print(f"Encoded covertext: {covertext}")
    print(f"kl: {kl}, bits/words: {1.0/words_per_bit}")

    # Steganography Decoding: decode covertext to message bits
    if steganography_method == 'bins':
        message_rec = decode_block(model, enc, covertext, context_tokens,
                                   block_size, bin2words, words2bin)
    elif steganography_method == 'huffman':
        message_rec = decode_huffman(model, enc, covertext, context_tokens,
                                     block_size)
    elif steganography_method == 'arithmetic':
        message_rec = decode_arithmetic(model,
                                        enc,
                                        covertext,
                                        context_tokens,
                                        device=device,
                                        temp=temp,
                                        precision=precision,
                                        topk=topk)
    elif steganography_method == 'saac':
        message_rec = decode_saac(model,
                                  enc,
                                  covertext,
                                  context_tokens,
                                  device=device,
                                  temp=temp,
                                  precision=precision,
                                  topk=topk,
                                  nucleus=nucleus)
    print(f"Decoded message bits: {message_rec}")

    # Decryption: map message bits back to original text
    if encryption_method == "utf8":
        message_rec = [bool(item) for item in message_rec]
        ba = bitarray.bitarray(message_rec)
        reconst = ba.tobytes().decode('utf-8', 'ignore')
    elif encryption_method == "arithmetic":
        reconst = encode_arithmetic(model,
                                    enc,
                                    message_rec,
                                    message_ctx,
                                    device=device,
                                    precision=40,
                                    topk=60000)
        reconst = enc.decode(reconst[0])
    print("Recovered plaintext:", reconst)