def __init__(
        self,
        # 16 or 32 byte key
        key,
        nonce=None,
        mode=AES_MODE_EAX,
        text_encoding='utf-8'
):
    """Set up the key, cipher mode, nonce and text encoding for this object.

    :param key: key bytes (the original author notes 16 or 32 bytes).
    :param nonce: nonce/IV bytes. When None, a random printable one of
        AES_Encrypt.SIZE_NONCE bytes is generated.
    :param mode: AES_Encrypt.AES_MODE_EAX or AES_Encrypt.AES_MODE_CBC.
    :param text_encoding: stored on the instance as ``self.text_encoding``.
    :raises Exception: if ``mode`` is not one of the two supported modes.
    """
    self.key = key
    # Never write the key material itself to the log, only its size.
    Log.debug('Using key of size = ' + str(len(self.key)) + '.')

    self.cipher_mode_str = mode
    if self.cipher_mode_str == AES_Encrypt.AES_MODE_EAX:
        self.cipher_mode = AES.MODE_EAX
    elif self.cipher_mode_str == AES_Encrypt.AES_MODE_CBC:
        self.cipher_mode = AES.MODE_CBC
    else:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unsupported AES mode "' + str(self.cipher_mode_str) + '"'
        )

    if nonce is None:
        # Must be 16 bytes (per the original author's note)
        nonce = AES_Encrypt.generate_random_bytes(
            size=AES_Encrypt.SIZE_NONCE,
            printable=True
        )
    self.nonce = nonce

    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Using nonce "' + str(self.nonce) + '". Size = ' + str(len(self.nonce))
    )

    self.text_encoding = text_encoding
def convert_ascii_string_to_other_alphabet(
        ascii_char_string,
        # Default to CJK Unicode Block
        unicode_range=BLOCK_CHINESE,
        # If the characters come from a hexdigest from a hash, we can compress 4 times,
        # otherwise for a random ascii string, we can only compress 2 characters to 1 chinese.
        group_n_char=2
):
    """Map groups of input characters onto characters of a Unicode block.

    The input is first right-padded with '0' to a length that is a multiple
    of 4. It is then cut into groups of ``group_n_char`` characters and each
    group is turned into one output character:

    * ``group_n_char == 2``: the code points of the group are combined
      base-256 (first character most significant).
    * ``group_n_char == 4``: the group is parsed as a hexadecimal number.

    The resulting number is reduced modulo the size of ``unicode_range`` and
    offset by the start of the range. Any other ``group_n_char`` matches
    neither branch, so nothing is appended and '' is returned.

    :param ascii_char_string: string to convert.
    :param unicode_range: indexable pair (first code point, last code point).
    :param group_n_char: number of input characters per output character.
    :return: the converted string.
    """
    range_start = unicode_range[0]
    range_size = unicode_range[1] - range_start + 1

    remainder = len(ascii_char_string) % 4
    if remainder != 0:
        # Pad with 0's up to the next multiple of 4
        ascii_char_string = ascii_char_string + '0' * (4 - remainder)

    converted = []
    n_groups = int(len(ascii_char_string) / group_n_char)
    for group_no in range(n_groups):
        idx_start = group_n_char * group_no
        idx_end = idx_start + group_n_char
        group = ascii_char_string[idx_start:idx_end]

        # Convert to Chinese, Korean, etc
        if group_n_char == 2:
            ord_arr = np.array([ord(c) for c in group])
            # Positional weights, most significant character first
            weights = np.array(
                [2 ** (8 * (pos - 1)) for pos in range(len(ord_arr), 0, -1)]
            )
            val = np.sum(ord_arr * weights)
            Log.debug(
                'Index start=' + str(idx_start) + ', end=' + str(idx_end)
                + ', s=' + str(group) + ', ordinal=' + str(ord_arr)
                + ', val=' + str(hex(val))
            )
            target_codepoint = (val % range_size) + range_start
            converted.append(chr(target_codepoint))
        elif group_n_char == 4:
            Log.debug(
                'Index start=' + str(idx_start) + ', end=' + str(idx_end)
                + ', s=' + str(group)
            )
            n = int('0x' + str(group), 16)
            target_codepoint = (n % range_size) + range_start
            converted.append(chr(target_codepoint))
            Log.debugdebug(
                'From ' + str(idx_start) + ': ' + str(group)
                + ', n=' + str(n) + ', char=' + str(chr(target_codepoint))
            )

    return ''.join(converted)
def run_unit_test(self):
    """Check hash-to-alphabet conversion against fixed expected strings.

    Two groups of cases are run:

    1. A fixed sentence is hashed with each supported algorithm and the hex
       digest is converted with ``group_n_char=4``.
    2. Plain strings are converted with the default arguments.

    :return: the ``ut.ResultObj`` accumulating pass/fail counts.
    """
    res_final = ut.ResultObj(count_ok=0, count_fail=0)

    def record(source_text, observed, expected):
        # Compare one observed conversion with its expectation and tally it
        res_final.update_bool(
            res_bool=ut.UnitTest.assert_true(
                observed=observed,
                expected=expected,
                test_comment='test string "' + str(source_text)
                             + '" got "' + str(observed) + '"'
            )
        )

    s = '니는 먹고 싶어'
    hash_cases = (
        (Hash.ALGO_SHA1, '蔮膫圈嫩慁覕邜蹋妡狿'),
        (Hash.ALGO_SHA256, '葶杊閹翔綐僤徼戻髯鼚胦嘭藃诠灑浽'),
        (Hash.ALGO_SHA512, '詐鏙仟墍例嵝烐檦蝡溲薑珇鸦東燢爻纷欜陲囚劚攠菜槑茹輀濯偑袁蓣质簨'),
        (Hash.ALGO_SHA3_256, '厥驹踸鸨揱澯鑢擠鳰僸覑儽悃徵絨控'),
        (Hash.ALGO_SHA3_512, '醜怅僒础衺菼惓隔鮚腋釔晞鏙屜咖龩檵因伖蘦惌灱騾凊纅弪鮾蕏解铦欪臓'),
    )
    for algo, expected in hash_cases:
        # In Linux command line, echo -n "$s" | shasum -a 1 (or 256,512)
        Log.debug('Using algo "' + str(algo) + '":')
        hstr = Hash.hash(string=s, algo=algo)
        Log.debug('Hash: ' + str(hstr))
        observed = Hash.convert_ascii_string_to_other_alphabet(
            ascii_char_string=hstr,
            # unicode_range = Hash.BLOCK_KOREAN_SYL,
            group_n_char=4
        )
        record(hstr, observed, expected)

    plain_cases = (
        ('abc/ii{}.!&%[][\\+=', '嵢弯敩睽簡琥坝坜礽縰'),
        ('8829amsf)(*&^%^*./', '蘸耹嵭潦眨砦娥娪簯縰'),
    )
    for ascii_string, expected in plain_cases:
        observed = Hash.convert_ascii_string_to_other_alphabet(
            ascii_char_string=ascii_string
        )
        record(ascii_string, observed, expected)

    return res_final
def xor_string(
        self,
        s1,
        s2
):
    """XOR two strings byte by byte after cyclically padding the shorter one.

    The shorter string is extended by repeating its own characters until
    both strings have the same number of characters. Both are then encoded
    with ``Obfuscate.STRING_ENCODING`` and handed to ``self.xor_bytes``.

    NOTE(review): padding is done on characters, not on encoded bytes. If
    the encoding is multi-byte and the inputs contain non-ASCII characters,
    the encoded lengths may still differ - confirm whether callers only
    pass ASCII.

    :param s1: first string.
    :param s2: second string.
    :return: whatever ``self.xor_bytes`` returns for the two encoded strings.
    :raises IndexError: if exactly one of the strings is empty (there is
        nothing to repeat). The exception type matches what the previous
        implementation raised in this situation.
    """
    Log.debug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': XOR between "' + str(s1) + '" and "' + str(s2) + '".'
    )

    len_max = max(len(s1), len(s2))
    if len_max > 0 and min(len(s1), len(s2)) == 0:
        raise IndexError(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Cannot XOR, one of the strings is empty and cannot be repeated.'
        )

    def repeat_to_length(s):
        # Cyclic repetition of s cut to len_max characters, built in one go
        # instead of appending one character at a time.
        if len(s) == len_max:
            return s
        return (s * (len_max // len(s) + 1))[:len_max]

    # Append to the shorter one, in a repeat manner
    s1 = repeat_to_length(s1)
    s2 = repeat_to_length(s2)

    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': After appending, XOR between "' + str(s1) + '" and "' + str(s2) + '".'
    )
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': s1 "' + str(s1) + '", s2 "' + str(s2) + '"'
    )

    b1 = bytes(s1, encoding=Obfuscate.STRING_ENCODING)
    b2 = bytes(s2, encoding=Obfuscate.STRING_ENCODING)

    return self.xor_bytes(b1=b1, b2=b2)
def xor_bytes(
        self,
        b1,
        b2
):
    """XOR two sequences of byte values position by position.

    Elements are paired with ``zip``, so the result is as long as the
    shorter of the two inputs.

    :param b1: first sequence of integer byte values.
    :param b2: second sequence of integer byte values.
    :return: list with the XOR of each pair.
    """
    xored = []
    for left, right in zip(b1, b2):
        value = left ^ right
        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + 'XOR "' + hex(left) + '" and "' + hex(right)
            + '" = ' + hex(value)
        )
        xored.append(value)

    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': XOR between "' + str(self.hexdigest(b1))
        + '" and "' + str(self.hexdigest(b2))
        + '" = "' + str(self.hexdigest(xored)) + '"'
    )
    return xored
def run_unit_test(self):
    """Round-trip a set of sentences through AES encode/decode.

    For both the CBC and EAX modes, each sentence is encoded to bytes,
    encrypted, decrypted again, and the decrypted text is compared with
    the original sentence. A fixed key (doubled to 32 bytes) and a fixed
    nonce are used.

    :return: the ``ut.ResultObj`` accumulating pass/fail counts.
    """
    res_final = ut.ResultObj(count_ok=0, count_fail=0)

    # One long random sentence on top of the fixed ones
    long_str = ''.join(
        random.choice(AES_Encrypt.CHARS_STR) for _ in range(10000)
    )

    sentences = [
        '니는 먹고 싶어',
        'Дворянское ГНЕЗДО',
        '没问题 大陆 经济',
        '存款方式***2019-12-11 11:38:46***',
        '1234567890123456',
        long_str,
    ]

    key = b'Sixteen byte key'
    nonce = b'0123456789xxyyzz'

    for mode in (AES_Encrypt.AES_MODE_CBC, AES_Encrypt.AES_MODE_EAX):
        # aes_obj = AES_Encrypt(key=AES_Encrypt.generate_random_bytes(size=32, printable=True))
        aes_obj = AES_Encrypt(key=key + key, mode=mode, nonce=nonce)
        for s in sentences:
            Log.debug('Encrypting "' + str(s) + '"')
            data_bytes = s.encode(encoding=STR_ENCODING)
            Log.debug('Data length in bytes = ' + str(len(data_bytes)))

            ciphertext = aes_obj.encode(data=data_bytes).ciphertext_b64
            Log.debug('Encrypted as "' + str(ciphertext) + '"')

            plaintext = aes_obj.decode(ciphertext=ciphertext)
            Log.debug('Decrypted as "' + plaintext + '"')

            comment = (
                'mode "' + str(mode) + '" s=' + str(s)
                + '" encrypted to "' + str(ciphertext)
                + '", decrypted back to "' + str(plaintext)
            )
            outcome = ut.UnitTest.assert_true(
                observed=plaintext,
                expected=s,
                test_comment=comment
            )
            res_final.update_bool(res_bool=outcome)

    return res_final
def hide_data(
        self,
        # In string JSON
        records_json,
        # Column names to hide
        hide_colname,
        encrypt_key_b64,
        nonce_b64=None,
        is_number_only=False,
        case_sensitive=False,
        # We support processing only China for now
        process_phone_country=None,
        hash_encode_lang='zh',
):
    """Derive cleaned, masked, hashed and encrypted columns for one column.

    The records are loaded into a data frame and the following columns are
    added, all named after ``hide_colname``:

    * ``<col>_clean``: trimmed value, lower-cased unless ``case_sensitive``,
      reduced to digits when ``is_number_only``, and passed through the
      China phone filter when ``process_phone_country == 'china'``.
    * ``<col>_last4char``: '***' followed by the last 4 characters of the
      clean value (only the last character when it is shorter than 8).
    * ``<col>_sha256``: SHA-256 hash of the clean value.
    * ``<col>_sha256_readable``: the hash converted to Chinese characters,
      or Korean syllables when ``hash_encode_lang == 'ko'``.
    * ``<col>_encrypt``: dict with 'ciphermode', 'ciphertext_b64',
      'tag_b64' and 'iv_b64' from AES (CBC mode) encryption of the clean
      value, or None if encrypting that value failed.

    :param records_json: records as a JSON string, or an already parsed
        object accepted by ``pd.DataFrame``.
    :param hide_colname: name of the column to hide.
    :param encrypt_key_b64: base64 encoded encryption key (str).
    :param nonce_b64: base64 encoded nonce (str), or None to let the
        encryptor choose one.
    :param is_number_only: strip every non-digit character when cleaning.
    :param case_sensitive: keep the original letter case when cleaning.
    :param process_phone_country: only 'china' is processed; any other
        non-None value is logged as unsupported and otherwise ignored.
    :param hash_encode_lang: 'ko' for Korean syllables, anything else for
        the Chinese block.
    :return: JSON string of the resulting records (``orient='records'``).
        If ``records_json`` is a string that cannot be parsed, the error
        message string is returned instead (kept for backward
        compatibility with existing callers).
    :raises Exception: if ``encrypt_key_b64`` cannot be base64-decoded.
    """
    step = 0

    if isinstance(records_json, str):
        try:
            records_json = json.loads(records_json)
        except Exception as ex_json:
            errmsg = (
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Exception loading json: ' + str(records_json)
                + '. Got exception: ' + str(ex_json)
            )
            Log.error(errmsg)
            # Existing contract: report the failure through the return value
            return errmsg

    colname_clean = str(hide_colname) + '_clean'
    colname_last4char = str(hide_colname) + '_last4char'
    colname_hash = str(hide_colname) + '_sha256'
    colname_hash_readable = str(hide_colname) + '_sha256_readable'
    colname_encrypt = str(hide_colname) + '_encrypt'

    df = pd.DataFrame(records_json)
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Converted json object (first 20 records): '
        + str(records_json[0:min(20, len(records_json))])
        + ' to data frame: ' + str(df)
    )
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Start processing records, hide column "' + str(hide_colname)
        + '". Records of sample rows'
        + str(records_json[0:min(10, len(records_json))])
    )

    #
    # Step 1
    #  - Clean phone numbers, bank accounts
    #  - Extract last 4 digits of phone/bank-account numbers to separate columns
    #  - Obfuscate the phone numbers, bank accounts for storage in cube
    #
    step += 1
    start_filter_time = Profiling.start()

    def filter_col(x, is_number_only=False, case_sensitive=False):
        try:
            # We always trim no matter what
            x = StringUtils.trim(str(x))
            if not case_sensitive:
                x = x.lower()
            if is_number_only:
                x = re.sub(pattern='[^0-9]', repl='', string=x)
            return x
        except Exception as ex_clean:
            Log.error(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Error cleaning "' + str(x) + '". ' + str(ex_clean)
            )
            # Best effort: keep whatever value we have at this point
            return x

    df[colname_clean] = df[hide_colname].apply(
        filter_col, args=(is_number_only, case_sensitive)
    )
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Step ' + str(step) + ': BASIC CLEANING Took '
        + str(Profiling.get_time_dif_secs(
            start=start_filter_time, stop=Profiling.stop(), decimals=2))
        + ' secs. Successfully cleaned column "' + str(hide_colname)
        + '", case sensitive "' + str(case_sensitive)
        + '", is number "' + str(is_number_only)
        + '", sample rows: ' + str(df[0:2])
    )

    #
    # Process Phone Number by Country
    #
    step += 1
    start_phone_time = Profiling.start()

    def process_phone_china(x):
        try:
            return PhoneNumber.filter_phone_china(x)
        except Exception as ex:
            Log.error(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Exception processing phone "' + str(x) + '". Exception ' + str(ex)
            )
            return x

    if process_phone_country == 'china':
        df[colname_clean] = df[colname_clean].apply(process_phone_china)
    elif process_phone_country is not None:
        # Report once instead of silently skipping an unsupported country
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unsupported country "' + str(process_phone_country) + '"'
        )
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Step ' + str(step) + ': PHONE CLEANING Took '
        + str(Profiling.get_time_dif_secs(
            start=start_phone_time, stop=Profiling.stop(), decimals=2))
        + ' secs. Successfully processed phone for column "' + str(hide_colname)
        + '", sample rows: ' + str(df[0:2])
    )

    #
    # Extract last 4 characters
    #
    step += 1
    start_last4_time = Profiling.start()

    def last4char(x):
        text = str(x)
        # Short values reveal only their last character, so that the mask
        # never exposes most of the value
        n_visible = 4 if len(text) >= 8 else 1
        return '***' + text[-n_visible:]

    df[colname_last4char] = df[colname_clean].apply(last4char)
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Step ' + str(step) + ': EXTRACT LAST 4 CHAR Took '
        + str(Profiling.get_time_dif_secs(
            start=start_last4_time, stop=Profiling.stop(), decimals=2))
        + ' secs. Successfully extracted last 4 chars from column "'
        + str(hide_colname) + '"'
    )

    #
    # Hash the column
    #
    step += 1
    start_hash_time = Profiling.start()

    def sha256_hash(x):
        return Hash.hash(string=x, algo=Hash.ALGO_SHA256)

    df[colname_hash] = df[colname_clean].apply(sha256_hash)
    stop_hash_time = Profiling.stop()
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Step ' + str(step) + ': HASH Took '
        + str(Profiling.get_time_dif_secs(
            start=start_hash_time, stop=stop_hash_time, decimals=2))
        + ' secs. Successfully obfuscated column "' + str(hide_colname)
        + '", sample rows: ' + str(df[0:2])
    )

    #
    # Obfuscate Hash hexdigest to Chinese/etc characters
    #
    step += 1
    start_obflang_time = Profiling.start()

    def obfuscate_hash_to_lang(x, lang):
        unicode_range = Hash.BLOCK_CHINESE
        if lang == 'ko':
            unicode_range = Hash.BLOCK_KOREAN_SYL
        return Hash.convert_ascii_string_to_other_alphabet(
            ascii_char_string=x,
            unicode_range=unicode_range,
            group_n_char=4
        )

    df[colname_hash_readable] = df[colname_hash].apply(
        obfuscate_hash_to_lang, args=(hash_encode_lang,)
    )
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Step ' + str(step) + ': HASH TO CHAR Took '
        + str(Profiling.get_time_dif_secs(
            start=start_obflang_time, stop=Profiling.stop(), decimals=2))
        + ' secs. Successfully converted obfuscation to language for column "'
        + str(hide_colname) + '"'
    )

    #
    # Encryption
    #
    step += 1
    start_enc_time = Profiling.start()
    try:
        key_bytes = b64decode(encrypt_key_b64.encode('utf-8'))
    except Exception as ex_key_conversion:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error converting base64 key "' + str(encrypt_key_b64)
            + '" to bytes. Exception: ' + str(ex_key_conversion)
        ) from ex_key_conversion

    # A missing nonce is the normal default, not a conversion error
    nonce_bytes = None
    if nonce_b64 is not None:
        try:
            nonce_bytes = b64decode(nonce_b64.encode(encoding='utf-8'))
        except Exception as ex_nonce:
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Error converting base64 nonce "' + str(nonce_b64)
                + '" to bytes. Exception: ' + str(ex_nonce)
            )
            nonce_bytes = None

    # Only the key length is logged, never the key bytes themselves
    Log.important(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Step ' + str(step) + ': ENCRYPTION Key length = '
        + str(len(key_bytes)) + ' bytes'
    )
    encryptor = AES_Encrypt(
        key=key_bytes,
        mode=AES_Encrypt.AES_MODE_CBC,
        nonce=nonce_bytes
    )

    def encrypt(x, encryptor):
        try:
            res = encryptor.encode(x.encode(encoding='utf-8'))
            return {
                'ciphermode': res.cipher_mode,
                'ciphertext_b64': res.ciphertext_b64,
                'tag_b64': res.tag_b64,
                'iv_b64': res.nonce_b64
            }
        except Exception as ex:
            Log.error(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Error encrypting "' + str(x) + '": ' + str(ex)
            )
            # Best effort: a value that fails to encrypt yields None
            return None

    df[colname_encrypt] = df[colname_clean].apply(encrypt, args=(encryptor,))

    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Step ' + str(step) + ': ENCRYPTION Took '
        + str(Profiling.get_time_dif_secs(
            start=start_enc_time, stop=Profiling.stop(), decimals=2))
        + ' secs. Successfully encrypted column "' + str(hide_colname)
        + '", for records (first 20 rows): '
        + str(df.values[0:min(20, df.shape[0])])
    )

    df_json_str = df.to_json(
        # Make sure not ASCII
        force_ascii=False,
        orient='records',
    )
    return df_json_str