Example #1
0

from hide.utils.CmdLine import CmdLine
#
# Decide whether to run multi-threaded in gunicorn or not
#
# Read the command line parameters, passing {'gunicorn': '0'} as the defaults.
# Use the imported CmdLine name consistently (the original mixed `cl.CmdLine`
# and `CmdLine` for the same call).
pv = CmdLine.get_cmdline_params(pv_default={'gunicorn': '0'})
cmdline_params = CmdLine.get_cmdline_params(pv_default=pv)
print('Command line params: ' + str(cmdline_params))
cwd = os.getcwd()

# Logs go to stdout only when "debug" is given with an affirmative value.
# Every other case (flag absent, or present with any other value such as '0')
# falls through to file logging, so a logging target is always configured.
debug_to_stdout = ('debug' in pv.keys()) and (pv['debug'] in ['1', 'y', 'yes'])

if debug_to_stdout:
    Log.DEBUG_PRINT_ALL_TO_SCREEN = True
    print('Logs will be directed to stdout')
else:
    print('Current working directory "' + str(cwd) + '"')
    # Truncate the path at the first "hide" folder and normalise it to end
    # with "/hide/". os.getcwd() does not append a trailing separator, so the
    # pattern must also match when the working directory IS the "hide" folder
    # (otherwise the log path would become ".../hidelogs/hide.log").
    cwd = re.sub(pattern='[/\\\\]hide(?:[/\\\\].*)?$', repl='/hide/', string=cwd)
    Log.LOGFILE = cwd + 'logs/hide.log'
    print('Logs will be directed to log file (with date) "' +
          str(Log.LOGFILE) + '"')

rest_api = HideApi()
if pv['gunicorn'] == '1':
    # Port and Host specified on command line already for gunicorn,
    # so the API is not started from here.
    Log.important('Starting Hide API with gunicorn from folder "' + str(cwd) + '"')
else:
    Log.important('Starting Hide API without gunicorn from folder "' +
                  str(cwd) + '"')
    rest_api.run_hide_api()
Example #2
0
File: Hide.py Project: mapktah/hide
    def hide_data(
            self,
            # In string JSON
            records_json,
            # Column names to hide
            hide_colname,
            encrypt_key_b64,
            nonce_b64        = None,
            is_number_only   = False,
            case_sensitive   = False,
            # We support processing only China for now
            process_phone_country = None,
            hash_encode_lang = 'zh',
    ):
        """
        Clean one column of a set of records and add masked, hashed and
        encrypted versions of it as extra columns.

        The records are loaded into a pandas DataFrame and these columns are
        added, each named after hide_colname plus a suffix:
          - "_clean":           trimmed value, lower-cased unless case_sensitive,
                                stripped to digits if is_number_only, and passed
                                through PhoneNumber.filter_phone_china when
                                process_phone_country is 'china'
          - "_last4char":       '***' followed by the last 4 characters of the
                                clean value (only the last character when the
                                clean value is shorter than 8 characters)
          - "_sha256":          Hash.hash() of the clean value with ALGO_SHA256
          - "_sha256_readable": the hash converted to another alphabet
                                (Hash.BLOCK_KOREAN_SYL when hash_encode_lang is
                                'ko', Hash.BLOCK_CHINESE otherwise)
          - "_encrypt":         dict with keys 'ciphermode', 'ciphertext_b64',
                                'tag_b64', 'iv_b64' produced by AES_Encrypt in
                                CBC mode, or None if encrypting the value failed

        :param records_json:    JSON string, or an already parsed object that
                                pd.DataFrame() accepts
        :param hide_colname:    name of the column to hide
        :param encrypt_key_b64: encryption key as a base64 string
        :param nonce_b64:       optional nonce as a base64 string; when it is
                                None or cannot be decoded, None is passed to
                                AES_Encrypt as nonce
        :param is_number_only:  keep only the characters 0-9 when cleaning
        :param case_sensitive:  do not lower-case the value when cleaning
        :param process_phone_country: only 'china' triggers phone processing
        :param hash_encode_lang: 'ko' for Korean syllables, else Chinese block
        :return: the DataFrame as a JSON string (orient 'records', non-ASCII
                 kept as is). If records_json is a string that cannot be parsed
                 as JSON, the error message string is returned instead.
        :raises Exception: if encrypt_key_b64 cannot be base64-decoded
        """
        # NOTE(review): several log statements below write raw/clean record
        # values to the log. That defeats the purpose of hiding the column if
        # logs are not protected - confirm this is acceptable.
        step = 0

        if isinstance(records_json, str):
            try:
                records_json = json.loads(
                    records_json
                )
            except Exception as ex_json:
                errmsg = \
                    str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                    + ': Exception loading json: ' + str(records_json)\
                    + '. Got exception: ' + str(ex_json)
                Log.error(errmsg)
                # Callers receive the error text instead of a JSON result
                return errmsg

        colname_clean         = str(hide_colname) + '_clean'
        colname_last4char     = str(hide_colname) + '_last4char'
        colname_hash          = str(hide_colname) + '_sha256'
        colname_hash_readable = str(hide_colname) + '_sha256_readable'
        colname_encrypt       = str(hide_colname) + '_encrypt'

        df = pd.DataFrame(records_json)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converted json object (first 20 records): '
            + str(records_json[0:min(20,len(records_json))])
            + ' to data frame: ' + str(df)
        )

        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Start processing records, hide column "' + str(hide_colname)
            + '". Records of sample rows' +  str(records_json[0:min(10,len(records_json))])
        )

        #
        # Step 1: Basic cleaning
        #  - Trim, optionally lower-case, optionally keep digits only
        #
        step += 1
        start_filter_time = Profiling.start()
        def filter_col(
                x,
                is_number_only = False,
                case_sensitive = False
        ):
            try:
                # We always trim no matter what
                x = StringUtils.trim(str(x))
                if not case_sensitive:
                    x = x.lower()
                if is_number_only:
                    x = re.sub(pattern='[^0-9]', repl='', string=x)
                return x
            except Exception as ex_clean:
                Log.error(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Error cleaning "' + str(x) + '". ' + str(ex_clean)
                )
                # Best effort: keep whatever value we had when cleaning failed
                return x
        df[colname_clean] = df[hide_colname].apply(filter_col, args=(is_number_only, case_sensitive))
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Step ' + str(step) + ': BASIC CLEANING Took '
            + str(Profiling.get_time_dif_secs(start=start_filter_time, stop=Profiling.stop(), decimals=2))
            + ' secs. Successfully cleaned column "' + str(hide_colname)+
            '", case sensitive "' + str(case_sensitive)
            + '", is number "' + str(is_number_only)
            + '", sample rows: ' + str(df[0:2])
        )

        #
        # Process Phone Number by Country
        #
        # Every stage advances the step counter by exactly 1 so that the step
        # numbers in the logs are consecutive.
        step += 1
        start_phone_time = Profiling.start()
        def process_phone(
                x,
                country
        ):
            try:
                if country == 'china':
                    return PhoneNumber.filter_phone_china(x)
                else:
                    Log.error(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Unsupported country "' + str(country) + '"'
                    )
                    return x
            except Exception as ex:
                Log.error(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Exception processing phone "' + str(x) + '". Exception ' + str(ex)
                )
                return x

        if process_phone_country == 'china':
            df[colname_clean] = df[colname_clean].apply(process_phone, args=[process_phone_country])
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Step ' + str(step) + ': PHONE CLEANING Took '
                + str(Profiling.get_time_dif_secs(start=start_phone_time, stop=Profiling.stop(), decimals=2))
                + ' secs. Successfully processed phone for column "' + str(hide_colname)
                + '", sample rows: ' + str(df[0:2])
            )

        #
        # Extract last 4 characters
        #
        step += 1
        start_last4_time = Profiling.start()
        def last4char(
                x
        ):
            s = str(x)
            len_s = len(s)
            # Values shorter than 8 characters expose only their last character,
            # otherwise 4 characters would reveal too much of a short value
            start = len_s - 4 if len_s >= 8 else len_s - 1
            return '***' + s[start:len_s]
        df[colname_last4char] = df[colname_clean].apply(last4char)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Step ' + str(step) + ': EXTRACT LAST 4 CHAR Took '
            + str(Profiling.get_time_dif_secs(start=start_last4_time, stop=Profiling.stop(), decimals=2))
            + ' secs. Successfully extracted last 4 chars from column "' + str(hide_colname)
            + '"'
        )

        #
        # Hash the column
        #
        step += 1
        start_hash_time = Profiling.start()
        # Named so that it does not shadow the builtin hash()
        def hash_value(
                x
        ):
            return Hash.hash(
                string = x,
                algo   = Hash.ALGO_SHA256
            )

        df[colname_hash] = df[colname_clean].apply(hash_value)
        stop_hash_time = Profiling.stop()
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Step ' + str(step) + ': HASH Took '
            + str(Profiling.get_time_dif_secs(start=start_hash_time, stop=stop_hash_time, decimals=2))
            + ' secs. Successfully obfuscated column "' + str(hide_colname)
            + '", sample rows: ' + str(df[0:2])
        )

        #
        # Obfuscate Hash hexdigest to Chinese/etc characters
        #
        step += 1
        start_obflang_time = Profiling.start()
        def obfuscate_hash_to_lang(
                x,
                lang
        ):
            # Chinese block unless Korean is requested explicitly
            unicode_range = Hash.BLOCK_CHINESE
            if lang == 'ko':
                unicode_range = Hash.BLOCK_KOREAN_SYL
            return Hash.convert_ascii_string_to_other_alphabet(
                ascii_char_string = x,
                unicode_range     = unicode_range,
                group_n_char      = 4
            )

        df[colname_hash_readable] = df[colname_hash].apply(obfuscate_hash_to_lang, args=[hash_encode_lang])
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Step ' + str(step) + ': HASH TO CHAR Took '
            + str(Profiling.get_time_dif_secs(start=start_obflang_time, stop=Profiling.stop(), decimals=2))
            + ' secs. Successfully converted obfuscation to language for column "' + str(hide_colname)
            + '"'
        )

        #
        # Encryption
        #
        step += 1
        start_enc_time = Profiling.start()
        try:
            key_bytes = b64decode(encrypt_key_b64.encode('utf-8'))
        except Exception as ex_key_conversion:
            # The key itself is deliberately kept out of the message: exception
            # texts usually end up in logs and API responses
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Error converting base64 key to bytes. Exception: '
                + str(ex_key_conversion)
            ) from ex_key_conversion

        # No nonce given is a normal case (it is the default), not a warning
        nonce_bytes = None
        if nonce_b64 is not None:
            try:
                nonce_bytes = b64decode(nonce_b64.encode(encoding='utf-8'))
            except Exception as ex_nonce:
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Error converting base64 nonce "' + str(nonce_b64)
                    + '" to bytes. Exception: ' + str(ex_nonce)
                )
                nonce_bytes = None

        # Only the key length is logged, never the key material
        Log.important(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Step ' + str(step) + ': ENCRYPTION key decoded, len = '
            + str(len(key_bytes))
        )
        encryptor = AES_Encrypt(
            key   = key_bytes,
            mode  = AES_Encrypt.AES_MODE_CBC,
            nonce = nonce_bytes
        )
        def encrypt(
                x,
                encryptor
        ):
            try:
                x_bytes = bytes(x.encode(encoding='utf-8'))
                res = encryptor.encode(x_bytes)
                return {
                    'ciphermode': res.cipher_mode,
                    'ciphertext_b64': res.ciphertext_b64,
                    'tag_b64': res.tag_b64,
                    # The nonce returned by the encryptor is stored as the IV
                    'iv_b64': res.nonce_b64
                }
            except Exception as ex:
                Log.error(
                    str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Error encrypting "' + str(x) + '": ' + str(ex)
                )
                # A failed row gets no ciphertext instead of failing all rows
                return None

        df[colname_encrypt] = df[colname_clean].apply(encrypt, args=[encryptor])
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Step ' + str(step) + ': ENCRYPTION Took '
            + str(Profiling.get_time_dif_secs(start=start_enc_time, stop=Profiling.stop(), decimals=2))
            + ' secs. Successfully encrypted column "' + str(hide_colname)
            + '", for records (first 20 rows): ' + str(df.values[0:min(20,df.shape[0])])
        )

        df_json_str = df.to_json(
            # Make sure not ASCII
            force_ascii = False,
            orient      = 'records',
        )

        return df_json_str