Example #1
0
    def read_data(self, input_data, mode="fit"):
        """
        Turn sparse-format input into instances and attach a schema to them.

        :param input_data: Table holding the raw input rows
        :param mode: "fit" routes through ``self.fit``; any other value routes
            through ``self.transform``
        :return: the converted data with its schema set
        :raises ValueError: if the reported data shape is falsy
        """
        LOGGER.info("start to read sparse data and change data to instance")

        abnormal_detection.empty_table_detection(input_data)

        shape = data_overview.get_data_shape(input_data)
        if not shape:
            raise ValueError(
                "input data's value is empty, it does not contain a label")

        # Only the selected converter attribute is looked up and called.
        converter = self.fit if mode == "fit" else self.transform
        instances = converter(input_data)

        set_schema(instances,
                   make_schema(self.header, self.sid_name, self.label_name))
        return instances
Example #2
0
    def fit(self, data_instances, validate_data=None):
        """
        Fit the local model on the features and labels of ``data_instances``.

        :param data_instances: Table whose values expose ``features`` and ``label``
        :param validate_data: not used by this method
        :return: None; the fitted model is stored in ``self.model_fit``
        """
        if not self.need_run:
            return
        # check if empty table
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        # get model
        model = self.get_model()
        # get header
        self.header = data_overview.get_header(data_instances)

        X_table = data_instances.mapValues(lambda v: v.features)
        y_table = data_instances.mapValues(lambda v: v.label)

        X = np.array([pair[1] for pair in X_table.collect()])
        # Extract the label from each (key, label) pair BEFORE building the
        # array. Building a 2-D array of the pairs and slicing column 1 makes
        # NumPy pick one dtype for keys and labels together, which turns
        # numeric labels into strings whenever the keys are strings.
        y = np.array([pair[1] for pair in y_table.collect()])

        self.model_fit = model.fit(X, y)
Example #3
0
    def read_data(self, input_data, mode="fit"):
        """
        Split dense, delimiter-separated rows into features (and labels) and
        hand them to ``self.fit`` or ``self.transform``.

        :param input_data: Table whose values are delimiter-separated strings
        :param mode: "fit" routes through ``self.fit``; any other value routes
            through ``self.transform`` followed by header alignment
        :return: the resulting data instances
        :raises ValueError: if a label index is configured but the data shape
            is falsy or does not reach that index
        """
        LOGGER.info("start to read dense data and change data to instance")

        abnormal_detection.empty_table_detection(input_data)

        # Capture the current header before generate_header() runs; it is
        # only needed for the alignment step of the non-fit path.
        fit_header = self.header if mode == "transform" else None
        input_data_labels = None

        self.generate_header(input_data, mode=mode)

        if self.label_idx is None:
            def all_fields(value):
                # Without a header there are no feature columns to produce.
                if not self.header:
                    return []
                return value.split(self.delimitor, -1)

            input_data_features = input_data.mapValues(all_fields)
        else:
            data_shape = data_overview.get_data_shape(input_data)
            if not data_shape or self.label_idx >= data_shape:
                raise ValueError(
                    "input data's value is empty, it does not contain a label")

            def fields_without_label(value):
                # A single-column row holds only the label.
                if data_shape == 1:
                    return []
                fields = value.split(self.delimitor, -1)
                return fields[:self.label_idx] + fields[self.label_idx + 1:]

            def label_field(value):
                return value.split(self.delimitor, -1)[self.label_idx]

            input_data_features = input_data.mapValues(fields_without_label)
            input_data_labels = input_data.mapValues(label_field)

        if mode != "fit":
            transformed = self.transform(input_data_features,
                                         input_data_labels)
            return data_overview.header_alignment(transformed, fit_header)

        return self.fit(input_data, input_data_features, input_data_labels)
Example #4
0
    def fit(self, data_instances, validate_data=None):
        """
        Fit the local baseline model and record its summary.

        :param data_instances: Table whose values expose ``features`` and ``label``
        :param validate_data: not used by this method
        :return: None; results are stored on ``self``
        """
        if not self.need_run:
            return
        LOGGER.info("Enter Local Baseline fit")

        # Refuse empty tables and empty feature sets before doing any work.
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

        estimator = self.get_model()
        self.header = data_overview.get_header(data_instances)

        features_table = data_instances.mapValues(lambda inst: inst.features)
        labels_table = data_instances.mapValues(lambda inst: inst.label)

        # collect() yields (key, value) pairs; only the value part is kept.
        X = np.array([pair[1] for pair in features_table.collect()])
        y = np.array([pair[1] for pair in labels_table.collect()])

        self.model_fit = estimator.fit(X, y)
        # More than two classes means a one-vs-rest layout is needed.
        self.need_one_vs_rest = len(self.model_fit.classes_) > 2
        self.set_summary(self.get_model_summary())
Example #5
0
    def read_data(self, table_name, namespace, mode="fit"):
        """
        Load a dense table and convert its delimiter-separated rows to instances.

        :param table_name: name of the table to load
        :param namespace: namespace of the table to load
        :param mode: "fit" routes through ``self.fit``; any other value routes
            through ``self.transform``
        :return: the resulting data instances with ``self.header`` set as schema
        :raises ValueError: if labels are expected and ``self.label_idx`` is not
            an integer, or the data shape is falsy or does not reach that index
        """
        input_data = storage.get_data_table(table_name, namespace)
        LOGGER.info("start to read dense data and change data to instance")

        abnormal_detection.empty_table_detection(input_data)

        input_data_labels = None

        if self.with_label:
            # isinstance replaces the former comparison against the type's
            # name. bool is a subclass of int, so it is excluded explicitly to
            # keep rejecting True/False as before.
            if isinstance(self.label_idx, bool) or not isinstance(
                    self.label_idx, int):
                raise ValueError("label index should be integer")

            data_shape = data_overview.get_data_shape(input_data)
            if not data_shape or self.label_idx >= data_shape:
                raise ValueError(
                    "input data's value is empty, it does not contain a label")

            def fields_without_label(value):
                # A single-column row holds only the label.
                if data_shape == 1:
                    return []
                fields = value.split(self.delimitor, -1)
                return fields[:self.label_idx] + fields[self.label_idx + 1:]

            input_data_features = input_data.mapValues(fields_without_label)
            input_data_labels = input_data.mapValues(
                lambda value: value.split(self.delimitor, -1)[self.label_idx])

        else:
            input_data_features = input_data.mapValues(
                lambda value: value.split(self.delimitor, -1) if value else [])

        if mode == "fit":
            data_instance = self.fit(input_data_features, input_data_labels,
                                     table_name, namespace)
        else:
            data_instance = self.transform(input_data_features,
                                           input_data_labels)

        set_schema(data_instance, self.header)

        return data_instance
Example #6
0
 def _abnormal_detection(self, data_instances):
     """
     Run the empty-table and empty-feature checks on data_instances
     """
     checks = (
         abnormal_detection.empty_table_detection,
         abnormal_detection.empty_feature_detection,
     )
     for check in checks:
         check(data_instances)
    def fit(self, data_inst):
        """
        Run the host side of information retrieval.

        Takes the raw path when raw retrieval is configured or the security
        level is 0; otherwise executes the numbered secure steps below.

        :param data_inst: Table, only the key and the value (Instance.label) are used
        :return: the same ``data_inst`` that was passed in (on both paths)
        """
        LOGGER.info("data count = {}".format(data_inst.count()))
        # 0. Raw retrieval
        if self.model_param.raw_retrieval or self.security_level == 0:
            LOGGER.info("enter raw information retrieval host")
            abnormal_detection.empty_table_detection(data_inst)
            self._raw_information_retrieval(data_inst)
            self._display_result(block_num='N/A')
            return data_inst

        # 1. Data pre-processing
        LOGGER.info("enter secure information retrieval host")
        abnormal_detection.empty_table_detection(data_inst)
        self._parse_security_level(data_inst)
        # NOTE(review): execution continues after _failure_response() unless
        # that helper raises - confirm it aborts the run.
        if not self._check_oblivious_transfer_condition():
            self._failure_response()

        # 2. Sync commutative cipher public knowledge, block num and init
        self._sync_commutative_cipher_public_knowledge()
        self.commutative_cipher.init()
        LOGGER.info("commutative cipher key generated")

        # 3. 1st ID encryption and exchange
        # g: guest's plaintext
        # Eg: guest's ciphertext
        # EEg: guest's doubly encrypted ciphertext
        # h, Eh, EEh: host
        # i, Ei, EEi: intersection
        id_list_host_first = self._encrypt_id(data_inst, reserve_value=True)      # (h, (Eh, Instance))
        LOGGER.info("encrypted host id for the 1st time")
        # Only the encrypted id is sent out; the value is replaced by -1.
        id_list_guest_first = self._exchange_id_list(
            id_list_host_first.map(lambda k, v: (v[0], -1)))       # send (Eh, -1), get (Eg, -1)

        # 4. 2nd ID encryption and send doubly encrypted ID list to guest
        id_list_guest_second = self._encrypt_id(id_list_guest_first)         # (EEg, -1)
        LOGGER.info("encrypted host id for the 2nd time")
        self._sync_doubly_encrypted_id_list(id_list_guest_second)       # send (EEg, -1)

        # 5. Wait for guest to find intersection and re-index the messages
        LOGGER.info("waiting for guest to find intersection and perform natural indexation")

        # 6. Get the re-indexed doubly encrypted ID from guest
        id_blocks = self._iteratively_get_id_blocks()

        # 7. Restore value for the intersection
        id_blocks = self._restore_value(id_list_host_first, id_blocks)      # List[(Ei, val)]
        LOGGER.info("interested values restored")

        # 8. Execute OT as sender
        LOGGER.info("enter oblivious transfer protocol as a sender")
        key_list = self.oblivious_transfer.key_derivation(self.block_num)
        LOGGER.info("oblivious transfer key derived")

        # 9. Encrypt and transmit
        self._non_committing_encrypt(id_blocks, key_list)       # List[(Ei, Eval)]
        LOGGER.info("non-committing encryption and transmission completed")

        # 10. Get doubly encrypted ID list from guest
        id_list_intersect_cipher_cipher = self._sync_intersect_cipher_cipher()      # get (EEright, -1)

        # 11. Decrypt and send to guest
        id_list_intersect_cipher = self._decrypt_id(
            id_list_intersect_cipher_cipher, reserve_value=True)    # (EEright, Eright)
        LOGGER.info("decryption completed")
        self._sync_intersect_cipher(id_list_intersect_cipher)

        # 12. Slack
        self._sync_coverage(data_inst)
        self._display_result()
        LOGGER.info("secure information retrieval finished")

        return data_inst
Example #8
0
 def _abnormal_detection(self, data_instances):
     """Check whether the input data is valid."""
     for detect in (abnormal_detection.empty_table_detection,
                    abnormal_detection.empty_feature_detection):
         detect(data_instances)
     # The schema is read only after both emptiness checks have passed.
     ModelBase.check_schema_content(data_instances.schema)
    def fit(self, data_inst):
        """
        Run the guest side of information retrieval.

        Takes the raw path when raw retrieval is configured or the security
        level is 0; otherwise executes the numbered secure steps below.

        :param data_inst: Table, only the key column of the Table is used
        :return: the retrieved data (``data_output``)
        """
        # 0. Raw retrieval
        if self.model_param.raw_retrieval or self.security_level == 0:
            LOGGER.info("enter raw information retrieval guest")
            abnormal_detection.empty_table_detection(data_inst)
            data_output = self._raw_information_retrieval(data_inst)
            self._display_result(block_num='N/A')
            return data_output

        # 1. Data pre-processing
        LOGGER.info("enter secure information retrieval guest")
        abnormal_detection.empty_table_detection(data_inst)
        self._parse_security_level(data_inst)
        # NOTE(review): execution continues after _failure_response() unless
        # that helper raises - confirm it aborts the run.
        if not self._check_oblivious_transfer_condition():
            self._failure_response()

        # 2. Sync commutative cipher public knowledge, block num and init
        self._sync_commutative_cipher_public_knowledge()
        self.commutative_cipher.init()
        LOGGER.info("commutative cipher key generated")

        # 3. 1st ID encryption and exchange
        # g: guest's plaintext
        # Eg: guest's ciphertext
        # EEg: guest's doubly encrypted ciphertext
        # h, Eh, EEh: host
        # i, Ei, EEi: intersection
        id_list_guest_first = self._encrypt_id(data_inst)      # (Eg, -1)
        LOGGER.info("encrypted guest id for the 1st time")
        id_list_host_first = self._exchange_id_list(id_list_guest_first)              # send (Eg, -1), get (Eh, -1)

        # 4. 2nd ID encryption, receive doubly encrypted ID list from host
        id_list_host_second = self._encrypt_id(id_list_host_first, reserve_original_key=True)    # (Eh, EEh)
        LOGGER.info("encrypted guest id for the 2nd time")
        # Re-key by the doubly encrypted id and drop the original key.
        id_list_host_second_only = id_list_host_second.map(lambda k, v: (v, -1))     # (EEh, -1)
        id_list_guest_second = self._sync_doubly_encrypted_id_list()       # get (EEg, -1)

        # 5. Find intersection and re-index
        id_list_intersect = self._find_intersection(
            id_list_guest_second, id_list_host_second_only)     # (EEi, -1)
        LOGGER.info("intersection found, sample num = {}".format(id_list_intersect.count()))

        # 6. Send the re-indexed doubly encrypted ID to host
        self._fake_blocks(id_list_intersect, id_list_host_second_only)  # List[(EEi, -1)]
        LOGGER.info("faked {} blocks for obfuscation".format(self.block_num))

        # 7. Wait for host to restore value for the intersection
        LOGGER.info("waiting for host to restore interested values for the intersection")

        # 8. Execute OT as receiver
        LOGGER.info("enter oblivious transfer protocol as a receiver")
        target_key = self.oblivious_transfer.key_derivation(self.target_block_index)
        LOGGER.info("oblivious transfer key derived")

        # 9. Wait for host to encrypt and transmit, and then receive the encrypted interested values
        id_block_ciphertext, nonce = self._iteratively_get_encrypted_values()
        LOGGER.info("got encrypted interested values and nonce")
        target_block_cipher_id = self._non_committing_decrypt(
            id_block_ciphertext, nonce, target_key)  # (Eright, val)
        LOGGER.info("used the right key to decrypt the wanted values")

        # 10. Encrypt again and send to host
        target_block_cipher_cipher_id = self._composite_encrypt(target_block_cipher_id)      # (EEright, val)
        # Values are replaced by -1 before sending; only the ids go out.
        self._sync_intersect_cipher_cipher(
            target_block_cipher_cipher_id.mapValues(lambda v: -1))       # send (EEright, -1)

        # 11. Get decrypted result from host, and decrypt again
        id_list_intersect_cipher_id = self._sync_intersect_cipher()        # get (EEright, Eright_host)
        id_list_intersect_cipher_id = self._composite_decrypt(id_list_intersect_cipher_id)        # (EEright, right)

        # 12. Merge result
        data_output = self._merge(target_block_cipher_cipher_id, id_list_intersect_cipher_id)
        data_output = self._compensate_set_difference(data_inst, data_output)
        self._display_result()
        LOGGER.info("secure information retrieval finished")

        return data_output
    def run(self, data_instances):
        """
        Run the guest side of the RSA-based intersection.

        The guest receives the host's public key (e, n), sends each hashed id
        multiplied by ``r^e % n`` for a per-record random ``r``, gets the
        host-processed values back, removes ``r`` again with a modular
        division, and joins the hashed results with the host's processed ids.

        :param data_instances: Table keyed by sample id (sid)
        :return: Table of (sid, "intersect_id"); when ``self.only_output_key``
            is falsy, the result of ``self._get_value_from_data`` instead
        """
        LOGGER.info("Start rsa intersection")

        abnormal_detection.empty_table_detection(data_instances)

        public_key = get(name=self.transfer_variable.rsa_pubkey.name,
                         tag=self.transfer_variable.generate_transferid(
                             self.transfer_variable.rsa_pubkey),
                         idx=0)

        LOGGER.info("Get RSA public_key:{} from Host".format(public_key))
        self.e = public_key["e"]
        self.n = public_key["n"]

        # generate a random value per record; it blinds the ids sent to host
        # table(sid, r)
        table_random_value = data_instances.mapValues(
            lambda v: random.SystemRandom().getrandbits(self.random_bit))

        # table(sid, hash(sid))
        table_hash_sid = data_instances.map(
            lambda k, v: (k, int(RsaIntersectionGuest.hash(k), 16)))
        # table(sid, r^e % n * hash(sid))
        table_guest_id = table_random_value.join(
            table_hash_sid,
            lambda r, h: h * gmpy_math.powmod(r, self.e, self.n))
        # table(r^e % n * hash(sid), 1)
        table_send_guest_id = table_guest_id.map(lambda k, v: (v, 1))
        remote(table_send_guest_id,
               name=self.transfer_variable.intersect_guest_ids.name,
               tag=self.transfer_variable.generate_transferid(
                   self.transfer_variable.intersect_guest_ids),
               role=consts.HOST,
               idx=0)
        LOGGER.info("Remote guest_id to Host")

        # Keep a reverse lookup from the blinded id back to the sid.
        # table(r^e % n * hash(sid), sid)
        table_exchange_guest_id = table_guest_id.map(lambda k, v: (v, k))

        # Recv host_ids_process
        # table(host_id_process, 1)
        table_host_ids_process = get(
            name=self.transfer_variable.intersect_host_ids_process.name,
            tag=self.transfer_variable.generate_transferid(
                self.transfer_variable.intersect_host_ids_process),
            idx=0)
        LOGGER.info("Get host_ids_process from Host")

        # Recv process guest ids
        # table(r^e % n * hash(sid), guest_id_process)
        table_recv_guest_ids_process = get(
            name=self.transfer_variable.intersect_guest_ids_process.name,
            tag=self.transfer_variable.generate_transferid(
                self.transfer_variable.intersect_guest_ids_process),
            # role=consts.HOST,
            idx=0)
        LOGGER.info("Get guest_ids_process from Host")

        # table(r^e % n * hash(sid), (sid, guest_ids_process))
        table_join_guest_ids_process = table_exchange_guest_id.join(
            table_recv_guest_ids_process, lambda sid, g: (sid, g))
        # table(sid, guest_ids_process)
        table_sid_guest_ids_process = table_join_guest_ids_process.map(
            lambda k, v: (v[0], v[1]))

        # Divide r back out modulo n, then hash.
        # table(sid, hash(guest_ids_process/r))
        table_sid_guest_ids_process_final = table_sid_guest_ids_process.join(
            table_random_value, lambda g, r: RsaIntersectionGuest.hash(
                gmpy2.divm(int(g), int(r), self.n)))

        # table(hash(guest_ids_process/r), sid)
        table_guest_ids_process_final_sid = table_sid_guest_ids_process_final.map(
            lambda k, v: (v, k))

        # The join keeps only keys present in both tables.
        # intersect table(hash(guest_ids_process/r), sid)
        table_encrypt_intersect_ids = table_guest_ids_process_final_sid.join(
            table_host_ids_process, lambda sid, h: sid)

        # intersect table(hash(guest_ids_process/r), 1)
        table_send_intersect_ids = table_encrypt_intersect_ids.mapValues(
            lambda v: 1)
        LOGGER.info("Finish intersect_ids computing")

        # send intersect id
        if self.send_intersect_id_flag:
            remote(table_send_intersect_ids,
                   name=self.transfer_variable.intersect_ids.name,
                   tag=self.transfer_variable.generate_transferid(
                       self.transfer_variable.intersect_ids),
                   role=consts.HOST,
                   idx=0)
            LOGGER.info("Remote intersect ids to Host!")
        else:
            LOGGER.info("Not send intersect ids to Host!")

        # intersect table(sid, "intersect_id")
        intersect_ids = table_encrypt_intersect_ids.map(lambda k, v:
                                                        (v, "intersect_id"))

        # Optionally replace the placeholder values using the original data.
        if not self.only_output_key:
            intersect_ids = self._get_value_from_data(intersect_ids,
                                                      data_instances)

        return intersect_ids
    def fit(self, data_inst):
        """
        Run the guest side of information retrieval, delegating the
        intersection itself to ``self.intersection_obj``.

        :param data_inst: Table, only the key column of the Table is used
        :return: the retrieved data (``data_output``)
        """
        abnormal_detection.empty_table_detection(data_inst)

        # Choose the table used for matching: the input itself, or the
        # result of _recover_match_id() when the input carries instance ids.
        match_data = data_inst
        self.with_inst_id = data_overview.check_with_inst_id(data_inst)
        if self.with_inst_id:
            match_data = self._recover_match_id(data_inst)

        # 0. Raw retrieval
        if self.model_param.raw_retrieval or self.security_level == 0:
            LOGGER.info("enter raw information retrieval guest")
            # (the empty-table check already ran at the top of this method)
            data_output = self._raw_information_retrieval(match_data)
            self._display_result(block_num='N/A')
            if self.with_inst_id:
                data_output = self._restore_sample_id(data_output)
            data_output = self._compensate_set_difference(data_inst, data_output)
            return data_output

        # 1. Data pre-processing
        LOGGER.info("enter secure information retrieval guest")
        self.need_label = self._check_need_label()
        # (the empty-table check already ran at the top of this method)
        self._parse_security_level(match_data)
        # NOTE(review): execution continues after _failure_response() unless
        # that helper raises - confirm it aborts the run.
        if not self._check_oblivious_transfer_condition():
            self._failure_response()

        # 2. Find intersection
        id_list_intersect = self.intersection_obj.get_intersect_doubly_encrypted_id(match_data)[0]
        id_list_host_second_only = self.intersection_obj.id_list_remote_second[0]

        # 3. Send the re-indexed doubly encrypted ID to host
        self._fake_blocks(id_list_intersect, id_list_host_second_only)  # List[(EEi, -1)]
        LOGGER.info("faked blocks for obfuscation")

        # 4. Wait for host to restore value for the intersection
        LOGGER.info("waiting for host to restore interested values for the intersection")

        # 5. Execute OT as receiver
        LOGGER.info("enter oblivious transfer protocol as a receiver")
        target_key = self.oblivious_transfer.key_derivation(self.target_block_index)
        LOGGER.info("oblivious transfer key derived")

        # 6. Wait for host to encrypt and transmit, and then receive the encrypted interested values
        id_block_ciphertext, nonce = self._iteratively_get_encrypted_values()
        LOGGER.info("got encrypted interested values and nonce")
        target_block_cipher_id = self._non_committing_decrypt(
            id_block_ciphertext, nonce, target_key)  # (Eright, val)
        LOGGER.info("used the right key to decrypt the wanted values")

        # 7. Get (EEright, instance)
        target_block_cipher_cipher_id = self.intersection_obj.map_raw_id_to_encrypt_id(target_block_cipher_id,
                                                                                       id_list_host_second_only,
                                                                                       keep_value=True)
        # 8. Get (EEright, Eright_guest)
        id_list_local_first = self.intersection_obj.id_list_local_first[0]  # (Eright_guest, id)
        id_list_local_second = self.intersection_obj.id_list_local_second[0]  # (EEright, Eright_guest)

        # 9. Merge result
        # (Eright_guest, instance)
        id_list_cipher = self._merge_instance(target_block_cipher_cipher_id, id_list_local_second, self.need_label)
        data_output = self._merge(id_list_cipher, id_list_local_first)

        if self.with_inst_id:
            data_output = self._restore_sample_id(data_output)
        data_output = self._compensate_set_difference(data_inst, data_output)
        self._display_result()
        LOGGER.info("secure information retrieval finished")

        return data_output
Example #12
0
    def run(self, data_instances):
        """
        Run the host side of the RSA-based intersection.

        Generates an RSA key pair, sends the public part (e, n) to the guest,
        sends the host's own processed ids, raises the ids received from the
        guest to the private exponent and sends them back, and finally - only
        when ``self.get_intersect_ids_flag`` is set - receives the intersection.

        :param data_instances: Table keyed by sample id (sid)
        :return: None when ``self.get_intersect_ids_flag`` is falsy; otherwise
            a Table of (sid, "intersect_id"), or the result of
            ``self._get_value_from_data`` when ``self.only_output_key`` is falsy
        """
        LOGGER.info("Start rsa intersection")

        abnormal_detection.empty_table_detection(data_instances)

        encrypt_operator = RsaEncrypt()
        # NOTE(review): the key size is hard-coded to 1024 bits - confirm this
        # is acceptable, or consider making it configurable.
        encrypt_operator.generate_key(rsa_bit=1024)
        self.e, self.d, self.n = encrypt_operator.get_key_pair()
        LOGGER.info("Generate rsa keys.")
        # Only e and n are shared; d stays on the host.
        public_key = {"e": self.e, "n": self.n}
        remote(public_key,
               name=self.transfer_variable.rsa_pubkey.name,
               tag=self.transfer_variable.generate_transferid(
                   self.transfer_variable.rsa_pubkey),
               role=consts.GUEST,
               idx=0)
        LOGGER.info("Remote public key to Guest.")

        # table(host_id_process, sid) where
        # host_id_process = hash(hash(sid)^d % n)
        host_ids_process_pair = data_instances.map(
            lambda k, v: (RsaIntersectionHost.hash(
                gmpy_math.powmod(int(RsaIntersectionHost.hash(k), 16), self.d,
                                 self.n)), k))

        # table(host_id_process, 1): the sid is dropped before sending
        host_ids_process = host_ids_process_pair.mapValues(lambda v: 1)
        remote(host_ids_process,
               name=self.transfer_variable.intersect_host_ids_process.name,
               tag=self.transfer_variable.generate_transferid(
                   self.transfer_variable.intersect_host_ids_process),
               role=consts.GUEST,
               idx=0)
        LOGGER.info("Remote host_ids_process to Guest.")

        # Recv guest ids
        guest_ids = get(name=self.transfer_variable.intersect_guest_ids.name,
                        tag=self.transfer_variable.generate_transferid(
                            self.transfer_variable.intersect_guest_ids),
                        idx=0)
        LOGGER.info("Get guest_ids from guest")

        # Process guest ids and return to guest
        # table(guest_id, guest_id^d % n)
        guest_ids_process = guest_ids.map(
            lambda k, v: (k, gmpy_math.powmod(int(k), self.d, self.n)))
        remote(guest_ids_process,
               name=self.transfer_variable.intersect_guest_ids_process.name,
               tag=self.transfer_variable.generate_transferid(
                   self.transfer_variable.intersect_guest_ids_process),
               role=consts.GUEST,
               idx=0)
        LOGGER.info("Remote guest_ids_process to Guest.")

        # recv intersect ids
        intersect_ids = None
        if self.get_intersect_ids_flag:
            encrypt_intersect_ids = get(
                name=self.transfer_variable.intersect_ids.name,
                tag=self.transfer_variable.generate_transferid(
                    self.transfer_variable.intersect_ids),
                idx=0)

            # Map the received processed ids back to the host's own sids.
            intersect_ids_pair = encrypt_intersect_ids.join(
                host_ids_process_pair, lambda e, h: h)
            intersect_ids = intersect_ids_pair.map(lambda k, v:
                                                   (v, "intersect_id"))
            LOGGER.info("Get intersect ids from Guest")

            if not self.only_output_key:
                intersect_ids = self._get_value_from_data(
                    intersect_ids, data_instances)

        return intersect_ids