def read_data(self, input_data, mode="fit"):
    """Turn a sparse input table into data instances and attach the schema.

    :param input_data: Table of raw sparse rows; must be non-empty.
    :param mode: "fit" to build converter state from the data,
                 anything else to reuse previously fitted state.
    :return: Table of converted data instances with schema set.
    :raises ValueError: if the table carries no value columns at all.
    """
    LOGGER.info("start to read sparse data and change data to instance")
    abnormal_detection.empty_table_detection(input_data)

    # A zero/None shape means there is nothing to parse — fail early.
    if not data_overview.get_data_shape(input_data):
        raise ValueError(
            "input data's value is empty, it does not contain a label")

    # fit builds state from the data; transform reuses the fitted state
    converter = self.fit if mode == "fit" else self.transform
    data_instance = converter(input_data)

    set_schema(data_instance,
               make_schema(self.header, self.sid_name, self.label_name))
    return data_instance
def fit(self, data_instances, validate_data=None):
    """Fit the local baseline model on this party's own data.

    :param data_instances: Table of (key, Instance); Instance.features and
        Instance.label are used.
    :param validate_data: unused here; kept for interface compatibility.
    :return: None. Sets self.header and self.model_fit as side effects.
    """
    if not self.need_run:
        return
    # check if empty table
    abnormal_detection.empty_table_detection(data_instances)
    abnormal_detection.empty_feature_detection(data_instances)
    # get model
    model = self.get_model()
    # get header
    self.header = data_overview.get_header(data_instances)
    X_table = data_instances.mapValues(lambda v: v.features)
    y_table = data_instances.mapValues(lambda v: v.label)
    X = np.array([v[1] for v in list(X_table.collect())])
    # FIX: previously `np.array(list(y_table.collect()))[:, 1]` built an
    # array of (key, label) pairs; string keys coerced the whole array
    # (labels came out as strings/objects). Extract labels only, matching
    # the sibling fit implementation.
    y = np.array([v[1] for v in list(y_table.collect())])
    self.model_fit = model.fit(X, y)
def read_data(self, input_data, mode="fit"):
    """Turn a dense (delimited string) input table into data instances.

    :param input_data: Table of (key, delimited-string-row).
    :param mode: "fit" builds converter state; "transform" reuses the
        fitted header and aligns the output to it.
    :return: Table of converted data instances, header-aligned in
        transform mode.
    :raises ValueError: if a label index is configured but the rows are
        empty or too short to contain it.
    """
    LOGGER.info("start to read dense data and change data to instance")
    abnormal_detection.empty_table_detection(input_data)
    input_data_labels = None
    fit_header = None
    # In transform mode, remember the header fitted earlier so the output
    # can be re-aligned to it after generate_header() possibly rebuilds it.
    if mode == "transform":
        fit_header = self.header
    self.generate_header(input_data, mode=mode)
    if self.label_idx is not None:
        data_shape = data_overview.get_data_shape(input_data)
        if not data_shape or self.label_idx >= data_shape:
            raise ValueError(
                "input data's value is empty, it does not contain a label")
        # Features are everything except the label column: rows are split
        # twice and the slices on either side of label_idx are concatenated.
        # A shape of 1 means the row is label-only, so features are empty.
        input_data_features = input_data.mapValues(
            lambda value: [] if data_shape == 1 else value.split(
                self.delimitor, -1)[:self.label_idx] + value.split(
                self.delimitor, -1)[self.label_idx + 1:])
        input_data_labels = input_data.mapValues(
            lambda value: value.split(self.delimitor, -1)[self.label_idx])
    else:
        # No label column: every split field is a feature; with no header,
        # features are treated as empty.
        input_data_features = input_data.mapValues(lambda value: [
        ] if not self.header else value.split(self.delimitor, -1))
    if mode == "fit":
        data_instance = self.fit(input_data, input_data_features,
                                 input_data_labels)
    else:
        data_instance = self.transform(input_data_features,
                                       input_data_labels)
        # data_instance = ModelBase.align_data_header(data_instance, fit_header)
        # Re-order columns to match the header captured at fit time
        # (fit_header is None in fit mode, making this a no-op there —
        # presumably; verify header_alignment's None handling).
        data_instance = data_overview.header_alignment(
            data_instance, fit_header)
    return data_instance
def fit(self, data_instances, validate_data=None):
    """Fit the local baseline model and record a model summary.

    :param data_instances: Table of (key, Instance); Instance.features and
        Instance.label are used.
    :param validate_data: unused here; kept for interface compatibility.
    :return: None. Sets self.header, self.model_fit, self.need_one_vs_rest
        and the summary as side effects.
    """
    if not self.need_run:
        return

    # check if empty table
    LOGGER.info("Enter Local Baseline fit")
    abnormal_detection.empty_table_detection(data_instances)
    abnormal_detection.empty_feature_detection(data_instances)

    # get model
    model = self.get_model()
    # get header
    self.header = data_overview.get_header(data_instances)

    # Pull the distributed table into local arrays for sklearn-style fitting.
    feature_rows = [record[1] for record in
                    data_instances.mapValues(lambda v: v.features).collect()]
    label_rows = [record[1] for record in
                  data_instances.mapValues(lambda v: v.label).collect()]
    X = np.array(feature_rows)
    y = np.array(label_rows)

    self.model_fit = model.fit(X, y)
    # More than two classes means one-vs-rest handling downstream.
    self.need_one_vs_rest = len(self.model_fit.classes_) > 2
    self.set_summary(self.get_model_summary())
def read_data(self, table_name, namespace, mode="fit"):
    """Load a dense table from storage and convert it to data instances.

    :param table_name: storage table name to load.
    :param namespace: storage namespace of the table.
    :param mode: "fit" builds converter state; anything else transforms
        with previously fitted state.
    :return: Table of converted data instances with schema set.
    :raises ValueError: if label_idx is not an int, or the rows are empty
        or too short to contain the label column.
    """
    input_data = storage.get_data_table(table_name, namespace)
    LOGGER.info("start to read dense data and change data to instance")
    abnormal_detection.empty_table_detection(input_data)
    input_data_labels = None
    if self.with_label:
        # FIX(idiom): was `type(self.label_idx).__name__ != "int"` — a
        # string comparison on the type name. The identity check below is
        # behavior-identical (still rejects bool) and idiomatic.
        if type(self.label_idx) is not int:
            raise ValueError("label index should be integer")
        data_shape = data_overview.get_data_shape(input_data)
        if not data_shape or self.label_idx >= data_shape:
            raise ValueError(
                "input data's value is empty, it does not contain a label")
        # Features are everything except the label column; a shape of 1
        # means the row is label-only, so features are empty.
        input_data_features = input_data.mapValues(
            lambda value: [] if data_shape == 1 else value.split(
                self.delimitor, -1)[:self.label_idx] + value.split(
                self.delimitor, -1)[self.label_idx + 1:])
        input_data_labels = input_data.mapValues(
            lambda value: value.split(self.delimitor, -1)[self.label_idx])
    else:
        # No label column: every split field is a feature; empty rows map
        # to an empty feature list.
        input_data_features = input_data.mapValues(lambda value: [
        ] if not value else value.split(self.delimitor, -1))
    if mode == "fit":
        data_instance = self.fit(input_data_features, input_data_labels,
                                 table_name, namespace)
    else:
        data_instance = self.transform(input_data_features,
                                       input_data_labels)
    set_schema(data_instance, self.header)
    return data_instance
def _abnormal_detection(self, data_instances):
    """Validate the input table: reject empty tables and empty feature sets."""
    for check in (abnormal_detection.empty_table_detection,
                  abnormal_detection.empty_feature_detection):
        check(data_instances)
def fit(self, data_inst):
    """Run the host side of secure information retrieval (SIR).

    Steps 0-12 below mirror the guest's protocol; the ordering of the
    sync/remote calls is part of the protocol and must not change.

    :param data_inst: Table, only the key and the value (Instance.label) are used
    :return: data_inst, unchanged (the retrieval result lives on the guest side)
    """
    LOGGER.info("data count = {}".format(data_inst.count()))
    # 0. Raw retrieval — plaintext fallback when no security is requested.
    if self.model_param.raw_retrieval or self.security_level == 0:
        LOGGER.info("enter raw information retrieval host")
        abnormal_detection.empty_table_detection(data_inst)
        self._raw_information_retrieval(data_inst)
        self._display_result(block_num='N/A')
        return data_inst
    # 1. Data pre-processing
    LOGGER.info("enter secure information retrieval host")
    abnormal_detection.empty_table_detection(data_inst)
    self._parse_security_level(data_inst)
    if not self._check_oblivious_transfer_condition():
        self._failure_response()
    # 2. Sync commutative cipher public knowledge, block num and init
    self._sync_commutative_cipher_public_knowledge()
    self.commutative_cipher.init()
    LOGGER.info("commutative cipher key generated")
    # 3. 1st ID encryption and exchange
    # Notation used in the trailing comments:
    # g: guest's plaintext
    # Eg: guest's ciphertext
    # EEg: guest's doubly encrypted ciphertext
    # h, Eh, EEh: host
    # i, Ei, EEi: intersection
    id_list_host_first = self._encrypt_id(
        data_inst, reserve_value=True)  # (h, (Eh, Instance))
    LOGGER.info("encrypted host id for the 1st time")
    id_list_guest_first = self._exchange_id_list(
        id_list_host_first.map(
            lambda k, v: (v[0], -1)))  # send (Eh, -1), get (Eg, -1)
    # 4. 2nd ID encryption and send doubly encrypted ID list to guest
    id_list_guest_second = self._encrypt_id(id_list_guest_first)  # (EEg, -1)
    LOGGER.info("encrypted host id for the 2nd time")
    self._sync_doubly_encrypted_id_list(id_list_guest_second)  # send (EEg, -1)
    # 5. Wait for guest to find intersection and re-index the messages
    LOGGER.info("waiting for guest to find intersection and perform natural indexation")
    # 6. Get the re-indexed doubly encrypted ID from guest
    id_blocks = self._iteratively_get_id_blocks()
    # 7. Restore value for the intersection
    id_blocks = self._restore_value(
        id_list_host_first, id_blocks)  # List[(Ei, val)]
    LOGGER.info("interested values restored")
    # 8. Execute OT as sender — derive one key per block.
    LOGGER.info("enter oblivious transfer protocol as a sender")
    key_list = self.oblivious_transfer.key_derivation(self.block_num)
    LOGGER.info("oblivious transfer key derived")
    # 9. Encrypt and transmit
    self._non_committing_encrypt(id_blocks, key_list)  # List[(Ei, Eval)]
    LOGGER.info("non-committing encryption and transmission completed")
    # 10. Get doubly encrypted ID list from guest
    id_list_intersect_cipher_cipher = \
        self._sync_intersect_cipher_cipher()  # get (EEright, -1)
    # 11. Decrypt (strip host's encryption layer) and send to guest
    id_list_intersect_cipher = self._decrypt_id(
        id_list_intersect_cipher_cipher, reserve_value=True)  # (EEright, Eright)
    LOGGER.info("decryption completed")
    self._sync_intersect_cipher(id_list_intersect_cipher)
    # 12. Slack — share coverage statistics with the guest.
    self._sync_coverage(data_inst)
    self._display_result()
    LOGGER.info("secure information retrieval finished")
    return data_inst
def _abnormal_detection(self, data_instances):
    """Check that the input data is valid."""
    abnormal_detection.empty_table_detection(data_instances)
    abnormal_detection.empty_feature_detection(data_instances)
    # Also validate the table's schema content, unlike the lighter
    # two-check variant used elsewhere in this codebase.
    ModelBase.check_schema_content(data_instances.schema)
def fit(self, data_inst):
    """Run the guest side of secure information retrieval (SIR).

    Steps 0-12 below mirror the host's protocol; the ordering of the
    sync/remote calls is part of the protocol and must not change.

    :param data_inst: Table, only the key column of the Table is used
    :return: Table with the retrieved values merged in
    """
    # 0. Raw retrieval — plaintext fallback when no security is requested.
    if self.model_param.raw_retrieval or self.security_level == 0:
        LOGGER.info("enter raw information retrieval guest")
        abnormal_detection.empty_table_detection(data_inst)
        data_output = self._raw_information_retrieval(data_inst)
        self._display_result(block_num='N/A')
        return data_output
    # 1. Data pre-processing
    LOGGER.info("enter secure information retrieval guest")
    abnormal_detection.empty_table_detection(data_inst)
    self._parse_security_level(data_inst)
    if not self._check_oblivious_transfer_condition():
        self._failure_response()
    # 2. Sync commutative cipher public knowledge, block num and init
    self._sync_commutative_cipher_public_knowledge()
    self.commutative_cipher.init()
    LOGGER.info("commutative cipher key generated")
    # 3. 1st ID encryption and exchange
    # Notation used in the trailing comments:
    # g: guest's plaintext
    # Eg: guest's ciphertext
    # EEg: guest's doubly encrypted ciphertext
    # h, Eh, EEh: host
    # i, Ei, EEi: intersection
    id_list_guest_first = self._encrypt_id(data_inst)  # (Eg, -1)
    LOGGER.info("encrypted guest id for the 1st time")
    id_list_host_first = self._exchange_id_list(
        id_list_guest_first)  # send (Eg, -1), get (Eh, -1)
    # 4. 2nd ID encryption, receive doubly encrypted ID list from host
    id_list_host_second = self._encrypt_id(
        id_list_host_first, reserve_original_key=True)  # (Eh, EEh)
    LOGGER.info("encrypted guest id for the 2nd time")
    id_list_host_second_only = id_list_host_second.map(
        lambda k, v: (v, -1))  # (EEh, -1)
    id_list_guest_second = self._sync_doubly_encrypted_id_list()  # get (EEg, -1)
    # 5. Find intersection and re-index
    id_list_intersect = self._find_intersection(
        id_list_guest_second, id_list_host_second_only)  # (EEi, -1)
    LOGGER.info("intersection found, sample num = {}".format(
        id_list_intersect.count()))
    # 6. Send the re-indexed doubly encrypted ID to host — real block is
    # hidden among fake blocks for obfuscation.
    self._fake_blocks(id_list_intersect, id_list_host_second_only)  # List[(EEi, -1)]
    LOGGER.info("faked {} blocks for obfuscation".format(self.block_num))
    # 7. Wait for host to restore value for the intersection
    LOGGER.info("waiting for host to restore interested values for the intersection")
    # 8. Execute OT as receiver — derive only the key for the target block.
    LOGGER.info("enter oblivious transfer protocol as a receiver")
    target_key = self.oblivious_transfer.key_derivation(self.target_block_index)
    LOGGER.info("oblivious transfer key derived")
    # 9. Wait for host to encrypt and transmit, and then receive the encrypted interested values
    id_block_ciphertext, nonce = self._iteratively_get_encrypted_values()
    LOGGER.info("got encrypted interested values and nonce")
    target_block_cipher_id = self._non_committing_decrypt(
        id_block_ciphertext, nonce, target_key)  # (Eright, val)
    LOGGER.info("used the right key to decrypt the wanted values")
    # 10. Encrypt again and send to host (values stripped before sending)
    target_block_cipher_cipher_id = self._composite_encrypt(
        target_block_cipher_id)  # (EEright, val)
    self._sync_intersect_cipher_cipher(
        target_block_cipher_cipher_id.mapValues(lambda v: -1))  # send (EEright, -1)
    # 11. Get decrypted result from host, and decrypt again
    id_list_intersect_cipher_id = self._sync_intersect_cipher()  # get (EEright, Eright_host)
    id_list_intersect_cipher_id = self._composite_decrypt(
        id_list_intersect_cipher_id)  # (EEright, right)
    # 12. Merge result and pad the set difference
    data_output = self._merge(target_block_cipher_cipher_id,
                              id_list_intersect_cipher_id)
    data_output = self._compensate_set_difference(data_inst, data_output)
    self._display_result()
    LOGGER.info("secure information retrieval finished")
    return data_output
def run(self, data_instances):
    """Run the guest side of blinded-RSA private set intersection.

    The guest blinds its hashed IDs with random factors, has the host sign
    them, unblinds, and matches against the host's self-signed hashed IDs.
    The remote/get call ordering is part of the protocol and must not change.

    :param data_instances: Table keyed by sample id (sid).
    :return: Table of intersected ids (with values re-attached unless
        only_output_key is set).
    """
    LOGGER.info("Start rsa intersection")
    abnormal_detection.empty_table_detection(data_instances)
    public_key = get(name=self.transfer_variable.rsa_pubkey.name,
                     tag=self.transfer_variable.generate_transferid(
                         self.transfer_variable.rsa_pubkey),
                     idx=0)
    LOGGER.info("Get RSA public_key:{} from Host".format(public_key))
    self.e = public_key["e"]
    self.n = public_key["n"]
    # generate random value and sent intersect guest ids to guest
    # table(sid, r) — one blinding factor per sample id
    table_random_value = data_instances.mapValues(
        lambda v: random.SystemRandom().getrandbits(self.random_bit))
    # table(sid, hash(sid))
    table_hash_sid = data_instances.map(
        lambda k, v: (k, int(RsaIntersectionGuest.hash(k), 16)))
    # table(sid, r^e % n * hash(sid)) — blind the hashed id
    table_guest_id = table_random_value.join(
        table_hash_sid, lambda r, h: h * gmpy_math.powmod(r, self.e, self.n))
    # table(r^e % n * hash(sid), 1)
    table_send_guest_id = table_guest_id.map(lambda k, v: (v, 1))
    remote(table_send_guest_id,
           name=self.transfer_variable.intersect_guest_ids.name,
           tag=self.transfer_variable.generate_transferid(
               self.transfer_variable.intersect_guest_ids),
           role=consts.HOST,
           idx=0)
    LOGGER.info("Remote guest_id to Host")
    # table(r^e % n * hash(sid), sid) — keep a reverse map for later join
    table_exchange_guest_id = table_guest_id.map(lambda k, v: (v, k))
    # Recv host_ids_process
    # table(host_id_process, 1)
    table_host_ids_process = get(
        name=self.transfer_variable.intersect_host_ids_process.name,
        tag=self.transfer_variable.generate_transferid(
            self.transfer_variable.intersect_host_ids_process),
        idx=0)
    LOGGER.info("Get host_ids_process from Host")
    # Recv process guest ids
    # table(r^e % n * hash(sid), guest_id_process)
    table_recv_guest_ids_process = get(
        name=self.transfer_variable.intersect_guest_ids_process.name,
        tag=self.transfer_variable.generate_transferid(
            self.transfer_variable.intersect_guest_ids_process),
        # role=consts.HOST,
        idx=0)
    LOGGER.info("Get guest_ids_process from Host")
    # table(r^e % n * hash(sid), (sid, guest_ids_process))
    table_join_guest_ids_process = table_exchange_guest_id.join(
        table_recv_guest_ids_process, lambda sid, g: (sid, g))
    # table(sid, guest_ids_process)
    table_sid_guest_ids_process = table_join_guest_ids_process.map(
        lambda k, v: (v[0], v[1]))
    # table(sid, hash(guest_ids_process/r)) — unblind with the modular
    # inverse of r, then hash to match the host's format
    table_sid_guest_ids_process_final = table_sid_guest_ids_process.join(
        table_random_value, lambda g, r: RsaIntersectionGuest.hash(
            gmpy2.divm(int(g), int(r), self.n)))
    # table(hash(guest_ids_process/r), sid)
    table_guest_ids_process_final_sid = table_sid_guest_ids_process_final.map(
        lambda k, v: (v, k))
    # intersect table(hash(guest_ids_process/r), sid)
    table_encrypt_intersect_ids = table_guest_ids_process_final_sid.join(
        table_host_ids_process, lambda sid, h: sid)
    # intersect table(hash(guest_ids_process/r), 1)
    table_send_intersect_ids = table_encrypt_intersect_ids.mapValues(
        lambda v: 1)
    LOGGER.info("Finish intersect_ids computing")
    # send intersect id
    if self.send_intersect_id_flag:
        remote(table_send_intersect_ids,
               name=self.transfer_variable.intersect_ids.name,
               tag=self.transfer_variable.generate_transferid(
                   self.transfer_variable.intersect_ids),
               role=consts.HOST,
               idx=0)
        LOGGER.info("Remote intersect ids to Host!")
    else:
        LOGGER.info("Not send intersect ids to Host!")
    # intersect table(sid, "intersect_id")
    intersect_ids = table_encrypt_intersect_ids.map(
        lambda k, v: (v, "intersect_id"))
    if not self.only_output_key:
        intersect_ids = self._get_value_from_data(intersect_ids,
                                                  data_instances)
    return intersect_ids
def fit(self, data_inst):
    """Run the guest side of SIR, delegating the PSI phase to an
    intersection object and handling match-id / sample-id recovery.

    The ordering of the protocol steps must not change.

    :param data_inst: Table, only the key column of the Table is used
    :return: Table with the retrieved values merged in
    """
    abnormal_detection.empty_table_detection(data_inst)
    # 0. Raw retrieval — recover match ids first if the data carries
    # instance ids, so all matching happens on match ids.
    match_data = data_inst
    self.with_inst_id = data_overview.check_with_inst_id(data_inst)
    if self.with_inst_id:
        match_data = self._recover_match_id(data_inst)
    if self.model_param.raw_retrieval or self.security_level == 0:
        LOGGER.info("enter raw information retrieval guest")
        # abnormal_detection.empty_table_detection(data_inst)
        data_output = self._raw_information_retrieval(match_data)
        self._display_result(block_num='N/A')
        if self.with_inst_id:
            data_output = self._restore_sample_id(data_output)
        data_output = self._compensate_set_difference(data_inst, data_output)
        return data_output
    # 1. Data pre-processing
    LOGGER.info("enter secure information retrieval guest")
    self.need_label = self._check_need_label()
    # abnormal_detection.empty_table_detection(data_inst)
    self._parse_security_level(match_data)
    if not self._check_oblivious_transfer_condition():
        self._failure_response()
    # 2. Find intersection — the intersection object runs the commutative
    # encryption exchange and caches the intermediate id lists.
    id_list_intersect = self.intersection_obj.get_intersect_doubly_encrypted_id(
        match_data)[0]
    id_list_host_second_only = self.intersection_obj.id_list_remote_second[0]
    # 3. Send the re-indexed doubly encrypted ID to host — real block is
    # hidden among fake blocks for obfuscation.
    self._fake_blocks(id_list_intersect, id_list_host_second_only)  # List[(EEi, -1)]
    LOGGER.info("faked blocks for obfuscation")
    # 4. Wait for host to restore value for the intersection
    LOGGER.info("waiting for host to restore interested values for the intersection")
    # 5. Execute OT as receiver — derive only the key for the target block.
    LOGGER.info("enter oblivious transfer protocol as a receiver")
    target_key = self.oblivious_transfer.key_derivation(self.target_block_index)
    LOGGER.info("oblivious transfer key derived")
    # 6. Wait for host to encrypt and transmit, and then receive the encrypted interested values
    id_block_ciphertext, nonce = self._iteratively_get_encrypted_values()
    LOGGER.info("got encrypted interested values and nonce")
    target_block_cipher_id = self._non_committing_decrypt(
        id_block_ciphertext, nonce, target_key)  # (Eright, val)
    LOGGER.info("used the right key to decrypt the wanted values")
    # 7. Get (EEright, instance)
    target_block_cipher_cipher_id = self.intersection_obj.map_raw_id_to_encrypt_id(
        target_block_cipher_id, id_list_host_second_only, keep_value=True)
    # 8. Get (EEright, Eright_guest)
    id_list_local_first = self.intersection_obj.id_list_local_first[0]  # (Eright_guest, id)
    id_list_local_second = self.intersection_obj.id_list_local_second[0]  # (EEright, Eright_guest)
    # 9. Merge result
    # (Eright_guest, instance)
    id_list_cipher = self._merge_instance(target_block_cipher_cipher_id,
                                          id_list_local_second,
                                          self.need_label)
    data_output = self._merge(id_list_cipher, id_list_local_first)
    if self.with_inst_id:
        data_output = self._restore_sample_id(data_output)
    data_output = self._compensate_set_difference(data_inst, data_output)
    self._display_result()
    LOGGER.info("secure information retrieval finished")
    return data_output
def run(self, data_instances):
    """Run the host side of blinded-RSA private set intersection.

    The host generates an RSA key pair, publishes the public key, signs
    its own hashed IDs and the guest's blinded IDs with the private key,
    and optionally receives the intersection back. The remote/get call
    ordering is part of the protocol and must not change.

    :param data_instances: Table keyed by sample id (sid).
    :return: Table of intersected ids, or None when the host neither
        receives the intersection nor re-attaches values.
    """
    LOGGER.info("Start rsa intersection")
    abnormal_detection.empty_table_detection(data_instances)
    encrypt_operator = RsaEncrypt()
    # NOTE(review): 1024-bit RSA is below current recommendations
    # (>= 2048 bits, per NIST SP 800-57) — consider raising; changing it
    # affects protocol cost and must match both parties' expectations.
    encrypt_operator.generate_key(rsa_bit=1024)
    self.e, self.d, self.n = encrypt_operator.get_key_pair()
    LOGGER.info("Generate rsa keys.")
    # Only the public part (e, n) leaves the host; d stays private.
    public_key = {"e": self.e, "n": self.n}
    remote(public_key,
           name=self.transfer_variable.rsa_pubkey.name,
           tag=self.transfer_variable.generate_transferid(
               self.transfer_variable.rsa_pubkey),
           role=consts.GUEST,
           idx=0)
    LOGGER.info("Remote public key to Guest.")
    # (host_id_process, sid): hash the id, sign with d, hash again.
    host_ids_process_pair = data_instances.map(
        lambda k, v: (RsaIntersectionHost.hash(
            gmpy_math.powmod(int(RsaIntersectionHost.hash(k), 16),
                             self.d, self.n)), k))
    # (host_id_process, 1)
    host_ids_process = host_ids_process_pair.mapValues(lambda v: 1)
    remote(host_ids_process,
           name=self.transfer_variable.intersect_host_ids_process.name,
           tag=self.transfer_variable.generate_transferid(
               self.transfer_variable.intersect_host_ids_process),
           role=consts.GUEST,
           idx=0)
    LOGGER.info("Remote host_ids_process to Guest.")
    # Recv guest ids
    guest_ids = get(name=self.transfer_variable.intersect_guest_ids.name,
                    tag=self.transfer_variable.generate_transferid(
                        self.transfer_variable.intersect_guest_ids),
                    idx=0)
    LOGGER.info("Get guest_ids from guest")
    # Process guest ids and return to guest — blind signature: sign the
    # guest's blinded ids with the private exponent.
    guest_ids_process = guest_ids.map(
        lambda k, v: (k, gmpy_math.powmod(int(k), self.d, self.n)))
    remote(guest_ids_process,
           name=self.transfer_variable.intersect_guest_ids_process.name,
           tag=self.transfer_variable.generate_transferid(
               self.transfer_variable.intersect_guest_ids_process),
           role=consts.GUEST,
           idx=0)
    LOGGER.info("Remote guest_ids_process to Guest.")
    # recv intersect ids
    intersect_ids = None
    if self.get_intersect_ids_flag:
        encrypt_intersect_ids = get(
            name=self.transfer_variable.intersect_ids.name,
            tag=self.transfer_variable.generate_transferid(
                self.transfer_variable.intersect_ids),
            idx=0)
        # Map the encrypted intersection back to plain sids via the
        # (host_id_process, sid) pair table built above.
        intersect_ids_pair = encrypt_intersect_ids.join(
            host_ids_process_pair, lambda e, h: h)
        intersect_ids = intersect_ids_pair.map(
            lambda k, v: (v, "intersect_id"))
        LOGGER.info("Get intersect ids from Guest")
    if not self.only_output_key:
        intersect_ids = self._get_value_from_data(
            intersect_ids, data_instances)
    return intersect_ids