def preprocess_cnv_patient_data(self, patients, is_pos=True):
    """Replace each patient's per-CNV-type id lists with their mapped ids.

    ``patients`` is iterated as ``pat_id -> {cnv_type: ids}``. Every inner
    value is overwritten **in place** with a flat list containing all
    entries that ``ent2uni`` holds for the ids it knows; ids missing from
    ``ent2uni`` are dropped silently.

    ``is_pos`` is accepted but never read by this function.

    NOTE(review): the returned list is never appended to, so the caller
    always receives ``[]`` and the mapped data is only visible through the
    mutated ``patients`` argument -- confirm that this is intended.
    """
    # only the second mapper (presumably entrez -> uniprot) is needed here
    _, ent2uni = uniprot_mapper.json_to_dict()

    res = []
    for _pat_id, cnv_by_type in patients.items():
        # only existing keys are reassigned, so the dict size is stable
        # while we iterate over it
        for cnv_type, ent_ids in cnv_by_type.items():
            mapped = []
            for eid in ent_ids:
                if eid in ent2uni:
                    mapped.extend(ent2uni[eid])
            cnv_by_type[cnv_type] = mapped
    return res
def preprocess_seq_patient_data(self, GE, all_ent_ids):
    """Keep only the entries of ``GE`` whose id is present in ``ent2uni``.

    A boolean mask with one flag per element of ``all_ent_ids`` is built
    (True when the id is a key of ``ent2uni``) and used to index ``GE``.
    NOTE(review): this presumably assumes the first axis of ``GE`` is
    aligned with ``all_ent_ids`` -- confirm against the caller.

    Returns:
        A tuple ``(GE[mask], uni_ids)`` where ``uni_ids`` is an object
        array holding the ``ent2uni`` entry of every id that was found,
        in the order the ids appear in ``all_ent_ids``.
    """
    # only the second mapper (presumably entrez -> uniprot) is needed here
    _, ent2uni = uniprot_mapper.json_to_dict()

    # one flag per input id: is it known to the mapper?
    found_mask = [eid in ent2uni for eid in all_ent_ids]
    kept_ent_ids = np.array(
        [eid for eid, is_found in zip(all_ent_ids, found_mask) if is_found]
    )

    mapped_entries = []
    for eid in kept_ent_ids:
        mapped_entries.append(ent2uni[eid])
    uni_ids = np.array(mapped_entries, dtype=object)

    log('uni_ids:', len(uni_ids))
    log('miss_ent_ids:', len(all_ent_ids) - sum(found_mask))

    # drop the entries whose id could not be mapped
    return GE[found_mask], uni_ids
def preprocess_som_patient_data(self, patients):
    """Build one record per patient with its ids mapped through ``ent2uni``.

    ``patients`` is iterated as ``pat_id -> ids``. For each patient the
    ids that are keys of ``ent2uni`` are expanded into a flat list of the
    corresponding entries; unknown ids are dropped.

    Every patient is kept, even when its mapped list is empty, so the
    'removed patients' count that gets logged is always 0.

    Returns:
        A list of ``{'pat_id': ..., 'mutated_nodes': [...]}`` dicts in the
        iteration order of ``patients``.
    """
    # only the second mapper (presumably entrez -> uniprot) is needed here
    _, ent2uni = uniprot_mapper.json_to_dict()

    def _map_ids(ent_ids):
        # flatten the mapper entries of every known id into a single list
        mapped = []
        for eid in ent_ids:
            if eid in ent2uni:
                mapped.extend(ent2uni[eid])
        return mapped

    res = [
        {'pat_id': pat_id, 'mutated_nodes': _map_ids(ent_ids)}
        for pat_id, ent_ids in patients.items()
    ]

    # nothing is filtered out in this function, hence the constant zero
    num_empty = 0
    log('removed patients:', num_empty)
    return res
def preprocess_patient_data(patients):
    """Map each patient's ids through ``ent2uni``, skipping unmapped patients.

    ``patients`` is iterated as ``pat_id -> ids``. For each patient the
    ids that are keys of ``ent2uni`` are expanded into a flat list of the
    corresponding entries. Patients whose resulting list is empty are
    left out of the output.

    Returns:
        A list of ``{'pat_id': ..., 'mutated_nodes': [...]}`` dicts, one
        per patient with at least one mapped id, in the iteration order
        of ``patients``.
    """
    # only the second mapper (presumably entrez -> uniprot) is needed here
    _, ent2uni = uniprot_mapper.json_to_dict()

    res = []
    for pat_id, ent_ids in patients.items():
        mapped = []
        for eid in ent_ids:
            if eid in ent2uni:
                mapped.extend(ent2uni[eid])

        # patients without a single mapped id are not reported
        if not mapped:
            continue

        res.append({
            'pat_id': pat_id,
            'mutated_nodes': mapped,
        })
    return res