def preprocess_som_patient_data(self, patients):
    """Translate each patient's mutated gene ids through the ``ent2uni`` mapping.

    Args:
        patients: Mapping of patient id to an iterable of gene ids (the keys
            of the ``ent2uni`` mapping returned by
            ``uniprot_mapper.json_to_dict()``).

    Returns:
        A list with one ``{'pat_id': ..., 'mutated_nodes': [...]}`` dict per
        patient, in the iteration order of ``patients``. ``mutated_nodes`` is
        the flattened list of mapped ids for every gene id found in
        ``ent2uni``; gene ids missing from the mapping are skipped. Patients
        with no mappable gene are kept, with an empty ``mutated_nodes`` list.
    """
    # Only the second mapping returned by json_to_dict() is needed here.
    _, ent2uni = uniprot_mapper.json_to_dict()

    res = []
    for pat_id, ent_ids in patients.items():
        # Each ent2uni value is itself iterable, so flatten all of them.
        uni_ids = [
            uid
            for eid in ent_ids
            if eid in ent2uni
            for uid in ent2uni[eid]
        ]
        res.append({
            'pat_id': pat_id,
            'mutated_nodes': uni_ids,
        })

    # No patient is ever dropped by this function, so the reported count is
    # always zero; the log call is kept so existing log output is unchanged.
    num_empty = 0
    log('removed patients:', num_empty)
    return res
def preprocess_seq_patient_data(self, GE, all_ent_ids):
    """Drop the entries of ``GE`` whose gene id has no ``ent2uni`` mapping.

    Args:
        GE: Object indexed with a boolean list holding one flag per element
            of ``all_ent_ids`` (presumably an expression matrix whose first
            axis follows ``all_ent_ids`` -- TODO confirm with the caller).
        all_ent_ids: Sized, re-iterable collection of gene ids.

    Returns:
        A ``(GE, uni_ids)`` tuple: ``GE`` indexed by the "id is mapped" mask,
        and a NumPy array built from the ``ent2uni`` value of every kept id,
        in the original order.
    """
    # Only the second mapping returned by json_to_dict() is used.
    _, ent2uni = uniprot_mapper.json_to_dict()

    # Build the boolean mask and the list of kept ids in a single pass.
    keep_mask = []
    kept_ids = []
    for candidate in all_ent_ids:
        is_mapped = candidate in ent2uni
        keep_mask.append(is_mapped)
        if is_mapped:
            kept_ids.append(candidate)

    ent_ids = np.array(kept_ids)
    # NOTE(review): if the ent2uni values are lists of differing lengths,
    # np.array() may warn or raise depending on the NumPy version -- confirm.
    mapped = [ent2uni[eid] for eid in ent_ids]
    uni_ids = np.array(mapped)

    log('uni_ids:', len(uni_ids))
    log('miss_ent_ids:', len(all_ent_ids) - len(kept_ids))

    # prune genes whose mapped id is not found
    pruned = GE[keep_mask]
    return pruned, uni_ids
def patient_entrez_to_uniprot():
    """Pair each patient row of the KIRC file with a UniProt lookup result.

    Reads ``KIRC_PATH`` (comma-separated, first line skipped as a header) and
    keeps the rows whose second column parses to a non-zero integer. For each
    kept row, column 1 is used as the Entrez gene id and column 2 as the
    patient id. The gene id is looked up first in the ``entrez_to_uni``
    mapping returned by ``um.json_to_dict()`` and, if absent there, in the
    tab-separated file ``UNIPROT_ENTREZ_MAP_FPATH`` (column 0 = UniProt id,
    column 1 = Entrez id, first match wins). Rows with no match in either
    source print ``"none"`` and are left out of the result.

    Returns:
        A list of ``[patient, uni_prot]`` pairs, where ``uni_prot`` is a
        one-element list holding the lookup result.
        NOTE(review): the ``entrez_to_uni`` value is stored as-is, so if that
        mapping holds lists the element is itself a list, whereas the file
        fallback yields a single id string -- confirm what callers expect.

    Raises:
        ValueError: If the second column of a KIRC row is not an integer.
        IndexError: If a row has fewer columns than are read from it.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(KIRC_PATH, 'r', newline='') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(csv_reader, None)  # skip header; tolerate an empty file
        gene_patient_rows = [
            list(row) for row in csv_reader if int(row[1]) != 0
        ]

    # Fallback Entrez -> UniProt table. setdefault keeps the first UniProt id
    # seen for each Entrez id, matching the previous list.index() behaviour
    # without a linear scan per patient row.
    fallback_entrez_to_uni = {}
    with open(UNIPROT_ENTREZ_MAP_FPATH, 'r', newline='') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        next(csv_reader, None)  # skip header; tolerate an empty file
        for row in csv_reader:
            fallback_entrez_to_uni.setdefault(row[1], row[0])

    # Only the Entrez -> UniProt direction is needed.
    _, entrez_to_uni = um.json_to_dict()

    patient_uniprot_list = []
    for row in gene_patient_rows:
        entrez_id, patient = row[1], row[2]
        try:
            uni_prot = [entrez_to_uni[entrez_id]]
        except KeyError:
            # Fall back on the mapping file, keyed by the same Entrez id.
            # (The earlier code looked up the patient id here by mistake.)
            if entrez_id not in fallback_entrez_to_uni:
                print("none")
                continue
            uni_prot = [fallback_entrez_to_uni[entrez_id]]
        patient_uniprot_list.append([patient, uni_prot])
    return patient_uniprot_list