def handle_seqres_di(info_dict):
    """Normalise the SEQRES-related columns of ``info_dict`` in place.

    Three passes over the dict (whose values are per-PDB lists):

    1. Collapse every multi-letter residue code in the three residue columns
       (``SEQRES_COL[1:4]``) into a one-letter string via ``Unit.MultiToOne``.
    2. Split each per-PDB flat list by strand id (``SEQRES_COL[0]``): record
       the index where each new strand starts, keep only the unique strand
       ids, and re-slice columns ``SEQRES_COL[1:8]`` into per-strand chunks
       (columns 4:8 are additionally ``';'``-joined per strand) using
       ``MMCIF2Dfrm.get_index``.
    3. De-duplicate the coordinate-model column (``SEQRES_COL[8]``),
       preserving first-seen order.

    :param info_dict: mutable mapping of mmCIF column name -> list of
        per-PDB values; modified in place, nothing is returned.
    """
    seqres_cols = DEFAULT_COLS['SEQRES_COL']
    residue_cols = seqres_cols[1:4]
    converter = Unit.MultiToOne()

    # Pass 1: multi-letter residue codes -> one-letter sequence strings.
    for idx in range(len(info_dict[residue_cols[0]])):
        for col in residue_cols:
            letters = (converter.multi_letter_convert_to_one_letter(res)
                       for res in info_dict[col][idx])
            info_dict[col][idx] = ''.join(letters)

    strand_key = seqres_cols[0]
    model_key = seqres_cols[8]

    # Pass 2 + 3: split per strand id, then de-duplicate coordinate models.
    for idx in range(len(info_dict[strand_key])):
        strand_ids = info_dict[strand_key][idx]
        current = strand_ids[0]
        boundaries = [0]          # start index of each strand run
        unique_ids = [current]    # one entry per strand run
        for pos, value in enumerate(strand_ids):
            if value != current:
                current = value
                boundaries.append(pos)
                unique_ids.append(current)
        info_dict[strand_key][idx] = unique_ids

        # Residue columns: one slice per strand.
        for col in seqres_cols[1:4]:
            info_dict[col][idx] = [
                MMCIF2Dfrm.get_index(boundaries, info_dict[col][idx], k)
                for k in range(len(boundaries))
            ]
        # Auxiliary columns: one ';'-joined string per strand.
        for col in seqres_cols[4:8]:
            info_dict[col][idx] = [
                ';'.join(
                    MMCIF2Dfrm.get_index(boundaries, info_dict[col][idx], k))
                for k in range(len(boundaries))
            ]

        # Keep first occurrence of each coordinate model, in original order.
        seen = []
        for ele in info_dict[model_key][idx]:
            if ele not in seen:
                seen.append(ele)
        info_dict[model_key][idx] = seen
def get_data_from_mmcif(self, path_list, outputPath=False):
    '''Parse mmCIF files and assemble per-PDB annotations into a DataFrame.

    Raw-key -> derived-column mapping (from the original author):
    {
        '_pdbx_audit_revision_history.revision_date': ['initial_version_time', 'newest_version_time'], # sometimes not a list
        '_entity.pdbx_mutation': ['mutation_num', 'mutation_content'], # sometimes not a list
        '_entity.id': ['entity_id_aidMuta'], # sometimes not a list
        ['_em_3d_reconstruction.resolution','_refine.ls_d_res_high']: ['resolution'], # not a lists
        '_exptl.method': ['method'], # not a list
    }

    :param path_list: iterable of file paths; only paths ending in 'cif'
        are parsed (pdb_id is taken from the 4 chars before the extension).
    :param outputPath: optional output target handed to ``self.file_o``.
        When falsy (the default) nothing is written and the DataFrame is
        only returned.  (Fix: previously the default ``False`` was passed
        to ``os.path.exists``/``file_o``, which treats a bool as fd 0.)
    :return: ``pandas.DataFrame`` with one row per parsed mmCIF file.
    '''
    info_dict = defaultdict(list)
    for path in path_list:
        if path.endswith('cif'):
            print(path)
            info_dict['pdb_id'].append(path[-8:-4])
            MMCIF_unit.get_mmcif_info(
                MMCIF_unit.CONFIG['COMMON_COL'] +
                MMCIF_unit.CONFIG['BIOASS_COL'] +
                MMCIF_unit.CONFIG['ENTITY_COL'] +
                MMCIF_unit.CONFIG['TYPE_COL'] +
                MMCIF_unit.CONFIG['SEQRES_COL'] +
                MMCIF_unit.CONFIG['LIGAND_COL'],
                MMCIF_unit.CONFIG['COMMON_COL'][1:], info_dict, path)

    # Deal with Residues in SEQRES_COL: collapse multi-letter residue
    # codes into one-letter sequence strings.
    resides_col_li = MMCIF_unit.CONFIG['SEQRES_COL'][1:4]
    mtoTool = Unit.MultiToOne()
    for i in range(len(info_dict[resides_col_li[0]])):
        for resides_col in resides_col_li:
            info_dict[resides_col][i] = ''.join(
                mtoTool.multi_letter_convert_to_one_letter(j)
                for j in info_dict[resides_col][i])

    def get_index(x, y, z):
        # Slice y between consecutive boundary indices in x; the last
        # (or only) segment runs to the end of y.
        return y[x[z]:x[z + 1]] if len(x) != 1 and z + 1 < len(x) else y[x[z]:]

    # Deal with SEQRES_COL: split each per-PDB flat list by strand id.
    pdbx_poly_key = MMCIF_unit.CONFIG['SEQRES_COL'][0]
    for i in range(len(info_dict[pdbx_poly_key])):
        strand_id_index = [0]
        li = info_dict[pdbx_poly_key][i]
        save_id = li[0]
        strand_id_li = [save_id]
        for j in range(len(li)):
            if li[j] != save_id:
                save_id = li[j]
                strand_id_index.append(j)
                strand_id_li.append(save_id)
        info_dict[pdbx_poly_key][i] = strand_id_li
        # Residue columns: one slice per strand.
        for col in MMCIF_unit.CONFIG['SEQRES_COL'][1:4]:
            info_dict[col][i] = [
                get_index(strand_id_index, info_dict[col][i], j)
                for j in range(len(strand_id_index))
            ]
        # Remaining SEQRES columns: one ';'-joined string per strand.
        for col in MMCIF_unit.CONFIG['SEQRES_COL'][4:]:
            info_dict[col][i] = [
                ';'.join(get_index(strand_id_index, info_dict[col][i], j))
                for j in range(len(strand_id_index))
            ]

    # Deal with LIGAND_COL: extract metal-ligand contacts ('metalc' rows
    # from _struct_conn) and group them by chain id.
    ligand_col_list = MMCIF_unit.CONFIG['LIGAND_COL']
    metal_li = MMCIF_unit.CONFIG['LIGAND_LIST']
    for i in range(len(info_dict[ligand_col_list[0]])):
        if not info_dict[ligand_col_list[0]][i]:
            info_dict[MMCIF_unit.CONFIG['METAL_LIGAND_COL'][0]].append(np.nan)
            info_dict[MMCIF_unit.CONFIG['METAL_LIGAND_COL'][1]].append(np.nan)
            continue
        ligand_col_tp = tuple(info_dict[col][i] for col in ligand_col_list)
        ligand_col_zip_li = list(zip(*ligand_col_tp))
        # First 21 keys of SEQ_DICT are assumed to be the amino acids
        # — TODO confirm against SEQ_DICT's definition.
        aa_li = list(MMCIF_unit.SEQ_DICT.keys())[:21]
        metal_ligand_info = list(
            filter(lambda x: x[0] == 'metalc', ligand_col_zip_li))
        # Metal on ptnr1 side, residue on ptnr2 side;
        # chain_id: _struct_conn.ptnr2_auth_asym_id [4]
        sub_metal_ligand_info_1 = filter(
            lambda x: x[1] in metal_li and x[2] in aa_li, metal_ligand_info)
        # Metal on ptnr2 side, residue on ptnr1 side;
        # chain_id: _struct_conn.ptnr1_auth_asym_id [3]
        sub_metal_ligand_info_2 = filter(
            lambda x: x[2] in metal_li and x[1] in aa_li, metal_ligand_info)
        # Normalise both orientations to
        # (chain_id, metal, metal_seq, residue, residue_seq).
        new_metal_ligand_info = []
        for tp in sub_metal_ligand_info_1:
            new_metal_ligand_info.append((tp[4], tp[1], tp[5], tp[2], tp[6]))
        for tp in sub_metal_ligand_info_2:
            new_metal_ligand_info.append((tp[3], tp[2], tp[6], tp[1], tp[5]))
        new_metal_ligand_info.sort(key=lambda x: x[0])
        try:
            save_id = new_metal_ligand_info[0][0]
        except IndexError:
            # No metal-ligand contacts in this entry.
            info_dict[MMCIF_unit.CONFIG['METAL_LIGAND_COL'][0]].append(np.nan)
            info_dict[MMCIF_unit.CONFIG['METAL_LIGAND_COL'][1]].append(np.nan)
            continue
        strand_id_li = [save_id]
        strand_id_index = [0]
        for j in range(len(new_metal_ligand_info)):
            if new_metal_ligand_info[j][0] != save_id:
                save_id = new_metal_ligand_info[j][0]
                strand_id_index.append(j)
                strand_id_li.append(save_id)
        info_dict[MMCIF_unit.CONFIG['METAL_LIGAND_COL'][0]].append(
            strand_id_li)
        info_dict[MMCIF_unit.CONFIG['METAL_LIGAND_COL'][1]].append([
            get_index(strand_id_index,
                      [ele[1:] for ele in new_metal_ligand_info], j)
            for j in range(len(strand_id_index))
        ])

    df = pd.DataFrame(info_dict)
    # Deal with the date of structure: first/last revision dates.
    df['initial_version_time'] = df.apply(
        lambda x: x[MMCIF_unit.CONFIG['COMMON_COL'][0]][0], axis=1)
    df['newest_version_time'] = df.apply(
        lambda x: x[MMCIF_unit.CONFIG['COMMON_COL'][0]][-1], axis=1)

    # Deal with the mutations: '?' means none, else count comma-separated.
    def muta_count(x):
        return x.count(',') + 1 if x != '?' else 0

    df['mutation_num'] = df.apply(
        lambda x: [muta_count(i) for i in x['_entity.pdbx_mutation']],
        axis=1)

    # Deal with the resolution: prefer COMMON_COL[3]; fall back to
    # COMMON_COL[2] when the first value is a missing float (NaN).
    df['resolution'] = df.apply(
        lambda x: x[MMCIF_unit.CONFIG['COMMON_COL'][3]], axis=1)
    df['resolution'] = df.apply(
        lambda x: x[MMCIF_unit.CONFIG['COMMON_COL'][2]]
        if isinstance(x['resolution'], float) else x['resolution'],
        axis=1)

    # Deal with chain type. The stored value may be either a real list or
    # its single-quoted string repr; the json.loads branch handles the latter.
    def get_chainType_fun(ele):
        return MMCIF_unit.CONFIG['CHAIN_TYPE_DICT'].get(ele, 'other')

    df['pdb_contain_chain_type'] = df.apply(
        lambda x: ','.join(
            sorted(
                set(
                    map(get_chainType_fun,
                        json.loads(x['_entity_poly.type'].replace('\'', '"'))))))
        if isinstance(x['_entity_poly.type'], str) else ','.join(
            sorted(set(map(get_chainType_fun, x['_entity_poly.type'])))),
        axis=1)

    # Deal with UNK_ALL in chain: True when a chain is entirely '!' markers.
    def get_unk_fun(ele):
        return len(ele) == ele.count('!')

    df['UNK_ALL_IN_CHAIN'] = df.apply(
        lambda x: list(
            map(get_unk_fun,
                json.loads(x['_pdbx_poly_seq_scheme.mon_id'].replace(
                    '\'', '"'))))
        if isinstance(x['_entity_poly.type'], str) else list(
            map(get_unk_fun, x['_pdbx_poly_seq_scheme.mon_id'])),
        axis=1)
    # Deal with UNK_ALL in chains of a pdb: mixed True/False per PDB.
    df['contains_unk_in_chain_pdb'] = df.apply(
        lambda x: len(set(x['UNK_ALL_IN_CHAIN'])) == 2, axis=1)
    # Add Info about pdb_type
    df['pdb_type_MMCIF'] = df.apply(
        lambda x: MMCIF_unit.checkEntityType(x['_entity_poly.type'],
                                             x['_entity_poly.pdbx_strand_id']),
        axis=1)
    # Change the columns
    df.rename(columns={MMCIF_unit.CONFIG['COMMON_COL'][1]: 'method'},
              inplace=True)

    # Only write when an output target was supplied; append (without a
    # header) if the file already exists so repeated calls accumulate rows.
    if outputPath:
        if os.path.exists(outputPath):
            self.file_o(outputPath, df, mode='a+', header=False)
        else:
            self.file_o(outputPath, df)
    return df