Ejemplo n.º 1
0
    def handle_seqres_di(info_dict):
        """Normalise the SEQRES-related columns of ``info_dict`` in place.

        ``info_dict`` maps column names to lists that are indexed in
        parallel.  The column names are read from
        ``DEFAULT_COLS['SEQRES_COL']``:

        * index 0   -- sequence of strand ids; collapsed to one id per run
          of consecutive equal ids;
        * index 1-3 -- residue columns; every element is first converted with
          ``Unit.MultiToOne().multi_letter_convert_to_one_letter`` and the
          results joined into one string, which is then split per run;
        * index 4-7 -- split per run, each piece joined with ``';'``;
        * index 8   -- duplicates dropped, order of first occurrence kept.

        The per-run splitting is delegated to ``MMCIF2Dfrm.get_index``
        (presumably it returns the part of the values that belongs to the
        k-th run -- confirm against its definition).

        Nothing is returned; ``info_dict`` is modified.
        """
        seqres_cols = DEFAULT_COLS['SEQRES_COL']
        residue_cols = seqres_cols[1:4]
        converter = Unit.MultiToOne()

        # Step 1: multi-letter residue codes -> one string per entry.
        for row in range(len(info_dict[residue_cols[0]])):
            for col in residue_cols:
                one_letter = [
                    converter.multi_letter_convert_to_one_letter(code)
                    for code in info_dict[col][row]
                ]
                info_dict[col][row] = ''.join(one_letter)

        strand_key = seqres_cols[0]
        model_key = seqres_cols[8]

        # Step 2: find the runs of equal strand ids and regroup every other
        # column according to those runs.
        for row in range(len(info_dict[strand_key])):
            per_residue_ids = info_dict[strand_key][row]
            current = per_residue_ids[0]
            run_starts = [0]
            run_ids = [current]
            for pos, strand in enumerate(per_residue_ids):
                if strand != current:
                    current = strand
                    run_starts.append(pos)
                    run_ids.append(strand)
            info_dict[strand_key][row] = run_ids

            run_numbers = range(len(run_starts))

            for col in residue_cols:
                whole = info_dict[col][row]
                info_dict[col][row] = [
                    MMCIF2Dfrm.get_index(run_starts, whole, k)
                    for k in run_numbers
                ]

            for col in seqres_cols[4:8]:
                whole = info_dict[col][row]
                info_dict[col][row] = [
                    ';'.join(MMCIF2Dfrm.get_index(run_starts, whole, k))
                    for k in run_numbers
                ]

            # Order-preserving de-duplication of the model column.
            distinct_models = []
            for model in info_dict[model_key][row]:
                if model in distinct_models:
                    continue
                distinct_models.append(model)
            info_dict[model_key][row] = distinct_models
Ejemplo n.º 2
0
    def get_data_from_mmcif(self, path_list, outputPath=False):
        """Collect information from mmCIF files into one DataFrame.

        For every path in ``path_list`` that ends with ``'cif'`` the columns
        listed in ``MMCIF_unit.CONFIG`` (COMMON, BIOASS, ENTITY, TYPE, SEQRES
        and LIGAND groups) are read through ``MMCIF_unit.get_mmcif_info``
        (presumably it appends one value per requested column to the shared
        dict -- confirm against its definition).  The raw values are then
        post-processed:

        * SEQRES residue columns are converted to one-letter strings and all
          SEQRES columns are regrouped per run of equal strand ids;
        * ``metalc`` connections between a ligand in
          ``CONFIG['LIGAND_LIST']`` and one of the first 21 keys of
          ``MMCIF_unit.SEQ_DICT`` are grouped per chain into the two
          ``CONFIG['METAL_LIGAND_COL']`` columns (``NaN`` when absent);
        * derived columns are added: ``initial_version_time``,
          ``newest_version_time``, ``mutation_num``, ``resolution``,
          ``pdb_contain_chain_type``, ``UNK_ALL_IN_CHAIN``,
          ``contains_unk_in_chain_pdb`` and ``pdb_type_MMCIF``;
        * ``CONFIG['COMMON_COL'][1]`` is renamed to ``method``.

        Args:
            path_list: Iterable of file path strings.  The pdb id is taken
                as ``path[-8:-4]``, i.e. the four characters in front of a
                four-character suffix such as ``.cif``.
            outputPath: Destination handed to ``self.file_o``.  When falsy
                (the default) nothing is written.  When the path already
                exists the frame is appended without a header, otherwise it
                is written with ``self.file_o``'s defaults.

        Returns:
            pandas.DataFrame: one row per processed file.

        Raises:
            IndexError: if a strand-id entry of a processed file is empty.
        """
        config = MMCIF_unit.CONFIG
        common_cols = config['COMMON_COL']
        seqres_cols = config['SEQRES_COL']
        ligand_cols = config['LIGAND_COL']

        info_dict = defaultdict(list)
        for path in path_list:
            if path.endswith('cif'):
                print(path)
                info_dict['pdb_id'].append(path[-8:-4])
                # The column lists are rebuilt for every call so that the
                # helper always receives fresh list objects.
                MMCIF_unit.get_mmcif_info(
                    common_cols +
                    config['BIOASS_COL'] +
                    config['ENTITY_COL'] +
                    config['TYPE_COL'] +
                    seqres_cols +
                    ligand_cols,
                    common_cols[1:], info_dict, path)

        # Deal with residues in SEQRES_COL: multi-letter codes -> one string.
        residue_cols = seqres_cols[1:4]
        mto_tool = Unit.MultiToOne()
        for i in range(len(info_dict[residue_cols[0]])):
            for col in residue_cols:
                info_dict[col][i] = ''.join(
                    mto_tool.multi_letter_convert_to_one_letter(res)
                    for res in info_dict[col][i])

        def get_index(starts, values, k):
            # Part of ``values`` covered by the k-th run; the last run
            # extends to the end of ``values``.
            if k + 1 < len(starts):
                return values[starts[k]:starts[k + 1]]
            return values[starts[k]:]

        def split_runs(ids):
            # Collapse ``ids`` into runs of consecutive equal values and
            # return (one id per run, start position of every run).
            # Raises IndexError when ``ids`` is empty.
            run_ids = [ids[0]]
            run_starts = [0]
            for pos, current in enumerate(ids):
                if current != run_ids[-1]:
                    run_ids.append(current)
                    run_starts.append(pos)
            return run_ids, run_starts

        # Deal with SEQRES_COL: regroup every column per strand-id run.
        strand_key = seqres_cols[0]
        for i in range(len(info_dict[strand_key])):
            strand_id_li, strand_id_index = split_runs(info_dict[strand_key][i])
            info_dict[strand_key][i] = strand_id_li
            run_numbers = range(len(strand_id_index))

            for col in residue_cols:
                values = info_dict[col][i]
                info_dict[col][i] = [
                    get_index(strand_id_index, values, j) for j in run_numbers
                ]

            for col in seqres_cols[4:]:
                values = info_dict[col][i]
                info_dict[col][i] = [
                    ';'.join(get_index(strand_id_index, values, j))
                    for j in run_numbers
                ]

        # Deal with LIGAND_COL
        metal_li = config['LIGAND_LIST']
        metal_chain_col = config['METAL_LIGAND_COL'][0]
        metal_info_col = config['METAL_LIGAND_COL'][1]
        aa_li = list(MMCIF_unit.SEQ_DICT)[:21]

        for i in range(len(info_dict[ligand_cols[0]])):
            if not info_dict[ligand_cols[0]][i]:
                info_dict[metal_chain_col].append(np.nan)
                info_dict[metal_info_col].append(np.nan)
                continue

            # One tuple per connection record, fields ordered as LIGAND_COL.
            records = zip(*(info_dict[col][i] for col in ligand_cols))
            metal_conns = [rec for rec in records if rec[0] == 'metalc']

            # Field 1 is the ligand, field 2 the residue: the chain id is
            # taken from field 4 (_struct_conn.ptnr2_auth_asym_id per the
            # original note).
            metal_ligand_info = [
                (rec[4], rec[1], rec[5], rec[2], rec[6])
                for rec in metal_conns
                if rec[1] in metal_li and rec[2] in aa_li
            ]
            # Field 2 is the ligand, field 1 the residue: the chain id is
            # taken from field 3 (_struct_conn.ptnr1_auth_asym_id per the
            # original note).
            metal_ligand_info += [
                (rec[3], rec[2], rec[6], rec[1], rec[5])
                for rec in metal_conns
                if rec[2] in metal_li and rec[1] in aa_li
            ]

            if not metal_ligand_info:
                info_dict[metal_chain_col].append(np.nan)
                info_dict[metal_info_col].append(np.nan)
                continue

            # Stable sort so records of the same chain become adjacent.
            metal_ligand_info.sort(key=lambda rec: rec[0])
            chain_ids, chain_starts = split_runs(
                [rec[0] for rec in metal_ligand_info])
            details = [rec[1:] for rec in metal_ligand_info]

            info_dict[metal_chain_col].append(chain_ids)
            info_dict[metal_info_col].append([
                get_index(chain_starts, details, j)
                for j in range(len(chain_starts))
            ])

        df = pd.DataFrame(info_dict)

        # Deal with the date of structure: first and last entry of the
        # COMMON_COL[0] values.
        date_col = common_cols[0]
        df['initial_version_time'] = df.apply(
            lambda row: row[date_col][0], axis=1)
        df['newest_version_time'] = df.apply(
            lambda row: row[date_col][-1], axis=1)

        # Deal with the mutations: '?' counts as no mutation, otherwise the
        # number of comma-separated items.
        def muta_count(x):
            return x.count(',') + 1 if x != '?' else 0

        df['mutation_num'] = df.apply(
            lambda row: [muta_count(m) for m in row['_entity.pdbx_mutation']],
            axis=1)

        # Deal with the resolution: take COMMON_COL[3]; when that value is a
        # float, fall back to COMMON_COL[2].
        df['resolution'] = df.apply(lambda row: row[common_cols[3]], axis=1)
        df['resolution'] = df.apply(
            lambda row: row[common_cols[2]]
            if isinstance(row['resolution'], float) else row['resolution'],
            axis=1)

        def as_list(value):
            # A string cell is parsed as JSON after swapping single quotes
            # for double quotes; any other value is used unchanged.
            if isinstance(value, str):
                return json.loads(value.replace('\'', '"'))
            return value

        # Deal with chain type
        chain_type_dict = config['CHAIN_TYPE_DICT']

        def chain_type_of(ele):
            return chain_type_dict.get(ele, 'other')

        df['pdb_contain_chain_type'] = df.apply(
            lambda row: ','.join(
                sorted(set(map(chain_type_of,
                               as_list(row['_entity_poly.type']))))),
            axis=1)

        # Deal with UNK_ALL in chain: True when a sequence consists of '!'
        # only.
        def is_all_unk(seq):
            return len(seq) == seq.count('!')

        # The type check is made on the mon_id cell itself (the original
        # looked at '_entity_poly.type' instead).
        df['UNK_ALL_IN_CHAIN'] = df.apply(
            lambda row: [
                is_all_unk(seq)
                for seq in as_list(row['_pdbx_poly_seq_scheme.mon_id'])
            ],
            axis=1)

        # Deal with UNK_ALL in chains of a pdb: True when both True and
        # False occur among the chains.
        df['contains_unk_in_chain_pdb'] = df.apply(
            lambda row: len(set(row['UNK_ALL_IN_CHAIN'])) == 2, axis=1)

        # Add Info about pdb_type
        df['pdb_type_MMCIF'] = df.apply(
            lambda row: MMCIF_unit.checkEntityType(
                row['_entity_poly.type'], row['_entity_poly.pdbx_strand_id']),
            axis=1)

        # Change the columns; the other raw COMMON_COL columns stay in the
        # frame.
        df.rename(columns={common_cols[1]: 'method'}, inplace=True)

        # os.path.exists(False) would test file descriptor 0 (bool is an
        # int), so a missing output path has to be handled explicitly.
        if outputPath:
            if os.path.exists(outputPath):
                self.file_o(outputPath, df, mode='a+', header=False)
            else:
                self.file_o(outputPath, df)
        return df