Ejemplo n.º 1
0
def get_buried_residues(structure, cutoff, tmp_dir, dssp_path):
    """Finds buried residues by using relative solvent accessible surface area.

    Every model of `structure` is first flattened into a single model
    (the chains of later models are given unused one-character IDs), the
    result is written to a temporary PDB file inside `tmp_dir`, read back
    in and handed to DSSP. The temporary file is always removed again,
    even if one of those steps fails.

    NOTE: `structure` is modified in place (chain IDs and model IDs are
    rewritten while flattening).

    Parameters
    ----------
    structure
        Structure to analyse; must expose `.id` and iterate over models
        that iterate over chains (presumably a Bio.PDB structure).
    cutoff
        Residues whose DSSP accessibility value is `<= cutoff` are flagged
        as buried.
    tmp_dir
        Directory in which the temporary `<structure id>.pdb` is written.
    dssp_path
        Passed to `Bio.PDB.DSSP` as its `dssp` argument.

    Returns
    -------
    list of list
        One row per amino acid residue reported by DSSP:
        `[structure_id, model_id, chain_id, residue_number, accessibility,
        normalized_CA_bfactor_or_None, buried_flag]`, where the model and
        chain IDs are the ones from before flattening and `buried_flag` is
        1 or 0. An empty list is returned when there are more chains than
        available one-character chain IDs.

    Raises
    ------
    KeyError
        If DSSP reports a residue whose (model, chain) pair was not
        recorded while flattening; the offending ID and the map are
        printed before re-raising.
    """
    # get structure id
    structure_id = structure.id

    all_letters = set(string.ascii_uppercase) | set(string.ascii_lowercase)

    # flatten models into a single model due to limitations of DSSP
    # id_map: (flattened model id, flattened chain id) -> (original ids)
    id_map = {}
    for k, model in enumerate(structure):
        if k == 0:
            # the first model keeps its chain IDs (blank becomes 'A')
            used_letters = set()
            for chain in model:
                if chain.get_id() == ' ':
                    chain.id = 'A'
                id_map[(model.id, chain.id)] = (model.id, chain.id)
                used_letters.add(chain.id)
            new_model = model.id
        else:
            for chain in model:
                left_over = all_letters - used_letters
                if not left_over:
                    # if run out of chain letters just return nothing
                    return []
                new_letter = left_over.pop()
                used_letters.add(new_letter)
                old_letter = chain.id
                chain.id = new_letter
                id_map[(new_model, new_letter)] = (model.id, old_letter)

                # add numbers if there is not more letters left
                if not (all_letters - used_letters):
                    all_letters.update(
                        set(string.digits) | set(string.punctuation))

            model.id = new_model

    tmp_path = os.path.join(tmp_dir, structure_id + '.pdb')
    try:
        # save new structure to tmp dir
        io = Bio.PDB.PDBIO()
        io.set_structure(structure)
        io.save(tmp_path)

        # read in tmp structure
        tmp_structure = utils.read_structure(tmp_path, structure_id,
                                             quiet=True)

        # find the solvent accessibility for residues
        dssp_results = Bio.PDB.DSSP(tmp_structure[0], tmp_path,
                                    dssp=dssp_path)

        # get bfactors for each amino acid residue that has a CA atom
        bfacs = [
            r['CA'].get_bfactor() for r in tmp_structure.get_residues()
            if Bio.PDB.is_aa(r) and 'CA' in r.child_dict
        ]
        mean_bfac = np.mean(bfacs)
        # NOTE: if every CA B-factor is identical std_bfac is 0 and the
        # normalization below divides by zero.
        std_bfac = np.std(bfacs)

        # format output
        output = []
        for result in dssp_results:
            # assumes each DSSP entry holds the residue at index 0 and the
            # relative accessibility at index 3 -- TODO confirm against the
            # installed Biopython version
            residue = result[0]
            accessibility = result[3]

            # skip if not an amino acid
            if not Bio.PDB.is_aa(residue):
                continue

            # translate the flattened (model, chain) back to the original
            full_id = residue.get_full_id()
            try:
                orig_model_chain = list(id_map[full_id[1:3]])
            except KeyError:
                # parenthesized single argument: valid on Python 2 and 3
                print('%s %s' % (full_id, id_map))
                raise
            # fix missing letter for homology models
            if orig_model_chain[1] == ' ':
                orig_model_chain[1] = 'A'

            # z-score of the CA B-factor, when the residue has a CA atom
            if 'CA' in residue.child_dict:
                norm_bfactor = (residue['CA'].get_bfactor() -
                                mean_bfac) / std_bfac
            else:
                norm_bfactor = None

            # record whether it was buried
            line = [structure_id] + orig_model_chain + [
                residue.id[1], accessibility, norm_bfactor
            ]
            line.append(1 if accessibility <= cutoff else 0)
            output.append(line)
    finally:
        # delete tmp file, also when any of the steps above failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return output
Ejemplo n.º 2
0
def summarize_residues(mutations, pdb_info, radius,
                       rASA, dssp, tmp_dir,
                       quiet=True):
    """Count buried/interface residues and mutations for each structure.

    For every structure in `pdb_info` that has at least one mutation, the
    structure is read, its buried residues and interface residues are
    determined, and for each tumor type the mutations falling on those
    residues are tallied (mutations on the same chain in another model, or
    on an already counted equivalent chain position, contribute zero).

    Parameters
    ----------
    mutations : dict
        Maps a structure ID to a sequence of 4-item records that unpack as
        (tumor type, residue, count, chain).
    pdb_info : dict
        Maps a structure ID to a dict holding the file path under 'path'
        and, under its remaining keys, lists of chain letters. The dict is
        not modified.
    radius
        Passed to `pstruct.get_interface_residues`.
    rASA
        Cutoff passed to `pstruct.get_buried_residues`.
    dssp
        DSSP path passed to `pstruct.get_buried_residues`.
    tmp_dir
        Temporary directory passed to `pstruct.get_buried_residues`.
    quiet : bool
        Passed to `utils.read_structure`.

    Returns
    -------
    list of list
        A header row followed by one row per (structure, tumor type). Data
        rows hold the first ten header columns; the three p-value columns
        are not filled in here.
    """
    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = [['structure', 'tumor type', '# buried residues',
               '# protein interface residues', '# nucleic acid interface residues',
               'total residues', '# buried mutations', '# protien interface mutations',
               '# nucleic acid interface mutations', 'total # mutations',
               'burial p-value', 'protein interface p-value',
               'nucleic acid interface p-value']]
    # only these model IDs are consulted when matching a mutated position
    # against the buried/interface residue sets
    checked_models = range(4)

    for structure_id in pdb_info:
        # skip structure if no mutations (checked before parsing the file)
        structure_mutations = mutations.get(structure_id, [])
        if not structure_mutations:
            continue

        # work on a shallow copy so the caller's pdb_info keeps its 'path'
        struct_info = dict(pdb_info[structure_id])
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            continue

        # make a list of all chain letters in structure
        struct_chains = [chain
                         for description in struct_info
                         for chain in struct_info[description]]
        annotated_chains = set(struct_chains)

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by their tumor type
        # ttype_ixs is a dictionary that contains
        # ttype as the keys and a list of relevant
        # indices as the values
        unique_ttypes = set(ttypes)
        ttype_ixs = {t: [i for i, ttype in enumerate(ttypes) if ttype == t]
                     for t in unique_ttypes}
        unique_ttypes = list(unique_ttypes)

        # obtain relevant info from structure; only the per-tumor-type
        # mutation counts are needed here
        (_, mut_res_mutation_counts, _, _) = pstruct.get_structure_info(
            structure, mchains, mres, mcount, struct_chains, ttype_ixs)

        # find buried residues
        # NOTE: this must stay ahead of get_interface_residues, as in the
        # original ordering -- get_buried_residues receives the same
        # structure object
        buried_res = pstruct.get_buried_residues(structure, rASA, tmp_dir, dssp)
        tmp_buried = [info
                      for info in buried_res
                      if info[2] in annotated_chains]
        total_res = len(tmp_buried)
        buried_res_info = {(info[1], info[2], info[3])
                           for info in tmp_buried
                           if info[-1] == 1}
        num_buried_res = len(buried_res_info)

        # find interface residues for proteins and nucleic acids
        interface_res = pstruct.get_interface_residues(structure, radius)
        interface_prot_info = {(res_id[1], res_id[2], res_id[3][1])
                               for res_id in interface_res
                               if (res_id[2] in annotated_chains) and interface_res[res_id][0]==1}
        interface_na_info = {(res_id[1], res_id[2], res_id[3][1])
                             for res_id in interface_res
                             if (res_id[2] in annotated_chains) and sum(interface_res[res_id][1:])>=1}
        num_interface_prot_res = len(interface_prot_info)
        num_interface_na_res = len(interface_na_info)

        # (chain, position) lookups restricted to the checked models;
        # built once per structure instead of once per mutated residue
        buried_sites = {(c, p) for m, c, p in buried_res_info
                        if m in checked_models}
        interface_prot_sites = {(c, p) for m, c, p in interface_prot_info
                                if m in checked_models}
        interface_na_sites = {(c, p) for m, c, p in interface_na_info
                              if m in checked_models}

        # iterate through each tumour type
        tmp_output = []
        for tumour in unique_ttypes:
            # draw information for the specific tumour type
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]

            # count total mutations in structure while
            # avoiding double counting due to same id and chain
            # being on multiple models
            models_seen_by_chain = {}
            total_mutations = 0
            total_buried_muts = 0
            total_interface_prot_muts, total_interface_na_muts = 0, 0
            banned_chains = set()
            for k in t_mut_res_mutation_counts:
                mutations_to_add = t_mut_res_mutation_counts[k]
                cur_model = k[1]
                cur_chain = k[2]
                cur_pos = k[3][1]
                cur_site = (cur_chain, cur_pos)

                # prevent double counting: same chain already seen in a
                # different model, or an equivalent chain position that
                # was already counted
                seen_models = models_seen_by_chain.get(cur_chain, set())
                if seen_models - {cur_model}:
                    mutations_to_add = 0
                if cur_site in banned_chains:
                    mutations_to_add = 0

                # add all equivalent chains to banned list
                equiv_chains = pstruct.find_eq_letters(struct_info, cur_chain)
                if equiv_chains is not None:
                    equiv_pos = set([(e, cur_pos) for e in equiv_chains])
                    banned_chains |= equiv_pos - {cur_site}
                else:
                    # no equivalence information: fall back to the current
                    # position only (previously equiv_pos was left unbound
                    # or stale from the preceding residue in this case)
                    equiv_pos = {cur_site}

                # add to total mutation count
                total_mutations += mutations_to_add

                # add buried residue mutation counts
                if not equiv_pos.isdisjoint(buried_sites):
                    total_buried_muts += mutations_to_add

                # add interface residue mutation counts
                if not equiv_pos.isdisjoint(interface_prot_sites):
                    total_interface_prot_muts += mutations_to_add
                if not equiv_pos.isdisjoint(interface_na_sites):
                    total_interface_na_muts += mutations_to_add

                # mark chains/models
                models_seen_by_chain.setdefault(cur_chain, set()).add(cur_model)

            tmp_output.append([structure_id, tumour, num_buried_res,
                               num_interface_prot_res, num_interface_na_res,
                               total_res, total_buried_muts,
                               total_interface_prot_muts, total_interface_na_muts,
                               total_mutations,
                               ])

        output.extend(tmp_output)
    return output
Ejemplo n.º 3
0
def get_buried_residues(structure, cutoff, tmp_dir, dssp_path):
    """Finds buried residues by using relative solvent accessible surface area.

    Every model of `structure` is first flattened into a single model
    (the chains of later models are given unused one-character IDs), the
    result is written to a temporary PDB file inside `tmp_dir`, read back
    in and handed to DSSP. The temporary file is always removed again,
    even if one of those steps fails.

    NOTE: `structure` is modified in place (chain IDs and model IDs are
    rewritten while flattening).

    Parameters
    ----------
    structure
        Structure to analyse; must expose `.id` and iterate over models
        that iterate over chains (presumably a Bio.PDB structure).
    cutoff
        Residues whose DSSP accessibility value is `<= cutoff` are flagged
        as buried.
    tmp_dir
        Directory in which the temporary `<structure id>.pdb` is written.
    dssp_path
        Passed to `Bio.PDB.DSSP` as its `dssp` argument.

    Returns
    -------
    list of list
        One row per amino acid residue reported by DSSP:
        `[structure_id, model_id, chain_id, residue_number, accessibility,
        normalized_CA_bfactor_or_None, buried_flag]`, where the model and
        chain IDs are the ones from before flattening and `buried_flag` is
        1 or 0. An empty list is returned when there are more chains than
        available one-character chain IDs.

    Raises
    ------
    KeyError
        If DSSP reports a residue whose (model, chain) pair was not
        recorded while flattening; the offending ID and the map are
        printed before re-raising.
    """
    # get structure id
    structure_id = structure.id

    all_letters = set(string.ascii_uppercase) | set(string.ascii_lowercase)

    # flatten models into a single model due to limitations of DSSP
    # id_map: (flattened model id, flattened chain id) -> (original ids)
    id_map = {}
    for k, model in enumerate(structure):
        if k == 0:
            # the first model keeps its chain IDs (blank becomes 'A')
            used_letters = set()
            for chain in model:
                if chain.get_id() == ' ':
                    chain.id = 'A'
                id_map[(model.id, chain.id)] = (model.id, chain.id)
                used_letters.add(chain.id)
            new_model = model.id
        else:
            for chain in model:
                left_over = all_letters - used_letters
                if not left_over:
                    # if run out of chain letters just return nothing
                    return []
                new_letter = left_over.pop()
                used_letters.add(new_letter)
                old_letter = chain.id
                chain.id = new_letter
                id_map[(new_model, new_letter)] = (model.id, old_letter)

                # add numbers if there is not more letters left
                if not (all_letters - used_letters):
                    all_letters.update(set(string.digits) | set(string.punctuation))

            model.id = new_model

    tmp_path = os.path.join(tmp_dir, structure_id+'.pdb')
    try:
        # save new structure to tmp dir
        io = Bio.PDB.PDBIO()
        io.set_structure(structure)
        io.save(tmp_path)

        # read in tmp structure
        tmp_structure = utils.read_structure(tmp_path, structure_id, quiet=True)

        # find the solvent accessibility for residues
        dssp_results = Bio.PDB.DSSP(tmp_structure[0], tmp_path, dssp=dssp_path)

        # get bfactors for each amino acid residue that has a CA atom
        bfacs = [r['CA'].get_bfactor()
                 for r in tmp_structure.get_residues()
                 if Bio.PDB.is_aa(r) and 'CA' in r.child_dict]
        mean_bfac = np.mean(bfacs)
        # NOTE: if every CA B-factor is identical std_bfac is 0 and the
        # normalization below divides by zero.
        std_bfac = np.std(bfacs)

        # format output
        output = []
        for result in dssp_results:
            # assumes each DSSP entry holds the residue at index 0 and the
            # relative accessibility at index 3 -- TODO confirm against the
            # installed Biopython version
            residue = result[0]
            accessibility = result[3]

            # skip if not an amino acid
            if not Bio.PDB.is_aa(residue):
                continue

            # translate the flattened (model, chain) back to the original
            full_id = residue.get_full_id()
            try:
                orig_model_chain = list(id_map[full_id[1:3]])
            except KeyError:
                # parenthesized single argument: valid on Python 2 and 3
                print('%s %s' % (full_id, id_map))
                raise
            # fix missing letter for homology models
            if orig_model_chain[1] == ' ':
                orig_model_chain[1] = 'A'

            # z-score of the CA B-factor, when the residue has a CA atom
            if 'CA' in residue.child_dict:
                norm_bfactor = (residue['CA'].get_bfactor() - mean_bfac) / std_bfac
            else:
                norm_bfactor = None

            # record whether it was buried
            line = [structure_id] + orig_model_chain + [residue.id[1], accessibility, norm_bfactor]
            line.append(1 if accessibility <= cutoff else 0)
            output.append(line)
    finally:
        # delete tmp file, also when any of the steps above failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return output