コード例 #1
0
def summarize_residues(mutations,
                       pdb_info,
                       radius,
                       rASA,
                       dssp,
                       tmp_dir,
                       quiet=True):
    """Tabulate buried/interface residue and mutation counts per structure.

    Every structure in ``pdb_info`` that can be read and that has at least
    one mutation contributes one output row per tumor type.

    Args:
        mutations: mapping of structure id -> sequence of 4-tuples that are
            unpacked as (tumor type, residue, mutation count, chain).
        pdb_info: mapping of structure id -> dict with a ``'path'`` entry
            plus entries whose values are lists of chain letters. It is
            NOT modified by this function.
        radius: forwarded to ``pstruct.get_interface_residues``.
        rASA: forwarded to ``pstruct.get_buried_residues`` (presumably the
            relative solvent accessibility cutoff -- confirm in pstruct).
        dssp: forwarded to ``pstruct.get_buried_residues`` (presumably the
            path of the DSSP executable -- confirm in pstruct).
        tmp_dir: forwarded to ``pstruct.get_buried_residues``.
        quiet: forwarded to ``utils.read_structure``.

    Returns:
        A list of rows. The first row is a 13-column header; every data row
        carries only the first 10 columns (the three p-value columns are
        not filled in here).
    """
    logger.info('Running of PDB structures . . .')
    # NOTE: the header strings (including the 'protien' spelling) are kept
    # byte-for-byte because downstream code may match on them.
    output = [[
        'structure', 'tumor type', '# buried residues',
        '# protein interface residues', '# nucleic acid interface residues',
        'total residues', '# buried mutations',
        '# protien interface mutations', '# nucleic acid interface mutations',
        'total # mutations', 'burial p-value', 'protein interface p-value',
        'nucleic acid interface p-value'
    ]]
    for structure_id in pdb_info:
        # work on a copy so the caller's pdb_info keeps its 'path' entry
        # (popping from the original made a second call raise KeyError)
        struct_info = pdb_info[structure_id].copy()
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            continue

        # every chain letter annotated for this structure
        struct_chains = []
        for description in struct_info:
            struct_chains.extend(struct_info[description])
        annotated_chains = set(struct_chains)

        # skip structure if no mutations
        structure_mutations = mutations.get(structure_id, [])
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by tumor type:
        # tumor type -> indices of the mutations belonging to it
        unique_ttypes = set(ttypes)
        ttype_ixs = {
            t: [i for i, ttype in enumerate(ttypes) if ttype == t]
            for t in unique_ttypes
        }
        unique_ttypes = list(unique_ttypes)

        # only the per-tumor-type mutation counts are needed here
        (_mut_cogs, mut_res_mutation_counts,
         _all_cogs, _models) = pstruct.get_structure_info(
             structure, mchains, mres, mcount, struct_chains, ttype_ixs)

        # find buried residues (restricted to annotated chains)
        buried_res = pstruct.get_buried_residues(structure, rASA, tmp_dir,
                                                 dssp)
        tmp_buried = [
            res_id for res_id in buried_res if res_id[2] in annotated_chains
        ]
        total_res = len(tmp_buried)
        # the last field of each entry flags the residue as buried (== 1)
        buried_res_info = {(info[1], info[2], info[3])
                           for info in tmp_buried if info[-1] == 1}
        num_buried_res = len(buried_res_info)

        # find interface residues for proteins and nucleic acids:
        # element 0 of the value flags a protein interface, the remaining
        # elements are summed to flag a nucleic acid interface
        interface_res = pstruct.get_interface_residues(structure, radius)
        interface_prot_info = {(res_id[1], res_id[2], res_id[3][1])
                               for res_id in interface_res
                               if (res_id[2] in annotated_chains)
                               and interface_res[res_id][0] == 1}
        interface_na_info = {(res_id[1], res_id[2], res_id[3][1])
                             for res_id in interface_res
                             if (res_id[2] in annotated_chains)
                             and sum(interface_res[res_id][1:]) >= 1}
        num_interface_prot_res = len(interface_prot_info)
        num_interface_na_res = len(interface_na_info)

        # iterate through each tumour type
        for tumour in unique_ttypes:
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]

            # count total mutations in structure while avoiding double
            # counting due to the same chain appearing in several models
            obs_models = []
            obs_chains = []
            total_mutations = 0
            total_buried_muts = 0
            total_interface_prot_muts, total_interface_na_muts = 0, 0
            banned_chains = set()
            for k in t_mut_res_mutation_counts:
                mutations_to_add = t_mut_res_mutation_counts[k]
                cur_model, cur_chain, cur_pos = k[1], k[2], k[3][1]

                # same chain already seen on a different model -> skip
                for seen_model, seen_chain in zip(obs_models, obs_chains):
                    if cur_model != seen_model and cur_chain == seen_chain:
                        mutations_to_add = 0
                        break
                # position already counted through an equivalent chain
                if (cur_chain, cur_pos) in banned_chains:
                    mutations_to_add = 0

                # Positions equivalent to the current one. Default to the
                # current chain alone so that a None result from
                # find_eq_letters neither raises NameError nor silently
                # reuses the previous residue's equivalents.
                equiv_pos = {(cur_chain, cur_pos)}
                equiv_chains = pstruct.find_eq_letters(struct_info, cur_chain)
                if equiv_chains is not None:
                    equiv_pos = {(e, cur_pos) for e in equiv_chains}
                    # ban the equivalents (but not the current position)
                    banned_chains |= equiv_pos - {(cur_chain, cur_pos)}

                total_mutations += mutations_to_add

                # the residue sets are keyed by (model, chain, position);
                # model ids 0-3 are probed, as in the original code
                candidates = [(m, chain, pos)
                              for chain, pos in equiv_pos
                              for m in range(4)]

                if any(c in buried_res_info for c in candidates):
                    total_buried_muts += mutations_to_add
                if any(c in interface_prot_info for c in candidates):
                    total_interface_prot_muts += mutations_to_add
                if any(c in interface_na_info for c in candidates):
                    total_interface_na_muts += mutations_to_add

                # mark chains/models
                obs_models.append(cur_model)
                obs_chains.append(cur_chain)

            output.append([
                structure_id,
                tumour,
                num_buried_res,
                num_interface_prot_res,
                num_interface_na_res,
                total_res,
                total_buried_muts,
                total_interface_prot_muts,
                total_interface_na_muts,
                total_mutations,
            ])

    return output
コード例 #2
0
def main(opts):
    """Group significant residues into per-tumor-type and reference regions.

    Reads the PDB info file and the multiple-testing file, keeps residues
    whose q-value is at or below ``opts['q_value']``, and for every tumor
    type feeds the matching MuPIT annotations, structure by structure, into
    ``update_graph``. The rows returned by ``retrieve_components`` are
    written tab-separated to ``opts['output']``.

    Args:
        opts: dict with the keys ``'pdb_info'``, ``'multiple_testing'``,
            ``'q_value'``, ``'annotation_dir'``, ``'radius'`` and
            ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # tumor types that do not contribute to the combined 'REF' graph
    # (built once instead of once per structure)
    banned_ttypes = ['COAD', 'READ', 'PANCAN12', 'CHOL', 'SARC',
                     'TGCT', 'THYM', 'UVM']

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))
        gene2graph = {}  # graph for an individual tumor type

        # get the significant residues for the tumor type
        mtc_ttype = [m for m in mtc
                     if (m[ttype_ix] == ttype)
                     and (float(m[qval_ix]) <= opts['q_value'])]
        significant_res = set([(m[gene_ix], m[tx_ix], int(m[res_ix]))
                               for m in mtc_ttype])

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file, significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        chain_ix = col_pos['chain']
        pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by structure first
        annotation.sort(key=lambda x: x[pdb_ix])

        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # annotations may mention structures absent from the PDB info
            # file; skip them instead of dying with a KeyError
            if pdb_id not in pdb_info:
                logger.warning(
                    'Skipping {0}: not found in PDB info'.format(pdb_id))
                continue

            # copy so that popping 'path' leaves pdb_info intact for the
            # next tumor type
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # (chain, pdb residue) -> (gene, transcript, residue) for the
            # significant residues annotated on this structure
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[chain_ix], int(s[pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # residue number missing or not an integer
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix], s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])
            # update graph for the combined cross-tumor type regions
            if ttype not in banned_ttypes:
                gene2graph_all = update_graph(gene2graph_all, cog,
                                              signif_struct_info,
                                              struct, opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output
    # NOTE(review): binary mode with str rows only works on Python 2;
    # switch to text mode if this is ever run under Python 3.
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')

    logger.info('Finished Successfully!!!')
コード例 #3
0
def main(opts):
    """Group significant residues into per-tumor-type and reference regions.

    Reads the PDB info file and the multiple-testing file, keeps residues
    whose q-value is at or below ``opts['q_value']``, and for every tumor
    type feeds the matching MuPIT annotations, structure by structure, into
    ``update_graph``. Every structure also updates the combined graph that
    is reported under the tumor type ``'REF'``. The rows returned by
    ``retrieve_components`` are written tab-separated to ``opts['output']``.

    Args:
        opts: dict with the keys ``'pdb_info'``, ``'multiple_testing'``,
            ``'q_value'``, ``'annotation_dir'``, ``'radius'`` and
            ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))
        gene2graph = {}  # graph for an individual tumor type

        # get the significant residues for the tumor type
        mtc_ttype = [
            m for m in mtc if (m[ttype_ix] == ttype) and (
                float(m[qval_ix]) <= opts['q_value'])
        ]
        significant_res = set([(m[gene_ix], m[tx_ix], int(m[res_ix]))
                               for m in mtc_ttype])

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file, significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        chain_ix = col_pos['chain']
        pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by structure first
        annotation.sort(key=lambda x: x[pdb_ix])

        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # fringe case: annotated structure missing from the PDB info
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # copy so that popping 'path' leaves pdb_info intact for the
            # next tumor type
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # (chain, pdb residue) -> (gene, transcript, residue) for the
            # significant residues annotated on this structure
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[chain_ix], int(s[pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # residue number missing or not an integer
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix], s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])
            # update graph for the combined cross-tumor type regions
            gene2graph_all = update_graph(gene2graph_all, cog,
                                          signif_struct_info, struct,
                                          opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output
    # NOTE(review): binary mode with str rows only works on Python 2;
    # switch to text mode if this is ever run under Python 3.
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')

    logger.info('Finished Successfully!!!')
コード例 #4
0
def main(opts):
    """Build per-structure graphs of significant hotspot residues.

    Reads per-residue hotspot results and per-tumor-type p-value
    thresholds, keeps the residues whose hotspot p-value is at or below the
    threshold, and for each tumor type and structure splits the MuPIT
    annotations into significant / non-significant residues before handing
    them to ``update_graph``. The rows returned by ``retrieve_components``
    are written tab-separated to ``opts['output']``.

    Args:
        opts: dict with the keys ``'pdb_info'``, ``'input'``,
            ``'significance'``, ``'annotation_dir'``, ``'radius'`` and
            ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # use external module to separate out the residues in the hotspot.py
    # output onto separate lines
    mtc = read_residue_info(opts['input'])
    pval_thresholds = read_thresholds(opts['significance'])

    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    struct_ix = header.index('Structure')
    model_ix = header.index('Model')
    chain_ix = header.index('Chain')
    res_ix = header.index('Mutation Residues')
    pval_ix = header.index('Hotspot P-value')

    # iterate through each tumor type
    output = []
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        # if there is no pval threshold, nothing is significant
        if ttype not in pval_thresholds:
            continue

        # get the significant residues for the tumor type
        mtc_ttype = [m for m in mtc
                     if (m[ttype_ix] == ttype)
                     and (float(m[pval_ix]) <= pval_thresholds[ttype])]
        significant_res = [(m[struct_ix], m[chain_ix], int(m[res_ix]))
                           for m in mtc_ttype]
        # set copy for the per-row membership test below; the list itself
        # is still what gets passed to read_mupit_file
        significant_set = set(significant_res)

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        all_annotation, col_pos = read_mupit_file(annotation_file,
                                                  significant_res)
        pdb_ix = col_pos['pdb']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        anot_chain_ix = col_pos['chain']
        pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by structure first
        all_annotation.sort(key=lambda x: x[pdb_ix])

        for pdb_id, grp in it.groupby(all_annotation, lambda x: x[pdb_ix]):
            # annotations may mention structures absent from the PDB info
            # file; skip them instead of dying with a KeyError
            if pdb_id not in pdb_info:
                logger.warning(
                    'Skipping {0}: not found in PDB info'.format(pdb_id))
                continue

            # each structure gets its own, initially empty, graph
            struct2graph = {}

            # copy so that popping 'path' leaves pdb_info intact for the
            # next tumor type
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            all_cogs = pstruct.calc_center_of_geometry(struct, struct_chains)

            # split this structure's annotations by significance;
            # both dicts map (chain, pdb residue) -> (pdb, transcript, res)
            signif_struct_info = {}
            non_signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[anot_chain_ix], int(s[pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # residue number missing or not an integer
                    continue

                res_info = (s[pdb_ix], s[anot_tx_ix], int(s[anot_res_ix]))
                res_key = (s[pdb_ix], tmp_pos[0], tmp_pos[1])
                if res_key in significant_set:
                    signif_struct_info[tmp_pos] = res_info
                else:
                    non_signif_struct_info[tmp_pos] = res_info

            # update the graph to reflect info from the current structure
            struct2graph, signif_res_neighbours = update_graph(
                struct2graph, all_cogs, signif_struct_info,
                non_signif_struct_info, struct, opts['radius'])

            # format the results into the output list
            output += retrieve_components(struct2graph, ttype, all_cogs,
                                          opts['radius'],
                                          signif_res_neighbours)

        logger.info('Finished {0}'.format(ttype))

    # write output
    # NOTE(review): binary mode with str rows only works on Python 2;
    # switch to text mode if this is ever run under Python 3.
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')

    logger.info('Finished Successfully!!!')
コード例 #5
0
ファイル: hotspot.py プロジェクト: skjq/HotMAPS
def main(opts):
    """Run the hotspot analysis over every annotated PDB structure.

    For each readable structure that has mutations, and for each tumor type
    selected by ``opts['tumor_type']`` (a specific type, or ``'EVERY'``),
    the observed mutation density is compared against a simulated null
    distribution and one tab-separated row per (structure, tumor type) is
    written to ``opts['output']``.

    Args:
        opts: dict with the keys ``'annotation'``, ``'mutations'``,
            ``'log_level'``, ``'radius'``, ``'tumor_type'``,
            ``'num_simulations'``, ``'seed'``, ``'stop_criterion'``,
            ``'output'`` and ``'error_pdb'``.
    """
    # read in data
    logger.info('Reading in annotations . . .')
    pdb_info = utils.read_pdb_info(opts['annotation'])
    logger.info('Finished reading in annotations.')
    logger.info('Reading in mutations . . .')
    mutations = utils.read_mutations(opts['mutations'])
    logger.info('Finished reading in mutations.')

    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = []
    error_pdb_structs = []  # ids of structures that could not be read
    # pdb warnings are only shown at DEBUG level
    quiet = opts['log_level'] != "DEBUG"

    for structure_id in pdb_info:
        print(structure_id)
        # copy so that popping 'path' does not alter pdb_info itself
        struct_info = pdb_info[structure_id].copy()
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            # remember the failure so it can be reported via 'error_pdb'
            # (the list was previously never filled, so that report could
            # never be written)
            error_pdb_structs.append(structure_id)
            continue

        # make a list of all chain letters in structure
        struct_chains = []
        for k in struct_info:
            struct_chains.extend(struct_info[k])

        # skip structure if no mutations
        structure_mutations = mutations.get(structure_id, [])
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by tumor type:
        # tumor type -> indices of the mutations belonging to it
        unique_ttypes = set(ttypes)
        ttype_ixs = {
            t: [i for i, ttype in enumerate(ttypes) if ttype == t]
            for t in unique_ttypes
        }
        unique_ttypes = list(unique_ttypes)

        # obtain relevant info from structure
        (mut_res_centers_of_geometry, mut_res_mutation_counts,
         all_res_centers_of_geometry, models) = get_structure_info(
             structure, mchains, mres, mcount, struct_chains, ttype_ixs)
        if not all_res_centers_of_geometry:
            logger.error('No available center of geometries for {0}'.format(
                structure_id))
            continue

        # get neighbours for all residues
        neighbors = find_neighbors(all_res_centers_of_geometry, opts['radius'])

        # iterate through each tumour type
        for tumour in unique_ttypes:
            # skip tumor types if not one specified
            if opts['tumor_type'] not in (tumour, 'EVERY'):
                continue

            # draw information for the specific tumour type
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]

            mut_density = src.mutations.mutation_density(
                t_mut_res_mutation_counts, neighbors)
            densities = mut_density.values()
            max_obs_dens = max(densities) if densities else 0

            # count total mutations in structure while avoiding double
            # counting due to the same chain appearing in several models
            obs_models = []
            obs_chains = []
            total_mutations = 0
            for k in t_mut_res_mutation_counts:
                mutations_to_add = t_mut_res_mutation_counts[k]
                for seen_model, seen_chain in zip(obs_models, obs_chains):
                    if k[1] != seen_model and k[2] == seen_chain:
                        mutations_to_add = 0
                        break
                total_mutations += mutations_to_add
                obs_models.append(k[1])
                obs_chains.append(k[2])

            # generate empirical null distribution
            sim_null_dist = sim.generate_null_dist(
                structure_id, models, struct_info, all_res_centers_of_geometry,
                total_mutations, opts['num_simulations'], opts['seed'],
                neighbors, opts['stop_criterion'], max_obs_dens)

            # get a list of lists format for compute p values function
            mut_list = [[res_id, mut_density[res_id]]
                        for res_id in mut_density]
            if not t_mut_res_mutation_counts:
                logger.debug('No mutation counts for {0} in {1}'.format(
                    tumour, structure_id))

            # compute p-values for the observed densities
            obs_pvals, _sim_cdf = sim.compute_pvals(mut_list, sim_null_dist)

            # residue ids are indexed as [1]=model, [2]=chain,
            # [3][1]=residue number (per the output column order)
            res_ids = [entry[0] for entry in mut_list]
            output.append([
                structure_id,
                tumour,
                ','.join([str(r[1]) for r in res_ids]),
                ','.join([str(r[2]) for r in res_ids]),
                ','.join([str(r[3][1]) for r in res_ids]),
                ','.join([str(t_mut_res_mutation_counts[r]) for r in res_ids]),
                ','.join([str(entry[1]) for entry in mut_list]),
                ','.join(map(str, obs_pvals)),
            ])

    # write output to file
    output = [[
        'Structure',
        'Tumor Type',
        'Model',
        'Chain',
        'Mutation Residues',
        'Residue Mutation Count',
        'Mutation Density',
        'Hotspot P-value',
    ]] + output
    with open(opts['output'], 'w') as handle:
        csv.writer(handle, delimiter='\t',
                   lineterminator='\n').writerows(output)

    # if user specified to log failed reading of pdbs
    if opts['error_pdb'] and error_pdb_structs:
        with open(opts['error_pdb'], 'w') as handle:
            for bad_pdb in error_pdb_structs:
                handle.write(bad_pdb + '\n')

    print("NUM_MODEL_DIFF: " + str(sim.NUM_MODEL_DIFF))
    print("NUM_CHAIN_DIFF: " + str(sim.NUM_CHAIN_DIFF))
    print("STRUCT_MODEL_DIFF: " + str(sim.STRUCT_MODEL_DIFF))
    print("STRUCT_CHAIN_DIFF: " + str(sim.STRUCT_CHAIN_DIFF))
    logger.info('Finished successfully!')
コード例 #6
0
ファイル: hotspot.py プロジェクト: Al3n70rn/HotMAPS
def main(opts):
    """Run the hotspot analysis over every annotated PDB structure.

    For each readable structure that has mutations, and for each tumor type
    selected by ``opts['tumor_type']`` (a specific type, or ``'EVERY'``),
    the observed mutation density is compared against a simulated null
    distribution and one tab-separated row per (structure, tumor type) is
    written to ``opts['output']``.

    Args:
        opts: dict with the keys ``'annotation'``, ``'mutations'``,
            ``'log_level'``, ``'radius'``, ``'tumor_type'``,
            ``'num_simulations'``, ``'seed'``, ``'stop_criterion'``,
            ``'output'`` and ``'error_pdb'``.
    """
    # read in data
    logger.info('Reading in annotations . . .')
    pdb_info = utils.read_pdb_info(opts['annotation'])
    logger.info('Finished reading in annotations.')
    logger.info('Reading in mutations . . .')
    mutations = utils.read_mutations(opts['mutations'])
    logger.info('Finished reading in mutations.')

    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = []
    error_pdb_structs = []  # ids of structures that could not be read
    # pdb warnings are only shown at DEBUG level
    quiet = opts['log_level'] != "DEBUG"

    for structure_id in pdb_info:
        print(structure_id)
        # copy so that popping 'path' does not alter pdb_info itself
        struct_info = pdb_info[structure_id].copy()
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            # remember the failure so it can be reported via 'error_pdb'
            # (the list was previously never filled, so that report could
            # never be written)
            error_pdb_structs.append(structure_id)
            continue

        # make a list of all chain letters in structure
        struct_chains = []
        for k in struct_info:
            struct_chains.extend(struct_info[k])

        # skip structure if no mutations
        structure_mutations = mutations.get(structure_id, [])
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by tumor type:
        # tumor type -> indices of the mutations belonging to it
        unique_ttypes = set(ttypes)
        ttype_ixs = {t: [i for i, ttype in enumerate(ttypes) if ttype == t]
                     for t in unique_ttypes}
        unique_ttypes = list(unique_ttypes)

        # obtain relevant info from structure
        tmp_info = get_structure_info(structure, mchains, mres, mcount,
                                      struct_chains, ttype_ixs)
        (mut_res_centers_of_geometry,
         mut_res_mutation_counts,
         all_res_centers_of_geometry,
         models) = tmp_info
        if not all_res_centers_of_geometry:
            logger.error('No available center of geometries for {0}'.format(structure_id))
            continue

        # get neighbours for all residues
        neighbors = find_neighbors(all_res_centers_of_geometry, opts['radius'])

        # iterate through each tumour type
        for tumour in unique_ttypes:
            # skip tumor types if not one specified
            if opts['tumor_type'] not in (tumour, 'EVERY'):
                continue

            # draw information for the specific tumour type
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]

            mut_density = src.mutations.mutation_density(t_mut_res_mutation_counts,
                                                         neighbors)
            densities = mut_density.values()
            max_obs_dens = max(densities) if densities else 0

            # count total mutations in structure while avoiding double
            # counting due to the same chain appearing in several models
            obs_models = []
            obs_chains = []
            total_mutations = 0
            for k in t_mut_res_mutation_counts:
                mutations_to_add = t_mut_res_mutation_counts[k]
                for seen_model, seen_chain in zip(obs_models, obs_chains):
                    if k[1] != seen_model and k[2] == seen_chain:
                        mutations_to_add = 0
                        break
                total_mutations += mutations_to_add
                obs_models.append(k[1])
                obs_chains.append(k[2])

            # generate empirical null distribution
            sim_null_dist = sim.generate_null_dist(structure_id, models, struct_info,
                                                   all_res_centers_of_geometry,
                                                   total_mutations,
                                                   opts['num_simulations'],
                                                   opts['seed'],
                                                   neighbors,
                                                   opts['stop_criterion'],
                                                   max_obs_dens)

            # get a list of lists format for compute p values function
            mut_list = [[res_id, mut_density[res_id]] for res_id in mut_density]
            if not t_mut_res_mutation_counts:
                logger.debug('No mutation counts for {0} in {1}'.format(
                    tumour, structure_id))

            # compute p-values for the observed densities
            obs_pvals, _sim_cdf = sim.compute_pvals(mut_list, sim_null_dist)

            # residue ids are indexed as [1]=model, [2]=chain,
            # [3][1]=residue number (per the output column order)
            res_ids = [entry[0] for entry in mut_list]
            output.append([structure_id, tumour,
                           ','.join([str(r[1]) for r in res_ids]),
                           ','.join([str(r[2]) for r in res_ids]),
                           ','.join([str(r[3][1]) for r in res_ids]),
                           ','.join([str(t_mut_res_mutation_counts[r])
                                     for r in res_ids]),
                           ','.join([str(entry[1]) for entry in mut_list]),
                           ','.join(map(str, obs_pvals)),])

    # write output to file
    output = [['Structure', 'Tumor Type', 'Model', 'Chain', 'Mutation Residues',
               'Residue Mutation Count', 'Mutation Density', 'Hotspot P-value',
              ]] + output
    with open(opts['output'], 'w') as handle:
        csv.writer(handle, delimiter='\t', lineterminator='\n').writerows(output)

    # if user specified to log failed reading of pdbs
    if opts['error_pdb'] and error_pdb_structs:
        with open(opts['error_pdb'], 'w') as handle:
            for bad_pdb in error_pdb_structs:
                handle.write(bad_pdb + '\n')

    print("NUM_MODEL_DIFF: " + str(sim.NUM_MODEL_DIFF))
    print("NUM_CHAIN_DIFF: " + str(sim.NUM_CHAIN_DIFF))
    print("STRUCT_MODEL_DIFF: " + str(sim.STRUCT_MODEL_DIFF))
    print("STRUCT_CHAIN_DIFF: " + str(sim.STRUCT_CHAIN_DIFF))
    logger.info('Finished successfully!')
コード例 #7
0
ファイル: pdb_structure.py プロジェクト: franmj/clusters3D
def get_buried_residues(structure, cutoff, tmp_dir, dssp_path):
    """Finds buried residues by using relative solvent accessible surface area.

    All models of ``structure`` are flattened into a single model (chains of
    later models are relabelled with chain IDs that are not in use yet), the
    flattened structure is written to a temporary PDB file in ``tmp_dir``,
    and DSSP is run on the first model of that file. The temporary file is
    removed before returning, also when an error occurs.

    NOTE: ``structure`` is modified in place (chain and model IDs are
    reassigned while flattening).

    Parameters
    ----------
    structure : structure object
        Iterable of models, each an iterable of chains (presumably a
        Bio.PDB structure -- it is handed to ``Bio.PDB.PDBIO``).
    cutoff : float
        A residue is flagged as buried when its DSSP value (``result[3]``)
        is less than or equal to this threshold.
    tmp_dir : str
        Directory in which the temporary ``<structure id>.pdb`` is written.
    dssp_path : str
        Passed to ``Bio.PDB.DSSP`` as its ``dssp`` argument.

    Returns
    -------
    list of list
        One row per amino acid residue reported by DSSP:
        ``[structure_id, model_id, chain_id, residue_number, dssp_value,
        normalized_CA_bfactor_or_None, buried_flag]`` where ``buried_flag``
        is 1 or 0 and model/chain IDs are the original (pre-flattening)
        ones. An empty list is returned if the available chain IDs run out
        while flattening.
    """
    # get structure id
    structure_id = structure.id

    all_letters = set(string.ascii_uppercase) | set(string.ascii_lowercase)

    # flatten models into a single model due to limitations of DSSP.
    # id_map: (model id, chain id) after flattening ->
    #         (model id, chain id) in the original structure
    id_map = {}
    for k, model in enumerate(structure):
        if k == 0:
            # the first model keeps its chain IDs (a blank ID becomes 'A')
            used_letters = set()
            for chain in model:
                if chain.get_id() == ' ':
                    chain.id = 'A'
                id_map[(model.id, chain.id)] = (model.id, chain.id)
                used_letters.add(chain.id)
            new_model = model.id
        else:
            for chain in model:
                left_over = all_letters - used_letters
                if not left_over:
                    # if run out of chain letters just return nothing
                    return []
                new_letter = left_over.pop()
                used_letters.add(new_letter)
                old_letter = chain.id
                chain.id = new_letter
                id_map[(new_model, new_letter)] = (model.id, old_letter)

                # add numbers if there is not more letters left
                if not (all_letters - used_letters):
                    all_letters.update(
                        set(string.digits) | set(string.punctuation))

            # NOTE(review): this gives every model the ID of the first one;
            # newer Biopython releases may reject an ID that collides with a
            # sibling -- confirm against the Biopython version in use.
            model.id = new_model

    # save new structure to tmp dir
    pdb_io = Bio.PDB.PDBIO()
    pdb_io.set_structure(structure)
    tmp_path = os.path.join(tmp_dir, structure_id + '.pdb')

    try:
        pdb_io.save(tmp_path)

        # read in tmp structure
        tmp_structure = utils.read_structure(tmp_path, structure_id,
                                             quiet=True)

        # find the solvent accessibility for residues
        dssp_results = Bio.PDB.DSSP(tmp_structure[0], tmp_path,
                                    dssp=dssp_path)

        # get bfactors (of the CA atom) for each amino acid residue
        bfacs = [
            r['CA'].get_bfactor() for r in tmp_structure.get_residues()
            if Bio.PDB.is_aa(r) and 'CA' in r.child_dict
        ]
        mean_bfac = np.mean(bfacs)
        # NOTE(review): std_bfac is 0 when all CA B-factors are equal, so
        # the normalization below then does not give a finite number --
        # confirm how callers treat that.
        std_bfac = np.std(bfacs)

        # format output
        output = []
        for result in dssp_results:
            residue = result[0]

            # skip if not an amino acid
            if not Bio.PDB.is_aa(residue):
                continue

            # translate the flattened (model, chain) back to the original
            full_id = residue.get_full_id()
            try:
                orig_model_chain = list(id_map[full_id[1:3]])
            except KeyError:
                # show what was looked up before propagating the error
                print('%s %s' % (full_id, id_map))
                raise
            # fix missing letter for homology models
            if orig_model_chain[1] == ' ':
                orig_model_chain[1] = 'A'

            # z-score of the CA B-factor; None when the residue has no CA
            if 'CA' in residue.child_dict:
                norm_bfactor = (residue['CA'].get_bfactor() -
                                mean_bfac) / std_bfac
            else:
                norm_bfactor = None

            # record whether it was buried
            is_buried = 1 if result[3] <= cutoff else 0
            output.append([structure_id] + orig_model_chain +
                          [residue.id[1], result[3], norm_bfactor,
                           is_buried])
    finally:
        # delete tmp file, even if reading the file or DSSP failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return output