def main(opts):
    """Build residue graphs from significant residues and write their components.

    For every tumor type found in the multiple-testing file, the residues
    whose q-value is at or below ``opts['q_value']`` are passed to
    ``read_mupit_file`` together with that tumor type's
    ``mupit_mutations_<ttype>`` annotation file. The returned annotation rows
    are grouped by PDB structure and handed to ``update_graph``; the result of
    ``retrieve_components`` is collected per tumor type. A second graph that
    combines all non-excluded tumor types is reported under the label
    ``'REF'``. Every collected row is written tab-separated to
    ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Reads the keys ``'pdb_info'``, ``'multiple_testing'``, ``'q_value'``,
        ``'annotation_dir'``, ``'radius'`` and ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file; the first row is the header
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # tumor types that do not contribute to the combined ('REF') graph;
    # built once instead of once per structure
    banned_ttypes = frozenset(['COAD', 'READ', 'PANCAN12', 'CHOL', 'SARC',
                               'TGCT', 'THYM', 'UVM'])

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))
        # initialize the graph to empty
        gene2graph = {}  # graph for an individual tumor type

        # get the significant residues for the tumor type
        significant_res = set(
            (m[gene_ix], m[tx_ix], int(m[res_ix]))
            for m in mtc
            if m[ttype_ix] == ttype and float(m[qval_ix]) <= opts['q_value']
        )

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file, significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        chain_ix = col_pos['chain']
        pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by structure first
        annotation.sort(key=lambda x: x[pdb_ix])

        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # a structure missing from the PDB info file has no known path;
            # skip it instead of aborting the whole run with a KeyError
            if pdb_id not in pdb_info:
                logger.warning(
                    'Skipping {0}: not present in PDB info'.format(pdb_id))
                continue

            # copy so that popping 'path' leaves pdb_info untouched
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # map (chain, pdb residue) -> (gene, transcript, residue) for the
            # annotation rows of this structure
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[chain_ix], int(s[pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # best effort: drop rows whose PDB residue position
                    # cannot be read as an integer
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix],
                                               s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])
            # update graph for the combined cross-tumor type regions
            if ttype not in banned_ttypes:
                gene2graph_all = update_graph(gene2graph_all, cog,
                                              signif_struct_info,
                                              struct, opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output; text mode because the rows are str, not bytes
    with open(opts['output'], 'w') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')

    logger.info('Finished Successfully!!!')
# Example 2
# 0
def main(opts):
    """Build residue graphs from significant residues and write their components.

    For every tumor type found in the multiple-testing file, the residues
    whose q-value is at or below ``opts['q_value']`` are passed to
    ``read_mupit_file`` together with that tumor type's
    ``mupit_mutations_<ttype>`` annotation file. The returned annotation rows
    are grouped by PDB structure and handed to ``update_graph``, once for the
    per-tumor-type graph and once for a graph combined over all tumor types
    (reported under the label ``'REF'``). The rows produced by
    ``retrieve_components`` are written tab-separated to ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Reads the keys ``'pdb_info'``, ``'multiple_testing'``, ``'q_value'``,
        ``'annotation_dir'``, ``'radius'`` and ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file; the first row is the header
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))
        # initialize the graph to empty
        gene2graph = {}  # graph for an individual tumor type

        # get the significant residues for the tumor type
        significant_res = set(
            (m[gene_ix], m[tx_ix], int(m[res_ix]))
            for m in mtc
            if m[ttype_ix] == ttype and float(m[qval_ix]) <= opts['q_value']
        )

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file, significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        chain_ix = col_pos['chain']
        pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by structure first
        annotation.sort(key=lambda x: x[pdb_ix])

        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # fringe case: structure is absent from the PDB info file
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # get path info; copy so that popping 'path' leaves pdb_info
            # untouched
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # map (chain, pdb residue) -> (gene, transcript, residue) for the
            # annotation rows of this structure
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[chain_ix], int(s[pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # best effort: drop rows whose PDB residue position
                    # cannot be read as an integer
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix],
                                               s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])
            # update graph for the combined cross-tumor type regions
            gene2graph_all = update_graph(gene2graph_all, cog,
                                          signif_struct_info, struct,
                                          opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output; text mode because the rows are str, not bytes
    with open(opts['output'], 'w') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')

    logger.info('Finished Successfully!!!')
# Example 3
# 0
def main(opts):
    """Currently, performs analysis for the given genes. It attempts to use
    any available PDB structures. It then loops through each protein chain
    and tumor type.

    For every structure listed in the annotation file that has mutations, the
    observed mutation density of each mutated residue is computed and compared
    against the null distribution returned by ``sim.generate_null_dist``. One
    row per (structure, tumor type) is written tab-separated to
    ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Reads the keys ``'annotation'``, ``'mutations'``, ``'log_level'``,
        ``'radius'``, ``'tumor_type'``, ``'num_simulations'``, ``'seed'``,
        ``'stop_criterion'``, ``'output'`` and ``'error_pdb'``.
    """
    # read in data
    logger.info('Reading in annotations . . .')
    pdb_info = utils.read_pdb_info(opts['annotation'])
    logger.info('Finished reading in annotations.')
    logger.info('Reading in mutations . . .')
    mutations = utils.read_mutations(opts['mutations'])
    logger.info('Finished reading in mutations.')

    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = []
    # NOTE: nothing in this function appends to this list, so the
    # opts['error_pdb'] file at the end is currently never written
    error_pdb_structs = []
    quiet = opts['log_level'] != "DEBUG"  # flag indicating pdb warnings
    wanted_ttype = opts['tumor_type']

    for structure_id in pdb_info:
        print(structure_id)
        # get pdb info; copy so that popping 'path' does not mutate the
        # shared pdb_info entry
        struct_info = pdb_info[structure_id].copy()
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            continue

        # make a list of all chain letters in structure
        struct_chains = []
        for chains in struct_info.values():
            struct_chains.extend(chains)

        # get mutation info
        structure_mutations = mutations.get(structure_id, [])
        # skip structure if no mutations
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by their tumor type
        # ttype_ixs is a dictionary that contains
        # ttype as the keys and a list of relevant
        # indices as the values (built in a single pass)
        ttype_ixs = {}
        for i, t in enumerate(ttypes):
            ttype_ixs.setdefault(t, []).append(i)
        unique_ttypes = list(ttype_ixs)

        # obtain relevant info from structure
        tmp_info = get_structure_info(structure, mchains, mres, mcount,
                                      struct_chains, ttype_ixs)
        (mut_res_centers_of_geometry, mut_res_mutation_counts,
         all_res_centers_of_geometry, models) = tmp_info
        if not all_res_centers_of_geometry:
            logger.error('No available center of geometries for {0}'.format(
                structure_id))
            continue

        # get neigbours for all residues
        neighbors = find_neighbors(all_res_centers_of_geometry, opts['radius'])

        # iterate through each tumour type
        for tumour in unique_ttypes:
            # skip tumor types if not one specified
            if wanted_ttype not in (tumour, 'EVERY'):
                continue

            # draw information for the specific tumour type
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]

            mut_density = src.mutations.mutation_density(
                t_mut_res_mutation_counts, neighbors)
            max_obs_dens = max(mut_density.values()) if mut_density else 0

            # count total mutations in structure while avoiding double
            # counting due to the same chain being on multiple models:
            # a chain only contributes through the first model it is seen
            # in. Remembering that model per chain keeps the count
            # independent of the order in which the keys are visited.
            first_model_of_chain = {}
            total_mutations = 0
            for k in t_mut_res_mutation_counts:
                model_id, chain_id = k[1], k[2]
                counted_model = first_model_of_chain.setdefault(chain_id,
                                                                model_id)
                if counted_model == model_id:
                    total_mutations += t_mut_res_mutation_counts[k]

            # generate empirical null distribution
            sim_null_dist = sim.generate_null_dist(
                structure_id, models, struct_info, all_res_centers_of_geometry,
                total_mutations, opts['num_simulations'], opts['seed'],
                neighbors, opts['stop_criterion'], max_obs_dens)

            # get a list of lists format for compute p values function
            mut_list = [[res_id, mut_density[res_id]]
                        for res_id in mut_density]
            if not t_mut_res_mutation_counts:
                print("here")  # debugging aid kept from the original

            # compute p-values for observed densities; the second return
            # value is not needed here
            obs_pvals, _ = sim.compute_pvals(mut_list, sim_null_dist)

            # each key o[0] is indexed as [1] model, [2] chain and
            # [3][1] residue number for the output columns
            output.append([
                structure_id,
                tumour,
                ','.join([str(o[0][1]) for o in mut_list]),
                ','.join([str(o[0][2]) for o in mut_list]),
                ','.join([str(o[0][3][1]) for o in mut_list]),
                ','.join(
                    [str(t_mut_res_mutation_counts[o[0]]) for o in mut_list]),
                ','.join([str(o[1]) for o in mut_list]),
                ','.join(map(str, obs_pvals)),
            ])

    # write output to file
    output = [[
        'Structure',
        'Tumor Type',
        'Model',
        'Chain',
        'Mutation Residues',
        'Residue Mutation Count',
        'Mutation Density',
        'Hotspot P-value',
    ]] + output
    with open(opts['output'], 'w') as handle:
        csv.writer(handle, delimiter='\t',
                   lineterminator='\n').writerows(output)

    # if user specified to log failed reading of pdbs
    if opts['error_pdb'] and error_pdb_structs:
        with open(opts['error_pdb'], 'w') as handle:
            for bad_pdb in error_pdb_structs:
                handle.write(bad_pdb + '\n')

    print("NUM_MODEL_DIFF: " + str(sim.NUM_MODEL_DIFF))
    print("NUM_CHAIN_DIFF: " + str(sim.NUM_CHAIN_DIFF))
    print("STRUCT_MODEL_DIFF: " + str(sim.STRUCT_MODEL_DIFF))
    print("STRUCT_CHAIN_DIFF: " + str(sim.STRUCT_CHAIN_DIFF))
    logger.info('Finished successfully!')
# Example 4
# 0
def main(opts):
    """Build per-structure residue graphs from significant hotspot residues.

    The per-residue rows returned by ``read_residue_info`` are filtered, for
    each tumor type, by that tumor type's p-value threshold from
    ``read_thresholds``. The annotation rows returned by ``read_mupit_file``
    are grouped by PDB structure, split into significant and non-significant
    residues and handed to ``update_graph``; the rows produced by
    ``retrieve_components`` for every structure are written tab-separated to
    ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Reads the keys ``'pdb_info'``, ``'input'``, ``'significance'``,
        ``'annotation_dir'``, ``'radius'`` and ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # use external module to separate out the residues in the hotspot.py output
    # onto separate lines
    mtc = read_residue_info(opts['input'])
    pval_thresholds = read_thresholds(opts['significance'])

    # the first row is the header
    header = mtc.pop(0)

    ttype_ix = header.index('Tumor Type')
    struct_ix = header.index('Structure')
    model_ix = header.index('Model')
    chain_ix = header.index('Chain')
    res_ix = header.index('Mutation Residues')
    pval_ix = header.index('Hotspot P-value')

    # iterate through each tumor type
    output = []
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        # if there is no pval threshold, nothing is significant
        if ttype not in pval_thresholds:
            continue
        threshold = pval_thresholds[ttype]

        # get the significant residues for the tumor type
        significant_res = [
            (m[struct_ix], m[chain_ix], int(m[res_ix]))
            for m in mtc
            if m[ttype_ix] == ttype and float(m[pval_ix]) <= threshold
        ]
        # set copy for O(1) membership tests in the per-row loop below;
        # the list itself is still what read_mupit_file receives
        significant_lookup = set(significant_res)

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        all_annotation, col_pos = read_mupit_file(annotation_file,
                                                  significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        anot_chain_ix = col_pos['chain']
        pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by structure first
        all_annotation.sort(key=lambda x: x[pdb_ix])

        for pdb_id, grp in it.groupby(all_annotation, lambda x: x[pdb_ix]):
            # a structure missing from the PDB info file has no known path;
            # skip it instead of aborting the whole run with a KeyError
            if pdb_id not in pdb_info:
                logger.warning(
                    'Skipping {0}: not present in PDB info'.format(pdb_id))
                continue

            # initialize the graph to empty (one graph per structure)
            struct2graph = {}

            # copy so that popping 'path' leaves pdb_info untouched
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            all_cogs = pstruct.calc_center_of_geometry(struct, struct_chains)

            # split this structure's annotation rows into significant and
            # non-significant residues, keyed by (chain, pdb residue)
            signif_struct_info = {}
            non_signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[anot_chain_ix], int(s[pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # best effort: drop rows whose PDB residue position
                    # cannot be read as an integer
                    continue

                is_significant = (s[pdb_ix],) + tmp_pos in significant_lookup
                entry = (s[pdb_ix], s[anot_tx_ix], int(s[anot_res_ix]))
                if is_significant:
                    signif_struct_info[tmp_pos] = entry
                else:
                    non_signif_struct_info[tmp_pos] = entry

            # update the graph to reflect info from the current structure
            struct2graph, signif_res_neighbours = update_graph(
                struct2graph, all_cogs, signif_struct_info,
                non_signif_struct_info, struct, opts['radius'])

            # format the results into the output list
            output += retrieve_components(struct2graph, ttype, all_cogs,
                                          opts['radius'],
                                          signif_res_neighbours)

        logger.info('Finished {0}'.format(ttype))

    # write output; text mode because the rows are str, not bytes
    with open(opts['output'], 'w') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')

    logger.info('Finished Successfully!!!')
# Example 5
# 0
def main(opts):
    """Currently, performs analysis for the given genes. It attempts to use
    any available PDB structures. It then loops through each protein chain
    and tumor type.

    For every structure listed in the annotation file that has mutations, the
    observed mutation density of each mutated residue is computed and compared
    against the null distribution returned by ``sim.generate_null_dist``. One
    row per (structure, tumor type) is written tab-separated to
    ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Reads the keys ``'annotation'``, ``'mutations'``, ``'log_level'``,
        ``'radius'``, ``'tumor_type'``, ``'num_simulations'``, ``'seed'``,
        ``'stop_criterion'``, ``'output'`` and ``'error_pdb'``.
    """
    # read in data
    logger.info('Reading in annotations . . .')
    pdb_info = utils.read_pdb_info(opts['annotation'])
    logger.info('Finished reading in annotations.')
    logger.info('Reading in mutations . . .')
    mutations = utils.read_mutations(opts['mutations'])
    logger.info('Finished reading in mutations.')

    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = []
    # NOTE: nothing in this function appends to this list, so the
    # opts['error_pdb'] file at the end is currently never written
    error_pdb_structs = []
    quiet = opts['log_level'] != "DEBUG"  # flag indicating pdb warnings
    wanted_ttype = opts['tumor_type']

    for structure_id in pdb_info:
        print (structure_id)
        # get pdb info; copy so that popping 'path' does not mutate the
        # shared pdb_info entry
        struct_info = pdb_info[structure_id].copy()
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            continue

        # make a list of all chain letters in structure
        struct_chains = []
        for chains in struct_info.values():
            struct_chains.extend(chains)

        # get mutation info
        structure_mutations = mutations.get(structure_id, [])
        # skip structure if no mutations
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by their tumor type
        # ttype_ixs is a dictionary that contains
        # ttype as the keys and a list of relevant
        # indices as the values (built in a single pass)
        ttype_ixs = {}
        for i, t in enumerate(ttypes):
            ttype_ixs.setdefault(t, []).append(i)
        unique_ttypes = list(ttype_ixs)

        # obtain relevant info from structure
        tmp_info = get_structure_info(structure, mchains, mres, mcount,
                                      struct_chains, ttype_ixs)
        (mut_res_centers_of_geometry,
         mut_res_mutation_counts,
         all_res_centers_of_geometry,
         models) = tmp_info
        if not all_res_centers_of_geometry:
            logger.error('No available center of geometries for {0}'.format(structure_id))
            continue

        # get neigbours for all residues
        neighbors = find_neighbors(all_res_centers_of_geometry, opts['radius'])

        # iterate through each tumour type
        for tumour in unique_ttypes:
            # skip tumor types if not one specified
            if wanted_ttype not in (tumour, 'EVERY'):
                continue

            # draw information for the specific tumour type
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]

            mut_density = src.mutations.mutation_density(t_mut_res_mutation_counts,
                                                         neighbors)
            max_obs_dens = max(mut_density.values()) if mut_density else 0

            # count total mutations in structure while avoiding double
            # counting due to the same chain being on multiple models:
            # a chain only contributes through the first model it is seen
            # in. Remembering that model per chain keeps the count
            # independent of the order in which the keys are visited.
            first_model_of_chain = {}
            total_mutations = 0
            for k in t_mut_res_mutation_counts:
                model_id, chain_id = k[1], k[2]
                counted_model = first_model_of_chain.setdefault(chain_id,
                                                                model_id)
                if counted_model == model_id:
                    total_mutations += t_mut_res_mutation_counts[k]

            # generate empirical null distribution
            sim_null_dist = sim.generate_null_dist(structure_id, models, struct_info,
                                                   all_res_centers_of_geometry,
                                                   total_mutations,
                                                   opts['num_simulations'],
                                                   opts['seed'],
                                                   neighbors,
                                                   opts['stop_criterion'],
                                                   max_obs_dens)

            # get a list of lists format for compute p values function
            mut_list = [[res_id, mut_density[res_id]] for res_id in mut_density]
            if not t_mut_res_mutation_counts:
                print("here")  # debugging aid kept from the original

            # compute p-values for observed densities; the second return
            # value is not needed here
            obs_pvals, _ = sim.compute_pvals(mut_list, sim_null_dist)

            # each key o[0] is indexed as [1] model, [2] chain and
            # [3][1] residue number for the output columns
            output.append([structure_id, tumour,
                           ','.join([str(o[0][1]) for o in mut_list]),
                           ','.join([str(o[0][2]) for o in mut_list]),
                           ','.join([str(o[0][3][1]) for o in mut_list]),
                           ','.join([str(t_mut_res_mutation_counts[o[0]])
                                     for o in mut_list]),
                           ','.join([str(o[1]) for o in mut_list]),
                           ','.join(map(str, obs_pvals)),])

    # write output to file
    output = [['Structure', 'Tumor Type', 'Model', 'Chain', 'Mutation Residues',
               'Residue Mutation Count', 'Mutation Density', 'Hotspot P-value',
              ]] + output
    with open(opts['output'], 'w') as handle:
        csv.writer(handle, delimiter='\t', lineterminator='\n').writerows(output)

    # if user specified to log failed reading of pdbs
    if opts['error_pdb'] and error_pdb_structs:
        with open(opts['error_pdb'], 'w') as handle:
            for bad_pdb in error_pdb_structs:
                handle.write(bad_pdb+'\n')

    print("NUM_MODEL_DIFF: " + str(sim.NUM_MODEL_DIFF))
    print("NUM_CHAIN_DIFF: " + str(sim.NUM_CHAIN_DIFF))
    print("STRUCT_MODEL_DIFF: " + str(sim.STRUCT_MODEL_DIFF))
    print("STRUCT_CHAIN_DIFF: " + str(sim.STRUCT_CHAIN_DIFF))
    logger.info('Finished successfully!')