Ejemplo n.º 1
0
def main(opts):
    """Build residue graphs per tumor type and write the resulting rows.

    The multiple testing file is filtered per tumor type to rows whose
    q-value is at or below ``opts['q_value']``. The matching
    ``mupit_mutations_<tumor type>`` annotation file is then read and, for
    every PDB structure it mentions, the annotated residues are passed to
    ``update_graph``. Two graphs are maintained: one that is reset for each
    tumor type and one that accumulates over all tumor types; rows for the
    latter are produced with the label ``'REF'``.

    Parameters
    ----------
    opts : dict
        Run options. Keys read here: ``'pdb_info'``, ``'multiple_testing'``,
        ``'q_value'``, ``'annotation_dir'``, ``'radius'`` and ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file; its first row is the header
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))
        # initialize the graph to empty
        gene2graph = {}  # graph for an individual tumor type

        # get the significant residues for the tumor type
        mtc_ttype = [
            m for m in mtc
            if m[ttype_ix] == ttype and float(m[qval_ix]) <= opts['q_value']
        ]
        significant_res = set((m[gene_ix], m[tx_ix], int(m[res_ix]))
                              for m in mtc_ttype)

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file, significant_res)

        # resolve every column position once, instead of once per row
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        chain_ix = col_pos['chain']
        pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by the same key first
        def by_structure(row):
            return row[pdb_ix]

        annotation.sort(key=by_structure)

        for pdb_id, grp in it.groupby(annotation, by_structure):
            # fringe case: annotated structure is absent from the PDB info
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # get path info; copy so that pop() leaves pdb_info untouched
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            # every remaining entry contributes its values to the chain list
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # get significant residues, keyed by (chain, PDB residue number)
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[chain_ix], int(s[pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # only a malformed row is skipped; anything else
                    # (e.g. KeyboardInterrupt) is allowed to propagate
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix], s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])
            # update graph for the combined cross-tumor type regions
            gene2graph_all = update_graph(gene2graph_all, cog,
                                          signif_struct_info, struct,
                                          opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output as tab-delimited lines
    # NOTE(review): writing str to a file opened with 'wb' only works on
    # Python 2 -- confirm the target interpreter before porting.
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')

    logger.info('Finished Successfully!!!')
def main(opts):
    """Build residue graphs per tumor type and write the resulting rows.

    For each tumor type found in the multiple testing file, rows with a
    q-value at or below ``opts['q_value']`` are kept, the matching
    ``mupit_mutations_<tumor type>`` annotation file is read, and the
    annotated residues of every PDB structure are passed to
    ``update_graph``. A per-tumor-type graph is rebuilt for each tumor type.
    A second, combined graph accumulates across tumor types, except for
    those listed in ``banned_ttypes``; its rows carry the label ``'REF'``.

    Parameters
    ----------
    opts : dict
        Run options. Keys read here: ``'pdb_info'``, ``'multiple_testing'``,
        ``'q_value'``, ``'annotation_dir'``, ``'radius'`` and ``'output'``.
    """
    # tumor types that never contribute to the combined ('REF') graph;
    # defined once here rather than on every structure iteration
    banned_ttypes = frozenset(['COAD', 'READ', 'PANCAN12', 'CHOL', 'SARC',
                               'TGCT', 'THYM', 'UVM'])

    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file; its first row is the header
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))
        # initialize the graph to empty
        gene2graph = {}  # graph for an individual tumor type
        use_for_reference = ttype not in banned_ttypes

        # get the significant residues for the tumor type
        mtc_ttype = [
            m for m in mtc
            if m[ttype_ix] == ttype and float(m[qval_ix]) <= opts['q_value']
        ]
        significant_res = set((m[gene_ix], m[tx_ix], int(m[res_ix]))
                              for m in mtc_ttype)

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file, significant_res)

        # resolve every column position once, instead of once per row
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        chain_ix = col_pos['chain']
        pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by the same key first
        def by_structure(row):
            return row[pdb_ix]

        annotation.sort(key=by_structure)

        for pdb_id, grp in it.groupby(annotation, by_structure):
            # an annotated structure missing from the PDB info would
            # otherwise raise KeyError and abort the whole run
            if pdb_id not in pdb_info:
                logger.info('Skipping {0}: not in PDB info'.format(pdb_id))
                continue

            # copy so that pop() leaves pdb_info untouched
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            # every remaining entry contributes its values to the chain list
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # get significant residues, keyed by (chain, PDB residue number)
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[chain_ix], int(s[pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # only a malformed row is skipped; anything else
                    # (e.g. KeyboardInterrupt) is allowed to propagate
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix], s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])
            # update graph for the combined cross-tumor type regions
            if use_for_reference:
                gene2graph_all = update_graph(gene2graph_all, cog,
                                              signif_struct_info, struct,
                                              opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output as tab-delimited lines
    # NOTE(review): writing str to a file opened with 'wb' only works on
    # Python 2 -- confirm the target interpreter before porting.
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')

    logger.info('Finished Successfully!!!')
Ejemplo n.º 3
0
def main(opts):
    """Build a residue graph per structure and tumor type and write the rows.

    Rows from ``read_residue_info(opts['input'])`` are filtered per tumor
    type against the p-value threshold that ``read_thresholds`` returns for
    that tumor type; tumor types without a threshold are skipped. The
    matching ``mupit_mutations_<tumor type>`` annotation file is then read
    and, for every PDB structure in it, annotated residues are split into
    significant and non-significant ones before being handed to
    ``update_graph``. The graph starts empty for every structure, and
    ``retrieve_components`` is called once per structure.

    Parameters
    ----------
    opts : dict
        Run options. Keys read here: ``'pdb_info'``, ``'input'``,
        ``'significance'``, ``'annotation_dir'``, ``'radius'`` and
        ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # use external module to separate out the residues in the hotspot.py
    # output onto separate lines
    mtc = read_residue_info(opts['input'])
    pval_thresholds = read_thresholds(opts['significance'])

    # the first row is the header
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    struct_ix = header.index('Structure')
    chain_ix = header.index('Chain')
    res_ix = header.index('Mutation Residues')
    pval_ix = header.index('Hotspot P-value')

    # iterate through each tumor type
    output = []
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        # if there is no pval threshold, nothing is significant
        if ttype not in pval_thresholds:
            continue
        threshold = pval_thresholds[ttype]

        # get the significant residues for the tumor type
        significant_res = [
            (m[struct_ix], m[chain_ix], int(m[res_ix]))
            for m in mtc
            if m[ttype_ix] == ttype and float(m[pval_ix]) <= threshold
        ]
        # set copy for O(1) membership tests in the per-row loop below; the
        # list itself is still what gets passed to read_mupit_file
        significant_lookup = set(significant_res)

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        all_annotation, col_pos = read_mupit_file(annotation_file,
                                                  significant_res)

        # resolve every column position once, instead of once per row
        pdb_ix = col_pos['pdb']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        anot_chain_ix = col_pos['chain']
        pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by the same key first
        def by_structure(row):
            return row[pdb_ix]

        all_annotation.sort(key=by_structure)

        for pdb_id, grp in it.groupby(all_annotation, by_structure):
            # an annotated structure missing from the PDB info would
            # otherwise raise KeyError and abort the whole run
            if pdb_id not in pdb_info:
                logger.info('Skipping {0}: not in PDB info'.format(pdb_id))
                continue

            # initialize the graph to empty
            struct2graph = {}

            # copy so that pop() leaves pdb_info untouched
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            # every remaining entry contributes its values to the chain list
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            all_cogs = pstruct.calc_center_of_geometry(struct, struct_chains)

            # split annotated residues by significance; both dicts are keyed
            # by (chain, PDB residue number)
            signif_struct_info = {}
            non_signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[anot_chain_ix], int(s[pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # rows without a usable integer PDB residue are skipped
                    continue

                res_info = (s[pdb_ix], s[anot_tx_ix], int(s[anot_res_ix]))
                lookup_key = (s[pdb_ix],) + tmp_pos
                if lookup_key in significant_lookup:
                    signif_struct_info[tmp_pos] = res_info
                else:
                    non_signif_struct_info[tmp_pos] = res_info

            # update the graph to reflect info from the current structure
            struct2graph, signif_res_neighbours = update_graph(
                struct2graph, all_cogs, signif_struct_info,
                non_signif_struct_info, struct, opts['radius'])

            # format the results into the output list
            output += retrieve_components(struct2graph, ttype, all_cogs,
                                          opts['radius'],
                                          signif_res_neighbours)

        logger.info('Finished {0}'.format(ttype))

    # write output as tab-delimited lines
    # NOTE(review): writing str to a file opened with 'wb' only works on
    # Python 2 -- confirm the target interpreter before porting.
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')

    logger.info('Finished Successfully!!!')