def calc_protein_p_value(mut_info, unmapped_mut_info, sc, gs, bed, graph_dir,
                         num_permutations, stop_thresh, min_recurrent,
                         min_fraction):
    """Computes the p-value for clustering on a neighbor graph composed
    of codons connected with edges if they are spatially near in 3D protein
    structure.

    Parameters
    ----------
    mut_info : pandas.DataFrame
        Mapped mutations with 'Coding Position' and 'Tumor_Allele' columns.
        Modified in place when non-empty: 'Coding Position' is cast to int
        and a 'Context' column is added.
    unmapped_mut_info : dict-like
        Mutations handled separately from `mut_info`. Must provide
        'Context', 'Tumor_Allele', 'Codon Pos', 'Reference AA' and
        'Somatic AA' entries (presumably lists, since they are joined with
        ``+`` to the mapped values -- TODO confirm against caller).
    sc : object
        Sequence context object; only ``sc.pos2context`` is read here.
    gs : object
        Gene sequence object; forwarded to the helper functions.
    bed : object
        Gene annotation; only ``bed.gene_name`` is read here.
    graph_dir
        Location handed to ``scores.read_neighbor_graph_pickle``. If falsy,
        no neighbor graph is loaded.
    num_permutations, stop_thresh
        Forwarded unchanged to ``pm.protein_permutation``.
    min_recurrent, min_fraction
        Forwarded to ``cutils.calc_pos_info`` as ``min_recur`` and
        ``min_frac``.

    Returns
    -------
    list
        ``[gene name, num_recurrent, norm_graph_score, protein_p_value]``.
        With no mapped mutations this is ``[gene name, 0, 0.0, 1.0]``. If
        scoring or the permutation test raises, the score falls back to 0.0
        and the p-value to 1.0.
    """
    # NOTE(review): a second definition of calc_protein_p_value follows this
    # one in the same file and replaces it at import time.

    # nothing to test when the gene has no mapped mutations
    if len(mut_info) == 0:
        return [bed.gene_name, 0, 0.0, 1.0]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # group mutations by context
    cols = ['Context', 'Tumor_Allele']
    unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
    tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
    context_cts = tmp_df['Context'].value_counts()
    context_to_mutations = {
        name: group['Tumor_Allele']
        for name, group in tmp_df.groupby('Context')
    }

    # load the neighbor graph for the gene if a directory was provided
    gene_graph = None
    if graph_dir:
        gene_graph = scores.read_neighbor_graph_pickle(bed.gene_name,
                                                       graph_dir)
        if gene_graph is None:
            logger.warning(
                'Could not find neighbor graph for {0}, skipping . . .'.
                format(bed.gene_name))

    # get recurrent info for actual mutations
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(), gs)
    codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    # only the recurrent count and the per-position counts are needed here
    num_recurrent, _, _, pos_ct = cutils.calc_pos_info(
        codon_pos,
        ref_aa,
        somatic_aa,
        min_frac=min_fraction,
        min_recur=min_recurrent)

    # NOTE(review): gene_graph may still be None at this point; the scoring
    # is attempted anyway and any resulting error lands in the handler below.
    try:
        # neighbor graph statistic for the observed mutations
        graph_score, _ = scores.compute_ng_stat(gene_graph, pos_ct)

        # perform simulations to get p-value
        protein_p_value, norm_graph_score = pm.protein_permutation(
            graph_score,
            len(pos_ct),
            context_cts,
            context_to_mutations,
            sc,  # sequence context obj
            gs,  # gene sequence obj
            gene_graph,
            num_permutations,
            stop_thresh)
    except Exception:
        # Best effort: report the gene as non-significant instead of
        # aborting the whole run.
        norm_graph_score = 0.0
        protein_p_value = 1.0
        logger.warning('Codon numbering problem with ' + bed.gene_name)
        # The handler catches every Exception, so the warning text may not
        # name the real cause; keep the traceback available at debug level.
        logger.debug('Neighbor graph scoring failed for %s',
                     bed.gene_name, exc_info=True)

    return [bed.gene_name, num_recurrent, norm_graph_score, protein_p_value]
def calc_protein_p_value(mut_info,
                         unmapped_mut_info,
                         sc,
                         gs,
                         bed,
                         graph_dir,
                         num_permutations,
                         stop_thresh,
                         min_recurrent,
                         min_fraction):
    """Computes the p-value for clustering on a neighbor graph composed
    of codons connected with edges if they are spatially near in 3D protein
    structure.

    Parameters
    ----------
    mut_info : pandas.DataFrame
        Mapped mutations; needs 'Coding Position' and 'Tumor_Allele'
        columns. When non-empty it is modified in place ('Coding Position'
        cast to int, 'Context' column added).
    unmapped_mut_info : dict-like
        Extra mutations pooled with `mut_info`. Read under the keys
        'Context', 'Tumor_Allele', 'Codon Pos', 'Reference AA' and
        'Somatic AA' (values presumably lists -- TODO confirm).
    sc : object
        Sequence context object (``sc.pos2context`` maps a coding position
        to its context).
    gs : object
        Gene sequence object, passed through to the helpers.
    bed : object
        Provides ``bed.gene_name``.
    graph_dir
        Passed to ``scores.read_neighbor_graph_pickle``; a falsy value
        means no neighbor graph is loaded.
    num_permutations, stop_thresh
        Passed through to ``pm.protein_permutation``.
    min_recurrent, min_fraction
        Passed to ``cutils.calc_pos_info`` as ``min_recur`` / ``min_frac``.

    Returns
    -------
    list
        ``[gene name, num_recurrent, norm_graph_score, protein_p_value]``.
        The score is 0.0 and the p-value 1.0 when there are no mapped
        mutations (``num_recurrent`` is then 0) or when scoring raises.
    """
    if len(mut_info) > 0:
        mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
        mut_info['Context'] = mut_info['Coding Position'].apply(lambda x: sc.pos2context[x])

        # group mutations by context
        cols = ['Context', 'Tumor_Allele']
        unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
        tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
        context_cts = tmp_df['Context'].value_counts()
        context_to_mutations = {name: group['Tumor_Allele']
                                for name, group in tmp_df.groupby('Context')}

        # get neighbor graph for gene if directory provided
        if graph_dir:
            gene_graph = scores.read_neighbor_graph_pickle(bed.gene_name, graph_dir)
            if gene_graph is None:
                logger.warning('Could not find neighbor graph for {0}, skipping . . .'.format(bed.gene_name))
        else:
            gene_graph = None

        # get recurrent info for actual mutations
        aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                         mut_info['Tumor_Allele'].tolist(),
                                         gs)
        codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
        ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
        somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
        # the two middle return values are not used by this test
        num_recurrent, _, _, pos_ct = cutils.calc_pos_info(codon_pos,
                                                           ref_aa,
                                                           somatic_aa,
                                                           min_frac=min_fraction,
                                                           min_recur=min_recurrent)
        try:
            # get neighbor graph score for actual mutations
            # (gene_graph may be None here; failures are handled below)
            graph_score, _ = scores.compute_ng_stat(gene_graph, pos_ct)

            # perform simulations to get p-value
            protein_p_value, norm_graph_score = pm.protein_permutation(
                graph_score, len(pos_ct), context_cts,
                context_to_mutations,
                sc,  # sequence context obj
                gs,  # gene sequence obj
                gene_graph, num_permutations, stop_thresh
            )
        except Exception:
            # Deliberate best effort: fall back to a non-significant result
            # so one problematic gene does not stop the analysis.
            norm_graph_score = 0.0
            protein_p_value = 1.0
            logger.warning('Codon numbering problem with '+bed.gene_name)
            # Any exception type ends up here, so preserve the traceback at
            # debug level for diagnosing causes other than codon numbering.
            logger.debug('Neighbor graph scoring failed for %s',
                         bed.gene_name, exc_info=True)

    else:
        # no mapped mutations: nothing to test
        norm_graph_score = 0.0
        protein_p_value = 1.0
        num_recurrent = 0

    result = [bed.gene_name, num_recurrent, norm_graph_score, protein_p_value]
    return result
def calc_position_p_value(mut_info, unmapped_mut_info, sc, gs, bed, score_dir,
                          num_permutations, stop_thresh, pseudo_count,
                          min_recurrent, min_fraction):
    """Runs the position-based permutation test for a single gene.

    Observed statistics (recurrent count, `pos_ent`, `delta_pos_ent` and the
    VEST score) are computed from the mutations and compared against
    simulations by ``pm.position_permutation``.

    Parameters
    ----------
    mut_info : pandas.DataFrame
        Mapped mutations with 'Coding Position' and 'Tumor_Allele' columns.
        Modified in place when non-empty ('Coding Position' cast to int,
        'Context' column added).
    unmapped_mut_info : dict-like
        Extra mutations pooled with `mut_info`; read under the keys
        'Context', 'Tumor_Allele', 'Codon Pos', 'Reference AA' and
        'Somatic AA' (presumably lists -- TODO confirm).
    sc : object
        Sequence context object; ``sc.pos2context`` is indexed by coding
        position.
    gs : object
        Gene sequence object, forwarded to the helpers.
    bed : object
        Provides ``bed.gene_name``.
    score_dir
        Passed to ``scores.read_vest_pickle``; if falsy no VEST scores are
        loaded.
    num_permutations, stop_thresh, pseudo_count
        Forwarded unchanged to ``pm.position_permutation``.
    min_recurrent, min_fraction
        Forwarded to ``cutils.calc_pos_info`` as ``min_recur`` / ``min_frac``.

    Returns
    -------
    list
        ``[gene name, num_recurrent, pos_ent, vest_score, ent_p_value,
        vest_p_value]``; ``[gene name, 0, 0, 0.0, 1.0, 1.0]`` when there are
        no mapped mutations.
    """
    # NOTE(review): a second definition of calc_position_p_value follows this
    # one in the same file and replaces it at import time.

    # no mapped mutations: return the neutral result straight away
    if len(mut_info) == 0:
        return [bed.gene_name, 0, 0, 0.0, 1.0, 1.0]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # pool mapped and unmapped mutations and group them by context
    wanted = ['Context', 'Tumor_Allele']
    pooled = pd.concat([mut_info[wanted],
                        pd.DataFrame(unmapped_mut_info)[wanted]])
    context_cts = pooled['Context'].value_counts()
    context_to_mutations = {
        ctx: rows['Tumor_Allele'] for ctx, rows in pooled.groupby('Context')
    }

    # get vest scores for gene if directory provided
    gene_vest = None
    if score_dir:
        gene_vest = scores.read_vest_pickle(bed.gene_name, score_dir)
        if gene_vest is None:
            logger.warning(
                'Could not find VEST scores for {0}, skipping . . .'.format(
                    bed.gene_name))

    # get recurrent info for actual mutations (mapped + unmapped)
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(), gs)
    codon_pos, ref_aa, somatic_aa = [
        aa_mut_info[key] + unmapped_mut_info[key]
        for key in ('Codon Pos', 'Reference AA', 'Somatic AA')
    ]
    num_recurrent, pos_ent, delta_pos_ent, _ = cutils.calc_pos_info(
        codon_pos, ref_aa, somatic_aa,
        min_frac=min_fraction, min_recur=min_recurrent)

    # the vest score only uses the mapped mutations
    vest_score = scores.compute_vest_stat(gene_vest,
                                          aa_mut_info['Reference AA'],
                                          aa_mut_info['Somatic AA'],
                                          aa_mut_info['Codon Pos'])

    # perform simulations to get p-value
    ent_p_value, vest_p_value = pm.position_permutation(
        (num_recurrent, pos_ent, delta_pos_ent, vest_score),
        context_cts,
        context_to_mutations,
        sc,  # sequence context obj
        gs,  # gene sequence obj
        gene_vest,
        num_permutations,
        stop_thresh,
        pseudo_count)

    return [bed.gene_name, num_recurrent, pos_ent, vest_score,
            ent_p_value, vest_p_value]
def calc_position_p_value(mut_info,
                          unmapped_mut_info,
                          sc,
                          gs,
                          bed,
                          score_dir,
                          num_permutations,
                          stop_thresh,
                          pseudo_count,
                          min_recurrent,
                          min_fraction):
    """Computes position-based statistics for a gene and their p-values.

    Parameters
    ----------
    mut_info : pandas.DataFrame
        Mapped mutations; needs 'Coding Position' and 'Tumor_Allele'
        columns. Modified in place when non-empty ('Coding Position' cast
        to int, 'Context' column added).
    unmapped_mut_info : dict-like
        Extra mutations pooled with `mut_info`, read under the keys
        'Context', 'Tumor_Allele', 'Codon Pos', 'Reference AA' and
        'Somatic AA' (values presumably lists -- TODO confirm).
    sc : object
        Sequence context object (``sc.pos2context`` is indexed by coding
        position).
    gs : object
        Gene sequence object, handed to the helpers.
    bed : object
        Provides ``bed.gene_name``.
    score_dir
        Handed to ``scores.read_vest_pickle``; a falsy value skips loading
        VEST scores.
    num_permutations, stop_thresh, pseudo_count
        Handed unchanged to ``pm.position_permutation``.
    min_recurrent, min_fraction
        Handed to ``cutils.calc_pos_info`` as ``min_recur`` / ``min_frac``.

    Returns
    -------
    list
        ``[gene name, num_recurrent, pos_ent, vest_score, ent_p_value,
        vest_p_value]``. Without mapped mutations the statistics are 0 and
        both p-values are 1.0.
    """
    if len(mut_info) > 0:
        mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
        mut_info['Context'] = mut_info['Coding Position'].apply(
            lambda coding_pos: sc.pos2context[coding_pos]
        )

        # group mutations by context
        cols = ['Context', 'Tumor_Allele']
        unmapped_df = pd.DataFrame(unmapped_mut_info)
        combined_df = pd.concat([mut_info[cols], unmapped_df[cols]])
        context_cts = combined_df['Context'].value_counts()
        context_to_mutations = {}
        for context_name, context_group in combined_df.groupby('Context'):
            context_to_mutations[context_name] = context_group['Tumor_Allele']

        # get vest scores for gene if directory provided
        if not score_dir:
            gene_vest = None
        else:
            gene_vest = scores.read_vest_pickle(bed.gene_name, score_dir)
            if gene_vest is None:
                logger.warning('Could not find VEST scores for {0}, skipping . . .'.format(bed.gene_name))

        # get recurrent info for actual mutations
        mapped_aa = mc.get_aa_mut_info(mut_info['Coding Position'],
                                       mut_info['Tumor_Allele'].tolist(),
                                       gs)
        all_codon_pos = mapped_aa['Codon Pos'] + unmapped_mut_info['Codon Pos']
        all_ref_aa = mapped_aa['Reference AA'] + unmapped_mut_info['Reference AA']
        all_somatic_aa = mapped_aa['Somatic AA'] + unmapped_mut_info['Somatic AA']
        pos_info = cutils.calc_pos_info(all_codon_pos,
                                        all_ref_aa,
                                        all_somatic_aa,
                                        min_frac=min_fraction,
                                        min_recur=min_recurrent)
        # fourth value (per-position counts) is not needed here
        num_recurrent, pos_ent, delta_pos_ent, _ = pos_info

        # get vest score for actual mutations (mapped ones only)
        vest_score = scores.compute_vest_stat(gene_vest,
                                              mapped_aa['Reference AA'],
                                              mapped_aa['Somatic AA'],
                                              mapped_aa['Codon Pos'])

        # perform simulations to get p-value
        ent_p_value, vest_p_value = pm.position_permutation(
            (num_recurrent, pos_ent, delta_pos_ent, vest_score),
            context_cts,
            context_to_mutations,
            sc,  # sequence context obj
            gs,  # gene sequence obj
            gene_vest,
            num_permutations,
            stop_thresh,
            pseudo_count
        )
    else:
        # no mapped mutations: neutral statistics and p-values
        num_recurrent, pos_ent, vest_score = 0, 0, 0.0
        ent_p_value = vest_p_value = 1.0

    return [bed.gene_name, num_recurrent, pos_ent, vest_score,
            ent_p_value, vest_p_value]