def test_ctnnb1_get_aa_mut_info():
    """Check the codon annotation of an A>C change at the CTNNB1 start codon.

    Loads the CTNNB1 FASTA/BED fixtures found under ``file_dir``, mutates
    coding position 0 to ``C`` and verifies the reference codon, the somatic
    codon and the codon position returned by ``mc.get_aa_mut_info``.
    """
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    # gene sequence object backed by the CTNNB1 fasta fixture
    fasta_path = os.path.join(file_dir, 'data/CTNNB1.fa')
    gs = GeneSequence(pysam.Fastafile(fasta_path), nuc_context=1)

    # the first record of the CTNNB1 bed fixture defines the gene
    bed_path = os.path.join(file_dir, 'data/CTNNB1.bed')
    first_bed = list(utils.bed_generator(bed_path))[0]
    gs.set_gene(first_bed)

    # single mutation: first coding base changed to "C"
    aa_info = mc.get_aa_mut_info([0], ['C'], gs)

    ref_codon = aa_info['Reference Codon'][0]
    ref_codon_msg = 'First codon should be start codon ({0})'.format(ref_codon)
    assert ref_codon == 'ATG', ref_codon_msg

    somatic_codon = aa_info['Somatic Codon'][0]
    assert somatic_codon == 'CTG', 'First "A" should be replaced with a "C"'

    first_codon_pos = aa_info['Codon Pos'][0]
    assert first_codon_pos == 0, 'Start codon should be position 0'
def annotate_maf(coding_pos, somatic_base, gene_seq):
    """Build MAF-like rows describing SNVs at the given coding positions.

    Parameters
    ----------
    coding_pos : sequence of int
        0-based positions within the coding sequence, one per mutation.
    somatic_base : sequence of str
        Mutated base for each entry of `coding_pos`.
    gene_seq : GeneSequence
        Gene sequence; its ``bed`` attribute supplies the gene name, strand,
        chromosome and the sequence-position to genome-position map.

    Returns
    -------
    maf_list : list of list
        One row per mutation: gene name, strand, chromosome, start, end,
        reference base, somatic base, DNA change, protein change and
        variant classification. Empty when `coding_pos` is empty.
    """
    # make sure numpy array
    coding_pos = np.array(coding_pos)

    # info about gene
    gene_name = gene_seq.bed.gene_name
    strand = gene_seq.bed.strand
    chrom = gene_seq.bed.chrom
    gene_seq.bed.init_genome_coordinates()  # map seq pos to genome

    # np.vectorize raises ValueError on size-0 input (no otypes given), so
    # an empty request is answered directly instead of crashing below.
    if coding_pos.size == 0:
        return []

    # genome coordinate of every mutation (+1 on the mapped position)
    pos2genome = np.vectorize(lambda x: gene_seq.bed.seqpos2genome[x] + 1)
    genome_coord = pos2genome(coding_pos)

    # get info about mutations
    tmp_mut_info = mc.get_aa_mut_info(coding_pos, somatic_base, gene_seq)

    # get string describing variant
    var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                  tmp_mut_info['Somatic AA'],
                                                  tmp_mut_info['Codon Pos'])

    # prepare output
    maf_list = []
    for k, mysomatic_base in enumerate(somatic_base):
        ######
        # Note: positions are converted to 1-based positions
        # for reporting DNA/Protein change, but internally
        # they are represented as 0-based
        ######
        # format DNA change (uses the bases before any reverse complement)
        ref_nuc = tmp_mut_info['Reference Nuc'][k]
        nuc_pos = coding_pos[k]
        dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, nuc_pos + 1, mysomatic_base)

        # format protein change; a missing codon position stays None
        ref_aa = tmp_mut_info['Reference AA'][k]
        somatic_aa = tmp_mut_info['Somatic AA'][k]
        codon_pos = tmp_mut_info['Codon Pos'][k]
        codon_pos_1_based = (codon_pos + 1) if codon_pos is not None else None
        protein_change = 'p.{0}{1}{2}'.format(ref_aa, codon_pos_1_based, somatic_aa)

        # reverse complement if on negative strand
        if strand == '-':
            ref_nuc = utils.rev_comp(ref_nuc)
            mysomatic_base = utils.rev_comp(mysomatic_base)

        # append results
        maf_list.append([
            gene_name, strand, chrom, genome_coord[k], genome_coord[k],
            ref_nuc, mysomatic_base, dna_change, protein_change, var_class[k],
        ])

    return maf_list
def annotate_maf(coding_pos, somatic_base, gene_seq):
    """Annotate SNVs at coding positions and return them as MAF-like rows.

    Parameters
    ----------
    coding_pos : sequence of int
        0-based coding-sequence positions, one per mutation.
    somatic_base : sequence of str
        Mutated base for each position in `coding_pos`.
    gene_seq : GeneSequence
        Gene sequence object; ``gene_seq.bed`` provides gene name, strand,
        chromosome and the map from sequence position to genome position.

    Returns
    -------
    maf_list : list of list
        Rows of ``[gene, strand, chrom, start, end, ref base, somatic base,
        DNA change, protein change, variant class]``. An empty list is
        returned when no positions are supplied.
    """
    # make sure numpy array
    coding_pos = np.array(coding_pos)

    # info about gene
    bed = gene_seq.bed
    gene_name = bed.gene_name
    strand = bed.strand
    chrom = bed.chrom
    bed.init_genome_coordinates()  # map seq pos to genome

    # Guard: np.vectorize cannot be called on size-0 input without otypes
    # (it raises ValueError), so return the empty result up front.
    if coding_pos.size == 0:
        return []

    # get genome coordinate
    pos2genome = np.vectorize(lambda x: gene_seq.bed.seqpos2genome[x] + 1)
    genome_coord = pos2genome(coding_pos)

    # get info about mutations
    tmp_mut_info = mc.get_aa_mut_info(coding_pos, somatic_base, gene_seq)

    # get string describing variant
    var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                  tmp_mut_info['Somatic AA'],
                                                  tmp_mut_info['Codon Pos'])

    # prepare output
    maf_list = []
    for k, mysomatic_base in enumerate(somatic_base):
        ######
        # Note: positions are converted to 1-based positions
        # for reporting DNA/Protein change, but internally
        # they are represented as 0-based
        ######
        # format DNA change; built before the strand flip below
        ref_nuc = tmp_mut_info['Reference Nuc'][k]
        dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, coding_pos[k] + 1,
                                           mysomatic_base)

        # format protein change (None codon position is reported as-is)
        codon_pos = tmp_mut_info['Codon Pos'][k]
        codon_pos_1_based = (codon_pos + 1) if codon_pos is not None else None
        protein_change = 'p.{0}{1}{2}'.format(tmp_mut_info['Reference AA'][k],
                                              codon_pos_1_based,
                                              tmp_mut_info['Somatic AA'][k])

        # reverse complement if on negative strand
        if strand == '-':
            ref_nuc = utils.rev_comp(ref_nuc)
            mysomatic_base = utils.rev_comp(mysomatic_base)

        # append results
        maf_line = [gene_name, strand, chrom,
                    genome_coord[k], genome_coord[k],
                    ref_nuc, mysomatic_base,
                    dna_change, protein_change, var_class[k]]
        maf_list.append(maf_line)

    return maf_list
def calc_effect_p_value(mut_info, unmapped_mut_info, sc, gs, bed,
                        num_permutations, pseudo_count, min_recurrent,
                        min_fraction):
    """Compute the permutation p-value for the entropy-of-effect statistic.

    Parameters
    ----------
    mut_info : DataFrame-like
        Mappable mutations with 'Coding Position' and 'Tumor_Allele'
        columns. Modified in place: 'Coding Position' is cast to int and a
        'Context' column is added.
    unmapped_mut_info : dict
        Entries for unmappable mutations; the keys 'Context',
        'Tumor_Allele', 'Codon Pos', 'Reference AA' and 'Somatic AA' are read.
    sc : SequenceContext
        Provides ``pos2context`` and is forwarded to the permutation.
    gs : GeneSequence
        Gene sequence forwarded to the annotation and the permutation.
    bed : BedLine
        Only used for ``bed.gene_name``.
    num_permutations : int
        Number of permutations; also the denominator of the p-value.
    pseudo_count : int
        Forwarded to ``pm.effect_permutation``.
    min_recurrent, min_fraction
        Forwarded to ``cutils.calc_effect_info`` as ``min_recur`` and
        ``min_frac``.

    Returns
    -------
    list
        ``[gene name, num recurrent, num inactivating, effect entropy,
        entropy p-value]``; ``[gene name, 0, 0, 0, 1.0]`` when `mut_info`
        is empty.
    """
    # nothing to test without mappable mutations
    if not len(mut_info) > 0:
        return [bed.gene_name, 0, 0, 0, 1.0]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # group mutations by context
    cols = ['Context', 'Tumor_Allele']
    unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
    tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
    context_cts = tmp_df['Context'].value_counts()
    context_to_mutations = {
        name: group['Tumor_Allele']
        for name, group in tmp_df.groupby('Context')
    }

    # perform permutations; only the entropy null is used below
    effect_entropy_list, _recur_list, _inactivating_list = pm.effect_permutation(
        context_cts,
        context_to_mutations,
        sc,  # sequence context obj
        gs,  # gene sequence obj
        num_permutations,
        pseudo_count)

    # get effect info for actual mutations (mapped + unmapped)
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(),
                                     gs)
    codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    effect_ent, num_recur, num_inactivating = cutils.calc_effect_info(
        codon_pos, ref_aa, somatic_aa,
        min_frac=min_fraction,
        min_recur=min_recurrent)

    # permutation p-value: share of null entropies that are, within
    # utils.epsilon, no larger than the observed entropy
    entropy_num_nulls = sum(
        1 for null_ent in effect_entropy_list
        if null_ent - utils.epsilon <= effect_ent)
    ent_p_value = entropy_num_nulls / float(num_permutations)

    return [bed.gene_name, num_recur, num_inactivating, effect_ent, ent_p_value]
def calc_effect_p_value(mut_info, unmapped_mut_info,
                        sc, gs, bed,
                        num_permutations,
                        pseudo_count,
                        min_recurrent,
                        min_fraction):
    """Permutation test on the entropy of mutation effects for one gene.

    `mut_info` (a table with 'Coding Position' and 'Tumor_Allele' columns)
    is modified in place: the position column is cast to int and a
    'Context' column is added from ``sc.pos2context``. Mutations listed in
    `unmapped_mut_info` are appended to both the context counts and the
    observed statistic.

    Returns
    -------
    list
        ``[bed.gene_name, num_recur, num_inactivating, effect_ent,
        ent_p_value]``. With an empty `mut_info` the counts and entropy are 0
        and the p-value is 1.0.
    """
    # defaults reported when there are no mappable mutations
    num_recur, num_inactivating, effect_ent, ent_p_value = 0, 0, 0, 1.0

    if len(mut_info) > 0:
        coding = mut_info['Coding Position'].astype(int)
        mut_info['Coding Position'] = coding
        mut_info['Context'] = mut_info['Coding Position'].apply(
            lambda p: sc.pos2context[p])

        # group mutations by context
        cols = ['Context', 'Tumor_Allele']
        unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
        tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
        context_cts = tmp_df['Context'].value_counts()
        context_to_mutations = {}
        for name, group in tmp_df.groupby('Context'):
            context_to_mutations[name] = group['Tumor_Allele']

        # perform permutations
        permutation_result = pm.effect_permutation(context_cts,
                                                   context_to_mutations,
                                                   sc,  # sequence context obj
                                                   gs,  # gene sequence obj
                                                   num_permutations,
                                                   pseudo_count)
        effect_entropy_list = permutation_result[0]
        # the tuple must hold exactly three null lists
        _, _, _ = permutation_result

        # get effect info for actual mutations
        aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                         mut_info['Tumor_Allele'].tolist(),
                                         gs)
        observed = [aa_mut_info[key] + unmapped_mut_info[key]
                    for key in ('Codon Pos', 'Reference AA', 'Somatic AA')]
        effect_ent, num_recur, num_inactivating = cutils.calc_effect_info(
            observed[0], observed[1], observed[2],
            min_frac=min_fraction,
            min_recur=min_recurrent)

        # calculate permutation p-value
        nulls_at_or_below = [null_ent for null_ent in effect_entropy_list
                             if null_ent - utils.epsilon <= effect_ent]
        ent_p_value = len(nulls_at_or_below) / float(num_permutations)

    return [bed.gene_name, num_recur, num_inactivating, effect_ent, ent_p_value]
def non_silent_ratio_permutation(context_counts,
                                 context_to_mut,
                                 seq_context,
                                 gene_seq,
                                 num_permutations=10000):
    """Performs null-permutations for non-silent ratio across all genes.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null

    Returns
    -------
    non_silent_count_list : list of tuples
        list of non-silent and silent mutation counts under the null
    """
    # somatic bases ordered by context, matching the column order of the
    # stacked random positions below
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # (Series.iteritems was removed in pandas 2.0; items() is equivalent)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # determine result of random positions, one row per permutation
    non_silent_count_list = []
    for row in tmp_mut_pos:
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

        # calc non-silent / silent counts for this permutation
        tmp_non_silent = cutils.calc_non_silent_info(tmp_mut_info['Reference AA'],
                                                     tmp_mut_info['Somatic AA'],
                                                     tmp_mut_info['Codon Pos'])
        non_silent_count_list.append(tmp_non_silent)
    return non_silent_count_list
def non_silent_ratio_permutation(context_counts,
                                 context_to_mut,
                                 seq_context,
                                 gene_seq,
                                 num_permutations=10000):
    """Performs null-permutations for non-silent ratio across all genes.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null

    Returns
    -------
    non_silent_count_list : list of tuples
        list of non-silent and silent mutation counts under the null
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context.
    # items() replaces iteritems(), which pandas 2.0 removed.
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack needs a real sequence; passing a generator is rejected by
    # current NumPy releases, so materialize the arrays in a list first.
    pos_arrays = [pos_array for _context, pos_array in tmp_contxt_pos]
    tmp_mut_pos = np.hstack(pos_arrays)

    # determine result of random positions
    non_silent_count_list = []
    for row in tmp_mut_pos:
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

        # calc non-silent / silent counts for this permutation
        tmp_non_silent = cutils.calc_non_silent_info(tmp_mut_info['Reference AA'],
                                                     tmp_mut_info['Somatic AA'],
                                                     tmp_mut_info['Codon Pos'])
        non_silent_count_list.append(tmp_non_silent)
    return non_silent_count_list
def test_ctnnb1_get_aa_mut_info():
    """Mutating the first coding base of CTNNB1 must turn ATG into CTG.

    Uses the CTNNB1 fasta and bed fixtures located under ``file_dir``.
    """
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    data_path = lambda name: os.path.join(file_dir, name)

    # read fasta
    gene_fa = pysam.Fastafile(data_path('data/CTNNB1.fa'))
    gs = GeneSequence(gene_fa, nuc_context=1)

    # read CTNNB1 bed file and select its first gene record
    bed_list = []
    for bed_line in utils.bed_generator(data_path('data/CTNNB1.bed')):
        bed_list.append(bed_line)
    gs.set_gene(bed_list[0])

    # specify mutation: position 0 becomes "C"
    coding_pos, somatic_base = [0], ['C']

    # check mutation info
    aa_info = mc.get_aa_mut_info(coding_pos, somatic_base, gs)
    ref_codons = aa_info['Reference Codon']
    ref_codon_msg = 'First codon should be start codon ({0})'.format(ref_codons[0])
    assert ref_codons[0] == 'ATG', ref_codon_msg
    assert aa_info['Somatic Codon'][0] == 'CTG', \
        'First "A" should be replaced with a "C"'
    assert aa_info['Codon Pos'][0] == 0, 'Start codon should be position 0'
def calc_position_p_value(mut_info, unmapped_mut_info, sc, gs, bed, score_dir,
                          num_permutations, stop_thresh, pseudo_count,
                          min_recurrent, min_fraction):
    """Compute position-entropy and VEST permutation p-values for one gene.

    Parameters
    ----------
    mut_info : DataFrame-like
        Mappable mutations with 'Coding Position' and 'Tumor_Allele'
        columns. Modified in place ('Coding Position' cast to int,
        'Context' column added).
    unmapped_mut_info : dict
        Entries for unmappable mutations; keys 'Context', 'Tumor_Allele',
        'Codon Pos', 'Reference AA' and 'Somatic AA' are read.
    sc : SequenceContext
        Supplies ``pos2context``; forwarded to the permutation.
    gs : GeneSequence
        Gene sequence forwarded to the annotation and the permutation.
    bed : BedLine
        Supplies ``gene_name``.
    score_dir : str or falsy
        Directory of VEST score pickles; when falsy no scores are loaded.
    num_permutations, stop_thresh, pseudo_count
        Forwarded to ``pm.position_permutation``.
    min_recurrent, min_fraction
        Forwarded to ``cutils.calc_pos_info`` as ``min_recur``/``min_frac``.

    Returns
    -------
    list
        ``[gene name, num recurrent, position entropy, VEST score,
        entropy p-value, VEST p-value]``; ``[gene name, 0, 0, 0.0, 1.0, 1.0]``
        when `mut_info` is empty.
    """
    # no mappable mutations: report the neutral result
    if not len(mut_info) > 0:
        return [bed.gene_name, 0, 0, 0.0, 1.0, 1.0]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # group mutations by context
    cols = ['Context', 'Tumor_Allele']
    unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
    tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
    context_cts = tmp_df['Context'].value_counts()
    context_to_mutations = {
        name: group['Tumor_Allele']
        for name, group in tmp_df.groupby('Context')
    }

    # get vest scores for gene if directory provided
    gene_vest = None
    if score_dir:
        gene_vest = scores.read_vest_pickle(bed.gene_name, score_dir)
        if gene_vest is None:
            # only a warning: processing continues with gene_vest = None
            logger.warning(
                'Could not find VEST scores for {0}, skipping . . .'.format(
                    bed.gene_name))

    # get recurrent info for actual mutations (mapped + unmapped)
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(),
                                     gs)
    codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    num_recurrent, pos_ent, delta_pos_ent, _pos_ct = cutils.calc_pos_info(
        codon_pos, ref_aa, somatic_aa,
        min_frac=min_fraction,
        min_recur=min_recurrent)

    # vest score for actual mutations (mapped mutations only)
    vest_score = scores.compute_vest_stat(gene_vest,
                                          aa_mut_info['Reference AA'],
                                          aa_mut_info['Somatic AA'],
                                          aa_mut_info['Codon Pos'])

    # perform simulations to get p-value
    observed_stats = (num_recurrent, pos_ent, delta_pos_ent, vest_score)
    ent_p_value, vest_p_value = pm.position_permutation(
        observed_stats,
        context_cts,
        context_to_mutations,
        sc,  # sequence context obj
        gs,  # gene sequence obj
        gene_vest,
        num_permutations,
        stop_thresh,
        pseudo_count)

    return [bed.gene_name, num_recurrent, pos_ent, vest_score,
            ent_p_value, vest_p_value]
def singleprocess_permutation(info):
    """Simulate summary mutation counts for every gene of one chromosome.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts)``. `bed_list` holds the genes to process
        (the chromosome is taken from its first entry), `mut_df` is the
        mutation table and `opts` is an options mapping; the keys
        'num_permutations', 'input', 'context', 'by_sample' and 'score_dir'
        are read here.

    Returns
    -------
    result : list of list
        One row per permutation with counts summed over all genes: 7
        columns, or 9 when ``opts['score_dir']`` is set.
    obs_result : list or pd.DataFrame
        Observed counts summed over all genes, laid out as
        ``[non-silent, silent, nonsense, lost stop, splice site, lost start,
        missense]`` plus ``[mga entropy, vest]`` with a score directory;
        or the per-sample data frame when ``opts['by_sample']`` is set.
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_permutations = opts['num_permutations']
    by_sample = opts['by_sample']
    num_base_cols = 7  # summary columns accumulated in every mode
    offset = 3  # leading columns of each permutation row that are skipped

    gene_fa = pysam.Fastafile(opts['input'])
    # try/finally so the fasta handle is released even if a gene fails
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # variables for recording the actual observed number of non-silent
        # vs. silent mutations
        if not by_sample:
            obs_counts = [0] * num_base_cols
            obs_vest = 0
            obs_mga_entropy = 0
        else:
            uniq_samp = mut_df['Tumor_Sample'].unique()
            # NOTE(review): `cols` is not defined inside this function;
            # presumably a module-level list of column names -- confirm.
            obs_df = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                                  index=uniq_samp, columns=cols)

        # go through each gene to perform simulation
        num_result_cols = 9 if opts['score_dir'] else num_base_cols
        result = [[0] * num_result_cols for k in range(num_permutations)]
        for bed in bed_list:
            # compute context counts and somatic bases for each context
            gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            if context_to_mutations:
                ## get information about observed non-silent counts
                # get info about mutations
                tmp_mut_info = mc.get_aa_mut_info(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(),
                    gs)

                # update the observed count
                if not by_sample:
                    # calc mutation info summarizing observed mutations
                    tmp_result = cutils.calc_summary_info(
                        tmp_mut_info['Reference AA'],
                        tmp_mut_info['Somatic AA'],
                        tmp_mut_info['Codon Pos'],
                        bed.gene_name,
                        opts['score_dir'],
                        min_frac=0.0,
                        min_recur=3)
                    for col in range(num_base_cols):
                        obs_counts[col] += tmp_result[col]
                    if opts['score_dir']:
                        obs_vest += tmp_result[-2]
                        obs_mga_entropy += tmp_result[-3]
                else:
                    for tsamp in mutations_df['Tumor_Sample'].unique():
                        # row indices of this sample; a set keeps the
                        # membership test O(1) per mutation
                        ixs = np.where(mutations_df['Tumor_Sample'] == tsamp)[0]
                        ix_set = set(ixs.tolist())
                        ref_aa = [r for i, r in enumerate(tmp_mut_info['Reference AA'])
                                  if i in ix_set]
                        somatic_aa = [s for i, s in enumerate(tmp_mut_info['Somatic AA'])
                                      if i in ix_set]
                        codon_pos = [c for i, c in enumerate(tmp_mut_info['Codon Pos'])
                                     if i in ix_set]

                        # get summary info
                        tmp_result = cutils.calc_summary_info(ref_aa,
                                                              somatic_aa,
                                                              codon_pos,
                                                              bed.gene_name,
                                                              opts['score_dir'],
                                                              min_frac=0.0,
                                                              min_recur=3)
                        if opts['score_dir']:
                            # drop three entries so the row length matches
                            # the obs_df columns -- presumably; confirm
                            tmp_result.pop(-4)
                            tmp_result.pop(-4)
                            tmp_result.pop(-1)
                        # update df
                        obs_df.loc[tsamp, :] = obs_df.loc[tsamp, :] + np.array(tmp_result)

                ## Do permutations
                tmp_result = pm.summary_permutation(context_cts,
                                                    context_to_mutations,
                                                    sc,  # sequence context obj
                                                    gs,  # gene sequence obj
                                                    opts['score_dir'],
                                                    num_permutations)
            else:
                # gene without mutations contributes all-zero permutation rows
                num_null_cols = 14 if opts['score_dir'] else 10
                tmp_result = [[0] * num_null_cols
                              for k in range(num_permutations)]

            # increment the non-silent/silent counts for each permutation
            for j in range(num_permutations):
                acc_row = result[j]
                perm_row = tmp_result[j]
                for col in range(num_base_cols):
                    acc_row[col] += perm_row[col + offset]
                if opts['score_dir']:
                    acc_row[7] += perm_row[9 + offset]
                    acc_row[8] += perm_row[10 + offset]
    finally:
        gene_fa.close()

    if not by_sample:
        obs_result = list(obs_counts)
        if opts['score_dir']:
            obs_result.extend([obs_mga_entropy, obs_vest])
    else:
        obs_result = obs_df

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result, obs_result
def calc_hotmaps_p_value(mut_info, unmapped_mut_info, sc, gs, bed,
                         window_size, num_permutations, stop_thresh,
                         report_index=False, null_save_path=None):
    """Compute windowed hotspot p-values for every mutated codon of a gene.

    Parameters
    ----------
    mut_info : DataFrame-like
        Mappable mutations with 'Coding Position' and 'Tumor_Allele'
        columns. Modified in place ('Coding Position' cast to int, 'Context'
        added, and 'Codon Pos' added when `report_index` is true).
    unmapped_mut_info : dict
        Entries for unmappable mutations; keys 'Context', 'Tumor_Allele',
        'Codon Pos', 'Reference AA' and 'Somatic AA' are read.
    sc : SequenceContext
        Supplies ``pos2context``; forwarded to the permutation.
    gs : GeneSequence
        Gene sequence forwarded to the annotation and the permutation.
    bed : BedLine
        Supplies ``gene_name``.
    window_size
        Forwarded to ``utils.calc_windowed_sum`` and
        ``pm.hotmaps_permutation``.
    num_permutations, stop_thresh
        Forwarded to ``pm.hotmaps_permutation``.
    report_index : bool, default False
        Also report the first `mut_info` index label found at each codon.
    null_save_path : str or None
        Forwarded to ``pm.hotmaps_permutation``.

    Returns
    -------
    list of list
        Rows ``[gene, window, codon (1-based), count at codon, window sum,
        p-value]``; with `report_index` the index label is inserted after
        the codon. Empty when there are no mutations or no counted positions.
    """
    if not len(mut_info) > 0:
        return []

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # group mutations by context
    cols = ['Context', 'Tumor_Allele']
    unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
    tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
    context_cts = tmp_df['Context'].value_counts()
    context_to_mutations = {
        name: group['Tumor_Allele']
        for name, group in tmp_df.groupby('Context')
    }

    # get recurrent info for actual mutations (mapped + unmapped)
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(),
                                     gs)
    codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    pos_ct, window_sum_dict = utils.calc_windowed_sum(codon_pos,
                                                      ref_aa,
                                                      somatic_aa,
                                                      window_size)

    # no missense mutations
    if not pos_ct:
        return []

    # in case the index in the original mutation data frame is needed
    if report_index:
        mut_info['Codon Pos'] = aa_mut_info['Codon Pos']
        pos2ix = mut_info.groupby('Codon Pos').groups

    # perform simulations to get p-value
    pval_dict = pm.hotmaps_permutation(window_sum_dict,
                                       context_cts,
                                       context_to_mutations,
                                       sc,  # sequence context obj
                                       gs,  # gene sequence obj
                                       window_size,
                                       num_permutations,
                                       stop_thresh,
                                       null_save_path=null_save_path)

    # prepare output
    # NOTE: internally codon positions start at 0, so add 1 for the output
    # to the user.
    result = []
    for mywin in window_sum_dict:
        for k in window_sum_dict[mywin]:
            row = [bed.gene_name, mywin, k+1]
            if report_index:
                row.append(pos2ix[k][0])
            row.extend([pos_ct[k],
                        window_sum_dict[mywin][k],
                        pval_dict[mywin][k]])
            result.append(row)
    return result
def calc_deleterious_p_value(mut_info, unmapped_mut_info, sc, gs, bed,
                             num_permutations, stop_thresh, del_threshold,
                             pseudo_count, seed=None):
    """Calculates the p-value for the number of inactivating SNV mutations.

    The permutation test is only run when the observed number of
    deleterious mutations reaches `del_threshold`.

    Parameters
    ----------
    mut_info : DataFrame-like
        mutations mappable to the reference tx, with 'Coding Position' and
        'Tumor_Allele' columns. Modified in place ('Coding Position' cast
        to int, 'Context' column added).
    unmapped_mut_info : dict
        codon/amino acid residue info for mutations that are NOT mappable
        to the reference tx; keys 'Context', 'Tumor_Allele', 'Codon Pos',
        'Reference AA' and 'Somatic AA' are read.
    sc : SequenceContext object
        contains the nucleotide contexts for a gene such that new random
        positions can be obtained while respecting nucleotide context.
    gs : GeneSequence
        contains gene sequence
    bed : BedLine
        just used to return gene name
    num_permutations : int
        number of permutations to perform to estimate p-value.
    stop_thresh
        forwarded to ``pm.deleterious_permutation``
    del_threshold : number
        minimum observed deleterious count needed to run the permutations
    pseudo_count
        forwarded to ``pm.deleterious_permutation``
    seed : int (Default: None)
        not used by the current implementation

    Returns
    -------
    list
        ``[gene name, number of deleterious mutations, p-value]``. The
        p-value is None when `mut_info` is empty or the count is below
        `del_threshold`.
    """
    # no mappable mutations: nothing to count or test
    if not len(mut_info) > 0:
        return [bed.gene_name, 0, None]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # group mutations by context
    cols = ['Context', 'Tumor_Allele']
    unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
    tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
    context_cts = tmp_df['Context'].value_counts()
    context_to_mutations = {
        name: group['Tumor_Allele']
        for name, group in tmp_df.groupby('Context')
    }

    # get deleterious info for actual mutations (mapped + unmapped)
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(),
                                     gs)
    ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    num_del = cutils.calc_deleterious_info(ref_aa, somatic_aa, codon_pos)

    # skip permutation test if number of deleterious mutations is not at
    # least meet some user-specified threshold
    del_p_value = None
    if num_del >= del_threshold:
        del_p_value = pm.deleterious_permutation(num_del,
                                                 context_cts,
                                                 context_to_mutations,
                                                 sc,  # sequence context obj
                                                 gs,  # gene sequence obj
                                                 num_permutations,
                                                 stop_thresh,
                                                 pseudo_count)

    return [bed.gene_name, num_del, del_p_value]
def effect_permutation(context_counts,
                       context_to_mut,
                       seq_context,
                       gene_seq,
                       num_permutations=10000,
                       pseudo_count=0):
    """Performs null-permutations for effect-based mutation statistics
    in a single gene.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    pseudo_count : int, default: 0
        Pseudo-count for number of recurrent missense mutations for each
        permutation for the null distribution. Increasing pseudo_count
        makes the statistical test more stringent.

    Returns
    -------
    effect_entropy_list : list
        list of entropy of effect values under the null
    recur_list : list
        number of recurrent missense mutations
    inactivating_list : list
        number of inactivating mutations
    """
    # somatic bases ordered by context, matching the column order of the
    # stacked random positions below
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # (Series.iteritems was removed in pandas 2.0; items() is equivalent)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # calculate effect statistics as a result of random positions,
    # one row of positions per permutation
    effect_entropy_list, recur_list, inactivating_list = [], [], []
    for row in tmp_mut_pos:
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

        # calculate effect info for this permutation
        tmp_entropy, tmp_recur, tmp_inactivating = cutils.calc_effect_info(
            tmp_mut_info['Codon Pos'],
            tmp_mut_info['Reference AA'],
            tmp_mut_info['Somatic AA'],
            pseudo_count=pseudo_count,
            is_obs=0)
        effect_entropy_list.append(tmp_entropy)
        recur_list.append(tmp_recur)
        inactivating_list.append(tmp_inactivating)

    return effect_entropy_list, recur_list, inactivating_list
def hotmaps_permutation(obs_stat,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        window,
                        num_permutations=10000,
                        stop_criteria=100,
                        max_batch=25000,
                        null_save_path=None):
    """Performs null-permutations for position-based mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_stat : dict
        for each window size, a dictionary mapping codons to the sum of
        mutations in that window (indexed as ``obs_stat[w][codon]``)
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    window : iterable
        window sizes to evaluate; iterated here and passed unchanged to
        ``utils.calc_windowed_sum``
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic.
    max_batch : int
        maximum number of whole gene simulations to do at once.
        For large number of simulations holding a matrix of M x N,
        where M is the number of mutations and N is the number of simulations,
        can get quite large.
    null_save_path : str or None
        File path template to save the null distribution; it is filled in
        with ``null_save_path.format(w)`` for each window. If None, don't
        save it.

    Returns
    -------
    pvals : dict
        For each window, maps mutated codon position to the calculated
        p-value
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # figure out which position has highest value
    max_key = {w: max(obs_stat[w], key=(lambda key: obs_stat[w][key]))
               for w in window}

    # setup null dist counts
    null_cts = {w: {k: 0 for k in obs_stat[w]}
                for w in window}

    # empirical null distribution (saved if file path provided)
    empirical_null = {w: {} for w in window}

    def _reached_precision():
        # True once, for every window, the null count at the position with
        # the largest observed statistic has reached stop_criteria
        return all(null_cts[w][max_key[w]] >= stop_criteria for w in window)

    num_sim = 0  # number of simulations
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if _reached_precision():
            break

        # get random positions determined by sequence context
        # (Series.iteritems was removed in pandas 2.0; items() is equivalent)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        for row in tmp_mut_pos:
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

            # calculate position info
            tmp_pos, tmp_sim = utils.calc_windowed_sum(tmp_mut_info['Codon Pos'],
                                                       tmp_mut_info['Reference AA'],
                                                       tmp_mut_info['Somatic AA'],
                                                       window)

            # update the counts when the empirical null passes the observed
            for tmp_w in tmp_sim:
                for tmp_key in tmp_sim[tmp_w]:
                    # get mutation count for simulation
                    val = tmp_sim[tmp_w][tmp_key]

                    # add to empirical null distribution
                    empirical_null[tmp_w].setdefault(val, 0)
                    empirical_null[tmp_w][val] += 1

                    # update counts used for p-value
                    for key in null_cts[tmp_w]:
                        if val >= obs_stat[tmp_w][key]:
                            null_cts[tmp_w][key] += 1

            # update the number of simulations
            num_sim += len(tmp_pos)

            # stop iterations if reached sufficient precision
            if _reached_precision():
                break

    # calculate p-value from empirical null-distribution
    # NOTE(review): num_sim stays 0 if no simulated position was counted,
    # which makes this division fail -- confirm whether that can happen.
    pvals = {w: {k: float(null_cts[w][k]) / (num_sim) for k in obs_stat[w]}
             for w in window}

    # save empirical distribution
    if null_save_path:
        for w in window:
            # create null distribution: walking the counts from largest to
            # smallest gives the fraction of simulated values >= each count
            output = [['mutation_count', 'p-value']]
            tmp_sum = 0
            for mut_ct in reversed(sorted(empirical_null[w].keys())):
                tmp_sum += empirical_null[w][mut_ct]
                tmp_pval = tmp_sum / float(num_sim)
                output.append([mut_ct, tmp_pval])
            # save output
            with open(null_save_path.format(w), 'w') as handle:
                mywriter = csv.writer(handle, delimiter='\t', lineterminator='\n')
                mywriter.writerows(output)

    return pvals
def calc_position_p_value(mut_info, unmapped_mut_info,
                          sc, gs, bed,
                          score_dir,
                          num_permutations,
                          stop_thresh,
                          pseudo_count,
                          min_recurrent,
                          min_fraction):
    """Position-entropy and VEST permutation test for a single gene.

    `mut_info` (a table with 'Coding Position' and 'Tumor_Allele' columns)
    is modified in place: the position column is cast to int and a
    'Context' column is filled from ``sc.pos2context``. Mutations listed in
    `unmapped_mut_info` are added to the context counts and to the
    recurrence statistics, but not to the VEST score.

    Returns
    -------
    list
        ``[bed.gene_name, num_recurrent, pos_ent, vest_score, ent_p_value,
        vest_p_value]``. With an empty `mut_info` the statistics are
        ``0, 0, 0.0`` and both p-values are 1.0.
    """
    # defaults reported when there are no mappable mutations
    num_recurrent, pos_ent, vest_score = 0, 0, 0.0
    ent_p_value, vest_p_value = 1.0, 1.0

    if len(mut_info) > 0:
        mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
        mut_info['Context'] = mut_info['Coding Position'].apply(
            lambda p: sc.pos2context[p])

        # group mutations by context
        cols = ['Context', 'Tumor_Allele']
        unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
        tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
        context_cts = tmp_df['Context'].value_counts()
        context_to_mutations = {}
        for name, group in tmp_df.groupby('Context'):
            context_to_mutations[name] = group['Tumor_Allele']

        # get vest scores for gene if directory provided
        gene_vest = (scores.read_vest_pickle(bed.gene_name, score_dir)
                     if score_dir else None)
        if score_dir and gene_vest is None:
            # warning only; the analysis goes on without VEST scores
            missing_msg = 'Could not find VEST scores for {0}, skipping . . .'
            logger.warning(missing_msg.format(bed.gene_name))

        # get recurrent info for actual mutations
        aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                         mut_info['Tumor_Allele'].tolist(),
                                         gs)
        observed = [aa_mut_info[key] + unmapped_mut_info[key]
                    for key in ('Codon Pos', 'Reference AA', 'Somatic AA')]
        pos_info = cutils.calc_pos_info(observed[0], observed[1], observed[2],
                                        min_frac=min_fraction,
                                        min_recur=min_recurrent)
        num_recurrent, pos_ent, delta_pos_ent, pos_ct = pos_info

        # get vest score for actual mutations
        vest_score = scores.compute_vest_stat(gene_vest,
                                              aa_mut_info['Reference AA'],
                                              aa_mut_info['Somatic AA'],
                                              aa_mut_info['Codon Pos'])

        # perform simulations to get p-value
        observed_stats = (num_recurrent, pos_ent, delta_pos_ent, vest_score)
        permutation_result = pm.position_permutation(observed_stats,
                                                     context_cts,
                                                     context_to_mutations,
                                                     sc,  # sequence context obj
                                                     gs,  # gene sequence obj
                                                     gene_vest,
                                                     num_permutations,
                                                     stop_thresh,
                                                     pseudo_count)
        ent_p_value, vest_p_value = permutation_result

    return [bed.gene_name, num_recurrent, pos_ent,
            vest_score, ent_p_value, vest_p_value]
def maf_permutation(context_counts,
                    context_to_mut,
                    seq_context,
                    gene_seq,
                    num_permutations=10000,
                    drop_silent=False):
    """Performs null-permutations across all genes and records the results in
    a format like a MAF file. This could be useful for examining the null
    permutations because the alternative approaches always summarize the
    results. With the simulated null-permutations, novel metrics can be
    applied to create an empirical null-distribution.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match this.

    Returns
    -------
    maf_list : list of lists
        list of null mutations with mutation info in a MAF like format.
        Empty when there are no observed mutations to permute.
    """
    mycontexts = context_counts.index.tolist()
    base_pairs = [(base, one_context)
                  for one_context in mycontexts
                  for base in context_to_mut[one_context]]
    if not base_pairs:
        # nothing to permute; unpacking zip(*[]) below would raise
        return []
    somatic_base, base_context = zip(*base_pairs)

    # get random positions determined by sequence context
    # (Series.items works on old and new pandas; iteritems was removed)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack requires a sequence of arrays, not a generator
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # info about gene
    gene_name = gene_seq.bed.gene_name
    strand = gene_seq.bed.strand
    chrom = gene_seq.bed.chrom
    gene_seq.bed.init_genome_coordinates()  # map seq pos to genome

    # loop-invariant: maps a sequence position to its genome coordinate + 1
    pos2genome = np.vectorize(lambda x: gene_seq.bed.seqpos2genome[x] + 1)

    # determine result of random positions
    maf_list = []
    for row in tmp_mut_pos:
        # get genome coordinate
        genome_coord = pos2genome(row)

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

        # get string describing variant
        var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                      tmp_mut_info['Somatic AA'],
                                                      tmp_mut_info['Codon Pos'])

        # prepare output
        for k, mysomatic_base in enumerate(somatic_base):
            var_label = var_class[k].decode()
            if drop_silent and var_label == 'Silent':
                continue

            # NOTE(review): nuc_pos and codon_pos are written as returned
            # (no +1 applied here), unlike annotate_maf -- confirm the
            # intended numbering.
            # format DNA change (uses the bases before any strand flip)
            ref_nuc = tmp_mut_info['Reference Nuc'][k]
            nuc_pos = row[k]
            dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, nuc_pos, mysomatic_base)

            # format protein change
            ref_aa = tmp_mut_info['Reference AA'][k]
            somatic_aa = tmp_mut_info['Somatic AA'][k]
            codon_pos = tmp_mut_info['Codon Pos'][k]
            protein_change = 'p.{0}{1}{2}'.format(ref_aa, codon_pos, somatic_aa)

            # reverse complement if on negative strand
            if strand == '-':
                ref_nuc = utils.rev_comp(ref_nuc)
                mysomatic_base = utils.rev_comp(mysomatic_base)

            # append results
            maf_list.append([gene_name, strand, chrom,
                             genome_coord[k], genome_coord[k],
                             ref_nuc, mysomatic_base, base_context[k],
                             dna_change, protein_change, var_label])

    return maf_list
def summary_permutation(context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        score_dir,
                        num_permutations=10000,
                        min_frac=0.0,
                        min_recur=2,
                        drop_silent=False):
    """Performs null-permutations and summarizes the results as features over
    the gene.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    score_dir : str or None
        forwarded unchanged to ``cutils.calc_summary_info``
    num_permutations : int, default: 10000
        number of permutations to create for null
    min_frac : float, default: 0.0
        forwarded to ``cutils.calc_summary_info``
    min_recur : int, default: 2
        forwarded to ``cutils.calc_summary_info``
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match this.

    Returns
    -------
    summary_info_list : list of lists
        one row per permutation: gene name, 1-based permutation number and
        CDS length, followed by the values of ``cutils.calc_summary_info``.
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # (Series.items works on old and new pandas; iteritems was removed)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack requires a sequence of arrays, not a generator
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # determine result of random positions
    gene_name = gene_seq.bed.gene_name
    gene_len = gene_seq.bed.cds_len
    summary_info_list = []
    for i, row in enumerate(tmp_mut_pos):
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

        # Get all metrics summarizing each gene
        tmp_summary = cutils.calc_summary_info(tmp_mut_info['Reference AA'],
                                               tmp_mut_info['Somatic AA'],
                                               tmp_mut_info['Codon Pos'],
                                               gene_name,
                                               score_dir,
                                               min_frac=min_frac,
                                               min_recur=min_recur)

        # drop silent if needed
        if drop_silent:
            # silent mutation count is index 1
            tmp_summary[1] = 0

        summary_info_list.append([gene_name, i+1, gene_len] + tmp_summary)

    return summary_info_list
def effect_permutation(context_counts,
                       context_to_mut,
                       seq_context,
                       gene_seq,
                       num_permutations=10000,
                       pseudo_count=0):
    """Performs null-permutations for effect-based mutation statistics
    in a single gene.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    pseudo_count : int, default: 0
        Pseudo-count for number of recurrent missense mutations for each
        permutation for the null distribution. Increasing pseudo_count
        makes the statistical test more stringent.

    Returns
    -------
    effect_entropy_list : list
        list of entropy of effect values under the null
    recur_list : list
        number of recurrent missense mutations
    inactivating_list : list
        number of inactivating mutations
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # (Series.items works on old and new pandas; iteritems was removed)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack requires a sequence of arrays, not a generator
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # calculate effect-based statistics as a result of random positions
    effect_entropy_list, recur_list, inactivating_list = [], [], []
    for row in tmp_mut_pos:
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

        # calculate effect info (is_obs=0 marks the data as simulated)
        tmp_entropy, tmp_recur, tmp_inactivating = cutils.calc_effect_info(
            tmp_mut_info['Codon Pos'],
            tmp_mut_info['Reference AA'],
            tmp_mut_info['Somatic AA'],
            pseudo_count=pseudo_count,
            is_obs=0)
        effect_entropy_list.append(tmp_entropy)
        recur_list.append(tmp_recur)
        inactivating_list.append(tmp_inactivating)

    return effect_entropy_list, recur_list, inactivating_list
def protein_permutation(graph_score,
                        num_codons_obs,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        gene_graph,
                        num_permutations=10000,
                        stop_criteria=100,
                        pseudo_count=0):
    """Performs null-simulations for position-based mutation statistics
    in a single gene.

    The first ``stop_criteria - 1`` simulations are used to estimate the
    expected relative increase in graph coverage, which normalizes both the
    observed and the simulated statistics.

    Parameters
    ----------
    graph_score : float
        clustering score for observed data
    num_codons_obs : int
        number of codons with missense mutation in observed data
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_graph
        neighbor graph forwarded to ``scores.compute_ng_stat``
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic.
    pseudo_count : int, default: 0
        forwarded to ``cutils.calc_pos_info``

    Returns
    -------
    protein_pval : float
        p-value for clustering in neighbor graph constructure from protein
        structures
    obs_stat : float
        normalized observed statistic (1.0 when ``num_codons_obs`` is falsy)
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # (Series.items works on old and new pandas; iteritems was removed)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack requires a sequence of arrays, not a generator
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # calculate position-based statistics as a result of random positions
    # NOTE(review): obs_stat is only assigned once iteration
    # stop_criteria-1 is reached, so fewer simulated rows than that leave
    # it unbound at the return statement -- confirm callers never do this.
    null_graph_entropy_ct = 0
    coverage_list = []
    num_mut_list = []
    graph_entropy_list = []
    for i, row in enumerate(tmp_mut_pos):
        # calculate the expected value of the relative increase in coverage
        if i == stop_criteria-1:
            rel_inc = [coverage_list[k] / float(num_mut_list[k])
                       for k in range(stop_criteria-1)
                       if coverage_list[k]]
            exp_rel_inc = np.mean(rel_inc)

            # calculate observed statistic
            if num_codons_obs:
                obs_stat = graph_score / np.log2(exp_rel_inc*num_codons_obs)
            else:
                obs_stat = 1.0

            # calculate statistics for the simulations recorded so far
            sim_stat_list = [ent / np.log2(exp_rel_inc*num_mut_list[l])
                             for l, ent in enumerate(graph_entropy_list)]
            null_graph_entropy_ct = len([s for s in sim_stat_list
                                         if s-utils.epsilon <= obs_stat])

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

        # calculate position info
        tmp_tuple = cutils.calc_pos_info(tmp_mut_info['Codon Pos'],
                                         tmp_mut_info['Reference AA'],
                                         tmp_mut_info['Somatic AA'],
                                         pseudo_count=pseudo_count,
                                         is_obs=0)
        _, _, _, tmp_pos_ct = tmp_tuple

        # number of mutated codons for THIS simulation; it must be refreshed
        # every iteration because the normalization below depends on it
        tmp_num_mut_codons = len(tmp_pos_ct)
        if i < stop_criteria-1:
            num_mut_list.append(tmp_num_mut_codons)

        # get entropy on graph-smoothed probability distribution
        tmp_graph_entropy, tmp_coverage = scores.compute_ng_stat(gene_graph,
                                                                 tmp_pos_ct)

        # record the "coverage" in the graph
        if i < stop_criteria-1:
            coverage_list.append(tmp_coverage)
            graph_entropy_list.append(tmp_graph_entropy)

        # update empirical null distribution counts
        # NOTE(review): the simulation at i == stop_criteria-1 is neither
        # recorded above nor counted here, yet it is part of the (i+1)
        # denominator -- confirm whether that is intended.
        if i >= stop_criteria:
            if tmp_num_mut_codons:
                sim_stat = tmp_graph_entropy / np.log2(exp_rel_inc*tmp_num_mut_codons)
            else:
                sim_stat = 1.0

            # add count
            if sim_stat-utils.epsilon <= obs_stat:
                null_graph_entropy_ct += 1

        # stop iterations if reached sufficient precision
        if null_graph_entropy_ct >= stop_criteria:
            break

    # calculate p-value from empirical null-distribution
    protein_pval = float(null_graph_entropy_ct) / (i+1)

    return protein_pval, obs_stat
def hotmaps_permutation(obs_stat,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        window,
                        num_permutations=10000,
                        stop_criteria=100,
                        max_batch=25000,
                        null_save_path=None):
    """Performs null-permutations for position-based mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_stat : dict
        maps each window size to a dictionary mapping codons to the sum of
        mutations in that window
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    window : list
        window sizes to evaluate; each is used as a key of ``obs_stat`` and
        the whole collection is passed to ``utils.calc_windowed_sum``
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic.
    max_batch : int
        maximum number of whole gene simulations to do at once.
        For large number of simulations holding a matrix of M x N,
        where M is the number of mutations and N is the number of simulations,
        can get quite large.
    null_save_path : str or None
        File path to save null distribution. If None, don't save it.
        The path is formatted with the window size, so one tab-delimited
        file is written per window.

    Returns
    -------
    pvals : dict
        Maps each window size to a dictionary mapping mutated codon
        position to the calculated p-value
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # figure out which position has highest value
    max_key = {w: max(obs_stat[w], key=(lambda key: obs_stat[w][key]))
               for w in window}

    # setup null dist counts
    null_cts = {w: {k: 0 for k in obs_stat[w]} for w in window}

    # empirical null distribution (saved if file path provided)
    empirical_null = {w: {} for w in window}

    def _reached_precision():
        # enough null values matched/exceeded the top position of each window
        return all([null_cts[w][max_key[w]] >= stop_criteria
                    for w in window])

    num_sim = 0  # number of simulations
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if _reached_precision():
            break

        # get random positions determined by sequence context
        # (Series.items works on old and new pandas; iteritems was removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack requires a sequence of arrays, not a generator
        tmp_mut_pos = np.hstack([pos_array
                                 for _, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        for row in tmp_mut_pos:
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

            # calculate position info
            tmp_pos, tmp_sim = utils.calc_windowed_sum(tmp_mut_info['Codon Pos'],
                                                       tmp_mut_info['Reference AA'],
                                                       tmp_mut_info['Somatic AA'],
                                                       window)

            # update the counts when the empirical null passes the observed
            for tmp_w in tmp_sim:
                for tmp_key in tmp_sim[tmp_w]:
                    # get mutation count for simulation
                    val = tmp_sim[tmp_w][tmp_key]

                    # add to empirical null distribution
                    empirical_null[tmp_w].setdefault(val, 0)
                    empirical_null[tmp_w][val] += 1

                    # update counts used for p-value
                    for key in null_cts[tmp_w]:
                        if val >= obs_stat[tmp_w][key]:
                            null_cts[tmp_w][key] += 1

            # update the number of simulations
            num_sim += len(tmp_pos)

            # stop iterations if reached sufficient precision
            if _reached_precision():
                break

    # calculate p-value from empirical null-distribution
    pvals = {w: {k: float(null_cts[w][k]) / (num_sim) for k in obs_stat[w]}
             for w in window}

    # save empirical distribution
    if null_save_path:
        for w in window:
            # cumulative tail probability, from the largest count downwards
            output = [['mutation_count', 'p-value']]
            tmp_sum = 0
            for mut_count in reversed(sorted(empirical_null[w])):
                tmp_sum += empirical_null[w][mut_count]
                tmp_pval = tmp_sum / float(num_sim)
                output.append([mut_count, tmp_pval])
            # save output
            with open(null_save_path.format(w), 'w') as handle:
                mywriter = csv.writer(handle, delimiter='\t', lineterminator='\n')
                mywriter.writerows(output)

    return pvals
def singleprocess_permutation(info):
    """Simulate summary statistics for every gene in one group of BED lines.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts)``. ``opts`` is read for the keys
        'num_permutations', 'input' (FASTA path), 'context', 'by_sample'
        and 'score_dir'.

    Returns
    -------
    result : list of lists
        one row per permutation holding the counts summed over all genes
        (7 columns, or 9 when ``opts['score_dir']`` is set)
    obs_result : list or pd.DataFrame
        observed counts summed over all genes; a per-sample data frame when
        ``opts['by_sample']`` is set
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_permutations = opts['num_permutations']
    num_count_cols = 7  # leading count columns of a summary row
    offset = 3  # permutation rows start with gene name, iteration, length

    gene_fa = pysam.Fastafile(opts['input'])
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # variables for recording the actual observed number of non-silent
        # vs. silent mutations
        if not opts['by_sample']:
            # order: non-silent, silent, nonsense, lost stop, splice site,
            # lost start, missense
            obs_counts = [0] * num_count_cols
            obs_vest = 0
            obs_mga_entropy = 0
        else:
            uniq_samp = mut_df['Tumor_Sample'].unique()
            # NOTE(review): `cols` is not defined in this function; it is
            # presumably a module-level list of column names -- confirm.
            obs_df = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                                  index=uniq_samp, columns=cols)

        # go through each gene to perform simulation
        num_result_cols = 9 if opts['score_dir'] else 7
        result = [[0] * num_result_cols for _ in range(num_permutations)]
        for bed in bed_list:
            # compute context counts and somatic bases for each context
            gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            if context_to_mutations:
                # get info about observed mutations
                tmp_mut_info = mc.get_aa_mut_info(mutations_df['Coding Position'],
                                                  mutations_df['Tumor_Allele'].tolist(),
                                                  gs)

                # update the observed count
                if not opts['by_sample']:
                    # calc mutation info summarizing observed mutations
                    tmp_result = cutils.calc_summary_info(tmp_mut_info['Reference AA'],
                                                          tmp_mut_info['Somatic AA'],
                                                          tmp_mut_info['Codon Pos'],
                                                          bed.gene_name,
                                                          opts['score_dir'],
                                                          min_frac=0.0,
                                                          min_recur=3)
                    for col in range(num_count_cols):
                        obs_counts[col] += tmp_result[col]
                    if opts['score_dir']:
                        obs_vest += tmp_result[-2]
                        obs_mga_entropy += tmp_result[-3]
                else:
                    for tsamp in mutations_df['Tumor_Sample'].unique():
                        # set membership avoids a linear array scan per row
                        ixs = set(np.where(mutations_df['Tumor_Sample']==tsamp)[0].tolist())
                        ref_aa = [r for i, r in enumerate(tmp_mut_info['Reference AA']) if i in ixs]
                        somatic_aa = [s for i, s in enumerate(tmp_mut_info['Somatic AA']) if i in ixs]
                        codon_pos = [c for i, c in enumerate(tmp_mut_info['Codon Pos']) if i in ixs]

                        # get summary info
                        tmp_result = cutils.calc_summary_info(ref_aa,
                                                              somatic_aa,
                                                              codon_pos,
                                                              bed.gene_name,
                                                              opts['score_dir'],
                                                              min_frac=0.0,
                                                              min_recur=3)
                        # NOTE(review): all three pops are assumed to apply
                        # only when scores are enabled -- confirm against
                        # the length of `cols`.
                        if opts['score_dir']:
                            tmp_result.pop(-4)
                            tmp_result.pop(-4)
                            tmp_result.pop(-1)
                        # update df
                        obs_df.loc[tsamp,:] = obs_df.loc[tsamp,:] + np.array(tmp_result)

                ## Do permutations
                tmp_result = pm.summary_permutation(context_cts,
                                                    context_to_mutations,
                                                    sc,  # sequence context obj
                                                    gs,  # gene sequence obj
                                                    opts['score_dir'],
                                                    num_permutations)
            else:
                # gene without mutations contributes all-zero rows
                num_tmp_cols = 14 if opts['score_dir'] else 10
                tmp_result = [[0] * num_tmp_cols for _ in range(num_permutations)]

            # increment the non-silent/silent counts for each permutation
            for j in range(num_permutations):
                total_row = result[j]
                null_row = tmp_result[j]
                for col in range(num_count_cols):
                    total_row[col] += null_row[col+offset]
                if opts['score_dir']:
                    total_row[7] += null_row[9+offset]
                    total_row[8] += null_row[10+offset]
    finally:
        # release the FASTA handle even if a gene fails part-way through
        gene_fa.close()

    if not opts['by_sample']:
        obs_result = list(obs_counts)
        if opts['score_dir']:
            obs_result.extend([obs_mga_entropy, obs_vest])
    else:
        obs_result = obs_df
    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result, obs_result
def calc_hotmaps_p_value(mut_info,
                         unmapped_mut_info,
                         sc,
                         gs,
                         bed,
                         window_size,
                         num_permutations,
                         stop_thresh,
                         report_index=False):
    """Compute per-codon p-values from windowed mutation counts.

    Parameters
    ----------
    mut_info : pd.DataFrame
        mutations with 'Coding Position' and 'Tumor_Allele' columns. The
        frame is modified in place ('Coding Position' cast to int, a
        'Context' column added, and 'Codon Pos' when ``report_index``).
    unmapped_mut_info : dict
        column-wise mutation info; the keys 'Context', 'Tumor_Allele',
        'Codon Pos', 'Reference AA' and 'Somatic AA' are read here.
    sc : SequenceContext
        provides ``pos2context`` and is forwarded to the permutation test
    gs : GeneSequence
        gene sequence forwarded to the amino acid lookup and permutations
    bed : BedLine
        only ``bed.gene_name`` is read
    window_size
        forwarded to ``utils.calc_windowed_sum`` and
        ``pm.hotmaps_permutation``
    num_permutations, stop_thresh
        forwarded unchanged to ``pm.hotmaps_permutation``
    report_index : bool, default=False
        also report the first index of ``mut_info`` found at each codon

    Returns
    -------
    result : list of lists
        one row per codon: gene name, 1-based codon position, (optionally
        the data frame index), mutation count, windowed sum and p-value.
        Empty when there are no mutations or no counted positions.
    """
    if len(mut_info) == 0:
        return []

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # pool mapped and unmapped mutations, then group them by context
    grouping_cols = ['Context', 'Tumor_Allele']
    unmapped_df = pd.DataFrame(unmapped_mut_info)
    pooled_df = pd.concat([mut_info[grouping_cols],
                           unmapped_df[grouping_cols]])
    context_cts = pooled_df['Context'].value_counts()
    context_to_mutations = {
        ctx_name: ctx_group['Tumor_Allele']
        for ctx_name, ctx_group in pooled_df.groupby('Context')
    }

    # windowed counts for the observed mutations
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(),
                                     gs)
    all_codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    all_ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    all_somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    pos_ct, window_sum_dict = utils.calc_windowed_sum(
        all_codon_pos, all_ref_aa, all_somatic_aa, window_size)

    # no missense mutations
    if not pos_ct:
        return []

    # remember which rows of the original data frame hit each codon
    if report_index:
        mut_info['Codon Pos'] = aa_mut_info['Codon Pos']
        pos2ix = mut_info.groupby('Codon Pos').groups

    # perform simulations to get p-value
    pval_dict = pm.hotmaps_permutation(
        window_sum_dict,
        context_cts,
        context_to_mutations,
        sc,  # sequence context obj
        gs,  # gene sequence obj
        window_size,
        num_permutations,
        stop_thresh)

    # NOTE: internally codon positions start at 0, so add 1 for the output
    # to the user.
    if report_index:
        return [[bed.gene_name, k + 1, pos2ix[k][0], pos_ct[k],
                 window_sum_dict[k], pval_dict[k]]
                for k in window_sum_dict]
    return [[bed.gene_name, k + 1, pos_ct[k],
             window_sum_dict[k], pval_dict[k]]
            for k in window_sum_dict]
def calc_protein_p_value(mut_info,
                         unmapped_mut_info,
                         sc,
                         gs,
                         bed,
                         graph_dir,
                         num_permutations,
                         stop_thresh,
                         min_recurrent,
                         min_fraction):
    """Computes the p-value for clustering on a neighbor graph composed of
    codons connected with edges if they are spatially near in 3D protein
    structure.

    Parameters
    ----------
    mut_info : pd.DataFrame
        mutations with 'Coding Position' and 'Tumor_Allele' columns. The
        frame is modified in place ('Coding Position' cast to int and a
        'Context' column added).
    unmapped_mut_info : dict
        column-wise mutation info; the keys 'Context', 'Tumor_Allele',
        'Codon Pos', 'Reference AA' and 'Somatic AA' are read here.
    sc : SequenceContext
        provides ``pos2context`` and is forwarded to the permutation test
    gs : GeneSequence
        gene sequence forwarded to the amino acid lookup and permutations
    bed : BedLine
        only ``bed.gene_name`` is read
    graph_dir : str or None
        directory with neighbor graph pickles; falsy means no graph
    num_permutations, stop_thresh
        forwarded unchanged to ``pm.protein_permutation``
    min_recurrent, min_fraction
        forwarded to ``cutils.calc_pos_info`` as ``min_recur``/``min_frac``

    Returns
    -------
    result : list
        [gene name, number of recurrent mutations, normalized graph score,
        p-value]. The score/p-value fall back to 0.0/1.0 when there are no
        mutations or the graph statistics could not be computed.
    """
    if len(mut_info) > 0:
        mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
        mut_info['Context'] = mut_info['Coding Position'].apply(
            lambda x: sc.pos2context[x])

        # group mutations by context
        cols = ['Context', 'Tumor_Allele']
        unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
        tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
        context_cts = tmp_df['Context'].value_counts()
        context_to_mutations = dict(
            (name, group['Tumor_Allele'])
            for name, group in tmp_df.groupby('Context'))

        # get neighbor graph for gene if directory provided
        if graph_dir:
            gene_graph = scores.read_neighbor_graph_pickle(
                bed.gene_name, graph_dir)
            if gene_graph is None:
                logger.warning(
                    'Could not find neighbor graph for {0}, skipping . . .'.
                    format(bed.gene_name))
        else:
            gene_graph = None

        # get recurrent info for actual mutations
        aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                         mut_info['Tumor_Allele'].tolist(),
                                         gs)
        codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
        ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
        somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
        num_recurrent, _, _, pos_ct = cutils.calc_pos_info(
            codon_pos,
            ref_aa,
            somatic_aa,
            min_frac=min_fraction,
            min_recur=min_recurrent)
        try:
            # get graph score for actual mutations
            graph_score, _ = scores.compute_ng_stat(gene_graph, pos_ct)

            # perform simulations to get p-value
            protein_p_value, norm_graph_score = pm.protein_permutation(
                graph_score,
                len(pos_ct),
                context_cts,
                context_to_mutations,
                sc,  # sequence context obj
                gs,  # gene sequence obj
                gene_graph,
                num_permutations,
                stop_thresh)
        except Exception:
            # deliberately best-effort: one failing gene must not abort the
            # run, but keep the traceback available for debugging
            norm_graph_score = 0.0
            protein_p_value = 1.0
            logger.warning('Codon numbering problem with ' + bed.gene_name)
            logger.debug('Graph statistic failure for %s', bed.gene_name,
                         exc_info=True)
    else:
        norm_graph_score = 0.0
        protein_p_value = 1.0
        num_recurrent = 0

    result = [bed.gene_name, num_recurrent, norm_graph_score, protein_p_value]
    return result
def summary_permutation(context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        score_dir,
                        num_permutations=10000,
                        min_frac=0.0,
                        min_recur=2,
                        drop_silent=False):
    """Performs null-permutations and summarizes the results as features over
    the gene.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    score_dir : str or None
        forwarded unchanged to ``cutils.calc_summary_info``
    num_permutations : int, default: 10000
        number of permutations to create for null
    min_frac : float, default: 0.0
        forwarded to ``cutils.calc_summary_info``
    min_recur : int, default: 2
        forwarded to ``cutils.calc_summary_info``
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match this.

    Returns
    -------
    summary_info_list : list of lists
        one row per permutation: gene name, 1-based permutation number and
        CDS length, followed by the values of ``cutils.calc_summary_info``.
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # (Series.items works on old and new pandas; iteritems was removed)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # determine result of random positions
    gene_name = gene_seq.bed.gene_name
    gene_len = gene_seq.bed.cds_len
    summary_info_list = []
    for perm_num, row in enumerate(tmp_mut_pos, 1):
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

        # Get all metrics summarizing each gene
        tmp_summary = cutils.calc_summary_info(tmp_mut_info['Reference AA'],
                                               tmp_mut_info['Somatic AA'],
                                               tmp_mut_info['Codon Pos'],
                                               gene_name,
                                               score_dir,
                                               min_frac=min_frac,
                                               min_recur=min_recur)

        # drop silent if needed
        if drop_silent:
            # silent mutation count is index 1
            tmp_summary[1] = 0

        summary_info_list.append([gene_name, perm_num, gene_len] + tmp_summary)

    return summary_info_list
def calc_deleterious_p_value(mut_info,
                             unmapped_mut_info,
                             sc,
                             gs,
                             bed,
                             num_permutations,
                             stop_thresh,
                             del_threshold,
                             pseudo_count,
                             seed=None):
    """Calculates the p-value for the number of inactivating SNV mutations.

    The permutation test is only run when the observed number of
    deleterious mutations reaches ``del_threshold``.

    Parameters
    ----------
    mut_info : pd.DataFrame
        mutations mappable to the reference transcript, with
        'Coding Position' and 'Tumor_Allele' columns. The frame is modified
        in place ('Coding Position' cast to int, 'Context' column added).
    unmapped_mut_info : dict
        column-wise info for mutations that are NOT mappable to the
        reference transcript; the keys 'Context', 'Tumor_Allele',
        'Codon Pos', 'Reference AA' and 'Somatic AA' are read here.
    sc : SequenceContext object
        provides ``pos2context`` and is forwarded to the permutation test
    gs : GeneSequence
        contains gene sequence
    bed : BedLine
        just used to return gene name
    num_permutations : int
        forwarded to ``pm.deleterious_permutation``
    stop_thresh : int
        forwarded to ``pm.deleterious_permutation``
    del_threshold : int
        minimum observed deleterious count required to run the test
    pseudo_count : int
        forwarded to ``pm.deleterious_permutation``
    seed : int (Default: None)
        accepted but not used by this function

    Returns
    -------
    result : list
        [gene name, number of deleterious mutations, p-value]; the p-value
        is None when the permutation test was skipped.
    """
    # no mutations at all: nothing to count and no test to run
    if len(mut_info) == 0:
        return [bed.gene_name, 0, None]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # pool mapped and unmapped mutations, then group them by context
    grouping_cols = ['Context', 'Tumor_Allele']
    unmapped_df = pd.DataFrame(unmapped_mut_info)
    pooled_df = pd.concat([mut_info[grouping_cols],
                           unmapped_df[grouping_cols]])
    context_cts = pooled_df['Context'].value_counts()
    context_to_mutations = {
        ctx_name: ctx_group['Tumor_Allele']
        for ctx_name, ctx_group in pooled_df.groupby('Context')
    }

    # count deleterious mutations among the observed ones
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(),
                                     gs)
    all_ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    all_somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    all_codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    num_del = cutils.calc_deleterious_info(all_ref_aa,
                                           all_somatic_aa,
                                           all_codon_pos)

    # only simulate when the user-specified threshold is met
    del_p_value = None
    if num_del >= del_threshold:
        del_p_value = pm.deleterious_permutation(
            num_del,
            context_cts,
            context_to_mutations,
            sc,  # sequence context obj
            gs,  # gene sequence obj
            num_permutations,
            stop_thresh,
            pseudo_count)

    return [bed.gene_name, num_del, del_p_value]
def deleterious_permutation(obs_del,
                            context_counts,
                            context_to_mut,
                            seq_context,
                            gene_seq,
                            num_permutations=10000,
                            stop_criteria=100,
                            pseudo_count=0,
                            max_batch=25000):
    """Performs null-permutations for deleterious mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_del : int
        observed number of deleterious mutations
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int, default: 100
        stop simulating once this many null simulations have at least
        ``obs_del`` deleterious mutations
    pseudo_count : int, default: 0
        accepted for interface compatibility; not used in this function
    max_batch : int, default: 25000
        maximum number of simulations generated at once

    Returns
    -------
    del_pval : float
        fraction of performed simulations whose deleterious mutation count
        is at least ``obs_del``
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    num_sim = 0
    null_del_ct = 0
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_del_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (Series.items works on old and new pandas; iteritems was removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

        # determine result of random positions
        for i, row in enumerate(tmp_mut_pos):
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

            # calc deleterious mutation info
            tmp_del_count = cutils.calc_deleterious_info(tmp_mut_info['Reference AA'],
                                                         tmp_mut_info['Somatic AA'],
                                                         tmp_mut_info['Codon Pos'])

            # update empirical null distribution
            if tmp_del_count >= obs_del:
                null_del_ct += 1

            # stop if reach sufficient precision on p-value
            if null_del_ct >= stop_criteria:
                break

        # update number of simulations; i is the index of the last
        # simulation evaluated in this batch
        num_sim += i + 1

    del_pval = float(null_del_ct) / (num_sim)
    return del_pval
def protein_permutation(graph_score,
                        num_codons_obs,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        gene_graph,
                        num_permutations=10000,
                        stop_criteria=100,
                        pseudo_count=0):
    """Performs null-simulations for position-based mutation statistics
    in a single gene.

    The first `stop_criteria - 1` simulations are used to estimate the
    expected relative increase in graph coverage per mutated codon. That
    estimate normalizes both the observed graph score and every simulated
    graph entropy before they are compared.

    Parameters
    ----------
    graph_score : float
        clustering score for observed data
    num_codons_obs : int
        number of codons with missense mutation in observed data
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_graph
        neighbor graph handed unchanged to `scores.compute_ng_stat`
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic.
    pseudo_count : int, default: 0
        forwarded to `cutils.calc_pos_info`

    Returns
    -------
    protein_pval : float
        p-value for clustering in neighbor graph constructure from protein
        structures
    obs_stat : float
        normalized observed statistic (1.0 when `num_codons_obs` is falsy)

    Notes
    -----
    NOTE(review): `obs_stat` and `exp_rel_inc` are only assigned on
    iteration `stop_criteria - 1`; if fewer rows are simulated the final
    `return` raises UnboundLocalError.
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # (Series.items() replaces iteritems(), which pandas 2.0 removed)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # calculate position-based statistics as a result of random positions
    null_graph_entropy_ct = 0
    coverage_list = []
    num_mut_list = []
    graph_entropy_list = []
    for i, row in enumerate(tmp_mut_pos):
        # calculate the expected value of the relative increase in coverage
        if i == stop_criteria-1:
            rel_inc = [coverage_list[k] / float(num_mut_list[k])
                       for k in range(stop_criteria-1)
                       if coverage_list[k]]
            exp_rel_inc = np.mean(rel_inc)

            # calculate observed statistic
            if num_codons_obs:
                obs_stat = graph_score / np.log2(exp_rel_inc*num_codons_obs)
            else:
                obs_stat = 1.0

            # calculate statistics for the simulations recorded so far and
            # seed the null count with them
            sim_stat_list = [ent / np.log2(exp_rel_inc*num_mut_list[idx])
                             for idx, ent in enumerate(graph_entropy_list)]
            null_graph_entropy_ct = len([s for s in sim_stat_list
                                         if s-utils.epsilon <= obs_stat])

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # calculate position info
        tmp_tuple = cutils.calc_pos_info(tmp_mut_info['Codon Pos'],
                                         tmp_mut_info['Reference AA'],
                                         tmp_mut_info['Somatic AA'],
                                         pseudo_count=pseudo_count,
                                         is_obs=0)
        _, _, _, tmp_pos_ct = tmp_tuple

        # record num of mut codons
        # NOTE(review): tmp_num_mut_codons is only refreshed during the
        # warm-up phase, so the normalization further below reuses the
        # value from iteration stop_criteria-2 -- confirm this is intended
        if i < stop_criteria-1:
            tmp_num_mut_codons = len(tmp_pos_ct)
            num_mut_list.append(tmp_num_mut_codons)

        # get entropy on graph-smoothed probability distribution
        tmp_graph_entropy, tmp_coverage = scores.compute_ng_stat(gene_graph,
                                                                 tmp_pos_ct)

        # record the "coverage" in the graph
        if i < stop_criteria-1:
            coverage_list.append(tmp_coverage)
            graph_entropy_list.append(tmp_graph_entropy)

        # update empirical null distribution counts
        # NOTE(review): iteration i == stop_criteria-1 is neither recorded
        # above nor counted here, yet it is part of the (i+1) denominator
        if i >= stop_criteria:
            if tmp_num_mut_codons:
                sim_stat = tmp_graph_entropy / np.log2(exp_rel_inc*tmp_num_mut_codons)
            else:
                sim_stat = 1.0

            # add count
            if sim_stat-utils.epsilon <= obs_stat:
                null_graph_entropy_ct += 1

        # stop iterations if reached sufficient precision
        if null_graph_entropy_ct >= stop_criteria:
            break

    # calculate p-value from empirical null-distribution
    protein_pval = float(null_graph_entropy_ct) / (i+1)

    return protein_pval, obs_stat
def singleprocess_permutation(info):
    """Runs the requested analysis for every gene on one chromosome.

    Depending on `opts`, each gene with mutations is either summarized
    (observed data), annotated in MAF-like form (observed data), or
    simulated via `pm.maf_permutation` / `pm.summary_permutation`.

    Parameters
    ----------
    info : tuple
        `(bed_list, mut_df, opts)` where `bed_list` holds the genes to
        process (the first element's `chrom` is used for logging),
        `mut_df` is passed to `mc.compute_mutation_context`, and `opts`
        is a dict read for the keys 'num_iterations', 'input', 'context',
        'summary', 'maf', 'score_dir', 'fraction' and 'recurrent'.

    Returns
    -------
    result : list
        concatenation of the per-gene result rows
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_iterations = opts['num_iterations']
    gene_fa = pysam.Fastafile(opts['input'])

    # make sure the FASTA handle is released even if a gene fails
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # go through each gene to perform simulation
        result = []
        for bed in bed_list:
            # compute context counts and somatic bases for each context
            gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            # nothing to report for genes without usable mutations
            if not context_to_mutations:
                continue

            if opts['summary'] and not num_iterations:
                ## get information about observed non-silent counts
                tmp_mut_info = mc.get_aa_mut_info(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(),
                    gs)
                # calc mutation info summarizing observed mutations
                tmp_result = cutils.calc_summary_info(
                    tmp_mut_info['Reference AA'],
                    tmp_mut_info['Somatic AA'],
                    tmp_mut_info['Codon Pos'],
                    bed.gene_name,
                    opts['score_dir'],
                    min_frac=opts['fraction'],
                    min_recur=opts['recurrent'])
                tmp_result = [[bed.gene_name, 'NA', bed.cds_len] + tmp_result]
            elif opts['maf'] and not num_iterations:
                ## Just record protein changes in MAF
                tmp_result = anot.annotate_maf(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(),
                    gs)
                # add tumor sample / tumor type info to output
                tmp_result = [
                    line + [mutations_df['Tumor_Sample'].iloc[i],
                            mutations_df['Tumor_Type'].iloc[i]]
                    for i, line in enumerate(tmp_result)
                ]
            elif opts['maf']:
                ## Do permutations
                # if user specified MAF format then output all mutations in
                # MAF format
                tmp_result = pm.maf_permutation(context_cts,
                                                context_to_mutations,
                                                sc,
                                                gs,
                                                num_iterations)
            else:
                # Summarized results for feature for each simulation for each
                # gene
                tmp_result = pm.summary_permutation(
                    context_cts,
                    context_to_mutations,
                    sc,  # sequence context obj
                    gs,  # gene sequence obj
                    opts['score_dir'],
                    num_iterations,
                    min_frac=opts['fraction'],
                    min_recur=opts['recurrent'])
            result += tmp_result
    finally:
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result
def deleterious_permutation(obs_del,
                            context_counts,
                            context_to_mut,
                            seq_context,
                            gene_seq,
                            num_permutations=10000,
                            stop_criteria=100,
                            pseudo_count=0,
                            max_batch=25000):
    """Performs null-permutations for deleterious mutation statistics
    in a single gene.

    Simulations are requested from `seq_context` in batches of at most
    `max_batch`, and stop early once `stop_criteria` simulations have a
    deleterious count at least as large as `obs_del`.

    Parameters
    ----------
    obs_del : int or float
        observed deleterious statistic; a simulation is counted when its
        simulated value is >= obs_del
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int, default: 100
        stop once this many simulations are at least as extreme as the
        observed statistic
    pseudo_count : int, default: 0
        Not used by the current implementation; kept so the signature
        stays compatible with existing callers.
    max_batch : int, default: 25000
        maximum number of simulations requested from `seq_context` at once

    Returns
    -------
    del_pval : float
        fraction of the simulations actually performed whose deleterious
        count was >= obs_del

    Raises
    ------
    ZeroDivisionError
        if no simulation is performed (e.g. `num_permutations` is 0 or
        `stop_criteria` <= 0)
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    num_sim = 0
    null_del_ct = 0
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_del_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (Series.items() replaces iteritems(), which pandas 2.0 removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack needs a real sequence; newer NumPy rejects a generator
        tmp_mut_pos = np.hstack([pos_array
                                 for base, pos_array in tmp_contxt_pos])

        # determine result of random positions; count the rows explicitly
        # instead of relying on the loop variable after the loop, which is
        # unbound/stale when a batch yields no rows
        sims_in_batch = 0
        for sims_in_batch, row in enumerate(tmp_mut_pos, 1):
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calc deleterious mutation info
            tmp_del_count = cutils.calc_deleterious_info(
                tmp_mut_info['Reference AA'],
                tmp_mut_info['Somatic AA'],
                tmp_mut_info['Codon Pos'])

            # update empirical null distribution
            if tmp_del_count >= obs_del:
                null_del_ct += 1

            # stop if reach sufficient precision on p-value
            if null_del_ct >= stop_criteria:
                break

        # update number of simulations
        num_sim += sims_in_batch

    del_pval = float(null_del_ct) / num_sim

    return del_pval
def calc_protein_p_value(mut_info,
                         unmapped_mut_info,
                         sc, gs, bed,
                         graph_dir,
                         num_permutations,
                         stop_thresh,
                         min_recurrent,
                         min_fraction):
    """Computes the p-value for clustering on a neighbor graph composed
    of codons connected with edges if they are spatially near in 3D protein
    structure.

    Parameters
    ----------
    mut_info : pd.DataFrame
        mutations with 'Coding Position' and 'Tumor_Allele' columns.
        Modified in place: 'Coding Position' is cast to int and a
        'Context' column is added.
    unmapped_mut_info : dict
        extra mutations; read for the keys 'Context', 'Tumor_Allele',
        'Codon Pos', 'Reference AA' and 'Somatic AA', whose values are
        concatenated onto the corresponding lists for `mut_info`
    sc : SequenceContext
        sequence context object; `sc.pos2context` maps coding position
        to context
    gs : GeneSequence
        gene sequence object
    bed
        gene object providing `gene_name`
    graph_dir : str or None
        directory holding neighbor graph pickles; when falsy no graph is
        loaded
    num_permutations : int
        number of simulations passed to `pm.protein_permutation`
    stop_thresh : int
        early-stopping threshold passed to `pm.protein_permutation`
    min_recurrent : int
        passed to `cutils.calc_pos_info` as `min_recur`
    min_fraction : float
        passed to `cutils.calc_pos_info` as `min_frac`

    Returns
    -------
    result : list
        `[gene name, num recurrent, normalized graph score, p-value]`.
        With no mutations, or when scoring/simulation raises, the score is
        0.0 and the p-value is 1.0.
    """
    # no mutations: nothing to test
    if len(mut_info) == 0:
        return [bed.gene_name, 0, 0.0, 1.0]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda x: sc.pos2context[x])

    # group mutations by context
    cols = ['Context', 'Tumor_Allele']
    unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
    tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
    context_cts = tmp_df['Context'].value_counts()
    context_to_mutations = {name: group['Tumor_Allele']
                            for name, group in tmp_df.groupby('Context')}

    # read the neighbor graph for the gene if a directory was provided
    if graph_dir:
        gene_graph = scores.read_neighbor_graph_pickle(bed.gene_name,
                                                       graph_dir)
        if gene_graph is None:
            logger.warning('Could not find neighbor graph for {0}, skipping . . .'.format(bed.gene_name))
    else:
        gene_graph = None

    # get recurrent info for actual mutations
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(),
                                     gs)
    codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    num_recurrent, _, _, pos_ct = cutils.calc_pos_info(codon_pos,
                                                       ref_aa,
                                                       somatic_aa,
                                                       min_frac=min_fraction,
                                                       min_recur=min_recurrent)

    try:
        # get graph score for actual mutations
        graph_score, _ = scores.compute_ng_stat(gene_graph, pos_ct)

        # perform simulations to get p-value
        protein_p_value, norm_graph_score = pm.protein_permutation(
            graph_score, len(pos_ct), context_cts,
            context_to_mutations,
            sc,  # sequence context obj
            gs,  # gene sequence obj
            gene_graph, num_permutations, stop_thresh
        )
    except Exception:
        # deliberate best-effort: fall back to a non-significant result,
        # but keep the traceback available for debugging
        norm_graph_score = 0.0
        protein_p_value = 1.0
        logger.warning('Codon numbering problem with '+bed.gene_name)
        logger.debug('protein permutation failed', exc_info=True)

    result = [bed.gene_name, num_recurrent, norm_graph_score, protein_p_value]
    return result
def maf_permutation(context_counts,
                    context_to_mut,
                    seq_context,
                    gene_seq,
                    num_permutations=10000,
                    drop_silent=False):
    """Performs null-permutations across all genes and records the results in
    a format like a MAF file. This could be useful for examining the null
    permutations because the alternative approaches always summarize the
    results. With the simulated null-permutations, novel metrics can be
    applied to create an empirical null-distribution.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match
        this.

    Returns
    -------
    maf_list : list of lists
        list of null mutations with mutation info in a MAF like format:
        gene, strand, chrom, start, end, reference base, somatic base,
        context, DNA change, protein change, variant classification.
        Empty when there are no mutations to permute.
    """
    mycontexts = context_counts.index.tolist()
    base_and_context = [(base, one_context)
                        for one_context in mycontexts
                        for base in context_to_mut[one_context]]

    # nothing to permute; zip(*[]) could not be unpacked below
    if not base_and_context:
        return []
    somatic_base, base_context = zip(*base_and_context)

    # get random positions determined by sequence context
    # (Series.items() replaces iteritems(), which pandas 2.0 removed)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # info about gene
    gene_name = gene_seq.bed.gene_name
    strand = gene_seq.bed.strand
    chrom = gene_seq.bed.chrom
    gene_seq.bed.init_genome_coordinates()  # map seq pos to genome

    # sequence position -> genome coordinate (+1); built once because it
    # does not depend on the simulated row
    pos2genome = np.vectorize(lambda x: gene_seq.bed.seqpos2genome[x]+1)

    # determine result of random positions
    maf_list = []
    for row in tmp_mut_pos:
        # get genome coordinate
        genome_coord = pos2genome(row)

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # get string describing variant
        var_class = cutils.get_variant_classification(
            tmp_mut_info['Reference AA'],
            tmp_mut_info['Somatic AA'],
            tmp_mut_info['Codon Pos'])

        # prepare output
        for k, mysomatic_base in enumerate(somatic_base):
            # format DNA change
            # NOTE(review): nuc_pos and codon_pos are written as-is; if
            # they are 0-based the c./p. strings are off by one relative
            # to 1-based reporting -- confirm before changing the output
            ref_nuc = tmp_mut_info['Reference Nuc'][k]
            nuc_pos = row[k]
            dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, nuc_pos, mysomatic_base)

            # format protein change
            ref_aa = tmp_mut_info['Reference AA'][k]
            somatic_aa = tmp_mut_info['Somatic AA'][k]
            codon_pos = tmp_mut_info['Codon Pos'][k]
            protein_change = 'p.{0}{1}{2}'.format(ref_aa, codon_pos, somatic_aa)

            # reverse complement if on negative strand
            if strand == '-':
                ref_nuc = utils.rev_comp(ref_nuc)
                mysomatic_base = utils.rev_comp(mysomatic_base)

            # append results
            variant_class = var_class[k].decode()
            if drop_silent and variant_class == 'Silent':
                continue
            maf_line = [gene_name, strand, chrom,
                        genome_coord[k], genome_coord[k],
                        ref_nuc, mysomatic_base, base_context[k],
                        dna_change, protein_change, variant_class]
            maf_list.append(maf_line)

    return maf_list
def position_permutation(obs_stat,
                         context_counts,
                         context_to_mut,
                         seq_context,
                         gene_seq,
                         gene_vest=None,
                         num_permutations=10000,
                         stop_criteria=100,
                         pseudo_count=0,
                         max_batch=25000):
    """Performs null-permutations for position-based mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_stat : tuple, (recur ct, entropy, delta entropy, mean vest)
        tuple containing the observed statistics; only the entropy and
        the mean vest score are compared against the null
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_vest : optional, default: None
        scores handed to `scores.compute_vest_stat`; when falsy the
        simulated vest statistic is 0.0
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic (for both entropy and vest).
    pseudo_count : int, default: 0
        Pseudo-count for number of recurrent missense mutations for each
        permutation for the null distribution. Increasing pseudo_count
        makes the statistical test more stringent.
    max_batch : int, default: 25000
        maximum number of simulations requested from `seq_context` at once

    Returns
    -------
    ent_pval : float
        fraction of simulations with entropy <= observed entropy
        (within `utils.epsilon`)
    vest_pval : float
        fraction of simulations with vest statistic >= observed
        (within `utils.epsilon`)

    Raises
    ------
    ZeroDivisionError
        if no simulation is performed (e.g. `num_permutations` is 0 or
        `stop_criteria` <= 0)
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # only entropy and vest are tested; the 4-way unpack still checks shape
    _, obs_ent, _, obs_vest = obs_stat
    num_sim = 0  # number of simulations
    null_entropy_ct, null_vest_ct = 0, 0
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (Series.items() replaces iteritems(), which pandas 2.0 removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack needs a real sequence; newer NumPy rejects a generator
        tmp_mut_pos = np.hstack([pos_array
                                 for base, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random
        # positions; count rows explicitly rather than reading the loop
        # variable after the loop (unbound/stale for an empty batch)
        sims_in_batch = 0
        for sims_in_batch, row in enumerate(tmp_mut_pos, 1):
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calculate position info
            _, tmp_entropy, _, _ = cutils.calc_pos_info(
                tmp_mut_info['Codon Pos'],
                tmp_mut_info['Reference AA'],
                tmp_mut_info['Somatic AA'],
                pseudo_count=pseudo_count,
                is_obs=0)

            # get vest scores
            if gene_vest:
                tmp_vest = scores.compute_vest_stat(gene_vest,
                                                    tmp_mut_info['Reference AA'],
                                                    tmp_mut_info['Somatic AA'],
                                                    tmp_mut_info['Codon Pos'])
            else:
                tmp_vest = 0.0

            # update empirical null distribution counts
            if tmp_entropy-utils.epsilon <= obs_ent:
                null_entropy_ct += 1
            if tmp_vest+utils.epsilon >= obs_vest:
                null_vest_ct += 1

            # stop iterations if reached sufficient precision
            if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
                break

        # update the number of simulations
        num_sim += sims_in_batch

    # calculate p-value from empirical null-distribution
    ent_pval = float(null_entropy_ct) / num_sim
    vest_pval = float(null_vest_ct) / num_sim

    return ent_pval, vest_pval
def position_permutation(obs_stat,
                         context_counts,
                         context_to_mut,
                         seq_context,
                         gene_seq,
                         gene_vest=None,
                         num_permutations=10000,
                         stop_criteria=100,
                         pseudo_count=0,
                         max_batch=25000):
    """Performs null-permutations for position-based mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_stat : tuple, (recur ct, entropy, delta entropy, mean vest)
        tuple containing the observed statistics; only the entropy and
        the mean vest score are compared against the null
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_vest : optional, default: None
        scores handed to `scores.compute_vest_stat`; when falsy the
        simulated vest statistic is 0.0
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic (for both entropy and vest).
    pseudo_count : int, default: 0
        Pseudo-count for number of recurrent missense mutations for each
        permutation for the null distribution. Increasing pseudo_count
        makes the statistical test more stringent.
    max_batch : int, default: 25000
        maximum number of simulations requested from `seq_context` at once

    Returns
    -------
    ent_pval : float
        fraction of simulations with entropy <= observed entropy
        (within `utils.epsilon`)
    vest_pval : float
        fraction of simulations with vest statistic >= observed
        (within `utils.epsilon`)

    Raises
    ------
    ZeroDivisionError
        if no simulation is performed (e.g. `num_permutations` is 0 or
        `stop_criteria` <= 0)
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # only entropy and vest are tested; the 4-way unpack still checks shape
    _, obs_ent, _, obs_vest = obs_stat
    num_sim = 0  # number of simulations
    null_entropy_ct, null_vest_ct = 0, 0
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (Series.items() replaces iteritems(), which pandas 2.0 removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        tmp_mut_pos = np.hstack([pos_array
                                 for base, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random
        # positions; count rows explicitly rather than reading the loop
        # variable after the loop (unbound/stale for an empty batch)
        sims_in_batch = 0
        for sims_in_batch, row in enumerate(tmp_mut_pos, 1):
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calculate position info
            _, tmp_entropy, _, _ = cutils.calc_pos_info(
                tmp_mut_info['Codon Pos'],
                tmp_mut_info['Reference AA'],
                tmp_mut_info['Somatic AA'],
                pseudo_count=pseudo_count,
                is_obs=0)

            # get vest scores
            if gene_vest:
                tmp_vest = scores.compute_vest_stat(gene_vest,
                                                    tmp_mut_info['Reference AA'],
                                                    tmp_mut_info['Somatic AA'],
                                                    tmp_mut_info['Codon Pos'])
            else:
                tmp_vest = 0.0

            # update empirical null distribution counts
            if tmp_entropy-utils.epsilon <= obs_ent:
                null_entropy_ct += 1
            if tmp_vest+utils.epsilon >= obs_vest:
                null_vest_ct += 1

            # stop iterations if reached sufficient precision
            if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
                break

        # update the number of simulations
        num_sim += sims_in_batch

    # calculate p-value from empirical null-distribution
    ent_pval = float(null_entropy_ct) / num_sim
    vest_pval = float(null_vest_ct) / num_sim

    return ent_pval, vest_pval
def hotmaps_permutation(obs_stat,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        window,
                        num_permutations=10000,
                        stop_criteria=100,
                        max_batch=25000):
    """Performs null-permutations for position-based mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_stat : dict
        dictionary mapping codons to the sum of mutations in a window
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    window : int
        Number of codons to the left/right of a mutate position to consider
        in the window
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic.
    max_batch : int
        maximum number of whole gene simulations to do at once.
        For large number of simulations holding a matrix of M x N,
        where M is the number of mutations and N is the number of simulations,
        can get quite large.

    Returns
    -------
    pvals : dict
        Maps mutated codon position to the calculated p-value. The
        denominator is the total number of simulated windowed sums
        (`len(tmp_sim)` accumulated over simulations), not the number of
        simulations.

    Raises
    ------
    ValueError
        if `obs_stat` is empty
    ZeroDivisionError
        if no simulated windowed sum is produced (e.g. `num_permutations`
        is 0 or `stop_criteria` <= 0)
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # figure out which position has highest value; its null count grows
    # slowest (val >= obs_stat[key] is hardest to satisfy), so it decides
    # when every position has reached sufficient precision
    max_key = max(obs_stat, key=(lambda key: obs_stat[key]))

    # setup null dist counts
    null_cts = {k: 0 for k in obs_stat}

    num_sim = 0  # number of simulated windowed sums
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_cts[max_key] >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (Series.items() replaces iteritems(), which pandas 2.0 removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack needs a real sequence; newer NumPy rejects a generator
        tmp_mut_pos = np.hstack([pos_array
                                 for base, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        for row in tmp_mut_pos:
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calculate position info
            _, tmp_sim = utils.calc_windowed_sum(tmp_mut_info['Codon Pos'],
                                                 tmp_mut_info['Reference AA'],
                                                 tmp_mut_info['Somatic AA'],
                                                 window)

            # update the counts when the empirical null passes the observed
            for tmp_key in tmp_sim:
                # get mutation count for simulation
                val = tmp_sim[tmp_key]

                for key in null_cts:
                    if val >= obs_stat[key]:
                        null_cts[key] += 1

            # update the number of simulations
            num_sim += len(tmp_sim)

            # stop iterations if reached sufficient precision
            if null_cts[max_key] >= stop_criteria:
                break

    # calculate p-value from empirical null-distribution
    pvals = {k: float(null_cts[k]) / num_sim for k in obs_stat}

    return pvals