def test_ctnnb1_get_aa_mut_info():
    """Check the codon info reported for a mutation at coding position 0 of CTNNB1.

    Loads the CTNNB1 FASTA and BED fixtures from ``file_dir``, substitutes a
    'C' at coding position 0, and asserts that ``mc.get_aa_mut_info`` reports
    reference codon 'ATG', somatic codon 'CTG' and codon position 0.
    """
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    # read fasta
    ctnnb1_fasta = os.path.join(file_dir, 'data/CTNNB1.fa')
    gene_fa = pysam.Fastafile(ctnnb1_fasta)
    try:
        gs = GeneSequence(gene_fa, nuc_context=1)

        # read CTNNB1 bed file; only the first record is used
        ctnnb1_bed = os.path.join(file_dir, 'data/CTNNB1.bed')
        bed_list = list(utils.bed_generator(ctnnb1_bed))
        gs.set_gene(bed_list[0])

        # specify mutation
        coding_pos = [0]
        somatic_base = ['C']

        # check mutation info
        aa_info = mc.get_aa_mut_info(coding_pos, somatic_base, gs)
        ref_codon_msg = 'First codon should be start codon ({0})'.format(
            aa_info['Reference Codon'][0])
        assert aa_info['Reference Codon'][0] == 'ATG', ref_codon_msg
        assert aa_info['Somatic Codon'][0] == 'CTG', \
            'First "A" should be replaced with a "C"'
        assert aa_info['Codon Pos'][0] == 0, 'Start codon should be position 0'
    finally:
        # release the FASTA handle even when an assertion fails
        gene_fa.close()
def test_no_context_constructor():
    """Build a SequenceContext with nuc_context=0 and check its single 'None' bucket.

    The expectation is 21 positions, all stored under the 'None' key.
    """
    # no context
    gene_seq = GeneSequence(gene_fa, nuc_context=0)
    gene_seq.set_gene(bed)
    seq_context = SequenceContext(gene_seq)

    expected_counts = {'None': 21}
    expected_positions = {'None': range(21)}

    _check_true_counts(seq_context, expected_counts)
    _check_true_context_pos(seq_context, expected_positions)
def test_simple_constructor():
    """Check the exon sequence before and after adding germline variants."""
    gene_seq = GeneSequence(gene_fa, nuc_context=1)
    gene_seq.set_gene(bed)

    reference_seq = 'ACATGAATGATAGATCCGAAA'
    assert gene_seq.exon_seq == reference_seq, 'Sequence is not correct'

    # this should update the sequence correctly
    # each pair is (position, replacement base)
    fake_variants = [(1, 'A'), (0, 'C'), (20, 'N'), (7, 'G'), (15, 'T')]
    fake_germline = [base for _, base in fake_variants]
    fake_pos = [pos for pos, _ in fake_variants]
    gene_seq.add_germline_variants(fake_germline, fake_pos)

    patched_seq = 'CAATGAAGGATAGATTCGAAN'
    assert gene_seq.exon_seq == patched_seq
def test_pos_to_codon():
    """Check ``cutils.pos_to_codon`` output for three positions of the test gene."""
    gene_seq = GeneSequence(gene_fa, nuc_context=1)
    gene_seq.set_gene(bed)

    query_positions = [1, 12, 17]
    observed = [cutils.pos_to_codon(gene_seq, position)
                for position in query_positions]

    # one expected tuple per queried position, in the same order
    expected = [
        ('ACA', 0, 1, 'C'),
        ('GAT', 4, 0, 'G'),
        ('CCG', 5, 2, 'G'),
    ]
    assert observed == expected, 'Codon information is incorrect'
def is_nonsilent(mut_df, bed_dict, opts):
    """Flag every mutation in ``mut_df`` as non-silent (1) or not (0).

    Rows identified as indels by ``indel.is_indel_annotation`` are flagged 1.
    For the remaining (SNV) rows, each gene in ``bed_dict`` is processed in
    turn and a row is flagged 1 when its class from
    ``cutils.get_variant_classification`` is one of the non-silent classes
    listed below.

    Parameters
    ----------
    mut_df : pandas.DataFrame
        Mutations; the 'Gene' column is read here. A temporary
        'is_nonsilent' column is added and removed again before returning.
    bed_dict : dict
        Maps each key to an iterable of bed objects (``gene_name`` is read
        from each one).
    opts : dict
        'input' (FASTA path) and 'context' are read here; the dict is also
        forwarded to ``compute_mutation_context``.

    Returns
    -------
    pandas.Series
        Copy of the temporary 'is_nonsilent' column.
    """
    # convert dictionary to list for bed objects
    gene_beds = [b for chrom in bed_dict for b in bed_dict[chrom]]

    # non-silent SNV classes
    non_silent_snv = [
        'Nonsense_Mutation', 'Nonstop_Mutation', 'Splice_Site',
        'Translation_Start_Site', 'Missense_Mutation'
    ]

    # record indels and get only snvs
    mut_df['is_nonsilent'] = 0
    indel_flag = indel.is_indel_annotation(mut_df)
    mut_df.loc[indel_flag, 'is_nonsilent'] = 1
    snv_df = mut_df[~indel_flag]

    # initiate gene sequences
    gene_fa = pysam.Fastafile(opts['input'])
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # iterate over each gene
        for bed in gene_beds:
            # initiate for this gene
            tmp_df = snv_df[snv_df['Gene'] == bed.gene_name]
            gs.set_gene(bed)

            # compute context counts and somatic bases for each context
            gene_tuple = compute_mutation_context(bed, gs, tmp_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            if len(mutations_df):
                # get snv information
                tmp_mut_info = get_aa_mut_info(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(),
                    gs)

                # get string describing variant
                var_class = cutils.get_variant_classification(
                    tmp_mut_info['Reference AA'],
                    tmp_mut_info['Somatic AA'],
                    tmp_mut_info['Codon Pos'])

                # detect if non-silent snv
                is_nonsilent_snv = [
                    1 if (x in non_silent_snv) else 0 for x in var_class
                ]
                # NOTE(review): assumes one classification per row of tmp_df
                # (i.e. compute_mutation_context dropped no rows) -- confirm
                mut_df.loc[tmp_df.index, 'is_nonsilent'] = is_nonsilent_snv
    finally:
        # the FASTA handle is only needed while genes are processed
        gene_fa.close()

    # return a pandas series indicating nonsilent status
    is_nonsilent_series = mut_df['is_nonsilent'].copy()
    del mut_df['is_nonsilent']
    return is_nonsilent_series
def test_single_context_constructor():
    """Build a SequenceContext with nuc_context=1 and check per-base counts and positions."""
    # single nuc context
    gene_seq = GeneSequence(gene_fa, nuc_context=1)
    gene_seq.set_gene(bed)
    seq_context = SequenceContext(gene_seq)

    # expected number of positions per nucleotide
    expected_counts = {'A': 10, 'T': 4, 'G': 4, 'C': 3}
    # expected positions per nucleotide
    expected_positions = {
        'A': [0, 2, 5, 6, 9, 11, 13, 18, 19, 20],
        'C': [1, 15, 16],
        'G': [4, 8, 12, 17],
        'T': [3, 7, 10, 14],
    }

    _check_true_counts(seq_context, expected_counts)
    _check_true_context_pos(seq_context, expected_positions)
def test_chasm_context_constructor():
    """Build a SequenceContext with nuc_context=1.5 and check counts and positions."""
    # chasm context, mixture between single and di context
    gene_seq = GeneSequence(gene_fa, nuc_context=1.5)
    gene_seq.set_gene(bed)
    seq_context = SequenceContext(gene_seq)

    # expected number of positions per context label
    expected_counts = {
        'A': 10,
        'C': 1,
        'C*pG': 1,
        'CpG*': 1,
        'G*pA': 3,
        'T': 4,
        'TpC*': 1,
    }
    # expected positions per context label
    expected_positions = {
        'A': [2, 5, 6, 9, 11, 13, 18, 19, 0, 20],
        'C': [1],
        'C*pG': [16],
        'CpG*': [17],
        'G*pA': [4, 8, 12],
        'T': [3, 7, 10, 14],
        'TpC*': [15],
    }

    _check_true_counts(seq_context, expected_counts)
    _check_true_context_pos(seq_context, expected_positions)
def test_ctnnb1_get_aa_mut_info():
    """Verify ``mc.get_aa_mut_info`` for an A>C change at CTNNB1 coding position 0.

    Uses the CTNNB1 FASTA/BED fixtures under ``file_dir`` and asserts the
    reported reference codon ('ATG'), somatic codon ('CTG') and codon
    position (0).
    """
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    # read fasta
    ctnnb1_fasta = os.path.join(file_dir, 'data/CTNNB1.fa')
    gene_fa = pysam.Fastafile(ctnnb1_fasta)
    try:
        gs = GeneSequence(gene_fa, nuc_context=1)

        # read CTNNB1 bed file; the first record is the one under test
        ctnnb1_bed = os.path.join(file_dir, 'data/CTNNB1.bed')
        bed_list = list(utils.bed_generator(ctnnb1_bed))
        gs.set_gene(bed_list[0])

        # specify mutation
        coding_pos = [0]
        somatic_base = ['C']

        # check mutation info
        aa_info = mc.get_aa_mut_info(coding_pos, somatic_base, gs)
        ref_codon_msg = 'First codon should be start codon ({0})'.format(
            aa_info['Reference Codon'][0])
        assert aa_info['Reference Codon'][0] == 'ATG', ref_codon_msg
        assert aa_info['Somatic Codon'][0] == 'CTG', \
            'First "A" should be replaced with a "C"'
        assert aa_info['Codon Pos'][0] == 0, 'Start codon should be position 0'
    finally:
        # close the FASTA handle whether or not the assertions pass
        gene_fa.close()
def singleprocess_permutation(info):
    """Run the permutation test selected by ``opts['kind']`` for each mutated gene.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts, fs_cts_df, p_inactivating)``. The last two
        entries are unpacked but not used in this function.

        * ``bed_list`` -- bed objects; the chromosome of the first one is
          used for log messages.
        * ``mut_df`` -- mutation DataFrame with a 'Gene' column plus the
          columns listed in ``cols`` below.
        * ``opts`` -- options dict; keys read here: 'input', 'context',
          'seed', 'kind', 'score_dir', 'num_iterations', 'stop_criteria',
          'recurrent', 'fraction', 'deleterious', 'null_distr_dir',
          'window', 'report_index', 'neighbor_graph_dir'.

    Returns
    -------
    list
        Accumulated per-gene results. For every kind except 'hotmaps1d' one
        entry is appended per gene (the p-value result followed by two
        mutation counts); for 'hotmaps1d' the returned entries are extended
        onto the list.
    """
    # initialize input
    bed_list, mut_df, opts, fs_cts_df, p_inactivating = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))

    # list of columns that are needed
    cols = [
        'Chromosome',
        'Start_Position',
        'Reference_Allele',
        'Tumor_Allele',
        'Variant_Classification',
    ]
    # conditionally add protein_change column if exists
    if 'Protein_Change' in mut_df.columns:
        cols += ['Protein_Change']

    # figure out which genes actually have a mutation
    genes_with_mut = set(mut_df['Gene'].unique())

    gene_fa = pysam.Fastafile(opts['input'])
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # iterate through each gene
        result = []
        for bed in bed_list:
            if bed.gene_name not in genes_with_mut:
                # skip genes with no mutations
                continue

            # prepare info for running permutation test; take an explicit
            # copy because columns are assigned in place below
            mut_info = mut_df.loc[mut_df['Gene'] == bed.gene_name, cols].copy()
            gs.set_gene(bed)
            sc = SequenceContext(gs, seed=opts['seed'])

            # count total mutations in gene
            total_mut = len(mut_info)

            # fix nucleotide letter if gene is on - strand
            if bed.strand == '-':
                rc = mut_info['Tumor_Allele'].map(utils.rev_comp)
                mut_info.loc[:, 'Tumor_Allele'] = rc

            # get coding positions, mutations unmapped to the reference tx
            # will have NA for a coding position
            pos_list = [
                bed.query_position(bed.strand,
                                   row['Chromosome'],
                                   row['Start_Position'])
                for _, row in mut_info.iterrows()
            ]
            mut_info.loc[:, 'Coding Position'] = pos_list

            # recover mutations that could not be mapped to the reference
            # transcript for a gene before being dropped (next step)
            unmapped_mut_info = mc.recover_unmapped_mut_info(
                mut_info, bed, sc, opts)

            # drop mutations which do not map to reference tx
            mut_info = mut_info.dropna(subset=['Coding Position'])
            mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
            num_mapped_muts = len(mut_info)
            unmapped_muts = total_mut - num_mapped_muts

            # construct sequence context
            #gs.add_germline_variants(mut_info['Reference_Allele'].tolist(),
            #                         mut_info['Coding Position'].tolist())

            # calculate results of permutation test
            if opts['kind'] == 'oncogene':
                # calculate position based permutation results
                tmp_result = mypval.calc_position_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['score_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            elif opts['kind'] == 'tsg':
                # calculate results for deleterious mutation permutation test
                #fs_ct = fs_cts_df['total'][bed.gene_name]
                #fs_unmapped = fs_cts_df['unmapped'][bed.gene_name]
                # replaced fs_ct with zero to stop using the frameshifts in
                # simulation
                tmp_result = mypval.calc_deleterious_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['deleterious'],
                    0,  # no deleterious mutation pseudo count
                    opts['seed'])
                # this branch reports the mapped count, not the total count
                result.append(tmp_result + [num_mapped_muts, unmapped_muts])
            elif opts['kind'] == 'hotmaps1d':
                # save null distribution if user option specified
                if opts['null_distr_dir']:
                    if not os.path.exists(opts['null_distr_dir']):
                        os.mkdir(opts['null_distr_dir'])
                    # '{0}' is left in the path for the callee to fill in
                    save_path = os.path.join(opts['null_distr_dir'],
                                             bed.gene_name + '.{0}.txt')
                else:
                    save_path = None
                # calculate position based permutation results
                mywindow = list(map(int, opts['window'].split(',')))
                tmp_result = mypval.calc_hotmaps_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    mywindow,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['report_index'],
                    null_save_path=save_path)
                result.extend(tmp_result)
            elif opts['kind'] == 'protein':
                tmp_result = mypval.calc_protein_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['neighbor_graph_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            else:
                # calc results for entropy-on-effect permutation test
                tmp_result = mypval.calc_effect_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['num_iterations'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
    finally:
        # always release the FASTA handle, even if a gene fails
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result
def singleprocess_permutation(info):
    """Tally observed mutation-consequence counts and simulated counts per permutation.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts)``.

        * ``bed_list`` -- bed objects; the chromosome of the first one is
          used for log messages.
        * ``mut_df`` -- mutation DataFrame ('Tumor_Sample' is read here when
          ``opts['by_sample']`` is set).
        * ``opts`` -- options dict; keys read here: 'num_permutations',
          'input', 'context', 'by_sample', 'score_dir'.

    Returns
    -------
    tuple
        ``(result, obs_result)``. ``result`` holds one row per permutation
        (9 values when ``opts['score_dir']`` is truthy, otherwise 7) summed
        over all genes. ``obs_result`` is a list of observed counts, or the
        per-sample DataFrame when ``opts['by_sample']`` is set.
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_permutations = opts['num_permutations']

    # variables for recording the actual observed number of non-silent
    # vs. silent mutations
    if not opts['by_sample']:
        obs_silent = 0
        obs_non_silent = 0
        obs_nonsense = 0
        obs_loststop = 0
        obs_splice_site = 0
        obs_loststart = 0
        obs_missense = 0
        obs_vest = 0
        obs_mga_entropy = 0
    else:
        uniq_samp = mut_df['Tumor_Sample'].unique()
        # NOTE(review): `cols` is not defined inside this function;
        # presumably a module-level list of column names -- confirm
        obs_df = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                              index=uniq_samp, columns=cols)

    # one accumulator row per permutation
    num_result_cols = 9 if opts['score_dir'] else 7
    result = [[0] * num_result_cols for _ in range(num_permutations)]

    gene_fa = pysam.Fastafile(opts['input'])
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # go through each gene to perform simulation
        for bed in bed_list:
            # compute context counts and somatic bases for each context
            gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            if context_to_mutations:
                ## get information about observed non-silent counts
                # get info about mutations
                tmp_mut_info = mc.get_aa_mut_info(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(),
                    gs)

                # update the observed count
                if not opts['by_sample']:
                    # calc mutation info summarizing observed mutations;
                    # min_frac/min_recur are fixed here rather than taken
                    # from opts['fraction'] / opts['recurrent']
                    tmp_result = cutils.calc_summary_info(
                        tmp_mut_info['Reference AA'],
                        tmp_mut_info['Somatic AA'],
                        tmp_mut_info['Codon Pos'],
                        bed.gene_name,
                        opts['score_dir'],
                        min_frac=0.0,
                        min_recur=3)
                    obs_non_silent += tmp_result[0]
                    obs_silent += tmp_result[1]
                    obs_nonsense += tmp_result[2]
                    obs_loststop += tmp_result[3]
                    obs_splice_site += tmp_result[4]
                    obs_loststart += tmp_result[5]
                    obs_missense += tmp_result[6]
                    if opts['score_dir']:
                        obs_vest += tmp_result[-2]
                        obs_mga_entropy += tmp_result[-3]
                else:
                    for tsamp in mutations_df['Tumor_Sample'].unique():
                        # row offsets of this sample's mutations; a set keeps
                        # each membership test below O(1)
                        ixs = set(np.where(
                            mutations_df['Tumor_Sample'] == tsamp)[0].tolist())
                        ref_aa = [
                            r for i, r in enumerate(tmp_mut_info['Reference AA'])
                            if i in ixs
                        ]
                        somatic_aa = [
                            s for i, s in enumerate(tmp_mut_info['Somatic AA'])
                            if i in ixs
                        ]
                        codon_pos = [
                            c for i, c in enumerate(tmp_mut_info['Codon Pos'])
                            if i in ixs
                        ]

                        # get summary info
                        tmp_result = cutils.calc_summary_info(
                            ref_aa,
                            somatic_aa,
                            codon_pos,
                            bed.gene_name,
                            opts['score_dir'],
                            min_frac=0.0,
                            min_recur=3)
                        if opts['score_dir']:
                            # drop three entries so the row lines up with
                            # obs_df's columns; order of the pops matters
                            tmp_result.pop(-4)
                            tmp_result.pop(-4)
                            tmp_result.pop(-1)
                        # update df
                        obs_df.loc[tsamp, :] = (obs_df.loc[tsamp, :]
                                                + np.array(tmp_result))

                ## Do permutations
                tmp_result = pm.summary_permutation(
                    context_cts,
                    context_to_mutations,
                    sc,  # sequence context obj
                    gs,  # gene sequence obj
                    opts['score_dir'],
                    num_permutations)
            else:
                # no mutations for this gene: contribute all-zero rows
                num_zero_cols = 14 if opts['score_dir'] else 10
                tmp_result = [[0] * num_zero_cols
                              for _ in range(num_permutations)]

            # increment the non-silent/silent counts for each permutation;
            # the first `offset` entries of each simulated row are skipped
            offset = 3
            for j in range(num_permutations):
                for k in range(7):
                    result[j][k] += tmp_result[j][k + offset]
                if opts['score_dir']:
                    result[j][7] += tmp_result[j][9 + offset]
                    result[j][8] += tmp_result[j][10 + offset]
    finally:
        # always release the FASTA handle, even if a gene fails
        gene_fa.close()

    if not opts['by_sample']:
        obs_result = [
            obs_non_silent, obs_silent, obs_nonsense, obs_loststop,
            obs_splice_site, obs_loststart, obs_missense
        ]
        if opts['score_dir']:
            obs_result.extend([obs_mga_entropy, obs_vest])
    else:
        obs_result = obs_df

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result, obs_result
def singleprocess_permutation(info):
    """Summarize, annotate or simulate mutations for each gene in ``bed_list``.

    For every gene with a non-empty context-to-mutation mapping, exactly one
    of four modes runs, chosen from ``opts``:

    * 'summary' set and no iterations -- summarize the observed mutations;
    * 'maf' set and no iterations -- annotate the observed mutations and
      append tumor sample / tumor type to each line;
    * 'maf' set with iterations -- ``pm.maf_permutation``;
    * otherwise -- ``pm.summary_permutation``.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts)``. Keys of ``opts`` read here:
        'num_iterations', 'input', 'context', 'summary', 'maf', 'score_dir',
        'fraction', 'recurrent'.

    Returns
    -------
    list
        Concatenation of the per-gene results.
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_iterations = opts['num_iterations']

    gene_fa = pysam.Fastafile(opts['input'])
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # go through each gene to perform simulation
        result = []
        for bed in bed_list:
            # compute context counts and somatic bases for each context
            gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            if not context_to_mutations:
                # nothing to report for this gene
                continue

            ## get information about observed non-silent counts
            if opts['summary'] and not num_iterations:
                tmp_mut_info = mc.get_aa_mut_info(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(),
                    gs)
                # calc mutation info summarizing observed mutations
                tmp_result = cutils.calc_summary_info(
                    tmp_mut_info['Reference AA'],
                    tmp_mut_info['Somatic AA'],
                    tmp_mut_info['Codon Pos'],
                    bed.gene_name,
                    opts['score_dir'],
                    min_frac=opts['fraction'],
                    min_recur=opts['recurrent'])
                # single row: gene name, 'NA' placeholder, CDS length, summary
                tmp_result = [[bed.gene_name, 'NA', bed.cds_len] + tmp_result]
            ## Just record protein changes in MAF
            elif opts['maf'] and not num_iterations:
                # input code for just annotating genes mutations
                tmp_result = anot.annotate_maf(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(),
                    gs)
                # add tumor sample / tumor type info to output
                tmp_result = [
                    line + [
                        mutations_df['Tumor_Sample'].iloc[i],
                        mutations_df['Tumor_Type'].iloc[i]
                    ]
                    for i, line in enumerate(tmp_result)
                ]
            ## Do permutations
            elif opts['maf']:
                # if user specified MAF format then output all mutations in
                # MAF format
                tmp_result = pm.maf_permutation(context_cts,
                                                context_to_mutations,
                                                sc,
                                                gs,
                                                num_iterations)
            else:
                # Summarized results for feature for each simulation for each
                # gene
                tmp_result = pm.summary_permutation(
                    context_cts,
                    context_to_mutations,
                    sc,  # sequence context obj
                    gs,  # gene sequence obj
                    opts['score_dir'],
                    num_iterations,
                    min_frac=opts['fraction'],
                    min_recur=opts['recurrent'])
            result += tmp_result
    finally:
        # always release the FASTA handle, even if a gene fails
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result
def singleprocess_permutation(info):
    """Compute permutation-test results for every gene that has mutations.

    The test that runs is chosen by ``opts['kind']``: 'oncogene', 'tsg',
    'hotmaps1d', 'protein', or (any other value) the entropy-on-effect test.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts, fs_cts_df, p_inactivating)``; the last
        two entries are unpacked but unused here.

        * ``bed_list`` -- bed objects; the first one supplies the chromosome
          name used in log messages.
        * ``mut_df`` -- mutation DataFrame with a 'Gene' column and the
          columns named in ``cols`` below.
        * ``opts`` -- options dict; keys read here: 'input', 'context',
          'seed', 'kind', 'score_dir', 'num_iterations', 'stop_criteria',
          'recurrent', 'fraction', 'deleterious', 'null_distr_dir',
          'window', 'report_index', 'neighbor_graph_dir'.

    Returns
    -------
    list
        Per-gene results. 'hotmaps1d' extends the list with whatever the
        p-value routine returns; every other kind appends one entry per gene
        consisting of the routine's result plus two mutation counts.
    """
    # initialize input
    bed_list, mut_df, opts, fs_cts_df, p_inactivating = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))

    # list of columns that are needed
    cols = ['Chromosome', 'Start_Position', 'Reference_Allele',
            'Tumor_Allele', 'Variant_Classification']
    # conditionally add protein_change column if exists
    if 'Protein_Change' in mut_df.columns:
        cols += ['Protein_Change']

    # figure out which genes actually have a mutation
    genes_with_mut = set(mut_df['Gene'].unique())

    gene_fa = pysam.Fastafile(opts['input'])
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # iterate through each gene
        result = []
        for bed in bed_list:
            if bed.gene_name not in genes_with_mut:
                # skip genes with no mutations
                continue

            # prepare info for running permutation test; copied explicitly
            # because columns are written in place further down
            mut_info = mut_df.loc[mut_df['Gene'] == bed.gene_name, cols].copy()
            gs.set_gene(bed)
            sc = SequenceContext(gs, seed=opts['seed'])

            # count total mutations in gene
            total_mut = len(mut_info)

            # fix nucleotide letter if gene is on - strand
            if bed.strand == '-':
                rc = mut_info['Tumor_Allele'].map(utils.rev_comp)
                mut_info.loc[:, 'Tumor_Allele'] = rc

            # get coding positions, mutations unmapped to the reference tx
            # will have NA for a coding position
            pos_list = [
                bed.query_position(bed.strand,
                                   row['Chromosome'],
                                   row['Start_Position'])
                for _, row in mut_info.iterrows()
            ]
            mut_info.loc[:, 'Coding Position'] = pos_list

            # recover mutations that could not be mapped to the reference
            # transcript for a gene before being dropped (next step)
            unmapped_mut_info = mc.recover_unmapped_mut_info(mut_info, bed,
                                                             sc, opts)

            # drop mutations which do not map to reference tx
            mut_info = mut_info.dropna(subset=['Coding Position'])
            mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
            num_mapped_muts = len(mut_info)
            unmapped_muts = total_mut - num_mapped_muts

            # construct sequence context
            #gs.add_germline_variants(mut_info['Reference_Allele'].tolist(),
            #                         mut_info['Coding Position'].tolist())

            # calculate results of permutation test
            kind = opts['kind']
            if kind == 'oncogene':
                # calculate position based permutation results
                tmp_result = mypval.calc_position_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['score_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            elif kind == 'tsg':
                # calculate results for deleterious mutation permutation test
                #fs_ct = fs_cts_df['total'][bed.gene_name]
                #fs_unmapped = fs_cts_df['unmapped'][bed.gene_name]
                # replaced fs_ct with zero to stop using the frameshifts in
                # simulation
                tmp_result = mypval.calc_deleterious_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['deleterious'],
                    0,  # no deleterious mutation pseudo count
                    opts['seed'])
                # unlike the other branches, the mapped count is reported
                result.append(tmp_result + [num_mapped_muts, unmapped_muts])
            elif kind == 'hotmaps1d':
                # save null distribution if user option specified
                if opts['null_distr_dir']:
                    if not os.path.exists(opts['null_distr_dir']):
                        os.mkdir(opts['null_distr_dir'])
                    # the '{0}' placeholder is passed through unformatted
                    save_path = os.path.join(opts['null_distr_dir'],
                                             bed.gene_name + '.{0}.txt')
                else:
                    save_path = None
                # calculate position based permutation results
                mywindow = list(map(int, opts['window'].split(',')))
                tmp_result = mypval.calc_hotmaps_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    mywindow,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['report_index'],
                    null_save_path=save_path)
                result.extend(tmp_result)
            elif kind == 'protein':
                tmp_result = mypval.calc_protein_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['neighbor_graph_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            else:
                # calc results for entropy-on-effect permutation test
                tmp_result = mypval.calc_effect_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['num_iterations'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
    finally:
        # the FASTA handle is closed even when a gene raises
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result
def test_dinuc_context_constructor(): # dinucleotide context gs = GeneSequence(gene_fa, nuc_context=2) gs.set_gene(bed) sc = SequenceContext(gs)