def test_no_context_constructor():
    """Check SequenceContext built from a gene with nuc_context=0.

    With no nucleotide context every position is expected under the single
    'None' key.
    """
    gene_seq = GeneSequence(gene_fa, nuc_context=0)
    gene_seq.set_gene(bed)
    seq_context = SequenceContext(gene_seq)

    # expected values for the 21 positions of the test gene
    expected_counts = {'None': 21}
    expected_positions = {'None': range(21)}

    _check_true_counts(seq_context, expected_counts)
    _check_true_context_pos(seq_context, expected_positions)
def test_single_context_constructor():
    """Check SequenceContext built from a gene with nuc_context=1.

    With a single-nucleotide context the expected counts and positions are
    keyed by the base letter.
    """
    gene_seq = GeneSequence(gene_fa, nuc_context=1)
    gene_seq.set_gene(bed)
    seq_context = SequenceContext(gene_seq)

    expected_counts = {'A': 10, 'T': 4, 'G': 4, 'C': 3}
    expected_positions = {
        'A': [0, 2, 5, 6, 9, 11, 13, 18, 19, 20],
        'C': [1, 15, 16],
        'G': [4, 8, 12, 17],
        'T': [3, 7, 10, 14],
    }

    _check_true_counts(seq_context, expected_counts)
    _check_true_context_pos(seq_context, expected_positions)
def test_chasm_context_constructor():
    """Check SequenceContext built from a gene with nuc_context=1.5.

    The 1.5 setting is the "chasm" context, a mixture of single-nucleotide
    and dinucleotide contexts, so the expected keys contain both kinds.
    """
    gene_seq = GeneSequence(gene_fa, nuc_context=1.5)
    gene_seq.set_gene(bed)
    seq_context = SequenceContext(gene_seq)

    expected_counts = {
        'A': 10,
        'C': 1,
        'C*pG': 1,
        'CpG*': 1,
        'G*pA': 3,
        'T': 4,
        'TpC*': 1,
    }
    # the element order inside each list is kept exactly as specified
    expected_positions = {
        'A': [2, 5, 6, 9, 11, 13, 18, 19, 0, 20],
        'C': [1],
        'C*pG': [16],
        'CpG*': [17],
        'G*pA': [4, 8, 12],
        'T': [3, 7, 10, 14],
        'TpC*': [15],
    }

    _check_true_counts(seq_context, expected_counts)
    _check_true_context_pos(seq_context, expected_positions)
def singleprocess_permutation(info):
    """Run the configured permutation test on every mutated gene of a chunk.

    Parameters
    ----------
    info : sequence
        Five items, unpacked as ``(bed_list, mut_df, opts, fs_cts_df,
        p_inactivating)``:

        * ``bed_list`` -- gene annotation objects; each one is used through
          its ``chrom``, ``gene_name`` and ``strand`` attributes and its
          ``query_position`` method. The chromosome reported in the log
          messages is taken from the first element.
        * ``mut_df`` -- mutation DataFrame with a ``Gene`` column plus
          ``Chromosome``, ``Start_Position``, ``Reference_Allele``,
          ``Tumor_Allele`` and ``Variant_Classification``;
          ``Protein_Change`` is carried along when present.
        * ``opts`` -- option mapping. ``'input'`` (FASTA path),
          ``'context'``, ``'seed'`` and ``'kind'`` are always read; the
          remaining keys depend on ``opts['kind']``.
        * ``fs_cts_df``, ``p_inactivating`` -- accepted for interface
          compatibility but not used by this function.

    Returns
    -------
    list
        Permutation results for the genes that have at least one mutation.
        For ``'hotmaps1d'`` the items returned by
        ``calc_hotmaps_p_value`` are added individually; for every other
        kind one entry per gene is appended, made of the p-value result
        followed by two mutation counts (the second is always the number
        of unmapped mutations). An empty ``bed_list`` yields ``[]``.
    """
    # initialize input
    bed_list, mut_df, opts, fs_cts_df, p_inactivating = info
    if not bed_list:
        # an empty chunk has no chromosome to report and no gene to test
        return []

    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))

    # list of columns that are needed
    cols = [
        'Chromosome',
        'Start_Position',
        'Reference_Allele',
        'Tumor_Allele',
        'Variant_Classification',
    ]
    # conditionally add protein_change column if exists
    if 'Protein_Change' in mut_df.columns:
        cols += ['Protein_Change']

    # figure out which genes actually have a mutation
    genes_with_mut = set(mut_df['Gene'].unique())

    result = []
    gene_fa = pysam.Fastafile(opts['input'])
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # iterate through each gene
        for bed in bed_list:
            if bed.gene_name not in genes_with_mut:
                # skip genes with no mutations
                continue

            # prepare info for running permutation test; take an explicit
            # copy because the per-gene frame is modified below and must
            # not be treated as a slice of mut_df
            mut_info = mut_df.loc[mut_df['Gene'] == bed.gene_name, cols].copy()
            gs.set_gene(bed)
            sc = SequenceContext(gs, seed=opts['seed'])

            # count total mutations in gene
            total_mut = len(mut_info)

            # fix nucleotide letter if gene is on - strand
            if bed.strand == '-':
                rc = mut_info['Tumor_Allele'].map(utils.rev_comp)
                mut_info.loc[:, 'Tumor_Allele'] = rc

            # get coding positions, mutations unmapped to the reference tx
            # will have NA for a coding position
            pos_list = [
                bed.query_position(bed.strand,
                                   row['Chromosome'],
                                   row['Start_Position'])
                for _, row in mut_info.iterrows()
            ]
            mut_info.loc[:, 'Coding Position'] = pos_list

            # recover mutations that could not be mapped to the reference
            # transcript for a gene before being dropped (next step)
            unmapped_mut_info = mc.recover_unmapped_mut_info(
                mut_info, bed, sc, opts)

            # drop mutations which do not map to reference tx
            mut_info = mut_info.dropna(subset=['Coding Position'])
            mut_info['Coding Position'] = \
                mut_info['Coding Position'].astype(int)
            num_mapped_muts = len(mut_info)
            unmapped_muts = total_mut - num_mapped_muts

            # NOTE: germline variants are intentionally not added to the
            # gene sequence here.

            # calculate results of permutation test
            kind = opts['kind']
            if kind == 'oncogene':
                # calculate position based permutation results
                tmp_result = mypval.calc_position_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['score_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            elif kind == 'tsg':
                # calculate results for deleterious mutation permutation
                # test; frameshift counts (fs_cts_df) are deliberately not
                # used in the simulation
                tmp_result = mypval.calc_deleterious_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['deleterious'],
                    0,  # no deleterious mutation pseudo count
                    opts['seed'])
                result.append(tmp_result + [num_mapped_muts, unmapped_muts])
            elif kind == 'hotmaps1d':
                # save null distribution if user option specified
                null_dir = opts['null_distr_dir']
                if null_dir:
                    # several workers may try to create the directory at
                    # once, so tolerate it already existing
                    try:
                        os.mkdir(null_dir)
                    except OSError:
                        if not os.path.isdir(null_dir):
                            raise
                    # '{0}' is left in the name as a placeholder for the
                    # callee to fill in
                    save_path = os.path.join(null_dir,
                                             bed.gene_name + '.{0}.txt')
                else:
                    save_path = None
                # calculate position based permutation results
                mywindow = list(map(int, opts['window'].split(',')))
                tmp_result = mypval.calc_hotmaps_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    mywindow,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['report_index'],
                    null_save_path=save_path)
                result.extend(tmp_result)
            elif kind == 'protein':
                tmp_result = mypval.calc_protein_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['neighbor_graph_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            else:
                # calc results for entropy-on-effect permutation test
                tmp_result = mypval.calc_effect_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['num_iterations'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
    finally:
        # release the FASTA handle even if a gene fails part-way through
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result
def test_dinuc_context_constructor(): # dinucleotide context gs = GeneSequence(gene_fa, nuc_context=2) gs.set_gene(bed) sc = SequenceContext(gs)