Esempi in Python per GeneSequence, esempi in Python per prob2020.python.gene_sequence.GeneSequence

Esempio n. 1

0

Mostra file

File: test_position_permutation_test.py Progetto: yumyai/probabilistic2020

def test_ctnnb1_get_aa_mut_info():
    """Check the codon info reported for a change at coding position 0.

    Uses the CTNNB1 FASTA/BED fixtures under ``file_dir`` and substitutes a
    'C' at the first coding base.
    """
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    # gene sequence backed by the CTNNB1 FASTA fixture
    fasta_path = os.path.join(file_dir, 'data/CTNNB1.fa')
    fasta_handle = pysam.Fastafile(fasta_path)
    gs = GeneSequence(fasta_handle, nuc_context=1)

    # the first record of the CTNNB1 BED fixture defines the gene
    bed_path = os.path.join(file_dir, 'data/CTNNB1.bed')
    bed_records = list(utils.bed_generator(bed_path))
    gs.set_gene(bed_records[0])

    # one mutation: coding position 0 becomes 'C'
    aa_info = mc.get_aa_mut_info([0], ['C'], gs)

    # verify reference codon, mutated codon and codon index
    first_ref_codon = aa_info['Reference Codon'][0]
    ref_codon_msg = 'First codon should be start codon ({0})'.format(
        first_ref_codon)
    assert first_ref_codon == 'ATG', ref_codon_msg
    first_somatic_codon = aa_info['Somatic Codon'][0]
    assert first_somatic_codon == 'CTG', 'First "A" should be replaced with a "C"'
    first_codon_pos = aa_info['Codon Pos'][0]
    assert first_codon_pos == 0, 'Start codon should be position 0'

Esempio n. 2

0

Mostra file

def test_no_context_constructor():
    """With nuc_context=0 every position should fall under the 'None' context."""
    gene_seq = GeneSequence(gene_fa, nuc_context=0)
    gene_seq.set_gene(bed)
    seq_context = SequenceContext(gene_seq)

    # expected: a single 'None' context holding positions 0..20
    expected_counts = {'None': 21}
    expected_positions = {'None': range(21)}

    _check_true_counts(seq_context, expected_counts)
    _check_true_context_pos(seq_context, expected_positions)

Esempio n. 3

0

Mostra file

File: test_gene_sequence.py Progetto: KarchinLab/probabilistic2020

def test_simple_constructor():
    """Check the exon sequence before and after adding germline variants."""
    gene_seq = GeneSequence(gene_fa, nuc_context=1)
    gene_seq.set_gene(bed)
    assert gene_seq.exon_seq == 'ACATGAATGATAGATCCGAAA', 'Sequence is not correct'

    # germline bases and the positions they are applied at (paired by index);
    # exon_seq is expected to reflect the substitutions afterwards
    germline_bases = ['A', 'C', 'N', 'G', 'T']
    germline_positions = [1, 0, 20, 7, 15]
    gene_seq.add_germline_variants(germline_bases, germline_positions)
    assert gene_seq.exon_seq == 'CAATGAAGGATAGATTCGAAN'

Esempio n. 4

0

Mostra file

File: test_gene_sequence.py Progetto: KarchinLab/probabilistic2020

def test_pos_to_codon():
    """Check the tuples returned by cutils.pos_to_codon for three positions."""
    gene_seq = GeneSequence(gene_fa, nuc_context=1)
    gene_seq.set_gene(bed)

    # query positions, in the same order as the expected tuples below
    observed = [cutils.pos_to_codon(gene_seq, p) for p in (1, 12, 17)]
    expected = [
        ('ACA', 0, 1, 'C'),
        ('GAT', 4, 0, 'G'),
        ('CCG', 5, 2, 'G'),
    ]
    assert observed == expected, 'Codon information is incorrect'

Esempio n. 5

0

Mostra file

def test_pos_to_codon():
    """pos_to_codon should return the expected tuple for each queried position."""
    sequence = GeneSequence(gene_fa, nuc_context=1)
    sequence.set_gene(bed)

    # position -> tuple expected from cutils.pos_to_codon, kept side by side
    cases = [
        (1, ('ACA', 0, 1, 'C')),
        (12, ('GAT', 4, 0, 'G')),
        (17, ('CCG', 5, 2, 'G')),
    ]
    results = [cutils.pos_to_codon(sequence, position) for position, _ in cases]
    true_results = [expected for _, expected in cases]
    assert results == true_results, 'Codon information is incorrect'

Esempio n. 6

0

Mostra file

def is_nonsilent(mut_df, bed_dict, opts):
    """Flag every mutation in ``mut_df`` as non-silent (1) or not (0).

    Rows identified by ``indel.is_indel_annotation`` are always flagged 1.
    The remaining rows are processed gene by gene: their variant
    classification is derived from the amino-acid change and compared against
    a fixed set of non-silent classes.

    Parameters
    ----------
    mut_df : pandas.DataFrame
        Mutation table; the 'Gene' column is read here. A temporary
        'is_nonsilent' column is added and removed again before returning.
    bed_dict : dict
        Maps each key to an iterable of bed objects exposing ``gene_name``.
    opts : dict
        ``opts['input']`` is the FASTA path handed to ``pysam.Fastafile`` and
        ``opts['context']`` the nucleotide context; the whole dict is also
        forwarded to ``compute_mutation_context``.

    Returns
    -------
    pandas.Series
        Copy of the 0/1 'is_nonsilent' column, indexed like ``mut_df``.
    """
    # convert dictionary to list for bed objects
    gene_beds = [b for chrom in bed_dict for b in bed_dict[chrom]]

    # non-silent SNV classes
    non_silent_snv = (
        'Nonsense_Mutation', 'Nonstop_Mutation', 'Splice_Site',
        'Translation_Start_Site', 'Missense_Mutation',
    )

    # initiate gene sequences; the handle is closed in the ``finally`` below
    # so it is released even when processing a gene raises
    gene_fa = pysam.Fastafile(opts['input'])
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # record indels and get only snvs
        mut_df['is_nonsilent'] = 0
        indel_flag = indel.is_indel_annotation(mut_df)
        mut_df.loc[indel_flag, 'is_nonsilent'] = 1
        snv_df = mut_df[~indel_flag]

        # iterate over each gene
        for bed in gene_beds:
            # initiate for this gene
            tmp_df = snv_df[snv_df['Gene'] == bed.gene_name]
            gs.set_gene(bed)

            # compute context counts and somatic bases for each context
            gene_tuple = compute_mutation_context(bed, gs, tmp_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            if not len(mutations_df):
                continue

            # get snv information
            tmp_mut_info = get_aa_mut_info(
                mutations_df['Coding Position'],
                mutations_df['Tumor_Allele'].tolist(), gs)

            # get string describing variant
            var_class = cutils.get_variant_classification(
                tmp_mut_info['Reference AA'], tmp_mut_info['Somatic AA'],
                tmp_mut_info['Codon Pos'])

            # detect if non-silent snv
            is_nonsilent_snv = [
                1 if (x in non_silent_snv) else 0 for x in var_class
            ]
            # NOTE(review): the flags are derived from mutations_df but written
            # at tmp_df.index; this assumes both hold the same rows in the
            # same order - confirm against compute_mutation_context.
            mut_df.loc[tmp_df.index, 'is_nonsilent'] = is_nonsilent_snv
    finally:
        gene_fa.close()

    # return a pandas series indicating nonsilent status
    is_nonsilent_series = mut_df['is_nonsilent'].copy()
    del mut_df['is_nonsilent']
    return is_nonsilent_series

Esempio n. 7

0

Mostra file

def test_single_context_constructor():
    """With nuc_context=1 positions should be grouped by their single base."""
    gene_seq = GeneSequence(gene_fa, nuc_context=1)
    gene_seq.set_gene(bed)
    seq_context = SequenceContext(gene_seq)

    # expected number of positions per base
    expected_counts = {'A': 10, 'T': 4, 'G': 4, 'C': 3}
    # expected positions per base
    expected_positions = {
        'A': [0, 2, 5, 6, 9, 11, 13, 18, 19, 20],
        'C': [1, 15, 16],
        'G': [4, 8, 12, 17],
        'T': [3, 7, 10, 14],
    }

    _check_true_counts(seq_context, expected_counts)
    _check_true_context_pos(seq_context, expected_positions)

Esempio n. 8

0

Mostra file

def test_chasm_context_constructor():
    """With nuc_context=1.5 (chasm) single-base and dinucleotide contexts mix."""
    gene_seq = GeneSequence(gene_fa, nuc_context=1.5)
    gene_seq.set_gene(bed)
    seq_context = SequenceContext(gene_seq)

    # expected number of positions per context label
    expected_counts = {
        'A': 10,
        'C': 1,
        'C*pG': 1,
        'CpG*': 1,
        'G*pA': 3,
        'T': 4,
        'TpC*': 1,
    }
    # expected positions per context label
    expected_positions = {
        'A': [2, 5, 6, 9, 11, 13, 18, 19, 0, 20],
        'C': [1],
        'C*pG': [16],
        'CpG*': [17],
        'G*pA': [4, 8, 12],
        'T': [3, 7, 10, 14],
        'TpC*': [15],
    }

    _check_true_counts(seq_context, expected_counts)
    _check_true_context_pos(seq_context, expected_positions)

Esempio n. 9

0

Mostra file

File: test_position_permutation_test.py Progetto: KarchinLab/probabilistic2020

def test_ctnnb1_get_aa_mut_info():
    """A 'C' at coding position 0 of CTNNB1 should turn codon ATG into CTG."""
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    # open the CTNNB1 FASTA fixture and wrap it in a gene sequence
    gene_fa = pysam.Fastafile(os.path.join(file_dir, 'data/CTNNB1.fa'))
    gs = GeneSequence(gene_fa, nuc_context=1)

    # take the first entry of the CTNNB1 BED fixture as the gene definition
    bed_entries = list(utils.bed_generator(os.path.join(file_dir, 'data/CTNNB1.bed')))
    gs.set_gene(bed_entries[0])

    # mutate coding position 0 to 'C' and fetch the resulting codon info
    positions, bases = [0], ['C']
    aa_info = mc.get_aa_mut_info(positions, bases, gs)

    # the failure message embeds the reference codon that was actually seen
    ref_codon = aa_info['Reference Codon'][0]
    ref_codon_msg = 'First codon should be start codon ({0})'.format(ref_codon)
    assert ref_codon == 'ATG', ref_codon_msg
    assert aa_info['Somatic Codon'][0] == 'CTG', (
        'First "A" should be replaced with a "C"')
    assert aa_info['Codon Pos'][0] == 0, (
        'Start codon should be position 0')

Esempio n. 10

0

Mostra file

def test_simple_constructor():
    """exon_seq should match the fixture and reflect added germline variants."""
    sequence = GeneSequence(gene_fa, nuc_context=1)
    sequence.set_gene(bed)
    original_seq = 'ACATGAATGATAGATCCGAAA'
    assert sequence.exon_seq == original_seq, 'Sequence is not correct'

    # bases and their positions are paired by index; after adding them the
    # exon sequence is expected to carry the substituted bases
    fake_pos = [1, 0, 20, 7, 15]
    fake_germline = ['A', 'C', 'N', 'G', 'T']
    sequence.add_germline_variants(fake_germline, fake_pos)
    updated_seq = 'CAATGAAGGATAGATTCGAAN'
    assert sequence.exon_seq == updated_seq

Esempio n. 11

0

Mostra file

File: randomization_test.py Progetto: thamer33/probabilistic2020

def singleprocess_permutation(info):
    """Run the permutation test chosen by ``opts['kind']`` on each mutated gene.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts, fs_cts_df, p_inactivating)``. ``bed_list``
        must be non-empty (its first element supplies the chromosome name for
        logging). ``fs_cts_df`` and ``p_inactivating`` are unpacked but not
        used in this function.

    Returns
    -------
    list
        For kind 'hotmaps1d' the rows returned by ``calc_hotmaps_p_value``
        are appended individually; for every other kind one row per gene is
        appended, consisting of the p-value result followed by two mutation
        counts (``[num_mapped_muts, unmapped_muts]`` for 'tsg',
        ``[total_mut, unmapped_muts]`` otherwise).
    """
    # initialize input
    bed_list, mut_df, opts, fs_cts_df, p_inactivating = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    gene_fa = pysam.Fastafile(opts['input'])

    # everything after opening the FASTA runs under try/finally so the handle
    # is closed even if one gene raises
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # list of columns that are needed
        cols = [
            'Chromosome',
            'Start_Position',
            'Reference_Allele',
            'Tumor_Allele',
            'Variant_Classification',
        ]
        # conditionally add protein_change column if exists
        if 'Protein_Change' in mut_df.columns:
            cols += ['Protein_Change']

        # figure out which genes actually have a mutation
        genes_with_mut = set(mut_df['Gene'].unique())

        # iterate through each gene
        result = []
        for bed in bed_list:
            if bed.gene_name not in genes_with_mut:
                # skip genes with no mutations
                continue

            # prepare info for running permutation test; take an explicit
            # copy because columns are assigned on this frame below
            mut_info = mut_df.loc[mut_df['Gene'] == bed.gene_name, cols].copy()
            gs.set_gene(bed)
            sc = SequenceContext(gs, seed=opts['seed'])

            # count total mutations in gene
            total_mut = len(mut_info)

            # fix nucleotide letter if gene is on - strand
            if bed.strand == '-':
                rc = mut_info['Tumor_Allele'].map(lambda x: utils.rev_comp(x))
                mut_info.loc[:, 'Tumor_Allele'] = rc

            # get coding positions, mutations unmapped to the reference tx
            # will have NA for a coding position
            pos_list = [
                bed.query_position(bed.strand, row['Chromosome'],
                                   row['Start_Position'])
                for ix, row in mut_info.iterrows()
            ]
            mut_info.loc[:, 'Coding Position'] = pos_list

            # recover mutations that could not be mapped to the reference
            # transcript for a gene before being dropped (next step)
            unmapped_mut_info = mc.recover_unmapped_mut_info(
                mut_info, bed, sc, opts)

            # drop mutations which do not map to reference tx
            mut_info = mut_info.dropna(subset=['Coding Position'])
            mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
            num_mapped_muts = len(mut_info)
            unmapped_muts = total_mut - num_mapped_muts

            # calculate results of permutation test
            kind = opts['kind']
            if kind == 'oncogene':
                # calculate position based permutation results
                tmp_result = mypval.calc_position_p_value(
                    mut_info,
                    unmapped_mut_info,
                    sc,
                    gs,
                    bed,
                    opts['score_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            elif kind == 'tsg':
                # calculate results for deleterious mutation permutation
                # test; frameshift counts are not passed to the simulation
                tmp_result = mypval.calc_deleterious_p_value(
                    mut_info,
                    unmapped_mut_info,
                    sc,
                    gs,
                    bed,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['deleterious'],
                    0,  # no deleterious mutation pseudo count
                    opts['seed'])
                result.append(tmp_result + [num_mapped_muts, unmapped_muts])
            elif kind == 'hotmaps1d':
                # save null distribution if user option specified
                if opts['null_distr_dir']:
                    if not os.path.exists(opts['null_distr_dir']):
                        os.mkdir(opts['null_distr_dir'])
                    # '{0}' is left unformatted here and passed on as-is
                    save_path = os.path.join(opts['null_distr_dir'],
                                             bed.gene_name + '.{0}.txt')
                else:
                    save_path = None
                # calculate position based permutation results
                mywindow = list(map(int, opts['window'].split(',')))
                tmp_result = mypval.calc_hotmaps_p_value(
                    mut_info,
                    unmapped_mut_info,
                    sc,
                    gs,
                    bed,
                    mywindow,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['report_index'],
                    null_save_path=save_path)
                result.extend(tmp_result)
            elif kind == 'protein':
                tmp_result = mypval.calc_protein_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['neighbor_graph_dir'], opts['num_iterations'],
                    opts['stop_criteria'], opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            else:
                # calc results for entropy-on-effect permutation test
                tmp_result = mypval.calc_effect_p_value(
                    mut_info,
                    unmapped_mut_info,
                    sc,
                    gs,
                    bed,
                    opts['num_iterations'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
    finally:
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result

Esempio n. 12

0

Mostra file

File: simulate_non_silent_ratio.py Progetto: yumyai/probabilistic2020

def singleprocess_permutation(info):
    """Accumulate observed and permuted mutation summaries over a list of genes.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts)``. ``bed_list`` must be non-empty (its
        first element supplies the chromosome name for logging). ``opts`` is
        read for 'num_permutations', 'input', 'context', 'by_sample' and
        'score_dir', and is forwarded to ``mc.compute_mutation_context``.

    Returns
    -------
    tuple
        ``(result, obs_result)``. ``result`` holds one row per permutation
        with 7 running totals (9 when ``opts['score_dir']`` is truthy),
        summed over all genes. ``obs_result`` is the list of observed totals,
        or the per-sample DataFrame when ``opts['by_sample']`` is truthy.
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_permutations = opts['num_permutations']
    gene_fa = pysam.Fastafile(opts['input'])

    # the FASTA handle is closed in ``finally`` so it is released even when
    # processing a gene raises
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # variables for recording the actual observed number of non-silent
        # vs. silent mutations
        if not opts['by_sample']:
            obs_silent = 0
            obs_non_silent = 0
            obs_nonsense = 0
            obs_loststop = 0
            obs_splice_site = 0
            obs_loststart = 0
            obs_missense = 0
            obs_vest = 0
            obs_mga_entropy = 0
        else:
            uniq_samp = mut_df['Tumor_Sample'].unique()
            # NOTE(review): `cols` is not defined in this function; presumably
            # a module-level list of column names - confirm it exists.
            obs_df = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                                  index=uniq_samp,
                                  columns=cols)

        # one accumulator row per permutation; two extra slots when scores
        # are available
        num_features = 9 if opts['score_dir'] else 7
        result = [[0] * num_features for k in range(num_permutations)]

        # go through each gene to perform simulation
        for bed in bed_list:
            # compute context counts and somatic bases for each context
            gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            if context_to_mutations:
                ## get information about observed non-silent counts
                # get info about mutations
                tmp_mut_info = mc.get_aa_mut_info(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(), gs)
                # update the observed count
                if not opts['by_sample']:
                    # calc mutation info summarizing observed mutations
                    tmp_result = cutils.calc_summary_info(
                        tmp_mut_info['Reference AA'],
                        tmp_mut_info['Somatic AA'],
                        tmp_mut_info['Codon Pos'],
                        bed.gene_name,
                        opts['score_dir'],
                        min_frac=0.0,
                        min_recur=3)
                    obs_non_silent += tmp_result[0]
                    obs_silent += tmp_result[1]
                    obs_nonsense += tmp_result[2]
                    obs_loststop += tmp_result[3]
                    obs_splice_site += tmp_result[4]
                    obs_loststart += tmp_result[5]
                    obs_missense += tmp_result[6]
                    if opts['score_dir']:
                        # NOTE(review): observed scores are read at -2/-3
                        # here, while permuted rows below are read at
                        # 9 + offset and 10 + offset; confirm both address
                        # the same features.
                        obs_vest += tmp_result[-2]
                        obs_mga_entropy += tmp_result[-3]
                else:
                    for tsamp in mutations_df['Tumor_Sample'].unique():
                        # positional indices of this sample's mutations; a
                        # set keeps the membership tests below O(1)
                        ixs = set(
                            np.where(mutations_df['Tumor_Sample'] == tsamp)[0]
                            .tolist())
                        ref_aa = [
                            r for i, r in enumerate(tmp_mut_info['Reference AA'])
                            if i in ixs
                        ]
                        somatic_aa = [
                            s for i, s in enumerate(tmp_mut_info['Somatic AA'])
                            if i in ixs
                        ]
                        codon_pos = [
                            c for i, c in enumerate(tmp_mut_info['Codon Pos'])
                            if i in ixs
                        ]
                        # get summary info
                        tmp_result = cutils.calc_summary_info(ref_aa,
                                                              somatic_aa,
                                                              codon_pos,
                                                              bed.gene_name,
                                                              opts['score_dir'],
                                                              min_frac=0.0,
                                                              min_recur=3)
                        if opts['score_dir']:
                            # drop three entries so the remaining values can
                            # be added to the obs_df row
                            tmp_result.pop(-4)
                            tmp_result.pop(-4)
                            tmp_result.pop(-1)
                        # update df
                        obs_df.loc[tsamp, :] = (obs_df.loc[tsamp, :] +
                                                np.array(tmp_result))

                ## Do permutations
                tmp_result = pm.summary_permutation(
                    context_cts,
                    context_to_mutations,
                    sc,  # sequence context obj
                    gs,  # gene sequence obj
                    opts['score_dir'],
                    num_permutations)
            else:
                # no mutations: all-zero rows of the width indexed below
                row_width = 14 if opts['score_dir'] else 10
                tmp_result = [[0] * row_width
                              for k in range(num_permutations)]

            # increment the non-silent/silent counts for each permutation;
            # the first `offset` entries of each permuted row are skipped
            offset = 3
            for j in range(num_permutations):
                for k in range(7):
                    result[j][k] += tmp_result[j][k + offset]
                if opts['score_dir']:
                    result[j][7] += tmp_result[j][9 + offset]
                    result[j][8] += tmp_result[j][10 + offset]
    finally:
        gene_fa.close()

    if not opts['by_sample']:
        obs_result = [
            obs_non_silent, obs_silent, obs_nonsense, obs_loststop,
            obs_splice_site, obs_loststart, obs_missense
        ]
        if opts['score_dir']:
            obs_result.extend([obs_mga_entropy, obs_vest])
    else:
        obs_result = obs_df
    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result, obs_result

Esempio n. 13

0

Mostra file

def singleprocess_permutation(info):
    """Collect summary, annotation or simulation rows for each mutated gene.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts)``. ``bed_list`` must be non-empty (its
        first element supplies the chromosome name for logging). ``opts`` is
        read for 'num_iterations', 'input', 'context', 'summary', 'maf',
        'score_dir', 'fraction' and 'recurrent', and is forwarded to
        ``mc.compute_mutation_context``.

    Returns
    -------
    list
        Concatenation of the per-gene rows. Depending on the options each
        gene contributes: one observed-summary row (``summary`` without
        iterations), annotated MAF rows with tumor sample and tumor type
        appended (``maf`` without iterations), the output of
        ``pm.maf_permutation`` (``maf`` with iterations), or the output of
        ``pm.summary_permutation`` otherwise. Genes whose
        ``context_to_mutations`` is empty contribute nothing.
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_iterations = opts['num_iterations']
    gene_fa = pysam.Fastafile(opts['input'])

    # the FASTA handle is closed in ``finally`` so it is released even when
    # processing a gene raises
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # go through each gene to perform simulation
        result = []
        for bed in bed_list:
            # compute context counts and somatic bases for each context
            gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            if not context_to_mutations:
                # nothing to report for this gene
                continue

            if opts['summary'] and not num_iterations:
                ## get information about observed non-silent counts
                tmp_mut_info = mc.get_aa_mut_info(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(), gs)
                # calc mutation info summarizing observed mutations
                tmp_result = cutils.calc_summary_info(
                    tmp_mut_info['Reference AA'],
                    tmp_mut_info['Somatic AA'],
                    tmp_mut_info['Codon Pos'],
                    bed.gene_name,
                    opts['score_dir'],
                    min_frac=opts['fraction'],
                    min_recur=opts['recurrent'])
                # prefix with gene name, 'NA' placeholder and CDS length
                tmp_result = [[bed.gene_name, 'NA', bed.cds_len] + tmp_result]
            elif opts['maf'] and not num_iterations:
                ## Just record protein changes in MAF
                tmp_result = anot.annotate_maf(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(), gs)
                # add tumor sample / tumor type info to output
                tmp_result = [
                    line + [
                        mutations_df['Tumor_Sample'].iloc[i],
                        mutations_df['Tumor_Type'].iloc[i]
                    ] for i, line in enumerate(tmp_result)
                ]
            elif opts['maf']:
                ## Do permutations
                # if user specified MAF format then output all mutations in
                # MAF format
                tmp_result = pm.maf_permutation(context_cts,
                                                context_to_mutations, sc, gs,
                                                num_iterations)
            else:
                # Summarized results for feature for each simulation for each
                # gene
                tmp_result = pm.summary_permutation(
                    context_cts,
                    context_to_mutations,
                    sc,  # sequence context obj
                    gs,  # gene sequence obj
                    opts['score_dir'],
                    num_iterations,
                    min_frac=opts['fraction'],
                    min_recur=opts['recurrent'])
            result += tmp_result
    finally:
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result

Esempio n. 14

0

Mostra file

File: randomization_test.py Progetto: KarchinLab/probabilistic2020

def singleprocess_permutation(info):
    """Compute permutation-test results for every gene that carries a mutation.

    The test that is run depends on ``opts['kind']``: 'oncogene', 'tsg',
    'hotmaps1d', 'protein', or (any other value) the entropy-on-effect test.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts, fs_cts_df, p_inactivating)``. ``bed_list``
        must be non-empty (its first element supplies the chromosome name for
        logging). ``fs_cts_df`` and ``p_inactivating`` are unpacked but not
        used in this function.

    Returns
    -------
    list
        One row per gene made of the p-value result followed by two counts
        (``[num_mapped_muts, unmapped_muts]`` for 'tsg',
        ``[total_mut, unmapped_muts]`` otherwise); for 'hotmaps1d' the rows
        returned by ``calc_hotmaps_p_value`` are added individually instead.
    """
    # initialize input
    bed_list, mut_df, opts, fs_cts_df, p_inactivating = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    gene_fa = pysam.Fastafile(opts['input'])

    # close the FASTA handle in ``finally`` so a failure while processing one
    # gene does not leak it
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # list of columns that are needed
        cols = ['Chromosome', 'Start_Position', 'Reference_Allele',
                'Tumor_Allele', 'Variant_Classification']
        # conditionally add protein_change column if exists
        if 'Protein_Change' in mut_df.columns:
            cols += ['Protein_Change']

        # figure out which genes actually have a mutation
        genes_with_mut = set(mut_df['Gene'].unique())

        # iterate through each gene
        result = []
        for bed in bed_list:
            if bed.gene_name not in genes_with_mut:
                # skip genes with no mutations
                continue

            # prepare info for running permutation test; the slice is copied
            # explicitly because its columns are assigned to below
            gene_mask = mut_df['Gene'] == bed.gene_name
            mut_info = mut_df.loc[gene_mask, cols].copy()
            gs.set_gene(bed)
            sc = SequenceContext(gs, seed=opts['seed'])

            # count total mutations in gene
            total_mut = len(mut_info)

            # fix nucleotide letter if gene is on - strand
            if bed.strand == '-':
                rc = mut_info['Tumor_Allele'].map(lambda x: utils.rev_comp(x))
                mut_info.loc[:, 'Tumor_Allele'] = rc

            # get coding positions, mutations unmapped to the reference tx
            # will have NA for a coding position
            pos_list = [
                bed.query_position(bed.strand, row['Chromosome'],
                                   row['Start_Position'])
                for ix, row in mut_info.iterrows()
            ]
            mut_info.loc[:, 'Coding Position'] = pos_list

            # recover mutations that could not be mapped to the reference
            # transcript for a gene before being dropped (next step)
            unmapped_mut_info = mc.recover_unmapped_mut_info(mut_info, bed,
                                                             sc, opts)

            # drop mutations which do not map to reference tx
            mut_info = mut_info.dropna(subset=['Coding Position'])
            mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
            num_mapped_muts = len(mut_info)
            unmapped_muts = total_mut - num_mapped_muts

            # calculate results of permutation test
            if opts['kind'] == 'oncogene':
                # calculate position based permutation results
                tmp_result = mypval.calc_position_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['score_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            elif opts['kind'] == 'tsg':
                # calculate results for deleterious mutation permutation
                # test; frameshift counts are not passed to the simulation
                tmp_result = mypval.calc_deleterious_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['deleterious'],
                    0,  # no deleterious mutation pseudo count
                    opts['seed'])
                result.append(tmp_result + [num_mapped_muts, unmapped_muts])
            elif opts['kind'] == 'hotmaps1d':
                # save null distribution if user option specified
                if opts['null_distr_dir']:
                    if not os.path.exists(opts['null_distr_dir']):
                        os.mkdir(opts['null_distr_dir'])
                    # '{0}' is left unformatted here and passed on as-is
                    save_path = os.path.join(opts['null_distr_dir'],
                                             bed.gene_name + '.{0}.txt')
                else:
                    save_path = None
                # calculate position based permutation results
                mywindow = list(map(int, opts['window'].split(',')))
                tmp_result = mypval.calc_hotmaps_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    mywindow,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['report_index'],
                    null_save_path=save_path)
                result.extend(tmp_result)
            elif opts['kind'] == 'protein':
                tmp_result = mypval.calc_protein_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['neighbor_graph_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            else:
                # calc results for entropy-on-effect permutation test
                tmp_result = mypval.calc_effect_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['num_iterations'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
    finally:
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result

Esempio n. 15

0

Mostra file

def test_dinuc_context_constructor():
    # dinucleotide context
    gs = GeneSequence(gene_fa, nuc_context=2)
    gs.set_gene(bed)
    sc = SequenceContext(gs)