Beispiel #1
0
def annotate_maf(coding_pos, somatic_base, gene_seq):
    """Describe mutations in a gene as MAF-like rows.

    Parameters
    ----------
    coding_pos : array-like of int
        Positions of the mutations within the gene sequence (0-based
        internally, see the note in the loop below). Element ``k`` is
        paired with ``somatic_base[k]``.
    somatic_base : sequence of str
        Mutated base for each position.
    gene_seq : GeneSequence
        Gene whose ``bed`` attribute supplies the gene name, strand,
        chromosome and the sequence-position to genome-position map.

    Returns
    -------
    maf_list : list of list
        One row per mutation: ``[gene_name, strand, chrom, start, end,
        ref_nuc, somatic_base, dna_change, protein_change, var_class]``.
        ``start`` and ``end`` are the same genome coordinate. An empty
        input yields an empty list.
    """
    # make sure numpy array
    coding_pos = np.array(coding_pos)

    # info about gene
    bed = gene_seq.bed
    gene_name = bed.gene_name
    strand = bed.strand
    chrom = bed.chrom
    bed.init_genome_coordinates()  # map seq pos to genome

    # get genome coordinate (+1 shifts the mapped position by one).
    # ``otypes`` is passed explicitly because np.vectorize cannot infer the
    # output type from a size-0 input and raises ValueError otherwise.
    pos2genome = np.vectorize(lambda x: bed.seqpos2genome[x] + 1,
                              otypes=[int])
    genome_coord = pos2genome(coding_pos)

    # get info about mutations
    tmp_mut_info = mc.get_aa_mut_info(coding_pos, somatic_base, gene_seq)
    ref_nucs = tmp_mut_info['Reference Nuc']
    ref_aas = tmp_mut_info['Reference AA']
    somatic_aas = tmp_mut_info['Somatic AA']
    codon_positions = tmp_mut_info['Codon Pos']

    # get string describing variant
    var_class = cutils.get_variant_classification(ref_aas,
                                                  somatic_aas,
                                                  codon_positions)

    # prepare output
    maf_list = []
    for k, mysomatic_base in enumerate(somatic_base):
        ######
        # Note: positions are converted to 1-based positions
        # for reporting DNA/Protein change, but internally
        # they are represented as 0-based
        ######
        # format DNA change
        ref_nuc = ref_nucs[k]
        dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, coding_pos[k] + 1,
                                           mysomatic_base)

        # format protein change; a missing codon position stays None
        codon_pos = codon_positions[k]
        codon_pos_1_based = (codon_pos + 1) if codon_pos is not None else None
        protein_change = 'p.{0}{1}{2}'.format(ref_aas[k], codon_pos_1_based,
                                              somatic_aas[k])

        # reverse complement if on negative strand; done after the change
        # strings are built, so those keep the un-complemented bases
        if strand == '-':
            ref_nuc = utils.rev_comp(ref_nuc)
            mysomatic_base = utils.rev_comp(mysomatic_base)

        # append results
        maf_list.append([
            gene_name, strand, chrom, genome_coord[k], genome_coord[k],
            ref_nuc, mysomatic_base, dna_change, protein_change, var_class[k]
        ])

    return maf_list
def annotate_maf(coding_pos, somatic_base, gene_seq):
    """Build MAF-like annotation rows for a set of point mutations.

    Parameters
    ----------
    coding_pos : array-like of int
        Mutation positions within the gene sequence; 0-based internally
        (see the note inside the loop). ``coding_pos[k]`` belongs to
        ``somatic_base[k]``.
    somatic_base : sequence of str
        The mutated base at each position.
    gene_seq : GeneSequence
        Gene sequence object; its ``bed`` attribute provides gene name,
        strand, chromosome and ``seqpos2genome``.

    Returns
    -------
    maf_list : list of list
        Rows of ``[gene_name, strand, chrom, start, end, ref_nuc,
        somatic_base, dna_change, protein_change, var_class]`` where
        ``start == end``. Empty when there are no mutations.
    """
    # make sure numpy array
    coding_pos = np.array(coding_pos)

    # info about gene
    gene_name = gene_seq.bed.gene_name
    strand = gene_seq.bed.strand
    chrom = gene_seq.bed.chrom
    gene_seq.bed.init_genome_coordinates()  # map seq pos to genome

    # get genome coordinate. Without an explicit ``otypes`` np.vectorize
    # raises ValueError for size-0 input, which made an empty mutation
    # list crash here instead of producing an empty result.
    pos2genome = np.vectorize(lambda x: gene_seq.bed.seqpos2genome[x]+1,
                              otypes=[int])
    genome_coord = pos2genome(coding_pos)

    # get info about mutations
    tmp_mut_info = mc.get_aa_mut_info(coding_pos,
                                      somatic_base,
                                      gene_seq)

    # get string describing variant
    var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                  tmp_mut_info['Somatic AA'],
                                                  tmp_mut_info['Codon Pos'])

    # prepare output
    maf_list = []
    on_minus_strand = strand == '-'
    for k, mysomatic_base in enumerate(somatic_base):
        ######
        # Note: positions are converted to 1-based positions
        # for reporting DNA/Protein change, but internally
        # they are represented as 0-based
        ######
        # format DNA change
        ref_nuc = tmp_mut_info['Reference Nuc'][k]
        nuc_pos = coding_pos[k]
        dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, nuc_pos+1, mysomatic_base)

        # format protein change (codon position may be None)
        ref_aa = tmp_mut_info['Reference AA'][k]
        somatic_aa = tmp_mut_info['Somatic AA'][k]
        codon_pos = tmp_mut_info['Codon Pos'][k]
        codon_pos_1_based = None if codon_pos is None else codon_pos + 1
        protein_change = 'p.{0}{1}{2}'.format(ref_aa, codon_pos_1_based, somatic_aa)

        # reverse complement if on negative strand (only affects the
        # allele columns, not the change strings built above)
        if on_minus_strand:
            ref_nuc = utils.rev_comp(ref_nuc)
            mysomatic_base = utils.rev_comp(mysomatic_base)

        # append results
        maf_line = [gene_name, strand, chrom, genome_coord[k], genome_coord[k],
                    ref_nuc, mysomatic_base, dna_change,
                    protein_change, var_class[k]]
        maf_list.append(maf_line)

    return maf_list
Beispiel #3
0
def test_rev_comp():
    """Check utils.rev_comp on upper-case, mixed-case and N-containing input."""
    # (input sequence, expected reverse complement); case is preserved
    cases = (
        ('CT', 'AG'),
        ('AaCg', 'cGtT'),
        ('aNnC', 'GnNt'),
    )
    for sequence, expected in cases:
        assert utils.rev_comp(sequence) == expected
Beispiel #4
0
def fetch_gene_fasta(gene_bed, fasta_obj):
    """Assemble FASTA records for a gene's exons and their splice sites.

    Parameters
    ----------
    gene_bed : BedLine
        BedLine object representing a single gene
    fasta_obj : pysam.Fastafile
        fasta object used for indexed retrieval of sequence

    Returns
    -------
    gene_fasta : str
        Concatenated FASTA records. Each exon record is followed by the
        splice-site records that apply to it (none for a single-exon gene).
    """
    strand = gene_bed.strand
    is_minus = strand == '-'
    exons = gene_bed.get_exons()
    if is_minus:
        exons.reverse()  # walk exons 5' to 3' on the '-' strand
    last_ix = len(exons) - 1

    records = []
    for ix, exon in enumerate(exons):
        ex_start, ex_end = exon[0], exon[1]

        # exon record, reverse complemented for '-' strand genes
        seq = fasta_obj.fetch(reference=gene_bed.chrom,
                              start=ex_start,
                              end=ex_end).upper()
        if is_minus:
            seq = utils.rev_comp(seq)
        records.append('>{0};exon{1}\n{2}\n'.format(gene_bed.gene_name, ix,
                                                    seq))

        # Splice sites: every exon except the first gets a 3' SS record and
        # every exon except the last gets a 5' SS record, so a single-exon
        # gene gets neither. The 5' SS record is emitted first.
        ss_args = (fasta_obj, gene_bed.gene_name, ix, gene_bed.chrom, strand,
                   ex_start, ex_end)
        fasta_3ss = _fetch_3ss_fasta(*ss_args) if ix > 0 else ''
        fasta_5ss = _fetch_5ss_fasta(*ss_args) if ix < last_ix else ''
        records.append(fasta_5ss + fasta_3ss)

    return ''.join(records)
def fetch_gene_fasta(gene_bed, fasta_obj):
    """Return the exon and splice-site sequences of a gene as FASTA text.

    Parameters
    ----------
    gene_bed : BedLine
        BedLine object representing a single gene
    fasta_obj : pysam.Fastafile
        fasta object used for indexed retrieval of sequence

    Returns
    -------
    gene_fasta : str
        sequence of gene in FASTA format: one record per exon, each
        followed by its splice-site record(s) when the gene is spliced
    """
    strand = gene_bed.strand
    exons = gene_bed.get_exons()
    if strand == '-':
        exons.reverse()  # order exons 5' to 3', so reverse if '-' strand
    final_index = len(exons) - 1

    gene_fasta = ''
    for i, exon in enumerate(exons):
        # exon sequence, reverse complemented on the '-' strand
        exon_seq = fasta_obj.fetch(reference=gene_bed.chrom,
                                   start=exon[0],
                                   end=exon[1]).upper()
        if strand == '-':
            exon_seq = utils.rev_comp(exon_seq)
        gene_fasta += '>{0};exon{1}\n{2}\n'.format(gene_bed.gene_name,
                                                   i, exon_seq)

        def splice_site(fetcher):
            # call one of the splice-site fetchers for the current exon
            return fetcher(fasta_obj, gene_bed.gene_name, i,
                           gene_bed.chrom, strand, exon[0], exon[1])

        is_first = i == 0
        is_last = i == final_index
        if is_first and is_last:
            # single exon: splice sites don't matter without splicing
            continue
        if is_first:
            # first exon only has a 5' SS
            gene_fasta += splice_site(_fetch_5ss_fasta)
        elif is_last:
            # last exon only has a 3' SS
            gene_fasta += splice_site(_fetch_3ss_fasta)
        else:
            # middle exon has both; the 5' SS record is written first
            fasta_3ss = splice_site(_fetch_3ss_fasta)
            fasta_5ss = splice_site(_fetch_5ss_fasta)
            gene_fasta += fasta_5ss + fasta_3ss

    return gene_fasta
def _fetch_3ss_fasta(fasta, gene_name, exon_num,
                     chrom, strand, start, end):
    """Retrieves the 3' SS sequence flanking the specified exon.

    Returns a string in fasta format: the first line is the ">" header
    ``>{gene_name};exon{exon_num};3SS`` and the second line is the
    upper-cased sequence. The fetched window is four bases wide:
    ``[start-3, start+1)`` on the '+' strand and ``[end-1, end+3)``
    (reverse complemented) on the '-' strand.
    NOTE(review): presumably the two splice-site bases plus one flanking
    base on each side -- confirm against the consumer of this output.

    Parameters
    ----------
    fasta : pysam.Fastafile
        fasta object from pysam
    gene_name : str
        gene name used for fasta seq id
    exon_num : int
        the `exon_num` exon, used for seq id
    chrom : str
        chromosome
    strand : str
        strand, {'+', '-'}
    start : int
        0-based start position
    end : int
        0-based end position

    Returns
    -------
    ss_fasta : str
        string in fasta format with first line being seq id

    Raises
    ------
    ValueError
        If `strand` is neither '+' nor '-'.
    """
    if strand == '-':
        ss_seq = fasta.fetch(reference=chrom,
                             start=end-1,
                             end=end+3)
        ss_seq = utils.rev_comp(ss_seq)
    elif strand == '+':
        ss_seq = fasta.fetch(reference=chrom,
                             start=start-3,
                             end=start+1)
    else:
        # previously fell through and failed later with UnboundLocalError
        raise ValueError(
            "strand must be '+' or '-', got {0!r}".format(strand))
    ss_fasta = '>{0};exon{1};3SS\n{2}\n'.format(gene_name,
                                                exon_num,
                                                ss_seq.upper())
    return ss_fasta
Beispiel #7
0
def _fetch_3ss_fasta(fasta, gene_name, exon_num, chrom, strand, start, end):
    """Retrieves the 3' SS sequence flanking the specified exon.

    Returns a string in fasta format with the first line containing
    a ">" header and the second line containing the upper-cased sequence
    around the 3' SS. Four bases are fetched: ``[start - 3, start + 1)``
    for '+' strand exons, ``[end - 1, end + 3)`` reverse complemented
    for '-' strand exons.
    NOTE(review): the extra bases look like one base of flanking context
    on each side of the two splice-site bases -- confirm.

    Parameters
    ----------
    fasta : pysam.Fastafile
        fasta object from pysam
    gene_name : str
        gene name used for fasta seq id
    exon_num : int
        the `exon_num` exon, used for seq id
    chrom : str
        chromosome
    strand : str
        strand, {'+', '-'}
    start : int
        0-based start position
    end : int
        0-based end position

    Returns
    -------
    ss_fasta : str
        string in fasta format with first line being seq id

    Raises
    ------
    ValueError
        If `strand` is not one of '+' or '-'.
    """
    # reject unknown strands up front; otherwise no branch would assign
    # the sequence and formatting would fail with UnboundLocalError
    if strand not in ('+', '-'):
        raise ValueError(
            "strand must be '+' or '-', got {0!r}".format(strand))

    if strand == '-':
        ss_seq = fasta.fetch(reference=chrom, start=end - 1, end=end + 3)
        ss_seq = utils.rev_comp(ss_seq)
    else:
        ss_seq = fasta.fetch(reference=chrom, start=start - 3, end=start + 1)
    return '>{0};exon{1};3SS\n{2}\n'.format(gene_name, exon_num,
                                            ss_seq.upper())
Beispiel #8
0
def maf_permutation(context_counts,
                    context_to_mut,
                    seq_context,
                    gene_seq,
                    num_permutations=10000,
                    drop_silent=False):
    """Performs null-permutations across all genes and records the results in
    a format like a MAF file. This could be useful for examining the null
    permutations because the alternative approaches always summarize the results.
    With the simulated null-permutations, novel metrics can be applied to create
    an empirical null-distribution.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match this.

    Returns
    -------
    maf_list : list of list
        list of null mutations with mutation info in a MAF like format:
        ``[gene_name, strand, chrom, start, end, ref_nuc, somatic_base,
        context, dna_change, protein_change, variant_class]``. Empty when
        there are no observed mutations to permute.
    """
    mycontexts = context_counts.index.tolist()
    base_pairs = [(base, one_context)
                  for one_context in mycontexts
                  for base in context_to_mut[one_context]]
    if not base_pairs:
        # nothing to simulate; zip(*[]) below could not be unpacked and
        # np.hstack rejects an empty sequence
        return []
    somatic_base, base_context = zip(*base_pairs)

    # get random positions determined by sequence context
    # (Series.items() replaces iteritems(), which pandas 2.0 removed)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # info about gene
    gene_name = gene_seq.bed.gene_name
    strand = gene_seq.bed.strand
    chrom = gene_seq.bed.chrom
    gene_seq.bed.init_genome_coordinates()  # map seq pos to genome

    # sequence position -> genome coordinate (+1); built once because it
    # does not depend on the permutation row
    pos2genome = np.vectorize(lambda x: gene_seq.bed.seqpos2genome[x]+1)

    # determine result of random positions
    maf_list = []
    for row in tmp_mut_pos:
        # get genome coordinate
        genome_coord = pos2genome(row)

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # get string describing variant
        var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                      tmp_mut_info['Somatic AA'],
                                                      tmp_mut_info['Codon Pos'])

        # prepare output
        for k, mysomatic_base in enumerate(somatic_base):
            # classification entries are decoded with .decode(), i.e. they
            # are expected to be bytes
            variant = var_class[k].decode()
            if drop_silent and variant == 'Silent':
                continue

            # format DNA change
            # NOTE(review): the position is reported exactly as sampled,
            # with no +1 shift; confirm whether 1-based reporting is wanted
            ref_nuc = tmp_mut_info['Reference Nuc'][k]
            nuc_pos = row[k]
            dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, nuc_pos, mysomatic_base)

            # format protein change (same remark about the codon position)
            ref_aa = tmp_mut_info['Reference AA'][k]
            somatic_aa = tmp_mut_info['Somatic AA'][k]
            codon_pos = tmp_mut_info['Codon Pos'][k]
            protein_change = 'p.{0}{1}{2}'.format(ref_aa, codon_pos, somatic_aa)

            # reverse complement if on negative strand
            if strand == '-':
                ref_nuc = utils.rev_comp(ref_nuc)
                mysomatic_base = utils.rev_comp(mysomatic_base)

            # append results
            maf_line = [gene_name, strand, chrom, genome_coord[k], genome_coord[k],
                        ref_nuc, mysomatic_base, base_context[k], dna_change,
                        protein_change, variant]
            maf_list.append(maf_line)

    return maf_list
def detect_coordinates(mut_df, genome_fa):
    """Guess the coordinate base and strand convention of a mutation table.

    For every row the reference allele is compared with the genome sequence
    fetched at ``[Start_Position, End_Position)`` and at
    ``[Start_Position - 1, End_Position)``, both directly and after
    reverse complementing the allele. The tallies, together with the
    fraction of rows whose start equals their end, drive the guess.

    Parameters
    ----------
    mut_df : pd.DataFrame
        Mutations with columns 'Chromosome', 'Start_Position',
        'End_Position' and 'Reference_Allele'.
    genome_fa : pysam.Fastafile
        Object exposing ``fetch(reference=..., start=..., end=...)``.
        NOTE(review): presumably a pysam.Fastafile -- confirm with caller.

    Returns
    -------
    tuple
        ``(coord_base, strand)`` where ``coord_base`` is 0 or 1 and
        ``strand`` is 'coding' or '+'.
    """
    def fraction(count, total):
        # tolerate an empty table or a table without SNVs
        return count / float(total) if total else 0.0

    # detect problems with using 0-based coordinates
    zero_len_count = 0
    num_snv = 0
    # index 0: window starting at Start_Position - 1
    # index 1: window starting at Start_Position
    matching_ref = [0, 0]
    matching_pair = [0, 0]
    bad_match = [0, 0]
    for _, row in mut_df.iterrows():
        ref_allele = row['Reference_Allele']
        single_base = len(ref_allele) == 1

        if (row['End_Position'] - row['Start_Position']) == 0:
            zero_len_count += 1
        no_shift_seq = genome_fa.fetch(reference=row['Chromosome'],
                                       start=row['Start_Position'],
                                       end=row['End_Position'])
        minus_1_seq = genome_fa.fetch(reference=row['Chromosome'],
                                      start=row['Start_Position']-1,
                                      end=row['End_Position'])

        if single_base and ref_allele != '-':
            num_snv += 1

        for i, seq in enumerate((minus_1_seq, no_shift_seq)):
            seq = seq.upper()
            if seq == ref_allele.upper() and single_base:
                matching_ref[i] += 1
            elif seq == utils.rev_comp(ref_allele).upper() and single_base:
                matching_pair[i] += 1
            else:
                bad_match[i] += 1

    # Summarise. Lists (not map objects) are required because the results
    # are indexed below, which fails on Python 3 iterators.
    num_mut = len(mut_df)
    zero_len_pct = fraction(zero_len_count, num_mut)
    matching_pair_pct = [fraction(x, num_snv) for x in matching_pair]
    matching_pct = [fraction(x, num_snv) for x in matching_ref]
    bad_match_pct = [fraction(x, num_snv) for x in bad_match]
    # a single '%' is correct here: str.format does not treat it specially
    logger.info('{0:.2f}% for {1} tested mutations had zero length'.format(100*zero_len_pct, num_mut))
    logger.info('{0} for {1} did match the + strand reference genome'.format(matching_pct, num_snv))
    logger.info('{0} for {1} did match the - strand reference genome'.format(matching_pair_pct, num_snv))
    logger.info('{0} for {1} was a bad match'.format(bad_match_pct, num_snv))

    # coordinate base: many zero-length records imply 1-based; otherwise
    # 0-based only if the shifted window matched more often
    shifted_hits = matching_ref[0] + matching_pair[0]
    unshifted_hits = matching_ref[1] + matching_pair[1]
    if zero_len_pct > .3:
        coord_base = 1
    elif shifted_hits > unshifted_hits:
        coord_base = 0
    else:
        coord_base = 1
    logger.info('{0}-based coordinate system likely used.'.format(coord_base))

    # strand convention, decided the same way for either coordinate base
    if matching_pair_pct[1] > .25:
        logger.info('Mutations likely reported on the genes\'s coding strand')
        return coord_base, 'coding'
    logger.info('Mutations likely reported on the genes\'s + strand')
    return coord_base, '+'
def singleprocess_permutation(info):
    """Run the configured permutation test for every mutated gene in a list.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts, fs_cts_df, p_inactivating)``.
        ``bed_list`` is a non-empty list of gene BED objects (the first
        one's chromosome is used for logging), ``mut_df`` is the mutation
        DataFrame and ``opts`` the options dict; ``opts['kind']`` selects
        the test ('oncogene', 'tsg', 'hotmaps1d', 'protein', anything else
        runs the effect test). ``fs_cts_df`` and ``p_inactivating`` are
        unpacked but not used here.

    Returns
    -------
    result : list
        Per-gene results. For 'hotmaps1d' the rows returned by the test
        are extended into the list; for the other kinds one row per gene
        is appended, consisting of the test output plus two mutation
        counts.
    """
    # initialize input
    bed_list, mut_df, opts, fs_cts_df, p_inactivating = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    gene_fa = pysam.Fastafile(opts['input'])

    # list of columns that are needed
    cols = [
        'Chromosome',
        'Start_Position',
        'Reference_Allele',
        'Tumor_Allele',
        'Variant_Classification',
    ]
    # conditionally add protein_change column if exists
    if 'Protein_Change' in mut_df.columns:
        cols += ['Protein_Change']

    # figure out which genes actually have a mutation
    genes_with_mut = set(mut_df['Gene'].unique())

    result = []
    # try/finally so the fasta handle is released even if a gene fails
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # iterate through each gene
        for bed in bed_list:
            if bed.gene_name not in genes_with_mut:
                # skip genes with no mutations
                continue

            # prepare info for running permutation test; explicit copy
            # because columns are overwritten / added below
            mut_info = mut_df.loc[mut_df['Gene'] == bed.gene_name,
                                  cols].copy()
            gs.set_gene(bed)
            sc = SequenceContext(gs, seed=opts['seed'])

            # count total mutations in gene
            total_mut = len(mut_info)

            # fix nucleotide letter if gene is on - strand
            if bed.strand == '-':
                rc = mut_info['Tumor_Allele'].map(lambda x: utils.rev_comp(x))
                mut_info.loc[:, 'Tumor_Allele'] = rc

            # get coding positions, mutations unmapped to the reference tx
            # will have NA for a coding position
            pos_list = [
                bed.query_position(bed.strand, chrom, start_pos)
                for chrom, start_pos in zip(mut_info['Chromosome'],
                                            mut_info['Start_Position'])
            ]
            mut_info.loc[:, 'Coding Position'] = pos_list

            # recover mutations that could not be mapped to the reference
            # transcript for a gene before being dropped (next step)
            unmapped_mut_info = mc.recover_unmapped_mut_info(
                mut_info, bed, sc, opts)

            # drop mutations which do not map to reference tx
            mut_info = mut_info.dropna(subset=['Coding Position'])
            mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
            num_mapped_muts = len(mut_info)
            unmapped_muts = total_mut - num_mapped_muts

            # calculate results of permutation test
            if opts['kind'] == 'oncogene':
                # calculate position based permutation results
                tmp_result = mypval.calc_position_p_value(
                    mut_info,
                    unmapped_mut_info,
                    sc,
                    gs,
                    bed,
                    opts['score_dir'],
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            elif opts['kind'] == 'tsg':
                # calculate results for deleterious mutation permutation
                # test; note this branch reports the mapped count rather
                # than the total count
                tmp_result = mypval.calc_deleterious_p_value(
                    mut_info,
                    unmapped_mut_info,
                    sc,
                    gs,
                    bed,
                    opts['num_iterations'],
                    opts['stop_criteria'],
                    opts['deleterious'],
                    0,  # no deleterious mutation pseudo count
                    opts['seed'])
                result.append(tmp_result + [num_mapped_muts, unmapped_muts])
            elif opts['kind'] == 'hotmaps1d':
                # save null distribution if user option specified
                null_dir = opts['null_distr_dir']
                if null_dir:
                    if not os.path.exists(null_dir):
                        try:
                            os.mkdir(null_dir)
                        except OSError:
                            # tolerate the directory appearing between the
                            # check and the mkdir (e.g. another worker
                            # sharing the same output directory)
                            if not os.path.exists(null_dir):
                                raise
                    save_path = os.path.join(null_dir,
                                             bed.gene_name + '.{0}.txt')
                else:
                    save_path = None
                # calculate position based permutation results
                mywindow = list(map(int, opts['window'].split(',')))
                tmp_result = mypval.calc_hotmaps_p_value(mut_info,
                                                         unmapped_mut_info,
                                                         sc,
                                                         gs,
                                                         bed,
                                                         mywindow,
                                                         opts['num_iterations'],
                                                         opts['stop_criteria'],
                                                         opts['report_index'],
                                                         null_save_path=save_path)
                result.extend(tmp_result)
            elif opts['kind'] == 'protein':
                tmp_result = mypval.calc_protein_p_value(
                    mut_info, unmapped_mut_info, sc, gs, bed,
                    opts['neighbor_graph_dir'], opts['num_iterations'],
                    opts['stop_criteria'], opts['recurrent'], opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            else:
                # calc results for entropy-on-effect permutation test
                tmp_result = mypval.calc_effect_p_value(
                    mut_info,
                    unmapped_mut_info,
                    sc,
                    gs,
                    bed,
                    opts['num_iterations'],
                    0,  # no recurrent mutation pseudo count
                    opts['recurrent'],
                    opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
    finally:
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result
Beispiel #11
0
def compute_mutation_context(bed, gs, df, opts):
    """Group a gene's mutations by nucleotide context.

    Parameters
    ----------
    bed : BedLine
        Gene to analyse; provides ``gene_name``, ``strand`` and
        ``query_position``.
    gs : GeneSequence
        Gene sequence object; ``gs.set_gene(bed)`` is called on it.
    df : pd.DataFrame
        Mutations. Must contain 'Gene', 'Chromosome', 'Start_Position',
        'Reference_Allele', 'Tumor_Allele', 'Variant_Classification',
        'Protein_Change', 'Tumor_Sample' and 'Tumor_Type'.
    opts : dict
        Options; an optional 'seed' entry seeds the sequence context. The
        dict is also passed on to ``recover_unmapped_mut_info``.

    Returns
    -------
    context_cts : pd.Series
        Number of mutations per context (empty if nothing mapped).
    context_to_mutations : dict
        Context -> Series of 'Tumor_Allele' values in that context.
    tmp_df : pd.DataFrame
        Mapped and recovered mutations with columns 'Context',
        'Tumor_Allele', 'Coding Position', 'Tumor_Sample', 'Tumor_Type'.
    gs : GeneSequence
        The object passed in, now set to `bed`.
    sc : SequenceContext
        Sequence context built for the gene.
    """
    # prepare info for running permutation test
    input_cols = [
        'Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Allele',
        'Variant_Classification', 'Protein_Change', 'Tumor_Sample',
        'Tumor_Type'
    ]
    # explicit copy: columns are overwritten / added below, which on a
    # slice of `df` triggers pandas' chained-assignment warning
    mut_info = df.loc[df['Gene'] == bed.gene_name, input_cols].copy()
    gs.set_gene(bed)

    # get sequence context
    if 'seed' in opts:
        sc = prob2020.python.sequence_context.SequenceContext(
            gs, seed=opts['seed'])
    else:
        sc = prob2020.python.sequence_context.SequenceContext(gs)

    # count total mutations in gene
    total_mut = len(mut_info)

    # fix nucleotide letter if gene is on - strand
    if bed.strand == '-':
        mut_info.loc[:, 'Tumor_Allele'] = mut_info['Tumor_Allele'].map(
            lambda x: utils.rev_comp(x))

    # get coding positions, mutations unmapped to the reference tx will have
    # NA for a coding position
    pos_list = []
    for ix, row in mut_info.iterrows():
        coding_pos = bed.query_position(bed.strand, row['Chromosome'],
                                        row['Start_Position'])
        pos_list.append(coding_pos)
    mut_info['Coding Position'] = pos_list

    # recover mutations that could not be mapped to the reference transcript
    # for a gene before being dropped (next step)
    unmapped_mut_info = recover_unmapped_mut_info(mut_info, bed, sc, opts)

    # drop mutations which do not map to reference tx
    mut_info = mut_info.dropna(subset=['Coding Position'])
    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    unmapped_muts = total_mut - len(mut_info)  # kept for parity; not returned

    cols = [
        'Context', 'Tumor_Allele', 'Coding Position', 'Tumor_Sample',
        'Tumor_Type'
    ]
    if len(mut_info) > 0:
        mut_info['Context'] = mut_info['Coding Position'].apply(
            lambda x: sc.pos2context[x])

        # group mutations by context
        unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
        rename_dict = {'Codon Pos': 'Coding Position'}
        unmapped_mut_df = unmapped_mut_df.rename(columns=rename_dict)
        tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
        context_cts = tmp_df['Context'].value_counts()
        context_to_mutations = dict(
            (name, group['Tumor_Allele'])
            for name, group in tmp_df.groupby('Context'))
    else:
        # initialize empty results if there are no mapped mutations
        # NOTE(review): unmapped_mut_info is not used in this branch, so
        # recovered mutations are dropped when nothing maps -- confirm
        # that this is intended.
        # explicit integer dtype matches value_counts() and avoids pandas'
        # warning about the default dtype of an empty Series
        context_cts = pd.Series([], dtype=int)
        context_to_mutations = {}
        tmp_df = pd.DataFrame(columns=cols)

    return context_cts, context_to_mutations, tmp_df, gs, sc
Beispiel #12
0
def detect_coordinates(mut_df, genome_fa):
    """Infer whether mutations use 0- or 1-based coordinates and which strand.

    Each row's reference allele is compared (directly and reverse
    complemented) against two genome windows: one starting at
    ``Start_Position - 1`` and one starting at ``Start_Position``, both
    ending at ``End_Position``. The match tallies and the share of rows
    with ``End_Position == Start_Position`` determine the answer.

    Parameters
    ----------
    mut_df : pd.DataFrame
        Needs columns 'Chromosome', 'Start_Position', 'End_Position' and
        'Reference_Allele'.
    genome_fa : pysam.Fastafile
        Genome accessor offering ``fetch(reference, start, end)``.
        NOTE(review): assumed to be a pysam.Fastafile -- verify at caller.

    Returns
    -------
    tuple
        ``(coord_base, strand)``; ``coord_base`` is 0 or 1, ``strand`` is
        'coding' or '+'.
    """
    # detect problems with using 0-based coordinates
    zero_len_count = 0
    num_snv = 0
    # slot 0 -> window shifted by -1, slot 1 -> window as given
    matching_ref = [0, 0]
    matching_pair = [0, 0]
    bad_match = [0, 0]
    for _, row in mut_df.iterrows():
        chrom = row['Chromosome']
        start_pos = row['Start_Position']
        end_pos = row['End_Position']
        ref = row['Reference_Allele']
        ref_is_one_char = len(ref) == 1

        if (end_pos - start_pos) == 0:
            zero_len_count += 1
        no_shift_seq = genome_fa.fetch(reference=chrom,
                                       start=start_pos,
                                       end=end_pos)
        minus_1_seq = genome_fa.fetch(reference=chrom,
                                      start=start_pos - 1,
                                      end=end_pos)

        if ref_is_one_char and ref != '-':
            num_snv += 1

        for i, window in enumerate([minus_1_seq, no_shift_seq]):
            window = window.upper()
            if window == ref.upper() and ref_is_one_char:
                matching_ref[i] += 1
            elif window == utils.rev_comp(ref).upper() and ref_is_one_char:
                matching_pair[i] += 1
            else:
                bad_match[i] += 1

    # Denominators fall back to 1 when there are no rows / no SNVs so the
    # fractions become 0.0 instead of raising ZeroDivisionError.
    num_mut = len(mut_df)
    mut_denom = float(num_mut) if num_mut else 1.0
    snv_denom = float(num_snv) if num_snv else 1.0
    zero_len_pct = zero_len_count / mut_denom
    # real lists: the values are subscripted below, which a Python 3
    # ``map`` object does not support
    matching_pair_pct = [x / snv_denom for x in matching_pair]
    matching_pct = [x / snv_denom for x in matching_ref]
    bad_match_pct = [x / snv_denom for x in bad_match]
    # str.format leaves '%' alone, so one '%' prints one percent sign
    logger.info('{0:.2f}% for {1} tested mutations had zero length'.format(
        100 * zero_len_pct, num_mut))
    logger.info('{0} for {1} did match the + strand reference genome'.format(
        matching_pct, num_snv))
    logger.info('{0} for {1} did match the - strand reference genome'.format(
        matching_pair_pct, num_snv))
    logger.info('{0} for {1} was a bad match'.format(bad_match_pct, num_snv))

    # return coordinate type: 0-based only when zero-length records are
    # uncommon and the shifted window matched strictly more often
    shifted_wins = (matching_ref[0] + matching_pair[0]) > (matching_ref[1] +
                                                           matching_pair[1])
    if zero_len_pct <= .3 and shifted_wins:
        coord_base = 0
        logger.info('0-based coordinate system likely used.')
    else:
        coord_base = 1
        logger.info('1-based coordinate system likely used.')

    if matching_pair_pct[1] > .25:
        logger.info(
            'Mutations likely reported on the genes\'s coding strand')
        strand = 'coding'
    else:
        logger.info('Mutations likely reported on the genes\'s + strand')
        strand = '+'
    return coord_base, strand
def maf_permutation(context_counts,
                    context_to_mut,
                    seq_context,
                    gene_seq,
                    num_permutations=10000,
                    drop_silent=False):
    """Performs null-permutations across all genes and records the results in
    a format like a MAF file. This could be useful for examining the null
    permutations because the alternative approaches always summarize the results.
    With the simulated null-permutations, novel metrics can be applied to create
    an empirical null-distribution.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match this.

    Returns
    -------
    maf_list : list of lists
        list of null mutations with mutation info in a MAF like format.
        Each entry is ``[gene, strand, chrom, start, end, ref nucleotide,
        somatic base, context, DNA change, protein change,
        variant classification]``. An empty list is returned when there
        are no (context, somatic base) pairs to simulate.
    """
    mycontexts = context_counts.index.tolist()
    base_context_pairs = [(base, one_context)
                          for one_context in mycontexts
                          for base in context_to_mut[one_context]]
    if not base_context_pairs:
        # nothing to permute; unpacking zip(*[]) below would raise otherwise
        return []
    somatic_base, base_context = zip(*base_context_pairs)

    # get random positions determined by sequence context
    # (Series.items replaces iteritems, which newer pandas no longer has)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack requires a real sequence of arrays, not a generator
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # info about gene
    gene_name = gene_seq.bed.gene_name
    strand = gene_seq.bed.strand
    chrom = gene_seq.bed.chrom
    gene_seq.bed.init_genome_coordinates()  # map seq pos to genome

    # loop-invariant: maps sequence positions to genome coordinates (+1)
    pos2genome = np.vectorize(lambda x: gene_seq.bed.seqpos2genome[x] + 1)

    # determine result of random positions
    maf_list = []
    for row in tmp_mut_pos:
        # get genome coordinate
        genome_coord = pos2genome(row)

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # get string describing variant
        var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                      tmp_mut_info['Somatic AA'],
                                                      tmp_mut_info['Codon Pos'])

        # prepare output
        for k, mysomatic_base in enumerate(somatic_base):
            variant = var_class[k].decode()
            if drop_silent and variant == 'Silent':
                continue

            ######
            # Note: positions are converted to 1-based positions
            # for reporting DNA/Protein change, but internally
            # they are represented as 0-based
            ######
            # format DNA change
            ref_nuc = tmp_mut_info['Reference Nuc'][k]
            nuc_pos = row[k]
            dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, nuc_pos + 1,
                                               mysomatic_base)

            # format protein change
            ref_aa = tmp_mut_info['Reference AA'][k]
            somatic_aa = tmp_mut_info['Somatic AA'][k]
            codon_pos = tmp_mut_info['Codon Pos'][k]
            codon_pos_1_based = (codon_pos + 1) if codon_pos is not None else None
            protein_change = 'p.{0}{1}{2}'.format(ref_aa, codon_pos_1_based,
                                                  somatic_aa)

            # reverse complement if on negative strand (the DNA change string
            # above intentionally keeps the un-complemented letters)
            if strand == '-':
                ref_nuc = utils.rev_comp(ref_nuc)
                mysomatic_base = utils.rev_comp(mysomatic_base)

            # append results
            maf_line = [gene_name, strand, chrom, genome_coord[k], genome_coord[k],
                        ref_nuc, mysomatic_base, base_context[k], dna_change,
                        protein_change, variant]
            maf_list.append(maf_line)

    return maf_list
def singleprocess_permutation(info):
    """Run the configured permutation test for each mutated gene in a list.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts, fs_cts_df, p_inactivating)`` where
        ``bed_list`` holds the gene annotations to process, ``mut_df`` is the
        mutation table (needs a ``Gene`` column plus ``Chromosome``,
        ``Start_Position``, ``Reference_Allele``, ``Tumor_Allele`` and
        ``Variant_Classification``; ``Protein_Change`` is used if present)
        and ``opts`` is the options dictionary (``input``, ``context``,
        ``seed``, ``kind`` and the keys required by the selected test).
        ``fs_cts_df`` and ``p_inactivating`` are unpacked but not used here.

    Returns
    -------
    result : list
        Per-gene output of the test selected by ``opts['kind']``. For
        'oncogene', 'protein' and the default (effect) test each row ends with
        ``[total_mut, unmapped_muts]``; for 'tsg' it ends with
        ``[num_mapped_muts, unmapped_muts]``; for 'hotmaps1d' the rows
        returned by the test are added as-is. Empty when ``bed_list`` is
        empty.
    """
    # initialize input
    bed_list, mut_df, opts, fs_cts_df, p_inactivating = info
    if len(bed_list) == 0:
        # nothing to process; avoids IndexError on bed_list[0] below
        return []
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    gene_fa = pysam.Fastafile(opts['input'])

    # make sure the FASTA handle is released even if a gene fails
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # list of columns that are needed
        cols = ['Chromosome', 'Start_Position', 'Reference_Allele',
                'Tumor_Allele', 'Variant_Classification']
        # conditionally add protein_change column if exists
        if 'Protein_Change' in mut_df.columns:
            cols += ['Protein_Change']

        # figure out which genes actually have a mutation
        genes_with_mut = set(mut_df['Gene'].unique())

        # iterate through each gene
        result = []
        for bed in bed_list:
            if bed.gene_name not in genes_with_mut:
                # skip genes with no mutations
                continue

            # prepare info for running permutation test; explicit copy so the
            # in-place column edits below never touch (or warn about) mut_df
            mut_info = mut_df.loc[mut_df['Gene'] == bed.gene_name, cols].copy()
            gs.set_gene(bed)
            sc = SequenceContext(gs, seed=opts['seed'])

            # count total mutations in gene
            total_mut = len(mut_info)

            # fix nucleotide letter if gene is on - strand
            if bed.strand == '-':
                rc = mut_info['Tumor_Allele'].map(utils.rev_comp)
                mut_info.loc[:, 'Tumor_Allele'] = rc

            # get coding positions, mutations unmapped to the reference tx
            # will have NA for a coding position
            pos_list = [bed.query_position(bed.strand,
                                           row['Chromosome'],
                                           row['Start_Position'])
                        for _, row in mut_info.iterrows()]
            mut_info.loc[:, 'Coding Position'] = pos_list

            # recover mutations that could not be mapped to the reference
            # transcript for a gene before being dropped (next step)
            unmapped_mut_info = mc.recover_unmapped_mut_info(mut_info, bed, sc, opts)

            # drop mutations which do not map to reference tx
            mut_info = mut_info.dropna(subset=['Coding Position']).copy()
            mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
            num_mapped_muts = len(mut_info)
            unmapped_muts = total_mut - num_mapped_muts

            # calculate results of permutation test
            if opts['kind'] == 'oncogene':
                # calculate position based permutation results
                tmp_result = mypval.calc_position_p_value(mut_info, unmapped_mut_info, sc,
                                                          gs, bed, opts['score_dir'],
                                                          opts['num_iterations'],
                                                          opts['stop_criteria'],
                                                          0,  # no recurrent mutation pseudo count
                                                          opts['recurrent'],
                                                          opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            elif opts['kind'] == 'tsg':
                # calculate results for deleterious mutation permutation test
                # (frameshift counts from fs_cts_df are no longer used here)
                tmp_result = mypval.calc_deleterious_p_value(mut_info, unmapped_mut_info,
                                                             sc, gs, bed,
                                                             opts['num_iterations'],
                                                             opts['stop_criteria'],
                                                             opts['deleterious'],
                                                             0,  # no deleterious mutation pseudo count
                                                             opts['seed'])
                # NOTE(review): reports mapped count, unlike the other tests
                # which report total_mut -- confirm this is intended
                result.append(tmp_result + [num_mapped_muts, unmapped_muts])
            elif opts['kind'] == 'hotmaps1d':
                # save null distribution if user option specified
                null_dir = opts['null_distr_dir']
                if null_dir:
                    if not os.path.exists(null_dir):
                        try:
                            os.mkdir(null_dir)
                        except OSError:
                            # tolerate the directory appearing between the
                            # check and mkdir (e.g. created concurrently)
                            if not os.path.isdir(null_dir):
                                raise
                    save_path = os.path.join(null_dir, bed.gene_name + '.{0}.txt')
                else:
                    save_path = None
                # calculate position based permutation results
                mywindow = list(map(int, opts['window'].split(',')))
                tmp_result = mypval.calc_hotmaps_p_value(mut_info, unmapped_mut_info, sc,
                                                         gs, bed,
                                                         mywindow,
                                                         opts['num_iterations'],
                                                         opts['stop_criteria'],
                                                         opts['report_index'],
                                                         null_save_path=save_path)
                result.extend(tmp_result)
            elif opts['kind'] == 'protein':
                tmp_result = mypval.calc_protein_p_value(mut_info, unmapped_mut_info,
                                                         sc, gs, bed,
                                                         opts['neighbor_graph_dir'],
                                                         opts['num_iterations'],
                                                         opts['stop_criteria'],
                                                         opts['recurrent'],
                                                         opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
            else:
                # calc results for entropy-on-effect permutation test
                tmp_result = mypval.calc_effect_p_value(mut_info, unmapped_mut_info,
                                                        sc, gs, bed,
                                                        opts['num_iterations'],
                                                        0,  # no recurrent mutation pseudo count
                                                        opts['recurrent'],
                                                        opts['fraction'])
                result.append(tmp_result + [total_mut, unmapped_muts])
    finally:
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result