Beispiel #1
0
def counter_triplets_MB():
    """Count sequence contexts inside every mappable 1 Mb window.

    Reads the mappable-windows table, fetches the reference sequence of each
    interval with ``hg19`` and tallies whatever ``slicing_window`` yields for
    it (referred to as triplets below), grouped by the ``ID`` column.

    Three gzipped pickles are written under ``data/megabase_probability/``:

    * ``counter_1Mb.pckl.gz`` -- ``{ID: Counter of triplets}``
    * ``mappable_counts_megabase_mutations.pckl.gz`` -- ``{ID: summed
      interval length}``
    * ``counter_mappable.pckl.gz`` -- triplet counts summed over all IDs

    :return: None, the results are only written to disk
    """

    # the file we just generated
    mapp_file = 'data/megabase_probability/hg19.mappable.1Mb.windows.bed.extra.gz'

    df_mapp = pd.read_csv(mapp_file,
                          sep='\t',
                          names=[
                              'chr', 'start', 'end', 'val', 'chr1', 'start1',
                              'end1', 'overlapp', 'ID', 'real_start'
                          ])

    df_mapp['len'] = df_mapp['end'] - df_mapp['start']

    counter_per_megabase = defaultdict(dict)
    counter_nucl_per_megabase = defaultdict(int)

    for mb, region in tqdm(df_mapp.groupby(by='ID')):
        # The sequences are kept in a separate Series instead of being written
        # back into the group, which would be an assignment on a slice.
        try:
            seqs = region.apply(
                lambda x: hg19(x['chr'], x['start'],
                               x['end'] - x['start'] + 2),
                axis=1)
        except Exception:
            # If the first request fails for any interval, the whole group is
            # re-read with the alternative coordinates.
            # NOTE(review): presumably this covers requests running past the
            # chromosome end -- confirm which exception hg19 raises and
            # narrow this handler further.
            seqs = region.apply(
                lambda x: hg19(x['chr'], x['start'] + 1,
                               x['end'] - x['start']),
                axis=1)

        counter_region = Counter()
        for seq in seqs.tolist():
            counter_region.update(slicing_window(seq))

        counter_per_megabase[mb] = counter_region

        # count the length too
        counter_nucl_per_megabase[mb] = np.sum(region['len'].tolist())

    # Context managers guarantee the gzip streams are flushed and closed.
    with gzip.open('data/megabase_probability/counter_1Mb.pckl.gz',
                   'wb') as fd:
        pickle.dump(dict(counter_per_megabase), fd)

    with gzip.open(
            'data/megabase_probability/mappable_counts_megabase_mutations.pckl.gz',
            'wb') as fd:
        pickle.dump(dict(counter_nucl_per_megabase), fd)

    # genome-wide totals: add up the per-window counters
    total_count = defaultdict(int)
    for d in counter_per_megabase.values():
        for triplet, c in d.items():
            total_count[triplet] += c

    with gzip.open('data/megabase_probability/counter_mappable.pckl.gz',
                   'wb') as fd:
        pickle.dump(dict(total_count), fd)
def get_mutation_sigfit(row):
    """Return the reference triplet around the mutation described by ``row``.

    Three bases starting at ``POS - 1`` are requested from ``hg19``.

    :param row: mapping with the keys 'REF', 'CHR' and 'POS'
    :return: the triplet when its middle base equals 'REF' and it contains
        no 'N'; None otherwise
    """
    expected_base = row['REF']
    chrom = row['CHR']
    position = int(row['POS'])

    context = hg19(chrom, position - 1, 3)

    matches_reference = context[1] == expected_base
    return context if (matches_reference and 'N' not in context) else None
def get_mutation_deconstructsigs(row):
    """Return ``'<triplet>_<ALT>'`` for the mutation described by ``row``.

    Three bases starting at ``POS - 1`` are requested from ``hg19``.

    :param row: mapping with the keys 'REF', 'ALT', 'CHR' and 'POS'
    :return: the triplet and the alternate allele joined by an underscore
        when the middle base of the triplet equals 'REF' and the triplet
        contains no 'N'; None otherwise
    """
    expected_base = row['REF']
    alternate = row['ALT']
    chrom = row['CHR']
    position = int(row['POS'])

    context = hg19(chrom, position - 1, 3)

    matches_reference = context[1] == expected_base
    if matches_reference and 'N' not in context:
        return '_'.join((context, alternate))
    return None
Beispiel #4
0
def get_context_rev(rw):
    """Annotate a variant row with its trinucleotide change in three forms.

    The three bases starting at ``POS - 1`` are requested from ``hg19`` and
    combined with 'ALT' into ``L[R>A]R`` style strings stored in new keys:

    * 'TRIPLE': bases as read from the reference
    * 'TRIPLE_COM': every base replaced by its complement
    * 'TRIPLE_COM_REV': complemented bases with the two flanks swapped

    Only A, C, G and T have a complement here; any other base raises
    KeyError.

    :param rw: row with the keys '#CHROM', 'POS' and 'ALT'
    :return: the same row with the three new keys set
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    position = rw['POS']
    left, ref, right = hg19(rw['#CHROM'], position - 1, size=3)
    alt = rw['ALT']

    rw['TRIPLE'] = ''.join((left, '[', ref, '>', alt, ']', right))

    # complement each base once and reuse it for both derived notations
    comp_left = complement[left]
    comp_ref = complement[ref]
    comp_alt = complement[alt]
    comp_right = complement[right]
    comp_change = '[' + comp_ref + '>' + comp_alt + ']'

    rw['TRIPLE_COM'] = comp_left + comp_change + comp_right
    rw['TRIPLE_COM_REV'] = comp_right + comp_change + comp_left

    return rw
Beispiel #5
0
def consequences_in_genes(genic_locations, df_mapp_bed, outfile_name):
    """Count protein-affecting changes per triplet in every megabase window.

    The mappable intervals are first restricted to those overlapping a gene.
    For each position of the remaining intervals the consequences provided
    by the ``BGPack('hg19', '88')`` reader are inspected. Every consequence
    found in the wanted set adds one to the counter of the key
    ``'<triplet>_<base>'``, where the base (A, C, G or T) is given by the
    index of the consequence. The result, ``{ID: {key: count}}``, is pickled
    to ``data/megabase_probability/<outfile_name>.pckl.gz``.

    :param genic_locations: dataframe with the columns 'chr',
        'Gene start (bp)' and 'Gene end (bp)'
    :param df_mapp_bed: mappable regions, an object offering
        ``intersect(...).to_dataframe(...)`` (presumably a BedTool -- confirm
        against the caller)
    :param outfile_name: name of the output pickle, without directory and
        without the '.pckl.gz' extension
    :return: None, the result is only written to disk
    """

    genic_full_regions = BedTool.from_dataframe(
        genic_locations[['chr', 'Gene start (bp)', 'Gene end (bp)']])

    # intersect mappable regions with all genic regions so that only regions
    # with overlapping genes are considered, to speed up calculations.
    all_genic_overlapp = df_mapp_bed.intersect(
        genic_full_regions, wa=True).to_dataframe(names=[
            'chr', 'start', 'end', 'val', 'chr1', 'start1', 'end1', 'overlapp',
            'ID', 'real_start'
        ])

    conseq_type = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}

    # set of consequence variants that we will consider as protein-affecting
    consequence_wanted = {
        'start_lost', 'splice_region_variant', 'splice_donor_variant',
        'stop_gained', 'stop_lost', 'missense_variant',
        'splice_acceptor_variant'
    }

    # get the consequence type of our regions
    megabase_genic = defaultdict(dict)
    with BGPack('hg19', '88') as reader:
        for mb, data in tqdm(all_genic_overlapp.groupby(by='ID')):

            # counter of each type of mutation and how many times it affects
            # the protein
            counter_feat = defaultdict(int)

            # go over each of the mappable intervals
            for _, row in data.iterrows():

                # for each of the positions within the intervals, check the
                # consequence type
                for pos, cons in reader.get(row['chr'], row['real_start'],
                                            row['end']):

                    # The triplet depends only on the position, so it is
                    # fetched at most once, and only if some consequence at
                    # this position is wanted.
                    triplet = None

                    # each consequence type (most severe one), coming from
                    # A; C; G; T
                    for ix, c in enumerate(cons):

                        # skip the consequences we are not interested in
                        if c not in consequence_wanted:
                            continue

                        if triplet is None:
                            triplet = hg19(row['chr'], pos - 1, 3)

                        key = '{}_{}'.format(triplet, conseq_type[ix])
                        counter_feat[key] += 1

            # add the dict to each megabase
            megabase_genic[mb] = counter_feat

    # The context manager guarantees the gzip stream is flushed and closed.
    with gzip.open(
            'data/megabase_probability/{}.pckl.gz'.format(outfile_name),
            'wb') as fd:
        pickle.dump(dict(megabase_genic), fd)
Beispiel #6
0
def separate(rw):
    """
    Recover chromosome, position and alleles of one variant.

    When '#Uploaded_variation' contains an underscore it is read as
    ``<chrom>_<pos>_<ref>/<alt>``; 'POS' stays a string in this case and the
    ``<ref>/<alt>`` part is also stored under 'Change'. Otherwise the values
    come from 'Location' (``<chrom>:<pos>`` or ``<chrom>:<pos>-<end>``, 'POS'
    becomes an int holding the first coordinate), 'REF' is the single base
    that ``hg19`` returns for that position and 'ALT' is copied from
    'Allele'.

    :param rw: row of the dataframe (one variant)
    :return: the same row with '#CHROM', 'POS', 'REF' and 'ALT' filled in
    """
    uploaded = rw['#Uploaded_variation']

    if "_" not in uploaded:
        location = rw['Location']
        rw['#CHROM'] = str(location.split(":")[0])
        rw['POS'] = location.split(":")[1]
        # without a dash the split returns the whole string, so one
        # expression covers both the single position and the range
        rw['POS'] = int(rw['POS'].split("-")[0])
        rw['REF'] = hg19(str(rw['#CHROM']), rw['POS'], 1)
        rw['ALT'] = rw['Allele']
        return rw

    fields = uploaded.split("_")
    rw['#CHROM'] = fields[0]
    rw['POS'] = fields[1]
    rw['Change'] = fields[2]

    alleles = rw['Change'].split("/")
    rw['REF'] = alleles[0]
    rw['ALT'] = alleles[1]
    return rw
def format_hartwig(mutation_file, cnvs_file, purity_file, outfile):
    """Build the per-mutation table of one sample and save it as gzipped TSV.

    Steps: load the inputs with ``load_files``, add read counts
    (``get_reads``), split the variants into SNVs and indels and classify
    them, attach the triplet and 5-mer reference context from ``hg19``, keep
    the mutations returned by ``get_mappable_regions``, intersect them with
    the copy number segments and derive the copy number columns. Rows with
    missing values are dropped before writing.

    :param mutation_file: mutations input, forwarded to ``load_files``
    :param cnvs_file: copy number input, forwarded to ``load_files``
    :param purity_file: purity input, forwarded to ``load_files``
    :param outfile: path of the tab separated, gzip compressed output
    :return: None
    """
    try:
        # load files and preformat them
        df, cnv_bed, purity_score, gender = load_files(mutation_file,
                                                       cnvs_file, purity_file)

        # this is the sample column
        lastcol = list(df.columns)[-1]

        # get total reads
        df_reads = df.apply(get_reads, axis=1, args=([lastcol]))

        # select whether we have SNVs or others
        df_reads['len_alt'] = df_reads['ALT'].str.len()

        # number of characters in ref
        df_reads['len_ref'] = df_reads['REF'].str.len()

        # first classification between SNV and others
        df_reads['TYPE'] = df_reads.apply(lambda x: 'SNV' if (
            (x['len_alt'] == 1) and (x['len_ref'] == 1) and
            (x['ALT'] != '-') and (x['REF'] != '-')) else 'INDEL',
                                          axis=1)

        df_reads['pos-1'] = df_reads['POS'] - 1

        # get the triplet
        df_reads['TRIPLET'] = df_reads.apply(
            lambda x: hg19(x['CHROM'], x['pos-1'], 3), axis=1)
        df_reads['EXTENDED'] = df_reads.apply(
            lambda x: hg19(x['CHROM'],
                           int(x['POS']) - 2, 5), axis=1)

        # Explicit copies: new columns are added below, and assigning to a
        # filtered slice triggers pandas' SettingWithCopyWarning.
        snv_df = df_reads[df_reads['TYPE'] != 'INDEL'].copy()
        snv_df['CLASS'] = 'SNV'
        snv_df['VARIANT_CLASS'] = snv_df.apply(create_snv_class, axis=1)

        # classify indels
        indel_df = df_reads[df_reads['TYPE'] == 'INDEL'].copy()
        indels = indels_classification(indel_df)
        columns = indels.columns

        # keep only the columns produced by the indel classification
        df_reads_merged = pd.concat([snv_df, indels], sort=True)
        df_reads_merged = df_reads_merged[columns]

        # assign the name of the sample
        df_reads_merged['sample'] = lastcol

        # create bed file
        mut_bed = BedTool.from_dataframe(df_reads_merged[[
            'CHROM', 'pos-1', 'POS', 'ref_reads', 'var_reads', 'VAF',
            'total_reads', 'REF', 'ALT', 'sample', 'TYPE', 'CLASS',
            'VARIANT_CLASS', 'TRIPLET', 'EXTENDED'
        ]])

        # Remove unmappable regions
        mapped = get_mappable_regions(mut_bed)

        # intersect with CN data
        out = mapped.intersect(cnv_bed, wao=True)

        # merge to dataframe
        merge = out.to_dataframe(names=[
            'CHROM', 'POS-1', 'POS', 'REF_COUNTS', 'VAR_COUNTS', 'VAF',
            'TOTAL_READS', 'REF', 'ALT', 'SAMPLE', 'TYPE', 'CLASS',
            'VARIANT_CLASS', 'TRIPLET', 'EXTENDED', 'c1', 'p1', 'p2',
            'MAJOR_CN_TEMP', 'actual_Baf', 'overlapp'
        ])

        # get the normal copy number values
        sex_chrom = ('Y', 'X')

        # get normal CN in the chromosome: one copy for X/Y in males, two
        # copies everywhere else
        merge['NORMAL_CN'] = merge['CHROM'].apply(
            lambda x: 1 if x in sex_chrom and gender == "MALE" else 2)

        # add the purity score we got from PURPLE
        merge['PURITY'] = purity_score
        merge['GENDER'] = gender

        # get number of CNAs, if no overlap then get the normal count
        merge['TOTAL_CN'] = merge.apply(get_major_cn, axis=1)

        # formula of allele specific copy number according to hartwig's people
        merge['MAJOR_CN'] = round(merge['actual_Baf'] *
                                  merge['TOTAL_CN']).astype(int)
        merge['MINOR_CN'] = round(
            (1 - merge['actual_Baf']) * merge['TOTAL_CN']).astype(int)

        merge['CHROM'] = merge['CHROM'].apply(lambda x: 'chr{}'.format(x))

        # save files
        merge.dropna()[[
            'CHROM', 'POS', 'REF', 'ALT', 'TRIPLET', 'EXTENDED', 'CLASS',
            'VARIANT_CLASS', 'SAMPLE', 'MAJOR_CN', 'MINOR_CN', 'TOTAL_CN',
            'NORMAL_CN', 'VAR_COUNTS', 'REF_COUNTS', 'GENDER', 'PURITY'
        ]].to_csv(outfile,
                  sep='\t',
                  index=False,
                  header=True,
                  compression='gzip')
    finally:
        # clean BedTools temp files, also when one of the steps above fails
        pybedtools.cleanup()