def counter_triplets_MB():
    """Count sequence windows in the mappable intervals of every megabase.

    Reads the mappable-window table
    ``data/megabase_probability/hg19.mappable.1Mb.windows.bed.extra.gz``,
    groups its intervals by the ``ID`` column, fetches the hg19 sequence of
    each interval and counts whatever ``slicing_window`` yields for it
    (presumably trinucleotides -- confirm against ``slicing_window``).

    Three gzip-compressed pickles are written under
    ``data/megabase_probability/``:

    * ``counter_1Mb.pckl.gz`` -- ``{ID: Counter}`` of window counts.
    * ``mappable_counts_megabase_mutations.pckl.gz`` -- ``{ID: summed
      interval length}``.
    * ``counter_mappable.pckl.gz`` -- window counts summed over all IDs.

    :return: None
    """
    # the file we just generated
    mapp_file = 'data/megabase_probability/hg19.mappable.1Mb.windows.bed.extra.gz'
    df_mapp = pd.read_csv(
        mapp_file,
        sep='\t',
        names=[
            'chr', 'start', 'end', 'val', 'chr1', 'start1', 'end1',
            'overlapp', 'ID', 'real_start'
        ])
    df_mapp['len'] = df_mapp['end'] - df_mapp['start']

    counter_per_megabase = {}
    counter_nucl_per_megabase = {}

    for mb, region in tqdm(df_mapp.groupby(by='ID')):
        # Sequences are kept in a separate Series instead of being written
        # back onto the group, which is a slice of df_mapp.
        try:
            # request two extra bases beyond the interval length
            seqs = region.apply(
                lambda x: hg19(x['chr'], x['start'], x['end'] - x['start'] + 2),
                axis=1)
        except Exception:
            # NOTE(review): fallback to the unpadded interval when the padded
            # request fails (presumably it runs past the available
            # sequence) -- confirm which error hg19 raises and narrow this.
            seqs = region.apply(
                lambda x: hg19(x['chr'], x['start'] + 1, x['end'] - x['start']),
                axis=1)

        counter_region = Counter()
        for seq in seqs:
            counter_region.update(slicing_window(seq))
        counter_per_megabase[mb] = counter_region

        # count the length too
        counter_nucl_per_megabase[mb] = region['len'].sum()

    # Context managers make sure each gzip stream is flushed and closed, so
    # the archives are complete on disk when the function returns.
    with gzip.open('data/megabase_probability/counter_1Mb.pckl.gz', 'wb') as fd:
        pickle.dump(dict(counter_per_megabase), fd)

    with gzip.open(
            'data/megabase_probability/mappable_counts_megabase_mutations.pckl.gz',
            'wb') as fd:
        pickle.dump(dict(counter_nucl_per_megabase), fd)

    # genome-wide totals: add up the per-megabase counters
    total_count = Counter()
    for counts in counter_per_megabase.values():
        total_count.update(counts)

    with gzip.open('data/megabase_probability/counter_mappable.pckl.gz', 'wb') as fd:
        pickle.dump(dict(total_count), fd)
def get_mutation_sigfit(row):
    """Return the hg19 triplet centred on a mutation, or None if unusable.

    :param row: mapping/row providing 'REF', 'CHR' and 'POS'
    :return: the 3-character string fetched with ``hg19(CHR, POS - 1, 3)``
        when its middle character equals 'REF' and it contains no 'N';
        otherwise None
    """
    reference = row['REF']
    chrom = row['CHR']
    position = int(row['POS'])

    context = hg19(chrom, position - 1, 3)

    # reject reference mismatches and contexts containing unknown bases
    if context[1] != reference or 'N' in context:
        return None
    return context
def get_mutation_deconstructsigs(row):
    """Return '<triplet>_<ALT>' for a mutation, or None if unusable.

    :param row: mapping/row providing 'REF', 'ALT', 'CHR' and 'POS'
    :return: the triplet fetched with ``hg19(CHR, POS - 1, 3)`` joined to
        'ALT' with an underscore when the triplet's middle character equals
        'REF' and it contains no 'N'; otherwise None
    """
    reference = row['REF']
    alternate = row['ALT']
    chrom = row['CHR']
    position = int(row['POS'])

    context = hg19(chrom, position - 1, 3)

    # reject reference mismatches and contexts containing unknown bases
    if context[1] != reference or 'N' in context:
        return None
    return '_'.join((context, alternate))
def get_context_rev(rw):
    """Annotate a variant row with its triplet change in three orientations.

    Adds three entries to ``rw``:

    * 'TRIPLE'         -- ``L[REF>ALT]R`` as read from hg19
    * 'TRIPLE_COM'     -- every base complemented, same left/right order
    * 'TRIPLE_COM_REV' -- every base complemented, flanks swapped

    :param rw: row providing '#CHROM', 'POS' (numeric) and 'ALT'
    :return: the same row with the three new entries
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    position = rw['POS']
    # the 3-character context unpacks into flank / reference / flank
    five_prime, ref_base, three_prime = hg19(rw['#CHROM'], position - 1, size=3)
    alt_base = rw['ALT']

    rw['TRIPLE'] = ''.join(
        (five_prime, '[', ref_base, '>', alt_base, ']', three_prime))

    # only A/C/G/T are in the table; any other base raises KeyError
    comp_left = complement[five_prime]
    comp_ref = complement[ref_base]
    comp_alt = complement[alt_base]
    comp_right = complement[three_prime]
    comp_change = ''.join(('[', comp_ref, '>', comp_alt, ']'))

    rw['TRIPLE_COM'] = comp_left + comp_change + comp_right
    rw['TRIPLE_COM_REV'] = comp_right + comp_change + comp_left
    return rw
def consequences_in_genes(genic_locations, df_mapp_bed, outfile_name):
    """Count protein-affecting changes per triplet/alt in each megabase.

    The mappable intervals are restricted to those overlapping a gene. For
    every position of those intervals the consequence entries returned by the
    ``BGPack('hg19', '88')`` reader are inspected, and each entry belonging to
    ``consequence_wanted`` increments the key ``'<triplet>_<alt>'`` for the
    interval's ``ID``. The result ``{ID: {key: count}}`` is pickled to
    ``data/megabase_probability/<outfile_name>.pckl.gz``.

    :param genic_locations: dataframe with columns 'chr', 'Gene start (bp)'
        and 'Gene end (bp)'
    :param df_mapp_bed: BedTool of mappable intervals; after intersection its
        records are read with the ten column names listed below
    :param outfile_name: file stem of the output pickle
    :return: None
    """
    genic_full_regions = BedTool.from_dataframe(
        genic_locations[['chr', 'Gene start (bp)', 'Gene end (bp)']])

    # intersect mappable regions with all genic regions so that only regions
    # with overlapping genes are considered, to speed up calculations.
    all_genic_overlapp = df_mapp_bed.intersect(
        genic_full_regions, wa=True).to_dataframe(names=[
            'chr', 'start', 'end', 'val', 'chr1', 'start1', 'end1',
            'overlapp', 'ID', 'real_start'
        ])

    # index within a position's consequence entries -> alternate nucleotide
    conseq_type = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}

    # set of consequence variants that we will consider as protein-affecting
    consequence_wanted = {
        'start_lost', 'splice_region_variant', 'splice_donor_variant',
        'stop_gained', 'stop_lost', 'missense_variant',
        'splice_acceptor_variant'
    }

    # get the consequence type of our regions
    megabase_genic = {}
    with BGPack('hg19', '88') as reader:
        for mb, data in tqdm(all_genic_overlapp.groupby(by='ID')):
            # counter of each type of mutation and how many times it affects
            # the protein
            counter_feat = defaultdict(int)

            # go over each of the mappable intervals
            for _, row in data.iterrows():
                # for each position within the interval, check the consequences
                for pos, cons in reader.get(row['chr'], row['real_start'], row['end']):
                    # The triplet depends only on the position, so it is
                    # fetched at most once, and only if a wanted consequence
                    # shows up.
                    triplet = None
                    # each consequence type (most severe one), coming from
                    # A; C; G; T
                    for ix, c in enumerate(cons):
                        if c not in consequence_wanted:
                            continue
                        if triplet is None:
                            triplet = hg19(row['chr'], pos - 1, 3)
                        key = '{}_{}'.format(triplet, conseq_type[ix])
                        counter_feat[key] += 1

            # add the dict to each megabase
            megabase_genic[mb] = counter_feat

    # Context manager so the gzip stream is flushed and closed before return.
    out_path = 'data/megabase_probability/{}.pckl.gz'.format(outfile_name)
    with gzip.open(out_path, 'wb') as fd:
        pickle.dump(dict(megabase_genic), fd)
def separate(rw):
    """
    Separate information from id variant column #Uploaded_variation

    When '#Uploaded_variation' contains underscores it is read as
    ``<chrom>_<pos>_<ref>/<alt>`` and the row additionally gets a 'Change'
    entry holding ``<ref>/<alt>``. Otherwise the chromosome and position come
    from 'Location' (``<chrom>:<pos>`` or ``<chrom>:<start>-<end>``, using the
    start), 'REF' is looked up in hg19 and 'ALT' is taken from 'Allele'.

    'POS' is stored as an ``int`` in both cases.

    :param rw: row of the dataframe (one variant)
    :return: row with recovered information in new columns
    :raises ValueError: if the position field is not an integer
    """
    uploaded = rw['#Uploaded_variation']

    if "_" in uploaded:
        fields = uploaded.split("_")
        change = fields[2]
        alleles = change.split("/")

        rw['#CHROM'] = fields[0]
        # int, to agree with the Location branch below
        rw['POS'] = int(fields[1])
        rw['Change'] = change
        rw['REF'] = alleles[0]
        rw['ALT'] = alleles[1]
    else:
        location = rw['Location'].split(":")

        rw['#CHROM'] = str(location[0])
        # for a "start-end" range keep the start; a plain position has no
        # "-" and is returned unchanged by the split
        rw['POS'] = int(location[1].split("-")[0])
        rw['REF'] = hg19(str(rw['#CHROM']), rw['POS'], 1)
        rw['ALT'] = rw['Allele']

    return rw
def format_hartwig(mutation_file, cnvs_file, purity_file, outfile):
    """Build the per-mutation table of one Hartwig sample and write it out.

    Steps: load the inputs with ``load_files``; add read counts with
    ``get_reads``; label every variant as SNV or INDEL; attach the 3-base and
    5-base hg19 context; classify SNVs (``create_snv_class``) and indels
    (``indels_classification``); keep mutations returned by
    ``get_mappable_regions``; intersect them with the copy-number segments;
    derive copy-number columns; write a gzip-compressed, tab-separated file.

    :param mutation_file: mutation input forwarded to ``load_files``
    :param cnvs_file: copy-number input forwarded to ``load_files``
    :param purity_file: purity input forwarded to ``load_files``
    :param outfile: path of the gzip-compressed TSV to write
    :return: None
    """
    try:
        # load files and preformat them
        df, cnv_bed, purity_score, gender = load_files(
            mutation_file, cnvs_file, purity_file)

        # this is the sample column
        lastcol = list(df.columns)[-1]

        # get total reads
        df_reads = df.apply(get_reads, axis=1, args=(lastcol,))

        # number of characters in alt and ref
        df_reads['len_alt'] = df_reads['ALT'].str.len()
        df_reads['len_ref'] = df_reads['REF'].str.len()

        # first classification between SNV and others: one character on each
        # side, and neither of them the '-' placeholder
        df_reads['TYPE'] = df_reads.apply(
            lambda x: 'SNV' if (
                (x['len_alt'] == 1) and (x['len_ref'] == 1)
                and (x['ALT'] != '-') and (x['REF'] != '-')) else 'INDEL',
            axis=1)

        df_reads['pos-1'] = df_reads['POS'] - 1

        # get the triplet and the 5-base context around the position
        df_reads['TRIPLET'] = df_reads.apply(
            lambda x: hg19(x['CHROM'], x['pos-1'], 3), axis=1)
        df_reads['EXTENDED'] = df_reads.apply(
            lambda x: hg19(x['CHROM'], int(x['POS']) - 2, 5), axis=1)

        # Explicit copies: new columns are added below, and assigning onto a
        # filtered slice of df_reads triggers SettingWithCopyWarning.
        snv_df = df_reads[df_reads['TYPE'] != 'INDEL'].copy()
        snv_df['CLASS'] = 'SNV'
        snv_df['VARIANT_CLASS'] = snv_df.apply(create_snv_class, axis=1)

        # classify indels
        indel_df = df_reads[df_reads['TYPE'] == 'INDEL'].copy()
        indels = indels_classification(indel_df)

        # keep the column set (and order) produced by the indel classifier
        columns = indels.columns
        df_reads_merged = pd.concat([snv_df, indels], sort=True)
        df_reads_merged = df_reads_merged[columns]

        # assign the name of the sample
        df_reads_merged['sample'] = lastcol

        # create bed file
        mut_bed = BedTool.from_dataframe(df_reads_merged[[
            'CHROM', 'pos-1', 'POS', 'ref_reads', 'var_reads', 'VAF',
            'total_reads', 'REF', 'ALT', 'sample', 'TYPE', 'CLASS',
            'VARIANT_CLASS', 'TRIPLET', 'EXTENDED'
        ]])

        # Remove unmappable regions
        mapped = get_mappable_regions(mut_bed)

        # intersect with CN data
        out = mapped.intersect(cnv_bed, wao=True)

        # merge to dataframe
        merged = out.to_dataframe(names=[
            'CHROM', 'POS-1', 'POS', 'REF_COUNTS', 'VAR_COUNTS', 'VAF',
            'TOTAL_READS', 'REF', 'ALT', 'SAMPLE', 'TYPE', 'CLASS',
            'VARIANT_CLASS', 'TRIPLET', 'EXTENDED', 'c1', 'p1', 'p2',
            'MAJOR_CN_TEMP', 'actual_Baf', 'overlapp'
        ])

        # get the normal copy number values: 1 on X/Y when the sample is
        # male, 2 everywhere else
        sex_chrom = ('Y', 'X')
        merged['NORMAL_CN'] = merged['CHROM'].apply(
            lambda x: 1 if x in sex_chrom and gender == "MALE" else 2)

        # add the purity score we got from PURPLE
        merged['PURITY'] = purity_score
        merged['GENDER'] = gender

        # get number of CNAs, if no overlapp then get the normal count
        merged['TOTAL_CN'] = merged.apply(get_major_cn, axis=1)

        # formula of allele specific copy number according to hartwig's people
        merged['MAJOR_CN'] = (
            merged['actual_Baf'] * merged['TOTAL_CN']).round().astype(int)
        merged['MINOR_CN'] = (
            (1 - merged['actual_Baf']) * merged['TOTAL_CN']).round().astype(int)

        merged['CHROM'] = merged['CHROM'].apply(lambda x: 'chr{}'.format(x))

        # save files (rows with any missing value are dropped first)
        merged.dropna()[[
            'CHROM', 'POS', 'REF', 'ALT', 'TRIPLET', 'EXTENDED', 'CLASS',
            'VARIANT_CLASS', 'SAMPLE', 'MAJOR_CN', 'MINOR_CN', 'TOTAL_CN',
            'NORMAL_CN', 'VAR_COUNTS', 'REF_COUNTS', 'GENDER', 'PURITY'
        ]].to_csv(
            outfile, sep='\t', index=False, header=True, compression='gzip')
    finally:
        # clean BedTools temp files, also when a step above fails
        pybedtools.cleanup()