def target_pileup_from_mut(mut_file, base_file, bam, chrom):
    '''
    piles up the mutation list in the tumor bam
    '''

    # bed file can contain all chromosomes because chrom restriction comes with the -r parameter
    bed_file = f"{base_file}.bed"
    # create the bed file for mpileup
    shell(f"{csv2bed} < {mut_file} > {bed_file}")

    # # if I want to restrict chromosome in file:
    # mut_chr_file = f"{base_file}.csv"
    # mut_df.to_csv(mut_chr_file, sep='\t', index=False)
    # # create the bed file for mpileup from the mutation file
    # shell(f"{csv2bed} < {mut_chr_file} > {bed_file}")

    # do the pileup into the matrix file
    matrix_file = f"{base_file}.matrix"
    pileup_cmd = f"samtools mpileup -B -q {EBparams['MAPQ']} -Q {EBparams['Q']}"
    pileup_cmd += f" -l {bed_file} -r {chrom} {bam}"
    # cut -f 1,2,5 keeps chrom, pos and the read bases of the pileup
    pipe_cmd = f"{pileup_cmd} | cut -f 1,2,5 | {cleanpileup} | {pile2count} > {matrix_file}"
    show_output(f"Piling up tumor bam {bam}", color='normal')

    # do the pileup to matrix_file
    show_command(pipe_cmd, multi=False)
    shell(pipe_cmd)

    # cleanup
    shell(f"rm -f {bed_file}")
    show_output(
        f"Pileup matrix for chrom {chrom} of {bam} completed. Merging with cache file...",
        color='normal'
    )
    return matrix_file
def main(s):
    """
    wrapped into function lest module be called by each subprocess
    """

    c = s.config
    w = s.wildcards
    i = s.input
    o = s.output
    cc = c["ascat"]

    show_output(
        f"Collecting chromosomal position data for {w.sample}_{w.type}",
        time=True,
    )
    pos_dfs = []
    for pos_file in i:
        pos_df = pd.read_csv(
            pos_file,
            sep="\t",
            header=None,
            names=["chrom", "pos", "ref", "alt"],
        )
        pos_dfs.append(pos_df)
    allpos_df = pd.concat(pos_dfs)
    allpos_df.to_csv(o.pos, sep="\t", index=False, compression="gzip")
    show_output(
        f"Finished combining data for {w.sample}_{w.type}. Written to {o.pos}",
        color="success",
    )
def main(s):
    """
    wrapped into function lest module be called by each subprocess
    """

    c = s.config
    w = s.wildcards
    i = s.input
    o = s.output
    cc = c["ascat"]

    show_output(
        f"Combining {i.tumor} and {i.normal}",
        time=True,
    )
    # get tumor and normal posfile into df
    tumor_df = pd.read_csv(i.tumor, sep="\t")
    normal_df = pd.read_csv(i.normal, sep="\t")
    merged_df = tumor_df.merge(normal_df, on=["chrom", "pos"], suffixes=("_T", "_N"))
    merged_df["pos"] = merged_df["pos"].astype(int)
    merged_df.to_csv(o.pos, sep="\t", index=False)
    show_output(
        f"Finished combining data for {w.sample}_{w.type}. Written to {o.pos}",
        color="success",
    )
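# Toy illustration of the suffix merge above (all values invented): shared
# columns that are not merge keys receive the _T/_N suffixes.
#
# import pandas as pd
# t = pd.DataFrame({"chrom": ["1"], "pos": [100], "ref": ["A"], "alt": ["G"]})
# n = pd.DataFrame({"chrom": ["1"], "pos": [100], "ref": ["A"], "alt": ["C"]})
# print(t.merge(n, on=["chrom", "pos"], suffixes=("_T", "_N")).columns.tolist())
# --> ['chrom', 'pos', 'ref_T', 'alt_T', 'ref_N', 'alt_N']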
def computeEBcache(mat_df, pen):
    show_output(f"Computing EBcache for {len(mat_df.index)} lines", time=True, multi=True, color='process')
    cache_df = matrix2AB(mat_df, pen)
    show_output("Finished!", time=True, multi=True, color='success')
    return cache_df
def compute_AB2EB(df):
    '''
    per row of df, takes a target depth-ponAB matrix and computes the EBscore
    '''

    show_output(f"Computing EBscore for {len(df.index)} lines", multi=True)
    df['EBscore'] = df.apply(AB2EBscore, axis=1)
    show_output("Finished!", multi=True)
    return df
def compute_matrix2EB(df, fit_pen):
    '''
    per df row, computes the EBscore from full depth matrix
    first row: target depth
    next rows: pon depth
    '''

    show_output(f"Computing EBscore for {len(df.index)} lines", multi=True)
    df['EBscore'] = df.apply(partial(matrix2EBscore, fit_pen), axis=1)
    show_output("Finished!", multi=True, time=True)
    return df
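# Minimal, self-contained sketch of the chunking pattern that the
# compute_*_multi wrappers (not shown in this excerpt) presumably use to
# drive the row-wise workers above; _toy_worker is a stand-in, not
# matrix2EBscore or AB2EBscore.
import numpy as np
import pandas as pd
from functools import partial
from multiprocessing import Pool


def _toy_worker(fit_pen, df):
    # placeholder for compute_matrix2EB(df, fit_pen)
    df['EBscore'] = df['depth'] * fit_pen
    return df


def compute_multi_sketch(df, fit_pen, threads):
    chunks = np.array_split(df, threads)            # one chunk per process
    with Pool(threads) as pool:
        dfs = pool.map(partial(_toy_worker, fit_pen), chunks)
    return pd.concat(dfs).sort_index()              # restore original row order


if __name__ == "__main__":
    demo = pd.DataFrame({'depth': range(8)})
    print(compute_multi_sketch(demo, 0.5, 2))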
def get_primer_df(mut_df, primer3_config, chroms_folder, chrom):
    """
    allocates dfs chrom-wise and controls row-wise computation
    """

    show_output(f"Running primer3 for chrom {chrom}.", multi=True)
    chrom_dict = get_chrom(chrom, chroms_folder)
    chr_df = mut_df.query("Chr == @chrom")
    # extra keyword args to apply are passed through to compute_primers
    primer_df = chr_df.apply(compute_primers, chrom=chrom_dict, config=primer3_config, axis=1)
    show_output(f"Finished chrom {chrom}.", multi=True)
    return primer_df
def main(s):
    input = s.input
    output = s.output
    threads = s.threads

    extension = os.path.splitext(input[0])[1]
    if extension == '.fastq':
        # compress fastq as fastq.gz into workdir (-c keeps pigz writing to stdout)
        shell(f"pigz -5 -p {threads} -c {input} > {output}")
    elif extension == '.gz':
        show_output(f"Creating symlink for file {input}")
        # create shortcut to fastq.gz in workdir/fastq
        shell(f"ln -s {input} {output}")
    elif extension == '.bz2':
        show_output(f"file extension {extension} --> unzipping with bzcat")
        # uncompress fastq.bz2 and recompress to fastq.gz in workdir/fastq
        shell(f"bzcat {input} | pigz -5 -p {threads} > {output}")
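# Note on the dispatch above: os.path.splitext only strips the final suffix,
# which is why plain '.gz' (and not '.fastq.gz') is matched:
#
# import os
# print(os.path.splitext('sample.fastq.gz')[1])   # '.gz'
# print(os.path.splitext('sample.fastq')[1])      # '.fastq'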
def primer3_master(
    i,
    o,
    chroms_folder,
    threads,
    PCR_config=PCR_config,
    primer3_config=p3_improved_config,
):
    """
    wrapper around run_primer3 that allows injecting adjusted PCR and primer3 configs
    and controls input and output
    """

    # #### LOAD file ###################
    show_output(f"Loading {i} for primer3 computation. ", end="")
    filter1_df = pd.read_csv(i, sep="\t")
    show_output(f"{len(filter1_df.index)} mutations found.", time=False)

    # #### run primer3 #################
    primer_df = run_primer3(
        filter1_df,
        chroms_folder,
        pcr_config=PCR_config,
        primer3_config=primer3_config,
        threads=threads,
    )

    # ###### write to file #############
    primer_df.to_csv(o, sep="\t", index=False)
    show_output(f"Writing primer list to {o}.")
def run_eb(table, tumor_bam, output, pon_list, chrom, log, threads,
           EBparams, full_output,
           cleanpileup, csv2bed, pon2cols, pile2count,
           matrix2EBinput, makeponlist):
    '''
    master function to start eb_computation
    '''

    # ############## LOAD DATA ###############################
    show_output(f"Computing EBscore for chrom {chrom}", color='normal')

    # get the skeleton mutation file for that chromosome
    mut_df = pd.read_csv(table, sep='\t', index_col=False, header=None, names=[
        'Chr', 'Start', 'End', 'Ref', 'Alt', 'somatic_status',
        'TR1', 'TR1+', 'TR2', 'TR2+', 'NR1', 'NR1+', 'NR2', 'NR2+',
        'somaticP', 'variantP']).query('Chr == @chrom').iloc[:, :5]
    mut_cols = list(mut_df.columns)

    # set base_name for intermediate files
    base_file = output[0].replace(".EB", "")

    # ############## PILEUP --> MATRIX FILE ##################
    # bed file can contain all chromosomes because chrom restriction comes with the -r parameter
    bed_file = f"{base_file}.bed"
    # create the bed file for mpileup
    shell(f"{csv2bed} < {table} > {bed_file}")

    # # if I want to restrict chromosome in file:
    # mut_chr_file = f"{base_file}.csv"
    # mut_df.to_csv(mut_chr_file, sep='\t', index=False)
    # # create the bed file for mpileup from the mutation file
    # shell(f"{csv2bed} < {mut_chr_file} > {bed_file}")

    # create the pon_list containing the tumor-bam as first file
    sample_list = f"{base_file}.pon"
    # makeponlist removes the sample itself from the list if it is part of the PoN
    shell(f"{makeponlist} {tumor_bam} {pon_list} {sample_list}")

    show_output(f"Piling up {chrom} of {tumor_bam} with PoN list.", color='normal')
    shell(f"cat {sample_list}")

    # do the pileup into the matrix file
    matrix_file = f"{base_file}.matrix"
    pileup_cmd = f"samtools mpileup -B -q {EBparams['MAPQ']} -Q {EBparams['Q']}"
    pileup_cmd += f" -l {bed_file} -r {chrom} -b {sample_list}"
    # cut -f $({pon2cols} < {sample_list}) creates a cut command only including the desired columns
    pipe_cmd = f"{pileup_cmd} | cut -f $({pon2cols} < {sample_list}) | {cleanpileup} | {pile2count} > {matrix_file}"

    # do the pileup to matrix_file
    show_command(pipe_cmd, multi=False)
    shell(pipe_cmd)
    # cleanup
    shell(f"rm {bed_file} {sample_list}")

    # check if matrix_file has input
    if not os.path.getsize(matrix_file):
        # create empty file
        open(output[0], 'a').close()
        show_output(f"Pileup for {chrom} of {tumor_bam} was empty! Created empty file {output[0]}", color='warning')
    else:
        show_output(f"Pileup matrix for chrom {chrom} of {tumor_bam} completed.", color='normal')

        # ################ MERGE INTO MUTFILE ######################
        # the pileup reports deletions at the preceding anchor base,
        # so shift deletion positions before the merge and shift them back afterwards
        mut_df.loc[mut_df['Alt'] == "-", 'Start'] = mut_df['Start'] - 1
        # read matrix file into df
        matrix_df = pd.read_csv(matrix_file, sep='\t', index_col=False)
        # merge
        mut_matrix = mut_df.merge(matrix_df, on=['Chr', 'Start'], how='inner')
        # reset deletion positions
        mut_matrix.loc[mut_matrix['Alt'] == "-", 'Start'] = mut_matrix['Start'] + 1

        # ####### if using matrix2EBinput.mawk #######################
        # write to file
        mutmatrix_file = f"{base_file}.mutmatrix"
        mut_matrix.to_csv(mutmatrix_file, sep='\t', index=False)

        # convert mutmatrix to direct EBinput
        EB_matrix_input_file = f"{base_file}.EB.matrix"
        shell(f"cat {mutmatrix_file} | {matrix2EBinput} > {EB_matrix_input_file}")

        # load in the EB.matrix file
        eb_matrix = pd.read_csv(EB_matrix_input_file, sep='\t')

        # multithreaded computation
        EB_df = compute_matrix2EB_multi(eb_matrix, EBparams['fitting_penalty'], threads)

        # add EBscore to columns
        mut_cols.append('EBscore')

        # get the pon_matrix containing the PoN coverages in Alt and Ref
        pon_matrix = get_pon_bases(eb_matrix)
        # transfer PoN-Ref and PoN-Alt to EB_df
        EB_df[['PoN-Ref', 'PoN-Alt']] = pon_matrix[['PoN-Ref', 'PoN-Alt']]
        mut_cols += ['PoN-Ref', 'PoN-Alt']

        # ###### add the full output ##########
        if full_output:
            # condense base info
            print('full_output')
            base_cols = list("AaGgCcTtIiDd")
            col_name = "|".join(base_cols)
            # convert base coverage to str
            for ch in base_cols:
                # take the letter info from the mut_matrix which is not yet condensed
                # str.replace removes the tumor bases
                EB_df[ch] = mut_matrix[ch].map(str).str.replace(r'^[0-9]+\|', "")
            # condense base info into col "A|a|G|g|C|c|T|t|I|i|D|d"
            EB_df[col_name] = EB_df[base_cols].apply(lambda row: "-".join(row), axis=1)
            # add "A|a|G|g|C|c|T|t|I|i|D|d" to columns
            mut_cols.append(col_name)

        # rm unnecessary columns
        EB_df = EB_df[mut_cols]

        # ######### WRITE TO FILE ##############################################
        EB_file = output[0]
        EB_df.to_csv(EB_file, sep='\t', index=False)

        # cleanup
        shell(f"rm {matrix_file} {EB_matrix_input_file}")  # {mutmatrix_file}
        show_output(f"Created EBscore for chrom {chrom} of {tumor_bam} and written to {output[0]}", color='success')
def run_eb_from_cache(table, tumor_bam, output, pon_list, chrom, log, threads,
                      EBparams, full_output,
                      AB_cache_file, matrix_cache_file,  # used below; missing from the original signature
                      cleanpileup, csv2bed, pile2count,
                      matrix2EBinput, reorder_matrix):

    def target_pileup_from_mut(mut_file, base_file, bam, chrom):
        '''
        piles up the mutation list in the tumor bam
        '''

        # bed file can contain all chromosomes because chrom restriction comes with the -r parameter
        bed_file = f"{base_file}.bed"
        # create the bed file for mpileup
        shell(f"{csv2bed} < {mut_file} > {bed_file}")

        # # if I want to restrict chromosome in file:
        # mut_chr_file = f"{base_file}.csv"
        # mut_df.to_csv(mut_chr_file, sep='\t', index=False)
        # # create the bed file for mpileup from the mutation file
        # shell(f"{csv2bed} < {mut_chr_file} > {bed_file}")

        # do the pileup into the matrix file
        matrix_file = f"{base_file}.matrix"
        pileup_cmd = f"samtools mpileup -B -q {EBparams['MAPQ']} -Q {EBparams['Q']}"
        pileup_cmd += f" -l {bed_file} -r {chrom} {bam}"
        pipe_cmd = f"{pileup_cmd} | cut -f 1,2,5 | {cleanpileup} | {pile2count} > {matrix_file}"
        show_output(f"Piling up tumor bam {bam}", color='normal')

        # do the pileup to matrix_file
        show_command(pipe_cmd, multi=False)
        shell(pipe_cmd)

        # cleanup
        shell(f"rm -f {bed_file}")
        show_output(
            f"Pileup matrix for chrom {chrom} of {bam} completed. Merging with cache file...",
            color='normal'
        )
        return matrix_file

    # ############## LOAD DATA ###############################
    show_output(f"Computing EBscore for chrom {chrom} of {tumor_bam} using EBcache {AB_cache_file}", color='normal')

    # get the mutation file for the chromosome
    mut_df = pd.read_csv(table, sep='\t', index_col=False, header=None, names=[
        'Chr', 'Start', 'End', 'Ref', 'Alt', 'somatic_status',
        'TR1', 'TR1+', 'TR2', 'TR2+', 'NR1', 'NR1+', 'NR2', 'NR2+',
        'somaticP', 'variantP']).query('Chr == @chrom').iloc[:, :5]
    mut_cols = list(mut_df.columns)

    # check for empty df
    if mut_df.empty:
        EB_df = pd.DataFrame(columns=mut_cols)
        EB_df.to_csv(output[0], sep='\t', index=False)
        show_output(f"No mutations for {chrom} in mutation list! Writing empty file to {output[0]}", color='warning')
    else:
        # set base_name for intermediate files
        base_file = output[0].replace(".cachedEB", "")

        # ############## LOAD PILEUP MATRIX CACHE AND MERGE INTO MUT_DF #####
        # change mutation positions for deletions in mutation file
        mut_df.loc[mut_df['Alt'] == "-", 'Start'] = mut_df['Start'] - 1

        show_output(f"Loading compressed matrix cache file {matrix_cache_file}", color='normal')
        # load in the target matrix file as df
        cache_matrix_df = pd.read_csv(matrix_cache_file, sep='\t', index_col=False, compression='gzip')
        # merge
        mut_matrix = mut_df.merge(cache_matrix_df, on=['Chr', 'Start'], how='inner')
        # reset deletion positions
        mut_matrix.loc[mut_matrix['Alt'] == "-", 'Start'] = mut_matrix['Start'] + 1
        show_output("Loaded and merged into mutation list", color='normal')

        # ############### CHECK IF SAMPLE IN PON ####################
        # if sample_in_pon == 0, the sample is not in the PoN
        # else, the pon matrix has to be acquired from cache and used in EBscore
        sample_in_pon = get_sample_pos(pon_list, tumor_bam)

        # ################################ CACHE FROM MATRIX ################################
        if sample_in_pon:
            show_output(
                f"Corresponding normal sample for {tumor_bam} has been found in PoN! EBcache cannot be used!",
                color='warning'
            )
            show_output("Falling back to cached matrix file..", color='normal')
            # EBcache cannot be used directly

            # ######### REMOVE SAMPLE BASES FROM MATRIX FILE
            # get the cached matrix file and reorder sample bases to first position to create a valid mutmatrix
            # reorder_matrix takes the position of the sample in the pon_list as argument
            # if the tumor bam sits at position 1 in the pon, everything is already fine
            mutmatrix_file = f"{base_file}.mutmatrix"
            if sample_in_pon > 1:
                prematrix_file = f"{base_file}.prematrix"
                mut_matrix.to_csv(prematrix_file, sep='\t', index=False)

                # reorder_matrix expects the 0-based position --> sample_in_pon - 1
                reduce_matrix_cmd = f"cat {prematrix_file} | {reorder_matrix} {sample_in_pon - 1} > {mutmatrix_file}"
                show_command(reduce_matrix_cmd, multi=False)
                shell(reduce_matrix_cmd)
                # cleanup
                shell(f"rm {prematrix_file}")
            else:
                # tumor sample is already in the right position
                mut_matrix.to_csv(mutmatrix_file, sep='\t', index=False)

            show_output("Retrieving target data from cached matrix", color='normal')

            # # CONTINUE LIKE UNCACHED EBscore
            # convert mutmatrix to direct EBinput
            EB_matrix_input_file = f"{base_file}.EB.matrix"
            EBinput_cmd = f"cat {mutmatrix_file} | {matrix2EBinput} > {EB_matrix_input_file}"
            show_command(EBinput_cmd, multi=False)
            shell(EBinput_cmd)
            # load in the EB.matrix file
            eb_matrix = pd.read_csv(EB_matrix_input_file, sep='\t')

            print('Start computation file')
            # multithreaded computation
            EB_df = compute_matrix2EB_multi(eb_matrix, EBparams['fitting_penalty'], threads)
            print('Computation finished')
            # get the pon_matrix containing the PoN coverages in Alt and Ref
            pon_matrix = get_pon_bases(eb_matrix)

        # ################################ CACHE FROM ABcache ###############################
        else:
            # ############## TARGET PILEUP --> MATRIX FILE ##################
            tumor_matrix_file = target_pileup_from_mut(table, base_file, tumor_bam, chrom)

            # check if matrix_file has input
            # if not os.path.getsize(tumor_matrix_file):
            #     # create empty file
            #     EB_df = mut_df
            #     EB_df['EBscore'] = 0
            #     has_pileup = False
            # else:    # has input
            #     has_pileup = True

            # reload the target pileup into pileup_df
            # use dtype to ensure str encoding of the chromosome column
            pileup_df = pd.read_csv(tumor_matrix_file, sep='\t', dtype={'Chr': str, 'Start': int}, index_col=False)

            show_output(f"Loading compressed AB cache file {AB_cache_file}", color='normal')
            cache_df = pd.read_csv(AB_cache_file, compression='gzip', sep='\t')
            pileAB_df = pileup_df.merge(cache_df, on=['Chr', 'Start'])
            # change coords for merge with start and merge into mut_df for Ref
            mut_df.loc[mut_df['Alt'] == "-", 'Start'] = mut_df['Start'] - 1
            pileAB_df = mut_df.merge(pileAB_df, on=['Chr', 'Start'])
            pileAB_df.loc[pileAB_df['Alt'] == "-", 'Start'] = pileAB_df['Start'] + 1

            # save for debugging
            # pileAB_file = f"{base_file}.pileAB"
            # pileAB_df.to_csv(pileAB_file, sep='\t', index=False)
            show_output(
                f"Pileup matrix for chrom {chrom} of {tumor_bam} merged with AB matrix."
                + " Going on with EB computation...",
                color='normal'
            )

            # ############## EBSCORE COMPUTATION ########
            # multithreaded computation
            EB_df = compute_AB2EB_multi(pileAB_df, threads)

            # convert matrix file to EB_input for getting PoN-Ref and PoN-Alt
            mutmatrix_file = f"{base_file}.mutmatrix"
            mut_matrix.to_csv(mutmatrix_file, sep='\t', index=False)
            # do the conversion
            EB_matrix_input_file = f"{base_file}.EB.matrix"
            convert_cmd = f"cat {mutmatrix_file} | {matrix2EBinput} > {EB_matrix_input_file}"
            show_command(convert_cmd)
            shell(convert_cmd)
            # load in the EB.matrix file
            eb_matrix = pd.read_csv(EB_matrix_input_file, sep='\t')

            # get the pon_matrix containing the PoN coverages in Alt and Ref
            # tumor sample is not in PoN --> no removal necessary
            pon_matrix = get_pon_bases(eb_matrix, remove_sample=False)

            # cleanup
            shell(f"rm -f {tumor_matrix_file}")

        # add EBscore to columns
        mut_cols.append('EBscore')

        # transfer PoN-Ref and PoN-Alt from pon_matrix to EB_df
        EB_df[['PoN-Ref', 'PoN-Alt']] = pon_matrix[['PoN-Ref', 'PoN-Alt']]
        mut_cols += ['PoN-Ref', 'PoN-Alt']

        # ###### add the full output ##########
        if full_output:
            # condense base info
            base_cols = list("AaGgCcTtIiDd")
            col_name = "|".join(base_cols)
            # convert base coverage to str
            for ch in base_cols:
                # take the letter info from the mut_matrix which is not yet condensed
                # str.replace removes the tumor bases
                EB_df[ch] = mut_matrix[ch].map(str).str.replace(r'^[0-9]+\|', "")
            # condense base info into col "A|a|G|g|C|c|T|t|I|i|D|d"
            EB_df[col_name] = EB_df[base_cols].apply(lambda row: "-".join(row), axis=1)
            # add "A|a|G|g|C|c|T|t|I|i|D|d" to columns
            mut_cols.append(col_name)

        # rm unnecessary columns
        EB_df = EB_df[mut_cols]

        # ######### WRITE TO FILE ##############################################
        EB_df.to_csv(output[0], sep='\t', index=False)

        # cleanup
        shell(f"rm -f {EB_matrix_input_file}")
        show_output(
            f"Created EBscore for chrom {chrom} of {tumor_bam} using EBcache and written to {output[0]}",
            color='success'
        )
config = snakemake.config
params = snakemake.params
MINSIM = params.min_sim
min_q = params.min_q
PAD = min(config['HDR']['padding'], config['filter_bam']['padding'])
HDRMINCOUNT = params.min_HDR_count
i = snakemake.input
tumor_bam = i.bam
filter_file = i.filter_file
filter_pileup = i.pileup
out_file = snakemake.output.HDR

show_output(
    f'Starting HDR analysis of {filter_file}. [MIN_SIM={MINSIM}, PAD={PAD}]')

# GET THE mutation file for masterHDR
show_output(f'Importing {filter_file} for HDR detection', time=False)
filter_df = pd.read_csv(filter_file, sep='\t').loc[:, [
    'Chr', 'Start', 'End', 'Ref', 'Alt', 'Gene']]

HDR_df = masterHDR(
    pileup_file=filter_pileup,
    tumor_bam=tumor_bam,
    filter_df=filter_df,
    MINSIM=MINSIM,
    padding=PAD,
    min_q=min_q,
    min_HDR_count=HDRMINCOUNT
)
def run_primer3(
    mut_df,
    chroms_folder=".",
    pcr_config={},      # use defaults defined at top
    primer3_config={},  # use defaults defined at top
    threads=1,
):
    """
    input is a df with columns 'Chr', 'Start', 'End', 'Ref', 'Alt'
    and optional id columns (everything left of Chr)
    output is id columns + 'Chr', 'Start', 'End', 'Ref', 'Alt'
    + primer cols (see new_cols below)
    """

    # apply pcr size to primer3_config
    primer3_config["PRIMER_PRODUCT_SIZE_RANGE"] = [
        pcr_config["prod_size_min"],
        pcr_config["prod_size_max"],
    ]
    primer3_config.update(pcr_config)

    mut_df.loc[:, "Chr"] = mut_df["Chr"].astype("str")

    # COLS
    base_cols = ["Chr", "Start", "End", "Ref", "Alt"]
    # keep possible columns left of Chr
    # save the id columns into keep_df for a later merge
    keep_cols = list(mut_df.columns[:list(mut_df.columns).index("Chr")])
    keep_df = mut_df.loc[:, keep_cols + base_cols]
    mut_df = mut_df.loc[:, base_cols]

    # cycle through (formatted) chromosomes
    # + load chromosome sequence
    # + create primer_df for mutations on that chromosome
    # + concat all mutations

    # ##### MULTIPROCESSING
    chrom_list = mut_df["Chr"].unique()
    show_output(f"Allocating processor pool for {threads} threads.")
    with Pool(threads) as pool:
        df_list = pool.map(
            partial(get_primer_df, mut_df, primer3_config, chroms_folder),
            chrom_list)
    primer_df = pd.concat(df_list, sort=True)

    new_cols = [
        "fwdPrimer",
        "revPrimer",
        "Status",
        "Temp",
        "AmpliconRange",
        "AmpliconSize",
        "InsertRange",
        "InsertSize",
        "InsertSeq",
        "offsetL",
        "offsetR",
    ]
    for col in ["AmpliconSize", "InsertSize", "offsetL", "offsetR"]:
        primer_df[col] = primer_df[col].fillna(0).astype(int)
    primer_df = keep_df.merge(primer_df[base_cols + new_cols])
    return primer_df
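# Hedged usage sketch (column values, config keys and the chroms folder are
# invented for illustration; prod_size_min/max are required by run_primer3,
# so an explicit pcr_config is passed here):
#
# demo_df = pd.DataFrame({
#     "mut_id": ["m1"],                 # optional id column left of Chr
#     "Chr": ["chr7"],
#     "Start": [55249071], "End": [55249071],
#     "Ref": ["C"], "Alt": ["T"],
# })
# primer_df = run_primer3(
#     demo_df, chroms_folder="chroms",
#     pcr_config={"prod_size_min": 120, "prod_size_max": 200},
#     threads=4)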
input = snakemake.input
output = snakemake.output
threads = snakemake.threads
log = snakemake.log


# #####################################################################
# ###################### FISHER SCORE #################################

def get_fisher_exact(row):
    '''
    Phred-like Fisher score from the 2x2 contingency table of
    plus/minus strand counts for the two alleles
    '''

    T1plus = row['TR1+']
    T1min = row['TR1'] - T1plus
    T2plus = row['TR2+']
    T2min = row['TR2'] - T2plus
    mat = np.array([[T1plus, T2plus], [T1min, T2min]])
    fisher_p = fe(mat)[1]
    if fisher_p:
        return round(-10 * math.log(fisher_p, 10), 1)
    # cap the score if the p-value underflows to zero
    return 5000


print(f'Computing FisherScore for file {input[0]}')
df = pd.read_csv(input[0], sep='\t')
cols = list(df.columns)[:5]
df['FisherScore'] = df.apply(get_fisher_exact, axis=1)
# reduce to important cols
cols += ['FisherScore']

# write file to filtered
df[cols].to_csv(output[0], sep='\t', index=False)
show_output(f'FisherScore for file {input[0]} written to {output[0]}', color='success')
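# Self-contained illustration of the score above with an invented strand-bias
# table: 30/40 ref reads but only 1/10 alt reads land on the plus strand.
import math
from scipy.stats import fisher_exact

TR1, TR1_plus = 40, 30    # ref reads: total / plus strand
TR2, TR2_plus = 10, 1     # alt reads: total / plus strand
p = fisher_exact([[TR1_plus, TR2_plus],
                  [TR1 - TR1_plus, TR2 - TR2_plus]])[1]
print(round(-10 * math.log10(p), 1) if p else 5000)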
def get_cover_svg(i, o, sample, exon_cover, refgen, log, prettifyBed):
    cmd = f"bedtools coverage -b {i} -a {exon_cover} -hist -sorted -g {refgen}.genome | grep '^all' | sort -k2,2nr | {prettifyBed} | sort -k2,2n > {o}"
    if run_cmd(cmd):
        make_svg(o, plot_name=sample)
    else:
        show_output(f"bedtools for {i} exited with error!", color="warning")
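# Minimal sketch of a run_cmd helper as used above (assumption: the real
# helper, which is not shown in this excerpt, returns a truthy value on success).
import subprocess


def run_cmd(cmd):
    # shell=True because the command string uses pipes and a redirect
    return subprocess.run(cmd, shell=True).returncode == 0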