Ejemplo n.º 1
0
    def target_pileup_from_mut(mut_file, base_file, bam, chrom):
        '''
        Pile up the positions of a mutation list in a single bam file.

        `csv2bed`, `cleanpileup`, `pile2count` and `EBparams` are not parameters;
        they have to be resolvable from the enclosing scope.

        Args:
            mut_file: mutation list that is piped through `csv2bed` to create the bed file
            base_file: path prefix for the intermediate .bed and the resulting .matrix file
            bam: bam file handed to samtools mpileup
            chrom: region passed to samtools mpileup via -r

        Returns:
            path of the written matrix file (f"{base_file}.matrix")
        '''

        # bed file can contain all chromosomes because chrom restriction comes with the -r parameter
        bed_file = f"{base_file}.bed"
        # create the bed file for mpileup
        shell(f"{csv2bed} < {mut_file} > {bed_file}")

        # do the pileup into the matrix file
        matrix_file = f"{base_file}.matrix"
        pileup_cmd = f"samtools mpileup -B -q {EBparams['MAPQ']} -Q {EBparams['Q']}"
        # use the bam that was passed in instead of a name from the outer scope
        pileup_cmd += f" -l {bed_file} -r {chrom} {bam}"
        # keep columns 1, 2 and 5 of the mpileup output and convert them to a count matrix
        pipe_cmd = f"{pileup_cmd} | cut -f 1,2,5 | {cleanpileup} | {pile2count} > {matrix_file}"
        show_output(f"Piling up tumor bam {bam}", color='normal')
        # do the pileup to matrix_file
        show_command(pipe_cmd, multi=False)
        shell(pipe_cmd)
        # cleanup
        shell(f"rm -f {bed_file}")
        show_output(
            f"Pileup matrix for chrom {chrom} of {bam} completed. Merging with cache file...",
            color='normal'
        )
        return matrix_file
Ejemplo n.º 2
0
def main(s):
    """
    Concatenate all position files of a sample into one gzipped table.

    Wrapped into a function lest the module be called by each subprocess.

    Every file in `s.input` is read as a headerless, tab-separated table with
    the columns chrom, pos, ref, alt. The concatenated table is written
    tab-separated and gzip-compressed to `s.output.pos`.
    """

    config, wildcards = s.config, s.wildcards
    in_files, out_files = s.input, s.output
    # not used below; the lookup is kept so a missing "ascat" section still fails here
    ascat_config = config["ascat"]

    show_output(
        f"Collecting chromosomal position data for {wildcards.sample}_{wildcards.type}",
        time=True,
    )

    column_names = ["chrom", "pos", "ref", "alt"]
    frames = [
        pd.read_csv(path, sep="\t", header=None, names=column_names)
        for path in in_files
    ]

    combined = pd.concat(frames)
    combined.to_csv(out_files.pos, sep="\t", index=False, compression="gzip")

    show_output(
        f"Finished combining data for {wildcards.sample}_{wildcards.type}. Written to {out_files.pos}",
        color="success",
    )
Ejemplo n.º 3
0
def main(s):
    """
    Merge the tumor and the normal position table of a sample.

    Wrapped into a function lest the module be called by each subprocess.

    Reads `s.input.tumor` and `s.input.normal` as tab-separated tables, joins
    them on chrom and pos (inner join, shared columns get the suffixes _T and
    _N) and writes the result tab-separated to `s.output.pos`.
    """

    w = s.wildcards
    i = s.input
    o = s.output

    show_output(
        f"Combining {i.tumor} and {i.normal}",
        time=True,
    )

    # get tumor and normal posfile into df
    tumor_df = pd.read_csv(i.tumor, sep="\t")
    # the normal frame has to come from the normal file, not from the tumor file again
    normal_df = pd.read_csv(i.normal, sep="\t")

    merged_df = tumor_df.merge(normal_df,
                               on=["chrom", "pos"],
                               suffixes=("_T", "_N"))
    merged_df["pos"] = merged_df["pos"].astype(int)

    # write the merged frame (the former name results_df was never defined)
    merged_df.to_csv(o.pos, sep="\t", index=False)

    show_output(
        f"Finished combining data for {w.sample}_{w.type}. Written to {o.pos}",
        color="success",
    )
Ejemplo n.º 4
0
def computeEBcache(mat_df, pen):
    '''
    Run matrix2AB on mat_df with the penalty pen and return its result.
    Start and end of the computation are reported via show_output.
    '''
    line_count = len(mat_df.index)
    show_output(
        f"Computing EBcache for {line_count} lines",
        time=True,
        multi=True,
        color='process',
    )
    ab_df = matrix2AB(mat_df, pen)
    show_output("Finished!", time=True, multi=True, color='success')
    return ab_df
Ejemplo n.º 5
0
def compute_AB2EB(df):
    '''
    Apply AB2EBscore to every row of df and store the result in the column
    'EBscore'. The column is added to the passed frame, which is also returned.
    '''

    n_rows = len(df.index)
    show_output(f"Computing EBscore for {n_rows} lines", multi=True)
    # row-wise computation
    scores = df.apply(AB2EBscore, axis=1)
    df['EBscore'] = scores
    show_output("Finished!", multi=True)
    return df
Ejemplo n.º 6
0
def compute_matrix2EB(df, fit_pen):
    '''
    Compute the EBscore per row of df from the full depth matrix
    (first row: target depth, next rows: pon depth).

    matrix2EBscore is called as matrix2EBscore(fit_pen, row); the scores are
    stored in the column 'EBscore' of the passed frame, which is returned.
    '''

    n_rows = len(df.index)
    show_output(f"Computing EBscore for {n_rows} lines", multi=True)
    # bind the fitting penalty as first positional argument of the row function
    score_row = partial(matrix2EBscore, fit_pen)
    scores = df.apply(score_row, axis=1)
    df['EBscore'] = scores
    show_output("Finished!", multi=True, time=True)
    return df
Ejemplo n.º 7
0
def get_primer_df(mut_df, primer3_config, chroms_folder, chrom):
    """
    Compute the primers for all mutations of one chromosome.

    Loads the chromosome via get_chrom, restricts mut_df to the rows whose
    Chr equals chrom and applies compute_primers row-wise. Returns the result
    of that apply call.
    """
    show_output(f"Running primer3 for chrom {chrom}.", multi=True)

    chrom_data = get_chrom(chrom, chroms_folder)
    # keyword arguments that are forwarded to compute_primers for every row
    row_kwargs = {"chrom": chrom_data, "config": primer3_config}
    mutations_on_chrom = mut_df.query("Chr == @chrom")
    result = mutations_on_chrom.apply(compute_primers, axis=1, **row_kwargs)

    show_output(f"Finished chrom {chrom}.", multi=True)
    return result
Ejemplo n.º 8
0
def main(s):
    """
    Provide the input fastq as gzip-compressed file at the output path.

    The action depends on the extension of the first input file:
        .fastq  compress with pigz into the output
        .gz     create a symlink from the output to the input
        .bz2    decompress with bzcat and recompress with pigz into the output
    Any other extension only produces a warning; no output is created.
    """
    # local names avoid shadowing the builtin input()
    in_files = s.input
    out_files = s.output
    threads = s.threads

    extension = os.path.splitext(in_files[0])[1]
    if extension == '.fastq':
        # compress fastq as fastq.gz into workdir
        # -c is required: without it pigz compresses in place and writes nothing to stdout
        shell(f"pigz -5 -c -p {threads} {in_files} > {out_files}")
    elif extension == '.gz':
        show_output(f"Creating symlink for file {in_files}")
        # create shortcut to fastq.gz in workdir/fastq
        shell(f"ln -s {in_files} {out_files}")
    elif extension == '.bz2':
        show_output(f"file extension {extension} --> unzipping with bzcat")
        # uncompress fastq.bz2 and recompress to fastq.gz in workdir/fastq
        shell(f"bzcat {in_files} | pigz -5 -p {threads} > {out_files}")
    else:
        # make the previously silent no-op visible
        show_output(
            f"Unsupported file extension {extension} for {in_files}. No output created!",
            color='warning'
        )
Ejemplo n.º 9
0
def primer3_master(
    i,
    o,
    chroms_folder,
    threads,
    PCR_config=PCR_config,
    primer3_config=p3_improved_config,
):
    """
    Read a mutation table, run run_primer3 on it and write the primer table.

    Wrapper around run_primer3 that allows injecting adjusted PCR and primer3
    configs and takes care of input and output:
        i  tab-separated mutation file that is loaded
        o  path the resulting primer table is written to (tab-separated, no index)
    """

    # load the mutation table and report its size
    show_output(f"Loading {i} for primer3 computation. ", end="")
    mutations = pd.read_csv(i, sep="\t")
    mutation_count = len(mutations.index)
    show_output(f"{mutation_count} mutations found.", time=False)

    # options handed through to run_primer3
    run_options = {
        "pcr_config": PCR_config,
        "primer3_config": primer3_config,
        "threads": threads,
    }
    primers = run_primer3(mutations, chroms_folder, **run_options)

    # write the result and report it afterwards
    primers.to_csv(o, sep="\t", index=False)
    show_output(f"Writing primer list to {o}.")
0
def run_eb(table, tumor_bam, output, pon_list, chrom, log, threads, EBparams, full_output,
           cleanpileup,
           csv2bed,
           pon2cols,
           pile2count,
           matrix2EBinput,
           makeponlist
    ):
    '''
    Master function to start the EB computation for one chromosome.

    Steps:
        1. load the mutations of `chrom` from `table` (first five columns)
        2. pile up tumor bam and PoN bams with samtools mpileup into a matrix file
        3. merge the matrix into the mutation list and convert it with `matrix2EBinput`
        4. compute the EBscore (compute_matrix2EB_multi) and the PoN-Ref/PoN-Alt columns
        5. write the result tab-separated to output[0]
    If the pileup matrix is empty, an empty output[0] is created instead.

    Args:
        table: headerless tab-separated mutation file
        tumor_bam: bam file that is put first into the sample list
        output: sequence whose first element is the result file; the prefix for
            intermediate files is output[0] with ".EB" removed
        pon_list: PoN list passed to `makeponlist`
        chrom: chromosome to process (table filter and mpileup -r)
        log: not used in this function
        threads: passed to compute_matrix2EB_multi
        EBparams: mapping providing 'MAPQ', 'Q' and 'fitting_penalty'
        full_output: if truthy, add the condensed PoN base counts as extra column
        cleanpileup, csv2bed, pon2cols, pile2count, matrix2EBinput, makeponlist:
            shell tools that are interpolated into the command lines
    '''
    # ############## LOAD DATA ###############################
    show_output(f"Computing EBscore for chrom {chrom} of {tumor_bam}", color='normal')

    # get the skeleton mutation file for that chromosome
    # .copy() gives an independent frame for the in-place coordinate shift further down
    mut_df = pd.read_csv(
        table,
        sep='\t',
        index_col=False,
        header=None,
        names=[
            'Chr', 'Start', 'End', 'Ref', 'Alt', 'somatic_status',
            'TR1', 'TR1+', 'TR2', 'TR2+', 'NR1', 'NR1+', 'NR2', 'NR2+',
            'somaticP', 'variantP'
        ]
    ).query('Chr == @chrom').iloc[:, :5].copy()
    mut_cols = list(mut_df.columns)
    # set base_name for intermediate files
    base_file = output[0].replace(".EB", "")

    # ############## PILEUP --> MATRIX FILE ##################

    # bed file can contain all chromosomes because chrom restriction comes with the -r parameter
    bed_file = f"{base_file}.bed"
    # create the bed file for mpileup
    shell(f"{csv2bed} < {table} > {bed_file}")

    # create the pon_list containing the tumor-bam as first file
    sample_list = f"{base_file}.pon"
    # makeponlist removes the sample itself from list if it is part of PoN
    shell(f"{makeponlist} {tumor_bam} {pon_list} {sample_list}")

    show_output(f"Piling up {chrom} of {tumor_bam} with Pon List.", color='normal')
    shell(f"cat {sample_list}")
    # do the pileup into the matrix file
    matrix_file = f"{base_file}.matrix"
    pileup_cmd = f"samtools mpileup -B -q {EBparams['MAPQ']} -Q {EBparams['Q']}"
    pileup_cmd += f" -l {bed_file} -r {chrom} -b {sample_list}"
    # cut -f $({pon2cols} < {sample_list}) creates a cut command only including the desired columns
    pipe_cmd = f"{pileup_cmd} | cut -f $({pon2cols} < {sample_list}) | {cleanpileup} | {pile2count} > {matrix_file}"
    # do the pileup to matrix_file
    show_command(pipe_cmd, multi=False)
    shell(pipe_cmd)
    # cleanup
    shell(f"rm {bed_file} {sample_list}")

    # check if matrix_file has input
    if not os.path.getsize(matrix_file):
        # create empty file
        open(output[0], 'a').close()
        show_output(f"Pileup for {chrom} of {tumor_bam} was empty! Created empty file {output[0]}", color='warning')
    else:
        show_output(f"Pileup matrix for chrom {chrom} of {tumor_bam} completed.", color='normal')
        # ################ MERGE INTO MUTFILE ######################
        # change mutation positions for deletions in mutation file
        mut_df.loc[mut_df['Alt'] == "-", 'Start'] = mut_df['Start'] - 1
        # read matrix file into df
        matrix_df = pd.read_csv(matrix_file, sep='\t', index_col=False)
        # merge
        mut_matrix = mut_df.merge(matrix_df, on=['Chr', 'Start'], how='inner')
        # reset deletion positions
        mut_matrix.loc[mut_matrix['Alt'] == "-", 'Start'] = mut_matrix['Start'] + 1

        # ####### if using matrix2EBinput.mawk #######################
        # write to file
        mutmatrix_file = f"{base_file}.mutmatrix"
        mut_matrix.to_csv(mutmatrix_file, sep='\t', index=False)

        # convert mutmatrix to direct EBinput
        EB_matrix_input_file = f"{base_file}.EB.matrix"
        shell(f"cat {mutmatrix_file} | {matrix2EBinput} > {EB_matrix_input_file}")

        # load in the EB.matrix file
        eb_matrix = pd.read_csv(EB_matrix_input_file, sep='\t')

        # multithreaded computation
        EB_df = compute_matrix2EB_multi(eb_matrix, EBparams['fitting_penalty'], threads)

        # add EBscore to columns
        mut_cols.append('EBscore')

        # get the pon_matrix containing the Pon coverages in Alt and Ref
        pon_matrix = get_pon_bases(eb_matrix)
        # transfer PoN-Ref and PoN-Alt to EB_df
        EB_df[['PoN-Ref', 'PoN-Alt']] = pon_matrix[['PoN-Ref', 'PoN-Alt']]
        mut_cols += ['PoN-Ref', 'PoN-Alt']

        # ###### add the full output ##########
        if full_output:
            # condense base info
            print('full_output')
            base_cols = list("AaGgCcTtIiDd")
            col_name = "|".join(base_cols)
            # convert base coverage to str
            for ch in base_cols:
                # take the letter info from the mut_matrix which is not yet condensed
                # str.replace removes the tumor bases; regex=True is required because
                # pandas >= 2.0 treats the pattern as a literal string by default
                EB_df[ch] = mut_matrix[ch].map(str).str.replace(r'^[0-9]+\|', "", regex=True)
            # condense base info into col "A|a|G|g|C|c|T|t|I|i|D|d"
            EB_df[col_name] = EB_df[base_cols].apply(lambda row: "-".join(row), axis=1)
            # add "A|a|G|g|C|c|T|t|I|i|D|d" to columns
            mut_cols.append(col_name)
        # rm unnecessary columns
        EB_df = EB_df[mut_cols]

        # ######### WRITE TO FILE ##############################################

        EB_file = output[0]
        EB_df.to_csv(EB_file, sep='\t', index=False)

        # cleanup (the mutmatrix file is left in place)
        shell(f"rm {matrix_file} {EB_matrix_input_file}")
        show_output(f"Created EBscore for chrom {chrom} of {tumor_bam} and written to {output[0]}", color='success')
Ejemplo n.º 11
0
def run_eb_from_cache(table, tumor_bam, output, pon_list, chrom, log, threads, EBparams, full_output,
           cleanpileup,
           csv2bed,
           pile2count,
           matrix2EBinput,
           reorder_matrix
    ):
    '''
    Compute the EBscore for one chromosome using the cached PoN data.

    Two paths, depending on get_sample_pos(pon_list, tumor_bam):
        * sample is part of the PoN: the AB cache cannot be used. The cached
          pileup matrix is reordered so that the sample comes first and the
          EBscore is computed from the full matrix (compute_matrix2EB_multi).
        * sample is not part of the PoN: the tumor bam is piled up, merged with
          the AB cache and the EBscore is computed via compute_AB2EB_multi.
    The result is written tab-separated to output[0]. If `table` contains no
    mutation for `chrom`, a header-only file is written.

    Args:
        table: headerless tab-separated mutation file
        tumor_bam: bam file of the sample
        output: sequence whose first element is the result file; the prefix for
            intermediate files is output[0] with ".cachedEB" removed
        pon_list: PoN list passed to get_sample_pos
        chrom: chromosome to process
        log: not used in this function
        threads: passed to the multithreaded EB computation
        EBparams: mapping providing 'MAPQ', 'Q' and 'fitting_penalty'
        full_output: if truthy, add the condensed PoN base counts as extra column
        cleanpileup, csv2bed, pile2count, matrix2EBinput, reorder_matrix:
            shell tools that are interpolated into the command lines

    NOTE(review): `AB_cache_file` and `matrix_cache_file` are not parameters of
    this function; they have to be defined in the enclosing module scope.
    '''

    def target_pileup_from_mut(mut_file, base_file, bam, chrom):
        '''
        piles up the mutation list in the given bam and returns the matrix file path
        '''

        # bed file can contain all chromosomes because chrom restriction comes with the -r parameter
        bed_file = f"{base_file}.bed"
        # create the bed file for mpileup
        shell(f"{csv2bed} < {mut_file} > {bed_file}")

        # do the pileup into the matrix file
        matrix_file = f"{base_file}.matrix"
        pileup_cmd = f"samtools mpileup -B -q {EBparams['MAPQ']} -Q {EBparams['Q']}"
        # use the bam that was passed in instead of the closure variable
        pileup_cmd += f" -l {bed_file} -r {chrom} {bam}"
        pipe_cmd = f"{pileup_cmd} | cut -f 1,2,5 | {cleanpileup} | {pile2count} > {matrix_file}"
        show_output(f"Piling up tumor bam {bam}", color='normal')
        # do the pileup to matrix_file
        show_command(pipe_cmd, multi=False)
        shell(pipe_cmd)
        # cleanup
        shell(f"rm -f {bed_file}")
        show_output(
            f"Pileup matrix for chrom {chrom} of {bam} completed. Merging with cache file...",
            color='normal'
        )
        return matrix_file

    # ############## LOAD DATA ###############################
    show_output(f"Computing EBscore for chrom {chrom} of {tumor_bam} using EBcache {AB_cache_file}", color='normal')

    # get the mutation file for the chromosome
    # the mutation file is the `table` parameter (same as in run_eb);
    # .copy() gives an independent frame for the in-place coordinate shift below
    mut_df = pd.read_csv(
        table,
        sep='\t',
        index_col=False,
        header=None,
        names=[
            'Chr', 'Start', 'End', 'Ref', 'Alt', 'somatic_status',
            'TR1', 'TR1+', 'TR2', 'TR2+', 'NR1', 'NR1+', 'NR2', 'NR2+',
            'somaticP', 'variantP'
        ]
    ).query('Chr == @chrom').iloc[:, :5].copy()
    mut_cols = list(mut_df.columns)

    # check for empty df
    if mut_df.empty:
        EB_df = pd.DataFrame(columns=mut_cols)
        EB_df.to_csv(output[0], sep='\t', index=False)
        show_output(f"No mutations for {chrom} in mutation list! Writing empty file to {output[0]}", color='warning')
    else:
        # set base_name for intermediate files
        base_file = output[0].replace(".cachedEB", "")

        # ############## LOAD PILEUP MATRIX CACHE AND MERGE INTO MUT_DF #####
        # change mutation positions for deletions in mutation file
        mut_df.loc[mut_df['Alt'] == "-", 'Start'] = mut_df['Start'] - 1
        show_output(f"Loading compressed matrix cache file {matrix_cache_file}", color='normal')
        # load in the target matrix file as df
        cache_matrix_df = pd.read_csv(matrix_cache_file, sep='\t', index_col=False, compression='gzip')
        # merge
        mut_matrix = mut_df.merge(cache_matrix_df, on=['Chr', 'Start'], how='inner')
        # reset deletion positions (only in mut_matrix, mut_df stays shifted)
        mut_matrix.loc[mut_matrix['Alt'] == "-", 'Start'] = mut_matrix['Start'] + 1
        show_output(f"Loaded and merged into mutation list", color='normal')

        # ############### CHECK IF SAMPLE IN PON ####################
        # if sample_in_pon == 0, then sample is not in PoN
        # else, pon matrix has to be acquired from cache and used in EBscore
        sample_in_pon = get_sample_pos(pon_list, tumor_bam)

        # ########################################### CACHE FROM MATRIX #####################################
        if sample_in_pon:
            show_output(
                f"Corresponding normal sample for {tumor_bam} has been found in PoNs! EBcache cannot be used!",
                color='warning'
            )
            show_output(f"Falling back to cached matrix file..", color='normal')
            # EBcache cannot be used directly

            # ######### REMOVE SAMPLE BASES FROM MATRIX FILE

            # get the cached matrix file and reorder sample bases to first position to create valid mutmatrix
            # reorder_matrix takes position of sample in pon_list as argument
            # if position of tumor bam in pon == 1, everything is already fine
            mutmatrix_file = f"{base_file}.mutmatrix"
            if sample_in_pon > 1:
                prematrix_file = f"{base_file}.prematrix"
                mut_matrix.to_csv(prematrix_file, sep='\t', index=False)

                # reorder_matrix receives sample_in_pon - 1
                # (presumably a 0-based position — confirm against reorder_matrix)
                reduce_matrix_cmd = f"cat {prematrix_file} | {reorder_matrix} {sample_in_pon - 1} > {mutmatrix_file}"
                show_command(reduce_matrix_cmd, multi=False)
                shell(reduce_matrix_cmd)
                # cleanup
                shell(f"rm {prematrix_file}")
            else:
                # tumor sample already in the right position
                mut_matrix.to_csv(mutmatrix_file, sep='\t', index=False)
            show_output(f"Retrieving target data from cached matrix", color='normal')

            # # CONTINUE LIKE UNCACHED EBscore
            # convert mutmatrix to direct EBinput
            EB_matrix_input_file = f"{base_file}.EB.matrix"
            EBinput_cmd = f"cat {mutmatrix_file} | {matrix2EBinput} > {EB_matrix_input_file}"
            show_command(EBinput_cmd, multi=False)
            shell(EBinput_cmd)
            # load in the EB.matrix file
            eb_matrix = pd.read_csv(EB_matrix_input_file, sep='\t')
            print('Start computation file')
            # multithreaded computation
            EB_df = compute_matrix2EB_multi(eb_matrix, EBparams['fitting_penalty'], threads)
            print('Computation finished')
            # get the pon_matrix containing the Pon coverages in Alt and Ref
            pon_matrix = get_pon_bases(eb_matrix)
        # ########################################### CACHE FROM ABcache ###########################
        else:
            # ############## TARGET PILEUP --> MATRIX FILE ##################
            tumor_matrix_file = target_pileup_from_mut(table, base_file, tumor_bam, chrom)
            # NOTE(review): an empty pileup file is not handled here
            # reloading the target pileup into pileup_df
            # use dtype to ensure str encoding of chromosome columns
            pileup_df = pd.read_csv(tumor_matrix_file, sep='\t', dtype={'Chr': str, 'Start': int}, index_col=False)

            show_output(f"Loading compressed AB cache file {AB_cache_file}", color='normal')
            cache_df = pd.read_csv(AB_cache_file, compression='gzip', sep='\t')

            pileAB_df = pileup_df.merge(cache_df, on=['Chr', 'Start'])
            # mut_df is still in pileup coordinates (deletions were shifted by -1 above and
            # only mut_matrix has been reset), so it must not be shifted a second time
            pileAB_df = mut_df.merge(pileAB_df, on=['Chr', 'Start'])
            # reset deletion positions
            pileAB_df.loc[pileAB_df['Alt'] == "-", 'Start'] = pileAB_df['Start'] + 1

            show_output(
                f"Pileup matrix for for chrom {chrom} of {tumor_bam} merged with AB matrix." +
                " Going on with EB computation...",
                color='normal'
            )

            # ############## EBSCORE COMPUTATION  ########
            # multithreaded computation
            EB_df = compute_AB2EB_multi(pileAB_df, threads)

            # convert matrix file to EB_input for getting PoN-Ref and Pon-Alt
            mutmatrix_file = f"{base_file}.mutmatrix"
            mut_matrix.to_csv(mutmatrix_file, sep='\t', index=False)
            # do the conversion
            EB_matrix_input_file = f"{base_file}.EB.matrix"
            convert_cmd = (f"cat {mutmatrix_file} | {matrix2EBinput} > {EB_matrix_input_file}")
            show_command(convert_cmd)
            shell(convert_cmd)

            # load in the EB.matrix file
            eb_matrix = pd.read_csv(EB_matrix_input_file, sep='\t')

            # get the pon_matrix containing the Pon coverages in Alt and Ref
            # tumor sample is not in PoN --> no removal necessary
            pon_matrix = get_pon_bases(eb_matrix, remove_sample=False)

            # cleanup
            shell(f"rm -f {tumor_matrix_file}")

        # add EBscore to columns
        mut_cols.append('EBscore')

        # transfer PoN-Ref and PoN-Alt from pon_matrix to EB_df
        EB_df[['PoN-Ref', 'PoN-Alt']] = pon_matrix[['PoN-Ref', 'PoN-Alt']]
        mut_cols += ['PoN-Ref', 'PoN-Alt']

        # ###### add the full output ##########
        # controlled by the full_output parameter (same as in run_eb)
        if full_output:
            # condense base info
            base_cols = list("AaGgCcTtIiDd")
            col_name = "|".join(base_cols)
            # convert base coverage to str
            for ch in base_cols:
                # take the letter info from the mut_matrix which is not yet condensed
                # str.replace removes the tumor bases; regex=True is required because
                # pandas >= 2.0 treats the pattern as a literal string by default
                EB_df[ch] = mut_matrix[ch].map(str).str.replace(r'^[0-9]+\|', "", regex=True)
            # condense base info into col "A|a|G|g|C|c|T|t|I|i|D|d"
            EB_df[col_name] = EB_df[base_cols].apply(lambda row: "-".join(row), axis=1)
            # add "A|a|G|g|C|c|T|t|I|i|D|d" to columns
            mut_cols.append(col_name)

        # rm unnecessary columns
        EB_df = EB_df[mut_cols]

        # ######### WRITE TO FILE ##############################################
        EB_df.to_csv(output[0], sep='\t', index=False)

        # cleanup
        shell(f"rm -f {EB_matrix_input_file}")
        show_output(
            f"Created EBscore for chrom {chrom} of {tumor_bam} using EBcache and written to {output[0]}",
            color='success'
        )
Ejemplo n.º 12
0
# Job context: `snakemake` is not defined in this chunk
# (presumably injected by Snakemake's script directive — confirm)
config = snakemake.config
params = snakemake.params
# forwarded to masterHDR as MINSIM
MINSIM = params.min_sim
# forwarded to masterHDR as min_q
min_q = params.min_q
# use the smaller of the HDR padding and the filter_bam padding
PAD = min(config['HDR']['padding'], config['filter_bam']['padding'])
# forwarded to masterHDR as min_HDR_count
HDRMINCOUNT = params.min_HDR_count

i = snakemake.input

# input files of the job
tumor_bam = i.bam
filter_file = i.filter_file
filter_pileup = i.pileup
# output path for the HDR result (not used within the visible part of the script)
out_file = snakemake.output.HDR


show_output(
    f'Starting HDR analysis of {filter_file}. [MIN_SIM={MINSIM}, PAD={PAD}]')
# load the mutation file for masterHDR, reduced to the coordinate columns and the gene
show_output(f'Importing {filter_file} for HDR detection', time=False)
filter_df = pd.read_csv(filter_file, sep='\t').loc[:, [
    'Chr', 'Start', 'End', 'Ref', 'Alt', 'Gene']]

# run the HDR detection on the pileup, the tumor bam and the reduced mutation list
HDR_df = masterHDR(
    pileup_file=filter_pileup,
    tumor_bam=tumor_bam,
    filter_df=filter_df,
    MINSIM=MINSIM,
    padding=PAD,
    min_q=min_q,
    min_HDR_count=HDRMINCOUNT
)
Ejemplo n.º 13
0
def run_primer3(
    mut_df,
    chroms_folder=".",
    pcr_config=None,
    primer3_config=None,
    threads=1,
):
    """
    Compute primers for all mutations in mut_df, one worker task per chromosome.

    input is df with columns 'Chr', 'Start', 'End', 'Ref', 'Alt' with optional id columns (everything left of Chr)
    output is id columns + 'Chr', 'Start', 'End', 'Ref', 'Alt' + primer cols
    primer cols: fwdPrimer, revPrimer, Status, Temp, AmpliconRange, AmpliconSize,
    InsertRange, InsertSize, InsertSeq, offsetL, offsetR

    Args:
        mut_df: mutation table (not modified; a copy is used internally)
        chroms_folder: passed to get_primer_df for every chromosome
        pcr_config: mapping that must contain 'prod_size_min' and 'prod_size_max'
            (KeyError otherwise); all its entries are merged into the primer3 config
        primer3_config: base primer3 settings (not modified; a copy is used internally)
        threads: size of the multiprocessing pool

    Returns:
        DataFrame of the id/base columns merged with the primer columns
    """

    # None instead of mutable default arguments
    pcr_config = {} if pcr_config is None else pcr_config
    # work on a copy so that neither the caller's dict nor a shared default is modified
    primer3_config = dict(primer3_config or {})

    # apply pcr size to primer3_config
    primer3_config["PRIMER_PRODUCT_SIZE_RANGE"] = [
        pcr_config["prod_size_min"],
        pcr_config["prod_size_max"],
    ]
    primer3_config.update(pcr_config)

    # normalize Chr to str on a copy instead of writing into the caller's frame
    mut_df = mut_df.copy()
    mut_df["Chr"] = mut_df["Chr"].astype("str")

    # COLS
    base_cols = ["Chr", "Start", "End", "Ref", "Alt"]
    # keep possible columns left of Chr
    # save the id columns into keep_df for later merge
    keep_cols = list(mut_df.columns[:list(mut_df.columns).index("Chr")])

    keep_df = mut_df.loc[:, keep_cols + base_cols]

    mut_df = mut_df.loc[:, base_cols]

    # cycle through (formatted) chromosomes
    # + load chromosome sequence
    # + create primer_df for mutations on that chromosome
    # + concat all mutations

    # ##### MULTIPROCESSING
    chrom_list = mut_df["Chr"].unique()
    show_output(f"Allocating processor pool for {threads} threads.")
    # context manager releases the worker processes once all results are collected
    with Pool(threads) as pool:
        df_list = pool.map(
            partial(get_primer_df, mut_df, primer3_config, chroms_folder),
            chrom_list)

    primer_df = pd.concat(df_list, sort=True)
    new_cols = [
        "fwdPrimer",
        "revPrimer",
        "Status",
        "Temp",
        "AmpliconRange",
        "AmpliconSize",
        "InsertRange",
        "InsertSize",
        "InsertSeq",
        "offsetL",
        "offsetR",
    ]
    # missing numeric values become 0 so that the columns can be cast to int
    for col in ["AmpliconSize", "InsertSize", "offsetL", "offsetR"]:
        primer_df[col] = primer_df[col].fillna(0).astype(int)

    # re-attach the id columns (merge on the shared columns)
    primer_df = keep_df.merge(primer_df[base_cols + new_cols])

    return primer_df
Ejemplo n.º 14
0
# output files, thread count and log of the job, taken from the `snakemake` object
# (not defined in this chunk; presumably injected by Snakemake's script directive — confirm)
output = snakemake.output
threads = snakemake.threads
log = snakemake.log

#####################################################################
# ###################### FISHER SCORE ################################


def get_fisher_exact(row):
    """
    Return the phred-scaled p-value of `fe` on the 2x2 read table of a row.

    The table is [[TR1+, TR2+], [TR1 - TR1+, TR2 - TR2+]]. `fe` is expected to
    return the p-value at index 1 (presumably scipy.stats.fisher_exact — confirm).

    Args:
        row: mapping/Series providing 'TR1', 'TR1+', 'TR2' and 'TR2+'

    Returns:
        -10 * log10(p), rounded to one decimal, or 5000 if p is 0
    """
    t1_plus = row['TR1+']
    t1_minus = row['TR1'] - t1_plus
    t2_plus = row['TR2+']
    t2_minus = row['TR2'] - t2_plus
    # plain ndarray instead of the discouraged np.matrix class
    table = np.array([[t1_plus, t2_plus], [t1_minus, t2_minus]])
    fisher_p = fe(table)[1]
    if fisher_p:
        return round(-10 * math.log10(fisher_p), 1)
    # p == 0 has no logarithm; use a fixed cap instead
    return 5000


# NOTE(review): `input` is not bound in this chunk; unless it is assigned
# further up in the file it refers to the builtin input() — confirm
print(f'Computing FisherScore for file {input[0]}')
df = pd.read_csv(input[0], sep='\t')
# remember the first five columns for the output
cols = list(df.columns)[:5]
# row-wise computation of the FisherScore
df['FisherScore'] = df.apply(get_fisher_exact, axis=1)

# reduce to the first five columns plus the new score
cols += ['FisherScore']
# write the reduced table tab-separated to the first output file
df[cols].to_csv(output[0], sep='\t', index=False)
show_output(f'FisherScore for file {input[0]} written to {output[0]}',
            color='success')
Ejemplo n.º 15
0
def get_cover_svg(i, o, sample, exon_cover, refgen, log, prettifyBed):
    """
    Write the coverage histogram of i over exon_cover to o and plot it.

    Runs a bedtools coverage pipeline via run_cmd. If run_cmd returns a truthy
    value, make_svg is called on o with sample as plot name; otherwise a
    warning is shown. `log` is not used.
    """
    # the stages of the shell pipeline, joined with " | " below
    stages = [
        f"bedtools coverage -b {i} -a {exon_cover} -hist -sorted -g {refgen}.genome",
        "grep '^all'",
        "sort -k2,2nr",
        f"{prettifyBed}",
        f"sort -k2,2n > {o}",
    ]
    cmd = " | ".join(stages)

    if not run_cmd(cmd):
        show_output(f"bedtools for {i} exited with error!", color="warning")
        return
    make_svg(o, plot_name=sample)