Example #1
def annotation(exp):

    from requests.exceptions import RetryError
    from time import sleep

    out_dir = make_folder(f'{exp.scratch}Annotated/')

    condition_list = exp.IPs['Condition'].unique().tolist()
    for condition in condition_list:
        if 'peaktype' not in exp.sample_files[condition]:
            peakset = 'overlap_peak' if exp.sample_files[condition][
                'idr_optimal_peak'] == 'none' else 'idr_optimal_peak'
            exp.sample_files[condition]['peaktype'] = peakset

    peakfiles = {
        condition: read_pd(exp.sample_files[condition][
            exp.sample_files[condition]['peaktype']])
        for condition in condition_list
        if exp.sample_files[condition][exp.sample_files[condition]['peaktype']]
        != 'none'
    }

    for condition, file in peakfiles.items():
        genome = exp.IPs.loc[exp.IPs.Condition == condition,
                             'Genome'].unique().tolist()[0]

        cond_dir = make_folder(f'{out_dir}{condition}/')
        anno_results = annotate_peaks(
            {condition: file},
            cond_dir,
            genome,
            db='UCSC',
            check=False,
            log_file=exp.log_file,
            run_main=exp.run_main)[f'{condition}_annotated']
        anno_list = anno_results.SYMBOL.unique().tolist()

        try:
            sleep(1)
            enrichr(anno_list,
                    f'enrichr_{condition}',
                    cond_dir,
                    scan=None,
                    max_terms=10,
                    figsize=(12, 6),
                    run_main=exp.run_main,
                    log_file=exp.log_file)
        except RetryError:
            output(
                f'No stable enrichr connection.  Skipping enrichr for {condition}.',
                log_file=exp.log_file,
                run_main=exp.run_main)

        exp.anno_results[f'{condition}_annotated'] = anno_results

    exp.tasks_complete.append('Annotations')

    return exp
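
The peak-set fallback above (use the replicated-overlap peaks whenever IDR produced no optimal set) is easy to test in isolation. A minimal sketch with hypothetical sample_files data, assuming the same 'none' sentinel:

sample_files = {
    'WT': {'idr_optimal_peak': 'none', 'overlap_peak': 'WT.overlap.bed'},
    'KO': {'idr_optimal_peak': 'KO.idr.bed', 'overlap_peak': 'KO.overlap.bed'},
}

for condition, files in sample_files.items():
    if 'peaktype' not in files:
        # fall back to the overlap peak set when IDR did not produce one
        files['peaktype'] = ('overlap_peak' if files['idr_optimal_peak'] == 'none'
                             else 'idr_optimal_peak')

assert sample_files['WT']['peaktype'] == 'overlap_peak'
assert sample_files['KO']['peaktype'] == 'idr_optimal_peak'
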
Example #2
def principal_component_analysis(exp):

    out_dir = make_folder(f'{exp.scratch}PCA/')

    bigwigs = {
        sample: exp.sample_files[sample]['bw']
        for sample in exp.samples if len(exp.sample_files[sample]['bw']) != 0
    }
    multibw_command = f"multiBigwigSummary bins -b {' '.join(list(bigwigs.values()))} -l {' '.join(list(bigwigs.keys()))} -p 4 --chromosomesToSkip chrM,chrX,chrY -o {out_dir}{exp.name}_bwsummary.npz"

    correlation_command = f'plotCorrelation --corData {out_dir}{exp.name}_bwsummary.npz --corMethod pearson --whatToPlot heatmap --skipZeros --plotTitle "{exp.name} Binned Pearson Correlation Heatmap" --plotFileFormat png --outFileCorMatrix {out_dir}{exp.name}_CorMatrix.tab --colorMap Purples -o {out_dir}{exp.name}_CorHeatmap.png'

    pca_command = f'plotPCA --corData {out_dir}{exp.name}_bwsummary.npz --plotTitle "{exp.name} PCA Plot" --plotFileFormat png --outFileNameData {out_dir}{exp.name}_PCA_data.tab --log2 -o {out_dir}{exp.name}_PCA_Plot.png'

    command_list = [
        submission_prepend(), multibw_command, correlation_command, pca_command
    ]

    exp.job_id.append(
        send_job(command_list=command_list,
                 job_name=f"{exp.name}_Cor_PCA",
                 job_log_folder=exp.job_folder,
                 q='general',
                 mem=4000,
                 log_file=exp.log_file,
                 project=exp.project,
                 cores=5,
                 run_main=exp.run_main))

    exp.tasks_complete.append('PCA')

    return exp
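
For reference, the same three deepTools steps can be run outside the scheduler; a minimal sketch using subprocess, with hypothetical names and paths (multiBigwigSummary must finish first, since both plotting commands read the .npz it writes):

import subprocess

name, out_dir = 'exp1', './PCA/'
bigwigs = {'WT_1': 'WT_1.bw', 'KO_1': 'KO_1.bw'}

commands = [
    f"multiBigwigSummary bins -b {' '.join(bigwigs.values())}"
    f" -l {' '.join(bigwigs.keys())} -p 4 --chromosomesToSkip chrM,chrX,chrY"
    f" -o {out_dir}{name}_bwsummary.npz",
    f"plotCorrelation --corData {out_dir}{name}_bwsummary.npz --corMethod pearson"
    f" --whatToPlot heatmap --skipZeros --colorMap Purples"
    f" --outFileCorMatrix {out_dir}{name}_CorMatrix.tab -o {out_dir}{name}_CorHeatmap.png",
    f"plotPCA --corData {out_dir}{name}_bwsummary.npz --log2"
    f" --outFileNameData {out_dir}{name}_PCA_data.tab -o {out_dir}{name}_PCA_Plot.png",
]

for cmd in commands:
    subprocess.run(cmd, shell=True, check=True)  # sequential: each plot reads the summary
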
Example #3
def preseq(exp):

    output(
        '\nRunning QC plots: library complexity extrapolation, signal correlation and pca plots.',
        log_file=exp.log_file,
        run_main=exp.run_main)

    for sample in exp.samples:

        out_dir = make_folder(f'{exp.scratch}QC/preseq/{sample}/')

        command_list = [
            submission_prepend(
                f'preseq lc_extrap -bam -output {out_dir}{sample}_preseq.txt {exp.sample_files[sample]["bam"]}'
            )
        ]

        exp.job_id.append(
            send_job(command_list=command_list,
                     job_name=f"{sample}_preseq",
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=5000,
                     log_file=exp.log_file,
                     project=exp.project,
                     cores=1,
                     run_main=exp.run_main))

    exp.tasks_complete.append('preseq')

    return exp
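
The lc_extrap output written above is a plain table; a hedged sketch for plotting the extrapolated complexity curve, assuming preseq's default tab-separated header (TOTAL_READS, EXPECTED_DISTINCT, LOWER_0.95CI, UPPER_0.95CI) and a hypothetical file path:

import pandas as pd
import matplotlib.pyplot as plt

curve = pd.read_csv('sample_preseq.txt', sep='\t')
plt.plot(curve['TOTAL_READS'], curve['EXPECTED_DISTINCT'], label='expected distinct reads')
plt.fill_between(curve['TOTAL_READS'], curve['LOWER_0.95CI'], curve['UPPER_0.95CI'], alpha=0.3)
plt.xlabel('Total reads')
plt.ylabel('Distinct reads')
plt.legend()
plt.savefig('sample_preseq.png', dpi=300)
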
Example #4
def plot_venn3_set(dict_of_sets, overlap_name, folder):
    '''
    Makes 3 way venn from 3 sets.
    Saves to file.

    Inputs
    ------
    dict_of_sets: dictionary of sets to overlap
    overlap_name: string with name of overlap
    folder: output folder

    Returns
    -------
    None

    '''
    folder = make_folder(f"{val_folder(folder)}venn_plot")

    plt.clf()
    plt.figure(figsize=(7, 7))

    font = {
        'family': 'sans-serif',
        'weight': 'normal',
        'size': 16,
    }

    plt.rc('font', **font)

    set_list = []
    set_names = []
    for name, setlist in dict_of_sets.items():
        set_list.append(setlist)
        set_names.append(name.replace('_', ' '))

    # make venn
    venn_plot = venn3(subsets=set_list, set_labels=set_names)
    patch = ['100', '110', '101', '010', '011', '001', '111']
    for p in patch:
        if venn_plot.get_patch_by_id(p):
            venn_plot.get_patch_by_id(p).set_color('none')
            venn_plot.get_patch_by_id(p).set_alpha(.4)
            venn_plot.get_patch_by_id(p).set_edgecolor('none')

    # draw the circle outlines
    c = venn3_circles(subsets=set_list)
    colors_list = ['green', 'blue', 'grey']
    for circle, color in zip(c, colors_list):
        circle.set_edgecolor(color)
        circle.set_alpha(0.8)
        circle.set_linewidth(4)

    plt.title(f"{overlap_name.replace('_', ' ')} Overlaps")
    plt.tight_layout()
    plt.savefig(f"{folder}{overlap_name.replace(' ', '_')}-overlap.svg")
    plt.savefig(f"{folder}{overlap_name.replace(' ', '_')}-overlap.png",
                dpi=300)
    plt.close()
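
A usage sketch for plot_venn3_set with hypothetical gene sets (the dictionary keys become the set labels, with underscores shown as spaces):

gene_sets = {
    'WT_peaks': {'TP53', 'MYC', 'GATA1'},
    'KO_peaks': {'MYC', 'GATA1', 'RUNX1'},
    'Input_peaks': {'GATA1', 'RUNX1', 'SPI1'},
}
plot_venn3_set(gene_sets, 'WT_KO_Input', './Overlaps/')
# writes WT_KO_Input-overlap.svg/.png into ./Overlaps/venn_plot/
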
Example #5
def fastq_screen(exp):
    '''
    Checks fastq files for contamination with alternative genomes using Bowtie2
    '''

    output(
        f'Screening for contamination during sequencing: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
        log_file=exp.log_file,
        run_main=exp.run_main)

    # Make QC folder
    exp.qc_folder = make_folder(f'{exp.scratch}QC/')

    cwd = val_folder(os.getcwd())
    os.chdir(exp.data_folder)

    samples = [
        file for file in exp.sample_df.Scratch_File1.tolist() if is_fastq(file)
    ]

    # Submit fastqc and fastq_screen jobs for each sample
    for sample in samples:
        command_list = [
            submission_prepend(
                f'fastq_screen --threads 4 --aligner bowtie2 {sample}')
        ]

        exp.job_id.append(
            send_job(command_list=command_list,
                     job_name=f'{sample.split("/")[-1]}_fastq_screen',
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=3000,
                     log_file=exp.log_file,
                     project=exp.project,
                     cores=2,
                     run_main=exp.run_main))
        time.sleep(1)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # move to qc folder
    screen_files = glob.glob(f'{exp.data_folder}*screen*')
    for f in screen_files:
        copy2(f, exp.qc_folder)
        os.remove(f)

    # return to the original working directory
    os.chdir(cwd)

    exp.tasks_complete.append('Fastq_screen')
    output(f'Screening complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    return exp
Example #6
def enrichr(gene_list,
            description,
            out_dir,
            scan=None,
            max_terms=10,
            figsize=(12, 6),
            run_main=False,
            log_file=None):
    '''
    Performs GO Biological Process, KEGG, transcription factor (ENCODE/ChEA)
    and OMIM disease enrichment on a gene list using Enrichr.

    Inputs
    ------
    gene_list: list of genes to perform enrichment on
    description: string description for title
    out_dir: output directory
    scan: dictionary with additional enrichr dbs to scan (http://amp.pharm.mssm.edu/Enrichr/#stats)
    max_terms: limit returned plot to this many terms
    figsize: figure size
    run_main: passed through to out_result
    log_file: log file (accepted for compatibility with callers; not used here)

    Returns
    -------
    None

    '''

    out_dir = make_folder(out_dir)

    testscan = {
        'KEGG': 'KEGG_2016',
        'GO_biological_process': 'GO_Biological_Process_2017b',
        'ChIP-X_Consensus_TFs': 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        'ChEA': 'ChEA_2016',
        'OMIM_Disease': 'OMIM_Disease'
    }

    if isinstance(scan, dict):
        testscan = {**testscan, **scan}

    for nick, name in testscan.items():
        gseapy.enrichr(gene_list=gene_list,
                       figsize=figsize,
                       top_term=max_terms,
                       description=f'{description}_{nick}',
                       gene_sets=name,
                       outdir=out_dir,
                       format='png')

        out_result(f'{out_dir}{nick}.{name}.enrichr.reports.png',
                   f'Enrichr: {nick} for {description}',
                   run_main=run_main)

    out_list = pd.DataFrame({'Gene Name': gene_list},
                            index=range(len(gene_list)))
    out_list.to_excel(f'{out_dir}{description}_genes.xlsx', index=False)
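
A usage sketch under the defaults above; the gene symbols are arbitrary examples, and extra libraries can be added through scan (names follow the Enrichr catalog):

genes = ['TP53', 'MYC', 'CDKN1A', 'BAX', 'MDM2', 'GADD45A']
enrichr(genes, 'example_condition', './enrichr_results/')
# add databases beyond the defaults:
# enrichr(genes, 'example_condition', './enrichr_results/',
#         scan={'GO_molecular_function': 'GO_Molecular_Function_2017b'})
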
Example #7
def plot_venn3_counts(element_list, set_labels, overlap_name, folder):
    '''
    Plots a three-way venn from counts of the specific overlaps.
    Saves to file.

    Inputs
    ------
    element_list: tuple with counts of the overlaps, ordered (Abc, aBc, ABc, abC, AbC, aBC, ABC)
    set_labels: list or tuple with names of the sets ('A', 'B', 'C')
    overlap_name: string with name of overlap
    folder: output folder

    Returns
    -------
    None

    '''
    folder = make_folder(f"{val_folder(folder)}venn_plot")

    plt.clf()
    plt.figure(figsize=(7, 7))

    font = {
        'family': 'sans-serif',
        'weight': 'normal',
        'size': 16,
    }

    plt.rc('font', **font)

    # make venn
    venn_plot = venn3(
        subsets=element_list,
        set_labels=[name.replace('_', ' ') for name in set_labels])
    patch = ['100', '110', '101', '010', '011', '001', '111']
    for p in patch:
        if venn_plot.get_patch_by_id(p):
            venn_plot.get_patch_by_id(p).set_color('none')
            venn_plot.get_patch_by_id(p).set_alpha(.4)
            venn_plot.get_patch_by_id(p).set_edgecolor('none')

    # draw the circle outlines
    c = venn3_circles(subsets=element_list)
    colors_list = ['green', 'blue', 'grey']
    for circle, color in zip(c, colors_list):
        circle.set_edgecolor(color)
        circle.set_alpha(0.8)
        circle.set_linewidth(4)

    plt.title(f"{overlap_name.replace('_', ' ')} Overlaps")
    plt.tight_layout()
    plt.savefig(f"{folder}{overlap_name.replace(' ', '_')}-overlap.svg")
    plt.savefig(f"{folder}{overlap_name.replace(' ', '_')}-overlap.png",
                dpi=300)
Example #8
def fastqc(exp):
    '''
    Performs fastq spec analysis with FastQC
    '''
    output('Assessing fastq quality. \n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    # Make QC folder
    exp.qc_folder = make_folder(f'{exp.scratch}QC/')

    all_samples = (exp.sample_df.Scratch_File1.tolist() +
                   exp.sample_df.Scratch_File2.tolist())
    samples = [file for file in all_samples if is_fastq(file)]

    for sample in samples:
        command_list = [submission_prepend(f'fastqc {sample}')]

        exp.job_id.append(
            send_job(command_list=command_list,
                     job_name=f'{sample.split("/")[-1]}_fastqc',
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=5000,
                     log_file=exp.log_file,
                     project=exp.project,
                     run_main=exp.run_main))

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # move to qc folder
    fastqc_files = glob.glob(f'{exp.data_folder}*.zip')
    fastqc_files = fastqc_files + glob.glob(f'{exp.data_folder}*.html')
    for f in fastqc_files:
        copy2(f, exp.qc_folder)
        os.remove(f)

    exp.tasks_complete.append('FastQC')
    output(f'FastQC complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    return exp
Example #9
def plot_venn2(Series, overlap_name, folder):
    '''
    Plots a two-way venn from a Series of overlap counts ordered (10, 01, 11):
    unique to A, unique to B, shared.
    Saves to file.
    '''

    folder = make_folder(f"{val_folder(folder)}venn_plot")

    plt.clf()
    plt.figure(figsize=(7, 7))

    font = {
        'family': 'sans-serif',
        'weight': 'normal',
        'size': 16,
    }

    plt.rc('font', **font)

    # make venn
    venn_plot = venn2(
        subsets=(Series.iloc[0], Series.iloc[1], Series.iloc[2]),
        set_labels=[name.replace('_', ' ') for name in Series.index.tolist()])
    patch = ['10', '01', '11']
    for p in patch:
        if venn_plot.get_patch_by_id(p):
            venn_plot.get_patch_by_id(p).set_color('none')
            venn_plot.get_patch_by_id(p).set_alpha(.4)
            venn_plot.get_patch_by_id(p).set_edgecolor('none')

    c = venn2_circles(subsets=(Series.iloc[0], Series.iloc[1], Series.iloc[2]))
    colors = ['green', 'blue']
    for circle, color in zip(c, colors):
        circle.set_edgecolor(color)
        circle.set_alpha(0.8)
        circle.set_linewidth(2)

    plt.title(overlap_name.replace('_', ' ') + " overlaps")
    plt.tight_layout()
    plt.savefig(f'{folder}{overlap_name.replace(" ", "_")}-overlap.svg')
    plt.savefig(f'{folder}{overlap_name.replace(" ", "_")}-overlap.png',
                dpi=300)
    plt.close()
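
A usage sketch; the Series must be ordered (unique to A, unique to B, shared), and its index names are used as the set labels:

import pandas as pd

counts = pd.Series({'WT_peaks': 120, 'KO_peaks': 85, 'overlap': 940})
plot_venn2(counts, 'WT_vs_KO', './Overlaps/')
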
Example #10
def encode3(exp):

    if 'Stage' not in exp.tasks_complete:
        output('Files not staged.\n', log_file=exp.log_file)
        exp = stage(exp)

    output('Running alignment and peak calling using ENCODE3 standards.',
           log_file=exp.log_file,
           run_main=exp.run_main)
    output('ENCODE3 cromwell pipeline.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    out_dir = make_folder(f'{exp.scratch}ENCODE3/')

    IPs = exp.IPs

    end_types = {'q.gz': 'fastq', '.bam': 'bam'}

    for experiment in IPs.Condition.unique().tolist():

        exp_dir = make_folder(f'{out_dir}{experiment}/')

        IP_sample_indicies = list(
            enumerate(IPs[IPs.Condition == experiment].index.tolist(),
                      start=1))

        if len(IP_sample_indicies) > 6:
            raise IOError('Pipeline cannot handle more than 6 replicates.')

        seq_type = 'none' not in IPs[IPs.Condition ==
                                     experiment]['File2'].tolist()
        final_stage = 'align' if 'align' in IPs[
            IPs.Condition == experiment]['Final Stage'].tolist() else 'all'

        UMI_list = [
            x.lower()
            for x in IPs[IPs.Condition == experiment]['UMI'].unique().tolist()
        ]
        if len(set(UMI_list)) > 1:
            raise IOError(
                'All samples must be UMI processed or not for each condition.')
        UMI = UMI_list[0] == 'yes'

        try:
            file_type = end_types[exp.sample_df[exp.sample_df.Condition ==
                                                experiment]
                                  ['Scratch_File1'].tolist()[0][-4:]]
        except KeyError:
            output(
                f"{exp.sample_df[exp.sample_df.Condition == experiment]['Scratch_File1'].tolist()[0]} not a valid file type for this pipeline.",
                log_file=exp.log_file,
                run_main=exp.run_main)
            raise

        genome = IPs[IPs.Condition == experiment]['Genome'].unique().tolist()
        if len(genome) > 1:
            raise IOError(
                'Cannot align to more than one genome per condition.')

        chip_type = IPs[IPs.Condition ==
                        experiment]['ChIP Type'].unique().tolist()
        if len(chip_type) > 1:
            raise IOError(
                'Cannot have more than one chip type (histone or TF) for a condition.'
            )
        chip_type = 'histone' if chip_type[0].lower() == 'histone' else 'tf'

        json_file = {
            'chip.pipeline_type': chip_type,
            'chip.paired_end': seq_type,
            'chip.genome_tsv': exp.genome_indicies['encode_tsv'][genome[0]],
            'chip.bwa.mem_mb': 30000,
            'chip.macs2_mem_mb': 30000,
            'chip.peak_caller': 'macs2',
            "chip.true_rep_only": False,
            "chip.dup_marker": "picard",
            "chip.mapq_thresh": 30,
            "chip.regex_filter_reads": "chrM",
            "chip.subsample_reads": 0,
            "chip.ctl_subsample_reads": 0,
            "chip.xcor_subsample_reads": 15000000,
            "chip.keep_irregular_chr_in_bfilt_peak": False,
            "chip.always_use_pooled_ctl": False,
            "chip.ctl_depth_ratio": 1.2,
            "chip.macs2_cap_num_peak": 500000,
            "chip.pval_thresh": 0.01,
            "chip.idr_thresh": 0.05,
            "chip.bwa_cpu": 4,
            "chip.bwa_mem_mb": 20000,
            "chip.bwa_time_hr": 48,
            "chip.filter_cpu": 2,
            "chip.filter_mem_mb": 20000,
            "chip.filter_time_hr": 24,
            "chip.bam2ta_cpu": 2,
            "chip.bam2ta_mem_mb": 10000,
            "chip.bam2ta_time_hr": 6,
            "chip.fingerprint_cpu": 2,
            "chip.fingerprint_mem_mb": 12000,
            "chip.fingerprint_time_hr": 6,
            "chip.xcor_cpu": 2,
            "chip.xcor_mem_mb": 16000,
            "chip.xcor_time_hr": 24,
            "chip.macs2_time_hr": 24,
            "chip.spr_mem_mb": 16000
        }
        bams = []
        ctl_bams = []

        for rep, index in IP_sample_indicies:
            sample = exp.sample_df.loc[index, 'Sample_Name']
            input_sample = IPs.loc[index, 'Background_Name']

            if file_type == 'fastq':
                json_file[f'chip.fastqs_rep{rep}_R1'] = [
                    f'{exp.data_folder}{sample}_trim_R1.fastq.gz'
                ]
                json_file[f'chip.ctl_fastqs_rep{rep}_R1'] = [
                    f'{exp.data_folder}{input_sample}_trim_R1.fastq.gz'
                ]
                if seq_type:
                    json_file[f'chip.fastqs_rep{rep}_R2'] = [
                        f'{exp.data_folder}{sample}_trim_R2.fastq.gz'
                    ]
                    json_file[f'chip.ctl_fastqs_rep{rep}_R2'] = [
                        f'{exp.data_folder}{input_sample}_trim_R2.fastq.gz'
                    ]
            else:
                bams.append(f'{exp.data_folder}{sample}.bam')
                ctl_bams.append(f'{exp.data_folder}{input_sample}.bam')

        if file_type == 'bam':
            json_file['chip.bams'] = bams
            json_file['chip.ctl_bams'] = ctl_bams

        json_file['chip.align_only'] = ((UMI and file_type == 'fastq')
                                        or final_stage == 'align')

        json_file['chip.no_dup_removal'] = UMI
        json_file['chip.title'] = (f'{experiment}_postUMI_dedup'
                                   if UMI and file_type == 'bam'
                                   else experiment)
        json_file[
            "chip.description"] = f"Cromwell ENCODE3 {experiment}: {'paired-end' if seq_type else 'single-end'} {chip_type}."

        encode_file = f'{exp_dir}{experiment}_ENCODE3.json'
        with open(encode_file, 'w') as file:
            json.dump(json_file, file, indent=4, sort_keys=True)

        pythonpath = shutil.which('python')
        miniconda = [x for x in pythonpath.split('/') if 'miniconda' in x][0]
        cromwell_jar = re.sub(
            r'{}/.*'.format(miniconda),
            '{}/envs/chrome_chip/share/cromwell/cromwell.jar'.format(
                miniconda), pythonpath)
        jar = cromwell_jar if os.path.isfile(
            cromwell_jar
        ) else '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'

        command_list = [
            submission_prepend(source='encode-chip-seq-pipeline'),
            f'cd {exp_dir}',
            f'java -jar -Dconfig.file={exp.encode3_folder}backends/backend.conf -Dbackend.default=Local {jar} run {exp.encode3_folder}chip.wdl -i {encode_file}'
        ]

        sent_job = send_job(command_list=command_list,
                            job_name=f"{experiment}_ENCODE3",
                            job_log_folder=exp.job_folder,
                            q='bigmem',
                            mem=35000,
                            log_file=exp.log_file,
                            project=exp.project,
                            cores=1,
                            run_main=exp.run_main)

        exp.job_id.append(sent_job)
        job_pending(sent_job, exp.log_file)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    exp = encode_results(exp)

    exp.tasks_complete.append('ENCODE3')

    return exp
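
The cromwell.jar lookup above is repeated verbatim elsewhere in this module; a consolidation sketch of the same logic (the helper name is hypothetical, and the fallback path and env layout are the same assumptions the inline version makes):

import os
import re
import shutil

def find_cromwell_jar():
    '''Resolve cromwell.jar inside the active miniconda install, with a default fallback.'''
    pythonpath = shutil.which('python')
    parts = [x for x in pythonpath.split('/') if 'miniconda' in x]
    if parts:
        candidate = re.sub(
            r'{}/.*'.format(parts[0]),
            '{}/envs/chrome_chip/share/cromwell/cromwell.jar'.format(parts[0]),
            pythonpath)
        if os.path.isfile(candidate):
            return candidate
    return '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'
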
Example #11
def plot_col(df,
             title,
             ylabel,
             out='',
             xy=(None, None),
             xticks=[''],
             plot_type=['violin', 'swarm'],
             pvalue=False,
             compare_tags=None,
             log_file=None,
             run_main=False):
    '''
    One or two column boxplot from dataframe.  Titles x axis based on column names.

    Inputs
    ------
    df: dataframe (uses first two columns)
    title: string of title
    ylabel: string of y label
    out: parent output directory.  if none, saves into plots/
    xy: if specified, x is the label column and y is the data column (default (None, None): data separated into two columns)
    xticks: list of xtick names (default is none)
    plot_type: list of one or more of: violin, box, swarm (default=violin)
    pvalue: bool to perform ttest (default False).  Only works if xy=(None, None) or there are only two labels in x.
    compare_tags: if xy and pvalue are specified and there are more than two tags in x, the two tags to compare, e.g. ['a', 'b']
    log_file: log file
    run_main: whether called from the main pipeline

    Returns
    ------
    None
    '''

    out = make_folder(f'{val_folder(out)}plots/')

    plt.clf()
    sns.set(context='paper',
            font='Arial',
            font_scale=2,
            style='white',
            rc={
                'figure.dpi': 300,
                'figure.figsize': (5, 6)
            })

    if not isinstance(plot_type, list):
        plot_type = plot_type.split()
    lower_plot_type = [x.lower() for x in plot_type]

    if len(lower_plot_type) == 0:
        raise IOError('Input a plot type.')
    elif not any(x in lower_plot_type for x in ('violin', 'box', 'swarm')):
        raise IOError('Did not recognize plot type.')

    if 'swarm' in lower_plot_type:
        if xy == (None, None):
            fig = sns.swarmplot(data=df, color='black', s=4)
        else:
            fig = sns.swarmplot(data=df, x=xy[0], y=xy[1], color='black', s=4)
    if 'violin' in lower_plot_type:
        if xy == (None, None):
            fig = sns.violinplot(data=df)
        else:
            fig = sns.violinplot(data=df, x=xy[0], y=xy[1])
    if 'box' in lower_plot_type:
        if xy == (None, None):
            fig = sns.boxplot(data=df)
        else:
            fig = sns.boxplot(data=df, x=xy[0], y=xy[1])

    fig.yaxis.set_label_text(ylabel)
    fig.set_title(title)
    if xticks:
        fig.xaxis.set_ticklabels(xticks)
        fig.xaxis.set_label_text('')
        for tick in fig.xaxis.get_ticklabels():
            tick.set_fontsize(12)

    if pvalue:
        if xy == (None, None):
            _, pvalue = stats.ttest_ind(a=df.iloc[:, 0], b=df.iloc[:, 1])
            compare_tags = df.columns
        else:
            _, pvalue = stats.ttest_ind(
                a=df[df[xy[0]] == compare_tags[0]][xy[1]],
                b=df[df[xy[0]] == compare_tags[1]][xy[1]])
        fig.text(
            s=f'p-value = {pvalue:.03g}, {compare_tags[0]} v {compare_tags[1]}',
            x=0,
            y=-.12,
            transform=fig.axes.transAxes,
            fontsize=12)

    sns.despine()
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.17, top=0.9)
    plt.savefig(f"{out}{title.replace(' ', '_')}.png", dpi=300)
    if run_main:
        plt.close()

    out_result(f"{out}{title.replace(' ', '_')}.png",
               f'{title} Plot',
               run_main=run_main)
    output(f"{title.replace(' ', '_')}.png found in {out}",
           log_file=log_file,
           run_main=run_main)
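
A usage sketch for plot_col with a small two-column dataframe (values are made up); with pvalue=True and xy left at the default, a t-test runs between the two columns:

import pandas as pd

df = pd.DataFrame({'WT': [1.2, 2.3, 1.8, 2.0, 1.5],
                   'KO': [2.8, 3.1, 2.5, 3.4, 2.9]})
plot_col(df,
         title='Signal at promoters',
         ylabel='log2 CPM',
         out='./QC/',
         xticks=['WT', 'KO'],
         plot_type=['violin', 'swarm'],
         pvalue=True)
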
Example #12
def deeptools(regions,
              signals,
              matrix_name,
              out_name,
              pegasus_folder,
              title='',
              bps=(1500, 1500, 4000),
              type='center',
              scaled_names=('TSS', 'TES'),
              make=('matrix', 'heatmap', 'heatmap_group', 'profile',
                    'profile_group')):
    '''
    Inputs
    ------
    regions: dictionary {'region_name':'/path/to/ssh/bedfile'}
    signals: dictionary {'signal_name':'/path/to/ssh/bigwigfile'}
    matrix_name: string of matrix name or matrix to be named (before .matrix.gz)
    out_name: name for output file
    pegasus_folder: folder where the region and signal files are staged (referenced by basename)
    title: plot title (optional)
    bps: tuple of region width on either side of center or scaled.  center ignores last number.  default is (1500,1500,4000)
    type: 'center' or 'scaled'
    scaled_names: optional names for scaled start and end (default ('TSS','TES'))
    make: tuple of deeptool commands.  options: matrix, heatmap, heatmap_group, profile, profile_group

    Returns
    -------
    string of commands for ssh_job

    '''
    pegasus_folder = make_folder(pegasus_folder)

    make_lower = [x.lower() for x in make]

    if type.lower() == 'center':
        deepMat = 'reference-point --referencePoint center'
        deepHeat = "--refPointLabel 'Peak Center'"
        deepProf = "--refPointLabel 'Peak Center'"
    else:
        deepMat = f'scale-regions --regionBodyLength {str(bps[2])}'
        deepHeat = f'--startLabel {scaled_names[0]} --endLabel {scaled_names[1]}'
        deepProf = f'--startLabel {scaled_names[0]} --endLabel {scaled_names[1]}'

    cmd_list = [submission_prepend()]

    pegasus_region_path = ' '.join([
        f"{pegasus_folder}{region_path.split('/')[-1]}"
        for region_path in regions.values()
    ])
    pegasus_signal_path = ' '.join([
        f"{pegasus_folder}{signal_path.split('/')[-1]}"
        for signal_path in signals.values()
    ])

    if 'matrix' in make_lower:
        signal_name = ' '.join(signals.keys())
        computeMatrix = f"computeMatrix {deepMat} -a {str(bps[0])} -b {str(bps[1])} -p 4 -R {pegasus_region_path} -S {pegasus_signal_path} --samplesLabel {signal_name} -o {matrix_name}.matrix.gz"
        cmd_list.append(computeMatrix)

    if 'heatmap' in make_lower or 'heatmap_group' in make_lower:
        region_name = ' '.join(regions.keys())
        plotHeatmap_base = f"plotHeatmap -m {matrix_name}.matrix.gz --dpi 300 {deepHeat} --regionsLabel {region_name} --plotTitle '{title.replace('_', ' ')}' --whatToShow 'heatmap and colorbar' --colorMap Reds -out {out_name}_heatmap"
        if 'heatmap' in make_lower:
            cmd_list.append(f"{plotHeatmap_base}.png")
        if 'heatmap_group' in make_lower:
            cmd_list.append(f"{plotHeatmap_base}_perGroup.png --perGroup")

    if 'profile' in make_lower or 'profile_group' in make_lower:
        region_name = ' '.join(regions.keys())
        plotProfile_base = f"plotProfile -m {matrix_name}.matrix.gz --dpi 300 {deepProf} --plotTitle '{title.replace('_', ' ')}' --regionsLabel {region_name} -out {out_name}_profile"
        if 'profile' in make_lower:
            cmd_list.append(f"{plotProfile_base}.png")
        if 'profile_group' in make_lower:
            cmd_list.append(f"{plotProfile_base}_perGroup.png --perGroup")

    return cmd_list
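
A usage sketch; the paths are hypothetical, and the returned command list is intended for a scheduler call such as send_job:

cmds = deeptools(regions={'WT_peaks': '/beds/WT_peaks.bed'},
                 signals={'WT_IP': '/bw/WT_IP.bw', 'WT_input': '/bw/WT_input.bw'},
                 matrix_name='WT_matrix',
                 out_name='WT',
                 pegasus_folder='./deeptools/',
                 title='WT_signal',
                 type='center',
                 make=('matrix', 'heatmap', 'profile'))
# e.g. send_job(command_list=cmds, job_name='WT_deeptools', ...)
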
Example #13
def stage(exp):
    '''
    Stages files in scratch folder
    '''
    output(f'Staging in {exp.scratch}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)
    exp.data_folder = make_folder(f'{exp.scratch}raw_data/')

    Scratch_File1 = []
    Scratch_File2 = []

    # join multiple files
    for sample in exp.sample_df.Sample_Name.tolist():
        index = exp.sample_df['Sample_Name'] == sample

        paired = exp.sample_df.loc[index, 'paired'].values[0]
        R1_list = ','.join(exp.sample_df.loc[index, 'File1']).split(',')
        R2_list = ','.join(exp.sample_df.loc[index, 'File2']).split(',')

        main_file = R1_list[0]

        # convert from bz2 to fastq
        if main_file.endswith('.txt.bz2'):
            for file in R1_list + R2_list:
                newfile = file.replace(".txt.bz2", ".fastq.gz")
                os.system(f'bunzip2 -c < {file} | gzip -c > {newfile}')
            R1_list = [txt_replace(x) for x in R1_list]
            R2_list = [txt_replace(x) for x in R2_list]
            main_file = txt_replace(main_file)

        if main_file.endswith('.fastq.gz'):
            fileend = '_R1.fastq.gz' if paired else '.fastq.gz'
            filename = f'{exp.data_folder}{sample}{fileend}'
            os.system(f'cat {" ".join(R1_list)} > {filename}')
            Scratch_File1.append(filename)

            if paired:
                fileend = '_R2.fastq.gz'
                filename = f'{exp.data_folder}{sample}{fileend}'
                os.system(f'cat {" ".join(R2_list)} > {filename}')
                Scratch_File2.append(filename)
            else:
                Scratch_File2.append('none')

        elif main_file.endswith('.bam'):
            filename = f'{exp.data_folder}{sample}.bam'
            copy2(main_file, filename)
            Scratch_File1.append(filename)
            Scratch_File2.append('none')

        else:
            raise IOError('Filetype not recognized.')

    exp.sample_df['Scratch_File1'] = Scratch_File1
    exp.sample_df['Scratch_File2'] = Scratch_File2
    exp.sample_df.replace([f'{exp.data_folder}none'], 'none', inplace=True)

    exp.tasks_complete.append('Stage')
    output(f'Staging complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    return exp
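
stage() leans on a txt_replace helper that is not shown here; judging from the rename done in the bz2 branch above, it presumably maps the original .txt.bz2 names to their recompressed equivalents, something like:

def txt_replace(path):
    # assumed behavior, mirroring the bunzip2 | gzip rename above
    return path.replace('.txt.bz2', '.fastq.gz')
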
Example #14
def spike(exp):
    '''
    Aligns sequencing files to the drosophila spike-in genome.
    If calling from jupyter, change the plotting backend as needed.
    '''
    import pandas as pd

    if len(exp.spike_samples) == 0:
        output('Not processing Spike-ins',
               log_file=exp.log_file,
               run_main=exp.run_main)
        exp.tasks_complete.append('Spike')
        return exp

    # Make QC folder
    spike_folder = make_folder(f'{exp.scratch}spike/')
    output('Processing samples with drosophila-spike in chromatin.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    for sample in exp.spike_samples:
        bam = exp.sample_files[sample]['bam']

        spike_command = [
            submission_prepend(),
            f'samtools view -b -f 4 {bam} | samtools sort -n - | samtools fastq - > {spike_folder}{sample}.bwa_unaligned.fastq',
            f'bowtie2 -p 8 -x {exp.genome_indicies["spike_index"]} -U {spike_folder}{sample}.bwa_unaligned.fastq -S {spike_folder}{sample}.BDGP6.sam --very-sensitive-local -k 1 --no-unal',
            f'samtools view -b -F 4 {spike_folder}{sample}.BDGP6.sam | samtools sort - > {spike_folder}{sample}.BDGP6.bam',
            f'picard MarkDuplicates I={spike_folder}{sample}.BDGP6.bam O={spike_folder}{sample}.BDGP6.nodup.bam M={spike_folder}{sample}.BDGP6.nodups.markdups.qc ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true',
            f'samtools flagstat {spike_folder}{sample}.BDGP6.nodup.bam > {spike_folder}{sample}.unique_drosophila.flagstat.qc',
            f'rm {spike_folder}{sample}.BDGP6.sam {spike_folder}{sample}.BDGP6.nodup.bam {spike_folder}{sample}*.fastq'
        ]

        exp.job_id.append(
            send_job(command_list=spike_command,
                     job_name=f"{sample}_spike",
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=10000,
                     log_file=exp.log_file,
                     project=exp.project,
                     cores=2,
                     run_main=exp.run_main))

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file, exp.run_main)

    spike_reads = pd.DataFrame(index=['spike_reads', 'genome_reads'])

    for sample in exp.spike_samples:
        qc_file = f'{spike_folder}{sample}.unique_drosophila.flagstat.qc'
        exp.sample_files[sample]['drosophila'] = qc_file

        with open(qc_file, 'r') as fp:
            spike_number = fp.read().split(' ')[0]

        with open(exp.sample_files[sample]['nodup_flagstat']) as fp:
            target_number = fp.read().split(' ')[0]

        spike_reads[sample] = [spike_number, target_number]

    exp.spike_reads = spike_reads.T
    condition_dict = pd.Series(exp.sample_df.Condition.values,
                               index=exp.sample_df.Sample_Name).to_dict()

    exp.spike_reads['Replicate'] = [
        x.split('_')[-1] for x in exp.spike_reads.index.tolist()
    ]
    exp.spike_reads['Condition'] = [
        condition_dict[x] for x in exp.spike_reads.index.tolist()
    ]

    for name, spike_conditions in exp.spike_comparisons.items():
        out_dir = make_folder(f'{exp.scratch}spike/{name}/')
        plot = spike_in_plot(exp.spike_reads, spike_conditions, name, out_dir)
        out_result(plot,
                   f'{name.replace("_", " ")} Spike-In Comparison',
                   run_main=exp.run_main)
        output(
            f'Spike-in comparison {name.replace("_", " ")} can be found here: {plot.replace(exp.scratch, "")}',
            log_file=exp.log_file,
            run_main=exp.run_main)

    output(f'Spike-in counts:\n {spike_reads.T}',
           log_file=exp.log_file,
           run_main=exp.run_main)

    output('Spike-in alignment jobs finished.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    # Generate one dataframe for all spike_counts

    output(
        f"Spike-in processing complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n",
        log_file=exp.log_file,
        run_main=exp.run_main)

    exp.tasks_complete.append('Spike')
    return exp
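
A common downstream use of exp.spike_reads (not part of this function) is a per-sample scaling factor inversely proportional to the drosophila read count; a hedged sketch with made-up counts:

import pandas as pd

spike_reads = pd.DataFrame({'spike_reads': [120000, 80000],
                            'genome_reads': [25000000, 24000000]},
                           index=['sampleA_1', 'sampleB_1'])
# one convention: scale every sample to the least-spiked library
spike_reads['scale_factor'] = spike_reads['spike_reads'].min() / spike_reads['spike_reads']
print(spike_reads)
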
Example #15
def annotate_peaks(dict_of_dfs,
                   folder,
                   genome,
                   log_file,
                   db='UCSC',
                   check=False,
                   run_main=False):
    '''
    Annotate a dictionary of dataframes from bed files to the genome using ChIPseeker and Ensembl annotations.

    Inputs
    ------
    dict_of_dfs: dictionary of dataframes from bed files
    folder: output folder
    genome: hg38, hg19, mm10
    log_file: log file
    db: default UCSC, but can also accept Ensembl
    check: bool. checks whether annotation file already exists
    run_main: whether called from the main pipeline

    Returns
    -------
    dictionary of annotated bed files as dataframe

    '''
    pandas2ri.activate()

    ri.set_writeconsole_regular(rout_write)
    ri.set_writeconsole_warnerror(rout_write)

    folder = make_folder(folder)

    chipseeker = importr('ChIPseeker')
    genomicFeatures = importr('GenomicFeatures')
    makeGR = ro.r("makeGRangesFromDataFrame")

    check_df = {
        key: os.path.isfile(f'{folder}{key.replace(" ", "_")}_annotated.xlsx')
        for key in dict_of_dfs.keys()
    }
    return_bool = False not in set(check_df.values())
    if return_bool & check:
        return {
            f'{key}_annotated':
            pd.read_excel(f'{folder}{key.replace(" ", "_")}_annotated.xlsx')
            for key in dict_of_dfs.keys()
        }

    if db.lower() == 'ucsc':
        species = ('Mmusculus' if genome.lower() == 'mm10' else 'Hsapiens')
        TxDb = importr(f'TxDb.{species}.UCSC.{genome.lower()}.knownGene')
        txdb = ro.r(f'txdb <- TxDb.{species}.UCSC.{genome.lower()}.knownGene')
    elif db.lower() == 'ensembl':
        pwd = 'todo'
        loadDb = ro.r('loadDb')
        txdb = loadDb(pwd.format(genome.lower()))
    else:
        raise ValueError('UCSC or Ensembl only.')

    if genome.lower() == 'mm10':
        annoDb = importr('org.Mm.eg.db')
        anno = 'org.Mm.eg.db'
    elif genome.lower() in ('hg38', 'hg19'):
        annoDb = importr('org.Hs.eg.db')
        anno = 'org.Hs.eg.db'

    return_dict = {}

    output('Annotating Peaks...', log_file=log_file, run_main=run_main)
    for key, df in dict_of_dfs.items():
        if check and check_df[key]:
            return_dict[f'{key}_annotated'] = pd.read_excel(
                f'{folder}{key.replace(" ", "_")}_annotated.xlsx')
        else:
            col_len = len(df.columns)
            df.columns = ["chr", "start", "end"] + list(range(col_len - 3))
            GR = makeGR(df)
            GR_anno = chipseeker.annotatePeak(GR,
                                              overlap='all',
                                              TxDb=txdb,
                                              annoDb=anno)
            return_dict[f'{key}_annotated'] = ro.pandas2ri.ri2py(
                chipseeker.as_data_frame_csAnno(GR_anno))
            return_dict[f'{key}_annotated'].to_excel(
                f'{folder}{key.replace(" ", "_")}_annotated.xlsx', index=None)

    return return_dict
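
A usage sketch for annotate_peaks; the dataframe is hypothetical, and any columns past the first three are renamed to integer placeholders before conversion to a GRanges object:

import pandas as pd

peaks = pd.DataFrame({'chr': ['chr1', 'chr2'],
                      'start': [100100, 220500],
                      'end': [100600, 221100]})
annotated = annotate_peaks({'WT_peaks': peaks},
                           './Annotated/',
                           'hg38',
                           log_file='pipeline.log')['WT_peaks_annotated']
# annotated.SYMBOL then feeds enrichment, as in annotation() above
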
Example #16
def overlaps(exp):
    '''
    Performs overlaps of two or more de_sig lists.
    '''

    out_dir = make_folder(f'{exp.scratch}Overlaps/')

    for comparison, overlap_list in exp.overlaps.items():
        comp_dir = make_folder(f'{out_dir}{comparison}_Overlap/')
        peakset = 'overlap_peak' if 'none' in [
            exp.sample_files[condition]['idr_optimal_peak']
            for condition in overlap_list
        ] else 'idr_optimal_peak'

        if (peakset == 'overlap_peak') and ('none' in [
                exp.sample_files[condition]['overlap_peak']
                for condition in overlap_list
        ]):
            output(
                f'ENCODE processing did not finish for at least one of the samples in the comparison {comparison}.  Skipping overlap...',
                log_file=exp.log_file,
                run_main=exp.run_main)
            with open(f'{comp_dir}SKIPPING_OVERLAP.txt', 'w') as file:
                file.write(
                    'Cannot find the peaks for at least one sample.  Skipping overlap...'
                )
            continue

        for condition in overlap_list:
            exp.sample_files[condition]['peaktype'] = peakset

        bed_dict = {
            condition: load_bedtool(exp.sample_files[condition][peakset])
            for condition in overlap_list
        }

        genome_list = exp.IPs.loc[exp.IPs['Condition'].isin(overlap_list),
                                  'Genome'].unique().tolist()
        if len(genome_list) > 1:
            output(
                f'Cannot overlap peaks from different genomes for {comparison}.',
                log_file=exp.log_file,
                run_main=exp.run_main)
            with open(f'{comp_dir}SKIPPING_OVERLAP.txt', 'w') as file:
                file.write(
                    'Cannot overlap peaks from different genomes for this condition.'
                )
            continue
        else:
            genome = genome_list[0]

        if len(overlap_list) == 2:
            exp.overlap_results[comparison] = overlap_two(
                bed_dict,
                comparison,
                comp_dir,
                exp.log_file,
                genome=genome,
                run_main=exp.run_main)
        elif len(overlap_list) == 3:
            exp.overlap_results[comparison] = overlap_three(
                bed_dict,
                comparison,
                comp_dir,
                exp.log_file,
                genome=genome,
                run_main=exp.run_main)
        else:
            output(f'Cannot overlap more than three samples for {comparison}.',
                   log_file=exp.log_file,
                   run_main=exp.run_main)
            with open(f'{comp_dir}SKIPPING_OVERLAP.txt', 'w') as file:
                file.write('Cannot overlap more than three samples.')
            continue

    output(f'Overlap analysis complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    return exp
Example #17
def overlap_two(bed_dict,
                overlap_name,
                out_folder,
                log_file,
                genome=None,
                run_main=False):
    '''
    Takes a dictionary of two bed-like format files.
    Merges all overlapping peaks for each bed into a master file.
    Intersects beds to merged master file.
    Performs annotations with ChIPseeker if genome is specified.
    Plots venn diagrams of peak overlaps
    If genome is specified, also plots venn diagrams of annotated gene sets.

    Inputs
    ------
    bed_dict:  dictionary of BedTool files
    genome: 'hg38','hg19','mm10'

    Returns
    -------
    Returns a dictionary of dataframes from unique and overlap peaks.
    If genome is specified, includes a dictionary of annotated peaks.
    '''

    names = list(bed_dict.keys())

    out_folder = make_folder(out_folder)

    output(f'Output files for {overlap_name} are found in {out_folder}',
           log_file=log_file,
           run_main=run_main)

    masterfile = bed_dict[names[0]].cat(bed_dict[names[1]]).sort().merge()
    sorted_dict = {key: bed.sort().merge() for key, bed in bed_dict.items()}
    overlap_dict = {
        'overlap':
        masterfile.intersect(sorted_dict[names[0]]).intersect(
            sorted_dict[names[1]])
    }
    for key, bed in sorted_dict.items():
        other = {
            other_key: other_bed
            for other_key, other_bed in sorted_dict.items() if other_key != key
        }
        overlap_dict[f'{key}_unique_peak'] = masterfile.intersect(
            sorted_dict[key]).intersect(list(other.values())[0], v=True)

    for key, bed in overlap_dict.items():
        if len(bed) == 0:
            open(
                f'{out_folder}{key.replace(" ", "_")}-unique-peaks-from-mergedPeaks.bed',
                'w').close()  # Can't convert empty bed file to dataframe
        else:
            bed2df(bed).to_csv(
                f'{out_folder}{key.replace(" ", "_")}-unique-peaks-from-mergedPeaks.bed',
                header=None,
                index=None,
                sep="\t")

    overlap_numbers = pd.Series(
        {
            names[0]: len(overlap_dict[f'{names[0]}_unique_peak']),
            names[1]: len(overlap_dict[f'{names[1]}_unique_peak']),
            'overlap': len(overlap_dict['overlap'])
        },
        index=[names[0], names[1], 'overlap'])

    # Venn
    plot_venn2(overlap_numbers, overlap_name.replace('_', ' '), out_folder)
    out_result(
        f'{out_folder}venn_plot/{overlap_name.replace(" ","_")}-overlap.png',
        f"{overlap_name.replace('_',' ')} Peak Venn Overlap",
        run_main=run_main)

    if bool(genome):
        # output(f'Annotating overlaping peaks for {overlap_name.replace("_"," ")}...', log_file)
        # Annotate with ChIPseeker
        unikey = '{}_unique'
        unianno = '{}_unique_annotated'
        return_dict = annotate_peaks(
            {
                unikey.format(key): bed2df(bed)
                for key, bed in overlap_dict.items() if len(bed) > 0
            },
            out_folder,
            genome=genome,
            log_file=log_file,
            run_main=run_main)
        for key, bed in overlap_dict.items():
            if len(bed) == 0:
                return_dict[unianno.format(key)] = None

        Set1_unique = set() if return_dict[unianno.format(
            f'{names[0]}_unique_peak')] is None else set(
                return_dict[unianno.format(
                    f'{names[0]}_unique_peak')].SYMBOL.unique().tolist())
        Set2_unique = set() if return_dict[unianno.format(
            f'{names[1]}_unique_peak')] is None else set(
                return_dict[unianno.format(
                    f'{names[1]}_unique_peak')].SYMBOL.unique().tolist())
        Overlap_Set = set(
        ) if return_dict[unianno.format('overlap')] is None else set(
            return_dict[unianno.format('overlap')].SYMBOL.unique().tolist())

        venn2_dict = {
            names[0]: (Set1_unique | Overlap_Set),
            names[1]: (Set2_unique | Overlap_Set)
        }

        plot_name = f'{overlap_name.replace("_"," ")} Annotated Gene'
        plot_venn2_set(venn2_dict, plot_name, out_folder)
        out_result(
            f'{out_folder}venn_plot/{plot_name.replace(" ","_")}-overlap.png',
            f"{overlap_name.replace('_',' ')} Venn Annotated Gene Overlap",
            run_main=run_main)

        gene_overlaps = {}
        gene_overlaps[f'{names[0]}_unique_genes'] = Set1_unique - (
            Set2_unique | Overlap_Set)
        gene_overlaps[f'{names[1]}_unique_genes'] = Set2_unique - (
            Set1_unique | Overlap_Set)
        gene_overlaps['Overlap_Gene_Set'] = (Set1_unique
                                             & Set2_unique) | Overlap_Set

        for key, bed in overlap_dict.items():
            return_dict[key] = bed2df(bed)

        for key, item in gene_overlaps.items():
            return_dict[key] = item

    else:
        return_dict = {key: bed2df(bed) for key, bed in overlap_dict.items()}

    return return_dict
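
A usage sketch for overlap_two; assuming the inputs are pybedtools BedTool objects (as the load_bedtool calls in overlaps() suggest) and hypothetical paths:

from pybedtools import BedTool

beds = {'WT': BedTool('WT_peaks.bed'), 'KO': BedTool('KO_peaks.bed')}
results = overlap_two(beds,
                      'WT_vs_KO',
                      './Overlaps/WT_vs_KO/',
                      log_file='pipeline.log',
                      genome='hg38')
# results holds unique/overlap peak dataframes plus gene-set overlaps when genome is given
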
Example #18
def encode3(exp):

    if 'Stage' not in exp.tasks_complete:
        output('Files not staged.\n', log_file=exp.log_file)
        exp = stage(exp)

    output('Running alignment and peak calling using ENCODE3 standards.',
           log_file=exp.log_file,
           run_main=exp.run_main)
    output('ENCODE3 cromwell pipeline.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    out_dir = make_folder(f'{exp.scratch}ENCODE3/')

    IPs = exp.IPs

    end_types = {'q.gz': 'fastq', '.bam': 'bam'}

    for experiment in IPs.Condition.unique().tolist():

        exp_dir = make_folder(f'{out_dir}{experiment}/')

        IP_sample_indicies = list(
            enumerate(IPs[IPs.Condition == experiment].index.tolist(),
                      start=1))

        if len(IP_sample_indicies) > 6:
            raise IOError('Pipeline cannot handle more than 6 replicates.')

        seq_type = 'none' not in IPs[IPs.Condition ==
                                     experiment]['File2'].tolist()

        aligner = IPs[IPs.Condition == experiment]['Aligner'].unique().tolist()
        if len(aligner) != 1:
            raise IOError(
                'All replicates must be aligned using the same aligner or not, which must be specified.'
            )
        else:
            aligner = aligner[0]

        peak_caller = IPs[IPs.Condition ==
                          experiment]['Peak Caller'].unique().tolist()
        if len(peak_caller) != 1:
            raise IOError(
                'All replicates peaks must be called or not using the same peak calling strategy.'
            )
        else:
            peak_caller = peak_caller[0]

        UMI_list = [
            x.lower()
            for x in IPs[IPs.Condition == experiment]['UMI'].unique().tolist()
        ]
        if len(set(UMI_list)) > 1:
            raise IOError(
                'All samples must be UMI processed or not for each condition.')
        UMI = UMI_list[0] == 'yes'

        try:
            file_type = end_types[exp.sample_df[exp.sample_df.Condition ==
                                                experiment]
                                  ['Scratch_File1'].tolist()[0][-4:]]
        except KeyError:
            output(
                f"{exp.sample_df[exp.sample_df.Condition == experiment]['Scratch_File1'].tolist()[0]} not a valid file type for this pipeline.",
                log_file=exp.log_file,
                run_main=exp.run_main)
            raise

        if UMI and 'UMI' in exp.tasks_complete:
            file_type = 'bam'

        genome = IPs[IPs.Condition == experiment]['Genome'].unique().tolist()
        if len(genome) > 1:
            raise IOError(
                'Cannot align to more than one genome per condition.')

        chip_type = IPs[IPs.Condition ==
                        experiment]['ChIP Type'].unique().tolist()
        if len(chip_type) > 1:
            raise IOError(
                'Cannot have more than one chip type (histone or TF) for a condition.'
            )
        chip_type = 'histone' if chip_type[0].lower() == 'histone' else 'tf'

        json_file = {
            'chip.pipeline_type': chip_type,
            'chip.paired_end': seq_type,
            'chip.genome_tsv': exp.genome_indicies['encode_tsv'][genome[0]],
            'chip.align_mem_mb': 30000,
            "chip.true_rep_only": False,
            "chip.dup_marker": "picard",
            "chip.mapq_thresh": 30,
            "chip.filter_chrs": ["chrM"],
            "chip.subsample_reads": 0,
            "chip.ctl_subsample_reads": 0,
            "chip.xcor_subsample_reads": 15000000,
            "chip.always_use_pooled_ctl": False,
            "chip.ctl_depth_ratio": 1.2,
            "chip.cap_num_peak_macs2": 500000,
            "chip.pval_thresh": 0.01,
            "chip.idr_thresh": 0.05,
            "chip.align_cpu": 4,
            "chip.align_time_hr": 48,
            "chip.filter_cpu": 2,
            "chip.filter_mem_mb": 20000,
            "chip.filter_time_hr": 24,
            "chip.bam2ta_cpu": 2,
            "chip.bam2ta_mem_mb": 10000,
            "chip.bam2ta_time_hr": 6,
            "chip.jsd_cpu": 2,
            "chip.jsd_mem_mb": 12000,
            "chip.jsd_time_hr": 6,
            "chip.xcor_cpu": 2,
            "chip.xcor_mem_mb": 16000,
            "chip.xcor_time_hr": 24,
            "chip.align_time_hr": 24,
            "chip.spr_mem_mb": 16000,
            "chip.enable_count_signal_track": True,
        }

        if peak_caller == 'macs2':
            json_file['chip.peak_caller'] = 'macs2'

        if aligner != 'none':
            json_file['chip.aligner'] = aligner

        bams = []
        ctl_bams = []

        for rep, index in IP_sample_indicies:
            sample = exp.sample_df.loc[index, 'Sample_Name']
            input_sample = IPs.loc[index, 'Background_Name']

            if file_type == 'fastq':
                json_file[f'chip.fastqs_rep{rep}_R1'] = [
                    f'{exp.data_folder}{sample}_trim_R1.fastq.gz'
                ]
                json_file[f'chip.ctl_fastqs_rep{rep}_R1'] = [
                    f'{exp.data_folder}{input_sample}_trim_R1.fastq.gz'
                ]
                if seq_type:
                    json_file[f'chip.fastqs_rep{rep}_R2'] = [
                        f'{exp.data_folder}{sample}_trim_R2.fastq.gz'
                    ]
                    json_file[f'chip.ctl_fastqs_rep{rep}_R2'] = [
                        f'{exp.data_folder}{input_sample}_trim_R2.fastq.gz'
                    ]
            else:
                bams.append(f'{exp.data_folder}{sample}.bam')
                ctl_bams.append(f'{exp.data_folder}{input_sample}.bam')

        if file_type == 'bam':
            json_file['chip.bams'] = bams
            json_file['chip.ctl_bams'] = ctl_bams

        json_file['chip.align_only'] = ((UMI and file_type == 'fastq')
                                        or peak_caller == 'none')

        json_file['chip.no_dup_removal'] = UMI
        json_file['chip.title'] = (f'{experiment}_postUMI_dedup'
                                   if UMI and file_type == 'bam'
                                   else experiment)
        json_file[
            "chip.description"] = f"Cromwell ENCODE3 {experiment}: {'paired-end' if seq_type else 'single-end'} {chip_type}."

        encode_file = f'{exp_dir}{experiment}_ENCODE3.json'
        with open(encode_file, 'w') as file:
            json.dump(json_file, file, indent=4, sort_keys=True)

        pythonpath = shutil.which('python')
        miniconda = [x for x in pythonpath.split('/') if 'miniconda' in x][0]
        cromwell_jar = re.sub(
            r'{}/.*'.format(miniconda),
            '{}/envs/chrome_chip/share/cromwell/cromwell.jar'.format(
                miniconda), pythonpath)
        jar = cromwell_jar if os.path.isfile(
            cromwell_jar
        ) else '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'

        command_list = [
            submission_prepend(source='encode-chip-seq-pipeline'),
            f'cd {exp_dir}',
            f'java -jar -Dconfig.file={exp.encode3_folder}backends/backend.conf -Dbackend.default=Local {jar} run {exp.encode3_folder}chip.wdl -i {encode_file}'
        ]

        sent_job = send_job(command_list=command_list,
                            job_name=f"{experiment}_ENCODE3",
                            job_log_folder=exp.job_folder,
                            q='bigmem',
                            mem=35000,
                            log_file=exp.log_file,
                            project=exp.project,
                            cores=1,
                            run_main=exp.run_main)

        exp.job_id.append(sent_job)
        job_pending(sent_job, exp.log_file)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # Check fraglength and resubmit with set 200 fraglen for macs2 if xcor error
    for experiment in exp.IPs.Condition.unique().tolist():
        rep_number = len(exp.IPs[exp.IPs.Condition == experiment])

        frag_list = []

        for rep in range(rep_number):
            file = glob_check(
                f'{exp.scratch}ENCODE3/{experiment}/cromwell-executions/chip/*/call-xcor/shard-{rep}/execution/*fraglen.txt'
            )
            with open(file, 'r') as f:
                frag_list.append(f.read().split()[0])

        if '-' in [x[0] for x in frag_list]:
            output(
                f'Xcor failed for {experiment}.  Resubmitting with fragment length set to 200 for failed sample/s',
                log_file=exp.log_file,
                run_main=exp.run_main)

            frag_list = [x if x[0] != '-' else '200' for x in frag_list]
            exp_dir = f'{exp.scratch}ENCODE3/{experiment}/'
            encode_file = f'{exp_dir}{experiment}_ENCODE3.json'

            with open(encode_file, 'r') as file:
                json_file = json.load(file)

            json_file["chip.fraglen"] = frag_list

            resubmit_file = f'{exp_dir}{experiment}_ENCODE3_setfraglength.json'
            with open(resubmit_file, 'w') as file:
                json.dump(json_file, file, indent=4, sort_keys=True)

            pythonpath = shutil.which('python')
            miniconda = [
                x for x in pythonpath.split('/') if 'miniconda' in x
            ][0]
            cromwell_jar = re.sub(
                r'{}/.*'.format(miniconda),
                '{}/envs/chrome_chip/share/cromwell/cromwell.jar'.format(
                    miniconda), pythonpath)
            jar = cromwell_jar if os.path.isfile(
                cromwell_jar
            ) else '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'

            command_list = [
                submission_prepend(source='encode-chip-seq-pipeline'),
                f'cd {exp_dir}',
                f'java -jar -Dconfig.file={exp.encode3_folder}backends/backend.conf -Dbackend.default=Local {jar} run {exp.encode3_folder}chip.wdl -i {resubmit_file}'
            ]

            sent_job = send_job(command_list=command_list,
                                job_name=f"{experiment}_ENCODE3_resubmission",
                                job_log_folder=exp.job_folder,
                                q='bigmem',
                                mem=35000,
                                log_file=exp.log_file,
                                project=exp.project,
                                cores=1,
                                run_main=exp.run_main)

            exp.job_id.append(sent_job)
            job_pending(sent_job, exp.log_file)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    exp = encode_results(exp)

    exp.tasks_complete.append('ENCODE3')

    return exp
Example #19
def overlap_three(bed_dict,
                  overlap_name,
                  out_folder,
                  log_file,
                  genome=None,
                  run_main=False):
    '''
    Takes a dictionary of three bed-like files.
    Concatenates the three beds, then merges all overlapping peaks into a master file.
    Intersects each bed with the merged master file.
    Performs annotation with ChIPseeker if a genome is specified.
    Plots venn diagrams of the peak overlaps.
    If a genome is specified, also plots venn diagrams of the annotated gene sets.

    Inputs
    ------
    bed_dict: dictionary of three named BedTool objects
    overlap_name: name used in output file names and plot titles
    out_folder: output folder
    log_file: path to the pipeline log file
    genome: 'hg38', 'hg19', or 'mm10'; enables annotation when given
    run_main: passed through to the logging and plotting helpers

    Returns
    -------
    Dictionary of dataframes of the unique and overlapping peaks.
    If a genome is specified, also includes the annotated peak dataframes.
    '''
    from collections import OrderedDict

    names = list(bed_dict.keys())

    out = make_folder(out_folder)

    output(f'Output files are found in {out}',
           log_file=log_file,
           run_main=run_main)
    output(f'A: {names[0]}, B: {names[1]}, C: {names[2]}',
           log_file=log_file,
           run_main=run_main)
    with open(f'{out}README.txt', 'w') as file:
        file.write(
            'All peaks are unique, meaning that each peak is in only one group.\n'
        )
        file.write(
            'Capital letter means this sample peak is included in the overlap.\n'
        )
        file.write(
            'Lowercase letter means the sample is excluded from the overlap.\n\n'
        )
        file.write(f'A: {names[0]}\nB: {names[1]}\nC: {names[2]}')

    master = bed_dict[names[0]].cat(bed_dict[names[1]]).cat(
        bed_dict[names[2]]).sort().merge()

    A = bed_dict[names[0]].sort().merge()
    B = bed_dict[names[1]].sort().merge()
    C = bed_dict[names[2]].sort().merge()

    sorted_dict = OrderedDict({'master': master, 'A': A, 'B': B, 'C': C})
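    # Naming: upper-case letters are beds required in the overlap, lower-case are excluded (see README above).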
    sorted_dict['Abc'] = master.intersect(A).intersect(B, v=True).intersect(
        C, v=True)
    sorted_dict['aBc'] = master.intersect(B).intersect(A, v=True).intersect(
        C, v=True)
    sorted_dict['ABc'] = master.intersect(A).intersect(B).intersect(C, v=True)
    sorted_dict['abC'] = master.intersect(C).intersect(A, v=True).intersect(
        B, v=True)
    sorted_dict['AbC'] = master.intersect(A).intersect(C).intersect(B, v=True)
    sorted_dict['aBC'] = master.intersect(B).intersect(C).intersect(A, v=True)
    sorted_dict['ABC'] = master.intersect(A).intersect(B).intersect(C)

    labTup = tuple(key for key in sorted_dict.keys())
    lenTup = tuple(len(bed) for bed in sorted_dict.values())

    output(f'{labTup}\n{lenTup}', log_file=log_file, run_main=run_main)

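    # The first four entries are master, A, B, C; the venn counts use only the seven exclusive regions.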
    plot_venn3_counts(lenTup[4:], names, f'{overlap_name} Peak', out)
    out_result(f'{out}venn_plot/{overlap_name}_Peak-overlap.png',
               f"{overlap_name} Peak Venn Overlap",
               run_main=run_main)

    for key, bed in sorted_dict.items():
        if len(bed) == 0:
            open(f'{out}{key.replace(" ", "_")}-peaks-from-mergedPeaks.bed',
                 'w').close()  # Can't convert empty bed file to dataframe
        else:
            bed2df(bed).to_csv(
                f"{out}{key.replace(' ', '_')}-peaks-from-mergedPeaks.bed",
                header=None,
                index=None,
                sep="\t")

    if genome:
        output('Annotating overlapped peaks...',
               log_file=log_file,
               run_main=run_main)
        unikey = '{}_unique'
        unianno = '{}_unique_annotated'
        return_dict = annotate_peaks(
            {
                unikey.format(key): bed2df(bed)
                for key, bed in sorted_dict.items() if len(bed) > 0
            },
            out,
            genome=genome,
            log_file=log_file,
            run_main=run_main)
        for key, bed in sorted_dict.items():
            if len(bed) == 0:
                return_dict[unianno.format(key)] = None

        Set1 = set() if return_dict[unianno.format('A')] is None else set(
            return_dict[unianno.format('A')].SYMBOL.unique().tolist())
        Set2 = set() if return_dict[unianno.format('B')] is None else set(
            return_dict[unianno.format('B')].SYMBOL.unique().tolist())
        Set3 = set() if return_dict[unianno.format('C')] is None else set(
            return_dict[unianno.format('C')].SYMBOL.unique().tolist())

        plot_venn3_set({
            names[0]: Set1,
            names[1]: Set2,
            names[2]: Set3
        }, f'{overlap_name}_annotated_genes', out)
        out_result(
            f'{out}venn_plot/{overlap_name}_annotated_genes-overlap.png',
            f"{overlap_name.replace('_',' ')} Gene Venn Overlap",
            run_main=run_main)

    return_sorted_dict = {key: bed2df(bed) for key, bed in sorted_dict.items()}

    return return_sorted_dict if genome is None else {
        **return_sorted_dict,
        **return_dict
    }
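A hypothetical call for the function above, assuming pybedtools BedTool inputs; the peak files are placeholders, and dict insertion order fixes the A/B/C labels:

from pybedtools import BedTool

beds = {
    'H3K27ac': BedTool('H3K27ac_peaks.bed'),  # placeholder peak files
    'H3K4me1': BedTool('H3K4me1_peaks.bed'),
    'H3K4me3': BedTool('H3K4me3_peaks.bed'),
}
results = overlap_three(beds, 'enhancer_marks', 'overlaps/', 'run.log',
                        genome='hg38')
# results['ABC'] holds the peaks shared by all three merged beds.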
Example #20
0
def parse_config(config_file, run_main=False):
    '''
    Parse experimental info from yaml file
    '''

    with open(config_file, 'r') as file:
        yml = yaml.safe_load(file)

    # Make a new experimental object
    exp = Experiment()

    # Project
    exp.project = yml['LSF_Project']

    # Check if running as pipeline
    exp.run_main = run_main

    # Setting Scratch folder
    exp.scratch = f'{os.getcwd()}/{yml["Name"]}_tmp/' if yml["Scratch_folder"] is None else f'{val_folder(yml["Scratch_folder"])}{yml["Name"]}/'
    os.makedirs(exp.scratch, exist_ok=True)

    # check whether experiment has been attempted
    exp.name = yml['Name']
    exp.out_dir = make_folder(f"{val_folder(yml['Output_directory'])}{exp.name}/")
    filename = f'{exp.scratch}{exp.name}_incomplete.pkl'

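    # A leftover *_incomplete.pkl means a previous run stopped part-way; resume from it unless Restart is set.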
    if os.path.isfile(filename):
        if yml['Restart'] is False:
            with open(filename, 'rb') as experiment:
                exp = pickle.load(experiment)
            os.remove(filename)

            # set new date
            exp.date = f'{datetime.now():%Y-%m-%d}'

            # For output of R logs into job_log_folder
            os.chdir(exp.job_folder)

            output(f'\n#############\nRestarting pipeline on {datetime.now():%Y-%m-%d %H:%M:%S}, from last completed step.', log_file=exp.log_file, run_main=exp.run_main)

            return exp
        else:
            os.remove(filename)

    # Passing parameters to the new object
    exp.date = f'{datetime.now():%Y-%m-%d}'

    # Log file
    exp.log_file = f'{exp.out_dir}{exp.name}-{exp.date}.log'

    output(f'Pipeline version {version()} run on {exp.date} \n', log_file=exp.log_file, run_main=run_main)
    output(f'Beginning ChIPseq Analysis: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)
    output('Reading experimental file...\n', log_file=exp.log_file, run_main=run_main)
    output(f"Pipeline output folder: {exp.out_dir}\n", log_file=exp.log_file, run_main=run_main)

    # Setting Job Folder
    exp.job_folder = f'{val_folder(exp.scratch)}logs/'
    os.makedirs(exp.job_folder, exist_ok=True)

    # Load sample info
    exp.sample_df = read_pd(yml['Sample_file'])

    # Make Sample Name
    exp.sample_df.replace([np.nan], 'none', inplace=True)
    exp.sample_df['Sample_Name'] = exp.sample_df.Condition + '_' + exp.sample_df.Replicate
    output(f'Processing samples:\n{exp.sample_df}', log_file=exp.log_file, run_main=run_main)

    # Paired
    exp.sample_df['paired'] = [x != 'none' for x in exp.sample_df.File2.tolist()]

    exp.IPs = exp.sample_df[exp.sample_df['Background Sample'] != 'none'].copy()
    sample_dict = exp.sample_df.Sample_Name.to_dict()
    exp.IPs['Background_Name'] = exp.IPs['Background Sample'].map(sample_dict)
    exp.samples = exp.IPs.Sample_Name.tolist()

    # Convert Comparisons to a column of lists, then make unique comparisons
    exp.IPs['Comparisons'] = exp.IPs.Comparisons.apply(lambda x: [x.replace(' ', '') for x in x.split(',')])
    exp.IPs['Comparison_names'] = exp.IPs[['Condition', 'Comparisons']].apply(lambda x: ['_v_'.join(sorted([x[0], y])) for y in x[1] if x[1][0] != 'none'], axis=1)
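    # e.g. Condition 'WT' with Comparisons ['KO'] gives 'KO_v_WT'; sorting makes A_v_B and B_v_A identical.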

    comparisons = []
    for comparison in exp.IPs.Comparison_names.tolist():
        comparisons += comparison
    exp.overlaps = {o_name: o_name.split('_v_') for o_name in set(comparisons)}

    # Spike-in comparisons
    exp.IPs['Spike-in Comparisons'] = exp.IPs['Spike-in Comparisons'].apply(lambda x: [x.replace(' ', '') for x in x.split(',')])
    exp.IPs['Spike_names'] = exp.IPs[['Condition', 'Spike-in Comparisons']].apply(lambda x: ['_v_'.join(sorted([x[0], y])) for y in x[1] if x[1][0] != 'none'], axis=1)

    sp_comparisons = [comparison for subls in exp.IPs.Spike_names.tolist() for comparison in subls]
    exp.spike_comparisons = {s_name: s_name.split('_v_') for s_name in set(sp_comparisons)}

    spike_samples = [condition for subls in exp.spike_comparisons.values() for condition in subls]
    exp.spike_samples = exp.IPs[exp.IPs.Condition.isin(spike_samples)].Sample_Name.tolist()


    # Lab specific files
    exp.genome_indicies['spike_index'] = yml['Spike_index']

    # Locating genome indices
    tsvs = yml['Genome_tsv'].split(',')
    genomes = ['hg38', 'hg19', 'mm10']
    for tsv in tsvs:
        glob_check(tsv)
        exp.genome_indicies['encode_tsv'] = {**exp.genome_indicies['encode_tsv'],
                                             **{genome: tsv for genome in genomes if genome in tsv}
                                             }

    exp.encode3_folder = val_folder(yml['ENCODE3_folder'])

    # Mark the experiment file as parsed
    exp._parsed = True

    output(f'Experiment file parsed: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)

    return exp
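The YAML keys read above can be collected into a minimal configuration sketch; all values are placeholders, and the sample-sheet columns listed are the ones this pipeline references:

import yaml

example_config = yaml.safe_load('''
Name: my_chipseq
LSF_Project: my_lab
Scratch_folder: /scratch/me/             # null falls back to <cwd>/<Name>_tmp/
Output_directory: /projects/me/results/
Restart: false
Sample_file: samples.xlsx                # Condition, Replicate, File2, Genome,
                                         # Background Sample, Comparisons,
                                         # Spike-in Comparisons, UMI columns
Spike_index: /ref/spike/bowtie2_index
Genome_tsv: /ref/hg38.tsv,/ref/mm10.tsv
ENCODE3_folder: /opt/encode-pipeline/
''')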
Example #21
0
def UMI(exp):

    # exp.data_type = 'bam'

    IPs = exp.IPs

    for experiment in IPs.Condition.unique().tolist():
        UMI = 'yes' in IPs[IPs.Condition == experiment]['UMI'].tolist()

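        # Any condition without UMIs ends this step early and skips deduplication for the experiment.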
        if not UMI:
            return exp

        else:
            out_dir = make_folder(f'{exp.scratch}UMI/')
            output('Deduplicating bam files using UMIs with UMI-tools.',
                   log_file=exp.log_file,
                   run_main=exp.run_main)

            for index in IPs[IPs.Condition == experiment].index.tolist():
                sample = IPs.loc[index, 'Sample_Name']
                input_sample = IPs.loc[index, 'Background_Name']

                bam = exp.sample_files[sample]['bam']
                input_bam = exp.sample_files[input_sample]['bam']
                nodup_bam = f'{out_dir}{sample}.UMI.dedup.bam'
                nodup_input = f'{out_dir}{input_sample}.UMI.dedup.bam'

                umi_string = 'umi_tools dedup --umi-separator=":" --output-stats={out_dir}{sample}.deduplicated.qc -I {inbam} -S {outbam} -L {out_dir}{sample}.UMI.log'

                paired = 'none' not in IPs[
                    IPs.Condition ==
                    experiment]['Scratch_File2'].tolist()
                if paired:
                    umi_string += ' --paired'

                command_list = [
                    submission_prepend(), f'samtools index {bam}',
                    f'samtools index {input_bam}',
                    umi_string.format(inbam=bam,
                                      outbam=nodup_bam,
                                      sample=sample,
                                      out_dir=out_dir),
                    umi_string.format(inbam=input_bam,
                                      outbam=nodup_input,
                                      sample=input_sample,
                                      out_dir=out_dir)
                ]

                exp.job_id.append(
                    send_job(command_list=command_list,
                             job_name=f"{sample}_UMI_dedup",
                             job_log_folder=exp.job_folder,
                             q='bigmem',
                             mem=40000,
                             log_file=exp.log_file,
                             project=exp.project,
                             cores=1,
                             run_main=exp.run_main))

                exp.sample_files[sample]['nodup_bam'] = nodup_bam
                exp.sample_files[input_sample]['nodup_bam'] = nodup_input

    job_wait(exp.job_id, exp.log_file)

    output(
        'Deduplication complete.  Submitting deduplicated files for the remainder of processing.',
        log_file=exp.log_file,
        run_main=exp.run_main)
    exp.tasks_complete.append('UMI')

    return encode3(exp)
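Rendered, the dedup template above produces a plain umi_tools call; a quick sketch with placeholder file names:

umi_template = ('umi_tools dedup --umi-separator=":" '
                '--output-stats={out_dir}{sample}.deduplicated.qc '
                '-I {inbam} -S {outbam} -L {out_dir}{sample}.UMI.log')
print(umi_template.format(out_dir='UMI/', sample='sample1',
                          inbam='sample1.bam',
                          outbam='UMI/sample1.UMI.dedup.bam'))
# Prints (wrapped here for readability):
# umi_tools dedup --umi-separator=":" --output-stats=UMI/sample1.deduplicated.qc
#     -I sample1.bam -S UMI/sample1.UMI.dedup.bam -L UMI/sample1.UMI.log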