Example #1
0
def annotation(exp):
    '''
    Annotate the peak file of every IP condition and run enrichr on the
    annotated gene symbols.

    For each condition without a recorded 'peaktype', 'idr_optimal_peak' is
    chosen unless it is 'none', in which case 'overlap_peak' is used.
    Conditions whose selected peak file is 'none' are skipped.

    Parameters
    ----------
    exp : Experiment
        Reads exp.scratch, exp.IPs, exp.sample_files, exp.log_file,
        exp.run_main and exp.anno_results.

    Returns
    -------
    exp
        The same object, with exp.sample_files[condition]['peaktype'] filled
        in, exp.anno_results extended by one '<condition>_annotated' entry per
        annotated condition, and 'Annotations' appended to exp.tasks_complete.
    '''

    from requests.exceptions import RetryError
    from time import sleep

    out_dir = make_folder(f'{exp.scratch}/Annotated/')

    conditions = exp.IPs['Condition'].unique().tolist()

    # Record which peak set to use for conditions that do not have one yet.
    for condition in conditions:
        files = exp.sample_files[condition]
        if 'peaktype' not in files:
            files['peaktype'] = ('overlap_peak'
                                 if files['idr_optimal_peak'] == 'none'
                                 else 'idr_optimal_peak')

    # Load the selected peak file of each condition that actually has one.
    peakfiles = {}
    for condition in conditions:
        files = exp.sample_files[condition]
        peak_path = files[files['peaktype']]
        if peak_path != 'none':
            peakfiles[condition] = read_pd(peak_path)

    for condition, file in peakfiles.items():
        genome = exp.IPs.loc[exp.IPs.Condition == condition,
                             'Genome'].unique().tolist()[0]

        cond_dir = make_folder(f'{out_dir}{condition}/')
        result_key = f'{condition}_annotated'
        anno_results = annotate_peaks(
            {condition: file},
            cond_dir,
            genome,
            db='UCSC',
            check=False,
            log_file=exp.log_file,
            run_main=exp.run_main)[result_key]
        anno_list = anno_results.SYMBOL.unique().tolist()

        try:
            # Short pause before each enrichr request.
            sleep(1)
            enrichr(anno_list,
                    f'enrichr_{condition}',
                    cond_dir,
                    scan=None,
                    max_terms=10,
                    figsize=(12, 6),
                    run_main=exp.run_main,
                    log_file=exp.log_file)
        except RetryError:
            output(
                f'No stable enrichr connection.  Skipping enrichr for {condition}.',
                log_file=exp.log_file,
                run_main=exp.run_main)

        # Store the annotation table under its own key.  Unpacking the table
        # itself with ** would merge its columns into exp.anno_results, so
        # each condition would overwrite the one before it.
        # NOTE(review): assumes anno_results is a DataFrame-like table (it is
        # accessed via .SYMBOL above) - confirm against annotate_peaks.
        exp.anno_results = {**exp.anno_results, result_key: anno_results}

    exp.tasks_complete.append('Annotations')

    return exp
Example #2
0
def final_qc(exp):
    '''
    Run MultiQC over the scratch folder and show summary QC plots.

    TODO: add preseq.

    MultiQC is only run when <scratch>QC/multiqc_data does not exist yet.
    When the MultiQC FastQC table is present, total reads and percent GC per
    sample are plotted with plot_col, and a local plots/ folder (if any) is
    copied to <scratch>QC/plots/ and then removed.

    Parameters
    ----------
    exp : Experiment
        Reads exp.scratch (expected to end with '/'), exp.sample_df,
        exp.log_file and exp.run_main.

    Returns
    -------
    exp
        The same object with 'MultiQC' appended to exp.tasks_complete.
        On failure close_out('final qc', exp) is called instead and, if it
        returns, this function returns None.
    '''
    try:
        output(f'Beginning final qc: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
               log_file=exp.log_file,
               run_main=exp.run_main)

        # Single spelling of the QC folder so every path below agrees with
        # the directory MultiQC is told to write to.
        qc_dir = f'{exp.scratch}QC/'
        multiqc_data = f'{qc_dir}multiqc_data'

        if not os.path.isdir(multiqc_data):
            os.system(f'multiqc {exp.scratch}* -o {qc_dir}')

        # Summary plots for FastQC data
        fastqc_file = f'{multiqc_data}/multiqc_fastqc.txt'
        if os.path.isfile(fastqc_file):
            gen_stats = read_pd(f'{multiqc_data}/multiqc_general_stats.txt')
            samples = exp.sample_df.Sample_Name.tolist()

            total_reads = gen_stats.loc[
                samples, 'FastQC_mqc-generalstats-fastqc-total_sequences']
            plot_col(df=total_reads / 1e6,
                     title='Total Sequencer Reads per Sample',
                     ylabel='Reads (Millions)',
                     log_file=exp.log_file,
                     run_main=exp.run_main)

            percent_gc = gen_stats.loc[
                samples, 'FastQC_mqc-generalstats-fastqc-percent_gc']
            plot_col(df=percent_gc,
                     title='Percent GC Content per Sample',
                     ylabel='Percentage of Reads with GC Content',
                     log_file=exp.log_file,
                     run_main=exp.run_main)

            if os.path.isdir('plots/'):
                copytree('plots/', f'{qc_dir}plots/')
                rmtree('plots')

        display(HTML('<h1>Final QC Summary</h1>'))

        # The report is written next to multiqc_data (the -o folder).  It has
        # to be loaded with filename=; a bare string would be rendered as
        # literal HTML text, i.e. only the path would be shown.
        # NOTE(review): assumes HTML is IPython.display.HTML - confirm import.
        report = f'{qc_dir}multiqc_report.html'
        if os.path.isfile(report):
            display(HTML(filename=report))
        else:
            output(f'MultiQC report not found: {report}',
                   log_file=exp.log_file,
                   run_main=exp.run_main)

        exp.tasks_complete.append('MultiQC')

        return exp

    except Exception:
        # Any failure in this stage is handed to the pipeline's close_out.
        close_out('final qc', exp)
Example #3
0
def _split_csv(value):
    '''Split a comma-separated string into items with every space removed.'''
    return [item.replace(' ', '') for item in value.split(',')]


def _versus_names(condition, partners):
    '''
    Build sorted '<a>_v_<b>' names pairing condition with each partner.

    Returns an empty list when partners is empty or starts with 'none'.
    '''
    if not partners or partners[0] == 'none':
        return []
    return ['_v_'.join(sorted([condition, partner])) for partner in partners]


def parse_config(config_file, run_main=False):
    '''
    Parse experimental info from a yaml file into an Experiment object.

    Keys read from the yaml file: LSF_Project, Name, Scratch_folder,
    Output_directory, Restart, Sample_file, Spike_index, Genome_tsv and
    ENCODE3_folder.  Columns read from the sample file: Condition, Replicate,
    File2, 'Background Sample', Comparisons and 'Spike-in Comparisons'.

    If '<scratch><name>_incomplete.pkl' exists and Restart is False, the
    pickled experiment is loaded, the checkpoint file is removed, and that
    experiment is returned immediately.  Otherwise any checkpoint file is
    removed and a fresh experiment is built.

    Parameters
    ----------
    config_file : str
        Path to the yaml configuration file.
    run_main : bool
        Stored on the experiment and forwarded to output().

    Returns
    -------
    Experiment
        The restored or newly populated experiment.
    '''

    with open(config_file, 'r') as file:
        yml = yaml.safe_load(file)

    # Make a new experimental object
    exp = Experiment()

    # Project
    exp.project = yml['LSF_Project']

    # Check if running as pipeline
    exp.run_main = run_main

    # Setting Scratch folder: default to '<cwd>/<Name>_tmp/' when none given
    if yml['Scratch_folder'] is None:
        exp.scratch = f'{os.getcwd()}/{yml["Name"]}_tmp/'
    else:
        exp.scratch = f'{val_folder(yml["Scratch_folder"])}{yml["Name"]}/'
    os.makedirs(exp.scratch, exist_ok=True)

    # Check whether the experiment has been attempted before
    exp.name = yml['Name']
    exp.out_dir = make_folder(f"{val_folder(yml['Output_directory'])}{exp.name}/")
    filename = f'{exp.scratch}{exp.name}_incomplete.pkl'

    if os.path.isfile(filename):
        if yml['Restart'] is False:
            # NOTE(review): pickle.load can execute arbitrary code; this is
            # only safe while the scratch folder is trusted.
            with open(filename, 'rb') as experiment:
                exp = pickle.load(experiment)
            os.remove(filename)

            # set new date
            exp.date = f'{datetime.now():%Y-%m-%d}'

            # For output of R logs into job_log_folder
            os.chdir(exp.job_folder)

            output(f'\n#############\nRestarting pipeline on {datetime.now():%Y-%m-%d %H:%M:%S}, from last completed step.', log_file=exp.log_file, run_main=exp.run_main)

            return exp

        os.remove(filename)

    # Passing parameters to new object
    exp.date = f'{datetime.now():%Y-%m-%d}'

    # Log file
    exp.log_file = f'{exp.out_dir}{exp.name}-{exp.date}.log'

    output(f'Pipeline version {version()} run on {exp.date} \n', log_file=exp.log_file, run_main=run_main)
    output(f'Beginning ChIPseq Analysis: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)
    output('Reading experimental file...\n', log_file=exp.log_file, run_main=run_main)
    output(f"Pipeline output folder: {exp.out_dir}\n", log_file=exp.log_file, run_main=run_main)

    # Setting Job Folder
    exp.job_folder = f'{val_folder(exp.scratch)}logs/'
    os.makedirs(exp.job_folder, exist_ok=True)

    # Load sample info
    exp.sample_df = read_pd(yml['Sample_file'])

    # Make Sample Name (missing values become the string 'none' first)
    exp.sample_df.replace([np.nan], 'none', inplace=True)
    exp.sample_df['Sample_Name'] = exp.sample_df.Condition + '_' + exp.sample_df.Replicate
    output(f'Processing samples:\n{exp.sample_df}', log_file=exp.log_file, run_main=run_main)

    # Paired: a sample is paired when it has a second file
    exp.sample_df['paired'] = [x != 'none' for x in exp.sample_df.File2.tolist()]

    # IPs are the samples that name a background sample
    exp.IPs = exp.sample_df[exp.sample_df['Background Sample'] != 'none'].copy()
    sample_dict = exp.sample_df.Sample_Name.to_dict()
    exp.IPs['Background_Name'] = exp.IPs['Background Sample'].map(sample_dict)
    exp.samples = exp.IPs.Sample_Name.tolist()

    # Convert Comparisons to a column of lists, then make unique comparisons.
    # Rows are read by column label: positional access on a label-indexed
    # Series is deprecated in recent pandas.
    exp.IPs['Comparisons'] = exp.IPs.Comparisons.apply(_split_csv)
    exp.IPs['Comparison_names'] = exp.IPs[['Condition', 'Comparisons']].apply(
        lambda row: _versus_names(row['Condition'], row['Comparisons']), axis=1)

    comparisons = [name for names in exp.IPs.Comparison_names.tolist() for name in names]
    exp.overlaps = {o_name: o_name.split('_v_') for o_name in set(comparisons)}

    # Spike-in comparisons
    exp.IPs['Spike-in Comparisons'] = exp.IPs['Spike-in Comparisons'].apply(_split_csv)
    exp.IPs['Spike_names'] = exp.IPs[['Condition', 'Spike-in Comparisons']].apply(
        lambda row: _versus_names(row['Condition'], row['Spike-in Comparisons']), axis=1)

    sp_comparisons = [name for names in exp.IPs.Spike_names.tolist() for name in names]
    exp.spike_comparisons = {s_name: s_name.split('_v_') for s_name in set(sp_comparisons)}

    spike_samples = [condition for pair in exp.spike_comparisons.values() for condition in pair]
    exp.spike_samples = exp.IPs[exp.IPs.Condition.isin(spike_samples)].Sample_Name.tolist()

    # Lab specific files
    exp.genome_indicies['spike_index'] = yml['Spike_index']

    # Locating genome indicies: each tsv is registered for every known genome
    # name that appears in its path.  Entries are stripped so that
    # 'a.tsv, b.tsv' is handled the same as 'a.tsv,b.tsv'.
    genomes = ['hg38', 'hg19', 'mm10']
    for tsv in (entry.strip() for entry in yml['Genome_tsv'].split(',')):
        glob_check(tsv)
        exp.genome_indicies['encode_tsv'] = {
            **exp.genome_indicies['encode_tsv'],
            **{genome: tsv for genome in genomes if genome in tsv},
        }

    exp.encode3_folder = val_folder(yml['ENCODE3_folder'])

    # Mark the experiment as parsed
    exp._parsed = True

    output(f'Experiment file parsed: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)

    return exp