Example #1
0
def extract_AQUAS_report_data(base_folder,
                              out_folder='',
                              histone=False,
                              replicate=False):
    '''
    Scrape QC metrics out of AQUAS ChIPseq pipeline HTML reports and plot them.

    Inputs
    -----
    base_folder:  AQUAS results folder.  Will use subfolders for sample name and look for report in those subfolders.
    out_folder:   folder where the per-metric QC plots are written.
    histone:      if True, skip the IDR metrics (FrIP_IDR, N_IDR_peaks), which
                  are not present in histone-mark reports.
    replicate:    Whether the ChIPseq was performed as a replicate or not.
                  Replicates are not supported yet and raise AssertionError.

    Returns
    -----
    DataFrame of results (metrics as index, one column per sample)
    '''

    # Fail fast before touching the filesystem.
    if replicate:
        raise AssertionError('Not set up for replicates yet.')

    reports = glob.glob(f'{base_folder}/*/*report.html')
    out_folder = val_folder(out_folder)
    base_folder = val_folder(base_folder)

    metrics = [
        'Percent_mapped', 'Mapped_Reads', 'Fraction_Duplicated',
        'S_JS_Distance', 'PBC1', 'RSC', 'Raw_Peak_Number',
        'N_optimal_overlap_peaks', 'FrIP_IDR', 'N_IDR_peaks'
    ]
    results_df = pd.DataFrame(index=metrics)

    for file in reports:
        # Sample name comes from the report's parent-folder file prefix.
        name = re.findall(r'.*/(.*)_report.html', file)[0]
        report = pd.read_html(file)

        # Explicit dtype: constructing an empty Series without one is
        # deprecated (removed in pandas 2.x).
        series = pd.Series(dtype=object)

        # Fixed (table, row, col) positions within the AQUAS report layout.
        series['Percent_mapped'] = report[1].iloc[7, 1]
        series['Mapped_Reads'] = report[2].iloc[5, 1]
        series['Fraction_Duplicated'] = report[3].iloc[7, 1]
        series['S_JS_Distance'] = report[4].iloc[7, 1]
        series['PBC1'] = report[5].iloc[6, 1]
        series['RSC'] = report[6].iloc[8, 1]
        series['Raw_Peak_Number'] = report[7].iloc[0, 1]
        series['N_optimal_overlap_peaks'] = report[10].iloc[4, 1]
        if histone is False:
            series['FrIP_IDR'] = report[11].iloc[0, 1]
            series['N_IDR_peaks'] = report[12].iloc[4, 1]
        results_df[name] = series

    # One violin+swarm plot per metric.
    for index in results_df.index.tolist():
        plot_col(results_df.loc[index],
                 out=out_folder,
                 title=f'{index}',
                 ylabel=index.replace('_', ' '),
                 plot_type=['violin', 'swarm'])

    return results_df
Example #2
0
def fastq_screen(exp):
    '''
    Checks fastq files for contamination with alternative genomes using Bowtie2.

    Submits one fastq_screen cluster job per sample, waits for all jobs to
    finish, then moves the screen output files into the experiment QC folder.

    Inputs
    -----
    exp: Experiment object; reads scratch/data folders and sample_df,
         appends job ids and marks 'Fastq_screen' complete.

    Returns
    -----
    The same Experiment object, updated in place.
    '''

    output(
        f'Screening for contamination during sequencing: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
        log_file=exp.log_file,
        run_main=exp.run_main)

    # Make QC folder
    exp.qc_folder = make_folder(f'{exp.scratch}QC/')

    # Remember where we started so we can restore it at the end.
    cwd = val_folder(os.getcwd())
    os.chdir(exp.data_folder)

    # Only screen files that look like fastq (first read file per sample).
    samples = [
        file for file in exp.sample_df.Scratch_File1.tolist() if is_fastq(file)
    ]

    # Submit fastqc and fastq_screen jobs for each sample
    for sample in samples:
        command_list = [
            submission_prepend(
                f'fastq_screen --threads 4 --aligner bowtie2 {sample}')
        ]

        exp.job_id.append(
            send_job(command_list=command_list,
                     job_name=f'{sample.split("/")[-1]}_fastq_screen',
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=3000,
                     log_file=exp.log_file,
                     project=exp.project,
                     cores=2,
                     run_main=exp.run_main))
        # Stagger submissions to avoid hammering the scheduler.
        time.sleep(1)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # Move (copy then delete) screen outputs into the QC folder.
    fastqs_files = glob.glob(f'{exp.data_folder}*screen*')
    for f in fastqs_files:
        copy2(f, exp.qc_folder)
        os.remove(f)

    # Restore the working directory saved above.
    os.chdir(cwd)

    exp.tasks_complete.append('Fastq_screen')
    output(f'Screening complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    return exp
Example #3
0
def parse_config(config_file, run_main=False):
    '''
    Parse experimental info from yaml file.

    Builds a new Experiment object from the YAML config.  If a pickled
    incomplete experiment exists in the scratch folder and Restart is False,
    that experiment is reloaded and returned instead (resuming the pipeline
    from the last completed step).

    Inputs
    -----
    config_file: path to the YAML experiment configuration.
    run_main:    passed through to output() for logging behavior.

    Returns
    -----
    A populated (or resumed) Experiment object.
    '''

    with open(config_file, 'r') as file:
        yml = yaml.safe_load(file)

    # Make a new experimental object
    exp = Experiment()

    # Project
    exp.project = yml['LSF_Project']

    # Check if running as pipeline
    exp.run_main = run_main

    # Setting Scratch folder: default to ./<Name>_tmp/ when none is given.
    exp.scratch = f'{os.getcwd()}/{yml["Name"]}_tmp/' if yml["Scratch_folder"] is None else f'{val_folder(yml["Scratch_folder"])}{yml["Name"]}/'
    os.makedirs(exp.scratch, exist_ok=True)

    # check whether experiment has been attempted
    exp.name = yml['Name']
    # NOTE(review): exp.out_dir is assigned again identically further below —
    # one of the two make_folder calls looks redundant; confirm before removing.
    exp.out_dir = make_folder(f"{val_folder(yml['Output_directory'])}{exp.name}/")
    filename = f'{exp.scratch}{exp.name}_incomplete.pkl'

    if os.path.isfile(filename):
        if yml['Restart'] is False:
            # Resume: replace the fresh Experiment with the pickled one.
            with open(filename, 'rb') as experiment:
                exp = pickle.load(experiment)
            os.remove(filename)

            # set new date
            exp.date = f'{datetime.now():%Y-%m-%d}'

            # For output of R logs into job_log_folder
            os.chdir(exp.job_folder)

            output(f'\n#############\nRestarting pipeline on {datetime.now():%Y-%m-%d %H:%M:%S}, from last completed step.', log_file=exp.log_file, run_main=exp.run_main)

            return exp
        else:
            # Restart requested: discard the stale checkpoint and start over.
            os.remove(filename)

    # Passing paramters to new object
    exp.date = f'{datetime.now():%Y-%m-%d}'

    # Log file
    exp.log_file = f'{exp.out_dir}{exp.name}-{exp.date}.log'

    output(f'Pipeline version {version()} run on {exp.date} \n', log_file=exp.log_file, run_main=run_main)
    output(f'Beginning ChIPseq Analysis: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)
    output('Reading experimental file...\n', log_file=exp.log_file, run_main=run_main)
    output(f"Pipeline output folder: {exp.out_dir}\n", log_file=exp.log_file, run_main=run_main)

    # Setting Job Folder
    exp.job_folder = f'{val_folder(exp.scratch)}logs/'
    os.makedirs(exp.job_folder, exist_ok=True)

    # Load sample info
    exp.sample_df = read_pd(yml['Sample_file'])

    # Make Sample Name as Condition_Replicate; blank cells become 'none'.
    exp.sample_df.replace([np.nan], 'none', inplace=True)
    exp.sample_df['Sample_Name'] = exp.sample_df.Condition + '_' + exp.sample_df.Replicate
    output(f'Processing samples:\n{exp.sample_df}', log_file=exp.log_file, run_main=run_main)

    # Paired-end flag: a sample is paired iff it has a second file listed.
    exp.sample_df['paired'] = [x != 'none' for x in exp.sample_df.File2.tolist()]

    # IP samples are rows with a background (input/control) sample assigned.
    exp.IPs = exp.sample_df[exp.sample_df['Background Sample'] != 'none'].copy()
    sample_dict = exp.sample_df.Sample_Name.to_dict()
    exp.IPs['Background_Name'] = exp.IPs['Background Sample'].map(sample_dict)
    exp.samples = exp.IPs.Sample_Name.tolist()

    # Convert Comparisons to a column of lists, then make unique comparisons
    # (sorted '_v_' joins so A_v_B and B_v_A collapse to one name).
    exp.IPs['Comparisons'] = exp.IPs.Comparisons.apply(lambda x: [x.replace(' ', '') for x in x.split(',')])
    exp.IPs['Comparison_names'] = exp.IPs[['Condition', 'Comparisons']].apply(lambda x: ['_v_'.join(sorted([x[0], y])) for y in x[1] if x[1][0] != 'none'], axis=1)

    comparisons = []
    for comparison in exp.IPs.Comparison_names.tolist():
        comparisons += comparison
    exp.overlaps = {o_name: o_name.split('_v_') for o_name in set(comparisons)}

    # Spike-in comparisons (same normalization as above).
    exp.IPs['Spike-in Comparisons'] = exp.IPs['Spike-in Comparisons'].apply(lambda x: [x.replace(' ', '') for x in x.split(',')])
    exp.IPs['Spike_names'] = exp.IPs[['Condition', 'Spike-in Comparisons']].apply(lambda x: ['_v_'.join(sorted([x[0], y])) for y in x[1] if x[1][0] != 'none'], axis=1)

    sp_comparisons = [comparison for subls in exp.IPs.Spike_names.tolist() for comparison in subls]
    exp.spike_comparisons = {s_name: s_name.split('_v_') for s_name in set(sp_comparisons)}

    # All sample names whose condition participates in any spike-in comparison.
    spike_samples = [condition for subls in exp.spike_comparisons.values() for condition in subls]
    exp.spike_samples = exp.IPs[exp.IPs.Condition.isin(spike_samples)].Sample_Name.tolist()

    # Make out directory if it doesn't exist
    # NOTE(review): duplicate of the exp.out_dir assignment earlier in this
    # function — confirm whether both are needed.
    exp.out_dir = make_folder(f'{val_folder(yml["Output_directory"])}{exp.name}/')

    # Lab specific files
    exp.genome_indicies['spike_index'] = yml['Spike_index']

    # Locating genome indicies: map each known genome to its encode tsv when
    # the genome name appears in the tsv path.
    tsvs = yml['Genome_tsv'].split(',')
    genomes = ['hg38', 'hg19', 'mm10']
    for tsv in tsvs:
        glob_check(tsv)
        exp.genome_indicies['encode_tsv'] = {**exp.genome_indicies['encode_tsv'],
                                             **{genome: tsv for genome in genomes if genome in tsv}
                                             }

    exp.encode3_folder = val_folder(yml['ENCODE3_folder'])

    # Mark parsing complete (flag checked elsewhere in the pipeline —
    # the original "Initialized Process Complete List" comment did not match).
    exp._parsed = True

    output(f'Experiment file parsed: {datetime.now():%Y-%m-%d %H:%M:%S}\n', log_file=exp.log_file, run_main=run_main)

    return exp