Example 1
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns them into GATK and undr_rover VCFs.
    pipeline = Pipeline(name='genericpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        # One passed BAM path per line; drop empty entries from trailing newlines.
        passed_files = [line for line in inputf.read().split('\n') if line]

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
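
A minimal driver sketch, not part of the original example: it assumes `state` is the project's parsed configuration object and that the Ruffus Pipeline object returned here exposes run() (Ruffus's object-oriented interface).

def run_call_pipeline(state):
    # Hypothetical helper: build the calling pipeline and execute it with
    # Ruffus, running up to 4 jobs in parallel. The parameters are assumptions.
    pipeline = make_pipeline_call(state)
    pipeline.run(multiprocess=4, verbose=1)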
Example 2
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns them into GATK and undr_rover VCFs.
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        # One passed BAM path per line; drop empty entries from trailing newlines.
        passed_files = [line for line in inputf.read().split('\n') if line]

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the clipped, sorted, high-quality BAM file and grab the sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    #### sort and index undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
Example 3
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/out/{sample[0]}.bam')

    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    return pipeline
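
To make the formatter() filter in the fastq2bam stage above concrete, here is a small sketch of the capture groups it relies on; the input path is invented purely for illustration.

import re

# Hypothetical input path, used only to show what formatter() captures.
pattern = r'(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'
m = re.match(pattern, '/data/run1/NA12878_R1.fastq.gz')
print(m.group('path'))    # -> /data/run1   (fills {path[0]})
print(m.group('sample'))  # -> NA12878      (fills {sample[0]})
# Ruffus then expands add_inputs to /data/run1/NA12878_R2.fastq.gz and
# output to /data/run1/out/NA12878.bam for this job.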
Example 4
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    #pipeline.originate(
    #    task_func=stages.original_reference,
    #    name='original_reference',
    #    output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(task_func=stages.fastqc,
                       name='fastqc',
                       input=output_from('original_fastqs'),
                       filter=suffix('.fastq.gz'),
                       output='_fastqc')

    # Index the reference using BWA
    #pipeline.transform(
    #    task_func=stages.index_reference_bwa,
    #    name='index_reference_bwa',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #    name='index_reference_samtools',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort alignment with sambamba
    pipeline.transform(task_func=stages.sort_bam_sambamba,
                       name='sort_alignment',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.sorted.bam')

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')

    # Index the MMR genes bam file with samtools
    pipeline.transform(task_func=stages.index_bam,
                       name='index_mmr_alignment',
                       input=output_from('extract_genes_bedtools'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
                       output='{path[0]}/{sample[0]}.mmr.bam.bai')

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    #pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')

    # Generate alignment stats with bamtools
    pipeline.transform(task_func=stages.bamtools_stats,
                       name='bamtools_stats',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.stats.txt')

    # Extract the discordant paired-end alignments
    pipeline.transform(task_func=stages.extract_discordant_alignments,
                       name='extract_discordant_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.discordants.unsorted.bam')

    # Extract split-read alignments
    pipeline.transform(task_func=stages.extract_split_read_alignments,
                       name='extract_split_read_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.splitters.unsorted.bam')

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split-read alignments.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs([
            '{path[0]}/{sample[0]}.splitters.bam',
            '{path[0]}/{sample[0]}.discordants.bam'
        ]),
        output='{path[0]}/{sample[0]}.lumpy.vcf').follows('index_alignment').
     follows('sort_splitters').follows('sort_discordants'))

    # Call genotypes on lumpy output using SVTyper
    #(pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output=
        '{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}']))

    # Call DELs with DELLY
    pipeline.merge(task_func=stages.deletions_delly,
                   name='deletions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DEL.vcf')

    # Call DUPs with DELLY
    pipeline.merge(task_func=stages.duplications_delly,
                   name='duplications_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DUP.vcf')

    # Call INVs with DELLY
    pipeline.merge(task_func=stages.inversions_delly,
                   name='inversions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.INV.vcf')

    # Call TRAs with DELLY
    pipeline.merge(task_func=stages.translocations_delly,
                   name='translocations_delly',
                   input=output_from('sort_alignment'),
                   output='delly.TRA.vcf')

    # Join both read pair files using gustaf_mate_joining
    #pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    #(pipeline.transform(
    #    task_func=stages.structural_variants_pindel,
    #    name='structural_variants_pindel',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #    output='{path[0]}/{sample[0]}.pindel')
    #    .follows('index_reference_bwa')
    #    .follows('index_reference_samtools'))

    return pipeline
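
The lumpy stage above chains .follows() onto pipeline.transform. A minimal sketch of the same pattern in isolation, assuming the usual Ruffus behaviour that .follows() adds an ordering-only dependency (the named task must finish first, but its files are not passed as inputs); the task below is a placeholder, not part of the original pipeline.

# Hypothetical pattern: wait for 'index_alignment' to finish even though the
# .bai file never appears in this task's input list.
(pipeline.transform(
    task_func=stages.some_placeholder_task,   # placeholder, for illustration
    name='some_placeholder_task',
    input=output_from('sort_alignment'),
    filter=suffix('.sorted.bam'),
    output='.result.txt')
    .follows('index_alignment'))              # ordering-only dependency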
Example 5
def main():
    # Preparatory actions
    os.system("clear")
    os.chdir(".")
    Data.default_cwd = os.getcwd()
    Data.logs = []
    Data.fails = []
    Data.PROFILES_HISTORY_DIR_PATH = (os.path.expanduser("~") + os.sep +
                                      Data.PROFILES_HISTORY_DIR_PATH)

    # Getting deploy profiles from history
    deploy_profile = None
    is_hist_profile_selected = False
    if Data.USE_PROFILES_HISTORY_SEARCH:
        profiles_history = ProfilesHistory.get_profiles_paths_from_history()
        if len(profiles_history) > 0:
            print("[i] Found profiles in history:")
            print("0. Enter deploy profile path manually")
            profiles_history_counter = 0
            for one_hist_profile_path in profiles_history:
                profiles_history_counter = profiles_history_counter + 1
                print(
                    str(profiles_history_counter) + ". " +
                    one_hist_profile_path)
            selected_hist_profile_idx = input('Select variant: ')
            if selected_hist_profile_idx.isdigit():
                selected_hist_profile_idx = int(selected_hist_profile_idx)
                if len(profiles_history) >= selected_hist_profile_idx:
                    if selected_hist_profile_idx > 0:
                        is_hist_profile_selected = True
                        deploy_profile = Helpers.get_profile_file(
                            profiles_history[selected_hist_profile_idx - 1])
            else:
                exit("[X] Invalid input!")
    if not is_hist_profile_selected:
        # Reading deploy profile manually
        deploy_profile = Helpers.get_profile_file(None)
        if not deploy_profile:
            exit("[X] Invalid deploy profile or file not found!")

    start_time = datetime.datetime.now()

    # Profile validation
    profile_validation_res = Helpers.validate_profile(deploy_profile)
    if profile_validation_res != True:
        exit("[X] Profile validation error: " + str(profile_validation_res))

    # Project name
    project_name = deploy_profile["project_name"]
    print("=" * 40)
    print("[i] Selected project name: " + str(project_name))

    # Parsing profile environments
    print("[i] Found environments: " +
          str(Helpers.get_all_profile_envs(deploy_profile)))
    selected_profile_env = Helpers.select_profile_env(deploy_profile)
    if selected_profile_env == False:
        exit("[X] Unknown environment!")
    print("[i] Selected environment: " + selected_profile_env)

    # Processing stages
    print("[i] Processing stages...")
    deploy_stages = deploy_profile["environments"][selected_profile_env][
        "stages"]
    deploy_credentials = deploy_profile["environments"][selected_profile_env][
        "credentials"]
    stages_counter = 0
    for one_stage in deploy_stages:
        if one_stage["ignore"]:
            print("[i] " + str(stages_counter) + ". " +
                  str(one_stage["name"]) + " (IGNORED)")
            continue
        stages_counter = stages_counter + 1
        print("[i] " + str(stages_counter) + ". " + str(one_stage["print"]))
        result = Stages.run_stage(one_stage["name"], one_stage["details"],
                                  deploy_credentials)
        if result != True:
            print("[!] Current stage failed!\nResult:\n" + str(result) +
                  "\nContinue? (yes/no)")
            continue_or_not = input()
            if continue_or_not.replace(" ", "") != "yes":
                exit("[X] Exited with error!")

    # Saving current profile path to local history
    ProfilesHistory.save_profile_path_to_history(Data.curr_profile_path)

    end_time = datetime.datetime.now()
    print("[i] All done in " +
          str(int((end_time - start_time).total_seconds())) + " seconds!")
    print("=" * 40)

    # Slack notification
    print("Send notification slack? yes/no")
    send_notice_or_not = input()
    if send_notice_or_not.lower().replace(" ", "") == "yes":
        is_slack_bot_creds_valid = Notifications.validate_slack_bot_credentials(
            deploy_credentials)
        if is_slack_bot_creds_valid:
            print("Enter the message to send:")
            slack_msg_to_send = input()
            if slack_msg_to_send.lower().replace(" ", "") != "":
                slack_msg_to_send = (
                    "Backend update!" + "\n" + "Start time: " +
                    str(start_time) + "\n" + "End time: " + str(end_time) +
                    "\n" + "Elapsed time: " +
                    str(int((end_time - start_time).total_seconds())) +
                    " seconds\n" + "Environment: " + selected_profile_env +
                    "\n" + "Service: " + project_name + "\n" + "Message: " +
                    slack_msg_to_send)
                Notifications.send_msg_to_slack(
                    deploy_credentials["slack_bot"]["main"]["bot_token"],
                    deploy_credentials["slack_bot"]["main"]["project_channel"],
                    slack_msg_to_send,
                    deploy_credentials["slack_bot"]["main"]["icon_emoji"])
        else:
            print("[!] Invalid Slack credentials!")
Example 6
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex tumor/normal example: OHI031002-P02F04-T_S318_L001_R1_001.fastq
        # gives sample = OHI031002-P02F04 and tumor = T
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)-(?P<tumor>[TN]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),

        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04-T_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}-{tumor[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),

        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{tumor[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{sample[0]}_{tumor[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(
        task_func=stages.primary_bam,
        name='primary_bam',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        output='.primary.bam')

    # index bam file
    pipeline.transform(
        task_func=stages.index_sort_bam_picard,
        name='index_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.bam.bai')

    # Clip the primer_seq from BAM File
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
        .follows('index_bam'))

    ###### GATK VARIANT CALLING - MuTect2 ######

    # Call somatic variants using MuTect2
    pipeline.transform(
        task_func=stages.call_mutect2_gatk,
        name='call_mutect2_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+)_T.primary.primerclipped.bam'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_N.primary.primerclipped.bam'),
        # extras=['{sample[0]}'],
        output='variants/mutect2/{sample[0]}.mutect2.vcf')
        # .follows('clip_bam')

    ###### GATK VARIANT CALLING - MuTect2 ######

    # -------- VEP ----------
    # Apply NORM
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('call_mutect2_gatk'),
        filter=suffix('.mutect2.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vcf')
        .follows('call_mutect2_gatk'))
    #
    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.mutect2.vt.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vep.vcf')
        .follows('apply_vt'))
    #
    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_vep'),
        filter=suffix('.mutect2.vt.vep.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.annotated.vcf')
        .follows('apply_vep'))

    return pipeline
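
To make the tumour/normal pairing in call_mutect2_gatk above concrete, a small sketch of how the filter and add_inputs templates line up for one sample; the paths are built from the sample name used in the comments and are otherwise invented.

import re

# Hypothetical tumour BAM; only _T BAMs are matched by the MuTect2 filter.
tumour_bam = ('alignments/OHI031002-P02F04/'
              'OHI031002-P02F04_T.primary.primerclipped.bam')
m = re.match(r'.+/(?P<sample>[a-zA-Z0-9-]+)_T.primary.primerclipped.bam',
             tumour_bam)
sample = m.group('sample')              # -> 'OHI031002-P02F04'
# add_inputs then supplies the matched normal BAM for the same sample:
normal_bam = 'alignments/{0}/{0}_N.primary.primerclipped.bam'.format(sample)
# and the job writes variants/mutect2/OHI031002-P02F04.mutect2.vcf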
Example 7
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(task_func=stages.chrom_intervals_gatk,
                       name='chrom_intervals_gatk',
                       input=output_from('mark_duplicates_picard'),
                       filter=suffix('.sort.dedup.bam'),
                       output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam').follows(
            'mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(task_func=stages.base_recalibration_gatk,
                       name='base_recalibration_gatk',
                       input=output_from('local_realignment_gatk'),
                       filter=suffix('.sort.dedup.realn.bam'),
                       output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam').follows(
            'local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(task_func=stages.call_variants_gatk,
                       name='call_variants_gatk',
                       input=output_from('print_reads_gatk'),
                       filter=suffix('.sort.dedup.realn.recal.bam'),
                       output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_variants_gatk'),
                   output='COMPLEXO.mergedgvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.mergedgvcf.vcf'),
                       output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(task_func=stages.snp_recalibrate_gatk,
                       name='snp_recalibrate_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.genotyped.vcf'),
                       output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['COMPLEXO.snp_recal', 'COMPLEXO.snp_tranches']),
        output='.recal_SNP.vcf').follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['COMPLEXO.indel_recal', 'COMPLEXO.indel_tranches']),
        output='.recal_INDEL.vcf').follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['COMPLEXO.recal_INDEL.vcf']),
        output='.combined.vcf').follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(task_func=stages.select_variants_gatk,
                       name='select_variants_gatk',
                       input=output_from('combine_variants_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.selected.vcf')

    return pipeline
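
combine_gvcf_gatk above uses pipeline.merge, which collapses every upstream output into a single job. A sketch of the call shape this implies, with invented per-sample file names; the real inputs come from call_variants_gatk.

# Hypothetical: what the merge stage receives and produces for a two-sample run.
inputs = ['sampleA.raw.snps.indels.g.vcf',    # invented names; one g.vcf per
          'sampleB.raw.snps.indels.g.vcf']    # sample from call_variants_gatk
output = 'COMPLEXO.mergedgvcf.vcf'            # single combined file
# Ruffus invokes stages.combine_gvcf_gatk(inputs, output) exactly once, and
# genotype_gvcf_gatk then maps COMPLEXO.mergedgvcf.vcf -> COMPLEXO.genotyped.vcf.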
Example 8
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    pipeline.transform(task_func=stages.run_connor,
                       name='run_connor',
                       input=output_from('align_bwa'),
                       filter=suffix('.sort.hq.bam'),
                       output='.sort.hq.connor.bam')

    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])

    safe_make_dir('variants')
    safe_make_dir('variants/vardict')

    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz').follows('index_vcfs'))

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
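
The generate_stats collate stages above bundle four per-sample metric files into one summary per sample, grouped by the first capture group of the regex. A small sketch of that grouping with an invented sample name.

import re

# Hypothetical illustration of how the collate regex groups metric files.
pattern = (r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|'
           r'mapped_to_target|total_raw_reads)\.txt')
files = ['metrics/summary/sample01.bedtools_hist_all.txt',
         'metrics/summary/sample01.mapped_to_genome.txt',
         'metrics/summary/sample01.mapped_to_target.txt',
         'metrics/summary/sample01.total_raw_reads.txt']
for name in files:
    print(re.match(pattern, name).group(1))  # 'sample01' each time, so all
                                             # four files land in one job
# which writes metrics/summary/all_sample.summary.sample01.txt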
Example 9
def make_pipeline_process(state):
    # Combine the per-run GATK and VarDict outputs for joint genotyping, filtering and annotation

    # Define empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    #grab files from each of the processed directories in "runs"
    gatk_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))

    stages = Stages(state)

    #dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='processed/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    #Apply VEP
    pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf')

    ####### vardict stuff

    vardict_files = []
    for directory in run_directories:
        vardict_files.extend(
            glob.glob(directory + '/variants/vardict/*sorted.vcf.gz'))

    #dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_vardict,
                       name='glob_vardict',
                       output=vardict_files)

    safe_make_dir('processed/vardict')

    #concatenate all vardict vcfs
    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_vardict'),
                   output='processed/vardict/combined.vcf.gz')

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise_vardict',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise_vardict'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep_vardict',
        input=output_from('vt_decompose_normalise_vardict'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
Example 10
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the FASTQ files
    #fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        #filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        #add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # output only needs to know about one file to track progress of the pipeline, but the second certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
    #output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'
        ),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        #filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        #add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')

    # Run LocatIt from Agilent. Although it should produce sorted BAM files, the output is sorted again in the next step.
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')

    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')

    # # # # # Metrics stages # # # # #
    # generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Generate summary metrics from the stats files produced above
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
    # # # # # Metrics stages end # # # # #

    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt').follows('generate_stats'))

    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')

    # Touch the passed BAMs into the pass_samples folder and pass the glob of that folder to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")

    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam'))

    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}']).follows('sort_bam'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    return pipeline
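
The passed_filter_files step above uses pipeline.subdivide with a wildcard output, so a single summary file fans out into however many BAMs the project's read_samples task places in metrics/pass_samples. A brief sketch of the fan-out being assumed; the file names are invented and read_samples itself is not reimplemented here.

import glob

# Hypothetical: after read_samples has populated the folder, downstream tasks
# (call_haplotypecaller_gatk, run_vardict) each match these BAMs one by one.
passed_bams = glob.glob('metrics/pass_samples/*.bam')
# e.g. ['metrics/pass_samples/sample01.sorted.locatit.bam',
#       'metrics/pass_samples/sample02.sorted.locatit.bam']   (invented)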
Example 11
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    #
    # performs fastqc on fastq inputs
    #
    pipeline.transform(
        task_func=stages.fastqc,
        name='fastqc',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<filename>.+).fastq.gz'),
        output='{path[0]}/{filename[0]}_fastqc')

    #
    # converts the fastq inputs to pre-aligned bams
    #
    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/{sample[0]}.bam')

    #
    # validates pre-aligned bams x.bam -> x.validation
    #
    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    # aligns pre-aligned bam x.bam -> x.mapped.bam
    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    # generates stats about an aligned bam
    pipeline.transform(
        task_func=stages.align_stats_bedtools,
        name='align_stats_bedtools',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.genomecov.stats')

    # generates stats about an aligned bam
    pipeline.transform(
        task_func=stages.align_stats_picard,
        name='align_stats_picard',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.picard.stats')

    #
    # runs the Sanger variant calling pipeline
    #
    #pipeline.transform(
    #    task_func=stages.analyse_wgs,
    #    name='analyse_wgs',
    #    input=output_from('align'),
    #    filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
    #    output='{path[0]}/{sample[0]}.wgs/manifest')

    # runs the components of the Sanger variant calling pipeline
    pipeline.transform(
        task_func=stages.analyse_wgs_prepare,
        name='analyse_wgs_prepare',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.prepare')

    pipeline.transform(
        task_func=stages.analyse_wgs_reference_files,
        name='analyse_wgs_reference_files',
        input=[output_from('align'),
               output_from('analyse_wgs_prepare')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.reference_files')

    pipeline.transform(
        task_func=stages.analyse_wgs_init,
        name='analyse_wgs_init',
        input=[
            output_from('align'),
            output_from('analyse_wgs_reference_files')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.init')

    # block 1
    pipeline.transform(
        task_func=stages.analyse_wgs_verify_WT,
        name='analyse_wgs_verify_WT',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.verify_WT')

    pipeline.transform(
        task_func=stages.analyse_wgs_cgpPindel_input,
        name='analyse_wgs_cgpPindel_input',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel_input')

    pipeline.transform(
        task_func=stages.analyse_wgs_alleleCount,
        name='analyse_wgs_alleleCount',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.alleleCount')

    # block 2
    pipeline.transform(
        task_func=stages.analyse_wgs_ascat,
        name='analyse_wgs_ascat',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.ascat')

    pipeline.transform(
        task_func=stages.analyse_wgs_cgpPindel,
        name='analyse_wgs_cgpPindel',
        input=[
            output_from('align'),
            output_from('analyse_wgs_cgpPindel_input')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel')

    pipeline.transform(
        task_func=stages.analyse_wgs_BRASS_input,
        name='analyse_wgs_BRASS_input',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.BRASS_input')

    pipeline.transform(
        task_func=stages.analyse_wgs_BRASS_cover,
        name='analyse_wgs_BRASS_cover',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.BRASS_cover')

    pipeline.transform(
        task_func=stages.analyse_wgs_CaVEMan_split,
        name='analyse_wgs_CaVEMan_split',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan_split')

    # after block 2
    pipeline.transform(
        task_func=stages.analyse_wgs_ascat_prep,
        name='analyse_wgs_ascat_prep',
        input=[output_from('align'),
               output_from('analyse_wgs_ascat')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.ascat_prep')

    pipeline.transform(
        task_func=stages.analyse_wgs_pindel_prep,
        name='analyse_wgs_pindel_prep',
        input=[output_from('align'),
               output_from('analyse_wgs_cgpPindel')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.pindel_prep')

    # parallel block 3
    pipeline.transform(
        task_func=stages.analyse_wgs_verify_MT,
        name='analyse_wgs_verify_MT',
        input=[
            output_from('align'),
            output_from('analyse_wgs_verify_WT'),
            output_from('analyse_wgs_ascat_prep')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.verify_MT')

    pipeline.transform(
        task_func=stages.analyse_wgs_CaVEMan,
        name='analyse_wgs_CaVEMan',
        input=[
            output_from('align'),
            output_from('analyse_wgs_CaVEMan_split'),
            output_from('analyse_wgs_ascat_prep'),
            output_from('analyse_wgs_cgpPindel')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan')

    pipeline.transform(
        task_func=stages.analyse_wgs_BRASS,
        name='analyse_wgs_BRASS',
        input=[
            output_from('align'),
            output_from('analyse_wgs_BRASS_cover'),
            output_from('analyse_wgs_BRASS_input'),
            output_from('analyse_wgs_ascat_prep')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.BRASS')

    pipeline.transform(
        task_func=stages.analyse_wgs_cgpPindel_annot,
        name='analyse_wgs_cgpPindel_annot',
        input=[output_from('align'),
               output_from('analyse_wgs_pindel_prep')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel_annot')

    # pre block 4
    pipeline.transform(
        task_func=stages.analyse_wgs_caveman_prep,
        name='analyse_wgs_caveman_prep',
        input=[output_from('align'),
               output_from('analyse_wgs_CaVEMan')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.caveman_prep')

    # block 4
    pipeline.transform(
        task_func=stages.analyse_wgs_CaVEMan_annot,
        name='analyse_wgs_CaVEMan_annot',
        input=[output_from('align'),
               output_from('analyse_wgs_caveman_prep')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan_annot')

    # done
    pipeline.transform(
        task_func=stages.analyse_wgs_finish,
        name='analyse_wgs_finish',
        input=[
            output_from('align'),
            output_from('analyse_wgs_CaVEMan_annot'),
            output_from('analyse_wgs_BRASS'),
            output_from('analyse_wgs_cgpPindel_annot'),
            output_from('analyse_wgs_alleleCount'),
            output_from('analyse_wgs_verify_MT')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.finish')

    #
    # runs the delly singularity container
    #

    pipeline.transform(
        task_func=stages.delly,
        name='delly',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.delly.completed')

    pipeline.transform(
        task_func=stages.gridss,
        name='gridss',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.gridss.completed')

    pipeline.transform(
        task_func=stages.muse,
        name='muse',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.muse.completed')

    pipeline.transform(
        task_func=stages.mutect2,
        name='mutect2',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.mutect2.completed')

    return pipeline
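The Stages methods referenced above live in a separate module; as a rough, hypothetical sketch (the container image and command line are invented placeholders, not the project's actual code), a completion-marker task such as stages.delly might look like:

import subprocess

class Stages(object):
    def __init__(self, state):
        self.state = state

    def delly(self, bam_in, completed_out):
        # Hypothetical sketch: run delly on the mapped BAM inside a
        # singularity container, then touch a marker file so ruffus can
        # track completion of the stage.
        command = ('singularity exec delly.img '
                   'delly call -o {bam}.delly.bcf {bam}').format(bam=bam_in)
        subprocess.check_call(command, shell=True)
        open(completed_out, 'w').close()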
Example no. 12
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'
        ),

        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),

        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage to look up
        # sample-specific configuration options
        extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}_{readid[0]}/{sample[0]}_{readid[0]}.bam'
    )

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'
        ),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        extras=['{sample[0]}', '{readid[0]}'],

        # The output file name is the sample name with a .vcf extension.
        output='variants/undr_rover/{sample[0]}_{readid[0]}.vcf')

    # Sort the BAM file using Picard
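    # suffix('.bam') matches inputs ending in '.bam' and forms the output by
    # swapping that suffix, e.g. alignments/X/X.bam -> alignments/X/X.sort.bam
    # (illustrative path).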
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(task_func=stages.primary_bam,
                       name='primary_bam',
                       input=output_from('sort_bam_picard'),
                       filter=suffix('.sort.bam'),
                       output='.primary.bam')

    # index bam file
    pipeline.transform(task_func=stages.index_sort_bam_picard,
                       name='index_bam',
                       input=output_from('primary_bam'),
                       filter=suffix('.primary.bam'),
                       output='.primary.bam.bai')

    # Clip the primer_seq from BAM File
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam').follows('index_bam'))

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9_-]+).primary.primerclipped.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
    # .follows('index_sort_bam_picard'))

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_haplotypecaller_gatk'),
                   output='variants/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.annotate.vcf')

    # Apply VariantFiltration using GATK
    pipeline.transform(task_func=stages.apply_variant_filtration_gatk_lenient,
                       name='apply_variant_filtration_gatk_lenient',
                       input=output_from('variant_annotator_gatk'),
                       filter=suffix('.raw.annotate.vcf'),
                       output='.raw.annotate.filtered_lenient.vcf')

    return pipeline
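For readers unfamiliar with ruffus's formatter(), the named groups captured from the R1 file name are what fill the {sample[0]}, {readid[0]}, {lane[0]} and {lib[0]} placeholders above. A small stand-alone illustration with plain re, using the Hi-Plex file name quoted in the comments (the fastqs/ prefix is illustrative, and the substitution shown mirrors, but is not, ruffus's own mechanism):

import re

pattern = (r'.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)'
           r'_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq')
r1 = 'fastqs/OHI031002-P02F04_S318_L001_R1_001.fastq'
groups = re.match(pattern, r1).groupdict()
# groups == {'sample': 'OHI031002-P02F04', 'readid': 'S318',
#            'lane': 'L001', 'lib': '001'}
bam = 'alignments/{sample}_{readid}/{sample}_{readid}.bam'.format(**groups)
# bam == 'alignments/OHI031002-P02F04_S318/OHI031002-P02F04_S318.bam'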
Example no. 13
def make_pipeline_process(state):
    # Define empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    # Grab files from each of the processed directories in "runs"
    gatk_files = []
    undr_rover_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))
        undr_rover_files.extend(
            glob.glob(directory + '/variants/undr_rover/*sorted.vcf.gz'))

    # Stages are dependent on the state
    stages = Stages(state)

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Dummy stage to grab the undr_rover files
    pipeline.originate(task_func=stages.glob_undr_rover,
                       name='glob_undr_rover',
                       output=undr_rover_files)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')

    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_undr_rover'),
                   output='variants/undr_rover/combined_undr_rover.vcf.gz')

    pipeline.transform(task_func=stages.index_final_vcf,
                       name='index_final_vcf',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.vcf.gz.tbi')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        add_inputs=add_inputs(
            ['variants/undr_rover/combined_undr_rover.vcf.gz']),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf').follows(
            'index_final_vcf'))

    return pipeline
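safe_make_dir is a small project helper used by several of these pipelines; a plausible sketch (a guess at its behaviour, not the original implementation) is an os.makedirs call that tolerates an existing directory:

import errno
import os

def safe_make_dir(path):
    # Create the directory tree, ignoring the error raised when it already
    # exists; any other OSError is re-raised.
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise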
Example no. 14
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq.gz'
        ),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2_{lib[0]}.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage to look up
        # sample-specific configuration options
        extras=['{sample[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.clipped.sort.hq.bam')

    # generate mapping metrics.
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
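    # Illustrative grouping (sample name invented for the example): the four
    # metric files
    #   metrics/summary/sampleA.bedtools_hist_all.txt
    #   metrics/summary/sampleA.mapped_to_genome.txt
    #   metrics/summary/sampleA.mapped_to_target.txt
    #   metrics/summary/sampleA.total_raw_reads.txt
    # all share capture group \1 == 'sampleA', so collate feeds them to a
    # single generate_stats job whose output is
    # metrics/summary/all_sample.summary.sampleA.txt.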

    summary_file = 'all_sample.summary.txt'

    (pipeline.originate(task_func=stages.grab_summary_file,
                        name='grab_summary_file',
                        output=summary_file).follows('generate_stats'))
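    # grab_summary_file has no file inputs of its own; chaining
    # .follows('generate_stats') tells ruffus to run it only after the
    # per-sample stats have been collated into all_sample.summary.txt.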

    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt',
                       extras=['all_sample.failed.summary.txt'])

    return pipeline
Example no. 15
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'
        ),

        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),

        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage to look up
        # sample-specific configuration options
        extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}_{readid[0]}/{sample[0]}_{readid[0]}.bam'
    )

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'
        ),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        extras=['{sample[0]}', '{readid[0]}'],

        # The output file name is the sample name with a .vcf extension.
        output='variants/undr_rover/{sample[0]}_{readid[0]}.vcf')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(task_func=stages.primary_bam,
                       name='primary_bam',
                       input=output_from('sort_bam_picard'),
                       filter=suffix('.sort.bam'),
                       output='.primary.bam')

    # index bam file
    pipeline.transform(task_func=stages.index_sort_bam_picard,
                       name='index_bam',
                       input=output_from('primary_bam'),
                       filter=suffix('.primary.bam'),
                       output='.primary.bam.bai')

    # Clip the primer_seq from BAM File
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam').follows('index_bam'))

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9_-]+).primary.primerclipped.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
    # .follows('index_sort_bam_picard'))

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_haplotypecaller_gatk'),
                   output='variants/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.annotate.vcf')

    # Apply VariantFiltration using GATK
    pipeline.transform(task_func=stages.apply_variant_filtration_gatk,
                       name='apply_variant_filtration_gatk',
                       input=output_from('variant_annotator_gatk'),
                       filter=suffix('.raw.annotate.vcf'),
                       output='.raw.annotate.filtered.vcf')

    # Apply NORM
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('apply_variant_filtration_gatk'),
        filter=suffix('.raw.annotate.filtered.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.raw.annotate.filtered.norm.vcf').follows(
            'apply_variant_filtration_gatk'))

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.raw.annotate.filtered.norm.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.raw.annotate.filtered.norm.vep.vcf').follows('apply_vt'))

    # Apply SnpEff
    (pipeline.transform(
        task_func=stages.apply_snpeff,
        name='apply_snpeff',
        input=output_from('apply_vep'),
        filter=suffix('.raw.annotate.filtered.norm.vep.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.raw.annotate.filtered.norm.vep.snpeff.vcf').follows(
            'apply_vep'))

    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_snpeff'),
        filter=suffix('.raw.annotate.filtered.norm.vep.snpeff.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.annotated.vcf').follows('apply_snpeff'))

    # Concatenate undr_rover vcf files
    pipeline.merge(task_func=stages.apply_cat_vcf,
                   name='apply_cat_vcf',
                   input=output_from('apply_undr_rover'),
                   output='variants/undr_rover/ur.vcf.gz')

    # # Apply VEP on concatenated undr_rover vcf file
    # (pipeline.transform(
    #     task_func=stages.apply_vep,
    #     name='apply_vep_ur',
    #     input=output_from('apply_cat_vcf'),
    #     filter=suffix('.vcf.gz'),
    #     output='.vep.vcf')
    #     .follows('apply_cat_vcf'))
    #
    # # Apply vcfanno on concatenated/vep undr_rover vcf file
    # (pipeline.transform(
    #     task_func=stages.apply_vcfanno,
    #     name='apply_vcfanno_ur',
    #     input=output_from('apply_vep_ur'),
    #     filter=suffix('.vep.vcf'),
    #     output='.vep.anno.vcf')
    #     .follows('apply_vep_ur'))
    #
    # # Apply snpeff
    # (pipeline.transform(
    #     task_func=stages.apply_snpeff,
    #     name='apply_snpeff_ur',
    #     input=output_from('apply_vcfanno_ur'),
    #     filter=suffix('.vep.anno.vcf'),
    #     output='.vep.anno.snpeff.vcf.gz')
    #     .follows('apply_vcfanno_ur'))
    #
    # Apply tabix
    pipeline.transform(task_func=stages.apply_tabix,
                       name='apply_tabix',
                       input=output_from('apply_cat_vcf'),
                       filter=suffix('.vcf.gz'),
                       output='.vcf.gz.tbi')

    # # Apply HomopolymerRun
    # (pipeline.transform(
    #     task_func=stages.apply_homopolymer_ann,
    #     name='apply_homopolymer_ann',
    #     input=output_from('apply_snpeff_ur'),
    #     filter=suffix('.vep.anno.snpeff.vcf.gz'),
    #     output='.annotated.vcf')
    #     .follows('apply_tabix'))

    # # Apply summarize multi coverage
    # (pipeline.merge(
    #     task_func=stages.apply_multicov,
    #     name='apply_multicov',
    #     input=output_from('primary_bam'),
    #     # filter=suffix('.primary.bam'),
    #     output='coverage/all.multicov.txt')
    #     .follows('index_bam'))

    # Apply summarize picard coverage
    # (pipeline.merge(
    #     task_func=stages.apply_summarize_picard,
    #     name='apply_summarize_picard',
    #     input=output_from('target_coverage'),
    #     output='coverage/all.hsmetrics.txt')
    #     .follows('target_coverage'))

    # # Apply summarize multicov coverage plots
    # (pipeline.merge(
    #     task_func=stages.apply_multicov_plots,
    #     name='apply_multicov_plots',
    #     input=output_from('apply_multicov'),
    #     output='coverage/coverage_analysis_main.html')
    #     .follows('apply_multicov'))

    return pipeline
Example no. 16
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='vcf_annotation')
    # Get a list of paths to all the VCF files
    vcf_files = state.config.get_option('vcfs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original VCF files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_vcf,
        name='original_vcf',
        output=vcf_files)

    # Decompose VCF using Vt
    pipeline.transform(
        task_func=stages.decompose_vcf,
        name='decompose_vcf',
        input=output_from('original_vcf'),
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the VCF file name (e.g. study/family name.
        # This is needed within the stage for finding out sample specific
        # configuration options
        extras=['{sample[0]}'],
        # The output is the decomposed and normalised VCF for the sample.
        output='{path[0]}/{sample[0]}.decompose.normalize.vcf')

    # FILTER COMMON VARIANTS
    # ADD FILTER COMMON VARIANTS USING VEP

    # Annotate using VEP
    pipeline.transform(
        task_func=stages.annotate_vep,
        name='annotate_vep',
        input=output_from('decompose_vcf'),
        filter=suffix('.vcf'),
        output='.vep.vcf')

    # Annotate using SnpEff
    pipeline.transform(
        task_func=stages.annotate_snpeff,
        name='annotate_snpeff',
        input=output_from('annotate_vep'),
        filter=suffix('.vcf'),
        output='.snpeff.vcf')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))
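    # With add_inputs, the task receives the genotyped VCF from upstream plus
    # the two recalibration files named above, so the stage sees all three
    # paths as its inputs (standard ruffus add_inputs behaviour).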

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline