Ejemplo n.º 1
0
def run_gdc_pipeline(params):
    """Run the GDC DNA-seq workflow for every patient listed in ``params``.

    ``params`` is a mapping that must provide the keys ``gdc_bam_files``,
    ``parsl_config``, ``gdc_output_dir``, ``gdc_executables`` and
    ``gdc_data_files``.  ``gdc_bam_files`` maps each patient to either a
    single BAM-pair dict or a list of BAM-pair dicts.

    The function loads the Parsl configuration, submits the work for every
    patient, and blocks until all outstanding Parsl tasks are done.
    """
    gdc_bam_files = params['gdc_bam_files']
    parsl.set_stream_logger()
    parsl.load(params['parsl_config'])
    LOGGER.info("GDC Pipeline started!")

    # Run-wide settings are shared with every patient object through class
    # attributes rather than being passed to each constructor.
    for shared_key in ('gdc_output_dir', 'gdc_executables', 'gdc_data_files'):
        setattr(GDCPatientDNASeq, shared_key, params[shared_key])
    GDCPatientDNASeq.gdc_params = params
    gdc_workflow.LOGGER = LOGGER

    def process_bam_pair(patient, bam_pair, label=None):
        # A one-element list is unwrapped to the BAM-pair dict it holds.
        if isinstance(bam_pair, list):
            bam_pair = bam_pair[0]

        gdc_patient = GDCPatientDNASeq(patient, bam_pair, label)

        # Pairs flagged as already cleaned skip the pre-processing step.
        already_cleaned = ('cleaned' in bam_pair) and bam_pair['cleaned']
        if already_cleaned:
            bams_for_calling = bam_pair
        else:
            bams_for_calling = gdc_patient.process_patient_seq_data()

        gdc_patient.run_variant_callers(bams_for_calling)

    for patient, bam_pair_list in gdc_bam_files.items():
        is_single_pair = (isinstance(bam_pair_list, dict)
                          or len(bam_pair_list) == 1)
        if is_single_pair:
            process_bam_pair(patient, bam_pair_list)
            continue

        # Several pairs for one patient: label each with its 1-based position.
        for position, bam_pair in enumerate(bam_pair_list, start=1):
            process_bam_pair(patient, bam_pair, str(position))

    LOGGER.info("Waiting for GDC Pipeline tasks to complete...")
    parsl.wait_for_current_tasks()
    LOGGER.info("GDC Pipeline tasks completed!")
Ejemplo n.º 2
0
                  reference,
                  bams['normal'],
                  bams['tumor'],
                  os.path.abspath(output_dir),
                  label='{}-somaticsniper'.format(patient))
    muse(executables,
         reference,
         bams['normal'],
         bams['tumor'],
         known_sites,
         os.path.abspath(output_dir),
         label='{}-muse'.format(patient))
    varscan(executables,
            reference,
            bams['normal'],
            bams['tumor'],
            os.path.abspath(output_dir),
            label='{}-varscan'.format(patient))
    mutect2(
        executables,
        reference,
        bams['normal'],
        bams['tumor'],
        os.path.abspath(output_dir),
        normal_panel,
        known_sites,
        label='{}-mutect2'.format(patient),
    )

parsl.wait_for_current_tasks()
Ejemplo n.º 3
0
def main():
    '''Reads input from terminal and coordinates pipeline.

    Parses the command-line options, validates that the mandatory ones were
    given, loads the Parsl configuration named by --config (default
    "local") from the "configs" directory next to this file, and then runs
    every pipeline stage in order.

    Raises:
        exceptions.WrongArgumentError: on positional arguments or an
            unhandled option.
        exceptions.MissingArgumentError: when a mandatory option is missing,
            or when only one of --sampleinfo_table / --sample_attr is given.
        exceptions.IncorrectPathError: when the Parsl config module cannot
            be imported or loaded (the original error is chained as cause).

    Exits with status 2 when getopt rejects the command line.
    '''
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "i:o:f:l:e:m:a:t:s:r:F:A:c:p:h", [
                "input_dir=", "output_dir=", "genome_fasta=", "genome_len=",
                "genome_include=", "motif_path=", "sample_attr=",
                "sampleinfo_table=", "SV_types=", "rand_sv_ratio=",
                "FIMO_thresh=", "AME_scoring=", "config=", "prefix=", "help"
            ])
    except getopt.GetoptError as e:
        print(e)
        sys.exit(2)

    if args:
        message = "Error: non-paired arguments are not allowed."
        raise exceptions.WrongArgumentError(message)

    motif_pipeline = pipeline.MotifPipeline()
    sample_attr_path = None
    genome_fasta = None
    genome_len = None
    genome_include = None
    prefix = None
    config_name = "local"

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            description()
            sys.exit()
        elif opt in ("-i", "--input_dir"):
            motif_pipeline.set_input_dir(arg)
        elif opt in ("-o", "--output_dir"):
            motif_pipeline.set_output_dir(arg)
        elif opt in ("-f", "--genome_fasta"):
            genome_fasta = arg
        elif opt in ("-l", "--genome_len"):
            genome_len = arg
        elif opt in ("-e", "--genome_include"):
            genome_include = arg
        elif opt in ("-m", "--motif_path"):
            motif_pipeline.set_motif_path(arg)
        elif opt in ("-a", "--sample_attr"):
            motif_pipeline.set_sample_attr(arg)
        elif opt in ("-t", "--sampleinfo_table"):
            sample_attr_path = arg
        elif opt in ("-s", "--SV_types"):
            motif_pipeline.set_SV_types(arg)
        elif opt in ("-r", "--rand_sv_ratio"):
            motif_pipeline.set_rand_sv_ratio(arg)
        elif opt in ("-F", "--FIMO_thresh"):
            motif_pipeline.set_FIMO_thresh(arg)
        elif opt in ("-A", "--AME_scoring"):
            motif_pipeline.set_AME_scoring(arg)
        elif opt in ("-c", "--config"):
            config_name = arg
        elif opt in ("-p", "--prefix"):
            prefix = arg
        else:
            message = "Error: {opt} is not a valid option".format(opt=opt)
            raise exceptions.WrongArgumentError(message)

    # --sampleinfo_table and --sample_attr only make sense together: the
    # table must be given exactly when the attribute is something other
    # than "all".
    table_missing = sample_attr_path is None
    attr_is_all = motif_pipeline.sample_attr == "all"
    if table_missing != attr_is_all:
        message = "Error: you must indicate both --sampleinfo_table and --sample_attr, or neither."
        raise exceptions.MissingArgumentError(message)

    mandatory_values = (("genome_fasta", genome_fasta),
                        ("genome_len", genome_len),
                        ("genome_include", genome_include))
    for option_name, option_value in mandatory_values:
        if option_value is None:
            message = "Error: you must indicate --{attr}.".format(
                attr=option_name)
            raise exceptions.MissingArgumentError(message)
    for pipeline_attr in ["input_dir", "output_dir", "motif_path"]:
        if not hasattr(motif_pipeline, pipeline_attr):
            message = ("Error: you must indicate --{attr}.").format(
                attr=pipeline_attr)
            raise exceptions.MissingArgumentError(message)

    motif_pipeline.set_subdir_name(prefix)
    motif_pipeline.write_description()
    motif_pipeline.set_list_bedpe(sample_attr_path)
    reference_genome = refgenome.ReferenceGenome(genome_fasta, genome_len,
                                                 genome_include)

    # os.path.dirname handles the platform's separator and a file located in
    # the filesystem root, both of which a manual split on '/' gets wrong.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    config = os.path.join(base_dir, 'configs', '{}.py'.format(config_name))
    try:
        spec = importlib.util.spec_from_file_location('', config)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        parsl.load(module.config)
    except Exception as err:
        # Exception (not a bare except) lets Ctrl-C and sys.exit() through;
        # chaining keeps the underlying failure visible in the traceback.
        raise exceptions.IncorrectPathError(
            "Cannot find the config file <{config_name}>.".format(
                config_name=config_name)) from err

    # output_dir is concatenated directly, so it is expected to already end
    # with a path separator.
    bed_dir = motif_pipeline.output_dir + "bed_files"
    if not os.path.isdir(bed_dir):
        os.mkdir(bed_dir)
    for file_name in motif_pipeline.list_bedpe:
        sv_types_to_run = get_SV_types(motif_pipeline, file_name)
        if sv_types_to_run:
            extractdata.bedpe_to_bed(reference_genome, motif_pipeline,
                                     file_name, sv_types_to_run)
    # All BEDPE conversions must be finished before the merge step starts.
    parsl.wait_for_current_tasks()
    runprogram.merge(motif_pipeline)
    motif_pipeline.set_num_SV_breakpoints()
    runprogram.bedtools(motif_pipeline, reference_genome)
    runprogram.FIMO(motif_pipeline)
    runprogram.AME(motif_pipeline)
    extractdata.extract_list_sequences_AME(motif_pipeline)
    extractdata.extract_output_FIMO(motif_pipeline)
    extractdata.extract_output_AME(motif_pipeline)
    graphs.generate_histogram(motif_pipeline)
Ejemplo n.º 4
0
def run_pipeline(pipeline, genome, download_future):
    '''
    Orchestrates the entire TCR/BCR pipeline for each sample in
    pipeline.fastq_dict. A program (or alignment step) is only scheduled when
    its output directory for that sample is missing, empty, or lacks the
    expected final output file; in the last case the leftover files in that
    directory are removed before the step is scheduled again.

    Parameters:
        pipeline: pipeline settings. This function reads fastq_dict,
            output_dir, run_program and receptor. output_dir is concatenated
            directly with the sample name, so it is expected to end with a
            path separator.
        genome: reference genome object; genome.version names the STAR
            alignment sub-directory.
        download_future: passed unchanged as inputs= to every STAR alignment
            call (presumably the futures of the download step).

    Blocks until every submitted parsl task has completed.
    '''
    # True: the program writes into per-parameter sub-directories that
    # list_run_parameters inspects; False: it writes straight into
    # <sample>/<program>/.
    dict_subdirectory = {
        "MiXCR": True,
        "TRUST3": True,
        "TRUST4": False,
        "VDJer": True,
        "CATT": True
    }
    vdjer_requested = 'VDJer' in pipeline.run_program
    # VDJer reuses the main alignment when the genome already is hg38, so
    # that alignment has to be scheduled even if no TRUST program was
    # requested (previously this case failed on an unassigned variable).
    vdjer_shares_alignment = vdjer_requested and genome.version == "hg38"
    needs_genome_alignment = ('TRUST3' in pipeline.run_program
                              or 'TRUST4' in pipeline.run_program
                              or vdjer_shares_alignment)

    for sample in pipeline.fastq_dict.keys():
        sample_output = pipeline.output_dir + sample + "/"
        if not os.path.isdir(sample_output):
            os.mkdir(sample_output)

        align_future = []
        VDJer_align_future = []
        # STAR_align(pipeline, genome, sample_name, inputs=[], VDJer=False)
        if needs_genome_alignment:
            align_dir = sample_output + "STAR_align/" + genome.version
            align_future = _star_align_if_needed(pipeline, genome, sample,
                                                 align_dir, genome.version,
                                                 download_future)
        if vdjer_requested:
            if vdjer_shares_alignment:
                VDJer_align_future = align_future
            else:
                VDJer_align_future = _star_align_if_needed(
                    pipeline, genome, sample,
                    sample_output + "STAR_align/hg38", "hg38",
                    download_future, VDJer=True)

        for program in pipeline.run_program:
            if program in ["TRUST3", "TRUST4"]:
                program_input = align_future
            elif program == "VDJer":
                program_input = VDJer_align_future
            else:
                program_input = []
            program_dir = sample_output + program

            if dict_subdirectory[program]:
                if not os.path.isdir(program_dir):
                    os.mkdir(program_dir)
                run_parameters = list_run_parameters(
                    pipeline, sample_output + "/" + program + "/",
                    program == "CATT")
                if program == "MiXCR":
                    _run_mixcr(pipeline, genome, sample, sample_output,
                               run_parameters, program_input)
                    continue
                if program == "TRUST3":
                    # TRUST3 is given an indexed BAM: schedule the index
                    # first when the .bai file is not there yet.
                    bam_file = align_dir + "/" + sample + ".Aligned.sortedByCoord.out.bam"
                    if not os.path.isfile(bam_file + ".bai"):
                        program_input = [
                            programs.samtools_index(pipeline,
                                                    bam_file,
                                                    "index",
                                                    inputs=program_input)
                        ]
                program_function = getattr(programs, program)
                for parameter in run_parameters:
                    program_function(pipeline,
                                     genome,
                                     sample,
                                     parameter,
                                     inputs=program_input)
                continue

            # Programs without per-parameter sub-directories (only TRUST4
            # is configured this way in dict_subdirectory).
            program_function = getattr(programs, program)
            if not os.path.isdir(program_dir):
                os.mkdir(program_dir)
            elif os.listdir(program_dir):
                if program == "TRUST4":
                    output_file = sample + "_report.tsv"
                if os.path.isfile(program_dir + "/" + output_file):
                    print(
                        f"Non-empty {program} directory for {sample}: {program} already run."
                    )
                    continue
                # Leftovers of an unfinished run: clear them and start over.
                for rm_file in glob.glob(program_dir + "/*"):
                    os.remove(rm_file)
            program_function(pipeline,
                             genome,
                             sample,
                             inputs=program_input)
    parsl.wait_for_current_tasks()


def _star_align_if_needed(pipeline, genome, sample, align_dir, version_label,
                          download_future, **star_kwargs):
    '''Schedule STAR for one sample unless align_dir already holds its sorted BAM.

    Returns a one-element list with the result of programs.STAR_align, or an
    empty list when the alignment is already present. version_label is only
    used in the "already run" message; star_kwargs are forwarded to
    programs.STAR_align.
    '''
    sorted_bam = align_dir + "/" + sample + ".Aligned.sortedByCoord.out.bam"
    if not os.path.isdir(align_dir):
        # makedirs also creates the parent "STAR_align/" directory if needed.
        os.makedirs(align_dir, exist_ok=True)
    elif os.listdir(align_dir):
        if os.path.isfile(sorted_bam):
            print(
                f"Non-empty {version_label} STAR alignment directory for {sample}: alignment already run."
            )
            return []
        # Leftovers of an unfinished alignment: clear them and start over.
        for rm_file in glob.glob(align_dir + "/*"):
            os.remove(rm_file)
    return [
        programs.STAR_align(pipeline,
                            genome,
                            sample,
                            inputs=download_future,
                            **star_kwargs)
    ]


def _run_mixcr(pipeline, genome, sample, sample_output, run_parameters,
               program_input):
    '''Schedule the MiXCR preparation and exportClones steps for one sample.'''
    # MiXCR_align / assemblePartial / extendAlignments / assemble write
    # their intermediate files directly into the MiXCR directory.
    mixcr_dir = sample_output + "/MiXCR/"
    mixcr_intermediate_files = [
        mixcr_file for mixcr_file in glob.glob(mixcr_dir + "*")
        if os.path.isfile(mixcr_file)
    ]
    clones_file = mixcr_dir + sample + "_clones.clns"
    if mixcr_intermediate_files and os.path.isfile(clones_file):
        print(
            f"Non-empty MiXCR directory for {sample}: align/assemblePartial/extendAlignments/assemble already run."
        )
        # The preparation output is reused, so exportClones is only
        # scheduled for the entries list_run_parameters returned.
        for parameter in run_parameters:
            programs.MiXCR_exportClones(pipeline,
                                        genome,
                                        sample,
                                        parameter,
                                        inputs=[])
        return

    # The preparation has to be (re)done: drop incomplete intermediates.
    for rm_file in mixcr_intermediate_files:
        os.remove(rm_file)
    exportClones_input = run_mixcr_prep(pipeline, genome, program_input,
                                        sample)
    # Rerunning exportClones for every receptor because the files it
    # depends on are being regenerated; old exports are removed first.
    for parameter in pipeline.receptor:
        for rm_file in glob.glob(mixcr_dir + parameter + "/*"):
            os.remove(rm_file)
        programs.MiXCR_exportClones(pipeline,
                                    genome,
                                    sample,
                                    parameter,
                                    inputs=exportClones_input)
Ejemplo n.º 5
0
def germline(workflow, work_dir, contigs_file, out_dir):
    """Schedule the germline workflow for every sample listed in ``out_dir``.

    For each sample the input BAM is aligned and split per contig, each
    contig BAM is duplicate-marked and indexed, every contig segment is
    genotyped with each configured genotyper, and finally the contig BAMs
    and the per-genotyper VCFs are merged and quality control is run.

    Args:
        workflow: workflow description; this function reads
            ``has_alignment``, ``aligner``, ``has_genotyping``,
            ``genotypers`` and ``bam_quality_control_apps``.
        work_dir: working directory handed to every app call.
        contigs_file: file read with ``read_data`` to obtain the contigs.
        out_dir: directory containing the sample list
            (``SwagStrings.patient_out_filename``, space-delimited with at
            least the columns ``dir`` and ``ID``) and, under ``work_dir``,
            the analysis reference directory.

    Raises:
        NotImplementedError: if ``workflow.has_alignment`` is false and
            there is at least one sample; running without alignment is not
            supported yet.
    """
    contigs = read_data(contigs_file)
    with open(os.path.join(out_dir, SwagStrings.patient_out_filename)) as f:
        samples = list(csv.DictReader(f, delimiter=' '))

    # Same for every sample, contig and genotyper call.
    ref_dir = os.path.join(work_dir, out_dir,
                           SwagStrings.analysis_reference_dir)

    for sample in samples:
        sample_dir = os.path.abspath(sample['dir'])
        sample_id = sample['ID']

        ########################################################################
        ## Alignment
        ########################################################################
        # INPUT - Unaligned input files (including read groups)
        in_bam = os.path.join(sample_dir, "{}.bam".format(sample_id))

        if not workflow.has_alignment:
            # FIXME: without alignment in_bam should be split by contig, but
            # no such step exists yet, so there are no per-contig BAMs to
            # work on. Fail with a clear message instead of a NameError.
            raise NotImplementedError(
                "germline() does not support workflows without alignment "
                "yet (sample {})".format(sample_id))

        alnSampleContigBams = align(
            in_bam=in_bam,
            work_dir=work_dir,
            aligner_app=getattr(swag.parsl.apps, workflow.aligner),
            mergesort_app=getattr(swag.parsl.apps,
                                  SwagStrings.generate_sort_app),
            sample_dir=sample_dir,
            sample_id=sample_id)

        contigBams = []
        vcfs = collections.defaultdict(list)
        for contig, bam in zip(contigs, alnSampleContigBams):
            if workflow.has_alignment:  # Dup removal optional for non-alignment cases
                bam = picard_mark_duplicates(work_dir, bam, contig, sample_id,
                                             sample_dir)
            bam_index = index_bam(work_dir, bam)

            contig_segments = read_data(
                os.path.join(ref_dir, 'contig_segments_{}.txt'.format(contig)))
            for segment in contig_segments:
                # TODO: GATK post-processing (workflow.has_gatk) has not been
                # ported; it would replace the BAM used for genotyping.

                ########################################################################
                ## Single sample coordinate genotyping
                ########################################################################
                if workflow.has_genotyping:
                    for genotyper in workflow.genotypers:
                        vcfs[genotyper].append(
                            single_sample_genotype(
                                work_dir,
                                genotyper=genotyper,
                                contig=contig,
                                segment=segment,
                                ref_dir=ref_dir,
                                bam=bam,
                                bam_index=bam_index,
                                sample_dir=sample_dir,
                                sample_id=sample_id))

                # TODO: structural variant calling (workflow.has_struct_vars,
                # Delly) has not been ported.

            contigBams.append(bam)

        #########################################################################
        ### Reduce bam steps
        #########################################################################
        # TODO: the GATK grp reduce step (workflow.has_gatk) has not been
        # ported.
        if workflow.has_alignment:
            bam, bam_index = contig_merge_sort(work_dir, contigBams,
                                               sample_dir, sample_id)

        #########################################################################
        ### Reduce vcf steps
        #########################################################################
        ## All vcfs will be merged here
        if workflow.has_genotyping:
            for genotyper in workflow.genotypers:
                concat_vcf(work_dir, sample_id, sample_dir, genotyper,
                           vcfs[genotyper])

        # TODO: structural variant reduce steps (Delly translocation
        # analysis and per-caller VCF merge) have not been ported.

        perform_quality_control(work_dir, bam, sample_dir, sample_id,
                                workflow.bam_quality_control_apps)

        # NOTE(review): waiting inside the sample loop means one sample is
        # finished before the next is submitted - confirm this is intended.
        parsl.wait_for_current_tasks()