コード例 #1
0
ファイル: ex2.py プロジェクト: qqss88/Cosmos2
def run_ex2(execution):
    """Build and run an example Cosmos workflow that demonstrates the four
    basic dependency patterns: source tasks, one2many, one2one and many2one.

    :param execution: the Execution instance to add Tasks to and run.
    """
    # Source nodes: two independent jobs that echo "hello" and "world".
    echos = [execution.add_task(echo,
                                tags=dict(word=word),
                                out_dir='{word}')
             for word in ['hello', 'world']]

    # Split each echo into two jobs (a one2many relationship).
    cats = [execution.add_task(cat,
                               tags=dict(n=n, **echo_task.tags),
                               parents=[echo_task],
                               out_dir='{word}/{n}')
            for echo_task in echos
            for n in [1, 2]]

    # Count the words in the previous stage.  An example of a one2one relationship,
    # the most common stage dependency pattern.  For each task in StageA, you create a single dependent task in StageB.
    word_counts = one2one(cmd_fxn=word_count, parents=cats, tag=dict(chars=True))

    # Cat the contents of all word_counts into one file.  Note only one node is being created whose parents are
    # all of the WordCounts (a many2one relationship).
    summarize = execution.add_task(cat,
                                   dict(),
                                   word_counts,
                                   '',
                                   'Summary_Analysis')

    if pygraphviz_available:
        # These images can also be seen on the fly in the web-interface.
        # BUGFIX: the two output filenames were swapped -- the stage graph was
        # written to *_task_graph.png and the task graph to *_stage_graph.png.
        draw_stage_graph(execution.stage_graph(), '/tmp/ex2_stage_graph.png', format='png')
        draw_task_graph(execution.task_graph(), '/tmp/ex2_task_graph.png', format='png')
    else:
        print('Pygraphviz is not available :(')

    execution.run(max_attempts=1, max_cores=10)
コード例 #2
0
ファイル: ex2.py プロジェクト: yassineS/COSMOS-2.0
def run_ex2(execution):
    """Build and run an example Cosmos workflow that demonstrates the four
    basic dependency patterns: source tasks, one2many, one2one and many2one.

    :param execution: the Execution instance to add Tasks to and run.
    """
    # Source nodes: two independent jobs that echo "hello" and "world".
    echos = [execution.add_task(echo,
                                tags=dict(word=word),
                                out_dir='{word}')
             for word in ['hello', 'world']]

    # Split each echo into two jobs (a one2many relationship).
    cats = [execution.add_task(cat,
                               tags=dict(n=n, **echo_task.tags),
                               parents=[echo_task],
                               out_dir='{word}/{n}')
            for echo_task in echos
            for n in [1, 2]]

    # Count the words in the previous stage.  An example of a one2one relationship,
    # the most common stage dependency pattern.  For each task in StageA, you create a single dependent task in StageB.
    word_counts = one2one(execution, cmd_fxn=word_count, parents=cats, tag=dict(chars=True))

    # Cat the contents of all word_counts into one file.  Note only one node is being created whose parents are
    # all of the WordCounts (a many2one relationship).
    summarize = execution.add_task(cat,
                                   dict(),
                                   word_counts,
                                   '',
                                   'Summary_Analysis')

    if pygraphviz_available:
        # These images can also be seen on the fly in the web-interface.
        # BUGFIX: this is ex2, but the files were named ex1_*; the stage/task
        # names in the filenames were also swapped, and one path contained a
        # stray double slash ('/tmp//...').
        draw_stage_graph(execution.stage_graph(), '/tmp/ex2_stage_graph.png', format='png')
        draw_task_graph(execution.task_graph(), '/tmp/ex2_task_graph.png', format='png')
    else:
        print('Pygraphviz is not available :(')

    execution.run()
コード例 #3
0
ファイル: recipe.py プロジェクト: yanding/COSMOS-2.0
def variant_call(execution, bam_path, target_bed_path, max_complex_gap):
    """
    Bioinformatics variant calling workflow
    """
    # One entry per unique value in the first column of the target bed file.
    contig_cmd = "cat %s |cut -f1|uniq" % target_bed_path
    contigs = sp.check_output(contig_cmd, shell=True).strip().split("\n")

    # Parallelize by contig: one bed-filter task per contig (source tasks).
    bed_tasks = []
    for contig in contigs:
        bed_task = execution.add_task(tools.filter_bed_by_contig,
                                      tags=dict(in_bam=bam_path,
                                                in_bed=target_bed_path,
                                                contig=contig),
                                      out_dir='work/{contig}')
        bed_tasks.append(bed_task)

    # Call variants on each contig (one2one), then merge all of the
    # per-contig VCFs into a single file (many2one).
    freebayes_tasks = one2one(tools.freebayes, bed_tasks,
                              dict(max_complex_gap=max_complex_gap))
    merge_vcf_tasks = many2one(tools.vcf_concat_parts, freebayes_tasks)

    execution.run()
コード例 #4
0
def variant_call(execution, bam_path, target_bed_path, max_complex_gap):
    """
    Bioinformatics variant calling workflow
    """
    # The contigs are the unique values of the bed file's first column.
    raw = sp.check_output("cat %s |cut -f1|uniq" % target_bed_path,
                          shell=True)
    contigs = raw.strip().split("\n")

    # One bed-filter task per contig so downstream stages run in parallel.
    bed_tasks = [execution.add_task(tools.filter_bed_by_contig,
                                    tags=dict(in_bam=bam_path,
                                              in_bed=target_bed_path,
                                              contig=contig),
                                    out_dir='work/{contig}')
                 for contig in contigs]

    # Per-contig variant calling (one2one) followed by a single merge of
    # every per-contig VCF (many2one).
    freebayes_tasks = one2one(tools.freebayes, bed_tasks,
                              dict(max_complex_gap=max_complex_gap))
    merge_vcf_tasks = many2one(tools.vcf_concat_parts, freebayes_tasks)

    execution.run()
コード例 #5
0
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: merged, indel-realigned sample bam Tasks
    """

    # Align each read-group chunk with bwa mem: one task per
    # (sample, library, platform, platform_unit, rgid, chunk) group.
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks,
                                        by=[
                                            'sample_name', 'library',
                                            'platform', 'platform_unit',
                                            'rgid', 'chunk'
                                        ]):
        align_task = execution.add_task(
            bwa.bwa_mem,
            tags=dict(**tags),
            parents=fastq_task_group,
            out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    # Mark duplicates across all aligned chunks of each (sample, library).
    dedupe = many2one(picard.mark_duplicates,
                      aligns,
                      groupby=['sample_name', 'library'],
                      out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over
    # multiple samples, especially if low coverage.

    # RealignerTargetCreator per (sample, contig): collapse dedupe by sample
    # (many2one), then fan out over the target bed tasks (one2many).
    rtc_tasks = [
        execution.add_task(gatk.realigner_target_creator,
                           dict(contig=target_bed_task.tags['contig'],
                                in_target_bed=target_bed_task.output_files[0],
                                **tags),
                           parents + [target_bed_task],
                           out_dir='SM_{sample_name}/work/contigs/{contig}')
        for tags, parents in group(dedupe, ['sample_name'])  # Many2one
        for target_bed_task in target_bed_tasks
    ]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)
    # Also keep read pairs where both mates are unmapped (samtools view -f 12)
    # so they are carried through to the merged sample bam.
    realigned_by_sample_contig_tasks += [
        execution.add_task(samtools.view,
                           # BUGFIX: the original passed
                           # 'both_pairs_unmapped.bam' % lb_task -- a %-format
                           # with no placeholder, which raises TypeError
                           # unconditionally.  The filename is a constant.
                           dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                f='12',
                                sample_name=tags['sample_name'],
                                contig='BOTH_PAIRS_UNMAPPED',
                                library=lb_task.tags['library']),
                           # NOTE(review): other add_task calls pass a list of
                           # parents; confirm add_task accepts a bare Task here.
                           parents=lb_task,
                           out_dir='SM_{sample_name}/work/LB_{library}',
                           stage_name='Filter_Both_Pairs_Unmapped')
        for tags, sm_tasks in group(dedupe, ['sample_name'])
        for lb_task in sm_tasks
    ]

    # Skipping BQSR.  Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam.  Bams remain split by contig
    # upstream for parallelization; the merged per-sample bam is returned.
    merged = many2one(picard.merge_sam_files,
                      realigned_by_sample_contig_tasks, ['sample_name'],
                      out_dir='SM_{sample_name}',
                      stage_name="Merge_Sample_Bams")
    one2one(picard.collect_multiple_metrics,
            merged,
            out_dir='SM_{sample_name}/metrics')

    return merged
コード例 #6
0
ファイル: recipe.py プロジェクト: LPM-HMS/GenomeKey2
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: merged, indel-realigned sample bam Tasks
    """

    # Align each read-group chunk with bwa mem: one task per
    # (sample, library, platform, platform_unit, rgid, chunk) group.
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks, by=['sample_name', 'library', 'platform', 'platform_unit', 'rgid', 'chunk']):
        align_task = execution.add_task(bwa.bwa_mem,
                                        tags=dict(**tags),
                                        parents=fastq_task_group,
                                        out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    # Mark duplicates across all aligned chunks of each (sample, library).
    dedupe = many2one(picard.mark_duplicates, aligns, groupby=['sample_name', 'library'], out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over
    # multiple samples, especially if low coverage.

    # RealignerTargetCreator per (sample, contig): collapse dedupe by sample
    # (many2one), then fan out over the target bed tasks (one2many).
    rtc_tasks = [execution.add_task(gatk.realigner_target_creator,
                                    dict(contig=target_bed_task.tags['contig'],
                                         in_target_bed=target_bed_task.output_files[0], **tags),
                                    parents + [target_bed_task],
                                    out_dir='SM_{sample_name}/work/contigs/{contig}')
                 for tags, parents in group(dedupe, ['sample_name'])  # Many2one
                 for target_bed_task in target_bed_tasks]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)
    # Also keep read pairs where both mates are unmapped (samtools view -f 12)
    # so they are carried through to the merged sample bam.
    realigned_by_sample_contig_tasks += [execution.add_task(samtools.view,
                                                            # BUGFIX: was 'both_pairs_unmapped.bam' % lb_task --
                                                            # %-format with no placeholder raises TypeError.
                                                            dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                                                 f='12',
                                                                 sample_name=tags['sample_name'],
                                                                 contig='BOTH_PAIRS_UNMAPPED',
                                                                 library=lb_task.tags['library']),
                                                            # NOTE(review): other calls pass a list of parents;
                                                            # confirm add_task accepts a bare Task here.
                                                            parents=lb_task,
                                                            out_dir='SM_{sample_name}/work/LB_{library}',
                                                            stage_name='Filter_Both_Pairs_Unmapped')
                                         for tags, sm_tasks in group(dedupe, ['sample_name'])
                                         for lb_task in sm_tasks]


    # Skipping BQSR.  Will improve results only slightly, if at all.


    # Merge bams so we have a sample bam.  Bams remain split by contig
    # upstream for parallelization; the merged per-sample bam is returned.
    merged = many2one(picard.merge_sam_files, realigned_by_sample_contig_tasks, ['sample_name'], out_dir='SM_{sample_name}', stage_name="Merge_Sample_Bams")
    one2one(picard.collect_multiple_metrics, merged, out_dir='SM_{sample_name}/metrics')
    one2one(picard.collect_wgs_metrics, merged, out_dir='SM_{sample_name}/metrics')

    return merged