Ejemplo n.º 1
0
def Pipeline_split():
    """Build the FASTQ split pipeline.

    Three steps are chained with ``sequence_``: ``map_(json_.Split)``,
    ``apply_`` around a ``reduce_`` of ``json_.Total_json`` keyed on the
    ``gz_output_dir`` tag, and ``map_(json_.Format_json)``.  The chain is
    wrapped in one more ``sequence_`` before being returned.
    """
    split_step = map_(json_.Split)
    total_step = apply_(reduce_(['gz_output_dir'], json_.Total_json))
    format_step = map_(json_.Format_json)

    split_fastq = sequence_(split_step, total_step, format_step)
    return sequence_(split_fastq)
Ejemplo n.º 2
0
def Pipeline_split():
    """Return the split pipeline: Split, then Total_json, then Format_json.

    The ``Total_json`` step is a ``reduce_`` on the ``gz_output_dir`` tag
    wrapped in ``apply_``; the other two steps are plain ``map_`` stages.
    The resulting sequence is itself wrapped in an outer ``sequence_``.
    """
    steps = [
        map_(json_.Split),
        apply_(reduce_(['gz_output_dir'], json_.Total_json)),
        map_(json_.Format_json),
    ]
    split_fastq = sequence_(*steps)
    return sequence_(split_fastq)
Ejemplo n.º 3
0
def Pipeline():
    """Assemble the align / de-duplicate / preprocess / call-variants pipeline.

    Two entries of ``wga_settings`` steer the construction:

    * ``'test'``   -- when truthy the interval list is ``[20]``; otherwise it
      is 1..22 followed by ``'X'`` and ``'Y'``.
    * ``'target'`` -- when truthy the per-sample reduce step uses
      ``picard.MERGE_SAMS``; otherwise ``picard.MarkDuplicates``.

    Returns:
        The ``sequence_`` of the four stages, in the order listed above.
    """
    testing = wga_settings['test']
    target = wga_settings['target']
    if testing:
        intervals = ('interval', [20])
    else:
        # range() must be materialised before concatenation: on Python 3 it
        # is not a list and ``range(...) + [...]`` raises TypeError.
        intervals = ('interval', list(range(1, 23)) + ['X', 'Y'])
    glm = ('glm', ['SNP', 'INDEL'])

    align_to_reference = sequence_(
        apply_(
            reduce_(
                ['sample_name', 'library', 'platform', 'platform_unit', 'chunk'],
                pipes.AlignAndCleanALN
            )
        ),
    )

    if target:
        remove_dup = sequence_(reduce_(['sample_name'], picard.MERGE_SAMS))
    else:
        remove_dup = sequence_(reduce_(['sample_name'], picard.MarkDuplicates))

    preprocess_alignment = sequence_(
        map_(samtools.IndexBam),
        apply_(
            # if not is_capture or testing else map_(gatk.RealignerTargetCreator)
            split_([intervals], gatk.RealignerTargetCreator)
        ),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        apply_(
            reduce_(['sample_name'], gatk.BQSRGatherer),
            # TODO I add BQSRGatherer as a parent with a hack inside ApplyBQSR.cmd
            map_(gatk.ApplyBQSR)
        )
    )

    call_variants = sequence_(
        apply_(
            # reduce_(['interval'],gatk.HaplotypeCaller,tag={'vcf':'HaplotypeCaller'}),
            reduce_split_(['interval'], [glm], gatk.UnifiedGenotyper,
                          tag={'vcf': 'UnifiedGenotyper'}),
            combine=True
        ),
        reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs'),
        split_([glm], gatk.VQSR),
    )

    return sequence_(
        align_to_reference,
        remove_dup,
        preprocess_alignment,
        call_variants
    )
Ejemplo n.º 4
0
def Pipeline():
    """Build the four-stage pipeline driven by ``wga_settings``.

    ``wga_settings['test']`` selects the interval list (``[20]`` when truthy,
    otherwise 1..22 plus ``'X'`` and ``'Y'``) and ``wga_settings['target']``
    selects the per-sample reduce tool (``picard.MERGE_SAMS`` when truthy,
    otherwise ``picard.MarkDuplicates``).  The returned ``sequence_`` chains
    alignment, duplicate handling, alignment preprocessing and variant
    calling.
    """
    testing = wga_settings['test']
    target = wga_settings['target']

    interval_values = [20] if testing else list(range(1, 23)) + ['X', 'Y']
    intervals = ('interval', interval_values)
    glm = ('glm', ['SNP', 'INDEL'])

    # Stage 1: alignment.
    align_tags = ['sample_name', 'library', 'platform', 'platform_unit', 'chunk']
    align_to_reference = sequence_(
        apply_(reduce_(align_tags, pipes.AlignAndCleanALN)))

    # Stage 2: per-sample merge or duplicate marking.
    dedup_tool = picard.MERGE_SAMS if target else picard.MarkDuplicates
    remove_dup = sequence_(reduce_(['sample_name'], dedup_tool))

    # Stage 3: indexing, indel realignment and base-quality recalibration.
    index_bam = map_(samtools.IndexBam)
    # if not is_capture or testing else map_(gatk.RealignerTargetCreator)
    realign_targets = apply_(split_([intervals], gatk.RealignerTargetCreator))
    realign = map_(gatk.IndelRealigner)
    recalibrate = map_(gatk.BQSR)
    # TODO I add BQSRGatherer as a parent with a hack inside ApplyBQSR.cmd
    apply_recalibration = apply_(
        reduce_(['sample_name'], gatk.BQSRGatherer),
        map_(gatk.ApplyBQSR))
    preprocess_alignment = sequence_(
        index_bam, realign_targets, realign, recalibrate, apply_recalibration)

    # Stage 4: genotyping, combination of raw VCFs and VQSR.
    # reduce_(['interval'],gatk.HaplotypeCaller,tag={'vcf':'HaplotypeCaller'}),
    genotype = apply_(
        reduce_split_(['interval'], [glm], gatk.UnifiedGenotyper,
                      tag={'vcf': 'UnifiedGenotyper'}),
        combine=True)
    combine_raw = reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs')
    vqsr = split_([glm], gatk.VQSR)
    call_variants = sequence_(genotype, combine_raw, vqsr)

    stages = (align_to_reference, remove_dup, preprocess_alignment,
              call_variants)
    return sequence_(*stages)
Ejemplo n.º 5
0
def Pipeline_upload():
    """Return a pipeline whose only step is ``map_(s3_.S3Upload)``.

    The step is wrapped in an inner ``sequence_`` which is wrapped again in
    the outer ``sequence_`` that is returned.
    """
    upload_step = map_(s3_.S3Upload)
    upload_stage = sequence_(upload_step)
    return sequence_(upload_stage)
Ejemplo n.º 6
0
def Pipeline_Somatic():
    """Assemble the somatic-calling pipeline.

    Two entries of ``wga_settings`` steer the construction:

    * ``'test'``   -- when truthy the interval list is ``[20]``; otherwise it
      is 1..22 followed by ``'X'`` and ``'Y'``.
    * ``'target'`` -- when truthy the merge step uses ``picard.MERGE_SAMS``;
      otherwise ``picard.MarkDuplicates``.

    Returns:
        The ``sequence_`` of alignment, duplicate handling, alignment
        preprocessing and the somatic-calling stage (a Mutect branch and an
        svdetect branch inside one ``apply_``).
    """
    testing = wga_settings['test']
    target = wga_settings['target']

    if testing:
        intervals = ('interval', [20])
    else:
        # range() must be materialised before concatenation: on Python 3 it
        # is not a list and ``range(...) + [...]`` raises TypeError.
        intervals = ('interval', list(range(1, 23)) + ['X', 'Y'])
    # The unused ``glm`` tuple of the original was dropped: no stage of this
    # pipeline consumed it.

    align_to_reference = sequence_(
        apply_(
            reduce_(
                ['sample_name', 'library', 'platform', 'platform_unit',
                 'sample_type', 'chunk', 'rgid'],
                pipes.AlignAndCleanMEM
            )
        ),
    )

    if target:
        remove_dup = sequence_(
            reduce_(['sample_name', 'sample_type', 'rgid'], picard.MERGE_SAMS)
        )
    else:
        remove_dup = sequence_(
            reduce_(['sample_name', 'sample_type', 'rgid'], picard.MarkDuplicates)
        )

    preprocess_alignment = sequence_(
        map_(samtools.IndexBam),
        apply_(
            # if not is_capture or testing else map_(gatk.RealignerTargetCreator)
            split_([intervals], gatk.RealignerTargetCreator)
        ),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        apply_(
            reduce_(['sample_name', 'sample_type', 'rgid'], gatk.BQSRGatherer),
            # TODO I add BQSRGatherer as a parent with a hack inside ApplyBQSR.cmd
            map_(gatk.ApplyBQSR)
        )
    )

    somatic_call = sequence_(
        apply_(
            sequence_(
                map_(mutect.createInput),
                reduce_(['rgid', 'interval'], mutect.Somatic, tag={'vcf': 'Mutect'}),
                reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs'),
            ),
            sequence_(
                map_(svdetect.PreProcessing),
                map_(svdetect.link2SV)
            )
        )
    )

    return sequence_(
        align_to_reference,
        remove_dup,
        preprocess_alignment,
        somatic_call
    )
Ejemplo n.º 7
0
def gunzip(workflow, input_dir, **kwargs):
    """
    Gunzips all gz files in directory

    $ genomekey gunzip -n 'Gunzip' /path/to/dir

    Every path matching ``<input_dir>/*.gz`` becomes an ``INPUT`` tagged with
    its position ``i`` in the glob result.  The inputs are chained with
    ``map_(unix.Gunzip)`` and ``add_run(workflow)`` on a fresh ``DAG``.
    Extra keyword arguments are accepted but not used.
    """
    dag = DAG()
    gz_paths = glob.glob(os.path.join(input_dir, '*.gz'))
    inputs = [INPUT(path, tags={'i': idx}) for idx, path in enumerate(gz_paths)]
    dag.sequence_(add_(inputs), map_(unix.Gunzip), add_run(workflow))
Ejemplo n.º 8
0
def gunzip(workflow, input_dir, **kwargs):
    """
    Gunzips all gz files in directory

    $ genomekey gunzip -n 'Gunzip' /path/to/dir

    Builds one ``INPUT`` per ``*.gz`` path found directly in ``input_dir``
    (tagged ``i`` with its index in the glob result), then sequences
    ``add_`` of those inputs, ``map_(unix.Gunzip)`` and ``add_run(workflow)``
    on a new ``DAG``.  ``kwargs`` is not read.
    """
    dag = DAG()
    pattern = os.path.join(input_dir, '*.gz')

    inputs = []
    for index, gz_file in enumerate(glob.glob(pattern)):
        inputs.append(INPUT(gz_file, tags={'i': index}))

    dag.sequence_(add_(inputs), map_(unix.Gunzip), add_run(workflow))
Ejemplo n.º 9
0
def Pipeline_Somatic():
    """Build the somatic pipeline driven by ``wga_settings``.

    ``wga_settings['test']`` selects the interval list (``[20]`` when truthy,
    otherwise 1..22 plus ``'X'`` and ``'Y'``) and ``wga_settings['target']``
    selects the merge tool (``picard.MERGE_SAMS`` when truthy, otherwise
    ``picard.MarkDuplicates``).  The returned ``sequence_`` chains alignment,
    duplicate handling, alignment preprocessing and somatic calling.
    """
    testing = wga_settings['test']
    target = wga_settings['target']

    interval_values = [20] if testing else list(range(1, 23)) + ['X', 'Y']
    intervals = ('interval', interval_values)
    glm = ('glm', ['SNP', 'INDEL'])  # not consumed by any stage below

    # Stage 1: alignment.
    align_tags = ['sample_name', 'library', 'platform', 'platform_unit',
                  'sample_type', 'chunk', 'rgid']
    align_to_reference = sequence_(
        apply_(reduce_(align_tags, pipes.AlignAndCleanMEM)))

    # Stage 2: merge or duplicate marking.
    dedup_tool = picard.MERGE_SAMS if target else picard.MarkDuplicates
    remove_dup = sequence_(
        reduce_(['sample_name', 'sample_type', 'rgid'], dedup_tool))

    # Stage 3: indexing, indel realignment and base-quality recalibration.
    index_bam = map_(samtools.IndexBam)
    # if not is_capture or testing else map_(gatk.RealignerTargetCreator)
    realign_targets = apply_(split_([intervals], gatk.RealignerTargetCreator))
    realign = map_(gatk.IndelRealigner)
    recalibrate = map_(gatk.BQSR)
    # TODO I add BQSRGatherer as a parent with a hack inside ApplyBQSR.cmd
    apply_recalibration = apply_(
        reduce_(['sample_name', 'sample_type', 'rgid'], gatk.BQSRGatherer),
        map_(gatk.ApplyBQSR))
    preprocess_alignment = sequence_(
        index_bam, realign_targets, realign, recalibrate, apply_recalibration)

    # Stage 4: Mutect branch and svdetect branch under a single apply_.
    mutect_branch = sequence_(
        map_(mutect.createInput),
        reduce_(['rgid', 'interval'], mutect.Somatic, tag={'vcf': 'Mutect'}),
        reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs'),
    )
    svdetect_branch = sequence_(
        map_(svdetect.PreProcessing),
        map_(svdetect.link2SV),
    )
    somatic_call = sequence_(apply_(mutect_branch, svdetect_branch))

    stages = (align_to_reference, remove_dup, preprocess_alignment,
              somatic_call)
    return sequence_(*stages)
Ejemplo n.º 10
0
def CteamPipeline(input_bams):
    """Build the Cteam pipeline for a collection of input BAM files.

    Each BAM is loaded as an ``INPUT`` tagged ``rg`` with its "genome id":
    the file's base name up to the first ``.``.  The per-file load sequences
    are combined and followed by the sort/split, BWA, per-chromosome split,
    rmdup, realignment and genotyping stages.

    The split count is ``16 * max(nNode // len(input_bams), 1)`` capped at
    256, where ``nNode`` comes from ``settings.settings``.  It is also stored
    in ``settings.settings['nSplit']``.

    Args:
        input_bams: sized collection of BAM file paths.

    Returns:
        The complete ``sequence_``.

    Raises:
        ValueError: if ``input_bams`` is empty (previously an opaque
            ZeroDivisionError).
        SystemExit: with status 1 if two inputs share a genome id.
    """
    nInput = len(input_bams)
    if nInput == 0:
        raise ValueError('CteamPipeline() needs at least one input BAM file')

    bam_seq = None
    seen_ids = set()  # genome ids already loaded, to detect duplicate inputs

    for b in input_bams:
        # extract genome_id from file, add as a tag
        genome_id = os.path.basename(b).partition('.')[0]

        if genome_id in seen_ids:
            print('\n\nERROR: "%s" was already included in the input file list.\n' % b)
            # Non-zero status so callers and shells can see the failure.
            sys.exit(1)
        seen_ids.add(genome_id)

        # TEMPORARILY, use genome_id as RG_ID, too
        s = sequence_(
            add_([INPUT(b, tags={'rg': genome_id})], stage_name="Load Input"))

        # append to sequence
        if bam_seq is None:
            bam_seq = s
        else:
            bam_seq = sequence_(bam_seq, s, combine=True)

    nNodes = settings.settings['nNode']
    # Explicit floor division: min 16, up to 256 splits.  ``/`` would yield a
    # float on Python 3 and break the range() below.
    nSplit = min(256, 16 * max(nNodes // nInput, 1))

    settings.settings['nSplit'] = nSplit

    # list(range(...)) so the concatenation also works on Python 3.
    chrom = ('chrom', list(range(1, 23)) + ['X', 'Y', 'MT'])
    split = ('split', list(range(1, nSplit + 1)))

    return sequence_(
        bam_seq,
        map_(pipes.CteamSortSplitBam),  # sort bam by readname (== shuffling)
        split_([split], pipes.CteamTrimReadGroup),
        map_(pipes.CteamBwaAln),  # bwa aln
        map_(pipes.CteamBwaSampe),  # bwa sampe
        # merge split files and (re)split by chromosome
        reduce_(['rg'], pipes.CteamSplitByChromosome),
        split_([chrom], pipes.CteamRmDup_BuildIndex),  # samtools rmdup + index
        map_(pipes.CteamRealignTarget),  # gatk indel realign target creator
        map_(pipes.CteamIndelRealigner),  # gatk indel realigner
        map_(pipes.CteamUnifiedGenotyper)  # gatk unifiedGenotyper

        # #map_(pipes.CteamVariantFiltration)          # gatk variantFilter
    )
Ejemplo n.º 11
0
def Pipeline_local():
    """Assemble the local (non-interval-split) variant of the pipeline.

    Reads ``wga_settings['test']`` and ``wga_settings['target']``.  When
    ``target`` is truthy the duplicate-handling stage is an empty
    ``sequence_()``; otherwise it reduces ``picard.MarkDuplicates`` per
    ``sample_name``.  The ``glm`` tuple is printed to stdout as a side
    effect.

    Returns:
        The ``sequence_`` of alignment, duplicate handling, alignment
        preprocessing and variant calling.
    """
    testing = wga_settings['test']
    target = wga_settings['target']

    # NOTE: ``intervals`` is computed but not consumed by any stage below.
    if testing:
        intervals = ('interval', [20])
    else:
        # list(...) because range() is not a list on Python 3 and cannot be
        # concatenated with one.
        intervals = ('interval', list(range(1, 23)) + ['X', 'Y'])
    glm = ('glm', ['SNP', 'INDEL'])
    # Call form of print: valid on Python 2 and 3, same output as before.
    print(glm)

    align_to_reference = sequence_(
        apply_(
            reduce_(['sample_name', 'library', 'platform', 'platform_unit'],
                    pipes.AlignAndCleanALN)
        ),
    )

    if target:
        remove_dup = sequence_()
    else:
        remove_dup = sequence_(
            reduce_(['sample_name'], picard.MarkDuplicates)
        )

    preprocess_alignment = sequence_(
        map_(samtools.IndexBam),
        # if not is_capture or testing else map_(gatk.RealignerTargetCreator)
        map_(gatk.RealignerTargetCreator),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        map_(gatk.ApplyBQSR_local)
    )

    call_variants = sequence_(
        apply_(
            map_(gatk.UnifiedGenotyper_local, tag={'vcf': 'UnifiedGenotyper'})
        )
    )

    return sequence_(
        align_to_reference,
        remove_dup,
        preprocess_alignment,
        call_variants
    )
Ejemplo n.º 12
0
def Pipeline_local():
    """Build the local pipeline driven by ``wga_settings``.

    ``wga_settings['target']`` decides whether the duplicate-handling stage
    is empty (truthy) or a per-``sample_name`` ``picard.MarkDuplicates``
    reduce.  The ``glm`` tuple is printed to stdout.  The returned
    ``sequence_`` chains alignment, duplicate handling, preprocessing and
    variant calling.
    """
    testing = wga_settings['test']
    target = wga_settings['target']

    # Computed as in the sibling pipelines; no stage here reads it.
    interval_values = [20] if testing else list(range(1, 23)) + ['X', 'Y']
    intervals = ('interval', interval_values)
    glm = ('glm', ['SNP', 'INDEL'])
    print(glm)

    # Stage 1: alignment.
    align_tags = ['sample_name', 'library', 'platform', 'platform_unit']
    align_to_reference = sequence_(
        apply_(reduce_(align_tags, pipes.AlignAndCleanALN)))

    # Stage 2: duplicate marking, skipped entirely when ``target`` is set.
    dedup_steps = []
    if not target:
        dedup_steps.append(reduce_(['sample_name'], picard.MarkDuplicates))
    remove_dup = sequence_(*dedup_steps)

    # Stage 3: indexing, indel realignment and base-quality recalibration.
    # if not is_capture or testing else map_(gatk.RealignerTargetCreator)
    preprocess_steps = [
        map_(samtools.IndexBam),
        map_(gatk.RealignerTargetCreator),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        map_(gatk.ApplyBQSR_local),
    ]
    preprocess_alignment = sequence_(*preprocess_steps)

    # Stage 4: genotyping.
    genotype = map_(gatk.UnifiedGenotyper_local,
                    tag={'vcf': 'UnifiedGenotyper'})
    call_variants = sequence_(apply_(genotype))

    stages = (align_to_reference, remove_dup, preprocess_alignment,
              call_variants)
    return sequence_(*stages)
Ejemplo n.º 13
0
def Pipeline():
    """Assemble the full pipeline including annotation.

    Two entries of ``wga_settings`` steer the construction:

    * ``'test'``    -- when truthy the interval list is ``[20]``; otherwise
      it is 1..22 followed by ``'X'`` and ``'Y'``.
    * ``'capture'`` -- when falsy an extra ``gatk.ReduceReads`` stage is
      inserted between preprocessing and variant calling.

    Returns:
        The ``sequence_`` of alignment, preprocessing, (optionally
        ReduceReads,) variant calling and ``massive_annotation``.
    """
    is_capture = wga_settings['capture']
    testing = wga_settings['test']

    # split_ tuples
    if testing:
        intervals = ('interval', [20])
    else:
        # range() must be materialised before concatenation: on Python 3 it
        # is not a list and ``range(...) + [...]`` raises TypeError.
        intervals = ('interval', list(range(1, 23)) + ['X', 'Y'])

    glm = ('glm', ['SNP', 'INDEL'])

    align_to_reference = sequence_(
        apply_(
            reduce_(['sample_name', 'library'], misc.FastqStats),
            reduce_(
                ['sample_name', 'library', 'platform', 'platform_unit', 'chunk'],
                pipes.AlignAndClean
            )
        ),
    )

    preprocess_alignment = sequence_(
        reduce_(['sample_name'], picard.MarkDuplicates),
        apply_(
            map_(picard.CollectMultipleMetrics),
            # if not is_capture or testing else map_(gatk.RealignerTargetCreator)
            split_([intervals], gatk.RealignerTargetCreator)
        ),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        apply_(
            reduce_(['sample_name'], gatk.BQSRGatherer),
            # TODO I add BQSRGatherer as a parent with a hack inside ApplyBQSR.cmd
            map_(gatk.ApplyBQSR)
        )
    )

    call_variants = sequence_(
        # apply_(
        #     reduce_split_([],[intervals,glm], gatk.UnifiedGenotyper, tag={'vcf': 'UnifiedGenotyper'}),
        #     combine=True
        # ) if is_capture
        # else
        apply_(
            # reduce_(['interval'],gatk.HaplotypeCaller,tag={'vcf':'HaplotypeCaller'}),
            reduce_split_(['interval'], [glm], gatk.UnifiedGenotyper,
                          tag={'vcf': 'UnifiedGenotyper'}),
            combine=True
        ),
        reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs'),
        split_([glm], gatk.VQSR),
        map_(gatk.Apply_VQSR),
        reduce_(['vcf'], gatk.CombineVariants, "Combine into Master VCFs")
    )

    if is_capture:
        return sequence_(
            align_to_reference,
            preprocess_alignment,
            call_variants,
            massive_annotation
        )
    else:
        return sequence_(
            align_to_reference,
            preprocess_alignment,
            reduce_split_(['sample_name'], [intervals], gatk.ReduceReads),
            call_variants,
            massive_annotation
        )
Ejemplo n.º 14
0
from cosmos.lib.ezflow.dag import add_,map_,reduce_,split_,reduce_split_,sequence_,branch_
from subprocess import Popen,PIPE
from genomekey.tools import annovarext
from genomekey.wga_settings import wga_settings
import sys
import os

def get_db_names():
    """Return the database names printed by ``<annovarext_path> listdbs``.

    Runs the executable configured in ``wga_settings['annovarext_path']``
    with the single argument ``listdbs`` and returns the non-empty lines of
    its standard output.

    Raises:
        Exception: if the configured path does not exist, or if the command
            produced fewer than 10 characters of output.
    """
    annovarext_path = wga_settings['annovarext_path']
    cmd = '{0} listdbs'.format(annovarext_path)  # kept for the error message
    if not os.path.exists(annovarext_path):
        # Call form of raise: the ``raise Exception, msg`` form is a syntax
        # error on Python 3.
        raise Exception('AnnovarExtensions is not installed at {0}'.format(annovarext_path))
    # argv is passed as a list so a path containing spaces is not broken up,
    # and text mode is requested because Python 3 would otherwise return
    # bytes, which cannot be split on a str separator.
    dbs = Popen([annovarext_path, 'listdbs'], stdout=PIPE,
                universal_newlines=True).communicate()[0]
    if len(dbs) < 10:
        raise Exception("could not list databases, command was {0}".format(cmd))
    return [db for db in dbs.split('\n') if db != '']

# Annotation stage: Vcf2Anno_in, then Annotate split over the 'build' and
# 'dbname' tags, then MergeAnnotations reduced on 'vcf'.
# NOTE: get_db_names() is called when this module is imported.
massive_annotation = sequence_(
    map_(annovarext.Vcf2Anno_in),
    split_(
        [
            ('build', ['hg19']),
            ('dbname', get_db_names()),
        ],
        annovarext.Annotate,
    ),
    reduce_(['vcf'], annovarext.MergeAnnotations),
)


Ejemplo n.º 15
0
def pipeline(bams, test_bam=False, chromosome_only_split=False):
    """Build the HaplotypeCaller pipeline for the given BAM files.

    Args:
        bams: iterable of BAM file paths.  Each is loaded as an ``INPUT``
            tagged ``bam`` with the file's base name minus its extension,
            then split through ``pipes.Bam_To_BWA``.
        test_bam: when true, restrict the run to sequence name ``chr1``,
            chromosome ``1`` and the ``SNP`` model, and set the
            ``skip_VQSR`` tag value to ``True``.
        chromosome_only_split: when true, do not split or reduce by
            read-group id (``rgId``).

    Returns:
        The HaplotypeCaller ``sequence_``.

    Raises:
        ValueError: if ``bams`` yields no files (previously a NameError on
            variables that were only assigned inside the loop).
    """
    # Settings that do not depend on the individual BAM are computed once,
    # before the loop, instead of being re-assigned on every iteration.
    if test_bam:
        # restrict output for testing
        chrom = ('chrom', [1])
        glm = ('glm', ['SNP'])
        skip_VQSR = ('skip_VQSR', [True])
    else:
        # chrom  = ('chrom', range(1,23) + ['X', 'Y', 'MT'])
        chrom = ('chrom', list(range(1, 23)))
        glm = ('glm', ['SNP', 'INDEL'])
        skip_VQSR = ('skip_VQSR', [False])

    if chromosome_only_split:
        # Stop splitting by rgId
        indelrealign_reduce = ['bam']
    else:
        indelrealign_reduce = ['bam', 'rgId']

    dbnames = ('dbname', [
        'dbSNP135', 'CytoBand', 'Target_Scan', 'mirBase', 'Self_Chain',
        'Repeat_Masker', 'TFBS', 'Segmental_Duplications', 'SIFT', 'COSMIC',
        'PolyPhen2', 'Mutation_Taster', 'GERP', 'PhyloP', 'LRT', 'Mce46way',
        'Complete_Genomics_69', 'The_1000g_Febuary_all', 'The_1000g_April_all',
        'NHLBI_Exome_Project_euro', 'NHLBI_Exome_Project_aa',
        'NHLBI_Exome_Project_all', 'ENCODE_DNaseI_Hypersensitivity',
        'ENCODE_Transcription_Factor', 'UCSC_Gene', 'Refseq_Gene',
        'Ensembl_Gene', 'CCDS_Gene', 'HGMD_INDEL', 'HGMD_SNP', 'GWAS_Catalog'
    ])

    bam_seq = None
    for b in bams:
        header = _getHeaderInfo(b)
        sn = _getSeqName(header)
        rgid = [h[0] for h in header['rg']]

        if test_bam:
            sn = ['chr1']

        # if seqName is empty, then let's assume that the input is unaligned bam
        # use everything before extension as part of tag
        sample_name = os.path.splitext(os.path.basename(b))[0]

        if chromosome_only_split:
            bam_bwa_split = [('prevSn', sn), ('chromosome_only_split', [True])]
        else:
            bam_bwa_split = [('rgId', rgid), ('prevSn', sn),
                             ('chromosome_only_split', [False])]

        s = sequence_(
            add_([INPUT(b, tags={'bam': sample_name})], stage_name="Load BAMs"),
            split_(bam_bwa_split, pipes.Bam_To_BWA))

        if bam_seq is None:
            bam_seq = s
        else:
            bam_seq = sequence_(bam_seq, s, combine=True)

    if bam_seq is None:
        raise ValueError('pipeline() needs at least one input BAM file')

    # Previous pipeline.  Still constructed, as in the original, but it is
    # not the one returned.
    pr_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.ReduceReads),
        reduce_split_(['chrom'], [glm], pipes.UnifiedGenotyper),
        reduce_(['glm'], pipes.VariantQualityScoreRecalibration, tag={'vcf': 'main'}),
        reduce_(['vcf'], pipes.CombineVariants, "Merge VCF"),
        map_(pipes.Vcf2Anno_in),
        split_([dbnames], pipes.Annotate, tag={'build': 'hg19'}),
        reduce_(['vcf'], pipes.MergeAnnotations)
    )

    # HaplotypeCaller Pipeline: official for GATK 3.0
    hc_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.HaplotypeCaller),
        reduce_(['chrom'], pipes.GenotypeGVCFs),
        split_([glm, skip_VQSR], pipes.VariantQualityScoreRecalibration, tag={'vcf': 'main'})
    )

    return hc_pipeline
Ejemplo n.º 16
0
Archivo: ex3.py Proyecto: p7k/COSMOS
from cosmos.Workflow.models import Workflow
from cosmos.lib.ezflow.dag import DAG, split_,add_,map_,reduce_
from tools import ECHO, CAT, WC, PASTE, Sleep

####################
# Workflow
####################

# Stages, in order: two ECHO inputs, Sleep, CAT split over i in [1, 2],
# PASTE reduce, WC.
dag = DAG().sequence_(
    add_([
        ECHO(tags=dict(word='hello')),
        ECHO(tags=dict(word='world')),
    ]),
    map_(Sleep),
    split_([('i', [1, 2])], CAT),
    reduce_([], PASTE),
    map_(WC),
)

# Ask the DAG to write its image to /tmp/ex.svg.
dag.create_dag_img('/tmp/ex.svg')

#################
# Run Workflow
#################

WF = Workflow.start(
    'Example 3',
    restart=True,
    delete_intermediates=True,
)
dag.add_to_workflow(WF)
WF.run()
Ejemplo n.º 17
0
from cosmos.Workflow.models import Workflow
from cosmos.lib.ezflow.dag import DAG, split_,add_,map_,reduce_
from tools import ECHO, CAT, WC, PASTE

####################
# Workflow
####################

# Stages, in order: two ECHO inputs, CAT split over i in [1, 2], PASTE
# reduce, WC.
dag = DAG().sequence_(
    add_([
        ECHO(tags=dict(word='hello')),
        ECHO(tags=dict(word='world')),
    ]),
    split_([('i', [1, 2])], CAT),
    reduce_([], PASTE),
    map_(WC),
)

# Ask the DAG to write its image to /tmp/ex.svg.
dag.create_dag_img('/tmp/ex.svg')

#################
# Run Workflow
#################

WF = Workflow.start(
    'Example 2',
    restart=True,
    delete_intermediates=True,
)
dag.add_to_workflow(WF)
WF.run()
Ejemplo n.º 18
0
from cosmos.lib.ezflow.dag import add_,map_,reduce_,split_,reduce_split_,sequence_,branch_
from subprocess import Popen,PIPE
from genomekey.tools import annovarext
from genomekey.wga_settings import wga_settings
import sys
import os

def get_db_names():
    """Return the database names printed by ``<annovarext_path> listdbs``.

    Runs the executable configured in ``wga_settings['annovarext_path']``
    with the single argument ``listdbs`` and returns the non-empty lines of
    its standard output.

    Raises:
        Exception: if the configured path does not exist, or if the command
            produced fewer than 10 characters of output.
    """
    annovarext_path = wga_settings['annovarext_path']
    cmd = '{0} listdbs'.format(annovarext_path)  # kept for the error message
    if not os.path.exists(annovarext_path):
        raise Exception('AnnovarExtensions is not installed at {0}'.format(annovarext_path))
    # Text mode is required: on Python 3 communicate() otherwise returns
    # bytes and ``dbs.split('\n')`` raises TypeError.  argv is passed as a
    # list so a path containing spaces is not broken up.
    dbs = Popen([annovarext_path, 'listdbs'], stdout=PIPE,
                universal_newlines=True).communicate()[0]
    if len(dbs) < 10:
        raise Exception("could not list databases, command was {0}".format(cmd))
    return [db for db in dbs.split('\n') if db != '']

# Module-level annotation stage, built at import time (this also calls
# get_db_names()): Vcf2Anno_in -> Annotate split over the 'build' and
# 'dbname' tags -> MergeAnnotations reduced on 'vcf'.
massive_annotation = sequence_(
    map_(annovarext.Vcf2Anno_in),
    split_(
        [('build', ['hg19']), ('dbname', get_db_names())],
        annovarext.Annotate,
    ),
    reduce_(
        ['vcf'],
        annovarext.MergeAnnotations,
    ),
)


Ejemplo n.º 19
0
def Pipeline():
    """Assemble the full pipeline including annotation.

    Two entries of ``wga_settings`` steer the construction:

    * ``'test'``    -- when truthy the interval list is ``[20]``; otherwise
      it is 1..22 followed by ``'X'`` and ``'Y'``.
    * ``'capture'`` -- when falsy an extra ``gatk.ReduceReads`` stage is
      inserted between preprocessing and variant calling.

    Returns:
        The ``sequence_`` of alignment, preprocessing, (optionally
        ReduceReads,) variant calling and ``massive_annotation``.
    """
    is_capture = wga_settings['capture']
    testing = wga_settings['test']

    # split_ tuples
    if testing:
        intervals = ('interval', [20])
    else:
        # range() must be materialised before concatenation: on Python 3 it
        # is not a list and ``range(...) + [...]`` raises TypeError.
        intervals = ('interval', list(range(1, 23)) + ['X', 'Y'])

    glm = ('glm', ['SNP', 'INDEL'])

    align_to_reference = sequence_(
        apply_(
            reduce_(['sample_name', 'library'], misc.FastqStats),
            reduce_([
                'sample_name', 'library', 'platform', 'platform_unit', 'chunk'
            ], pipes.AlignAndClean)), )

    preprocess_alignment = sequence_(
        reduce_(['sample_name'], picard.MarkDuplicates),
        apply_(
            map_(picard.CollectMultipleMetrics),
            # if not is_capture or testing else map_(gatk.RealignerTargetCreator)
            split_([intervals], gatk.RealignerTargetCreator)),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        apply_(
            reduce_(['sample_name'], gatk.BQSRGatherer),
            # TODO I add BQSRGatherer as a parent with a hack inside ApplyBQSR.cmd
            map_(gatk.ApplyBQSR)))

    call_variants = sequence_(
        # apply_(
        #     reduce_split_([],[intervals,glm], gatk.UnifiedGenotyper, tag={'vcf': 'UnifiedGenotyper'}),
        #     combine=True
        # ) if is_capture
        # else
        apply_(
            # reduce_(['interval'],gatk.HaplotypeCaller,tag={'vcf':'HaplotypeCaller'}),
            reduce_split_(['interval'], [glm],
                          gatk.UnifiedGenotyper,
                          tag={'vcf': 'UnifiedGenotyper'}),
            combine=True),
        reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs'),
        split_([glm], gatk.VQSR),
        map_(gatk.Apply_VQSR),
        reduce_(['vcf'], gatk.CombineVariants, "Combine into Master VCFs"))

    if is_capture:
        return sequence_(align_to_reference, preprocess_alignment,
                         call_variants, massive_annotation)
    else:
        return sequence_(
            align_to_reference, preprocess_alignment,
            reduce_split_(['sample_name'], [intervals], gatk.ReduceReads),
            call_variants, massive_annotation)
Ejemplo n.º 20
0
def Pipeline_upload():
    """Return the upload pipeline: ``map_(s3_.S3Upload)`` in two sequences.

    The ``map_`` step is placed in an inner ``sequence_`` and that sequence
    is the sole member of the outer ``sequence_`` that is returned.
    """
    steps = [map_(s3_.S3Upload)]
    upload_stage = sequence_(*steps)
    return sequence_(upload_stage)
Ejemplo n.º 21
0
Archivo: ex2.py Proyecto: p7k/COSMOS
from cosmos.Workflow.models import Workflow
from cosmos.lib.ezflow.dag import DAG, split_, add_, map_, reduce_
from tools import ECHO, CAT, WC, PASTE

####################
# Workflow
####################

# One stage per line, in order: two ECHO inputs, CAT split over i in
# [1, 2], PASTE reduce, WC.
dag = DAG().sequence_(
    add_([
        ECHO(tags={'word': 'hello'}),
        ECHO(tags={'word': 'world'}),
    ]),
    split_([('i', [1, 2])], CAT),
    reduce_([], PASTE),
    map_(WC),
)

# Ask the DAG to write its image to /tmp/ex.svg.
dag.create_dag_img('/tmp/ex.svg')

#################
# Run Workflow
#################

WF = Workflow.start(
    'Example 2',
    restart=True,
    delete_intermediates=True,
)
dag.add_to_workflow(WF)
WF.run()
Ejemplo n.º 22
0
def pipeline(bams, test_bam=False, chromosome_only_split=False):
    """Build the HaplotypeCaller pipeline for the given BAM files.

    Args:
        bams: iterable of BAM file paths.  Each is loaded as an ``INPUT``
            tagged ``bam`` with the file's base name minus its extension,
            then split through ``pipes.Bam_To_BWA``.
        test_bam: when true, restrict the run to sequence name ``chr1``,
            chromosome ``1`` and the ``SNP`` model, and set the
            ``skip_VQSR`` tag value to ``True``.
        chromosome_only_split: when true, do not split or reduce by
            read-group id (``rgId``).

    Returns:
        The HaplotypeCaller ``sequence_``.

    Raises:
        ValueError: if ``bams`` yields no files (previously a NameError on
            variables that were only assigned inside the loop).
    """
    # Settings that do not depend on the individual BAM are computed once,
    # before the loop, instead of being re-assigned on every iteration.
    if test_bam:
        # restrict output for testing
        chrom = ('chrom', [1])
        glm = ('glm', ['SNP'])
        skip_VQSR = ('skip_VQSR', [True])
    else:
        # chrom  = ('chrom', range(1,23) + ['X', 'Y', 'MT'])
        chrom = ('chrom', list(range(1, 23)))
        glm = ('glm', ['SNP', 'INDEL'])
        skip_VQSR = ('skip_VQSR', [False])

    if chromosome_only_split:
        # Stop splitting by rgId
        indelrealign_reduce = ['bam']
    else:
        indelrealign_reduce = ['bam', 'rgId']

    dbnames = ('dbname', [
        'dbSNP135', 'CytoBand', 'Target_Scan', 'mirBase', 'Self_Chain',
        'Repeat_Masker', 'TFBS', 'Segmental_Duplications', 'SIFT', 'COSMIC',
        'PolyPhen2', 'Mutation_Taster', 'GERP', 'PhyloP', 'LRT', 'Mce46way',
        'Complete_Genomics_69', 'The_1000g_Febuary_all', 'The_1000g_April_all',
        'NHLBI_Exome_Project_euro', 'NHLBI_Exome_Project_aa',
        'NHLBI_Exome_Project_all', 'ENCODE_DNaseI_Hypersensitivity',
        'ENCODE_Transcription_Factor', 'UCSC_Gene', 'Refseq_Gene',
        'Ensembl_Gene', 'CCDS_Gene', 'HGMD_INDEL', 'HGMD_SNP', 'GWAS_Catalog'
    ])

    bam_seq = None
    for b in bams:
        header = _getHeaderInfo(b)
        sn = _getSeqName(header)
        rgid = [h[0] for h in header['rg']]

        if test_bam:
            sn = ['chr1']

        # if seqName is empty, then let's assume that the input is unaligned bam
        # use everything before extension as part of tag
        sample_name = os.path.splitext(os.path.basename(b))[0]

        if chromosome_only_split:
            bam_bwa_split = [('prevSn', sn), ('chromosome_only_split', [True])]
        else:
            bam_bwa_split = [('rgId', rgid), ('prevSn', sn),
                             ('chromosome_only_split', [False])]

        s = sequence_(
            add_([INPUT(b, tags={'bam': sample_name})],
                 stage_name="Load BAMs"),
            split_(bam_bwa_split, pipes.Bam_To_BWA))

        if bam_seq is None:
            bam_seq = s
        else:
            bam_seq = sequence_(bam_seq, s, combine=True)

    if bam_seq is None:
        raise ValueError('pipeline() needs at least one input BAM file')

    # Previous pipeline.  Still constructed, as in the original, but it is
    # not the one returned.
    pr_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.ReduceReads),
        reduce_split_(['chrom'], [glm], pipes.UnifiedGenotyper),
        reduce_(['glm'],
                pipes.VariantQualityScoreRecalibration,
                tag={'vcf': 'main'}),
        reduce_(['vcf'], pipes.CombineVariants, "Merge VCF"),
        map_(pipes.Vcf2Anno_in),
        split_([dbnames], pipes.Annotate, tag={'build': 'hg19'}),
        reduce_(['vcf'], pipes.MergeAnnotations))

    # HaplotypeCaller Pipeline: official for GATK 3.0
    hc_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.HaplotypeCaller),
        reduce_(['chrom'], pipes.GenotypeGVCFs),
        split_([glm, skip_VQSR],
               pipes.VariantQualityScoreRecalibration,
               tag={'vcf': 'main'}))

    return hc_pipeline