Example #1
def json_somatic(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            "chunk": "001",
            "library": "LIB-1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "rgid": "BC18-06-2013",
            "sample_name": "BC18-06-2013LyT_S5_L001",
            "pair": "1",
            "path": "/path/to/fastq.gz",
            "sample_type": "normal or tumor"
        },
        {..}
    ]
    """

    input_json = json.load(open(input_dict, 'r'))
    inputs = [
        INPUT(name='fastq.gz',
              path=i['path'],
              fmt='fastq.gz',
              tags=i,
              stage_name='Load Input Fastqs') for i in input_json
    ]

    DAG(ignore_stage_name_collisions=True).sequence_(add_(inputs),
                                                     Pipeline_Somatic(),
                                                     configure(wga_settings),
                                                     add_run(workflow))
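
For reference, a minimal sketch of writing an input file in the format json_somatic expects (every field value below is a placeholder taken from the docstring):

import json

records = [
    {
        "chunk": "001",
        "library": "LIB-1216301779A",
        "platform": "ILLUMINA",
        "platform_unit": "C0MR3ACXX.001",
        "rgid": "BC18-06-2013",
        "sample_name": "BC18-06-2013LyT_S5_L001",
        "pair": "1",
        "path": "/path/to/fastq.gz",
        "sample_type": "tumor",  # "normal" or "tumor"
    },
]

with open("somatic_input.json", "w") as fh:
    json.dump(records, fh, indent=4)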
Example #2
def json_(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            "chunk": "001",
            "library": "LIB-1216301779A",
            "sample_name": "1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "pair": 0,
            "path": "/path/to/fastq"
        },
        {..}
    ]

    "pair" is 0 or 1.
    """

    input_json = json.load(open(input_dict, 'r'))
    inputs = [
        INPUT(name='fastq.gz',
              path=i['path'],
              fmt='fastq.gz',
              tags=i,
              stage_name='Load Input Fastqs') for i in input_json
    ]

    DAG(ignore_stage_name_collisions=True).sequence_(add_(inputs), Pipeline(),
                                                     configure(wga_settings),
                                                     add_run(workflow))
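
Since every record is passed straight into INPUT as its tags, a malformed record only fails deep inside the pipeline. A hedged sketch of validating the file up front (the required-key list is inferred from the docstring, not from the library):

import json

REQUIRED_KEYS = {"chunk", "library", "sample_name", "platform",
                 "platform_unit", "pair", "path"}

with open("input.json") as fh:
    records = json.load(fh)

for n, record in enumerate(records):
    missing = REQUIRED_KEYS - set(record)
    if missing:
        raise ValueError("record %d is missing keys: %s"
                         % (n, ", ".join(sorted(missing))))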
Example #3
File: cli.py Project: LPM-HMS/PvKey
def json_somatic(workflow,input_dict,**kwargs):
    """
    Input file is a json of the following format:

    [
        {
            "chunk": "001",
            "library": "LIB-1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "rgid": "BC18-06-2013",
            "sample_name": "BC18-06-2013LyT_S5_L001",
            "pair": "1",
            "path": "/path/to/fastq.gz",
            "sample_type": "normal or tumor"
        },
        {..}
    ]
    """
    
    input_json = json.load(open(input_dict,'r'))
    inputs = [ INPUT(name='fastq.gz',path=i['path'],fmt='fastq.gz',tags=i,stage_name='Load Input Fastqs') for i in input_json ]

    DAG(ignore_stage_name_collisions=True).sequence_(
         add_(inputs),
         Pipeline_Somatic(),
         configure(wga_settings),
         add_run(workflow)
    )
Example #4
File: cli.py Project: LPM-HMS/PvKey
def json_local(workflow,input_dict,**kwargs):
    """
    Input is a folder where each file is a json of the following format:

    [
        {
            "library": "LIB-1216301779A",
            "sample_name": "1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "pair": 1,
            "path": "/path/to/fastq"
        },
        {
            "library": "LIB-1216301779A",
            "sample_name": "1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "pair": 2,
            "path": "/path/to/fastq"
        }
    ]
    """
    dirList = os.listdir(input_dict)
    for files in dirList:
        path = os.path.join(input_dict, files)  # don't rely on a trailing slash
        print(path)
        input_json = json.load(open(path, 'r'))
        inputs = [ INPUT(name='fastq.gz',path=i['path'],fmt='fastq.gz',tags=i,stage_name='Load Input Fastqs') for i in input_json ]
        for i in inputs:
            print(i)
        DAG(ignore_stage_name_collisions=True).sequence_(
            add_(inputs),
            Pipeline_local(),
            configure(wga_settings),
            add_run(workflow)
        )
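
The directory walk above reads every entry os.listdir returns; a hypothetical standard-library helper that skips non-JSON files and flattens all records could look like this (iter_input_records is an invented name, not part of GenomeKey):

import json
import os

def iter_input_records(input_dir):
    """Yield every record from every .json file in input_dir."""
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".json"):
            continue  # ignore stray non-JSON files
        with open(os.path.join(input_dir, filename)) as fh:
            for record in json.load(fh):
                yield record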
Example #5
File: cli.py Project: LPM-HMS/PvKey
def json_(workflow,input_dict,**kwargs):
    """
    Input file is a json of the following format:
    [
        {
            "chunk": "001",
            "library": "LIB-1216301779A",
            "sample_name": "1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "pair": 0,
            "path": "/path/to/fastq"
        },
        {..}
    ]

    "pair" is 0 or 1.
    """
    input_json = json.load(open(input_dict,'r'))
    inputs = [ INPUT(name='fastq.gz',path=i['path'],fmt='fastq.gz',tags=i,stage_name='Load Input Fastqs') for i in input_json ]

    DAG(ignore_stage_name_collisions=True).sequence_(
         add_(inputs),
         Pipeline(),
         configure(wga_settings),
         add_run(workflow)
    )
Example #6
def downdbs(workflow,**kwargs):
    """
    Download all annotation databases
    """
    DAG().sequence_(
        add_([ annovarext.DownDB(tags={'build':'hg19','dbname':db}) for db in annovarext.get_db_names() ]),
        configure(wga_settings),
        add_run(workflow)
    )
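
The list comprehension hands add_ one DownDB tool per annotation database. Purely for illustration, the tag dicts it builds look like this (the two names are placeholders; the real list comes from annovarext.get_db_names()):

db_names = ['dbSNP135', 'CytoBand']  # placeholder for annovarext.get_db_names()
tags = [{'build': 'hg19', 'dbname': db} for db in db_names]
# -> [{'build': 'hg19', 'dbname': 'dbSNP135'},
#     {'build': 'hg19', 'dbname': 'CytoBand'}]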
Example #7
def gunzip(workflow, input_dir, **kwargs):
    """
    Gunzips all gz files in directory

    $ genomekey gunzip -n 'Gunzip' /path/to/dir
    """
    DAG().sequence_(
        add_([
            INPUT(f, tags={'i': i})
            for i, f in enumerate(glob.glob(os.path.join(input_dir, '*.gz')))
        ]), map_(unix.Gunzip), add_run(workflow))
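
Note that the glob pattern only matches *.gz files sitting directly in input_dir. If nested files should be picked up too, a recursive variant (Python 3.5+, standard library only) would be:

import glob
import os

input_dir = '/path/to/dir'
gz_files = glob.glob(os.path.join(input_dir, '**', '*.gz'), recursive=True)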
Example #8
def gunzip(workflow,input_dir,**kwargs):
    """
    Gunzips all gz files in directory

    $ genomekey gunzip -n 'Gunzip' /path/to/dir
    """
    DAG().sequence_(
         add_([ INPUT(f,tags={'i':i}) for i,f in enumerate(glob.glob(os.path.join(input_dir,'*.gz'))) ]),
         map_(unix.Gunzip),
         add_run(workflow)
    )
Example #9
def downdbs(workflow, **kwargs):
    """
    Download all annotation databases
    """
    DAG().sequence_(
        add_([
            annovarext.DownDB(tags={
                'build': 'hg19',
                'dbname': db
            }) for db in annovarext.get_db_names()
        ]), configure(wga_settings), add_run(workflow))
Example #10
File: cli.py Project: LPM-HMS/PvKey
def fastq_(workflow,input_dict,output_dict,output_json,**kwargs):
    
    json_fastq_to_split=json_creator.json_out(input_dict,output_dict)
    input_json = json.load(open(json_fastq_to_split,'r'))
    inputs = [ INPUT(name='fastq.gz',path=i['gz_path'],fmt='fastq.gz',tags=i,stage_name='Load Input Fastqs') for i in input_json ]
        
    DAG(ignore_stage_name_collisions=True).sequence_(
         add_(inputs),
         Pipeline_split(),
         configure(wga_settings),
         add_run(workflow)
    )
Example #11
File: cli.py Project: LPM-HMS/PvKey
def upload_(workflow,bucket,project,out_dict,**kwargs):        
    project_folder=join(out_dict,project.replace(" ", "_"))
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)
    json_fastq_to_upload=s3_Bucket.getList(bucket,project,out_dict)
    input_json = json.load(open(json_fastq_to_upload,'r'))
    inputs = [ INPUT(name='fastq.gz',path=i['gz_path'],fmt='fastq.gz',tags=i,stage_name='Load Input Fastqs') for i in input_json ]
        
    DAG(ignore_stage_name_collisions=True).sequence_(
         add_(inputs),
         Pipeline_upload(),
         configure(wga_settings),
         add_run(workflow)
    )
Example #12
def CteamPipeline(input_bams):

    bam_seq = None
    bam_dup = []  # to check duplicate input files

    for b in input_bams:
        # extract genome_id from file, add as a tag
        genome_id = os.path.basename(b).partition('.')[0]

        if genome_id in bam_dup:
            print('\n\nERROR: "%s" was already included in the input file list.\n' % b)
            sys.exit()
        else:
            bam_dup.append(genome_id)

        # TEMPORARILY, use genome_id as RG_ID, too
        s = sequence_(
            add_([INPUT(b, tags={'rg': genome_id})], stage_name="Load Input"))

        # append to sequence
        if bam_seq is None: bam_seq = s
        else: bam_seq = sequence_(bam_seq, s, combine=True)

    nInput = len(input_bams)
    nNodes = settings.settings['nNode']
    # floor division: at least 16, at most 256 splits
    nSplit = min(256, 16 * max(nNodes // nInput, 1))

    settings.settings['nSplit'] = nSplit

    chrom = ('chrom', list(range(1, 23)) + ['X', 'Y', 'MT'])
    split = ('split', list(range(1, nSplit + 1)))

    return sequence_(
        bam_seq,
        map_(pipes.CteamSortSplitBam),  # sort bam by readname (== shuffling)
        split_([split], pipes.CteamTrimReadGroup),  # split into nSplit chunks
        map_(pipes.CteamBwaAln),  # bwa aln
        map_(pipes.CteamBwaSampe),  # bwa sampe
        reduce_(['rg'], pipes.CteamSplitByChromosome),  # merge split files and (re)split by chromosome
        split_([chrom], pipes.CteamRmDup_BuildIndex),  # samtools rmdup + index
        map_(pipes.CteamRealignTarget),  # gatk indel realign target creator
        map_(pipes.CteamIndelRealigner),  # gatk indel realigner
        map_(pipes.CteamUnifiedGenotyper)  # gatk unifiedGenotyper

        # map_(pipes.CteamVariantFiltration)  # gatk variantFilter
    )
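
To make the nSplit arithmetic concrete, here is the bound-checking logic as a standalone function with a few made-up node/input counts (floor division, clamped between 16 and 256):

def compute_nsplit(n_nodes, n_input):
    return min(256, 16 * max(n_nodes // n_input, 1))

assert compute_nsplit(4, 8) == 16    # more inputs than nodes: floor -> 0, clamped to 1
assert compute_nsplit(32, 4) == 128  # 16 * 8
assert compute_nsplit(64, 2) == 256  # 16 * 32 = 512, capped at 256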
Example #13
def fastq_(workflow, input_dict, output_dict, output_json, **kwargs):

    json_fastq_to_split = json_creator.json_out(input_dict, output_dict)
    input_json = json.load(open(json_fastq_to_split, 'r'))
    inputs = [
        INPUT(name='fastq.gz',
              path=i['gz_path'],
              fmt='fastq.gz',
              tags=i,
              stage_name='Load Input Fastqs') for i in input_json
    ]

    DAG(ignore_stage_name_collisions=True).sequence_(add_(inputs),
                                                     Pipeline_split(),
                                                     configure(wga_settings),
                                                     add_run(workflow))
Example #14
def anno(workflow, input_file, input_file_list, file_format='vcf', **kwargs):
    """
    Annotates all files in input_files

    $ genomekey anno -n 'My Annotation Workflow #1' file1.vcf file2.vcf
    """
    input_files = input_file_list.read().strip().split(
        '\n') if input_file_list else []
    if input_file:
        input_files.append(input_file.name)
    print('annotating {0}'.format(', '.join(input_files)), file=sys.stderr)

    DAG().sequence_(
        add_([
            INPUT(input_file, tags={'vcf': i})
            for i, input_file in enumerate(input_files)
        ]), massive_annotation, configure(wga_settings), add_run(workflow))
Example #15
def anno(workflow,input_file,input_file_list,file_format='vcf',**kwargs):
    """
    Annotates all files in input_files

    $ genomekey anno -n 'My Annotation Workflow #1' file1.vcf file2.vcf
    """
    input_files = input_file_list.read().strip().split('\n') if input_file_list else []
    if input_file:
        input_files.append(input_file.name)
    print('annotating {0}'.format(', '.join(input_files)), file=sys.stderr)

    DAG().sequence_(
        add_([ INPUT(input_file,tags={'vcf':i}) for i,input_file in enumerate(input_files) ]),
        massive_annotation,
        configure(wga_settings),
        add_run(workflow)
    )
Example #16
def upload_(workflow, bucket, project, out_dict, **kwargs):
    project_folder = join(out_dict, project.replace(" ", "_"))
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)
    json_fastq_to_upload = s3_Bucket.getList(bucket, project, out_dict)
    input_json = json.load(open(json_fastq_to_upload, 'r'))
    inputs = [
        INPUT(name='fastq.gz',
              path=i['gz_path'],
              fmt='fastq.gz',
              tags=i,
              stage_name='Load Input Fastqs') for i in input_json
    ]

    DAG(ignore_stage_name_collisions=True).sequence_(add_(inputs),
                                                     Pipeline_upload(),
                                                     configure(wga_settings),
                                                     add_run(workflow))
Example #17
def Bam2Fastq(workflow, dag, settings, input_bams):
    if len(input_bams) == 0:
        raise WorkflowException('At least 1 BAM input required')
    dag.sequence_(
        sequence_(
            *[
                sequence_(
                    add_([ INPUT(input_bam, tags={'input':os.path.basename(input_bam)})],stage_name="Load Input Bams"),
                    split_([('rgid',_inputbam2rgids(input_bam))],pipes.FilterBamByRG_To_FastQ)
                )
                for input_bam in input_bams
            ],
            combine=True
        ),
        split_([('pair',[1,2])],genomekey_scripts.SplitFastq),
        configure(settings),
        add_run(workflow,finish=False),
    ).add_(list(_splitfastq2inputs(dag)))
    return dag
Example #18
def Bam2Fastq(workflow, dag, settings, input_bams):
    if len(input_bams) == 0:
        raise WorkflowException('At least 1 BAM input required')
    dag.sequence_(
        sequence_(
            *[
                sequence_(
                    add_([INPUT(input_bam, tags={'input': os.path.basename(input_bam)})],
                         stage_name="Load Input Bams"),
                    split_([('rgid', _inputbam2rgids(input_bam))],
                           pipes.FilterBamByRG_To_FastQ))
                for input_bam in input_bams
            ],
            combine=True),
        split_([('pair', [1, 2])], genomekey_scripts.SplitFastq),
        configure(settings),
        add_run(workflow, finish=False),
    ).add_(list(_splitfastq2inputs(dag)))
    return dag
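
The helper _inputbam2rgids is not shown in these excerpts. A hypothetical implementation that pulls read-group IDs out of a BAM header, assuming pysam is available, might read:

import pysam

def _inputbam2rgids(input_bam):
    """Return the RG/ID values from a BAM header (hypothetical sketch)."""
    with pysam.AlignmentFile(input_bam, "rb", check_sq=False) as bam:
        return [rg["ID"] for rg in bam.header.to_dict().get("RG", [])]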
Example #19
def json_local(workflow, input_dict, **kwargs):
    """
    Input is a folder where each file is a json of the following format:

    [
        {
            "library": "LIB-1216301779A",
            "sample_name": "1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "pair": 1,
            "path": "/path/to/fastq"
        },
        {
            "library": "LIB-1216301779A",
            "sample_name": "1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "pair": 2,
            "path": "/path/to/fastq"
        }
    ]
    """
    dirList = os.listdir(input_dict)
    for files in dirList:
        path = os.path.join(input_dict, files)  # don't rely on a trailing slash
        print(path)
        input_json = json.load(open(path, 'r'))
        inputs = [
            INPUT(name='fastq.gz',
                  path=i['path'],
                  fmt='fastq.gz',
                  tags=i,
                  stage_name='Load Input Fastqs') for i in input_json
        ]
        for i in inputs:
            print(i)
        DAG(ignore_stage_name_collisions=True).sequence_(
            add_(inputs), Pipeline_local(), configure(wga_settings),
            add_run(workflow))
Example #20
File: ex2.py Project: p7k/COSMOS
from cosmos.Workflow.models import Workflow
from cosmos.lib.ezflow.dag import DAG, split_, add_, map_, reduce_
from tools import ECHO, CAT, WC, PASTE

####################
# Workflow
####################

dag = DAG().sequence_(
    add_([ECHO(tags={'word': 'hello'}),
          ECHO(tags={'word': 'world'})]),
    split_([('i', [1, 2])], CAT),
    reduce_([], PASTE),
    map_(WC))

dag.create_dag_img('/tmp/ex.svg')

#################
# Run Workflow
#################

WF = Workflow.start('Example 2', restart=True, delete_intermediates=True)
dag.add_to_workflow(WF)
WF.run()
Example #21
def pipeline(bams, test_bam=False, chromosome_only_split=False):

    # split_ tuples
    # chrom = ('chrom', list(range(1, 23)) + ['X', 'Y', 'MT'])
    chrom = ('chrom', list(range(1, 23)))

    glm = ('glm', ['SNP', 'INDEL'])

    dbnames = ('dbname', [
        'dbSNP135', 'CytoBand', 'Target_Scan', 'mirBase', 'Self_Chain',
        'Repeat_Masker', 'TFBS', 'Segmental_Duplications', 'SIFT', 'COSMIC',
        'PolyPhen2', 'Mutation_Taster', 'GERP', 'PhyloP', 'LRT', 'Mce46way',
        'Complete_Genomics_69', 'The_1000g_Febuary_all', 'The_1000g_April_all',
        'NHLBI_Exome_Project_euro', 'NHLBI_Exome_Project_aa',
        'NHLBI_Exome_Project_all', 'ENCODE_DNaseI_Hypersensitivity',
        'ENCODE_Transcription_Factor', 'UCSC_Gene', 'Refseq_Gene',
        'Ensembl_Gene', 'CCDS_Gene', 'HGMD_INDEL', 'HGMD_SNP', 'GWAS_Catalog'
    ])
    bam_seq = None

    for b in bams:
        header = _getHeaderInfo(b)
        sn = _getSeqName(header)

        rgid = [h[0] for h in header['rg']]

        # restrict output for testing
        if test_bam:
            sn = ['chr1']
            chrom = ('chrom', [1])
            glm = ('glm', ['SNP'])
            skip_VQSR = ('skip_VQSR', [True])
        else:
            skip_VQSR = ('skip_VQSR', [False])

        # if seqName is empty, then let's assume that the input is unaligned bam
        # use everything before extension as part of tag
        sample_name = os.path.splitext(os.path.basename(b))[0]

        if chromosome_only_split:
            # Stop splitting by rgId
            bam_bwa_split = [('prevSn', sn), ('chromosome_only_split', [True])]
            indelrealign_reduce = ['bam']
        else:
            bam_bwa_split = [('rgId', rgid), ('prevSn', sn),
                             ('chromosome_only_split', [False])]
            indelrealign_reduce = ['bam', 'rgId']

        s = sequence_(
            add_([INPUT(b, tags={'bam': sample_name})],
                 stage_name="Load BAMs"),
            split_(bam_bwa_split, pipes.Bam_To_BWA))

        if bam_seq is None: bam_seq = s
        else: bam_seq = sequence_(bam_seq, s, combine=True)

    # Previous (UnifiedGenotyper-based) pipeline; built here but not returned
    pr_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.ReduceReads),
        reduce_split_(['chrom'], [glm], pipes.UnifiedGenotyper),
        reduce_(['glm'],
                pipes.VariantQualityScoreRecalibration,
                tag={'vcf': 'main'}),
        reduce_(['vcf'], pipes.CombineVariants, "Merge VCF"),
        map_(pipes.Vcf2Anno_in),
        split_([dbnames], pipes.Annotate, tag={'build': 'hg19'}),
        reduce_(['vcf'], pipes.MergeAnnotations))

    # HaplotypeCaller Pipeline: official for GATK 3.0
    hc_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.HaplotypeCaller), reduce_(['chrom'], pipes.GenotypeGVCFs),
        split_([glm, skip_VQSR],
               pipes.VariantQualityScoreRecalibration,
               tag={'vcf': 'main'}))

    return hc_pipeline
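
The split_ tuples fan tasks out over every combination of the listed tag values. The expansion itself is plain Cartesian-product logic, which this standalone sketch mimics (it is not the library's actual implementation):

from itertools import product

chrom = ('chrom', list(range(1, 23)))
glm = ('glm', ['SNP', 'INDEL'])

# each combination becomes one task's tag dict
combos = [dict(zip((chrom[0], glm[0]), values))
          for values in product(chrom[1], glm[1])]
print(len(combos))  # 22 chromosomes x 2 glm values = 44 tag dicts
print(combos[0])    # {'chrom': 1, 'glm': 'SNP'}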
Example #22
from cosmos.Workflow.models import Workflow
from cosmos.lib.ezflow.dag import DAG, add_, split_
from tools import ECHO, CAT

####################
# Workflow
####################

dag = DAG().sequence_(
    add_([ ECHO(tags={'word':'hello'}), ECHO(tags={'word':'world'}) ]),
    split_([('i',[1,2])],CAT)
)
dag.create_dag_img('/tmp/ex.svg')

#################
# Run Workflow
#################

WF = Workflow.start('Example 1',restart=True)
dag.add_to_workflow(WF)
WF.run()
Example #23
def pipeline(bams, test_bam=False, chromosome_only_split=False):

    # split_ tuples
    # chrom = ('chrom', list(range(1,23)) + ['X', 'Y', 'MT'])
    chrom  = ('chrom', list(range(1,23)))

    glm = ('glm', ['SNP', 'INDEL'])

    dbnames = ('dbname', ['dbSNP135','CytoBand','Target_Scan','mirBase','Self_Chain','Repeat_Masker','TFBS','Segmental_Duplications','SIFT','COSMIC',
                          'PolyPhen2','Mutation_Taster','GERP','PhyloP','LRT','Mce46way','Complete_Genomics_69','The_1000g_Febuary_all','The_1000g_April_all',
                          'NHLBI_Exome_Project_euro','NHLBI_Exome_Project_aa','NHLBI_Exome_Project_all','ENCODE_DNaseI_Hypersensitivity','ENCODE_Transcription_Factor',
                          'UCSC_Gene','Refseq_Gene','Ensembl_Gene','CCDS_Gene','HGMD_INDEL','HGMD_SNP','GWAS_Catalog'])
    bam_seq = None
    
    for b in bams:
        header = _getHeaderInfo(b)
        sn     = _getSeqName(header)

        rgid = [ h[0] for h in header['rg']]

        # restrict output for testing
        if test_bam:
            sn    = ['chr1']
            chrom = ('chrom',[1])
            glm   = ('glm',['SNP'])
            skip_VQSR = ('skip_VQSR', [True])
        else:
            skip_VQSR = ('skip_VQSR', [False])

        # if seqName is empty, then let's assume that the input is unaligned bam
        # use everything before extension as part of tag
        sample_name = os.path.splitext(os.path.basename(b))[0]

        if chromosome_only_split:
            # Stop splitting by rgId
            bam_bwa_split = [ ('prevSn', sn), ('chromosome_only_split', [True]) ]
            indelrealign_reduce =  ['bam']
        else:
            bam_bwa_split = [ ('rgId', rgid), ('prevSn', sn), ('chromosome_only_split', [False]) ]
            indelrealign_reduce =  ['bam','rgId']

        s = sequence_( add_([INPUT(b, tags={'bam':sample_name})], stage_name="Load BAMs"), 
                       split_(bam_bwa_split, pipes.Bam_To_BWA))

        if bam_seq is None:   bam_seq = s
        else:                 bam_seq = sequence_(bam_seq, s, combine=True)

    # Previous (UnifiedGenotyper-based) pipeline; built here but not returned
    pr_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(                                  pipes.MarkDuplicates),
        reduce_(['bam','chrom'],               pipes.BaseQualityScoreRecalibration),
        map_(                                  pipes.ReduceReads),
        reduce_split_(['chrom'], [glm],        pipes.UnifiedGenotyper),
        reduce_(['glm'],                       pipes.VariantQualityScoreRecalibration, tag={'vcf':'main'}),
        reduce_(['vcf'],                       pipes.CombineVariants, "Merge VCF"),
        map_(                                  pipes.Vcf2Anno_in),       
        split_([dbnames],                      pipes.Annotate, tag={'build':'hg19'}),       
        reduce_(['vcf'],                       pipes.MergeAnnotations)
    )

    # HaplotypeCaller Pipeline: official for GATK 3.0
    hc_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(                                  pipes.MarkDuplicates),
        reduce_(['bam','chrom'],               pipes.BaseQualityScoreRecalibration),
        map_(                                  pipes.HaplotypeCaller),
        reduce_(['chrom'],                     pipes.GenotypeGVCFs),
        split_([glm, skip_VQSR],               pipes.VariantQualityScoreRecalibration, tag={'vcf':'main'})
    )

    return hc_pipeline
Example #24
File: ex3.py Project: p7k/COSMOS
from cosmos.Workflow.models import Workflow
from cosmos.lib.ezflow.dag import DAG, split_, add_, map_, reduce_
from tools import ECHO, CAT, WC, PASTE, Sleep

####################
# Workflow
####################

dag = DAG().sequence_(
    add_([ ECHO(tags={'word':'hello'}), ECHO(tags={'word':'world'}) ]),
    map_(Sleep),
    split_([('i',[1,2])], CAT),
    reduce_([], PASTE),
    map_(WC),
)

dag.create_dag_img('/tmp/ex.svg')

#################
# Run Workflow
#################

WF = Workflow.start('Example 3',restart=True,delete_intermediates=True)
dag.add_to_workflow(WF)
WF.run()