Example #1
def json_somatic(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            "chunk": "001",
            "library": "LIB-1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "rgid": "BC18-06-2013",
            "sample_name": "BC18-06-2013LyT_S5_L001",
            "pair": "1",
            "path": "/path/to/fastq.gz",
            "sample_type": "normal or tumor"
        },
        {..}
    ]
    """

    input_json = json.load(open(input_dict, 'r'))
    inputs = [
        INPUT(name='fastq.gz',
              path=i['path'],
              fmt='fastq.gz',
              tags=i,
              stage_name='Load Input Fastqs') for i in input_json
    ]

    DAG(ignore_stage_name_collisions=True).sequence_(add_(inputs),
                                                     Pipeline_Somatic(),
                                                     configure(wga_settings),
                                                     add_run(workflow))
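
A minimal input file matching the docstring, and the call that would consume it; the workflow object and the surrounding genomekey/Cosmos setup are assumed, and the field values are placeholders:

import json

# placeholder record following the docstring's schema
records = [{
    'chunk': '001',
    'library': 'LIB-1216301779A',
    'platform': 'ILLUMINA',
    'platform_unit': 'C0MR3ACXX.001',
    'rgid': 'BC18-06-2013',
    'sample_name': 'BC18-06-2013LyT_S5_L001',
    'pair': '1',
    'path': '/path/to/fastq.gz',
    'sample_type': 'tumor',
}]
with open('somatic_input.json', 'w') as f:
    json.dump(records, f, indent=4)

# json_somatic(workflow, 'somatic_input.json')  # assumes a configured workflow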
Example #2
def json_(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            'chunk': '001',
            'library': 'LIB-1216301779A',
            'sample_name': '1216301779A',
            'platform': 'ILLUMINA',
            'platform_unit': 'C0MR3ACXX.001',
            'pair': 0,  # 0 or 1
            'path': '/path/to/fastq'
        },
        {..}
    ]
    """

    input_json = json.load(open(input_dict, 'r'))
    inputs = [
        INPUT(name='fastq.gz',
              path=i['path'],
              fmt='fastq.gz',
              tags=i,
              stage_name='Load Input Fastqs') for i in input_json
    ]

    DAG(ignore_stage_name_collisions=True).sequence_(add_(inputs), Pipeline(),
                                                     configure(wga_settings),
                                                     add_run(workflow))
Example #3
def gunzip(workflow, input_dir, **kwargs):
    """
    Gunzips all .gz files in a directory

    $ genomekey gunzip -n 'Gunzip' /path/to/dir
    """
    DAG().sequence_(
        add_([
            INPUT(f, tags={'i': i})
            for i, f in enumerate(glob.glob(os.path.join(input_dir, '*.gz')))
        ]), map_(unix.Gunzip), add_run(workflow))
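
The inputs are just every *.gz under the directory, tagged with their enumeration index; the same listing in isolation (directory path hypothetical):

import glob
import os

input_dir = '/path/to/dir'  # hypothetical
for i, f in enumerate(glob.glob(os.path.join(input_dir, '*.gz'))):
    print(i, f)  # i becomes the 'i' tag on the corresponding INPUT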
Example #4
def CteamPipeline(input_bams):

    bam_seq = None
    bam_dup = []  # to check duplicate input files

    for b in input_bams:
        # extract genome_id from file, add as a tag
        genome_id = os.path.basename(b).partition('.')[0]

        if genome_id in bam_dup:
            print('\n\nERROR: "%s" was already included in the input file list.\n' % b)
            sys.exit()
        else:
            bam_dup.append(genome_id)

        # TEMPORARILY, use genome_id as RG_ID, too
        s = sequence_(
            add_([INPUT(b, tags={'rg': genome_id})], stage_name="Load Input"))

        # append to sequence
        if bam_seq is None: bam_seq = s
        else: bam_seq = sequence_(bam_seq, s, combine=True)

    nInput = len(input_bams)
    nNodes = settings.settings['nNode']
    # floor division: at least 16, at most 256 splits
    nSplit = min(256, 16 * max(nNodes // nInput, 1))

    settings.settings['nSplit'] = nSplit

    chrom = ('chrom', range(1, 23) + ['X', 'Y', 'MT'])
    split = ('split', range(1, nSplit + 1))

    return sequence_(
        bam_seq,
        map_(pipes.CteamSortSplitBam),  # sort bam by readname (== shuffling)
        split_([split], pipes.CteamTrimReadGroup),  # split into nSplit chunks and trim/assign read groups
        map_(pipes.CteamBwaAln),  # bwa aln
        map_(pipes.CteamBwaSampe),  # bwa sampe
        reduce_(['rg'], pipes.CteamSplitByChromosome),  # merge split files and (re)split by chromosome
        split_([chrom], pipes.CteamRmDup_BuildIndex),  # samtools rmdup + index
        map_(pipes.CteamRealignTarget),  # gatk indel realign target creator
        map_(pipes.CteamIndelRealigner),  # gatk indel realigner
        map_(pipes.CteamUnifiedGenotyper)  # gatk unifiedGenotyper
        # map_(pipes.CteamVariantFiltration)  # gatk variantFilter
    )
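
For a feel of the split-count formula, a quick worked example with hypothetical cluster figures:

# hypothetical values: 64 nodes, 8 input BAMs
nNodes, nInput = 64, 8
nSplit = min(256, 16 * max(nNodes // nInput, 1))
print(nSplit)  # 16 * 8 = 128, within the [16, 256] bounds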
Example #5
def fastq_(workflow, input_dict, output_dict, output_json, **kwargs):

    json_fastq_to_split = json_creator.json_out(input_dict, output_dict)
    input_json = json.load(open(json_fastq_to_split, 'r'))
    inputs = [
        INPUT(name='fastq.gz',
              path=i['gz_path'],
              fmt='fastq.gz',
              tags=i,
              stage_name='Load Input Fastqs') for i in input_json
    ]

    DAG(ignore_stage_name_collisions=True).sequence_(add_(inputs),
                                                     Pipeline_split(),
                                                     configure(wga_settings),
                                                     add_run(workflow))
Example #6
def anno(workflow, input_file, input_file_list, file_format='vcf', **kwargs):
    """
    Annotates all files in input_files

    $ genomekey anno -n 'My Annotation Workflow #1' file1.vcf file2.vcf
    """
    input_files = input_file_list.read().strip().split('\n') if input_file_list else []
    if input_file:
        input_files.append(input_file.name)
    print('annotating {0}'.format(', '.join(input_files)), file=sys.stderr)

    DAG().sequence_(
        add_([
            INPUT(input_file, tags={'vcf': i})
            for i, input_file in enumerate(input_files)
        ]), massive_annotation, configure(wga_settings), add_run(workflow))
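
Since input_file_list is read as one path per line, a minimal sketch of preparing and reading such a list (file name and paths hypothetical):

# hypothetical list file with one VCF path per line
with open('vcfs.txt', 'w') as f:
    f.write('file1.vcf\nfile2.vcf\n')

with open('vcfs.txt') as input_file_list:
    input_files = input_file_list.read().strip().split('\n')
print(input_files)  # ['file1.vcf', 'file2.vcf']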
Example #7
def upload_(workflow, bucket, project, out_dict, **kwargs):
    project_folder = join(out_dict, project.replace(" ", "_"))
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)
    json_fastq_to_upload = s3_Bucket.getList(bucket, project, out_dict)
    input_json = json.load(open(json_fastq_to_upload, 'r'))
    inputs = [
        INPUT(name='fastq.gz',
              path=i['gz_path'],
              fmt='fastq.gz',
              tags=i,
              stage_name='Load Input Fastqs') for i in input_json
    ]

    DAG(ignore_stage_name_collisions=True).sequence_(add_(inputs),
                                                     Pipeline_upload(),
                                                     configure(wga_settings),
                                                     add_run(workflow))
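
The project folder is simply the project name with spaces replaced by underscores, joined under out_dict; a quick sketch with hypothetical values:

from os.path import join

out_dict, project = '/data/out', 'BC Study 2013'  # hypothetical
print(join(out_dict, project.replace(' ', '_')))  # /data/out/BC_Study_2013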
Example #8
def Bam2Fastq(workflow, dag, settings, input_bams):
    if len(input_bams) == 0:
        raise WorkflowException('At least 1 BAM input required')
    dag.sequence_(
        sequence_(*[
            sequence_(
                add_([INPUT(input_bam,
                            tags={'input': os.path.basename(input_bam)})],
                     stage_name="Load Input Bams"),
                split_([('rgid', _inputbam2rgids(input_bam))],
                       pipes.FilterBamByRG_To_FastQ))
            for input_bam in input_bams
        ], combine=True),
        split_([('pair', [1, 2])], genomekey_scripts.SplitFastq),
        configure(settings),
        add_run(workflow, finish=False),
    ).add_(list(_splitfastq2inputs(dag)))
    return dag
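
_inputbam2rgids is defined elsewhere; a plausible minimal sketch, reading read-group IDs from the BAM header with pysam the same way Example #10 does (the exact implementation is an assumption):

import pysam

def _inputbam2rgids(bam_path):
    # one entry per @RG record in the BAM header
    return [rg['ID'] for rg in pysam.Samfile(bam_path, 'rb').header['RG']]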
Example #9
def json_local(workflow, input_dict, **kwargs):
    """
    Input is a folder where each file is a json of the following format:

    [
        {
            'library': 'LIB-1216301779A',
            'sample_name': '1216301779A',
            'platform': 'ILLUMINA',
            'platform_unit': 'C0MR3ACXX.001',
            'pair': 1,
            'path': '/path/to/fastq'
        },
        {
            'library': 'LIB-1216301779A',
            'sample_name': '1216301779A',
            'platform': 'ILLUMINA',
            'platform_unit': 'C0MR3ACXX.001',
            'pair': 2,
            'path': '/path/to/fastq'
        }
    ]
    """
    dirList = os.listdir(input_dict)
    for fname in dirList:
        fpath = os.path.join(input_dict, fname)
        print(fpath)
        input_json = json.load(open(fpath, 'r'))
        inputs = [
            INPUT(name='fastq.gz',
                  path=i['path'],
                  fmt='fastq.gz',
                  tags=i,
                  stage_name='Load Input Fastqs') for i in input_json
        ]
        for i in inputs:
            print(i)
        DAG(ignore_stage_name_collisions=True).sequence_(
            add_(inputs), Pipeline_local(), configure(wga_settings),
            add_run(workflow))
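
A sketch of preparing such a folder before calling json_local; the folder name, file name, and field values are all hypothetical:

import json
import os

os.makedirs('inputs')  # hypothetical folder: one JSON file per sample
records = [{'library': 'LIB-1216301779A', 'sample_name': '1216301779A',
            'platform': 'ILLUMINA', 'platform_unit': 'C0MR3ACXX.001',
            'pair': 1, 'path': '/path/to/fastq'}]
with open(os.path.join('inputs', 'sample1.json'), 'w') as f:
    json.dump(records, f)

# json_local(workflow, 'inputs/')  # assumes a configured workflow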
Example #10
def _splitfastq2inputs(dag):
    """
    Assumes dag's active tools are from SplitFastq.  Traverses their output for the fastq files, and
    yields new INPUTs properly annotated with tags, as children of their respective SplitFastq parent.
    """
    for split_fastq_tool in dag.active_tools:
        tags = split_fastq_tool.tags.copy()

        # Get The RG info and place into a dictionary for tags
        # note: FilterBamByRG's output bam has the right RG information
        input_tool = split_fastq_tool.parent.parent
        bam_path = TaskFile.objects.get(
            id=input_tool.get_output('bam').id).path
        RGs = pysam.Samfile(bam_path, 'rb').header['RG']

        # FilterBamByRG does not remove the non-filtered RGs from the new header
        RG = [d for d in RGs if d['ID'] == split_fastq_tool.tags['rgid']][0]
        tags['sample_name'] = RG['SM']
        tags['library'] = RG['LB']
        tags['platform'] = RG['PL']
        tags['platform_unit'] = RG.get(
            'PU', RG['ID'])  # use 'ID' if 'PU' does not exist

        # Add fastq chunks as input files
        fastq_output_dir = TaskFile.objects.get(
            id=split_fastq_tool.get_output('dir').id).path
        for f in os.listdir(fastq_output_dir):
            fastq_path = os.path.join(fastq_output_dir, f)
            tags2 = tags.copy()
            tags2['chunk'] = re.search(r"(\d+)\.fastq", f).group(1)

            i = INPUT(name='fastq.gz',
                      path=fastq_path,
                      tags=tags2,
                      stage_name='Load Input Fastqs')
            dag.add_edge(split_fastq_tool, i)
            yield i
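
The chunk tag is whatever digits precede .fastq in each split file's name; a quick check of the pattern (file name hypothetical):

import re

print(re.search(r"(\d+)\.fastq", "reads_001.fastq").group(1))  # '001'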
Example #11
def pipeline(bams, test_bam=False, chromosome_only_split=False):

    # split_ tuples
    #chrom  = ('chrom', range(1,23) + ['X', 'Y', 'MT'])
    chrom = ('chrom', range(1, 23))

    glm = ('glm', ['SNP', 'INDEL'])

    dbnames = ('dbname', [
        'dbSNP135', 'CytoBand', 'Target_Scan', 'mirBase', 'Self_Chain',
        'Repeat_Masker', 'TFBS', 'Segmental_Duplications', 'SIFT', 'COSMIC',
        'PolyPhen2', 'Mutation_Taster', 'GERP', 'PhyloP', 'LRT', 'Mce46way',
        'Complete_Genomics_69', 'The_1000g_Febuary_all', 'The_1000g_April_all',
        'NHLBI_Exome_Project_euro', 'NHLBI_Exome_Project_aa',
        'NHLBI_Exome_Project_all', 'ENCODE_DNaseI_Hypersensitivity',
        'ENCODE_Transcription_Factor', 'UCSC_Gene', 'Refseq_Gene',
        'Ensembl_Gene', 'CCDS_Gene', 'HGMD_INDEL', 'HGMD_SNP', 'GWAS_Catalog'
    ])
    bam_seq = None

    for b in bams:
        header = _getHeaderInfo(b)
        sn = _getSeqName(header)

        rgid = [h[0] for h in header['rg']]

        # restrict output for testing
        if test_bam:
            sn = ['chr1']
            chrom = ('chrom', [1])
            glm = ('glm', ['SNP'])
            skip_VQSR = ('skip_VQSR', [True])
        else:
            skip_VQSR = ('skip_VQSR', [False])

        # If seqName is empty, assume the input is an unaligned BAM;
        # use everything before the extension as part of the tag
        sample_name = os.path.splitext(os.path.basename(b))[0]

        if chromosome_only_split:
            # Stop splitting by rgId
            bam_bwa_split = [('prevSn', sn), ('chromosome_only_split', [True])]
            indelrealign_reduce = ['bam']
        else:
            bam_bwa_split = [('rgId', rgid), ('prevSn', sn),
                             ('chromosome_only_split', [False])]
            indelrealign_reduce = ['bam', 'rgId']

        s = sequence_(
            add_([INPUT(b, tags={'bam': sample_name})],
                 stage_name="Load BAMs"),
            split_(bam_bwa_split, pipes.Bam_To_BWA))

        if bam_seq is None: bam_seq = s
        else: bam_seq = sequence_(bam_seq, s, combine=True)

    # Previous pipeline (built for reference; hc_pipeline is what gets returned)
    pr_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.ReduceReads),
        reduce_split_(['chrom'], [glm], pipes.UnifiedGenotyper),
        reduce_(['glm'],
                pipes.VariantQualityScoreRecalibration,
                tag={'vcf': 'main'}),
        reduce_(['vcf'], pipes.CombineVariants, "Merge VCF"),
        map_(pipes.Vcf2Anno_in),
        split_([dbnames], pipes.Annotate, tag={'build': 'hg19'}),
        reduce_(['vcf'], pipes.MergeAnnotations))

    # HaplotypeCaller Pipeline: official for GATK 3.0
    hc_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.HaplotypeCaller),
        reduce_(['chrom'], pipes.GenotypeGVCFs),
        split_([glm, skip_VQSR],
               pipes.VariantQualityScoreRecalibration,
               tag={'vcf': 'main'}))

    return hc_pipeline
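
My reading of the split_ tuples (not confirmed by this page) is that a stage fans out over every combination of the listed tag values; the cross-product for [glm, skip_VQSR] can be previewed with itertools:

from itertools import product

glm = ('glm', ['SNP', 'INDEL'])
skip_VQSR = ('skip_VQSR', [False])

# one VariantQualityScoreRecalibration task per tag combination
for g, s in product(glm[1], skip_VQSR[1]):
    print({'glm': g, 'skip_VQSR': s})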