def json_somatic(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            "chunk": "001",
            "library": "LIB-1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "rgid": "BC18-06-2013",
            "sample_name": "BC18-06-2013LyT_S5_L001",
            "pair": "1",
            "path": "/path/to/fastq.gz",
            "sample_type": "normal or tumor"
        },
        {..}
    ]
    """
    input_json = json.load(open(input_dict, 'r'))
    inputs = [INPUT(name='fastq.gz',
                    path=i['path'],
                    fmt='fastq.gz',
                    tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline_Somatic(),
        configure(wga_settings),
        add_run(workflow))
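# A minimal sketch of producing the manifest that json_somatic() consumes.
# The values below are illustrative placeholders, not from a real run; only
# the key names are prescribed by the docstring above.
#
#   import json
#
#   manifest = [{
#       "chunk": "001",
#       "library": "LIB-1216301779A",
#       "platform": "ILLUMINA",
#       "platform_unit": "C0MR3ACXX.001",
#       "rgid": "BC18-06-2013",
#       "sample_name": "BC18-06-2013LyT_S5_L001",
#       "pair": "1",
#       "path": "/path/to/fastq.gz",
#       "sample_type": "tumor",  # or "normal"
#   }]
#   with open('somatic_input.json', 'w') as fh:
#       json.dump(manifest, fh, indent=2)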
def json_(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            'chunk': '001',
            'library': 'LIB-1216301779A',
            'sample_name': '1216301779A',
            'platform': 'ILLUMINA',
            'platform_unit': 'C0MR3ACXX.001',
            'pair': 0,  # 0 or 1
            'path': '/path/to/fastq'
        },
        {..}
    ]
    """
    input_json = json.load(open(input_dict, 'r'))
    inputs = [INPUT(name='fastq.gz',
                    path=i['path'],
                    fmt='fastq.gz',
                    tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline(),
        configure(wga_settings),
        add_run(workflow))
def gunzip(workflow, input_dir, **kwargs):
    """
    Gunzips all .gz files in a directory

    $ genomekey gunzip -n 'Gunzip' /path/to/dir
    """
    DAG().sequence_(
        add_([INPUT(f, tags={'i': i})
              for i, f in enumerate(glob.glob(os.path.join(input_dir, '*.gz')))]),
        map_(unix.Gunzip),
        add_run(workflow))
def CteamPipeline(input_bams):
    bam_seq = None
    bam_dup = []  # to check for duplicate input files

    for b in input_bams:
        # extract genome_id from the file name, add it as a tag
        genome_id = os.path.basename(b).partition('.')[0]

        if genome_id in bam_dup:
            print('\n\nERROR: "%s" was already included in the input file list.\n' % b)
            sys.exit()
        else:
            bam_dup.append(genome_id)

        # TEMPORARILY, use genome_id as RG_ID, too
        s = sequence_(add_([INPUT(b, tags={'rg': genome_id})],
                           stage_name="Load Input"))

        # append to sequence
        if bam_seq is None:
            bam_seq = s
        else:
            bam_seq = sequence_(bam_seq, s, combine=True)

    nInput = len(input_bams)
    nNodes = settings.settings['nNode']
    # floor division; at least 16 and up to 256 splits
    nSplit = min(256, 16 * max(nNodes // nInput, 1))
    settings.settings['nSplit'] = nSplit

    chrom = ('chrom', range(1, 23) + ['X', 'Y', 'MT'])
    split = ('split', range(1, nSplit + 1))

    return sequence_(
        bam_seq,
        map_(pipes.CteamSortSplitBam),        # sort bam by read name (== shuffling)
        split_([split], pipes.CteamTrimReadGroup),
        map_(pipes.CteamBwaAln),              # bwa aln
        map_(pipes.CteamBwaSampe),            # bwa sampe
        reduce_(['rg'], pipes.CteamSplitByChromosome),  # merge split files and (re)split by chromosome
        split_([chrom], pipes.CteamRmDup_BuildIndex),   # samtools rmdup + index
        map_(pipes.CteamRealignTarget),       # gatk indel realign target creator
        map_(pipes.CteamIndelRealigner),      # gatk indel realigner
        map_(pipes.CteamUnifiedGenotyper)     # gatk unifiedGenotyper
        #map_(pipes.CteamVariantFiltration)   # gatk variantFilter
    )
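# Illustrative sketch (not part of the pipeline): the nSplit formula from
# CteamPipeline() above as a standalone function, with worked examples.
# Assumes integer (floor) division, which is what the original expression
# performed on ints; the name _example_nsplit is hypothetical.
def _example_nsplit(n_nodes, n_input):
    """
    >>> _example_nsplit(64, 4)   # 16 * (64 // 4) = 256 hits the cap exactly
    256
    >>> _example_nsplit(4, 8)    # 4 // 8 == 0, so max(..., 1) keeps the floor of 16
    16
    """
    return min(256, 16 * max(n_nodes // n_input, 1))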
def fastq_(workflow, input_dict, output_dict, output_json, **kwargs):
    json_fastq_to_split = json_creator.json_out(input_dict, output_dict)
    input_json = json.load(open(json_fastq_to_split, 'r'))
    inputs = [INPUT(name='fastq.gz',
                    path=i['gz_path'],
                    fmt='fastq.gz',
                    tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline_split(),
        configure(wga_settings),
        add_run(workflow))
def anno(workflow, input_file, input_file_list, file_format='vcf', **kwargs):
    """
    Annotates all files in input_files

    $ genomekey anno -n 'My Annotation Workflow #1' file1.vcf file2.vcf
    """
    input_files = input_file_list.read().strip().split('\n') if input_file_list else []
    if input_file:
        input_files.append(input_file.name)

    print('annotating {0}'.format(', '.join(input_files)), file=sys.stderr)

    DAG().sequence_(
        add_([INPUT(input_file, tags={'vcf': i})
              for i, input_file in enumerate(input_files)]),
        massive_annotation,
        configure(wga_settings),
        add_run(workflow))
def upload_(workflow, bucket, project, out_dict, **kwargs):
    project_folder = join(out_dict, project.replace(" ", "_"))
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)

    json_fastq_to_upload = s3_Bucket.getList(bucket, project, out_dict)
    input_json = json.load(open(json_fastq_to_upload, 'r'))
    inputs = [INPUT(name='fastq.gz',
                    path=i['gz_path'],
                    fmt='fastq.gz',
                    tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline_upload(),
        configure(wga_settings),
        add_run(workflow))
def Bam2Fastq(workflow, dag, settings, input_bams):
    if len(input_bams) == 0:
        raise WorkflowException('At least 1 BAM input required')

    dag.sequence_(
        sequence_(*[
            sequence_(
                add_([INPUT(input_bam, tags={'input': os.path.basename(input_bam)})],
                     stage_name="Load Input Bams"),
                split_([('rgid', _inputbam2rgids(input_bam))],
                       pipes.FilterBamByRG_To_FastQ))
            for input_bam in input_bams
        ], combine=True),
        split_([('pair', [1, 2])], genomekey_scripts.SplitFastq),
        configure(settings),
        add_run(workflow, finish=False),
    ).add_(list(_splitfastq2inputs(dag)))
    return dag
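# _inputbam2rgids() is referenced above but defined elsewhere in the codebase.
# A minimal sketch of what it must return, assuming the same pysam header
# access that _splitfastq2inputs() uses below; the name _example_rgids is
# hypothetical.
def _example_rgids(bam_path):
    """Illustrative only: the list of read-group IDs in a BAM header."""
    return [rg['ID'] for rg in pysam.Samfile(bam_path, 'rb').header['RG']]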
def json_local(workflow, input_dict, **kwargs):
    """
    Input is a folder where each file is a json of the following format:

    [
        {
            'library': 'LIB-1216301779A',
            'sample_name': '1216301779A',
            'platform': 'ILLUMINA',
            'platform_unit': 'C0MR3ACXX.001',
            'pair': 1,
            'path': '/path/to/fastq'
        },
        {
            'library': 'LIB-1216301779A',
            'sample_name': '1216301779A',
            'platform': 'ILLUMINA',
            'platform_unit': 'C0MR3ACXX.001',
            'pair': 2,
            'path': '/path/to/fastq'
        }
    ]
    """
    dirList = os.listdir(input_dict)
    for f in dirList:
        json_path = os.path.join(input_dict, f)
        print(json_path)
        input_json = json.load(open(json_path, 'r'))
        inputs = [INPUT(name='fastq.gz',
                        path=i['path'],
                        fmt='fastq.gz',
                        tags=i,
                        stage_name='Load Input Fastqs')
                  for i in input_json]
        for i in inputs:
            print(i)

        DAG(ignore_stage_name_collisions=True).sequence_(
            add_(inputs),
            Pipeline_local(),
            configure(wga_settings),
            add_run(workflow))
def _splitfastq2inputs(dag):
    """
    Assumes dag's active tools are from SplitFastq.

    Traverses their output for the fastq files, and yields new INPUTs
    properly annotated with tags, as children of their SplitFastq parent.
    """
    for split_fastq_tool in dag.active_tools:
        tags = split_fastq_tool.tags.copy()

        # Get the RG info and place it into a dictionary for tags.
        # note: FilterBamByRG's output bam has the right RG information
        input_tool = split_fastq_tool.parent.parent
        bam_path = TaskFile.objects.get(id=input_tool.get_output('bam').id).path
        RGs = pysam.Samfile(bam_path, 'rb').header['RG']

        # FilterBamByRG does not remove the non-filtered RGs from the new header
        RG = [d for d in RGs if d['ID'] == split_fastq_tool.tags['rgid']][0]

        tags['sample_name'] = RG['SM']
        tags['library'] = RG['LB']
        tags['platform'] = RG['PL']
        tags['platform_unit'] = RG.get('PU', RG['ID'])  # use 'ID' if 'PU' does not exist

        # Add fastq chunks as input files
        fastq_output_dir = TaskFile.objects.get(id=split_fastq_tool.get_output('dir').id).path
        for f in os.listdir(fastq_output_dir):
            fastq_path = os.path.join(fastq_output_dir, f)
            tags2 = tags.copy()
            tags2['chunk'] = re.search(r"(\d+)\.fastq", f).group(1)
            i = INPUT(name='fastq.gz',
                      path=fastq_path,
                      tags=tags2,
                      stage_name='Load Input Fastqs')
            dag.add_edge(split_fastq_tool, i)
            yield i
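# Example of the chunk extraction in _splitfastq2inputs() above: for a split
# file named "7.fastq.gz", re.search(r"(\d+)\.fastq", "7.fastq.gz").group(1)
# returns "7", which becomes the 'chunk' tag on the new INPUT.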
def pipeline(bams, test_bam=False, chromosome_only_split=False):
    # split_ tuples
    #chrom = ('chrom', range(1, 23) + ['X', 'Y', 'MT'])
    chrom = ('chrom', range(1, 23))
    glm = ('glm', ['SNP', 'INDEL'])
    dbnames = ('dbname', [
        'dbSNP135', 'CytoBand', 'Target_Scan', 'mirBase', 'Self_Chain',
        'Repeat_Masker', 'TFBS', 'Segmental_Duplications', 'SIFT', 'COSMIC',
        'PolyPhen2', 'Mutation_Taster', 'GERP', 'PhyloP', 'LRT', 'Mce46way',
        'Complete_Genomics_69', 'The_1000g_Febuary_all', 'The_1000g_April_all',
        'NHLBI_Exome_Project_euro', 'NHLBI_Exome_Project_aa',
        'NHLBI_Exome_Project_all', 'ENCODE_DNaseI_Hypersensitivity',
        'ENCODE_Transcription_Factor', 'UCSC_Gene', 'Refseq_Gene',
        'Ensembl_Gene', 'CCDS_Gene', 'HGMD_INDEL', 'HGMD_SNP', 'GWAS_Catalog'
    ])

    bam_seq = None
    for b in bams:
        header = _getHeaderInfo(b)
        sn = _getSeqName(header)
        rgid = [h[0] for h in header['rg']]

        # restrict output for testing
        if test_bam:
            sn = ['chr1']
            chrom = ('chrom', [1])
            glm = ('glm', ['SNP'])
            skip_VQSR = ('skip_VQSR', [True])
        else:
            skip_VQSR = ('skip_VQSR', [False])

        # if seqName is empty, then let's assume that the input is an unaligned bam

        # use everything before the extension as part of the tag
        sample_name = os.path.splitext(os.path.basename(b))[0]

        if chromosome_only_split:
            # stop splitting by rgId
            bam_bwa_split = [('prevSn', sn), ('chromosome_only_split', [True])]
            indelrealign_reduce = ['bam']
        else:
            bam_bwa_split = [('rgId', rgid), ('prevSn', sn),
                             ('chromosome_only_split', [False])]
            indelrealign_reduce = ['bam', 'rgId']

        s = sequence_(
            add_([INPUT(b, tags={'bam': sample_name})], stage_name="Load BAMs"),
            split_(bam_bwa_split, pipes.Bam_To_BWA))

        if bam_seq is None:
            bam_seq = s
        else:
            bam_seq = sequence_(bam_seq, s, combine=True)

    # Previous (UnifiedGenotyper-based) pipeline; kept for reference, not returned
    pr_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.ReduceReads),
        reduce_split_(['chrom'], [glm], pipes.UnifiedGenotyper),
        reduce_(['glm'], pipes.VariantQualityScoreRecalibration, tag={'vcf': 'main'}),
        reduce_(['vcf'], pipes.CombineVariants, "Merge VCF"),
        map_(pipes.Vcf2Anno_in),
        split_([dbnames], pipes.Annotate, tag={'build': 'hg19'}),
        reduce_(['vcf'], pipes.MergeAnnotations))

    # HaplotypeCaller pipeline: official for GATK 3.0
    hc_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.HaplotypeCaller),
        reduce_(['chrom'], pipes.GenotypeGVCFs),
        split_([glm, skip_VQSR], pipes.VariantQualityScoreRecalibration,
               tag={'vcf': 'main'}))

    return hc_pipeline
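# Hedged sketch of the split_ tuple semantics assumed above (inferred from
# the tag names used in this module, not from the DAG library's docs): each
# (key, values) pair contributes one tag per value, and split_ runs the tool
# once per combination of tags.
#
#   import itertools
#   chrom = ('chrom', range(1, 23))
#   glm = ('glm', ['SNP', 'INDEL'])
#   combos = [dict(zip([chrom[0], glm[0]], vals))
#             for vals in itertools.product(chrom[1], glm[1])]
#   # 44 tag dicts: {'chrom': 1, 'glm': 'SNP'}, {'chrom': 1, 'glm': 'INDEL'}, ...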