def test_table_context_manager_error_handling(self):
    """Check that a bad row surfaces as DXAPIError when the context manager exits.

    In each case, the flush that happens at the close of the context handler
    should wait for the asynchronous requests and then raise the resulting
    error.

    Note that this test assumes that the error is a semantic error in the
    add_row data that is NOT caught by any local error checking.
    """
    def make_columns():
        # A fresh schema list per table so no list object is shared between calls.
        return [dxpy.DXGTable.make_column_desc("a", "string"),
                dxpy.DXGTable.make_column_desc("b", "int32")]

    # Use new_dxgtable
    with self.assertRaises(DXAPIError):
        with dxpy.new_dxgtable(make_columns(), mode='w') as table1:
            table1.add_row(["", 68719476736])  # Not in int32 range

    # Use open_dxgtable and close table
    table2_id = dxpy.new_dxgtable(make_columns(), mode='w').get_id()
    with self.assertRaises(DXAPIError):
        with dxpy.open_dxgtable(table2_id) as table2:
            table2.add_row(["", 68719476736])  # Not in int32 range
    # TODO: why does the flush in this table's destructor fail? Nothing should be getting
    # flushed then...

    # Use open_dxgtable and leave table open.
    # The ID (not the handler object) must be passed on to open_dxgtable, as
    # is done for table2 above; otherwise the request targets a bogus ID.
    table3_id = dxpy.new_dxgtable(make_columns()).get_id()
    with self.assertRaises(DXAPIError):
        with dxpy.open_dxgtable(table3_id, mode='a') as table3:
            table3.add_row(["", 68719476736])  # Not in int32 range
def test_table_context_manager(self):
    """Write rows through GTable context managers and verify the row counts.

    Covers a new table written with explicit part numbers, a new table
    written without parts, and an existing table reopened first in append
    mode and then in write mode.
    """
    def two_columns():
        return [dxpy.DXGTable.make_column_desc("a", "string"),
                dxpy.DXGTable.make_column_desc("b", "int32")]

    def single_row(index):
        return [["row" + str(index), index]]

    # Writing a new_dxgtable with parts
    with dxpy.new_dxgtable(two_columns(), mode='w') as self.dxgtable:
        for index in range(64):
            self.dxgtable.add_rows(data=single_row(index), part=index + 1)

    # Writing a new_dxgtable without parts
    with dxpy.new_dxgtable(two_columns(), mode='w') as table2:
        table2_id = table2.get_id()
        for index in range(64):
            table2.add_rows(data=single_row(index))
    table2 = dxpy.open_dxgtable(table2_id)
    self.assertEqual(table2.describe()["length"], 64)
    table2.remove()

    # Writing an open_dxgtable: 64 rows in append mode, then 64 in write mode
    table3_id = dxpy.new_dxgtable(two_columns()).get_id()
    for open_mode in ('a', 'w'):
        with dxpy.open_dxgtable(table3_id, mode=open_mode) as table3:
            for index in range(64):
                table3.add_rows(data=single_row(index))
    table3 = dxpy.open_dxgtable(table3_id)
    state = table3._get_state()
    self.assertTrue(state in ('closing', 'closed'))
    table3._wait_on_close()
    self.assertEqual(table3.describe()["length"], 128)
    table3.remove()
def process(input_gtable_id, start_row, end_row, output_gtable_id):
    """Read rows [start_row, end_row) of the input GTable and queue one new row per input row.

    NOTE(review): DX_APP_WIZARD_||_INPUT and DX_APP_WIZARD_||_OUTPUT look like
    template placeholders that are substituted with real identifiers before
    this code is run -- confirm; as written they are not valid Python names.

    :param input_gtable_id: ID of the GTable to read rows from
    :param start_row: first row to process
    :param end_row: row at which to stop (not processed itself)
    :param output_gtable_id: ID of the GTable that new rows are appended to
    """
    DX_APP_WIZARD_||_INPUT = dxpy.DXGTable(input_gtable_id)

    # Using the context manager here is useful so that the flush()
    # method is called once the context manager exits, and any rows
    # added will be flushed to the platform.  The mode is set to "a"
    # for "append".
    with dxpy.open_dxgtable(output_gtable_id, mode="a") as DX_APP_WIZARD_||_OUTPUT:
        # The following loop iterates over each row from start_row to
        # end_row (not including end_row).  You can find documentation on
        # other useful GTable methods (such as iterating over a genomic
        # range query with iterate_query_rows) in the dxpy library here:
        # http://autodoc.dnanexus.com/bindings/python/current/dxpy_dxgtable.html
        for row in DX_APP_WIZARD_||_INPUT.iterate_rows(start_row, end_row):
            # Fill in code here to perform whatever computation is
            # necessary to process the row and compute the new row.
            #
            # *row* is an array where the first element is the row ID,
            # and the rest of the elements appear in the same order as
            # the GTable's column specification.  You can retrieve the
            # column specifications or names by using
            # DX_APP_WIZARD_||_INPUT.get_columns() or DX_APP_WIZARD_||_INPUT.get_col_names().
            new_row = []

            # The following line queues up the array new_row as a row
            # of data that should be added to the output GTable.
            # Queued rows will be flushed to the platform periodically.
            DX_APP_WIZARD_||_OUTPUT.add_row(new_row)
def write_reads_to_fasta(reads_id, filename, seq_col='sequence', start_row=0, end_row=None):
    """Write one column of a GTable to *filename* as FASTA-style records.

    Each row yields two lines: a ``>N`` header, where N counts up from
    *start_row*, followed by the first value of the fetched row.

    :param reads_id: ID of the GTable to read
    :param filename: path of the file to (over)write
    :param seq_col: name of the column requested from the table
    :param start_row: first row to fetch; also the first header number
    :param end_row: row at which to stop, passed through to iterate_rows
    """
    record_number = start_row
    with open(filename, "w") as fasta:
        reads_table = dxpy.open_dxgtable(reads_id)
        fetched = reads_table.iterate_rows(columns=[seq_col], start=start_row, end=end_row)
        for fetched_row in fetched:
            fasta.write('>%d' % record_number + "\n")
            fasta.write("%s\n" % (fetched_row[0],))
            record_number += 1
def reduceGatk(**job_inputs):
    """Close the GTable named by ``job_inputs['tableId']`` and return a link to it.

    :returns: dict with a single key ``variants`` holding a DNAnexus link
        to the table
    """
    table = dxpy.open_dxgtable(job_inputs['tableId'])
    print("Closing Table")
    table.close()
    return {'variants': dxpy.dxlink(table.get_id())}
def postprocess(process_outputs):
    """Close the GTable referenced by the first process output, if there is one.

    Change the following to process whatever input this stage receives.
    You may also want to copy and paste the logic to download and upload
    files here as well if this stage receives file input and/or makes file
    output.

    :param process_outputs: iterable of GTable references; only the first
        element is used
    :returns: ``{"answer": None}``
    """
    nothing = object()  # sentinel, so that a first element of None is still handled
    first_output = next(iter(process_outputs), nothing)
    if first_output is not nothing:
        dxpy.open_dxgtable(first_output).close()
    return {"answer": None}
def main(**kwargs):
    """Export a variants GTable as VCF text to a file or to standard output.

    When called without keyword arguments, the arguments are parsed from the
    command line with the module-level ``parser``.  Keys read from *kwargs*:
    ``path``, ``output`` (``'-'`` selects standard output), ``reference``,
    ``write_header``, ``chr``, ``export_ref_calls`` and ``export_no_calls``.

    Raises:
        dxpy.AppError: if the table details have no ``original_contigset``
            entry, or if ``reference`` is given but is not an existing file.
    """
    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args(sys.argv[1:]))

    # Attempt to resolve variants gtable name.  Only ordinary errors are
    # converted into an exit message; KeyboardInterrupt/SystemExit propagate.
    try:
        project, folderpath, entity_result = resolve_existing_path(kwargs['path'], expected='entity')
    except Exception as resolveError:
        parser.exit(1, fill(unicode(resolveError)) + '\n')

    if entity_result is None:
        parser.exit(1, fill('Could not resolve ' + kwargs['path'] + ' to a data object') + '\n')

    filename = kwargs['output']
    if filename is None:
        filename = entity_result['describe']['name'].replace('/', '%2F') + ".vcf"

    if kwargs['output'] == '-':
        outputFile = sys.stdout
    else:
        outputFile = open(filename, 'w')

    try:
        exportRef = kwargs['export_ref_calls']
        exportNoCall = kwargs['export_no_calls']

        variantsTable = dxpy.open_dxgtable(entity_result['id'])

        # Fetch the details once and reuse them for every header field below.
        try:
            tableDetails = variantsTable.get_details()
            originalContigSet = tableDetails['original_contigset']
        except Exception:
            raise dxpy.AppError("The original reference genome must be attached as a detail")
        contigDetails = dxpy.DXRecord(originalContigSet).get_details()
        contigNames = contigDetails['contigs']['names']

        downloadedReference = kwargs['reference'] is None
        if downloadedReference:
            refFileName = tempfile.NamedTemporaryFile(prefix='reference_', suffix='.txt', delete=False).name
            dxpy.download_dxfile(contigDetails['flat_sequence_file']['$dnanexus_link'], refFileName)
        else:
            refFileName = kwargs['reference']
            if not os.path.isfile(refFileName):
                raise dxpy.AppError("The reference expected by the variants to vcf script was not a valid file")

        if kwargs['write_header']:
            infos = tableDetails.get('infos')
            formats = tableDetails.get('formats')
            alts = tableDetails.get('alts')
            filters = tableDetails.get('filters')
            # A table without a 'samples' detail is treated as having no samples.
            samples = tableDetails.get('samples') or []

            outputFile.write("##fileformat=VCFv4.1\n")
            if infos is not None:
                for k, v in sorted(infos.items()):
                    outputFile.write("##INFO=<ID="+k+",Number="+v['number']+",Type="+v['type']+",Description=\""+v['description']+"\">\n")

            if len(samples) > 0:
                outputFile.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
                outputFile.write("##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n")
                outputFile.write("##FORMAT=<ID=DP,Number=1,Type=String,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n")
            # NOTE(review): assumed to be a sibling of the samples check above,
            # not nested inside it -- confirm against the original layout.
            if formats is not None:
                for k, v in sorted(formats.items()):
                    outputFile.write("##FORMAT=<ID="+k+",Number="+v['number']+",Type="+v['type']+",Description=\""+v['description']+"\">\n")
            if alts is not None:
                for k, v in sorted(alts.items()):
                    outputFile.write("##ALT=<ID="+k+",Description=\""+v['description']+"\">\n")
            if filters is not None:
                for k, v in sorted(filters.items()):
                    outputFile.write("##FILTER=<ID="+k+",Description=\""+v+"\">\n")
            for name, size in zip(contigNames, contigDetails['contigs']['sizes']):
                outputFile.write("##contig=<ID="+name+",length="+str(size)+">\n")

            outputFile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")
            if len(samples) > 0:
                outputFile.write("\tFORMAT")
                for x in samples:
                    outputFile.write("\t"+x)
            outputFile.write("\n")

        chromosomeOffsets = dict(zip(contigNames, contigDetails['contigs']['offsets']))

        with open(refFileName, 'r') as refFile:
            contigSequence = refFile.read()
        if downloadedReference:
            # The temporary copy is only needed until it has been read into memory.
            try:
                os.remove(refFileName)
            except OSError:
                pass

        # Map column name -> position in a fetched row (position 0 is the row ID).
        names = variantsTable.get_col_names()
        col = collections.OrderedDict(sorted((name, i + 1) for i, name in enumerate(names)))

        chromosomeList = contigNames
        if kwargs['chr'] is not None:
            chromosomeList = [x for x in contigNames if x in kwargs['chr']]

        for chromosome in chromosomeList:
            query = variantsTable.genomic_range_query(chr=chromosome, lo=0, hi=sys.maxint)
            firstRows = variantsTable.get_rows(query=query, limit=1)['data']
            if not firstRows:
                # No variants on this contig; without this guard startRow would
                # be unbound (or left over from the previous contig).
                continue
            startRow = firstRows[0][0]

            lastPosition = -1
            buff = []
            for row in variantsTable.iterate_rows(start=startRow):
                if row[1] != chromosome:
                    break
                # Rows sharing a 'lo' position are buffered and written together.
                if lastPosition < row[col["lo"]]:
                    writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets, exportRef, exportNoCall)
                    buff = []
                buff.append(row)
                lastPosition = row[col["lo"]]
            writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets, exportRef, exportNoCall)
    finally:
        if outputFile is not sys.stdout:
            outputFile.close()
def main(**job_inputs):
    """Create the output variants GTable and fan out map jobs plus one reduce job.

    Keys read from *job_inputs*: ``mappings``, ``reference``,
    ``reads_per_job``, ``compress_reference``, ``compress_no_call``,
    ``infer_no_call`` and, optionally, ``output_name``.  The whole dict is
    also handed to makeSamtoolsParameters and makeBcftoolsParameters.

    :returns: dict whose ``variants`` entry is a reference to the
        ``variants`` output field of the reduce job

    Raises:
        Exception: if the mappings table details have no usable
            ``original_contigset`` entry.
    """
    mappingsTable = dxpy.open_dxgtable(job_inputs["mappings"]["$dnanexus_link"])
    mappingsTableId = mappingsTable.get_id()
    mappingsDescription = mappingsTable.describe()

    # This controls the degree of parallelism
    chunks = int(mappingsDescription["length"] / job_inputs["reads_per_job"]) + 1

    try:
        originalContigSet = mappingsTable.get_details()["original_contigset"]
        contigSetId = originalContigSet["$dnanexus_link"]
    except Exception:
        raise Exception("The original reference genome must be attached as a detail")

    # In the next major section of code, we construct a variants table. As regions of the genome are passed to each worker
    # and variants are called on them, the workers will add rows to this table concurrently.
    variants_schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "int32"},
        {"name": "hi", "type": "int32"},
        {"name": "ref", "type": "string"},
        {"name": "alt", "type": "string"},
        {"name": "qual", "type": "double"},
        {"name": "ids", "type": "string"},
    ]

    # The information in these tags is elevated into specific columns, so additional columns for these tags will not be created
    elevatedTags = ["format_GT", "format_DP", "format_AD"]

    # The info and format tags are extracted from the header printed by samtools
    # If additional code will add a tag to the output of the program, modify this header to include the tag.
    # TODO: Allow the table to be created by the first job that finishes to avoid this step.
    headerInfo = extractHeader("/tmp/header.txt", elevatedTags)
    infoTags = headerInfo["tags"]["info"]
    formatTags = headerInfo["tags"]["format"]

    for k, v in infoTags.items():
        variants_schema.append({"name": "info_" + k, "type": translateTagTypeToColumnType(v)})

    # For each sample, add the sample-specific columns to the schema, at present only one sample is supported
    samples = []
    numSamples = 1
    for i in range(numSamples):
        suffix = str(i)
        variants_schema.extend(
            [
                {"name": "genotype_" + suffix, "type": "string"},
                {"name": "phasing_" + suffix, "type": "string"},
                {"name": "type_" + suffix, "type": "string"},
                {"name": "variation_qual_" + suffix, "type": "double"},
                {"name": "genotype_qual_" + suffix, "type": "double"},
                {"name": "coverage_" + suffix, "type": "string"},
                {"name": "total_coverage_" + suffix, "type": "int32"},
            ]
        )
        samples.append("Sample_0")
        for k, v in formatTags.items():
            if "format_" + k not in elevatedTags:
                variants_schema.append({"name": "format_" + k + "_" + suffix, "type": translateTagTypeToColumnType(v)})

    # TODO: Add lexicographic indices when secondary indices are supported
    variants = dxpy.new_dxgtable(variants_schema, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")])
    tableId = variants.get_id()
    variants = dxpy.open_dxgtable(tableId)
    variants.add_types(["Variants", "gri"])

    details = {
        "samples": samples,
        "original_contigset": job_inputs["reference"],
        "original_mappings": job_inputs["mappings"],
        "formats": formatTags,
        "infos": infoTags,
    }
    variants.set_details(details)

    if "output_name" in job_inputs:
        variants.rename(job_inputs["output_name"])
    else:
        variants.rename(mappingsDescription["name"] + " variant calls by Samtools mpileup")

    # Split the genome into evenly sized regions
    genomeRegions = splitGenomeLengthLargePieces(originalContigSet, chunks)

    # Generate the command line arguments needed to run samtools and bcftools
    samOptions = makeSamtoolsParameters(**job_inputs)
    bcfOptions = makeBcftoolsParameters(**job_inputs)

    # The rest of the main function contains the map-reduce functionality. For each genome chunk, an input spec is
    # created for a new child job.
    reduce_job_inputs = {}
    for i, region in enumerate(genomeRegions):
        if len(region) == 0:
            continue
        map_job_inputs = {
            "mappings_table_id": mappingsTableId,
            "original_contig_set": contigSetId,
            "interval": region,
            "tableId": tableId,
            "compress_reference": job_inputs["compress_reference"],
            "compress_no_call": job_inputs["compress_no_call"],
            "infer_no_call": job_inputs["infer_no_call"],
            "sam_options": samOptions,
            "bcf_options": bcfOptions,
            "part_number": i,
        }
        # Run a "map" job for each chunk, passing in the inputspec from above and looking for a function entry point
        # given as "map" (@dxpy.entry_point('map'))
        map_job = dxpy.new_dxjob(map_job_inputs, "map")
        reduce_job_inputs["mapJob" + str(i) + "TableId"] = {"job": map_job.get_id(), "field": "ok"}
    # Set once, outside the loop, so the reduce job always receives the table
    # ID even if no region produced a map job.
    reduce_job_inputs["tableId"] = tableId

    # Run a "reduce" job, which only begins once all of the map jobs signal they have completed by sending 'ok':True
    # The reduce job closes the table. This step is explicitly needed because table closing must wait till the
    # completion of the map jobs. By giving the reduce job the map jobs as input, the reduce job will wait to start.
    reduce_job = dxpy.new_dxjob(reduce_job_inputs, "reduce")
    return {"variants": {"job": reduce_job.get_id(), "field": "variants"}}
def reducePileup(**job_inputs):
    """Close the GTable named by ``job_inputs["tableId"]`` and return a link to it.

    :returns: dict with a single key ``variants`` holding a DNAnexus link
        to the table
    """
    table = dxpy.open_dxgtable(job_inputs["tableId"])
    print("Closing Table")
    table.close()
    return {"variants": dxpy.dxlink(table.get_id())}
def mapPileup(**job_inputs):
    """Call variants for one genome interval and import them into the variants table.

    Keys read from *job_inputs*: ``original_contig_set``, ``interval``,
    ``mappings_table_id``, ``tableId``, ``sam_options``, ``bcf_options``,
    ``compress_reference``, ``infer_no_call`` and ``compress_no_call``.
    Work files (ref.fa, regions.txt, regions.bed, input.sam, input.bam,
    output.vcf) are written to the current directory.

    :returns: ``{"ok": True}``, the signal the reduce job waits for

    Raises:
        subprocess.CalledProcessError: if any external command exits non-zero.
    """
    print("Downloading Reference Genome")
    # Commands that need no shell features are run from argument lists so
    # that job input values are never interpreted by a shell.
    subprocess.check_call(["dx-contigset-to-fasta", job_inputs["original_contig_set"], "ref.fa"])

    # The mappings-to-sam script takes a file with a list of regions in the form -L chrX:lo-hi, as produced by the
    # genome splitting region. This generates the file
    with open("regions.txt", "w") as regionFile:
        regionFile.write(job_inputs["interval"])

    print("Indexing Dictionary")
    subprocess.check_call(["samtools", "faidx", "ref.fa"])

    # The sam-to-mappings script in dx-toolkit, the region_index_offset option is specified to convert from
    # 1-indexed to 0-indexed coordinates
    print("Converting Table to SAM")
    toSamCommand = [
        "dx-mappings-to-sam", job_inputs["mappings_table_id"],
        "--output", "input.sam",
        "--region_index_offset", "-1",
        "--region_file", "regions.txt",
    ]
    print("Running: " + " ".join(toSamCommand))
    subprocess.check_call(toSamCommand)

    if checkSamContainsRead("input.sam"):
        print("Converting to BAM")
        # Needs the shell for the output redirection; no job input is interpolated.
        subprocess.check_call("samtools view -bS input.sam > input.bam", shell=True)
        print("Indexing")
        subprocess.check_call(["samtools", "index", "input.bam"])

        command = "samtools mpileup -uf ref.fa"

        # Since samtools takes a bed file to specify the regions, this takes the interval from the -L chrX:lo-hi
        # format and puts it into BED
        intervalMatch = re.findall(r"-L ([^:]*):(\d+)-(\d+)", job_inputs["interval"])
        with open("regions.bed", "w") as bedFile:
            for contig, lo, hi in intervalMatch:
                bedFile.write(contig + "\t" + lo + "\t" + hi + "\n")
        # NOTE(review): the region restriction is assumed to apply only when
        # at least one interval was parsed -- confirm against the original layout.
        if len(intervalMatch) > 0:
            command += " -l regions.bed "

        # NOTE(review): sam_options/bcf_options are inserted verbatim into a
        # shell pipeline; they must come from a trusted parameter builder.
        command += job_inputs["sam_options"]
        command += " input.bam | bcftools view "
        command += job_inputs["bcf_options"]
        command += " - > output.vcf"

        print("Pileup Command: " + command)
        subprocess.check_call(command, shell=True)

        # Convert the vcf file into variants. The dx_vcfToVariants is a script provided as resources as it has not
        # yet been incorporated into dx-toolkit
        importCommand = [
            "dx_vcfToVariants2",
            "--table_id", job_inputs["tableId"],
            "--vcf_file", "output.vcf",
            "--region_file", "regions.txt",
        ]
        if job_inputs["compress_reference"]:
            importCommand.append("--compress_reference")
        if job_inputs["infer_no_call"]:
            importCommand.append("--infer_no_call")
        if job_inputs["compress_no_call"]:
            importCommand.append("--compress_no_call")

        print("Import variants command: " + " ".join(importCommand))
        subprocess.check_call(importCommand)

    # Return 'ok', a signal which the reduce job looks for in order to know when it can begin closing the table
    return {"ok": True}
def main(**kwargs):
    """Export a variants GTable as VCF text to a file or to standard output.

    When called without keyword arguments, the arguments are parsed from the
    command line with the module-level ``parser``.  Keys read from *kwargs*:
    ``path``, ``output`` (``'-'`` selects standard output), ``reference``,
    ``write_header``, ``chr``, ``export_ref_calls`` and ``export_no_calls``.

    Raises:
        dxpy.AppError: if the table details have no ``original_contigset``
            entry, or if ``reference`` is given but is not an existing file.
    """
    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args(sys.argv[1:]))

    # Attempt to resolve variants gtable name
    try:
        project, folderpath, entity_result = resolve_existing_path(
            kwargs['path'], expected='entity')
    except ResolutionError as details:
        parser.exit(1, fill(str(details)) + '\n')

    if entity_result is None:
        parser.exit(
            1,
            fill('Could not resolve ' + kwargs['path'] + ' to a data object') + '\n')

    filename = kwargs['output']
    if filename is None:
        filename = entity_result['describe']['name'].replace('/', '%2F') + ".vcf"

    if kwargs['output'] == '-':
        outputFile = sys.stdout
    else:
        outputFile = open(filename, 'w')

    try:
        exportRef = kwargs['export_ref_calls']
        exportNoCall = kwargs['export_no_calls']

        variantsTable = dxpy.open_dxgtable(entity_result['id'])

        # Fetch the details once and reuse them for every header field below.
        try:
            tableDetails = variantsTable.get_details()
            originalContigSet = tableDetails['original_contigset']
        except Exception:
            raise dxpy.AppError(
                "The original reference genome must be attached as a detail")
        contigDetails = dxpy.DXRecord(originalContigSet).get_details()
        contigNames = contigDetails['contigs']['names']

        downloadedReference = kwargs['reference'] is None
        if downloadedReference:
            refFileName = tempfile.NamedTemporaryFile(prefix='reference_',
                                                      suffix='.txt',
                                                      delete=False).name
            dxpy.download_dxfile(
                contigDetails['flat_sequence_file']['$dnanexus_link'],
                refFileName)
        else:
            refFileName = kwargs['reference']
            if not os.path.isfile(refFileName):
                raise dxpy.AppError(
                    "The reference expected by the variants to vcf script was not a valid file"
                )

        if kwargs['write_header']:
            infos = tableDetails.get('infos')
            formats = tableDetails.get('formats')
            alts = tableDetails.get('alts')
            filters = tableDetails.get('filters')
            # A table without a 'samples' detail is treated as having no samples.
            samples = tableDetails.get('samples') or []

            outputFile.write("##fileformat=VCFv4.1\n")
            if infos is not None:
                for k, v in sorted(infos.items()):
                    outputFile.write("##INFO=<ID=" + k + ",Number=" +
                                     v['number'] + ",Type=" + v['type'] +
                                     ",Description=\"" + v['description'] +
                                     "\">\n")

            if len(samples) > 0:
                outputFile.write(
                    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
                )
                outputFile.write(
                    "##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n"
                )
                outputFile.write(
                    "##FORMAT=<ID=DP,Number=1,Type=String,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n"
                )
            # NOTE(review): assumed to be a sibling of the samples check above,
            # not nested inside it -- confirm against the original layout.
            if formats is not None:
                for k, v in sorted(formats.items()):
                    outputFile.write("##FORMAT=<ID=" + k + ",Number=" +
                                     v['number'] + ",Type=" + v['type'] +
                                     ",Description=\"" + v['description'] +
                                     "\">\n")
            if alts is not None:
                for k, v in sorted(alts.items()):
                    outputFile.write("##ALT=<ID=" + k + ",Description=\"" +
                                     v['description'] + "\">\n")
            if filters is not None:
                for k, v in sorted(filters.items()):
                    outputFile.write("##FILTER=<ID=" + k + ",Description=\"" +
                                     v + "\">\n")
            for name, size in zip(contigNames, contigDetails['contigs']['sizes']):
                outputFile.write("##contig=<ID=" + name + ",length=" +
                                 str(size) + ">\n")

            outputFile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")
            if len(samples) > 0:
                outputFile.write("\tFORMAT")
                for x in samples:
                    outputFile.write("\t" + x)
            outputFile.write("\n")

        chromosomeOffsets = dict(
            zip(contigNames, contigDetails['contigs']['offsets']))

        with open(refFileName, 'r') as refFile:
            contigSequence = refFile.read()
        if downloadedReference:
            # The temporary copy is only needed until it has been read into memory.
            try:
                os.remove(refFileName)
            except OSError:
                pass

        # Map column name -> position in a fetched row (position 0 is the row ID).
        names = variantsTable.get_col_names()
        col = collections.OrderedDict(
            sorted((name, i + 1) for i, name in enumerate(names)))

        chromosomeList = contigNames
        if kwargs['chr'] is not None:
            chromosomeList = [x for x in contigNames if x in kwargs['chr']]

        for chromosome in chromosomeList:
            query = variantsTable.genomic_range_query(chr=chromosome,
                                                      lo=0,
                                                      hi=sys.maxsize)
            firstRows = variantsTable.get_rows(query=query, limit=1)['data']
            if not firstRows:
                # No variants on this contig; without this guard startRow would
                # be unbound (or left over from the previous contig).
                continue
            startRow = firstRows[0][0]

            buff = []
            lastPosition = -1
            for row in variantsTable.iterate_rows(start=startRow):
                if row[1] != chromosome:
                    break
                # Rows sharing a 'lo' position are buffered and written together.
                if lastPosition < row[col["lo"]]:
                    writeBuffer(buff, col, outputFile, contigSequence,
                                chromosomeOffsets, exportRef, exportNoCall)
                    buff = []
                buff.append(row)
                lastPosition = row[col["lo"]]
            writeBuffer(buff, col, outputFile, contigSequence,
                        chromosomeOffsets, exportRef, exportNoCall)
    finally:
        if outputFile is not sys.stdout:
            outputFile.close()
def main(**job_inputs):
    """Create the output variants GTable and fan out mapGatk jobs plus one reduceGatk job.

    Keys read from *job_inputs*: ``mappings`` (a list of links),
    ``reference``, ``reads_per_job``, ``output_mode``,
    ``call_multiple_samples``, ``genotype_likelihood_model``,
    ``compress_reference``, ``compress_no_call``, ``infer_no_call``,
    ``intervals_merging`` and, optionally, ``output_name``,
    ``intervals_to_process`` and ``intervals_to_exclude``.  The whole dict
    is also handed to buildCommand.

    Side effects: sets ``CLASSPATH`` in ``os.environ`` and forces
    ``infer_no_call`` to False when ``output_mode`` is EMIT_VARIANTS_ONLY.

    :returns: dict whose ``variants`` entry is a reference to the
        ``variants`` output field of the reduce job

    Raises:
        Exception: if the first mappings table's details have no usable
            ``original_contigset`` entry.
    """
    os.environ['CLASSPATH'] = '/opt/jar/AddOrReplaceReadGroups.jar:/opt/jar/GenomeAnalysisTK.jar'

    if job_inputs['output_mode'] == "EMIT_VARIANTS_ONLY":
        job_inputs['infer_no_call'] = False

    mappings = job_inputs['mappings']
    mappingsTable = dxpy.open_dxgtable(mappings[0]['$dnanexus_link'])

    # Describe every mappings table once; the descriptions provide the read
    # counts, the sample names and the default output name.
    mappingDescriptions = [dxpy.DXGTable(link).describe() for link in mappings]

    # This controls the degree of parallelism in GATK
    reads = sum(int(desc['length']) for desc in mappingDescriptions)
    chunks = int(reads / job_inputs['reads_per_job']) + 1

    # NOTE(review): built once and shared by all map jobs; assumes
    # buildCommand depends only on job_inputs -- confirm.
    command = buildCommand(job_inputs)

    try:
        originalContigSet = mappingsTable.get_details()['original_contigset']
        contigSetId = originalContigSet['$dnanexus_link']
    except Exception:
        raise Exception("The original reference genome must be attached as a detail")

    variants_schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "int32"},
        {"name": "hi", "type": "int32"},
        {"name": "ref", "type": "string"},
        {"name": "alt", "type": "string"},
        {"name": "qual", "type": "double"},
        {"name": "ids", "type": "string"}
    ]

    elevatedTags = ['format_GT', 'format_DP', 'format_AD']
    headerInfo = extractHeader("/tmp/header.txt", elevatedTags)
    infoTags = headerInfo['tags']['info']
    formatTags = headerInfo['tags']['format']

    for k, v in infoTags.items():
        variants_schema.append({"name": "info_" + k, "type": translateTagTypeToColumnType(v)})

    if job_inputs['call_multiple_samples'] == False:  # noqa: E712 -- keeps the original comparison semantics
        numSamples = 1
        samples = ["Sample_0"]
    else:
        numSamples = len(mappings)
        samples = [desc['name'].replace(" ", "") for desc in mappingDescriptions]

    # For each sample, write the sample-specific columns
    for i in range(numSamples):
        suffix = str(i)
        variants_schema.extend([
            {"name": "genotype_" + suffix, "type": "string"},
            {"name": "phasing_" + suffix, "type": "string"},
            {"name": "type_" + suffix, "type": "string"},
            {"name": "variation_qual_" + suffix, "type": "double"},
            {"name": "genotype_qual_" + suffix, "type": "double"},
            {"name": "coverage_" + suffix, "type": "string"},
            {"name": "total_coverage_" + suffix, "type": "int32"}
        ])
        for k, v in formatTags.items():
            if "format_" + k not in elevatedTags:
                variants_schema.append({"name": "format_" + k + "_" + suffix, "type": translateTagTypeToColumnType(v)})

    # Only the genomic range index is created for the table.
    variantsTable = dxpy.new_dxgtable(variants_schema, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")])
    tableId = variantsTable.get_id()
    variantsTable = dxpy.open_dxgtable(tableId)
    variantsTable.add_types(["Variants", "gri"])

    details = {'samples': samples,
               'original_contigset': job_inputs['reference'],
               'original_mappings': job_inputs['mappings'],
               'formats': formatTags,
               'infos': infoTags}
    variantsTable.set_details(details)

    # mappingDescriptions[0] describes the same table that mappingsTable refers to.
    mappingsName = mappingDescriptions[0]['name']
    if 'output_name' in job_inputs:
        variantsTable.rename(job_inputs['output_name'])
    elif job_inputs['genotype_likelihood_model'] == "SNP":
        variantsTable.rename(mappingsName + " SNP calls by GATK")
    elif job_inputs['genotype_likelihood_model'] == "INDEL":
        variantsTable.rename(mappingsName + " indel calls by GATK")
    elif job_inputs['genotype_likelihood_model'] == "BOTH":
        variantsTable.rename(mappingsName + " SNP and indel calls by GATK")
    else:
        variantsTable.rename(mappingsName + " variant calls by GATK")

    reduceInput = {}
    commandList = splitGenomeLengthLargePieces(originalContigSet, chunks)

    for i, interval in enumerate(commandList):
        if len(interval) == 0:
            continue
        mapInput = {
            'mappings_tables': job_inputs['mappings'],
            'original_contig_set': contigSetId,
            'interval': interval,
            'tableId': tableId,
            'command': command,
            'compress_reference': job_inputs['compress_reference'],
            'infer_no_call': job_inputs['infer_no_call'],
            'compress_no_call': job_inputs['compress_no_call'],
            'intervals_to_include': job_inputs.get('intervals_to_process'),
            'intervals_to_exclude': job_inputs.get('intervals_to_exclude'),
            'intervals_merging': job_inputs['intervals_merging'],
            'part_number': i,
            'samples': samples,
            'call_multiple_samples': job_inputs['call_multiple_samples']
        }
        # Run a "map" job for each chunk
        mapJobId = dxpy.new_dxjob(fn_input=mapInput, fn_name="mapGatk").get_id()
        reduceInput["mapJob" + str(i) + "TableId"] = {'job': mapJobId, 'field': 'id'}
    # Set once, outside the loop, so the reduce job always receives the table
    # ID even if no interval produced a map job.
    reduceInput['tableId'] = tableId

    reduceJobId = dxpy.new_dxjob(fn_input=reduceInput, fn_name="reduceGatk").get_id()
    return {'variants': {'job': reduceJobId, 'field': 'variants'}}