def test_file_context_manager(self):
    with dxpy.new_dxfile(mode='w') as self.dxfile:
        file_id = self.dxfile.get_id()
        self.dxfile.write("Haha")
    # Leaving the context manager initiates the close, but does not wait for it.
    file2 = dxpy.open_dxfile(file_id)
    state = file2._get_state()
    self.assertIn(state, ['closing', 'closed'])
    file2._wait_on_close()
    self.assertEqual(file2.describe()["size"], 4)
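# A hedged companion sketch of the same flow without the context manager,
# using only calls that appear in these tests: close(block=True) blocks
# until the platform finishes closing the file, so the intermediate
# 'closing' state is never observed.
def write_and_verify_sketch():
    fh = dxpy.new_dxfile(mode='w')
    fh.write("Haha")
    fh.close(block=True)  # block until the file reaches the 'closed' state
    assert dxpy.open_dxfile(fh.get_id()).describe()["size"] == 4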
def test_dxfile_errors(self):
    self.dxfile = dxpy.new_dxfile()
    self.dxfile.write("Line 1\nLine 2\nLine 3\n")
    # Reading from a file opened for writing raises DXFileError.
    with self.assertRaises(DXFileError):
        self.dxfile.read(3)
    with self.assertRaises(DXFileError):
        for line in self.dxfile:
            pass
def test_upload_download_large_file_small_bufsize_dxfile(self):
    num_parts = 50000
    common_args = dict(mode='w', project=self.proj_id)
    # A requested buffer (280 kB) below the minimum part size implied by
    # expected_file_size is expected to be rounded up to 300 kB.
    with dxpy.new_dxfile(write_buffer_size=280000,
                         expected_file_size=300000 * num_parts,
                         **common_args) as myfile:
        myfile.write("0" * 700000)
        myfile.close(block=True)
        parts = myfile.describe(fields={"parts": True})['parts']
        self.assertEqual(parts['1']['size'], 300000)
    # A buffer already at or above that minimum (320 kB) is used as-is.
    with dxpy.new_dxfile(write_buffer_size=320000,
                         expected_file_size=300000 * num_parts,
                         **common_args) as myfile:
        myfile.write("0" * 700000)
        myfile.close(block=True)
        parts = myfile.describe(fields={"parts": True})['parts']
        self.assertEqual(parts['1']['size'], 320000)
def test_iter_dxfile(self):
    dxid = ""
    with dxpy.new_dxfile() as self.dxfile:
        dxid = self.dxfile.get_id()
        self.dxfile.write("Line 1\nLine 2\nLine 3\n")
    with dxpy.open_dxfile(dxid) as same_dxfile:
        same_dxfile.wait_on_close()
        self.assertTrue(same_dxfile.closed())
        # Iteration yields one line at a time with the newline stripped.
        lineno = 1
        for line in same_dxfile:
            self.assertEqual(line, "Line " + str(lineno))
            lineno += 1
def setup_files(self, files):
    """Sets up files for testing.

    This does not assume the files will be closed by the end of
    this function.

    Args:
        files (List[str]): List of files relative to the project root
            to be created on DX. Only virtual paths are allowed; each
            path must start with '/'.
    """
    for i, curr_file in enumerate(files):
        dx_p = Path(curr_file)
        self.project_handler.new_folder(dx_p.parent, parents=True)
        with dxpy.new_dxfile(name=dx_p.name,
                             folder='/' + dx_p.parent.lstrip('/'),
                             project=self.proj_id) as f:
            f.write('data{}'.format(i).encode())
def setup_file(self, obj):
    """Sets up a closed file for testing.

    Args:
        obj (str): File relative to the project root to be created on
            DX. Only virtual paths are allowed; the path must start
            with '/'.
    """
    dx_p = Path(obj)
    self.project_handler.new_folder(dx_p.parent, parents=True)
    with dxpy.new_dxfile(name=dx_p.name,
                         folder='/' + dx_p.parent.lstrip('/'),
                         project=self.proj_id) as f:
        f.write('data'.encode())
    # Allow up to 20 s for the file state to transition to 'closed'.
    f.wait_on_close(20)
    return f
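# Hypothetical usage of the two helpers above inside a test method (the
# paths are illustrative): setup_files may return while its files are still
# in the 'closing' state, whereas setup_file blocks until its file is
# 'closed'.
def test_example_sketch(self):
    self.setup_files(['/a/b/one.txt', '/a/two.txt'])
    handler = self.setup_file('/a/b/three.txt')
    self.assertEqual(handler.describe()['state'], 'closed')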
def logThisRun(runId, resultsFolder, projectId):
    '''Adds a runId to the runsLaunched file in resultsFolder.'''
    # NOTE: DX files cannot be appended to, so emulate appending by writing
    # a new file and deleting the old one.
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    oldFid = dxencode.find_file(launchFilePath, projectId)
    newFh = dxpy.new_dxfile('a', project=projectId, folder=resultsFolder,
                            name=RUNS_LAUNCHED_FILE)
    newFh.write(runId + ' started:' + str(datetime.now()) + '\n')
    if oldFid is not None:
        # Copy the old log's entries into the new file, then delete the old file.
        with dxpy.open_dxfile(oldFid) as oldFh:
            for oldRunId in oldFh:
                newFh.write(oldRunId + '\n')
        proj = dxpy.DXProject(projectId)
        proj.remove_objects([oldFid])
    newFh.close()
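# A hedged companion sketch (not part of the original module): reads back
# the runIds recorded by logThisRun. Assumes RUNS_LAUNCHED_FILE and
# dxencode.find_file behave as in logThisRun above.
def runsLaunched(resultsFolder, projectId):
    '''Returns the runIds recorded in the runsLaunched file, if any.'''
    fid = dxencode.find_file(resultsFolder + '/' + RUNS_LAUNCHED_FILE, projectId)
    if fid is None:
        return []
    with dxpy.open_dxfile(fid) as fh:
        # Each line has the form "<runId> started:<timestamp>"; keep the runId.
        return [line.split(' started:')[0] for line in fh]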
def test_write_read_dxfile(self):
    dxid = ""
    with dxpy.new_dxfile() as self.dxfile:
        dxid = self.dxfile.get_id()
        self.dxfile.write(self.foo_str)
    with dxpy.open_dxfile(dxid) as same_dxfile:
        same_dxfile.wait_on_close()
        self.assertTrue(same_dxfile.closed())
        buf = same_dxfile.read(len(self.foo_str))
        self.assertEqual(self.foo_str, buf)
        # At EOF, read() returns an empty result.
        buf = same_dxfile.read()
        self.assertEqual(len(buf), 0)
        # Seeking rewinds the read cursor.
        same_dxfile.seek(1)
        buf = same_dxfile.read()
        self.assertEqual(self.foo_str[1:], buf)
def test_file_context_manager_destructor(self):
    dxfile = dxpy.new_dxfile(mode='w')
    dxfile.write("Haha")
    # No explicit close: the DXFile destructor is expected to flush the
    # buffered write when the handle is garbage-collected.
def main():
    ## RUN DEDUPLICATE

    os.environ['CLASSPATH'] = '/opt/jar/MarkDuplicates.jar:/opt/jar/MergeSamFiles.jar'

    #mappingsTable = dxpy.open_dxgtable(job['input']['mappings'][0]['$dnanexus_link'])

    if 'output_name' in job['input']:
        outputName = job['input']['output_name']
    else:
        outputName = ''

    recalibratedTable = createNewTable(job['input']['mappings'], outputName)
    #print("Mappings Table: " + mappingsTable.get_id())
    print("Recalibrated Table: " + recalibratedTable.get_id())

    mappingsTable = dxpy.DXGTable(job['input']['mappings'][0]['$dnanexus_link'])
    for x in job['input']['mappings']:
        if 'quality' not in dxpy.DXGTable(x).get_col_names():
            if len(job['input']['mappings']) > 1:
                raise dxpy.AppError("One of the provided mappings did not have quality scores, for example %s. GATK can't recalibrate mappings without quality scores" % dxpy.DXGTable(x).describe()['name'])
            else:
                raise dxpy.AppError("The provided mappings did not have quality scores. GATK can't recalibrate mappings without quality scores")

    try:
        contigSetId = mappingsTable.get_details()['original_contigset']['$dnanexus_link']
        originalContigSet = mappingsTable.get_details()['original_contigset']
    except:
        raise dxpy.AppError("The original reference genome must be attached as a detail")
    if contigSetId != job['input']['reference']['$dnanexus_link']:
        raise dxpy.AppError("The reference genome of the mappings does not match the provided reference genome")

    reads = 0
    for x in job['input']['mappings']:
        table = dxpy.DXGTable(x)
        reads += int(table.describe()['length'])
    chunks = int(reads / job['input']['reads_per_job']) + 1

    # Split the genome into chunks to parallelize
    commandList = splitGenomeLengthChromosome(originalContigSet, chunks)
    chunks = len(commandList)
    if chunks == 1:
        job['input']['deduplicate_interchromosomal_pairs'] = False
    excludeInterchromosome = (chunks > 1)

    markDuplicatesJobs = []

    # This is a Picard Mark Duplicates job run only on interchromosomal
    # mappings in the case that the genome is split into regions. This is
    # necessary because Mark Duplicates has to look at both mates in a read
    # pair, so interchromosomal mappings must go together.
    reduceInterchromosomeInput = {}
    bamFiles = []
    if job['input']['deduplicate_interchromosomal_pairs']:
        for i in range(-1, chunks):
            bamFiles.append(dxpy.new_dxfile().get_id())
            mapInterchromosomeInput = {
                'mappings_tables': job['input']['mappings'],
                'interval': commandList[i],
                'job_number': i,
                'separate_read_groups': job['input']['separate_read_groups']
            }
            interchromosomeJobId = dxpy.new_dxjob(fn_input=mapInterchromosomeInput, fn_name="mapInterchromosome").get_id()
            reduceInterchromosomeInput["mapJob" + str(i)] = {'job': interchromosomeJobId, 'field': 'file_id'}
            #interchromosomeJobField = {'job': interchromosomeJobId, 'field': 'bam'}

        # Make a reduce job for the interchromosome component
        reduceInterchromosomeInput["file_list"] = bamFiles
        reduceInterchromosomeInput["interval"] = commandList
        reduceInterchromosomeInput["discard_duplicates"] = job['input']['discard_duplicates']
        reduceJobId = dxpy.new_dxjob(fn_input=reduceInterchromosomeInput, fn_name="reduceInterchromosome").get_id()
        deduplicateInterchromosome = True
    else:
        interchromosomeJobField = ''
        deduplicateInterchromosome = False

    # This runs the Picard Mark Duplicates program to deduplicate the reads
    reduceInput = {}
    for i in range(len(commandList)):
        print(commandList[i])
        mapBestPracticesInput = {
            'mappings_tables': job['input']['mappings'],
            'recalibrated_table_id': recalibratedTable.get_id(),
            'file_list': bamFiles,
            'interval': commandList[i],
            'job_number': i,
            'reference': job['input']['reference']['$dnanexus_link'],
            'dbsnp': job['input']['dbsnp'],
            'separate_read_groups': job['input']['separate_read_groups'],
            'discard_duplicates': job['input']['discard_duplicates'],
            'parent_input': job['input'],
            'intervals_to_include': job['input'].get('intervals_to_process'),
            'intervals_to_exclude': job['input'].get('intervals_to_exclude'),
            'intervals_merging': job['input']['intervals_merging'],
            'deduplicate_interchromosome': deduplicateInterchromosome
        }
        if 'known_indels' in job['input']:
            mapBestPracticesInput['known_indels'] = job['input']['known_indels']

        mapJobId = dxpy.new_dxjob(fn_input=mapBestPracticesInput, fn_name="mapBestPractices").get_id()
        reduceInput["mapJob" + str(i)] = {'job': mapJobId, 'field': 'ok'}

    reduceInput["recalibrated_table"] = recalibratedTable.get_id()
    reduceJobId = dxpy.new_dxjob(fn_input=reduceInput, fn_name="reduceBestPractices").get_id()
    job['output'] = {'recalibrated_mappings': {'job': reduceJobId, 'field': 'recalibrated_table'}}