Beispiel #1
0
    def test_upload_download_large_file_small_bufsize_dxfile(self):
        """Check the size of the first uploaded part for two write buffer sizes.

        Each scenario creates a file with a declared ``expected_file_size``
        of ``300000 * 50000`` bytes, writes 700000 bytes to it and then
        inspects the size reported for part ``'1'``:

        * with ``write_buffer_size=280000`` the first part is expected to be
          300000 bytes;
        * with ``write_buffer_size=320000`` the first part is expected to be
          320000 bytes.
        """
        num_parts = 50000
        expected_file_size = 300000 * num_parts

        # (requested write_buffer_size, size expected for part '1')
        cases = (
            (280000, 300000),
            (320000, 320000),
        )

        for buffer_size, first_part_size in cases:
            with dxpy.new_dxfile(mode='w',
                                 project=self.proj_id,
                                 write_buffer_size=buffer_size,
                                 expected_file_size=expected_file_size) as myfile:
                myfile.write("0" * 700000)
            # NOTE(review): the handle is closed again with block=True after
            # leaving the context manager, presumably to wait until the file
            # is fully closed before describing its parts -- confirm.
            myfile.close(block=True)
            parts = myfile.describe(fields={"parts": True})['parts']
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(parts['1']['size'], first_part_size)
Beispiel #2
0
 def test_file_context_manager(self):
     """Write through a ``with`` block, then verify state and size.

     After the context manager exits, the file is reopened by ID and must
     report a state of ``'closing'`` or ``'closed'``; once the close has
     been waited on, its described size must equal the 4 characters
     written.
     """
     with dxpy.new_dxfile(mode='w') as self.dxfile:
         file_id = self.dxfile.get_id()
         self.dxfile.write("Haha")
     file2 = dxpy.open_dxfile(file_id)
     # assertIn shows the unexpected state in the failure message, whereas
     # assertTrue(state in [...]) would only say "False is not true".
     self.assertIn(file2._get_state(), ['closing', 'closed'])
     file2._wait_on_close()
     self.assertEqual(file2.describe()["size"], 4)
Beispiel #3
0
 def test_dxfile_errors(self):
     """Reading from a freshly written handle must raise DXFileError.

     Both an explicit ``read`` call and iteration over the handle are
     checked.
     """
     handle = self.dxfile = dxpy.new_dxfile()
     handle.write("Line 1\nLine 2\nLine 3\n")

     # Explicit read on the handle that was just written to.
     with self.assertRaises(DXFileError):
         handle.read(3)

     # Line iteration on the same handle.
     with self.assertRaises(DXFileError):
         for _ in handle:
             pass
Beispiel #4
0
    def test_upload_download_large_file_small_bufsize_dxfile(self):
        """Check the size of the first uploaded part for two write buffer sizes.

        Each scenario creates a file with a declared ``expected_file_size``
        of ``300000 * 50000`` bytes, writes 700000 bytes to it and then
        inspects the size reported for part ``'1'``:

        * with ``write_buffer_size=280000`` the first part is expected to be
          300000 bytes;
        * with ``write_buffer_size=320000`` the first part is expected to be
          320000 bytes.
        """
        num_parts = 50000
        expected_file_size = 300000 * num_parts

        # (requested write_buffer_size, size expected for part '1')
        cases = (
            (280000, 300000),
            (320000, 320000),
        )

        for buffer_size, first_part_size in cases:
            with dxpy.new_dxfile(mode='w',
                                 project=self.proj_id,
                                 write_buffer_size=buffer_size,
                                 expected_file_size=expected_file_size) as myfile:
                myfile.write("0" * 700000)
            # NOTE(review): the handle is closed again with block=True after
            # leaving the context manager, presumably to wait until the file
            # is fully closed before describing its parts -- confirm.
            myfile.close(block=True)
            parts = myfile.describe(fields={"parts": True})['parts']
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(parts['1']['size'], first_part_size)
Beispiel #5
0
    def test_iter_dxfile(self):
        """Write three lines, reopen the file by ID and iterate over it.

        Every line produced by the iteration must equal ``"Line <n>"``,
        where ``n`` counts up from 1.
        """
        with dxpy.new_dxfile() as self.dxfile:
            file_id = self.dxfile.get_id()
            self.dxfile.write("Line 1\nLine 2\nLine 3\n")

        with dxpy.open_dxfile(file_id) as reopened:
            reopened.wait_on_close()
            self.assertTrue(reopened.closed())

            # Line numbers in the written content start at 1.
            for number, text in enumerate(reopened, 1):
                self.assertEqual(text, "Line " + str(number))
Beispiel #6
0
def logThisRun(runId, resultsFolder, projectId):
    '''Record runId in the RUNS_LAUNCHED_FILE file located in resultsFolder.

    A new file is always created with the new entry as its first line.
    If a file already exists at that path, its lines are copied after the
    new entry and the old file is removed from the project.
    '''
    # Original author's note: appending to the existing file did not seem
    # to work as documented, hence "write new, delete old".
    previousId = dxencode.find_file(resultsFolder + '/' + RUNS_LAUNCHED_FILE, projectId)

    logFile = dxpy.new_dxfile('a',
                              project=projectId,
                              folder=resultsFolder,
                              name=RUNS_LAUNCHED_FILE)
    timestamp = str(datetime.now())
    logFile.write(runId + ' started:' + timestamp + '\n')

    if previousId is None:
        # Nothing to carry over.
        logFile.close()
        return

    # Carry the earlier entries over, then drop the superseded file.
    with dxpy.open_dxfile(previousId) as previous:
        for entry in previous:
            logFile.write(entry + '\n')
    dxpy.DXProject(projectId).remove_objects([previousId])
    logFile.close()
Beispiel #7
0
    def setup_files(self, files):
        """Create one small DX file per entry of ``files`` for use in tests.

        The files are not guaranteed to have reached the closed state by the
        time this method returns.

        Args:
            files (List[str]): virtual paths, relative to the project root
                and starting with '/', of the files to create on DX.
        """
        for index, rel_path in enumerate(files):
            target = Path(rel_path)
            parent_dir = target.parent
            # Make sure the containing folder exists before creating the file.
            self.project_handler.new_folder(parent_dir, parents=True)
            with dxpy.new_dxfile(name=target.name,
                                 folder='/' + parent_dir.lstrip('/'),
                                 project=self.proj_id) as handle:
                # Each file gets distinct content: data0, data1, ...
                payload = 'data{}'.format(index).encode()
                handle.write(payload)
Beispiel #8
0
    def setup_file(self, obj):
        """Create a single DX file for testing and wait for it to close.

        Args:
            obj (str): virtual path, relative to the project root and
                starting with '/', of the file to create on DX.

        Returns:
            The file handler produced by ``dxpy.new_dxfile``.
        """
        target = Path(obj)
        parent_dir = target.parent
        # Make sure the containing folder exists before creating the file.
        self.project_handler.new_folder(parent_dir, parents=True)
        folder = '/' + parent_dir.lstrip('/')
        with dxpy.new_dxfile(name=target.name,
                             folder=folder,
                             project=self.proj_id) as handle:
            payload = 'data'.encode()
            handle.write(payload)
        # Give the file up to 20s to reach the closed state.
        handle.wait_on_close(20)
        return handle
def logThisRun(runId, resultsFolder, projectId):
    '''Add a "<runId> started:<timestamp>" line to the runs-launched file.

    The file is rewritten rather than appended to: a new file named
    RUNS_LAUNCHED_FILE is created in resultsFolder with the new line
    first, the lines of any existing file follow, and the existing file
    is then removed from the project.
    '''
    # Original author's note: in-place append did not appear to be possible
    # despite the DX manual, so the file is replaced instead.
    existingPath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    existingId = dxencode.find_file(existingPath, projectId)

    replacement = dxpy.new_dxfile('a', project=projectId,
                                  folder=resultsFolder,
                                  name=RUNS_LAUNCHED_FILE)
    startedAt = str(datetime.now())
    replacement.write(runId + ' started:' + startedAt + '\n')

    hasExisting = existingId is not None
    if hasExisting:
        # Copy the earlier entries below the new one.
        with dxpy.open_dxfile(existingId) as existing:
            for earlierLine in existing:
                replacement.write(earlierLine + '\n')
        # The old file is now superseded; remove it from the project.
        project = dxpy.DXProject(projectId)
        project.remove_objects([existingId])

    replacement.close()
Beispiel #10
0
    def test_write_read_dxfile(self):
        """Write ``self.foo_str`` to a new file and read it back by ID.

        Checks, in order: a read of exactly the written length returns the
        full content, a further read returns nothing, and after seeking to
        offset 1 a read returns the content minus its first character.
        """
        payload = self.foo_str

        with dxpy.new_dxfile() as self.dxfile:
            file_id = self.dxfile.get_id()
            self.dxfile.write(payload)

        with dxpy.open_dxfile(file_id) as reopened:
            reopened.wait_on_close()
            self.assertTrue(reopened.closed())

            # Full read of the content.
            first_read = reopened.read(len(payload))
            self.assertEqual(payload, first_read)

            # Nothing is left after the full read.
            leftover = reopened.read()
            self.assertEqual(len(leftover), 0)

            # Rewind to offset 1 and read the remainder.
            reopened.seek(1)
            tail = reopened.read()
            self.assertEqual(payload[1:], tail)
Beispiel #11
0
 def test_file_context_manager_destructor(self):
     """Write to a new file handle and return without closing it explicitly.

     The test makes no assertions.
     NOTE(review): judging by the test name, the intent is presumably that
     cleanup of the still-open handle happens without raising -- confirm.
     """
     handle = dxpy.new_dxfile(mode='w')
     handle.write("Haha")
def main():
    """Validate the inputs and fan the work out as map/reduce sub-jobs.

    All inputs are read from ``job['input']`` (``job`` is not defined in
    this function) and the result is stored in ``job['output']`` as a
    job-based reference to the ``recalibrated_table`` field of the final
    ``reduceBestPractices`` job.

    Input keys read here: ``mappings``, ``reference``, ``reads_per_job``,
    ``deduplicate_interchromosomal_pairs``, ``separate_read_groups``,
    ``discard_duplicates``, ``dbsnp``, ``intervals_merging`` and, optionally,
    ``output_name``, ``intervals_to_process``, ``intervals_to_exclude`` and
    ``known_indels``.

    Raises:
        dxpy.AppError: if a mappings table has no ``quality`` column, if the
            ``original_contigset`` detail cannot be read from the first
            mappings table, or if that contig set differs from the
            ``reference`` input.
    """
    ## RUN DEDUPLICATE
    os.environ['CLASSPATH'] = '/opt/jar/MarkDuplicates.jar:/opt/jar/MergeSamFiles.jar'

    outputName = job['input'].get('output_name', '')
    recalibratedTable = createNewTable(job['input']['mappings'], outputName)

    print("Recalibrated Table: " + recalibratedTable.get_id())

    mappingsTable = dxpy.DXGTable(job['input']['mappings'][0]['$dnanexus_link'])
    for x in job['input']['mappings']:
        table = dxpy.DXGTable(x)
        if 'quality' not in table.get_col_names():
            if len(job['input']['mappings']) > 1:
                raise dxpy.AppError("One of the provided mappings did not have quality scores, for example %s. GATK can't recalibrate mappings without quality scores" % table.describe()['name'])
            else:
                raise dxpy.AppError("The provided mappings did not have quality scores. GATK can't recalibrate mappings without quality scores")

    try:
        # Fetch the details once and derive both values from them.
        originalContigSet = mappingsTable.get_details()['original_contigset']
        contigSetId = originalContigSet['$dnanexus_link']
    except Exception:
        # Any failure to read the detail is reported as an application
        # error; KeyboardInterrupt/SystemExit are deliberately not caught.
        raise dxpy.AppError("The original reference genome must be attached as a detail")

    if contigSetId != job['input']['reference']['$dnanexus_link']:
        raise dxpy.AppError("The reference genome of the mappings does not match the provided reference genome")

    # Total number of rows over all mappings tables decides how many chunks
    # are requested.
    reads = sum(int(dxpy.DXGTable(x).describe()['length'])
                for x in job['input']['mappings'])
    chunks = int(reads/job['input']['reads_per_job'])+1

    #Split the genome into chunks to parallelize
    commandList = splitGenomeLengthChromosome(originalContigSet, chunks)
    chunks = len(commandList)
    if chunks == 1:
        # With a single chunk the separate interchromosomal pass is
        # switched off.
        job['input']['deduplicate_interchromosomal_pairs'] = False

    #This is a Picard Mark Duplicates job run only on interchromosomal mappings in the case that the genome is split into regions
    #This is necessary because Mark Duplicates has to look at both mates in a read pair, so interchromosomal mappings must go together
    reduceInterchromosomeInput = {}
    bamFiles = []
    if job['input']['deduplicate_interchromosomal_pairs']:
        # NOTE(review): the loop starts at -1, so the first iteration uses
        # commandList[-1] (the last interval) with job_number -1 -- confirm
        # this is intended.
        for i in range(-1, chunks):
            bamFiles.append(dxpy.new_dxfile().get_id())
            mapInterchromosomeInput = {
                'mappings_tables': job['input']['mappings'],
                'interval': commandList[i],
                'job_number': i,
                'separate_read_groups': job['input']['separate_read_groups']
            }
            interchromosomeJobId = dxpy.new_dxjob(fn_input=mapInterchromosomeInput, fn_name="mapInterchromosome").get_id()
            reduceInterchromosomeInput["mapJob"+str(i)] = {'job': interchromosomeJobId, 'field': 'file_id'}

        #Make a reduce job for the interchromosome component
        reduceInterchromosomeInput["file_list"] = bamFiles
        reduceInterchromosomeInput["interval"] = commandList
        reduceInterchromosomeInput["discard_duplicates"] = job['input']['discard_duplicates']
        # NOTE(review): this job ID is overwritten below by the
        # reduceBestPractices job ID and is never referenced afterwards.
        reduceJobId = dxpy.new_dxjob(fn_input=reduceInterchromosomeInput, fn_name="reduceInterchromosome").get_id()
        deduplicateInterchromosome = True
    else:
        deduplicateInterchromosome = False

    #This runs the Picard Mark Duplicates program to deduplicate the reads
    reduceInput = {}
    for i, interval in enumerate(commandList):
        print(interval)
        mapBestPracticesInput = {
            'mappings_tables': job['input']['mappings'],
            'recalibrated_table_id': recalibratedTable.get_id(),
            'file_list': bamFiles,
            'interval': interval,
            'job_number': i,
            'reference': job['input']['reference']['$dnanexus_link'],
            'dbsnp': job['input']['dbsnp'],
            'separate_read_groups': job['input']['separate_read_groups'],
            'discard_duplicates': job['input']['discard_duplicates'],
            'parent_input': job['input'],
            'intervals_to_include': job['input'].get('intervals_to_process'),
            'intervals_to_exclude': job['input'].get('intervals_to_exclude'),
            'intervals_merging': job['input']['intervals_merging'],
            'deduplicate_interchromosome': deduplicateInterchromosome
        }
        if 'known_indels' in job['input']:
            mapBestPracticesInput['known_indels'] = job['input']['known_indels']

        mapJobId = dxpy.new_dxjob(fn_input=mapBestPracticesInput, fn_name="mapBestPractices").get_id()
        # The reduce job receives each map job's 'ok' output field.
        reduceInput["mapJob" + str(i)] = {'job': mapJobId, 'field': 'ok'}
    reduceInput["recalibrated_table"] = recalibratedTable.get_id()

    reduceJobId = dxpy.new_dxjob(fn_input=reduceInput, fn_name="reduceBestPractices").get_id()
    job['output'] = {'recalibrated_mappings': {'job': reduceJobId, 'field': 'recalibrated_table'}}