Ejemplo n.º 1
0
def checkRunsPreviouslyLaunched(resultsFolder,projectId):
    '''Checks for currently running jobs and will exit if found.

    Reads every RUNS_LAUNCHED_FILE found under resultsFolder; each line is expected
    to start with an "analysis-..." id (optionally followed by a description).
    Exits the process with status 1 if any recorded analysis is still running.
    '''
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    launchFids = dxencode.find_file(launchFilePath,projectId,multiple=True)
    if launchFids is None:  # FIX: identity comparison, not '== None'
        print("  No prior jobs launched.")
    else:
        # NOTE: Appending to the one file, but just in case handle multiple files.
        for fid in launchFids:
            with dxpy.open_dxfile(fid) as fd:
                for line in fd:
                    runId = line.split(None,1)
                    # FIX: a blank line splits to [] and runId[0] raised IndexError.
                    if not runId or not runId[0].startswith('analysis-'):
                        continue
                    analysis = dxpy.DXAnalysis(dxid=runId[0])
                    if analysis is None:
                        continue
                    state = analysis.describe()['state']
                    # states I have seen: in_progress, terminated, done, failed
                    if state not in [ "done", "failed", "terminated" ]:
                        msg="Exiting: Can't launch because prior run ["+runId[0]+"] "
                        if len(runId) > 1:
                            msg+="("+runId[1]+") "
                        msg+= "has not finished (currently '"+state+"')."
                        print(msg)
                        sys.exit(1)
                    else:
                        msg="  Prior run ["+runId[0]+"] "
                        if len(runId) > 1:
                            msg+="("+runId[1]+") "
                        msg+= "is '"+state+"'."
                        print(msg)
Ejemplo n.º 2
0
def checkRunsPreviouslyLaunched(resultsFolder, projectId):
    '''Checks for currently running jobs and will exit if found.

    Scans every RUNS_LAUNCHED_FILE in resultsFolder for recorded "analysis-..."
    ids and exits with status 1 if any of those analyses has not yet reached a
    terminal state (done/failed/terminated).
    '''
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    launchFids = dxencode.find_file(launchFilePath, projectId, multiple=True)
    if launchFids is None:  # FIX: identity comparison, not '== None'
        print("  No prior jobs launched.")
    else:
        # NOTE: Appending to the one file, but just in case handle multiple files.
        for fid in launchFids:
            with dxpy.open_dxfile(fid) as fd:
                for line in fd:
                    runId = line.split(None, 1)
                    # FIX: a blank line splits to [], so runId[0] raised IndexError.
                    if not runId or not runId[0].startswith('analysis-'):
                        continue
                    analysis = dxpy.DXAnalysis(dxid=runId[0])
                    if analysis is None:
                        continue
                    state = analysis.describe()['state']
                    # states I have seen: in_progress, terminated, done, failed
                    if state not in ["done", "failed", "terminated"]:
                        msg = "Exiting: Can't launch because prior run [" + runId[
                            0] + "] "
                        if len(runId) > 1:
                            msg += "(" + runId[1] + ") "
                        msg += "has not finished (currently '" + state + "')."
                        print(msg)
                        sys.exit(1)
                    else:
                        msg = "  Prior run [" + runId[0] + "] "
                        if len(runId) > 1:
                            msg += "(" + runId[1] + ") "
                        msg += "is '" + state + "'."
                        print(msg)
Ejemplo n.º 3
0
def parse_map_report(folder, project):
    '''Scrapes Bismark mapping-report metrics out of the report in a DX folder.

    Finds the single *_bismark_map_report.txt under folder and returns a dict
    mapping each label in the module-level `labels` list to its reported value.
    Reading errors are logged and an empty/partial dict is returned (best-effort).
    '''
    mapreport = "/*_bismark_map_report.txt"
    report_link = dxencode.find_file(folder + mapreport,
                                     project.get_id(),
                                     recurse=False)

    metrics = {}
    # Pre-compile one "<label>: <value>" pattern per expected label.
    res = {}
    for lab in labels:
        res[lab] = re.compile(r"(%s):\s+(.+)" % lab)  # FIX: raw string for the regex escapes

    try:
        with dxpy.open_dxfile(report_link) as rfd:
            for line in rfd:
                for metric in res.values():
                    m = metric.match(line)
                    if m:
                        metrics[m.group(1)] = m.group(2).strip()
                        # FIX: was 'continue' (a no-op as the loop's last statement);
                        # a report line carries at most one label, so stop scanning it.
                        break
    except Exception as e:
        print("ERROR: Could not read Bismark mapping report in %s (%s) \n%s" % (
            folder, report_link, e))
    # FIX: the metrics were computed but never returned.
    return metrics
Ejemplo n.º 4
0
 def test_file_context_manager(self):
     """Leaving the write context must push the new file into closing/closed state."""
     with dxpy.new_dxfile(mode='w') as self.dxfile:
         file_id = self.dxfile.get_id()
         self.dxfile.write("Haha")
     # Re-open by id: the context-manager exit should have initiated the close.
     reopened = dxpy.open_dxfile(file_id)
     self.assertTrue(reopened._get_state() in ['closing', 'closed'])
     reopened._wait_on_close()
     # Four bytes were written, so the closed file reports size 4.
     self.assertEqual(reopened.describe()["size"], 4)
Ejemplo n.º 5
0
    def test_iter_dxfile(self):
        """Iterating a closed dxfile yields its lines back in write order."""
        with dxpy.new_dxfile() as self.dxfile:
            dxid = self.dxfile.get_id()
            self.dxfile.write("Line 1\nLine 2\nLine 3\n")

        with dxpy.open_dxfile(dxid) as readback:
            readback.wait_on_close()
            self.assertTrue(readback.closed())

            # Lines come back 1-based and in order.
            for lineno, line in enumerate(readback, 1):
                self.assertEqual(line, "Line " + str(lineno))
Ejemplo n.º 6
0
def logThisRun(runId,resultsFolder,projectId):
    '''Adds a runId to the runsLaunched file in resultsFolder.'''
    # NOTE: DX manual lies?!  Append not possible?!  Then write new/delete old
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    priorFid = dxencode.find_file(launchFilePath,projectId)
    logFh = dxpy.new_dxfile('a',project=projectId,folder=resultsFolder,name=RUNS_LAUNCHED_FILE)
    # The newest run goes first; prior entries are copied in below it.
    logFh.write(runId+' started:'+str(datetime.now())+'\n')
    if priorFid is not None:
        with dxpy.open_dxfile(priorFid) as priorFh:
            for priorEntry in priorFh:
                logFh.write(priorEntry+'\n')
        # The old log has been folded into the new file, so remove it.
        dxpy.DXProject(projectId).remove_objects([priorFid])
    logFh.close()
 def get_barcode_stats(self, barcode):
     """
     Loads the JSON in a ${barcode}_stats.json file in the DNAnexus project (usually in the qc
     folder).
     """
     stats_filename = barcode + "_stats.json"
     # In the call to dxpy.find_one_data_object() below, I'd normally set the
     # more_ok parameter to False, but this blows-up in Python 3.7 - giving me a RuntimeError.
     # So, I just won't set it for now. I think dxpy is still mainly a Python 2.7 library and
     # can break in later version of Python3.
     stats_fid = dxpy.find_one_data_object(zero_ok=False,
                                           project=self.dx_project_id,
                                           name=stats_filename)["id"]
     return json.loads(dxpy.open_dxfile(stats_fid).read())
Ejemplo n.º 8
0
def logThisRun(runId, resultsFolder, projectId):
    '''Adds a runId to the runsLaunched file in resultsFolder.'''
    # NOTE: DX manual lies?!  Append not possible?!  Then write new/delete old
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    previousFid = dxencode.find_file(launchFilePath, projectId)
    logFile = dxpy.new_dxfile('a',
                              project=projectId,
                              folder=resultsFolder,
                              name=RUNS_LAUNCHED_FILE)
    # Record the new run first, then carry forward the history.
    logFile.write(runId + ' started:' + str(datetime.now()) + '\n')
    if previousFid is not None:
        with dxpy.open_dxfile(previousFid) as previousFh:
            for priorEntry in previousFh:
                logFile.write(priorEntry + '\n')
        # All prior entries copied; delete the superseded log file.
        dxpy.DXProject(projectId).remove_objects([previousFid])
    logFile.close()
def get_fastqc(accession, project):
    '''Scrapes the "Total Sequences" count from a FastQC data report in a DX project.

    Looks up <accession>_summary.txt and <accession>_data.txt in the project and
    reads the data report for its "Total Sequences" line, storing it in `metrics`.
    On any read error, logs a message and records the sentinel value -999.999.

    NOTE(review): this snippet appears truncated -- `metrics` is built but never
    returned here, and `summary_link` is looked up but only used in the error
    message; the remainder of the function is presumably missing from this copy.
    '''
    summary_fn = accession+"_summary.txt"
    report_fn = accession+"_data.txt"

    summary_link = dxencode.find_file(summary_fn, project.get_id())
    report_link = dxencode.find_file(report_fn, project.get_id())

    metrics = {}
    try:
        with dxpy.open_dxfile(report_link) as rfd:
            # Only the "Total Sequences: <count>" line is harvested.
            total = re.compile('Total Sequences\s+(\d+)')
            for line in rfd:
                m = total.match(line)
                if m:
                    metrics.update({ 'Total Sequences': m.group(1) })
    except Exception, e:
        # Best-effort: an unreadable report is logged and marked with a sentinel.
        print "ERROR: Could not read FastQC summary: %s (%s) \n%s" % (summary_fn, summary_link, e)
        metrics.update({'Total Sequences': -999.999 })
 def get_run_details_json(self):
     """
     Retrieves the JSON object for the stats in the file named run_details.json in the project
     specified by self.dx_project_id.

     Returns:
         JSON object of the run details, or None when no run_details.json file exists in
         the project.
     """
     run_details_filename = "run_details.json"
     result = dxpy.find_one_data_object(
         more_ok=False,
         zero_ok=True,
         project=self.dx_project_id,
         name=run_details_filename)
     # FIX: zero_ok=True allows find_one_data_object() to return None, and the
     # original then crashed with TypeError on None["id"]. Treat a missing file
     # as "no run details available".
     if result is None:
         return None
     return json.loads(dxpy.open_dxfile(dxid=result["id"]).read())
Ejemplo n.º 11
0
    def test_write_read_dxfile(self):
        """Round-trip: data written to a new dxfile reads back intact and supports seek."""
        with dxpy.new_dxfile() as self.dxfile:
            dxid = self.dxfile.get_id()
            self.dxfile.write(self.foo_str)

        with dxpy.open_dxfile(dxid) as readback:
            readback.wait_on_close()
            self.assertTrue(readback.closed())

            # A sized read returns exactly the bytes that were written...
            self.assertEqual(self.foo_str, readback.read(len(self.foo_str)))
            # ...after which the cursor sits at EOF and read() is empty.
            self.assertEqual(len(readback.read()), 0)

            # seek() repositions the cursor for subsequent reads.
            readback.seek(1)
            self.assertEqual(self.foo_str[1:], readback.read())
    def get_sample_stats_json(self, barcode=None):
        """
        .. deprecated:: 0.1.0
           GSSC has removed the sample_stats.json file since the entire folder it was in has been 
           removed. Use :meth:`get_barcode_stats` instead. 
     
        Retrieves the JSON object for the stats in the file named sample_stats.json in the project 
        specified by self.dx_project_id.  This file is located in the DNAnexus folder stage\d_qc_report.
    
        Args:
            barcode: `str`. The barcode for the sample. Currently, the sample_stats.json file is of the 
                following form when there isn't a genome mapping: 
    
                [{"Sample name": "AGTTCC"}, {"Sample name": "CAGATC"}, {"Sample name": "GCCAAT"}, ...}]. 
    
                When there is a mapping, each dictionary has many more keys in addition to the "Sample name" one.
    
        Returns: 
            `list` of dicts if barcode=None, otherwise a dict for the given barcode.

        Raises:
            DnanexusBarcodeNotFound: A barcode was given but no entry for it exists.
        """
        sample_stats_json_filename = "sample_stats.json"
        sample_stats_json_id = dxpy.find_one_data_object(
            more_ok=False,
            zero_ok=False,
            project=self.dx_project_id,
            name=sample_stats_json_filename)["id"]
        json_data = json.loads(dxpy.open_dxfile(sample_stats_json_id).read())

        if not barcode:
            return json_data

        for d in json_data:  # d is a per-sample stats dictionary
            if d["Sample name"] == barcode:
                return d
        # FIX: the original wrapped this raise in 'if barcode:', which is always
        # true here (the barcode-less case returned above) -- raise unconditionally.
        raise DnanexusBarcodeNotFound(
            "Barcode {barcode} for {library_name} not found in {sample_stats_json_filename} in project {project}."
            .format(barcode=barcode,
                    library_name=self.library_name,
                    sample_stats_json_filename=sample_stats_json_filename,
                    project=self.dx_project_id))
Ejemplo n.º 13
0
def parse_map_report(folder, project):
    '''Scrapes Bismark mapping-report metrics out of the report in a DX folder.

    Finds the single *_bismark_map_report.txt under folder and returns a dict
    mapping each label in the module-level `labels` list to its reported value.
    Reading errors are logged and an empty/partial dict is returned (best-effort).
    '''
    mapreport = "/*_bismark_map_report.txt"
    report_link = dxencode.find_file(folder + mapreport, project.get_id(), recurse=False)

    metrics = {}
    # Pre-compile one "<label>: <value>" pattern per expected label.
    res = {}
    for lab in labels:
        res[lab] = re.compile(r"(%s):\s+(.+)" % lab)  # FIX: raw string for the regex escapes

    try:
        with dxpy.open_dxfile(report_link) as rfd:
            for line in rfd:
                for metric in res.values():
                    m = metric.match(line)
                    if m:
                        metrics[m.group(1)] = m.group(2).strip()
                        # FIX: was 'continue' (a no-op as the loop's last statement);
                        # a report line carries at most one label, so stop scanning it.
                        break
    except Exception as e:
        print("ERROR: Could not read Bismark mapping report in %s (%s) \n%s" % (folder, report_link, e))
    # FIX: the metrics were computed but never returned.
    return metrics
 def get_alignment_summary_metrics(self, barcode):
     """
     Parses the metrics in a ${barcode}alignment_summary_metrics file in the DNAnexus project
     (usually in the qc folder). This contains metrics produced by Picard Tools's
     CollectAlignmentSummaryMetrics program.
     """
     metrics_filename = barcode + ".alignment_summary_metrics"
     # In the call to dxpy.find_one_data_object() below, I'd normally set the
     # more_ok parameter to False, but this blows-up in Python 3.7 - giving me a RuntimeError.
     # So, I just won't set it for now. I think dxpy is still mainly a Python 2.7 library and
     # can break in later version of Python3.
     try:
         metrics_fid = dxpy.find_one_data_object(zero_ok=False,
                                                 project=self.dx_project_id,
                                                 name=metrics_filename)["id"]
     except dxpy.exceptions.DXSearchError:
         msg = "Picard alignment summary metrics for barcode {} in DX project {} not found.".format(
             barcode, self.dx_project_id)
         debug_logger.error(msg)
         raise DxMissingAlignmentSummaryMetrics(msg)
     # Wrap the file contents so the picard parser can consume a file-like object.
     report = StringIO(dxpy.open_dxfile(metrics_fid).read())
     return picard.CollectAlignmentSummaryMetrics(report).metrics
Ejemplo n.º 15
0
def main(**job_inputs):
    '''DNAnexus applet entry point: runs the parliament2 Docker image on a BAM/CRAM.

    Downloads the input BAM/CRAM (and optional index) plus the reference FASTA,
    builds the Docker command line from the boolean job inputs, runs it, and
    uploads the result files. Returns the job output dict of dxlinks.
    '''
    # Derive an output prefix from the input file name unless one was supplied.
    if 'prefix' not in job_inputs:
        bam_name = dxpy.describe(job_inputs['illumina_bam'])['name']
        if bam_name.endswith("cram"):
            prefix = bam_name[:-5]  # strip ".cram"
        else:
            prefix = bam_name[:-4]  # strip ".bam"
    else:
        prefix = job_inputs['prefix']

    # Running Docker image
    subprocess.check_call(
        ['mkdir', '-p', '/home/dnanexus/in', '/home/dnanexus/out'])

    print("Starting Docker")

    input_bam = dxpy.open_dxfile(job_inputs['illumina_bam'])
    bam_name = "/home/dnanexus/in/{0}".format(input_bam.name)
    dxpy.download_dxfile(input_bam, bam_name)

    ref_genome = dxpy.open_dxfile(job_inputs['ref_fasta'])
    ref_name = "/home/dnanexus/in/{0}".format(ref_genome.name)
    dxpy.download_dxfile(ref_genome, ref_name)

    docker_call = [
        'dx-docker', 'run', '-v', '/home/dnanexus/in/:/home/dnanexus/in/',
        '-v', '/home/dnanexus/out/:/home/dnanexus/out/', 'parliament2',
        '--bam', bam_name, '-r', ref_name, '--prefix',
        str(prefix)
    ]

    # The BAM index is optional; download and pass it through only if supplied.
    if 'illumina_bai' in job_inputs:
        input_bai = dxpy.open_dxfile(job_inputs['illumina_bai'])
        bai_name = "/home/dnanexus/in/{0}".format(input_bai.name)
        dxpy.download_dxfile(input_bai, bai_name)

        docker_call.extend(['--bai', bai_name])

    # FIX(idiom): the original repeated thirteen near-identical 'if' statements.
    # Table order matches the original chain so the command line is unchanged;
    # a missing key still raises KeyError, exactly as before.
    flag_options = [
        ('filter_short_contigs', '--filter_short_contigs'),
        ('run_breakdancer', '--breakdancer'),
        ('run_breakseq', '--breakseq'),
        ('run_manta', '--manta'),
        ('run_cnvnator', '--cnvnator'),
        ('run_lumpy', '--lumpy'),
        ('run_delly_inversion', '--delly_inversion'),
        ('run_delly_insertion', '--delly_insertion'),
        ('run_delly_deletion', '--delly_deletion'),
        ('run_delly_duplication', '--delly_duplication'),
        ('run_genotype_candidates', '--genotype'),
        ('run_svviz', '--svviz'),
        ('svviz_only_validated_candidates', '--svviz_only_validated_candidates'),
    ]
    for input_name, option in flag_options:
        if job_inputs[input_name]:
            docker_call.append(option)

    subprocess.check_call(docker_call)

    print("Docker image finished")

    # Upload every SV-caller result file and link it into the job output.
    sv_caller_results_upload = [
        dxpy.dxlink(dxpy.upload_local_file(name))
        for name in glob.glob('/home/dnanexus/out/sv_caller_results/*')
    ]
    output = {'sv_caller_results': sv_caller_results_upload}

    subprocess.check_call(['ls', '-sh', '/home/dnanexus/out/svtyped_vcfs/'])

    if job_inputs['output_log_files'] and os.listdir(
            '/home/dnanexus/out/log_files/'):
        output['log_files'] = [
            dxpy.dxlink(dxpy.upload_local_file(name))
            for name in glob.glob('/home/dnanexus/out/log_files/*')
        ]

    if job_inputs['run_genotype_candidates']:
        output['svtyped_vcfs'] = [
            dxpy.dxlink(dxpy.upload_local_file(name))
            for name in glob.glob('/home/dnanexus/out/svtyped_vcfs/*')
        ]
        output['combined_genotypes'] = dxpy.dxlink(
            dxpy.upload_local_file(
                '/home/dnanexus/out/{0}.combined.genotyped.vcf'.format(
                    prefix)))

    if job_inputs['run_svviz'] and os.path.isfile(
            '/home/dnanexus/out/{0}.svviz_outputs.tar.gz'.format(prefix)):
        output['svviz_outputs'] = dxpy.dxlink(
            dxpy.upload_local_file(
                '/home/dnanexus/out/{0}.svviz_outputs.tar.gz'.format(prefix)))

    return output
    report_link = dxencode.find_file(report_fn, project.get_id())

    metrics = {}
    try:
        with dxpy.open_dxfile(report_link) as rfd:
            total = re.compile('Total Sequences\s+(\d+)')
            for line in rfd:
                m = total.match(line)
                if m:
                    metrics.update({ 'Total Sequences': m.group(1) })
    except Exception, e:
        print "ERROR: Could not read FastQC summary: %s (%s) \n%s" % (summary_fn, summary_link, e)
        metrics.update({'Total Sequences': -999.999 })

    try:
        with dxpy.open_dxfile(summary_link) as sfd:
            fastqc = re.compile('(PASS|FAIL|WARN)\s+(.+)\s+ENCFF')
            for line in sfd:
                m = fastqc.match(line)
                if m:
                    metrics.update({ m.group(2):  m.group(1) })

    except Exception, e:
        print "ERROR: Could not read FastQC report: %s (%s) \n%s" % (report_fn, report_link, e)

    #print json.dumps(metrics)
    return metrics

def get_analysis_time(accession, repstr, project):

    result = list(dxpy.find_analyses(project=project, name='*'+accession+repstr+'*', name_mode='glob', describe=True, state='done'))