def checkRunsPreviouslyLaunched(resultsFolder, projectId):
    '''Checks for currently running jobs and will exit if found.'''
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    launchFids = dxencode.find_file(launchFilePath, projectId, multiple=True)
    if launchFids is None:
        print "  No prior jobs launched."
    else:
        # NOTE: Appending to the one file, but just in case handle multiple files.
        for fid in launchFids:
            with dxpy.open_dxfile(fid) as fd:
                for line in fd:
                    #print "Looking for job ["+line+"]"
                    runId = line.split(None, 1)
                    if not runId[0].startswith('analysis-'):
                        continue
                    analysis = dxpy.DXAnalysis(dxid=runId[0])
                    if analysis is None:
                        continue
                    state = analysis.describe()['state']
                    # States seen so far: in_progress, terminated, done, failed.
                    if state not in ["done", "failed", "terminated"]:
                        msg = "Exiting: Can't launch because prior run [" + runId[0] + "] "
                        if len(runId) > 1:
                            msg += "(" + runId[1] + ") "
                        msg += "has not finished (currently '" + state + "')."
                        print msg
                        sys.exit(1)
                    else:
                        msg = "  Prior run [" + runId[0] + "] "
                        if len(runId) > 1:
                            msg += "(" + runId[1] + ") "
                        msg += "is '" + state + "'."
                        print msg
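checkRunsPreviouslyLaunched only reads RUNS_LAUNCHED_FILE; the writer side is not shown in this section. A minimal sketch of what it assumes, namely one whitespace-separated "analysis-id description" line per launched run; record_run_launched is a hypothetical helper (not part of the original code) and the upload call is an assumption about how the file gets into the results folder:

def record_run_launched(resultsFolder, projectId, analysis, description=''):
    '''Hypothetical counterpart to checkRunsPreviouslyLaunched: write the
    launched analysis ID (plus an optional description) to RUNS_LAUNCHED_FILE
    so later invocations can refuse to double-launch.'''
    line = analysis.get_id()
    if description:
        line += ' ' + description
    # Assumption: a fresh file per launch; the real project may instead
    # append to an existing file, as the NOTE above suggests.
    dxpy.upload_string(line + '\n',
                       project=projectId,
                       folder=resultsFolder,
                       name=RUNS_LAUNCHED_FILE)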
def rerun_with_frip(analysis_id, dryrun, assay_type):
    logger.debug('rerun_with_frip: analysis_id %s' % (analysis_id))
    analysis = dxpy.DXAnalysis(analysis_id)
    project_id = analysis.describe()['project']
    temp = dxpy.api.workflow_new({
        'name': analysis.describe()['executableName'],
        'project': project_id,
        'initializeFrom': {'id': analysis.get_id()},
        'temporary': True})
    new_workflow = dxpy.DXWorkflow(temp['id'])
    logger.debug('rerun_with_frip: new_workflow %s %s'
                 % (new_workflow.get_id(), new_workflow.name))
    final_stage, new_input, new_applet = get_assay_specific_variables(
        analysis,
        assay_type)
    accessioning_stage = stage_named('Accession results', analysis)
    if accessioning_stage:
        new_workflow.remove_stage(accessioning_stage['id'])
    logger.debug('rerun_with_frip: new_applet %s %s'
                 % (new_applet.get_id(), new_applet.name))
    logger.debug('rerun_with_frip: new_input \n%s' % (pformat(new_input)))
    new_workflow.update_stage(
        final_stage['id'],
        executable=new_applet.get_id(),
        stage_input=new_input,
        force=True)
    m = re.match('ENCSR.{6}', analysis.name)
    accession = m.group(0)
    analysis_properties = analysis.describe()['properties']
    analysis_properties.update({
        'experiment_accession': accession,
        'original_analysis': analysis_id})
    if dryrun:
        logger.debug('rerun_with_frip: workflow created but dryrun so no analysis run')
        return new_workflow
    else:
        logger.debug('rerun_with_frip: running workflow')
        return new_workflow.run(
            {},
            project=project_id,
            name="%s frip" % (analysis.name),
            properties=analysis_properties)
def rerun_with_applet(analysis_id, stage_name, applet_name, folder=None):
    logger.debug('rerun_with_applet: analysis_id %s new_applet_name %s'
                 % (analysis_id, applet_name))
    analysis = dxpy.DXAnalysis(analysis_id)
    old_workflow_description = analysis.describe().get('workflow')
    old_workflow = dxpy.DXWorkflow(old_workflow_description['id'])
    project_id = analysis.describe()['project']
    temp = dxpy.api.workflow_new({
        'name': analysis.describe()['executableName'],
        'project': project_id,
        'initializeFrom': {'id': analysis.get_id()},
        'properties': old_workflow.get_properties(),
        'temporary': True})
    new_workflow = dxpy.DXWorkflow(temp['id'])
    logger.debug('rerun_with_applet: new_workflow %s %s'
                 % (new_workflow.get_id(), new_workflow.name))
    old_stage = stage_named(stage_name, analysis)
    accessioning_stage = stage_named('Accession results', analysis)
    if accessioning_stage:
        new_workflow.remove_stage(accessioning_stage['id'])
    new_applet = find_applet_by_name(applet_name)
    logger.debug('rerun_with_applet: new_applet %s %s'
                 % (new_applet.get_id(), new_applet.name))
    same_input = old_stage['execution']['input']
    logger.debug('rerun_with_applet: same_input \n%s' % (pformat(same_input)))
    new_workflow.update_stage(
        old_stage['id'],
        executable=new_applet.get_id(),
        stage_input=same_input,
        force=True)
    m = re.match('ENCSR.{6}', analysis.name)
    accession = m.group(0)
    analysis_properties = analysis.describe()['properties']
    analysis_properties.update({
        'experiment_accession': accession,
        'original_analysis': analysis_id})
    logger.debug('rerun_with_applet: running workflow')
    runargs = {
        # 'executable_input': {},
        'project': project_id,
        'name': "%s %s" % (analysis.name, new_applet.name),
        'properties': analysis_properties}
    if folder is not None:
        runargs.update({'folder': folder})
    logger.debug("running new_workflow with args: \n%s" % (pformat(runargs)))
    return new_workflow.run({}, **runargs)
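Both rerun helpers return the handle of the newly launched analysis (or, for rerun_with_frip with dryrun, the temporary workflow). A hypothetical invocation of rerun_with_applet; the analysis ID and applet name below are placeholders, not values from the original project:

# Hypothetical usage: swap one stage for a patched applet and rerun the
# pipeline into a scratch folder. IDs and names are illustrative only.
new_analysis = rerun_with_applet('analysis-xxxx',
                                 'Final IDR peak calls',
                                 'encode_idr_patched',
                                 folder='/reruns')
print new_analysis.get_id()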
def main():
    args = get_args()
    if args.debug:
        logging.basicConfig(format='%(levelname)s:%(message)s',
                            level=logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    else:
        # Use the default logging level.
        logging.basicConfig(format='%(levelname)s:%(message)s')
        logger.setLevel(logging.INFO)
    if args.released:
        keypair = None
        server = PUBLIC_SERVER
    else:
        authid, authpw, server = common.processkey(args.key, args.keyfile)
        keypair = (authid, authpw)
    if args.experiments:
        ids = args.experiments
    elif args.all:
        # Get metadata for all ChIP-seq Experiments.
        base_exp_query = '/search/?type=Experiment&assay_title=ChIP-seq&award.project=ENCODE&status=released'
        extended_query = '&status=submitted&status=in+progress&status=started&status=release+ready'
        exp_query = base_exp_query if args.released else (base_exp_query + extended_query)
        all_experiments = common.encoded_get(server + exp_query,
                                             keypair)['@graph']
        # Extract Experiment accessions.
        ids = [exp.get('accession') for exp in all_experiments]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError('Must supply experiment ids'
                         ' in arguments or --infile.')
    # Define column names for TSV.
    fieldnames = [
        'date', 'analysis', 'analysis_id', 'experiment', 'target',
        'biosample_term_name', 'biosample_type', 'replication', 'lab', 'rfa',
        'assembly', 'Nt', 'Np', 'N1', 'N2', 'rescue_ratio',
        'self_consistency_ratio', 'reproducibility_test', 'Ft', 'Fp', 'F1',
        'F2', 'state', 'release', 'total_price', 'quality_metric_of'
    ]
    if args.create_google_sheet:
        # Force creation of temporary CSV that can be loaded into a DataFrame,
        # written to Google Sheets, then deleted.
        temp_file = 'temp_idr_%s.tsv' % (args.assembly)
        args.outfile = open(temp_file, 'w')
    writer = csv.DictWriter(args.outfile,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()
    # Get metadata for all IDR output Files.
    base_idr_query = ('/search/?type=File&assembly=%s&file_format=bed'
                      '&output_type=optimal+idr+thresholded+peaks'
                      '&output_type=conservative+idr+thresholded+peaks'
                      '&output_type=pseudoreplicated+idr+thresholded+peaks'
                      '&lab.title=ENCODE+Processing+Pipeline'
                      '&lab.title=J.+Michael+Cherry,+Stanford'
                      '&status=released' % (args.assembly))
    extended_idr_query = '&status=in+progress&status=uploading&status=uploaded'
    idr_query = base_idr_query if args.released else (base_idr_query + extended_idr_query)
    all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph']
    na = 'not_available'
    for (i, experiment_id) in enumerate(ids):
        if experiment_id.startswith('#'):
            continue
        experiment_id = experiment_id.rstrip()
        experiment_uri = '/experiments/%s/' % (experiment_id)
        idr_files = \
            [f for f in all_idr_files if f['dataset'] == experiment_uri]
        idr_step_runs = set([f.get('step_run') for f in idr_files])
        if not len(idr_step_runs):
            if not args.all:
                logger.warning("%s: Found %d IDR step runs. Skipping"
                               % (experiment_id, len(idr_step_runs)))
            continue
        idr_qc_uris = []
        assemblies = []
        for f in idr_files:
            quality_metrics = f.get('quality_metrics')
            if not len(quality_metrics) == 1:
                logger.error('%s: Expected one IDR quality metric for file %s.'
                             ' Found %d.' % (experiment_id,
                                             f.get('accession'),
                                             len(quality_metrics)))
            idr_qc_uris.extend(quality_metrics)
            assembly = f.get('assembly')
            if not assembly:
                logger.error('%s: File %s has no assembly'
                             % (experiment_id, f.get('accession')))
            assemblies.append(assembly)
        idr_qc_uris = set(idr_qc_uris)
        if not len(idr_qc_uris) == 1:
            logger.error('%s: Expected one unique IDR metric, found %d.'
                         ' Skipping.' % (experiment_id, len(idr_qc_uris)))
            continue
        assemblies = set(assemblies)
        if not len(assemblies) == 1:
            logger.error('%s: Expected one unique assembly, found %d.'
                         ' Skipping.' % (experiment_id, len(assemblies)))
            continue
        # Grab unique value from set.
        idr_qc_uri = next(iter(idr_qc_uris))
        assembly = next(iter(assemblies))
        # Get analysis_id from DNAnexus, create analysis_link.
        idr_step_run_uri = next(iter(idr_step_runs))
        try:
            idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair)
        except Exception as e:
            print(experiment_id, e, 'Skipping.')
            continue
        try:
            dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get('dx_job_id')
        except:
            logger.warning(
                "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id")
            logger.debug(idr_step_run)
            # Could try to pull it from alias.
            dx_job_id_str = None
        dx_job_id = dx_job_id_str.rpartition(':')[2]
        if not args.released:
            dx_job = dxpy.DXJob(dx_job_id)
            job_desc = dx_job.describe()
            analysis_id = job_desc.get('analysis')
            logger.debug('%s' % (analysis_id))
            analysis = dxpy.DXAnalysis(analysis_id)
            desc = analysis.describe()
            project = desc.get('project')
            analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
                desc.get('project').split('-')[1],
                desc.get('id').split('-')[1])
        else:
            analysis_link = na
            desc = {}
        # Get IDR object.
        idr = common.encoded_get(server + idr_qc_uri, keypair)
        # Pull metrics of interest.
        idr_status = idr.get('status', na)
        if (args.released and (idr_status == na or idr_status != 'released')):
            logger.error('%s: Expected released IDR metric. Skipping.' % idr_qc_uris)
            continue
        Np = idr.get('Np', na)
        N1 = idr.get('N1', na)
        N2 = idr.get('N2', na)
        Nt = idr.get('Nt', na)
        Fp = idr.get('Fp', na)
        F1 = idr.get('F1', na)
        F2 = idr.get('F2', na)
        Ft = idr.get('Ft', na)
        quality_metric_of = idr.get('quality_metric_of', [])
        date = idr.get('date_created', na)
        rescue_ratio = idr.get('rescue_ratio', na)
        self_consistency_ratio = idr.get('self_consistency_ratio', na)
        reproducibility_test = idr.get('reproducibility_test', na)
        # Get Experiment object.
        experiment = common.encoded_get(server + experiment_id, keypair)
        experiment_link = '%sexperiments/%s' % (server,
                                                experiment.get('accession'))
        # Get Award object.
        award = common.encoded_get(server + experiment.get('award'), keypair)
        # Grab project phase, e.g. ENCODE4.
        rfa = award.get('rfa', na)
        row = {
            'date': date,
            'analysis': analysis_link,
            'analysis_id': desc.get('id', na),
            'experiment': experiment_link,
            'target': experiment['target'].split('/')[2],
            'biosample_term_name': experiment.get('biosample_term_name'),
            'biosample_type': experiment.get('biosample_type'),
            'replication': experiment.get('replication_type'),
            'lab': experiment['lab'].split('/')[2],
            'rfa': rfa,
            'assembly': assembly,
            'Nt': Nt,
            'Np': Np,
            'N1': N1,
            'N2': N2,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'Ft': Ft,
            'Fp': Fp,
            'F1': F1,
            'F2': F2,
            'state': desc.get('state', na),
            'release': experiment['status'],
            'total_price': desc.get('totalPrice', na),
            'quality_metric_of': ', '.join(quality_metric_of)
        }
        writer.writerow(row)
    if args.create_google_sheet:
        args.outfile.close()
        # Load CSV data, sort.
        idr_data = pd.read_table(temp_file)
        idr_data = idr_data.replace('not_available', '')
        idr_data.date = idr_data.date.apply(lambda x: pd.to_datetime(x))
        idr_data = idr_data.sort_values(
            by=['lab', 'biosample_term_name', 'target', 'experiment'],
            ascending=[True, True, True, True])
        idr_data.date = idr_data.date.astype('str')
        idr_data = idr_data.reset_index(drop=True)
        # Read sheet title and create unique page title.
        date = datetime.now().strftime('%m_%d_%Y')
        sheet_title = (args.sheet_title if not args.released
                       else '{} Released'.format(args.sheet_title))
        page_title = '%s_IDR_FRIP_%s' % (args.assembly, date)
        # Open/create Google Sheet.
        gc = pygsheets.authorize(args.apikey)
        try:
            sh = gc.open(sheet_title)
        except pygsheets.exceptions.SpreadsheetNotFound:
            sh = gc.create(sheet_title)
        try:
            wks = sh.add_worksheet(page_title)
        except HttpError:
            wks = sh.worksheet_by_title(page_title)
        # Clear worksheet.
        wks.clear()
        # Add data from DataFrame.
        wks.set_dataframe(idr_data, copy_head=True, fit=True, start='A1')
        # Apply formatting and conditions.
        header['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, header)
        # Format numbers.
        for col in number_format_columns:
            num = idr_data.columns.get_loc(col)
            number_format['repeatCell']['range']['startColumnIndex'] = num
            number_format['repeatCell']['range']['endColumnIndex'] = num + 1
            number_format['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, number_format)
        # Resize font.
        font_size_format['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, font_size_format)
        # Add conditional formatting.
        for conditional in conditions:
            num = idr_data.columns.get_loc("reproducibility_test")
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['startColumnIndex'] = num
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['endColumnIndex'] = num + 1
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, conditional)
        # Add notes to column headers.
        for k, v in notes_dict.items():
            num = idr_data.columns.get_loc(k)
            note['repeatCell']['range']['startColumnIndex'] = num
            note['repeatCell']['range']['endColumnIndex'] = num + 1
            note['repeatCell']['cell']['note'] = v
            note['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, note)
        # Optional. Smaller column width to match original.
        for i in range(wks.cols):
            wks.adjust_column_width(i, pixel_size=38)
        # Resize tiny columns.
        tiny_columns = ['experiment', 'analysis']
        for i in [idr_data.columns.get_loc(x) for x in tiny_columns]:
            wks.adjust_column_width(i, pixel_size=25)
        # Resize medium columns.
        medium_columns = ['replication', 'assembly', 'rfa']
        for i in [idr_data.columns.get_loc(x) for x in medium_columns]:
            wks.adjust_column_width(i, pixel_size=65)
        # Resize wide columns.
        wide_columns = ['target', 'reproducibility_test', 'lab']
        for i in [idr_data.columns.get_loc(x) for x in wide_columns]:
            wks.adjust_column_width(i, pixel_size=85)
        # Remove temp file.
        os.remove(temp_file)
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("Logging level set to DEBUG")
    else:
        logger.setLevel(logging.INFO)

    if args.analysis_ids:
        ids = [i for i in args.analysis_ids if not i.startswith('#')]
    elif args.infile:
        ids = [i for i in args.infile if not i.startswith('#')]
    else:
        # Never reached because infile defaults to stdin.
        raise InputError("Must supply analysis id's in arguments or --infile")

    if not args.name:
        if len(ids) > 1:
            job_name = "batch_%s" % (timestring)
        else:
            analysis = dxpy.DXAnalysis(ids[0])
            job_name = "Accession %s" % (analysis.name)
    else:
        job_name = args.name

    # Build the dx run command for the accessioning applet token by token.
    tokens = [
        'dx run %s' % (ACCESSION_ANALYSIS_APPLET),
        '-i "outfn=%s"' % (args.outfile),
        '--destination "%s"' % (args.destination),
        '--name "%s"' % (job_name),
        '--yes'
    ]
    if args.watch:
        tokens.append('--watch')
    if args.project is not None:
        tokens.append('-i "project=%s"' % (args.project))
    if args.pipeline is not None:
        tokens.append('-i "pipeline=%s"' % (args.pipeline))
    if args.key is not None:
        tokens.append('-i "key=%s"' % (args.key))
    # if args.keyfile is not None:
    #     tokens.append('-i "keyfile=%s"' % (args.keyfile))
    if args.debug is not None:
        tokens.append('-i "debug=%s"' % (args.debug))
    if args.dryrun is not None:
        tokens.append('-i "dryrun=%s"' % (args.dryrun))
    if args.force_patch is not None:
        tokens.append('-i "force_patch=%s"' % (args.force_patch))
    if args.force_upload is not None:
        tokens.append('-i "force_upload=%s"' % (args.force_upload))
    if args.use_content_md5sum is not None:
        tokens.append('-i "use_content_md5sum=%s"' % (args.use_content_md5sum))
    if args.fqcheck is not None:
        tokens.append('-i "fqcheck=%s"' % (args.fqcheck))
    if args.accession_raw is not None:
        tokens.append('-i "accession_raw=%s"' % (args.accession_raw))
    if args.signal_only is not None:
        tokens.append('-i "signal_only=%s"' % (args.signal_only))
    if args.skip_control is not None:
        tokens.append('-i "skip_control=%s"' % (args.skip_control))
    if args.encoded_check is not None:
        tokens.append('-i "encoded_check=%s"' % (args.encoded_check))

    # Pass every analysis ID as a separate array input.
    for (i, analysis_id) in enumerate(ids):
        if analysis_id.startswith('#'):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug('%s' % (analysis_id))
        tokens.append('-i "analysis_ids=%s"' % (analysis_id))

    command_string = ' '.join(tokens)
    logger.debug(command_string)
    subprocess.check_call(shlex.split(command_string))
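For reference, a rough sketch of the command the loop above assembles; the applet path, output file name, and analysis ID are illustrative placeholders, and the exact -i inputs depend on which optional flags were supplied:

# Example of the resulting command (values are placeholders only):
#   dx run /applets/accession_analysis -i "outfn=accession_log.tsv" \
#       --destination "/accession" --name "Accession ENCSR123ABC Peaks" --yes \
#       -i "debug=False" -i "dryrun=False" \
#       -i "analysis_ids=analysis-xxxx"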
def main(token):
    # Configure dxpy authentication.
    dxpy.set_security_context({
        'auth_token_type': 'Bearer',
        'auth_token': token})

    # Resolve FACTORY_PROJECT by ID.
    proj = dxpy.DXProject(FACTORY_PROJECT)
    print 'Resolved project:', proj.describe()['name'], proj.get_id()

    # Set FACTORY_PROJECT as the workspace for subsequent operations
    # (sort of like the current working directory).
    dxpy.set_workspace_id(FACTORY_PROJECT)

    # Resolve the workflow by name. (Could also store the ID, like the project.)
    wf = list(dxpy.search.find_data_objects(classname="workflow",
                                            name="RNA-seq pipeline",
                                            return_handler=True))[0]
    print 'Resolved workflow:', wf.describe()['name'], wf.get_id()

    # TODO: Stage the inputs. Here we find them in the IN folder.
    left_reads = list(dxpy.search.find_data_objects(classname="file",
                                                    name="ENCFF001JPX.1k.fastq.gz",
                                                    folder="/IN",
                                                    return_handler=True))[0]
    print 'Resolved left reads:', left_reads.describe()['name'], left_reads.get_id()
    right_reads = list(dxpy.search.find_data_objects(classname="file",
                                                     name="ENCFF001JQB.1k.fastq.gz",
                                                     folder="/IN",
                                                     return_handler=True))[0]
    print 'Resolved right reads:', right_reads.describe()['name'], right_reads.get_id()

    # Launch the workflow.
    analysis = wf.run({
        '0.fastqs': [dxpy.dxlink(left_reads.get_id())],
        '0.fastq_pairs': [dxpy.dxlink(right_reads.get_id())]})
    print 'Launched analysis:', analysis.get_id()
    print 'Analysis state:', analysis.describe()['state']

    # TODO: Poll for (or come back when) analysis state 'done' or 'failed'.
    # Handle any failures.

    # Cooking-show-style substitution with a completed analysis.
    analysis = dxpy.DXAnalysis(COMPLETED_ANALYSIS)
    print 'Analysis state:', analysis.describe()['state']

    # Enumerate outputs.
    print 'Analysis outputs:'
    for one_output_name, one_output_link in analysis.describe()['output'].iteritems():
        one_output = dxpy.get_handler(one_output_link)  # one_output : dxpy.DXFile
        one_file_name = one_output.describe()['name']
        one_file_url, _ = one_output.get_download_url(preauthenticated=True,
                                                      filename=one_file_name)
        print one_file_name, one_file_url
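The polling TODO in main(token) is left open above. A minimal sketch, assuming blocking in-process is acceptable; wait_for_analysis is a hypothetical helper (not part of the original script), while DXAnalysis.wait_on_done and DXJobFailureError are standard dxpy calls:

def wait_for_analysis(analysis):
    '''Sketch of the polling step the TODO above leaves open: block until the
    analysis reaches a terminal state and report failures.'''
    try:
        # wait_on_done() polls the platform until the analysis is 'done' and
        # raises DXJobFailureError if it fails or is terminated.
        analysis.wait_on_done()
    except dxpy.exceptions.DXJobFailureError as e:
        print 'Analysis failed:', analysis.get_id(), str(e)
        return False
    print 'Analysis done:', analysis.get_id()
    return True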
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    elif args.created_after:
        analyses = []
        for state in args.state:
            analyses.extend(dxpy.find_analyses(name="ENCSR*",
                                               name_mode='glob',
                                               state=state,
                                               include_subjobs=True,
                                               return_handler=True,
                                               created_after="%s" % (args.created_after)))
        ids = [analysis.get_id() for analysis in analyses
               if analysis.describe()['executableName'] == 'tf_chip_seq'
               or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError("Must supply analysis id's in arguments, --infile,"
                         " or a search string in --created_after")

    fieldnames = [
        'date', 'analysis', 'experiment', 'target', 'biosample_term_name',
        'biosample_type', 'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2',
        'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test',
        'state', 'total price', 'notes'
    ]
    writer = csv.DictWriter(sys.stdout,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()

    for (i, analysis_id) in enumerate(ids):
        if analysis_id.startswith('#'):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug('%s' % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get('project')

        m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name'])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue

        experiment = common.encoded_get(
            urlparse.urljoin(server, '/experiments/%s' % (experiment_accession)),
            keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        if args.lab and experiment['lab'].split('/')[2] not in args.lab:
            continue

        try:
            idr_stage = next(s['execution'] for s in desc['stages']
                             if s['execution']['name'] == "Final IDR peak calls")
        except:
            logging.error('Failed to find final IDR stage in %s' % (analysis_id))
        else:
            if idr_stage['state'] != 'done':
                # Final IDR peak calls stage not done, so loop through
                # intermediate IDR stages to find errors.
                Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
                notes = []
                # NOTE: this list contains a mis-spelled form of "IDR Pooled
                # Pseudoreplicates" because until 11/13/15 the pipeline stage
                # name was misspelled; we need to be able to report on those runs.
                idr_stage_names = [
                    'IDR True Replicates',
                    'IDR Rep 1 Self-pseudoreplicates',
                    'IDR Rep 2 Self-pseudoreplicates',
                    'IDR Pooled Pseudoreplicates',
                    'IDR Pooled Pseudoeplicates'
                ]
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(s['execution'] for s in desc['stages']
                                         if s['execution']['name'] == stage_name)
                    except StopIteration:
                        continue
                    except:
                        raise
                    if idr_stage['state'] == 'failed':
                        try:
                            job_log = subprocess.check_output('dx watch %s' % (idr_stage['id']),
                                                              shell=True,
                                                              stderr=subprocess.STDOUT)
                        except subprocess.CalledProcessError as e:
                            job_log = e.output
                        else:
                            job_log = None
                        if job_log:
                            patterns = [r'Peak files must contain at least 20 peaks post-merge']
                            for p in patterns:
                                m = re.search(p, job_log)
                                if m:
                                    notes.append("%s: %s" % (stage_name, m.group(0)))
                        if not notes:
                            notes.append(idr_stage['failureMessage'])
                try:
                    done_time = next(transition['setAt']
                                     for transition in desc['stateTransitions']
                                     if transition['newState'] == "failed")
                except StopIteration:
                    done_time = "Not done or failed"
                except:
                    raise
            else:
                Np = idr_stage['output'].get('Np')
                N1 = idr_stage['output'].get('N1')
                N2 = idr_stage['output'].get('N2')
                Nt = idr_stage['output'].get('Nt')
                rescue_ratio = idr_stage['output'].get('rescue_ratio')
                self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio')
                reproducibility_test = idr_stage['output'].get('reproducibility_test')
                notes = "IDR Complete"
                done_time = next(transition['setAt']
                                 for transition in desc['stateTransitions']
                                 if transition['newState'] == "done")

        if done_time:
            date = time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.localtime(done_time / 1000))
        else:
            date = "Running"
        analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
            desc.get('project').split('-')[1],
            desc.get('id').split('-')[1])
        experiment_link = 'https://www.encodeproject.org/experiments/%s' % (
            experiment.get('accession'))
        row = {
            'date': date,
            'analysis': analysis_link,
            'experiment': experiment_link,
            'target': experiment['target'].split('/')[2],
            'biosample_term_name': experiment.get('biosample_term_name'),
            'biosample_type': experiment.get('biosample_type'),
            'lab': experiment['lab'].split('/')[2],
            'rfa': common.encoded_get(server + experiment.get('award'),
                                      keypair).get('rfa'),
            'assembly': args.assembly,  # TODO: derive this from the analysis
            'Np': Np,
            'N1': N1,
            'N2': N2,
            'Nt': Nt,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'state': desc.get('state'),
            'total price': desc.get('totalPrice')
        }
        if notes:
            row.update({'notes': '%s' % (notes)})
        else:
            row.update({'notes': '%s' % ('OK')})
        # log = subprocess.check_output('dx watch %s' % (analysis_id))
        writer.writerow(row)
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        ids = args.experiments
    # elif args.created_after:
    #     analyses = []
    #     for state in args.state:
    #         analyses.extend(dxpy.find_analyses(name="ENCSR*", name_mode='glob', state=state, include_subjobs=True, return_handler=True, created_after="%s" % (args.created_after)))
    #     ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')]
    elif args.all:
        exp_query = \
            "/search/?type=Experiment" + \
            "&assay_title=ChIP-seq" + \
            "&award.project=ENCODE" + \
            "&status=released&status=submitted&status=in+progress&status=started&status=release+ready"
        all_experiments = common.encoded_get(server + exp_query, keypair)['@graph']
        ids = [exp.get('accession') for exp in all_experiments]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError("Must supply experiment id's in arguments or --infile")

    fieldnames = [
        'date', 'analysis', 'analysis id', 'experiment', 'target',
        'biosample_term_name', 'biosample_type', 'lab', 'rfa', 'assembly',
        'Nt', 'Np', 'N1', 'N2', 'rescue_ratio', 'self_consistency_ratio',
        'reproducibility_test', 'state', 'release', 'total price', 'notes'
    ]
    writer = csv.DictWriter(sys.stdout,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()

    idr_query = \
        "/search/?type=File" + \
        "&file_format=bed" + \
        "&output_type=optimal+idr+thresholded+peaks" + \
        "&output_type=conservative+idr+thresholded+peaks" + \
        "&lab.title=ENCODE+Processing+Pipeline" + \
        "&lab.title=J.+Michael+Cherry,+Stanford" + \
        "&status=in+progress&status=released&status=uploading&status=uploaded"
    all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph']

    for (i, experiment_id) in enumerate(ids):
        if experiment_id.startswith('#'):
            continue
        experiment_id = experiment_id.rstrip()
        experiment_uri = '/experiments/%s/' % (experiment_id)
        idr_files = \
            [f for f in all_idr_files if f['dataset'] == experiment_uri]
        idr_step_runs = set([f.get('step_run') for f in idr_files])
        if not len(idr_step_runs) == 1:
            if not args.all:
                logger.warning("%s: Expected one IDR step run. Found %d. Skipping"
                               % (experiment_id, len(idr_step_runs)))
            continue

        idr_qc_uris = []
        assemblies = []
        for f in idr_files:
            quality_metrics = f.get('quality_metrics')
            if not len(quality_metrics) == 1:
                logger.error('%s: Expected one IDR quality metric for file %s. Found %d.'
                             % (experiment_id,
                                f.get('accession'),
                                len(quality_metrics)))
            idr_qc_uris.extend(quality_metrics)
            assembly = f.get('assembly')
            if not assembly:
                logger.error('%s: File %s has no assembly'
                             % (experiment_id, f.get('accession')))
            assemblies.append(assembly)
        idr_qc_uris = set(idr_qc_uris)
        if not len(idr_qc_uris) == 1:
            logger.error('%s: Expected one unique IDR metric, found %d. Skipping.'
                         % (experiment_id, len(idr_qc_uris)))
            continue
        assemblies = set(assemblies)
        if not len(assemblies) == 1:
            logger.error('%s: Expected one unique assembly, found %d. Skipping.'
                         % (experiment_id, len(assemblies)))
            continue
        assembly = next(iter(assemblies))

        idr_step_run_uri = next(iter(idr_step_runs))
        idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair)
        try:
            dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get('dx_job_id')
        except:
            logger.warning(
                "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id")
            logger.debug(idr_step_run)
            dx_job_id_str = None  # could try to pull it from alias
        dx_job_id = dx_job_id_str.rpartition(':')[2]
        dx_job = dxpy.DXJob(dx_job_id)
        job_desc = dx_job.describe()
        analysis_id = job_desc.get('analysis')
        logger.debug('%s' % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get('project')

        m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name'])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue

        if args.all:
            # We've already gotten all the experiment objects.
            experiment = \
                next(e for e in all_experiments
                     if e['accession'] == experiment_accession)
        else:
            experiment = \
                common.encoded_get(urlparse.urljoin(
                    server,
                    '/experiments/%s' % (experiment_accession)), keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        if args.lab and experiment['lab'].split('/')[2] not in args.lab:
            continue

        try:
            idr_stage = next(s['execution'] for s in desc['stages']
                             if s['execution']['name'] == "Final IDR peak calls")
        except:
            logging.error('Failed to find final IDR stage in %s' % (analysis_id))
        else:
            if idr_stage['state'] != 'done':
                # Final IDR peak calls stage not done, so loop through
                # intermediate IDR stages to find errors.
                Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
                notes = []
                # NOTE: this list contains a mis-spelled form of "IDR Pooled
                # Pseudoreplicates" because until 11/13/15 the pipeline stage
                # name was misspelled; we need to be able to report on those runs.
                idr_stage_names = [
                    'IDR True Replicates',
                    'IDR Rep 1 Self-pseudoreplicates',
                    'IDR Rep 2 Self-pseudoreplicates',
                    'IDR Pooled Pseudoreplicates',
                    'IDR Pooled Pseudoeplicates'
                ]
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(s['execution'] for s in desc['stages']
                                         if s['execution']['name'] == stage_name)
                    except StopIteration:
                        continue
                    except:
                        raise
                    if idr_stage['state'] == 'failed':
                        try:
                            job_log = subprocess.check_output('dx watch %s' % (idr_stage['id']),
                                                              shell=True,
                                                              stderr=subprocess.STDOUT)
                        except subprocess.CalledProcessError as e:
                            job_log = e.output
                        else:
                            job_log = None
                        if job_log:
                            patterns = [r'Peak files must contain at least 20 peaks post-merge']
                            for p in patterns:
                                m = re.search(p, job_log)
                                if m:
                                    notes.append("%s: %s" % (stage_name, m.group(0)))
                        if not notes:
                            notes.append(idr_stage['failureMessage'])
                try:
                    done_time = next(transition['setAt']
                                     for transition in desc['stateTransitions']
                                     if transition['newState'] == "failed")
                except StopIteration:
                    done_time = "Not done or failed"
                except:
                    raise
            else:
                Np = idr_stage['output'].get('Np')
                N1 = idr_stage['output'].get('N1')
                N2 = idr_stage['output'].get('N2')
                Nt = idr_stage['output'].get('Nt')
                rescue_ratio = idr_stage['output'].get('rescue_ratio')
                self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio')
                reproducibility_test = idr_stage['output'].get('reproducibility_test')
                notes = "IDR Complete"
                done_time = next(transition['setAt']
                                 for transition in desc['stateTransitions']
                                 if transition['newState'] == "done")

        if done_time:
            date = time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.localtime(done_time / 1000))
        else:
            date = "Running"
        analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
            desc.get('project').split('-')[1],
            desc.get('id').split('-')[1])
        experiment_link = 'https://www.encodeproject.org/experiments/%s' % (
            experiment.get('accession'))
        row = {
            'date': date,
            'analysis': analysis_link,
            'analysis id': desc.get('id'),
            'experiment': experiment_link,
            'target': experiment['target'].split('/')[2],
            'biosample_term_name': experiment.get('biosample_term_name'),
            'biosample_type': experiment.get('biosample_type'),
            'lab': experiment['lab'].split('/')[2],
            'rfa': common.encoded_get(server + experiment.get('award'),
                                      keypair).get('rfa'),
            'assembly': assembly,
            'Np': Np,
            'N1': N1,
            'N2': N2,
            'Nt': Nt,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'state': desc.get('state'),
            'release': experiment['status'],
            'total price': desc.get('totalPrice')
        }
        if notes:
            row.update({'notes': '%s' % (notes)})
        else:
            row.update({'notes': '%s' % ('OK')})
        # log = subprocess.check_output('dx watch %s' % (analysis_id))
        writer.writerow(row)