Ejemplo n.º 1
0
def main():
    '''Launch the 'fastqc-exp' applet once per qualifying experiment.

    Reads the command-line options from get_args(), resolves the project
    named by PROJECT_NAME, fetches the experiments matching ASSAY_TERM_ID
    through dxencode.encoded_get(), and runs the applet with each accepted
    experiment accession as input.

    An experiment is skipped (with a message) when it has no replicates, when
    the library of its first replicate has a 'size_range' other than '>200',
    or when that library is quantified in "cells" with fewer than 20 of them.
    Only the first replicate is inspected.
    '''
    cmnd = get_args()

    # Resolve the project that holds the applet and receives the runs.
    project = dxencode.resolve_project(PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    pid = project.get_id()

    applet = dxencode.find_applet_by_name('fastqc-exp', pid)
    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3&limit=all&frame=embedded&replicates.library.biosample.donor.organism.name=mouse&files.file_format=fastq' % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER+query, AUTHID=AUTHID, AUTHPW=AUTHPW)

    launched = 0
    for exp in res.json()['@graph']:
        acc = exp['accession']
        if len(exp['replicates']) == 0:
            print("Skipping %s (0 replicates)" % acc)
            continue

        # All filtering below is based on the first replicate's library only.
        library = exp['replicates'][0]['library']

        size_range = library.get('size_range', "")
        if size_range != '>200':
            print("Skipping %s with wrong library size (%s)" % (acc, size_range))
            continue

        units = library.get('nucleic_acid_starting_quantity_units', "")
        if units == "cells":
            ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
            if ncells < 20:
                print("Skipping %s as single-cell (%s %s)" % (acc, units, ncells))
                continue

        run = applet.run({"accession": acc}, project=pid)
        print("Running: %s for %s" % (run, acc))
        launched += 1
        # NOTE(review): the counter is compared with '>' after it has been
        # incremented, so up to cmnd.number + 1 runs are started -- confirm
        # whether that is intended.
        if launched > cmnd.number:
            break
Ejemplo n.º 2
0
def createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId, appProjectId=None):
    '''Populate a workflow with one stage per step in stepsToDo, then run it.

    Arguments:
        stepsToDo     -- ordered step names; each must be a key of the
                         module-level STEPS table.
        priors        -- maps a file token to a file id, or to a list of file
                         ids, for inputs that already exist.
        extras        -- must hold 'name' and 'description' for the workflow,
                         plus a value for every key of a step's 'params'.
        resultsFolder -- folder given to the workflow and to every stage.
        projectId     -- project in which the workflow is created.
        appProjectId  -- project searched for the applets; defaults to
                         projectId when None.

    Returns None when stepsToDo is empty (or None); otherwise the describe()
    result of the object returned by running the workflow.

    Prints an error and exits with status 1 (sys.exit) when a step input is
    found neither among earlier stage results nor in priors, or when a step
    parameter is missing from extras.
    '''

    if not stepsToDo:
        return None
    if appProjectId is None:
        appProjectId = projectId

    # create a workflow object
    # NOTE(review): the workflow is created before any step input is checked,
    # so the sys.exit(1) paths below presumably leave a partially built
    # workflow behind in the project -- confirm whether cleanup is needed.
    wf = dxpy.new_dxworkflow(title=extras['name'], name=extras['name'], folder=resultsFolder,
                             project=projectId, description=extras['description'])

    # NOTE: prevStepResults dict contains links to result files to be generated by previous steps
    prevStepResults = {}
    for step in stepsToDo:
        stepDef = STEPS[step]
        app = dxencode.find_applet_by_name(stepDef['app'], appProjectId)
        appInputs = {}

        # file inputs: results of earlier stages take precedence over priors
        for fileToken, appInp in stepDef['inputs'].items():
            if fileToken in prevStepResults:
                appInputs[appInp] = prevStepResults[fileToken]
            elif fileToken in priors:
                prior = priors[fileToken]
                if isinstance(prior, list):
                    appInputs[appInp] = [dxencode.get_file_link(fid) for fid in prior]
                else:
                    appInputs[appInp] = dxencode.get_file_link(prior)
            else:
                print("ERROR: step '"+step+"' can't find input '"+fileToken+"'!")
                sys.exit(1)

        # Non-file app inputs
        for param, appParam in stepDef.get('params', {}).items():
            if param not in extras:
                print("ERROR: unable to locate '"+param+"' in extras.")
                sys.exit(1)
            appInputs[appParam] = extras[param]

        # Add wf stage
        stageId = wf.add_stage(app, stage_input=appInputs, folder=resultsFolder)

        # outputs, which later steps will need to link to; the output field
        # is the token itself, not the value stored under it
        for fileToken in stepDef['results']:
            prevStepResults[fileToken] = dxpy.dxlink({'stage': stageId, 'outputField': fileToken})

    wfRun = wf.run({})
    return wfRun.describe()
Ejemplo n.º 3
0
def main():
    '''Run the 'fastqc-exp' applet for each experiment that passes the filters.

    The experiments come from the search query built with ASSAY_TERM_ID and
    sent through dxencode.encoded_get(); the applet is looked up in, and run
    under, the project named by PROJECT_NAME.

    For every experiment a skip message is worked out from its first
    replicate's library (no replicates, 'size_range' other than '>200', or
    fewer than 20 starting "cells"). If there is one it is printed and the
    experiment is passed over; otherwise the applet is started with the
    experiment accession.
    '''
    cmnd = get_args()

    ## resolve projects
    project = dxencode.resolve_project(PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    pid = project.get_id()

    applet = dxencode.find_applet_by_name('fastqc-exp', pid)
    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3&limit=all&frame=embedded&replicates.library.biosample.donor.organism.name=mouse&files.file_format=fastq' % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)
    exps = res.json()['@graph']

    started = 0
    for exp in exps:
        acc = exp['accession']

        # Decide first whether (and why) this experiment is skipped.
        skip_message = None
        if len(exp['replicates']) == 0:
            skip_message = "Skipping %s (0 replicates)" % acc
        else:
            lib = exp['replicates'][0]['library']
            size = lib.get('size_range', "")
            units = lib.get('nucleic_acid_starting_quantity_units', "")
            if size != '>200':
                skip_message = "Skipping %s with wrong library size (%s)" % (
                    acc, size)
            elif units == "cells":
                ncells = float(lib.get('nucleic_acid_starting_quantity', 0.0))
                if ncells < 20:
                    skip_message = "Skipping %s as single-cell (%s %s)" % (
                        acc, units, ncells)

        if skip_message is not None:
            print(skip_message)
            continue

        run = applet.run({"accession": acc}, project=pid)
        print("Running: %s for %s" % (run, acc))
        started = started + 1
        # NOTE(review): '>' is evaluated after the increment, so the loop
        # stops only once cmnd.number + 1 runs were started -- confirm.
        if started > cmnd.number:
            break
Ejemplo n.º 4
0
def main():
    '''Post the finished pipeline results of one replicate to the experiment.

    Steps, in order:
      1. Fetch the experiment named by args.experiment and pick the mapping
         for replicate (args.br, args.tr).
      2. Decide whether the replicate is paired-end or single-end; a
         replicate with no reads, or with both kinds, is rejected.
      3. Compute the pipeline steps, look up prior result files and the
         fastq files in the project, and stop if any step still has to run.
      4. For every prior result that has an entry in POST_TEMPLATES, fill in
         the file metadata and either fake the submission (args.test) or run
         the 'validate-post' applet and wait for it.

    Exits through sys.exit(): status 1 on any of the input problems above,
    status 0 when the pipeline is incomplete or after a test-only run.
    '''
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    mapping['replicate'] = replicate

    # Read the organism once, so the error message cannot itself raise
    # KeyError when the mapping has no 'organism' entry.
    organism = mapping.get('organism', "Not Found")
    try:
        mapping['genome'] = GENOME_MAPPING[organism]
    except KeyError:
        print("Organism %s not currently supported" % organism)
        sys.exit(1)

    has_unpaired = bool(mapping['unpaired'])
    has_paired = bool(mapping['paired'])
    if has_paired and has_unpaired:
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting." % (
            len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)
    if not has_paired and not has_unpaired:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    pairedEnd = has_paired

    psv = pipeline_specific_vars(args, mapping, pairedEnd)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    ## TODO this is a bunch of ugly
    if pairedEnd:
        # fastq file names are grouped by each file's own 'paired_end' value
        paired_fqs = {
            '1': [],
            '2': []
        }
        read1s = []
        read2s = []
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession']+".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession']+".fastq.gz")
            read1s.append(p1['accession'])
            read2s.append(p2['accession'])
        pipePath = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession']+".fastq.gz" for f in mapping['unpaired']]
        pipePath = STEP_ORDER['se']

    for step in pipePath:
        STEPS[step] = calculate_steps(step)

    pipeSteps = STEPS
    ## warning ugly kludge here
    # Collect the result entries of every step in STEPS, not only pipePath.
    file_globs = {}
    for stepDef in STEPS.values():
        file_globs.update(stepDef['results'])

    print("Checking for prior results...")

    priors = dxencode.find_prior_results(pipePath, pipeSteps, psv['resultsFolder'], file_globs, projectId)

    # 'submitted' maps a token to the accessions already known for it; the
    # read files start it off so results derived from them can be posted.
    if pairedEnd:
        priors['pair1_reads'] = dxencode.find_file_set(paired_fqs["1"], projectId)
        priors['pair2_reads'] = dxencode.find_file_set(paired_fqs["2"], projectId)
        priors['all_reads'] = priors['pair1_reads'] + priors['pair2_reads']
        submitted = {
            'all_reads': read1s + read2s
        }
    else:
        priors['reads'] = dxencode.find_file_set(unpaired_fqs, projectId)
        priors['all_reads'] = priors['reads']
        submitted = {
            'all_reads': [f['accession'] for f in mapping['unpaired']],
        }

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = dxencode.determine_steps_to_run(pipePath, pipeSteps, priors, deprecateFiles, projectId, verbose=True)

    print("Checking for currently running analyses...")
    dxencode.check_run_log(psv['resultsFolder'], projectId, verbose=True)

    if stepsToDo:
        print("Pipeline incomplete, please resubmit jobs: %s" % stepsToDo)
        sys.exit(0)

    print(priors)
    to_submit = [k for k in priors.keys() if POST_TEMPLATES.get(k)]
    n = 0  # counts loop passes, including tokens that were deferred
    print("Attempting to submit %s files to %s" % (len(to_submit), args.experiment))

    # Neither of these changes inside the loop.
    server = 'test' if args.testserver else 'www'
    max_passes = len(priors) * len(priors)

    while to_submit:
        # Guard against tokens whose 'derived_from' can never be satisfied.
        if n > max_passes:
            print("Too many itereations: %s" % priors)
            break
        token = to_submit.pop(0)
        print("%s %s - %s" % (token, priors[token], n))
        f_ob = POST_TEMPLATES.get(token, None)
        n += 1
        if not f_ob:
            continue

        derive_check = f_ob.get('derived_from', [])
        if derive_check:
            derived = [submitted[f] for f in derive_check if submitted.get(f)]
            if not derived:
                # Nothing it derives from has been posted yet: retry later.
                to_submit.append(token)
                continue
            f_ob['derived_from'] = list(itertools.chain(*derived))

        dxFile = dxpy.DXFile(dxid=priors[token])
        print("Post File: %s %s" % (token, dxFile.name))
        f_ob['dataset'] = args.experiment
        f_ob['lab'] = '/labs/j-michael-cherry/'
        f_ob['award'] = '/awards/U41HG006992/'
        f_ob['assembly'] = mapping['genome']
        ## temporary haxors until file display works
        f_ob['replicate'] = mapping['replicate_id']
        f_ob['notes'] = json.dumps(dxencode.create_notes(dxFile, get_software()))
        print(json.dumps(f_ob, sort_keys=True, indent=4, separators=(',', ': ')))

        if args.test:
            fake_acc = 'ENCFF%03dAAA' % n
            print("Fake submission: %s" % fake_acc)
            submitted[token] = [fake_acc]
        else:
            applet = dxencode.find_applet_by_name('validate-post', projectId)
            job = applet.run({
                "pipe_file": dxpy.dxlink(dxFile),
                "file_meta": f_ob,
                "key": server,
                "debug": True,
                "skipvalidate": args.skipvalidate or False
                })
            print("Submitting %s" % job.id)
            job.wait_on_done(interval=1)
            # One describe() call serves both fields.
            output = job.describe()['output']
            accession = output.get('accession', "Unknown Acc")
            error = output.get('error', "Unknown Error")
            submitted[token] = [accession]
            print("Posted (%s): %s" % (error, accession))

    # Exit if test only
    if args.test:
        print("Fake submitted %s files." % n)
        sys.exit(0)
Ejemplo n.º 5
0
def createWorkflow(stepsToDo,
                   priors,
                   extras,
                   resultsFolder,
                   projectId,
                   appProjectId=None):
    '''Create a workflow holding one stage per entry of stepsToDo and run it.

    Each step name is looked up in the module-level STEPS table, which names
    the applet ('app'), its file inputs ('inputs': file token -> applet input
    name), its optional non-file inputs ('params': extras key -> applet input
    name) and its outputs ('results', keyed by file token).

    A file input is wired to the output of an earlier stage when one produced
    that token; otherwise it is taken from priors, where the token maps to a
    file id or a list of file ids. Workflow title/name and description come
    from extras['name'] and extras['description'].

    Applets are searched in appProjectId, which falls back to projectId when
    it is None.

    Returns None if stepsToDo is empty (or None), else the describe() result
    of the object returned by wf.run({}).

    Calls sys.exit(1) after printing an error when a file input cannot be
    found or a parameter is absent from extras.
    '''

    if not stepsToDo:
        return None
    if appProjectId is None:
        appProjectId = projectId

    # create a workflow object
    # NOTE(review): this happens before the inputs are validated; exiting in
    # the loop below presumably leaves an incomplete workflow in the project
    # -- confirm.
    wf = dxpy.new_dxworkflow(title=extras['name'],
                             name=extras['name'],
                             folder=resultsFolder,
                             project=projectId,
                             description=extras['description'])

    # NOTE: prevStepResults dict contains links to result files to be generated by previous steps
    prevStepResults = {}
    for step in stepsToDo:
        spec = STEPS[step]
        app = dxencode.find_applet_by_name(spec['app'], appProjectId)
        appInputs = {}

        # file inputs
        for fileToken, appInp in spec['inputs'].items():
            if fileToken in prevStepResults:
                # produced by an earlier stage of this same workflow
                appInputs[appInp] = prevStepResults[fileToken]
                continue
            if fileToken not in priors:
                print("ERROR: step '" + step + "' can't find input '" + fileToken + "'!")
                sys.exit(1)
            known = priors[fileToken]
            if isinstance(known, list):
                appInputs[appInp] = [
                    dxencode.get_file_link(fid) for fid in known
                ]
            else:
                appInputs[appInp] = dxencode.get_file_link(known)

        # Non-file app inputs
        if 'params' in spec:
            for param, appParam in spec['params'].items():
                if param in extras:
                    appInputs[appParam] = extras[param]
                else:
                    print("ERROR: unable to locate '" + param + "' in extras.")
                    sys.exit(1)

        # Add wf stage
        stageId = wf.add_stage(app,
                               stage_input=appInputs,
                               folder=resultsFolder)

        # outputs, which we will need to link to (the output field is the
        # token, not the value stored under it)
        for fileToken in spec['results']:
            prevStepResults[fileToken] = dxpy.dxlink({
                'stage': stageId,
                'outputField': fileToken
            })

    wfRun = wf.run({})
    return wfRun.describe()