def main():
    """Launch the 'fastqc-exp' applet for qualifying mouse ENCODE3 experiments.

    Searches the ENCODE server for fastq-bearing experiments matching
    ASSAY_TERM_ID, then runs the applet once per experiment that passes the
    library filters, stopping after ``cmnd.number`` launches.

    An experiment is skipped when it has no replicates, when the library of
    its first replicate has a ``size_range`` other than '>200', or when that
    library started from fewer than 20 cells.
    """
    cmnd = get_args()

    # Resolve the project and the applet that will be launched in it.
    project = dxencode.resolve_project(PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    pid = project.get_id()
    applet = dxencode.find_applet_by_name('fastqc-exp', pid)

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = (
        '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
        '&limit=all&frame=embedded'
        '&replicates.library.biosample.donor.organism.name=mouse'
        '&files.file_format=fastq'
    ) % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)
    exps = res.json()['@graph']

    launched = 0
    for exp in exps:
        acc = exp['accession']
        replicates = exp.get('replicates') or []
        if not replicates:
            print("Skipping %s (0 replicates)" % acc)
            continue

        # Only the first replicate's library is used to judge the experiment.
        library = replicates[0]['library']

        size_range = library.get('size_range', "")
        if size_range != '>200':
            print("Skipping %s with wrong library size (%s)" % (acc, size_range))
            continue

        units = library.get('nucleic_acid_starting_quantity_units', "")
        if units == "cells":
            ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
            if ncells < 20:
                print("Skipping %s as single-cell (%s %s)" % (acc, units, ncells))
                continue

        run = applet.run({"accession": acc}, project=pid)
        print("Running: %s for %s" % (run, acc))
        launched += 1
        # Use >= so exactly cmnd.number jobs are launched; the earlier `>`
        # comparison allowed one job more than requested.
        if launched >= cmnd.number:
            break
def main():
    """Run ./launchDnaMe.py for every replicate of matching mouse experiments.

    Fetches fastq-bearing ENCODE3 mouse experiments for ASSAY_TERM_ID and
    shells out to the per-replicate launcher, redirecting its output to a
    file under ``runs/``. With ``cmnd.test`` set the command is only printed.
    The job limit (``cmnd.numberjobs``) is checked once per experiment.
    """
    cmnd = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = (
        '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
        '&limit=all&files.file_format=fastq&frame=embedded'
        '&replicates.library.biosample.donor.organism.name=mouse'
    ) % ASSAY_TERM_ID
    res = requests.get(SERVER + query, headers=HEADERS, auth=(AUTHID, AUTHPW),
                       allow_redirects=True, stream=True)
    exps = res.json()['@graph']

    n = 0
    pid = os.getpid()
    lambdaqc = '--maplambda' if cmnd.maplambda else ''
    command_template = (
        "./launchDnaMe.py %s --gzip -e %s --br %s --tr %s"
        " > runs/launch%s-%s-%s.%s%s.out"
    )

    for exp in exps:
        acc = exp['accession']
        if n >= cmnd.numberjobs:
            print("Stopping at %s replicates" % n)
            break

        for rep in exp.get('replicates', []):
            try:
                br = rep['biological_replicate_number']
                tr = rep['technical_replicate_number']
                runcmd = command_template % (
                    lambdaqc, acc, br, tr, acc, br, tr, pid, lambdaqc)
                print(runcmd)
                if not cmnd.test:
                    os.system(runcmd)
                # NOTE(review): counted for test runs as well; the collapsed
                # source does not show which level this sat at - confirm.
                n += 1
            except KeyError as e:
                print("%s failed: %s" % (acc, e))
def main():
    """Run ./lrnaLaunch.py for every replicate of qualifying mouse experiments.

    Fetches fastq-bearing ENCODE3 mouse experiments for ASSAY_TERM_ID,
    optionally restricted to the single accession in ``cmnd.only``. An
    experiment is skipped when the library of its first replicate has a
    ``size_range`` other than '>200' or started from fewer than 20 cells.
    For each remaining replicate a launcher command line is built from the
    replicate's mapping and executed through the shell unless ``cmnd.test``
    is set. The job limit (``cmnd.numberjobs``) is checked once per
    experiment.
    """
    cmnd = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = (
        '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
        '&limit=all&frame=embedded'
        '&replicates.library.biosample.donor.organism.name=mouse'
        '&files.file_format=fastq'
    ) % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)
    exps = res.json()['@graph']

    command_template = (
        "./lrnaLaunch.py -e %s -r %s -tr %s %s -a M4 --project %s"
        " --resultsLoc /runs --run > runs/launch%s-%s-%s-M4.%s.out"
    )

    n = 0
    for exp in exps:
        acc = exp['accession']
        if cmnd.only and acc != cmnd.only:
            print("skipping %s" % acc)
            continue

        if len(exp['replicates']) > 0:
            # Only the first replicate's library is used for filtering.
            library = exp['replicates'][0]['library']
            size_range = library.get('size_range', "")
            if size_range != '>200':
                print("Skipping %s with wrong library size (%s)" % (acc, size_range))
                continue
            units = library.get('nucleic_acid_starting_quantity_units', "")
            if units == "cells":
                ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
                if ncells < 20:
                    print("Skipping %s as single-cell (%s %s)" % (acc, units, ncells))
                    continue

        if n >= cmnd.numberjobs:
            print("Stopping at %s replicates" % n)
            break

        exp_mapping = dxencode.choose_mapping_for_experiment(exp)
        for rep in exp.get('replicates', []):
            try:
                br = rep['biological_replicate_number']
                tr = rep['technical_replicate_number']
                mapping = exp_mapping[(br, tr)]

                options = [
                    "-o %s" % GENOME_MAPPING[mapping['organism']],
                    "-l %s" % mapping['library'],
                    "-g %s" % mapping['sex'],
                ]
                if mapping['paired']:
                    # Bucket each mate's fastq under its paired_end value.
                    paired_fqs = {'1': [], '2': []}
                    for (p1, p2) in mapping['paired']:
                        paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
                        paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
                    options.append("-1 " + " ".join(paired_fqs['1']))
                    options.append("-2 " + " ".join(paired_fqs['2']))
                else:
                    single_fqs = [f['accession'] + ".fastq.gz"
                                  for f in mapping['unpaired']]
                    options.append("-1 " + " ".join(single_fqs))
                args = " ".join(options)

                runcmd = command_template % (
                    acc, br, tr, args, PROJECT_NAME, acc, br, tr, os.getpid())
                print(runcmd)
                if not cmnd.test:
                    # probably should be subprocess.Popen()
                    os.system(runcmd)
                # NOTE(review): counted for test runs as well; the collapsed
                # source does not show which level this sat at - confirm.
                n += 1
            except KeyError as e:
                print("%s failed: %s" % (acc, e))
def main(accession, key=None, debug=False):
    """Upload the DNAnexus copies of an experiment's files still 'uploading'.

    Fetches the experiment ``accession`` from the ENCODE server, then for each
    entry in its ``original_files`` fetches the file object. File objects whose
    status is 'uploading' have the 'dx-id' read from their JSON ``notes``; that
    DNAnexus file is downloaded locally and handed to
    ``dxencode.encoded_upload_existing``.

    Args:
        accession: experiment accession to process.
        key: key name passed to ``dxencode.processkey`` to obtain credentials
            and the server URL.
        debug: when true, log at DEBUG level instead of INFO.
    """
    logger.setLevel(logging.DEBUG if debug else logging.INFO)

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey(key)
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (accession)

    # get the experiment object
    logger.debug("%s - %s" % (url, AUTHID))
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    logger.debug(response)
    exp = response.json()

    for file_path in exp.get('original_files', []):
        try:
            file_obj = dxencode.encoded_get(SERVER + file_path, AUTHID, AUTHPW).json()
            if file_obj['status'] != 'uploading':
                continue
            dxid = json.loads(file_obj['notes'])['dx-id']
        except Exception as e:
            # Log the path rather than the file object's accession: when the
            # request or JSON decode fails there is no file object yet, and
            # indexing the path string would raise inside this handler.
            logger.error("Error getting dx id: %s for %s" % (e, file_path))
            continue

        dx_file = dxpy.DXFile(dxid)
        local_file = dx_file.describe()['name']
        dxpy.download_dxfile(dxid, local_file)
        item = dxencode.encoded_upload_existing(
            local_file, file_obj['accession'], SERVER, AUTHID, AUTHPW)
        print(item)
def main(accession, key=None, debug=False):
    """Upload the DNAnexus copies of an experiment's files still 'uploading'.

    Fetches the experiment ``accession`` from the ENCODE server, then for each
    entry in its ``original_files`` fetches the file object. File objects whose
    status is 'uploading' have the 'dx-id' read from their JSON ``notes``; that
    DNAnexus file is downloaded locally and handed to
    ``dxencode.encoded_upload_existing``.

    Args:
        accession: experiment accession to process.
        key: key name passed to ``dxencode.processkey`` to obtain credentials
            and the server URL.
        debug: when true, log at DEBUG level instead of INFO.
    """
    logger.setLevel(logging.DEBUG if debug else logging.INFO)

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey(key)
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (accession)

    # get the experiment object
    logger.debug("%s - %s" % (url, AUTHID))
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    logger.debug(response)
    exp = response.json()

    for file_path in exp.get('original_files', []):
        try:
            file_obj = dxencode.encoded_get(SERVER + file_path, AUTHID, AUTHPW).json()
            if file_obj['status'] != 'uploading':
                continue
            dxid = json.loads(file_obj['notes'])['dx-id']
        except Exception as e:
            # Log the path rather than the file object's accession: when the
            # request or JSON decode fails there is no file object yet, and
            # indexing the path string would raise inside this handler.
            logger.error("Error getting dx id: %s for %s" % (e, file_path))
            continue

        dx_file = dxpy.DXFile(dxid)
        local_file = dx_file.describe()['name']
        dxpy.download_dxfile(dxid, local_file)
        item = dxencode.encoded_upload_existing(
            local_file, file_obj['accession'], SERVER, AUTHID, AUTHPW)
        print(item)
def main():
    """Launch the 'fastqc-exp' applet for qualifying mouse ENCODE3 experiments.

    Searches the ENCODE server for fastq-bearing experiments matching
    ASSAY_TERM_ID, then runs the applet once per experiment that passes the
    library filters, stopping after ``cmnd.number`` launches.

    An experiment is skipped when it has no replicates, when the library of
    its first replicate has a ``size_range`` other than '>200', or when that
    library started from fewer than 20 cells.
    """
    cmnd = get_args()

    # Resolve the project and the applet that will be launched in it.
    project = dxencode.resolve_project(PROJECT_NAME)
    print('Project: ' + project.describe()['name'])
    pid = project.get_id()
    applet = dxencode.find_applet_by_name('fastqc-exp', pid)

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = (
        '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
        '&limit=all&frame=embedded'
        '&replicates.library.biosample.donor.organism.name=mouse'
        '&files.file_format=fastq'
    ) % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)
    exps = res.json()['@graph']

    launched = 0
    for exp in exps:
        acc = exp['accession']
        replicates = exp.get('replicates') or []
        if not replicates:
            print("Skipping %s (0 replicates)" % acc)
            continue

        # Only the first replicate's library is used to judge the experiment.
        library = replicates[0]['library']

        size_range = library.get('size_range', "")
        if size_range != '>200':
            print("Skipping %s with wrong library size (%s)" % (acc, size_range))
            continue

        units = library.get('nucleic_acid_starting_quantity_units', "")
        if units == "cells":
            ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
            if ncells < 20:
                print("Skipping %s as single-cell (%s %s)" % (acc, units, ncells))
                continue

        run = applet.run({"accession": acc}, project=pid)
        print("Running: %s for %s" % (run, acc))
        launched += 1
        # Use >= so exactly cmnd.number jobs are launched; the earlier `>`
        # comparison allowed one job more than requested.
        if launched >= cmnd.number:
            break
def main():
    """Run ./launchDnaMe.py for every replicate of matching mouse experiments.

    Fetches fastq-bearing ENCODE3 mouse experiments for ASSAY_TERM_ID and
    shells out to the per-replicate launcher, redirecting its output to a
    file under ``runs/``. With ``cmnd.test`` set the command is only printed.
    The job limit (``cmnd.numberjobs``) is checked once per experiment.
    """
    cmnd = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = (
        '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
        '&limit=all&files.file_format=fastq&frame=embedded'
        '&replicates.library.biosample.donor.organism.name=mouse'
    ) % ASSAY_TERM_ID
    res = requests.get(SERVER + query, headers=HEADERS, auth=(AUTHID, AUTHPW),
                       allow_redirects=True, stream=True)
    exps = res.json()['@graph']

    n = 0
    pid = os.getpid()
    lambdaqc = '--maplambda' if cmnd.maplambda else ''
    command_template = (
        "./launchDnaMe.py %s --gzip -e %s --br %s --tr %s"
        " > runs/launch%s-%s-%s.%s%s.out"
    )

    for exp in exps:
        acc = exp['accession']
        if n >= cmnd.numberjobs:
            print("Stopping at %s replicates" % n)
            break

        for rep in exp.get('replicates', []):
            try:
                br = rep['biological_replicate_number']
                tr = rep['technical_replicate_number']
                runcmd = command_template % (
                    lambdaqc, acc, br, tr, acc, br, tr, pid, lambdaqc)
                print(runcmd)
                if not cmnd.test:
                    os.system(runcmd)
                # NOTE(review): counted for test runs as well; the collapsed
                # source does not show which level this sat at - confirm.
                n += 1
            except KeyError as e:
                print("%s failed: %s" % (acc, e))
def main():
    """Plan and launch the workflow for one experiment replicate.

    Looks up the experiment on the ENCODE server, selects the mapping for the
    replicate (``args.br``, ``args.tr``), decides between the paired-end and
    single-end step order, locates prior results, read files and reference
    files, reports the plan, and then creates and logs the workflow run.
    With ``args.test`` set, nothing is created or moved and the function
    exits after reporting.

    Exits with status 1 when the experiment has no replicates, the replicate
    is not in the mapping, the organism is unsupported, or the replicate has
    no reads or a mix of paired and unpaired reads. Exits with status 0 when
    there is nothing to run or in test mode.
    """
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    # could try to do all replicates here
    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    if args.maplambda:
        genome = 'lambda'
    elif mapping['organism'] == 'mouse':
        genome = 'mm10'
    elif mapping['organism'] == 'human':
        genome = 'hg19'
    else:
        print("Organism %s not currently supported" % mapping['organism'])
        sys.exit(1)

    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each len() takes one list; a misplaced parenthesis previously passed
        # both to a single len() call and raised TypeError here.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting."
              % (len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)

    extras = pipelineSpecificExtras(genome, mapping['sex'], args.experiment,
                                    replicate, mapping['library'], pairedEnd,
                                    args.gzip)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    # args.resultsLoc = RESULT_FOLDER_DEFAULT + '/' + genome
    args.resultsLoc = RESULT_FOLDER_DEFAULT  # not sure we need genome
    resultsFolder = args.resultsLoc + '/' + args.experiment + '/' + replicate
    if args.maplambda:
        resultsFolder = resultsFolder + '/lambda'

    if not args.test:
        if not dxencode.project_has_folder(project, resultsFolder):
            project.new_folder(resultsFolder, parents=True)

    if pairedEnd:
        # Bucket each mate's fastq under its paired_end value.
        paired_fqs = {'1': [], '2': []}
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
        steps = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession'] + ".fastq.gz" for f in mapping['unpaired']]
        steps = STEP_ORDER['se']
        print("Generating workflow steps (single-end)...")
    for step in steps:
        STEPS[step] = calculate_steps(step)

    print("Checking for prior results...")
    # Check if there are previous results
    # Perhaps reads files are already there?
    # NOTE: priors is a dictionary of fileIds that will be used to determine stepsToDo
    #       and fill in inputs to workflow steps
    # NOTE(review): maplambda=True is passed regardless of args.maplambda - confirm.
    priors = findPriorResults(pairedEnd, resultsFolder, projectId, maplambda=True)

    print("Checking for read files...")
    # Find all reads files and move into place
    # TODO: files could be in: dx (usual), remote (url e.g.https://www.encodeproject.org/...
    #       or possibly local, Currently only DX locations are supported.
    if pairedEnd:
        reads1 = dxencode.find_and_copy_read_files(
            priors, paired_fqs['1'], args.test, 'pair1_reads', resultsFolder,
            arrayInput=True, projectId=projectId)
        reads2 = dxencode.find_and_copy_read_files(
            priors, paired_fqs['2'], args.test, 'pair2_reads', resultsFolder,
            arrayInput=True, projectId=projectId)
    else:
        # trim-se and trim-pe use different input tokens.
        reads1 = dxencode.find_and_copy_read_files(
            priors, unpaired_fqs, args.test, 'reads', resultsFolder,
            arrayInput=True, projectId=projectId)

    print("Looking for reference files...")
    findReferenceFiles(GENOME_REFERENCES.keys(), priors, args.refLoc, extras)

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = determineStepsToDo(pairedEnd, priors, deprecateFiles, projectId,
                                   force=args.force)

    # Report the plans
    print("Running '" + extras['title'] + "'")
    print(" on " + extras['subTitle'])
    if pairedEnd:
        print("- Reads1: ")
    else:
        print("- Reads: ")
    for fid in reads1:
        print(" " + dxencode.file_path_from_fid(fid))
    if pairedEnd:
        print("- Reads2: ")
        for fid in reads2:
            print(" " + dxencode.file_path_from_fid(fid))
    print("- Reference files:")
    for token in GENOME_REFERENCES.keys():
        print(" " + dxencode.file_path_from_fid(priors[token], True))
    print("- Results written to: " + args.project + ":" + resultsFolder)

    if len(stepsToDo) == 0:
        print("* All expected results are in the results folder, so there is nothing to do.")
        print(" If this experiment/replicate needs to be rerun, then use the --force flag to ")
        print(" rerun all steps; or remove suspect results from the folder before launching.")
        sys.exit(0)

    print("- Steps to run:")
    steps = STEP_ORDER['pe'] if pairedEnd else STEP_ORDER['se']
    for step in steps:
        # Recalculated here as in the original flow, after priors were filled.
        STEPS[step] = calculate_steps(step)
        if step in stepsToDo:
            print(" * " + STEPS[step]['app'] + " will be run")
        elif not step.startswith('concat'):
            print(" " + STEPS[step]['app'] + " has already been run")

    print("Checking for currently running analyses...")
    checkRunsPreviouslyLaunched(resultsFolder, projectId)

    if len(deprecateFiles) > 0:
        if args.test:
            print("Would move " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  resultsFolder + "/deprecated'.")
            for fid in deprecateFiles:
                print(" " + dxencode.file_path_from_fid(fid))
        else:
            print("Moving " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  resultsFolder + "/deprecated'...")
            dxencode.move_files(deprecateFiles, resultsFolder + "/deprecated", projectId)

    # Exit if test only
    if args.test:
        print("TEST ONLY - exiting.")
        sys.exit(0)

    print("Launch sequence initiating...")
    wfRun = createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId)

    print(" We have liftoff!")
    logThisRun(wfRun['id'], resultsFolder, projectId)

    print(" Launched " + wfRun['id'])
    print("(success)")
def main():
    """Post a replicate's pipeline result files through the 'validate-post' applet.

    Looks up the experiment on the ENCODE server, selects the mapping for the
    replicate (``args.br``, ``args.tr``), finds prior results in the results
    folder, and exits if any pipeline steps still need to run. Otherwise each
    prior result that has an entry in POST_TEMPLATES is submitted once the
    files it is derived from have accessions. With ``args.test`` set, fake
    accessions are generated instead of running the applet.

    Exits with status 1 when the experiment has no replicates, the replicate
    is not in the mapping, the organism is unsupported, or the replicate has
    no reads or a mix of paired and unpaired reads. Exits with status 0 when
    the pipeline is incomplete or after a test run.
    """
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    #replicate = "rep%s_%s" % (args.br, args.tr)
    replicate = "%s_%s" % (args.br, args.tr)

    # could try to do all replicates here
    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    mapping['replicate'] = replicate

    try:
        mapping['genome'] = GENOME_MAPPING[mapping.get('organism', "Not Found")]
    except KeyError:
        print("Organism %s not currently supported" % mapping['organism'])
        sys.exit(1)

    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each len() takes one list; a misplaced parenthesis previously passed
        # both to a single len() call and raised TypeError here.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting."
              % (len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)

    psv = pipeline_specific_vars(args, mapping, pairedEnd)

    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    ## TODO this is a bunch of ugly
    if pairedEnd:
        # Bucket each mate's fastq under its paired_end value, and keep the
        # bare accessions for the 'all_reads' submitted entry.
        paired_fqs = {'1': [], '2': []}
        read1s = []
        read2s = []
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
            read1s.append(p1['accession'])
            read2s.append(p2['accession'])
        pipePath = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession'] + ".fastq.gz" for f in mapping['unpaired']]
        pipePath = STEP_ORDER['se']

    for step in pipePath:
        STEPS[step] = calculate_steps(step)
    pipeSteps = STEPS

    ## warning ugly kludge here
    # Flatten every step's result token -> glob entry into one lookup.
    file_globs = {}
    for app in STEPS.keys():
        for token in STEPS[app]['results'].keys():
            file_globs[token] = STEPS[app]['results'][token]

    print("Checking for prior results...")
    priors = dxencode.find_prior_results(pipePath, pipeSteps, psv['resultsFolder'],
                                         file_globs, projectId)

    if pairedEnd:
        priors['pair1_reads'] = dxencode.find_file_set(paired_fqs["1"], projectId)
        priors['pair2_reads'] = dxencode.find_file_set(paired_fqs["2"], projectId)
        priors['all_reads'] = priors['pair1_reads'] + priors['pair2_reads']
        submitted = {'all_reads': read1s + read2s}
    else:
        priors['reads'] = dxencode.find_file_set(unpaired_fqs, projectId)
        priors['all_reads'] = priors['reads']
        submitted = {
            'all_reads': [f['accession'] for f in mapping['unpaired']],
        }

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = dxencode.determine_steps_to_run(pipePath, pipeSteps, priors,
                                                deprecateFiles, projectId,
                                                verbose=True)

    print("Checking for currently running analyses...")
    dxencode.check_run_log(psv['resultsFolder'], projectId, verbose=True)

    if len(stepsToDo):
        print("Pipeline incomplete, please resubmit jobs: %s" % stepsToDo)
        sys.exit(0)

    print(priors)
    to_submit = [k for k in priors.keys() if POST_TEMPLATES.get(k)]
    n = 0  # skip reads
    # The experiment is now interpolated; the message used to print the
    # literal text "args.experiment".
    print("Attempting to submit %s files to %s" % (len(to_submit), args.experiment))
    while to_submit:
        # Tokens are re-queued until their inputs are submitted; this bound
        # stops the loop if some dependency can never be satisfied.
        if n > len(priors) * len(priors):
            print("Too many itereations: %s" % priors)
            break
        token = to_submit.pop(0)
        print("%s %s - %s" % (token, priors[token], n))
        f_ob = POST_TEMPLATES.get(token, None)
        n += 1
        if not f_ob:
            continue

        derive_check = f_ob.get('derived_from', [])
        if derive_check:
            derived = [submitted[f] for f in derive_check if submitted.get(f)]
            if not derived:
                # None of the inputs has an accession yet; retry later.
                to_submit.append(token)
                continue
            f_ob['derived_from'] = list(itertools.chain(*derived))

        dxFile = dxpy.DXFile(dxid=priors[token])
        print("Post File: %s %s" % (token, dxFile.name))
        f_ob['dataset'] = args.experiment
        f_ob['lab'] = '/labs/j-michael-cherry/'
        f_ob['award'] = '/awards/U41HG006992/'
        f_ob['assembly'] = mapping['genome']
        ## temporary haxors until file display works
        # NOTE(review): 'replicate_id' is not set in this function; presumably
        # supplied by choose_mapping_for_experiment - confirm.
        f_ob['replicate'] = mapping['replicate_id']
        f_ob['notes'] = json.dumps(dxencode.create_notes(dxFile, get_software()))
        print(json.dumps(f_ob, sort_keys=True, indent=4, separators=(',', ': ')))

        if args.testserver:
            server = 'test'
        else:
            server = 'www'

        if args.test:
            fake_acc = 'ENCFF%03dAAA' % n
            print("Fake submission: %s" % fake_acc)
            submitted[token] = [fake_acc]
        else:
            applet = dxencode.find_applet_by_name('validate-post', projectId)
            job = applet.run({
                "pipe_file": dxpy.dxlink(dxFile),
                "file_meta": f_ob,
                "key": server,
                "debug": True,
                "skipvalidate": args.skipvalidate or False
            })
            print("Submitting %s" % job.id)
            job.wait_on_done(interval=1)
            # Describe the finished job once and read both outputs from it.
            output = job.describe()['output']
            accession = output.get('accession', "Unknown Acc")
            error = output.get('error', "Unknown Error")
            submitted[token] = [accession]
            print("Posted (%s): %s" % (error, accession))

    # Exit if test only
    if args.test:
        print("Fake submitted %s files." % n)
        sys.exit(0)
def main():
    """Run ./lrnaLaunch.py for every replicate of qualifying mouse experiments.

    Fetches fastq-bearing ENCODE3 mouse experiments for ASSAY_TERM_ID,
    optionally restricted to the single accession in ``cmnd.only``. An
    experiment is skipped when the library of its first replicate has a
    ``size_range`` other than '>200' or started from fewer than 20 cells.
    For each remaining replicate a launcher command line is built from the
    replicate's mapping and executed through the shell unless ``cmnd.test``
    is set. The job limit (``cmnd.numberjobs``) is checked once per
    experiment.
    """
    cmnd = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('www')
    query = (
        '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3'
        '&limit=all&frame=embedded'
        '&replicates.library.biosample.donor.organism.name=mouse'
        '&files.file_format=fastq'
    ) % ASSAY_TERM_ID
    res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)
    exps = res.json()['@graph']

    command_template = (
        "./lrnaLaunch.py -e %s -r %s -tr %s %s -a M4 --project %s"
        " --resultsLoc /runs --run > runs/launch%s-%s-%s-M4.%s.out"
    )

    n = 0
    for exp in exps:
        acc = exp['accession']
        if cmnd.only and acc != cmnd.only:
            print("skipping %s" % acc)
            continue

        if len(exp['replicates']) > 0:
            # Only the first replicate's library is used for filtering.
            library = exp['replicates'][0]['library']
            size_range = library.get('size_range', "")
            if size_range != '>200':
                print("Skipping %s with wrong library size (%s)" % (acc, size_range))
                continue
            units = library.get('nucleic_acid_starting_quantity_units', "")
            if units == "cells":
                ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
                if ncells < 20:
                    print("Skipping %s as single-cell (%s %s)" % (acc, units, ncells))
                    continue

        if n >= cmnd.numberjobs:
            print("Stopping at %s replicates" % n)
            break

        exp_mapping = dxencode.choose_mapping_for_experiment(exp)
        for rep in exp.get('replicates', []):
            try:
                br = rep['biological_replicate_number']
                tr = rep['technical_replicate_number']
                mapping = exp_mapping[(br, tr)]

                options = [
                    "-o %s" % GENOME_MAPPING[mapping['organism']],
                    "-l %s" % mapping['library'],
                    "-g %s" % mapping['sex'],
                ]
                if mapping['paired']:
                    # Bucket each mate's fastq under its paired_end value.
                    paired_fqs = {'1': [], '2': []}
                    for (p1, p2) in mapping['paired']:
                        paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
                        paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
                    options.append("-1 " + " ".join(paired_fqs['1']))
                    options.append("-2 " + " ".join(paired_fqs['2']))
                else:
                    single_fqs = [f['accession'] + ".fastq.gz"
                                  for f in mapping['unpaired']]
                    options.append("-1 " + " ".join(single_fqs))
                args = " ".join(options)

                runcmd = command_template % (
                    acc, br, tr, args, PROJECT_NAME, acc, br, tr, os.getpid())
                print(runcmd)
                if not cmnd.test:
                    # probably should be subprocess.Popen()
                    os.system(runcmd)
                # NOTE(review): counted for test runs as well; the collapsed
                # source does not show which level this sat at - confirm.
                n += 1
            except KeyError as e:
                print("%s failed: %s" % (acc, e))
mapped = parse_map_report(folder, project) lambda_mapped = parse_map_report(folder + "/lambda", project) nreads = float(mapped.get("Sequences analysed in total", -999)) mapeff = float(mapped.get("Mapping efficiency", "-99999%").strip("%")) / 100.0 coverage = nreads * mapeff * float(rep["read_length"]) / APPROX_HAPLOID_GENOME_SIZE print "\t".join( [acc, rep_str] + [mapped.get(v, "-999.9") for v in labels] + [lambda_mapped.get(v, "-999.9") for v in labels] + ["%2.2f" % coverage] ) (AUTHID, AUTHPW, SERVER) = dxencode.processkey("default") def process_exp(acc, project, skipfq): fqc_metrics = {} expr = get_fastqc.get_exp_time(acc, project, skip=skipfq) if not skipfq: fqc_metrics[expr["accession"]] = {} for fq in [f for f in expr["files"] if f["file_format"] == "fastq"]: fqc_metrics[expr["accession"]][fq["accession"]] = get_fastqc.get_fastqc(fq["accession"], project) # print json.dumps(fqc_metrics, indent=4) get_bismark_stats(expr, project)
for fq in fqs: rep = fq['replicate'] repstr = '_rep'+str(rep['biological_replicate_number'])+'_'+str(rep['technical_replicate_number']) total_reads[fq['accession']] = get_fastqc(fq['accession'], project)['Total Sequences'] elapsed[repstr]['fastqs'].append(fq['accession']) if fq.get('paired_end', None): paired = 'Paired' for repstr in elapsed.keys(): sizes = "\t".join([ "\t".join((fq, str(total_reads[fq]))) for fq in elapsed[repstr]['fastqs']]) print "\t".join((exp['accession'],repstr, str(elapsed[repstr]['time']), sizes, paired)) return exp # woo hoo global (AUTHID,AUTHPW,SERVER) = dxencode.processkey('default') def main(): argparser = get_args() args = argparser.parse_args() project = dxencode.get_project(args.project) if args.file: getr = dxencode.encoded_get(SERVER+args.file, AUTHID=AUTHID, AUTHPW=AUTHPW) try: getr.raise_for_status() except: print "Could not find %s in db" % args.file raise encff = getr.json()
def main():
    """Plan and launch the workflow for one experiment replicate.

    Looks up the experiment on the ENCODE server, selects the mapping for the
    replicate (``args.br``, ``args.tr``), decides between the paired-end and
    single-end step order, locates prior results, read files and reference
    files, reports the plan, and then creates and logs the workflow run.
    With ``args.test`` set, nothing is created or moved and the function
    exits after reporting.

    Exits with status 1 when the experiment has no replicates, the replicate
    is not in the mapping, the organism is unsupported, or the replicate has
    no reads or a mix of paired and unpaired reads. Exits with status 0 when
    there is nothing to run or in test mode.
    """
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    # could try to do all replicates here
    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    if args.maplambda:
        genome = 'lambda'
    elif mapping['organism'] == 'mouse':
        genome = 'mm10'
    elif mapping['organism'] == 'human':
        genome = 'hg19'
    else:
        print("Organism %s not currently supported" % mapping['organism'])
        sys.exit(1)

    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each len() takes one list; a misplaced parenthesis previously passed
        # both to a single len() call and raised TypeError here.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting."
              % (len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)

    extras = pipelineSpecificExtras(genome, mapping['sex'], args.experiment,
                                    replicate, mapping['library'], pairedEnd,
                                    args.gzip)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    # args.resultsLoc = RESULT_FOLDER_DEFAULT + '/' + genome
    args.resultsLoc = RESULT_FOLDER_DEFAULT  # not sure we need genome
    resultsFolder = args.resultsLoc + '/' + args.experiment + '/' + replicate
    if args.maplambda:
        resultsFolder = resultsFolder + '/lambda'

    if not args.test:
        if not dxencode.project_has_folder(project, resultsFolder):
            project.new_folder(resultsFolder, parents=True)

    if pairedEnd:
        # Bucket each mate's fastq under its paired_end value.
        paired_fqs = {'1': [], '2': []}
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
        steps = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession'] + ".fastq.gz" for f in mapping['unpaired']]
        steps = STEP_ORDER['se']
        print("Generating workflow steps (single-end)...")
    for step in steps:
        STEPS[step] = calculate_steps(step)

    print("Checking for prior results...")
    # Check if there are previous results
    # Perhaps reads files are already there?
    # NOTE: priors is a dictionary of fileIds that will be used to determine stepsToDo
    #       and fill in inputs to workflow steps
    # NOTE(review): maplambda=True is passed regardless of args.maplambda - confirm.
    priors = findPriorResults(pairedEnd, resultsFolder, projectId, maplambda=True)

    print("Checking for read files...")
    # Find all reads files and move into place
    # TODO: files could be in: dx (usual), remote (url e.g.https://www.encodeproject.org/...
    #       or possibly local, Currently only DX locations are supported.
    if pairedEnd:
        reads1 = dxencode.find_and_copy_read_files(
            priors, paired_fqs['1'], args.test, 'pair1_reads', resultsFolder,
            arrayInput=True, projectId=projectId)
        reads2 = dxencode.find_and_copy_read_files(
            priors, paired_fqs['2'], args.test, 'pair2_reads', resultsFolder,
            arrayInput=True, projectId=projectId)
    else:
        # trim-se and trim-pe use different input tokens.
        reads1 = dxencode.find_and_copy_read_files(
            priors, unpaired_fqs, args.test, 'reads', resultsFolder,
            arrayInput=True, projectId=projectId)

    print("Looking for reference files...")
    findReferenceFiles(GENOME_REFERENCES.keys(), priors, args.refLoc, extras)

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = determineStepsToDo(pairedEnd, priors, deprecateFiles, projectId,
                                   force=args.force)

    # Report the plans
    print("Running '" + extras['title'] + "'")
    print(" on " + extras['subTitle'])
    if pairedEnd:
        print("- Reads1: ")
    else:
        print("- Reads: ")
    for fid in reads1:
        print(" " + dxencode.file_path_from_fid(fid))
    if pairedEnd:
        print("- Reads2: ")
        for fid in reads2:
            print(" " + dxencode.file_path_from_fid(fid))
    print("- Reference files:")
    for token in GENOME_REFERENCES.keys():
        print(" " + dxencode.file_path_from_fid(priors[token], True))
    print("- Results written to: " + args.project + ":" + resultsFolder)

    if len(stepsToDo) == 0:
        print("* All expected results are in the results folder, so there is nothing to do.")
        print(" If this experiment/replicate needs to be rerun, then use the --force flag to ")
        print(" rerun all steps; or remove suspect results from the folder before launching.")
        sys.exit(0)

    print("- Steps to run:")
    steps = STEP_ORDER['pe'] if pairedEnd else STEP_ORDER['se']
    for step in steps:
        # Recalculated here as in the original flow, after priors were filled.
        STEPS[step] = calculate_steps(step)
        if step in stepsToDo:
            print(" * " + STEPS[step]['app'] + " will be run")
        elif not step.startswith('concat'):
            print(" " + STEPS[step]['app'] + " has already been run")

    print("Checking for currently running analyses...")
    checkRunsPreviouslyLaunched(resultsFolder, projectId)

    if len(deprecateFiles) > 0:
        if args.test:
            print("Would move " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  resultsFolder + "/deprecated'.")
            for fid in deprecateFiles:
                print(" " + dxencode.file_path_from_fid(fid))
        else:
            print("Moving " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  resultsFolder + "/deprecated'...")
            dxencode.move_files(deprecateFiles, resultsFolder + "/deprecated", projectId)

    # Exit if test only
    if args.test:
        print("TEST ONLY - exiting.")
        sys.exit(0)

    print("Launch sequence initiating...")
    wfRun = createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId)

    print(" We have liftoff!")
    logThisRun(wfRun['id'], resultsFolder, projectId)

    print(" Launched " + wfRun['id'])
    print("(success)")