# Assumed imports for the helpers below. The stdlib and synapseclient imports
# are certain from usage; the nebula-specific names (FileDocStore, Target,
# GalaxyWorkflow, GalaxyService, TaskGroup, sync_doc_dir, get_abspath) are
# expected to be in scope from the surrounding project -- their exact module
# paths depend on the repo layout and are not reproduced here.
import csv
import datetime
import json
import logging
import os
import re
import shutil
import subprocess

import synapseclient
import nebula.tasks


def run_audit(docstore, sample_list):
    doc = FileDocStore(file_path=docstore)

    # The sample list file holds one expected sample name per line
    master_list = []
    with open(sample_list) as handle:
        for line in handle:
            master_list.append(line.rstrip())

    # Map sample name -> list of VCF dataset names found in the docstore
    results = {}
    for id, entry in doc.filter(visible=True, data_type='galaxy.datatypes.tabular.Vcf'):
        if 'tags' in entry:
            sample = None
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'sample':
                    sample = tmp[1]
            if doc.size(entry) > 0:
                results[sample] = results.get(sample, []) + [entry['name']]

    for sample, files in results.items():
        print "%s (%s) %s" % (sample, len(files), "\t".join(files))

    # Flag samples that are absent or have fewer than the expected three VCFs
    for sample in master_list:
        if sample not in results or len(results[sample]) < 3:
            print "missing (%s)" % (len(results.get(sample, []))), sample
def run_errors(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter(visible=True):
        if entry.get('state', 'ok') == 'error':
            print entry
def run_ls(docstore, size=False):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        if size:
            print id, entry.get('name', id), doc.size(entry)
        else:
            print id, entry.get('name', id)
def run_timing(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        if 'job' in entry and 'job_metrics' in entry['job']:
            timing = None
            for met in entry['job']['job_metrics']:
                if met['name'] == 'runtime_seconds':
                    timing = met['raw_value']
            if timing is not None:
                print id, entry["name"], timing
def testNebulaLaunch(self):
    input = {
        "input_file_1": Target("c39ded10-6073-11e4-9803-0800200c9a66"),
        "input_file_2": Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    }
    parameters = {
        "tail_select": {
            "lineNum": 3
        }
    }

    doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))

    logging.info("Adding files to object store")
    sync_doc_dir("examples/simple_galaxy/", doc,
                 uuid_set=["c39ded10-6073-11e4-9803-0800200c9a66",
                           "26fd12a2-9096-4af2-a989-9e2f1cb692fe"])

    logging.info("Creating Task")
    workflow = GalaxyWorkflow(ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
    task = nebula.tasks.GalaxyWorkflowTask(
        "test_workflow", workflow,
        inputs=input,
        parameters=parameters
    )

    service = GalaxyService(
        docstore=doc,
        name="nosetest_galaxy",
        galaxy="bgruening/galaxy-stable:dev",
        port=20022
    )

    task_path = get_abspath("../test_tmp/test.tasks")
    service_path = get_abspath("../test_tmp/test.service")
    taskset = TaskGroup()
    taskset.append(task)
    with open(task_path, "w") as handle:
        taskset.store(handle)
    with open(service_path, "w") as handle:
        service.get_config().set_docstore_config(
            cache_path=get_abspath("../test_tmp/cache")).store(handle)

    env = dict(os.environ)
    if 'PYTHONPATH' in env:
        env['PYTHONPATH'] += ":" + get_abspath("../")
    else:
        env['PYTHONPATH'] = get_abspath("../")

    subprocess.check_call(
        [get_abspath("../bin/nebula"), "run", service_path, task_path],
        env=env)

    for i in doc.filter():
        print json.dumps(i, indent=4)
def run_ls(docstore, size=False, extra=None):
    # `extra` names additional entry fields to print for each dataset.
    # The original body read `args.extra`, which is not in scope here, and
    # shadowed the parameter with a local `extra = []`; both are fixed below.
    if extra is None:
        extra = []
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        extra_values = []
        for e in extra:
            extra_values.append(str(entry.get(e, "")))
        if size:
            print id, entry.get('name', id), doc.size(entry), " ".join(extra_values)
        else:
            print id, entry.get('name', id), " ".join(extra_values)
def run_errors(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        if entry.get('state', '') == 'error':
            print "Dataset", id, entry.get("tags", "")
            if 'provenance' in entry:
                print "tool:", entry['provenance']['tool_id']
            print "-=-=-=-=-=-=-"
            print entry['job']['stdout']
            print "-------------"
            print entry['job']['stderr']
            print "-=-=-=-=-=-=-"
def run_copy(docstore, out_docstore):
    doc = FileDocStore(file_path=docstore)
    out_doc = FileDocStore(file_path=out_docstore)

    for id, entry in doc.filter():
        if out_doc.get(id) is None:
            print "copy", id
            out_doc.put(id, entry)
            if doc.exists(entry):
                src_path = doc.get_filename(entry)
                out_doc.create(entry)
                dst_path = out_doc.get_filename(entry)
                shutil.copy(src_path, dst_path)
                out_doc.update_from_file(entry)
        else:
            if doc.size(entry) != out_doc.size(entry):
                print "mismatch", id
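# Usage (illustrative; the paths below are hypothetical): mirror one docstore
# into another. run_copy only copies entries missing from the destination and
# reports a size mismatch for entries that both stores already hold.
#
#   run_copy("/data/docstore", "/backup/docstore")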
def run_query(docstore, fields, size, filters):
    doc = FileDocStore(file_path=docstore)

    # filters are key=value strings turned into keyword arguments for filter()
    filter = {}
    for k in filters:
        tmp = k.split("=")
        filter[tmp[0]] = tmp[1]

    for id, entry in doc.filter(**filter):
        if fields is None or len(fields) == 0:
            line = entry
        else:
            line = dict((i, entry.get(i, "")) for i in fields)
        if size:
            size_value = doc.size(Target(uuid=entry['uuid']))
        else:
            size_value = ""
        print size_value, json.dumps(line)
def run_synapse(docstore, parent, workdir):
    doc = FileDocStore(file_path=docstore)
    syn = synapseclient.Synapse()
    syn.login()

    for id, entry in doc.filter(visible=True, data_type='galaxy.datatypes.tabular.Vcf'):
        if 'tags' in entry:
            sample = None
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'sample':
                    sample = tmp[1]
            name = entry['name']
            # escape the dot so only a literal ".vcf" suffix is stripped
            # (the original pattern r'.vcf$' matched any character before "vcf")
            name = re.sub(r'\.vcf$', '', name)
            file_name = sample + "." + name + ".snv_mnv.vcf"
            target = Target(uuid=entry['uuid'])
            if doc.size(target) > 0:
                src_file = doc.get_filename(target)
                dst_file = os.path.join(workdir, file_name)

                # skip files that already exist under the Synapse parent entity
                query = "select * from entity where parentId=='%s' and name=='%s'" % (parent, file_name + ".gz")
                r = syn.query(query)['results']
                if len(r) == 0:
                    print dst_file
                    shutil.copy(src_file, dst_file)
                    subprocess.check_call("bgzip %s" % (dst_file), shell=True)
                    f = synapseclient.File(dst_file + ".gz", parentId=parent, name=file_name + ".gz")
                    f.fileType = 'vcf'
                    f.pipeline = 'UCSC'
                    f.variant_type = "snv"
                    f = syn.store(f, executed="https://github.com/ucsccancer/pcawg_tools")
                else:
                    print "Skipping", file_name
def run_scan(docstore, workdir, keyfile, upload_url, manifest):
    doc = FileDocStore(file_path=docstore)

    file_map = {
        'broad': {},
        'muse': {}
    }

    # Donor_ID -> manifest row, used later to build the GNOS metadata URLs
    wl_map = {}
    with open(manifest) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            wl_map[row['Donor_ID']] = row

    for id, entry in doc.filter(visible=True):
        if entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
            if 'tags' in entry:
                sample = None
                for s in entry['tags']:
                    tmp = s.split(":")
                    if tmp[0] == 'sample':
                        sample = tmp[1]

                # classify the dataset by pipeline, method, variant and call type
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                if entry['name'] in ['MUSE_1.0rc', 'MUSE_0.9.9.5']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect']:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (sample, method, datestr, variant_type, call_type)
                name = re.sub(r'\.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                print file_name

                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(workdir, file_name)
                    shutil.copy(src_file, dst_file)
                    # bgzip plain VCFs so everything downstream is .vcf.gz
                    if entry['extension'] == 'vcf':
                        subprocess.check_call("bgzip %s" % dst_file, shell=True)
                        dst_file = dst_file + ".gz"
                    # index with tabix, then record md5s for the file and its index
                    subprocess.check_call("tabix -p vcf %s" % (dst_file), shell=True)
                    shutil.move("%s.tbi" % (dst_file), "%s.idx" % (dst_file))
                    subprocess.check_call("md5sum %s | awk '{print$1}' > %s.md5" % (dst_file, dst_file), shell=True)
                    subprocess.check_call("md5sum %s.idx | awk '{print$1}' > %s.idx.md5" % (dst_file, dst_file), shell=True)
                    if sample not in file_map[pipeline]:
                        file_map[pipeline][sample] = []
                    file_map[pipeline][sample].append(os.path.basename(dst_file))

    # emit one GNOS upload script per pipeline/sample pair
    # (the original also re-read `dst_file` here, a leftover from the loop above)
    for pipeline, samples in file_map.items():
        for sample, files in samples.items():
            with open(os.path.join(workdir, "%s.%s.sh" % (pipeline, sample)), "w") as handle:
                urls = [
                    "%scghub/metadata/analysisFull/%s" % (wl_map[sample]['Normal_GNOS_endpoint'], wl_map[sample]['Normal_Analysis_ID']),
                    "%scghub/metadata/analysisFull/%s" % (wl_map[sample]['Tumour_GNOS_endpoint'], wl_map[sample]['Tumour_Analysis_ID'])
                ]
                cmd_str = "perl /opt/vcf-uploader/gnos_upload_vcf.pl"
                cmd_str += " --metadata-urls %s" % (",".join(urls))
                cmd_str += " --vcfs %s" % (",".join(files))
                cmd_str += " --vcf-md5sum-files %s" % (",".join("%s.md5" % i for i in files))
                cmd_str += " --vcf-idxs %s" % (",".join("%s.idx" % i for i in files))
                cmd_str += " --vcf-idx-md5sum-files %s" % (",".join("%s.idx.md5" % i for i in files))
                cmd_str += " --outdir %s.%s.dir" % (pipeline, sample)
                cmd_str += " --key %s" % (keyfile)
                cmd_str += " --upload-url %s" % (upload_url)
                cmd_str += " --study-refname-override tcga_pancancer_vcf_test"
                handle.write("""#!/bin/bash
%s
""" % (cmd_str))
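# A minimal sketch of how these run_* handlers could be wired into a CLI.
# The subcommand names and flags below are assumptions for illustration, not
# the project's actual interface; the original run_ls read `args.extra`,
# which suggests an argparse-style dispatcher along these lines.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="docstore utilities")
    subparsers = parser.add_subparsers(dest="command")

    parser_ls = subparsers.add_parser("ls")
    parser_ls.add_argument("docstore")
    parser_ls.add_argument("--size", action="store_true", default=False)
    parser_ls.add_argument("--extra", action="append", default=[])

    parser_errors = subparsers.add_parser("errors")
    parser_errors.add_argument("docstore")

    args = parser.parse_args()
    if args.command == "ls":
        run_ls(args.docstore, size=args.size, extra=args.extra)
    elif args.command == "errors":
        run_errors(args.docstore)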