import itertools as it
import os
import re
import time

import requests
import six

# NOTE: `storage`, `root`, `DATAROOT`, `RESTART_FAILED_JOBS`, `_create_tmpfiles`
# and the init_* job functions are assumed to be provided elsewhere in this
# module/package.


def upload_gzipped_files(files, dataset, dataset_key):
    bucket = storage.bucket()
    file_urls = {}
    print("uploading gzipped files")
    print(files)
    for nm, fpath in files.items():
        with open(fpath, "rb") as file_stream:
            # split the local filename into its basename and full extension
            basename = fpath.split("/")[-1].split(".")[0]
            extension = ".".join(fpath.split("/")[-1].split(".")[1:])
            bucket_fn = os.path.join(
                f"all_v2/website_datasets/{dataset['userId']}/"
                f"{basename}_{dataset['dataset']}.{extension}"
            )
            textblob = bucket.blob(bucket_fn)
            textblob.cache_control = "no-cache"
            textblob.content_encoding = "gzip"
            textblob.upload_from_string(
                file_stream.read(), content_type="application/octet-stream")
            textblob.reload()
            # fetch the public URL once (the response itself is not used)
            requests.get(textblob.public_url)
            url = textblob.public_url
            if isinstance(url, six.binary_type):
                url = url.decode("utf-8")
            file_urls[nm + "_url"] = url
            print(url)
    # merge the new public URLs into the dataset's firebase record
    val = root.get()[dataset_key]
    print(val)
    val.update(file_urls)
    root.update({dataset_key: val})
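# Example call (a sketch; the file path, user id, dataset id, and firebase key
# below are hypothetical and only illustrate the expected argument shapes):
#
#   upload_gzipped_files(
#       {"coords": "/tmp/ds_0042/coords.json.gz"},   # nm -> local gzipped file path
#       {"userId": "user123", "dataset": "0042"},    # firebase dataset record
#       "-Nabc123xyz",                               # firebase key of that record
#   )
#
# This would upload to all_v2/website_datasets/user123/coords_0042.json.gz and
# store the blob's public URL under "coords_url" in the dataset's record.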
def upload_frontend_files(files, dataset, dataset_key):
    bucket = storage.bucket()
    urls = []
    # guard against a missing annotations or coords file in `files`
    annotations_url = download_url = afname = cfname = None
    for nm, fpath in files.items():
        with open(fpath, "rb") as file_stream:
            basename = fpath.split("/")[-1].split(".")[0]
            extension = ".".join(fpath.split("/")[-1].split(".")[1:])
            bucket_fn = os.path.join(
                f"all_v2/website_datasets/{dataset['userId']}/"
                f"{basename}_{dataset['dataset']}.{extension}"
            )
            textblob = bucket.blob(bucket_fn)
            textblob.cache_control = "no-cache"
            textblob.content_encoding = "gzip"
            textblob.upload_from_string(
                file_stream.read(), content_type="application/octet-stream")
            textblob.reload()
            # fetch the public URL once (the response itself is not used)
            requests.get(textblob.public_url)
            url = textblob.public_url
            if isinstance(url, six.binary_type):
                url = url.decode("utf-8")
            urls.append(url)
            print(url)
            # remember which upload is the annotations file and which is coords
            if "annotation" in basename:
                annotations_url = url
                afname = bucket_fn
            elif "coords" in basename:
                download_url = url
                cfname = bucket_fn
    val = root.get()[dataset_key]
    val.update(dict(
        annotations_url=annotations_url,
        downloadUrl=download_url,
        filename=cfname,
    ))
    val["allfiles"].update({
        "coords": cfname,
        "annotations": afname,
    })
    root.update({dataset_key: val})
def loop_queue(**kwargs):
    # list all datasets in firebase
    datasets = root.get()
    # create a list of users from all datasets
    users = set([v["userId"] for k, v in list(datasets.items())])
    force_reset_all = kwargs.get("force_reset_all", False)
    forced_resets = kwargs.get("force_resets_dataset", [])
    print(forced_resets)
    print(users)
    for u in users:
        # for each user, look at the list of all uploaded files,
        # searching for unique dataset ids
        fpath = os.path.join(DATAROOT, u)
        userfiles = os.listdir(fpath)
        dsre = re.compile(r"[^_](\d+)$")
        # identify all files which can be matched to a dataset
        filtered_sorted = sorted(
            [f for f in userfiles if dsre.search(f)],
            key=lambda x: dsre.search(x).group())
        # cycle through dataset file groups
        for dataset, g in it.groupby(filtered_sorted,
                                     lambda x: dsre.search(x).group()):
            print(dataset)
            # check firebase for a database record associated with the uploaded files
            matched = [(k, d) for k, d in datasets.items()
                       if d["dataset"] == dataset]
            if len(matched) == 1:
                k, d = matched[0]
                # check job status on the server
                if (not force_reset_all) and (d["dataset"] not in forced_resets) \
                        and (d["server_process_status"] == "COMPLETE"):
                    continue
                # process the upload if necessary
                print("PROCESSING")
                process_dataset(fpath, d, k, **kwargs)
            else:
                print(f"no matching firebase entry found for dataset files {dataset}")
            time.sleep(1)
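# Sketch of the on-disk layout loop_queue scans (entry names are hypothetical;
# what matters is that each name ends in the numeric dataset id so that dsre
# can key on the trailing digits):
#
#   DATAROOT/
#       user123/
#           coords_0042         <- grouped under dataset id "0042"
#           annotations_0042
#           coords_0043         <- grouped under dataset id "0043"
#
# Each group of trailing digits is matched against the `dataset` field of the
# firebase records; unmatched groups are reported and skipped.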
def process_dataset(gcs_folder, dataset, dataset_key, **kwargs):
    force_resets_step = kwargs.get("force_resets_step", [])
    force_resets_dataset = kwargs.get("force_resets_dataset", [])
    force_reset_all = kwargs.get("force_reset_all", False)
    reset_tmpfiles = kwargs.get("reset_tmpfiles")
    max_step = kwargs.get("max_step", None)

    # if processing has not yet started, begin
    print(f"processing job, {dataset_key}")

    # enumerate job names
    jobs = {
        "INIT_FRONTEND": "INIT_FRONTEND",
        "INIT_BLAT": "INIT_BLAT",
        "INIT_TOPHAT_TRANSCRIPTS": "INIT_TOPHAT_TRANSCRIPTS",  # csv mapping umis --> tx ids
        "INIT_GO_TERMS": "INIT_GO_TERMS",
        "INIT_TRANSCRIPTS_DATABASE": "INIT_TRANSCRIPTS_DATABASE",
        "INIT_XY_BUFFERS": "INIT_XY_BUFFERS",
        "INIT_COLOR_BUFFERS": "INIT_COLOR_BUFFERS",
        "INIT_DATABASE_FILES": "INIT_DATABASE_FILES",
        "INIT_DATASET_DATABASE": "INIT_DATASET_DATABASE",
        "INIT_SEGMENTATIONS": "INIT_SEGMENTATIONS",
        "INIT_POSTPROCESSING_FRONTEND": "INIT_POSTPROCESSING_FRONTEND",
        "INIT_POSTGIS": "INIT_POSTGIS",
    }
    # enumerate job statuses
    status = {
        "WAITING": "WAITING",
        "RUNNING": "RUNNING",
        "COMPLETE": "COMPLETE",
        "FAILED": "FAILED",
    }
    jobfuns = {
        "INIT_FRONTEND": init_frontend,
        "INIT_XY_BUFFERS": init_xy_buffers,
        "INIT_COLOR_BUFFERS": init_color_buffers,
        "INIT_DATABASE_FILES": init_database_files,
        "INIT_DATASET_DATABASE": init_dataset_database,
        "INIT_SEGMENTATIONS": init_segmentations,
        "INIT_POSTPROCESSING_FRONTEND": init_postprocessing_frontend,
        "INIT_POSTGIS": init_postgis,
        "INIT_TOPHAT_TRANSCRIPTS": init_tophat_transcripts,
        "INIT_GO_TERMS": init_go_terms,
        "INIT_BLAT": init_blat,
        "INIT_TRANSCRIPTS_DATABASE": init_transcripts_database,
    }

    # initialize job handling for this dataset
    val = root.get()[dataset_key]
    if "server_job_statuses" not in val:
        val.update(dict(server_job_statuses={}))
    if "server_job_progresses" not in val:
        val.update(dict(server_job_progresses={}))
    val["server_process_status"] = status["RUNNING"]
    root.update({dataset_key: val})

    for k, v in jobs.items():
        if force_reset_all:
            val["server_job_statuses"][v] = "WAITING"
            continue
        print(force_resets_dataset)
        if dataset["dataset"] in force_resets_dataset:
            if len(force_resets_step) > 0:
                print("checking")
                # reset this step if forced
                if k in force_resets_step:
                    print("resetting forced step for ", v)
                    val["server_job_statuses"][v] = "WAITING"
                    continue
            else:
                print("resetting all steps ", v, f"for {dataset['dataset']}")
                val["server_job_statuses"][v] = "WAITING"
                continue
        # reset this step if forced
        if v in force_resets_step:
            val["server_job_statuses"][v] = "WAITING"
            continue
        # check if the job is incomplete; if so, restart it unless it failed.
        # Otherwise, test if we should restart failed jobs
        if not val["server_job_statuses"].get(v, None) == status["COMPLETE"]:
            if RESTART_FAILED_JOBS or (
                    val["server_job_statuses"].get(v, None) != status["FAILED"]):
                val["server_job_statuses"][v] = status["WAITING"]
                val["server_job_progresses"][v] = 0
    root.update({dataset_key: val})
    val = None

    tmpdir = _create_tmpfiles(gcs_folder, dataset, reset_tmpfiles=reset_tmpfiles)
    dskey = dataset_key
    process_allafter = False
    # TODO: REPLACE ALL JOB QUEUEING WITH THIS LOOP
    for jobkey in [
            "INIT_FRONTEND",
            "INIT_XY_BUFFERS",
            "INIT_COLOR_BUFFERS",
            "INIT_DATABASE_FILES",
            "INIT_DATASET_DATABASE",  # must follow database files creation
            "INIT_SEGMENTATIONS",  # must follow database creation
            "INIT_POSTPROCESSING_FRONTEND",
            "INIT_POSTGIS",
            "INIT_TRANSCRIPTS_DATABASE",  # ONLY FOR WHOLE TXOME
            "INIT_BLAT",  # ONLY FOR WHOLE TXOME
            "INIT_TOPHAT_TRANSCRIPTS",  # ONLY FOR WHOLE TXOME
            "INIT_GO_TERMS",  # ONLY FOR WHOLE TXOME
    ]:
        # pass additional args (such as the storage key) if a job requires it
        kw = {"key": dskey} if jobkey in [
            "INIT_FRONTEND", "INIT_POSTPROCESSING_FRONTEND"
        ] else {}
        jobname = jobs[jobkey]
        if (get_jobstatus(dataset_key, jobname) == "WAITING") or process_allafter:
            process_allafter = True  # if we reset one job, rerun all jobs after it
            set_jobstatus(dataset_key, jobname, status["RUNNING"])
            try:
                output_status = jobfuns[jobname](tmpdir, dataset, **kw)
            except Exception:
                set_jobstatus(dataset_key, jobname, "FAILED")
                raise
            if output_status == 0:
                set_jobstatus(dataset_key, jobname, status["COMPLETE"])
            else:
                raise Exception(f"nonzero output status for {jobname}")
        if jobkey == max_step:
            print(f"Reached last step {max_step}, breaking")
            break

    # move any jobs still flagged RUNNING back to WAITING, then mark the
    # dataset's overall processing as COMPLETE
    val = root.get()[dataset_key]
    for k, v in val["server_job_statuses"].items():
        if v == "RUNNING":
            val["server_job_statuses"][k] = "WAITING"
    val["server_process_status"] = status["COMPLETE"]
    root.update({dskey: val})
    return
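# Example invocation (a sketch; the user directory, dataset id, and step names
# below are hypothetical) showing how a single dataset can be selectively
# reprocessed:
#
#   process_dataset(
#       os.path.join(DATAROOT, "user123"),
#       dataset,                                   # the firebase record (dict)
#       dataset_key,                               # its firebase key
#       force_resets_dataset=["0042"],             # only this dataset is reset
#       force_resets_step=["INIT_XY_BUFFERS"],     # reset only this job...
#       max_step="INIT_DATABASE_FILES",            # ...and stop after this one
#   )
#
# Because process_allafter is set once any job runs, every job that follows a
# reset job in the ordering is rerun as well.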
def set_jobstatus(dskey, name, value):
    val = root.get()[dskey]
    val["server_job_statuses"][name] = value
    root.update({dskey: val})
def get_jobstatus(dskey, name):
    val = root.get()[dskey]
    status = val["server_job_statuses"][name]
    return status
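# --- Hypothetical driver (not part of the original module) ---
# A minimal sketch of how this queue might be run, assuming firebase_admin has
# been initialized elsewhere and the module-level globals noted at the top of
# this file (root, storage, DATAROOT, RESTART_FAILED_JOBS, the init_* jobs and
# _create_tmpfiles) are available. The polling interval is an assumption.
if __name__ == "__main__":
    while True:
        # scan every user's uploads and (re)process any dataset whose
        # server_process_status is not COMPLETE
        loop_queue(force_reset_all=False,
                   force_resets_dataset=[],
                   force_resets_step=[])
        time.sleep(30)  # assumed polling interval between scans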