def upload_gzipped_files(files, dataset, dataset_key):
    bucket = storage.bucket()
    file_urls = {}
    print("uploading gzipped files")
    print(files)

    for nm, fpath in files.items():
        with open(fpath, "rb") as file_stream:
            # split "name.ext.gz" into "name" and "ext.gz"
            basename, _, extension = os.path.basename(fpath).partition(".")
            bucket_fn = (f"all_v2/website_datasets/{dataset['userId']}/"
                         f"{basename}_{dataset['dataset']}.{extension}")
            textblob = bucket.blob(bucket_fn)
            textblob.cache_control = 'no-cache'
            textblob.content_encoding = 'gzip'
            textblob.upload_from_string(
                file_stream.read(), content_type="application/octet-stream")
            textblob.reload()
            # fetch the public URL once; the response itself is not used
            requests.get(textblob.public_url)
            url = textblob.public_url

            if isinstance(url, six.binary_type):
                url = url.decode('utf-8')
            file_urls[nm + "_url"] = url
            print(url)

    val = root.get()[dataset_key]
    print(val)
    val.update(file_urls)
    root.update({dataset_key: val})
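These examples are excerpts from a single module and rely on module-level imports and globals (root, storage, DATAROOT, RESTART_FAILED_JOBS, and the various init_* job functions) that are not shown. A minimal setup sketch, assuming the Firebase Admin SDK with a Realtime Database reference and a default Cloud Storage bucket; the credential path, database URL, bucket name, reference path, and local data root below are placeholders:

# Hypothetical module-level setup assumed by these examples; all concrete
# values below are placeholders, not taken from the original project.
import os
import re
import time
import itertools as it

import requests
import six
import firebase_admin
from firebase_admin import credentials, db, storage

cred = credentials.Certificate("serviceAccountKey.json")
firebase_admin.initialize_app(cred, {
    "databaseURL": "https://example-project.firebaseio.com",
    "storageBucket": "example-project.appspot.com",
})

root = db.reference("website_datasets")   # Realtime Database node holding dataset records
DATAROOT = "/data/uploads"                # local folder containing per-user uploads
RESTART_FAILED_JOBS = False               # whether previously FAILED jobs should be requeued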
Example #2
def upload_frontend_files(files,dataset, dataset_key):
    bucket = storage.bucket()

    urls = []
    for nm, fpath in files.items():
        with open(fpath, "rb") as file_stream:
            # split "name.ext.gz" into "name" and "ext.gz"
            basename, _, extension = os.path.basename(fpath).partition(".")
            bucket_fn = (f"all_v2/website_datasets/{dataset['userId']}/"
                         f"{basename}_{dataset['dataset']}.{extension}")
            textblob = bucket.blob(bucket_fn)
            textblob.cache_control = 'no-cache'
            textblob.content_encoding = 'gzip'
            textblob.upload_from_string(
                file_stream.read(), content_type="application/octet-stream")
            textblob.reload()
            # fetch the public URL once; the response itself is not used
            requests.get(textblob.public_url)
            url = textblob.public_url

            if isinstance(url, six.binary_type):
                url = url.decode('utf-8')
            urls.append(url)

            print(url)

            # assumes `files` contains both an "annotation" file and a "coords"
            # file; otherwise the update below raises a NameError
            if "annotation" in basename:
                annotations_url = url
                afname = bucket_fn
            elif "coords" in basename:
                download_url = url
                cfname = bucket_fn


    val = root.get()[dataset_key]

    val.update(dict(
        annotations_url=annotations_url,
        downloadUrl=download_url,
        filename=cfname,
    ))
    val["allfiles"].update({
        "coords": cfname,
        "annotations": afname,
    })

    root.update({dataset_key:val})
Example #3
def loop_queue(**kwargs):
    #list all datasets in firebase
    datasets = root.get()
    #create the set of users owning the datasets
    users = {v["userId"] for v in datasets.values()}

    force_reset_all = kwargs.get("force_reset_all", False)
    forced_resets = kwargs.get("force_resets_dataset", [])
    print(forced_resets)

    print(users)
    for u in users:
        #for each user, look at the list of all uploaded files, searching for unique dataset ids
        fpath = os.path.join(DATAROOT, u)
        userfiles = os.listdir(fpath)
        dsre = re.compile(r"[^_](\d+)$")

        #identify all files which can be matched to a dataset
        filtered_sorted = sorted([f for f in userfiles if dsre.search(f)],
                                 key=lambda x: dsre.search(x).group())
        #cycle through dataset file groups

        for dataset, g in it.groupby(filtered_sorted,
                                     lambda x: dsre.search(x).group()):

            print(dataset)
            #check firebase for a database record associated with the uploaded files
            matched = [(k, d) for k, d in datasets.items()
                       if d["dataset"] == dataset]
            if len(matched) == 1:
                k, d = matched[0]
                #check job status on the server

                if (not force_reset_all
                        and d["dataset"] not in forced_resets
                        and d["server_process_status"] == "COMPLETE"):
                    continue
                #process the upload if necessary
                print("PROCESSING")
                process_dataset(fpath, d, k, **kwargs)
            else:
                print(
                    f"no matching firebase entry found for dataset files {dataset}"
                )
    time.sleep(1)
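A hedged sketch of how loop_queue might be driven as a polling worker; the dataset id and step name are placeholders, and the keyword names mirror the kwargs read above and in process_dataset:

# Hypothetical driver loop; dataset id and step name are placeholders.
if __name__ == "__main__":
    while True:
        loop_queue(
            force_reset_all=False,
            force_resets_dataset=["12345"],          # datasets whose steps should be re-run
            force_resets_step=["INIT_XY_BUFFERS"],   # individual steps to reset
            reset_tmpfiles=False,
            max_step=None,                           # stop after this step, if set
        )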
Example #4
def process_dataset(gcs_folder, dataset, dataset_key, **kwargs):

    force_resets_step = kwargs.get("force_resets_step", [])
    force_resets_dataset = kwargs.get("force_resets_dataset", [])
    force_reset_all = kwargs.get("force_reset_all", False)
    reset_tmpfiles = kwargs.get("reset_tmpfiles")
    max_step = kwargs.get("max_step", None)

    # note: this initial status read is shadowed by the `status` table below
    # and is effectively unused; processing always (re)starts from this point
    status = dataset["server_process_status"]

    print(f"processing job, {dataset_key}")

    #enumerate job names
    jobs = {
        "INIT_FRONTEND": "INIT_FRONTEND",
        "INIT_BLAT": "INIT_BLAT",
        "INIT_TOPHAT_TRANSCRIPTS":
        "INIT_TOPHAT_TRANSCRIPTS",  #csv having umis--> tx ids
        "INIT_GO_TERMS": "INIT_GO_TERMS",
        "INIT_TRANSCRIPTS_DATABASE": "INIT_TRANSCRIPTS_DATABASE",
        "INIT_XY_BUFFERS": "INIT_XY_BUFFERS",
        "INIT_COLOR_BUFFERS": "INIT_COLOR_BUFFERS",
        "INIT_DATABASE_FILES": "INIT_DATABASE_FILES",
        "INIT_DATASET_DATABASE": "INIT_DATASET_DATABASE",
        "INIT_SEGMENTATIONS": "INIT_SEGMENTATIONS",
        "INIT_POSTPROCESSING_FRONTEND": "INIT_POSTPROCESSING_FRONTEND",
        "INIT_POSTGIS": "INIT_POSTGIS",
    }
    #enumerate job statuses
    status = {
        "WAITING": "WAITING",
        "RUNNING": "RUNNING",
        "COMPLETE": "COMPLETE",
        "FAILED": "FAILED",
    }

    jobfuns = {
        "INIT_FRONTEND": init_frontend,
        "INIT_XY_BUFFERS": init_xy_buffers,
        "INIT_COLOR_BUFFERS": init_color_buffers,
        "INIT_DATABASE_FILES": init_database_files,
        "INIT_DATASET_DATABASE": init_dataset_database,
        "INIT_SEGMENTATIONS": init_segmentations,
        "INIT_POSTPROCESSING_FRONTEND": init_postprocessing_frontend,
        "INIT_POSTGIS": init_postgis,
        "INIT_TOPHAT_TRANSCRIPTS": init_tophat_transcripts,
        "INIT_GO_TERMS": init_go_terms,
        "INIT_BLAT": init_blat,
        "INIT_TRANSCRIPTS_DATABASE": init_transcripts_database,
    }

    #initialize job handling for this dataset
    val = root.get()[dataset_key]
    if not "server_job_statuses" in val:
        val.update(dict(server_job_statuses={}))

    if not "server_job_progresses" in val:
        val.update(dict(server_job_progresses={}))

    val["server_process_status"] = status["RUNNING"]
    root.update({dataset_key: val})

    for k, v in jobs.items():
        if force_reset_all:
            val["server_job_statuses"][v] = "WAITING"
            continue

        print(force_resets_dataset)
        if dataset["dataset"] in force_resets_dataset:

            if len(force_resets_step) > 0:
                print("checking")
                #reset this step if forced
                if k in force_resets_step:
                    print("resetting ", v)
                    print("resetting forced step for ", v)
                    val["server_job_statuses"][v] = "WAITING"
                    continue
            else:
                print("resetting all steps ", v, f"for {dataset['dataset']}")
                val["server_job_statuses"][v] = "WAITING"
                continue

        #reset this step if forced
        if v in force_resets_step:
            val["server_job_statuses"][v] = "WAITING"
            continue

        #check if the job is incomplete
        if val["server_job_statuses"].get(v) != status["COMPLETE"]:
            #if so and it has not failed, queue it again; failed jobs are only
            #requeued when RESTART_FAILED_JOBS is set
            if RESTART_FAILED_JOBS or (val["server_job_statuses"].get(v)
                                       != status["FAILED"]):
                val["server_job_statuses"][v] = status["WAITING"]
                val["server_job_progresses"][v] = 0

    root.update({dataset_key: val})
    val = None

    tmpdir = _create_tmpfiles(gcs_folder,
                              dataset,
                              reset_tmpfiles=reset_tmpfiles)
    dskey = dataset_key

    process_allafter = False

    #TODO: REPLACE ALL JOB QUEUEING WITH THIS LOOP
    for jobkey in [
            "INIT_FRONTEND",
            "INIT_XY_BUFFERS",
            "INIT_COLOR_BUFFERS",
            "INIT_DATABASE_FILES",
            "INIT_DATASET_DATABASE",  #must follow database files creation
            "INIT_SEGMENTATIONS",  #must follow database creation
            "INIT_POSTPROCESSING_FRONTEND",
            "INIT_POSTGIS",
            "INIT_TRANSCRIPTS_DATABASE",  # ONLY FOR WHOLE TXOME
            "INIT_BLAT",  # ONLY FOR WHOLE TXOME
            "INIT_TOPHAT_TRANSCRIPTS",  # ONLY FOR WHOLE TXOME
            "INIT_GO_TERMS",  # ONLY FOR WHOLE TXOME
    ]:

        #pass additional args (such as the storage key)
        #if a job requires it
        kw = ({"key": dskey}
              if jobkey in ("INIT_FRONTEND", "INIT_POSTPROCESSING_FRONTEND")
              else {})

        jobname = jobs[jobkey]
        if get_jobstatus(dataset_key, jobname) == "WAITING" or process_allafter:

            # if one step is re-run, re-run every step after it as well
            process_allafter = True
            set_jobstatus(dataset_key, jobname, status["RUNNING"])
            try:
                output_status = jobfuns[jobname](tmpdir, dataset, **kw)
            except Exception as e:
                set_jobstatus(dataset_key, jobname, "FAILED")
                raise
            if output_status == 0:
                set_jobstatus(dataset_key, jobname, status["COMPLETE"])

            else:
                raise Exception(f"nonzero output status for {jobname}")

        if jobkey == max_step:
            print(f"Surpassing last step {max_step}, breaking")
            break

    val = root.get()[dataset_key]
    for k, v in val["server_job_statuses"].items():
        if v == "RUNNING":
            val["server_job_statuses"][k] = "WAITING"

    val["server_process_status"] = status["COMPLETE"]
    # dskey and dataset_key refer to the same record, so one update suffices
    root.update({dataset_key: val})

    return
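The loop above assumes that every handler in jobfuns accepts the temporary directory and the dataset record (plus an optional key for the two frontend steps) and returns 0 on success; a minimal hypothetical handler illustrating that contract:

def init_example_step(tmpdir, dataset, **kw):
    # hypothetical job handler: perform the work for one step, then return 0
    # so process_dataset marks the step COMPLETE (anything else raises)
    print(f"running example step for dataset {dataset['dataset']} in {tmpdir}")
    return 0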
Example #5
def set_jobstatus(dskey, name, value):
    val = root.get()[dskey]
    val["server_job_statuses"][name] = value
    root.update({dskey: val})
Example #6
def get_jobstatus(dskey, name):
    val = root.get()[dskey]
    status = val["server_job_statuses"][name]
    return status
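Both helpers read and rewrite the whole dataset record for every status change. A hypothetical narrower variant, assuming the same root reference, could update only the nested status field via a child path:

def set_jobstatus_nested(dskey, name, value):
    # hypothetical alternative: write only the nested status key instead of
    # re-uploading the entire dataset record
    root.child(dskey).child("server_job_statuses").update({name: value})


def get_jobstatus_nested(dskey, name):
    return root.child(dskey).child("server_job_statuses").child(name).get()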