Example 1
    def deserialize_results(self, dir, loc):
        """Deserialize to locaiton on disk.

        Note: only support gs:// endpoints.
        """

        # strip gs prefix if it exists
        if dir.startswith("gs://"):
            dir = dir[5:]
        dir_path = dir.split("/")

        # grab bucket name
        bucket_name = dir_path[0]

        # get path
        path = "/".join(dir_path[1:])
        if path[-1] != "/":
            path += "/"
        path += loc

        ghook = GoogleCloudStorageHook()  # uses default gcp connection
        client = ghook.get_conn()
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(blob_name=path)
        # raises error if not found
        return blob.download_as_string().decode()
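
The gs:// parsing above can be factored into a pure helper and checked without touching GCS. This is an illustrative sketch only; split_gs_path is our name, not part of the example.

    def split_gs_path(dir, loc):
        """Split a gs:// directory plus file name into (bucket, blob_name)."""
        if dir.startswith("gs://"):
            dir = dir[5:]
        bucket_name, _, path = dir.partition("/")
        if path and not path.endswith("/"):
            path += "/"
        return bucket_name, path + loc

    print(split_gs_path("gs://my-bucket/results/run1", "status.json"))
    # -> ('my-bucket', 'results/run1/status.json')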
Example 2
    def write_status(**context):
        # disabled in test mode
        if not TEST_MODE:
            # write config and time stamp
            ghook = GoogleCloudStorageHook()  # uses default gcp connection
            client = ghook.get_conn()
            source = context["dag_run"].conf.get("source")
            bucket = client.bucket(source + "_process")
            blob = bucket.blob(
                blob_name=f"{context['dag_run'].run_id}/complete.json")
            project_id = context["dag_run"].conf.get("project_id")

            data = context["dag_run"].conf
            data["execution_date"] = str(context.get("execution_date"))
            data = json.dumps(data)
            blob.upload_from_string(data)
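
Outside Airflow, the blob name and JSON payload written by this task can be previewed with plain Python; the run id, conf, and execution date below are made-up placeholders, not values from the example.

    import json

    # hypothetical stand-ins for context["dag_run"].run_id / .conf and execution_date
    run_id = "manual__2021-01-01T00:00:00"
    conf = {"source": "mybucket", "project_id": "my-project"}

    blob_name = f"{run_id}/complete.json"   # uploaded into the <source>_process bucket
    payload = dict(conf, execution_date="2021-01-01T00:00:00+00:00")
    print(blob_name, json.dumps(payload))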
Example 3
    def execute(self, context):
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        service = hook.get_conn()
        answer = False
        counter = 0
        while True:
            pageToken = None
            while True:
                response = service.objects().list(
                    bucket=self.bucket,
                    pageToken=pageToken,
                    prefix=self.prefix).execute()

                if 'items' not in response:
                    print("No items found for prefix: " + self.prefix)
                    break

                if len(response['items']) > int(self.number):
                    answer = True

                for item in response['items']:
                    if item and 'name' in item:
                        print(item['name'])

                if 'nextPageToken' not in response:
                    # no further pages of results, so stop the loop
                    break

                pageToken = response['nextPageToken']
                if not pageToken:
                    # empty next page token
                    break

            if answer:
                print("files exist, move to the next step")
                return
            else:
                print("files do not exists.  Waiting...")
                time.sleep(120)

            counter = counter + 1
            if counter > 30:
                print("Files were not created after 1 hour.  Tomeouting...")
                return
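
The operator above is essentially a poll-and-timeout loop: list objects, sleep 120 seconds, give up after 30 rounds (about an hour). A generic, standalone version of that pattern (the helper name and the check callable are ours) might look like:

    import time

    def poll_until(check, interval_sec=120, max_attempts=30):
        """Call check() until it returns True or the attempt budget is spent."""
        for _ in range(max_attempts):
            if check():
                return True
            time.sleep(interval_sec)
        return False

    # e.g. poll_until(lambda: count_matching_objects() > expected_number)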
Example 4
    def create_env(run_id, **context):
        """Run id should be some random UUID.
        """

        ghook = GoogleCloudStorageHook()  # uses default gcp connection
        bucket_name = context["dag_run"].conf.get('source')
        project_id = context["dag_run"].conf.get("project_id")
        if not TEST_MODE:
            """
            # _process bucket could already exist
            try:
                subprocess.check_output([f"gsutil mb -p {project_id} -l US-EAST4 -b on gs://{bucket_name + '_process'}"], shell=True).decode()
            except Exception:
                pass

            # other buckets should not have been created before

            # this data can be used for chunk-based image processing)
            try:
                subprocess.check_output([f"gsutil mb -p {project_id} -l US-EAST4 -b on gs://{bucket_name + '_chunk_' + run_id}"], shell=True).decode()
            except Exception:
                pass

            # will be auto deleted
            try:
                subprocess.check_output([f"gsutil mb -p {project_id} -l US-EAST4 -b on gs://{bucket_name + '_tmp_' + run_id}"], shell=True).decode()
            except Exception:
                pass

            # will be made public readable
            try:
                subprocess.check_output([f"gsutil mb -p {project_id} -l US-EAST4 -b on gs://{bucket_name + '_ng_' + run_id}"], shell=True).decode()
            except Exception:
                pass
            """

            # interface does not support enabling uniform IAM.
            # create bucket for configs (ignore if it already exists)
            try:
                ghook.create_bucket(bucket_name=bucket_name + "_process",
                                    project_id=project_id,
                                    storage_class="REGIONAL",
                                    location="US-EAST4")
            except AirflowException as e:
                # ignore the error if the bucket already exists
                if not str(e).startswith("409"):
                    raise

            # other buckets should not have been created before

            # this data can be used for chunk-based image processing
            ghook.create_bucket(bucket_name=bucket_name + "_chunk_" + run_id,
                                project_id=project_id,
                                storage_class="REGIONAL",
                                location="US-EAST4")

            # will be auto deleted
            ghook.create_bucket(
                bucket_name=bucket_name + "_tmp_" + run_id,
                project_id=project_id
            )  #, storage_class="REGIONAL", location="US-EAST4")

            # will be made public readable
            ghook.create_bucket(bucket_name=bucket_name + "_ng_" + run_id,
                                project_id=project_id,
                                storage_class="REGIONAL",
                                location="US-EAST4")

            # dump configuration
            client = ghook.get_conn()
            source = context["dag_run"].conf.get("source")
            bucket = client.bucket(source + "_process")
            blob = bucket.blob(
                blob_name=f"{context['dag_run'].run_id}/init.json")

            data = context["dag_run"].conf
            data["execution_date"] = str(context.get("execution_date"))
            data = json.dumps(data)
            blob.upload_from_string(data)
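
The 409 handling used for the _process bucket can be wrapped in a small helper so every bucket creation becomes idempotent. A sketch reusing the same hook call as the example, assuming Airflow 1.x contrib import paths:

    from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
    from airflow.exceptions import AirflowException

    def create_bucket_if_missing(bucket_name, project_id):
        """Create a bucket, ignoring the 409 error raised when it already exists."""
        ghook = GoogleCloudStorageHook()  # uses default gcp connection
        try:
            ghook.create_bucket(bucket_name=bucket_name,
                                project_id=project_id,
                                storage_class="REGIONAL",
                                location="US-EAST4")
        except AirflowException as e:
            if not str(e).startswith("409"):
                raise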
Example 5
    def execute(self, context):

        with NamedTemporaryFile("w") as tmp:

            # Load the SalesforceHook
            hook = SalesforceHook(conn_id=self.sf_conn_id, output=tmp.name)

            # Attempt to login to Salesforce.
            # If this fails, raise the error and die here rather than
            # continuing with an undefined connection.
            try:
                sf_conn = hook.sign_in()
            except Exception:
                logging.error('Unable to login.')
                raise

            logging.info(self.soql)
            logging.info(self.object)

            logging.debug('Connecting to Salesforce...')
            query_results = sf_conn.bulk.__getattr__(self.object).query(
                self.soql)
            logging.info('Retrieved results...')

            logging.info(type(query_results))
            logging.info('First line is:')
            logging.info(query_results[0])

            gcs = GoogleCloudStorageHook(self.gcs_conn_id)
            service = gcs.get_conn()

            logging.info('Preparing File...')

            intermediate_arr = []

            for i, q in enumerate(query_results):

                del q['attributes']
                q["partition_date"] = date.today().strftime('%Y-%m-%d')

                for k, v in q.items():

                    if (type(v) == float):
                        q[k] = round(v, 2)
                    if (type(v) == int) and (len(str(v)) == 13):
                        q[k] = datetime.fromtimestamp(
                            v / 1000).strftime('%Y-%m-%d %H:%M:%S')
                    if (type(v) == str) and (re.search(r"^(\d+\.\d+)$", v) !=
                                             None):
                        q[k] = round(float(v), 2)

                # copy the key list so keys can be lowercased while mutating the dict
                for key in list(q.keys()):

                    q[key.lower()] = q.pop(key)

                query = json.dumps(q, ensure_ascii=False)
                intermediate_arr.append(query + '\n')
                del query

                if i % 100 == 0:
                    tmp.file.writelines(intermediate_arr)
                    intermediate_arr = []

                    #tmp.file.write(str(query+'\n'))
            tmp.file.writelines(intermediate_arr)

            #            tmp.file.flush()

            logging.info('Loading results to GCS...')

            self.upload(service=service,
                        bucket=self.gcs_bucket,
                        filename=tmp.name,
                        object=self.gcs_object,
                        multipart=True,
                        num_retries=2)

            tmp.close()

        logging.info("Query finished!")
Example 6
    def writeslice_worker(worker_id, num_workers, data, **context):
        minz = int(data["minz"])
        maxz = int(data["maxz"])
        dest = data["dest"]
        image = data["image"]
        collect_id = data["collect_id"]
        dest_tmp = data["dest-tmp"]
        shard_size = data["shard-size"]
        bucket_name = data["bucket_name"]

        bbox_val = json.dumps(context["task_instance"].xcom_pull(
            task_ids=collect_id, key="bbox"))
        bbox = json.loads(bbox_val)

        transform_vals = {}
        # fetch data from google storage
        if not TEST_MODE:
            ghook = GoogleCloudStorageHook()  # uses default gcp connection
            client = ghook.get_conn()
            bucket = client.bucket(bucket_name + "_process")
            blob = bucket.blob(
                blob_name=f"{context['dag_run'].run_id}/align/transforms.json")
            trans_str = blob.download_as_string().decode()
            transform_vals = json.loads(trans_str)

        task_list = []
        for slice in range(minz, maxz + 1):
            if (slice % num_workers) == worker_id:
                if TEST_MODE:
                    # slow with many slices
                    transform_val = json.dumps(
                        context["task_instance"].xcom_pull(task_ids=collect_id,
                                                           key=str(slice)))
                else:
                    transform_val = json.dumps(transform_vals[str(slice)])

                params = {
                    "img": image % slice,
                    "transform": transform_val,
                    "bbox": bbox_val,
                    "dest-tmp": dest_tmp,
                    "slice": slice,
                    "shard-size": shard_size,
                    "dest": dest,
                    "run_id": context["dag_run"].run_id
                }
                task_list.append([f"{slice}", params])
                """
                # split into super tiles of 8192 
                for stx in range(0, bbox[0], 8192):
                    for sty in range(0, bbox[1], 8192): 
                        params = {
                                "img": image % slice,
                                "transform": transform_val, 
                                "bbox": bbox_val, 
                                "dest-tmp": dest_tmp,
                                "slice": slice,
                                "shard-size": shard_size,
                                "super-tile-chunk": [stx//8192, sty//8192],
                                "dest": dest
                        }        
                        task_list.append([f"{slice}_{stx}_{sty}", params])
                """
        return task_list
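
Each worker claims the z-slices whose index equals its id modulo the worker count. A minimal standalone illustration of that round-robin partitioning (names are illustrative):

    def slices_for_worker(worker_id, num_workers, minz, maxz):
        """Return the z-slices this worker is responsible for (round-robin)."""
        return [z for z in range(minz, maxz + 1) if z % num_workers == worker_id]

    # with 3 workers over slices 0..7:
    #   worker 0 -> [0, 3, 6], worker 1 -> [1, 4, 7], worker 2 -> [2, 5]
    print(slices_for_worker(0, 3, 0, 7))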
Example 7
    def collect_affine(temp_location, bucket_name, **context):
        """Create transform arrays for each image and global bbox.

        Note: the computation is very straightforward matrix multiplication.  No
        need to use a docker image.
        """

        source = context["dag_run"].conf.get("source") + "_process"
        image = context["dag_run"].conf.get("image")
        minz = context["dag_run"].conf.get("minz")
        maxz = context["dag_run"].conf.get("maxz")
        downsample_factor = context["dag_run"].conf.get("downsample_factor", 1)
        project_id = context["dag_run"].conf.get("project_id")

        def calculate_transform(x, y, affine):
            """Apply transform to a point.
            """
            x1 = affine[0] * x + affine[2] * y + affine[4]
            y1 = affine[1] * x + affine[3] * y + affine[5]
            return (round(x1), round(y1))

        def process_results(res):
            """Determines whether affine or translation is used.
            The transform is also adjusted for a top-left origin
            and reorders the paramters to list col1, col2, and col3.
            """

            # default no-op
            affine = [1, 0, 0, 1, 0, 0]
            translation = [1, 0, 0, 1, 0, 0]

            width = res["width"] * downsample_factor
            height = res["height"] * downsample_factor

            width0 = res["width0"] * downsample_factor
            height0 = res["height0"] * downsample_factor

            def adjust_trans(trans):
                """Flips Y and moves origin to top left.
                """
                #trans = np.array(trans)
                #trans *= downsample_factor

                trans[4] *= downsample_factor
                trans[5] *= downsample_factor
                dx = trans[4] - (1 -
                                 trans[0]) * width / 2 + trans[2] * height / 2
                dy = trans[5] - (1 -
                                 trans[3]) * height / 2 + trans[1] * width / 2
                return [trans[0], -trans[2], -trans[1], trans[3], dx, dy]

            affine = adjust_trans(res["affine"])
            translation = adjust_trans(res["translation"])

            # use translation coefficients if the image is rotated less than 0.5 percent
            if affine[2] <= 0.0008:
                affine = translation
            return affine, [width, height], [width0, height0]

        # read each transform and create global coordinate system
        # (note: each transform is applied to the n+1 slice; image sizes are assumed to have identical dims)
        last_affine = [1, 0, 0, 1, 0, 0]
        transforms = [[1, 0, 0, 1, 0, 0]]

        # store current bbox x range and y range and find max
        bbox = None
        global_bbox = None

        all_results = {}
        ghook = GoogleCloudStorageHook()  # uses default gcp connection
        client = ghook.get_conn()
        bucket = client.bucket(bucket_name)

        for worker_id in range(0, NUM_WORKERS):
            blob = bucket.blob(
                blob_name=
                f"{context['dag_run'].run_id}/align/affine_cache/worker-{worker_id}"
            )
            # raises error if not found
            res = json.loads(blob.download_as_string().decode())

            #res = context['task_instance'].xcom_pull(task_ids=f"{name}.affine_{worker_id}")
            all_results.update(res)

        for slice in range(minz, maxz):
            res = json.loads(all_results[str(slice)])
            # affine has already been modified to treat top-left of image as origin

            # process results
            curr_affine, bbox, bbox0 = process_results(res)

            # get bbox
            if slice == minz:
                global_bbox = [0, bbox0[0], 0, bbox0[1]]
            """
            if bbox[0] > global_bbox[1]:
                global_bbox[1] = bbox[0]

            if bbox[1] > global_bbox[3]:
                global_bbox[3] = bbox[1]
            """

            # multiply matrices
            mod_affine = []
            mod_affine.append(last_affine[0] * curr_affine[0] +
                              last_affine[2] * curr_affine[1])
            mod_affine.append(last_affine[1] * curr_affine[0] +
                              last_affine[3] * curr_affine[1])

            mod_affine.append(last_affine[0] * curr_affine[2] +
                              last_affine[2] * curr_affine[3])
            mod_affine.append(last_affine[1] * curr_affine[2] +
                              last_affine[3] * curr_affine[3])

            mod_affine.append(last_affine[0] * curr_affine[4] +
                              last_affine[2] * curr_affine[5] + last_affine[4])
            mod_affine.append(last_affine[1] * curr_affine[4] +
                              last_affine[3] * curr_affine[5] + last_affine[5])

            last_affine = mod_affine
            # add affine to list
            transforms.append(mod_affine)

            # check corners to find bbox
            shift1 = calculate_transform(0, 0, mod_affine)
            shift2 = calculate_transform(0, bbox[1], mod_affine)
            shift3 = calculate_transform(bbox[0], 0, mod_affine)
            shift4 = calculate_transform(bbox[0], bbox[1], mod_affine)
            xmin = min(shift1[0], shift2[0], shift3[0], shift4[0])
            xmax = max(shift1[0], shift2[0], shift3[0], shift4[0])
            ymin = min(shift1[1], shift2[1], shift3[1], shift4[1])
            ymax = max(shift1[1], shift2[1], shift3[1], shift4[1])
            if xmin < global_bbox[0]:
                global_bbox[0] = xmin
            if ymin < global_bbox[2]:
                global_bbox[2] = ymin
            if xmax > global_bbox[1]:
                global_bbox[1] = xmax
            if ymax > global_bbox[3]:
                global_bbox[3] = ymax

        # push results for each image and create csv of transforms
        affines_csv = ""
        transforms_out = {}
        for slice in range(minz, maxz + 1):
            curr_affine = transforms[slice - minz]
            curr_affine[4] = curr_affine[4] - global_bbox[0]  # shift by min x
            curr_affine[5] = curr_affine[5] - global_bbox[2]  # shift by min y
            transforms_out[slice] = curr_affine

            if TEST_MODE:  # sending the data via xcom is slow when there are a lot of slices
                context['task_instance'].xcom_push(key=f"{slice}",
                                                   value=curr_affine)
            affines_csv += f"{slice} , '{curr_affine}'\n"

        logging.info(
            [global_bbox[1] - global_bbox[0], global_bbox[3] - global_bbox[2]])
        # push bbox for new image size
        context['task_instance'].xcom_push(key="bbox",
                                           value=[
                                               global_bbox[1] - global_bbox[0],
                                               global_bbox[3] - global_bbox[2]
                                           ])

        # disabled in test mode
        if not TEST_MODE:
            # write transforms to align/transforms.csv
            ghook = GoogleCloudStorageHook()  # uses default gcp connection
            client = ghook.get_conn()
            bucket = client.bucket(source)
            blob = bucket.blob(
                blob_name=f"{context['dag_run'].run_id}/align/transforms.csv")
            blob.upload_from_string(affines_csv)

            # write the json parseable transforms to align/transforms.json
            blob = bucket.blob(
                blob_name=f"{context['dag_run'].run_id}/align/transforms.json")
            blob.upload_from_string(json.dumps(transforms_out))
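
The running composition of per-slice transforms is an ordinary 2x3 affine product in the [a, b, c, d, tx, ty] layout used here (x' = a*x + c*y + tx, y' = b*x + d*y + ty). A standalone sketch of that step, with helper names of our own:

    def apply_affine(affine, x, y):
        """Apply a [a, b, c, d, tx, ty] affine to a point."""
        return (affine[0] * x + affine[2] * y + affine[4],
                affine[1] * x + affine[3] * y + affine[5])

    def compose_affine(last, curr):
        """Compose two affines so the result applies curr first, then last."""
        return [last[0] * curr[0] + last[2] * curr[1],
                last[1] * curr[0] + last[3] * curr[1],
                last[0] * curr[2] + last[2] * curr[3],
                last[1] * curr[2] + last[3] * curr[3],
                last[0] * curr[4] + last[2] * curr[5] + last[4],
                last[1] * curr[4] + last[3] * curr[5] + last[5]]

    # sanity check: composing then applying equals applying in sequence
    A = [1, 0, 0, 1, 10, 5]    # translation by (10, 5)
    B = [0, 1, -1, 0, 0, 0]    # 90-degree rotation
    assert apply_affine(compose_affine(A, B), 2, 3) == apply_affine(A, *apply_affine(B, 2, 3))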