def deserialize_results(self, dir, loc):
    """Deserialize results from a location in storage.

    Note: only supports gs:// endpoints.
    """
    # strip the gs:// prefix if it exists
    if dir.startswith("gs://"):
        dir = dir[5:]
    dir_path = dir.split("/")

    # grab bucket name
    bucket_name = dir_path[0]

    # get path
    path = "/".join(dir_path[1:])
    if path[-1] != "/":
        path += "/"
    path += loc

    ghook = GoogleCloudStorageHook()  # uses default gcp connection
    client = ghook.get_conn()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name=path)

    # raises error if not found
    return blob.download_as_string().decode()
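
# Pure-string sketch of the path handling above (hypothetical helper name);
# no GCS access is involved, so the parsing can be sanity-checked locally.
def _split_gs_path(dir, loc):
    if dir.startswith("gs://"):
        dir = dir[5:]
    parts = dir.split("/")
    path = "/".join(parts[1:])
    if not path.endswith("/"):
        path += "/"
    return parts[0], path + loc

# e.g. _split_gs_path("gs://my-bucket/run42/align", "transforms.json")
# == ("my-bucket", "run42/align/transforms.json")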

def write_status(**context):
    # disabled in test mode
    if not TEST_MODE:
        # write config and time stamp
        ghook = GoogleCloudStorageHook()  # uses default gcp connection
        client = ghook.get_conn()
        source = context["dag_run"].conf.get("source")
        bucket = client.bucket(source + "_process")
        blob = bucket.blob(
            blob_name=f"{context['dag_run'].run_id}/complete.json")
        project_id = context["dag_run"].conf.get("project_id")

        data = context["dag_run"].conf
        data["execution_date"] = str(context.get("execution_date"))
        data = json.dumps(data)
        blob.upload_from_string(data)
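
# Minimal polling sketch for the marker written above (hypothetical helper
# name); blob.exists() is a standard google-cloud-storage call, so a
# downstream job can watch for <source>_process/<run_id>/complete.json.
def status_written(client, source, run_id):
    bucket = client.bucket(source + "_process")
    return bucket.blob(f"{run_id}/complete.json").exists()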

def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    service = hook.get_conn()

    answer = False
    counter = 0
    while True:
        pageToken = None
        while True:
            response = service.objects().list(
                bucket=self.bucket,
                pageToken=pageToken,
                prefix=self.prefix).execute()
            if 'items' not in response:
                print("No items found for prefix: " + self.prefix)
                break
            if len(response['items']) > int(self.number):
                answer = True
            for item in response['items']:
                if item and 'name' in item:
                    print(item['name'])
            if 'nextPageToken' not in response:
                # no further pages of results, so stop the loop
                break
            pageToken = response['nextPageToken']
            if not pageToken:
                # empty next page token
                break

        if answer:
            print("files exist, move to the next step")
            return
        else:
            print("files do not exist. Waiting...")
            time.sleep(120)
            counter = counter + 1
            if counter > 30:
                print("Files were not created after 1 hour. Timing out...")
                return
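
# Minimal sketch of the pagination pattern above, pulled out as a hypothetical
# generator; the GCS JSON API returns listings in pages linked by
# nextPageToken, and iteration stops once no token is returned.
def list_all_objects(service, bucket, prefix):
    page_token = None
    while True:
        response = service.objects().list(
            bucket=bucket, pageToken=page_token, prefix=prefix).execute()
        for item in response.get('items', []):
            yield item
        page_token = response.get('nextPageToken')
        if not page_token:
            break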

def create_env(run_id, **context):
    """Run id should be some random UUID.
    """
    ghook = GoogleCloudStorageHook()  # uses default gcp connection
    bucket_name = context["dag_run"].conf.get('source')
    project_id = context["dag_run"].conf.get("project_id")

    if not TEST_MODE:
        """
        # _process bucket could already exist
        try:
            subprocess.check_output([f"gsutil mb -p {project_id} -l US-EAST4 -b on gs://{bucket_name + '_process'}"], shell=True).decode()
        except Exception:
            pass

        # other buckets should not have been created before
        # (this data can be used for chunk-based image processing)
        try:
            subprocess.check_output([f"gsutil mb -p {project_id} -l US-EAST4 -b on gs://{bucket_name + '_chunk_' + run_id}"], shell=True).decode()
        except Exception:
            pass

        # will be auto deleted
        try:
            subprocess.check_output([f"gsutil mb -p {project_id} -l US-EAST4 -b on gs://{bucket_name + '_tmp_' + run_id}"], shell=True).decode()
        except Exception:
            pass

        # will be made public readable
        try:
            subprocess.check_output([f"gsutil mb -p {project_id} -l US-EAST4 -b on gs://{bucket_name + '_ng_' + run_id}"], shell=True).decode()
        except Exception:
            pass
        """

        # interface does not support enabling uniform IAM

        # create bucket for configs (ignore if it already exists)
        try:
            ghook.create_bucket(bucket_name=bucket_name + "_process",
                                project_id=project_id,
                                storage_class="REGIONAL",
                                location="US-EAST4")
        except AirflowException as e:
            # ignore if the error is that the bucket already exists
            if not str(e).startswith("409"):
                raise

        # other buckets should not have been created before
        # (this data can be used for chunk-based image processing)
        ghook.create_bucket(bucket_name=bucket_name + "_chunk_" + run_id,
                            project_id=project_id,
                            storage_class="REGIONAL",
                            location="US-EAST4")

        # will be auto deleted
        ghook.create_bucket(
            bucket_name=bucket_name + "_tmp_" + run_id,
            project_id=project_id
        )  # , storage_class="REGIONAL", location="US-EAST4")

        # will be made public readable
        ghook.create_bucket(bucket_name=bucket_name + "_ng_" + run_id,
                            project_id=project_id,
                            storage_class="REGIONAL",
                            location="US-EAST4")

        # dump configuration
        client = ghook.get_conn()
        source = context["dag_run"].conf.get("source")
        bucket = client.bucket(source + "_process")
        blob = bucket.blob(
            blob_name=f"{context['dag_run'].run_id}/init.json")

        data = context["dag_run"].conf
        data["execution_date"] = str(context.get("execution_date"))
        data = json.dumps(data)
        blob.upload_from_string(data)
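
# Minimal sketch of the "ignore 409 Conflict" pattern above (hypothetical
# helper name); it mirrors the handling used for the _process bucket, where
# the hook's AirflowException message is assumed to start with the HTTP
# status code.
def create_bucket_if_absent(ghook, bucket_name, project_id):
    try:
        ghook.create_bucket(bucket_name=bucket_name,
                            project_id=project_id,
                            storage_class="REGIONAL",
                            location="US-EAST4")
    except AirflowException as e:
        # a 409 means the bucket already exists; anything else is a real error
        if not str(e).startswith("409"):
            raise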

def execute(self, context):
    with NamedTemporaryFile("w") as tmp:
        # Load the SalesforceHook
        hook = SalesforceHook(conn_id=self.sf_conn_id, output=tmp.name)

        # Attempt to log in to Salesforce.
        # If this process fails, it will raise an error and die.
        try:
            sf_conn = hook.sign_in()
        except Exception:
            logging.error('Unable to login.')
            raise

        logging.info(self.soql)
        logging.info(self.object)

        logging.debug('Connecting to Salesforce...')
        query_results = getattr(sf_conn.bulk, self.object).query(self.soql)
        logging.info('Retrieved results...')
        logging.info(type(query_results))
        logging.info('First line is:')
        logging.info(query_results[0])

        gcs = GoogleCloudStorageHook(self.gcs_conn_id)
        service = gcs.get_conn()

        logging.info('Preparing File...')
        intermediate_arr = []
        for i, q in enumerate(query_results):
            del q['attributes']
            q["partition_date"] = date.today().strftime('%Y-%m-%d')
            for k, v in q.items():
                if type(v) == float:
                    q[k] = round(v, 2)
                if (type(v) == int) and (len(str(v)) == 13):
                    # 13-digit integers are epoch milliseconds
                    q[k] = datetime.fromtimestamp(
                        v / 1000).strftime('%Y-%m-%d %H:%M:%S')
                if (type(v) == str) and (re.search(r"^(\d+\.\d+)$", v) is not None):
                    q[k] = round(float(v), 2)

            # lowercase keys; build a new dict rather than mutating q while
            # iterating over its keys, which raises a RuntimeError in Python 3
            q = {key.lower(): value for key, value in q.items()}

            query = json.dumps(q, ensure_ascii=False)
            intermediate_arr.append(query + '\n')
            del query
            # flush the buffer every 100 records
            if i % 100 == 0:
                tmp.file.writelines(intermediate_arr)
                intermediate_arr = []
            #tmp.file.write(str(query+'\n'))
        tmp.file.writelines(intermediate_arr)
        # tmp.file.flush()

        logging.info('Loading results to GCS...')
        self.upload(service=service,
                    bucket=self.gcs_bucket,
                    filename=tmp.name,
                    object=self.gcs_object,
                    multipart=True,
                    num_retries=2)
        tmp.close()

        logging.info("Query finished!")
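
# Sketch of the per-record coercion above as a pure function (hypothetical
# name), so the rules can be unit-tested without a Salesforce connection:
# floats are rounded to 2 places, 13-digit integers are treated as epoch
# milliseconds, numeric-looking strings are rounded, and keys are lowercased.
from datetime import datetime
import re

def normalize_record(q):
    for k, v in q.items():
        if isinstance(v, float):
            q[k] = round(v, 2)
        elif isinstance(v, int) and len(str(v)) == 13:
            q[k] = datetime.fromtimestamp(v / 1000).strftime('%Y-%m-%d %H:%M:%S')
        elif isinstance(v, str) and re.search(r"^(\d+\.\d+)$", v):
            q[k] = round(float(v), 2)
    return {k.lower(): v for k, v in q.items()}

# normalize_record({"Amount": 3.14159, "CloseDate": 1609459200000})
# -> {"amount": 3.14, "closedate": "2021-01-01 00:00:00"} (timezone-dependent)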

def writeslice_worker(worker_id, num_workers, data, **context):
    minz = int(data["minz"])
    maxz = int(data["maxz"])
    dest = data["dest"]
    image = data["image"]
    collect_id = data["collect_id"]
    dest_tmp = data["dest-tmp"]
    shard_size = data["shard-size"]
    bucket_name = data["bucket_name"]

    bbox_val = json.dumps(context["task_instance"].xcom_pull(
        task_ids=collect_id, key="bbox"))
    bbox = json.loads(bbox_val)

    transform_vals = {}
    # fetch data from google storage
    if not TEST_MODE:
        ghook = GoogleCloudStorageHook()  # uses default gcp connection
        client = ghook.get_conn()
        bucket = client.bucket(bucket_name + "_process")
        blob = bucket.blob(
            blob_name=f"{context['dag_run'].run_id}/align/transforms.json")
        trans_str = blob.download_as_string().decode()
        transform_vals = json.loads(trans_str)

    task_list = []
    for slice in range(minz, maxz + 1):
        if (slice % num_workers) == worker_id:
            if TEST_MODE:
                # slow with many slices
                transform_val = json.dumps(
                    context["task_instance"].xcom_pull(task_ids=collect_id,
                                                       key=str(slice)))
            else:
                transform_val = json.dumps(transform_vals[str(slice)])
            params = {
                "img": image % slice,
                "transform": transform_val,
                "bbox": bbox_val,
                "dest-tmp": dest_tmp,
                "slice": slice,
                "shard-size": shard_size,
                "dest": dest,
                "run_id": context["dag_run"].run_id
            }
            task_list.append([f"{slice}", params])
            """
            # split into super tiles of 8192
            for stx in range(0, bbox[0], 8192):
                for sty in range(0, bbox[1], 8192):
                    params = {
                        "img": image % slice,
                        "transform": transform_val,
                        "bbox": bbox_val,
                        "dest-tmp": dest_tmp,
                        "slice": slice,
                        "shard-size": shard_size,
                        "super-tile-chunk": [stx//8192, sty//8192],
                        "dest": dest
                    }
                    task_list.append([f"{slice}_{stx}_{sty}", params])
            """
    return task_list
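
# Minimal sketch of the round-robin assignment above (hypothetical helper
# name): slice z belongs to worker (z % num_workers), so each slice in
# [minz, maxz] is claimed by exactly one of the parallel workers.
def slices_for_worker(worker_id, num_workers, minz, maxz):
    return [z for z in range(minz, maxz + 1) if z % num_workers == worker_id]

# e.g. slices_for_worker(1, 4, 0, 9) == [1, 5, 9]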

def collect_affine(temp_location, bucket_name, **context):
    """Create transform arrays for each image and global bbox.

    Note: the computation is very straightforward matrix multiplication.
    No need to use a docker image.
    """
    source = context["dag_run"].conf.get("source") + "_process"
    image = context["dag_run"].conf.get("image")
    minz = context["dag_run"].conf.get("minz")
    maxz = context["dag_run"].conf.get("maxz")
    downsample_factor = context["dag_run"].conf.get("downsample_factor", 1)
    project_id = context["dag_run"].conf.get("project_id")

    def calculate_transform(x, y, affine):
        """Apply transform to a point.
        """
        x1 = affine[0] * x + affine[2] * y + affine[4]
        y1 = affine[1] * x + affine[3] * y + affine[5]
        return (round(x1), round(y1))

    def process_results(res):
        """Determines whether affine or translation is used.

        The transform is also adjusted for a top-left origin, and the
        parameters are reordered to list col1, col2, and col3.
        """
        # default no-op
        affine = [1, 0, 0, 1, 0, 0]
        translation = [1, 0, 0, 1, 0, 0]

        width = res["width"] * downsample_factor
        height = res["height"] * downsample_factor
        width0 = res["width0"] * downsample_factor
        height0 = res["height0"] * downsample_factor

        def adjust_trans(trans):
            """Flips Y and moves origin to top left.
            """
            #trans = np.array(trans)
            #trans *= downsample_factor
            trans[4] *= downsample_factor
            trans[5] *= downsample_factor
            dx = trans[4] - (1 - trans[0]) * width / 2 + trans[2] * height / 2
            dy = trans[5] - (1 - trans[3]) * height / 2 + trans[1] * width / 2
            return [trans[0], -trans[2], -trans[1], trans[3], dx, dy]

        affine = adjust_trans(res["affine"])
        translation = adjust_trans(res["translation"])

        # use the translation coefficients if the image rotation is
        # negligible (affine[2] <= 0.0008)
        if affine[2] <= 0.0008:
            affine = translation

        return affine, [width, height], [width0, height0]

    # read each transform and create global coordinate system
    # (note: each transform is applied to slice n+1; image sizes are assumed
    # to have identical dims)
    last_affine = [1, 0, 0, 1, 0, 0]
    transforms = [[1, 0, 0, 1, 0, 0]]

    # store current bbox x range and y range and find max
    bbox = None
    global_bbox = None

    all_results = {}
    ghook = GoogleCloudStorageHook()  # uses default gcp connection
    client = ghook.get_conn()
    bucket = client.bucket(bucket_name)
    for worker_id in range(0, NUM_WORKERS):
        blob = bucket.blob(
            blob_name=
            f"{context['dag_run'].run_id}/align/affine_cache/worker-{worker_id}"
        )
        # raises error if not found
        res = json.loads(blob.download_as_string().decode())
        #res = context['task_instance'].xcom_pull(task_ids=f"{name}.affine_{worker_id}")
        all_results.update(res)

    for slice in range(minz, maxz):
        res = json.loads(all_results[str(slice)])
        # affine has already been modified to treat top-left of image as origin

        # process results
        curr_affine, bbox, bbox0 = process_results(res)

        # get bbox
        if slice == minz:
            global_bbox = [0, bbox0[0], 0, bbox0[1]]
        """
        if bbox[0] > global_bbox[1]:
            global_bbox[1] = bbox[0]
        if bbox[1] > global_bbox[3]:
            global_bbox[3] = bbox[1]
        """

        # multiply matrices
        mod_affine = []
        mod_affine.append(last_affine[0] * curr_affine[0] +
                          last_affine[2] * curr_affine[1])
        mod_affine.append(last_affine[1] * curr_affine[0] +
                          last_affine[3] * curr_affine[1])
        mod_affine.append(last_affine[0] * curr_affine[2] +
                          last_affine[2] * curr_affine[3])
        mod_affine.append(last_affine[1] * curr_affine[2] +
                          last_affine[3] * curr_affine[3])
        mod_affine.append(last_affine[0] * curr_affine[4] +
                          last_affine[2] * curr_affine[5] + last_affine[4])
        mod_affine.append(last_affine[1] * curr_affine[4] +
                          last_affine[3] * curr_affine[5] + last_affine[5])
        last_affine = mod_affine

        # add affine to list
        transforms.append(mod_affine)

        # check corners to find bbox
        shift1 = calculate_transform(0, 0, mod_affine)
        shift2 = calculate_transform(0, bbox[1], mod_affine)
        shift3 = calculate_transform(bbox[0], 0, mod_affine)
        shift4 = calculate_transform(bbox[0], bbox[1], mod_affine)

        xmin = min(shift1[0], shift2[0], shift3[0], shift4[0])
        xmax = max(shift1[0], shift2[0], shift3[0], shift4[0])
        ymin = min(shift1[1], shift2[1], shift3[1], shift4[1])
        ymax = max(shift1[1], shift2[1], shift3[1], shift4[1])

        if xmin < global_bbox[0]:
            global_bbox[0] = xmin
        if ymin < global_bbox[2]:
            global_bbox[2] = ymin
        if xmax > global_bbox[1]:
            global_bbox[1] = xmax
        if ymax > global_bbox[3]:
            global_bbox[3] = ymax

    # push results for each image and create csv of transforms
    affines_csv = ""
    transforms_out = {}
    for slice in range(minz, maxz + 1):
        curr_affine = transforms[slice - minz]
        curr_affine[4] = curr_affine[4] - global_bbox[0]  # shift by min x
        curr_affine[5] = curr_affine[5] - global_bbox[2]  # shift by min y
        transforms_out[slice] = curr_affine
        if TEST_MODE:
            # sending the data via xcom is slow when there are a lot of slices
            context['task_instance'].xcom_push(key=f"{slice}",
                                               value=curr_affine)
        affines_csv += f"{slice} , '{curr_affine}'\n"

    logging.info(
        [global_bbox[1] - global_bbox[0], global_bbox[3] - global_bbox[2]])

    # push bbox for new image size
    context['task_instance'].xcom_push(key="bbox",
                                       value=[
                                           global_bbox[1] - global_bbox[0],
                                           global_bbox[3] - global_bbox[2]
                                       ])

    # disabled in test mode
    if not TEST_MODE:
        # write transforms to align/transforms.csv
        ghook = GoogleCloudStorageHook()  # uses default gcp connection
        client = ghook.get_conn()
        bucket = client.bucket(source)
        blob = bucket.blob(
            blob_name=f"{context['dag_run'].run_id}/align/transforms.csv")
        blob.upload_from_string(affines_csv)

        # write the json parseable transforms to align/transforms.json
        blob = bucket.blob(
            blob_name=f"{context['dag_run'].run_id}/align/transforms.json")
        blob.upload_from_string(json.dumps(transforms_out))
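
# Worked check (not part of the pipeline) of the 2x3 composition used above,
# assuming the column-major [a, b, c, d, e, f] layout implied by
# calculate_transform (x' = a*x + c*y + e, y' = b*x + d*y + f); the compose
# and to_matrix helper names are hypothetical. Composing via the hand-rolled
# products should match multiplying the equivalent 3x3 homogeneous matrices.
import numpy as np

def to_matrix(t):
    return np.array([[t[0], t[2], t[4]],
                     [t[1], t[3], t[5]],
                     [0.0, 0.0, 1.0]])

def compose(last, curr):
    # same products as the mod_affine block in collect_affine
    return [last[0] * curr[0] + last[2] * curr[1],
            last[1] * curr[0] + last[3] * curr[1],
            last[0] * curr[2] + last[2] * curr[3],
            last[1] * curr[2] + last[3] * curr[3],
            last[0] * curr[4] + last[2] * curr[5] + last[4],
            last[1] * curr[4] + last[3] * curr[5] + last[5]]

last = [1, 0, 0, 1, 10, 20]  # pure translation
curr = [0, 1, -1, 0, 0, 0]   # 90-degree rotation
assert np.allclose(to_matrix(compose(last, curr)),
                   to_matrix(last) @ to_matrix(curr))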