def _setup_measurement_set(self, day_to_process, barrier_drop, add_output_s3, node_id):
    s3_drop = self.create_s3_drop(
        node_id,
        self._bucket_name,
        os.path.join('observation_data', day_to_process.full_tar_name),
        'aws-chiles02',
        's3_in')
    if len(add_output_s3) == 0:
        self._start_oids.append(s3_drop['uid'])
    else:
        for drop in add_output_s3:
            drop.addOutput(s3_drop)

    copy_from_s3 = self.create_app(node_id, get_module_name(CopyMsTransformFromS3), 'app_copy_mstransform_from_s3')
    measurement_set = self.create_directory_container(node_id, 'dir_in_ms', expire_after_use=False)
    if barrier_drop is not None:
        barrier_drop.addOutput(measurement_set)

    copy_from_s3.addInput(s3_drop)
    copy_from_s3.addOutput(measurement_set)

    drop_listobs = self.create_docker_app(node_id, get_module_name(DockerListobs), 'app_listobs', CONTAINER_CHILES02, 'listobs')
    properties = self.create_json_drop(node_id)
    drop_listobs.addInput(measurement_set)
    drop_listobs.addOutput(properties)

    return measurement_set, properties, drop_listobs
def _build_clean_chain(self, s3_object, count_on_node, node_id):
    # Get the carry over
    carry_over_data = self._map_carry_over_data[node_id]
    if carry_over_data.s3_out is None:
        carry_over_data.s3_out = [None] * self._parallel_streams

    elements = s3_object.split('/')
    s3_drop = self.create_s3_drop(
        node_id,
        self._bucket_name,
        s3_object,
        'aws-chiles02',
        oid='s3_in',
    )
    self._start_oids.append(s3_drop['uid'])
    copy_from_s3 = self.create_app(
        node_id,
        get_module_name(CopyCleanFromS3),
        'app_copy_from_s3',
        min_frequency=self._min_frequency,
        max_frequency=self._max_frequency,
    )
    measurement_set = self.create_directory_container(
        node_id,
        'dir_in_ms_{0}'.format(elements[2]),
    )

    # The order of arguments is important so don't put anything in front of these
    copy_from_s3.addInput(s3_drop)
    copy_from_s3.addOutput(measurement_set)

    if carry_over_data.s3_out[count_on_node] is not None:
        copy_from_s3.addInput(carry_over_data.s3_out[count_on_node])

    casa_py_clean_drop = self.create_docker_app(
        node_id,
        get_module_name(DockerClean),
        'app_clean',
        CONTAINER_CHILES02,
        'clean',
        min_frequency=self._min_frequency,
        max_frequency=self._max_frequency,
        iterations=1,
        measurement_sets=[measurement_set['dirname']],
    )
    result = self.create_directory_container(
        node_id,
        'dir_clean_output_{0}'.format(elements[2]),
    )
    casa_py_clean_drop.addInput(measurement_set)
    casa_py_clean_drop.addOutput(result)

    carry_over_data.s3_out[count_on_node] = result
def copy_logfiles_and_shutdown(self):
    """
    Copy the log files to S3 and shut down the nodes
    """
    for list_ips in self._node_details.values():
        for instance_details in list_ips:
            node_id = instance_details['ip_address']
            copy_log_drop = self.create_app(node_id, get_module_name(CopyLogFilesApp), 'copy_log_files_app')

            # After everything is complete
            for drop in self._drop_list:
                if drop['type'] in ['plain', 'container'] and drop['node'] == node_id:
                    copy_log_drop.addInput(drop)

            s3_drop_out = self.create_s3_drop(
                node_id,
                self._bucket_name,
                '{0}/{1}.tar'.format(
                    self._session_id,
                    node_id,
                ),
                'aws-chiles02',
                oid='s3_out',
            )
            copy_log_drop.addOutput(s3_drop_out)

            if self._shutdown:
                shutdown_drop = self.create_bash_shell_app(node_id, 'sudo shutdown -h +5 "DFMS node shutting down" &')
                shutdown_drop.addInput(s3_drop_out)
def create_barrier_app(self, node_id, oid='barrier_app', input_error_threshold=100):
    drop = dropdict({
        "type": 'app',
        "app": get_module_name(BarrierAppDROP),
        "oid": self.get_oid(oid),
        "uid": self.get_uuid(),
        "input_error_threshold": input_error_threshold,
        "node": node_id,
    })
    self.add_drop(drop)
    return drop
def _split(self, last_element, frequency_pairs, measurement_set, properties, observation_name, node_id):
    casa_py_drop = self.create_docker_app(
        node_id,
        get_module_name(DockerMsTransform),
        'app_ms_transform',
        CONTAINER_CHILES02,
        'ms_transform',
        min_frequency=frequency_pairs.bottom_frequency,
        max_frequency=frequency_pairs.top_frequency,
    )
    result = self.create_directory_container(node_id, 'dir_split')

    casa_py_drop.addInput(measurement_set)
    casa_py_drop.addInput(properties)
    if last_element is not None:
        casa_py_drop.addInput(last_element)
    casa_py_drop.addOutput(result)

    copy_to_s3 = self.create_app(
        node_id,
        get_module_name(CopyMsTransformToS3),
        'app_copy_mstransform_to_s3',
        min_frequency=frequency_pairs.bottom_frequency,
        max_frequency=frequency_pairs.top_frequency,
    )
    s3_drop_out = self.create_s3_drop(
        node_id,
        self._bucket_name,
        '{3}/{0}_{1}/{2}.tar'.format(
            frequency_pairs.bottom_frequency,
            frequency_pairs.top_frequency,
            observation_name,
            self._s3_split_name,
        ),
        'aws-chiles02',
        oid='s3_out',
    )
    copy_to_s3.addInput(result)
    copy_to_s3.addOutput(s3_drop_out)

    return s3_drop_out
def create_bash_shell_app(self, node_id, command, oid='bash_shell_app', input_error_threshold=100):
    drop = dropdict({
        "type": 'app',
        "app": get_module_name(BashShellApp),
        "oid": self.get_oid(oid),
        "uid": self.get_uuid(),
        "command": command,
        "input_error_threshold": input_error_threshold,
        "node": node_id,
    })
    self.add_drop(drop)
    return drop
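# --- Sketch: how the dropdict factory helpers above compose into a graph. ---
# The `dropdict` class below is a minimal stand-in for the dfms/DALiuGE
# dropdict (assumed here to be a dict subclass whose addInput/addOutput record
# the linked drop), and SketchBuilder mirrors the get_oid/get_uuid/add_drop
# plumbing used by create_barrier_app and create_bash_shell_app. Illustration
# only; names and module paths are hypothetical, not the project's real classes.
import uuid


class dropdict(dict):
    def addInput(self, other):
        self.setdefault('inputs', []).append(other['uid'])

    def addOutput(self, other):
        self.setdefault('outputs', []).append(other['uid'])


class SketchBuilder(object):
    def __init__(self):
        self._drop_list = []
        self._counters = {}

    def get_oid(self, prefix):
        # One running counter per oid prefix keeps oids readable and unique.
        self._counters[prefix] = self._counters.get(prefix, 0) + 1
        return '{0}__{1:03d}'.format(prefix, self._counters[prefix])

    def get_uuid(self):
        return str(uuid.uuid4())

    def add_drop(self, drop):
        self._drop_list.append(drop)

    def create_bash_shell_app(self, node_id, command, oid='bash_shell_app', input_error_threshold=100):
        drop = dropdict({
            "type": 'app',
            "app": 'dfms.apps.bash_shell_app.BashShellApp',  # placeholder for get_module_name(BashShellApp)
            "oid": self.get_oid(oid),
            "uid": self.get_uuid(),
            "command": command,
            "input_error_threshold": input_error_threshold,
            "node": node_id,
        })
        self.add_drop(drop)
        return drop


if __name__ == '__main__':
    builder = SketchBuilder()
    trigger = dropdict({"type": 'plain', "storage": 'memory',
                        "oid": builder.get_oid('memory_in'), "uid": builder.get_uuid()})
    builder.add_drop(trigger)
    shell_app = builder.create_bash_shell_app('10.0.0.1', 'echo "node work done"')
    shell_app.addInput(trigger)  # the app fires only after the trigger drop completes
    print(shell_app['oid'], '<-', shell_app['inputs'])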
def _build_s3_download(self, node_id, frequency_pair):
    s3_objects = []
    prefix = '{0}/{1}_{2}'.format(self._s3_uvsub_name, frequency_pair.bottom_frequency, frequency_pair.top_frequency)
    for key in self._bucket.objects.filter(Prefix=prefix):
        s3_objects.append(key.key)

    parallel_streams = [None] * self._parallel_streams
    s3_out_drops = []
    counter = 0
    for s3_object in s3_objects:
        s3_drop = self.create_s3_drop(
            node_id,
            self._bucket_name,
            s3_object,
            'aws-chiles02',
            oid='s3_in',
        )
        copy_from_s3 = self.create_app(
            node_id,
            get_module_name(CopyCleanFromS3),
            'app_copy_from_s3',
            min_frequency=frequency_pair.bottom_frequency,
            max_frequency=frequency_pair.top_frequency,
        )
        measurement_set = self.create_directory_container(node_id, 'dir_in_ms')

        # The order of arguments is important so don't put anything in front of these
        copy_from_s3.addInput(s3_drop)
        copy_from_s3.addOutput(measurement_set)

        self._start_oids.append(s3_drop['uid'])

        carry_over_data = self._map_carry_over_data[node_id]
        if carry_over_data.s3_out is not None:
            copy_from_s3.addInput(carry_over_data.s3_out)

        if parallel_streams[counter] is not None:
            copy_from_s3.addInput(parallel_streams[counter])

        parallel_streams[counter] = measurement_set
        s3_out_drops.append(measurement_set)

        counter += 1
        if counter >= self._parallel_streams:
            counter = 0

    return s3_out_drops
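# --- Sketch: the round-robin "parallel streams" throttle in _build_s3_download. ---
# Each new download chain is handed the previous occupant of its slot as an
# extra input, so at most `parallel_streams` chains can run concurrently on a
# node; later downloads queue behind their slot. The standalone function below
# models just that slot arithmetic (item names are hypothetical).
def chain_round_robin(items, parallel_streams):
    """Return (item, depends_on) pairs; depends_on is the prior item in the same slot."""
    slots = [None] * parallel_streams
    schedule = []
    counter = 0
    for item in items:
        # In the real graph this dependency is copy_from_s3.addInput(slots[counter]).
        schedule.append((item, slots[counter]))
        slots[counter] = item
        counter += 1
        if counter >= parallel_streams:
            counter = 0
    return schedule


if __name__ == '__main__':
    # Six objects over four streams: the fifth and sixth wait on the first and second.
    for item, depends_on in chain_round_robin(['ms_{0}'.format(i) for i in range(6)], 4):
        print(item, '<-', depends_on)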
def create_directory_container(self, node_id, oid='directory_container', expire_after_use=True):
    oid_text = self.get_oid(oid)
    drop = dropdict({
        "type": 'container',
        "container": get_module_name(DirectoryContainer),
        "oid": oid_text,
        "uid": self.get_uuid(),
        "precious": False,
        "dirname": os.path.join(self._volume, oid_text),
        "check_exists": False,
        "expireAfterUse": expire_after_use,
        "node": node_id,
    })
    self.add_drop(drop)
    return drop
def build_graph(self):
    start_drop = dropdict({
        "type": 'plain',
        "storage": 'memory',
        "oid": get_oid('memory_in'),
        "uid": get_uuid(),
    })
    self._start_oids.append(start_drop['uid'])
    self.append(start_drop)

    shutdown_drop = dropdict({
        "type": 'app',
        "app": get_module_name(BashShellApp),
        "oid": get_oid('app_bash_shell_app'),
        "uid": get_uuid(),
        "command": 'sudo shutdown -h +5 "DFMS node shutting down" &',
        "user": '******',
        "input_error_threshold": 100,
    })
    shutdown_drop.addInput(start_drop)
    self.append(shutdown_drop)
def copy_logfiles_and_shutdown(self, shutdown_dim=False):
    """
    Copy the log files to S3 and shut down the nodes
    """
    if shutdown_dim:
        dim_shutdown_drop = self.create_bash_shell_app(self._dim_ip, 'sudo shutdown -h +5 "DFMS node shutting down" &')

    for list_ips in self._node_details.values():
        for instance_details in list_ips:
            node_id = instance_details['ip_address']
            copy_log_drop = self.create_app(node_id, get_module_name(CopyLogFilesApp), 'copy_log_files_app')

            # After everything is complete
            for drop in self._drop_list:
                if drop['type'] in ['plain', 'container'] and drop['node'] == node_id:
                    copy_log_drop.addInput(drop)

            s3_drop_out = self.create_s3_drop(
                node_id,
                self._bucket_name,
                '{0}/{1}.tar'.format(
                    self._session_id,
                    node_id,
                ),
                'aws-chiles02',
                oid='s3_out',
            )
            copy_log_drop.addOutput(s3_drop_out)

            if self._shutdown:
                shutdown_drop = self.create_bash_shell_app(node_id, 'sudo shutdown -h +5 "DFMS node shutting down" &')
                shutdown_drop.addInput(s3_drop_out)

                if shutdown_dim:
                    memory_drop = self.create_memory_drop(self._dim_ip)
                    shutdown_drop.addOutput(memory_drop)
                    dim_shutdown_drop.addInput(memory_drop)
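# --- Sketch: the shutdown ordering set up by copy_logfiles_and_shutdown(shutdown_dim=True). ---
# Each node's shutdown app writes a memory drop hosted on the island manager
# (self._dim_ip), and the DIM's own shutdown app takes every one of those drops
# as an input, so the DIM only halts after all nodes have begun shutting down.
# Modeled below with plain sets; the names are illustrative, not project API.
def dim_may_shutdown(completed_node_drops, all_node_ids):
    # The DIM shutdown barrier is satisfied once every node's memory drop has completed.
    return set(all_node_ids) <= set(completed_node_drops)


if __name__ == '__main__':
    nodes = ['10.0.0.1', '10.0.0.2']
    print(dim_may_shutdown(['10.0.0.1'], nodes))               # False: one node still copying logs
    print(dim_may_shutdown(['10.0.0.1', '10.0.0.2'], nodes))   # True: the DIM may halt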
def build_graph(self):
    session = boto3.Session(profile_name='aws-chiles02')
    s3 = session.resource('s3', use_ssl=False)
    self._s3_client = s3.meta.client
    self._bucket = s3.Bucket(self._bucket_name)

    # Add the cleaned images
    s3_objects = []
    prefix = '{0}/'.format(self._s3_clean_name)
    for key in self._bucket.objects.filter(Prefix=prefix):
        if key.key.endswith('.tar'):
            s3_objects.append(key.key)

    parallel_streams = [None] * self._parallel_streams
    s3_out_drops = []
    counter = 0
    for s3_object in s3_objects:
        s3_drop = self.create_s3_drop(
            self._node_id,
            self._bucket_name,
            s3_object,
            'aws-chiles02',
            oid='s3_in',
        )
        copy_from_s3 = self.create_app(
            self._node_id,
            get_module_name(CopyConcatenateFromS3),
            'app_copy_from_s3',
        )
        measurement_set = self.create_directory_container(
            self._node_id,
            'dir_in_ms',
        )

        # The order of arguments is important so don't put anything in front of these
        copy_from_s3.addInput(s3_drop)
        copy_from_s3.addOutput(measurement_set)

        self._start_oids.append(s3_drop['uid'])

        carry_over_data = self._map_carry_over_data[self._node_id]
        if carry_over_data.s3_out is not None:
            copy_from_s3.addInput(carry_over_data.s3_out)

        if parallel_streams[counter] is not None:
            copy_from_s3.addInput(parallel_streams[counter])

        parallel_streams[counter] = measurement_set
        s3_out_drops.append(measurement_set)

        counter += 1
        if counter >= self._parallel_streams:
            counter = 0

    casa_py_concatenation_drop = self.create_docker_app(
        self._node_id,
        get_module_name(DockerImageconcat),
        'app_concatenate',
        CONTAINER_CHILES02,
        'concatenate',
        measurement_sets=[drop['dirname'] for drop in s3_out_drops],
        width=self._width,
        iterations=self._iterations,
    )
    result = self.create_directory_container(self._node_id, 'dir_concatenate_output')
    for drop in s3_out_drops:
        casa_py_concatenation_drop.addInput(drop)
    casa_py_concatenation_drop.addOutput(result)

    copy_to_s3 = self.create_app(
        self._node_id,
        get_module_name(CopyConcatenateToS3),
        'app_copy_concatenate_to_s3',
        width=self._width,
        iterations=self._iterations,
    )
    s3_drop_out = self.create_s3_drop(
        self._node_id,
        self._bucket_name,
        '{0}/image_{1}_{2}.tar'.format(
            self._s3_image_name,
            self._width,
            self._iterations,
        ),
        'aws-chiles02',
        oid='s3_out',
    )
    copy_to_s3.addInput(result)
    copy_to_s3.addOutput(s3_drop_out)

    carry_over_data = self._map_carry_over_data[self._node_id]
    carry_over_data.copy_to_s3 = copy_to_s3

    self.copy_logfiles_and_shutdown()
def build_graph(self):
    self._build_node_map()

    session = boto3.Session(profile_name='aws-chiles02')
    s3 = session.resource('s3', use_ssl=False)
    self._s3_client = s3.meta.client
    self._bucket = s3.Bucket(self._bucket_name)

    # Add the start drops
    for frequency_pair in self._work_to_do:
        node_id = self._get_next_node(frequency_pair)

        s3_drop_outs = self._build_s3_download(node_id, frequency_pair)

        casa_py_clean_drop = self.create_docker_app(
            node_id,
            get_module_name(DockerClean),
            'app_clean',
            CONTAINER_CHILES02,
            'clean',
            min_frequency=frequency_pair.bottom_frequency,
            max_frequency=frequency_pair.top_frequency,
            iterations=self._iterations,
            arcsec=self._arcsec,
            measurement_sets=[drop['dirname'] for drop in s3_drop_outs],
        )
        result = self.create_directory_container(node_id, 'dir_clean_output')

        for drop in s3_drop_outs:
            casa_py_clean_drop.addInput(drop)
        casa_py_clean_drop.addOutput(result)

        copy_clean_to_s3 = self.create_app(
            node_id,
            get_module_name(CopyCleanToS3),
            'app_copy_clean_to_s3',
            min_frequency=frequency_pair.bottom_frequency,
            max_frequency=frequency_pair.top_frequency,
            only_image=self._only_image,
        )
        s3_clean_drop_out = self.create_s3_drop(
            node_id,
            self._bucket_name,
            '{0}/cleaned_{1}_{2}.tar'.format(
                self._s3_clean_name,
                frequency_pair.bottom_frequency,
                frequency_pair.top_frequency,
            ),
            'aws-chiles02',
            oid='s3_out',
        )
        copy_clean_to_s3.addInput(result)
        copy_clean_to_s3.addOutput(s3_clean_drop_out)

        copy_fits_to_s3 = self.create_app(
            node_id,
            get_module_name(CopyFitsToS3),
            'app_copy_fits_to_s3',
            min_frequency=frequency_pair.bottom_frequency,
            max_frequency=frequency_pair.top_frequency,
        )
        s3_fits_drop_out = self.create_s3_drop(
            node_id,
            self._bucket_name,
            '{0}/cleaned_{1}_{2}.fits'.format(
                self._s3_fits_name,
                frequency_pair.bottom_frequency,
                frequency_pair.top_frequency,
            ),
            'aws-chiles02',
            oid='s3_out',
        )
        copy_fits_to_s3.addInput(result)
        copy_fits_to_s3.addOutput(s3_fits_drop_out)

        barrier_drop = self.create_barrier_app(node_id)

        # Give the memory drop somewhere to go
        memory_drop = self.create_memory_drop(node_id)

        barrier_drop.addInput(s3_clean_drop_out)
        barrier_drop.addInput(s3_fits_drop_out)
        barrier_drop.addOutput(memory_drop)

        carry_over_data = self._map_carry_over_data[node_id]
        carry_over_data.s3_out = memory_drop

        clean_up = self.create_app(
            node_id,
            get_module_name(CleanupDirectories),
            'app_cleanup_directories',
        )
        for drop in s3_drop_outs:
            clean_up.addInput(drop)
        clean_up.addInput(result)
        clean_up.addInput(memory_drop)

        carry_over_data.clean_up = clean_up

    self.copy_logfiles_and_shutdown(True)
def _build_uvsub_chain(self, split_to_process, count_on_node, node_id):
    # Get the carry over
    carry_over_data = self._map_carry_over_data[node_id]
    if carry_over_data.memory_drop_list is None:
        carry_over_data.memory_drop_list = [None] * self._parallel_streams

    s3_drop = self.create_s3_drop(
        node_id,
        self._bucket_name,
        '{0}/{1}/{2}'.format(
            self._s3_split_name,
            split_to_process[0],
            split_to_process[1],
        ),
        'aws-chiles02',
        oid='s3_in',
    )
    self._start_oids.append(s3_drop['uid'])

    frequencies = split_to_process[0].split('_')
    copy_from_s3 = self.create_app(
        node_id,
        get_module_name(CopyUvsubFromS3),
        'app_copy_from_s3',
        min_frequency=frequencies[0],
        max_frequency=frequencies[1],
    )
    measurement_set = self.create_directory_container(node_id, 'dir_in_ms')

    # The order of arguments is important so don't put anything in front of these
    copy_from_s3.addInput(s3_drop)
    copy_from_s3.addOutput(measurement_set)

    if carry_over_data.memory_drop_list[count_on_node] is not None:
        copy_from_s3.addInput(carry_over_data.memory_drop_list[count_on_node])

    # Do the UV subtraction
    casa_py_uvsub_drop = self.create_docker_app(
        node_id,
        get_module_name(DockerUvsub),
        'app_uvsub',
        CONTAINER_CHILES02,
        'uvsub',
        min_frequency=frequencies[0],
        max_frequency=frequencies[1],
    )
    result = self.create_directory_container(node_id, 'dir_uvsub_output')
    casa_py_uvsub_drop.addInput(measurement_set)
    casa_py_uvsub_drop.addOutput(result)

    copy_uvsub_to_s3 = self.create_app(
        node_id,
        get_module_name(CopyUvsubToS3),
        'app_copy_uvsub_to_s3',
        min_frequency=frequencies[0],
        max_frequency=frequencies[1],
    )
    s3_uvsub_drop_out = self.create_s3_drop(
        node_id,
        self._bucket_name,
        '{0}/{1}/{2}'.format(
            self._s3_uvsub_name,
            split_to_process[0],
            split_to_process[1],
        ),
        'aws-chiles02',
        oid='s3_out',
    )
    copy_uvsub_to_s3.addInput(result)
    copy_uvsub_to_s3.addOutput(s3_uvsub_drop_out)

    clean_up = self.create_app(
        node_id,
        get_module_name(CleanupDirectories),
        'app_cleanup_directories',
    )
    memory_drop = self.create_memory_drop(node_id)
    clean_up.addInput(s3_uvsub_drop_out)
    clean_up.addInput(result)
    clean_up.addInput(measurement_set)
    clean_up.addOutput(memory_drop)

    # Remember the end of the tail
    carry_over_data.memory_drop_list[count_on_node] = memory_drop
def build_graph(self):
    session = boto3.Session(profile_name='aws-chiles02')
    s3 = session.resource('s3', use_ssl=False)
    self._s3_client = s3.meta.client
    self._bucket = s3.Bucket(self._bucket_name)

    # Get the ones we've already done
    already_done = []
    prefix = '{0}/'.format(self._s3_jpeg2000_name)
    for key in self._bucket.objects.filter(Prefix=prefix):
        if key.key.endswith('.jpx'):
            (head, tail) = os.path.split(key.key)
            (name, ext) = os.path.splitext(tail)
            already_done.append(name[6:])

    # Add the fits files that still need converting
    s3_objects = []
    prefix = '{0}/'.format(self._s3_fits_name)
    for key in self._bucket.objects.filter(Prefix=prefix):
        if key.key.endswith('.fits'):
            (head, tail) = os.path.split(key.key)
            (name, ext) = os.path.splitext(tail)
            if name[8:] not in already_done:
                s3_objects.append(key.key)

    parallel_streams = [None] * self._parallel_streams
    counter = 0
    for s3_object in s3_objects:
        s3_drop = self.create_s3_drop(
            self._node_id,
            self._bucket_name,
            s3_object,
            'aws-chiles02',
            oid='s3_in',
        )
        copy_from_s3 = self.create_app(
            self._node_id,
            get_module_name(CopyFitsFromS3),
            'app_copy_from_s3',
        )
        (minimum_frequency, maximum_frequency) = self._get_frequencies(s3_object)
        fits_file_name = self._get_fits_file_name(s3_object)
        fits_file = self.create_file_drop(
            self._node_id,
            os.path.join(self._volume, fits_file_name),
            oid='fits_file',
        )

        # The order of arguments is important so don't put anything in front of these
        copy_from_s3.addInput(s3_drop)
        copy_from_s3.addOutput(fits_file)

        self._start_oids.append(s3_drop['uid'])

        if parallel_streams[counter] is not None:
            copy_from_s3.addInput(parallel_streams[counter])

        # Do the conversions
        convert_jpeg2000 = self.create_docker_app(
            self._node_id,
            get_module_name(DockerApp),
            'app_convert_jpeg2000',
            CONTAINER_SV,
            'sv-encode -i %i0 -o %o0 Clayers=15 Clevels=6 Cycc=no Corder=CPRL ORGgen_plt=yes Cprecincts="{256,256},{128,128}" Cblk="{32,32}" Qstep=0.0001',
            user='******',
        )
        jpeg2000_name = self._get_jpeg2000_name(s3_object)
        jpeg2000_file = self.create_file_drop(
            self._node_id,
            os.path.join(self._volume, jpeg2000_name),
            oid='jpeg2000_file',
        )
        convert_jpeg2000.addInput(fits_file)
        convert_jpeg2000.addOutput(jpeg2000_file)

        copy_jpg2000_to_s3 = self.create_app(
            self._node_id,
            get_module_name(CopyJpeg2000ToS3),
            'app_copy_jpeg_to_s3',
        )
        s3_jpeg2000_drop_out = self.create_s3_drop(
            self._node_id,
            self._bucket_name,
            '{0}/image_{1}_{2}.jpx'.format(
                self._s3_jpeg2000_name,
                minimum_frequency,
                maximum_frequency,
            ),
            'aws-chiles02',
            's3_out',
        )
        copy_jpg2000_to_s3.addInput(jpeg2000_file)
        copy_jpg2000_to_s3.addOutput(s3_jpeg2000_drop_out)

        parallel_streams[counter] = s3_jpeg2000_drop_out

        counter += 1
        if counter >= self._parallel_streams:
            counter = 0

    barrier_drop = self.create_app(
        self._node_id,
        get_module_name(BarrierAppDROP),
        'app_barrier',
    )
    for jpeg2000_file in parallel_streams:
        if jpeg2000_file is not None:
            barrier_drop.addInput(jpeg2000_file)

    carry_over_data = self._map_carry_over_data[self._node_id]
    carry_over_data.barrier_drop = barrier_drop

    self.copy_logfiles_and_shutdown()
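# --- Sketch: the name[6:] / name[8:] slicing in build_graph above. ---
# Those slices strip the 'image_' and 'cleaned_' prefixes from the key names
# written by the jpeg2000 and clean stages, leaving the '<bottom>_<top>'
# frequency tag used to skip FITS files that are already converted. A
# standalone version of that matching, with hypothetical key lists:
import os


def frequency_tag(key, prefix_length):
    (_, tail) = os.path.split(key)
    (name, _) = os.path.splitext(tail)
    return name[prefix_length:]


def fits_still_to_convert(fits_keys, jpx_keys):
    already_done = {frequency_tag(key, len('image_')) for key in jpx_keys if key.endswith('.jpx')}
    return [key for key in fits_keys
            if key.endswith('.fits') and frequency_tag(key, len('cleaned_')) not in already_done]


if __name__ == '__main__':
    jpx = ['fits_1_2/image_0940_0944.jpx']
    fits = ['fits/cleaned_0940_0944.fits', 'fits/cleaned_0944_0948.fits']
    print(fits_still_to_convert(fits, jpx))  # only the 0944_0948 file remains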
def _build_stats_chain(self, uvsub_to_process, count_on_node, node_id):
    # Get the carry over
    carry_over_data = self._map_carry_over_data[node_id]
    if carry_over_data.memory_drop_list is None:
        carry_over_data.memory_drop_list = [None] * self._parallel_streams

    s3_drop = self.create_s3_drop(
        node_id,
        self._bucket_name,
        '{0}/{1}_{2}/{3}'.format(
            self._s3_uvsub_name,
            uvsub_to_process[1],
            uvsub_to_process[2],
            uvsub_to_process[0],
        ),
        'aws-chiles02',
        oid='s3_in',
    )
    self._start_oids.append(s3_drop['uid'])

    copy_from_s3 = self.create_app(
        node_id,
        get_module_name(CopyStatsFromS3),
        'app_copy_from_s3',
        min_frequency=uvsub_to_process[1],
        max_frequency=uvsub_to_process[2],
    )
    measurement_set = self.create_directory_container(node_id, 'dir_in_ms')

    # The order of arguments is important so don't put anything in front of these
    copy_from_s3.addInput(s3_drop)
    copy_from_s3.addOutput(measurement_set)

    if carry_over_data.memory_drop_list[count_on_node] is not None:
        copy_from_s3.addInput(carry_over_data.memory_drop_list[count_on_node])

    # Do the stats
    casa_py_stats_drop = self.create_docker_app(
        node_id,
        get_module_name(DockerStats),
        'app_stats',
        CONTAINER_CHILES02,
        'stats',
        password=self._password,
        database_hostname=self._database_hostname,
        day_name_id=self._map_day_name[uvsub_to_process[0]],
        width=self._width,
        min_frequency=uvsub_to_process[1],
        max_frequency=uvsub_to_process[2],
    )
    result = self.create_memory_drop(node_id)
    casa_py_stats_drop.addInput(measurement_set)
    casa_py_stats_drop.addOutput(result)

    clean_up = self.create_app(
        node_id,
        get_module_name(CleanupDirectories),
        'app_cleanup_directories',
        # dry_run=True,
    )
    memory_drop = self.create_memory_drop(node_id)
    clean_up.addInput(result)
    clean_up.addOutput(memory_drop)

    # Remember the end of the tail
    carry_over_data.memory_drop_list[count_on_node] = memory_drop
def build_graph(self):
    session = boto3.Session(profile_name='aws-chiles02')
    s3 = session.resource('s3', use_ssl=False)
    self._s3_client = s3.meta.client
    self._bucket = s3.Bucket(self._bucket_name)

    # Get the ones we've already done
    already_done = []
    prefix = '{0}/'.format(self._s3_jpeg2000_name)
    for key in self._bucket.objects.filter(Prefix=prefix):
        if key.key.endswith('.jpx'):
            (head, tail) = os.path.split(key.key)
            (name, ext) = os.path.splitext(tail)
            already_done.append(name[6:])

    # Add the cleaned images that still need converting
    s3_objects = []
    prefix = '{0}/'.format(self._s3_fits_name)
    for key in self._bucket.objects.filter(Prefix=prefix):
        if key.key.endswith('.fits'):
            (head, tail) = os.path.split(key.key)
            (name, ext) = os.path.splitext(tail)
            if name[8:] not in already_done:
                s3_objects.append(key.key)

    parallel_streams = [None] * self._parallel_streams
    counter = 0
    for s3_object in s3_objects:
        s3_drop = dropdict({
            "type": 'plain',
            "storage": 's3',
            "oid": self.get_oid('s3_in'),
            "uid": self.get_uuid(),
            "precious": False,
            "bucket": self._bucket_name,
            "key": s3_object,
            "profile_name": 'aws-chiles02',
            "node": self._node_id,
        })
        copy_from_s3 = dropdict({
            "type": 'app',
            "app": get_module_name(CopyFitsFromS3),
            "oid": self.get_oid('app_copy_from_s3'),
            "uid": self.get_uuid(),
            "input_error_threshold": 100,
            "node": self._node_id,
        })
        (minimum_frequency, maximum_frequency) = self._get_frequencies(s3_object)
        fits_file_name = self._get_fits_file_name(s3_object)
        fits_file = dropdict({
            "type": 'plain',
            "storage": 'file',
            "oid": self.get_oid('fits_file'),
            "uid": self.get_uuid(),
            "precious": False,
            "filepath": os.path.join(self._volume, fits_file_name),
            "node": self._node_id,
        })

        # The order of arguments is important so don't put anything in front of these
        copy_from_s3.addInput(s3_drop)
        copy_from_s3.addOutput(fits_file)

        self._start_oids.append(s3_drop['uid'])

        if parallel_streams[counter] is not None:
            copy_from_s3.addInput(parallel_streams[counter])

        self.append(s3_drop)
        self.append(copy_from_s3)
        self.append(fits_file)

        # Do the conversions
        convert_jpeg2000 = dropdict({
            "type": 'app',
            "app": get_module_name(DockerApp),
            "oid": self.get_oid('app_convert_jpeg2000'),
            "uid": self.get_uuid(),
            "image": CONTAINER_SV,
            "command": 'sv-encode -i %i0 -o %o0 Clayers=15 Clevels=6 Cycc=no Corder=CPRL ORGgen_plt=yes Cprecincts="{256,256},{128,128}" Cblk="{32,32}" Qstep=0.0001',
            "user": '******',
            "node": self._node_id,
        })
        jpeg2000_name = self._get_jpeg2000_name(s3_object)
        jpeg2000_file = dropdict({
            "type": 'plain',
            "storage": 'file',
            "container": get_module_name(FileDROP),
            "oid": self.get_oid('jpeg2000_file'),
            "uid": self.get_uuid(),
            "precious": False,
            "filepath": os.path.join(self._volume, jpeg2000_name),
            "node": self._node_id,
        })
        convert_jpeg2000.addInput(fits_file)
        convert_jpeg2000.addOutput(jpeg2000_file)
        self.append(convert_jpeg2000)
        self.append(jpeg2000_file)

        copy_jpg2000_to_s3 = dropdict({
            "type": 'app',
            "app": get_module_name(CopyJpeg2000ToS3),
            "oid": self.get_oid('app_copy_jpeg_to_s3'),
            "uid": self.get_uuid(),
            "input_error_threshold": 100,
            "node": self._node_id,
        })
        s3_jpeg2000_drop_out = dropdict({
            "type": 'plain',
            "storage": 's3',
            "oid": self.get_oid('s3_out'),
            "uid": self.get_uuid(),
            "expireAfterUse": True,
            "precious": False,
            "bucket": self._bucket_name,
            "key": '{0}/image_{1}_{2}.jpx'.format(
                self._s3_jpeg2000_name,
                minimum_frequency,
                maximum_frequency,
            ),
            "profile_name": 'aws-chiles02',
            "node": self._node_id,
        })
        copy_jpg2000_to_s3.addInput(jpeg2000_file)
        copy_jpg2000_to_s3.addOutput(s3_jpeg2000_drop_out)
        self.append(copy_jpg2000_to_s3)
        self.append(s3_jpeg2000_drop_out)

        parallel_streams[counter] = s3_jpeg2000_drop_out

        counter += 1
        if counter >= self._parallel_streams:
            counter = 0

    barrier_drop = dropdict({
        "type": 'app',
        "app": get_module_name(BarrierAppDROP),
        "oid": self.get_oid('app_barrier'),
        "uid": self.get_uuid(),
        "input_error_threshold": 100,
        "node": self._node_id,
    })
    self.append(barrier_drop)

    for jpeg2000_file in parallel_streams:
        if jpeg2000_file is not None:  # guard against unfilled streams
            barrier_drop.addInput(jpeg2000_file)

    carry_over_data = self._map_carry_over_data[self._node_id]
    carry_over_data.barrier_drop = barrier_drop

    if self._shutdown:
        self.add_shutdown()