def _fill_in_old_browse_collection(
    db: BundleDB,
    changes_dict: ChangesDict,
    bundle_lidvid: LIDVID,
    data_collection_lidvid: LIDVID,
) -> None:
    browse_collection_lid = data_collection_lidvid.lid().to_browse_lid()
    browse_collection_vid = data_collection_lidvid.vid()
    browse_collection_lidvid = LIDVID.create_from_lid_and_vid(
        browse_collection_lid, browse_collection_vid
    )
    changes_dict.set(browse_collection_lid, browse_collection_vid, False)
    db.create_bundle_collection_link(
        str(bundle_lidvid), str(browse_collection_lidvid)
    )
    try:
        PDS_LOGGER.open("Fill in old browse collection")
        PDS_LOGGER.log(
            "info", f"Created link and change for {browse_collection_lidvid}"
        )
        for product in db.get_collection_products(str(browse_collection_lidvid)):
            product_lidvid = LIDVID(product.lidvid)
            changes_dict.set(product_lidvid.lid(), product_lidvid.vid(), False)
            PDS_LOGGER.log("info", f"Created link and change for {product_lidvid}")
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
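
# A minimal sketch of the LID/VID derivation above, assuming to_browse_lid()
# swaps the collection prefix from "data" to "browse" as the PDS4 HST layout
# suggests; the LIDVID below is hypothetical.
def _example_browse_lidvid() -> None:
    data_collection = LIDVID("urn:nasa:pds:hst_12345:data_acs_raw::1.0")
    browse_lid = data_collection.lid().to_browse_lid()
    browse_lidvid = LIDVID.create_from_lid_and_vid(
        browse_lid, data_collection.vid()
    )
    # Presumably urn:nasa:pds:hst_12345:browse_acs_raw::1.0: the browse
    # collection carries the same VID as the data collection it mirrors.
    print(browse_lidvid)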
def next_stage(self, phase: str) -> Optional[Stage]:
    """
    Return the stage that follows the named phase, logging its name,
    or None if the named phase is the last one.
    """

    def phase_index() -> int:
        for i, (name, _) in enumerate(self.stages):
            if name == phase:
                return i
        raise ValueError(f"unknown phase {phase}.")

    i = phase_index()
    try:
        PDS_LOGGER.log("info", f"{self.stages[i + 1][0]}")
        return self.stages[i + 1][1]
    except IndexError:
        return None
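
# A minimal usage sketch for next_stage(), relying only on what the code
# above implies: `stages` is an ordered list of (phase_name, Stage) pairs.
# The phase names and the `machine` object are hypothetical.
def _example_next_stage(machine: Any) -> None:
    # Suppose machine.stages == [("download", dl_stage), ("labels", lbl_stage)].
    assert machine.next_stage("download") is machine.stages[1][1]  # lbl_stage
    assert machine.next_stage("labels") is None  # nothing after the last stage
    # machine.next_stage("bogus") would raise ValueError("unknown phase bogus.")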
def download_product_documents(proposal_id: int, download_dir: str) -> Set[str]:
    """
    Using the templates, try to download the documentation files for
    this proposal ID into a directory and return a set of the basenames
    of the files successfully downloaded.
    """
    table: List[Tuple[str, str]] = [
        (f"https://www.stsci.edu/hst/phase2-public/{proposal_id}.apt", "phase2.apt"),
        (f"https://www.stsci.edu/hst/phase2-public/{proposal_id}.pdf", "phase2.pdf"),
        (f"https://www.stsci.edu/hst/phase2-public/{proposal_id}.pro", "phase2.pro"),
        (f"https://www.stsci.edu/hst/phase2-public/{proposal_id}.prop", "phase2.prop"),
    ]
    res: Set[str] = set()
    try:
        PDS_LOGGER.open("Download product documents")
        for url, basename in table:
            filepath = fs.path.join(download_dir, basename)
            if _retrieve_doc(url, filepath):
                PDS_LOGGER.log("info", f"Retrieved {basename} from {url}")
                res.add(basename)
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
    return res
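
# A minimal usage sketch; the proposal ID and download directory are
# hypothetical.
def _example_download_product_documents() -> None:
    downloaded = download_product_documents(12345, "/tmp/hst_12345/documents")
    # Only the basenames that actually downloaded come back, e.g.
    # {"phase2.pdf", "phase2.prop"}; not every proposal has all four files.
    print(sorted(downloaded))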
def _copy_docs_files(
    self, bundle_segment: str, documents_dir: str, primary_files_dir: str
) -> None:
    if not os.path.isdir(documents_dir):
        raise ValueError(f"{documents_dir} doesn't exist.")
    try:
        PDS_LOGGER.open("Copy docs files to document directory")
        with make_osfs(documents_dir) as documents_fs, make_sv_osfs(
            primary_files_dir
        ) as primary_files_fs:
            new_dir_path = os.path.join(
                to_segment_dir(bundle_segment),
                to_segment_dir("document"),
                to_segment_dir("phase2"),
            )
            primary_files_fs.makedirs(new_dir_path)
            for file in documents_fs.walk.files():
                file_basename = os.path.basename(file)
                new_file_path = os.path.join(new_dir_path, file_basename)
                PDS_LOGGER.log("info", f"Copy {file_basename} to {new_file_path}")
                fs.copy.copy_file(
                    documents_fs, file, primary_files_fs, new_file_path
                )
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
def run() -> None:
    assert len(sys.argv) == 2, sys.argv
    proposal_id = int(sys.argv[1])
    init_logging()
    dirs = make_directories()
    PDS_LOGGER.open(
        f"Pipeline for proposal id: {proposal_id}",
        limits={"info": INFO_MESSAGE_LIMIT},
    )
    state_machine = StateMachine(dirs, proposal_id)
    try:
        state_machine.run()
    finally:
        PDS_LOGGER.close()
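
# Typical invocation: the single command-line argument is the HST proposal
# ID. The script name here is hypothetical.
#
#     python pipeline.py 12345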
def _copy_fits_files(
    self, bundle_segment: str, mast_downloads_dir: str, primary_files_dir: str
) -> None:
    if not os.path.isdir(mast_downloads_dir):
        raise ValueError(f"{mast_downloads_dir} doesn't exist.")
    try:
        PDS_LOGGER.open("Copy fits files to corresponding directories")
        with make_osfs(mast_downloads_dir) as mast_downloads_fs, make_sv_osfs(
            primary_files_dir
        ) as primary_files_fs:
            # Walk mast_downloads_dir for FITS files and file them
            # into the COW filesystem.
            for filepath in mast_downloads_fs.walk.files(filter=["*.fits"]):
                parts = fs.path.iteratepath(filepath)
                depth = len(parts)
                if depth != 3:
                    raise ValueError(f"{parts} length is not 3.")
                # The product name comes from the filename.
                _, _, filename = parts
                filename = filename.lower()
                hst_filename = HstFilename(filename)
                product = hst_filename.rootname()
                instrument_name = hst_filename.instrument_name()
                suffix = hst_filename.suffix()
                collection_type = get_collection_type(
                    suffix=suffix, instrument_id=instrument_name
                )
                coll = f"{collection_type}_{instrument_name.lower()}_{suffix}"
                new_path = fs.path.join(
                    to_segment_dir(bundle_segment),
                    to_segment_dir(coll),
                    to_segment_dir(product),
                    filename,
                )
                dirs, filename = fs.path.split(new_path)
                primary_files_fs.makedirs(dirs)
                PDS_LOGGER.log("info", f"Copy {filename} to {new_path}")
                fs.copy.copy_file(
                    mast_downloads_fs, filepath, primary_files_fs, new_path
                )
        if not os.path.isdir(primary_files_dir + "-sv"):
            raise ValueError(f"{primary_files_dir + '-sv'} doesn't exist.")
        # If we made it to here, it should be safe to delete the downloads:
        # shutil.rmtree(mast_downloads_dir)
        # assert not os.path.isdir(mast_downloads_dir)
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
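
# A minimal sketch of the filename-to-path mapping above, using a
# hypothetical WFC3 filename and proposal segment. The collection type is
# whatever get_collection_type() returns for this suffix/instrument pair,
# so the resulting path is illustrative only.
def _example_fits_path_mapping() -> None:
    hst_filename = HstFilename("idaz01xxq_raw.fits")
    product = hst_filename.rootname()
    instrument_name = hst_filename.instrument_name()
    suffix = hst_filename.suffix()
    collection_type = get_collection_type(
        suffix=suffix, instrument_id=instrument_name
    )
    coll = f"{collection_type}_{instrument_name.lower()}_{suffix}"
    # e.g. "hst_12345$/data_wfc3_raw$/idaz01xxq$/idaz01xxq_raw.fits"
    print(
        fs.path.join(
            to_segment_dir("hst_12345"),
            to_segment_dir(coll),
            to_segment_dir(product),
            "idaz01xxq_raw.fits",
        )
    )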
def change_fits_file(rel_path: str) -> None:
    abs_path = fs.path.join(
        self.mast_downloads_dir(), fs.path.relpath(rel_path)
    )
    from touch_fits import touch_fits

    try:
        PDS_LOGGER.open("Change fits file")
        PDS_LOGGER.log("info", f"Touching {abs_path}")
        touch_fits(abs_path)
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
def _do_downloads(
    self,
    working_dir: str,
    mast_downloads_dir: str,
    proposal_id: int,
) -> None:
    try:
        PDS_LOGGER.open("Download datafiles")
        # On the first pass, <working_dir> shouldn't exist; on the
        # second pass, <working_dir>/mastDownload should not exist.
        if os.path.isdir(mast_downloads_dir):
            raise ValueError("<working_dir>/mastDownload should not exist.")

        # TODO These dates are wrong; they potentially collect too
        # much. Do I need to reduce the range of dates here?
        slice = MastSlice((1900, 1, 1), (2025, 1, 1), proposal_id)
        proposal_ids = slice.get_proposal_ids()
        if proposal_id not in proposal_ids:
            raise KeyError(f"{proposal_id} not in {proposal_ids}")

        # Get files from the full list of ACCEPTED_SUFFIXES.
        product_set = slice.to_product_set(proposal_id)
        if not os.path.isdir(working_dir):
            os.makedirs(working_dir)
        # TODO I should also download the documents here.
        product_set.download(working_dir)
        # TODO This might fail if there are no files. Which might not
        # be a bad thing.
        PDS_LOGGER.log("info", f"Download datafiles to {mast_downloads_dir}")
        if not os.path.isdir(mast_downloads_dir):
            raise ValueError(f"{mast_downloads_dir} doesn't exist.")
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
def _run(self) -> None:
    working_dir: str = self.working_dir()
    archive_dir: str = self.archive_dir()
    deliverable_dir: str = self.deliverable_dir()
    manifest_dir: str = self.manifest_dir()
    try:
        PDS_LOGGER.open("Create deliverable directory")
        if os.path.isdir(deliverable_dir):
            raise ValueError(
                f"{deliverable_dir} cannot exist for MakeDeliverable."
            )
        changes_path = os.path.join(working_dir, CHANGES_DICT_NAME)
        changes_dict = read_changes_dict(changes_path)
        with make_osfs(archive_dir) as archive_osfs, make_multiversioned(
            archive_osfs
        ) as mv:
            bundle_segment = self._bundle_segment
            bundle_lid = LID.create_from_parts([bundle_segment])
            bundle_vid = changes_dict.vid(bundle_lid)
            bundle_lidvid = LIDVID.create_from_lid_and_vid(bundle_lid, bundle_vid)
            version_view = VersionView(mv, bundle_lidvid)

            # Open the database.
            db_filepath = fs.path.join(working_dir, _BUNDLE_DB_NAME)
            bundle_db = create_bundle_db_from_os_filepath(db_filepath)
            bundle_lidvid_str = str(bundle_lidvid)

            # Synthesize the manifests into the deliverable view.
            synth_files: Dict[str, bytes] = dict()
            cm = make_checksum_manifest(
                bundle_db, bundle_lidvid_str, short_lidvid_to_dirpath
            )
            synth_files["/checksum.manifest.txt"] = cm.encode("utf-8")
            tm = make_transfer_manifest(
                bundle_db, bundle_lidvid_str, short_lidvid_to_dirpath
            )
            synth_files["/transfer.manifest.txt"] = tm.encode("utf-8")

            deliverable_view = DeliverableView(version_view, synth_files)
            os.mkdir(deliverable_dir)
            deliverable_osfs = OSFS(deliverable_dir)
            copy_fs(deliverable_view, deliverable_osfs)
            PDS_LOGGER.log("info", f"Deliverable: {deliverable_dir}")
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
def rollback_transaction(self, e: Exception) -> None:
    error_text = (
        f"EXCEPTION raised by {self._bundle_segment}, "
        f"stage {self.class_name()}: {e}\n"
        f"{traceback.format_exc()}"
    )
    PDS_LOGGER.open(f"Stage '{self.class_name()}' error raised")
    PDS_LOGGER.error(error_text)
    PDS_LOGGER.close()
    self._marker_file.set_marker_info(self.class_name(), "failure", error_text)
def _delete_directory() -> None:
    try:
        PDS_LOGGER.open("Delete directory")
        for path in mast_fs.walk.dirs():
            # Delete the first directory at depth 3, then return.
            if len(fs.path.parts(path)) == 3:
                PDS_LOGGER.log("info", f"REMOVED {path}")
                mast_fs.removetree(path)
                return
        raise RuntimeError(
            "Fell off the end of delete_directory in ChangeFiles."
        )
    finally:
        PDS_LOGGER.close()
def _change_fits_file() -> None:
    which_file = 0
    try:
        PDS_LOGGER.open("Change fits file")
        for path in mast_fs.walk.files(filter=["*.fits"]):
            # Change only the n-th FITS file, then return.
            if which_file == 0:
                change_fits_file(path)
                PDS_LOGGER.log("info", f"CHANGED {path}")
                return
            which_file = which_file - 1
        raise RuntimeError(
            "Fell off the end of change_fits_file in ChangeFiles."
        )
    finally:
        PDS_LOGGER.close()
def _run(self) -> None:
    working_dir: str = self.working_dir()
    primary_files_dir: str = self.primary_files_dir()
    archive_dir: str = self.archive_dir()
    archive_primary_deltas_dir: str = self.archive_primary_deltas_dir()
    try:
        PDS_LOGGER.open("Create a directory for a new version of the bundle")
        if os.path.isdir(self.deliverable_dir()):
            raise ValueError(
                f"{self.deliverable_dir()} cannot exist for InsertChanges."
            )
        changes_path = os.path.join(working_dir, CHANGES_DICT_NAME)
        with make_osfs(archive_dir) as archive_osfs, make_version_view(
            archive_osfs, self._bundle_segment
        ) as version_view, make_sv_osfs(
            primary_files_dir
        ) as primary_files_osfs, make_sv_deltas(
            version_view, archive_primary_deltas_dir
        ) as sv_deltas:
            changes_dict = read_changes_dict(changes_path)
            _merge_primaries(changes_dict, primary_files_osfs, sv_deltas)
            shutil.rmtree(primary_files_dir + "-sv")
            if not os.path.isdir(archive_dir):
                raise ValueError(f"{archive_dir} doesn't exist.")
            dirpath = archive_primary_deltas_dir + "-deltas-sv"
            PDS_LOGGER.log("info", f"Directory for the new version: {dirpath}")
            if not os.path.isdir(dirpath):
                raise ValueError(f"{dirpath} doesn't exist.")
            if not os.path.isfile(changes_path):
                raise ValueError(f"{changes_path} is not a file.")
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
def _run(self) -> None:
    working_dir: str = self.working_dir()
    documents_dir: str = self.documents_dir()
    mast_downloads_dir: str = self.mast_downloads_dir()
    archive_dir: str = self.archive_dir()
    if not os.path.isdir(working_dir):
        return
    for entry in os.listdir(working_dir):
        fullpath = os.path.join(working_dir, entry)
        if not (
            fullpath in [documents_dir, mast_downloads_dir, archive_dir]
            or fullpath.endswith(".tar.gz")
            or fullpath.endswith(".db")
        ):
            if os.path.isdir(fullpath):
                shutil.rmtree(fullpath)
            else:
                os.unlink(fullpath)
    PDS_LOGGER.open("Re-reset pipeline")
    PDS_LOGGER.log(
        "info",
        f"contents of working_dir after re-reset: {os.listdir(working_dir)}",
    )
    PDS_LOGGER.close()
def builder(document: Document) -> Node:
    doc = document
    stack: List[Union[Node, List[Node]]] = []

    class Builder(xml.sax.handler.ContentHandler):
        def startElement(self, name: str, attrs: Any) -> None:
            if name == "NODE":
                param_name = attrs["name"]
                param = dictionary[param_name]
                if type(param) is str:
                    elmt = doc.createTextNode(param)
                elif type(param) in (int, float):
                    elmt = doc.createTextNode(str(param))
                else:
                    if not _is_function(param):
                        raise ValueError(
                            f"{param_name} is type {type(param)}; "
                            "should be function."
                        )
                    elmt = param(doc)
                if not isinstance(elmt, xml.dom.Node):
                    raise TypeError(
                        "Failed to create NODE based on info keyed "
                        f"by {param_name} in dictionary."
                    )
                stack.append(elmt)
            elif name == "FRAGMENT":
                param_name = attrs["name"]
                param = dictionary[param_name]
                if not _is_function(param):
                    raise ValueError(
                        f"{param_name} is type {type(param)}; "
                        "should be function."
                    )
                elmts = param(doc)
                if not isinstance(elmts, list):
                    raise TypeError(
                        f"elmts created by {param_name} is not a list."
                    )
                elmt_list: List[Node] = elmts
                for elmt in elmt_list:
                    if not isinstance(elmt, xml.dom.Node):
                        raise TypeError(
                            "Failed to create NODE based on info keyed "
                            f"by {param_name} in dictionary."
                        )
                stack.append(elmt_list)
            else:
                elmt = doc.createElement(name)
                # Note: loop over a distinct variable so the element's
                # tag name isn't shadowed by its attribute names.
                for attr_name in attrs.getNames():
                    elmt.setAttribute(attr_name, attrs[attr_name])
                if not isinstance(elmt, xml.dom.Node):
                    raise TypeError(f"{elmt} is not a dom NODE.")
                stack.append(elmt)

        def endElement(self, name: str) -> None:
            if name == "FRAGMENT":
                elmts = stack.pop()
                if not isinstance(elmts, list):
                    raise TypeError(f"{elmts} is not a list.")
                elmt_list: List[xml.dom.Node] = elmts
                for elmt in elmt_list:
                    elmt.normalize()
                    if stack:
                        cast(xml.dom.Node, stack[-1]).appendChild(elmt)
                    else:
                        stack.append(elmt)
            else:
                elmt = cast(xml.dom.Node, stack.pop())
                elmt.normalize()
                if stack:
                    cast(xml.dom.Node, stack[-1]).appendChild(elmt)
                else:
                    stack.append(elmt)

        def characters(self, content: str) -> None:
            node = doc.createTextNode(content)
            cast(xml.dom.Node, stack[-1]).appendChild(node)

    try:
        xml.sax.parseString(template, Builder())
    except Exception:
        PDS_LOGGER.open("Template error")
        PDS_LOGGER.error(f"malformed template: {template}")
        PDS_LOGGER.close()
        raise
    return stack[-1]
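
# A minimal, self-contained sketch of the template idea (hypothetical names;
# this bypasses builder() itself, which reads `template` and `dictionary`
# from its enclosing scope). Given
#
#     template   = '<greeting><NODE name="who"/></greeting>'
#     dictionary = {"who": "world"}
#
# builder() would SAX-parse the template, replace the NODE element with a
# text node built from dictionary["who"], and return the root element. The
# DOM result is equivalent to what this function constructs directly:
def _example_template_expansion() -> None:
    import xml.dom.minidom

    doc = xml.dom.minidom.getDOMImplementation().createDocument(
        None, "greeting", None
    )
    doc.documentElement.appendChild(doc.createTextNode("world"))
    print(doc.documentElement.toxml())  # -> <greeting>world</greeting>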
def _run(self) -> None:
    try:
        PDS_LOGGER.open("BuildBrowse")
        PDS_LOGGER.log("info", "Entering BuildBrowse.")
        working_dir: str = self.working_dir()
        archive_dir: str = self.archive_dir()
        archive_primary_deltas_dir: str = self.archive_primary_deltas_dir()
        archive_browse_deltas_dir: str = self.archive_browse_deltas_dir()
        if os.path.isdir(self.deliverable_dir()):
            raise ValueError(
                f"{self.deliverable_dir()} cannot exist for BuildBrowse."
            )
        changes_path = os.path.join(working_dir, CHANGES_DICT_NAME)
        changes_dict = read_changes_dict(changes_path)
        db_filepath = os.path.join(working_dir, _BUNDLE_DB_NAME)
        db = create_bundle_db_from_os_filepath(db_filepath)
        bundle_lid = LID.create_from_parts([self._bundle_segment])
        bundle_vid = changes_dict.vid(bundle_lid)
        bundle_lidvid = LIDVID.create_from_lid_and_vid(bundle_lid, bundle_vid)
        with make_osfs(archive_dir) as archive_osfs, make_version_view(
            archive_osfs, self._bundle_segment
        ) as version_view, make_sv_deltas(
            version_view, archive_primary_deltas_dir
        ) as sv_deltas, make_sv_deltas(
            sv_deltas, archive_browse_deltas_dir
        ) as browse_deltas:
            bundle_path = f"/{self._bundle_segment}$/"
            collection_segments = [
                str(coll[:-1])
                for coll in browse_deltas.listdir(bundle_path)
                if "$" in coll
            ]
            for collection_segment in collection_segments:
                collection_lid = LID.create_from_parts(
                    [self._bundle_segment, collection_segment]
                )
                if _requires_browse_collection(collection_segment):
                    collection_vid = changes_dict.vid(collection_lid)
                    collection_lidvid = LIDVID.create_from_lid_and_vid(
                        collection_lid, collection_vid
                    )
                    if changes_dict.changed(collection_lid):
                        PDS_LOGGER.log(
                            "info", f"Making browse for {collection_lidvid}"
                        )
                        _build_browse_collection(
                            db,
                            changes_dict,
                            browse_deltas,
                            bundle_lidvid,
                            collection_lidvid,
                            bundle_path,
                        )
                    else:
                        _fill_in_old_browse_collection(
                            db, changes_dict, bundle_lidvid, collection_lidvid
                        )
            write_changes_dict(changes_dict, changes_path)
        PDS_LOGGER.log("info", "Leaving BuildBrowse.")
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
def dirs_match(dirpath: str) -> bool:
    primary_dirs = filter_to_primary_dirs(
        dirpath,
        (
            relpath(dir)
            for dir in SubFS(primary_fs, dirpath).walk.dirs()
            if "$" in dir
        ),
    )
    latest_dirs = filter_to_primary_dirs(
        dirpath,
        (
            relpath(dir)
            for dir in SubFS(latest_version_fs, dirpath).walk.dirs()
            if "$" in dir
        ),
    )
    PDS_LOGGER.open("Directory changes detected")
    if primary_dirs == latest_dirs:
        for dir in primary_dirs:
            full_dirpath = join(dirpath, relpath(dir))
            lid = dirpath_to_lid(full_dirpath)
            if lid not in result.changes_dict:
                raise KeyError(f"{lid} not in changes_dict.")
            if result.changed(lid):
                PDS_LOGGER.log(
                    "info", f"CHANGE DETECTED in {dirpath}: {lid} changed"
                )
                PDS_LOGGER.close()
                return False
        PDS_LOGGER.close()
        return True
    else:
        # The sets of dirs do not match.
        added = primary_dirs - latest_dirs
        removed = latest_dirs - primary_dirs
        if added and removed:
            PDS_LOGGER.log(
                "info",
                f"CHANGE DETECTED IN {dirpath}: added {added}; removed {removed}",
            )
        elif added:
            PDS_LOGGER.log(
                "info", f"CHANGE DETECTED IN {dirpath}: added {added}"
            )
        else:  # removed
            PDS_LOGGER.log(
                "info", f"CHANGE DETECTED IN {dirpath}: removed {removed}"
            )
        PDS_LOGGER.close()
        return False
def files_match(dirpath: str) -> bool:
    # All files in subcomponents will have a "$" in their path (it
    # comes after the name of the subcomponent), so by filtering
    # them out, we get only the files for this component. PDS4
    # *does* allow directories in a component (that aren't part of
    # a subcomponent), so we use walk instead of listdir() to get
    # *all* the files, not just the top-level ones.
    primary_files = filter_to_primary_files(
        dirpath,
        (
            relpath(filepath)
            for filepath in SubFS(primary_fs, dirpath).walk.files()
            if "$" not in filepath
        ),
    )
    latest_files = filter_to_primary_files(
        dirpath,
        (
            relpath(filepath)
            for filepath in SubFS(latest_version_fs, dirpath).walk.files()
            if "$" not in filepath
        ),
    )
    try:
        PDS_LOGGER.open("File changes detected")
        if primary_files != latest_files:
            PDS_LOGGER.log(
                "info",
                f"CHANGE DETECTED IN {dirpath}: "
                f"{primary_files} != {latest_files}",
            )
            # The finally clause closes the logger; closing here as
            # well would close it twice.
            return False
        for filename in primary_files:
            filepath = join(dirpath, relpath(filename))
            if primary_fs.getbytes(filepath) != latest_version_fs.getbytes(
                filepath
            ):
                PDS_LOGGER.log(
                    "info",
                    f"CHANGE DETECTED IN {filepath}; DIRPATH = {dirpath}",
                )
                return False
    except Exception as e:
        PDS_LOGGER.exception(e)
    finally:
        PDS_LOGGER.close()
    return True
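
# A minimal sketch of how the two predicates above would combine in a
# change-detection pass, assuming a component counts as unchanged only when
# both its own files and its set of subcomponent directories match. The
# driver function itself is hypothetical.
def _example_component_unchanged(dirpath: str) -> bool:
    return files_match(dirpath) and dirs_match(dirpath)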
def log_label(tag: str, lidvid: str) -> None:
    PDS_LOGGER.log("info", f"{tag} label for {lidvid}")
def _run(self) -> None:
    working_dir: str = self.working_dir()
    archive_dir: str = self.archive_dir()
    archive_primary_deltas_dir: str = self.archive_primary_deltas_dir()
    archive_browse_deltas_dir: str = self.archive_browse_deltas_dir()
    archive_label_deltas_dir: str = self.archive_label_deltas_dir()
    if os.path.isdir(self.deliverable_dir()):
        raise ValueError(
            f"{self.deliverable_dir()} cannot exist for BuildLabels."
        )
    changes_path = fs.path.join(working_dir, CHANGES_DICT_NAME)
    changes_dict = read_changes_dict(changes_path)
    with make_osfs(archive_dir) as archive_osfs, make_version_view(
        archive_osfs, self._bundle_segment
    ) as version_view, make_sv_deltas(
        version_view, archive_primary_deltas_dir
    ) as sv_deltas, make_sv_deltas(
        sv_deltas, archive_browse_deltas_dir
    ) as browse_deltas, make_sv_deltas(
        browse_deltas, archive_label_deltas_dir
    ) as label_deltas:
        # Open the database.
        db_filepath = fs.path.join(working_dir, _BUNDLE_DB_NAME)
        db = create_bundle_db_from_os_filepath(db_filepath)

        # Create labels.
        bundle_lid = LID.create_from_parts([self._bundle_segment])
        bundle_vid = changes_dict.vid(bundle_lid)
        bundle_lidvid = LIDVID.create_from_lid_and_vid(bundle_lid, bundle_vid)
        documents_dir = f"/{self._bundle_segment}$/document$/phase2$"
        docs = set(sv_deltas.listdir(documents_dir))

        # Fetch citation info from the database.
        citation_info_from_db = db.get_citation(str(bundle_lidvid))
        info = Citation_Information(
            citation_info_from_db.filename,
            citation_info_from_db.propno,
            citation_info_from_db.category,
            citation_info_from_db.cycle,
            citation_info_from_db.authors.split(","),
            citation_info_from_db.title,
            citation_info_from_db.submission_year,
            citation_info_from_db.timing_year,
            citation_info_from_db.abstract.split("\n"),
        )
        info.set_publication_year(PUBLICATION_YEAR)

        try:
            PDS_LOGGER.open("BuildLabels")
            # create_pds4_labels() may change changes_dict, because we
            # create the context collection if it doesn't exist.
            create_pds4_labels(
                working_dir, db, bundle_lidvid, changes_dict, label_deltas, info
            )
        except Exception as e:
            PDS_LOGGER.exception(e)
        finally:
            PDS_LOGGER.close()
        write_changes_dict(changes_dict, changes_path)