def restore_all_tables_for_all_dbs(self,
                                   ts: t.Union[dt.datetime, str],
                                   job_dir: t.Optional[t.Union[Path, str]] = None,
                                   truncate_first: bool = False) -> None:
    """Restore all tables for all databases"""
    ts = parse_timestamp(ts, raise_parse_error=True)

    for dbt in DBType:
        td = None
        try:
            if job_dir:
                job_dir = Path(job_dir).resolve()
                restore_job_dir = Path(job_dir, dbt.value)
                restore_job_dir.mkdir(exist_ok=False)
            else:
                td = TemporaryDirectory()
                restore_job_dir = Path(td.name)

            self.restore_all_tables(db_type=dbt,
                                    ts=ts,
                                    job_dir=restore_job_dir,
                                    truncate_first=truncate_first)
        finally:
            if td:
                td.cleanup()
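# Usage sketch (hypothetical, not part of the module): restoring every
# database's tables from the same timestamped backup. `bu` stands in for an
# instance of this utility class; the path and timestamp are made up.
#
#   bu.restore_all_tables_for_all_dbs(
#       ts="2021-06-01 00:00:00",      # anything parse_timestamp accepts
#       job_dir="/tmp/db-restore",     # omit to stage each db in a temp dir
#       truncate_first=True,           # wipe target tables before import
#   )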
def backup_all_tables(self,
                      db_type: t.Union[DBType, str],
                      ts: t.Union[dt.datetime, str],
                      job_dir: t.Optional[t.Union[Path, str]] = None) -> None:
    """Backup all tables for given db_type to S3

    :param db_type: type of db - web/orch
    :param ts: backup timestamp that determines s3 backup path
    :param job_dir: directory used to store downloaded files - temp dir by default
    """
    db_type = DBType(db_type)
    ts = parse_timestamp(ts, raise_parse_error=True)

    backup_prefix = self.get_backup_prefix(db_type=db_type, ts=ts)
    if self.s3u.prefix_exists(backup_prefix):
        raise ValueError(
            f"Cannot backup to given timestamped prefix because it already exists: {backup_prefix}")

    td = None
    try:
        if job_dir:
            job_dir = Path(job_dir).resolve()
            job_dir.mkdir(exist_ok=True)
        else:
            td = TemporaryDirectory()
            job_dir = Path(td.name)

        print(f"Backing up tables to S3 {backup_prefix}.", file=sys.stderr)
        self.export_all_tables(db_type=db_type, export_base_dir=job_dir)
        self.s3u.upload_dir(local_dir=job_dir, prefix_path=backup_prefix)
    finally:
        if td:
            td.cleanup()
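# Usage sketch (hypothetical): backing up one database's tables. With no
# job_dir, exports are staged in a TemporaryDirectory that is cleaned up
# afterwards; the timestamp becomes part of the S3 backup prefix.
#
#   bu.backup_all_tables(db_type="orch", ts="2021-06-01 00:00:00")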
def restore_current_snapshot(self, snapshot_type: t.Union[SnapshotType, str],
                             snapshot_ts: t.Union[dt.datetime, str]) -> str:
    """Restore current raw/parsed snapshot from the one corresponding to a timestamp

    :param snapshot_type: type of snapshot - raw/parsed
    :param snapshot_ts: timestamp to use when figuring out what prefix to restore from
    :return: s3 prefix to the current snapshot
    """
    snapshot_type = SnapshotType(snapshot_type)
    snapshot_ts = parse_timestamp(snapshot_ts, raise_parse_error=True)

    current_prefix = self.get_current_prefix(snapshot_type)
    backup_prefix = self.get_backup_prefix_for_ts(snapshot_type=snapshot_type,
                                                  ts=snapshot_ts)

    if not self.s3u.prefix_exists(backup_prefix):
        raise ValueError(
            f"Cannot restore backup prefix, it doesn't exist: {backup_prefix}")

    if self.s3u.prefix_exists(current_prefix):
        print(f"Deleting current prefix prior to restore: {current_prefix} ...",
              file=sys.stderr)
        for obj_path in self.s3u.iter_object_paths_at_prefix(current_prefix):
            self.s3u.delete_object(obj_path)

    print(f"Restoring current prefix - {current_prefix} - from backup at {backup_prefix} ...",
          file=sys.stderr)
    self.s3u.copy_prefix(src_prefix=backup_prefix, dst_prefix=current_prefix)
    return current_prefix
def backup_current_snapshot(
        self,
        snapshot_type: t.Union[SnapshotType, str],
        snapshot_ts: t.Optional[t.Union[dt.datetime, str]] = None
) -> t.Optional[str]:
    """Backup current raw/parsed snapshot to timestamped location in S3

    :param snapshot_type: type of snapshot - raw/parsed
    :param snapshot_ts: timestamp to use when constructing the backup prefix,
        current time by default
    :return: s3 prefix to the backup, or None if there is no current snapshot
    """
    snapshot_type = SnapshotType(snapshot_type)
    # fall back to the current time at call time, not at function-definition time
    snapshot_ts = parse_timestamp(snapshot_ts or dt.datetime.now(),
                                  raise_parse_error=True)

    current_prefix = self.get_current_prefix(snapshot_type)
    backup_prefix = self.get_backup_prefix_for_ts(snapshot_type=snapshot_type,
                                                  ts=snapshot_ts)

    if self.s3u.prefix_exists(backup_prefix):
        raise ValueError(
            f"Cannot backup current snapshot because corresponding prefix already exists: {backup_prefix}")

    if not self.s3u.prefix_exists(current_prefix):
        print(f"Cannot backup current snapshot because there's nothing there: {current_prefix}",
              file=sys.stderr)
        return None

    print(f"Backing up current prefix {current_prefix} to archive {backup_prefix} ...",
          file=sys.stderr)
    self.s3u.copy_prefix(src_prefix=current_prefix, dst_prefix=backup_prefix)
    return backup_prefix
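# Usage sketch (hypothetical): a backup/restore round trip for one snapshot
# type. The timestamp handed to the backup is the same one handed to the
# restore, so both resolve to the same archive prefix.
#
#   ts = dt.datetime.now()
#   archive_prefix = bu.backup_current_snapshot(snapshot_type="raw", snapshot_ts=ts)
#   ...
#   current_prefix = bu.restore_current_snapshot(snapshot_type="raw", snapshot_ts=ts)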
def process_db_doc_updates(self, idgs: t.Iterable[IngestableDocGroup],
                           ts: t.Union[dt.datetime, str]) -> None:
    """Process versioned_doc table db updates using given docs"""
    ts = parse_timestamp(ts=ts, raise_parse_error=True)

    with Config.connection_helper.orch_db_session_scope('rw') as session:
        for idg in idgs:
            if not idg.metadata_idoc:
                continue

            metadata = idg.metadata_idoc.metadata
            existing_doc = VersionedDoc.get_existing_from_doc(doc=metadata,
                                                              session=session)
            if existing_doc:
                session.add(existing_doc)
            else:
                pub = Publication.get_or_create_from_document(doc=metadata,
                                                              session=session)
                if pub:
                    session.add(pub)
                vdoc = VersionedDoc.create_from_document(
                    doc=metadata,
                    pub=pub,
                    filename=idg.raw_idoc.local_path.name,
                    doc_location=idg.raw_idoc.s3_path or "",
                    batch_timestamp=ts)
                session.add(vdoc)
        session.commit()
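# Usage sketch (hypothetical): given doc groups that have already been
# uploaded (so raw_idoc.s3_path is populated), record them in the orch db.
#
#   loader.process_db_doc_updates(idgs=uploaded_idgs, ts="2021-06-01 00:00:00")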
def get_publication_date(doc_dict):
    """Get publication date from a doc dict as an ISO-8601 string ("" if missing or unparseable)"""
    try:
        parsed_date = parse_timestamp(doc_dict.get("publication_date", None))
        if parsed_date:
            return parsed_date.strftime('%Y-%m-%dT%H:%M:%S')
    except Exception:
        pass
    return ""
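# Example (illustrative, assuming parse_timestamp accepts date-only strings):
#
#   get_publication_date({"publication_date": "2021-06-01"})  # -> "2021-06-01T00:00:00"
#   get_publication_date({})                                  # -> ""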
def get_backup_prefix(self, db_type: t.Union[DBType, str],
                      ts: t.Union[dt.datetime, str]) -> str:
    """Get S3 backup prefix for given db_type and timestamp"""
    db_type = DBType(db_type)
    ts = parse_timestamp(ts, raise_parse_error=True)
    ts_str = ts.strftime(Config.TIMESTAMP_FORMAT)
    return {
        DBType.WEB: self.db_backup_base_prefix + DBType.WEB.value + '/' + ts_str,
        DBType.ORCH: self.db_backup_base_prefix + DBType.ORCH.value + '/' + ts_str,
    }[db_type]
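# Example (illustrative): with db_backup_base_prefix "backups/db/" and
# Config.TIMESTAMP_FORMAT "%Y-%m-%dT%H:%M:%S" (both assumed values here),
# a web backup prefix would look like:
#
#   bu.get_backup_prefix(db_type=DBType.WEB, ts="2021-06-01 00:00:00")
#   # -> "backups/db/web/2021-06-01T00:00:00"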
def restore_all_current_snapshots(
        self,
        snapshot_ts: t.Optional[t.Union[dt.datetime, str]] = None
) -> t.List[str]:
    """Restore current snapshots for all databases"""
    # fall back to the current time at call time, not at function-definition time
    snapshot_ts = parse_timestamp(ts=snapshot_ts or dt.datetime.now(),
                                  raise_parse_error=True)
    restored_current_prefixes: t.List[str] = []
    for st in SnapshotType:
        s3_path = self.restore_current_snapshot(snapshot_type=st,
                                                snapshot_ts=snapshot_ts)
        restored_current_prefixes.append(s3_path)
    return restored_current_prefixes
def backup_all_current_snapshots(
        self,
        snapshot_ts: t.Optional[t.Union[dt.datetime, str]] = None
) -> t.List[str]:
    """Backup current snapshots for all databases"""
    # fall back to the current time at call time, not at function-definition time
    snapshot_ts = parse_timestamp(ts=snapshot_ts or dt.datetime.now(),
                                  raise_parse_error=True)
    backed_up_snapshot_paths: t.List[str] = []
    for st in SnapshotType:
        s3_path = self.backup_current_snapshot(snapshot_type=st,
                                               snapshot_ts=snapshot_ts)
        if s3_path:
            backed_up_snapshot_paths.append(s3_path)
    return backed_up_snapshot_paths
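# Usage sketch (hypothetical): backing up raw and parsed snapshots in one
# call; the returned list contains only prefixes that had something to back up.
#
#   archived = bu.backup_all_current_snapshots(snapshot_ts=dt.datetime.now())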
def upload_docs_to_s3(self, idgs: t.Iterable[IngestableDocGroup],
                      ts: t.Union[dt.datetime, str],
                      max_threads: int) -> t.List[IngestableDocGroup]:
    """Upload all raw/parsed/metadata/thumbnail docs in each group to s3

    :return: doc groups with their `s3_path` attributes set to the uploaded locations
    """
    ts = parse_timestamp(ts, raise_parse_error=True)

    def _upload_to_s3(idoc: GenericIngestableDoc, ts: dt.datetime) -> str:
        print(f"Uploading doc {idoc.local_path!s} to S3 ... ", file=sys.stderr)
        s3_location = Config.s3_utils.upload_file(
            file=idoc.local_path,
            object_prefix=self.get_timestamped_archive_prefix_for_idoc(
                idoc=idoc, ts=ts))
        return s3_location

    if max_threads < 0:
        # use all available cores - NOT recommended; this uses all computing
        # power at once and will probably crash on a big directory
        max_workers = multiprocessing.cpu_count()
    elif max_threads >= 1:
        # single-threaded (1) or partitioned multithreading (>1)
        max_workers = max_threads
    else:
        raise ValueError(f"Invalid max_threads value given: {max_threads}")

    def _upload_group(idg: IngestableDocGroup) -> IngestableDocGroup:
        idg.raw_idoc.s3_path = _upload_to_s3(idg.raw_idoc, ts=ts)
        if idg.parsed_idoc:
            idg.parsed_idoc.s3_path = _upload_to_s3(idg.parsed_idoc, ts=ts)
        if idg.metadata_idoc:
            idg.metadata_idoc.s3_path = _upload_to_s3(idg.metadata_idoc, ts=ts)
        if idg.thumbnail_idoc:
            idg.thumbnail_idoc.s3_path = _upload_to_s3(idg.thumbnail_idoc, ts=ts)
        return idg

    uploaded_idgs: t.List[IngestableDocGroup] = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # _upload_group returns the finished group, so the uploads run on the
        # executor's worker threads rather than lazily in the main thread
        for idg in executor.map(_upload_group, idgs):
            if idg:
                uploaded_idgs.append(idg)
    return uploaded_idgs
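# Usage sketch (hypothetical): uploading with 4 worker threads; max_threads=1
# runs serially and max_threads=-1 uses every core (not recommended).
#
#   uploaded_idgs = loader.upload_docs_to_s3(idgs=idgs, ts=ingest_ts, max_threads=4)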
def load(self, raw_dir: t.Union[Path, str],
         metadata_dir: t.Optional[t.Union[Path, str]],
         parsed_dir: t.Optional[t.Union[Path, str]],
         thumbnail_dir: t.Optional[t.Union[Path, str]],
         ingest_ts: t.Union[dt.datetime, str], max_threads: int,
         update_s3: bool, update_db: bool) -> None:
    """Process all doc/pub updates for eligible files"""
    ingest_ts = parse_timestamp(ts=ingest_ts, raise_parse_error=True)
    print(
        f"Running load:\n\traw_dir={raw_dir}\n\tmetadata_dir={metadata_dir}\n\tparsed_dir={parsed_dir}\n\t"
        f"thumbnail_dir={thumbnail_dir}",
        file=sys.stderr)

    idgs = list(
        self.get_ingestable_docs(raw_dir=raw_dir,
                                 metadata_dir=metadata_dir,
                                 parsed_dir=parsed_dir,
                                 thumbnail_dir=thumbnail_dir))

    if update_db:
        print("Updating pub entries in 'publications' table ...",
              file=sys.stderr)
        self.process_db_pub_updates(idgs=idgs)
    else:
        print("Skipping updates to 'publications' table ...", file=sys.stderr)

    uploaded_idgs = None
    if update_s3:
        print("Uploading docs to S3 ...", file=sys.stderr)
        uploaded_idgs = list(
            self.upload_docs_to_s3(idgs=idgs,
                                   ts=ingest_ts,
                                   max_threads=max_threads))
    else:
        print("Skipping s3 uploads of docs ...", file=sys.stderr)

    if update_db:
        print("Updating pub entries in 'versioned_docs' table ...",
              file=sys.stderr)
        self.process_db_doc_updates(idgs=uploaded_idgs or idgs, ts=ingest_ts)
    else:
        print("Skipping updates to 'versioned_docs' table ...",
              file=sys.stderr)
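# Usage sketch (hypothetical paths): a full ingest that uploads to S3 and
# updates both db tables.
#
#   loader.load(raw_dir="/data/raw", metadata_dir="/data/metadata",
#               parsed_dir="/data/parsed", thumbnail_dir="/data/thumbnails",
#               ingest_ts=dt.datetime.now(), max_threads=4,
#               update_s3=True, update_db=True)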
def create_from_document(doc: Dict[str, Any], doc_location: str,
                         filename: str, batch_timestamp: dt.datetime,
                         pub: Publication) -> 'VersionedDoc':
    """Generate VersionedDoc from Document obj. and associated Publication"""
    return VersionedDoc(
        publication=pub,
        name=doc['doc_name'],
        type=doc['doc_type'],
        number=doc['doc_num'],
        # TODO: Pass actual filename using ProcessedDoc instead of Doc
        # TODO: Tweak for clones?
        filename=filename,
        doc_location=doc_location,
        batch_timestamp=batch_timestamp,
        publication_date=parse_timestamp(doc['publication_date']),
        json_metadata=doc,
        version_hash=doc['version_hash'],
        md5_hash="",
        is_ignored=False)
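# Usage sketch (hypothetical field values, matching the keys read above;
# `pub` is an existing Publication):
#
#   vdoc = VersionedDoc.create_from_document(
#       doc={"doc_name": "Example Doc", "doc_type": "Instruction",
#            "doc_num": "1234.56", "publication_date": "2021-06-01",
#            "version_hash": "abc123"},
#       doc_location="s3://bucket/path/to/doc.pdf",
#       filename="doc.pdf",
#       batch_timestamp=dt.datetime.now(),
#       pub=pub)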
def get_checkpoint_ts(self, checkpoint_path: str,
                      bucket: Optional[str] = None) -> Optional[datetime.datetime]:
    """Get timestamp from the checkpoint file

    :param checkpoint_path: Path to timestamp checkpoint file
    :param bucket: Bucket name
    :return: Timestamp from the checkpoint file, if one exists
    """
    bucket_name = bucket or self.bucket
    s3_resource = self.ch.s3_resource

    if not self.object_exists(object_path=checkpoint_path, bucket=bucket_name):
        return None

    response: Dict[str, Any] = s3_resource.Object(
        bucket_name, checkpoint_path).get()
    ts_str = response['Body'].read().decode(encoding="utf-8")
    return parse_timestamp(ts_str)
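# Usage sketch (hypothetical checkpoint path): reading the last checkpoint
# timestamp, which is None when no checkpoint object exists yet.
#
#   last_ts = s3u.get_checkpoint_ts(checkpoint_path="checkpoints/ingest.txt")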
def restore_all_tables(self,
                       db_type: t.Union[DBType, str],
                       ts: t.Union[dt.datetime, str],
                       job_dir: t.Optional[t.Union[Path, str]] = None,
                       truncate_first: bool = False) -> None:
    """Restore all tables for db_type from given backup timestamp

    :param db_type: DB type - web/orch
    :param ts: backup timestamp that determines s3 backup path
    :param job_dir: directory used to store downloaded files - temp dir by default
    :param truncate_first: whether to truncate target db tables before importing data
    """
    db_type = DBType(db_type)
    ts = parse_timestamp(ts, raise_parse_error=True)

    backup_prefix = self.get_backup_prefix(db_type=db_type, ts=ts)
    if not self.s3u.prefix_exists(backup_prefix):
        raise ValueError(
            f"There is no backup at given prefix to import: {backup_prefix}")

    td = None
    try:
        if job_dir:
            job_dir = Path(job_dir).resolve()
            job_dir.mkdir(exist_ok=True)
        else:
            td = TemporaryDirectory()
            job_dir = Path(td.name)

        print(f"Restoring from backups at {backup_prefix}.", file=sys.stderr)
        self.s3u.download_dir(local_dir=job_dir, prefix_path=backup_prefix)

        if truncate_first:
            print("Truncating tables before import ...", file=sys.stderr)
            self.truncate_backup_tables(db_type=db_type)

        self.import_all_tables(db_type=db_type, import_base_dir=job_dir)
    finally:
        if td:
            td.cleanup()
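# Usage sketch (hypothetical): restoring one database's tables from a
# previously created backup, truncating the target tables first.
#
#   bu.restore_all_tables(db_type="orch", ts="2021-06-01 00:00:00",
#                         truncate_first=True)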
def get_prefix_at_ts(base_prefix: str, ts: t.Union[dt.datetime, str],
                     ts_fmt: str = TIMESTAMP_FORMAT) -> str:
    """Get prefix for a given timestamp"""
    ts = parse_timestamp(ts=ts, raise_parse_error=True)
    base_prefix = S3Utils.format_as_prefix(base_prefix)
    return S3Utils.path_join(base_prefix, ts.strftime(ts_fmt))
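# Example (illustrative): joining a base prefix and a formatted timestamp,
# assuming TIMESTAMP_FORMAT is "%Y-%m-%dT%H:%M:%S" and that format_as_prefix /
# path_join normalize slashes as their names suggest.
#
#   get_prefix_at_ts("backups/snapshots", "2021-06-01 00:00:00")
#   # -> "backups/snapshots/2021-06-01T00:00:00"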