def preprocess_bucket(self, s3_uri_input: str, s3_uri_output: str, n_cores: int = 1):
    bucket_name_input, path_input = s3_uri_bucket(s3_uri_input)
    bucket_name_output, path_output = s3_uri_bucket(s3_uri_output)
    files = get_s3_path_to_files(bucket_name_input, path_input)
    # Map each input file to its output key, keeping the last two path components
    # (field folder and file name).
    output_join_path = lambda f: os.path.join(
        "s3://", bucket_name_output, path_output, "/".join(f.split("/")[-2:]))
    data = pd.DataFrame({
        "input_file": files,
        "output_file": [output_join_path(f) for f in files]
    })
    # Skip files whose preprocessed output already exists.
    existing_files = get_s3_path_to_files(bucket_name_output, path_output)
    data = data[~data["output_file"].isin(existing_files)]
    if n_cores == 1:
        data.apply(lambda x: self.apply(x["input_file"], x["output_file"]), axis=1)
    elif n_cores > 1:
        with Pool(n_cores) as pool:
            for _ in tqdm(pool.imap_unordered(self._apply, data.values), total=len(data)):
                pass
    return data
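# Usage sketch (hedged): the bucket names and prefixes below are hypothetical, not
# taken from the project; they only illustrate how preprocess_bucket maps an input
# prefix to an output prefix, skips already-preprocessed files, and fans work out
# to a multiprocessing Pool when n_cores > 1. The constructor arguments mirror the
# ones used in compute_features below.
#
#     preprocessor = Preprocessor(limit_epochs=20, mag_error_tolerance=1.0, catflags_filter=0)
#     pending = preprocessor.preprocess_bucket(
#         "s3://my-dr-bucket/dr5/field0202/",        # hypothetical input prefix
#         "s3://my-dr-bucket/dr5_preprocessed/",     # hypothetical output prefix
#         n_cores=4)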
def compute_features(s3_uri_input: str, s3_uri_output: str, partition: int, total_cores: int,
                     preprocess: bool, use_monitor: bool, path_monitor: str):
    from ztf_dr.extractors import DataReleaseExtractor
    from ztf_dr.utils.preprocess import Preprocessor
    if use_monitor:
        monitor(path_monitor, f"compute_features_{partition}", log=True, plot=False)
    logging.info("Initializing features computer")
    bucket_name_input, path_input = s3_uri_bucket(s3_uri_input)
    bucket_name_output, path_output = s3_uri_bucket(s3_uri_output)
    data_release = get_s3_path_to_files(bucket_name_input, path_input)
    existing_features = get_s3_path_to_files(bucket_name_output, path_output)
    # Only compute features for files that do not already have an output.
    to_process = s3_filename_difference(data_release, existing_features)
    partitions = split_list(to_process, total_cores)
    my_partition = partitions[partition]
    logging.info(f"Partition {partition} has {len(my_partition)} files")
    del partitions, data_release, to_process, existing_features
    dr_ext = DataReleaseExtractor()
    dr_pre = Preprocessor(limit_epochs=20, mag_error_tolerance=1.0, catflags_filter=0)
    for index, file in enumerate(my_partition):
        out_file = "/".join(file.split("/")[-2:])
        output_file = os.path.join("s3://", bucket_name_output, path_output, out_file)
        logging.info(f"{index + 1}/{len(my_partition)} processing {file}")
        data = pd.read_parquet(file)
        if preprocess:
            data = dr_pre.run(data)
        if data is None:
            continue
        features = dr_ext.compute_features(data)
        del data
        # Check for None before calling len(), otherwise an empty result raises TypeError.
        if features is None or len(features) == 0:
            logging.info(f"No features for {file}")
            continue
        # Retry the upload a few times in case of transient S3 timeouts.
        tries = 0
        while tries < 5:
            try:
                features.to_parquet(output_file)
                break
            except ServerTimeoutError:
                tries += 1
        del features
    logging.info("Features computed")
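# Usage sketch (hedged): partition/total_cores split the pending files across worker
# processes, typically one invocation per partition index. The S3 prefixes are
# hypothetical and monitoring is disabled for simplicity.
#
#     compute_features(
#         s3_uri_input="s3://my-dr-bucket/dr5_preprocessed/",   # hypothetical
#         s3_uri_output="s3://my-dr-bucket/dr5_features/",      # hypothetical
#         partition=0,
#         total_cores=4,
#         preprocess=False,
#         use_monitor=False,
#         path_monitor="")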
def test_s3_uri_bucket(self):
    bucket, path = s3_uri_bucket(
        "s3://test_bucket/drx/field0202/ztf_000202_zg_c10_q1_dr5.parquet")
    self.assertEqual(bucket, "test_bucket")
    self.assertEqual(path, "drx/field0202/ztf_000202_zg_c10_q1_dr5.parquet")
    # A non-s3 scheme must be rejected.
    with self.assertRaises(Exception) as context:
        s3_uri_bucket(
            "http://test_bucket/drx/field0202/ztf_000202_zg_c10_q1_dr5.parquet")
    self.assertIsInstance(context.exception, Exception)
def insert_lightcurves(mongo_uri: str, mongo_database: str, mongo_collection: str, s3_uri: str,
                       n_processes: int, batch_size: int, drop: bool):
    logging.info("Initializing light curve insertion")
    mongo_config = {
        "mongo_uri": mongo_uri,
        "mongo_database": mongo_database,
        "mongo_collection": mongo_collection
    }
    if drop:  # pragma: no cover
        drop_mongo(mongo_config)
    bucket_name, path = s3_uri_bucket(s3_uri)
    to_process = get_s3_path_to_files(bucket_name, path)
    if n_processes == 1:
        for file in to_process:
            insert_lightcurves_to_mongo(file, mongo_config, batch_size=batch_size)
    else:  # pragma: no cover
        args = [(os.path.join("s3://", bucket_name, f), mongo_config, batch_size)
                for f in to_process]
        run_jobs(args, insert_lightcurves_to_mongo, num_processes=n_processes)
    # Build the spatial and lookup indexes after all documents are inserted.
    mongo_indexes = [("loc", "2dsphere"), ("fieldid", 1), ("filterid", 1)]
    create_indexes(mongo_config, mongo_indexes)
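# Usage sketch (hedged): connection values and the S3 prefix are placeholders. With
# n_processes=1 the files are inserted sequentially; the 2dsphere/fieldid/filterid
# indexes are created once the inserts finish.
#
#     insert_lightcurves(
#         mongo_uri="mongodb://localhost:27017",   # placeholder
#         mongo_database="ztf",                    # placeholder
#         mongo_collection="lightcurves",          # placeholder
#         s3_uri="s3://my-dr-bucket/dr5_parsed/",  # hypothetical
#         n_processes=1,
#         batch_size=10000,
#         drop=False)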
def test_preprocess_bucket(self):
    preprocessor = Preprocessor()
    preprocessor.preprocess_bucket(
        "s3://test_bucket/drx/field0202/ztf_000202_zg_c10_q1_dr5.parquet",
        "s3://test_bucket/drx/preprocessed.parquet",
        n_cores=1)
    bucket, path = s3_uri_bucket("s3://test_bucket/drx/preprocessed.parquet")
    files = get_s3_path_to_files(bucket, path)
    self.assertEqual(len(files), 1)
def __init__(self, data_release_url: str, checksum_path: str, s3_uri: str, output_folder="/tmp"):
    self.logger = self.init_logging()
    self.data_release_url = data_release_url
    self.checksum_path = checksum_path
    self.output_folder = output_folder
    self.bucket_name, self.path = s3_uri_bucket(s3_uri)
    self.checksums = self.get_checksums()
    self.uploaded_files = self.in_s3_files()
def parse_parquets(s3_uri_input: str, s3_uri_output: str, n_processes: int = 2) -> None:
    bucket_name_input, path_input = s3_uri_bucket(s3_uri_input)
    bucket_name_output, path_output = s3_uri_bucket(s3_uri_output)
    fields = get_s3_path_to_files(bucket_name_input, path_input)
    parsed_fields = get_s3_path_to_files(bucket_name_output, path_output)
    logging.info(f"{len(parsed_fields)}/{len(fields)} fields processed")
    to_process = s3_filename_difference(fields, parsed_fields)
    n_to_process = len(to_process)
    if n_to_process:
        logging.info(f"Processing {n_to_process} files in {n_processes} processes")
        # Keep the last two path components (field folder and file name) for the output key.
        output_join_path = lambda f: os.path.join(
            "s3://", bucket_name_output, path_output, "/".join(f.split("/")[-2:]))
        arguments = [(x, output_join_path(x)) for x in to_process]
        run_jobs(arguments, parse_field, num_processes=n_processes)
    return
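# Usage sketch (hedged): the prefixes are hypothetical. Only fields missing from the
# output prefix are parsed, and the work is spread over n_processes workers.
#
#     parse_parquets(
#         "s3://my-dr-bucket/dr5/",          # hypothetical raw data release prefix
#         "s3://my-dr-bucket/dr5_parsed/",   # hypothetical parsed output prefix
#         n_processes=8)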
def insert_features(mongo_uri: str, mongo_database: str, mongo_collection: str, s3_uri: str,
                    n_process: int, batch_size: int):
    bucket_name, path = s3_uri_bucket(s3_uri)
    to_process = get_s3_path_to_files(bucket_name, path)
    mongo_config = {
        "mongo_uri": mongo_uri,
        "mongo_database": mongo_database,
        "mongo_collection": mongo_collection
    }
    if n_process == 1:
        for file in to_process:
            insert_features_to_mongo(file, mongo_config, batch_size=batch_size)
    else:  # pragma: no cover
        args = [(f, mongo_config, batch_size) for f in to_process]
        run_jobs(args, insert_features_to_mongo, num_processes=n_process)
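# Usage sketch (hedged): mirrors insert_lightcurves, but loads the feature parquet
# files instead. Connection values and the S3 prefix are placeholders.
#
#     insert_features(
#         mongo_uri="mongodb://localhost:27017",     # placeholder
#         mongo_database="ztf",                      # placeholder
#         mongo_collection="features",               # placeholder
#         s3_uri="s3://my-dr-bucket/dr5_features/",  # hypothetical
#         n_process=1,
#         batch_size=10000)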
def test_get_s3_path_to_files(self):
    bucket, path = s3_uri_bucket(
        "s3://test_bucket/drx/field0202/ztf_000202_zg_c10_q1_dr5.parquet")
    files = get_s3_path_to_files(bucket, path)
    self.assertEqual(len(files), 1)