Example #1
    def preprocess_bucket(self, s3_uri_input: str, s3_uri_output: str,
                          n_cores: int = 1):
        bucket_name_input, path_input = s3_uri_bucket(s3_uri_input)
        bucket_name_output, path_output = s3_uri_bucket(s3_uri_output)

        files = get_s3_path_to_files(bucket_name_input, path_input)

        def output_join_path(f):
            # Keep the last two path components (field directory and file name).
            return os.path.join("s3://", bucket_name_output, path_output,
                                "/".join(f.split("/")[-2:]))

        data = pd.DataFrame({
            "input_file": files,
            "output_file": [output_join_path(f) for f in files]
        })

        # Skip files whose output already exists in the destination bucket.
        existing_files = get_s3_path_to_files(bucket_name_output, path_output)
        data = data[~data["output_file"].isin(existing_files)]
        if n_cores == 1:
            data.apply(lambda x: self.apply(x["input_file"], x["output_file"]),
                       axis=1)
        elif n_cores > 1:
            with Pool(n_cores) as pool:
                for _ in tqdm(pool.imap_unordered(self._apply, data.values),
                              total=len(data)):
                    pass
        return data
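The serial branch above calls self.apply(input_file, output_file) on each row, while the parallel branch maps self._apply over data.values, i.e. over (input_file, output_file) pairs. The worker itself is not shown; a plausible sketch, assuming it is just a tuple-unpacking wrapper around apply:

    def _apply(self, row):
        # row is one row of data.values: (input_file, output_file)
        input_file, output_file = row
        return self.apply(input_file, output_file)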
Example #2
def compute_features(s3_uri_input: str, s3_uri_output: str, partition: int,
                     total_cores: int, preprocess: bool, use_monitor: bool,
                     path_monitor: str):
    from ztf_dr.extractors import DataReleaseExtractor
    from ztf_dr.utils.preprocess import Preprocessor
    if use_monitor:
        monitor(path_monitor,
                f"compute_features_{partition}",
                log=True,
                plot=False)
    logging.info("Initializing features computer")
    bucket_name_input, path_input = s3_uri_bucket(s3_uri_input)
    bucket_name_output, path_output = s3_uri_bucket(s3_uri_output)

    data_release = get_s3_path_to_files(bucket_name_input, path_input)
    existing_features = get_s3_path_to_files(bucket_name_output, path_output)
    to_process = s3_filename_difference(data_release, existing_features)
    partitions = split_list(to_process, total_cores)
    my_partition = partitions[partition]
    logging.info(f"Partition {partition} has {len(my_partition)} files")
    del partitions
    del data_release
    del to_process
    del existing_features
    dr_ext = DataReleaseExtractor()
    dr_pre = Preprocessor(limit_epochs=20,
                          mag_error_tolerance=1.0,
                          catflags_filter=0)
    for index, file in enumerate(my_partition):
        out_file = "/".join(file.split("/")[-2:])
        output_file = os.path.join("s3://", bucket_name_output, path_output,
                                   out_file)
        logging.info(f"{index+1}/{len(my_partition)} processing {file}")
        data = pd.read_parquet(file)
        if preprocess:
            data = dr_pre.run(data)

        if data is None:
            continue
        features = dr_ext.compute_features(data)
        del data
        if features is None:
            continue
        if len(features) == 0:
            logging.info(f"No features for {file}")
            continue
        # Retry the upload a few times in case of transient timeouts.
        tries = 0
        while tries < 5:
            try:
                features.to_parquet(output_file)
                break
            except ServerTimeoutError:
                tries += 1
        del features
    logging.info("Features computed")
Example #3
def insert_lightcurves(mongo_uri: str, mongo_database: str,
                       mongo_collection: str, s3_uri: str, n_processes: int,
                       batch_size: int, drop: bool):
    logging.info("Init now")
    mongo_config = {
        "mongo_uri": mongo_uri,
        "mongo_database": mongo_database,
        "mongo_collection": mongo_collection
    }
    if drop:  # pragma: no cover
        drop_mongo(mongo_config)
    bucket_name, path = s3_uri_bucket(s3_uri)
    to_process = get_s3_path_to_files(bucket_name, path)
    if n_processes == 1:
        for file in to_process:
            insert_lightcurves_to_mongo(file,
                                        mongo_config,
                                        batch_size=batch_size)

    else:  # pragma: no cover
        args = [(os.path.join("s3://", bucket_name,
                              f), mongo_config, batch_size)
                for f in to_process]
        run_jobs(args, insert_lightcurves_to_mongo, num_processes=n_processes)

    mongo_indexes = [("loc", "2dsphere"), ("fieldid", 1), ("filterid", 1)]
    create_indexes(mongo_config, mongo_indexes)
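The create_indexes helper is not shown in these examples. A minimal sketch of what it might look like with pymongo, assuming the mongo_config keys used above (the actual ztf_dr implementation may differ):

from pymongo import MongoClient

def create_indexes(mongo_config, indexes):
    # indexes is a list of (field, index_type) pairs,
    # e.g. [("loc", "2dsphere"), ("fieldid", 1)].
    client = MongoClient(mongo_config["mongo_uri"])
    collection = client[mongo_config["mongo_database"]][
        mongo_config["mongo_collection"]]
    for field, index_type in indexes:
        collection.create_index([(field, index_type)])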
Example #4
    def test_preprocess_bucket(self):
        preprocessor = Preprocessor()
        preprocessor.preprocess_bucket(
            "s3://test_bucket/drx/field0202/ztf_000202_zg_c10_q1_dr5.parquet",
            "s3://test_bucket/drx/preprocessed.parquet",
            n_cores=1)
        bucket, path = s3_uri_bucket(
            "s3://test_bucket/drx/preprocessed.parquet")
        files = get_s3_path_to_files(bucket, path)
        self.assertEqual(len(files), 1)
Example #5
def parse_parquets(s3_uri_input: str,
                   s3_uri_output: str,
                   n_processes: int = 2) -> None:
    bucket_name_input, path_input = s3_uri_bucket(s3_uri_input)
    bucket_name_output, path_output = s3_uri_bucket(s3_uri_output)

    fields = get_s3_path_to_files(bucket_name_input, path_input)
    parsed_fields = get_s3_path_to_files(bucket_name_output, path_output)

    logging.info(f"{len(parsed_fields)}/{len(fields)} fields processed")

    to_process = s3_filename_difference(fields, parsed_fields)
    n_to_process = len(to_process)
    if n_to_process:
        logging.info(
            f"Process {n_to_process} files in {n_processes} processes")

        def output_join_path(f):
            # Keep the last two path components (field directory and file name).
            return os.path.join("s3://", bucket_name_output, path_output,
                                "/".join(f.split("/")[-2:]))

        arguments = [(x, output_join_path(x)) for x in to_process]
        run_jobs(arguments, parse_field, num_processes=n_processes)
Example #6
def insert_features(mongo_uri: str, mongo_database: str, mongo_collection: str,
                    s3_uri: str, n_process: int, batch_size: int):
    bucket_name, path = s3_uri_bucket(s3_uri)
    to_process = get_s3_path_to_files(bucket_name, path)
    mongo_config = {
        "mongo_uri": mongo_uri,
        "mongo_database": mongo_database,
        "mongo_collection": mongo_collection
    }

    if n_process == 1:
        for file in to_process:
            insert_features_to_mongo(file, mongo_config, batch_size=batch_size)

    else:  # pragma: no cover
        args = [(f, mongo_config, batch_size) for f in to_process]
        run_jobs(args, insert_features_to_mongo, num_processes=n_process)
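run_jobs, used in Examples #3, #5 and #6, is also not shown. Assuming it simply fans the argument tuples out over a process pool, a minimal sketch could be:

from multiprocessing import Pool

def run_jobs(arguments, func, num_processes=2):
    # Each tuple in arguments is unpacked into a call func(*args)
    # executed in a worker process.
    with Pool(num_processes) as pool:
        pool.starmap(func, arguments)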
Example #7
    def test_parse_parquet(self):
        parse_parquets("s3://test_bucket/drx/field0202/",
                       "s3://test_bucket/drx/parsed/")
        files = get_s3_path_to_files("test_bucket", "drx/parsed/")
        self.assertEqual(len(files), 1)
Example #8
    def test_get_s3_path_to_files(self):
        bucket, path = s3_uri_bucket(
            "s3://test_bucket/drx/field0202/ztf_000202_zg_c10_q1_dr5.parquet")
        files = get_s3_path_to_files(bucket, path)
        self.assertEqual(len(files), 1)
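All of the examples rely on s3_uri_bucket and get_s3_path_to_files from ztf_dr.utils.s3. Judging from how they are called, the first splits an S3 URI into a bucket name and a key prefix, and the second lists the objects under that prefix as full s3:// URIs. A rough boto3-based sketch, offered only as an assumption about their behaviour:

import boto3
from urllib.parse import urlparse

def s3_uri_bucket(s3_uri):
    # "s3://bucket/prefix/file.parquet" -> ("bucket", "prefix/file.parquet")
    parsed = urlparse(s3_uri)
    return parsed.netloc, parsed.path.lstrip("/")

def get_s3_path_to_files(bucket_name, path):
    # List every object under the prefix and return full s3:// URIs.
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    files = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=path):
        for obj in page.get("Contents", []):
            files.append(f"s3://{bucket_name}/{obj['Key']}")
    return files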