Python prepare_data 예제들, astra.operators.utils.prepare_data Python 예제들

예제 #1

0

파일 보기

def classify(pks, **kwargs):
    """
    Classify sources given the primary keys of task instances.

    Each task instance names the network to use through its ``model_path``
    parameter. A network is loaded the first time its path is seen and then
    re-used for every later instance that refers to the same path.

    :param pks:
        the primary keys of the task instances in the database that need classification

    :param kwargs:
        accepted for interface compatibility; not used by this function.
    """

    models = {}
    results = {}
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None:
            continue

        model_path = instance.parameters["model_path"]

        try:
            model, factory = models[model_path]
        except KeyError:
            # The factory name is the second-to-last "_"-separated token of the path.
            network_factory = model_path.split("_")[-2]
            factory = getattr(networks, network_factory)

            log.info(f"Loading model from {model_path} using {factory}")
            model = utils.read_network(factory, model_path)
            model.eval()

            models[model_path] = (model, factory)

        flux = torch.from_numpy(spectrum.flux.value.astype(np.float32))

        with torch.no_grad():
            prediction = model.forward(flux)
            log_probs = prediction.cpu().numpy().flatten()

        # Store the class names next to the probabilities: different task
        # instances may use different networks, so the names must come from
        # the factory that produced *this* prediction, not from whichever
        # factory happened to be bound last when the loop finished.
        results[instance.pk] = (factory.class_names, log_probs)

    for pk, (class_names, log_probs) in tqdm(results.items(),
                                             desc="Writing results"):

        result = _prepare_log_prob_result(class_names, log_probs)

        # Write the output to the database.
        create_task_output(pk, astradb.Classification, **result)

예제 #2

0

파일 보기

def estimate_stellar_labels(pks,
                            default_num_uncertainty_draws=100,
                            default_large_error=1e10):
    """
    Estimate the stellar parameters for APOGEE ApStar observations,
    where task instances have been created with the given primary keys (`pks`).

    For every task instance the model named by its ``model_path`` parameter is
    evaluated on the spectrum. When uncertainty draws are requested, the flux is
    perturbed with Gaussian noise scaled by the flux error and the model is
    re-evaluated to obtain the median and standard deviation of each label.
    One database output is written per task instance.

    :param pks:
        The primary keys of task instances that include information about what
        ApStar observation to load.

    :param default_num_uncertainty_draws: [optional]
        The number of random draws to make of the flux uncertainties, which will be
        propagated into the estimate of the stellar parameter uncertainties (default: 100).
        A task instance can override this with its ``num_uncertainty_draws`` parameter.

    :param default_large_error: [optional]
        An arbitrarily large error value to assign to bad pixels (default: 1e10).
        A task instance can override this with its ``large_error`` parameter.
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    log.info(f"Running APOGEENet on device {device} with:")
    log.info(f"\tpks: {pks}")

    log.debug(
        f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")

    log.debug(f"Using torch version {torch.__version__} in {torch.__path__}")

    # Cache of loaded models, keyed by model path.
    models = {}

    pks = deserialize_pks(pks, flatten=True)
    total = len(pks)

    log.info(f"There are {total} primary keys to process: {pks}")

    for instance, path, spectrum in tqdm(prepare_data(pks), total=total):
        if spectrum is None:
            continue

        model_path = instance.parameters["model_path"]

        # Load the model (once per distinct path).
        try:
            model = models[model_path]
        except KeyError:
            models[model_path] = model = Model(model_path, device)
            # Only report the load after it has actually succeeded.
            log.info(f"Loaded model from {model_path}")

        N, P = spectrum.flux.shape

        # Build metadata array.
        metadata_keys, metadata, metadata_norm = get_metadata(spectrum)

        flux = np.nan_to_num(spectrum.flux.value).astype(np.float32).reshape(
            (N, 1, P))
        meta = np.tile(metadata_norm, N).reshape((N, -1))

        flux = torch.from_numpy(flux).to(device)
        meta = torch.from_numpy(meta).to(device)

        with torch.set_grad_enabled(False):
            predictions = model.predict_spectra(flux, meta)
            # Always bring the result back as a NumPy array: the code below
            # uses NumPy masking and `.tolist()`. `.cpu()` is a no-op for a
            # tensor that already lives on the CPU, so no device test is needed
            # (and comparing a `torch.device` against the string "cpu" is not a
            # reliable test anyway).
            predictions = predictions.cpu().data.numpy()

        # Replace infinites with non-finite.
        predictions[~np.isfinite(predictions)] = np.nan

        # Create results array. The model returns log10(teff) in column 1.
        log_g, log_teff, fe_h = predictions.T
        teff = 10**log_teff
        result = dict(
            snr=spectrum.meta["snr"],
            teff=teff.tolist(),
            logg=log_g.tolist(),
            fe_h=fe_h.tolist(),
        )

        num_uncertainty_draws = int(
            instance.parameters.get("num_uncertainty_draws",
                                    default_num_uncertainty_draws))

        if num_uncertainty_draws > 0:
            large_error = float(
                instance.parameters.get("large_error", default_large_error))

            # NOTE(review): the `**-0.5` implies the uncertainty array holds
            # inverse variances -- confirm against the data loader.
            flux_error = np.nan_to_num(
                spectrum.uncertainty.array**-0.5).astype(np.float32).reshape(
                    (N, 1, P))
            median_error = 5 * np.median(flux_error, axis=(1, 2))

            # Cap bad or very noisy pixels at five times the median error of
            # their spectrum so that they do not dominate the random draws.
            for j, value in enumerate(median_error):
                bad_pixel = (flux_error[j]
                             == large_error) | (flux_error[j] >= value)
                flux_error[j][bad_pixel] = value

            flux_error = torch.from_numpy(flux_error).to(device)

            inputs = torch.randn((num_uncertainty_draws, N, 1, P),
                                 device=device) * flux_error + flux
            inputs = inputs.reshape((num_uncertainty_draws * N, 1, P))

            meta_error = meta.repeat(num_uncertainty_draws, 1)
            with torch.set_grad_enabled(False):
                draws = model.predict_spectra(inputs, meta_error)
                # See the note on `predictions` above: convert unconditionally.
                draws = draws.cpu().data.numpy()

            draws = draws.reshape((num_uncertainty_draws, N, -1))

            # Need to put the log(teffs) to teffs before calculating std_dev
            draws[:, :, 1] = 10**draws[:, :, 1]

            median_draw_predictions = np.nanmedian(draws, axis=0)
            std_draw_predictions = np.nanstd(draws, axis=0)

            log_g_median, teff_median, fe_h_median = median_draw_predictions.T
            log_g_std, teff_std, fe_h_std = std_draw_predictions.T

            result.update(_teff_median=teff_median.tolist(),
                          _logg_median=log_g_median.tolist(),
                          _fe_h_median=fe_h_median.tolist(),
                          u_teff=teff_std.tolist(),
                          u_logg=log_g_std.tolist(),
                          u_fe_h=fe_h_std.tolist())

        else:
            median_draw_predictions, std_draw_predictions = (None, None)

        # Add the bitmask flag.
        bitmask_flag = create_bitmask(
            predictions,
            median_draw_predictions=median_draw_predictions,
            std_draw_predictions=std_draw_predictions)

        result.update(bitmask_flag=bitmask_flag.tolist())

        # Write the result to database.
        create_task_output(instance, astradb.ApogeeNet, **result)

    log.info(f"Completed processing of {total} primary keys")

예제 #3

0

파일 보기

def estimate_radial_velocity(pks,
                             verbose=True,
                             mcmc=False,
                             figfile=None,
                             cornername=None,
                             retpmodels=False,
                             plot=False,
                             tweak=True,
                             usepeak=False,
                             maxvel=None):
    """
    Estimate radial velocities for the sources that are identified by the task instances
    of the given primary keys.

    Every task instance is processed even if an earlier one fails. Failures are
    logged and collected, and once all instances have been attempted the most
    recent exception is re-raised so the pipeline registers the failure.

    :param pks:
        The primary keys of task instances to estimate radial velocities for, which includes
        parameters to identify the source SDSS data model product.

    :param maxvel: [optional]
        Velocity limits forwarded to `doppler.rv.fit`. When not given, a fresh
        ``[-1000, 1000]`` list is used for this call (the default is created per
        call so that it can never be shared, or mutated, between calls).

    See `doppler.rv.fit` for more information on other keyword arguments.

    :raises Exception:
        The last exception raised while fitting, if any task instance failed.
    """

    # TODO: Move this to astra/contrib
    import doppler

    if maxvel is None:
        maxvel = [-1000, 1000]

    log.info(f"Estimating radial velocities for {len(pks)} task instances")

    failures = []
    last_exception = None
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None:
            continue

        log.debug(f"Running Doppler on {instance} from {path}")

        try:
            # Doppler uses its own reader, so the spectrum is re-read from disk.
            spectrum = doppler.read(path)
            summary, model_spectrum, modified_input_spectrum = doppler.rv.fit(
                spectrum,
                verbose=verbose,
                mcmc=mcmc,
                figfile=figfile,
                cornername=cornername,
                retpmodels=retpmodels,
                plot=plot,
                tweak=tweak,
                usepeak=usepeak,
                maxvel=maxvel)

        except Exception as exception:
            # `Exception` (not a bare except) so that KeyboardInterrupt and
            # SystemExit still stop the run immediately.
            log.exception(
                f"Exception occurred on Doppler on {path} with task instance {instance}"
            )
            failures.append(instance.pk)
            # Remember it: a bare `raise` after the loop would have no active
            # exception to re-raise.
            last_exception = exception
            continue

        else:
            # Write the output to the database.
            results = prepare_results(summary)

            create_task_output(instance, astradb.Doppler, **results)

    if len(failures) > 0:
        log.warning(
            f"There were {len(failures)} Doppler failures out of a total {len(pks)} executions."
        )
        log.warning(f"Failed primary keys include: {failures}")

        log.warning(f"Raising last exception to indicate failure in pipeline.")
        raise last_exception

예제 #4

0

파일 보기

파일: sdss_data_product.py 프로젝트: sdss/astra

 def prepare_data(self):
     """
     Generator that delegates to the module-level ``prepare_data`` for this
     object's primary keys (``self.pks``) and yields whatever it yields.

     NOTE(review): callers of the module-level function unpack each item as
     ``(instance, path, spectrum)``; confirm the exact tuple layout against
     that function's definition.
     """
     delegate = prepare_data(self.pks)
     yield from delegate

예제 #5

0

파일 보기

def estimate_stellar_labels(pks, **kwargs):
    """
    Estimate stellar labels given a single-layer neural network.

    The network state named by each task instance's ``model_path`` parameter is
    loaded once and cached. All spectra are fitted first; the database outputs
    are written afterwards in a second pass.

    :param pks:
        The primary keys of task instances to estimate stellar labels for. The
        task instances include information to identify the source SDSS data product.

    :param kwargs:
        accepted for interface compatibility; not used by this function.

    :returns:
        None. Results are written to the database.
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    log.info(f"Running ThePayne on device {device} with:")
    log.info(
        f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")
    log.info(f"Using torch version {torch.__version__} in {torch.__path__}")

    # Cache of loaded network states, keyed by model path.
    states = {}

    log.info(f"Estimating stellar labels for task instances")

    results = {}
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None:
            continue

        model_path = instance.parameters["model_path"]
        try:
            state = states[model_path]
        except KeyError:
            log.info(f"Loading model from {model_path}")
            state = states[model_path] = test.load_state(model_path)

            L = len(state["label_names"])
            log.info(
                f"Estimating these {L} label names: {state['label_names']}")

        # Always take the label names from the state in use for *this*
        # instance. Reading them only when a model is first loaded would leave
        # stale names behind when instances alternate between cached models.
        label_names = state["label_names"]

        # Run optimization.
        t_init = time()
        p_opt, p_cov, model_flux, meta = test.test(spectrum.wavelength.value,
                                                   spectrum.flux.value,
                                                   spectrum.uncertainty.array,
                                                   **state)
        t_opt = time() - t_init

        # Prepare outputs.
        result = dict(zip(label_names, p_opt.T))
        result.update(snr=spectrum.meta["snr"])

        # Include uncertainties: the square root of the diagonal of each
        # covariance matrix, one column per label.
        diagonal = np.arange(p_opt.shape[1])
        result.update(
            dict(
                zip((f"u_{ln}" for ln in label_names),
                    np.sqrt(p_cov[:, diagonal, diagonal].T))))

        results[instance.pk] = result
        log.info(f"Result for {instance} took {t_opt} seconds")

    # Write database outputs.
    for pk, result in tqdm(results.items(), desc="Writing database outputs"):
        create_task_output(pk, astradb.ThePayne, **result)

    return None

예제 #6

0

파일 보기

def _estimate_stellar_labels(pk):
    """
    Run ThePayne-Che on every spectrum of a single task instance and write the
    result to the database.

    :param pk:
        The primary key of the task instance to process. Its parameters must
        provide ``network_path``, ``N_chebyshev``, ``spectral_resolution``,
        ``N_pre_search_iter`` and ``N_pre_search``; ``constraints`` is optional
        and is parsed as JSON.

    :returns:
        None. Nothing is written when no spectrum is available for `pk`.
    """

    # TODO: It would be great if these were stored with the network,
    #       instead of being hard-coded.
    label_names = ["teff", "logg", "vsini", "v_micro", "m_h"]
    # Translate operator label names to the keys used by the network grid.
    _t = {
        "teff": "T_eff",
        "logg": "log(g)",
        "m_h": "[M/H]",
        "vsini": "v*sin(i)",
    }

    # TODO: This implicitly assumes that the same constraints and network path are used by all the
    #       primary keys given. This is the usual case, but we should check this, and code around it.

    # TODO: This implementation requires knowing the observed spectrum before loading data.
    #       This is fine for ApStar objects since they all have the same dispersion sampling,
    #       but will not be fine for dispersion sampling that differs in each observation.

    # Let's peek ahead at the first valid spectrum we can find. A default is
    # supplied so that an empty generator is reported like a missing spectrum
    # instead of escaping as StopIteration.
    instance, _, spectrum = next(prepare_data([pk]), (None, None, None))
    if spectrum is None:
        # No valid spectrum.
        log.warning(
            f"Cannot build LSF for fitter because no spectrum found for primary key {pk}"
        )
        return None

    network = Network()
    network.read_in(instance.parameters["network_path"])

    constraints = json.loads(instance.parameters.get("constraints", "{}"))
    # Only fit labels whose first two grid entries differ.
    fitted_label_names = [
        ln for ln in label_names
        if network.grid[_t.get(ln, ln)][0] != network.grid[_t.get(ln, ln)][1]
    ]
    L = len(fitted_label_names)

    # Per-label (lower, upper) bounds: the user constraint when given, else
    # the first two entries of the network grid for that label.
    bounds_unscaled = np.zeros((2, L))
    for i, ln in enumerate(fitted_label_names):
        grid_key = _t.get(ln, ln)
        bounds_unscaled[:, i] = constraints.get(ln, network.grid[grid_key][:2])

    fit = Fit(network, int(instance.parameters["N_chebyshev"]))
    fit.bounds_unscaled = bounds_unscaled

    spectral_resolution = int(instance.parameters["spectral_resolution"])
    fit.lsf = LSF_Fixed_R(spectral_resolution, spectrum.wavelength.value,
                          network.wave)

    # Note the Stramut code uses inconsistent naming for "presearch", but in the operator interface we use
    # 'pre_search' in all situations. That's why there is some funny naming translation here.
    fit.N_presearch_iter = int(instance.parameters["N_pre_search_iter"])
    fit.N_pre_search = int(instance.parameters["N_pre_search"])

    fitter = UncertFit(fit, spectral_resolution)
    N, P = spectrum.flux.shape

    keys = []
    keys.extend(fitted_label_names)
    keys.extend([f"u_{ln}" for ln in fitted_label_names])
    keys.extend(["v_rad", "u_v_rad", "chi2", "theta"])

    result = {key: [] for key in keys}
    result["snr"] = spectrum.meta["snr"]

    model_fluxes = []
    log.info(f"Running ThePayne-Che on {N} spectra for {instance}")

    for i in range(N):

        flux = spectrum.flux.value[i]
        # Use the uncertainty of the *same* spectrum as the flux (row i);
        # indexing row 0 here would apply the first spectrum's errors to all.
        # NOTE(review): the `**-0.5` implies the uncertainty array holds
        # inverse variances -- confirm against the data loader.
        error = spectrum.uncertainty.array[i]**-0.5

        # TODO: No NaNs/infs are allowed, but it doesn't seem like that was an issue for Stramut's code.
        #       Possibly due to different versions of scipy. In any case, raise this as a potential bug,
        #       since the errors do not always seem to be believed by ThePayne-Che.
        # Non-finite errors are masked as well, so that no NaN/inf error
        # values reach the fitter.
        bad = (~np.isfinite(flux)) | (~np.isfinite(error)) | (error <= 0)
        flux[bad] = 0
        error[bad] = 1e10

        fit_result = fitter.run(
            spectrum.wavelength.value,
            flux,
            error,
        )

        # The `popt` attribute is length: len(label_names) + 1 (for radial velocity) + N_chebyshev

        # Relevant attributes are:
        # - fit_result.popt
        # - fit_result.uncert
        # - fit_result.RV_uncert
        # - fit_result.model

        for j, label_name in enumerate(fitted_label_names):
            result[label_name].append(fit_result.popt[j])
            result[f"u_{label_name}"].append(fit_result.uncert[j])

        # popt layout: [labels..., radial velocity, remaining coefficients...]
        result["theta"].append(fit_result.popt[L + 1:].tolist())
        result["chi2"].append(fit_result.chi2_func(fit_result.popt))
        result["v_rad"].append(fit_result.popt[L])
        result["u_v_rad"].append(fit_result.RV_uncert)

        model_fluxes.append(fit_result.model)

    # Write database result.
    create_task_output(instance, astradb.ThePayneChe, **result)

    # TODO: Write AstraSource object here (this is what `model_fluxes` is kept for).
    return None

예제 #7

0

파일 보기

파일: operators.py 프로젝트: sdss/astra

def estimate_stellar_labels(pks,
                            model_path,
                            dwave_slam=10.,
                            p_slam=(1E-8, 1E-7),
                            q_slam=0.7,
                            ivar_block_slam=None,
                            eps_slam=1E-19,
                            rsv_frac_slam=2.,
                            n_jobs_slam=1,
                            verbose_slam=5):
    """
    Estimate stellar labels with a pre-trained SLAM model for the task
    instances created with the given primary keys (`pks`).

    Each spectrum is resampled onto the model wavelength grid, continuum
    normalised, given an initial label estimate and then optimised. One
    database output is written per task instance.

    :param pks:
        The primary keys of task instances that include information about what
        observation to load.

    :param model_path:
        The disk path of the pre-trained model.

    :param dwave_slam: float
        binning width (default: 10.)

    :param p_slam: tuple of 2 ps [optional]
        smoothing parameter between 0 and 1: (default: 1E-8, 1E-7)
        0 -> LS-straight line
        1 -> cubic spline interpolant

    :param q_slam: float [optional]
        percentile (default: 0.7).
        NOTE(review): the valid range (0-1 or 0-100) should be confirmed
        against `normalize_spectra_block`.

    :param ivar_block_slam: ndarray (n_pix, ) | None [optional]
        ivar array (default: None)

    :param eps_slam: float [optional]
        the ivar threshold (default: 1E-19)

    :param rsv_frac_slam: float [optional]
        the fraction of pixels reserved in terms of std (default: 2.)

    :param n_jobs_slam: int [optional]
        number of processes launched by joblib (default: 1)

    :param verbose_slam: int / bool [optional]
        verbose level (default: 5)
    """

    # Load the model; spectra are resampled onto its wavelength grid.
    model = Slam.load_dump(model_path)
    wave_interp = model.wave

    log.info(f"Loaded model from {model_path}")

    pks = deserialize_pks(pks, flatten=True)
    total = len(pks)

    log.info(f"There are {total} primary keys to process: {pks}")

    for instance, path, spectrum in tqdm(prepare_data(pks), total=total):
        if spectrum is None:
            continue

        N, P = spectrum.flux.shape

        # NOTE(review): the per-row indexing below assumes all three arrays
        # have N rows -- confirm for `spectral_axis` in particular.
        wave = spectrum.spectral_axis
        fluxes = spectrum.flux
        invars = spectrum.uncertainty

        # Resample every spectrum onto the model wavelength grid.
        fluxes_resamp, invars_resamp = [], []
        for i in tqdm(range(N)):
            fluxes_temp, invars_temp = resample(wave[i], fluxes[i], invars[i],
                                                wave_interp)
            fluxes_resamp += [fluxes_temp]
            invars_resamp += [invars_temp]
        fluxes_resamp, invars_resamp = np.array(fluxes_resamp), np.array(
            invars_resamp)

        # Continuum-normalise each spectrum.
        fluxes_norm, fluxes_cont = normalize_spectra_block(
            wave_interp,
            fluxes_resamp, (6147., 8910.),
            dwave_slam,
            p=p_slam,
            q=q_slam,
            ivar_block=ivar_block_slam,
            eps=eps_slam,
            rsv_frac=rsv_frac_slam,
            n_jobs=n_jobs_slam,
            verbose=verbose_slam)

        # Dividing the flux by the continuum scales the inverse variance by
        # the continuum squared.
        invars_norm = fluxes_cont**2 * invars_resamp

        # Initial estimation: get initial estimate of parameters by chi2 best match
        label_init = model.predict_labels_quick(fluxes_norm,
                                                invars_norm,
                                                n_jobs=1)

        # SLAM prediction: optimize parameters
        results_pred = model.predict_labels_multi(label_init, fluxes_norm,
                                                  invars_norm)
        label_pred = np.array([label['x'] for label in results_pred])
        std_pred = np.array([label['pstd'] for label in results_pred])

        # Create results array. Column order: teff, m_h, logg, alpha_m.
        teff = label_pred[:, 0]
        m_h = label_pred[:, 1]
        log_g = label_pred[:, 2]
        alpha_m = label_pred[:, 3]
        u_teff = std_pred[:, 0]
        u_m_h = std_pred[:, 1]
        u_log_g = std_pred[:, 2]
        u_alpha_m = std_pred[:, 3]
        result = dict(
            snr=spectrum.meta["snr"],
            teff=teff.tolist(),
            m_h=m_h.tolist(),
            logg=log_g.tolist(),
            alpha_m=alpha_m.tolist(),
            u_teff=u_teff.tolist(),
            u_m_h=u_m_h.tolist(),
            u_logg=u_log_g.tolist(),
            u_alpha_m=u_alpha_m.tolist(),
        )

        # Write the result to database.
        create_task_output(instance, astradb.SLAM, **result)

    log.info(f"Completed processing of {total} primary keys")