Example #1
def create_source_table(database_model_name,
                        output_path,
                        format=None,
                        overwrite=False,
                        filter_by_kwargs=None,
                        limit=None):
    """
    Create a table of sources and their results from the database.

    If there are multiple results per source (e.g., from individual visits), then only the first result is included.

    Parameters
    ----------
    database_model_name : str
        Name of the database model to query.
    output_path : str
        Path to the output file.
    format : str
        Format of the output file.
    overwrite : bool
        Overwrite the output file if it already exists.
    filter_by_kwargs : dict
        Keyword arguments to pass to the database query.
    limit : int
        Limit the number of results.
    """

    rows = _get_results(database_model_name,
                        spectrum_index=0,
                        filter_by_kwargs=filter_by_kwargs,
                        limit=limit)

    table = Table(rows=rows)
    table.write(output_path, format=format, overwrite=overwrite)
    log.info(f"Wrote source table with {len(table)} rows to {output_path}")
    return None
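A minimal usage sketch (the model name, output path, and filter keyword below are illustrative assumptions, not taken from the project):

# Hypothetical call, assuming this function is importable from the module above.
create_source_table(
    "ApogeeNet",                             # assumed database model name
    "apogeenet_sources.fits",
    format="fits",
    overwrite=True,
    filter_by_kwargs={"version": "0.2.6"},   # assumed filter keyword
    limit=1000,
)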
Example #2
def create_visit_table(database_model_name,
                       output_path,
                       format=None,
                       overwrite=False,
                       filter_by_kwargs=None,
                       limit=None):
    """
    Create a table of visits and their results from the database. 
    
    Parameters
    ----------
    database_model_name : str
        Name of the database model to query.
    output_path : str
        Path to the output file.
    format : str
        Format of the output file.
    overwrite : bool
        Overwrite the output file if it already exists.
    filter_by_kwargs : dict
        Keyword arguments to pass to the database query.
    limit : int
        Limit the number of results.
    """

    rows = _get_results(database_model_name,
                        spectrum_index=None,
                        filter_by_kwargs=filter_by_kwargs,
                        limit=limit)

    table = Table(rows=rows)
    table.write(output_path, format=format, overwrite=overwrite)
    log.info(f"Wrote visit table with {len(table)} rows to {output_path}")
    return None
Example #3
File: base.py Project: sdss/astra
    def _unlink_primary_key_path(self):
        try:
            primary_key_path = self._primary_key_path
        except AttributeError:
            # No temporary primary key file was created, so there is nothing to remove.
            pass
        else:
            log.info(f"Removing temporary file at {primary_key_path}")
            os.unlink(primary_key_path)
        return None
Example #4
File: core.py Project: sdss/astra
def _execute_ferre_by_slurm(directory, total, offile, interval=60, **kwargs):

    from slurm import queue as SlurmQueue

    label = "ferre"

    queue = SlurmQueue(verbose=True)
    queue.create(
        label=label,
        **kwargs
    )
    queue.append(_ferre_executable, dir=directory)
    queue.commit(hard=True, submit=True)

    log.info(f"Slurm job submitted with {queue.key} and keywords {kwargs} to run {_ferre_executable} in {directory}")
    log.info(f"\tJob directory: {queue.job_dir}")

    # Paths used to monitor the progress of the Slurm job.
    stdout_path = os.path.join(directory, f"{label}_01.o")
    stderr_path = os.path.join(directory, f"{label}_01.e")    
    output_flux_path = os.path.join(directory, offile)

    # Now we wait until the Slurm job is complete.
    t_init, t_to_start = (time(), None)
    while 100 > queue.get_percent_complete():

        sleep(interval)

        t = time() - t_init

        if not os.path.exists(stderr_path) and not os.path.exists(stdout_path):
            log.info(f"Waiting on job {queue.key} to start (elapsed: {t / 60:.0f} min)")
        else:
            log.info(f"Job in {queue.key} has started")
            
            total_done = 0
            with tqdm(total=total, desc="FERRE", unit="spectra") as pb:
                while total_done < total:
                    n_done = _check_ferre_progress(output_flux_path)            
                    pb.update(n_done - total_done)
                    total_done = n_done

                    pb.refresh()

                    sleep(interval)

            log.info("Finishing up.")

    with open(stdout_path, "r") as fp:
        stdout = fp.read()
    with open(stderr_path, "r") as fp:
        stderr = fp.read()

    return (stdout, stderr)
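The `_check_ferre_progress` helper is not shown in this snippet; a minimal sketch of what it might do, assuming FERRE appends one row to the output flux file per completed spectrum (an assumption, not the project's implementation):

import os

def _check_ferre_progress_sketch(output_flux_path):
    # Count completed spectra by counting non-empty rows in the FERRE output flux file.
    if not os.path.exists(output_flux_path):
        return 0
    with open(output_flux_path, "r") as fp:
        return sum(1 for line in fp if line.strip())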
Example #5
    def infer_releases(self, context):
        """
        Infer the SDSS release(s) to use based on the execution context.

        :param context:
            The Airflow context dictionary.
        """
        releases = infer_releases(context["ds"], context["next_ds"])
        log.info(
            f"Between {context['ds']} and {context['next_ds']} the relevant SDSS releases are {releases}"
        )
        return releases
Example #6
File: github.py Project: sdss/astra
def validate_slug(slug):
    r"""
    Validate a given GitHub repository slug.

    :param slug:
        The given slug string, which should be in the form '{OWNER}/{REPO}'.
        If no '{OWNER}' is given, it will be assumed to be owned by the SDSS organization.
    """

    slug = f"{slug}".strip().lower()
    if "/" not in slug:
        log.info(f"Assuming GitHub repository '{slug}' is owned by SDSS (sdss/{slug})")
        slug = f"sdss/{slug}"
    return slug
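For example (illustrative inputs only):

assert validate_slug("Apogee") == "sdss/apogee"
assert validate_slug(" sdss/astra ") == "sdss/astra"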
Example #7
def add_meta_to_task_instances_without_meta():
    """
    Add meta to task instances without meta.
    """
    failed, total = (0, count_task_instances_without_meta())
    for pk in tqdm(yield_task_instance_pks_without_meta(),
                   total=total,
                   desc="Adding metadata to task instances"):
        try:
            add_meta_to_task_instance(pk)
        except Exception:
            log.exception(f"Unable to add meta to task instance with pk {pk}")
            failed += 1
            continue
    log.info(f"Added meta to {total - failed} task instances")
    return None
Example #8
    def run(self):

        # Load training set labels and spectra.
        labels, dispersion, training_set_flux, training_set_ivar = read_training_set(
            self.input().path, self.default_inverse_variance)

        # Set the vectorizer.
        # We sort the label names so that luigi doesn't re-train models if we alter the order.
        vectorizer = tc.vectorizer.PolynomialVectorizer(
            sorted(self.label_names), self.order)

        # Initiate model.
        model = tc.model.CannonModel(labels,
                                     training_set_flux,
                                     training_set_ivar,
                                     vectorizer=vectorizer,
                                     dispersion=dispersion,
                                     regularization=self.regularization)

        log.info(f"Training The Cannon model {model}")
        model.train(threads=self.threads)

        output_path = self.output().path
        log.info(f"Writing The Cannon model {model} to disk {output_path}")
        model.write(output_path)

        if self.plot:
            # Plot zeroth and first order coefficients.
            from astra_thecannon import plot
            fig = plot.theta(
                model,
                indices=np.arange(1 + len(model.vectorizer.label_names)),
                normalize=False)
            fig.savefig(f"{self.output_prefix}-theta.png")

            # Plot scatter.
            fig = plot.scatter(model)
            fig.savefig(f"{self.output_prefix}-scatter.png")

            # Plot one-to-one.
            test_labels, test_cov, test_meta = model.test(
                training_set_flux,
                training_set_ivar,
                initial_labels=model.training_set_labels)
            fig = plot.one_to_one(model, test_labels, cov=test_cov)
            fig.savefig(f"{self.output_prefix}-one-to-one.png")
Example #9
def classify(pks, **kwargs):
    """
    Classify sources given the primary keys of task instances.

    :param pks:
        the primary keys of the task instances in the database that need classification
    """

    models = {}
    results = {}
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None: continue

        model_path = instance.parameters["model_path"]

        try:
            model, factory = models[model_path]
        except KeyError:
            network_factory = model_path.split("_")[-2]
            factory = getattr(networks, network_factory)

            log.info(f"Loading model from {model_path} using {factory}")
            model = utils.read_network(factory, model_path)
            model.eval()

            models[model_path] = (model, factory)

        flux = torch.from_numpy(spectrum.flux.value.astype(np.float32))

        with torch.no_grad():
            prediction = model.forward(flux)
            log_probs = prediction.cpu().numpy().flatten()

        results[instance.pk] = log_probs

    for pk, log_probs in tqdm(results.items(), desc="Writing results"):

        result = _prepare_log_prob_result(factory.class_names, log_probs)

        # Write the output to the database.
        create_task_output(pk, astradb.Classification, **result)
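The `_prepare_log_prob_result` helper is not shown here. If the network outputs are unnormalized log-probabilities, a numerically stable softmax recovers normalized class probabilities; a generic sketch (the helper name and output layout are assumptions):

import numpy as np

def log_probs_to_probabilities(log_probs):
    # Stable softmax: shift by the maximum before exponentiating, then normalize.
    shifted = np.asarray(log_probs) - np.max(log_probs)
    unnormalized = np.exp(shifted)
    return unnormalized / unnormalized.sum()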
Example #10
    def _init_progressbar(self, N, message=None):
        """
        Initialise a progressbar.

        :param N:
            The number of items that will be iterated over.
        
        :param message: [optional]
            An information message to log before showing the progressbar.
        """

        self.N = int(N)

        if self.N < 0:
            return
        
        try:
            rows, columns = os.popen('stty size', 'r').read().split()

        except Exception:
            log.debug("Couldn't get screen size. Progressbar may look odd.")
            self.W = 100

        else:
            self.W = min(100, int(columns) - (12 + 21 + 2 * len(str(self.N))))

        self.t_init = time()
        self.message = message
        if 0 >= self.N:
            return None

        if message is not None:
            log.info(message.rstrip())
        
        sys.stdout.flush()
        with _counter_lock:
            _counter.value = 0
Example #11
File: base.py Project: sdss/astra
    def execute(self, context):
        """
        Execute the operator.

        :param context:
            The Airflow DAG execution context.
        """

        if self.slurm_kwargs:

            # Serialize the primary keys.
            if len(self.pks) > 1:
                primary_key_path = serialize_pks_to_path(self.pks,
                                                         dir=get_scratch_dir())
                log.info(
                    f"Serialized {len(self.pks)} primary keys to {primary_key_path}. First 10 primary keys are {self.pks[:10]}"
                )

                # Store the primary key path, because we will clean up later.
                self._primary_key_path = primary_key_path

                bash_command = f"astra execute {primary_key_path}"
            else:
                bash_command = f"astra execute {self.pks[0]}"

            self.execute_by_slurm(context, bash_command)

        else:
            # This is essentially what "astra execute [PK]" does.
            function = string_to_callable(self.python_callable)

            result = function(self.pks, **(self.op_kwargs or dict()))
            log.info(
                f"Result from {function} with op kwargs {self.op_kwargs} was: {result}"
            )

        return self.pks
Example #12
def _estimate_stellar_labels(pk):

    # TODO: It would be great if these were stored with the network,
    #       instead of being hard-coded.
    label_names = ["teff", "logg", "vsini", "v_micro", "m_h"]
    # Translate:
    _t = {
        "teff": "T_eff",
        "logg": "log(g)",
        "m_h": "[M/H]",
        "vsini": "v*sin(i)",
    }

    # TODO: This implicitly assumes that the same constraints and network path are used by all the
    #       primary keys given. This is the usual case, but we should check this, and code around it.

    # TODO: This implementation requires knowing the observed spectrum before loading data.
    #       This is fine for ApStar objects since they all have the same dispersion sampling,
    #       but will not be fine for dispersion sampling that differs in each observation.

    # Let's peek ahead at the first valid spectrum we can find.
    instance, _, spectrum = next(prepare_data([pk]))
    if spectrum is None:
        # No valid spectrum.
        log.warning(
            f"Cannot build LSF for fitter because no spectrum found for primary key {pk}"
        )
        return None

    network = Network()
    network.read_in(instance.parameters["network_path"])

    constraints = json.loads(instance.parameters.get("constraints", "{}"))
    fitted_label_names = [
        ln for ln in label_names \
            if network.grid[_t.get(ln, ln)][0] != network.grid[_t.get(ln, ln)][1]
    ]
    L = len(fitted_label_names)

    bounds_unscaled = np.zeros((2, L))
    for i, ln in enumerate(fitted_label_names):
        bounds_unscaled[:, i] = constraints.get(ln, network.grid[_t.get(ln, ln)][:2])

    fit = Fit(network, int(instance.parameters["N_chebyshev"]))
    fit.bounds_unscaled = bounds_unscaled

    spectral_resolution = int(instance.parameters["spectral_resolution"])
    fit.lsf = LSF_Fixed_R(spectral_resolution, spectrum.wavelength.value,
                          network.wave)

    # Note the Stramut code uses inconsistent naming for "presearch", but in the operator interface we use
    # 'pre_search' in all situations. That's why there is some funny naming translation here.
    fit.N_presearch_iter = int(instance.parameters["N_pre_search_iter"])
    fit.N_pre_search = int(instance.parameters["N_pre_search"])

    fitter = UncertFit(fit, spectral_resolution)
    N, P = spectrum.flux.shape

    keys = []
    keys.extend(fitted_label_names)
    keys.extend([f"u_{ln}" for ln in fitted_label_names])
    keys.extend(["v_rad", "u_v_rad", "chi2", "theta"])

    result = {key: [] for key in keys}
    result["snr"] = spectrum.meta["snr"]

    model_fluxes = []
    log.info(f"Running ThePayne-Che on {N} spectra for {instance}")

    for i in range(N):

        flux = spectrum.flux.value[i]
        error = spectrum.uncertainty.array[0]**-0.5

        # TODO: No NaNs/infs are allowed, but it doesn't seem like that was an issue for Stramut's code.
        #       Possibly due to different versions of scipy. In any case, raise this as a potential bug,
        #       since the errors do not always seem to be believed by ThePayne-Che.
        bad = (~np.isfinite(flux)) | (error <= 0)
        flux[bad] = 0
        error[bad] = 1e10

        fit_result = fitter.run(
            spectrum.wavelength.value,
            flux,
            error,
        )

        # The `popt` attribute is length: len(label_names) + 1 (for radial velocity) + N_chebyshev

        # Relevant attributes are:
        # - fit_result.popt
        # - fit_result.uncert
        # - fit_result.RV_uncert
        # - fit_result.model

        for j, label_name in enumerate(fitted_label_names):
            result[label_name].append(fit_result.popt[j])
            result[f"u_{label_name}"].append(fit_result.uncert[j])

        result["theta"].append(fit_result.popt[L + 1:].tolist())
        result["chi2"].append(fit_result.chi2_func(fit_result.popt))
        result["v_rad"].append(fit_result.popt[L])
        result["u_v_rad"].append(fit_result.RV_uncert)

        model_fluxes.append(fit_result.model)

    # Write database result.
    create_task_output(instance, astradb.ThePayneChe, **result)

    # TODO: Write AstraSource object here.
    return None
Example #13
def estimate_stellar_labels(pks,
                            default_num_uncertainty_draws=100,
                            default_large_error=1e10):
    """
    Estimate the stellar parameters for APOGEE ApStar observations,
    where task instances have been created with the given primary keys (`pks`).

    :param pks:
        The primary keys of task instances that include information about what
        ApStar observation to load.
         
    :param default_num_uncertainty_draws: [optional]
        The number of random draws to make of the flux uncertainties, which will be
        propagated into the estimate of the stellar parameter uncertainties (default: 100).
    
    :param default_large_error: [optional]
        An arbitrarily large error value to assign to bad pixels (default: 1e10).
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    log.info(f"Running APOGEENet on device {device} with:")
    log.info(f"\tpks: {pks}")

    log.debug(
        f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")

    log.debug(f"Using torch version {torch.__version__} in {torch.__path__}")

    models = {}

    pks = deserialize_pks(pks, flatten=True)
    total = len(pks)

    log.info(f"There are {total} primary keys to process: {pks}")

    for instance, path, spectrum in tqdm(prepare_data(pks), total=total):
        if spectrum is None: continue

        model_path = instance.parameters["model_path"]

        # Load the model.
        try:
            model = models[model_path]
        except KeyError:
            log.info(f"Loading model from {model_path}")
            models[model_path] = model = Model(model_path, device)

        N, P = spectrum.flux.shape

        # Build metadata array.
        metadata_keys, metadata, metadata_norm = get_metadata(spectrum)

        flux = np.nan_to_num(spectrum.flux.value).astype(np.float32).reshape(
            (N, 1, P))
        meta = np.tile(metadata_norm, N).reshape((N, -1))

        flux = torch.from_numpy(flux).to(device)
        meta = torch.from_numpy(meta).to(device)

        with torch.set_grad_enabled(False):
            predictions = model.predict_spectra(flux, meta)
            if device != "cpu":
                predictions = predictions.cpu().data.numpy()

        # Replace non-finite predictions with NaN.
        predictions[~np.isfinite(predictions)] = np.nan

        # Create results array.
        log_g, log_teff, fe_h = predictions.T
        teff = 10**log_teff
        result = dict(
            snr=spectrum.meta["snr"],
            teff=teff.tolist(),
            logg=log_g.tolist(),
            fe_h=fe_h.tolist(),
        )

        num_uncertainty_draws = int(
            instance.parameters.get("num_uncertainty_draws",
                                    default_num_uncertainty_draws))

        if num_uncertainty_draws > 0:
            large_error = float(
                instance.parameters.get("large_error", default_large_error))

            flux_error = np.nan_to_num(
                spectrum.uncertainty.array**-0.5).astype(np.float32).reshape(
                    (N, 1, P))
            median_error = 5 * np.median(flux_error, axis=(1, 2))

            for j, value in enumerate(median_error):
                bad_pixel = (flux_error[j] == large_error) | (flux_error[j] >= value)
                flux_error[j][bad_pixel] = value

            flux_error = torch.from_numpy(flux_error).to(device)

            inputs = torch.randn((num_uncertainty_draws, N, 1, P),
                                 device=device) * flux_error + flux
            inputs = inputs.reshape((num_uncertainty_draws * N, 1, P))

            meta_error = meta.repeat(num_uncertainty_draws, 1)
            with torch.set_grad_enabled(False):
                draws = model.predict_spectra(inputs, meta_error)
                if device != "cpu":
                    draws = draws.cpu().data.numpy()

            draws = draws.reshape((num_uncertainty_draws, N, -1))

            # Convert the log(teff) draws to teff before computing the standard deviation.
            draws[:, :, 1] = 10**draws[:, :, 1]

            median_draw_predictions = np.nanmedian(draws, axis=0)
            std_draw_predictions = np.nanstd(draws, axis=0)

            log_g_median, teff_median, fe_h_median = median_draw_predictions.T
            log_g_std, teff_std, fe_h_std = std_draw_predictions.T

            result.update(_teff_median=teff_median.tolist(),
                          _logg_median=log_g_median.tolist(),
                          _fe_h_median=fe_h_median.tolist(),
                          u_teff=teff_std.tolist(),
                          u_logg=log_g_std.tolist(),
                          u_fe_h=fe_h_std.tolist())

        else:
            median_draw_predictions, std_draw_predictions = (None, None)

        # Add the bitmask flag.
        bitmask_flag = create_bitmask(
            predictions,
            median_draw_predictions=median_draw_predictions,
            std_draw_predictions=std_draw_predictions)

        result.update(bitmask_flag=bitmask_flag.tolist())

        # Write the result to database.
        create_task_output(instance, astradb.ApogeeNet, **result)

    log.info(f"Completed processing of {total} primary keys")
Example #14
File: base.py Project: sdss/astra
    def execute_by_slurm(self,
                         context,
                         bash_command,
                         directory=None,
                         poke_interval=60):

        uid = str(uuid.uuid4())[:8]
        label = ".".join([
            context["dag"].dag_id,
            context["task"].task_id,
            context["execution_date"].strftime('%Y-%m-%d'),
            # run_id is None if triggered by command line
            uid
        ])
        if len(label) > 64:
            log.warning(
                f"Truncating Slurm label ({label}) to 64 characters: {label[:64]}"
            )
            label = label[:64]

        self._slurm_label = label

        # It's bad practice to import here, but the slurm package is
        # not easily installable outside of Utah, and is not a "must-have"
        # requirement.
        from slurm import queue

        # TODO: HACK to be able to use local astra installation while in development
        if bash_command.startswith("astra "):
            bash_command = f"/uufs/chpc.utah.edu/common/home/u6020307/.local/bin/astra {bash_command[6:]}"

        slurm_kwargs = (self.slurm_kwargs or dict())

        log.info(
            f"Submitting Slurm job {label} with command:\n\t{bash_command}\nAnd Slurm keyword arguments: {slurm_kwargs}"
        )
        q = queue(verbose=True)
        q.create(label=label, dir=directory, **slurm_kwargs)
        q.append(bash_command)
        try:
            q.commit(hard=True, submit=True)
        except CalledProcessError as e:
            log.exception(
                f"Exception occurred when committing Slurm job with output:\n{e.output}"
            )
            raise

        log.info(
            f"Slurm job submitted with {q.key} and keywords {slurm_kwargs}")
        log.info(f"\tJob directory: {directory or q.job_dir}")

        stdout_path = os.path.join(directory or q.job_dir, f"{label}_01.o")
        stderr_path = os.path.join(directory or q.job_dir, f"{label}_01.e")

        # Now we wait until the Slurm job is complete.
        t_submitted, t_started = (time(), None)
        while 100 > q.get_percent_complete():

            sleep(poke_interval)

            t = time() - t_submitted

            if not os.path.exists(stderr_path) and not os.path.exists(
                    stdout_path):
                log.info(
                    f"Waiting on job {q.key} to start (elapsed: {t / 60:.0f} min)"
                )

            else:
                # Check if this is the first time it has started.
                if t_started is None:
                    t_started = time()
                    log.debug(
                        f"Recording job {q.key} as starting at {t_started} (took {t / 60:.0f} min to start)"
                    )

                log.info(
                    f"Waiting on job {q.key} to finish (elapsed: {t / 60:.0f} min)"
                )
                # Open last line of stdout path?

                # If this has been going much longer than the walltime, then something went wrong.
                # TODO: Check on the status of the job from Slurm.

        log.info(
            f"Job {q.key} in {q.job_dir} is complete after {(time() - t_submitted)/60:.0f} minutes."
        )

        with open(stderr_path, "r", newline="\n") as fp:
            stderr = fp.read()
        log.info(f"Contents of {stderr_path}:\n{stderr}")

        with open(stdout_path, "r", newline="\n") as fp:
            stdout = fp.read()
        log.info(f"Contents of {stdout_path}:\n{stdout}")

        # TODO: Better parsing for critical errors.
        if "Error" in stdout.rstrip().split("\n")[-1] \
        or "Error" in stderr.rstrip().split("\n")[-1]:
            raise RuntimeError("detected exception at task end-point")

        # TODO: Get exit codes from squeue

        return None
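The TODO about jobs running much longer than their walltime could be handled by a simple guard inside the polling loop; a minimal sketch, assuming the requested walltime is known in seconds (the `walltime` argument is an assumption, and a real check should query Slurm for the job state):

from time import time

def check_walltime(t_submitted, walltime, grace=600):
    # Raise if we have been polling for much longer than the requested walltime plus a grace period.
    elapsed = time() - t_submitted
    if elapsed > walltime + grace:
        raise RuntimeError(
            f"Job exceeded walltime: {elapsed:.0f} s elapsed > {walltime} + {grace} s"
        )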
Example #15
def get_best_result(task, ti, **kwargs):
    """
    When there are numerous FERRE tasks that are upstream, this
    function will return the primary keys of the task instances that gave
    the best result on a per-observation basis.
    """

    # Get the PKs from upstream.
    pks = []
    log.debug(f"Upstream tasks: {task.upstream_list}")
    for upstream_task in task.upstream_list:
        pks.append(ti.xcom_pull(task_ids=upstream_task.task_id))

    pks = flatten(pks)
    log.debug(f"Getting best initial guess among primary keys {pks}")

    # Need to uniquely identify observations.
    param_bit_mask = bitmask.ParamBitMask()
    bad_grid_edge = (param_bit_mask.get_value("GRIDEDGE_WARN") | param_bit_mask.get_value("GRIDEDGE_BAD"))

    trees = {}
    best_tasks = {}
    for i, pk in enumerate(pks):
        q = session.query(astradb.TaskInstance).filter(astradb.TaskInstance.pk==pk)
        instance = q.one_or_none()

        if instance.output is None:
            log.warning(f"No output found for task instance {instance}")
            continue

        p = instance.parameters

        # Check that the telescope is the same as what we expect from this task ID.
        # This is a bit of a hack. Let us explain.

        # The "BA" grid does not have a telescope/fiber model, so you can run LCO and APO
        # data through the initial-BA grid. And those outputs go to the "get_best_results"
        # for each of the APO and LCO tasks (e.g., this function).
        # If there is only APO data, then the LCO "get_best_result" will only have one
        # input: the BA results. Then it will erroneously think that's the best result
        # for that source.

        # It's hacky to put this logic in here. It should be in the DAG instead. Same
        # thing for parsing 'telescope' name in the DAG (eg 'APO') from 'apo25m'.
        this_telescope_short_name = p["telescope"][:3].upper()
        expected_telescope_short_name = task.task_id.split(".")[1]
        log.info(f"For instance {instance} we have {this_telescope_short_name} and {expected_telescope_short_name}")
        if this_telescope_short_name != expected_telescope_short_name:
            continue

        try:
            tree = trees[p["release"]]                
        except KeyError:
            tree = trees[p["release"]] = SDSSPath(release=p["release"])
        
        key = "_".join([
            p['release'],
            p['filetype'],
            *[p[k] for k in tree.lookup_keys(p['filetype'])]
        ])
        
        best_tasks.setdefault(key, (np.inf, None))
        
        # TODO: Confirm that this is base10 log. This should also be 'log_reduced_chisq_fit',
        #       according to the documentation.
        log_chisq_fit, *_ = instance.output.log_chisq_fit
        previous_teff, *_ = instance.output.teff
        bitmask_flag, *_ = instance.output.bitmask_flag
        
        log.debug(f"Result {instance} {instance.output} with log_chisq_fit = {log_chisq_fit} and {previous_teff} and {bitmask_flag}")
        
        # Note: If FERRE totally fails then it will assign -999 values to the log_chisq_fit. So we have to
        #       check that the log_chisq_fit is actually sensible!
        #       (Or we should only query task instances where the output is sensible!)
        if log_chisq_fit < 0: # TODO: This is a f*****g hack.
            log.debug(f"Skipping result for {instance} {instance.output} as log_chisq_fit = {log_chisq_fit}")
            continue
            
        parsed_header = utils.parse_header_path(p["header_path"])
        
        # Penalise chi-sq in the same way they did for DR17.
        # See github.com/sdss/apogee/python/apogee/aspcap/aspcap.py#L658
        if parsed_header["spectral_type"] == "GK" and previous_teff < 3900:
            log.debug(f"Increasing \chisq because spectral type GK")
            log_chisq_fit += np.log10(10)

        bitmask_flag_logg, bitmask_flag_teff = bitmask_flag[-2:]
        if bitmask_flag_logg & bad_grid_edge:
            log.debug(f"Increasing \chisq because logg flag is bad edge")
            log_chisq_fit += np.log10(5)
            
        if bitmask_flag_teff & bad_grid_edge:
            log.debug(f"Increasing \chisq because teff flag is bad edge")
            log_chisq_fit += np.log10(5)
        
        # Is this the best so far?
        if log_chisq_fit < best_tasks[key][0]:
            log.debug(f"Assigning this output to best task as {log_chisq_fit} < {best_tasks[key][0]}: {pk}")
            best_tasks[key] = (log_chisq_fit, pk)
    
    for key, (log_chisq_fit, pk) in best_tasks.items():
        if pk is None:
            log.warning(f"No good task found for key {key}: ({log_chisq_fit}, {pk})")
        else:
            log.info(f"Best task for key {key} with \chi^2 of {log_chisq_fit:.2f} is primary key {pk}")

    if best_tasks:
        return [pk for (log_chisq_fit, pk) in best_tasks.values() if pk is not None]
    else:
        raise AirflowSkipException(f"no task outputs found from {len(pks)} primary keys")
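The penalty scheme above (a factor of 10 in chi^2 for cool stars in the GK grid and 5 per bad grid-edge flag) could be factored into a small helper; a sketch that mirrors the logic in this function:

import numpy as np

def penalized_log_chisq(log_chisq_fit, spectral_type, teff,
                        bitmask_flag_logg, bitmask_flag_teff, bad_grid_edge):
    # Apply the DR17-style chi^2 penalties so that cool GK-grid results and
    # grid-edge warnings/failures are less preferred when picking the best task.
    if spectral_type == "GK" and teff < 3900:
        log_chisq_fit += np.log10(10)
    if bitmask_flag_logg & bad_grid_edge:
        log_chisq_fit += np.log10(5)
    if bitmask_flag_teff & bad_grid_edge:
        log_chisq_fit += np.log10(5)
    return log_chisq_fit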
Example #16
def estimate_radial_velocity(pks,
                             verbose=True,
                             mcmc=False,
                             figfile=None,
                             cornername=None,
                             retpmodels=False,
                             plot=False,
                             tweak=True,
                             usepeak=False,
                             maxvel=[-1000, 1000]):
    """
    Estimate radial velocities for the sources that are identified by the task instances
    of the given primary keys.

    :param pks:
        The primary keys of task instances to estimate radial velocities for, which includes
        parameters to identify the source SDSS data model product.

    See `doppler.rv.fit` for more information on other keyword arguments.
    """

    # TODO: Move this to astra/contrib
    import doppler

    log.info(f"Estimating radial velocities for {len(pks)} task instances")

    failures = []
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None: continue

        log.debug(f"Running Doppler on {instance} from {path}")

        try:
            spectrum = doppler.read(path)
            summary, model_spectrum, modified_input_spectrum = doppler.rv.fit(
                spectrum,
                verbose=verbose,
                mcmc=mcmc,
                figfile=figfile,
                cornername=cornername,
                retpmodels=retpmodels,
                plot=plot,
                tweak=tweak,
                usepeak=usepeak,
                maxvel=maxvel)

        except Exception as exception:
            log.exception(
                f"Exception occurred on Doppler on {path} with task instance {instance}"
            )
            failures.append(instance.pk)
            last_exception = exception
            continue

        else:
            # Write the output to the database.
            results = prepare_results(summary)

            create_task_output(instance, astradb.Doppler, **results)

    if len(failures) > 0:
        log.warning(
            f"There were {len(failures)} Doppler failures out of a total {len(pks)} executions."
        )
        log.warning(f"Failed primary keys include: {failures}")

        log.warning(f"Raising last exception to indicate failure in pipeline.")
        raise
Example #17
def train_polynomial_model(labels, data, order=2, regularization=0, threads=1):

    log.debug(f'Inputs are: ({type(labels)}) {labels}')
    log.debug(f'Data are: {data}')
    # labels could be in JSON format.
    if isinstance(labels, str):
        labels = json.loads(labels.replace("'", '"'))
        # TODO: use a general deserializer that fixes the single quote issues with json loading

    if isinstance(data, str) and os.path.exists(data):
        with open(data, "rb") as fp:
            data = pickle.load(fp)

    for key in ("dispersion", "wavelength"):
        try:
            dispersion = data[key]
        except KeyError:
            continue
        else:
            break
    else:
        raise ValueError("unable to find 'dispersion' or 'wavelength' in data")

    training_set_flux = data["normalized_flux"]
    training_set_ivar = data["normalized_ivar"]

    try:
        num_spectra = data["num_spectra"]
    except KeyError:
        log.debug(
            f"Keeping all items in training set; not checking for missing spectra."
        )
    else:
        keep = (num_spectra == 1)
        if not all(keep):
            log.warning(
                f"Excluding {sum(~keep)} objects from the training set that had missing spectra"
            )

            labels = {k: np.array(v)[keep] for k, v in labels.items()}
            training_set_flux = training_set_flux[keep]
            training_set_ivar = training_set_ivar[keep]

    # Set the vectorizer.
    vectorizer = tc.vectorizer.PolynomialVectorizer(
        labels.keys(),
        order=order,
    )

    # Initiate model.
    model = tc.model.CannonModel(labels,
                                 training_set_flux,
                                 training_set_ivar,
                                 vectorizer=vectorizer,
                                 dispersion=dispersion,
                                 regularization=regularization)

    model.train(threads=threads)

    output_path = os.path.join(get_base_output_path(), "thecannon",
                               "model.pkl")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    log.info(f"Writing The Cannon model {model} to disk {output_path}")
    model.write(output_path, include_training_set_spectra=True, overwrite=True)
    return output_path
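The TODO about single-quoted JSON could be addressed with `ast.literal_eval`, which accepts Python-style quoting; a minimal sketch of such a deserializer (a hypothetical helper, not part of the snippet above):

import ast
import json

def deserialize_labels(labels):
    # Accept a dict, a JSON string, or a Python-literal string with single quotes.
    if isinstance(labels, str):
        try:
            return json.loads(labels)
        except json.JSONDecodeError:
            return ast.literal_eval(labels)
    return labels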
Example #18
File: core.py Project: sdss/astra
def parse_ferre_outputs(directory, header_path, wavelength, flux, sigma, mask, names, initial_parameters, kwds, meta,
    clean_up_on_exit=False,
    raise_exception_on_bad_outputs=False):

    # Get processing times.
    #processing_times = utils.get_processing_times(stdout, kwds["nthreads"])

    # Parse parameter outputs and uncertainties.
    try:
        output_names, param, param_err, output_meta = utils.read_output_parameter_file(
            os.path.join(directory, kwds["opfile"]),
            n_dimensions=kwds["ndim"],
            full_covariance=kwds["covprint"]
        )
    except:
        log.exception(f"Failed to load FERRE output parameter file at {os.path.join(directory, kwds['opfile'])}")
        raise
    
    # Parse flux outputs.
    try:
        model_flux = np.nan * np.ones_like(flux)
        model_flux[:, mask] = np.loadtxt(os.path.join(directory, kwds["offile"]))
    except:
        log.exception(f"Failed to load model flux from {os.path.join(directory, kwds['offile'])}:")
        raise

    if kwds.get("cont", None) is None:
        continuum = np.ones_like(model_flux)
    else:
        # Infer continuum.
        normalized_flux = np.nan * np.ones_like(flux)
        normalized_flux[:, mask] = np.loadtxt(os.path.join(directory, kwds["sffile"]))
        continuum = flux / normalized_flux

    meta.update(
        mask=mask,
        wavelength=wavelength,
        flux=flux,
        sigma=sigma,
        normalized_model_flux=model_flux,
        continuum=continuum
    )
    
    # Flag things.
    P, L = param.shape

    param_bitmask = bitmask.ParamBitMask()
    bitmask_flag = np.zeros((P, L), dtype=np.int64)
    
    grid_headers, *segment_headers = utils.read_ferre_headers(utils.expand_path(header_path))
    bad_lower = (grid_headers["LLIMITS"] + grid_headers["STEPS"]/8)
    bad_upper = (grid_headers["ULIMITS"] - grid_headers["STEPS"]/8)
    bitmask_flag[(param < bad_lower) | (param > bad_upper)] |= param_bitmask.get_value("GRIDEDGE_BAD")

    warn_lower = (grid_headers["LLIMITS"] + grid_headers["STEPS"])
    warn_upper = (grid_headers["ULIMITS"] - grid_headers["STEPS"])
    bitmask_flag[(param < warn_lower) | (param > warn_upper)] |= param_bitmask.get_value("GRIDEDGE_WARN")

    bitmask_flag[(param == -999) | (param_err < -0.01)] |= param_bitmask.get_value("FERRE_FAIL")

    # Check for any erroneous outputs
    if raise_exception_on_bad_outputs and np.any(bitmask_flag & param_bitmask.get_value("FERRE_FAIL")):
        v = bitmask_flag & param_bitmask.get_value("FERRE_FAIL")
        idx = np.where(np.any(bitmask_flag & param_bitmask.get_value("FERRE_FAIL"), axis=1))
        
        raise ValueError(f"FERRE returned all erroneous values for an entry: {idx} {v}")

    # Include processing times and bitmask etc.
    meta.update(
        bitmask_flag=bitmask_flag.tolist(), # .tolist() for postgresql encoding.
        #processing_times=processing_times,
        **output_meta
    )

    # Log the parsed parameter names and outputs for inspection.
    log.debug(f"input names: {names}")
    log.debug(f"output_names: {output_names}")
    log.debug(f"param: {param}")
    log.debug(f"param_err: {param_err}")
    log.debug(f"meta: {meta}")
    log.debug(f"bitmask_flag: {bitmask_flag}")

    # Parse elapsed time.
    
    #print(f"times {processing_times}")

    if clean_up_on_exit:
        log.info(f"Removing directory {directory} and its contents")
        rmtree(directory)
    else:
        log.info(f"Leaving directory {directory} and its contents as clean_up_on_exit = {clean_up_on_exit}")

    return (param, param_err, meta)
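The flags above are combined bitwise, so a single integer per parameter can record several conditions at once; a toy illustration of the same pattern (the bit values here are placeholders, the real ones come from `bitmask.ParamBitMask`):

import numpy as np

# Placeholder bit values standing in for ParamBitMask.
GRIDEDGE_WARN, GRIDEDGE_BAD, FERRE_FAIL = 1, 2, 4

param = np.array([[5000.0, -999.0]])
warn_lower = np.array([4800.0, 0.5])
bad_lower = np.array([4600.0, 0.2])

flag = np.zeros(param.shape, dtype=np.int64)
flag[param < bad_lower] |= GRIDEDGE_BAD
flag[param < warn_lower] |= GRIDEDGE_WARN
flag[param == -999] |= FERRE_FAIL
# The -999 entry ends up with GRIDEDGE_WARN | GRIDEDGE_BAD | FERRE_FAIL = 7.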
Example #19
File: core.py Project: sdss/astra
def ferre(
        wavelength,
        flux,
        sigma,
        header_path,
        names=None,
        initial_parameters=None,
        frozen_parameters=None,
        interpolation_order=3,
        input_weights_path=None,
        input_lsf_shape_path=None,
        lsf_shape_flag=0,
        error_algorithm_flag=1,
        wavelength_interpolation_flag=0,
        optimization_algorithm_flag=3,
        continuum_flag=1,
        continuum_order=4,
        continuum_segment=None,
        continuum_reject=0.3,
        continuum_observations_flag=1,
        full_covariance=False,
        pca_project=False,
        pca_chi=False,
        n_threads=32,
        f_access=None,
        f_format=1,
        ferre_kwargs=None,
        directory=None,
        clean_up_on_exit=False,
        raise_exception_on_bad_outputs=False,
        **kwargs
    ):
    """
    Run FERRE on the given observations and return the parsed outputs.
    
    :param wavelength:
        An array of wavelength values for the observations. This should be one of:

        - a 1D array of shape `P` where P is the number of pixels, if all spectra are
          on the same wavelength grid
        - an array of shape `(N, P)` where `N` is the number of observations and `P` 
          is the number of pixels, if all spectra have the same number of pixels
        - a list of `N` arrays, where each array contains the number of pixels in 
          that observation
        
    :param flux:
        The observed flux values. This should be one of:

        - an array of shape `(N, P)` where `N` is the number of observations and `P`
          is the number of pixels, if all spectra have the same number of pixels
        - a list of `N` arrays, where each array has a size of the number of pixels in
          that observation.
        
    :param sigma:
        The uncertainty in the observed flux values. This should be one of:

        - an array of shape `(N, P)` where `N` is the number of observations and `P`
          is the number of pixels, if all spectra have the same number of pixels
        - a list of `N` arrays, where each array has a size of the number of pixels in
          that observation
        
    :param header_path:
        The path of the FERRE header file.
        
    :param initial_parameters: [optional]
        The initial parameters to start from. If `None` is given then this will revert
        to the mid-point of the grid for all observations. This should be an array of
        shape `(N, L)` where `N` is the number of observations and `L` is the number
        of dimensions in the FERRE grid supplied.

    :param frozen_parameters: [optional]
        A dictionary with parameter names (as per the header file) as keys, and either
        a boolean flag or a float as value. If boolean `True` is given for a parameter,
        then the value will be fixed at the initial value per spectrum. If a float is
        given then this value will supercede all initial values given, fixing the
        dimension for all input spectra regardless of the initial value.

    :param interpolation_order: [optional]
        Order of interpolation to use (default: 3).
        This corresponds to the FERRE keyword `inter`.

        0. nearest neighbour
        1. linear
        2. quadratic Bezier
        3. cubic Bezier
        4. cubic splines

    :param input_weights_path: [optional]
        The location of a weight (or mask) file to apply to the pixels. This corresponds
        to the FERRE keyword `filterfile`.
    
    :param input_lsf_shape_path: [optional]
        The location of a file describing the line spread function to apply to
        the observations. This keyword is ignored if `lsf_shape_flag` is 0.
        This corresponds to the FERRE keyword `lsffile`.
    
    :param lsf_shape_flag: [optional]
        A flag indicating what line spread convolution to perform. This should be one of:

        0. no LSF convolution (default)
        1. 1D (independent of wavelength), one and the same for all spectra
        2. 2D (a function of wavelength), one and the same for all
        3. 1D and Gaussian  (i.e. described by a single parameter, its width), one for all objects
        4. 2D and Gaussian, one for all objects
        11. 1D and particular for each spectrum
        12. 2D and particular for each spectrum
        13. 1D Gaussian, but particular for each spectrum
        14. 2D Gaussian and particular for each object.

        If `lsf_shape_flag` is anything but 0, then an `input_lsf_shape_path` keyword argument
        will also be required, pointing to the location of the LSF file.

    :param error_algorithm_flag: [optional]
        Choice of algorithm to compute error bars (default: 1, as per FERRE).
        This corresponds to the FERRE keyword `errbar`.

        0. To adopt the distance from the solution at which $\chi^2$ = min($\chi^2$) + 1
        1. To invert the curvature matrix
        2. Perform numerical experiments injecting noise into the data

    :param wavelength_interpolation_flag: [optional]
        Flag to indicate what to do about wavelength interpolation (default: 0).
        This is not usually needed as the FERRE grids are computed on the resampled
        APOGEE grid. This corresponds to the FERRE keyword `winter`.

        0. No interpolation.
        1. Interpolate observations.
        2. The FERRE documentation says 'Interpolate fluxes', but it is not clear to the
           writer how that is any different from Option 1.

    :param optimization_algorithm_flag: [optional]
        Integer flag to indicate which optimization algorithm to use:

        1. Nelder-Mead
        2. Boender-Timmer-Rinnoy Kan
        3. Powell's truncated Newton method
        4. Nash's truncated Newton method

    :param continuum_flag: [optional]
        Choice of algorithm to use for continuum fitting (default: 1).
        This corresponds to the FERRE keyword `cont`, and is related to the
        FERRE keywords `ncont` and `rejectcont`.

        If `None` is supplied then no continuum keywords will be given to FERRE.

        1. Polynomial fitting using an iterative sigma clipping algorithm (set by
           `continuum_order` and `continuum_reject` keywords).
        2. Segmented normalization, where the data are split into `continuum_segment`
           segments, and the values in each are divided by their mean values.
        3. The input data are divided by a running mean computed with a window of
           `continuum_segment` pixels.
    
    :param continuum_order: [optional]
        The order of polynomial fitting to use, if `continuum_flag` is 1.
        This corresponds to the FERRE keyword `ncont`, if `continuum_flag` is 1.
        If `continuum_flag` is not 1, this keyword argument is ignored.
    
    :param continuum_segment: [optional]
        Either the number of segments to split the data into for performing normalization,
        (e.g., when `continuum_flag` = 2), or the window size to use when `continuum_flag`
        = 3. This corresponds to the FERRE keyword `ncont` if `continuum_flag` is 2 or 3.
        If `continuum_flag` is not 2 or 3, this keyword argument is ignored.

    :param continuum_reject: [optional]
        When using polynomial fitting with an iterative sigma clipping algorithm
        (`continuum_flag` = 1), this sets the relative error where data points will be
        excluded. Any data points with relative errors larger than `continuum_reject`
        will be excluded. This corresponds to the FERRE keyword `rejectcont`.
        If `continuum_flag` is not 1, this keyword argument is ignored.
    
    :param continuum_observations_flag: [optional]
        This corresponds to the FERRE keyword `obscont`. Nothing is written down in the
        FERRE documentation about this keyword.
    
    :param full_covariance: [optional]
        Return the full covariance matrix from FERRE (default: False).
        This corresponds to the FERRE keyword `covprint`.
    
    :param pca_project: [optional]
        Use Principal Component Analysis to compress the spectra (default: False).
        This corresponds to the FERRE keyword `pcaproject`.
    
    :param pca_chi: [optional]
        Use Principal Component Analysis to compress the spectra when calculating the
        $\chi^2$ statistic. This corresponds to the FERRE keyword `pcachi`.
    
    :param n_threads: [optional]
        The number of threads to use for FERRE. This corresponds to the FERRE keyword
        `nthreads`.

    :param f_access: [optional]
        If `False`, load the entire grid into memory. If `True`, run the interpolation 
        without loading the entire grid into memory -- this is useful for small numbers 
        of interpolation. If `None` (default), automatically determine which is faster.
        This corresponds to the FERRE keyword `f_access`.
    
    :param f_format: [optional]
        File format of the FERRE grid: 0 (ASCII) or 1 (UNF format, default).
        This corresponds to the FERRE keyword `f_format`.

    :param ferre_kwargs: [optional]
        A dictionary of options to apply directly to FERRE, which will over-ride other
        settings supplied here, so use with caution.
    """

    # Create the temporary directory, if necessary.
    if directory is None:
        directory = mkdtemp(**kwargs.get("directory_kwds", {}))
        log.info(f"Created temporary directory {directory}")

    os.makedirs(directory, exist_ok=True)

    # Create a dictionary of all input keywords.
    input_kwds = {}
    for arg in getfullargspec(ferre).args:
        input_kwds[arg] = locals()[arg]

    wavelength, flux, sigma, mask, names, initial_parameters, kwds, meta = prepare_ferre(directory, input_kwds)

    execute_args = (directory, len(flux), kwds["offile"])

    # Note: `slurm_kwds` is not an explicit argument of this function; assume it is passed through **kwargs.
    slurm_kwds = kwargs.get("slurm_kwds", None)
    if slurm_kwds:
        stdout, stderr = _execute_ferre_by_slurm(*execute_args, **slurm_kwds)
    else:
        stdout, stderr = _execute_ferre_by_subprocess(*execute_args)

    return parse_ferre_outputs(
        directory, 
        header_path,
        wavelength,
        flux,
        sigma,
        mask,
        names,
        initial_parameters,
        kwds,
        meta,
        clean_up_on_exit=clean_up_on_exit,
        raise_exception_on_bad_outputs=raise_exception_on_bad_outputs
    )
Example #20
def _select_training_set_data_from_database(label_columns,
                                            filter_args=None,
                                            filter_func=None,
                                            limit=None,
                                            **kwargs):
    label_columns = list(label_columns)
    label_names = [column.key for column in label_columns]
    L = len(label_names)

    if filter_func is None:
        filter_func = lambda *_, **__: True

    # Get the label names.
    log.info(f"Querying for label names {label_names} from {label_columns}")

    # Figure out what other columns we will need to identify the input file.
    for column in label_columns:
        try:
            primary_parent = column.class_
        except AttributeError:
            continue
        else:
            break
    else:
        raise ValueError(
            "Can't get primary parent. Are you labelling every column?")

    log.debug(f"Identified primary parent table as {primary_parent}")

    if primary_parent == catalogdb.SDSSApogeeAllStarMergeR13:

        log.debug(
            f"Adding columns and setting data_model_func for {primary_parent}")
        additional_columns = [
            catalogdb.SDSSDR16ApogeeStar.apstar_version.label("apstar"),
            catalogdb.SDSSDR16ApogeeStar.field,
            catalogdb.SDSSDR16ApogeeStar.apogee_id.label("obj"),
            catalogdb.SDSSDR16ApogeeStar.file,
            catalogdb.SDSSDR16ApogeeStar.telescope,

            # Things that we might want for filtering on.
            catalogdb.SDSSDR16ApogeeStar.snr
        ]

        columns = label_columns + additional_columns

        q = session.query(*columns).join(
            catalogdb.SDSSApogeeAllStarMergeR13,
            func.trim(catalogdb.SDSSApogeeAllStarMergeR13.apstar_ids) ==
            catalogdb.SDSSDR16ApogeeStar.apstar_id)

        data_model_func = lambda apstar, field, obj, filename, telescope, *_: {
            "release": "DR16",
            "filetype": "apStar",
            "apstar": apstar,
            "field": field,
            "obj": obj,
            "prefix": filename[:2],
            "telescope": telescope,
            "apred": filename.split("-")[1]
        }

    else:
        raise NotImplementedError(
            f"Cannot intelligently figure out what data model keywords will be necessary."
        )

    if filter_args is not None:
        q = q.filter(*filter_args)

    if limit is not None:
        q = q.limit(limit)

    log.debug(f"Querying {q}")

    data_model_identifiers = []
    labels = {label_name: [] for label_name in label_names}
    for i, row in enumerate(tqdm(q.yield_per(1), total=q.count())):
        if not filter_func(*row): continue

        for label_name, value in zip(label_names, row[:L]):
            if value is None or not np.isfinite(value):
                log.warning(
                    f"Label {label_name} in row {i} is not finite: {value}!")
            labels[label_name].append(value)
        data_model_identifiers.append(data_model_func(*row[L:]))

    return (labels, data_model_identifiers)
Example #21
def estimate_stellar_labels(pks, **kwargs):
    """
    Estimate stellar labels given a single-layer neural network.

    :param pks:
        The primary keys of task instances to estimate stellar labels for. The
        task instances include information to identify the source SDSS data product.
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    log.info(f"Running ThePayne on device {device} with:")
    log.info(
        f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")
    log.info(f"Using torch version {torch.__version__} in {torch.__path__}")

    states = {}

    log.info(f"Estimating stellar labels for task instances")

    results = {}
    for instance, path, spectrum in prepare_data(pks):
        if spectrum is None: continue

        model_path = instance.parameters["model_path"]
        try:
            state = states[model_path]
        except KeyError:
            log.info(f"Loading model from {model_path}")
            state = states[model_path] = test.load_state(model_path)

            label_names = state["label_names"]
            L = len(label_names)
            log.info(f"Estimating these {L} label names: {label_names}")

        # Run optimization.
        t_init = time()
        p_opt, p_cov, model_flux, meta = test.test(spectrum.wavelength.value,
                                                   spectrum.flux.value,
                                                   spectrum.uncertainty.array,
                                                   **state)
        t_opt = time() - t_init

        #log.debug(f"spectrum shape: {spectrum.flux.shape}")
        #log.debug(f"p_opt shape: {p_opt.shape}")
        #log.debug(f"spectrum meta: {spectrum.meta['snr']}")

        # Prepare outputs.
        result = dict(zip(label_names, p_opt.T))
        result.update(snr=spectrum.meta["snr"])
        # Include uncertainties.
        result.update(
            dict(
                zip((f"u_{ln}" for ln in label_names),
                    np.sqrt(p_cov[:,
                                  np.arange(p_opt.shape[1]),
                                  np.arange(p_opt.shape[1])].T))))

        results[instance.pk] = result
        log.info(f"Result for {instance} took {t_opt} seconds")

    # Write database outputs.
    for pk, result in tqdm(results.items(), desc="Writing database outputs"):
        # Write database outputs.
        create_task_output(pk, astradb.ThePayne, **result)

    return None
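The fancy indexing on `p_cov` above pulls the diagonal out of each per-spectrum covariance matrix (the trailing `.T` just regroups the values per label). The same operation written more explicitly:

import numpy as np

def per_spectrum_uncertainties(p_cov):
    # p_cov has shape (N, L, L): one L x L covariance matrix per spectrum.
    # The 1-sigma label uncertainties are the square roots of the diagonal entries.
    return np.array([np.sqrt(np.diag(cov)) for cov in p_cov])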
Example #22
def estimate_stellar_labels(pks,
                            model_path,
                            dwave_slam=10.,
                            p_slam=(1E-8, 1E-7),
                            q_slam=0.7,
                            ivar_block_slam=None,
                            eps_slam=1E-19,
                            rsv_frac_slam=2.,
                            n_jobs_slam=1,
                            verbose_slam=5):
    """
    Estimate the stellar parameters for APOGEE ApStar observations,
    where task instances have been created with the given primary keys (`pks`).

    :param pks:
        The primary keys of task instances that include information about what
        ApStar observation to load.
    
    :param model_path:
        The disk path of the pre-trained model.
        
    :param dwave_slam: float
        binning width
        
    :param p_slam: tuple of 2 ps [optional]
        smoothing parameter between 0 and 1: (default: 1E-8, 1E-7)
        0 -> LS-straight line
        1 -> cubic spline interpolant
        
    :param q_slam: float [optional]
        percentile, between 0 and 1 (default: 0.7)
        
    :param ivar_block_slam: ndarray (n_pix, ) | None [optional]
        ivar array (default: None)
        
    :param eps_slam: float [optional]
        the ivar threshold (default: 1E-19)
    
    :param rsv_frac_slam: float [optional]
        the fraction of pixels reserved in terms of std. (default: 2)
    
    :param n_jobs_slam: int [optional]
        number of processes launched by joblib (default: 1)
        
    :param verbose_slam: int / bool [optional]
        verbose level (default: 5)
    """
    '''
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    log.info(f"Running APOGEENet on device {device} with:")
    log.info(f"\tmodel_path: {model_path}")
    log.info(f"\tpks: {pks}")

    log.debug(f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")

    log.debug(f"Using torch version {torch.__version__} in {torch.__path__}")
    
    # Load the model.
    ### model = Model(model_path, device)
    '''

    # Load the model.
    model = Slam.load_dump(model_path)  ### ("./models/btsettl.dump")
    ### wave_interp = np.load("./models/wave_interp_R1800.npz")['wave'] ### ??? how to load properly
    wave_interp = model.wave

    log.info(f"Loaded model from {model_path}")

    pks = deserialize_pks(pks, flatten=True)
    total = len(pks)

    log.info(f"There are {total} primary keys to process: {pks}")

    for instance, path, spectrum in tqdm(prepare_data(pks), total=total):
        if spectrum is None: continue

        N, P = spectrum.flux.shape
        # Tile the 1-D spectral axis to match the (N, P) flux array, and use
        # plain arrays (as in the other pipelines above).
        wave = np.tile(spectrum.spectral_axis.value, N).reshape((N, -1))
        fluxes = spectrum.flux.value
        invars = spectrum.uncertainty.array

        fluxes_resamp, invars_resamp = [], []
        for i in tqdm(range(N)):
            fluxes_temp, invars_temp = resample(wave[i], fluxes[i], invars[i],
                                                wave_interp)
            fluxes_resamp += [fluxes_temp]
            invars_resamp += [invars_temp]
        fluxes_resamp, invars_resamp = np.array(fluxes_resamp), np.array(
            invars_resamp)

        # Normalization of each spectrum.
        fluxes_norm, fluxes_cont = normalize_spectra_block(
            wave_interp,
            fluxes_resamp, (6147., 8910.),
            dwave_slam,
            p=p_slam,
            q=q_slam,
            ivar_block=ivar_block_slam,
            eps=eps_slam,
            rsv_frac=rsv_frac_slam,
            n_jobs=n_jobs_slam,
            verbose=verbose_slam)

        invars_norm = fluxes_cont**2 * invars_resamp

        ### Initial estimation: get initial estimate of parameters by chi2 best match
        label_init = model.predict_labels_quick(fluxes_norm,
                                                invars_norm,
                                                n_jobs=1)

        ### SLAM prediction: optimize parameters
        results_pred = model.predict_labels_multi(label_init, fluxes_norm,
                                                  invars_norm)
        label_pred = np.array([label['x'] for label in results_pred])
        std_pred = np.array([label['pstd'] for label in results_pred])

        # Assemble the result arrays: teff, [M/H], logg, [alpha/M] and their uncertainties.
        teff = label_pred[:, 0]
        m_h = label_pred[:, 1]
        log_g = label_pred[:, 2]
        alpha_m = label_pred[:, 3]
        u_teff = std_pred[:, 0]
        u_m_h = std_pred[:, 1]
        u_log_g = std_pred[:, 2]
        u_alpha_m = std_pred[:, 3]
        result = dict(
            snr=spectrum.meta["snr"],
            teff=teff.tolist(),
            m_h=m_h.tolist(),
            logg=log_g.tolist(),
            alpha_m=alpha_m.tolist(),
            u_teff=u_teff.tolist(),
            u_m_h=u_m_h.tolist(),
            u_logg=u_log_g.tolist(),
            u_alpha_m=u_alpha_m.tolist(),
        )

        # Write the result to the database.
        create_task_output(instance, astradb.SLAM, **result)

    log.info(f"Completed processing of {total} primary keys")
Ejemplo n.º 23
0
        training_labels, training_spectra, \
        validation_labels, validation_spectra = training.load_training_data(training_set_path)

    state, model, optimizer = training.train(training_spectra,
                                             training_labels,
                                             validation_spectra,
                                             validation_labels,
                                             label_names,
                                             num_neurons=int(num_neurons),
                                             num_epochs=num_epochs,
                                             learning_rate=learning_rate,
                                             weight_decay=weight_decay)

    # Ensure that the output folder exists.
    os.makedirs(os.path.dirname(output_model_path), exist_ok=True)
    log.info(f"Writing model to {output_model_path}")
    with open(output_model_path, "wb") as fp:
        pickle.dump(
            dict(
                state=state,
                wavelength=wavelength,
                label_names=label_names,
            ), fp)

    # Try to send xcom result of the output path.
    try:
        ti = kwargs["ti"]
        ti.xcom_push("model_path", output_model_path)
    except Exception:
        log.exception("Unable to send `model_path` as xcom variable")
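Since the trained model is serialized with pickle, reading it back is symmetric. A minimal sketch, assuming output_model_path points at a file written by the block above:

import pickle

# Read back the payload written above: a dict with keys
# "state", "wavelength" and "label_names".
with open(output_model_path, "rb") as fp:
    payload = pickle.load(fp)

state = payload["state"]
wavelength = payload["wavelength"]
label_names = payload["label_names"]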
Ejemplo n.º 24
0
    def execute(self, context):
        """
        Execute the operator.

        :param context:
            The Airflow DAG context.
        """

        # Load spectra.
        instances, Ns = ([], [])
        wavelength, flux, sigma, spectrum_meta = ([], [], [], [])
        for instance, path, spectrum in self.prepare_data():
            if spectrum is None: continue

            N, P = spectrum.flux.shape
            wavelength.append(
                np.tile(spectrum.wavelength.value, N).reshape((N, -1)))
            flux.append(spectrum.flux.value)
            sigma.append(spectrum.uncertainty.array**-0.5)
            spectrum_meta.append(dict(snr=spectrum.meta["snr"]))

            Ns.append(N)
            instances.append(instance)

        Ns = np.array(Ns, dtype=int)
        wavelength, flux, sigma = tuple(
            map(np.vstack, (wavelength, flux, sigma)))

        # Create names for easy debugging in FERRE outputs.
        names = create_names(
            instances, Ns, "{star_index}_{telescope}_{obj}_{spectrum_index}")

        # Load initial parameters for each spectrum.
        initial_parameters = create_initial_parameters(instances, Ns)

        # Directory.
        directory = os.path.join(
            get_base_output_path(), "ferre", "tasks",
            f"{context['ds']}-{context['dag'].dag_id}-{context['task'].task_id}-{context['run_id']}"
        )
        os.makedirs(directory, exist_ok=True)
        log.info(f"Working directory for task is {directory}")

        # Prepare FERRE.
        args = prepare_ferre(
            directory,
            dict(wavelength=wavelength,
                 flux=flux,
                 sigma=sigma,
                 header_path=self.header_path,
                 names=names,
                 initial_parameters=initial_parameters,
                 frozen_parameters=self.frozen_parameters,
                 interpolation_order=self.interpolation_order,
                 input_weights_path=self.input_weights_path,
                 input_lsf_shape_path=self.input_lsf_shape_path,
                 lsf_shape_flag=self.lsf_shape_flag,
                 error_algorithm_flag=self.error_algorithm_flag,
                 wavelength_interpolation_flag=self.wavelength_interpolation_flag,
                 optimization_algorithm_flag=self.optimization_algorithm_flag,
                 continuum_flag=self.continuum_flag,
                 continuum_order=self.continuum_order,
                 continuum_segment=self.continuum_segment,
                 continuum_reject=self.continuum_reject,
                 continuum_observations_flag=self.continuum_observations_flag,
                 full_covariance=self.full_covariance,
                 pca_project=self.pca_project,
                 pca_chi=self.pca_chi,
                 n_threads=self.n_threads,
                 f_access=self.f_access,
                 f_format=self.f_format,
                 ferre_kwargs=self.ferre_kwargs))

        # Execute, either by slurm or whatever.
        log.debug(f"FERRE ready to roll in {directory}")
        assert self.slurm_kwargs
        self.execute_by_slurm(
            context,
            bash_command=
            "/uufs/chpc.utah.edu/common/home/sdss09/software/apogee/Linux/apogee/trunk/bin/ferre.x",
            directory=directory,
        )
        # Unbelievably, FERRE sends a '1' exit code every time it is executed. Even if it succeeds.
        # TODO: Ask Carlos or Jon to remove this insanity.

        # Parse outputs.
        # TODO: clean up this function
        param, param_err, output_meta = parse_ferre_outputs(
            directory, self.header_path, *args)

        results = group_results_by_instance(param, param_err, output_meta,
                                            spectrum_meta, Ns)

        for instance, (result, data) in zip(instances, results):
            if result is None: continue

            create_task_output(instance, astradb.Ferre, **result)

            log.debug(f"{instance}")
            log.debug(f"{result}")
            log.debug(f"{data}")

            # TODO: Write a data model product for this intermediate output!
            output_path = utils.output_data_product_path(instance.pk)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            with open(output_path, "wb") as fp:
                pickle.dump((result, data), fp)

            log.info(
                f"Wrote outputs of task instance {instance} to {output_path}")

        # Always return the primary keys that were worked on!
        return self.pks
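The operator above stacks per-task spectra with np.vstack and later regroups the FERRE results per task instance using the visit counts Ns (group_results_by_instance). A self-contained sketch of that stack-and-split pattern, with purely illustrative values:

import numpy as np

# Three hypothetical task instances contributing 2, 1 and 3 spectra (visits).
Ns = np.array([2, 1, 3], dtype=int)
P = 4                                                  # pixels per spectrum (tiny, for illustration)
flux = np.arange(Ns.sum() * P, dtype=float).reshape((Ns.sum(), P))

# Split the stacked array back into one block per task instance.
blocks = np.split(flux, np.cumsum(Ns)[:-1], axis=0)
assert [b.shape[0] for b in blocks] == list(Ns)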
Ejemplo n.º 25
0
def _parse_names_and_initial_and_frozen_parameters(
        names,
        initial_parameters,
        frozen_parameters,
        headers,
        flux,
        clip_initial_parameters_to_boundary_edges=True,
        clip_epsilon_percent=1,
        **kwargs):

    # Read the labels from the first header path
    parameter_names = headers["LABEL"]

    # Need the number of spectra, which we will take from the flux array.
    N = len(flux)
    mid_point = _grid_mid_point(headers)
    parsed_initial_parameters = np.tile(mid_point, N).reshape((N, -1))

    log.debug(f"parsed initial parameters before {parsed_initial_parameters}")

    compare_parameter_names = list(
        map(sanitise_parameter_name, parameter_names))

    log.debug(f"Initial parameters passed for parsing {initial_parameters}")

    if initial_parameters is not None:
        log.debug(f"Comparison names {compare_parameter_names}")
        for i, (parameter_name,
                values) in enumerate(initial_parameters.items()):
            spn = sanitise_parameter_name(parameter_name)
            log.debug(f"{parameter_name} {values} {spn}")

            try:
                index = compare_parameter_names.index(spn)
            except ValueError:
                log.warning(
                    f"Ignoring initial parameters for {parameter_name} as they are not in {parameter_names}"
                )
                log.debug(
                    f"Nothing matched for {spn} {parameter_name} {compare_parameter_names}"
                )
            else:
                log.debug(f"Matched to index {index}")
                # Replace non-finite values with the mid point.
                finite = np.isfinite(values)
                if not np.all(finite):
                    log.warning(
                        f"Missing or non-finite initial values given for {parameter_name}. Defaulting to the grid mid-point."
                    )

                values = np.array(values)
                values[~finite] = mid_point[index]

                log.debug(f"values are {values} {type(values[0])} {finite}")
                parsed_initial_parameters[:, index] = values

    log.debug(f"parsed initial parameters after {parsed_initial_parameters}")

    kwds = dict()
    frozen_parameters = (frozen_parameters or dict())
    if frozen_parameters:
        # Ensure we have a dict-like thing.
        if isinstance(frozen_parameters, (list, tuple, np.ndarray)):
            frozen_parameters = {
                sanitise_parameter_name(k): True
                for k in frozen_parameters
            }
        elif isinstance(frozen_parameters, dict):
            # Exclude things that have boolean False.
            frozen_parameters = {
                sanitise_parameter_name(k): v for k, v in frozen_parameters.items() \
                if not (isinstance(v, bool) and not v)
            }
        else:
            raise TypeError(
                f"frozen_parameters must be list-like or dict-like")

        unknown_parameters = set(frozen_parameters).difference(
            compare_parameter_names)
        if unknown_parameters:
            raise ValueError(
                f"unknown parameter(s): {unknown_parameters} (available: {parameter_names})"
            )

        indices = [
            i for i, pn in enumerate(compare_parameter_names, start=1)
            if pn not in frozen_parameters
        ]

        if len(indices) == 0:
            raise ValueError(f"all parameters frozen?!")

        # Over-ride initial values with the frozen ones if given.
        for parameter_name, value in frozen_parameters.items():
            if not isinstance(value, bool):
                log.debug(
                    f"Over-writing initial values for {parameter_name} with frozen value of {value}"
                )
                zero_index = compare_parameter_names.index(parameter_name)
                parsed_initial_parameters[:, zero_index] = value
    else:
        # No frozen parameters.
        indices = 1 + np.arange(len(parameter_names), dtype=int)

    # Build a frozen parameters dict for result metadata.
    parsed_frozen_parameters = {
        pn: (pn in frozen_parameters)
        for pn in compare_parameter_names
    }

    L = len(indices)
    kwds.update(
        ndim=headers["N_OF_DIM"],
        nov=L,
        indv=" ".join([f"{i:.0f}" for i in indices]),
        # We will always provide an initial guess, even if it is the grid mid point.
        init=0,
        indini=" ".join(["1"] * L))

    # Now deal with names.
    if names is None:
        names = [f"{i:.0f}" for i in range(len(parsed_initial_parameters))]
    else:
        if len(names) != len(parsed_initial_parameters):
            raise ValueError(
                f"names and initial parameters does not match ({len(names)} != {len(parsed_initial_parameters)})"
            )

    # Let's check the initial values are all within the grid boundaries.
    lower_limit, upper_limit = _get_grid_limits(headers)
    try:
        _check_initial_parameters_within_grid_limits(parsed_initial_parameters,
                                                     lower_limit, upper_limit,
                                                     parameter_names)
    except ValueError as e:
        log.exception(
            f"Exception when checking initial parameters within grid boundaries:"
        )
        log.critical(e, exc_info=True)

        if clip_initial_parameters_to_boundary_edges:
            log.info(
                f"Clipping initial parameters to boundary edges (use clip_initial_parameters_to_boundary_edges=False to raise exception instead)"
            )

            clip = clip_epsilon_percent * (upper_limit - lower_limit) / 100.
            parsed_initial_parameters = np.round(
                np.clip(parsed_initial_parameters, lower_limit + clip,
                        upper_limit - clip), 3)
        else:
            raise

    return (kwds, names, parsed_initial_parameters, parsed_frozen_parameters)
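A small standalone sketch of how the frozen-parameter handling above maps onto FERRE's NOV/INDV/INDINI keywords, i.e. the 1-indexed indices of the dimensions left free. The label names are examples only, and no name sanitisation is shown:

# Illustrative grid labels and a frozen subset (names are examples only).
parameter_names = ["TEFF", "LOGG", "METALS", "O Mg Si S Ca Ti"]
frozen_parameters = {"O Mg Si S Ca Ti": True}

# FERRE takes 1-indexed indices of the free dimensions.
indices = [i for i, pn in enumerate(parameter_names, start=1)
           if pn not in frozen_parameters]

kwds = dict(
    nov=len(indices),                              # number of dimensions to solve for
    indv=" ".join(f"{i:.0f}" for i in indices),    # which dimensions are free
    init=0,                                        # an explicit initial guess is always given
    indini=" ".join(["1"] * len(indices)),
)
# kwds == {'nov': 3, 'indv': '1 2 3', 'init': 0, 'indini': '1 1 1'}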
Ejemplo n.º 26
0
    def train(self,
              threads=None,
              op_method=None,
              op_strict=True,
              op_kwds=None,
              **kwargs):
        """
        Train the model.

        :param threads: [optional]
            The number of parallel threads to use.

        :param op_method: [optional]
            The optimization algorithm to use: l_bfgs_b (default) and powell
            are available.

        :param op_strict: [optional]
            Default to Powell's optimization method if BFGS fails.

        :param op_kwds:
            Keyword arguments to provide directly to the optimization function.

        :returns:
            A three-length tuple containing the spectral coefficients `theta`,
            the squared scatter term at each pixel `s2`, and metadata related to
            the training of each pixel.
        """

        kwds = dict(op_method=op_method, op_strict=op_strict, op_kwds=op_kwds)
        kwds.update(kwargs)

        if self.training_set_flux is None or self.training_set_ivar is None:
            raise TypeError(
                "cannot train: training set spectra not saved with the model")

        S, P = self.training_set_flux.shape
        T = self.design_matrix.shape[1]

        log.info("Training {0}-label {1} with {2} stars and {3} pixels/star"\
            .format(len(self.vectorizer.label_names), type(self).__name__, S, P))

        # Parallelise out.
        if threads in (1, None):
            mapper, pool = (map, None)

        else:
            pool = mp.Pool(threads)
            mapper = pool.map

        func = Wrapper(fitting.fit_pixel_fixed_scatter, None, kwds, P)

        meta = []
        theta = np.nan * np.ones((P, T))
        s2 = np.nan * np.ones(P)

        for pixel, (flux, ivar) \
        in enumerate(zip(self.training_set_flux.T, self.training_set_ivar.T)):

            args = (flux, ivar, self._initial_theta(pixel),
                    self._censored_design_matrix(pixel),
                    self._pixel_access(self.regularization, pixel, 0.0), None)
            (pixel_theta, pixel_s2, pixel_meta), = mapper(func, [args])

            meta.append(pixel_meta)
            theta[pixel], s2[pixel] = (pixel_theta, pixel_s2)

        self._theta, self._s2 = (theta, s2)

        if pool is not None:
            pool.close()
            pool.join()

        return (theta, s2, meta)
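The thread handling in train reduces to a "serial map or pool map" switch. A minimal sketch of that pattern, independent of the model class; fit_one_pixel here is a stand-in, not the real per-pixel objective:

import multiprocessing as mp

def fit_one_pixel(args):
    """Stand-in for the per-pixel fit; just a weighted sum for illustration."""
    flux, ivar = args
    return float(sum(f * w for f, w in zip(flux, ivar)))

def run(jobs, threads=None):
    # Serial execution uses the builtin map; otherwise distribute over a pool.
    if threads in (1, None):
        mapper, pool = (map, None)
    else:
        pool = mp.Pool(threads)
        mapper = pool.map

    results = list(mapper(fit_one_pixel, jobs))

    if pool is not None:
        pool.close()
        pool.join()

    return results

# Example: two tiny "pixels", run serially.
print(run([([1.0, 2.0], [1.0, 0.5]), ([3.0, 4.0], [0.2, 0.2])]))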
Ejemplo n.º 27
0
        upstream_pk = instance.parameters.get("upstream_pk", None)
        if upstream_pk is None:
            raise ValueError(f"cannot do median filter correction because no upstream_pk parameter for {instance}")

        upstream_pk = literal_eval(upstream_pk)

        # There could be many upstream tasks listed, so we should get the matching one.
        q = session.query(astradb.TaskInstance)\
                   .filter(astradb.TaskInstance.pk.in_(upstream_pk))\
                   .filter(astradb.TaskInstance.task_id.like(median_filter_correction_from_task_id_like))

        upstream_instance = q.one_or_none()
        if upstream_instance is None:
            raise RuntimeError(f"cannot find upstream instance in {upstream_pk} matching {median_filter_correction_from_task_id_like}")

        log.info(f"Applying median filtered correction\n\tto {instance}\n\tfrom {upstream_instance}")

        upstream_path = utils.output_data_product_path(upstream_instance.pk)
        with open(upstream_path, "rb") as fp:
            result, data = pickle.load(fp)
        
        # Need number of pixels from header
        n_pixels = [header["NPIX"] for header in utils.read_ferre_headers(utils.expand_path(instance.parameters["header_path"]))][1:]

        # Get the segment indices using the data mask and the known number of pixels.
        indices = 1 + np.cumsum(data["mask"]).searchsorted(np.cumsum(n_pixels))
        segment_indices = np.vstack([indices - n_pixels, indices]).T

        cont = median_filtered_correction(
            wavelength=data["wavelength"],
            # TODO: Check this median filtered correction.
Ejemplo n.º 28
0
    def run(self):
        """ Execute this task. """

        # Load the model.
        log.info(f"Loading model for {self}")
        state = testing.load_state(self.input()["model"].path)

        # We can run this in batch mode.
        label_names = state["label_names"]
        tqdm_kwds = dict(total=self.get_batch_size(), desc="The Payne")
        for init, task in tqdm(timer(self.get_batch_tasks()), **tqdm_kwds):
            if task.complete():
                continue
            
            #log.debug(f"Running {task}")
            spectrum, continuum, normalized_flux, normalized_ivar = task.prepare_observation()

            #log.debug(f"Prepared observations for {task}")
            
            p_opt, p_cov, model_flux, meta = testing.test(
                spectrum.wavelength.value,
                normalized_flux,
                normalized_ivar,
                **state
            )

            #log.debug(f"Completed inference on {task}. p_opt has shape {p_opt.shape}")

            results = dict(zip(label_names, p_opt.T))
            # Note: count the labels before adding SNR, since radial velocity
            #       determination may or may not be included in the label names.

            L = len(results)
            # Add in uncertainties on parameters.
            results.update(dict(zip(
                (f"u_{ln}" for ln in label_names),
                np.sqrt(p_cov[:, np.arange(L), np.arange(L)].T)
            )))

            # Add in SNR values for convenience.
            results.update(snr=spectrum.meta["snr"])
            
            # Write AstraSource object.
            if "AstraSource" in task.output():
                #log.debug(f"Writing AstraSource object for {task}")    
                task.output()["AstraSource"].write(
                    spectrum=spectrum,
                    normalized_flux=normalized_flux,
                    normalized_ivar=normalized_ivar,
                    continuum=continuum,
                    model_flux=model_flux,
                    # TODO: Project uncertainties to flux space.
                    model_ivar=None,
                    results_table=Table(results)
                )

            # Write output to database.
            if "database" in task.output():
                #log.debug(f"Writing database output for {task}")
                task.output()["database"].write(results)

            # Trigger this event as complete, and record task duration.
            task.trigger_event_processing_time(time() - init, cascade=True)

        return None
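The model_ivar=None above leaves a TODO for projecting label uncertainties into flux space. One standard first-order approach, not taken from astra, is to push the label covariance through a numerical Jacobian of the model; the quadratic toy_model below is purely illustrative:

import numpy as np

def toy_model(labels, wavelength):
    """Purely illustrative spectral model: flux as a smooth function of two labels."""
    a, b = labels
    return 1.0 + a * np.sin(wavelength) + b * wavelength**2

def propagate_to_flux(labels, cov, wavelength, step=1e-4):
    """First-order propagation: var(flux) = diag(J @ cov @ J.T), with J = d(model)/d(labels)."""
    f0 = toy_model(labels, wavelength)
    J = np.empty((f0.size, len(labels)))
    for j in range(len(labels)):
        perturbed = np.array(labels, dtype=float)
        perturbed[j] += step
        J[:, j] = (toy_model(perturbed, wavelength) - f0) / step
    flux_var = np.einsum("pi,ij,pj->p", J, cov, J)
    return f0, flux_var

wavelength = np.linspace(0.1, 1.0, 10)
labels = np.array([0.5, -0.2])                       # e.g. two fitted labels
cov = np.array([[0.01, 0.001], [0.001, 0.04]])       # their covariance from the fit
model_flux, flux_var = propagate_to_flux(labels, cov, wavelength)
model_ivar = 1.0 / flux_var                          # inverse variance in flux space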
Ejemplo n.º 29
0
def export_to_table(output_path, overwrite=True):
    """
    Export the APOGEENet database results to a table.

    :param output_path:
        The disk location where to write the table to.
    
    :param overwrite: [optional]
        Overwrite any existing file at the given output path (default: True).
    """

    output_path = os.path.expandvars(os.path.expanduser(output_path))
    if not overwrite and os.path.exists(output_path):
        raise OSError(f"path '{output_path}' already exists and asked not to overwrite it")

    sq = session.query(
            astradb.ApogeeNet.output_pk.label("output_pk"),
            func.json_object_agg(
                astradb.Parameter.parameter_name,
                astradb.Parameter.parameter_value
            ).label("parameters")
        )\
        .filter(astradb.ApogeeNet.output_pk == astradb.TaskInstance.output_pk)\
        .filter(astradb.TaskInstance.pk == astradb.TaskInstanceParameter.ti_pk)\
        .filter(astradb.TaskInstanceParameter.parameter_pk == astradb.Parameter.pk)\
        .group_by(astradb.ApogeeNet)\
        .subquery(with_labels=True)

    q = session.query(
            astradb.TaskInstance,
            astradb.ApogeeNet, 
            func.cardinality(astradb.ApogeeNet.snr),
            sq.c.parameters
        )\
        .filter(sq.c.output_pk == astradb.ApogeeNet.output_pk)\
        .filter(sq.c.output_pk == astradb.TaskInstance.output_pk)

    total, = session.query(func.sum(func.cardinality(astradb.ApogeeNet.snr))).first()

    table_columns = OrderedDict([
        ("ti_pk", []),
        ("run_id", []),
        ("release", []),
        ("apred", []),
        ("field", []),
        ("healpix", []),
        ("telescope", []),
        ("obj", []),
        ("spectrum_index", []),
    ])
    column_names = ("snr", "teff", "u_teff", "logg", "u_logg", "fe_h", "u_fe_h", "bitmask_flag")
    for cn in column_names:
        table_columns[cn] = []

    with tqdm(total=total, unit="spectra") as pb:
    
        for task_instance, result, N, parameters in q.yield_per(1):
            for i in range(N):
                table_columns["ti_pk"].append(result.ti_pk)
                table_columns["run_id"].append(task_instance.run_id)
                table_columns["release"].append(parameters["release"])
                table_columns["apred"].append(parameters["apred"])
                table_columns["field"].append(parameters.get("field", ""))
                table_columns["healpix"].append(parameters.get("healpix", ""))
                table_columns["telescope"].append(parameters["telescope"])
                table_columns["obj"].append(parameters["obj"])
                table_columns["spectrum_index"].append(i)

                for column_name in column_names:
                    table_columns[column_name].append(getattr(result, column_name)[i])
                
                pb.update(1)
    
    log.info(f"Creating table with {total} rows")
    table = Table(data=table_columns)
    log.info(f"Table created.")

    log.info(f"Writing to {output_path}")
    table.write(output_path, overwrite=overwrite)
    log.info("Done")

    return table_columns
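A hedged usage sketch for export_to_table; the output filename is a placeholder and the call assumes the astra database session is already configured.

# Hypothetical call; the output path is a placeholder.
table_columns = export_to_table("apogeenet-results.fits", overwrite=True)

# The return value is the column dictionary used to build the table,
# so a quick sanity check on the row count is straightforward.
print(f"Exported {len(table_columns['ti_pk'])} rows")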