Example #1
    def pre_execute(self, context):
        """
        Create task instances for all the data model identifiers. 
        
        :param context:
            The Airflow context dictionary.
        """

        args = (context["dag"].dag_id, context["task"].task_id,
                context["run_id"])

        # Get parameters from the parent class initialisation that should also be stored.
        common_task_parameters = self.common_task_parameters()

        pks = []
        for data_model_identifiers in self.data_model_identifiers(context):
            # The order here is important if common_task_parameters['release'] is None
            # and the release has been inferred from the execution date.
            parameters = {**common_task_parameters, **data_model_identifiers}
            instance = create_task_instance(*args, parameters)
            pks.append(instance.pk)

        if not pks:
            raise AirflowSkipException(
                "No data model identifiers found for this time period.")

        self.pks = pks
        return None
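For context, `pre_execute` here only records the created primary keys on the operator. A minimal sketch of how those keys could be handed downstream via XCom follows; the class name and `execute` body are illustrative only, not the project's actual operator:

from airflow.models import BaseOperator

class DataModelOperatorSketch(BaseOperator):
    # pre_execute() (as above) would populate self.pks before execute() runs.

    def execute(self, context):
        # Returning the primary keys pushes them to XCom, so downstream tasks
        # can collect them with ti.xcom_pull(task_ids=...).
        return self.pks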
Example #2
File: base.py Project: sdss/astra
    def pre_execute(self, context):
        """
        Create a task instance for this execution.
        
        :param context:
            The Airflow context dictionary.
        """

        if self.python_callable is None:
            raise RuntimeError("No python_callable specified")

        args = (context["dag"].dag_id, context["task"].task_id,
                context["run_id"])

        # Get a string representation of the python callable to store in the database.
        parameters = dict(
            python_callable=callable_to_string(self.python_callable))
        parameters.update(self.parameters)
        instance = create_task_instance(*args, parameters)
        self.pks = (instance.pk, )
        return None
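`callable_to_string` is not shown in these examples; a plausible minimal version (an assumption, not necessarily the project's implementation) stores the callable as an importable path:

def callable_to_string(python_callable):
    # Store the callable as an importable "module.name" string so it can be
    # written to the database and re-imported when the task instance runs.
    return f"{python_callable.__module__}.{python_callable.__name__}"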
Example #3
def classify_apstar(pks, dag, task, run_id, **kwargs):
    """
    Classify observations of APOGEE (ApStar) sources, given the existing classifications of the
    individual visits.

    :param pks:
        The primary keys of task instances where individual visits have been classified. These
        primary keys will be used to work out which stars need classifying before new task
        instances are created for them.
    """

    pks = deserialize_pks(pks, flatten=True)

    # For each unique apStar object, we need to find all the visits that have been classified.
    distinct_apogee_drp_star_pk = session.query(
        distinct(astradb.TaskInstanceMeta.apogee_drp_star_pk)).filter(
            astradb.TaskInstance.pk.in_(pks),
            astradb.TaskInstanceMeta.ti_pk == astradb.TaskInstance.pk).all()

    # We need to make sure that we will only retrieve results on apVisit objects, and not on apStar objects.
    parameter_pk, = session.query(astradb.Parameter.pk).filter(
        astradb.Parameter.parameter_name == "filetype",
        astradb.Parameter.parameter_value == "apVisit").one_or_none()

    for (star_pk, ) in distinct_apogee_drp_star_pk:

        results = session.query(
            astradb.TaskInstance, astradb.TaskInstanceMeta,
            astradb.Classification
        ).filter(
            astradb.Classification.output_pk == astradb.TaskInstance.output_pk,
            astradb.TaskInstance.pk == astradb.TaskInstanceMeta.ti_pk,
            astradb.TaskInstanceMeta.apogee_drp_star_pk == star_pk,
            astradb.TaskInstanceParameter.ti_pk == astradb.TaskInstance.pk,
            astradb.TaskInstanceParameter.parameter_pk == parameter_pk).all()

        column_func = lambda column_name: column_name.startswith("lp_")

        lps = {}
        for j, (ti, meta, classification) in enumerate(results):
            if j == 0:
                for column_name in classification.__table__.columns.keys():
                    if column_func(column_name):
                        lps[column_name] = []

            for column_name in lps.keys():
                values = getattr(classification, column_name)
                if values is None: continue
                assert len(values) == 1, \
                    "We are getting results from apStars and re-adding to apStars!"
                lps[column_name].append(values[0])

        # Calculate total log probabilities.
        joint_lps = np.array(
            [np.sum(lp) for lp in lps.values() if len(lp) > 0])
        keys = [key for key, lp in lps.items() if len(lp) > 0]

        # Calculate normalized probabilities.
        with np.errstate(under="ignore"):
            relative_log_probs = joint_lps - logsumexp(joint_lps)

        # Round for PostgreSQL 'real' type.
        # https://www.postgresql.org/docs/9.1/datatype-numeric.html
        # and
        # https://stackoverflow.com/questions/9556586/floating-point-numbers-of-python-float-and-postgresql-double-precision
        decimals = 3
        probs = np.round(np.exp(relative_log_probs), decimals)

        joint_result = {k: [float(lp)] for k, lp in zip(keys, joint_lps)}
        joint_result.update({k[1:]: [float(v)] for k, v in zip(keys, probs)})

        # Create a task for this classification.
        # To do that we need to construct the parameters for the task.
        columns = (
            apogee_drpdb.Star.apred_vers.label("apred"),  # TODO: Raise with Nidever
            apogee_drpdb.Star.healpix,
            apogee_drpdb.Star.telescope,
            apogee_drpdb.Star.apogee_id.label("obj"),  # TODO: Raise with Nidever
        )
        apred, healpix, telescope, obj = sdss_session.query(*columns).filter(
            apogee_drpdb.Star.pk == star_pk).one()
        parameters = dict(apred=apred,
                          healpix=healpix,
                          telescope=telescope,
                          obj=obj,
                          release="sdss5",
                          filetype="apStar",
                          apstar="stars")

        args = (dag.dag_id, task.task_id, run_id)

        # Create a task instance for this star with the constructed parameters.
        instance = create_task_instance(*args, parameters)
        output = create_task_output(instance.pk, astradb.Classification,
                                    **joint_result)

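The combination step in Example #3 sums per-visit log probabilities per class and renormalises them with `logsumexp`. A small self-contained illustration with made-up class names and values:

import numpy as np
from scipy.special import logsumexp

# Toy per-visit log probabilities for two hypothetical classes, three visits each.
lps = {
    "lp_a": [-0.2, -0.1, -0.3],
    "lp_b": [-2.0, -1.5, -2.5],
}

keys = [key for key, lp in lps.items() if len(lp) > 0]
joint_lps = np.array([np.sum(lp) for lp in lps.values() if len(lp) > 0])

# Normalise so that the class probabilities sum to one.
with np.errstate(under="ignore"):
    relative_log_probs = joint_lps - logsumexp(joint_lps)
probs = np.round(np.exp(relative_log_probs), 3)

print(dict(zip(keys, probs)))  # -> approximately {'lp_a': 0.996, 'lp_b': 0.004}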
Example #4
def write_database_outputs(
        task, 
        ti, 
        run_id, 
        element_from_task_id_callable=None,
        **kwargs
    ):
    """
    Collate outputs from upstream FERRE executions and write them to an ASPCAP database table.
    
    :param task:
        This task, as given by the Airflow context dictionary.
    
    :param ti:
        This task instance, as given by the Airflow context dictionary.
    
    :param run_id:
        This run ID, as given by the Airflow context dictionary.
    
    :param element_from_task_id_callable: [optional]
        A Python callable that returns the chemical element, given a task ID.
    """

    
    log.debug("Writing ASPCAP database outputs")

    pks = []
    for upstream_task in task.upstream_list:
        pks.append(ti.xcom_pull(task_ids=upstream_task.task_id))

    log.debug(f"Upstream primary keys: {pks}")

    # Group them together by source.
    instance_pks = []
    for source_pks in list(zip(*pks)):

        # The one with the lowest primary key will be the stellar parameters.
        sp_pk, *abundance_pks = sorted(source_pks)
        
        sp_instance = session.query(astradb.TaskInstance).filter(astradb.TaskInstance.pk == sp_pk).one_or_none()
        abundance_instances = session.query(astradb.TaskInstance).filter(astradb.TaskInstance.pk.in_(abundance_pks)).all()

        # Get parameters that are in common to all instances.
        keep = {}
        for key, value in sp_instance.parameters.items():
            for instance in abundance_instances:
                if instance.parameters[key] != value:
                    break
            else:
                keep[key] = value

        # Create a task instance.
        instance = create_task_instance(
            dag_id=task.dag_id, 
            task_id=task.task_id, 
            run_id=run_id,
            parameters=keep
        )

        # Create a partial results table.
        keys = ["snr"]
        label_names = ("teff", "logg", "metals", "log10vdop", "o_mg_si_s_ca_ti", "lgvsini", "c", "n")
        for key in label_names:
            keys.extend([key, f"u_{key}"])
        
        results = {key: getattr(sp_instance.output, key) for key in keys}

        # Now update with elemental abundance instances.
        for el_instance in abundance_instances:
            
            if element_from_task_id_callable is not None:
                element = element_from_task_id_callable(el_instance.task_id).lower()
            else:
                element = el_instance.task_id.split(".")[-1].lower()
            
            # Check what is not frozen.
            thawed_label_names = []
            ignore = ("lgvsini", ) # Ignore situations where lgvsini was missing from grid and it screws up the task
            for key in label_names:
                if key not in ignore and not getattr(el_instance.output, f"frozen_{key}"):
                    thawed_label_names.append(key)

            if len(thawed_label_names) > 1:
                log.warning(f"Multiple thawed label names for {element} {el_instance}: {thawed_label_names}")

            values = np.hstack([getattr(el_instance.output, ln) for ln in thawed_label_names]).tolist()
            u_values = np.hstack([getattr(el_instance.output, f"u_{ln}") for ln in thawed_label_names]).tolist()

            results.update({
                f"{element}_h": values,
                f"u_{element}_h": u_values,
            })

        # Include associated primary keys so we can reference back to original parameters, etc.
        results["associated_ti_pks"] = [sp_pk, *abundance_pks]

        log.debug(f"Results entry: {results}")

        # Create an entry in the output interface table.
        # (We will update this later with any elemental abundance results).
        # TODO: Should we link back to the original FERRE primary keys?
        output = create_task_output(
            instance,
            astradb.Aspcap,
            **results
        )
        log.debug(f"Created output {output} for instance {instance}")
        instance_pks.append(instance.pk)
        
    return instance_pks
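The grouping step above (`list(zip(*pks))`) assumes each upstream task returned its primary keys in the same per-source order; a minimal illustration with made-up keys:

# Hypothetical XCom pulls: one list of primary keys per upstream FERRE task.
pks = [
    [101, 102, 103],  # stellar parameter task
    [201, 202, 203],  # one abundance task
    [301, 302, 303],  # another abundance task
]

# zip(*pks) transposes the lists, giving one tuple of primary keys per source;
# the lowest key in each tuple belongs to the stellar parameter task.
for source_pks in list(zip(*pks)):
    sp_pk, *abundance_pks = sorted(source_pks)
    print(sp_pk, abundance_pks)
# 101 [201, 301]
# 102 [202, 302]
# 103 [203, 303]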
Example #5
    def execute(self, context):
        """
        Create task instances for all the data model identifiers, which could include
        multiple task instances for each data model identifier set.

        :param context:
            The Airflow context dictionary.
        """

        # Get header information.
        grid_info = utils.parse_grid_information(self.header_paths)

        args = (context["dag"].dag_id, context["task"].task_id, context["run_id"])

        # Get parameters from the parent class initialisation that should also be stored.
        common_task_parameters = self.common_task_parameters()

        pks = []
        trees = {}
        
        for data_model_identifiers in self.data_model_identifiers(context):

            parameters = { **common_task_parameters, **data_model_identifiers }

            release = parameters["release"]
            tree = trees.get(release, None)
            if tree is None:
                trees[release] = tree = SDSSPath(release=release)

            path = tree.full(**parameters)
            
            # Generate initial guess(es).
            initial_guesses = []

            # From headers
            try:
                header = getheader(path)

                teff = safe_read_header(header, ("RV_TEFF", "RVTEFF"))
                logg = safe_read_header(header, ("RV_LOGG", "RVLOGG"))
                fe_h = safe_read_header(header, ("RV_FEH", "RVFEH"))

                # Get information relevant for matching initial guess and grids.
                initial_guesses.append(dict(
                    telescope=parameters["telescope"], # important for LSF information
                    mean_fiber=header["MEANFIB"], # important for LSF information
                    teff=teff,
                    logg=logg,
                    metals=fe_h,
                ))

            except Exception:
                log.exception(f"Unable to load relevant headers from path {path}")
                continue
            
            # Add any other initial guesses? From Gaia? etc?
            for initial_guess in initial_guesses:
                for header_path, _ in utils.yield_suitable_grids(grid_info, **initial_guess):
                    parameters.update(
                        header_path=header_path,
                        initial_teff=np.round(initial_guess["teff"], 0),
                        initial_logg=np.round(initial_guess["logg"], 3),
                        initial_metals=np.round(initial_guess["metals"], 3),
                        initial_log10vdop=np.round(utils.approximate_log10_microturbulence(initial_guess["logg"]), 3),
                        initial_o_mg_si_s_ca_ti=0.0,
                        initial_lgvsini=1.0,  # :eyes:
                        initial_c=0.0,
                        initial_n=0.0,
                    )
                    instance = create_task_instance(*args, parameters)
                    pks.append(instance.pk)
                    
                    log.debug(f"Created {instance} with parameters {parameters}")

        if not pks:
            raise AirflowSkipException("No data model identifiers found for this time period.")

        return pks
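`safe_read_header` is referenced but not defined above; a plausible sketch (an assumption about the helper, not its actual implementation) that tries a sequence of header keywords and falls back to a default:

def safe_read_header(header, keys, default=None):
    # Return the value of the first candidate keyword present in the FITS
    # header; fall back to the default if none of them exist.
    for key in keys:
        try:
            return header[key]
        except KeyError:
            continue
    return default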