def get_best_result(task, ti, **kwargs):
    """
    When there are numerous FERRE tasks that are upstream, this function will
    return the primary keys of the task instances that gave the best result
    on a per-observation basis.
    """

    # Get the PKs from upstream.
    pks = []
    log.debug(f"Upstream tasks: {task.upstream_list}")
    for upstream_task in task.upstream_list:
        pks.append(ti.xcom_pull(task_ids=upstream_task.task_id))

    pks = flatten(pks)
    log.debug(f"Getting best initial guess among primary keys {pks}")

    # Need to uniquely identify observations.
    param_bit_mask = bitmask.ParamBitMask()
    bad_grid_edge = (param_bit_mask.get_value("GRIDEDGE_WARN") | param_bit_mask.get_value("GRIDEDGE_BAD"))

    trees = {}
    best_tasks = {}
    for pk in pks:
        q = session.query(astradb.TaskInstance).filter(astradb.TaskInstance.pk == pk)
        instance = q.one_or_none()

        # one_or_none() can return None if the PK does not resolve to an instance.
        if instance is None:
            log.warning(f"No task instance found for primary key {pk}")
            continue

        if instance.output is None:
            log.warning(f"No output found for task instance {instance}")
            continue

        p = instance.parameters

        # Check that the telescope is the same as what we expect from this task ID.
        # This is a bit of a hack. Let us explain.
        # The "BA" grid does not have a telescope/fiber model, so you can run LCO and APO
        # data through the initial-BA grid. And those outputs go to the "get_best_results"
        # for each of the APO and LCO tasks (e.g., this function).
        # If there is only APO data, then the LCO "get_best_result" will only have one
        # input: the BA results. Then it will erroneously think that's the best result
        # for that source.
        # It's hacky to put this logic in here. It should be in the DAG instead. Same
        # thing for parsing the 'telescope' name in the DAG (e.g., 'APO') from 'apo25m'.
        this_telescope_short_name = p["telescope"][:3].upper()
        expected_telescope_short_name = task.task_id.split(".")[1]
        log.info(f"For instance {instance} we have {this_telescope_short_name} and {expected_telescope_short_name}")
        if this_telescope_short_name != expected_telescope_short_name:
            continue

        try:
            tree = trees[p["release"]]
        except KeyError:
            tree = trees[p["release"]] = SDSSPath(release=p["release"])

        key = "_".join([
            p['release'],
            p['filetype'],
            *[p[k] for k in tree.lookup_keys(p['filetype'])]
        ])

        best_tasks.setdefault(key, (np.inf, None))

        # TODO: Confirm that this is base-10 log. This should also be 'log_reduced_chisq_fit',
        #       according to the documentation.
        log_chisq_fit, *_ = instance.output.log_chisq_fit
        previous_teff, *_ = instance.output.teff
        bitmask_flag, *_ = instance.output.bitmask_flag

        log.debug(f"Result {instance} {instance.output} with log_chisq_fit = {log_chisq_fit} and {previous_teff} and {bitmask_flag}")

        # Note: If FERRE totally fails then it will assign -999 values to the log_chisq_fit,
        # so we have to check that the log_chisq_fit is actually sensible!
        # (Or we should only query task instances where the output is sensible!)
        if log_chisq_fit < 0:  # TODO: This is a hack.
            log.debug(f"Skipping result for {instance} {instance.output} as log_chisq_fit = {log_chisq_fit}")
            continue

        parsed_header = utils.parse_header_path(p["header_path"])

        # Penalise chi-sq in the same way they did for DR17.
        # See github.com/sdss/apogee/python/apogee/aspcap/aspcap.py#L658
        if parsed_header["spectral_type"] == "GK" and previous_teff < 3900:
            log.debug("Increasing chi-sq because spectral type GK")
            log_chisq_fit += np.log10(10)

        bitmask_flag_logg, bitmask_flag_teff = bitmask_flag[-2:]
        if bitmask_flag_logg & bad_grid_edge:
            log.debug("Increasing chi-sq because logg flag is bad edge")
            log_chisq_fit += np.log10(5)

        if bitmask_flag_teff & bad_grid_edge:
            log.debug("Increasing chi-sq because teff flag is bad edge")
            log_chisq_fit += np.log10(5)

        # Is this the best so far?
        if log_chisq_fit < best_tasks[key][0]:
            log.debug(f"Assigning this output to best task as {log_chisq_fit} < {best_tasks[key][0]}: {pk}")
            best_tasks[key] = (log_chisq_fit, pk)

    for key, (log_chisq_fit, pk) in best_tasks.items():
        if pk is None:
            log.warning(f"No good task found for key {key}: ({log_chisq_fit}, {pk})")
        else:
            log.info(f"Best task for key {key} with chi^2 of {log_chisq_fit:.2f} is primary key {pk}")

    if best_tasks:
        return [pk for (log_chisq_fit, pk) in best_tasks.values() if pk is not None]
    else:
        raise AirflowSkipException(f"no task outputs found from {len(pks)} primary keys")

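# A minimal wiring sketch (not from the source): how `get_best_result` might be
# attached to a DAG. The task id and upstream task names below are assumptions for
# illustration only. In Airflow 2 the `task` and `ti` arguments are injected from
# the context because the callable accepts them, and the returned list of primary
# keys is pushed to XCom for downstream tasks.
from airflow.operators.python import PythonOperator

# Inside a `with DAG(...)` block; the dotted task_id is chosen so that
# task.task_id.split(".")[1] resolves to the telescope short name ("APO").
best_apo = PythonOperator(
    task_id="ApogeeNet.APO.get_best_result",  # hypothetical id
    python_callable=get_best_result,
)
# Hypothetical upstream wiring: [ferre_apo, ferre_ba] >> best_apo
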
def execute(self, context):
    """
    Create task instances for all the data model identifiers, which could include
    multiple task instances for each data model identifier set.

    :param context:
        The Airflow context dictionary.
    """

    # Get header information.
    grid_info = utils.parse_grid_information(self.header_paths)

    args = (context["dag"].dag_id, context["task"].task_id, context["run_id"])

    # Get parameters from the parent class initialisation that should also be stored.
    common_task_parameters = self.common_task_parameters()

    pks = []
    trees = {}
    for data_model_identifiers in self.data_model_identifiers(context):
        parameters = {**common_task_parameters, **data_model_identifiers}

        release = parameters["release"]
        tree = trees.get(release, None)
        if tree is None:
            trees[release] = tree = SDSSPath(release=release)

        path = tree.full(**parameters)

        # Generate initial guess(es).
        initial_guesses = []

        # From headers.
        try:
            header = getheader(path)

            teff = safe_read_header(header, ("RV_TEFF", "RVTEFF"))
            logg = safe_read_header(header, ("RV_LOGG", "RVLOGG"))
            fe_h = safe_read_header(header, ("RV_FEH", "RVFEH"))

            # Get information relevant for matching initial guess and grids.
            initial_guesses.append(dict(
                telescope=parameters["telescope"],  # important for LSF information
                mean_fiber=header["MEANFIB"],       # important for LSF information
                teff=teff,
                logg=logg,
                metals=fe_h,
            ))
        except Exception:
            log.exception(f"Unable to load relevant headers from path {path}")
            continue

        # Add any other initial guesses? From Gaia? etc?
        for initial_guess in initial_guesses:
            for header_path, _ in utils.yield_suitable_grids(grid_info, **initial_guess):
                parameters.update(
                    header_path=header_path,
                    initial_teff=np.round(initial_guess["teff"], 0),
                    initial_logg=np.round(initial_guess["logg"], 3),
                    initial_metals=np.round(initial_guess["metals"], 3),
                    initial_log10vdop=np.round(utils.approximate_log10_microturbulence(initial_guess["logg"]), 3),
                    initial_o_mg_si_s_ca_ti=0.0,
                    initial_lgvsini=1.0,  # :eyes:
                    initial_c=0.0,
                    initial_n=0.0,
                )
                instance = create_task_instance(*args, parameters)
                pks.append(instance.pk)

                log.debug(f"Created {instance} with parameters {parameters}")

    if not pks:
        raise AirflowSkipException("No data model identifiers found for this time period.")
    return pks

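# A plausible sketch of the `safe_read_header` helper called above (an assumption:
# its real implementation lives elsewhere and may differ). It tries each candidate
# FITS keyword in turn, so headers written with either "RV_TEFF" or "RVTEFF" work.
def safe_read_header(header, keys, default=None):
    for key in keys:
        try:
            return header[key]
        except KeyError:
            continue
    return default
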
def execute(self, context):
    if not os.path.exists(os.path.dirname(self.folder)):
        try:
            os.makedirs(os.path.dirname(self.folder))
        except OSError as exc:
            # Guard against a race condition between the existence check and makedirs.
            if exc.errno != errno.EEXIST:
                raise
    folder = self.folder

    if self.str_files is None:
        self.str_files = common.getUpstreamVariable(self, context)
    if self.str_files is None or len(self.str_files) == 0:
        raise AirflowSkipException("There are no files")

    _nc_files = [x for x in self.str_files if ".nc" in x]
    # Keep only the NetCDF files matching the requested output type, and (when set)
    # the requested coordinates and year. Note the parentheses: without them the
    # year clause would short-circuit the whole predicate.
    _files = [
        x for x in _nc_files
        if "{}.nc".format(self.output_type) in x
        and (self.lat is None or "{}_{}".format(self.lat[0], self.lon[0]) in x)
        and (self.year is None or "_{}_".format(self.year) in x)
    ]
    _other_files = [x for x in self.str_files if ".nc" not in x]

    kwargs = self.alg_kwargs
    xarrs = {}
    for _f in _files:
        _xarr = common.readNetCDF(_f)
        if len(_xarr.data_vars) == 0:
            raise AirflowSkipException("No data inside the files")
        xarrs[os.path.basename(_f)] = _xarr

    kwargs["xarrs"] = xarrs
    kwargs["product"] = self.product
    kwargs["folder"] = folder
    kwargs["other_files"] = _other_files

    exec(
        open(
            common.ALGORITHMS_FOLDER + "/" + self.algorithm + "/" + self.algorithm + "_" + str(self.version) + ".py",
            encoding="utf-8",
        ).read(),
        kwargs,
    )

    fns = []
    history = u'Created with CDCOL using algorithm {} and version {}'.format(self.algorithm, str(self.version))

    if self.lat is not None and self.year is not None:
        _exp = "{}_{}_{}_{}_{}".format(self.task_id, str(self.algorithm), self.lat[0], self.lon[0], self.year)
    elif self.lat is not None:
        _exp = "{}_{}_{}_{}_all".format(self.task_id, str(self.algorithm), self.lat[0], self.lon[0])
    elif self.year is not None:
        _exp = "{}_{}_{}_{}_{}".format(self.task_id, str(self.algorithm), "All", "All", self.year)
    else:
        _exp = "{}_{}_{}_{}_{}".format(self.task_id, str(self.algorithm), "All", "All", "All")

    if "output" in kwargs:
        # "output" should be an xarray; save it to a file.
        output = kwargs["output"]
        if self.to_tiff:
            filename = folder + "{}_output.tif".format(_exp)
            common.write_geotiff_from_xr(filename, output)
            # Following 4 lines uncommented by Aurelio:
            # filename = folder + "{}_output.nc".format(_exp)
            # filename = folder + "{}_{}_{}_{}_{}_output.nc".format(self.task_id, str(self.algorithm),
            #                                                       _fn.split("_")[2], _fn.split("_")[3],
            #                                                       _fn.split("_")[4])
            # common.saveNC(output, filename, history)
            # common.translate_netcdf_to_tiff(self.task_id, str(self.algorithm), self.folder, [filename])
        else:
            filename = folder + "{}_output.nc".format(_exp)
            common.saveNC(output, filename, history)
        fns.append(filename)

    if "outputs" in kwargs:
        if self.to_tiff:
            for xa in kwargs["outputs"]:
                filename = folder + "{}_{}.tif".format(_exp, xa)
                common.write_geotiff_from_xr(filename, kwargs["outputs"][xa])
                fns.append(filename)
        else:
            for xa in kwargs["outputs"]:
                filename = folder + "{}_{}.nc".format(_exp, xa)
                common.saveNC(kwargs["outputs"][xa], filename, history)
                fns.append(filename)

    if "outputtxt" in kwargs:
        filename = folder + "{}.txt".format(_exp)
        with open(filename, "w") as text_file:
            text_file.write(kwargs["outputtxt"])
        fns.append(filename)

    if "outputxcom" in kwargs:
        fns.append(kwargs["outputxcom"])

    return fns

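# To illustrate the exec() contract used above (a hypothetical algorithm file such
# as "myalg/myalg_1.py"; the names assigned below are the operator's contract, the
# computation itself is assumed). The script runs with `kwargs` as its globals, so
# it reads `xarrs`, `product`, `folder`, and `other_files` as plain names and hands
# results back by assigning `output`, `outputs`, `outputtxt`, or `outputxcom`.
import xarray as xr

# `xarrs` maps file basenames to the xarray.Dataset objects the operator loaded.
combined = xr.concat(list(xarrs.values()), dim="time")

# Assigning these names makes the operator persist the results after exec() returns.
output = combined.mean(dim="time")                   # saved as GeoTIFF or NetCDF
outputtxt = "processed {} files".format(len(xarrs))  # saved as a .txt file
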
def execute(self, context):
    if not os.path.exists(os.path.dirname(self.folder)):
        try:
            os.makedirs(os.path.dirname(self.folder))
        except OSError as exc:
            # Guard against a race condition between the existence check and makedirs.
            if exc.errno != errno.EEXIST:
                raise
    folder = self.folder

    dc = datacube.Datacube(app=self.execID)
    kwargs = self.alg_kwargs
    xanm = "xarr"
    start = time.time()

    bands = []
    print(self.time_ranges)
    if self.product['bands'] is not None and len(self.product['bands']) > 0:
        bands = self.product['bands']

    if isinstance(self.time_ranges, list) and self.alg_folder == common.COMPLETE_ALGORITHMS_FOLDER:
        # One query per time range, exposed to the algorithm as xarr0, xarr1, ...
        for i, t in enumerate(self.time_ranges):
            kwargs[xanm + str(i)] = dc.load(product=self.product['name'],
                                            measurements=bands,
                                            longitude=self.lon,
                                            latitude=self.lat,
                                            time=t)
            if len(kwargs[xanm + str(i)].data_vars) == 0:
                print("ERROR: No data in the zone")
                open(posixpath.join(common.LOGS_FOLDER, self.execID, self.task_id, "no_data.lock"), "w+").close()
                raise AirflowSkipException("No data in the zone")
    else:
        kwargs[xanm + str(0)] = dc.load(product=self.product['name'],
                                        measurements=bands,
                                        longitude=self.lon,
                                        latitude=self.lat,
                                        time=self.time_ranges)
        if len(kwargs[xanm + str(0)].data_vars) == 0:
            print("ERROR: No data in the zone")
            open(posixpath.join(common.LOGS_FOLDER, self.execID, self.task_id, "no_data.lock"), "w+").close()
            raise AirflowSkipException("No data in the zone")

    dc.close()
    end = time.time()
    logging.info('Query time: ' + str(end - start))

    kwargs["product"] = self.product
    kwargs["folder"] = folder

    path = posixpath.join(self.alg_folder, self.algorithm, self.algorithm + "_" + str(self.version) + ".py")
    exec(open(path, encoding='utf-8').read(), kwargs)

    fns = []
    history = u'Created with CDCOL using algorithm {} and version {}'.format(self.algorithm, str(self.version))
    # Sanitised time-range string shared by every output filename below.
    time_slug = re.sub(r'[^\w_.)(-]', '', str(self.time_ranges))

    if "output" in kwargs:
        # "output" should be an xarray; save it to a file.
        output = kwargs["output"]
        if self.to_tiff:
            filename = folder + "{}_{}_{}_{}_{}_output.tif".format(
                self.task_id, str(self.algorithm), self.lat[0], self.lon[0], time_slug)
            common.write_geotiff_from_xr(filename, output)
            # common.saveNC(output, filename, history)
        else:
            filename = folder + "{}_{}_{}_{}_{}_output.nc".format(
                self.task_id, str(self.algorithm), self.lat[0], self.lon[0], time_slug)
            common.saveNC(output, filename, history)
        fns.append(filename)

    if "outputs" in kwargs:
        for xa in kwargs["outputs"]:
            if self.to_tiff:
                filename = folder + "{}_{}_{}_{}_{}_{}.tif".format(
                    self.task_id, str(self.algorithm), self.lat[0], self.lon[0], time_slug, xa)
                common.write_geotiff_from_xr(filename, kwargs["outputs"][xa])
            else:
                filename = folder + "{}_{}_{}_{}_{}_{}.nc".format(
                    self.task_id, str(self.algorithm), self.lat[0], self.lon[0], time_slug, xa)
                common.saveNC(kwargs["outputs"][xa], filename, history)
            fns.append(filename)

    if "outputtxt" in kwargs:
        filename = folder + "{}_{}_{}.txt".format(self.lat[0], self.lon[0], time_slug)
        with open(filename, "w") as text_file:
            text_file.write(kwargs["outputtxt"])
        fns.append(filename)

    if "outputxcom" in kwargs:
        fns.append(kwargs["outputxcom"])

    return fns

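# The same exec() contract applies to this operator, except the loaded cubes are
# injected as xarr0, xarr1, ... (one per time range). A hypothetical algorithm file
# ("ndvi/ndvi_1.py"; the band names depend on the product queried and are
# assumptions here):
import numpy as np

# xarr0 is the xarray.Dataset injected for the first (or only) time range.
nir = xarr0["nir"].astype(np.float64)
red = xarr0["red"].astype(np.float64)

# Publishing through `output` makes the operator save the NDVI composite.
output = ((nir - red) / (nir + red)).to_dataset(name="ndvi")
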
def trigger_null(context):
    raise AirflowSkipException('Intentionally not doing it')

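# A usage sketch (task id and DAG context are assumptions): a placeholder task that
# always ends up skipped, useful for an intentionally disabled branch. Because the
# callable takes the context as a positional argument, it is supplied via op_args.
from airflow.operators.python import PythonOperator

skipped_branch = PythonOperator(
    task_id="intentionally_skipped",
    python_callable=trigger_null,
    op_args=[{}],  # trigger_null ignores the context, so an empty dict suffices
)
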
def check_events_for_skips(events):
    check.list_param(events, 'events', of_type=DagsterEvent)
    skipped = any(e.event_type_value == DagsterEventType.STEP_SKIPPED.value for e in events)
    if skipped:
        raise AirflowSkipException('Dagster emitted skip event, skipping execution in Airflow')

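# A minimal usage sketch, assuming the events were collected from a Dagster
# execution inside an Airflow task; `execute_dagster_plan` is a hypothetical helper
# returning a list of DagsterEvent objects.
def dagster_step_callable(**context):
    events = execute_dagster_plan(context)
    # Translate a Dagster skip into an Airflow skip instead of a silent success.
    check_events_for_skips(events)
    return events
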