def get_metadata(fname):
    """Parse metadata from the file."""
    res = None
    for section in CONFIG.sections():
        try:
            parser = Parser(CONFIG.get(section, "pattern"))
        except NoOptionError:
            continue
        if not parser.validate(fname):
            continue
        res = parser.parse(fname)
        res.update(dict(CONFIG.items(section)))

        for key in ["watcher", "pattern", "timeliness", "regions"]:
            res.pop(key, None)

        res = trigger.fix_start_end_time(res)

        if ("sensor" in res) and ("," in res["sensor"]):
            res["sensor"] = res["sensor"].split(",")

        res["uri"] = fname
        res["filename"] = os.path.basename(fname)

    return res
def extract_filenames_in_time_window(file_list, starttime, endtime):
    """Extract the filenames with time inside the time interval specified.

    NB! Only tested for EARS-NWC granules. This does not support assembling
    several locally received full swaths.
    """
    # New EARS-NWC filenames:
    # Ex.:
    # W_XX-EUMETSAT-Darmstadt,SING+LEV+SAT,NOAA19+CT_C_EUMS_20150819124700_\
    # 33643.nc.bz2
    pnew = Parser(EARS_PPS_FILE_MASK)

    # Old EARS-NWC filenames:
    # Ex.:
    # ctth_20130910_205300_metopb.h5.bz2
    pold = Parser("{product:s}_{starttime:%Y%m%d_%H%M}00_{platform_name:s}.h5"
                  "{compression:s}")

    plocal = Parser(LOCAL_PPS_FILE_MASK)

    valid_filenames = []
    valid_times = []
    LOG.debug("Time window: (%s, %s)", str(starttime), str(endtime))
    for fname in file_list:
        try:
            data = pnew.parse(os.path.basename(fname))
        except ValueError:
            try:
                data = pold.parse(os.path.basename(fname))
            except ValueError:
                data = plocal.parse(os.path.basename(fname))

        if (data['starttime'] >= starttime and
                data['starttime'] < endtime):
            valid_filenames.append(fname)
            valid_times.append(data['starttime'])
            LOG.debug("Start time %s inside window", str(data['starttime']))
        else:
            pass

    # Can we rely on the files being sorted according to time?
    # Sort the filenames according to time:
    vtimes = np.array(valid_times)
    idx = np.argsort(vtimes)
    vfiles = np.array(valid_filenames)
    return np.take(vfiles, idx).tolist()
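For reference, this is how the old-style pattern above resolves against the example filename quoted in the comments; a standalone trollsift sketch, runnable on its own:

# Standalone sketch: parsing the old EARS-NWC example filename with the
# pattern defined above.
from trollsift import Parser

pold = Parser("{product:s}_{starttime:%Y%m%d_%H%M}00_{platform_name:s}.h5"
              "{compression:s}")
info = pold.parse("ctth_20130910_205300_metopb.h5.bz2")
# info["product"] == 'ctth'
# info["starttime"] == datetime.datetime(2013, 9, 10, 20, 53)
# info["platform_name"] == 'metopb'
# info["compression"] == '.bz2'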
def _read_cf_from_string_export(cls, blob):
    """Read blob as a string created by `to_cf`."""
    pattern = "{central:f} {unit:s} ({min:f}-{max:f} {unit2:s})"
    from trollsift import Parser
    parser = Parser(pattern)
    res_dict = parser.parse(blob)
    res_dict.pop('unit2')
    obj = cls(**res_dict)
    return obj
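The blob format is whatever `to_cf` writes; assuming a value such as '1000.0 m (500.0-2000.0 m)', the pattern unpacks it like this (standalone sketch, hypothetical blob):

# Standalone sketch with a hypothetical blob; the real format is defined
# by the corresponding `to_cf` export.
from trollsift import Parser

parser = Parser("{central:f} {unit:s} ({min:f}-{max:f} {unit2:s})")
res = parser.parse("1000.0 m (500.0-2000.0 m)")
# res == {'central': 1000.0, 'unit': 'm', 'min': 500.0,
#         'max': 2000.0, 'unit2': 'm'}
res.pop('unit2')  # 'unit2' duplicates 'unit' and is dropped before cls(**res)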
def _match_files_to_pattern(files, path, pattern):
    if pattern is not None:
        parser = Parser(posixpath.join(path, pattern))
        matching_files = []
        for file in files:
            try:
                metadata = parser.parse(file['name'])
                file['metadata'] = metadata
                matching_files.append(file)
            except ValueError:
                pass
        return matching_files
    return files
def get_metadata(fname):
    """Parse metadata from the file."""
    res = None
    for section in CONFIG.sections():
        try:
            parser = Parser(CONFIG.get(section, "pattern"))
        except NoOptionError:
            continue
        if not parser.validate(fname):
            continue
        res = parser.parse(fname)
        res.update(dict(CONFIG.items(section)))

        for key in ["watcher", "pattern", "timeliness", "regions"]:
            res.pop(key, None)

        if "duration" in res and "end_time" not in res:
            res["end_time"] = (res["start_time"] +
                               timedelta(seconds=int(res["duration"])))
        if "start_date" in res:
            res["start_time"] = datetime.combine(res["start_date"].date(),
                                                 res["start_time"].time())
            if "end_date" not in res:
                res["end_date"] = res["start_date"]
            del res["start_date"]
        if "end_date" in res:
            res["end_time"] = datetime.combine(res["end_date"].date(),
                                               res["end_time"].time())
            del res["end_date"]

        while res["start_time"] > res["end_time"]:
            res["end_time"] += timedelta(days=1)

        if "duration" in res:
            del res["duration"]

        if ("sensor" in res) and ("," in res["sensor"]):
            res["sensor"] = res["sensor"].split(",")

        res["uri"] = fname
        res["filename"] = os.path.basename(fname)

    return res
def _get_metadata(self, fname):
    """Parse metadata from the file."""
    parser = Parser(self._config_items["pattern"])
    res = parser.parse(fname)
    res.update(dict(self._config_items))

    for key in ["watcher", "pattern", "timeliness", "regions"]:
        res.pop(key, None)

    res = fix_start_end_time(res)

    if ("sensor" in res) and ("," in res["sensor"]):
        res["sensor"] = res["sensor"].split(",")

    res["uri"] = fname
    res["filename"] = os.path.basename(fname)

    return res
def get_metadata_from_filename(infile_pattern, filepath):
    """From the filename and its pattern get basic metadata of the satellite
    observations."""
    p__ = Parser(infile_pattern)
    fname = os.path.basename(filepath)
    try:
        res = p__.parse(fname)
    except ValueError:
        # Do something!
        return None

    # Fix the end time:
    endtime = datetime(res['start_time'].year,
                       res['start_time'].month,
                       res['start_time'].day,
                       res['end_hour'].hour,
                       res['end_hour'].minute,
                       res['end_hour'].second)
    if endtime < res['start_time']:
        endtime = endtime + timedelta(days=1)

    res['end_time'] = endtime

    return res
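The end-time fix is needed because the pattern only carries an end hour; the date comes from the start time and is bumped by one day for granules crossing midnight. A standalone sketch with a hypothetical pattern and filename:

# Standalone sketch of the midnight rollover; pattern and filename are
# hypothetical.
from datetime import datetime, timedelta
from trollsift import Parser

p__ = Parser("pps_{start_time:%Y%m%d%H%M}_{end_hour:%H%M}.nc")
res = p__.parse("pps_201909302355_0005.nc")

endtime = datetime(res['start_time'].year, res['start_time'].month,
                   res['start_time'].day, res['end_hour'].hour,
                   res['end_hour'].minute, res['end_hour'].second)
if endtime < res['start_time']:
    endtime += timedelta(days=1)  # granule crosses midnight
# endtime == datetime(2019, 10, 1, 0, 5)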
def get_metadata(fname):
    res = None
    for section in config.sections():
        if section == "default":
            continue
        try:
            parser = Parser(config.get(section, "pattern"))
        except NoOptionError:
            continue
        if not parser.validate(fname):
            continue
        res = parser.parse(fname)
        res.update(dict(config.items(section)))

        for key in ["watcher", "pattern", "timeliness"]:
            res.pop(key, None)

        if "duration" in res and "end_time" not in res:
            res["end_time"] = (res["start_time"] +
                               timedelta(seconds=int(res["duration"])))
        if "start_date" in res:
            res["start_time"] = datetime.combine(res["start_date"].date(),
                                                 res["start_time"].time())
            if "end_date" not in res:
                res["end_date"] = res["start_date"]
            del res["start_date"]
        if "end_date" in res:
            res["end_time"] = datetime.combine(res["end_date"].date(),
                                               res["end_time"].time())
            del res["end_date"]

        while res["start_time"] > res["end_time"]:
            res["end_time"] += timedelta(days=1)

        if "duration" in res:
            del res["duration"]

        res["uri"] = fname
        res["filename"] = os.path.basename(fname)

    return res
class PPSReader(Reader):
    """Reader class for PPS files"""
    pformat = "nc_pps_l2"

    def __init__(self, *args, **kwargs):
        Reader.__init__(self, *args, **kwargs)
        # Source of the data, 'local' or 'ears'
        self._source = None
        # Parser for getting info from the file names
        self._parser = None
        # Satellite config
        self._config = None
        # Location of geolocation files, required for 'local' products
        self._cloud_product_geodir = None
        # Name of the product having geolocation for 'local' products
        self._geolocation_product_name = None

    def _read_config(self, sat_name, instrument_name):
        '''Read config for the satellite'''
        if self._config:
            return

        self._config = ConfigParser()
        configfile = os.path.join(CONFIG_PATH, sat_name + ".cfg")
        LOG.debug("Read configfile %s", configfile)
        self._config.read(configfile)

        try:
            self._cloud_product_geodir = \
                self._config.get(instrument_name + "-level3",
                                 "cloud_product_geodir",
                                 raw=True,
                                 vars=os.environ)
        except NoOptionError:
            pass

        LOG.debug("cloud_product_geodir = %s", self._cloud_product_geodir)

        try:
            self._geolocation_product_name = \
                self._config.get(instrument_name + "-level3",
                                 "geolocation_product_name",
                                 raw=True,
                                 vars=os.environ)
        except NoOptionError:
            if self._source != 'ears':
                LOG.warning("No geolocation product name given in config, "
                            "using default: %s", GEO_PRODUCT_NAME_DEFAULT)
                self._geolocation_product_name = GEO_PRODUCT_NAME_DEFAULT

    def _determine_prod_and_geo_files(self, prodfilenames):
        """From the list of product files and the products to load determine
        the product files and the geolocation files that will be considered
        when reading the data.
        """
        # geofiles4product is a dict listing all geolocation files applicable
        # for each product.
        # prodfiles4product is a dict listing all product files for a given
        # product name
        prodfiles4product = {}
        geofiles4product = {}
        if prodfilenames:
            if not isinstance(prodfilenames, (list, set, tuple)):
                prodfilenames = [prodfilenames]
            for fname in prodfilenames:
                # Only standard NWCSAF/PPS and EARS-NWC naming accepted!
                # No support for old file names (< PPSv2014)
                if (os.path.basename(fname).startswith("S_NWC") or
                        os.path.basename(fname).startswith("W_XX-EUMETSAT")):
                    if not self._parser:
                        if os.path.basename(fname).startswith("S_NWC"):
                            self._source = 'local'
                            self._parser = Parser(LOCAL_PPS_FILE_MASK)
                        else:
                            self._source = 'ears'
                            self._parser = Parser(EARS_PPS_FILE_MASK)
                else:
                    LOG.info("Unrecognized NWCSAF/PPS file: %s", fname)
                    continue

                parse_info = self._parser.parse(os.path.basename(fname))
                prodname = parse_info['product']

                if prodname not in prodfiles4product:
                    prodfiles4product[prodname] = []

                prodfiles4product[prodname].append(fname)

            # Assemble geolocation information
            if self._source == 'ears':
                # For EARS data, the files have geolocation in themselves
                for prodname, fnames in prodfiles4product.iteritems():
                    geofiles4product[prodname] = fnames
            else:
                # For locally processed data, use the geolocation from
                # the product defined in config
                if self._geolocation_product_name in prodfiles4product:
                    for prodname in prodfiles4product.keys():
                        geofiles4product[prodname] = \
                            prodfiles4product[self._geolocation_product_name]
                else:
                    # If the product files with geolocation are not used,
                    # assume that they are still available on the disk.
                    if self._cloud_product_geodir is None:
                        LOG.warning("Config option 'cloud_product_geodir' is "
                                    "not available! Assuming same directory "
                                    "as products.")
                    for prodname in prodfiles4product.keys():
                        geofiles4product[prodname] = []
                        for fname in prodfiles4product[prodname]:
                            directory = self._cloud_product_geodir or \
                                os.path.abspath(fname)
                            parse_info = \
                                self._parser.parse(os.path.basename(fname))
                            fname = fname.replace(
                                parse_info['product'],
                                self._geolocation_product_name)
                            fname = os.path.join(directory, fname)
                            geofiles4product[prodname].append(fname)

            # Check that each product file has a corresponding geolocation
            # file:
            '''
            if self._geolocation_product_name:
                for prod in products:
                    if prod not in geofiles4product:
                        LOG.error("No product name %s in dict "
                                  "geofiles4product!", prod)
                        continue
                    if prod not in prodfiles4product:
                        LOG.error("No product name %s in dict "
                                  "prodfiles4product!", prod)
                        continue
                    if len(geofiles4product[prod]) != \
                       len(prodfiles4product[prod]):
                        LOG.error("Mismatch in number of product files and "
                                  "matching geolocation files!")
            '''

        return prodfiles4product, geofiles4product

    def load(self, satscene, **kwargs):
        """Read data from file and load it into *satscene*.
        """
        prodfilenames = kwargs.get('filename')
        time_interval = kwargs.get('time_interval')
        if prodfilenames and time_interval:
            LOG.warning("You have specified both a list of files "
                        "and a time interval")
            LOG.warning("Specifying a time interval will only take effect "
                        "if no files are specified")
            time_interval = None

        products = satscene.channels_to_load & set(PPS_PRODUCTS)
        if len(products) == 0:
            LOG.debug("No PPS cloud products to load, abort")
            return

        self._read_config(satscene.fullname, satscene.instrument_name)

        LOG.info("Products to load: %s", str(products))

        # If a list of files is provided to the load call, we disregard the
        # directory and filename specifications/definitions in the config
        # file.
        if not prodfilenames:
            try:
                area_name = satscene.area_id or satscene.area.area_id
            except AttributeError:
                area_name = "satproj_?????_?????"

            # Make the list of files for the requested products:
            if isinstance(time_interval, (tuple, set, list)) and \
               len(time_interval) == 2:
                time_start, time_end = time_interval
            else:
                time_start, time_end = satscene.time_slot, None

            LOG.debug("Start and end times: %s %s",
                      str(time_start), str(time_end))
            prodfilenames = get_filenames(satscene, products, self._config,
                                          (time_start, time_end), area_name)

        LOG.debug("Product files: %s", str(prodfilenames))

        retv = self._determine_prod_and_geo_files(prodfilenames)
        prodfiles4product, geofiles4product = retv

        # Reading the products
        classes = {"CTTH": CloudTopTemperatureHeight,
                   "CT": CloudType,
                   "CMA": CloudMask,
                   "PC": PrecipitationClouds,
                   "CPP": CloudPhysicalProperties
                   }
        nodata_mask = False

        read_external_geo = {}
        for product in products:
            LOG.debug("Loading %s", product)

            if product not in prodfiles4product:
                LOG.warning("No files found for product: %s", product)
                continue

            pps_band = PPSProductData(prodfiles4product[product]).read()
            chn = classes[product]()
            chn.read(pps_band)

            if not chn.name in satscene:
                LOG.info("Adding new channel %s", chn.name)
                satscene.channels.append(chn)

            # Check if geolocation is loaded:
            if not chn.area:
                read_external_geo[product] = satscene.channels[-1].name

        # Check if some 'channel'/product needs geolocation. If some
        # product does not have geolocation, get it from the
        # geofilename:
        from pyresample import geometry

        # Load geolocation
        for chn_name in read_external_geo.values():
            LOG.debug("ch_name = %s", str(chn_name))
            chn = satscene[chn_name]
            geofilenames = geofiles4product[chn_name]
            LOG.debug("Geo-files = %s", str(geofilenames))
            geoloc = PpsGeolocationData(chn.shape,
                                        chn.granule_lengths,
                                        geofilenames).read()

            try:
                satscene[chn.name].area = geometry.SwathDefinition(
                    lons=geoloc.longitudes, lats=geoloc.latitudes)

                area_name = ("swath_" + satscene.fullname + "_" +
                             str(satscene.time_slot) + "_" +
                             str(chn.shape) + "_" + chn.name)
                satscene[chn.name].area.area_id = area_name
                satscene[chn.name].area_id = area_name
            except ValueError:
                LOG.exception('Failed making a SwathDefinition: '
                              'min,max lons,lats = (%f,%f) (%f,%f)',
                              geoloc.longitudes.data.min(),
                              geoloc.longitudes.data.max(),
                              geoloc.latitudes.data.min(),
                              geoloc.latitudes.data.max())
                LOG.warning("No geolocation loaded for %s", str(chn_name))

        # PpsGeolocationData.clear_cache()

        return
class EventHandler(ProcessEvent):
    """
    Event handler class for inotify.

     *topic* - topic of the published messages
     *posttroll_port* - port number to publish the messages on
     *filepattern* - filepattern for finding information from the filename
    """

    def __init__(self, topic, instrument, posttroll_port=0, filepattern=None,
                 aliases=None, tbus_orbit=False, history=0, granule_length=0):
        super(EventHandler, self).__init__()

        self._pub = NoisyPublisher("trollstalker", posttroll_port, topic)
        self.pub = self._pub.start()
        self.topic = topic
        self.info = {}
        if filepattern is None:
            filepattern = '{filename}'
        self.file_parser = Parser(filepattern)
        self.instrument = instrument
        self.aliases = aliases
        self.tbus_orbit = tbus_orbit
        self.granule_length = granule_length
        self._deque = deque([], history)

    def stop(self):
        '''Stop publisher.
        '''
        self._pub.stop()

    def __clean__(self):
        '''Clean instance attributes.
        '''
        self.info = {}

    def process_IN_CLOSE_WRITE(self, event):
        """When a file is closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_WRITE")
        self.process(event)

    def process_IN_CLOSE_NOWRITE(self, event):
        """When a nonwritable file is closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_NOWRITE")
        self.process(event)

    def process_IN_MOVED_TO(self, event):
        """When a file is moved to the watched directory, process the
        associated event.
        """
        LOGGER.debug("trigger: IN_MOVED_TO")
        self.process(event)

    def process_IN_CREATE(self, event):
        """When a file is created, process the associated event.
        """
        LOGGER.debug("trigger: IN_CREATE")
        self.process(event)

    def process_IN_CLOSE_MODIFY(self, event):
        """When a file is modified and closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_MODIFY")
        self.process(event)

    def process(self, event):
        '''Process the event'''
        # New file created and closed
        if not event.dir:
            LOGGER.debug("processing %s", event.pathname)
            # parse information and create self.info dict{}
            self.parse_file_info(event)
            if len(self.info) > 0:
                # Check if this file has been recently dealt with
                if event.pathname not in self._deque:
                    self._deque.append(event.pathname)
                    message = self.create_message()
                    LOGGER.info("Publishing message %s", str(message))
                    self.pub.send(str(message))
                else:
                    LOGGER.info("Data has been published recently, skipping.")
            self.__clean__()

    def create_message(self):
        """Create broadcasted message
        """
        return Message(self.topic, 'file', self.info)

    def parse_file_info(self, event):
        '''Parse satellite and orbit information from the filename.
        Message is sent, if a matching filepattern is found.
        '''
        try:
            LOGGER.debug("filter: %s\t event: %s",
                         self.file_parser.fmt, event.pathname)
            self.info = self.file_parser.parse(
                os.path.basename(event.pathname))
            LOGGER.debug("Extracted: %s", str(self.info))
        except ValueError:
            # Filename didn't match pattern, so empty the info dict
            LOGGER.info("Couldn't extract any useful information")
            self.info = {}
        else:
            self.info['uri'] = event.pathname
            self.info['uid'] = os.path.basename(event.pathname)
            self.info['sensor'] = self.instrument.split(',')
            LOGGER.debug("self.info['sensor']: " + str(self.info['sensor']))

            if self.tbus_orbit and "orbit_number" in self.info:
                LOGGER.info("Changing orbit number by -1!")
                self.info["orbit_number"] -= 1

            # replace values with corresponding aliases, if any are given
            if self.aliases:
                info = self.info.copy()
                for key in info:
                    if key in self.aliases:
                        self.info['orig_' + key] = self.info[key]
                        self.info[key] = \
                            self.aliases[key][str(self.info[key])]

            # add start_time and end_time if not present
            try:
                base_time = self.info["time"]
            except KeyError:
                try:
                    base_time = self.info["nominal_time"]
                except KeyError:
                    base_time = self.info["start_time"]
            if "start_time" not in self.info:
                self.info["start_time"] = base_time
            if "start_date" in self.info:
                self.info["start_time"] = \
                    dt.datetime.combine(self.info["start_date"].date(),
                                        self.info["start_time"].time())
                if "end_date" not in self.info:
                    self.info["end_date"] = self.info["start_date"]
                del self.info["start_date"]
            if "end_date" in self.info:
                self.info["end_time"] = \
                    dt.datetime.combine(self.info["end_date"].date(),
                                        self.info["end_time"].time())
                del self.info["end_date"]
            if "end_time" not in self.info and self.granule_length > 0:
                self.info["end_time"] = base_time + \
                    dt.timedelta(seconds=self.granule_length)
            if "end_time" in self.info:
                while self.info["start_time"] > self.info["end_time"]:
                    self.info["end_time"] += dt.timedelta(days=1)
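The start/end stitching in `parse_file_info` handles patterns that split the date and the clock time into separate fields, and the final `while` loop rolls `end_time` past midnight when it would otherwise precede `start_time`. A standalone sketch with made-up values:

# Standalone sketch of the date stitching and midnight rollover above,
# with made-up values.
import datetime as dt

info = {"start_date": dt.datetime(2020, 6, 1),
        "start_time": dt.datetime(1900, 1, 1, 23, 50),  # clock time only
        "end_time": dt.datetime(1900, 1, 1, 0, 10)}     # clock time only

info["start_time"] = dt.datetime.combine(info["start_date"].date(),
                                         info["start_time"].time())
info["end_time"] = dt.datetime.combine(info["start_date"].date(),
                                       info["end_time"].time())
del info["start_date"]

# end_time landed before start_time, so the pass crosses midnight:
while info["start_time"] > info["end_time"]:
    info["end_time"] += dt.timedelta(days=1)

# start: 2020-06-01 23:50:00, end: 2020-06-02 00:10:00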
class GeoGatherer(object):
    """Gatherer for geostationary satellite segments"""

    def __init__(self, config, section):
        self._config = config
        self._section = section
        topics = config.get(section, 'topics').split()
        services = ""
        if config.has_option(section, 'services'):
            services = config.get(section, 'services').split()
        self._listener = ListenerContainer(topics=topics, services=services)
        self._publisher = publisher.NoisyPublisher("geo_gatherer")
        self._subject = config.get(section, "publish_topic")
        self._pattern = config.get(section, 'pattern')
        self._providing_server = None
        if config.has_option(section, 'providing_server'):
            self._providing_server = config.get(section, 'providing_server')
        self._parser = Parser(self._pattern)

        try:
            self._timeliness = dt.timedelta(
                seconds=config.getint(section, "timeliness"))
        except (NoOptionError, ValueError):
            self._timeliness = dt.timedelta(seconds=20)
        self._timeout = None
        self.metadata = {}
        self.received_files = set()
        self.wanted_files = set()
        self.all_files = set()
        self.critical_files = set()
        self.delayed_files = OrderedDict()

        self.logger = logging.getLogger("geo_gatherer")
        self._loop = False

    def _clear_data(self):
        """Clear data."""
        self._timeout = None
        self.metadata = {}
        self.received_files = set()
        self.wanted_files = set()
        self.all_files = set()
        self.critical_files = set()
        self.delayed_files = OrderedDict()

    def _init_data(self, msg):
        """Init wanted, all and critical files"""
        # Init metadata struct
        for key in msg.data:
            if key not in ("uid", "uri", "channel_name", "segment"):
                self.metadata[key] = msg.data[key]
        self.metadata['dataset'] = []

        # Critical files that are required, otherwise production will fail
        self.critical_files = \
            self._compose_filenames(self._config.get(self._section,
                                                     "critical_files"))
        # These files are wanted, but not critical for production
        self.wanted_files = \
            self._compose_filenames(self._config.get(self._section,
                                                     "wanted_files"))
        self.all_files = \
            self._compose_filenames(self._config.get(self._section,
                                                     "all_files"))

    def _compose_filenames(self, itm_str):
        """Compose filename set()s based on a pattern and item string.
        itm_str is formatted like ':PRO,:EPI' or 'VIS006:8,VIS008:1-8,...'"""

        # Empty set
        result = set()

        # Get copy of metadata
        meta = self.metadata.copy()
        for itm in itm_str.split(','):
            channel_name, segments = itm.split(':')
            segments = segments.split('-')
            if len(segments) > 1:
                segments = ['%06d' % i
                            for i in range(int(segments[0]),
                                           int(segments[-1]) + 1)]
            meta['channel_name'] = channel_name
            for seg in segments:
                meta['segment'] = seg
                fname = self._parser.compose(meta)
                result.add(fname)

        return result

    def _publish(self):
        """Publish file dataset and reinitialize gatherer."""

        # Diagnostic logging about delayed ...
        if len(self.delayed_files) > 0:
            file_str = ''
            for key in self.delayed_files:
                file_str += "%s %f seconds, " % (key, self.delayed_files[key])
            self.logger.warning("Files received late: %s",
                                file_str.strip(', '))
        # and missing files
        missing_files = self.all_files.difference(self.received_files)
        if len(missing_files) > 0:
            self.logger.warning("Missing files: %s", ', '.join(missing_files))

        msg = message.Message(self._subject, "dataset", self.metadata)
        self.logger.info("Sending: %s", str(msg))
        self._publisher.send(str(msg))

        self._clear_data()

    def set_logger(self, logger):
        """Set logger."""
        self.logger = logger

    def collection_ready(self):
        """Determine if collection is ready to be published."""
        # If no files have been collected, return False
        if len(self.received_files) == 0:
            return False
        # If all wanted files have been received, return True
        if self.wanted_files.union(self.critical_files).issubset(
                self.received_files):
            return True
        # If all critical files have been received ...
        if self.critical_files.issubset(self.received_files):
            # and timeout is reached, return True
            if self._timeout is not None and \
               self._timeout <= dt.datetime.utcnow():
                return True
            # else, set timeout if not already running
            else:
                if self._timeout is None:
                    self._timeout = dt.datetime.utcnow() + self._timeliness
                    self.logger.info("Setting timeout to %s",
                                     str(self._timeout))
                return False

        # In other cases continue gathering
        return False

    def run(self):
        """Run GeoGatherer"""
        self._publisher.start()
        self._loop = True
        while self._loop:
            # Check if collection is ready for publication
            if self.collection_ready():
                self._publish()

            # Check listener for new messages
            msg = None
            try:
                msg = self._listener.output_queue.get(True, 1)
            except AttributeError:
                msg = self._listener.queue.get(True, 1)
            except KeyboardInterrupt:
                self.stop()
                continue
            except Queue.Empty:
                continue

            if msg.type == "file":
                self.logger.info("New message received: %s", str(msg))
                self.process(msg)

    def stop(self):
        """Stop gatherer."""
        self.logger.info("Stopping gatherer.")
        self._loop = False
        if self._listener is not None:
            self._listener.stop()
        if self._publisher is not None:
            self._publisher.stop()

    def process(self, msg):
        """Process message"""
        if self._providing_server and self._providing_server != msg.host:
            return

        mda = self._parser.parse(msg.data["uid"])
        if msg.data['uid'] in self.received_files:
            return

        # Init metadata etc if this is the first file
        if len(self.metadata) == 0:
            self._init_data(msg)
        # If the nominal time of the new segment is later than the
        # current metadata has, ...
        elif mda["nominal_time"] > self.metadata["nominal_time"]:
            # timeout ...
            self._timeout = dt.datetime.utcnow()
            # and check if the collection is ready and publish
            if self.collection_ready():
                self._publish()
                self._clear_data()
                self._init_data(msg)
            # or discard data and start new collection
            else:
                self.logger.warning("Collection not finished before new "
                                    "started")
                missing_files = self.all_files.difference(self.received_files)
                self.logger.warning("Missing files: %s", missing_files)
                self._clear_data()
                self._init_data(msg)

        # Add uid and uri
        self.metadata['dataset'].append({'uri': msg.data['uri'],
                                         'uid': msg.data['uid']})

        # If critical files have been received but the collection is
        # not complete, add the file to list of delayed files
        if self.critical_files.issubset(self.received_files):
            delay = dt.datetime.utcnow() - (self._timeout - self._timeliness)
            self.delayed_files[msg.data['uid']] = delay.total_seconds()

        # Add to received files
        self.received_files.add(msg.data['uid'])
class GeoGatherer(object):
    """Gatherer for geostationary satellite segments"""

    def __init__(self, config, section):
        self._config = config
        self._section = section
        topics = config.get(section, 'topics').split()
        self._listener = ListenerContainer(topics=topics)
        self._publisher = publisher.NoisyPublisher("geo_gatherer")
        self._subject = config.get(section, "publish_topic")
        self._pattern = config.get(section, 'pattern')
        self._parser = Parser(self._pattern)

        try:
            self._timeliness = dt.timedelta(
                seconds=config.getint(section, "timeliness"))
        except (NoOptionError, ValueError):
            self._timeliness = dt.timedelta(seconds=20)
        self._timeout = None
        self.metadata = {}
        self.received_files = set()
        self.wanted_files = set()
        self.all_files = set()
        self.critical_files = set()
        self.delayed_files = OrderedDict()

        self.logger = logging.getLogger("geo_gatherer")
        self._loop = False

    def _clear_data(self):
        """Clear data."""
        self._timeout = None
        self.metadata = {}
        self.received_files = set()
        self.wanted_files = set()
        self.all_files = set()
        self.critical_files = set()
        self.delayed_files = OrderedDict()

    def _init_data(self, msg):
        """Init wanted, all and critical files"""
        # Init metadata struct
        for key in msg.data:
            if key not in ("uid", "uri", "channel_name", "segment"):
                self.metadata[key] = msg.data[key]
        self.metadata['dataset'] = []

        # Critical files that are required, otherwise production will fail
        self.critical_files = \
            self._compose_filenames(self._config.get(self._section,
                                                     "critical_files"))
        # These files are wanted, but not critical for production
        self.wanted_files = \
            self._compose_filenames(self._config.get(self._section,
                                                     "wanted_files"))
        self.all_files = \
            self._compose_filenames(self._config.get(self._section,
                                                     "all_files"))

    def _compose_filenames(self, itm_str):
        """Compose filename set()s based on a pattern and item string.
        itm_str is formatted like ':PRO,:EPI' or 'VIS006:8,VIS008:1-8,...'"""

        # Empty set
        result = set()

        # Get copy of metadata
        meta = self.metadata.copy()
        for itm in itm_str.split(','):
            channel_name, segments = itm.split(':')
            segments = segments.split('-')
            if len(segments) > 1:
                segments = ['%06d' % i
                            for i in range(int(segments[0]),
                                           int(segments[-1]) + 1)]
            meta['channel_name'] = channel_name
            for seg in segments:
                meta['segment'] = seg
                fname = self._parser.compose(meta)
                result.add(fname)

        return result

    def _publish(self):
        """Publish file dataset and reinitialize gatherer."""

        # Diagnostic logging about delayed ...
        if len(self.delayed_files) > 0:
            file_str = ''
            for key in self.delayed_files:
                file_str += "%s %f seconds, " % (key, self.delayed_files[key])
            self.logger.warning("Files received late: %s",
                                file_str.strip(', '))
        # and missing files
        missing_files = self.all_files.difference(self.received_files)
        if len(missing_files) > 0:
            self.logger.warning("Missing files: %s", ', '.join(missing_files))

        msg = message.Message(self._subject, "dataset", self.metadata)
        self.logger.info("Sending: %s", str(msg))
        self._publisher.send(str(msg))

        self._clear_data()

    def set_logger(self, logger):
        """Set logger."""
        self.logger = logger

    def collection_ready(self):
        """Determine if collection is ready to be published."""
        # If no files have been collected, return False
        if len(self.received_files) == 0:
            return False
        # If all wanted files have been received, return True
        if self.wanted_files.union(self.critical_files).issubset(
                self.received_files):
            return True
        # If all critical files have been received ...
        if self.critical_files.issubset(self.received_files):
            # and timeout is reached, return True
            if self._timeout is not None and \
               self._timeout <= dt.datetime.utcnow():
                return True
            # else, set timeout if not already running
            else:
                if self._timeout is None:
                    self._timeout = dt.datetime.utcnow() + self._timeliness
                    self.logger.info("Setting timeout to %s",
                                     str(self._timeout))
                return False

        # In other cases continue gathering
        return False

    def run(self):
        """Run GeoGatherer"""
        self._publisher.start()
        self._loop = True
        while self._loop:
            # Check if collection is ready for publication
            if self.collection_ready():
                self._publish()

            # Check listener for new messages
            msg = None
            try:
                msg = self._listener.queue.get(True, 1)
            except KeyboardInterrupt:
                self.stop()
                continue
            except Queue.Empty:
                continue

            if msg.type == "file":
                self.logger.info("New message received: %s", str(msg))
                self.process(msg)

    def stop(self):
        """Stop gatherer."""
        self.logger.info("Stopping gatherer.")
        self._loop = False
        if self._listener is not None:
            self._listener.stop()
        if self._publisher is not None:
            self._publisher.stop()

    def process(self, msg):
        """Process message"""
        mda = self._parser.parse(msg.data["uid"])
        if msg.data['uid'] in self.received_files:
            return

        # Init metadata etc if this is the first file
        if len(self.metadata) == 0:
            self._init_data(msg)
        # If the nominal time of the new segment is later than the
        # current metadata has, ...
        elif mda["nominal_time"] > self.metadata["nominal_time"]:
            # timeout ...
            self._timeout = dt.datetime.utcnow()
            # and check if the collection is ready and publish
            if self.collection_ready():
                self._publish()
                self._clear_data()
                self._init_data(msg)
            # or discard data and start new collection
            else:
                self.logger.warning("Collection not finished before new "
                                    "started")
                missing_files = self.all_files.difference(self.received_files)
                self.logger.warning("Missing files: %s", missing_files)
                self._clear_data()
                self._init_data(msg)

        # Add uid and uri
        self.metadata['dataset'].append({'uri': msg.data['uri'],
                                         'uid': msg.data['uid']})

        # If critical files have been received but the collection is
        # not complete, add the file to list of delayed files
        if self.critical_files.issubset(self.received_files):
            delay = dt.datetime.utcnow() - (self._timeout - self._timeliness)
            self.delayed_files[msg.data['uid']] = delay.total_seconds()

        # Add to received files
        self.received_files.add(msg.data['uid'])
class SegmentGatherer(object):
    """Gatherer for geostationary satellite segments and multifile polar
    satellite granules."""

    def __init__(self, config, section):
        self._config = config
        self._section = section
        topics = config.get(section, 'topics').split()

        try:
            nameservers = config.get(section, 'nameserver')
            nameservers = nameservers.split()
        except (NoOptionError, ValueError):
            nameservers = []

        try:
            addresses = config.get(section, 'addresses')
            addresses = addresses.split()
        except (NoOptionError, ValueError):
            addresses = None

        try:
            publish_port = config.get(section, 'publish_port')
        except NoOptionError:
            publish_port = 0

        try:
            services = config.get(section, 'services').split()
        except (NoOptionError, ValueError):
            services = ""

        self._listener = ListenerContainer(topics=topics,
                                           addresses=addresses,
                                           services=services)
        self._publisher = publisher.NoisyPublisher("segment_gatherer",
                                                   port=publish_port,
                                                   nameservers=nameservers)
        self._subject = config.get(section, "publish_topic")
        self._pattern = config.get(section, 'pattern')
        self._parser = Parser(self._pattern)
        try:
            self._time_tolerance = config.getint(section, "time_tolerance")
        except NoOptionError:
            self._time_tolerance = 30
        try:
            self._timeliness = dt.timedelta(
                seconds=config.getint(section, "timeliness"))
        except (NoOptionError, ValueError):
            self._timeliness = dt.timedelta(seconds=1200)

        try:
            self._num_files_premature_publish = \
                config.getint(section, "num_files_premature_publish")
        except (NoOptionError, ValueError):
            self._num_files_premature_publish = -1

        self.slots = OrderedDict()

        self.time_name = config.get(section, 'time_name')

        self.logger = logging.getLogger("segment_gatherer")
        self._loop = False
        self._providing_server = None
        if config.has_option(section, 'providing_server'):
            self._providing_server = config.get(section, 'providing_server')

    def _clear_data(self, time_slot):
        """Clear data."""
        if time_slot in self.slots:
            del self.slots[time_slot]

    def _init_data(self, mda):
        """Init wanted, all and critical files"""
        # Init metadata struct
        metadata = mda.copy()
        metadata['dataset'] = []

        time_slot = str(metadata[self.time_name])
        self.logger.debug("Adding new slot: %s", time_slot)
        self.slots[time_slot] = {}
        self.slots[time_slot]['metadata'] = metadata.copy()

        # Critical files that are required, otherwise production will fail.
        # If there are no critical files, empty set([]) is used.
        try:
            critical_segments = self._config.get(self._section,
                                                 "critical_files")
            self.slots[time_slot]['critical_files'] = \
                self._compose_filenames(time_slot, critical_segments)
        except (NoOptionError, ValueError):
            self.slots[time_slot]['critical_files'] = set([])

        # These files are wanted, but not critical to production
        self.slots[time_slot]['wanted_files'] = \
            self._compose_filenames(time_slot,
                                    self._config.get(self._section,
                                                     "wanted_files"))
        # Name of all the files
        self.slots[time_slot]['all_files'] = \
            self._compose_filenames(time_slot,
                                    self._config.get(self._section,
                                                     "all_files"))

        self.slots[time_slot]['received_files'] = set([])
        self.slots[time_slot]['delayed_files'] = dict()
        self.slots[time_slot]['missing_files'] = set([])
        self.slots[time_slot]['timeout'] = None
        self.slots[time_slot]['files_till_premature_publish'] = \
            self._num_files_premature_publish

    def _compose_filenames(self, time_slot, itm_str):
        """Compose filename set()s based on a pattern and item string.
        itm_str is formatted like ':PRO,:EPI' or 'VIS006:8,VIS008:1-8,...'"""

        # Empty set
        result = set()

        # Get copy of metadata
        meta = self.slots[time_slot]['metadata'].copy()

        # Replace variable tags (such as processing time) with
        # wildcards, as these can't be forecasted.
        try:
            meta = _copy_without_ignore_items(
                meta, ignored_keys=self._config.get(self._section,
                                                    'variable_tags').split(','))
        except NoOptionError:
            pass

        for itm in itm_str.split(','):
            channel_name, segments = itm.split(':')
            segments = segments.split('-')
            if len(segments) > 1:
                format_string = '%d'
                if len(segments[0]) > 1 and segments[0][0] == '0':
                    format_string = '%0' + str(len(segments[0])) + 'd'
                segments = [format_string % i
                            for i in range(int(segments[0]),
                                           int(segments[-1]) + 1)]
            meta['channel_name'] = channel_name
            for seg in segments:
                meta['segment'] = seg
                fname = self._parser.globify(meta)
                result.add(fname)

        return result

    def _publish(self, time_slot, missing_files_check=True):
        """Publish file dataset and reinitialize gatherer."""

        data = self.slots[time_slot]

        # Diagnostic logging about delayed ...
        delayed_files = data['delayed_files']
        if len(delayed_files) > 0:
            file_str = ''
            for key in delayed_files:
                file_str += "%s %f seconds, " % (key, delayed_files[key])
            self.logger.warning("Files received late: %s",
                                file_str.strip(', '))

        if missing_files_check:
            # and missing files
            missing_files = data['all_files'].difference(
                data['received_files'])
            if len(missing_files) > 0:
                self.logger.warning("Missing files: %s",
                                    ', '.join(missing_files))

        # Remove tags that are not necessary for datasets
        for tag in REMOVE_TAGS:
            try:
                del data['metadata'][tag]
            except KeyError:
                pass

        msg = message.Message(self._subject, "dataset", data['metadata'])
        self.logger.info("Sending: %s", str(msg))
        self._publisher.send(str(msg))

        # self._clear_data(time_slot)

    def set_logger(self, logger):
        """Set logger."""
        self.logger = logger

    def update_timeout(self, slot):
        slot['timeout'] = dt.datetime.utcnow() + self._timeliness
        time_slot = str(slot['metadata'][self.time_name])
        self.logger.info("Setting timeout to %s for slot %s.",
                         str(slot['timeout']), time_slot)

    def slot_ready(self, slot):
        """Determine if slot is ready to be published."""
        # If no files have been collected, return False
        if len(slot['received_files']) == 0:
            return SLOT_NOT_READY

        time_slot = str(slot['metadata'][self.time_name])

        wanted_and_critical_files = slot[
            'wanted_files'].union(slot['critical_files'])
        num_wanted_and_critical_files_received = len(
            wanted_and_critical_files & slot['received_files'])

        self.logger.debug("Got %s wanted or critical files in slot %s.",
                          num_wanted_and_critical_files_received,
                          time_slot)

        if num_wanted_and_critical_files_received \
                == slot['files_till_premature_publish']:
            slot['files_till_premature_publish'] = -1
            return SLOT_READY_BUT_WAIT_FOR_MORE

        # If all wanted files have been received, return True
        if wanted_and_critical_files.issubset(slot['received_files']):
            self.logger.info("All files received for slot %s.", time_slot)
            return SLOT_READY

        if slot['timeout'] is None:
            self.update_timeout(slot)

        if slot['timeout'] < dt.datetime.utcnow():
            if slot['critical_files'].issubset(slot['received_files']):
                # All critical files have been received
                # Timeout reached, collection ready
                self.logger.info("Timeout occurred, required files received "
                                 "for slot %s.", time_slot)
                return SLOT_READY
            else:
                # Timeout reached, collection is obsolete
                self.logger.warning("Timeout occurred and required files "
                                    "were not present, data discarded for "
                                    "slot %s.", time_slot)
                return SLOT_OBSOLETE_TIMEOUT

        # Timeout not reached, wait for more files
        return SLOT_NOT_READY

    def run(self):
        """Run SegmentGatherer"""
        self._publisher.start()
        self._loop = True
        while self._loop:
            # Check if there are slots ready for publication
            slots = self.slots.copy()
            for slot in slots:
                slot = str(slot)
                status = self.slot_ready(slots[slot])
                if status == SLOT_READY:
                    # Collection ready, publish and remove
                    self._publish(slot)
                    self._clear_data(slot)
                if status == SLOT_READY_BUT_WAIT_FOR_MORE:
                    # Collection ready, publish but wait for more
                    self._publish(slot, missing_files_check=False)
                elif status == SLOT_OBSOLETE_TIMEOUT:
                    # Collection unfinished and obsolete, discard
                    self._clear_data(slot)
                else:
                    # Collection unfinished, wait for more data
                    pass

            # Check listener for new messages
            msg = None
            try:
                msg = self._listener.output_queue.get(True, 1)
            except AttributeError:
                msg = self._listener.queue.get(True, 1)
            except KeyboardInterrupt:
                self.stop()
                continue
            except Queue.Empty:
                continue

            if msg.type == "file":
                if (self._providing_server and
                        self._providing_server != msg.host):
                    continue
                self.logger.info("New message received: %s", str(msg))
                self.process(msg)

    def stop(self):
        """Stop gatherer."""
        self.logger.info("Stopping gatherer.")
        self._loop = False
        if self._listener is not None:
            self._listener.stop()
        if self._publisher is not None:
            self._publisher.stop()

    def process(self, msg):
        """Process message"""
        try:
            mda = self._parser.parse(msg.data["uid"])
        except ValueError:
            self.logger.debug("Unknown file, skipping.")
            return

        metadata = {}

        # Use values parsed from the filename as basis
        for key in mda:
            if key not in DO_NOT_COPY_KEYS:
                metadata[key] = mda[key]

        # Update with data given in the message
        for key in msg.data:
            if key not in DO_NOT_COPY_KEYS:
                metadata[key] = msg.data[key]

        time_slot = self._find_time_slot(metadata[self.time_name])

        # Init metadata etc if this is the first file
        if time_slot not in self.slots:
            self._init_data(metadata)
            slot = self.slots[time_slot]
            to_add = []
            for filename in slot['all_files']:
                if filename == msg.data['uid']:
                    continue
                url = urlparse(msg.data['uri'])
                path = os.path.join(os.path.dirname(url.path), filename)
                if not os.path.exists(path):
                    continue
                new_url = list(url)
                new_url[2] = path
                uri = urlunparse(new_url)
                slot['metadata']['dataset'].append({'uri': uri,
                                                    'uid': filename})
                to_add.append(filename)
            slot['received_files'].update(to_add)
            if to_add:
                self.logger.debug("Some files were already received %s",
                                  str(to_add))
                self.update_timeout(slot)

        slot = self.slots[time_slot]

        # Replace variable tags (such as processing time) with
        # wildcards, as these can't be forecasted.
        try:
            mda = _copy_without_ignore_items(
                mda, ignored_keys=self._config.get(self._section,
                                                   'variable_tags').split(','))
        except NoOptionError:
            pass

        mask = self._parser.globify(mda)

        if mask in slot['received_files']:
            return

        # Add uid and uri
        slot['metadata']['dataset'].append({'uri': msg.data['uri'],
                                            'uid': msg.data['uid']})

        # Collect all sensors, not only the latest
        if type(msg.data["sensor"]) not in (tuple, list, set):
            msg.data["sensor"] = [msg.data["sensor"]]
        for sensor in msg.data["sensor"]:
            if "sensor" not in slot["metadata"]:
                slot["metadata"]["sensor"] = []
            if sensor not in slot["metadata"]["sensor"]:
                slot["metadata"]["sensor"].append(sensor)

        # If critical files have been received but the slot is
        # not complete, add the file to list of delayed files
        if len(slot['critical_files']) > 0 and \
           slot['critical_files'].issubset(slot['received_files']):
            delay = dt.datetime.utcnow() - (slot['timeout'] -
                                            self._timeliness)
            slot['delayed_files'][msg.data['uid']] = delay.total_seconds()

        # Add to received files
        slot['received_files'].add(mask)

    def _find_time_slot(self, time_obj):
        """Find time slot and return the slot as a string.  If no slots are
        close enough, return *str(time_obj)*"""
        for slot in self.slots:
            time_slot = self.slots[slot]['metadata'][self.time_name]
            time_diff = time_obj - time_slot
            if abs(time_diff.total_seconds()) < self._time_tolerance:
                self.logger.debug("Found existing time slot, using that")
                return str(time_slot)
        return str(time_obj)
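Note that this variant builds the expected segment names with `Parser.globify` rather than `compose` (as the older gatherer further below does): fields missing from the metadata, such as non-forecastable processing times, become glob wildcards instead of raising an error. A standalone sketch with a made-up pattern:

# Standalone sketch of globify with a made-up segment pattern; the
# unspecified {proc_time} field becomes a wildcard.
from datetime import datetime
from trollsift import Parser

p = Parser("{platform}_{channel_name}_{segment}_"
           "{nominal_time:%Y%m%d%H%M}_{proc_time}.hrit")
meta = {"platform": "MSG4", "channel_name": "VIS006", "segment": "000008",
        "nominal_time": datetime(2020, 6, 1, 12, 0)}
print(p.globify(meta))
# -> MSG4_VIS006_000008_202006011200_*.hrit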
class EventHandler(ProcessEvent):
    """
    Event handler class for inotify.

     *topic* - topic of the published messages
     *posttroll_port* - port number to publish the messages on
     *filepattern* - filepattern for finding information from the filename
    """

    def __init__(self, topic, instrument, config_item, posttroll_port=0,
                 filepattern=None, aliases=None, tbus_orbit=False, history=0,
                 granule_length=0, custom_vars=None, nameservers=[],
                 watchManager=None):
        super(EventHandler, self).__init__()

        self._pub = NoisyPublisher("trollstalker_" + config_item,
                                   posttroll_port, topic,
                                   nameservers=nameservers)
        self.pub = self._pub.start()
        self.topic = topic
        self.info = OrderedDict()
        if filepattern is None:
            filepattern = '{filename}'
        self.file_parser = Parser(filepattern)
        self.instrument = instrument
        self.aliases = aliases
        self.custom_vars = custom_vars
        self.tbus_orbit = tbus_orbit
        self.granule_length = granule_length
        self._deque = deque([], history)
        self._watchManager = watchManager
        self._watched_dirs = dict()

    def stop(self):
        '''Stop publisher.
        '''
        self._pub.stop()

    def __clean__(self):
        '''Clean instance attributes.
        '''
        self.info = OrderedDict()

    def process_IN_CLOSE_WRITE(self, event):
        """When a file is closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_WRITE")
        self.process(event)

    def process_IN_CLOSE_NOWRITE(self, event):
        """When a nonwritable file is closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_NOWRITE")
        self.process(event)

    def process_IN_MOVED_TO(self, event):
        """When a file is moved to the watched directory, process the
        associated event.
        """
        LOGGER.debug("trigger: IN_MOVED_TO")
        self.process(event)

    def process_IN_CREATE(self, event):
        """When a file is created, process the associated event.
        """
        LOGGER.debug("trigger: IN_CREATE")
        self.process(event)

    def process_IN_CLOSE_MODIFY(self, event):
        """When a file is modified and closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_MODIFY")
        self.process(event)

    def process_IN_DELETE(self, event):
        """On delete."""
        if (event.mask & pyinotify.IN_ISDIR):
            try:
                try:
                    self._watchManager.rm_watch(
                        self._watched_dirs[event.pathname], quiet=False)
                except pyinotify.WatchManagerError:
                    # As the directory is deleted prior to removing the
                    # watch, pyinotify will log an error message. This is
                    # OK, so just pass the exception.
                    LOGGER.debug("Removed watch: {}".format(event.pathname))
                    pass
                finally:
                    del self._watched_dirs[event.pathname]
            except KeyError:
                LOGGER.warning(
                    "Dir {} not watched by inotify. Can not delete "
                    "watch.".format(event.pathname))
        return

    def process(self, event):
        '''Process the event'''
        # New file created and closed
        if not event.dir:
            LOGGER.debug("processing %s", event.pathname)
            # parse information and create self.info OrderedDict{}
            self.parse_file_info(event)
            if len(self.info) > 0:
                # Check if this file has been recently dealt with
                if event.pathname not in self._deque:
                    self._deque.append(event.pathname)
                    message = self.create_message()
                    LOGGER.info("Publishing message %s", str(message))
                    self.pub.send(str(message))
                else:
                    LOGGER.info("Data has been published recently, skipping.")
            self.__clean__()
        elif (event.mask & pyinotify.IN_ISDIR):
            tmask = (pyinotify.IN_CLOSE_WRITE |
                     pyinotify.IN_MOVED_TO |
                     pyinotify.IN_CREATE |
                     pyinotify.IN_DELETE)
            try:
                self._watched_dirs.update(
                    self._watchManager.add_watch(event.pathname, tmask))
                LOGGER.debug("Added watch on dir: {}".format(event.pathname))
            except AttributeError:
                LOGGER.error("No watchmanager given. Can not add watch "
                             "on {}".format(event.pathname))
                pass

    def create_message(self):
        """Create broadcasted message
        """
        return Message(self.topic, 'file', dict(self.info))

    def parse_file_info(self, event):
        '''Parse satellite and orbit information from the filename.
        Message is sent, if a matching filepattern is found.
        '''
        try:
            LOGGER.debug("filter: %s\t event: %s",
                         self.file_parser.fmt, event.pathname)
            pathname_join = os.path.basename(event.pathname)
            if 'origin_inotify_base_dir_skip_levels' in self.custom_vars:
                pathname_list = event.pathname.split('/')
                pathname_join = "/".join(pathname_list[int(
                    self.custom_vars['origin_inotify_base_dir_skip_levels']):])
            else:
                LOGGER.debug("No origin_inotify_base_dir_skip_levels in "
                             "self.custom_vars")

            self.info = OrderedDict()
            self.info.update(self.file_parser.parse(pathname_join))
            LOGGER.debug("Extracted: %s", str(self.info))
        except ValueError:
            # Filename didn't match pattern, so empty the info dict
            LOGGER.info("Couldn't extract any useful information")
            self.info = OrderedDict()
        else:
            self.info['uri'] = event.pathname
            self.info['uid'] = os.path.basename(event.pathname)
            self.info['sensor'] = self.instrument.split(',')
            LOGGER.debug("self.info['sensor']: " + str(self.info['sensor']))

            if self.tbus_orbit and "orbit_number" in self.info:
                LOGGER.info("Changing orbit number by -1!")
                self.info["orbit_number"] -= 1

            # replace values with corresponding aliases, if any are given
            if self.aliases:
                info = self.info.copy()
                for key in info:
                    if key in self.aliases:
                        self.info['orig_' + key] = self.info[key]
                        self.info[key] = \
                            self.aliases[key][str(self.info[key])]

            # add start_time and end_time if not present
            try:
                base_time = self.info["time"]
            except KeyError:
                try:
                    base_time = self.info["nominal_time"]
                except KeyError:
                    base_time = self.info["start_time"]
            if "start_time" not in self.info:
                self.info["start_time"] = base_time
            if "start_date" in self.info:
                self.info["start_time"] = \
                    dt.datetime.combine(self.info["start_date"].date(),
                                        self.info["start_time"].time())
                if "end_date" not in self.info:
                    self.info["end_date"] = self.info["start_date"]
                del self.info["start_date"]
            if "end_date" in self.info:
                self.info["end_time"] = \
                    dt.datetime.combine(self.info["end_date"].date(),
                                        self.info["end_time"].time())
                del self.info["end_date"]
            if "end_time" not in self.info and self.granule_length > 0:
                self.info["end_time"] = base_time + \
                    dt.timedelta(seconds=self.granule_length)
            if "end_time" in self.info:
                while self.info["start_time"] > self.info["end_time"]:
                    self.info["end_time"] += dt.timedelta(days=1)

            if self.custom_vars is not None:
                for var_name in self.custom_vars:
                    var_pattern = self.custom_vars[var_name]
                    var_val = None
                    if '%' in var_pattern:
                        var_val = helper_functions.create_aligned_datetime_var(
                            var_pattern, self.info)
                    if var_val is None:
                        var_val = compose(var_pattern, self.info)
                    self.info[var_name] = var_val
class SegmentGatherer(object):
    """Gatherer for geostationary satellite segments and multifile polar
    satellite granules."""

    def __init__(self, config, section):
        self._config = config
        self._section = section
        topics = config.get(section, 'topics').split()
        self._listener = ListenerContainer(topics=topics)
        self._publisher = publisher.NoisyPublisher("segment_gatherer")
        self._subject = config.get(section, "publish_topic")
        self._pattern = config.get(section, 'pattern')
        self._parser = Parser(self._pattern)

        try:
            self._timeliness = dt.timedelta(
                seconds=config.getint(section, "timeliness"))
        except (NoOptionError, ValueError):
            self._timeliness = dt.timedelta(seconds=1200)

        try:
            self._num_files_premature_publish = \
                config.getint(section, "num_files_premature_publish")
        except (NoOptionError, ValueError):
            self._num_files_premature_publish = -1

        self.slots = OrderedDict()

        self.time_name = config.get(section, 'time_name')

        self.logger = logging.getLogger("segment_gatherer")
        self._loop = False

    def _clear_data(self, time_slot):
        """Clear data."""
        if time_slot in self.slots:
            del self.slots[time_slot]

    def _init_data(self, msg, mda):
        """Init wanted, all and critical files"""
        # Init metadata struct
        metadata = {}
        for key in msg.data:
            if key not in ("uid", "uri", "channel_name", "segment"):
                metadata[key] = msg.data[key]
        metadata['dataset'] = []

        # Use also metadata parsed from the filenames
        metadata.update(mda)

        time_slot = str(metadata[self.time_name])
        self.slots[time_slot] = {}
        self.slots[time_slot]['metadata'] = metadata.copy()

        # Critical files that are required, otherwise production will fail.
        # If there are no critical files, empty set([]) is used.
        try:
            critical_segments = self._config.get(self._section,
                                                 "critical_files")
            self.slots[time_slot]['critical_files'] = \
                self._compose_filenames(time_slot, critical_segments)
        except (NoOptionError, ValueError):
            self.slots[time_slot]['critical_files'] = set([])

        # These files are wanted, but not critical to production
        self.slots[time_slot]['wanted_files'] = \
            self._compose_filenames(time_slot,
                                    self._config.get(self._section,
                                                     "wanted_files"))
        # Name of all the files
        self.slots[time_slot]['all_files'] = \
            self._compose_filenames(time_slot,
                                    self._config.get(self._section,
                                                     "all_files"))

        self.slots[time_slot]['received_files'] = set([])
        self.slots[time_slot]['delayed_files'] = dict()
        self.slots[time_slot]['missing_files'] = set([])
        self.slots[time_slot]['timeout'] = None
        self.slots[time_slot]['files_till_premature_publish'] = \
            self._num_files_premature_publish

    def _compose_filenames(self, time_slot, itm_str):
        """Compose filename set()s based on a pattern and item string.
        itm_str is formatted like ':PRO,:EPI' or 'VIS006:8,VIS008:1-8,...'"""

        # Empty set
        result = set()

        # Get copy of metadata
        meta = self.slots[time_slot]['metadata'].copy()

        # Replace variable tags (such as processing time) with a
        # wildcard, as these can't be forecasted.
        try:
            for tag in self._config.get(self._section,
                                        'variable_tags').split(','):
                meta[tag] = '*'
        except NoOptionError:
            pass

        for itm in itm_str.split(','):
            channel_name, segments = itm.split(':')
            segments = segments.split('-')
            if len(segments) > 1:
                segments = ['%d' % i
                            for i in range(int(segments[0]),
                                           int(segments[-1]) + 1)]
            meta['channel_name'] = channel_name
            for seg in segments:
                meta['segment'] = seg
                fname = self._parser.compose(meta)
                result.add(fname)

        return result

    def _publish(self, time_slot, missing_files_check=True):
        """Publish file dataset and reinitialize gatherer."""

        data = self.slots[time_slot]

        # Diagnostic logging about delayed ...
        delayed_files = data['delayed_files']
        if len(delayed_files) > 0:
            file_str = ''
            for key in delayed_files:
                file_str += "%s %f seconds, " % (key, delayed_files[key])
            self.logger.warning("Files received late: %s",
                                file_str.strip(', '))

        if missing_files_check:
            # and missing files
            missing_files = data['all_files'].difference(
                data['received_files'])
            if len(missing_files) > 0:
                self.logger.warning("Missing files: %s",
                                    ', '.join(missing_files))

        msg = message.Message(self._subject, "dataset", data['metadata'])
        self.logger.info("Sending: %s", str(msg))
        self._publisher.send(str(msg))

        # self._clear_data(time_slot)

    def set_logger(self, logger):
        """Set logger."""
        self.logger = logger

    def slot_ready(self, slot):
        """Determine if slot is ready to be published."""
        # If no files have been collected, return False
        if len(slot['received_files']) == 0:
            return SLOT_NOT_READY

        time_slot = str(slot['metadata'][self.time_name])
        wanted_and_critical_files = \
            slot['wanted_files'].union(slot['critical_files'])
        num_wanted_and_critical_files_received = \
            len(wanted_and_critical_files & slot['received_files'])

        self.logger.debug("Got %s wanted or critical files in slot %s.",
                          num_wanted_and_critical_files_received,
                          time_slot)

        if num_wanted_and_critical_files_received \
                == slot['files_till_premature_publish']:
            slot['files_till_premature_publish'] = -1
            return SLOT_READY_BUT_WAIT_FOR_MORE

        # If all wanted files have been received, return True
        if wanted_and_critical_files.issubset(slot['received_files']):
            self.logger.info("All files received for slot %s.", time_slot)
            return SLOT_READY

        if slot['critical_files'].issubset(slot['received_files']):
            # All critical files have been received
            if slot['timeout'] is None:
                # Set timeout
                slot['timeout'] = dt.datetime.utcnow() + self._timeliness
                self.logger.info("Setting timeout to %s for slot %s.",
                                 str(slot['timeout']), time_slot)
                return SLOT_NOT_READY
            elif slot['timeout'] < dt.datetime.utcnow():
                # Timeout reached, collection ready
                self.logger.info("Timeout occurred, required files received "
                                 "for slot %s.", time_slot)
                return SLOT_READY
            else:
                pass
        else:
            if slot['timeout'] is None:
                slot['timeout'] = dt.datetime.utcnow() + self._timeliness
                self.logger.info("Setting timeout to %s for slot %s",
                                 str(slot['timeout']), time_slot)
                return SLOT_NOT_READY
            elif slot['timeout'] < dt.datetime.utcnow():
                # Timeout reached, collection is obsolete
                self.logger.warning("Timeout occurred and required files "
                                    "were not present, data discarded for "
                                    "slot %s.", time_slot)
                return SLOT_OBSOLETE_TIMEOUT
            else:
                pass

        # Timeout not reached, wait for more files
        return SLOT_NOT_READY

    def run(self):
        """Run SegmentGatherer"""
        self._publisher.start()
        self._loop = True
        while self._loop:
            # Check if there are slots ready for publication
            slots = self.slots.copy()
            for slot in slots:
                slot = str(slot)
                status = self.slot_ready(slots[slot])
                if status == SLOT_READY:
                    # Collection ready, publish and remove
                    self._publish(slot)
                    self._clear_data(slot)
                if status == SLOT_READY_BUT_WAIT_FOR_MORE:
                    # Collection ready, publish but wait for more
                    self._publish(slot, missing_files_check=False)
                elif status == SLOT_OBSOLETE_TIMEOUT:
                    # Collection unfinished and obsolete, discard
                    self._clear_data(slot)
                else:
                    # Collection unfinished, wait for more data
                    pass

            # Check listener for new messages
            msg = None
            try:
                msg = self._listener.queue.get(True, 1)
            except KeyboardInterrupt:
                self.stop()
                continue
            except Queue.Empty:
                continue

            if msg.type == "file":
                self.logger.info("New message received: %s", str(msg))
                self.process(msg)

    def stop(self):
        """Stop gatherer."""
        self.logger.info("Stopping gatherer.")
        self._loop = False
        if self._listener is not None:
            self._listener.stop()
        if self._publisher is not None:
            self._publisher.stop()

    def process(self, msg):
        """Process message"""
        try:
            mda = self._parser.parse(msg.data["uid"])
        except ValueError:
            self.logger.debug("Unknown file, skipping.")
            return

        time_slot = str(mda[self.time_name])

        # Init metadata etc if this is the first file
        if time_slot not in self.slots:
            self._init_data(msg, mda)

        slot = self.slots[time_slot]

        # Replace variable tags (such as processing time) with a
        # wildcard, as these can't be forecasted.
        try:
            for tag in self._config.get(self._section,
                                        'variable_tags').split(','):
                mda[tag] = '*'
        except NoOptionError:
            pass

        mask = self._parser.compose(mda)

        if mask in slot['received_files']:
            return

        # Add uid and uri
        slot['metadata']['dataset'].append({'uri': msg.data['uri'],
                                            'uid': msg.data['uid']})

        # If critical files have been received but the slot is
        # not complete, add the file to list of delayed files
        if len(slot['critical_files']) > 0 and \
           slot['critical_files'].issubset(slot['received_files']):
            delay = dt.datetime.utcnow() - (slot['timeout'] -
                                            self._timeliness)
            slot['delayed_files'][msg.data['uid']] = delay.total_seconds()

        # Add to received files
        slot['received_files'].add(mask)
class EventHandler(ProcessEvent):

    """
    Event handler class for inotify.

    *topic* - topic of the published messages
    *posttroll_port* - port number to publish the messages on
    *filepattern* - filepattern for finding information from the filename
    """

    def __init__(self, topic, instrument, posttroll_port=0, filepattern=None,
                 aliases=None, tbus_orbit=False, history=0, granule_length=0):
        super(EventHandler, self).__init__()

        self._pub = NoisyPublisher("trollstalker", posttroll_port, topic)
        self.pub = self._pub.start()
        self.topic = topic
        self.info = {}
        if filepattern is None:
            filepattern = '{filename}'
        self.file_parser = Parser(filepattern)
        self.instrument = instrument
        self.aliases = aliases
        self.tbus_orbit = tbus_orbit
        self.granule_length = granule_length
        self._deque = deque([], history)

    def stop(self):
        '''Stop publisher.
        '''
        self._pub.stop()

    def __clean__(self):
        '''Clean instance attributes.
        '''
        self.info = {}

    def process_IN_CLOSE_WRITE(self, event):
        """When a file is closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_WRITE")
        self.process(event)

    def process_IN_CLOSE_NOWRITE(self, event):
        """When a nonwritable file is closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_NOWRITE")
        self.process(event)

    def process_IN_MOVED_TO(self, event):
        """When a file is moved into the watched directory, process the
        associated event.
        """
        LOGGER.debug("trigger: IN_MOVED_TO")
        self.process(event)

    def process_IN_CREATE(self, event):
        """When a file is created, process the associated event.
        """
        LOGGER.debug("trigger: IN_CREATE")
        self.process(event)

    def process_IN_CLOSE_MODIFY(self, event):
        """When a file is modified and closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_MODIFY")
        self.process(event)

    def process(self, event):
        '''Process the event'''
        # New file created and closed
        if not event.dir:
            LOGGER.debug("processing %s", event.pathname)
            # parse information and create self.info dict
            self.parse_file_info(event)
            if len(self.info) > 0:
                # Check if this file has been recently dealt with
                if event.pathname not in self._deque:
                    self._deque.append(event.pathname)
                    message = self.create_message()
                    LOGGER.info("Publishing message %s", str(message))
                    self.pub.send(str(message))
                else:
                    LOGGER.info("Data has been published recently, skipping.")
            self.__clean__()

    def create_message(self):
        """Create broadcasted message
        """
        return Message(self.topic, 'file', self.info)

    def parse_file_info(self, event):
        '''Parse satellite and orbit information from the filename.
        Message is sent, if a matching filepattern is found.
        '''
        try:
            LOGGER.debug("filter: %s\t event: %s",
                         self.file_parser.fmt, event.pathname)
            self.info = self.file_parser.parse(
                os.path.basename(event.pathname))
            LOGGER.debug("Extracted: %s", str(self.info))
        except ValueError:
            # Filename didn't match pattern, so empty the info dict
            LOGGER.info("Couldn't extract any useful information")
            self.info = {}
        else:
            self.info['uri'] = event.pathname
            self.info['uid'] = os.path.basename(event.pathname)
            self.info['sensor'] = self.instrument.split(',')
            LOGGER.debug("self.info['sensor']: %s", str(self.info['sensor']))

            if self.tbus_orbit and "orbit_number" in self.info:
                LOGGER.info("Changing orbit number by -1!")
                self.info["orbit_number"] -= 1

            # replace values with corresponding aliases, if any are given
            if self.aliases:
                info = self.info.copy()
                for key in info:
                    if key in self.aliases:
                        self.info['orig_' + key] = self.info[key]
                        self.info[key] = self.aliases[key][str(self.info[key])]

            # add start_time and end_time if not present
            try:
                base_time = self.info["time"]
            except KeyError:
                try:
                    base_time = self.info["nominal_time"]
                except KeyError:
                    base_time = self.info["start_time"]
            if "start_time" not in self.info:
                self.info["start_time"] = base_time
            if "start_date" in self.info:
                self.info["start_time"] = \
                    dt.datetime.combine(self.info["start_date"].date(),
                                        self.info["start_time"].time())
                if "end_date" not in self.info:
                    self.info["end_date"] = self.info["start_date"]
                del self.info["start_date"]
            if "end_date" in self.info:
                self.info["end_time"] = \
                    dt.datetime.combine(self.info["end_date"].date(),
                                        self.info["end_time"].time())
                del self.info["end_date"]
            if "end_time" not in self.info and self.granule_length > 0:
                self.info["end_time"] = base_time + dt.timedelta(
                    seconds=self.granule_length)
            if "end_time" in self.info:
                while self.info["start_time"] > self.info["end_time"]:
                    self.info["end_time"] += dt.timedelta(days=1)
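A minimal sketch of how this handler is typically wired to pyinotify; the watch path, topic, instrument and pattern below are placeholders, not values from the snippet:

# Sketch only: paths, topic and pattern are invented for illustration.
import pyinotify

handler = EventHandler(
    "/my/topic", "avhrr",
    filepattern="{platform_name}_{start_time:%Y%m%d_%H%M}.l1b")

wm = pyinotify.WatchManager()
mask = (pyinotify.IN_CLOSE_WRITE |
        pyinotify.IN_MOVED_TO |
        pyinotify.IN_CREATE)
wm.add_watch("/data/incoming", mask, rec=True)

notifier = pyinotify.Notifier(wm, handler)
notifier.loop()  # blocks; publishes one message per matching new file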
def update_nwp(params):
    LOG.info("METNO update nwp")

    tempfile.tempdir = params['options']['nwp_outdir']

    ecmwf_path = params['options']['ecmwf_path']
    if not os.path.exists(ecmwf_path):
        ecmwf_path = ecmwf_path.replace("storeB", "storeA")
        LOG.warning(
            "Need to replace storeB with storeA for ecmwf_path: {}".format(
                str(ecmwf_path)))

    filelist = glob(
        os.path.join(ecmwf_path, params['options']['ecmwf_prefix'] + "*"))
    if len(filelist) == 0:
        LOG.info("Found no input files! dir = " + str(
            os.path.join(ecmwf_path, params['options']['ecmwf_prefix'] + "*")))
        return

    from trollsift import Parser, compose

    filelist.sort()
    for filename in filelist:
        if params['options']['ecmwf_file_name_sift'] is None:
            LOG.error("No sift pattern given. Cannot parse input NWP files.")
            return

        try:
            parser = Parser(params['options']['ecmwf_file_name_sift'])
        except NoOptionError as noe:
            LOG.error("NoOptionError {}".format(noe))
            continue
        if not parser.validate(os.path.basename(filename)):
            LOG.error(
                "Parser validate on filename: {} failed.".format(filename))
            continue
        res = parser.parse("{}".format(os.path.basename(filename)))

        time_now = datetime.utcnow()
        if 'analysis_time' in res:
            if res['analysis_time'].year == 1900:
                # This is tricky: the filename is missing the year.
                # Guess it from the current year and month combined with
                # the analysis month taken from the filename. If the
                # current month is 1 (January) and the analysis month is
                # 12, the time has passed New Year, so the NWP analysis
                # time belongs to the previous year.
                if time_now.month == 1 and res['analysis_time'].month == 12:
                    analysis_year = time_now.year - 1
                else:
                    analysis_year = time_now.year

                res['analysis_time'] = res['analysis_time'].replace(
                    year=analysis_year)
        else:
            LOG.error("Cannot parse analysis_time from the file name. "
                      "Check config and filename timestamp.")
            continue

        if 'forecast_time' in res:
            if res['forecast_time'].year == 1900:
                # See above for an explanation of the year guessing.
                if res['analysis_time'].month == 12 and \
                   res['forecast_time'].month == 1:
                    forecast_year = res['analysis_time'].year + 1
                else:
                    forecast_year = res['analysis_time'].year

                res['forecast_time'] = res['forecast_time'].replace(
                    year=forecast_year)
        else:
            LOG.error("Cannot parse forecast_time from the file name. "
                      "Check config and filename timestamp.")
            continue

        forecast_time = res['forecast_time']
        analysis_time = res['analysis_time']
        step_delta = forecast_time - analysis_time
        step = "{:03d}H{:02d}M".format(
            int(step_delta.days * 24 + step_delta.seconds / 3600), 0)

        if analysis_time < params['starttime']:
            continue

        if int(step[:3]) not in params['nlengths']:
            continue

        output_parameters = {}
        output_parameters['analysis_time'] = analysis_time
        output_parameters['step_hour'] = int(step_delta.days * 24 +
                                             step_delta.seconds / 3600)
        output_parameters['step_min'] = 0
        try:
            if not os.path.exists(params['options']['nwp_outdir']):
                os.makedirs(params['options']['nwp_outdir'])
        except OSError as e:
            LOG.error("Failed to create directory: %s", e)
        result_file = ""
        try:
            result_file = os.path.join(
                params['options']['nwp_outdir'],
                compose(params['options']['nwp_output'], output_parameters))
            _result_file = os.path.join(
                params['options']['nwp_outdir'],
                compose("." + params['options']['nwp_output'],
                        output_parameters))
            _result_file_lock = os.path.join(
                params['options']['nwp_outdir'],
                compose("." + params['options']['nwp_output'] + ".lock",
                        output_parameters))
        except Exception as e:
            LOG.error(
                "Joining outdir with output for nwp failed with: {}".format(e))

        LOG.info("Result file: {}".format(result_file))
        if os.path.exists(result_file):
            LOG.info("File: " + str(result_file) + " already there...")
            continue

        import fcntl
        import errno
        import time
        rfl = open(_result_file_lock, 'w+')
        # do some locking
        while True:
            try:
                fcntl.flock(rfl, fcntl.LOCK_EX | fcntl.LOCK_NB)
                LOG.debug("Got lock for NWP outfile: {}".format(result_file))
                break
            except IOError as e:
                if e.errno != errno.EAGAIN:
                    raise
                else:
                    LOG.debug("Waiting for lock ... {}".format(result_file))
                    time.sleep(1)

        if os.path.exists(result_file):
            LOG.info("File: " + str(result_file) + " already there...")
            # Need to release the lock
            fcntl.flock(rfl, fcntl.LOCK_UN)
            rfl.close()
            continue

        fout = open(_result_file, 'wb')
        try:
            # Do the static fields.
            # Note: the static field is not in the filename variable, but a
            # configured filename for static fields.
            static_filename = params['options']['ecmwf_static_surface']
            if not os.path.exists(static_filename):
                static_filename = static_filename.replace("storeB", "storeA")
                LOG.warning("Need to replace storeB with storeA")

            index_vals = []
            index_keys = ['paramId', 'level']
            LOG.debug("Start building index")
            LOG.debug("Handling file: %s", filename)
            iid = ecc.codes_index_new_from_file(filename, index_keys)
            filename_n1s = filename.replace('N2D', 'N1S')
            LOG.debug("Add to index %s", filename_n1s)
            ecc.codes_index_add_file(iid, filename_n1s)
            LOG.debug("Add to index %s", static_filename)
            ecc.codes_index_add_file(iid, static_filename)
            LOG.debug("Done index")
            for key in index_keys:
                key_vals = ecc.codes_index_get(iid, key)
                key_vals = tuple(x for x in key_vals if x != 'undef')
                index_vals.append(key_vals)

            for prod in product(*index_vals):
                for i in range(len(index_keys)):
                    ecc.codes_index_select(iid, index_keys[i], prod[i])

                while 1:
                    gid = ecc.codes_new_from_index(iid)
                    if gid is None:
                        break

                    param = ecc.codes_get(gid, index_keys[0])
                    parameters = [172, 129, 235, 167, 168, 137, 130,
                                  131, 132, 133, 134, 157]
                    if param in parameters:
                        LOG.debug("Doing param: %d", param)
                        copy_needed_field(gid, fout)

                    ecc.codes_release(gid)
            ecc.codes_index_release(iid)

            fout.close()
            os.rename(_result_file, result_file)
        except WrongLengthError as wle:
            LOG.error("Something wrong with the data: %s", wle)
            raise

        # In the end, release the lock
        fcntl.flock(rfl, fcntl.LOCK_UN)
        rfl.close()

        os.remove(_result_file_lock)

    return
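The locking dance in the middle of the function distills to a reusable pattern; a standalone sketch (the lock-file path and the helper name are illustrative):

# Sketch of the non-blocking flock loop used above.
import errno
import fcntl
import time

def acquire_lock(path):
    """Spin until an exclusive lock on *path* is held; return the handle."""
    handle = open(path, 'w+')
    while True:
        try:
            fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
            return handle
        except IOError as err:
            if err.errno != errno.EAGAIN:
                raise
            time.sleep(1)

lock = acquire_lock("/tmp/nwp_outfile.lock")
try:
    pass  # write the output file here
finally:
    fcntl.flock(lock, fcntl.LOCK_UN)
    lock.close()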
class SegmentGatherer(object):

    """Gatherer for geostationary satellite segments and multifile polar
    satellite granules."""

    def __init__(self, config, section):
        self._config = config
        self._section = section
        topics = config.get(section, 'topics').split()
        self._listener = ListenerContainer(topics=topics)
        self._publisher = publisher.NoisyPublisher("segment_gatherer")
        self._subject = config.get(section, "publish_topic")
        self._pattern = config.get(section, 'pattern')
        self._parser = Parser(self._pattern)

        try:
            self._timeliness = dt.timedelta(
                seconds=config.getint(section, "timeliness"))
        except (NoOptionError, ValueError):
            self._timeliness = dt.timedelta(seconds=1200)

        try:
            self._num_files_premature_publish = \
                config.getint(section, "num_files_premature_publish")
        except (NoOptionError, ValueError):
            self._num_files_premature_publish = -1

        self.slots = OrderedDict()

        self.time_name = config.get(section, 'time_name')

        self.logger = logging.getLogger("segment_gatherer")
        self._loop = False

    def _clear_data(self, time_slot):
        """Clear data."""
        if time_slot in self.slots:
            del self.slots[time_slot]

    def _init_data(self, msg, mda):
        """Init wanted, all and critical files"""
        # Init metadata struct
        metadata = {}
        for key in msg.data:
            if key not in ("uid", "uri", "channel_name", "segment"):
                metadata[key] = msg.data[key]
        metadata['dataset'] = []

        # Use also metadata parsed from the filenames
        metadata.update(mda)

        time_slot = str(metadata[self.time_name])
        self.slots[time_slot] = {}
        self.slots[time_slot]['metadata'] = metadata.copy()

        # Critical files that are required, otherwise production will fail.
        # If there are no critical files, the empty set is used.
        try:
            critical_segments = self._config.get(self._section,
                                                 "critical_files")
            self.slots[time_slot]['critical_files'] = \
                self._compose_filenames(time_slot, critical_segments)
        except (NoOptionError, ValueError):
            self.slots[time_slot]['critical_files'] = set()

        # These files are wanted, but not critical to production
        self.slots[time_slot]['wanted_files'] = \
            self._compose_filenames(
                time_slot, self._config.get(self._section, "wanted_files"))
        # Names of all the files
        self.slots[time_slot]['all_files'] = \
            self._compose_filenames(
                time_slot, self._config.get(self._section, "all_files"))

        self.slots[time_slot]['received_files'] = set()
        self.slots[time_slot]['delayed_files'] = dict()
        self.slots[time_slot]['missing_files'] = set()
        self.slots[time_slot]['timeout'] = None
        self.slots[time_slot]['files_till_premature_publish'] = \
            self._num_files_premature_publish

    def _compose_filenames(self, time_slot, itm_str):
        """Compose filename sets based on a pattern and an item string.
        itm_str is formatted like ':PRO,:EPI' or 'VIS006:8,VIS008:1-8,...'"""

        # Empty set
        result = set()

        # Get a copy of the metadata
        meta = self.slots[time_slot]['metadata'].copy()

        # Replace variable tags (such as processing time) with
        # wildcards, as these can't be forecasted.
        try:
            meta = _copy_without_ignore_items(
                meta, ignored_keys=self._config.get(
                    self._section, 'variable_tags').split(','))
        except NoOptionError:
            pass

        for itm in itm_str.split(','):
            channel_name, segments = itm.split(':')
            segments = segments.split('-')
            if len(segments) > 1:
                segments = ['%d' % i
                            for i in range(int(segments[0]),
                                           int(segments[-1]) + 1)]
            meta['channel_name'] = channel_name
            for seg in segments:
                meta['segment'] = seg
                fname = self._parser.globify(meta)
                result.add(fname)

        return result

    def _publish(self, time_slot, missing_files_check=True):
        """Publish file dataset and reinitialize gatherer."""

        data = self.slots[time_slot]

        # Diagnostic logging about delayed files ...
        delayed_files = data['delayed_files']
        if len(delayed_files) > 0:
            file_str = ''
            for key in delayed_files:
                file_str += "%s %f seconds, " % (key, delayed_files[key])
            self.logger.warning("Files received late: %s",
                                file_str.strip(', '))

        if missing_files_check:
            # ... and about missing files
            missing_files = data['all_files'].difference(
                data['received_files'])
            if len(missing_files) > 0:
                self.logger.warning("Missing files: %s",
                                    ', '.join(missing_files))

        msg = message.Message(self._subject, "dataset", data['metadata'])
        self.logger.info("Sending: %s", str(msg))
        self._publisher.send(str(msg))

        # self._clear_data(time_slot)

    def set_logger(self, logger):
        """Set logger."""
        self.logger = logger

    def slot_ready(self, slot):
        """Determine if slot is ready to be published."""
        # If no files have been collected, return False
        if len(slot['received_files']) == 0:
            return SLOT_NOT_READY

        time_slot = str(slot['metadata'][self.time_name])

        wanted_and_critical_files = \
            slot['wanted_files'].union(slot['critical_files'])
        num_wanted_and_critical_files_received = \
            len(wanted_and_critical_files & slot['received_files'])

        self.logger.debug("Got %s wanted or critical files in slot %s.",
                          num_wanted_and_critical_files_received,
                          time_slot)

        if num_wanted_and_critical_files_received \
                == slot['files_till_premature_publish']:
            slot['files_till_premature_publish'] = -1
            return SLOT_READY_BUT_WAIT_FOR_MORE

        # If all wanted files have been received, the slot is ready
        if wanted_and_critical_files.issubset(slot['received_files']):
            self.logger.info("All files received for slot %s.",
                             time_slot)
            return SLOT_READY

        if slot['critical_files'].issubset(slot['received_files']):
            # All critical files have been received
            if slot['timeout'] is None:
                # Set timeout
                slot['timeout'] = dt.datetime.utcnow() + self._timeliness
                self.logger.info("Setting timeout to %s for slot %s.",
                                 str(slot['timeout']), time_slot)
                return SLOT_NOT_READY
            elif slot['timeout'] < dt.datetime.utcnow():
                # Timeout reached, collection ready
                self.logger.info("Timeout occurred, required files received "
                                 "for slot %s.", time_slot)
                return SLOT_READY
        else:
            if slot['timeout'] is None:
                slot['timeout'] = dt.datetime.utcnow() + self._timeliness
                self.logger.info("Setting timeout to %s for slot %s",
                                 str(slot['timeout']), time_slot)
                return SLOT_NOT_READY
            elif slot['timeout'] < dt.datetime.utcnow():
                # Timeout reached, collection is obsolete
                self.logger.warning("Timeout occurred and required files "
                                    "were not present, data discarded for "
                                    "slot %s.", time_slot)
                return SLOT_OBSOLETE_TIMEOUT

        # Timeout not reached, wait for more files
        return SLOT_NOT_READY

    def run(self):
        """Run SegmentGatherer"""
        self._publisher.start()
        self._loop = True
        while self._loop:
            # Check if there are slots ready for publication
            slots = self.slots.copy()
            for slot in slots:
                slot = str(slot)
                status = self.slot_ready(slots[slot])
                if status == SLOT_READY:
                    # Collection ready, publish and remove
                    self._publish(slot)
                    self._clear_data(slot)
                elif status == SLOT_READY_BUT_WAIT_FOR_MORE:
                    # Collection ready, publish but wait for more
                    self._publish(slot, missing_files_check=False)
                elif status == SLOT_OBSOLETE_TIMEOUT:
                    # Collection unfinished and obsolete, discard
                    self._clear_data(slot)
                else:
                    # Collection unfinished, wait for more data
                    pass

            # Check listener for new messages
            msg = None
            try:
                msg = self._listener.queue.get(True, 1)
            except KeyboardInterrupt:
                self.stop()
                continue
            except Queue.Empty:
                continue

            if msg.type == "file":
                self.logger.info("New message received: %s", str(msg))
                self.process(msg)

    def stop(self):
        """Stop gatherer."""
        self.logger.info("Stopping gatherer.")
        self._loop = False
        if self._listener is not None:
            self._listener.stop()
        if self._publisher is not None:
            self._publisher.stop()

    def process(self, msg):
        """Process message"""
        try:
            mda = self._parser.parse(msg.data["uid"])
        except ValueError:
            self.logger.debug("Unknown file, skipping.")
            return

        time_slot = str(mda[self.time_name])

        # Init metadata etc if this is the first file
        if time_slot not in self.slots:
            self._init_data(msg, mda)

        slot = self.slots[time_slot]

        # Replace variable tags (such as processing time) with
        # wildcards, as these can't be forecasted.
        try:
            mda = _copy_without_ignore_items(
                mda, ignored_keys=self._config.get(
                    self._section, 'variable_tags').split(','))
        except NoOptionError:
            pass

        mask = self._parser.globify(mda)

        if mask in slot['received_files']:
            return

        # Add uid and uri
        slot['metadata']['dataset'].append({'uri': msg.data['uri'],
                                            'uid': msg.data['uid']})

        # If critical files have been received but the slot is
        # not complete, add the file to the list of delayed files
        if len(slot['critical_files']) > 0 and \
           slot['critical_files'].issubset(slot['received_files']):
            delay = dt.datetime.utcnow() - (slot['timeout'] - self._timeliness)
            slot['delayed_files'][msg.data['uid']] = delay.total_seconds()

        # Add to received files
        slot['received_files'].add(mask)
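A minimal sketch of how the gatherer might be configured and started; the section name, pattern and option values below are invented for illustration, the real options come from the deployment's config file:

# Sketch only: config values are invented for illustration.
from ConfigParser import RawConfigParser  # Python 2, as in the code above

config = RawConfigParser()
config.add_section("msg")
config.set("msg", "topics", "/new/file")
config.set("msg", "publish_topic", "/segments/collected")
config.set("msg", "pattern",
           "H-000-{platform:4s}__-{channel_name:_<9s}-{segment:_<9s}-"
           "{start_time:%Y%m%d%H%M}-__")
config.set("msg", "time_name", "start_time")
config.set("msg", "timeliness", "600")
config.set("msg", "wanted_files", "VIS006:1-8")
config.set("msg", "all_files", "VIS006:1-8")

gatherer = SegmentGatherer(config, "msg")
gatherer.run()  # blocks; collects segments and publishes datasets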
# IASI_PW3_02_M01_20160309180258Z_20160309180554Z_N_O_20160309184345Z.h5
pattern = ('IASI_PW3_02_{platform_name:3s}_{start_time:%Y%m%d%H%M%S}Z_'
           '{end_time:%Y%m%d%H%M%S}Z_N_O_{creation_time:%Y%m%d%H%M%S}Z.h5')
p__ = Parser(pattern)

PREFIXES = ['IASI_PW3_', 'IASI_PW3_', ]

ftp = FTP(HOST)
print("connecting to %s" % HOST)
ftp.login(USER, PASSWD)

tempfile.tempdir = outpath

for (remotedir, prefix) in zip(REMOTE_DIRS, PREFIXES):
    remotefiles = ftp.nlst(remotedir)
    fnames = [os.path.basename(f) for f in remotefiles]
    dates_remote = [p__.parse(s)['start_time'] for s in fnames]

    rfarr = np.array(remotefiles)
    drarr = np.array(dates_remote)
    remotefiles = rfarr[drarr > starttime].tolist()
    remotefiles = [r for r in remotefiles
                   if (r.endswith('.h5') and
                       os.path.basename(r).startswith(prefix))]
    localfiles = [os.path.join(outpath,
                               os.path.splitext(os.path.basename(f))[0])
                  for f in remotefiles]
    try:
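For reference, parsing the example filename from the comment above with that pattern yields the following; a sketch, with the expected values following directly from the format specifiers:

from trollsift import Parser

p = Parser('IASI_PW3_02_{platform_name:3s}_{start_time:%Y%m%d%H%M%S}Z_'
           '{end_time:%Y%m%d%H%M%S}Z_N_O_{creation_time:%Y%m%d%H%M%S}Z.h5')
info = p.parse('IASI_PW3_02_M01_20160309180258Z_20160309180554Z_N_O_'
               '20160309184345Z.h5')
# info['platform_name'] == 'M01'
# info['start_time']    == datetime(2016, 3, 9, 18, 2, 58)
# info['end_time']      == datetime(2016, 3, 9, 18, 5, 54)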
class EventHandler(ProcessEvent):

    """
    Event handler class for inotify.

    *topic* - topic of the published messages
    *posttroll_port* - port number to publish the messages on
    *filepattern* - filepattern for finding information from the filename
    """

    def __init__(self, topic, instrument, posttroll_port=0, filepattern=None,
                 aliases=None, tbus_orbit=False):
        super(EventHandler, self).__init__()

        self._pub = NoisyPublisher("trollstalker", posttroll_port, topic)
        self.pub = self._pub.start()
        self.topic = topic
        self.info = {}
        if filepattern is None:
            filepattern = '{filename}'
        self.file_parser = Parser(filepattern)
        self.instrument = instrument
        self.aliases = aliases
        self.tbus_orbit = tbus_orbit

    def stop(self):
        '''Stop publisher.
        '''
        self._pub.stop()

    def __clean__(self):
        '''Clean instance attributes.
        '''
        self.info = {}

    def process_IN_CLOSE_WRITE(self, event):
        """When a file is closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_WRITE")
        self.process(event)

    def process_IN_CLOSE_NOWRITE(self, event):
        """When a nonwritable file is closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_NOWRITE")
        self.process(event)

    def process_IN_MOVED_TO(self, event):
        """When a file is moved into the watched directory, process the
        associated event.
        """
        LOGGER.debug("trigger: IN_MOVED_TO")
        self.process(event)

    def process_IN_CREATE(self, event):
        """When a file is created, process the associated event.
        """
        LOGGER.debug("trigger: IN_CREATE")
        self.process(event)

    def process_IN_CLOSE_MODIFY(self, event):
        """When a file is modified and closed, process the associated event.
        """
        LOGGER.debug("trigger: IN_CLOSE_MODIFY")
        self.process(event)

    def process(self, event):
        '''Process the event'''
        # New file created and closed
        if not event.dir:
            LOGGER.debug("processing %s", event.pathname)
            # parse information and create self.info dict
            self.parse_file_info(event)
            if len(self.info) > 0:
                message = self.create_message()
                LOGGER.info("Publishing message %s", str(message))
                self.pub.send(str(message))
            self.__clean__()

    def create_message(self):
        """Create broadcasted message
        """
        return Message(self.topic, 'file', self.info)

    def parse_file_info(self, event):
        '''Parse satellite and orbit information from the filename.
        Message is sent, if a matching filepattern is found.
        '''
        try:
            LOGGER.debug("filter: %s\t event: %s",
                         self.file_parser.fmt, event.pathname)
            self.info = self.file_parser.parse(
                os.path.basename(event.pathname))
            LOGGER.debug("Extracted: %s", str(self.info))
        except ValueError:
            # Filename didn't match pattern, so empty the info dict
            LOGGER.info("Couldn't extract any useful information")
            self.info = {}
        else:
            self.info['uri'] = event.pathname
            self.info['uid'] = os.path.basename(event.pathname)
            self.info['sensor'] = self.instrument.split(',')
            LOGGER.debug("self.info['sensor']: %s", str(self.info['sensor']))

            if self.tbus_orbit and "orbit_number" in self.info:
                LOGGER.info("Changing orbit number by -1!")
                self.info["orbit_number"] -= 1

            # replace values with corresponding aliases, if any are given
            if self.aliases:
                for key in self.info:
                    if key in self.aliases:
                        self.info[key] = self.aliases[key][str(self.info[key])]
def update_nwp(params):
    LOG.info("METNO update nwp")

    result_files = dict()
    tempfile.tempdir = params['options']['nwp_outdir']

    ecmwf_path = params['options']['ecmwf_path']
    if not os.path.exists(ecmwf_path):
        ecmwf_path = ecmwf_path.replace("storeB", "storeA")
        LOG.warning("Need to replace storeB with storeA for ecmwf_path: "
                    "{}".format(str(ecmwf_path)))

    filelist = glob(os.path.join(ecmwf_path,
                                 params['options']['ecmwf_prefix'] + "*"))
    if len(filelist) == 0:
        LOG.info("Found no input files! dir = " +
                 str(os.path.join(ecmwf_path,
                                  params['options']['ecmwf_prefix'] + "*")))
        return

    from trollsift import Parser, compose

    filelist.sort()
    for filename in filelist:
        if params['options']['ecmwf_file_name_sift'] is None:
            LOG.error("No sift pattern given. Cannot parse input NWP files.")
            return

        try:
            parser = Parser(params['options']['ecmwf_file_name_sift'])
        except NoOptionError as noe:
            LOG.error("NoOptionError {}".format(noe))
            continue
        if not parser.validate(os.path.basename(filename)):
            LOG.error("Parser validate on filename: {} failed.".format(filename))
            continue
        res = parser.parse("{}".format(os.path.basename(filename)))

        time_now = datetime.utcnow()
        if 'analysis_time' in res:
            if res['analysis_time'].year == 1900:
                # This is tricky: the filename is missing the year.
                # Guess it from the current year and month combined with the
                # analysis month taken from the filename. If the current
                # month is 1 (January) and the analysis month is 12, the
                # time has passed New Year, so the NWP analysis time belongs
                # to the previous year.
                if time_now.month == 1 and res['analysis_time'].month == 12:
                    analysis_year = time_now.year - 1
                else:
                    analysis_year = time_now.year

                res['analysis_time'] = res['analysis_time'].replace(
                    year=analysis_year)
        else:
            LOG.error("Cannot parse analysis_time from the file name. "
                      "Check config and filename timestamp.")
            continue

        if 'forecast_time' in res:
            if res['forecast_time'].year == 1900:
                # See above for an explanation of the year guessing.
                if res['analysis_time'].month == 12 and \
                   res['forecast_time'].month == 1:
                    forecast_year = res['analysis_time'].year + 1
                else:
                    forecast_year = res['analysis_time'].year

                res['forecast_time'] = res['forecast_time'].replace(
                    year=forecast_year)
        else:
            LOG.error("Cannot parse forecast_time from the file name. "
                      "Check config and filename timestamp.")
            continue

        forecast_time = res['forecast_time']
        analysis_time = res['analysis_time']
        timestamp = analysis_time.strftime("%Y%m%d%H%M")
        step_delta = forecast_time - analysis_time
        step = "{:03d}H{:02d}M".format(
            int(step_delta.days * 24 + step_delta.seconds / 3600), 0)
        timeinfo = "{:s}{:s}{:s}".format(analysis_time.strftime("%m%d%H%M"),
                                         forecast_time.strftime("%m%d%H%M"),
                                         res['end'])

        if analysis_time < params['starttime']:
            continue
        if int(step[:3]) not in params['nlengths']:
            continue

        output_parameters = {}
        output_parameters['analysis_time'] = analysis_time
        output_parameters['step_hour'] = int(step_delta.days * 24 +
                                             step_delta.seconds / 3600)
        output_parameters['step_min'] = 0
        try:
            if not os.path.exists(params['options']['nwp_outdir']):
                os.makedirs(params['options']['nwp_outdir'])
        except OSError as e:
            LOG.error("Failed to create directory: %s", e)
        result_file = ""
        try:
            result_file = os.path.join(
                params['options']['nwp_outdir'],
                compose(params['options']['nwp_output'], output_parameters))
            _result_file = os.path.join(
                params['options']['nwp_outdir'],
                compose("." + params['options']['nwp_output'],
                        output_parameters))
            _result_file_lock = os.path.join(
                params['options']['nwp_outdir'],
                compose("." + params['options']['nwp_output'] + ".lock",
                        output_parameters))
        except Exception as e:
            LOG.error("Joining outdir with output for nwp failed with: "
                      "{}".format(e))

        LOG.info("Result file: {}".format(result_file))
        if os.path.exists(result_file):
            LOG.info("File: " + str(result_file) + " already there...")
            continue

        import fcntl
        import errno
        import time
        rfl = open(_result_file_lock, 'w+')
        # do some locking
        while True:
            try:
                fcntl.flock(rfl, fcntl.LOCK_EX | fcntl.LOCK_NB)
                LOG.debug("Got lock for NWP outfile: {}".format(result_file))
                break
            except IOError as e:
                if e.errno != errno.EAGAIN:
                    raise
                else:
                    LOG.debug("Waiting for lock ... {}".format(result_file))
                    time.sleep(1)

        if os.path.exists(result_file):
            LOG.info("File: " + str(result_file) + " already there...")
            # Need to release the lock
            fcntl.flock(rfl, fcntl.LOCK_UN)
            rfl.close()
            continue

        # Set up a temporary file to copy grib fields to.
        # If RAM is available through /run/shm, use it, else use /tmp.
        if os.path.exists("/run/shm"):
            __tmpfile = "/run/shm/__tmp"
        else:
            __tmpfile = "/tmp/__tmp"

        # Some parameters can be found by the first name, some by paramId.
        # Need to check the second if the first one is not found.
        parameter_name_list = ["indicatorOfParameter", "paramId"]

        fout = open(_result_file, 'wb')
        try:
            # Do the static fields.
            # Note: the static field is not in the filename variable, but a
            # configured filename for static fields.
            static_filename = params['options']['ecmwf_static_surface']
            if not os.path.exists(static_filename):
                static_filename = static_filename.replace("storeB", "storeA")
                LOG.warning("Need to replace storeB with storeA")

            index_vals = []
            index_keys = ['paramId', 'level']
            LOG.debug("Start building index")
            LOG.debug("Handling file: %s", filename)
            iid = codes_index_new_from_file(filename, index_keys)
            filename_n1s = filename.replace('N2D', 'N1S')
            LOG.debug("Add to index %s", filename_n1s)
            codes_index_add_file(iid, filename_n1s)
            LOG.debug("Add to index %s", static_filename)
            codes_index_add_file(iid, static_filename)
            LOG.debug("Done index")
            for key in index_keys:
                key_vals = codes_index_get(iid, key)
                key_vals = tuple(x for x in key_vals if x != 'undef')
                index_vals.append(key_vals)

            for prod in product(*index_vals):
                for i in range(len(index_keys)):
                    codes_index_select(iid, index_keys[i], prod[i])

                while 1:
                    gid = codes_new_from_index(iid)
                    if gid is None:
                        break

                    param = codes_get(gid, index_keys[0])
                    parameters = [172, 129, 235, 167, 168, 137, 130,
                                  131, 132, 133, 134, 157]
                    if param in parameters:
                        LOG.debug("Doing param: %d", param)
                        copy_needed_field(gid, fout)

                    codes_release(gid)
            codes_index_release(iid)

            fout.close()
            os.rename(_result_file, result_file)
        except WrongLengthError as wle:
            LOG.error("Something wrong with the data: %s", wle)
            raise

        # In the end, release the lock
        fcntl.flock(rfl, fcntl.LOCK_UN)
        rfl.close()

        os.remove(_result_file_lock)
    return
def update_nwp(starttime, nlengths):
    """Prepare NWP grib files for PPS. Consider only analysis times newer
    than *starttime*, and only the forecast lead times in hours given by
    the list *nlengths* of integers.
    """
    LOG.info("Path to prepare_nwp config file = %s", str(CONFIG_PATH))
    LOG.info("Prepare_nwp config file = %s", str(CONFIG_FILE))
    LOG.info("Path to nhsf files: %s", str(nhsf_path))
    LOG.info("Path to nhsp files: %s", str(nhsp_path))

    tempfile.tempdir = nwp_outdir
    filelist = glob(os.path.join(nhsf_path, nhsf_prefix + "*"))
    if len(filelist) == 0:
        LOG.info("No input files! dir = %s", str(nhsf_path))
        return

    LOG.debug('NHSF NWP files found = %s', str(filelist))
    nfiles_error = 0
    for filename in filelist:
        if nhsf_file_name_sift is None:
            raise NwpPrepareError()

        try:
            parser = Parser(nhsf_file_name_sift)
        except NoOptionError as noe:
            LOG.error("NoOptionError {}".format(noe))
            continue
        if not parser.validate(os.path.basename(filename)):
            LOG.error("Parser validate on filename: {} failed.".format(filename))
            continue
        LOG.info("{}".format(os.path.basename(filename)))
        res = parser.parse("{}".format(os.path.basename(filename)))
        LOG.info("{}".format(res))

        if 'analysis_time' in res:
            if res['analysis_time'].year == 1900:
                res['analysis_time'] = res['analysis_time'].replace(
                    year=datetime.utcnow().year)
            analysis_time = res['analysis_time']
            timestamp = analysis_time.strftime("%Y%m%d%H%M")
        else:
            raise NwpPrepareError(
                "Cannot parse analysis_time from file name. "
                "Check config and filename timestamp.")

        if 'forecast_time' in res:
            if res['forecast_time'].year == 1900:
                res['forecast_time'] = res['forecast_time'].replace(
                    year=datetime.utcnow().year)
            forecast_time = res['forecast_time']
            step_delta = forecast_time - analysis_time
            # Forecast step in whole hours; integer division keeps it an int,
            # as needed both for the nlengths check and the %d formatting.
            forecast_step = step_delta.days * 24 + step_delta.seconds // 3600
            timeinfo = "{:s}{:s}{:s}".format(
                analysis_time.strftime("%m%d%H%M"),
                forecast_time.strftime("%m%d%H%M"),
                res['end'])
        else:
            LOG.info("Cannot parse forecast_time from file name. "
                     "Trying forecast step...")
            # This needs to be done more solidly using the sift pattern! FIXME!
            timeinfo = filename.rsplit("_", 1)[-1]

            # Forecast step in hours:
            if 'forecast_step' in res:
                forecast_step = res['forecast_step']
            else:
                raise NwpPrepareError(
                    'Failed parsing forecast_step from file name. '
                    'Check config and filename timestamp.')

        LOG.debug("Analysis time and start time: %s %s",
                  str(analysis_time), str(starttime))
        if analysis_time < starttime:
            continue
        if forecast_step not in nlengths:
            LOG.debug("Skip step. Forecast step and nlengths: %s %s",
                      str(forecast_step), str(nlengths))
            continue

        LOG.info("timestamp, step: %s %s", str(timestamp), str(forecast_step))
        result_file = os.path.join(
            nwp_outdir,
            nwp_output_prefix + timestamp + "+" + '%.3dH00M' % forecast_step)
        if os.path.exists(result_file):
            LOG.info("File: " + str(result_file) + " already there...")
            continue

        tmp_filename = make_temp_filename(
            suffix="_" + timestamp + "+" + '%.3dH00M' % forecast_step,
            dir=nwp_outdir)
        LOG.info("result and tmp files: " +
                 str(result_file) + " " + str(tmp_filename))

        nhsp_file = os.path.join(nhsp_path, nhsp_prefix + timeinfo)
        if not os.path.exists(nhsp_file):
            LOG.warning("Corresponding nhsp-file not there: " + str(nhsp_file))
            continue

        cmd = ("grib_copy -w gridType=regular_ll " + nhsp_file +
               " " + tmp_filename)
        retv = run_command(cmd)
        LOG.debug("Returncode = " + str(retv))
        if retv != 0:
            LOG.error("Failed doing the grib_copy! "
                      "Will continue with the next file.")
            nfiles_error = nfiles_error + 1
            if nfiles_error > len(filelist) / 2:
                LOG.error("More than half of the Grib files failed "
                          "upon grib_copy!")
                raise IOError('Failed running grib_copy on many Grib files')
            continue

        if not os.path.exists(nwp_lsmz_filename):
            LOG.error("No static grib file with land-sea mask and "
                      "topography available. Can't prepare NWP data.")
            raise IOError('Failed getting static land-sea mask and topography')

        tmp_result_filename = make_temp_filename()
        cmd = ('cat ' + tmp_filename + " " +
               os.path.join(nhsf_path, nhsf_prefix + timeinfo) + " " +
               nwp_lsmz_filename + " > " + tmp_result_filename)
        LOG.debug("Add topography and land-sea mask to data:")
        LOG.debug("Command = " + str(cmd))
        _start = time.time()
        retv = os.system(cmd)
        _end = time.time()
        LOG.debug("os.system call took: %f seconds", _end - _start)
        LOG.debug("Returncode = " + str(retv))
        if retv != 0:
            LOG.warning("Failed generating nwp file %s ...", result_file)
            if os.path.exists(tmp_result_filename):
                os.remove(tmp_result_filename)
            raise IOError("Failed adding topography and land-sea "
                          "mask data to grib file")

        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        else:
            LOG.warning("tmp file %s gone! Cannot clean it...", tmp_filename)

        if check_nwp_content(tmp_result_filename):
            LOG.info('A check of the NWP file content has been attempted: %s',
                     result_file)
            _start = time.time()
            os.rename(tmp_result_filename, result_file)
            _end = time.time()
            LOG.debug("Rename file %s to %s: This took %f seconds",
                      tmp_result_filename, result_file, _end - _start)
        else:
            LOG.warning("Missing important fields. No nwp file %s written "
                        "to disk", result_file)
            if os.path.exists(tmp_result_filename):
                os.remove(tmp_result_filename)

    return
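The forecast-step arithmetic used throughout these scripts is worth spelling out; a small worked example with invented times:

from datetime import datetime

analysis_time = datetime(2023, 5, 1, 0, 0)
forecast_time = datetime(2023, 5, 2, 6, 0)

delta = forecast_time - analysis_time
# timedelta stores days and residual seconds separately, so the step in
# whole hours is days * 24 plus the residual seconds integer-divided:
step_hours = delta.days * 24 + delta.seconds // 3600   # 30
step = "{:03d}H{:02d}M".format(step_hours, 0)           # '030H00M'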
class PPSReader(Reader):
    """Reader class for PPS files"""
    pformat = "nc_pps_l2"

    def __init__(self, *args, **kwargs):
        Reader.__init__(self, *args, **kwargs)
        # Source of the data, 'local' or 'ears'
        self._source = None
        # Parser for getting info from the file names
        self._parser = None
        # Satellite config
        self._config = None
        # Location of geolocation files, required for 'local' products
        self._cloud_product_geodir = None
        # Name of the product having geolocation for 'local' products
        self._geolocation_product_name = None

    def _read_config(self, sat_name, instrument_name):
        '''Read config for the satellite'''

        if self._config:
            return

        self._config = ConfigParser()
        configfile = os.path.join(CONFIG_PATH, sat_name + ".cfg")
        LOG.debug("Read configfile %s", configfile)
        self._config.read(configfile)

        try:
            self._cloud_product_geodir = \
                self._config.get(instrument_name + "-level3",
                                 "cloud_product_geodir",
                                 raw=True,
                                 vars=os.environ)
        except NoOptionError:
            pass

        LOG.debug("cloud_product_geodir = %s", self._cloud_product_geodir)

        try:
            self._geolocation_product_name = \
                self._config.get(instrument_name + "-level3",
                                 "geolocation_product_name",
                                 raw=True,
                                 vars=os.environ)
        except NoOptionError:
            if self._source != 'ears':
                LOG.warning("No geolocation product name given in config, "
                            "using default: %s", GEO_PRODUCT_NAME_DEFAULT)
                self._geolocation_product_name = GEO_PRODUCT_NAME_DEFAULT

    def _determine_prod_and_geo_files(self, prodfilenames):
        """From the list of product files and the products to load,
        determine the product files and the geolocation files that will
        be considered when reading the data.
        """
        # geofiles4product is a dict listing all geolocation files
        # applicable for each product.
        # prodfiles4product is a dict listing all product files for a
        # given product name.
        prodfiles4product = {}
        geofiles4product = {}
        if prodfilenames:
            if not isinstance(prodfilenames, (list, set, tuple)):
                prodfilenames = [prodfilenames]
            for fname in prodfilenames:
                # Only standard NWCSAF/PPS and EARS-NWC naming accepted!
                # No support for old file names (< PPSv2014)
                if (os.path.basename(fname).startswith("S_NWC") or
                        os.path.basename(fname).startswith("W_XX-EUMETSAT")):
                    if not self._parser:
                        if os.path.basename(fname).startswith("S_NWC"):
                            self._source = 'local'
                            self._parser = Parser(LOCAL_PPS_FILE_MASK)
                        else:
                            self._source = 'ears'
                            self._parser = Parser(EARS_PPS_FILE_MASK)
                else:
                    LOG.info("Unrecognized NWCSAF/PPS file: %s", fname)
                    continue

                parse_info = self._parser.parse(os.path.basename(fname))
                prodname = parse_info['product']

                if prodname not in prodfiles4product:
                    prodfiles4product[prodname] = []

                prodfiles4product[prodname].append(fname)

            # Assemble geolocation information
            if self._source == 'ears':
                # For EARS data, the files have geolocation in themselves
                for prodname, fnames in prodfiles4product.iteritems():
                    geofiles4product[prodname] = fnames
            else:
                # For locally processed data, use the geolocation from
                # the product defined in config
                if self._geolocation_product_name in prodfiles4product:
                    for prodname in prodfiles4product.keys():
                        geofiles4product[prodname] = \
                            prodfiles4product[self._geolocation_product_name]
                else:
                    # If the product files with geolocation are not used,
                    # assume that they are still available on the disk.
                    if self._cloud_product_geodir is None:
                        LOG.warning("Config option 'cloud_product_geodir' is "
                                    "not available! Assuming same directory "
                                    "as products.")
                    for prodname in prodfiles4product.keys():
                        geofiles4product[prodname] = []
                        for fname in prodfiles4product[prodname]:
                            directory = self._cloud_product_geodir or \
                                os.path.abspath(fname)
                            parse_info = \
                                self._parser.parse(os.path.basename(fname))
                            fname = fname.replace(
                                parse_info['product'],
                                self._geolocation_product_name)
                            fname = os.path.join(directory, fname)
                            geofiles4product[prodname].append(fname)

            # Check that each product file has a corresponding geolocation
            # file:
            '''
            if self._geolocation_product_name:
                for prod in products:
                    if prod not in geofiles4product:
                        LOG.error("No product name %s in dict "
                                  "geofiles4product!", prod)
                        continue
                    if prod not in prodfiles4product:
                        LOG.error("No product name %s in dict "
                                  "prodfiles4product!", prod)
                        continue
                    if len(geofiles4product[prod]) != \
                            len(prodfiles4product[prod]):
                        LOG.error("Mismatch in number of product files and "
                                  "matching geolocation files!")
            '''

        return prodfiles4product, geofiles4product

    def load(self, satscene, **kwargs):
        """Read data from file and load it into *satscene*.
        """

        prodfilenames = kwargs.get('filename')
        time_interval = kwargs.get('time_interval')
        if prodfilenames and time_interval:
            LOG.warning("You have specified both a list of files "
                        "and a time interval")
            LOG.warning("Specifying a time interval will only take effect "
                        "if no files are specified")
            time_interval = None

        products = satscene.channels_to_load & set(PPS_PRODUCTS)
        if len(products) == 0:
            LOG.debug("No PPS cloud products to load, abort")
            return

        self._read_config(satscene.fullname, satscene.instrument_name)

        LOG.info("Products to load: %s", str(products))

        # If a list of files is provided to the load call, disregard the
        # directory and filename specifications/definitions in the config
        # file.
        if not prodfilenames:
            try:
                area_name = satscene.area_id or satscene.area.area_id
            except AttributeError:
                area_name = "satproj_?????_?????"

            # Make the list of files for the requested products:
            if isinstance(time_interval, (tuple, set, list)) and \
                    len(time_interval) == 2:
                time_start, time_end = time_interval
            else:
                time_start, time_end = satscene.time_slot, None

            LOG.debug("Start and end times: %s %s",
                      str(time_start), str(time_end))
            prodfilenames = get_filenames(satscene, products, self._config,
                                          (time_start, time_end), area_name)

        LOG.debug("Product files: %s", str(prodfilenames))

        retv = self._determine_prod_and_geo_files(prodfilenames)
        prodfiles4product, geofiles4product = retv

        # Reading the products
        classes = {"CTTH": CloudTopTemperatureHeight,
                   "CT": CloudType,
                   "CMA": CloudMask,
                   "PC": PrecipitationClouds,
                   "CPP": CloudPhysicalProperties}
        nodata_mask = False
        read_external_geo = {}
        for product in products:
            LOG.debug("Loading %s", product)

            if product not in prodfiles4product:
                LOG.warning("No files found for product: %s", product)
                continue

            pps_band = PPSProductData(prodfiles4product[product]).read()
            chn = classes[product]()
            chn.read(pps_band)

            if chn.name not in satscene:
                LOG.info("Adding new channel %s", chn.name)
                satscene.channels.append(chn)

            # Check if geolocation is loaded:
            if not chn.area:
                read_external_geo[product] = satscene.channels[-1].name

        # Check if some 'channel'/product needs geolocation. If some
        # product does not have geolocation, get it from the geofilename:
        from pyresample import geometry

        # Load geolocation
        for chn_name in read_external_geo.values():
            LOG.debug("ch_name = %s", str(chn_name))
            chn = satscene[chn_name]
            geofilenames = geofiles4product[chn_name]
            LOG.debug("Geo-files = %s", str(geofilenames))
            geoloc = PpsGeolocationData(chn.shape,
                                        chn.granule_lengths,
                                        geofilenames).read()

            try:
                satscene[chn.name].area = geometry.SwathDefinition(
                    lons=geoloc.longitudes, lats=geoloc.latitudes)

                area_name = ("swath_" + satscene.fullname + "_" +
                             str(satscene.time_slot) + "_" +
                             str(chn.shape) + "_" + chn.name)
                satscene[chn.name].area.area_id = area_name
                satscene[chn.name].area_id = area_name
            except ValueError:
                LOG.exception('Failed making a SwathDefinition: '
                              'min,max lons,lats = (%f,%f) (%f,%f)',
                              geoloc.longitudes.data.min(),
                              geoloc.longitudes.data.max(),
                              geoloc.latitudes.data.min(),
                              geoloc.latitudes.data.max())
                LOG.warning("No geolocation loaded for %s", str(chn_name))

        # PpsGeolocationData.clear_cache()

        return
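The filename rewrite used for 'local' products above amounts to swapping the product token inside the parsed name; a tiny sketch (the mask and names here are illustrative stand-ins for LOCAL_PPS_FILE_MASK and the configured geolocation product):

# Sketch: derive a geolocation file name from a product file name.
from trollsift import Parser

parser = Parser("S_NWC_{product:s}_{platform_name:s}_{orbit:s}_"
                "{start_time:%Y%m%dT%H%M%S%f}Z_{end_time:%Y%m%dT%H%M%S%f}Z.nc")

prodfile = ("S_NWC_CT_noaa19_12345_20230501T120000000Z_"
            "20230501T121500000Z.nc")
info = parser.parse(prodfile)
# Replace the product token with the geolocation-carrying product:
geofile = prodfile.replace(info['product'], 'CMA')
# -> 'S_NWC_CMA_noaa19_12345_...' for the same granule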
def get_filenames(filepattern):
    parser = Parser(filepattern)
    for filename in glob.iglob(parser.globify()):
        yield filename, parser.parse(filename)
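Typical use of the generator above; the pattern is illustrative (note that purely free-form fields can parse ambiguously, so fixed-width or date-formatted fields are safer):

# Iterate over matching files and their parsed metadata.
for fname, meta in get_filenames(
        "/data/{platform:s}_{start_time:%Y%m%d%H%M}.nc"):
    print(fname, meta['start_time'])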