def _processors_data(self) -> List[ProcessorData]:
    processors_data = []
    ftp = FTP(self._provider_url_parsed.hostname, timeout=20)
    ftp.login()
    base_path = self._provider_url_parsed.path
    directory_dates = ftp.nlst(base_path)
    if directory_dates:
        # arbitrarily choosing the most recent date in the interim
        base_path = directory_dates[-1]
    for directory_product in ftp.nlst(base_path):
        if os.path.basename(directory_product) in self.PRODUCT_DIRECTORIES:
            files = ftp.nlst(directory_product)
            for file in files:
                # we only want "tm02" files ("time minus 2 hour" files, valid two hours before cycle time)
                if (os.path.basename(directory_product) != self.PRODUCT_TIME_SLICE
                        and not os.path.basename(file).startswith('nwm.t02z.')):
                    continue
                processors_data.append(ProcessorData(
                    named_storm_id=self._named_storm.id,
                    provider_id=self._provider.id,
                    url='ftp://{}{}'.format(self._provider_url_parsed.hostname, file),
                    label=os.path.basename(file),
                    kwargs=self._processor_kwargs(),
                    group=os.path.basename(directory_product),
                ))
    # filter
    processors_data = self.generic_filter(processors_data)
    return processors_data

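# Illustrative layout of the FTP tree walked above (hypothetical date, product
# directories and file names; the real structure depends on the provider url,
# PRODUCT_DIRECTORIES and PRODUCT_TIME_SLICE):
#
#   <base_path>/
#       nwm.20181011/                  <- date directories; the most recent is used
#           analysis_assim/            <- product directory
#               nwm.t02z.analysis_assim.channel_rt.tm02.conus.nc
#           forcing_analysis_assim/
#               ...
#
# One ProcessorData entry is built per matching file, grouped by its product directory.
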
def _processors_data(self) -> List[ProcessorData]:
    return [
        ProcessorData(
            named_storm_id=self._named_storm.id,
            provider_id=self._provider.id,
            url=self._provider.url,
            kwargs=self._processor_kwargs(),
        )
    ]

def process_dataset_task(data: list):
    """
    Run the dataset processor
    """
    processor_data = ProcessorData(*data)
    named_storm = get_object_or_404(NamedStorm, pk=processor_data.named_storm_id)
    provider = get_object_or_404(CoveredDataProvider, pk=processor_data.provider_id)
    processor_cls = processor_class(provider)
    processor = processor_cls(
        named_storm=named_storm,
        provider=provider,
        url=processor_data.url,
        label=processor_data.label,
        group=processor_data.group,
        **processor_data.kwargs,  # include any extra kwargs
    )
    processor.fetch()
    return processor.to_dict()

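# A minimal usage sketch (hypothetical ids and url): since the task rebuilds its
# argument positionally via ProcessorData(*data), the payload is presumably the
# ProcessorData fields serialized to a plain list before being dispatched, e.g.
#
#   data = list(ProcessorData(
#       named_storm_id=1,                        # hypothetical NamedStorm pk
#       provider_id=2,                           # hypothetical CoveredDataProvider pk
#       url='https://example.com/dataset.nc',    # hypothetical dataset url
#       label='dataset.nc',
#       kwargs={},
#   ))
#   result = process_dataset_task(data)          # returns processor.to_dict()
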
def _processors_data(self) -> List[ProcessorData]:
    dataset_paths = []
    processors_data = []

    # build catalogRefs and filter
    catalog_refs = self._catalog_ref_elements(self._provider.url)
    catalog_refs = self.generic_filter(catalog_refs)

    # build list of catalog urls
    catalog_urls = [self._catalog_ref_href(ref) for ref in catalog_refs]

    # build a list of relevant datasets for each station
    catalogs = self._catalog_documents(catalog_urls)
    for station in catalogs:
        for dataset in station.xpath('//catalog:dataset', namespaces=self.namespaces):
            if self._is_using_dataset(dataset.get('name')):
                dataset_paths.append(dataset.get('urlPath'))

    # build a list of processors for all the relevant datasets
    for dataset_path in dataset_paths:
        # remove extension since it's handled later
        label, _ = os.path.splitext(os.path.basename(dataset_path))
        url = '{}://{}/{}/{}'.format(
            self._provider_url_parsed.scheme,
            self._provider_url_parsed.hostname,
            'thredds/dodsC',
            dataset_path,
        )
        processors_data.append(ProcessorData(
            named_storm_id=self._named_storm.id,
            provider_id=self._provider.id,
            url=url,
            label=label,
            kwargs=self._processor_kwargs(),
        ))

    return processors_data

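# Illustrative mapping (hypothetical host and dataset path): a catalog <dataset>
# element with urlPath "station/12345/data.nc" would yield an OPeNDAP-style url of
#   https://example.gov/thredds/dodsC/station/12345/data.nc
# with label "data" (the extension is stripped here and handled later).
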
def _processors_data(self) -> List[ProcessorData]:
    processors_data = []

    # fetch deployment types
    deployment_types_req = requests.get(
        'https://stn.wim.usgs.gov/STNServices/DeploymentTypes.json', timeout=10)
    deployment_types_req.raise_for_status()
    self.deployment_types = deployment_types_req.json()

    # fetch event sensors
    sensors_req = requests.get(
        'https://stn.wim.usgs.gov/STNServices/Events/{}/Instruments.json'.format(
            self._named_storm_covered_data.external_storm_id),
        timeout=10,
    )
    sensors_req.raise_for_status()
    self.sensors = sensors_req.json()

    # fetch event data files
    files_req = requests.get(
        'https://stn.wim.usgs.gov/STNServices/Events/{}/Files.json'.format(
            self._named_storm_covered_data.external_storm_id),
        timeout=10,
    )
    files_req.raise_for_status()
    files_json = files_req.json()

    # filter unique files
    files_json = self._filter_unique_files(files_json)

    # filter
    files_json = self.generic_filter(files_json)

    # build a list of data processors for all the files/sensors for this event
    for file in files_json:
        # skip files that don't have an associated "instrument_id"
        if not file.get('instrument_id'):
            continue
        # skip files that aren't "data" files
        if file['filetype_id'] != self.FILE_TYPE_DATA:
            continue
        # skip files whose sensors aren't in the valid list of deployment types
        if not self._is_valid_sensor_deployment_type(file):
            continue
        # skip files whose types are blacklisted
        if not self._is_valid_file(file):
            continue
        file_url = 'https://stn.wim.usgs.gov/STNServices/Files/{}/item'.format(file['file_id'])
        processors_data.append(ProcessorData(
            named_storm_id=self._named_storm.id,
            provider_id=self._provider.id,
            url=file_url,
            label=file['name'],
            group=self._sensor_deployment_type(file['instrument_id']),
            kwargs=self._processor_kwargs(),
        ))

    return processors_data

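# Illustrative shape of a Files.json record evaluated by the loop above
# (hypothetical values; only the keys referenced in the code are shown):
#
#   {
#       "file_id": 12345,        # used to build the STNServices item url
#       "name": "sensor_data.csv",
#       "filetype_id": 2,        # must equal FILE_TYPE_DATA to be kept
#       "instrument_id": 6789,   # required; maps the file to a sensor/deployment type
#   }
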
def _processors_data(self) -> List[ProcessorData]:
    processors_data = []

    # fetch and parse the station listings
    stations_response = requests.get(self.API_STATIONS_URL, timeout=10)
    stations_response.raise_for_status()
    stations_json = stations_response.json()
    stations = stations_json['stations']

    # build a list of stations to collect data
    for station in stations:
        lat = station['lat']
        lng = station['lng']
        station_point = Point(x=lng, y=lat)

        # skip this station if it's outside our covered data's geo
        if not self._named_storm_covered_data.geo.contains(station_point):
            continue

        # get a list of products this station offers
        products_request = requests.get(station['products']['self'], timeout=10)
        if products_request.ok:
            station_products = [p['name'] for p in products_request.json()['products']]
        else:
            continue

        # build a list for each product that's available
        for product in self.PRODUCTS:

            # verify product "name" was added to the station's available products
            if product[1] not in station_products:
                continue

            label = 'station-{}-{}'.format(station['id'], product[0])

            query_args = dict(
                begin_date=self._named_storm_covered_data.date_start.strftime(self.DATE_FORMAT_STR),
                end_date=self._named_storm_covered_data.date_end.strftime(self.DATE_FORMAT_STR),
                station=station['id'],
                product=product[0],
                units='metric',
                time_zone='gmt',
                application='cwwed',
                format=self.FILE_TYPE,
            )

            # PRODUCT_WATER_LEVEL only
            if product[0] == self.PRODUCT_WATER_LEVEL[0]:

                # skip this station if it doesn't offer the right DATUM
                datum_request = requests.get(station['datums']['self'], timeout=10)
                if datum_request.ok:
                    if not [d for d in datum_request.json()['datums'] if d['name'] == self.DATUM]:
                        continue
                else:
                    continue

                # include "datum" in query args
                query_args.update({
                    'datum': self.DATUM,
                })

                # include "datum" in label
                label = '{}-{}'.format(label, self.DATUM)

            #
            # success - add station to list
            #

            url = '{}?{}'.format(self.API_DATA_URL, parse.urlencode(query_args))

            processors_data.append(ProcessorData(
                named_storm_id=self._named_storm.id,
                provider_id=self._provider.id,
                url=url,
                label='{}.{}'.format(label, self.FILE_TYPE),
                kwargs=self._processor_kwargs(),
                group=product[1],
            ))

    return processors_data

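# Illustrative request built for a single station/product pair (hypothetical station
# id, dates, datum and file type; API_DATA_URL, DATE_FORMAT_STR, DATUM and FILE_TYPE
# are class attributes defined elsewhere):
#
#   {API_DATA_URL}?begin_date=20121022&end_date=20121102&station=8518750
#     &product=water_level&units=metric&time_zone=gmt&application=cwwed
#     &format=csv&datum=NAVD
#
# One ProcessorData entry is produced per (station, product) pair that passes the
# geo containment and datum checks.
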
def _processors_data(self) -> List[ProcessorData]:
    dataset_paths = []
    processors_data = []

    # fetch and build first level catalog refs (i.e. designated by year)
    catalog_ref_elements_year = self._catalog_ref_elements(self._provider.url)
    catalog_ref_elements_year = self._filter_catalog_refs_by_year(catalog_ref_elements_year)

    # fetch and build second level catalogs (i.e. for day of the year)
    catalog_refs = []
    for ref_year in catalog_ref_elements_year:
        year = int(self._catalog_ref_title(ref_year))
        url = self._catalog_ref_url(ref_year)
        catalog_documents_day = self._catalog_documents([url])
        catalog_ref_elements_day = []
        for catalog in catalog_documents_day:
            catalog_ref_elements_day += catalog.xpath('//catalog:catalogRef', namespaces=self.namespaces)
        catalog_ref_elements_day = self._filter_catalog_refs_by_day(year, catalog_ref_elements_day)
        catalog_refs += catalog_ref_elements_day

    # build a list of actual URLs for each daily catalog
    catalog_ref_urls_day = []
    for ref in catalog_refs:
        catalog_ref_urls_day.append(self._catalog_ref_url(ref))

    # build a list of relevant datasets for each catalog
    catalog_documents = self._catalog_documents(catalog_ref_urls_day)
    for catalog_document in catalog_documents:
        for dataset in catalog_document.xpath('//catalog:dataset', namespaces=self.namespaces):
            if self._is_using_dataset(dataset.get('name')):
                dataset_paths.append(dataset.get('ID'))

    # filter datasets
    dataset_paths = self.generic_filter(dataset_paths)

    # build a list of processors for all the relevant datasets
    for dataset_path in dataset_paths:
        label = os.path.basename(dataset_path)
        url = '{}://{}/{}'.format(
            self._provider_url_parsed.scheme,
            self._provider_url_parsed.hostname,
            dataset_path,
        )
        folder = os.path.basename(os.path.dirname(url))
        processors_data.append(ProcessorData(
            named_storm_id=self._named_storm.id,
            provider_id=self._provider.id,
            url=url,
            label=label,
            group=folder,
            kwargs=self._processor_kwargs(),
        ))

    return processors_data

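# Illustrative traversal (hypothetical catalog layout): the provider's top-level
# catalog is assumed to list one catalogRef per year, each year catalog one
# catalogRef per day-of-year, and each daily catalog the actual datasets, e.g.
#
#   catalog.xml             -> <catalogRef title="2012" .../>
#   2012/catalog.xml        -> <catalogRef title="305" .../>   (day of year)
#   2012/305/catalog.xml    -> <dataset name="..." ID="..."/>
#
# with the year/day filters presumably keeping only refs that overlap the covered
# data's date range.
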