Example #1
    def _processors_data(self) -> List[ProcessorData]:
        processors_data = []
        ftp = FTP(self._provider_url_parsed.hostname, timeout=20)
        ftp.login()
        base_path = self._provider_url_parsed.path
        directory_dates = ftp.nlst(base_path)
        if directory_dates:
            # arbitrarily choosing the most recent date in the interim
            base_path = directory_dates[-1]
            for directory_product in ftp.nlst(base_path):
                if os.path.basename(directory_product) in self.PRODUCT_DIRECTORIES:
                    files = ftp.nlst(directory_product)
                    for file in files:
                        # we only want tm02 files ("time minus 2 hour" files, valid two hours before cycle time)
                        if (os.path.basename(directory_product) != self.PRODUCT_TIME_SLICE
                                and not os.path.basename(file).startswith('nwm.t02z.')):
                            continue
                        processors_data.append(
                            ProcessorData(
                                named_storm_id=self._named_storm.id,
                                provider_id=self._provider.id,
                                url='ftp://{}{}'.format(self._provider_url_parsed.hostname, file),
                                label=os.path.basename(file),
                                kwargs=self._processor_kwargs(),
                                group=os.path.basename(directory_product),
                            ))

        # filter
        processors_data = self.generic_filter(processors_data)

        return processors_data
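
A note on the listing above: the FTP connection is opened but never explicitly closed. A minimal sketch of the same listing step using ftplib's context-manager support, which calls quit() on exit (the host and path below are hypothetical placeholders; the example derives them from the provider URL):

from ftplib import FTP

hostname = 'ftp.example.gov'    # hypothetical host
base_path = '/pub/nwm/prod'     # hypothetical path

# sketch only: fetch the date directories and always close the control connection
with FTP(hostname, timeout=20) as ftp:
    ftp.login()
    directory_dates = ftp.nlst(base_path)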
Example #2
    def _processors_data(self) -> List[ProcessorData]:
        return [
            ProcessorData(
                named_storm_id=self._named_storm.id,
                provider_id=self._provider.id,
                url=self._provider.url,
                kwargs=self._processor_kwargs(),
            )
        ]
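
Every example constructs ProcessorData with keyword arguments, and Example #3 below rebuilds it positionally via ProcessorData(*data), which suggests a namedtuple-style container. A hedged sketch of such a definition (the field names come from the calls in these examples; the field order and defaults are assumptions, not the original declaration):

from typing import NamedTuple, Optional

class ProcessorData(NamedTuple):
    # hypothetical declaration inferred from usage; only label and group appear optional
    named_storm_id: int
    provider_id: int
    url: str
    label: Optional[str] = None
    group: Optional[str] = None
    kwargs: Optional[dict] = None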
Example #3
def process_dataset_task(data: list):
    """
    Run the dataset processor
    """
    processor_data = ProcessorData(*data)
    named_storm = get_object_or_404(NamedStorm,
                                    pk=processor_data.named_storm_id)
    provider = get_object_or_404(CoveredDataProvider,
                                 pk=processor_data.provider_id)
    processor_cls = processor_class(provider)
    processor = processor_cls(
        named_storm=named_storm,
        provider=provider,
        url=processor_data.url,
        label=processor_data.label,
        group=processor_data.group,
        **processor_data.kwargs,  # include any extra kwargs
    )
    processor.fetch()
    return processor.to_dict()
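
Since the task receives a plain list and rebuilds the namedtuple with ProcessorData(*data), the caller presumably serializes each ProcessorData before dispatch. A hedged sketch of that hand-off, assuming process_dataset_task is a Celery task (the .delay() call and the collector variable are assumptions, not taken from the source):

# hypothetical dispatch loop: serialize each ProcessorData to a list so the task
# can rebuild it with ProcessorData(*data)
for processor_data in collector._processors_data():
    process_dataset_task.delay(list(processor_data))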
Example #4
    def _processors_data(self) -> List[ProcessorData]:
        dataset_paths = []
        processors_data = []

        # build catalogRefs and filter
        catalog_refs = self._catalog_ref_elements(self._provider.url)
        catalog_refs = self.generic_filter(catalog_refs)

        # build list of catalog urls
        catalog_urls = [self._catalog_ref_href(ref) for ref in catalog_refs]

        # build a list of relevant datasets for each station
        catalogs = self._catalog_documents(catalog_urls)
        for station in catalogs:
            for dataset in station.xpath('//catalog:dataset',
                                         namespaces=self.namespaces):
                if self._is_using_dataset(dataset.get('name')):
                    dataset_paths.append(dataset.get('urlPath'))

        # build a list of processors for all the relevant datasets
        for dataset_path in dataset_paths:
            # remove extension since it's handled later
            label, _ = os.path.splitext(os.path.basename(dataset_path))
            url = '{}://{}/{}/{}'.format(
                self._provider_url_parsed.scheme,
                self._provider_url_parsed.hostname,
                'thredds/dodsC',
                dataset_path,
            )
            processors_data.append(
                ProcessorData(
                    named_storm_id=self._named_storm.id,
                    provider_id=self._provider.id,
                    url=url,
                    label=label,
                    kwargs=self._processor_kwargs(),
                ))

        return processors_data
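
The xpath calls above depend on a namespaces mapping that binds the catalog: prefix. A standalone sketch of that lookup against a THREDDS catalog with lxml (the namespace URI is the standard THREDDS InvCatalog 1.0 namespace; the catalog URL is hypothetical):

import requests
from lxml import etree

# bind the "catalog" prefix used in the xpath expressions to the THREDDS InvCatalog namespace
namespaces = {'catalog': 'http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0'}

response = requests.get('https://thredds.example.gov/thredds/catalog.xml', timeout=10)  # hypothetical URL
response.raise_for_status()
document = etree.fromstring(response.content)

# collect the dataset urlPath attributes the same way the example does
dataset_paths = [
    dataset.get('urlPath')
    for dataset in document.xpath('//catalog:dataset', namespaces=namespaces)
    if dataset.get('urlPath')
]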
Example #5
    def _processors_data(self) -> List[ProcessorData]:
        processors_data = []

        # fetch deployment types
        deployment_types_req = requests.get(
            'https://stn.wim.usgs.gov/STNServices/DeploymentTypes.json',
            timeout=10)
        deployment_types_req.raise_for_status()
        self.deployment_types = deployment_types_req.json()

        # fetch event sensors
        sensors_req = requests.get(
            'https://stn.wim.usgs.gov/STNServices/Events/{}/Instruments.json'.format(
                self._named_storm_covered_data.external_storm_id),
            timeout=10,
        )
        sensors_req.raise_for_status()
        self.sensors = sensors_req.json()

        # fetch event data files
        files_req = requests.get(
            'https://stn.wim.usgs.gov/STNServices/Events/{}/Files.json'.format(
                self._named_storm_covered_data.external_storm_id),
            timeout=10,
        )
        files_req.raise_for_status()
        files_json = files_req.json()

        # filter unique files
        files_json = self._filter_unique_files(files_json)

        # filter
        files_json = self.generic_filter(files_json)

        # build a list of data processors for all the files/sensors for this event
        for file in files_json:

            # skip files that don't have an associated "instrument_id"
            if not file.get('instrument_id'):
                continue
            # skip files that aren't "data" files
            if file['filetype_id'] != self.FILE_TYPE_DATA:
                continue
            # skip files where their sensors aren't in the valid list of deployment types
            if not self._is_valid_sensor_deployment_type(file):
                continue
            # skip files where their types are blacklisted
            if not self._is_valid_file(file):
                continue

            file_url = 'https://stn.wim.usgs.gov/STNServices/Files/{}/item'.format(
                file['file_id'])
            processors_data.append(
                ProcessorData(
                    named_storm_id=self._named_storm.id,
                    provider_id=self._provider.id,
                    url=file_url,
                    label=file['name'],
                    group=self._sensor_deployment_type(file['instrument_id']),
                    kwargs=self._processor_kwargs(),
                ))

        return processors_data
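
The loop above only touches a few keys on each files_json entry. A hedged illustration of the fields the filtering depends on (the values are placeholders, and the real STN payload carries many more fields):

# illustrative record showing only the keys the loop inspects
example_file = {
    'file_id': 12345,        # used to build the per-file download URL
    'name': 'example_sensor_data.csv',
    'instrument_id': 678,    # must be present and map to an allowed deployment type
    'filetype_id': 2,        # must equal self.FILE_TYPE_DATA (placeholder value)
}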
Example #6
    def _processors_data(self) -> List[ProcessorData]:
        processors_data = []

        # fetch and parse the station listings
        stations_response = requests.get(self.API_STATIONS_URL, timeout=10)
        stations_response.raise_for_status()
        stations_json = stations_response.json()
        stations = stations_json['stations']

        # build a list of stations to collect data
        for station in stations:

            lat = station['lat']
            lng = station['lng']
            station_point = Point(x=lng, y=lat)

            # skip this station if it's outside our covered data's geo
            if not self._named_storm_covered_data.geo.contains(station_point):
                continue

            # get a list of products this station offers
            products_request = requests.get(station['products']['self'],
                                            timeout=10)
            if products_request.ok:
                station_products = [
                    p['name'] for p in products_request.json()['products']
                ]
            else:
                continue

            # build a list for each product that's available
            for product in self.PRODUCTS:

                # verify product "name" was added to the station's available products
                if product[1] not in station_products:
                    continue

                label = 'station-{}-{}'.format(station['id'], product[0])

                query_args = dict(
                    begin_date=self._named_storm_covered_data.date_start.strftime(self.DATE_FORMAT_STR),
                    end_date=self._named_storm_covered_data.date_end.strftime(self.DATE_FORMAT_STR),
                    station=station['id'],
                    product=product[0],
                    units='metric',
                    time_zone='gmt',
                    application='cwwed',
                    format=self.FILE_TYPE,
                )

                # PRODUCT_WATER_LEVEL only
                if product[0] == self.PRODUCT_WATER_LEVEL[0]:

                    # skip this station if it doesn't offer the right DATUM
                    datum_request = requests.get(station['datums']['self'],
                                                 timeout=10)
                    if datum_request.ok:
                        if not any(d['name'] == self.DATUM
                                   for d in datum_request.json()['datums']):
                            continue
                    else:
                        continue

                    # include "datum" in query args
                    query_args.update({
                        'datum': self.DATUM,
                    })

                    # include "datum" in label
                    label = '{}-{}'.format(label, self.DATUM)

                #
                # success - add station to list
                #

                url = '{}?{}'.format(self.API_DATA_URL,
                                     parse.urlencode(query_args))

                processors_data.append(
                    ProcessorData(
                        named_storm_id=self._named_storm.id,
                        provider_id=self._provider.id,
                        url=url,
                        label='{}.{}'.format(label, self.FILE_TYPE),
                        kwargs=self._processor_kwargs(),
                        group=product[1],
                    ))

        return processors_data
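
The loop indexes product[0] and product[1] and compares against PRODUCT_WATER_LEVEL[0], so each entry in self.PRODUCTS is evidently an (api_product_name, display_name) pair. A hedged sketch of what those constants might look like (the values are guesses based on the CO-OPS data API, not taken from the source):

# hypothetical constants: product[0] is sent as the "product" query arg,
# product[1] must match a name in the station's advertised products
PRODUCT_WATER_LEVEL = ('water_level', 'Water Levels')
PRODUCTS = [
    PRODUCT_WATER_LEVEL,
    ('wind', 'Wind'),
    ('air_pressure', 'Barometric Pressure'),
]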
Example #7
    def _processors_data(self) -> List[ProcessorData]:
        dataset_paths = []
        processors_data = []

        # fetch and build first-level catalog refs (i.e. designated by year)
        catalog_ref_elements_year = self._catalog_ref_elements(
            self._provider.url)
        catalog_ref_elements_year = self._filter_catalog_refs_by_year(
            catalog_ref_elements_year)

        # fetch and build second-level catalogs (i.e. for day of the year)
        catalog_refs = []
        for ref_year in catalog_ref_elements_year:
            year = int(self._catalog_ref_title(ref_year))
            url = self._catalog_ref_url(ref_year)

            catalog_documents_day = self._catalog_documents([url])

            catalog_ref_elements_day = []
            for catalog in catalog_documents_day:
                catalog_ref_elements_day += catalog.xpath(
                    '//catalog:catalogRef', namespaces=self.namespaces)
            catalog_ref_elements_day = self._filter_catalog_refs_by_day(
                year, catalog_ref_elements_day)

            catalog_refs += catalog_ref_elements_day

        # build a list of actual URLs for each daily catalog ref
        catalog_ref_urls_day = []
        for ref in catalog_refs:
            catalog_ref_urls_day.append(self._catalog_ref_url(ref))

        # build a list of relevant datasets for each catalog
        catalog_documents = self._catalog_documents(catalog_ref_urls_day)
        for catalog_document in catalog_documents:
            for dataset in catalog_document.xpath('//catalog:dataset',
                                                  namespaces=self.namespaces):
                if self._is_using_dataset(dataset.get('name')):
                    dataset_paths.append(dataset.get('ID'))

        # filter datasets
        dataset_paths = self.generic_filter(dataset_paths)

        # build a list of processors for all the relevant datasets
        for dataset_path in dataset_paths:
            label = os.path.basename(dataset_path)
            url = '{}://{}/{}'.format(
                self._provider_url_parsed.scheme,
                self._provider_url_parsed.hostname,
                dataset_path,
            )
            folder = os.path.basename(os.path.dirname(url))
            processors_data.append(
                ProcessorData(
                    named_storm_id=self._named_storm.id,
                    provider_id=self._provider.id,
                    url=url,
                    label=label,
                    group=folder,
                    kwargs=self._processor_kwargs(),
                ))

        return processors_data
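
For each dataset the label is the file name and the group is its parent folder, so a catalog dataset ID and the resulting fields line up as follows (the path and host are hypothetical):

import os

dataset_path = 'datasets/2012/287/example_output.nc'   # hypothetical dataset ID from the catalog
url = 'https://thredds.example.gov/' + dataset_path     # scheme and host come from the provider URL
label = os.path.basename(dataset_path)                  # 'example_output.nc'
group = os.path.basename(os.path.dirname(url))          # '287' (the day-of-year folder)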