from bs4 import BeautifulSoup
from lxml import etree
from lxml.html.soupparser import fromstring
from thredds_crawler.crawl import Crawl
from django.conf import settings
# Assumed imports for the names used below (TDSCatalog from siphon,
# Dataset/num2date from netCDF4); the originals were not shown.
from siphon.catalog import TDSCatalog
from netCDF4 import Dataset, num2date
import os

#url = "http://*****:*****@'), "catalogRefs/CatalogTELEDEM.html")
url = "http://localhost:8080/thredds/CatalogTELEDM.html"
cat = TDSCatalog('http://localhost:8080/thredds/CatalogTELEDM.html')
#context = ssl._create_unverified_context()
#catT = TDSCatalog(url)
cat = Crawl(url)
datasets = [i.id for i in cat.datasets]

root = "/home/mers/Bureau/teledm/donnees/"

# Build a nested dict mirroring the catalog's directory structure
catalog = {}
for item in datasets:
    p = catalog
    for x in item.split('/'):
        p = p.setdefault(x, {})

ds = {}
for d in datasets:
    dp = d.split('/')
    nc = Dataset(root + d, 'r')
    dt = nc.variables['time']
    dates = num2date(dt[:], dt.units)
    dset = {}
#path='ANMN/NRS/NRSKAI'
path = 'ABOS/SOTS'
#path='ABOS/SOTS/2016'
if len(sys.argv) > 1:
    path = sys.argv[1]

#skips = Crawl.SKIPS + [".*FV00"]
skips = Crawl.SKIPS + [".*FV00", ".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME",
                       ".*regridded", ".*burst", ".*gridded", ".*long-timeseries"]
#skips = Crawl.SKIPS + [".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME", ".*regridded"]
#skips = Crawl.SKIPS + [".*regridded"]

crawl_path = 'http://thredds.aodn.org.au/thredds/catalog/IMOS/' + path + '/catalog.xml'
#crawl_path='http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Biogeochem_profiles/catalog.html'

c = Crawl(crawl_path, select=['.*SAZ.*2020'], skip=skips)
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-EAC/catalog.xml', select=['.*'])
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-ITF/catalog.xml', select=['.*'])
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/SOTS/catalog.xml', select=['.*'])
# print(c.datasets)

# service can be httpService or dapService
urls = [s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "httpserver"]  # httpserver or opendap

for url in urls:
    print(url)
def loadStationData(self, stride=1):
    '''Crawl the OceanSITES Mooring data TDS for OPeNDAP links and load into STOQS
    '''
    urls = []
    strides = {}
    for dataSet in self.dataSets:
        c = Crawl(dataSet[0], select=dataSet[1], debug=self.args.verbose)
        dsUrls = [s.get("url") for d in c.datasets for s in d.services
                  if s.get("service").lower() == "opendap"]
        for dsu in dsUrls:
            strides[dsu] = dataSet[2]
        urls += dsUrls

    # First pass through urls matching OceanSITES pattern to collect platform names to get colors
    # Use OceanSITES naming convention for platform "OS_<platformName>_xxx_R|D_<type>.nc"
    pNames = set()
    platformTypeNames = set()
    for url in urls:
        platformTypeNames.add(url.split('/')[-2])
        if url.find('MOVE1_') != -1:
            # Special hack for MOVE PlatformCode
            newUrl = url.replace('MOVE1_', 'MOVE1-')
            pNames.add(newUrl.split('/')[-1].split('.')[0].split('_')[1])
        else:
            pNames.add(url.split('/')[-1].split('.')[0].split('_')[1])

    # Assign colors by platformTypeName
    pColors = {}
    for ptName, color in zip(sorted(platformTypeNames), self.getColor(len(platformTypeNames))):
        pColors[ptName] = color

    # Now loop again, this time loading the data
    for url in urls:
        logger.info("Executing runMooringLoader with url = %s", url)
        if self.args.optimal_stride and strides[url]:
            stride = strides[url]
        elif self.args.test:
            stride = strides[url] * 2

        fixedUrl = url
        if url.find('OS_IMOS-EAC_EAC') != -1:
            # Special fix to get platform name
            fixedUrl = url.replace('OS_IMOS-EAC_EAC', 'OS_IMOS-EAC-EAC')

        if stride > 1:
            aName = fixedUrl.split('/')[-1].split('.')[0] + '(stride=%d)' % stride
        else:
            aName = fixedUrl.split('/')[-1].split('.')[0]

        pName = aName.split('_')[1]
        ptName = url.split('/')[-2]
        logger.debug("Instantiating Mooring_Loader for url = %s", url)
        try:
            ml = Mooring_Loader(
                url=url,
                campaignName=self.campaignName,
                campaignDescription=self.campaignDescription,
                dbAlias=self.dbAlias,
                activityName=aName,
                activitytypeName='Mooring Deployment',
                platformName=pName,
                platformColor=pColors[ptName],
                platformTypeName=ptName,
                stride=stride,
                startDatetime=self.startDatetime,
                dataStartDatetime=None,
                endDatetime=self.endDatetime)
        except UnicodeDecodeError as e:
            logger.warn(str(e))
            logger.warn(f'Cannot read data from {url}')
            continue

        # Special fixes for non-standard metadata and for files that don't contain the standard TEMP and PSAL parameters
        if url.find('MBARI-') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
        elif url.find('OS_PAPA_2009PA003_D_CTD_10min') != -1:
            ml.include_names = ['TEMP']
        elif url.find('OS_PAPA_2009PA003_D_PSAL_1hr') != -1:
            ml.include_names = ['PSAL']
        elif url.find('OS_SOTS_SAZ-15-2012_D_microcat-4422m') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            # DEPTH_CN_PR_PS_TE coordinate missing standard_name attribute
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH_CN_PR_PS_TE'}
            # Only global attribute is 'cdm_data_type: Time-series'; monkey-patch the method
            Mooring_Loader.getFeatureType = lambda self: 'timeseries'
        elif url.find('D_MICROCAT-PART') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
        elif url.find('D_RDI-WORKHORSE-ADCP-') != -1:
            ml.include_names = ['UCUR', 'VCUR', 'WCUR']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'HEIGHT_ABOVE_SENSOR'}
            # Metadata in file states 'timeseries', but it's really something different; monkey-patch the getFeatureType() method
            Mooring_Loader.getFeatureType = lambda self: 'trajectoryprofile'
        elif url.find('TVSM_dy.nc') != -1:
            ##ml.include_names = ['UCUR', 'VCUR', 'TEMP', 'PSAL', 'CSPD', 'CDIR']
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ('UCUR', 'VCUR', 'CSPD', 'CDIR'):
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPCUR'}
            for v in ('TEMP', ):
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
            for v in ('PSAL', ):
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPPSAL'}
            # These PIRATA daily files are timeSeriesProfile, which has no featureType attribute
            Mooring_Loader.getFeatureType = lambda self: 'timeseriesprofile'
        elif url.find('CCE') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
        elif url.find('NOG') != -1:
            ml.include_names = ['TEMP', 'PSAL']
            Mooring_Loader.getFeatureType = lambda self: 'timeseries'
        elif url.find('Stratus') != -1:
            # Variable attribute coordinates: TIME, DEPTH, LATITUDE, LONGITUDE; it should not contain commas
            ml.include_names = ['TEMP', 'PSAL']
            ml.auxCoords = {}
            for v in ml.include_names:
                ml.auxCoords[v] = {'time': 'TIME', 'latitude': 'LATITUDE',
                                   'longitude': 'LONGITUDE', 'depth': 'DEPTH'}
        else:
            ml.include_names = ['TEMP', 'PSAL']

        try:
            (nMP, path, parmCountHash) = ml.process_data()
            logger.debug("Loaded Activity with name = %s", aName)
        except NoValidData as e:
            logger.warning(e)
if args.post:
    token = os.environ['SLACKTOKEN']
    slack = Slacker(token)

# Assume that the database has already been created with description and terrain
# information, so use minimal arguments in the constructor
lm = CANONLoader(args.database, args.campaign)
lm.dbAlias = args.database
lm.campaignName = args.campaign

# Get directory list from sites
s = args.inUrl.rsplit('/', 1)
files = s[1]
url = s[0]
logger.info("Crawling %s for %s files", url, files)
c = Crawl(os.path.join(url, 'catalog.xml'), select=[files], debug=False)

for d in c.datasets:
    logger.debug('Found %s', d.id)

urls = [s2.get("url") for d in c.datasets for s2 in d.services
        if s2.get("service").lower() == "opendap"]

pw = lrauvNc4ToNetcdf.InterpolatorWriter()

# If parameter names contain any group forward slash '/' delimiters,
# replace them with underscores. This is because pydap automatically renames
# slashes as underscores, and the parameter must be referenced correctly in the DAPloader.
parm_list = []
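# Illustrative sketch of the slash-to-underscore renaming described in the comment
# above. The parameter names here are hypothetical (the real list comes from the
# script's arguments); only the replace() pattern follows what the comment describes.
example_parm_names = ['science/temperature', 'science/salinity', 'depth']
parm_list = [p.replace('/', '_') for p in example_parm_names]
# parm_list is now ['science_temperature', 'science_salinity', 'depth']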
def download(folder, projects, filesubset, since):
    # Use thredds_crawler to find DAP endpoints of the CF-1.6 data.
    skips = Crawl.SKIPS
    if projects:
        # Build a new list rather than extending Crawl.SKIPS in place
        skips = skips + [r'^(?!{}).*^(?!.*\.(cdf|nc)).*$'.format('|'.join(projects))]

    catalog = 'http://geoport.whoi.edu/thredds/catalog/usgs/data2/emontgomery/stellwagen/CF-1.6/catalog.html'

    try:
        datasets = Crawl(catalog, select=[r'.*\.(cdf|nc)'], skip=skips, after=since).datasets
        logger.info("Found {0} TOTAL datasets!".format(len(datasets)))
    except KeyboardInterrupt:
        logger.info("Breaking out of crawling loop.")
        datasets = []

    try:
        os.makedirs(folder)
    except OSError:
        pass

    # Save datasets to download directory
    saved_files = []
    for num, d in enumerate(datasets):

        if filesubset and d.name.lower() not in filesubset:
            continue

        try:
            http_url = next(s["url"] for s in d.services if s["service"].lower() == "httpserver")
            project_name = http_url.split("/")[-2]
        except StopIteration:
            logger.error("No HTTPServer endpoint found, skipping")
            continue

        # Make download folder
        save_file = os.path.join(folder, project_name, d.name)
        if not os.path.isdir(os.path.dirname(save_file)):
            os.makedirs(os.path.dirname(save_file))

        logger.info("Downloading {0}".format(http_url))
        try:
            with open(save_file, "wb") as f:
                r = requests.get(http_url, stream=True)
                if not r.ok:
                    logger.error("Could not download '{!s}' from '{!s}', skipping".format(d.name, http_url))
                    break
                for block in r.iter_content(1024):
                    if not block:
                        break
                    f.write(block)
        except KeyboardInterrupt:
            logger.info("Breaking out of download loop.")
            raise
        except BaseException:
            logger.error("Could not download... error with HTTP endpoint. Skipping.")
            continue

        # Try to open the file; if it fails, the write failed.
        nc = None
        try:
            nc = netCDF4.Dataset(save_file)
        except BaseException:
            os.remove(save_file)
        else:
            logger.info("{!s} saved ({!s}/{!s})".format(d.name, num + 1, len(datasets)))
            saved_files.append(save_file)
        finally:
            if nc is not None:
                nc_close(nc)

    return saved_files
def main(url, save_dir):
    if type(url) is str:
        if url.endswith('.html'):
            url = url.replace('.html', '.xml')
            tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
            c = Crawl(url, select=[".*\.nc$"], debug=False)
            datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
            splitter = url.split('/')[-2].split('-')
        elif url.endswith('.xml'):
            tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
            c = Crawl(url, select=[".*\.nc$"], debug=False)
            datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
            splitter = url.split('/')[-2].split('-')
        elif url.endswith('.nc') or url.endswith('.ncml'):
            datasets = [url]
            splitter = url.split('/')[-2].split('-')
        else:
            print('Unrecognized input. Input must be a string of the file location(s) or list of file(s)')
    else:
        print('Dataset must be in a string.')

    data = OrderedDict(deployments=OrderedDict())

    for dataset in datasets:
        filename = os.path.basename(dataset)
        if 'ENG000000' not in filename:  # script will not analyze glider ENG data files
            logging.info('Processing {}'.format(str(dataset)))
            try:
                print('Opening file: {}'.format(dataset))
                with xr.open_dataset(dataset, mask_and_scale=False) as ds:
                    ref_des = '{}-{}-{}'.format(ds.subsite, ds.node, ds.sensor)
                    deployment = np.unique(ds['deployment'].data)[0]
                    qc_data = request_qc_json(ref_des)  # grab data from the qc database
                    ref_des_dict = get_parameter_list(qc_data)
                    deploy_info = get_deployment_information(qc_data, deployment)
                    if deploy_info is None:
                        print('info from deployment ' + str(deployment) + ' does not match data')
                        continue

                    data_start = ds.time_coverage_start + 'Z'
                    data_end = ds.time_coverage_end + 'Z'

                    # Deployment Variables
                    deploy_start = str(deploy_info['start_date'] + 'Z')
                    if deploy_info['stop_date']:
                        deploy_stop = str(deploy_info['stop_date'] + 'Z')
                    else:
                        deploy_stop = str(deploy_info['stop_date'])
                    deploy_lon = deploy_info['longitude']
                    deploy_lat = deploy_info['latitude']

                    # Add reference designator to dictionary
                    try:
                        data['ref_des']
                    except KeyError:
                        data['ref_des'] = ref_des

                    deployment = 'D0000{}'.format(deployment)
                    deployments = data['deployments'].keys()

                    # Add deployment to dictionary and initialize stream sub dictionary
                    if not deployment in deployments:
                        data['deployments'][deployment] = OrderedDict(
                            start=deploy_start,
                            end=deploy_stop,
                            lon=deploy_lon,
                            lat=deploy_lat,
                            streams=OrderedDict(),
                            data_times=dict(start=[], end=[]))

                    # Add data start and stop times to a data_times array. When the files are
                    # all processed, it checks data vs deployment times
                    if ds.stream == splitter[-1]:
                        data['deployments'][deployment]['data_times']['start'].append(data_start)
                        data['deployments'][deployment]['data_times']['end'].append(data_end)

                    streams = data['deployments'][deployment]['streams'].keys()

                    # Add stream to subdictionary inside deployment
                    if not ds.stream in streams:
                        data['deployments'][deployment]['streams'][ds.stream] = OrderedDict(files=OrderedDict())

                    qc_df = parse_qc(ds)
                    qc_vars = [x for x in qc_df.keys() if not 'test' in x]
                    qc_df = qc_df.reset_index()
                    variables = ds.data_vars.keys()
                    variables = eliminate_common_variables(variables)
                    variables = [x for x in variables if not 'qc' in x]  # remove qc variables, because we don't care about them

                    # Gap test. Get a list of gaps
                    gap_list = test_gaps(qc_df)

                    # Deployment Distance
                    data_lat = np.unique(ds['lat'])[0]
                    data_lon = np.unique(ds['lon'])[0]
                    dist_calc = distance((deploy_lat, deploy_lon), (data_lat, data_lon))

                    # Unique times
                    time = ds['time']
                    len_time = time.__len__()
                    len_time_unique = np.unique(time).__len__()
                    if len_time == len_time_unique:
                        time_test = True
                    else:
                        time_test = False

                    db_list = ref_des_dict[ds.stream]
                    [_, unmatch1] = compare_lists(db_list, variables)
                    [_, unmatch2] = compare_lists(variables, db_list)

                    filenames = data['deployments'][deployment]['streams'][ds.stream]['files']

                    if not filename in filenames:
                        data['deployments'][deployment]['streams'][ds.stream]['files'][filename] = OrderedDict(
                            data_start=data_start,
                            data_end=data_end,
                            time_gaps=gap_list,
                            lon=data_lon,
                            lat=data_lat,
                            distance_from_deploy_km=dist_calc,
                            unique_times=str(time_test),
                            variables=OrderedDict(),
                            vars_not_in_file=unmatch1,
                            vars_not_in_db=unmatch2)
                    else:
                        print(filename + ' already in dictionary. Skipping')

                    for v in variables:
                        # print v
                        # Availability test
                        if v in db_list:
                            available = True
                        else:
                            available = False

                        if ds[v].dtype.kind == 'S' \
                                or ds[v].dtype == np.dtype('datetime64[ns]') \
                                or 'time' in v:
                            dict_vars = data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'].keys()
                            if not v in dict_vars:
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v] = OrderedDict(
                                    available=str(available))
                            continue
                        else:
                            var_data = ds[v].data

                        # NaN test. Make sure the parameter is not all NaNs
                        nan_test = np.all(np.isnan(var_data))

                        if not nan_test or available is False:
                            # Global range test
                            [g_min, g_max] = get_global_ranges(ds.subsite, ds.node, ds.sensor, v)
                            try:
                                ind = reject_outliers(var_data, 3)
                                min = float(np.nanmin(var_data[ind]))
                                max = float(np.nanmax(var_data[ind]))
                            except (TypeError, ValueError):
                                min = None
                                max = None

                            # Fill Value test
                            try:
                                fill_value = float(ds[v]._FillValue)
                                fill_test = np.any(var_data == ds[v]._FillValue)
                            except AttributeError:
                                fill_value = None
                                fill_test = None

                            dict_vars = data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'].keys()
                            if not v in dict_vars:
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v] = OrderedDict(
                                    available=str(available),
                                    all_nans=str(nan_test),
                                    data_min=min,
                                    data_max=max,
                                    global_min=g_min,
                                    global_max=g_max,
                                    fill_test=str(fill_test),
                                    fill_value=fill_value)

                            if v in qc_vars:
                                temp_list = []
                                tests = ['global_range_test', 'dataqc_stuckvaluetest', 'dataqc_spiketest']
                                for test in tests:
                                    var = '{}_{}'.format(v, test)
                                    group_var = 'group_{}'.format(var)
                                    try:
                                        qc_df[group_var] = qc_df[var].diff().cumsum().fillna(0)
                                    except KeyError as e:
                                        # logging.warn('Error: P')
                                        temp_list.append(['Did not run'])
                                        continue
                                    tdf = qc_df.groupby([group_var, var])['time'].agg(['first', 'last'])
                                    tdf = tdf.reset_index().drop([group_var], axis=1)
                                    tdf = tdf.loc[tdf[var] == False].drop(var, axis=1)
                                    tdf['first'] = tdf['first'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%SZ'))
                                    tdf['last'] = tdf['last'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%SZ'))
                                    if tdf.empty:
                                        data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v][test] = []
                                    else:
                                        data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v][test] = map(list, tdf.values)
                            else:
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v]['global_range_test'] = None
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v]['dataqc_stuckvaluetest'] = None
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v]['dataqc_spiketest'] = None
                        else:
                            dict_vars = data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'].keys()
                            if not v in dict_vars:
                                data['deployments'][deployment]['streams'][ds.stream]['files'][filename]['variables'][v] = OrderedDict(
                                    available=str(available),
                                    all_nans=str(nan_test))
            except Exception as e:
                logging.warn('Error: Processing failed due to {}.'.format(str(e)))
                raise
        else:
            pass

    deployments = data['deployments'].keys()
    for d in deployments:
        data['deployments'][d]['data_times']['start'].sort(key=natural_keys)
        data['deployments'][d]['data_times']['end'].sort(key=natural_keys)
        data['deployments'][d]['data_times']['start'] = data['deployments'][d]['data_times']['start'][0]
        data['deployments'][d]['data_times']['end'] = data['deployments'][d]['data_times']['end'][-1]

    #make_dir(save_dir)
    json_dir = os.path.join(save_dir, 'json_output')
    make_dir(json_dir)

    save_file = os.path.join(json_dir, '{}-{}-{}-{}__{}-{}__requested_{}.json'.format(
        splitter[1], splitter[2], splitter[3], splitter[4], splitter[5], splitter[6], splitter[0]))
    with open(save_file, 'w') as outfile:
        json.dump(data, outfile)

    return save_file
def main(url):
    tds_url = 'http://opendap.oceanobservatories.org/thredds/dodsC'
    c = Crawl(url, select=[".*ncml"])
    data = []

    for n in c.datasets:
        ncml_url = os.path.join(tds_url, n.id)
        ds = xr.open_dataset(ncml_url, mask_and_scale=False)
        deployment = np.unique(ds['deployment'].data)[0]
        variables = ds.data_vars.keys()
        variables = eliminate_common_variables(variables)
        variables = [x for x in variables if not 'qc' in x]  # remove qc variables, because we don't care about them
        ref_des = '{}-{}-{}'.format(ds.subsite, ds.node, ds.sensor)
        ref_des_dict = get_parameter_list(ref_des)

        # Gap test. Get a list of gaps
        gap_list = test_gaps(ds['time'].data)

        for v in variables:
            var_data = ds[v].data
            print(v)

            # Availability test
            if v in ref_des_dict[ds.stream]:
                available = True
            else:
                available = False

            # Global range test
            [g_min, g_max] = get_global_ranges(ds.subsite, ds.node, ds.sensor, v)
            try:
                min = np.nanmin(var_data)
                max = np.nanmax(var_data)
            except TypeError:
                min = 'n/a'
                max = 'n/a'

            if g_min is not None:
                if min > g_min:
                    if max < g_max:
                        gr_result = True
                    else:
                        gr_result = False
                else:
                    gr_result = False
            else:
                gr_result = 'None'

            # Fill Value test
            fill_test = np.all(var_data == ds[v]._FillValue)

            try:
                # NaN test. Make sure the parameter is not all NaNs
                nan_test = np.all(np.isnan(var_data))
            except TypeError:
                nan_test = 'None'

            data.append((ref_des, ds.stream, deployment, v, available, gr_result,
                         [g_min, min], [g_max, max], fill_test, ds[v]._FillValue,
                         nan_test, gap_list))

    df = pd.DataFrame(data, columns=['ref_des', 'stream', 'deployment', 'variable',
                                     'availability', 'global_range_test',
                                     'min[global, data]', 'max[global, data]',
                                     'fill_test', 'fill_value', 'not_nan', 'gaps'])
    df.to_csv('/Users/michaesm/Documents/test.csv', index=False)
if len(sys.argv) > 1:
    path = sys.argv[1]

#skips = Crawl.SKIPS + [".*FV00"]
skips = Crawl.SKIPS + [".*FV00", ".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME",
                       ".*regridded", ".*burst", ".*gridded", ".*long-timeseries"]
#skips = Crawl.SKIPS + [".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME", ".*regridded"]
#skips = Crawl.SKIPS + [".*regridded"]

crawl_path = 'http://thredds.aodn.org.au/thredds/catalog/IMOS/' + path + '/catalog.xml'
#crawl_path='http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Biogeochem_profiles/catalog.html'

c = Crawl(crawl_path, select=['.*FV01'], skip=skips)
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-EAC/catalog.xml', select=['.*'])
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-ITF/catalog.xml', select=['.*'])
#c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/SOTS/catalog.xml', select=['.*'])
# print(c.datasets)

# service can be httpService or dapService
urls = [s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "httpserver"]  # httpserver or opendap

for url in urls:
    print(url)
def main(url, save_dir):
    if type(url) is str:
        if url.endswith('.html'):
            url = url.replace('.html', '.xml')
            tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
            c = Crawl(url, select=[".*ncml"])
            datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
        elif url.endswith('.xml'):
            tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
            c = Crawl(url, select=[".*ncml"])
            datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
        elif url.endswith('.nc') or url.endswith('.ncml'):
            datasets = [url]
        elif os.path.exists(url):
            datasets = glob.glob(url + '/*.nc')
        else:
            print('Unrecognized input. Input must be a string of the file location(s) or list of file(s)')
    elif type(url) is list:
        datasets = url

    data = []

    for dataset in datasets:
        logging.info('Processing {}'.format(str(dataset)))
        try:
            print('Opening file: {}'.format(dataset))
            with xr.open_dataset(dataset, mask_and_scale=False) as ds:
                qc_df = parse_qc(ds)
                qc_vars = [x for x in qc_df.keys() if not 'test' in x]
                qc_df = qc_df.reset_index()
                deployment = np.unique(ds['deployment'].data)[0]
                variables = ds.data_vars.keys()
                variables = eliminate_common_variables(variables)
                variables = [x for x in variables if not 'qc' in x]  # remove qc variables, because we don't care about them
                ref_des = '{}-{}-{}'.format(ds.subsite, ds.node, ds.sensor)
                qc_data = request_qc_json(ref_des)  # grab data from the qc database
                ref_des_dict = get_parameter_list(qc_data)
                deploy_info = get_deployment_information(qc_data, deployment)

                # Gap test. Get a list of gaps
                gap_list = test_gaps(qc_df)

                # Deployment Variables
                deploy_start = str(deploy_info['start_date'])
                deploy_stop = str(deploy_info['stop_date'])
                deploy_lon = deploy_info['longitude']
                deploy_lat = deploy_info['latitude']

                # Deployment Time
                data_start = ds.time_coverage_start
                data_stop = ds.time_coverage_end
                start_test = [str(deploy_start), str(data_start)]
                stop_test = [str(deploy_stop), str(data_stop)]

                # Deployment Distance
                data_lat = np.unique(ds['lat'])[0]
                data_lon = np.unique(ds['lon'])[0]
                dist_calc = distance((deploy_lat, deploy_lon), (data_lat, data_lon))
                if dist_calc < .5:  # if distance is less than .5 km
                    dist = True
                else:
                    dist = False
                dist_test = '{} [{} km]'.format(dist, dist_calc)

                # Unique times
                time = ds['time']
                len_time = time.__len__()
                len_time_unique = np.unique(time).__len__()
                if len_time == len_time_unique:
                    time_test = True
                else:
                    time_test = False

                for v in variables:
                    print(v)

                    # Availability test
                    if v in ref_des_dict[ds.stream]:
                        available = True
                    else:
                        available = False

                    if ds[v].dtype == np.dtype('S64') or ds[v].dtype == np.dtype('datetime64[ns]') or 'time' in v:
                        # this will skip most engineering/system variables because they are strings
                        # ['ref_des', 'stream', 'deployment', 'start', 'stop', 'distance_from_deploy_<=.5km',
                        #  'time_unique', 'variable', 'availability', 'all_nans', 'global_range_test', 'min', 'max',
                        #  'fill_test', 'fill_value', 'gaps', 'global_range', 'stuck_value', 'spike_test']
                        data.append((ref_des, ds.stream, deployment, start_test, stop_test, dist_test,
                                     time_test, v, available, None, None, None, None, None, None, None,
                                     None, None, None))
                        continue
                    else:
                        var_data = ds[v].data

                    # NaN test. Make sure the parameter is not all NaNs
                    nan_test = np.all(np.isnan(var_data))

                    if not nan_test or available is False:
                        # Global range test
                        [g_min, g_max] = get_global_ranges(ds.subsite, ds.node, ds.sensor, v)
                        try:
                            ind = reject_outliers(var_data, 3)
                            min = np.nanmin(var_data[ind])
                            max = np.nanmax(var_data[ind])
                        except TypeError:
                            min = None
                            max = None

                        if g_min is not None:
                            if min >= g_min:
                                if max <= g_max:
                                    gr_result = True
                                else:
                                    gr_result = False
                            else:
                                gr_result = False
                        else:
                            gr_result = None

                        # Fill Value test
                        try:
                            fill_value = ds[v]._FillValue
                            fill_test = np.any(var_data == ds[v]._FillValue)
                        except AttributeError:
                            fill_value = 'n/a'
                            fill_test = 'n/a'

                        data_tuple = (ref_des, ds.stream, deployment, start_test, stop_test, dist_test,
                                      time_test, v, available, nan_test, gr_result, [g_min, min],
                                      [g_max, max], fill_test, fill_value, gap_list)

                        if v in qc_vars:
                            temp_list = []
                            tests = ['global_range_test', 'dataqc_stuckvaluetest', 'dataqc_spiketest']
                            for test in tests:
                                var = '{}_{}'.format(v, test)
                                group_var = 'group_{}'.format(var)
                                try:
                                    qc_df[group_var] = qc_df[var].diff().cumsum().fillna(0)
                                except KeyError as e:
                                    logging.warn('Error: P')
                                    temp_list.append('DNR')
                                    continue
                                tdf = qc_df.groupby([group_var, var])['time'].agg(['first', 'last'])
                                tdf = tdf.reset_index().drop([group_var], axis=1)
                                tdf = tdf.loc[tdf[var] == False].drop(var, axis=1)
                                tdf['first'] = tdf['first'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
                                tdf['last'] = tdf['last'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
                                if tdf.empty:
                                    temp_list.append([])
                                else:
                                    temp_list.append(map(list, tdf.values))
                            temp_tuple = data_tuple + tuple(temp_list)
                            data.append(temp_tuple)
                        else:
                            temp_tuple = data_tuple + ('n/a', 'n/a', 'n/a')
                            data.append(temp_tuple)
                    else:
                        data.append((ref_des, ds.stream, deployment, start_test, stop_test, dist_test,
                                     time_test, v, available, nan_test, 'n/a', 'n/a', 'n/a', 'n/a', 'n/a',
                                     gap_list, 'n/a', 'n/a', 'n/a'))
        except Exception as e:
            logging.warn('Error: Processing failed due to {}.'.format(str(e)))
            raise

    df = pd.DataFrame(data, columns=['ref_des', 'stream', 'deployment', 'start', 'stop',
                                     'distance_from_deploy_<=.5km', 'time_unique', 'variable',
                                     'availability', 'all_nans', 'global_range_test',
                                     'min[global,data]', 'max[global,data]', 'fill_test',
                                     'fill_value', 'gaps', 'global_range', 'stuck_value',
                                     'spike_test'])
    df.to_csv(os.path.join(save_dir, '{}-{}-{}-{}-process_on_{}.csv'.format(
        ds.subsite, ds.node, ds.sensor, ds.stream,
        dt.now().strftime('%Y-%m-%dT%H%M00'))), index=False)
def download(folder, project_metadata, filesubset, since):
    # Use thredds_crawler to find DAP endpoints of the RAW data.
    total_datasets = []
    skips = Crawl.SKIPS + ['.*OTHER.*', '.*ancillary.*', '.*OLD_VERSIONS.*']

    try:
        for k, v in project_metadata.items():
            # http://regexr.com/3conn
            datasets = Crawl(
                v['catalog_xml'],
                select=[r'([0-9]+\..*|.*(-(a|A)|(ls|[0-9]s|isus|[aAB]s|sgs|aqds|vs|pcs|d|tide)-cal){1}(?!lp)(?!1(h|H))\.*.*)'],
                skip=skips,
                after=since).datasets
            logger.info("Found {0} datasets in {1}!".format(len(datasets), k))
            total_datasets += datasets
        logger.info("Found {0} TOTAL datasets!".format(len(total_datasets)))
    except KeyboardInterrupt:
        logger.info("Breaking out of crawling loop.")
        total_datasets = []

    try:
        os.makedirs(folder)
    except OSError:
        pass

    # Save datasets to download directory
    saved_files = []
    for num, d in enumerate(total_datasets):

        if filesubset and d.name.lower() not in filesubset:
            continue

        try:
            http_url = next(s["url"] for s in d.services if s["service"].lower() == "httpserver")
            project_name = http_url.split("/")[-2]
        except StopIteration:
            logger.error("No HTTPServer endpoint found, skipping")
            continue

        # Make download folder
        save_file = os.path.join(folder, project_name, d.name)
        if not os.path.isdir(os.path.dirname(save_file)):
            os.makedirs(os.path.dirname(save_file))

        logger.info("Downloading {0}".format(http_url))
        try:
            with open(save_file, "wb") as f:
                r = requests.get(http_url, stream=True)
                if not r.ok:
                    logger.error("Could not download '{!s}' from '{!s}', skipping".format(d.name, http_url))
                    break
                for block in r.iter_content(1024):
                    if not block:
                        break
                    f.write(block)
        except KeyboardInterrupt:
            logger.info("Breaking out of download loop.")
            raise
        except BaseException:
            logger.error("Could not download... error with HTTP endpoint. Skipping.")
            continue

        # Try to open the file; if it fails, the write failed.
        try:
            with EnhancedDataset(save_file, 'a') as nc:
                name, _ = os.path.splitext(d.name)
                nc.id = "{0}/{1}".format(project_name, name)
        except BaseException:
            os.remove(save_file)
            raise
        else:
            logger.info("{!s} saved ({!s}/{!s})".format(d.name, num + 1, len(total_datasets)))
            saved_files.append(save_file)

    return saved_files
def download_data(data, array_name, refdes, method, stream):
    '''
    Download data from the THREDDS server after the data request is successful.

    data:    value returned by the request_data() function
    refdes:  Reference Designator
    method:  'telemetered' or other
    stream:  stream name from OOI
    beginDT: beginning time
    endDT:   ending time

    For details see:
    http://ooi.visualocean.net/instruments/view/GP02HYPM-WFP02-04-CTDPFL000
    '''
    # Get the data URL for the NetCDF file dataset from the THREDDS server.
    url = data['allURLs'][0]  # This is the THREDDS server address.
    print('THREDDS server address: ' + url)

    url = url.replace('.html', '.xml')
    tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'
    c = Crawl(url, select=[".*\.nc$"], debug=False)
    datasets = [os.path.join(tds_url, x.id) for x in c.datasets]
    #print(datasets)

    ds = xr.open_mfdataset(datasets)
    #ds = ds.swap_dims({'obs': 'time'})

    # Useful variables. Use L1 data and QC flags.
    select_var = ['time', 'lon', 'lat',
                  'ctdpf_ckl_seawater_temperature', 'ctdpf_ckl_seawater_conductivity',
                  'ctdpf_ckl_seawater_pressure', 'practical_salinity', 'density',
                  'density_qc_executed', 'density_qc_results',
                  'practical_salinity_qc_executed', 'practical_salinity_qc_results',
                  'ctdpf_ckl_seawater_pressure_qc_executed', 'ctdpf_ckl_seawater_pressure_qc_results',
                  'ctdpf_ckl_seawater_temperature_qc_executed', 'ctdpf_ckl_seawater_temperature_qc_results',
                  'ctdpf_ckl_seawater_conductivity_qc_executed', 'ctdpf_ckl_seawater_conductivity_qc_results']

    df = ds[select_var].to_dataframe()
    df.drop(columns=['pressure'], inplace=True)

    # Rename columns to make things easier.
    df.columns = ['time', 'lon', 'lat',
                  'sea_water_temperature', 'sea_water_conductivity', 'sea_water_pressure',
                  'sea_water_salinity', 'sea_water_density',
                  'density_qc_executed', 'density_qc_results',
                  'salinity_qc_executed', 'salinity_qc_results',
                  'pressure_qc_executed', 'pressure_qc_results',
                  'temperature_qc_executed', 'temperature_qc_results',
                  'conductivity_qc_executed', 'conductivity_qc_results']

    # Set normal or suspicious flags for each variable.
    # If any executed qc test did not pass, the value is considered suspicious.
    df['pressure_flag'] = df['pressure_qc_executed'] == df['pressure_qc_results']
    df['temperature_flag'] = df['temperature_qc_executed'] == df['temperature_qc_results']
    df['conductivity_flag'] = df['conductivity_qc_executed'] == df['conductivity_qc_results']
    df['density_flag'] = df['density_qc_executed'] == df['density_qc_results']
    df['salinity_flag'] = df['salinity_qc_executed'] == df['salinity_qc_results']

    df.to_csv(array_name + '_' + refdes + '_' + method + '_' + stream + '.csv')
    print('Data saved as ' + array_name + '_' + refdes + '_' + method + '_' + stream + '.csv')

    return df
def harvest_thredds(src_url, dest_url):
    c = Crawl(src_url)
    return c
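# Usage sketch for harvest_thredds. The catalog URL below is illustrative (reused
# from the AODN examples above), dest_url is unused by the function as written, and
# the OPeNDAP filtering mirrors the pattern in the other snippets in this collection.
c = harvest_thredds('http://thredds.aodn.org.au/thredds/catalog/IMOS/ABOS/SOTS/catalog.xml', None)
opendap_urls = [s.get("url") for d in c.datasets for s in d.services
                if s.get("service").lower() == "opendap"]
for u in opendap_urls:
    print(u)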
def find_urls(self, base, select, startdate, enddate):
    cat_url = os.path.join(base, 'catalog.xml')
    u = urlparse(cat_url)
    name, ext = os.path.splitext(u.path)
    if ext == ".html":
        u = urlparse(cat_url.replace(".html", ".xml"))
    cat_url = u.geturl()
    urls = []

    if self.args.realtime:
        self.logger.info(f"Attempting to crawl {cat_url} for realtime shore.nc4 files")
        skips = Crawl.SKIPS + [".*Courier*", ".*Express*", ".*Normal*", ".*Priority*",
                               ".*.cfg$", ".*.js$", ".*.kml$", ".*.log$"]
        crawl_debug = False
        if self.args.verbose > 2:
            crawl_debug = True
        rt_cat = Crawl(cat_url, select=[".*shore.nc4"], skip=skips, debug=crawl_debug)
        for url in [s.get("url") for d in rt_cat.datasets for s in d.services
                    if s.get("service").lower() == "opendap"]:
            dir_start = datetime.strptime(url.split('/')[-2], '%Y%m%dT%H%M%S')
            if startdate <= dir_start and dir_start <= enddate:
                self.logger.debug(f"Adding url {url}")
                urls.append(url)
    else:
        self.logger.debug(f"Attempting to Crawl {cat_url} looking for .dlist files")
        skips = Crawl.SKIPS + [".*Courier*", ".*Express*", ".*Normal*", ".*Priority*", ".*.cfg$"]
        dlist_cat = Crawl(cat_url, select=[".*dlist"], skip=skips)
        self.logger.info(
            f"Crawling {cat_url} for {select} files to make "
            f"{self.args.resampleFreq}_{self.args.appendString}.nc files")

        for dataset in dlist_cat.datasets:
            # Get the mission directory name and extract the start and ending dates
            dlist = os.path.basename(dataset.id)
            mission_dir_name = dlist.split('.')[0]
            dts = mission_dir_name.split('_')
            dir_start = datetime.strptime(dts[0], '%Y%m%d')
            dir_end = datetime.strptime(dts[1], '%Y%m%d')

            # If within a valid range, grab the valid urls
            self.logger.debug(f"Checking if .dlist {dlist} is within {startdate} and {enddate}")
            if (startdate <= dir_start and dir_start <= enddate) or (startdate <= dir_end and dir_end <= enddate):
                catalog = '{}_{}/catalog.xml'.format(dir_start.strftime('%Y%m%d'),
                                                     dir_end.strftime('%Y%m%d'))
                self.logger.debug(f"Crawling {os.path.join(base, catalog)}")
                log_cat = Crawl(os.path.join(base, catalog), select=[select], skip=skips)
                self.logger.debug(f"Getting opendap urls from datasets {log_cat.datasets}")
                for url in [s.get("url") for d in log_cat.datasets for s in d.services
                            if s.get("service").lower() == "opendap"]:
                    self.logger.debug(f"Adding url {url}")
                    urls.append(url)

    if not urls:
        self.logger.info("No URLs found.")

    return urls