Code Example #1
File: _instrument.py  Project: yangjian615/pysat
class Instrument(object):
    """Download, load, manage, modify and analyze science data.
    
    Parameters
    ----------    
    platform : string
        name of platform/satellite.
    name : string
        name of instrument. 
    tag : string, optional
        identifies particular subset of instrument data.
    sat_id : string, optional
        identity within constellation
    clean_level : {'clean','dusty','dirty','none'}, optional
        level of data quality
    pad : pandas.DateOffset, or dictionary, optional
        Length of time to pad the beginning and end of loaded data for
        time-series processing. Extra data is removed after applying all 
        custom functions. Dictionary, if supplied, is simply passed to 
        pandas DateOffset.
    orbit_info : dict    
        Orbit information, {'index':index, 'kind':kind, 'period':period}.
        See pysat.Orbits for more information.            
    inst_module : module, optional
        Provide instrument module directly. 
        Takes precedence over platform/name.
    update_files : boolean, optional
        If True, immediately query filesystem for instrument files and store.
    temporary_file_list : boolean, optional
        If True, the list of Instrument files will not be written to disk.
        Prevents a race condition when running multiple pysat processes.
    multi_file_day : boolean, optional 
        Set to True if Instrument data files for a day are spread across
        multiple files and data for day n could be found in a file
        with a timestamp of day n-1 or n+1.
    manual_org : bool
        If True, pysat will look directly in the pysat data directory
        for data files, rather than the default /platform/name/tag
    directory_format : str
        directory naming structure in string format. Variables such as
        platform, name, and tag will be filled in as needed using python
        string formatting. The default directory structure would be 
        expressed as '{platform}/{name}/{tag}'
    file_format : str or NoneType
        File naming structure in string format.  Variables such as year,
        month, and sat_id will be filled in as needed using python string
        formatting.  The default file format structure is supplied in the
        instrument list_files routine.
               
    Attributes
    ----------
    data : pandas.DataFrame
        loaded science data 
    date : pandas.datetime
        date for loaded data
    yr : int
        year for loaded data
    bounds : (datetime/filename/None, datetime/filename/None)
        bounds for loading data, supply array_like for a season with gaps
    doy : int
        day of year for loaded data
    files : pysat.Files
        interface to instrument files
    meta : pysat.Meta
        interface to instrument metadata, similar to the CF-1.6 netCDF standard
    orbits : pysat.Orbits
        interface to extracting data orbit-by-orbit
    custom : pysat.Custom
        interface to instrument nano-kernel
    kwargs : dictionary
        keyword arguments passed to instrument loading routine
    
    Note
    ----
    Pysat attempts to load the module platform_name.py located in
    the pysat/instruments directory. This module provides the underlying 
    functionality to download, load, and clean instrument data. 
    Alternatively, the module may be supplied directly
    using keyword inst_module.
    
    Examples
    --------     
    :: 
           
        # 1-second mag field data
        vefi = pysat.Instrument(platform='cnofs', 
                                name='vefi', 
                                tag='dc_b', 
                                clean_level='clean')
        start = pysat.datetime(2009,1,1)
        stop = pysat.datetime(2009,1,2)
        vefi.download(start, stop)
        vefi.load(date=start)
        print(vefi['dB_mer'])
        print(vefi.meta['db_mer'])
    
        # 1-second thermal plasma parameters
        ivm = pysat.Instrument(platform='cnofs', 
                                name='ivm', 
                                tag='', 
                                clean_level='clean')
        ivm.download(start,stop)
        ivm.load(2009,1)
        print(ivm['ionVelmeridional'])
        
        # Ionosphere profiles from GPS occultation
        cosmic = pysat.Instrument('cosmic2013', 
                                    'gps', 
                                    'ionprf', 
                                    altitude_bin=3)
        # bins profile using 3 km step
        cosmic.download(start, stop, user=user, password=password)
        cosmic.load(date=start)

    """
    def __init__(self,
                 platform=None,
                 name=None,
                 tag=None,
                 sat_id=None,
                 clean_level='clean',
                 update_files=None,
                 pad=None,
                 orbit_info=None,
                 inst_module=None,
                 multi_file_day=None,
                 manual_org=None,
                 directory_format=None,
                 file_format=None,
                 temporary_file_list=False,
                 *arg,
                 **kwargs):

        if inst_module is None:
            # use strings to look up module name
            if isinstance(platform, str) and isinstance(name, str):
                self.platform = platform.lower()
                self.name = name.lower()
                # look to module for instrument functions and defaults
                self._assign_funcs(by_name=True)
            elif (platform is None) and (name is None):
                # creating "empty" Instrument object with this path
                self.name = ''
                self.platform = ''
                self._assign_funcs()
            else:
                raise ValueError(
                    'Inputs platform and name must both be strings, or both None.'
                )
        else:
            # user has provided a module
            try:
                # platform and name are expected to be part of module
                self.name = inst_module.name.lower()
                self.platform = inst_module.platform.lower()
            except AttributeError:
                raise AttributeError(' '.join((
                    'A name and platform attribute for the instrument is',
                    'required if supplying routine module directly.')))
            # look to module for instrument functions and defaults
            self._assign_funcs(inst_module=inst_module)

        # more reasonable defaults for optional parameters
        self.tag = tag.lower() if tag is not None else ''
        self.sat_id = sat_id.lower() if sat_id is not None else ''
        self.clean_level = (clean_level.lower()
                            if clean_level is not None else 'none')

        # _assign_funcs sets some instrument defaults; user-supplied info rules all
        if directory_format is not None:
            self.directory_format = directory_format.lower()
        # value not provided by user, check if there is a value provided by
        # instrument module
        elif self.directory_format is not None:
            try:
                # check if it is a function
                self.directory_format = self.directory_format(tag, sat_id)
            except TypeError:
                pass

        if file_format is not None:
            self.file_format = file_format
        # value not provided by user, check if there is a value provided by
        # instrument module
        elif self.file_format is not None:
            # check that it is a string containing format markers.  If it
            # isn't formatted properly, give a warning, set file_format None
            if (not isinstance(self.file_format, str)
                    or self.file_format.find("{") < 0
                    or self.file_format.find("}") < 1):
                estr = 'file format set to default, supplied string must '
                estr = '{:s}contain format markers [{:}]'.format(
                    estr, self.file_format)
                print(estr)
                self.file_format = None

        # set up empty data and metadata
        self.data = DataFrame(None)
        self.meta = _meta.Meta()

        # function processing class, processes data on load
        self.custom = _custom.Custom()
        # create arrays to store data around loaded day
        # enables padding across day breaks with minimal loads
        self._next_data = DataFrame(None)
        self._next_data_track = []
        self._prev_data = DataFrame(None)
        self._prev_data_track = []
        self._curr_data = DataFrame(None)

        # multi file day, default set by assign_funcs
        if multi_file_day is not None:
            self.multi_file_day = multi_file_day

        # arguments for padding
        if isinstance(pad, pds.DateOffset):
            self.pad = pad
        elif isinstance(pad, dict):
            self.pad = pds.DateOffset(**pad)
        elif pad is None:
            self.pad = None
        else:
            estr = 'pad must be a dictionary or a pandas.DateOffset instance.'
            raise ValueError(estr)

        # instantiate Files class
        manual_org = False if manual_org is None else manual_org
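        # the Files class expects a write_to_disk flag, which is the
        # inverse of temporary_file_list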
        temporary_file_list = not temporary_file_list
        self.files = _files.Files(self,
                                  manual_org=manual_org,
                                  directory_format=self.directory_format,
                                  update_files=update_files,
                                  file_format=self.file_format,
                                  write_to_disk=temporary_file_list)

        # set bounds for iteration
        # self.bounds requires the Files class
        # setting (None,None) loads default bounds
        self.bounds = (None, None)
        self.date = None
        self._fid = None
        self.yr = None
        self.doy = None
        self._load_by_date = False

        # initialize orbit support
        if orbit_info is None:
            if self.orbit_info is None:
                # if default info not provided, set None as default
                orbit_info = {'index': None, 'kind': None, 'period': None}
            else:
                # default provided by instrument module
                orbit_info = self.orbit_info
        self.orbits = _orbits.Orbits(self, **orbit_info)

        # store kwargs, passed to load routine
        self.kwargs = kwargs

        # run instrument init function, a basic pass function is used
        # if user doesn't supply the init function
        self._init_rtn(self)

    def __getitem__(self, key):
        """
        Convenience notation for accessing data; inst['name'] is equivalent
        to inst.data['name']
        
        Examples
        --------
        ::
        
            # By name
            inst['name']
            # By position  
            inst[row_index, 'name']  
            # Slicing by row 
            inst[row1:row2, 'name']            
            # By Date  
            inst[datetime, 'name']
            # Slicing by date, inclusive
            inst[datetime1:datetime2, 'name']
            # Slicing by name and row/date
            inst[datetime1:datetime2, 'name1':'name2']
            
        """

        if isinstance(key, tuple):
            # support slicing
            return self.data.ix[key[0], key[1]]
        else:
            return self.data[key]

    def __setitem__(self, key, new):
        """Convenience method for adding data to instrument.
        
        Examples
        --------
        ::
        
            # Simple Assignment, default metadata assigned
            # 'long_name' = 'name'
            # 'units' = ''
            inst['name'] = new_data
            # Assignment with Metadata
            inst['name'] = {'data': new_data,
                            'long_name': long_name,
                            'units': units}
        
        Note
        ----
        If no metadata is provided and metadata for 'name' is not already
        stored, then default meta information is added:
        long_name = 'name' and units = ''.
        
        """
        if isinstance(new, dict):
            # metadata should be included in dict
            self.data[key] = new.pop('data')
            # pass the rest to meta
            self.meta[key] = new
        else:
            if isinstance(key, tuple):
                self.data.ix[key[0], key[1]] = new
                self.meta[key[1]] = {}
            elif isinstance(key, str):
                self.data[key] = new
                self.meta[key] = {}
            elif isinstance(new, DataFrame):
                self.data[key] = new[key]
                for ke in key:
                    self.meta[ke] = {}
            else:
                raise ValueError("No support for supplied input key")

    def copy(self):
        """Deep copy of the entire Instrument object."""
        return copy.deepcopy(self)

    def _pass_func(*args, **kwargs):
        pass

    def _assign_funcs(self, by_name=False, inst_module=None):
        """Assign all external science instrument methods to Instrument object."""
        import importlib
        # set defaults
        self._list_rtn = self._pass_func
        self._load_rtn = self._pass_func
        self._default_rtn = self._pass_func
        self._clean_rtn = self._pass_func
        self._init_rtn = self._pass_func
        self._download_rtn = self._pass_func
        # default params
        self.directory_format = None
        self.file_format = None
        self.multi_file_day = False
        self.orbit_info = None

        if by_name:
            # look for a module named platform_name; any import errors passed up
            inst = importlib.import_module(''.join(
                ('.', self.platform, '_', self.name)),
                                           package='pysat.instruments')
        elif inst_module is not None:
            # user supplied an object with relevant instrument routines
            inst = inst_module
        else:
            # no module or name info, default pass functions assigned
            return

        try:
            self._load_rtn = inst.load
            self._list_rtn = inst.list_files
            self._download_rtn = inst.download
        except AttributeError:
            estr = 'A load, list_files, and download routine are required for '
            raise AttributeError('{:s}every instrument.'.format(estr))
        try:
            self._default_rtn = inst.default
        except AttributeError:
            pass
        try:
            self._init_rtn = inst.init
        except AttributeError:
            pass
        try:
            self._clean_rtn = inst.clean
        except AttributeError:
            pass

        # look for instrument default parameters
        try:
            self.directory_format = inst.directory_format
        except AttributeError:
            pass
        try:
            self.multi_file_day = inst.multi_file_day
        except AttributeError:
            pass
        try:
            self.orbit_info = inst.orbit_info
        except AttributeError:
            pass

        return

    def _load_data(self, date=None, fid=None):
        """
        Load data for an instrument on a given date or fid, depending upon input.
        
        """

        if fid is not None:
            # get filename based off of index value
            fname = self.files[fid:fid + 1]
        elif date is not None:
            fname = self.files[date:date + pds.DateOffset(days=1)]
        else:
            raise ValueError('Must supply either a date or file id number.')

        if len(fname) > 0:
            load_fname = [os.path.join(self.files.data_path, f) for f in fname]
            data, mdata = self._load_rtn(load_fname,
                                         tag=self.tag,
                                         sat_id=self.sat_id,
                                         **self.kwargs)
        else:
            data = DataFrame(None)
            mdata = _meta.Meta()

        output_str = '{platform} {name} {tag} {sat_id}'
        output_str = output_str.format(platform=self.platform,
                                       name=self.name,
                                       tag=self.tag,
                                       sat_id=self.sat_id)
        if not data.empty:
            if not isinstance(data, DataFrame):
                raise TypeError(
                    ' '.join(('Data returned by instrument load',
                              'routine must be a pandas.DataFrame')))
            if not isinstance(mdata, _meta.Meta):
                raise TypeError(
                    'Metadata returned must be a pysat.Meta object')
            if date is not None:
                output_str = ' '.join(
                    ('Returning', output_str, 'data for', date.strftime('%D')))
            else:
                if len(fname) == 1:
                    output_str = ' '.join(
                        ('Returning', output_str, 'data from', fname[0]))
                else:
                    output_str = ' '.join(
                        ('Returning', output_str, 'data from', fname[0], '::',
                         fname[-1]))
        else:
            # no data signal; date is None when loading by file id
            if date is not None:
                output_str = ' '.join(
                    ('No', output_str, 'data for', date.strftime('%D')))
            else:
                output_str = ' '.join(
                    ('No', output_str, 'data for file id', str(fid)))
        # remove extra spaces, if any
        output_str = " ".join(output_str.split())
        print(output_str)
        return data, mdata

    def _load_next(self):
        """Load the next days data (or file) without incrementing the date.
        Repeated calls will not advance date/file and will produce the same data
        
        Uses info stored in object to either increment the date, 
        or the file. Looks for self._load_by_date flag. 
         
        """
        if self._load_by_date:
            next_date = self.date + pds.DateOffset(days=1)
            return self._load_data(date=next_date)
        else:
            return self._load_data(fid=self._fid + 1)

    def _load_prev(self):
        """Load the next days data (or file) without decrementing the date.
        Repeated calls will not decrement date/file and will produce the same data
        
        Uses info stored in object to either decrement the date, 
        or the file. Looks for self._load_by_date flag.  
        
        """

        if self._load_by_date:
            prev_date = self.date - pds.DateOffset(days=1)
            return self._load_data(date=prev_date)
        else:
            return self._load_data(fid=self._fid - 1)

    def load(self,
             yr=None,
             doy=None,
             date=None,
             fname=None,
             fid=None,
             verifyPad=False):
        """Load instrument data into Instrument object .data.

        Parameters
        ----------
        yr : integer
            year for desired data
        doy : integer
            day of year
        date : datetime object
            date to load
        fname : string
            filename to be loaded
        fid : integer
            index of the file to be loaded
        verifyPad : boolean
            if True, padding data not removed (debug purposes)

        Returns
        --------
        Void.  Data is added to self.data

        Note
        ----
        Loads data for a chosen instrument into .data. Any functions chosen
        by the user and added to the custom processing queue (.custom.add)
        are automatically applied to the data before it is available to
        the user in .data.
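
        Examples
        --------
        A minimal sketch, assuming `inst` is an initialized Instrument and
        the relevant data files are already on disk (see download):

        ::

            # load by year, day of year
            inst.load(2009, 1)
            # load by date
            inst.load(date=pysat.datetime(2009, 1, 1))
            # load a specific file by name
            inst.load(fname=inst.files[0])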
        
        """

        if date is not None:
            # date supplied; getyrdoy checks that it is a datetime
            year, doy = utils.getyrdoy(date)
            self.yr = year
            self.doy = doy
            self.date = date
            self._fid = None
            self._load_by_date = True
            inc = pds.DateOffset(days=1)
            curr = date
        elif (yr is not None) & (doy is not None):
            # if date not defined but both yr and doy are
            self.date = pds.datetime(yr, 1, 1) + pds.DateOffset(days=(doy - 1))
            self.yr = yr
            self.doy = doy
            self._fid = None
            self._load_by_date = True
            inc = pds.DateOffset(days=1)
            curr = self.date
        elif fname is not None:
            # date will have to be set later by looking at the data
            self.date = None
            self.yr = None
            self.doy = None
            self._load_by_date = False
            # if no index, called func tries to find file in instrument dir,
            # throws error if it fails
            self._fid = self.files.get_index(fname)
            inc = 1
            curr = self._fid.copy()
        elif fid is not None:
            self._load_by_date = False
            self._fid = fid
            self.date = None
            self.yr = None
            self.doy = None
            inc = 1
            curr = fid
        else:
            estr = 'Must supply a yr, doy pair, a datetime object, or a'
            estr = '{:s} filename to load data from.'.format(estr)
            raise TypeError(estr)

        self.orbits._reset()
        # if pad is true, need to have a three day/file load
        if (self.pad is not None) | self.multi_file_day:
            if self._next_data.empty & self._prev_data.empty:
                # data has not already been loaded for previous and next days
                # load data for all three
                print('Initializing three day/file window')
                # using current date or fid
                self._prev_data, self._prev_meta = self._load_prev()
                self._curr_data, self._curr_meta = \
                    self._load_data(date=self.date, fid=self._fid)
                self._next_data, self._next_meta = self._load_next()
            else:
                # moving forward in time
                if self._next_data_track == curr:
                    self._prev_data = self._curr_data
                    self._prev_meta = self._curr_meta
                    self._curr_data = self._next_data
                    self._curr_meta = self._next_meta
                    self._next_data, self._next_meta = self._load_next()
                # moving backward in time
                elif self._prev_data_track == curr:
                    self._next_data = self._curr_data
                    self._next_meta = self._curr_meta
                    self._curr_data = self._prev_data
                    self._curr_meta = self._prev_meta
                    self._prev_data, self._prev_meta = self._load_prev()
                # jumped in time, or switched from file-based to date-based access
                else:
                    self._prev_data, self._prev_meta = self._load_prev()
                    self._curr_data, self._curr_meta = \
                                self._load_data(date=self.date, fid=self._fid)
                    self._next_data, self._next_meta = self._load_next()

            # make sure datetime indices for all data are monotonic
            if not self._prev_data.index.is_monotonic_increasing:
                self._prev_data.sort_index(inplace=True)
            if not self._curr_data.index.is_monotonic_increasing:
                self._curr_data.sort_index(inplace=True)
            if not self._next_data.index.is_monotonic_increasing:
                self._next_data.sort_index(inplace=True)

            # make tracking indexes consistent with new loads
            self._next_data_track = curr + inc
            self._prev_data_track = curr - inc
            # attach data to object
            if not self._curr_data.empty:
                self.data = self._curr_data.copy()
                self.meta = self._curr_meta.copy()
            else:
                self.data = DataFrame(None)
                # line below removed as it would delete previous meta, if any
                # if you end a seasonal analysis with a day with no data, then
                # no meta: self.meta = _meta.Meta()

            if self.multi_file_day:
                self.data = self.data.ix[self.date:self.date + pds.DateOffset(
                    hours=23, minutes=59, seconds=59, nanoseconds=999999999)]

            # pad data based upon passed parameter
            if (not self._prev_data.empty) & (not self.data.empty):
                if self.multi_file_day and self._load_by_date:
                    padLeft = self._prev_data.ix[(
                        self.date):self._curr_data.index[0]]
                else:
                    padLeft = self._prev_data.ix[(
                        self._curr_data.index[0] -
                        self.pad):self._curr_data.index[0]]
                #self.data = pds.concat([padLeft[0:-1], self.data])
                self.data = pds.concat([padLeft, self.data])

            if (not self._next_data.empty) & (not self.data.empty):
                if self.multi_file_day and self._load_by_date:
                    padRight = self._next_data.ix[self.date:(
                        self.date + pds.DateOffset(hours=23, minutes=59,
                                                   seconds=59,
                                                   nanoseconds=999999999))]
                else:
                    padRight = self._next_data.ix[self._curr_data.index[-1]:(
                        self._curr_data.index[-1] + self.pad)]
                #self.data = pds.concat([self.data, padRight[1:]])
                self.data = pds.concat([self.data, padRight])

            # drop any possible duplicate index times
            #self.data.drop_duplicates(inplace=True)
            self.data = self.data[~self.data.index.duplicated()]

        # if self.pad is False, load single day
        else:
            self.data, meta = self._load_data(date=self.date, fid=self._fid)
            if not self.data.empty:
                self.meta = meta

        # check if load routine actually returns meta
        if self.meta.data.empty:
            self.meta[self.data.columns] = {
                'long_name': self.data.columns,
                'units': [''] * len(self.data.columns)
            }
        # if loading by file, set the yr, doy, and date from the data
        if (not self._load_by_date) and (not self.data.empty):
            temp = self.data.index[0]
            temp = pds.datetime(temp.year, temp.month, temp.day)
            self.date = temp
            self.yr, self.doy = utils.getyrdoy(self.date)

        if not self.data.empty:
            self._default_rtn(self)
        # clean
        if (not self.data.empty) & (self.clean_level != 'none'):
            self._clean_rtn(self)
        # apply custom functions
        if not self.data.empty:
            self.custom._apply_all(self)
        # remove the excess padding, if any applied
        if (self.pad is not None) & (not self.data.empty) & (not verifyPad):
            self.data = self.data[self._curr_data.index[0]:self._curr_data.
                                  index[-1]]

        sys.stdout.flush()
        return

    def download(self, start, stop, freq='D', user=None, password=None):
        """Download data for given Instrument object from start to stop.
        
        Parameters
        ----------
        start : pandas.datetime
            start date to download data
        stop : pandas.datetime
            stop date to download data
        freq : string
            step size between dates for the season: 'D' for daily, 'M' for
            monthly (see pandas frequency strings)
        user : string
            username, if required by instrument data archive
        password : string
            password, if required by instrument data archive
            
        Note
        ----
        Data will be downloaded to pysat_data_dir/platform/name/tag
        
        If Instrument bounds are set to defaults they are updated
        after files are downloaded.
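
        Examples
        --------
        A minimal sketch, assuming `inst` is an initialized Instrument;
        user and password are only needed by archives requiring credentials:

        ::

            start = pysat.datetime(2009, 1, 1)
            stop = pysat.datetime(2009, 1, 31)
            inst.download(start, stop)
            # or, with credentials
            inst.download(start, stop, user=user, password=password)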
        
        """
        import errno
        # make sure directories are there, otherwise create them
        try:
            os.makedirs(self.files.data_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        print('Downloading data to: ', self.files.data_path)
        date_array = utils.season_date_range(start, stop, freq=freq)
        if user is None:
            self._download_rtn(date_array,
                               tag=self.tag,
                               sat_id=self.sat_id,
                               data_path=self.files.data_path)
        else:
            self._download_rtn(date_array,
                               tag=self.tag,
                               sat_id=self.sat_id,
                               data_path=self.files.data_path,
                               user=user,
                               password=password)
        # get current file date range
        first_date = self.files.start_date
        last_date = self.files.stop_date

        print('Updating pysat file list')
        self.files.refresh()

        # if instrument object has default bounds, update them
        if len(self.bounds[0]) == 1:
            if (self.bounds[0][0] == first_date
                    and self.bounds[1][0] == last_date):
                print('Updating instrument object bounds.')
                self.bounds = None

    @property
    def bounds(self):
        """Boundaries for iterating over instrument object by date or file.

        Parameters
        ----------
        start : datetime object, filename, or None (default)
            start of iteration, if None uses first data date.
            list-like collection also accepted
        end :  datetime object, filename, or None (default)
                end of iteration, inclusive. If None uses last data date.
                list-like collection also accepted

        Note
        ----
        Both start and stop must be the same type (date, or filename) or None

        Examples
        --------
        ::

            inst = pysat.Instrument(platform=platform,
                                    name=name,
                                    tag=tag)
            start = pysat.datetime(2009,1,1)
            stop = pysat.datetime(2009,1,31)
            inst.bounds = (start,stop)

            start2 = pysat.datetime(2010,1,1)
            stop2 = pysat.datetime(2010,2,14)
            inst.bounds = ([start, start2], [stop, stop2])

        """
        return self._iter_start, self._iter_stop

    @bounds.setter
    def bounds(self, value=None):
        if value is None:
            value = (None, None)
        if len(value) < 2:
            raise ValueError('Must supply both a start and end date/file. ' +
                             'Supply None if you want the first/last possible.')

        start = value[0]
        end = value[1]
        # get the frequency, or step size, of season
        if len(value) == 3:
            step = value[2]
        else:
            # default to daily
            step = 'D'

        if (start is None) and (end is None):
            # set default
            self._iter_start = [self.files.start_date]
            self._iter_stop = [self.files.stop_date]
            self._iter_type = 'date'
            if self._iter_start[0] is not None:
                # check here in case Instrument is initialized with no input
                self._iter_list = utils.season_date_range(self._iter_start,
                                                          self._iter_stop,
                                                          freq=step)

        elif (hasattr(start, '__iter__') and not isinstance(start, str)) and (
                hasattr(end, '__iter__') and not isinstance(end, str)):
            base = type(start[0])
            for s, t in zip(start, end):
                if (type(s) != type(t)) or (type(s) != base):
                    raise ValueError(
                        'Start and end items must all be of the same type')
            if isinstance(start[0], str):
                self._iter_type = 'file'
                self._iter_list = self.files.get_file_array(start, end)
            elif isinstance(start[0], pds.datetime):
                self._iter_type = 'date'
                self._iter_list = utils.season_date_range(start,
                                                          end,
                                                          freq=step)
            else:
                raise ValueError(
                    'Input is not a known type, string or datetime')
            self._iter_start = start
            self._iter_stop = end

        elif (hasattr(start, '__iter__') and not isinstance(start, str)) or (
                hasattr(end, '__iter__') and not isinstance(end, str)):
            raise ValueError(
                'Both start and end must be iterable if one bound is iterable')

        elif isinstance(start, str) or isinstance(end, str):
            if isinstance(start, pds.datetime) or isinstance(
                    end, pds.datetime):
                raise ValueError('Not allowed to mix file and date bounds')
            if start is None:
                start = self.files[0]
            if end is None:
                end = self.files.files[-1]
            self._iter_start = [start]
            self._iter_stop = [end]
            self._iter_list = self.files.get_file_array(
                self._iter_start, self._iter_stop)
            self._iter_type = 'file'

        elif isinstance(start, pds.datetime) or isinstance(end, pds.datetime):
            if start is None:
                start = self.files.start_date
            if end is None:
                end = self.files.stop_date
            self._iter_start = [start]
            self._iter_stop = [end]
            self._iter_list = utils.season_date_range(start, end, freq=step)
            self._iter_type = 'date'
        else:
            raise ValueError(
                'Provided an invalid combination of bounds. ' +
                'if specifying by file, both bounds must be by file. Other ' +
                'combinations of datetime objects and None are allowed.')

    def __iter__(self):
        """Iterates instrument object by loading subsequent days or files.

        Note
        ----
        Limits of iteration, and iteration type (date/file)
        set by `bounds` attribute.

        Default bounds are the first and last dates from files on local system.

        Examples
        --------
        ::

            inst = pysat.Instrument(platform=platform,
                                    name=name,
                                    tag=tag)
            start = pysat.datetime(2009,1,1)
            stop = pysat.datetime(2009,1,31)
            inst.bounds = (start,stop)
            for inst in inst:
                print('Another day loaded', inst.date)

        """

        if self._iter_type == 'file':
            for fname in self._iter_list:
                self.load(fname=fname)
                yield self

        elif self._iter_type == 'date':
            for date in self._iter_list:
                self.load(date=date)
                yield self

    def next(self):
        """Manually iterate through the data loaded in Instrument object.
        
        Bounds of iteration and iteration type (day/file) are set by 
        `bounds` attribute.
        
        Note
        ----
        If there were no previous calls to load, then the first
        day (default) or file will be loaded.
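
        Examples
        --------
        A minimal sketch of manual iteration, assuming `inst` is an
        initialized Instrument with local files within the set bounds:

        ::

            inst.bounds = (start, stop)
            inst.next()   # loads first day/file within bounds
            inst.next()   # loads the following day/file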
         
        """

        if self._iter_type == 'date':
            if self.date is not None:
                idx, = np.where(self._iter_list == self.date)
                if (len(idx) == 0) | (idx + 1 >= len(self._iter_list)):
                    raise StopIteration('Outside the set date boundaries.')
                else:
                    idx += 1
                    self.load(date=self._iter_list[idx[0]])
            else:
                self.load(date=self._iter_list[0])

        elif self._iter_type == 'file':
            if self._fid is not None:
                first = self.files.get_index(self._iter_list[0])
                last = self.files.get_index(self._iter_list[-1])
                if (self._fid < first) | (self._fid + 1 > last):
                    raise StopIteration('Outside the set file boundaries.')
                else:
                    self.load(fname=self._iter_list[self._fid + 1 - first])
            else:
                self.load(fname=self._iter_list[0])

    def prev(self):
        """Manually iterate backwards through the data in Instrument object.
        
        Bounds of iteration and iteration type (day/file) 
        are set by `bounds` attribute.
        
        Note
        ----
        If there were no previous calls to load, then the last
        day or file will be loaded.
            
        """

        if self._iter_type == 'date':
            if self.date is not None:
                idx, = np.where(self._iter_list == self.date)
                if (len(idx) == 0) | (idx - 1 < 0):
                    raise StopIteration('Outside the set date boundaries.')
                else:
                    idx -= 1
                    self.load(date=self._iter_list[idx[0]])
            else:
                self.load(date=self._iter_list[-1])

        elif self._iter_type == 'file':
            if self._fid is not None:
                first = self.files.get_index(self._iter_list[0])
                last = self.files.get_index(self._iter_list[-1])
                if (self._fid - 1 < first) | (self._fid > last):
                    raise StopIteration('Outside the set file boundaries.')
                else:
                    self.load(fname=self._iter_list[self._fid - 1 - first])
            else:
                self.load(fname=self._iter_list[-1])

    def to_netcdf4(self, fname=None, format=None):
        """Stores loaded data into a netCDF3/4 file.
        
        Parameters
        ----------
        fname : string
            full path to save instrument object to
        format : string
            format keyword passed to netCDF4 routine
            NETCDF3_CLASSIC, NETCDF3_64BIT, NETCDF4_CLASSIC, and NETCDF4
        
        Note
        ----
        Stores 1-D data along dimension 'time' - the date time index.
        
        Stores object data (e.g. dataframes within series) separately
                    
         - The name of the series is used to prepend extra variable
           dimensions within netCDF, key_dim_1, key_dim_2; first dimension time
         - The index organizing the data is stored as key_dim_1
         - load_netcdf4 uses this naming scheme to reconstruct the data
           structure
            
        The datetime index is stored as 'UNIX time'. netCDF-3 doesn't support
        64-bit integers so it is stored as a 64-bit float. This results in a
        loss of datetime precision when converted back to datetime index
        up to hundreds of nanoseconds. Use netCDF4 if this is a problem.
          
        All attributes attached to instrument meta are written to netCDF attrs.
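
        Examples
        --------
        A minimal sketch; the output filename is illustrative:

        ::

            inst.load(2009, 1)
            inst.to_netcdf4('pysat_output.nc', format='NETCDF4')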
        
        """

        import netCDF4

        if format is None:
            format = 'NETCDF3_64BIT'
        else:
            format = format.upper()

        with netCDF4.Dataset(fname, mode='w', format=format) as out_data:

            num = len(self.data.index)
            out_data.createDimension('time', num)

            # write out the datetime index
            cdfkey = out_data.createVariable(
                'time',
                'f8',
                dimensions=('time'),
            )
            cdfkey.units = 'seconds since 1970-1-1 0:0:0'
            cdfkey.long_name = 'UNIX time'
            cdfkey.calendar = 'standard'
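            # datetime64 index is in nanoseconds; truncate to microsecond
            # precision, then convert to float seconds since epoch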
            cdfkey[:] = (self.data.index.astype(int) *
                         1.E-3).astype(int) * 1.E-6
            # store all of the data in dataframe columns
            for key in self.data.columns:
                if self[key].dtype != np.dtype('O'):
                    # not an object, simple column of data, write it out
                    if ((self[key].dtype == np.int64) &
                        (format[:7] == 'NETCDF3')):
                        self[key] = self[key].astype(np.int32)
                    cdfkey = out_data.createVariable(
                        key,
                        self[key].dtype,
                        dimensions=('time'),
                    )
                    cdfkey.units = self.meta[key].units
                    cdfkey.long_name = self.meta[key].long_name
                    cdfkey[:] = self[key].values
                else:
                    # we are dealing with a more complicated object
                    # presuming a series with a dataframe in each location
                    dims = np.shape(self[key].iloc[0])
                    obj_dim_names = []
                    # don't need to recreate last dimension,
                    # it covers number of columns
                    for i, dim in enumerate(dims[:-1]):
                        obj_dim_names.append(key + '_dim_%i' % (i + 1))
                        out_data.createDimension(obj_dim_names[-1], dim)
                    var_dim = tuple(['time'] + obj_dim_names)
                    #print (key, var_dim)
                    # iterate over columns and store
                    try:
                        iterable = self[key].iloc[0].columns
                        is_frame = True
                    except AttributeError:
                        # looking at a series, which doesn't have columns
                        iterable = self[key].iloc[0].name
                        is_frame = False

                    for col in iterable:
                        if is_frame:
                            coltype = self[key].iloc[0][col].dtype
                        else:
                            coltype = self[key].iloc[0].dtype
                        if ((coltype == np.int64) & (format[:7] == 'NETCDF3')):
                            coltype = np.int32
                        #elif coltype == np.dtype('O'):
                        #    if isinstance(self[key].iloc[0][col][0], basestring):
                        #        coltype = 'S1'
                        #print (key+'_' +col, var_dim, coltype)
                        cdfkey = out_data.createVariable(key + '_' + col,
                                                         coltype,
                                                         dimensions=var_dim)
                        cdfkey.long_name = col
                        cdfkey.units = ''
                        if is_frame:
                            for i in xrange(num):
                                cdfkey[i, :] = self[key].iloc[i][
                                    col].values.astype(coltype)
                        else:
                            #print (self[key])
                            for i in xrange(num):
                                cdfkey[i, :] = self[key].iloc[i].values.astype(
                                    coltype)

                    # store the dataframe index for each time of main dataframe
                    datetime_flag = False
                    coltype = self[key].iloc[0].index.dtype
                    # check for datetime index
                    if coltype == np.dtype('<M8[ns]'):
                        coltype = 'f8'
                        datetime_flag = True
                    if coltype == np.int64:
                        coltype = np.int32
                    #print (key+'_' + '_ample', var_dim, coltype)
                    cdfkey = out_data.createVariable(key + '_dim_1',
                                                     coltype,
                                                     dimensions=var_dim)
                    if datetime_flag:
                        cdfkey.units = 'seconds since 1970-1-1 0:0:0'
                        cdfkey.long_name = 'UNIX time'
                        for i in xrange(num):
                            cdfkey[i, :] = (self[key].iloc[i].index.astype(int)
                                            * 1.E-3).astype(int) * 1.E-6
                    else:
                        cdfkey.units = ''
                        if self[key].iloc[0].index.name is not None:
                            cdfkey.long_name = self[key].iloc[0].index.name
                        else:
                            cdfkey.long_name = key
                        for i in xrange(num):
                            cdfkey[i, :] = self[key].iloc[
                                i].index.to_native_types()

            # store any non standard attributes
            base_attrb = dir(Instrument())
            this_attrb = dir(self)

            adict = {}
            for key in this_attrb:
                if key not in base_attrb:
                    if key[0] != '_':
                        adict[key] = self.__getattribute__(key)
            # store any non-standard attributes attached to meta
            base_attrb = dir(_meta.Meta())
            this_attrb = dir(self.meta)
            for key in this_attrb:
                if key not in base_attrb:
                    if key[0] != '_':
                        adict[key] = self.meta.__getattribute__(key)
            adict['pysat_version'] = 1.0
            adict['Conventions'] = 'CF-1.6'

            # check for binary types
            for key in adict.keys():
                if isinstance(adict[key], bool):
                    adict[key] = int(adict[key])

            out_data.setncatts(adict)
        return
Code Example #2
def load_netcdf4(fnames=None,
                 strict_meta=False,
                 format=None):  #, index_label=None,
    # unix_time=False, **kwargs):
    """Load netCDF-3/4 file produced by pysat.
    
    Parameters
    ----------
    fnames : string or array_like of strings
        filenames to load
    strict_meta : boolean
        check if metadata across fnames is the same
    format : string
        format keyword passed to netCDF4 routine
        NETCDF3_CLASSIC, NETCDF3_64BIT, NETCDF4_CLASSIC, and NETCDF4
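
    Returns
    -------
    out : pandas.DataFrame
        loaded data
    mdata : pysat.Meta
        metadata recovered from the file attributes

    Examples
    --------
    A minimal sketch; the filename is illustrative:

    ::

        data, meta = load_netcdf4('pysat_output.nc', format='NETCDF4')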
     
    """

    import copy

    import netCDF4
    import pandas as pds
    from pandas import DataFrame

    import pysat

    if fnames is None:
        raise ValueError("Must supply a filename/list of filenames")
    if isinstance(fnames, basestring):
        fnames = [fnames]

    if format is None:
        format = 'NETCDF3_64BIT'
    else:
        format = format.upper()

    saved_mdata = None
    running_idx = 0
    running_store = []
    two_d_keys = []
    two_d_dims = []
    for fname in fnames:
        with netCDF4.Dataset(fname, mode='r', format=format) as data:
            # build up dictionary with all ncattrs
            # and add those attributes to a pysat meta object
            ncattrsList = data.ncattrs()
            mdata = pysat.Meta()
            for d in ncattrsList:
                if hasattr(mdata, d):
                    mdata.__setattr__(d + '_', data.getncattr(d))
                else:
                    mdata.__setattr__(d, data.getncattr(d))

            # load up all of the variables in the netCDF file
            loadedVars = {}
            for key in data.variables.keys():
                # load up metadata
                # from here group unique dimensions and act accordingly, 1D, 2D, 3D
                if len(data.variables[key].dimensions) == 1:
                    # assuming basic time dimension
                    loadedVars[key] = data.variables[key][:]
                    if key != 'time':
                        mdata[key] = {
                            'long_name': data.variables[key].long_name,
                            'units': data.variables[key].units
                        }
                        # 'nc_dimensions':data.variables[key].dimensions}

                if len(data.variables[key].dimensions) == 2:
                    # part of dataframe within dataframe
                    two_d_keys.append(key)
                    two_d_dims.append(data.variables[key].dimensions)

            # we now have a list of keys that need to go into a dataframe,
            # could be more than one, collect unique dimensions for 2D keys
            for dim in set(two_d_dims):

                # get the name of the final data column
                # dimension naming follows name_dim_number; pull out the name
                # by stripping the trailing '_dim_N' suffix
                obj_key_name = dim[1][:-(dim[1][::-1].find('_') + 5)]
                # collect variable names associated with object
                obj_var_keys = []
                for tkey, tdim in zip(two_d_keys, two_d_dims):
                    if tdim == dim:
                        obj_var_keys.append(tkey)

                # loop over first object dimension
                # preallocate dataframes to hold objects because it is faster
                init_frame = DataFrame(None)
                loop_list = ([init_frame] *
                             data.variables[obj_var_keys[0]].shape[0])
                for i, loop_frame in enumerate(loop_list):
                    loop_frame = init_frame.copy()
                    for key in obj_var_keys:
                        loop_frame[key[len(obj_key_name) +
                                       1:]] = data.variables[key][i, :]

                    # if the object index uses unix time, process into datetime index
                    if data.variables[obj_key_name +
                                      '_dim_1'].long_name == 'UNIX time':
                        # nanosecond resolution from datetime64 can't be stored in netcdf3
                        # no 64-bit integers
                        # it is stored as a float, need to undo processing
                        # due to precision loss, resolution limited to the microsecond
                        loop_frame.index = pds.to_datetime(
                            (1E6 * loop_frame['dim_1']).astype(int) * 1000)
                        loop_frame.index.name = 'time'
                    else:
                        loop_frame.index = loop_frame['dim_1']
                        loop_frame.index.name = data.variables[
                            obj_key_name + '_dim_1'].long_name

                    del loop_frame['dim_1']
                    loop_list[i] = loop_frame
                    #print (loop_list[i] )
                    #loop_list.append(loop_frame)
                # add object data to loaded data dictionary
                loadedVars[obj_key_name] = loop_list
                del loop_list

            # prepare dataframe index for this netcdf file
            loadedVars['time'] = pds.to_datetime(
                (loadedVars.pop('time') * 1E6).astype(int) * 1000)
            running_store.append(loadedVars)
            running_idx += len(loadedVars['time'])
            # if index_label is not None:
            #     if unix_time:
            #         loadedVars['_index'] = pds.to_datetime((loadedVars.pop(index_label)*1E6).astype(int)*1000)
            #     else:
            #         loadedVars['_index'] = loadedVars.pop(index_label)
            #     running_store.append(loadedVars)
            #     running_idx += len(loadedVars['_index'])
            # else:
            #     # keep a running integer index if none provided
            #     num = len(loadedVars[loadedVars.keys()[0]])
            #     # this only guaranteed to work if all variables share the same
            #     # first dimension
            #     loadedVars['_index'] = np.arange(num) + running_idx
            #     running_store.append(loadedVars)
            #     running_idx += num

            if strict_meta:
                if saved_mdata is None:
                    saved_mdata = copy.deepcopy(mdata)
                elif (mdata != saved_mdata):
                    raise ValueError(
                        'Metadata across filenames is not the same.')

    # combine all of the data loaded across files together
    # currently doesn't work if list of dicts of lists is provided
    # in other words, only one file at a time
    out = DataFrame.from_records(running_store[0], index='time')

    return out, mdata
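
A minimal round-trip sketch tying the two examples together, assuming a
loaded Instrument `inst` from Code Example #1 and that load_netcdf4 is
importable alongside it (the filename is illustrative):

    # write the currently loaded data to disk, then read it back
    inst.to_netcdf4('pysat_roundtrip.nc', format='NETCDF4')
    data, meta = load_netcdf4('pysat_roundtrip.nc', format='NETCDF4')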