Example #1
def concat(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    freqs = [pd.infer_freq(df.index) for df in dfs]
    if all(freq == freqs[0] for freq in freqs):
        combined = pd.concat(dfs, axis=1)
    else:
        for freq_opt in ["A-DEC", "A", "Q-DEC", "Q", "M", "2W-SUN", "W-SUN"]:
            if freq_opt in freqs:
                output = []
                for df in dfs:
                    freq_df = pd.infer_freq(df.index)
                    if freq_df == freq_opt:
                        df_match = df.copy()
                    else:
                        type_df = df.columns.get_level_values("Tipo")[0]
                        unit_df = df.columns.get_level_values("Unidad")[0]
                        if type_df == "Stock":
                            df_match = transform.resample(df,
                                                          rule=freq_opt,
                                                          operation="last")
                        elif type_df == "Flujo" and not any(
                                x in unit_df for x in ["%", "=", "Cambio"]):
                            df_match = transform.resample(df,
                                                          rule=freq_opt,
                                                          operation="sum")
                        else:
                            df_match = transform.resample(df,
                                                          rule=freq_opt,
                                                          operation="mean")
                    output.append(df_match)
                combined = pd.concat(output, axis=1)
                break
        else:
            # No candidate frequency matched: fail loudly instead of leaving
            # `combined` unbound.
            raise ValueError("Could not find a common frequency to concatenate at.")

    return combined
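A minimal, hedged illustration of the frequency check this function relies on (the data here is made up; `transform.resample` above is assumed to be a project-specific helper):

import pandas as pd

idx_m = pd.date_range("2020-01-31", periods=6, freq="M")
idx_q = pd.date_range("2020-03-31", periods=4, freq="Q-DEC")
print(pd.infer_freq(idx_m))  # 'M'
print(pd.infer_freq(idx_q))  # 'Q-DEC' -- one of the options probed above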
Example #2
def test_shift_ruptures_shift_min(midday):
    shifted = _shift_between(
        midday,
        30,
        start='2020-01-01',
        end='2020-01-25',
    )
    shift_expected = pd.Series(0, index=shifted.index, dtype='int64')
    shift_expected.loc['2020-01-01':'2020-01-25'] = 30
    no_shift = pd.Series(0, index=shifted.index, dtype='int64')
    shift_mask, shift_amount = time.shifts_ruptures(shifted,
                                                    midday,
                                                    shift_min=60,
                                                    round_up_from=40)
    assert not shift_mask.any()
    assert_series_equal(shift_amount, no_shift, check_names=False)
    shift_mask, shift_amount = time.shifts_ruptures(shifted,
                                                    midday,
                                                    shift_min=30)
    assert_series_equal(
        shift_mask,
        shift_expected != 0 if pd.infer_freq(shifted.index) != 'H' else False,
        check_names=False)
    assert_series_equal(
        shift_amount,
        shift_expected if pd.infer_freq(shifted.index) != 'H' else no_shift,
        check_names=False)
Example #3
def select_same_time_slice(reference_ds, ds):
    """ Select the values for the same timestep as the reference ds """
    # CHECK THEY ARE THE SAME FREQUENCY
    # get the frequency of the time series from reference_ds
    freq = pd.infer_freq(reference_ds.time.values)
    old_freq = pd.infer_freq(ds.time.values)
    assert freq == old_freq, f"The frequencies should be the same! currently ref: {freq} vs. old: {old_freq}"

    # get the STARTING time point from the reference_ds
    min_time = reference_ds.time.min().values
    max_time = reference_ds.time.max().values
    orig_time_range = pd.date_range(min_time, max_time, freq=freq)
    # EXTEND the original time_range by 1 (so selecting the whole slice)
    # because python doesn't select the final in a range
    periods = len(orig_time_range) + 1
    # create new time series going ONE EXTRA PERIOD
    new_time_range = pd.date_range(min_time, freq=freq, periods=periods)
    new_max = new_time_range.max()

    # select using the NEW MAX as upper limit
    ds = ds.sel(time=slice(min_time, new_max))
    # assert reference_ds.time.shape[0] == ds.time.shape[0],"The time dimensions should match, currently reference_ds.time dims {reference_ds.time.shape[0]} != ds.time dims {ds.time.shape[0]}"

    print_time_min = pd.to_datetime(ds.time.min().values)
    print_time_max = pd.to_datetime(ds.time.max().values)
    try:
        vars = [i for i in ds.var().variables]
    except AttributeError:
        vars = ds.name
    ref_vars = [i for i in reference_ds.var().variables]
    print(f"Select same timeslice for ds with vars: {vars}. Min {print_time_min} Max {print_time_max}")

    return ds
Example #4
def infer_freq(da):
    """Infer temporal resolution of a dataset.

    Parameters
    ----------
    da : xarray.DataArray
        DataArray to process.

    Returns
    -------
    str
        Inferred temporal resolution.
    """
    # If the data is uniformly spaced in time (e.g., hourly, daily), the
    # temporal resolution is instantly inferred by pandas / xarray.
    idx = da.indexes['time']
    freq = pd.infer_freq(idx)

    if not freq:
        # Because the input might be seasonalized, it may no longer be uniform
        # in time. In that case, we will assume the data follows a pattern
        # and extrapolate from the first three time steps. We should try to
        # use a better approach for this in the future.
        idx = idx.to_series().apply(lambda dt: dt.replace(day=1, hour=0, minute=0))
        freq = pd.infer_freq(idx[:3])
    if freq:
        return freq
    else:
        raise ValueError('Could not infer frequency.')
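A small sketch of the normalization fallback used above, on made-up mid-month stamps that defeat direct inference:

import pandas as pd

idx = pd.DatetimeIndex(["2020-01-15 06:00", "2020-02-15 18:00", "2020-03-15 12:00"])
print(pd.infer_freq(idx))  # None -- the gaps are not uniform

# Snapping to the first of the month recovers a monthly pattern.
normalized = idx.to_series().apply(lambda dt: dt.replace(day=1, hour=0, minute=0))
print(pd.infer_freq(normalized[:3]))  # 'MS'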
Example #5
def load_1min_gwangali_sitewise():
    """ loads 1 minute data from 2 sites close to gwangali. This does not contain wat temp, tide and salinity"""
    _d_dir = os.path.join(os.getcwd(), 'data\\AWS_data\\site_wise')
    _files = [f for f in os.listdir(_d_dir) if f.endswith('txt')]
    a_files, b_files = [], []
    for f in _files:
        if f.split('.')[0].endswith("_a"):
            a_files.append(f)
        elif f.split('.')[0].endswith('_b'):
            b_files.append(f)

    haupt_df = pd.DataFrame()
    for af, bf in zip(a_files, b_files):
        _f = os.path.join(_d_dir, af)
        _df = pd.read_csv(_f)
        _df.index = pd.to_datetime(_df['Date_Time1'])
        _df.index.freq = pd.infer_freq(_df.index)
        if _df.index.freq is None:
            _f = os.path.join(_d_dir, bf)
            _df = pd.read_csv(_f)
            _df.index = pd.to_datetime(_df['Date_Time2'])
            _df.index.freq = pd.infer_freq(_df.index)
            print(_df.index.freq, ' taken from ', bf)
        print(_df.index.freq)
        haupt_df = pd.concat([haupt_df, _df])

    return haupt_df
Example #6
    def _bias_correct(self, add_error_scale, mul_error_scale, frcst_matrix, freq):
        frcst_matrix = frcst_matrix.resample(freq).sum()
        frcst_matrix = frcst_matrix.astype(float)

        # Checks that the frequencies of the error scales match the given freq.
        add_error_freq = pd.infer_freq(add_error_scale.index)
        mul_error_freq = pd.infer_freq(mul_error_scale.index)
        if add_error_freq != freq:
            raise ValueError("Error scale frequency is not same as the passed freq: %s" % freq, add_error_freq)
        if mul_error_freq != freq:
            raise ValueError("Error scale frequency is not same as the passed freq: %s" % freq, mul_error_freq)

        # blanket corrector function
        def blanket_corrector(series, err_scl):
            try:
                add_err = float(err_scl.at[series.name, 'value'])
            except Exception as ex:
                print(ex)
                return series

            bring_forward = 0.0
            corr_vals = []
            keys = []
            for index, value in series.items():
                correction = value + (bring_forward + add_err)
                if correction >= 0:
                    corr_vals.append(correction)
                    keys.append(index)
                    bring_forward = 0.0
                else:
                    corr_vals.append(0.0)
                    keys.append(index)
                    bring_forward = correction
            return pd.Series(data=corr_vals, index=keys, dtype=float).rename(series.name)

        # multiplier corrector function
        def multiplier_corrector(series, err_scl):
            try:
                mul_err = float(err_scl.at[series.name, 'value'])
            except Exception as ex:
                print(ex)
                return series

            corr_vals = []
            keys = []
            for index, value in series.items():
                correction = value * mul_err
                if correction >= 0:
                    corr_vals.append(correction)
                    keys.append(index)
                else:
                    corr_vals.append(value)
                    keys.append(index)
            return pd.Series(data=corr_vals, index=keys, dtype=float).rename(series.name)

        # NOTE: only the additive (blanket) correction is applied here;
        # multiplier_corrector is defined but never invoked by this return.
        return frcst_matrix.apply(blanket_corrector,
                                  axis='columns',
                                  raw=False,
                                  err_scl=add_error_scale)
Example #7
    def fill_missing_values(frame, inferred_freq: datetime.timedelta = None):
        if len(frame) == 1:
            return frame

        freq = pandas.infer_freq(frame.index)
        if freq:
            if not frame.index.freq:
                frame.set_index(pandas.DatetimeIndex(frame.index.values,
                                                     freq=freq),
                                inplace=True)
            return frame

        if not inferred_freq:
            # Fall back to the most common gap between consecutive timestamps.
            counter = collections.Counter()
            index_it = iter(frame.index)
            prev_value = next(index_it, None)
            if prev_value is not None:
                for current_value in index_it:
                    time_distance = current_value - prev_value
                    counter[time_distance] += 1
                    prev_value = current_value
            inferred_freq = counter.most_common(1)[0][0]

        start_index = frame.index.min()
        end_index = frame.index.max()
        missing_values = []
        current_index = start_index
        while current_index < end_index:
            if current_index not in frame.index:
                missing_values.append(current_index)
            current_index += inferred_freq

        percentage_missing_values = float(
            len(missing_values)) / (frame.shape[0] + len(missing_values))
        if percentage_missing_values > WeatherCache.PERCENTAGE_MISSING_VALUES_THRESHOLD:
            warnings.warn(
                'Missing values constitute {0:.2f}% of all values in the frame which exceeds {1}% threshold'
                .format(
                    percentage_missing_values * 100.0,
                    WeatherCache.PERCENTAGE_MISSING_VALUES_THRESHOLD * 100.0))

        data = numpy.full((len(missing_values), len(frame.columns)), numpy.nan)

        missing_values_frame = pandas.DataFrame(index=missing_values,
                                                data=data,
                                                columns=frame.columns)
        filled_frame = pandas.concat([frame, missing_values_frame])
        filled_frame.sort_index(inplace=True)
        filled_frame.ffill(inplace=True)

        freq = pandas.infer_freq(filled_frame.index)
        if freq:
            filled_frame.set_index(pandas.DatetimeIndex(
                filled_frame.index.values, freq=freq),
                                   inplace=True)

        return filled_frame
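A hedged sketch of the most-common-gap fallback used above, on a toy hourly index with one missing stamp:

import collections
import pandas as pd

idx = pd.DatetimeIndex(["2020-01-01 00:00", "2020-01-01 01:00",
                        "2020-01-01 03:00", "2020-01-01 04:00"])
print(pd.infer_freq(idx))  # None -- the gap at 02:00 defeats inference

counter = collections.Counter(b - a for a, b in zip(idx[:-1], idx[1:]))
print(counter.most_common(1)[0][0])  # Timedelta('0 days 01:00:00')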
Example #8
    def __init__(self,
                 df: pd.DataFrame = None,
                 tz: str = None,
                 units: Union[str, list] = None,
                 name: str = "") -> None:
        """
        Initializes the Market.
        """

        # Deal with DataFrame
        if (df is None) or df.empty:
            self.data = pd.DataFrame(index=None, data=None)
            self.start_utc = None
            self.end_utc = None
            self.dims = (0, 0)
            self.freq = None
            self.name = 'Empty Market'
        else:
            # Extract values
            # NOTE: `fmt` is assumed to be a module-level date-format string.
            if isinstance(df.index[0], str):
                new_index = pd.to_datetime(df.index, format=fmt)
                self.data = pd.DataFrame(index=new_index, data=df.values)
                self.start_utc = datetime.strptime(str(new_index[0]), fmt)
                self.end_utc = datetime.strptime(str(new_index[-1]), fmt)
                self.dims = df.shape
                try:
                    self.freq = pd.infer_freq(new_index)
                except (TypeError, ValueError):
                    self.freq = 'Unknown'
                self.name = name
            else:
                self.data = df
                self.start_utc = df.index[0]
                self.end_utc = df.index[-1]
                self.dims = df.shape
                try:
                    self.freq = pd.infer_freq(df.index)
                except (TypeError, ValueError):
                    self.freq = 'Unknown'
                self.name = name

        # Deal with unit
        if units is None:
            self.units = None
        else:
            assert (len(units) == len(self.data.columns))
            self.units = units

        # Deal with timezone
        if tz is None:
            self.tz = 'UTC'
            self.timezone = pytz.utc
        else:
            self.tz = tz
            self.timezone = pytz.timezone(tz)
Example #9
    def shift_dates(self, h):
        """ Auxiliary function for creating dates for forecasts

        Parameters
        ----------
        h : int
            How many steps to forecast

        Returns
        ----------
        A transformed date_index object
        """

        date_index = copy.deepcopy(self.index)
        date_index = date_index[self.max_lag:len(date_index)]

        if self.is_pandas is True:

            if isinstance(date_index, pd.DatetimeIndex):

                if pd.infer_freq(date_index) in ('H', 'M', 'S'):

                    for t in range(h):
                        date_index += pd.DateOffset(
                            seconds=(date_index[-1] - date_index[-2]).seconds)

                else:  # Assume higher frequency (configured for days)

                    for t in range(h):
                        date_index += pd.DateOffset(
                            days=(date_index[-1] - date_index[-2]).days)

            elif pd.api.types.is_integer_dtype(date_index):

                for i in range(h):
                    new_value = date_index.values[-1] + (
                        date_index.values[-1] - date_index.values[-2])
                    date_index = pd.Index(np.append(date_index.values, new_value))

        else:

            for t in range(h):
                date_index.append(date_index[-1] + 1)

        return date_index
Example #10
def select_same_time_slice(reference_ds, ds):
    """ Select the values for the same timestep as the reference ds"""
    # CHECK THEY ARE THE SAME FREQUENCY
    # get the frequency of the time series from reference_ds
    freq = pd.infer_freq(reference_ds.time.values)
    if freq is None:
        warnings.warn("HARDCODED FOR THIS PROBLEM BUT NO IDEA WHY NOT WORKING")
        freq = "M"
        # assert False, f"Unable to infer frequency from the reference_ds timestep"

    old_freq = pd.infer_freq(ds.time.values)
    warnings.warn(
        "Disabled the assert statement. ENSURE FREQUENCIES THE SAME (e.g. monthly)"
    )
    # assert freq == old_freq, f"The frequencies should be the same! currently ref: {freq} vs. old: {old_freq}"

    # get the STARTING time point from the reference_ds
    min_time = reference_ds.time.min().values
    max_time = reference_ds.time.max().values
    orig_time_range = pd.date_range(min_time, max_time, freq=freq)
    # NOTE: unlike the variant of this function above, the original time_range
    # is NOT extended by one extra period here (the "+ 1" is disabled)
    periods = len(orig_time_range)  # + 1
    new_time_range = pd.date_range(min_time, freq=freq, periods=periods)
    new_max = new_time_range.max()

    # select using the NEW MAX as upper limit
    # --------------------------------------------------------------------------
    # FOR SOME REASON slice is removing the minimum time ...
    # something to do with the fact that matplotlib / xarray is working oddly with numpy64datetime object
    warnings.warn("L153: HARDCODING THE MIN VALUE OTHERWISE IGNORED ...")
    min_time = datetime.datetime(2001, 1, 31)
    # --------------------------------------------------------------------------
    ds = ds.sel(time=slice(min_time, new_max))
    assert (
        reference_ds.time.shape[0] == ds.time.shape[0]
    ), f"The time dimensions should match, currently reference_ds.time dims {reference_ds.time.shape[0]} != ds.time dims {ds.time.shape[0]}"

    print_time_min = pd.to_datetime(ds.time.min().values)
    print_time_max = pd.to_datetime(ds.time.max().values)
    try:
        vars = [i for i in ds.var().variables]
    except AttributeError:
        vars = ds.name
    # ref_vars = [i for i in reference_ds.var().variables]
    print(
        f"Select same timeslice for ds with vars: {vars}. Min {print_time_min} Max {print_time_max}"
    )

    return ds
Example #11
def calculate_mas(data_df, periods):

    for period in periods:
        data_df['ema' + str(period)] = data_df['close'].ewm(span=period).mean()
        data_df['ma' + str(period)] = data_df['close'].rolling(period).mean()

    # pd.infer_freq returns e.g. 'H' for hourly or '4H' for 4-hourly data.
    freq_str = pd.infer_freq(data_df.index)
    if freq_str.split('H')[0] == '':
        freq = 1
    else:
        freq = float(freq_str.split('H')[0])  #in hours

    data_df['200dma'] = data_df['close'].rolling(int(200 * 24.0 / freq)).mean()
    data_df['bull'] = data_df['close'] > data_df['200dma']
    return data_df
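For reference, a quick look at the frequency strings the parsing above expects (made-up index; note pd.infer_freq can also return None, which this function does not guard against):

import pandas as pd

idx = pd.date_range("2022-01-01", periods=6, freq="4H")
print(pd.infer_freq(idx))                # '4H'
print(pd.infer_freq(idx).split('H')[0])  # '4'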
Example #12
def data_freq(time_series):
    """
    Determine frequency of given time series

    Args:
        time_series (Series): Series with datetime index

    Returns:
        string: frequency specifier
    """
    try:
        freq = time_series.index.freq
        return freq.freqstr or pd.infer_freq(time_series.index)
    except AttributeError:
        return pd.infer_freq(time_series.index)
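A short usage sketch of data_freq with made-up data, showing both branches (explicit freq attribute vs. inference):

import pandas as pd

s = pd.Series(range(4), index=pd.date_range("2021-01-01", periods=4, freq="D"))
print(data_freq(s))  # 'D' -- taken from the index's freq attribute

s2 = s.copy()
s2.index = pd.DatetimeIndex(s2.index.values)  # rebuild the index; freq is lost
print(data_freq(s2))  # 'D' -- recovered via pd.infer_freq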
Example #13
def infer_or_inject_freq(df, injected_freq='1s', start_date=None, **kwargs):
    """
        Infer index frequency. If there's not a proper time index, create fake timestamps,
        keeping the desired `injected_freq`. If that is None, set a default one of 1 second.
        start_date: the first date of the index (int or string).
    """
    inferred_freq = pd.infer_freq(df.index)
    if inferred_freq == 'N':
        timedelta = pd.to_timedelta(injected_freq)
        df.index = pd.to_datetime(ifnone(start_date, 0), **
                                  kwargs) + timedelta * df.index
        df.index.freq = pd.infer_freq(df.index)
    else:
        df.index.freq = inferred_freq
    return df
Example #14
    def merge(self, ts: 'TimeSeries') -> 'TimeSeries':
        """Merge two time series and make sure all the given indexes are sorted.

        Args:
            ts: the TimeSeries to merge with self

        Returns:
            TimeSeries
        """
        # concatenate and infer the new freq (note: the inferred value is not
        # stored anywhere here)
        merged = pd.concat([self.series, ts.series])
        infer_freq(merged.index)

        # instantiate a TimeSeries to sort it
        return TimeSeries(merged, self.metadata)
Example #15
def DINGO_df_to_data_structure(file_in, 
                               var_list = None,
                               fill_missing_with_nan = True,
                               output_structure = None,
                               return_global_attr = False):

    df = pd.read_pickle(file_in)
    time_step = pd.infer_freq(df.index)
    # Strip the unit letter, e.g. '30T' -> 30; bare frequencies like 'T' imply a step of 1.
    attr_dict = {'time_step': int(time_step[:-1]) if time_step[:-1] else 1}

    all_var_list = df.columns
    
    if var_list is None:
        var_list = all_var_list
    else:
        if not isinstance(var_list, list): var_list = [var_list]
    
    if output_structure == 'pandas':
        data_structure = df[var_list]
    else:
        data_dict = {'date_time': np.array([pd.Timestamp(rec).to_pydatetime()
                                            for rec in df.index])}
        for var in var_list:
            data_dict[var] = np.array(df[var])
        data_structure = data_dict

    if return_global_attr:
        return data_structure, attr_dict
    else:
        return data_structure
Example #16
def get_freq(X):
    if isinstance(X.index, pd.MultiIndex):
        freq = get_freq_multi_idx(X)
    else:
        freq = to_offset(pd.infer_freq(X.index))

    return freq
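A hedged sketch of the non-MultiIndex branch above (get_freq_multi_idx is assumed to be a project-specific helper):

import pandas as pd
from pandas.tseries.frequencies import to_offset

idx = pd.date_range("2022-01-01", periods=5, freq="15T")
print(to_offset(pd.infer_freq(idx)))  # <15 * Minutes>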
Example #17
def change_time_zone(ts, tz):
    """Convert hourly time series to new time zone. UTC is assumed if no time zone is
    assigned to the input time series.

    :param pandas.DataFrame/pands.Series ts: time series.
    :param str tz: new time zone.
    :return: (*pandas.DataFrame/pandas.Series*) -- time series with new time zone.
    :raises TypeError: if tz is not a str.
    :raises ValueError: if tz is invalid or the time series has already been resampled.
    """
    _check_time_series(ts, "time series")

    if pd.infer_freq(ts.index) != "H":
        raise ValueError("frequency of time series must be 1h")

    if not isinstance(tz, str):
        raise TypeError("time zone must be a str")
    try:
        pytz.timezone(tz)
    except pytz.exceptions.UnknownTimeZoneError:
        raise ValueError("Unknown time zone %s" % tz)

    ts.index.name = tz
    if ts.index.tz is None:
        return ts.tz_localize("UTC").tz_convert(tz)
    else:
        return ts.tz_convert(tz)
Example #18
def parse_data(df):
    if isinstance(df, pd.DataFrame):
        if df.shape[1] > 1:
            raise ValueError(
                "The dataframe should only contain one target column")
    elif isinstance(df, pd.Series):
        df = df.to_frame()
    else:
        raise TypeError(
            "Please supply a pandas dataframe with one column or a pandas series"
        )
    try:
        df.index.date
    except AttributeError:
        raise TypeError("The index should be a datetime type")
    if df.isnull().any().values[0]:
        raise ValueError(
            "The dataframe cannot have any null values, please interpolate")
    try:
        df.columns = ["Target"]
    except ValueError:
        raise ValueError("There should only be one column")

    df.index = df.index.rename("Date")
    df.index = add_freq(df.index)

    print(
        "The data has been successfully parsed by inferring a frequency, and establishing a 'Date' index and 'Target' column."
    )

    return df, pd.infer_freq(df.index)
Example #19
def _round_date_nearest_index(time_index_series, time_stamp):
    """ Internal helper function to round the date to the nearest DateTimeIndex in the DataFrame.
    Note: Prioritises the index in the same month first. Eg. if 1 Jan is given, and indexes 31 Dec and 31 Jan are
    available, it will round to 31 Jan.
    If no valid date is found, it will just round it to the oldest or newest date, whichever is closer.
    """
    # Infer the frequency once, outside the loop.
    freq_is_yearly = "as" in pd.infer_freq(time_index_series).lower()
    for time in time_index_series:
        # If the timestamp and a time in index are equal, just return
        if time == time_stamp:
            return time
        # If the frequency of the series is yearly, just check same year
        if freq_is_yearly and time.year == time_stamp.year:
            print("Time given was rounded to the same year as an index.")
            return time
        # Else assume monthly, check same month, same year
        elif time.month == time_stamp.month and time.year == time_stamp.year:
            print("Time given was rounded to the same month as an index.")
            return time
    # If all the above fail, round to the first or last value in the entire series
    if abs((time_stamp - time_index_series[0]).total_seconds()) > abs(
            (time_stamp - time_index_series[-1]).total_seconds()):
        print("Out of range so rounding to in range index")
        return time_index_series[-1]
    # This code should never be reached under normal circumstances, placed as a failsafe.
    print("Did not round")
    return time_index_series[0]
Example #20
def join_pathname(df, inplace=False, a=None, c=None, e=None, f=None):
    """
    Summary
    -------
    Function to join pathname parts of CalSim tidy DataFrame into a "Pathname"
    column.

    """
    # Initialize DataFrame for operation.
    df_out = df if inplace else df.copy()
    # Infer Part E, if not provided.
    if not e:
        inf_t_step = pd.infer_freq(df_out['DateTime'].unique())
        e = variables.t_steps_inv[inf_t_step] if inf_t_step else None
    # Set column requirements.
    req_col = {'Part A': a, 'Part C': c, 'Part E': e, 'Part F': f}
    # Fill missing required columns.
    miss_col = list()
    for k, v in req_col.items():
        if k not in df_out.columns:
            if v:
                df_out[k] = v
            else:
                miss_col.append(k)
    if miss_col:
        msg = 'Values required for the following columns: {}.'
        raise ValueError(msg.format(miss_col))
    # Create "Pathname" column.
    construct_pathname = lambda x: r'/{}/{}/{}//{}/{}/'.format(*x.values)
    col_part = ['Part A', 'Part B', 'Part C', 'Part E', 'Part F']
    df_out['Pathname'] = df_out[col_part].apply(construct_pathname, axis=1)
    # Drop pathname parts.
    df_out.drop(col_part, axis=1, inplace=True)
    # Return DataFrame.
    return df_out
Example #21
def get_ytw_from_date(fromdate, srcfile=r'src/YTW-All-Values.xlsx'):
    '''
    load data from source file into dataframe
    columns:
        Corp - corporate bond rate
        TB - treasury bond rate
        CS - credit spread
        Econ - economic data
    '''

    import nb_credit_spread as cslibrary
    cslib = cslibrary.creditspread()
    return cslib.get_ytw_from_date_delta(srcfile=srcfile)

    # NOTE: the early return above makes everything below unreachable;
    # the legacy implementation is kept for reference only.
    import pandas as pd
    import cs_logger as cslog

    source_file = srcfile
    src_file = pd.read_excel(source_file,
                             sheet_name='data',
                             header=0,
                             index_col='Date')
    ytw_df = pd.DataFrame(src_file)
    ytw_df = ytw_df.asfreq(pd.infer_freq(
        ytw_df.index))  # infer data frequency; monthly

    start_date = pd.to_datetime(fromdate)
    ytw_df = ytw_df[start_date:]  # filter records by date

    cslog.debug(f"df: {ytw_df.head()}")
    return ytw_df
Example #22
def restore_index(df, idx_meta, rowid_sort=True):
    """
    restore index proper

    :param df: the dataframe
    :param idx_meta: index metadata
    :param rowid_sort: whether to sort by row id. defaults to True
           If your query is already sorted in some specific way,
           specify False to keep the sort order.
    """
    # -- establish row order proper
    if rowid_sort and '_om#rowid' in df:
        df.sort_values('_om#rowid', inplace=True)
        del df['_om#rowid']
    # -- get index columns
    index_cols = restore_index_columns_order(df.columns)
    # -- set index columns
    result = df.set_index(index_cols) if index_cols else df
    if index_cols:
        result.index.names = idx_meta.get('names', [None] * len(index_cols))
    if isinstance(result.index, pd.DatetimeIndex):
        # restore datetime frequency, if possible
        if 'freq' in idx_meta:
            try:
                freq = idx_meta.get('freq')
                freq = freq or pd.infer_freq(result.index)
                result = result.asfreq(freq)
            except (TypeError, ValueError):
                pass
    return result
Example #23
def _get_feats(index,
               ts,
               freq,
               scale=True,
               features=[
                   acf_features, arch_stat, crossing_points, entropy,
                   flat_spots, heterogeneity, holt_parameters, lumpiness,
                   nonlinearity, pacf_features, stl_features, stability,
                   hw_parameters, unitroot_kpss, unitroot_pp, series_length,
                   hurst
               ],
               dict_freqs=FREQS):

    if freq is None:
        freq = pd.infer_freq(ts['ds'])
        freq = dict_freqs[freq]

    if isinstance(ts, pd.DataFrame):
        assert 'y' in ts.columns
        ts = ts['y'].values

    if isinstance(ts, pd.Series):
        ts = ts.values

    if scale:
        ts = scalets(ts)

    c_map = ChainMap(
        *[dict_feat for dict_feat in [func(ts, freq) for func in features]])

    return pd.DataFrame(dict(c_map), index=[index])
Example #24
def test_seasonality_transformer(
    X_start, X_len, weekdays, weeks, months, quarter, year
):

    X = pd.DataFrame(index=pd.date_range(X_start, periods=X_len, freq="D"))
    y = pd.Series(np.arange(len(X)), name="values", index=X.index)

    freq = pd.infer_freq(X.index)

    df = pd.concat(
        [X, y, SeasonalityTransformer(freq=freq).fit(X, y).transform(X)], axis=1
    )

    assert set(weekdays).issubset(df.columns)
    assert set(weeks).issubset(df.columns)
    assert set(months).issubset(df.columns)
    assert set(quarter).issubset(df.columns)
    assert set(year).issubset(df.columns)

    first_row = df.head(1).T
    cols_with_ones = first_row[first_row[first_row.columns[0]] == 1].index

    single_date_cols = (
        SeasonalityTransformer(freq=freq)
        .fit(X.head(1), y.head(1))
        .transform(X.head(1))
        .columns
    )

    assert set(cols_with_ones) == set(single_date_cols)
Example #25
def tsreg(ts, freq=None, interp=False):
    """
    Function to regularize a time series object (pandas).
    The first three indices must be regular for freq=None!

    Parameters
    ----------
    ts : DataFrame
        pandas time series dataframe.
    freq : str or None
        Either specify the known frequency of the data or use None to
        determine the frequency from the first three indices.
    interp : bool
        Should linear interpolation be applied on all missing data?

    Returns
    -------
    DataFrame
    """

    if freq is None:
        freq = pd.infer_freq(ts.index[:3])
    ts1 = ts.resample(freq).mean()
    if interp:
        ts1 = ts1.interpolate('time')

    return ts1
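A small usage sketch with made-up data: the first three stamps are regular (daily), a later stamp is missing, and interpolation fills the gap:

import pandas as pd

ts = pd.DataFrame({"v": [1.0, 2.0, 3.0, 5.0]},
                  index=pd.to_datetime(["2020-01-01", "2020-01-02",
                                        "2020-01-03", "2020-01-05"]))
print(tsreg(ts, interp=True)["v"].tolist())  # [1.0, 2.0, 3.0, 4.0, 5.0]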
Example #26
def tidy_to_wide(df):
    """
    Summary
    -------
    Transforms a copy of the input DataFrame from tidy to wide data format.

    """
    # Ensure input DataFrame is in tidy format.
    if not validation.is_tidy(df):
        msg = 'Cannot transform DataFrame from tidy format to wide format.'
        raise TypeError(msg)
    # Initialize DataFrame for operation.
    df_out = df.copy()
    # Split Pathname into Parts.
    split_pathname(df_out, inplace=True)
    # Pivot DataFrame.
    col_header = [
        'Part A', 'Part B', 'Part C', 'Part E', 'Part F', 'Units', 'Data Type'
    ]
    if 'Study' in df_out.columns:
        col_header.insert(0, 'Study')
    df_out.set_index(col_header + ['DateTime'], append=True, inplace=True)
    df_out.reset_index(0, drop=True, inplace=True)
    df_out = df_out['Value']
    df_out = df_out.unstack(col_header)
    df_out.index.freq = pd.infer_freq(df_out.index, warn=False)
    # Return DataFrame.
    return df_out
Example #27
File: perf.py Project: ychaim/tia
def periodicity(freq_or_frame):
    """
    resolve the number of periods per year
    """
    if hasattr(freq_or_frame, 'rule_code'):
        rc = freq_or_frame.rule_code
        rc = rc.split('-')[0]
        factor = PER_YEAR_MAP.get(rc, None)
        if factor is not None:
            return factor / abs(freq_or_frame.n)
        else:
            raise Exception('Failed to determine periodicity. No factor mapping for %s' % freq_or_frame)
    elif isinstance(freq_or_frame, str):
        factor = PER_YEAR_MAP.get(freq_or_frame, None)
        if factor is not None:
            return factor
        else:
            raise Exception('Failed to determine periodicity. No factor mapping for %s' % freq_or_frame)
    elif isinstance(freq_or_frame, (pd.Series, pd.DataFrame)):
        freq = freq_or_frame.index.freq
        if not freq:
            freq = pd.infer_freq(freq_or_frame.index)
            if freq:
                return periodicity(freq)
            else:
                # Attempt to resolve it
                import warnings

                freq = guess_freq(freq_or_frame.index)
                warnings.warn('frequency not set. guessed it to be %s' % freq)
                return periodicity(freq)
        else:
            return periodicity(freq)
    else:
        raise ValueError("periodicity expects DataFrame, Series, or rule_code property")
Example #28
def scale_profile(profile, weight):
    """Scale hourly profile using a list of monthly weights.

    :param pandas.Series profile: hourly profile.
    :param list weight: list of monthly weights.
    :return: (*pandas.Series*) -- scaled hourly profile.
    :raises TypeError: if profile is not a time series or weight is not a list.
    :raises ValueError: if frequency of time series is not 1h or size of weight is
        not 12
    """
    if not isinstance(profile, pd.Series):
        raise TypeError("profile must be a pandas.Series object")
    if not isinstance(weight, list):
        raise TypeError("weight must be a list")
    if pd.infer_freq(profile.index) != "H":
        raise ValueError("frequency of time series must be 1h")
    if len(weight) != 12:
        raise ValueError("the list of weight must have exactly 12 elements")

    monthly_profile = profile.resample("M").sum(min_count=24 * 28)
    monthly_factor = [t / p for t, p in zip(weight, monthly_profile.values)]
    hourly_factor = (pd.Series(
        monthly_factor,
        index=pd.date_range(profile.index.min(), periods=12, freq="MS"),
    ).resample("H").ffill().reindex(profile.index, method="ffill"))

    return profile * hourly_factor
Example #29
def validate_continuous_fasts(fasts: pd.Series) -> bool:
    """
    Validate a continuous log of fasts for use by other module functions.
    Validations:
        - Frequency of series index is 1 minute ('T')
        - Value at each time step is either 0 or 1 (0 ~ not fasting, 1 ~ fasting), no extraneous or NaN values

    Args:
        fasts: Series of continuous logs with a datetime index at a 1 minute frequency and values of 0 or 1.

    Returns: True if the fasts series is valid.
    """

    # Validate frequency of index is 1 minute ('T')
    freq = pd.infer_freq(fasts.index)
    if freq != 'T':
        raise ValueError(f"""
                        Frequency of the continuous fast must be: 'T' (1 minute).
                        Frequency of fasts series input: {freq}.
                        """)

    # Validate values only contain 0 or 1
    if not fasts.isin([0, 1]).all():
        unexpected_values = fasts[((fasts != 0) & (fasts != 1))]
        raise ValueError(f"""
                        Continuous fast (input to fasts) must contain only values of 0 or 1.
                        Check fasts for extraneous or NaN values:
                        {unexpected_values}
                        """)

    return True
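A quick usage sketch with a made-up five-minute log at the required 1-minute frequency:

import pandas as pd

idx = pd.date_range("2023-01-01 00:00", periods=5, freq="T")
fasts = pd.Series([0, 1, 1, 1, 0], index=idx)
print(validate_continuous_fasts(fasts))  # True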
Example #30
    def fit(self, X, y=None):
        """Check if `date_col` has daily frequency

        This check is in `fit` method since pandas.infer_freq is used which requires at least 3 observations.

        Parameters
        ----------
        X : pandas.DataFrame
            Input features.

        y : Any
            Ignored

        Returns
        -------
        HolidayTransformer
            self

        Raises
        ------
        ValueError
            in case daily frequency is not used or very few datapoints are provided in X
        """
        if pd.infer_freq(X.index) != "D":
            raise ValueError(
                f"HolidayTransformer can be used only with daily frequency in index. "
                f"Your index is of type {type(X.index)} with frequency {pd.infer_freq(X.index)}"
            )

        return self
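As the docstring notes, pd.infer_freq itself needs at least three observations; a minimal sketch of what happens below that:

import pandas as pd

short = pd.DatetimeIndex(["2020-01-01", "2020-01-02"])
try:
    pd.infer_freq(short)
except ValueError as err:
    print(err)  # pandas refuses to infer from fewer than 3 dates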
Example #31
    def _reindex(self, data, times, columns):

        if len(data) != len(times):

            if self.resample:
                # Resample at a specific frequency
                kwargs = {"periods": len(data)}
                if self.resample_rate is None:
                    kwargs["freq"] = pd.infer_freq(times)
                    kwargs["freq"] = pd.tseries.frequencies.to_offset(kwargs["freq"])
                else:
                    kwargs["freq"] = pd.DateOffset(seconds=1 / self.resample_rate)
                if self.resample_direction == "right":
                    kwargs["start"] = times[0]
                elif self.resample_direction == "left":
                    kwargs["end"] = times[-1]
                else:

                    def middle(a):
                        return int(np.ceil(len(a) / 2)) - 1

                    kwargs["start"] = times[middle(times)] - (
                        middle(data) * kwargs["freq"]
                    )
                times = pd.date_range(**kwargs)

            else:
                # Linearly arrange between first and last
                times = pd.date_range(start=times[0], end=times[-1], periods=len(data))

        return pd.DataFrame(data, times, columns)
Example #32
def infer_periodicity(train):
    perd = pd.infer_freq(train.index)
    if perd in ["MS", "M", "BM", "BMS"]:
        periodocity = 12
    elif perd in ["BH", "H"]:
        periodocity = 24
    elif perd == "B":
        periodocity = 5
    elif perd == "D":
        periodocity = 7
    elif perd in [
            "W", "W-SUN", "W-MON", "W-TUE", "W-WED", "W-THU", "W-FRI", "W-SAT"
    ]:
        periodocity = 52
    elif perd in ["Q", "QS", "BQ", "BQS"]:
        periodocity = 4
    elif perd in ["A", "BA", "AS", "BAS"]:
        periodocity = 10
    elif perd in ["T", "min"]:
        periodocity = 60
    elif perd == "S":
        periodocity = 60
    elif perd in ["L", "ms"]:
        periodocity = 1000
    elif perd in ["U", "us"]:
        periodocity = 1000
    elif perd == "N":
        periodocity = 1000

    return periodocity
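A quick usage sketch with a made-up monthly series:

import pandas as pd

train = pd.Series(range(24), index=pd.date_range("2020-01-01", periods=24, freq="MS"))
print(infer_periodicity(train))  # 12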
Example #33
File: knmi.py Project: pastas/pasta
def read_knmi(fname, variables='RD'):
    """This method can be used to import KNMI data.

    Parameters
    ----------
    fname: str
        Filename and path to a Dino file.
    variables: str
        String with the variable name to extract.

    Returns
    -------
    ts: pastas.TimeSeries
        returns a Pastas TimeSeries object or a list of objects.

    """
    knmi = KnmiStation.fromfile(fname)
    if variables is None:
        variables = knmi.variables.keys()
    if isinstance(variables, str):
        variables = [variables]

    stn_codes = knmi.data['STN'].unique()

    ts = []
    for code in stn_codes:
        for variable in variables:
            if variable not in knmi.data.keys():
                raise ValueError(
                    "variable %s is not in this dataset. Please use one of "
                    "the following keys: %s" % (variable, knmi.data.keys()))

            series = knmi.data.loc[knmi.data['STN'] == code, variable]
            # get rid of the hours when data is daily
            if pd.infer_freq(series.index) == 'D':
                series.index = series.index.normalize()

            metadata = {}
            if knmi.stations is not None and not knmi.stations.empty:
                station = knmi.stations.loc[code, :]
                metadata['x'] = station.LON_east
                metadata['y'] = station.LAT_north
                metadata['z'] = station.ALT_m
                metadata['projection'] = 'epsg:4326'
                stationname = station.NAME
            else:
                stationname = str(code)
            metadata['description'] = knmi.variables[variable]
            if variable == 'RD' or variable == 'RH':
                kind = 'prec'
            elif variable == 'EV24':
                kind = 'evap'
            else:
                kind = None
            ts.append(TimeSeries(series, name=variable + ' ' + stationname,
                                 metadata=metadata, settings=kind))
    if len(ts) == 1:
        ts = ts[0]
    return ts
Example #34
def xlsx_to_pandas(file_in,header=True,header_row=0,skiprows_after_header=0,date_col=True,regularise=True,worksheets=[]):

    xl_book=xlrd.open_workbook(file_in)
    
    d={}
    
    start_date='1900-01-01'
    end_date='2100-01-01'    
    
    if not worksheets:
        get_sheets=xl_book.sheet_names()
    else:
        get_sheets=worksheets
    
    for sheet_name in get_sheets:

        sheet=xl_book.sheet_by_name(sheet_name)
       
        rows=sheet.nrows
        cols=sheet.ncols
        
        if rows==0: print('Could not find any valid rows')
        if cols==0: print('Could not find any valid columns')

        if rows!=0 and cols!=0:
            if header==True:
                if date_col==True:
                    column_names=[str(sheet.cell_value(header_row,i)) for i in range(cols)]
                    index=[]
                    for i in range(header_row+skiprows_after_header+1,sheet.nrows):
                        try:
                            index.append(dt.datetime(*xlrd.xldate_as_tuple(sheet.cell_value(i,0), xl_book.datemode)))
                        except ValueError:
                            index.append('')
                            print('Error in sheet '+sheet_name+' at row '+str(i)+'; missing or invalid datetime stamp! Skipping...')
                    df=pd.DataFrame(columns=column_names[1:],index=index)
                    for i in range(1,cols):
                        arr=np.array(sheet.col_values(i)[header_row+skiprows_after_header+1:])
                        arr[arr=='']='-9999'
                        df[column_names[i]]=arr.astype(float)
                    if regularise==True:
                        df_freq=pd.infer_freq(df.index)
                        df_ind=pd.date_range(start=df.index[0],end=df.index[-1],freq=df_freq)
                        df=df.reindex(df_ind)
                    d[sheet_name]=df
        else:
            d[sheet_name]=pd.DataFrame()
    return d          
    
    # Multiple sheets are returned as dictionary (pandas dataframe) objects
    # Note XLRD cell type codes:
    #    XL_CELL_EMPTY: 0
    #    XL_CELL_TEXT: 1 (STRING)
    #    XL_CELL_NUMBER: 2 (FLOAT)
    #    XL_CELL_DATE: 3 (FLOAT)
    #    XL_CELL_BOOLEAN: 4 (INT)
    #    XL_CELL_ERROR: 5 (INTERNAL EXCEL CODE)
    #    XL_CELL_BLANK: 6 (EMPTY STRING)
Example #35
    def _QC(self):

        # Extract the numeric step from the inferred frequency string,
        # e.g. '30T' -> 30 (the old int(filter(...)) idiom breaks on Python 3).
        freq_str = pd.infer_freq(self.df.index)
        interval = int(''.join(ch for ch in freq_str if ch.isdigit()))
        assert interval % 30 == 0
        recs_per_day = 1440 // interval
        self.recs_per_day = recs_per_day

        return
Example #37
def _check_period_index(x, freq="M"):
    from pandas import PeriodIndex, DatetimeIndex
    if not isinstance(x.index, (DatetimeIndex, PeriodIndex)):
        raise ValueError("The index must be a DatetimeIndex or PeriodIndex")

    if x.index.freq is not None:
        inferred_freq = x.index.freqstr
    else:
        inferred_freq = pd.infer_freq(x.index)
    if not inferred_freq or not inferred_freq.startswith(freq):
        raise ValueError("Expected frequency {}. Got {}".format(freq,
                                                                inferred_freq))
Example #38
def add_freq(idx, freq=None):
    """Add a frequency attribute to idx, through inference or directly.

    Returns a copy.  If `freq` is None, it is inferred.
    """

    idx = idx.copy()
    if freq is None:
        if idx.freq is None:
            freq = pd.infer_freq(idx)
        else:
            return idx
    idx.freq = pd.tseries.frequencies.to_offset(freq)
    if idx.freq is None:
        raise AttributeError('no discernible frequency found in `idx`. Specify'
                             ' a frequency string with `freq`.')
    return idx
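A short usage sketch with a made-up index that carries no freq attribute:

import pandas as pd

idx = pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"])
print(idx.freq)            # None
print(add_freq(idx).freq)  # <Day>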
Example #39
    def __init__(self, dataframe, resample = True, names_dict = None, 
                 insolation_threshold = 10, season_routine = 'standard'):

        freq_str = pd.infer_freq(dataframe.index)
        interval = int(''.join(ch for ch in freq_str if ch.isdigit()))
        assert interval % 30 == 0
        assert season_routine in ['standard', 'barr']
        if not names_dict: 
            self.external_names = self._define_default_external_names()
        else:
            self.external_names = names_dict
        self.df = utils.rename_df(dataframe, self.external_names, 
                                  self._define_default_internal_names())
        self.resample = resample
        self.insolation_threshold = insolation_threshold
        self.season_routine = season_routine
        self.interval = interval
        self.season_n = 1000 if interval == 30 else 600
        self.bin_n = 5 if interval == 30 else 3
        self.valid_years_list = self._get_valid_years()
Example #40
 def _get_stats_and_qc(self):
     
     freq_str = pd.infer_freq(self.df.index)
     interval = int(''.join(ch for ch in freq_str if ch.isdigit()))
     if not interval % 30 == 0:
         raise RuntimeError('Dataset datetime index is non-contiguous - '
                            'exiting')
     df_length = len(self.df)
     model_length = len(self.df.loc[pd.isnull(self.df.Model) == 0])
     obs_length = len(self.df.loc[pd.isnull(self.df.Observations) == 0])
     pct_available = obs_length / float(df_length) * 100
     if model_length != df_length:
         raise RuntimeError('{} missing values in model series... aborting'
                            .format(str(df_length - model_length)))
     if pct_available < self.minimum_pct:
         raise RuntimeError('Insufficient data to proceed (minimum % '
                            'set to {0}, encountered only {1}%)... '
                            'returning'
                            .format(self.minimum_pct,
                                    round(pct_available, 1)))
     self.interval = interval
     self.pct_available = pct_available
     return
Example #41
    def validate_series(self, series):
        """ This method performs some PASTAS specific tests for the TimeSeries.

        Parameters
        ----------
        series: pd.Series
            Pandas Series object containing the time series.

        Returns
        -------
        series: pandas.Series
            The validated series as pd.Series

        Notes
        -----
        The Series are validated for the following cases:

            1. Series is an actual pandas Series;
            2. Nan-values from begin and end are removed;
            3. Nan-values between observations are removed;
            4. Indices are in Timestamps (standard throughout PASTAS),
            making the index a pandas DateTimeIndex.
            5. Duplicate indices are removed (by averaging).

        """

        # 2. Make sure the indices are Timestamps and sorted
        series = series.astype(float)
        series.index = pd.to_datetime(series.index)
        series = series.sort_index()
        series.index.name = ""

        # 3. Drop nan-values at the beginning and end of the time series
        series = series.loc[series.first_valid_index():series.last_valid_index(
        )].copy(deep=True)

        # 4. Find the frequency of the original series
        if self.freq_original:
            pass
        elif pd.infer_freq(series.index):
            self.freq_original = pd.infer_freq(series.index)
            logger.info("Inferred frequency from time series %s: freq=%s " % (
                self.name, self.freq_original))
        else:
            self.freq_original = self.settings["freq"]
            if self.freq_original is None:
                logger.info(
                    "Cannot determine frequency of series %s" % self.name)
            elif self.settings["fill_nan"] and self.settings["fill_nan"] != \
                    "drop":
                logger.warning("User-provided frequency is applied when "
                               "validating the Time Series %s. Make sure the "
                               "provided frequency is close to the real "
                               "frequency of the original series." % self.name)

        # 5. Handle duplicate indices
        if not series.index.is_unique:
            logger.warning("duplicate time-indexes were found in the Time "
                           "Series %s. Values were averaged." % self.name)
            grouped = series.groupby(level=0)
            series = grouped.mean()

        # 6. drop nan-values
        if series.hasnans:
            series = self.fill_nan(series)

        if self.settings["tmin"] is None:
            self.settings["tmin"] = series.index.min()
        if self.settings["tmax"] is None:
            self.settings["tmax"] = series.index.max()

        return series
Example #42

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

data = [
    446.6565, 454.4733, 455.663, 423.6322, 456.2713, 440.5881, 425.3325,
    485.1494, 506.0482, 526.792, 514.2689, 494.211
]
index = pd.date_range(start='1996', end='2008', freq='A')
oildata = pd.Series(data, index)
oildata.index = pd.DatetimeIndex(
    oildata.index, freq=pd.infer_freq(oildata.index))

data = [
    17.5534, 21.86, 23.8866, 26.9293, 26.8885, 28.8314, 30.0751, 30.9535,
    30.1857, 31.5797, 32.5776, 33.4774, 39.0216, 41.3864, 41.5966
]
index = pd.date_range(start='1990', end='2005', freq='A')
air = pd.Series(data, index)
air.index = pd.DatetimeIndex(air.index, freq=pd.infer_freq(air.index))

data = [
    263.9177, 268.3072, 260.6626, 266.6394, 277.5158, 283.834, 290.309,
    292.4742, 300.8307, 309.2867, 318.3311, 329.3724, 338.884, 339.2441,
    328.6006, 314.2554, 314.4597, 321.4138, 329.7893, 346.3852, 352.2979,
    348.3705, 417.5629, 417.1236, 417.7495, 412.2339, 411.9468, 394.6971,
    401.4993, 408.2705, 414.2428
Example #43
 def _resample_date_range(self, date_range, freq):
     orig_freq_str = pd.infer_freq(date_range)
     orig_freq = pd.tseries.frequencies.to_offset(orig_freq_str)
     min_date = date_range[0]
     max_date = date_range[-1] + orig_freq
     return pd.date_range(min_date, max_date, freq=freq, inclusive="left")
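The same extend-then-resample pattern inline, on a made-up monthly range (assumes pandas >= 1.4 for the `inclusive` keyword):

import pandas as pd

monthly = pd.date_range("2021-01-01", periods=3, freq="MS")
orig = pd.tseries.frequencies.to_offset(pd.infer_freq(monthly))
daily = pd.date_range(monthly[0], monthly[-1] + orig, freq="D", inclusive="left")
print(daily[-1])  # 2021-03-31 00:00:00 -- the final month is fully covered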
Example #44
def correctDrift(drifted, correct_drifted_vars=None, correct=None,
                get_fit=True, write_fit=True, fit_file='correctDrift_linfit.params',
                apply_fit=True, show_plot=False, return_plot=False, units={}, return_index=False):
    """
    Parameters
    -----------
    correct: pandas.DataFrame
        dataset with the correct averages
    drifted: pandas.DataFrame
        dataset with the averages that need to be corrected
    correct_drifted_vars: dict
        dictionary where every key is a var in the right dataset and 
        its value is its correspondent in the drifted dataset
    get_fit: bool
        whether or not to fit a linear relation between both datasets. Generally slow. Should only be done once
    write_fit: bool
        if get_fit == True, whether or not to write the linear fit to a file (recommended)
    fit_file: string
        where to write the linear fit (if one is written) or from where to read the linear fit (if no fit is written)
    apply_fit: bool
        whether or not to apply the linear fit and correct the data (at least get_fit and fit_file must be true)
    show_plot: bool
        whether or not to show drifted vs correct plot, to see if it's a good fit
    units: dict
        if given, it creates a {fit_file}.units file, recording the units the data
        has to be in, in order to be correctly corrected
    return_index: bool
        whether to return the indexes of the used points for the calculation. Serves to check the regression

    Returns
    -------
    outdf: pandas.DataFrame
        drifted dataset corrected with right dataset
    """
    from matplotlib import pyplot as plt
    import pandas as pd
    import numpy as np

    if correct_drifted_vars:
        rwvars = correct_drifted_vars
    else:
        if len(correct.columns)==1:
            rwvars = { cor : dft for cor, dft in zip(correct.columns, drifted.columns) }
        else:
            raise NameError('If correct is not provided or has more than one column, you should provide correct_drifted_vars.')

    cors=[]
    #----------------
    # This option is activated if we provide a correct dataset from which to withdraw the correction parameters
    if get_fit:
        for slw, fst in rwvars.items():
            slow=correct[slw]
            fast=drifted[fst]
            #----------------
            # Check to see if the frequency in both datasets are the same. Otherwise we are comparing different things
            try:
                if pd.infer_freq(correct.index) == pd.infer_freq(drifted.index):
                    slow, fast = map(np.array, [slow, fast] )
                else:
                    print('Frequencies must be the same, however, inferred frequencies appear to be different. Please check.')
            except TypeError:
                print('Cannot determine if frequencies are the same. We will continue but you should check')
                slow, fast = map(np.array, [slow, fast] )
            #----------------
    
            #----------------
            # Does the 1D fitting filtering for NaN values (very important apparently)
            idx = np.isfinite(slow) & np.isfinite(fast)
            coefs, residuals, rank, singular_vals, rcond = np.polyfit(fast[idx], slow[idx], 1, full=True)
            #----------------
            if show_plot:
                plt.title('{} vs {}'.format(fst, slw))
                plt.plot(fast[idx], slow[idx], marker='o', linestyle='')
                plt.plot(fast[idx], np.poly1d(coefs)(fast[idx]), '-', linewidth=2)
                plt.xlabel(fst)
                plt.ylabel(slw)
                plt.grid(True)
                fig = plt.gcf()
                plt.show()
                if return_plot:
                    return fig
    
            correc=pd.DataFrame(columns=[ '{}_{}'.format(slw, fst) ], index=['angular', 'linear'], data=coefs).transpose()
            cors.append(correc)
        cors = pd.concat(cors, join='outer')
        print(cors)
    #----------------

        #----------------
        # Writes the fit parameters in a file to be used later
        if write_fit:
            cors.index.name='correct_drifted'
            cors.to_csv(fit_file, index=True)
            if units:
                with open(fit_file+'.units', 'wt') as fou:
                    for key, item in units.items():
                        fou.write('{"%s":"%s"}\n' % (key,item))
        #----------------

    #----------------
    # If no correct dataset is provided, the fit parameters must be read from an existing file
    else:
        cors=pd.read_csv(fit_file, index_col=0, header=0)
    #----------------

    #------------
    # Applies the fit column by column
    if apply_fit:
        corrected=drifted.copy()
        for slw, fst in rwvars.items():
            coefs = np.array(cors.loc['{}_{}'.format(slw,fst), ['angular','linear']])
            corrected[ fst ] = np.poly1d(coefs)(drifted[ fst ])
    else:
        corrected=drifted.copy()
    #------------

    #----------------
    # Returning the index idx is done mainly for checking purposes
    if return_index:
        return corrected, idx
    else:
        return corrected
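The core of the routine above is a plain least-squares line mapping the drifted sensor onto the reference one, which np.poly1d then evaluates on the drifted data. A minimal, self-contained sketch of that idea (the names slow_ref and fast_drifted are illustrative, not part of the original API):

import numpy as np
import pandas as pd

# Synthetic data: a reference series and a drifted copy (gain 1.1, offset 2.0)
idx = pd.date_range('2020-01-01', periods=100, freq='T')
slow_ref = pd.Series(np.random.randn(100).cumsum(), index=idx)
fast_drifted = 1.1 * slow_ref + 2.0

# Fit drifted -> reference, masking NaNs exactly as the function above does
mask = np.isfinite(slow_ref.values) & np.isfinite(fast_drifted.values)
coefs = np.polyfit(fast_drifted.values[mask], slow_ref.values[mask], 1)

# Evaluate the fitted line on the drifted data to recover the corrected series
corrected = pd.Series(np.poly1d(coefs)(fast_drifted.values), index=idx)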
Example #45
0
stdata[i] = 0

plt.figure()
plt.plot(stdata)

stdiff = np.diff(stdata,n=1)
plt.figure()
plt.plot(stdiff)

rng = pd.date_range(start='2016-01-01', periods=data.size, freq='H')
D = pd.DataFrame(data).set_index(rng)

dm = pd.DataFrame(D.resample('D').mean())
dm.index = pd.DatetimeIndex(pd.date_range(start='2016-01-01', periods=dm.shape[0], freq='D'))
pd.infer_freq(dm.index)

dmdiff = np.diff(dm[dm.columns[0]])

dm.plot()
plt.figure()
plt.plot(dmdiff)

np.mean(dmdiff)

#dm.to_csv('daily_means_2016.csv', sep=',')
#pd.DataFrame(dmdiff).to_csv('diff_daily_means_2016.csv', sep=',')


#### decompose dm and dmdiff ####
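A hedged continuation of the snippet (not in the original): the closing comment suggests a seasonal decomposition of dm, which statsmodels can do directly. The additive model and period=7 (a weekly cycle in the daily means) are assumptions, not taken from the source.

from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose the daily means into trend, seasonal, and residual components
result = seasonal_decompose(dm[dm.columns[0]], model='additive', period=7)
result.plot()
plt.show()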
Example #46
0
    def setup_class(cls):
        # Changed for backwards compatibility with pandas

        #oildata_oil_json = '{"851990400000":446.6565229,"883526400000":454.4733065,"915062400000":455.662974,"946598400000":423.6322388,"978220800000":456.2713279,"1009756800000":440.5880501,"1041292800000":425.3325201,"1072828800000":485.1494479,"1104451200000":506.0481621,"1135987200000":526.7919833,"1167523200000":514.268889,"1199059200000":494.2110193}'
        #oildata_oil = pd.read_json(oildata_oil_json, typ='Series').sort_index()
        data = [446.65652290000003, 454.47330649999998, 455.66297400000002,
                423.63223879999998, 456.27132790000002, 440.58805009999998,
                425.33252010000001, 485.14944789999998, 506.04816210000001,
                526.79198329999997, 514.26888899999994, 494.21101929999998]
        index= ['1996-12-31 00:00:00', '1997-12-31 00:00:00', '1998-12-31 00:00:00',
                '1999-12-31 00:00:00', '2000-12-31 00:00:00', '2001-12-31 00:00:00',
                '2002-12-31 00:00:00', '2003-12-31 00:00:00', '2004-12-31 00:00:00',
                '2005-12-31 00:00:00', '2006-12-31 00:00:00', '2007-12-31 00:00:00']
        oildata_oil = pd.Series(data, index)
        oildata_oil.index = pd.DatetimeIndex(oildata_oil.index,
                                freq=pd.infer_freq(oildata_oil.index))
        cls.oildata_oil = oildata_oil

        #air_ausair_json = '{"662601600000":17.5534,"694137600000":21.8601,"725760000000":23.8866,"757296000000":26.9293,"788832000000":26.8885,"820368000000":28.8314,"851990400000":30.0751,"883526400000":30.9535,"915062400000":30.1857,"946598400000":31.5797,"978220800000":32.577569,"1009756800000":33.477398,"1041292800000":39.021581,"1072828800000":41.386432,"1104451200000":41.596552}'
        #air_ausair = pd.read_json(air_ausair_json, typ='Series').sort_index()
        data = [17.5534, 21.860099999999999, 23.886600000000001,
                26.929300000000001, 26.888500000000001, 28.831399999999999,
                30.075099999999999, 30.953499999999998, 30.185700000000001,
                31.579699999999999, 32.577568999999997, 33.477398000000001,
                39.021580999999998, 41.386431999999999, 41.596552000000003]
        index= ['1990-12-31 00:00:00', '1991-12-31 00:00:00', '1992-12-31 00:00:00',
                '1993-12-31 00:00:00', '1994-12-31 00:00:00', '1995-12-31 00:00:00',
                '1996-12-31 00:00:00', '1997-12-31 00:00:00', '1998-12-31 00:00:00',
                '1999-12-31 00:00:00', '2000-12-31 00:00:00', '2001-12-31 00:00:00',
                '2002-12-31 00:00:00', '2003-12-31 00:00:00', '2004-12-31 00:00:00']
        air_ausair = pd.Series(data, index)
        air_ausair.index = pd.DatetimeIndex(air_ausair.index,
                                            freq=pd.infer_freq(air_ausair.index))
        cls.air_ausair = air_ausair

        #livestock2_livestock_json = '{"31449600000":263.917747,"62985600000":268.307222,"94608000000":260.662556,"126144000000":266.639419,"157680000000":277.515778,"189216000000":283.834045,"220838400000":290.309028,"252374400000":292.474198,"283910400000":300.830694,"315446400000":309.286657,"347068800000":318.331081,"378604800000":329.37239,"410140800000":338.883998,"441676800000":339.244126,"473299200000":328.600632,"504835200000":314.255385,"536371200000":314.459695,"567907200000":321.413779,"599529600000":329.789292,"631065600000":346.385165,"662601600000":352.297882,"694137600000":348.370515,"725760000000":417.562922,"757296000000":417.12357,"788832000000":417.749459,"820368000000":412.233904,"851990400000":411.946817,"883526400000":394.697075,"915062400000":401.49927,"946598400000":408.270468,"978220800000":414.2428}'
        #livestock2_livestock = pd.read_json(livestock2_livestock_json, typ='Series').sort_index()
        data = [263.91774700000002, 268.30722200000002, 260.662556,
                266.63941899999998, 277.51577800000001, 283.834045,
                290.30902800000001, 292.474198, 300.83069399999999,
                309.28665699999999, 318.33108099999998, 329.37239,
                338.88399800000002, 339.24412599999999, 328.60063200000002,
                314.25538499999999, 314.45969500000001, 321.41377899999998,
                329.78929199999999, 346.38516499999997, 352.29788200000002,
                348.37051500000001, 417.56292200000001, 417.12356999999997,
                417.749459, 412.233904, 411.94681700000001, 394.69707499999998,
                401.49927000000002, 408.27046799999999, 414.24279999999999]
        index= ['1970-12-31 00:00:00', '1971-12-31 00:00:00', '1972-12-31 00:00:00',
                '1973-12-31 00:00:00', '1974-12-31 00:00:00', '1975-12-31 00:00:00',
                '1976-12-31 00:00:00', '1977-12-31 00:00:00', '1978-12-31 00:00:00',
                '1979-12-31 00:00:00', '1980-12-31 00:00:00', '1981-12-31 00:00:00',
                '1982-12-31 00:00:00', '1983-12-31 00:00:00', '1984-12-31 00:00:00',
                '1985-12-31 00:00:00', '1986-12-31 00:00:00', '1987-12-31 00:00:00',
                '1988-12-31 00:00:00', '1989-12-31 00:00:00', '1990-12-31 00:00:00',
                '1991-12-31 00:00:00', '1992-12-31 00:00:00', '1993-12-31 00:00:00',
                '1994-12-31 00:00:00', '1995-12-31 00:00:00', '1996-12-31 00:00:00',
                '1997-12-31 00:00:00', '1998-12-31 00:00:00', '1999-12-31 00:00:00',
                '2000-12-31 00:00:00']
        livestock2_livestock = pd.Series(data, index)
        livestock2_livestock.index = pd.DatetimeIndex(
                livestock2_livestock.index,
                freq=pd.infer_freq(livestock2_livestock.index))
        cls.livestock2_livestock = livestock2_livestock

        #aust_json = '{"1104537600000":41.727458,"1112313600000":24.04185,"1120176000000":32.328103,"1128124800000":37.328708,"1136073600000":46.213153,"1143849600000":29.346326,"1151712000000":36.48291,"1159660800000":42.977719,"1167609600000":48.901525,"1175385600000":31.180221,"1183248000000":37.717881,"1191196800000":40.420211,"1199145600000":51.206863,"1207008000000":31.887228,"1214870400000":40.978263,"1222819200000":43.772491,"1230768000000":55.558567,"1238544000000":33.850915,"1246406400000":42.076383,"1254355200000":45.642292,"1262304000000":59.76678,"1270080000000":35.191877,"1277942400000":44.319737,"1285891200000":47.913736}'
        #aust = pd.read_json(aust_json, typ='Series').sort_index()
        data = [41.727457999999999, 24.04185, 32.328102999999999,
                37.328707999999999, 46.213152999999998, 29.346326000000001,
                36.482909999999997, 42.977719, 48.901524999999999,
                31.180221, 37.717880999999998, 40.420211000000002,
                51.206862999999998, 31.887228, 40.978262999999998,
                43.772491000000002, 55.558566999999996, 33.850915000000001,
                42.076383, 45.642291999999998, 59.766779999999997,
                35.191876999999998, 44.319737000000003, 47.913736]
        index= ['2005-03-01 00:00:00', '2005-06-01 00:00:00', '2005-09-01 00:00:00',
                '2005-12-01 00:00:00', '2006-03-01 00:00:00', '2006-06-01 00:00:00',
                '2006-09-01 00:00:00', '2006-12-01 00:00:00', '2007-03-01 00:00:00',
                '2007-06-01 00:00:00', '2007-09-01 00:00:00', '2007-12-01 00:00:00',
                '2008-03-01 00:00:00', '2008-06-01 00:00:00', '2008-09-01 00:00:00',
                '2008-12-01 00:00:00', '2009-03-01 00:00:00', '2009-06-01 00:00:00',
                '2009-09-01 00:00:00', '2009-12-01 00:00:00', '2010-03-01 00:00:00',
                '2010-06-01 00:00:00', '2010-09-01 00:00:00', '2010-12-01 00:00:00']
        aust = pd.Series(data, index)
        aust.index = pd.DatetimeIndex(aust.index,
                                      freq=pd.infer_freq(aust.index))
        cls.aust = aust
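Every block in this setup follows the same pattern: build a Series from date strings, then rebuild the index as a DatetimeIndex whose freq is set from pd.infer_freq, so the statsmodels code under test sees an explicit frequency. In miniature:

import pandas as pd

idx = pd.DatetimeIndex(['2005-12-31', '2006-12-31', '2007-12-31', '2008-12-31'])
s = pd.Series([1.0, 2.0, 3.0, 4.0],
              index=pd.DatetimeIndex(idx, freq=pd.infer_freq(idx)))
print(s.index.freq)  # <YearEnd: month=12>, inferred as 'A-DEC' ('YE-DEC' on newer pandas)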
Example #47
0
    all_utm_N[i] = utm_N
    all_utm_E[i] = utm_E

# convert the numpy arrays to pandas dataframes
df_Temp = pd.DataFrame(all_Temp)
df_Prec = pd.DataFrame(all_Prec)
df_Wind = pd.DataFrame(all_Wind)
df_RH = pd.DataFrame(all_RH)
df_SW = pd.DataFrame(all_SW)
df_LW = pd.DataFrame(all_LW)


# get the time variable from the original netCDF file
df_Time = pd.DataFrame(dsTotal.time.values)
dtIndex = pd.DatetimeIndex(dsTotal.time.values)
freq = pd.infer_freq(dtIndex)
#df_Time = df_Time.set_index(dtIndex)
# this is slow
#df_Time.apply(tz_update_utc)

df_TimeTemp = pd.concat([df_Time, df_Temp], axis = 1)
df_TimePrec = pd.concat([df_Time, df_Prec], axis = 1)
df_TimeWind = pd.concat([df_Time, df_Wind], axis = 1)
df_TimeRH = pd.concat([df_Time, df_RH], axis = 1)
df_TimeSW = pd.concat([df_Time, df_SW], axis = 1)
df_TimeLW = pd.concat([df_Time, df_LW], axis = 1)

# add time variable to data frame
# add names to the columns
df_TimeTemp.columns = columnNames
df_TimePrec.columns = columnNames
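The commented-out df_Time.apply(tz_update_utc) above is flagged as slow because it touches rows one at a time; assuming the intent is to attach UTC to naive timestamps, the vectorized DatetimeIndex methods do the same work in one call:

# naive timestamps -> UTC-aware (assumed intent of tz_update_utc)
dtIndex_utc = dtIndex.tz_localize('UTC')
# or, if the timestamps already carry another timezone:
# dtIndex_utc = dtIndex.tz_convert('UTC')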
Example #48
0
def asbestfreq(
        data,
        force_freq=None,
):
    """Test to determine best frequency to represent data.

    This uses several techniques.
    0.5. If the index is not a DatetimeIndex, return the data unchanged.
    1. If force_freq is set, use .asfreq.
    2. If data.index.freq is not None, just return.
    3. If data.index.inferred_freq is set, use .asfreq.
    4. Use pd.infer_freq - fails if any values are missing.
    5. Use the .is_* functions to establish A, AS, A-*, AS-*, Q, QS, M, MS.
    6. Use the minimum interval to establish fixed time periods up to weekly.
    7. Give up and return the data unchanged (no pandas offset string found).

    """
    if not isinstance(data.index, pd.DatetimeIndex):
        return data

    if force_freq is not None:
        return data.asfreq(force_freq)

    ndiff = (data.index.values.astype('int64')[1:] -
             data.index.values.astype('int64')[:-1])
    if np.any(ndiff <= 0):
        raise ValueError("""
*
*   Duplicate or time reversal index entry at
*   record {1} (start count at 0):
*   "{0}".
*
""".format(data.index[:-1][ndiff <= 0][0], pd.np.where(ndiff <= 0)[0][0] + 1))

    if data.index.freq is not None:
        return data

    # Since pandas doesn't set data.index.freq and data.index.freqstr when
    # using .asfreq, this function returns that PANDAS time offset alias code
    # also.  Not ideal at all.
    #
    # This gets most of the frequencies...
    if data.index.inferred_freq is not None:
        try:
            return data.asfreq(data.index.inferred_freq)
        except ValueError:
            pass

    # Limit pd.infer_freq to the first 100 points; it can be slow or fail on a large dataset
    if len(data.index) > 100:
        slic = slice(None, 99)
    else:
        slic = slice(None, None)
    try:
        infer_freq = pd.infer_freq(data.index[slic])
    except ValueError:
        infer_freq = None
    if infer_freq is not None:
        return data.asfreq(infer_freq)

    # At this point pd.infer_freq failed probably because of missing values.
    # The following algorithm would not capture things like BQ, BQS
    # ...etc.
    if np.all(data.index.is_year_end):
        infer_freq = 'A'
    elif np.all(data.index.is_year_start):
        infer_freq = 'AS'
    elif np.all(data.index.is_quarter_end):
        infer_freq = 'Q'
    elif np.all(data.index.is_quarter_start):
        infer_freq = 'QS'
    elif np.all(data.index.is_month_end):
        if np.all(data.index.month == data.index[0].month):
            # Actually yearly with different ends
            infer_freq = 'A-{0}'.format(_ANNUALS[data.index[0].month])
        else:
            infer_freq = 'M'
    elif np.all(data.index.is_month_start):
        if np.all(data.index.month == data.index[0].month):
            # Actually yearly, anchored on the previous month's end
            infer_freq = 'A-{0}'.format(_ANNUALS[data.index[0].month - 1])
        else:
            infer_freq = 'MS'

    if infer_freq is not None:
        return data.asfreq(infer_freq)

    # Use the minimum of the intervals to test a new interval.
    # Should work for fixed intervals.
    ndiff = sorted(set(ndiff))
    mininterval = np.min(ndiff)
    if mininterval <= 0:
        raise ValueError
    if len(ndiff) == 1:
        ngcd = ndiff[0]
    else:
        ngcd = reduce(gcd, ndiff)
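    # ngcd is the greatest common divisor of all index gaps, in nanoseconds.
    # Map it onto the coarsest pandas offset alias it corresponds to:
    # N = nanoseconds, U = microseconds, L = milliseconds, S = seconds,
    # T = minutes, H = hours, D = days, W = weeks.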
    if ngcd < 1000:
        infer_freq = '{0}N'.format(ngcd)
    elif ngcd < 1000000:
        infer_freq = '{0}U'.format(ngcd // 1000)
    elif ngcd < 1000000000:
        infer_freq = '{0}L'.format(ngcd // 1000000)
    elif ngcd < 60000000000:
        infer_freq = '{0}S'.format(ngcd // 1000000000)
    elif ngcd < 3600000000000:
        infer_freq = '{0}T'.format(ngcd // 60000000000)
    elif ngcd < 86400000000000:
        infer_freq = '{0}H'.format(ngcd // 3600000000000)
    elif ngcd < 604800000000000:
        infer_freq = '{0}D'.format(ngcd // 86400000000000)
    elif ngcd < 2419200000000000:
        infer_freq = '{0}W'.format(ngcd // 604800000000000)
        if np.all(data.index.dayofweek == data.index[0].dayofweek):
            infer_freq = infer_freq + '-{0}'.format(
                _WEEKLIES[data.index[0].dayofweek])
        else:
            infer_freq = 'D'

    if infer_freq is not None:
        return data.asfreq(infer_freq)

    # Give up
    return data
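A quick, hypothetical illustration of the fallback chain (assumes asbestfreq and its gcd/reduce imports are available as in the original module): a daily index with one missing day defeats inferred_freq, but the minimum-interval step still recovers a daily frequency.

import numpy as np
import pandas as pd

idx = pd.DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-04', '2020-01-05'])
df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0]}, index=idx)

out = asbestfreq(df)
print(out.index.freq)  # gaps share a GCD of one day -> '1D'; 2020-01-03 becomes NaN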
def resample_to_model_data_index(df, date_index, frequencies, date_group,
                                 start, end,
                                 fill='mean', stat='50%', df_freq=None,
                                 index_report=True, label='left', debug=False,
                                 retain_na=False):

    pd_dt = pd.to_datetime

    if len(frequencies) != len(date_group) - 1:
        print("Frequencies list must have one less item than the date_group list")
        return

    if df_freq is not None:
        df = df.resample(df_freq).mean()
    # end if
    df = df.loc[start:end]

    # However, if the time period of the model is longer, we need to reindex the data series
    if df_freq is None:
        df_freq = pd.infer_freq(df.index)

    # Create temporary date_index 
    date_index_temp = pd.date_range(start=date_index[0], end=date_index[-1],
                                    freq=df_freq)
    df = df.reindex(date_index_temp)
    # Then we have to fill in the missing values with mean or some other descriptor
    df = _fill_in_time_series_nan(df, fill=fill, stat=stat)
    # Create empty list for placing the resampled parts of the dataframe
    df_resamples = []

    len_frequencies = len(frequencies)
    for index, frequency in enumerate(frequencies):
        #print(frequency)
        p_start, p_end = date_group[index], date_group[index + 1]
        #resample = df[df.index.isin(pd.date_range(p_start, p_end))] \
        if index < len_frequencies - 1:
            resample = df[(df.index >= pd_dt(p_start)) & (df.index < pd_dt(p_end))] \
                          .resample(frequency, label=label).mean()
        elif len_frequencies == 1:
            resample = df[(df.index >= pd_dt(p_start))] \
                          .resample(frequency, label='right').mean()
        else:
            resample = df[(df.index >= pd_dt(p_start))] \
                          .resample(frequency, label=label).mean()
            
        if debug: print(resample.index)
        if index < len_frequencies - 1:
            if label == 'left':
                df_resamples += [resample.iloc[1:]]
            elif label == 'right':
                df_resamples += [resample.iloc[:-1]]
            # end if
        elif len_frequencies == 1:
            df_resamples += [resample]
        else:
            df_resamples += [resample.iloc[1:]]
        # end if
    # end for
    df_concat = pd.concat(df_resamples)
    if index_report:
        # TODO: Report if any of the df_concat indices are not in date_index
        if len_frequencies > 1:
            target_index = date_index[:-1]
        else:
            target_index = date_index
        # end if
        if np.all(np.in1d(df_concat.index, target_index)):
            print("Successful match of date indices for model and resampled df")
        else:
            import sys
            sys.exit("*** Failed match of some date indices for model and resampled df \n {0} \n {1}".format(df_concat.index, date_index))
        # end if
    # end if
    
    # Remove the dead rows from the dataframe if there was no filling
    if fill == 'none' and not retain_na:
        df_concat = df_concat.dropna()
    # end if
    
    return df_concat
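A usage sketch under stated assumptions (illustrative names and dates; _fill_in_time_series_nan must be importable as in the original module; index_report is turned off because this toy model grid is not guaranteed to line up):

import numpy as np
import pandas as pd

# Two years of daily observations
obs = pd.DataFrame({'flow': np.random.rand(730)},
                   index=pd.date_range('2000-01-01', periods=730, freq='D'))
model_index = pd.date_range('2000-01-31', '2001-12-31', freq='M')

# Monthly means for the first model period, annual means for the last
resampled = resample_to_model_data_index(
    obs, model_index,
    frequencies=['M', 'A'],
    date_group=['2000-01-01', '2001-01-01', '2001-12-31'],
    start='2000-01-01', end='2001-12-31',
    index_report=False)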