Example #1
def test_where():
    s = Series(np.random.randn(5))
    cond = s > 0

    rs = s.where(cond).dropna()
    rs2 = s[cond]
    assert_series_equal(rs, rs2)

    rs = s.where(cond, -s)
    assert_series_equal(rs, s.abs())

    rs = s.where(cond)
    assert (s.shape == rs.shape)
    assert (rs is not s)

    # test alignment
    cond = Series([True, False, False, True, False], index=s.index)
    s2 = -(s.abs())

    expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index)
    rs = s2.where(cond[:3])
    assert_series_equal(rs, expected)

    expected = s2.abs()
    expected.iloc[0] = s2[0]
    rs = s2.where(cond[:3], -s2)
    assert_series_equal(rs, expected)
Example #3
def _safe_cast(name, series: pd.Series):
    if series.dtype == np.float64:
        assert series.abs().max() < 1e37, "Max too close to float32 max."
        return series.astype(np.float32)
    elif series.dtype == np.int64:
        if name == "detected":
            assert series.abs().max() < 128, "Max too close to int8 max."
            return series.astype(np.int8)
        else:
            assert series.abs().max() < 2e9, "Max too close to int32 max."
            return series.astype(np.int32)
    else:
        raise TypeError(f"Unexpected non-int/float column type {series.dtype}")
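
A minimal usage sketch for the caster above (the sample data and the column name "value" are made up; only the special-cased name "detected" comes from the code), assuming _safe_cast is in scope:

import numpy as np
import pandas as pd

floats = pd.Series([0.5, -1.25, 3.0])           # float64
flags = pd.Series([0, 1, -1], dtype=np.int64)   # int64

print(_safe_cast("value", floats).dtype)    # float32
print(_safe_cast("detected", flags).dtype)  # int8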
Example #4
def li_et_al_2019(kurtosis: pd.Series, normal_period: range = range(50, 150), sigma_interval: float = 2) -> int:
    """
    Li et al. in 2019 used the kurtosis as a classification indicator by computing its mean and standard deviation
    in the early period of bearing operation. The first prediction time (FPT) was then determined as the point in time
    where the kurtosis exceeds the sigma_interval * std_dev band around the mean.

    :param kurtosis: kurtosis that is used as the FPT indicator
    :param normal_period: Range of the period that is representative for normal bearing behaviour.
    :param sigma_interval: range of deviation that is allowed for the kurtosis
    :return: index of FPT
    """

    kurtosis_normal = kurtosis[normal_period]
    mean = kurtosis_normal.mean()
    std_dev = kurtosis_normal.std()

    kurtosis = kurtosis - mean
    kurtosis = kurtosis.abs()

    kurtosis = np.array(kurtosis)
    n = kurtosis.size
    threshold = sigma_interval * std_dev
    for i in range(normal_period.stop, n):  # scan after the normal period
        if kurtosis[i - 1] > threshold:
            if kurtosis[i] > threshold:
                return i
    return 0
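
A quick sketch on synthetic, made-up kurtosis values, assuming li_et_al_2019 above is in scope; the detector fires on the first two consecutive points outside the 2-sigma band:

import pandas as pd

# Flat signal with tiny alternating noise, then a sustained jump at index 200.
early = [3.0 + 0.001 * ((-1) ** i) for i in range(200)]
kurt = pd.Series(early + [5.0] * 100)

print(li_et_al_2019(kurt))  # 201: kurt[200] and kurt[201] both exceed the band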
Example #5
def make_equal(series: pd.Series, matched: float) -> pd.Series:
    """
    Equally distrubute a series considering a matched value.
    Lower values than matched value are filtered.
    :param series: A series which has one or more rows.
    :param matched: A positive float number.
    :return: A series that is equally distrubuted.
    """
    check_negative = series.sum() < 0
    if check_negative:
        sorted_ = series.abs().sort_values()
    else:
        sorted_ = series.sort_values()
    per_ = matched / series.size
    for i, v in enumerate(sorted_):
        if not v > per_:
            per_ = (matched - sorted_.iloc[:i + 1].sum()) / (series.size -
                                                             (i + 1))
        else:
            break
    sorted_.iloc[i:] = per_
    if check_negative:
        return series.mul(0).add(sorted_).mul(-1)
    else:
        return series.mul(0).add(sorted_)
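
For intuition, a worked case with made-up numbers, assuming make_equal above is in scope: values that already fit under the equal share are kept, and the remainder is split evenly over the rest.

import pandas as pd

s = pd.Series([1.0, 5.0, 10.0])
print(make_equal(s, 12.0).tolist())  # [1.0, 5.0, 6.0] -- the result sums to matched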
Example #6
    def check_sum(self):
        """
        The method checks to make sure that the probabilities add up to
        1 across choices.

        Inputs:
        None
        """
        self.num_agents = self.probabilities.shape[0]
        self.num_choices = self.probabilities.shape[-1]
        # print 'OLD CUM SUM'
        #cumsum_across_rows = self.probabilities.cumsum(-1)[:,-1]
        # print cumsum_across_rows
        # print 'NEW CUM SUM'
        cumsum_across_rows = self.probabilities.sum(1)
        #print self.probabilities.head()
        #print cumsum_across_rows.head()
        cumsum_zeros_ind = cumsum_across_rows.isnull()
        
        diff_from_unity = Series.abs(cumsum_across_rows - 1)
        if cumsum_zeros_ind.any():
            check_sum_across = diff_from_unity[~cumsum_zeros_ind] < 1e-6
            #self.error = True
            #raw_input("Cumulative sum is zero")
        else:
            check_sum_across = diff_from_unity < 1e-6
            #self.error = False
        if not check_sum_across.all():
            raise ProbabilityError(
                "probability values do not add up to one across rows")
Example #8
def test_mask():
    # compare with tested results in test_where
    s = Series(np.random.randn(5))
    cond = s > 0

    rs = s.where(~cond, np.nan)
    assert_series_equal(rs, s.mask(cond))

    rs = s.where(~cond)
    rs2 = s.mask(cond)
    assert_series_equal(rs, rs2)

    rs = s.where(~cond, -s)
    rs2 = s.mask(cond, -s)
    assert_series_equal(rs, rs2)

    cond = Series([True, False, False, True, False], index=s.index)
    s2 = -(s.abs())
    rs = s2.where(~cond[:3])
    rs2 = s2.mask(cond[:3])
    assert_series_equal(rs, rs2)

    rs = s2.where(~cond[:3], -s2)
    rs2 = s2.mask(cond[:3], -s2)
    assert_series_equal(rs, rs2)

    pytest.raises(ValueError, s.mask, 1)
    pytest.raises(ValueError, s.mask, cond[:3].values, -s)

    # dtype changes
    s = Series([1, 2, 3, 4])
    result = s.mask(s > 2, np.nan)
    expected = Series([1, 2, np.nan, np.nan])
    assert_series_equal(result, expected)
Example #9
def global_filter(X: pd.Series,
                  no_change_window: int = 3,
                  min_value: float = None,
                  max_value: float = None,
                  allow_zero: bool = False,
                  allow_negative: bool = False,
                  copy=True) -> pd.Series:

    if not isinstance(X, pd.Series):
        raise ValueError('Input data is expected of pd.Series type')

    if copy:
        X = X.copy()

    time_step = X.index.to_series().diff().min()
    steps_per_hour = math.ceil(pd.Timedelta('1H') / time_step)
    start = int(no_change_window * steps_per_hour)

    changes = X.diff().abs()
    X[start:] = X[start:].mask(
        changes.rolling(f'{no_change_window}H').sum() < 1e-3, np.nan)

    if min_value is not None:
        X.loc[X < min_value] = np.nan
    if max_value is not None:
        X.loc[X > max_value] = np.nan
    if not allow_zero:
        X.loc[X <= np.finfo(np.float32).eps] = np.nan
    if not allow_negative:
        X.loc[X < 0] = np.nan

    median = X.median()
    X.loc[X.abs() > 10 * median] = np.nan
    return X
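
A minimal sketch of the filter on made-up hourly data, with global_filter above in scope; the flat stretch as well as the zero and negative readings come back as NaN:

import numpy as np
import pandas as pd

idx = pd.date_range("2021-01-01", periods=12, freq="H")
x = pd.Series([1.0, 1.1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.4, -1.0, 0.0],
              index=idx)
print(global_filter(x, no_change_window=3))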
Example #11
def mean_abs_scaling(series: pd.Series, minimum_scale=1e-6):
    """
    Scales a Series by the mean of its absolute value.

    Returns the scaled Series and the scale itself.
    """
    scale = max(minimum_scale, series.abs().mean())
    return series / scale, scale
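
For example (made-up values), the returned scale is the mean absolute value, so dividing by it normalizes the series and the scale can be kept to invert the transform; mean_abs_scaling above is assumed in scope:

import pandas as pd

s = pd.Series([-2.0, 0.0, 4.0])
scaled, scale = mean_abs_scaling(s)
print(scale)            # 2.0, the mean of |-2|, |0|, |4|
print(scaled.tolist())  # [-1.0, 0.0, 2.0]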
Example #12
def bla(inst: Instrument, position: pd.Series):
    ''' Number of trade blocks per year / ( 2 * average absolute number of blocks held ) '''

    no_trade_per_year = position.ffill().diff().abs().rolling(window=system.n_bday_in_3m).sum() * 4
    avg_abs_no_trade = position.abs().rolling(window=system.n_bday_in_3m).sum() * 4
    # no_trade_per_year = positions.ffill().diff().abs()
    # avg_abs_no_trade = positions.abs()
    t = no_trade_per_year / avg_abs_no_trade
    # print(t.sum())
    return t
Example #13
def signal_scalar(signal: pd.Series, target_abs_forecast=system.target_abs_forecast):
    """
    
    :param signal: the input signal, scaled so that median(abs(signal)) = target_abs_forecast
    :param target_abs_forecast: scalar 
    :return: pd.Series
    """
    # time series average
    scaling_factor = target_abs_forecast / signal.abs().ewm(span=system.n_bday_in_year).mean()
    signal = scaling_factor * signal
    return signal
Example #14
class FramePlotting(object):
    params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter',
               'hexbin']]
    param_names = ['kind']

    def setup(self, kind):
        if kind in ['bar', 'barh', 'pie']:
            n = 100
        elif kind in ['kde', 'scatter', 'hexbin']:
            n = 10000
        else:
            n = 1000000

        self.x = Series(np.random.randn(n))
        self.y = Series(np.random.randn(n))
        if kind in ['area', 'pie']:
            self.x = self.x.abs()
            self.y = self.y.abs()
        self.df = DataFrame({'x': self.x, 'y': self.y})

    def time_frame_plot(self, kind):
        self.df.plot(x='x', y='y', kind=kind)
Example #15
    def _conflict_by_entity(simulation, of_var_holder, ipp_var, pb_calcul, ipp_output = ipp_output):
        of_var_series = Series(of_var_holder.array)
        entity = of_var_holder.entity
        if entity.is_persons_entity:
            quimen_series = Series(simulation.get_holder('quimen').array)
            of_var_series = of_var_series[quimen_series.isin([0, 1])].reset_index(drop = True)
            ipp_var_series = ipp_output[ipp_var]
            # print ipp_var
            # print ipp_var_series
            # print of_var_series
            # print "\n"
        else:
            quient_series = Series(simulation.get_holder('qui' + entity.symbol).array)
            quient_0 = quient_series[quient_series == 0]
            quient_1 = quient_series[quient_series == 1]
            long = list(range(len(quient_0)))
            if len(quient_1) > 0:
                long = [2 * x for x in long]
            ipp_var_series = ipp_output.loc[long, ipp_var].reset_index(drop = True)

        conflict = ((ipp_var_series.abs() - of_var_series.abs()).abs() > threshold)
        idmen = simulation.get_holder('idmen').array
        conflict_selection = DataFrame({'idmen': idmen, 'idfoy': simulation.get_holder('idfoy').array})
        conflict_men = conflict_selection.loc[conflict[conflict == True].index, 'idmen'].drop_duplicates().values  # noqa
        conflict_foy = conflict_selection.loc[conflict[conflict == True].index, 'idfoy'].drop_duplicates().values  # noqa
        if (len(ipp_var_series[conflict]) != 0):
            if verbose:
                print u"Le calcul de {} pose problème : ".format(of_var)
                print DataFrame({
                    "IPP": ipp_var_series[conflict],
                    "OF": of_var_series[conflict],
                    "diff.": ipp_var_series[conflict].abs() - of_var_series[conflict].abs(),
                    }).to_string()
                relevant_variables = _relevant_input_variables(simulation)
                print(relevant_variables)
                input = {}
                for entity in ['ind', 'men', 'foy']:
                    dic = {}
                    for variable in relevant_variables[entity]:
                        dic[variable] = simulation.get_holder(variable).array
                    input[entity] = DataFrame(dic)
                print "Variables individuelles associées à ce ménage:"
                print input['ind'].loc[input['ind']['idmen'].isin(conflict_men)].to_string()
                # .loc[conflict[conflict == True].index].to_string()
                if not input['men'].empty:
                    print "Variables associées au ménage:"
                    print input['men'].loc[conflict_men].to_string()
                if not input['foy'].empty:
                    print "Variables associées au foyer fiscal:"
                    print input['foy'].loc[conflict_foy].to_string()
            pb_calcul += [of_var]
Example #16
def log_minmax(column: pd.Series):
    """
    Similar to minmax, but transforms values logarithmically before executing minmax.
    :param pd.Series column: Input series of numbers.
    :return pd.Series: Series with numbers in range from 0 to 1 according to their weight.
    """
    columns = {}
    column = column.fillna(0)
    column = column.abs()
    column = np.log10(column)
    columns[column.name] = [min(column[column != -np.inf]), max(column)]
    column = column - columns[column.name][0]
    column = column.replace(-np.inf, 0)
    return column / (columns[column.name][1] - columns[column.name][0])
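
A small worked case (made-up scores), with log_minmax above in scope; zeros turn into -inf under log10 and are pinned back to 0 by the replace:

import pandas as pd

col = pd.Series([1.0, 10.0, 100.0, 0.0], name="score")
print(log_minmax(col).tolist())  # [0.0, 0.5, 1.0, 0.0]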
Example #17
class FramePlotting:
    params = [
        ["line", "bar", "area", "barh", "hist", "kde", "pie", "scatter", "hexbin"]
    ]
    param_names = ["kind"]

    def setup(self, kind):
        if kind in ["bar", "barh", "pie"]:
            n = 100
        elif kind in ["kde", "scatter", "hexbin"]:
            n = 10000
        else:
            n = 1000000

        self.x = Series(np.random.randn(n))
        self.y = Series(np.random.randn(n))
        if kind in ["area", "pie"]:
            self.x = self.x.abs()
            self.y = self.y.abs()
        self.df = DataFrame({"x": self.x, "y": self.y})

    def time_frame_plot(self, kind):
        self.df.plot(x="x", y="y", kind=kind)
Example #18
 def _turn_analog_vec_into_per_frame(self, vec: pd.Series) -> pd.Series:
     """
     Squeezes the input running data from a per-row basis to a
     per-frame basis.
     """
     window_size = max(len(vec) // self.num_of_frames, 1)
     mean_data = vec.abs().rolling(window_size).mean()
     sample_at = np.linspace(window_size - 1,
                             len(vec) - 1,
                             num=self.num_of_frames,
                             dtype=np.uint32)
     data_per_frame = mean_data[sample_at]
     assert len(data_per_frame) == self.num_of_frames
     return data_per_frame
Example #19
def plot_1d_corr_heatmap(corr: pd.Series,
                         annot=True,
                         fmt='.2f',
                         cmap='coolwarm'):
    max_corr = corr.abs().max()
    heatmap_df = pd.DataFrame(corr.sort_values(ascending=False))
    plt.subplots(figsize=(1.5, len(corr) // 3.5))

    sns.heatmap(heatmap_df,
                annot=annot,
                fmt=fmt,
                cmap=cmap,
                center=0,
                vmin=-max_corr,
                vmax=max_corr)
Example #20
def infer_vmin_vmax(data: pd.Series, continuous_type="infer"):
    vmin = None
    vmax = None
    # Infer continuous type
    if continuous_type in ["infer", None]:
        continuous_type = infer_continuous_type(data)
    # +/-
    if continuous_type == "diverging":
        vmax = data.abs().max()
        vmin = -vmax
    # Other
    if continuous_type == "sequential":
        vmax = data.max()
        vmin = data.min()
    assert (vmin is not None) and (vmax is not None), "`vmin` and `vmax` should not be None at this point. Please check `infer_continuous_type`"
    return vmin, vmax
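
Passing continuous_type explicitly sidesteps the infer_continuous_type helper, which is not shown here; made-up values, with infer_vmin_vmax above in scope:

import pandas as pd

print(infer_vmin_vmax(pd.Series([-3.0, 1.0, 2.0]), continuous_type="diverging"))  # (-3.0, 3.0)
print(infer_vmin_vmax(pd.Series([1.0, 2.0, 5.0]), continuous_type="sequential"))  # (1.0, 5.0)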
Example #21
def robust_zscore(x: pd.Series, zscore=False):
    """Robust ZScore Normalization

    Use robust statistics for Z-Score normalization:
        mean(x) = median(x)
        std(x) = MAD(x) * 1.4826

    Reference:
        https://en.wikipedia.org/wiki/Median_absolute_deviation.
    """
    x = x - x.median()
    mad = x.abs().median()
    x = np.clip(x / mad / 1.4826, -3, 3)
    if zscore:
        x -= x.mean()
        x /= x.std()
    return x
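
A quick check on made-up data, with robust_zscore above in scope; the outlier is clipped at 3 instead of inflating the scale:

import pandas as pd

x = pd.Series([1.0, 2.0, 3.0, 4.0, 100.0])
print(robust_zscore(x).round(3).tolist())  # [-1.349, -0.674, 0.0, 0.674, 3.0]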
Example #22
class SeriesPlotting:
    params = [["line", "bar", "area", "barh", "hist", "kde", "pie"]]
    param_names = ["kind"]

    def setup(self, kind):
        if kind in ["bar", "barh", "pie"]:
            n = 100
        elif kind in ["kde"]:
            n = 10000
        else:
            n = 1000000

        self.s = Series(np.random.randn(n))
        if kind in ["area", "pie"]:
            self.s = self.s.abs()

    def time_series_plot(self, kind):
        self.s.plot(kind=kind)
Example #23
class SeriesPlotting(object):
    params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']]
    param_names = ['kind']

    def setup(self, kind):
        if kind in ['bar', 'barh', 'pie']:
            n = 100
        elif kind in ['kde']:
            n = 10000
        else:
            n = 1000000

        self.s = Series(np.random.randn(n))
        if kind in ['area', 'pie']:
            self.s = self.s.abs()

    def time_series_plot(self, kind):
        self.s.plot(kind=kind)
Example #24
def test_mask():
    # compare with tested results in test_where
    s = Series(np.random.randn(5))
    cond = s > 0

    rs = s.where(~cond, np.nan)
    assert_series_equal(rs, s.mask(cond))

    rs = s.where(~cond)
    rs2 = s.mask(cond)
    assert_series_equal(rs, rs2)

    rs = s.where(~cond, -s)
    rs2 = s.mask(cond, -s)
    assert_series_equal(rs, rs2)

    cond = Series([True, False, False, True, False], index=s.index)
    s2 = -(s.abs())
    rs = s2.where(~cond[:3])
    rs2 = s2.mask(cond[:3])
    assert_series_equal(rs, rs2)

    rs = s2.where(~cond[:3], -s2)
    rs2 = s2.mask(cond[:3], -s2)
    assert_series_equal(rs, rs2)

    msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=msg):
        s.mask(1)
    with pytest.raises(ValueError, match=msg):
        s.mask(cond[:3].values, -s)

    # dtype changes
    s = Series([1, 2, 3, 4])
    result = s.mask(s > 2, np.nan)
    expected = Series([1, 2, np.nan, np.nan])
    assert_series_equal(result, expected)

    # see gh-21891
    s = Series([1, 2])
    res = s.mask([True, False])

    exp = Series([np.nan, 2])
    assert_series_equal(res, exp)
Example #25
def test_mask():
    # compare with tested results in test_where
    s = Series(np.random.randn(5))
    cond = s > 0

    rs = s.where(~cond, np.nan)
    tm.assert_series_equal(rs, s.mask(cond))

    rs = s.where(~cond)
    rs2 = s.mask(cond)
    tm.assert_series_equal(rs, rs2)

    rs = s.where(~cond, -s)
    rs2 = s.mask(cond, -s)
    tm.assert_series_equal(rs, rs2)

    cond = Series([True, False, False, True, False], index=s.index)
    s2 = -(s.abs())
    rs = s2.where(~cond[:3])
    rs2 = s2.mask(cond[:3])
    tm.assert_series_equal(rs, rs2)

    rs = s2.where(~cond[:3], -s2)
    rs2 = s2.mask(cond[:3], -s2)
    tm.assert_series_equal(rs, rs2)

    msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=msg):
        s.mask(1)
    with pytest.raises(ValueError, match=msg):
        s.mask(cond[:3].values, -s)

    # dtype changes
    s = Series([1, 2, 3, 4])
    result = s.mask(s > 2, np.nan)
    expected = Series([1, 2, np.nan, np.nan])
    tm.assert_series_equal(result, expected)

    # see gh-21891
    s = Series([1, 2])
    res = s.mask([True, False])

    exp = Series([np.nan, 2])
    tm.assert_series_equal(res, exp)
Example #26
def scale_series(series: pd.Series,
                 kwargs: Dict[str, Any]) -> Tuple[pd.Series, int, str]:
    """Scale a series, if 'scale_y'=True in kwargs. Also adjust ylabel text
       if that is present in kwargs. Factor is as in 10 ** factor. 
       Returns a tuple: adjusted series, factor, factor-text."""

    # do we need to act
    scale_y = get_selected_item(kwargs, 'scale_y', default=False)
    scale_x = get_selected_item(kwargs, 'scale_x', default=False)
    if not scale_x and not scale_y:
        return series, 0, ''

    label = 'xlabel' if scale_x else 'ylabel'
    max = series.abs().max()
    factor = 0 if max < 1000 else np.floor(np.log10(max) / 3.0) * 3
    if factor > 0:
        if label not in kwargs:
            kwargs[label] = f'{SCALES[factor].title()}'
        else:
            kwargs[label] = f'{kwargs[label]} ({SCALES[factor]})'
    return (series / (10**factor), factor, SCALES[factor])
Example #27
    def cat_bar_plot(self, data: pd.Series, **kwargs) -> plt.Figure:
        data = data.abs()
        fig, ax = plt.subplots(**kwargs)

        ax.grid(True)
        ax.xaxis.set_major_formatter(dates.DateFormatter('%B'))

        total_w = 6
        if isinstance(data, pd.Series):
            ax.set_ylim(0, ceil(data.max() / 100) * 100)
            ax.bar(data.index, data.values, width=total_w)
        elif isinstance(data, pd.DataFrame):
            ax.set_ylim(0, ceil(data.max().max() / 100) * 100)
            for i, c in enumerate(data):
                w = total_w / data.shape[1]
                # offset = -(total_w / (data.shape[1] * 2)) + (i * w)
                offset = 0
                ax.bar(data.index + timedelta(days=int(offset)),
                       data[c].values,
                       width=w)
        return fig
Example #28
def GSEA2005(geneset_membership: pd.Series, correlations: pd.Series):
    '''
    Implementation of the algorithm described here:
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1239896/

    :param geneset_membership: (pd.Series) True if in set, False if not, index: all genes
    :param correlations: (pd.Series) Correlation of a given gene
    :return: (Tuple[np.array, np.array]) x and y arrays ready to be plotted. ES = y.max()
    '''
    r_j = correlations.abs().sort_values(
        ascending=False)  # r_j: correlation of gene_j in ranked order
    S = geneset_membership[
        correlations.index]  # S: geneset mask aligned with r_j
    N = S.count()  # N: number of genes
    N_H = S.sum()  # N_H: number of hits
    N_R = r_j[S].sum()  # N_R: sum of r_j for g_j \in S
    P_hit = S * r_j / N_R  # P_hit: fraction of hits weighted by r_j
    P_miss = (~S) * 1 / (N - N_H)  # P_miss: fraction of misses up to position i
    # 0 added to beginning for plotting, doesn't affect sum
    x = np.arange(N + 1)
    y = np.concatenate([[0], np.cumsum(P_hit - P_miss)])
    return x, y
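
A toy run with made-up genes whose correlations are already rank-ordered, GSEA2005 above in scope; the enrichment score is the peak of the running sum:

import pandas as pd

genes = [f"g{i}" for i in range(6)]
in_set = pd.Series([True, False, True, False, False, False], index=genes)
corr = pd.Series([0.9, 0.8, 0.7, 0.3, 0.2, 0.1], index=genes)
x, y = GSEA2005(in_set, corr)
print(y.max())  # 0.75 for this toy ranking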
Example #29
def ccf(x, y, lags=None, bin_method='gaussian', bin_width=None,
        max_gap=np.inf, min_obs=10, output="ccf"):
    """Method to calculate the cross-correlation function for irregular
    timesteps based on the slotting technique. Different methods (kernels)
    to bin the data are available.

    Parameters
    ----------
    x, y: pandas.Series
        Pandas Series containing the values to calculate the
        cross-correlation for. The index has to be a Pandas.DatetimeIndex
    lags: numpy.array, optional
        numpy array containing the lags in days for which the
        cross-correlation is calculated. [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        12, 13, 14, 30, 61, 90, 120, 150, 180, 210, 240, 270, 300, 330, 365]
    bin_method: str, optional
        method to determine the type of bin. Options are "gaussian" (default),
        sinc and rectangle.
    bin_width: float, optional
        number of days used as the width for the bin to calculate the
        correlation. By default these values are chosen based on the
        bin_method.
    max_gap: float, optional
        Maximum timestep gap in the data. All timesteps above this gap value
        are not used for calculating the average timestep. This can be
        helpful when there is a large gap in the data that influences the
        average timestep.

    Returns
    -------
    CCF: pandas.Series
        The Cross-correlation function.

    References
    ----------
    Rehfeld, K., Marwan, N., Heitzig, J., Kurths, J. (2011). Comparison
    of correlation analysis techniques for irregularly sampled time series.
    Nonlinear Processes in Geophysics, 18, 389-404. doi:10.5194/npg-18-389-2011.

    Examples
    --------
    ccf = ps.stats.ccf(x, y, bin_method="gaussian")

    """
    # prepare the time indices for x and y
    dt_x = x.index.to_series().diff().values / Timedelta(1, "D")
    dt_x[0] = 0.0
    dt_x_mu = dt_x[dt_x < max_gap].mean()  # Deal with big gaps if present
    t_x = np.cumsum(dt_x)

    dt_y = y.index.to_series().diff().values / Timedelta(1, "D")
    dt_y[0] = 0.0
    dt_y_mu = dt_y[dt_y < max_gap].mean()
    t_y = np.cumsum(dt_y)

    dt_mu = max(dt_x_mu, dt_y_mu)

    # Create matrix with time differences
    t1, t2 = np.meshgrid(t_x, t_y)
    t = np.abs(np.subtract(t1, t2))  # absolute values

    # Normalize the values and create numpy arrays
    x = (x.values - x.values.mean()) / x.values.std()
    y = (y.values - y.values.mean()) / y.values.std()

    # Create matrix for covariances
    xy = np.outer(y, x)

    if lags is None:  # Default lags in Days, log-scale between 0 and 365.
        lags = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 30, 61, 90, 120,
                150, 180, 210, 240, 270, 300, 330, 365]

    # Remove lags that cannot be determined because lag < dt_min
    u, i = np.unique(dt_x, return_counts=True)
    dt_x_min = u[Series(i, u).cumsum() >= min_obs][0]
    u, i = np.unique(dt_y, return_counts=True)
    dt_y_min = u[Series(i, u).cumsum() >= min_obs][0]

    dt_min = min(dt_x_min, dt_y_min)
    # dt_min = min(dt_x[1:].min(), dt_y[1:].min())

    lags = np.array([float(lag) for lag in lags if lag >= dt_min or lag == 0])

    # Delete to free memory
    del (x, y, dt_x, dt_y, t1, t2, t_x, t_y)

    # Select appropriate bin_width, default depend on bin_method
    if bin_width is None:
        options = {"rectangle": 0.5, "sinc": 1, "gaussian": 0.25}
        bin_width = np.ones_like(lags) * options[bin_method] * dt_mu
    elif isinstance(bin_width, float):
        bin_width = np.ones_like(lags) * bin_width
    else:
        bin_width = [0.5, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5,
                     2, 2, 2, 2, 2, 2, 2, 2]

    # Select the binning method to calculate the cross-correlation
    if bin_method == "rectangle":
        a = np.zeros_like(t, dtype=float)
        kernel_func = lambda d, h: np.less_equal(np.abs(d, out=a), h,
                                                 out=a).astype(int)
    elif bin_method == "gaussian":
        a = np.zeros_like(t, dtype=float)

        def kernel_func(d, h):
            den1 = -2 * h ** 2  # denominator 1
            den2 = np.sqrt(2 * np.pi * h)  # denominator 2
            return np.exp(np.square(d, out=a) / den1, out=a) / den2
    elif bin_method == "sinc":
        kernel_func = lambda d, h: np.sin(np.pi * h * d) / (np.pi * h * d)
    else:
        raise NotImplementedError("bin_method %s is not implemented." %
                                  bin_method)

    # Pre-allocate an array to speed up all numpy methods
    UDCF = np.zeros_like(lags, dtype=float)
    M = np.zeros_like(lags, dtype=float)
    d = np.zeros_like(t, dtype=float)

    for i, k in enumerate(lags):
        # Construct the kernel for the lag
        np.subtract(t, k, out=d)
        h = bin_width[i]
        b = kernel_func(d, h)
        c = np.multiply(xy, b, out=d)  # Element-wise multiplication
        UDCF[i] = np.sum(c)
        M[i] = np.sum(b)

    DCF = UDCF / M

    C = Series(data=DCF, index=lags, name="CCF")
    CCF = C / C.abs().max()

    if output == "full":
        CCFstd = np.sqrt((np.cumsum(UDCF) - M * DCF) ** 2) / (M - 1)
        CCF = DataFrame(data={"CCF": CCF.values, "stderr": CCFstd}, index=lags)

    CCF.index.name = "Lags (Days)"
    return CCF
Example #30
 def round(series: pd.Series, decimals: int = 0) -> pd.Series:
     if not decimals:
         return (-(np.sign(series)) *
                 np.ceil(-(series.abs()) - 0.5)).astype(np.int64)
     return series.round(decimals=decimals)
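
The expression above rounds halves away from zero, unlike the banker's rounding of Series.round(). A standalone restatement for comparison (the free-function name round_half_away is made up to avoid shadowing the builtin):

import numpy as np
import pandas as pd

def round_half_away(series: pd.Series) -> pd.Series:
    # Same expression as in the method above.
    return (-(np.sign(series)) * np.ceil(-(series.abs()) - 0.5)).astype(np.int64)

s = pd.Series([0.5, 1.5, 2.5, -0.5, -2.5])
print(round_half_away(s).tolist())          # [1, 2, 3, -1, -3]
print(s.round().astype(np.int64).tolist())  # [0, 2, 2, 0, -2] -- half-to-even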
Example #31
        iteration = 0
        for exp in experiments:
            train = [c for c in conditions if not c.startswith(exp)]
            test = [c for c in conditions if c.startswith(exp)]

            yss, xss = ys.loc[m, train], xs[train].T

            lm = ElasticNet(alpha=0.01).fit(xss, yss)

            pred, meas = lm.predict(xs[test].T), ys.loc[m, test]

            coefs = Series(lm.coef_, index=xss.columns)

            cor, pval = pearsonr(list(pred), list(meas))

            m_coef['%d' % iteration] = coefs.abs().to_dict()

            iteration += 1

        lm_res[m] = DataFrame(m_coef).mean(1).to_dict()
    print('[INFO] Associations performed')

    # Conditions betas
    info_table = DataFrame([(f, m, lm_res[m][f]) for m in lm_res
                            for f in lm_res[m]],
                           columns=['feature', 'metabolite', 'coef'])

    # #
    # info_table = info_table[[i in db_proteins for i in info_table['feature']]]
    # info_table = info_table[[i in db_ions for i in info_table['metabolite']]]
Example #32
def _zero_nans(out: pd.Series, a: pd.Series) -> pd.Series:
    a_almost_zero = a.abs() < 1e-16  # type: ignore
    nans = pd.isnull(out)  # nan when 0 / 0,  a > 0 /0 => inf
    out[a_almost_zero & nans] = 0.0  # type: ignore
    return out
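
A worked case with made-up numerators and denominators, _zero_nans above in scope; only the 0/0 NaN is zeroed, while a finite/0 inf is left alone:

import pandas as pd

a = pd.Series([0.0, 2.0, 4.0])
b = pd.Series([0.0, 0.0, 2.0])
out = a / b                         # [nan, inf, 2.0]
print(_zero_nans(out, a).tolist())  # [0.0, inf, 2.0]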
Example #33
    def features(x: pd.Series) -> pd.DataFrame:
        df = pd.DataFrame(dtype=np.float64)

        df.loc[1, 'ave'] = x.values.mean()
        df.loc[1, 'std'] = x.values.std()
        df.loc[1, 'max'] = x.values.max()
        df.loc[1, 'min'] = x.values.min()

        df.loc[1, 'q90'] = np.quantile(x.values, 0.90)
        df.loc[1, 'q95'] = np.quantile(x.values, 0.95)
        df.loc[1, 'q99'] = np.quantile(x.values, 0.99)
        df.loc[1, 'q05'] = np.quantile(x.values, 0.05)
        df.loc[1, 'q10'] = np.quantile(x.values, 0.10)
        df.loc[1, 'q01'] = np.quantile(x.values, 0.01)

        df.loc[1, 'abs_max'] = np.abs(x.values).max()
        df.loc[1, 'abs_mean'] = np.abs(x.values).mean()
        df.loc[1, 'abs_std'] = np.abs(x.values).std()
        df.loc[1, 'trend'] = Base._add_trend_feature(x.values)
        df.loc[1, 'abs_trend'] = Base._add_trend_feature(x.values,
                                                         abs_values=True)

        # New features - rolling features
        for w in [10, 50, 100, 1000]:
            x_roll_std = x.rolling(w).std().dropna().values
            x_roll_mean = x.rolling(w).mean().dropna().values
            x_roll_abs_mean = x.abs().rolling(w).mean().dropna().values

            df.loc[1, 'ave_roll_std_' + str(w)] = x_roll_std.mean()
            df.loc[1, 'std_roll_std_' + str(w)] = x_roll_std.std()
            df.loc[1, 'max_roll_std_' + str(w)] = x_roll_std.max()
            df.loc[1, 'min_roll_std_' + str(w)] = x_roll_std.min()

            df.loc[1, 'q01_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.01)
            df.loc[1, 'q05_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.05)
            df.loc[1, 'q10_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.10)
            df.loc[1, 'q95_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.95)
            df.loc[1, 'q99_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.99)

            df.loc[1, 'ave_roll_mean_' + str(w)] = x_roll_mean.mean()
            df.loc[1, 'std_roll_mean_' + str(w)] = x_roll_mean.std()
            df.loc[1, 'max_roll_mean_' + str(w)] = x_roll_mean.max()
            df.loc[1, 'min_roll_mean_' + str(w)] = x_roll_mean.min()
            df.loc[1,
                   'q01_roll_mean_' + str(w)] = np.quantile(x_roll_mean, 0.01)
            df.loc[1,
                   'q05_roll_mean_' + str(w)] = np.quantile(x_roll_mean, 0.05)
            df.loc[1,
                   'q95_roll_mean_' + str(w)] = np.quantile(x_roll_mean, 0.95)
            df.loc[1,
                   'q99_roll_mean_' + str(w)] = np.quantile(x_roll_mean, 0.99)
            df.loc[1, 'ave_roll_abs_mean_' + str(w)] = x_roll_abs_mean.mean()
            df.loc[1, 'std_roll_abs_mean_' + str(w)] = x_roll_abs_mean.std()
            df.loc[1, 'max_roll_abs_mean_' + str(w)] = x_roll_abs_mean.max()
            df.loc[1, 'min_roll_abs_mean_' + str(w)] = x_roll_abs_mean.min()
            df.loc[1, 'q01_roll_abs_mean_' + str(w)] = np.quantile(
                x_roll_abs_mean, 0.01)
            df.loc[1, 'q05_roll_abs_mean_' + str(w)] = np.quantile(
                x_roll_abs_mean, 0.05)
            df.loc[1, 'q95_roll_abs_mean_' + str(w)] = np.quantile(
                x_roll_abs_mean, 0.95)
            df.loc[1, 'q99_roll_abs_mean_' + str(w)] = np.quantile(
                x_roll_abs_mean, 0.99)

        return df
Example #34
class MySeries:
    def __init__(self, *args, **kwargs):
        self.x = Series(*args, **kwargs)
        self.values = self.x.values
        self.index = self.x.index
    
    def rolling_mean(self, *args, **kwargs):
        return MySeries(pd.rolling_mean(self.x, *args, **kwargs))

    def rolling_count(self, *args, **kwargs):
        return MySeries(pd.rolling_count(self.x, *args, **kwargs))

    def rolling_sum(self, *args, **kwargs):
        return MySeries(pd.rolling_sum(self.x, *args, **kwargs))

    def rolling_median(self, *args, **kwargs):
        return MySeries(pd.rolling_median(self.x, *args, **kwargs))
        
    def rolling_min(self, *args, **kwargs):
        return MySeries(pd.rolling_min(self.x, *args, **kwargs))

    def rolling_max(self, *args, **kwargs):
        return MySeries(pd.rolling_max(self.x, *args, **kwargs))

    def rolling_std(self, *args, **kwargs):
        return MySeries(pd.rolling_std(self.x, *args, **kwargs))

    def rolling_var(self, *args, **kwargs):
        return MySeries(pd.rolling_var(self.x, *args, **kwargs))

    def rolling_skew(self, *args, **kwargs):
        return MySeries(pd.rolling_skew(self.x, *args, **kwargs))

    def rolling_kurtosis(self, *args, **kwargs):
        return MySeries(pd.rolling_kurtosis(self.x, *args, **kwargs))

    def rolling_window(self, *args, **kwargs):
        return MySeries(pd.rolling_window(self.x, *args, **kwargs))

    def cumprod(self, *args, **kwargs):
        return MySeries(self.x.cumprod(*args, **kwargs))

    def cumsum(self, *args, **kwargs):
        return MySeries(self.x.cumsum(*args, **kwargs))

    def diff(self, *args, **kwargs):
        return MySeries(self.x.diff(*args, **kwargs))

    def div(self, *args, **kwargs):
        return MySeries(self.x.div(*args, **kwargs))

    def mul(self, *args, **kwargs):
        return MySeries(self.x.mul(*args, **kwargs))

    def add(self, *args, **kwargs):
        return MySeries(self.x.add(*args, **kwargs))

    def dropna(self, *args, **kwargs):
        return MySeries(self.x.dropna(*args, **kwargs))
    
    def fillna(self, *args, **kwargs):
        return MySeries(self.x.fillna(*args, **kwargs))

    def floordiv(self, *args, **kwargs):
        return MySeries(self.x.floordiv(*args, **kwargs))

    def mod(self, *args, **kwargs):
        return MySeries(self.x.mod(*args, **kwargs))

    def nlargest(self, *args, **kwargs):
        return MySeries(self.x.nlargest(*args, **kwargs))

    def nonzero(self, *args, **kwargs):
        return MySeries(self.x.nonzero(*args, **kwargs))

    def nsmallest(self, *args, **kwargs):
        return MySeries(self.x.nsmallest(*args, **kwargs))

    def pow(self, *args, **kwargs):
        return MySeries(self.x.pow(*args, **kwargs))

    def rank(self, *args, **kwargs):
        return MySeries(self.x.rank(*args, **kwargs))

    def round(self, *args, **kwargs):
        return MySeries(self.x.round(*args, **kwargs))

    def shift(self, *args, **kwargs):
        return MySeries(self.x.shift(*args, **kwargs))

    def sub(self, *args, **kwargs):
        return MySeries(self.x.sub(*args, **kwargs))

    def abs(self, *args, **kwargs):
        return MySeries(self.x.abs(*args, **kwargs))

    def clip(self, *args, **kwargs):
        return MySeries(self.x.clip(*args, **kwargs))

    def clip_lower(self, *args, **kwargs):
        return MySeries(self.x.clip_lower(*args, **kwargs))

    def clip_upper(self, *args, **kwargs):
        return MySeries(self.x.clip_upper(*args, **kwargs))
    
    def interpolate(self, *args, **kwargs):
        return MySeries(self.x.interpolate(*args, **kwargs))

    def resample(self, *args, **kwargs):
        return MySeries(self.x.resample(*args, **kwargs))
        
    def replace(self, *args, **kwargs):
        return MySeries(self.x.replace(*args, **kwargs))
Example #35
def turnover(series: pd.Series):
    ratios = series.diff().abs() / series.abs().rolling(window=system.n_bday_in_3m).mean() * system.n_bday_in_year
    return ratios
Example #36
# array([0, 5, 6, 0, 0])
trunc(fser100)
# array([0, 1, 2, 3, 4, 5])
trunc(fser200)
# array([90,  0, 67, 12,  0, 79], dtype=int64)
trunc(fser300)
# array([50, 24, 56, 89, 33], dtype=int64)
# now numpy array, the index is lost

# Statistics:
fser300.mean()
fser300.std()
fser300.max()
fser300.idxmax()
fser300.cumsum()
fser200.abs()



# Using Python functions:
fser300.apply(lambda x: x if x > 40 else 40)

fser600 = Series(["Baby", "Girl", "Boy", "Woman", "Man"],
                 index=["Ba", "Gi", "Bo", "Wo", "Ma"])
myobj = {'Baby': 10, 'Girl': 22, 'Boy': 45, 'Woman': 90, 'Man': 89}
fser700 = Series(myobj)
fser600.map(myobj)
fser600.map(fser700)


fser300.map(lambda x: x if x > 40 else 40)
Example #37
    def normalize(values: _pd.Series) -> _pd.Series:
        ''' Maps a series of values to the scale from -1.0 to 1.0 and returns it as a new Series object. '''
        # Scale by the largest absolute value so the result lies in [-1.0, 1.0].
        return values / values.abs().max()
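
A quick sanity check on made-up values, with normalize above in scope:

import pandas as _pd

v = _pd.Series([-4.0, 1.0, 2.0])
print(normalize(v).tolist())  # [-1.0, 0.25, 0.5]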
Example #38
def _get_finite_bounds(numbers: Series) -> Tuple[float, float]:
    finite_numbers = numbers[numbers.abs() != float("inf")]
    return finite_numbers.min(), finite_numbers.max()
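
And a closing sketch for the bounds helper (made-up values), _get_finite_bounds above in scope; infinities of either sign are dropped before taking min and max:

import pandas as pd

nums = pd.Series([float("-inf"), -2.0, 5.0, float("inf")])
print(_get_finite_bounds(nums))  # (-2.0, 5.0)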