Example #1
    def test_sum_overflow(self, use_bottleneck):

        with pd.option_context('use_bottleneck', use_bottleneck):
            # GH#6915
            # overflowing on the smaller int dtypes
            for dtype in ['int32', 'int64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert int(result) == v.sum(dtype='int64')
                result = s.min(skipna=False)
                assert int(result) == 0
                result = s.max(skipna=False)
                assert int(result) == v[-1]

            for dtype in ['float32', 'float64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert result == v.sum(dtype=dtype)
                result = s.min(skipna=False)
                assert np.allclose(float(result), 0.0)
                result = s.max(skipna=False)
                assert np.allclose(float(result), v[-1])
Example #2
def test_nat_operations():
    # GH 8617
    s = Series([0, pd.NaT], dtype='m8[ns]')
    exp = s[0]
    assert s.median() == exp
    assert s.min() == exp
    assert s.max() == exp
Example #3
    def test_min_max_numeric_only(self):
        # TODO deprecate numeric_only argument for Categorical and use
        # skipna as well, see GH25303
        cat = Series(Categorical(
            ["a", "b", np.nan, "a"], categories=['b', 'a'], ordered=True))

        _min = cat.min()
        _max = cat.max()
        assert np.isnan(_min)
        assert _max == "a"

        _min = cat.min(numeric_only=True)
        _max = cat.max(numeric_only=True)
        assert _min == "b"
        assert _max == "a"

        _min = cat.min(numeric_only=False)
        _max = cat.max(numeric_only=False)
        assert np.isnan(_min)
        assert _max == "a"
Example #4
    def test_overflow(self):
        # GH 9442
        s = Series(pd.date_range('20130101', periods=100000, freq='H'))
        s[0] += pd.Timedelta('1s 1ms')

        # mean
        result = (s - s.min()).mean()
        expected = pd.Timedelta(
            (pd.TimedeltaIndex(s - s.min()).asi8 / len(s)).sum())

        # the computation is converted to float so
        # might be some loss of precision
        assert np.allclose(result.value / 1000, expected.value / 1000)

        # sum
        pytest.raises(ValueError, lambda: (s - s.min()).sum())
        s1 = s[0:10000]
        pytest.raises(ValueError, lambda: (s1 - s1.min()).sum())
        s2 = s[0:1000]
        result = (s2 - s2.min()).sum()
Example #5
    def test_min_max(self):
        # unordered cats have no min/max
        cat = Series(Categorical(["a", "b", "c", "d"], ordered=False))
        with pytest.raises(TypeError):
            cat.min()
        with pytest.raises(TypeError):
            cat.max()

        cat = Series(Categorical(["a", "b", "c", "d"], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert _min == "a"
        assert _max == "d"

        cat = Series(Categorical(["a", "b", "c", "d"], categories=[
                     'd', 'c', 'b', 'a'], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert _min == "d"
        assert _max == "a"

        cat = Series(Categorical(
            [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a'
                                                    ], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert np.isnan(_min)
        assert _max == "b"

        cat = Series(Categorical(
            [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert np.isnan(_min)
        assert _max == 1
Example #6
def test_name2num():
    num_to_test = 10
    str_len = 4
    letters = string.ascii_letters
    x = Series(dict(zip(letters, map(ord, letters))))
    base = 256 ** np.arange(str_len)
    mn = base.dot(np.repeat(x.min(), str_len))
    mx = base.dot(np.repeat(x.max(), str_len))

    for _ in range(num_to_test):
        name = random.sample(letters, str_len)
        num = name2num(name)
        assert mn <= num <= mx
Example #7
    def test_min_max_skipna(self, skipna):
        # GH 25303
        cat = Series(
            Categorical(["a", "b", np.nan, "a"],
                        categories=["b", "a"],
                        ordered=True))
        _min = cat.min(skipna=skipna)
        _max = cat.max(skipna=skipna)

        if skipna is True:
            assert _min == "b"
            assert _max == "a"
        else:
            assert np.isnan(_min)
            assert np.isnan(_max)
Example #8
def test_td64_summation_overflow():
    # GH#9442
    ser = Series(pd.date_range("20130101", periods=100000, freq="H"))
    ser[0] += pd.Timedelta("1s 1ms")

    # mean
    result = (ser - ser.min()).mean()
    expected = pd.Timedelta(
        (pd.TimedeltaIndex(ser - ser.min()).asi8 / len(ser)).sum())

    # the computation is converted to float so
    # might be some loss of precision
    assert np.allclose(result.value / 1000, expected.value / 1000)

    # sum
    msg = "overflow in timedelta operation"
    with pytest.raises(ValueError, match=msg):
        (ser - ser.min()).sum()

    s1 = ser[0:10000]
    with pytest.raises(ValueError, match=msg):
        (s1 - s1.min()).sum()
    s2 = ser[0:1000]
    (s2 - s2.min()).sum()
Example #9
def get_stats(s: pd.Series):
    """ Calculate basic sample `s` statistics. """
    q1 = s.quantile(0.25)
    median = s.median()
    q3 = s.quantile(0.75)
    p90 = s.quantile(0.90)
    p95 = s.quantile(0.95)
    p99 = s.quantile(0.99)
    iqr = q3 - q1
    mean = round(s.mean(), 2)
    std = round(s.std(), 2)
    min = s.min()
    max = s.max()
    n = len(s)
    return [q1, median, q3, p90, p95, p99, iqr, mean, std, min, max, n]
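A minimal usage sketch with synthetic data; the order of the unpacked values follows the list returned above:

import numpy as np
import pandas as pd

# hypothetical sample; get_stats returns its summary statistics as a flat list
sample = pd.Series(np.random.default_rng(0).normal(loc=10, scale=2, size=1_000))
q1, median, q3, p90, p95, p99, iqr, mean, std, minimum, maximum, n = get_stats(sample)
print(f"median={median:.2f}, IQR={iqr:.2f}, n={n}")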
Example #10
def numeric_stats_pandas(series: pd.Series):
    #     summary["min"] = summary["value_counts_without_nan"].index.min()
    # vc.index.min()
    return {
        "mean": series.mean(),
        "std": series.std(),
        "variance": series.var(),
        "min": series.min(),
        "max": series.max(),
        # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
        "kurtosis": series.kurt(),
        # Unbiased skew normalized by N-1
        "skewness": series.skew(),
        "sum": series.sum(),
    }
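A hypothetical call showing the shape of the returned dictionary (keys as defined above):

import pandas as pd

series = pd.Series([1.5, 2.0, 2.5, 10.0])
stats = numeric_stats_pandas(series)
print(stats["min"], stats["max"], stats["sum"])  # 1.5 10.0 16.0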
Example #11
def compress_float(series: pd.Series) -> pd.Series:
    """
    Compressing to half-precision floating-point format can degrade computational performance,
    since CPUs often do not have native support for 16-bit floats and must emulate the data type.
    https://en.wikipedia.org/wiki/Half-precision_floating-point_format
    https://stackoverflow.com/a/49997863/470433
    https://stackoverflow.com/a/15341193/470433
    :param series:
    :return:
    """
    minv, maxv = series.min(), series.max()
    tester = type_tester(minv, maxv, np.finfo)
    test_types = [np.float16, np.float32, np.float64]

    compressed_type = get_compressed_type(test_types, tester)
    return series.astype(compressed_type)
Example #12
    def test_min_max_dt64_api_consistency_with_NaT(self):
        # Calling the following sum functions returned an error for dataframes but
        # returned NaT for series. These tests check that the API is consistent in
        # min/max calls on empty Series/DataFrames. See GH:33704 for more
        # information
        df = DataFrame(dict(x=pd.to_datetime([])))
        expected_dt_series = Series(pd.to_datetime([]))
        # check axis 0
        assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is
                                                pd.NaT)
        assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is
                                                pd.NaT)

        # check axis 1
        tm.assert_series_equal(df.min(axis=1), expected_dt_series)
        tm.assert_series_equal(df.max(axis=1), expected_dt_series)
Example #13
def _interval_index(facet: pd.Series, thresholds: Optional[List[Any]]) -> pd.IntervalIndex:
    """
    Creates an IntervalIndex from a list of threshold values. See pd.IntervalIndex.from_breaks
    Ex. [0,1,2] -> [(0, 1], (1,2]]
    :param facet: input data series
    :param thresholds: list of int or float values defining the threshold splits
    :return: pd.IntervalIndex
    """
    if not thresholds:
        raise ValueError("Threshold values must be provided for continuous features")
    facet_max, facet_min = facet.max(), facet.min()
    threshold_intervals = thresholds.copy()
    # add the max value if it is not already in the threshold limits
    if abs(facet_max) not in thresholds:
        threshold_intervals.append(facet_max)
    return pd.IntervalIndex.from_breaks(threshold_intervals)
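A hypothetical call illustrating how the break list is extended with the facet maximum before pd.IntervalIndex.from_breaks is applied:

import pandas as pd

facet = pd.Series([0.2, 1.4, 3.7, 5.0])
intervals = _interval_index(facet, thresholds=[1, 3])
# the facet max 5.0 is appended to [1, 3], giving IntervalIndex([(1.0, 3.0], (3.0, 5.0]])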
Example #14
def infer_vmin_vmax(data:pd.Series, continuous_type="infer"):
    vmin = None
    vmax = None
    # Infer continuous type
    if continuous_type in ["infer", None]:
        continuous_type = infer_continuous_type(data)
    # +/-
    if continuous_type == "diverging":
        vmax = data.abs().max()
        vmin = -vmax
    # Other
    if continuous_type == "sequential":
        vmax = data.max()
        vmin = data.min()
    assert vmin is not None and vmax is not None, "`vmin` and `vmax` should not be None at this point.  Please check `infer_continuous_type`"
    return vmin, vmax
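A short sketch; passing continuous_type explicitly avoids the infer_continuous_type dependency (see Example #20):

import pandas as pd

data = pd.Series([-3.0, -1.0, 2.0, 4.0])
vmin, vmax = infer_vmin_vmax(data, continuous_type="diverging")
# vmax is the largest absolute value (4.0) and vmin its negation (-4.0)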
Example #15
def calc_data_identifier(feature: pd.Series):
    # Depending on the dtype of the provided Series, statistical features are calculated,
    # converted to a string, and used as a data identifier.

    precision = 9

    n_rows = len(feature)

    if pd.api.types.is_numeric_dtype(feature.dtype):
        mean = round(float(feature.mean()), precision)
        max = round(float(feature.max()), precision)
        min = round(float(feature.min()), precision)
        std = round(float(feature.std()), precision)

        data_analysis = {
            'n_rows': n_rows,
            'mean': mean,
            'max': max,
            'min': min,
            'std': std,
        }

    elif pd.api.types.is_string_dtype(feature.dtype):

        value_counts = feature.value_counts().to_dict()

        data_analysis = {
            'n_rows': n_rows,
            'value_counts': value_counts,
        }

    elif is_datetime(feature.dtype):
        # convert datetime values to string to make them json exportable
        value_counts = feature.value_counts()
        value_counts.index = value_counts.index.astype(str)
        value_counts = value_counts.to_dict()

        data_analysis = {
            'n_rows': n_rows,
            'value_counts': value_counts,
        }

    else:
        raise (Exception(f"Unexpected feature dtype: {feature.dtype}"))

    data_identifier_string = json.dumps(data_analysis, sort_keys=True)

    return data_identifier_string
Example #16
def create_histogram(series: pd.Series, cwd: str, ranking_score_capping: float,
                     limit: int) -> None:
    series_max_boundary = 1
    series_min_boundary = round(series.min(), 1)
    ranking_score_capping_bool = ranking_score_capping > series_min_boundary
    num_of_messages = series.count()
    if series_min_boundary >= 0:
        bin_num = int((series_max_boundary + series_min_boundary) * 100)
    else:
        bin_num = int((series_max_boundary + abs(series_min_boundary)) * 100)
    bin_seq = []
    for x in range(bin_num - 1):
        bin_seq.append(round(series_min_boundary, 2))
        series_min_boundary += 0.01

    fig = Figure(figsize=(18, 9), facecolor='#ffffff')
    ax = fig.add_subplot()
    ax.set_facecolor('#e5e5e5')
    ax.hist(series,
            bins=bin_seq,
            color="#61ade0",
            label="num of message scores in 0.01 bins")
    ax.grid(color='#000000', linestyle="--")
    if ranking_score_capping_bool:
        ax.axvline(ranking_score_capping,
                   color="#df1b12",
                   linestyle="--",
                   linewidth=3,
                   label=f"capping value ({ranking_score_capping})")
    if limit <= num_of_messages:
        limit_score_value = series.iloc[limit]
        ax.axvline(
            limit_score_value,
            color="#fdc530",
            linestyle="--",
            linewidth=3,
            label=f"limit from configuration file ({limit})"
            f"\nscore of last message before the limit is ~ {round(limit_score_value, 3)}"
        )
    ax.legend(loc='upper right', fontsize=10, shadow=True, facecolor='#ffffff')
    ax.set_title(
        f"Unfiltered messages ({num_of_messages}), sorted by ranking score, before using capping and limit",
        fontsize=10)
    ax.set_xlabel("Ranking score", fontsize=10)
    ax.set_ylabel("Num of messages in 0.01 bins", fontsize=10)
    fig.savefig(join(cwd + sep + 'histogram.png'))
Example #17
def points_calculation_tree(var: pd.Series,
                            target: pd.Series,
                            min_size: float = 5,
                            rnd: int = 2) -> list:
    """
    Calculate points for binning numeric variable dependent on target variable.

    Keyword arguments:
        var (pd.Series) -- Numeric variable
        target (pd.Series) -- Target binary variable
        min_size (float) -- minimum size of group in percent
        rnd -- Round level for variable values (default 2)
    """
    size = var.shape[0]
    var_name = var.name
    if (var.isnull().sum() / size) > (1 - min_size / 100):
        print('WARNING! Variable "{vname}" has too many null values!'.format(
            vname=var_name))
        return [-np.inf, np.inf]
    elif (var.value_counts(dropna=True).max() / size) > (1 - min_size / 100):
        print('WARNING! Variable "{vname}" has a single value occurring too often!'.format(
            vname=var_name))
        return [-np.inf, np.inf]
    else:
        indx = var.notnull()
        var = var[indx]
        target = target[indx]
        xmin = var.min()
        xmax = var.max()
        min_samples = round(min_size * size / 100)
        clf = tree.DecisionTreeClassifier(min_samples_leaf=min_samples,
                                          random_state=777)
        clf.fit(var.to_frame(), target)
        cut_points = pd.Series(
            clf.tree_.threshold).value_counts().to_frame('cnt').reset_index()
        cut_points.columns = ['point', 'cnt']
        cut_points = cut_points.loc[cut_points['cnt'] == 1, :]
        cut_points = cut_points.loc[(cut_points['point'] <= xmax) &
                                    (cut_points['point'] >= xmin), :]
        cut_points = cut_points['point'].sort_values(ascending=True).values
        cut_points = [
            -np.inf,
        ] + list(cut_points.round(rnd)) + [
            np.inf,
        ]
        return cut_points
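A usage sketch with synthetic data, assuming the module imports `from sklearn import tree` as the function requires:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
var = pd.Series(rng.normal(size=1000), name="score")
target = ((var + rng.normal(scale=0.5, size=1000)) > 0).astype(int)  # hypothetical binary target
cut_points = points_calculation_tree(var, target, min_size=5)
# cut_points is [-inf, <rounded tree thresholds within [var.min(), var.max()]>, inf]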
Example #18
def showNumericalInfo(data:pd.Series):
    '''
    @Description
    Display numeric summary statistics: unique values, mean, median, mode, max, min
    ------------
    @Params
    data, Series
    '''
    print(data.name, data.dtype)
    print("Miss:", data.isnull().sum())
    print("Unique:", data.nunique())
    print("Max:", data.max())
    print("Min:", data.min())
    print("Mean:", data.mean())
    print("Median:", data.median())
    print("Mode:", data.mode()[0])
    print(data.value_counts().head(n=10))
Example #19
def discretize_series(series: pd.Series) -> pd.Series:
    # Find minimum and maximum values of the series, and define boundaries
    s_min, s_max = series.min(), series.max()
    boundaries = [s_min / 2.0, 0.0, s_max / 2.0]

    # Assign tier based on the boundaries
    def get_tier(val):
        if val < boundaries[0]:
            return "Very Low"
        elif val >= boundaries[0] and val < boundaries[1]:
            return "Low"
        elif val >= boundaries[1] and val < boundaries[2]:
            return "High"
        else:
            return "Very High"

    return pd.Series([get_tier(val) for val in series])
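A quick sketch of the tiering on values that straddle zero (the boundary list assumes a negative min and a positive max):

import pandas as pd

s = pd.Series([-10.0, -2.0, 1.0, 8.0])
print(discretize_series(s).tolist())
# boundaries are [-5.0, 0.0, 4.0] -> ['Very Low', 'Low', 'High', 'Very High']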
Example #20
def infer_continuous_type(data:pd.Series):
    assert all(data.map(is_number)), "All values in `data` must be numerical"

    _vmin = data.min()
    _vmax = data.max()

    # Conditions
    conditions = defaultdict(list)
    conditions["A"].append(_vmin < 0)
    conditions["A"].append(_vmax > 0)

    # +/-
    if all(conditions["A"]):
        return "diverging"
    # Other
    else:
        return "sequential"
Example #21
    def transform(self, target: pd.Series) -> pd.Series:
        """
        Логарифмическое преобразование целевой переменной.

        Parameters
        ----------
        target: pandas.Series, shape = [n_samples, ]
            Вектор целевой переменной.

        Returns
        -------
        log_target: pandas.Series, shape = [n_samples, ]
            Вектор прологарифмированной целевой переменной.

        """
        self.check_is_fitted
        self.target_min = target.min()
        return np.log(target - self.target_min + 1 + self.bias)
Example #22
def forkAnalysis(fileName):
    df = pd.read_csv(fileName, encoding='gbk')
    block = df['block']
    result = []
    # collect the block numbers of rows flagged as orphaned ('已孤儿')
    for i, row in block.items():
        if '已孤儿' in row:
            result.append(int(row[:-5]))

    # turn block numbers into gaps between consecutive orphaned blocks
    pre = result[0]
    for i in range(1, len(result)):
        temp = result[i]
        result[i] = result[i] - pre
        pre = temp
    ser = Series(result)
    print('--------------- fork statistics --------------------')
    print('max:', ser.max(), 'min:', ser.min(), 'mean:', ser.mean(),
          'variance:', ser.var())
Example #23
    def get_segment_bound(self, segment: pd.Series, bound: Bound) -> pd.Series:
        '''
        segment is divided by the median to determine its top or bottom part
        the part is smoothed and raised above the segment or put down below the segment
        '''
        if len(segment) < 2:
            return segment
        comparison_operator = operator.gt if bound == Bound.UPPER else operator.le
        segment = segment - segment.min()
        segment_median = segment.median()
        part = [val if comparison_operator(val, segment_median) else segment_median for val in segment.values]
        part = pd.Series(part, index=segment.index)
        smoothed_part = utils.exponential_smoothing(part, BASIC_ALPHA)
        difference = [abs(x - y) for x, y in zip(part, smoothed_part)]
        max_diff = max(difference)
        bound = [val + max_diff for val in smoothed_part.values]
        bound = pd.Series(bound, index=segment.index)
        return bound
Example #24
def hist_bin_width_fd(x: pd.Series) -> float:
    """Create bin widths for histograms based on the Freedman-Diaconis rule.

    Args:
        x (pd.Series): Series of data to use to generate bin widths.

    Returns:
        float: Number that specifies the bin widths.
    """
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    h = 2.0 * iqr * x.size**(-1.0 / 3.0)
    if (x.max() - x.min()) / h > 1e8 * x.size:
        warnings.warn(
            "Bin width estimated with the Freedman-Diaconis rule is very small"
            " (= {})".format(h),
            RuntimeWarning,
            stacklevel=2)
    return h
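A sketch that turns the Freedman-Diaconis width into explicit bin edges (synthetic data):

import numpy as np
import pandas as pd

x = pd.Series(np.random.default_rng(0).normal(size=10_000))
width = hist_bin_width_fd(x)
edges = np.arange(x.min(), x.max() + width, width)  # bin edges spaced by the FD width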
Example #25
    def fit(self, target: pd.Series) -> None:
        """
        Расчет минимального значения целевой переменной для
        корректного расчета логарифма на отрицательных значениях.

        Parameters
        ----------
        target: pandas.Series, shape = [n_samples, ]
            Вектор целевой переменной.

        Returns
        -------
        self

        """
        self.target_min = target.min()
        self.fitted = True
        return self
Example #26
    def test_timedelta64_analytics(self):

        # index min/max
        dti = pd.date_range('2012-1-1', periods=3, freq='D')
        td = Series(dti) - pd.Timestamp('20120101')

        result = td.idxmin()
        assert result == 0

        result = td.idxmax()
        assert result == 2

        # GH#2982
        # with NaT
        td[0] = np.nan

        result = td.idxmin()
        assert result == 1

        result = td.idxmax()
        assert result == 2

        # abs
        s1 = Series(pd.date_range('20120101', periods=3))
        s2 = Series(pd.date_range('20120102', periods=3))
        expected = Series(s2 - s1)

        # FIXME: don't leave commented-out code
        # this fails as numpy returns timedelta64[us]
        # result = np.abs(s1-s2)
        # assert_frame_equal(result,expected)

        result = (s1 - s2).abs()
        tm.assert_series_equal(result, expected)

        # max/min
        result = td.max()
        expected = pd.Timedelta('2 days')
        assert result == expected

        result = td.min()
        expected = pd.Timedelta('1 days')
        assert result == expected
Example #27
def do_stats_numeric(series: pd.Series, updated_dict: dict):
    stats = updated_dict["stats"]
    stats["max"] = series.max()
    stats["mean"] = series.mean()
    for percentile, value in series.quantile([0.95, 0.75, 0.50, 0.25, 0.05]).to_dict().items():
        stats[f"perc{int(percentile*100)}"] = value
    stats["min"] = series.min()

    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats["perc75"] - stats["perc25"]

    stats["std"] = series.std()
    stats["variance"] = series.var()
    stats["kurtosis"] = series.kurt()
    stats["skewness"] = series.skew()
    stats["sum"] = series.sum()
    stats["mad"] = series.mad()
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
    return updated_dict
Example #28
def _make_grid(values: Series, size: int,
               attempt_geometric: bool) -> MakeGridResult:
    start, stop = values.min(), values.max()
    message = None
    geometric = attempt_geometric
    if geometric and (start < 0 or stop <= 0):
        message = (
            "Refusing to create a geometric grid for a series with negative or all-zero values"
        )
        geometric = False
    if geometric and start == 0:
        start = values.drop_duplicates().nsmallest(2).iloc[1]
        assert start != 0
    f: Any = np.geomspace if geometric else np.linspace
    return MakeGridResult(
        grid=f(start, stop, size),
        geometric=geometric,
        message=message,
    )
Example #29
    def test_timedelta64_analytics(self):

        # index min/max
        dti = pd.date_range("2012-1-1", periods=3, freq="D")
        td = Series(dti) - pd.Timestamp("20120101")

        result = td.idxmin()
        assert result == 0

        result = td.idxmax()
        assert result == 2

        # GH#2982
        # with NaT
        td[0] = np.nan

        result = td.idxmin()
        assert result == 1

        result = td.idxmax()
        assert result == 2

        # abs
        s1 = Series(pd.date_range("20120101", periods=3))
        s2 = Series(pd.date_range("20120102", periods=3))
        expected = Series(s2 - s1)

        # FIXME: don't leave commented-out code
        # this fails as numpy returns timedelta64[us]
        # result = np.abs(s1-s2)
        # assert_frame_equal(result,expected)

        result = (s1 - s2).abs()
        tm.assert_series_equal(result, expected)

        # max/min
        result = td.max()
        expected = pd.Timedelta("2 days")
        assert result == expected

        result = td.min()
        expected = pd.Timedelta("1 days")
        assert result == expected
Example #30
def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a date series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """
    stats = {
        "min": series.min(),
        "max": series.max(),
        "histogramdata": series,  # TODO: calc histogram here?
    }

    stats["range"] = stats["max"] - stats["min"]

    return stats
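A hypothetical call; this variant ignores series_description, so an empty dict is enough:

import pandas as pd

dates = pd.Series(pd.to_datetime(["2020-01-01", "2020-06-15", "2021-03-01"]))
stats = describe_date_1d(dates, {})
print(stats["range"])  # 425 days 00:00:00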
Example #31
def hist_in_range(
        series: pandas.Series, min_value: Optional[float] = None,
        max_value: Optional[float] = None, bins: int = 50) \
        -> Any:
    """Display histogram of values in a given range.

    Arguments:
        series: pd.Series to compute the histogram on.
        min_value: Minimum value in series to be used.
        max_value: Maximum value in series to be used.
    Returns: The axes object of the plot.
    """

    min_value = series.min() if min_value is None else min_value
    max_value = series.max() if max_value is None else max_value
    plot_range = (series >= min_value) & (series <= max_value)
    range_perc = plot_range.sum() / series.count() * 100
    print(f'{range_perc:.2f}% of values in range')
    return series[plot_range].hist(bins=bins)
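A brief usage sketch (requires matplotlib, which pandas' .hist() uses under the hood):

import numpy as np
import pandas as pd

values = pd.Series(np.random.default_rng(0).exponential(size=5_000))
ax = hist_in_range(values, max_value=2.0, bins=40)  # prints the share of values <= 2.0 and plots them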
Example #32
    def split(
            self,
            X: pd.DataFrame,
            reference: pd.Series,
            bins: int = 10,
            shuffle: Union[bool, None] = None,
            random_state: Union[int, None] = None) -> Tuple[np.ndarray, np.ndarray]:

        shuffle = shuffle if shuffle is not None else self.shuffle
        random_state = random_state if random_state is not None else self.random_state
        min_ref, max_ref = int(reference.min() - 1), int(reference.max() + 1)
        cut_threshold = np.linspace(min_ref, max_ref, bins)
        out = pd.cut(reference, bins=cut_threshold, labels=False)

        skf = StratifiedKFold(self.n_split,
                              shuffle=shuffle,
                              random_state=random_state)
        for train_idx, val_idx in skf.split(X, out):
            yield train_idx, val_idx
Example #33
def describe_dc_as_dataframe(dc: pd.Series, ds_md: dict) -> pd.DataFrame:
    """ describes the profile criteria for a column
    Args:
        dc: the Series to create the profile for
        ds_md: the metadata dictionary of the DataFrame that is to be profiled
    Returns:
        A DataFrame containing the calculated description values.
    """
    dc = pd.to_numeric(dc, errors='coerce')
    null_values = dc.isna().sum()
    unique_values = len(dc.dropna().unique()) / len(dc)
    constancy = dc.value_counts(normalize=True).max()  # constancy: relative frequency of the most common value in the column

    dc_stats = [
        ["Metadaten spezifisch für Spalte",
         column_metadata(dc.name, ds_md)], ["Anzahl an Zeilen",
                                            len(dc)],
        ["Anzahl an fehlenden Werten", null_values],
        ["Fehlende Werte (Prozent)", (null_values / len(dc)) * TO_PERCENT],
        ["Distinkte Werte (Prozent)", unique_values * TO_PERCENT],
        ["Konstanz (Prozent)", constancy * TO_PERCENT],
        ["Mittelwert", format(dc.mean(), 'f')],
        [
            "Minimumwert (Jahr, Wert)",
            ({
                dc.idxmin().date(): format(dc.min(), 'f')
            } if len(dc.dropna()) > 0 else "")
        ],
        [
            "Maximumwert (Jahr, Wert)",
            ({
                dc.idxmax().date(): format(dc.max(), 'f')
            } if len(dc.dropna()) > 0 else "")
        ], ["Datenpunkte vorhanden für",
            check_is_consecutive(dc)]
    ]

    profile = pd.DataFrame(data=dc_stats, columns=["Kriterien", "Ergebnis"])
    profile.set_index("Kriterien", inplace=True)

    return profile
Example #34
def describe_timestamp_1d(data: Series,
                          percentiles: Sequence[float]) -> Series:
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    # GH-30164
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
    d = ([data.count(), data.mean(), data.min()] +
         data.quantile(percentiles).tolist() + [data.max()])
    return Series(d, index=stat_index, name=data.name)
Example #35
    def _fit(self, X: pd.Series, y):
        if not is_numeric_dtype(X) and X.name not in self.categorical_cols:
            raise ValueError(
                'Column {} is not numeric and not in categorical_cols.'.format(
                    X.name))

        if X.name in self.categorical_cols:
            X = self.encode_with_label(X, y)

        if not self.encode:
            self.min_[X.name], self.max_[X.name] = X.min(), X.max()

        X, y = self._drop_na(X, y)
        min_frac = self.min_frac if is_number(
            self.min_frac) else self.min_frac[X.name]
        DT = DecisionTreeClassifier(max_leaf_nodes=self.bins,
                                    min_samples_leaf=min_frac,
                                    random_state=self.random_state)
        DT.fit(X.to_frame(), y)
        return parse_tree(DT.tree_), DT
Example #36
def min_max_normalize(values: pd.Series) -> pd.Series:
    """
    Min-Max normalize a series.

    Args:
        values: the series to normalize

    Returns:
        the normalized series

    Test:
    >>> min_max_normalize(pd.Series([1,2,3]))
    0    0.0
    1    0.5
    2    1.0
    dtype: float64
    """
    max_value = values.max()
    min_value = values.min()
    return tp.cast(pd.Series, (values - min_value) / (max_value - min_value))
Example #37
def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a date series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """
    stats = {"min": series.min(), "max": series.max(), "histogramdata": series}

    bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    bins = min(series_description["distinct_count_with_nan"], bins)
    stats["histogram_bins"] = bins

    stats["range"] = stats["max"] - stats["min"]

    return stats
Example #38
def count_estims(dist, gamma = 0.95):
    '''
    Counts all estimates
    :param dist: distribution
    :param gamma: probability of realisation of value
    :return point: point estimates
    :return interval: confidence intervals for point estimates
    '''
    import numpy as np
    x = Series(dist)
    # Point estimates
    point = {}
    N = x.count()

    med_ = med_u(x)#
    med = np.median(dist)
    mad = x.mad()#
    mean_c = mean(dist)#
    var = np.var(dist)
    std = np.std(dist)
    mod = stats.mode(dist).mode#
    kurt = stats.kurtosis(dist)
    skew_my = stats.skew(dist)#
    Chi = 1/np.sqrt(np.abs(kurt))
    quantiles = np.round(x.quantile([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]), 5)
    W = std/mean_c  # coefficient of variation

    quantiles_str = ""
    for index in quantiles.index:
        quantiles_str+='<p><pre>{0}\t{1}</pre></p>'.format(index, quantiles[index])

    point['MED'] = np.round(med, 5)
    point['MED*'] = np.round(med_, 5)
    point['MAD'] = np.round(mad, 5)
    point['Min'] = np.round(x.min(), 5)
    point['Max'] = np.round(x.max(), 5)
    point['Mean'] = np.round(mean_c, 5)
    point['S^2'] = np.round(var, 5)
    point['S'] = np.round(std, 5)
    point['MOD'] = np.round(mod, 5)
    point['E'] = np.round(kurt, 5)
    point['A'] = np.round(skew_my, 5)
    point['Chi'] = np.round(Chi, 5)
    point['X(alpha)'] = quantiles_str
    point['W'] = np.round(W, 5)



    # Interval estimates
    from scipy.stats import t, norm
    import numpy as np
    interval = {}
    if N < 61:
        l = t.ppf((1-gamma)/2, N-1)
        u = t.ppf(1-(1-gamma)/2, N-1)
    else:
        l = norm.ppf((1-gamma)/2)
        u = norm.ppf(1-(1-gamma)/2)
    X_cf = (mean_c+l*sigma_X(x), mean_c+u*sigma_X(x))
    A_cf = (skew_my + l * sigma_A(x), skew_my + u * sigma_A(x))
    S_cf = (std + l*sigma_S(x), std+u*sigma_S(x))
    E_cf = (kurt + l*sigma_E(x), kurt+u*sigma_E(x))
    if W < 1:
        v = l/np.sqrt(2*(N-1))
        W_cf = np.round((W/(1+v*np.sqrt(1+2*W**2)), W/(1-v*np.sqrt(1+2*W**2))), 5)
    else: W_cf = (None, None)

    interval['Mean'] = np.round(X_cf, 5)
    interval['S'] = np.round(S_cf, 5)
    interval['E'] = np.round(E_cf, 5)
    interval['A'] = np.round(A_cf, 5)
    interval['W'] = W_cf

    return point, interval
Example #39
    def source_data(self):
        
        st_date = self.stTrain
#        st_date = '2014-10-1'
        stD = date(int(st_date.split('-')[0]), int(st_date.split('-')[1]), int(st_date.split('-')[2]))
        if self.view and stD < datetime.datetime.strptime('2015-4-1',"%Y-%m-%d").date():
            raise RuntimeError('I know it sucks but we dont have view-count data for anytime before 2015-4-1!')
        if self.view:
            db_red = psycopg2.connect(host="***", database="***", port="***",
                                  user="******", password="******")
            db_red.autocommit = True
            df_red = pd.read_sql('''select date,sum(installs) as install, sum(pageviewcount) as view
                                from appstoredata_itunes_metrics where game='***' 
                                and country='%s' group by date;''' % pycountry.countries.get(alpha2=self.target).name, 
                                con=db_red)  
                            
            df_red['date'] = pd.to_datetime(df_red['date'])
            ts_view_target1 = Series(df_red.view.tolist(), 
                                     index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_install_target1 = Series(df_red.install.tolist(), 
                                        index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_view_target1) < (self.endP-stD).days :
                ts_view_target1[pd.to_datetime(st_date)] = 0
                ts_view_target1 = ts_view_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
                ts_install_target1[pd.to_datetime(st_date)] = 0
                ts_install_target1 = ts_install_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_view_target = (ts_view_target1)/(ts_view_target1.sum())
            ts_install_target = (ts_install_target1)/(ts_install_target1.sum())
        else:
            ts_view_target = []
            ts_view_target1 = []
            ts_install_target = []  
            ts_install_target1 = []
        
        db = MySQLdb.connect(
        host = '***', 
        user = '******', 
        passwd = '***', 
        db = '***', 
        port = '***')
        
        df_mysql = pd.read_sql('''select metrics_daily.date as date, dim_country.name as country,
                               sum(metrics_daily.value) as value, dim_channel.channel_type as type
                               from metrics_daily left join dim_channel on dim_channel.id = metrics_daily.channel_id 
                               left join dim_country on dim_country.id = metrics_daily.country_id where project_id=195 
                               and metrics_daily.platform_id=2 and metric_id in (5) group by date, type, country;''', con=db)  
                       
        
        df_mysql['date'] = pd.to_datetime(df_mysql['date'])
        all_data_target = df_mysql[df_mysql.country==self.target]
        org_data_target = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.target)]
        ts_org_target1 = Series(org_data_target.value.tolist(), 
                               index=org_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_all_target1 = Series(all_data_target.value.tolist(), 
                                index=all_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_org_target = (ts_org_target1)/(ts_org_target1.sum())
        ts_all_target = (ts_all_target1)/(ts_all_target1.sum())
        
        if self.baseorg:
            org_data_base = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.baseline)]
            ts_org_base1 = Series(org_data_base.value.tolist(), 
                                 index=org_data_base.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)   
            ts_org_base = (ts_org_base1-ts_org_base1.min())/(ts_org_base1.max()-ts_org_base1.min())
        else:
            ts_org_base = []
            ts_org_base1 = []
        
        if self.paid:
            paid_data_target = df_mysql[(df_mysql.type=='PAID') & (df_mysql.country==self.target)]
            ts_paid_target1 = Series(paid_data_target.value.tolist(),
                                    index=paid_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_paid_target1) < (self.endP-stD).days :
                ts_paid_target1[pd.to_datetime(st_date)] = 0
                ts_paid_target1 = ts_paid_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_paid_target = (ts_paid_target1)/(ts_paid_target1.sum())
        else:
            ts_paid_target = []
            ts_paid_target1 = []
            
        if self.rank:
            df_rank = pd.read_sql('''select date, max(1/sqrt(rank)) as bestRank from kabam_ranks_data_free where 
                                    country='%s' and device!='android'and game='***' 
                                    and category='Overall' group by date;''' % self.target, con=db)  
            
            df_rank['date'] = pd.to_datetime(df_rank['date'])
            ts_rank_target1 = Series(df_rank.bestRank.tolist(), 
                                     index=df_rank.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_rank_target1) < (self.endP-stD).days :
                ts_rank_target1[pd.to_datetime(st_date)] = 0
                ts_rank_target1 = ts_rank_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_rank_target = (ts_rank_target1)/(ts_rank_target1.sum())
        else:
            ts_rank_target = []
            ts_rank_target1 = []
        
#        endog = ts_org_target1
#        endog = ts_install_target
        endog = ts_all_target1
        
        Tlist = [self.paid, self.baseorg, self.view, self.rank]
        dff = DataFrame()
        tList = [ts_paid_target, ts_org_base, ts_view_target, ts_rank_target]
        tlist = ['paid', 'base', 'view', 'rank']
        for i in range(0, len(Tlist)):
            if Tlist[i]:
                dff[tlist[i]] = tList[i]
        if dff.empty:
            raise RuntimeError('Where is your exog variable? Do you need a coffee or something?!')
                
        exog = dff
        
        return (endog, exog)
Example #40
import numpy as np
import pandas as pd

# reconstruct the frame shown in the expected output below
frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                     columns=list('ABC'), index=list('abc'))
print(frame)
'''
   A  B  C
a  0  1  2
b  3  4  5
c  6  7  8
'''
print(frame.max())
'''
A    6
B    7
C    8
'''
f = lambda x: x.max() - x.min()
print(frame.apply(f))  # applied to each column
'''
A    6
B    6
C    6
'''
print(frame.apply(f, axis=1))  # applied to each row
'''
a    2
b    2
c    2
'''