Example #1
    def test_sum_overflow(self, use_bottleneck):
        """Compare Series sum/min/max against NumPy on 5,000,000-element ranges.

        The reductions run with ``skipna=False`` inside
        ``pd.option_context('use_bottleneck', use_bottleneck)``, first for the
        integer dtypes and then for the float dtypes.
        """

        with pd.option_context('use_bottleneck', use_bottleneck):
            # GH#6915
            # overflowing on the smaller int dtypes
            for dtype in ['int32', 'int64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                # the reference sum is accumulated as int64 for both input dtypes
                assert int(result) == v.sum(dtype='int64')
                result = s.min(skipna=False)
                assert int(result) == 0
                result = s.max(skipna=False)
                assert int(result) == v[-1]

            for dtype in ['float32', 'float64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                # the reference sum is accumulated in the same float dtype
                assert result == v.sum(dtype=dtype)
                result = s.min(skipna=False)
                assert np.allclose(float(result), 0.0)
                result = s.max(skipna=False)
                assert np.allclose(float(result), v[-1])
Example #2
def test_nat_operations():
    """Median, min and max of a timedelta Series ``[0, NaT]`` equal its first element.

    Regression test for GH 8617.
    """
    ser = Series([0, pd.NaT], dtype='m8[ns]')
    first = ser[0]
    # each reduction is compared against the only non-NaT entry
    for reduce in (ser.median, ser.min, ser.max):
        assert reduce() == first
Example #3
    def test_min_max_numeric_only(self):
        """Exercise ``numeric_only`` on min/max of an ordered categorical Series.

        The data holds one NaN and the categories are declared in the order
        ``['b', 'a']``.
        """
        # TODO deprecate numeric_only argument for Categorical and use
        # skipna as well, see GH25303
        cat = Series(Categorical(
            ["a", "b", np.nan, "a"], categories=['b', 'a'], ordered=True))

        # default call: the asserted minimum is NaN
        _min = cat.min()
        _max = cat.max()
        assert np.isnan(_min)
        assert _max == "a"

        # numeric_only=True: the asserted minimum is the category "b"
        _min = cat.min(numeric_only=True)
        _max = cat.max(numeric_only=True)
        assert _min == "b"
        assert _max == "a"

        # numeric_only=False: same expectations as the default call
        _min = cat.min(numeric_only=False)
        _max = cat.max(numeric_only=False)
        assert np.isnan(_min)
        assert _max == "a"
Example #4
    # Timedelta overflow checks on a 100000-element hourly datetime Series:
    # the mean of (s - s.min()) is compared with a manually computed value, and
    # summing the differences is expected to raise ValueError for the full
    # series and for its first 10000 rows.
    # NOTE(review): the `expected = pd.Timedelta(...` statement below never
    # closes its parentheses in this copy, so the method does not parse as shown.
    def test_overflow(self):
        # GH 9442
        s = Series(pd.date_range('20130101', periods=100000, freq='H'))
        s[0] += pd.Timedelta('1s 1ms')

        # mean
        result = (s - s.min()).mean()
        expected = pd.Timedelta((pd.DatetimeIndex((s - s.min())).asi8 / len(s)

        # the computation is converted to float so
        # might be some loss of precision
        assert np.allclose(result.value / 1000, expected.value / 1000)

        # sum
        pytest.raises(ValueError, lambda: (s - s.min()).sum())
        s1 = s[0:10000]
        pytest.raises(ValueError, lambda: (s1 - s1.min()).sum())
        s2 = s[0:1000]
        result = (s2 - s2.min()).sum()
Example #5
    # Checks Series.min()/max() on categorical data: unordered input, ordered
    # input with default and explicit category order, and input containing NaN.
    # NOTE(review): both `with pytest.raises(TypeError):` lines below have no
    # body in this copy, so the method does not parse as shown.
    def test_min_max(self):
        # unordered cats have no min/max
        cat = Series(Categorical(["a", "b", "c", "d"], ordered=False))
        with pytest.raises(TypeError):
        with pytest.raises(TypeError):

        # ordered, categories in their default order
        cat = Series(Categorical(["a", "b", "c", "d"], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert _min == "a"
        assert _max == "d"

        # ordered, categories listed in the order d, c, b, a
        cat = Series(Categorical(["a", "b", "c", "d"], categories=[
                     'd', 'c', 'b', 'a'], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert _min == "d"
        assert _max == "a"

        # NaN entries: the asserted minimum is NaN
        cat = Series(Categorical(
            [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a'
                                                    ], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert np.isnan(_min)
        assert _max == "b"

        # same check with integer categories
        cat = Series(Categorical(
            [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert np.isnan(_min)
        assert _max == 1
Example #6
def test_name2num():
    """Check that ``name2num`` stays within bounds derived from letter ordinals.

    Ten random 4-letter samples of ``string.ascii_letters`` are encoded; each
    result must lie between the base-256 weighted sums of the smallest and
    largest letter ordinal.
    """
    n_trials, width = 10, 4
    alphabet = string.ascii_letters
    ordinals = Series(dict(zip(alphabet, map(ord, alphabet))))
    weights = 256 ** np.arange(width)
    lower = weights.dot(np.repeat(ordinals.min(), width))
    upper = weights.dot(np.repeat(ordinals.max(), width))

    for _ in xrange(n_trials):
        encoded = name2num(random.sample(alphabet, width))
        assert lower <= encoded <= upper
Example #7
    # Checks min/max of a categorical Series containing NaN for the given
    # `skipna` value (GH 25303).
    # NOTE(review): the `Series(Categorical(...` call is never closed, and the
    # NaN assertions sit in the same branch as the equality assertions -
    # presumably closing arguments and an `else:` were lost; confirm upstream.
    def test_min_max_skipna(self, skipna):
        # GH 25303
        cat = Series(
            Categorical(["a", "b", np.nan, "a"],
                        categories=["b", "a"],
        _min = cat.min(skipna=skipna)
        _max = cat.max(skipna=skipna)

        if skipna is True:
            assert _min == "b"
            assert _max == "a"
            assert np.isnan(_min)
            assert np.isnan(_max)
Example #8
def test_td64_summation_overflow():
    """Check mean and sum of timedeltas built from 100000 hourly stamps (GH#9442).

    The mean is compared against a manually computed value, while summing is
    expected to raise for the full series and for its first 10000 rows.
    """
    stamps = Series(pd.date_range("20130101", periods=100000, freq="H"))
    stamps[0] += pd.Timedelta("1s 1ms")

    # mean: reference value is the sum of the per-element quotients
    observed = (stamps - stamps.min()).mean()
    scaled = pd.TimedeltaIndex(stamps - stamps.min()).asi8 / len(stamps)
    wanted = pd.Timedelta(scaled.sum())

    # the computation is converted to float so
    # might be some loss of precision
    assert np.allclose(observed.value / 1000, wanted.value / 1000)

    # sum: both the full series and the 10000-row slice must raise
    msg = "overflow in timedelta operation"
    for subset in (stamps, stamps[0:10000]):
        with pytest.raises(ValueError, match=msg):
            (subset - subset.min()).sum()

    # the 1000-row slice is summed outside any raises-context
    head = stamps[0:1000]
    (head - head.min()).sum()
Example #9
def get_stats(s: pd.Series):
    """Calculate basic statistics of the sample `s`.

    Returns a list in the order
    ``[q1, median, q3, p90, p95, p99, iqr, mean, std, min, max, n]``,
    where ``iqr`` is ``q3 - q1`` and mean/std are rounded to two decimals.
    """
    lower_quartile, upper_quartile = s.quantile(0.25), s.quantile(0.75)
    tail_quantiles = [s.quantile(level) for level in (0.90, 0.95, 0.99)]
    return [
        lower_quartile,
        s.median(),
        upper_quartile,
        *tail_quantiles,
        upper_quartile - lower_quartile,
        round(s.mean(), 2),
        round(s.std(), 2),
        s.min(),
        s.max(),
        len(s),
    ]
Example #10
def numeric_stats_pandas(series: pd.Series):
    """Return basic numeric statistics of `series` as a dict.

    Keys, in order: "mean", "std", "variance", "min", "max", "kurtosis",
    "skewness" and "sum"; each value is the result of the corresponding
    pandas reduction on `series`.
    """
    return {
        "mean": series.mean(),
        "std": series.std(),
        "variance": series.var(),
        "min": series.min(),
        "max": series.max(),
        # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
        "kurtosis": series.kurt(),
        # Unbiased skew normalized by N-1
        "skewness": series.skew(),
        "sum": series.sum(),
    }
def compress_float(series: pd.Series) -> pd.Series:
    """Cast a float series to the type picked by ``get_compressed_type``.

    The series minimum and maximum are handed to ``type_tester`` together with
    ``np.finfo``; the resulting tester and the candidate types float16,
    float32 and float64 are then passed to ``get_compressed_type``.

    Compressing to half-precision floating-point format can degrade
    computational performance: CPUs often do not have native support for
    16-bit floats and simulate the data type.

    :param series: series to cast
    :return: `series` converted with ``astype`` to the selected type
    """
    minv, maxv = series.min(), series.max()
    tester = type_tester(minv, maxv, np.finfo)
    test_types = [np.float16, np.float32, np.float64]

    compressed_type = get_compressed_type(test_types, tester)
    return series.astype(compressed_type)
Example #12
    # Compares min/max of an empty datetime DataFrame with min/max of an empty
    # datetime Series, along axis 0 and axis 1.
    # NOTE(review): the two `assert (... is pd.NaT) == (... is` statements end
    # mid-expression in this copy, so the method does not parse as shown.
    def test_min_max_dt64_api_consistency_with_NaT(self):
        # Calling the following sum functions returned an error for dataframes but
        # returned NaT for series. These tests check that the API is consistent in
        # min/max calls on empty Series/DataFrames. See GH:33704 for more
        # information
        df = DataFrame(dict(x=pd.to_datetime([])))
        expected_dt_series = Series(pd.to_datetime([]))
        # check axis 0
        assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is
        assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is

        # check axis 1
        tm.assert_series_equal(df.min(axis=1), expected_dt_series)
        tm.assert_series_equal(df.max(axis=1), expected_dt_series)
# Builds a pd.IntervalIndex from a list of threshold break points via
# pd.IntervalIndex.from_breaks; raises ValueError when `thresholds` is empty
# or None.
# NOTE(review): the docstring delimiters and the body of the
# `if abs(facet_max) not in thresholds:` branch are missing in this copy, so
# the function does not parse as shown. `facet_min` is computed but not used
# in the visible lines.
def _interval_index(facet: pd.Series, thresholds: Optional[List[Any]]) -> pd.IntervalIndex:
    Creates a Interval Index from list of threshold values. See pd.IntervalIndex.from_breaks
    Ex. [0,1,2] -> [(0, 1], (1,2]]
    :param facet: input data series
    :param thresholds: list of int or float values defining the threshold splits
    :return: pd.IntervalIndex
    if not thresholds:
        raise ValueError("Threshold values must be provided for continuous features")
    facet_max, facet_min = facet.max(), facet.min()
    threshold_intervals = thresholds.copy()
    # add  max value if not exists in threshold limits
    if abs(facet_max) not in thresholds:
    return pd.IntervalIndex.from_breaks(threshold_intervals)
Example #14
def infer_vmin_vmax(data:pd.Series, continuous_type="infer"):
    """Derive colour-scale limits ``(vmin, vmax)`` from `data`.

    Parameters
    ----------
    data : pd.Series
        Values to derive the limits from.
    continuous_type : str or None
        "diverging" gives symmetric limits ``(-m, m)`` with ``m`` the largest
        absolute value; "sequential" gives ``(data.min(), data.max())``.
        "infer" or None delegates the choice to ``infer_continuous_type``.

    Returns
    -------
    tuple
        ``(vmin, vmax)``.

    Raises
    ------
    AssertionError
        If `continuous_type` resolves to neither "diverging" nor "sequential".
    """
    vmin = None
    vmax = None
    # Infer continuous type
    if continuous_type in ["infer", None]:
        continuous_type = infer_continuous_type(data)
    if continuous_type == "diverging":
        # +/-: symmetric around zero
        vmax = data.abs().max()
        vmin = -vmax
    elif continuous_type == "sequential":
        vmax = data.max()
        vmin = data.min()
    # Compare against None explicitly: a limit of 0 is a valid result and must
    # not trip the check (a truthiness test would reject it).
    assert vmin is not None and vmax is not None, "`vmin` and `vmax` should not be None at this point.  Please check `infer_continuous_type`"
    return vmin, vmax
Example #15
def calc_data_identifier(feature: pd.Series):
    """Return a JSON string of summary statistics usable as a data identifier.

    Which statistics are computed depends on the dtype of `feature`:

    * numeric: row count plus mean, max, min and std, rounded to 9 decimals;
    * string: row count plus the value counts;
    * datetime (as decided by ``is_datetime``): row count plus the value
      counts with the values converted to strings.

    The dict is serialised with ``json.dumps(..., sort_keys=True)``.

    Raises
    ------
    Exception
        If the dtype matches none of the three cases.
    """
    precision = 9

    n_rows = len(feature)

    if pd.api.types.is_numeric_dtype(feature.dtype):
        data_analysis = {
            'n_rows': n_rows,
            'mean': round(float(feature.mean()), precision),
            'max': round(float(feature.max()), precision),
            'min': round(float(feature.min()), precision),
            'std': round(float(feature.std()), precision),
        }

    elif pd.api.types.is_string_dtype(feature.dtype):
        data_analysis = {
            'n_rows': n_rows,
            'value_counts': feature.value_counts().to_dict(),
        }

    elif is_datetime(feature.dtype):
        # convert datetime values to string to make them json exportable
        value_counts = feature.value_counts()
        value_counts.index = value_counts.index.astype(str)

        data_analysis = {
            'n_rows': n_rows,
            'value_counts': value_counts.to_dict(),
        }

    else:
        raise Exception(f"Unexpected feature dtype: {feature.dtype}")

    return json.dumps(data_analysis, sort_keys=True)
# Builds bin edges in 0.01 steps starting at the rounded series minimum, draws
# onto a `Figure` and saves it as 'histogram.png' under `cwd`.
# NOTE(review): this copy is truncated - several call openers are missing
# (orphaned `label=...` / f-string argument lines), and `bin_num` is assigned
# twice in a row under `if series_min_boundary >= 0:` (presumably an `else:`
# was lost). The function does not parse as shown.
def create_histogram(series: pd.Series, cwd: str, ranking_score_capping: float,
                     limit: int) -> None:
    series_max_boundary = 1
    series_min_boundary = round(series.min(), 1)
    ranking_score_capping_bool = True if ranking_score_capping > series_min_boundary else False
    num_of_messages = series.count()
    if series_min_boundary >= 0:
        bin_num = int((series_max_boundary + series_min_boundary) * 100)
        bin_num = int((series_max_boundary + abs(series_min_boundary)) * 100)
    bin_seq = []
    # each iteration records the current edge, then advances it by 0.01
    for x in range(bin_num - 1):
        bin_seq.append(round(series_min_boundary, 2))
        series_min_boundary += 0.01

    fig = Figure(figsize=(18, 9), facecolor='#ffffff')
    ax = fig.add_subplot()
            label="num of message scores in 0.01 bins")
    ax.grid(color='#000000', linestyle="--")
    if ranking_score_capping_bool:
                   label=f"capping value ({ranking_score_capping})")
    if limit <= num_of_messages:
        limit_score_value = series.iloc[limit]
            label=f"limit from configuration file ({limit})"
            f"\nscore of last message before the limit is ~ {round(limit_score_value, 3)}"
    ax.legend(loc='upper right', fontsize=10, shadow=True, facecolor='#ffffff')
        f"Unfiltered messages ({num_of_messages}), sorted by ranking score, before using capping and limit",
    ax.set_xlabel("Ranking score", fontsize=10)
    ax.set_ylabel("Num of messages in 0.01 bins", fontsize=10)
    fig.savefig(join(cwd + sep + 'histogram.png'))
# Computes cut points for binning the numeric `var` with a decision tree fitted
# against `target`. Returns [-np.inf, np.inf] when the share of nulls, or of the
# single most frequent value, exceeds 1 - min_size / 100.
# NOTE(review): docstring delimiters, the `else:` branch header and several
# call arguments are missing in this copy, so the function does not parse as
# shown.
def points_calculation_tree(var: pd.Series,
                            target: pd.Series,
                            min_size: float = 5,
                            rnd: int = 2) -> list:
    Calculate points for binning numeric variable dependent on target variable.

    Keyword arguments:
        var (pd.Series) -- Numeric variable
        target (pd.Series) -- Target binary variable
        min_size (float) -- minimum size of group in percent
        rnd -- Round level for variable values (default 2)
    size = var.shape[0]
    var_name = var.name
    if (var.isnull().sum() / size) > (1 - min_size / 100):
        print('WARNING! Variable "{vname}" has too much null values!'.format(
        return [-np.inf, np.inf]
    elif (var.value_counts(dropna=True).max() / size) > (1 - min_size / 100):
        print('WARNING! Variable "{vname}" has too often one value!'.format(
        return [-np.inf, np.inf]
        indx = var.notnull()
        var = var[indx]
        target = target[indx]
        xmin = var.min()
        xmax = var.max()
        min_samples = round(min_size * size / 100)
        clf = tree.DecisionTreeClassifier(min_samples_leaf=min_samples,
        clf.fit(var.to_frame(), target)
        cut_points = pd.Series(
        cut_points.columns = ['point', 'cnt']
        cut_points = cut_points.loc[cut_points['cnt'] == 1, :]
        cut_points = cut_points.loc[(cut_points['point'] <= xmax) &
                                    (cut_points['point'] >= xmin), :]
        cut_points = cut_points['point'].sort_values(ascending=True).values
        cut_points = [
        ] + list(cut_points.round(rnd)) + [
        return cut_points
Example #18
def showNumericalInfo(data:pd.Series):
    """Print summary statistics of a numeric series.

    Prints the name and dtype on the first line, followed by the number of
    missing values, the number of unique values, max, min, mean, median and
    the first mode, one per line.

    data: Series
    """
    print(data.name, data.dtype)
    print("Miss:", data.isnull().sum())
    print("Unique:", data.nunique())
    print("Max:", data.max())
    print("Min:", data.min())
    print("Mean:", data.mean())
    print("Median:", data.median())
    # mode() returns a Series of all modes; only the first one is shown
    print("Mode:", data.mode()[0])
Example #19
def discretize_series(series: pd.Series) -> pd.Series:
    """Map each value of `series` to one of four tier labels.

    The boundaries are half the series minimum, zero, and half the series
    maximum. Values below the first boundary are "Very Low", values in the
    next two half-open intervals are "Low" and "High", and everything else is
    "Very High". The result is a new Series with a default index.
    """
    # Find minimum and maximum values of the series, and define boundaries
    s_min, s_max = series.min(), series.max()
    boundaries = [s_min / 2.0, 0.0, s_max / 2.0]

    # Assign tier based on the boundaries
    def get_tier(val):
        if val < boundaries[0]:
            return "Very Low"
        elif val >= boundaries[0] and val < boundaries[1]:
            return "Low"
        elif val >= boundaries[1] and val < boundaries[2]:
            return "High"
        else:
            return "Very High"

    return pd.Series([get_tier(val) for val in series])
Example #20
def infer_continuous_type(data:pd.Series):
    """Classify `data` as "diverging" or "sequential".

    Returns "diverging" when the minimum is below zero and the maximum is
    above zero, otherwise "sequential".

    Raises
    ------
    AssertionError
        If ``is_number`` rejects any value of `data`.
    """
    assert all(data.map(is_number)), "All values in `data` must be numerical"

    _vmin = data.min()
    _vmax = data.max()

    # +/-: values on both sides of zero
    if _vmin < 0 and _vmax > 0:
        return "diverging"
    # Other
    return "sequential"
Example #21
    def transform(self, target: pd.Series) -> pd.Series:
        """Apply a logarithmic transformation to the target variable.

        Parameters
        ----------
        target: pandas.Series, shape = [n_samples, ]
            Target vector.

        Returns
        -------
        log_target: pandas.Series, shape = [n_samples, ]
            Log-transformed target vector, computed as
            ``log(target - target.min() + 1 + self.bias)``.

        """
        # NOTE(review): the minimum is recomputed from the data passed here and
        # stored on the instance, replacing any previously stored value.
        self.target_min = target.min()
        return np.log(target - self.target_min + 1 + self.bias)
Example #22
# Reads `fileName` as a GBK-encoded CSV, scans its 'block' column, turns the
# collected `result` values into successive differences and prints max, min,
# mean and variance of those differences.
# NOTE(review): Python 2 code (print statements, xrange, iteritems). The body
# of the `if` inside the first loop is missing in this copy, so nothing visible
# ever appends to `result` and the function does not parse as shown.
def forkAnalysis(fileName):
    df = pd.read_csv(fileName, encoding='gbk')
    block = df['block']
    result = []
    #    print block
    for i, row in block.iteritems():
        if '已孤儿' in row.encode('utf-8'):

    # replace every entry after the first with its difference to the previous
    # original value
    pre = result[0]
    for i in xrange(1, len(result)):
        temp = result[i]
        result[i] = result[i] - pre
        pre = temp
    ser = Series(result)
    print '---------------分叉相关--------------------'
    print '最大值:', ser.max(), '最小值:', ser.min(), '平均值:', ser.mean(
    ), '方差:', ser.var()
 def get_segment_bound(self, segment: pd.Series, bound: Bound) -> pd.Series:
     """Build a bound series for `segment`.

     The segment is shifted so that its minimum is zero and split at its
     median: for ``Bound.UPPER`` values above the median are kept, otherwise
     values at or below it; every other value is replaced by the median.
     The resulting part is smoothed with ``utils.exponential_smoothing`` and
     then raised by the largest absolute difference between the part and its
     smoothed version.

     Segments with fewer than two points are returned unchanged.
     """
     if len(segment) < 2:
         return segment
     comparison_operator = operator.gt if bound == Bound.UPPER else operator.le
     segment = segment - segment.min()
     segment_median = segment.median()
     part = pd.Series(
         [val if comparison_operator(val, segment_median) else segment_median for val in segment.values],
         index = segment.index,
     )
     smoothed_part = utils.exponential_smoothing(part, BASIC_ALPHA)
     max_diff = max(abs(x - y) for x, y in zip(part, smoothed_part))
     # use a separate name for the result so the `bound` argument is not rebound
     shifted = [val + max_diff for val in smoothed_part.values]
     return pd.Series(shifted, index = segment.index)
# Freedman-Diaconis bin width: h = 2 * IQR * n ** (-1/3), with the IQR taken
# from the 75th and 25th percentiles of `x` and n = x.size.
# NOTE(review): the docstring is never closed, and the statement that should
# wrap the "very small" message under the `if` is missing in this copy, so the
# function does not parse as shown.
def hist_bin_width_fd(x: pd.Series) -> float:
    """Create bin widths for histograms based on the Freedman-Diaconis rule.

        x (pd.Series): Series of data to use to generate bin widths.

        float: Number that specifies the bin widths.
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    h = 2.0 * iqr * x.size**(-1.0 / 3.0)
    if (x.max() - x.min()) / h > 1e8 * x.size:
            "Bin width estimated with the Freedman-Diaconis rule is very small"
            " (= {})".format(h),
    return h
Example #25
    def fit(self, target: pd.Series):
        """Compute and store the minimum of the target variable.

        The minimum is needed so that the logarithm can be computed correctly
        on negative values.

        Parameters
        ----------
        target: pandas.Series, shape = [n_samples, ]
            Target vector.

        Returns
        -------
        self
            The instance, with ``target_min`` set and ``fitted`` set to True.

        """
        self.target_min = target.min()
        self.fitted = True
        return self
Example #26
    def test_timedelta64_analytics(self):
        """Check idxmin/idxmax, abs, max and min on timedelta64 Series.

        The index checks are repeated after the first element has been
        overwritten with NaN.
        """

        # index min/max
        dti = pd.date_range('2012-1-1', periods=3, freq='D')
        td = Series(dti) - pd.Timestamp('20120101')

        result = td.idxmin()
        assert result == 0

        result = td.idxmax()
        assert result == 2

        # GH#2982
        # with NaT
        td[0] = np.nan

        # with position 0 missing, the asserted minimum moves to position 1
        result = td.idxmin()
        assert result == 1

        result = td.idxmax()
        assert result == 2

        # abs
        s1 = Series(pd.date_range('20120101', periods=3))
        s2 = Series(pd.date_range('20120102', periods=3))
        expected = Series(s2 - s1)

        result = (s1 - s2).abs()
        tm.assert_series_equal(result, expected)

        # max/min
        # `td` still has its first element overwritten from the NaT section
        result = td.max()
        expected = pd.Timedelta('2 days')
        assert result == expected

        result = td.min()
        expected = pd.Timedelta('1 days')
        assert result == expected
def do_stats_numeric(series: pd.Series, updated_dict: dict):
    """Fill ``updated_dict["stats"]`` with numeric statistics of `series`.

    Written keys: "max", "mean", "perc95", "perc75", "perc50", "perc25",
    "perc5", "min", "range", "iqr", "std", "variance", "kurtosis", "skewness",
    "sum", "mad" (mean absolute deviation around the mean) and "cv"
    (std / mean, NaN when the mean is zero).

    Parameters
    ----------
    series : pd.Series
        Numeric data to summarise.
    updated_dict : dict
        Must already contain a "stats" dict; it is modified in place.

    Returns
    -------
    dict
        The same `updated_dict` object.
    """
    stats = updated_dict["stats"]
    stats["max"] = series.max()
    stats["mean"] = series.mean()
    for percentile, value in series.quantile([0.95, 0.75, 0.50, 0.25, 0.05]).to_dict().items():
        # round before converting so float error in `percentile * 100` cannot
        # truncate the key to the next lower integer
        stats[f"perc{int(round(percentile * 100))}"] = value
    stats["min"] = series.min()

    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats["perc75"] - stats["perc25"]

    stats["std"] = series.std()
    stats["variance"] = series.var()
    stats["kurtosis"] = series.kurt()
    stats["skewness"] = series.skew()
    stats["sum"] = series.sum()
    # Series.mad() was removed in pandas 2.0; this is the same mean absolute
    # deviation written out explicitly.
    stats["mad"] = (series - stats["mean"]).abs().mean()
    # np.nan instead of the np.NaN alias, which NumPy 2.0 removed
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.nan
    return updated_dict
Example #28
# Builds a grid of `size` points between the series minimum and maximum, using
# np.geomspace when a geometric grid is requested and possible, else
# np.linspace. A geometric grid is refused when start < 0 or stop <= 0; when
# start == 0 the second-smallest distinct value is used as the start instead.
# NOTE(review): the `message = (` and `return MakeGridResult(` expressions are
# never closed in this copy, so the function does not parse as shown.
def _make_grid(values: Series, size: int,
               attempt_geometric: bool) -> MakeGridResult:
    start, stop = values.min(), values.max()
    message = None
    geometric = attempt_geometric
    if geometric and (start < 0 or stop <= 0):
        message = (
            "Refusing to create a geometric grid for a series with negative or all-zero values"
        geometric = False
    if geometric and start == 0:
        start = values.drop_duplicates().nsmallest(2).iloc[1]
        assert start != 0
    f: Any = np.geomspace if geometric else np.linspace
    return MakeGridResult(
        grid=f(start, stop, size),
Example #29
    def test_timedelta64_analytics(self):
        """Check idxmin/idxmax, abs, max and min on timedelta64 Series.

        The index checks are repeated after the first element has been
        overwritten with NaN.
        """

        # index min/max
        dti = pd.date_range("2012-1-1", periods=3, freq="D")
        td = Series(dti) - pd.Timestamp("20120101")

        result = td.idxmin()
        assert result == 0

        result = td.idxmax()
        assert result == 2

        # GH#2982
        # with NaT
        td[0] = np.nan

        # with position 0 missing, the asserted minimum moves to position 1
        result = td.idxmin()
        assert result == 1

        result = td.idxmax()
        assert result == 2

        # abs
        s1 = Series(pd.date_range("20120101", periods=3))
        s2 = Series(pd.date_range("20120102", periods=3))
        expected = Series(s2 - s1)

        result = (s1 - s2).abs()
        tm.assert_series_equal(result, expected)

        # max/min
        # `td` still has its first element overwritten from the NaT section
        result = td.max()
        expected = pd.Timedelta("2 days")
        assert result == expected

        result = td.min()
        expected = pd.Timedelta("1 days")
        assert result == expected
Example #30
def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a date series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far
            (not read by this function).

    Returns:
        A dict containing calculated series description values: "min", "max",
        "histogramdata" (the series itself) and "range" (max - min).
    """
    stats = {
        "min": series.min(),
        "max": series.max(),
        "histogramdata": series,  # TODO: calc histogram here?
    }

    stats["range"] = stats["max"] - stats["min"]

    return stats
Example #31
def hist_in_range(
        series: pandas.Series, min_value: Optional[float] = None,
        max_value: Optional[float] = None, bins: int = 50) \
        -> Any:
    """Display histogram of values in a given range.

    Also prints the percentage of non-null values that fall inside the range.

    Args:
        series: pd.Series to compute the histogram on.
        min_value: Minimum value in series to be used; defaults to the series
            minimum when None.
        max_value: Maximum value in series to be used; defaults to the series
            maximum when None.
        bins: Number of histogram bins passed on to ``Series.hist``.

    Returns: The axes object of the plot.
    """
    # Test against None explicitly: `min_value or ...` would silently replace
    # an explicit bound of 0 with the series extreme.
    if min_value is None:
        min_value = series.min()
    if max_value is None:
        max_value = series.max()
    plot_range = (series >= min_value) & (series <= max_value)
    range_perc = plot_range.sum() / series.count() * 100
    print(f'{range_perc:.2f}% of values in range')
    return series[plot_range].hist(bins=bins)
Example #32
    # Yields (train_idx, val_idx) pairs from a StratifiedKFold split of `X`,
    # stratified on `reference` binned with pd.cut over evenly spaced edges
    # from np.linspace.
    # NOTE(review): the signature shows no `self` although the body reads
    # self.shuffle / self.random_state / self.n_split; the return annotation is
    # a Tuple although the body yields; and the `StratifiedKFold(` call is never
    # closed in this copy, so the method does not parse as shown.
    def split(
            X: pd.DataFrame,
            reference: pd.Series,
            bins: int = 10,
            shuffle: Union[bool] = None,
            random_state: Union[int] = None) -> Tuple[np.ndarray, np.ndarray]:

        # explicit arguments win over the instance-level defaults
        shuffle = shuffle if shuffle is not None else self.shuffle
        random_state = random_state if random_state is not None else self.random_state
        # widen the integer range by one on each side before cutting
        min_ref, max_ref = int(reference.min() - 1), int(reference.max() + 1)
        cut_threshold = np.linspace(min_ref, max_ref, bins)
        out = pd.cut(reference, bins=cut_threshold, labels=False)

        skf = StratifiedKFold(self.n_split,
        for train_idx, val_idx in skf.split(X, out):
            yield train_idx, val_idx
# Builds a per-column profile: coerces the column to numeric, computes the
# number of missing values, the share of distinct values and the share of the
# most frequent value, and returns the criteria as a DataFrame indexed by
# "Kriterien".
# NOTE(review): annotated as returning pd.Series, but the visible `return`
# yields the DataFrame `profile`. The docstring is never closed and several
# entries of `dc_stats` are truncated in this copy, so the function does not
# parse as shown.
def describe_dc_as_dataframe(dc: pd.Series, ds_md: dict) -> pd.Series:
    """ describes the profile criteria for column
        dc: the Series to create Profile for
        ds_md: the Metadata dictionary of the DataFrame that is to be profiled
        A Series containing calculated description values.
    dc = pd.to_numeric(dc, errors='coerce')
    null_values = dc.isna().sum()
    unique_values = len(dc.dropna().unique()) / len(dc)
    constancy = dc.value_counts(normalize=True).max(
    )  #constancy defined as amount of most frequent value divided by amount of numbers in column

    dc_stats = [
        ["Metadaten spezifisch für Spalte",
         column_metadata(dc.name, ds_md)], ["Anzahl an Zeilen",
        ["Anzahl an fehlenden Werten", null_values],
        ["Fehlende Werte (Prozent)", (null_values / len(dc)) * TO_PERCENT],
        ["Distinkte Werte (Prozent)", unique_values * TO_PERCENT],
        ["Konstanz (Prozent)", constancy * TO_PERCENT],
        ["Mittelwert", format(dc.mean(), 'f')],
            "Minimumwert (Jahr, Wert)",
                dc.idxmin().date(): format(dc.min(), 'f')
            } if len(dc.dropna()) > 0 else "")
            "Maximumwert (Jahr, Wert)",
                dc.idxmax().date(): format(dc.max(), 'f')
            } if len(dc.dropna()) > 0 else "")
        ], ["Datenpunkte vorhanden für",

    profile = pd.DataFrame(data=dc_stats, columns=["Kriterien", "Ergebnis"])
    profile.set_index("Kriterien", inplace=True)

    return profile
Example #34
def describe_timestamp_1d(data: Series,
                          percentiles: Sequence[float]) -> Series:
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Count, mean, min, the requested percentiles and max of `data`,
        labelled "count", "mean", "min", the formatted percentiles and "max";
        the result carries the name of `data`.
    """
    # GH-30164
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
    d = ([data.count(), data.mean(), data.min()] +
         data.quantile(percentiles).tolist() + [data.max()])
    return Series(d, index=stat_index, name=data.name)
Example #35
    # Fits a DecisionTreeClassifier on the single feature `X` against `y` and
    # returns (parse_tree(DT.tree_), DT). Non-numeric features must be listed
    # in self.categorical_cols; features listed there are first passed through
    # self.encode_with_label.
    # NOTE(review): the `raise ValueError(` and `DecisionTreeClassifier(` calls
    # are never closed in this copy, so the method does not parse as shown.
    def _fit(self, X: pd.Series, y):
        if not is_numeric_dtype(X) and X.name not in self.categorical_cols:
            raise ValueError(
                'Column {} is not numeric and not in categorical_cols.'.format(

        if X.name in self.categorical_cols:
            X = self.encode_with_label(X, y)

        # the feature range is recorded only when self.encode is falsy
        if not self.encode:
            self.min_[X.name], self.max_[X.name] = X.min(), X.max()

        X, y = self._drop_na(X, y)
        # self.min_frac is either one number or a per-column mapping
        min_frac = self.min_frac if is_number(
            self.min_frac) else self.min_frac[X.name]
        DT = DecisionTreeClassifier(max_leaf_nodes=self.bins,
        DT.fit(X.to_frame(), y)
        return parse_tree(DT.tree_), DT
Example #36
def min_max_normalize(values: pd.Series) -> pd.Series:
    """
    Min-Max normalize a series.

    Each value is mapped to ``(value - min) / (max - min)``.

    Args:
        values: the series to normalize

    Returns:
        the normalized series

    >>> min_max_normalize(pd.Series([1,2,3]))
    0    0.0
    1    0.5
    2    1.0
    dtype: float64
    """
    max_value = values.max()
    min_value = values.min()
    return tp.cast(pd.Series, (values - min_value) / (max_value - min_value))
def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a date series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far;
            its "distinct_count_with_nan" entry caps the histogram bin count.

    Returns:
        A dict containing calculated series description values: "min", "max",
        "histogramdata" (the series itself), "histogram_bins" and "range".
    """
    stats = {"min": series.min(), "max": series.max(), "histogramdata": series}

    bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    bins = min(series_description["distinct_count_with_nan"], bins)
    stats["histogram_bins"] = bins

    stats["range"] = stats["max"] - stats["min"]

    return stats
Example #38
# Computes point estimates and confidence intervals for the sample `dist`.
# Returns (point, interval): `point` maps labels such as 'MED', 'Mean', 'S^2'
# to values rounded to 5 decimals (plus an HTML snippet of quantiles under
# 'X(alpha)'); `interval` maps 'Mean', 'S', 'E', 'A' and 'W' to interval bounds.
# NOTE(review): the docstring delimiters are missing in this copy, and both the
# t.ppf and the norm.ppf assignments sit inside `if N < 61:` - presumably an
# `else:` was lost, which also leaves `l`/`u` undefined when N >= 61.
def count_estims(dist, gamma = 0.95):
    Counts all estimates
    :param dist: dsitribution
    :param gamma: probability of realisation of value
    :return point: point estimates
    :return interval: confidance intervals for point estimates
    import numpy as np
    x = Series(dist)
    # Point estimates
    point = {}
    N = x.count()

    med_ = med_u(x)#
    med = np.median(dist)
    mad = x.mad()#
    mean_c = mean(dist)#
    var = np.var(dist)
    std = np.std(dist)
    mod = stats.mode(dist).mode#
    kurt = stats.kurtosis(dist)
    skew_my = stats.skew(dist)#
    Chi = 1/np.sqrt(np.abs(kurt))
    quantiles = np.round(x.quantile([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]), 5)
    W = std/mean_c;#

    quantiles_str = ""
    for index in quantiles.index:
        quantiles_str+='<p><pre>{0}\t{1}</pre></p>'.format(index, quantiles[index])

    point['MED'] = np.round(med, 5)
    point['MED*'] = np.round(med_, 5)
    point['MAD'] = np.round(mad, 5)
    point['Min'] = np.round(x.min(), 5)
    point['Max'] = np.round(x.max(), 5)
    point['Mean'] = np.round(mean_c, 5)
    point['S^2'] = np.round(var, 5)
    point['S'] = np.round(std, 5)
    point['MOD'] = np.round(mod, 5)
    point['E'] = np.round(kurt, 5)
    point['A'] = np.round(skew_my, 5)
    point['Chi'] = np.round(Chi, 5)
    point['X(alpha)'] = quantiles_str
    point['W'] = np.round(W, 5)

    # Interval estimates
    from scipy.stats import t, norm
    import numpy as np
    interval = {}
    if N < 61:
        l = t.ppf((1-gamma)/2, N-1)
        u = t.ppf(1-(1-gamma)/2, N-1)
        l = norm.ppf((1-gamma)/2)
        u = norm.ppf(1-(1-gamma)/2)
    X_cf = (mean_c+l*sigma_X(x), mean_c+u*sigma_X(x))
    A_cf = (skew_my + l * sigma_A(x), skew_my + u * sigma_A(x))
    S_cf = (std + l*sigma_S(x), std+u*sigma_S(x))
    E_cf = (kurt + l*sigma_E(x), kurt+u*sigma_E(x))
    if W < 1:
        v = l/np.sqrt(2*(N-1))
        W_cf = np.round((W/(1+v*np.sqrt(1+2*W**2)), W/(1-v*np.sqrt(1+2*W**2))), 5)
    else: W_cf = (None, None)

    interval['Mean'] = np.round(X_cf, 5)
    interval['S'] = np.round(S_cf, 5)
    interval['E'] = np.round(E_cf, 5)
    interval['A'] = np.round(A_cf, 5)
    interval['W'] = W_cf

    return point, interval
    # Loads daily time series from two databases and returns (endog, exog):
    # `endog` is the daily total series for the target country and `exog` is a
    # DataFrame with one column per enabled flag among self.paid, self.baseorg,
    # self.view and self.rank. Raises RuntimeError when view data is requested
    # for a start date before 2015-4-1, or when no exog column is enabled.
    # NOTE(review): Python 2 era code (xrange, resample(how=...)). In this copy
    # every "enabled" branch is immediately followed by `... = []` resets at the
    # same indentation - presumably `else:` headers were lost - and the first
    # pd.read_sql call is never closed, so the method does not parse as shown.
    # NOTE(review): two queries are built with `%` string interpolation of
    # self.target / a country name; confirm those values are trusted or switch
    # to query parameters.
    def source_data(self):
        st_date = self.stTrain
#        st_date = '2014-10-1'
        stD = date(int(st_date.split('-')[0]), int(st_date.split('-')[1]), int(st_date.split('-')[2]))
        if self.view and stD < datetime.datetime.strptime('2015-4-1',"%Y-%m-%d").date():
            raise RuntimeError('I know it sucks but we dont have view-count data for anytime before 2015-4-1!')
        if self.view:
            db_red = psycopg2.connect(host="***", database="***", port="***",
                                  user="******", password="******")
            db_red.autocommit = True
            df_red = pd.read_sql('''select date,sum(installs) as install, sum(pageviewcount) as view
                                from appstoredata_itunes_metrics where game='***' 
                                and country='%s' group by date;''' % pycountry.countries.get(alpha2=self.target).name, 
            df_red['date'] = pd.to_datetime(df_red['date'])
            ts_view_target1 = Series(df_red.view.tolist(), 
                                     index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_install_target1 = Series(df_red.install.tolist(), 
                                        index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_view_target1) < (self.endP-stD).days :
                ts_view_target1[pd.to_datetime(st_date)] = 0
                ts_view_target1 = ts_view_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
                ts_install_target1[pd.to_datetime(st_date)] = 0
                ts_install_target1 = ts_install_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_view_target = (ts_view_target1)/(ts_view_target1.sum())
            ts_install_target = (ts_install_target1)/(ts_install_target1.sum())
            ts_view_target = []
            ts_view_target1 = []
            ts_install_target = []  
            ts_install_target1 = []
        db = MySQLdb.connect(
        host = '***', 
        user = '******', 
        passwd = '***', 
        db = '***', 
        port = '***')
        df_mysql = pd.read_sql('''select metrics_daily.date as date, dim_country.name as country,
                               sum(metrics_daily.value) as value, dim_channel.channel_type as type
                               from metrics_daily left join dim_channel on dim_channel.id = metrics_daily.channel_id 
                               left join dim_country on dim_country.id = metrics_daily.country_id where project_id=195 
                               and metrics_daily.platform_id=2 and metric_id in (5) group by date, type, country;''', con=db)  
        df_mysql['date'] = pd.to_datetime(df_mysql['date'])
        all_data_target = df_mysql[df_mysql.country==self.target]
        org_data_target = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.target)]
        ts_org_target1 = Series(org_data_target.value.tolist(), 
                               index=org_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_all_target1 = Series(all_data_target.value.tolist(), 
                                index=all_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_org_target = (ts_org_target1)/(ts_org_target1.sum())
        ts_all_target = (ts_all_target1)/(ts_all_target1.sum())
        if self.baseorg:
            org_data_base = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.baseline)]
            ts_org_base1 = Series(org_data_base.value.tolist(), 
                                 index=org_data_base.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)   
            ts_org_base = (ts_org_base1-ts_org_base1.min())/(ts_org_base1.max()-ts_org_base1.min())
            ts_org_base = []
            ts_org_base1 = []
        if self.paid:
            paid_data_target = df_mysql[(df_mysql.type=='PAID') & (df_mysql.country==self.target)]
            ts_paid_target1 = Series(paid_data_target.value.tolist(),
                                    index=paid_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_paid_target1) < (self.endP-stD).days :
                ts_paid_target1[pd.to_datetime(st_date)] = 0
                ts_paid_target1 = ts_paid_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_paid_target = (ts_paid_target1)/(ts_paid_target1.sum())
            ts_paid_target = []
            ts_paid_target1 = []
        if self.rank:
            df_rank = pd.read_sql('''select date, max(1/sqrt(rank)) as bestRank from kabam_ranks_data_free where 
                                    country='%s' and device!='android'and game='***' 
                                    and category='Overall' group by date;''' % self.target, con=db)  
            df_rank['date'] = pd.to_datetime(df_rank['date'])
            ts_rank_target1 = Series(df_rank.bestRank.tolist(), 
                                     index=df_rank.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_rank_target1) < (self.endP-stD).days :
                ts_rank_target1[pd.to_datetime(st_date)] = 0
                ts_rank_target1 = ts_rank_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_rank_target = (ts_rank_target1)/(ts_rank_target1.sum())
            ts_rank_target = []
            ts_rank_target1 = []
#        endog = ts_org_target1
#        endog = ts_install_target
        endog = ts_all_target1
        # pair each feature flag with its series and column name; only the
        # enabled ones become exog columns
        Tlist = [self.paid, self.baseorg, self.view, self.rank]
        dff = DataFrame()
        tList = [ts_paid_target, ts_org_base, ts_view_target, ts_rank_target]
        tlist = ['paid', 'base', 'view', 'rank']
        for i in xrange(0,len(Tlist)):
            if Tlist[i]:
                dff[tlist[i]] = tList[i]
        if dff.empty:
            raise RuntimeError('Where is your exog variable? Do you need a coffee or something?!')
        exog = dff
        return (endog, exog)

