def test_sum_overflow(self, use_bottleneck): with pd.option_context('use_bottleneck', use_bottleneck): # GH#6915 # overflowing on the smaller int dtypes for dtype in ['int32', 'int64']: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) assert int(result) == v.sum(dtype='int64') result = s.min(skipna=False) assert int(result) == 0 result = s.max(skipna=False) assert int(result) == v[-1] for dtype in ['float32', 'float64']: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) assert result == v.sum(dtype=dtype) result = s.min(skipna=False) assert np.allclose(float(result), 0.0) result = s.max(skipna=False) assert np.allclose(float(result), v[-1])
def test_nat_operations():
    """Median, min and max of a [0, NaT] timedelta Series equal the first element (GH 8617)."""
    ser = Series([0, pd.NaT], dtype='m8[ns]')
    only_valid = ser[0]

    # Same order as before: median, then min, then max.
    for reduction in (ser.median, ser.min, ser.max):
        assert reduction() == only_valid
def test_min_max_numeric_only(self): # TODO deprecate numeric_only argument for Categorical and use # skipna as well, see GH25303 cat = Series(Categorical( ["a", "b", np.nan, "a"], categories=['b', 'a'], ordered=True)) _min = cat.min() _max = cat.max() assert np.isnan(_min) assert _max == "a" _min = cat.min(numeric_only=True) _max = cat.max(numeric_only=True) assert _min == "b" assert _max == "a" _min = cat.min(numeric_only=False) _max = cat.max(numeric_only=False) assert np.isnan(_min) assert _max == "a"
def test_overflow(self):
    """Mean of a large timedelta Series works, while summing it raises (GH 9442)."""
    s = Series(pd.date_range('20130101', periods=100000, freq='H'))
    s[0] += pd.Timedelta('1s 1ms')

    def since_min(ser):
        # Offsets of every element from the smallest element.
        return ser - ser.min()

    # mean
    result = since_min(s).mean()
    expected = pd.Timedelta(
        (pd.DatetimeIndex(since_min(s)).asi8 / len(s)).sum())

    # the computation is converted to float so
    # might be some loss of precision
    assert np.allclose(result.value / 1000, expected.value / 1000)

    # sum
    with pytest.raises(ValueError):
        since_min(s).sum()

    s1 = s[0:10000]
    with pytest.raises(ValueError):
        since_min(s1).sum()

    # The shortest slice is expected to sum without raising.
    s2 = s[0:1000]
    result = since_min(s2).sum()
def test_min_max(self):
    """Check min/max of categorical Series for unordered, ordered and NaN-containing data."""
    # unordered cats have no min/max
    unordered = Series(Categorical(["a", "b", "c", "d"], ordered=False))
    for reduction in (unordered.min, unordered.max):
        with pytest.raises(TypeError):
            reduction()

    # Ordered, default category order.
    cat = Series(Categorical(["a", "b", "c", "d"], ordered=True))
    _min, _max = cat.min(), cat.max()
    assert _min == "a"
    assert _max == "d"

    # Ordered, explicit reversed category order.
    cat = Series(Categorical(
        ["a", "b", "c", "d"], categories=['d', 'c', 'b', 'a'], ordered=True))
    _min, _max = cat.min(), cat.max()
    assert _min == "d"
    assert _max == "a"

    # Missing values among string categories.
    cat = Series(Categorical(
        [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a'],
        ordered=True))
    _min, _max = cat.min(), cat.max()
    assert np.isnan(_min)
    assert _max == "b"

    # Missing values among integer categories.
    cat = Series(Categorical(
        [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True))
    _min, _max = cat.min(), cat.max()
    assert np.isnan(_min)
    assert _max == 1
def test_name2num():
    """Check that ``name2num`` maps random 4-letter names into the expected numeric range.

    The bounds are built from the smallest and largest ``ord`` value among
    the ASCII letters, each repeated ``str_len`` times and weighted by
    powers of 256.
    """
    num_to_test = 10
    str_len = 4
    letters = string.ascii_letters

    # Map every ASCII letter to its code point.
    x = Series(dict(zip(letters, map(ord, letters))))

    base = 256 ** np.arange(str_len)
    mn = base.dot(np.repeat(x.min(), str_len))
    mx = base.dot(np.repeat(x.max(), str_len))

    # `range` instead of `xrange`: works on both Python 2 and Python 3.
    for _ in range(num_to_test):
        name = random.sample(letters, str_len)
        num = name2num(name)
        assert mn <= num <= mx
def test_min_max_skipna(self, skipna): # GH 25303 cat = Series( Categorical(["a", "b", np.nan, "a"], categories=["b", "a"], ordered=True)) _min = cat.min(skipna=skipna) _max = cat.max(skipna=skipna) if skipna is True: assert _min == "b" assert _max == "a" else: assert np.isnan(_min) assert np.isnan(_max)
def test_td64_summation_overflow():
    """Mean of a large timedelta Series works, while summing it raises (GH#9442)."""
    ser = Series(pd.date_range("20130101", periods=100000, freq="H"))
    ser[0] += pd.Timedelta("1s 1ms")

    def since_min(values):
        # Offsets of every element from the smallest element.
        return values - values.min()

    # mean
    result = since_min(ser).mean()
    expected = pd.Timedelta(
        (pd.TimedeltaIndex(since_min(ser)).asi8 / len(ser)).sum())

    # the computation is converted to float so
    # might be some loss of precision
    assert np.allclose(result.value / 1000, expected.value / 1000)

    # sum
    msg = "overflow in timedelta operation"
    for window in (ser, ser[0:10000]):
        with pytest.raises(ValueError, match=msg):
            since_min(window).sum()

    # The shortest slice is expected to sum without raising.
    s2 = ser[0:1000]
    since_min(s2).sum()
def get_stats(s: pd.Series):
    """
    Calculate basic statistics of sample `s`.

    Returns a list in this order:
    [q1, median, q3, p90, p95, p99, iqr, mean, std, min, max, n],
    where mean and std are rounded to two decimals and n is ``len(s)``.
    """
    q1 = s.quantile(0.25)
    middle = s.median()
    q3, p90, p95, p99 = (s.quantile(q) for q in (0.75, 0.90, 0.95, 0.99))

    return [
        q1,
        middle,
        q3,
        p90,
        p95,
        p99,
        q3 - q1,
        round(s.mean(), 2),
        round(s.std(), 2),
        s.min(),
        s.max(),
        len(s),
    ]
def numeric_stats_pandas(series: pd.Series):
    """Return a dict of summary statistics computed with pandas reductions.

    Keys, in order: mean, std, variance, min, max, kurtosis, skewness, sum.
    """
    # summary["min"] = summary["value_counts_without_nan"].index.min()
    # vc.index.min()

    # (result key, Series method) pairs, evaluated in this order.
    # kurtosis: unbiased, Fisher's definition (kurtosis of normal == 0.0),
    #           normalized by N-1.
    # skewness: unbiased, normalized by N-1.
    reductions = (
        ("mean", "mean"),
        ("std", "std"),
        ("variance", "var"),
        ("min", "min"),
        ("max", "max"),
        ("kurtosis", "kurt"),
        ("skewness", "skew"),
        ("sum", "sum"),
    )
    return {key: getattr(series, method)() for key, method in reductions}
def compress_float(series: pd.Series) -> pd.Series:
    """
    Cast a float series to one of float16/float32/float64, as chosen by
    ``get_compressed_type`` from the series' min and max.

    Compressing to half-precision floating-point format can degrade
    computational performance: CPUs often do not have native support for
    16-bit floats and simulate the data type.
    https://en.wikipedia.org/wiki/Half-precision_floating-point_format
    https://stackoverflow.com/a/49997863/470433
    https://stackoverflow.com/a/15341193/470433

    :param series: the float series to compress
    :return: the series cast to the selected dtype
    """
    # Candidates are listed from narrowest to widest.
    candidate_types = [np.float16, np.float32, np.float64]

    # presumably the tester checks whether [min, max] fits a dtype's
    # np.finfo limits - verify against type_tester.
    fits = type_tester(series.min(), series.max(), np.finfo)

    return series.astype(get_compressed_type(candidate_types, fits))
def test_min_max_dt64_api_consistency_with_NaT(self): # Calling the following sum functions returned an error for dataframes but # returned NaT for series. These tests check that the API is consistent in # min/max calls on empty Series/DataFrames. See GH:33704 for more # information df = DataFrame(dict(x=pd.to_datetime([]))) expected_dt_series = Series(pd.to_datetime([])) # check axis 0 assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT) assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT) # check axis 1 tm.assert_series_equal(df.min(axis=1), expected_dt_series) tm.assert_series_equal(df.max(axis=1), expected_dt_series)
def _interval_index(facet: pd.Series, thresholds: Optional[List[Any]]) -> pd.IntervalIndex: """ Creates a Interval Index from list of threshold values. See pd.IntervalIndex.from_breaks Ex. [0,1,2] -> [(0, 1], (1,2]] :param facet: input data series :param thresholds: list of int or float values defining the threshold splits :return: pd.IntervalIndex """ if not thresholds: raise ValueError("Threshold values must be provided for continuous features") facet_max, facet_min = facet.max(), facet.min() threshold_intervals = thresholds.copy() # add max value if not exists in threshold limits if abs(facet_max) not in thresholds: threshold_intervals.append(facet_max) return pd.IntervalIndex.from_breaks(threshold_intervals)
def infer_vmin_vmax(data:pd.Series, continuous_type="infer"):
    """Return ``(vmin, vmax)`` limits for `data`.

    Parameters
    ----------
    data : pd.Series
        Numeric values.
    continuous_type : str or None
        "diverging" gives limits symmetric around zero (+/- the largest
        absolute value); "sequential" gives the data's own min and max.
        "infer" or None delegates the choice to `infer_continuous_type`.

    Raises
    ------
    AssertionError
        If `continuous_type` resolves to neither "diverging" nor "sequential".
    """
    vmin = None
    vmax = None

    # Infer continuous type
    if continuous_type in ["infer", None]:
        continuous_type = infer_continuous_type(data)

    # +/-
    if continuous_type == "diverging":
        vmax = data.abs().max()
        vmin = -vmax

    # Other
    if continuous_type == "sequential":
        vmax = data.max()
        vmin = data.min()

    # Compare against None explicitly: 0 is a legitimate limit and must not
    # trip the check the way a truthiness test would.
    assert vmin is not None and vmax is not None, "`vmin` and `vmax` should not be None at this point. Please check `infer_continuous_type`"
    return vmin, vmax
def calc_data_identifier(feature: pd.Series):
    """Build a JSON string identifying `feature` by summary statistics.

    Numeric series contribute mean/max/min/std (rounded to 9 decimals);
    string and datetime series contribute their value counts. The row count
    is always included. Keys are sorted in the JSON output.

    Raises:
        Exception: if the dtype is neither numeric, string nor datetime.
    """
    precision = 9
    dtype = feature.dtype
    data_analysis = {'n_rows': len(feature)}

    if pd.api.types.is_numeric_dtype(dtype):
        for stat in ('mean', 'max', 'min', 'std'):
            data_analysis[stat] = round(float(getattr(feature, stat)()), precision)
    elif pd.api.types.is_string_dtype(dtype):
        data_analysis['value_counts'] = feature.value_counts().to_dict()
    elif is_datetime(dtype):
        # convert datetime values to string to make them json exportable
        counts = feature.value_counts()
        counts.index = counts.index.astype(str)
        data_analysis['value_counts'] = counts.to_dict()
    else:
        raise Exception(f"Unexpected feature dtype: {feature.dtype}")

    return json.dumps(data_analysis, sort_keys=True)
def create_histogram(series: pd.Series, cwd: str, ranking_score_capping: float, limit: int) -> None:
    """Plot a histogram of ranking scores in 0.01-wide bins and save it as ``histogram.png``.

    Args:
        series: ranking scores; the title describes them as sorted by score.
        cwd: directory the image is written to.
        ranking_score_capping: capping value; drawn as a vertical line when
            it is greater than the lowest score (rounded to one decimal).
        limit: message limit; when it does not exceed the number of non-null
            scores, the score at that position is drawn as a vertical line.
    """
    series_max_boundary = 1
    series_min_boundary = round(series.min(), 1)
    ranking_score_capping_bool = bool(ranking_score_capping > series_min_boundary)
    num_of_messages = series.count()

    if series_min_boundary >= 0:
        bin_num = int((series_max_boundary + series_min_boundary) * 100)
    else:
        bin_num = int((series_max_boundary + abs(series_min_boundary)) * 100)

    # Bin edges start at the rounded minimum and advance in 0.01 steps.
    bin_seq = []
    for _ in range(bin_num - 1):
        bin_seq.append(round(series_min_boundary, 2))
        series_min_boundary += 0.01

    fig = Figure(figsize=(18, 9), facecolor='#ffffff')
    ax = fig.add_subplot()
    ax.set_facecolor('#e5e5e5')
    ax.hist(series, bins=bin_seq, color="#61ade0", label="num of message scores in 0.01 bins")
    ax.grid(color='#000000', linestyle="--")

    if ranking_score_capping_bool:
        ax.axvline(ranking_score_capping, color="#df1b12", linestyle="--", linewidth=3,
                   label=f"capping value ({ranking_score_capping})")

    if limit <= num_of_messages:
        # Clamp to the last valid position: with limit == len(series) a plain
        # series.iloc[limit] would raise IndexError.
        limit_index = min(limit, len(series) - 1)
        limit_score_value = series.iloc[limit_index]
        ax.axvline(
            limit_score_value,
            color="#fdc530",
            linestyle="--",
            linewidth=3,
            label=f"limit from configuration file ({limit})"
                  f"\nscore of last message before the limit is ~ {round(limit_score_value, 3)}"
        )

    ax.legend(loc='upper right', fontsize=10, shadow=True, facecolor='#ffffff')
    ax.set_title(
        f"Unfiltered messages ({num_of_messages}), sorted by ranking score, before using capping and limit",
        fontsize=10)
    ax.set_xlabel("Ranking score", fontsize=10)
    ax.set_ylabel("Num of messages in 0.01 bins", fontsize=10)
    fig.savefig(join(cwd + sep + 'histogram.png'))
def points_calculation_tree(var: pd.Series,
                            target: pd.Series,
                            min_size: float = 5,
                            rnd: int = 2) -> list:
    """
    Calculate cut points for binning a numeric variable against a target,
    using the split thresholds of a fitted decision tree.

    Keyword arguments:
        var (pd.Series) -- Numeric variable
        target (pd.Series) -- Target binary variable
        min_size (float) -- minimum size of group in percent
        rnd -- Round level for variable values (default 2)

    Returns a list starting with -inf and ending with +inf; when the variable
    is almost entirely null or almost constant, only those two bounds.
    """
    size = var.shape[0]
    var_name = var.name
    max_share = 1 - min_size / 100

    # Guard: too many missing values to bin.
    if (var.isnull().sum() / size) > max_share:
        print('WARNING! Variable "{vname}" has too much null values!'.format(
            vname=var_name))
        return [-np.inf, np.inf]

    # Guard: a single value dominates the variable.
    if (var.value_counts(dropna=True).max() / size) > max_share:
        print('WARNING! Variable "{vname}" has too often one value!'.format(
            vname=var_name))
        return [-np.inf, np.inf]

    present = var.notnull()
    var = var[present]
    target = target[present]
    xmin, xmax = var.min(), var.max()

    # The leaf size is taken relative to the full (pre-filter) row count.
    min_samples = round(min_size * size / 100)
    clf = tree.DecisionTreeClassifier(min_samples_leaf=min_samples,
                                      random_state=777)
    clf.fit(var.to_frame(), target)

    cut_points = pd.Series(
        clf.tree_.threshold).value_counts().to_frame('cnt').reset_index()
    cut_points.columns = ['point', 'cnt']

    # Keep thresholds that occur exactly once and fall inside the data range.
    keep = ((cut_points['cnt'] == 1)
            & (cut_points['point'] <= xmax)
            & (cut_points['point'] >= xmin))
    inner = cut_points.loc[keep, 'point'].sort_values(ascending=True).values

    return [-np.inf] + list(inner.round(rnd)) + [np.inf]
def showNumericalInfo(data:pd.Series):
    '''
    @Description
    Print summary statistics of a numeric Series: missing count, number of
    unique values, max, min, mean, median, mode, and the ten most frequent
    values.
    ------------
    @Params
    data, Series
    '''
    print(data.name, data.dtype)

    # Each statistic is computed lazily, right before it is printed.
    summary = (
        ("Miss:", lambda: data.isnull().sum()),
        ("Unique:", lambda: data.nunique()),
        ("Max:", lambda: data.max()),
        ("Min:", lambda: data.min()),
        ("Mean:", lambda: data.mean()),
        ("Median:", lambda: data.median()),
        ("Mode:", lambda: data.mode()[0]),
    )
    for label, compute in summary:
        print(label, compute())

    print(data.value_counts().head(n=10))
def discretize_series(series: pd.Series) -> pd.Series:
    """Map each value to one of four tiers: "Very Low", "Low", "High", "Very High".

    The cut points are half the series minimum, zero, and half the series
    maximum. The result is a new Series with a default index.
    """
    low_cut = series.min() / 2.0
    mid_cut = 0.0
    high_cut = series.max() / 2.0

    def get_tier(val):
        # Checked from the lowest tier upwards; anything that matches no
        # range falls through to "Very High".
        if val < low_cut:
            return "Very Low"
        if low_cut <= val < mid_cut:
            return "Low"
        if mid_cut <= val < high_cut:
            return "High"
        return "Very High"

    return pd.Series(list(map(get_tier, series)))
def infer_continuous_type(data:pd.Series):
    """Classify numeric data as "diverging" or "sequential".

    Data whose minimum is below zero and whose maximum is above zero is
    "diverging"; everything else is "sequential".
    """
    assert all(data.map(is_number)), "All values in `data` must be numerical"

    has_negative = data.min() < 0
    has_positive = data.max() > 0

    # +/-
    if has_negative and has_positive:
        return "diverging"
    # Other
    return "sequential"
def transform(self, target: pd.Series) -> pd.Series:
    """
    Apply a logarithmic transformation to the target variable.

    Parameters
    ----------
    target: pandas.Series, shape = [n_samples, ]
        Target variable vector.

    Returns
    -------
    log_target: pandas.Series, shape = [n_samples, ]
        Log-transformed target variable vector.

    """
    # NOTE(review): this only accesses the attribute; if `check_is_fitted`
    # is a regular method (not a property) it is never called - confirm.
    self.check_is_fitted

    # NOTE(review): the stored minimum is recomputed from the data being
    # transformed, replacing any previously stored value - confirm intended.
    self.target_min = target.min()

    shifted = target - self.target_min + 1 + self.bias
    return np.log(shifted)
def forkAnalysis(fileName):
    # Read a GBK-encoded CSV, pick the entries of its 'block' column that
    # contain the orphan marker, and print max / min / mean / variance of the
    # extracted numbers after turning them into successive differences.
    #
    # NOTE: Python 2 code (print statement, xrange, Series.iteritems,
    # byte-string `in` test on an encoded value).
    df = pd.read_csv(fileName, encoding='gbk')
    block = df['block']
    result = []
    # print block
    for i, row in block.iteritems():
        # Keep only rows carrying the marker text; the number is parsed from
        # the row with its last five characters dropped.
        if '已孤儿' in row.encode('utf-8'):
            result.append(int(row[:-5].encode('utf-8')))
    # NOTE(review): raises IndexError when no row matched - confirm callers
    # always supply at least one matching row.
    pre = result[0]
    # Replace every element after the first by its difference to the previous
    # original value; result[0] keeps its absolute value and is still part of
    # the statistics below.
    for i in xrange(1, len(result)):
        temp = result[i]
        result[i] = result[i] - pre
        pre = temp
    ser = Series(result)
    print '---------------分叉相关--------------------'
    print '最大值:', ser.max(), '最小值:', ser.min(), '平均值:', ser.mean(
    ), '方差:', ser.var()
def get_segment_bound(self, segment: pd.Series, bound: Bound) -> pd.Series: ''' segment is divided by the median to determine its top or bottom part the part is smoothed and raised above the segment or put down below the segment ''' if len(segment) < 2: return segment comparison_operator = operator.gt if bound == Bound.UPPER else operator.le segment = segment - segment.min() segment_median = segment.median() part = [val if comparison_operator(val, segment_median) else segment_median for val in segment.values] part = pd.Series(part, index = segment.index) smoothed_part = utils.exponential_smoothing(part, BASIC_ALPHA) difference = [abs(x - y) for x, y in zip(part, smoothed_part)] max_diff = max(difference) bound = [val + max_diff for val in smoothed_part.values] bound = pd.Series(bound, index = segment.index) return bound
def hist_bin_width_fd(x: pd.Series) -> float:
    """Compute a histogram bin width with the Freedman-Diaconis rule.

    The width is ``2 * IQR * n ** (-1/3)``. A RuntimeWarning is emitted when
    that width is tiny relative to the data range.

    Args:
        x (pd.Series): Series of data to use to generate the bin width.

    Returns:
        float: The bin width.
    """
    upper_quartile, lower_quartile = np.percentile(x, [75, 25])
    iqr = np.subtract(upper_quartile, lower_quartile)
    h = 2.0 * iqr * x.size**(-1.0 / 3.0)

    data_range = x.max() - x.min()
    too_many_bins = data_range / h > 1e8 * x.size
    if too_many_bins:
        warnings.warn(
            "Bin width estimated with the Freedman-Diaconis rule is very small"
            " (= {})".format(h),
            RuntimeWarning,
            stacklevel=2)
    return h
def fit(self, target: pd.Series) -> None:
    """
    Compute the minimum of the target variable, so that the logarithm can
    be computed correctly on negative values.

    Parameters
    ----------
    target: pandas.Series, shape = [n_samples, ]
        Target variable vector.

    Returns
    -------
    self

    """
    # NOTE(review): the signature is annotated `-> None` although the
    # instance itself is returned - confirm which one is intended.
    self.fitted = True
    self.target_min = target.min()
    return self
def test_timedelta64_analytics(self): # index min/max dti = pd.date_range('2012-1-1', periods=3, freq='D') td = Series(dti) - pd.Timestamp('20120101') result = td.idxmin() assert result == 0 result = td.idxmax() assert result == 2 # GH#2982 # with NaT td[0] = np.nan result = td.idxmin() assert result == 1 result = td.idxmax() assert result == 2 # abs s1 = Series(pd.date_range('20120101', periods=3)) s2 = Series(pd.date_range('20120102', periods=3)) expected = Series(s2 - s1) # FIXME: don't leave commented-out code # this fails as numpy returns timedelta64[us] # result = np.abs(s1-s2) # assert_frame_equal(result,expected) result = (s1 - s2).abs() tm.assert_series_equal(result, expected) # max/min result = td.max() expected = pd.Timedelta('2 days') assert result == expected result = td.min() expected = pd.Timedelta('1 days') assert result == expected
def do_stats_numeric(series: pd.Series, updated_dict: dict):
    """Fill ``updated_dict["stats"]`` with numeric summary statistics of `series`.

    Adds max, mean, the 95/75/50/25/5 percentiles (as ``perc95`` ... ``perc5``),
    min, range, iqr, std, variance, kurtosis, skewness, sum, mad (mean
    absolute deviation around the mean) and cv (std / mean, NaN when the
    mean is zero).

    Args:
        series: numeric data to summarise.
        updated_dict: dict holding a "stats" mapping, which is updated in place.

    Returns:
        The same `updated_dict` object.
    """
    stats = updated_dict["stats"]
    stats["max"] = series.max()
    stats["mean"] = series.mean()
    for percentile, value in series.quantile([0.95, 0.75, 0.50, 0.25, 0.05]).to_dict().items():
        stats[f"perc{int(percentile*100)}"] = value
    stats["min"] = series.min()
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats["perc75"] - stats["perc25"]
    stats["std"] = series.std()
    stats["variance"] = series.var()
    stats["kurtosis"] = series.kurt()
    stats["skewness"] = series.skew()
    stats["sum"] = series.sum()
    # Series.mad() was removed in pandas 2.0; this expression is the
    # documented equivalent and works on every pandas version.
    stats["mad"] = (series - series.mean()).abs().mean()
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.nan
    return updated_dict
def _make_grid(values: Series, size: int, attempt_geometric: bool) -> MakeGridResult:
    """Build a grid of `size` points spanning the range of `values`.

    A geometric (np.geomspace) grid is used when requested and possible,
    otherwise a linear (np.linspace) one. Geometric spacing is refused, with
    an explanatory message, when the data contains negative values or has no
    positive maximum. When the minimum is exactly zero, the geometric grid
    starts at the second-smallest distinct value instead.
    """
    lower, upper = values.min(), values.max()
    note = None
    use_geometric = attempt_geometric

    if use_geometric:
        if lower < 0 or upper <= 0:
            note = (
                "Refusing to create a geometric grid for a series with negative or all-zero values"
            )
            use_geometric = False
        elif lower == 0:
            # A geometric grid cannot start at zero; use the next distinct value.
            lower = values.drop_duplicates().nsmallest(2).iloc[1]
            assert lower != 0

    spacing: Any = np.geomspace if use_geometric else np.linspace
    return MakeGridResult(
        grid=spacing(lower, upper, size),
        geometric=use_geometric,
        message=note,
    )
def test_timedelta64_analytics(self): # index min/max dti = pd.date_range("2012-1-1", periods=3, freq="D") td = Series(dti) - pd.Timestamp("20120101") result = td.idxmin() assert result == 0 result = td.idxmax() assert result == 2 # GH#2982 # with NaT td[0] = np.nan result = td.idxmin() assert result == 1 result = td.idxmax() assert result == 2 # abs s1 = Series(pd.date_range("20120101", periods=3)) s2 = Series(pd.date_range("20120102", periods=3)) expected = Series(s2 - s1) # FIXME: don't leave commented-out code # this fails as numpy returns timedelta64[us] # result = np.abs(s1-s2) # assert_frame_equal(result,expected) result = (s1 - s2).abs() tm.assert_series_equal(result, expected) # max/min result = td.max() expected = pd.Timedelta("2 days") assert result == expected result = td.min() expected = pd.Timedelta("1 days") assert result == expected
def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a date series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far
            (not read by this function).

    Returns:
        A dict with the keys "min", "max", "histogramdata" (the series
        itself) and "range" (max minus min).
    """
    earliest = series.min()
    latest = series.max()

    stats = {
        "min": earliest,
        "max": latest,
        "histogramdata": series,  # TODO: calc histogram here?
        "range": latest - earliest,
    }
    return stats
def hist_in_range(
        series: pandas.Series,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
        bins: int = 50) \
        -> Any:
    """Display histogram of values in a given range.

    Also prints the percentage of non-null values that fall inside the range.

    Arguments:
        series: pd.Series to compute the histogram on.
        min_value: Minimum value in series to be used; defaults to the
            series minimum when None.
        max_value: Maximum value in series to be used; defaults to the
            series maximum when None.
        bins: Number of histogram bins.

    Returns:
        The axes object of the plot.
    """
    # Test against None explicitly so that a bound of 0 is honoured instead
    # of being replaced by the series extreme.
    if min_value is None:
        min_value = series.min()
    if max_value is None:
        max_value = series.max()

    plot_range = (series >= min_value) & (series <= max_value)
    range_perc = plot_range.sum() / series.count() * 100
    print(f'{range_perc:.2f}% of values in range')
    return series[plot_range].hist(bins=bins)
def split(
        self,
        X: pd.DataFrame,
        reference: pd.Series,
        bins: int = 10,
        shuffle: Union[bool] = None,
        random_state: Union[int] = None) -> Tuple[np.ndarray, np.ndarray]:
    """Yield stratified (train, validation) index pairs for a continuous reference.

    The reference values are cut into equal-width bins between
    ``int(min - 1)`` and ``int(max + 1)``; the bin labels then serve as the
    stratification target of a StratifiedKFold with ``self.n_split`` folds.
    `shuffle` and `random_state` fall back to the instance attributes when
    None.
    """
    if shuffle is None:
        shuffle = self.shuffle
    if random_state is None:
        random_state = self.random_state

    lower = int(reference.min() - 1)
    upper = int(reference.max() + 1)
    edges = np.linspace(lower, upper, bins)
    strata = pd.cut(reference, bins=edges, labels=False)

    folds = StratifiedKFold(self.n_split, shuffle=shuffle, random_state=random_state)
    yield from folds.split(X, strata)
def describe_dc_as_dataframe(dc: pd.Series, ds_md: dict) -> pd.Series:
    """
    Build the profile table for a single column.

    The column is coerced to numeric first (unparseable entries become NaN).

    Args:
        dc: the Series to create the profile for
        ds_md: the metadata dictionary of the DataFrame that is to be profiled

    Returns:
        A table of calculated description values, indexed by criterion name.
        NOTE(review): the annotation says pd.Series but a DataFrame is
        built and returned - confirm.
    """
    dc = pd.to_numeric(dc, errors='coerce')
    n_rows = len(dc)
    null_values = dc.isna().sum()
    unique_values = len(dc.dropna().unique()) / n_rows
    # constancy defined as amount of most frequent value divided by amount
    # of numbers in column
    constancy = dc.value_counts(normalize=True).max()

    def extreme(position_of, value_of):
        # {date of the extreme: formatted value}, or "" when the column has
        # no non-null values at all.
        if len(dc.dropna()) > 0:
            return {position_of().date(): format(value_of(), 'f')}
        return ""

    # Rows are appended one by one so every value is computed in table order.
    dc_stats = []
    dc_stats.append(["Metadaten spezifisch für Spalte", column_metadata(dc.name, ds_md)])
    dc_stats.append(["Anzahl an Zeilen", len(dc)])
    dc_stats.append(["Anzahl an fehlenden Werten", null_values])
    dc_stats.append(["Fehlende Werte (Prozent)", (null_values / len(dc)) * TO_PERCENT])
    dc_stats.append(["Distinkte Werte (Prozent)", unique_values * TO_PERCENT])
    dc_stats.append(["Konstanz (Prozent)", constancy * TO_PERCENT])
    dc_stats.append(["Mittelwert", format(dc.mean(), 'f')])
    dc_stats.append(["Minimumwert (Jahr, Wert)", extreme(dc.idxmin, dc.min)])
    dc_stats.append(["Maximumwert (Jahr, Wert)", extreme(dc.idxmax, dc.max)])
    dc_stats.append(["Datenpunkte vorhanden für", check_is_consecutive(dc)])

    profile = pd.DataFrame(data=dc_stats, columns=["Kriterien", "Ergebnis"])
    profile.set_index("Kriterien", inplace=True)
    return profile
def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing datetime64 dtype.

    The result holds count, mean, min, the requested percentiles and max,
    in that order, and carries the name of the input series.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    # GH-30164
    from pandas import Series

    percentile_labels = format_percentiles(percentiles)
    stat_index = ["count", "mean", "min", *percentile_labels, "max"]

    values = [
        data.count(),
        data.mean(),
        data.min(),
        *data.quantile(percentiles).tolist(),
        data.max(),
    ]
    return Series(values, index=stat_index, name=data.name)
def _fit(self, X: pd.Series, y): if not is_numeric_dtype(X) and X.name not in self.categorical_cols: raise ValueError( 'Column {} is not numeric and not in categorical_cols.'.format( X.name)) if X.name in self.categorical_cols: X = self.encode_with_label(X, y) if not self.encode: self.min_[X.name], self.max_[X.name] = X.min(), X.max() X, y = self._drop_na(X, y) min_frac = self.min_frac if is_number( self.min_frac) else self.min_frac[X.name] DT = DecisionTreeClassifier(max_leaf_nodes=self.bins, min_samples_leaf=min_frac, random_state=self.random_state) DT.fit(X.to_frame(), y) return parse_tree(DT.tree_), DT
def min_max_normalize(values: pd.Series) -> pd.Series:
    """
    Min-Max normalize a series: shift by the minimum and divide by the
    (max - min) span.

    Args:
        values: the series to normalize

    Returns:
        the normalized series

    Test:
        >>> min_max_normalize(pd.Series([1,2,3]))
        0    0.0
        1    0.5
        2    1.0
        dtype: float64
    """
    lowest = values.min()
    span = values.max() - lowest
    normalized = (values - lowest) / span
    return tp.cast(pd.Series, normalized)
def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a date series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far;
            its "distinct_count_with_nan" entry caps the histogram bin count.

    Returns:
        A dict with the keys "min", "max", "histogramdata" (the series
        itself), "histogram_bins" and "range" (max minus min).
    """
    stats = {
        "min": series.min(),
        "max": series.max(),
        "histogramdata": series,
    }

    configured_bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    stats["histogram_bins"] = min(
        series_description["distinct_count_with_nan"], configured_bins)

    stats["range"] = stats["max"] - stats["min"]
    return stats
def count_estims(dist, gamma = 0.95):
    '''
    Compute point estimates and confidence intervals for a sample.

    :param dist: distribution (the sample values)
    :param gamma: confidence probability used for the intervals
    :return point: dict of point estimates, rounded to 5 decimals
        (plus an HTML-formatted quantile listing under 'X(alpha)')
    :return interval: dict of confidence intervals for the mean ('Mean'),
        standard deviation ('S'), kurtosis ('E'), skewness ('A') and
        coefficient of variation ('W')
    '''
    import numpy as np
    x = Series(dist)
    # Point estimates
    point = {}
    N = x.count()
    med_ = med_u(x)
    med = np.median(dist)
    # NOTE(review): Series.mad() was removed in pandas 2.0 - confirm the
    # pinned pandas version.
    mad = x.mad()
    mean_c = mean(dist)
    var = np.var(dist)
    std = np.std(dist)
    mod = stats.mode(dist).mode
    kurt = stats.kurtosis(dist)
    skew_my = stats.skew(dist)
    Chi = 1/np.sqrt(np.abs(kurt))
    quantiles = np.round(x.quantile([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]), 5)
    # W: coefficient of variation (std / mean).
    W = std/mean_c;
    # One HTML paragraph per quantile: "<level>\t<value>".
    quantiles_str = ""
    for index in quantiles.index:
        quantiles_str+='<p><pre>{0}\t{1}</pre></p>'.format(index, quantiles[index])
    point['MED'] = np.round(med, 5)
    point['MED*'] = np.round(med_, 5)
    point['MAD'] = np.round(mad, 5)
    point['Min'] = np.round(x.min(), 5)
    point['Max'] = np.round(x.max(), 5)
    point['Mean'] = np.round(mean_c, 5)
    point['S^2'] = np.round(var, 5)
    point['S'] = np.round(std, 5)
    point['MOD'] = np.round(mod, 5)
    point['E'] = np.round(kurt, 5)
    point['A'] = np.round(skew_my, 5)
    point['Chi'] = np.round(Chi, 5)
    point['X(alpha)'] = quantiles_str
    point['W'] = np.round(W, 5)
    # Interval estimates
    from scipy.stats import t, norm
    import numpy as np
    interval = {}
    # Lower/upper quantiles: Student's t with N-1 degrees of freedom for
    # samples below 61 observations, standard normal otherwise.
    if N < 61:
        l = t.ppf((1-gamma)/2, N-1)
        u = t.ppf(1-(1-gamma)/2, N-1)
    else:
        l = norm.ppf((1-gamma)/2)
        u = norm.ppf(1-(1-gamma)/2)
    # Each interval is estimate + quantile * sigma_*(x); the sigma_* helpers
    # are defined elsewhere (presumably standard errors - verify).
    X_cf = (mean_c+l*sigma_X(x), mean_c+u*sigma_X(x))
    A_cf = (skew_my + l * sigma_A(x), skew_my + u * sigma_A(x))
    S_cf = (std + l*sigma_S(x), std+u*sigma_S(x))
    E_cf = (kurt + l*sigma_E(x), kurt+u*sigma_E(x))
    # The interval for W is only computed when W < 1; otherwise it is
    # reported as (None, None).
    if W < 1:
        v = l/np.sqrt(2*(N-1))
        W_cf = np.round((W/(1+v*np.sqrt(1+2*W**2)), W/(1-v*np.sqrt(1+2*W**2))), 5)
    else:
        W_cf = (None, None)
    interval['Mean'] = np.round(X_cf, 5)
    interval['S'] = np.round(S_cf, 5)
    interval['E'] = np.round(E_cf, 5)
    interval['A'] = np.round(A_cf, 5)
    interval['W'] = W_cf
    return point, interval
def source_data(self):
    # Load the time series used for modelling and return (endog, exog).
    #
    # endog is the daily total series for the target country; exog is a
    # DataFrame holding one column per enabled regressor (self.paid,
    # self.baseorg, self.view, self.rank).
    #
    # Raises RuntimeError when view data is requested for a training start
    # before 2015-4-1, or when no regressor is enabled.
    #
    # NOTE: Python 2 code (xrange).
    # NOTE(review): resample(..., how='sum') is a legacy pandas call style -
    # confirm the pinned pandas version.
    # NOTE(review): both SQL statements below interpolate values with '%';
    # confirm those values never come from untrusted input.
    st_date = self.stTrain
    # st_date = '2014-10-1'
    # Training start as a date object, parsed from 'YYYY-M-D'.
    stD = date(int(st_date.split('-')[0]), int(st_date.split('-')[1]), int(st_date.split('-')[2]))
    if self.view and stD < datetime.datetime.strptime('2015-4-1',"%Y-%m-%d").date():
        raise RuntimeError('I know it sucks but we dont have view-count data for anytime before 2015-4-1!')
    if self.view:
        # Daily installs and page views for the target country.
        db_red = psycopg2.connect(host="***", database="***", port="***", user="******", password="******")
        db_red.autocommit = True
        df_red = pd.read_sql('''select date,sum(installs) as install, sum(pageviewcount) as view from appstoredata_itunes_metrics where game='***' and country='%s' group by date;''' % pycountry.countries.get(alpha2=self.target).name, con=db_red)
        df_red['date'] = pd.to_datetime(df_red['date'])
        ts_view_target1 = Series(df_red.view.tolist(), index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_install_target1 = Series(df_red.install.tolist(), index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        # When the series is shorter than the requested window, insert a zero
        # at the start date and resample again so the window is filled.
        if len(ts_view_target1) < (self.endP-stD).days :
            ts_view_target1[pd.to_datetime(st_date)] = 0
            ts_view_target1 = ts_view_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_install_target1[pd.to_datetime(st_date)] = 0
            ts_install_target1 = ts_install_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
        # Normalise each series by its own total.
        ts_view_target = (ts_view_target1)/(ts_view_target1.sum())
        ts_install_target = (ts_install_target1)/(ts_install_target1.sum())
    else:
        ts_view_target = []
        ts_view_target1 = []
        ts_install_target = []
        ts_install_target1 = []
    # Daily metric values per country and channel type.
    db = MySQLdb.connect( host = '***', user = '******', passwd = '***', db = '***', port = '***')
    df_mysql = pd.read_sql('''select metrics_daily.date as date, dim_country.name as country, sum(metrics_daily.value) as value, dim_channel.channel_type as type from metrics_daily left join dim_channel on dim_channel.id = metrics_daily.channel_id left join dim_country on dim_country.id = metrics_daily.country_id 
where project_id=195 and metrics_daily.platform_id=2 and metric_id in (5) group by date, type, country;''', con=db)
    df_mysql['date'] = pd.to_datetime(df_mysql['date'])
    all_data_target = df_mysql[df_mysql.country==self.target]
    org_data_target = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.target)]
    ts_org_target1 = Series(org_data_target.value.tolist(), index=org_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
    ts_all_target1 = Series(all_data_target.value.tolist(), index=all_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
    ts_org_target = (ts_org_target1)/(ts_org_target1.sum())
    ts_all_target = (ts_all_target1)/(ts_all_target1.sum())
    if self.baseorg:
        # Organic series of the baseline country, min-max scaled.
        org_data_base = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.baseline)]
        ts_org_base1 = Series(org_data_base.value.tolist(), index=org_data_base.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_org_base = (ts_org_base1-ts_org_base1.min())/(ts_org_base1.max()-ts_org_base1.min())
    else:
        ts_org_base = []
        ts_org_base1 = []
    if self.paid:
        # Paid series of the target country, normalised by its total.
        paid_data_target = df_mysql[(df_mysql.type=='PAID') & (df_mysql.country==self.target)]
        ts_paid_target1 = Series(paid_data_target.value.tolist(), index=paid_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        if len(ts_paid_target1) < (self.endP-stD).days :
            ts_paid_target1[pd.to_datetime(st_date)] = 0
            ts_paid_target1 = ts_paid_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_paid_target = (ts_paid_target1)/(ts_paid_target1.sum())
    else:
        ts_paid_target = []
        ts_paid_target1 = []
    if self.rank:
        # Best daily rank score, max(1/sqrt(rank)), normalised by its total.
        df_rank = pd.read_sql('''select date, max(1/sqrt(rank)) as bestRank from kabam_ranks_data_free where country='%s' and device!='android'and game='***' and category='Overall' group by date;''' % self.target, con=db)
        df_rank['date'] = pd.to_datetime(df_rank['date'])
        ts_rank_target1 = Series(df_rank.bestRank.tolist(), 
                                 index=df_rank.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        if len(ts_rank_target1) < (self.endP-stD).days :
            ts_rank_target1[pd.to_datetime(st_date)] = 0
            ts_rank_target1 = ts_rank_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_rank_target = (ts_rank_target1)/(ts_rank_target1.sum())
    else:
        ts_rank_target = []
        ts_rank_target1 = []
    # endog = ts_org_target1
    # endog = ts_install_target
    endog = ts_all_target1
    # Collect the enabled regressors as columns of the exogenous frame; the
    # three lists below are index-aligned (flag, series, column name).
    Tlist = [self.paid, self.baseorg, self.view, self.rank]
    dff = DataFrame()
    tList = [ts_paid_target, ts_org_base, ts_view_target, ts_rank_target]
    tlist = ['paid', 'base', 'view', 'rank']
    for i in xrange(0,len(Tlist)):
        if Tlist[i]:
            dff[tlist[i]] = tList[i]
    if dff.empty:
        raise RuntimeError('Where is your exog variable? Do you need a coffee or something?!')
    exog = dff
    return (endog, exog)
# Demonstration of column-wise and row-wise reductions on `frame`.
# NOTE: Python 2 code (print statements). The bare string literals after each
# statement record the expected output and have no effect when executed.
frame
''' A B C a 0 1 2 b 3 4 5 c 6 7 8 '''
# Maximum of each column.
print frame.max()
''' A 6 B 7 C 8 '''
# Range (max minus min) of whatever Series is passed in.
f = lambda x: x.max() - x.min()
print frame.apply(f) # applied to each column
''' A 6 B 6 C 6 '''
print frame.apply(f, axis=1) # applied to each row
''' a 2 b 2 c 2 '''