def test_sum_overflow(self, use_bottleneck): with pd.option_context('use_bottleneck', use_bottleneck): # GH#6915 # overflowing on the smaller int dtypes for dtype in ['int32', 'int64']: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) assert int(result) == v.sum(dtype='int64') result = s.min(skipna=False) assert int(result) == 0 result = s.max(skipna=False) assert int(result) == v[-1] for dtype in ['float32', 'float64']: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) assert result == v.sum(dtype=dtype) result = s.min(skipna=False) assert np.allclose(float(result), 0.0) result = s.max(skipna=False) assert np.allclose(float(result), v[-1])
def _expand_elements(body):
    """Pad the rows of ``body`` in place with NaN up to the longest row length."""
    row_lengths = Series(lmap(len, body))
    longest = row_lengths.max()
    # Only the rows shorter than the longest one need padding.
    short_rows = row_lengths[row_lengths != longest]
    for position, row_length in iteritems(short_rows):
        body[position] += [np.nan] * (longest - row_length)
def test_nat_operations():
    """GH 8617: median/min/max of a timedelta Series holding one NaT."""
    ser = Series([0, pd.NaT], dtype='m8[ns]')
    expected = ser[0]
    # Each reduction is expected to return the single non-NaT value.
    for reduction in (ser.median, ser.min, ser.max):
        assert reduction() == expected
def getelapsed(dtin: pd.Series):
    """Return the number of days between the earliest and latest date.

    :param dtin: series of date strings in ``%Y-%m-%d`` format; missing
        values are ignored.
    :return: whole days elapsed from the earliest to the latest date.
    :raises ValueError: if a string does not match the format or the
        series holds no dates.
    """
    # Parse before comparing: the max/min of the raw strings is a
    # lexicographic comparison, which mis-orders dates that are not
    # zero-padded (e.g. '2020-9-1' sorts after '2020-10-1').
    parsed = [
        datetime.datetime.strptime(text, '%Y-%m-%d') for text in dtin.dropna()
    ]
    if not parsed:
        raise ValueError('getelapsed() needs at least one date string')
    ddif: datetime.timedelta = max(parsed) - min(parsed)
    return ddif.days
def _expand_elements(body):
    """Pad the rows of ``body`` in place with ``''`` up to the longest row length."""
    row_lengths = Series([len(row) for row in body])
    widest = row_lengths.max()
    # Only the rows shorter than the widest one need padding.
    shorter = row_lengths[row_lengths != widest]
    for row_position, row_length in shorter.items():
        body[row_position] += [''] * (widest - row_length)
def create_interaction_description(interaction_count_series: Series) -> Dict:
    """Summarise per-user interaction counts for one interaction type.

    Returns a dict keyed by the module constants MIN, MAX, MEAN and MEDIAN
    holding the corresponding statistic of ``interaction_count_series``.
    """
    return {
        MIN: interaction_count_series.min(),
        MAX: interaction_count_series.max(),
        MEAN: interaction_count_series.mean(),
        MEDIAN: interaction_count_series.median(),
    }
def NormalizeDatasetMethod1(ds: pd.Series):
    """Min-max scale ``ds`` and return the result as a list of floats.

    Each value is mapped to ``(value - min) / (max - min)``, in the series'
    positional order.

    :param ds: numeric series to scale.
    :return: list of Python floats, one per element of ``ds``.
    """
    minimum = ds.min()
    maximum = ds.max()
    delta = maximum - minimum
    # Scale the whole series at once instead of looking values up with
    # ds[i]: that is a label lookup and fails (or picks the wrong element)
    # when the index is not the default 0..n-1 range.
    return [float(scaled) for scaled in (ds - minimum) / delta]
def from_series(feature_name: str, series: Series):
    """Build a NumericColumn from a pandas Series.

    Asserts that ``series`` has a numeric dtype, then passes its min, max,
    mean and standard deviation to ``NumericColumn`` under ``feature_name``.
    """
    assert types.is_numeric_dtype(series), series.dtypes
    summary = dict(
        min_value=series.min(),
        max_value=series.max(),
        mean_value=series.mean(),
        std_value=series.std(),
    )
    return NumericColumn(feature_name=feature_name, **summary)
def get_numerical_distribution(cls, column: pd.Series, column_baseline: Dict = None): if column_baseline: bins = [ x["lower_bound"] for x in column_baseline["numerical_stats"]["distribution"] ] bins.append(column_baseline["numerical_stats"]["distribution"][-1] ["upper_bound"]) # Insert a bin if new value is less than the min value if column.min() < column_baseline["numerical_stats"]["min"]: bins.insert(0, column.min().item()) # Insert a bin if new value is less than the max value if column.max() > column_baseline["numerical_stats"]["max"]: bins.append(column.max().item()) bin_size = len(bins) - 1 labels = [str(x + 1) for x in range(bin_size)] cuts = pd.cut(x=column, bins=bins, precision=2, labels=labels) else: bin_size = 10 labels = [str(x + 1) for x in range(bin_size)] cuts, bins = pd.cut(x=column, bins=bin_size, precision=2, labels=labels, retbins=True) value_counts = cuts.value_counts(normalize=True).to_dict() distribution = [] for index, bin_value in enumerate(bins[:-1]): _bin = { "lower_bound": bin_value, "upper_bound": bins[index + 1], "percent": value_counts[str(index + 1)] * 100, # Normalize to 100 } distribution.append(_bin) return distribution
def nbr_pages_parrecherche(Region,type):
    """Return the largest page number found among the links of a results page.

    Fetches the page built by ``getURL_Annonces(Region, 1, type)`` (the
    ``1`` is presumably the page number - confirm against that helper),
    collects every ``<a>`` tag whose text is a decimal number and returns
    the maximum, or 1 when no such link exists.
    """
    soup = getSoupFromUrl(getURL_Annonces(Region, 1, type))
    # str.isdecimal() matches exactly the digit strings int() can parse;
    # str.isnumeric() also accepts text such as superscripts or fractions,
    # for which int() raises ValueError.
    page_numbers = Series(
        [int(link.text) for link in soup.find_all("a") if link.text.isdecimal()]
    )
    if len(page_numbers) == 0:
        return 1
    return page_numbers.max()
def __normolization_min_max(a: pd.Series, index: list) -> pd.Series:
    """Min-max scale ``a`` when its name is listed in ``index``.

    Series whose name is not in ``index`` are returned unchanged; the
    others are mapped to ``(a - min) / (max - min)``.
    """
    if a.name not in index:
        return a
    lowest = a.min()
    highest = a.max()
    return (a - lowest) / (highest - lowest)
def __init__(self, col: Series):
    """Store min, max, range, mean and standard deviation of ``col``.

    The statistics are computed on the numpy array returned by
    ``col.to_numpy()``, i.e. with numpy's reductions rather than pandas'.
    """
    values: ndarray = col.to_numpy()
    lowest: number = values.min(initial=None)
    highest: number = values.max(initial=None)
    self._min = lowest
    self._max = highest
    self._range = highest - lowest
    self._mean = values.mean()
    self._std = values.std()
def split_data(date_blocks: pd.Series, X: pd.DataFrame, y: pd.Series):
    """Split ``X``/``y`` into training and validation parts by date block.

    Rows whose block equals the largest value in ``date_blocks`` form the
    validation part; rows with a smaller block form the training part.

    Returns ``(X_train, y_train, X_val, y_val)``.
    """
    val_block = date_blocks.max()
    in_training = date_blocks < val_block
    in_validation = date_blocks == val_block
    return (
        X.loc[in_training],
        y.loc[in_training],
        X.loc[in_validation],
        y.loc[in_validation],
    )
def _expand_elements(body):
    """Pad the rows of ``body`` in place with ``''`` up to the longest row length."""
    row_lengths = Series(lmap(len, body))
    longest = row_lengths.max()
    # Only the rows shorter than the longest one need padding.
    short_rows = row_lengths[row_lengths != longest]
    for position, row_length in iteritems(short_rows):
        body[position] += [''] * (longest - row_length)
def float_formatter(column: pd.Series, value: float, minimize: bool = True) -> str:  # type: ignore
    """Format ``value`` for a LaTeX table, in bold when it is the best value.

    The value is rendered with thousands separators and two decimals. It
    is wrapped in ``\\textbf{...}`` when it equals the column minimum
    (``minimize=True``) or the column maximum (``minimize=False``).
    """
    best = column.max() if not minimize else column.min()
    if value != best:
        return f"{value:,.2f}"
    return f"\\textbf{{{value:,.2f}}}"
def get_max(s: pd.Series):
    """Mark where ``s`` reaches its maximum with the matching index label.

    The result has NaN everywhere except at the positions of the maximum,
    which carry the index label: assigned directly for string labels,
    obtained by multiplying a 1/NaN mask with the index otherwise.
    """
    labels = s.index
    peak = s.max()
    # The first label decides which of the two strategies is used.
    has_string_labels = isinstance(labels[0], str)
    marks = (s == peak).astype(int).replace(0, np.nan)
    if not has_string_labels:
        return marks * labels
    at_peak = ~marks.isna()
    marks[at_peak] = labels[at_peak]
    return marks
def create_category_series(category_series: pd.Series, fill_gaps: bool = True, fill_steps: int = 1):
    """Returns sorted distinct category values, optionally with gaps filled.

    With ``fill_gaps`` the result is every ``fill_steps``-th integer from
    the series minimum up to and including its maximum; otherwise it is
    the sorted list of the distinct values actually present.
    """
    if not fill_gaps:
        distinct_values = category_series.unique().tolist()
        return sorted(distinct_values)
    lowest = category_series.min()
    highest = category_series.max()
    return list(range(lowest, highest + 1, fill_steps))
def nbr_pages_parrecherche(Region, type):
    """Return the largest page number found among the links of a results page.

    Fetches the page built by ``getURL_Annonces(Region, 1, type)`` (the
    ``1`` is presumably the page number - confirm against that helper),
    collects every ``<a>`` tag whose text is a decimal number and returns
    the maximum, or 1 when no such link exists.
    """
    soup = getSoupFromUrl(getURL_Annonces(Region, 1, type))
    # str.isdecimal() matches exactly the digit strings int() can parse;
    # str.isnumeric() also accepts text such as superscripts or fractions,
    # for which int() raises ValueError.
    page_numbers = Series(
        [int(link.text) for link in soup.find_all("a") if link.text.isdecimal()]
    )
    if len(page_numbers) == 0:
        return 1
    return page_numbers.max()
def normalize_column(column: pd.Series) -> pd.Series:
    """
    Min-max normalizes a column and applies the visual scale to it.

    :param column: a column of numeric data
    :return: ``((column - min) / (max - min) + 0.1) * VISUAL_SCALE``
    """
    lowest = column.min()
    span = column.max() - lowest
    unit_scaled = (column - lowest) / span
    return (unit_scaled + .1) * VISUAL_SCALE
def fit(self, x: pd.Series):
    """Record the statistics of ``x`` required by ``self.method``.

    "Gaussian" stores ``mean`` and ``std``, "MinMax" stores ``min`` and
    ``max``; "RankGaussian" (and any other value) stores nothing.

    Returns ``self``.
    """
    method = self.method
    if method == "Gaussian":
        mean_value, std_value = x.mean(), x.std()
        self.mean, self.std = mean_value, std_value
    elif method == "MinMax":
        min_value, max_value = x.min(), x.max()
        self.min, self.max = min_value, max_value
    # TODO: store state for "RankGaussian"; nothing is recorded for it yet.
    return self
def trajectory_is_constant(self, trajectory: pandas.Series) -> bool:
    """
    Determines whether a trajectory stays at a relatively constant
    frequency: the gap between its largest and smallest value must not
    exceed ``self.filter_consistency``.
    """
    highest = trajectory.max()
    lowest = trajectory.min()
    return (highest - lowest) <= self.filter_consistency
def __normolization_centroid(a: pd.Series, index: list) -> pd.Series:
    """Centre ``a`` on the midpoint of its range and scale by the range.

    Only series whose name is listed in ``index`` are transformed; the
    others are returned unchanged. Transformed values lie in
    ``[-0.5, 0.5]``.
    """
    if a.name in index:
        minimum = a.min()
        maximum = a.max()
        # The centre of the range is (max + min) / 2. The previous
        # (max - min) / 2 is the half-width, which only coincides with the
        # centre when the minimum is 0.
        centroid = (maximum + minimum) / 2
        a = (a - centroid) / (maximum - minimum)
    return a
def plot(self, forecast: np.ndarray, training_data: pd.Series, test_data: pd.Series = None,
         show: bool = False) -> matplotlib.figure.Figure:
    """Plot the price history, optional test data and the forecast.

    :param forecast: forecast values; they are indexed on a date range that
        starts one step after the last training timestamp.
    :param training_data: historical series; its last two index entries
        define the step used for the forecast index, so it needs at least
        two rows.
    :param test_data: optional series of actual values drawn in orange.
    :param show: when true, call ``plt.show()`` before returning.
    :return: the figure holding the plot.
    """
    logger.debug('Plotting...')

    history = training_data
    # Spacing of the last two observations, reused as the forecast frequency.
    timeframe = history.index[-1] - history.index[-2]
    forecast = pd.Series(
        forecast,
        index=pd.date_range(start=history.index[-1] + timeframe,
                            periods=len(forecast),
                            freq=timeframe))

    fig, ax1 = plt.subplots()
    ax1.plot(history, color='red', linewidth=config.plot.linewidth)
    if test_data is not None:
        ax1.plot(test_data, color='orange', linewidth=config.plot.linewidth)
    ax1.plot(forecast, color='black', linestyle=':', linewidth=config.plot.linewidth + 0.2)

    ax1.set_title(
        f'{self.currency}/{self.to_currency} Price (Orange) vs {self.currency}/{self.to_currency} Price Forecast (Black)'
    )
    ax1.set_ylabel(f'{self.currency}/{self.to_currency} Price')
    ax1.set_xlabel('Date')

    # Relabel the legend entries in the order the lines were drawn.
    legend = ax1.legend()
    texts = legend.get_texts()
    if test_data is not None:
        texts[0].set_text('Actual Price (training data)')
        texts[1].set_text('Actual Price (test data)')
        if len(texts) == 3:
            texts[2].set_text('Forecasted Price')
    else:
        texts[0].set_text('Actual Price')
        texts[1].set_text('Forecasted Price')

    if show:
        plt.show()
    return fig
def normalize_column(df_column: Series) -> Series:
    """Min-max normalize a float64/int64 column; return anything else as is.

    A column whose minimum equals its maximum is also returned unchanged,
    because the normalization is undefined in that case.
    """
    # Just normalize numeric columns
    is_numeric = df_column.dtype == np.float64 or df_column.dtype == np.int64
    if not is_numeric:
        return df_column
    max_value = df_column.max()
    min_value = df_column.min()
    if min_value == max_value:
        return df_column
    return (df_column - min_value) / (max_value - min_value)
def time_gap(dates: pd.Series, uom='weekly'):
    """
    Returns the relative time gap from the latest date, starting at 1.

    The uom (unit of measure) argument defines the output unit: 'weekly'
    (the default) counts weeks, any other value counts hours.
    """
    seconds_per_unit = (3600 * 24 * 7) if uom == 'weekly' else 3600
    latest = dates.max()
    elapsed_seconds = (latest - dates).dt.total_seconds()
    return 1 + np.floor(elapsed_seconds / seconds_per_unit)
def _check_inputs(
    s_test_pred: pd.Series,
    s_calib_pred: pd.Series,
    s_calib_actual: pd.Series,
) -> None:
    """Validate the series handed to the calibration step.

    Raises RuntimeError when a prediction series has values below 0 or
    above 1, when any of the three series has missing values, or when the
    calibration predictions and actuals differ in length or index. A
    mismatch between the two prediction series' names only emits a warning.
    """
    out_of_range = any(
        preds.min() < 0 or preds.max() > 1
        for preds in (s_test_pred, s_calib_pred)
    )
    if out_of_range:
        raise RuntimeError(
            "Probabilities outside (0,1) range were passed to calibrate"
        )

    if s_calib_pred.name != s_test_pred.name:
        warnings.warn(f"{s_calib_pred.name} != {s_test_pred.name}")

    # Checked in this order so the first offending series is the one reported.
    null_checks = (
        (s_test_pred, "Missing values in s_test_pred"),
        (s_calib_pred, "Missing values in s_calib_pred"),
        (s_calib_actual, "Missing values in s_calib_actual"),
    )
    for series, message in null_checks:
        if series.isnull().sum() > 0:
            _log_missing_indices(series)
            raise RuntimeError(message)

    same_length = len(s_calib_pred) == len(s_calib_actual)
    if same_length and len(s_calib_pred.index.difference(s_calib_actual.index)) == 0:
        return

    raise RuntimeError(
        f"len(s_calib_pred): {len(s_calib_pred)} "
        f"len(s_calib_actual): {len(s_calib_actual)} "
        f"index diff: "
        f"{s_calib_pred.index.difference(s_calib_actual.index)}"
        f"s_calib_pred.head() : {s_calib_pred.head()}"
        f"s_calib_pred.tail() : {s_calib_pred.tail()}"
        f"s_calib_actual.head() : {s_calib_actual.head()}"
        f"s_calib_actual.tail() : {s_calib_actual.tail()}"
    )
def _assign_bins(values: pd.Series, no_bins, column_names) -> pd.DataFrame:
    """
    Assigns each value to a bin.

    ``no_bins`` evenly spaced limits are laid out between the smallest and
    largest value, and ``_find_bin`` is called with those limits for every
    value.

    :return: DataFrame with one row per value and ``column_names`` as
        columns (presumably bin, left_bound, right_bound - confirm against
        ``_find_bin``)
    """
    lowest, highest = values.min(), values.max()
    limits = np.linspace(lowest, highest, no_bins)
    rows = [_find_bin(limits, value) for value in values.values]
    return pd.DataFrame(np.array(rows), columns=column_names)
def from_dataframe(cls, column_name: str, data: pd.Series, data_type: DataType):
    """Build an instance of ``cls`` describing the series ``data``.

    Asserts that ``data_type`` is one of ``cls.accepted_types``, constructs
    the instance from the column name, the series' min and max, the data
    type and its ``hasnans`` flag, then attaches the series as ``_data``.
    """
    assert data_type in cls.accepted_types
    lowest = data.min()
    highest = data.max()
    instance = cls(column_name, lowest, highest, data_type, has_missing=data.hasnans)
    instance._data = data
    return instance
def calculate_row_max_arm(row: pd.Series) -> str:
    '''
    Returns the label of the largest entry in a row, as a string.

    The row is squeezed first; when several entries tie for the maximum,
    the first one wins.

    :param row: row of values indexed by arm label
    :return: label of the first maximal entry
    '''
    squeezed = row.squeeze()
    is_top = squeezed == squeezed.max()
    return str(squeezed[is_top].index[0])
def setup( self, data: Series, prop: Property, axis: Axis | None = None, ) -> Scale:
    """Build a ``Scale`` for ``data`` and the visual property ``prop``.

    The returned Scale is constructed from a pipeline of callables (unit
    conversion, forward transform, optional normalization, property
    mapping), a spacer function, optional legend entries, the lower-cased
    class name and the scale object returned by ``self._get_scale``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source - confirm it against the original file.
    """
    new = copy(self)
    forward, inverse = self._get_transform()

    mpl_scale = self._get_scale(data.name, forward, inverse)

    # Without a caller-supplied axis, a PseudoAxis is created and told
    # about the data's units.
    if axis is None:
        axis = PseudoAxis(mpl_scale)
        axis.update_units(data)

    mpl_scale.set_default_locators_and_formatters(axis)

    normalize: Optional[Callable[[ArrayLike], ArrayLike]]
    if prop.normed:
        # Normalization limits come from the data unless self.norm is set.
        if self.norm is None:
            vmin, vmax = data.min(), data.max()
        else:
            vmin, vmax = self.norm
        vmin, vmax = axis.convert_units((vmin, vmax))
        a = forward(vmin)
        b = forward(vmax) - forward(vmin)

        def normalize(x):
            # Rescale transformed values so vmin maps to 0 and vmax to 1.
            return (x - a) / b

    else:
        normalize = vmin = vmax = None

    forward_pipe = [
        axis.convert_units,
        forward,
        normalize,
        prop.get_mapping(new, data)
    ]

    def spacer(x):
        # Smallest gap between the sorted distinct non-null values of x.
        return np.min(np.diff(np.sort(x.dropna().unique())))

    # TODO make legend optional on per-plot basis with ScaleSpec parameter?
    if prop.legend:
        # NOTE(review): vmin/vmax are None here when prop.normed is false;
        # presumably legends are only requested for normed properties.
        axis.set_view_interval(vmin, vmax)
        locs = axis.major.locator()
        # Keep only the tick locations inside [vmin, vmax].
        locs = locs[(vmin <= locs) & (locs <= vmax)]
        labels = axis.major.formatter.format_ticks(locs)
        legend = list(locs), list(labels)

    else:
        legend = None

    scale_type = self.__class__.__name__.lower()
    return Scale(forward_pipe, spacer, legend, scale_type, mpl_scale)
def test_min_max_numeric_only(self): # TODO deprecate numeric_only argument for Categorical and use # skipna as well, see GH25303 cat = Series(Categorical( ["a", "b", np.nan, "a"], categories=['b', 'a'], ordered=True)) _min = cat.min() _max = cat.max() assert np.isnan(_min) assert _max == "a" _min = cat.min(numeric_only=True) _max = cat.max(numeric_only=True) assert _min == "b" assert _max == "a" _min = cat.min(numeric_only=False) _max = cat.max(numeric_only=False) assert np.isnan(_min) assert _max == "a"
def bucketize_data(column_data: pd.Series, num_buckets: int) -> List[int]:
    """Return the ``num_buckets - 1`` inner boundaries of equal-width buckets.

    The range between the column minimum and maximum is cut into
    ``num_buckets`` equal parts; every inner boundary is rounded to the
    nearest integer after adding a small epsilon.
    """
    lowest = column_data.min()
    highest = column_data.max()
    step = (highest - lowest) / num_buckets

    boundaries = []
    running_edge = lowest
    # Accumulate the step rather than multiplying, one boundary at a time.
    while len(boundaries) < num_buckets - 1:
        running_edge += step
        boundaries.append(round(running_edge + 0.000001))
    return boundaries
def test_min_max_dt64_api_consistency_empty_df(self): # check DataFrame/Series api consistency when calling min/max on an empty # DataFrame/Series. df = DataFrame(dict(x=[])) expected_float_series = Series([], dtype=float) # check axis 0 assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min()) assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max()) # check axis 1 tm.assert_series_equal(df.min(axis=1), expected_float_series) tm.assert_series_equal(df.min(axis=1), expected_float_series)
def test_sum_overflow_float(self, use_bottleneck, dtype): with pd.option_context("use_bottleneck", use_bottleneck): v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) assert result == v.sum(dtype=dtype) result = s.min(skipna=False) assert np.allclose(float(result), 0.0) result = s.max(skipna=False) assert np.allclose(float(result), v[-1])
def test_min_max(self): # unordered cats have no min/max cat = Series(Categorical(["a", "b", "c", "d"], ordered=False)) with pytest.raises(TypeError): cat.min() with pytest.raises(TypeError): cat.max() cat = Series(Categorical(["a", "b", "c", "d"], ordered=True)) _min = cat.min() _max = cat.max() assert _min == "a" assert _max == "d" cat = Series(Categorical(["a", "b", "c", "d"], categories=[ 'd', 'c', 'b', 'a'], ordered=True)) _min = cat.min() _max = cat.max() assert _min == "d" assert _max == "a" cat = Series(Categorical( [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a' ], ordered=True)) _min = cat.min() _max = cat.max() assert np.isnan(_min) assert _max == "b" cat = Series(Categorical( [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True)) _min = cat.min() _max = cat.max() assert np.isnan(_min) assert _max == 1
def test_name2num():
    """name2num of random 4-letter names stays within the base-256 bounds.

    The bounds are the base-256 numbers made of four copies of the smallest
    and of the largest ASCII-letter code point.
    """
    n_trials = 10
    name_length = 4
    alphabet = string.ascii_letters
    codes = Series(dict(zip(alphabet, map(ord, alphabet))))

    weights = 256 ** np.arange(name_length)
    lower_bound = weights.dot(np.repeat(codes.min(), name_length))
    upper_bound = weights.dot(np.repeat(codes.max(), name_length))

    for _ in xrange(n_trials):
        sampled_name = random.sample(alphabet, name_length)
        assert lower_bound <= name2num(sampled_name) <= upper_bound
def test_timedelta64_analytics(self): # index min/max dti = pd.date_range('2012-1-1', periods=3, freq='D') td = Series(dti) - pd.Timestamp('20120101') result = td.idxmin() assert result == 0 result = td.idxmax() assert result == 2 # GH#2982 # with NaT td[0] = np.nan result = td.idxmin() assert result == 1 result = td.idxmax() assert result == 2 # abs s1 = Series(pd.date_range('20120101', periods=3)) s2 = Series(pd.date_range('20120102', periods=3)) expected = Series(s2 - s1) # FIXME: don't leave commented-out code # this fails as numpy returns timedelta64[us] # result = np.abs(s1-s2) # assert_frame_equal(result,expected) result = (s1 - s2).abs() tm.assert_series_equal(result, expected) # max/min result = td.max() expected = pd.Timedelta('2 days') assert result == expected result = td.min() expected = pd.Timedelta('1 days') assert result == expected
def get_summary_indicators_from_hist(sf, hist, int_index=False):
    """Summarise a histogram given as a mapping of label -> frequency.

    :param sf: unused by this function.
    :param hist: mapping whose values are frequencies. With ``int_index``
        the keys must be strings that convert to int, because the
        frequency of the largest label is read back as ``hist[str(label)]``.
    :param int_index: when true, also summarise the labels themselves and
        compute ``index_total``, the sum of ``frequency * label``.
    :return: dict with ``means``, ``medians``, ``stds``, ``max`` and
        ``index_total`` (``'NA'`` unless ``int_index`` is set).
    """
    seriesHist = Series(hist)
    maxs = {
        'freq': {
            'freq': seriesHist.max(),
            'index': seriesHist.idxmax(),
        }
    }
    means = {'freq': seriesHist.mean()}
    medians = {'freq': seriesHist.median()}
    stds = {'freq': seriesHist.std()}
    index_total = 'NA'

    if int_index:
        index_list = seriesHist.index.astype(int).tolist()
        # Pair frequencies with labels by position. seriesHist[i] would be
        # a label lookup, which only worked through pandas' deprecated
        # positional fallback for integer keys.
        index_total = sum(
            freq * label
            for freq, label in zip(seriesHist.to_numpy(), index_list)
        )

        index_series = Series(index_list)
        means['index'] = index_series.mean()
        medians['index'] = index_series.median()
        stds['index'] = index_series.std()

        maxs['freq']['index'] = int(maxs['freq']['index'])
        largest_label = max(index_list)
        maxs['index'] = {
            'index': largest_label,
            'freq': hist[str(largest_label)],
        }

    return {
        'means': means,
        'medians': medians,
        'stds': stds,
        'max': maxs,
        'index_total': index_total
    }
def count_estims(dist, gamma = 0.95):
    '''
    Counts all estimates

    :param dist: distribution (the sample values)
    :param gamma: probability of realisation of value (confidence level)
    :return point: point estimates, keyed by short label
    :return interval: confidence intervals for point estimates
    '''
    import numpy as np
    from scipy.stats import t, norm

    x = Series(dist)

    # Point estimates
    point = {}
    N = x.count()
    med_ = med_u(x)
    med = np.median(dist)
    # Mean absolute deviation around the mean, written out because
    # Series.mad() was removed in pandas 2.0.
    mad = (x - x.mean()).abs().mean()
    mean_c = mean(dist)
    var = np.var(dist)
    std = np.std(dist)
    mod = stats.mode(dist).mode
    kurt = stats.kurtosis(dist)
    skew_my = stats.skew(dist)
    Chi = 1/np.sqrt(np.abs(kurt))
    quantiles = np.round(x.quantile([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]), 5)
    # Coefficient of variation
    W = std/mean_c

    # One HTML row per quantile level.
    quantiles_str = ''.join(
        '<p><pre>{0}\t{1}</pre></p>'.format(level, value)
        for level, value in quantiles.items()
    )

    point['MED'] = np.round(med, 5)
    point['MED*'] = np.round(med_, 5)
    point['MAD'] = np.round(mad, 5)
    point['Min'] = np.round(x.min(), 5)
    point['Max'] = np.round(x.max(), 5)
    point['Mean'] = np.round(mean_c, 5)
    point['S^2'] = np.round(var, 5)
    point['S'] = np.round(std, 5)
    point['MOD'] = np.round(mod, 5)
    point['E'] = np.round(kurt, 5)
    point['A'] = np.round(skew_my, 5)
    point['Chi'] = np.round(Chi, 5)
    point['X(alpha)'] = quantiles_str
    point['W'] = np.round(W, 5)

    # Interval estimates
    interval = {}
    # Student's t quantiles for samples below 61 observations, normal
    # quantiles otherwise.
    if N < 61:
        l = t.ppf((1-gamma)/2, N-1)
        u = t.ppf(1-(1-gamma)/2, N-1)
    else:
        l = norm.ppf((1-gamma)/2)
        u = norm.ppf(1-(1-gamma)/2)

    X_cf = (mean_c+l*sigma_X(x), mean_c+u*sigma_X(x))
    A_cf = (skew_my + l * sigma_A(x), skew_my + u * sigma_A(x))
    S_cf = (std + l*sigma_S(x), std+u*sigma_S(x))
    E_cf = (kurt + l*sigma_E(x), kurt+u*sigma_E(x))

    # The interval for W is only computed when W < 1.
    if W < 1:
        v = l/np.sqrt(2*(N-1))
        W_cf = np.round((W/(1+v*np.sqrt(1+2*W**2)), W/(1-v*np.sqrt(1+2*W**2))), 5)
    else:
        W_cf = (None, None)

    interval['Mean'] = np.round(X_cf, 5)
    interval['S'] = np.round(S_cf, 5)
    interval['E'] = np.round(E_cf, 5)
    interval['A'] = np.round(A_cf, 5)
    interval['W'] = W_cf
    return point, interval
def source_data(self):
    """Load the daily series used for modelling and return ``(endog, exog)``.

    ``endog`` is the daily total series for ``self.target``; ``exog`` is a
    DataFrame with one column per enabled flag among ``self.paid``,
    ``self.baseorg``, ``self.view`` and ``self.rank``. Raises RuntimeError
    when view data is requested for a start date before 2015-4-1, or when
    no flag is enabled.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source - confirm it against the original file.
    NOTE(review): ``resample(..., how=...)`` and ``xrange`` look like legacy
    pandas / Python 2 APIs - confirm the runtime this is meant for.
    """
    st_date = self.stTrain
    # st_date = '2014-10-1'
    # Training start as a date object; st_date is split on '-' into
    # year, month and day.
    stD = date(int(st_date.split('-')[0]), int(st_date.split('-')[1]), int(st_date.split('-')[2]))
    if self.view and stD < datetime.datetime.strptime('2015-4-1',"%Y-%m-%d").date():
        raise RuntimeError('I know it sucks but we dont have view-count data for anytime before 2015-4-1!')
    if self.view:
        db_red = psycopg2.connect(host="***", database="***", port="***", user="******", password="******")
        db_red.autocommit = True
        # NOTE(review): the country name is interpolated into the SQL text
        # with %; presumably trusted input - verify or use query parameters.
        df_red = pd.read_sql('''select date,sum(installs) as install, sum(pageviewcount) as view from appstoredata_itunes_metrics where game='***' and country='%s' group by date;''' % pycountry.countries.get(alpha2=self.target).name, con=db_red)
        df_red['date'] = pd.to_datetime(df_red['date'])
        # Daily sums restricted to [st_date, self.endPred], gaps filled with 0.
        ts_view_target1 = Series(df_red.view.tolist(), index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_install_target1 = Series(df_red.install.tolist(), index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        # If the series is shorter than the requested span, insert a zero at
        # the start date and resample again.
        if len(ts_view_target1) < (self.endP-stD).days :
            ts_view_target1[pd.to_datetime(st_date)] = 0
            ts_view_target1 = ts_view_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_install_target1[pd.to_datetime(st_date)] = 0
            ts_install_target1 = ts_install_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
        # Scale each series by its total.
        ts_view_target = (ts_view_target1)/(ts_view_target1.sum())
        ts_install_target = (ts_install_target1)/(ts_install_target1.sum())
    else:
        ts_view_target = []
        ts_view_target1 = []
        ts_install_target = []
        ts_install_target1 = []
    db = MySQLdb.connect( host = '***', user = '******', passwd = '***', db = '***', port = '***')
    df_mysql = pd.read_sql('''select metrics_daily.date as date, dim_country.name as country, sum(metrics_daily.value) as value, dim_channel.channel_type as type from metrics_daily left join dim_channel on dim_channel.id = metrics_daily.channel_id left join dim_country on dim_country.id = metrics_daily.country_id 
where project_id=195 and metrics_daily.platform_id=2 and metric_id in (5) group by date, type, country;''', con=db)
    df_mysql['date'] = pd.to_datetime(df_mysql['date'])
    all_data_target = df_mysql[df_mysql.country==self.target]
    org_data_target = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.target)]
    ts_org_target1 = Series(org_data_target.value.tolist(), index=org_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
    ts_all_target1 = Series(all_data_target.value.tolist(), index=all_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
    ts_org_target = (ts_org_target1)/(ts_org_target1.sum())
    ts_all_target = (ts_all_target1)/(ts_all_target1.sum())
    if self.baseorg:
        org_data_base = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.baseline)]
        ts_org_base1 = Series(org_data_base.value.tolist(), index=org_data_base.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        # Unlike the other series, the baseline is min-max scaled.
        ts_org_base = (ts_org_base1-ts_org_base1.min())/(ts_org_base1.max()-ts_org_base1.min())
    else:
        ts_org_base = []
        ts_org_base1 = []
    if self.paid:
        paid_data_target = df_mysql[(df_mysql.type=='PAID') & (df_mysql.country==self.target)]
        ts_paid_target1 = Series(paid_data_target.value.tolist(), index=paid_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        if len(ts_paid_target1) < (self.endP-stD).days :
            ts_paid_target1[pd.to_datetime(st_date)] = 0
            ts_paid_target1 = ts_paid_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_paid_target = (ts_paid_target1)/(ts_paid_target1.sum())
    else:
        ts_paid_target = []
        ts_paid_target1 = []
    if self.rank:
        # NOTE(review): self.target is interpolated into the SQL text with %.
        df_rank = pd.read_sql('''select date, max(1/sqrt(rank)) as bestRank from kabam_ranks_data_free where country='%s' and device!='android'and game='***' and category='Overall' group by date;''' % self.target, con=db)
        df_rank['date'] = pd.to_datetime(df_rank['date'])
        ts_rank_target1 = Series(df_rank.bestRank.tolist(), 
            index=df_rank.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        if len(ts_rank_target1) < (self.endP-stD).days :
            ts_rank_target1[pd.to_datetime(st_date)] = 0
            ts_rank_target1 = ts_rank_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_rank_target = (ts_rank_target1)/(ts_rank_target1.sum())
    else:
        ts_rank_target = []
        ts_rank_target1 = []
    # endog = ts_org_target1
    # endog = ts_install_target
    # The unscaled daily totals are used as the endogenous series.
    endog = ts_all_target1
    # Parallel lists: enabled flag, series, and the column name it gets.
    Tlist = [self.paid, self.baseorg, self.view, self.rank]
    dff = DataFrame()
    tList = [ts_paid_target, ts_org_base, ts_view_target, ts_rank_target]
    tlist = ['paid', 'base', 'view', 'rank']
    for i in xrange(0,len(Tlist)):
        if Tlist[i]:
            dff[tlist[i]] = tList[i]
    if dff.empty:
        raise RuntimeError('Where is your exog variable? Do you need a coffee or something?!')
    exog = dff
    return (endog, exog)
# NOTE(review): Python 2 print statements; HTTP_DF and clear_scr() are
# defined outside this fragment.

# Number of rows per 'origin' value.
l = Series((HTTP_DF['origin']))
l = l.value_counts()
clear_scr()
print "\n"
print "Questions"
print "---------"
print "Question:1."
print "-----------"
print "Which hostname or IP address made the most requests?"
print "Answer:"
print "-------"
# idxmax() is the origin with the highest count, max() the count itself.
print "The MAXIMUM number of requests were made by '%s'.\nFrom this address, a total of %d requests were made" % (l.idxmax(),l.max())
print "\n"
# Total 'bytes_transferred' per origin.
l = HTTP_DF.groupby(['origin'])['bytes_transferred'].sum()
print "Question:2."
print "-----------"
print "Which hostname or IP address received the most total bytes from the server? How many bytes did it receive?"
print "Answer:"
print "-------"
print "The MAXIMUM number of bytes were received by '%s'. This address has received a total of %d bytes." % (l.idxmax(), l.max())
print "\n"
# Number of rows per 'hour' value; presumably used by the code that follows.
l = Series((HTTP_DF['hour']))
l = l.value_counts()
# NOTE(review): `datas` and `col` are defined outside this fragment; these
# two statements presumably belong to a loop over the categorical columns.
# One dummy column per category value, named '<col>__<value>'.
datad = get_dummies(datas, prefix=col, prefix_sep='__')
data[datad.columns] = datad
# drop non-predictor columns and fill in missing values with means
data = data.drop(drop_cols + category_cols, axis=1)
data = data.fillna(data.mean())
rf = RandomForestClassifier( n_estimators=1000, oob_score=True, random_state=42, class_weight='balanced_subsample', verbose=False, n_jobs=-1 )
# model using all variables
evals = cv_results(x=data, y=outcome, model=rf, nfolds=10, nparts=20, verbose=True)
# get importances and keep only those variables at least one-tenth as important as the most important variable
_ = rf.fit(data, outcome)
importance = Series(rf.feature_importances_, index=data.columns).sort_values(ascending=False)
# Importances relative to the largest one, so the threshold below is a ratio.
importance2 = importance / importance.max()
most_important = importance[importance2.gt(0.1)]
# model using only most important variables
evals2 = cv_results(x=data.loc[:, most_important.index], y=outcome, model=rf, nfolds=10, nparts=20, verbose=True)
# compare both models
eval_df = evals.merge(evals2, left_on='prob', right_on='prob', suffixes=['_full', '_imp'])
eval_df['renewed_pct_diff'] = eval_df['renewed_pct_full'] - eval_df['renewed_pct_imp']
# NOTE(review): Python 2 print statements; `frame` is defined outside this
# fragment. The bare string literals record the output the author expected.
frame
''' A B C a 0 1 2 b 3 4 5 c 6 7 8 '''
# Column-wise maximum.
print frame.max()
''' A 6 B 7 C 8 '''
# Range (max - min) of whatever Series it is given.
f = lambda x: x.max() - x.min()
print frame.apply(f) # applied to each column
''' A 6 B 6 C 6 '''
print frame.apply(f, axis=1) # applied to each row
''' a 2 b 2 c 2 '''
s.name = 'name' # length assert len(s) == s.size == s.shape[0] # number of element that a not NaN s.count() # get a array of unique values s.unique() # count(*) group by non-NaN value, get a Series s.value_counts() # aggregation and statistic s.max() s.mean() s.var() # location of the max element s.idxmax() # rank s = Series([4, 1, 2, 5]) s.rank() # return [3,1,2,4] # plot s.plot() plt.show() # translate ##################################################