Example #1
    def test_sum_overflow(self, use_bottleneck):

        with pd.option_context('use_bottleneck', use_bottleneck):
            # GH#6915
            # overflowing on the smaller int dtypes
            for dtype in ['int32', 'int64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert int(result) == v.sum(dtype='int64')
                result = s.min(skipna=False)
                assert int(result) == 0
                result = s.max(skipna=False)
                assert int(result) == v[-1]

            for dtype in ['float32', 'float64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert result == v.sum(dtype=dtype)
                result = s.min(skipna=False)
                assert np.allclose(float(result), 0.0)
                result = s.max(skipna=False)
                assert np.allclose(float(result), v[-1])
Example #2
def _expand_elements(body):
    lens = Series(lmap(len, body))
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    for ind, length in iteritems(not_max):
        body[ind] += [np.nan] * (lens_max - length)
Example #3
def test_nat_operations():
    # GH 8617
    s = Series([0, pd.NaT], dtype='m8[ns]')
    exp = s[0]
    assert s.median() == exp
    assert s.min() == exp
    assert s.max() == exp
Example #4
def getelapsed(dtin: pd.Series):
    """ takes a series of date strings and returns the number of elapsed days
    from the earliest to the last date
    """
    dmax = datetime.datetime.strptime(dtin.max(), '%Y-%m-%d')
    dmin = datetime.datetime.strptime(dtin.min(), '%Y-%m-%d')
    ddif: datetime.timedelta = dmax - dmin
    return ddif.days
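A quick usage sketch for getelapsed (not from the original source): it assumes pandas is imported as pd and the datetime module is imported as in the snippet above, and the date strings are invented for illustration.

dates = pd.Series(['2020-01-01', '2020-03-01', '2020-02-15'])  # hypothetical ISO-formatted dates
print(getelapsed(dates))  # 60 days between the earliest and latest date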
Example #5
def _expand_elements(body):
    lens = Series([len(elem) for elem in body])
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = ['']
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
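A small sketch of how _expand_elements pads ragged rows in place, assuming Series is imported from pandas as in the snippet; the body list is made up.

body = [['a', 'b', 'c'], ['d'], ['e', 'f']]  # invented parsed-table rows of unequal length
_expand_elements(body)
print(body)  # [['a', 'b', 'c'], ['d', '', ''], ['e', 'f', '']]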
Example #6
 def create_interaction_description(interaction_count_series: Series) -> Dict:
     """某种类型交互的用户交互次数统计值"""
     interaction_description = dict()
     interaction_description[MIN] = interaction_count_series.min()
     interaction_description[MAX] = interaction_count_series.max()
     interaction_description[MEAN] = interaction_count_series.mean()
     interaction_description[MEDIAN] = interaction_count_series.median()
     return interaction_description
Example #7
def NormalizeDatasetMethod1(ds: pd.Series):
    minimum = ds.min()
    maximum = ds.max()
    delta = maximum - minimum
    result = []
    for i in range(len(ds)):
        result.append(float((ds[i] - minimum) / delta))
    return result
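An illustrative call with arbitrary values, assuming pandas is imported as pd; the function returns a plain Python list min-max scaled to [0, 1].

ds = pd.Series([10, 20, 30, 40])  # made-up values
print(NormalizeDatasetMethod1(ds))  # [0.0, 0.333..., 0.666..., 1.0]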
Example #8
 def from_series(feature_name: str, series: Series):
     """从pandas.Series中构造"""
     assert types.is_numeric_dtype(series), series.dtypes
     return NumericColumn(feature_name=feature_name,
                          min_value=series.min(),
                          max_value=series.max(),
                          mean_value=series.mean(),
                          std_value=series.std())
Example #9
    def get_numerical_distribution(cls,
                                   column: pd.Series,
                                   column_baseline: Dict = None):
        if column_baseline:
            bins = [
                x["lower_bound"]
                for x in column_baseline["numerical_stats"]["distribution"]
            ]
            bins.append(column_baseline["numerical_stats"]["distribution"][-1]
                        ["upper_bound"])

            # Insert a bin if new value is less than the min value
            if column.min() < column_baseline["numerical_stats"]["min"]:
                bins.insert(0, column.min().item())

            # Insert a bin if new value is greater than the max value
            if column.max() > column_baseline["numerical_stats"]["max"]:
                bins.append(column.max().item())

            bin_size = len(bins) - 1
            labels = [str(x + 1) for x in range(bin_size)]
            cuts = pd.cut(x=column, bins=bins, precision=2, labels=labels)
        else:
            bin_size = 10
            labels = [str(x + 1) for x in range(bin_size)]
            cuts, bins = pd.cut(x=column,
                                bins=bin_size,
                                precision=2,
                                labels=labels,
                                retbins=True)

        value_counts = cuts.value_counts(normalize=True).to_dict()

        distribution = []

        for index, bin_value in enumerate(bins[:-1]):
            _bin = {
                "lower_bound": bin_value,
                "upper_bound": bins[index + 1],
                "percent":
                value_counts[str(index + 1)] * 100,  # Normalize to 100
            }

            distribution.append(_bin)

        return distribution
Example #10
def nbr_pages_parrecherche(Region,type):
	Result=getSoupFromUrl(getURL_Annonces(Region,1,type))
	Balises_a=Result.find_all("a")
	Numeros_pages= Series([int(A.text) for A in Balises_a if A.text.isnumeric()])
	if len(Numeros_pages)==0:
		return 1
	else:
		return Numeros_pages.max()
Example #11
def _expand_elements(body):
    lens = Series([len(elem) for elem in body])
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = ['']
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
Example #12
def __normolization_min_max(a: pd.Series, index: list) -> pd.Series:
    if a.name in index:
        minimum = a.min()
        maximum = a.max()

        a = (a - minimum) / (maximum - minimum)

    return a
Example #13
    def __init__(self, col: Series):
        col: ndarray = col.to_numpy()

        self._min: number = col.min(initial=None)
        self._max: number = col.max(initial=None)
        self._range: number = self._max - self._min
        self._mean: number = col.mean()
        self._std: number = col.std()
Example #14
def split_data(date_blocks: pd.Series, X: pd.DataFrame, y: pd.Series):
    val_block = date_blocks.max()
    return (
        X.loc[date_blocks < val_block],
        y.loc[date_blocks < val_block],
        X.loc[date_blocks == val_block],
        y.loc[date_blocks == val_block]
    )
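A toy illustration with invented data, assuming pandas is imported as pd: the most recent date block is held out as the validation split.

date_blocks = pd.Series([1, 1, 2, 2, 3])       # hypothetical block labels
X = pd.DataFrame({'f': [10, 20, 30, 40, 50]})  # hypothetical features
y = pd.Series([0, 1, 0, 1, 1])                 # hypothetical targets
X_train, y_train, X_val, y_val = split_data(date_blocks, X, y)
print(X_train['f'].tolist(), X_val['f'].tolist())  # [10, 20, 30, 40] [50]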
Example #15
def _expand_elements(body):
    lens = Series(lmap(len, body))
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = ['']
    for ind, length in iteritems(not_max):
        body[ind] += empty * (lens_max - length)
Example #16
def float_formatter(column: pd.Series,
                    value: float,
                    minimize: bool = True) -> str:  # type: ignore
    """
    Returns a formatter to be used when printing data frames to LaTeX.
    """
    if value == (column.min() if minimize else column.max()):
        return f"\\textbf{{{value:,.2f}}}"
    return f"{value:,.2f}"
Example #17
 def get_max(s: pd.Series):
     ps = s.index
     m = s.max()
     if isinstance(ps[0], str):
         s = (s == m).astype(int).replace(0, np.nan)
         s[~s.isna()] = ps[~s.isna()]
         return s
     else:
         return (s == m).astype(int).replace(0, np.nan) * ps
Example #18
def create_category_series(category_series: pd.Series,
                           fill_gaps: bool = True,
                           fill_steps: int = 1):
    """Returns sorted distinct category values, optionally with gaps filled"""
    if fill_gaps:
        return list(
            range(category_series.min(),
                  category_series.max() + 1, fill_steps))
    return list(sorted(category_series.unique().tolist()))
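A short sketch with invented category values, assuming pandas is imported as pd; with fill_gaps=True the gaps between the smallest and largest integer categories are filled in.

cats = pd.Series([1, 3, 6])  # made-up categories
print(create_category_series(cats))                   # [1, 2, 3, 4, 5, 6]
print(create_category_series(cats, fill_gaps=False))  # sorted distinct values only: 1, 3, 6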
Example #19
def nbr_pages_parrecherche(Region, type):
    Result = getSoupFromUrl(getURL_Annonces(Region, 1, type))
    Balises_a = Result.find_all("a")
    Numeros_pages = Series(
        [int(A.text) for A in Balises_a if A.text.isnumeric()])
    if len(Numeros_pages) == 0:
        return 1
    else:
        return Numeros_pages.max()
Example #20
def normalize_column(column: pd.Series) -> pd.Series:
    """
    Normalizes a column of data and applies a visual scale to it.

    :param column: a column of numeric data
    :return: a normalized column of data
    """
    return ((column - column.min()) /
            (column.max() - column.min()) + .1) * VISUAL_SCALE
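A sketch only: VISUAL_SCALE is a module-level constant defined elsewhere in the source project, so the value below is assumed purely for illustration (pandas imported as pd).

VISUAL_SCALE = 10  # assumed value, not taken from the original project
col = pd.Series([0.0, 5.0, 10.0])  # invented data
print(normalize_column(col).tolist())  # approximately [1.0, 6.0, 11.0]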
Example #21
 def fit(self, x: pd.Series):
     if self.method == "Gaussian":
         self.mean, self.std = x.mean(), x.std()
     elif self.method == "RankGaussian":
         # TODO: store state
         pass
     elif self.method == "MinMax":
         self.min, self.max = x.min(), x.max()
     return self
Example #22
    def trajectory_is_constant(self, trajectory: pandas.Series) -> bool:
        """
			Determines whether a specific trajectory remains at a reletively constant frequency throughout the experiment.
			Trajectories must change in frequency by at least 10% over the course of the experiment.
		"""

        maximum_difference = trajectory.max() - trajectory.min()

        return maximum_difference <= self.filter_consistency
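A standalone sketch of the same check (pandas imported as pandas): filter_consistency is an attribute of the enclosing class in the original, so the 0.10 below is an assumed value mirroring the 10% threshold from the docstring.

trajectory = pandas.Series([0.12, 0.15, 0.14, 0.18])  # invented frequency trajectory
filter_consistency = 0.10                             # assumed threshold
print((trajectory.max() - trajectory.min()) <= filter_consistency)  # True: the largest swing is only 0.06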
Example #23
def __normolization_centroid(a: pd.Series, index: list) -> pd.Series:
    if a.name in index:
        minimum = a.min()
        maximum = a.max()
        centroid = (maximum - minimum) / 2

        a = (a - centroid) / (maximum - minimum)

    return a
Example #24
    def plot(self,
             forecast: np.ndarray,
             training_data: pd.Series,
             test_data: pd.Series = None,
             show: bool = False) -> matplotlib.figure.Figure:
        logger.debug('Plotting...')

        history = training_data
        timeframe = history.index[-1] - history.index[-2]
        forecast = pd.Series(forecast,
                             index=pd.date_range(start=history.index[-1] +
                                                 timeframe,
                                                 periods=len(forecast),
                                                 freq=timeframe))

        highest_datapoint = max(history.max(), forecast.max())
        lowest_datapoint = min(history.min(), forecast.min())
        if test_data is not None:
            highest_datapoint = max(highest_datapoint, test_data.max())
            lowest_datapoint = min(lowest_datapoint, test_data.min())

        fig, ax1 = plt.subplots()
        # ax1.set_ylim(bottom=lowest_datapoint - (lowest_datapoint / 2), top=highest_datapoint + (highest_datapoint / 2))

        ax1.plot(history, color='red', linewidth=config.plot.linewidth)
        if test_data is not None:
            ax1.plot(test_data,
                     color='orange',
                     linewidth=config.plot.linewidth)

        ax1.plot(forecast,
                 color='black',
                 linestyle=':',
                 linewidth=config.plot.linewidth + 0.2)
        ax1.set_title(
            f'{self.currency}/{self.to_currency} Price (Orange) vs {self.currency}/{self.to_currency} Price Forecast (Black)'
        )
        ax1.set_ylabel(f'{self.currency}/{self.to_currency} Price')
        ax1.set_xlabel('Date')

        legend = ax1.legend()

        texts = legend.get_texts()
        if test_data is not None:
            texts[0].set_text('Actual Price (training data)')
            texts[1].set_text('Actual Price (test data)')
            if len(texts) == 3:
                texts[2].set_text('Forecasted Price')
        else:
            texts[0].set_text('Actual Price')
            texts[1].set_text('Forecasted Price')

        if show:
            plt.show()

        return fig
Example #25
def normalize_column(df_column: Series) -> Series:
    # Just normalize numeric columns
    if df_column.dtype == np.float64 or df_column.dtype == np.int64:
        max_value = df_column.max()
        min_value = df_column.min()

        if min_value != max_value:
            df_column = (df_column - min_value) / (max_value - min_value)

    return df_column  # If min=max, normalization is undefined so I return the same column
Example #26
 def time_gap(dates: pd.Series, uom='weekly'):
     """
     Returns the relative time gap from the latest date. 
     The uom (unit of mesure) arguments defines the output units, by default weeks.
     If not specified, uom would be hourly
     """
     uom = (3600 * 24 * 7) if uom == 'weekly' else 3600  ## can be improved
     max_Date = dates.max()
     time_delta = max_Date - dates
     return 1 + np.floor(time_delta.dt.total_seconds() / uom)
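A quick sketch assuming the function is available at module level and that pandas and numpy are imported as pd and np; the dates are arbitrary. The newest date maps to 1 and older dates count up in whole weeks.

dates = pd.Series(pd.to_datetime(['2021-01-01', '2021-01-10', '2021-01-15']))  # invented dates
print(time_gap(dates).tolist())  # [3.0, 1.0, 1.0]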
Example #27
    def _check_inputs(
        s_test_pred: pd.Series,
        s_calib_pred: pd.Series,
        s_calib_actual: pd.Series,
    ) -> None:
        """ Check that inputs have valid names and could be proabilities """

        if (
            s_test_pred.min() < 0
            or s_test_pred.max() > 1
            or s_calib_pred.min() < 0
            or s_calib_pred.max() > 1
        ):
            raise RuntimeError(
                "Probabilities outside (0,1) range were passed to calibrate"
            )

        if not s_calib_pred.name == s_test_pred.name:
            warnings.warn(f"{s_calib_pred.name} != {s_test_pred.name}")
        if s_test_pred.isnull().sum() > 0:
            _log_missing_indices(s_test_pred)
            raise RuntimeError("Missing values in s_test_pred")
        if s_calib_pred.isnull().sum() > 0:
            _log_missing_indices(s_calib_pred)
            raise RuntimeError("Missing values in s_calib_pred")
        if s_calib_actual.isnull().sum() > 0:
            _log_missing_indices(s_calib_actual)
            raise RuntimeError("Missing values in s_calib_actual")

        if (
            not len(s_calib_pred) == len(s_calib_actual)
            or len(s_calib_pred.index.difference(s_calib_actual.index)) > 0
        ):
            raise RuntimeError(
                f"len(s_calib_pred): {len(s_calib_pred)} "
                f"len(s_calib_actual): {len(s_calib_actual)} "
                f"index diff: "
                f"{s_calib_pred.index.difference(s_calib_actual.index)}"
                f"s_calib_pred.head() : {s_calib_pred.head()}"
                f"s_calib_pred.tail() : {s_calib_pred.tail()}"
                f"s_calib_actual.head() : {s_calib_actual.head()}"
                f"s_calib_actual.tail() : {s_calib_actual.tail()}"
            )
Example #28
def _assign_bins(values: pd.Series, no_bins, column_names) -> pd.DataFrame:
    """
    Assigns values to bins [1; no_bins]
    :return: DataFrame with three columns: bin, left_bound, right_bound
    """
    limits = np.linspace(values.min(), values.max(), no_bins)
    return pd.DataFrame(
        np.array([_find_bin(limits, val) for val in values.values]),
        columns=column_names
    )
Example #29
 def from_dataframe(cls, column_name: str, data: pd.Series,
                    data_type: DataType):
     assert data_type in cls.accepted_types
     c = cls(column_name,
             data.min(),
             data.max(),
             data_type,
             has_missing=data.hasnans)
     c._data = data
     return c
Example #30
def calculate_row_max_arm(row: pd.Series) -> str:
    '''
    Finds the winning probability for a given row
    :param row:
    :return:
    '''
    row = row.squeeze()
    max_value_arm = str(row[row == row.max()].index[0])

    return max_value_arm
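Hypothetical arm probabilities (pandas imported as pd); the function returns the index label of the largest value as a string.

row = pd.Series({'arm_a': 0.2, 'arm_b': 0.7, 'arm_c': 0.1})  # made-up win probabilities
print(calculate_row_max_arm(row))  # arm_b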
Example #31
    def setup(
        self,
        data: Series,
        prop: Property,
        axis: Axis | None = None,
    ) -> Scale:

        new = copy(self)
        forward, inverse = self._get_transform()

        mpl_scale = self._get_scale(data.name, forward, inverse)

        if axis is None:
            axis = PseudoAxis(mpl_scale)
            axis.update_units(data)

        mpl_scale.set_default_locators_and_formatters(axis)

        normalize: Optional[Callable[[ArrayLike], ArrayLike]]
        if prop.normed:
            if self.norm is None:
                vmin, vmax = data.min(), data.max()
            else:
                vmin, vmax = self.norm
            vmin, vmax = axis.convert_units((vmin, vmax))
            a = forward(vmin)
            b = forward(vmax) - forward(vmin)

            def normalize(x):
                return (x - a) / b

        else:
            normalize = vmin = vmax = None

        forward_pipe = [
            axis.convert_units, forward, normalize,
            prop.get_mapping(new, data)
        ]

        def spacer(x):
            return np.min(np.diff(np.sort(x.dropna().unique())))

        # TODO make legend optional on per-plot basis with ScaleSpec parameter?
        if prop.legend:
            axis.set_view_interval(vmin, vmax)
            locs = axis.major.locator()
            locs = locs[(vmin <= locs) & (locs <= vmax)]
            labels = axis.major.formatter.format_ticks(locs)
            legend = list(locs), list(labels)

        else:
            legend = None

        scale_type = self.__class__.__name__.lower()
        return Scale(forward_pipe, spacer, legend, scale_type, mpl_scale)
Example #32
    def test_min_max_numeric_only(self):
        # TODO deprecate numeric_only argument for Categorical and use
        # skipna as well, see GH25303
        cat = Series(Categorical(
            ["a", "b", np.nan, "a"], categories=['b', 'a'], ordered=True))

        _min = cat.min()
        _max = cat.max()
        assert np.isnan(_min)
        assert _max == "a"

        _min = cat.min(numeric_only=True)
        _max = cat.max(numeric_only=True)
        assert _min == "b"
        assert _max == "a"

        _min = cat.min(numeric_only=False)
        _max = cat.max(numeric_only=False)
        assert np.isnan(_min)
        assert _max == "a"
Example #33
def bucketize_data(column_data: pd.Series, num_buckets: int) -> List[int]:
    min_val = column_data.min()
    max_val = column_data.max()
    bucket_size = (max_val - min_val) / num_buckets
    boundries = []
    boundry = min_val
    while (len(boundries) + 1) < num_buckets:
        boundry += bucket_size
        boundries.append(round(boundry + 0.000001))

    return boundries
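Invented inputs (pandas imported as pd); the helper returns num_buckets - 1 interior boundaries, each nudged by +0.000001 before rounding.

col = pd.Series([0, 10, 20, 30, 40])  # made-up column
print(bucketize_data(col, num_buckets=4))  # boundaries land at 10, 20 and 30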
Example #34
 def test_min_max_dt64_api_consistency_empty_df(self):
     # check DataFrame/Series api consistency when calling min/max on an empty
     # DataFrame/Series.
     df = DataFrame(dict(x=[]))
     expected_float_series = Series([], dtype=float)
     # check axis 0
     assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
     assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())
     # check axis 1
     tm.assert_series_equal(df.min(axis=1), expected_float_series)
     tm.assert_series_equal(df.max(axis=1), expected_float_series)
Example #35
    def test_sum_overflow_float(self, use_bottleneck, dtype):
        with pd.option_context("use_bottleneck", use_bottleneck):
            v = np.arange(5000000, dtype=dtype)
            s = Series(v)

            result = s.sum(skipna=False)
            assert result == v.sum(dtype=dtype)
            result = s.min(skipna=False)
            assert np.allclose(float(result), 0.0)
            result = s.max(skipna=False)
            assert np.allclose(float(result), v[-1])
Example #36
    def test_min_max(self):
        # unordered cats have no min/max
        cat = Series(Categorical(["a", "b", "c", "d"], ordered=False))
        with pytest.raises(TypeError):
            cat.min()
        with pytest.raises(TypeError):
            cat.max()

        cat = Series(Categorical(["a", "b", "c", "d"], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert _min == "a"
        assert _max == "d"

        cat = Series(Categorical(["a", "b", "c", "d"], categories=[
                     'd', 'c', 'b', 'a'], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert _min == "d"
        assert _max == "a"

        cat = Series(Categorical(
            [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a'
                                                    ], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert np.isnan(_min)
        assert _max == "b"

        cat = Series(Categorical(
            [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True))
        _min = cat.min()
        _max = cat.max()
        assert np.isnan(_min)
        assert _max == 1
Example #37
def test_name2num():
    num_to_test = 10
    str_len = 4
    letters = string.ascii_letters
    x = Series(dict(zip(letters, map(ord, letters))))
    base = 256 ** np.arange(str_len)
    mn = base.dot(np.repeat(x.min(), str_len))
    mx = base.dot(np.repeat(x.max(), str_len))

    for _ in xrange(num_to_test):
        name = random.sample(letters, str_len)
        num = name2num(name)
        assert mn <= num <= mx
Example #38
    def test_timedelta64_analytics(self):

        # index min/max
        dti = pd.date_range('2012-1-1', periods=3, freq='D')
        td = Series(dti) - pd.Timestamp('20120101')

        result = td.idxmin()
        assert result == 0

        result = td.idxmax()
        assert result == 2

        # GH#2982
        # with NaT
        td[0] = np.nan

        result = td.idxmin()
        assert result == 1

        result = td.idxmax()
        assert result == 2

        # abs
        s1 = Series(pd.date_range('20120101', periods=3))
        s2 = Series(pd.date_range('20120102', periods=3))
        expected = Series(s2 - s1)

        # FIXME: don't leave commented-out code
        # this fails as numpy returns timedelta64[us]
        # result = np.abs(s1-s2)
        # assert_frame_equal(result,expected)

        result = (s1 - s2).abs()
        tm.assert_series_equal(result, expected)

        # max/min
        result = td.max()
        expected = pd.Timedelta('2 days')
        assert result == expected

        result = td.min()
        expected = pd.Timedelta('1 days')
        assert result == expected
Example #39
  def get_summary_indicators_from_hist(sf, hist, int_index=False):
    seriesHist = Series(hist)
    maxs = {
      'freq': dict()
    }
    
    means = {'freq': seriesHist.mean()}
    medians = {'freq': seriesHist.median()}
    stds = {'freq': seriesHist.std()}
    maxs['freq']['freq'] = seriesHist.max()
    maxs['freq']['index'] = seriesHist.idxmax()
    index_total = 'NA'

    if int_index:
      index = seriesHist.index
      index = index.astype(int)
      index_list = index.tolist()
      index_total = sum([seriesHist[i] * index_list[i] for i in range(len(index_list))])
      index_series = Series(index_list)

      means['index'] = index_series.mean()
      medians['index'] = index_series.median()
      stds['index'] = index_series.std()
      
      maxs['freq']['index'] = int(maxs['freq']['index'])

      maxs['index'] = dict()
      maxs['index']['index'] = max(index_list)
      maxs['index']['freq'] = hist[str(maxs['index']['index'])]

    return {
      'means': means,
      'medians': medians,
      'stds': stds,
      'max': maxs,
      'index_total': index_total
    }
Example #40
def count_estims(dist, gamma = 0.95):
    '''
    Computes all estimates
    :param dist: distribution
    :param gamma: probability of realisation of value
    :return point: point estimates
    :return interval: confidence intervals for point estimates
    '''
    import numpy as np
    x = Series(dist)
    # Point estimates
    point = {}
    N = x.count()

    med_ = med_u(x)#
    med = np.median(dist)
    mad = x.mad()#
    mean_c = mean(dist)#
    var = np.var(dist)
    std = np.std(dist)
    mod = stats.mode(dist).mode#
    kurt = stats.kurtosis(dist)
    skew_my = stats.skew(dist)#
    Chi = 1/np.sqrt(np.abs(kurt))
    quantiles = np.round(x.quantile([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]), 5)
    W = std/mean_c;#

    quantiles_str = ""
    for index in quantiles.index:
        quantiles_str+='<p><pre>{0}\t{1}</pre></p>'.format(index, quantiles[index])

    point['MED'] = np.round(med, 5)
    point['MED*'] = np.round(med_, 5)
    point['MAD'] = np.round(mad, 5)
    point['Min'] = np.round(x.min(), 5)
    point['Max'] = np.round(x.max(), 5)
    point['Mean'] = np.round(mean_c, 5)
    point['S^2'] = np.round(var, 5)
    point['S'] = np.round(std, 5)
    point['MOD'] = np.round(mod, 5)
    point['E'] = np.round(kurt, 5)
    point['A'] = np.round(skew_my, 5)
    point['Chi'] = np.round(Chi, 5)
    point['X(alpha)'] = quantiles_str
    point['W'] = np.round(W, 5)



    # Interval estimates
    from scipy.stats import t, norm
    import numpy as np
    interval = {}
    if N < 61:
        l = t.ppf((1-gamma)/2, N-1)
        u = t.ppf(1-(1-gamma)/2, N-1)
    else:
        l = norm.ppf((1-gamma)/2)
        u = norm.ppf(1-(1-gamma)/2)
    X_cf = (mean_c+l*sigma_X(x), mean_c+u*sigma_X(x))
    A_cf = (skew_my + l * sigma_A(x), skew_my + u * sigma_A(x))
    S_cf = (std + l*sigma_S(x), std+u*sigma_S(x))
    E_cf = (kurt + l*sigma_E(x), kurt+u*sigma_E(x))
    if W < 1:
        v = l/np.sqrt(2*(N-1))
        W_cf = np.round((W/(1+v*np.sqrt(1+2*W**2)), W/(1-v*np.sqrt(1+2*W**2))), 5)
    else: W_cf = (None, None)

    interval['Mean'] = np.round(X_cf, 5)
    interval['S'] = np.round(S_cf, 5)
    interval['E'] = np.round(E_cf, 5)
    interval['A'] = np.round(A_cf, 5)
    interval['W'] = W_cf

    return point, interval
Example #41
    def source_data(self):
        
        st_date = self.stTrain
#        st_date = '2014-10-1'
        stD = date(int(st_date.split('-')[0]), int(st_date.split('-')[1]), int(st_date.split('-')[2]))
        if self.view and stD < datetime.datetime.strptime('2015-4-1',"%Y-%m-%d").date():
            raise RuntimeError('I know it sucks but we dont have view-count data for anytime before 2015-4-1!')
        if self.view:
            db_red = psycopg2.connect(host="***", database="***", port="***",
                                  user="******", password="******")
            db_red.autocommit = True
            df_red = pd.read_sql('''select date,sum(installs) as install, sum(pageviewcount) as view
                                from appstoredata_itunes_metrics where game='***' 
                                and country='%s' group by date;''' % pycountry.countries.get(alpha2=self.target).name, 
                                con=db_red)  
                            
            df_red['date'] = pd.to_datetime(df_red['date'])
            ts_view_target1 = Series(df_red.view.tolist(), 
                                     index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_install_target1 = Series(df_red.install.tolist(), 
                                        index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_view_target1) < (self.endP-stD).days :
                ts_view_target1[pd.to_datetime(st_date)] = 0
                ts_view_target1 = ts_view_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
                ts_install_target1[pd.to_datetime(st_date)] = 0
                ts_install_target1 = ts_install_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_view_target = (ts_view_target1)/(ts_view_target1.sum())
            ts_install_target = (ts_install_target1)/(ts_install_target1.sum())
        else:
            ts_view_target = []
            ts_view_target1 = []
            ts_install_target = []  
            ts_install_target1 = []
        
        db = MySQLdb.connect(
        host = '***', 
        user = '******', 
        passwd = '***', 
        db = '***', 
        port = '***')
        
        df_mysql = pd.read_sql('''select metrics_daily.date as date, dim_country.name as country,
                               sum(metrics_daily.value) as value, dim_channel.channel_type as type
                               from metrics_daily left join dim_channel on dim_channel.id = metrics_daily.channel_id 
                               left join dim_country on dim_country.id = metrics_daily.country_id where project_id=195 
                               and metrics_daily.platform_id=2 and metric_id in (5) group by date, type, country;''', con=db)  
                       
        
        df_mysql['date'] = pd.to_datetime(df_mysql['date'])
        all_data_target = df_mysql[df_mysql.country==self.target]
        org_data_target = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.target)]
        ts_org_target1 = Series(org_data_target.value.tolist(), 
                               index=org_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_all_target1 = Series(all_data_target.value.tolist(), 
                                index=all_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_org_target = (ts_org_target1)/(ts_org_target1.sum())
        ts_all_target = (ts_all_target1)/(ts_all_target1.sum())
        
        if self.baseorg:
            org_data_base = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.baseline)]
            ts_org_base1 = Series(org_data_base.value.tolist(), 
                                 index=org_data_base.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)   
            ts_org_base = (ts_org_base1-ts_org_base1.min())/(ts_org_base1.max()-ts_org_base1.min())
        else:
            ts_org_base = []
            ts_org_base1 = []
        
        if self.paid:
            paid_data_target = df_mysql[(df_mysql.type=='PAID') & (df_mysql.country==self.target)]
            ts_paid_target1 = Series(paid_data_target.value.tolist(),
                                    index=paid_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_paid_target1) < (self.endP-stD).days :
                ts_paid_target1[pd.to_datetime(st_date)] = 0
                ts_paid_target1 = ts_paid_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_paid_target = (ts_paid_target1)/(ts_paid_target1.sum())
        else:
            ts_paid_target = []
            ts_paid_target1 = []
            
        if self.rank:
            df_rank = pd.read_sql('''select date, max(1/sqrt(rank)) as bestRank from kabam_ranks_data_free where 
                                    country='%s' and device!='android'and game='***' 
                                    and category='Overall' group by date;''' % self.target, con=db)  
            
            df_rank['date'] = pd.to_datetime(df_rank['date'])
            ts_rank_target1 = Series(df_rank.bestRank.tolist(), 
                                     index=df_rank.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_rank_target1) < (self.endP-stD).days :
                ts_rank_target1[pd.to_datetime(st_date)] = 0
                ts_rank_target1 = ts_rank_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_rank_target = (ts_rank_target1)/(ts_rank_target1.sum())
        else:
            ts_rank_target = []
            ts_rank_target1 = []
        
#        endog = ts_org_target1
#        endog = ts_install_target
        endog = ts_all_target1
        
        Tlist = [self.paid, self.baseorg, self.view, self.rank]
        dff = DataFrame()
        tList = [ts_paid_target, ts_org_base, ts_view_target, ts_rank_target]
        tlist = ['paid', 'base', 'view', 'rank']
        for i in xrange(0,len(Tlist)):
            if Tlist[i]:
                dff[tlist[i]] = tList[i]
        if dff.empty:
            raise RuntimeError('Where is your exog variable? Do you need a coffee or something?!')
                
        exog = dff
        
        return (endog, exog)
    
Example #42
    l = Series((HTTP_DF['origin']))
    l = l.value_counts()

    clear_scr()
    
    print "\n"
    print "Questions"
    print "---------"
    print "Question:1."
    print "-----------"
    print "Which hostname or IP address made the most requests?"
    print "Answer:"
    print "-------"
    print "The MAXIMUM number of requests were made by '%s'.\nFrom this address, a total of %d requests were made" % (l.idxmax(),l.max())
    print "\n"

    l = HTTP_DF.groupby(['origin'])['bytes_transferred'].sum()
    print "Question:2."
    print "-----------"
    print "Which hostname or IP address received the most total bytes from the server?  How many bytes did it receive?"
    print "Answer:"
    print "-------"
    print "The MAXIMUM number of bytes were received by '%s'. This address has received a total of %d bytes." % (l.idxmax(), l.max())
    print "\n"
    

    l = Series((HTTP_DF['hour']))
    l = l.value_counts()
Example #43
    datad = get_dummies(datas, prefix=col, prefix_sep='__')
    data[datad.columns] = datad

# drop non-predictor columns and fill in missing values with means
data = data.drop(drop_cols + category_cols, axis=1)
data = data.fillna(data.mean())

rf = RandomForestClassifier(
    n_estimators=1000,
    oob_score=True,
    random_state=42,
    class_weight='balanced_subsample',
    verbose=False,
    n_jobs=-1
)

# model using all variables
evals = cv_results(x=data, y=outcome, model=rf, nfolds=10, nparts=20, verbose=True)

# get importances and keep only those variables at least one-tenth as important as the most important variable
_ = rf.fit(data, outcome)
importance = Series(rf.feature_importances_, index=data.columns).sort_values(ascending=False)
importance2 = importance / importance.max()
most_important = importance[importance2.gt(0.1)]

# model using only most important variables
evals2 = cv_results(x=data.loc[:, most_important.index], y=outcome, model=rf, nfolds=10, nparts=20, verbose=True)

# compare both models
eval_df = evals.merge(evals2, left_on='prob', right_on='prob', suffixes=['_full', '_imp'])
eval_df['renewed_pct_diff'] = eval_df['renewed_pct_full'] - eval_df['renewed_pct_imp']
Example #44
frame
'''

   A  B  C
a  0  1  2
b  3  4  5
c  6  7  8
'''
print
frame.max()
'''
A    6
B    7
C    8
'''
f = lambda x: x.max() - x.min()
print
frame.apply(f)  # applied to each column
'''
A    6
B    6
C    6
'''
print
frame.apply(f, axis=1)  # applied to each row
'''
a    2
b    2
c    2
'''
Example #45
s.name = 'name'

# length
assert len(s) == s.size == s.shape[0]

# number of elements that are not NaN
s.count()

# get an array of unique values
s.unique()

# count of each distinct non-NaN value (like count(*) ... group by), returns a Series
s.value_counts()

# aggregation and statistics
s.max()
s.mean()
s.var()

# location of the max element
s.idxmax()

# rank
s = Series([4, 1, 2, 5])
s.rank()                     # returns ranks [3.0, 1.0, 2.0, 4.0]

# plot
s.plot()
plt.show()

# translate ##################################################