Example #1
 def _fit_core(self, s: pd.Series) -> None:
     if s.count() == 0:
         raise RuntimeError("Valid values are not enough for training.")
     R = pd.Series(np.zeros(len(s)), index=s.index)
     n = s.count()
     Lambda = pd.Series(np.zeros(len(s)), index=s.index)
     s_copy = s.copy()
     i = 0
     while s_copy.count() > 0:
         i += 1
         ind = (s_copy - s_copy.mean()).abs().idxmax()
         R[ind] = (abs(s_copy[ind] - s_copy.mean()) /
                   s_copy.std() if s_copy.std() > 0 else 0)
         s_copy[ind] = np.nan
         p = 1 - self.alpha / (2 * (n - i + 1))
         Lambda[ind] = ((n - i) * t.ppf(p, n - i - 1) / np.sqrt(
             (n - i - 1 + t.ppf(p, n - i - 1)**2) * (n - i + 1)))
         if R[ind] <= Lambda[ind]:
             break
     self._normal_sum = s[Lambda >= R].sum()
     self._normal_squared_sum = (s[Lambda >= R]**2).sum()
     self._normal_count = s[Lambda >= R].count()
     i = 1
     n = self._normal_count + 1
     p = 1 - self.alpha / (2 * (n - i + 1))
     self._lambda = ((n - i) * t.ppf(p, n - i - 1) / np.sqrt(
         (n - i - 1 + t.ppf(p, n - i - 1)**2) * (n - i + 1)))
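The loop above implements the generalized ESD (extreme studentized deviate) test; a standalone sketch of the critical value formula it uses (scipy/numpy assumed available, n and alpha are illustrative):

import numpy as np
from scipy.stats import t

def esd_critical_value(n: int, i: int, alpha: float = 0.05) -> float:
    # critical value lambda_i for the i-th most extreme point (i starts at 1)
    p = 1 - alpha / (2 * (n - i + 1))
    t_val = t.ppf(p, n - i - 1)
    return (n - i) * t_val / np.sqrt((n - i - 1 + t_val ** 2) * (n - i + 1))

print(esd_critical_value(n=50, i=1))  # the largest deviation gets the strictest threshold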
Example #2
def describe_unsupported(series: pd.Series, series_description: dict):
    """Describe an unsupported series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """

    # number of observations in the Series
    leng = len(series)
    # number of non-NaN observations in the Series
    count = series.count()
    # number of infinite observations in the Series
    n_infinite = count - series.replace([np.inf, -np.inf], np.nan).count()

    results_data = {
        "count": count,
        "p_missing": 1 - count * 1.0 / leng,
        "n_missing": leng - count,
        "p_infinite": n_infinite * 1.0 / leng,
        "n_infinite": n_infinite,
        "memorysize": series.memory_usage(),
    }

    return results_data
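A minimal usage sketch for the helper above on hypothetical data (numpy/pandas assumed imported as np/pd):

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.inf, 4.0])
desc = describe_unsupported(s, series_description={})
print(desc["n_missing"], desc["n_infinite"])  # 1 1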
Example #3
    def test_count_level_without_multiindex(self):
        ser = Series(range(3))

        msg = "Series.count level is only valid with a MultiIndex"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning):
                ser.count(level=1)
Example #4
def to_deviation(series: pandas.Series, threshold_deviation_user_count: Optional[int] = None) -> pandas.Series:
    if series.count() == 0 or (
        threshold_deviation_user_count is not None and series.count() <= threshold_deviation_user_count
    ):
        return pandas.Series([numpy.nan] * len(series))
    else:
        std = series.std(ddof=0)
        mean = series.mean()
        if std != 0.0:
            return series.map(lambda x: (x - mean) / std * 10 + 50)
        else:
            return series.map(lambda x: 50 if not numpy.isnan(x) else numpy.nan)
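A minimal usage sketch: the function maps scores to a deviation value (mean 50, standard deviation 10); the data here is illustrative:

import numpy
import pandas

scores = pandas.Series([40, 50, 60, 70, numpy.nan])
print(to_deviation(scores))  # non-NaN entries mapped to (x - mean) / std * 10 + 50, NaN preserved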
Example #5
    def filter_edges(self, condition: pd.Series):
        if self._edges.shape[0] != condition.count():
            msg = 'Number of edges {edges} is different from the length of the condition array {condition}'.format(
                edges=self._edges.shape[0], condition=condition.count())
            raise ValueError(msg)

        filtered_edges = self._edges[condition]
        return SpatioTemporalNetwork(nodes=self._nodes,
                                     edges=filtered_edges,
                                     origin=self._origin,
                                     destination=self._destination,
                                     node_id=self._node_id)
Example #6
    def test_count_level_series(self):
        index = MultiIndex(
            levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]]
        )

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype("f8"), expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype("f8"), expected.reindex(result.index).fillna(0))
Example #7
def energy_mean_nan(nb, harmonics=2, method='ffill'):
    """Compute the energy of this NetBlock.

  Returns a dictionary:
  e24 - energy at 1/(24hr) and selected harmonics
  te - total energy
  ra = e24/te
  nrows - Number of datapoints in the sample
  nan - Number of time bins for which values had to be interpolated
  """

    # Note that "mean" is not automatically correct.  Consider median, others
    timeseries = Series([nb.data[tt].mean() for tt in nb.TBall])
    nan = nb.TB.bucketcount - timeseries.count()
    if method:
        timeseries = timeseries.fillna(method=method)
        # TODO (mattmathis) explore limit= options
    else:
        # zero fill is only correct for energy algebra
        timeseries = [0.0 if np.isnan(tv) else tv for tv in timeseries]
    e24, te = energy.power_ratio(timeseries, len(timeseries), harmonics)
    try:
        ra = e24 / te
    except Exception:
        # failed to fill all NaNs or other failure
        ra = float('nan')
    return {'e24': e24, 'te': te, 'ra': ra, "nrows": len(nb.data), 'nan': nan}
Example #8
def cluster(id, data, dataset):
    from pandas import Series, DataFrame
    id_index = Series(id.tolist())
    from cluster import density_cluster
    N = id_index.count()
    distance = compute_distance(data)
    distance_c = min_distance(distance)
    # id.values -> the corresponding keys
    index_id = Series(id_index.index, index=id_index.values)
    log.warn("the init distance_c is: " + str(distance_c))
    # to create the base index table
    # build the corresponding index, used to control the contents of rho, delta and index
    rho_id = rho_function(index_id, distance, distance_c=distance_c)
    #delta_id, data_id = delta_function(id_index, index_id, rho_id, distance)
    # gamma=rho*delta
    threshold = DataFrame([], columns=['H', 'd_c', 'cluster'])
    threshold = ent_dc_step_by_step(id_index,
                                    index_id,
                                    data,
                                    threshold=threshold,
                                    distance=distance,
                                    distance_c=distance_c,
                                    dataset=dataset)
    r = threshold
    # log.debug("rho:\n" + str(rho))
    log.debug("threshold\n" + str(DataFrame(threshold)))
    return r
Example #9
def basic_stat_map(s: pd.Series) -> dict:
    return {
        "mean": s.mean(),
        "median": s.median(),
        "std": s.std(),
        "count": s.count(),
    }
Example #10
 def _entropy(self, column: pd.Series):
     n = column.count()
     valueCounts = column.value_counts().tolist()
     result = 0
     for x in valueCounts:
         result += (-x / n) * log2(x / n)
     return result
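A standalone sketch of the same Shannon entropy computation (the method above assumes `from math import log2` at module level):

from math import log2
import pandas as pd

def series_entropy(column: pd.Series) -> float:
    n = column.count()
    return sum((-x / n) * log2(x / n) for x in column.value_counts())

print(series_entropy(pd.Series(["a", "a", "b", "b"])))  # 1.0 bit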
Example #11
def get_cumsum(data: pd.Series):
    cumsum = 0
    i = 0
    while True:
        yield cumsum
        cumsum += data.iloc[i]
        i = (i + 1) % data.count()
Example #12
    def _box_stats(ds: pd.Series,
                   med: bool = True,
                   iqr: bool = True,
                   count: bool = True) -> str:
        """
        Create the metric part with stats of the box (axis) caption

        Parameters
        ----------
        ds: pd.Series
            data on which stats are found
        med: bool
        iqr: bool
        count: bool
            statistics

        Returns
        -------
        stats: str
            caption with summary stats
        """
        # interquartile range (use a new name so the `iqr` flag is not shadowed)
        iqr_val = ds.quantile(q=[0.75, 0.25]).diff()
        iqr_val = abs(float(iqr_val.loc[0.25]))

        met_str = []
        if med:
            met_str.append('Median: {:.3g}'.format(ds.median()))
        if iqr:
            met_str.append('IQR: {:.3g}'.format(iqr_val))
        if count:
            met_str.append('N: {:d}'.format(ds.count()))
        stats = '\n'.join(met_str)

        return stats
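A usage sketch, treating _box_stats above as a plain function on hypothetical data:

import pandas as pd

ds = pd.Series([1.0, 2.0, 2.5, 3.0, 10.0])
print(_box_stats(ds))  # e.g. "Median: 2.5\nIQR: 1\nN: 5"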
Example #13
def count_fraction_of_true(series: pd.Series):
    # We are assuming this is called with a Boolean series
    if series.dtype != bool:
        raise ValueError
    num_true = series.sum()
    total = float(series.count())
    return num_true / total if total > 0.0 else 0.0, num_true
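A minimal usage sketch with a Boolean Series (the dtype check above requires bool):

import pandas as pd

flags = pd.Series([True, False, True, True])
fraction, n_true = count_fraction_of_true(flags)
print(fraction, n_true)  # 0.75 3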
Example #14
def get_counts(series: pd.Series) -> dict:
    # The value_counts() function is used to get a Series containing counts of unique values.
    value_counts_with_nan = series.value_counts(dropna=False)

    # Fix for data with only a single value; reset_index was flipping the data returned
    if len(value_counts_with_nan) == 1:
        if pd.isna(value_counts_with_nan.index[0]):
            value_counts_without_nan = pd.Series()
        else:
            value_counts_without_nan = value_counts_with_nan
    else:
        value_counts_without_nan = (
            value_counts_with_nan.reset_index().dropna().set_index("index").iloc[:, 0]
        )
    # print(value_counts_without_nan.index.dtype.name)

    # IGNORING NAN FOR NOW AS IT CAUSES ISSUES [FIX]
    # distinct_count_with_nan = value_counts_with_nan.count()

    distinct_count_without_nan = value_counts_without_nan.count()
    return {
        "value_counts_without_nan": value_counts_without_nan,
        "distinct_count_without_nan": distinct_count_without_nan,
        "num_rows_with_data": series.count(),
        "num_rows_total": len(series),
        # IGNORING NAN FOR NOW AS IT CAUSES ISSUES [FIX]:
        # "value_counts_with_nan": value_counts_with_nan,
        # "distinct_count_with_nan": distinct_count_with_nan,
    }
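A hypothetical usage showing the returned summary for a Series with one missing value:

import numpy as np
import pandas as pd

s = pd.Series(["a", "b", "b", np.nan])
summary = get_counts(s)
print(summary["distinct_count_without_nan"])  # 2
print(summary["num_rows_with_data"], summary["num_rows_total"])  # 3 4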
Example #15
    def describe_unsupported(series: pd.Series, series_description: dict):
        """Describe an unsupported series.
        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.
        Returns:
            A dict containing calculated series description values.
        """

        # number of observations in the Series
        length = len(series)

        # number of non-NaN observations in the Series
        count = series.count()

        results_data = {
            "n": length,
            "count": count,
            "p_missing": 1 - count / length,
            "n_missing": length - count,
            "memory_size": series.memory_usage(deep=config["memory_deep"].get(bool)),
        }

        return results_data
Example #16
 def _getSeriesScoreMultipliedByCount(self,
                                      targetSeries: pd.Series) -> float:
     totalCount = targetSeries.count()
     trueCount = targetSeries.sum()
     falseCount = totalCount - trueCount
     return totalCount - (trueCount * trueCount +
                          falseCount * falseCount) / totalCount
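The score above equals the non-null count times the Gini impurity of the Boolean target; a quick standalone check (hypothetical data, treating the method as a plain computation):

import pandas as pd

target = pd.Series([True, True, False, True])
n, n_true = target.count(), target.sum()
n_false = n - n_true
score = n - (n_true * n_true + n_false * n_false) / n
gini = 1 - (n_true / n) ** 2 - (n_false / n) ** 2
assert abs(score - n * gini) < 1e-12
print(score)  # 1.5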
Example #17
def series_datatype(data: pd.Series,
                    values: Optional[List[str]] = None) -> DataType:
    """
    Determine whether the given data series is categorical or continuous using a set of rules.

    :param data: data for facet/label/predicted_label columns
    :param values: list of facet or label values provided by user
    :return: Enum {CATEGORICAL|CONTINUOUS}
    """
    # if datatype is boolean or categorical we return data as categorical
    data_type = DataType.CATEGORICAL
    data_uniqueness_fraction = divide(data.nunique(), data.count())
    logger.info(f"data uniqueness fraction: {data_uniqueness_fraction}")
    # Assumption: user will give single value for threshold currently
    # Todo: fix me if multiple thresholds for facet or label are supported
    if data.dtype.name == "category" or (isinstance(values, list)
                                         and len(values) > 1):
        return data_type
    if data.dtype.name in ["str", "string", "object"]:
        # cast the dtype to int, if exception is raised data is categorical
        casted_data = data.astype("int64", copy=True, errors="ignore")
        if np.issubdtype(
                casted_data.dtype, np.integer
        ) and data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
            data_type = DataType.CONTINUOUS  # type: ignore
    elif np.issubdtype(data.dtype, np.floating):
        data_type = DataType.CONTINUOUS
    elif np.issubdtype(data.dtype, np.integer):
        # Current rule: if data has more than 5% unique values then it is continuous
        # Todo: needs to be enhanced; this rule doesn't always determine the datatype correctly
        if data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
            data_type = DataType.CONTINUOUS
    return data_type
Example #18
def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    names = ["count", "unique", "top", "freq"]
    objcounts = data.value_counts()
    count_unique = len(objcounts[objcounts != 0])
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        dtype = None
    else:
        # If the DataFrame is empty, set 'top' and 'freq' to None
        # to maintain output shape consistency
        top, freq = np.nan, np.nan
        dtype = "object"

    result = [data.count(), count_unique, top, freq]

    from pandas import Series

    return Series(result, index=names, name=data.name, dtype=dtype)
Example #19
def multi_processing_cluster(job, work, df, id, data):
    # threshold = DataFrame([], columns=['H', 'd_c', 'cluster'])
    from pandas import Series, DataFrame
    id_index = Series(id.tolist())
    from cluster import density_cluster
    N = id_index.count()
    distance = compute_distance(data)
    distance_c = min_distance(distance)
    max = max_distance(distance, distance_c)
    max = average_task(max, job)
    log.debug(str("max:") + str(max))
    distance_c = distance_c + work * max
    max_distance_c = distance_c + max
    # id.values -> the corresponding keys
    index_id = Series(id_index.index, index=id_index.values)
    log.warn("work id " + str(work) + " the starting distance_c is: " +
             str(distance_c) + ". working under" + str(max_distance_c))
    # to create the base index table
    # build the corresponding index, used to control the contents of rho, delta and index
    rho_id = rho_function(index_id, distance, distance_c=distance_c)
    delta_id, data_id = delta_function(id_index, index_id, rho_id, distance)
    # gamma=rho*delta
    threshold = df
    threshold = multi_ent_dc_step_by_step(id_index,
                                          index_id,
                                          threshold=threshold,
                                          distance=distance,
                                          distance_c=distance_c,
                                          max_distance_c=max_distance_c)
    r = threshold
    # log.debug("rho:\n" + str(rho))
    log.debug("worker " + str(work) + " has finished. threshold\n" +
              str(DataFrame(threshold)))
    return r
Example #20
    def describe_supported(series: pd.Series,
                           series_description: dict) -> dict:
        """Describe a supported series.
        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.
        Returns:
            A dict containing calculated series description values.
        """

        # number of observations in the Series
        length = len(series)

        # number of non-NaN observations in the Series
        count = series.count()

        distinct_count = series_description["distinct_count_without_nan"]
        value_counts = series_description["value_counts_without_nan"]
        unique_count = value_counts.where(value_counts == 1).count()

        stats = {
            "n": length,
            "count": count,
            "n_distinct": distinct_count,
            "p_distinct": distinct_count / count,
            "p_missing": 1 - (count / length),
            "n_missing": length - count,
            "is_unique": unique_count == count,
            "n_unique": unique_count,
            "p_unique": unique_count / count,
            "memory_size":
            series.memory_usage(config["memory_deep"].get(bool)),
        }

        return stats
Example #21
        def modified_qcut(series: Series, q: int, labels: List[Any]):
            """
            修正的qcut

            Parameters
            ----------
            series: Series
                    因子排序值rank

            q: int
               分组组数

            labels: List[Any]
                    标签, 如分为5组, [1,2,3,4,5]

            Returns
            -------
            组别: Series
            """

            # 如果有效数据个数小于组别, 则全部为0, 即不持仓
            if series.count() < q:
                new_series = pd.Series([0.0] * len(series))
                new_series.index = series.index
                return new_series
            else:
                return pd.qcut(x=series, q=q, labels=labels)
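A usage sketch, assuming modified_qcut is available at module level: with enough valid values it behaves like pd.qcut, otherwise it returns all zeros:

import pandas as pd

ranks = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
print(modified_qcut(ranks, q=5, labels=[1, 2, 3, 4, 5]).tolist())                   # [1, 2, 3, 4, 5]
print(modified_qcut(pd.Series([1.0, 2.0]), q=5, labels=[1, 2, 3, 4, 5]).tolist())   # [0.0, 0.0]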
Example #22
    def test_count_level_series(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))
Example #23
def describe_numeric_1d(series: Series,
                        percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    # error: Argument 1 to "format_percentiles" has incompatible type "Sequence[float]";
    # expected "Union[ndarray, List[Union[int, float]], List[float], List[Union[str,
    # float]]]"
    formatted_percentiles = format_percentiles(
        percentiles)  # type: ignore[arg-type]

    stat_index = ["count", "mean", "std", "min"
                  ] + formatted_percentiles + ["max"]
    d = ([series.count(),
          series.mean(),
          series.std(),
          series.min()] + series.quantile(percentiles).tolist() +
         [series.max()])
    return Series(d, index=stat_index, name=series.name)
Example #24
    def test_count_level_series(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2],
                                   [2, 0, 1, 1, 2]])

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))
Example #25
    def true_sequence_count(series: Series):
        index = series.where(series == False).last_valid_index()

        if index is None:
            return series.count()
        else:
            s = series[series.index > index]
            return s.count()
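A minimal check of the trailing-True-run behaviour, treating the helper above as a plain function:

import pandas as pd

print(true_sequence_count(pd.Series([True, False, True, True])))  # 2 -- trailing run of True
print(true_sequence_count(pd.Series([True, True, True])))         # 3 -- no False at all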
Example #26
    def test_count_categorical(self):

        ser = Series(
            Categorical([np.nan, 1, 2, np.nan],
                        categories=[5, 4, 3, 2, 1],
                        ordered=True))
        result = ser.count()
        assert result == 2
Example #27
    def test_count_level_series(self):
        index = MultiIndex(
            levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]],
            codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]],
        )

        ser = Series(np.random.randn(len(index)), index=index)

        result = ser.count(level=0)
        expected = ser.groupby(level=0).count()
        tm.assert_series_equal(result.astype("f8"),
                               expected.reindex(result.index).fillna(0))

        result = ser.count(level=1)
        expected = ser.groupby(level=1).count()
        tm.assert_series_equal(result.astype("f8"),
                               expected.reindex(result.index).fillna(0))
Example #28
def _trapezium_integration_variable(d_ti: pd.Series) -> Optional[float]:
    """Gapfill version of trap int - will fill out"""
    # Clear no numbers
    d_ti = d_ti.dropna()

    if d_ti.count() == 0:
        return None

    # One entry
    if d_ti.count() == 1:
        return d_ti.iloc[0] * 0.5

    # Fall back on average but warn to check data
    if d_ti.count() <= 3:
        d_sum = d_ti.sum()

        if d_sum == 0:
            return 0

        if d_ti.count() == 0:
            return 0

        return 0.5 * d_sum / d_ti.count()

    bucket_middle = d_ti.count() - 2

    bucket_middle_weights = [1] + [2] * bucket_middle + [1]

    weights = d_ti.values * bucket_middle_weights

    weights_sum = weights.sum()

    bucket_energy = 0.5 * weights_sum / ((d_ti.count() - 1) * 2)

    return bucket_energy
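A usage sketch on hypothetical interval readings (one missing value is dropped before weighting):

import numpy as np
import pandas as pd

readings = pd.Series([10.0, np.nan, 12.0, 14.0, 16.0])
print(_trapezium_integration_variable(readings))  # 6.5 for these four valid points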
Example #29
    def _getTrueAndFalseRatios(self, series: pd.Series) -> (float, float):
        totalCount = series.count()

        trueCount = series.sum()
        falseCount = totalCount - trueCount

        trueRatio = trueCount / totalCount
        falseRatio = falseCount / totalCount

        return trueRatio, falseRatio
Example #30
    def __init__(self, level: int, targets: pd.Series, cut=None):
        self.level = level
        self.cut = cut

        self.sampleCount = targets.count()
        self.trueVals = targets.sum()
        self.falseVals = self.sampleCount - self.trueVals

        self.lessThanNode = None
        self.greaterThanOrEqualNode = None
Example #31
    def test_count(self, datetime_series):
        assert datetime_series.count() == len(datetime_series)

        datetime_series[::2] = np.NaN

        assert datetime_series.count() == np.isfinite(datetime_series).sum()

        mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, np.nan, 1, 2]])
        ts = Series(np.arange(len(mi)), index=mi)

        left = ts.count(level=1)
        right = Series([2, 3, 1], index=[1, 2, np.nan])
        tm.assert_series_equal(left, right)

        ts.iloc[[0, 3, 5]] = np.nan
        tm.assert_series_equal(ts.count(level=1), right - 1)

        # GH#29478
        with pd.option_context("use_inf_as_na", True):
            assert pd.Series([pd.Timestamp("1990/1/1")]).count() == 1
Example #32
 def _fit_core(self, s: pd.Series) -> None:
     if s.count() == 0:
         raise RuntimeError("Valid values are not enough for training.")
     if self.high is None:
         self.abs_high_ = float("inf")
     else:
         self.abs_high_ = s.quantile(self.high)
     if self.low is None:
         self.abs_low_ = -float("inf")
     else:
         self.abs_low_ = s.quantile(self.low)
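A standalone sketch of the same idea, deriving absolute low/high bounds from training-series quantiles (the high/low values here are illustrative, not the detector's defaults):

import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
high, low = 0.95, 0.05
abs_high_ = s.quantile(high) if high is not None else float("inf")
abs_low_ = s.quantile(low) if low is not None else -float("inf")
print(abs_low_, abs_high_)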
Example #33
    def Phases(self):
        rows = []
        for prefix in ('parse', 'compile', 'run'):
            for name,callTimes in self.times[prefix].iteritems():
                s = Series(callTimes)
                callCount = s.count()
                meanTime = s.mean()
                totalTime = s.sum()
                rows.append(("%s:%s" % (prefix,name),callCount,meanTime,totalTime))

        columns = ('PHASE', 'COUNT', 'MEAN', 'TOTAL')
        return DataFrame.from_records(rows, columns=columns, index=('PHASE'))
Example #34
    def Calls(self):
        rows = []
        for name,callTimes in self.times['call'].iteritems():
            s = Series(callTimes)
            func,loc = formatName(name)
            callCount = s.count()
            meanTime = s.mean()
            totalTime = s.sum()
            rows.append((func,loc,callCount,meanTime,totalTime))

        columns = ('FUNCTION', 'SOURCE', 'COUNT', 'MEAN', 'TOTAL')
        return DataFrame.from_records(rows, columns=columns, index=('FUNCTION', 'SOURCE'))
Example #35
def _tile_inds(s, bins, labels=False, retbins=True, infinite=True):
    #  NOTE: inf only happens when explicitly setting bins

    # short circuit empty series
    s = Series(s)
    if s.count() == 0:
        return np.repeat(None, len(s))

    if not np.iterable(bins):
        ind, label = cut(s, bins, retbins=retbins, labels=labels)
        # for now, pandas base cut doesn't support infinite ranges
        # so it bases first bin at 0 where we base on 1, and 0 is 
        # [-inf, first] for us
        ind = ind + 1
    else:
        bins = np.asarray(bins)
        #if (np.diff(bins) < 0).any():
        #    raise ValueError('bins must increase monotonically.')
        ind, label = inf_bins_to_cuts(s, bins)

    # build out ranges
    ranges = []
    ranges.append(NumRange(-inf, label[0]))
    for x in range(len(label)-1):
       nr = NumRange(label[x], label[x+1]) 
       ranges.append(nr)
    ranges.append(NumRange(label[-1], inf))

    if not infinite:
        na_mask = (ind == 0) | (ind == len(bins))
        np.putmask(ind, na_mask, -1)

    #ind = ind.astype(int)
    ind[s.isnull().values] = -1
    # fastpath=True to skip the hashmap indexing. 
    # The code generator will check identity, which won't match because
    # ind is an int position vector and ranges is a list of objects.
    # if fastpath is off, then it'll look like none of the values match
    return Categorical(ind, ranges, fastpath=True)
Example #36
def _tile_inds(s, bins, labels=False, retbins=True, infinite=True):
    #  NOTE: inf only happens when explicitly setting bins

    # short circuit empty series
    s = Series(s)
    if s.count() == 0:
        return np.repeat(None, len(s))

    if not np.iterable(bins):
        ind, label = cut(s, bins, retbins=retbins, labels=labels)
        # for now, pandas base cut doesn't support infinite ranges
        # so it bases first bin at 0 where we base on 1, and 0 is 
        # [-inf, first] for us
        ind = ind + 1
    else:
        bins = np.asarray(bins)
        #if (np.diff(bins) < 0).any():
        #    raise ValueError('bins must increase monotonically.')
        ind, label = inf_bins_to_cuts(s, bins)
    

    # build out ranges
    ranges = []
    ranges.append(NumRange(-inf, label[0]))
    for x in range(len(label)-1):
       nr = NumRange(label[x], label[x+1]) 
       ranges.append(nr)
    ranges.append(NumRange(label[-1], inf))

    if not infinite:
        na_mask = (ind == 0) | (ind == len(bins))
        np.putmask(ind, na_mask, -1)

    #ind = ind.astype(int)
    ind[s.isnull().values] = -1
    return Categorical(ind, ranges)
Example #37
def count_estims(dist, gamma = 0.95):
    '''
    Computes all estimates
    :param dist: distribution
    :param gamma: confidence level for the interval estimates
    :return point: point estimates
    :return interval: confidence intervals for point estimates
    '''
    import numpy as np
    x = Series(dist)
    # Point estimates
    point = {}
    N = x.count()

    med_ = med_u(x)#
    med = np.median(dist)
    mad = x.mad()#
    mean_c = mean(dist)#
    var = np.var(dist)
    std = np.std(dist)
    mod = stats.mode(dist).mode#
    kurt = stats.kurtosis(dist)
    skew_my = stats.skew(dist)#
    Chi = 1/np.sqrt(np.abs(kurt))
    quantiles = np.round(x.quantile([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]), 5)
    W = std/mean_c;#

    quantiles_str = ""
    for index in quantiles.index:
        quantiles_str+='<p><pre>{0}\t{1}</pre></p>'.format(index, quantiles[index])

    point['MED'] = np.round(med, 5)
    point['MED*'] = np.round(med_, 5)
    point['MAD'] = np.round(mad, 5)
    point['Min'] = np.round(x.min(), 5)
    point['Max'] = np.round(x.max(), 5)
    point['Mean'] = np.round(mean_c, 5)
    point['S^2'] = np.round(var, 5)
    point['S'] = np.round(std, 5)
    point['MOD'] = np.round(mod, 5)
    point['E'] = np.round(kurt, 5)
    point['A'] = np.round(skew_my, 5)
    point['Chi'] = np.round(Chi, 5)
    point['X(alpha)'] = quantiles_str
    point['W'] = np.round(W, 5)



    # Interval estimates
    from scipy.stats import t, norm
    import numpy as np
    interval = {}
    if N < 61:
        l = t.ppf((1-gamma)/2, N-1)
        u = t.ppf(1-(1-gamma)/2, N-1)
    else:
        l = norm.ppf((1-gamma)/2)
        u = norm.ppf(1-(1-gamma)/2)
    X_cf = (mean_c+l*sigma_X(x), mean_c+u*sigma_X(x))
    A_cf = (skew_my + l * sigma_A(x), skew_my + u * sigma_A(x))
    S_cf = (std + l*sigma_S(x), std+u*sigma_S(x))
    E_cf = (kurt + l*sigma_E(x), kurt+u*sigma_E(x))
    if W < 1:
        v = l/np.sqrt(2*(N-1))
        W_cf = np.round((W/(1+v*np.sqrt(1+2*W**2)), W/(1-v*np.sqrt(1+2*W**2))), 5)
    else: W_cf = (None, None)

    interval['Mean'] = np.round(X_cf, 5)
    interval['S'] = np.round(S_cf, 5)
    interval['E'] = np.round(E_cf, 5)
    interval['A'] = np.round(A_cf, 5)
    interval['W'] = W_cf

    return point, interval
Example #38
pieces = [x.strip() for x in val.split(',')]

# split in the specified form

first, second, third = pieces

first + '::' + second + '::' + third

# a more idiomatic way

'::'.join(pieces)

# count occurrences of a substring

val.count(',')

# replace

val.replace(',', '::')


# ************************************************************
#  regular expressions (regex)

# the re module covers three tasks: pattern matching, substitution, splitting

import re

text = "foo bar\t baz \tqux"
Example #39
class InstagramTimeSeries:
    def __init__(self, region, start_timestamp, end_timestamp, freq="1h"):
        # super(InstagramTimeSeries, self).__init__(region,start_timestamp, end_timestamp, freq)
        self.start_timestamp = start_timestamp
        self.end_timestamp = end_timestamp
        self.region = region
        self._db = MongoDBInterface()
        self._db.setDB(InstagramConfig.db)
        self._db.setCollection(InstagramConfig.posts_collection)
        self.days_to_predict = 1
        self.freq = freq

    def rangeQuery(self, region, startTimestamp, endTimestamp):
        region_conditions = {}
        period_conditions = {}
        if region:
            region_conditions = {"region.code": region}

        period_conditions = {"created_time": {"$gte": startTimestamp, "$lt": endTimestamp}}
        conditions = dict(region_conditions, **period_conditions)

        return self._db.getAllDocuments(conditions).sort([("created_time", -1)])

    def getRawSeries(self):
        return self.series

    def buildTimeSeries(self, count_people=True, avoid_flooding=True):
        """Return a pandas Series object
        
        count_people = True means we only count each user once
        instead of the number of photos for that region.

        avoid_flooding = True means we want to avoid a single user
        flooding many photos onto Instagram in a short time. Currently the
        window is 5 minutes: uploads from the same user within that window
        count only once.
        
        """
        window_avoid_flooding = 300
        data = []
        photo_cnt = 0
        for photo in self.rangeQuery(self.region, self.start_timestamp, self.end_timestamp):
            p = {"user": photo["user"], "created_time": photo["created_time"]}
            data.append(p)
            photo_cnt += 1
            if photo_cnt % 10000 == 0:
                print(photo_cnt)
        data = sorted(data, key=lambda x: x["created_time"])
        print (len(data))
        user_last_upload = {}  # for a single user, when is his last upload
        counts = []
        dates = []

        counts.append(1)  # VERY IMPORTANT. FIX THE SIZE OF TIMESERIES IN PANDAS
        dates.append(datetime.utcfromtimestamp(float(self.start_timestamp)))

        for photo_json in data:
            user = photo_json["user"]["username"]
            utc_date = datetime.utcfromtimestamp(float(photo_json["created_time"]))
            if count_people:
                if user not in user_last_upload:
                    user_last_upload[user] = int(photo_json["created_time"])
                    dates.append(utc_date)
                    counts.append(1)
                else:
                    if float(photo_json["created_time"]) - float(user_last_upload[user]) > window_avoid_flooding:
                        user_last_upload[user] = int(photo_json["created_time"])
                        dates.append(utc_date)
                        counts.append(1)
            else:
                dates.append(utc_date)
                counts.append(1)

        counts.append(1)  # VERY IMPORTANT, FIX THE SIZE OF TIMESERIES IN PANDAS
        dates.append(datetime.utcfromtimestamp(float(self.end_timestamp) - 1))
        self.series = Series(counts, index=dates)

        print (self.series.count())
        try:
            self.series2 = self.series.resample(self.freq, how="sum", label="right")
            # self.series2 = self.series2.fillna(0) #fill NaN values with zeros
        except Exception as e:  # not enough data
            print (e)
            pass
        print (self.series2.count())
        return self.series2

    def smoothSeriesEwma(self, series, span=5.0, adjust=True, halflife=None, min_periods=0, how="mean"):
        return pandas.ewma(
            series,
            com=None,
            span=span,
            halflife=halflife,
            min_periods=min_periods,
            freq="1h",
            adjust=adjust,
            how=how,
            ignore_na=True,
        )

    def smoothSeriesEwmstd(self, series, span=5.0, adjust=True, halflife=None, min_periods=0):
        return pandas.ewmstd(
            series, com=None, span=span, halflife=halflife, min_periods=min_periods, adjust=adjust, ignore_na=True
        )

    def smoothSeriesEwmvar(self, series, span=5.0, adjust=True, halflife=None, min_periods=0):
        return pandas.ewmvar(
            series, com=None, span=span, halflife=halflife, min_periods=min_periods, adjust=adjust, ignore_na=True
        )

    def dataPrepare(self, serie):
        """This is to return the 'future data points' that you want to
        predict. e.g. predict for each hour tomorrow how many people will
        show up at Times Square

        """
        ts = serie
        index = ts.index
        if len(index) < 3:
            raise Exception("Only %d data points" % (len(index)))
        start_date = ts.index[0]

        """Notice training here is in the format of
        (days from the beginning of the timeseries, number of data points at that time)
        
        """
        training = []
        for idx in index:
            days_diff = (idx - start_date).days + (idx - start_date).seconds / (24 * 3600.0)
            training.append((days_diff, ts[idx]))
        nearest_current_date = index[-1]

        testing = []
        align = []
        converted_align = []
        for hour in range(25 * self.days_to_predict):
            next_date = nearest_current_date + timedelta(seconds=3600 * (hour + 1))
            delta = next_date - start_date
            days_from_start = (delta.seconds + delta.days * 86400) / (3600 * 24.0)
            testing.append(days_from_start)
            align.append(next_date)
            converted_align.append(calendar.timegm(next_date.utctimetuple()))

        return training, testing, align, converted_align
Example #40
# introspection #############################################
# get 1-d array
a = s.values

# get index
i = s.index

# assign name
s.name = 'name'

# length
assert len(s) == s.size == s.shape[0]

# number of elements that are not NaN
s.count()

# get an array of unique values
s.unique()

# count(*) group by non-NaN value, get a Series
s.value_counts()

# aggregation and statistic
s.max()
s.mean()
s.var()

# location of the max element
s.idxmax()
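The introspection snippet above assumes an existing Series s; a minimal setup it could run against (hypothetical data):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, np.nan, 2.0])
assert s.count() == 3                  # NaN is excluded from count()
assert len(s) == s.size == s.shape[0]  # length includes NaN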