def _fit_core(self, s: pd.Series) -> None:
    if s.count() == 0:
        raise RuntimeError("Valid values are not enough for training.")
    # R holds the extreme studentized deviates, Lambda the corresponding
    # critical values of the generalized ESD test.
    R = pd.Series(np.zeros(len(s)), index=s.index)
    n = s.count()
    Lambda = pd.Series(np.zeros(len(s)), index=s.index)
    s_copy = s.copy()
    i = 0
    while s_copy.count() > 0:
        i += 1
        # most extreme remaining observation
        ind = (s_copy - s_copy.mean()).abs().idxmax()
        R[ind] = (abs(s_copy[ind] - s_copy.mean()) / s_copy.std()
                  if s_copy.std() > 0 else 0)
        s_copy[ind] = np.nan
        p = 1 - self.alpha / (2 * (n - i + 1))
        Lambda[ind] = ((n - i) * t.ppf(p, n - i - 1) / np.sqrt(
            (n - i - 1 + t.ppf(p, n - i - 1) ** 2) * (n - i + 1)))
        if R[ind] <= Lambda[ind]:
            break
    # summary statistics of the values deemed normal (Lambda >= R)
    self._normal_sum = s[Lambda >= R].sum()
    self._normal_squared_sum = (s[Lambda >= R] ** 2).sum()
    self._normal_count = s[Lambda >= R].count()
    # critical value used to score a single new observation
    i = 1
    n = self._normal_count + 1
    p = 1 - self.alpha / (2 * (n - i + 1))
    self._lambda = ((n - i) * t.ppf(p, n - i - 1) / np.sqrt(
        (n - i - 1 + t.ppf(p, n - i - 1) ** 2) * (n - i + 1)))
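# A minimal usage sketch of the generalized-ESD fit above; the class name
# ESDDemo and the alpha attribute are assumptions for illustration, not part
# of the original snippet.
import numpy as np
import pandas as pd
from scipy.stats import t

class ESDDemo:
    def __init__(self, alpha: float = 0.05) -> None:
        self.alpha = alpha  # significance level of the test

ESDDemo._fit_core = _fit_core  # reuse the function above as a method

detector = ESDDemo(alpha=0.05)
detector._fit_core(pd.Series([1.0, 1.1, 0.9, 1.05, 8.0]))  # one clear outlier
print(detector._normal_count)  # 4: the outlier is excluded from the "normal" set
print(detector._lambda)        # critical value for scoring a new observation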
def describe_unsupported(series: pd.Series, series_description: dict):
    """Describe an unsupported series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """
    # number of observations in the Series
    leng = len(series)

    # number of non-NaN observations in the Series
    count = series.count()

    # number of infinite observations in the Series
    # (the original `count - series.count()` was always zero; count the
    # +/-inf values explicitly instead)
    n_infinite = series.isin([float("inf"), float("-inf")]).sum()

    results_data = {
        "count": count,
        "p_missing": 1 - count * 1.0 / leng,
        "n_missing": leng - count,
        "p_infinite": n_infinite * 1.0 / leng,
        "n_infinite": n_infinite,
        "memorysize": series.memory_usage(),
    }

    return results_data
def test_count_level_without_multiindex(self):
    ser = Series(range(3))

    msg = "Series.count level is only valid with a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        with tm.assert_produces_warning(FutureWarning):
            ser.count(level=1)
def to_deviation(series: pandas.Series,
                 threshold_deviation_user_count: Optional[int] = None) -> pandas.Series:
    if series.count() == 0 or (
        threshold_deviation_user_count is not None
        and series.count() <= threshold_deviation_user_count
    ):
        return pandas.Series([numpy.nan] * len(series))
    else:
        std = series.std(ddof=0)
        mean = series.mean()
        if std != 0.0:
            # standard score rescaled to mean 50 and standard deviation 10
            return series.map(lambda x: (x - mean) / std * 10 + 50)
        else:
            return series.map(lambda x: 50 if not numpy.isnan(x) else numpy.nan)
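# Hedged usage sketch of to_deviation: the input below is made up.
import numpy
import pandas

scores = pandas.Series([40.0, 50.0, 60.0])
print(to_deviation(scores).tolist())
# mean=50, std(ddof=0)~8.165 -> roughly [37.75, 50.0, 62.25]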
def filter_edges(self, condition: pd.Series):
    if self._edges.shape[0] != condition.count():
        msg = ('Number of edges {edges} is different from the length of the '
               'condition array {condition}').format(
            edges=self._edges.shape[0], condition=condition.count())
        raise ValueError(msg)
    filtered_edges = self._edges[condition]
    return SpatioTemporalNetwork(nodes=self._nodes, edges=filtered_edges,
                                 origin=self._origin,
                                 destination=self._destination,
                                 node_id=self._node_id)
def test_count_level_series(self):
    index = MultiIndex(
        levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]],
        labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]],
    )

    s = Series(np.random.randn(len(index)), index=index)

    result = s.count(level=0)
    expected = s.groupby(level=0).count()
    assert_series_equal(
        result.astype("f8"), expected.reindex(result.index).fillna(0)
    )

    result = s.count(level=1)
    expected = s.groupby(level=1).count()
    assert_series_equal(
        result.astype("f8"), expected.reindex(result.index).fillna(0)
    )
def energy_mean_nan(nb, harmonics=2, method='ffill'):
    """Compute the energy of this NetBlock.

    Returns a dictionary:
        e24   - energy at 1/(24hr) and selected harmonics
        te    - total energy
        ra    - e24/te
        nrows - number of datapoints in the sample
        nan   - number of time bins for which values had to be interpolated
    """
    # Note that "mean" is not automatically correct.  Consider median, others.
    timeseries = Series([nb.data[tt].mean() for tt in nb.TBall])
    nan = nb.TB.bucketcount - timeseries.count()
    if method:
        timeseries = timeseries.fillna(method=method)
        # TODO (mattmathis) explore limit= options
    else:
        # zero fill is only correct for energy algebra
        timeseries = [0.0 if np.isnan(tv) else tv for tv in timeseries]
    e24, te = energy.power_ratio(timeseries, len(timeseries), harmonics)
    try:
        ra = e24 / te
    except ZeroDivisionError:
        # failed to fill all NaNs or other failure
        ra = float('nan')
    return {'e24': e24, 'te': te, 'ra': ra, 'nrows': len(nb.data), 'nan': nan}
def cluster(id, data, dataset):
    from pandas import Series, DataFrame
    id_index = Series(id.tolist())
    from cluster import density_cluster
    N = id_index.count()
    distance = compute_distance(data)
    distance_c = min_distance(distance)
    # id.values -> the corresponding keys
    index_id = Series(id_index.index, index=id_index.values)
    log.warn("the init distance_c is: " + str(distance_c))
    # Build the base index table: indexes used to control the contents of
    # rho, delta and index.
    rho_id = rho_function(index_id, distance, distance_c=distance_c)
    # delta_id, data_id = delta_function(id_index, index_id, rho_id, distance)
    # gamma = rho * delta
    threshold = DataFrame([], columns=['H', 'd_c', 'cluster'])
    threshold = ent_dc_step_by_step(id_index, index_id, data,
                                    threshold=threshold, distance=distance,
                                    distance_c=distance_c, dataset=dataset)
    r = threshold
    # log.debug("rho:\n" + str(rho))
    log.debug("threshold\n" + str(DataFrame(threshold)))
    return r
def basic_stat_map(s: pd.Series) -> dict:
    return {
        "mean": s.mean(),
        "median": s.median(),
        "std": s.std(),
        "count": s.count(),
    }
def _entropy(self, column: pd.Series):
    n = column.count()
    valueCounts = column.value_counts().tolist()
    result = 0
    for x in valueCounts:
        result += (-x / n) * log2(x / n)
    return result
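# Illustrative check of _entropy (Shannon entropy in bits); the _Demo wrapper
# class is an assumption made only so the method can be called.
import pandas as pd
from math import log2

class _Demo:
    pass

_Demo._entropy = _entropy  # attach the function above as a method
print(_Demo()._entropy(pd.Series(["a", "a", "b", "b"])))  # two equally likely values -> 1.0 bit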
def get_cumsum(data: pd.Series):
    cumsum = 0
    i = 0
    while True:
        yield cumsum
        cumsum += data.iloc[i]
        # note: wraps on the non-NaN count, so the positional index can skip
        # trailing values if the series contains NaNs
        i = (i + 1) % data.count()
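# Quick demonstration of the generator above: it yields running sums forever,
# cycling through the series once the positional index wraps around.
import itertools
import pandas as pd

gen = get_cumsum(pd.Series([1, 2, 3]))
for value in itertools.islice(gen, 5):
    print(value)  # 0, 1, 3, 6, 7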
def _box_stats(ds: pd.Series, med: bool = True, iqr: bool = True,
               count: bool = True) -> str:
    """
    Create the metric part with stats of the box (axis) caption

    Parameters
    ----------
    ds : pd.Series
        Data on which stats are computed.
    med : bool
    iqr : bool
    count : bool
        Which statistics to include.

    Returns
    -------
    stats : str
        Caption with summary stats.
    """
    # interquartile range; kept in iqr_val so the iqr flag argument is not
    # shadowed (the original reused the name, so a zero IQR dropped the line)
    iqr_val = ds.quantile(q=[0.75, 0.25]).diff()
    iqr_val = abs(float(iqr_val.loc[0.25]))

    met_str = []
    if med:
        met_str.append('Median: {:.3g}'.format(ds.median()))
    if iqr:
        met_str.append('IQR: {:.3g}'.format(iqr_val))
    if count:
        met_str.append('N: {:d}'.format(ds.count()))
    stats = '\n'.join(met_str)

    return stats
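# Example caption produced by _box_stats on a toy series (values made up).
import pandas as pd

print(_box_stats(pd.Series([1.0, 2.0, 3.0, 4.0])))
# Median: 2.5
# IQR: 1.5
# N: 4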
def count_fraction_of_true(series: pd.Series):
    # We assume this is called with a Boolean series
    # (np.bool was removed from numpy; compare against the builtin bool)
    if series.dtype != bool:
        raise ValueError
    num_true = series.sum()
    total = float(series.count())
    return num_true / total if total > 0.0 else 0.0, num_true
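# Usage sketch of count_fraction_of_true; the sample flags are illustrative.
import pandas as pd

flags = pd.Series([True, False, True, True])
frac, num_true = count_fraction_of_true(flags)
print(frac, num_true)  # 0.75 3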
def get_counts(series: pd.Series) -> dict:
    # value_counts() returns a Series containing counts of unique values.
    value_counts_with_nan = series.value_counts(dropna=False)

    # Fix for data with only a single value; reset_index was flipping the
    # data returned
    if len(value_counts_with_nan) == 1:
        if pd.isna(value_counts_with_nan.index[0]):
            value_counts_without_nan = pd.Series()
        else:
            value_counts_without_nan = value_counts_with_nan
    else:
        value_counts_without_nan = (
            value_counts_with_nan.reset_index().dropna()
            .set_index("index").iloc[:, 0]
        )

    # print(value_counts_without_nan.index.dtype.name)

    # IGNORING NAN FOR NOW AS IT CAUSES ISSUES [FIX]
    # distinct_count_with_nan = value_counts_with_nan.count()
    distinct_count_without_nan = value_counts_without_nan.count()

    return {
        "value_counts_without_nan": value_counts_without_nan,
        "distinct_count_without_nan": distinct_count_without_nan,
        "num_rows_with_data": series.count(),
        "num_rows_total": len(series),
        # IGNORING NAN FOR NOW AS IT CAUSES ISSUES [FIX]:
        # "value_counts_with_nan": value_counts_with_nan,
        # "distinct_count_with_nan": distinct_count_with_nan,
    }
def describe_unsupported(series: pd.Series, series_description: dict):
    """Describe an unsupported series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """
    # number of observations in the Series
    length = len(series)

    # number of non-NaN observations in the Series
    count = series.count()

    results_data = {
        "n": length,
        "count": count,
        "p_missing": 1 - count / length,
        "n_missing": length - count,
        "memory_size": series.memory_usage(deep=config["memory_deep"].get(bool)),
    }

    return results_data
def _getSeriesScoreMultipliedByCount(self, targetSeries: pd.Series) -> float:
    totalCount = targetSeries.count()
    trueCount = targetSeries.sum()
    falseCount = totalCount - trueCount
    return totalCount - (trueCount * trueCount + falseCount * falseCount) / totalCount
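# Sanity check: the score equals count * Gini impurity, n - (t^2 + f^2) / n;
# the _ScoreDemo wrapper is an assumption made only so the method can be called.
import pandas as pd

class _ScoreDemo:
    pass

_ScoreDemo._getSeriesScoreMultipliedByCount = _getSeriesScoreMultipliedByCount
s = pd.Series([True, True, False, False])
print(_ScoreDemo()._getSeriesScoreMultipliedByCount(s))  # 4 - (4 + 4) / 4 = 2.0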
def series_datatype(data: pd.Series, values: Optional[List[str]] = None) -> DataType:
    """
    Determine whether the given data series is categorical or continuous
    using a set of rules.

    :param data: data for facet/label/predicted_label columns
    :param values: list of facet or label values provided by user
    :return: Enum {CATEGORICAL|CONTINUOUS}
    """
    # if datatype is boolean or categorical we return data as categorical
    data_type = DataType.CATEGORICAL
    data_uniqueness_fraction = divide(data.nunique(), data.count())
    logger.info(f"data uniqueness fraction: {data_uniqueness_fraction}")
    # Assumption: user will give a single value for the threshold currently
    # Todo: fix me if multiple thresholds for facet or label are supported
    if data.dtype.name == "category" or (isinstance(values, list) and len(values) > 1):
        return data_type
    if data.dtype.name in ["str", "string", "object"]:
        # cast the dtype to int; if an exception is raised, data is categorical
        casted_data = data.astype("int64", copy=True, errors="ignore")
        if np.issubdtype(
            casted_data.dtype, np.integer
        ) and data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
            data_type = DataType.CONTINUOUS  # type: ignore
    elif np.issubdtype(data.dtype, np.floating):
        data_type = DataType.CONTINUOUS
    elif np.issubdtype(data.dtype, np.integer):
        # Current rule: if data has more than 5% of unique values then it is
        # continuous.
        # Todo: needs to be enhanced; this rule doesn't always determine the
        # datatype correctly
        if data_uniqueness_fraction >= UNIQUENESS_THRESHOLD:
            data_type = DataType.CONTINUOUS
    return data_type
def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.
    """
    names = ["count", "unique", "top", "freq"]
    objcounts = data.value_counts()
    count_unique = len(objcounts[objcounts != 0])
    if count_unique > 0:
        top, freq = objcounts.index[0], objcounts.iloc[0]
        dtype = None
    else:
        # If the DataFrame is empty, set 'top' and 'freq' to None
        # to maintain output shape consistency
        top, freq = np.nan, np.nan
        dtype = "object"

    result = [data.count(), count_unique, top, freq]

    from pandas import Series

    return Series(result, index=names, name=data.name, dtype=dtype)
def multi_processing_cluster(job, work, df, id, data):
    # threshold = DataFrame([], columns=['H', 'd_c', 'cluster'])
    from pandas import Series, DataFrame
    id_index = Series(id.tolist())
    from cluster import density_cluster
    N = id_index.count()
    distance = compute_distance(data)
    distance_c = min_distance(distance)
    # renamed from `max` to avoid shadowing the builtin
    max_dist = max_distance(distance, distance_c)
    max_dist = average_task(max_dist, job)
    log.debug("max: " + str(max_dist))
    distance_c = distance_c + work * max_dist
    max_distance_c = distance_c + max_dist
    # id.values -> the corresponding keys
    index_id = Series(id_index.index, index=id_index.values)
    log.warn("work id " + str(work) + " the starting distance_c is: " +
             str(distance_c) + ". working under " + str(max_distance_c))
    # Build the base index table: indexes used to control the contents of
    # rho, delta and index.
    rho_id = rho_function(index_id, distance, distance_c=distance_c)
    delta_id, data_id = delta_function(id_index, index_id, rho_id, distance)
    # gamma = rho * delta
    threshold = df
    threshold = multi_ent_dc_step_by_step(id_index, index_id,
                                          threshold=threshold,
                                          distance=distance,
                                          distance_c=distance_c,
                                          max_distance_c=max_distance_c)
    r = threshold
    # log.debug("rho:\n" + str(rho))
    log.debug("worker " + str(work) + " has finished. threshold\n" +
              str(DataFrame(threshold)))
    return r
def describe_supported(series: pd.Series, series_description: dict) -> dict:
    """Describe a supported series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """
    # number of observations in the Series
    length = len(series)

    # number of non-NaN observations in the Series
    count = series.count()

    distinct_count = series_description["distinct_count_without_nan"]
    value_counts = series_description["value_counts_without_nan"]

    unique_count = value_counts.where(value_counts == 1).count()

    stats = {
        "n": length,
        "count": count,
        "n_distinct": distinct_count,
        "p_distinct": distinct_count / count,
        "p_missing": 1 - (count / length),
        "n_missing": length - count,
        "is_unique": unique_count == count,
        "n_unique": unique_count,
        "p_unique": unique_count / count,
        "memory_size": series.memory_usage(config["memory_deep"].get(bool)),
    }

    return stats
def modified_qcut(series: Series, q: int, labels: List[Any]):
    """
    Modified qcut.

    Parameters
    ----------
    series: Series
        Factor rank values.
    q: int
        Number of groups.
    labels: List[Any]
        Group labels, e.g. [1, 2, 3, 4, 5] for five groups.

    Returns
    -------
    Group assignments: Series
    """
    # If there are fewer valid observations than groups, return all zeros,
    # i.e. hold no position.
    if series.count() < q:
        new_series = pd.Series([0.0] * len(series))
        new_series.index = series.index
        return new_series
    else:
        return pd.qcut(x=series, q=q, labels=labels)
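# Illustration of modified_qcut with made-up rank data: six valid values are
# split into three labelled groups; with fewer valid values than q it would
# return all zeros instead.
import pandas as pd
from pandas import Series

ranks = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
print(modified_qcut(ranks, q=3, labels=[1, 2, 3]).tolist())  # [1, 1, 2, 2, 3, 3]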
def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    # error: Argument 1 to "format_percentiles" has incompatible type
    # "Sequence[float]"; expected "Union[ndarray, List[Union[int, float]],
    # List[float], List[Union[str, float]]]"
    formatted_percentiles = format_percentiles(percentiles)  # type: ignore[arg-type]

    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
    d = (
        [series.count(), series.mean(), series.std(), series.min()]
        + series.quantile(percentiles).tolist()
        + [series.max()]
    )
    return Series(d, index=stat_index, name=series.name)
def true_sequence_count(series: Series):
    # index of the last False value, if any
    index = series.where(series == False).last_valid_index()

    if index is None:
        # no False in the series: every value counts
        return series.count()
    else:
        # count the non-NaN values after the last False,
        # i.e. the length of the trailing True run
        s = series[series.index > index]
        return s.count()
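# Quick check of true_sequence_count: it returns the length of the trailing
# run of True values; the sample series are made up.
from pandas import Series

print(true_sequence_count(Series([True, False, True, True])))  # 2
print(true_sequence_count(Series([True, True])))               # 2 (no False at all)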
def test_count_categorical(self):
    ser = Series(
        Categorical([np.nan, 1, 2, np.nan],
                    categories=[5, 4, 3, 2, 1], ordered=True)
    )
    result = ser.count()
    assert result == 2
def test_count_level_series(self):
    index = MultiIndex(
        levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]],
        codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]],
    )
    ser = Series(np.random.randn(len(index)), index=index)

    result = ser.count(level=0)
    expected = ser.groupby(level=0).count()
    tm.assert_series_equal(
        result.astype("f8"), expected.reindex(result.index).fillna(0)
    )

    result = ser.count(level=1)
    expected = ser.groupby(level=1).count()
    tm.assert_series_equal(
        result.astype("f8"), expected.reindex(result.index).fillna(0)
    )
def _trapezium_integration_variable(d_ti: pd.Series) -> Optional[float]:
    """Gapfill version of trapezoidal integration - will fill out."""
    # Clear out non-numbers
    d_ti = d_ti.dropna()

    if d_ti.count() == 0:
        return None

    # One entry (use positional access; a label-based d_ti[0] breaks for
    # non-integer indexes)
    if d_ti.count() == 1:
        return d_ti.iloc[0] * 0.5

    # Fall back on the average, but warn to check the data
    if d_ti.count() <= 3:
        d_sum = d_ti.sum()
        if d_sum == 0:
            return 0
        return 0.5 * d_sum / d_ti.count()

    bucket_middle = d_ti.count() - 2
    bucket_middle_weights = [1] + [2] * bucket_middle + [1]

    weights = d_ti.values * bucket_middle_weights
    weights_sum = weights.sum()

    bucket_energy = 0.5 * weights_sum / ((d_ti.count() - 1) * 2)

    return bucket_energy
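# Spot check of the trapezoidal weighting above: for five equal values the
# weights are [1, 2, 2, 2, 1], so the result is 0.5 * 8 / 8 = 0.5. Sample
# data is illustrative.
import pandas as pd

print(_trapezium_integration_variable(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0])))  # 0.5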
def _getTrueAndFalseRatios(self, series: pd.Series) -> Tuple[float, float]:
    # Tuple comes from typing; the original `(float, float)` annotation is
    # not a valid type hint.
    totalCount = series.count()
    trueCount = series.sum()
    falseCount = totalCount - trueCount
    trueRatio = trueCount / totalCount
    falseRatio = falseCount / totalCount
    return trueRatio, falseRatio
def __init__(self, level: int, targets: pd.Series, cut=None):
    self.level = level
    self.cut = cut
    self.sampleCount = targets.count()
    self.trueVals = targets.sum()
    self.falseVals = self.sampleCount - self.trueVals
    self.lessThanNode = None
    self.greaterThanOrEqualNode = None
def test_count(self, datetime_series):
    assert datetime_series.count() == len(datetime_series)

    datetime_series[::2] = np.nan

    assert datetime_series.count() == np.isfinite(datetime_series).sum()

    mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, np.nan, 1, 2]])
    ts = Series(np.arange(len(mi)), index=mi)

    left = ts.count(level=1)
    right = Series([2, 3, 1], index=[1, 2, np.nan])
    tm.assert_series_equal(left, right)

    ts.iloc[[0, 3, 5]] = np.nan
    tm.assert_series_equal(ts.count(level=1), right - 1)

    # GH#29478
    with pd.option_context("use_inf_as_na", True):
        assert pd.Series([pd.Timestamp("1990/1/1")]).count() == 1
def _fit_core(self, s: pd.Series) -> None:
    if s.count() == 0:
        raise RuntimeError("Valid values are not enough for training.")
    if self.high is None:
        self.abs_high_ = float("inf")
    else:
        self.abs_high_ = s.quantile(self.high)
    if self.low is None:
        self.abs_low_ = -float("inf")
    else:
        self.abs_low_ = s.quantile(self.low)
def Phases(self):
    rows = []
    for prefix in ('parse', 'compile', 'run'):
        # .items() replaces the Python 2 .iteritems()
        for name, callTimes in self.times[prefix].items():
            s = Series(callTimes)
            callCount = s.count()
            meanTime = s.mean()
            totalTime = s.sum()
            rows.append(("%s:%s" % (prefix, name), callCount, meanTime, totalTime))
    columns = ('PHASE', 'COUNT', 'MEAN', 'TOTAL')
    return DataFrame.from_records(rows, columns=columns, index='PHASE')
def Calls(self):
    rows = []
    # .items() replaces the Python 2 .iteritems()
    for name, callTimes in self.times['call'].items():
        s = Series(callTimes)
        func, loc = formatName(name)
        callCount = s.count()
        meanTime = s.mean()
        totalTime = s.sum()
        rows.append((func, loc, callCount, meanTime, totalTime))
    columns = ('FUNCTION', 'SOURCE', 'COUNT', 'MEAN', 'TOTAL')
    return DataFrame.from_records(rows, columns=columns, index=('FUNCTION', 'SOURCE'))
def _tile_inds(s, bins, labels=False, retbins=True, infinite=True):
    # NOTE: inf only happens when explicitly setting bins
    # short circuit empty series
    s = Series(s)
    if s.count() == 0:
        return np.repeat(None, len(s))

    if not np.iterable(bins):
        ind, label = cut(s, bins, retbins=retbins, labels=labels)
        # for now, pandas base cut doesn't support infinite ranges
        # so it bases first bin at 0 where we base on 1, and 0 is
        # [-inf, first] for us
        ind = ind + 1
    else:
        bins = np.asarray(bins)
        # if (np.diff(bins) < 0).any():
        #     raise ValueError('bins must increase monotonically.')
        ind, label = inf_bins_to_cuts(s, bins)

    # build out ranges
    ranges = []
    ranges.append(NumRange(-inf, label[0]))
    for x in range(len(label) - 1):
        nr = NumRange(label[x], label[x + 1])
        ranges.append(nr)
    ranges.append(NumRange(label[-1], inf))

    if not infinite:
        na_mask = (ind == 0) | (ind == len(bins))
        np.putmask(ind, na_mask, -1)

    # ind = ind.astype(int)
    ind[s.isnull().values] = -1

    # fastpath=True to skip the hashmap indexing.
    # The code generator will check identity, which won't match because
    # ind is an int position vector and ranges is a list of objects.
    # If fastpath is off, then it'll look like none of the values match.
    return Categorical(ind, ranges, fastpath=True)
def _tile_inds(s, bins, labels=False, retbins=True, infinite=True):
    # NOTE: inf only happens when explicitly setting bins
    # short circuit empty series
    s = Series(s)
    if s.count() == 0:
        return np.repeat(None, len(s))

    if not np.iterable(bins):
        ind, label = cut(s, bins, retbins=retbins, labels=labels)
        # for now, pandas base cut doesn't support infinite ranges
        # so it bases first bin at 0 where we base on 1, and 0 is
        # [-inf, first] for us
        ind = ind + 1
    else:
        bins = np.asarray(bins)
        # if (np.diff(bins) < 0).any():
        #     raise ValueError('bins must increase monotonically.')
        ind, label = inf_bins_to_cuts(s, bins)

    # build out ranges
    ranges = []
    ranges.append(NumRange(-inf, label[0]))
    for x in range(len(label) - 1):
        nr = NumRange(label[x], label[x + 1])
        ranges.append(nr)
    ranges.append(NumRange(label[-1], inf))

    if not infinite:
        na_mask = (ind == 0) | (ind == len(bins))
        np.putmask(ind, na_mask, -1)

    # ind = ind.astype(int)
    ind[s.isnull().values] = -1

    return Categorical(ind, ranges)
def count_estims(dist, gamma=0.95):
    '''
    Compute all estimates.

    :param dist: distribution
    :param gamma: probability of realisation of value
    :return point: point estimates
    :return interval: confidence intervals for point estimates
    '''
    import numpy as np

    x = Series(dist)

    # Point estimates
    point = {}
    N = x.count()
    med_ = med_u(x)
    # the original left this line commented out, but `med` is used below
    med = np.median(dist)
    mad = x.mad()
    mean_c = mean(dist)
    var = np.var(dist)
    std = np.std(dist)
    mod = stats.mode(dist).mode
    kurt = stats.kurtosis(dist)
    skew_my = stats.skew(dist)
    Chi = 1 / np.sqrt(np.abs(kurt))
    quantiles = np.round(x.quantile([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]), 5)
    W = std / mean_c

    quantiles_str = ""
    for index in quantiles.index:
        quantiles_str += '<p><pre>{0}\t{1}</pre></p>'.format(index, quantiles[index])

    point['MED'] = np.round(med, 5)
    point['MED*'] = np.round(med_, 5)
    point['MAD'] = np.round(mad, 5)
    point['Min'] = np.round(x.min(), 5)
    point['Max'] = np.round(x.max(), 5)
    point['Mean'] = np.round(mean_c, 5)
    point['S^2'] = np.round(var, 5)
    point['S'] = np.round(std, 5)
    point['MOD'] = np.round(mod, 5)
    point['E'] = np.round(kurt, 5)
    point['A'] = np.round(skew_my, 5)
    point['Chi'] = np.round(Chi, 5)
    point['X(alpha)'] = quantiles_str
    point['W'] = np.round(W, 5)

    # Interval estimates
    from scipy.stats import t, norm

    interval = {}
    if N < 61:
        l = t.ppf((1 - gamma) / 2, N - 1)
        u = t.ppf(1 - (1 - gamma) / 2, N - 1)
    else:
        l = norm.ppf((1 - gamma) / 2)
        u = norm.ppf(1 - (1 - gamma) / 2)

    X_cf = (mean_c + l * sigma_X(x), mean_c + u * sigma_X(x))
    A_cf = (skew_my + l * sigma_A(x), skew_my + u * sigma_A(x))
    S_cf = (std + l * sigma_S(x), std + u * sigma_S(x))
    E_cf = (kurt + l * sigma_E(x), kurt + u * sigma_E(x))

    if W < 1:
        v = l / np.sqrt(2 * (N - 1))
        W_cf = np.round((W / (1 + v * np.sqrt(1 + 2 * W ** 2)),
                         W / (1 - v * np.sqrt(1 + 2 * W ** 2))), 5)
    else:
        W_cf = (None, None)

    interval['Mean'] = np.round(X_cf, 5)
    interval['S'] = np.round(S_cf, 5)
    interval['E'] = np.round(E_cf, 5)
    interval['A'] = np.round(A_cf, 5)
    interval['W'] = W_cf

    return point, interval
# split on a delimiter, stripping whitespace
pieces = [x.strip() for x in val.split(',')]
first, second, third = pieces
first + '::' + second + '::' + third
# more idiomatic approach
'::'.join(pieces)
# count occurrences of a substring
val.count(',')
# replace
val.replace(',', '::')
# ************************************************************
# regular expressions: regex
# the re module covers three categories of tasks:
# pattern matching, substitution, and splitting
import re
text = "foo bar\t baz \tqux"
class InstagramTimeSeries:
    def __init__(self, region, start_timestamp, end_timestamp, freq="1h"):
        # super(InstagramTimeSeries, self).__init__(region, start_timestamp, end_timestamp, freq)
        self.start_timestamp = start_timestamp
        self.end_timestamp = end_timestamp
        self.region = region
        self._db = MongoDBInterface()
        self._db.setDB(InstagramConfig.db)
        self._db.setCollection(InstagramConfig.posts_collection)
        self.days_to_predict = 1
        self.freq = freq

    def rangeQuery(self, region, startTimestamp, endTimestamp):
        region_conditions = {}
        period_conditions = {}
        if region:
            region_conditions = {"region.code": region}
            period_conditions = {"created_time": {"$gte": startTimestamp, "$lt": endTimestamp}}
        conditions = dict(region_conditions, **period_conditions)
        return self._db.getAllDocuments(conditions).sort([("created_time", -1)])

    def getRawSeries(self):
        return self.series

    def buildTimeSeries(self, count_people=True, avoid_flooding=True):
        """Return a pandas Series object.

        count_people = True means we only want to count a single user instead
        of the number of photos for that region.
        avoid_flooding = True means we want to avoid a single user flooding
        many photos into Instagram in a short time. For now, uploads within a
        5-minute window only count as a single user.
        """
        window_avoid_flooding = 300
        data = []
        photo_cnt = 0
        for photo in self.rangeQuery(self.region, self.start_timestamp, self.end_timestamp):
            p = {"user": photo["user"], "created_time": photo["created_time"]}
            data.append(p)
            photo_cnt += 1
            if photo_cnt % 10000 == 0:
                print(photo_cnt)
        data = sorted(data, key=lambda x: x["created_time"])
        print(len(data))

        user_last_upload = {}  # for a single user, when was his last upload
        counts = []
        dates = []

        # VERY IMPORTANT: fix the size of the time series in pandas
        counts.append(1)
        dates.append(datetime.utcfromtimestamp(float(self.start_timestamp)))

        for photo_json in data:
            user = photo_json["user"]["username"]
            utc_date = datetime.utcfromtimestamp(float(photo_json["created_time"]))
            if count_people:
                if user not in user_last_upload:
                    user_last_upload[user] = int(photo_json["created_time"])
                    dates.append(utc_date)
                    counts.append(1)
                else:
                    if float(photo_json["created_time"]) - float(user_last_upload[user]) > window_avoid_flooding:
                        user_last_upload[user] = int(photo_json["created_time"])
                        dates.append(utc_date)
                        counts.append(1)
            else:
                dates.append(utc_date)
                counts.append(1)

        # VERY IMPORTANT: fix the size of the time series in pandas
        counts.append(1)
        dates.append(datetime.utcfromtimestamp(float(self.end_timestamp) - 1))

        self.series = Series(counts, index=dates)
        print(self.series.count())
        try:
            self.series2 = self.series.resample(self.freq, how="sum", label="right")
            # self.series2 = self.series2.fillna(0)  # fill NaN values with zeros
        except Exception as e:
            # not enough data
            print(e)
        print(self.series2.count())
        return self.series2

    def smoothSeriesEwma(self, series, span=5.0, adjust=True, halflife=None, min_periods=0, how="mean"):
        return pandas.ewma(
            series, com=None, span=span, halflife=halflife,
            min_periods=min_periods, freq="1h", adjust=adjust, how=how,
            ignore_na=True,
        )

    def smoothSeriesEwmstd(self, series, span=5.0, adjust=True, halflife=None, min_periods=0):
        return pandas.ewmstd(
            series, com=None, span=span, halflife=halflife,
            min_periods=min_periods, adjust=adjust, ignore_na=True,
        )

    def smoothSeriesEwmvar(self, series, span=5.0, adjust=True, halflife=None, min_periods=0):
        return pandas.ewmstd(
            series, com=None, span=span, halflife=halflife,
            min_periods=min_periods, adjust=adjust, ignore_na=True,
        )

    def dataPrepare(self, serie):
        """Return the 'future data points' that you want to predict,
        e.g. predict for each hour tomorrow how many people will show up
        at Times Square.
        """
        ts = serie
        index = ts.index
        if len(index) < 3:
            raise Exception("Only %d data points" % (len(index)))
        start_date = ts.index[0]

        # Note that `training` here is in the format of
        # (days from the beginning of the time series, value at that time).
        training = []
        for idx in index:
            days_diff = (idx - start_date).days + (idx - start_date).seconds / (24 * 3600.0)
            training.append((days_diff, ts[idx]))

        nearest_current_date = index[-1]
        testing = []
        align = []
        converted_align = []
        for hour in range(25 * self.days_to_predict):
            next_date = nearest_current_date + timedelta(seconds=3600 * (hour + 1))
            delta = next_date - start_date
            days_from_start = (delta.seconds + delta.days * 86400) / (3600 * 24.0)
            testing.append(days_from_start)
            align.append(next_date)
            converted_align.append(calendar.timegm(next_date.utctimetuple()))
        return training, testing, align, converted_align
# introspection
#############################################

# get the underlying 1-d array
a = s.values

# get the index
i = s.index

# assign a name
s.name = 'name'

# length
assert len(s) == s.size == s.shape[0]

# number of elements that are not NaN
s.count()

# get an array of unique values
s.unique()

# count(*) group by non-NaN value, returns a Series
s.value_counts()

# aggregations and statistics
s.max()
s.mean()
s.var()

# index location of the max element
s.idxmax()