def test_series(self, orient, numpy):
    s = Series(
        [10, 20, 30, 40, 50, 60],
        name="series",
        index=[6, 7, 8, 9, 10, 15],
    ).sort_values()

    encode_kwargs = {} if orient is None else dict(orient=orient)
    decode_kwargs = {} if numpy is None else dict(numpy=numpy)

    output = ujson.decode(ujson.encode(s, **encode_kwargs), **decode_kwargs)

    if orient == "split":
        dec = _clean_dict(output)
        output = Series(**dec)
    else:
        output = Series(output)

    if orient in (None, "index"):
        s.name = None
        output = output.sort_values()
        s.index = ["6", "7", "8", "9", "10", "15"]
    elif orient in ("records", "values"):
        s.name = None
        s.index = [0, 1, 2, 3, 4, 5]

    tm.assert_series_equal(output, s, check_dtype=False)
def genOCRData(alpha, N, mu1, mu2, train):
    # dim, mu and sigma are module-level globals in the original source
    k = np.random.randint(2, 8)
    if train:
        mu1 = Series(np.zeros(dim - k)).append(Series(np.ones(k)))
        mu1.index = range(len(mu1))
        np.random.shuffle(mu1)
        mu1.index = range(len(mu1))
        mu2 = Series(np.zeros(dim - k)).append(Series(np.ones(k)))
        mu2.index = range(len(mu2))
        np.random.shuffle(mu2)
        mu2.index = range(len(mu2))
    data1 = DataFrame(np.random.normal(mu, alpha * sigma, (N, dim))) + mu1
    data2 = DataFrame(np.random.normal(mu, alpha * sigma, (N, dim))) + mu2
    return data1, data2, mu1, mu2
def compute_summary(self, combined_df):
    combined_mean = combined_df.mean()
    average_cons = combined_mean['consistency']
    average_ambi = combined_mean['ambiguity']
    # completeness = self.compute_completeness()
    noise = self.compute_noise()
    series = Series([average_cons, average_ambi])
    series.index = ['Average Consistency', 'Average Ambiguity']
    series_2 = Series(noise)
    series_2.index = [i.title() for i in series_2.index]
    series = series.append(series_2)
    return series
def test_combine_first(self):
    values = tm.makeIntIndex(20).values.astype(float)
    series = Series(values, index=tm.makeIntIndex(20))

    series_copy = series * 2
    series_copy[::2] = np.NaN

    # nothing used from the input
    combined = series.combine_first(series_copy)
    tm.assert_series_equal(combined, series)

    # Holes filled from input
    combined = series_copy.combine_first(series)
    assert np.isfinite(combined).all()
    tm.assert_series_equal(combined[::2], series[::2])
    tm.assert_series_equal(combined[1::2], series_copy[1::2])

    # mixed types
    index = tm.makeStringIndex(20)
    floats = Series(tm.randn(20), index=index)
    strings = Series(tm.makeStringIndex(10), index=index[::2])

    combined = strings.combine_first(floats)

    tm.assert_series_equal(strings, combined.loc[index[::2]])
    tm.assert_series_equal(floats[1::2].astype(object),
                           combined.loc[index[1::2]])

    # corner case
    s = Series([1., 2, 3], index=[0, 1, 2])
    result = s.combine_first(Series([], index=[]))
    s.index = s.index.astype('O')
    assert_series_equal(s, result)
def data_sum(self, grouped):
    '''
    Compute column totals.
    :param grouped:
    :return: sum_series
    '''
    format_ = lambda x: '%.2f' % x
    sum_clicks = grouped['Clicks'].sum()
    sum_impressions = grouped['Impressions'].sum()
    sum_orders = grouped['1-day Orders Placed (#)'].sum()
    sum_spend = grouped['Total Spend'].sum()
    sum_sales = grouped['1-day Ordered Product Sales'].sum()
    # Guard the derived ratios against division by zero
    if sum_clicks == 0:
        sum_conversion = 0
    else:
        sum_conversion = sum_orders / sum_clicks
    if sum_impressions == 0:
        sum_ctr = 0
    else:
        sum_ctr = sum_clicks / sum_impressions
    if sum_sales == 0:
        sum_acos = 0
    else:
        sum_acos = sum_spend / sum_sales
    sum_series = Series([sum_clicks, sum_orders, sum_spend, sum_sales,
                         sum_conversion, sum_acos, sum_ctr]).apply(format_)
    sum_series.index = ['Clicks', '1-day Orders Placed (#)', 'Total Spend',
                        '1-day Ordered Product Sales',
                        'Average conversion rate', 'Average ACOS',
                        'Average CTR']
    return sum_series
def slide10():
    print('2012Q4, Q-JAN')
    p = pd.Period('2012Q4', freq='Q-JAN')
    print(p)
    print('2012Q4 start')
    print(p.asfreq('D', 'start'))
    print('2012Q4 end')
    print(p.asfreq('D', 'end'))
    print(p.asfreq('B', 'e'))
    print('4PM on the 2nd to last business day of the quarter')
    print((p.asfreq('B', 'e') - 1).asfreq('T', 's'))
    p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
    print(p4pm)
    print(p4pm.to_timestamp())

    print('timeseries')
    rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')
    ts = Series(np.arange(len(rng)), index=rng)
    print(ts)
    new_rng = (rng.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
    print('new range')
    print(new_rng)
    ts.index = new_rng.to_timestamp()
    print(ts)
def imageData(a, N, dim, mu1, mu2, train):
    dim = 35
    k = 30
    if train:
        mu1 = Series(np.zeros(dim - k)).append(Series(np.ones(k)))
        mu1.index = range(len(mu1))
        np.random.shuffle(mu1)
        mu1.index = range(len(mu1))
        mu2 = Series(np.zeros(dim - k)).append(Series(np.ones(k)))
        mu2.index = range(len(mu2))
        np.random.shuffle(mu2)
        mu2.index = range(len(mu2))
    # integer division so the shapes stay integral on Python 3
    data1 = DataFrame(np.random.normal(0, a * 1, (N // 2, dim))) + mu1
    data2 = DataFrame(np.random.normal(0, a * 1, (N // 2, dim))) + mu2
    y = np.append(np.ones(N // 2), np.zeros(N // 2))
    return np.append(data1, data2, 0), y, mu1, mu2
def simulation(data, df_info):
    # initialize result data structure
    result = {}
    site_count = 0
    # loop through sites
    for site in data.columns:
        s = df_info.ix[site]  # get info; better if we had a siteID
        df = DataFrame(data[site])
        # t is a series of times; check the open time and alert
        t = Series(df.index).apply(is_open_at, op=s.OpenWeekday,
                                   ed=s.CloseWeekday, op_S=s.OpenWeekend,
                                   ed_S=s.CloseWeekend, delay=1)
        t.index = df.index
        df['open'] = t
        # df['ecm'] = df.apply(is_lighting_ecm, name=site, thld=s.Threshold,
        #                      mult=1.3, add=3, axis=1)
        df['ecm'] = df.apply(is_lighting_ecm2, name=site, thld=s.Threshold,
                             ave=s.DaytimeAve, mult=0.7, axis=1)
        df.ix[-1, 'ecm'] = False  # make sure the alarm will stop
        if df['ecm'].any():
            item = event_to_ecm(df, site, s.Threshold, limit=6)
            if len(item['start']) > 0:
                result[site] = item
        site_count = site_count + 1
        print(str(site_count) + ' of ' + str(len(data.columns)) + ' : ' + site)
    return result
def setup(self):
    s = Series([np.nan] * 10000)
    s[0] = 3.0
    s[100] = -1.0
    s[999] = 12.1
    s.index = MultiIndex.from_product([range(10)] * 4)
    self.ss = s.to_sparse()
def aggByTimePeriodByTradingDay(dfIntra=None, hhmmLow=958,
                                hhmmHigh=1002, dateCol='date',
                                aggCol='volume'):
    '''
    Aggregate an intraday series with a date column of yyyyMmDdHhMmSs
    values, and a column of values that you will aggregate using group by,
    like volume.
    Example:
        aggByTimePeriodByTradingDay()
    '''
    # get day
    df = dfIntra
    if (df is None) or (df.empty):
        df = readData('cl201404.csv')
    # list comprehensions rather than map() so the results can be assigned
    # as DataFrame columns on Python 3
    yyyyMmDd = [int(str(x)[0:8]) for x in df[dateCol]]
    hhmm = [int(str(x)[8:12]) for x in df[dateCol]]
    # insert in dataframe
    df['yyyyMmDd'] = yyyyMmDd
    df['hhmm'] = hhmm
    # get bars between hhmm times
    validBars = [(x >= hhmmLow) and (x <= hhmmHigh) for x in hhmm]
    # select those bars
    dfvb = df.loc[validBars]
    # groupby day
    dfgb = dfvb.groupby('yyyyMmDd')
    # do agg sum
    dfgba = dfgb[aggCol].aggregate(np.sum)
    dates = Series(dfvb['yyyyMmDd'].unique())
    dates.index = range(len(dates))
    agg = dfgba
    agg.index = range(len(dfgba))
    newdf = DataFrame({'date': dates, 'agg': agg})
    plotDf(newdf, priceCol='agg', dateCol='date')
def test_rank_int(self):
    s = self.s.dropna().astype('i8')

    for method, res in self.results.items():
        result = s.rank(method=method)
        expected = Series(res).dropna()
        expected.index = result.index
        assert_series_equal(result, expected)
def test_series_getitem_multiindex(access_method, level1_value, expected):
    # GH 6018
    # series regression getitem with a multi-index
    s = Series([1, 2, 3])
    s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)])
    result = access_method(s, level1_value)
    tm.assert_series_equal(result, expected)
def test_set_index_makes_timeseries(self):
    idx = tm.makeDateIndex(10)

    s = Series(lrange(10))
    s.index = idx
    with tm.assert_produces_warning(FutureWarning):
        self.assertTrue(s.is_time_series)
    self.assertTrue(s.index.is_all_dates)
def test_resample_empty_dataframe(empty_frame, freq, resample_method):
    # GH13212
    df = empty_frame
    # count retains dimensions too
    result = getattr(df.resample(freq), resample_method)()
    if resample_method != 'size':
        expected = df.copy()
    else:
        # GH14962
        expected = Series([])

    if isinstance(df.index, PeriodIndex):
        expected.index = df.index.asfreq(freq=freq)
    else:
        expected.index = df.index._shallow_copy(freq=freq)
    assert_index_equal(result.index, expected.index)
    assert result.index.freq == expected.index.freq
    assert_almost_equal(result, expected, check_dtype=False)
def test_reindex_nan():
    ts = Series([2, 3, 5, 7], index=[1, 4, nan, 8])

    i, j = [nan, 1, nan, 8, 4, nan], [2, 0, 2, 3, 1, 2]
    assert_series_equal(ts.reindex(i), ts.iloc[j])

    ts.index = ts.index.astype('object')

    # reindex coerces index.dtype to float, loc/iloc doesn't
    assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False)
def asSeries(df, name='', limit=0):
    '''Get the time series indexed by day of release.'''
    if 'Gross' not in df or 'Day #' not in df:
        print('{} has an empty dataframe'.format(name))
        return Series()
    series = Series(df['Gross'])
    series.index = df['Day #']
    if limit > 0:
        series = series[:limit]
    series.name = name
    return series
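# A minimal usage sketch for asSeries, assuming pandas' DataFrame is in
# scope alongside Series; the box-office figures here are made up.
_df = DataFrame({'Day #': [1, 2, 3], 'Gross': [10.5, 8.0, 6.5]})
_s = asSeries(_df, name='ExampleFilm', limit=2)  # first two days only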
def test_droplevel(self):
    # GH20342
    ser = Series([1, 2, 3, 4])
    ser.index = MultiIndex.from_arrays([(1, 2, 3, 4), (5, 6, 7, 8)],
                                       names=['a', 'b'])
    expected = ser.reset_index('b', drop=True)
    result = ser.droplevel('b', axis='index')
    tm.assert_series_equal(result, expected)
    # test that droplevel raises ValueError on axis != 0
    with pytest.raises(ValueError):
        ser.droplevel(1, axis='columns')
def test_slice_floats2():
    s = Series(np.random.rand(10), index=np.arange(10, 20, dtype=float))

    assert len(s.loc[12.0:]) == 8
    assert len(s.loc[12.5:]) == 7

    i = np.arange(10, 20, dtype=float)
    i[2] = 12.2
    s.index = i

    assert len(s.loc[12.0:]) == 8
    assert len(s.loc[12.5:]) == 7
def test_constructor_generator(self):
    gen = (i for i in range(10))

    result = Series(gen)
    exp = Series(lrange(10))
    assert_series_equal(result, exp)

    gen = (i for i in range(10))
    result = Series(gen, index=lrange(10, 20))
    exp.index = lrange(10, 20)
    assert_series_equal(result, exp)
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of input files, return with mean and standard
    deviation calculated.
    """
    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(
                data=data[i].spread[val]['val'],
                index=data[i].times
            )
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():
        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If not a single file, keep only indices with at least two non-NaN
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard error as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_error
        spread.spread['times'] = times

    return spread, data
def test_constructor_map(self):
    # GH8909
    m = map(lambda x: x, range(10))

    result = Series(m)
    exp = Series(lrange(10))
    assert_series_equal(result, exp)

    m = map(lambda x: x, range(10))
    result = Series(m, index=lrange(10, 20))
    exp.index = lrange(10, 20)
    assert_series_equal(result, exp)
def test_to_xarray_multiindex(self):
    from xarray import DataArray

    s = Series(range(6))
    s.index.name = "foo"
    s.index = pd.MultiIndex.from_product([["a", "b"], range(3)],
                                         names=["one", "two"])
    result = s.to_xarray()
    assert len(result) == 2
    tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"])
    assert isinstance(result, DataArray)

    tm.assert_series_equal(result.to_series(), s)
def peakToTroughs(dailyret, dates):
    '''
    Example:
        sr = s['retdat']
        stkd = s['stockData']
        dt = stkd['Date']
        ptk = peakToTroughs(sr, dt)
    '''
    # get cumulative percent changes
    drs = Series(dailyret)
    soc1dr = drs + 1
    soc1cumdr = soc1dr.cumprod()
    localPeaksPairs = peakdetect(y_axis=soc1cumdr, lookahead=1)[0]
    indexOfLocalPeaks = np.empty(len(localPeaksPairs))
    for i in range(len(indexOfLocalPeaks)):
        indexOfLocalPeaks[i] = localPeaksPairs[i][0]
    # data frame with 2 columns, where column 1 is a peak, and column 2 is
    # the next peak that follows it
    dd = DataFrame({'a': indexOfLocalPeaks[0:(len(indexOfLocalPeaks) - 1)],
                    'b': indexOfLocalPeaks[1:len(indexOfLocalPeaks)]})
    # add one more row to dd to represent the last peak and last row of
    # soc1cumdr, so that you calculate the last possible trough, if there
    # was one between the last peak and the last day of data
    lastDdValue = dd.iloc[len(dd) - 1, 1]
    lastValueInData = len(soc1cumdr) - 1
    dd = rbind(dd, [lastDdValue, lastValueInData])

    def minBetween2Peaks(x):
        lowindex = int(x[0])
        highindex = int(x[1])
        minval = min(soc1cumdr[lowindex:(highindex + 1)])
        return minval

    localMins = dd.apply(minBetween2Peaks, 1)
    localMins.index = range(len(localMins))
    localPeaks = soc1cumdr[indexOfLocalPeaks.astype(int)]
    localPeaks.index = range(len(localPeaks))
    diffs = (localMins - localPeaks) / localPeaks

    # get indices of localMins in soc1cumdr so that you can get their dates
    def ff(x):
        '''get the index of soc1cumdr whose value = x'''
        r = soc1cumdr[soc1cumdr == x].index[0]
        return r

    # list() so the map result can be used as an indexer on Python 3
    indexOfLocalMins = list(map(ff, localMins))
    datesOfLocalMins = Series(dates)[indexOfLocalMins]
    datesOfLocalMins.index = range(len(datesOfLocalMins))

    # calculate peak to end of data
    def minBetweenPeakAndEnd(x):
        arr = soc1cumdr.iloc[x[0]:len(soc1cumdr)]
        return min(arr)

    absMinsToEnd = dd.apply(minBetweenPeakAndEnd, 1)
    absMinsToEnd.index = range(len(absMinsToEnd))
    diffsToEnd = (absMinsToEnd - localPeaks) / localPeaks
    ret = DataFrame({'Date': datesOfLocalMins, 'Peak': localPeaks,
                     'Valley': localMins, 'Diff': diffs,
                     'DiffToEnd': diffsToEnd})
    return ret
def rewrite_index(series: pd.Series) -> pd.Series:
    """Replace `source_reward_path` with info extracted from config at that path."""
    if "source_reward_path" in series.index.names:
        new_index = series.index.to_frame(index=False)
        source_reward = results.path_to_config(new_index["source_reward_type"],
                                               new_index["source_reward_path"])
        new_index = new_index.drop(
            columns=["source_reward_type", "source_reward_path"])
        new_index = pd.concat([source_reward, new_index], axis=1)
        new_index = pd.MultiIndex.from_frame(new_index)
        series.index = new_index
    return series
def test_value_counts(index_or_series_obj):
    obj = index_or_series_obj
    obj = np.repeat(obj, range(1, len(obj) + 1))
    result = obj.value_counts()

    counter = collections.Counter(obj)
    expected = Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
    expected.index = expected.index.astype(obj.dtype)
    if isinstance(obj, pd.MultiIndex):
        expected.index = Index(expected.index)

    if not isinstance(result.dtype, np.dtype):
        # i.e IntegerDtype
        expected = expected.astype("Int64")

    # TODO(GH#32514): Order of entries with the same count is inconsistent
    #  on CI (gh-32449)
    if obj.duplicated().any():
        result = result.sort_index()
        expected = expected.sort_index()
    tm.assert_series_equal(result, expected)
def shift_dataVX(df):
    df = add_last_bar(df)
    # fillna returns a new frame, so assign the result (the original code
    # discarded it, leaving the NaNs in place)
    df = df.fillna(method='ffill')
    df = add_last_bar(df)
    df = df.fillna(method='ffill')
    df['PrevCloses'] = my_rolling_apply_series(df['Close'], to_csv_str,
                                               AdaptationWindow)
    dates = Series(df.index)
    dates.index = df.index
    df['PrevDates'] = my_rolling_apply_series(dates, to_csv_str,
                                              AdaptationWindow)
    return df
def test_droplevel(self):
    # GH20342
    ser = Series([1, 2, 3, 4])
    ser.index = MultiIndex.from_arrays(
        [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"]
    )
    expected = ser.reset_index("b", drop=True)
    result = ser.droplevel("b", axis="index")
    tm.assert_series_equal(result, expected)
    # test that droplevel raises ValueError on axis != 0
    with pytest.raises(ValueError):
        ser.droplevel(1, axis="columns")
def generate_target_regression(origin_data):
    '''
    Build the target series, dropping the final observation:
    Y_t = X_(t+1)
    :param origin_data(Series) : raw data
    :return target(Series) : target data
    '''
    if not isinstance(origin_data, Series):
        origin_data = Series(origin_data)
        origin_data.index = range(0, len(origin_data))
    target = Series(len(origin_data), dtype='float64')
    for i in range(0, len(origin_data) - 1):
        target[i] = origin_data[i + 1]
    return target
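# Usage sketch for generate_target_regression: the target is the input
# shifted one step left (Y_t = X_(t+1)), so the result has one fewer
# element than the input. Example data is made up.
_x = [1.0, 2.0, 3.0, 4.0]
_y = generate_target_regression(_x)  # Series([2.0, 3.0, 4.0])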
def simulate2Group(n=100, p=1000, n1=None, effect=None):
    if n1 is None:
        n1 = int(numpy.ceil(0.5 * n))
    if effect is None:
        effect = [1] * 10
    x = DataFrame(numpy.random.randn(n, p))
    y = Series(([0] * n1) + ([1] * (n - n1)))
    x.columns = ["g" + str(g) for g in range(p)]
    x.index = ["i" + str(i) for i in range(n)]
    y.index = x.index
    for i in range(len(effect)):
        x.ix[y == 1, i] = x.ix[y == 1, i] + effect[i]
    return {"x": x, "y": y}
def jitter_offset(x, nbins=100, multiplier=2.0):
    bins, edges = histogram(x, bins=nbins, density=True)
    bins /= bins.max()
    try:
        assignments = digitize(x, edges, right=True) - 1
        assignments[assignments < 0] = 0
        output = sqrt(Series(bins).iloc[assignments])
    except ValueError:
        output = Series([0] * x.shape[0])
    output.index = x.index
    output *= random.choice([-multiplier, multiplier], output.shape[0])
    output *= random.choice(arange(1, 25) / 100., output.shape[0])
    return output
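# Usage sketch for jitter_offset, which draws density-weighted random
# offsets (e.g. for strip plots); assumes histogram, digitize, sqrt,
# random and arange were imported from numpy as in the snippet above.
# The input data here is made up.
_x = Series(np.random.randn(500))
_offsets = jitter_offset(_x, nbins=50)  # same index as _x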
def difference_process_log(origin_data):
    '''
    Difference the series; drop the first element after processing.
    log(n / n-1), usable for computing returns.
    :param origin_data(Series) : raw data
    :return diff_data(Series) : stationarised data
    '''
    if not isinstance(origin_data, Series):
        origin_data = Series(origin_data)
        origin_data.index = range(0, len(origin_data))
    diff = Series(len(origin_data), dtype='float64')
    for i in range(1, len(origin_data)):
        diff[i] = np.log(origin_data[i] / origin_data[i - 1])
    return diff
def fit_transform(self, X: pd.DataFrame, y: pd.Series):
    y.index = X[self.column_id].unique()
    features = extract_relevant_features(
        X, y, column_id=self.column_id, column_sort=self.column_sort,
        default_fc_parameters=self.default_fc_parameters,
        n_jobs=self.n_jobs)
    self.columns = features.columns.tolist()
    return features.reset_index(drop=True)
def test_reindex_categorical():
    index = date_range("20000101", periods=3)

    # reindexing to an invalid Categorical
    s = Series(["a", "b", "c"], dtype="category")
    result = s.reindex(index)
    expected = Series(
        Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"])
    )
    expected.index = index
    tm.assert_series_equal(result, expected)

    # partial reindexing
    expected = Series(Categorical(values=["b", "c"], categories=["a", "b", "c"]))
    expected.index = [1, 2]
    result = s.reindex([1, 2])
    tm.assert_series_equal(result, expected)

    expected = Series(Categorical(values=["c", np.nan], categories=["a", "b", "c"]))
    expected.index = [2, 3]
    result = s.reindex([2, 3])
    tm.assert_series_equal(result, expected)
def difference_process(origin_data):
    '''
    Difference the series; drop the first element after processing.
    t_n - (t_n-1)
    :param origin_data(Series) : raw data
    :return diff_data(Series) : stationarised data
    '''
    if not isinstance(origin_data, Series):
        origin_data = Series(origin_data)
        origin_data.index = range(0, len(origin_data))
    diff = Series(len(origin_data), dtype='float64')
    for i in range(1, len(origin_data)):
        diff[i] = origin_data[i] - origin_data[i - 1]
    return diff
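# Usage sketch for the two differencing helpers above (made-up prices).
# Element 0 of each result is a placeholder (the Series constructor
# seeds it with len(origin_data)), so discard it as the docstrings say.
_prices = [100.0, 101.5, 99.0, 102.0]
_abs_diff = difference_process(_prices)[1:]      # 1.5, -2.5, 3.0
_log_ret = difference_process_log(_prices)[1:]   # one-period log returns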
def test_setitem_non_bool_into_bool(self, val, indexer_sli, unique):
    # dont cast these 3-like values to bool
    ser = Series([True, False])
    if not unique:
        ser.index = [1, 1]

    indexer_sli(ser)[1] = val
    assert type(ser.iloc[1]) == type(val)

    expected = Series([True, val], dtype=object, index=ser.index)
    if not unique and indexer_sli is not tm.iloc:
        expected = Series([val, val], dtype=object, index=[1, 1])
    tm.assert_series_equal(ser, expected)
def simulate2Group(n=100, p=1000, n1=None, effect=None):
    if n1 is None:
        n1 = int(numpy.ceil(0.5 * n))
    if effect is None:
        effect = [1] * 10
    x = DataFrame(numpy.random.randn(n, p))
    y = Series(([0] * n1) + ([1] * (n - n1)))
    x.columns = ["g" + str(g) for g in xrange(p)]
    x.index = ["i" + str(i) for i in xrange(n)]
    y.index = x.index
    for i in xrange(len(effect)):
        x.ix[y == 1, i] = x.ix[y == 1, i] + effect[i]
    return {"x": x, "y": y}
def _get_quotes(self, currency_name):
    """Get quotes for currencies vs selected currency code

    :param currency_name: currency code for calculating quotes
    :return: DataFrame with quotes vs currency code
    """
    payload = {'access_key': self.access_key}

    # Fetch quotes vs USD, then normalise the currency codes
    # (strip the 'USD' prefix and map 'RUB' to 'RUR')
    quotes_usd = requests.get(self.quotes_url, params=payload)
    quotes_usd = Series(quotes_usd.json()['quotes'])
    quotes_usd.index = quotes_usd.index.str.slice(3)
    quotes_usd.index = quotes_usd.index.str.replace('RUB', 'RUR')

    # Calculate quotes vs selected currency
    quotes_curr = quotes_usd[currency_name] / quotes_usd
    quotes_curr['BYR'] *= 10000
    quotes_df = DataFrame(quotes_curr, columns=['currency.rate'])
    return quotes_df
def test_unstack_fill_frame_timedelta(self):
    # Test unstacking with time deltas
    td = [Timedelta(days=i) for i in range(4)]
    data = Series(td)
    data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"),
                                         ("y", "b"), ("z", "a")])

    result = data.unstack()
    expected = DataFrame({"a": [td[0], pd.NaT, td[3]],
                          "b": [td[1], td[2], pd.NaT]},
                         index=["x", "y", "z"])
    assert_frame_equal(result, expected)

    result = data.unstack(fill_value=td[1])
    expected = DataFrame({"a": [td[0], td[1], td[3]],
                          "b": [td[1], td[2], td[1]]},
                         index=["x", "y", "z"])
    assert_frame_equal(result, expected)
def test_unstack_fill_frame_datetime(self):
    # Test unstacking with date times
    dv = pd.date_range("2012-01-01", periods=4).values
    data = Series(dv)
    data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"),
                                         ("y", "b"), ("z", "a")])

    result = data.unstack()
    expected = DataFrame({"a": [dv[0], pd.NaT, dv[3]],
                          "b": [dv[1], dv[2], pd.NaT]},
                         index=["x", "y", "z"])
    assert_frame_equal(result, expected)

    result = data.unstack(fill_value=dv[0])
    expected = DataFrame({"a": [dv[0], dv[0], dv[3]],
                          "b": [dv[1], dv[2], dv[0]]},
                         index=["x", "y", "z"])
    assert_frame_equal(result, expected)
def get_spot_curve(spot_rates_curve: pd.Series):
    tenors = []
    for i in spot_rates_curve.index:
        n, per = i.split("-")
        n = int(n)
        tenor = n / 12 if per == "month" else n
        tenors.append(tenor)
    spot_rates_curve.index = tenors
    spot_rates_curve.name = "rate"
    spot_rates_curve = spot_rates_curve.astype(float) / 100
    return spot_rates_curve.to_frame()
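# Usage sketch for get_spot_curve: the index format it assumes is
# "<n>-month" or "<n>-year" strings, with rates quoted in percent.
# The example values below are made up.
_curve = pd.Series({"6-month": "4.25", "1-year": "4.10", "5-year": "3.90"})
_frame = get_spot_curve(_curve)  # tenors 0.5, 1, 5; rates as decimals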
def wrap_results(self):
    results = self.results

    # see if we can infer the results
    if len(results) > 0 and is_sequence(results[0]):
        return self.wrap_results_for_axis()

    # dict of scalars
    from pandas import Series
    result = Series(results)
    result.index = self.res_index

    return result
def test_series_grouper_result_length_difference():
    # GH 40014
    obj = Series(np.random.randn(10), dtype="float64")
    obj.index = obj.index.astype("O")
    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)

    grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2)
    result, counts = grouper.get_result()

    expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)], dtype=object)
    tm.assert_equal(result, expected)

    exp_counts = np.array([3, 4], dtype=np.int64)
    tm.assert_equal(counts, exp_counts)
def test_reindex_categorical():
    index = date_range('20000101', periods=3)

    # reindexing to an invalid Categorical
    s = Series(['a', 'b', 'c'], dtype='category')
    result = s.reindex(index)
    expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
                                  categories=['a', 'b', 'c']))
    expected.index = index
    tm.assert_series_equal(result, expected)

    # partial reindexing
    expected = Series(Categorical(values=['b', 'c'],
                                  categories=['a', 'b', 'c']))
    expected.index = [1, 2]
    result = s.reindex([1, 2])
    tm.assert_series_equal(result, expected)

    expected = Series(Categorical(values=['c', np.nan],
                                  categories=['a', 'b', 'c']))
    expected.index = [2, 3]
    result = s.reindex([2, 3])
    tm.assert_series_equal(result, expected)
def BaggingTrain(X, tree_num=20, treedeeplim=np.inf):
    """
    Use C4.5 decision tree as base learner to construct a Bagging model.

    @X: DataFrame, each row is a sample and each column is a feature,
        except the last column, which is the sample labels. The feature
        columns should be continuous values.
    @tree_num: the number of base learners in the Bagging model.
    @treedeeplim: the maximum depth of a decision tree allowed. It starts
        from 0, i.e., for a decision tree with a depth no greater than 1,
        its treedeeplim value is 0.
    """
    D = np.ones(X.shape[0]) / X.shape[0]
    D = Series(D)
    D.index = X.index
    trees = []       # All the base learners
    err_rates = []
    seed = 16
    for _ in range(tree_num):  # Here, _ is a dummy variable
        # sklearn.utils.resample(*arrays, replace=True, random_state=None)
        # resamples arrays or sparse matrices in a consistent way; the
        # default strategy implements one step of the bootstrapping
        # procedure. With replace=False it would instead implement
        # (sliced) random permutations.
        X_, D_ = resample(X, D, random_state=seed)
        X_.index = range(1, X_.shape[0] + 1)
        D_.index = range(1, D_.shape[0] + 1)
        D_ = D_ / np.sum(D_)
        y_ = X_.ix[:, X_.shape[1] - 1]  # last column holds the labels
        seed += 1
        tree = createSingleTree(X=X_, D=D_, deep=0, deeplim=treedeeplim)
        trees.append(tree)
        hx = predictBase(tree=tree, X=X_)
        err_rate = np.sum(D_[list(hx != y_)])
        err_rates.append(err_rate)
    return trees, err_rates
def test_resample_empty_dataframe_all_ts(empty_frame, freq, resample_method):
    # GH13212
    df = empty_frame
    # count retains dimensions too
    result = getattr(df.resample(freq), resample_method)()
    if resample_method != 'size':
        expected = df.copy()
    else:
        # GH14962
        expected = Series([])

    expected.index = df.index._shallow_copy(freq=freq)
    assert_index_equal(result.index, expected.index)
    assert result.index.freq == expected.index.freq
    assert_almost_equal(result, expected, check_dtype=False)
def test_subtracting_two_series_with_unordered_index_and_all_nan_index(
    self, data_result, data_expected
):
    # GH 38439
    a_index_result = MultiIndex.from_tuples(data_result[0])
    b_index_result = MultiIndex.from_tuples(data_result[1])

    a_series_result = Series(data_result[2], index=a_index_result)
    b_series_result = Series(data_result[3], index=b_index_result)

    result = a_series_result.align(b_series_result)

    a_index_expected = MultiIndex.from_tuples(data_expected[0])
    b_index_expected = MultiIndex.from_tuples(data_expected[1])

    a_series_expected = Series(data_expected[2], index=a_index_expected)
    b_series_expected = Series(data_expected[3], index=b_index_expected)

    a_series_expected.index = a_series_expected.index.set_levels([
        a_series_expected.index.levels[0].astype("float"),
        a_series_expected.index.levels[1].astype("float"),
    ])
    b_series_expected.index = b_series_expected.index.set_levels([
        b_series_expected.index.levels[0].astype("float"),
        b_series_expected.index.levels[1].astype("float"),
    ])

    tm.assert_series_equal(result[0], a_series_expected)
    tm.assert_series_equal(result[1], b_series_expected)
def make_qtrly(s: pd.Series, t: str = 'first', name: str = None) -> pd.Series:
    s.index = pd.DatetimeIndex(s.index.values, dtype=dt.date)
    s.index.freq = s.index.inferred_freq
    name = name or s.name or ''
    # print(s)

    if t == 'mean':
        s = s.resample('1Q').mean().astype(np.float64)
    elif t == 'first':
        s = s.resample('1Q').first().astype(np.float64)
    elif t == 'last':
        s = s.resample('1Q').last().astype(np.float64)

    if s.isnull().any():
        print(
            f'Series {name} still has some empty data. Filling that in with the last known value.'
        )
        s.fillna(method='ffill', inplace=True)

    # Conform everything to the end of the quarter. Timestamp.replace
    # returns a new object, so collect the results (the original loop
    # discarded them, leaving the index unchanged).
    idx = []
    for v in s.index:
        v = v.replace(month=math.ceil(v.month / 3) * 3)
        v = v.replace(day=calendar.monthrange(v.year, v.month)[-1])
        idx.append(v)
    s.index = pd.DatetimeIndex(idx)

    # s.index = s.index + pd.Timedelta(3, unit='M') - pd.Timedelta(1, unit='d')
    # s.index = pd.to_datetime([d + relativedelta(days=1) for d in s.index])
    # s.index.freq = s.index.inferred_freq

    # I wanted to make this function more dynamic and eliminate the if/else
    # with the line below, but it failed:
    # s = s.resample('3MS').apply(eval(t + '(self)', {"__builtins__": None}, safe_funcs)).astype(np.float64)
    # print(s)
    return s
def attach_rows(self, result):
    # assumes if len(row_labels) > len(result) it's because it was
    # truncated at the front, for AR lags, for example
    squeezed = result.squeeze()
    k_endog = np.array(self.ynames, ndmin=1).shape[0]
    if k_endog > 1 and squeezed.shape == (k_endog,):
        squeezed = squeezed[None, :]
    # May be zero-dim, for example in the case of forecast one step in tsa
    if squeezed.ndim < 2:
        out = Series(squeezed)
    else:
        out = DataFrame(result)
        out.columns = self.ynames
    out.index = self.row_labels[-len(result):]
    return out
def test_value_counts_unique(self, tz_naive_fixture):
    tz = tz_naive_fixture
    # GH 7735
    idx = date_range("2011-01-01 09:00", freq="H", periods=10)
    # create repeated values, 'n'th element is repeated by n+1 times
    idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz)

    exp_idx = date_range("2011-01-01 18:00", freq="-1H", periods=10, tz=tz)
    expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64")
    expected.index = expected.index._with_freq(None)

    for obj in [idx, Series(idx)]:
        tm.assert_series_equal(obj.value_counts(), expected)

    expected = date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz)
    expected = expected._with_freq(None)
    tm.assert_index_equal(idx.unique(), expected)

    idx = DatetimeIndex(
        [
            "2013-01-01 09:00",
            "2013-01-01 09:00",
            "2013-01-01 09:00",
            "2013-01-01 08:00",
            "2013-01-01 08:00",
            pd.NaT,
        ],
        tz=tz,
    )

    exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00"], tz=tz)
    expected = Series([3, 2], index=exp_idx)

    for obj in [idx, Series(idx)]:
        tm.assert_series_equal(obj.value_counts(), expected)

    exp_idx = DatetimeIndex(
        ["2013-01-01 09:00", "2013-01-01 08:00", pd.NaT], tz=tz)
    expected = Series([3, 2, 1], index=exp_idx)

    for obj in [idx, Series(idx)]:
        tm.assert_series_equal(obj.value_counts(dropna=False), expected)

    tm.assert_index_equal(idx.unique(), exp_idx)
def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method):
    # GH13212
    df = empty_frame_dti
    # count retains dimensions too
    result = getattr(df.resample(freq), resample_method)()
    if resample_method != "size":
        expected = df.copy()
    else:
        # GH14962
        expected = Series([], dtype=object)

    expected.index = _asfreq_compat(df.index, freq)

    tm.assert_index_equal(result.index, expected.index)
    assert result.index.freq == expected.index.freq
    tm.assert_almost_equal(result, expected, check_dtype=False)
def proc_use_data(data, mtype, site, time_period='D', n_std=4):
    """
    Function for parse_ht_xml to process the data and aggregate it to a
    defined resolution.
    """
    ### Select the process sequence based on the mtype and convert to
    ### period volume
    data[data < 0] = nan
    count1 = float(data.count().values[0])

    if mtype == 'Water Meter':
        ## Check to determine whether it is cumulative or period volume
        diff1 = data.diff()[1:]
        neg_index = diff1 < 0
        neg_ratio = sum(neg_index.values) / count1
        if neg_ratio > 0.1:
            outliers = abs(data - data.mean()) > (data.std() * n_std)
            data[outliers] = nan
            vol = data
        else:
            # Replace the negative values with zero and the very large values
            diff1[diff1 < 0] = data[diff1 < 0]
            outliers = abs(diff1 - diff1.mean()) > (diff1.std() * n_std)
            diff1[outliers] = nan
            vol = diff1
    elif (mtype == 'Abstraction Volume') | (mtype == 'Average Flow'):
        outliers = abs(data - data.mean()) > (data.std() * n_std)
        data[outliers] = nan
        vol = data
    elif mtype == 'Flow':
        outliers = abs(data - data.mean()) > (data.std() * n_std)
        data[outliers] = nan
        # Determine the diff index
        t1 = Series(data.index).diff().dt.seconds.shift(-1)
        t1.iloc[-1] = t1.iloc[-2]
        t1.index = data.index
        # Convert to volume
        vol = data.multiply(t1, axis=0) * 0.001

    ## Estimate the NAs
    vol2 = vol.fillna(method='ffill')

    ## Resample the volumes
    vol_res = vol2.resample(time_period).sum()
    vol_res.loc[:, 'site'] = site

    return vol_res
def test_value_counts_null(null_obj, index_or_series_obj):
    orig = index_or_series_obj
    obj = orig.copy()

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(orig, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj.values
    if needs_i8_conversion(obj.dtype):
        values[0:2] = iNaT
    else:
        values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    counter = collections.Counter(obj.dropna())
    expected = Series(dict(counter.most_common()), dtype=np.int64)
    expected.index = expected.index.astype(obj.dtype)

    result = obj.value_counts()
    if obj.duplicated().any():
        # TODO:
        #  Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()
    tm.assert_series_equal(result, expected)

    # can't use expected[null_obj] = 3 as
    # IntervalIndex doesn't allow assignment
    new_entry = Series({np.nan: 3}, dtype=np.int64)
    expected = expected.append(new_entry)

    result = obj.value_counts(dropna=False)
    if obj.duplicated().any():
        # TODO:
        #  Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()
    tm.assert_series_equal(result, expected)
def read_in(self, mat):
    """Read the input part.
    """
    # Check if more than one time series model is present
    if not isinstance(mat['IN'], np.ndarray):
        mat['IN'] = [mat['IN']]

    # Read all the time series models
    for i, IN in enumerate(mat['IN']):
        data = dict()
        for name in IN._fieldnames:
            if name != 'values':
                data[name] = getattr(IN, name)
            else:
                tindex = [
                    matlab2datetime(tval) for tval in IN.values[:, 0]
                ]
                series = Series(IN.values[:, 1], index=tindex)
                # round on seconds, to get rid of conversion milliseconds
                series.index = series.index.round('s')
                if hasattr(IN, 'type'):
                    IN.Type = IN.type
                if IN.Type in ['EVAP', 'PREC', 'WELL']:
                    # in menyanthes, the flux is summed over the
                    # time-step, so divide by the timestep now
                    step = series.index.to_series().diff() / offsets.Day(1)
                    step = step.values.astype(np.float)
                    series = series / step
                    if series.values[0] != 0:
                        series = series[1:]
                data['values'] = series

        # add to self.IN
        if not hasattr(IN, 'Name') and not hasattr(IN, 'name'):
            IN.Name = 'IN' + str(i)
        if hasattr(IN, 'name'):
            IN.Name = IN.name
        self.IN[IN.Name] = data
def test_unstack_fill(self):
    # GH #9746: fill_value keyword argument for Series
    # and DataFrame unstack

    # From a series
    data = Series([1, 2, 4, 5], dtype=np.int16)
    data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"),
                                         ("y", "b"), ("z", "a")])

    result = data.unstack(fill_value=-1)
    expected = DataFrame({"a": [1, -1, 5], "b": [2, 4, -1]},
                         index=["x", "y", "z"], dtype=np.int16)
    assert_frame_equal(result, expected)

    # From a series with incorrect data type for fill_value
    result = data.unstack(fill_value=0.5)
    expected = DataFrame({"a": [1, 0.5, 5], "b": [2, 4, 0.5]},
                         index=["x", "y", "z"], dtype=np.float)
    assert_frame_equal(result, expected)
def test_resample_empty_dataframe(self, freq, resample_method):
    # GH13212
    index = self.create_series().index[:0]
    f = DataFrame(index=index)

    # count retains dimensions too
    result = getattr(f.resample(freq), resample_method)()
    if resample_method != 'size':
        expected = f.copy()
    else:
        # GH14962
        expected = Series([])

    expected.index = f.index._shallow_copy(freq=freq)
    assert_index_equal(result.index, expected.index)
    assert result.index.freq == expected.index.freq
    assert_almost_equal(result, expected, check_dtype=False)
def _dkl_n_group(self, group_df, name, cols, p_n):
    """
    Calculate DKL(n|y) for a single group.

    Parameters
    ----------
    group_df : DataFrame
    name : String
        Name of the group (from a pandas groupby)

    Returns
    -------
    DKL_n : Series
    """
    DKL_n = Series(self._entropy(group_df, p_n), index=cols, name=name)
    DKL_n.index = ['DKL(n|y)_' + str(item) for item in DKL_n.index]
    return DKL_n
def test_unstack_fill(self):
    # GH #9746: fill_value keyword argument for Series
    # and DataFrame unstack

    # From a series
    data = Series([1, 2, 4, 5], dtype=np.int16)
    data.index = MultiIndex.from_tuples(
        [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
    )

    result = data.unstack(fill_value=-1)
    expected = DataFrame(
        {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16
    )
    tm.assert_frame_equal(result, expected)

    # From a series with incorrect data type for fill_value
    result = data.unstack(fill_value=0.5)
    expected = DataFrame(
        {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=np.float
    )
    tm.assert_frame_equal(result, expected)

    # GH #13971: fill_value when unstacking multiple levels:
    df = DataFrame(
        {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]}
    ).set_index(["x", "y", "z"])
    unstacked = df.unstack(["x", "y"], fill_value=0)
    key = ("w", "b", "j")
    expected = unstacked[key]
    result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
    tm.assert_series_equal(result, expected)

    stacked = unstacked.stack(["x", "y"])
    stacked.index = stacked.index.reorder_levels(df.index.names)
    # Workaround for GH #17886 (unnecessarily casts to float):
    stacked = stacked.astype(np.int64)
    result = stacked.loc[df.index]
    tm.assert_frame_equal(result, df)

    # From a series
    s = df["w"]
    result = s.unstack(["x", "y"], fill_value=0)
    expected = unstacked["w"]
    tm.assert_frame_equal(result, expected)