Example #1
    def test_series(self, orient, numpy):
        s = Series([10, 20, 30, 40, 50, 60], name="series",
                   index=[6, 7, 8, 9, 10, 15]).sort_values()

        encode_kwargs = {} if orient is None else dict(orient=orient)
        decode_kwargs = {} if numpy is None else dict(numpy=numpy)

        output = ujson.decode(ujson.encode(s, **encode_kwargs),
                              **decode_kwargs)

        if orient == "split":
            dec = _clean_dict(output)
            output = Series(**dec)
        else:
            output = Series(output)

        if orient in (None, "index"):
            s.name = None
            output = output.sort_values()
            s.index = ["6", "7", "8", "9", "10", "15"]
        elif orient in ("records", "values"):
            s.name = None
            s.index = [0, 1, 2, 3, 4, 5]

        tm.assert_series_equal(output, s, check_dtype=False)
Example #2
def genOCRData(alpha,N,mu1,mu2,train):
	k = np.random.randint(2,8)
	if train:
		mu1 = Series(np.zeros(dim-k)).append(Series(np.ones(k)))
		mu1.index = range(len(mu1))
		np.random.shuffle(mu1)
		mu1.index = range(len(mu1))
		mu2 = Series(np.zeros(dim-k)).append(Series(np.ones(k)))
		mu2.index = range(len(mu2))
		np.random.shuffle(mu2)
		mu2.index = range(len(mu2))
	data1 = DataFrame(np.random.normal(mu,alpha*sigma,(N,dim)))+mu1
	data2 = DataFrame(np.random.normal(mu,alpha*sigma,(N,dim)))+mu2
	return data1, data2, mu1, mu2
Example #3
    def compute_summary(self, combined_df):
        combined_mean = combined_df.mean()
        average_cons = combined_mean['consistency']
        average_ambi = combined_mean['ambiguity']
        # completeness = self.compute_completeness()
        noise = self.compute_noise()

        series = Series([average_cons, average_ambi])
        series.index = ['Average Consistency', 'Average Ambiguity']

        series_2 = Series(noise)
        series_2.index = [i.title() for i in series_2.index]
        series = series.append(series_2)
        return series
    def test_combine_first(self):
        values = tm.makeIntIndex(20).values.astype(float)
        series = Series(values, index=tm.makeIntIndex(20))

        series_copy = series * 2
        series_copy[::2] = np.NaN

        # nothing used from the input
        combined = series.combine_first(series_copy)

        tm.assert_series_equal(combined, series)

        # Holes filled from input
        combined = series_copy.combine_first(series)
        assert np.isfinite(combined).all()

        tm.assert_series_equal(combined[::2], series[::2])
        tm.assert_series_equal(combined[1::2], series_copy[1::2])

        # mixed types
        index = tm.makeStringIndex(20)
        floats = Series(tm.randn(20), index=index)
        strings = Series(tm.makeStringIndex(10), index=index[::2])

        combined = strings.combine_first(floats)

        tm.assert_series_equal(strings, combined.loc[index[::2]])
        tm.assert_series_equal(floats[1::2].astype(object),
                               combined.loc[index[1::2]])

        # corner case
        s = Series([1., 2, 3], index=[0, 1, 2])
        result = s.combine_first(Series([], index=[]))
        s.index = s.index.astype('O')
        assert_series_equal(s, result)
 def data_sum(self, grouped):
     '''
     Compute column totals.
     :param grouped: grouped data containing the Clicks / Impressions / Orders / Spend / Sales columns
     :return: sum_series
     '''
     format_ = lambda x: '%.2f' % x
     sum_clicks = grouped['Clicks'].sum()
     sum_impressions = grouped['Impressions'].sum()
     sum_orders = grouped['1-day Orders Placed (#)'].sum()
     sum_spend = grouped['Total Spend'].sum()
     sum_sales = grouped['1-day Ordered Product Sales'].sum()
     if sum_clicks == 0:
         sum_conversion = 0
     else:
         sum_conversion = sum_orders/sum_clicks
     if sum_impressions == 0:
         sum_ctr = 0
     else:
         sum_ctr = sum_clicks/sum_impressions
     if sum_sales == 0:
         sum_acos = 0
     else:
         sum_acos = sum_spend/sum_sales
     sum_series = Series([sum_clicks, sum_orders, sum_spend, sum_sales, sum_conversion, sum_acos, sum_ctr]).apply(format_)
     sum_series.index = ['Clicks', '1-day Orders Placed (#)', 'Total Spend',
                             '1-day Ordered Product Sales', 'Average conversion rate', 'Average ACOS', 'Average CTR']
     return sum_series
def slide10():
    print '2012Q4, Q-JAN'
    p = pd.Period('2012Q4', freq='Q-JAN')
    print p
    print '2012Q4 start'
    print p.asfreq('D', 'start')
    print '2012Q4 end'
    print p.asfreq('D', 'end')

    print p.asfreq('B', 'e')
    print '4PM on the 2nd to last business day of the quarter'
    print (p.asfreq('B', 'e') - 1).asfreq('T', 's')
    p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
    print p4pm
    print p4pm.to_timestamp()

    print 'timeseries'
    rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')
    ts = Series(np.arange(len(rng)), index=rng)
    print ts
    new_rng = (rng.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
    print 'new range'
    print new_rng
    ts.index = new_rng.to_timestamp()
    print ts
Example #7
def imageData(a,N,dim,mu1,mu2,train):
	dim = 35
	k = 30
	if train:
		mu1 = Series(np.zeros(dim-k)).append(Series(np.ones(k)))
		mu1.index = range(len(mu1))
		np.random.shuffle(mu1)
		mu1.index = range(len(mu1))
		mu2 = Series(np.zeros(dim-k)).append(Series(np.ones(k)))
		mu2.index = range(len(mu2))
		np.random.shuffle(mu2)
		mu2.index = range(len(mu2))
	data1 = DataFrame(np.random.normal(0,a*1,(N/2,dim)))+mu1
	data2 = DataFrame(np.random.normal(0,a*1,(N/2,dim)))+mu2
	y = np.append(np.ones(N/2),np.zeros(N/2))
	return np.append(data1,data2,0), y, mu1, mu2
def simulation(data,df_info):  
    #initialize result data structure
    result = {}
    site_count = 0
    #loop through sites
    for site in data.columns:
        s = df_info.ix[site]    #get info, better if we can have siteID 
        df = DataFrame(data[site])
        # t is a series of time, check the open time and alert
        t = Series(df.index).apply(is_open_at, op=s.OpenWeekday, ed=s.CloseWeekday, op_S=s.OpenWeekend, ed_S=s.CloseWeekend, delay=1)
        t.index = df.index
        
        df['open'] = t
#        df['ecm'] = df.apply(is_lighting_ecm, name=site, thld=s.Threshold, mult=1.3, add = 3, axis=1)
        df['ecm'] = df.apply(is_lighting_ecm2, name=site, thld=s.Threshold, ave=s.DaytimeAve, mult=0.7, axis=1)
        df.ix[-1,'ecm'] = False   #make sure the alarm will stop, i know i am lazy
           
        if df['ecm'].any():
            item = event_to_ecm(df,site,s.Threshold,limit=6)
            if len(item['start'])>0: result[site] = item
    
        site_count = site_count + 1
        print str(site_count) + ' of ' + str(len(data.columns)) + ' : ' + site
    
    return result
Example #9
 def setup(self):
     s = Series([np.nan] * 10000)
     s[0] = 3.0
     s[100] = -1.0
     s[999] = 12.1
     s.index = MultiIndex.from_product([range(10)] * 4)
     self.ss = s.to_sparse()
Example #10
def aggByTimePeriodByTradingDay(dfIntra=None,hhmmLow=958, hhmmHigh=1002,dateCol='date',
                                aggCol='volume'):
    '''
    Aggregate an intra day series with a date column of yyyyMmDdHhMmSs values, and a column
      of values that you will aggregate using group by, like volume.
    Example:
        aggByTimePeriodByTradingDay()
    '''
    # get day
    df = dfIntra
    if (df is None) or (df.empty):
        df = readData('cl201404.csv')
    yyyyMmDd = map(lambda x:int(str(x)[0:8]),df[dateCol])
    hhmm = map(lambda x:int(str(x)[8:12]),df[dateCol])
    # insert in dataframe
    df['yyyyMmDd'] = yyyyMmDd
    df['hhmm'] = hhmm
    # get bars between hhmm times
    validBars = map(lambda x:(x >= hhmmLow) and (x <= hhmmHigh) ,hhmm)
    # select those bars
    dfvb = df.loc[validBars]
    # groupby day
    dfgb = dfvb.groupby('yyyyMmDd')
    # do agg sum
    dfgba = dfgb[aggCol].aggregate(np.sum)
    dates = Series(dfvb['yyyyMmDd'].unique())
    dates.index = range(len(dates))
    agg = dfgba
    agg.index = range(len(dfgba))
    newdf = DataFrame({'date':dates,'agg':agg})
    plotDf(newdf,priceCol='agg',dateCol='date')
    def test_rank_int(self):
        s = self.s.dropna().astype('i8')

        for method, res in self.results.items():
            result = s.rank(method=method)
            expected = Series(res).dropna()
            expected.index = result.index
            assert_series_equal(result, expected)
Example #12
def test_series_getitem_multiindex(access_method, level1_value, expected):

    # GH 6018
    # series regression getitem with a multi-index

    s = Series([1, 2, 3])
    s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)])
    result = access_method(s, level1_value)
    tm.assert_series_equal(result, expected)
Example #13
    def test_set_index_makes_timeseries(self):
        idx = tm.makeDateIndex(10)

        s = Series(lrange(10))
        s.index = idx

        with tm.assert_produces_warning(FutureWarning):
            self.assertTrue(s.is_time_series)
        self.assertTrue(s.index.is_all_dates)
Example #14
def test_resample_empty_dataframe(empty_frame, freq, resample_method):
    # GH13212
    df = empty_frame
    # count retains dimensions too
    result = getattr(df.resample(freq), resample_method)()
    if resample_method != 'size':
        expected = df.copy()
    else:
        # GH14962
        expected = Series([])

    if isinstance(df.index, PeriodIndex):
        expected.index = df.index.asfreq(freq=freq)
    else:
        expected.index = df.index._shallow_copy(freq=freq)
    assert_index_equal(result.index, expected.index)
    assert result.index.freq == expected.index.freq
    assert_almost_equal(result, expected, check_dtype=False)
Example #15
def test_reindex_nan():
    ts = Series([2, 3, 5, 7], index=[1, 4, nan, 8])

    i, j = [nan, 1, nan, 8, 4, nan], [2, 0, 2, 3, 1, 2]
    assert_series_equal(ts.reindex(i), ts.iloc[j])

    ts.index = ts.index.astype('object')

    # reindex coerces index.dtype to float, loc/iloc doesn't
    assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False)
Example #16
def asSeries(df, name='', limit=0):
    '''Get the time series indexed by day of release.'''
    if 'Gross' not in df or 'Day #' not in df:
        print('{} has an empty dataframe'.format(name))
        return Series()
    series = Series(df['Gross'])
    series.index = df['Day #']
    if limit > 0:
        series = series[:limit]
    series.name = name
    return series
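A minimal usage sketch for asSeries on made-up data (the column names 'Gross' and 'Day #' are the ones the function checks for; the figures are hypothetical):

import pandas as pd

toy = pd.DataFrame({'Day #': [1, 2, 3], 'Gross': [1000, 750, 500]})
daily = asSeries(toy, name='Some Film', limit=2)
# daily: Series named 'Some Film', indexed by 'Day #', truncated to the first 2 rows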
Example #17
 def test_droplevel(self):
     # GH20342
     ser = Series([1, 2, 3, 4])
     ser.index = MultiIndex.from_arrays([(1, 2, 3, 4), (5, 6, 7, 8)],
                                        names=['a', 'b'])
     expected = ser.reset_index('b', drop=True)
     result = ser.droplevel('b', axis='index')
     tm.assert_series_equal(result, expected)
     # test that droplevel raises ValueError on axis != 0
     with pytest.raises(ValueError):
         ser.droplevel(1, axis='columns')
Example #18
def test_slice_floats2():
    s = Series(np.random.rand(10), index=np.arange(10, 20, dtype=float))

    assert len(s.loc[12.0:]) == 8
    assert len(s.loc[12.5:]) == 7

    i = np.arange(10, 20, dtype=float)
    i[2] = 12.2
    s.index = i
    assert len(s.loc[12.0:]) == 8
    assert len(s.loc[12.5:]) == 7
Example #19
    def test_constructor_generator(self):
        gen = (i for i in range(10))

        result = Series(gen)
        exp = Series(lrange(10))
        assert_series_equal(result, exp)

        gen = (i for i in range(10))
        result = Series(gen, index=lrange(10, 20))
        exp.index = lrange(10, 20)
        assert_series_equal(result, exp)
Example #20
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of the input files and return it with the mean and
    standard deviation calculated.

    """

    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(
                    data=data[i].spread[val]['val'],
                    index=data[i].times
                    )
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():

        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If not a single file, keep only indices with at least two non-NaN
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard error as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_error
        spread.spread['times'] = times

    return spread, data
Example #21
    def test_constructor_map(self):
        # GH8909
        m = map(lambda x: x, range(10))

        result = Series(m)
        exp = Series(lrange(10))
        assert_series_equal(result, exp)

        m = map(lambda x: x, range(10))
        result = Series(m, index=lrange(10, 20))
        exp.index = lrange(10, 20)
        assert_series_equal(result, exp)
Example #23
    def test_to_xarray_multiindex(self):
        from xarray import DataArray

        s = Series(range(6))
        s.index.name = "foo"
        s.index = pd.MultiIndex.from_product([["a", "b"], range(3)],
                                             names=["one", "two"])
        result = s.to_xarray()
        assert len(result) == 2
        tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"])
        assert isinstance(result, DataArray)
        tm.assert_series_equal(result.to_series(), s)
Example #24
def peakToTroughs(dailyret,dates):
    '''
    Example:
        sr = s['retdat']
        stkd = s['stockData']
        dt = stkd['Date']
        ptk = peakToTroughs(sr,dt)
    '''
    ''' get cumulative percent changes'''
    drs = Series(dailyret)
    soc1dr = drs+1
    soc1cumdr = soc1dr.cumprod()
    localPeaksPairs = peakdetect(y_axis=soc1cumdr,lookahead=1)[0]
    indexOfLocalPeaks  = np.empty(len(localPeaksPairs));
    for i in range(len(indexOfLocalPeaks)):
        indexOfLocalPeaks[i] = localPeaksPairs[i][0]
    # data frame with 2 columns, where column 1 is a peak, and column 2 is the next peak that follows it
    dd = DataFrame({'a':indexOfLocalPeaks[0:(len(indexOfLocalPeaks)-1)],'b':indexOfLocalPeaks[1:len(indexOfLocalPeaks)]})
    # add one more row to dd to represent the last peak and last row of soc1cumdr, so
    #   that you calculate the last possible trough, if there was one between the last peak and the last day
    #   of data
    lastDdValue = dd.iloc[len(dd)-1,1]
    lastValueInData = len(soc1cumdr)-1
    dd = rbind(dd,[lastDdValue,lastValueInData])
    def minBetween2Peaks(x):
        lowindex = int(x[0])
        highindex = int(x[1])
        minval = min(soc1cumdr[lowindex:(highindex+1)])
        return minval
    localMins = dd.apply(minBetween2Peaks,1)
    localMins.index = range(len(localMins))
    localPeaks = soc1cumdr[indexOfLocalPeaks.astype(int)]
    localPeaks.index = range(len(localPeaks))
    diffs = (localMins - localPeaks)/localPeaks
    
    # get indices of localMins in soc1cumdr so that you can get their dates
    def ff(x):
        ''' this function gets the index of soc1cumdr whose value = x'''
        r = soc1cumdr[soc1cumdr==x].index[0]
        return r
    indexOfLocalMins = map(ff,localMins)
    datesOfLocalMins = Series(dates)[indexOfLocalMins]
    datesOfLocalMins.index = range(len(datesOfLocalMins))
    # calculate peak to end of data
    def minBetweenPeakAndEnd(x):
        arr = soc1cumdr.iloc[x[0]:len(soc1cumdr)]
        return min(arr)
    absMinsToEnd = dd.apply(minBetweenPeakAndEnd,1)
    absMinsToEnd.index = range(len(absMinsToEnd))
    diffsToEnd = (absMinsToEnd - localPeaks)/localPeaks
    ret =  DataFrame({'Date':datesOfLocalMins,'Peak':localPeaks,'Valley':localMins,'Diff':diffs,'DiffToEnd':diffsToEnd})

    return ret
def rewrite_index(series: pd.Series) -> pd.Series:
    """Replace `source_reward_path` with info extracted from config at that path."""
    if "source_reward_path" in series.index.names:
        new_index = series.index.to_frame(index=False)
        source_reward = results.path_to_config(new_index["source_reward_type"],
                                               new_index["source_reward_path"])
        new_index = new_index.drop(
            columns=["source_reward_type", "source_reward_path"])
        new_index = pd.concat([source_reward, new_index], axis=1)
        new_index = pd.MultiIndex.from_frame(new_index)
        series.index = new_index
    return series
Example #26
def test_value_counts(index_or_series_obj):
    obj = index_or_series_obj
    obj = np.repeat(obj, range(1, len(obj) + 1))
    result = obj.value_counts()

    counter = collections.Counter(obj)
    expected = Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
    expected.index = expected.index.astype(obj.dtype)
    if isinstance(obj, pd.MultiIndex):
        expected.index = Index(expected.index)

    if not isinstance(result.dtype, np.dtype):
        # i.e IntegerDtype
        expected = expected.astype("Int64")

    # TODO(GH#32514): Order of entries with the same count is inconsistent
    #  on CI (gh-32449)
    if obj.duplicated().any():
        result = result.sort_index()
        expected = expected.sort_index()
    tm.assert_series_equal(result, expected)
def shift_dataVX(df):
    df = add_last_bar(df)
    df = df.fillna(method='ffill')  # assign back: fillna returns a new frame rather than modifying df
    df = add_last_bar(df)
    df = df.fillna(method='ffill')
    df['PrevCloses'] = my_rolling_apply_series(df['Close'], to_csv_str,
                                               AdaptationWindow)
    dates = Series(df.index)
    dates.index = df.index
    df['PrevDates'] = my_rolling_apply_series(dates, to_csv_str,
                                              AdaptationWindow)
    return df
Example #28
 def test_droplevel(self):
     # GH20342
     ser = Series([1, 2, 3, 4])
     ser.index = MultiIndex.from_arrays(
         [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"]
     )
     expected = ser.reset_index("b", drop=True)
     result = ser.droplevel("b", axis="index")
     tm.assert_series_equal(result, expected)
     # test that droplevel raises ValueError on axis != 0
     with pytest.raises(ValueError):
         ser.droplevel(1, axis="columns")
def generate_target_regression(origin_data):
    '''
    Generate the target series (the last sample has no successor and is dropped):  Y_t = X_(t+1)
    :param origin_data(Series)  : original data
    :return target(Series)      : target data
    '''
    if not isinstance(origin_data, Series):
        origin_data = Series(origin_data)
    origin_data.index = range(0, len(origin_data))
    # preallocate a float series of length n - 1
    target = Series(0.0, index=range(len(origin_data) - 1))
    for i in range(0, len(origin_data) - 1):
        target[i] = origin_data[i + 1]
    return target
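For comparison, the same one-step-ahead target can be built with a vectorised shift; a sketch on a toy series (the trailing NaN from shift corresponds to the dropped last element mentioned in the docstring):

import pandas as pd

prices = pd.Series([1.0, 2.0, 3.0, 4.0])
target_loop = generate_target_regression(prices)   # values [2.0, 3.0, 4.0]
target_shift = prices.shift(-1).dropna()           # same values, no explicit loop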
Example #30
def simulate2Group(n=100, p=1000, n1=None, effect=None):
    if n1 is None:
        n1 = int(numpy.ceil(0.5 * n))
    if effect is None:
        effect = [1] * 10
    x = DataFrame(numpy.random.randn(n, p))
    y = Series(([0] * n1) + ([1] * (n - n1)))
    x.columns = ["g" + str(g) for g in range(p)]
    x.index = ["i" + str(i) for i in range(n)]
    y.index = x.index
    for i in range(len(effect)):
        x.ix[y == 1, i] = x.ix[y == 1, i] + effect[i]
    return {"x": x, "y": y}
Example #31
def jitter_offset(x, nbins=100, multiplier=2.0):
    bins, edges = histogram(x, bins=nbins, density=True)
    bins /= bins.max()
    try:
        assignments = digitize(x, edges, right=True) - 1
        assignments[assignments < 0] = 0
        output = sqrt(Series(bins).iloc[assignments])
    except ValueError:
        output = Series([0] * x.shape[0])
    output.index = x.index
    output *= random.choice([-multiplier, multiplier], output.shape[0])
    output *= random.choice(arange(1, 25) / 100., output.shape[0])
    return output
def difference_process_log(origin_data):
    '''
    Log-difference the series (the first element must be dropped):  log(x_n / x_(n-1)), usable for computing returns
    :param origin_data(Series) : original data
    :return diff_data(Series)  : stationarized data
    '''
    if not isinstance(origin_data, Series):
        origin_data = Series(origin_data)
    origin_data.index = range(0, len(origin_data))
    # preallocate; element 0 is left as NaN and should be discarded, as noted above
    diff = Series(float('nan'), index=range(len(origin_data)))
    for i in range(1, len(origin_data)):
        diff[i] = np.log(origin_data[i] / origin_data[i - 1])
    return diff
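The same log-return differencing is available as a one-liner via shift; a sketch on a toy series (the leading NaN is the first element the docstring says to discard):

import numpy as np
import pandas as pd

prices = pd.Series([100.0, 105.0, 103.0])
log_returns = np.log(prices / prices.shift(1)).dropna()   # log(105/100), log(103/105)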
    def fit_transform(self, X: pd.DataFrame, y: pd.Series):
        y.index = X[self.column_id].unique()
        features = extract_relevant_features(
            X,
            y,
            column_id=self.column_id,
            column_sort=self.column_sort,
            default_fc_parameters=self.default_fc_parameters,
            n_jobs=self.n_jobs)

        self.columns = features.columns.tolist()

        return features.reset_index(drop=True)
Example #34
def test_reindex_categorical():
    index = date_range("20000101", periods=3)

    # reindexing to an invalid Categorical
    s = Series(["a", "b", "c"], dtype="category")
    result = s.reindex(index)
    expected = Series(
        Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"])
    )
    expected.index = index
    tm.assert_series_equal(result, expected)

    # partial reindexing
    expected = Series(Categorical(values=["b", "c"], categories=["a", "b", "c"]))
    expected.index = [1, 2]
    result = s.reindex([1, 2])
    tm.assert_series_equal(result, expected)

    expected = Series(Categorical(values=["c", np.nan], categories=["a", "b", "c"]))
    expected.index = [2, 3]
    result = s.reindex([2, 3])
    tm.assert_series_equal(result, expected)
def difference_process(origin_data):
    '''
    Difference the series (the first element must be dropped):  t_n - t_(n-1)
    :param origin_data(Series)  : original data
    :return diff_data(Series)   : stationarized data
    '''
    if not isinstance(origin_data, Series):
        origin_data = Series(origin_data)
    origin_data.index = range(0, len(origin_data))
    # preallocate; element 0 is left as NaN and should be discarded, as noted above
    diff = Series(float('nan'), index=range(len(origin_data)))
    for i in range(1, len(origin_data)):
        diff[i] = origin_data[i] - origin_data[i - 1]
    return diff
Example #36
    def test_setitem_non_bool_into_bool(self, val, indexer_sli, unique):
        # dont cast these 3-like values to bool
        ser = Series([True, False])
        if not unique:
            ser.index = [1, 1]

        indexer_sli(ser)[1] = val
        assert type(ser.iloc[1]) == type(val)

        expected = Series([True, val], dtype=object, index=ser.index)
        if not unique and indexer_sli is not tm.iloc:
            expected = Series([val, val], dtype=object, index=[1, 1])
        tm.assert_series_equal(ser, expected)
Example #37
def simulate2Group(n=100, p=1000, n1=None, effect=None):
    if n1 is None:
        n1 = int(numpy.ceil(0.5 * n))
    if effect is None:
        effect = [1] * 10
    x = DataFrame(numpy.random.randn(n, p))
    y = Series(([0] * n1) + ([1] * (n-n1)))
    x.columns = ["g"+str(g) for g in xrange(p)]
    x.index = ["i"+str(i) for i in xrange(n)]
    y.index = x.index
    for i in xrange(len(effect)):
        x.ix[y==1, i] = x.ix[y==1, i] + effect[i]
    return {"x":x, "y":y}
    def _get_quotes(self, currency_name):
        """Get quotes for currencies vs selected currency code

        :param currency_name: currency code for calculating quotes
        :return: DataFrame with quotes vs currency code
        """

        payload = {'access_key': self.access_key}

        # Get information about currency quotes vs USD and fix some information
        # currency codes and quotes
        quotes_usd = requests.get(self.quotes_url, params=payload)
        quotes_usd = Series(quotes_usd.json()['quotes'])
        quotes_usd.index = quotes_usd.index.str.slice(3)
        quotes_usd.index = quotes_usd.index.str.replace('RUB', 'RUR')

        # Calculate quotes vs selected currency
        quotes_curr = quotes_usd[currency_name] / quotes_usd
        quotes_curr['BYR'] *= 10000
        quotes_df = DataFrame(quotes_curr, columns=['currency.rate'])

        return quotes_df
Example #39
    def test_unstack_fill_frame_timedelta(self):

        # Test unstacking with time deltas
        td = [Timedelta(days=i) for i in range(4)]
        data = Series(td)
        data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

        result = data.unstack()
        expected = DataFrame({"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]}, index=["x", "y", "z"])
        assert_frame_equal(result, expected)

        result = data.unstack(fill_value=td[1])
        expected = DataFrame({"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]}, index=["x", "y", "z"])
        assert_frame_equal(result, expected)
Example #40
    def test_unstack_fill_frame_datetime(self):

        # Test unstacking with date times
        dv = pd.date_range("2012-01-01", periods=4).values
        data = Series(dv)
        data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

        result = data.unstack()
        expected = DataFrame({"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]}, index=["x", "y", "z"])
        assert_frame_equal(result, expected)

        result = data.unstack(fill_value=dv[0])
        expected = DataFrame({"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]}, index=["x", "y", "z"])
        assert_frame_equal(result, expected)
def get_spot_curve(spot_rates_curve: pd.Series):
    tenors = []
    for i in spot_rates_curve.index:
        n, per = i.split("-")
        n = int(n)
        tenor = n / 12 if per == "month" else n
        tenors.append(tenor)

    spot_rates_curve.index = tenors

    spot_rates_curve.name = "rate"
    spot_rates_curve = spot_rates_curve.astype(float) / 100

    return spot_rates_curve.to_frame()
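A small usage sketch for get_spot_curve, assuming tenor labels of the exact form '<n>-month' or '<n>-year' (any unit other than 'month' is treated as years by the parser above) and rates quoted in percent:

import pandas as pd

raw = pd.Series({'3-month': '5.25', '6-month': '5.10', '2-year': '4.40'})
curve = get_spot_curve(raw)
# curve: one-column DataFrame 'rate', indexed by tenor in years (0.25, 0.5, 2.0),
# with the percent quotes converted to decimals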
Example #42
    def wrap_results(self):
        results = self.results

        # see if we can infer the results
        if len(results) > 0 and is_sequence(results[0]):

            return self.wrap_results_for_axis()

        # dict of scalars
        from pandas import Series
        result = Series(results)
        result.index = self.res_index

        return result
Example #43
def test_series_grouper_result_length_difference():
    # GH 40014
    obj = Series(np.random.randn(10), dtype="float64")
    obj.index = obj.index.astype("O")
    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)

    grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2)
    result, counts = grouper.get_result()

    expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)], dtype=object)
    tm.assert_equal(result, expected)

    exp_counts = np.array([3, 4], dtype=np.int64)
    tm.assert_equal(counts, exp_counts)
Example #44
def test_reindex_categorical():
    index = date_range('20000101', periods=3)

    # reindexing to an invalid Categorical
    s = Series(['a', 'b', 'c'], dtype='category')
    result = s.reindex(index)
    expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
                                  categories=['a', 'b', 'c']))
    expected.index = index
    tm.assert_series_equal(result, expected)

    # partial reindexing
    expected = Series(Categorical(values=['b', 'c'], categories=['a', 'b',
                                                                 'c']))
    expected.index = [1, 2]
    result = s.reindex([1, 2])
    tm.assert_series_equal(result, expected)

    expected = Series(Categorical(
        values=['c', np.nan], categories=['a', 'b', 'c']))
    expected.index = [2, 3]
    result = s.reindex([2, 3])
    tm.assert_series_equal(result, expected)
Example #47
def BaggingTrain(X, tree_num = 20, treedeeplim = np.inf):
    """
    Use C4.5 decision tree as base learner to construct a Bagging model
    @X: DataFrame, each row is a sample and each column is a feature, except the 
        last column, which is the sample labels. The feature colunms should be 
        continuous values
    @tree_num: the number of base learners in the Bagging model
    @treedeeplim: the maximum deep value of a decision tree allowed. It starts 
                  from 0, i.e., for a decision tree with a depth no greater 
                  than 1, its treedeeplim value is 0.
    """
    D = np.ones(X.shape[0])/X.shape[0]
    D = Series(D)
    D.index = X.index
    
    trees = []
    #All the base learners
    
    err_rates = []

    seed = 16
    
    for _ in range(tree_num):
    #Here, _ is a dummy variable
        X_, D_ = resample(X, D, random_state = seed)
        #resample(*arrays, replace, random_state) Resample arrays or sparse 
        #  matrices in a consistent way. 
        #  The default strategy implements one step of the bootstrapping 
        #  procedure
        #  *arrays: sequence of indexable data-structures. Indexable data-
        #           structures can be arrays, lists, dataframes or scipy sparse 
        #           matrices with consistent first dimension.
        #  replace: boolean, True by default. Implements resampling with 
        #           replacement. If False, this will implement (sliced) random 
        #           permutations.
        #  random_state: int or RandomState instance.
        X_.index = range(1, X_.shape[0] + 1)
        D_.index = range(1, D_.shape[0] + 1)
        D_ = D_/np.sum(D_)
        y_ = X_.ix[:,X_.shape[1] - 1]
        seed += 1
        tree = createSingleTree(X = X_, D = D_, deep = 0, deeplim = treedeeplim)
        trees.append(tree)
        
        hx = predictBase(tree = tree, X = X_)
        err_rate = np.sum(D_[list(hx != y_)])
        err_rates.append(err_rate)

    return trees, err_rates
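The bootstrap step above leans on sklearn.utils.resample; a minimal standalone sketch of just that call on toy data (tree construction is omitted because createSingleTree and predictBase are defined elsewhere in the original module):

import numpy as np
from pandas import DataFrame, Series
from sklearn.utils import resample

X = DataFrame(np.random.rand(5, 3))
D = Series(np.ones(5) / 5, index=X.index)
X_boot, D_boot = resample(X, D, random_state=16)
# X_boot and D_boot are drawn with replacement and keep their rows aligned,
# mirroring how BaggingTrain resamples X and the weight vector D together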
Example #48
def test_resample_empty_dataframe_all_ts(empty_frame, freq, resample_method):
    # GH13212
    df = empty_frame
    # count retains dimensions too
    result = getattr(df.resample(freq), resample_method)()
    if resample_method != 'size':
        expected = df.copy()
    else:
        # GH14962
        expected = Series([])

    expected.index = df.index._shallow_copy(freq=freq)
    assert_index_equal(result.index, expected.index)
    assert result.index.freq == expected.index.freq
    assert_almost_equal(result, expected, check_dtype=False)
Example #49
    def test_subtracting_two_series_with_unordered_index_and_all_nan_index(
            self, data_result, data_expected):
        # GH 38439
        a_index_result = MultiIndex.from_tuples(data_result[0])
        b_index_result = MultiIndex.from_tuples(data_result[1])
        a_series_result = Series(data_result[2], index=a_index_result)
        b_series_result = Series(data_result[3], index=b_index_result)
        result = a_series_result.align(b_series_result)

        a_index_expected = MultiIndex.from_tuples(data_expected[0])
        b_index_expected = MultiIndex.from_tuples(data_expected[1])
        a_series_expected = Series(data_expected[2], index=a_index_expected)
        b_series_expected = Series(data_expected[3], index=b_index_expected)
        a_series_expected.index = a_series_expected.index.set_levels([
            a_series_expected.index.levels[0].astype("float"),
            a_series_expected.index.levels[1].astype("float"),
        ])
        b_series_expected.index = b_series_expected.index.set_levels([
            b_series_expected.index.levels[0].astype("float"),
            b_series_expected.index.levels[1].astype("float"),
        ])

        tm.assert_series_equal(result[0], a_series_expected)
        tm.assert_series_equal(result[1], b_series_expected)
Example #50
def make_qtrly(s: pd.Series, t: str = 'first', name: str = None) -> pd.Series:
    s.index = pd.DatetimeIndex(s.index.values, dtype=dt.date)
    s.index.freq = s.index.inferred_freq
    name = name or s.name or ''
    # print(s)

    if t == 'mean':
        s = s.resample('1Q').mean().astype(np.float64)
    elif t == 'first':
        s = s.resample('1Q').first().astype(np.float64)
    elif t == 'last':
        s = s.resample('1Q').last().astype(np.float64)

    if s.isnull().any():
        print(
            f'Series {name} still has some empty data. Filling that in with the last known value.'
        )
        s.fillna(method='ffill', inplace=True)

    # Conform everything to the end of the quarter. Timestamp.replace returns a
    # new value, so collect the adjusted timestamps and reassign the index.
    idx = []
    for v in s.index:
        v = v.replace(month=math.ceil(v.month / 3) * 3)
        v = v.replace(day=calendar.monthrange(v.year, v.month)[-1])
        idx.append(v)
    s.index = pd.DatetimeIndex(idx)

    # s.index = s.index + pd.Timedelta(3, unit='M') - pd.Timedelta(1, unit='d')

    # s.index = pd.to_datetime([d + relativedelta(days=1) for d in s.index])
    # s.index.freq = s.index.inferred_freq

    # I wanted to make this function more dynamic and eliminate the if/else bullshit, with the below line (which failed)
    # s = s.resample('3MS').apply(eval(t + '(self)', {"__builtins__": None}, safe_funcs)).astype(np.float64)

    # print(s)
    return s
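A small check of the quarter-end conforming step above on a made-up timestamp (hypothetical value, same math.ceil / calendar.monthrange logic):

import calendar
import math
import pandas as pd

v = pd.Timestamp('2020-02-14')
v = v.replace(month=math.ceil(v.month / 3) * 3)              # 2020-03-14
v = v.replace(day=calendar.monthrange(v.year, v.month)[-1])  # 2020-03-31, quarter end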
Example #51
 def attach_rows(self, result):
     # assumes if len(row_labels) > len(result) it's bc it was truncated
     # at the front, for AR lags, for example
     squeezed = result.squeeze()
     k_endog = np.array(self.ynames, ndmin=1).shape[0]
     if k_endog > 1 and squeezed.shape == (k_endog, ):
         squeezed = squeezed[None, :]
     # May be zero-dim, for example in the case of forecast one step in tsa
     if squeezed.ndim < 2:
         out = Series(squeezed)
     else:
         out = DataFrame(result)
         out.columns = self.ynames
     out.index = self.row_labels[-len(result):]
     return out
Example #52
    def test_value_counts_unique(self, tz_naive_fixture):
        tz = tz_naive_fixture
        # GH 7735
        idx = date_range("2011-01-01 09:00", freq="H", periods=10)
        # create repeated values, 'n'th element is repeated by n+1 times
        idx = DatetimeIndex(np.repeat(idx.values, range(1,
                                                        len(idx) + 1)),
                            tz=tz)

        exp_idx = date_range("2011-01-01 18:00", freq="-1H", periods=10, tz=tz)
        expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64")
        expected.index = expected.index._with_freq(None)

        for obj in [idx, Series(idx)]:

            tm.assert_series_equal(obj.value_counts(), expected)

        expected = date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz)
        expected = expected._with_freq(None)
        tm.assert_index_equal(idx.unique(), expected)

        idx = DatetimeIndex(
            [
                "2013-01-01 09:00",
                "2013-01-01 09:00",
                "2013-01-01 09:00",
                "2013-01-01 08:00",
                "2013-01-01 08:00",
                pd.NaT,
            ],
            tz=tz,
        )

        exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00"],
                                tz=tz)
        expected = Series([3, 2], index=exp_idx)

        for obj in [idx, Series(idx)]:
            tm.assert_series_equal(obj.value_counts(), expected)

        exp_idx = DatetimeIndex(
            ["2013-01-01 09:00", "2013-01-01 08:00", pd.NaT], tz=tz)
        expected = Series([3, 2, 1], index=exp_idx)

        for obj in [idx, Series(idx)]:
            tm.assert_series_equal(obj.value_counts(dropna=False), expected)

        tm.assert_index_equal(idx.unique(), exp_idx)
Example #53
def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method):
    # GH13212
    df = empty_frame_dti
    # count retains dimensions too
    result = getattr(df.resample(freq), resample_method)()
    if resample_method != "size":
        expected = df.copy()
    else:
        # GH14962
        expected = Series([], dtype=object)

    expected.index = _asfreq_compat(df.index, freq)

    tm.assert_index_equal(result.index, expected.index)
    assert result.index.freq == expected.index.freq
    tm.assert_almost_equal(result, expected, check_dtype=False)
Example #54
def proc_use_data(data, mtype, site, time_period='D', n_std=4):
    """
    Function for parse_ht_xml to process the data and aggregate it to a defined resolution.
    """

    ### Select the process sequence based on the mtype and convert to period volume
    data[data < 0] = nan
    count1 = float(data.count().values[0])

    if mtype == 'Water Meter':
        ## Check to determine whether it is cumulative or period volume
        diff1 = data.diff()[1:]
        neg_index = diff1 < 0
        neg_ratio = sum(neg_index.values) / count1
        if neg_ratio > 0.1:
            outliers = abs(data - data.mean()) > (data.std() * n_std)
            data[outliers] = nan
            vol = data
        else:
            # Replace negative differences with the corresponding raw values, then mask the very large outliers
            diff1[diff1 < 0] = data[diff1 < 0]
            outliers = abs(diff1 - diff1.mean()) > (diff1.std() * n_std)
            diff1[outliers] = nan
            vol = diff1
    elif (mtype == 'Abstraction Volume') | (mtype == 'Average Flow'):
        outliers = abs(data - data.mean()) > (data.std() * n_std)
        data[outliers] = nan
        vol = data
    elif mtype == 'Flow':
        outliers = abs(data - data.mean()) > (data.std() * n_std)
        data[outliers] = nan

        # Determine the diff index
        t1 = Series(data.index).diff().dt.seconds.shift(-1)
        t1.iloc[-1] = t1.iloc[-2]
        t1.index = data.index
        # Convert to volume
        vol = data.multiply(t1, axis=0) * 0.001

    ## Estimate the NAs
    vol2 = vol.fillna(method='ffill')

    ## Resample the volumes
    vol_res = vol2.resample(time_period).sum()
    vol_res.loc[:, 'site'] = site

    return (vol_res)
Example #55
def test_value_counts_null(null_obj, index_or_series_obj):
    orig = index_or_series_obj
    obj = orig.copy()

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(orig, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj.values
    if needs_i8_conversion(obj.dtype):
        values[0:2] = iNaT
    else:
        values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    counter = collections.Counter(obj.dropna())
    expected = Series(dict(counter.most_common()), dtype=np.int64)
    expected.index = expected.index.astype(obj.dtype)

    result = obj.value_counts()
    if obj.duplicated().any():
        # TODO:
        #  Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()
    tm.assert_series_equal(result, expected)

    # can't use expected[null_obj] = 3 as
    # IntervalIndex doesn't allow assignment
    new_entry = Series({np.nan: 3}, dtype=np.int64)
    expected = expected.append(new_entry)

    result = obj.value_counts(dropna=False)
    if obj.duplicated().any():
        # TODO:
        #  Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()
    tm.assert_series_equal(result, expected)
Example #56
    def read_in(self, mat):
        """Read the input part.

        """

        # Check if more than one time series model is present
        if not isinstance(mat['IN'], np.ndarray):
            mat['IN'] = [mat['IN']]

        # Read all the time series models
        for i, IN in enumerate(mat['IN']):
            data = dict()

            for name in IN._fieldnames:
                if name != 'values':
                    data[name] = getattr(IN, name)
                else:
                    tindex = [
                        matlab2datetime(tval) for tval in IN.values[:, 0]
                    ]
                    series = Series(IN.values[:, 1], index=tindex)

                    # round on seconds, to get rid of conversion milliseconds
                    series.index = series.index.round('s')

                    if hasattr(IN, 'type'):
                        IN.Type = IN.type

                    if IN.Type in ['EVAP', 'PREC', 'WELL']:
                        # in menyanthes, the flux is summed over the
                        # time-step, so divide by the timestep now
                        step = series.index.to_series().diff() / offsets.Day(1)
                        step = step.values.astype(np.float)
                        series = series / step
                        if series.values[0] != 0:
                            series = series[1:]

                    data['values'] = series

            # add to self.IN
            if not hasattr(IN, 'Name') and not hasattr(IN, 'name'):
                IN.Name = 'IN' + str(i)
            if hasattr(IN, 'name'):
                IN.Name = IN.name

            self.IN[IN.Name] = data
Example #57
    def test_unstack_fill(self):

        # GH #9746: fill_value keyword argument for Series
        # and DataFrame unstack

        # From a series
        data = Series([1, 2, 4, 5], dtype=np.int16)
        data.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

        result = data.unstack(fill_value=-1)
        expected = DataFrame({"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16)
        assert_frame_equal(result, expected)

        # From a series with incorrect data type for fill_value
        result = data.unstack(fill_value=0.5)
        expected = DataFrame({"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=np.float)
        assert_frame_equal(result, expected)
Example #58
    def test_resample_empty_dataframe(self, freq, resample_method):
        # GH13212
        index = self.create_series().index[:0]
        f = DataFrame(index=index)

        # count retains dimensions too
        result = getattr(f.resample(freq), resample_method)()
        if resample_method != 'size':
            expected = f.copy()
        else:
            # GH14962
            expected = Series([])

        expected.index = f.index._shallow_copy(freq=freq)
        assert_index_equal(result.index, expected.index)
        assert result.index.freq == expected.index.freq
        assert_almost_equal(result, expected, check_dtype=False)
    def _dkl_n_group(self, group_df, name, cols, p_n):
        """
        Calculate DKL(n|y) for a single group

        Parameters
        ----------
        group_df : DataFrame
        name : String
            Name of the group (from pandas groupby)

        Returns
        -------
        DKL_n : Series
        """
        DKL_n = Series(self._entropy(group_df, p_n), index=cols, name=name)
        DKL_n.index = ['DKL(n|y)_' + str(item) for item in DKL_n.index]
        return DKL_n
Example #60
    def test_unstack_fill(self):

        # GH #9746: fill_value keyword argument for Series
        # and DataFrame unstack

        # From a series
        data = Series([1, 2, 4, 5], dtype=np.int16)
        data.index = MultiIndex.from_tuples(
            [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
        )

        result = data.unstack(fill_value=-1)
        expected = DataFrame(
            {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16
        )
        tm.assert_frame_equal(result, expected)

        # From a series with incorrect data type for fill_value
        result = data.unstack(fill_value=0.5)
        expected = DataFrame(
            {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=np.float
        )
        tm.assert_frame_equal(result, expected)

        # GH #13971: fill_value when unstacking multiple levels:
        df = DataFrame(
            {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]}
        ).set_index(["x", "y", "z"])
        unstacked = df.unstack(["x", "y"], fill_value=0)
        key = ("w", "b", "j")
        expected = unstacked[key]
        result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
        tm.assert_series_equal(result, expected)

        stacked = unstacked.stack(["x", "y"])
        stacked.index = stacked.index.reorder_levels(df.index.names)
        # Workaround for GH #17886 (unnecessarily casts to float):
        stacked = stacked.astype(np.int64)
        result = stacked.loc[df.index]
        tm.assert_frame_equal(result, df)

        # From a series
        s = df["w"]
        result = s.unstack(["x", "y"], fill_value=0)
        expected = unstacked["w"]
        tm.assert_frame_equal(result, expected)