Beispiel #1
0
def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name):
    """
    Shape the final return value of the cut method.

    When the originally passed data was a Series, the result is wrapped in a
    Series carrying ``series_index`` and ``name``.  The bins are returned
    alongside the result only when ``retbins`` is truthy.
    """
    result = Series(fac, index=series_index, name=name) if x_is_series else fac
    return (result, bins) if retbins else result
Beispiel #2
0
def _take_new_index(obj, indexer, new_index, axis=0):
    """
    Rebuild ``obj`` from the positions in ``indexer``, labelled by
    ``new_index``.

    A Series is rebuilt from its taken values and keeps its name.  A
    DataFrame is rebuilt through its internal data manager; ``axis=1`` is
    rejected with NotImplementedError.  Any other type of ``obj`` raises
    ValueError.
    """
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        taken = algos.take_1d(obj.values, indexer)
        return Series(taken, index=new_index, name=obj.name)

    if not isinstance(obj, DataFrame):
        raise ValueError("'obj' should be either a Series or a DataFrame")

    if axis == 1:
        raise NotImplementedError("axis 1 is not supported")

    # NOTE(review): the manager is reindexed along its axis 1 even though the
    # public ``axis`` is 0 here -- presumably the manager stores axes in the
    # opposite order to the DataFrame; confirm against the internals.
    reindexed = obj._data.reindex_indexer(
        new_axis=new_index, indexer=indexer, axis=1)
    return DataFrame(reindexed)
Beispiel #3
0
    def test_operators_corner(self):
        # Corner cases for Series arithmetic: empty operands, timedelta
        # values, and mixed float/int operands of different lengths.
        series = self.ts

        empty = Series([], index=Index([]))

        # The assertion below expects an all-NaN result when adding an
        # empty Series.
        result = series + empty
        self.assertTrue(np.isnan(result).all())

        result = empty + Series([], index=Index([]))
        self.assertEqual(len(result), 0)

        deltas = Series([timedelta(1)] * 5, index=np.arange(5))
        sub_deltas = deltas[::2]

        # Smoke checks only: the results are not inspected, the operations
        # just have to complete without raising.
        deltas5 = deltas * 5
        deltas = deltas + sub_deltas

        # float + int
        int_ts = self.ts.astype(int)[:-5]
        added = self.ts + int_ts
        expected = self.ts.values[:-5] + int_ts.values
        self.assertTrue(np.array_equal(added[:-5], expected))
Beispiel #4
0
    def test_iloc_getitem_array(self):
        # array like: the Index of a Series is used as the iloc indexer
        indexer = Series(index=range(1, 4)).index

        # NOTE(review): the keys 0/1/2 presumably select the axis the 'ix'
        # comparison runs on -- confirm against check_result.
        ix_keys = {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}

        self.check_result('array like', 'iloc', indexer, 'ix', ix_keys,
                          typs=['ints'])
Beispiel #5
0
def _take_new_index(obj, indexer, new_index, axis=0):
    """
    Rebuild ``obj`` from the positions in ``indexer``, labelled by
    ``new_index``.

    Parameters
    ----------
    obj : Series or DataFrame
    indexer : positions to take from ``obj``
    new_index : labels for the taken positions
    axis : int, default 0
        Only inspected for DataFrames, where 1 is not supported.

    Returns
    -------
    Series or DataFrame, matching the type of ``obj``

    Raises
    ------
    NotImplementedError
        If ``obj`` is a DataFrame and ``axis`` is 1, or if ``obj`` is neither
        a Series nor a DataFrame.
    """
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        new_values = com.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError("axis 1 is not supported")
        return DataFrame(obj._data.take(indexer, new_index=new_index, axis=1))
    else:
        # Exception type kept as NotImplementedError for existing callers;
        # only the message is new.
        raise NotImplementedError(
            "'obj' should be either a Series or a DataFrame")
Beispiel #6
0
def percentileRank(frame, column=None, kind='mean'):
    """
    Return the percentile rank of a score within each cross-section
    (one row of ``frame`` per point in time)

    Parameters
    ----------
    frame: DataFrame
    column: string or Series, optional
       Column name or specific Series to compute percentiles for.
       If not provided, percentiles are computed for all values at each
       point in time. Note that this can take a LONG time.
    kind: {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score:

        - "rank": Average percentage ranking of score.  In case of
                  multiple matches, average the percentage rankings of
                  all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
                  distribution function.  A percentileofscore of 80%
                  means that 80% of values are less than or equal
                  to the provided score.
        - "strict": Similar to "weak", except that only values that are
                    strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often used in
                  testing.  See

                  http://en.wikipedia.org/wiki/Percentile_rank

    Returns
    -------
    Series when ``column`` is given, otherwise a DataFrame shaped like
    ``frame``
    """
    def rank_of(xs, score):
        # Missing values are dropped from the cross-section before ranking.
        return percentileofscore(remove_na(xs), score, kind=kind)

    # Transpose once and reuse it: iterating the transposed frame yields
    # (row label, cross-section) pairs.
    framet = frame.T

    results = {}
    if column is not None:
        if isinstance(column, Series):
            # Score comes from the supplied Series; NaN where it has no
            # entry for this row label.
            for date, xs in compat.iteritems(framet):
                results[date] = rank_of(xs, column.get(date, NaN))
        else:
            # Score is the named column's own value in each cross-section.
            for date, xs in compat.iteritems(framet):
                results[date] = rank_of(xs, xs[column])
        results = Series(results)
    else:
        # Rank every value against its own cross-section.
        for col in frame.columns:
            for date, xs in compat.iteritems(framet):
                results.setdefault(date, {})[col] = rank_of(xs, xs[col])
        results = DataFrame(results).T
    return results
Beispiel #7
0
    def test_constructor_dict(self):
        # Construct frames from dicts and check column selection, rejected
        # inputs and index handling.
        frame = self.klass({'col1': self.ts1, 'col2': self.ts2})

        common.assert_dict_equal(self.ts1, frame['col1'], compare_keys=False)
        common.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False)

        # With an explicit column list, 'col1' is expected to be dropped and
        # the unknown 'col3' to come back all-NaN.
        frame = self.klass({
            'col1': self.ts1,
            'col2': self.ts2
        },
                           columns=['col2', 'col3', 'col4'])

        self.assertEqual(len(frame), len(self.ts2))
        self.assertTrue('col1' not in frame)
        self.assertTrue(np.isnan(frame['col3']).all())

        # Corner cases
        self.assertEqual(len(self.klass({})), 0)
        # NOTE(review): assertRaises calls this lambda with no arguments, but
        # it declares a required parameter ``x``.  The resulting TypeError
        # satisfies the assertion, so the constructor is never actually
        # invoked with the list.  Left unchanged because the intended
        # behaviour of klass([ts1, ts2]) cannot be confirmed from here.
        self.assertRaises(Exception,
                          lambda x: self.klass([self.ts1, self.ts2]))

        # passing a dict together with an array is not allowed
        self.assertRaises(Exception, self.klass, {
            'A': {
                'a': 'a',
                'b': 'b'
            },
            'B': ['a', 'b']
        })

        # can I rely on the order?
        self.assertRaises(Exception, self.klass, {
            'A': ['a', 'b'],
            'B': {
                'a': 'a',
                'b': 'b'
            }
        })
        self.assertRaises(Exception, self.klass, {
            'A': ['a', 'b'],
            'B': Series(['a', 'b'], index=['a', 'b'])
        })

        # Length-one dict micro-optimization
        frame = self.klass({'A': {'1': 1, '2': 2}})
        self.assertTrue(np.array_equal(frame.index, ['1', '2']))

        # empty dict plus index: the very same Index object must be reused
        idx = Index([0, 1, 2])
        frame = self.klass({}, index=idx)
        self.assertTrue(frame.index is idx)
Beispiel #8
0
    def predict(self, beta=None, x=None, fill_value=None,
                fill_method=None, axis=0):
        """
        Compute predicted values as the dot product of ``x`` and ``beta``.

        Parameters
        ----------
        beta : Series, optional
            Coefficients; reindexed to the fitted beta's index.  The fitted
            beta is used when omitted.
        x : Series or DataFrame, optional
            Regressors.  The model's own regressors are used when omitted.
        fill_value : scalar or dict, default None
        fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        axis : {0, 1}, default 0
            See DataFrame.fillna for more details

        Notes
        -----
        1. If both fill_value and fill_method are None then NaNs are dropped
        (this is the default behavior)
        2. An 'intercept' column of ones is added to a supplied ``x`` if
           the model was fitted using an intercept
        3. With neither ``beta`` nor ``x`` given, the stored ``y_predict``
           is returned as is.

        Returns
        -------
        Series of predicted values, reindexed to the index of the input x

        Raises
        ------
        ValueError
            If ``beta`` lacks a value for any fitted variable.
        """
        use_fitted_beta = beta is None
        if use_fitted_beta and x is None:
            return self.y_predict

        if use_fitted_beta:
            coefs = self.beta
        else:
            coefs = beta.reindex(self.beta.index)
            if isnull(coefs).any():
                raise ValueError('Must supply betas for same variables')

        original = self._x if x is None else x
        if x is None:
            design = original
        else:
            nothing_to_fill = fill_value is None and fill_method is None
            if nothing_to_fill:
                design = x.dropna(how='any')
            else:
                design = x.fillna(value=fill_value, method=fill_method,
                                  axis=axis)
            if isinstance(design, Series):
                design = DataFrame({'x': design})
            if self._intercept:
                design['intercept'] = 1.

            # Match the column order used when the model was fitted.
            design = design.reindex(columns=self._x.columns)

        fitted = Series(np.dot(design.values, coefs.values), design.index)
        # Rows dropped above reappear as NaN.
        return fitted.reindex(original.index)
Beispiel #9
0
    def test_value_counts(self):
        # Seed first so the four random draws, and therefore the bin labels
        # hard-coded below, are reproducible.
        np.random.seed(1234)
        from pandas.tools.tile import cut

        binned = cut(np.random.randn(4), 4)
        tm.assertIsInstance(binned, Categorical)

        counts = algos.value_counts(binned)

        bin_labels = ['(-1.194, -0.535]', '(-0.535, 0.121]', '(0.121, 0.777]',
                      '(0.777, 1.433]']
        index = CategoricalIndex(bin_labels, bin_labels, ordered=True)
        expected = Series([1, 1, 1, 1], index=index)

        # Compare after sorting so the order of the counts does not matter.
        tm.assert_series_equal(counts.sort_index(), expected.sort_index())
Beispiel #10
0
    def test_firstValid(self):
        # First/last valid timestamps after blanking both ends of a series.
        ts = self.ts.copy()
        ts[:5] = np.nan

        # With the first five values missing, position 5 is the first valid.
        index = ts._firstTimeWithValue()
        self.assertEqual(index, ts.index[5])

        # With the last five values missing, position -6 is the last valid.
        ts[-5:] = np.nan
        index = ts._lastTimeWithValue()
        self.assertEqual(index, ts.index[-6])

        # An empty series has no valid timestamp at all.
        ser = Series([], index=[])
        self.assertTrue(ser._lastTimeWithValue() is None)
        self.assertTrue(ser._firstTimeWithValue() is None)
Beispiel #11
0
    def test_repr(self):
        # Smoke test: str() must not raise for any of these series.
        str(self.ts)
        str(self.series)
        str(self.series.astype(int))
        str(self.objSeries)

        str(Series(common.randn(1000), index=np.arange(1000)))

        # empty
        str(self.empty)

        # with NaNs
        self.series[5:7] = np.nan
        str(self.series)
Beispiel #12
0
    def test_loc_getitem_label_list(self):
        # Each case is forwarded to check_result in the order listed.

        # list of labels: (labels, container type, axis)
        label_cases = [
            ([0, 2, 4], 'ints', 0),
            ([3, 6, 9], 'ints', 1),
            ([4, 8, 12], 'ints', 2),
            (['a', 'b', 'd'], 'labels', 0),
            (['A', 'B', 'C'], 'labels', 1),
            (['Z', 'Y', 'W'], 'labels', 2),
            ([2, 8, 'null'], 'mixed', 0),
            ([Timestamp('20130102'), Timestamp('20130103')], 'ts', 0),
        ]
        for labels, typ, axis in label_cases:
            # Separate list objects for the two label arguments.
            self.check_result('list lbl', 'loc', list(labels), 'ix',
                              list(labels), typs=[typ], axes=axis)

        # fails
        self.check_result('list lbl', 'loc', [0, 1, 2], 'indexer', [0, 1, 2],
                          typs=['empty'], fails=KeyError)
        failing_cases = [
            ([0, 2, 3], [0, 2, 3], 0),
            ([3, 6, 7], [3, 6, 9], 1),
            ([4, 8, 10], [4, 8, 12], 2),
        ]
        for loc_labels, ix_labels, axis in failing_cases:
            self.check_result('list lbl', 'loc', loc_labels, 'ix', ix_labels,
                              typs=['ints'], axes=axis, fails=KeyError)

        # array like
        array_cases = [
            ([0, 2, 4], 0),
            ([3, 6, 9], 1),
            ([4, 8, 12], 2),
        ]
        for labels, axis in array_cases:
            self.check_result('array like', 'loc',
                              Series(index=list(labels)).index, 'ix', labels,
                              typs=['ints'], axes=axis)
Beispiel #13
0
    def test_groupby_transform(self):
        # Floor division keeps the values integral (0,0,0,1,1,1,2,2,2) on
        # both Python 2 and 3; with true division the expectation of 12
        # below would not hold.
        data = Series(np.arange(9) // 3, index=np.arange(9))

        # Shuffle the rows so the result cannot depend on input order.
        index = np.arange(9)
        np.random.shuffle(index)
        data = data.reindex(index)

        grouped = data.groupby(lambda x: x // 3)

        transformed = grouped.transform(lambda x: x * x.sum())
        # Label 7 is in the group of three 2s: 2 * (2 + 2 + 2) == 12.
        self.assertEqual(transformed[7], 12)

        # corner cases: a function returning a scalar is expected to raise
        self.assertRaises(Exception, grouped.transform, lambda x: x.mean())
Beispiel #14
0
    def test_setitem_ambig(self):
        # difficulties with mixed-type data
        from decimal import Decimal

        # created as float type
        dm = DataMatrix(index=range(3), columns=range(3))

        coercable_series = Series([Decimal(1) for _ in range(3)],
                                  index=range(3))
        uncoercable_series = Series(['foo', 'bzr', 'baz'], index=range(3))

        # Float data: the assertions expect no object storage to appear.
        dm[0] = np.ones(3)
        self.assertEqual(len(dm.cols()), 3)
        self.assertTrue(dm.objects is None)

        # Decimals: the assertions again expect no object storage.
        dm[1] = coercable_series
        self.assertEqual(len(dm.cols()), 3)
        self.assertTrue(dm.objects is None)

        # Strings: column 2 is expected under ``objects`` and no longer
        # among the regular columns.
        dm[2] = uncoercable_series
        self.assertEqual(len(dm.cols()), 3)
        self.assertTrue(dm.objects is not None)
        self.assertTrue(2 in dm.objects)
        self.assertTrue(2 not in dm.columns)
Beispiel #15
0
    def test_setitem(self):
        # Assignment by label, by list of positions, by scalar position and
        # by boolean mask.
        self.ts[self.ts.index[5]] = np.nan
        self.ts[[1, 2, 17]] = np.nan
        self.ts[6] = np.nan
        self.assertTrue(np.isnan(self.ts[6]))
        self.assertTrue(np.isnan(self.ts[2]))
        self.ts[np.isnan(self.ts)] = 5
        self.assertTrue(not np.isnan(self.ts[2]))

        # caught this bug when writing tests
        series = Series(common.makeIntIndex(20).astype(float),
                        index=common.makeIntIndex(20))

        series[::2] = 0
        self.assertTrue((series[::2] == 0).all())
Beispiel #16
0
    def setUp(self):
        # Build one Series/DataFrame/Panel fixture per index flavour, then
        # group them into one dict per object kind.
        import warnings
        warnings.filterwarnings(action='ignore', category=FutureWarning)

        # integer labels
        self.series_ints = Series(np.random.rand(4), index=range(0, 8, 2))
        self.frame_ints = DataFrame(np.random.randn(4, 4),
                                    index=range(0, 8, 2),
                                    columns=range(0, 12, 3))
        self.panel_ints = Panel(np.random.rand(4, 4, 4),
                                items=range(0, 8, 2),
                                major_axis=range(0, 12, 3),
                                minor_axis=range(0, 16, 4))

        # string labels
        self.series_labels = Series(np.random.randn(4), index=list('abcd'))
        self.frame_labels = DataFrame(np.random.randn(4, 4),
                                      index=list('abcd'),
                                      columns=list('ABCD'))
        self.panel_labels = Panel(np.random.randn(4, 4, 4),
                                  items=list('abcd'),
                                  major_axis=list('ABCD'),
                                  minor_axis=list('ZYXW'))

        # mixed integer/string labels
        self.series_mixed = Series(np.random.randn(4),
                                   index=[2, 4, 'null', 8])
        self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                     index=[2, 4, 'null', 8])
        self.panel_mixed = Panel(np.random.randn(4, 4, 4),
                                 items=[2, 4, 'null', 8])

        # timestamp labels
        self.series_ts = Series(np.random.randn(4),
                                index=date_range('20130101', periods=4))
        self.frame_ts = DataFrame(np.random.randn(4, 4),
                                  index=date_range('20130101', periods=4))
        self.panel_ts = Panel(np.random.randn(4, 4, 4),
                              items=date_range('20130101', periods=4))

        # float-labelled fixtures are currently disabled:
        #self.series_floats = Series(np.random.randn(4), index=[1.00, 2.00, 3.00, 4.00])
        #self.frame_floats  = DataFrame(np.random.randn(4, 4), columns=[1.00, 2.00, 3.00, 4.00])
        #self.panel_floats  = Panel(np.random.rand(4,4,4), items = [1.00,2.00,3.00,4.00])

        # empty containers
        self.frame_empty = DataFrame({})
        self.series_empty = Series({})
        self.panel_empty = Panel({})

        # form agglomerates: self.<obj> maps each typ to the '<obj>_<typ>'
        # fixture, or None where no such fixture was created above
        for obj_name in self._objs:
            by_typ = dict(
                (typ, getattr(self, '%s_%s' % (obj_name, typ), None))
                for typ in self._typs)
            setattr(self, obj_name, by_typ)
Beispiel #17
0
    def test_reindex_bool(self):

        # A series other than float, int, string, or object
        ts = self.ts[::2]
        bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index)

        # this should work fine
        reindexed_bool = bool_ts.reindex(self.ts.index)

        # if NaNs introduced, the result is expected to be object dtype
        self.assertTrue(reindexed_bool.dtype == np.object_)

        # NO NaNs introduced: the bool dtype is expected to survive
        reindexed_bool = bool_ts.reindex(bool_ts.index[::2])
        self.assertTrue(reindexed_bool.dtype == np.bool_)
Beispiel #18
0
def _filter_data(lhs, rhs, weights=None):
    """
    Cleans the input for single OLS.

    Parameters
    ----------
    lhs : Series
        Dependent variable in the regression.  Anything else is wrapped in
        a Series on ``rhs.index`` and must have the same length as ``rhs``.
    rhs : dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.
    weights : array-like, optional
        1d array of weights.  If None, equivalent to an unweighted OLS.

    Returns
    -------
    tuple
        ``(filt_lhs, filt_rhs, filt_weights, pre_filt_rhs, index, valid)``:
        lhs, rhs and weights restricted to rows where every column is
        present (weights is None when none were given), rhs with its own
        incomplete rows dropped, the index of the joined data, and the
        boolean mask of complete rows.

    Raises
    ------
    AssertionError
        If a non-Series ``lhs`` differs in length from ``rhs``.
    """
    if not isinstance(lhs, Series):
        if len(lhs) != len(rhs):
            raise AssertionError("length of lhs must equal length of rhs")
        lhs = Series(lhs, index=rhs.index)

    rhs = _combine_rhs(rhs)
    lhs = DataFrame({'__y__': lhs}, dtype=float)
    pre_filt_rhs = rhs.dropna(how='any')

    has_weights = weights is not None

    combined = rhs.join(lhs, how='outer')
    if has_weights:
        combined['__weights__'] = weights

    # A row is valid when none of its columns is missing.
    valid = (combined.count(1) == len(combined.columns)).values
    index = combined.index
    combined = combined[valid]

    filt_weights = combined.pop('__weights__') if has_weights else None
    filt_lhs = combined.pop('__y__')
    # What is left of ``combined`` after the pops is the filtered rhs.

    if hasattr(filt_weights, 'to_dense'):
        filt_weights = filt_weights.to_dense()

    return (filt_lhs.to_dense(), combined.to_dense(), filt_weights,
            pre_filt_rhs.to_dense(), index, valid)
Beispiel #19
0
    def test_mixed(self):
        # factorize on mixed string/float data, first in order of
        # appearance, then sorted

        # doc example reshaping.rst
        mixed = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

        codes, uniq = algos.factorize(mixed)
        # the expected labels give NaN the code -1
        self.assert_numpy_array_equal(
            codes, np.array([0, 0, -1, 1, 2, 3], dtype=np.int_))
        tm.assert_index_equal(uniq, pd.Index(['A', 'B', 3.14, np.inf]))

        codes, uniq = algos.factorize(mixed, sort=True)
        self.assert_numpy_array_equal(
            codes, np.array([2, 2, -1, 3, 0, 1], dtype=np.int_))
        tm.assert_index_equal(uniq, pd.Index([3.14, np.inf, 'A', 'B']))
Beispiel #20
0
    def setUp(self):
        # Random fixtures sharing one date range: a raw array and a Series
        # with NaNs at the class's ``_nan_locs``, plus a DataFrame and a
        # DataMatrix of shape (N, K).
        arr = randn(N)
        arr[self._nan_locs] = np.nan

        self.arr = arr
        self.rng = DateRange(datetime(2009, 1, 1), periods=N)

        # The Series gets its own copy of the array.
        self.series = Series(arr.copy(), index=self.rng)

        self.frame = DataFrame(randn(N, K),
                               index=self.rng,
                               columns=np.arange(K))

        self.matrix = DataMatrix(randn(N, K),
                                 index=self.rng,
                                 columns=np.arange(K))
Beispiel #21
0
    def test_mixed(self):
        # factorize on mixed string/float data, first in order of
        # appearance, then sorted

        # doc example reshaping.rst
        mixed = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

        codes, uniq = algos.factorize(mixed)
        expected_codes = np.array([0, 0, -1, 1, 2, 3], dtype=np.int64)
        expected_uniq = np.array(['A', 'B', 3.14, np.inf], dtype=object)
        self.assert_numpy_array_equal(codes, expected_codes)
        self.assert_numpy_array_equal(uniq, expected_uniq)

        codes, uniq = algos.factorize(mixed, sort=True)
        expected_codes = np.array([2, 2, -1, 3, 0, 1], dtype=np.int64)
        expected_uniq = np.array([3.14, np.inf, 'A', 'B'], dtype=object)
        self.assert_numpy_array_equal(codes, expected_codes)
        self.assert_numpy_array_equal(uniq, expected_uniq)
Beispiel #22
0
    def checkDataSet(self, dataset, start=None, end=None, skip_moving=False):
        # Run the OLS check, and unless skip_moving is set the moving-OLS
        # checks, on the [start:end] slice of a dataset.
        exog = dataset.exog[start:end]
        endog = dataset.endog[start:end]

        n_rows, n_cols = exog.shape[0], exog.shape[1]
        x = DataMatrix(exog, index=np.arange(n_rows),
                       columns=np.arange(n_cols))
        y = Series(endog, index=np.arange(len(endog)))

        self.checkOLS(exog, endog, x, y)

        if skip_moving:
            return

        # (window type, extra keyword arguments), run in this order
        moving_cases = [
            ('rolling', {}),
            ('rolling', {'nw_lags': 0}),
            ('expanding', {'nw_lags': 0}),
            ('rolling', {'nw_lags': 1}),
            ('expanding', {'nw_lags': 1}),
            ('expanding', {'nw_lags': 1, 'nw_overlap': True}),
        ]
        for window_type, options in moving_cases:
            self.checkMovingOLS(window_type, x, y, **options)
Beispiel #23
0
    def test_groupby_transform(self):
        # Floor division keeps the values integral (0,0,0,1,1,1,2,2,2) on
        # both Python 2 and 3; with true division the expectation of 12
        # below would not hold.
        data = Series(np.arange(9) // 3, index=np.arange(9))

        # Shuffle the rows so the result cannot depend on input order.
        index = np.arange(9)
        np.random.shuffle(index)
        data = data.reindex(index)

        grouped = data.groupby(lambda x: x // 3)

        transformed = grouped.transform(lambda x: x * x.sum())
        # Label 7 is in the group of three 2s: 2 * (2 + 2 + 2) == 12.
        self.assertEqual(transformed[7], 12)

        # Transforming with the mean must give every member of a group
        # that group's mean.
        transformed = grouped.transform(np.mean)
        for name, group in grouped:
            mean = group.mean()
            for idx in group.index:
                self.assertEqual(transformed[idx], mean)
Beispiel #24
0
 def __init__(self, data_set, indicator=None):
     """
     Args:
         data_set: dict(symbol=DataFrame) or DataFrame. The data set to
             analyse: either a dict keyed by symbol name whose values are
             DataFrames, or a single DataFrame. A copy of it is stored.
         indicator: Series, the indicator series. Defaults to an empty
             Series; it can also be set directly through the indicator
             attribute, or computed by indicator code written in the class.
     Notes:
         The data set should have the same length as indicator, otherwise
         an error is reported.
     """
     # Label-identification function object; currently mainly
     # _group_identify and _rolling_identify.
     self.__identify = None
     # Indicator object currently being processed.
     self.__indicator = None
     # Data set object currently being processed.
     self.__data = None
     # Group object currently being processed.
     self.__group = None
     # Symbol currently being processed.
     self.__symbol = None
     # Profit/loss series of the current symbol.
     self.__profit = None

     # Overall data set.
     self._data_set = data_set.copy()

     if indicator is None:
         indicator = Series()
     self._indicator = indicator

     # Number of indicator rows currently being processed.
     self._ind_len = 0
     self._group = None
Beispiel #25
0
    def test_order(self):
        # Sorting with missing values placed last (default) or first.

        ts = self.ts.copy()
        ts[:5] = np.nan
        vals = ts.values

        # Default: the five NaNs are expected last, after the sorted values.
        result = ts.order()
        self.assertTrue(np.isnan(result[-5:]).all())
        self.assertTrue(np.array_equal(result[:-5], np.sort(vals[5:])))

        # missingAtEnd=False: the NaNs are expected first instead.
        result = ts.order(missingAtEnd=False)
        self.assertTrue(np.isnan(result[:5]).all())
        self.assertTrue(np.array_equal(result[5:], np.sort(vals[5:])))

        # something object-type
        ser = Series(['A', 'B'], [1, 2])
        # no failure
        ser.order()
Beispiel #26
0
    def test_combineSeries(self):
        # Arithmetic between a frame and a Series.

        # Series
        series = self.frame.getXS(self.frame.index[0])

        added = self.frame + series

        for key, s in added.iteritems():
            assert_series_equal(s, self.frame[key] + series[key])

        # A Series with an extra key 'E': the assertions expect an all-NaN
        # 'E' column in the result.
        larger_series = series.toDict()
        larger_series['E'] = 1
        larger_series = Series(larger_series)
        larger_added = self.frame + larger_series

        for key, s in self.frame.iteritems():
            assert_series_equal(larger_added[key], s + series[key])
        self.assertTrue('E' in larger_added)
        self.assertTrue(np.isnan(larger_added['E']).all())

        # TimeSeries
        ts = self.tsframe['A']
        added = self.tsframe + ts

        for key, col in self.tsframe.iteritems():
            assert_series_equal(added[key], col + ts)

        # A shorter frame: the result is expected on the full index.
        smaller_frame = self.tsframe[:-5]
        smaller_added = smaller_frame + ts

        self.assertTrue(smaller_added.index.equals(self.tsframe.index))

        # length 0 (smoke check: only has to complete without raising)
        result = self.tsframe + ts[:0]

        # Frame is length 0
        result = self.tsframe[:0] + ts
        self.assertEqual(len(result), 0)

        # empty but with non-empty index
        frame = self.tsframe[:1].reindex(columns=[])
        result = frame * ts
        self.assertEqual(len(result), len(ts))
Beispiel #27
0
def _take_new_index(obj, indexer, new_index, axis=0):
    """
    Rebuild ``obj`` from the positions in ``indexer``, labelled by
    ``new_index``.

    A Series is rebuilt from its taken values and keeps its name.  A
    DataFrame is rebuilt block by block into a new BlockManager.
    NotImplementedError is raised for a DataFrame with ``axis=1`` and for
    any other type of ``obj``.
    """
    from pandas.core.api import Series, DataFrame
    from pandas.core.internals import BlockManager

    if isinstance(obj, Series):
        taken = com.take_1d(obj.values, indexer)
        return Series(taken, index=new_index, name=obj.name)

    if not isinstance(obj, DataFrame) or axis == 1:
        raise NotImplementedError

    manager = obj._data

    # Take from every block along axis 1 and replace the matching manager
    # axis with the new labels.
    taken_blocks = [blk.take(indexer, axis=1) for blk in manager.blocks]
    axes = list(manager.axes)
    axes[1] = new_index
    return DataFrame(BlockManager(taken_blocks, axes))
Beispiel #28
0
    def test_asfreq(self):
        # Round trip: converting month-end data to weekdays and back must
        # reproduce the original values.
        ts = Series([0., 1., 2.],
                    index=[
                        datetime(2009, 10, 30),
                        datetime(2009, 11, 30),
                        datetime(2009, 12, 31)
                    ])

        # frequencies given by name
        daily_ts = ts.asfreq('WEEKDAY')
        monthly_ts = daily_ts.asfreq('EOM')
        self.assertTrue(np.array_equal(monthly_ts, ts))

        # same, padding the gaps of the daily series
        daily_ts = ts.asfreq('WEEKDAY', fillMethod='pad')
        monthly_ts = daily_ts.asfreq('EOM')
        self.assertTrue(np.array_equal(monthly_ts, ts))

        # frequencies given as offset objects
        daily_ts = ts.asfreq(datetools.bday)
        monthly_ts = daily_ts.asfreq(datetools.bmonthEnd)
        self.assertTrue(np.array_equal(monthly_ts, ts))
Beispiel #29
0
def _filter_data(lhs, rhs, weights=None):
    """
    Cleans the input for single OLS.

    Parameters
    ----------
    lhs: Series
        Dependent variable in the regression.  Anything else is wrapped in
        a Series on ``rhs.index`` and must have the same length as ``rhs``.
    rhs: dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.
    weights: array-like, optional
        Weights, stored as an extra column next to the data so they are
        filtered with it.  None means no weights.

    Returns
    -------
    tuple
        ``(filt_lhs, filt_rhs, filt_weights, pre_filt_rhs, index, valid)``:
        lhs, rhs and weights restricted to rows where every column is
        present (weights is None when none were given), rhs with its own
        incomplete rows dropped, the index of the joined data, and the
        boolean mask of complete rows.

    Raises
    ------
    AssertionError
        If a non-Series ``lhs`` differs in length from ``rhs``.
    """
    if not isinstance(lhs, Series):
        # Raised explicitly so the check survives ``python -O``, which
        # strips assert statements.
        if len(lhs) != len(rhs):
            raise AssertionError("length of lhs must equal length of rhs")
        lhs = Series(lhs, index=rhs.index)

    rhs = _combine_rhs(rhs)
    lhs = DataFrame({'__y__' : lhs}, dtype=float)
    pre_filt_rhs = rhs.dropna(how='any')

    combined = rhs.join(lhs, how='outer')
    if weights is not None:
        combined['__weights__'] = weights

    # A row is valid when none of its columns is missing.
    valid = (combined.count(1) == len(combined.columns)).values
    index = combined.index
    combined = combined[valid]

    if weights is not None:
        filt_weights = combined.pop('__weights__')
    else:
        filt_weights = None

    filt_lhs = combined.pop('__y__')
    # What is left of ``combined`` after the pops is the filtered rhs.
    filt_rhs = combined

    return (filt_lhs, filt_rhs, filt_weights,
            pre_filt_rhs, index, valid)
Beispiel #30
0
def _process_data_structure(arg, kill_inf=True):
    """
    Split ``arg`` into float values plus a hook that re-wraps results.

    Parameters
    ----------
    arg : DataFrame, Series or ndarray
    kill_inf : bool, default True
        If True, infinite entries are replaced by NaN in a copy of the
        values.

    Returns
    -------
    return_hook : callable
        Rebuilds an object like ``arg`` from an array of values: same class,
        index and columns for a DataFrame, a Series on the same index for a
        Series, and the array unchanged for anything else.
    values : ndarray
        The values of ``arg`` converted to float.
    """
    if isinstance(arg, DataFrame):
        return_hook = lambda v: type(arg)(
            v, index=arg.index, columns=arg.columns)
        values = arg.values
    elif isinstance(arg, Series):
        values = arg.values
        return_hook = lambda v: Series(v, arg.index)
    else:
        return_hook = lambda v: v
        values = arg

    if not issubclass(values.dtype.type, float):
        values = values.astype(float)

    if kill_inf:
        # Work on a copy so the caller's data is not modified.
        values = values.copy()
        # np.nan is the canonical spelling; the np.NaN alias was removed in
        # NumPy 2.0.
        values[np.isinf(values)] = np.nan

    return return_hook, values