Example #1
0
    def test_rank(self):
        """Rank a NaN-riddled float frame along both axes and compare with
        scipy.stats.rankdata; also check integer frames rank like floats."""
        tm._skip_if_no_scipy()
        from scipy.stats import rankdata

        # Punch NaN holes into the frame at a different stride per column.
        for col, step in zip('ABCD', (2, 3, 4, 5)):
            self.frame[col][::step] = np.nan

        ranks_by_col = self.frame.rank()
        ranks_by_row = self.frame.rank(1)
        missing = np.isnan(self.frame.values)

        # rankdata has no NaN handling: fill with +inf so missing values rank
        # last, then blank those positions back out of the expectation.
        filled = self.frame.fillna(np.inf).values

        for axis, got in ((0, ranks_by_col), (1, ranks_by_row)):
            expected = np.apply_along_axis(rankdata, axis, filled)
            expected[missing] = np.nan
            tm.assert_almost_equal(got.values, expected)

        # Integer input must produce the same ranks as its float equivalent.
        df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
        tm.assert_frame_equal(df.rank(), df.astype(float).rank())
        tm.assert_frame_equal(df.rank(1), df.astype(float).rank(1))
Example #2
0
    def test_rank(self, float_frame):
        """Rank a frame containing NaNs along both axes and compare against
        scipy.stats.rankdata; also verify integer frames rank like floats.

        Bug fix: ``pytest.importorskip`` takes an importable *module* name
        and returns that module.  ``'scipy.stats.rankdata'`` names a
        function, not a module, so the import always failed and the test was
        silently skipped.  Import the module and take the function off it.
        """
        sp_stats = pytest.importorskip('scipy.stats')
        rankdata = sp_stats.rankdata

        # Punch NaN holes at a different stride in each column.
        float_frame['A'][::2] = np.nan
        float_frame['B'][::3] = np.nan
        float_frame['C'][::4] = np.nan
        float_frame['D'][::5] = np.nan

        ranks0 = float_frame.rank()
        ranks1 = float_frame.rank(1)
        mask = np.isnan(float_frame.values)

        # rankdata has no NaN handling: fill with +inf so missing values rank
        # last, then blank those positions out of the expectation.
        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp0[mask] = np.nan

        exp1 = np.apply_along_axis(rankdata, 1, fvals)
        exp1[mask] = np.nan

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # Integer input must produce the same ranks as its float equivalent.
        df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

        result = df.rank()
        exp = df.astype(float).rank()
        tm.assert_frame_equal(result, exp)

        result = df.rank(1)
        exp = df.astype(float).rank(1)
        tm.assert_frame_equal(result, exp)
Example #3
0
    def test_astype_categoricaldtype_class_raises(self, cls):
        """Passing the CategoricalDtype *class* (rather than an instance)
        to astype must raise TypeError, for both frames and series."""
        df = DataFrame({"A": ['a', 'a', 'b', 'c']})
        xpr = "Expected an instance of {}".format(cls.__name__)
        for cast in (lambda: df.astype({"A": cls}),
                     lambda: df['A'].astype(cls)):
            with tm.assert_raises_regex(TypeError, xpr):
                cast()
Example #4
0
    def test_astype_dict_like(self, dtype_class):
        """astype with a dict-like of per-column dtypes (GH7271 & GH16717).

        Covers: casting a subset of columns, casting every column, key
        validation (unknown labels raise KeyError), identity casts, and the
        empty mapping.  The input frame must never be mutated.

        Bug fix: the empty-mapping case computed ``result`` but then
        asserted against the stale ``equiv`` frame, leaving ``result``
        completely unchecked; assert on ``result`` instead.
        """
        a = Series(date_range('2010-01-04', periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(['1.0', '2', '3.14', '4', '5.4'])
        df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({'b': 'str', 'd': 'float32'})
        result = df.astype(dt1)
        expected = DataFrame({
            'a': a,
            'b': Series(['0', '1', '2', '3', '4']),
            'c': c,
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
        result = df.astype(dt2)
        expected = DataFrame({
            'a': a,
            'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
            'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
        assert_frame_equal(df.astype(dt3),
                           df.astype(str))
        assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({'b': str, 2: str})
        dt5 = dtype_class({'e': str})
        pytest.raises(KeyError, df.astype, dt4)
        pytest.raises(KeyError, df.astype, dt5)
        assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

        # GH 16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({})
        result = df.astype(dt7)
        assert_frame_equal(df, result)  # was `equiv`: left `result` untested
        assert_frame_equal(df, original)
Example #5
0
    def test_arg_for_errors_in_astype(self):
        """astype(errors=...) validation (issue #14878): a non-string value
        raises ValueError, while 'ignore' is accepted without raising."""
        frame = DataFrame([1, 2, 3])

        # Anything other than the documented strings is rejected outright.
        with pytest.raises(ValueError):
            frame.astype(np.float64, errors=True)

        # 'ignore' must pass validation silently.
        frame.astype(np.int8, errors='ignore')
Example #6
0
    def test_astype_cast_nan_inf_int(self, val, dtype):
        """Casting NaN or inf to an integer dtype must raise (gh-14265)."""
        frame = DataFrame([val])
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        with tm.assert_raises_regex(ValueError, msg):
            frame.astype(dtype)
Example #7
0
    def test_astype_cast_nan_inf_int(self):
        """GH14265: converting NaN or inf to any integer dtype must raise."""
        msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer'

        # Every (value, dtype) pairing must fail identically.
        for bad_value in (np.nan, np.inf):
            for int_type in (np.int32, np.int64):
                frame = DataFrame([bad_value])
                with tm.assert_raises_regex(ValueError, msg):
                    frame.astype(int_type)
Example #8
0
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting datetime64 <-> timedelta64 must raise TypeError
        (gh-19224)."""
        dt_dtype = "M8[{}]".format(unit)
        td_dtype = "m8[{}]".format(unit)

        # datetime data cast to a timedelta dtype
        frame = DataFrame(np.array([[1, 2, 3]], dtype=dt_dtype))
        with pytest.raises(TypeError):
            frame.astype(td_dtype)

        # timedelta data cast to a datetime dtype
        frame = DataFrame(np.array([[1, 2, 3]], dtype=td_dtype))
        with pytest.raises(TypeError):
            frame.astype(dt_dtype)
Example #9
0
 def test_astype_categorical(self, dtype):
     # GH 18099
     d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
     df = DataFrame(d)
     result = df.astype(dtype)
     expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
     tm.assert_frame_equal(result, expected)
Example #10
0
    def test_astype_str(self, text_dtype):
        """Casting mixed datetime/tz/timedelta/int/float columns to a text
        dtype (gh-9757).  Datetime-likes stringify via their pandas repr
        helpers; numerics via the plain text constructor.  ``text_dtype``
        is str (and additionally unicode on Python 2).
        """
        ser_dt = Series(date_range("2010-01-04", periods=5))
        ser_tz = Series(date_range("3/6/2012 00:00", periods=5,
                                   tz="US/Eastern"))
        ser_td = Series([Timedelta(x, unit="d") for x in range(5)])
        ser_int = Series(range(5))
        ser_flt = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        frame = DataFrame({"a": ser_dt, "b": ser_tz, "c": ser_td,
                           "d": ser_int, "e": ser_flt})

        # Datetime-like
        # Test str and unicode on Python 2.x and just str on Python 3.x
        converted = frame.astype(text_dtype)

        expected = DataFrame({
            "a": [text_dtype(Timestamp(x)._date_repr)
                  for x in ser_dt._values],
            "b": [text_dtype(Timestamp(x)) for x in ser_tz._values],
            "c": [text_dtype(Timedelta(x)._repr_base(format="all"))
                  for x in ser_td._values],
            "d": [text_dtype(x) for x in ser_int._values],
            "e": [text_dtype(x) for x in ser_flt._values],
        })

        assert_frame_equal(converted, expected)
Example #11
0
def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : boolean
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta data

    Notes
    -----
        Because ``write_json`` uses the string `index` to denote a name-less
        ``Index``, this function sets the name of the returned ``DataFrame`` to
        ``None`` when said string is encountered. Therefore, intentional usage
        of `index` as the ``Index`` name is not supported.

    See also
    --------
    build_table_schema : inverse function
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    fields = table['schema']['fields']

    # Reorder the data columns to match the schema's declared field order.
    col_order = [field['name'] for field in fields]
    df = DataFrame(table['data'])[col_order]

    dtypes = {}
    for field in fields:
        dtypes[field['name']] = convert_json_field_to_pandas_type(field)

    # Cannot directly use as_type with timezone data on object; raise for now
    for pandas_type in dtypes.values():
        if str(pandas_type).startswith('datetime64[ns, '):
            raise NotImplementedError('table="orient" can not yet read '
                                      'timezone data')

    # No ISO constructor for Timedelta as of yet, so need to raise
    if 'timedelta64' in dtypes.values():
        raise NotImplementedError('table="orient" can not yet read '
                                  'ISO-formatted Timedelta data')

    df = df.astype(dtypes)
    df = df.set_index(table['schema']['primaryKey'])

    # write_json labels a nameless Index "index" (and nameless MultiIndex
    # levels "level_N"); strip those sentinel names back off.
    if len(df.index.names) == 1 and df.index.name == 'index':
        df.index.name = None
    elif all(x.startswith('level_') for x in df.index.names):
        df.index.names = [None] * len(df.index.names)

    return df
    def clean(numpy_array):  #load your csv data here in numpy_array
        """Preprocess a raw numpy array and min-max normalize it to [0, 1].

        Runs the project's ``ut.preprocessData`` step, casts to float16,
        rescales each column with X_norm = (X - Xmin) / (Xmax - Xmin), and
        fills the NaNs produced by constant columns (Xmax == Xmin) with the
        sentinel -1.  Returns the normalized data as a plain numpy array.

        Bug fix: ``DataFrame.as_matrix()`` was deprecated in pandas 0.23 and
        removed in 1.0; ``.values`` performs the same conversion.
        """
        data = ut.preprocessData(numpy_array)

        # numpy -> pandas, for column-wise min/max arithmetic
        df = pd.DataFrame(data)
        df = df.astype('float16')

        # Optionally persist the intermediate frame:
        # df.to_csv('preprocessed_data.csv', sep=',', index=False)

        # Normalize to [0, 1]; a constant column yields 0/0 == NaN here...
        df_norm = (df - df.min()) / (df.max() - df.min())
        # ...which is mapped to the sentinel value -1.
        df_norm = df_norm.fillna(-1)

        # Optionally persist the normalized frame:
        # df_norm.to_csv('normalized_data.csv', sep=',', index=False)

        return df_norm.values
Example #13
0
def slice(unit, trialData, sort_by = None, show = False):
    """Build per-trial firing rates for `unit` across six task epochs.

    For each trial (row of ``trialData``) the spike times in
    ``row[unit.id]`` are histogrammed into: the PG hold epoch (1 bin), the
    delay epoch split in thirds (3 bins), the centre epoch (1 bin) and the
    FG epoch (1 bin); each count is divided by its bin duration to give a
    rate.

    Parameters
    ----------
    unit : object with an ``id`` attribute naming the spike-time column
    trialData : DataFrame with epoch-boundary columns
        'PG in', 'PG out', 'C in', 'C out', 'FG in'
    sort_by : optional column name used to order the trials first
    show : if True, render the rate matrix with matplotlib

    Returns
    -------
    DataFrame of shape (n_trials, 6), indexed like ``trialData``.

    Bug fix: ``DataFrame.sort(columns=...)`` and ``.ix`` were deprecated
    and later removed from pandas; use ``sort_values`` and ``.loc``.
    """
    data = trialData
    if sort_by in trialData.columns:
        data = trialData.sort_values(by=sort_by)

    rates = DataFrame(index=data.index, columns=range(6))

    for ind, row in data.iterrows():
        # Epoch boundaries as (start, stop) pairs.
        pg = (row['PG in'], row['PG out'])
        fg = (row['C out'], row['FG in'])
        cent = (row['C in'], row['C out'])
        delay = (row['PG out'], row['C in'])

        # Spike counts: PG (1 bin), delay (3 bins), centre and FG (1 each).
        counts = [np.histogram(row[unit.id], bins=1, range=pg)[0]]
        counts.append(np.histogram(row[unit.id], bins=3, range=delay)[0])
        counts.extend([np.histogram(row[unit.id], bins=1, range=period)[0]
                       for period in [cent, fg]])
        counts = np.concatenate(counts)

        # Matching bin durations, to turn counts into rates.
        diffs = [pg[1] - pg[0], (delay[1] - delay[0]) / 3.0,
                 (delay[1] - delay[0]) / 3.0, (delay[1] - delay[0]) / 3.0,
                 cent[1] - cent[0], fg[1] - fg[0]]

        rates.loc[ind] = counts / diffs

    if show:
        plt.imshow(rates.astype(float), aspect='auto',
                   interpolation='nearest', extent=[0, 5, 0, len(rates)])

    return rates
Example #14
0
    def components(self):
        """
        Return a dataframe of the components (days, hours, minutes,
        seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.

        Returns
        -------
        a DataFrame
        """
        from pandas import DataFrame

        columns = ['days', 'hours', 'minutes', 'seconds',
                   'milliseconds', 'microseconds', 'nanoseconds']
        hasnans = self.hasnans

        if hasnans:
            # NaT has no .components: expand it into a full row of NaNs.
            def extract(td):
                return ([np.nan] * len(columns) if isnull(td)
                        else td.components)
        else:
            def extract(td):
                return td.components

        result = DataFrame([extract(td) for td in self])
        result.columns = columns
        if not hasnans:
            # Without missing values every component is an exact integer.
            result = result.astype('int64')
        return result
Example #15
0
def bool_frame_with_na():
    """
    Fixture for DataFrame of booleans with index of unique strings

    Columns are ['A', 'B', 'C', 'D']; some entries are missing

                    A      B      C      D
    zBZxY2IDGd  False  False  False  False
    IhBWBMWllt  False   True   True   True
    ctjdvZSR6R   True  False   True   True
    AVTujptmxb  False   True  False   True
    G9lrImrSWq  False  False  False   True
    sFFwdIUfz2    NaN    NaN    NaN    NaN
    s15ptEJnRb    NaN    NaN    NaN    NaN
    ...           ...    ...    ...    ...
    UW41KkDyZ4   True   True  False  False
    l9l6XkOdqV   True  False  False  False
    X2MeZfzDYA  False   True  False  False
    xWkIKU7vfX  False   True  False   True
    QOhL6VmpGU  False  False  False   True
    22PwkRJdat  False   True  False  False
    kfboQ3VeIK   True  False   True  False

    [30 rows x 4 columns]
    """
    df = DataFrame(tm.getSeriesData()) > 0
    # object dtype so the frame can hold NaN alongside bools
    df = df.astype(object)
    # Punch positional NA holes.  The index is strings, so integer slices
    # must go through .iloc: `.loc[5:10]` / `.loc[15:20, -2:]` relied on the
    # long-removed integer-label fallback and now raise TypeError.
    df.iloc[5:10] = np.nan
    df.iloc[15:20, -2:] = np.nan
    return df
Example #16
0
    def test_passing_dtype(self):
        # see gh-6607
        # Round-trip a small float frame through CSV and check that the
        # parser honours the `dtype` argument (self.read_csv is the
        # engine-specific parser supplied by the test-class mixin).
        df = DataFrame(np.random.rand(5, 2).round(4), columns=list(
            'AB'), index=['1A', '1B', '1C', '1D', '1E'])

        with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
            df.to_csv(path)

            # see gh-3795: passing 'str' as the dtype
            result = self.read_csv(path, dtype=str, index_col=0)
            expected = df.astype(str)
            tm.assert_frame_equal(result, expected)

            # for parsing, interpret object as str
            result = self.read_csv(path, dtype=object, index_col=0)
            tm.assert_frame_equal(result, expected)

            # we expect all object columns, so need to
            # convert to test for equivalence
            result = result.astype(float)
            tm.assert_frame_equal(result, df)

            # invalid dtype
            # 'foo' is not a dtype, so the parser must reject it up front
            self.assertRaises(TypeError, self.read_csv, path,
                              dtype={'A': 'foo', 'B': 'float64'},
                              index_col=0)

        # see gh-12048: empty frame
        # a header-only CSV still yields the requested (str) column dtypes
        actual = self.read_csv(StringIO('A,B'), dtype=str)
        expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
        tm.assert_frame_equal(actual, expected)
Example #17
0
    def _get_culled_matrix(least_num_seg: int,
                           dtm_data_frame: pd.DataFrame) -> pd.DataFrame:
        """Get the culled final_matrix and culled words.

        Gives a matrix that only contains the words that appears in more than
        `least_num_seg` segments.
        :param least_num_seg: least number of segment the word needs to appear
                                in to be kept.
        :param dtm_data_frame: the dtm in forms of panda data frames.
                                the indices(rows) are segment names
                                the columns are words.
        :return:
             the culled dtm data frame
        """
        # create a bool matrix to indicate whether a word is in a segment
        # at the line of segment s and the column of word w,
        # if the value is True, then means w is in s
        # otherwise means w is not in s
        is_in_data_frame = dtm_data_frame.astype(bool)

        # summing the boolean array gives an int, which indicates how many
        # True there are in that array.
        # this is an series, indicating each word is in how many segments
        # this array is a parallel array of words
        # noinspection PyUnresolvedReferences
        words_in_num_seg_series = is_in_data_frame.sum(axis=0)

        # get the index of all the words needs to remain
        # this is an array of int
        dtm_data_frame = dtm_data_frame.loc[
            :,  # select all rows (row indexer)
            words_in_num_seg_series >= least_num_seg  # col indexer
        ]

        return dtm_data_frame
Example #18
0
    def test_rank_methods_frame(self):
        """Compare DataFrame.rank against scipy.stats.rankdata for every
        axis/method combination, at several magnitudes of the data."""
        tm.skip_if_no_package('scipy', min_version='0.13',
                              app='scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        raw = np.random.randint(0, 21, (100, 26))
        raw = (raw - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(raw.shape[1])]

        for vals in [raw, raw + 1e6, raw * 1e-6]:
            df = DataFrame(vals, columns=cols)
            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    # scipy calls pandas' 'first' method 'ordinal'
                    scipy_method = 'ordinal' if m == 'first' else m
                    sprank = np.apply_along_axis(rankdata, ax, vals,
                                                 scipy_method)
                    expected = DataFrame(sprank.astype(np.float64),
                                         columns=cols)
                    if LooseVersion(scipy.__version__) >= '0.17.0':
                        expected = expected.astype('float64')
                    tm.assert_frame_equal(result, expected)
Example #19
0
    def setup_cache(self):
        """Build the benchmark fixtures: a 1x1 int frame plus its float cast."""
        base = DataFrame([[1]])
        return {
            'int': base,
            'float': base.astype(float),
        }
Example #20
0
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting m8 <-> M8 must raise TypeError with an explanatory
        message (gh-19224)."""
        dt_dtype = "M8[{}]".format(unit)
        td_dtype = "m8[{}]".format(unit)

        # datetime data cast to a timedelta dtype
        frame = DataFrame(np.array([[1, 2, 3]], dtype=dt_dtype))
        msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
               r" \[timedelta64\[{}\]\]").format(unit)
        with pytest.raises(TypeError, match=msg):
            frame.astype(td_dtype)

        # timedelta data cast to a datetime dtype
        frame = DataFrame(np.array([[1, 2, 3]], dtype=td_dtype))
        msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
               r" \[datetime64\[{}\]\]").format(unit)
        with pytest.raises(TypeError, match=msg):
            frame.astype(dt_dtype)
Example #21
0
    def test_hist_non_numerical_raises(self):
        """hist() on a frame with no numerical columns must raise
        (gh-10444).

        Bug fix: the ``np.object`` alias was deprecated in numpy 1.20 and
        removed in 1.24; the builtin ``object`` is the documented
        replacement and behaves identically in ``astype``.
        """
        df = DataFrame(np.random.rand(10, 2))
        df_o = df.astype(object)

        msg = "hist method requires numerical columns, nothing to plot."
        with pytest.raises(ValueError, match=msg):
            df_o.hist()
Example #22
0
    def test_constant_drift(self):
        """Steppers moving at constant velocity should yield a linear x
        drift and zero y drift (frame 0 has no displacement to measure)."""
        N = 10
        expected = DataFrame({'x': np.arange(N),
                              'y': np.zeros(N)}).iloc[1:].astype('float')
        expected.index.name = 'frame'
        expected.columns = ['x', 'y']

        actual = tp.compute_drift(self.steppers)
        assert_frame_equal(actual, expected)
Example #23
0
    def test_div(self):
        """Division semantics: true division everywhere, inf for x/0 with
        the sign taken from the numerator, NaN for 0/0 and NaN/0."""

        # no longer do integer div for any ops, but deal with the 0's
        p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
        result = p['first'] / p['second']
        expected = Series(p['first'].values.astype(float) / p['second'].values,
                          dtype='float64')
        expected.iloc[0:3] = np.inf  # positive / 0 -> +inf
        assert_series_equal(result, expected)

        # scalar zero divisor behaves like a zero column
        result = p['first'] / 0
        expected = Series(np.inf, index=p.index, name='first')
        assert_series_equal(result, expected)

        # same outcome when the frame is float to begin with
        p = p.astype('float64')
        result = p['first'] / p['second']
        expected = Series(p['first'].values / p['second'].values)
        assert_series_equal(result, expected)

        p = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]})
        result = p['first'] / p['second']
        assert_series_equal(result, p['first'].astype('float64'),
                            check_names=False)
        # mismatched operand names drop the result name
        self.assertTrue(result.name is None)
        self.assertFalse(np.array_equal(result, p['second'] / p['first']))

        # inf signing
        s = Series([np.nan, 1., -1.])
        result = s / 0
        expected = Series([np.nan, np.inf, -np.inf])
        assert_series_equal(result, expected)

        # float/integer issue
        # GH 7785
        p = DataFrame({'first': (1, 0), 'second': (-0.01, -0.02)})
        expected = Series([-0.01, -np.inf])

        result = p['second'].div(p['first'])
        assert_series_equal(result, expected, check_names=False)

        result = p['second'] / p['first']
        assert_series_equal(result, expected)

        # GH 9144
        s = Series([-1, 0, 1])

        # zero numerator: 0/0 is NaN, otherwise 0
        result = 0 / s
        expected = Series([0.0, nan, 0.0])
        assert_series_equal(result, expected)

        result = s / 0
        expected = Series([-inf, nan, inf])
        assert_series_equal(result, expected)

        # floor division by zero matches true division here
        result = s // 0
        expected = Series([-inf, nan, inf])
        assert_series_equal(result, expected)
def boxcoxtrans(str,list):
        s=list
        w = pd.read_csv(str, usecols=s)

        f = DataFrame(w)
        c = f.astype(float)

        x = c.as_matrix()


        e = []

        for j in np.linspace(-2, 2, num=21):

                if j != 0:

                    b =(x**j)

                    d=[]
                    c=[]
                    for i in range(0,len(b)):
                        c = b[i]
                        d.append(c[0])
                    

                    t = stats.shapiro(d)
                    
                    
                    e.append(t[1])




        for i in range(0,len(e)):

            if e[i]==max(e):

                break
        t=(-2+0.2*i)

        if t>=0:
            t=(-2+0.2*(i+1))

        print 'optimal lembda=',t

        h=((x**t)-1)/t
        l=[]
        m=[]
        for i in range(0,len(h)):
            l = h[i]
            m.append(l[0])


        print pd.DataFrame(m)
        k=stats.shapiro(m)

        print 'shapiro test of trans column',k
Example #25
0
    def test_interp_inplace(self):
        """Interpolating a column in place must fill the interior NaN, and
        downcast='infer' must additionally yield an integer frame."""
        frame = DataFrame({'a': [1., 2., np.nan, 4.]})
        filled = DataFrame({'a': [1., 2., 3., 4.]})

        inplace_frame = frame.copy()
        inplace_frame['a'].interpolate(inplace=True)
        assert_frame_equal(inplace_frame, filled)

        inplace_frame = frame.copy()
        inplace_frame['a'].interpolate(inplace=True, downcast='infer')
        assert_frame_equal(inplace_frame, filled.astype('int64'))
def acc_cont_table(predictions, names, true, print_flag=True):

    """Create Conditional Accuracy Tables as in:
       Combining Information Extraction Systems Using Voting and Stacked Generalization
       by Sigletos et al, 2005

    For every pair of systems (i, j) the table holds P(i correct | j
    correct) at [i, j] and P(j correct | i correct) at [j, i]; the
    diagonal is 1 (a system conditioned on itself).

    Fixes applied: Python 2 ``xrange`` and the ``print`` statement (a
    NameError / SyntaxError on Python 3) replaced with ``range`` and the
    ``print()`` function.
    """

    from numpy import eye
    from pandas import DataFrame

    table = eye(len(predictions))  # table initilization: diagonal of 1.0
    for i in range(len(predictions)):
        for j in range(i + 1, len(predictions)):  # for each pair
            _, _, _, i_given_j, j_given_i = Pairwise_Tests(
                predictions[i], predictions[j], true, names[i], names[j])
            table[i, j] = i_given_j
            table[j, i] = j_given_i
    df = DataFrame(table, names, names)
    if print_flag:
        print(df.astype('float').to_string(
            float_format=lambda x: '%0.2f' % (x)))
    return df
Example #27
0
    def test_astype_to_timedelta_unit(self, unit):
        """Casting to a non-nanosecond timedelta unit coerces to float
        (gh-19223)."""
        td_dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=td_dtype)
        frame = DataFrame(arr)

        converted = frame.astype(td_dtype)
        # pandas re-casts through the underlying values, landing on float
        expected = DataFrame(frame.values.astype(td_dtype).astype(float))
        tm.assert_frame_equal(converted, expected)
Example #28
0
    def test_astype_to_timedelta_unit_ns(self, unit):
        """Casting to a nanosecond timedelta unit preserves the timedelta
        conversion exactly (gh-19223)."""
        td_dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=td_dtype)

        converted = DataFrame(arr).astype(td_dtype)
        tm.assert_frame_equal(converted, DataFrame(arr.astype(td_dtype)))
Example #29
0
    def test_astype_to_datetime_unit(self, unit):
        """Casting datetime64 data to each datetime unit matches the raw
        numpy cast (gh-19223)."""
        dt_dtype = "M8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dt_dtype)

        converted = DataFrame(arr).astype(dt_dtype)
        tm.assert_frame_equal(converted, DataFrame(arr.astype(dt_dtype)))
Example #30
0
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Casting numeric data to each datetime/timedelta unit matches the
        raw numpy cast (gh-19223 / gh-12425)."""
        target = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)

        converted = DataFrame(arr).astype(target)
        tm.assert_frame_equal(converted, DataFrame(arr.astype(target)))