Example #1
    def test_rank(self):
        tm._skip_if_no_scipy()
        from scipy.stats import rankdata

        self.frame['A'][::2] = np.nan
        self.frame['B'][::3] = np.nan
        self.frame['C'][::4] = np.nan
        self.frame['D'][::5] = np.nan

        ranks0 = self.frame.rank()
        ranks1 = self.frame.rank(1)
        mask = np.isnan(self.frame.values)

        fvals = self.frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp0[mask] = np.nan

        exp1 = np.apply_along_axis(rankdata, 1, fvals)
        exp1[mask] = np.nan

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # integers
        df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

        result = df.rank()
        exp = df.astype(float).rank()
        tm.assert_frame_equal(result, exp)

        result = df.rank(1)
        exp = df.astype(float).rank(1)
        tm.assert_frame_equal(result, exp)
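
The masking trick used above (fill NaN with inf, rank with scipy, then put NaN back) can be checked outside the test suite; a minimal sketch, assuming only pandas, NumPy and SciPy:

import numpy as np
import pandas as pd
from scipy.stats import rankdata

df = pd.DataFrame({"A": [3.0, np.nan, 1.0], "B": [2.0, 5.0, np.nan]})
mask = np.isnan(df.values)

# rank column-wise; np.inf pushes the NaN slots to the end before they are masked out
expected = np.apply_along_axis(rankdata, 0, df.fillna(np.inf).values)
expected[mask] = np.nan

assert np.allclose(df.rank().values, expected, equal_nan=True)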
Example #2
    def test_rank(self, float_frame):
        rankdata = pytest.importorskip('scipy.stats.rankdata')

        float_frame['A'][::2] = np.nan
        float_frame['B'][::3] = np.nan
        float_frame['C'][::4] = np.nan
        float_frame['D'][::5] = np.nan

        ranks0 = float_frame.rank()
        ranks1 = float_frame.rank(1)
        mask = np.isnan(float_frame.values)

        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp0[mask] = np.nan

        exp1 = np.apply_along_axis(rankdata, 1, fvals)
        exp1[mask] = np.nan

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # integers
        df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

        result = df.rank()
        exp = df.astype(float).rank()
        tm.assert_frame_equal(result, exp)

        result = df.rank(1)
        exp = df.astype(float).rank(1)
        tm.assert_frame_equal(result, exp)
Example #3
    def test_astype_categoricaldtype_class_raises(self, cls):
        df = DataFrame({"A": ['a', 'a', 'b', 'c']})
        xpr = "Expected an instance of {}".format(cls.__name__)
        with tm.assert_raises_regex(TypeError, xpr):
            df.astype({"A": cls})

        with tm.assert_raises_regex(TypeError, xpr):
            df['A'].astype(cls)
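
As the test implies, astype expects a CategoricalDtype instance rather than the class itself; a minimal sketch of the working call (hypothetical data):

import pandas as pd
from pandas.api.types import CategoricalDtype

df = pd.DataFrame({"A": ["a", "a", "b", "c"]})

# passing an instance works; passing the bare CategoricalDtype class raises TypeError
result = df.astype({"A": CategoricalDtype(categories=["a", "b", "c"])})
print(result.dtypes)  # A    category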
Example #4
    def test_astype_dict_like(self, dtype_class):
        # GH7271 & GH16717
        a = Series(date_range('2010-01-04', periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(['1.0', '2', '3.14', '4', '5.4'])
        df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({'b': 'str', 'd': 'float32'})
        result = df.astype(dt1)
        expected = DataFrame({
            'a': a,
            'b': Series(['0', '1', '2', '3', '4']),
            'c': c,
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
        result = df.astype(dt2)
        expected = DataFrame({
            'a': a,
            'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
            'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
        assert_frame_equal(df.astype(dt3),
                           df.astype(str))
        assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({'b': str, 2: str})
        dt5 = dtype_class({'e': str})
        pytest.raises(KeyError, df.astype, dt4)
        pytest.raises(KeyError, df.astype, dt5)
        assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

        # GH 16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({})
        result = df.astype(dt7)
        assert_frame_equal(df, result)
        assert_frame_equal(df, original)
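
A short, self-contained illustration of the dict-based astype behaviour exercised above, including the KeyError for unknown labels (hypothetical column names):

import pandas as pd

df = pd.DataFrame({"b": range(3), "d": ["1.5", "2", "3.25"]})
converted = df.astype({"b": "str", "d": "float32"})
print(converted.dtypes)  # b becomes object (str), d becomes float32

try:
    df.astype({"e": str})  # 'e' is not a column label
except KeyError as exc:
    print("KeyError:", exc)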
Example #5
    def test_arg_for_errors_in_astype(self):
        # issue #14878

        df = DataFrame([1, 2, 3])

        with pytest.raises(ValueError):
            df.astype(np.float64, errors=True)

        df.astype(np.int8, errors='ignore')
Example #6
    def test_astype_cast_nan_inf_int(self, val, dtype):
        # see gh-14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with tm.assert_raises_regex(ValueError, msg):
            df.astype(dtype)
Example #7
    def test_astype_cast_nan_inf_int(self):
        # GH14265, check nan and inf raise error when converting to int
        types = [np.int32, np.int64]
        values = [np.nan, np.inf]
        msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer'

        for this_type in types:
            for this_val in values:
                df = DataFrame([this_val])
                with tm.assert_raises_regex(ValueError, msg):
                    df.astype(this_type)
Example #8
    def test_astype_to_incorrect_datetimelike(self, unit):
        # trying to astype a m to a M, or vice-versa
        # gh-19224
        dtype = "M8[{}]".format(unit)
        other = "m8[{}]".format(unit)

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        with pytest.raises(TypeError):
            df.astype(other)

        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError):
            df.astype(dtype)
Example #9
    def _get_culled_matrix(least_num_seg: int,
                           dtm_data_frame: pd.DataFrame) -> pd.DataFrame:
        """Get the culled final_matrix and culled words.

        Gives a matrix that only contains the words that appear in at least
        `least_num_seg` segments.
        :param least_num_seg: the minimum number of segments a word needs to
                                appear in to be kept.
        :param dtm_data_frame: the DTM as a pandas DataFrame;
                                the indices (rows) are segment names,
                                the columns are words.
        :return:
             the culled dtm data frame
        """
        # create a boolean matrix indicating whether a word occurs in a segment:
        # at the row of segment s and the column of word w,
        # True means w occurs in s, False means it does not
        is_in_data_frame = dtm_data_frame.astype(bool)

        # summing the boolean columns gives, for each word, the number of
        # segments it occurs in; the result is a Series indexed by word
        # noinspection PyUnresolvedReferences
        words_in_num_seg_series = is_in_data_frame.sum(axis=0)

        # keep only the words (columns) that appear in enough segments;
        # the column indexer below is a boolean mask
        dtm_data_frame = dtm_data_frame.loc[
            :,  # select all rows (row indexer)
            words_in_num_seg_series >= least_num_seg  # col indexer
        ]

        return dtm_data_frame
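
A tiny worked example of the culling logic above, with made-up segment and word labels:

import pandas as pd

dtm = pd.DataFrame(
    {"cat": [1, 0, 2], "dog": [0, 0, 3], "fish": [1, 1, 1]},
    index=["seg1", "seg2", "seg3"],
)

# keep only words that occur in at least 2 segments
least_num_seg = 2
keep = dtm.astype(bool).sum(axis=0) >= least_num_seg
print(dtm.loc[:, keep])  # 'dog' is dropped: it appears in only one segment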
Example #10
    def components(self):
        """
        Return a dataframe of the components (days, hours, minutes,
        seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.

        Returns
        -------
        a DataFrame
        """
        from pandas import DataFrame

        columns = ['days', 'hours', 'minutes', 'seconds',
                   'milliseconds', 'microseconds', 'nanoseconds']
        hasnans = self.hasnans
        if hasnans:
            def f(x):
                if isnull(x):
                    return [np.nan] * len(columns)
                return x.components
        else:
            def f(x):
                return x.components

        result = DataFrame([f(x) for x in self])
        result.columns = columns
        if not hasnans:
            result = result.astype('int64')
        return result
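
The method above backs the public components accessor; a usage sketch on a TimedeltaIndex (the result is int64 only when no NaT is present):

import pandas as pd

tdi = pd.TimedeltaIndex(["1 days 02:30:00", "0 days 00:00:45"])
print(tdi.components)
#    days  hours  minutes  seconds  milliseconds  microseconds  nanoseconds
# 0     1      2       30        0             0             0            0
# 1     0      0        0       45             0             0            0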
Example #11
 def test_astype_categorical(self, dtype):
     # GH 18099
     d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
     df = DataFrame(d)
     result = df.astype(dtype)
     expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
     tm.assert_frame_equal(result, expected)
Example #12
def slice(unit, trialData, sort_by = None, show = False):
    
    data = trialData
    
    if sort_by in trialData.columns:
        data = trialData.sort(columns=sort_by)
    
    rates = DataFrame(index=data.index, columns = range(6))
    
    for ind, row in data.iterrows():
        pg = (row['PG in'], row['PG out'])
        fg = (row['C out'], row['FG in'])
        cent = (row['C in'], row['C out'])
        delay = (row['PG out'], row['C in'])
    
        counts = [ np.histogram(row[unit.id], bins = 1, range=pg)[0] ]
        
        counts.append(np.histogram(row[unit.id], bins=3, range=delay)[0])
        counts.extend([np.histogram(row[unit.id], bins = 1, range=period)[0] 
                        for period in [cent, fg]])
        
        counts = np.concatenate(counts)
        diffs = [pg[1]-pg[0], (delay[1]-delay[0])/3.0, (delay[1]-delay[0])/3.0,
                (delay[1]-delay[0])/3.0, cent[1]-cent[0], fg[1]-fg[0], ]
        
        
        rates.ix[ind] = counts/diffs
    
    if show:
        plt.imshow(rates.astype(float), aspect='auto', interpolation = 'nearest',
            extent=[0,5,0,len(rates)])
    
    return rates
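
The snippet above relies on long-deprecated pandas APIs; a hedged note on the modern equivalents (same column layout assumed):

# data = trialData.sort(columns=sort_by)   ->  data = trialData.sort_values(by=sort_by)
# rates.ix[ind] = counts / diffs           ->  rates.loc[ind] = counts / diffs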
Example #13
    def clean(numpy_array):  #load your csv data here in numpy_array
        data=ut.preprocessData(numpy_array)

        #print dataarray
        #print data

        ###### numpy into pandas dataframe
        df = pd.DataFrame(data)
        #print df
        #print df.dtypes

        df=df.astype('float16')
        #print df.dtypes


        ###### generate preprocessed csv file 
        #df.to_csv('preprocessed_data.csv', sep=',',index=False)

        ###### normalize data between [0,1] using X_norm= (X - Xmin)/ (Xmax - Xmin)
        df_norm= (df - df.min()) / (df.max()-df.min())
        df_norm=df_norm.fillna(-1)

        ##### generate normalized csv 
        #df_norm.to_csv('normalized_data.csv',sep=',', index=False)
        
        return df_norm.as_matrix() 
Example #14
    def test_rank_methods_frame(self):
        tm.skip_if_no_package('scipy', min_version='0.13',
                              app='scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals,
                        m if m != 'first' else 'ordinal')
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank, columns=cols)

                    if LooseVersion(scipy.__version__) >= '0.17.0':
                        expected = expected.astype('float64')
                    tm.assert_frame_equal(result, expected)
Example #15
    def test_passing_dtype(self):
        # see gh-6607
        df = DataFrame(np.random.rand(5, 2).round(4), columns=list(
            'AB'), index=['1A', '1B', '1C', '1D', '1E'])

        with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
            df.to_csv(path)

            # see gh-3795: passing 'str' as the dtype
            result = self.read_csv(path, dtype=str, index_col=0)
            expected = df.astype(str)
            tm.assert_frame_equal(result, expected)

            # for parsing, interpret object as str
            result = self.read_csv(path, dtype=object, index_col=0)
            tm.assert_frame_equal(result, expected)

            # we expect all object columns, so need to
            # convert to test for equivalence
            result = result.astype(float)
            tm.assert_frame_equal(result, df)

            # invalid dtype
            self.assertRaises(TypeError, self.read_csv, path,
                              dtype={'A': 'foo', 'B': 'float64'},
                              index_col=0)

        # see gh-12048: empty frame
        actual = self.read_csv(StringIO('A,B'), dtype=str)
        expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
        tm.assert_frame_equal(actual, expected)
Example #16
def bool_frame_with_na():
    """
    Fixture for DataFrame of booleans with index of unique strings

    Columns are ['A', 'B', 'C', 'D']; some entries are missing

                    A      B      C      D
    zBZxY2IDGd  False  False  False  False
    IhBWBMWllt  False   True   True   True
    ctjdvZSR6R   True  False   True   True
    AVTujptmxb  False   True  False   True
    G9lrImrSWq  False  False  False   True
    sFFwdIUfz2    NaN    NaN    NaN    NaN
    s15ptEJnRb    NaN    NaN    NaN    NaN
    ...           ...    ...    ...    ...
    UW41KkDyZ4   True   True  False  False
    l9l6XkOdqV   True  False  False  False
    X2MeZfzDYA  False   True  False  False
    xWkIKU7vfX  False   True  False   True
    QOhL6VmpGU  False  False  False   True
    22PwkRJdat  False   True  False  False
    kfboQ3VeIK   True  False   True  False

    [30 rows x 4 columns]
    """
    df = DataFrame(tm.getSeriesData()) > 0
    df = df.astype(object)
    # set some NAs
    df.iloc[5:10] = np.nan
    df.iloc[15:20, -2:] = np.nan
    return df
Example #17
    def test_astype_str(self, text_dtype):
        # see gh-9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        # Test str and unicode on Python 2.x and just str on Python 3.x
        result = df.astype(text_dtype)

        expected = DataFrame({
            "a": list(map(text_dtype,
                          map(lambda x: Timestamp(x)._date_repr, a._values))),
            "b": list(map(text_dtype, map(Timestamp, b._values))),
            "c": list(map(text_dtype,
                          map(lambda x: Timedelta(x)._repr_base(format="all"),
                              c._values))),
            "d": list(map(text_dtype, d._values)),
            "e": list(map(text_dtype, e._values)),
        })

        assert_frame_equal(result, expected)
Example #18
def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : boolean
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta data

    Notes
    -----
        Because ``write_json`` uses the string `index` to denote a name-less
        ``Index``, this function sets the name of the returned ``DataFrame`` to
        ``None`` when said string is encountered. Therefore, intentional usage
        of `index` as the ``Index`` name is not supported.

    See also
    --------
    build_table_schema : inverse function
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    col_order = [field['name'] for field in table['schema']['fields']]
    df = DataFrame(table['data'])[col_order]

    dtypes = {field['name']: convert_json_field_to_pandas_type(field)
              for field in table['schema']['fields']}

    # Cannot directly use as_type with timezone data on object; raise for now
    if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
        raise NotImplementedError('table="orient" can not yet read timezone '
                                  'data')

    # No ISO constructor for Timedelta as of yet, so need to raise
    if 'timedelta64' in dtypes.values():
        raise NotImplementedError('table="orient" can not yet read '
                                  'ISO-formatted Timedelta data')

    df = df.astype(dtypes)

    df = df.set_index(table['schema']['primaryKey'])
    if len(df.index.names) == 1 and df.index.name == 'index':
        df.index.name = None
    else:
        if all(x.startswith('level_') for x in df.index.names):
            df.index.names = [None] * len(df.index.names)

    return df
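
A round-trip sketch of the orient="table" machinery that parse_table_schema supports, using only the public pandas API:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
json_str = df.to_json(orient="table")

roundtrip = pd.read_json(json_str, orient="table")
print(roundtrip.dtypes)  # dtypes are restored from the embedded schema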
Example #19
    def test_hist_non_numerical_raises(self):
        # gh-10444
        df = DataFrame(np.random.rand(10, 2))
        df_o = df.astype(np.object)

        msg = "hist method requires numerical columns, nothing to plot."
        with pytest.raises(ValueError, match=msg):
            df_o.hist()
Example #20
    def test_astype_to_incorrect_datetimelike(self, unit):
        # trying to astype a m to a M, or vice-versa
        # gh-19224
        dtype = "M8[{}]".format(unit)
        other = "m8[{}]".format(unit)

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
               r" \[timedelta64\[{}\]\]").format(unit)
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
               r" \[datetime64\[{}\]\]").format(unit)
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)
Example #21
    def setup_cache(self):
        df = DataFrame([[1]])
        frames = {
            'int': df,
            'float': df.astype(float),
        }

        return frames
Example #22
    def test_constant_drift(self):
        N = 10
        expected = DataFrame({'x': np.arange(N), 'y': np.zeros(N)}).iloc[1:]
        expected = expected.astype('float')
        expected.index.name = 'frame'
        expected.columns = ['x', 'y']

        actual = tp.compute_drift(self.steppers)
        assert_frame_equal(actual, expected)
Example #23
def boxcoxtrans(str,list):
        s=list
        w = pd.read_csv(str, usecols=s)

        f = DataFrame(w)
        c = f.astype(float)

        x = c.as_matrix()


        e = []

        for j in np.linspace(-2, 2, num=21):

                if j != 0:

                    b =(x**j)

                    d=[]
                    c=[]
                    for i in range(0,len(b)):
                        c = b[i]
                        d.append(c[0])
                    

                    t = stats.shapiro(d)
                    
                    
                    e.append(t[1])




        for i in range(0,len(e)):

            if e[i]==max(e):

                break
        t=(-2+0.2*i)

        if t>=0:
            t=(-2+0.2*(i+1))

        print('optimal lambda =', t)

        h=((x**t)-1)/t
        l=[]
        m=[]
        for i in range(0,len(h)):
            l = h[i]
            m.append(l[0])


        print(pd.DataFrame(m))
        k=stats.shapiro(m)

        print('shapiro test of transformed column', k)
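
For reference, SciPy ships this transform directly, including the lambda search done by hand above; a minimal sketch (strictly positive data required):

import numpy as np
from scipy import stats

x = np.array([1.2, 0.8, 3.4, 2.2, 5.1])
transformed, optimal_lambda = stats.boxcox(x)
print('optimal lambda =', optimal_lambda)
print('shapiro test of transformed column', stats.shapiro(transformed))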
Example #24
    def test_div(self):

        # no longer do integer div for any ops, but deal with the 0's
        p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
        result = p['first'] / p['second']
        expected = Series(p['first'].values.astype(float) / p['second'].values,
                          dtype='float64')
        expected.iloc[0:3] = np.inf
        assert_series_equal(result, expected)

        result = p['first'] / 0
        expected = Series(np.inf, index=p.index, name='first')
        assert_series_equal(result, expected)

        p = p.astype('float64')
        result = p['first'] / p['second']
        expected = Series(p['first'].values / p['second'].values)
        assert_series_equal(result, expected)

        p = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]})
        result = p['first'] / p['second']
        assert_series_equal(result, p['first'].astype('float64'),
                            check_names=False)
        self.assertTrue(result.name is None)
        self.assertFalse(np.array_equal(result, p['second'] / p['first']))

        # inf signing
        s = Series([np.nan, 1., -1.])
        result = s / 0
        expected = Series([np.nan, np.inf, -np.inf])
        assert_series_equal(result, expected)

        # float/integer issue
        # GH 7785
        p = DataFrame({'first': (1, 0), 'second': (-0.01, -0.02)})
        expected = Series([-0.01, -np.inf])

        result = p['second'].div(p['first'])
        assert_series_equal(result, expected, check_names=False)

        result = p['second'] / p['first']
        assert_series_equal(result, expected)

        # GH 9144
        s = Series([-1, 0, 1])

        result = 0 / s
        expected = Series([0.0, nan, 0.0])
        assert_series_equal(result, expected)

        result = s / 0
        expected = Series([-inf, nan, inf])
        assert_series_equal(result, expected)

        result = s // 0
        expected = Series([-inf, nan, inf])
        assert_series_equal(result, expected)
Example #25
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units from numeric origination
        # gh-19223 / gh-12425
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)
Example #26
    def test_interp_inplace(self):
        df = DataFrame({'a': [1., 2., np.nan, 4.]})
        expected = DataFrame({'a': [1., 2., 3., 4.]})
        result = df.copy()
        result['a'].interpolate(inplace=True)
        assert_frame_equal(result, expected)

        result = df.copy()
        result['a'].interpolate(inplace=True, downcast='infer')
        assert_frame_equal(result, expected.astype('int64'))
Beispiel #27
0
    def test_astype_to_datetime_unit(self, unit):
        # tests all units from datetime origination
        # gh-19223
        dtype = "M8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)
Example #28
def acc_cont_table(predictions, names, true, print_flag=True):
    
    """Create Conditional Accuracy Tables as in:
       Combining Information Extraction Systems Using Voting and Stacked Generalization
       by Sigletos et al, 2005"""
    
    from numpy import eye
    from pandas import DataFrame
    
    table = eye(len(predictions))  # table initialization
    for i in range(len(predictions)):
        for j in range(i+1, len(predictions)): # for each pair
            _, _, _, i_given_j, j_given_i = Pairwise_Tests(predictions[i], predictions[j], true, names[i], names[j])
            table[i, j] = i_given_j
            table[j, i] = j_given_i
    df = DataFrame(table, names, names)
    if print_flag:
        print(df.astype('float').to_string(float_format=lambda x: '%0.2f' % x))
    return df
Example #29
    def test_astype_to_timedelta_unit_ns(self, unit):
        # preserve the timedelta conversion
        # gh-19223
        dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)
Example #30
    def test_astype_to_timedelta_unit(self, unit):
        # coerce to float
        # gh-19223
        dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(df.values.astype(dtype).astype(float))

        tm.assert_frame_equal(result, expected)
Example #31
def cast_dtypes(
    df: pd.DataFrame,
    dtype: Optional[Mapping[str, Union[type, str]]] = None,
    inplace=False,
) -> pd.DataFrame:
    """
    Cast data types for columns in dataframe, skip columns that doesn't exist.

    The following obsplus specific datatypes are supported:
        'ops_datetime' - call :func:`obsplus.utils.time.to_datetime64` on column
        'ops_timedelta` - call :func:`obsplus.utils.time.to_timedelta64` on column

    Note: this is different from pd.astype because it skips columns which
    don't exist.

    Parameters
    ----------
    df
        Dataframe
    dtype
        A dict of columns and datatypes.
    inplace
        If true perform operation in place.
    """
    # get overlapping columns and dtypes
    overlap = set(dtype) & set(df.columns)
    dtype_codes = {i: dtype[i] for i in overlap}
    # if the dataframe is empty and has columns use simple astype
    if df.empty and len(df.columns):
        dtypes = {i: OPS_DTYPES.get(v, v) for i, v in dtype_codes.items()}
        return df.astype(dtypes)
    # else create functions and apply to each column
    funcs = {
        i: OPS_DTYPE_FUNCS.get(v, lambda x, y=v: x.astype(y))
        for i, v in dtype_codes.items()
    }
    return apply_funcs_to_columns(df, funcs=funcs, inplace=inplace)
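
The point the helper makes (unlike plain DataFrame.astype, it silently skips columns that are absent) can be reproduced with stock pandas; a minimal sketch with hypothetical columns:

import pandas as pd

df = pd.DataFrame({"a": ["1", "2"], "b": ["x", "y"]})
wanted = {"a": "int64", "c": "float64"}  # 'c' does not exist in df

# keep only the dtypes whose columns are actually present, then cast
present = {col: dt for col, dt in wanted.items() if col in df.columns}
print(df.astype(present).dtypes)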
Example #32
def post_processing(df: pd.DataFrame, include_spot_prices=False) -> pd.DataFrame:
    unpacked_column_pool = unpack_column_pool(df)
    unpacked_column_token_prices = unpack_column_token_prices(df)

    df = df.assign(**unpacked_column_pool).assign(**unpacked_column_token_prices)
    if include_spot_prices:
        unpacked_column_spot_prices = unpack_column_spot_prices(df)
        df = df.assign(**unpacked_column_spot_prices)

    # Convert change_datetime from str to datetime, other columns to float64
    df["change_datetime"] = pd.to_datetime(df["change_datetime"], utc=True)
    df = df.astype({"pool_shares": "float64", "swap_fee": "float64"})

    # Calculate token_{x}_value columns
    token_x_value = calc_token_x_value(df)
    df = df.assign(**token_x_value)

    # Calculate TVL column
    symbols = assets_in_df(df)
    token_value_columns = [f'token_{s}_value' for s in symbols]
    column_tvl = df[token_value_columns].sum(axis=1)
    df = df.assign(tvl=column_tvl)

    # Calculate Invariant column
    df['invariant'] = 1
    for s in symbols:
        df['invariant'] *= (df[f'token_{s}_balance'] ** df[f'token_{s}_weight'])

    # Calculate total_token_balances
    token_balance_columns = [f'token_{s}_balance' for s in symbols]
    column_total_token_balances = df[token_balance_columns].sum(axis=1)
    df = df.assign(total_token_balances=column_total_token_balances)

    # Convert generated_fees_(token) columns from str or Decimal to float64
    generated_fees_columns = [f'generated_fees_{s}' for s in symbols]
    for generated_fee_col in generated_fees_columns:
        df[generated_fee_col] = df[generated_fee_col].astype('float64')
    return df
Example #33
def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
    # test combined aggregations on ordered categorical cols GH27800

    # create the result dataframe
    input_df = DataFrame({
        "nr": [1, 2, 3, 4, 5, 6, 7, 8],
        "cat_ord": list("aabbccdd"),
        "cat": list("aaaabbbb"),
    })

    input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
    input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
    result_df = input_df.groupby("cat").agg(grp_col_dict)

    # create expected dataframe
    cat_index = pd.CategoricalIndex(["a", "b"],
                                    categories=["a", "b"],
                                    ordered=False,
                                    name="cat",
                                    dtype="category")

    # unpack the grp_col_dict to create the multi-index tuple
    # this tuple will be used to create the expected dataframe index
    multi_index_list = []
    for k, v in grp_col_dict.items():
        if isinstance(v, list):
            for value in v:
                multi_index_list.append([k, value])
        else:
            multi_index_list.append([k, v])
    multi_index = MultiIndex.from_tuples(tuple(multi_index_list))

    expected_df = DataFrame(data=exp_data,
                            columns=multi_index,
                            index=cat_index)

    tm.assert_frame_equal(result_df, expected_df)
Example #34
    def components(self):
        """
        Return a dataframe of the components (days, hours, minutes,
        seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.

        Returns
        -------
        a DataFrame
        """
        from pandas import DataFrame

        columns = [
            "days",
            "hours",
            "minutes",
            "seconds",
            "milliseconds",
            "microseconds",
            "nanoseconds",
        ]
        hasnans = self._hasnans
        if hasnans:

            def f(x):
                if isna(x):
                    return [np.nan] * len(columns)
                return x.components

        else:

            def f(x):
                return x.components

        result = DataFrame([f(x) for x in self], columns=columns)
        if not hasnans:
            result = result.astype("int64")
        return result
Example #35
    def test_passing_dtype(self):
        # see gh-6607
        df = DataFrame(np.random.rand(5, 2).round(4),
                       columns=list('AB'),
                       index=['1A', '1B', '1C', '1D', '1E'])

        with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
            df.to_csv(path)

            # see gh-3795: passing 'str' as the dtype
            result = self.read_csv(path, dtype=str, index_col=0)
            expected = df.astype(str)
            tm.assert_frame_equal(result, expected)

            # for parsing, interpret object as str
            result = self.read_csv(path, dtype=object, index_col=0)
            tm.assert_frame_equal(result, expected)

            # we expect all object columns, so need to
            # convert to test for equivalence
            result = result.astype(float)
            tm.assert_frame_equal(result, df)

            # invalid dtype
            self.assertRaises(TypeError,
                              self.read_csv,
                              path,
                              dtype={
                                  'A': 'foo',
                                  'B': 'float64'
                              },
                              index_col=0)

        # see gh-12048: empty frame
        actual = self.read_csv(StringIO('A,B'), dtype=str)
        expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
        tm.assert_frame_equal(actual, expected)
Example #36
 def _generate_category_map(self, X: DataFrame) -> (DataFrame, dict):
     if self.features_in:
         fill_nan_map = dict()
         category_map = dict()
         X_category = X.astype('category')
         for column in X_category:
             rank = X_category[column].value_counts().sort_values(ascending=True)
             if self._minimum_cat_count is not None:
                 rank = rank[rank >= self._minimum_cat_count]
             if self._maximum_num_cat is not None:
                 rank = rank[-self._maximum_num_cat:]
             if self.cat_order == 'count' or self._minimum_cat_count is not None or self._maximum_num_cat is not None:
                 category_list = list(rank.index)  # category_list in 'count' order
                 if len(category_list) > 1:
                     if self.cat_order == 'original':
                         original_cat_order = list(X_category[column].cat.categories)
                         set_category_list = set(category_list)
                         category_list = [cat for cat in original_cat_order if cat in set_category_list]
                     elif self.cat_order == 'alphanumeric':
                         category_list.sort()
                 X_category[column] = X_category[column].astype(CategoricalDtype(categories=category_list))  # TODO: Remove columns if all NaN after this?
                 X_category[column] = X_category[column].cat.reorder_categories(category_list)
             elif self.cat_order == 'alphanumeric':
                 category_list = list(X_category[column].cat.categories)
                 category_list.sort()
                 X_category[column] = X_category[column].astype(CategoricalDtype(categories=category_list))
                 X_category[column] = X_category[column].cat.reorder_categories(category_list)
             category_map[column] = copy.deepcopy(X_category[column].cat.categories)
             if self._fillna_flag:
                 if self._fillna == 'mode':
                     if len(rank) > 0:
                         fill_nan_map[column] = list(rank.index)[-1]
         if not self._fillna_flag:
             fill_nan_map = None
         return X_category, category_map, fill_nan_map
     else:
         return DataFrame(index=X.index), None, None
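
A compact illustration of the count-ordered category pattern used above (hypothetical data):

import pandas as pd
from pandas.api.types import CategoricalDtype

s = pd.Series(["a", "a", "a", "b", "b", "c"]).astype("category")

# order categories by frequency, as the transformer above does with value_counts()
by_count = list(s.value_counts().sort_values(ascending=True).index)
s = s.astype(CategoricalDtype(categories=by_count))
print(list(s.cat.categories))  # ['c', 'b', 'a']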
Example #37
def clean_dat(dat: pd.DataFrame, logger=None) -> pd.DataFrame:
    if logger is None:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        logger = logging.getLogger(__name__)

    logger.debug(
        'Clean data: number of inf and NaN values in the dataset: (%d, %d)' %
        ((dat == np.inf).sum().sum(), dat.isna().sum().sum()))
    logger.info(' -Set type to float32 at first && deal with inf.')
    dat = dat.astype(np.float32)
    dat = dat.replace([np.inf, -np.inf], np.nan)
    logger.info(' -Remove columns that are more than half NaN')
    dat = dat.dropna(axis=1, thresh=dat.shape[0] * .5)
    logger.info(' -Remove constant columns')
    dat = dat.loc[:, (dat != dat.iloc[0]).any()]
    logger.info(' -Remove columns dominated by near-zero values')
    for col in dat.columns:
        if (abs(dat[col] - 0.0) < 0.0001).sum() / dat.shape[0] > 0.8:
            print((abs(dat[col] - 0.0) < 0.0001).sum())
            dat.drop(col, axis=1, inplace=True)
    if dat.isna().sum().sum() > 0:
        logger.info(' -Start to fill the columns with nan')
        # imp = IterativeImputer(max_iter=10, random_state=0)
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        # dat = dat.fillna(dat.mean())
        tmp = imp.fit_transform(dat)
        if tmp.shape[1] != dat.shape[1]:
            tmp = dat.fillna(0)
        dat = pd.DataFrame(tmp, columns=dat.columns, index=dat.index)
    logger.info(' -Remove rows with any nan in the end')
    dat = dat.dropna(axis=0, how='any')
    logger.debug(
        'Finished data cleaning: number of inf and NaN values in the dataset: (%d, %d)'
        % ((dat == np.inf).sum().sum(), dat.isna().sum().sum()))
    return dat
Example #38
def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):
    # test single aggregations on ordered categorical cols GH 27800

    # create the result dataframe
    input_df = DataFrame(
        {
            "nr": [1, 2, 3, 4, 5, 6, 7, 8],
            "cat_ord": list("aabbccdd"),
            "cat": list("aaaabbbb"),
        }
    )

    input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
    input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
    result_df = input_df.groupby("cat").agg(grp_col_dict)

    # create expected dataframe
    cat_index = pd.CategoricalIndex(
        ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
    )

    expected_df = DataFrame(data=exp_data, index=cat_index)

    tm.assert_frame_equal(result_df, expected_df)
Example #39
def infer(X_test, save_dir):
    test_data_size = len(X_test)

    # load parameters
    print("====loading param====")
    w = np.loadtxt(os.path.join(save_dir, 'w'))
    b = np.loadtxt(os.path.join(save_dir, 'b'))

    # predict
    z = (np.dot(X_test, w) + b)
    y = sigmoid(z)
    y_ = np.around(y)
    y_answer = y_.T

    # with open('answer', 'w') as f:
    #     f.write('id, label\n')
    #     for i, v in enumerate(y_):
    #         f.write("%d,%d\n" % (i+1, v))
    #     f.close()
    answer = DataFrame(y_answer)
    answer.index += 1
    answer.columns = ['prediction']
    answer = answer.astype('int64')
    answer.to_csv('answer.csv', index_label='id')
Example #40
    def test_hist_non_numerical_or_datetime_raises(self):
        # gh-10444, GH32590
        df = DataFrame(
            {
                "a": np.random.rand(10),
                "b": np.random.randint(0, 10, 10),
                "c": to_datetime(
                    np.random.randint(
                        1582800000000000000, 1583500000000000000, 10, dtype=np.int64
                    )
                ),
                "d": to_datetime(
                    np.random.randint(
                        1582800000000000000, 1583500000000000000, 10, dtype=np.int64
                    ),
                    utc=True,
                ),
            }
        )
        df_o = df.astype(object)

        msg = "hist method requires numerical or datetime columns, nothing to plot."
        with pytest.raises(ValueError, match=msg):
            df_o.hist()
Example #41
    def convert_labels(image: np.ndarray, df: pd.DataFrame,
                       pixel_size: Tuple[float, float]) -> pd.DataFrame:
        """Pre-processes labels to be used in deepBlink.

        Renames X/Y to c/r respectively for easier handling with rearrangement to r/c.
        Rounds coordinates on borders to prevent Fiji out-of-bounds behavior.
        """
        # Fiji point label format
        if all(c in df.columns for c in ("X", "Y")):
            df = df.rename(columns={"X": "c", "Y": "r"})[["r", "c"]]
        # TrackMate export format
        elif all(c in df.columns for c in ("POSITION_X", "POSITION_Y")):
            df = df[~df.index.isna(
            )]  # Remove unused headers for TrackMate v7.0.0+
            df = df.rename(columns={
                "POSITION_X": "c",
                "POSITION_Y": "r"
            })[["r", "c"]]
            df = df.reset_index(drop=True)
        else:
            raise ValueError(
                "Format of input labels not recognized. "
                "Requires X,Y or POSITION_X,POSITION_Y in columns. "
                f"Columns found are: {df.columns.to_list()}.")

        # Clip upper and lower bounds of coordinates
        df = df.astype({"r": np.float64, "c": np.float64})
        for name, var in zip(["r", "c"], image.shape):
            df[name] = df[name].where(df[name] < var, var)
            df[name] = df[name].where(df[name] > 0, 0)

        # Scale coordinates to pixel size
        size_x, size_y = pixel_size
        df["r"] = df["r"] / size_y
        df["c"] = df["c"] / size_x
        return df
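
The pair of where calls above amounts to clipping the coordinates into the image; an equivalent sketch with clip (hypothetical image size):

import pandas as pd

df = pd.DataFrame({"r": [-1.5, 4.2, 99.0], "c": [0.3, -2.0, 7.7]})
image_shape = (10, 10)  # hypothetical (rows, cols)

for name, var in zip(["r", "c"], image_shape):
    df[name] = df[name].clip(lower=0, upper=var)
print(df)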
Example #42
    def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
        # GH 12396

        # tz-naive
        first = Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
        second = DataFrame(
            [
                [Timestamp("2015/01/01", tz=tz2)],
                [Timestamp("2016/01/01", tz=tz2)],
            ],
            index=[2, 3],
        )

        expected = DataFrame([
            pd.NaT,
            pd.NaT,
            Timestamp("2015/01/01", tz=tz2),
            Timestamp("2016/01/01", tz=tz2),
        ])
        if tz1 != tz2:
            expected = expected.astype(object)

        result = pd.concat([first, second])
        tm.assert_frame_equal(result, expected)
Example #43
def get_means(adata, mycat):
    """ Calculates average and fraction expression per category in adata.obs
    Based on an AnnData object and an annotation category (e.g. louvain) returns 
    average expression and fraction cells expressing gene per category
    parameters
    ----------
    adata: AnnData
      an AnnData object
    mycat: str
      the category for summarisation (e.g. louvain, cell_names)
    returns
    -------
    average_obs
        average gene expression per category
    fraction_obs
        fraction cells expressing a gene per category
    """
    gene_ids = adata.raw.var.index.values
    try:
        x = adata.obs[mycat]
        adata.obs[mycat] = adata.obs[mycat].astype('category')
        clusters = adata.obs[mycat].cat.categories
        obs = adata.raw[:, gene_ids].X.toarray()
        obs = DataFrame(obs, columns=gene_ids, index=adata.obs[mycat])
        average_obs = obs.groupby(level=0).mean()
        obs_bool = obs.astype(bool)
        fraction_obs = obs_bool.groupby(level=0).sum() / obs_bool.groupby(
            level=0).count()
    except KeyError:
        print(
            "Oops!  The adata object does not have the specified column. Options are: "
        )
        print(list(adata.obs.columns))
        average_obs = None
        fraction_obs = None
    return (average_obs, fraction_obs)
Example #44
def bool_frame_with_na():
    """
    Fixture for DataFrame of booleans with index of unique strings

    Columns are ['A', 'B', 'C', 'D']; some entries are missing

                    A      B      C      D
    zBZxY2IDGd  False  False  False  False
    IhBWBMWllt  False   True   True   True
    ctjdvZSR6R   True  False   True   True
    AVTujptmxb  False   True  False   True
    G9lrImrSWq  False  False  False   True
    sFFwdIUfz2    NaN    NaN    NaN    NaN
    s15ptEJnRb    NaN    NaN    NaN    NaN
    ...           ...    ...    ...    ...
    UW41KkDyZ4   True   True  False  False
    l9l6XkOdqV   True  False  False  False
    X2MeZfzDYA  False   True  False  False
    xWkIKU7vfX  False   True  False   True
    QOhL6VmpGU  False  False  False   True
    22PwkRJdat  False   True  False  False
    kfboQ3VeIK   True  False   True  False

    [30 rows x 4 columns]
    """
    df = DataFrame(tm.getSeriesData()) > 0
    df = df.astype(object)
    # set some NAs
    df.iloc[5:10] = np.nan
    df.iloc[15:20, -2:] = np.nan

    # For `any` tests we need to have at least one True before the first NaN
    #  in each column
    for i in range(4):
        df.iloc[i, i] = True
    return df
Example #45
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        '''
        Changes dataset's column types to reduce memory consumption

        Args:
            X: dataframe needs to be optimized

        Returns:
            Optimized dataframe
        '''
        if self.verbose:
            original_mem_size = X.memory_usage(deep=True).sum() / 1024**2
            print('MEM OPTIMIZER: Memory usage of dataframe: {:.2f} MB'.format(
                original_mem_size))

        X = X.astype(self.dtypes, copy=False)

        if self.verbose:
            new_mem_size = X.memory_usage(deep=True).sum() / 1024**2
            print('MEM OPTIMIZER: Memory usage after optimization: {:.2f} MB'.
                  format(new_mem_size))
            print('MEM OPTIMIZER: Decreased by {:.1f}%'.format(\
                100 * (original_mem_size - new_mem_size) / original_mem_size))
        return X
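
A self-contained sketch of the downcasting idea behind this transformer (the dtype map here is an assumption, not the class's own):

import numpy as np
import pandas as pd

df = pd.DataFrame({"id": np.arange(1000, dtype="int64"),
                   "flag": np.random.randint(0, 2, 1000).astype("int64")})
before = df.memory_usage(deep=True).sum()

df = df.astype({"id": "int32", "flag": "int8"}, copy=False)
after = df.memory_usage(deep=True).sum()
print(f"{before} -> {after} bytes")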
Example #46
def test_first_multi_key_groupbby_categorical():
    # GH 22512
    df = DataFrame(
        {
            "A": [1, 1, 1, 2, 2],
            "B": [100, 100, 200, 100, 100],
            "C": ["apple", "orange", "mango", "mango", "orange"],
            "D": ["jupiter", "mercury", "mars", "venus", "venus"],
        }
    )
    df = df.astype({"D": "category"})
    result = df.groupby(by=["A", "B"]).first()
    expected = DataFrame(
        {
            "C": ["apple", "mango", "mango"],
            "D": Series(["jupiter", "mars", "venus"]).astype(
                pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
            ),
        }
    )
    expected.index = MultiIndex.from_tuples(
        [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
    )
    tm.assert_frame_equal(result, expected)
Example #47
def add_jobapi_data(perf_df: pd.DataFrame):
    """Given a dataframe from PerformanceSummary.to_df, add QPID Job API data for the job.
    Job API reference: https://stash.ihme.washington.edu/projects/QPID/repos/job-db/browse/docs/index.md
    """
    try:
        job_numbers = perf_df["job_number"].unique()
        assert len(job_numbers) == 1
        jobapi_data = requests.get(
            "http://jobapi.ihme.washington.edu/fair/queryjobids",
            params=[("job_number", job_numbers[0]), ("limit", 50000)],
        ).json()
        jobapi_df = pd.DataFrame(jobapi_data["data"]).add_prefix("qpid_")
        perf_df = perf_df.astype({
            "job_number": np.int64,
            "task_number": np.int64
        })
        perf_df = perf_df.merge(
            jobapi_df,
            left_on=["job_number", "task_number"],
            right_on=["qpid_job_number", "qpid_task_number"],
        )
    except Exception as e:
        logger.warning(f"Job API request failed with {e}")
    return perf_df
Example #48
    def test_astype_str(self):
        # see GH#9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        expected = DataFrame(
            {
                "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))),
                "b": list(map(str, map(Timestamp, b._values))),
                "c": list(map(lambda x: Timedelta(x)._repr_base(), c._values)),
                "d": list(map(str, d._values)),
                "e": list(map(str, e._values)),
            }
        )

        tm.assert_frame_equal(result, expected)
Example #49
def buildKB():
    global tfidf1, tfs1, processed_title
    print('started building the knowledge base')
    directory = './knowbase'
    processed_text = []
    processed_title = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            fullname = os.path.join(directory, filename)
            print('Processing file ', fullname)
            f = open(fullname, "r")
            text1 = f.read()
            prepr = preprocess(text1)
            processed_text.append(prepr)
            processed_title.append(filename)

    tfidf1 = TfidfVectorizer()

    tfs1 = tfidf1.fit_transform(processed_text)
    print('!!!!!!!Feature vector is written to a csv file')
    dfReviews = DataFrame(tfs1.A, columns=tfidf1.get_feature_names())
    dfReviews = dfReviews.astype(float)
    dfReviews.to_csv("fv.csv")
    print('knowledge base is built')
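
Note that recent scikit-learn releases renamed the vectorizer accessor used above; a hedged note on the current spelling:

# dfReviews = DataFrame(tfs1.A, columns=tfidf1.get_feature_names())      # scikit-learn < 1.0
# dfReviews = DataFrame(tfs1.A, columns=tfidf1.get_feature_names_out())  # scikit-learn >= 1.0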
Example #50
    def get_maximal_cliques(self, delta=0, direction='both'):
        df = DF((self.df.drop(columns='w') if self.weighted else self.df.copy()))
        di = (delta == .0)
        if not di:
            if self.instantaneous:
                min_time, max_time = df.ts.min(), df.ts.max()
            else:
                min_time, max_time = df.ts.min(), df.tf.max()
            # apply the delta
            df['ts'] -= delta / 2.0
            df['tf'] = df['ts'] + delta
            # and clip to the start and finish of time
            df['ts'].clip(lower=min_time, inplace=True)
            df['tf'].clip(upper=max_time, inplace=True)
        else:
            df['tf'] = df['ts']

        if self.discrete:
            df = df.astype({'ts': int, 'tf': int})
        else:
            df['s'] = True
            df['f'] = True

        return TemporalLinkSetDF(df, disjoint_intervals=(di and not self.discrete), discrete=self.discrete, weighted=False).get_maximal_cliques(direction=direction)
Example #51
def import_dataframe(input_dataframe: pandas.DataFrame,
                     text: str,
                     unique_id: str = None,
                     time: str = None,
                     twitter_times: bool = False,
                     columns_to_keep: List = []):
    """Imports a pandas dataframe into nate.

    Args:
        input_dataframe (pandas.DataFrame): The dataframe to be loaded.
        text (str): The name of the column containing the text data to be
            analyzed with nate. Required for all uses of nate.
        unique_id (str, optional): The name of the column containing unique
            identifiers (e.g. a unique name or hash ID#). Required
            for some uses of nate (e.g. Divsim).
        time (str, optional): The name of the column containing the time the
            observation was recorded. Required for some uses of
            nate (e.g. edge_burst).
        columns_to_keep (list, optional): A list of column names indicating
            which columns not specified elsewhere (e.g. for the
            time parameter) are kept.

    Returns:
        Nate: an instance of the `Nate` class containing all data from the
            columns specified in the parameters.

    The columns indicated in the text, unique_id, and time parameters will
    be renamed to 'text', 'unique_id', and 'time', accordingly. The names
    of the columns listed in 'columns_to_keep' will be preserved as-is.
    """
    
    if time is not None and not twitter_times:
        input_dataframe = input_dataframe.astype({time: 'str'})
        input_dataframe[time] = pandas.to_datetime(input_dataframe[time], infer_datetime_format=True)
    return process_dataframe(input_dataframe, text, unique_id, time, twitter_times,
                             columns_to_keep)
Example #52
    def read_dataframe(
            self,
            nodes: pd.DataFrame,
            attrs: Optional[Dict[str, Any]] = None) -> 'core.TreeNeuron':
        """Convert a SWC-like DataFrame into a TreeNeuron.

        Parameters
        ----------
        nodes : pandas.DataFrame
        attrs : dict or None
            Arbitrary attributes to include in the TreeNeuron

        Returns
        -------
        core.TreeNeuron
        """
        return core.TreeNeuron(sanitise_nodes(
            nodes.astype(self._dtypes, errors='ignore', copy=False)),
                               connectors=self._extract_connectors(nodes),
                               **(self._make_attributes(
                                   {
                                       'name': 'SWC',
                                       'origin': 'DataFrame'
                                   }, attrs)))
Example #53
    def test_rank_methods_frame(self):
        pytest.importorskip('scipy.stats.special')
        rankdata = pytest.importorskip('scipy.stats.rankdata')
        import scipy

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals, m if m != 'first' else 'ordinal')
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank, columns=cols)

                    if (LooseVersion(scipy.__version__) >=
                            LooseVersion('0.17.0')):
                        expected = expected.astype('float64')
                    tm.assert_frame_equal(result, expected)
Example #54
    def __init__(self, words_df: pd.DataFrame):
        """

        Parameters
        ----------
        words_df : pd.DataFrame
            the expected columns as 'Word', 'Pronunciation', 'Pronunciation_with_accents', 'Definition', 'Occurence'
            with a 'Id' index
        """

        # so that fields are in native Python type
        self.words_df = words_df.astype(object)

        self.char_to_words = build_char_to_words(words_df["Word"])

        logger.debug("Get HSK index lists")
        # each list contains the words of that level and below
        # ie. 4 : [all words of level 4 and below]

        max_hsk_level = words_df["HSK_Level"].max()
        self.hsk_to_idx = defaultdict(list)
        for idx, hsk_level in enumerate(words_df["HSK_Level"]):
            for l in range(hsk_level, max_hsk_level + 1):
                self.hsk_to_idx[l].append(idx)
Example #55
    def _transform(self, X: DataFrame) -> DataFrame:
        if self._bool_features:
            for feature in self._bool_features:
                X[feature] = (X[feature] == self._bool_features[feature]).astype(np.int8)
        # check if not same
        if self._type_map_real_opt != X.dtypes.to_dict():
            if self._int_features.size:
                null_count = X[self._int_features].isnull().any()
                # If int feature contains null during inference but not during fit.
                if null_count.any():
                    # TODO: Consider imputing to mode? This is tricky because training data had no missing values.
                    # TODO: Add unit test for this situation, to confirm it is handled properly.
                    with_null = null_count[null_count]
                    with_null_features = list(with_null.index)
                    logger.warning(f'WARNING: Int features without null values at train time contain null values at inference time! Imputing nulls to 0. To avoid this, pass the features as floats during fit!')
                    logger.warning(f'WARNING: Int features with nulls: {with_null_features}')
                    X[with_null_features] = X[with_null_features].fillna(0)

            if self._type_map_real_opt:
                # TODO: Confirm this works with sparse and other feature types!
                # FIXME: Address situation where test-time invalid type values cause crash:
                #  https://stackoverflow.com/questions/49256211/how-to-set-unexpected-data-type-to-na?noredirect=1&lq=1
                X = X.astype(self._type_map_real_opt)
        return X
Example #56
def convert_pandas_dtypes(df: pd.DataFrame,
                          col_fix: type = float) -> pd.DataFrame:
    r"""Helper funtion to convert pandas column dtypes

    Parameters
    ----------
    df : pandas.DataFrame
        A pandas dataframe to convert columns
    col_fix : {float, str}, optional
        A column type to convert the input dataframe.

    Returns
    -------
    pd.DataFrame
        A dataframe with converted columns
    """
    try:
        df = df.astype(col_fix)
    except ValueError:
        raise ValueError(
            "Columns cannot be converted to {col}; check input features".
            format(col=col_fix))

    return df
Example #57
def auto_arima(df: DataFrame, prepared_df: DataFrame,
               prediction_step_length: timedelta, feature_column: list):
    df = df[['time_', feature_column]]
    df = df.astype(np.int64)
    stepwise_model = pm.auto_arima(df[feature_column],
                                   start_p=1,
                                   start_q=1,
                                   max_p=3,
                                   max_q=3,
                                   m=7,
                                   start_P=0,
                                   seasonal=True,
                                   d=1,
                                   D=1,
                                   trace=True,
                                   error_action='ignore',
                                   suppress_warnings=True,
                                   stepwise=True)
    stepwise_model.aic()
    stepwise_model.fit(df[feature_column])
    future_forecast = stepwise_model.predict(
        n_periods=len(prepared_df['time_']))
    prepared_df[feature_column] = future_forecast
    return prepared_df
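# A hypothetical call of the auto_arima wrapper above; pmdarima is assumed to be
# installed and imported as pm, and the column names and values are made up.
from datetime import timedelta

import numpy as np
import pandas as pd

history = pd.DataFrame({'time_': np.arange(60),
                        'sales': np.random.randint(10, 30, size=60)})
future = pd.DataFrame({'time_': np.arange(60, 67)})
forecast_df = auto_arima(history, future, timedelta(days=1), 'sales')
print(forecast_df[['time_', 'sales']])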
Beispiel #58
0
def read_data_file(fn, skiplines=1, maxlines=False):
    """  A function to read any foam data files returning data and
         index after header
    """

    # TODO check if sorting the index gives any performance benefits
    # print "opening file {}".format(fn)
    if not os.path.exists(fn):
        print("Can not open file " + fn)
        return None
    try:
        with open(fn, encoding="utf-8") as f:
            field = fn.split('/')[-1]
            content = f.readlines()
            content.append('bla')
            start, num_entries = if_header_skip(content)
            entries = len(content[start].split())
            is_a_vector = entries > 1
            end = start + num_entries
            # FIXME this fails for eulerian/lagrangian vector fields
            # since no positional entry is produced
            if is_a_vector:
                data = list(map(lambda x: re.sub(r"[0-9]*\(|\)", '', x).split(),
                            content[start:end:skiplines]))
                loc, names = evaluate_names(fn, entries)
                df = DataFrame(data=data, columns=names)
                if loc:
                    df['Loc'] = loc
                else:
                    df['Loc'] = range(len(df))
                if "Pos" in df:
                    df.set_index('Loc', append=False, inplace=True)
                    df["Pos"] = df["Pos"].astype(float)
                    df.set_index('Pos', append=True, inplace=True)
                else:
                    # if no Pos is available we have either
                    # an Eulerian or a Lagrangian field
                    df.set_index('Loc', append=True, inplace=True)
                    df.index.names = ['Pos', 'Loc']
                    df = df.reorder_levels(['Loc', 'Pos'])
                df = df.astype(float)
                hashes = {}
                for row in df.columns:
                    hashes.update({row: hash_series(df[row])})
                return names, df, hashes
            # Data files with a single entry per line are treated as Eulerian or Lagrangian fields
            else:
                data = [np.float32(x) for x in content[start:end:skiplines]]
                entries = 1
                df = DataFrame(data=data, columns=[field])
                df['Loc'] = "Field"
                df.set_index('Loc', append=True, inplace=True)
                df.index.names = ['Pos', 'Loc']
                df = df.reorder_levels(['Loc', 'Pos'])
                hashes = {field: int(hashlib.md5(str(data).encode('utf-8')).hexdigest(), 16)}
                return field, df, hashes
    except Exception as e:
        if DEBUG:
            print("Error processing datafile " + fn)
            print(e)
        return None
Beispiel #59
0
    def setup_cache(self):
        df = DataFrame([[1]])
        frames = {"int": df, "float": df.astype(float)}

        return frames
Beispiel #60
0
    def __fit_hp(self,
                 train_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 hp: pd.Series,
                 simple_imputer,
                 name: str,
                 user_defined_scores: list = None) -> pd.core.series.Series:
        """

        Method initialises the model, performs fitting and returns the desired metrics.


        :param train_df: training data as dataframe
        :param test_df: test data as dataframe; if not provided, a ratio of test_split of the
                          training data is used as test data
        :param hp: pd.Series with hyperparameter configuration
        :param simple_imputer: SimpleImputer instance from which to inherit column names etc.
        :param name: name identifying the current setting of hps.
        :param user_defined_scores: list with entries (Callable, str), where the callable is a function
                          accepting arguments (true, predicted, confidence): true is an array with the true labels,
                          predicted an array with the predicted labels, and confidence an array with the confidence score for
                          each prediction.
                          Default metrics are:
                          f1_weighted, f1_micro, f1_macro, f1_weighted_train
                          recall_weighted, recall_weighted_train, precision_weighted, precision_weighted_train,
                          coverage_at_90, coverage_at_90_train, empirical_precision_at_90,
                          ece_pre_calibration (ece: expected calibration error), ece_post_calibration, time [min].
                          A user defined function could look as follows:

                          def my_function(true, predicted, confidence):
                               return (true[confidence > .75] == predicted[confidence > .75]).mean()

                          uds = (my_function, 'empirical_precision_above_75')

        :return: Series with hpo parameters and results.

        """

        from . import Imputer  # needs to be imported here to avoid circular dependency

        if not name:
            name = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        data_encoders = []
        data_featurizers = []

        if hp['global:concat_columns'] is False:

            # mark unused parameter
            for key, val in hp.items():
                if 'concat:' in key:
                    hp[key] = 'n.a.'

            # define column encoders and featurisers for each input column
            for input_column in simple_imputer.input_columns:

                # extract parameters for the current input column, take everything after the first colon
                col_parms = {
                    ':'.join(key.split(':')[1:]): val
                    for key, val in hp.items() if input_column in key
                }

                # define all input columns
                if col_parms['type'] == 'string':
                    # iterate over multiple embeddings (chars + strings for the same column)
                    for token in col_parms['tokens']:
                        # call kw. args. with: **{key: item for key, item in col_parms.items() if not key == 'type'})]
                        data_encoders += [
                            TfIdfEncoder(
                                input_columns=[input_column],
                                output_column=input_column + '_' + token,
                                tokens=token,
                                ngram_range=col_parms['ngram_range:' + token],
                                max_tokens=col_parms['max_tokens'])
                        ]
                        data_featurizers += [
                            BowFeaturizer(field_name=input_column + '_' +
                                          token,
                                          max_tokens=col_parms['max_tokens'])
                        ]

                elif col_parms['type'] == 'categorical':
                    data_encoders += [
                        CategoricalEncoder(input_columns=[input_column],
                                           output_column=input_column + '_' +
                                           col_parms['type'],
                                           max_tokens=col_parms['max_tokens'])
                    ]
                    data_featurizers += [
                        EmbeddingFeaturizer(field_name=input_column + '_' +
                                            col_parms['type'],
                                            max_tokens=col_parms['max_tokens'],
                                            embed_dim=col_parms['embed_dim'])
                    ]

                elif col_parms['type'] == 'numeric':
                    data_encoders += [
                        NumericalEncoder(input_columns=[input_column],
                                         output_column=input_column + '_' +
                                         col_parms['type'],
                                         normalize=col_parms['normalize'])
                    ]
                    data_featurizers += [
                        NumericalFeaturizer(
                            field_name=input_column + '_' + col_parms['type'],
                            numeric_latent_dim=col_parms['numeric_latent_dim'],
                            numeric_hidden_layers=col_parms[
                                'numeric_hidden_layers'])
                    ]
                else:
                    logger.warning(
                        'Found unknown column type. Candidates are string, categorical, numeric.'
                    )

        # Concatenate all columns
        else:
            # cast all columns to string for concatenation
            train_df = train_df.astype(str)
            test_df = test_df.astype(str)

            col_parms = {
                ':'.join(key.split(':')[1:]): val
                for key, val in hp.items() if 'concat' in key
            }
            for token in col_parms['tokens']:
                data_encoders += [
                    TfIdfEncoder(
                        input_columns=simple_imputer.input_columns,
                        output_column='-'.join(simple_imputer.input_columns) +
                        '_' + token,
                        tokens=token,
                        ngram_range=col_parms['ngram_range:' + token],
                        max_tokens=col_parms['max_tokens'])
                ]
                data_featurizers += [
                    BowFeaturizer(
                        field_name='-'.join(simple_imputer.input_columns) +
                        '_' + token,
                        max_tokens=col_parms['max_tokens'])
                ]
                # mark unused parameter
                for key, val in hp.items():
                    if not ('global:' in key or 'concat:' in key):
                        hp[key] = 'n.a.'

        # Define the output column encoder. Associated parameters are not tuned.
        if is_numeric_dtype(train_df[simple_imputer.output_column]):
            label_column = [NumericalEncoder(simple_imputer.output_column)]
            logger.info("Assuming numeric output column: {}".format(
                simple_imputer.output_column))
        else:
            label_column = [CategoricalEncoder(simple_imputer.output_column)]
            logger.info("Assuming categorical output column: {}".format(
                simple_imputer.output_column))

        global_parms = {
            key.split(':')[1]: val
            for key, val in hp.items() if 'global' in key
        }

        hp_time = time.time()

        hp_imputer = Imputer(data_encoders=data_encoders,
                             data_featurizers=data_featurizers,
                             label_encoders=label_column,
                             output_path=self.output_path + name)

        hp_imputer.fit(
            train_df=train_df,
            test_df=test_df,
            ctx=get_context(),
            learning_rate=global_parms['learning_rate'],
            num_epochs=global_parms['num_epochs'],
            patience=global_parms['patience'],
            test_split=.1,
            weight_decay=global_parms['weight_decay'],
            batch_size=global_parms['batch_size'],
            final_fc_hidden_units=global_parms['final_fc_hidden_units'],
            calibrate=True)

        # add suitable metrics to hp series
        imputed = hp_imputer.predict(test_df)
        true = imputed[simple_imputer.output_column]
        predicted = imputed[simple_imputer.output_column + '_imputed']

        imputed_train = hp_imputer.predict(
            train_df.sample(min(train_df.shape[0], int(1e4))))
        true_train = imputed_train[simple_imputer.output_column]
        predicted_train = imputed_train[simple_imputer.output_column +
                                        '_imputed']

        if is_numeric_dtype(train_df[simple_imputer.output_column]):
            hp['mse'] = mean_squared_error(true, predicted)
            hp['mse_train'] = mean_squared_error(true_train, predicted_train)
            confidence = float('nan')
        else:
            confidence = imputed[simple_imputer.output_column +
                                 '_imputed_proba']
            confidence_train = imputed_train[simple_imputer.output_column +
                                             '_imputed_proba']
            hp['f1_micro'] = f1_score(true, predicted, average='micro')
            hp['f1_macro'] = f1_score(true, predicted, average='macro')
            hp['f1_weighted'] = f1_score(true, predicted, average='weighted')
            hp['f1_weighted_train'] = f1_score(true_train,
                                               predicted_train,
                                               average='weighted')
            # precision_score is assumed to be importable from sklearn.metrics
            # alongside f1_score and recall_score.
            hp['precision_weighted'] = precision_score(true,
                                                       predicted,
                                                       average='weighted')
            hp['precision_weighted_train'] = precision_score(true_train,
                                                             predicted_train,
                                                             average='weighted')
            hp['recall_weighted'] = recall_score(true,
                                                 predicted,
                                                 average='weighted')
            hp['recall_weighted_train'] = recall_score(true_train,
                                                       predicted_train,
                                                       average='weighted')
            hp['coverage_at_90'] = (confidence > .9).mean()
            hp['coverage_at_90_train'] = (confidence_train > .9).mean()
            hp['empirical_precision_at_90'] = (
                predicted[confidence > .9] == true[confidence > .9]).mean()
            # NOTE: both entries below read the 'ece_post' key; 'ece_pre' is presumably
            # the intended key for the pre-calibration error.
            hp['ece_pre_calibration'] = hp_imputer.calibration_info['ece_post']
            hp['ece_post_calibration'] = hp_imputer.calibration_info[
                'ece_post']
            hp['time [min]'] = (time.time() - hp_time) / 60

        for uds in (user_defined_scores or []):  # guard against the default of None
            hp[uds[1]] = uds[0](true=true,
                                predicted=predicted,
                                confidence=confidence)

        hp_imputer.save()

        return hp
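# A sketch of a user-defined score entry as described in the docstring above;
# the function name and the confidence threshold are illustrative.
def empirical_precision_above_75(true, predicted, confidence):
    mask = confidence > .75
    return (true[mask] == predicted[mask]).mean()

user_defined_scores = [(empirical_precision_above_75, 'empirical_precision_above_75')]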