Example #1
0
    def test_select_dtypes_exclude_using_scalars(self):
        """Check ``select_dtypes`` when ``exclude`` is a scalar, not a list."""
        columns = {
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3),
        }
        frame = DataFrame(columns)

        # Each scalar ``exclude`` value is paired with the columns that are
        # expected to survive the selection.
        scenarios = [
            (np.number, ['a', 'e', 'f', 'g', 'h', 'i', 'j']),
            ('category', ['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']),
        ]
        for excluded, kept in scenarios:
            result = frame.select_dtypes(exclude=excluded)
            expected = frame[kept]
            assert_frame_equal(result, expected)

        # The 'period' alias is expected to raise rather than select.
        with pytest.raises(NotImplementedError):
            frame.select_dtypes(exclude='period')
Example #2
0
    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        ri = df.select_dtypes(include=np.number,
                              exclude=['floating', 'timedelta'])
        ei = df[['b', 'c']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'],
                              exclude='floating')
        ei = df[['b', 'c', 'f', 'k']]
        assert_frame_equal(ri, ei)
Example #3
0
 def test_select_dtypes_bad_arg_raises(self):
     """A dtype string that cannot be parsed must raise ``TypeError``."""
     columns = {
         'a': list('abc'),
         'g': list(u('abc')),
         'b': list(range(1, 4)),
         'c': np.arange(3, 6).astype('u1'),
         'd': np.arange(4.0, 7.0, dtype='float64'),
         'e': [True, False, True],
         'f': pd.date_range('now', periods=3).values,
     }
     frame = DataFrame(columns)
     # A single-element list whose only entry is not a dtype name.
     bogus_dtypes = ['blargy, blarg, blarg']
     with tm.assertRaisesRegexp(TypeError, 'data type.*not understood'):
         frame.select_dtypes(bogus_dtypes)
Example #4
0
    def test_select_dtypes_bad_datetime64(self):
        """datetime64 dtypes carrying an explicit unit are rejected."""
        columns = {
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.date_range('now', periods=3).values,
        }
        frame = DataFrame(columns)

        # The same error is expected through ``include`` and ``exclude``.
        scenarios = [
            {'include': ['datetime64[D]']},
            {'exclude': ['datetime64[as]']},
        ]
        for kwargs in scenarios:
            with tm.assert_raises_regex(ValueError, '.+ is too specific'):
                frame.select_dtypes(**kwargs)
Example #5
0
    def test_select_dtypes_str_raises(self, dtype, arg):
        """Passing ``dtype`` via keyword ``arg`` must raise ``TypeError``.

        ``arg`` is used as the keyword name and ``dtype`` is wrapped in a
        one-element list as its value.
        """
        columns = {
            "a": list("abc"),
            "g": list(u("abc")),
            "b": list(range(1, 4)),
            "c": np.arange(3, 6).astype("u1"),
            "d": np.arange(4.0, 7.0, dtype="float64"),
            "e": [True, False, True],
            "f": pd.date_range("now", periods=3).values,
        }
        frame = DataFrame(columns)

        with tm.assert_raises_regex(TypeError,
                                    "string dtypes are not allowed"):
            frame.select_dtypes(**{arg: [dtype]})
Example #6
0
    def test_select_dtypes_include(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc'))})
        ri = df.select_dtypes(include=[np.number])
        ei = df[['b', 'c', 'd']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'])
        ei = df[['b', 'c', 'd', 'f']]
        assert_frame_equal(ri, ei)
Example #7
0
    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        df['g'] = df.f.diff()
        assert not hasattr(np, 'u8')
        r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
        e = df[['a', 'b']]
        assert_frame_equal(r, e)

        r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
        e = df[['a', 'b', 'g']]
        assert_frame_equal(r, e)
Example #8
0
    def test_select_dtypes_exclude_include_using_list_like(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        exclude = np.datetime64,
        include = np.bool_, 'integer'
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[['b', 'c', 'e']]
        assert_frame_equal(r, e)

        exclude = 'datetime',
        include = 'bool', 'int64', 'int32'
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[['b', 'e']]
        assert_frame_equal(r, e)
Example #9
0
 def test_select_dtypes_exclude_using_list_like(self):
     df = DataFrame({'a': list('abc'),
                     'b': list(range(1, 4)),
                     'c': np.arange(3, 6).astype('u1'),
                     'd': np.arange(4.0, 7.0, dtype='float64'),
                     'e': [True, False, True]})
     re = df.select_dtypes(exclude=[np.number])
     ee = df[['a', 'e']]
     assert_frame_equal(re, ee)
Example #10
0
 def test_select_dtypes_str_raises(self):
     """String-like dtypes are rejected by ``include`` and ``exclude``."""
     columns = {
         'a': list('abc'),
         'g': list(u('abc')),
         'b': list(range(1, 4)),
         'c': np.arange(3, 6).astype('u1'),
         'd': np.arange(4.0, 7.0, dtype='float64'),
         'e': [True, False, True],
         'f': pd.date_range('now', periods=3).values,
     }
     frame = DataFrame(columns)

     rejected = {str, 'str', np.string_, 'S1',
                 'unicode', np.unicode_, 'U1'}
     try:
         rejected.add(unicode)
     except NameError:
         # The name ``unicode`` is not defined here; nothing to add.
         pass

     message = 'string dtypes are not allowed'
     for dtype in rejected:
         # Same order as before: ``include`` first, then ``exclude``.
         for keyword in ('include', 'exclude'):
             with tm.assert_raises_regex(TypeError, message):
                 frame.select_dtypes(**{keyword: [dtype]})
Example #11
0
 def test_select_dtypes_raises_on_string(self):
     """A bare string for ``include``/``exclude`` must raise ``TypeError``."""
     frame = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
     pattern = 'include and exclude .+ non-'
     scenarios = [
         {'include': 'object'},
         {'exclude': 'object'},
         {'include': int, 'exclude': 'object'},
     ]
     for kwargs in scenarios:
         with tm.assertRaisesRegexp(TypeError, pattern):
             frame.select_dtypes(**kwargs)
Example #12
0
    def deserialize(self, item, force_bytes_to_unicode=False):
        """Rebuild a DataFrame from a stored record array.

        Parameters
        ----------
        item : numpy structured/record array
            ``item.dtype.metadata`` must contain ``'index'`` (the names of
            the fields that form the index) and, for non-empty input,
            ``'columns'``. An optional ``'multi_column'`` entry (a mapping
            with ``'values'`` and ``'names'``) describes MultiIndex columns.
        force_bytes_to_unicode : bool, default False
            When true, ``bytes`` values found in object columns, in the
            index and in the column labels are converted to unicode.

        Returns
        -------
        DataFrame
        """
        index = self._index_from_records(item)
        metadata = item.dtype.metadata
        index_fields = metadata['index']
        column_fields = [x for x in item.dtype.names if x not in index_fields]
        multi_column = metadata.get('multi_column')

        if len(item) == 0:
            # No rows: build an empty frame that still carries the schema.
            rdata = item[column_fields] if len(column_fields) > 0 else None
            if multi_column is not None:
                columns = MultiIndex.from_arrays(multi_column["values"],
                                                 names=multi_column["names"])
                return DataFrame(rdata, index=index, columns=columns)
            return DataFrame(rdata, index=index)

        columns = metadata['columns']
        df = DataFrame(data=item[column_fields], index=index, columns=columns)

        if multi_column is not None:
            df.columns = MultiIndex.from_arrays(multi_column["values"],
                                                names=multi_column["names"])

        if force_bytes_to_unicode:
            # A py2 'str' read back under py3 is 'bytes', which breaks the
            # workflow of people migrating to py3:
            # https://github.com/manahl/arctic/issues/598
            # This is not meant for the normal flow; write unicode strings
            # instead if you want to work with str in py3.
            for c in df.select_dtypes(object):
                # astype is deliberately avoided for data columns: a str
                # conversion without an encoding turns b'abc' into
                # u"b'abc'" (the b prefix is kept). Decoding with an
                # explicit encoding gives "abc".
                # The frame is non-empty here, so .iloc[0] is safe.
                if isinstance(df[c].iloc[0], bytes):
                    df[c] = df[c].str.decode('utf-8')

            if isinstance(df.index, MultiIndex):
                # MultiIndex requires a conversion at each level.
                unicode_indexes = []
                for level in range(df.index.nlevels):
                    _index = df.index.get_level_values(level)
                    if isinstance(_index[0], bytes):
                        _index = _index.astype('unicode')
                    unicode_indexes.append(_index)
                # Rebuild explicitly so the level names are carried over.
                df.index = MultiIndex.from_arrays(unicode_indexes,
                                                  names=df.index.names)
            elif isinstance(df.index[0], bytes):
                df.index = df.index.astype('unicode')

            # Convert the column labels themselves (the original assigned
            # the converted *index* here). Guard against a frame that has
            # rows but no data columns.
            if len(df.columns) > 0 and isinstance(df.columns[0], bytes):
                df.columns = df.columns.astype('unicode')

        return df
Example #13
0
    def test_select_dtypes_include(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                                           tz='US/Eastern'),
                        'i': pd.date_range('20130101', periods=3,
                                           tz='CET'),
                        'j': pd.period_range('2013-01', periods=3,
                                             freq='M'),
                        'k': pd.timedelta_range('1 day', periods=3)})

        ri = df.select_dtypes(include=[np.number])
        ei = df[['b', 'c', 'd', 'k']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
        ei = df[['b', 'c', 'd']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'],
                              exclude=['timedelta'])
        ei = df[['b', 'c', 'd', 'f']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime64'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetimetz'])
        ei = df[['h', 'i']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['timedelta'])
        ei = df[['k']]
        assert_frame_equal(ri, ei)

        self.assertRaises(NotImplementedError,
                          lambda: df.select_dtypes(include=['period']))
Example #14
0
    def test_select_dtypes_duplicate_columns(self):
        """``select_dtypes`` on a frame whose labels repeat (GH20839)."""
        # GH20839
        pairs = [
            ('a', list('abc')),
            ('b', list(range(1, 4))),
            ('c', np.arange(3, 6).astype('u1')),
            ('d', np.arange(4.0, 7.0, dtype='float64')),
            ('e', [True, False, True]),
            ('f', pd.date_range('now', periods=3).values),
        ]
        frame = DataFrame(compat.OrderedDict(pairs))
        # Relabel so that several columns share the same name.
        frame.columns = ['a', 'a', 'b', 'b', 'b', 'c']

        expected = DataFrame({'a': list(range(1, 4)),
                              'b': np.arange(3, 6).astype('u1')})

        result = frame.select_dtypes(include=[np.number],
                                     exclude=['floating'])
        assert_frame_equal(result, expected)
Example #15
0
 def remove_discrete_variables_with_too_many_states(df: pd.DataFrame, num_states=30):
     """Drop object-dtype columns that have ``num_states`` or more distinct values.

     Parameters
     ----------
     df : pd.DataFrame
         Input frame; it is not modified.
     num_states : int, default 30
         An object column is removed when its number of unique values is
         greater than or equal to this threshold.

     Returns
     -------
     pd.DataFrame
         The remaining columns, in their original order. (The previous
         implementation went through ``set`` arithmetic, so the order of
         the returned columns was arbitrary.)
     """
     object_columns = df.select_dtypes(include=['object'])
     too_many = {
         name for name, values in object_columns.items()
         if len(values.unique()) >= num_states
     }
     # A boolean mask keeps the original column order and also copes with
     # repeated column labels.
     keep_mask = ~df.columns.isin(list(too_many))
     return df.loc[:, keep_mask]
Example #16
0
 def test_select_dtypes_empty(self):
     """Calling ``select_dtypes`` with no arguments raises ``ValueError``."""
     frame = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
     pattern = 'at least one of include or exclude must be nonempty'
     with tm.assert_raises_regex(ValueError, pattern):
         frame.select_dtypes()
Example #17
0
 def test_select_dtypes_empty(self):
     """Calling ``select_dtypes`` with no arguments raises ``ValueError``."""
     columns = {'a': list('abc'), 'b': list(range(1, 4))}
     frame = DataFrame(columns)
     with pytest.raises(
             ValueError,
             match='at least one of include or exclude must be nonempty'):
         frame.select_dtypes()
Example #18
0
class DataWorker(object):
    """Hold a pandas DataFrame and offer a few cleaning helpers on it."""

    @staticmethod
    def feat_value2int(series):
        """Map every distinct value of *series* to an integer code.

        Codes are assigned by enumerating ``np.unique(series)``, i.e. in
        the sorted order of the distinct values.

        Declared as a static method so it can be called on the class
        (as before) and also on an instance.
        """
        return {value: code for code, value in enumerate(np.unique(series))}

    def __init__(self, data=None):
        """
        Init DataWorker with a pandas.DataFrame (a copy is stored).
        Otherwise make sure that the data can be transformed to a DataFrame.
        ``data=None`` produces an empty DataFrame.
        """
        if isinstance(data, DataFrame):
            self.__data = data.copy()
        else:
            # DataFrame(None) is an empty frame, so None needs no special
            # case (the original assigned ``{}`` and then overwrote it).
            self.__data = DataFrame(data)

        self.__featureDict = None

    @property
    def featureDict(self):
        """Mapping ``{column: {value: int}}`` for the object-dtype columns.

        Returns the mapping stored through the setter when there is one;
        otherwise it is computed from the current data.

        NOTE(review): the original getter discarded the result of
        ``select_dtypes`` and always returned None; building the mapping
        with ``feat_value2int`` is the presumed intent - confirm.
        """
        if self.__featureDict is not None:
            return self.__featureDict
        object_columns = self.__data.select_dtypes(include=['object'])
        return {name: self.feat_value2int(column)
                for name, column in object_columns.items()}

    @featureDict.setter
    def featureDict(self, value):
        # The original setter silently ignored the value.
        self.__featureDict = value

    @property
    def data(self):
        """The wrapped DataFrame."""
        return self.__data

    @data.setter
    def data(self, df):
        # Assign the private attribute; assigning ``self.data`` here would
        # re-enter this setter and recurse without end.
        self.__data = df

    def getColNamesWithNan(self):
        """Return the names of the columns that contain at least one null."""
        has_null = self.__data.isnull().any()
        return [name for name, flag in has_null.items() if flag]

    def dataClean(self, transDict=None, fillna=None, yCol=-1):
        """
        transDict: accepted for compatibility; currently not used.
        yCol: the column you want to predict; -1 means "leave as is",
            any other value is forwarded to ``cd.change_yCol``.
        fillna:
            {column: method_name} dictionary
            default: {'all': 'most_frequent'}
            provided functions are: 'most_frequent', 'mean', 'median',
            'first_n_frequent,n' (where the last n is a number)
            when key == 'all': fill every column that includes NA with the
            same function; this key is suggested to be put at the end
        """
        if fillna is None:
            # Built per call so no dict is shared between invocations.
            fillna = {'all': 'most_frequent'}

        # try to map all data to numeric
        self.__data = cd.fillna(self.__data, fillna)

        if yCol != -1:
            self.__data = cd.change_yCol(self.__data, yCol)

    # The three placeholders below take no arguments; as static methods
    # they stay callable on the class and become callable on instances.
    @staticmethod
    def algorithmUsing():
        pass

    @staticmethod
    def showFeagure():
        pass

    @staticmethod
    def getResult():
        pass