def test_rank(self):
    """Compare ``DataFrame.rank`` on both axes with ``scipy.stats.rankdata``.

    NaN entries of the frame are expected to stay NaN in the ranks; a second
    part checks that integer frames rank like their float counterparts.
    """
    tm._skip_if_no_scipy()
    from scipy.stats import rankdata

    # Knock out every 2nd / 3rd / 4th / 5th value of columns A..D.
    for label, step in (('A', 2), ('B', 3), ('C', 4), ('D', 5)):
        self.frame[label][::step] = np.nan

    ranks_axis0 = self.frame.rank()
    ranks_axis1 = self.frame.rank(1)

    nan_mask = np.isnan(self.frame.values)
    # Replace NaN by +inf so rankdata can order everything; the affected
    # cells are reset to NaN in the expectations below.
    filled = self.frame.fillna(np.inf).values

    expected_axis0 = np.apply_along_axis(rankdata, 0, filled)
    expected_axis0[nan_mask] = np.nan

    expected_axis1 = np.apply_along_axis(rankdata, 1, filled)
    expected_axis1[nan_mask] = np.nan

    tm.assert_almost_equal(ranks_axis0.values, expected_axis0)
    tm.assert_almost_equal(ranks_axis1.values, expected_axis1)

    # integers
    int_frame = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
    as_float = int_frame.astype(float)

    tm.assert_frame_equal(int_frame.rank(), as_float.rank())
    tm.assert_frame_equal(int_frame.rank(1), as_float.rank(1))
def test_rank(self, float_frame):
    """Compare ``DataFrame.rank`` on both axes with ``scipy.stats.rankdata``.

    Parameters
    ----------
    float_frame : DataFrame
        Fixture frame; the test reads and overwrites columns 'A'..'D'.
    """
    # importorskip() imports *modules*.  'scipy.stats.rankdata' is a function,
    # so asking for it as a module always fails and the test would be skipped
    # unconditionally.  Import the module and take the attribute instead.
    sp_stats = pytest.importorskip('scipy.stats')
    rankdata = sp_stats.rankdata

    # Knock out every 2nd / 3rd / 4th / 5th value of columns A..D.  A single
    # .loc call writes into the frame itself instead of relying on chained
    # indexing (``frame[col][::n] = ...``) hitting a view.
    for label, step in (('A', 2), ('B', 3), ('C', 4), ('D', 5)):
        float_frame.loc[::step, label] = np.nan

    ranks0 = float_frame.rank()
    ranks1 = float_frame.rank(1)

    mask = np.isnan(float_frame.values)
    # NaN -> +inf so rankdata can order everything; masked cells are reset
    # to NaN in the expectations below.
    fvals = float_frame.fillna(np.inf).values

    exp0 = np.apply_along_axis(rankdata, 0, fvals)
    exp0[mask] = np.nan

    exp1 = np.apply_along_axis(rankdata, 1, fvals)
    exp1[mask] = np.nan

    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # integers: ranking ints must give the same result as ranking floats
    df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

    result = df.rank()
    exp = df.astype(float).rank()
    tm.assert_frame_equal(result, exp)

    result = df.rank(1)
    exp = df.astype(float).rank(1)
    tm.assert_frame_equal(result, exp)
def test_astype_categoricaldtype_class_raises(self, cls):
    """``astype`` must reject ``cls`` itself (the class, not an instance)."""
    frame = DataFrame({"A": ['a', 'a', 'b', 'c']})
    pattern = "Expected an instance of {}".format(cls.__name__)

    # frame-level cast through a column -> dtype mapping
    with tm.assert_raises_regex(TypeError, pattern):
        frame.astype({"A": cls})

    # series-level cast
    with tm.assert_raises_regex(TypeError, pattern):
        frame['A'].astype(cls)
def test_astype_dict_like(self, dtype_class):
    """Exercise ``DataFrame.astype`` with a column -> dtype mapping.

    ``dtype_class`` is the mapping type under test; it is called with a plain
    dict to build every dtype specification.  After each cast the source
    frame is compared with a deep copy to make sure it was not modified.

    GH7271 & GH16717
    """
    a = Series(date_range('2010-01-04', periods=5))
    b = Series(range(5))
    c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
    d = Series(['1.0', '2', '3.14', '4', '5.4'])
    df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
    original = df.copy(deep=True)

    # change type of a subset of columns
    dt1 = dtype_class({'b': 'str', 'd': 'float32'})
    result = df.astype(dt1)
    expected = DataFrame({
        'a': a,
        'b': Series(['0', '1', '2', '3', '4']),
        'c': c,
        'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
    assert_frame_equal(result, expected)
    assert_frame_equal(df, original)

    dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
    result = df.astype(dt2)
    expected = DataFrame({
        'a': a,
        'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
        'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
        'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
    assert_frame_equal(result, expected)
    assert_frame_equal(df, original)

    # change all columns
    dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
    assert_frame_equal(df.astype(dt3), df.astype(str))
    assert_frame_equal(df, original)

    # error should be raised when using something other than column labels
    # in the keys of the dtype dict
    dt4 = dtype_class({'b': str, 2: str})
    dt5 = dtype_class({'e': str})
    with pytest.raises(KeyError):
        df.astype(dt4)
    with pytest.raises(KeyError):
        df.astype(dt5)
    assert_frame_equal(df, original)

    # if the dtypes provided are the same as the original dtypes, the
    # resulting DataFrame should be the same as the original DataFrame
    dt6 = dtype_class({col: df[col].dtype for col in df.columns})
    equiv = df.astype(dt6)
    assert_frame_equal(df, equiv)
    assert_frame_equal(df, original)

    # GH 16717
    # if dtypes provided is empty, the resulting DataFrame
    # should be the same as the original DataFrame
    dt7 = dtype_class({})
    result = df.astype(dt7)
    # Compare the frame produced by the empty mapping; the previous version
    # re-checked ``equiv`` here and never looked at ``result``.
    assert_frame_equal(df, result)
    assert_frame_equal(df, original)
def test_arg_for_errors_in_astype(self):
    """``errors`` must be a recognised string; a bool raises ValueError.

    issue #14878
    """
    frame = DataFrame([1, 2, 3])

    with pytest.raises(ValueError):
        frame.astype(np.float64, errors=True)

    # a valid ``errors`` value is accepted without raising
    frame.astype(np.int8, errors='ignore')
def test_astype_cast_nan_inf_int(self, val, dtype):
    """Casting a frame holding ``val`` to ``dtype`` raises ValueError.

    see gh-14265: NaN and inf cannot be converted to an integer dtype.
    """
    frame = DataFrame([val])
    expected_msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"

    with tm.assert_raises_regex(ValueError, expected_msg):
        frame.astype(dtype)
def test_astype_cast_nan_inf_int(self):
    """NaN and inf raise ValueError when cast to int32 / int64 (GH14265)."""
    msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer'

    # every (integer dtype, non-finite value) combination, dtype-major order
    cases = [(int_type, bad_value)
             for int_type in [np.int32, np.int64]
             for bad_value in [np.nan, np.inf]]

    for int_type, bad_value in cases:
        frame = DataFrame([bad_value])
        with tm.assert_raises_regex(ValueError, msg):
            frame.astype(int_type)
def test_astype_to_incorrect_datetimelike(self, unit):
    """Casting datetime64 <-> timedelta64 (same unit) raises TypeError.

    gh-19224
    """
    datetime_dtype = "M8[{}]".format(unit)
    timedelta_dtype = "m8[{}]".format(unit)

    # first M -> m, then m -> M
    directions = ((datetime_dtype, timedelta_dtype),
                  (timedelta_dtype, datetime_dtype))
    for source, target in directions:
        frame = DataFrame(np.array([[1, 2, 3]], dtype=source))
        with pytest.raises(TypeError):
            frame.astype(target)
def _get_culled_matrix(least_num_seg: int, dtm_data_frame: pd.DataFrame) -> pd.DataFrame: """Get the culled final_matrix and culled words. Gives a matrix that only contains the words that appears in more than `least_num_seg` segments. :param least_num_seg: least number of segment the word needs to appear in to be kept. :param dtm_data_frame: the dtm in forms of panda data frames. the indices(rows) are segment names the columns are words. :return: the culled dtm data frame """ # create a bool matrix to indicate whether a word is in a segment # at the line of segment s and the column of word w, # if the value is True, then means w is in s # otherwise means w is not in s is_in_data_frame = dtm_data_frame.astype(bool) # summing the boolean array gives an int, which indicates how many # True there are in that array. # this is an series, indicating each word is in how many segments # this array is a parallel array of words # noinspection PyUnresolvedReferences words_in_num_seg_series = is_in_data_frame.sum(axis=0) # get the index of all the words needs to remain # this is an array of int dtm_data_frame = dtm_data_frame.loc[ :, # select all rows (row indexer) words_in_num_seg_series >= least_num_seg # col indexer ] return dtm_data_frame
def components(self):
    """
    Return a dataframe of the components (days, hours, minutes,
    seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.

    Rows follow the iteration order of ``self``.  When ``self.hasnans`` is
    true, null elements produce an all-NaN row; otherwise the result is cast
    to int64.

    Returns
    -------
    a DataFrame
    """
    from pandas import DataFrame

    columns = ['days', 'hours', 'minutes', 'seconds',
               'milliseconds', 'microseconds', 'nanoseconds']
    hasnans = self.hasnans
    if hasnans:
        def f(x):
            # null elements have no components; emit a row of NaN instead
            if isnull(x):
                return [np.nan] * len(columns)
            return x.components
    else:
        def f(x):
            return x.components

    # Hand the labels to the constructor: assigning ``result.columns``
    # afterwards fails with a length mismatch when ``self`` is empty, because
    # the frame is then built with zero columns.
    result = DataFrame([f(x) for x in self], columns=columns)
    if not hasnans:
        result = result.astype('int64')
    return result
def test_astype_categorical(self, dtype): # GH 18099 d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')} df = DataFrame(d) result = df.astype(dtype) expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) tm.assert_frame_equal(result, expected)
def slice(unit, trialData, sort_by = None, show = False):
    """Compute per-trial spike rates of ``unit`` in six consecutive windows.

    For every row of ``trialData`` the values stored under ``unit.id`` are
    histogrammed into: ('PG in', 'PG out'), three equal thirds of
    ('PG out', 'C in'), ('C in', 'C out') and ('C out', 'FG in').  Each count
    is divided by the width of its window.

    Parameters
    ----------
    unit : object with an ``id`` attribute naming a column of ``trialData``
    trialData : DataFrame
        Must contain the columns 'PG in', 'PG out', 'C in', 'C out', 'FG in'
        and ``unit.id``.
    sort_by : column label, optional
        When it names a column of ``trialData`` the trials are processed (and
        returned) sorted by that column.
    show : bool
        When true the rates are also drawn with ``plt.imshow``.

    Returns
    -------
    DataFrame
        One row per trial (same index as the possibly sorted input), columns
        0..5 in the window order given above.
    """
    data = trialData
    if sort_by in trialData.columns:
        # DataFrame.sort(columns=...) no longer exists; sort_values is the
        # replacement with the same ordering.
        data = trialData.sort_values(by=sort_by)

    rates = DataFrame(index=data.index, columns=range(6))

    for ind, row in data.iterrows():
        spikes = row[unit.id]
        pg = (row['PG in'], row['PG out'])
        delay = (row['PG out'], row['C in'])
        cent = (row['C in'], row['C out'])
        fg = (row['C out'], row['FG in'])

        counts = np.concatenate([
            np.histogram(spikes, bins=1, range=pg)[0],
            np.histogram(spikes, bins=3, range=delay)[0],
            np.histogram(spikes, bins=1, range=cent)[0],
            np.histogram(spikes, bins=1, range=fg)[0],
        ])

        # window widths, matching the six histogram bins above
        third = (delay[1] - delay[0]) / 3.0
        diffs = [pg[1] - pg[0], third, third, third,
                 cent[1] - cent[0], fg[1] - fg[0]]

        # .ix has been removed from pandas; ``ind`` is an index label, so
        # label-based .loc is the matching accessor.
        rates.loc[ind] = counts / diffs

    if show:
        plt.imshow(rates.astype(float), aspect='auto',
                   interpolation='nearest', extent=[0, 5, 0, len(rates)])

    return rates
def clean(numpy_array):
    """Preprocess ``numpy_array`` and min-max normalise every column.

    The input is passed through ``ut.preprocessData``, cast to float16 and
    scaled column-wise with ``(X - Xmin) / (Xmax - Xmin)``.  Cells that come
    out as NaN are replaced by -1.

    Returns
    -------
    numpy.ndarray
        The normalised values.
    """
    data = ut.preprocessData(numpy_array)

    # numpy -> pandas, stored as half precision
    df = pd.DataFrame(data).astype('float16')

    # normalize data between [0,1] using X_norm = (X - Xmin) / (Xmax - Xmin)
    col_min = df.min()
    df_norm = (df - col_min) / (df.max() - col_min)
    df_norm = df_norm.fillna(-1)

    # DataFrame.as_matrix() has been removed from pandas; .values returns the
    # same ndarray and exists in every pandas version.
    return df_norm.values
def test_rank_methods_frame(self):
    """Check every ``DataFrame.rank`` tie method against scipy's rankdata.

    The comparison is repeated on both axes and on shifted / scaled copies
    of the same random data.
    """
    tm.skip_if_no_package('scipy', min_version='0.13',
                          app='scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    base = (np.random.randint(0, 21, (100, 26)) - 10.0) / 10.0
    labels = [chr(ord('z') - offset) for offset in range(base.shape[1])]

    # (pandas method, scipy method); only 'first' is spelled differently
    method_pairs = [('average', 'average'), ('min', 'min'), ('max', 'max'),
                    ('first', 'ordinal'), ('dense', 'dense')]

    for values in (base, base + 1e6, base * 1e-6):
        frame = DataFrame(values, columns=labels)
        for axis in (0, 1):
            for pandas_method, scipy_method in method_pairs:
                result = frame.rank(axis=axis, method=pandas_method)

                ranked = np.apply_along_axis(rankdata, axis, values,
                                             scipy_method)
                expected = DataFrame(ranked.astype(np.float64),
                                     columns=labels)
                if LooseVersion(scipy.__version__) >= '0.17.0':
                    expected = expected.astype('float64')

                tm.assert_frame_equal(result, expected)
def test_passing_dtype(self):
    """Round-trip a float frame through CSV while passing ``dtype``.

    see gh-6607
    """
    source = DataFrame(np.random.rand(5, 2).round(4), columns=list('AB'),
                       index=['1A', '1B', '1C', '1D', '1E'])

    with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
        source.to_csv(path)

        # see gh-3795: passing 'str' as the dtype
        parsed = self.read_csv(path, dtype=str, index_col=0)
        as_text = source.astype(str)
        tm.assert_frame_equal(parsed, as_text)

        # for parsing, interpret object as str
        parsed = self.read_csv(path, dtype=object, index_col=0)
        tm.assert_frame_equal(parsed, as_text)

        # we expect all object columns, so need to
        # convert to test for equivalence
        tm.assert_frame_equal(parsed.astype(float), source)

        # invalid dtype
        bad_dtypes = {'A': 'foo', 'B': 'float64'}
        self.assertRaises(TypeError, self.read_csv, path,
                          dtype=bad_dtypes, index_col=0)

    # see gh-12048: empty frame
    actual = self.read_csv(StringIO('A,B'), dtype=str)
    expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
    tm.assert_frame_equal(actual, expected)
def bool_frame_with_na():
    """
    Fixture for DataFrame of booleans with index of unique strings

    Columns are ['A', 'B', 'C', 'D']; some entries are missing

                    A      B      C      D
    zBZxY2IDGd  False  False  False  False
    IhBWBMWllt  False   True   True   True
    ctjdvZSR6R   True  False   True   True
    AVTujptmxb  False   True  False   True
    G9lrImrSWq  False  False  False   True
    sFFwdIUfz2    NaN    NaN    NaN    NaN
    s15ptEJnRb    NaN    NaN    NaN    NaN
    ...           ...    ...    ...    ...
    22PwkRJdat  False   True  False  False
    kfboQ3VeIK   True  False   True  False

    [30 rows x 4 columns]
    """
    df = DataFrame(tm.getSeriesData()) > 0
    # object dtype so the boolean columns can also hold NaN
    df = df.astype(object)

    # set some NAs
    # The slices are row/column *positions*.  The index and the columns hold
    # strings, so they must go through .iloc; integer slices passed to the
    # label-based .loc are not valid for a string axis.
    df.iloc[5:10] = np.nan
    df.iloc[15:20, -2:] = np.nan
    return df
def test_astype_str(self, text_dtype):
    """``astype(text_dtype)`` stringifies datetime-like and numeric columns.

    see gh-9757
    """
    a = Series(date_range("2010-01-04", periods=5))
    b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
    c = Series([Timedelta(x, unit="d") for x in range(5)])
    d = Series(range(5))
    e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

    df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

    # Datetime-like
    # Test str and unicode on Python 2.x and just str on Python 3.x
    result = df.astype(text_dtype)

    expected = DataFrame({
        "a": [text_dtype(Timestamp(x)._date_repr) for x in a._values],
        "b": [text_dtype(Timestamp(x)) for x in b._values],
        "c": [text_dtype(Timedelta(x)._repr_base(format="all"))
              for x in c._values],
        "d": [text_dtype(x) for x in d._values],
        "e": [text_dtype(x) for x in e._values],
    })

    assert_frame_equal(result, expected)
def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : boolean
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta data

    Notes
    -----
    The string `index` is treated as the marker for a name-less ``Index``:
    when the resulting single-level index carries that name it is reset to
    ``None``.  Therefore, intentional usage of `index` as the ``Index`` name
    is not supported.  Index names that all start with ``level_`` are reset
    to ``None`` as well.

    See also
    --------
    build_table_schema : inverse function
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    fields = table['schema']['fields']

    # keep the column order declared by the schema
    col_order = [field['name'] for field in fields]
    df = DataFrame(table['data'])[col_order]

    dtypes = {}
    for field in fields:
        dtypes[field['name']] = convert_json_field_to_pandas_type(field)

    # Cannot directly use as_type with timezone data on object; raise for now
    has_tz = any(str(dtype).startswith('datetime64[ns, ')
                 for dtype in dtypes.values())
    if has_tz:
        raise NotImplementedError('table="orient" can not yet read timezone '
                                  'data')

    # No ISO constructor for Timedelta as of yet, so need to raise
    if 'timedelta64' in dtypes.values():
        raise NotImplementedError('table="orient" can not yet read '
                                  'ISO-formatted Timedelta data')

    df = df.astype(dtypes).set_index(table['schema']['primaryKey'])

    index_names = df.index.names
    if len(index_names) == 1 and df.index.name == 'index':
        df.index.name = None
    elif all(name.startswith('level_') for name in index_names):
        df.index.names = [None] * len(index_names)

    return df
def test_hist_non_numerical_raises(self):
    """``DataFrame.hist`` raises ValueError when no column is numeric.

    gh-10444
    """
    df = DataFrame(np.random.rand(10, 2))
    # ``np.object`` was only an alias of the builtin and has been removed
    # from NumPy; the builtin ``object`` is the same dtype.
    df_o = df.astype(object)

    msg = "hist method requires numerical columns, nothing to plot."
    with pytest.raises(ValueError, match=msg):
        df_o.hist()
def test_astype_to_incorrect_datetimelike(self, unit):
    """Casting datetime64 <-> timedelta64 raises TypeError with a message.

    gh-19224
    """
    datetime_dtype = "M8[{}]".format(unit)
    timedelta_dtype = "m8[{}]".format(unit)

    # trying to astype a M to a m ...
    frame = DataFrame(np.array([[1, 2, 3]], dtype=datetime_dtype))
    pattern = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
               r" \[timedelta64\[{}\]\]").format(unit)
    with pytest.raises(TypeError, match=pattern):
        frame.astype(timedelta_dtype)

    # ... and vice-versa
    pattern = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
               r" \[datetime64\[{}\]\]").format(unit)
    frame = DataFrame(np.array([[1, 2, 3]], dtype=timedelta_dtype))
    with pytest.raises(TypeError, match=pattern):
        frame.astype(datetime_dtype)
def setup_cache(self):
    """Return a 1x1 frame holding ``1`` under 'int' and its float cast under 'float'."""
    int_frame = DataFrame([[1]])
    float_frame = int_frame.astype(float)
    return {'int': int_frame, 'float': float_frame}
def test_constant_drift(self):
    """``tp.compute_drift(self.steppers)`` yields x = frame number, y = 0."""
    n_frames = 10

    expected = DataFrame({'x': np.arange(n_frames),
                          'y': np.zeros(n_frames)})
    # the first row is not part of the expected drift
    expected = expected.iloc[1:].astype('float')
    expected.index.name = 'frame'
    expected.columns = ['x', 'y']

    actual = tp.compute_drift(self.steppers)
    assert_frame_equal(actual, expected)
def boxcoxtrans(str,list):
    """Pick a power-transform exponent for one CSV column and report it.

    Reads the columns ``list`` from the CSV ``str`` and works on the first of
    them.  Every exponent in -2, -1.8, ..., 2 except 0 is tried; the one whose
    transformed column gets the largest Shapiro-Wilk p-value is printed as
    the optimum.  The column transformed with ``(x**t - 1) / t`` and its
    Shapiro-Wilk result are printed as well.  Nothing is returned.

    Parameters
    ----------
    str : path or buffer accepted by ``pd.read_csv``
    list : column selection passed as ``usecols``
    """
    frame = DataFrame(pd.read_csv(str, usecols=list))
    # DataFrame.as_matrix() has been removed from pandas; .values is the
    # same 2-D ndarray.
    x = frame.astype(float).values

    # Shapiro-Wilk p-value per candidate exponent (0 is skipped, so the list
    # has one entry fewer than the grid).
    pvalues = []
    for j in np.linspace(-2, 2, num=21):
        if j != 0:
            powered = x ** j
            first_column = [row[0] for row in powered]
            pvalues.append(stats.shapiro(first_column)[1])

    # position of the first largest p-value
    best = max(pvalues)
    for i in range(0, len(pvalues)):
        if pvalues[i] == best:
            break

    # Map the position back to the exponent; from 0 upwards the grid is
    # shifted by one step because exponent 0 was left out above.
    t = (-2 + 0.2 * i)
    if t >= 0:
        t = (-2 + 0.2 * (i + 1))
    # print(...) with a single pre-formatted argument produces the same text
    # on Python 2 and Python 3.
    print('optimal lembda= %s' % (t,))

    transformed = ((x ** t) - 1) / t
    m = [row[0] for row in transformed]
    print(pd.DataFrame(m))

    k = stats.shapiro(m)
    print('shapiro test of trans column %s' % (k,))
def test_div(self):
    """Series division: zeros give inf/NaN and integers give float results."""
    # no longer do integer div for any ops, but deal with the 0's
    frame = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
    quotient = frame['first'] / frame['second']
    raw = frame['first'].values.astype(float) / frame['second'].values
    expected = Series(raw, dtype='float64')
    expected.iloc[0:3] = np.inf
    assert_series_equal(quotient, expected)

    quotient = frame['first'] / 0
    expected = Series(np.inf, index=frame.index, name='first')
    assert_series_equal(quotient, expected)

    frame = frame.astype('float64')
    quotient = frame['first'] / frame['second']
    expected = Series(frame['first'].values / frame['second'].values)
    assert_series_equal(quotient, expected)

    frame = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]})
    quotient = frame['first'] / frame['second']
    assert_series_equal(quotient, frame['first'].astype('float64'),
                        check_names=False)
    self.assertTrue(quotient.name is None)
    reverse = frame['second'] / frame['first']
    self.assertFalse(np.array_equal(quotient, reverse))

    # inf signing
    signed = Series([np.nan, 1., -1.])
    assert_series_equal(signed / 0, Series([np.nan, np.inf, -np.inf]))

    # float/integer issue
    # GH 7785
    frame = DataFrame({'first': (1, 0), 'second': (-0.01, -0.02)})
    expected = Series([-0.01, -np.inf])

    quotient = frame['second'].div(frame['first'])
    assert_series_equal(quotient, expected, check_names=False)

    quotient = frame['second'] / frame['first']
    assert_series_equal(quotient, expected)

    # GH 9144
    ints = Series([-1, 0, 1])

    quotient = 0 / ints
    assert_series_equal(quotient, Series([0.0, nan, 0.0]))

    quotient = ints / 0
    assert_series_equal(quotient, Series([-inf, nan, inf]))

    quotient = ints // 0
    assert_series_equal(quotient, Series([-inf, nan, inf]))
def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units from numeric origination # gh-19223 / gh-12425 dtype = "{}[{}]".format(dtype, unit) arr = np.array([[1, 2, 3]], dtype=arr_dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected)
def test_interp_inplace(self):
    """In-place ``interpolate`` on a column fills the gap in the frame."""
    source = DataFrame({'a': [1., 2., np.nan, 4.]})
    filled = DataFrame({'a': [1., 2., 3., 4.]})

    # plain in-place interpolation keeps the float dtype
    result = source.copy()
    result['a'].interpolate(inplace=True)
    assert_frame_equal(result, filled)

    # with downcast='infer' the filled column is expected as int64
    result = source.copy()
    result['a'].interpolate(inplace=True, downcast='infer')
    assert_frame_equal(result, filled.astype('int64'))
def test_astype_to_datetime_unit(self, unit): # tests all units from datetime origination # gh-19223 dtype = "M8[{}]".format(unit) arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected)
def acc_cont_table(predictions, names, true, print_flag=True):
    """Create Conditional Accuracy Tables as in:

    Combining Information Extraction Systems Using Voting and Stacked
    Generalization by Sigletos et al, 2005

    Parameters
    ----------
    predictions : sequence
        One entry per system; entries are handed pairwise to
        ``Pairwise_Tests``.
    names : sequence
        Label per system, used as both row and column labels of the table.
    true :
        Reference passed through to ``Pairwise_Tests`` unchanged.
    print_flag : bool
        When true the table is printed with two decimals.

    Returns
    -------
    DataFrame
        Square table with ones on the diagonal; cell (i, j) holds the 4th
        and cell (j, i) the 5th value returned by ``Pairwise_Tests`` for the
        pair (i, j) with i < j.
    """
    from numpy import eye
    from pandas import DataFrame

    n_systems = len(predictions)
    table = eye(n_systems)  # table initialization

    # ``range`` instead of ``xrange`` so the loop runs on Python 2 and 3
    for i in range(n_systems):
        for j in range(i + 1, n_systems):  # for each pair
            _, _, _, i_given_j, j_given_i = Pairwise_Tests(
                predictions[i], predictions[j], true, names[i], names[j])
            table[i, j] = i_given_j
            table[j, i] = j_given_i

    df = DataFrame(table, names, names)
    if print_flag:
        # print(...) with a single argument behaves the same on Python 2 and 3
        print(df.astype('float').to_string(
            float_format=lambda x: '%0.2f' % (x)))
    return df
def test_astype_to_timedelta_unit_ns(self, unit): # preserver the timedelta conversion # gh-19223 dtype = "m8[{}]".format(unit) arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected)
def test_astype_to_timedelta_unit(self, unit):
    """Frame ``astype`` to a timedelta64 unit is expected to yield floats.

    coerce to float
    gh-19223
    """
    target = "m8[{}]".format(unit)
    frame = DataFrame(np.array([[1, 2, 3]], dtype=target))

    result = frame.astype(target)
    # expectation: the NumPy cast of the frame's values, viewed as float
    expected = DataFrame(frame.values.astype(target).astype(float))

    tm.assert_frame_equal(result, expected)
def cast_dtypes(
    df: pd.DataFrame,
    dtype: Optional[Mapping[str, Union[type, str]]] = None,
    inplace=False,
) -> pd.DataFrame:
    """
    Cast data types for columns in dataframe, skip columns that doesn't exist.

    The following obsplus specific datatypes are supported:
        'ops_datetime' - call :func:`obsplus.utils.time.to_datetime64` on column
        'ops_timedelta` - call :func:`obsplus.utils.time.to_timedelta64` on column

    Note: this is different from pd.astype because it skips columns which
    don't exist.

    Parameters
    ----------
    df
        Dataframe
    dtype
        A dict of columns and datatypes. ``None`` (the default) is treated
        as an empty mapping, i.e. no column is cast.
    inplace
        If true perform operation in place. Not used when the dataframe is
        empty; that path always returns the result of ``df.astype``.
    """
    # The signature advertises None as default; without this guard
    # ``set(dtype)`` below raises TypeError for it.
    if dtype is None:
        dtype = {}
    # get overlapping columns and dtypes
    overlap = set(dtype) & set(df.columns)
    dtype_codes = {i: dtype[i] for i in overlap}
    # if the dataframe is empty and has columns use simple astype
    if df.empty and len(df.columns):
        dtypes = {i: OPS_DTYPES.get(v, v) for i, v in dtype_codes.items()}
        return df.astype(dtypes)
    # else create functions and apply to each column; ``y=v`` binds the
    # current dtype to each fallback lambda
    funcs = {
        i: OPS_DTYPE_FUNCS.get(v, lambda x, y=v: x.astype(y))
        for i, v in dtype_codes.items()
    }
    return apply_funcs_to_columns(df, funcs=funcs, inplace=inplace)
def post_processing(df: pd.DataFrame, include_spot_prices=False) -> pd.DataFrame:
    """Expand packed columns and add the derived per-row metrics.

    Adds the unpacked pool / token price (and optionally spot price)
    columns, converts dtypes, and then appends ``token_{x}_value``, ``tvl``,
    ``invariant`` and ``total_token_balances``.  The ``generated_fees_{x}``
    columns are converted to float64 last.
    """
    pool_columns = unpack_column_pool(df)
    token_price_columns = unpack_column_token_prices(df)
    df = df.assign(**pool_columns).assign(**token_price_columns)

    if include_spot_prices:
        df = df.assign(**unpack_column_spot_prices(df))

    # Convert change_datetime from str to datetime, other columns to float64
    df["change_datetime"] = pd.to_datetime(df["change_datetime"], utc=True)
    df = df.astype({"pool_shares": "float64", "swap_fee": "float64"})

    # Calculate token_{x}_value columns
    df = df.assign(**calc_token_x_value(df))

    symbols = assets_in_df(df)

    # Calculate TVL column: row-wise sum of all token values
    value_columns = [f'token_{s}_value' for s in symbols]
    df = df.assign(tvl=df[value_columns].sum(axis=1))

    # Calculate Invariant column: product over balance ** weight
    df['invariant'] = 1
    for symbol in symbols:
        balance = df[f'token_{symbol}_balance']
        weight = df[f'token_{symbol}_weight']
        df['invariant'] *= (balance ** weight)

    # Calculate total_token_balances
    balance_columns = [f'token_{s}_balance' for s in symbols]
    df = df.assign(total_token_balances=df[balance_columns].sum(axis=1))

    # Convert generated_fees_(token) columns from str or Decimal to float64
    for symbol in symbols:
        fee_column = f'generated_fees_{symbol}'
        df[fee_column] = df[fee_column].astype('float64')

    return df
def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
    """Combined aggregations on ordered categorical columns.

    GH27800
    """
    # create the result dataframe
    raw = {
        "nr": [1, 2, 3, 4, 5, 6, 7, 8],
        "cat_ord": list("aabbccdd"),
        "cat": list("aaaabbbb"),
    }
    input_df = DataFrame(raw).astype({"cat": "category",
                                      "cat_ord": "category"})
    input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
    result_df = input_df.groupby("cat").agg(grp_col_dict)

    # create expected dataframe
    cat_index = pd.CategoricalIndex(["a", "b"], categories=["a", "b"],
                                    ordered=False, name="cat",
                                    dtype="category")

    # One [column, aggregation] pair per requested aggregation; these pairs
    # become the column MultiIndex of the expected frame.
    column_pairs = []
    for column, aggs in grp_col_dict.items():
        agg_names = aggs if isinstance(aggs, list) else [aggs]
        column_pairs.extend([column, agg_name] for agg_name in agg_names)
    multi_index = MultiIndex.from_tuples(tuple(column_pairs))

    expected_df = DataFrame(data=exp_data, columns=multi_index,
                            index=cat_index)

    tm.assert_frame_equal(result_df, expected_df)
def components(self):
    """
    Return a dataframe of the components (days, hours, minutes,
    seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.

    Null elements become all-NaN rows; without any nulls the result is
    cast to int64.

    Returns
    -------
    a DataFrame
    """
    from pandas import DataFrame

    columns = [
        "days",
        "hours",
        "minutes",
        "seconds",
        "milliseconds",
        "microseconds",
        "nanoseconds",
    ]
    hasnans = self._hasnans

    if hasnans:
        nan_row = [np.nan] * len(columns)

        def f(x):
            # a fresh list per null element, like the components tuples
            return list(nan_row) if isna(x) else x.components

    else:

        def f(x):
            return x.components

    rows = [f(x) for x in self]
    result = DataFrame(rows, columns=columns)
    return result if hasnans else result.astype("int64")
def test_passing_dtype(self):
    """CSV round trip of a float frame with explicit ``dtype`` arguments.

    see gh-6607
    """
    row_labels = ['1A', '1B', '1C', '1D', '1E']
    frame = DataFrame(np.random.rand(5, 2).round(4),
                      columns=list('AB'), index=row_labels)

    with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
        frame.to_csv(path)

        # see gh-3795: passing 'str' as the dtype
        from_str = self.read_csv(path, dtype=str, index_col=0)
        text_frame = frame.astype(str)
        tm.assert_frame_equal(from_str, text_frame)

        # for parsing, interpret object as str
        from_object = self.read_csv(path, dtype=object, index_col=0)
        tm.assert_frame_equal(from_object, text_frame)

        # we expect all object columns, so need to
        # convert to test for equivalence
        back_to_float = from_object.astype(float)
        tm.assert_frame_equal(back_to_float, frame)

        # invalid dtype
        self.assertRaises(TypeError, self.read_csv, path,
                          dtype={'A': 'foo', 'B': 'float64'},
                          index_col=0)

    # see gh-12048: empty frame
    actual = self.read_csv(StringIO('A,B'), dtype=str)
    expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
    tm.assert_frame_equal(actual, expected)
def _generate_category_map(self, X: DataFrame) -> (DataFrame, dict):
    """Convert all columns of ``X`` to categoricals and record their categories.

    Returns a 3-tuple ``(X_category, category_map, fill_nan_map)``:

    * ``X_category`` - ``X`` cast to 'category', with the categories of each
      column filtered / ordered as configured on ``self``.
    * ``category_map`` - column -> deep copy of that column's categories.
    * ``fill_nan_map`` - column -> most frequent surviving category; filled
      only when ``self._fillna_flag`` is set and ``self._fillna == 'mode'``,
      and ``None`` when ``self._fillna_flag`` is false.

    When ``self.features_in`` is falsy an empty frame with the index of ``X``
    and two ``None`` values are returned instead.

    NOTE(review): the return annotation lists two elements but three values
    are returned - confirm and update the annotation.
    """
    if self.features_in:
        fill_nan_map = dict()
        category_map = dict()
        X_category = X.astype('category')
        for column in X_category:
            # value counts in ascending order: the most frequent category is last
            rank = X_category[column].value_counts().sort_values(ascending=True)
            if self._minimum_cat_count is not None:
                # drop categories seen fewer than the minimum number of times
                rank = rank[rank >= self._minimum_cat_count]
            if self._maximum_num_cat is not None:
                # keep only the tail, i.e. the most frequent categories
                rank = rank[-self._maximum_num_cat:]
            if self.cat_order == 'count' or self._minimum_cat_count is not None or self._maximum_num_cat is not None:
                category_list = list(rank.index)  # category_list in 'count' order
                if len(category_list) > 1:
                    if self.cat_order == 'original':
                        # restore the original category order, restricted to
                        # the categories that survived the filters above
                        original_cat_order = list(X_category[column].cat.categories)
                        set_category_list = set(category_list)
                        category_list = [cat for cat in original_cat_order if cat in set_category_list]
                    elif self.cat_order == 'alphanumeric':
                        category_list.sort()
                # re-cast with the restricted category set, then fix the order
                X_category[column] = X_category[column].astype(CategoricalDtype(categories=category_list))  # TODO: Remove columns if all NaN after this?
                X_category[column] = X_category[column].cat.reorder_categories(category_list)
            elif self.cat_order == 'alphanumeric':
                # no filtering requested: only sort the existing categories
                category_list = list(X_category[column].cat.categories)
                category_list.sort()
                X_category[column] = X_category[column].astype(CategoricalDtype(categories=category_list))
                X_category[column] = X_category[column].cat.reorder_categories(category_list)
            category_map[column] = copy.deepcopy(X_category[column].cat.categories)
            if self._fillna_flag:
                if self._fillna == 'mode':
                    if len(rank) > 0:
                        # last entry of the ascending counts = most frequent category
                        fill_nan_map[column] = list(rank.index)[-1]
        if not self._fillna_flag:
            fill_nan_map = None
        return X_category, category_map, fill_nan_map
    else:
        return DataFrame(index=X.index), None, None
def clean_dat(dat: pd.DataFrame, logger=None) -> pd.DataFrame:
    """Clean a numeric frame: cast, prune columns, impute and drop NaN rows.

    Steps, in order:

    1. cast to float32 and turn +/-inf into NaN;
    2. drop columns with fewer than half non-NaN values;
    3. drop columns in which no value differs from the first row;
    4. drop columns where more than 80% of the values are within 0.0001
       of zero;
    5. if NaN remain, fill them with the column mean (``SimpleImputer``);
    6. drop rows that still contain NaN.

    Parameters
    ----------
    dat : DataFrame
        Data to clean; must be castable to float32.
    logger : logging.Logger, optional
        Progress logger.  When omitted, ``logging.basicConfig`` is called
        with level INFO and a module logger is used.

    Returns
    -------
    DataFrame
        The cleaned data.
    """
    if logger is None:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        logger = logging.getLogger(__name__)

    logger.debug(
        'Clean Data, number of inf and nun are for dataset: (%d, %d)',
        (dat == np.inf).sum().sum(), dat.isna().sum().sum())

    logger.info(' -Set type to float32 at first && deal with inf.')
    dat = dat.astype(np.float32)
    dat = dat.replace([np.inf, -np.inf], np.nan)

    logger.info(' -Remove columns with half of nan')
    dat = dat.dropna(axis=1, thresh=dat.shape[0] * .5)

    logger.info(' -Remove costant columns')
    dat = dat.loc[:, (dat != dat.iloc[0]).any()]

    logger.info(' -Remove columns with too many so small numbers')
    for col in dat.columns:
        near_zero = (abs(dat[col] - 0.0) < 0.0001).sum()
        if near_zero / dat.shape[0] > 0.8:
            print(near_zero)
            dat.drop(col, axis=1, inplace=True)

    if dat.isna().sum().sum() > 0:
        logger.info(' -Start to fill the columns with nan')
        # imp = IterativeImputer(max_iter=10, random_state=0)
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        # dat = dat.fillna(dat.mean())
        tmp = imp.fit_transform(dat)
        # If the imputer returned a different number of columns, its output
        # no longer lines up with ``dat``; fall back to zero filling.
        if tmp.shape[1] != dat.shape[1]:
            tmp = dat.fillna(0)
        # ``dat.columns``: the previous spelling ``dat.coulumns`` raised
        # AttributeError whenever this branch ran.
        dat = pd.DataFrame(tmp, columns=dat.columns, index=dat.index)

    logger.info(' -Remove rows with any nan in the end')
    dat = dat.dropna(axis=0, how='any')

    logger.debug(
        'End with Data cleaning, number of inf and nun are for dataset: '
        '(%d, %d)',
        (dat == np.inf).sum().sum(), dat.isna().sum().sum())
    return dat
def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):
    """Single aggregations on ordered categorical columns.

    GH27800
    """
    # create the result dataframe
    raw = {
        "nr": [1, 2, 3, 4, 5, 6, 7, 8],
        "cat_ord": list("aabbccdd"),
        "cat": list("aaaabbbb"),
    }
    category_casts = {"cat": "category", "cat_ord": "category"}

    input_df = DataFrame(raw).astype(category_casts)
    input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
    result_df = input_df.groupby("cat").agg(grp_col_dict)

    # create expected dataframe
    cat_index = pd.CategoricalIndex(
        ["a", "b"],
        categories=["a", "b"],
        ordered=False,
        name="cat",
        dtype="category",
    )
    expected_df = DataFrame(data=exp_data, index=cat_index)

    tm.assert_frame_equal(result_df, expected_df)
def infer(X_test, save_dir):
    """Predict labels for ``X_test`` and write them to ``answer.csv``.

    The weights ``w`` and bias ``b`` are read with ``np.loadtxt`` from the
    files 'w' and 'b' inside ``save_dir``.  The prediction is
    ``round(sigmoid(X_test . w + b))``, stored as int64 in a 'prediction'
    column whose row ids start at 1.
    """
    test_data_size = len(X_test)  # not used further

    # load parameters
    print("====loading param====")
    weights = np.loadtxt(os.path.join(save_dir, 'w'))
    bias = np.loadtxt(os.path.join(save_dir, 'b'))

    # predict
    probabilities = sigmoid(np.dot(X_test, weights) + bias)
    labels = np.around(probabilities)

    answer = DataFrame(labels.T)
    answer.index = answer.index + 1  # ids are 1-based
    answer.columns = ['prediction']
    answer = answer.astype('int64')
    answer.to_csv('answer.csv', index_label='id')
def test_hist_non_numerical_or_datetime_raises(self):
    """``hist`` raises once every column has been cast to object dtype.

    gh-10444, GH32590
    """
    # bounds for the random int64 values fed to to_datetime
    low, high = 1582800000000000000, 1583500000000000000

    df = DataFrame(
        {
            "a": np.random.rand(10),
            "b": np.random.randint(0, 10, 10),
            "c": to_datetime(
                np.random.randint(low, high, 10, dtype=np.int64)
            ),
            "d": to_datetime(
                np.random.randint(low, high, 10, dtype=np.int64),
                utc=True,
            ),
        }
    )
    as_objects = df.astype(object)

    msg = "hist method requires numerical or datetime columns, nothing to plot."
    with pytest.raises(ValueError, match=msg):
        as_objects.hist()
def convert_labels(image: np.ndarray, df: pd.DataFrame,
                   pixel_size: Tuple[float, float]) -> pd.DataFrame:
    """Pre-processes labels to be used in deepBlink.

    Accepts Fiji point labels (columns X/Y) or a TrackMate export (columns
    POSITION_X/POSITION_Y) and returns a frame with the columns ``r`` (from
    Y) and ``c`` (from X), in that order.  Coordinates are limited to
    ``[0, image.shape]`` along the matching axis and then divided by the
    pixel size, given as ``(size_x, size_y)``.

    Raises:
        ValueError: if neither column pair is present.
    """
    columns = df.columns
    is_fiji = "X" in columns and "Y" in columns
    is_trackmate = "POSITION_X" in columns and "POSITION_Y" in columns

    if is_fiji:
        # Fiji point label format
        df = df.rename(columns={"X": "c", "Y": "r"})
        df = df[["r", "c"]]
    elif is_trackmate:
        # TrackMate export format
        # Remove unused headers for TrackMate v7.0.0+
        df = df[~df.index.isna()]
        df = df.rename(columns={"POSITION_X": "c", "POSITION_Y": "r"})
        df = df[["r", "c"]].reset_index(drop=True)
    else:
        raise ValueError(
            "Format of input labels not recognized. "
            "Requires X,Y or POSITION_X,POSITION_Y in columns. "
            f"Columns found are: {df.columns.to_list()}.")

    # Clip upper and lower bounds of coordinates
    df = df.astype({"r": np.float64, "c": np.float64})
    for name, upper in zip(["r", "c"], image.shape):
        coords = df[name]
        coords = coords.where(coords < upper, upper)
        df[name] = coords.where(coords > 0, 0)

    # Scale coordinates to pixel size
    size_x, size_y = pixel_size
    df["r"] = df["r"] / size_y
    df["c"] = df["c"] / size_x

    return df
def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
    """Concatenate an all-NaT Series (tz1) with a one-column frame (tz2).

    GH 12396
    """
    stamps = [
        Timestamp("2015/01/01", tz=tz2),
        Timestamp("2016/01/01", tz=tz2),
    ]

    first = Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
    second = DataFrame([[stamp] for stamp in stamps], index=[2, 3])

    expected = DataFrame([pd.NaT, pd.NaT] + stamps)
    if tz1 != tz2:
        # differing timezones: the expectation is an object column
        expected = expected.astype(object)

    result = pd.concat([first, second])
    tm.assert_frame_equal(result, expected)
def get_means(adata, mycat):
    """
    Calculates average and fraction expression per category in adata.obs

    Based on an AnnData object and an annotation category (e.g. louvain)
    returns average expression and fraction cells expressing gene per
    category.  As a side effect ``adata.obs[mycat]`` is converted to a
    categorical column.

    parameters
    ----------
    adata: AnnData
        an AnnData object
    mycat: str
        the category for summarisation (e.g. louvain, cell_names)

    returns
    -------
    average_obs
        average gene expression per category
    fraction_obs
        fraction cells expressing a gene per category

    Both are ``None`` when ``mycat`` is not a column of ``adata.obs``.
    """
    gene_ids = adata.raw.var.index.values
    try:
        # a missing column raises KeyError here and is reported below
        adata.obs[mycat] = adata.obs[mycat].astype('category')

        dense = adata.raw[:, gene_ids].X.toarray()
        obs = DataFrame(dense, columns=gene_ids, index=adata.obs[mycat])
        average_obs = obs.groupby(level=0).mean()

        # a cell "expresses" a gene when its value is non-zero
        expressed = obs.astype(bool).groupby(level=0)
        fraction_obs = expressed.sum() / expressed.count()
    except KeyError:
        print(
            "Oops! The adata object does not have the specified column. Options are: "
        )
        print(list(adata.obs.columns))
        average_obs = None
        fraction_obs = None
    return (average_obs, fraction_obs)
def bool_frame_with_na():
    """
    Fixture for DataFrame of booleans with index of unique strings

    Columns are ['A', 'B', 'C', 'D']; some entries are missing

                    A      B      C      D
    zBZxY2IDGd  False  False  False  False
    IhBWBMWllt  False   True   True   True
    ctjdvZSR6R   True  False   True   True
    AVTujptmxb  False   True  False   True
    G9lrImrSWq  False  False  False   True
    sFFwdIUfz2    NaN    NaN    NaN    NaN
    s15ptEJnRb    NaN    NaN    NaN    NaN
    ...           ...    ...    ...    ...
    UW41KkDyZ4   True   True  False  False
    l9l6XkOdqV   True  False  False  False
    X2MeZfzDYA  False   True  False  False
    xWkIKU7vfX  False   True  False   True
    QOhL6VmpGU  False  False  False   True
    22PwkRJdat  False   True  False  False
    kfboQ3VeIK   True  False   True  False

    [30 rows x 4 columns]
    """
    # object dtype so that True/False and NaN can live in the same column
    frame = (DataFrame(tm.getSeriesData()) > 0).astype(object)

    # set some NAs: five complete rows, then the last two columns of five more
    frame.iloc[5:10] = np.nan
    frame.iloc[15:20, -2:] = np.nan

    # For `any` tests we need to have at least one True before the first NaN
    # in each column
    for pos in range(4):
        frame.iloc[pos, pos] = True
    return frame
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    '''
    Casts the dataset's columns to ``self.dtypes`` to reduce memory consumption

    When ``self.verbose`` is set, the memory footprint before and after the
    cast and the relative saving are printed.

    Args:
        X: dataframe needs to be optimized

    Returns:
        Optimized dataframe
    '''
    def _megabytes(frame):
        # deep=True also counts the payload of object columns
        return frame.memory_usage(deep=True).sum() / 1024**2

    verbose = self.verbose
    if verbose:
        original_mem_size = _megabytes(X)
        print('MEM OPTIMIZER: Memory usage of dataframe: {:.2f} MB'.format(
            original_mem_size))

    X = X.astype(self.dtypes, copy=False)

    if verbose:
        new_mem_size = _megabytes(X)
        saved_percent = (100 * (original_mem_size - new_mem_size)
                         / original_mem_size)
        print('MEM OPTIMIZER: Memory usage after optimization: {:.2f} MB'.
              format(new_mem_size))
        print('MEM OPTIMIZER: Decreased by {:.1f}%'.format(saved_percent))
    return X
def test_first_multi_key_groupbby_categorical():
    # GH 22512
    # first() over two grouping keys must keep the categorical dtype of "D",
    # including the category that never comes first in any group.
    raw = {
        "A": [1, 1, 1, 2, 2],
        "B": [100, 100, 200, 100, 100],
        "C": ["apple", "orange", "mango", "mango", "orange"],
        "D": ["jupiter", "mercury", "mars", "venus", "venus"],
    }
    df = DataFrame(raw).astype({"D": "category"})
    result = df.groupby(by=["A", "B"]).first()

    planet_dtype = pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
    expected_d = Series(["jupiter", "mars", "venus"]).astype(planet_dtype)
    expected = DataFrame({"C": ["apple", "mango", "mango"], "D": expected_d})
    # The index is assigned afterwards so that the Series above is placed
    # positionally rather than aligned against the MultiIndex.
    expected.index = MultiIndex.from_tuples(
        [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
    )
    tm.assert_frame_equal(result, expected)
def add_jobapi_data(perf_df: pd.DataFrame):
    """Given a dataframe from PerformanceSummary.to_df, add QPID Job API data
    for the job.

    The frame must contain exactly one distinct ``job_number``. The Job API
    rows are prefixed with ``qpid_`` and merged onto ``perf_df`` by job and
    task number (pandas' default inner merge).

    This is best effort: on any failure (ambiguous job number, network or
    HTTP error, unexpected payload) a warning is logged and ``perf_df`` is
    returned unchanged.

    Job API reference:
    https://stash.ihme.washington.edu/projects/QPID/repos/job-db/browse/docs/index.md
    """
    try:
        job_numbers = perf_df["job_number"].unique()
        # Explicit check instead of `assert`: asserts vanish under `python -O`
        # and produce an empty message in the warning below.
        if len(job_numbers) != 1:
            raise ValueError(
                f"expected exactly one job_number, found {len(job_numbers)}")
        response = requests.get(
            "http://jobapi.ihme.washington.edu/fair/queryjobids",
            params=[("job_number", job_numbers[0]), ("limit", 50000)],
            # Without a timeout a stalled server would block the caller
            # indefinitely; the value is a generous per-read limit.
            timeout=60,
        )
        # Surface HTTP errors directly rather than as a confusing JSON/key
        # error further down.
        response.raise_for_status()
        jobapi_data = response.json()
        jobapi_df = pd.DataFrame(jobapi_data["data"]).add_prefix("qpid_")
        # Align the key dtypes with the API's integer ids before merging.
        perf_df = perf_df.astype({
            "job_number": np.int64,
            "task_number": np.int64
        })
        perf_df = perf_df.merge(
            jobapi_df,
            left_on=["job_number", "task_number"],
            right_on=["qpid_job_number", "qpid_task_number"],
        )
    except Exception as e:
        logger.warning(f"Job API request failed with {e}")
    return perf_df
def test_astype_str(self):
    # see GH#9757
    columns = {
        "a": Series(date_range("2010-01-04", periods=5)),
        "b": Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")),
        "c": Series([Timedelta(x, unit="d") for x in range(5)]),
        "d": Series(range(5)),
        "e": Series([0.0, 0.2, 0.4, 0.6, 0.8]),
    }
    df = DataFrame(columns)

    # Datetime-like
    result = df.astype(str)

    # Build the expected strings element by element from the scalar reprs.
    expected = DataFrame(
        {
            "a": [str(Timestamp(v)._date_repr) for v in columns["a"]._values],
            "b": [str(Timestamp(v)) for v in columns["b"]._values],
            "c": [Timedelta(v)._repr_base() for v in columns["c"]._values],
            "d": [str(v) for v in columns["d"]._values],
            "e": [str(v) for v in columns["e"]._values],
        }
    )
    tm.assert_frame_equal(result, expected)
def buildKB():
    """Build the TF-IDF knowledge base from the ``.txt`` files in ./knowbase.

    Every ``.txt`` file in the directory is read, passed through
    ``preprocess`` and fed to a fresh ``TfidfVectorizer``. The fitted
    vectorizer, the resulting matrix and the list of file names are stored in
    the module globals ``tfidf1``, ``tfs1`` and ``processed_title``; the dense
    feature matrix is also written to ``fv.csv``.
    """
    global tfidf1, tfs1, processed_title
    print('started building the knowledge base')
    directory = './knowbase'
    processed_text = []
    processed_title = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        fullname = os.path.join(directory, filename)
        print('Processing file ', fullname)
        # `with` closes the handle; the previous bare open() leaked one file
        # descriptor per document.
        with open(fullname, "r") as f:
            text1 = f.read()
        processed_text.append(preprocess(text1))
        processed_title.append(filename)
    tfidf1 = TfidfVectorizer()
    tfs1 = tfidf1.fit_transform(processed_text)
    print('!!!!!!!Feature vector is writen to a csv file')
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and fall back for older releases.
    if hasattr(tfidf1, "get_feature_names_out"):
        feature_names = tfidf1.get_feature_names_out()
    else:
        feature_names = tfidf1.get_feature_names()
    # toarray() is the supported spelling of the removed `.A` shorthand.
    dfReviews = DataFrame(tfs1.toarray(), columns=feature_names)
    dfReviews = dfReviews.astype(float)
    dfReviews.to_csv("fv.csv")
    print('knowledge base is built')
def get_maximal_cliques(self, delta=0, direction='both'):
    """Compute maximal cliques after giving every link a duration of ``delta``.

    A copy of ``self.df`` (without the weight column ``w`` when the stream is
    weighted) is prepared as follows:

    * ``delta != 0``: ``ts`` is moved back by ``delta / 2`` and ``tf`` is set
      to ``ts + delta``; both are then clipped to the time range observed
      before the shift.
    * ``delta == 0``: ``tf`` is set equal to ``ts``.

    For discrete streams ``ts``/``tf`` are cast to ``int``; otherwise the
    boolean columns ``s`` and ``f`` are added and set to ``True``.
    The work is then delegated to ``TemporalLinkSetDF.get_maximal_cliques``,
    to which ``direction`` is forwarded unchanged.
    """
    df = DF((self.df.drop(columns='w') if self.weighted else self.df.copy()))
    di = (delta == .0)
    if not di:
        if self.instantaneous:
            min_time, max_time = df.ts.min(), df.ts.max()
        else:
            min_time, max_time = df.ts.min(), df.tf.max()
        # apply the delta
        df['ts'] -= delta / 2.0
        df['tf'] = df['ts'] + delta
        # and clip to the start and finish of time.
        # Assign the clipped column back: `df[col].clip(..., inplace=True)`
        # acts on a temporary and leaves `df` untouched under pandas
        # Copy-on-Write.
        df['ts'] = df['ts'].clip(lower=min_time)
        df['tf'] = df['tf'].clip(upper=max_time)
    else:
        df['tf'] = df['ts']

    if self.discrete:
        df = df.astype({'ts': int, 'tf': int})
    else:
        df['s'] = True
        df['f'] = True

    return TemporalLinkSetDF(df,
                             disjoint_intervals=(di and not self.discrete),
                             discrete=self.discrete,
                             weighted=False).get_maximal_cliques(direction=direction)
def import_dataframe(input_dataframe: pandas.DataFrame,
                     text: str,
                     unique_id: str = None,
                     time: str = None,
                     twitter_times: bool = False,
                     columns_to_keep: List = []):
    """Imports a pandas dataframe into nate.

    Args:
        input_dataframe (pandas.DataFrame): The dataframe to be loaded.
        text (str): The name of the column containing the text data to be
            analyzed with nate. Required for all uses of nate.
        unique_id (str, optional): The name of the column containing unique
            identifiers (e.g. a unique name or hash ID#). Required for some
            uses of nate (e.g. Divsim).
        time (str, optional): The name of the column containing the time the
            observation was recorded. Required for some uses of nate
            (e.g. edge_burst).
        twitter_times (bool, optional): When true, the time column is not
            parsed here and is handed to `process_dataframe` as-is together
            with this flag.
        columns_to_keep (list, optional): A list of column names indicating
            which columns not specified elsewhere (e.g. for the time
            parameter) are kept.

    Returns:
        Nate: an instance of the `Nate` class containing all data from the
        columns specified in the parameters.

    The columns indicated in the text, unique_id, and time parameters will be
    renamed to 'text', 'unique_id', and 'time', accordingly. The names of the
    columns listed in 'columns_to_keep' will be preserved as-is.
    """
    if time is not None and not twitter_times:
        # Parse via string so mixed or numeric inputs go through the same
        # datetime inference. astype returns a new frame, so the caller's
        # dataframe is not modified.
        input_dataframe = input_dataframe.astype({time: 'str'})
        input_dataframe[time] = pandas.to_datetime(
            input_dataframe[time], infer_datetime_format=True)

    # Hand downstream code its own list so the shared default argument (or
    # the caller's list) can never be mutated across calls.
    if columns_to_keep is not None:
        columns_to_keep = list(columns_to_keep)

    return process_dataframe(input_dataframe, text, unique_id, time,
                             twitter_times, columns_to_keep)
def read_dataframe(
        self,
        nodes: pd.DataFrame,
        attrs: Optional[Dict[str, Any]] = None) -> 'core.TreeNeuron':
    """Build a TreeNeuron from an SWC-like node table.

    Parameters
    ----------
    nodes :     pandas.DataFrame
    attrs :     dict or None
                Arbitrary attributes to include in the TreeNeuron

    Returns
    -------
    core.TreeNeuron
    """
    # Cast to the reader's dtypes; errors='ignore' suppresses conversion
    # failures instead of raising.
    typed_nodes = nodes.astype(self._dtypes, errors='ignore', copy=False)
    clean_nodes = sanitise_nodes(typed_nodes)

    # Connectors are extracted from the table as passed in, not from the
    # sanitised copy.
    connectors = self._extract_connectors(nodes)

    defaults = {'name': 'SWC', 'origin': 'DataFrame'}
    attributes = self._make_attributes(defaults, attrs)

    return core.TreeNeuron(clean_nodes, connectors=connectors, **attributes)
def test_rank_methods_frame(self):
    # Compare DataFrame.rank against scipy.stats.rankdata for every ranking
    # method, along both axes and at three value magnitudes.
    #
    # importorskip() imports a *module*. 'scipy.stats.rankdata' is a function
    # and 'scipy.stats.special' does not exist, so the previous calls made
    # this test skip unconditionally.
    sp_stats = pytest.importorskip('scipy.stats')
    rankdata = sp_stats.rankdata

    xs = np.random.randint(0, 21, (100, 26))
    xs = (xs - 10.0) / 10.0
    cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        df = DataFrame(vals, columns=cols)

        for ax in [0, 1]:
            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = df.rank(axis=ax, method=m)
                # scipy names pandas' 'first' tie-breaking 'ordinal'
                sprank = np.apply_along_axis(
                    rankdata, ax, vals,
                    m if m != 'first' else 'ordinal')
                # Cast once here; the former scipy-version-gated second cast
                # to float64 was a no-op.
                sprank = sprank.astype(np.float64)
                expected = DataFrame(sprank, columns=cols)
                tm.assert_frame_equal(result, expected)
def __init__(self, words_df: pd.DataFrame):
    """
    Parameters
    ----------
    words_df : pd.DataFrame
        the expected columns as 'Word', 'Pronunciation',
        'Pronunciation_with_accents', 'Definition', 'Occurence'
        with a 'Id' index. The 'Word' and 'HSK_Level' columns are read
        here to build the lookup structures.
    """
    # so that fields are in native Python type
    self.words_df = words_df.astype(object)
    self.char_to_words = build_char_to_words(words_df["Word"])

    logger.debug("Get HSK index lists")
    # each list contains the words of that level and below
    # ie. 4 : [all words of level 4 and below]
    hsk_levels = words_df["HSK_Level"]
    top_level = hsk_levels.max()
    self.hsk_to_idx = defaultdict(list)
    for position, word_level in enumerate(hsk_levels):
        # a word is listed under its own level and every level above it
        for level in range(word_level, top_level + 1):
            self.hsk_to_idx[level].append(position)
def _transform(self, X: DataFrame) -> DataFrame:
    """Cast ``X`` to the dtypes in ``self._type_map_real_opt``.

    Boolean features are first rewritten in place on ``X`` as int8 flags
    (1 where the value equals the stored reference value, else 0). If the
    resulting dtypes already equal the stored map, ``X`` is returned as is.
    Otherwise nulls in the int features are filled with 0 (with a warning)
    and the frame is cast with ``astype``.
    """
    if self._bool_features:
        for feature, true_value in self._bool_features.items():
            X[feature] = (X[feature] == true_value).astype(np.int8)

    # Nothing more to do when the dtypes are already the expected ones.
    if self._type_map_real_opt == X.dtypes.to_dict():
        return X

    if self._int_features.size:
        has_null = X[self._int_features].isnull().any()
        # If int feature contains null during inference but not during fit.
        if has_null.any():
            # TODO: Consider imputing to mode? This is tricky because training data had no missing values.
            # TODO: Add unit test for this situation, to confirm it is handled properly.
            with_null_features = list(has_null[has_null].index)
            logger.warning(f'WARNING: Int features without null values at train time contain null values at inference time! Imputing nulls to 0. To avoid this, pass the features as floats during fit!')
            logger.warning(f'WARNING: Int features with nulls: {with_null_features}')
            X[with_null_features] = X[with_null_features].fillna(0)

    if self._type_map_real_opt:
        # TODO: Confirm this works with sparse and other feature types!
        # FIXME: Address situation where test-time invalid type values cause crash:
        #  https://stackoverflow.com/questions/49256211/how-to-set-unexpected-data-type-to-na?noredirect=1&lq=1
        X = X.astype(self._type_map_real_opt)
    return X
def convert_pandas_dtypes(df: pd.DataFrame,
                          col_fix: type = float) -> pd.DataFrame:
    r"""Helper function to convert pandas column dtypes

    Parameters
    ----------
    df : pandas.DataFrame
        A pandas dataframe to convert columns
    col_fix : {float, str}, optional
        A column type to convert the input dataframe.

    Returns
    -------
    pd.DataFrame
        A dataframe with converted columns

    Raises
    ------
    ValueError
        If the columns cannot be cast to ``col_fix``. The underlying pandas
        error is attached as ``__cause__``.
    """
    try:
        return df.astype(col_fix)
    except ValueError as err:
        # Chain the original error so the offending value stays visible in
        # the traceback.
        raise ValueError(
            "Columns cannot be converted to {col}; check input features".
            format(col=col_fix)) from err
def auto_arima(df: DataFrame, prepared_df: DataFrame,
               prediction_step_length: timedelta, feature_column: str):
    """Forecast ``feature_column`` with a stepwise seasonal auto-ARIMA search.

    Args:
        df: history containing the columns ``time_`` and ``feature_column``;
            both are cast to int64 before fitting.
        prepared_df: frame whose ``time_`` column fixes the number of periods
            to predict. It is modified in place: the forecast is written to
            its ``feature_column``.
        prediction_step_length: not used by this function.
        feature_column: label of the single column to model. It is used as a
            column key, so it must be one hashable label, not a list.

    Returns:
        ``prepared_df`` with the forecast stored in ``feature_column``.
    """
    df = df[['time_', feature_column]]
    df = df.astype(np.int64)
    stepwise_model = pm.auto_arima(df[feature_column],
                                   start_p=1,
                                   start_q=1,
                                   max_p=3,
                                   max_q=3,
                                   m=7,
                                   start_P=0,
                                   seasonal=True,
                                   d=1,
                                   D=1,
                                   trace=True,
                                   error_action='ignore',
                                   suppress_warnings=True,
                                   stepwise=True)
    # The former bare `stepwise_model.aic()` call discarded its result and
    # has been dropped.
    stepwise_model.fit(df[feature_column])
    future_forecast = stepwise_model.predict(
        n_periods=len(prepared_df['time_']))
    prepared_df[feature_column] = future_forecast
    return prepared_df
def read_data_file(fn, skiplines=1, maxlines=False):
    """Read a foam data file and return its data after the header.

    Args:
        fn: path of the file to read.
        skiplines: stride used when slicing the data lines (1 keeps every
            line).
        maxlines: not used by this function.

    Returns:
        ``None`` if the file does not exist or any error occurs while
        processing it. Otherwise a 3-tuple:

        * vector data (more than one entry per line):
          ``(names, df, hashes)`` with one float column per name and one
          hash per column;
        * single-entry data: ``(field, df, hashes)`` where ``field`` is the
          file's base name and ``hashes`` holds one md5-based integer.

        In both cases ``df`` is indexed by the levels ``Loc`` and ``Pos``.
    """
    # TODO check if sorting the index gives any performance benefits
    if not os.path.exists(fn):
        print("Can not open file " + fn)
        return None
    try:
        # Only the read needs the handle; parsing happens after it is closed.
        with open(fn, encoding="utf-8") as f:
            content = f.readlines()
        field = fn.split('/')[-1]
        content.append('bla')
        start, num_entries = if_header_skip(content)
        entries = len(content[start].split())
        end = start + num_entries
        rows = content[start:end:skiplines]
        # FIXME this fails for eulerian/lagrangian vector fields
        #       since no positional entry is produced
        if entries > 1:
            # Raw string: "\(" in a normal literal is an invalid escape
            # sequence. Drops an optional leading count plus "(" and any ")".
            strip_parens = re.compile(r"[0-9]*\(|\)")
            data = [strip_parens.sub('', line).split() for line in rows]
            loc, names = evaluate_names(fn, entries)
            df = DataFrame(data=data, columns=names)
            if loc:
                df['Loc'] = loc
            else:
                df['Loc'] = range(len(df))
            if "Pos" in df:
                df.set_index('Loc', append=False, inplace=True)
                df["Pos"] = df["Pos"].astype(float)
                df.set_index('Pos', append=True, inplace=True)
            else:
                # if no pos is available we have either
                # an eulerian or lagrangian field
                df.set_index('Loc', append=True, inplace=True)
                df.index.names = ['Pos', 'Loc']
                df = df.reorder_levels(['Loc', 'Pos'])
            df = df.astype(float)
            hashes = {col: hash_series(df[col]) for col in df.columns}
            return names, df, hashes

        # DataFile with a single row are seen as Eulerian or Lagrangian fields
        data = [np.float32(x) for x in rows]
        df = DataFrame(data=data, columns=[field])
        df['Loc'] = "Field"
        df.set_index('Loc', append=True, inplace=True)
        df.index.names = ['Pos', 'Loc']
        df = df.reorder_levels(['Loc', 'Pos'])
        hashes = {
            field: int(hashlib.md5(str(data).encode('utf-8')).hexdigest(), 16)
        }
        return field, df, hashes
    except Exception as e:
        # Deliberately best effort: malformed files yield None, not a crash.
        if DEBUG:
            print("Error processing datafile " + fn)
            print(e)
        return None
def setup_cache(self):
    """Return a one-cell integer frame and its float-cast copy, keyed by dtype name."""
    int_frame = DataFrame([[1]])
    float_frame = int_frame.astype(float)
    return {"int": int_frame, "float": float_frame}
def __fit_hp(self,
             train_df: pd.DataFrame,
             test_df: pd.DataFrame,
             hp: pd.Series,
             simple_imputer,
             name: str,
             user_defined_scores: list = None) -> pd.core.series.Series:
    """
    Method initialises the model, performs fitting and returns the desired metrics.

    :param train_df: training data as dataframe
    :param test_df: test data as dataframe; if not provided, a ratio of test_split of the
                      training data are used as test data
    :param hp: pd.Series with hyperparameter configuration. It is updated in place: unused
                parameters are marked 'n.a.' and the metrics are added as new entries.
    :param simple_imputer: SimpleImputer instance from which to inherit column names etc.
    :param name to identify the current setting of hps. Defaults to the current timestamp.
    :param user_defined_scores: list with entries (Callable, str), where callable is a function
                      accepting arguments (true, predicted, confidence). True is an array with the true labels,
                      predicted with the predicted labels and confidence is an array with the confidence score for
                      each prediction. ``None`` (the default) means no additional scores.
                      Default metrics are:
                      f1_weighted, f1_micro, f1_macro, f1_weighted_train
                      recall_weighted, recall_weighted_train, precision_weighted, precision_weighted_train,
                      coverage_at_90, coverage_at_90_train, empirical_precision_at_90,
                      ece_pre_calibration (ece: expected calibration error), ece_post_calibration, time [min].
                      A user defined function could look as follows:

                      def my_function(true, predicted, confidence):
                           return (true[confidence > .75] == predicted[confidence > .75]).mean()

                      uds = (my_function, 'empirical_precision_above_75')

    :return: Series with hpo parameters and results.
    """

    from . import Imputer  # needs to be imported here to avoid circular dependency

    if not name:
        name = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    data_encoders = []
    data_featurizers = []

    if hp['global:concat_columns'] is False:
        # mark unused parameter
        for key in hp.keys():
            if 'concat:' in key:
                hp[key] = 'n.a.'

        # define column encoders and featurisers for each input column
        for input_column in simple_imputer.input_columns:

            # extract parameters for the current input column, take everything after the first colon
            col_parms = {
                ':'.join(key.split(':')[1:]): val
                for key, val in hp.items() if input_column in key
            }

            # define all input columns
            if col_parms['type'] == 'string':
                # iterate over multiple embeddings (chars + strings for the same column)
                for token in col_parms['tokens']:
                    data_encoders += [
                        TfIdfEncoder(
                            input_columns=[input_column],
                            output_column=input_column + '_' + token,
                            tokens=token,
                            ngram_range=col_parms['ngram_range:' + token],
                            max_tokens=col_parms['max_tokens'])
                    ]
                    data_featurizers += [
                        BowFeaturizer(field_name=input_column + '_' + token,
                                      max_tokens=col_parms['max_tokens'])
                    ]

            elif col_parms['type'] == 'categorical':
                data_encoders += [
                    CategoricalEncoder(input_columns=[input_column],
                                       output_column=input_column + '_' +
                                       col_parms['type'],
                                       max_tokens=col_parms['max_tokens'])
                ]
                data_featurizers += [
                    EmbeddingFeaturizer(field_name=input_column + '_' +
                                        col_parms['type'],
                                        max_tokens=col_parms['max_tokens'],
                                        embed_dim=col_parms['embed_dim'])
                ]

            elif col_parms['type'] == 'numeric':
                data_encoders += [
                    NumericalEncoder(input_columns=[input_column],
                                     output_column=input_column + '_' +
                                     col_parms['type'],
                                     normalize=col_parms['normalize'])
                ]
                data_featurizers += [
                    NumericalFeaturizer(
                        field_name=input_column + '_' + col_parms['type'],
                        numeric_latent_dim=col_parms['numeric_latent_dim'],
                        numeric_hidden_layers=col_parms[
                            'numeric_hidden_layers'])
                ]
            else:
                # logger.warn is a deprecated alias of logger.warning
                logger.warning(
                    'Found unknown column type. Canidates are string, categorical, numeric.'
                )

    # Concatenate all columns
    else:
        # cast all columns to string for concatenation
        train_df = train_df.astype(str)
        test_df = test_df.astype(str)
        col_parms = {
            ':'.join(key.split(':')[1:]): val
            for key, val in hp.items() if 'concat' in key
        }
        for token in col_parms['tokens']:
            data_encoders += [
                TfIdfEncoder(
                    input_columns=simple_imputer.input_columns,
                    output_column='-'.join(simple_imputer.input_columns) +
                    '_' + token,
                    tokens=token,
                    ngram_range=col_parms['ngram_range:' + token],
                    max_tokens=col_parms['max_tokens'])
            ]
            data_featurizers += [
                BowFeaturizer(
                    field_name='-'.join(simple_imputer.input_columns) + '_' +
                    token,
                    max_tokens=col_parms['max_tokens'])
            ]

        # mark unused parameter
        for key in hp.keys():
            if not ('global:' in key or 'concat:' in key):
                hp[key] = 'n.a.'

    # Define output column. Associated parameters are not tuned.
    if is_numeric_dtype(train_df[simple_imputer.output_column]):
        label_column = [NumericalEncoder(simple_imputer.output_column)]
        logger.info("Assuming numeric output column: {}".format(
            simple_imputer.output_column))
    else:
        label_column = [CategoricalEncoder(simple_imputer.output_column)]
        logger.info("Assuming categorical output column: {}".format(
            simple_imputer.output_column))

    # Series.iteritems() was removed in pandas 2.0; items() is the same
    # iterator and is what the rest of this method already uses.
    global_parms = {
        key.split(':')[1]: val
        for key, val in hp.items() if 'global' in key
    }

    hp_time = time.time()

    hp_imputer = Imputer(data_encoders=data_encoders,
                         data_featurizers=data_featurizers,
                         label_encoders=label_column,
                         output_path=self.output_path + name)

    hp_imputer.fit(
        train_df=train_df,
        test_df=test_df,
        ctx=get_context(),
        learning_rate=global_parms['learning_rate'],
        num_epochs=global_parms['num_epochs'],
        patience=global_parms['patience'],
        test_split=.1,
        weight_decay=global_parms['weight_decay'],
        batch_size=global_parms['batch_size'],
        final_fc_hidden_units=global_parms['final_fc_hidden_units'],
        calibrate=True)

    # add suitable metrics to hp series
    imputed = hp_imputer.predict(test_df)
    true = imputed[simple_imputer.output_column]
    predicted = imputed[simple_imputer.output_column + '_imputed']

    # training metrics are computed on at most 10,000 sampled rows
    imputed_train = hp_imputer.predict(
        train_df.sample(min(train_df.shape[0], int(1e4))))
    true_train = imputed_train[simple_imputer.output_column]
    predicted_train = imputed_train[simple_imputer.output_column + '_imputed']

    if is_numeric_dtype(train_df[simple_imputer.output_column]):
        hp['mse'] = mean_squared_error(true, predicted)
        hp['mse_train'] = mean_squared_error(true_train, predicted_train)
        confidence = float('nan')
    else:
        # Local import keeps this fix self-contained; the sibling metrics
        # (f1_score, recall_score) come from the same sklearn module.
        from sklearn.metrics import precision_score

        confidence = imputed[simple_imputer.output_column + '_imputed_proba']
        confidence_train = imputed_train[simple_imputer.output_column +
                                         '_imputed_proba']
        hp['f1_micro'] = f1_score(true, predicted, average='micro')
        hp['f1_macro'] = f1_score(true, predicted, average='macro')
        hp['f1_weighted'] = f1_score(true, predicted, average='weighted')
        hp['f1_weighted_train'] = f1_score(true_train,
                                           predicted_train,
                                           average='weighted')
        # These two entries were previously filled with f1_score, i.e. they
        # duplicated f1_weighted instead of reporting precision.
        hp['precision_weighted'] = precision_score(true,
                                                   predicted,
                                                   average='weighted')
        hp['precision_weighted_train'] = precision_score(true_train,
                                                         predicted_train,
                                                         average='weighted')
        hp['recall_weighted'] = recall_score(true,
                                             predicted,
                                             average='weighted')
        hp['recall_weighted_train'] = recall_score(true_train,
                                                   predicted_train,
                                                   average='weighted')
        hp['coverage_at_90'] = (confidence > .9).mean()
        hp['coverage_at_90_train'] = (confidence_train > .9).mean()
        hp['empirical_precision_at_90'] = (
            predicted[confidence > .9] == true[confidence > .9]).mean()
        # NOTE(review): the pre-calibration entry reads the 'ece_post' key,
        # so both entries report the same number. Presumably this should be
        # the pre-calibration key of Imputer.calibration_info - confirm the
        # key name there before changing it.
        hp['ece_pre_calibration'] = hp_imputer.calibration_info['ece_post']
        hp['ece_post_calibration'] = hp_imputer.calibration_info[
            'ece_post']
    hp['time [min]'] = (time.time() - hp_time) / 60

    # The default is None; treat it as "no extra scores" instead of failing
    # with a TypeError after the whole fit has already run.
    for uds in user_defined_scores or []:
        hp[uds[1]] = uds[0](true=true,
                            predicted=predicted,
                            confidence=confidence)

    hp_imputer.save()

    return hp