def test_replace_str_to_str_chain(self):
    # Replacing strings with a dict whose values are themselves keys
    # would chain replacements; pandas must reject this mapping.
    a = np.arange(1, 5)
    astr = a.astype(str)
    bstr = np.arange(2, 6).astype(str)
    df = DataFrame({'a': astr})
    # tm.assertRaisesRegexp is the legacy (pre-pytest) pandas test helper.
    with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"):
        df.replace({'a': dict(zip(astr, bstr))})
def test_nunique(self):
    # Groupby .nunique() on string columns: checks as_index handling,
    # NaN counting via dropna, and the resulting index name.
    df = DataFrame({
        'A': list('abbacc'),
        'B': list('abxacc'),
        'C': list('abbacx'),
    })
    expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
    result = df.groupby('A', as_index=False).nunique()
    tm.assert_frame_equal(result, expected)
    # as_index
    expected.index = list('abc')
    expected.index.name = 'A'
    result = df.groupby('A').nunique()
    tm.assert_frame_equal(result, expected)
    # with na: dropna=False counts the None produced by replace
    result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
    tm.assert_frame_equal(result, expected)
    # dropna: the None values are excluded, so every group has 1 unique
    expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                         index=list('abc'))
    expected.index.name = 'A'
    result = df.replace({'x': None}).groupby('A').nunique()
    tm.assert_frame_equal(result, expected)
def getphysiologicalinfo(pos, atlas="Talairach", r=5, nearest="on",
                         rm_unfound=False):
    """Build a DataFrame of anatomical info for each electrode position.

    Parameters
    ----------
    pos : array, shape (n_channels, 3)
        Electrode coordinates (one row per channel).
    atlas : str
        Atlas name forwarded to loadatlas().
    r : int
        Search radius forwarded to physiochannel().
    nearest : str
        Nearest-voxel behaviour forwarded to physiochannel().
    rm_unfound : bool
        Unused here; kept for interface compatibility.

    Returns
    -------
    DataFrame
        Columns: Hemisphere, Lobe, Gyrus, Matter, Brodmann.
    """
    # Load atlas volumes and labels:
    hdr, mask, gray, brodtxt, brodidx, label = loadatlas(atlas=atlas)
    # Query the atlas for every electrode:
    nbchan = pos.shape[0]
    hemi, lobe, gyrus, matter, brod = [], [], [], [], []
    for k in range(nbchan):
        hemiC, lobeC, gyrusC, matterC, brodC = physiochannel(
            list(pos[k, :]), mask, hdr, gray, label, r=r, nearest=nearest
        )
        hemi.extend(hemiC)
        lobe.extend(lobeC)
        gyrus.extend(gyrusC)
        matter.extend(matterC)
        brod.extend(brodC)
    # Put everything in a pandas structure:
    phyinf = DataFrame({"Hemisphere": hemi, "Lobe": lobe, "Gyrus": gyrus,
                        "Matter": matter, "Brodmann": list(brod)})
    # Replace corrupted values and strip redundant label suffixes:
    phyinf.replace(to_replace="*", value="Not found", inplace=True)
    phyinf["Brodmann"].replace(to_replace="Brodmann area ", value="",
                               inplace=True, regex=True)
    phyinf["Gyrus"].replace(to_replace=" Gyrus", value="", inplace=True,
                            regex=True)
    phyinf["Lobe"].replace(to_replace=" Lobe", value="", inplace=True,
                           regex=True)
    phyinf["Matter"].replace(to_replace=" Matter", value="", inplace=True,
                             regex=True)
    # Convert Brodmann labels to int where possible; non-numeric labels
    # (e.g. "Not found") are kept as-is.  The original used a bare
    # ``except`` with a no-op self-assignment; catch only the conversion
    # errors int() can raise.
    brod_vals = []
    for v in phyinf["Brodmann"]:
        try:
            brod_vals.append(int(v))
        except (TypeError, ValueError):
            brod_vals.append(v)
    phyinf["Brodmann"] = brod_vals
    return phyinf
def test_replace_with_empty_dictlike(self, mix_abc):
    # GH 15289: replacing with any empty dict-like must be a no-op.
    df = DataFrame(mix_abc)
    for empty_dictlike in ({}, Series([]), {'b': {}}, Series({'b': {}})):
        assert_frame_equal(df, df.replace(empty_dictlike))
def test_replace_with_empty_dictlike(self):
    # GH 15289: replacing with an empty dict/Series must be a no-op.
    mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
    df = DataFrame(mix)
    assert_frame_equal(df, df.replace({}))
    assert_frame_equal(df, df.replace(Series([])))
    assert_frame_equal(df, df.replace({'b': {}}))
    assert_frame_equal(df, df.replace(Series({'b': {}})))
def test_replace_simple_nested_dict_with_nonexistent_value(self): df = DataFrame({'col': range(1, 5)}) expected = DataFrame({'col': ['a', 2, 3, 'b']}) result = df.replace({-1: '-', 1: 'a', 4: 'b'}) assert_frame_equal(expected, result) result = df.replace({'col': {-1: '-', 1: 'a', 4: 'b'}}) assert_frame_equal(expected, result)
def test_regex_replace_dict_mixed(self):
    # Regex replacement driven by per-column dicts on a mixed-dtype frame;
    # every variant is checked both as a copy and inplace.
    mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
    dfmix = DataFrame(mix)

    # dicts
    # single dict {re1: v1}, search the whole frame
    # need test for this...

    # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole
    # frame
    res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True)
    res2 = dfmix.copy()
    res2.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, inplace=True,
                 regex=True)
    expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', np.nan, np.nan],
                       'c': mix['c']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the
    # whole frame
    res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True)
    res2 = dfmix.copy()
    res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True,
                 regex=True)
    expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'],
                       'c': mix['c']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    # same replacement spelled via the regex= keyword
    res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'})
    res2 = dfmix.copy()
    res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'},
                 inplace=True)
    expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'],
                       'c': mix['c']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    # scalar -> dict
    # to_replace regex, {value: value}
    expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'],
                       'c': mix['c']})
    res = dfmix.replace('a', {'b': np.nan}, regex=True)
    res2 = dfmix.copy()
    res2.replace('a', {'b': np.nan}, regex=True, inplace=True)
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    res = dfmix.replace('a', {'b': np.nan}, regex=True)
    res2 = dfmix.copy()
    res2.replace(regex='a', value={'b': np.nan}, inplace=True)
    expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'],
                       'c': mix['c']})
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)
def test_replace_simple_nested_dict(self): df = DataFrame({'col': range(1, 5)}) expected = DataFrame({'col': ['a', 2, 3, 'b']}) result = df.replace({'col': {1: 'a', 4: 'b'}}) assert_frame_equal(expected, result) # in this case, should be the same as the not nested version result = df.replace({1: 'a', 4: 'b'}) assert_frame_equal(expected, result)
def test_replace_dict_tuple_list_ordering_remains_the_same(self): df = DataFrame(dict(A=[nan, 1])) res1 = df.replace(to_replace={nan: 0, 1: -1e8}) res2 = df.replace(to_replace=(1, nan), value=[-1e8, 0]) res3 = df.replace(to_replace=[1, nan], value=[-1e8, 0]) expected = DataFrame({'A': [0, -1e8]}) assert_frame_equal(res1, res2) assert_frame_equal(res2, res3) assert_frame_equal(res3, expected)
def test_replace_datetimetz(self):
    # GH 11326
    # behaving poorly when presented with a datetime64[ns, tz]
    df = DataFrame({'A': date_range('20130101', periods=3,
                                    tz='US/Eastern'),
                    'B': [0, np.nan, 2]})
    result = df.replace(np.nan, 1)
    expected = DataFrame({'A': date_range('20130101', periods=3,
                                          tz='US/Eastern'),
                          'B': Series([0, 1, 2], dtype='float64')})
    assert_frame_equal(result, expected)

    # fillna must agree with the equivalent replace
    result = df.fillna(1)
    assert_frame_equal(result, expected)

    result = df.replace(0, np.nan)
    expected = DataFrame({'A': date_range('20130101', periods=3,
                                          tz='US/Eastern'),
                          'B': [np.nan, np.nan, 2]})
    assert_frame_equal(result, expected)

    # replacing one tz-aware timestamp with another keeps the tz dtype
    result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                        Timestamp('20130104', tz='US/Eastern'))
    expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                Timestamp('20130104', tz='US/Eastern'),
                                Timestamp('20130103', tz='US/Eastern')],
                          'B': [0, np.nan, 2]})
    assert_frame_equal(result, expected)

    # NaT in a per-column dict behaves like the scalar replacement
    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace(
        {'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern'))
    assert_frame_equal(result, expected)

    # coerce to object
    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace(
        {'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific'))
    expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                Timestamp('20130104', tz='US/Pacific'),
                                Timestamp('20130103', tz='US/Eastern')],
                          'B': [0, np.nan, 2]})
    assert_frame_equal(result, expected)

    # mixing a naive replacement value also coerces to object
    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({'A': np.nan}, Timestamp('20130104'))
    expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                Timestamp('20130104'),
                                Timestamp('20130103', tz='US/Eastern')],
                          'B': [0, np.nan, 2]})
    assert_frame_equal(result, expected)
def test_replace_input_formats(self):
    # Exercise every accepted (to_replace, value) argument combination.
    # both dicts
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(to_rep, values)
    expected = {}
    for k, v in compat.iteritems(df):
        expected[k] = v.replace(to_rep[k], values[k])
    assert_frame_equal(filled, DataFrame(expected))

    result = df.replace([0, 2, 5], [5, 2, 0])
    expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0],
                          'C': ['', 'asdf', 'fd']})
    assert_frame_equal(result, expected)

    # dict to scalar
    filled = df.replace(to_rep, 0)
    expected = {}
    for k, v in compat.iteritems(df):
        expected[k] = v.replace(to_rep[k], 0)
    assert_frame_equal(filled, DataFrame(expected))

    # a list value alongside a dict to_replace is invalid
    self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

    # scalar to dict
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(np.nan, values)
    expected = {}
    for k, v in compat.iteritems(df):
        expected[k] = v.replace(np.nan, values[k])
    assert_frame_equal(filled, DataFrame(expected))

    # list to list
    to_rep = [np.nan, 0, '']
    values = [-2, -1, 'missing']
    result = df.replace(to_rep, values)
    expected = df.copy()
    for i in range(len(to_rep)):
        expected.replace(to_rep[i], values[i], inplace=True)
    assert_frame_equal(result, expected)

    # mismatched replacement-list lengths are rejected
    self.assertRaises(ValueError, df.replace, to_rep, values[1:])

    # list to scalar
    to_rep = [np.nan, 0, '']
    result = df.replace(to_rep, -1)
    expected = df.copy()
    for i in range(len(to_rep)):
        expected.replace(to_rep[i], -1, inplace=True)
    assert_frame_equal(result, expected)
def get_dataframe(self, data):
    """Build a DataFrame from *data* and apply the configured index.

    When index columns are configured, None placeholders in those
    columns are first mapped to ``self.index_none_value`` (if set);
    otherwise the auto-index column is named so the CSV output stays
    valid.
    """
    frame = DataFrame(data)
    index = self.get_index(frame)
    if not index:
        # Name auto-index column to ensure valid CSV output
        frame.index.name = "row"
        return frame
    if self.index_none_value is not None:
        for key in index:
            frame.replace({key: None}, {key: self.index_none_value},
                          inplace=True)
    frame.set_index(index, inplace=True)
    return frame
def test_replace_input_formats_listlike(self):
    # List-like (to_replace, value) combinations, incl. the length check.
    # both dicts
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(to_rep, values)
    expected = {k: v.replace(to_rep[k], values[k])
                for k, v in df.items()}
    assert_frame_equal(filled, DataFrame(expected))

    result = df.replace([0, 2, 5], [5, 2, 0])
    expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0],
                          'C': ['', 'asdf', 'fd']})
    assert_frame_equal(result, expected)

    # scalar to dict
    values = {'A': 0, 'B': -1, 'C': 'missing'}
    df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})
    filled = df.replace(np.nan, values)
    expected = {k: v.replace(np.nan, values[k])
                for k, v in df.items()}
    assert_frame_equal(filled, DataFrame(expected))

    # list to list
    to_rep = [np.nan, 0, '']
    values = [-2, -1, 'missing']
    result = df.replace(to_rep, values)
    expected = df.copy()
    for i in range(len(to_rep)):
        expected.replace(to_rep[i], values[i], inplace=True)
    assert_frame_equal(result, expected)

    # mismatched replacement-list lengths raise
    msg = r"Replacement lists must match in length\. Expecting 3 got 2"
    with pytest.raises(ValueError, match=msg):
        df.replace(to_rep, values[1:])
def historicalRequest(self, securities, fields, startDate, endDate,
                      **kwargs):
    """
    Equivalent to the Excel BDH Function.  If securities are provided as
    a list, the returned DataFrame will have a MultiIndex.

    Sends a Bloomberg HistoricalData request with sensible defaults
    (daily bars, active trading days only, split-adjusted) and assembles
    the per-security responses into one date-indexed DataFrame.
    Keyword arguments override the defaults.
    """
    defaults = {'startDate': startDate,
                'endDate': endDate,
                'periodicityAdjustment': 'ACTUAL',
                'periodicitySelection': 'DAILY',
                'nonTradingDayFillOption': 'ACTIVE_DAYS_ONLY',
                'adjustmentNormal': False,
                'adjustmentAbnormal': False,
                'adjustmentSplit': True,
                'adjustmentFollowDPDF': False}
    defaults.update(kwargs)
    response = self.sendRequest('HistoricalData', securities, fields,
                                defaults)
    data = []
    keys = []
    for msg in response:
        securityData = msg.getElement('securityData')
        fieldData = securityData.getElement('fieldData')
        fieldDataList = [fieldData.getValueAsElement(i)
                         for i in range(fieldData.numValues())]
        df = DataFrame()
        for fld in fieldDataList:
            for v in [fld.getElement(i) for i in range(fld.numElements())
                      if fld.getElement(i).name() != 'date']:
                # .loc replaces the removed .ix indexer.
                df.loc[fld.getElementAsDatetime('date'),
                       str(v.name())] = v.getValue()
        # Index.to_datetime() was removed from pandas; use pd.to_datetime.
        df.index = pd.to_datetime(df.index)
        df.replace('#N/A History', np.nan, inplace=True)
        keys.append(securityData.getElementAsString('security'))
        data.append(df)
    if len(data) == 0:
        # empty response -> empty frame
        return DataFrame()
    if type(securities) == str:
        data = pd.concat(data, axis=1)
        data.columns.name = 'Field'
    else:
        data = pd.concat(data, keys=keys, axis=1,
                         names=['Security', 'Field'])
    data.index.name = 'Date'
    return data
def test_regex_replace_dict_nested(self):
    # nested dicts will not work until this is implemented for Series
    mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
    dfmix = DataFrame(mix)
    res = dfmix.replace({'b': {r'\s*\.\s*': nan}}, regex=True)
    res2 = dfmix.copy()
    res4 = dfmix.copy()
    res2.replace({'b': {r'\s*\.\s*': nan}}, inplace=True, regex=True)
    res3 = dfmix.replace(regex={'b': {r'\s*\.\s*': nan}})
    res4.replace(regex={'b': {r'\s*\.\s*': nan}}, inplace=True)
    expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', nan, nan],
                       'c': mix['c']})
    # all four spellings (regex kwarg/flag x copy/inplace) must agree
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)
    assert_frame_equal(res3, expec)
    assert_frame_equal(res4, expec)
def test_regex_replace_numeric_to_object_conversion(self):
    # Replacing an int with a string must upcast the column to object.
    mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
    df = DataFrame(mix)
    converted = df.replace(0, 'a')
    expected = DataFrame({'a': ['a', 1, 2, 3],
                          'b': mix['b'],
                          'c': mix['c']})
    assert_frame_equal(converted, expected)
    self.assertEqual(converted.a.dtype, np.object_)
def test_fillna_dtype_conversion(self):
    # make sure that fillna on an empty frame works
    df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
    result = df.get_dtype_counts().sort_values()
    expected = Series({'object': 5})
    assert_series_equal(result, expected)

    # filling an all-NaN object frame with an int converts to int64
    result = df.fillna(1)
    expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
    result = result.get_dtype_counts().sort_values()
    expected = Series({'int64': 5})
    assert_series_equal(result, expected)

    # empty block
    df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
    result = df.fillna('nan')
    expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
    assert_frame_equal(result, expected)

    # equiv of replace
    df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
    for v in ['', 1, np.nan, 1.0]:
        expected = df.replace(np.nan, v)
        result = df.fillna(v)
        assert_frame_equal(result, expected)
def test_regex_replace_dict_nested(self, mix_abc): # nested dicts will not work until this is implemented for Series dfmix = DataFrame(mix_abc) res = dfmix.replace({'b': {r'\s*\.\s*': np.nan}}, regex=True) res2 = dfmix.copy() res4 = dfmix.copy() res2.replace({'b': {r'\s*\.\s*': np.nan}}, inplace=True, regex=True) res3 = dfmix.replace(regex={'b': {r'\s*\.\s*': np.nan}}) res4.replace(regex={'b': {r'\s*\.\s*': np.nan}}, inplace=True) expec = DataFrame({'a': mix_abc['a'], 'b': ['a', 'b', np.nan, np.nan], 'c': mix_abc['c'], }) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) assert_frame_equal(res4, expec)
def test_replace_convert(self):
    # gh 3907: a full string->int mapping should convert the block
    # dtype of every column to int64.
    df = DataFrame([['foo', 'bar', 'bah'], ['bar', 'foo', 'bah']])
    mapping = {'foo': 1, 'bar': 2, 'bah': 3}
    converted = df.replace(mapping)
    expected_dtypes = Series([np.int64] * 3)
    assert_series_equal(expected_dtypes, converted.dtypes)
def test_replace_regex_metachar(self): metachars = '[]', '()', r'\d', r'\w', r'\s' for metachar in metachars: df = DataFrame({'a': [metachar, 'else']}) result = df.replace({'a': {metachar: 'paren'}}) expected = DataFrame({'a': ['paren', 'else']}) assert_frame_equal(result, expected)
def fillna_dict(cls, prop):
    """
    Use trade history then fill empty with value row above.

    Empty strings and DEBIT/CREDIT markers are treated as missing and
    forward-filled from the previous row.

    :param prop: records/mapping accepted by the DataFrame constructor
    :return: list of row dicts
    """
    df = DataFrame(prop)
    df = df.replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    df = df.ffill()
    return [r.to_dict() for k, r in df.iterrows()]
def test_replace_method(self, to_replace, method, expected):
    # GH 19632: value=None with a fill method behaves like fillna.
    df = DataFrame({'A': [0, 1, 2],
                    'B': [5, np.nan, 7],
                    'C': ['a', 'b', 'c']})
    result = df.replace(to_replace=to_replace, value=None, method=method)
    assert_frame_equal(result, DataFrame(expected))
def test_regex_replace_numeric_to_object_conversion(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame({'a': ['a', 1, 2, 3], 'b': mix_abc['b'], 'c': mix_abc['c'], }) res = df.replace(0, 'a') assert_frame_equal(res, expec) assert res.a.dtype == np.object_
def test_regex_replace_list_obj(self):
    # List-of-regexes replacement on an all-object frame.
    obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')}
    dfobj = DataFrame(obj)

    # lists of regexes and values
    # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
    to_replace_res = [r'\s*\.\s*', r'e|f|g']
    values = [np.nan, 'crap']
    res = dfobj.replace(to_replace_res, values, regex=True)
    expec = DataFrame({'a': ['a', 'b', np.nan, np.nan],
                       'b': ['crap'] * 3 + ['h'],
                       'c': ['h', 'crap', 'l', 'o'], })
    assert_frame_equal(res, expec)

    # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
    to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)']
    values = [r'\1\1', r'\1_crap']
    res = dfobj.replace(to_replace_res, values, regex=True)
    expec = DataFrame({'a': ['a', 'b', '..', '..'],
                       'b': ['e_crap', 'f_crap', 'g_crap', 'h'],
                       'c': ['h', 'e_crap', 'l', 'o'], })
    assert_frame_equal(res, expec)

    # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
    # or vN)]
    to_replace_res = [r'\s*(\.)\s*', r'e']
    values = [r'\1\1', r'crap']
    res = dfobj.replace(to_replace_res, values, regex=True)
    expec = DataFrame({'a': ['a', 'b', '..', '..'],
                       'b': ['crap', 'f', 'g', 'h'],
                       'c': ['h', 'crap', 'l', 'o'], })
    assert_frame_equal(res, expec)

    # same replacement spelled via the regex= keyword
    to_replace_res = [r'\s*(\.)\s*', r'e']
    values = [r'\1\1', r'crap']
    res = dfobj.replace(value=values, regex=to_replace_res)
    expec = DataFrame({'a': ['a', 'b', '..', '..'],
                       'b': ['crap', 'f', 'g', 'h'],
                       'c': ['h', 'crap', 'l', 'o'], })
    assert_frame_equal(res, expec)
def test_replace_mixed(self):
    # Replacement on a mixed-dtype frame: NaN round-trips, int block
    # upcasting/splitting, and object upcasting.
    mf = self.mixed_frame
    mf.iloc[5:20, mf.columns.get_loc('foo')] = nan
    mf.iloc[-10:, mf.columns.get_loc('A')] = nan

    # replace(nan, v) must agree with fillna and round-trip back
    result = self.mixed_frame.replace(np.nan, -18)
    expected = self.mixed_frame.fillna(value=-18)
    assert_frame_equal(result, expected)
    assert_frame_equal(result.replace(-18, nan), self.mixed_frame)

    result = self.mixed_frame.replace(np.nan, -1e8)
    expected = self.mixed_frame.fillna(value=-1e8)
    assert_frame_equal(result, expected)
    assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

    # int block upcasting
    df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                    'B': Series([0, 1], dtype='int64')})
    expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                          'B': Series([0.5, 1], dtype='float64')})
    result = df.replace(0, 0.5)
    assert_frame_equal(result, expected)

    df.replace(0, 0.5, inplace=True)
    assert_frame_equal(df, expected)

    # int block splitting
    df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                    'B': Series([0, 1], dtype='int64'),
                    'C': Series([1, 2], dtype='int64')})
    expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                          'B': Series([0.5, 1], dtype='float64'),
                          'C': Series([1, 2], dtype='int64')})
    result = df.replace(0, 0.5)
    assert_frame_equal(result, expected)

    # to object block upcasting
    df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                    'B': Series([0, 1], dtype='int64')})
    expected = DataFrame({'A': Series([1, 'foo'], dtype='object'),
                          'B': Series([0, 1], dtype='int64')})
    result = df.replace(2, 'foo')
    assert_frame_equal(result, expected)

    expected = DataFrame({'A': Series(['foo', 'bar'], dtype='object'),
                          'B': Series([0, 'foo'], dtype='object')})
    result = df.replace([1, 2], ['foo', 'bar'])
    assert_frame_equal(result, expected)

    # test case from
    df = DataFrame({'A': Series([3, 0], dtype='int64'),
                    'B': Series([0, 3], dtype='int64')})
    result = df.replace(3, df.mean().to_dict())
    expected = df.copy().astype('float64')
    m = df.mean()
    expected.iloc[0, 0] = m[0]
    expected.iloc[1, 1] = m[1]
    assert_frame_equal(result, expected)
def test_regex_replace_list_mixed(self, mix_ab):
    # mixed frame to make sure this doesn't break things
    dfmix = DataFrame(mix_ab)

    # lists of regexes and values
    # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
    to_replace_res = [r'\s*\.\s*', r'a']
    values = [np.nan, 'crap']
    mix2 = {'a': list(range(4)), 'b': list('ab..'), 'c': list('halo')}
    dfmix2 = DataFrame(mix2)
    res = dfmix2.replace(to_replace_res, values, regex=True)
    expec = DataFrame({'a': mix2['a'],
                       'b': ['crap', 'b', np.nan, np.nan],
                       'c': ['h', 'crap', 'l', 'o'], })
    assert_frame_equal(res, expec)

    # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
    to_replace_res = [r'\s*(\.)\s*', r'(a|b)']
    values = [r'\1\1', r'\1_crap']
    res = dfmix.replace(to_replace_res, values, regex=True)
    expec = DataFrame({'a': mix_ab['a'],
                       'b': ['a_crap', 'b_crap', '..', '..'], })
    assert_frame_equal(res, expec)

    # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
    # or vN)]
    to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)']
    values = [r'\1\1', r'crap', r'\1_crap']
    res = dfmix.replace(to_replace_res, values, regex=True)
    expec = DataFrame({'a': mix_ab['a'],
                       'b': ['crap', 'b_crap', '..', '..'], })
    assert_frame_equal(res, expec)

    # same replacement spelled via the regex= keyword
    to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)']
    values = [r'\1\1', r'crap', r'\1_crap']
    res = dfmix.replace(regex=to_replace_res, value=values)
    expec = DataFrame({'a': mix_ab['a'],
                       'b': ['crap', 'b_crap', '..', '..'], })
    assert_frame_equal(res, expec)
def test_replace_input_formats_scalar(self):
    # dict-of-targets -> scalar, and list-of-targets -> scalar.
    df = DataFrame({'A': [np.nan, 0, np.inf],
                    'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})

    # dict to scalar
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    filled = df.replace(to_rep, 0)
    per_column = {k: v.replace(to_rep[k], 0)
                  for k, v in compat.iteritems(df)}
    assert_frame_equal(filled, DataFrame(per_column))

    # a list value alongside a dict to_replace is rejected
    pytest.raises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

    # list to scalar
    targets = [np.nan, 0, '']
    result = df.replace(targets, -1)
    expected = df.copy()
    for target in targets:
        expected.replace(target, -1, inplace=True)
    assert_frame_equal(result, expected)
def test_replace_input_formats_scalar(self):
    # dict-of-targets -> scalar, and list-of-targets -> scalar.
    df = DataFrame({'A': [np.nan, 0, np.inf],
                    'B': [0, 2, 5],
                    'C': ['', 'asdf', 'fd']})

    # dict to scalar
    to_rep = {'A': np.nan, 'B': 0, 'C': ''}
    filled = df.replace(to_rep, 0)
    per_column = {k: v.replace(to_rep[k], 0) for k, v in df.items()}
    assert_frame_equal(filled, DataFrame(per_column))

    # a list value alongside a dict to_replace is rejected
    msg = "value argument must be scalar, dict, or Series"
    with pytest.raises(TypeError, match=msg):
        df.replace(to_rep, [np.nan, 0, ''])

    # list to scalar
    targets = [np.nan, 0, '']
    result = df.replace(targets, -1)
    expected = df.copy()
    for target in targets:
        expected.replace(target, -1, inplace=True)
    assert_frame_equal(result, expected)
def test_replace_series_dict(self):
    # from GH 3064: scalar/Series to_replace combined with dict or
    # Series replacement values.
    df = DataFrame({'zero': {'a': 0.0, 'b': 1},
                    'one': {'a': 2.0, 'b': 0}})
    expected = DataFrame({'zero': {'a': 0.5, 'b': 1},
                          'one': {'a': 2.0, 'b': 1.0}})
    # scalar -> per-column dict, and scalar -> Series (column means)
    assert_frame_equal(df.replace(0, {'zero': 0.5, 'one': 1.0}), expected)
    assert_frame_equal(df.replace(0, df.mean()), expected)

    # series to series/dict
    df = DataFrame({'zero': {'a': 0.0, 'b': 1},
                    'one': {'a': 2.0, 'b': 0}})
    s = Series({'zero': 0.0, 'one': 2.0})
    expected = DataFrame({'zero': {'a': 0.5, 'b': 1},
                          'one': {'a': 1.0, 'b': 0.0}})
    assert_frame_equal(df.replace(s, {'zero': 0.5, 'one': 1.0}), expected)
    assert_frame_equal(df.replace(s, df.mean()), expected)
def test_replace_dtypes(self):
    # replace must preserve the original dtype of each block
    # int
    df = DataFrame({'ints': [1, 2, 3]})
    result = df.replace(1, 0)
    expected = DataFrame({'ints': [0, 2, 3]})
    assert_frame_equal(result, expected)

    df = DataFrame({'ints': [1, 2, 3]}, dtype=np.int32)
    result = df.replace(1, 0)
    expected = DataFrame({'ints': [0, 2, 3]}, dtype=np.int32)
    assert_frame_equal(result, expected)

    df = DataFrame({'ints': [1, 2, 3]}, dtype=np.int16)
    result = df.replace(1, 0)
    expected = DataFrame({'ints': [0, 2, 3]}, dtype=np.int16)
    assert_frame_equal(result, expected)

    # bools
    df = DataFrame({'bools': [True, False, True]})
    result = df.replace(False, True)
    self.assertTrue(result.values.all())

    # complex blocks
    df = DataFrame({'complex': [1j, 2j, 3j]})
    result = df.replace(1j, 0j)
    expected = DataFrame({'complex': [0j, 2j, 3j]})
    assert_frame_equal(result, expected)

    # datetime blocks
    prev = datetime.today()
    now = datetime.today()
    df = DataFrame({'datetime64': Index([prev, now, prev])})
    result = df.replace(prev, now)
    expected = DataFrame({'datetime64': Index([now] * 3)})
    assert_frame_equal(result, expected)
def test_regex_replace_dict_mixed(self):
    # Regex replacement driven by per-column dicts on a mixed-dtype frame;
    # every variant is checked both as a copy and inplace.
    mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
    dfmix = DataFrame(mix)

    # dicts
    # single dict {re1: v1}, search the whole frame
    # need test for this...

    # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole
    # frame
    res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': nan}, regex=True)
    res2 = dfmix.copy()
    res2.replace({'b': r'\s*\.\s*'}, {'b': nan}, inplace=True, regex=True)
    expec = DataFrame({
        'a': mix['a'],
        'b': ['a', 'b', nan, nan],
        'c': mix['c']
    })
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the
    # whole frame
    res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True)
    res2 = dfmix.copy()
    res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True,
                 regex=True)
    expec = DataFrame({
        'a': mix['a'],
        'b': ['a', 'b', '.ty', '.ty'],
        'c': mix['c']
    })
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    # same replacement spelled via the regex= keyword
    res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'})
    res2 = dfmix.copy()
    res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'},
                 inplace=True)
    expec = DataFrame({
        'a': mix['a'],
        'b': ['a', 'b', '.ty', '.ty'],
        'c': mix['c']
    })
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    # scalar -> dict
    # to_replace regex, {value: value}
    expec = DataFrame({
        'a': mix['a'],
        'b': [nan, 'b', '.', '.'],
        'c': mix['c']
    })
    res = dfmix.replace('a', {'b': nan}, regex=True)
    res2 = dfmix.copy()
    res2.replace('a', {'b': nan}, regex=True, inplace=True)
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)

    res = dfmix.replace('a', {'b': nan}, regex=True)
    res2 = dfmix.copy()
    res2.replace(regex='a', value={'b': nan}, inplace=True)
    expec = DataFrame({
        'a': mix['a'],
        'b': [nan, 'b', '.', '.'],
        'c': mix['c']
    })
    assert_frame_equal(res, expec)
    assert_frame_equal(res2, expec)
def _remove_punctuation(dataframe: pd.DataFrame): dataframe.replace(',', '', regex=True, inplace=True) dataframe.replace('\-$', '0', regex=True, inplace=True) dataframe.replace('', '0', regex=True, inplace=True) return dataframe
def test_regex_replace_scalar(self):
    # Scalar regex replacement, both as pattern strings and compiled
    # patterns, on object-only and mixed frames.
    obj = {'a': list('ab..'), 'b': list('efgh')}
    dfobj = DataFrame(obj)
    mix = {'a': lrange(4), 'b': list('ab..')}
    dfmix = DataFrame(mix)

    # simplest cases
    # regex -> value
    # obj frame
    res = dfobj.replace(r'\s*\.\s*', nan, regex=True)
    assert_frame_equal(dfobj, res.fillna('.'))

    # mixed
    res = dfmix.replace(r'\s*\.\s*', nan, regex=True)
    assert_frame_equal(dfmix, res.fillna('.'))

    # regex -> regex
    # obj frame
    res = dfobj.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True)
    objc = obj.copy()
    objc['a'] = ['a', 'b', '...', '...']
    expec = DataFrame(objc)
    assert_frame_equal(res, expec)

    # with mixed
    res = dfmix.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True)
    mixc = mix.copy()
    mixc['b'] = ['a', 'b', '...', '...']
    expec = DataFrame(mixc)
    assert_frame_equal(res, expec)

    # everything with compiled regexs as well
    res = dfobj.replace(re.compile(r'\s*\.\s*'), nan, regex=True)
    assert_frame_equal(dfobj, res.fillna('.'))

    # mixed
    res = dfmix.replace(re.compile(r'\s*\.\s*'), nan, regex=True)
    assert_frame_equal(dfmix, res.fillna('.'))

    # regex -> regex
    # obj frame
    res = dfobj.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1')
    objc = obj.copy()
    objc['a'] = ['a', 'b', '...', '...']
    expec = DataFrame(objc)
    assert_frame_equal(res, expec)

    # with mixed
    res = dfmix.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1')
    mixc = mix.copy()
    mixc['b'] = ['a', 'b', '...', '...']
    expec = DataFrame(mixc)
    assert_frame_equal(res, expec)

    # compiled pattern supplied via the regex= keyword
    res = dfmix.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1')
    mixc = mix.copy()
    mixc['b'] = ['a', 'b', '...', '...']
    expec = DataFrame(mixc)
    assert_frame_equal(res, expec)

    res = dfmix.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1')
    mixc = mix.copy()
    mixc['b'] = ['a', 'b', '...', '...']
    expec = DataFrame(mixc)
    assert_frame_equal(res, expec)
def test_replace_bool_with_string(self): df = DataFrame({'a': [True, False], 'b': list('ab')}) result = df.replace(True, 'a') expected = DataFrame({'a': ['a', False], 'b': df.b}) assert_frame_equal(result, expected)
def getData(link):
    """Scrape one station page from pm.kksk.org through a rotating proxy
    and append the AQI table to a per-city CSV; returns -1 after more
    than 20 failed attempts."""
    url = 'http://pm.kksk.org' + link
    index_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    flag = True
    try_times = 0
    while flag:
        try:
            proxies = getProxy()
            html = requests.get(url, headers=index_headers, proxies=proxies,
                                verify=False, timeout=(120, 180))  # getHtml
            time.sleep(
                random.uniform(0.5, 1)
            )  # Random time sleep in order to prevent from getting IP banned
            html.encoding = 'gbk'  # this web is encoded in gbk
            soup = BeautifulSoup(html.text, "html.parser")
            updatetimefetch = soup.find(
                'div',
                class_='panel panel-primary hidden-xs')  # updatetime fetch
            timefetch = updatetimefetch.find('h3')
            timestring = timefetch.string
            timenum = re.sub('\D', '', timestring)  # pure number time
            date = timenum[0:8]
            hour = timenum[8:10]
            citynamediv = soup.find('div',
                                    class_='panel-title row')  # get city name
            citynamefetch = citynamediv.find('h1')
            cityname = citynamefetch.text
            # keep only letters, digits and CJK chars, then first 2 chars
            cityname = re.sub("[^a-zA-Z0-9\u4e00-\u9fa5]", '', cityname)
            cityname = cityname[:2]
            tablediv = soup.find('div', class_='panel panel-primary hidden-xs')
            datalist = []
            for tr in tablediv.findAll('tr'):
                for td in tr.findAll(['td']):
                    datalist.append(td.get_text())
            # 11 cells per table row
            table = chunks(datalist, 11)
            df = DataFrame(table)  # transfer from list to dataframe
            df.drop(index=[0],
                    inplace=True)  # drop the first row which is average
            df.replace('', 'NaN', inplace=True)  # replace empty value as NaN
            df.replace(' - ', 'NaN', inplace=True)
            df.columns = [
                'WatchPoint', 'AQI', 'Level', 'PrimaryPollution', 'PM2.5',
                'PM10', 'CO', 'NO2', 'O3_1h', 'O3_8h', 'SO2'
            ]  # change head
            col_name = df.columns.tolist()  # add time to the dataframe head
            col_name.insert(0, 'Date')
            col_name.insert(1, 'Hour')
            df = df.reindex(columns=col_name)  # reindex the dataframe head
            dateList = [date for x in range(0, (df.iloc[:, 0].size))
                        ]  # create time list
            hourList = [hour for x in range(0, (df.iloc[:, 0].size))]
            df['Date'] = dateList  # add time to the dataframe
            df['Hour'] = hourList
            csvName = 'data/' + cityname + '.csv'  # file name
            if os.path.exists(
                    csvName) == True:  # write csv with tablehead if no existing file
                df.to_csv(csvName, index=False, mode='a+', header=False)
            else:
                df.to_csv(csvName, index=False, mode='a+')
            print('Get data of ' + cityname)
            flag = False
            # return df
        except BaseException:
            try_times += 1
            if try_times > 20:  # Max numbers of trying new ip is 20
                print('Proxy IP failed')
                return -1
                # NOTE(review): unreachable after return — kept as-is
                flag = False
            else:
                flag = True
def test_replace_truthy(self): df = DataFrame({'a': [True, True]}) r = df.replace([np.inf, -np.inf], np.nan) e = df assert_frame_equal(r, e)
def test_replace_bool_with_bool(self): df = DataFrame(np.random.rand(2, 2) > 0.5) result = df.replace(False, True) expected = DataFrame(np.ones((2, 2), dtype=bool)) assert_frame_equal(result, expected)
def pipe_rename_vaccines(self, df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with values mapped through ``self.vaccine_mapping``."""
    renamed = df.replace(self.vaccine_mapping)
    return renamed
def build_dataset(job_id: str,
                  data_frame: pd.DataFrame,
                  source: Optional[str] = const.DEFAULT_SOURCE) -> List[dict]:
    """
    Build a dataset from the dataframe.

    :param job_id: The dataset id where data should be uploaded.
    :type job_id: str
    :param data_frame: The dataframe to upload.
    :type data_frame: pd.DataFrame
    :param source: The source of the data_frame.
    :type source: Optional[str]
    :return: The job-id where the data was uploaded.
    :rtype: int
    """
    dataset = []
    logger.debug(f"Pushing {len(data_frame)} items to {job_id=}")
    # Normalise missing values: any NA marker -> np.nan -> None, so the
    # payload serialises as JSON null.
    data_frame.fillna(np.nan, inplace=True)
    data_frame.replace([np.nan], [None], inplace=True)
    for _, row in tqdm(
            data_frame.iterrows(),
            total=len(data_frame),
            desc="Building a dataset for uploading safely.",
    ):
        # Dedupe key combines conversation and call identifiers.
        dedupe_id = "_".join(
            [row[const.CONVERSATION_UUID], row[const.CALL_UUID]])
        errors = []
        if const.RAW in data_frame.columns:
            data = json.loads(row[const.RAW])
        else:
            data = row.to_dict()
        alternatives = data.get(const.UTTERANCES, [])
        # Alternatives may arrive as a JSON string or already decoded.
        try:
            alternatives = json.loads(alternatives) if isinstance(
                alternatives, str) else alternatives
        except json.JSONDecodeError:
            raise ValueError(
                f"{alternatives} of type {type(alternatives)} is not a valid json string."
            )
        data_point = {
            const.PRIORITY: 1,
            const.DATA_SOURCE: source,
            const.DATA_ID: dedupe_id,
            const.DATA: {
                **data,
                const.CALL_UUID: str(row[const.CALL_UUID]),
                const.CONVERSATION_UUID: str(row[const.CONVERSATION_UUID]),
                const.ALTERNATIVES: alternatives,
            },
            const.IS_GOLD: False,
        }
        # Schema-validate each point; invalid points are collected, and
        # the whole build aborts if more than half the rows are invalid.
        try:
            jsonschema.validate(data_point[const.DATA],
                                const.UPLOAD_DATASET_SCHEMA)
            dataset.append(data_point)
        except jsonschema.exceptions.ValidationError as e:
            errors.append(e)
        if len(errors) > len(data_frame) * 0.5:
            raise RuntimeError(f"Too many errors: {errors}")
    return dataset
def replace_qm(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise '?'-style missing-value markers to None (mutates ``df`` in place)."""
    missing_markers = {"?": None, " ?": None}
    df.replace(missing_markers, inplace=True)
    return df
wine=[row[4] for row in reader] wine_result=json.dumps(wine) id=[row[0] for row in reader]'''
# Read the file with pandas (original comment: 用pd来读取文件).
reader = pd.read_csv('ttt.csv', header=None)
print(reader)
# One single-column frame per variable of interest.
hypertension = DataFrame(reader[0])
low = DataFrame(reader[1])
sleep = DataFrame(reader[2])
wine = DataFrame(reader[3])
# shape[0] is the row count, shape[1] the column count (original: 获得行数【0】,列数为【1】).
nrows = reader.shape[0] - 1
# Keep everything from the second row on (original: 获取记录).
record = reader[1:]
print(record)
# Drop rows containing NaN (original: 删去含有NAN的项).
clean_all = record.dropna()
# NOTE(review): despite the name, shape[1] is the COLUMN count — confirm
# whether shape[0] (row count) was intended here.
cl_all_nrows = clean_all.shape[1]
#print(cl_all_nrows)
#print(clean_all)
print(sleep)
y_axis = [1, 5, 10, 15]
#p1 = plt.bar(left=0,bottom=low , width=y, color='yellow',height=0.5, orientation='horizontal')
x_axis = tuple(sleep.keys())
plt.bar(x_axis, y_axis, color='rgb')
plt.show()
# Recode the numeric categories into labelled buckets per variable.
low = low.replace([0, 1, 2, 3], ['b0', 'b1', 'b2', 'b3'])
hypertension = hypertension.replace([0, 1, 2], ['a0', 'a1', 'a2'])
sleep = sleep.replace([0, 1, 2, 3, 4], ['c0', 'c1', 'c2', 'c3', 'c4'])
wine = wine.replace([0, 1, 2, 3], ['d0', 'd1', 'd2', 'd3'])
""" Replace """ ser1 = Series([1, 2, 3, 4, 1, 2, 3, 4]) # using replace when can select which value I want to replace # replace, with ser1.replace(1, np.nan) # lists can also be passed ser1.replace([1, 4], [100, 400]) # can also pass a dictionary ser1.replace({4: np.nan}) dframe.replace({'Alma': 'SEA'}) """ Rename """ dframe = DataFrame(np.arange(12).reshape(3, 4), index=['NY', 'LA', 'SF'], columns=['A', 'B', 'C', 'D']) dframe.index.map(str.lower) # makes the index lower case dframe.index = dframe.index.map(str.lower) # rename dframe.rename(index=str.title, columns=str.lower) # passing dictionaries for index and columns to rename them dframe.rename(index={'ny': 'New York'}, columns={'A': 'Alpha'})
'honey ham': 'pig', 'nova lox': 'salmon' }
# NOTE(review): the opening of this dict literal (`meat_to_animal = {` or
# similar) lies before this chunk; the fragment above closes it.
# The map method on a Series accepts a function or dict-like object containing
# a mapping, but here we have a small problem in that some of the meats above
# are capitalized and others are not. Convert each value to lower case:
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print data  # NOTE(review): Python 2 print statements throughout this script
# We could also have passed a function that does all the work:
print data['food'].map(lambda x: meat_to_animal[x.lower()])
# Using map is a convenient way to perform element-wise transformations and
# other data cleaning-related operations.
# Replacing Values - Filling in missing data with the fillna method can be
# thought of as a special case of more general value replacement. While map,
# as you've seen above, can be used to modify a subset of values in an object,
# replace provides a simpler and more flexible way to do so.
# Consider this series:
data = Series([1., -999., 2., -999., -1000., 3.])
print data
# The -999 values might be sentinel values for missing data. To replace these
# with NA values that pandas understands we can use replace, producing a new
# Series:
print data.replace(-999., np.nan)
# If you want to replace multiple values at once, you instead pass a list then
# the substitute value:
print data.replace([-999., -1000.], np.nan)
# To use a different replacement for each value, pass a list of substitutes:
print data.replace([-999., -1000.], [np.nan, 0])
# The argument passed can also be a dict:
print data.replace({-999.: np.nan, -1000.: 0})
# Renaming Axis Indexes
# Like values in a Series, axis labels can be similarly transformed by a
# function or mapping of some form to produce new, differently labeled objects.
# The axes can also be modified in place without creating a new data structure:
data = DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])
# Like a Series, the axis indexes have a map method:
print data.index.map(str.upper)
# You can assign to index, modifying the DataFrame in place:
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    """Build a widget plotting the absolute percentage error of production
    predictions over time (or over the index when no datetime column exists).

    Sets ``self.wi`` to the widget info, or to None when production data or
    the target/prediction columns are missing.
    """
    # Resolve column roles either from the explicit mapping or by convention
    # from the reference frame's column names.
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            # Keep only mapped features that are actually numeric.
            num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            # NOTE(review): categorical features are also filtered with
            # is_numeric_dtype — confirm this is intended and not a copy-paste
            # from the numeric branch.
            cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
        # NOTE(review): np.object is removed in NumPy >= 1.24; plain `object`
        # behaves identically here.
        cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns))
    if production_data is not None:
        if target_column is not None and prediction_column is not None:
            # Drop non-finite rows in place before computing errors.
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)
            #plot output correlations
            abs_perc_error_time = go.Figure()
            # 100 * |target - prediction| / target, per row.
            # NOTE(review): raises ZeroDivisionError when a target value is 0 —
            # confirm targets are guaranteed non-zero.
            abs_perc_error = list(map(lambda x : 100*abs(x[0] - x[1])/x[0], zip(production_data[target_column], production_data[prediction_column])))
            error_trace = go.Scatter(
                x = production_data[date_column] if date_column else production_data.index,
                y = abs_perc_error,
                mode = 'lines',
                name = 'Absolute Percentage Error',
                marker=dict(
                    size=6,
                    color=red
                )
            )
            # Flat zero baseline for visual reference.
            zero_trace = go.Scatter(
                x = production_data[date_column] if date_column else production_data.index,
                y = [0]*production_data.shape[0],
                mode = 'lines',
                opacity=0.5,
                marker=dict(
                    size=6,
                    color='green',
                ),
                showlegend=False,
            )
            abs_perc_error_time.add_trace(error_trace)
            abs_perc_error_time.add_trace(zero_trace)
            abs_perc_error_time.update_layout(
                xaxis_title = "Timestamp" if date_column else "Index",
                yaxis_title = "Percent",
                legend = dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02,
                    xanchor="right",
                    x=1
                )
            )
            abs_perc_error_time_json = json.loads(abs_perc_error_time.to_json())
            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": abs_perc_error_time_json['data'],
                    "layout": abs_perc_error_time_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
    else:
        self.wi = None
def test_replace_datetimetz(self):
    # GH 11326
    # behaving poorly when presented with a datetime64[ns, tz]
    df = DataFrame({
        'A': date_range('20130101', periods=3, tz='US/Eastern'),
        'B': [0, np.nan, 2]
    })
    # replace/fillna on the non-tz column must leave the tz column untouched
    result = df.replace(np.nan, 1)
    expected = DataFrame({
        'A': date_range('20130101', periods=3, tz='US/Eastern'),
        'B': Series([0, 1, 2], dtype='float64')
    })
    assert_frame_equal(result, expected)
    result = df.fillna(1)
    assert_frame_equal(result, expected)
    result = df.replace(0, np.nan)
    expected = DataFrame({
        'A': date_range('20130101', periods=3, tz='US/Eastern'),
        'B': [np.nan, np.nan, 2]
    })
    assert_frame_equal(result, expected)
    # replacing one tz-aware timestamp with another of the SAME tz keeps dtype
    result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                        Timestamp('20130104', tz='US/Eastern'))
    expected = DataFrame({
        'A': [
            Timestamp('20130101', tz='US/Eastern'),
            Timestamp('20130104', tz='US/Eastern'),
            Timestamp('20130103', tz='US/Eastern')
        ],
        'B': [0, np.nan, 2]
    })
    assert_frame_equal(result, expected)
    # NaT replaced via a per-column dict behaves like the scalar replace above
    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern'))
    assert_frame_equal(result, expected)
    # coerce to object
    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific'))
    expected = DataFrame({
        'A': [
            Timestamp('20130101', tz='US/Eastern'),
            Timestamp('20130104', tz='US/Pacific'),
            Timestamp('20130103', tz='US/Eastern')
        ],
        'B': [0, np.nan, 2]
    })
    assert_frame_equal(result, expected)
    # replacing with a tz-NAIVE timestamp also coerces the column to object
    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({'A': np.nan}, Timestamp('20130104'))
    expected = DataFrame({
        'A': [
            Timestamp('20130101', tz='US/Eastern'),
            Timestamp('20130104'),
            Timestamp('20130103', tz='US/Eastern')
        ],
        'B': [0, np.nan, 2]
    })
    assert_frame_equal(result, expected)
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with string cells lower-cased and stripped, and NaNs
    replaced by empty strings."""
    def _normalise(cell):
        # Only touch strings; leave every other value untouched.
        return cell.lower().strip() if isinstance(cell, str) else cell

    cleaned = df.applymap(_normalise)
    return cleaned.replace(np.nan, '', regex=True)
def test_replace_int_to_int_chain(self):
    # A chained replacement map (1->2, 2->3, ...) is ambiguous and must raise.
    chained = {k: k + 1 for k in range(1, 5)}
    df = DataFrame({'a': lrange(1, 5)})
    with tm.assert_raises_regex(ValueError, "Replacement not allowed .+"):
        df.replace({'a': chained})
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    """Build a precision/recall-at-top-k% table widget for classification.

    Binary problems get a single table; multiclass problems get one tab per
    class label. Sets ``self.wi``, or None when the needed data is missing.
    """
    # Resolve column roles from the explicit mapping, or by convention.
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        #target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [
                name for name in num_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [
                name for name in cat_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [
            date_column, id_column, target_column, prediction_column
        ]
        num_feature_names = list(
            set(reference_data.select_dtypes([np.number]).columns) -
            set(utility_columns))
        cat_feature_names = list(
            set(reference_data.select_dtypes([np.object]).columns) -
            set(utility_columns))
        #target_names = None
    if production_data is not None and target_column is not None and prediction_column is not None:
        # Drop non-finite rows in place before ranking predictions.
        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)
        #array_prediction = reference_data[prediction_column].to_numpy()
        #prediction_ids = np.argmax(array_prediction, axis=-1)
        #prediction_labels = [prediction_column[x] for x in prediction_ids]
        # NOTE(review): `prediction_column` is treated as a LIST of per-class
        # probability column names below (len(), indexing) — confirm the
        # mapping supplies a list for classification problems.
        if len(prediction_column) <= 2:
            # Binary case: rank by the single probability column.
            binaraizer = preprocessing.LabelBinarizer()
            binaraizer.fit(reference_data[target_column])
            binaraized_target = pd.DataFrame(
                binaraizer.transform(production_data[target_column]))
            binaraized_target.columns = ['target']
            params_data = []
            step_size = 0.05
            # (true_label, probability) pairs, sorted by descending probability.
            binded = list(
                zip(binaraized_target['target'].tolist(),
                    production_data[prediction_column[0]].tolist()))
            binded.sort(key=lambda item: item[1], reverse=True)
            data_size = len(binded)
            target_class_size = sum([x[0] for x in binded])
            #result = pd.DataFrame(columns = ['Top(%)', 'Count', 'TP', 'FP', 'precision', 'recall'])
            # Walk the ranked list in 5% increments, computing precision/recall
            # for each top-k prefix.
            offset = max(round(data_size * step_size), 1)
            for step in np.arange(offset, data_size + offset, offset):
                count = min(step, data_size)
                prob = round(binded[min(step, data_size - 1)][1], 2)
                top = round(100.0 * min(step, data_size) / data_size, 1)
                tp = sum([x[0] for x in binded[:min(step, data_size)]])
                fp = count - tp
                precision = round(100.0 * tp / count, 1)
                recall = round(100.0 * tp / target_class_size, 1)
                params_data.append({
                    'f1': float(top),
                    'f2': int(count),
                    'f3': float(prob),
                    'f4': int(tp),
                    'f5': int(fp),
                    'f6': float(precision),
                    'f7': float(recall)
                })
            self.wi = BaseWidgetInfo(title=self.title,
                                     type="big_table",
                                     details="",
                                     alertStats=AlertStats(),
                                     alerts=[],
                                     alertsPosition="row",
                                     insights=[],
                                     size=1,
                                     params={
                                         "rowsPerPage": 21,
                                         "columns": [{
                                             "title": "Top(%)",
                                             "field": "f1",
                                             "sort": "asc"
                                         }, {
                                             "title": "Count",
                                             "field": "f2",
                                         }, {
                                             "title": "Prob",
                                             "field": "f3",
                                         }, {
                                             "title": "TP",
                                             "field": "f4"
                                         }, {
                                             "title": "FP",
                                             "field": "f5"
                                         }, {
                                             "title": "Precision",
                                             "field": "f6"
                                         }, {
                                             "title": "Recall",
                                             "field": "f7"
                                         }],
                                         "data": params_data
                                     },
                                     additionalGraphs=[])
        else:
            # Multiclass case: one-vs-rest table per class label, as tabs.
            binaraizer = preprocessing.LabelBinarizer()
            binaraizer.fit(reference_data[target_column])
            binaraized_target = pd.DataFrame(
                binaraizer.transform(production_data[target_column]))
            binaraized_target.columns = prediction_column
            #create tables
            tabs = []
            for label in prediction_column:
                params_data = []
                step_size = 0.05
                binded = list(
                    zip(binaraized_target[label].tolist(),
                        production_data[label].tolist()))
                binded.sort(key=lambda item: item[1], reverse=True)
                data_size = len(binded)
                target_class_size = sum([x[0] for x in binded])
                #result = pd.DataFrame(columns = ['Top(%)', 'Count', 'TP', 'FP', 'precision', 'recall'])
                offset = max(round(data_size * step_size), 1)
                for step in np.arange(offset, data_size + offset, offset):
                    count = min(step, data_size)
                    prob = round(binded[min(step, data_size - 1)][1], 2)
                    top = round(100.0 * min(step, data_size) / data_size, 1)
                    tp = sum([x[0] for x in binded[:min(step, data_size)]])
                    fp = count - tp
                    precision = round(100.0 * tp / count, 1)
                    recall = round(100.0 * tp / target_class_size, 1)
                    params_data.append({
                        'f1': float(top),
                        'f2': int(count),
                        'f3': float(prob),
                        'f4': int(tp),
                        'f5': int(fp),
                        'f6': float(precision),
                        'f7': float(recall)
                    })
                tabs.append(
                    TabInfo(id=label,
                            title=label,
                            widget=BaseWidgetInfo(title="",
                                                  type="big_table",
                                                  details="",
                                                  alertStats=AlertStats(),
                                                  alerts=[],
                                                  alertsPosition="row",
                                                  insights=[],
                                                  size=2,
                                                  params={
                                                      "rowsPerPage": 21,
                                                      "columns": [{
                                                          "title": "Top(%)",
                                                          "field": "f1",
                                                          "sort": "asc"
                                                      }, {
                                                          "title": "Count",
                                                          "field": "f2",
                                                      }, {
                                                          "title": "Prob",
                                                          "field": "f3",
                                                      }, {
                                                          "title": "TP",
                                                          "field": "f4"
                                                      }, {
                                                          "title": "FP",
                                                          "field": "f5"
                                                      }, {
                                                          "title": "Precision",
                                                          "field": "f6"
                                                      }, {
                                                          "title": "Recall",
                                                          "field": "f7"
                                                      }],
                                                      "data": params_data
                                                  },
                                                  additionalGraphs=[])))
            self.wi = BaseWidgetInfo(type="tabs",
                                     title=self.title,
                                     size=1,
                                     details="",
                                     tabs=tabs)
    else:
        self.wi = None
def test_replace_with_dict_with_bool_keys(self):
    # A replace dict mixing bool and string keys cannot be compared and must raise.
    frame = DataFrame({0: [True, False], 1: [False, True]})
    with tm.assert_raises_regex(TypeError, 'Cannot compare types .+'):
        frame.replace({'asdf': 'asdb', True: 'yes'})
def test_fillna_dtype_conversion_equiv_replace(self, val): df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]}) expected = df.replace(np.nan, val) result = df.fillna(val) tm.assert_frame_equal(result, expected)
def test_replace_pure_bool_with_string_no_op(self): df = DataFrame(np.random.rand(2, 2) > 0.5) result = df.replace('asdf', 'fdsa') assert_frame_equal(df, result)
def test_replace_mixed(self):
    # Seed NaNs into the shared mixed-dtype fixture frame.
    mf = self.mixed_frame
    mf.iloc[5:20, mf.columns.get_loc('foo')] = nan
    mf.iloc[-10:, mf.columns.get_loc('A')] = nan
    # replace(nan, v) must equal fillna(v), and replacing back must round-trip.
    result = self.mixed_frame.replace(np.nan, -18)
    expected = self.mixed_frame.fillna(value=-18)
    assert_frame_equal(result, expected)
    assert_frame_equal(result.replace(-18, nan), self.mixed_frame)
    result = self.mixed_frame.replace(np.nan, -1e8)
    expected = self.mixed_frame.fillna(value=-1e8)
    assert_frame_equal(result, expected)
    assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)
    # int block upcasting
    df = DataFrame({
        'A': Series([1.0, 2.0], dtype='float64'),
        'B': Series([0, 1], dtype='int64')
    })
    expected = DataFrame({
        'A': Series([1.0, 2.0], dtype='float64'),
        'B': Series([0.5, 1], dtype='float64')
    })
    result = df.replace(0, 0.5)
    assert_frame_equal(result, expected)
    df.replace(0, 0.5, inplace=True)
    assert_frame_equal(df, expected)
    # int block splitting
    df = DataFrame({
        'A': Series([1.0, 2.0], dtype='float64'),
        'B': Series([0, 1], dtype='int64'),
        'C': Series([1, 2], dtype='int64')
    })
    expected = DataFrame({
        'A': Series([1.0, 2.0], dtype='float64'),
        'B': Series([0.5, 1], dtype='float64'),
        'C': Series([1, 2], dtype='int64')
    })
    result = df.replace(0, 0.5)
    assert_frame_equal(result, expected)
    # to object block upcasting
    df = DataFrame({
        'A': Series([1.0, 2.0], dtype='float64'),
        'B': Series([0, 1], dtype='int64')
    })
    expected = DataFrame({
        'A': Series([1, 'foo'], dtype='object'),
        'B': Series([0, 1], dtype='int64')
    })
    result = df.replace(2, 'foo')
    assert_frame_equal(result, expected)
    expected = DataFrame({
        'A': Series(['foo', 'bar'], dtype='object'),
        'B': Series([0, 'foo'], dtype='object')
    })
    result = df.replace([1, 2], ['foo', 'bar'])
    assert_frame_equal(result, expected)
    # test case from
    df = DataFrame({
        'A': Series([3, 0], dtype='int64'),
        'B': Series([0, 3], dtype='int64')
    })
    # dict value: each column's 3 is replaced by that column's own mean
    result = df.replace(3, df.mean().to_dict())
    expected = df.copy().astype('float64')
    m = df.mean()
    expected.iloc[0, 0] = m[0]
    expected.iloc[1, 1] = m[1]
    assert_frame_equal(result, expected)
def to_sql(
    df: pd.DataFrame,
    table_name: str,
    creds: SqlCreds,
    sql_type: str = "table",
    schema: str = "dbo",
    index: bool = True,
    if_exists: str = "fail",
    batch_size: int = None,
    debug: bool = False,
    bcp_path: str = None,
):
    """
    Writes the pandas DataFrame to a SQL table or view.

    Will write all columns to the table or view. If the destination table/view
    doesn't exist, will create it. Assumes the SQL table/view has the same
    number, name, and type of columns. To only write parts of the DataFrame,
    filter it beforehand and pass that to this function. Unlike the pandas
    counterpart, if the DataFrame has no rows, nothing will happen.

    Parameters
    ----------
    df : pandas.DataFrame
    table_name : str
        Name of SQL table or view, without the schema
    creds : bcpandas.SqlCreds
        The credentials used in the SQL database.
    sql_type : {'table'}, can only be 'table'
        The type of SQL object of the destination.
    schema : str, default 'dbo'
        The SQL schema.
    index : bool, default True
        Write DataFrame index as a column. Uses the index name as the column
        name in the table.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.
        * fail: Raise a BCPandasValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values to the existing table. Matches the
          dataframe columns to the database columns by name. If the database
          table exists then the dataframe cannot have new columns that aren't
          in the table, but conversely table columns can be missing from the
          dataframe.
    batch_size : int, optional
        Rows will be written in batches of this size at a time. By default,
        BCP sets this to 1000.
    debug : bool, default False
        If True, will not delete the temporary CSV and format files, and will
        output their location.
    bcp_path : str, default None
        The full path to the BCP utility, useful if it is not in the PATH
        environment variable
    """
    # validation
    if df.shape[0] == 0 or df.shape[1] == 0:
        return
    assert sql_type == TABLE, "only supporting table, not view, for now"
    assert if_exists in IF_EXISTS_OPTIONS
    if df.columns.has_duplicates:
        raise BCPandasValueError(
            "Columns with duplicate names detected, SQL requires that column names be unique. "
            f"Duplicates: {df.columns[df.columns.duplicated(keep=False)]}")
    # TODO diff way to implement? could be big performance hit with big dataframe
    if index:
        # Promote the index to a regular column so BCP sees it as data.
        df = df.copy(deep=True).reset_index()
    delim = get_delimiter(df)
    quotechar = get_quotechar(df)
    if batch_size is not None:
        if batch_size == 0:
            raise BCPandasValueError("Param batch_size can't be 0")
        if batch_size > df.shape[0]:
            raise BCPandasValueError(
                "Param batch_size can't be larger than the number of rows in the DataFrame"
            )
    # save to temp path
    csv_file_path = get_temp_file()
    # replace bools with 1 or 0, this is what pandas native does when writing to SQL Server
    df.replace({
        True: 1,
        False: 0
    }).to_csv(
        path_or_buf=csv_file_path,
        sep=delim,
        header=False,
        index=False,  # already set as new col earlier if index=True
        quoting=csv.QUOTE_MINIMAL,  # pandas default
        quotechar=quotechar,
        line_terminator=NEWLINE,
        doublequote=True,
        escapechar=None,  # not needed, as using doublequote
    )
    logger.debug(f"Saved dataframe to temp CSV file at {csv_file_path}")
    # build format file
    fmt_file_path = get_temp_file()
    sql_item_exists = _sql_item_exists(sql_type=sql_type, schema=schema, table_name=table_name, creds=creds)
    cols_dict = None  # for mypy
    if if_exists == "append":
        # get dict of column names -> order of column
        # NOTE(review): schema/table are interpolated into the SQL with
        # str.format — safe only if they come from trusted callers; consider
        # parameterizing if that assumption ever changes.
        cols_dict = dict(
            pd.read_sql_query(
                """
                SELECT COLUMN_NAME, ORDINAL_POSITION
                FROM INFORMATION_SCHEMA.COLUMNS
                WHERE TABLE_SCHEMA = '{_schema}'
                AND TABLE_NAME = '{_tbl}'
                """.format(_schema=schema, _tbl=table_name),
                creds.engine,
            ).values)
        # check that column names match in db and dataframe exactly
        if sql_item_exists:
            # the db cols are always strings, unlike df cols
            extra_cols = [
                str(x) for x in df.columns if str(x) not in cols_dict.keys()
            ]
            if extra_cols:
                raise BCPandasValueError(
                    f"Column(s) detected in the dataframe that are not in the database, "
                    f"cannot have new columns if `if_exists=='append'`, "
                    f"the extra column(s): {extra_cols}")
    fmt_file_txt = build_format_file(df=df, delimiter=delim, db_cols_order=cols_dict)
    with open(fmt_file_path, "w") as ff:
        ff.write(fmt_file_txt)
    logger.debug(f"Created BCP format file at {fmt_file_path}")
    try:
        # Create/verify the destination table according to the if_exists policy.
        if if_exists == "fail":
            if sql_item_exists:
                raise BCPandasValueError(
                    f"The {sql_type} called {schema}.{table_name} already exists, "
                    f"`if_exists` param was set to `fail`.")
            else:
                _create_table(schema=schema, table_name=table_name, creds=creds, df=df, if_exists=if_exists)
        elif if_exists == "replace":
            _create_table(schema=schema, table_name=table_name, creds=creds, df=df, if_exists=if_exists)
        elif if_exists == "append":
            if not sql_item_exists:
                _create_table(schema=schema, table_name=table_name, creds=creds, df=df, if_exists=if_exists)
        # BCP the data in
        bcp(
            sql_item=table_name,
            direction=IN,
            flat_file=csv_file_path,
            format_file_path=fmt_file_path,
            creds=creds,
            sql_type=sql_type,
            schema=schema,
            batch_size=batch_size,
            bcp_path=bcp_path,
        )
    finally:
        # Always clean up the temp files unless explicitly debugging.
        if not debug:
            logger.debug(f"Deleting temp CSV and format files")
            os.remove(csv_file_path)
            os.remove(fmt_file_path)
        else:
            logger.debug(
                f"`to_sql` DEBUG mode, not deleting the files. CSV file is at "
                f"{csv_file_path}, format file is at {fmt_file_path}")
def test_replace_regex_metachar(self, metachar): df = DataFrame({'a': [metachar, 'else']}) result = df.replace({'a': {metachar: 'paren'}}) expected = DataFrame({'a': ['paren', 'else']}) assert_frame_equal(result, expected)
def generate_barchart_for_incomegroup_distribution(idf,startyear = None, stopyear = datetime.now().year, targetdir = "results/dataoverview/", basefilename = "incomegroups.png", filterzerocommitment = True, valuename="USD_Commitment_Defl", incomegroups=['LDCs','LMICs','UMICs'], figsize=(30,56)):
    """
    generates six barchart-graphics showing the distribution of commitments and
    projectnumber among the different incomegroups (LDCs,LICs...) over time.
    also creates the json-files of the aggregated data.

    NOTE(review): `stopyear`'s default is evaluated once at import time, and
    `incomegroups` is a mutable default list — both are conventionally
    replaced with a None default resolved inside the function; confirm before
    changing the signature.

    @param idf (DataFrame): the inputDataFrame that shall be used as base for
        the images and json-files
    @param startyear (int): select a specific year as start. for example 2011
        to get graphs beginning at 1-1-2011 up to the end of the data
    @param stopyear: select a specific year to end. for example 2018 to get
        graphs including data up to 31-12-2018
    @param targetdir: where to store the results. will be created if needed
    @param filterzerocommitment: about 50% of the raw-data have no amount for
        the attribute 'USD_Commitment_Defl'; per default those datapoints will
        be ignored
    @param valuename: defaults to USD_Commitment_Defl
    @param basefilename: the filename and format (based on extension) of the
        resulting image
    @incomegroups: only consider the listed incomegroups. expects an array of
        IncomegroupNames. There are: LDCs, LMICs, MADCTs, Other LICs,
        Part I unallocated by income, UMICs. per default only LDCs, LMICs and
        UMICs are taken into account. with None or empty array every group is
        considered
    """
    all_incomegroups = ["LDCs","LMICs","MADCTs","Other LICs","Part I unallocated by income", "UMICs"]
    nrows = 8
    fig, axes = plt.subplots(nrows=nrows, ncols=1,figsize=figsize)
    # Work on a narrow copy with only the columns we aggregate on.
    df = DataFrame()
    for i in ['CommitmentDate','IncomegroupName',valuename]:
        df[i] = idf[i]
    # Restrict the date window and encode the window into the target directory.
    if startyear:
        df = df[df['CommitmentDate'] > datetime(year=startyear-1,month=12,day=31)]
        targetdir = targetdir + "from_" + str(startyear) + "_"
    if stopyear:
        df = df[df['CommitmentDate'] < datetime(year=stopyear+1,month=1,day=1)]
        targetdir = targetdir + "upto_" + str(stopyear)
    if startyear or stopyear:
        targetdir = targetdir + "/"
    os.makedirs(targetdir,exist_ok=True)
    if filterzerocommitment:
        df = df[df[valuename].notnull()]
        df = df[df[valuename] != 0.0]
    if type(incomegroups) == type([]) and len(incomegroups) > 0:
        df = df[df['IncomegroupName'].isin(incomegroups)]
    # group by year and IncomegroupName
    df = df.set_index("CommitmentDate")
    groupeddf = df.groupby([Grouper(freq="A"), 'IncomegroupName'])[valuename]
    # resolve grouping, unstack, fill missing and reset_index()
    ddf = groupeddf.sum().unstack().fillna(0.0).reset_index()
    # get a percent view in a new DataFrame
    # NOTE(review): the bare `except ... pass` below silently skips income
    # groups absent from the data — intentional best-effort, but it would also
    # mask unrelated errors.
    df2 = DataFrame()
    for i in all_incomegroups:
        try:
            df2[i]=ddf[i]
        except Exception as e:
            pass
    df2['CommitmentYear'] = ddf.reset_index()['CommitmentDate'].apply(lambda x: str(x.year))
    df2=df2.set_index('CommitmentYear')
    countrytypesum = DataFrame(df2.T.sum())
    for i in all_incomegroups:
        try:
            df2[i] = (df2[i] / countrytypesum[0]) * 100
        except Exception as e:
            pass
    # Years with a zero total would divide to inf — neutralise them.
    df2.replace([np.inf, -np.inf], np.nan, inplace=True)
    df2.plot(width=0.9,grid=True,kind='bar', ax=axes[1],title="mUSD per IncomeGroup (Prozent)")
    df2.plot(width=0.9,grid=True,kind='bar', stacked=True, ax=axes[2],title="mUSD per IncomeGroup (Prozent)")
    with open("%s%s-sum-percent.json" %(targetdir,basefilename),"w") as fd:
        fd.write(df2.to_json(orient="index"))
    # an absolut view
    # create index on year as string and drop old index
    ddf["CommitmentYear"]=ddf["CommitmentDate"].apply(lambda x: str(x.year))
    ddf = ddf.drop(columns=["CommitmentDate"]).set_index("CommitmentYear")
    # plot absolut sum
    ddf.plot(width=0.9,grid=True,kind='bar',ax=axes[0], title="mUSD per IncomeGroup (sum)")
    with open("%s%s-sum-absolut.json" %(targetdir,basefilename),"w") as fd:
        fd.write(ddf.to_json(orient="index"))
    # create bar-char with mean
    ddf = groupeddf.mean().unstack().fillna(0.0).reset_index()
    # create index on year as string and drop old index
    ddf["CommitmentYear"]=ddf["CommitmentDate"].apply(lambda x: str(x.year))
    ddf = ddf.drop(columns=["CommitmentDate"]).set_index("CommitmentYear")
    ddf.plot(width=0.9,grid=True,kind='bar',ax=axes[3], title="mUSD per IncomeGroup (mean)")
    with open("%s%s-mean.json" %(targetdir,basefilename),"w") as fd:
        fd.write(ddf.to_json(orient="index"))
    # create bar-char with median
    ddf = groupeddf.median().unstack().fillna(0.0).reset_index()
    # create index on year as string and drop old index
    ddf["CommitmentYear"]=ddf["CommitmentDate"].apply(lambda x: str(x.year))
    ddf = ddf.drop(columns=["CommitmentDate"]).set_index("CommitmentYear")
    ddf.plot(width=0.9,grid=True,kind='bar',ax=axes[4], title="mUSD per IncomeGroup (median)")
    with open("%s%s-median.json" %(targetdir,basefilename),"w") as fd:
        fd.write(ddf.to_json(orient="index"))
    # Project counts per group, again as percent and absolute views.
    ddf = groupeddf.count().unstack().fillna(0.0).reset_index()
    # get a percent view in a new DataFrame
    df2 = DataFrame()
    for i in all_incomegroups:
        try:
            df2[i]=ddf[i]
        except Exception as e:
            pass
    df2['CommitmentYear'] = ddf.reset_index()['CommitmentDate'].apply(lambda x: str(x.year))
    df2=df2.set_index('CommitmentYear')
    countrytypesum = DataFrame(df2.T.sum())
    for i in all_incomegroups:
        try:
            df2[i] = (df2[i] / countrytypesum[0]) * 100
        except Exception as e:
            pass
    df2.replace([np.inf, -np.inf], np.nan, inplace=True)
    df2.plot(width=0.9,grid=True,kind='bar', ax=axes[6],title="projects per IncomeGroup (Prozent)")
    df2.plot(width=0.9,grid=True,kind='bar', stacked=True,ax=axes[7],title="projects per IncomeGroup (Prozent)")
    with open("%s%s-count-percent.json" %(targetdir,basefilename),"w") as fd:
        fd.write(df2.to_json(orient="index"))
    # create index on year as string and drop old index
    ddf["CommitmentYear"]=ddf["CommitmentDate"].apply(lambda x: str(x.year))
    ddf = ddf.drop(columns=["CommitmentDate"]).set_index("CommitmentYear")
    ddf.plot(width=0.9,grid=True,kind='bar',ax=axes[5], title="projects per IncomeGroup (count)")
    with open("%s%s-count-absolut.json" %(targetdir,basefilename),"w") as fd:
        fd.write(ddf.to_json(orient="index"))
    plt.savefig(targetdir+basefilename,bbox_inches='tight')
    plt.close(plt.gcf())
def add_custom_ta_features(
    df: pd.DataFrame,
    open: str,  # noqa
    high: str,
    low: str,
    close: str,
    fillna: bool = False,
    colprefix: str = "",
    apply_pct: bool = False,
    plot: bool = False,
) -> pd.DataFrame:
    """Append volatility, trend, and 'other' technical-analysis features to ``df``.

    :param df: OHLC price frame (must contain the named columns below).
    :param open: name of the open-price column (shadows builtin; kept for API compat).
    :param high: name of the high-price column.
    :param low: name of the low-price column.
    :param close: name of the close-price column.
    :param fillna: if True, fill remaining NaNs with 0 at the end.
    :param colprefix: prefix applied to every generated feature column.
    :param apply_pct: if True, convert all columns to percent change, rounded
        to 5 decimals as float32.
    :param plot: if True, show a 5-row diagnostic figure of the key features.
    :return: the augmented dataframe.
    """
    # Add Volatility TA
    df = ta.add_volatility_ta(df=df, high=high, low=low, close=close, fillna=fillna, colprefix=colprefix)
    # Add Trend TA
    df = ta.add_trend_ta(df=df, high=high, low=low, close=close, fillna=fillna, colprefix=colprefix)
    # Add Other TA
    df = ta.add_others_ta(df=df, close=close, fillna=fillna, colprefix=colprefix)
    # convert to pct
    if apply_pct:
        df = df.pct_change(fill_method='ffill')
        df = df.applymap(lambda x: x * 100)
        # First row of pct_change and any zero-denominator cells become inf/NaN.
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        # BUG FIX: `astype` returns a new frame; the original discarded the
        # result so the float32 downcast never took effect.
        df = df.astype(np.float32)
        df = df.round(5)
    if fillna:
        df.fillna(value=0, inplace=True)
    if plot:
        # NOTE(review): `symbol` is not defined in this function — presumably a
        # module-level global; confirm before relying on plot=True.
        fig = make_subplots(rows=5, cols=1, shared_xaxes=True, vertical_spacing=0.02,
                            subplot_titles=("Close", "Bollinger Bands", "MACD"))
        fig.add_trace(go.Scatter(x=df.index, y=df['close'], name=symbol), row=1, col=1)
        # Bollinger-Bands
        fig.add_trace(go.Scatter(x=df.index, y=df['close'], name=symbol), row=2, col=1)
        fig.add_trace(go.Scatter(x=df.index, y=df['volatility_bbh'], name=symbol + ' High BB'), row=2, col=1)
        fig.add_trace(go.Scatter(x=df.index, y=df['volatility_bbl'], name=symbol + ' Low BB'), row=2, col=1)
        fig.add_trace(go.Scatter(x=df.index, y=df['volatility_bbm'], name=symbol + ' EMA BB'), row=2, col=1)
        # MACD
        fig.add_trace(go.Scatter(x=df.index, y=df['trend_macd'], name=symbol + ' MACD'), row=3, col=1)
        fig.add_trace(go.Scatter(x=df.index, y=df['trend_macd_signal'], name=symbol + ' MACD Signal'), row=3, col=1)
        fig.add_trace(go.Scatter(x=df.index, y=df['trend_macd_diff'], name=symbol + ' MACD Difference'), row=3, col=1)
        # SMA
        fig.add_trace(go.Scatter(x=df.index, y=df['close'], name=symbol), row=4, col=1)
        fig.add_trace(go.Scatter(x=df.index, y=df['trend_sma_fast'], name=symbol + ' SMA-Fast'), row=4, col=1)
        fig.add_trace(go.Scatter(x=df.index, y=df['trend_sma_slow'], name=symbol + ' SMA-Slow'), row=4, col=1)
        # EMA
        fig.add_trace(go.Scatter(x=df.index, y=df['close'], name=symbol), row=5, col=1)
        fig.add_trace(go.Scatter(x=df.index, y=df['trend_ema_fast'], name=symbol + ' EMA-Fast'), row=5, col=1)
        fig.add_trace(go.Scatter(x=df.index, y=df['trend_ema_slow'], name=symbol + ' EMA-Slow'), row=5, col=1)
        config = {'displayModeBar': False}
        fig.show(config=config)
    return df
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    """Build the regression error-distribution widget into ``self.wi``.

    Plots a percent-normalised histogram of (prediction - target) on
    *reference_data* when both columns are resolvable; otherwise sets
    ``self.wi`` to None. NOTE: mutates *reference_data* in place
    (inf -> NaN, then drops all rows containing NaN).
    """
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            # NOTE(review): filtering *categorical* features with
            # is_numeric_dtype mirrors the numerical branch and looks like a
            # copy-paste slip -- confirm intent (the list is unused below).
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        # No mapping supplied: fall back to conventional column names.
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        # BUG FIX: ``np.object`` was deprecated in NumPy 1.20 and removed in
        # 1.24 (AttributeError); the builtin ``object`` is the documented
        # replacement for dtype selection.
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))
    if target_column is not None and prediction_column is not None:
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)
        # plot output correlations
        error_distr = go.Figure()
        error = reference_data[prediction_column] - reference_data[target_column]
        error_distr.add_trace(go.Histogram(x=error, marker_color=red,
                                           name='error distribution',
                                           histnorm='percent'))
        error_distr.update_layout(
            xaxis_title="Error (Actual - Predicted)",
            yaxis_title="Percentage",
        )
        error_distr_json = json.loads(error_distr.to_json())
        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": error_distr_json['data'],
                "layout": error_distr_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    """Build the classification 'support' bar-chart widget into ``self.wi``.

    Uses sklearn's classification_report on *reference_data* and plots the
    per-class support counts; sets ``self.wi`` to None when target or
    prediction columns are missing. NOTE: mutates *reference_data* in place
    (inf -> NaN, then drops all rows containing NaN).
    """
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [
                name for name in num_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            # NOTE(review): filtering *categorical* features with
            # is_numeric_dtype mirrors the numerical branch and looks like a
            # copy-paste slip -- confirm intent (the list is unused below).
            cat_feature_names = [
                name for name in cat_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
    else:
        # No mapping supplied: fall back to conventional column names.
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [
            date_column, id_column, target_column, prediction_column
        ]
        num_feature_names = list(
            set(reference_data.select_dtypes([np.number]).columns) -
            set(utility_columns))
        # BUG FIX: ``np.object`` was deprecated in NumPy 1.20 and removed in
        # 1.24 (AttributeError); the builtin ``object`` is the documented
        # replacement for dtype selection.
        cat_feature_names = list(
            set(reference_data.select_dtypes([object]).columns) -
            set(utility_columns))
        target_names = None
    if target_column is not None and prediction_column is not None:
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)
        # plot support bar
        metrics_matrix = metrics.classification_report(
            reference_data[target_column],
            reference_data[prediction_column],
            output_dict=True)
        metrics_frame = pd.DataFrame(metrics_matrix)
        # Last frame row is 'support'; the final 3 columns are the aggregate
        # accuracy / macro avg / weighted avg entries, which are excluded.
        support = metrics_frame.iloc[-1:, :-3].values[0]
        fig = go.Figure()
        fig.add_trace(
            go.Bar(x=target_names if target_names else
                   metrics_frame.columns.tolist()[:-3],
                   y=support,
                   marker_color=red,
                   name='Support'))
        fig.update_layout(
            xaxis_title="Class",
            yaxis_title="Number of Objects",
        )
        support_bar_json = json.loads(fig.to_json())
        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            # Half-width when a production panel will sit beside it.
            size=1 if production_data is not None else 2,
            params={
                "data": support_bar_json['data'],
                "layout": support_bar_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    """Build the 'Production: Model Quality (+/- std)' counter widget.

    Computes ME, MAE and MAPE (each with its sample std) on
    *production_data* and stores the counters in ``self.wi``; sets
    ``self.wi`` to None when production data or the target/prediction
    columns are missing. NOTE: mutates *production_data* in place
    (inf -> NaN, then drops all rows containing NaN).
    """
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            # NOTE(review): filtering *categorical* features with
            # is_numeric_dtype mirrors the numerical branch and looks like a
            # copy-paste slip -- confirm intent (the list is unused below).
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        # No mapping supplied: fall back to conventional column names.
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        # BUG FIX: ``np.object`` was deprecated in NumPy 1.20 and removed in
        # 1.24 (AttributeError); the builtin ``object`` is the documented
        # replacement for dtype selection.
        cat_feature_names = list(set(reference_data.select_dtypes([object]).columns)
                                 - set(utility_columns))
    if production_data is not None:
        if target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)
            # Quality metrics, vectorized (numerically identical to the
            # original per-element map/lambda loops).
            error = production_data[prediction_column] - production_data[target_column]
            me = np.mean(error)
            sde = np.std(error, ddof=1)
            abs_err = np.abs(error)
            mae = np.mean(abs_err)
            sdae = np.std(abs_err, ddof=1)
            # NOTE(review): this divides by the *prediction*, whereas MAPE
            # conventionally divides by the actual value; preserved as-is,
            # confirm intent. Also unguarded against zero predictions.
            abs_perc_err = 100 * abs_err / production_data[prediction_column]
            mape = np.mean(abs_perc_err)
            sdape = np.std(abs_perc_err, ddof=1)
            self.wi = BaseWidgetInfo(
                title="Production: Model Quality (+/- std)",
                type="counter",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "counters": [
                        {
                            "value": str(round(me, 2)) + " (" + str(round(sde, 2)) + ")",
                            "label": "ME"
                        },
                        {
                            "value": str(round(mae, 2)) + " (" + str(round(sdae, 2)) + ")",
                            "label": "MAE"
                        },
                        {
                            "value": str(round(mape, 2)) + " (" + str(round(sdape, 2)) + ")",
                            "label": "MAPE"
                        }
                    ]
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
    else:
        self.wi = None
def rename_columns(df_: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw loan frame.

    Three passes: (1) the sentinel -1 becomes NaN everywhere; (2) for the
    columns where zero also means "not provided", 0 becomes NaN; (3) integer
    category codes are mapped to human-readable labels per column.
    The input frame is not modified.
    """
    cleaned = df_.replace(-1, np.nan)

    # Columns where 0 is a "missing" marker rather than a real value.
    zero_means_missing = [
        "Age",
        "Education",
        "MaritalStatus",
        "EmploymentStatus",
        "OccupationArea",
        "CreditScoreEeMini",
    ]
    cleaned[zero_means_missing] = cleaned[zero_means_missing].replace(0.0, np.nan)

    # Per-column code -> label mappings.
    # NOTE(review): the trailing spaces in "Accounts_receivable_financing "
    # and "Guaranteeing_obligation " are reproduced from the source data
    # mapping -- confirm whether they are intentional.
    code_to_label = {
        "UseOfLoan": {
            0: "Loan_consolidation",
            1: "Real_estate",
            2: "Home_improvement",
            3: "Business",
            4: "Education",
            5: "Travel",
            6: "Vehicle",
            7: "Other",
            8: "Health",
            101: "Working_capital_financing",
            102: "Purchase_of_machinery_equipment",
            103: "Renovation_of_real_estate",
            104: "Accounts_receivable_financing ",
            105: "Acquisition_of_means_of_transport",
            106: "Construction_finance",
            107: "Acquisition_of_stocks",
            108: "Acquisition_of_real_estate",
            109: "Guaranteeing_obligation ",
            110: "Other_business",
        },
        "Education": {
            1: "Primary",
            2: "Basic",
            3: "Vocational",
            4: "Secondary",
            5: "Higher",
        },
        "MaritalStatus": {
            1: "Married",
            2: "Cohabitant",
            3: "Single",
            4: "Divorced",
            5: "Widow",
        },
        "EmploymentStatus": {
            1: "Unemployed",
            2: "Partially",
            3: "Fully",
            4: "Self_employed",
            5: "Entrepreneur",
            6: "Retiree",
        },
        "NewCreditCustomer": {
            0: "Existing_credit_customer",
            1: "New_credit_Customer",
        },
        "OccupationArea": {
            1: "Other",
            2: "Mining",
            3: "Processing",
            4: "Energy",
            5: "Utilities",
            6: "Construction",
            7: "Retail_and_wholesale",
            8: "Transport_and_warehousing",
            9: "Hospitality_and_catering",
            10: "Info_and_telecom",
            11: "Finance_and_insurance",
            12: "Real_estate",
            13: "Research",
            14: "Administrative",
            15: "Civil_service_and_military",
            16: "Education",
            17: "Healthcare_and_social_help",
            18: "Art_and_entertainment",
            19: "Agriculture_forestry_and_fishing",
        },
        "HomeOwnershipType": {
            0: "Homeless",
            1: "Owner",
            2: "Living_with_parents",
            3: "Tenant_pre_furnished_property",
            4: "Tenant_unfurnished_property",
            5: "Council_house",
            6: "Joint_tenant",
            7: "Joint_ownership",
            8: "Mortgage",
            9: "Owner_with_encumbrance",
            10: "Other",
        },
        "NrOfDependants": {
            "10Plus": 11,
        },
        "Gender": {
            0: "Male",
            1: "Female",
            2: "Unknown",
        },
    }
    return cleaned.replace(code_to_label)