Example #1
 def test_replace_str_to_str_chain(self):
     a = np.arange(1, 5)
     astr = a.astype(str)
     bstr = np.arange(2, 6).astype(str)
     df = DataFrame({'a': astr})
     with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"):
         df.replace({'a': dict(zip(astr, bstr))})
Example #2
    def test_nunique(self):
        df = DataFrame({
            'A': list('abbacc'),
            'B': list('abxacc'),
            'C': list('abbacx'),
        })

        expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
        result = df.groupby('A', as_index=False).nunique()
        tm.assert_frame_equal(result, expected)

        # as_index
        expected.index = list('abc')
        expected.index.name = 'A'
        result = df.groupby('A').nunique()
        tm.assert_frame_equal(result, expected)

        # with na
        result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
        tm.assert_frame_equal(result, expected)

        # dropna
        expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                             index=list('abc'))
        expected.index.name = 'A'
        result = df.replace({'x': None}).groupby('A').nunique()
        tm.assert_frame_equal(result, expected)
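Example #3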
def getphysiologicalinfo(pos, atlas="Talairach", r=5, nearest="on", rm_unfound=False):
    # load atlas :
    hdr, mask, gray, brodtxt, brodidx, label = loadatlas(atlas=atlas)

    # Get info of each electrode :
    nbchan = pos.shape[0]
    hemi, lobe, gyrus, matter, brod = [], [], [], [], []
    for k in range(0, nbchan):
        hemiC, lobeC, gyrusC, matterC, brodC = physiochannel(
            list(pos[k, :]), mask, hdr, gray, label, r=r, nearest=nearest
        )
        hemi.extend(hemiC)
        lobe.extend(lobeC)
        gyrus.extend(gyrusC)
        matter.extend(matterC)
        brod.extend(brodC)

    # Put everything in a pandas structure :
    phyinf = DataFrame({"Hemisphere": hemi, "Lobe": lobe, "Gyrus": gyrus,
                        "Matter": matter, "Brodmann": list(brod)})

    # Replace corrupted values :
    phyinf.replace(to_replace="*", value="Not found", inplace=True)
    phyinf["Brodmann"].replace(to_replace="Brodmann area ", value="", inplace=True, regex=True)
    phyinf["Gyrus"].replace(to_replace=" Gyrus", value="", inplace=True, regex=True)
    phyinf["Lobe"].replace(to_replace=" Lobe", value="", inplace=True, regex=True)
    phyinf["Matter"].replace(to_replace=" Matter", value="", inplace=True, regex=True)

    # Convert Brodmann to number :
    BrodStr = list(phyinf["Brodmann"])
    for k in range(len(BrodStr)):
        try:
            BrodStr[k] = int(BrodStr[k])
        except ValueError:
            pass  # keep the original label when it is not a number
    phyinf["Brodmann"] = BrodStr

    return phyinf
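
A minimal usage sketch for the function above, assuming loadatlas and physiochannel are importable from the same module; the electrode coordinates below are illustrative:

    import numpy as np

    # two hypothetical electrodes: (n_channels x 3) Talairach coordinates
    xyz = np.array([[-43.0, -20.0, 52.0],
                    [38.0, 44.0, 10.0]])
    info = getphysiologicalinfo(xyz, atlas="Talairach", r=5)
    print(info[["Hemisphere", "Lobe", "Gyrus", "Brodmann"]])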
Example #4
    def test_replace_with_empty_dictlike(self, mix_abc):
        # GH 15289
        df = DataFrame(mix_abc)
        assert_frame_equal(df, df.replace({}))
        assert_frame_equal(df, df.replace(Series([])))

        assert_frame_equal(df, df.replace({'b': {}}))
        assert_frame_equal(df, df.replace(Series({'b': {}})))
Example #5
    def test_replace_with_empty_dictlike(self):
        # GH 15289
        mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
        df = DataFrame(mix)
        assert_frame_equal(df, df.replace({}))
        assert_frame_equal(df, df.replace(Series([])))

        assert_frame_equal(df, df.replace({'b': {}}))
        assert_frame_equal(df, df.replace(Series({'b': {}})))
Example #6
    def test_replace_simple_nested_dict_with_nonexistent_value(self):
        df = DataFrame({'col': range(1, 5)})
        expected = DataFrame({'col': ['a', 2, 3, 'b']})

        result = df.replace({-1: '-', 1: 'a', 4: 'b'})
        assert_frame_equal(expected, result)

        result = df.replace({'col': {-1: '-', 1: 'a', 4: 'b'}})
        assert_frame_equal(expected, result)
Example #7
    def test_regex_replace_dict_mixed(self):
        mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
        dfmix = DataFrame(mix)

        # dicts
        # single dict {re1: v1}, search the whole frame
        # need test for this...

        # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole
        # frame
        res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True)
        res2 = dfmix.copy()
        res2.replace({'b': r'\s*\.\s*'}, {'b': np.nan},
                     inplace=True, regex=True)
        expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', np.nan, np.nan], 'c':
                           mix['c']})
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the
        # whole frame
        res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True)
        res2 = dfmix.copy()
        res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True,
                     regex=True)
        expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c':
                           mix['c']})
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'})
        res2 = dfmix.copy()
        res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'},
                     inplace=True)
        expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c':
                           mix['c']})
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        # scalar -> dict
        # to_replace regex, {value: value}
        expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'], 'c':
                           mix['c']})
        res = dfmix.replace('a', {'b': np.nan}, regex=True)
        res2 = dfmix.copy()
        res2.replace('a', {'b': np.nan}, regex=True, inplace=True)
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        res = dfmix.replace('a', {'b': np.nan}, regex=True)
        res2 = dfmix.copy()
        res2.replace(regex='a', value={'b': np.nan}, inplace=True)
        expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'], 'c':
                           mix['c']})
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)
Example #8
    def test_replace_simple_nested_dict(self):
        df = DataFrame({'col': range(1, 5)})
        expected = DataFrame({'col': ['a', 2, 3, 'b']})

        result = df.replace({'col': {1: 'a', 4: 'b'}})
        assert_frame_equal(expected, result)

        # in this case, should be the same as the not nested version
        result = df.replace({1: 'a', 4: 'b'})
        assert_frame_equal(expected, result)
Example #9
    def test_replace_dict_tuple_list_ordering_remains_the_same(self):
        df = DataFrame(dict(A=[nan, 1]))
        res1 = df.replace(to_replace={nan: 0, 1: -1e8})
        res2 = df.replace(to_replace=(1, nan), value=[-1e8, 0])
        res3 = df.replace(to_replace=[1, nan], value=[-1e8, 0])

        expected = DataFrame({'A': [0, -1e8]})
        assert_frame_equal(res1, res2)
        assert_frame_equal(res2, res3)
        assert_frame_equal(res3, expected)
Example #10
    def test_replace_datetimetz(self):

        # GH 11326
        # behaving poorly when presented with a datetime64[ns, tz]
        df = DataFrame({'A': date_range('20130101', periods=3,
                                        tz='US/Eastern'),
                        'B': [0, np.nan, 2]})
        result = df.replace(np.nan, 1)
        expected = DataFrame({'A': date_range('20130101', periods=3,
                                              tz='US/Eastern'),
                              'B': Series([0, 1, 2], dtype='float64')})
        assert_frame_equal(result, expected)

        result = df.fillna(1)
        assert_frame_equal(result, expected)

        result = df.replace(0, np.nan)
        expected = DataFrame({'A': date_range('20130101', periods=3,
                                              tz='US/Eastern'),
                              'B': [np.nan, np.nan, 2]})
        assert_frame_equal(result, expected)

        result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                            Timestamp('20130104', tz='US/Eastern'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104', tz='US/Eastern'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace(
            {'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern'))
        assert_frame_equal(result, expected)

        # coerce to object
        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace(
            {'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104', tz='US/Pacific'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': np.nan}, Timestamp('20130104'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)
Example #11
    def test_replace_input_formats(self):
        # both dicts
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(to_rep, values)
        expected = {}
        for k, v in compat.iteritems(df):
            expected[k] = v.replace(to_rep[k], values[k])
        assert_frame_equal(filled, DataFrame(expected))

        result = df.replace([0, 2, 5], [5, 2, 0])
        expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0],
                              'C': ['', 'asdf', 'fd']})
        assert_frame_equal(result, expected)

        # dict to scalar
        filled = df.replace(to_rep, 0)
        expected = {}
        for k, v in compat.iteritems(df):
            expected[k] = v.replace(to_rep[k], 0)
        assert_frame_equal(filled, DataFrame(expected))

        self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

        # scalar to dict
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(np.nan, values)
        expected = {}
        for k, v in compat.iteritems(df):
            expected[k] = v.replace(np.nan, values[k])
        assert_frame_equal(filled, DataFrame(expected))

        # list to list
        to_rep = [np.nan, 0, '']
        values = [-2, -1, 'missing']
        result = df.replace(to_rep, values)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], values[i], inplace=True)
        assert_frame_equal(result, expected)

        self.assertRaises(ValueError, df.replace, to_rep, values[1:])

        # list to scalar
        to_rep = [np.nan, 0, '']
        result = df.replace(to_rep, -1)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], -1, inplace=True)
        assert_frame_equal(result, expected)
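Example #12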
 def get_dataframe(self, data):
     dataframe = DataFrame(data)
     index = self.get_index(dataframe)
     if index:
         if self.index_none_value is not None:
             for key in index:
                 dataframe.replace({key: None}, {key: self.index_none_value}, inplace=True)
         dataframe.set_index(index, inplace=True)
     else:
         # Name auto-index column to ensure valid CSV output
         dataframe.index.name = "row"
     return dataframe
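
For context, a standalone sketch of the replace-then-set_index idiom this method relies on; the column name and the "<none>" sentinel are assumptions:

    from pandas import DataFrame

    df = DataFrame({"key": ["a", None, "c"], "val": [1, 2, 3]})
    # map None index values to a sentinel before indexing on the column
    df.replace({"key": None}, {"key": "<none>"}, inplace=True)
    df.set_index("key", inplace=True)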
Example #13
    def test_replace_input_formats_listlike(self):
        # both dicts
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(to_rep, values)
        expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()}
        assert_frame_equal(filled, DataFrame(expected))

        result = df.replace([0, 2, 5], [5, 2, 0])
        expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0],
                              'C': ['', 'asdf', 'fd']})
        assert_frame_equal(result, expected)

        # scalar to dict
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(np.nan, values)
        expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()}
        assert_frame_equal(filled, DataFrame(expected))

        # list to list
        to_rep = [np.nan, 0, '']
        values = [-2, -1, 'missing']
        result = df.replace(to_rep, values)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], values[i], inplace=True)
        assert_frame_equal(result, expected)

        msg = r"Replacement lists must match in length\. Expecting 3 got 2"
        with pytest.raises(ValueError, match=msg):
            df.replace(to_rep, values[1:])
Example #14
    def historicalRequest(self, securities, fields, startDate, endDate, **kwargs):
        """ Equivalent to the Excel BDH Function.
        
            If securities are provided as a list, the returned DataFrame will
            have a MultiIndex.
        """
        defaults = {'startDate'       : startDate,
            'endDate'                 : endDate,
            'periodicityAdjustment'   : 'ACTUAL',
            'periodicitySelection'    : 'DAILY',
            'nonTradingDayFillOption' : 'ACTIVE_DAYS_ONLY',
            'adjustmentNormal'        : False,
            'adjustmentAbnormal'      : False,
            'adjustmentSplit'         : True,
            'adjustmentFollowDPDF'    : False}   
        defaults.update(kwargs)

        response = self.sendRequest('HistoricalData', securities, fields, defaults)
        
        data = []
        keys = []
        
        for msg in response:
            securityData = msg.getElement('securityData')
            fieldData = securityData.getElement('fieldData')
            fieldDataList = [fieldData.getValueAsElement(i) for i in range(fieldData.numValues())]
            
            df = DataFrame()
            
            for fld in fieldDataList:
                for v in [fld.getElement(i) for i in range(fld.numElements()) if fld.getElement(i).name() != 'date']:
                    df.loc[fld.getElementAsDatetime('date'), str(v.name())] = v.getValue()

            df.index = pd.to_datetime(df.index)
            df.replace('#N/A History', np.nan, inplace=True)
            
            keys.append(securityData.getElementAsString('security'))
            data.append(df)
        
        if len(data) == 0:
            return DataFrame()
        if isinstance(securities, str):
            data = pd.concat(data, axis=1)
            data.columns.name = 'Field'
        else:
            data = pd.concat(data, keys=keys, axis=1, names=['Security','Field'])
            
        data.index.name = 'Date'
        return data
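
A hedged usage sketch: 'blp' stands for an assumed instance of the wrapper class defining historicalRequest, and the tickers, field, and dates are placeholders:

    prices = blp.historicalRequest(['IBM US Equity', 'MSFT US Equity'],
                                   'PX_LAST', '20200101', '20201231',
                                   periodicitySelection='MONTHLY')
    # with a list of securities the columns form a (Security, Field) MultiIndex
    print(prices.columns.names)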
Example #15
 def test_regex_replace_dict_nested(self):
     # nested dicts will not work until this is implemented for Series
     mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
     dfmix = DataFrame(mix)
     res = dfmix.replace({'b': {r'\s*\.\s*': nan}}, regex=True)
     res2 = dfmix.copy()
     res4 = dfmix.copy()
     res2.replace({'b': {r'\s*\.\s*': nan}}, inplace=True, regex=True)
     res3 = dfmix.replace(regex={'b': {r'\s*\.\s*': nan}})
     res4.replace(regex={'b': {r'\s*\.\s*': nan}}, inplace=True)
     expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', nan, nan], 'c':
                        mix['c']})
     assert_frame_equal(res, expec)
     assert_frame_equal(res2, expec)
     assert_frame_equal(res3, expec)
     assert_frame_equal(res4, expec)
Example #16
 def test_regex_replace_numeric_to_object_conversion(self):
     mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
     df = DataFrame(mix)
     expec = DataFrame({'a': ['a', 1, 2, 3], 'b': mix['b'], 'c': mix['c']})
     res = df.replace(0, 'a')
     assert_frame_equal(res, expec)
     self.assertEqual(res.a.dtype, np.object_)
Example #17
    def test_fillna_dtype_conversion(self):
        # make sure that fillna on an empty frame works
        df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        result = df.get_dtype_counts().sort_values()
        expected = Series({'object': 5})
        assert_series_equal(result, expected)

        result = df.fillna(1)
        expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        result = result.get_dtype_counts().sort_values()
        expected = Series({'int64': 5})
        assert_series_equal(result, expected)

        # empty block
        df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
        result = df.fillna('nan')
        expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
        assert_frame_equal(result, expected)

        # equiv of replace
        df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
        for v in ['', 1, np.nan, 1.0]:
            expected = df.replace(np.nan, v)
            result = df.fillna(v)
            assert_frame_equal(result, expected)
Example #18
 def test_regex_replace_dict_nested(self, mix_abc):
     # nested dicts will not work until this is implemented for Series
     dfmix = DataFrame(mix_abc)
     res = dfmix.replace({'b': {r'\s*\.\s*': np.nan}}, regex=True)
     res2 = dfmix.copy()
     res4 = dfmix.copy()
     res2.replace({'b': {r'\s*\.\s*': np.nan}}, inplace=True, regex=True)
     res3 = dfmix.replace(regex={'b': {r'\s*\.\s*': np.nan}})
     res4.replace(regex={'b': {r'\s*\.\s*': np.nan}}, inplace=True)
     expec = DataFrame({'a': mix_abc['a'],
                        'b': ['a', 'b', np.nan, np.nan],
                        'c': mix_abc['c'],
                        })
     assert_frame_equal(res, expec)
     assert_frame_equal(res2, expec)
     assert_frame_equal(res3, expec)
     assert_frame_equal(res4, expec)
Example #19
 def test_replace_convert(self):
     # gh 3907
     df = DataFrame([['foo', 'bar', 'bah'], ['bar', 'foo', 'bah']])
     m = {'foo': 1, 'bar': 2, 'bah': 3}
     rep = df.replace(m)
     expec = Series([np.int64] * 3)
     res = rep.dtypes
     assert_series_equal(expec, res)
Example #20
    def test_replace_regex_metachar(self):
        metachars = '[]', '()', r'\d', r'\w', r'\s'

        for metachar in metachars:
            df = DataFrame({'a': [metachar, 'else']})
            result = df.replace({'a': {metachar: 'paren'}})
            expected = DataFrame({'a': ['paren', 'else']})
            assert_frame_equal(result, expected)
Example #21
    def fillna_dict(cls, prop):
        """
        Use trade history then fill empty with value row above
        """
        df = DataFrame(prop)
        df = df.replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
        df = df.fillna(method='ffill')

        return [r.to_dict() for k, r in df.iterrows()]
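
The core idiom above in isolation: blank out placeholder strings, then forward-fill each gap from the row above (the frame is illustrative):

    import numpy
    from pandas import DataFrame

    df = DataFrame({'trade': ['BUY', '', 'SELL'],
                    'ref': ['DEBIT', 'r2', 'CREDIT']})
    df = df.replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
    df = df.fillna(method='ffill')  # 'BUY' fills the blank cell below it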
Example #22
    def test_replace_method(self, to_replace, method, expected):
        # GH 19632
        df = DataFrame({'A': [0, 1, 2],
                        'B': [5, np.nan, 7],
                        'C': ['a', 'b', 'c']})

        result = df.replace(to_replace=to_replace, value=None, method=method)
        expected = DataFrame(expected)
        assert_frame_equal(result, expected)
Example #23
 def test_regex_replace_numeric_to_object_conversion(self, mix_abc):
     df = DataFrame(mix_abc)
     expec = DataFrame({'a': ['a', 1, 2, 3],
                        'b': mix_abc['b'],
                        'c': mix_abc['c'],
                        })
     res = df.replace(0, 'a')
     assert_frame_equal(res, expec)
     assert res.a.dtype == np.object_
Example #24
    def test_regex_replace_list_obj(self):
        obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')}
        dfobj = DataFrame(obj)

        # lists of regexes and values
        # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
        to_replace_res = [r'\s*\.\s*', r'e|f|g']
        values = [np.nan, 'crap']
        res = dfobj.replace(to_replace_res, values, regex=True)
        expec = DataFrame({'a': ['a', 'b', np.nan, np.nan],
                           'b': ['crap'] * 3 + ['h'],
                           'c': ['h', 'crap', 'l', 'o'],
                           })
        assert_frame_equal(res, expec)

        # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
        to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)']
        values = [r'\1\1', r'\1_crap']
        res = dfobj.replace(to_replace_res, values, regex=True)
        expec = DataFrame({'a': ['a', 'b', '..', '..'],
                           'b': ['e_crap', 'f_crap', 'g_crap', 'h'],
                           'c': ['h', 'e_crap', 'l', 'o'],
                           })
        assert_frame_equal(res, expec)

        # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
        # or vN)]
        to_replace_res = [r'\s*(\.)\s*', r'e']
        values = [r'\1\1', r'crap']
        res = dfobj.replace(to_replace_res, values, regex=True)
        expec = DataFrame({'a': ['a', 'b', '..', '..'],
                           'b': ['crap', 'f', 'g', 'h'],
                           'c': ['h', 'crap', 'l', 'o'],
                           })
        assert_frame_equal(res, expec)

        to_replace_res = [r'\s*(\.)\s*', r'e']
        values = [r'\1\1', r'crap']
        res = dfobj.replace(value=values, regex=to_replace_res)
        expec = DataFrame({'a': ['a', 'b', '..', '..'],
                           'b': ['crap', 'f', 'g', 'h'],
                           'c': ['h', 'crap', 'l', 'o'],
                           })
        assert_frame_equal(res, expec)
Example #25
    def test_replace_mixed(self):
        mf = self.mixed_frame
        mf.iloc[5:20, mf.columns.get_loc('foo')] = nan
        mf.iloc[-10:, mf.columns.get_loc('A')] = nan

        result = self.mixed_frame.replace(np.nan, -18)
        expected = self.mixed_frame.fillna(value=-18)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-18, nan), self.mixed_frame)

        result = self.mixed_frame.replace(np.nan, -1e8)
        expected = self.mixed_frame.fillna(value=-1e8)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

        # int block upcasting
        df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                        'B': Series([0, 1], dtype='int64')})
        expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                              'B': Series([0.5, 1], dtype='float64')})
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        df.replace(0, 0.5, inplace=True)
        assert_frame_equal(df, expected)

        # int block splitting
        df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                        'B': Series([0, 1], dtype='int64'),
                        'C': Series([1, 2], dtype='int64')})
        expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                              'B': Series([0.5, 1], dtype='float64'),
                              'C': Series([1, 2], dtype='int64')})
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        # to object block upcasting
        df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                        'B': Series([0, 1], dtype='int64')})
        expected = DataFrame({'A': Series([1, 'foo'], dtype='object'),
                              'B': Series([0, 1], dtype='int64')})
        result = df.replace(2, 'foo')
        assert_frame_equal(result, expected)

        expected = DataFrame({'A': Series(['foo', 'bar'], dtype='object'),
                              'B': Series([0, 'foo'], dtype='object')})
        result = df.replace([1, 2], ['foo', 'bar'])
        assert_frame_equal(result, expected)

        # test case from
        df = DataFrame({'A': Series([3, 0], dtype='int64'),
                        'B': Series([0, 3], dtype='int64')})
        result = df.replace(3, df.mean().to_dict())
        expected = df.copy().astype('float64')
        m = df.mean()
        expected.iloc[0, 0] = m[0]
        expected.iloc[1, 1] = m[1]
        assert_frame_equal(result, expected)
Example #26
    def test_regex_replace_list_mixed(self, mix_ab):
        # mixed frame to make sure this doesn't break things
        dfmix = DataFrame(mix_ab)

        # lists of regexes and values
        # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
        to_replace_res = [r'\s*\.\s*', r'a']
        values = [np.nan, 'crap']
        mix2 = {'a': list(range(4)), 'b': list('ab..'), 'c': list('halo')}
        dfmix2 = DataFrame(mix2)
        res = dfmix2.replace(to_replace_res, values, regex=True)
        expec = DataFrame({'a': mix2['a'],
                           'b': ['crap', 'b', np.nan, np.nan],
                           'c': ['h', 'crap', 'l', 'o'],
                           })
        assert_frame_equal(res, expec)

        # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
        to_replace_res = [r'\s*(\.)\s*', r'(a|b)']
        values = [r'\1\1', r'\1_crap']
        res = dfmix.replace(to_replace_res, values, regex=True)
        expec = DataFrame({'a': mix_ab['a'],
                           'b': ['a_crap', 'b_crap', '..', '..'],
                           })
        assert_frame_equal(res, expec)

        # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
        # or vN)]
        to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)']
        values = [r'\1\1', r'crap', r'\1_crap']
        res = dfmix.replace(to_replace_res, values, regex=True)
        expec = DataFrame({'a': mix_ab['a'],
                           'b': ['crap', 'b_crap', '..', '..'],
                           })
        assert_frame_equal(res, expec)

        to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)']
        values = [r'\1\1', r'crap', r'\1_crap']
        res = dfmix.replace(regex=to_replace_res, value=values)
        expec = DataFrame({'a': mix_ab['a'],
                           'b': ['crap', 'b_crap', '..', '..'],
                           })
        assert_frame_equal(res, expec)
Example #27
    def test_replace_input_formats_scalar(self):
        df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})

        # dict to scalar
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        filled = df.replace(to_rep, 0)
        expected = {k: v.replace(to_rep[k], 0)
                    for k, v in compat.iteritems(df)}
        assert_frame_equal(filled, DataFrame(expected))

        pytest.raises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

        # list to scalar
        to_rep = [np.nan, 0, '']
        result = df.replace(to_rep, -1)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], -1, inplace=True)
        assert_frame_equal(result, expected)
Example #28
    def test_replace_input_formats_scalar(self):
        df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})

        # dict to scalar
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        filled = df.replace(to_rep, 0)
        expected = {k: v.replace(to_rep[k], 0) for k, v in df.items()}
        assert_frame_equal(filled, DataFrame(expected))

        msg = "value argument must be scalar, dict, or Series"
        with pytest.raises(TypeError, match=msg):
            df.replace(to_rep, [np.nan, 0, ''])

        # list to scalar
        to_rep = [np.nan, 0, '']
        result = df.replace(to_rep, -1)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], -1, inplace=True)
        assert_frame_equal(result, expected)
Example #29
    def test_replace_series_dict(self):
        # from GH 3064
        df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
        result = df.replace(0, {'zero': 0.5, 'one': 1.0})
        expected = DataFrame(
            {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 2.0, 'b': 1.0}})
        assert_frame_equal(result, expected)

        result = df.replace(0, df.mean())
        assert_frame_equal(result, expected)

        # series to series/dict
        df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
        s = Series({'zero': 0.0, 'one': 2.0})
        result = df.replace(s, {'zero': 0.5, 'one': 1.0})
        expected = DataFrame(
            {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 1.0, 'b': 0.0}})
        assert_frame_equal(result, expected)

        result = df.replace(s, df.mean())
        assert_frame_equal(result, expected)
Example #30
    def test_replace_dtypes(self):
        # int
        df = DataFrame({'ints': [1, 2, 3]})
        result = df.replace(1, 0)
        expected = DataFrame({'ints': [0, 2, 3]})
        assert_frame_equal(result, expected)

        df = DataFrame({'ints': [1, 2, 3]}, dtype=np.int32)
        result = df.replace(1, 0)
        expected = DataFrame({'ints': [0, 2, 3]}, dtype=np.int32)
        assert_frame_equal(result, expected)

        df = DataFrame({'ints': [1, 2, 3]}, dtype=np.int16)
        result = df.replace(1, 0)
        expected = DataFrame({'ints': [0, 2, 3]}, dtype=np.int16)
        assert_frame_equal(result, expected)

        # bools
        df = DataFrame({'bools': [True, False, True]})
        result = df.replace(False, True)
        self.assertTrue(result.values.all())

        # complex blocks
        df = DataFrame({'complex': [1j, 2j, 3j]})
        result = df.replace(1j, 0j)
        expected = DataFrame({'complex': [0j, 2j, 3j]})
        assert_frame_equal(result, expected)

        # datetime blocks
        prev = datetime.today()
        now = datetime.today()
        df = DataFrame({'datetime64': Index([prev, now, prev])})
        result = df.replace(prev, now)
        expected = DataFrame({'datetime64': Index([now] * 3)})
        assert_frame_equal(result, expected)
Example #31
    def test_regex_replace_dict_mixed(self):
        mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
        dfmix = DataFrame(mix)

        # dicts
        # single dict {re1: v1}, search the whole frame
        # need test for this...

        # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole
        # frame
        res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': nan}, regex=True)
        res2 = dfmix.copy()
        res2.replace({'b': r'\s*\.\s*'}, {'b': nan}, inplace=True, regex=True)
        expec = DataFrame({
            'a': mix['a'],
            'b': ['a', 'b', nan, nan],
            'c': mix['c']
        })
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the
        # whole frame
        res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True)
        res2 = dfmix.copy()
        res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'},
                     inplace=True,
                     regex=True)
        expec = DataFrame({
            'a': mix['a'],
            'b': ['a', 'b', '.ty', '.ty'],
            'c': mix['c']
        })
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'})
        res2 = dfmix.copy()
        res2.replace(regex={'b': r'\s*(\.)\s*'},
                     value={'b': r'\1ty'},
                     inplace=True)
        expec = DataFrame({
            'a': mix['a'],
            'b': ['a', 'b', '.ty', '.ty'],
            'c': mix['c']
        })
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        # scalar -> dict
        # to_replace regex, {value: value}
        expec = DataFrame({
            'a': mix['a'],
            'b': [nan, 'b', '.', '.'],
            'c': mix['c']
        })
        res = dfmix.replace('a', {'b': nan}, regex=True)
        res2 = dfmix.copy()
        res2.replace('a', {'b': nan}, regex=True, inplace=True)
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        res = dfmix.replace('a', {'b': nan}, regex=True)
        res2 = dfmix.copy()
        res2.replace(regex='a', value={'b': nan}, inplace=True)
        expec = DataFrame({
            'a': mix['a'],
            'b': [nan, 'b', '.', '.'],
            'c': mix['c']
        })
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)
Example #32
def _remove_punctuation(dataframe: pd.DataFrame):
    dataframe.replace(',', '', regex=True, inplace=True)
    dataframe.replace(r'\-$', '0', regex=True, inplace=True)
    dataframe.replace('^$', '0', regex=True, inplace=True)  # empty cells -> '0'

    return dataframe
Example #33
    def test_regex_replace_scalar(self):
        obj = {'a': list('ab..'), 'b': list('efgh')}
        dfobj = DataFrame(obj)
        mix = {'a': lrange(4), 'b': list('ab..')}
        dfmix = DataFrame(mix)

        # simplest cases
        # regex -> value
        # obj frame
        res = dfobj.replace(r'\s*\.\s*', nan, regex=True)
        assert_frame_equal(dfobj, res.fillna('.'))

        # mixed
        res = dfmix.replace(r'\s*\.\s*', nan, regex=True)
        assert_frame_equal(dfmix, res.fillna('.'))

        # regex -> regex
        # obj frame
        res = dfobj.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True)
        objc = obj.copy()
        objc['a'] = ['a', 'b', '...', '...']
        expec = DataFrame(objc)
        assert_frame_equal(res, expec)

        # with mixed
        res = dfmix.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True)
        mixc = mix.copy()
        mixc['b'] = ['a', 'b', '...', '...']
        expec = DataFrame(mixc)
        assert_frame_equal(res, expec)

        # everything with compiled regexs as well
        res = dfobj.replace(re.compile(r'\s*\.\s*'), nan, regex=True)
        assert_frame_equal(dfobj, res.fillna('.'))

        # mixed
        res = dfmix.replace(re.compile(r'\s*\.\s*'), nan, regex=True)
        assert_frame_equal(dfmix, res.fillna('.'))

        # regex -> regex
        # obj frame
        res = dfobj.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1')
        objc = obj.copy()
        objc['a'] = ['a', 'b', '...', '...']
        expec = DataFrame(objc)
        assert_frame_equal(res, expec)

        # with mixed
        res = dfmix.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1')
        mixc = mix.copy()
        mixc['b'] = ['a', 'b', '...', '...']
        expec = DataFrame(mixc)
        assert_frame_equal(res, expec)

        res = dfmix.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1')
        mixc = mix.copy()
        mixc['b'] = ['a', 'b', '...', '...']
        expec = DataFrame(mixc)
        assert_frame_equal(res, expec)

        res = dfmix.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1')
        mixc = mix.copy()
        mixc['b'] = ['a', 'b', '...', '...']
        expec = DataFrame(mixc)
        assert_frame_equal(res, expec)
Example #34
 def test_replace_bool_with_string(self):
     df = DataFrame({'a': [True, False], 'b': list('ab')})
     result = df.replace(True, 'a')
     expected = DataFrame({'a': ['a', False], 'b': df.b})
     assert_frame_equal(result, expected)
Example #35
def getData(link):

    url = 'http://pm.kksk.org' + link

    index_headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    flag = True
    try_times = 0
    while flag:
        try:
            proxies = getProxy()
            html = requests.get(url,
                                headers=index_headers,
                                proxies=proxies,
                                verify=False,
                                timeout=(120, 180))  # getHtml
            time.sleep(
                random.uniform(0.5, 1)
            )  #Random time sleep in order to prevent from getting IP banned
            html.encoding = 'gbk'  #this web is encoded in gbk
            soup = BeautifulSoup(html.text, "html.parser")

            updatetimefetch = soup.find(
                'div',
                class_='panel panel-primary hidden-xs')  # updatetime fetch
            timefetch = updatetimefetch.find('h3')
            timestring = timefetch.string
            timenum = re.sub(r'\D', '', timestring)  # pure number time
            date = timenum[0:8]
            hour = timenum[8:10]

            citynamediv = soup.find('div',
                                    class_='panel-title row')  #get city name
            citynamefetch = citynamediv.find('h1')
            cityname = citynamefetch.text
            cityname = re.sub("[^a-zA-Z0-9\u4e00-\u9fa5]", '', cityname)
            cityname = cityname[:2]

            tablediv = soup.find('div', class_='panel panel-primary hidden-xs')

            datalist = []

            for tr in tablediv.findAll('tr'):
                for td in tr.findAll(['td']):
                    datalist.append(td.get_text())

            table = chunks(datalist, 11)

            df = DataFrame(table)  #transfer from list to dataframe

            df.drop(index=[0],
                    inplace=True)  #drop the first row which is average

            df.replace('', 'NaN', inplace=True)  # replace empty value as NaN
            df.replace(' - ', 'NaN', inplace=True)

            df.columns = [
                'WatchPoint', 'AQI', 'Level', 'PrimaryPollution', 'PM2.5',
                'PM10', 'CO', 'NO2', 'O3_1h', 'O3_8h', 'SO2'
            ]  #change head

            col_name = df.columns.tolist()  # add time to the dataframe head
            col_name.insert(0, 'Date')
            col_name.insert(1, 'Hour')
            df = df.reindex(columns=col_name)  # reindex the dataframe head

            dateList = [date] * df.iloc[:, 0].size  # create time list
            hourList = [hour] * df.iloc[:, 0].size

            df['Date'] = dateList  # add time to the dataframe
            df['Hour'] = hourList

            csvName = 'data/' + cityname + '.csv'  # file name

            if os.path.exists(csvName):
                # append without the table head if the file already exists
                df.to_csv(csvName, index=False, mode='a+', header=False)
            else:
                df.to_csv(csvName, index=False, mode='a+')
            print('Get data of ' + cityname)
            flag = False
            #return df
        except BaseException:
            try_times += 1
            if try_times > 20:  # max number of retries with new proxy IPs is 20
                print('Proxy IP failed')
                return -1
            else:
                flag = True
Example #36
 def test_replace_truthy(self):
     df = DataFrame({'a': [True, True]})
     r = df.replace([np.inf, -np.inf], np.nan)
     e = df
     assert_frame_equal(r, e)
Example #37
 def test_replace_bool_with_bool(self):
     df = DataFrame(np.random.rand(2, 2) > 0.5)
     result = df.replace(False, True)
     expected = DataFrame(np.ones((2, 2), dtype=bool))
     assert_frame_equal(result, expected)
Example #38
 def pipe_rename_vaccines(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.replace(self.vaccine_mapping)
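
A self-contained sketch of the dict-based rename this pipe performs; the mapping and data are assumptions:

    import pandas as pd

    vaccine_mapping = {'Pfizer': 'Pfizer/BioNTech',
                       'Oxford': 'Oxford/AstraZeneca'}
    df = pd.DataFrame({'vaccine': ['Pfizer', 'Oxford', 'Moderna']})
    df = df.replace(vaccine_mapping)  # unmapped values pass through unchanged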
Example #39
def build_dataset(job_id: str,
                  data_frame: pd.DataFrame,
                  source: Optional[str] = const.DEFAULT_SOURCE) -> List[dict]:
    """
    Build a dataset from the dataframe.

    :param job_id: The dataset id where data should be uploaded.
    :type job_id: str
    :param data_frame: The dataframe to upload.
    :type data_frame: pd.DataFrame
    :param source: The source of the data_frame.
    :type source: Optional[str]
    :return: The dataset rows ready for upload.
    :rtype: List[dict]
    """
    dataset = []
    logger.debug(f"Pushing {len(data_frame)} items to {job_id=}")
    data_frame.fillna(np.nan, inplace=True)
    data_frame.replace([np.nan], [None], inplace=True)
    for _, row in tqdm(
            data_frame.iterrows(),
            total=len(data_frame),
            desc="Building a dataset for uploading safely.",
    ):
        dedupe_id = "_".join(
            [row[const.CONVERSATION_UUID], row[const.CALL_UUID]])
        errors = []
        if const.RAW in data_frame.columns:
            data = json.loads(row[const.RAW])
        else:
            data = row.to_dict()

        alternatives = data.get(const.UTTERANCES, [])
        try:
            alternatives = json.loads(alternatives) if isinstance(
                alternatives, str) else alternatives
        except json.JSONDecodeError:
            raise ValueError(
                f"{alternatives} of type {type(alternatives)} is not a valid json string."
            )

        data_point = {
            const.PRIORITY: 1,
            const.DATA_SOURCE: source,
            const.DATA_ID: dedupe_id,
            const.DATA: {
                **data,
                const.CALL_UUID: str(row[const.CALL_UUID]),
                const.CONVERSATION_UUID: str(row[const.CONVERSATION_UUID]),
                const.ALTERNATIVES: alternatives,
            },
            const.IS_GOLD: False,
        }
        try:
            jsonschema.validate(data_point[const.DATA],
                                const.UPLOAD_DATASET_SCHEMA)
            dataset.append(data_point)
        except jsonschema.exceptions.ValidationError as e:
            errors.append(e)
            if len(errors) > len(data_frame) * 0.5:
                raise RuntimeError(f"Too many errors: {errors}")
    return dataset
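Example #40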
def replace_qm(df: pd.DataFrame) -> pd.DataFrame:
    df.replace({"?": None, " ?": None}, inplace=True)
    return df
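
An illustrative call for the helper above; the frame mimics the '?' placeholders it targets:

    import pandas as pd

    df = pd.DataFrame({'workclass': ['Private', '?', ' ?']})
    df = replace_qm(df)  # both '?' variants become missing values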
Example #41
0
  wine=[row[4] for row in reader]
  wine_result=json.dumps(wine)
  id=[row[0] for row in reader]'''
# read the file with pandas
reader = pd.read_csv('ttt.csv', header=None)
print(reader)
hypertension = DataFrame(reader[0])
low = DataFrame(reader[1])
sleep = DataFrame(reader[2])
wine = DataFrame(reader[3])
# shape[0] gives the number of rows, shape[1] the number of columns
nrows = reader.shape[0] - 1
# get the records
record = reader[1:]
print(record)
# drop rows containing NaN
clean_all = record.dropna()
cl_all_nrows = clean_all.shape[0]
#print(cl_all_nrows)
#print(clean_all)
print(sleep)
y_axis = [1, 5, 10, 15]
#p1 = plt.bar(left=0,bottom=low , width=y, color='yellow',height=0.5, orientation='horizontal')
x_axis = tuple(sleep.keys())
plt.bar(x_axis, y_axis, color='rgb')
plt.show()
low = low.replace([0, 1, 2, 3], ['b0', 'b1', 'b2', 'b3'])
hypertension = hypertension.replace([0, 1, 2], ['a0', 'a1', 'a2'])
sleep = sleep.replace([0, 1, 2, 3, 4], ['c0', 'c1', 'c2', 'c3', 'c4'])
wine = wine.replace([0, 1, 2, 3], ['d0', 'd1', 'd2', 'd3'])
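Example #42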
"""
Replace
"""

ser1 = Series([1, 2, 3, 4, 1, 2, 3, 4])

# with replace we can select which value we want to replace
# replace 1 with NaN
ser1.replace(1, np.nan)
# lists can also be passed
ser1.replace([1, 4], [100, 400])

# can also pass a dictionary
ser1.replace({4: np.nan})

# assumes an existing DataFrame named dframe (one is built in the Rename section below)
dframe.replace({'Alma': 'SEA'})
"""
Rename
"""
dframe = DataFrame(np.arange(12).reshape(3, 4),
                   index=['NY', 'LA', 'SF'],
                   columns=['A', 'B', 'C', 'D'])
dframe.index.map(str.lower)
# makes the index lower case
dframe.index = dframe.index.map(str.lower)

# rename
dframe.rename(index=str.title, columns=str.lower)

# passing dictionaries for index and columns to rename them
dframe.rename(index={'ny': 'New York'}, columns={'A': 'Alpha'})
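Example #43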
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
# The map method on a Series accepts a function or a dict-like object
# containing a mapping. There is a small problem here: some of the meats
# above are capitalized and others are not, so convert each value to
# lower case first.
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)
# We could also have passed a function that does all the work:
print(data['food'].map(lambda x: meat_to_animal[x.lower()]))
# Using map is a convenient way to perform element-wise transformations
# and other data cleaning-related operations.

# Replacing Values - Filling in missing data with the fillna method can be
# thought of as a special case of more general value replacement. While map,
# as seen above, can be used to modify a subset of values in an object,
# replace provides a simpler and more flexible way to do so.
# Consider this series:
data = Series([1., -999., 2., -999., -1000., 3.])
print(data)
# The -999 values might be sentinel values for missing data. To replace them
# with NA values that pandas understands, use replace, producing a new Series:
print(data.replace(-999., np.nan))
# If you want to replace multiple values at once, pass a list and then the
# substitute value:
print(data.replace([-999., -1000.], np.nan))
# To use a different replacement for each value, pass a list of substitutes:
print(data.replace([-999., -1000.], [np.nan, 0]))
# The argument passed can also be a dict:
print(data.replace({-999.: np.nan, -1000.: 0}))

# Renaming Axis Indexes
# Like values in a Series, axis labels can be transformed by a function or a
# mapping of some form to produce new, differently labeled objects. The axes
# can also be modified in place without creating a new data structure:
data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
# Like a Series, the axis indexes have a map method:
print(data.index.map(str.upper))
# You can assign to index, modifying the DataFrame in place:
Example #44
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): 
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] 

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] 
        
        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [date_column, id_column, target_column, prediction_column]

            num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
            cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

        if production_data is not None:
            if target_column is not None and prediction_column is not None:
                production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
                production_data.dropna(axis=0, how='any', inplace=True)
                
                #plot output correlations
                abs_perc_error_time = go.Figure()

                abs_perc_error = list(map(lambda x : 100*abs(x[0] - x[1])/x[0], 
                    zip(production_data[target_column], production_data[prediction_column])))

                error_trace = go.Scatter(
                    x = production_data[date_column] if date_column else production_data.index,
                    y = abs_perc_error,
                    mode = 'lines',
                    name = 'Absolute Percentage Error',
                    marker=dict(
                        size=6,
                        color=red
                    )
                )

                zero_trace = go.Scatter(
                    x = production_data[date_column] if date_column else production_data.index,
                    y = [0]*production_data.shape[0],
                    mode = 'lines',
                    opacity=0.5,
                    marker=dict(
                        size=6,
                        color='green',
                    ),
                    showlegend=False,
                )

                abs_perc_error_time.add_trace(error_trace)
                abs_perc_error_time.add_trace(zero_trace)

                abs_perc_error_time.update_layout(
                    xaxis_title = "Timestamp" if date_column else "Index",
                    yaxis_title = "Percent",
                    legend = dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02,
                    xanchor="right",
                    x=1
                    )
                )

                abs_perc_error_time_json = json.loads(abs_perc_error_time.to_json())

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_graph",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=1,
                    params={
                        "data": abs_perc_error_time_json['data'],
                        "layout": abs_perc_error_time_json['layout']
                    },
                    additionalGraphs=[],
                )
            else:
                self.wi = None
        else:
            self.wi = None
Example #45
    def test_replace_datetimetz(self):

        # GH 11326
        # behaving poorly when presented with a datetime64[ns, tz]
        df = DataFrame({
            'A': date_range('20130101', periods=3, tz='US/Eastern'),
            'B': [0, np.nan, 2]
        })
        result = df.replace(np.nan, 1)
        expected = DataFrame({
            'A':
            date_range('20130101', periods=3, tz='US/Eastern'),
            'B':
            Series([0, 1, 2], dtype='float64')
        })
        assert_frame_equal(result, expected)

        result = df.fillna(1)
        assert_frame_equal(result, expected)

        result = df.replace(0, np.nan)
        expected = DataFrame({
            'A':
            date_range('20130101', periods=3, tz='US/Eastern'),
            'B': [np.nan, np.nan, 2]
        })
        assert_frame_equal(result, expected)

        result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                            Timestamp('20130104', tz='US/Eastern'))
        expected = DataFrame({
            'A': [
                Timestamp('20130101', tz='US/Eastern'),
                Timestamp('20130104', tz='US/Eastern'),
                Timestamp('20130103', tz='US/Eastern')
            ],
            'B': [0, np.nan, 2]
        })
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': pd.NaT},
                                Timestamp('20130104', tz='US/Eastern'))
        assert_frame_equal(result, expected)

        # coerce to object
        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': pd.NaT},
                                Timestamp('20130104', tz='US/Pacific'))
        expected = DataFrame({
            'A': [
                Timestamp('20130101', tz='US/Eastern'),
                Timestamp('20130104', tz='US/Pacific'),
                Timestamp('20130103', tz='US/Eastern')
            ],
            'B': [0, np.nan, 2]
        })
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': np.nan}, Timestamp('20130104'))
        expected = DataFrame({
            'A': [
                Timestamp('20130101', tz='US/Eastern'),
                Timestamp('20130104'),
                Timestamp('20130103', tz='US/Eastern')
            ],
            'B': [0, np.nan, 2]
        })
        assert_frame_equal(result, expected)
Example #46
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.applymap(lambda s: s.lower().strip() if isinstance(s, str) else s)
    df = df.replace(np.nan, '', regex=True)
    return df
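
Illustrative input and output for clean_df; the frame is an assumption:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'name': ['  Alice ', 'BOB', np.nan]})
    df = clean_df(df)  # -> 'alice', 'bob', '' (NaN becomes an empty string)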
Example #47
 def test_replace_int_to_int_chain(self):
     df = DataFrame({'a': lrange(1, 5)})
     with tm.assert_raises_regex(ValueError, "Replacement not allowed .+"):
         df.replace({'a': dict(zip(range(1, 5), range(2, 6)))})
Example #48
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            #target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

            #target_names = None

        if production_data is not None and target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            #array_prediction = reference_data[prediction_column].to_numpy()

            #prediction_ids = np.argmax(array_prediction, axis=-1)
            #prediction_labels = [prediction_column[x] for x in prediction_ids]
            if len(prediction_column) <= 2:
                binaraizer = preprocessing.LabelBinarizer()
                binaraizer.fit(reference_data[target_column])
                binaraized_target = pd.DataFrame(
                    binaraizer.transform(production_data[target_column]))
                binaraized_target.columns = ['target']

                params_data = []

                step_size = 0.05
                binded = list(
                    zip(binarized_target['target'].tolist(),
                        production_data[prediction_column[0]].tolist()))
                binded.sort(key=lambda item: item[1], reverse=True)

                data_size = len(binded)
                target_class_size = sum([x[0] for x in binded])

                #result = pd.DataFrame(columns = ['Top(%)', 'Count', 'TP', 'FP', 'precision', 'recall'])

                offset = max(round(data_size * step_size), 1)
                for step in np.arange(offset, data_size + offset, offset):
                    count = min(step, data_size)
                    prob = round(binded[min(step, data_size - 1)][1], 2)
                    top = round(100.0 * min(step, data_size) / data_size, 1)
                    tp = sum([x[0] for x in binded[:min(step, data_size)]])
                    fp = count - tp
                    precision = round(100.0 * tp / count, 1)
                    recall = round(100.0 * tp / target_class_size, 1)

                    params_data.append({
                        'f1': float(top),
                        'f2': int(count),
                        'f3': float(prob),
                        'f4': int(tp),
                        'f5': int(fp),
                        'f6': float(precision),
                        'f7': float(recall)
                    })

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_table",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=1,
                    params={
                        "rowsPerPage": 21,
                        "columns": [
                            {"title": "Top(%)", "field": "f1", "sort": "asc"},
                            {"title": "Count", "field": "f2"},
                            {"title": "Prob", "field": "f3"},
                            {"title": "TP", "field": "f4"},
                            {"title": "FP", "field": "f5"},
                            {"title": "Precision", "field": "f6"},
                            {"title": "Recall", "field": "f7"},
                        ],
                        "data": params_data,
                    },
                    additionalGraphs=[])
            else:
                binarizer = preprocessing.LabelBinarizer()
                binarizer.fit(reference_data[target_column])
                binarized_target = pd.DataFrame(
                    binarizer.transform(production_data[target_column]))
                binarized_target.columns = prediction_column

                # create one table per class label
                tabs = []

                for label in prediction_column:
                    params_data = []

                    step_size = 0.05
                    binded = list(
                        zip(binarized_target[label].tolist(),
                            production_data[label].tolist()))
                    binded.sort(key=lambda item: item[1], reverse=True)

                    data_size = len(binded)
                    target_class_size = sum([x[0] for x in binded])

                    #result = pd.DataFrame(columns = ['Top(%)', 'Count', 'TP', 'FP', 'precision', 'recall'])

                    offset = max(round(data_size * step_size), 1)
                    for step in np.arange(offset, data_size + offset, offset):
                        count = min(step, data_size)
                        prob = round(binded[min(step, data_size - 1)][1], 2)
                        top = round(100.0 * min(step, data_size) / data_size,
                                    1)
                        tp = sum([x[0] for x in binded[:min(step, data_size)]])
                        fp = count - tp
                        precision = round(100.0 * tp / count, 1)
                        recall = round(100.0 * tp / target_class_size, 1)

                        params_data.append({
                            'f1': float(top),
                            'f2': int(count),
                            'f3': float(prob),
                            'f4': int(tp),
                            'f5': int(fp),
                            'f6': float(precision),
                            'f7': float(recall)
                        })

                    tabs.append(
                        TabInfo(
                            id=label,
                            title=label,
                            widget=BaseWidgetInfo(
                                title="",
                                type="big_table",
                                details="",
                                alertStats=AlertStats(),
                                alerts=[],
                                alertsPosition="row",
                                insights=[],
                                size=2,
                                params={
                                    "rowsPerPage": 21,
                                    "columns": [
                                        {"title": "Top(%)", "field": "f1", "sort": "asc"},
                                        {"title": "Count", "field": "f2"},
                                        {"title": "Prob", "field": "f3"},
                                        {"title": "TP", "field": "f4"},
                                        {"title": "FP", "field": "f5"},
                                        {"title": "Precision", "field": "f6"},
                                        {"title": "Recall", "field": "f7"},
                                    ],
                                    "data": params_data,
                                },
                                additionalGraphs=[])))

                self.wi = BaseWidgetInfo(type="tabs",
                                         title=self.title,
                                         size=1,
                                         details="",
                                         tabs=tabs)
        else:
            self.wi = None
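Stripped of the widget plumbing, the per-label loop above builds a "top-k%" table: rank rows by predicted score, then at each 5% slice count true/false positives and compute precision and recall. A minimal standalone sketch of that computation (function name and toy inputs are hypothetical):

def topk_table(labels, scores, step_size=0.05):
    # labels: 0/1 ground truth; scores: predicted probabilities for the positive class
    binded = sorted(zip(labels, scores), key=lambda item: item[1], reverse=True)
    data_size = len(binded)
    target_class_size = sum(x[0] for x in binded)
    rows = []
    offset = max(round(data_size * step_size), 1)
    for step in range(offset, data_size + offset, offset):
        count = min(step, data_size)
        tp = sum(x[0] for x in binded[:count])
        rows.append({
            'top_pct': round(100.0 * count / data_size, 1),
            'count': count,
            'prob': round(binded[count - 1][1], 2),  # score threshold at this slice
            'tp': tp,
            'fp': count - tp,
            'precision': round(100.0 * tp / count, 1),
            'recall': round(100.0 * tp / target_class_size, 1),
        })
    return rows

topk_table([1, 0, 1, 0, 0, 1], [0.9, 0.8, 0.7, 0.4, 0.3, 0.2])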
Exemple #49
0
 def test_replace_with_dict_with_bool_keys(self):
     df = DataFrame({0: [True, False], 1: [False, True]})
     with tm.assert_raises_regex(TypeError, 'Cannot compare types .+'):
         df.replace({'asdf': 'asdb', True: 'yes'})
Exemple #50
0
 def test_fillna_dtype_conversion_equiv_replace(self, val):
     df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]})
     expected = df.replace(np.nan, val)
     result = df.fillna(val)
     tm.assert_frame_equal(result, expected)
Exemple #51
0
 def test_replace_pure_bool_with_string_no_op(self):
     df = DataFrame(np.random.rand(2, 2) > 0.5)
     result = df.replace('asdf', 'fdsa')
     assert_frame_equal(df, result)
Exemple #52
0
    def test_replace_mixed(self):
        mf = self.mixed_frame
        mf.iloc[5:20, mf.columns.get_loc('foo')] = nan
        mf.iloc[-10:, mf.columns.get_loc('A')] = nan

        result = self.mixed_frame.replace(np.nan, -18)
        expected = self.mixed_frame.fillna(value=-18)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-18, nan), self.mixed_frame)

        result = self.mixed_frame.replace(np.nan, -1e8)
        expected = self.mixed_frame.fillna(value=-1e8)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

        # int block upcasting
        df = DataFrame({
            'A': Series([1.0, 2.0], dtype='float64'),
            'B': Series([0, 1], dtype='int64')
        })
        expected = DataFrame({
            'A': Series([1.0, 2.0], dtype='float64'),
            'B': Series([0.5, 1], dtype='float64')
        })
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        df.replace(0, 0.5, inplace=True)
        assert_frame_equal(df, expected)

        # int block splitting
        df = DataFrame({
            'A': Series([1.0, 2.0], dtype='float64'),
            'B': Series([0, 1], dtype='int64'),
            'C': Series([1, 2], dtype='int64')
        })
        expected = DataFrame({
            'A': Series([1.0, 2.0], dtype='float64'),
            'B': Series([0.5, 1], dtype='float64'),
            'C': Series([1, 2], dtype='int64')
        })
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        # to object block upcasting
        df = DataFrame({
            'A': Series([1.0, 2.0], dtype='float64'),
            'B': Series([0, 1], dtype='int64')
        })
        expected = DataFrame({
            'A': Series([1, 'foo'], dtype='object'),
            'B': Series([0, 1], dtype='int64')
        })
        result = df.replace(2, 'foo')
        assert_frame_equal(result, expected)

        expected = DataFrame({
            'A': Series(['foo', 'bar'], dtype='object'),
            'B': Series([0, 'foo'], dtype='object')
        })
        result = df.replace([1, 2], ['foo', 'bar'])
        assert_frame_equal(result, expected)

        # test case from
        df = DataFrame({
            'A': Series([3, 0], dtype='int64'),
            'B': Series([0, 3], dtype='int64')
        })
        result = df.replace(3, df.mean().to_dict())
        expected = df.copy().astype('float64')
        m = df.mean()
        expected.iloc[0, 0] = m[0]
        expected.iloc[1, 1] = m[1]
        assert_frame_equal(result, expected)
Exemple #53
0
def to_sql(
    df: pd.DataFrame,
    table_name: str,
    creds: SqlCreds,
    sql_type: str = "table",
    schema: str = "dbo",
    index: bool = True,
    if_exists: str = "fail",
    batch_size: int = None,
    debug: bool = False,
    bcp_path: str = None,
):
    """
    Writes the pandas DataFrame to a SQL table or view.

    Will write all columns to the table or view. If the destination table/view doesn't exist, will create it.
    Assumes the SQL table/view has the same number, name, and type of columns.
    To only write parts of the DataFrame, filter it beforehand and pass that to this function.
    Unlike the pandas counterpart, if the DataFrame has no rows, nothing will happen.

    Parameters
    ----------
    df : pandas.DataFrame
    table_name : str
        Name of SQL table or view, without the schema
    creds : bcpandas.SqlCreds
        The credentials used in the SQL database.
    sql_type : {'table'}, can only be 'table'
        The type of SQL object of the destination.
    schema : str, default 'dbo'
        The SQL schema.
    index : bool, default True
        Write DataFrame index as a column. Uses the index name as the column
        name in the table.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.
        * fail: Raise a BCPandasValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values into the existing table, matching the dataframe columns
            to the database columns by name. The dataframe cannot have new columns that
            aren't in the table, but table columns may be missing from the dataframe.
    batch_size : int, optional
        Rows will be written in batches of this size at a time. By default, BCP sets this to 1000.
    debug : bool, default False
        If True, will not delete the temporary CSV and format files, and will output their location.
    bcp_path : str, default None
        The full path to the BCP utility, useful if it is not in the PATH environment variable
    """
    # validation
    if df.shape[0] == 0 or df.shape[1] == 0:
        return
    assert sql_type == TABLE, "only supporting table, not view, for now"
    assert if_exists in IF_EXISTS_OPTIONS

    if df.columns.has_duplicates:
        raise BCPandasValueError(
            "Columns with duplicate names detected, SQL requires that column names be unique. "
            f"Duplicates: {df.columns[df.columns.duplicated(keep=False)]}")

    # TODO diff way to implement? could be big performance hit with big dataframe
    if index:
        df = df.copy(deep=True).reset_index()

    delim = get_delimiter(df)
    quotechar = get_quotechar(df)

    if batch_size is not None:
        if batch_size == 0:
            raise BCPandasValueError("Param batch_size can't be 0")
        if batch_size > df.shape[0]:
            raise BCPandasValueError(
                "Param batch_size can't be larger than the number of rows in the DataFrame"
            )

    # save to temp path
    csv_file_path = get_temp_file()
    # replace bools with 1 or 0, this is what pandas native does when writing to SQL Server
    df.replace({
        True: 1,
        False: 0
    }).to_csv(
        path_or_buf=csv_file_path,
        sep=delim,
        header=False,
        index=False,  # already set as new col earlier if index=True
        quoting=csv.QUOTE_MINIMAL,  # pandas default
        quotechar=quotechar,
        line_terminator=NEWLINE,
        doublequote=True,
        escapechar=None,  # not needed, as using doublequote
    )
    logger.debug(f"Saved dataframe to temp CSV file at {csv_file_path}")

    # build format file
    fmt_file_path = get_temp_file()

    sql_item_exists = _sql_item_exists(sql_type=sql_type,
                                       schema=schema,
                                       table_name=table_name,
                                       creds=creds)
    cols_dict = None  # for mypy
    if if_exists == "append":
        # get dict of column names -> order of column
        cols_dict = dict(
            pd.read_sql_query(
                """
                SELECT COLUMN_NAME, ORDINAL_POSITION 
                FROM INFORMATION_SCHEMA.COLUMNS 
                WHERE TABLE_SCHEMA = '{_schema}'
                AND TABLE_NAME = '{_tbl}'
            """.format(_schema=schema, _tbl=table_name),
                creds.engine,
            ).values)

        # check that column names match in db and dataframe exactly
        if sql_item_exists:
            # the db cols are always strings, unlike df cols
            extra_cols = [
                str(x) for x in df.columns if str(x) not in cols_dict.keys()
            ]
            if extra_cols:
                raise BCPandasValueError(
                    f"Column(s) detected in the dataframe that are not in the database, "
                    f"cannot have new columns if `if_exists=='append'`, "
                    f"the extra column(s): {extra_cols}")

    fmt_file_txt = build_format_file(df=df,
                                     delimiter=delim,
                                     db_cols_order=cols_dict)
    with open(fmt_file_path, "w") as ff:
        ff.write(fmt_file_txt)
    logger.debug(f"Created BCP format file at {fmt_file_path}")

    try:
        if if_exists == "fail":
            if sql_item_exists:
                raise BCPandasValueError(
                    f"The {sql_type} called {schema}.{table_name} already exists, "
                    f"`if_exists` param was set to `fail`.")
            else:
                _create_table(schema=schema,
                              table_name=table_name,
                              creds=creds,
                              df=df,
                              if_exists=if_exists)
        elif if_exists == "replace":
            _create_table(schema=schema,
                          table_name=table_name,
                          creds=creds,
                          df=df,
                          if_exists=if_exists)
        elif if_exists == "append":
            if not sql_item_exists:
                _create_table(schema=schema,
                              table_name=table_name,
                              creds=creds,
                              df=df,
                              if_exists=if_exists)

        # BCP the data in
        bcp(
            sql_item=table_name,
            direction=IN,
            flat_file=csv_file_path,
            format_file_path=fmt_file_path,
            creds=creds,
            sql_type=sql_type,
            schema=schema,
            batch_size=batch_size,
            bcp_path=bcp_path,
        )
    finally:
        if not debug:
            logger.debug(f"Deleting temp CSV and format files")
            os.remove(csv_file_path)
            os.remove(fmt_file_path)
        else:
            logger.debug(
                f"`to_sql` DEBUG mode, not deleting the files. CSV file is at "
                f"{csv_file_path}, format file is at {fmt_file_path}")
Exemple #54
0
 def test_replace_regex_metachar(self, metachar):
     df = DataFrame({'a': [metachar, 'else']})
     result = df.replace({'a': {metachar: 'paren'}})
     expected = DataFrame({'a': ['paren', 'else']})
     assert_frame_equal(result, expected)
def generate_barchart_for_incomegroup_distribution(idf, startyear=None, stopyear=datetime.now().year,
                                                   targetdir="results/dataoverview/",
                                                   basefilename="incomegroups.png",
                                                   filterzerocommitment=True,
                                                   valuename="USD_Commitment_Defl",
                                                   incomegroups=['LDCs', 'LMICs', 'UMICs'],
                                                   figsize=(30, 56)):
    """
    generates six barchart-graphics showing the distribution of commitments and projectnumer among the different incomegroups (LDCs,LICs...) over time. also creates the json-files of the aggregated data.

    @param idf (DataFrame): the inputDataFrame that shall be used as base for the images and json-files
    @param startyear (int): select a specific year as start. for example 2011 to get graphs beginning at 1-1-2011 upto the end of the data
    @param stopyear: select a specific year to end. for example 2018 to get graphs including data upto 31-12-2018
    @param targetdir: where to store the results. will be created if needed
    @param filterzerocommitment: about 50% of the raw-data have not amount for the attribute 'USD_Commitment_Defl' per default those datapoints will be ignored
    @param valuename: defaults to USD_Commitment_Defl
    @param basefilename: the filename and format (based on extension) of the resulting image
    @incomegroups: only consider the listed incomegroups. expects an array of IncomegroupNames. There are: LDCs,LMICs,MADCTs,Other LICs,Part I unallocated by income, UMICs. project specific per default only LDCs, LMICs and UMICs are taken into account. with None or empty array every group is considered

    """

    all_incomegroups = ["LDCs", "LMICs", "MADCTs", "Other LICs", "Part I unallocated by income", "UMICs"]

    nrows = 8

    fig, axes = plt.subplots(nrows=nrows, ncols=1,figsize=figsize)

    df = DataFrame()
    for i in ['CommitmentDate','IncomegroupName',valuename]:
        df[i] = idf[i]

    if startyear:
        df = df[df['CommitmentDate'] > datetime(year=startyear-1,month=12,day=31)]
        targetdir = targetdir + "from_" + str(startyear) + "_"
    if stopyear:
        df = df[df['CommitmentDate'] < datetime(year=stopyear+1,month=1,day=1)]
        targetdir = targetdir + "upto_" + str(stopyear)
    if startyear or stopyear:
        targetdir = targetdir + "/"

    os.makedirs(targetdir,exist_ok=True)

    if filterzerocommitment:
        df = df[df[valuename].notnull()]
        df = df[df[valuename] != 0.0]

    if isinstance(incomegroups, list) and len(incomegroups) > 0:
        df = df[df['IncomegroupName'].isin(incomegroups)]

    # group by year and IncomegroupName
    df = df.set_index("CommitmentDate")

    groupeddf = df.groupby([Grouper(freq="A"), 'IncomegroupName'])[valuename]

    # resolve grouping, unstack, fill missing and reset_index()
    ddf = groupeddf.sum().unstack().fillna(0.0).reset_index()
    # get a percent view in a new DataFrame
    df2 = DataFrame()
    for i in all_incomegroups:
        try:
            df2[i] = ddf[i]
        except KeyError:
            pass
    df2['CommitmentYear'] = ddf.reset_index()['CommitmentDate'].apply(lambda x: str(x.year))

    df2 = df2.set_index('CommitmentYear')

    countrytypesum = DataFrame(df2.T.sum())
    for i in all_incomegroups:
        try:
            df2[i] = (df2[i] / countrytypesum[0]) * 100
        except KeyError:
            pass
    df2.replace([np.inf, -np.inf], np.nan, inplace=True)

    df2.plot(width=0.9, grid=True, kind='bar', ax=axes[1], title="mUSD per IncomeGroup (percent)")
    df2.plot(width=0.9, grid=True, kind='bar', stacked=True, ax=axes[2], title="mUSD per IncomeGroup (percent)")

    with open("%s%s-sum-percent.json" % (targetdir, basefilename), "w") as fd:
        fd.write(df2.to_json(orient="index"))

    # an absolute view
    # create index on year as string and drop old index
    ddf["CommitmentYear"] = ddf["CommitmentDate"].apply(lambda x: str(x.year))
    ddf = ddf.drop(columns=["CommitmentDate"]).set_index("CommitmentYear")

    # plot absolute sum
    ddf.plot(width=0.9, grid=True, kind='bar', ax=axes[0], title="mUSD per IncomeGroup (sum)")

    with open("%s%s-sum-absolut.json" % (targetdir, basefilename), "w") as fd:
        fd.write(ddf.to_json(orient="index"))

    # create bar chart with the mean
    ddf = groupeddf.mean().unstack().fillna(0.0).reset_index()

    # create index on year as string and drop old index
    ddf["CommitmentYear"] = ddf["CommitmentDate"].apply(lambda x: str(x.year))
    ddf = ddf.drop(columns=["CommitmentDate"]).set_index("CommitmentYear")

    ddf.plot(width=0.9, grid=True, kind='bar', ax=axes[3], title="mUSD per IncomeGroup (mean)")
    with open("%s%s-mean.json" % (targetdir, basefilename), "w") as fd:
        fd.write(ddf.to_json(orient="index"))


    # create bar chart with the median
    ddf = groupeddf.median().unstack().fillna(0.0).reset_index()

    # create index on year as string and drop old index
    ddf["CommitmentYear"] = ddf["CommitmentDate"].apply(lambda x: str(x.year))
    ddf = ddf.drop(columns=["CommitmentDate"]).set_index("CommitmentYear")

    ddf.plot(width=0.9, grid=True, kind='bar', ax=axes[4], title="mUSD per IncomeGroup (median)")
    with open("%s%s-median.json" % (targetdir, basefilename), "w") as fd:
        fd.write(ddf.to_json(orient="index"))

    ddf = groupeddf.count().unstack().fillna(0.0).reset_index()

    # get a percent view in a new DataFrame
    df2 = DataFrame()
    for i in all_incomegroups:
        try:
            df2[i] = ddf[i]
        except KeyError:
            pass
    df2['CommitmentYear'] = ddf.reset_index()['CommitmentDate'].apply(lambda x: str(x.year))

    df2 = df2.set_index('CommitmentYear')

    countrytypesum = DataFrame(df2.T.sum())
    for i in all_incomegroups:
        try:
            df2[i] = (df2[i] / countrytypesum[0]) * 100
        except KeyError:
            pass
    df2.replace([np.inf, -np.inf], np.nan, inplace=True)

    df2.plot(width=0.9, grid=True, kind='bar', ax=axes[6], title="projects per IncomeGroup (percent)")
    df2.plot(width=0.9, grid=True, kind='bar', stacked=True, ax=axes[7], title="projects per IncomeGroup (percent)")
    with open("%s%s-count-percent.json" % (targetdir, basefilename), "w") as fd:
        fd.write(df2.to_json(orient="index"))


    # create index on year as string and drop old index
    ddf["CommitmentYear"] = ddf["CommitmentDate"].apply(lambda x: str(x.year))
    ddf = ddf.drop(columns=["CommitmentDate"]).set_index("CommitmentYear")

    ddf.plot(width=0.9, grid=True, kind='bar', ax=axes[5], title="projects per IncomeGroup (count)")
    with open("%s%s-count-absolut.json" % (targetdir, basefilename), "w") as fd:
        fd.write(ddf.to_json(orient="index"))

    plt.savefig(targetdir + basefilename, bbox_inches='tight')
    plt.close(plt.gcf())
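The heart of the function is the Grouper-based aggregation followed by a percent view. A minimal sketch of that step on toy data (all values hypothetical):

import pandas as pd

df = pd.DataFrame({
    'CommitmentDate': pd.to_datetime(['2017-03-01', '2017-06-01', '2018-02-01', '2018-09-01']),
    'IncomegroupName': ['LDCs', 'UMICs', 'LDCs', 'LDCs'],
    'USD_Commitment_Defl': [10.0, 30.0, 5.0, 15.0],
}).set_index('CommitmentDate')

# yearly sums per income group, missing combinations filled with 0
yearly = (df.groupby([pd.Grouper(freq="A"), 'IncomegroupName'])['USD_Commitment_Defl']
            .sum().unstack().fillna(0.0))

# percent share of each group within its year
shares = yearly.div(yearly.sum(axis=1), axis=0) * 100
# 2017: LDCs 25%, UMICs 75%; 2018: LDCs 100%, UMICs 0%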
def add_custom_ta_features(
    df: pd.DataFrame,
    open: str,  # noqa
    high: str,
    low: str,
    close: str,
    fillna: bool = False,
    colprefix: str = "",
    apply_pct: bool = False,
    plot: bool = False,
    symbol: str = "",  # legend label used in the plots below
) -> pd.DataFrame:

    # Add Volatility TA
    df = ta.add_volatility_ta(df=df,
                              high=high,
                              low=low,
                              close=close,
                              fillna=fillna,
                              colprefix=colprefix)
    # Add Trend TA
    df = ta.add_trend_ta(df=df,
                         high=high,
                         low=low,
                         close=close,
                         fillna=fillna,
                         colprefix=colprefix)
    # Add Other TA
    df = ta.add_others_ta(df=df,
                          close=close,
                          fillna=fillna,
                          colprefix=colprefix)

    # convert to pct
    if apply_pct:
        df = df.pct_change(fill_method='ffill')
        df = df.applymap(lambda x: x * 100)
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df = df.astype(np.float32)  # astype returns a new frame, so reassign it
    df = df.round(5)

    if fillna:
        df.fillna(value=0, inplace=True)

    if plot:
        fig = make_subplots(rows=5,
                            cols=1,
                            shared_xaxes=True,
                            vertical_spacing=0.02,
                            subplot_titles=("Close", "Bollinger Bands",
                                            "MACD", "SMA", "EMA"))

        fig.add_trace(go.Scatter(x=df.index, y=df['close'], name=symbol),
                      row=1,
                      col=1)

        # Bollinger-Bands
        fig.add_trace(go.Scatter(x=df.index, y=df['close'], name=symbol),
                      row=2,
                      col=1)

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['volatility_bbh'],
                                 name=symbol + ' High BB'),
                      row=2,
                      col=1)

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['volatility_bbl'],
                                 name=symbol + ' Low BB'),
                      row=2,
                      col=1)

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['volatility_bbm'],
                                 name=symbol + ' EMA BB'),
                      row=2,
                      col=1)

        # MACD
        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['trend_macd'],
                                 name=symbol + ' MACD'),
                      row=3,
                      col=1)

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['trend_macd_signal'],
                                 name=symbol + ' MACD Signal'),
                      row=3,
                      col=1)

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['trend_macd_diff'],
                                 name=symbol + ' MACD Difference'),
                      row=3,
                      col=1)

        # SMA
        fig.add_trace(go.Scatter(x=df.index, y=df['close'], name=symbol),
                      row=4,
                      col=1)

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['trend_sma_fast'],
                                 name=symbol + ' SMA-Fast'),
                      row=4,
                      col=1)

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['trend_sma_slow'],
                                 name=symbol + ' SMA-Slow'),
                      row=4,
                      col=1)

        # EMA
        fig.add_trace(go.Scatter(x=df.index, y=df['close'], name=symbol),
                      row=5,
                      col=1)

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['trend_ema_fast'],
                                 name=symbol + ' EMA-Fast'),
                      row=5,
                      col=1)

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['trend_ema_slow'],
                                 name=symbol + ' EMA-Slow'),
                      row=5,
                      col=1)

        config = {'displayModeBar': False}
        fig.show(config=config)

    return df
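A usage sketch with a hypothetical OHLC frame; column names match the parameters above, and the ta package is assumed to be installed:

import numpy as np
import pandas as pd

n = 200
rng = np.random.default_rng(0)
closes = 100 + np.cumsum(rng.normal(size=n))
ohlc = pd.DataFrame({
    'open': closes + rng.normal(scale=0.1, size=n),
    'high': closes + 1.0,
    'low': closes - 1.0,
    'close': closes,
}, index=pd.date_range('2021-01-01', periods=n, freq='D'))

features = add_custom_ta_features(ohlc, open='open', high='high', low='low',
                                  close='close', fillna=True)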
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): 
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] 

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] 
        
        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [date_column, id_column, target_column, prediction_column]

            num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
            cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

        if target_column is not None and prediction_column is not None:
            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)
            
            # plot the error distribution
            error_distr = go.Figure()

            error = reference_data[prediction_column] - reference_data[target_column]

            error_distr.add_trace(go.Histogram(x=error,
                marker_color=red, name='error distribution', histnorm='percent'))

            error_distr.update_layout(
                xaxis_title="Error (Predicted - Actual)",
                yaxis_title="Percentage",
            )

            error_distr_json = json.loads(error_distr.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": error_distr_json['data'],
                    "layout": error_distr_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
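Outside the widget wrapper, the plot above is a single plotly histogram. A minimal sketch (column names and values hypothetical):

import json
import pandas as pd
import plotly.graph_objs as go

ref = pd.DataFrame({'target': [10.0, 12.0, 9.0], 'prediction': [11.0, 11.5, 8.0]})

fig = go.Figure()
fig.add_trace(go.Histogram(x=ref['prediction'] - ref['target'],
                           name='error distribution', histnorm='percent'))
fig.update_layout(xaxis_title="Error (Predicted - Actual)", yaxis_title="Percentage")

payload = json.loads(fig.to_json())  # the widget consumes payload['data'] and payload['layout']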
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

            target_names = None

        if target_column is not None and prediction_column is not None:
            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)

            #plot support bar
            metrics_matrix = metrics.classification_report(
                reference_data[target_column],
                reference_data[prediction_column],
                output_dict=True)
            metrics_frame = pd.DataFrame(metrics_matrix)
            # last row of the report frame is the per-class support; the last three
            # columns are the accuracy / macro avg / weighted avg aggregates
            support = metrics_frame.iloc[-1:, :-3].values[0]

            fig = go.Figure()

            fig.add_trace(
                go.Bar(x=target_names if target_names else
                       metrics_frame.columns.tolist()[:-3],
                       y=support,
                       marker_color=red,
                       name='Support'))

            fig.update_layout(
                xaxis_title="Class",
                yaxis_title="Number of Objects",
            )

            support_bar_json = json.loads(fig.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1 if production_data is not None else 2,
                params={
                    "data": support_bar_json['data'],
                    "layout": support_bar_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
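The support values come straight out of sklearn's classification_report; a minimal sketch of the extraction (labels hypothetical):

import pandas as pd
from sklearn import metrics

y_true = ['cat', 'dog', 'dog', 'cat', 'bird']
y_pred = ['cat', 'dog', 'cat', 'cat', 'bird']

report = pd.DataFrame(metrics.classification_report(y_true, y_pred, output_dict=True))
# rows: precision / recall / f1-score / support; the last three columns are the
# accuracy / macro avg / weighted avg aggregates, so slice them off
classes = report.columns.tolist()[:-3]
support = report.iloc[-1:, :-3].values[0]
print(dict(zip(classes, support)))  # e.g. {'bird': 1.0, 'cat': 2.0, 'dog': 2.0}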
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): 
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] 

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] 
        
        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [date_column, id_column, target_column, prediction_column]

            num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
            cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

        if production_data is not None:
            if target_column is not None and prediction_column is not None:
                production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
                production_data.dropna(axis=0, how='any', inplace=True)
            
                # calculate quality metrics
                me = np.mean(production_data[prediction_column] - production_data[target_column])
                sde = np.std(production_data[prediction_column] - production_data[target_column], ddof=1)

                abs_err = list(map(lambda x: abs(x[0] - x[1]),
                    zip(production_data[prediction_column], production_data[target_column])))
                mae = np.mean(abs_err)
                sdae = np.std(abs_err, ddof=1)

                # MAPE is relative to the actual value (x[1]), not the prediction
                abs_perc_err = list(map(lambda x: 100 * abs(x[0] - x[1]) / abs(x[1]),
                    zip(production_data[prediction_column], production_data[target_column])))
                mape = np.mean(abs_perc_err)
                sdape = np.std(abs_perc_err, ddof=1)

                #sqrt_err = list(map(lambda x : (x[0] - x[1])**2, 
                #    zip(production_data[prediction_column], production_data[target_column])))
                #mse = np.mean(sqrt_err)
                #sdse = np.std(sqrt_err, ddof = 1)

                #error_norm_json = json.loads(error_norm.to_json())

                self.wi = BaseWidgetInfo(
                    title="Production: Model Quality (+/- std)",
                    type="counter",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=2,
                    params={   
                        "counters": [
                          {
                            "value": str(round(me, 2)) + " (" + str(round(sde,2)) + ")",
                            "label": "ME"
                          },
                          {
                            "value": str(round(mae, 2)) + " (" + str(round(sdae,2)) + ")",
                            "label": "MAE"
                          },
                          {
                            "value": str(round(mape, 2)) + " (" + str(round(sdape, 2)) + ")",
                            "label": "MAPE"
                          }#,
                          #{
                          #  "value": str(round(mse, 2)) + " (" + str(round(sdse, 2)) + ")",
                          #  "label": "MSE"
                          #}
                        ]
                    },
                    additionalGraphs=[],
                )
            else:
                self.wi = None
        else:
            self.wi = None
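The three counters reduce to a few numpy one-liners. A minimal sketch on toy arrays (values hypothetical):

import numpy as np

actual = np.array([100.0, 200.0, 300.0])
predicted = np.array([110.0, 190.0, 330.0])

error = predicted - actual
me, sde = np.mean(error), np.std(error, ddof=1)                   # mean error +/- std
mae, sdae = np.mean(np.abs(error)), np.std(np.abs(error), ddof=1)
ape = 100 * np.abs(error) / np.abs(actual)                        # percent error vs. actual
mape, sdape = np.mean(ape), np.std(ape, ddof=1)

print(f"ME {me:.2f} ({sde:.2f}), MAE {mae:.2f} ({sdae:.2f}), MAPE {mape:.2f} ({sdape:.2f})")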
def rename_columns(df_: pd.DataFrame) -> pd.DataFrame:
    # Despite its name, this function recodes cell values rather than renaming columns.
    # -1 is the dataset-wide sentinel for "missing".
    df_ = df_.replace(-1, np.nan)

    zero_replacements = [
        "Age",
        "Education",
        "MaritalStatus",
        "EmploymentStatus",
        "OccupationArea",
        "CreditScoreEeMini",
    ]
    df_[zero_replacements] = df_[zero_replacements].replace(0.0, np.nan)

    value_replacements = {
        "UseOfLoan": {
            0: "Loan_consolidation",
            1: "Real_estate",
            2: "Home_improvement",
            3: "Business",
            4: "Education",
            5: "Travel",
            6: "Vehicle",
            7: "Other",
            8: "Health",
            101: "Working_capital_financing",
            102: "Purchase_of_machinery_equipment",
            103: "Renovation_of_real_estate",
            104: "Accounts_receivable_financing ",
            105: "Acquisition_of_means_of_transport",
            106: "Construction_finance",
            107: "Acquisition_of_stocks",
            108: "Acquisition_of_real_estate",
            109: "Guaranteeing_obligation ",
            110: "Other_business",
        },
        "Education": {
            1: "Primary",
            2: "Basic",
            3: "Vocational",
            4: "Secondary",
            5: "Higher",
        },
        "MaritalStatus": {
            1: "Married",
            2: "Cohabitant",
            3: "Single",
            4: "Divorced",
            5: "Widow",
        },
        "EmploymentStatus": {
            1: "Unemployed",
            2: "Partially",
            3: "Fully",
            4: "Self_employed",
            5: "Entrepreneur",
            6: "Retiree",
        },
        "NewCreditCustomer": {
            0: "Existing_credit_customer",
            1: "New_credit_Customer"
        },
        "OccupationArea": {
            1: "Other",
            2: "Mining",
            3: "Processing",
            4: "Energy",
            5: "Utilities",
            6: "Construction",
            7: "Retail_and_wholesale",
            8: "Transport_and_warehousing",
            9: "Hospitality_and_catering",
            10: "Info_and_telecom",
            11: "Finance_and_insurance",
            12: "Real_estate",
            13: "Research",
            14: "Administrative",
            15: "Civil_service_and_military",
            16: "Education",
            17: "Healthcare_and_social_help",
            18: "Art_and_entertainment",
            19: "Agriculture_forestry_and_fishing",
        },
        "HomeOwnershipType": {
            0: "Homeless",
            1: "Owner",
            2: "Living_with_parents",
            3: "Tenant_pre_furnished_property",
            4: "Tenant_unfurnished_property",
            5: "Council_house",
            6: "Joint_tenant",
            7: "Joint_ownership",
            8: "Mortgage",
            9: "Owner_with_encumbrance",
            10: "Other",
        },
        "NrOfDependants": {
            "10Plus": 11
        },
        "Gender": {
            0: "Male",
            1: "Female",
            2: "Unknown"
        },
    }
    df_ = df_.replace(value_replacements)

    return df_
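A small demonstration of the recoding on a toy frame (values hypothetical). The frame must contain all columns listed in zero_replacements for the zero-to-NaN step to work:

import numpy as np
import pandas as pd

raw = pd.DataFrame({
    "Age": [0.0, 34.0],            # 0 is treated as missing
    "Education": [5, -1],          # -1 is the global missing sentinel
    "MaritalStatus": [1, 3],
    "EmploymentStatus": [3, 0],
    "OccupationArea": [16, 2],
    "CreditScoreEeMini": [0, 700],
    "Gender": [0, 1],
})
recoded = rename_columns(raw)
# e.g. Education -> ['Higher', NaN], MaritalStatus -> ['Married', 'Single'],
# Gender -> ['Male', 'Female'], and the flagged zeros/-1s become NaN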