    [FieldTypes.uint64, [64, np.uint64(64)], [64.]],
    [FieldTypes.float16, [16., np.float16(16)], [16]],
    [FieldTypes.float32, [32., np.float32(32)], [32]],
    [FieldTypes.float64, [64., np.float64(64)], [64]],
    [FieldTypes.complex64, [1+2j, np.complex64(1+2j)], [64]],
    [FieldTypes.complex128, [1+2j, np.complex128(1+2j)], [128]],
    [FieldTypes.bytes, [b'abc', np.bytes_('abc')], ['abc']],
    [FieldTypes.string, ['abc', np.str_('abc')], [b'abc']],
    [FieldTypes.ndarray, [np.array([1, 2, 3])], [object()]],
    [FieldTypes.dtype, [np.dtype(np.int32), pd.StringDtype()], [object()]],
    [FieldTypes.key, [MyClass()], [object()]],
    [FieldTypes.slice, [slice(1, 10), slice('a', 'b')], [object()]],
    [FieldTypes.datetime, [datetime.now(), pd.Timestamp(0)], [object()]],
    [FieldTypes.timedelta, [timedelta(days=1), pd.Timedelta(days=1)], [object()]],
    [FieldTypes.tzinfo, [timezone.utc], [object()]],
    [FieldTypes.index, [pd.RangeIndex(10), pd.Index([1, 2])], [object()]],
    [FieldTypes.series, [pd.Series([1, 2, 3])], [object()]],
    [FieldTypes.dataframe, [pd.DataFrame({'a': [1, 2]})], [object()]],
    [FieldTypes.interval_array, [pd.arrays.IntervalArray([])], [object()]],
    [FieldTypes.function, [MyClass.my_func], [object()]],
    [FieldTypes.namedtuple, [my_named_tuple(a=1, b=2)], [tuple()]],
    [FieldTypes.reference(MyClass), [MyClass()], [object()]],
    [FieldTypes.tuple(FieldTypes.int64, ...), [tuple(), tuple([1, 2])], [list(), tuple([1, 2.])]],
    [FieldTypes.list(FieldTypes.int64, FieldTypes.float64), [[1, 1.]], [tuple(), [1, 1]]],
    [FieldTypes.dict(FieldTypes.string, FieldTypes.int64), [{'a': 1}], [{1: 'a'}, {'a': 1.}]],
    [FieldTypes.any, [object()], []],
]


@pytest.mark.parametrize(
    'field_type, valid_values, invalid_values',
def test_frame_from_json_to_json(self): def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_dtype=True, raise_ok=None, sort=None, check_index_type=True, check_column_type=True, check_numpy_dtype=False): if sort is not None: df = df.sort_values(sort) else: df = df.sort_index() # if we are not unique, then check that we are raising ValueError # for the appropriate orients if not df.index.is_unique and orient in ['index', 'columns']: pytest.raises(ValueError, lambda: df.to_json(orient=orient)) return if (not df.columns.is_unique and orient in ['index', 'columns', 'records']): pytest.raises(ValueError, lambda: df.to_json(orient=orient)) return dfjson = df.to_json(orient=orient) try: unser = read_json(dfjson, orient=orient, dtype=dtype, numpy=numpy, convert_axes=convert_axes) except Exception as detail: if raise_ok is not None: if isinstance(detail, raise_ok): return raise if sort is not None and sort in unser.columns: unser = unser.sort_values(sort) else: unser = unser.sort_index() if dtype is False: check_dtype = False if not convert_axes and df.index.dtype.type == np.datetime64: unser.index = DatetimeIndex( unser.index.values.astype('i8') * 1e6) if orient == "records": # index is not captured in this orientation tm.assert_almost_equal(df.values, unser.values, check_dtype=check_numpy_dtype) tm.assert_index_equal(df.columns, unser.columns, exact=check_column_type) elif orient == "values": # index and cols are not captured in this orientation if numpy is True and df.shape == (0, 0): assert unser.shape[0] == 0 else: tm.assert_almost_equal(df.values, unser.values, check_dtype=check_numpy_dtype) elif orient == "split": # index and col labels might not be strings unser.index = [str(i) for i in unser.index] unser.columns = [str(i) for i in unser.columns] if sort is None: unser = unser.sort_index() tm.assert_almost_equal(df.values, unser.values, check_dtype=check_numpy_dtype) else: if convert_axes: tm.assert_frame_equal(df, unser, check_dtype=check_dtype, check_index_type=check_index_type, check_column_type=check_column_type) else: tm.assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype) def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, sort=None, check_index_type=True, check_column_type=True): # numpy=False if convert_axes: _check_orient(df, "columns", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "records", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "split", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "index", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "values", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort) _check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort) _check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort) _check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort) _check_orient(df, "values", dtype=dtype, convert_axes=False, sort=sort) # numpy=True and raise_ok might be not None, so ignore the error if convert_axes: _check_orient(df, "columns", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "records", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) 
_check_orient(df, "split", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "index", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "values", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "columns", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "records", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "split", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "index", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "values", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) # basic _check_all_orients(self.frame) assert self.frame.to_json() == self.frame.to_json(orient="columns") _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) _check_all_orients(self.intframe, dtype=False) # big one # index and columns are strings as all unserialised JSON object keys # are assumed to be strings biggie = DataFrame(np.zeros((200, 4)), columns=[str(i) for i in range(4)], index=[str(i) for i in range(200)]) _check_all_orients(biggie, dtype=False, convert_axes=False) # dtypes _check_all_orients(DataFrame(biggie, dtype=np.float64), dtype=np.float64, convert_axes=False) _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int, convert_axes=False) _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3', convert_axes=False, raise_ok=ValueError) # categorical _check_all_orients(self.categorical, sort='sort', raise_ok=ValueError) # empty _check_all_orients(self.empty_frame, check_index_type=False, check_column_type=False) # time series data _check_all_orients(self.tsframe) # mixed data index = pd.Index(['a', 'b', 'c', 'd', 'e']) data = { 'A': [0., 1., 2., 3., 4.], 'B': [0., 1., 0., 1., 0.], 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': [True, False, True, False, True] } df = DataFrame(data=data, index=index) _check_orient(df, "split", check_dtype=False) _check_orient(df, "records", check_dtype=False) _check_orient(df, "values", check_dtype=False) _check_orient(df, "columns", check_dtype=False) # index oriented is problematic as it is read back in in a transposed # state, so the columns are interpreted as having mixed data and # given object dtypes. # force everything to have object dtype beforehand _check_orient(df.transpose().transpose(), "index", dtype=False)
def test_extract_cell_dataframe(self): data = np.stack(( np.array([[3, 2, 4, 0], [1, 1, 3, 1], [0, 0, 1, 1], [5, 0, 3, 1]]), np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 2, 0], [5, 0, 0, 0]]), ), axis=2) cell_labels = np.array([[0, 1, 1, 0], [1, 1, 3, 3], [0, 0, 3, 3], [0, 0, 3, 3]]) image = mi.MibiImage(data, ['1', '2']) labels = [1, 3] areas = [4, 6] x_centroids = [1, 2] y_centroids = [0, 2] first_total = [8, 10] second_total = [1, 3] # Check coords and areas only expected_from_labels = pd.DataFrame( np.array([areas, x_centroids, y_centroids]).T, columns=['area', 'x_centroid', 'y_centroid'], index=pd.Index(labels, name='label')) pdt.assert_frame_equal( segmentation.extract_cell_dataframe(cell_labels), expected_from_labels) # Check mode 'total' expected_from_total = pd.DataFrame( np.array([first_total, second_total]).T, columns=['1', '2'], index=pd.Index(labels, name='label')) pdt.assert_frame_equal( segmentation.extract_cell_dataframe(cell_labels, image), pd.concat((expected_from_labels, expected_from_total), axis=1)) # Check mode 'quadrant' quads = [] for label in labels: inds = np.nonzero(cell_labels == label) quads.append( segmentation._circular_sectors_mean(inds, image, num_sectors=4)) expected_from_quadrants = pd.DataFrame(np.array(quads), columns=['1', '2'], index=pd.Index(labels, name='label')) pdt.assert_frame_equal( segmentation.extract_cell_dataframe(cell_labels, image, mode='quadrant'), pd.concat((expected_from_labels, expected_from_quadrants), axis=1)) # Check mode 'circular_sectors' secs = [] for label in labels: inds = np.nonzero(cell_labels == label) num_sectors = 8 secs.append( segmentation._circular_sectors_mean(inds, image, num_sectors)) expected_from_circular_sectors = pd.DataFrame( np.array(secs), columns=['1', '2'], index=pd.Index(labels, name='label')) pdt.assert_frame_equal( segmentation.extract_cell_dataframe(cell_labels, image, mode='circular_sectors', num_sectors=num_sectors), pd.concat((expected_from_labels, expected_from_circular_sectors), axis=1))
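# The test above exercises segmentation.extract_cell_dataframe, whose implementation is not shown
# in this excerpt. A minimal, hypothetical sketch of the label-only part (pixel area and integer
# centroid per non-background label) could look like the following; it is an assumption for
# illustration, not the library's code.
import numpy as np
import pandas as pd

def label_stats(cell_labels):
    """Per-label pixel count and integer centroid for a 2-D label image (0 = background)."""
    labels = np.unique(cell_labels)
    labels = labels[labels != 0]
    rows = []
    for label in labels:
        ys, xs = np.nonzero(cell_labels == label)
        rows.append((xs.size, int(xs.mean()), int(ys.mean())))
    return pd.DataFrame(rows, columns=['area', 'x_centroid', 'y_centroid'],
                        index=pd.Index(labels, name='label'))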
class TestTableSchemaType: @pytest.mark.parametrize("int_type", [np.int, np.int16, np.int32, np.int64]) def test_as_json_table_type_int_data(self, int_type): int_data = [1, 2, 3] assert as_json_table_type(np.array(int_data, dtype=int_type)) == "integer" @pytest.mark.parametrize("float_type", [np.float, np.float16, np.float32, np.float64]) def test_as_json_table_type_float_data(self, float_type): float_data = [1.0, 2.0, 3.0] assert as_json_table_type(np.array(float_data, dtype=float_type)) == "number" @pytest.mark.parametrize("bool_type", [bool, np.bool]) def test_as_json_table_type_bool_data(self, bool_type): bool_data = [True, False] assert as_json_table_type(np.array(bool_data, dtype=bool_type)) == "boolean" @pytest.mark.parametrize( "date_data", [ pd.to_datetime(["2016"]), pd.to_datetime(["2016"], utc=True), pd.Series(pd.to_datetime(["2016"])), pd.Series(pd.to_datetime(["2016"], utc=True)), pd.period_range("2016", freq="A", periods=3), ], ) def test_as_json_table_type_date_data(self, date_data): assert as_json_table_type(date_data) == "datetime" @pytest.mark.parametrize( "str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])]) def test_as_json_table_type_string_data(self, str_data): assert as_json_table_type(str_data) == "string" @pytest.mark.parametrize( "cat_data", [ pd.Categorical(["a"]), pd.Categorical([1]), pd.Series(pd.Categorical([1])), pd.CategoricalIndex([1]), pd.Categorical([1]), ], ) def test_as_json_table_type_categorical_data(self, cat_data): assert as_json_table_type(cat_data) == "any" # ------ # dtypes # ------ @pytest.mark.parametrize("int_dtype", [np.int, np.int16, np.int32, np.int64]) def test_as_json_table_type_int_dtypes(self, int_dtype): assert as_json_table_type(int_dtype) == "integer" @pytest.mark.parametrize("float_dtype", [np.float, np.float16, np.float32, np.float64]) def test_as_json_table_type_float_dtypes(self, float_dtype): assert as_json_table_type(float_dtype) == "number" @pytest.mark.parametrize("bool_dtype", [bool, np.bool]) def test_as_json_table_type_bool_dtypes(self, bool_dtype): assert as_json_table_type(bool_dtype) == "boolean" @pytest.mark.parametrize( "date_dtype", [ np.datetime64, np.dtype("<M8[ns]"), PeriodDtype("D"), DatetimeTZDtype("ns", "US/Central"), ], ) def test_as_json_table_type_date_dtypes(self, date_dtype): # TODO: datedate.date? datetime.time? assert as_json_table_type(date_dtype) == "datetime" @pytest.mark.parametrize("td_dtype", [np.timedelta64, np.dtype("<m8[ns]")]) def test_as_json_table_type_timedelta_dtypes(self, td_dtype): assert as_json_table_type(td_dtype) == "duration" @pytest.mark.parametrize("str_dtype", [object]) # TODO def test_as_json_table_type_string_dtypes(self, str_dtype): assert as_json_table_type(str_dtype) == "string" def test_as_json_table_type_categorical_dtypes(self): # TODO: I think before is_categorical_dtype(Categorical) # returned True, but now it's False. Figure out why or # if it matters assert as_json_table_type(pd.Categorical(["a"])) == "any" assert as_json_table_type(CategoricalDtype()) == "any"
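# The class above checks pandas' Table Schema type mapping one dtype at a time. For context,
# build_table_schema (a public pandas helper) produces the full schema for a DataFrame; the
# exact output may vary slightly across pandas versions.
import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({
    "a": [1, 2],                            # -> "integer"
    "b": [1.5, 2.5],                        # -> "number"
    "c": pd.to_datetime(["2016", "2017"]),  # -> "datetime"
})
print(build_table_schema(df, index=False)["fields"])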
def test_read_json_table_orient(self, index_nm, vals, recwarn): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) out = df.to_json(orient="table") result = pd.read_json(out, orient="table") tm.assert_frame_equal(df, result)
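# A self-contained version of the round trip tested above: orient="table" embeds the Table Schema
# in the JSON, so the named index survives serialisation.
import pandas as pd
import pandas.testing as tm

df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=pd.Index(range(4), name="idx"))
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)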
def bureau_and_balance(file_path=file_path, nan_as_category=True): df_bureau_b = reduce_mem_usage(pd.read_csv(file_path + 'bureau_balance.csv'), verbose=False) # Some new features in bureau_balance set tmp = df_bureau_b[['SK_ID_BUREAU', 'STATUS']].groupby('SK_ID_BUREAU') tmp_last = tmp.last() tmp_last.columns = ['First_status'] df_bureau_b = df_bureau_b.join(tmp_last, how='left', on='SK_ID_BUREAU') tmp_first = tmp.first() tmp_first.columns = ['Last_status'] df_bureau_b = df_bureau_b.join(tmp_first, how='left', on='SK_ID_BUREAU') del tmp, tmp_first, tmp_last gc.collect() tmp = df_bureau_b[['SK_ID_BUREAU', 'MONTHS_BALANCE']].groupby('SK_ID_BUREAU').last() tmp = tmp.apply(abs) tmp.columns = ['Month'] df_bureau_b = df_bureau_b.join(tmp, how='left', on='SK_ID_BUREAU') del tmp gc.collect() tmp = df_bureau_b.loc[df_bureau_b['STATUS'] == 'C', ['SK_ID_BUREAU', 'MONTHS_BALANCE']] \ .groupby('SK_ID_BUREAU').last() tmp = tmp.apply(abs) tmp.columns = ['When_closed'] df_bureau_b = df_bureau_b.join(tmp, how='left', on='SK_ID_BUREAU') del tmp gc.collect() df_bureau_b['Month_closed_to_end'] = df_bureau_b['Month'] - df_bureau_b['When_closed'] for c in range(6): tmp = df_bureau_b.loc[df_bureau_b['STATUS'] == str(c), ['SK_ID_BUREAU', 'MONTHS_BALANCE']] \ .groupby('SK_ID_BUREAU').count() tmp.columns = ['DPD_' + str(c) + '_cnt'] df_bureau_b = df_bureau_b.join(tmp, how='left', on='SK_ID_BUREAU') df_bureau_b['DPD_' + str(c) + ' / Month'] = df_bureau_b['DPD_' + str(c) + '_cnt'] / df_bureau_b['Month'] del tmp gc.collect() df_bureau_b['Non_zero_DPD_cnt'] = df_bureau_b[ ['DPD_1_cnt', 'DPD_2_cnt', 'DPD_3_cnt', 'DPD_4_cnt', 'DPD_5_cnt']].sum(axis=1) df_bureau_b, bureau_b_cat = one_hot_encoder(df_bureau_b, nan_as_category) # Bureau balance: Perform aggregations aggregations = {} for col in df_bureau_b.columns: aggregations[col] = ['mean','sum'] if col in bureau_b_cat else ['min', 'max', 'size'] df_bureau_b_agg = df_bureau_b.groupby('SK_ID_BUREAU').agg(aggregations) df_bureau_b_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_bureau_b_agg.columns.tolist()]) del df_bureau_b gc.collect() df_bureau = reduce_mem_usage(pd.read_csv(file_path + 'bureau.csv'), verbose=False) # Replace\remove some outliers in bureau set # fill na df_bureau.loc[df_bureau['CREDIT_ACTIVE'] == 'Closed', ['AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT']] = \ df_bureau[df_bureau['CREDIT_ACTIVE'] == 'Closed'][['AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT']].fillna(0) # credit sum = credit sum limit + credit sum debt df_bureau.loc[df_bureau['CREDIT_ACTIVE'] == 'Active', 'AMT_CREDIT_SUM_LIMIT'] = \ df_bureau[df_bureau['CREDIT_ACTIVE'] == 'Active']['AMT_CREDIT_SUM'] - \ df_bureau[df_bureau['CREDIT_ACTIVE'] == 'Active']['AMT_CREDIT_SUM_DEBT'] df_bureau.loc[df_bureau['AMT_ANNUITY'] > .8e8, 'AMT_ANNUITY'] = np.nan df_bureau.loc[df_bureau['AMT_CREDIT_SUM'] > 3e8, 'AMT_CREDIT_SUM'] = np.nan df_bureau.loc[df_bureau['AMT_CREDIT_SUM_DEBT'] > 1e8, 'AMT_CREDIT_SUM_DEBT'] = np.nan df_bureau.loc[df_bureau['AMT_CREDIT_MAX_OVERDUE'] > .8e8, 'AMT_CREDIT_MAX_OVERDUE'] = np.nan df_bureau.loc[df_bureau['DAYS_ENDDATE_FACT'] < -10000, 'DAYS_ENDDATE_FACT'] = np.nan df_bureau.loc[(df_bureau['DAYS_CREDIT_UPDATE'] > 0) | ( df_bureau['DAYS_CREDIT_UPDATE'] < -40000), 'DAYS_CREDIT_UPDATE'] = np.nan df_bureau.loc[df_bureau['DAYS_CREDIT_ENDDATE'] < -10000, 'DAYS_CREDIT_ENDDATE'] = np.nan df_bureau.drop(df_bureau[df_bureau['DAYS_ENDDATE_FACT'] < df_bureau['DAYS_CREDIT']].index, inplace=True) df_bureau.drop('CREDIT_CURRENCY',axis=1,inplace=True) # Some new features in bureau 
set df_bureau['bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_DEBT'] = df_bureau['AMT_CREDIT_SUM'] - df_bureau[ 'AMT_CREDIT_SUM_DEBT'] df_bureau['bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_LIMIT'] = df_bureau['AMT_CREDIT_SUM'] - df_bureau[ 'AMT_CREDIT_SUM_LIMIT'] df_bureau['bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_OVERDUE'] = df_bureau['AMT_CREDIT_SUM'] - df_bureau[ 'AMT_CREDIT_SUM_OVERDUE'] df_bureau['bureau DAYS_CREDIT - CREDIT_DAY_OVERDUE'] = df_bureau['DAYS_CREDIT'] - df_bureau['CREDIT_DAY_OVERDUE'] df_bureau['bureau DAYS_CREDIT - DAYS_CREDIT_ENDDATE'] = df_bureau['DAYS_CREDIT'] - df_bureau['DAYS_CREDIT_ENDDATE'] df_bureau['bureau DAYS_CREDIT - DAYS_ENDDATE_FACT'] = df_bureau['DAYS_CREDIT'] - df_bureau['DAYS_ENDDATE_FACT'] df_bureau['bureau DAYS_CREDIT_ENDDATE - DAYS_ENDDATE_FACT'] = df_bureau['DAYS_CREDIT_ENDDATE'] - df_bureau[ 'DAYS_ENDDATE_FACT'] df_bureau['bureau DAYS_CREDIT_UPDATE - DAYS_CREDIT_ENDDATE'] = df_bureau['DAYS_CREDIT_UPDATE'] - df_bureau[ 'DAYS_CREDIT_ENDDATE'] df_bureau['FLAG_overdue'] = df_bureau['AMT_CREDIT_SUM_OVERDUE'].apply(lambda x: 1 if x > 0 else 0) # replace high correlation column and low variance column # Categorical features with One-Hot encode df_bureau['CREDIT_TYPE'] = df_bureau['CREDIT_TYPE'].apply( lambda x: x if x in ['Consumer credit', 'Credit card'] else 'other') df_bureau['CREDIT_ACTIVE'] = df_bureau['CREDIT_ACTIVE'].apply( lambda x: x if x in ['Closed', 'Active'] else 'other') df_bureau, bureau_cat = one_hot_encoder(df_bureau, nan_as_category) # Bureau balance: merge with bureau.csv df_bureau = df_bureau.join(df_bureau_b_agg, how='left', on='SK_ID_BUREAU') df_bureau.drop('SK_ID_BUREAU', axis=1, inplace=True) del df_bureau_b_agg gc.collect() # Bureau and bureau_balance aggregations for application set categorical = bureau_cat + bureau_b_cat aggregations = {} for col in df_bureau.columns: aggregations[col] = ['mean','sum'] if col in categorical else ['min', 'max', 'size', 'mean', 'var', 'sum'] df_bureau_agg = df_bureau.groupby('SK_ID_CURR').agg(aggregations) df_bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in df_bureau_agg.columns.tolist()]) # Bureau: Active credits active_agg = df_bureau[df_bureau['CREDIT_ACTIVE_Active'] == 1].groupby('SK_ID_CURR').agg(aggregations) active_agg.columns = pd.Index(['BURO_ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()]) df_bureau_agg = df_bureau_agg.join(active_agg, how='left') del active_agg gc.collect() # Bureau: Closed credits closed_agg = df_bureau[df_bureau['CREDIT_ACTIVE_Closed'] == 1].groupby('SK_ID_CURR').agg(aggregations) closed_agg.columns = pd.Index(['BURO_CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()]) df_bureau_agg = df_bureau_agg.join(closed_agg, how='left') # bureau: amt annuity ==0 annuity_0_agg = df_bureau[df_bureau['AMT_ANNUITY'].isnull()].groupby('SK_ID_CURR').agg(aggregations) annuity_0_agg.columns = pd.Index(['BURO_annuity_0_' + e[0] + "_" + e[1].upper() for e in annuity_0_agg.columns.tolist()]) df_bureau_agg = df_bureau_agg.join(annuity_0_agg, how='left') # bureau: amt annuity >0 annuity_non_0_agg = df_bureau[df_bureau['AMT_ANNUITY']>0].groupby('SK_ID_CURR').agg(aggregations) annuity_non_0_agg.columns = pd.Index( ['BURO_annuity_non_0_' + e[0] + "_" + e[1].upper() for e in annuity_non_0_agg.columns.tolist()]) df_bureau_agg = df_bureau_agg.join(annuity_non_0_agg, how='left') del closed_agg, df_bureau gc.collect() return reduce_mem_usage(df_bureau_agg)
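# one_hot_encoder is used above but not defined in this excerpt. A common sketch of such a helper
# (an assumption, not necessarily the author's exact version) one-hot encodes every object column
# and reports which columns were added:
import pandas as pd

def one_hot_encoder(df, nan_as_category=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns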
def credit_card_balance(file_path=file_path, nan_as_category=True): df_card = pd.read_csv(file_path + 'credit_card_balance.csv') # Replace some outliers df_card.loc[df_card['AMT_PAYMENT_CURRENT'] > 4000000, 'AMT_PAYMENT_CURRENT'] = np.nan df_card.loc[df_card['AMT_CREDIT_LIMIT_ACTUAL'] > 1000000, 'AMT_CREDIT_LIMIT_ACTUAL'] = np.nan # Some new features df_card['card missing'] = df_card.isnull().sum(axis=1).values df_card['card SK_DPD - MONTHS_BALANCE'] = df_card['SK_DPD'] - df_card['MONTHS_BALANCE'] df_card['card SK_DPD_DEF - MONTHS_BALANCE'] = df_card['SK_DPD_DEF'] - df_card['MONTHS_BALANCE'] df_card['card SK_DPD - SK_DPD_DEF'] = df_card['SK_DPD'] - df_card['SK_DPD_DEF'] df_card['card AMT_TOTAL_RECEIVABLE - AMT_RECIVABLE'] = df_card['AMT_TOTAL_RECEIVABLE'] - df_card['AMT_RECIVABLE'] df_card['card AMT_TOTAL_RECEIVABLE - AMT_RECEIVABLE_PRINCIPAL'] = df_card['AMT_TOTAL_RECEIVABLE'] - df_card[ 'AMT_RECEIVABLE_PRINCIPAL'] df_card['card AMT_RECIVABLE - AMT_RECEIVABLE_PRINCIPAL'] = df_card['AMT_RECIVABLE'] - df_card[ 'AMT_RECEIVABLE_PRINCIPAL'] df_card['card AMT_BALANCE - AMT_RECIVABLE'] = df_card['AMT_BALANCE'] - df_card['AMT_RECIVABLE'] df_card['card AMT_BALANCE - AMT_RECEIVABLE_PRINCIPAL'] = df_card['AMT_BALANCE'] - df_card[ 'AMT_RECEIVABLE_PRINCIPAL'] df_card['card AMT_BALANCE - AMT_TOTAL_RECEIVABLE'] = df_card['AMT_BALANCE'] - df_card['AMT_TOTAL_RECEIVABLE'] df_card['card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_ATM_CURRENT'] = df_card['AMT_DRAWINGS_CURRENT'] - df_card[ 'AMT_DRAWINGS_ATM_CURRENT'] df_card['card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_OTHER_CURRENT'] = df_card['AMT_DRAWINGS_CURRENT'] - df_card[ 'AMT_DRAWINGS_OTHER_CURRENT'] df_card['card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_POS_CURRENT'] = df_card['AMT_DRAWINGS_CURRENT'] - df_card[ 'AMT_DRAWINGS_POS_CURRENT'] df_card['AMT_PAYMENT_CURRENT - AMT_PAYMENT_TOTAL_CURRENT'] = df_card['AMT_PAYMENT_CURRENT'] - df_card['AMT_PAYMENT_TOTAL_CURRENT'] df_card['SK_DPD * AMT OBERDUE'] = df_card['SK_DPD'] * (df_card['AMT_INST_MIN_REGULARITY'] - df_card['AMT_PAYMENT_CURRENT']) df_card['available credit'] = df_card['AMT_CREDIT_LIMIT_ACTUAL'] - df_card['AMT_BALANCE'] df_card = df_card.sort_values(by= ['SK_ID_PREV','MONTHS_BALANCE']) # Categorical features with One-Hot encode df_card, categorical = one_hot_encoder(df_card, nan_as_category) # Aggregations for application set aggregations = {} for col in df_card.columns: aggregations[col] = ['mean','sum'] if col in categorical else ['min', 'max', 'size', 'mean', 'var', 'sum'] df_card_agg = df_card.groupby('SK_ID_CURR').agg(aggregations) df_card_agg.columns = pd.Index(['CARD_total_' + e[0] + "_" + e[1].upper() for e in df_card_agg.columns.tolist()]) df_card_agg['CARD_total avg DRAWING'] = df_card_agg['CARD_total_AMT_DRAWINGS_CURRENT_SUM']/ df_card_agg['CARD_total_CNT_DRAWINGS_CURRENT_SUM'] df_card_agg['CARD_total avg OTHER DRAWING'] = df_card_agg['CARD_total_AMT_DRAWINGS_OTHER_CURRENT_SUM'] / df_card_agg['CARD_total_CNT_DRAWINGS_OTHER_CURRENT_SUM'] df_card_agg['CARD_total avg ATM DRAWING'] = df_card_agg['CARD_total_AMT_DRAWINGS_ATM_CURRENT_SUM'] / df_card_agg['CARD_total_CNT_DRAWINGS_ATM_CURRENT_SUM'] df_card_agg['CARD_total avg POS DRAWING'] = df_card_agg['CARD_total_AMT_DRAWINGS_POS_CURRENT_SUM'] / df_card_agg['CARD_total_CNT_DRAWINGS_POS_CURRENT_SUM'] # aggregations when credit card is used amt drawing >0 aggregations = {} for col in df_card.columns: aggregations[col] = ['mean','sum'] if col in categorical else ['min', 'max', 'size', 'mean', 'var', 'sum'] df_card_used_agg = 
df_card[df_card['AMT_DRAWINGS_ATM_CURRENT'] > 0].groupby('SK_ID_CURR').agg(aggregations) df_card_used_agg.columns = pd.Index(['CARD_used_' + e[0] + "_" + e[1].upper() for e in df_card_used_agg.columns.tolist()]) df_card_used_agg['CARD_used avg DRAWING'] = df_card_used_agg['CARD_used_AMT_DRAWINGS_CURRENT_SUM']/ df_card_used_agg['CARD_used_CNT_DRAWINGS_CURRENT_SUM'] df_card_used_agg['CARD_used avg OTHER DRAWING'] = df_card_used_agg['CARD_used_AMT_DRAWINGS_OTHER_CURRENT_SUM'] / df_card_used_agg['CARD_used_CNT_DRAWINGS_OTHER_CURRENT_SUM'] df_card_used_agg['CARD_used avg ATM DRAWING'] = df_card_used_agg['CARD_used_AMT_DRAWINGS_ATM_CURRENT_SUM'] / df_card_used_agg['CARD_used_CNT_DRAWINGS_ATM_CURRENT_SUM'] df_card_used_agg['CARD_used avg POS DRAWING'] = df_card_used_agg['CARD_used_AMT_DRAWINGS_POS_CURRENT_SUM'] / df_card_used_agg['CARD_used_CNT_DRAWINGS_POS_CURRENT_SUM'] df_card_agg=df_card_agg.join(df_card_used_agg) # Count credit card lines df_card_agg['CARD_COUNT'] = df_card.groupby('SK_ID_CURR').size() df_card_agg['CARD_USED_COUNT']= df_card[df_card['AMT_DRAWINGS_ATM_CURRENT'].notnull()].groupby('SK_ID_CURR').size() # total balance latest_balance = df_card[['SK_ID_CURR','SK_ID_PREV','MONTHS_BALANCE','AMT_BALANCE']].groupby('SK_ID_PREV').last() latest_balance.columns = ['SK_ID_CURR', 'MONTHS_BALANCE', 'CARD_total balance'] total_latest_balance = latest_balance.groupby('SK_ID_CURR').sum()['CARD_total balance'] df_card_agg = df_card_agg.join(total_latest_balance) del df_card, latest_balance, total_latest_balance gc.collect() return reduce_mem_usage(df_card_agg)
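# reduce_mem_usage is called throughout but not shown. A typical sketch (an assumption about the
# helper, not the author's exact code) downcasts numeric columns to smaller dtypes:
import pandas as pd

def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')
    if verbose:
        print(f'Memory usage reduced from {start_mem:.2f} MB to '
              f'{df.memory_usage().sum() / 1024 ** 2:.2f} MB')
    return df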
def _read(cls, sql, con, index_col=None, **kwargs): """ Read a SQL query or database table into a query compiler. Parameters ---------- sql : str or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. con : SQLAlchemy connectable, str, sqlite3 connection, or ModinDatabaseConnection Connection object to database. index_col : str or list of str, optional Column(s) to set as index(MultiIndex). **kwargs : dict Parameters to pass into `pandas.read_sql` function. Returns ------- BaseQueryCompiler Query compiler with imported data for further processing. """ if isinstance(con, str): con = ModinDatabaseConnection("sqlalchemy", con) if not isinstance(con, ModinDatabaseConnection): warnings.warn( "To use parallel implementation of `read_sql`, pass either " + "the SQL connection string or a ModinDatabaseConnection " + "with the arguments required to make a connection, instead " + f"of {type(con)}. For documentation of ModinDatabaseConnection, see " + "https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html#connecting-to-a-database-for-read-sql" ) return cls.single_worker_read( sql, con=con, index_col=index_col, read_sql_engine=ReadSqlEngine.get(), **kwargs, ) row_count_query = con.row_count_query(sql) connection_for_pandas = con.get_connection() colum_names_query = con.column_names_query(sql) row_cnt = pandas.read_sql(row_count_query, connection_for_pandas).squeeze() cols_names_df = pandas.read_sql(colum_names_query, connection_for_pandas, index_col=index_col) cols_names = cols_names_df.columns num_partitions = NPartitions.get() partition_ids = [None] * num_partitions index_ids = [None] * num_partitions dtypes_ids = [None] * num_partitions limit = math.ceil(row_cnt / num_partitions) for part in range(num_partitions): offset = part * limit query = con.partition_query(sql, limit, offset) *partition_ids[part], index_ids[part], dtypes_ids[ part] = cls.deploy( cls.parse, num_returns=num_partitions + 2, num_splits=num_partitions, sql=query, con=con, index_col=index_col, read_sql_engine=ReadSqlEngine.get(), **kwargs, ) partition_ids[part] = [ cls.frame_partition_cls(obj) for obj in partition_ids[part] ] if index_col is None: # sum all lens returned from partitions index_lens = cls.materialize(index_ids) new_index = pandas.RangeIndex(sum(index_lens)) else: # concat index returned from partitions index_lst = [ x for part_index in cls.materialize(index_ids) for x in part_index ] new_index = pandas.Index(index_lst).set_names(index_col) new_frame = cls.frame_cls(np.array(partition_ids), new_index, cols_names) new_frame.synchronize_labels(axis=0) return cls.query_compiler_cls(new_frame)
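# The method above splits one SQL read into NPartitions LIMIT/OFFSET queries. The same idea in
# plain pandas, against a hypothetical sqlite table (file and table names are illustrative only):
import math
import sqlite3
import pandas as pd

con = sqlite3.connect("example.db")   # hypothetical database file
sql = "SELECT * FROM some_table"      # hypothetical query

row_cnt = pd.read_sql(f"SELECT COUNT(*) FROM ({sql}) AS t", con).squeeze()
num_partitions = 4
limit = math.ceil(row_cnt / num_partitions)

chunks = [pd.read_sql(f"{sql} LIMIT {limit} OFFSET {part * limit}", con)
          for part in range(num_partitions)]
frame = pd.concat(chunks, ignore_index=True)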
pa = pd.read_pickle('preprocess/edge/p_a_before284_delete_author.pkl') pa_extra = pd.read_pickle('preprocess/edge/p_a_delete_author.pkl') pp = pd.read_pickle('preprocess/edge/paper_paper.pkl') pp['new_cited_papr_id'] = pp['new_cited_papr_id'].astype(int).astype(str) # 從pp整合出 ref list paper_refs = pp.groupby(['new_papr_id'])['new_cited_papr_id'].agg([','.join]).reset_index() # 挑出ref>20的index, 只用他們算MAP paper_refs = paper_refs[paper_refs.new_papr_id.isin(pp[pp.groupby(['new_papr_id'])['year'].transform('count') > 20].new_papr_id.value_counts().index.tolist())] # https://stackoverflow.com/questions/20067636/pandas-dataframe-get-first-row-of-each-group # dblp_top50_conf['new_first_aId'] = pa.groupby('new_papr_id').first()['new_author_id'] # 取每篇的第一作者 dblp_top50_conf['authors'] = pa.groupby('new_papr_id')['new_author_id'].apply(list) # groupby element to list dblp_top50_conf['references'] = dblp_top50[dblp_top50['new_papr_id'].isin(dblp_top50_conf['new_papr_id'].values)]['references'] dblp_top50_conf.dropna(subset=['authors'], inplace=True) # drop empty author papers # 根據author數量變成多筆training data dblp_top50_conf = pd.DataFrame([np.append(row.values, d) for _, row in dblp_top50_conf.iterrows() for d in row['authors']], columns=dblp_top50_conf.columns.append(pd.Index(['new_first_aId']))) # select 2018以前全部當train train2017 = dblp_top50_conf.loc[dblp_top50_conf.time_step < 284, ['new_papr_id', 'new_venue_id', 'new_first_aId', 'references']] # dblp_top50_test['new_first_aId'] = pa_extra.groupby('new_papr_id').first()['new_author_id'] # 移除沒有作者的paper dblp_top50_test['authors'] = pa_extra.groupby('new_papr_id')['new_author_id'].apply(list) # groupby element to list dblp_top50_test['references'] = dblp_top50[dblp_top50['new_papr_id'].isin(dblp_top50_test['new_papr_id'].values)]['references'] dblp_top50_test.dropna(subset=['authors'], inplace=True) # drop empty author papers dblp_top50_test = pd.DataFrame([np.append(row.values, d) for _, row in dblp_top50_test.iterrows() for d in row['authors']], columns=dblp_top50_test.columns.append(pd.Index(['new_first_aId']))) # 塞入bert titles = pd.read_pickle('preprocess/edge/titles_bert.pkl') abstracts = pd.read_pickle('preprocess/edge/abstracts_bert.pkl') # normalize column/ feature titles = preprocessing.scale(np.array(titles.tolist())) abstracts = preprocessing.scale(np.array(abstracts.tolist()))
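# Two groupby idioms carry the block above: joining reference ids into one comma-separated string
# per paper, and collecting author ids into a list per paper. A minimal standalone illustration:
import pandas as pd

pp = pd.DataFrame({'new_papr_id': [1, 1, 2], 'new_cited_papr_id': ['10', '11', '12']})
pa = pd.DataFrame({'new_papr_id': [1, 1, 2], 'new_author_id': [5, 6, 7]})

paper_refs = pp.groupby('new_papr_id')['new_cited_papr_id'].agg([','.join]).reset_index()
authors = pa.groupby('new_papr_id')['new_author_id'].apply(list)
print(paper_refs)   # column 'join' holds '10,11' and '12'
print(authors)      # lists [5, 6] and [7]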
def dispatch(self, event): # { # RE-INSTANTIATE GLOBALS global isEOD, idx_list, idx event_str = str(event.event_type) event_path = Path(event.src_path) print("\t\te=" + str(event)) print("\t\ttype=" + str(event.event_type)) print("\t\tsrc_path=" + str(event.src_path)) # CHECK AND PERFORM ON EVENT_STR ################################################################## if event_str == "created": # { # TRY THE FOLlOWING: try: # { print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") ts = pd.Timestamp.now() # CREATE TIME STAMP print("| CREATED >>> " + str(ts)) # CREATE EVENT PATH VAR the_event_path = Path(event.src_path) # WAS: the_event_path # APPEND TO INDEX LIST idx_list.append(str(event.src_path)) # CREATE TEMPORARY INDEX IN ORDER TO APPEND temp_idx = pd.Index(data=idx_list, dtype=np.str) # APPEND TO "CREATION" INDEX self.index.append(temp_idx) # print INDEX print("INDEX --> \n" + str(self.index)) print("\t\tEVENT_PATH=" + str(the_event_path)) # CREATE 'file_name' VAR file_name = os.path.basename(the_event_path) print("\t\tFILE_NAME=" + str(file_name)) # CHECK AND SEE IF FILE IS OF TYPE .PDF if fnmatch.fnmatch(file_name, "*.pdf"): # { # CREATE NEW FILE NAME CONV file_name_conv = generate_naming_convention(file_name) # CREATE PATH VARIABLES FOR FILE MOVING PROCEDURES new_path = os.path.join(self.out_directory, file_name_conv) ############################################### # CREATE/COPY WATERMARK TO DESTINATION FOLDER # ############################################### create_watermark(input_pdf=the_event_path, output=new_path, watermark=in_file) # CREATE EVENT ITEM FOR LIST event_list = [str(file_name_conv), str(ts)] # WAS: (created_str) # APPEND TO DATAFRAME self.save_dataframe = append_to_dataframe( the_event_list=event_list, dataframe_to_append=self.save_dataframe) # print DATAFRAME print(self.save_dataframe.tail(8)) # } else: # { print("NON-PDF CREATED AT " + str(ts)) # } # } except: # { errorMessage = str(sys.exc_info()[0]) + "\n\t\t" errorMessage = errorMessage + str(sys.exc_info()[1]) + "\n\t\t" errorMessage = errorMessage + str(sys.exc_info()[2]) + "\n" exc_type, exc_obj, exc_tb = sys.exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] typeE = str("TYPE : " + str(exc_type)) fileE = str("FILE : " + str(fname)) lineE = str("LINE : " + str(exc_tb.tb_lineno)) messageE = str("MESG : " + "\n" + str(errorMessage) + "\n") logging.error("\n" + typeE + "\n" + fileE + "\n" + lineE + "\n" + messageE) # } else: # { print("SUCCESS! 
VERY NICE!") # } finally: # { # CREATE END-TIME VAR time_end = pd.Timestamp.now() # DETERMINE OVERALL RUN-TIME run_time = pd.Timedelta(time_end - time_start) # print TOTAL RUNTIME print("\t\t[Created-Event] >>> time_alloted: " + str(run_time)) # } # } elif event_str == "modified": #{ # TRY THE FOLLOWING try: # { print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") ts = pd.Timestamp.now() # CREATE TIME STAMP print("| MODIFIED >>> " + str(ts)) # print INDEX print("INDEX --> \n" + str(self.index)) # CREATE EVENT PATH VAR the_event_path = Path(event.src_path) # WAS: the_event_path print("\t\tEVENT_PATH=" + str(the_event_path)) # CREATE 'file_name' VAR file_name = os.path.basename(the_event_path) print("\t\tFILE_NAME=" + str(file_name)) # CHECK AND SEE IF FILE IS OF TYPE .PDF if fnmatch.fnmatch(file_name, "*.pdf"): # { # CREATE NEW FILE NAME CONV file_name_conv = generate_naming_convention(file_name) # CREATE EVENT ITEM FOR LIST event_list = [str(file_name_conv), str(ts)] # WAS: (created_str) print("\n EVENT_LIST : \n" + str(event_list)) # IF THE EVENT_PAT HIS ALREADY IN INDEX if str(the_event_path) in self.index: # { print( "ALREADY IN INDEX... THEN WE CAN **INDEED** APPEND\n\n\n" ) # APPEND TO DATAFRAME self.save_dataframe = append_to_dataframe( the_event_list=event_list, dataframe_to_append=self.save_dataframe) # print DATAFRAME print(self.save_dataframe) # } else: # { print( "NOT IN INDEX... NOT CREATED TODAY... SO WE SKIP APPENDING..." ) # BUT WE STILL WATERMARK? """ # CREATE PATH VARIABLES FOR FILE MOVING PROCEDURES new_path = os.path.join(self.out_directory, file_name_conv) # 08/28/2019 - REMOVED BECAUSE WE DONT NEED TO WATERMARK SO MANY TIMES # JUST KEEPING THE MODIFIED TIMESTAMP AND APPENDING TO DATAFRAME # CREATE/COPY WATERMARK TO DESTINATION FOLDER create_watermark(input_pdf=the_event_path, output=new_path, watermark=in_file) """ # } # } else: # { print("NON-PDF MODIFIED AT " + str(ts)) # } # } except: # { errorMessage = str(sys.exc_info()[0]) + "\n\t\t" errorMessage = errorMessage + str(sys.exc_info()[1]) + "\n\t\t" errorMessage = errorMessage + str(sys.exc_info()[2]) + "\n" exc_type, exc_obj, exc_tb = sys.exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] typeE = str("TYPE : " + str(exc_type)) fileE = str("FILE : " + str(fname)) lineE = str("LINE : " + str(exc_tb.tb_lineno)) messageE = str("MESG : " + "\n" + str(errorMessage) + "\n") logging.error("\n" + typeE + "\n" + fileE + "\n" + lineE + "\n" + messageE) # } else: # { print("SUCCESS! VERY NICE!") # } finally: # { # CREATE END-TIME VAR time_end = pd.Timestamp.now() # DETERMINE OVERALL RUN-TIME run_time = pd.Timedelta(time_end - time_start) # print TOTAL RUNTIME print("\t\t[MODIFIED-Event] >>> time_alloted: " + str(run_time)) # } # } ########################################################## # TRY THE FOLLOWING: """
def assert_array_index_eq(left, right): """left and right are equal, treating index and array as equivalent""" assert_eq(left, pd.Index(right) if isinstance(right, np.ndarray) else right)
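# Example use of the helper above; assert_eq is assumed to come from dask.dataframe.utils, as in
# the module this snippet belongs to.
import numpy as np
import pandas as pd

assert_array_index_eq(pd.Index([1, 2, 3]), np.array([1, 2, 3]))  # array promoted to Index
assert_array_index_eq(pd.Index(['a', 'b']), pd.Index(['a', 'b']))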
in_directory = "C:/Temp/F/APPS/CofA/" # "F:/APPS/CofA/" out_directory = "C:/Temp/G/C of A's/#Email Node/" #"G:/C of A's/#Email Node/" outbound_directory = "C:/data/outbound/CofA/" in_file = "C:/data/inbound/Agilent_CofA_Letterhead_03-21-19.pdf" # CREATE FILE_NAME STR USING TODAYS DATE CONVENTION out_file_dir = "C:/data/outbound/CofA/" # WAS: "C:/data/inbound/" # FILENAME FOR DATAFRAME THAT WAS CREATED VIA "CofA_Event_Hanlder" CLASS out_file_str_1 = str("CofA_Email_Node_list_" + time_today + "_F_watch.csv") # FILENAME FOR DATAFRAME THAT WAS CREATED VIA "set_diff_df" out_file_str_2 = str("CofA_Email_Node_list_" + time_today + "_F_pull.csv") df_save_list = pd.DataFrame(data=None, columns=['CofA']) isEOD = False # CREATE IDX_LIST TO BE INSERTED INSIDE INDEX idx_list = os.listdir(out_directory) # INDEX VARIABLE TO HOLD CREATION LIST idx = pd.Index(data=idx_list, dtype=np.str) # CREATE OBSERVER VARIABLE FOR WATCHDOG EVENT HANDLER observer = Observer() # CREATE TIMER VARIABLE // starts at 5 am and runs until 8:59 pm so == 57,600 seconds t = Timer(34200, end_of_day) # START TIMER t.start() # CREATE INSTANCE OF CUSTOM EVENT HANDLER event_handler = CofA_Event_Handler(idx, df_save_list, out_directory, observer) observer.schedule(event_handler=event_handler, path=in_directory, recursive=True) observer.start() # TRY THE FOLLOWING try: # {
def get_groupby_pool_in_out(self, period: int = None, shift: int = 0, figsize=(30, 8), heatmap_rotation1=False, heatmap_rotation2=False, annot_fontsize1=30, annot_fontsize2=30) -> None: """ 获取商品池进出每个组的情况,返回 Returns ------- """ self.signal.set_factor_data(self.factor.factor_value) self.signal.set_commodity_pool( self.commodity_pool.commodity_pool_value) # 修正group_num group_num: int = self.get_params()['group_num'] if not period: period: int = self.get_params()['period'] self.signal.set_params(group_num=group_num) signal_df: DataFrame = self.signal.transform() # avg_group_in_pct_dict = {} # avg_group_out_pct_dict = {} # for shift in range(period): # index = pd.Index(range(len(signal_df))) # index = index[(index - shift) % period == 0] # new_signal_df = signal_df.copy() # new_signal_df = new_signal_df.iloc[index] # # if self.signal.__class__.group_signal_type == GroupSignalType.AllGroupSignal: # other_group_list = [-2, -1, 0] # else: # other_group_list = [-3, -2, -1, 0] # # group_in_num_dict = defaultdict(dict) # group_out_num_dict = defaultdict(dict) # # for i in range(1, group_num+1): # for j in other_group_list: # in_df: DataFrame = pd.DataFrame(data=False, # index=new_signal_df.index, # columns=new_signal_df.columns) # out_df: DataFrame = pd.DataFrame(data=False, # index=new_signal_df.index, # columns=new_signal_df.columns) # # 考虑进入 # in_df[(new_signal_df.shift(1)==j)&(new_signal_df==i)] = True # if j != 0: # pass # else: # in_df.loc[new_signal_df.index[0], new_signal_df.iloc[0]==i] = True # # # 考虑退出 # out_df[(new_signal_df.shift(1)==i)&(new_signal_df==j)] = True # # group_in_num_dict[j][i] = in_df.sum().sum() # group_out_num_dict[i][j] = out_df.sum().sum() # # group_in_num_df: DataFrame = pd.DataFrame(group_in_num_dict) # group_out_num_df: DataFrame = pd.DataFrame(group_out_num_dict) # group_in_pct_df: DataFrame = group_in_num_df / group_in_num_df.sum(axis=0) # group_out_pct_df: DataFrame = group_out_num_df / group_out_num_df.sum(axis=0) # # avg_group_in_pct_dict[shift] = group_in_pct_df # avg_group_out_pct_dict[shift] = group_out_pct_df # # avg_group_in_pct_df = pd.DataFrame() # avg_group_out_pct_df = pd.DataFrame() # for i in avg_group_in_pct_dict: # if i == 0: # avg_group_in_pct_df = avg_group_in_pct_dict[i] # avg_group_out_pct_df = avg_group_out_pct_dict[i] # else: # avg_group_in_pct_df += avg_group_in_pct_dict[i] # avg_group_out_pct_df += avg_group_out_pct_dict[i] # avg_group_in_pct_df = avg_group_in_pct_df / len(avg_group_in_pct_dict) # avg_group_out_pct_df = avg_group_out_pct_df / len(avg_group_out_pct_dict) index = pd.Series(range(len(signal_df))) index = index[((index - float(shift)) % float(period)) == 0] index = pd.Index(index.values.tolist()) new_signal_df = signal_df.copy() new_signal_df = new_signal_df.iloc[index] if self.signal.__class__.group_signal_type == GroupSignalType.AllGroupSignal: other_group_list = [-2, -1, 0] else: other_group_list = [-3, -2, -1, 0] group_in_num_dict = defaultdict(dict) group_out_num_dict = defaultdict(dict) for i in range(1, group_num + 1): for j in other_group_list: in_df: DataFrame = pd.DataFrame(data=False, index=new_signal_df.index, columns=new_signal_df.columns) out_df: DataFrame = pd.DataFrame(data=False, index=new_signal_df.index, columns=new_signal_df.columns) # 考虑进入 in_df[(new_signal_df.shift(1) == j) & (new_signal_df == i)] = True if j != 0: pass else: in_df.loc[new_signal_df.index[0], new_signal_df.iloc[0] == i] = True # 考虑退出 out_df[(new_signal_df.shift(1) == i) & (new_signal_df == j)] = True group_in_num_dict[j][i] = 
in_df.sum().sum() group_out_num_dict[i][j] = out_df.sum().sum() group_in_num_df: DataFrame = pd.DataFrame(group_in_num_dict) group_out_num_df: DataFrame = pd.DataFrame(group_out_num_dict) cond_avg_group_in_pct_df: DataFrame = group_in_num_df / group_in_num_df.sum( axis=0) cond_avg_group_out_pct_df: DataFrame = group_out_num_df / group_out_num_df.sum( axis=0) # 统计从-2, -1, 0到其他状态的情况 start_in_pct_series: Series = group_in_num_df.sum( axis=0) / group_in_num_df.sum(axis=0).sum() # 统计从持仓状态到0,-1,-2的情况 start_out_pct_series: Series = group_out_num_df.sum( axis=0) / group_out_num_df.sum(axis=0).sum() uncond_avg_group_in_pct_df = cond_avg_group_in_pct_df * start_in_pct_series uncond_avg_group_out_pct_df = cond_avg_group_out_pct_df * start_out_pct_series # 总天数 total_days_num = len(new_signal_df) # 总进入次数 total_in_num = int(group_in_num_df.sum().sum()) # 总退出次数 total_out_num = int(group_out_num_df.sum().sum()) # 总进入比例 total_in_pct = (total_in_num / group_num) / total_days_num # 总退出比例 total_out_pct = (total_out_num / group_num) / total_days_num # 第一张图 fig, axes = plt.subplots(figsize=figsize, nrows=1, ncols=3) # 第一张子图 if heatmap_rotation1: cond_avg_group_in_pct_df = cond_avg_group_in_pct_df.T sns.heatmap(data=np.round(cond_avg_group_in_pct_df, 2), vmin=0, vmax=1, annot=True, annot_kws={'fontsize': annot_fontsize1}, ax=axes[0]) axes[0].set_title("cond commodity in pct", fontsize=25) axes[0].tick_params(axis='both', labelsize=30) # 第二张子图 # start_in_pct_series.plot.bar(ax=axes[1]) # if len(start_in_pct_series) == 3: # for x, y in start_in_pct_series.to_dict().items(): # axes[1].text(x+1.7, y, np.round(y, 2), fontdict={'fontsize': 30}) # elif len(start_in_pct_series) == 4: # for x, y in start_in_pct_series.to_dict().items(): # axes[1].text(x+2.7, y, np.round(y, 2), fontdict={'fontsize': 30}) # 第二张子图 labels = start_in_pct_series.sort_index().index.tolist() start_in_pct_series.index = start_in_pct_series.index - start_in_pct_series.index.min( ) + 1 start_in_pct_series.plot.bar(ax=axes[1]) for i, x in enumerate(start_in_pct_series.tolist()): axes[1].text(i - 0.1, x, np.round(x, 2), fontdict={'fontsize': 30}) axes[1].set_xticklabels(labels=labels, fontsize=25) # 第三张子图 if heatmap_rotation1: uncond_avg_group_in_pct_df = uncond_avg_group_in_pct_df.T sns.heatmap(data=np.round(uncond_avg_group_in_pct_df, 2), vmin=0, vmax=1, annot=True, annot_kws={'fontsize': annot_fontsize1}, ax=axes[2]) axes[2].set_title("uncond commodity in pct", fontsize=25) axes[2].tick_params(axis='both', labelsize=30) # 汇总 fig.suptitle( f"group commodity in info 总次数={total_in_num} 组数={group_num} 总天数={total_days_num} 比例={round(total_in_num/group_num/total_days_num,3)}", fontsize=36) fig.subplots_adjust(wspace=0.3) fig.subplots_adjust(hspace=0.2) plt.xticks(fontsize=36) plt.show() # 第二张图 fig, axes = plt.subplots(figsize=(30, 8), nrows=1, ncols=3) # 第一张子图 if heatmap_rotation2: cond_avg_group_out_pct_df = cond_avg_group_out_pct_df.T sns.heatmap(data=np.round(cond_avg_group_out_pct_df, 2), vmin=0, vmax=1, annot=True, annot_kws={'fontsize': annot_fontsize2}, ax=axes[0]) axes[0].set_title("cond commodity out pct", fontsize=25) axes[0].tick_params(axis='both', labelsize=30) # 第二张子图 # start_out_pct_series.plot.bar(ax=axes[1]) # for x, y in start_out_pct_series.to_dict().items(): # axes[1].text(x+1.7, y, np.round(y, 2), fontdict={'fontsize': 30}) # 第二张子图 labels = start_out_pct_series.sort_index().index.tolist() start_out_pct_series.index = start_out_pct_series.index - start_out_pct_series.index.min( ) + 1 start_out_pct_series.plot.bar(ax=axes[1]) for i, x 
in enumerate(start_out_pct_series.tolist()): axes[1].text(i - 0.1, x, np.round(x, 2), fontdict={'fontsize': 20}) axes[1].set_xticklabels(labels=labels, fontsize=25) # 第三张子图 if heatmap_rotation2: uncond_avg_group_out_pct_df = uncond_avg_group_out_pct_df.T sns.heatmap(data=np.round(uncond_avg_group_out_pct_df, 2), vmin=0, vmax=1, annot=True, annot_kws={'fontsize': annot_fontsize2}, ax=axes[2]) axes[2].set_title("uncond commodity out pct", fontsize=25) axes[2].tick_params(axis='both', labelsize=30) # 汇总 fig.suptitle( f"group commodity out info 总次数={total_out_num} 组数={group_num} 总天数={total_days_num} 比例={round(total_out_num/group_num/total_days_num,3)}", fontsize=36) fig.subplots_adjust(wspace=0.3) fig.subplots_adjust(hspace=0.2) plt.xticks(fontsize=36) plt.show()
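# Both figures above start from the same sampling step: keep every `period`-th row of the signal
# frame beginning at row `shift`. A compact standalone version of that selection:
import pandas as pd

signal_df = pd.DataFrame({'a': range(10)})
period, shift = 3, 1

positions = pd.Index(range(len(signal_df)))
positions = positions[(positions - shift) % period == 0]
sampled = signal_df.iloc[positions]
print(sampled.index.tolist())   # [1, 4, 7]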
def get_group_distribution_per_symbol(self, period: int = None, shift: int = 0, start: str = None, end: str = None): """ 获取各品种在各组的分布(包括已上市但被商品池排除的组) Attributes __________ period: int, default None 采样间隔多少个交易日 shift: int, default 0 从第几个交易日开始采样。如果shift=0, period=20,则取第1, 21, 41, ...个交易日 start: str, default None 起始日期 end: str, default None 结束日期 Returns ------- None """ # 获取signal_df self.signal.set_factor_data(self.factor.factor_value) self.signal.set_commodity_pool( self.commodity_pool.commodity_pool_value) # 修正group_num group_num: int = self.get_params()['group_num'] if not period: period: int = self.get_params()['period'] self.signal.set_params(group_num=group_num) signal_df: DataFrame = self.signal.transform() if start: signal_df = signal_df[start:] if end: signal_df = signal_df[:end] index = pd.Index(range(len(signal_df))) index = index[(index - shift) % period == 0] new_signal_df = signal_df.copy() new_signal_df = new_signal_df.iloc[index] min_num = int(signal_df.min().min()) max_num = int(signal_df.max().max()) num_list = list(range(min_num, max_num + 1)) industry_symbol_map = self.get_industry(group='actual_industry', name='actual_five_industry') # industry_list = list(industry_symbol_map.keys()) # for i in range(len(industry_symbol_map)): # fig, ax = plt.subplots(figsize=(20, 8)) # industry = industry_list[i] # symbol_list = industry_symbol_map[industry] # industry_signal_df = new_signal_df[symbol_list] # industry_signal_df.plot(ax=ax, legend=False) # ax.set_title(industry) # fig.suptitle("Group Distribution per Symbol") # fig.legend() # plt.grid() # plt.show() minus_one_pct_per_symbol = {} minus_two_pct_per_symbol = {} for industry in industry_symbol_map: symbol_list = industry_symbol_map[industry] fig, axes = plt.subplots(figsize=(30, 50), nrows=len(symbol_list), ncols=1) for symbol in symbol_list: i = symbol_list.index(symbol) symbol_signal_series = new_signal_df[symbol] symbol_signal_series = symbol_signal_series[ symbol_signal_series != 0.0] # sns.distplot(symbol_signal_series[symbol_signal_series != 0.0], ax=axes[i]) symbol_signal_series_value_counts = symbol_signal_series.dropna( ).value_counts() symbol_signal_series_value_counts.index = symbol_signal_series_value_counts.index.astype( int) for num in num_list: if num not in symbol_signal_series_value_counts.index: symbol_signal_series_value_counts.loc[num] = 0 symbol_signal_series_pct = ( symbol_signal_series_value_counts / symbol_signal_series_value_counts.sum()).sort_index( ascending=True) symbol_signal_series_pct.plot.bar(ax=axes[i]) minus_one_pct_per_symbol[ symbol] = symbol_signal_series_pct.loc[-1] minus_two_pct_per_symbol[ symbol] = symbol_signal_series_pct.loc[-2] axes[i].set_title(label=symbol, fontsize=30) # axes[i].set_xticks(num_list) xticks_delta = num_list[0] - axes[i].get_xticks()[0] axes[i].set_xticklabels(labels=axes[i].get_xticks() + xticks_delta, fontsize=30) axes[i].set_yticklabels(labels=np.round( axes[i].get_yticks(), 2), fontsize=30) # for x, y in symbol_signal_series_value_counts.to_dict().items(): # axes[i].text(x, y, np.round(y, 2)) for tick in axes[i].get_xticklabels(): tick.set_rotation(360) # plt.xticks(ticks=list(range(min_num, max_num+1)), labels=list(range(min_num, max_num+1))) fig.subplots_adjust(hspace=0.7) fig.suptitle(industry, fontsize=30) plt.show() minus_one_pct_per_symbol = np.round( pd.Series(minus_one_pct_per_symbol), 2).sort_values(ascending=False) minus_two_pct_per_symbol = np.round( pd.Series(minus_two_pct_per_symbol), 2).sort_values(ascending=False) fig, axes = 
plt.subplots(figsize=(20, 8), nrows=2, ncols=1) minus_one_pct_per_symbol.plot.bar(ax=axes[0], figsize=(20, 8)) axes[0].set_title('各品种已上市但未被纳入商品池的比例') axes[0].grid() for tick in axes[0].get_xticklabels(): tick.set_rotation(360) minus_two_pct_per_symbol.plot.bar(ax=axes[1], figsize=(20, 8)) axes[1].set_title('各品种已上市且被纳入商品池但无因子值的比例') axes[1].grid() for tick in axes[1].get_xticklabels(): tick.set_rotation(360) plt.show()
def QA_fetch_financial_report(code, report_date, ltype='CN', db=DATABASE):
    """Fetch the professional (full) financial report.

    Arguments:
        code {[type]} -- [description]
        report_date {[type]} -- [description]

    Keyword Arguments:
        ltype {str} -- [description] (default: {'CN'})
        db {[type]} -- [description] (default: {DATABASE})

    Raises:
        e -- [description]

    Returns:
        pd.DataFrame -- [description]
    """
    if isinstance(code, str):
        code = [code]
    if isinstance(report_date, str):
        report_date = [QA_util_date_str2int(report_date)]
    elif isinstance(report_date, int):
        report_date = [report_date]
    elif isinstance(report_date, list):
        report_date = [QA_util_date_str2int(item) for item in report_date]

    collection = db.financial
    CH_columns = [item for item in sorted(list(financial_dict.keys()))]
    CH_columns.extend(['277', '278', '279', '280', '281', '282', '_id', 'code', 'report_date'])
    CH_columns = pd.Index(CH_columns)
    # EN_columns = list(financial_dict.values())
    EN_columns = [financial_dict[key] for key in sorted(list(financial_dict.keys()))]
    EN_columns.extend(['277', '278', '279', '280', '281', '282', '_id', 'code', 'report_date'])
    EN_columns = pd.Index(EN_columns)

    try:
        if code is not None and report_date is not None:
            data = [item for item in collection.find(
                {'code': {'$in': code}, 'report_date': {'$in': report_date}},
                batch_size=10000)]
        elif code is None and report_date is not None:
            data = [item for item in collection.find(
                {'report_date': {'$in': report_date}}, batch_size=10000)]
        elif code is not None and report_date is None:
            data = [item for item in collection.find(
                {'code': {'$in': code}}, batch_size=10000)]
        else:
            data = [item for item in collection.find()]
        if len(data) > 0:
            res_pd = pd.DataFrame(data)
            if ltype in ['CH', 'CN']:
                res_pd.columns = CH_columns
            elif ltype == 'EN':
                res_pd.columns = EN_columns
            if res_pd.report_date.dtype == numpy.int64:
                res_pd.report_date = pd.to_datetime(
                    res_pd.report_date.apply(QA_util_date_int2str))
            else:
                res_pd.report_date = pd.to_datetime(res_pd.report_date)
            # return res_pd.replace(-4.039810335e+34, numpy.nan).set_index(['report_date', 'code'], drop=False)
            return res_pd.replace(-4.039810335e+34, numpy.nan).set_index(['report_date'], drop=False)
        else:
            return None
    except Exception as e:
        raise e
def test_meta_nonempty_index(): idx = pd.RangeIndex(1, name="foo") res = meta_nonempty(idx) assert type(res) is pd.RangeIndex assert res.name == idx.name idx = pd.Int64Index([1], name="foo") res = meta_nonempty(idx) assert type(res) is pd.Int64Index assert res.name == idx.name idx = pd.Index(["a"], name="foo") res = meta_nonempty(idx) assert type(res) is pd.Index assert res.name == idx.name idx = pd.DatetimeIndex(["1970-01-01"], freq="d", tz="America/New_York", name="foo") res = meta_nonempty(idx) assert type(res) is pd.DatetimeIndex assert res.tz == idx.tz assert res.freq == idx.freq assert res.name == idx.name idx = pd.PeriodIndex(["1970-01-01"], freq="d", name="foo") res = meta_nonempty(idx) assert type(res) is pd.PeriodIndex assert res.freq == idx.freq assert res.name == idx.name idx = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="d", name="foo") res = meta_nonempty(idx) assert type(res) is pd.TimedeltaIndex assert res.freq == idx.freq assert res.name == idx.name idx = pd.CategoricalIndex(["xyx"], ["xyx", "zzz"], ordered=True, name="foo") res = meta_nonempty(idx) assert type(res) is pd.CategoricalIndex assert (res.categories == idx.categories).all() assert res.ordered == idx.ordered assert res.name == idx.name idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES], ordered=True, name="foo") res = meta_nonempty(idx) assert type(res) is pd.CategoricalIndex assert res.ordered == idx.ordered assert res.name == idx.name levels = [pd.Int64Index([1], name="a"), pd.Float64Index([1.0], name="b")] codes = [[0], [0]] if PANDAS_GT_0240: kwargs = {"codes": codes} else: kwargs = {"labels": codes} idx = pd.MultiIndex(levels=levels, names=["a", "b"], **kwargs) res = meta_nonempty(idx) assert type(res) is pd.MultiIndex for idx1, idx2 in zip(idx.levels, res.levels): assert type(idx1) is type(idx2) assert idx1.name == idx2.name assert res.names == idx.names levels = [ pd.Int64Index([1], name="a"), pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"), pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"), ] codes = [[0], [0], [0]] if PANDAS_GT_0240: kwargs = {"codes": codes} else: kwargs = {"labels": codes} idx = pd.MultiIndex(levels=levels, names=["a", "b", "timedelta"], **kwargs) res = meta_nonempty(idx) assert type(res) is pd.MultiIndex for idx1, idx2 in zip(idx.levels, res.levels): assert type(idx1) is type(idx2) assert idx1.name == idx2.name assert res.names == idx.names
cl.prepare_for_distance_algorithm(manipulator = Latlons, kwargs = {'to_radians':True}) # Conversion to radians because HDBSCAN uses that. clusters = cl.clustering(clusterclass = HDBSCAN, kwargs = clusterkwargs) # with Clustering() as cl: # This is the memory heavy precomputed DBSCAN variety # cl.prepare_for_distance_algorithm(where = 'shared', manipulator = Latlons) # cl.call_distance_algorithm(func = haversine_worker, n_par_processes = NPROC, distmatdtype = np.float16) # clusters = cl.clustering(clusterclass = DBSCAN, kwargs = {'eps':1300, 'min_samples':2000}) nclusters = int(clusters.coords["nclusters"]) # nclusters returned as coordinate because this matches bahaviour of the non-DBSCAN algorithms, even though with DBSCAN it is only a dimension of length 1 logging.debug(f'clustered {invarname} of {filename} by spatial haversine distance with HDBSCAN for lag: {lag}, fold: {fold}, resulting nclusters: {nclusters}') except MaskingError: # Happens when masking results in zero or less than the minimum samples nclusters = 0 clusters = xr.DataArray(np.nan, dims = cl.samplefield.dims, coords = cl.samplefield.drop_vars(['lag','fold'], errors = 'ignore').coords) logging.debug(f'No/too little samples were present after masking {invarname} of {filename} for lag: {lag}. fold: {fold}, HDBSCAN was not called. A field with zero clusters is returned.') if fold is None: attrs.update({f'lag{lag}':f'nclusters: {nclusters}'}) else: attrs.update({f'lag{lag}_fold{fold}':f'nclusters: {nclusters}'}) combined.append(clusters.squeeze().drop_vars('nclusters', errors = 'ignore')) if fold is None: temp = xr.concat(combined, dim = pd.Index(lags, name = 'lag')) # Immediately at first position, and correct order else: temp = xr.concat(combined, dim = pd.MultiIndex.from_product([lags,folds], names = ['lag','fold'])).unstack('concat_dim').transpose(*ds[invarname].dims).reindex_like(ds[invarname]) # Unstack brings the lag/fold dimension to last place and scrambled order, so so transpose and reindex to get original ordering ds.close() # Need to close before writer can access attrs.update({key:str(item) for key,item in clusterkwargs.items()}) w = Writer(corrpath,varname = outvarname) # Should be able to find the dataformat w.create_dataset(example = temp) w.write(array = temp, units = '', attrs = attrs) else: logging.debug(f'{filename} was already clustered') ds.close()
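# The per-lag cluster fields above are stacked along a new labelled dimension by passing a
# pd.Index as the concat dimension. A minimal sketch with synthetic arrays (the shapes and
# dimension names here are illustrative, not the script's actual data):
import numpy as np
import pandas as pd
import xarray as xr

lags = [1, 3, 5]
combined = [xr.DataArray(np.random.rand(4, 4), dims=('latitude', 'longitude')) for _ in lags]

stacked = xr.concat(combined, dim=pd.Index(lags, name='lag'))
print(stacked.dims)   # ('lag', 'latitude', 'longitude')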
else: hpc_channel = generalinfo['channelStructure'][0][0][1][0][0][0][0] - 1 spikes, shank = loadSpikeData( data_directory + session + '/Analysis/SpikeData.mat', shankStructure['thalamus']) wake_ep = loadEpoch(data_directory + session, 'wake') sleep_ep = loadEpoch(data_directory + session, 'sleep') sws_ep = loadEpoch(data_directory + session, 'sws') rem_ep = loadEpoch(data_directory + session, 'rem') sleep_ep = sleep_ep.merge_close_intervals(threshold=1.e3) sws_ep = sleep_ep.intersect(sws_ep) rem_ep = sleep_ep.intersect(rem_ep) Hcorr_ep = {} for ep, k in zip([wake_ep, rem_ep, sws_ep], ['wak', 'rem', 'sws']): AUT, FR = compute_AutoCorrs(spikes, ep, binsize=0.5, nbins=20000) AUT.columns = pd.Index( [session.split("/")[1] + "_" + str(n) for n in spikes.keys()]) datatosave[k].append(AUT) print(session, time() - start) for e in datatosave.keys(): datatosave[e] = pd.concat(datatosave[e], 1) store_autocorr = pd.HDFStore( "/mnt/DataGuillaume/MergedData/AUTOCORR_FOR_FOURIER.h5", 'w') store_autocorr.put('wak', datatosave['wak']) store_autocorr.put('rem', datatosave['rem']) store_autocorr.put('sws', datatosave['sws']) store_autocorr.close() from pychronux import *
def previous_application(file_path=file_path, nan_as_category=True): def goods_cat(x): if x in ['XNA', 'Other']: return 'XNA' elif x in ['Mobile', 'Consumer Electronics', 'Computers', 'Photo / Cinema Equipment', 'Clothing and Accessories', 'Jewelry', 'Sport and Leisure', 'Tourism', 'Fitness', 'Additional Service', 'Weapon', 'Animals', 'Direct Sales']: return 'electronics & leisure' else: return 'home & car & edu & medi' df_prev = pd.read_csv(file_path + 'previous_application.csv') # Replace some outliers df_prev.loc[df_prev['AMT_CREDIT'] > 6000000, 'AMT_CREDIT'] = np.nan df_prev.loc[df_prev['SELLERPLACE_AREA'] > 3500000, 'SELLERPLACE_AREA'] = np.nan # Replace the 365243 sentinel with NaN; assign back because replace(..., inplace=True) on a column subset operates on a copy and has no effect days_cols = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION'] df_prev[days_cols] = df_prev[days_cols].replace(365243, np.nan) # category df_prev.drop('WEEKDAY_APPR_PROCESS_START', axis=1, inplace=True) df_prev['NAME_SELLER_INDUSTRY'] = df_prev['NAME_SELLER_INDUSTRY'].apply(lambda x: 'other' if x not in ['XNA', 'Consumer electronics', 'Connectivity'] else x) df_prev['CHANNEL_TYPE'] = df_prev['CHANNEL_TYPE'].apply(lambda x: 'other' if x not in ['Credit and cash offices', 'Country-wide'] else x) df_prev['NAME_PORTFOLIO'] = df_prev['NAME_PORTFOLIO'].apply(lambda x: 'other' if x not in ['POS', 'Cash'] else x) df_prev['NAME_GOODS_CATEGORY'] = df_prev['NAME_GOODS_CATEGORY'].apply(lambda x: goods_cat(x)) df_prev['NAME_TYPE_SUITE'] = df_prev['NAME_TYPE_SUITE'].apply(lambda x: 'other' if x != 'Unaccompanied' else x) df_prev['CODE_REJECT_REASON'] = df_prev['CODE_REJECT_REASON'].apply(lambda x: 'other' if x not in ['XAP', 'HC'] else x) df_prev['NAME_PAYMENT_TYPE'] = df_prev['NAME_PAYMENT_TYPE'].apply(lambda x: 'other' if x not in ['Cash through the bank'] else x) df_prev['NAME_CASH_LOAN_PURPOSE'] = df_prev['NAME_CASH_LOAN_PURPOSE'].apply(lambda x: 'other' if x not in ['XAP', 'XNA'] else x) # Some new features df_prev['prev missing'] = df_prev.isnull().sum(axis=1).values df_prev['prev AMT_APPLICATION / AMT_CREDIT'] = df_prev['AMT_APPLICATION'] / df_prev['AMT_CREDIT'] df_prev['prev AMT_APPLICATION - AMT_CREDIT'] = df_prev['AMT_APPLICATION'] - df_prev['AMT_CREDIT'] df_prev['prev AMT_APPLICATION - AMT_GOODS_PRICE'] = df_prev['AMT_APPLICATION'] - df_prev['AMT_GOODS_PRICE'] df_prev['prev AMT_GOODS_PRICE - AMT_CREDIT'] = df_prev['AMT_GOODS_PRICE'] - df_prev['AMT_CREDIT'] df_prev['prev DAYS_FIRST_DRAWING - DAYS_FIRST_DUE'] = df_prev['DAYS_FIRST_DRAWING'] - df_prev['DAYS_FIRST_DUE'] df_prev['prev DAYS_TERMINATION less -500'] = (df_prev['DAYS_TERMINATION'] < -500).astype(int) df_prev['DAYS_LAST_DUE - DAYS_TERMINATION'] = df_prev['DAYS_LAST_DUE'] - df_prev['DAYS_TERMINATION'] #df_prev = df_prev.drop(['AMT_APPLICATION', 'AMT_GOODS_PRICE', 'DAYS_TERMINATION'], axis=1) df_prev['avg loan terms'] = df_prev['AMT_CREDIT'] / df_prev['AMT_ANNUITY'] # Categorical features with One-Hot encode df_prev, categorical = one_hot_encoder(df_prev, nan_as_category) # Aggregations for application set aggregations = {} for col in df_prev.columns: aggregations[col] = ['mean','sum'] if col in categorical else ['min', 'max', 'size', 'mean', 'var', 'sum'] df_prev_agg = df_prev.groupby('SK_ID_CURR').agg(aggregations) df_prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in df_prev_agg.columns.tolist()]) # Previous Applications: Approved Applications approved_agg = df_prev[df_prev['NAME_CONTRACT_STATUS_Approved'] == 1].groupby('SK_ID_CURR').agg(aggregations) approved_agg.columns = pd.Index(['PREV_APPROVED_' + e[0] + "_" + e[1].upper() for e in 
approved_agg.columns.tolist()]) df_prev_agg = df_prev_agg.join(approved_agg, how='left') del approved_agg gc.collect() # Previous Applications: Refused Applications refused_agg = df_prev[df_prev['NAME_CONTRACT_STATUS_Refused'] == 1].groupby('SK_ID_CURR').agg(aggregations) refused_agg.columns = pd.Index(['PREV_REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()]) df_prev_agg = df_prev_agg.join(refused_agg, how='left') del refused_agg gc.collect() # cash loans application cash_loan_agg = df_prev[df_prev['NAME_CONTRACT_TYPE_Cash loans'] == 1].groupby('SK_ID_CURR').agg(aggregations) cash_loan_agg.columns = pd.Index( ['PREV_Cash loans_' + e[0] + "_" + e[1].upper() for e in cash_loan_agg.columns.tolist()]) df_prev_agg = df_prev_agg.join(cash_loan_agg, how='left') del cash_loan_agg gc.collect() # consumer loans consumer_loan_agg = df_prev[df_prev['NAME_CONTRACT_TYPE_Consumer loans'] == 1].groupby('SK_ID_CURR').agg(aggregations) consumer_loan_agg.columns = pd.Index( ['PREV_Consumer loans_' + e[0] + "_" + e[1].upper() for e in consumer_loan_agg.columns.tolist()]) df_prev_agg = df_prev_agg.join(consumer_loan_agg, how='left') del consumer_loan_agg gc.collect() # Revolving loans Revolving_loan_agg = df_prev[df_prev['NAME_CONTRACT_TYPE_Revolving loans'] == 1].groupby('SK_ID_CURR').agg( aggregations) Revolving_loan_agg.columns = pd.Index( ['PREV_Revolving loans_' + e[0] + "_" + e[1].upper() for e in Revolving_loan_agg.columns.tolist()]) df_prev_agg = df_prev_agg.join(Revolving_loan_agg, how='left') del Revolving_loan_agg gc.collect() del df_prev gc.collect() return reduce_mem_usage(df_prev_agg)
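The column-naming pattern used repeatedly above — flatten the MultiIndex produced by groupby/agg into a prefixed flat pd.Index — in isolation, on toy data with hypothetical names:

import pandas as pd

df = pd.DataFrame({"SK_ID_CURR": [1, 1, 2], "AMT_CREDIT": [100.0, 200.0, 50.0]})
agg = df.groupby("SK_ID_CURR").agg({"AMT_CREDIT": ["min", "max", "mean"]})
# agg.columns is a MultiIndex of (column, statistic) tuples; join the two levels into flat names.
agg.columns = pd.Index(["PREV_" + col + "_" + stat.upper() for col, stat in agg.columns])
print(agg.columns.tolist())  # ['PREV_AMT_CREDIT_MIN', 'PREV_AMT_CREDIT_MAX', 'PREV_AMT_CREDIT_MEAN']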
def itemid_2_index(self): r_index = pd.Index(self.item_data.item.unique(), name='item') return r_index
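A small usage sketch (toy values): building a unique Index over item ids makes it cheap to translate ids into positional indices later, e.g. for matrix rows.

import pandas as pd

item_index = pd.Index([10, 42, 7], name='item')    # e.g. what itemid_2_index() would return
positions = item_index.get_indexer([42, 7, 99])    # ids not present map to -1
print(positions.tolist())                          # [1, 2, -1]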
def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) tm.assert_index_equal(dti.nanosecond, pd.Index(np.arange(10, dtype=np.int64)))
def to_pandas_frame(self) -> pd.DataFrame: """ Return as pandas DataFrame. """ sdf = self.to_internal_spark_frame pdf = sdf.toPandas() if len(pdf) == 0 and len(sdf.schema) > 0: pdf = pdf.astype({ field.name: spark_type_to_pandas_dtype(field.dataType) for field in sdf.schema }) elif LooseVersion(pyspark.__version__) < LooseVersion("3.0"): for field in sdf.schema: if field.nullable and pdf[field.name].isnull().all(): if isinstance(field.dataType, BooleanType): pdf[field.name] = pdf[field.name].astype(np.object) elif isinstance(field.dataType, IntegralType): pdf[field.name] = pdf[field.name].astype(np.float64) else: pdf[field.name] = pdf[field.name].astype( spark_type_to_pandas_dtype(field.dataType)) column_names = [] for i, (label, spark_column, column_name) in enumerate( zip(self.column_labels, self.data_spark_columns, self.data_spark_column_names)): for index_spark_column_name, index_spark_column in zip( self.index_spark_column_names, self.index_spark_columns): if spark_column._jc.equals(index_spark_column._jc): column_names.append(index_spark_column_name) break else: name = str(i) if label is None else name_like_string(label) if column_name != name: column_name = name column_names.append(column_name) append = False for index_field in self.index_spark_column_names: drop = index_field not in column_names pdf = pdf.set_index(index_field, drop=drop, append=append) append = True pdf = pdf[column_names] names = [ name if name is None or len(name) > 1 else name[0] for name in self._column_label_names ] if self.column_labels_level > 1: pdf.columns = pd.MultiIndex.from_tuples(self._column_labels, names=names) else: pdf.columns = pd.Index( [ None if label is None else label[0] for label in self._column_labels ], name=names[0], ) pdf.index.names = [ name if name is None or len(name) > 1 else name[0] for name in self.index_names ] return pdf
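In isolation, the column-rebuild step at the end of to_pandas_frame: flat labels become a named pd.Index, nested labels a MultiIndex. The toy labels and level names below are hypothetical.

import pandas as pd

pdf = pd.DataFrame([[1, 2]], columns=["0", "1"])   # columns as they come back from Spark
labels = [("a", "x"), ("a", "y")]                  # tracked column labels
names = ["lvl0", "lvl1"]
if max(len(label) for label in labels) > 1:
    pdf.columns = pd.MultiIndex.from_tuples(labels, names=names)
else:
    pdf.columns = pd.Index([label[0] for label in labels], name=names[0])
print(pdf.columns)  # MultiIndex([('a', 'x'), ('a', 'y')], names=['lvl0', 'lvl1'])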
class TestTableOrient: def setup_method(self, method): self.df = DataFrame( { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), "D": pd.timedelta_range("1H", periods=4, freq="T"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], "H": pd.date_range( "2016-01-01", freq="d", periods=4, tz="US/Central"), }, index=pd.Index(range(4), name="idx"), ) def test_build_series(self): s = pd.Series([1, 2], name="a") s.index.name = "id" result = s.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) assert "pandas_version" in result["schema"] result["schema"].pop("pandas_version") fields = [{ "name": "id", "type": "integer" }, { "name": "a", "type": "integer" }] schema = {"fields": fields, "primaryKey": ["id"]} expected = OrderedDict([ ("schema", schema), ( "data", [ OrderedDict([("id", 0), ("a", 1)]), OrderedDict([("id", 1), ("a", 2)]), ], ), ]) assert result == expected def test_to_json(self): df = self.df.copy() df.index.name = "idx" result = df.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) assert "pandas_version" in result["schema"] result["schema"].pop("pandas_version") fields = [ { "name": "idx", "type": "integer" }, { "name": "A", "type": "integer" }, { "name": "B", "type": "string" }, { "name": "C", "type": "datetime" }, { "name": "D", "type": "duration" }, { "constraints": { "enum": ["a", "b", "c"] }, "name": "E", "ordered": False, "type": "any", }, { "constraints": { "enum": ["a", "b", "c"] }, "name": "F", "ordered": True, "type": "any", }, { "name": "G", "type": "number" }, { "name": "H", "type": "datetime", "tz": "US/Central" }, ] schema = {"fields": fields, "primaryKey": ["idx"]} data = [ OrderedDict([ ("idx", 0), ("A", 1), ("B", "a"), ("C", "2016-01-01T00:00:00.000Z"), ("D", "P0DT1H0M0S"), ("E", "a"), ("F", "a"), ("G", 1.0), ("H", "2016-01-01T06:00:00.000Z"), ]), OrderedDict([ ("idx", 1), ("A", 2), ("B", "b"), ("C", "2016-01-02T00:00:00.000Z"), ("D", "P0DT1H1M0S"), ("E", "b"), ("F", "b"), ("G", 2.0), ("H", "2016-01-02T06:00:00.000Z"), ]), OrderedDict([ ("idx", 2), ("A", 3), ("B", "c"), ("C", "2016-01-03T00:00:00.000Z"), ("D", "P0DT1H2M0S"), ("E", "c"), ("F", "c"), ("G", 3.0), ("H", "2016-01-03T06:00:00.000Z"), ]), OrderedDict([ ("idx", 3), ("A", 4), ("B", "c"), ("C", "2016-01-04T00:00:00.000Z"), ("D", "P0DT1H3M0S"), ("E", "c"), ("F", "c"), ("G", 4.0), ("H", "2016-01-04T06:00:00.000Z"), ]), ] expected = OrderedDict([("schema", schema), ("data", data)]) assert result == expected def test_to_json_float_index(self): data = pd.Series(1, index=[1.0, 2.0]) result = data.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) result["schema"].pop("pandas_version") expected = OrderedDict([ ( "schema", { "fields": [ { "name": "index", "type": "number" }, { "name": "values", "type": "integer" }, ], "primaryKey": ["index"], }, ), ( "data", [ OrderedDict([("index", 1.0), ("values", 1)]), OrderedDict([("index", 2.0), ("values", 1)]), ], ), ]) assert result == expected def test_to_json_period_index(self): idx = pd.period_range("2016", freq="Q-JAN", periods=2) data = pd.Series(1, idx) result = data.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) result["schema"].pop("pandas_version") fields = [ { "freq": "Q-JAN", "name": "index", "type": "datetime" }, { 
"name": "values", "type": "integer" }, ] schema = {"fields": fields, "primaryKey": ["index"]} data = [ OrderedDict([("index", "2015-11-01T00:00:00.000Z"), ("values", 1)]), OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]), ] expected = OrderedDict([("schema", schema), ("data", data)]) assert result == expected def test_to_json_categorical_index(self): data = pd.Series(1, pd.CategoricalIndex(["a", "b"])) result = data.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) result["schema"].pop("pandas_version") expected = OrderedDict([ ( "schema", { "fields": [ { "name": "index", "type": "any", "constraints": { "enum": ["a", "b"] }, "ordered": False, }, { "name": "values", "type": "integer" }, ], "primaryKey": ["index"], }, ), ( "data", [ OrderedDict([("index", "a"), ("values", 1)]), OrderedDict([("index", "b"), ("values", 1)]), ], ), ]) assert result == expected def test_date_format_raises(self): with pytest.raises(ValueError): self.df.to_json(orient="table", date_format="epoch") # others work self.df.to_json(orient="table", date_format="iso") self.df.to_json(orient="table") def test_convert_pandas_type_to_json_field_int(self, index_or_series): kind = index_or_series data = [1, 2, 3] result = convert_pandas_type_to_json_field(kind(data, name="name")) expected = {"name": "name", "type": "integer"} assert result == expected def test_convert_pandas_type_to_json_field_float(self, index_or_series): kind = index_or_series data = [1.0, 2.0, 3.0] result = convert_pandas_type_to_json_field(kind(data, name="name")) expected = {"name": "name", "type": "number"} assert result == expected @pytest.mark.parametrize("dt_args,extra_exp", [({}, {}), ({ "utc": True }, { "tz": "UTC" })]) @pytest.mark.parametrize("wrapper", [None, pd.Series]) def test_convert_pandas_type_to_json_field_datetime( self, dt_args, extra_exp, wrapper): data = [1.0, 2.0, 3.0] data = pd.to_datetime(data, **dt_args) if wrapper is pd.Series: data = pd.Series(data, name="values") result = convert_pandas_type_to_json_field(data) expected = {"name": "values", "type": "datetime"} expected.update(extra_exp) assert result == expected def test_convert_pandas_type_to_json_period_range(self): arr = pd.period_range("2016", freq="A-DEC", periods=4) result = convert_pandas_type_to_json_field(arr) expected = {"name": "values", "type": "datetime", "freq": "A-DEC"} assert result == expected @pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex]) @pytest.mark.parametrize("ordered", [True, False]) def test_convert_pandas_type_to_json_field_categorical( self, kind, ordered): data = ["a", "b", "c"] if kind is pd.Categorical: arr = pd.Series(kind(data, ordered=ordered), name="cats") elif kind is pd.CategoricalIndex: arr = kind(data, ordered=ordered, name="cats") result = convert_pandas_type_to_json_field(arr) expected = { "name": "cats", "type": "any", "constraints": { "enum": data }, "ordered": ordered, } assert result == expected @pytest.mark.parametrize( "inp,exp", [ ({ "type": "integer" }, "int64"), ({ "type": "number" }, "float64"), ({ "type": "boolean" }, "bool"), ({ "type": "duration" }, "timedelta64"), ({ "type": "datetime" }, "datetime64[ns]"), ({ "type": "datetime", "tz": "US/Hawaii" }, "datetime64[ns, US/Hawaii]"), ({ "type": "any" }, "object"), ( { "type": "any", "constraints": { "enum": ["a", "b", "c"] }, "ordered": False, }, CategoricalDtype(categories=["a", "b", "c"], ordered=False), ), ( { "type": "any", "constraints": { "enum": ["a", "b", "c"] }, "ordered": 
True, }, CategoricalDtype(categories=["a", "b", "c"], ordered=True), ), ({ "type": "string" }, "object"), ], ) def test_convert_json_field_to_pandas_type(self, inp, exp): field = {"name": "foo"} field.update(inp) assert convert_json_field_to_pandas_type(field) == exp @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"]) def test_convert_json_field_to_pandas_type_raises(self, inp): field = {"type": inp} with pytest.raises(ValueError, match=f"Unsupported or invalid field type: {inp}"): convert_json_field_to_pandas_type(field) def test_categorical(self): s = pd.Series(pd.Categorical(["a", "b", "a"])) s.index.name = "idx" result = s.to_json(orient="table", date_format="iso") result = json.loads(result, object_pairs_hook=OrderedDict) result["schema"].pop("pandas_version") fields = [ { "name": "idx", "type": "integer" }, { "constraints": { "enum": ["a", "b"] }, "name": "values", "ordered": False, "type": "any", }, ] expected = OrderedDict([ ("schema", { "fields": fields, "primaryKey": ["idx"] }), ( "data", [ OrderedDict([("idx", 0), ("values", "a")]), OrderedDict([("idx", 1), ("values", "b")]), OrderedDict([("idx", 2), ("values", "a")]), ], ), ]) assert result == expected @pytest.mark.parametrize( "idx,nm,prop", [ (pd.Index([1]), "index", "name"), (pd.Index([1], name="myname"), "myname", "name"), ( pd.MultiIndex.from_product([("a", "b"), ("c", "d")]), ["level_0", "level_1"], "names", ), ( pd.MultiIndex.from_product([("a", "b"), ("c", "d")], names=["n1", "n2"]), ["n1", "n2"], "names", ), ( pd.MultiIndex.from_product([("a", "b"), ("c", "d")], names=["n1", None]), ["n1", "level_1"], "names", ), ], ) def test_set_names_unset(self, idx, nm, prop): data = pd.Series(1, idx) result = set_default_names(data) assert getattr(result.index, prop) == nm @pytest.mark.parametrize( "idx", [ pd.Index([], name="index"), pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("level_0", "level_1")), pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("foo", "level_1")), ], ) def test_warns_non_roundtrippable_names(self, idx): # GH 19130 df = pd.DataFrame(index=idx) df.index.name = "index" with tm.assert_produces_warning(): set_default_names(df) def test_timestamp_in_columns(self): df = pd.DataFrame( [[1, 2]], columns=[pd.Timestamp("2016"), pd.Timedelta(10, unit="s")]) result = df.to_json(orient="table") js = json.loads(result) assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z" # TODO - below expectation is not correct; see GH 28256 assert js["schema"]["fields"][2]["name"] == 10000 @pytest.mark.parametrize( "case", [ pd.Series([1], index=pd.Index([1], name="a"), name="a"), pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")), pd.DataFrame( {"A": [1]}, index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a" ]), ), ], ) def test_overlapping_names(self, case): with pytest.raises(ValueError, match="Overlapping"): case.to_json(orient="table")
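The schema these tests assert against can also be produced directly with build_table_schema, shown here on a toy frame (printed output is indicative only):

import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]}, index=pd.Index([0, 1], name="idx"))
schema = build_table_schema(df, version=False)
print(schema)
# {'fields': [{'name': 'idx', 'type': 'integer'},
#             {'name': 'A', 'type': 'integer'},
#             {'name': 'B', 'type': 'string'}],
#  'primaryKey': ['idx']}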
def execute_selection_dataframe( op, data, selections, predicates, sort_keys, scope: Scope, timecontext: Optional[TimeContext], **kwargs, ): result = data # Build up the individual pandas structures from column expressions if selections: if all(isinstance(s.op(), ops.TableColumn) for s in selections): result = build_df_from_selection(selections, data, op.table.op()) else: result = build_df_from_projection( selections, op, data, scope=scope, timecontext=timecontext, **kwargs, ) if predicates: predicates = _compute_predicates( op.table.op(), predicates, data, scope, timecontext, **kwargs ) predicate = functools.reduce(operator.and_, predicates) assert len(predicate) == len( result ), 'Selection predicate length does not match underlying table' result = result.loc[predicate] if sort_keys: result, grouping_keys, ordering_keys = util.compute_sorted_frame( result, order_by=sort_keys, scope=scope, timecontext=timecontext, **kwargs, ) else: grouping_keys = ordering_keys = () # return early if we do not have any temporary grouping or ordering columns assert not grouping_keys, 'group by should never show up in Selection' if not ordering_keys: return result # create a sequence of columns that we need to drop temporary_columns = pd.Index( concatv(grouping_keys, ordering_keys) ).difference(data.columns) # no reason to call drop if we don't need to if temporary_columns.empty: return result # drop every temporary column we created for ordering or grouping return result.drop(temporary_columns, axis=1)
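The temporary-column cleanup at the end of execute_selection_dataframe, in isolation: pd.Index.difference keeps only the helper columns that are not part of the original table, so nothing user-visible is dropped. The helper column names below are hypothetical.

import pandas as pd

data_columns = pd.Index(["a", "b"])
grouping_keys, ordering_keys = ("_group_key_0",), ("_sort_key_0",)
temporary_columns = pd.Index([*grouping_keys, *ordering_keys]).difference(data_columns)
print(temporary_columns.tolist())  # ['_group_key_0', '_sort_key_0']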
def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) out = df.to_json(orient="table") with pytest.raises(NotImplementedError, match="can not yet read "): pd.read_json(out, orient="table")
def _infer_concat_order_from_coords(datasets): concat_dims = [] tile_ids = [() for ds in datasets] # All datasets have same variables because they've been grouped as such ds0 = datasets[0] for dim in ds0.dims: # Check if dim is a coordinate dimension if dim in ds0: # Need to read coordinate values to do ordering indexes = [ds.indexes.get(dim) for ds in datasets] if any(index is None for index in indexes): raise ValueError("Every dimension needs a coordinate for " "inferring concatenation order") # If dimension coordinate values are the same on every dataset then # should be leaving this dimension alone (it's just a "bystander") if not all(index.equals(indexes[0]) for index in indexes[1:]): # Infer order datasets should be arranged in along this dim concat_dims.append(dim) if all(index.is_monotonic_increasing for index in indexes): ascending = True elif all(index.is_monotonic_decreasing for index in indexes): ascending = False else: raise ValueError( "Coordinate variable {} is neither " "monotonically increasing nor " "monotonically decreasing on all datasets".format(dim)) # Assume that any two datasets whose coord along dim starts # with the same value have the same coord values throughout. if any(index.size == 0 for index in indexes): raise ValueError("Cannot handle size zero dimensions") first_items = pd.Index([index.take([0]) for index in indexes]) # Sort datasets along dim # We want rank but with identical elements given identical # position indices - they should be concatenated along another # dimension, not along this one series = first_items.to_series() rank = series.rank(method="dense", ascending=ascending) order = rank.astype(int).values - 1 # Append positions along extra dimension to structure which # encodes the multi-dimensional concatenation order tile_ids = [ tile_id + (position, ) for tile_id, position in zip(tile_ids, order) ] if len(datasets) > 1 and not concat_dims: raise ValueError("Could not find any dimension coordinates to use to " "order the datasets for concatenation") combined_ids = OrderedDict(zip(tile_ids, datasets)) return combined_ids, concat_dims
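The ordering trick in isolation: dense-ranking the first coordinate value of each dataset assigns the same position to datasets that start at the same value (those belong side by side along some other dimension), and subtracting one turns the ranks into zero-based positions.

import pandas as pd

first_items = pd.Index([30.0, 10.0, 30.0, 20.0])   # e.g. first latitude of each dataset
rank = first_items.to_series().rank(method="dense", ascending=True)
order = rank.astype(int).values - 1
print(order.tolist())  # [2, 0, 2, 1]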
"to same dtypes."), ): s.where([True, False, True], [1, 2, 3]) @pytest.mark.parametrize( "ps", [ pd.Series(["a"] * 20, index=range(0, 20)), pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), ], ) @pytest.mark.parametrize( "labels", [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], ) @pytest.mark.parametrize("inplace", [True, False]) def test_series_drop_labels(ps, labels, inplace): ps = ps.copy() gs = cudf.from_pandas(ps) expected = ps.drop(labels=labels, axis=0, inplace=inplace) actual = gs.drop(labels=labels, axis=0, inplace=inplace) if inplace: expected = ps actual = gs assert_eq(expected, actual)
def aggregate(self, func=None, *args, **kwargs): if self._axis != 0: # This is not implemented in pandas, # so we throw a different message raise NotImplementedError("axis other than 0 is not supported") if ( callable(func) and isinstance(func, BuiltinFunctionType) and func.__name__ in dir(self) ): func = func.__name__ relabeling_required = False if isinstance(func, dict) or func is None: def try_get_str_func(fn): if not isinstance(fn, str) and isinstance(fn, Iterable): return [try_get_str_func(f) for f in fn] return fn.__name__ if callable(fn) and fn.__name__ in dir(self) else fn relabeling_required, func_dict, new_columns, order = reconstruct_func( func, **kwargs ) func_dict = {col: try_get_str_func(fn) for col, fn in func_dict.items()} if any(i not in self._df.columns for i in func_dict.keys()): from pandas.core.base import SpecificationError raise SpecificationError("nested renamer is not supported") if func is None: kwargs = {} func = func_dict elif is_list_like(func): return self._default_to_pandas( lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs), *args, **kwargs, ) elif callable(func): return self._apply_agg_function( lambda grp, *args, **kwargs: grp.aggregate(func, *args, **kwargs), *args, **kwargs, ) elif isinstance(func, str): # Using "getattr" here masks possible AttributeError which we throw # in __getattr__, so we should call __getattr__ directly instead. agg_func = self.__getattr__(func) if callable(agg_func): return agg_func(*args, **kwargs) result = self._apply_agg_function( func, *args, **kwargs, ) if relabeling_required: if not self._as_index: nby_cols = len(result.columns) - len(new_columns) order = np.concatenate([np.arange(nby_cols), order + nby_cols]) by_cols = result.columns[:nby_cols] new_columns = pandas.Index(new_columns) if by_cols.nlevels != new_columns.nlevels: by_cols = by_cols.remove_unused_levels() empty_levels = [ i for i, level in enumerate(by_cols.levels) if len(level) == 1 and level[0] == "" ] by_cols = by_cols.droplevel(empty_levels) new_columns = by_cols.append(new_columns) result = result.iloc[:, order] result.columns = new_columns return result
def create_loom_from_tabulamurisfacs(fn_loom): samplesheet = pd.read_csv( '../../data/tabulamuris/FACS_alltissues/annotations_facs.csv', sep=',', index_col='cell', low_memory=False, ).iloc[:, 2:] samplesheet.index.name = 'CellID' immune_types = [ 'B cell', 'DN1 thymic pro-T cell', 'regulatory T cell', 'basophil', 'pre-natural killer cell', 'immature T cell', 'myeloid cell', 'T cell', 'granulocyte', 'naive B cell', 'leukocyte', 'precursor B cell', 'macrophage', 'immature B cell', 'monocyte', 'late pro-B cell', 'natural killer cell', 'granulocyte monocyte progenitor cell', 'classical monocyte', 'lymphocyte', 'professional antigen presenting cell', 'mature natural killer cell', 'immature NK T cell', 'immature natural killer cell', ] samplesheet = samplesheet.loc[samplesheet['cell_ontology_class'].isin( immune_types)] print('Tabula Muris has a total of {:} immune cells of {:} types'.format( samplesheet.shape[0], len(immune_types))) cnames_unsort = samplesheet.index cellnames = [] genes = [] counts = [] fns = glob.glob('../../data/tabulamuris/FACS_alltissues/FACS/FACS/*.loom') for ifn, fn in enumerate(fns): tissue = os.path.basename(fn)[:-len('-counts.loom')] print('Mining {:} ({:}/{:})'.format(tissue, ifn + 1, len(fns))) with loompy.connect(fn) as dsl: cnsus = dsl.ca['CellID'] idx = pd.Index(cnsus).isin(cnames_unsort).nonzero()[0] cns = cnsus[idx] cos = dsl[:, idx] cellnames.append(cns) counts.append(cos) genes.append(dsl.ra['GeneName']) # Check that they all have the same genes if len(set([tuple(x) for x in genes])) > 1: print('WARNING: not all tissues have the same genes') return { 'ss': samplesheet, 'counts': counts, 'cns': cellnames, 'genes': genes } print('Merging into single loom file') cellnames = np.concatenate(cellnames) counts = np.hstack(counts) genes = genes[0] samplesheet = samplesheet.loc[cellnames] print('Writing loom file') col_attrs = {col: samplesheet[col].values for col in samplesheet.columns} col_attrs['CellID'] = samplesheet.index.values row_attrs = {'GeneName': genes} loompy.create( fn_loom, layers={'': counts}, col_attrs=col_attrs, row_attrs=row_attrs, )
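The positional-selection pattern used when mining each tissue file, in isolation: Index.isin yields a boolean mask and nonzero() turns it into the integer positions that loompy's column slicing expects (toy cell ids below).

import pandas as pd

cells_in_file = pd.Index(["c1", "c2", "c3", "c4"])
wanted_cells = {"c2", "c4"}
idx = cells_in_file.isin(wanted_cells).nonzero()[0]
print(idx.tolist())  # [1, 3]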
klass = type(obj) repeated_values = np.repeat(values, range(1, len(values) + 1)) obj = klass(repeated_values, dtype=obj.dtype) if isinstance(obj, pd.CategoricalIndex): assert obj.nunique() == len(obj.categories) assert obj.nunique(dropna=False) == len(obj.categories) + 1 else: num_unique_values = len(obj.unique()) assert obj.nunique() == max(0, num_unique_values - 1) assert obj.nunique(dropna=False) == max(0, num_unique_values) @pytest.mark.parametrize("idx_or_series_w_bad_unicode", [pd.Index(["\ud83d"] * 2), pd.Series(["\ud83d"] * 2)]) def test_unique_bad_unicode(idx_or_series_w_bad_unicode): # regression test for #34550 obj = idx_or_series_w_bad_unicode result = obj.unique() if isinstance(obj, pd.Index): expected = pd.Index(["\ud83d"], dtype=object) tm.assert_index_equal(result, expected, exact=True) else: expected = np.array(["\ud83d"], dtype=object) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dropna", [True, False])