[FieldTypes.uint64, [64, np.uint64(64)], [64.]],
    [FieldTypes.float16, [16., np.float16(16)], [16]],
    [FieldTypes.float32, [32., np.float32(32)], [32]],
    [FieldTypes.float64, [64., np.float64(64)], [64]],
    [FieldTypes.complex64, [1+2j, np.complex64(1+2j)], [64]],
    [FieldTypes.complex128, [1+2j, np.complex128(1+2j)], [128]],
    [FieldTypes.bytes, [b'abc', np.bytes_('abc')], ['abc']],
    [FieldTypes.string, ['abc', np.str_('abc')], [b'abc']],
    [FieldTypes.ndarray, [np.array([1, 2, 3])], [object()]],
    [FieldTypes.dtype, [np.dtype(np.int32), pd.StringDtype()], [object()]],
    [FieldTypes.key, [MyClass()], [object()]],
    [FieldTypes.slice, [slice(1, 10), slice('a', 'b')], [object()]],
    [FieldTypes.datetime, [datetime.now(), pd.Timestamp(0)], [object()]],
    [FieldTypes.timedelta, [timedelta(days=1), pd.Timedelta(days=1)], [object()]],
    [FieldTypes.tzinfo, [timezone.utc], [object()]],
    [FieldTypes.index, [pd.RangeIndex(10), pd.Index([1, 2])], [object()]],
    [FieldTypes.series, [pd.Series([1, 2, 3])], [object()]],
    [FieldTypes.dataframe, [pd.DataFrame({'a': [1, 2]})], [object()]],
    [FieldTypes.interval_array, [pd.arrays.IntervalArray([])], [object()]],
    [FieldTypes.function, [MyClass.my_func], [object()]],
    [FieldTypes.namedtuple, [my_named_tuple(a=1, b=2)], [tuple()]],
    [FieldTypes.reference(MyClass), [MyClass()], [object()]],
    [FieldTypes.tuple(FieldTypes.int64, ...), [tuple(), tuple([1, 2])], [list(), tuple([1, 2.])]],
    [FieldTypes.list(FieldTypes.int64, FieldTypes.float64), [[1, 1.]], [tuple(), [1, 1]]],
    [FieldTypes.dict(FieldTypes.string, FieldTypes.int64), [{'a': 1}], [{1: 'a'}, {'a': 1.}]],
    [FieldTypes.any, [object()], []],
]


@pytest.mark.parametrize(
    'field_type, valid_values, invalid_values',
Example #2
    def test_frame_from_json_to_json(self):
        def _check_orient(df,
                          orient,
                          dtype=None,
                          numpy=False,
                          convert_axes=True,
                          check_dtype=True,
                          raise_ok=None,
                          sort=None,
                          check_index_type=True,
                          check_column_type=True,
                          check_numpy_dtype=False):
            if sort is not None:
                df = df.sort_values(sort)
            else:
                df = df.sort_index()

            # if we are not unique, then check that we are raising ValueError
            # for the appropriate orients
            if not df.index.is_unique and orient in ['index', 'columns']:
                pytest.raises(ValueError, lambda: df.to_json(orient=orient))
                return
            if (not df.columns.is_unique
                    and orient in ['index', 'columns', 'records']):
                pytest.raises(ValueError, lambda: df.to_json(orient=orient))
                return

            dfjson = df.to_json(orient=orient)

            try:
                unser = read_json(dfjson,
                                  orient=orient,
                                  dtype=dtype,
                                  numpy=numpy,
                                  convert_axes=convert_axes)
            except Exception as detail:
                if raise_ok is not None:
                    if isinstance(detail, raise_ok):
                        return
                raise

            if sort is not None and sort in unser.columns:
                unser = unser.sort_values(sort)
            else:
                unser = unser.sort_index()

            if dtype is False:
                check_dtype = False

            if not convert_axes and df.index.dtype.type == np.datetime64:
                unser.index = DatetimeIndex(
                    unser.index.values.astype('i8') * 1e6)
            if orient == "records":
                # index is not captured in this orientation
                tm.assert_almost_equal(df.values,
                                       unser.values,
                                       check_dtype=check_numpy_dtype)
                tm.assert_index_equal(df.columns,
                                      unser.columns,
                                      exact=check_column_type)
            elif orient == "values":
                # index and cols are not captured in this orientation
                if numpy is True and df.shape == (0, 0):
                    assert unser.shape[0] == 0
                else:
                    tm.assert_almost_equal(df.values,
                                           unser.values,
                                           check_dtype=check_numpy_dtype)
            elif orient == "split":
                # index and col labels might not be strings
                unser.index = [str(i) for i in unser.index]
                unser.columns = [str(i) for i in unser.columns]

                if sort is None:
                    unser = unser.sort_index()
                tm.assert_almost_equal(df.values,
                                       unser.values,
                                       check_dtype=check_numpy_dtype)
            else:
                if convert_axes:
                    tm.assert_frame_equal(df,
                                          unser,
                                          check_dtype=check_dtype,
                                          check_index_type=check_index_type,
                                          check_column_type=check_column_type)
                else:
                    tm.assert_frame_equal(df,
                                          unser,
                                          check_less_precise=False,
                                          check_dtype=check_dtype)

        def _check_all_orients(df,
                               dtype=None,
                               convert_axes=True,
                               raise_ok=None,
                               sort=None,
                               check_index_type=True,
                               check_column_type=True):

            # numpy=False
            if convert_axes:
                _check_orient(df,
                              "columns",
                              dtype=dtype,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)
                _check_orient(df,
                              "records",
                              dtype=dtype,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)
                _check_orient(df,
                              "split",
                              dtype=dtype,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)
                _check_orient(df,
                              "index",
                              dtype=dtype,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)
                _check_orient(df,
                              "values",
                              dtype=dtype,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)

            _check_orient(df,
                          "columns",
                          dtype=dtype,
                          convert_axes=False,
                          sort=sort)
            _check_orient(df,
                          "records",
                          dtype=dtype,
                          convert_axes=False,
                          sort=sort)
            _check_orient(df,
                          "split",
                          dtype=dtype,
                          convert_axes=False,
                          sort=sort)
            _check_orient(df,
                          "index",
                          dtype=dtype,
                          convert_axes=False,
                          sort=sort)
            _check_orient(df,
                          "values",
                          dtype=dtype,
                          convert_axes=False,
                          sort=sort)

            # numpy=True and raise_ok might be not None, so ignore the error
            if convert_axes:
                _check_orient(df,
                              "columns",
                              dtype=dtype,
                              numpy=True,
                              raise_ok=raise_ok,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)
                _check_orient(df,
                              "records",
                              dtype=dtype,
                              numpy=True,
                              raise_ok=raise_ok,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)
                _check_orient(df,
                              "split",
                              dtype=dtype,
                              numpy=True,
                              raise_ok=raise_ok,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)
                _check_orient(df,
                              "index",
                              dtype=dtype,
                              numpy=True,
                              raise_ok=raise_ok,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)
                _check_orient(df,
                              "values",
                              dtype=dtype,
                              numpy=True,
                              raise_ok=raise_ok,
                              sort=sort,
                              check_index_type=False,
                              check_column_type=False)

            _check_orient(df,
                          "columns",
                          dtype=dtype,
                          numpy=True,
                          convert_axes=False,
                          raise_ok=raise_ok,
                          sort=sort)
            _check_orient(df,
                          "records",
                          dtype=dtype,
                          numpy=True,
                          convert_axes=False,
                          raise_ok=raise_ok,
                          sort=sort)
            _check_orient(df,
                          "split",
                          dtype=dtype,
                          numpy=True,
                          convert_axes=False,
                          raise_ok=raise_ok,
                          sort=sort)
            _check_orient(df,
                          "index",
                          dtype=dtype,
                          numpy=True,
                          convert_axes=False,
                          raise_ok=raise_ok,
                          sort=sort)
            _check_orient(df,
                          "values",
                          dtype=dtype,
                          numpy=True,
                          convert_axes=False,
                          raise_ok=raise_ok,
                          sort=sort)

        # basic
        _check_all_orients(self.frame)
        assert self.frame.to_json() == self.frame.to_json(orient="columns")

        _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
        _check_all_orients(self.intframe, dtype=False)

        # big one
        # index and columns are strings as all unserialised JSON object keys
        # are assumed to be strings
        biggie = DataFrame(np.zeros((200, 4)),
                           columns=[str(i) for i in range(4)],
                           index=[str(i) for i in range(200)])
        _check_all_orients(biggie, dtype=False, convert_axes=False)

        # dtypes
        _check_all_orients(DataFrame(biggie, dtype=np.float64),
                           dtype=np.float64,
                           convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype=int),
                           dtype=int,
                           convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype='U3'),
                           dtype='U3',
                           convert_axes=False,
                           raise_ok=ValueError)

        # categorical
        _check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)

        # empty
        _check_all_orients(self.empty_frame,
                           check_index_type=False,
                           check_column_type=False)

        # time series data
        _check_all_orients(self.tsframe)

        # mixed data
        index = pd.Index(['a', 'b', 'c', 'd', 'e'])
        data = {
            'A': [0., 1., 2., 3., 4.],
            'B': [0., 1., 0., 1., 0.],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': [True, False, True, False, True]
        }
        df = DataFrame(data=data, index=index)
        _check_orient(df, "split", check_dtype=False)
        _check_orient(df, "records", check_dtype=False)
        _check_orient(df, "values", check_dtype=False)
        _check_orient(df, "columns", check_dtype=False)
        # index oriented is problematic as it is read back in in a transposed
        # state, so the columns are interpreted as having mixed data and
        # given object dtypes.
        # force everything to have object dtype beforehand
        _check_orient(df.transpose().transpose(), "index", dtype=False)
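
For readers skimming the long test above, here is a compact, standalone round trip over the same orients using only public pandas API (the two-column frame is made up for illustration; StringIO is used because recent pandas versions expect a file-like object rather than a literal JSON string):

from io import StringIO

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})
for orient in ('columns', 'records', 'split', 'index', 'values'):
    payload = df.to_json(orient=orient)
    unser = pd.read_json(StringIO(payload), orient=orient)
    # 'records' and 'values' do not carry the index, and 'values' also drops
    # the column labels, so only the cell values survive those round trips.
    assert df.values.tolist() == unser.values.tolist()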
Example #3
 def test_extract_cell_dataframe(self):
     data = np.stack((
         np.array([[3, 2, 4, 0], [1, 1, 3, 1], [0, 0, 1, 1], [5, 0, 3, 1]]),
         np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 2, 0], [5, 0, 0, 0]]),
     ),
                     axis=2)
     cell_labels = np.array([[0, 1, 1, 0], [1, 1, 3, 3], [0, 0, 3, 3],
                             [0, 0, 3, 3]])
     image = mi.MibiImage(data, ['1', '2'])
     labels = [1, 3]
     areas = [4, 6]
     x_centroids = [1, 2]
     y_centroids = [0, 2]
     first_total = [8, 10]
     second_total = [1, 3]
     # Check coords and areas only
     expected_from_labels = pd.DataFrame(
         np.array([areas, x_centroids, y_centroids]).T,
         columns=['area', 'x_centroid', 'y_centroid'],
         index=pd.Index(labels, name='label'))
     pdt.assert_frame_equal(
         segmentation.extract_cell_dataframe(cell_labels),
         expected_from_labels)
     # Check mode 'total'
     expected_from_total = pd.DataFrame(
         np.array([first_total, second_total]).T,
         columns=['1', '2'],
         index=pd.Index(labels, name='label'))
     pdt.assert_frame_equal(
         segmentation.extract_cell_dataframe(cell_labels, image),
         pd.concat((expected_from_labels, expected_from_total), axis=1))
     # Check mode 'quadrant'
     quads = []
     for label in labels:
         inds = np.nonzero(cell_labels == label)
         quads.append(
             segmentation._circular_sectors_mean(inds, image,
                                                 num_sectors=4))
     expected_from_quadrants = pd.DataFrame(np.array(quads),
                                            columns=['1', '2'],
                                            index=pd.Index(labels,
                                                           name='label'))
     pdt.assert_frame_equal(
         segmentation.extract_cell_dataframe(cell_labels,
                                             image,
                                             mode='quadrant'),
         pd.concat((expected_from_labels, expected_from_quadrants), axis=1))
     # Check mode 'circular_sectors'
     secs = []
     for label in labels:
         inds = np.nonzero(cell_labels == label)
         num_sectors = 8
         secs.append(
             segmentation._circular_sectors_mean(inds, image, num_sectors))
     expected_from_circular_sectors = pd.DataFrame(
         np.array(secs),
         columns=['1', '2'],
         index=pd.Index(labels, name='label'))
     pdt.assert_frame_equal(
         segmentation.extract_cell_dataframe(cell_labels,
                                             image,
                                             mode='circular_sectors',
                                             num_sectors=num_sectors),
         pd.concat((expected_from_labels, expected_from_circular_sectors),
                   axis=1))
class TestTableSchemaType:
    @pytest.mark.parametrize("int_type",
                             [int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_data(self, int_type):
        int_data = [1, 2, 3]
        assert as_json_table_type(np.array(int_data,
                                           dtype=int_type)) == "integer"

    @pytest.mark.parametrize("float_type",
                             [float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_data(self, float_type):
        float_data = [1.0, 2.0, 3.0]
        assert as_json_table_type(np.array(float_data,
                                           dtype=float_type)) == "number"

    @pytest.mark.parametrize("bool_type", [bool, np.bool])
    def test_as_json_table_type_bool_data(self, bool_type):
        bool_data = [True, False]
        assert as_json_table_type(np.array(bool_data,
                                           dtype=bool_type)) == "boolean"

    @pytest.mark.parametrize(
        "date_data",
        [
            pd.to_datetime(["2016"]),
            pd.to_datetime(["2016"], utc=True),
            pd.Series(pd.to_datetime(["2016"])),
            pd.Series(pd.to_datetime(["2016"], utc=True)),
            pd.period_range("2016", freq="A", periods=3),
        ],
    )
    def test_as_json_table_type_date_data(self, date_data):
        assert as_json_table_type(date_data) == "datetime"

    @pytest.mark.parametrize(
        "str_data",
        [pd.Series(["a", "b"]), pd.Index(["a", "b"])])
    def test_as_json_table_type_string_data(self, str_data):
        assert as_json_table_type(str_data) == "string"

    @pytest.mark.parametrize(
        "cat_data",
        [
            pd.Categorical(["a"]),
            pd.Categorical([1]),
            pd.Series(pd.Categorical([1])),
            pd.CategoricalIndex([1]),
            pd.Categorical([1]),
        ],
    )
    def test_as_json_table_type_categorical_data(self, cat_data):
        assert as_json_table_type(cat_data) == "any"

    # ------
    # dtypes
    # ------
    @pytest.mark.parametrize("int_dtype",
                             [int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_dtypes(self, int_dtype):
        assert as_json_table_type(int_dtype) == "integer"

    @pytest.mark.parametrize("float_dtype",
                             [float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_dtypes(self, float_dtype):
        assert as_json_table_type(float_dtype) == "number"

    @pytest.mark.parametrize("bool_dtype", [bool, np.bool])
    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
        assert as_json_table_type(bool_dtype) == "boolean"

    @pytest.mark.parametrize(
        "date_dtype",
        [
            np.datetime64,
            np.dtype("<M8[ns]"),
            PeriodDtype("D"),
            DatetimeTZDtype("ns", "US/Central"),
        ],
    )
    def test_as_json_table_type_date_dtypes(self, date_dtype):
        # TODO: datetime.date? datetime.time?
        assert as_json_table_type(date_dtype) == "datetime"

    @pytest.mark.parametrize("td_dtype", [np.timedelta64, np.dtype("<m8[ns]")])
    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
        assert as_json_table_type(td_dtype) == "duration"

    @pytest.mark.parametrize("str_dtype", [object])  # TODO
    def test_as_json_table_type_string_dtypes(self, str_dtype):
        assert as_json_table_type(str_dtype) == "string"

    def test_as_json_table_type_categorical_dtypes(self):
        # TODO: I think before is_categorical_dtype(Categorical)
        # returned True, but now it's False. Figure out why or
        # if it matters
        assert as_json_table_type(pd.Categorical(["a"])) == "any"
        assert as_json_table_type(CategoricalDtype()) == "any"
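
The dtype-level cases above can also be checked interactively; a small sketch, assuming the import path used by recent pandas releases (it may differ in older versions):

import numpy as np
from pandas.io.json._table_schema import as_json_table_type

assert as_json_table_type(np.dtype('int64')) == 'integer'
assert as_json_table_type(np.dtype('float64')) == 'number'
assert as_json_table_type(np.dtype('<M8[ns]')) == 'datetime'
assert as_json_table_type(np.dtype('<m8[ns]')) == 'duration'
assert as_json_table_type(np.dtype('object')) == 'string'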
 def test_read_json_table_orient(self, index_nm, vals, recwarn):
     df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
     out = df.to_json(orient="table")
     result = pd.read_json(out, orient="table")
     tm.assert_frame_equal(df, result)
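
A self-contained version of the same orient="table" round trip, with a made-up frame standing in for the index_nm/vals fixtures:

from io import StringIO

import pandas as pd

df = pd.DataFrame({'vals': range(4)}, index=pd.Index(range(4), name='idx'))
payload = df.to_json(orient='table')
result = pd.read_json(StringIO(payload), orient='table')
pd.testing.assert_frame_equal(df, result)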
def bureau_and_balance(file_path=file_path, nan_as_category=True):
    df_bureau_b = reduce_mem_usage(pd.read_csv(file_path + 'bureau_balance.csv'), verbose=False)

    # Some new features in bureau_balance set
    tmp = df_bureau_b[['SK_ID_BUREAU', 'STATUS']].groupby('SK_ID_BUREAU')
    tmp_last = tmp.last()
    tmp_last.columns = ['First_status']
    df_bureau_b = df_bureau_b.join(tmp_last, how='left', on='SK_ID_BUREAU')
    tmp_first = tmp.first()
    tmp_first.columns = ['Last_status']
    df_bureau_b = df_bureau_b.join(tmp_first, how='left', on='SK_ID_BUREAU')
    del tmp, tmp_first, tmp_last
    gc.collect()

    tmp = df_bureau_b[['SK_ID_BUREAU', 'MONTHS_BALANCE']].groupby('SK_ID_BUREAU').last()
    tmp = tmp.apply(abs)
    tmp.columns = ['Month']
    df_bureau_b = df_bureau_b.join(tmp, how='left', on='SK_ID_BUREAU')
    del tmp
    gc.collect()

    tmp = df_bureau_b.loc[df_bureau_b['STATUS'] == 'C', ['SK_ID_BUREAU', 'MONTHS_BALANCE']] \
        .groupby('SK_ID_BUREAU').last()
    tmp = tmp.apply(abs)
    tmp.columns = ['When_closed']
    df_bureau_b = df_bureau_b.join(tmp, how='left', on='SK_ID_BUREAU')
    del tmp
    gc.collect()

    df_bureau_b['Month_closed_to_end'] = df_bureau_b['Month'] - df_bureau_b['When_closed']

    for c in range(6):
        tmp = df_bureau_b.loc[df_bureau_b['STATUS'] == str(c), ['SK_ID_BUREAU', 'MONTHS_BALANCE']] \
            .groupby('SK_ID_BUREAU').count()
        tmp.columns = ['DPD_' + str(c) + '_cnt']
        df_bureau_b = df_bureau_b.join(tmp, how='left', on='SK_ID_BUREAU')
        df_bureau_b['DPD_' + str(c) + ' / Month'] = df_bureau_b['DPD_' + str(c) + '_cnt'] / df_bureau_b['Month']
        del tmp
        gc.collect()
    df_bureau_b['Non_zero_DPD_cnt'] = df_bureau_b[
        ['DPD_1_cnt', 'DPD_2_cnt', 'DPD_3_cnt', 'DPD_4_cnt', 'DPD_5_cnt']].sum(axis=1)

    df_bureau_b, bureau_b_cat = one_hot_encoder(df_bureau_b, nan_as_category)

    # Bureau balance: Perform aggregations
    aggregations = {}
    for col in df_bureau_b.columns:
        aggregations[col] = ['mean','sum'] if col in bureau_b_cat else ['min', 'max', 'size']
    df_bureau_b_agg = df_bureau_b.groupby('SK_ID_BUREAU').agg(aggregations)
    df_bureau_b_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_bureau_b_agg.columns.tolist()])
    del df_bureau_b
    gc.collect()

    df_bureau = reduce_mem_usage(pd.read_csv(file_path + 'bureau.csv'), verbose=False)

    # Replace/remove some outliers in bureau set

    # fill na
    df_bureau.loc[df_bureau['CREDIT_ACTIVE'] == 'Closed', ['AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT']] = \
        df_bureau[df_bureau['CREDIT_ACTIVE'] == 'Closed'][['AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT']].fillna(0)

    # credit sum = credit sum limit + credit sum debt
    df_bureau.loc[df_bureau['CREDIT_ACTIVE'] == 'Active', 'AMT_CREDIT_SUM_LIMIT'] = \
        df_bureau[df_bureau['CREDIT_ACTIVE'] == 'Active']['AMT_CREDIT_SUM'] - \
        df_bureau[df_bureau['CREDIT_ACTIVE'] == 'Active']['AMT_CREDIT_SUM_DEBT']

    df_bureau.loc[df_bureau['AMT_ANNUITY'] > .8e8, 'AMT_ANNUITY'] = np.nan
    df_bureau.loc[df_bureau['AMT_CREDIT_SUM'] > 3e8, 'AMT_CREDIT_SUM'] = np.nan
    df_bureau.loc[df_bureau['AMT_CREDIT_SUM_DEBT'] > 1e8, 'AMT_CREDIT_SUM_DEBT'] = np.nan
    df_bureau.loc[df_bureau['AMT_CREDIT_MAX_OVERDUE'] > .8e8, 'AMT_CREDIT_MAX_OVERDUE'] = np.nan
    df_bureau.loc[df_bureau['DAYS_ENDDATE_FACT'] < -10000, 'DAYS_ENDDATE_FACT'] = np.nan
    df_bureau.loc[(df_bureau['DAYS_CREDIT_UPDATE'] > 0) | (
            df_bureau['DAYS_CREDIT_UPDATE'] < -40000), 'DAYS_CREDIT_UPDATE'] = np.nan
    df_bureau.loc[df_bureau['DAYS_CREDIT_ENDDATE'] < -10000, 'DAYS_CREDIT_ENDDATE'] = np.nan

    df_bureau.drop(df_bureau[df_bureau['DAYS_ENDDATE_FACT'] < df_bureau['DAYS_CREDIT']].index, inplace=True)
    df_bureau.drop('CREDIT_CURRENCY',axis=1,inplace=True)


    # Some new features in bureau set
    df_bureau['bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_DEBT'] = df_bureau['AMT_CREDIT_SUM'] - df_bureau[
        'AMT_CREDIT_SUM_DEBT']
    df_bureau['bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_LIMIT'] = df_bureau['AMT_CREDIT_SUM'] - df_bureau[
        'AMT_CREDIT_SUM_LIMIT']
    df_bureau['bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_OVERDUE'] = df_bureau['AMT_CREDIT_SUM'] - df_bureau[
        'AMT_CREDIT_SUM_OVERDUE']

    df_bureau['bureau DAYS_CREDIT - CREDIT_DAY_OVERDUE'] = df_bureau['DAYS_CREDIT'] - df_bureau['CREDIT_DAY_OVERDUE']
    df_bureau['bureau DAYS_CREDIT - DAYS_CREDIT_ENDDATE'] = df_bureau['DAYS_CREDIT'] - df_bureau['DAYS_CREDIT_ENDDATE']
    df_bureau['bureau DAYS_CREDIT - DAYS_ENDDATE_FACT'] = df_bureau['DAYS_CREDIT'] - df_bureau['DAYS_ENDDATE_FACT']
    df_bureau['bureau DAYS_CREDIT_ENDDATE - DAYS_ENDDATE_FACT'] = df_bureau['DAYS_CREDIT_ENDDATE'] - df_bureau[
        'DAYS_ENDDATE_FACT']
    df_bureau['bureau DAYS_CREDIT_UPDATE - DAYS_CREDIT_ENDDATE'] = df_bureau['DAYS_CREDIT_UPDATE'] - df_bureau[
        'DAYS_CREDIT_ENDDATE']

    df_bureau['FLAG_overdue'] = df_bureau['AMT_CREDIT_SUM_OVERDUE'].apply(lambda x: 1 if x > 0 else 0)

    # replace high correlation column and low variance column

    # Categorical features with One-Hot encode
    df_bureau['CREDIT_TYPE'] = df_bureau['CREDIT_TYPE'].apply(
        lambda x: x if x in ['Consumer credit', 'Credit card'] else 'other')
    df_bureau['CREDIT_ACTIVE'] = df_bureau['CREDIT_ACTIVE'].apply(
        lambda x: x if x in ['Closed', 'Active'] else 'other')
    df_bureau, bureau_cat = one_hot_encoder(df_bureau, nan_as_category)

    # Bureau balance: merge with bureau.csv
    df_bureau = df_bureau.join(df_bureau_b_agg, how='left', on='SK_ID_BUREAU')
    df_bureau.drop('SK_ID_BUREAU', axis=1, inplace=True)
    del df_bureau_b_agg
    gc.collect()

    # Bureau and bureau_balance aggregations for application set
    categorical = bureau_cat + bureau_b_cat
    aggregations = {}
    for col in df_bureau.columns:
        aggregations[col] = ['mean','sum'] if col in categorical else ['min', 'max', 'size', 'mean', 'var', 'sum']
    df_bureau_agg = df_bureau.groupby('SK_ID_CURR').agg(aggregations)
    df_bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in df_bureau_agg.columns.tolist()])

    # Bureau: Active credits
    active_agg = df_bureau[df_bureau['CREDIT_ACTIVE_Active'] == 1].groupby('SK_ID_CURR').agg(aggregations)
    active_agg.columns = pd.Index(['BURO_ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    df_bureau_agg = df_bureau_agg.join(active_agg, how='left')
    del active_agg
    gc.collect()

    # Bureau: Closed credits
    closed_agg = df_bureau[df_bureau['CREDIT_ACTIVE_Closed'] == 1].groupby('SK_ID_CURR').agg(aggregations)
    closed_agg.columns = pd.Index(['BURO_CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    df_bureau_agg = df_bureau_agg.join(closed_agg, how='left')

    # bureau: AMT_ANNUITY is missing (NaN)
    annuity_0_agg = df_bureau[df_bureau['AMT_ANNUITY'].isnull()].groupby('SK_ID_CURR').agg(aggregations)
    annuity_0_agg.columns = pd.Index(['BURO_annuity_0_' + e[0] + "_" + e[1].upper() for e in annuity_0_agg.columns.tolist()])
    df_bureau_agg = df_bureau_agg.join(annuity_0_agg, how='left')

    # bureau: amt annuity >0
    annuity_non_0_agg = df_bureau[df_bureau['AMT_ANNUITY']>0].groupby('SK_ID_CURR').agg(aggregations)
    annuity_non_0_agg.columns = pd.Index(
        ['BURO_annuity_non_0_' + e[0] + "_" + e[1].upper() for e in annuity_non_0_agg.columns.tolist()])
    df_bureau_agg = df_bureau_agg.join(annuity_non_0_agg, how='left')

    del closed_agg, df_bureau
    gc.collect()

    return reduce_mem_usage(df_bureau_agg)
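
Both bureau_and_balance above and credit_card_balance below call a one_hot_encoder helper that is not included in this snippet. A sketch of what such a helper commonly looks like in Home Credit kernels, under the assumption that it one-hot encodes object columns and returns the new dummy column names (not necessarily the author's exact code):

import pandas as pd

def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode object columns; return the frame and the new dummy columns."""
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns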
def credit_card_balance(file_path=file_path, nan_as_category=True):
    df_card = pd.read_csv(file_path + 'credit_card_balance.csv')

    # Replace some outliers
    df_card.loc[df_card['AMT_PAYMENT_CURRENT'] > 4000000, 'AMT_PAYMENT_CURRENT'] = np.nan
    df_card.loc[df_card['AMT_CREDIT_LIMIT_ACTUAL'] > 1000000, 'AMT_CREDIT_LIMIT_ACTUAL'] = np.nan

    # Some new features
    df_card['card missing'] = df_card.isnull().sum(axis=1).values
    df_card['card SK_DPD - MONTHS_BALANCE'] = df_card['SK_DPD'] - df_card['MONTHS_BALANCE']
    df_card['card SK_DPD_DEF - MONTHS_BALANCE'] = df_card['SK_DPD_DEF'] - df_card['MONTHS_BALANCE']
    df_card['card SK_DPD - SK_DPD_DEF'] = df_card['SK_DPD'] - df_card['SK_DPD_DEF']

    df_card['card AMT_TOTAL_RECEIVABLE - AMT_RECIVABLE'] = df_card['AMT_TOTAL_RECEIVABLE'] - df_card['AMT_RECIVABLE']
    df_card['card AMT_TOTAL_RECEIVABLE - AMT_RECEIVABLE_PRINCIPAL'] = df_card['AMT_TOTAL_RECEIVABLE'] - df_card[
        'AMT_RECEIVABLE_PRINCIPAL']
    df_card['card AMT_RECIVABLE - AMT_RECEIVABLE_PRINCIPAL'] = df_card['AMT_RECIVABLE'] - df_card[
        'AMT_RECEIVABLE_PRINCIPAL']

    df_card['card AMT_BALANCE - AMT_RECIVABLE'] = df_card['AMT_BALANCE'] - df_card['AMT_RECIVABLE']
    df_card['card AMT_BALANCE - AMT_RECEIVABLE_PRINCIPAL'] = df_card['AMT_BALANCE'] - df_card[
        'AMT_RECEIVABLE_PRINCIPAL']
    df_card['card AMT_BALANCE - AMT_TOTAL_RECEIVABLE'] = df_card['AMT_BALANCE'] - df_card['AMT_TOTAL_RECEIVABLE']

    df_card['card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_ATM_CURRENT'] = df_card['AMT_DRAWINGS_CURRENT'] - df_card[
        'AMT_DRAWINGS_ATM_CURRENT']
    df_card['card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_OTHER_CURRENT'] = df_card['AMT_DRAWINGS_CURRENT'] - df_card[
        'AMT_DRAWINGS_OTHER_CURRENT']
    df_card['card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_POS_CURRENT'] = df_card['AMT_DRAWINGS_CURRENT'] - df_card[
        'AMT_DRAWINGS_POS_CURRENT']

    df_card['AMT_PAYMENT_CURRENT - AMT_PAYMENT_TOTAL_CURRENT'] = df_card['AMT_PAYMENT_CURRENT'] - df_card['AMT_PAYMENT_TOTAL_CURRENT']

    df_card['SK_DPD * AMT OBERDUE'] = df_card['SK_DPD'] * (df_card['AMT_INST_MIN_REGULARITY'] - df_card['AMT_PAYMENT_CURRENT'])

    df_card['available credit'] = df_card['AMT_CREDIT_LIMIT_ACTUAL'] - df_card['AMT_BALANCE']

    df_card = df_card.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'])

    # Categorical features with One-Hot encode
    df_card, categorical = one_hot_encoder(df_card, nan_as_category)

    # Aggregations for application set
    aggregations = {}
    for col in df_card.columns:
        aggregations[col] = ['mean','sum'] if col in categorical else ['min', 'max', 'size', 'mean', 'var', 'sum']
    df_card_agg = df_card.groupby('SK_ID_CURR').agg(aggregations)
    df_card_agg.columns = pd.Index(['CARD_total_' + e[0] + "_" + e[1].upper() for e in df_card_agg.columns.tolist()])

    df_card_agg['CARD_total avg  DRAWING'] = df_card_agg['CARD_total_AMT_DRAWINGS_CURRENT_SUM']/ df_card_agg['CARD_total_CNT_DRAWINGS_CURRENT_SUM']
    df_card_agg['CARD_total avg  OTHER DRAWING'] = df_card_agg['CARD_total_AMT_DRAWINGS_OTHER_CURRENT_SUM'] / df_card_agg['CARD_total_CNT_DRAWINGS_OTHER_CURRENT_SUM']
    df_card_agg['CARD_total avg  ATM DRAWING'] = df_card_agg['CARD_total_AMT_DRAWINGS_ATM_CURRENT_SUM'] / df_card_agg['CARD_total_CNT_DRAWINGS_ATM_CURRENT_SUM']
    df_card_agg['CARD_total avg  POS DRAWING'] = df_card_agg['CARD_total_AMT_DRAWINGS_POS_CURRENT_SUM'] / df_card_agg['CARD_total_CNT_DRAWINGS_POS_CURRENT_SUM']


    # aggregations when credit card is used amt drawing >0
    aggregations = {}
    for col in df_card.columns:
        aggregations[col] = ['mean','sum'] if col in categorical else ['min', 'max', 'size', 'mean', 'var', 'sum']
    df_card_used_agg = df_card[df_card['AMT_DRAWINGS_ATM_CURRENT'] > 0].groupby('SK_ID_CURR').agg(aggregations)
    df_card_used_agg.columns = pd.Index(['CARD_used_' + e[0] + "_" + e[1].upper() for e in df_card_used_agg.columns.tolist()])

    df_card_used_agg['CARD_used avg  DRAWING'] = df_card_used_agg['CARD_used_AMT_DRAWINGS_CURRENT_SUM']/ df_card_used_agg['CARD_used_CNT_DRAWINGS_CURRENT_SUM']
    df_card_used_agg['CARD_used avg  OTHER DRAWING'] = df_card_used_agg['CARD_used_AMT_DRAWINGS_OTHER_CURRENT_SUM'] / df_card_used_agg['CARD_used_CNT_DRAWINGS_OTHER_CURRENT_SUM']
    df_card_used_agg['CARD_used avg  ATM DRAWING'] = df_card_used_agg['CARD_used_AMT_DRAWINGS_ATM_CURRENT_SUM'] / df_card_used_agg['CARD_used_CNT_DRAWINGS_ATM_CURRENT_SUM']
    df_card_used_agg['CARD_used avg  POS DRAWING'] = df_card_used_agg['CARD_used_AMT_DRAWINGS_POS_CURRENT_SUM'] / df_card_used_agg['CARD_used_CNT_DRAWINGS_POS_CURRENT_SUM']

    df_card_agg=df_card_agg.join(df_card_used_agg)



    # Count credit card lines
    df_card_agg['CARD_COUNT'] = df_card.groupby('SK_ID_CURR').size()
    df_card_agg['CARD_USED_COUNT']= df_card[df_card['AMT_DRAWINGS_ATM_CURRENT'].notnull()].groupby('SK_ID_CURR').size()

    # total balance
    latest_balance = df_card[['SK_ID_CURR','SK_ID_PREV','MONTHS_BALANCE','AMT_BALANCE']].groupby('SK_ID_PREV').last()
    latest_balance.columns = ['SK_ID_CURR', 'MONTHS_BALANCE', 'CARD_total balance']
    total_latest_balance = latest_balance.groupby('SK_ID_CURR').sum()['CARD_total balance']
    df_card_agg = df_card_agg.join(total_latest_balance)

    del df_card, latest_balance, total_latest_balance
    gc.collect()

    return reduce_mem_usage(df_card_agg)
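
reduce_mem_usage is likewise defined outside this snippet; a common shape for it is to downcast numeric columns to the smallest dtype that holds their values (again a hedged sketch of an assumed helper, not the author's code):

import pandas as pd

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to reduce memory usage."""
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')
    if verbose:
        end_mem = df.memory_usage().sum() / 1024 ** 2
        print(f'Memory usage reduced from {start_mem:.2f} MB to {end_mem:.2f} MB')
    return df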
Example #8
    def _read(cls, sql, con, index_col=None, **kwargs):
        """
        Read a SQL query or database table into a query compiler.

        Parameters
        ----------
        sql : str or SQLAlchemy Selectable (select or text object)
            SQL query to be executed or a table name.
        con : SQLAlchemy connectable, str, sqlite3 connection, or ModinDatabaseConnection
            Connection object to database.
        index_col : str or list of str, optional
            Column(s) to set as index(MultiIndex).
        **kwargs : dict
            Parameters to pass into `pandas.read_sql` function.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        if isinstance(con, str):
            con = ModinDatabaseConnection("sqlalchemy", con)
        if not isinstance(con, ModinDatabaseConnection):
            warnings.warn(
                "To use parallel implementation of `read_sql`, pass either " +
                "the SQL connection string or a ModinDatabaseConnection " +
                "with the arguments required to make a connection, instead " +
                f"of {type(con)}. For documentation of ModinDatabaseConnection, see "
                +
                "https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html#connecting-to-a-database-for-read-sql"
            )
            return cls.single_worker_read(
                sql,
                con=con,
                index_col=index_col,
                read_sql_engine=ReadSqlEngine.get(),
                **kwargs,
            )
        row_count_query = con.row_count_query(sql)
        connection_for_pandas = con.get_connection()
        colum_names_query = con.column_names_query(sql)
        row_cnt = pandas.read_sql(row_count_query,
                                  connection_for_pandas).squeeze()
        cols_names_df = pandas.read_sql(colum_names_query,
                                        connection_for_pandas,
                                        index_col=index_col)
        cols_names = cols_names_df.columns
        num_partitions = NPartitions.get()
        partition_ids = [None] * num_partitions
        index_ids = [None] * num_partitions
        dtypes_ids = [None] * num_partitions
        limit = math.ceil(row_cnt / num_partitions)
        for part in range(num_partitions):
            offset = part * limit
            query = con.partition_query(sql, limit, offset)
            *partition_ids[part], index_ids[part], dtypes_ids[
                part] = cls.deploy(
                    cls.parse,
                    num_returns=num_partitions + 2,
                    num_splits=num_partitions,
                    sql=query,
                    con=con,
                    index_col=index_col,
                    read_sql_engine=ReadSqlEngine.get(),
                    **kwargs,
                )
            partition_ids[part] = [
                cls.frame_partition_cls(obj) for obj in partition_ids[part]
            ]
        if index_col is None:  # sum all lens returned from partitions
            index_lens = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(index_lens))
        else:  # concat index returned from partitions
            index_lst = [
                x for part_index in cls.materialize(index_ids)
                for x in part_index
            ]
            new_index = pandas.Index(index_lst).set_names(index_col)
        new_frame = cls.frame_cls(np.array(partition_ids), new_index,
                                  cols_names)
        new_frame.synchronize_labels(axis=0)
        return cls.query_compiler_cls(new_frame)
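
From user code this classmethod is normally reached through modin.pandas.read_sql. Passing a plain connection string (rather than an open connection) lets Modin build a ModinDatabaseConnection and read the partitions in parallel; a short usage sketch with placeholder database URL, query, and index column:

import modin.pandas as mpd

df = mpd.read_sql(
    'SELECT * FROM some_table',   # placeholder query
    'sqlite:///example.db',       # plain string -> wrapped in ModinDatabaseConnection
    index_col='id',               # placeholder index column
)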
Example #9
pa = pd.read_pickle('preprocess/edge/p_a_before284_delete_author.pkl')
pa_extra = pd.read_pickle('preprocess/edge/p_a_delete_author.pkl')
pp = pd.read_pickle('preprocess/edge/paper_paper.pkl')
pp['new_cited_papr_id'] = pp['new_cited_papr_id'].astype(int).astype(str)
# Build each paper's reference list from pp
paper_refs = pp.groupby(['new_papr_id'])['new_cited_papr_id'].agg([','.join]).reset_index()
# Keep only papers with more than 20 references; only these are used to compute MAP
paper_refs = paper_refs[paper_refs.new_papr_id.isin(pp[pp.groupby(['new_papr_id'])['year'].transform('count') > 20].new_papr_id.value_counts().index.tolist())]

# https://stackoverflow.com/questions/20067636/pandas-dataframe-get-first-row-of-each-group
# dblp_top50_conf['new_first_aId'] = pa.groupby('new_papr_id').first()['new_author_id']  # take the first author of each paper
dblp_top50_conf['authors'] = pa.groupby('new_papr_id')['new_author_id'].apply(list)  # groupby element to list
dblp_top50_conf['references'] = dblp_top50[dblp_top50['new_papr_id'].isin(dblp_top50_conf['new_papr_id'].values)]['references']
dblp_top50_conf.dropna(subset=['authors'], inplace=True)  # drop empty author papers
# Expand into multiple training rows, one per author
dblp_top50_conf = pd.DataFrame([np.append(row.values, d) for _, row in dblp_top50_conf.iterrows() for d in row['authors']], columns=dblp_top50_conf.columns.append(pd.Index(['new_first_aId'])))
# Use everything before 2018 as the training set
train2017 = dblp_top50_conf.loc[dblp_top50_conf.time_step < 284, ['new_papr_id', 'new_venue_id', 'new_first_aId', 'references']]

# dblp_top50_test['new_first_aId'] = pa_extra.groupby('new_papr_id').first()['new_author_id']  # remove papers without authors
dblp_top50_test['authors'] = pa_extra.groupby('new_papr_id')['new_author_id'].apply(list)  # groupby element to list
dblp_top50_test['references'] = dblp_top50[dblp_top50['new_papr_id'].isin(dblp_top50_test['new_papr_id'].values)]['references']
dblp_top50_test.dropna(subset=['authors'], inplace=True)  # drop empty author papers
dblp_top50_test = pd.DataFrame([np.append(row.values, d) for _, row in dblp_top50_test.iterrows() for d in row['authors']], columns=dblp_top50_test.columns.append(pd.Index(['new_first_aId'])))

# Load BERT embeddings
titles = pd.read_pickle('preprocess/edge/titles_bert.pkl')
abstracts = pd.read_pickle('preprocess/edge/abstracts_bert.pkl')
# normalize column/ feature
titles = preprocessing.scale(np.array(titles.tolist()))
abstracts = preprocessing.scale(np.array(abstracts.tolist()))
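
The groupby-to-list idiom used twice above (groupby('new_papr_id')['new_author_id'].apply(list)) collapses each group's rows into a single Python list; a tiny illustration with made-up IDs:

import pandas as pd

pa_demo = pd.DataFrame({'new_papr_id': [1, 1, 2],
                        'new_author_id': [10, 11, 12]})
print(pa_demo.groupby('new_papr_id')['new_author_id'].apply(list))
# new_papr_id
# 1    [10, 11]
# 2        [12]
# Name: new_author_id, dtype: object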
Example #10
    def dispatch(self, event):  # {
        # RE-INSTANTIATE GLOBALS
        global isEOD, idx_list, idx
        event_str = str(event.event_type)
        event_path = Path(event.src_path)
        print("\t\te=" + str(event))
        print("\t\ttype=" + str(event.event_type))
        print("\t\tsrc_path=" + str(event.src_path))
        # CHECK AND PERFORM ON EVENT_STR
        ##################################################################
        if event_str == "created":  # {
            # TRY THE FOLLOWING:
            try:  # {
                print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                ts = pd.Timestamp.now()  # CREATE TIME STAMP
                print("| CREATED >>> " + str(ts))
                # CREATE EVENT PATH VAR
                the_event_path = Path(event.src_path)  # WAS: the_event_path
                # APPEND TO INDEX LIST
                idx_list.append(str(event.src_path))
                # CREATE TEMPORARY INDEX IN ORDER TO APPEND
                temp_idx = pd.Index(data=idx_list, dtype=str)
                # APPEND TO "CREATION" INDEX
                self.index.append(temp_idx)
                # print INDEX
                print("INDEX --> \n" + str(self.index))
                print("\t\tEVENT_PATH=" + str(the_event_path))
                # CREATE 'file_name' VAR
                file_name = os.path.basename(the_event_path)
                print("\t\tFILE_NAME=" + str(file_name))
                # CHECK AND SEE IF FILE IS OF TYPE .PDF
                if fnmatch.fnmatch(file_name, "*.pdf"):  # {
                    # CREATE NEW FILE NAME CONV
                    file_name_conv = generate_naming_convention(file_name)

                    # CREATE PATH VARIABLES FOR FILE MOVING PROCEDURES
                    new_path = os.path.join(self.out_directory, file_name_conv)
                    ###############################################
                    # CREATE/COPY WATERMARK TO DESTINATION FOLDER #
                    ###############################################
                    create_watermark(input_pdf=the_event_path,
                                     output=new_path,
                                     watermark=in_file)
                    # CREATE EVENT ITEM FOR LIST
                    event_list = [str(file_name_conv),
                                  str(ts)]  # WAS: (created_str)
                    # APPEND TO DATAFRAME
                    self.save_dataframe = append_to_dataframe(
                        the_event_list=event_list,
                        dataframe_to_append=self.save_dataframe)
                    # print DATAFRAME
                    print(self.save_dataframe.tail(8))
                # }
                else:  # {
                    print("NON-PDF CREATED AT " + str(ts))
                # }
            # }
            except:  # {
                errorMessage = str(sys.exc_info()[0]) + "\n\t\t"
                errorMessage = errorMessage + str(sys.exc_info()[1]) + "\n\t\t"
                errorMessage = errorMessage + str(sys.exc_info()[2]) + "\n"
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                typeE = str("TYPE : " + str(exc_type))
                fileE = str("FILE : " + str(fname))
                lineE = str("LINE : " + str(exc_tb.tb_lineno))
                messageE = str("MESG : " + "\n" + str(errorMessage) + "\n")
                logging.error("\n" + typeE + "\n" + fileE + "\n" + lineE +
                              "\n" + messageE)
            # }
            else:  # {
                print("SUCCESS! VERY NICE!")
            # }
            finally:  # {
                # CREATE END-TIME VAR
                time_end = pd.Timestamp.now()
                # DETERMINE OVERALL RUN-TIME
                run_time = pd.Timedelta(time_end - time_start)
                # print TOTAL RUNTIME
                print("\t\t[Created-Event] >>> time_alloted: " + str(run_time))
            # }
        # }
        elif event_str == "modified":  #{
            # TRY THE FOLLOWING
            try:  # {
                print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
                ts = pd.Timestamp.now()  # CREATE TIME STAMP
                print("| MODIFIED >>> " + str(ts))
                # print INDEX
                print("INDEX --> \n" + str(self.index))
                # CREATE EVENT PATH VAR
                the_event_path = Path(event.src_path)  # WAS: the_event_path
                print("\t\tEVENT_PATH=" + str(the_event_path))
                # CREATE 'file_name' VAR
                file_name = os.path.basename(the_event_path)
                print("\t\tFILE_NAME=" + str(file_name))
                # CHECK AND SEE IF FILE IS OF TYPE .PDF
                if fnmatch.fnmatch(file_name, "*.pdf"):  # {
                    # CREATE NEW FILE NAME CONV
                    file_name_conv = generate_naming_convention(file_name)
                    # CREATE EVENT ITEM FOR LIST
                    event_list = [str(file_name_conv),
                                  str(ts)]  # WAS: (created_str)
                    print("\n EVENT_LIST : \n" + str(event_list))
                    # IF THE EVENT_PATH IS ALREADY IN INDEX
                    if str(the_event_path) in self.index:  # {
                        print(
                            "ALREADY IN INDEX... THEN WE CAN **INDEED** APPEND\n\n\n"
                        )
                        # APPEND TO DATAFRAME
                        self.save_dataframe = append_to_dataframe(
                            the_event_list=event_list,
                            dataframe_to_append=self.save_dataframe)
                        # print DATAFRAME
                        print(self.save_dataframe)
                    # }
                    else:  # {
                        print(
                            "NOT IN INDEX... NOT CREATED TODAY... SO WE SKIP APPENDING..."
                        )
                        # BUT WE STILL WATERMARK?
                        """
                        # CREATE PATH VARIABLES FOR FILE MOVING PROCEDURES
                        new_path = os.path.join(self.out_directory, file_name_conv)
                        # 08/28/2019 - REMOVED BECAUSE WE DONT NEED TO WATERMARK SO MANY TIMES
                        # JUST KEEPING THE MODIFIED TIMESTAMP AND APPENDING TO DATAFRAME
                        # CREATE/COPY WATERMARK TO DESTINATION FOLDER
                        create_watermark(input_pdf=the_event_path,
                                         output=new_path,
                                         watermark=in_file)
                        """
                    # }
                # }
                else:  # {
                    print("NON-PDF MODIFIED AT " + str(ts))
                # }
            # }
            except:  # {
                errorMessage = str(sys.exc_info()[0]) + "\n\t\t"
                errorMessage = errorMessage + str(sys.exc_info()[1]) + "\n\t\t"
                errorMessage = errorMessage + str(sys.exc_info()[2]) + "\n"
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                typeE = str("TYPE : " + str(exc_type))
                fileE = str("FILE : " + str(fname))
                lineE = str("LINE : " + str(exc_tb.tb_lineno))
                messageE = str("MESG : " + "\n" + str(errorMessage) + "\n")
                logging.error("\n" + typeE + "\n" + fileE + "\n" + lineE +
                              "\n" + messageE)
            # }
            else:  # {
                print("SUCCESS! VERY NICE!")
            # }
            finally:  # {
                # CREATE END-TIME VAR
                time_end = pd.Timestamp.now()
                # DETERMINE OVERALL RUN-TIME
                run_time = pd.Timedelta(time_end - time_start)
                # print TOTAL RUNTIME
                print("\t\t[MODIFIED-Event] >>> time_alloted: " +
                      str(run_time))
            # }
        # }
        ##########################################################
        # TRY THE FOLLOWING:
        """
Example #11
def assert_array_index_eq(left, right):
    """left and right are equal, treating index and array as equivalent"""
    assert_eq(left,
              pd.Index(right) if isinstance(right, np.ndarray) else right)
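
Example call (values are made up, and assert_eq is assumed to be the comparison helper from the surrounding test utilities); an ndarray on the right-hand side is wrapped in a pandas Index before the comparison:

import numpy as np
import pandas as pd

assert_array_index_eq(pd.Index([1, 2, 3]), np.array([1, 2, 3]))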
Example #12
 in_directory = "C:/Temp/F/APPS/CofA/"  # "F:/APPS/CofA/"
 out_directory = "C:/Temp/G/C of A's/#Email Node/"  #"G:/C of A's/#Email Node/"
 outbound_directory = "C:/data/outbound/CofA/"
 in_file = "C:/data/inbound/Agilent_CofA_Letterhead_03-21-19.pdf"
 # CREATE FILE_NAME STR USING TODAYS DATE CONVENTION
 out_file_dir = "C:/data/outbound/CofA/"  # WAS: "C:/data/inbound/"
 # FILENAME FOR DATAFRAME THAT WAS CREATED VIA "CofA_Event_Hanlder" CLASS
 out_file_str_1 = str("CofA_Email_Node_list_" + time_today + "_F_watch.csv")
 # FILENAME FOR DATAFRAME THAT WAS CREATED VIA "set_diff_df"
 out_file_str_2 = str("CofA_Email_Node_list_" + time_today + "_F_pull.csv")
 df_save_list = pd.DataFrame(data=None, columns=['CofA'])
 isEOD = False
 # CREATE IDX_LIST TO BE INSERTED INSIDE INDEX
 idx_list = os.listdir(out_directory)
 # INDEX VARIABLE TO HOLD CREATION LIST
 idx = pd.Index(data=idx_list, dtype=str)
 # CREATE OBSERVER VARIABLE FOR WATCHDOG EVENT HANDLER
 observer = Observer()
 # CREATE TIMER VARIABLE // FIRES end_of_day AFTER 34,200 SECONDS (~9.5 HOURS)
 t = Timer(34200, end_of_day)
 # START TIMER
 t.start()
 # CREATE INSTANCE OF CUSTOM EVENT HANDLER
 event_handler = CofA_Event_Handler(idx, df_save_list, out_directory,
                                    observer)
 observer.schedule(event_handler=event_handler,
                   path=in_directory,
                   recursive=True)
 observer.start()
 # TRY THE FOLLOWING
 try:  # {
Example #13
    def get_groupby_pool_in_out(self,
                                period: int = None,
                                shift: int = 0,
                                figsize=(30, 8),
                                heatmap_rotation1=False,
                                heatmap_rotation2=False,
                                annot_fontsize1=30,
                                annot_fontsize2=30) -> None:
        """
        获取商品池进出每个组的情况,返回

        Returns
        -------

        """
        self.signal.set_factor_data(self.factor.factor_value)
        self.signal.set_commodity_pool(
            self.commodity_pool.commodity_pool_value)

        # adjust group_num
        group_num: int = self.get_params()['group_num']
        if not period:
            period: int = self.get_params()['period']
        self.signal.set_params(group_num=group_num)

        signal_df: DataFrame = self.signal.transform()
        # avg_group_in_pct_dict = {}
        # avg_group_out_pct_dict = {}
        # for shift in range(period):
        #     index = pd.Index(range(len(signal_df)))
        #     index = index[(index - shift) % period == 0]
        #     new_signal_df = signal_df.copy()
        #     new_signal_df = new_signal_df.iloc[index]
        #
        #     if self.signal.__class__.group_signal_type == GroupSignalType.AllGroupSignal:
        #         other_group_list = [-2, -1, 0]
        #     else:
        #         other_group_list = [-3, -2, -1, 0]
        #
        #     group_in_num_dict = defaultdict(dict)
        #     group_out_num_dict = defaultdict(dict)
        #
        #     for i in range(1, group_num+1):
        #         for j in other_group_list:
        #             in_df: DataFrame = pd.DataFrame(data=False,
        #                             index=new_signal_df.index,
        #                             columns=new_signal_df.columns)
        #             out_df: DataFrame = pd.DataFrame(data=False,
        #                             index=new_signal_df.index,
        #                             columns=new_signal_df.columns)
        #             # track entries (previous state j -> group i)
        #             in_df[(new_signal_df.shift(1)==j)&(new_signal_df==i)] = True
        #             if j != 0:
        #                 pass
        #             else:
        #                 in_df.loc[new_signal_df.index[0], new_signal_df.iloc[0]==i] = True
        #
        #             # track exits (group i -> state j)
        #             out_df[(new_signal_df.shift(1)==i)&(new_signal_df==j)] = True
        #
        #             group_in_num_dict[j][i] = in_df.sum().sum()
        #             group_out_num_dict[i][j] = out_df.sum().sum()
        #
        #     group_in_num_df: DataFrame = pd.DataFrame(group_in_num_dict)
        #     group_out_num_df: DataFrame = pd.DataFrame(group_out_num_dict)
        #     group_in_pct_df: DataFrame = group_in_num_df / group_in_num_df.sum(axis=0)
        #     group_out_pct_df: DataFrame = group_out_num_df / group_out_num_df.sum(axis=0)
        #
        #     avg_group_in_pct_dict[shift] = group_in_pct_df
        #     avg_group_out_pct_dict[shift] = group_out_pct_df
        #
        # avg_group_in_pct_df = pd.DataFrame()
        # avg_group_out_pct_df = pd.DataFrame()
        # for i in avg_group_in_pct_dict:
        #     if i == 0:
        #         avg_group_in_pct_df = avg_group_in_pct_dict[i]
        #         avg_group_out_pct_df = avg_group_out_pct_dict[i]
        #     else:
        #         avg_group_in_pct_df += avg_group_in_pct_dict[i]
        #         avg_group_out_pct_df += avg_group_out_pct_dict[i]
        # avg_group_in_pct_df = avg_group_in_pct_df / len(avg_group_in_pct_dict)
        # avg_group_out_pct_df = avg_group_out_pct_df / len(avg_group_out_pct_dict)

        index = pd.Series(range(len(signal_df)))
        index = index[((index - float(shift)) % float(period)) == 0]
        index = pd.Index(index.values.tolist())
        new_signal_df = signal_df.copy()
        new_signal_df = new_signal_df.iloc[index]

        if self.signal.__class__.group_signal_type == GroupSignalType.AllGroupSignal:
            other_group_list = [-2, -1, 0]
        else:
            other_group_list = [-3, -2, -1, 0]

        group_in_num_dict = defaultdict(dict)
        group_out_num_dict = defaultdict(dict)

        for i in range(1, group_num + 1):
            for j in other_group_list:
                in_df: DataFrame = pd.DataFrame(data=False,
                                                index=new_signal_df.index,
                                                columns=new_signal_df.columns)
                out_df: DataFrame = pd.DataFrame(data=False,
                                                 index=new_signal_df.index,
                                                 columns=new_signal_df.columns)
                # track entries (previous state j -> group i)
                in_df[(new_signal_df.shift(1) == j)
                      & (new_signal_df == i)] = True
                if j != 0:
                    pass
                else:
                    in_df.loc[new_signal_df.index[0],
                              new_signal_df.iloc[0] == i] = True

                # track exits (group i -> state j)
                out_df[(new_signal_df.shift(1) == i)
                       & (new_signal_df == j)] = True

                group_in_num_dict[j][i] = in_df.sum().sum()
                group_out_num_dict[i][j] = out_df.sum().sum()

        group_in_num_df: DataFrame = pd.DataFrame(group_in_num_dict)
        group_out_num_df: DataFrame = pd.DataFrame(group_out_num_dict)
        cond_avg_group_in_pct_df: DataFrame = group_in_num_df / group_in_num_df.sum(
            axis=0)
        cond_avg_group_out_pct_df: DataFrame = group_out_num_df / group_out_num_df.sum(
            axis=0)

        # count transitions from states -2, -1, 0 into the groups
        start_in_pct_series: Series = group_in_num_df.sum(
            axis=0) / group_in_num_df.sum(axis=0).sum()

        # count transitions from held groups into states 0, -1, -2
        start_out_pct_series: Series = group_out_num_df.sum(
            axis=0) / group_out_num_df.sum(axis=0).sum()

        uncond_avg_group_in_pct_df = cond_avg_group_in_pct_df * start_in_pct_series
        uncond_avg_group_out_pct_df = cond_avg_group_out_pct_df * start_out_pct_series

        # total number of days
        total_days_num = len(new_signal_df)

        # total number of entries
        total_in_num = int(group_in_num_df.sum().sum())

        # total number of exits
        total_out_num = int(group_out_num_df.sum().sum())

        # overall entry ratio
        total_in_pct = (total_in_num / group_num) / total_days_num

        # overall exit ratio
        total_out_pct = (total_out_num / group_num) / total_days_num

        # first figure
        fig, axes = plt.subplots(figsize=figsize, nrows=1, ncols=3)
        # first subplot
        if heatmap_rotation1:
            cond_avg_group_in_pct_df = cond_avg_group_in_pct_df.T
        sns.heatmap(data=np.round(cond_avg_group_in_pct_df, 2),
                    vmin=0,
                    vmax=1,
                    annot=True,
                    annot_kws={'fontsize': annot_fontsize1},
                    ax=axes[0])
        axes[0].set_title("cond commodity in pct", fontsize=25)
        axes[0].tick_params(axis='both', labelsize=30)
        # second subplot
        # start_in_pct_series.plot.bar(ax=axes[1])
        # if len(start_in_pct_series) == 3:
        #     for x, y in start_in_pct_series.to_dict().items():
        #         axes[1].text(x+1.7, y, np.round(y, 2), fontdict={'fontsize': 30})
        # elif len(start_in_pct_series) == 4:
        #     for x, y in start_in_pct_series.to_dict().items():
        #         axes[1].text(x+2.7, y, np.round(y, 2), fontdict={'fontsize': 30})
        # second subplot
        labels = start_in_pct_series.sort_index().index.tolist()
        start_in_pct_series.index = start_in_pct_series.index - start_in_pct_series.index.min(
        ) + 1
        start_in_pct_series.plot.bar(ax=axes[1])
        for i, x in enumerate(start_in_pct_series.tolist()):
            axes[1].text(i - 0.1, x, np.round(x, 2), fontdict={'fontsize': 30})
        axes[1].set_xticklabels(labels=labels, fontsize=25)
        # third subplot
        if heatmap_rotation1:
            uncond_avg_group_in_pct_df = uncond_avg_group_in_pct_df.T
        sns.heatmap(data=np.round(uncond_avg_group_in_pct_df, 2),
                    vmin=0,
                    vmax=1,
                    annot=True,
                    annot_kws={'fontsize': annot_fontsize1},
                    ax=axes[2])
        axes[2].set_title("uncond commodity in pct", fontsize=25)
        axes[2].tick_params(axis='both', labelsize=30)
        # summary
        fig.suptitle(
            f"group commodity in info 总次数={total_in_num} 组数={group_num} 总天数={total_days_num} 比例={round(total_in_num/group_num/total_days_num,3)}",
            fontsize=36)
        fig.subplots_adjust(wspace=0.3)
        fig.subplots_adjust(hspace=0.2)
        plt.xticks(fontsize=36)
        plt.show()

        # second figure
        fig, axes = plt.subplots(figsize=(30, 8), nrows=1, ncols=3)
        # first subplot
        if heatmap_rotation2:
            cond_avg_group_out_pct_df = cond_avg_group_out_pct_df.T
        sns.heatmap(data=np.round(cond_avg_group_out_pct_df, 2),
                    vmin=0,
                    vmax=1,
                    annot=True,
                    annot_kws={'fontsize': annot_fontsize2},
                    ax=axes[0])
        axes[0].set_title("cond commodity out pct", fontsize=25)
        axes[0].tick_params(axis='both', labelsize=30)
        # second subplot
        # start_out_pct_series.plot.bar(ax=axes[1])
        # for x, y in start_out_pct_series.to_dict().items():
        #     axes[1].text(x+1.7, y, np.round(y, 2), fontdict={'fontsize': 30})
        # second subplot
        labels = start_out_pct_series.sort_index().index.tolist()
        start_out_pct_series.index = start_out_pct_series.index - start_out_pct_series.index.min(
        ) + 1
        start_out_pct_series.plot.bar(ax=axes[1])
        for i, x in enumerate(start_out_pct_series.tolist()):
            axes[1].text(i - 0.1, x, np.round(x, 2), fontdict={'fontsize': 20})
        axes[1].set_xticklabels(labels=labels, fontsize=25)
        # third subplot
        if heatmap_rotation2:
            uncond_avg_group_out_pct_df = uncond_avg_group_out_pct_df.T
        sns.heatmap(data=np.round(uncond_avg_group_out_pct_df, 2),
                    vmin=0,
                    vmax=1,
                    annot=True,
                    annot_kws={'fontsize': annot_fontsize2},
                    ax=axes[2])
        axes[2].set_title("uncond commodity out pct", fontsize=25)
        axes[2].tick_params(axis='both', labelsize=30)
        # summary
        fig.suptitle(
            f"group commodity out info 总次数={total_out_num} 组数={group_num} 总天数={total_days_num} 比例={round(total_out_num/group_num/total_days_num,3)}",
            fontsize=36)
        fig.subplots_adjust(wspace=0.3)
        fig.subplots_adjust(hspace=0.2)
        plt.xticks(fontsize=36)
        plt.show()
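The entry/exit counting above hinges on one pandas idiom: an entry into group i from state j is a cell where the shifted frame equals j and the current frame equals i. A minimal sketch with hypothetical toy data (not the class's real signal):

import pandas as pd

# rows = dates, columns = symbols, values = group labels (0 means not held)
signal = pd.DataFrame({"A": [0, 1, 1, 0], "B": [0, 0, 2, 2]})

i, j = 1, 0  # entries into group 1 coming from state 0
entered = (signal.shift(1) == j) & (signal == i)
print(int(entered.sum().sum()))  # 1: symbol A enters group 1 on the second row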
Example #14
    def get_group_distribution_per_symbol(self,
                                          period: int = None,
                                          shift: int = 0,
                                          start: str = None,
                                          end: str = None):
        """
        Get the group distribution of each symbol (including groups that are
        listed but excluded by the commodity pool).

        Parameters
        ----------
        period: int, default None
                Sampling interval, in trading days.

        shift: int, default 0
                Trading day on which sampling starts. With shift=0 and
                period=20, the 1st, 21st, 41st, ... trading days are taken.

        start: str, default None
                Start date.

        end: str, default None
                End date.

        Returns
        -------
        None
        """
        # get signal_df
        self.signal.set_factor_data(self.factor.factor_value)
        self.signal.set_commodity_pool(
            self.commodity_pool.commodity_pool_value)

        # adjust group_num
        group_num: int = self.get_params()['group_num']
        if not period:
            period: int = self.get_params()['period']
        self.signal.set_params(group_num=group_num)

        signal_df: DataFrame = self.signal.transform()
        if start:
            signal_df = signal_df[start:]
        if end:
            signal_df = signal_df[:end]
        index = pd.Index(range(len(signal_df)))
        index = index[(index - shift) % period == 0]
        new_signal_df = signal_df.copy()
        new_signal_df = new_signal_df.iloc[index]

        min_num = int(signal_df.min().min())
        max_num = int(signal_df.max().max())
        num_list = list(range(min_num, max_num + 1))

        industry_symbol_map = self.get_industry(group='actual_industry',
                                                name='actual_five_industry')

        # industry_list = list(industry_symbol_map.keys())
        # for i in range(len(industry_symbol_map)):
        #     fig, ax = plt.subplots(figsize=(20, 8))
        #     industry = industry_list[i]
        #     symbol_list = industry_symbol_map[industry]
        #     industry_signal_df = new_signal_df[symbol_list]
        #     industry_signal_df.plot(ax=ax, legend=False)
        #     ax.set_title(industry)
        #     fig.suptitle("Group Distribution per Symbol")
        #     fig.legend()
        #     plt.grid()
        #     plt.show()

        minus_one_pct_per_symbol = {}
        minus_two_pct_per_symbol = {}
        for industry in industry_symbol_map:
            symbol_list = industry_symbol_map[industry]

            fig, axes = plt.subplots(figsize=(30, 50),
                                     nrows=len(symbol_list),
                                     ncols=1)
            for symbol in symbol_list:
                i = symbol_list.index(symbol)
                symbol_signal_series = new_signal_df[symbol]
                symbol_signal_series = symbol_signal_series[
                    symbol_signal_series != 0.0]
                # sns.distplot(symbol_signal_series[symbol_signal_series != 0.0], ax=axes[i])
                symbol_signal_series_value_counts = symbol_signal_series.dropna(
                ).value_counts()
                symbol_signal_series_value_counts.index = symbol_signal_series_value_counts.index.astype(
                    int)
                for num in num_list:
                    if num not in symbol_signal_series_value_counts.index:
                        symbol_signal_series_value_counts.loc[num] = 0
                symbol_signal_series_pct = (
                    symbol_signal_series_value_counts /
                    symbol_signal_series_value_counts.sum()).sort_index(
                        ascending=True)
                symbol_signal_series_pct.plot.bar(ax=axes[i])
                minus_one_pct_per_symbol[
                    symbol] = symbol_signal_series_pct.loc[-1]
                minus_two_pct_per_symbol[
                    symbol] = symbol_signal_series_pct.loc[-2]
                axes[i].set_title(label=symbol, fontsize=30)
                # axes[i].set_xticks(num_list)
                xticks_delta = num_list[0] - axes[i].get_xticks()[0]
                axes[i].set_xticklabels(labels=axes[i].get_xticks() +
                                        xticks_delta,
                                        fontsize=30)
                axes[i].set_yticklabels(labels=np.round(
                    axes[i].get_yticks(), 2),
                                        fontsize=30)
                # for x, y in symbol_signal_series_value_counts.to_dict().items():
                #     axes[i].text(x, y, np.round(y, 2))
                for tick in axes[i].get_xticklabels():
                    tick.set_rotation(360)
            # plt.xticks(ticks=list(range(min_num, max_num+1)), labels=list(range(min_num, max_num+1)))
            fig.subplots_adjust(hspace=0.7)
            fig.suptitle(industry, fontsize=30)
            plt.show()

        minus_one_pct_per_symbol = np.round(
            pd.Series(minus_one_pct_per_symbol),
            2).sort_values(ascending=False)
        minus_two_pct_per_symbol = np.round(
            pd.Series(minus_two_pct_per_symbol),
            2).sort_values(ascending=False)

        fig, axes = plt.subplots(figsize=(20, 8), nrows=2, ncols=1)
        minus_one_pct_per_symbol.plot.bar(ax=axes[0], figsize=(20, 8))
        axes[0].set_title('Proportion of each symbol being listed but not included in the commodity pool')
        axes[0].grid()
        for tick in axes[0].get_xticklabels():
            tick.set_rotation(360)

        minus_two_pct_per_symbol.plot.bar(ax=axes[1], figsize=(20, 8))
        axes[1].set_title('Proportion of each symbol being listed and in the commodity pool but having no factor value')
        axes[1].grid()
        for tick in axes[1].get_xticklabels():
            tick.set_rotation(360)
        plt.show()
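The row sampling in both methods above follows the docstring's period/shift rule; a small standalone sketch with a toy frame:

import pandas as pd

df = pd.DataFrame({"x": range(10)})
period, shift = 3, 1

pos = pd.Index(range(len(df)))
pos = pos[(pos - shift) % period == 0]  # keep positions 1, 4, 7, ...
print(df.iloc[pos].index.tolist())      # [1, 4, 7]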
Example #15
def QA_fetch_financial_report(code, report_date, ltype='CN', db=DATABASE):
    """获取专业财务报表

    Arguments:
        code {[type]} -- [description]
        report_date {[type]} -- [description]

    Keyword Arguments:
        ltype {str} -- [description] (default: {'CN'})
        db {[type]} -- [description] (default: {DATABASE})

    Raises:
        e -- [description]

    Returns:
        pd.DataFrame -- [description]
    """

    if isinstance(code, str):
        code = [code]
    if isinstance(report_date, str):
        report_date = [QA_util_date_str2int(report_date)]
    elif isinstance(report_date, int):
        report_date = [report_date]
    elif isinstance(report_date, list):
        report_date = [QA_util_date_str2int(item) for item in report_date]

    collection = db.financial
    CH_columns = [item for item in sorted(list(financial_dict.keys()))]
    CH_columns.extend(['277', '278', '279', '280', '281', '282', '_id', 'code',
                       'report_date'])
    CH_columns = pd.Index(CH_columns)
    #EN_columns = list(financial_dict.values())
    EN_columns = [financial_dict[key] for key in sorted(list(financial_dict.keys()))]
    EN_columns.extend(['277', '278', '279', '280', '281', '282', '_id', 'code',
                       'report_date'])
    EN_columns = pd.Index(EN_columns)

    try:
        if code is not None and report_date is not None:
            data = [item for item in collection.find(
                {'code': {'$in': code}, 'report_date': {'$in': report_date}}, batch_size=10000)]
        elif code is None and report_date is not None:
            data = [item for item in collection.find(
                {'report_date': {'$in': report_date}}, batch_size=10000)]
        elif code is not None and report_date is None:
            data = [item for item in collection.find(
                {'code': {'$in': code}}, batch_size=10000)]
        else:
            data = [item for item in collection.find()]
        if len(data) > 0:
            res_pd = pd.DataFrame(data)

            if ltype in ['CH', 'CN']:
                res_pd.columns = CH_columns
            elif ltype == 'EN':
                res_pd.columns = EN_columns

            if res_pd.report_date.dtype == numpy.int64:
                res_pd.report_date = pd.to_datetime(
                    res_pd.report_date.apply(QA_util_date_int2str))
            else:
                res_pd.report_date = pd.to_datetime(res_pd.report_date)
            
            #return res_pd.replace(-4.039810335e+34, numpy.nan).set_index(['report_date', 'code'], drop=False)
            return res_pd.replace(-4.039810335e+34, numpy.nan).set_index(['report_date'], drop=False)
        else:
            return None
    except Exception as e:
        raise e
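The report_date handling above depends on the QA_util_* helpers; as a hedged, generic sketch of the same conversion with plain pandas (assuming dates are stored as yyyymmdd integers):

import pandas as pd

report_date = pd.Series([20200331, 20200630, 20200930])
as_datetime = pd.to_datetime(report_date.astype(str), format="%Y%m%d")
print(as_datetime.dtype)  # datetime64[ns]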
Example #16
def test_meta_nonempty_index():
    idx = pd.RangeIndex(1, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(["a"], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(["1970-01-01"],
                           freq="d",
                           tz="America/New_York",
                           name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(["1970-01-01"], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="d", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(["xyx"], ["xyx", "zzz"],
                              ordered=True,
                              name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES],
                              ordered=True,
                              name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    levels = [pd.Int64Index([1], name="a"), pd.Float64Index([1.0], name="b")]
    codes = [[0], [0]]
    if PANDAS_GT_0240:
        kwargs = {"codes": codes}
    else:
        kwargs = {"labels": codes}
    idx = pd.MultiIndex(levels=levels, names=["a", "b"], **kwargs)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [
        pd.Int64Index([1], name="a"),
        pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"),
        pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"),
    ]

    codes = [[0], [0], [0]]
    if PANDAS_GT_0240:
        kwargs = {"codes": codes}
    else:
        kwargs = {"labels": codes}

    idx = pd.MultiIndex(levels=levels, names=["a", "b", "timedelta"], **kwargs)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
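The PANDAS_GT_0240 switch above exists because the MultiIndex constructor's labels keyword was renamed to codes; on current pandas only codes is accepted. A small sketch of the construction being tested, assuming a recent pandas where plain pd.Index replaces the old Int64Index/Float64Index classes:

import pandas as pd

levels = [pd.Index([1], name="a"), pd.Index([1.0], name="b")]
codes = [[0], [0]]
idx = pd.MultiIndex(levels=levels, codes=codes, names=["a", "b"])
print(idx.names, [type(level).__name__ for level in idx.levels])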
Example #17
                cl.prepare_for_distance_algorithm(manipulator = Latlons, kwargs = {'to_radians':True}) # Conversion to radians because HDBSCAN uses that.
                clusters = cl.clustering(clusterclass = HDBSCAN, kwargs = clusterkwargs)
#            with Clustering() as cl: # This is the memory heavy precomputed DBSCAN variety
#                cl.prepare_for_distance_algorithm(where = 'shared', manipulator = Latlons)
#                cl.call_distance_algorithm(func = haversine_worker, n_par_processes = NPROC, distmatdtype = np.float16) 
#                clusters = cl.clustering(clusterclass = DBSCAN, kwargs = {'eps':1300, 'min_samples':2000})
                nclusters = int(clusters.coords["nclusters"]) # nclusters returned as coordinate because this matches behaviour of the non-DBSCAN algorithms, even though with DBSCAN it is only a dimension of length 1
                logging.debug(f'clustered {invarname} of {filename} by spatial haversine distance with HDBSCAN for lag: {lag}, fold: {fold}, resulting nclusters: {nclusters}')
            except MaskingError: # Happens when masking leaves zero samples or fewer than the minimum number of samples
                nclusters = 0
                clusters = xr.DataArray(np.nan, dims = cl.samplefield.dims, coords = cl.samplefield.drop_vars(['lag','fold'], errors = 'ignore').coords)
                logging.debug(f'No or too few samples were present after masking {invarname} of {filename} for lag: {lag}, fold: {fold}; HDBSCAN was not called. A field with zero clusters is returned.')
            if fold is None:    
                attrs.update({f'lag{lag}':f'nclusters: {nclusters}'}) 
            else:
                attrs.update({f'lag{lag}_fold{fold}':f'nclusters: {nclusters}'}) 
            combined.append(clusters.squeeze().drop_vars('nclusters', errors = 'ignore'))
        
        if fold is None:
            temp = xr.concat(combined, dim = pd.Index(lags, name = 'lag')) # Immediately at first position, and correct order
        else:
            temp = xr.concat(combined, dim = pd.MultiIndex.from_product([lags,folds], names = ['lag','fold'])).unstack('concat_dim').transpose(*ds[invarname].dims).reindex_like(ds[invarname]) # Unstack brings the lag/fold dimension to last place and scrambled order, so transpose and reindex to get original ordering
        ds.close() # Need to close before writer can access
        attrs.update({key:str(item) for key,item in clusterkwargs.items()})
        w = Writer(corrpath,varname = outvarname) # Should be able to find the dataformat
        w.create_dataset(example = temp)
        w.write(array = temp, units = '', attrs = attrs) 
    else:
        logging.debug(f'{filename} was already clustered')
        ds.close()
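The xr.concat call above gets its new dimension from a named pandas Index; a hedged sketch with toy arrays (the real code concatenates one cluster field per lag):

import numpy as np
import pandas as pd
import xarray as xr

lags = [0, 5, 10]
fields = [xr.DataArray(np.full(4, lag), dims="cell") for lag in lags]
combined = xr.concat(fields, dim=pd.Index(lags, name="lag"))
print(combined.dims, combined.coords["lag"].values)  # ('lag', 'cell') [ 0  5 10]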
Example #18
    else:
        hpc_channel = generalinfo['channelStructure'][0][0][1][0][0][0][0] - 1
    spikes, shank = loadSpikeData(
        data_directory + session + '/Analysis/SpikeData.mat',
        shankStructure['thalamus'])
    wake_ep = loadEpoch(data_directory + session, 'wake')
    sleep_ep = loadEpoch(data_directory + session, 'sleep')
    sws_ep = loadEpoch(data_directory + session, 'sws')
    rem_ep = loadEpoch(data_directory + session, 'rem')
    sleep_ep = sleep_ep.merge_close_intervals(threshold=1.e3)
    sws_ep = sleep_ep.intersect(sws_ep)
    rem_ep = sleep_ep.intersect(rem_ep)
    Hcorr_ep = {}
    for ep, k in zip([wake_ep, rem_ep, sws_ep], ['wak', 'rem', 'sws']):
        AUT, FR = compute_AutoCorrs(spikes, ep, binsize=0.5, nbins=20000)
        AUT.columns = pd.Index(
            [session.split("/")[1] + "_" + str(n) for n in spikes.keys()])
        datatosave[k].append(AUT)

    print(session, time() - start)

for e in datatosave.keys():
    datatosave[e] = pd.concat(datatosave[e], axis=1)

store_autocorr = pd.HDFStore(
    "/mnt/DataGuillaume/MergedData/AUTOCORR_FOR_FOURIER.h5", 'w')
store_autocorr.put('wak', datatosave['wak'])
store_autocorr.put('rem', datatosave['rem'])
store_autocorr.put('sws', datatosave['sws'])
store_autocorr.close()

from pychronux import *
def previous_application(file_path=file_path, nan_as_category=True):
    def goods_cat(x):
        if x in ['XNA', 'Other']:
            return 'XNA'
        elif x in ['Mobile', 'Consumer Electronics', 'Computers', 'Photo / Cinema Equipment',
                   'Clothing and Accessories', 'Jewelry', 'Sport and Leisure', 'Tourism',
                   'Fitness', 'Additional Service', 'Weapon', 'Animals', 'Direct Sales']:
            return 'electronics & leisure'
        else:
            return 'home & car & edu & medi'

    df_prev = pd.read_csv(file_path + 'previous_application.csv')

    # Replace some outliers
    df_prev.loc[df_prev['AMT_CREDIT'] > 6000000, 'AMT_CREDIT'] = np.nan
    df_prev.loc[df_prev['SELLERPLACE_AREA'] > 3500000, 'SELLERPLACE_AREA'] = np.nan
    # Assign back explicitly: replace() on a column selection returns a copy,
    # so inplace=True here would not modify df_prev.
    days_cols = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                 'DAYS_LAST_DUE', 'DAYS_TERMINATION']
    df_prev[days_cols] = df_prev[days_cols].replace(365243, np.nan)

    # category
    df_prev.drop('WEEKDAY_APPR_PROCESS_START', axis=1, inplace=True)

    df_prev['NAME_SELLER_INDUSTRY'] = df_prev['NAME_SELLER_INDUSTRY'].apply(lambda x:
                                                                            'other' if x not in ['XNA',
                                                                                                 'Consumer electronics',
                                                                                                 'Connectivity']
                                                                            else x)
    df_prev['CHANNEL_TYPE'] = df_prev['CHANNEL_TYPE'].apply(lambda x:
                                                            'other' if x not in ['Credit and cash offices',
                                                                                 'Country-wide'] else x)
    df_prev['NAME_PORTFOLIO'] = df_prev['NAME_PORTFOLIO'].apply(lambda x:
                                                                'other' if x not in ['POS', 'Cash'] else x)

    df_prev['NAME_GOODS_CATEGORY'] = df_prev['NAME_GOODS_CATEGORY'].apply(lambda x: goods_cat(x))
    df_prev['NAME_TYPE_SUITE'] = df_prev['NAME_TYPE_SUITE'].apply(lambda x: 'other' if x != 'Unaccompanied' else x)
    df_prev['CODE_REJECT_REASON'] = df_prev['CODE_REJECT_REASON'].apply(lambda x:
                                                                        'other' if x not in ['XAP', 'HC'] else x)
    df_prev['NAME_PAYMENT_TYPE'] = df_prev['NAME_PAYMENT_TYPE'].apply(lambda x:
                                                                      'other' if x not in [
                                                                          'Cash through the bank'] else x)
    df_prev['NAME_CASH_LOAN_PURPOSE'] = df_prev['NAME_CASH_LOAN_PURPOSE'].apply(lambda x:
                                                                                'other' if x not in ['XAP',
                                                                                                     'XNA'] else x)

    # Some new features
    df_prev['prev missing'] = df_prev.isnull().sum(axis=1).values
    df_prev['prev AMT_APPLICATION / AMT_CREDIT'] = df_prev['AMT_APPLICATION'] / df_prev['AMT_CREDIT']
    df_prev['prev AMT_APPLICATION - AMT_CREDIT'] = df_prev['AMT_APPLICATION'] - df_prev['AMT_CREDIT']

    df_prev['prev AMT_APPLICATION - AMT_GOODS_PRICE'] = df_prev['AMT_APPLICATION'] - df_prev['AMT_GOODS_PRICE']
    df_prev['prev AMT_GOODS_PRICE - AMT_CREDIT'] = df_prev['AMT_GOODS_PRICE'] - df_prev['AMT_CREDIT']
    df_prev['prev DAYS_FIRST_DRAWING - DAYS_FIRST_DUE'] = df_prev['DAYS_FIRST_DRAWING'] - df_prev['DAYS_FIRST_DUE']
    df_prev['prev DAYS_TERMINATION less -500'] = (df_prev['DAYS_TERMINATION'] < -500).astype(int)
    df_prev['DAYS_LAST_DUE - DAYS_TERMINATION'] = df_prev['DAYS_LAST_DUE'] - df_prev['DAYS_TERMINATION']
    #df_prev = df_prev.drop(['AMT_APPLICATION', 'AMT_GOODS_PRICE', 'DAYS_TERMINATION'], axis=1)
    df_prev['avg loan terms'] = df_prev['AMT_CREDIT'] / df_prev['AMT_ANNUITY']


    # Categorical features with One-Hot encode
    df_prev, categorical = one_hot_encoder(df_prev, nan_as_category)

    # Aggregations for application set
    aggregations = {}
    for col in df_prev.columns:
        aggregations[col] = ['mean','sum'] if col in categorical else ['min', 'max', 'size', 'mean', 'var', 'sum']
    df_prev_agg = df_prev.groupby('SK_ID_CURR').agg(aggregations)
    df_prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in df_prev_agg.columns.tolist()])

    # Previous Applications: Approved Applications
    approved_agg = df_prev[df_prev['NAME_CONTRACT_STATUS_Approved'] == 1].groupby('SK_ID_CURR').agg(aggregations)
    approved_agg.columns = pd.Index(['PREV_APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    df_prev_agg = df_prev_agg.join(approved_agg, how='left')
    del approved_agg
    gc.collect()

    # Previous Applications: Refused Applications
    refused_agg = df_prev[df_prev['NAME_CONTRACT_STATUS_Refused'] == 1].groupby('SK_ID_CURR').agg(aggregations)
    refused_agg.columns = pd.Index(['PREV_REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    df_prev_agg = df_prev_agg.join(refused_agg, how='left')
    del refused_agg
    gc.collect()

    # cash loans application
    cash_loan_agg = df_prev[df_prev['NAME_CONTRACT_TYPE_Cash loans'] == 1].groupby('SK_ID_CURR').agg(aggregations)
    cash_loan_agg.columns = pd.Index(
        ['PREV_Cash loans_' + e[0] + "_" + e[1].upper() for e in cash_loan_agg.columns.tolist()])
    df_prev_agg = df_prev_agg.join(cash_loan_agg, how='left')

    del cash_loan_agg
    gc.collect()
    # consumer loans
    consumer_loan_agg = df_prev[df_prev['NAME_CONTRACT_TYPE_Consumer loans'] == 1].groupby('SK_ID_CURR').agg(aggregations)
    consumer_loan_agg.columns = pd.Index(
        ['PREV_Consumer loans_' + e[0] + "_" + e[1].upper() for e in consumer_loan_agg.columns.tolist()])
    df_prev_agg = df_prev_agg.join(consumer_loan_agg, how='left')
    del consumer_loan_agg
    gc.collect()

    # Revolving loans
    Revolving_loan_agg = df_prev[df_prev['NAME_CONTRACT_TYPE_Revolving loans'] == 1].groupby('SK_ID_CURR').agg(
        aggregations)
    Revolving_loan_agg.columns = pd.Index(
        ['PREV_Revolving loans_' + e[0] + "_" + e[1].upper() for e in Revolving_loan_agg.columns.tolist()])
    df_prev_agg = df_prev_agg.join(Revolving_loan_agg, how='left')

    del Revolving_loan_agg
    gc.collect()


    del df_prev
    gc.collect()

    return reduce_mem_usage(df_prev_agg)
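Every aggregation block above repeats the same pattern: a dict-of-functions groupby followed by flattening the resulting two-level columns into prefixed names. A compact sketch with toy data:

import pandas as pd

df = pd.DataFrame({"SK_ID_CURR": [1, 1, 2], "AMT_CREDIT": [100.0, 200.0, 50.0]})
agg = df.groupby("SK_ID_CURR").agg({"AMT_CREDIT": ["min", "max", "mean"]})
agg.columns = pd.Index(["PREV_" + c[0] + "_" + c[1].upper() for c in agg.columns])
print(agg.columns.tolist())  # ['PREV_AMT_CREDIT_MIN', 'PREV_AMT_CREDIT_MAX', 'PREV_AMT_CREDIT_MEAN']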
Example #20
 def itemid_2_index(self):
     r_index = pd.Index(self.item_data.item.unique(), name='item')
     return r_index
Example #21
    def test_nanosecond_field(self):
        dti = DatetimeIndex(np.arange(10))

        tm.assert_index_equal(dti.nanosecond, pd.Index(np.arange(10, dtype=np.int64)))
Example #22
    def to_pandas_frame(self) -> pd.DataFrame:
        """ Return as pandas DataFrame. """
        sdf = self.to_internal_spark_frame
        pdf = sdf.toPandas()
        if len(pdf) == 0 and len(sdf.schema) > 0:
            pdf = pdf.astype({
                field.name: spark_type_to_pandas_dtype(field.dataType)
                for field in sdf.schema
            })
        elif LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
            for field in sdf.schema:
                if field.nullable and pdf[field.name].isnull().all():
                    if isinstance(field.dataType, BooleanType):
                        pdf[field.name] = pdf[field.name].astype(np.object)
                    elif isinstance(field.dataType, IntegralType):
                        pdf[field.name] = pdf[field.name].astype(np.float64)
                    else:
                        pdf[field.name] = pdf[field.name].astype(
                            spark_type_to_pandas_dtype(field.dataType))

        column_names = []
        for i, (label, spark_column, column_name) in enumerate(
                zip(self.column_labels, self.data_spark_columns,
                    self.data_spark_column_names)):
            for index_spark_column_name, index_spark_column in zip(
                    self.index_spark_column_names, self.index_spark_columns):
                if spark_column._jc.equals(index_spark_column._jc):
                    column_names.append(index_spark_column_name)
                    break
            else:
                name = str(i) if label is None else name_like_string(label)
                if column_name != name:
                    column_name = name
                column_names.append(column_name)

        append = False
        for index_field in self.index_spark_column_names:
            drop = index_field not in column_names
            pdf = pdf.set_index(index_field, drop=drop, append=append)
            append = True
        pdf = pdf[column_names]

        names = [
            name if name is None or len(name) > 1 else name[0]
            for name in self._column_label_names
        ]
        if self.column_labels_level > 1:
            pdf.columns = pd.MultiIndex.from_tuples(self._column_labels,
                                                    names=names)
        else:
            pdf.columns = pd.Index(
                [
                    None if label is None else label[0]
                    for label in self._column_labels
                ],
                name=names[0],
            )

        pdf.index.names = [
            name if name is None or len(name) > 1 else name[0]
            for name in self.index_names
        ]

        return pdf
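The column handling at the end of to_pandas_frame mirrors a plain-pandas pattern: multi-level labels become a MultiIndex via from_tuples, single-level labels a flat named Index. A minimal sketch with a hypothetical frame, not the internal Spark-backed one:

import pandas as pd

pdf = pd.DataFrame([[1, 2]], columns=["x", "y"])
pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")], names=["grp", None])
print(pdf.columns.nlevels)  # 2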
class TestTableOrient:
    def setup_method(self, method):
        self.df = DataFrame(
            {
                "A": [1, 2, 3, 4],
                "B": ["a", "b", "c", "c"],
                "C":
                pd.date_range("2016-01-01", freq="d", periods=4),
                "D":
                pd.timedelta_range("1H", periods=4, freq="T"),
                "E":
                pd.Series(pd.Categorical(["a", "b", "c", "c"])),
                "F":
                pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
                "G": [1.0, 2.0, 3, 4.0],
                "H":
                pd.date_range(
                    "2016-01-01", freq="d", periods=4, tz="US/Central"),
            },
            index=pd.Index(range(4), name="idx"),
        )

    def test_build_series(self):
        s = pd.Series([1, 2], name="a")
        s.index.name = "id"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [{
            "name": "id",
            "type": "integer"
        }, {
            "name": "a",
            "type": "integer"
        }]

        schema = {"fields": fields, "primaryKey": ["id"]}

        expected = OrderedDict([
            ("schema", schema),
            (
                "data",
                [
                    OrderedDict([("id", 0), ("a", 1)]),
                    OrderedDict([("id", 1), ("a", 2)]),
                ],
            ),
        ])

        assert result == expected

    def test_to_json(self):
        df = self.df.copy()
        df.index.name = "idx"
        result = df.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            {
                "name": "idx",
                "type": "integer"
            },
            {
                "name": "A",
                "type": "integer"
            },
            {
                "name": "B",
                "type": "string"
            },
            {
                "name": "C",
                "type": "datetime"
            },
            {
                "name": "D",
                "type": "duration"
            },
            {
                "constraints": {
                    "enum": ["a", "b", "c"]
                },
                "name": "E",
                "ordered": False,
                "type": "any",
            },
            {
                "constraints": {
                    "enum": ["a", "b", "c"]
                },
                "name": "F",
                "ordered": True,
                "type": "any",
            },
            {
                "name": "G",
                "type": "number"
            },
            {
                "name": "H",
                "type": "datetime",
                "tz": "US/Central"
            },
        ]

        schema = {"fields": fields, "primaryKey": ["idx"]}
        data = [
            OrderedDict([
                ("idx", 0),
                ("A", 1),
                ("B", "a"),
                ("C", "2016-01-01T00:00:00.000Z"),
                ("D", "P0DT1H0M0S"),
                ("E", "a"),
                ("F", "a"),
                ("G", 1.0),
                ("H", "2016-01-01T06:00:00.000Z"),
            ]),
            OrderedDict([
                ("idx", 1),
                ("A", 2),
                ("B", "b"),
                ("C", "2016-01-02T00:00:00.000Z"),
                ("D", "P0DT1H1M0S"),
                ("E", "b"),
                ("F", "b"),
                ("G", 2.0),
                ("H", "2016-01-02T06:00:00.000Z"),
            ]),
            OrderedDict([
                ("idx", 2),
                ("A", 3),
                ("B", "c"),
                ("C", "2016-01-03T00:00:00.000Z"),
                ("D", "P0DT1H2M0S"),
                ("E", "c"),
                ("F", "c"),
                ("G", 3.0),
                ("H", "2016-01-03T06:00:00.000Z"),
            ]),
            OrderedDict([
                ("idx", 3),
                ("A", 4),
                ("B", "c"),
                ("C", "2016-01-04T00:00:00.000Z"),
                ("D", "P0DT1H3M0S"),
                ("E", "c"),
                ("F", "c"),
                ("G", 4.0),
                ("H", "2016-01-04T06:00:00.000Z"),
            ]),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])

        assert result == expected

    def test_to_json_float_index(self):
        data = pd.Series(1, index=[1.0, 2.0])
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict([
            (
                "schema",
                {
                    "fields": [
                        {
                            "name": "index",
                            "type": "number"
                        },
                        {
                            "name": "values",
                            "type": "integer"
                        },
                    ],
                    "primaryKey": ["index"],
                },
            ),
            (
                "data",
                [
                    OrderedDict([("index", 1.0), ("values", 1)]),
                    OrderedDict([("index", 2.0), ("values", 1)]),
                ],
            ),
        ])

        assert result == expected

    def test_to_json_period_index(self):
        idx = pd.period_range("2016", freq="Q-JAN", periods=2)
        data = pd.Series(1, idx)
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {
                "freq": "Q-JAN",
                "name": "index",
                "type": "datetime"
            },
            {
                "name": "values",
                "type": "integer"
            },
        ]

        schema = {"fields": fields, "primaryKey": ["index"]}
        data = [
            OrderedDict([("index", "2015-11-01T00:00:00.000Z"),
                         ("values", 1)]),
            OrderedDict([("index", "2016-02-01T00:00:00.000Z"),
                         ("values", 1)]),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])

        assert result == expected

    def test_to_json_categorical_index(self):
        data = pd.Series(1, pd.CategoricalIndex(["a", "b"]))
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict([
            (
                "schema",
                {
                    "fields": [
                        {
                            "name": "index",
                            "type": "any",
                            "constraints": {
                                "enum": ["a", "b"]
                            },
                            "ordered": False,
                        },
                        {
                            "name": "values",
                            "type": "integer"
                        },
                    ],
                    "primaryKey": ["index"],
                },
            ),
            (
                "data",
                [
                    OrderedDict([("index", "a"), ("values", 1)]),
                    OrderedDict([("index", "b"), ("values", 1)]),
                ],
            ),
        ])

        assert result == expected

    def test_date_format_raises(self):
        with pytest.raises(ValueError):
            self.df.to_json(orient="table", date_format="epoch")

        # others work
        self.df.to_json(orient="table", date_format="iso")
        self.df.to_json(orient="table")

    def test_convert_pandas_type_to_json_field_int(self, index_or_series):
        kind = index_or_series
        data = [1, 2, 3]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "integer"}
        assert result == expected

    def test_convert_pandas_type_to_json_field_float(self, index_or_series):
        kind = index_or_series
        data = [1.0, 2.0, 3.0]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "number"}
        assert result == expected

    @pytest.mark.parametrize("dt_args,extra_exp", [({}, {}),
                                                   ({
                                                       "utc": True
                                                   }, {
                                                       "tz": "UTC"
                                                   })])
    @pytest.mark.parametrize("wrapper", [None, pd.Series])
    def test_convert_pandas_type_to_json_field_datetime(
            self, dt_args, extra_exp, wrapper):
        data = [1.0, 2.0, 3.0]
        data = pd.to_datetime(data, **dt_args)
        if wrapper is pd.Series:
            data = pd.Series(data, name="values")
        result = convert_pandas_type_to_json_field(data)
        expected = {"name": "values", "type": "datetime"}
        expected.update(extra_exp)
        assert result == expected

    def test_convert_pandas_type_to_json_period_range(self):
        arr = pd.period_range("2016", freq="A-DEC", periods=4)
        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "values", "type": "datetime", "freq": "A-DEC"}
        assert result == expected

    @pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex])
    @pytest.mark.parametrize("ordered", [True, False])
    def test_convert_pandas_type_to_json_field_categorical(
            self, kind, ordered):
        data = ["a", "b", "c"]
        if kind is pd.Categorical:
            arr = pd.Series(kind(data, ordered=ordered), name="cats")
        elif kind is pd.CategoricalIndex:
            arr = kind(data, ordered=ordered, name="cats")

        result = convert_pandas_type_to_json_field(arr)
        expected = {
            "name": "cats",
            "type": "any",
            "constraints": {
                "enum": data
            },
            "ordered": ordered,
        }
        assert result == expected

    @pytest.mark.parametrize(
        "inp,exp",
        [
            ({
                "type": "integer"
            }, "int64"),
            ({
                "type": "number"
            }, "float64"),
            ({
                "type": "boolean"
            }, "bool"),
            ({
                "type": "duration"
            }, "timedelta64"),
            ({
                "type": "datetime"
            }, "datetime64[ns]"),
            ({
                "type": "datetime",
                "tz": "US/Hawaii"
            }, "datetime64[ns, US/Hawaii]"),
            ({
                "type": "any"
            }, "object"),
            (
                {
                    "type": "any",
                    "constraints": {
                        "enum": ["a", "b", "c"]
                    },
                    "ordered": False,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=False),
            ),
            (
                {
                    "type": "any",
                    "constraints": {
                        "enum": ["a", "b", "c"]
                    },
                    "ordered": True,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=True),
            ),
            ({
                "type": "string"
            }, "object"),
        ],
    )
    def test_convert_json_field_to_pandas_type(self, inp, exp):
        field = {"name": "foo"}
        field.update(inp)
        assert convert_json_field_to_pandas_type(field) == exp

    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
    def test_convert_json_field_to_pandas_type_raises(self, inp):
        field = {"type": inp}
        with pytest.raises(ValueError,
                           match=f"Unsupported or invalid field type: {inp}"):
            convert_json_field_to_pandas_type(field)

    def test_categorical(self):
        s = pd.Series(pd.Categorical(["a", "b", "a"]))
        s.index.name = "idx"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {
                "name": "idx",
                "type": "integer"
            },
            {
                "constraints": {
                    "enum": ["a", "b"]
                },
                "name": "values",
                "ordered": False,
                "type": "any",
            },
        ]

        expected = OrderedDict([
            ("schema", {
                "fields": fields,
                "primaryKey": ["idx"]
            }),
            (
                "data",
                [
                    OrderedDict([("idx", 0), ("values", "a")]),
                    OrderedDict([("idx", 1), ("values", "b")]),
                    OrderedDict([("idx", 2), ("values", "a")]),
                ],
            ),
        ])

        assert result == expected

    @pytest.mark.parametrize(
        "idx,nm,prop",
        [
            (pd.Index([1]), "index", "name"),
            (pd.Index([1], name="myname"), "myname", "name"),
            (
                pd.MultiIndex.from_product([("a", "b"), ("c", "d")]),
                ["level_0", "level_1"],
                "names",
            ),
            (
                pd.MultiIndex.from_product([("a", "b"), ("c", "d")],
                                           names=["n1", "n2"]),
                ["n1", "n2"],
                "names",
            ),
            (
                pd.MultiIndex.from_product([("a", "b"), ("c", "d")],
                                           names=["n1", None]),
                ["n1", "level_1"],
                "names",
            ),
        ],
    )
    def test_set_names_unset(self, idx, nm, prop):
        data = pd.Series(1, idx)
        result = set_default_names(data)
        assert getattr(result.index, prop) == nm

    @pytest.mark.parametrize(
        "idx",
        [
            pd.Index([], name="index"),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]],
                                      names=("level_0", "level_1")),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]],
                                      names=("foo", "level_1")),
        ],
    )
    def test_warns_non_roundtrippable_names(self, idx):
        # GH 19130
        df = pd.DataFrame(index=idx)
        df.index.name = "index"
        with tm.assert_produces_warning():
            set_default_names(df)

    def test_timestamp_in_columns(self):
        df = pd.DataFrame(
            [[1, 2]],
            columns=[pd.Timestamp("2016"),
                     pd.Timedelta(10, unit="s")])
        result = df.to_json(orient="table")
        js = json.loads(result)
        assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
        # TODO - below expectation is not correct; see GH 28256
        assert js["schema"]["fields"][2]["name"] == 10000

    @pytest.mark.parametrize(
        "case",
        [
            pd.Series([1], index=pd.Index([1], name="a"), name="a"),
            pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
            pd.DataFrame(
                {"A": [1]},
                index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a"
                                                                     ]),
            ),
        ],
    )
    def test_overlapping_names(self, case):
        with pytest.raises(ValueError, match="Overlapping"):
            case.to_json(orient="table")
Example #24
def execute_selection_dataframe(
    op,
    data,
    selections,
    predicates,
    sort_keys,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    result = data

    # Build up the individual pandas structures from column expressions
    if selections:
        if all(isinstance(s.op(), ops.TableColumn) for s in selections):
            result = build_df_from_selection(selections, data, op.table.op())
        else:
            result = build_df_from_projection(
                selections,
                op,
                data,
                scope=scope,
                timecontext=timecontext,
                **kwargs,
            )

    if predicates:
        predicates = _compute_predicates(
            op.table.op(), predicates, data, scope, timecontext, **kwargs
        )
        predicate = functools.reduce(operator.and_, predicates)
        assert len(predicate) == len(
            result
        ), 'Selection predicate length does not match underlying table'
        result = result.loc[predicate]

    if sort_keys:
        result, grouping_keys, ordering_keys = util.compute_sorted_frame(
            result,
            order_by=sort_keys,
            scope=scope,
            timecontext=timecontext,
            **kwargs,
        )
    else:
        grouping_keys = ordering_keys = ()

    # return early if we do not have any temporary grouping or ordering columns
    assert not grouping_keys, 'group by should never show up in Selection'
    if not ordering_keys:
        return result

    # create a sequence of columns that we need to drop
    temporary_columns = pd.Index(
        concatv(grouping_keys, ordering_keys)
    ).difference(data.columns)

    # no reason to call drop if we don't need to
    if temporary_columns.empty:
        return result

    # drop every temporary column we created for ordering or grouping
    return result.drop(temporary_columns, axis=1)
 def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
     df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
     out = df.to_json(orient="table")
     with pytest.raises(NotImplementedError, match="can not yet read "):
         pd.read_json(out, orient="table")
Example #26
def _infer_concat_order_from_coords(datasets):

    concat_dims = []
    tile_ids = [() for ds in datasets]

    # All datasets have same variables because they've been grouped as such
    ds0 = datasets[0]
    for dim in ds0.dims:

        # Check if dim is a coordinate dimension
        if dim in ds0:

            # Need to read coordinate values to do ordering
            indexes = [ds.indexes.get(dim) for ds in datasets]
            if any(index is None for index in indexes):
                raise ValueError("Every dimension needs a coordinate for "
                                 "inferring concatenation order")

            # If dimension coordinate values are same on every dataset then
            # should be leaving this dimension alone (it's just a "bystander")
            if not all(index.equals(indexes[0]) for index in indexes[1:]):

                # Infer order datasets should be arranged in along this dim
                concat_dims.append(dim)

                if all(index.is_monotonic_increasing for index in indexes):
                    ascending = True
                elif all(index.is_monotonic_decreasing for index in indexes):
                    ascending = False
                else:
                    raise ValueError(
                        "Coordinate variable {} is neither "
                        "monotonically increasing nor "
                        "monotonically decreasing on all datasets".format(dim))

                # Assume that any two datasets whose coord along dim starts
                # with the same value have the same coord values throughout.
                if any(index.size == 0 for index in indexes):
                    raise ValueError("Cannot handle size zero dimensions")
                first_items = pd.Index([index.take([0]) for index in indexes])

                # Sort datasets along dim
                # We want rank but with identical elements given identical
                # position indices - they should be concatenated along another
                # dimension, not along this one
                series = first_items.to_series()
                rank = series.rank(method="dense", ascending=ascending)
                order = rank.astype(int).values - 1

                # Append positions along extra dimension to structure which
                # encodes the multi-dimensional concatenation order
                tile_ids = [
                    tile_id + (position, )
                    for tile_id, position in zip(tile_ids, order)
                ]

    if len(datasets) > 1 and not concat_dims:
        raise ValueError("Could not find any dimension coordinates to use to "
                         "order the datasets for concatenation")

    combined_ids = OrderedDict(zip(tile_ids, datasets))

    return combined_ids, concat_dims
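The ordering logic boils down to a dense rank of each dataset's first coordinate value: identical first values get identical positions, signalling that those datasets belong on another concat dimension. A sketch with plain floats standing in for the per-dataset indexes:

import pandas as pd

first_items = pd.Index([10.0, 0.0, 10.0, 20.0])  # first coord value of each dataset
rank = first_items.to_series().rank(method="dense", ascending=True)
order = rank.astype(int).values - 1
print(order.tolist())  # [1, 0, 1, 2]: datasets 0 and 2 share position 1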
Example #27
0
                "to same dtypes."),
    ):
        s.where([True, False, True], [1, 2, 3])


@pytest.mark.parametrize(
    "ps",
    [
        pd.Series(["a"] * 20, index=range(0, 20)),
        pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"),
    ],
)
@pytest.mark.parametrize(
    "labels",
    [[1], [0], 1, 5, [5, 9],
     pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])],
)
@pytest.mark.parametrize("inplace", [True, False])
def test_series_drop_labels(ps, labels, inplace):
    ps = ps.copy()
    gs = cudf.from_pandas(ps)

    expected = ps.drop(labels=labels, axis=0, inplace=inplace)
    actual = gs.drop(labels=labels, axis=0, inplace=inplace)

    if inplace:
        expected = ps
        actual = gs

    assert_eq(expected, actual)
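For reference, the pandas side of the comparison (dropping rows by index label along axis 0) behaves like this:

import pandas as pd

ps = pd.Series(["a"] * 5, index=range(5))
print(ps.drop(labels=[1, 3], axis=0).index.tolist())  # [0, 2, 4]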
Example #28
    def aggregate(self, func=None, *args, **kwargs):
        if self._axis != 0:
            # This is not implemented in pandas,
            # so we throw a different message
            raise NotImplementedError("axis other than 0 is not supported")

        if (
            callable(func)
            and isinstance(func, BuiltinFunctionType)
            and func.__name__ in dir(self)
        ):
            func = func.__name__

        relabeling_required = False
        if isinstance(func, dict) or func is None:

            def try_get_str_func(fn):
                if not isinstance(fn, str) and isinstance(fn, Iterable):
                    return [try_get_str_func(f) for f in fn]
                return fn.__name__ if callable(fn) and fn.__name__ in dir(self) else fn

            relabeling_required, func_dict, new_columns, order = reconstruct_func(
                func, **kwargs
            )
            func_dict = {col: try_get_str_func(fn) for col, fn in func_dict.items()}

            if any(i not in self._df.columns for i in func_dict.keys()):
                from pandas.core.base import SpecificationError

                raise SpecificationError("nested renamer is not supported")
            if func is None:
                kwargs = {}
            func = func_dict
        elif is_list_like(func):
            return self._default_to_pandas(
                lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )
        elif callable(func):
            return self._apply_agg_function(
                lambda grp, *args, **kwargs: grp.aggregate(func, *args, **kwargs),
                *args,
                **kwargs,
            )
        elif isinstance(func, str):
            # Using "getattr" here masks possible AttributeError which we throw
            # in __getattr__, so we should call __getattr__ directly instead.
            agg_func = self.__getattr__(func)
            if callable(agg_func):
                return agg_func(*args, **kwargs)

        result = self._apply_agg_function(
            func,
            *args,
            **kwargs,
        )

        if relabeling_required:
            if not self._as_index:
                nby_cols = len(result.columns) - len(new_columns)
                order = np.concatenate([np.arange(nby_cols), order + nby_cols])
                by_cols = result.columns[:nby_cols]
                new_columns = pandas.Index(new_columns)
                if by_cols.nlevels != new_columns.nlevels:
                    by_cols = by_cols.remove_unused_levels()
                    empty_levels = [
                        i
                        for i, level in enumerate(by_cols.levels)
                        if len(level) == 1 and level[0] == ""
                    ]
                    by_cols = by_cols.droplevel(empty_levels)
                new_columns = by_cols.append(new_columns)
            result = result.iloc[:, order]
            result.columns = new_columns
        return result
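The relabeling branch above corresponds to pandas named aggregation; a plain-pandas sketch of the shape it reconstructs:

import pandas as pd

df = pd.DataFrame({"key": ["x", "x", "y"], "val": [1, 2, 3]})
result = df.groupby("key", as_index=False).agg(total=("val", "sum"), avg=("val", "mean"))
print(result.columns.tolist())  # ['key', 'total', 'avg']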
Example #29
def create_loom_from_tabulamurisfacs(fn_loom):
    samplesheet = pd.read_csv(
        '../../data/tabulamuris/FACS_alltissues/annotations_facs.csv',
        sep=',',
        index_col='cell',
        low_memory=False,
    ).iloc[:, 2:]
    samplesheet.index.name = 'CellID'

    immune_types = [
        'B cell',
        'DN1 thymic pro-T cell',
        'regulatory T cell',
        'basophil',
        'pre-natural killer cell',
        'immature T cell',
        'myeloid cell',
        'T cell',
        'granulocyte',
        'naive B cell',
        'leukocyte',
        'precursor B cell',
        'macrophage',
        'immature B cell',
        'monocyte',
        'late pro-B cell',
        'natural killer cell',
        'granulocyte monocyte progenitor cell',
        'classical monocyte',
        'lymphocyte',
        'professional antigen presenting cell',
        'mature natural killer cell',
        'immature NK T cell',
        'immature natural killer cell',
    ]

    samplesheet = samplesheet.loc[samplesheet['cell_ontology_class'].isin(
        immune_types)]

    print('Tabula Muris has a total of {:} immune cells of {:} types'.format(
        samplesheet.shape[0], len(immune_types)))

    cnames_unsort = samplesheet.index

    cellnames = []
    genes = []
    counts = []
    fns = glob.glob('../../data/tabulamuris/FACS_alltissues/FACS/FACS/*.loom')
    for ifn, fn in enumerate(fns):
        tissue = os.path.basename(fn)[:-len('-counts.loom')]
        print('Mining {:} ({:}/{:})'.format(tissue, ifn + 1, len(fns)))
        with loompy.connect(fn) as dsl:
            cnsus = dsl.ca['CellID']
            idx = pd.Index(cnsus).isin(cnames_unsort).nonzero()[0]
            cns = cnsus[idx]
            cos = dsl[:, idx]

            cellnames.append(cns)
            counts.append(cos)
            genes.append(dsl.ra['GeneName'])

    # Check that they all have the same genes
    if len(set([tuple(x) for x in genes])) > 1:
        print('WARNING: not all tissues have the same genes')
        return {
            'ss': samplesheet,
            'counts': counts,
            'cns': cellnames,
            'genes': genes
        }

    print('Merging into single loom file')
    cellnames = np.concatenate(cellnames)
    counts = np.hstack(counts)
    genes = genes[0]
    samplesheet = samplesheet.loc[cellnames]

    print('Writing loom file')
    col_attrs = {col: samplesheet[col].values for col in samplesheet.columns}
    col_attrs['CellID'] = samplesheet.index.values
    row_attrs = {'GeneName': genes}
    loompy.create(
        fn_loom,
        layers={'': counts},
        col_attrs=col_attrs,
        row_attrs=row_attrs,
    )
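The per-tissue cell selection above is an Index membership filter; a toy sketch of the same idiom:

import numpy as np
import pandas as pd

cell_ids = np.array(["c0", "c1", "c2", "c3"])
wanted = pd.Index(["c3", "c1"])
idx = pd.Index(cell_ids).isin(wanted).nonzero()[0]
print(idx.tolist(), cell_ids[idx].tolist())  # [1, 3] ['c1', 'c3']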
Example #30
    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    if isinstance(obj, pd.CategoricalIndex):
        assert obj.nunique() == len(obj.categories)
        assert obj.nunique(dropna=False) == len(obj.categories) + 1
    else:
        num_unique_values = len(obj.unique())
        assert obj.nunique() == max(0, num_unique_values - 1)
        assert obj.nunique(dropna=False) == max(0, num_unique_values)


@pytest.mark.parametrize("idx_or_series_w_bad_unicode",
                         [pd.Index(["\ud83d"] * 2),
                          pd.Series(["\ud83d"] * 2)])
def test_unique_bad_unicode(idx_or_series_w_bad_unicode):
    # regression test for #34550
    obj = idx_or_series_w_bad_unicode
    result = obj.unique()

    if isinstance(obj, pd.Index):
        expected = pd.Index(["\ud83d"], dtype=object)
        tm.assert_index_equal(result, expected, exact=True)
    else:
        expected = np.array(["\ud83d"], dtype=object)
        tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize("dropna", [True, False])