Beispiel #1
0
    def test_all_any_params(self):
        """Exercise the skipna, level and bool_only options of Series.all/any."""
        # With an implicit 'object' dtype, nan is treated as truthy by all().
        nan_then_true = Series([np.nan, True])
        nan_then_false = Series([np.nan, False])
        assert nan_then_true.all(skipna=False)  # nan && True => True
        assert nan_then_true.all(skipna=True)
        assert np.isnan(nan_then_false.any(skipna=False))  # nan || False => nan
        assert not nan_then_false.any(skipna=True)

        # Reductions grouped by index level 0.
        leveled = pd.Series(
            [False, False, True, True, False, True],
            index=[0, 0, 1, 1, 2, 2],
        )
        tm.assert_series_equal(leveled.all(level=0), Series([False, True, False]))
        tm.assert_series_equal(leveled.any(level=0), Series([False, True, True]))

        # bool_only is not implemented with level option.
        with pytest.raises(NotImplementedError):
            leveled.any(bool_only=True, level=0)
        with pytest.raises(NotImplementedError):
            leveled.all(bool_only=True, level=0)

        # bool_only is not implemented alone.
        with pytest.raises(NotImplementedError):
            leveled.any(bool_only=True)
        with pytest.raises(NotImplementedError):
            leveled.all(bool_only=True)
Beispiel #2
0
    def test_all_any_params(self):
        """Cover skipna, level and bool_only arguments of Series.all/any."""
        # Implicit 'object' dtype: nan counts as truthy for all().
        with_true = Series([np.nan, True])
        with_false = Series([np.nan, False])
        assert with_true.all(skipna=False)  # nan && True => True
        assert with_true.all(skipna=True)
        assert np.isnan(with_false.any(skipna=False))  # nan || False => nan
        assert not with_false.any(skipna=True)

        # Reduce per level-0 label.
        grouped = pd.Series(
            [False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2]
        )
        tm.assert_series_equal(grouped.all(level=0), Series([False, True, False]))
        tm.assert_series_equal(grouped.any(level=0), Series([False, True, True]))

        # bool_only is not implemented with level option.
        with pytest.raises(NotImplementedError):
            grouped.any(bool_only=True, level=0)
        with pytest.raises(NotImplementedError):
            grouped.all(bool_only=True, level=0)

        # bool_only is not implemented alone.
        with pytest.raises(NotImplementedError):
            grouped.any(bool_only=True)
        with pytest.raises(NotImplementedError):
            grouped.all(bool_only=True)
Beispiel #3
0
    def test_all_any_boolean(self):
        """skipna semantics for the nullable 'boolean' dtype, plus level= grouping."""
        # Kleene logic: NA && True => NA, NA || False => NA.
        na_true = Series([pd.NA, True], dtype="boolean")
        na_false = Series([pd.NA, False], dtype="boolean")
        assert na_true.all(skipna=False) is pd.NA
        assert na_true.all(skipna=True)
        assert na_false.any(skipna=False) is pd.NA
        assert not na_false.any(skipna=True)

        # GH-33253: all True / all False values buggy with skipna=False
        assert Series([True, True], dtype="boolean").all(skipna=False)
        assert not Series([False, False], dtype="boolean").any(skipna=False)

        # Check level TODO(GH-33449) result should also be boolean
        leveled = Series(
            [False, False, True, True, False, True],
            index=[0, 0, 1, 1, 2, 2],
            dtype="boolean",
        )
        with tm.assert_produces_warning(FutureWarning):
            tm.assert_series_equal(
                leveled.all(level=0), Series([False, True, False])
            )
        with tm.assert_produces_warning(FutureWarning):
            tm.assert_series_equal(
                leveled.any(level=0), Series([False, True, True])
            )
Beispiel #4
0
class Any:
    """ASV benchmark for Series.any on all-True ('fast') vs all-False ('slow') data."""

    params = [[10**3, 10**6], ["fast", "slow"]]
    param_names = ["N", "case"]

    def setup(self, N, case):
        # 'fast' data short-circuits on the first element; 'slow' scans all N.
        self.s = Series([case == "fast"] * N)

    def time_any(self, N, case):
        self.s.any()
Beispiel #5
0
class Any:
    """ASV benchmark for Series.any over all-True ('fast') or all-False ('slow') data.

    Modernized: dropped the redundant Python-2-style ``(object)`` base —
    all classes are new-style in Python 3, so behavior is unchanged.
    """

    params = [[10**3, 10**6], ['fast', 'slow']]
    param_names = ['N', 'case']

    def setup(self, N, case):
        # 'fast' builds an all-True Series (any() short-circuits immediately);
        # 'slow' builds all-False, forcing a full scan of N elements.
        val = case == 'fast'
        self.s = Series([val] * N)

    def time_any(self, N, case):
        self.s.any()
def _fill_res_dict(col: pd.Series, col_oob: pd.Series, res_dict: dict) -> dict:

    valid = not col_oob.any()
    res_dict["valid"] = valid

    if not valid:
        col_oob = col_oob.fillna(False)
        n = global_log_verbosity

        # get the unexpected values
        unexpected_index = col_oob.index[col_oob]
        unexpected_values = col[unexpected_index].astype(str)

        res_dict["percentage_of_column_is_error"] = (len(unexpected_index) /
                                                     len(col) * 100)

        if n is not None:
            # if the global_log_verbosity is not 0, sample
            if n != 0:
                # asking for a higher sample than is there?
                if global_log_verbosity > len(unexpected_values):
                    n = len(unexpected_values)
                # sample the requested amount
                unexpected_values = unexpected_values.sample(n=n)
                unexpected_index = unexpected_values[unexpected_values.index]
            # log the required unexpected values
            res_dict["unexpected_index_sample"] = unexpected_index.tolist()
            res_dict["unexpected_values_sample"] = unexpected_values.tolist()

    return res_dict
Beispiel #7
0
    def test_all_any_params(self):
        """skipna / level / bool_only behavior of Series.any and Series.all."""
        # Implicit 'object' dtype: nan is truthy.
        has_nan_true = Series([np.nan, True])
        has_nan_false = Series([np.nan, False])
        assert has_nan_true.all(skipna=False)  # nan && True => True
        assert has_nan_true.all(skipna=True)
        assert has_nan_false.any(skipna=False)
        assert not has_nan_false.any(skipna=True)

        # level= reductions are deprecated and warn.
        ser = Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2])
        with tm.assert_produces_warning(FutureWarning):
            tm.assert_series_equal(ser.all(level=0), Series([False, True, False]))
        with tm.assert_produces_warning(FutureWarning):
            tm.assert_series_equal(ser.any(level=0), Series([False, True, True]))

        msg = "Option bool_only is not implemented with option level"
        with pytest.raises(NotImplementedError, match=msg):
            with tm.assert_produces_warning(FutureWarning):
                ser.any(bool_only=True, level=0)
        with pytest.raises(NotImplementedError, match=msg):
            with tm.assert_produces_warning(FutureWarning):
                ser.all(bool_only=True, level=0)

        # GH#38810 bool_only is not implemented alone.
        msg = "Series.any does not implement bool_only"
        with pytest.raises(NotImplementedError, match=msg):
            ser.any(bool_only=True)
        msg = "Series.all does not implement bool_only."
        with pytest.raises(NotImplementedError, match=msg):
            ser.all(bool_only=True)
Beispiel #8
0
    def test_all_any_params(self):
        """skipna / level / bool_only error-message checks for Series.all/any."""
        # Implicit 'object' dtype: nan is truthy.
        ser_nan_true = Series([np.nan, True])
        ser_nan_false = Series([np.nan, False])
        assert ser_nan_true.all(skipna=False)  # nan && True => True
        assert ser_nan_true.all(skipna=True)
        assert np.isnan(ser_nan_false.any(skipna=False))  # nan || False => nan
        assert not ser_nan_false.any(skipna=True)

        # Check level.
        ser = Series(
            [False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2]
        )
        tm.assert_series_equal(ser.all(level=0), Series([False, True, False]))
        tm.assert_series_equal(ser.any(level=0), Series([False, True, True]))

        msg = "Option bool_only is not implemented with option level"
        with pytest.raises(NotImplementedError, match=msg):
            ser.any(bool_only=True, level=0)
        with pytest.raises(NotImplementedError, match=msg):
            ser.all(bool_only=True, level=0)

        # bool_only is not implemented alone.
        # TODO GH38810 change this error message to:
        # "Series.any does not implement bool_only"
        msg = "Series.any does not implement numeric_only"
        with pytest.raises(NotImplementedError, match=msg):
            ser.any(bool_only=True)
        msg = "Series.all does not implement numeric_only."
        with pytest.raises(NotImplementedError, match=msg):
            ser.all(bool_only=True)
    def test_all_any(self):
        """Basic truthiness reductions on a boolean Series and on object dtype."""
        ts = tm.makeTimeSeries()
        positive = ts > 0
        assert not positive.all()
        assert positive.any()

        # Mixed element types fall back to 'object' dtype.
        mixed = Series(["abc", True])
        assert "abc" == mixed.any()  # 'abc' || True => 'abc'
Beispiel #10
0
    def test_all_any(self):
        """Truthiness reductions: boolean Series, then implicit 'object' dtype."""
        ts = tm.makeTimeSeries()
        is_positive = ts > 0
        assert not is_positive.all()
        assert is_positive.any()

        # A string mixed with a bool gives 'object' dtype.
        mixed = Series(['abc', True])
        assert 'abc' == mixed.any()  # 'abc' || True => 'abc'
    def test_any_all_datetimelike(self):
        # GH#38723 these may not be the desired long-term behavior (GH#34479)
        #  but in the interim should be internally consistent
        dta = date_range("1995-01-02", periods=3)._data
        ser = Series(dta)
        df = DataFrame(ser)

        assert dta.all()
        assert dta.any()

        assert ser.all()
        assert ser.any()

        assert df.any().all()
        assert df.all().all()

        dta = dta.tz_localize("UTC")
        ser = Series(dta)
        df = DataFrame(ser)

        assert dta.all()
        assert dta.any()

        assert ser.all()
        assert ser.any()

        assert df.any().all()
        assert df.all().all()

        tda = dta - dta[0]
        ser = Series(tda)
        df = DataFrame(ser)

        assert tda.any()
        assert not tda.all()

        assert ser.any()
        assert not ser.all()

        assert df.any().all()
        assert not df.all().any()
Beispiel #12
0
def test_any_non_keyword_deprecation():
    """Positional arguments to DataFrame.any / Series.any emit a FutureWarning."""
    msg = (
        "In a future version of pandas all arguments of "
        "DataFrame.any and Series.any will be keyword-only."
    )

    frame = DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = frame.any("index", None)
    tm.assert_series_equal(result, Series({"A": True, "B": True, "C": False}))

    ser = Series([False, False, False])
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = ser.any("index")
    tm.assert_equal(result, False)
Beispiel #13
0
def threshold_cluster(Data_set, threshold):
    """Greedily cluster 1-D values that lie within ``threshold`` of a seed.

    The input is flattened (C order); the first unassigned value seeds a
    cluster that absorbs every remaining value within ``threshold`` of it,
    and the process repeats until nothing is left.

    :param Data_set: array-like of numeric values; flattened before use
    :param threshold: max absolute distance from the seed for membership
    :return: ``(index_list, class_k)`` — per cluster, the flat positions and
        the corresponding values.
    """
    stand_array = np.asarray(Data_set).ravel('C')
    stand_Data = Series(stand_array)
    index_list, class_k = [], []
    # BUG FIX: the original looped ``while stand_Data.any()``, which tests the
    # *values* for truthiness, not emptiness — so falsy leftovers (e.g. 0.0)
    # were silently never clustered. Loop until the Series is actually empty.
    while not stand_Data.empty:
        if len(stand_Data) == 1:
            # A single remaining value forms its own cluster.
            index_list.append(list(stand_Data.index))
            class_k.append(list(stand_Data))
            stand_Data = stand_Data.drop(stand_Data.index)
        else:
            # Seed the cluster with the first unassigned value.
            class_data_index = stand_Data.index[0]
            class_data = stand_Data[class_data_index]
            stand_Data = stand_Data.drop(class_data_index)
            within = abs(stand_Data - class_data) <= threshold
            if within.any():
                # Absorb every remaining value within threshold of the seed.
                args_data = stand_Data[within]
                stand_Data = stand_Data.drop(args_data.index)
                index_list.append([class_data_index] + list(args_data.index))
                class_k.append([class_data] + list(args_data))
            else:
                index_list.append([class_data_index])
                class_k.append([class_data])
    return index_list, class_k
Beispiel #14
0
    def load(self, datelist, tracklist=[], path=''):
        """
        load all JCapper Race files (aka past performance files) for the specified dates
        :param datelist: list of dates
        :param tracklist: list of x8 track symbols e.g. DMR, etc
        :param path: str, if given, load data from given directory, otherwise, load data directly from s3

        Side effects: populates self.dfraw (raw concatenated CSV rows) and
        self.df (renamed columns plus derived race/runner fields), then calls
        self._index_pp_columns(), self._index_wk_columns() and
        self._validate(datelist).
        Raises ValueError when a tracklist symbol has no jcp mapping, and
        Exception when no files match the requested dates/tracks.

        NOTE(review): tracklist=[] is a mutable default argument; it is only
        re-bound (never mutated) here so it is harmless, but a None default
        would be safer.
        """

        # Schema file maps raw column positions to field names.
        schema_filepath = os.path.join(os.path.dirname(__file__),
                                       'schema_pastperformance.csv')
        columnDict = read_csv(schema_filepath)['field_name'].to_dict()

        # convert tracklist symbols to jcp track symbols
        tracklist = Series(tracklist).map(self.map_track_x8_to_jcp)
        if tracklist.isnull().any():
            raise ValueError(
                'tracklist must be list of x8 track symbols in track_detail: \n%s'
                % tracklist)

        # raw data
        self.dfraw = DataFrame()

        if not path:
            # S3 branch: list and read each day's files from the x8 bucket.
            # load each date and concat to the master raw df
            for d in datelist:
                # load the DataFrame for this date, e.g. DMR0831F.TXT for 2017-08-31
                year = d.strftime('%Y')
                month = d.strftime('%m')
                day = d.strftime('%d')
                # skip Christmas, no jcapper file
                if month == '12' and day in ['24', '25']:
                    continue
                key = 'x8-bucket/jcapper/%s/%s/%s/' % (year, month, day)
                s3_files = self.s3.ls(
                    key
                )  # list of all files in a given direcetory - in this case, all files for a single day
                # filter for .jcp files, in this case for non Chart Files
                # NOTE(review): chart files are identified by 'F' as the 5th
                # char from the end of the filename — confirm naming scheme.
                s3_files = [
                    os.path.basename(fp) for fp in s3_files if fp[-5] != 'F'
                ]
                # filter tracks
                if len(tracklist) > 0:
                    s3_files = [
                        fp for fp in s3_files if fp[:3] in list(tracklist)
                    ]
                # One file per track: keep the first occurrence of each
                # 3-char track prefix.
                idx_s3files = Series([n[:3] for n in s3_files
                                      ]).drop_duplicates().index
                if self.verbose:
                    print('pp.load(%s) loading %s race cards..' %
                          (d.strftime('%Y-%m-%d'), len(idx_s3files)))
                # load all past performance files for given date, track is no longer a condition
                for i in idx_s3files:
                    fp = os.path.join(key, s3_files[i])
                    if fp[-3:] == 'jcp':
                        df = read_csv(self.s3.open(fp, mode='rb'),
                                      header=None,
                                      encoding='ISO-8859-1')
                    else:
                        # Non-.jcp files are assumed to be zip-compressed CSVs.
                        df = read_csv(self.s3.open(fp, mode='rb'),
                                      header=None,
                                      compression='zip',
                                      encoding='ISO-8859-1')

                    # concat in the master df
                    self.dfraw = concat([self.dfraw, df])
        else:
            # Local-filesystem branch: mirror of the S3 logic above.
            # load each date and concat to the master raw df
            for d in datelist:
                # load the DataFrame for this date, e.g. DMR0831F.TXT for 2017-08-31
                year = d.strftime('%Y')
                month = d.strftime('%m')
                day = d.strftime('%d')
                # skip Christmas, no jcapper file
                if month == '12' and day in ['24', '25']:
                    continue
                path_day = os.path.join(path, 'jcapper', year, month, day)
                files = os.listdir(
                    path_day
                )  # list of all files in a given direcetory - in this case, all files for a single day
                # filter tracks
                if tracklist.any():
                    files = [
                        fp for fp in files
                        if os.path.basename(fp)[:3] in list(tracklist)
                    ]
                if self.verbose:
                    print('pp.load(%s) loading %s race cards..' %
                          (d.strftime('%Y-%m-%d'), len(files)))
                # load all past performance files for given date, track is no longer a condition
                for fp in files:
                    # filter for .jcp files, in this case for non Chart Files
                    if fp[-5] != 'F':
                        fp = os.path.join(path_day, fp)
                        if fp[-3:] == 'jcp':
                            df = read_csv(fp,
                                          header=None,
                                          encoding='ISO-8859-1')
                        else:
                            df = read_csv(fp,
                                          header=None,
                                          compression='zip',
                                          encoding='ISO-8859-1')

                        # concat in the master df
                        self.dfraw = concat([self.dfraw, df])

        try:
            # copy a subset of columns and replace the header names (numbers to text)
            cols = list(columnDict.keys())
            self.df = self.dfraw[cols].copy()
        except KeyError:
            raise Exception(
                'No files available for given datelist and tracklist.')

        # column names
        self.df.rename(columns=columnDict, inplace=True)

        # normalize track sym and make race_id
        self.df['x8_track_sym'] = self.df['jcp_track_sym'].map(
            self.map_track_jcp_to_x8)
        # adding itsp_track_sym here so that we can filter for bettable tracks in daily races
        self.df['itsp_track_sym'] = self.df['x8_track_sym'].map(
            self.map_track_x8_to_itsp)

        # drop rows where we are missing jcp symbol mapping in track detail if any
        x8_isnull = self.df['x8_track_sym'].isnull()
        if x8_isnull.any():
            missing_jcp_symbols = self.df[x8_isnull]['jcp_track_sym'].unique()
            warn(
                'pp.load() track_detail.csv is missing jcp symbols: %s\nDropping all rows with missing symbols'
                % missing_jcp_symbols)
            self.df = self.df[~x8_isnull]
            print('pp.load() dropping %s rows' % x8_isnull.sum())

        # convert dates and validate
        # race_time is HHMM stored as a float; nulls are flagged then
        # defaulted to 1000.0 (10:00) so the datetime parse cannot fail.
        self.df['race_time_flag'] = self.df['race_time'].isnull(
        )  # flag bad race_time values (sometimes is null)
        self.df['race_time'] = to_datetime(
            self.df['date'].astype(str) +
            self.df['race_time'].fillna(1000.0).astype(int).astype(str),
            format='%Y%m%d%H%M')
        # NOTE(review): fixed +8h / +3h offsets presumably convert a
        # track-local (Pacific) time to UTC and Toronto time without DST
        # handling — confirm intended timezone semantics.
        self.df['race_time_utc'] = self.df['race_time'].map(
            lambda x: x + timedelta(hours=8))
        self.df['race_time_toronto'] = self.df['race_time'].map(
            lambda x: x + timedelta(hours=3))
        self.df['date'] = to_datetime(self.df['date'], format='%Y%m%d')
        self.df['date_str'] = self.df['date'].dt.strftime('%Y%m%d')
        self._birthdate_columns()

        # clean nans in wk date cols
        # Workout dates: seed the first column with race date, forward-fill
        # across columns, then parse YYYYMMDD ints into datetimes.
        fields_wk_date = [
            c for c in self.df.columns if c.startswith('wk_date')
        ]
        self.df['wk_date_1'].fillna(self.df['date_str'], inplace=True)
        self.df[fields_wk_date] = self.df[fields_wk_date].fillna(
            method='ffill', axis=1)
        self.df[fields_wk_date] = self.df[fields_wk_date].applymap(
            lambda x: to_datetime(str(int(x)), format='%Y%m%d'))

        # clean nans in pp date cols
        # Past-performance dates: same seed/ffill/parse treatment.
        fields_pp_date = [
            c for c in self.df.columns if c.startswith('pp_date')
        ]
        self.df['pp_date_0'].fillna(self.df['date_str'], inplace=True)
        self.df[fields_pp_date] = self.df[fields_pp_date].fillna(
            method='ffill', axis=1)
        self.df[fields_pp_date] = self.df[fields_pp_date].applymap(
            lambda x: to_datetime(str(int(x)), format='%Y%m%d'))

        # race_id = track_date_racenum; runner_id appends the program number.
        # An 'A' in the program number marks a coupled betting entry.
        self.df['race_id'] = self.df['x8_track_sym'] + '_' + self.df[
            'date_str'] + '_' + self.df['race_race_num'].astype(str)
        self.df['runner_program_number'] = self.df[
            'runner_program_number'].map(str)
        self.df['betting_interest'] = self.df[
            'runner_program_number'].str.strip('A')
        self.df['coupled'] = self.df['runner_program_number'].str.count(
            'A').astype(bool)
        self.df['coupled_race'] = self.df.groupby(
            'race_id')['coupled'].transform('any')
        self.df['runner_id'] = self.df['race_id'] + '_' + self.df[
            'runner_program_number']

        # additional time index data and day of week for seasonality
        self.df['month'] = self.df['date'].map(lambda x: x.month)
        self.df['weekday'] = self.df['date'].map(lambda x: x.strftime('%A'))
        self.df['year'] = self.df['date'].map(lambda x: x.year)
        self.df['weeknum'] = self.df['date'].map(lambda x: x.strftime('%w'))
        # normalize horse name
        self.df['x8name'] = self.df['name'].map(self._normalize_name)
        self.df['x8country'] = self.df['name'].map(self._country_from_name)

        # convert pp_track and wk_track columns to x8 symbol
        # (unknown chart symbols map to None via dict.get)
        fields_pp_track = [
            c for c in self.df.columns if c.startswith('pp_track_')
        ]
        self.df[fields_pp_track] = self.df[fields_pp_track].applymap(
            lambda x: self.map_track_chart_to_x8.get(x))
        fields_wk_track = [
            c for c in self.df.columns if c.startswith('wk_track_')
        ]
        self.df[fields_wk_track] = self.df[fields_wk_track].applymap(
            lambda x: self.map_track_chart_to_x8.get(x))

        # make dataframes for historical pp columns and wk columns that are multiindexed by date
        self._index_pp_columns()
        self._index_wk_columns()

        # validate df
        self._validate(datelist)