Example #1
    def test_series_map_box_timestamps(self):
        # GH#2689, GH#2627
        ser = Series(pd.date_range('1/1/2000', periods=10))

        def func(x):
            return (x.hour, x.day, x.month)

        # it works!
        ser.map(func)
        ser.apply(func)
Example #2
    def test_series_map_box_timedelta(self):
        # GH 11349
        s = Series(timedelta_range('1 day 1 s', periods=5, freq='h'))

        def f(x):
            return x.total_seconds()

        s.map(f)
        s.apply(f)
        DataFrame(s).applymap(f)
Example #3
    def test_apply_same_length_inference_bug(self):
        s = Series([1, 2])
        f = lambda x: (x, x + 1)

        result = s.apply(f)
        expected = s.map(f)
        assert_series_equal(result, expected)

        s = Series([1, 2, 3])
        result = s.apply(f)
        expected = s.map(f)
        assert_series_equal(result, expected)
Example #4
class Map(object):

    params = ['dict', 'Series']
    param_names = ['mapper']

    def setup(self, mapper):
        map_size = 1000
        map_data = Series(map_size - np.arange(map_size))
        self.map_data = map_data if mapper == 'Series' else map_data.to_dict()
        self.s = Series(np.random.randint(0, map_size, 10000))

    def time_map(self, mapper):
        self.s.map(self.map_data)
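This is an airspeed velocity (asv) benchmark class: asv calls setup once per entry in params and then times time_map, comparing Series.map against a dict mapper and a Series mapper. A rough standalone sketch of the same measurement using only the standard library's timeit (the repetition count here is an arbitrary choice):

import timeit

import numpy as np
from pandas import Series

map_size = 1000
map_data = Series(map_size - np.arange(map_size))
s = Series(np.random.randint(0, map_size, 10000))

# Time Series.map against both mapper types used by the benchmark above.
for mapper in (map_data, map_data.to_dict()):
    elapsed = timeit.timeit(lambda: s.map(mapper), number=100)
    print(type(mapper).__name__, elapsed)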
Example #5
    def test_series_frame_radd_bug(self):
        from pandas.util.testing import rands

        # GH 353
        vals = Series([rands(5) for _ in range(10)])
        result = 'foo_' + vals
        expected = vals.map(lambda x: 'foo_' + x)
        assert_series_equal(result, expected)

        frame = DataFrame({'vals' : vals})
        result = 'foo_' + frame
        expected = DataFrame({'vals' : vals.map(lambda x: 'foo_' + x)})
        tm.assert_frame_equal(result, expected)
Example #6
    def test_series_frame_radd_bug(self):
        import operator

        # GH 353
        vals = Series(tm.rands_array(5, 10))
        result = 'foo_' + vals
        expected = vals.map(lambda x: 'foo_' + x)
        assert_series_equal(result, expected)

        frame = DataFrame({'vals': vals})
        result = 'foo_' + frame
        expected = DataFrame({'vals': vals.map(lambda x: 'foo_' + x)})
        tm.assert_frame_equal(result, expected)

        # really raise this time
        self.assertRaises(TypeError, operator.add, datetime.now(), self.ts)
Example #7
    def test_all_methods(self):
        x_cols = ["Lag2"]
        formula = "Direction~Lag2"
        # print self.df.shape[0]
        train_data = self.df.loc[(self.df["Year"] >= 1990) & (self.df["Year"] <= 2008), :]
        # print train_data.shape[0]
        """ (d) logistic"""
        model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
        result = model.fit()
        test_data = self.df.loc[self.df["Year"] > 2008, :]
        probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
        pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
        tp.output_table(pred_values.values, test_data[self.y_col].values)

        train_X = train_data[x_cols].values
        train_y = train_data[self.y_col].values
        test_X = test_data[x_cols].values
        test_y = test_data[self.y_col].values
        """ (e) LDA """
        lda_res = LDA().fit(train_X, train_y)
        pred_y = lda_res.predict(test_X)
        tp.output_table(pred_y, test_y)
        """ (f) QDA """
        qda_res = QDA().fit(train_X, train_y)
        pred_y = qda_res.predict(test_X)
        tp.output_table(pred_y, test_y)
        """ (g) KNN """
        clf = neighbors.KNeighborsClassifier(1, weights="uniform")
        clf.fit(train_X, train_y)
        pred_y = clf.predict(test_X)
        tp.output_table(pred_y, test_y)
        """ (h) logistic and LDA """
        """ (i) Is the purpose of the last question going through all methods with no direction?"""
Example #8
    def test_rolling_max_how_resample(self):

        indices = [datetime(1975, 1, i) for i in range(1, 6)]
        # So that we can have 3 datapoints on last day (4, 10, and 20)
        indices.append(datetime(1975, 1, 5, 1))
        indices.append(datetime(1975, 1, 5, 2))
        series = Series(list(range(0, 5)) + [10, 20], index=indices)
        # Use floats instead of ints as values
        series = series.map(lambda x: float(x))
        # Sort chronologically
        series = series.sort_index()

        # Default how should be max
        expected = Series([0.0, 1.0, 2.0, 3.0, 20.0],
                          index=[datetime(1975, 1, i, 0)
                                 for i in range(1, 6)])
        x = mom.rolling_max(series, window=1, freq='D')
        assert_series_equal(expected, x)

        # Now specify median (10.0)
        expected = Series([0.0, 1.0, 2.0, 3.0, 10.0],
                          index=[datetime(1975, 1, i, 0)
                                 for i in range(1, 6)])
        x = mom.rolling_max(series, window=1, freq='D', how='median')
        assert_series_equal(expected, x)

        # Now specify mean (4+10+20)/3
        v = (4.0+10.0+20.0)/3.0
        expected = Series([0.0, 1.0, 2.0, 3.0, v],
                          index=[datetime(1975, 1, i, 0)
                                 for i in range(1, 6)])
        x = mom.rolling_max(series, window=1, freq='D', how='mean')
        assert_series_equal(expected, x)
Example #9
 def logistic_regression(self, use_glm=True):
     """
     (b) it seems the only statistically significant predictor variable is Lag2. How disappointing...
     """
     formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
     model = (
         smf.glm(formula, data=self.df, family=sm.families.Binomial())
         if use_glm
         else smf.logit(formula, data=self.transformedDF)
     )
     result = model.fit()
     if use_glm:
         probs = result.fittedvalues
         """Beware the prob here is the index 0's prob, so we should use the lambda function below"""
         pred_values = probs.map(lambda x: 0 if x > 0.5 else 1)
     else:
         """The probability of being 1"""
         probs = Series(result.predict(sm.add_constant(self.df[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]])))
         pred_values = probs.map(lambda x: 1 if x > 0.5 else 0)
     """
     (c) Percentage of correct predictions: (54+557)/(54+557+48+430) = 56.1%.
         In weeks when the market goes up, the logistic regression is right most of the time: 557/(557+48) = 92.1%.
         In weeks when the market goes down, it is right only 54/(430+54) = 11.2% of the time.
     """
     tp.output_table(pred_values.values, self.transformedDF[self.y_col].values)
Example #10
 def test_map_defaultdict(self):
     s = Series([1, 2, 3], index=['a', 'b', 'c'])
     default_dict = defaultdict(lambda: 'blank')
     default_dict[1] = 'stuff'
     result = s.map(default_dict)
     expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c'])
     assert_series_equal(result, expected)
Example #11
def extract_bday_feats_n_heads(series, modality, field, stat_type, tr_type):
    """
    "bussiness day or not" conditioning feature extraction
    :return: feature name list, feature value list
    """
    if series is None or len(series) == 0:
        b_day_heads, b_day_feats = extract_basic_feats_n_heads(None, modality, field, stat_type, tr_type)
        nb_day_heads, nb_day_feats = extract_basic_feats_n_heads(None, modality, field, stat_type, tr_type)
    else:
        cal = SouthKorea()
        time_stamp_series = Series(series.index.tolist())
        unique_dates = time_stamp_series.map(lambda x: x.date()).unique()
        nb_day_series = None
        b_day_series = None
        for date in unique_dates:
            if cal.is_holiday(date) is False and date.weekday() < 5:
                if b_day_series is None:
                    b_day_series = series[series.index.date == date]
                else:
                    b_day_series = b_day_series.append(series[series.index.date == date])
            else:
                if nb_day_series is None:
                    nb_day_series = series[series.index.date == date]
                else:
                    nb_day_series = nb_day_series.append(series[series.index.date == date])
        b_day_heads, b_day_feats = extract_basic_feats_n_heads(b_day_series, modality, field, stat_type, tr_type)
        nb_day_heads, nb_day_feats = extract_basic_feats_n_heads(nb_day_series, modality, field, stat_type, tr_type)
    heads = list(map(lambda x: '%s_%s' % (feat.BSS_DAY, x), b_day_heads)) + list(
        map(lambda x: '%s_%s' % (feat.NON_BSS_DAY, x), nb_day_heads))
    values = b_day_feats + nb_day_feats
    return heads, values
Example #12
 def test_map_counter(self):
     s = Series(['a', 'b', 'c'], index=[1, 2, 3])
     counter = Counter()
     counter['b'] = 5
     counter['c'] += 1
     result = s.map(counter)
     expected = Series([0, 5, 1], index=[1, 2, 3])
     assert_series_equal(result, expected)
Example #13
 def test_map_dict_subclass_without_missing(self):
     class DictWithoutMissing(dict):
         pass
     s = Series([1, 2, 3])
     dictionary = DictWithoutMissing({3: 'three'})
     result = s.map(dictionary)
     expected = Series([np.nan, np.nan, 'three'])
     assert_series_equal(result, expected)
Example #14
 def test_parse_dates_combine(self):
     raw_dates = Series(date_range('1/1/2001', periods=10))
     df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
                     'time': raw_dates.map(lambda x: str(x.time()))})
     res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
                          index_col=1)
     newdf = DataFrame({'datetime': raw_dates})
     tm.assert_frame_equal(newdf, res[0])
Example #15
    def test_encode_decode_errors(self):
        encodeBase = Series([u"a", u"b", u"a\x9d"])

        self.assertRaises(UnicodeEncodeError, encodeBase.str.encode, "cp1252")

        f = lambda x: x.encode("cp1252", "ignore")
        result = encodeBase.str.encode("cp1252", "ignore")
        exp = encodeBase.map(f)
        tm.assert_series_equal(result, exp)

        decodeBase = Series(["a", "b", "a\x9d"])

        self.assertRaises(UnicodeDecodeError, decodeBase.str.decode, "cp1252")

        f = lambda x: x.decode("cp1252", "ignore")
        result = decodeBase.str.decode("cp1252", "ignore")
        exp = decodeBase.map(f)

        tm.assert_series_equal(result, exp)
Example #16
    def test_map_int(self):
        left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4})
        right = Series({1: 11, 2: 22, 3: 33})

        assert left.dtype == np.float_
        assert issubclass(right.dtype.type, np.integer)

        merged = left.map(right)
        assert merged.dtype == np.float_
        assert isna(merged['d'])
        assert not isna(merged['c'])
Example #17
    def test_map(self):
        index, data = tm.getMixedTypeDict()

        source = Series(data['B'], index=data['C'])
        target = Series(data['C'][:4], index=data['D'][:4])

        merged = target.map(source)

        for k, v in merged.iteritems():
            self.assertEqual(v, source[target[k]])

        # input could be a dict
        merged = target.map(source.to_dict())

        for k, v in merged.iteritems():
            self.assertEqual(v, source[target[k]])

        # function
        result = self.ts.map(lambda x: x * 2)
        self.assert_(np.array_equal(result, self.ts * 2))
Example #18
    def test_map_int(self):
        left = Series({'a' : 1., 'b' : 2., 'c' : 3., 'd' : 4})
        right = Series({1 : 11, 2 : 22, 3 : 33})

        self.assert_(left.dtype == np.float_)
        self.assert_(issubclass(right.dtype.type, np.integer))

        merged = left.map(right)
        self.assert_(merged.dtype == np.float_)
        self.assert_(isnull(merged['d']))
        self.assert_(not isnull(merged['c']))
Example #19
    def test_map(self):
        index, data = tm.getMixedTypeDict()

        source = Series(data['B'], index=data['C'])
        target = Series(data['C'][:4], index=data['D'][:4])

        merged = target.map(source)

        for k, v in compat.iteritems(merged):
            assert v == source[target[k]]

        # input could be a dict
        merged = target.map(source.to_dict())

        for k, v in compat.iteritems(merged):
            assert v == source[target[k]]

        # function
        result = self.ts.map(lambda x: x * 2)
        tm.assert_series_equal(result, self.ts * 2)

        # GH 10324
        a = Series([1, 2, 3, 4])
        b = Series(["even", "odd", "even", "odd"], dtype="category")
        c = Series(["even", "odd", "even", "odd"])

        exp = Series(["odd", "even", "odd", np.nan], dtype="category")
        tm.assert_series_equal(a.map(b), exp)
        exp = Series(["odd", "even", "odd", np.nan])
        tm.assert_series_equal(a.map(c), exp)

        a = Series(['a', 'b', 'c', 'd'])
        b = Series([1, 2, 3, 4],
                   index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
        c = Series([1, 2, 3, 4], index=Index(['b', 'c', 'd', 'e']))

        exp = Series([np.nan, 1, 2, 3])
        tm.assert_series_equal(a.map(b), exp)
        exp = Series([np.nan, 1, 2, 3])
        tm.assert_series_equal(a.map(c), exp)

        a = Series(['a', 'b', 'c', 'd'])
        b = Series(['B', 'C', 'D', 'E'], dtype='category',
                   index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
        c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e']))

        exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'],
                                    categories=['B', 'C', 'D', 'E']))
        tm.assert_series_equal(a.map(b), exp)
        exp = Series([np.nan, 'B', 'C', 'D'])
        tm.assert_series_equal(a.map(c), exp)
Example #20
    def test_encode_decode_errors(self):
        encodeBase = Series([u('a'), u('b'), u('a\x9d')])

        self.assertRaises(UnicodeEncodeError,
                          encodeBase.str.encode, 'cp1252')

        f = lambda x: x.encode('cp1252', 'ignore')
        result = encodeBase.str.encode('cp1252', 'ignore')
        exp = encodeBase.map(f)
        tm.assert_series_equal(result, exp)

        decodeBase = Series([b'a', b'b', b'a\x9d'])

        self.assertRaises(UnicodeDecodeError,
                          decodeBase.str.decode, 'cp1252')

        f = lambda x: x.decode('cp1252', 'ignore')
        result = decodeBase.str.decode('cp1252', 'ignore')
        exp = decodeBase.map(f)

        tm.assert_series_equal(result, exp)
Example #21
 def test_map_dict_subclass_with_missing(self):
     """
     Test Series.map with a dictionary subclass that defines __missing__,
     i.e. sets a default value (GH #15999).
     """
     class DictWithMissing(dict):
         def __missing__(self, key):
             return 'missing'
     s = Series([1, 2, 3])
     dictionary = DictWithMissing({3: 'three'})
     result = s.map(dictionary)
     expected = Series(['missing', 'missing', 'three'])
     assert_series_equal(result, expected)
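Read together with Examples #10 and #13, this pins down the rule Series.map follows for keys absent from a dict mapper: a plain dict (or a subclass without __missing__) yields NaN, while defaultdict and any subclass defining __missing__ supply their own default. A minimal sketch:

from collections import defaultdict

from pandas import Series

s = Series([1, 2, 3])
print(s.map({3: 'three'}))                                 # NaN, NaN, three
print(s.map(defaultdict(lambda: 'blank', {3: 'three'})))   # blank, blank, three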
Example #22
    def test_len(self):
        values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"])

        result = values.str.len()
        exp = values.map(lambda x: len(x) if com.notnull(x) else NA)
        tm.assert_series_equal(result, exp)

        # mixed
        mixed = Series(["a_b", NA, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0])

        rs = Series(mixed).str.len()
        xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA])

        self.assert_(isinstance(rs, Series))
        tm.assert_almost_equal(rs, xp)

        # unicode
        values = Series([u"foo", u"fooo", u"fooooo", np.nan, u"fooooooo"])

        result = values.str.len()
        exp = values.map(lambda x: len(x) if com.notnull(x) else NA)
        tm.assert_series_equal(result, exp)
Example #23
def _esd(x, max_outlier, alpha, direction):
    """
    The ESD test using median and MAD in the calculation of the test statistic.
    """
    x = Series(x)
    n = len(x)
    outlier_index = []
    for i in range(1, max_outlier + 1):
        median = x.median()
        mad = np.median([abs(value - median) for value in x]) * _MAD_CONSTANT
        if mad == 0:
            break
        if direction == 'both':
            ares = x.map(lambda value: abs(value - median) / mad)
        elif direction == 'pos':
            ares = x.map(lambda value: (value - median) / mad)
        elif direction == 'neg':
            ares = x.map(lambda value: (median - value) / mad)
        r_idx = ares.idxmax()
        r = ares[r_idx]
        if direction == 'both':
            p = 1.0 - alpha / (2 * (n - i + 1))
        else:
            p = 1.0 - alpha / (n - i + 1)
        crit = t.ppf(p, n-i-1)
        lam = (n-i)*crit / np.sqrt((n-i-1+crit**2) * (n-i+1))
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("%s/%s outlier. median=%s, mad=%s, r_idx=%s, r=%s, crit=%s, lam=%s" %
                         (i, max_outlier, median, mad, r_idx, r, crit, lam))
        if r > lam:
            outlier_index.append(r_idx)
            x = x.drop(r_idx)
        else:
            # The r keeps decreasing while lam keeps increasing. Therefore, when r is less than lam for the first time,
            # we can stop.
            break
    return outlier_index
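A minimal sketch of how this helper might be driven, assuming the module-level names it relies on: t from scipy.stats, a configured logger, and _MAD_CONSTANT (presumably the usual 1.4826 factor that rescales the MAD into a consistent estimate of the standard deviation):

import logging

from scipy.stats import t  # Student's t quantiles for the critical value

_MAD_CONSTANT = 1.4826  # assumed value; rescales the MAD to estimate sigma
logger = logging.getLogger(__name__)

# One clear spike among otherwise stable values.
data = [10.0, 10.2, 9.9, 10.1, 10.0, 42.0, 9.8]
print(_esd(data, max_outlier=2, alpha=0.05, direction='both'))  # expected: [5]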
Example #24
    def test_rolling_max_gh6297(self):
        """Replicate result expected in GH #6297"""

        indices = [datetime(1975, 1, i) for i in range(1, 6)]
        # So that we can have 2 datapoints on one of the days
        indices.append(datetime(1975, 1, 3, 6, 0))
        series = Series(range(1, 7), index=indices)
        # Use floats instead of ints as values
        series = series.map(lambda x: float(x))
        # Sort chronologically
        series = series.sort_index()

        expected = Series([1.0, 2.0, 6.0, 4.0, 5.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
        x = mom.rolling_max(series, window=1, freq="D")
        assert_series_equal(expected, x)
Example #25
    def test_len(self):
        values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo'])

        result = values.str.len()
        exp = values.map(lambda x: len(x) if com.notnull(x) else NA)
        tm.assert_series_equal(result, exp)

        # mixed
        mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(),
                        'foo', None, 1, 2.])

        rs = Series(mixed).str.len()
        xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA])

        tm.assert_isinstance(rs, Series)
        tm.assert_almost_equal(rs, xp)

        # unicode
        values = Series([u('foo'), u('fooo'), u('fooooo'), np.nan,
                         u('fooooooo')])

        result = values.str.len()
        exp = values.map(lambda x: len(x) if com.notnull(x) else NA)
        tm.assert_series_equal(result, exp)
Example #26
def test_type_promote_putmask():
    # GH8387: test that changing types does not break alignment
    ts = Series(np.random.randn(100), index=np.arange(100, 0, -1)).round(5)
    left, mask = ts.copy(), ts > 0
    right = ts[mask].copy().map(str)
    left[mask] = right
    assert_series_equal(left, ts.map(lambda t: str(t) if t > 0 else t))

    s = Series([0, 1, 2, 0])
    mask = s > 0
    s2 = s[mask].map(str)
    s[mask] = s2
    assert_series_equal(s, Series([0, '1', '2', 0]))

    s = Series([0, 'foo', 'bar', 0])
    mask = Series([False, True, True, False])
    s2 = s[mask]
    s[mask] = s2
    assert_series_equal(s, Series([0, 'foo', 'bar', 0]))
Example #27
def calcSumDistPerDay(df):
    t = df.index # Get timeseries index
    t2 = Series(t) # convert to series
    uniqueDates = t2.map(pd.Timestamp.date).unique() # get unique dates only

    uniqueList = []

    # For each datetime object in unique dates, convert to strings so we can
    # use them as dataframe indexes
    for date in uniqueDates:
        uniqueList.append(date.strftime('%Y-%m-%d'))

    dateDict = {} # New dictionary for dataframe

    for date in uniqueList:
        # Grab matching data, take sum and place in new dictionary
        dateDict[date] = df[date].sum()

    daySumDf = DataFrame(dateDict) # Turn into dataframe

    return daySumDf
Example #28
 def membership_map(s: pd.Series,
                    groups: dict,
                    fillvalue: Any = -1) -> pd.Series:
     # Reverse & expand the dictionary key-value pairs
     groups = {x: k for k, v in groups.items() for x in v}
     return s.map(groups).fillna(fillvalue)
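A hypothetical usage: groups maps each label to the collection of values belonging to it, the comprehension inverts that into a value-to-label dict, and Series.map plus fillna applies it:

import pandas as pd

s = pd.Series(['cat', 'dog', 'tulip', 'oak', 'granite'])
groups = {'animal': ['cat', 'dog'], 'plant': ['tulip', 'oak', 'fern']}
print(membership_map(s, groups, fillvalue='unknown'))
# cat and dog map to animal, tulip and oak to plant, granite to unknown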
Example #29
def compute_conditional_distribution(
    data_col, true_labs, pred_labs, as_categorical=False, binning="fd", common_bins=True
):
    """Compute a distributional summary.

    The metric is computed within unique values of the grouping column
    (categorical) or within bins partitioning its range (continuous).

    Parameters
    ----------
    data_col :
        A column of data from a test dataset.
    true_labs : Series
        A series of true labels for the test dataset.
    pred_labs : Series
        A series of labels predicted by a model for the test dataset.
    as_categorical : bool
        Should the data column be treated as categorical, i.e. binned
        on its unique values? If it is not numeric, this param is ignored.
    binning : str
        Binning scheme to use for a numerical column, passed to `numpy.histogram`.
        Can be a fixed number of bins or a string indicating a binning scheme.
    common_bins : bool
        Should the bins be computed over the entire column and shared
        across groups (`True`), or computed within each group (`False`)?

    Returns
    -------
    ConditionalDistributionResult
    """

    grouping = [true_labs, pred_labs]
    if is_discrete(data_col):
        as_categorical = True
    if as_categorical:
        grouping.append(data_col)
        distribs = data_col.groupby(grouping).size()
        if common_bins:
            # Extend the index in each label group to include all data values
            data_vals = distribs.index.get_level_values(-1).unique()
            y_vals = distribs.index.droplevel(-1).unique()
            full_ind = MultiIndex.from_tuples(
                [(yt, yp, x) for yt, yp in y_vals.values for x in data_vals],
                names=distribs.index.names,
            )
            distribs = distribs.reindex(index=full_ind, fill_value=0)
            bin_edges = Series(data_vals)
        else:
            # Convert the innermost index level to a Series of bin edges.
            bin_edges = distribs.rename(None).reset_index(level=-1).iloc[:, 0]
    else:
        if common_bins:
            bins = histogram_bin_edges(data_col, bins=binning)
        else:
            bins = binning
        # distribs will be a series with values (<hist_values>, <bin_edges>)
        distribs = data_col.groupby(grouping).apply(lambda x: histogram(x, bins=bins))
        bin_edges = distribs.map(lambda x: x[1])
        bin_ind_tuples = []
        for y in distribs.index:
            bin_ind_tuples.extend(
                [(y[0], y[1], x) for x in _histogram_bin_labels(bin_edges.loc[y])]
            )
        index_with_bins = MultiIndex.from_tuples(
            bin_ind_tuples, names=distribs.index.names + [None]
        )
        distribs = Series(
            distribs.map(lambda x: x[0]).explode().values, index=index_with_bins
        )
        if common_bins:
            # Retain the unique bin edges as an array
            bin_edges = Series(bin_edges.iloc[0])

    return ConditionalDistributionResult(
        vals=distribs,
        bins=Series(bin_edges),
        categorical=as_categorical,
        binning=binning,
        common_bins=common_bins,
    )
Example #30
 def transform(self, X: pd.Series, y=None):
     transed = X.map(self._dict) / X.count()
     return transed
Example #31
 def get_sanitized_bool_series(source: pd.Series) -> pd.Series:
     return source.map(DataframeReport.sanitize_bool, na_action='ignore')
Example #32
    def test_map_na_exclusion(self):
        s = Series([1.5, np.nan, 3, np.nan, 5])

        result = s.map(lambda x: x * 2, na_action='ignore')
        exp = s * 2
        assert_series_equal(result, exp)
Example #33
def test_map_float_to_string_precision():
    # GH 13228
    ser = Series(1 / 3)
    result = ser.map(lambda val: str(val)).to_dict()
    expected = {0: "0.3333333333333333"}
    assert result == expected
Example #34
    def convert_states(s: pd.Series):
        """ Converts df['state'] from abbrev to full"""

        return s.map(Helper.STATES)
Example #35
    'metric': 'auc',
    'num_leaves': 25,
    'learning_rate': 0.01,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'min_data_in_leaf': 5,
    'max_bin': 200,
    'verbose': 0,
}
gbm = lgb.train(params, lgb_train, num_boost_round=200)
predict = gbm.predict(X_test)
minmin = min(predict)
maxmax = max(predict)
predict = Series(predict)
vfunc_lg = predict.map(lambda x: (x - minmin) /
                       (maxmax - minmin))  # rescale the LGBM output probabilities onto the [0, 1] interval

gbm.feature_importance(importance_type='split')  # feature importances
gbm.feature_name()  # feature names

#################   XGBoost
params = {
    'booster': 'gbtree',
    'objective': 'rank:pairwise',
    'eval_metric': 'auc',
    'eta': 0.02,
    'max_depth': 5,  # 4 3
    'colsample_bytree': 0.7,  #0.8
    'subsample': 0.7,
    'min_child_weight': 1,  # 2 3
    'seed': 1111,
Example #36
 def f(series: pd.Series) -> pd.Series:
     return series.map(mapping)
Example #37
def test_map_datetimetz_na_action():
    values = date_range("2011-01-01", "2011-01-02",
                        freq="H").tz_localize("Asia/Tokyo")
    s = Series(values, name="XX")
    with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN):
        s.map(lambda x: x, na_action="ignore")
Example #38
def test_map_with_invalid_na_action_raises():
    # https://github.com/pandas-dev/pandas/issues/32815
    s = Series([1, 2, 3])
    msg = "na_action must either be 'ignore' or None"
    with pytest.raises(ValueError, match=msg):
        s.map(lambda x: x, na_action="____")
Example #39
def color_series(s: pd.Series):
    unique = s.unique()
    colors = sns.color_palette("hls", len(unique)).as_hex()  # type: ignore
    cmap = dict(zip(unique, colors))

    return s.map(cmap)
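A hypothetical call, assuming seaborn is installed: each unique value gets a stable hex color from the "hls" palette, which is convenient to pass as the c= argument of a scatter plot:

import pandas as pd

labels = pd.Series(['a', 'b', 'a', 'c'])
print(color_series(labels))  # three distinct hex strings, repeated per label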
Example #40
kirjainnumero=pd.read_table(sys.argv[2])  #the reference file goes here
yleisreferenssi=pd.read_table(sys.argv[3])#the final reference file

#Next is just munging the datafile, and the references to format we need for actually calculating something.
#First make the ID column of the datafile into actual ID, and not a column of data, and get rid of the data
#formatted column of the ID column from the datafile, and some other changes that are needed using the 
#functions defined above.
datafile.index=datafile['ID'].values
datafile=datafile.drop('ID', axis=1)
datafile=datafile.applymap(remove_asterisks)
#We replace missing values denoted by '*X' in the original .csv given by filling them from the cell to the left 
datafile=datafile.replace(to_replace='X', value=np.nan)
datafile=datafile.fillna(method='ffill', axis=1)
#Get the relevant part of the reference file 1, and make it appropriate for further use
refsarja=Series(np.array(yleisreferenssi['IMGT/HLA 3.9.0 Allele Name']),index=np.array(yleisreferenssi['Locus']))
refsarja=refsarja.map(remove_names)
#As munging procedure we create a series with appropriate indexing to prevent looping and difficulties in future
kirjainnrosarja=Series(np.array(kirjainnumero['SUBTYPE']),index=np.array(kirjainnumero['CODE']))
#Split the values in original data 
datafile=datafile.applymap(splitframe)
#After getting the data read, and in appropriate format, we need to take care of using the correct Loci.
#We do this by splitting the column names(see requirements for the column names in the original datafile)
def get_loci(dataframe):
    lista=[]
    for value in dataframe.columns.values:
        lista.append(value.split(':')[0])  # append the locus name; extend would add it character by character
    return lista
result=pd.DataFrame(index=datafile.index)
for column in datafile.columns:
    lah=[]
    for num, value in datafile[column]:
Example #41
def get_h3_centroids(hex_column: Series) -> List:
    centroid_lat_lon = hex_column.map(lambda hex: h3_to_geo(hex))
    return [Point(geom[1], geom[0]) for geom in centroid_lat_lon]
Example #42
def test_encode_decode():
    ser = Series(["a", "b", "a\xe4"]).str.encode("utf-8")
    result = ser.str.decode("utf-8")
    expected = ser.map(lambda x: x.decode("utf-8"))
    tm.assert_series_equal(result, expected)
Example #43
 def test_map_type_inference(self):
     s = Series(lrange(3))
     s2 = s.map(lambda x: np.where(x == 0, 0, 1))
     assert issubclass(s2.dtype.type, np.integer)
Example #44
    def test_map_empty(self, index):
        s = Series(index)
        result = s.map({})

        expected = pd.Series(np.nan, index=s.index)
        tm.assert_series_equal(result, expected)
Example #45
 def __init__(self, y: Series, y_uncleaned: Series):
     super().__init__(y=y, y_uncleaned=y_uncleaned)
     self.label_cleaner_binary = LabelCleanerBinary(y=y.map(self.inv_map))
     self.problem_type_transform = self.label_cleaner_binary.problem_type_transform
Example #46
 def test_map_compat(self):
     # related GH 8024
     s = Series([True, True, False], index=[1, 2, 3])
     result = s.map({True: "foo", False: "bar"})
     expected = Series(["foo", "foo", "bar"], index=[1, 2, 3])
     tm.assert_series_equal(result, expected)
Example #47
def preprocessing(texts: pd.Series):
  texts = texts.map(basic_preprocessing.expand_contractions)
  texts = texts.map(basic_preprocessing.remove_special_characters)
  print("Basic preprocessing completed on {} reviews.".format(len(texts)))
  return texts
Example #48
 def test_map_compat(self):
     # related GH 8024
     s = Series([True, True, False], index=[1, 2, 3])
     result = s.map({True: 'foo', False: 'bar'})
     expected = Series(['foo', 'foo', 'bar'], index=[1, 2, 3])
     assert_series_equal(result, expected)
Example #49
 def transform(self, s: pd.Series):
     _ = s.map(self.mapper).fillna(0)  # fill values unseen in the training set with 0
     return _
Example #50
 def test_map_type_inference(self):
     s = Series(lrange(3))
     s2 = s.map(lambda x: np.where(x == 0, 0, 1))
     assert issubclass(s2.dtype.type, np.integer)
Example #51
def fix_na(col: pd.Series) -> pd.Series:
    return col.map(_fix_na)
Example #52
    def test_map_na_exclusion(self):
        s = Series([1.5, np.nan, 3, np.nan, 5])

        result = s.map(lambda x: x * 2, na_action='ignore')
        exp = s * 2
        assert_series_equal(result, exp)
Example #53
def fix_sex(sex_col: pd.Series) -> pd.Series:
    """Fixes various ways of spelling male/female."""
    return sex_col.map(_fix_sex)
Example #54
 def column_tokenizer(s: pd.Series):
     return s.map(self.tokenization_fn)
Example #55
    def _encode(self, feature: CalendarFeature, timeseries: pd.Series):
        """ Encode a specific feature numerical given a pandas series timeseries.

        :param feature: Feature to calculate (e.g. year, month, weekday, ...)
        :type feature: str
        :param timeseries: Datetime[ns] timeseries as pandas Series (fast and easy map method)
        :type timeseries: pd.Series
        """
        if feature == CalendarFeature.year:
            return timeseries.map(lambda element: element.year)
        elif feature == CalendarFeature.month:
            return timeseries.map(lambda element: element.month - 1)
        elif feature == CalendarFeature.day:
            return timeseries.map(lambda element: element.day - 1)
        elif feature == CalendarFeature.weekday:
            return timeseries.map(lambda element: element.weekday())
        elif feature == CalendarFeature.hour:
            return timeseries.map(lambda element: element.hour)
        elif feature == CalendarFeature.weekend:
            return timeseries.map(lambda element: (element.weekday() >= 5) * 1)
        elif feature == CalendarFeature.workday:
            return timeseries.map(lambda element:
                                  (self.calendar.is_working_day(element)) * 1)
        elif feature == CalendarFeature.holiday:
            return timeseries.map(lambda element:
                                  (self.calendar.is_holiday(element)) * 1)
        elif feature == CalendarFeature.monday:
            return timeseries.map(lambda element: element.weekday() == 0)
        elif feature == CalendarFeature.tuesday:
            return timeseries.map(lambda element: element.weekday() == 1)
        elif feature == CalendarFeature.wednesday:
            return timeseries.map(lambda element: element.weekday() == 2)
        elif feature == CalendarFeature.thursday:
            return timeseries.map(lambda element: element.weekday() == 3)
        elif feature == CalendarFeature.friday:
            return timeseries.map(lambda element: element.weekday() == 4)
        elif feature == CalendarFeature.saturday:
            return timeseries.map(lambda element: element.weekday() == 5)
        elif feature == CalendarFeature.sunday:
            return timeseries.map(lambda element: element.weekday() == 6)
        elif feature == CalendarFeature.month_sine:
            return timeseries.map(
                lambda element: np.sin(np.pi * 2 * (element.month - 1) / 11))
        elif feature == CalendarFeature.day_sine:
            return timeseries.map(lambda element: np.sin(np.pi * 2 * (
                element.day - 1) / element.days_in_month))
        elif feature == CalendarFeature.weekday_sine:
            return timeseries.map(
                lambda element: np.sin(np.pi * 2 * element.weekday() / 6))
        elif feature == CalendarFeature.hour_sine:
            return timeseries.map(
                lambda element: np.sin(np.pi * 2 * element.hour / 23))
        elif feature == CalendarFeature.month_cos:
            return timeseries.map(
                lambda element: np.cos(np.pi * 2 * (element.month - 1) / 11))
        elif feature == CalendarFeature.day_cos:
            return timeseries.map(lambda element: np.cos(np.pi * 2 * (
                element.day - 1) / element.days_in_month))
        elif feature == CalendarFeature.weekday_cos:
            return timeseries.map(
                lambda element: np.cos(np.pi * 2 * (element.weekday()) / 6))
        elif feature == CalendarFeature.hour_cos:
            return timeseries.map(lambda element: np.cos(np.pi * 2 *
                                                         (element.hour) / 23))
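The sine/cosine branches above are a cyclical encoding: mapping a periodic feature onto points of a circle keeps December and January close together, which a plain ordinal month number does not. A standalone sketch of just the month pair, using the same formulas outside the class:

import numpy as np
import pandas as pd

ts = pd.Series(pd.date_range('2021-01-01', periods=12, freq='MS'))
month_sine = ts.map(lambda element: np.sin(np.pi * 2 * (element.month - 1) / 11))
month_cos = ts.map(lambda element: np.cos(np.pi * 2 * (element.month - 1) / 11))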
Example #56
 def _inverse_transform(self, y: Series) -> Series:
     y = y.map(self.cat_mappings_dependent_var)
     return y
Example #57
def is_mixed_type(ser: pd.Series) -> bool:
    """Determines whether the column has mixed types in it."""
    return ser.map(lambda x: type(x)).nunique() > 1 if ser.dtype == object else False
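A quick illustration: the check only fires for object-dtype columns that hold more than one Python type:

import pandas as pd

print(is_mixed_type(pd.Series([1, 'a', 3.0])))  # True: int, str and float
print(is_mixed_type(pd.Series(['a', 'b'])))     # False: a single type
print(is_mixed_type(pd.Series([1, 2, 3])))      # False: int64 dtype, not object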
Example #58
 def _transform(self, y: Series) -> Series:
     y = y.map(self.inv_map)
     return y
Example #59
def collection_language_model(totals: Series):
    doc_len = totals['doc_length']
    return totals.map(lambda x: x / doc_len)
Example #60
def test_map_missing_mixed(vals, mapping, exp):
    # GH20495
    s = Series(vals + [np.nan])
    result = s.map(mapping)

    tm.assert_series_equal(result, Series(exp))