def test_series_map_box_timestamps(self):
    # GH#2689, GH#2627
    ser = Series(pd.date_range('1/1/2000', periods=10))

    def func(x):
        return (x.hour, x.day, x.month)

    # it works!
    ser.map(func)
    ser.apply(func)
def test_series_map_box_timedelta(self):
    # GH 11349
    s = Series(timedelta_range('1 day 1 s', periods=5, freq='h'))

    def f(x):
        return x.total_seconds()

    s.map(f)
    s.apply(f)
    DataFrame(s).applymap(f)
def test_apply_same_length_inference_bug(self):
    s = Series([1, 2])
    f = lambda x: (x, x + 1)

    result = s.apply(f)
    expected = s.map(f)
    assert_series_equal(result, expected)

    s = Series([1, 2, 3])
    result = s.apply(f)
    expected = s.map(f)
    assert_series_equal(result, expected)
class Map(object):

    params = ['dict', 'Series']
    param_names = ['mapper']  # asv expects a list of names, one per param dimension

    def setup(self, mapper):
        map_size = 1000
        map_data = Series(map_size - np.arange(map_size))
        self.map_data = map_data if mapper == 'Series' else map_data.to_dict()
        self.s = Series(np.random.randint(0, map_size, 10000))

    def time_map(self, mapper):
        self.s.map(self.map_data)
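A hedged aside, not part of the benchmark file above: a minimal sketch (assuming only numpy and pandas) showing that the two mapper kinds the Map benchmark parametrizes over produce identical results, so only the lookup path differs.

import numpy as np
import pandas as pd

# The Series' index plays the role of the dict keys during lookup.
s = pd.Series(np.random.randint(0, 10, 100))
lookup = pd.Series(10 - np.arange(10))
assert s.map(lookup).equals(s.map(lookup.to_dict()))  # same output either way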
def test_series_frame_radd_bug(self):
    from pandas.util.testing import rands

    # GH 353
    vals = Series([rands(5) for _ in xrange(10)])
    result = 'foo_' + vals
    expected = vals.map(lambda x: 'foo_' + x)
    assert_series_equal(result, expected)

    frame = DataFrame({'vals': vals})
    result = 'foo_' + frame
    expected = DataFrame({'vals': vals.map(lambda x: 'foo_' + x)})
    tm.assert_frame_equal(result, expected)
def test_series_frame_radd_bug(self):
    import operator

    # GH 353
    vals = Series(tm.rands_array(5, 10))
    result = 'foo_' + vals
    expected = vals.map(lambda x: 'foo_' + x)
    assert_series_equal(result, expected)

    frame = DataFrame({'vals': vals})
    result = 'foo_' + frame
    expected = DataFrame({'vals': vals.map(lambda x: 'foo_' + x)})
    tm.assert_frame_equal(result, expected)

    # really raise this time
    self.assertRaises(TypeError, operator.add, datetime.now(), self.ts)
def test_all_methods(self):
    x_cols = ["Lag2"]
    formula = "Direction~Lag2"
    # print self.df.shape[0]
    train_data = self.df.ix[(self.df["Year"] >= 1990) & (self.df["Year"] <= 2008), :]
    # print train_data.shape[0]

    """ (d) logistic """
    model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
    result = model.fit()
    test_data = self.df.ix[self.df["Year"] > 2008, :]
    probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
    pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
    tp.output_table(pred_values.values, test_data[self.y_col].values)

    train_X = train_data[x_cols].values
    train_y = train_data[self.y_col].values
    test_X = test_data[x_cols].values
    test_y = test_data[self.y_col].values

    """ (e) LDA """
    lda_res = LDA().fit(train_X, train_y)
    pred_y = lda_res.predict(test_X)
    tp.output_table(pred_y, test_y)

    """ (f) QDA """
    qda_res = QDA().fit(train_X, train_y)
    pred_y = qda_res.predict(test_X)
    tp.output_table(pred_y, test_y)

    """ (g) KNN """
    clf = neighbors.KNeighborsClassifier(1, weights="uniform")
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    tp.output_table(pred_y, test_y)

    """ (h) logistic and LDA """
    """ (i) Is the purpose of the last question going through all methods with no direction? """
def test_rolling_max_how_resample(self):
    indices = [datetime(1975, 1, i) for i in range(1, 6)]
    # So that we can have 3 datapoints on last day (4, 10, and 20)
    indices.append(datetime(1975, 1, 5, 1))
    indices.append(datetime(1975, 1, 5, 2))
    series = Series(list(range(0, 5)) + [10, 20], index=indices)
    # Use floats instead of ints as values
    series = series.map(lambda x: float(x))
    # Sort chronologically
    series = series.sort_index()

    # Default how should be max
    expected = Series([0.0, 1.0, 2.0, 3.0, 20.0],
                      index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
    x = mom.rolling_max(series, window=1, freq='D')
    assert_series_equal(expected, x)

    # Now specify median (10.0)
    expected = Series([0.0, 1.0, 2.0, 3.0, 10.0],
                      index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
    x = mom.rolling_max(series, window=1, freq='D', how='median')
    assert_series_equal(expected, x)

    # Now specify mean (4 + 10 + 20) / 3
    v = (4.0 + 10.0 + 20.0) / 3.0
    expected = Series([0.0, 1.0, 2.0, 3.0, v],
                      index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
    x = mom.rolling_max(series, window=1, freq='D', how='mean')
    assert_series_equal(expected, x)
def logistic_regression(self, use_glm=True):
    """
    (b) It seems the only statistically significant predictor variable is Lag2.
        How disappointing...
    """
    formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
    model = (
        smf.glm(formula, data=self.df, family=sm.families.Binomial())
        if use_glm
        else smf.logit(formula, data=self.transformedDF)
    )
    result = model.fit()
    if use_glm:
        probs = result.fittedvalues
        """Beware: the prob here is the probability of index 0's class, so we
        should use the inverted lambda function below."""
        pred_values = probs.map(lambda x: 0 if x > 0.5 else 1)
    else:
        """The probability of being 1"""
        probs = Series(result.predict(sm.add_constant(
            self.df[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]])))
        pred_values = probs.map(lambda x: 1 if x > 0.5 else 0)
    """
    (c) Percentage of correct predictions: (54+557)/(54+557+48+430) = 56.1%.
        In weeks when the market goes up, the logistic regression is right
        most of the time: 557/(557+48) = 92.1%.
        In weeks when the market goes down, it is right only
        54/(430+54) = 11.2% of the time, i.e. wrong most of the time.
    """
    tp.output_table(pred_values.values, self.transformedDF[self.y_col].values)
def test_map_defaultdict(self):
    s = Series([1, 2, 3], index=['a', 'b', 'c'])
    default_dict = defaultdict(lambda: 'blank')
    default_dict[1] = 'stuff'
    result = s.map(default_dict)
    expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c'])
    assert_series_equal(result, expected)
def extract_bday_feats_n_heads(series, modality, field, stat_type, tr_type):
    """
    "business day or not" conditioning feature extraction
    :return: feature name list, feature value list
    """
    if series is None or len(series) == 0:
        b_day_heads, b_day_feats = extract_basic_feats_n_heads(None, modality, field, stat_type, tr_type)
        nb_day_heads, nb_day_feats = extract_basic_feats_n_heads(None, modality, field, stat_type, tr_type)
    else:
        cal = SouthKorea()
        time_stamp_series = Series(series.index.tolist())
        unique_dates = time_stamp_series.map(lambda x: x.date()).unique()
        nb_day_series = None
        b_day_series = None
        for date in unique_dates:
            if cal.is_holiday(date) is False and date.weekday() < 5:
                if b_day_series is None:
                    b_day_series = series[series.index.date == date]
                else:
                    b_day_series = b_day_series.append(series[series.index.date == date])
            else:
                if nb_day_series is None:
                    nb_day_series = series[series.index.date == date]
                else:
                    nb_day_series = nb_day_series.append(series[series.index.date == date])
        b_day_heads, b_day_feats = extract_basic_feats_n_heads(b_day_series, modality, field, stat_type, tr_type)
        nb_day_heads, nb_day_feats = extract_basic_feats_n_heads(nb_day_series, modality, field, stat_type, tr_type)
    heads = list(map(lambda x: '%s_%s' % (feat.BSS_DAY, x), b_day_heads)) + list(
        map(lambda x: '%s_%s' % (feat.NON_BSS_DAY, x), nb_day_heads))
    values = b_day_feats + nb_day_feats
    return heads, values
def test_map_counter(self):
    s = Series(['a', 'b', 'c'], index=[1, 2, 3])
    counter = Counter()
    counter['b'] = 5
    counter['c'] += 1
    result = s.map(counter)
    expected = Series([0, 5, 1], index=[1, 2, 3])
    assert_series_equal(result, expected)
def test_map_dict_subclass_without_missing(self):
    class DictWithoutMissing(dict):
        pass

    s = Series([1, 2, 3])
    dictionary = DictWithoutMissing({3: 'three'})
    result = s.map(dictionary)
    expected = Series([np.nan, np.nan, 'three'])
    assert_series_equal(result, expected)
def test_parse_dates_combine(self):
    raw_dates = Series(date_range('1/1/2001', periods=10))
    df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
                    'time': raw_dates.map(lambda x: str(x.time()))})

    res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
                         index_col=1)
    newdf = DataFrame({'datetime': raw_dates})
    tm.assert_frame_equal(newdf, res[0])
def test_encode_decode_errors(self):
    encodeBase = Series([u"a", u"b", u"a\x9d"])

    self.assertRaises(UnicodeEncodeError, encodeBase.str.encode, "cp1252")

    f = lambda x: x.encode("cp1252", "ignore")
    result = encodeBase.str.encode("cp1252", "ignore")
    exp = encodeBase.map(f)
    tm.assert_series_equal(result, exp)

    decodeBase = Series(["a", "b", "a\x9d"])

    self.assertRaises(UnicodeDecodeError, decodeBase.str.decode, "cp1252")

    f = lambda x: x.decode("cp1252", "ignore")
    result = decodeBase.str.decode("cp1252", "ignore")
    exp = decodeBase.map(f)
    tm.assert_series_equal(result, exp)
def test_map_int(self):
    left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4})
    right = Series({1: 11, 2: 22, 3: 33})

    assert left.dtype == np.float_
    assert issubclass(right.dtype.type, np.integer)

    merged = left.map(right)
    assert merged.dtype == np.float_
    assert isna(merged['d'])
    assert not isna(merged['c'])
def test_map(self):
    index, data = tm.getMixedTypeDict()

    source = Series(data['B'], index=data['C'])
    target = Series(data['C'][:4], index=data['D'][:4])

    merged = target.map(source)

    for k, v in merged.iteritems():
        self.assertEqual(v, source[target[k]])

    # input could be a dict
    merged = target.map(source.to_dict())

    for k, v in merged.iteritems():
        self.assertEqual(v, source[target[k]])

    # function
    result = self.ts.map(lambda x: x * 2)
    self.assert_(np.array_equal(result, self.ts * 2))
def test_map_int(self):
    left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4})
    right = Series({1: 11, 2: 22, 3: 33})

    self.assert_(left.dtype == np.float_)
    self.assert_(issubclass(right.dtype.type, np.integer))

    merged = left.map(right)
    self.assert_(merged.dtype == np.float_)
    self.assert_(isnull(merged['d']))
    self.assert_(not isnull(merged['c']))
def test_map(self):
    index, data = tm.getMixedTypeDict()

    source = Series(data['B'], index=data['C'])
    target = Series(data['C'][:4], index=data['D'][:4])

    merged = target.map(source)

    for k, v in compat.iteritems(merged):
        assert v == source[target[k]]

    # input could be a dict
    merged = target.map(source.to_dict())

    for k, v in compat.iteritems(merged):
        assert v == source[target[k]]

    # function
    result = self.ts.map(lambda x: x * 2)
    tm.assert_series_equal(result, self.ts * 2)

    # GH 10324
    a = Series([1, 2, 3, 4])
    b = Series(["even", "odd", "even", "odd"], dtype="category")
    c = Series(["even", "odd", "even", "odd"])

    exp = Series(["odd", "even", "odd", np.nan], dtype="category")
    tm.assert_series_equal(a.map(b), exp)
    exp = Series(["odd", "even", "odd", np.nan])
    tm.assert_series_equal(a.map(c), exp)

    a = Series(['a', 'b', 'c', 'd'])
    b = Series([1, 2, 3, 4],
               index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
    c = Series([1, 2, 3, 4], index=Index(['b', 'c', 'd', 'e']))

    exp = Series([np.nan, 1, 2, 3])
    tm.assert_series_equal(a.map(b), exp)
    exp = Series([np.nan, 1, 2, 3])
    tm.assert_series_equal(a.map(c), exp)

    a = Series(['a', 'b', 'c', 'd'])
    b = Series(['B', 'C', 'D', 'E'], dtype='category',
               index=pd.CategoricalIndex(['b', 'c', 'd', 'e']))
    c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e']))

    exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'],
                                categories=['B', 'C', 'D', 'E']))
    tm.assert_series_equal(a.map(b), exp)
    exp = Series([np.nan, 'B', 'C', 'D'])
    tm.assert_series_equal(a.map(c), exp)
def test_encode_decode_errors(self):
    encodeBase = Series([u('a'), u('b'), u('a\x9d')])

    self.assertRaises(UnicodeEncodeError, encodeBase.str.encode, 'cp1252')

    f = lambda x: x.encode('cp1252', 'ignore')
    result = encodeBase.str.encode('cp1252', 'ignore')
    exp = encodeBase.map(f)
    tm.assert_series_equal(result, exp)

    decodeBase = Series([b'a', b'b', b'a\x9d'])

    self.assertRaises(UnicodeDecodeError, decodeBase.str.decode, 'cp1252')

    f = lambda x: x.decode('cp1252', 'ignore')
    result = decodeBase.str.decode('cp1252', 'ignore')
    exp = decodeBase.map(f)
    tm.assert_series_equal(result, exp)
def test_map_dict_subclass_with_missing(self):
    """
    Test Series.map with a dictionary subclass that defines __missing__,
    i.e. sets a default value (GH #15999).
    """
    class DictWithMissing(dict):
        def __missing__(self, key):
            return 'missing'

    s = Series([1, 2, 3])
    dictionary = DictWithMissing({3: 'three'})
    result = s.map(dictionary)
    expected = Series(['missing', 'missing', 'three'])
    assert_series_equal(result, expected)
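A standalone sketch (assuming only pandas) contrasting the two dict-subclass behaviours exercised by this test and test_map_dict_subclass_without_missing above: Series.map consults __missing__ when it is defined, and otherwise leaves unmatched values as NaN.

import pandas as pd

class WithMissing(dict):
    def __missing__(self, key):
        return 'missing'

s = pd.Series([1, 2, 3])
print(s.map(WithMissing({3: 'three'})))  # 'missing', 'missing', 'three'
print(s.map({3: 'three'}))               # NaN, NaN, 'three'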
def test_len(self):
    values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"])

    result = values.str.len()
    exp = values.map(lambda x: len(x) if com.notnull(x) else NA)
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(["a_b", NA, "asdf_cas_asdf", True, datetime.today(),
                    "foo", None, 1, 2.0])
    rs = Series(mixed).str.len()
    xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA])

    self.assert_(isinstance(rs, Series))
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u"foo", u"fooo", u"fooooo", np.nan, u"fooooooo"])

    result = values.str.len()
    exp = values.map(lambda x: len(x) if com.notnull(x) else NA)
    tm.assert_series_equal(result, exp)
def _esd(x, max_outlier, alpha, direction):
    """
    The ESD test using median and MAD in the calculation of the test statistic.
    """
    x = Series(x)
    n = len(x)
    outlier_index = []
    for i in range(1, max_outlier + 1):
        median = x.median()
        mad = np.median([abs(value - median) for value in x]) * _MAD_CONSTANT
        if mad == 0:
            break
        if direction == 'both':
            ares = x.map(lambda value: abs(value - median) / mad)
        elif direction == 'pos':
            ares = x.map(lambda value: (value - median) / mad)
        elif direction == 'neg':
            ares = x.map(lambda value: (median - value) / mad)
        r_idx = ares.idxmax()
        r = ares[r_idx]
        if direction == 'both':
            p = 1.0 - alpha / (2 * (n - i + 1))
        else:
            p = 1.0 - alpha / (n - i + 1)
        crit = t.ppf(p, n - i - 1)
        lam = (n - i) * crit / np.sqrt((n - i - 1 + crit ** 2) * (n - i + 1))
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("%s/%s outlier. median=%s, mad=%s, r_idx=%s, r=%s, crit=%s, lam=%s"
                         % (i, max_outlier, median, mad, r_idx, r, crit, lam))
        if r > lam:
            outlier_index.append(r_idx)
            x = x.drop(r_idx)
        else:
            # r keeps decreasing while lam keeps increasing. Therefore, when r
            # is less than lam for the first time, we can stop.
            break
    return outlier_index
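A hedged usage sketch for _esd above; the data is made up for illustration, and it assumes the module-level _MAD_CONSTANT, logger, and scipy.stats.t that the function relies on are in scope.

# Illustrative call: 8.0 sits far from the rest and should be the lone outlier.
data = [1.0, 1.1, 0.9, 1.0, 1.2, 0.95, 8.0]
print(_esd(data, max_outlier=2, alpha=0.05, direction='both'))  # e.g. [6]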
def test_rolling_max_gh6297(self):
    """Replicate result expected in GH #6297"""
    indices = [datetime(1975, 1, i) for i in range(1, 6)]
    # So that we can have 2 datapoints on one of the days
    indices.append(datetime(1975, 1, 3, 6, 0))
    series = Series(range(1, 7), index=indices)
    # Use floats instead of ints as values
    series = series.map(lambda x: float(x))
    # Sort chronologically
    series = series.sort_index()

    expected = Series([1.0, 2.0, 6.0, 4.0, 5.0],
                      index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
    x = mom.rolling_max(series, window=1, freq="D")
    assert_series_equal(expected, x)
def test_len(self):
    values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo'])

    result = values.str.len()
    exp = values.map(lambda x: len(x) if com.notnull(x) else NA)
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(),
                    'foo', None, 1, 2.])
    rs = Series(mixed).str.len()
    xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA])

    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u('foo'), u('fooo'), u('fooooo'), np.nan, u('fooooooo')])

    result = values.str.len()
    exp = values.map(lambda x: len(x) if com.notnull(x) else NA)
    tm.assert_series_equal(result, exp)
def test_type_promote_putmask():
    # GH8387: test that changing types does not break alignment
    ts = Series(np.random.randn(100), index=np.arange(100, 0, -1)).round(5)
    left, mask = ts.copy(), ts > 0
    right = ts[mask].copy().map(str)
    left[mask] = right
    assert_series_equal(left, ts.map(lambda t: str(t) if t > 0 else t))

    s = Series([0, 1, 2, 0])
    mask = s > 0
    s2 = s[mask].map(str)
    s[mask] = s2
    assert_series_equal(s, Series([0, '1', '2', 0]))

    s = Series([0, 'foo', 'bar', 0])
    mask = Series([False, True, True, False])
    s2 = s[mask]
    s[mask] = s2
    assert_series_equal(s, Series([0, 'foo', 'bar', 0]))
def calcSumDistPerDay(df):
    t = df.index                                      # get time-series index
    t2 = Series(t)                                    # convert to Series
    uniqueDates = t2.map(pd.Timestamp.date).unique()  # get unique dates only

    # For each datetime object in unique dates, convert to strings so we can
    # use them as dataframe indexes
    uniqueList = []
    for date in uniqueDates:
        uniqueList.append(date.strftime('%Y-%m-%d'))

    dateDict = {}  # new dictionary for the dataframe
    for date in uniqueList:
        # Grab matching data, take the sum and place it in the new dictionary
        dateDict[date] = df[date].sum()

    daySumDf = DataFrame(dateDict)  # turn into a dataframe
    return daySumDf
def membership_map(s: pd.Series, groups: dict, fillvalue: Any = -1) -> pd.Series:
    # Reverse & expand the dictionary key-value pairs
    groups = {x: k for k, v in groups.items() for x in v}
    return s.map(groups).fillna(fillvalue)
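A quick usage sketch for membership_map above, with made-up groups; it relies only on the function as defined.

import pandas as pd

s = pd.Series(['apple', 'kale', 'banana', 'tofu'])
groups = {'fruit': ['apple', 'banana'], 'vegetable': ['kale']}
# Each member is mapped to its group label; unknowns get the fill value.
print(membership_map(s, groups, fillvalue='unknown'))
# 0        fruit
# 1    vegetable
# 2        fruit
# 3      unknown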
def compute_conditional_distribution(
    data_col, true_labs, pred_labs, as_categorical=False, binning="fd", common_bins=True
):
    """Compute a distributional summary.

    The metric is computed within unique values of the grouping column
    (categorical) or within bins partitioning its range (continuous).

    Parameters
    ----------
    data_col :
        A column of data from a test dataset.
    true_labs : Series
        A series of true labels for the test dataset.
    pred_labs : Series
        A series of labels predicted by a model for the test dataset.
    as_categorical : bool
        Should the data column be treated as categorical, ie. binned
        on its unique values? If it is not numeric, this param is ignored.
    binning : str
        Binning scheme to use for a numerical column, passed to
        `numpy.histogram`. Can be a fixed number of bins or a string
        indicating a binning scheme.
    common_bins : bool
        Should the bins be computed over the entire column and shared
        across groups (`True`) or computed within each group (`False`)?

    Returns
    -------
    ConditionalDistributionResult
    """
    grouping = [true_labs, pred_labs]
    if is_discrete(data_col):
        as_categorical = True
    if as_categorical:
        grouping.append(data_col)
        distribs = data_col.groupby(grouping).size()
        if common_bins:
            # Extend the index in each label group to include all data values
            data_vals = distribs.index.get_level_values(-1).unique()
            y_vals = distribs.index.droplevel(-1).unique()
            full_ind = MultiIndex.from_tuples(
                [(yt, yp, x) for yt, yp in y_vals.values for x in data_vals],
                names=distribs.index.names,
            )
            distribs = distribs.reindex(index=full_ind, fill_value=0)
            bin_edges = Series(data_vals)
        else:
            # Convert the innermost index level to a Series of bin edges.
            bin_edges = distribs.rename(None).reset_index(level=-1).iloc[:, 0]
    else:
        if common_bins:
            bins = histogram_bin_edges(data_col, bins=binning)
        else:
            bins = binning
        # distribs will be a series with values (<hist_values>, <bin_edges>)
        distribs = data_col.groupby(grouping).apply(lambda x: histogram(x, bins=bins))
        bin_edges = distribs.map(lambda x: x[1])
        bin_ind_tuples = []
        for y in distribs.index:
            bin_ind_tuples.extend(
                [(y[0], y[1], x) for x in _histogram_bin_labels(bin_edges.loc[y])]
            )
        index_with_bins = MultiIndex.from_tuples(
            bin_ind_tuples, names=distribs.index.names + [None]
        )
        distribs = Series(
            distribs.map(lambda x: x[0]).explode().values, index=index_with_bins
        )
        if common_bins:
            # Retain the unique bin edges as an array
            bin_edges = Series(bin_edges.iloc[0])
    return ConditionalDistributionResult(
        vals=distribs,
        bins=Series(bin_edges),
        categorical=as_categorical,
        binning=binning,
        common_bins=common_bins,
    )
def transform(self, X: pd.Series, y=None):
    transed = X.map(self._dict) / X.count()
    return transed
def get_sanitized_bool_series(source: pd.Series) -> pd.Series:
    return source.map(DataframeReport.sanitize_bool, na_action='ignore')
def test_map_na_exclusion(self):
    s = Series([1.5, np.nan, 3, np.nan, 5])

    result = s.map(lambda x: x * 2, na_action='ignore')
    exp = s * 2
    assert_series_equal(result, exp)
def test_map_float_to_string_precision():
    # GH 13228
    ser = Series(1 / 3)
    result = ser.map(lambda val: str(val)).to_dict()
    expected = {0: "0.3333333333333333"}
    assert result == expected
def convert_states(s: pd.Series):
    """Convert df['state'] from abbreviation to full name."""
    return s.map(Helper.STATES)
    'metric': 'auc',
    'num_leaves': 25,
    'learning_rate': 0.01,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'min_data_in_leaf': 5,
    'max_bin': 200,
    'verbose': 0,
}
gbm = lgb.train(params, lgb_train, num_boost_round=200)
predict = gbm.predict(X_test)
minmin = min(predict)
maxmax = max(predict)
predict = Series(predict)
vfunc_lg = predict.map(lambda x: (x - minmin) / (maxmax - minmin))  # rescale LGBM output scores onto [0, 1]
gbm.feature_importance(importance_type='split')  # feature importances
gbm.feature_name()  # feature names

################# XGBoost
params = {
    'booster': 'gbtree',
    'objective': 'rank:pairwise',
    'eval_metric': 'auc',
    'eta': 0.02,
    'max_depth': 5,  # 4 3
    'colsample_bytree': 0.7,  # 0.8
    'subsample': 0.7,
    'min_child_weight': 1,  # 2 3
    'seed': 1111,
def f(series: pd.Series) -> pd.Series:
    return series.map(mapping)
def test_map_datetimetz_na_action():
    values = date_range("2011-01-01", "2011-01-02", freq="H").tz_localize("Asia/Tokyo")
    s = Series(values, name="XX")
    with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN):
        s.map(lambda x: x, na_action="ignore")
def test_map_with_invalid_na_action_raises():
    # https://github.com/pandas-dev/pandas/issues/32815
    s = Series([1, 2, 3])
    msg = "na_action must either be 'ignore' or None"
    with pytest.raises(ValueError, match=msg):
        s.map(lambda x: x, na_action="____")
def color_series(s: pd.Series):
    unique = s.unique()
    colors = sns.color_palette("hls", len(unique)).as_hex()  # type: ignore
    cmap = dict(zip(unique, colors))
    return s.map(cmap)
kirjainnumero = pd.read_table(sys.argv[2])    # the reference file goes here
yleisreferenssi = pd.read_table(sys.argv[3])  # the final reference file

# Next is just munging the datafile and the references into the format we need
# for actually calculating something. First make the ID column of the datafile
# into an actual index rather than a column of data, drop the data-formatted
# ID column from the datafile, and make some other needed changes using the
# functions defined above.
datafile.index = datafile['ID'].values
datafile = datafile.drop('ID', axis=1)
datafile = datafile.applymap(remove_asterisks)

# We replace missing values, denoted by '*X' in the original .csv ('X' after
# stripping asterisks), by filling them from the cell to the left.
datafile = datafile.replace(to_replace='X', value=np.nan)
datafile = datafile.fillna(method='ffill', axis=1)

# Get the relevant part of reference file 1 and make it appropriate for further use.
refsarja = Series(np.array(yleisreferenssi['IMGT/HLA 3.9.0 Allele Name']),
                  index=np.array(yleisreferenssi['Locus']))
refsarja = refsarja.map(remove_names)

# As a munging step we create a series with appropriate indexing to prevent
# looping and difficulties in the future.
kirjainnrosarja = Series(np.array(kirjainnumero['SUBTYPE']),
                         index=np.array(kirjainnumero['CODE']))

# Split the values in the original data.
datafile = datafile.applymap(splitframe)

# After getting the data read and in the appropriate format, we need to take
# care of using the correct loci. We do this by splitting the column names
# (see the requirements for the column names in the original datafile).
def get_loci(dataframe):
    lista = []
    for value in dataframe.columns.values:
        lista.extend(value.split(':')[0])
    return lista

result = pd.DataFrame(index=datafile.index)
for column in datafile.columns:
    lah = []
    for num, value in datafile[column]:
def get_h3_centroids(hex_column: Series) -> List:
    # 'h' avoids shadowing the built-in hex()
    centroid_lat_lon = hex_column.map(lambda h: h3_to_geo(h))
    return [Point(geom[1], geom[0]) for geom in centroid_lat_lon]
def test_encode_decode():
    ser = Series(["a", "b", "a\xe4"]).str.encode("utf-8")
    result = ser.str.decode("utf-8")
    expected = ser.map(lambda x: x.decode("utf-8"))
    tm.assert_series_equal(result, expected)
def test_map_type_inference(self):
    s = Series(lrange(3))
    s2 = s.map(lambda x: np.where(x == 0, 0, 1))
    assert issubclass(s2.dtype.type, np.integer)
def test_map_empty(self, index):
    s = Series(index)
    result = s.map({})

    expected = pd.Series(np.nan, index=s.index)
    tm.assert_series_equal(result, expected)
def __init__(self, y: Series, y_uncleaned: Series):
    super().__init__(y=y, y_uncleaned=y_uncleaned)
    self.label_cleaner_binary = LabelCleanerBinary(y=y.map(self.inv_map))
    self.problem_type_transform = self.label_cleaner_binary.problem_type_transform
def test_map_compat(self):
    # related GH 8024
    s = Series([True, True, False], index=[1, 2, 3])
    result = s.map({True: "foo", False: "bar"})
    expected = Series(["foo", "foo", "bar"], index=[1, 2, 3])
    tm.assert_series_equal(result, expected)
def preprocessing(texts: pd.Series):
    texts = texts.map(basic_preprocessing.expand_contractions)
    texts = texts.map(basic_preprocessing.remove_special_characters)
    print("Basic preprocessing completed on {} reviews.".format(len(texts)))
    return texts
def test_map_compat(self):
    # related GH 8024
    s = Series([True, True, False], index=[1, 2, 3])
    result = s.map({True: 'foo', False: 'bar'})
    expected = Series(['foo', 'foo', 'bar'], index=[1, 2, 3])
    assert_series_equal(result, expected)
def transform(self, s: pd.Series):
    result = s.map(self.mapper).fillna(0)  # values unseen in the training set are filled with 0
    return result
def fix_na(col: pd.Series) -> pd.Series:
    return col.map(_fix_na)
def fix_sex(sex_col: pd.Series) -> pd.Series:
    """Fixes various ways of spelling male/female."""
    return sex_col.map(_fix_sex)
def column_tokenizer(s: pd.Series):
    return s.map(self.tokenization_fn)
def _encode(self, feature: CalendarFeature, timeseries: pd.Series):
    """
    Numerically encode a specific feature given a datetime[ns] timeseries.

    :param feature: Feature to calculate (e.g. year, month, weekday, ...)
    :type feature: CalendarFeature
    :param timeseries: Datetime[ns] timeseries as a pandas Series (fast and easy map method)
    :type timeseries: pd.Series
    """
    if feature == CalendarFeature.year:
        return timeseries.map(lambda element: element.year)
    elif feature == CalendarFeature.month:
        return timeseries.map(lambda element: element.month - 1)
    elif feature == CalendarFeature.day:
        return timeseries.map(lambda element: element.day - 1)
    elif feature == CalendarFeature.weekday:
        return timeseries.map(lambda element: element.weekday())
    elif feature == CalendarFeature.hour:
        return timeseries.map(lambda element: element.hour)
    elif feature == CalendarFeature.weekend:
        return timeseries.map(lambda element: (element.weekday() >= 5) * 1)
    elif feature == CalendarFeature.workday:
        return timeseries.map(lambda element: (self.calendar.is_working_day(element)) * 1)
    elif feature == CalendarFeature.holiday:
        return timeseries.map(lambda element: (self.calendar.is_holiday(element)) * 1)
    elif feature == CalendarFeature.monday:
        return timeseries.map(lambda element: element.weekday() == 0)
    elif feature == CalendarFeature.tuesday:
        return timeseries.map(lambda element: element.weekday() == 1)
    elif feature == CalendarFeature.wednesday:
        return timeseries.map(lambda element: element.weekday() == 2)
    elif feature == CalendarFeature.thursday:
        return timeseries.map(lambda element: element.weekday() == 3)
    elif feature == CalendarFeature.friday:
        return timeseries.map(lambda element: element.weekday() == 4)
    elif feature == CalendarFeature.saturday:
        return timeseries.map(lambda element: element.weekday() == 5)
    elif feature == CalendarFeature.sunday:
        return timeseries.map(lambda element: element.weekday() == 6)
    elif feature == CalendarFeature.month_sine:
        return timeseries.map(lambda element: np.sin(np.pi * 2 * (element.month - 1) / 11))
    elif feature == CalendarFeature.day_sine:
        return timeseries.map(lambda element: np.sin(np.pi * 2 * (element.day - 1) / element.days_in_month))
    elif feature == CalendarFeature.weekday_sine:
        return timeseries.map(lambda element: np.sin(np.pi * 2 * element.weekday() / 6))
    elif feature == CalendarFeature.hour_sine:
        return timeseries.map(lambda element: np.sin(np.pi * 2 * element.hour / 23))
    elif feature == CalendarFeature.month_cos:
        return timeseries.map(lambda element: np.cos(np.pi * 2 * (element.month - 1) / 11))
    elif feature == CalendarFeature.day_cos:
        return timeseries.map(lambda element: np.cos(np.pi * 2 * (element.day - 1) / element.days_in_month))
    elif feature == CalendarFeature.weekday_cos:
        return timeseries.map(lambda element: np.cos(np.pi * 2 * element.weekday() / 6))
    elif feature == CalendarFeature.hour_cos:
        return timeseries.map(lambda element: np.cos(np.pi * 2 * element.hour / 23))
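For intuition, a minimal standalone sketch (assuming only pandas and numpy, not the class above) of the month_sine-style cyclic encoding used in _encode, applied to a raw datetime Series:

import numpy as np
import pandas as pd

ts = pd.Series(pd.date_range('2021-01-01', periods=12, freq='MS'))
month_sine = ts.map(lambda element: np.sin(np.pi * 2 * (element.month - 1) / 11))
# Note: with the /11 normalization both January and December map to ~0.0.
print(month_sine.round(3).tolist())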
def _inverse_transform(self, y: Series) -> Series:
    y = y.map(self.cat_mappings_dependent_var)
    return y
def is_mixed_type(ser: pd.Series) -> bool:
    """Determines whether the column has mixed types in it."""
    return ser.map(lambda x: type(x)).nunique() > 1 if ser.dtype == object else False
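A quick hedged check of is_mixed_type above (illustrative data only): only object-dtype columns can report mixed types, since homogeneous numeric columns never reach the map.

import pandas as pd

assert is_mixed_type(pd.Series([1, 'a']))        # int and str in an object column
assert not is_mixed_type(pd.Series([1.0, 2.0]))  # homogeneous float column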
def _transform(self, y: Series) -> Series:
    y = y.map(self.inv_map)
    return y
def collection_language_model(totals: Series):
    doc_len = totals['doc_length']
    return totals.map(lambda x: x / doc_len)
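A hedged example of the function above with a made-up totals Series; note the 'doc_length' entry itself is also divided, yielding 1.0.

import pandas as pd

totals = pd.Series({'the': 5, 'cat': 2, 'doc_length': 10})
print(collection_language_model(totals))  # the: 0.5, cat: 0.2, doc_length: 1.0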
def test_map_missing_mixed(vals, mapping, exp):
    # GH20495
    s = Series(vals + [np.nan])
    result = s.map(mapping)

    tm.assert_series_equal(result, Series(exp))