def __iter_snp(snp: pd.Series, phenotype: np.ndarray) -> pd.Series:
    """
    For a given SNP, encode the major allele as 0 and the minor allele as 1.
    Perform ordinary least squares linear regression on Phenotype ~ SNP Data.
    Put the calculated P and R-Squared_adjusted values into the output dataframe.

    Relies on module-level `snps_to_drop`, `genotypes`, `gwas_output_df`,
    and `sm` (statsmodels.api).

    Parameters
    ----------
    snp : pd.Series
        pandas Series of SNP row from processed genotype dataframe.
    phenotype : np.ndarray
        numpy ndarray of phenotype values from processed phenotype dataframe.
    """
    snp_idx = snp.name
    # Encode
    alleles = snp.unique()
    allele_dict = {}
    if len(alleles) == 1:
        # Monomorphic SNP: nothing to regress on, mark it for removal.
        snps_to_drop.append(snp_idx)
        return
    else:
        indv = len(snp)
        a0 = (snp == alleles[0]).sum()
        a1 = indv - a0
        if a0 > a1:
            allele_dict = {alleles[0]: 0, alleles[1]: 1}
        else:
            allele_dict = {alleles[1]: 0, alleles[0]: 1}
        snp.replace(allele_dict, inplace=True)
        # Assign back: replace() on a .loc slice returns a copy and would not
        # modify the genotypes frame.
        genotypes.loc[snp_idx] = genotypes.loc[snp_idx].replace(allele_dict)
        model = sm.OLS(phenotype, snp.values)
        results = model.fit()
        # Single-regressor model: store the scalar p-value, not the whole array.
        gwas_output_df.iat[snp_idx, 1] = results.pvalues[0]
        gwas_output_df.iat[snp_idx, 2] = results.rsquared_adj
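# A minimal, self-contained sketch of just the allele-encoding step above
# (the GWAS globals and the statsmodels call are assumed elsewhere): the
# more frequent allele maps to 0, the rarer one to 1.
import pandas as pd

snp = pd.Series(["A", "G", "A", "A", "G"], name="rs0001")
counts = snp.value_counts()                      # sorted by descending frequency
major, minor = counts.index[0], counts.index[-1]
encoded = snp.replace({major: 0, minor: 1})
print(encoded.tolist())  # [0, 1, 0, 0, 1]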
def _to_label_features_with_min_cut(
        self,
        feature_col: pd.Series,
        feature_name: str,
        category_min_cnt: int = None,
        category_min_rate: float = None,
        aggregation_value='others') -> pd.DataFrame:
    feature_col: pd.Series = feature_col.astype('object')
    if category_min_cnt is not None or category_min_rate is not None:
        category_cnt_series: pd.Series = feature_col.value_counts()
        sum_cnt = len(feature_col.values)
        # Collapse rare categories (by count or by rate) into the aggregation value.
        for category_val, cnt in zip(category_cnt_series.index,
                                     category_cnt_series.values):
            if (category_min_cnt is not None and cnt < category_min_cnt) \
                    or (category_min_rate is not None
                        and float(cnt) / float(sum_cnt) < category_min_rate):
                feature_col.replace(category_val, aggregation_value,
                                    inplace=True)
    feature_col: pd.Series = feature_col.astype('category')
    feature_df = feature_col.to_frame(name=feature_name)
    dummy_df: pd.DataFrame = pd.get_dummies(feature_df, drop_first=False)
    self._feature_columns = dummy_df.columns
    return dummy_df
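# A standalone sketch of the same rare-category cut, outside the class:
# categories seen fewer than twice collapse to "others" before dummy encoding.
import pandas as pd

col = pd.Series(["a", "a", "a", "b", "c"])
counts = col.value_counts()
rare = counts[counts < 2].index                  # here: "b" and "c"
col = col.replace(dict.fromkeys(rare, "others"))
dummies = pd.get_dummies(col.astype("category"), prefix="feat")
print(dummies.columns.tolist())  # ['feat_a', 'feat_others']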
def convert_postcode_toint(serie: pd.Series, str_postcode: list):
    """
    Convert string postcodes into integers.

    Non-numeric postcodes listed in ``str_postcode`` are mapped to 0.
    """
    serie.replace({i: None for i in str_postcode}, inplace=True)
    serie.fillna(0, inplace=True)
    serie = serie.astype(int)
    return serie
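# Hypothetical usage: "XX" and "N/A" stand in for whatever non-numeric
# postcode sentinels the caller wants zeroed out.
import pandas as pd

codes = pd.Series(["75001", "XX", "13002", "N/A"])
print(convert_postcode_toint(codes, ["XX", "N/A"]).tolist())
# [75001, 0, 13002, 0]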
def cat2numeric(col: pd.Series) -> None:
    """Convert categorical column values to numeric values using sklearn's LabelEncoder."""
    le = preprocessing.LabelEncoder()
    num_values = le.fit_transform(col.values)
    # Map each original value to its encoded counterpart, position by position.
    col.replace(col.values, num_values, inplace=True)
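# Hedged usage sketch; assumes `from sklearn import preprocessing` is in
# scope for cat2numeric. LabelEncoder assigns codes in sorted label order.
import pandas as pd
from sklearn import preprocessing

colors = pd.Series(["red", "blue", "red", "green"])
cat2numeric(colors)
print(colors.tolist())  # [2, 0, 2, 1]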
def impute_series_zeros_with_half_min(s: pd.Series):
    # Find the minimal non-zero abundance in the sample
    min_abundance = s.replace(0, np.nan).dropna().min()
    # Replace all zeros with half of that minimum abundance
    sample = s.replace(0, min_abundance / 2)
    return sample
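# Usage sketch with a toy abundance vector: the smallest non-zero value is
# 2.0, so zeros become 1.0.
import numpy as np
import pandas as pd

s = pd.Series([0.0, 4.0, 2.0, 0.0])
print(impute_series_zeros_with_half_min(s).tolist())  # [1.0, 4.0, 2.0, 1.0]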
def pandas_data_astype_part3():
    # Replacing values
    data = Series([1, -999, 2, -999, -1000, 3])
    print(data.replace(to_replace=-999, value=np.nan))                # replace -999 with np.nan
    print(data.replace(to_replace=[-999, -1000], value=np.nan))       # a list replaces several values at once
    print(data.replace(to_replace=[-999, -1000], value=[np.nan, 0]))  # a different replacement for each value
    print(data.replace(to_replace={-999: np.nan, -1000: 0}))          # the same, expressed as a dict
def one_hot_encode(series: pd.Series,
                   ais_col_name: str) -> Tuple[pd.DataFrame, OneHotEncoder]:
    series_categories = series.unique()  # in order of appearance
    ais_col_name = ais_col_name.lower()
    default_alias = ["Default", "Unknown", "Undefined"]
    if ais_col_name in categorical_values:
        categories = categorical_values[ais_col_name] + default_alias
        nan_idx = -1
        # Prevent typing errors by replacing NaN with the custom marker "default"
        for idx, cat in enumerate(series_categories):
            if type(cat) is float:
                nan_idx = idx
        if nan_idx > -1:
            series_categories[nan_idx] = "default"
        for s_cat in series_categories:
            i = 0
            found = False
            while i < len(categories) and not found:
                cat = categories[i]
                if cat.lower() in s_cat.lower():
                    if cat in default_alias:
                        series = series.replace(s_cat, f"Default {ais_col_name}")
                    else:
                        series = series.replace(s_cat, cat)
                    found = True
                i += 1
            if not found or s_cat.lower() == "other":
                series = series.replace(s_cat, f"Other {ais_col_name}")
        desired_categories = categorical_values[ais_col_name] + [
            f"Other {ais_col_name}", f"Default {ais_col_name}"
        ]
        series = series.values.reshape(-1, 1)  # shape as column
        encoder = OneHotEncoder(sparse=False, categories=[desired_categories])
        encoder.fit(series)
        data_ohe = encoder.transform(series)
        df_ohe = pd.DataFrame(data_ohe, columns=[
            desired_categories[i] for i in range(len(desired_categories))
        ])
        return df_ohe, encoder
    else:
        raise ValueError(
            f"Error while one hot encoding: Cannot find column {ais_col_name} "
            f"in {categorical_values.keys()}")
def _split_T_in_col(nfl: pd.DataFrame, column: pd.Series):
    # Numeric part: strip the trailing 'T' marker.
    x = column.replace('T', '', regex=True)
    # Marker part: strip the leading (possibly negative) number.
    y = column.replace(r'^-?\d+', '', regex=True)
    # Entries without a 'T' become 1 ...
    y.where(y == 'T', other=1, inplace=True)
    # ... and entries with a 'T' become 0.
    y.where(y != 'T', other=0, inplace=True)
    x.name = column.name
    y.name = column.name + "_T"
    nfl[x.name] = x
    nfl[y.name] = y
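# Toy usage, assuming NFL-style strings where a trailing "T" flags a tie.
import pandas as pd

nfl = pd.DataFrame()
col = pd.Series(["12", "7T", "3"], name="streak")
_split_T_in_col(nfl, col)
print(nfl["streak"].tolist())    # ['12', '7', '3']
print(nfl["streak_T"].tolist())  # [1, 0, 1] -- the 0 marks the tie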
def __work(se: pd.Series, except_strings: List[str]):
    # Relies on module-level `rep_nan`, `strtmp`, `n_round`, and the
    # is_*_str_regex helpers.
    if is_integer_str_regex(
            se, except_strings=except_strings).sum() == se.shape[0]:
        # Every entry looks like an integer string: round-trip through int64.
        return se.replace(rep_nan, strtmp, inplace=False).astype(
            np.float64).astype(np.int64).astype(str).replace(
                strtmp, rep_nan)
    elif is_float_str_regex(
            se, except_strings=except_strings).sum() == se.shape[0]:
        # Every entry looks like a float string: round and re-stringify.
        return se.replace(rep_nan, np.nan, inplace=False).astype(
            np.float32).round(n_round).astype(str).replace(
                str(np.nan), rep_nan)
    else:
        return se
def vidya(close, length=None, drift=None, offset=None, **kwargs):
    """Indicator: Variable Index Dynamic Average (VIDYA)"""
    # Validate Arguments
    length = int(length) if length and length > 0 else 14
    close = verify_series(close, length)
    drift = get_drift(drift)
    offset = get_offset(offset)

    if close is None:
        return

    def _cmo(source: Series, n: int, d: int):
        """Chande Momentum Oscillator (CMO) Patch

        For some reason:
            from pandas_ta.momentum import cmo
        causes pandas_ta.momentum.coppock to not be able to import its wma, like
            from pandas_ta.overlap import wma
        Weird circular TypeError!?!
        """
        mom = source.diff(d)
        positive = mom.copy().clip(lower=0)
        negative = mom.copy().clip(upper=0).abs()
        pos_sum = positive.rolling(n).sum()
        neg_sum = negative.rolling(n).sum()
        return (pos_sum - neg_sum) / (pos_sum + neg_sum)

    # Calculate Result
    m = close.size
    alpha = 2 / (length + 1)
    abs_cmo = _cmo(close, length, drift).abs()
    vidya = Series(0, index=close.index)
    for i in range(length, m):
        vidya.iloc[i] = alpha * abs_cmo.iloc[i] * close.iloc[i] \
            + vidya.iloc[i - 1] * (1 - alpha * abs_cmo.iloc[i])
    vidya.replace({0: npNaN}, inplace=True)

    # Offset
    if offset != 0:
        vidya = vidya.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        vidya.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        vidya.fillna(method=kwargs["fill_method"], inplace=True)

    # Name & Category
    vidya.name = f"VIDYA_{length}"
    vidya.category = "overlap"

    return vidya
def _to_label_features(self, feature_col: pd.Series) -> pd.DataFrame:
    feature_col: pd.Series = feature_col.astype('category')
    uq_vals = feature_col.cat.categories
    feature_col: pd.Series = feature_col.astype('object')
    # Collapse categories not seen at fit time into the aggregation value so
    # the dummy columns line up with the training columns.
    for category_val in uq_vals:
        if self._feature_name + '_' + category_val not in self._feature_columns.values:
            feature_col.replace(category_val, self._aggregation_value,
                                inplace=True)
    feature_col: pd.Series = feature_col.astype('category')
    feature_df = feature_col.to_frame(name=self._feature_name)
    return pd.get_dummies(feature_df, drop_first=False)
def rank(data: pd.Series) -> pd.Series:
    """Internal function used to rank ordinal and nominal data."""
    unique = pd.Series(data.unique())
    ranked = unique.rank()
    lookup = pd.concat([unique, ranked], axis=1)
    lookup_series = pd.Series(lookup.iloc[:, 1].values, index=lookup.iloc[:, 0])
    return data.replace(lookup_series.to_dict()).astype(float)
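# Usage sketch: nominal string data gets numeric ranks. Note the ranks
# follow the sort order of the unique values, not their order of appearance.
import pandas as pd

grades = pd.Series(["low", "high", "low", "mid"])
print(rank(grades).tolist())  # [2.0, 1.0, 2.0, 3.0]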
def find_best_similar(serie: pd.Series, dict_ref: dict,
                      similarity: Callable, applying=True):
    """
    Find, for each value in a pandas Series, the most similar string from a
    dictionary referential. This function uses a similarity algorithm
    (for example Jaro-Winkler or Gestalt pattern matching).

    Return
    ------
    serie : if applying is True, return the Series with the most similar
        computed values
    closest_value : if applying is False, return the dictionary of the best
        match for each value
    """
    all_values = set(serie.values)
    all_values.discard(None)  # discard, not remove: None may be absent
    closest_value = {}
    for data in all_values:
        best_ratio = 0
        value_close = None
        for value in dict_ref.values():
            ratio = similarity(value.lower(), data)
            if ratio >= best_ratio:
                best_ratio = ratio
                value_close = value
        closest_value[data] = value_close
    if applying:
        return serie.replace(closest_value)
    else:
        return closest_value
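# Usage sketch with difflib's Gestalt pattern matching as the similarity
# callable (any float-returning similarity function would do).
import difflib
import pandas as pd

def gestalt(a: str, b: str) -> float:
    return difflib.SequenceMatcher(None, a, b).ratio()

cities = pd.Series(["pariss", "lyyon"])
ref = {1: "Paris", 2: "Lyon"}
print(find_best_similar(cities, ref, gestalt).tolist())  # ['Paris', 'Lyon']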
def create_bow(se: pd.Series,
               preproc: List[tuple] = [(r"\s+", "")],
               nlp=spacy.load('ja_ginza'),
               list_pos: List[str] = ["NOUN", "ADJ", "VERB", "PROPN", "PRON"],
               vocab=None) -> (pd.DataFrame, dict):
    se = se.copy()
    for x, y in preproc:
        se = se.replace(x, y, regex=True)
    se = se.str.strip()
    if vocab is None:
        vocab = {}
        for text in se.values:
            if text != "":
                for token in nlp(text):
                    if token.pos_ in list_pos and not (
                            (token.pos_, token.string) in vocab):
                        vocab[(token.pos_, token.string)] = len(vocab)

    def bag_of_words(text: str, vocab: dict):
        ndf = np.zeros(len(vocab)).astype(int)
        if text.strip() != "":
            for token in nlp(text):
                wk = vocab.get((token.pos_, token.string))
                if wk is not None:
                    ndf[wk] += 1
        return ndf

    se = se.apply(lambda x: bag_of_words(x, vocab))
    ndf = np.concatenate([x.reshape(1, -1) for x in se.tolist()], axis=0)
    df = pd.DataFrame(ndf, columns=[f"vocab_{i}" for i in range(ndf.shape[1])])
    return df, vocab
def _from_dataframe(row: Series, na: Union[Any, List[Any]], nesting: str) -> Resource:
    new_na = row.replace(na, np.nan)
    no_na = new_na.dropna()
    items = list(no_na.items())
    data = deflatten(items, nesting)
    return from_json(data, None)
def train_vw(X: pd.DataFrame, y: pd.Series, config: Config):
    cache_file = config.tmp_dir + "/.vw_cache"
    data_file = config.tmp_dir + "/vw_data_train.csv"

    cmd = " ".join([
        "rm -f {cache} && vw",
        "-f {f}",
        "--cache_file {cache}",
        "--passes {passes}",
        "-l {l}",
        "--early_terminate {early_terminate}",
        "{df}"
    ]).format(
        cache=cache_file,
        df=data_file,
        f=config.model_dir + "/vw.model",
        passes=max(20, int(1000000 / len(X))),
        l=25,
        early_terminate=1,
    )

    if config["mode"] == "classification":
        cmd += " --loss_function logistic --link logistic"
        # VW's logistic loss expects labels in {-1, 1}
        y = y.replace({0: -1})

    save_to_vw(data_file, X, y)
    subprocess.Popen(cmd, shell=True).communicate()
def make_features(self,
                  answers: Series = None,
                  normalize: Union[Dict[float, float], bool] = True,
                  norm_min: float = 0.0, norm_max: float = 1.0,
                  drop_na: bool = True) -> Series:
    """
    Create a Series of features for use in ML.

    :param answers: Answers to the Question from a Survey.
    :param normalize: Option to normalize the data by min-max normalization
                      or map using a dictionary.
    :param norm_min: Value to use for lower bound of normalization range.
    :param norm_max: Value to use for upper bound of normalization range.
    :param drop_na: Whether to drop null responses from returned features.
    """
    if answers is None:
        answers = self._data
    if drop_na:
        answers = answers.dropna()
    features = Series(data=answers.replace(self._categories),
                      index=answers.index)
    if type(normalize) is bool:
        if normalize:
            min_cat_val = min(self.categories.values())
            max_cat_val = max(self.categories.values())
            return (norm_min + (features - min_cat_val) *
                    (norm_max - norm_min) / (max_cat_val - min_cat_val))
        else:
            return features
    elif isinstance(normalize, dict):
        return features.map(normalize)
    else:
        raise TypeError('normalize needs to be either bool or a dictionary')
def complex_series_maths(ser_01: pd.Series, ser_02: pd.Series,
                         function: str) -> pd.core.series.Series:
    """Write some math helper functions for series.

    Take the two given series, perform the required operation and return the
    new series. For example, given the series:

        0    0
        1    1
        2    2
        dtype: int64

    and the series:

        0    2
        1    3
        2    4
        dtype: int64

    if the function given is 'add' you should return:

        0    2
        1    4
        2    6
        dtype: int64

    :param ser_01: Primary series to perform operation on
    :param ser_02: Secondary series to perform operation on
    :param function: The operation to perform

    Note: For this function always add ser_02 to ser_01, subtract ser_02 from
    ser_01, multiply ser_01 by ser_02, divide ser_01 by ser_02. Don't worry
    about None, NaN and divide-by-zero; let pandas do the work for you.
    """
    ser_01 = ser_01.replace('nan', np.nan)
    ser_02 = ser_02.replace('nan', np.nan)
    if function == 'add':
        return ser_01 + ser_02
    elif function == 'sub':
        # Subtract ser_02 from ser_01, per the note above.
        return ser_01 - ser_02
    elif function == 'mul':
        return ser_01 * ser_02
    elif function == 'div':
        return ser_01 / ser_02
    else:
        return 0
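# Quick check of the 'sub' branch: ser_02 is subtracted from ser_01.
import numpy as np
import pandas as pd

a = pd.Series([5, 7, 9])
b = pd.Series([1, 2, 3])
print(complex_series_maths(a, b, 'sub').tolist())  # [4, 5, 6]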
def slide_12():
    data = Series([1., -999., 2., -999., -1000., 3.])
    print(data)
    print(data.replace(-999, np.nan))
    print(data.replace([-999, -1000], np.nan))
    print(data.replace([-999, -1000], [np.nan, 0]))
    print(data.replace({-999: np.nan, -1000: 0}))
def restore(self, col: pd.Series) -> pd.Series:
    """Restore a categorical column from its codes when converting to pandas."""
    return pd.Series(
        pd.Categorical.from_codes(
            col.replace(np.nan, -1).astype(int),
            categories=cast(CategoricalDtype, self.dtype).categories,
            ordered=cast(CategoricalDtype, self.dtype).ordered,
        ))
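# Standalone sketch of the codes-to-categorical round trip: NaN codes become
# -1, which from_codes interprets as missing.
import numpy as np
import pandas as pd

codes = pd.Series([0.0, np.nan, 1.0])
cat = pd.Categorical.from_codes(
    codes.replace(np.nan, -1).astype(int),
    categories=["red", "blue"],
)
print(list(cat))  # ['red', nan, 'blue']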
def calculate_age(date: pd.Series, year: int):
    """
    Calculate ages from a pandas Series of dates of birth, relative to ``year``.
    """
    date.fillna(0, inplace=True)
    # Keep only the 4-digit year from YYYYMMDD-style values.
    date = date.apply(lambda x: int(str(int(x))[:4]))
    # Restore missing dates (encoded as 0) to missing values.
    date = date.replace({0: None})
    return year - date
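# Usage sketch, assuming YYYYMMDD-encoded floats with NaN for missing dates
# of birth; the missing entry comes back as a missing age.
import numpy as np
import pandas as pd

dob = pd.Series([19850212.0, np.nan, 19900101.0])
print(calculate_age(dob, 2020).tolist())  # roughly [35.0, nan, 30.0]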
def test_replace_series_no_regex(self):
    answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral',
                     3: 'Disagree', 4: 'Strongly Disagree'})
    weights = Series({'Agree': 4, 'Disagree': 2, 'Neutral': 3,
                      'Strongly Agree': 5, 'Strongly Disagree': 1})
    expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1})
    result = answer.replace(weights)
    assert_series_equal(result, expected)
def describe_1d(series: pd.Series) -> dict:
    """Describe a series (infer the variable type, then calculate type-specific values).

    Args:
        series: The Series to describe.

    Returns:
        A dict containing calculated series description values.
    """
    # Replace infinite values with NaNs to avoid issues with histograms later.
    series.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

    # Infer variable types
    series_description = base.get_var_type(series)

    # Run type-specific analysis
    if series_description["type"] == Variable.S_TYPE_UNSUPPORTED:
        series_description.update(
            describe_unsupported(series, series_description))
    else:
        series_description.update(
            describe_supported(series, series_description))

        type_to_func = {
            Variable.S_TYPE_CONST: describe_constant_1d,
            Variable.TYPE_BOOL: describe_boolean_1d,
            Variable.TYPE_NUM: describe_numeric_1d,
            Variable.TYPE_DATE: describe_date_1d,
            Variable.S_TYPE_UNIQUE: describe_unique_1d,
            Variable.TYPE_CAT: describe_categorical_1d,
            Variable.TYPE_URL: describe_url_1d,
            Variable.TYPE_PATH: describe_path_1d,
        }

        if series_description["type"] in type_to_func:
            series_description.update(type_to_func[series_description["type"]](
                series, series_description))
        else:
            raise ValueError("Unexpected type")

    # Return the description obtained
    return series_description
def create_site_specific_splits(
    site: pd.Series,
    proportions: Dict[str, int],
    random_state: Optional[Union[int, np.random.mtrand.RandomState]] = 989,
):
    """Splits sites into distinct groups whose sizes roughly match the given
    proportions. Null sites are randomly assigned to groups using the provided
    proportions.

    Args:
        site (pd.Series): A series of sites, one element per observation.
        proportions (dict): A dict whose keys are the resulting groups and whose
            values are the rough proportion of data in each group.
        random_state (int): Seed for the random split of null sites.

    Example:
        Split data into groups where each site is in one and only one group,
        with roughly 50-25-25 train-val-holdout proportions.

        >>> create_site_specific_splits(site, proportions={"train": 2, "val": 1, "holdout": 1})

    Returns:
        pd.Series: A series containing the resulting split, one element per
        observation.
    """
    assignments = {}
    sites = site.value_counts(dropna=True).sort_values(ascending=False).index
    n_subgroups = sum(proportions.values())
    for i, subset in enumerate(
            roundrobin(*([subset] * proportions[subset] for subset in proportions))):
        for group in sites[i::n_subgroups]:
            assignments[group] = subset

    # Divide null sites among the groups
    null_sites = site.isnull()
    if null_sites.sum() > 0:
        logger.debug(
            f"{null_sites.sum():,} null sites randomly assigned to groups.")
        null_groups = []
        for group, group_proportion in proportions.items():
            null_group = f"{group}-{uuid4()}"
            null_groups.append(null_group)
            assignments[null_group] = group

        rng = (np.random.RandomState(random_state) if isinstance(
            random_state, int) else random_state)
        site = site.copy()
        site.loc[null_sites] = rng.choice(
            null_groups,
            p=np.asarray(list(proportions.values())) / sum(proportions.values()),
            size=null_sites.sum(),
            replace=True,
        )

    return site.replace(assignments)
def target_binned_price_variation_kmeans(pct_var: pd.Series, **kwargs):
    values = pct_var.replace([np.inf, -np.inf], np.nan).fillna(method='ffill').values
    values = np.reshape(values, (-1, 1))
    discretizer = KBinsDiscretizer(n_bins=kwargs.get('n_bins', 3),
                                   strategy='kmeans', encode='ordinal')
    discrete = discretizer.fit_transform(values)
    return pd.Series(np.reshape(discrete, (-1,)), index=pct_var.index)
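# Usage sketch on synthetic percent variations with three obvious clusters;
# assumes sklearn's KBinsDiscretizer is imported in the module above.
import numpy as np
import pandas as pd

pct = pd.Series(np.concatenate([np.random.normal(-0.05, 0.01, 50),
                                np.random.normal(0.00, 0.01, 50),
                                np.random.normal(0.05, 0.01, 50)]))
bins = target_binned_price_variation_kmeans(pct, n_bins=3)
print(bins.value_counts())  # roughly 50 observations per k-means bin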
def autocast_series_dtype(series: pd.Series) -> pd.Series:
    """Cast any sane Series to str/category[str]/number/datetime.

    This is appropriate when parsing CSV data or Excel data. It _seems_
    appropriate when a search-and-replace produces numeric columns like
    '$1.32' => '1.32' ... but perhaps that's only appropriate in
    very-specific cases.

    The input must be "sane": if the dtype is object or category, we assume
    _every value_ is str (or null).

    If the series is all-null, do nothing.

    Avoid spurious calls to this function: it's expensive.

    TODO handle dates.
    """
    if series.dtype == bool:
        # Handle Excel formula: '=TRUE'
        #
        # We capitalize Pythonic, because A) sometimes the formula is in Python;
        # and B) the `series.astype(str)` below is hard to customize
        return series.replace({True: "True", False: "False"})
    elif series.dtype == object:
        nulls = series.isnull()
        if (nulls | (series == "")).all():
            return series
        try:
            # If it all looks like numbers (like in a CSV), cast to number.
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            # Otherwise, we want all-string. Is that what we already have?
            #
            # Handles Excel formula: =IF(A1=1, 3, "Hi")
            array = series[~nulls].array
            if any(type(x) != str for x in array):
                series = series.astype(str)
                series[nulls] = None
            return series
    elif hasattr(series, "cat"):
        # Categorical series. Try to infer the type of the series.
        #
        # Assume categories are all str: after all, we're assuming the input is
        # "sane" and "sane" means only str categories are valid.
        if (series.isnull() | (series == "")).all():
            return series
        try:
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            # We don't cast categories to str here -- because we have no
            # callers that would create categories that aren't all-str. If we
            # ever do, this is where we should do the casting.
            return series
    else:
        assert is_numeric_dtype(series) or is_datetime64_dtype(series)
        return series
def safe_replace_series(pds: Series,
                        values: Dict,
                        strip: bool = True,
                        lower: bool = False,
                        inplace=False) -> Optional[Series]:
    if not inplace:
        pds = pds.copy()
    if strip and pds.dtype == "object":
        str_idx = pds.map(lambda x: isinstance(x, str))
        pds.loc[str_idx] = pds.loc[str_idx].str.strip()
    if lower and pds.dtype == "object":
        str_idx = pds.map(lambda x: isinstance(x, str))
        pds.loc[str_idx] = pds.loc[str_idx].str.lower()
        values = {k.lower(): v for k, v in values.items()}
    pds.replace(values, inplace=True)
    assert_values(pds, values.values())
    if not inplace:
        return pds
    return None
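# Usage sketch; the `assert_values` helper above is assumed to validate that
# every resulting value is one of the mapped values.
import pandas as pd

s = pd.Series(["  Yes", "no ", "YES"])
mapped = safe_replace_series(s, {"yes": 1, "no": 0}, strip=True, lower=True)
print(mapped.tolist())  # [1, 0, 1]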
def check_fillna(self, serie: pd.Series, method: str = '', value: int = 0):
    """Replace +/-inf with NaN, then fill NaNs: backfill if a method is
    given, otherwise fill with ``value``."""
    if self._fillna:
        serie_output = serie.replace([np.inf, -np.inf], np.nan)
        serie_output = serie_output.fillna(method='backfill') if method \
            else serie_output.fillna(value)
        return serie_output
    else:
        return serie
def tract(series: pd.Series, initial: str, latest: str):
    """
    The county & tract GEOID update function.

    :param series: Series of GEOID strings.
    :param initial: GEOID prefix to replace.
    :param latest: Replacement prefix.
    """
    return series.replace(to_replace='^({})'.format(initial), value=latest,
                          regex=True)
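# Usage sketch: update the 5-digit county prefix of tract GEOIDs, e.g. the
# 2015 Shannon -> Oglala Lakota County, SD renumbering (46113 -> 46102).
import pandas as pd

geoids = pd.Series(["46113940500", "46113940800"])
print(tract(geoids, "46113", "46102").tolist())
# ['46102940500', '46102940800']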
def __compensate_original_column(self, original, columnnum):
    """Pad the original matrix out to the required column width.

    :original: the original matrix data
    :columnnum: the number of columns to pad
    """
    rownum = np.shape(original)[0]
    for i in range(columnnum):
        s = Series(np.zeros(rownum))
        # Build an all-NaN column and append it on the right.
        col = self.__convert_row2column(s.replace(0, np.nan).values)
        original = np.concatenate([original, col], axis=1)
    return original
def parse_data():
    sd = parse_get_data("../Test/Data/hemsedal_hollekolten_jan2016.xml")
    ff_s = Series(sd["25112"]["FF"]["val"], index=sd["25112"]["index"])
    # replace fill values with NaN
    ff_s.replace(-99999.0, np.nan, inplace=True)
    # daily mean wind speed
    ff24_s = ff_s.resample("D").mean()

    sd = parse_get_data("../Test/Data/hemsedal_hoelto_jan2016.xml")
    ts = Series(sd["25100"]["RR_24"]["val"], index=sd["25100"]["index"])
    # select only measurements at 06 every day
    ts = ts[ts.index.hour == 6]
    # ts.replace(-99999.0, np.nan, inplace=True)
    # ts.replace(-1.0, 0.0, inplace=True)

    print(ff24_s)

    plt.bar(ts.index, ts.values)
    plt.plot(ff_s.index, ff_s.values)
    plt.plot(ff24_s.index, ff24_s.values)
    # plt.bar(sd['25100']['index'], sd['25100']['RR_24']['val'])
    plt.show()
# This helped: http://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-columns
temps = Series(temps)
# get rid of the underlines
temps = temps.drop(temps.index[[1]])
# create a data frame with one column
temps = DataFrame(temps, columns=['col'])
# create an object with a list of lists for the first row
# (Q: Why does this return a list of lists? Can this be done with values?)
col_n = list(temps.ix[0].str.split())
# split columns of the dataframe and make col_n the column indexes
temps = pd.DataFrame(list(temps.col.str.split()), columns=col_n[0])
# drop the duplicate column-name row
temps = temps.drop(temps.index[0])
# this would strip whitespace, but I think it's unnecessary:
# temps.apply(lambda x: x.str.strip())

# Change 'M' entries to missing values
import numpy as np
temps.replace('M', np.nan, inplace=True)

# create a column with TX vs TN, change MO so it is actually Month
temps['Lvl'] = Series(temps['MO']).str[-2:]
temps['MO'] = Series(temps['MO']).str[:-2]
temps['YRMO'] = Series(temps['YR'] + temps['MO'])
# make year and month indexes (Q: Added Lvl as well -- does this shape make sense?)
temps = temps.set_index(['YR', 'MO', 'YRMO', 'Lvl'])
# Q: added in YRMO so I can group and plot, but there must be a way to do this
# with the hierarchical indexing
temps = temps.stack().unstack(['Lvl'])
# adding a name to the day index
temps.head(100)
temps.index.names = ['YR', 'MO', 'YRMO', 'DAY']
# convert TX and TN to numbers
temps = temps.convert_objects(convert_numeric=True)
# grouping
yrmo_grouped = temps.groupby(level=(['YRMO'])).mean()
# Q: really don't think that should need YRMO
yr_grouped = temps.groupby(level=(['YR'])).mean()
#!/usr/bin/env python
# encoding=utf-8

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Replacing values
# Similar to fillna, but replaces arbitrary values
data = Series([1., -999., 2., -999., -1000., 3.])
print(data)

# We may use -999 as a missing-value sentinel by convention, but NaN is unambiguous
print(data.replace(-999, np.nan))

# Replace several values at once
print(data.replace([-999, -1000], np.nan))

# Replace different values with different replacements; note the positional pairing
print(data.replace([-999, -1000], [np.nan, 0]))

# Dict form: keys are the values to be replaced, values are the replacements
print(data.replace({-999: np.nan, -1000: 0}))
first, second, third = pieces
first + '::' + second + '::' + third
# More idiomatic:
'::'.join(pieces)

# Count substring occurrences
val.count(',')
# Replace a substring
val.replace(',', '::')

#************************************************************
# Regular expressions (regex)
# The re module covers three tasks: pattern matching, substitution, splitting
import re
text = "foo bar\t baz \tqux"
# r'\s+' matches any run of whitespace, whether spaces or tab characters
re.split(r'\s+', text)
def replace01():
    data = Series([1, -999., 2., -999., -1000., 3.])
    print(data)
    print(data.replace(-999, np.nan))
    print(data.replace([-999, -1000], np.nan))
    print(data.replace({-999: np.nan, -1000: 0}))
class MySeries:
    def __init__(self, *args, **kwargs):
        self.x = Series(*args, **kwargs)
        self.values = self.x.values
        self.index = self.x.index

    def rolling_mean(self, *args, **kwargs):
        return MySeries(pd.rolling_mean(self.x, *args, **kwargs))

    def rolling_count(self, *args, **kwargs):
        return MySeries(pd.rolling_count(self.x, *args, **kwargs))

    def rolling_sum(self, *args, **kwargs):
        return MySeries(pd.rolling_sum(self.x, *args, **kwargs))

    def rolling_median(self, *args, **kwargs):
        return MySeries(pd.rolling_median(self.x, *args, **kwargs))

    def rolling_min(self, *args, **kwargs):
        return MySeries(pd.rolling_min(self.x, *args, **kwargs))

    def rolling_max(self, *args, **kwargs):
        return MySeries(pd.rolling_max(self.x, *args, **kwargs))

    def rolling_std(self, *args, **kwargs):
        return MySeries(pd.rolling_std(self.x, *args, **kwargs))

    def rolling_var(self, *args, **kwargs):
        return MySeries(pd.rolling_var(self.x, *args, **kwargs))

    def rolling_skew(self, *args, **kwargs):
        return MySeries(pd.rolling_skew(self.x, *args, **kwargs))

    def rolling_kurtosis(self, *args, **kwargs):
        # old pandas exposed this helper as rolling_kurt, not rolling_kurtosis
        return MySeries(pd.rolling_kurt(self.x, *args, **kwargs))

    def rolling_window(self, *args, **kwargs):
        return MySeries(pd.rolling_window(self.x, *args, **kwargs))

    def cumprod(self, *args, **kwargs):
        return MySeries(self.x.cumprod(*args, **kwargs))

    def cumsum(self, *args, **kwargs):
        return MySeries(self.x.cumsum(*args, **kwargs))

    def diff(self, *args, **kwargs):
        return MySeries(self.x.diff(*args, **kwargs))

    def div(self, *args, **kwargs):
        return MySeries(self.x.div(*args, **kwargs))

    def mul(self, *args, **kwargs):
        return MySeries(self.x.mul(*args, **kwargs))

    def add(self, *args, **kwargs):
        return MySeries(self.x.add(*args, **kwargs))

    def dropna(self, *args, **kwargs):
        return MySeries(self.x.dropna(*args, **kwargs))

    def fillna(self, *args, **kwargs):
        return MySeries(self.x.fillna(*args, **kwargs))

    def floordiv(self, *args, **kwargs):
        return MySeries(self.x.floordiv(*args, **kwargs))

    def mod(self, *args, **kwargs):
        return MySeries(self.x.mod(*args, **kwargs))

    def nlargest(self, *args, **kwargs):
        return MySeries(self.x.nlargest(*args, **kwargs))

    def nonzero(self, *args, **kwargs):
        return MySeries(self.x.nonzero(*args, **kwargs))

    def nsmallest(self, *args, **kwargs):
        return MySeries(self.x.nsmallest(*args, **kwargs))

    def pow(self, *args, **kwargs):
        return MySeries(self.x.pow(*args, **kwargs))

    def rank(self, *args, **kwargs):
        return MySeries(self.x.rank(*args, **kwargs))

    def round(self, *args, **kwargs):
        return MySeries(self.x.round(*args, **kwargs))

    def shift(self, *args, **kwargs):
        return MySeries(self.x.shift(*args, **kwargs))

    def sub(self, *args, **kwargs):
        return MySeries(self.x.sub(*args, **kwargs))

    def abs(self, *args, **kwargs):
        return MySeries(self.x.abs(*args, **kwargs))

    def clip(self, *args, **kwargs):
        return MySeries(self.x.clip(*args, **kwargs))

    def clip_lower(self, *args, **kwargs):
        return MySeries(self.x.clip_lower(*args, **kwargs))

    def clip_upper(self, *args, **kwargs):
        return MySeries(self.x.clip_upper(*args, **kwargs))

    def interpolate(self, *args, **kwargs):
        return MySeries(self.x.interpolate(*args, **kwargs))

    def resample(self, *args, **kwargs):
        return MySeries(self.x.resample(*args, **kwargs))

    def replace(self, *args, **kwargs):
        return MySeries(self.x.replace(*args, **kwargs))
ser[:, 'a']  # return all from primary index, but use secondary index = 'a'
ser[1, 'a']  # returns value at index 1 (primary), 'a' (secondary)
df = ser.unstack()  # converts a hierarchical-index series into a dataframe with the primary index as rows and the secondary index as columns

# combine_first() method
Series(np.where(pd.isnull(ser1), ser2, ser1), index=['x', 'y', 'z', 'q', 'r', 's'])  # Series meets numpy's where meets pandas' isnull() method
# the above statement says: where ser1 values are NaN, use ser2 values, else use ser1 values
ser1.combine_first(ser2)  # combine_first() does the same
df1.combine_first(df2)    # does the same with dataframes

ser1.replace(1, 10)                   # replace '1' in your series with '10'
ser1.replace(1, np.nan)               # replace '1' in your series with NaN
ser1.replace([1, 4], [100, 400])      # replace values (1 and 4) with (100 and 400)
ser1.replace({4: 'clown', 2: 'owl'})  # replace 4 with clown, and 2 with owl

###############################################################
###                                                         ###
###                    DataFrame basics                     ###
###                                                         ###
###############################################################
# the key method here is DataFrame()
import webbrowser