def pd_02():
    """Demonstrate detecting, dropping and filling missing values in pandas.

    Every intermediate result is printed to stdout; nothing is returned.
    """
    # Both np.nan and None count as missing values.
    string_data = Series(['a', 'b', 'c', np.nan, 'e', None])
    print(string_data)
    print(string_data.isnull())
    print(string_data.dropna())

    df = DataFrame(np.random.randn(7, 3))
    # Label slices are inclusive: rows 0-4 of column 1, rows 0-2 of column 2.
    df.loc[:4, 1] = np.nan
    df.loc[:2, 2] = np.nan
    print(df)
    print(df.dropna())
    print(df.fillna(0))
    # Per-column fill values; key 3 names no column of this 3-column frame.
    print(df.fillna({1: 0.5, 3: -1}))
    # The calls above returned new objects, so df itself is still unfilled.
    print(df)
    df.fillna(0, inplace=True)
    print(df)
def is_boolean(series: pd.Series, series_description: dict) -> bool:
    """Decide whether a series is boolean in the broad sense.

    Args:
        series: The series under inspection.
        series_description: Description computed so far; provides
            "value_counts_without_nan" and "distinct_count_without_nan".

    Returns:
        True when the distinct non-missing values have bool dtype, when a
        numeric series has one or two distinct values all within [0, 1], or
        when the values form one of the accepted yes/no style pairs
        (compared case-insensitively). False otherwise.
    """
    distinct_values = series_description["value_counts_without_nan"].keys()

    # Case 1: the distinct values themselves are booleans.
    if pd.api.types.is_bool_dtype(distinct_values):
        return True

    # Case 2: numeric series with at most two distinct values inside [0, 1].
    if (
        1 <= series_description["distinct_count_without_nan"] <= 2
        and pd.api.types.is_numeric_dtype(series)
        and series[~series.isnull()].between(0, 1).all()
    ):
        return True

    # Case 3: textual encodings. Up to four raw values are tolerated because
    # case variants collapse once lower-cased.
    if 1 <= series_description["distinct_count_without_nan"] <= 4:
        lowered = {str(value).lower() for value in distinct_values.values}
        accepted_pairs = (
            {"y", "n"},
            {"yes", "no"},
            {"true", "false"},
            {"t", "f"},
        )
        if len(lowered) == 2 and lowered in accepted_pairs:
            return True

    return False
def check_if_datetime_as_object_feature(X: Series) -> bool:
    """Return True when an object-typed series appears to hold datetimes.

    The result is False when every value is null, when the type family of
    the dtype is not 'object', when the values parse as numbers, or when more
    than 80% of the (possibly sampled) values fail datetime parsing.
    """
    type_family = get_type_family_raw(X.dtype)
    # TODO: Check if low numeric numbers, could be categorical encoding!
    # TODO: If low numeric, potentially it is just numeric instead of date
    if X.isnull().all():
        return False
    if type_family != 'object':  # TODO: seconds from epoch support
        return False
    try:
        # TODO: pd.Series(['20170204','20170205','20170206']) is incorrectly not detected as datetime_as_object
        #  But we don't want pd.Series(['184','822828','20170206']) to be detected as datetime_as_object
        #  Need some smart logic (check min/max values?, check last 2 values don't go >31?)
        pd.to_numeric(X)
    except Exception:
        # Not purely numeric, so datetime parsing is worth attempting.
        # `except Exception` (rather than a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        try:
            if len(X) > 500:
                # Sample to speed-up type inference
                X = X.sample(n=500, random_state=0)
            result = pd.to_datetime(X, errors='coerce')
            if result.isnull().mean() > 0.8:  # If over 80% of the rows are NaN
                return False
            return True
        except Exception:
            return False
    else:
        # Everything parsed as a number: treat as numeric, not datetime.
        return False
def describe_url_1d(series: pd.Series, series_description: dict) -> dict:
    """Summarise a series of URLs.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far;
            "value_counts_without_nan" is read from it.

    Returns:
        A dict with one "<part>_counts" value-count Series per URL component
        (scheme, netloc, path, query, fragment) plus the most frequent value
        ("top") and its frequency ("freq").
    """
    # Work on string representations of the non-missing values (Issue #100)
    cleaned = series[~series.isnull()].astype(str)

    part_names = ["scheme", "netloc", "path", "query", "fragment"]
    # Transpose the per-URL split results into one tuple per URL component.
    per_part_values = zip(*cleaned.map(urlsplit))

    stats = {}
    for part_name, values in zip(part_names, per_part_values):
        counts = pd.Series(values, name=part_name).value_counts()
        stats[f"{part_name.lower()}_counts"] = counts

    # Indexing position 0 requires at least one non-missing value
    value_counts = series_description["value_counts_without_nan"]
    stats["top"] = value_counts.index[0]
    stats["freq"] = value_counts.iloc[0]
    return stats
def init_row(self, rule: Rule, results: pd.Series, conn: Connector, context: Dict = None):
    """
    Derive the quality-check metrics from a rule's result series and store
    them on this object.

    Raises ValueError when `results` contains any null value. `context` is
    subscripted with "task_ts", so it must be a mapping despite its default.
    """
    has_nulls = results.isnull().any()
    if has_nulls:
        raise ValueError(
            "In results of rule.apply can't be any Null values.")  # todo - add to doc

    self.task_ts = context["task_ts"]

    # Rule metadata
    self.attribute = rule.attribute
    self.rule_name = rule.name
    self.rule_description = rule.description

    # Counts; the explicit comparisons keep the original element-wise
    # equality semantics.
    self.total_records = len(results)
    self.failed = len(results[results == False])  # noqa: E712
    self.passed = len(results[results == True])  # noqa: E712

    # set_medians runs after the counts above are set, as in the original order.
    self.set_medians(conn)
    self.time_filter = rule.time_filter

    self.failed_percentage = self._perc(self.failed, self.total_records)
    self.passed_percentage = self._perc(self.passed, self.total_records)
    if self.failed > 0:
        self.status = "invalid"
    else:
        self.status = "valid"
def infer_pd_series_spark_type(
        pser: pd.Series, dtype: Dtype, prefer_timestamp_ntz: bool = False) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param pser: :class:`pandas.Series` to be inferred
    :param dtype: the Series' dtype
    :param prefer_timestamp_ntz: if true, infers datetime without timezone as
        TimestampNTZType type. If false, infers it as TimestampType.
    :return: the inferred Spark data type
    """
    if dtype == np.dtype("object"):
        # Nothing to inspect: an empty or all-null object series maps to NullType.
        if len(pser) == 0 or pser.isnull().all():
            return types.NullType()
        first_value = pser.iloc[0]
        if hasattr(first_value, "__UDT__"):
            return first_value.__UDT__
        arrow_type = pa.Array.from_pandas(pser).type
        return from_arrow_type(arrow_type, prefer_timestamp_ntz)

    if isinstance(dtype, CategoricalDtype):
        if isinstance(pser.dtype, CategoricalDtype):
            source_dtype = pser.cat.codes.dtype
        else:
            # `pser` must already be converted to codes.
            source_dtype = pser.dtype
        return as_spark_type(source_dtype, prefer_timestamp_ntz=prefer_timestamp_ntz)

    return as_spark_type(dtype, prefer_timestamp_ntz=prefer_timestamp_ntz)
def test_isnull(self):
    """isnull/notnull must flag a NaT Period inside a Series (GH 13737)."""
    periods = Series([pd.Period('2011-01', freq='M'),
                      pd.Period('NaT', freq='M')])

    expected_null = Series([False, True])
    expected_notnull = Series([True, False])

    tm.assert_series_equal(periods.isnull(), expected_null)
    tm.assert_series_equal(periods.notnull(), expected_notnull)
def get_errors(self, series: pd.Series, column: 'column.Column'):
    """Return one ValidationWarning per element of `series` that fails validation.

    The child class's `validate` supplies the pass mask. When
    `column.allow_empty` is set, empty entries are not reported: null values
    for categorical/numeric series, zero-length strings otherwise.
    """
    failed = ~self.validate(series)

    if column.allow_empty:
        # Categoricals are checked explicitly because issubdtype will FAIL on them.
        if is_categorical_dtype(series) or is_numeric_dtype(series):
            not_empty = ~series.isnull()
        else:
            not_empty = series.str.len() > 0
        # Only non-empty entries that fail validation are reported.
        failed = not_empty & failed

    # The index label is reported as the row; it is probably a row number.
    return [
        ValidationWarning(
            message=self.message,
            value=series[label],
            row=label,
            column=series.name
        )
        for label in series.index[failed]
    ]
def describe_path_1d(series: pd.Series, series_description: dict) -> dict:
    """Summarise a series of filesystem paths.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            It is updated in place with the categorical description, and a
            pre-computed "p_series" entry, if present, is consumed and removed.

    Returns:
        The path summary extended with the most frequent value ("top") and
        its frequency ("freq").
    """
    series_description.update(describe_categorical_1d(series, series_description))

    if "p_series" in series_description:
        # Reuse the already-converted series and drop it from the description.
        series = series_description.pop("p_series")
    else:
        # Make sure we deal with strings (Issue #100)
        series = series[~series.isnull()].astype(str).map(Path)

    stats = path_summary(series)

    # Indexing position 0 requires at least one non-missing value
    counts = series_description["value_counts_without_nan"]
    stats["top"] = counts.index[0]
    stats["freq"] = counts.iloc[0]
    return stats
def main():
    """
    Handling of not applicable values.

    Walks through detecting, dropping and filling NA values on Series and
    DataFrame objects, printing each result. Returns nothing.
    """
    string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    print(string_data)
    print(string_data.isnull())
    string_data[0] = None
    print(string_data.isnull())
    # None and np.nan are neither the same object nor equal
    print('%s %s' % (None is np.nan, None == np.nan))

    # Exclude N/A
    print(' ')
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    print(data.dropna())
    print(data[data.notnull()])

    data = DataFrame([
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.]
    ])
    cleaned = data.dropna()  # keeps only rows without any NA
    print(data)
    print(cleaned)
    print(data.dropna(how='all'))
    data[4] = None
    print(data.dropna(axis=1, how='all'))
    print(data.dropna(thresh=2))  # keep rows holding at least 2 non-NA values

    # Fill NA
    print(' ')
    print(data.fillna(0))
    print(data.fillna({1: 0.5, 2: -1}))
    _ = data.fillna(0, inplace=True)
    print(data)

    print(' ')
    df = DataFrame(np.arange(18).reshape((6, 3)))
    # Columns 1 and 2 are about to receive NaN; casting them first gives the
    # same result without relying on implicit int-to-float upcasting.
    df = df.astype({1: 'float64', 2: 'float64'})
    df.loc[2:, 1] = NA
    df.loc[4:, 2] = NA
    print(df)
    print(df.ffill())
    print(df.ffill(limit=2))

    data = Series([1., NA, 3.5, NA, 7])
    print(data.fillna(data.mean()))
def bin_variable(var: pd.Series,
                 target: pd.Series = None,
                 points: list = None,
                 min_size: float = 5,
                 rnd: int = 2,
                 return_points: bool = False):
    """
    Bin a numeric variable and return it as an ordered categorical.

    Keyword arguments:
        var (pd.Series) -- Numeric variable
        target (pd.Series) -- Target binary variable; when given (and no
                              points are supplied) the tree-based cut-point
                              search is used
        points (list) -- List of cut-points (default None: computed)
        min_size (float) -- minimum size of group in percent
        rnd -- Round level for variable values (default 2)
        return_points (bool) -- also return the cut-points actually used

    Output:
        (cut-points, categorized variable) when return_points is true,
        otherwise the categorized variable alone
    """
    if points is None:
        if target is None:
            points = points_calulation(var=var, min_size=min_size, rnd=rnd)
        else:
            points = points_calculation_tree(var=var,
                                             target=target,
                                             min_size=min_size,
                                             rnd=rnd)

    if points == [-np.inf, np.inf]:
        # A single all-covering bin: only missing vs. not missing remains.
        var = pd.cut(var, bins=points, labels=['Not Missing'])
    else:
        # Pad the cut-points so the outer bins are open-ended.
        lower_pad = [] if points[0] == -np.inf else [-np.inf]
        upper_pad = [] if points[-1] == np.inf else [np.inf]
        points = lower_pad + points + upper_pad
        var = pd.cut(var, bins=points, include_lowest=False)

    categories = [str(interval) for interval in var.cat.categories]

    if var.isnull().sum() > 0:
        var = var.cat.add_categories('Missing')
        var = var.fillna('Missing')
    var = var.astype(str)

    if len(categories) > 1:
        # Give the two open-ended bins readable labels.
        first_label = '[<={}]'.format(points[1])
        last_label = '(>{}]'.format(points[-2])
        var[var == categories[0]] = first_label
        var[var == categories[-1]] = last_label
        categories[0] = first_label
        categories[-1] = last_label

    if (var == 'Missing').sum() > 0:
        categories = ['Missing'] + categories

    var = pd.Categorical(var, categories=categories, ordered=True)
    if return_points:
        return (points, var)
    return var
def normalize_edgeR(X: pd.DataFrame, method: str = "tmm", length: pd.Series = None, p=0.75, **kws):
    """
    Normalize a count table with edgeR and return counts-per-million.

    X: pd.DataFrame where rows are samples and columns are genes
    method: one of ("tmm", "rle", "upperquartile", "getmm"), case-insensitive
    length: pd.Series of gene sequences or sequence lengths indexed by gene;
        required for "getmm"
    p, **kws: forwarded to edgeR's calcNormFactors

    "TMM" is the weighted trimmed mean of M-values (to the reference) proposed by Robinson and Oshlack (2010),
    where the weights are from the delta method on Binomial data.
    "RLE" is the scaling factor method proposed by Anders and Huber (2010). We call it "relative log expression",
    as median library is calculated from the geometric mean of all columns and the median ratio of each sample
    to the median library is taken as the scale factor.
    "upperquartile" is the upper-quartile normalization method of Bullard et al (2010), in which the scale factors
    are calculated from the 75% quantile of the counts for each library, after removing genes which are zero in
    all libraries. This idea is generalized here to allow scaling by any quantile of the distributions.
    "GeTMM" Gene length corrected trimmed mean of M-values. Must include gene lengths.
    https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2246-7#MOESM4

    edgeR: http://bioconductor.org/packages/release/bioc/html/edgeR.html

    Returns a pd.DataFrame with the same orientation as X (samples as rows).
    """
    edgeR = R_package_retrieve("edgeR")
    assert isinstance(X, pd.DataFrame), "type(df_counts) must be pd.DataFrame"

    # Method formatting: validate and canonicalise case-insensitively so that
    # e.g. "Tmm" or "UpperQuartile" reach edgeR in the spelling used below.
    method = method.lower()
    assert_acceptable_arguments(
        query=[method],
        target={"tmm", "rle", "upperquartile", "getmm"})
    if method in {"tmm", "rle"}:
        method = method.upper()

    # Check RPK for GeTMM
    if method == "getmm":
        assert length is not None, "If GeTMM is chosed as the method then `length` cannot be None.  It must be either a pd.Series of sequences or sequence lengths"
        # reindex yields NaN for genes absent from `length`, which the
        # assertion below reports; plain label selection would raise KeyError
        # before the assertion could run.
        length = length.reindex(X.columns)
        assert length.isnull().sum() == 0, "Not all of the genes in `X.columns` are in `length.index`.  \nEither use a different normalization or get the missing sequence lengths"
        # If sequences are given then convert to length (assumes CDS and no introns)
        if pd.api.types.is_string_dtype(length):
            length = length.map(len)
        # Length-correct the counts, then continue with TMM
        X = X / length
        method = "TMM"

    # Gene axis as rows
    X = X.T
    # Labels
    idx_attrs = X.index
    idx_obsvs = X.columns

    # Convert pd.DataFrame to R-object
    rX = pandas_to_rpy2(X)
    d = edgeR.DGEList(counts=rX)

    # Calculate NormFactors
    normalization_factors = edgeR.calcNormFactors(d, method=method, p=p, **kws)

    # Normalized counts
    normalized_counts = edgeR.cpm(normalization_factors)
    X_tmm = pd.DataFrame(rpy2_to_pandas(normalized_counts),
                         index=idx_attrs,
                         columns=idx_obsvs).T
    return X_tmm
def test_isnull_for_inf(self):
    """With 'mode.use_inf_as_null' on, inf is reported and dropped like NaN."""
    values = Series(['a', np.inf, np.nan, 1.0])

    with pd.option_context('mode.use_inf_as_null', True):
        null_mask = values.isnull()
        without_null = values.dropna()

    expected_mask = Series([False, True, True, False])
    expected_kept = Series(['a', 1.0], index=[0, 3])
    tm.assert_series_equal(null_mask, expected_mask)
    tm.assert_series_equal(without_null, expected_kept)
def validate_series(series: pd.Series) -> None:
    """
    Ensure `series` is "valid" as per Workbench standards, or raise ValueError.

    "Valid" means:

    * If dtype is `object` or `categorical`, all values are `str`, `np.nan` or
      `None`
    * Otherwise, series must be numeric (but not "nullable integer") or
      datetime (without timezone).

    Categorical series must additionally use every category.
    """
    dtype = series.dtype
    if dtype in SupportedNumberDtypes:
        infinities = series.isin([np.inf, -np.inf])
        if infinities.any():
            idx = series[infinities].index[0]
            raise ValueError(("invalid value %r in column %r, row %r "
                              "(infinity is not supported)") %
                             (series[idx], series.name, idx))
        return
    elif is_datetime64_dtype(dtype):
        # Timezone-naive datetime64 needs no further checks.
        return
    elif dtype == object:
        present = series[~series.isnull()]
        nonstr = (present.map(type) != str)
        if nonstr.any():
            # Select the offender through the boolean mask so the lookup does
            # not depend on index labels coinciding with positions.
            raise ValueError(
                "invalid value %r in column %r (object values must all be str)"
                % (present[nonstr].iloc[0], series.name))
    elif hasattr(series, 'cat'):
        categories = series.cat.categories
        nonstr = (categories.map(type) != str)
        if nonstr.any():
            raise ValueError(
                "invalid value %r in column %r (categories must all be str)" %
                (categories[np.flatnonzero(nonstr)[0]], series.name))

        # Detect unused categories: they waste space, and since the module
        # author need only .remove_unused_categories() there isn't much reason
        # to allow them (other than the fact this check might be slow?).
        codes = np.unique(series.cat.codes)  # retval is sorted
        if len(codes) and codes[0] == -1:
            # -1 is the code used for missing values; it is not a category.
            codes = codes[1:]
        # At this point, if all categories are used, `codes` is an Array of
        # [0, 1, ..., len(categories)-1]. Otherwise, there's a "hole" somewhere
        # in `codes` (it may be at the end).
        if len(codes) != len(categories):
            # There are unused categories. That means an index into
            # `categories` is not in `codes`. Raise it.
            for i, category in enumerate(categories):
                if i >= len(codes) or codes[i] != i:
                    raise ValueError(('unused category %r in column %r '
                                      '(all categories must be used)') %
                                     (category, series.name))
            assert False  # the for-loop is guaranteed to raise, in theory
    else:
        raise ValueError('unsupported dtype %r in column %r' %
                         (dtype, series.name))
def __call__(self, value: Series) -> bool:
    """Remember `value` and report whether it satisfies the nullability rule.

    A nullable check always passes; a non-nullable one passes only when
    `value` contains no null entries.
    """
    self.value = value
    if self.nullable:
        return True
    contains_null = value.isnull().values.any()
    return not contains_null
def full_join_except_id(row: pd.Series):
    """Join every non-null value after the first element into one string.

    The first element (the id) is skipped. Remaining values are converted
    with `str`, so non-string values (e.g. prices as floats) are included,
    and are separated by single spaces.
    """
    rest = row[1:]
    # Compute the null mask once and apply it positionally; a per-element
    # label lookup is quadratic and ambiguous when labels repeat.
    present = rest[rest.notnull().values]
    return " ".join(str(value) for value in present)
def create_site_specific_splits(
    site: pd.Series,
    proportions: Dict[str, int],
    random_state: Optional[Union[int, np.random.mtrand.RandomState]] = 989,
):
    """Splits sites into distinct groups whose sizes roughly match the given proportions.

    Null sites are randomly assigned to groups using the provided proportions.

    Args:
        site (pd.Series): A series of sites, one element per observation,
        proportions (dict): A dict whose keys are the resulting groups and whose
            values are the rough proportion of data in each group.
        random_state (int, RandomState or None): Seed or random state used for
            the random split of null sites. None draws a fresh, unseeded state.

    Example:
        Split data into groups where each site is in one and only one group
        with roughly 50-25-25 train-val-holdout proportions.

        >>> create_site_specific_splits(site, proportions={"train": 2, "val": 1, "holdout": 1})

    Returns:
        pd.Series: A series containing the resulting split, one element per observation.
    """
    assignments = {}
    # Most frequent sites first, so they are dealt out before the small ones.
    sites = site.value_counts(dropna=True).sort_values(ascending=False).index
    n_subgroups = sum(proportions.values())

    # Deal the sites out in round-robin order; a group with proportion k
    # occupies k of every n_subgroups slots.
    for i, subset in enumerate(
            roundrobin(*([subset] * proportions[subset] for subset in proportions))):
        for group in sites[i::n_subgroups]:
            assignments[group] = subset

    # Divide null sites among the groups
    null_sites = site.isnull()
    n_null = null_sites.sum()
    if n_null > 0:
        logger.debug(f"{n_null:,} null sites randomly assigned to groups.")

        # One unique placeholder site per group; the final replace() maps each
        # placeholder to its group.
        null_groups = []
        for group in proportions:
            null_group = f"{group}-{uuid4()}"
            null_groups.append(null_group)
            assignments[null_group] = group

        # Seeds (including None) build a RandomState; any other object is
        # assumed to be a ready-made random state and used as is.
        if random_state is None or isinstance(random_state, (int, np.integer)):
            rng = np.random.RandomState(random_state)
        else:
            rng = random_state

        site = site.copy()
        site.loc[null_sites] = rng.choice(
            null_groups,
            p=np.asarray(list(proportions.values())) / n_subgroups,
            size=n_null,
            replace=True,
        )
    return site.replace(assignments)
def _check_inputs( s_test_pred: pd.Series, s_calib_pred: pd.Series, s_calib_actual: pd.Series, ) -> None: """ Check that inputs have valid names and could be proabilities """ if ( s_test_pred.min() < 0 or s_test_pred.max() > 1 or s_calib_pred.min() < 0 or s_calib_pred.max() > 1 ): raise RuntimeError( "Probabilities outside (0,1) range were passed to calibrate" ) if not s_calib_pred.name == s_test_pred.name: warnings.warn(f"{s_calib_pred.name} != {s_test_pred.name}") if s_test_pred.isnull().sum() > 0: _log_missing_indices(s_test_pred) raise RuntimeError("Missing values in s_test_pred") if s_calib_pred.isnull().sum() > 0: _log_missing_indices(s_calib_pred) raise RuntimeError("Missing values in s_calib_pred") if s_calib_actual.isnull().sum() > 0: _log_missing_indices(s_calib_actual) raise RuntimeError("Missing values in s_calib_actual") if ( not len(s_calib_pred) == len(s_calib_actual) or len(s_calib_pred.index.difference(s_calib_actual.index)) > 0 ): raise RuntimeError( f"len(s_calib_pred): {len(s_calib_pred)} " f"len(s_calib_actual): {len(s_calib_actual)} " f"index diff: " f"{s_calib_pred.index.difference(s_calib_actual.index)}" f"s_calib_pred.head() : {s_calib_pred.head()}" f"s_calib_pred.tail() : {s_calib_pred.tail()}" f"s_calib_actual.head() : {s_calib_actual.head()}" f"s_calib_actual.tail() : {s_calib_actual.tail()}" )
def parse(self, column_data: pd.Series):
    """Record sample count, missing-value count and name of `column_data`.

    May be called only once per object; a second call fails the assertion.
    """
    already_parsed_msg = ('Cannot call parse twice. '
                          'Use "col_prop.clone()" and parse again.')
    assert not self._parsed, already_parsed_msg
    self._parsed = True

    wrong_type_msg = 'Currently, the input column data must be a Pandas Series.'
    assert isinstance(column_data, pd.Series), wrong_type_msg

    missing_total = column_data.isnull().sum().sum()
    self._num_sample = len(column_data)
    # .item() converts the numpy scalar into a plain Python number
    self._num_missing_samples = missing_total.item()
    self._name = column_data.name
def slide_20():
    """Demonstrate the vectorised string methods of a Series.

    Prints the result of each `.str` operation; returns nothing.
    """
    import re
    data = {'Dave': '*****@*****.**',
            'Steve': '*****@*****.**',
            'Rob': '*****@*****.**',
            'Wes': np.nan}
    data = Series(data)
    print(data)
    print(data.isnull())
    print(data.str.contains('gmail'))

    # Three capture groups: user name, domain name, domain suffix
    pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
    print(data.str.findall(pattern, flags=re.IGNORECASE))

    matches = data.str.match(pattern, flags=re.IGNORECASE)
    print(matches)
    # NOTE(review): the two lookups below assume `matches` holds per-element
    # group sequences, as legacy pandas returned from str.match - confirm
    # against the pandas version in use.
    print(matches.str.get(1))
    print(matches.str[0])

    print(data)
    print(data.str[:5])
def is_url(series: pd.Series, counts: dict) -> bool:
    """Return whether every non-missing value parses as a URL.

    A value qualifies when its parsed scheme, netloc and path are all
    non-empty. A series without distinct non-missing values is not a URL
    series, and neither is one whose parsing raises ValueError.
    """
    if not counts["distinct_count_without_nan"] > 0:
        return False

    try:
        parsed = series[~series.isnull()].astype(str).apply(urlparse)
        has_all_parts = parsed.apply(
            lambda url: all([url.scheme, url.netloc, url.path]))
        return has_all_parts.all()
    except ValueError:
        return False
def practice_four():
    """Walk through the NA-handling methods of Series and DataFrame.

    Results are computed but neither printed nor returned; the function
    serves as annotated example code.
    """
    string_data = Series(['aa', 'ar', np.nan, 'av'])
    string_data.isnull()  # flags which values are NA
    string_data[0] = None
    string_data.isnull()  # the built-in None is treated as NA as well

    # NA handling methods
    #   dropna   filter axis labels depending on whether their values contain
    #            missing data
    #   fillna   fill missing data with a given value or with an
    #            interpolation method (ffill or bfill)
    #   isnull   return a boolean object marking which values are NA
    #   notnull  the negation of isnull

    # Filtering out missing data
    from numpy import nan as NA
    data = Series([1, NA, 3.5, NA, 7])
    data.dropna()  # a Series holding only the non-null data and its index
    data[data.notnull()]  # same as above

    data = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA],
                      [NA, 6.5, 3.]])
    data.dropna()  # by default drops every row containing a missing value
    data.dropna(how='all')  # drops only rows that are entirely NA
    data.dropna(axis=1, how='all')  # drops only columns that are entirely NA

    df = DataFrame(np.random.randn(7, 3))
    # Label slices are inclusive: rows 0-4 of column 1, rows 0-2 of column 2.
    df.loc[:4, 1] = NA
    df.loc[:2, 2] = NA
    df.dropna(thresh=3)

    # Filling in missing data
    df.fillna(0)  # fill with 0
    # A dict fills different columns with different values; key 3 names no
    # column of this 3-column frame.
    df.fillna({1: 0.5, 3: -1})
    df.fillna(0, inplace=True)  # modifies df itself

    # Parameters of fillna
    #   value    scalar value or dict used to fill missing entries
    #   method   interpolation method
    #   axis     defaults to 0
    #   inplace  modify the calling object without producing a copy
    #   limit    maximum number of consecutive entries to fill
def get_bin_indices(self, series: pd.Series, dtype, bins) -> pd.Series:
    """Compute per-bin, null, "other" and combined indexers for `series`.

    For each entry of `bins` (name -> range configuration) the indexer is
    obtained from `self.condition_index`. Returns the 4-tuple
    `(bin_index, null_index, other_index, values_index)`.

    NOTE(review): the return annotation says `pd.Series` but a tuple is
    returned - confirm and correct the annotation.
    """
    bin_index = {}
    for bin, range_config in bins.items():
        bin_index[bin] = self.condition_index(range_config, dtype, series)
    # Union of all bin indexers; reduce() without an initial value raises
    # TypeError when `bins` is empty.
    values_index = functools.reduce(lambda a, b: a | b, bin_index.values())
    null_index = series.isnull()
    # NOTE(review): presumably "neither null nor in any bin". The meaning of
    # `Index & <boolean Series>` differs between pandas versions - verify that
    # this yields the intended result.
    other_index = series.index & ~(null_index | values_index)
    return bin_index, null_index, other_index, values_index
def degree_mean(data: pd.Series) -> float:
    """
    Return the circular mean, in degrees within [0, 360), of a series of
    angles given in degrees.

    NaN is returned when every value is null (an empty series included).
    """
    if data.isnull().all():
        # np.nan exists in every NumPy release; the np.NaN alias was removed
        # in NumPy 2.0.
        return np.nan
    rads = np.deg2rad(data)
    # Average the unit vectors and take the direction of the resultant.
    sums = np.arctan2(np.sum(np.sin(rads)), np.sum(np.cos(rads)))
    # arctan2 yields (-180, 180]; shift into [0, 360).
    return (np.rad2deg(sums) + 360) % 360
def transform(self, X, y=None):
    """Replace the 'Age' column of X with a 0/1 'Child' flag.

    Known ages become 1 when below 16 and 0 otherwise. Rows with a missing
    age receive the value `self.predict_age` returns for their
    ('SibSp', 'Parch') row. X is modified in place and returned.
    """
    missing_age = X['Age'].isnull()
    known_age = ~missing_age

    # Predict before the column is overwritten; skipped when nothing is missing.
    predicted = None
    if missing_age.any():
        predicted = X.loc[missing_age, ['SibSp', 'Parch']].apply(
            self.predict_age, axis=1)

    # .loc writes into X itself; chained `X['Age'][mask] = ...` may write to a
    # temporary copy instead. Assigning a Series aligns on the row labels.
    X.loc[known_age, 'Age'] = X.loc[known_age, 'Age'].map(
        lambda age: 1 if age < 16 else 0)
    if predicted is not None:
        X.loc[missing_age, 'Age'] = predicted

    X.rename(columns={'Age': 'Child'}, inplace=True)
    return X
def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y: pd.Series = None, **fit_params) \
        -> Union[pd.DataFrame, np.ndarray]:
    """
    Fit models for each fold, then transform X

    Rows whose target is null are regarded as test data: they are transformed
    in a separate pass that receives no target.

    Args:
        X:
            Data
        y:
            Target
        fit_params:
            Additional parameters passed to models

    Returns:
        Transformed version of X. It will be pd.DataFrame If X is `pd.DataFrame` and return_same_type is True.
    """
    assert len(X) == len(y)
    self._pre_train(y)

    is_pandas = isinstance(X, pd.DataFrame)
    X = convert_input(X)
    y = convert_input_vector(y, X.index)

    unlabelled = y.isnull()
    if unlabelled.sum() > 0:
        labelled = ~unlabelled
        X_ = X.copy()
        X_.loc[labelled, :] = self._fit_train(X[labelled], y[labelled], **fit_params)
        X_.loc[unlabelled, :] = self._fit_train(X[unlabelled], None, **fit_params)
    else:
        X_ = self._fit_train(X, y, **fit_params)

    X_ = self._post_transform(self._post_fit(X_, y))

    if self.return_same_type and is_pandas:
        return X_
    return X_.values
def _nullable_test(col: pd.Series, meta_col: dict) -> dict:
    """Run the "nullable" check on `col`, treating null entries as failures."""
    res_dict = _result_dict("nullable", {"column": meta_col.get("name")})
    # Null values are the out-of-bounds entries for this test.
    return _fill_res_dict(col, col.isnull(), res_dict)
def missing_val():
    """Demonstrate detecting and filling missing values; prints the results."""
    missing = np.nan
    series_obj = Series(['row1', 'row2', missing, 'row4'])
    print(series_obj.isnull())

    # Fixed seed so the random frame is reproducible
    np.random.seed(25)
    DF_rand = DataFrame(np.random.randn(36).reshape(6, 6))
    DF_rand.iloc[3:6, 0] = missing
    DF_rand.iloc[1:4, 5] = missing
    print(DF_rand.fillna(0))

    # Per-column fill values and forward fill; shown for illustration, the
    # results are not printed.
    filled_DF = DF_rand.fillna({0: .1, 5: 1.25})
    # ffill() replaces the deprecated fillna(method='ffill') spelling.
    ffill = DF_rand.ffill()

    # Number of missing values per column
    print(DF_rand.isnull().sum())
def impute_with_dist(feature: pd.Series):
    """
    Fill the missing entries of `feature` with values drawn at random
    according to the empirical distribution of its non-missing values.

    :param feature: series to impute; it is modified in place
    :return: the same series object, with its missing entries filled
    """
    distribution = feature.value_counts(normalize=True)
    missing = feature.isnull()
    n_missing = len(feature[missing])
    draws = np.random.choice(distribution.index,
                             size=n_missing,
                             p=distribution.values)
    feature[missing] = draws
    return feature
def validate_prediction_output(cls, result: pd.Series):
    """
    Validates if prediction output is of type Series and values of series are
    numeric (representable as float64)

    :param result: pandas series
    :return: None
    :raises: InvalidPredictionException
    """
    if not isinstance(result, pd.Series):
        raise InvalidPredictionException("Output of model. predict should be of type pandas Series")

    if result.empty or result.isnull().all():
        raise InvalidPredictionException("Prediction result for given test data was empty or None for all the rows."
                                         " Please verify the model")

    filled = result.fillna(0)
    try:
        is_numeric = np.array_equal(filled, filled.astype(np.float64))
    except (TypeError, ValueError):
        # Values that cannot be cast to float are reported through the
        # documented exception below rather than as a raw conversion error.
        is_numeric = False
    if not is_numeric:
        raise InvalidPredictionException("Prediction result of type other than int/float/double are not supported")
def rdf(series: pd.Series, mol_list: Iterable[MultiMolecule]) -> None:
    """Guess the missing parameters in **series** using the Boltzmann-inverted radial distribution function.

    Only entries of **series** that are null are updated, in place; guesses
    for keys that already hold a value are discarded.
    """
    missing = series.isnull()
    known_keys = series[~missing].index
    # Atoms appearing in any key that still lacks a parameter
    atom_subset = set(chain.from_iterable(series[missing].index))

    # Construct the RDF of each molecule and guess the parameters from it
    for mol in mol_list:
        distribution = mol.init_rdf(atom_subset=atom_subset)
        guess = estimate_lj(distribution)
        guess.index = pd.MultiIndex.from_tuples(
            sorted(key.split()) for key in guess.index)
        # Blank out guesses for already-known keys so update() skips them.
        guess[guess.index.intersection(known_keys)] = np.nan
        series.update(guess[series.name])
def _validate_class_labels(self, y: Series):
    """Raise ValueError when the labels `y` cannot be scored.

    Labels must not contain missing values. For multiclass problems whose
    eval_metric does not need predictions, every label must also be one of
    `self.class_labels`.
    """
    null_count = y.isnull().sum()
    if null_count:
        raise ValueError(f'Labels cannot contain missing (nan) values. Found {null_count} missing label values.')

    if self.problem_type != MULTICLASS or self.eval_metric.needs_pred:
        return

    valid_class_set = set(self.class_labels)
    unknown_classes = [cls for cls in np.unique(y) if cls not in valid_class_set]
    if unknown_classes:
        # log_loss / pac_score
        raise ValueError(f'Multiclass scoring with eval_metric=\'{self.eval_metric.name}\' does not support unknown classes. Unknown classes: {unknown_classes}')
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    :raises ValueError: if an object-dtype series is empty or entirely null
    """
    dt = s.dtype

    if dt == np.dtype('object'):
        # Object columns are inferred from their values, so some are required.
        if len(s) == 0 or s.isnull().all():
            raise ValueError("can not infer schema from empty or null dataset")
        arrow_type = pa.Array.from_pandas(s).type
        return types.from_arrow_type(arrow_type)

    if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
        return types.TimestampType()

    return types.from_arrow_type(pa.from_numpy_dtype(dt))
def check_if_datetime_feature(self, X: Series):
    """Return True when X is, or can be parsed element-wise as, datetime.

    An all-null series is never a datetime feature. A 'datetime' type family
    is accepted directly; for the 'object' family every element must be
    parseable by `pd.to_datetime`. Any other family is rejected.
    """
    type_family = self.get_type_family(X.dtype)
    # TODO: Check if low numeric numbers, could be categorical encoding!
    # TODO: If low numeric, potentially it is just numeric instead of date
    if X.isnull().all():
        return False
    if type_family == 'datetime':
        return True
    if type_family != 'object':  # TODO: seconds from epoch support
        return False
    try:
        X.apply(pd.to_datetime)
        return True
    except Exception:
        # Any parsing failure means "not a datetime"; `except Exception`
        # (rather than a bare except) lets KeyboardInterrupt/SystemExit through.
        return False
def test_comparison_operators_with_nas(self):
    """Comparison and boolean operators on an object Series containing NaN."""
    s = Series(bdate_range('1/1/2000', periods=10), dtype=object)
    s[::2] = np.nan

    # Comparisons: positions holding NaN compare False for every operator
    # except `ne`, where they compare True.
    for op_name in ('lt', 'le', 'gt', 'ge', 'eq', 'ne'):
        compare = getattr(operator, op_name)
        val = s[5]

        result = compare(s, val)
        expected = compare(s.dropna(), val).reindex(s.index)
        expected = expected.fillna(op_name == 'ne').astype(bool)
        assert_series_equal(result, expected)
        # The reversed-operand form, compare(val, s), is not checked here.

    # boolean &, |, ^ should work with object arrays and propagate NAs
    mask = s.isnull()
    for op_name in ('and_', 'or_', 'xor'):
        combine = getattr(operator, op_name)
        filled = s.fillna(s[0])

        result = combine(s < s[9], s > s[3])
        expected = combine(filled < filled[9], filled > filled[3])
        expected[mask] = False
        assert_series_equal(result, expected)
def _tile_inds(s, bins, labels=False, retbins=True, infinite=True):
    """Bin `s` and return a Categorical of NumRange intervals.

    Range 0 is [-inf, first edge] and the last range is [last edge, inf].
    Null values get code -1, as do values in the two infinite ranges when
    `infinite` is false. A series without any non-null value short-circuits
    to an array of None.
    """
    # NOTE: inf only happens when explicitly setting bins
    s = Series(s)
    # short circuit empty series
    if s.count() == 0:
        return np.repeat(None, len(s))

    if np.iterable(bins):
        # Explicit edges; they are not checked for monotonic increase.
        bins = np.asarray(bins)
        ind, label = inf_bins_to_cuts(s, bins)
    else:
        ind, label = cut(s, bins, retbins=retbins, labels=labels)
        # for now, pandas base cut doesn't support infinite ranges
        # so it bases first bin at 0 where we base on 1, and 0 is
        # [-inf, first] for us
        ind = ind + 1

    # build out ranges: open lower range, inner ranges, open upper range
    ranges = (
        [NumRange(-inf, label[0])]
        + [NumRange(label[i], label[i + 1]) for i in range(len(label) - 1)]
        + [NumRange(label[-1], inf)]
    )

    if not infinite:
        outside = (ind == 0) | (ind == len(bins))
        np.putmask(ind, outside, -1)

    ind[s.isnull().values] = -1

    # fastpath=True to skip the hashmap indexing.
    # The code generator will check identity, which won't match because
    # ind is an int position vector and ranges is a list of objects.
    # if fastpath is off, then it'll look like none of the values match
    return Categorical(ind, ranges, fastpath=True)
def fit(self, X, y=None):
    """Fit a grid-searched logistic regression predicting the child flag.

    Rows of X with a known 'Age' are used: the features are 'SibSp' and
    'Parch', the target is 1 when the age is below 16 and 0 otherwise. The
    fitted search is stored on `self.grid_search`; `self` is returned.
    """
    known = X.loc[X['Age'].notnull(), ['SibSp', 'Parch', 'Age']]
    y_child = known['Age'].map(lambda age: 1 if age < 16 else 0).rename('y_child')
    # Select the feature columns instead of dropping 'Age' in place on a
    # filtered copy.
    features = known[['SibSp', 'Parch']]

    lr = LogisticRegression()
    param_grid = {'C': [0.01, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3]}
    self.grid_search = GridSearchCV(lr, param_grid, cv=5, n_jobs=-1)
    self.grid_search.fit(features, y_child)
    return self
def _tile_inds(s, bins, labels=False, retbins=True, infinite=True):
    """Bin `s` and return a Categorical of NumRange intervals.

    Range 0 is [-inf, first edge] and the last range is [last edge, inf].
    Null values get code -1, as do values in the two infinite ranges when
    `infinite` is false. A series without any non-null value short-circuits
    to an array of None.
    """
    # NOTE: inf only happens when explicitly setting bins
    s = Series(s)
    # short circuit empty series
    if s.count() == 0:
        return np.repeat(None, len(s))

    if np.iterable(bins):
        # Explicit edges; they are not checked for monotonic increase.
        bins = np.asarray(bins)
        ind, label = inf_bins_to_cuts(s, bins)
    else:
        ind, label = cut(s, bins, retbins=retbins, labels=labels)
        # for now, pandas base cut doesn't support infinite ranges
        # so it bases first bin at 0 where we base on 1, and 0 is
        # [-inf, first] for us
        ind = ind + 1

    # build out ranges: open lower range, inner ranges, open upper range
    ranges = (
        [NumRange(-inf, label[0])]
        + [NumRange(label[i], label[i + 1]) for i in range(len(label) - 1)]
        + [NumRange(label[-1], inf)]
    )

    if not infinite:
        outside = (ind == 0) | (ind == len(bins))
        np.putmask(ind, outside, -1)

    ind[s.isnull().values] = -1
    return Categorical(ind, ranges)
print(s.index)

# Series with an explicit index
s2 = Series([100, "python", "scu", "lina"],
            index=["mark", "title", "university", "name"])
print(s2)
print(s2.index)
print(s2["name"])

# Series built from a dict: the keys become the index
sd = {"python": 10000, "go": 8900, "lua": 7200}
s3 = Series(sd)
print(s3)

# "java" is not a key of sd, so that entry is missing
s4 = Series(sd, index=["python", "go", "java"])
print(s4)
print(pd.isnull(s4))
print(s4.isnull())

# Relabel the index in place
s4.index = ["p1", "p2", "p3"]
print(s4)

# DataFrame from a dict of equal-length lists
data = {"name": ["yahoo", "facebook", "google"],
        "marks": [200, 400, 800],
        "price": [9, 3, 7]}
f1 = DataFrame(data)
print(f1)

# Explicit column order
f2 = DataFrame(data, columns=["name", "marks", "price"])
print(f2)

# "debt" is not a key of data; custom row labels
f3 = DataFrame(data, columns=["name", "marks", "price", "debt"],
               index=["a", "b", "c"])
print(f3)
# Lecture 23 - Missing Data import numpy as np from pandas import Series,DataFrame import pandas as pd data = Series(['one', 'two', np.nan, 'four']) data data.isnull() # identify missing values data.dropna() # drop all null values dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]]) dframe clean_dframe = dframe.dropna() # returns rows without any nulls clean_dframe dframe.dropna(how = 'all') # drops rows with all values missing dframe.dropna(axis = 1) # drops columns (will drop all of them) npn = np.nan dframe2 = DataFrame([[1,2,3,npn],[2,npn,5,6],[npn,7,npn,9],[1,npn,npn,npn]]) dframe2 # points without null dframe2.dropna(thresh = 2) # rows with at least 2 data points
# cov computes the covariance
returns.MSFT.corr(returns.IBM)
returns.MSFT.cov(returns.IBM)
returns.corr()
returns.cov()

# DataFrame.corrwith computes the correlation between the columns (or rows)
# and another Series or DataFrame.
# Passing a Series returns a Series of correlation values; passing a
# DataFrame computes the correlations of the columns paired by name.
# Correlation of AAPL with each stock
returns.corrwith(returns.AAPL)
# Correlation of the percent changes with the trading volume
returns.corrwith(volume)

# Handling missing data
package = Series(['pandas', 'numpy', 'matplotlib', np.nan])
print(package.isnull())

# Python's built-in None is treated as NA as well
data = Series([1, np.nan, 2, np.nan, 3])
print(data.dropna())
print(data[data.notnull()])

# For a DataFrame, dropna drops every row containing a missing value by default.
# The frame is created as float: every column receives NaN below, so the
# values are the same as with an int frame, without relying on implicit
# int-to-float upcasting during assignment.
data = DataFrame(np.arange(16, dtype=float).reshape(4, 4))
data[(data / 2 - 1) % 3 == 0] = np.nan
# Set the entire first row to NaN
data.loc[0] = np.nan
print(data.dropna())
# Drop only the rows that are entirely NA
print(data.dropna(how="all"))
# Set the third column to NA
import numpy as np
import pandas as pd
# NOTE(review): `web` is not used in the visible part of this script, and
# pandas.io.data is no longer shipped with current pandas - confirm whether
# this import can be replaced.
import pandas.io.data as web
from pandas import Series, DataFrame
# `nan` exists in every NumPy release; the `NaN` alias was removed in NumPy 2.0.
from numpy import nan as NA

###############################################################
# Detecting missing values: np.nan and None are both reported by isnull()
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
print(string_data)
print('\n')
print(string_data.isnull())
print('\n')
string_data[0] = None
print(string_data.isnull())
print('\n')

###############################################################
# Filtering out missing values: two equivalent spellings
data = Series([1, NA, 3.5, NA, 7])
print(data.dropna())
print('\n')
print(data[data.notnull()])
print('\n')
class BagOfWords:
    '''
    Token counts for one volume, read from a tab-separated "token<TAB>count"
    file, plus the feature vector derived from those counts.
    '''

    def __init__(self, filepath, volID, include_punctuation):
        '''
        Construct a BagOfWords. volID is a string label for the volume.
        include_punctuation is a boolean.

        Lines that do not have exactly two tab-separated fields, or whose
        count is not an integer, are reported on stdout and skipped.
        '''

        self.volID = volID
        self.rawcounts = dict()
        self.totalcount = 0

        # Stream the file instead of materializing every line with readlines().
        with open(filepath, encoding = 'utf-8') as f:
            for line in f:
                line = line.rstrip()
                fields = line.split('\t')
                if len(fields) != 2:
                    print("Illegal line length in " + filepath)
                    print(line)
                    continue

                tokentype, count = fields

                # Only the integer conversion is guarded, so a ValueError
                # raised elsewhere is not misreported as an unparsable count.
                try:
                    intcount = int(count)
                except ValueError:
                    print("Cannot parse count " + count + " as integer.")
                    continue

                if include_punctuation or not all_nonalphanumeric(tokentype):
                    self.rawcounts[tokentype] = intcount
                    self.totalcount += intcount

        self.numrawcounts = len(self.rawcounts)

    def selectfeatures(self, featurelist):
        '''
        A BagOfWords is created with merely a dictionary of raw token counts.
        One could call this a sparse table. It has no entries where features
        are missing.

        We need to organize these as an ordered series of features, which
        includes only the features we have chosen to use in the current model,
        and has zeroes for missing values.
        '''

        self.featurelist = featurelist
        self.numfeatures = len(featurelist)

        # Building a Series from a dictionary with an explicit index yields
        # entries ordered by the keys in 'featurelist', with NaN wherever
        # rawcounts had no corresponding key. Missing words are effectively
        # words with count == 0, so those NaNs are replaced with zero.
        self.features = Series(self.rawcounts, index = featurelist, dtype = 'float64').fillna(0)

    def normalizefrequencies(self):
        '''
        Simply divides all frequencies by the total token count for this volume.

        A volume whose total count is zero keeps its features unchanged
        rather than being turned into NaN/inf by a division by zero.
        '''

        if self.totalcount == 0:
            return

        self.features = self.features / self.totalcount

    def standardizefrequencies(self, standardizer):
        '''
        Convert features to z-scores by centering them on the means and
        scaling them by standard deviation.

        standardizer = an object of class StandardizingVector, presumably
        created either on the corpus that contains this volume, or on the
        training corpus that created the model we are about to use on this
        volume. Only its 'means' and 'stdevs' attributes are read here.
        '''

        assert len(self.features) == len(standardizer.means)

        self.features = (self.features - standardizer.means) / standardizer.stdevs
# Verbose-mode pattern with named groups for the three parts of an address.
# Whitespace inside the pattern is ignored because of re.VERBOSE.
regex = re.compile(r""" (?P<username>[A-Z0-9._%+-]+) @ (?P<domain>[A-Z0-9.-]+) \. (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE | re.VERBOSE )
m = regex.match('*****@*****.**')

# One missing value, to show how the vectorized string methods propagate NaN.
data = {
    'Dave': '*****@*****.**',
    'Steve': '*****@*****.**',
    'Rob': '*****@*****.**',
    'Wes': np.nan
}
data = Series(data)
print(data)
print(data.isnull())
print(data.str.contains('gmail'))

# NOTE: `pattern` is defined earlier in the script (outside this fragment).
print(data.str.findall(pattern, flags=re.IGNORECASE))

# Series.str.match returns booleans in current pandas, so the element access
# below would fail on its result. Take the first findall hit of each string
# instead, which yields the tuple of captured groups (NaN where nothing matched).
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
print(matches)
print(matches.str.get(1))
print(matches.str[0])

# Vectorized slicing of every string.
print(data.str[:5])
def test_isnull(self): ser = Series([0,5.4,3,nan,-0.001]) assert_series_equal(ser.isnull(), Series([False,False,False,True,False])) ser = Series(["hi","",nan]) assert_series_equal(ser.isnull(), Series([False,False,True]))
def test_isnull(self): # GH 13737 s = Series([pd.Period('2011-01', freq='M'), pd.Period('NaT', freq='M')]) tm.assert_series_equal(s.isnull(), Series([False, True])) tm.assert_series_equal(s.notnull(), Series([True, False]))
def main():
    """Walk through basic pandas Series, DataFrame and Index usage.

    Takes no arguments and returns None; every intermediate result is
    printed to stdout.
    """
    # Series
    # A Series is like a 1-d array.
    lst = [4, 7, -5, 3]
    obj = Series(lst)
    print(obj)
    print(obj.values)
    print(obj.index)
    obj2 = Series(copy.deepcopy(lst), index=['d', 'b', 'a', 'c'])
    print(obj2)
    print(obj2.index)

    # numpy ndarray like
    print(obj2['a'])
    obj2['d'] = 6
    print(obj2[['c', 'a', 'd']])
    print(obj2)
    print(obj2[obj2 > 0])
    print(obj2 * 2)
    print(np.exp(obj2))

    # dict like
    print('b' in obj2, 'e' in obj2)

    # dict to Series
    sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000,}
    obj3 = Series(sdata)
    print(obj3)

    # Value is NaN, if index not found.
    states = ['Calfornia', 'Ohio', 'Oregon', 'Texas']
    obj4 = Series(sdata, index=states)
    print(obj4)

    # Pandas function
    print(pd.isnull(obj4))
    print(pd.notnull(obj4))

    # Series method
    print(obj4.isnull())

    # Value is NaN for an index label that exists on only one side
    print(obj3)
    print(obj4)
    print(obj3 + obj4)

    # Index name
    obj4.name = 'population'
    obj4.index.name = 'state'
    print(obj4)

    # Change index
    obj4.index = ['a', 'b', 'c', 'd']
    print(obj4)

    # DataFrame
    # A DataFrame is like a spreadsheet (table)
    # pandas DataFrames are similar to R data frames
    print('', '')
    data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
            'year': [2000, 2001, 2002, 2001, 2002],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9],}
    frame = DataFrame(data)
    print(frame)

    # Order columns
    print(DataFrame(data, columns=['year', 'state', 'pop']))

    # Columns, named indexes
    # A column is all NaN if it is not found in data
    frame2 = DataFrame(data,
                       columns=['year', 'state', 'pop', 'debt'],
                       index=['one', 'two', 'three', 'four', 'five'])
    print(frame2)
    print(frame2.columns)
    print(frame2['state'])
    print(frame2.year)

    # row data (`.ix` was removed from pandas; `.loc` is the label-based lookup)
    print(frame2.loc['three'])

    # Change column value
    frame2['debt'] = 16.5
    print(frame2.debt)
    frame2.debt = np.arange(5.)  # would be an error if the size were not 5
    print(frame2.debt)

    # Column value is NaN, if index not found
    val = Series([-1.2, -1.5, -1.7], index=['one', 'three', 'five'])
    frame2.debt = val
    print(frame2)

    # new column. Attribute syntax (frame2.eastern = ...) cannot be used here
    frame2['eastern'] = frame2.state == 'Ohio'
    print(frame2)

    # del column. Attribute syntax (del frame2.eastern) cannot be used here
    del frame2['eastern']
    print(frame2)

    # nested dict to DataFrame
    # outer keys become the columns, inner keys become the index
    # NOTE(review): the original note says both are sorted; newer pandas may
    # keep insertion order instead - confirm against the installed version.
    pop = {'Ohio':{2002: 3.6, 2000: 1.5, 2001:1.7},
           'Nevada': {2001: 2.4, 2002:2.9},}
    frame3 = DataFrame(pop)
    print(frame3)

    # transpose
    print(frame3.T)

    # The given order is kept when index/columns are specified
    print(DataFrame(pop, index=[2001, 2002, 2000], columns=['Ohio', 'Nevada']))

    # .values returns an ndarray, and an ndarray has a single dtype,
    # so mixed column types are cast to a common type.
    print(frame3.values)
    print(frame3.values.dtype)
    print(frame2.values)
    print(frame2.values.dtype)
    print(frame2['year'].values.dtype)

    # Index Object(IntIndex, DateTimeIndex, PeriodIndex)
    obj = Series(range(3), index='a b c'.split(' '))
    index = obj.index
    print(type(index))
    print(index.dtype)

    # Index is immutable: item assignment is expected to raise.
    # `except Exception` replaces the bare `except:` so KeyboardInterrupt and
    # SystemExit are no longer swallowed; the concrete exception class is
    # printed rather than assumed.
    try:
        index[1] = 'd'
    except Exception:
        print(sys.exc_info())
    print(obj.index is pd.Index(['a', 'b', 'c']))
    index = pd.Index(np.arange(3))
    obj2 = Series(range(3), index=index)
    print(obj2.index is index)

    # Index as a Set
    print(frame3)
    print(type(frame3.columns))
    print('Ohio' in frame3.columns)
    print(2003 in frame3.index)
    print(index.append(pd.Index(['d'])))
    print(index)  # unchanged: append returned a new Index, index is immutable
# # Functions isnull and notnull are used in pandas to detect missing data. # In[40]: pd.isnull(obj4) # In[41]: pd.notnull(obj4) # In[42]: obj4.isnull() # Series automatically aligns differently-indexed data in arithmetric operations: # In[43]: obj3 # In[44]: obj4 # In[45]:
# 如果某个索引值在字典中没有,那么值就为NaN ## 数组形式 obj.values ## 索引对象 obj2.index obj2['a'] obj2[['c','a','d']] obj2[obj2>0] ## 运算 obj2*2 np.exp(obj2) 'b' in obj2 ## 缺失值 pd.isnull(obj4) pd.notnull(obj4) obj4.isnull() ## 自动对齐,对于索引的元素运算 obj3 + obj4 ## Series对象本身和索引都有一个name属性 obj4.name = 'population' obj4.index.name = 'state' ## DataFrame data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'], 'year':[2000,2001,2002,2001,2002], 'pop':[1.5,1.7,3.6,2.4,2.9]} frame = DataFrame(data) pop = {'Nevada':{2001:2.4,2002:2.9}, 'Ohio':{2000:1.5,2001:1.7,2002:3.6}} frame3 = DataFrame(pop) # 指定列就按照指定列排序
# Three float Series labelled with the first ten letters; `a` uses the labels
# in reverse order. (`repeat`, `arange` and `letters` come from imports made
# earlier in the script, outside this fragment.)
a=Series(repeat(0.0,10),index=list(letters[:10])[::-1])
b=Series(arange(0,10,dtype=np.float64),index=list(letters[:10]))
c=Series(arange(0,10,dtype=np.float64),index=list(letters[:10]))

# The index holds strings, so integer keys here are meant as positions.
# Plain `a[1] = ...` relies on pandas' deprecated positional fallback (newer
# versions treat the integer as a label); `.iloc` states the intent explicitly.
a.iloc[1] = np.nan
a.iloc[3] = np.nan
a.iloc[5] = np.nan
c.iloc[3:8] = np.nan
#%%
a
#%%
b
#%%
c
#%% Null check
a.isnull()
#%%
a.notnull()
#%% Drop the null values from the series
a.dropna()
#%%
a.fillna(-1)
#%%
a.fillna(b)
# Original author's warning: be very careful here - although fillna is given a
# series with the same labels in a different order, it was observed to be
# treated like a list, i.e. filled by position instead of aligned on the index.
# NOTE(review): the pandas documentation describes a Series `value` as being
# matched by index; verify this against the installed pandas version.
#%% pad / ffill: fill with the previous value
# (`fillna(method=...)` is deprecated; `ffill()` / `bfill()` are the direct forms)
c.ffill()
#%% backfill / bfill: fill with the next value
c.bfill()
#%% Limit the fill range
class WordVector:
    '''
    A WordVector is just like a BagOfWords, except that it has
    a simpler constructor - it just accepts a list of tokens.

    In Java, you could write multiple constructors for one class.
    In Python, I'd have to rewrite the constructor inelegantly to
    make these a single class. So. Two classes.
    '''

    def __init__(self, listofwords):
        '''
        Construct a WordVector from a list.

        Any iterable of hashable tokens works; it is traversed exactly once.
        '''

        self.rawcounts = dict()
        self.totalcount = 0

        for word in listofwords:
            # dict.get replaces the explicit membership test and else-branch.
            self.rawcounts[word] = self.rawcounts.get(word, 0) + 1
            self.totalcount += 1

        self.numrawcounts = len(self.rawcounts)

    def selectfeatures(self, featurelist):
        '''
        A WordVector is created with merely a dictionary of raw token counts.
        One could call this a sparse table. It has no entries where features
        are missing.

        We need to organize these as an ordered series of features, which
        includes only the features we have chosen to use in the current model,
        and has zeroes for missing values.
        '''

        self.featurelist = featurelist
        self.numfeatures = len(featurelist)

        # Building a Series from a dictionary with an explicit index yields
        # entries ordered by the keys in 'featurelist', with NaN wherever
        # rawcounts had no corresponding key. Missing words are effectively
        # words with count == 0, so those NaNs are replaced with zero.
        self.features = Series(self.rawcounts, index = featurelist, dtype = 'float64').fillna(0)

    def normalizefrequencies(self):
        '''
        Simply divides all frequencies by the total token count for this volume.

        An empty token list (total count of zero) keeps its features
        unchanged rather than being turned into NaN by a division by zero.
        '''

        if self.totalcount == 0:
            return

        self.features = self.features / self.totalcount

    def standardizefrequencies(self, standardizer):
        '''
        Convert features to z-scores by centering them on the means and
        scaling them by standard deviation.

        standardizer = an object of class StandardizingVector, presumably
        created either on the corpus that contains this volume, or on the
        training corpus that created the model we are about to use on this
        volume. Only its 'means' and 'stdevs' attributes are read here.
        '''

        assert len(self.features) == len(standardizer.means)

        self.features = (self.features - standardizer.means) / standardizer.stdevs
# NOTE: `obj2` is created earlier in the script (outside this fragment).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(obj2*2)
print(obj2-2)
print(np.exp(obj2))

# Create a Series directly from a dict.
print("#=================直接通过字典来创建Series===========================#")
sdata = {'ohio':35000,'Texes':71000,'Oregen':16000,'Utah':5000}
obj3 = Series(sdata)
print(obj3)
print("#-----------------------------#")
# 'California' is not a key of sdata, so that entry comes out as NaN.
stats = ['California','ohio','Texes']
obj4 = Series(sdata,index=stats)
print(obj4)

# Check for missing values.
print("#=================判断是否缺失===========================#")
print(pd.isnull(obj4))
print(pd.notnull(obj4))
print(obj4.isnull())
print("#============================================#")
obj4.name = "population"
obj4.index.name = 'state'
print(obj4)

print("#==================DataFrame==========================#")
data = {'state':['Ohio','Ohio','Nevada','Nevada'],
        'year':['20','201','22','23'],
        'pop':[1.5,1.4,1.4,1.7]
        }
frame = DataFrame(data)
print(frame)

# Specify the column order.
print("#===================指定列表顺序=========================#")
frame1 = DataFrame(data,columns=['year','state','pop'])
# Interactive-session transcript: bare expressions are there to be displayed.
# NOTE: `mask` and `value_counts` are defined earlier (outside this fragment).
mask

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
data
data.Qu1
data.Qu1.value_counts  # bound method only, not a call
data.Qu1.value_counts()

# Count how often each value occurs per column; values absent from a column
# come back as NaN and are replaced by 0.
# (The stray misspelled name `resutl`, which raised NameError, was removed.)
result = data.apply(value_counts).fillna(0)
result

## Handling Missing Data
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull  # bound method only, not a call
string_data.isnull()
string_data[0] = None
string_data.isnull()
string_data

# Filtering out Missing Data
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()
data[data.notnull()]

data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
data
cleaned = data.dropna()
cleaned
cleaned = data.dropna()  # it drops every row which contains at least one NA value
data.dropna(how='all')  # it only drops rows in which every value is NA
__author__ = 'ryu' import numpy as np import pandas as pd from numpy import nan as NA from pandas import Series, DataFrame string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado']) string_data[0] = None print string_data.isnull() string_data.fillna(0) data = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]]) data.dropna() data.dropna(axis=1, how="all") df = DataFrame(np.random.randn(7, 3)) df.ix[:4, 1] = NA df.dropna(thresh=2) # At least how many non NA values df.fillna({1: 0.5, 3: -1}) df.fillna(method="bfill", limit=2) data = Series(np.random.randn(10), index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'], [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]]) print data.index print data['b'], data['b':'c'], data.ix[['b', 'd']], data[:, 2] print data.unstack()
s
Out[24]:
the_index
0    4
1    7
2   -5
3    3
Name: a_Series, dtype: int64
```

可以通过字典创建Series

pandas中缺失(missing)或NA值用`NaN`(即"非数字",not a number)表示。

pandas的`isnull`和`notnull`函数可以用来检测缺失数据:`pd.isnull(Series)`、`pd.notnull(Series)` 返回values为Boolean值的Series。

也可以直接这样 `Series.isnull()`

Series间进行算术运算时,会自动对齐不同索引的数据,是因为使用了键值对的数据结构。

突然发现喵酱的书架的博客和书是一样的

### DataFrame

表格型的数据结构

含有一组**有序**的列

```
data = {'state':['Ohino','Ohino','Ohino','Nevada','Nevada'],
        'year':[2000,2001,2002,2001,2002],
        'pop':[1.5,1.7,3.6,2.4,2.9]}
df = DataFrame(data)
```

取列:标记或属性
print('\n') ############################################################### states = ['California', 'Ohio', 'Oregon', 'Texas'] obj4 = Series(sdata, index=states) print(obj4) print('\n') print(pd.isnull(obj4)) print(pd.notnull(obj4)) print('\n') print(obj4.isnull()) print('\n') ############################################################### print(obj3 + obj4) print('\n') obj4.name = 'population' obj4.index.name = 'state' print(obj4)
# 用dic创建Series dic = { 'name':'ytx', 'age':15, 'sex':'man' } s = Series(dic) # print s # 传入一个list当键 l = ['name','sex','grade'] s1 = Series(dic,l) # print s1 # 检查是否为空值 r = s1.isnull() r1 = s1.notnull() # print r # print r1 # 相加使得相同索引的数据相加 s2 = s + s1 # print s2 # 给Series赋值名字 s.name = 'series_name' s.index.name = 'series_index_name' # print s
def test_isnull(self): ser = Series([0, 5.4, 3, nan, -0.001]) np.array_equal(ser.isnull(), Series([False, False, False, True, False]).values) ser = Series(["hi", "", nan]) np.array_equal(ser.isnull(), Series([False, False, True]).values)