Example #1
def pd_02():
    string_data=Series(['a','b','c',np.nan,'e',None])
    print string_data
    print string_data.isnull()
    print string_data.dropna()
    df=DataFrame(np.random.randn(7,3))
    df.ix[:4,1]=np.nan
    df.ix[:2,2]=np.nan
    print df
    print df.dropna()
    print df.fillna(0)
    print df.fillna({1:0.5,3:-1})
    print df
    df.fillna(0,inplace=True)
    print df
Example #2
def is_boolean(series: pd.Series, series_description: dict) -> bool:
    """Is the series boolean type?

    Args:
        series: Series
        series_description: Series description

    Returns:
        True if the series is boolean type in the broad sense (e.g. including yes/no, NaNs allowed).
    """
    keys = series_description["value_counts_without_nan"].keys()
    if pd.api.types.is_bool_dtype(keys):
        return True
    elif (1 <= series_description["distinct_count_without_nan"] <= 2
          and pd.api.types.is_numeric_dtype(series)
          and series[~series.isnull()].between(0, 1).all()):
        return True
    elif 1 <= series_description["distinct_count_without_nan"] <= 4:
        unique_values = set([str(value).lower() for value in keys.values])
        accepted_combinations = [
            ["y", "n"],
            ["yes", "no"],
            ["true", "false"],
            ["t", "f"],
        ]

        if len(unique_values) == 2 and any(
            [unique_values == set(bools) for bools in accepted_combinations]):
            return True

    return False
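A minimal usage sketch (my own, not from the original source): it assumes is_boolean above is importable and shows the two description keys it expects.

import numpy as np
import pandas as pd

s = pd.Series(["yes", "no", np.nan, "yes"])
vc = s.value_counts(dropna=True)  # counts without NaN, as a profiler would compute them
series_description = {
    "value_counts_without_nan": vc,
    "distinct_count_without_nan": len(vc),
}
print(is_boolean(s, series_description))  # True: {"yes", "no"} is an accepted combination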
Example #3
def check_if_datetime_as_object_feature(X: Series) -> bool:
    type_family = get_type_family_raw(X.dtype)
    # TODO: Check if low numeric numbers, could be categorical encoding!
    # TODO: If low numeric, potentially it is just numeric instead of date
    if X.isnull().all():
        return False
    if type_family != 'object':  # TODO: seconds from epoch support
        return False
    try:
        # TODO: pd.Series(['20170204','20170205','20170206']) is incorrectly not detected as datetime_as_object
        #  But we don't want pd.Series(['184','822828','20170206']) to be detected as datetime_as_object
        #  Need some smart logic (check min/max values?, check last 2 values don't go >31?)
        pd.to_numeric(X)
    except:
        try:
            if len(X) > 500:
                # Sample to speed-up type inference
                X = X.sample(n=500, random_state=0)
            result = pd.to_datetime(X, errors='coerce')
            if result.isnull().mean() > 0.8:  # If over 80% of the rows are NaN
                return False
            return True
        except:
            return False
    else:
        return False
Example #4
    def describe_url_1d(series: pd.Series, series_description: dict) -> dict:
        """Describe a url series.

        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.

        Returns:
            A dict containing calculated series description values.
        """
        # Make sure we deal with strings (Issue #100)
        series = series[~series.isnull()].astype(str)

        stats = {}

        # Create separate columns for each URL part
        keys = ["scheme", "netloc", "path", "query", "fragment"]
        url_parts = dict(zip(keys, zip(*series.map(urlsplit))))
        for name, part in url_parts.items():
            stats[f"{name.lower()}_counts"] = pd.Series(
                part, name=name).value_counts()

        # Only run if at least 1 non-missing value
        value_counts = series_description["value_counts_without_nan"]

        stats["top"] = value_counts.index[0]
        stats["freq"] = value_counts.iloc[0]

        return stats
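A hedged usage sketch (not from the original library), assuming describe_url_1d above is available as a plain module-level function and that the description dict already carries "value_counts_without_nan".

import pandas as pd
from urllib.parse import urlsplit  # used inside describe_url_1d

urls = pd.Series(["https://example.com/a?x=1", "https://example.org/b", "https://example.com/a?x=1"])
description = {"value_counts_without_nan": urls.value_counts(dropna=True)}
stats = describe_url_1d(urls, description)
print(stats["netloc_counts"])       # example.com: 2, example.org: 1
print(stats["top"], stats["freq"])  # most frequent URL and its count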
Example #5
    def init_row(self,
                 rule: Rule,
                 results: pd.Series,
                 conn: Connector,
                 context: Dict = None):
        """
        Count metrics we want to measure using pd.Series api and set them to quality check object.
        """
        if results.isnull().any():
            raise ValueError(
                "In results of rule.apply can't be any Null values.")

        # todo - add to doc
        self.task_ts = context["task_ts"]
        self.attribute = rule.attribute
        self.rule_name = rule.name
        self.rule_description = rule.description

        self.total_records = results.shape[0]
        self.failed = results[results == False].shape[0]
        self.passed = results[results == True].shape[0]

        self.set_medians(conn)

        self.time_filter = rule.time_filter
        self.failed_percentage = self._perc(self.failed, self.total_records)
        self.passed_percentage = self._perc(self.passed, self.total_records)
        self.status = "invalid" if self.failed > 0 else "valid"
Example #6
def infer_pd_series_spark_type(
        pser: pd.Series,
        dtype: Dtype,
        prefer_timestamp_ntz: bool = False) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param pser: :class:`pandas.Series` to be inferred
    :param dtype: the Series' dtype
    :param prefer_timestamp_ntz: if true, infers datetime without timezone as
        TimestampNTZType type. If false, infers it as TimestampType.
    :return: the inferred Spark data type
    """
    if dtype == np.dtype("object"):
        if len(pser) == 0 or pser.isnull().all():
            return types.NullType()
        elif hasattr(pser.iloc[0], "__UDT__"):
            return pser.iloc[0].__UDT__
        else:
            return from_arrow_type(
                pa.Array.from_pandas(pser).type, prefer_timestamp_ntz)
    elif isinstance(dtype, CategoricalDtype):
        if isinstance(pser.dtype, CategoricalDtype):
            return as_spark_type(pser.cat.codes.dtype,
                                 prefer_timestamp_ntz=prefer_timestamp_ntz)
        else:
            # `pser` must already be converted to codes.
            return as_spark_type(pser.dtype,
                                 prefer_timestamp_ntz=prefer_timestamp_ntz)
    else:
        return as_spark_type(dtype, prefer_timestamp_ntz=prefer_timestamp_ntz)
Example #7
 def test_isnull(self):
     # GH 13737
     s = Series(
         [pd.Period('2011-01', freq='M'),
          pd.Period('NaT', freq='M')])
     tm.assert_series_equal(s.isnull(), Series([False, True]))
     tm.assert_series_equal(s.notnull(), Series([True, False]))
Example #8
    def get_errors(self, series: pd.Series, column: 'column.Column'):

        errors = []

        # Calculate which columns are valid using the child class's validate function, skipping empty entries if the
        # column specifies to do so
        simple_validation = ~self.validate(series)
        if column.allow_empty:
            # Failing results are those that are not empty, and fail the validation
            # explicitly check to make sure the series isn't a category because issubdtype will FAIL if it is
            if is_categorical_dtype(series) or is_numeric_dtype(series):
                validated = ~series.isnull() & simple_validation
            else:
                validated = (series.str.len() > 0) & simple_validation

        else:
            validated = simple_validation

        # Cut down the original series to only ones that failed the validation
        indices = series.index[validated]

        # Use these indices to find the failing items. Also print the index which is probably a row number
        for i in indices:
            element = series[i]
            errors.append(ValidationWarning(
                message=self.message,
                value=element,
                row=i,
                column=series.name
            ))

        return errors
Example #9
    def describe_path_1d(series: pd.Series, series_description: dict) -> dict:
        """Describe a path series.

        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.

        Returns:
            A dict containing calculated series description values.
        """
        series_description.update(describe_categorical_1d(series, series_description))

        # Make sure we deal with strings (Issue #100)
        if "p_series" not in series_description:
            series = series[~series.isnull()].astype(str)
            series = series.map(Path)
        else:
            series = series_description["p_series"]
            del series_description["p_series"]

        stats = path_summary(series)

        # Only run if at least 1 non-missing value
        value_counts = series_description["value_counts_without_nan"]

        stats["top"] = value_counts.index[0]
        stats["freq"] = value_counts.iloc[0]

        return stats
Example #10
def main():
    """
    Handling of missing (NA) values
    """

    string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    print string_data
    print string_data.isnull()
    string_data[0] = None
    print string_data.isnull()
    print None is np.nan, None == np.nan # not same

    # Exclude N/A
    print '',''
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    print data.dropna()
    print data[data.notnull()]

    data = DataFrame([
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.]
    ])
    cleaned = data.dropna() # row that all value is not NA
    print data
    print cleaned
    print data.dropna(how='all')
    data[4] = None
    print data.dropna(axis=1, how='all')
    print data.dropna(thresh=2) # non NA is more 2

    # Fill NA
    print '',''
    print data.fillna(0)
    print data.fillna({1: 0.5, 2: -1})
    _ = data.fillna(0, inplace=True)
    print data
    print '',''
    df = DataFrame(np.arange(18).reshape((6, 3)))
    df.ix[2:, 1] = NA; df.ix[4:, 2] = NA
    print df
    print df.fillna(method='ffill')
    print df.fillna(method='ffill', limit=2)
    data = Series([1., NA, 3.5, NA, 7])
    print data.fillna(data.mean())
Example #11
def bin_variable(var: pd.Series,
                 target: pd.Series = None,
                 points: list = None,
                 min_size: float = 5,
                 rnd: int = 2,
                 return_points: bool = False):
    """
    Make binning on numeric variable, return categorized variable

    Keyword arguments:
        var (pd.Series) -- Numeric variable
        target (pd.Series) -- Target binary variable
        points (list) -- List of cut-points (default None)
        min_size (float) -- minimum size of group in percent
        rnd -- Round level for variable values (default 2)

    Output:
        List  - [cut-points (list), categorized variable (pd.Series)]
            or
        Categorized variable (pd.Series)
    """
    if points is None:
        if target is None:
            points = points_calulation(var=var, min_size=min_size, rnd=rnd)
        else:
            points = points_calculation_tree(var=var,
                                             target=target,
                                             min_size=min_size,
                                             rnd=rnd)

    if points == [-np.inf, np.inf]:
        var = pd.cut(var, bins=points, labels=['Not Missing'])
    else:
        points = list([
            -np.inf,
        ] if points[0] != -np.inf else []) + points + list([
            np.inf,
        ] if points[-1] != np.inf else [])
        var = pd.cut(var, bins=points, include_lowest=False)

    categories = [str(x) for x in var.cat.categories]
    if var.isnull().sum() > 0:
        var = var.cat.add_categories('Missing')
        var.fillna('Missing', inplace=True)

    var = var.astype(str)
    if len(categories) > 1:
        var[var == categories[0]] = '[<={}]'.format(points[1])
        var[var == categories[-1]] = '(>{}]'.format(points[-2])

        categories[0] = '[<={}]'.format(points[1])
        categories[-1] = '(>{}]'.format(points[-2])

    if (var == 'Missing').sum() > 0:
        categories = [
            'Missing',
        ] + categories
    var = pd.Categorical(var, categories=categories, ordered=True)
    return (points, var) if return_points else var
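A small hedged example of calling bin_variable above with explicit cut-points, so the points_calulation helpers (not shown here) are never invoked.

import numpy as np
import pandas as pd

values = pd.Series([1.2, 3.4, np.nan, 7.8, 0.5, 9.9])
binned = bin_variable(values, points=[2.0, 5.0])
print(binned)
# Expected ordered categories: ['Missing', '[<=2.0]', '(2.0, 5.0]', '(>5.0]']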
Example #12
def normalize_edgeR(X: pd.DataFrame,
                    method: str = "tmm",
                    length: pd.Series = None,
                    p=0.75,
                    **kws):
    """
    X: pd.DataFrame where rows are samples and columns are genes

    methods: ("tmm","rle","upperquartile")
        "TMM" is the weighted trimmed mean of M-values (to the reference) proposed by Robinson and Oshlack (2010), where the weights are from the delta method on Binomial data.
        "RLE" is the scaling factor method proposed by Anders and Huber (2010). We call it "relative log expression", as median library is calculated from the geometric mean of all columns and the median ratio of each sample to the median library is taken as the scale factor.
        "upperquartile" is the upper-quartile normalization method of Bullard et al (2010), in which the scale factors are calculated from the 75% quantile of the counts for each library, after removing genes which are zero in all libraries. This idea is generalized here to allow scaling by any quantile of the distributions.
        "GeTMM" Gene length corrected trimmed mean of M-values. Must include gene lengths. https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2246-7#MOESM4
    edgeR: http://bioconductor.org/packages/release/bioc/html/edgeR.html

    """
    edgeR = R_package_retrieve("edgeR")

    assert isinstance(X, pd.DataFrame), "type(df_counts) must be pd.DataFrame"
    # Method formatting
    assert_acceptable_arguments(
        query=[method.lower()],
        target={"tmm", "rle", "upperquartile", "getmm"})
    if method in {"tmm", "rle"}:
        method = method.upper()

    # Check RPK for GeTMM
    if method.lower() == "getmm":
        assert length is not None, "If GeTMM is chosen as the method then `length` cannot be None.  It must be either a pd.Series of sequences or sequence lengths"
        length = length[X.columns]
        assert length.isnull().sum(
        ) == 0, "Not all of the genes in `X.columns` are in `length.index`.  Either use a different normalization or get the missing sequence lengths"
        # If sequences are given then convert to length (assumes CDS and no introns)
        if pd.api.types.is_string_dtype(length):
            length = length.map(len)
        X = X / length
        method = "TMM"

    # Gene axis as rows
    X = X.T
    # Labels
    idx_attrs = X.index
    idx_obsvs = X.columns

    # Convert pd.DataFrame to R-object
    rX = pandas_to_rpy2(X)
    d = edgeR.DGEList(counts=rX)

    # Calculate NormFactors
    normalization_factors = edgeR.calcNormFactors(d, method=method, p=p, **kws)

    # Normalized counts
    normalized_counts = edgeR.cpm(normalization_factors)
    X_tmm = pd.DataFrame(rpy2_to_pandas(normalized_counts),
                         index=idx_attrs,
                         columns=idx_obsvs).T

    return X_tmm
Example #13
 def test_isnull_for_inf(self):
     s = Series(['a', np.inf, np.nan, 1.0])
     with pd.option_context('mode.use_inf_as_null', True):
         r = s.isnull()
         dr = s.dropna()
     e = Series([False, True, True, False])
     de = Series(['a', 1.0], index=[0, 3])
     tm.assert_series_equal(r, e)
     tm.assert_series_equal(dr, de)
Example #14
def validate_series(series: pd.Series) -> None:
    """
    Ensure `series` is "valid" as per Workbench standards, or raise ValueError.

    "Valid" means:

    * If dtype is `object` or `categorical`, all values are `str`, `np.nan` or
      `None`
    * Otherwise, series must be numeric (but not "nullable integer") or
      datetime (without timezone).
    """
    dtype = series.dtype
    if dtype in SupportedNumberDtypes:
        infinities = series.isin([np.inf, -np.inf])
        if infinities.any():
            idx = series[infinities].index[0]
            raise ValueError(("invalid value %r in column %r, row %r "
                              "(infinity is not supported)") %
                             (series[idx], series.name, idx))
        return
    elif is_datetime64_dtype(dtype):  # rejects datetime64ns
        return
    elif dtype == object:
        nonstr = (series[~series.isnull()].map(type) != str)
        if nonstr.any():
            raise ValueError(
                "invalid value %r in column %r (object values must all be str)"
                % (series.iloc[nonstr[nonstr == True].index[0]], series.name))
    elif hasattr(series, 'cat'):
        categories = series.cat.categories
        nonstr = (categories.map(type) != str)
        if nonstr.any():
            raise ValueError(
                "invalid value %r in column %r (categories must all be str)" %
                (categories[np.flatnonzero(nonstr)[0]], series.name))

        # Detect unused categories: they waste space, and since the module
        # author need only .remove_unused_categories() there isn't much reason
        # to allow them (other than the fact this check might be slow?).
        codes = np.unique(series.cat.codes)  # retval is sorted
        if len(codes) and codes[0] == -1:
            codes = codes[1:]
        # At this point, if all categories are used, `codes` is an Array of
        # [0, 1, ..., len(categories)-1]. Otherwise, there's a "hole" somewhere
        # in `codes` (it may be at the end).
        if len(codes) != len(categories):
            # There are unused categories. That means an index into
            # `categories` is not in `codes`. Raise it.
            for i, category in enumerate(categories):
                if i >= len(codes) or codes[i] != i:
                    raise ValueError(('unused category %r in column %r '
                                      '(all categories must be used)') %
                                     (category, series.name))
            assert False  # the for-loop is guaranteed to raise, in theory
    else:
        raise ValueError('unsupported dtype %r in column %r' %
                         (dtype, series.name))
Example #15
    def __call__(self, value: Series) -> bool:
        self.value = value

        has_null = True

        if not self.nullable:
            return not value.isnull().values.any()

        return has_null
Example #16
def full_join_except_id(row: pd.Series):
    single_text_blob = []
    for index, row_value in row[1:].items():
        if not row.isnull()[index]:
            single_text_blob.append(
                str(row_value)
            )  # make sure non-string values (e.g. prices as floats) get converted

    return " ".join(single_text_blob)
Example #18
def create_site_specific_splits(
    site: pd.Series,
    proportions: Dict[str, int],
    random_state: Optional[Union[int, np.random.mtrand.RandomState]] = 989,
):
    """Splits sites into distinct groups whose sizes roughly matching the given proportions. Null
    sites are randomly assigned to groups using the provided proportions.

    Args:
        site (pd.Series): A series of sites, one element per observation,
        proportions (dict): A dict whose keys are the resulting groups and whose values are the
            rough proportion of data in each group.
        random_state (int): Seed for random split of null sites.

    Example:
        Split data into groups where each site is in one and only one group with roughly 50-25-25
        train-val-holdout proportions.

        >>> create_site_specific_splits(site, proportions={"train": 2, "val": 1, "holdout": 1})

    Returns:
        pd.Series: A series containing the resulting split, one element per observation.

    """

    assignments = {}
    sites = site.value_counts(dropna=True).sort_values(ascending=False).index
    n_subgroups = sum(proportions.values())
    for i, subset in enumerate(
            roundrobin(*([subset] * proportions[subset]
                         for subset in proportions))):
        for group in sites[i::n_subgroups]:
            assignments[group] = subset

    # Divide null sites among the groups
    null_sites = site.isnull()
    if null_sites.sum() > 0:
        logger.debug(
            f"{null_sites.sum():,} null sites randomly assigned to groups.")
        null_groups = []
        for group, group_proportion in proportions.items():
            null_group = f"{group}-{uuid4()}"
            null_groups.append(null_group)
            assignments[null_group] = group

        rng = (np.random.RandomState(random_state) if isinstance(
            random_state, int) else random_state)
        site = site.copy()
        site.loc[null_sites] = rng.choice(
            null_groups,
            p=np.asarray(list(proportions.values())) /
            sum(proportions.values()),
            size=null_sites.sum(),
            replace=True,
        )

    return site.replace(assignments)
Example #19
    def _check_inputs(
        s_test_pred: pd.Series,
        s_calib_pred: pd.Series,
        s_calib_actual: pd.Series,
    ) -> None:
        """ Check that inputs have valid names and could be proabilities """

        if (
            s_test_pred.min() < 0
            or s_test_pred.max() > 1
            or s_calib_pred.min() < 0
            or s_calib_pred.max() > 1
        ):
            raise RuntimeError(
                "Probabilities outside (0,1) range were passed to calibrate"
            )

        if not s_calib_pred.name == s_test_pred.name:
            warnings.warn(f"{s_calib_pred.name} != {s_test_pred.name}")
        if s_test_pred.isnull().sum() > 0:
            _log_missing_indices(s_test_pred)
            raise RuntimeError("Missing values in s_test_pred")
        if s_calib_pred.isnull().sum() > 0:
            _log_missing_indices(s_calib_pred)
            raise RuntimeError("Missing values in s_calib_pred")
        if s_calib_actual.isnull().sum() > 0:
            _log_missing_indices(s_calib_actual)
            raise RuntimeError("Missing values in s_calib_actual")

        if (
            not len(s_calib_pred) == len(s_calib_actual)
            or len(s_calib_pred.index.difference(s_calib_actual.index)) > 0
        ):
            raise RuntimeError(
                f"len(s_calib_pred): {len(s_calib_pred)} "
                f"len(s_calib_actual): {len(s_calib_actual)} "
                f"index diff: "
                f"{s_calib_pred.index.difference(s_calib_actual.index)}"
                f"s_calib_pred.head() : {s_calib_pred.head()}"
                f"s_calib_pred.tail() : {s_calib_pred.tail()}"
                f"s_calib_actual.head() : {s_calib_actual.head()}"
                f"s_calib_actual.tail() : {s_calib_actual.tail()}"
            )
Example #20
 def parse(self, column_data: pd.Series):
     """Parse the column data and fill in the necessary properties"""
     assert not self._parsed, 'Cannot call parse twice. ' \
                              'Use "col_prop.clone()" and parse again.'
     self._parsed = True
     assert isinstance(column_data, pd.Series), \
         'Currently, the input column data must be a Pandas Series.'
     self._num_sample = len(column_data)
     self._num_missing_samples = column_data.isnull().sum().sum().item()
     self._name = column_data.name
Example #21
def slide_20():
    import re
    data = {'Dave': '*****@*****.**',
            'Steve': '*****@*****.**',
            'Rob': '*****@*****.**',
            'Wes': np.nan}
    data = Series(data)
    print data
    print data.isnull()
    print data.str.contains('gmail')
    pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
    print data.str.findall(pattern, flags=re.IGNORECASE)

    matches = data.str.match(pattern, flags=re.IGNORECASE)
    print matches
    print matches.str.get(1)
    print matches.str[0]
    print data
    print data.str[:5]
Example #22
def is_url(series: pd.Series, counts: dict) -> bool:
    if counts["distinct_count_without_nan"] > 0:
        try:
            result = series[~series.isnull()].astype(str).apply(urlparse)
            return result.apply(
                lambda x: all([x.scheme, x.netloc, x.path])).all()
        except ValueError:
            return False
    else:
        return False
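A hedged usage sketch for is_url above, building the small counts dict it expects.

import pandas as pd
from urllib.parse import urlparse  # used inside is_url

s = pd.Series(["https://example.com/page", "http://example.org/x"])
counts = {"distinct_count_without_nan": s.nunique(dropna=True)}
print(is_url(s, counts))  # True: every value has a scheme, netloc and path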
Example #23
def practice_four():
    string_data = Series(['aa', 'ar', np.nan, 'av'])
    string_data.isnull()  # check which values are NA
    string_data[0] = None
    string_data.isnull()  # the built-in None value is also treated as NA
    '''
    NA handling methods
        dropna      filter axis labels based on whether their values contain missing data
        fillna      fill missing data with a specified value or an interpolation method (ffill or bfill)
        isnull      return a boolean object indicating which values are missing (NA)
        notnull     the negation of isnull
    '''

    # Filter out missing data
    from numpy import nan as NA
    data = Series([1, NA, 3.5, NA, 7])
    data.dropna()  # returns a Series containing only the non-null data and their index values
    data[data.notnull()]  # same as above
    data = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5,
                                                                  3.]])
    data.dropna()  # by default, drop any row containing a missing value
    data.dropna(how='all')  # only drop rows that are all NA
    data.dropna(axis=1, how='all')  # only drop columns that are all NA
    df = DataFrame(np.random.randn(7, 3))
    df.ix[:4, 1] = NA
    df.ix[:2, 2] = NA
    df.dropna(thresh=3)

    # Fill in missing data
    df.fillna(0)  # fill with 0
    df.fillna({1: 0.5, 3: -1})  # fill different columns with different values
    df.fillna(0, inplace=True)  # modifies and returns a reference to the filled object
    '''
    fillna arguments
        value       scalar value or dict object used to fill missing values
        method      interpolation method
        axis        axis to fill on, default 0
        inplace     modify the calling object without producing a copy
        limit       maximum number of consecutive values to fill
    '''

    pass
Example #24
    def get_bin_indices(self, series: pd.Series, dtype, bins) -> pd.Series:

        bin_index = {}
        for bin, range_config in bins.items():
            bin_index[bin] = self.condition_index(range_config, dtype, series)

        values_index = functools.reduce(lambda a, b: a | b, bin_index.values())
        null_index = series.isnull()
        other_index = series.index & ~(null_index | values_index)

        return bin_index, null_index, other_index, values_index
Example #25
def degree_mean(data: pd.Series) -> float:
    """
    Return the mean of a list of degrees
    """

    if data.isnull().all():
        return np.NaN

    rads = np.deg2rad(data)
    sums = np.arctan2(np.sum(np.sin(rads)), np.sum(np.cos(rads)))
    return (np.rad2deg(sums) + 360) % 360
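A quick check of degree_mean above (my own example): it computes the circular mean, so directions are averaged on the circle rather than arithmetically.

import numpy as np
import pandas as pd

print(degree_mean(pd.Series([90.0, 180.0])))     # 135.0 (up to floating-point rounding)
print(degree_mean(pd.Series([np.nan, np.nan])))  # nan when every value is missing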
Example #26
    def transform(self, X, y=None):

        pred_age_train = X[['SibSp', 'Parch']][Series.isnull(X['Age'])]

        pred_age = pred_age_train.apply(self.predict_age, axis=1)
        # assignment here aligns values by row index (label)
        X['Age'][Series.notnull(X['Age'])] = X['Age'][Series.notnull(X['Age'])].map(lambda age: 1 if age < 16 else 0)
        X['Age'][np.isnan(X['Age'])] = pred_age

        X.rename(columns={'Age':'Child'}, inplace=True)

        return X
Example #27
    def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y: pd.Series = None, **fit_params) \
            -> Union[pd.DataFrame, np.ndarray]:
        """
        Fit models for each fold, then transform X

        Args:
            X:
                Data
            y:
                Target
            fit_params:
                Additional parameters passed to models

        Returns:
            Transformed version of X. It will be pd.DataFrame If X is `pd.DataFrame` and return_same_type is True.
        """
        assert len(X) == len(y)
        self._pre_train(y)

        is_pandas = isinstance(X, pd.DataFrame)
        X = convert_input(X)
        y = convert_input_vector(y, X.index)

        if y.isnull().sum() > 0:
            # y == null is regarded as test data
            X_ = X.copy()
            X_.loc[~y.isnull(), :] = self._fit_train(X[~y.isnull()], y[~y.isnull()], **fit_params)
            X_.loc[y.isnull(), :] = self._fit_train(X[y.isnull()], None, **fit_params)
        else:
            X_ = self._fit_train(X, y, **fit_params)

        X_ = self._post_transform(self._post_fit(X_, y))

        return X_ if self.return_same_type and is_pandas else X_.values
Example #28
def _nullable_test(col: pd.Series, meta_col: dict) -> dict:

    col_name = meta_col.get("name")

    test_inputs = {
        "column": col_name,
    }

    res_dict = _result_dict("nullable", test_inputs)

    col_oob = col.isnull()

    return _fill_res_dict(col, col_oob, res_dict)
Example #29
def missing_val():
    missing = np.nan
    series_obj = Series(['row1', 'row2', missing, 'row4'])
    print(series_obj.isnull())
    np.random.seed(25)
    DF_rand = DataFrame(np.random.randn(36).reshape(6, 6))
    DF_rand.iloc[3:6, 0] = missing
    DF_rand.iloc[1:4, 5] = missing
    print(DF_rand.fillna(0))
    filled_DF = DF_rand.fillna({0: .1, 5: 1.25})
    ffill = DF_rand.fillna(method='ffill')
    print(DF_rand.isnull().sum())
    pass
Example #30
def impute_with_dist(feature: pd.Series):
    """
    Imputes na's of column according to distribution
    :param df:
    :param feature:
    :return:
    """

    probs = feature.value_counts(normalize=True)
    isnull = feature.isnull()
    feature[isnull] = np.random.choice(probs.index,
                                       size=len(feature[isnull]),
                                       p=probs.values)
    return feature
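A hedged usage note for impute_with_dist above: missing entries are drawn at random from the distribution of the observed values.

import numpy as np
import pandas as pd

np.random.seed(0)
col = pd.Series(["a", "a", "a", "b", np.nan, np.nan])
print(impute_with_dist(col.copy()))
# Each NaN becomes "a" or "b", with "a" three times as likely as "b"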
Example #31
 def validate_prediction_output(cls, result: pd.Series):
     """
     Validates if prediction output is of type Series and values of series are float64
     :param result: pandas series
     :return:    None
     :raises:    InvalidPredictionException
     """
     if not isinstance(result, pd.Series):
         raise InvalidPredictionException("Output of model. predict should be of type pandas Series")
     if result.empty or result.isnull().all():
         raise InvalidPredictionException("Prediction result for given test data was empty or None for all the rows."
                                          " Please verify the model")
     if not np.array_equal(result.fillna(0), result.fillna(0).astype(np.float64)):
         raise InvalidPredictionException("Prediction result of type other than int/float/double are not supported")
Example #32
def rdf(series: pd.Series, mol_list: Iterable[MultiMolecule]) -> None:
    """Guess parameters in **df** using the Boltzmann-inverted radial distribution function."""
    is_null = series.isnull()
    nonzero = series[~is_null].index
    atom_subset = set(chain.from_iterable(series[is_null].index))

    # Construct the RDF and guess the parameters
    rdf_gen = (mol.init_rdf(atom_subset=atom_subset) for mol in mol_list)
    for rdf in rdf_gen:
        guess = estimate_lj(rdf)
        guess.index = pd.MultiIndex.from_tuples(
            sorted(i.split()) for i in guess.index)
        guess[guess.index.intersection(nonzero)] = np.nan
        series.update(guess[series.name])
Example #33
 def _validate_class_labels(self, y: Series):
     null_count = y.isnull().sum()
     if null_count:
         raise ValueError(f'Labels cannot contain missing (nan) values. Found {null_count} missing label values.')
     if self.problem_type == MULTICLASS and not self.eval_metric.needs_pred:
         y_unique = np.unique(y)
         valid_class_set = set(self.class_labels)
         unknown_classes = []
         for cls in y_unique:
             if cls not in valid_class_set:
                 unknown_classes.append(cls)
         if unknown_classes:
             # log_loss / pac_score
             raise ValueError(f'Multiclass scoring with eval_metric=\'{self.eval_metric.name}\' does not support unknown classes. Unknown classes: {unknown_classes}')
Example #34
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    """
    dt = s.dtype
    if dt == np.dtype('object'):
        if len(s) == 0 or s.isnull().all():
            raise ValueError("can not infer schema from empty or null dataset")
        return types.from_arrow_type(pa.Array.from_pandas(s).type)
    elif is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
        return types.TimestampType()
    else:
        return types.from_arrow_type(pa.from_numpy_dtype(dt))
Example #35
 def check_if_datetime_feature(self, X: Series):
     type_family = self.get_type_family(X.dtype)
     # TODO: Check if low numeric numbers, could be categorical encoding!
     # TODO: If low numeric, potentially it is just numeric instead of date
     if X.isnull().all():
         return False
     if type_family == 'datetime':
         return True
     if type_family != 'object':  # TODO: seconds from epoch support
         return False
     try:
         X.apply(pd.to_datetime)
         return True
     except:
         return False
Example #36
    def test_comparison_operators_with_nas(self):
        s = Series(bdate_range('1/1/2000', periods=10), dtype=object)
        s[::2] = np.nan

        # test that comparisons work
        ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne']
        for op in ops:
            val = s[5]

            f = getattr(operator, op)
            result = f(s, val)

            expected = f(s.dropna(), val).reindex(s.index)

            if op == 'ne':
                expected = expected.fillna(True).astype(bool)
            else:
                expected = expected.fillna(False).astype(bool)

            assert_series_equal(result, expected)

            # fffffffuuuuuuuuuuuu
            # result = f(val, s)
            # expected = f(val, s.dropna()).reindex(s.index)
            # assert_series_equal(result, expected)

            # boolean &, |, ^ should work with object arrays and propagate NAs

        ops = ['and_', 'or_', 'xor']
        mask = s.isnull()
        for bool_op in ops:
            f = getattr(operator, bool_op)

            filled = s.fillna(s[0])

            result = f(s < s[9], s > s[3])

            expected = f(filled < filled[9], filled > filled[3])
            expected[mask] = False
            assert_series_equal(result, expected)
Example #37
def _tile_inds(s, bins, labels=False, retbins=True, infinite=True):
    #  NOTE: inf only happens when explicitly setting bins

    # short circuit empty series
    s = Series(s)
    if s.count() == 0:
        return np.repeat(None, len(s))

    if not np.iterable(bins):
        ind, label = cut(s, bins, retbins=retbins, labels=labels)
        # for now, pandas base cut doesn't support infinite ranges
        # so it bases first bin at 0 where we base on 1, and 0 is 
        # [-inf, first] for us
        ind = ind + 1
    else:
        bins = np.asarray(bins)
        #if (np.diff(bins) < 0).any():
        #    raise ValueError('bins must increase monotonically.')
        ind, label = inf_bins_to_cuts(s, bins)

    # build out ranges
    ranges = []
    ranges.append(NumRange(-inf, label[0]))
    for x in range(len(label)-1):
       nr = NumRange(label[x], label[x+1]) 
       ranges.append(nr)
    ranges.append(NumRange(label[-1], inf))

    if not infinite:
        na_mask = (ind == 0) | (ind == len(bins))
        np.putmask(ind, na_mask, -1)

    #ind = ind.astype(int)
    ind[s.isnull().values] = -1
    # fastpath=True to skip the hashmap indexing. 
    # The code generator will check identity, which won't match because
    # ind is an int position vector and ranges is a list of objects.
    # if fastpath is off, then it'll look like none of the values match
    return Categorical(ind, ranges, fastpath=True)
Example #38
    def fit(self, X, y=None):
        age = X[['SibSp', 'Parch','Age']][Series.notnull(X['Age'])]

        age_nan = X[['SibSp', 'Parch']][Series.isnull(X['Age'])]

        y_child = age['Age'].map(lambda age: 1 if age < 16 else 0)
        y_child = y_child.rename('y_child')

        age.drop('Age', axis=1, inplace=True)

        lr = LogisticRegression()
        paramters = {'C':[0.01, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3]}

        self.grid_search = GridSearchCV(lr, paramters, cv=5, n_jobs=-1)

        self.grid_search.fit(age, y_child)

        #print self.grid_search.best_params_
        #print self.grid_search.best_score_


        return self
Example #39
def _tile_inds(s, bins, labels=False, retbins=True, infinite=True):
    #  NOTE: inf only happens when explicitly setting bins

    # short circuit empty series
    s = Series(s)
    if s.count() == 0:
        return np.repeat(None, len(s))

    if not np.iterable(bins):
        ind, label = cut(s, bins, retbins=retbins, labels=labels)
        # for now, pandas base cut doesn't support infinite ranges
        # so it bases first bin at 0 where we base on 1, and 0 is 
        # [-inf, first] for us
        ind = ind + 1
    else:
        bins = np.asarray(bins)
        #if (np.diff(bins) < 0).any():
        #    raise ValueError('bins must increase monotonically.')
        ind, label = inf_bins_to_cuts(s, bins)
    

    # build out ranges
    ranges = []
    ranges.append(NumRange(-inf, label[0]))
    for x in range(len(label)-1):
       nr = NumRange(label[x], label[x+1]) 
       ranges.append(nr)
    ranges.append(NumRange(label[-1], inf))

    if not infinite:
        na_mask = (ind == 0) | (ind == len(bins))
        np.putmask(ind, na_mask, -1)

    #ind = ind.astype(int)
    ind[s.isnull().values] = -1
    return Categorical(ind, ranges)
Example #40
print s.index

s2 = Series([100, "python", "scu", "lina"], index=["mark", "title", "university", "name"])
print s2
print s2.index
print s2["name"]

sd = {"python": 10000, "go": 8900, "lua": 7200}
s3 = Series(sd)
print s3

s4 = Series(sd, index=["python", "go", "java"])
print s4

print pd.isnull(s4)
print s4.isnull()

s4.index = ["p1", "p2", "p3"]
print s4


data = {"name": ["yahoo", "facebook", "google"], "marks": [200, 400, 800], "price": [9, 3, 7]}
f1 = DataFrame(data)
print f1

f2 = DataFrame(data, columns=["name", "marks", "price"])
print f2

f3 = DataFrame(data, columns=["name", "marks", "price", "debt"], index=["a", "b", "c"])
print f3
Example #41
# Lecture 23 - Missing Data

import numpy as np
from pandas import Series,DataFrame
import pandas as pd

data = Series(['one', 'two', np.nan, 'four'])
data


data.isnull() # identify missing values

data.dropna() # drop all null values

dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])
dframe

clean_dframe = dframe.dropna() # returns rows without any nulls
clean_dframe

dframe.dropna(how = 'all') # drops rows with all values missing

dframe.dropna(axis = 1) # drops columns (will drop all of them)

npn = np.nan
dframe2 = DataFrame([[1,2,3,npn],[2,npn,5,6],[npn,7,npn,9],[1,npn,npn,npn]])
dframe2

# points without null
dframe2.dropna(thresh = 2) # rows with at least 2 data points
Example #42
# cov computes the covariance
returns.MSFT.corr(returns.IBM)
returns.MSFT.cov(returns.IBM)
returns.corr()
returns.cov()

# DataFrame's corrwith method computes the correlation between its columns or rows and another Series or DataFrame
# passing a Series returns a Series of correlation values; passing a DataFrame computes correlations of matching column names
# correlation of AAPL with the other stocks
returns.corrwith(returns.AAPL)
# correlation of the percent changes with volume
returns.corrwith(volume)

# Handling missing data
package = Series(['pandas','numpy','matplotlib',np.nan])
print package.isnull()
# Python's built-in None value is also treated as NA

data = Series([1,np.nan,2,np.nan,3])
print data.dropna()
print data[data.notnull()]

# for a DataFrame, dropna drops any row containing a missing value by default
data = DataFrame(np.arange(16).reshape(4,4))
data[(data/2-1)%3==0]=np.nan
# set the entire first row to nan
data.ix[0]=np.nan
print data.dropna()
# only drop rows that are all NA
print data.dropna(how = "all")
# set the third column to NA
Example #43
import numpy as np
import pandas as pd

import pandas.io.data as web

from pandas import Series, DataFrame
from numpy import NaN as NA

###############################################################

string_data = Series(['aardvark','artichoke',np.nan,'avocado'])

print(string_data)
print('\n')
print(string_data.isnull())
print('\n')

string_data[0]=None
print(string_data.isnull())
print('\n')

###############################################################

data =Series([1,NA,3.5,NA,7])
print(data.dropna())

print('\n')

print(data[data.notnull()])

print('\n')
Example #44
class BagOfWords:

	def __init__(self, filepath, volID, include_punctuation):
		''' Construct a BagOfWords.
		volID is a string label for the volume.
		include_punctuation is a boolean.
		'''

		self.volID = volID

		with open(filepath, encoding = 'utf-8') as f:
			filelines = f.readlines()

		self.rawcounts = dict()
		self.totalcount = 0

		for line in filelines:
			line = line.rstrip()
			fields = line.split('\t')
			if len(fields) != 2:
				print("Illegal line length in " + filepath)
				print(line)
				continue
			else:
				tokentype = fields[0]
				count = fields[1]

				try:
					intcount = int(count)
					if include_punctuation or not all_nonalphanumeric(tokentype):
						self.rawcounts[tokentype] = intcount
						self.totalcount += intcount

				except ValueError:
					print("Cannot parse count " + count + " as integer.")
					continue

		self.numrawcounts = len(self.rawcounts)

	def selectfeatures(self, featurelist):
		''' A BagOfWords is created with merely a dictionary of raw token counts.
		One could call this a sparse table. It has no entries where features are
		missing.

		We need to organize these as an ordered series of features, which includes
		only the features we have chosen to use in the current model, and has zeroes for
		missing values.
		'''

		self.featurelist = featurelist
		self.numfeatures = len(featurelist)
		self.features = Series(self.rawcounts, index = featurelist, dtype = 'float64')
		# Pandas has the nice feature of building a series from a dictionary if it's
		# provided an index of values. So this effectively builds a series of entries
		# ordered by the keys in 'featurelist,' with NaN in places where rawcounts
		# had no corresponding key.

		self.features[self.features.isnull()] = 0
		# This replaces NaN with zero, since missing words are effectively words with
		# count == 0.

	def normalizefrequencies(self):
		''' Simply divides all frequencies by the total token count for this volume.
		'''

		self.features = self.features / self.totalcount

	def standardizefrequencies(self, standardizer):
		''' Convert features to z-scores by centering them on the means and
		scaling them by standard deviation.

		standardizer = an object of class StandardizingVector, presumably created
		either on the corpus that contains this volume, or on the training corpus
		that created the model we are about to use on this volume.
		'''

		assert len(self.features) == len(standardizer.means)

		self.features = (self.features - standardizer.means) / standardizer.stdevs
Example #45
regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""",
                   flags=re.IGNORECASE | re.VERBOSE
                   )

m = regex.match('*****@*****.**')

data = {
    'Dave': '*****@*****.**',
    'Steve': '*****@*****.**',
    'Rob': '*****@*****.**',
    'Wes': np.nan
}

data = Series(data)
print(data)
print(data.isnull())
print(data.str.contains('gmail'))
print(data.str.findall(pattern, flags=re.IGNORECASE))

matches = data.str.match(pattern, flags=re.IGNORECASE)
print(matches)
print(matches.str.get(1))
print(matches.str[0])
print(data.str[:5])
Example #46
 def test_isnull(self):
     ser = Series([0,5.4,3,nan,-0.001])
     assert_series_equal(ser.isnull(), Series([False,False,False,True,False]))
     ser = Series(["hi","",nan])
     assert_series_equal(ser.isnull(), Series([False,False,True]))
Example #48
def main():
    # Series
    # Series is likea 1d array.
    lst = [4, 7, -5, 3]
    obj = Series(lst)
    print obj
    print obj.values
    print obj.index

    obj2 = Series(copy.deepcopy(lst), index=['d', 'b', 'a', 'c'])
    print obj2
    print obj2.index
    # numpy ndarray like
    print obj2['a']
    obj2['d'] = 6
    print obj2[['c', 'a', 'd']]
    print obj2
    print obj2[obj2 > 0]
    print obj2 * 2
    print np.exp(obj2)

    # dict like
    print 'b' in obj2, 'e' in obj2
    # dict to Series
    sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000,}
    obj3 = Series(sdata)
    print obj3

    # Value is NaN, if index not found.
    states = ['Calfornia', 'Ohio', 'Oregon', 'Texas']
    obj4 = Series(sdata, index=states)
    print obj4
    # Pandas function
    print pd.isnull(obj4)
    print pd.notnull(obj4)
    # Series method
    print obj4.isnull()

    # Value is NaN, Index that does not exist only on one side
    print obj3
    print obj4
    print obj3 + obj4

    # Index name
    obj4.name = 'population'
    obj4.index.name = 'state'
    print obj4

    # Change index
    obj4.index = ['a', 'b', 'c', 'd']
    print obj4

    # DataFrame
    # DataFrame is like a spread sheet(table)
    # DataFrame of Pandas are similar to R DataFrame
    print '',''
    data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
            'year': [2000, 2001, 2002, 2001, 2002],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9],}
    frame = DataFrame(data)
    print frame

    # Order columns
    print DataFrame(data, columns=['year', 'state', 'pop'])

    # Columns, named indexes
    # Column all value is NaN, if column not found in data
    frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                       index=['one', 'two', 'three', 'four', 'five'])
    print frame2
    print frame2.columns
    print frame2['state']
    print frame2.year
    # row data
    print frame2.ix['three']
    # Change column value
    frame2['debt'] = 16.5
    print frame2.debt
    frame2.debt = np.arange(5.) # the assigned array's length must match the 5 rows, otherwise this raises
    print frame2.debt

    # Column value is Nan, if index not found
    val = Series([-1.2, -1.5, -1.7], index=['one', 'three', 'five'])
    frame2.debt = val
    print frame2

    # new column. No frame.eastern
    frame2['eastern'] = frame2.state == 'Ohio'
    print frame2
    # del column. No frame.eastern
    del frame2['eastern']
    print frame2

    # nested dict to DataFrame
    # outer key is column, and sorted
    # inner key in index, and sorted
    pop = {'Ohio':{2002: 3.6, 2000: 1.5, 2001:1.7},
            'Nevada': {2001: 2.4, 2002:2.9},}
    frame3 = DataFrame(pop)
    print frame3
    # transpose
    print frame3.T
    # DataFrame is not sorted, if specify index/columns
    print DataFrame(pop, index=[2001, 2002, 2000], columns=['Ohio', 'Nevada'])

    # Values method return ndarray, ndarray has one type
    # return casted type, if ndarray has many type.
    print frame3.values
    print frame3.values.dtype
    print frame2.values
    print frame2.values.dtype
    print frame2['year'].values.dtype

    # Index Object(IntIndex, DateTimeIndex, PeriodIndex)
    obj = Series(range(3), index='a b c'.split(' '))
    index = obj.index
    print type(index)
    print index.dtype
    # Index is immutable
    try:
        index[1] = 'd'
    except:
        print sys.exc_info()

    print obj.index is pd.Index(['a', 'b', 'c'])
    index = pd.Index(np.arange(3))
    obj2 = Series(range(3), index=index)
    print obj2.index is index

    # Index as a Set
    print frame3
    print type(frame3.columns)
    print 'Ohio' in frame3.columns
    print 2003 in frame3.index
    print index.append(pd.Index(['d']))
    print index # result is true, immutable
Example #49
# 
# Functions isnull and notnull are used in pandas to detect missing data.

# In[40]:

pd.isnull(obj4)


# In[41]:

pd.notnull(obj4)


# In[42]:

obj4.isnull()


# Series automatically aligns differently-indexed data in arithmetric operations:

# In[43]:

obj3


# In[44]:

obj4


# In[45]:
Example #50
# if an index value is not present in the dict, its value is NaN
## the values array
obj.values
## the index object
obj2.index
obj2['a']
obj2[['c','a','d']]
obj2[obj2>0]
## arithmetic
obj2*2
np.exp(obj2)
'b' in obj2
## missing values
pd.isnull(obj4)
pd.notnull(obj4)
obj4.isnull()
## automatic alignment: operations align elements by index
obj3 + obj4
## both the Series itself and its index have a name attribute
obj4.name = 'population'
obj4.index.name = 'state'

## DataFrame
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
'year':[2000,2001,2002,2001,2002],
'pop':[1.5,1.7,3.6,2.4,2.9]}
frame = DataFrame(data)
pop = {'Nevada':{2001:2.4,2002:2.9},
'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
frame3 = DataFrame(pop)
# if columns are specified, they appear in that order
Example #51
a=Series(repeat(0.0,10),index=list(letters[:10])[::-1])
b=Series(arange(0,10,dtype=np.float64),index=list(letters[:10]))
c=Series(arange(0,10,dtype=np.float64),index=list(letters[:10]))
a[1] = np.nan
a[3] = np.nan
a[5] = np.nan
c[3:8] = np.nan
#%%
a
#%%
b
#%%
c

#%% null checks
a.isnull()
#%%
a.notnull()
#%% drop the null values from the series directly
a.dropna()
#%%
a.fillna(-1)
#%%
a.fillna(b)           # be very careful here: although we give fillna a series whose
                      # index matches exactly (only the order differs), it is effectively
                      # treated as a list, i.e. nulls are filled by position rather than
                      # aligned by index
#%% pad / ffill: forward fill
c.fillna(method='ffill')
#%% backfill / bfill: backward fill
c.fillna(method='bfill')
#%% limit the fill range
Example #52
class WordVector:
	''' A WordVector is just like a BagOfWords, except that it has
	a simpler constructor — it just accepts a list of tokens.
	In Java, you could write multiple constructors for one class.
	In Python, I'd have to rewrite the constructor inelegantly to make
	these a single class. So. Two classes.
	'''

	def __init__(self, listofwords):
		''' Construct a WordVector from a list.
		'''

		self.rawcounts = dict()
		self.totalcount = 0

		for word in listofwords:
			self.totalcount += 1
			if word in self.rawcounts:
				self.rawcounts[word] += 1
			else:
				self.rawcounts[word] = 1

		self.numrawcounts = len(self.rawcounts)

	def selectfeatures(self, featurelist):
		''' A WordVector is created with merely a dictionary of raw token counts.
		One could call this a sparse table. It has no entries where features are
		missing.

		We need to organize these as an ordered series of features, which includes
		only the features we have chosen to use in the current model, and has zeroes for
		missing values.
		'''

		self.featurelist = featurelist
		self.numfeatures = len(featurelist)
		self.features = Series(self.rawcounts, index = featurelist, dtype = 'float64')
		# Pandas has the nice feature of building a series from a dictionary if it's
		# provided an index of values. So this effectively builds a series of entries
		# ordered by the keys in 'featurelist,' with NaN in places where rawcounts
		# had no corresponding key.

		self.features[self.features.isnull()] = 0
		# This replaces NaN with zero, since missing words are effectively words with
		# count == 0.

	def normalizefrequencies(self):
		''' Simply divides all frequencies by the total token count for this volume.
		'''

		self.features = self.features / self.totalcount

	def standardizefrequencies(self, standardizer):
		''' Convert features to z-scores by centering them on the means and
		scaling them by standard deviation.

		standardizer = an object of class StandardizingVector, presumably created
		either on the corpus that contains this volume, or on the training corpus
		that created the model we are about to use on this volume.
		'''

		assert len(self.features) == len(standardizer.means)

		self.features = (self.features - standardizer.means) / standardizer.stdevs
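A small hedged example for WordVector above (assuming the class and its pandas Series import are available): raw counts from a token list are aligned to a fixed feature list, with absent features filled in as zero.

wv = WordVector(["the", "cat", "sat", "the"])
wv.selectfeatures(["the", "cat", "dog"])
print(wv.features)            # the 2.0, cat 1.0, dog 0.0
wv.normalizefrequencies()
print(wv.features)            # the same counts divided by the 4 total tokens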
Example #53
print obj2*2
print obj2-2
print np.exp(obj2)

print "#=================直接通过字典来创建Series===========================#"
sdata = {'ohio':35000,'Texes':71000,'Oregen':16000,'Utah':5000}
obj3 = Series(sdata)
print obj3
print "#-----------------------------#"
stats = ['California','ohio','Texes']
obj4 = Series(sdata,index=stats)
print obj4
print "#=================判断是否缺失===========================#"
print pd.isnull(obj4)
print pd.notnull(obj4)
print obj4.isnull()
print "#============================================#"
obj4.name = "population"

obj4.index.name = 'state'
print obj4
print "#==================DataFrame==========================#"
data = {'state':['Ohio','Ohio','Nevada','Nevada'],
		'year':['20','201','22','23'],
		'pop':[1.5,1.4,1.4,1.7]
		}

frame = DataFrame(data)
print frame
print "#===================指定列表顺序=========================#"
frame1 = DataFrame(data,columns=['year','state','pop'])
Example #54
mask
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
'Qu2': [2, 3, 1, 2, 3],
'Qu3': [1, 5, 2, 4, 4]})
data
data.Qu1
data.Qu1.value_counts
data.Qu1.value_counts()
result = data.apply(value_counts).fillna(0)
result
## Handling Missing Data
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull
string_data.isnull()
string_data[0] = None
string_data.isnull()
string_data
# Filtering out Missing Data
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()
data[data.notnull()]
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
[NA, NA, NA], [NA, 6.5, 3.]])
data
cleaned = data.dropna()
cleaned
cleaned = data.dropna() # it drops every row which contains at least one Na value
data.dropna(how='all') # it only drops row with all value equals to NA
Example #55
__author__ = 'ryu'

import numpy as np
import pandas as pd
from numpy import nan as NA
from pandas import Series, DataFrame

string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data[0] = None
print string_data.isnull()
string_data.fillna(0)

data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
data.dropna()
data.dropna(axis=1, how="all")
df = DataFrame(np.random.randn(7, 3))
df.ix[:4, 1] = NA
df.dropna(thresh=2)  # At least how many non NA values

df.fillna({1: 0.5, 3: -1})
df.fillna(method="bfill", limit=2)


data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
              [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
print data.index
print data['b'], data['b':'c'], data.ix[['b', 'd']], data[:, 2]
print data.unstack()
Example #56
```
s
Out[24]: 
the_index
0    4
1    7
2   -5
3    3
Name: a_Series, dtype: int64
```
A Series can also be created from a dict.  

In pandas, missing or NA values are represented by `NaN` ("not a number").  
pandas' `isnull` and `notnull` functions can be used to detect missing data: `pd.isnull(Series)` and `pd.notnull(Series)`  
return a Series whose values are booleans.
You can also call the method directly:
`Series.isnull()`  

When doing arithmetic between Series, data with different indexes are aligned automatically, because the underlying data structure is key-value based.

(I just noticed that the 喵酱的书架 blog and the book are the same.)

###DataFrame
A tabular data structure  
containing an **ordered** collection of columns.  
```
data = {'state':['Ohino','Ohino','Ohino','Nevada','Nevada'],
        'year':[2000,2001,2002,2001,2002],
        'pop':[1.5,1.7,3.6,2.4,2.9]}
df = DataFrame(data)
```
Selecting a column: by label or by attribute
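As a minimal illustration of the `isnull` calls described above (my own example, not from the original notes):
```
obj = Series([1, np.nan, 3])
pd.isnull(obj)    # values: [False, True, False]
obj.isnull()      # same result as the function form
```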
Example #57
print('\n')
###############################################################

states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)

print(obj4)
print('\n')

print(pd.isnull(obj4))
print(pd.notnull(obj4))

print('\n')

print(obj4.isnull())

print('\n')

###############################################################

print(obj3 + obj4)

print('\n')

obj4.name = 'population'

obj4.index.name = 'state'

print(obj4)
Example #58
# create a Series from a dict
dic = {
    'name':'ytx',
    'age':15,
    'sex':'man'
}
s = Series(dic)
# print s

# pass in a list to use as the index (keys)
l = ['name','sex','grade']
s1 = Series(dic,l)
# print s1

# check for null values
r = s1.isnull()
r1 = s1.notnull()
# print r
# print r1

# addition adds the values that share the same index
s2 = s + s1
# print s2



# give the Series and its index a name
s.name = 'series_name'
s.index.name = 'series_index_name'
# print s
Example #59
 def test_isnull(self):
     ser = Series([0, 5.4, 3, nan, -0.001])
     np.array_equal(ser.isnull(),
                    Series([False, False, False, True, False]).values)
     ser = Series(["hi", "", nan])
     np.array_equal(ser.isnull(), Series([False, False, True]).values)