Example #1
def __iter_snp(snp: pd.Series, phenotype: np.ndarray) -> None:
    """
    For a given SNP, encode the major allele as 0 and the minor allele as 1.
    Perform ordinary least squares linear regression on Phenotype ~ SNP Data.
    Put calculated P and R-Squared_adjusted values into the output dataframe.

    Parameters
    ----------
    snp : pd.Series
        pandas Series of SNP row from processed genotype dataframe.

    phenotype : np.ndarray
        numpy ndarray of phenotype values from processed phenotype dataframe.
    """
    snp_idx = snp.name
    # Encode
    alleles = snp.unique()
    allele_dict = {}
    if len(alleles) == 1:
        snps_to_drop.append(snp_idx)
        return
    else:
        indv = len(snp)
        a0 = (snp == alleles[0]).sum()
        a1 = indv - a0
        if a0 > a1:
            allele_dict = {alleles[0]: 0, alleles[1]: 1}
        else:
            allele_dict = {alleles[1]: 0, alleles[0]: 1}
    snp = snp.replace(allele_dict)
    # an inplace replace on a .loc slice acts on a copy; assign the result back instead
    genotypes.loc[snp_idx] = genotypes.loc[snp_idx].replace(allele_dict)
    model = sm.OLS(phenotype, snp.values.astype(float))
    results = model.fit()
    gwas_output_df.iat[snp_idx, 1] = results.pvalues[0]  # pvalues is an array; take the scalar
    gwas_output_df.iat[snp_idx, 2] = results.rsquared_adj
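
Example #1 reads and writes module-level state that is not shown. A minimal sketch of that assumed setup (only the identifiers come from the snippet; shapes and column names are invented):

import numpy as np
import pandas as pd
import statsmodels.api as sm

# hypothetical module-level objects referenced by __iter_snp
genotypes = pd.DataFrame([["A", "G", "A", "A"],
                          ["C", "T", "C", "T"]])  # one row per SNP
phenotype = np.array([1.2, 0.7, 1.9, 1.4])
gwas_output_df = pd.DataFrame(np.nan, index=genotypes.index,
                              columns=["snp", "p_value", "rsq_adj"])
snps_to_drop: list = []

# driver pattern, one call per SNP row:
# genotypes.apply(lambda row: __iter_snp(row, phenotype), axis=1)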
Example #2
    def _to_label_features_with_min_cut(
            self,
            feature_col: pd.Series,
            feature_name: str,
            category_min_cnt: Optional[int] = None,
            category_min_rate: Optional[float] = None,
            aggregation_value='others') -> pd.DataFrame:
        feature_col: pd.Series = feature_col.astype('object')

        if category_min_cnt is not None or category_min_rate is not None:
            category_cnt_series: pd.Series = feature_col.value_counts()
            sum_cnt = len(feature_col.values)

            for category_val, cnt in zip(category_cnt_series.index,
                                         category_cnt_series.values):
                if (category_min_cnt is not None and cnt < category_min_cnt) \
                        or (category_min_rate is not None and float(cnt) / float(sum_cnt) < category_min_rate):
                    feature_col.replace(category_val,
                                        aggregation_value,
                                        inplace=True)

        feature_col: pd.Series = feature_col.astype('category')
        feature_df = feature_col.to_frame(name=feature_name)

        dummy_df: pd.DataFrame = pd.get_dummies(feature_df, drop_first=False)
        self._feature_columns = dummy_df.columns

        return dummy_df
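
A sketch of what the min-cut pass above does, inlined without the class (toy data invented for illustration):

import pandas as pd

colors = pd.Series(["red", "red", "blue", "green", "green", "green"])
counts = colors.value_counts()
rare = counts[counts < 2].index  # category_min_cnt = 2
folded = colors.replace(dict.fromkeys(rare, "others")).astype("category")
print(pd.get_dummies(folded.to_frame("color"), drop_first=False))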
Example #3
 def convert_postcode_toint(serie: pd.Series, str_postcode: list):
     """
     Convert string postcodes to integers; entries listed in str_postcode become 0.
     """
     serie.replace({i:None for i in str_postcode}, inplace=True)
     serie.fillna(0, inplace=True)
     serie = serie.astype(int)
     return serie
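A quick usage sketch for the helper above (sample postcodes invented):

import pandas as pd

postcodes = pd.Series(["1010", "N/A", "2020", "unknown"])
print(convert_postcode_toint(postcodes, str_postcode=["N/A", "unknown"]).tolist())
# -> [1010, 0, 2020, 0]: the listed string values fall back to 0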
Example #4
def cat2numeric(col: pd.Series) -> None:
    """
    Convert categorical column values to numeric values using
    sklearn's LabelEncoder.
    """
    le = preprocessing.LabelEncoder()
    num_values = le.fit_transform(col.values)
    col.replace(col.values,num_values, inplace=True)
Example #5
def impute_series_zeros_with_half_min(s: pd.Series):
    # Find the minimal non-zero abundance per sample
    min_abundance = s.replace(0, np.nan).dropna().min()

    # replace all zeros with half of the min abundance
    sample = s.replace(0, min_abundance / 2)

    return sample
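A quick check of the zero-imputation above (toy abundances):

import pandas as pd

abundances = pd.Series([0.0, 4.0, 2.0, 0.0])
print(impute_series_zeros_with_half_min(abundances).tolist())
# -> [1.0, 4.0, 2.0, 1.0]: zeros become half of the smallest non-zero value (2.0 / 2)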
Example #6
def pandas_data_astype_part3():
    # Replace values
    data = Series([1, -999, 2, -999, -1000, 3])
    print(data.replace(to_replace=-999, value=np.nan))  # replace -999 with np.nan
    print(data.replace(to_replace=[-999, -1000],
                       value=np.nan))  # a list replaces several values at once
    print(data.replace(to_replace=[-999, -1000], value=[np.nan,
                                                        0]))  # different replacement per value
    print(data.replace(to_replace={-999: np.nan, -1000: 0}))  # the same, via a dict
Example #7
def one_hot_encode(series: pd.Series,
                   ais_col_name: str) -> Tuple[pd.DataFrame, OneHotEncoder]:
    series_categories = series.unique()  # in order of appearance
    ais_col_name = ais_col_name.lower()
    default_alias = ["Default", "Unknown", "Undefined"]

    if ais_col_name in categorical_values:
        categories = categorical_values[ais_col_name] + default_alias
        nan_idx = -1
        # NaN is parsed as a float; swap it for the string "default" to avoid type errors below
        for idx, cat in enumerate(series_categories):
            if type(cat) is float:
                nan_idx = idx

        if nan_idx > -1:
            series_categories[nan_idx] = "default"

        for s_cat in series_categories:
            i = 0
            found = False
            while i < len(categories) and not found:
                cat = categories[i]
                if cat.lower() in s_cat.lower():
                    if cat in default_alias:
                        series = series.replace(s_cat,
                                                f"Default {ais_col_name}")
                    else:
                        series = series.replace(s_cat, cat)
                    found = True
                i += 1
            if not found or s_cat.lower() == "other":
                series = series.replace(s_cat, f"Other {ais_col_name}")

        desired_categories = categorical_values[ais_col_name] + [
            f"Other {ais_col_name}", f"Default {ais_col_name}"
        ]

        series = series.values.reshape(-1, 1)  # shape as column
        encoder = OneHotEncoder(sparse=False, categories=[desired_categories])
        encoder.fit(series)
        data_ohe = encoder.transform(series)
        df_ohe = pd.DataFrame(data_ohe, columns=desired_categories)
        return df_ohe, encoder

    else:
        raise ValueError(
            f"Error while one hot encoding: Cannot find column {ais_col_name} "
            f"in {categorical_values.keys()}")
Example #8
def _split_T_in_col(nfl: pd.DataFrame, column: pd.Series):

    x = column.replace('T', '', regex=True)
    y = column.replace(r'^-?\d+', '', regex=True)

    y.where(y == 'T', other=1, inplace=True)
    y.where(y != 'T', other=0, inplace=True)

    x.name = column.name
    y.name = column.name + "_T"

    nfl[x.name] = x
    nfl[y.name] = y
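A hedged usage sketch for _split_T_in_col (toy column with 'T' flags, data invented):

import pandas as pd

nfl = pd.DataFrame({"yards": ["12", "T", "-3", "T"]})
_split_T_in_col(nfl, nfl["yards"])
print(nfl)
# yards keeps the digits ('' where the cell was 'T'); yards_T is 1 for
# numeric cells and 0 for 'T' cells, per the two where() passes above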
Example #9
 def __work(se: pd.Series, except_strings: List[str]):
     # rep_nan, strtmp, and n_round are module-level constants defined elsewhere in the source
     if is_integer_str_regex(
             se, except_strings=except_strings).sum() == se.shape[0]:
         return se.replace(rep_nan, strtmp, inplace=False).astype(
             np.float64).astype(np.int64).astype(str).replace(
                 strtmp, rep_nan)
     elif is_float_str_regex(
             se, except_strings=except_strings).sum() == se.shape[0]:
         return se.replace(rep_nan, np.nan, inplace=False).astype(
             np.float32).round(n_round).astype(str).replace(
                 str(np.nan), rep_nan)
     else:
         return se
Example #10
def vidya(close, length=None, drift=None, offset=None, **kwargs):
    """Indicator: Variable Index Dynamic Average (VIDYA)"""
    # Validate Arguments
    length = int(length) if length and length > 0 else 14
    close = verify_series(close, length)
    drift = get_drift(drift)
    offset = get_offset(offset)

    if close is None: return

    def _cmo(source: Series, n: int, d: int):
        """Chande Momentum Oscillator (CMO) Patch
        For some reason, `from pandas_ta.momentum import cmo` causes
        pandas_ta.momentum.coppock to be unable to import its
        wma via `from pandas_ta.overlap import wma`.
        Weird circular TypeError!?!
        """
        mom = source.diff(d)
        positive = mom.copy().clip(lower=0)
        negative = mom.copy().clip(upper=0).abs()
        pos_sum = positive.rolling(n).sum()
        neg_sum = negative.rolling(n).sum()
        return (pos_sum - neg_sum) / (pos_sum + neg_sum)

    # Calculate Result
    m = close.size
    alpha = 2 / (length + 1)
    abs_cmo = _cmo(close, length, drift).abs()
    vidya = Series(0, index=close.index)
    for i in range(length, m):
        vidya.iloc[i] = alpha * abs_cmo.iloc[i] * close.iloc[i] + vidya.iloc[
            i - 1] * (1 - alpha * abs_cmo.iloc[i])
    vidya.replace({0: npNaN}, inplace=True)

    # Offset
    if offset != 0:
        vidya = vidya.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        vidya.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        vidya.fillna(method=kwargs["fill_method"], inplace=True)

    # Name & Category
    vidya.name = f"VIDYA_{length}"
    vidya.category = "overlap"

    return vidya
Example #11
    def _to_label_features(self, feature_col: pd.Series) -> pd.DataFrame:
        feature_col: pd.Series = feature_col.astype('category')
        uq_vals = feature_col.cat.categories
        feature_col: pd.Series = feature_col.astype('object')

        for category_val in uq_vals:
            if self._feature_name + '_' + category_val not in self._feature_columns.values:
                feature_col.replace(category_val,
                                    self._aggregation_value,
                                    inplace=True)

        feature_col: pd.Series = feature_col.astype('category')
        feature_df = feature_col.to_frame(name=self._feature_name)

        return pd.get_dummies(feature_df, drop_first=False)
Example #12
def rank(data: pd.Series) -> pd.Series:
    """Internal function used to rank ordinal and nominal data."""
    unique = pd.Series(data.unique())
    ranked = unique.rank()
    lookup = pd.concat([unique, ranked], axis=1)
    lookup_series = pd.Series(lookup.iloc[:, 1].values, index=lookup.iloc[:, 0])
    return data.replace(lookup_series.to_dict()).astype(float)
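A quick usage sketch (colors invented); ranks follow the sort order of the unique values:

import pandas as pd

colors = pd.Series(["red", "blue", "red", "green"])
print(rank(colors).tolist())
# -> [3.0, 1.0, 3.0, 2.0]  (blue=1, green=2, red=3 alphabetically)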
Example #13
def find_best_similar(serie: pd.Series, dict_ref: dict, similarity: Callable, applying=True):
    """
    Find, for each value in a pandas Series, the most similar
    string in a dictionary referential, using a similarity algorithm
    (for example Jaro-Winkler or gestalt pattern matching).

    Returns
    -------
    serie : if applying is True, the series with each value replaced by its closest match
    closest_value : if applying is False, the dictionary mapping each value to its closest match
    """
    
    all_values = set(serie.values)
    all_values.discard(None)  # discard() avoids a KeyError when no None is present

    closest_value = {}

    for data in all_values:
        best_ratio = 0

        for value in dict_ref.values():
            ratio = similarity(value.lower(), data)

            if ratio >= best_ratio:
                best_ratio = ratio
                value_close = value

        closest_value[data] = value_close
    
    if applying:
        return serie.replace(closest_value)
    else:
        return closest_value
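
A hedged usage sketch; difflib's SequenceMatcher stands in for the similarity callable (a Jaro-Winkler function would be passed the same way):

import difflib
import pandas as pd

def gestalt(a: str, b: str) -> float:
    return difflib.SequenceMatcher(None, a, b).ratio()

cities = pd.Series(["pariss", "londn", None])
referential = {1: "Paris", 2: "London"}
print(find_best_similar(cities, referential, similarity=gestalt).tolist())
# -> ['Paris', 'London', None]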
Example #14
def create_bow(se: pd.Series,
               preproc: List[tuple] = [(r"\s+", "")],
               nlp=spacy.load('ja_ginza'),
               list_pos: List[str] = ["NOUN", "ADJ", "VERB", "PROPN", "PRON"],
               vocab=None) -> Tuple[pd.DataFrame, dict]:
    se = se.copy()
    for x, y in preproc:
        se = se.replace(x, y, regex=True)
    se = se.str.strip()
    if vocab is None:
        vocab = {}
        for text in se.values:
            if text != "":
                for token in nlp(text):
                    if token.pos_ in list_pos and not (
                        (token.pos_, token.string) in vocab):
                        vocab[(token.pos_, token.string)] = len(vocab)

    def bag_of_words(text: str, vocab: dict):
        ndf = np.zeros(len(vocab)).astype(int)
        if text.strip() != "":
            for token in nlp(text):
                wk = vocab.get((token.pos_, token.string))
                if wk is not None:
                    ndf[wk] += 1
        return ndf

    se = se.apply(lambda x: bag_of_words(x, vocab))
    ndf = np.concatenate([x.reshape(1, -1) for x in se.tolist()], axis=0)
    df = pd.DataFrame(ndf, columns=[f"vocab_{i}" for i in range(ndf.shape[1])])
    return df, vocab
Example #15
def _from_dataframe(row: Series, na: Union[Any, List[Any]],
                    nesting: str) -> Resource:
    new_na = row.replace(na, np.nan)
    no_na = new_na.dropna()
    items = list(no_na.items())
    data = deflatten(items, nesting)
    return from_json(data, None)
Example #16
def train_vw(X: pd.DataFrame, y: pd.Series, config: Config):
    cache_file = config.tmp_dir + "/.vw_cache"
    data_file = config.tmp_dir + "/vw_data_train.csv"

    cmd = " ".join([
        "rm -f {cache} && vw",
        "-f {f}",
        "--cache_file {cache}",
        "--passes {passes}",
        "-l {l}",
        "--early_terminate {early_terminate}",
        "{df}"
    ]).format(
        cache=cache_file,
        df=data_file,
        f=config.model_dir + "/vw.model",
        passes=max(20, int(1000000/len(X))),
        l=25,
        early_terminate=1,
    )

    if config["mode"] == "classification":
        cmd += " --loss_function logistic --link logistic"
        y = y.replace({0: -1})

    save_to_vw(data_file, X, y)
    subprocess.Popen(cmd, shell=True).communicate()
Example #17
    def make_features(self,
                      answers: Series = None,
                      normalize: Union[Dict[float, float], bool] = True,
                      norm_min: float = 0.0,
                      norm_max: float = 1.0,
                      drop_na: bool = True) -> Series:
        """
        Create a Series of features for use in ML.

        :param answers: Answers to the Question from a Survey.
        :param normalize: Option to normalize the data by min-max normalization
                          or map using a dictionary.
        :param norm_min: Value to use for lower bound of normalization range.
        :param norm_max: Value to use for upper bound of normalization range.
        :param drop_na: Whether to drop null responses from returned features.
        """
        if answers is None:
            answers = self._data
        if drop_na:
            answers = answers.dropna()
        features = Series(data=answers.replace(self._categories),
                          index=answers.index)
        if type(normalize) is bool:
            if normalize:
                min_cat_val = min(self.categories.values())
                max_cat_val = max(self.categories.values())
                return (norm_min + (features - min_cat_val) *
                        (norm_max - norm_min) / (max_cat_val - min_cat_val))
            else:
                return features
        elif isinstance(normalize, dict):
            return features.map(normalize)
        else:
            raise TypeError('normalize needs to be either bool or dictionary')
Example #18
def complex_series_maths(ser_01: pd.Series, ser_02: pd.Series,
                         function: str) -> pd.core.series.Series:
    """Write some math helper functions for series.
    Take the two given series, perform the required operation and
        return the new series.
    For example, given the series:
        0    0
        1    1
        2    2
        dtype: int64

    And the series:
        0     2
        1     3
        2     4
        dtype: int64

    If the function given is 'add' you should return
        0     2
        1     4
        2     6
        dtype: int64

    :param ser_01: Primary series to perform operation on
    :param ser_02: Secondary series to perform operation on
    :param function: The operation to perform

    Note:
    For this function always add ser_02 to ser_01,
        subtract ser_02 from ser_01,
        multiply ser_01 by ser_02,
        divide ser_01 by ser_02
    Don't worry about None's and NaN and divide by zero.
        Let pandas do the work for you.
    """
    ser_01 = ser_01.replace('nan', np.NaN)
    ser_02 = ser_02.replace('nan', np.NaN)
    if function == 'add':
        return ser_01 + ser_02
    elif function == 'sub':
        return ser_01 - ser_02  # subtract ser_02 from ser_01, as the note specifies
    elif function == 'mul':
        return ser_01 * ser_02
    elif function == 'div':
        return ser_01 / ser_02
    else:
        return 0
Example #19
def slide_12():
    data = Series([1., -999., 2., -999., -1000., 3.])
    print(data)

    print(data.replace(-999, np.nan))
    print(data.replace([-999, -1000], np.nan))
    print(data.replace([-999, -1000], [np.nan, 0]))

    print(data.replace({-999: np.nan, -1000: 0}))
Example #20
 def restore(self, col: pd.Series) -> pd.Series:
     """Restore column when to_pandas."""
     return pd.Series(
         pd.Categorical.from_codes(
             col.replace(np.nan, -1).astype(int),
             categories=cast(CategoricalDtype, self.dtype).categories,
             ordered=cast(CategoricalDtype, self.dtype).ordered,
         ))
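
A minimal stand-in for the surrounding class, assuming self.dtype holds the original CategoricalDtype (the cast() calls above are typing-only):

import numpy as np
import pandas as pd

dtype = pd.CategoricalDtype(categories=["low", "high"], ordered=True)
codes = pd.Series([0.0, np.nan, 1.0])
restored = pd.Series(
    pd.Categorical.from_codes(
        codes.replace(np.nan, -1).astype(int),
        categories=dtype.categories,
        ordered=dtype.ordered,
    ))
print(restored.tolist())  # ['low', nan, 'high'] -- code -1 round-trips to NaN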
Example #21
def calculate_age(date: pd.Series, year: int):
    """
    Calculate ages from a pandas Series of dates of birth.
    """
    date.fillna(0, inplace=True)
    date = date.apply(lambda x : int(str(int(x))[:4]))
    date = date.replace({0:None})
    return year - date
Example #22
 def test_replace_series_no_regex(self):
     answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3:
                      'Disagree', 4: 'Strongly Disagree'})
     weights = Series({'Agree': 4, 'Disagree': 2, 'Neutral': 3,
                       'Strongly Agree': 5, 'Strongly Disagree': 1})
     expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1})
     result = answer.replace(weights)
     assert_series_equal(result, expected)
Example #23
def describe_1d(series: pd.Series) -> dict:
    """Describe a series (infer the variable type, then calculate type-specific values).

    Args:
        series: The Series to describe.

    Returns:
        A dict containing calculated series description values.
    """

    # Replace infinite values with NaNs to avoid issues with histograms later.
    series.replace(to_replace=[np.inf, np.NINF, np.PINF],
                   value=np.nan,
                   inplace=True)

    # Infer variable types
    series_description = base.get_var_type(series)

    # Run type specific analysis
    if series_description["type"] == Variable.S_TYPE_UNSUPPORTED:
        series_description.update(
            describe_unsupported(series, series_description))
    else:
        series_description.update(
            describe_supported(series, series_description))

        type_to_func = {
            Variable.S_TYPE_CONST: describe_constant_1d,
            Variable.TYPE_BOOL: describe_boolean_1d,
            Variable.TYPE_NUM: describe_numeric_1d,
            Variable.TYPE_DATE: describe_date_1d,
            Variable.S_TYPE_UNIQUE: describe_unique_1d,
            Variable.TYPE_CAT: describe_categorical_1d,
            Variable.TYPE_URL: describe_url_1d,
            Variable.TYPE_PATH: describe_path_1d,
        }

        if series_description["type"] in type_to_func:
            series_description.update(type_to_func[series_description["type"]](
                series, series_description))
        else:
            raise ValueError("Unexpected type")

    # Return the description obtained
    return series_description
Example #24
def create_site_specific_splits(
    site: pd.Series,
    proportions: Dict[str, int],
    random_state: Optional[Union[int, np.random.mtrand.RandomState]] = 989,
):
    """Splits sites into distinct groups whose sizes roughly matching the given proportions. Null
    sites are randomly assigned to groups using the provided proportions.

    Args:
        site (pd.Series): A series of sites, one element per observation,
        proportions (dict): A dict whose keys are the resulting groups and whose values are the
            rough proportion of data in each group.
        random_state (int): Seed for the random split of null sites.

    Example:
        Split data into groups where each site is in one and only one group with roughly 50-25-25
        train-val-holdout proportions.

        >>> create_site_specific_splits(site, proportions={"train": 2, "val": 1, "holdout": 1})

    Returns:
        pd.Series: A series containing the resulting split, one element per observation.

    """

    assignments = {}
    sites = site.value_counts(dropna=True).sort_values(ascending=False).index
    n_subgroups = sum(proportions.values())
    for i, subset in enumerate(
            roundrobin(*([subset] * proportions[subset]
                         for subset in proportions))):
        for group in sites[i::n_subgroups]:
            assignments[group] = subset

    # Divide null sites among the groups
    null_sites = site.isnull()
    if null_sites.sum() > 0:
        logger.debug(
            f"{null_sites.sum():,} null sites randomly assigned to groups.")
        null_groups = []
        for group, group_proportion in proportions.items():
            null_group = f"{group}-{uuid4()}"
            null_groups.append(null_group)
            assignments[null_group] = group

        rng = (np.random.RandomState(random_state) if isinstance(
            random_state, int) else random_state)
        site = site.copy()
        site.loc[null_sites] = rng.choice(
            null_groups,
            p=np.asarray(list(proportions.values())) /
            sum(proportions.values()),
            size=null_sites.sum(),
            replace=True,
        )

    return site.replace(assignments)
Example #25
def target_binned_price_variation_kmeans(pct_var: pd.Series, **kwargs):
    values = pct_var.replace([np.inf, -np.inf],
                             np.nan).fillna(method='ffill').values
    values = np.reshape(values, (-1, 1))
    discretizer = KBinsDiscretizer(n_bins=kwargs.get('n_bins', 3),
                                   strategy='kmeans',
                                   encode='ordinal')
    discrete = discretizer.fit_transform(values)
    return pd.Series(np.reshape(discrete, (-1, )), index=pct_var.index)
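A quick usage sketch (toy percentage variations; exact bin labels depend on the k-means fit):

import numpy as np
import pandas as pd

pct = pd.Series([0.01, -0.02, np.inf, 0.03, -0.05, 0.00, 0.02, -0.01])
print(target_binned_price_variation_kmeans(pct, n_bins=2).tolist())
# inf is forward-filled away before binning; output is one ordinal label per row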
Example #26
def autocast_series_dtype(series: pd.Series) -> pd.Series:
    """Cast any sane Series to str/category[str]/number/datetime.

    This is appropriate when parsing CSV data or Excel data. It _seems_
    appropriate when a search-and-replace produces numeric columns like
    '$1.32' => '1.32' ... but perhaps that's only appropriate in very-specific
    cases.

    The input must be "sane": if the dtype is object or category, we assume
    _every value_ is str (or null).

    If the series is all-null, do nothing.

    Avoid spurious calls to this function: it's expensive.

    TODO handle dates.
    """
    if series.dtype == bool:
        # Handle Excel formula: '=TRUE'
        #
        # We capitalize Pythonic, because A) sometimes the formula is in Python;
        # and B) the `series.astype(str)` below is hard to customize
        return series.replace({True: "True", False: "False"})
    elif series.dtype == object:
        nulls = series.isnull()
        if (nulls | (series == "")).all():
            return series
        try:
            # If it all looks like numbers (like in a CSV), cast to number.
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            # Otherwise, we want all-string. Is that what we already have?
            #
            # Handles Excel formula: =IF(A1=1, 3, "Hi")
            array = series[~nulls].array
            if any(type(x) != str for x in array):
                series = series.astype(str)
                series[nulls] = None
            return series
    elif hasattr(series, "cat"):
        # Categorical series. Try to infer type of series.
        #
        # Assume categories are all str: after all, we're assuming the input is
        # "sane" and "sane" means only str categories are valid.
        if (series.isnull() | (series == "")).all():
            return series
        try:
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            # We don't cast categories to str here -- because we have no
            # callers that would create categories that aren't all-str. If we
            # ever do, this is where we should do the casting.
            return series
    else:
        assert is_numeric_dtype(series) or is_datetime64_dtype(series)
        return series
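A quick usage sketch (toy inputs):

import pandas as pd

print(autocast_series_dtype(pd.Series(["1.5", "2", None])).tolist())  # [1.5, 2.0, nan]
print(autocast_series_dtype(pd.Series([True, False])).tolist())       # ['True', 'False']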
Example #27
def safe_replace_series(pds: Series,
                        values: Dict,
                        strip: bool = True,
                        lower: bool = False,
                        inplace=False) -> Optional[Series]:
    if not inplace:
        pds = pds.copy()
    if strip and pds.dtype == "object":
        str_idx = pds.map(lambda x: isinstance(x, str))
        pds.loc[str_idx] = pds.loc[str_idx].str.strip()
    if lower and pds.dtype == "object":
        str_idx = pds.map(lambda x: isinstance(x, str))
        pds.loc[str_idx] = pds.loc[str_idx].str.lower()
        values = {k.lower(): v for k, v in values.items()}
    pds.replace(values, inplace=True)
    assert_values(pds, values.values())
    if not inplace:
        return pds
    return None
Example #28
 def check_fillna(self, serie: pd.Series, method: str = '', value: int = 0):
     """
     Replace +/-inf with NaN and, when self._fillna is set, fill the NaNs.
     """
     if self._fillna:
         serie_output = serie.replace([np.inf, -np.inf], np.nan)
         serie_output = serie_output.fillna(method=method) if method else serie_output.fillna(value)
         return serie_output
     else:
         return serie
Example #29
    def tract(series: pd.Series, initial: str, latest: str):
        """
        The county & tract GEOID update function.

        :param series:
        :param initial:
        :param latest:
        """

        return series.replace(to_replace='^({})'.format(initial), value=latest, regex=True)
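
A hedged usage sketch (GEOIDs and the replacement code are invented; 36061 is the New York County FIPS prefix):

import pandas as pd

geoids = pd.Series(["36061000100", "36061000200", "06037101110"])
print(tract(geoids, initial="36061", latest="36999").tolist())
# -> ['36999000100', '36999000200', '06037101110']: only leading matches update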
Example #30
 def __compensate_original_column(self, original, columnnum):
     """补足原始数据的列宽
     :original: 原始矩阵数据
     :columnnum: 需要补足的列宽数目
     """
     rownum = np.shape(original)[0]
     for i in range(columnnum):
         s = Series(np.zeros(rownum))
         col = self.__convert_row2column(s.replace(0, np.nan).values)
         original = np.concatenate([original, col], axis=1)
     return original
Example #31
def parse_data():
    sd = parse_get_data("../Test/Data/hemsedal_hollekolten_jan2016.xml")
    ff_s = Series(sd["25112"]["FF"]["val"], index=sd["25112"]["index"])

    # replace fill values with NaN
    ff_s.replace(-99999.0, np.nan, inplace=True)
    ff24_s = ff_s.resample("D")

    sd = parse_get_data("../Test/Data/hemsedal_hoelto_jan2016.xml")
    ts = Series(sd["25100"]["RR_24"]["val"], index=sd["25100"]["index"])

    # select only measurements at 06 every day
    ts = ts[ts.index.hour == 6]
    # ts.replace(-99999.0, np.nan, inplace=True)
    # ts.replace(-1.0, 0.0, inplace=True)
    print(ff24_s)

    plt.bar(ts.index, ts.values)
    # plt.hold was removed from matplotlib; successive plot calls overlay by default

    plt.plot(ff_s.index, ff_s.values)
    plt.plot(ff24_s.index, ff24_s.values)
    # plt.bar(sd['25100']['index'], sd['25100']['RR_24']['val'])
    plt.show()
Example #32
#This helped: http://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-columns
temps = Series(temps)
#get rid of the underlines
temps = temps.drop(temps.index[[1]])
#create a data frame with one column
temps = DataFrame(temps, columns=['col'])
#create an object with a list of list (Q: Why does this return list of list? Can this be done with values?) for first row
col_n = list(temps.iloc[0].str.split())  # .ix was removed from pandas; use .iloc
#split columns of dataframe and make col_n the column indexes
temps = pd.DataFrame(list(temps.col.str.split()), columns=col_n[0])
#drop the duplicate column name row
temps = temps.drop(temps.index[0])
#this would strip white space, but I think it's unnecessary: temps.apply(lambda x: x.str.strip())
#Change Ms to missing values
import numpy as np
temps.replace('M', np.nan, inplace=True)
#create a column with TX vs TN, change MO so is actually Month
temps['Lvl'] = Series(temps['MO']).str[-2:]
temps['MO'] = Series(temps['MO']).str[:-2]
temps['YRMO'] = Series(temps['YR']+temps['MO'])
#make year and month indexes (Q: Added Lvl as well does this shape make sense?)
temps = temps.set_index(['YR','MO', 'YRMO','Lvl']) #Q: added in YRMO so can group and plot, but must be a way to do this with the hierarchical indexing
temps = temps.stack().unstack(['Lvl'])
#adding name to day index
temps.head(100)
temps.index.names = ['YR','MO', 'YRMO', 'DAY']
#convert TX and TN to numbers
temps = temps.apply(pd.to_numeric, errors='coerce')  # convert_objects was removed from pandas
#grouping
yrmo_grouped = temps.groupby(level=(['YRMO'])).mean() #Q: really don't think that should need YRMO
yr_grouped = temps.groupby(level=(['YR'])).mean()
Example #33
#!/usr/bin/env python
# encoding=utf-8

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Replace values
# Similar to fillna: substitute replacement values for existing ones
data = Series([1., -999., 2., -999., -1000., 3.])
print(data)
# We may use -999 as a missing-value sentinel, but NaN is unambiguous
print(data.replace(-999, np.nan))
# replace several values at once
print(data.replace([-999, -1000], np.nan))
# replace different values with different substitutes (positions correspond)
print(data.replace([-999, -1000], [np.nan, 0]))
# dict: keys are the values to replace, values are the replacements
print(data.replace({-999: np.nan, -1000: 0}))
first, second, third = pieces

first + '::' + second + '::' + third

# more idiomatic:

'::'.join(pieces)

# count occurrences of a substring

val.count(',')

# replace

val.replace(',', '::')


#************************************************************
#  regular expressions (regex)

# the re module covers three operations: pattern matching, substitution, splitting

import re

text = "foo bar\t baz \tqux"


# r'\s+' matches any whitespace run, regardless of its length or which characters it contains
re.split(r'\s+', text)
def replace01():
    data = Series([1, -999., 2., -999., -1000., 3.])
    print(data)
    print(data.replace(-999, np.nan))
    print(data.replace([-999, -1000], np.nan))
    print(data.replace({-999: np.nan, -1000: 0}))
class MySeries:
    def __init__(self, *args, **kwargs):
        self.x = Series(*args, **kwargs)
        self.values = self.x.values
        self.index = self.x.index
    
    def rolling_mean(self, *args, **kwargs):
        return MySeries(pd.rolling_mean(self.x, *args, **kwargs))

    def rolling_count(self, *args, **kwargs):
        return MySeries(pd.rolling_count(self.x, *args, **kwargs))

    def rolling_sum(self, *args, **kwargs):
        return MySeries(pd.rolling_sum(self.x, *args, **kwargs))

    def rolling_median(self, *args, **kwargs):
        return MySeries(pd.rolling_median(self.x, *args, **kwargs))
        
    def rolling_min(self, *args, **kwargs):
        return MySeries(pd.rolling_min(self.x, *args, **kwargs))

    def rolling_max(self, *args, **kwargs):
        return MySeries(pd.rolling_max(self.x, *args, **kwargs))

    def rolling_std(self, *args, **kwargs):
        return MySeries(pd.rolling_std(self.x, *args, **kwargs))

    def rolling_var(self, *args, **kwargs):
        return MySeries(pd.rolling_var(self.x, *args, **kwargs))

    def rolling_skew(self, *args, **kwargs):
        return MySeries(pd.rolling_skew(self.x, *args, **kwargs))

    def rolling_kurtosis(self, *args, **kwargs):
        return MySeries(pd.rolling_kurtosis(self.x, *args, **kwargs))

    def rolling_window(self, *args, **kwargs):
        return MySeries(pd.rolling_window(self.x, *args, **kwargs))

    def cumprod(self, *args, **kwargs):
        return MySeries(self.x.cumprod(*args, **kwargs))

    def cumsum(self, *args, **kwargs):
        return MySeries(self.x.cumsum(*args, **kwargs))

    def diff(self, *args, **kwargs):
        return MySeries(self.x.diff(*args, **kwargs))

    def div(self, *args, **kwargs):
        return MySeries(self.x.div(*args, **kwargs))

    def mul(self, *args, **kwargs):
        return MySeries(self.x.mul(*args, **kwargs))

    def add(self, *args, **kwargs):
        return MySeries(self.x.add(*args, **kwargs))

    def dropna(self, *args, **kwargs):
        return MySeries(self.x.dropna(*args, **kwargs))
    
    def fillna(self, *args, **kwargs):
        return MySeries(self.x.fillna(*args, **kwargs))

    def floordiv(self, *args, **kwargs):
        return MySeries(self.x.floordiv(*args, **kwargs))

    def mod(self, *args, **kwargs):
        return MySeries(self.x.mod(*args, **kwargs))

    def nlargest(self, *args, **kwargs):
        return MySeries(self.x.nlargest(*args, **kwargs))

    def nonzero(self, *args, **kwargs):
        return MySeries(self.x.nonzero(*args, **kwargs))

    def nsmallest(self, *args, **kwargs):
        return MySeries(self.x.nsmallest(*args, **kwargs))

    def pow(self, *args, **kwargs):
        return MySeries(self.x.pow(*args, **kwargs))

    def rank(self, *args, **kwargs):
        return MySeries(self.x.rank(*args, **kwargs))

    def round(self, *args, **kwargs):
        return MySeries(self.x.round(*args, **kwargs))

    def shift(self, *args, **kwargs):
        return MySeries(self.x.shift(*args, **kwargs))

    def sub(self, *args, **kwargs):
        return MySeries(self.x.sub(*args, **kwargs))

    def abs(self, *args, **kwargs):
        return MySeries(self.x.abs(*args, **kwargs))

    def clip(self, *args, **kwargs):
        return MySeries(self.x.clip(*args, **kwargs))

    def clip_lower(self, *args, **kwargs):
        return MySeries(self.x.clip_lower(*args, **kwargs))

    def clip_upper(self, *args, **kwargs):
        return MySeries(self.x.clip_upper(*args, **kwargs))
    
    def interpolate(self, *args, **kwargs):
        return MySeries(self.x.interpolate(*args, **kwargs))

    def resample(self, *args, **kwargs):
        return MySeries(self.x.resample(*args, **kwargs))
        
    def replace(self, *args, **kwargs):
        return MySeries(self.x.replace(*args, **kwargs))

ser[:,'a'] # return all from primary index, but use secondary index = 'a'
ser[1,'a'] # returns value at index 1 (primary), 'a' (secondary)

df = ser.unstack() # converts hierarchical index series into dataframe with primary index as rows, and secondary index as columns

#combine_first() method
Series(np.where(pd.isnull(ser1),ser2,ser1), index = ['x','y','z','q','r','s']) #Series meets numpy where meets panda's isnull() method
# the above statement states: where ser1 values are NaN, use ser2 values, else use ser1 values
ser1.combine_first(ser2) #combine_first() does the same

df1.combine_first(df2) # does the same with dataframes. 

ser1.replace(1,10) # replace '1' in your series with '10'
ser1.replace(1,np.nan) # replace '1' in your series with NaN
ser1.replace([1,4],[100,400]) # replace value (1 and 4) with (100 and 400)
ser1.replace({4: 'clown' , 2: 'owl'}) # replace 4 with clown, and 2 with owl

###############################################################
###                   DataFrame basics                      ###
###############################################################

# the key method here is DataFrame()

import webbrowser