def make_x_axis(timestamp: pd.Series, interval: float):
    # print(timestamp.tail(1).values[0] - timestamp.head(1).values[0])
    if timestamp.tail(1).values[0] - timestamp.head(1).values[0] > 2:
        return np.arange(
            timestamp.head(1).values[0] + 1,
            timestamp.tail(1).values[0], interval)
    else:
        raise ValueError('too short time in eye')
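A minimal usage sketch (assuming the timestamps are numeric seconds and that numpy/pandas are imported as np/pd, as above; the sample values are made up):

import numpy as np
import pandas as pd

timestamps = pd.Series([0.0, 1.5, 3.0, 4.5, 6.0])
x_axis = make_x_axis(timestamps, interval=0.5)
print(x_axis)  # equivalent to np.arange(1.0, 6.0, 0.5)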
Example #2
    def describe_categorical_1d(series: pd.Series,
                                series_description: dict) -> dict:
        """Describe a categorical series.

        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.

        Returns:
            A dict containing calculated series description values.
        """
        # Make sure we deal with strings (Issue #100)
        series = series.astype(str)

        # Only run if at least 1 non-missing value
        value_counts = series_description["value_counts_without_nan"]

        stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}

        redact = config["vars"]["cat"]["redact"].get(float)
        if not redact:
            stats.update({"first_rows": series.head(5)})

        stats.update(
            histogram_compute(value_counts,
                              len(value_counts),
                              name="histogram_frequencies"))

        chi_squared_threshold = config["vars"]["num"][
            "chi_squared_threshold"].get(float)
        if chi_squared_threshold > 0.0:
            stats["chi_squared"] = list(chisquare(value_counts.values))

        check_length = config["vars"]["cat"]["length"].get(bool)
        if check_length:
            stats.update(length_summary(series))
            stats.update(
                histogram_compute(stats["length"],
                                  stats["length"].nunique(),
                                  name="histogram_length"))

        check_unicode = config["vars"]["cat"]["characters"].get(bool)
        if check_unicode:
            stats.update(unicode_summary(series))
            stats["n_characters_distinct"] = stats["n_characters"]
            stats["n_characters"] = stats["character_counts"].values.sum()

            stats["category_alias_counts"].index = stats[
                "category_alias_counts"].index.str.replace("_", " ")

        words = config["vars"]["cat"]["words"]
        if words:
            stats.update(word_summary(series))

        coerce_str_to_date = config["vars"]["cat"]["coerce_str_to_date"].get(
            bool)
        if coerce_str_to_date:
            stats["date_warning"] = warning_type_date(series)

        return stats
Example #3
def get_clamped_value_counts(value_counts: pd.Series,
                             max_categories_incl_other: int) -> pd.Series:
    # Returns a Series of a maximum length, where overflowing rows are
    # put into a "Others" category (index = OTHERS_GROUPED)
    # IMPORTANT: assuming value_counts is ALREADY SORTED
    if len(value_counts) <= max_categories_incl_other:
        categories_shown_as_is = len(value_counts)
    else:
        categories_shown_as_is = max_categories_incl_other - 1

    # Fix for #10
    # clamped_series = pd.Series(value_counts[0:categories_shown_as_is])
    clamped_series = pd.Series(value_counts.head(categories_shown_as_is))

    # Fix for #10
    num_in_tail = len(value_counts) - categories_shown_as_is
    # categories_in_other = value_counts[categories_shown_as_is:]
    categories_in_other = value_counts.tail(num_in_tail)

    if len(categories_in_other) > 0:
        total_in_other = sum(categories_in_other)
        other_series = pd.Series([total_in_other], index=[OTHERS_GROUPED])
        # Series.append was removed in pandas 2.0; pd.concat gives the same result
        clamped_series = pd.concat([clamped_series, other_series])

    return clamped_series
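A small usage sketch, assuming everything lives in one module where OTHERS_GROUPED is the label used for the overflow bucket and the input counts are already sorted in descending order (the values below are illustrative):

import pandas as pd

OTHERS_GROUPED = "Others"  # stand-in for the module-level constant
counts = pd.Series([50, 30, 10, 5, 5], index=["a", "b", "c", "d", "e"])
print(get_clamped_value_counts(counts, max_categories_incl_other=3))
# a         50
# b         30
# Others    20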
Example #4
def fit(
    X: pd.DataFrame,
    y: pd.Series,
    output_dir: str,
    class_order: Optional[List[str]] = None,
    row_weights: Optional[np.ndarray] = None,
    **kwargs,
):
    """
    This hook must be implemented with your fitting code, for running drum in the fit mode.

    This hook MUST ALWAYS be implemented for custom tasks.
    For inference models, this hook can stick around unimplemented, and won’t be triggered.

    Parameters
    ----------
    X: pd.DataFrame - training data to perform fit on
    y: pd.Series - target data to perform fit on
    output_dir: the path to write output. This is the path provided in '--output' parameter of the
        'drum fit' command.
    class_order : A two element long list dictating the order of classes which should be used for
        modeling. Class order will always be passed to fit by DataRobot for classification tasks,
        and never otherwise. When models predict, they output a likelihood of one class, with a
        value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order
        dictates that the first element in the list will be the 0 class, and the second will be the
        1 class.
    row_weights: An array of non-negative numeric values which can be used to dictate how important
        a row is. Row weights is only optionally used, and there will be no filtering for which
        custom models support this. There are two situations when values will be passed into
        row_weights, during smart downsampling and when weights are explicitly provided by the user
    kwargs: Added for forwards compatibility

    Returns
    -------
    Nothing
    """
    logging.info(y.head())
    # Feel free to delete whichever one of these you aren't using
    if class_order is not None:
        if y.dtype == np.dtype("bool"):
            y = y.astype("str")
        estimator = make_classifier(X)
    else:
        raise Exception(
            "Running multiclass estimator task: class_order expected to be not None"
        )
    estimator.fit(X, y)

    # You must serialize out your model to the output_dir given, however if you wish to change this
    # code, you will probably have to add a load_model method to read the serialized model back in
    # When prediction is done.
    # Check out this doc for more information on serialization:
    # https://github.com/datarobot/custom-model-templates/tree/master/custom_model_runner#python
    # NOTE: We currently set a 10GB limit to the size of the serialized model
    with open("{}/artifact.pkl".format(output_dir), "wb") as fp:
        pickle.dump(estimator, fp)
    with open("{}/class_labels.txt".format(output_dir), "wb") as fp:
        fp.write("\n".join(str(class_)
                           for class_ in estimator.classes_).encode("utf-8"))
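As the serialization comment above notes, reading the artifact back usually requires a matching load_model hook. A minimal sketch, assuming the drum convention of passing the directory that contains the artifact (check the custom model runner docs for the exact contract):

import os
import pickle

def load_model(code_dir: str):
    # Read back the estimator that fit() pickled as artifact.pkl
    with open(os.path.join(code_dir, "artifact.pkl"), "rb") as fp:
        return pickle.load(fp)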
def describe_categorical_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series, dict]:
    """Describe a categorical series.

    Args:
        series: The Series to describe.
        summary: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """

    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    # Only run if at least 1 non-missing value
    value_counts = summary["value_counts_without_nan"]

    summary.update(
        histogram_compute(
            value_counts, summary["n_distinct"], name="histogram_frequencies"
        )
    )

    redact = config["vars"]["cat"]["redact"].get(float)
    if not redact:
        summary.update({"first_rows": series.head(5)})

    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
    if chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(histogram=value_counts.values)

    check_length = config["vars"]["cat"]["length"].get(bool)
    if check_length:
        summary.update(length_summary(series))
        summary.update(
            histogram_compute(
                summary["length"], summary["length"].nunique(), name="histogram_length"
            )
        )

    check_unicode = config["vars"]["cat"]["characters"].get(bool)
    if check_unicode:
        summary.update(unicode_summary(series))
        summary["n_characters_distinct"] = summary["n_characters"]
        summary["n_characters"] = summary["character_counts"].values.sum()

        try:
            summary["category_alias_counts"].index = summary[
                "category_alias_counts"
            ].index.str.replace("_", " ")
        except AttributeError:
            pass

    words = config["vars"]["cat"]["words"]
    if words:
        summary.update(word_summary(series))

    return series, summary
Example #6
def choose_predict_class(predicted_classes: pd.Series) -> int:
    # Print error message if most frequently predicted classes have the same frequency
    if len(predicted_classes) > 1:
        if predicted_classes.iloc[0] == predicted_classes.iloc[1]:
            print(
                "Prediction ambiguous: At least two classes appear equally often as nearest neighbors."
            )

    predicted = predicted_classes.head(1).index
    return predicted[0].astype(int)
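A quick usage sketch: the argument is expected to be a value_counts-style Series (class label in the index, frequency as the value), already sorted by frequency in descending order.

import pandas as pd

neighbor_votes = pd.Series([3, 3, 1], index=[2, 7, 5])  # class label -> count among nearest neighbors
print(choose_predict_class(neighbor_votes))
# prints the ambiguity warning (classes 2 and 7 are tied) and returns 2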
    def number_of_access(self, corp_name):
        # Number of accesses per day
        number_of_access_per_day = self.df["日付"].value_counts()

        series = Series(number_of_access_per_day)
        print(series.head(20))

        # Sort by date
        series_sort = series.sort_index()
        series_sort.plot()
        plt.show()
Example #8
def _contains_instance_attrs(series: pd.Series,
                             is_method,
                             class_name: str,
                             attrs: list,
                             sample_size: int = 1) -> bool:
    # TODO: user configurable .head or .sample
    # TODO: performance testing for series[0], series.iloc[0], series.head, series.sample
    if not all(is_method(x, class_name) for x in series.head(sample_size)):
        return False

    try:
        return all(all(hasattr(x, attr) for attr in attrs) for x in series)
    except AttributeError:
        return False
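An illustration with a hypothetical is_method callback (the helper and the Point class below are placeholders, not part of the original module):

import pandas as pd

def is_named_instance(obj, class_name: str) -> bool:
    return type(obj).__name__ == class_name

class Point:
    def __init__(self, x, y):
        self.x, self.y = x, y

points = pd.Series([Point(0, 0), Point(1, 2)])
print(_contains_instance_attrs(points, is_named_instance, "Point", ["x", "y"]))  # True
print(_contains_instance_attrs(points, is_named_instance, "Point", ["z"]))       # False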
Example #9
def pandas_describe_categorical_1d(
        config: Settings, series: pd.Series,
        summary: dict) -> Tuple[Settings, pd.Series, dict]:
    """Describe a categorical series.

    Args:
        config: report Settings object
        series: The Series to describe.
        summary: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """

    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    # Only run if at least 1 non-missing value
    value_counts = summary["value_counts_without_nan"]
    value_counts.index = value_counts.index.astype(str)

    redact = config.vars.cat.redact
    if not redact:
        summary.update({"first_rows": series.head(5)})

    chi_squared_threshold = config.vars.num.chi_squared_threshold
    if chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(histogram=value_counts.values)

    if config.vars.cat.length:
        summary.update(length_summary_vc(value_counts))
        summary.update(
            histogram_compute(
                config,
                summary["length_histogram"].index.values,
                len(summary["length_histogram"]),
                name="histogram_length",
                weights=summary["length_histogram"].values,
            ))

    if config.vars.cat.characters:
        summary.update(unicode_summary_vc(value_counts))

    if config.vars.cat.words:
        summary.update(
            word_summary_vc(value_counts, config.vars.cat.stop_words))

    return config, series, summary
Example #10
def get_unclaimed_list(sources: pd.Series) -> str:
    """Get a list of the posts grouped by sources."""
    items = [
        i18n["queue"]["unclaimed_list_entry"].format(count=count, source=source)
        for source, count in sources.head(5).items()  # iteritems() was removed in pandas 2.0
    ]
    result = "\n".join(items)

    if len(sources) > 5:
        rest = sources[5:]
        source_count = len(rest)
        post_count = rest.sum()
        result += "\n" + i18n["queue"]["unclaimed_list_others"].format(
            post_count=post_count, source_count=source_count
        )

    return result
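A hedged sketch of calling the function with stand-in translation strings, assuming everything is defined in the same module; the real format strings for unclaimed_list_entry and unclaimed_list_others live in the project's i18n files.

import pandas as pd

i18n = {
    "queue": {
        "unclaimed_list_entry": "{source}: {count} posts",  # placeholder text
        "unclaimed_list_others": "... and {post_count} more posts from {source_count} other sources",
    }
}

sources = pd.Series(
    [12, 9, 7, 4, 3, 2],
    index=["source_a", "source_b", "source_c", "source_d", "source_e", "source_f"],
)
print(get_unclaimed_list(sources))  # five entries plus a summary line for the rest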
    def encode_X(self, X: pd.Series):
        if self.tokenizer is None:
            self.logger.error(
                "Please initial the embedding by Word_Embedding().init_embedding_layer first"
            )
            return None

        # TODO: fix me, this shouldn't happen!!
        X = X.where((pd.notnull(X)), '')
        self.logger.info("X.head={}".format(X.head(5)))
        self.logger.info("X.shape={}".format(X.shape))
        self.logger.info("X.values.shape={}".format(X.values.shape))
        X = X.values.ravel()
        X = self.tokenizer.texts_to_sequences(X)
        self.logger.info("sequance X {}".format(X))
        X = sequence.pad_sequences(X, maxlen=self.max_text_len)
        self.logger.info("padding X {}".format(X))
        return X
Example #12
    def test_update_times_mean(self, obj, nogil, parallel, nopython, adjust,
                               ignore_na, halflife_with_times):
        times = Series(
            np.array(
                [
                    "2020-01-01", "2020-01-05", "2020-01-07", "2020-01-17",
                    "2020-01-21"
                ],
                dtype="datetime64",
            ))
        expected = obj.ewm(
            0.5,
            adjust=adjust,
            ignore_na=ignore_na,
            times=times,
            halflife=halflife_with_times,
        ).mean()

        engine_kwargs = {
            "nogil": nogil,
            "parallel": parallel,
            "nopython": nopython
        }
        online_ewm = (obj.head(2).ewm(
            0.5,
            adjust=adjust,
            ignore_na=ignore_na,
            times=times.head(2),
            halflife=halflife_with_times,
        ).online(engine_kwargs=engine_kwargs))
        # Test resetting once
        for _ in range(2):
            result = online_ewm.mean()
            tm.assert_equal(result, expected.head(2))

            result = online_ewm.mean(update=obj.tail(3),
                                     update_times=times.tail(3))
            tm.assert_equal(result, expected.tail(3))

            online_ewm.reset()
from utils import util

dateList = [1, 3, 5, 6, 8]
db = {"No.1": "Wo", "No.2": "Shi", "No.3": "Ni", "No.4": "Da", "No.5": "Ye"}
# Creating a Series by passing a list of values and an explicit index
s = Series(dateList, index=["A", "B", "C", "D", "E"])  # the index must be the same length as the list; otherwise it defaults to [0, ..., len(data) - 1]
util.report_tag("Series: handling list data")
print("Series data structure is \n", s)
print("index is ", s.index)
print("values is ", s.values)
print("the first element is ", s[0])
print("0~3 elements are \n", s[:3])
print(">3 elements are \n", s[s > 3])
# print("s[6] would raise an error", s[6])
print("s.get(6) returns None:", s.get(6))
print("first 2 rows\n", s.head(2))
print("last 2 rows\n", s.tail(2))

util.report_tag("Series: handling dict data")
s = Series(db)
print("Series data structure is \n", s)
# A Series is much like a dict: you can read values with s[index] and test whether an index exists.
# Reading a missing index with s[index] raises KeyError, while s.get(index) does not; s.get(index, "default") falls back to a default value.
print("No.1 is ", s['No.1'])
print("No.1 exists:", 'No.1' in s)
print("No.1 & No.2 are \n", s[['No.1', 'No.2']])

util.report_tag("Series: filter queries")
s = Series(dateList)
print("data > 5 is \n", s[s > 5])
Example #14
    store_ids = X_test["Id"]
    X_test.drop(["Id", "Store"], axis=1, inplace=True)
    print('X_train:\n', X_train)
    print('Y_train:\n', Y_train)
    print('X_test:\n', X_test)
    # Linear Regression
    lreg = LinearRegression()
    lreg.fit(X_train, Y_train)
    Y_pred = lreg.predict(X_test)
    print('Y_pred:\n', Y_pred)
    scores.append(lreg.score(X_train, Y_train))

    # Xgboost
    # params = {"objective": "reg:linear",  "max_depth": 10}
    # T_train_xgb = xgb.DMatrix(X_train, Y_train)
    # X_test_xgb  = xgb.DMatrix(X_test)
    # gbm = xgb.train(params, T_train_xgb, 100)
    # Y_pred = gbm.predict(X_test_xgb)

    # append predicted values of current store to submission
    submission = pd.concat([submission, Series(Y_pred, index=store_ids)])  # Series.append was removed in pandas 2.0

# append rows(store,date) that were closed, and assign their sales value to 0
submission = pd.concat([submission, Series(0, index=closed_store_ids)])
print('submission:\n', submission.head())
# save to csv file
submission = pd.DataFrame({"Id": submission.index, "Sales": submission.values})
submission.head()
print('scores:\n', scores)
# submission.to_csv('rossmann.csv', index=False)
df
s = Series(np.arange(10,14),index=list('abcd'))
s
df+s # Broadcasting happens along the columns.
df.add(s2, axis=0) # If you specify an axis, broadcasting happens along that axis.



# 3. Lambda, map, apply -> this part is fun and quite practical!
# Very easy to apply in pandas
import pandas as pd
import numpy as np
from pandas import Series

s1 = Series(np.arange(10))
s1.head(5)
s1.map(lambda x: x**2).head(5)

# Replacing values this way is quite easy.
z = {1: 'A', 2: 'B', 3: 'C'}
s1.map(z).head(5)
s1 = Series(np.arange(10))
s1
s2=Series(np.arange(10,20))
s2
s1.map(s2) # Series can be mapped onto each other like this


# As in the example below, sex can be one-hot encoded by mapping with a dict
df = pd.read_csv("C:/djangocym/study_2018/lab_bla/data/wages.csv")
df.head()
# Create a DataFrame with 3 columns to hold this hypothetical data
M = 500
df = DataFrame(
    {
        'Momentum': np.random.randn(M) / 200 + 0.03,
        'Value': np.random.randn(M) / 200 + 0.08,
        'ShortInterest': np.random.randn(M) / 200 - 0.02
    },
    index=tickers[:M])
print(df.head())

# Randomly assign each company to an industry: FINANCIAL or TECH
ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N)
industries = Series(ind_names[sampler], index=tickers, name='industry')
print(industries.head())
# Now we can group by industry and run grouped aggregations and transformations
by_industry = df.groupby(industries)
# compute the group means
print(by_industry.mean())
print(by_industry.describe())


# Within-industry standardization
def zscore(group):
    # subtract the group mean from each stock, then divide by the group standard deviation
    return (group - group.mean()) / group.std()


# After this transform, each industry has mean 0 and standard deviation 1
df_stand = by_industry.apply(zscore)
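A quick check of the claim above (each industry should come out with mean approximately 0 and standard deviation approximately 1):

print(df_stand.groupby(industries).agg(["mean", "std"]).round(2))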
# In[46]:

data.head()

# In[48]:

data.infl.plot()

# ### Resampling and frequency conversion

# In[2]:

rng = pd.date_range('3/4/2018', periods=100, freq='D')
ts = Series(np.random.randn(100), index=rng)
ts.head()

# In[5]:

ts.resample('M').mean()

# In[7]:

# ts.resample('M',kind = 'period').mean()
ts.resample('M', kind='timestamp').mean()

# In[13]:

rng = pd.date_range('1/1/2000', periods=12, freq='T')
ts = Series(np.arange(12), index=rng)
ts
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series,DataFrame


print("==============Series绘图=================")
s1=Series(np.random.randn(1000)).cumsum()               #相当于reduce操作
s2=Series(np.random.randn(1000)).cumsum()
print(s1.head(10))

'''Plot multiple lines in one figure'''
s1.plot(label='s1')
s2.plot(label='s2')
plt.legend()
plt.show()

'''Subplots: draw several chart types in one figure'''
fig,ax=plt.subplots(2,1)
s1[0:10].plot(ax=ax[0],label='s1',kind='bar')           # bar chart
s2.plot(ax=ax[1],label='s2')
plt.legend()
plt.show()

print("==============DataFrame绘图=================")
df=DataFrame(np.random.randint(1,10,40).reshape(10,-1),columns=['A','B','C','D'])
print(df.head())

df.plot(kind='barh',stacked=True)                       # horizontal stacked bar chart
plt.show()
Example #19
fields = ['rpt_key', 'updated_at']

series = series[series.rpt_key == 'btc_krw']
drop_column = ['date_id', 'datetime_id', 'market','diff_24h','diff_per_24h','bid','ask','low','high', 'volume', 'updated_at','rpt_key']
series = series.drop(drop_column, axis = 1)
series_super = timeseries_to_superviser(series)
print(series.head())
#series.plot()
#pyplot.show()

def difference(dataset, interval=1):
	diff = list()
	for i in range(interval, len(dataset)):
		value = dataset[i] - dataset[i - interval]
		diff.append(value)
	return Series(diff)

# invert differenced value
def inverse_difference(history, yhat, interval=1):
	return yhat + history[-interval]

differenced = difference(series.values, 1)
print(differenced.head())

inverted = list()
for i in range(len(differenced)):
    value = inverse_difference(series, differenced[i], len(series)-i)
    inverted.append(value)
inverted = Series(inverted)
print(inverted.head())
    elif "png" in str(url):
        ana_df_2=ana_df_2.drop(i)
    elif "gif" in str(url):
        ana_df_2=ana_df_2.drop(i)
    elif "css" in str(url):
        ana_df_2=ana_df_2.drop(i)
    elif "js" in str(url):
        ana_df_2=ana_df_2.drop(i)
    elif "ico" in str(url):
        ana_df_2=ana_df_2.drop(i)
    elif "xml" in str(url):
        ana_df_2=ana_df_2.drop(i)
    elif "tmpl" in str(url):
        ana_df_2=ana_df_2.drop(i)

# Combine the two data sets
ana_df=pd.concat([ana_df_1,ana_df_2])
# Number of accesses per day
number_of_access_per_day = ana_df["日付"].value_counts()

series=Series(number_of_access_per_day)

# Sort by date
series_sort=series.sort_index()
series_sort.plot()

ana_df.to_excel("ana_access_log.xlsx")

print(series.head(20))
plt.show()
Example #21
os.chdir('data/')
total_data = DataFrame([])
total_labs = []
for d in os.listdir(os.curdir):
    if d.startswith('tctodd'):
        for f in os.listdir(d):
            loc = f.rfind('-')
            word = f[:loc]
            num = int(f[loc + 1:loc + 2])
            f = os.getcwd() + '/' + d + '/' + f
            with open(f, 'r') as fp:
                temp = read_csv(f, delimiter='\t', header=None)
                temp = temp.replace([None, np.nan, np.inf], 0)  # assign the result; replace() is not in-place

                if len(temp) > 100:
                    temp = Series(temp.head(100).to_numpy().flatten())  # as_matrix() was removed from pandas
                else:
                    zeroes = DataFrame(np.zeros((100 - len(temp), 22)))
                    temp = Series(concat([temp, zeroes]).to_numpy().flatten())
                    #temp = temp.append(zeroes, ignore_index=True).as_matrix().flatten()
                    # Fill up to 100 values

                total_data = concat([total_data, temp.to_frame().T], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
                #total_data = total_data.append(temp, ignore_index=True)
                total_labs.append(word)
                if num == 3:
                    testing[word] = temp
                else:
                    training[word].append(temp)
os.chdir('../')
def describe_categorical_1d(config: Settings, series: pd.Series,
                            summary: dict) -> Tuple[Settings, pd.Series, dict]:
    """Describe a categorical series.

    Args:
        config: report Settings
        series: The Series to describe.
        summary: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """

    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    # Only run if at least 1 non-missing value
    value_counts = summary["value_counts_without_nan"]
    histogram_largest = config.vars.cat.histogram_largest
    histogram_data = value_counts
    if histogram_largest > 0:
        histogram_data = histogram_data.nlargest(histogram_largest)

    summary.update(
        histogram_compute(
            config,
            histogram_data,
            summary["n_distinct"],
            name="histogram_frequencies",
        ))

    redact = config.vars.cat.redact
    if not redact:
        summary.update({"first_rows": series.head(5)})

    chi_squared_threshold = config.vars.num.chi_squared_threshold
    if chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(histogram=value_counts.values)

    if config.vars.cat.length:
        summary.update(length_summary(series))
        summary.update(
            histogram_compute(
                config,
                summary["length"],
                summary["length"].nunique(),
                name="histogram_length",
            ))

    if config.vars.cat.characters:
        summary.update(unicode_summary(series))
        summary["n_characters_distinct"] = summary["n_characters"]
        summary["n_characters"] = summary["character_counts"].values.sum()

        with contextlib.suppress(AttributeError):
            summary["category_alias_counts"].index = summary[
                "category_alias_counts"].index.str.replace("_", " ")

    if config.vars.cat.words:
        summary.update(word_summary(series))

    return config, series, summary
Example #23
print(s.loc['a':'f'])
print(s.loc[::-1])
print(s.iloc[::-2])

# Series basics
# A Series can be thought of as a fixed-length, ordered dict
print(s.shape)
print(s.size)
print(s.index)
# print(s.values)
values = s.values
print(type(values))  # <class 'numpy.ndarray'>

# head() and tail() give a quick look at a Series object
# With large data, head()/tail() show the first/last few rows: s.head/tail(n), where n defaults to 5
print(s.head())
print(s.tail())

# When an index has no corresponding value, the missing data shows up as NaN (Not a Number)
s[['d', 'f']] = np.nan
print(s)
'''
a    65.0
b    70.0
c    59.0
d     NaN
e    85.0
f     NaN
g    90.0
h    98.0
i    99.0
# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import pandas as pd

s=Series([1,2,3],index=['a','b','c'])
d=DataFrame([[1,2,3],[4,5,6]],columns=['a','b','c'])

#head() method will return top 5 records
print (s.head())
print (s.describe())
print (d.head())
print (d.describe())

#read data from excel file
excel_data=pd.read_excel("./server.xlsx")
print (excel_data.head())
from pandas import read_csv
from datetime import datetime  # pandas no longer re-exports datetime
from pandas import Series
from sklearn.preprocessing import MinMaxScaler


# load data
def parser(x):
    return datetime.strptime(x, '%Y/%m/%d')


series = read_csv('data_set/shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True,
                  date_parser=parser)
print (series.head())

# Scaling maps the numbers in an array into the range [-1, 1]: the largest value becomes 1, the smallest becomes -1,
# and the remaining values are placed proportionally within [-1, 1]
# scaling
X = series.values
X = X.reshape(len(X), 1)  # MinMaxScaler expects a 2-D array, so reshape the 1-D data into an n-row, 1-column matrix
scaler = MinMaxScaler(feature_range=(-1, 1))  # define the scaling range: data will be scaled to [-1, 1]
scaler = scaler.fit(X)  # fit the scaler to the data
scalered_X = scaler.transform(X)  # transform into a matrix of values in [-1, 1]
scalered_series = Series(scalered_X[:, 0])  # turn the matrix column back into a Series
print (scalered_series.head())

# Inverse scaling: apply the transform in reverse to get the original values back
inverted_X = scaler.inverse_transform(scalered_X)  # convert the [-1, 1] matrix back to the original scale
inverted_series = Series(inverted_X[:, 0])  # convert the matrix column back to a Series
print (inverted_series.head())
'''
plot data

Series
'''
data = Series(np.random.randn(1000),index=np.arange(1000))
data = data.cumsum()


'''
DataFrame
'''
data = DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('ABCD'))
data = data.cumsum()
print(data.head())

#data.plot()
ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class 1')
data.plot.scatter(x='A',y='C',color='DarkGreen',label='Class 2',ax=ax)
plt.pyplot.show()

'''
plot methods:
    'bar','hist','box','kde','area','scatter','hexbin','pie'
'''




Example #27
print("\n=======================================================")
print("Series_DataFrame_operation")
print("=======================================================")
df = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd"))
s = Series(np.arange(10,14), index=list("abcd"))
print(df+s)
# broadcasting occurs along the columns

print("\n=======================================================")
print("map_apply_lambda")
print("=======================================================")
# map for series
# The map function can also be used on pandas Series data
# A dict or a sequence can be used in place of a function
s1 = Series(np.arange(10)) # 0~9
print(s1.head(5))
print("\n")
print(s1.map(lambda x: x**2).head(5))
print("\n")
z = {1: 'A', 2: 'B', 3: 'C'}
print(s1.map(z).head(5)) # keys not in the dict become NaN
print("\n")
s2 = Series(np.arange(10,20)) # 10~19
print(s1.map(s2)) # matched by index label
print("\n=======================================================\n")

df = pd.read_csv("./data/wages.csv")
print(df.head(5))
print("\n")
print(df.sex.unique()) # unique(): returns the unique values of the series as an array: ['male' 'female']
print("\n")
Example #28
#split columns of dataframe and make col_n the column indexes
temps = pd.DataFrame(list(temps.col.str.split()), columns=col_n[0])
#drop the duplicate column name row
temps = temps.drop(temps.index[0])
#this would strip white space, but I think it's unnecessary: temps.apply(lambda x: x.str.strip())
#Change Ms to missing values
import numpy as np
temps.replace('M', np.nan, inplace=True)
#create a column with TX vs TN, change MO so is actually Month
temps['Lvl'] = Series(temps['MO']).str[-2:]
temps['MO'] = Series(temps['MO']).str[:-2]
temps['YRMO'] = Series(temps['YR']+temps['MO'])
#make year and month indexes (Q: Added Lvl as well does this shape make sense?)
temps = temps.set_index(['YR','MO', 'YRMO','Lvl']) #Q: added in YRMO so can group and plot, but must be a way to do this with the hierarchical indexing
temps = temps.stack().unstack(['Lvl'])
#adding name to day index
temps.head(100)
temps.index.names = ['YR','MO', 'YRMO', 'DAY']
#convert TX and TN to numbers
temps = temps.apply(pd.to_numeric, errors='coerce')  # convert_objects() was removed from pandas
#grouping
yrmo_grouped = temps.groupby(level=(['YRMO'])).mean() #Q: really don't think that should need YRMO
yr_grouped = temps.groupby(level=(['YR'])).mean()

#Let's try graphing!
import matplotlib.pyplot as plt
yrmo_grouped.plot() #this is pandas plot which is a wrapper on plt.plot()
yr_grouped.plot()
#you can do rolling averages!
temps.rolling(1000).sum().plot()  # pd.rolling_sum() was removed; use the .rolling() API
#test
s3["a":"e"]  # Series的切片
s3.iloc[0:2]  # 输出第一行和第二行
'''3.Series的基本概念'''
# 3.1 series的基本属性
s = Series(data=np.random.randint(0, 150, size=4),
           index=['语文', '数学', '英语', 'Python'])
print(s.shape)  # (4,) means one-dimensional
print(s.size)
print(s.values)
print(s.index)

# 3.2 head() and tail() give a quick look at a Series object
s = Series(data=np.random.randint(0, 150, size=10))
new_index = pd.date_range('20160101', periods=len(s), freq='D')
s.index = new_index  # assign the new index to s
s.head()  # quick look at the first five
s.tail()  # quick look at the last five
s.head(3)  # quick look at the first three
s.tail(3)  # quick look at the last three

# 3.3 Detecting missing data  !!!
s = Series(data={
    "a": 10,
    "b": 20,
    "c": 30
}, index=list("abcd"))  # 当索引没有对应的值时,可能出现缺失数据显示为NaN
print(s)
print(s[3])

# Use pd.isnull(), pd.notnull(), or the built-in isnull()/notnull() methods to detect missing data
pd.isnull(s)  # returns True for missing data, False otherwise
Example #30
data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10]

# In[ ]:

data['CATEGORY'][:6]

# In[ ]:

data.describe()

# In[ ]:

data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &
            (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
            & data.CATEGORY.notnull()]
data.head(5)
data.info()

# In[ ]:


def to_cat_list(catstr):
    stripped = (x.strip() for x in catstr.split(','))
    return [x for x in stripped if x]


def get_all_categories(cat_series):
    cat_sets = (set(to_cat_list(x)) for x in cat_series)
    return sorted(set.union(*cat_sets))

Example #31
def return_head(ser: pd.Series, num: int) -> pd.core.series.Series:
    """Return the first num elements of the given Series.
    """
    return ser.head(num)
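For example:

import pandas as pd

print(return_head(pd.Series([10, 20, 30, 40]), 2))
# 0    10
# 1    20
# dtype: int64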
import string
string.ascii_lowercase, string.ascii_uppercase

# <codecell>

# we can make a list composed of the individual lowercase letters

list(string.ascii_lowercase)

# <codecell>

# create a pandas Series out of the list of lowercase letters

lower = Series(list(string.ascii_lowercase), name='lower')
print(type(lower))
lower.head()

# <codecell>

# create a pandas Series out of the list of uppercase letters

upper = Series(list(string.ascii_uppercase), name='upper')

# <codecell>

# concatenate the two Series as columns, using axis=1 
# axis = 0 would result in two rows in the DataFrame

df = pd.concat((lower, upper), axis=1)
df.head()
Example #33
    return Series(diff)


# invert differenced value
def inverse_difference(history, yhat, interval=1):
    return yhat + history[-interval]


# load dataset
def parser(x):
    return datetime.strptime('190' + x, '%Y-%m')


series = read_csv('shampoo-sales.csv',
                  header=0,
                  parse_dates=[0],
                  index_col=0,
                  squeeze=True,
                  date_parser=parser)
print(series.head())
# transform to be stationary
differenced = difference(series, 1)
print(differenced.head())
# invert transform
inverted = list()
for i in range(len(differenced)):
    value = inverse_difference(series, differenced[i], len(series) - i)
    inverted.append(value)
inverted = Series(inverted)
print(inverted.head())
Example #35
    def _resample(
        series: pd.Series,
        resampling_startpoint: datetime,
        resampling_endpoint: datetime,
        resolution: str,
        aggregation_methods: Union[str, List[str], Callable] = "mean",
        interpolation_method: str = "linear_interpolation",
        interpolation_limit: str = "8H",
    ):
        """
        Takes a single series and resamples it.
        See :class:`gordo.machine.dataset.base.GordoBaseDataset.join_timeseries`
        """

        startpoint_sametz = resampling_startpoint.astimezone(
            tz=series.index[0].tzinfo)
        endpoint_sametz = resampling_endpoint.astimezone(
            tz=series.index[0].tzinfo)

        if series.index[0] > startpoint_sametz:
            # Insert a NaN at the startpoint, to make sure that all resampled
            # indexes are the same. This approach will "pad" most frames with
            # NaNs, that will be removed at the end.
            startpoint = pd.Series([np.NaN],
                                   index=[startpoint_sametz],
                                   name=series.name)
            series = pd.concat([startpoint, series])  # Series.append was removed in pandas 2.0
            logging.debug(f"Appending NaN to {series.name} "
                          f"at time {startpoint_sametz}")

        elif series.index[0] < resampling_startpoint:
            msg = (f"Error - for {series.name}, first timestamp "
                   f"{series.index[0]} is before the resampling start point "
                   f"{startpoint_sametz}")
            logging.error(msg)
            raise RuntimeError(msg)

        if series.index[-1] < endpoint_sametz:
            endpoint = pd.Series([np.NaN],
                                 index=[endpoint_sametz],
                                 name=series.name)
            series = pd.concat([series, endpoint])
            logging.debug(f"Appending NaN to {series.name} "
                          f"at time {endpoint_sametz}")
        elif series.index[-1] > endpoint_sametz:
            msg = (
                f"Error - for {series.name}, last timestamp "
                f"{series.index[-1]} is later than the resampling end point "
                f"{endpoint_sametz}")
            logging.error(msg)
            raise RuntimeError(msg)

        logging.debug("Head (3) and tail(3) of dataframe to be resampled:")
        logging.debug(series.head(3))
        logging.debug(series.tail(3))

        resampled = series.resample(resolution,
                                    label="left").agg(aggregation_methods)
        # If several aggregation methods are provided, agg returns a dataframe
        # instead of a series. In this dataframe the column names are the
        # aggregation methods, like "max" and "mean", so we have to make a
        # multi-index with the series-name as the top-level and the
        # aggregation-method as the lower-level index.
        # For backwards-compatibility we *dont* return a multi-level index
        # when we have a single resampling method.
        if isinstance(resampled,
                      pd.DataFrame):  # Several aggregation methods provided
            resampled.columns = pd.MultiIndex.from_product(
                [[series.name], resampled.columns],
                names=["tag", "aggregation_method"])

        if interpolation_method not in ["linear_interpolation", "ffill"]:
            raise ValueError(
                "Interpolation method should be either linear_interpolation of ffill"
            )

        if interpolation_limit is not None:
            limit = int(
                pd.Timedelta(interpolation_limit).total_seconds() /
                pd.Timedelta(resolution).total_seconds())

            if limit <= 0:
                raise ValueError(
                    "Interpolation limit must be larger than given resolution")
        else:
            limit = None

        if interpolation_method == "linear_interpolation":
            return resampled.interpolate(limit=limit).dropna()

        else:
            return resampled.fillna(method=interpolation_method,
                                    limit=limit).dropna()
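The interpolation_limit handling above converts a duration into a number of resampled periods; a quick worked example of that computation (the values are illustrative):

import pandas as pd

# an 8-hour limit at 10-minute resolution allows up to 48 consecutive gaps to be filled
limit = int(pd.Timedelta("8h").total_seconds() / pd.Timedelta("10min").total_seconds())
print(limit)  # 48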
Example #36
series = series[series.rpt_key == 'btc_krw']
drop_column = ['date_id', 'datetime_id', 'market','diff_24h','diff_per_24h','bid','ask','low','high', 'volume', 'updated_at','rpt_key']
series = series.drop(drop_column, axis = 1)
print(series.head())
#series.plot()
#pyplot.show()

from sklearn import preprocessing

X = series.values
X = X.reshape(len(X),1)
scaler = preprocessing.MinMaxScaler(feature_range = (-1,1))
scaler = scaler.fit(X)
scaled_X = scaler.transform(X)

inverted_X = scaler.inverse_transform(scaled_X)
inverted_series = Series(inverted_X[:, 0])
print(inverted_series.head())

train, test = X[0:round(len(X)*0.1)], X[-round(len(X)*0.1):]

X,y = train[:,0:-1], train[:, -1]
X = X.reshape(X.shape[0], 1, X.shape[1])

layer = LSTM(neurous, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True)

model = Sequential()
model.add(layer)
model.add(Dense(1))
model.compile(loss = 'mean_squared_error', optimizer = 'adam')