def make_x_axis(timestamp: pd.Series, interval: float):
    """Build an evenly spaced x-axis between the first and last timestamp.

    Args:
        timestamp: Series of numeric timestamps; only its first and last
            values are used.
        interval: Step between consecutive axis values.

    Returns:
        ``np.arange(first + 1, last, interval)``.

    Raises:
        ValueError: If ``timestamp`` is empty or ``last - first`` is not
            greater than 2.
    """
    # An empty series used to surface as an unrelated IndexError.
    if timestamp.empty:
        raise ValueError('too short time in eye')

    first = timestamp.iloc[0]
    last = timestamp.iloc[-1]
    # "not (span > 2)" keeps the original handling of NaN spans (they raise).
    if not last - first > 2:
        raise ValueError('too short time in eye')
    return np.arange(first + 1, last, interval)
def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict:
    """Compute summary statistics for a categorical series.

    Args:
        series: The Series to describe.
        series_description: The description computed so far; must contain
            ``"value_counts_without_nan"``.

    Returns:
        A dict with the calculated description values. Which optional
        entries are present depends on the ``config["vars"]`` switches.
    """
    # Work on the string representation of every value (Issue #100).
    series = series.astype(str)
    cat_config = config["vars"]["cat"]

    counts = series_description["value_counts_without_nan"]
    stats = {"top": counts.index[0], "freq": counts.iloc[0]}

    # Sample rows are only exposed when redaction is off.
    if not cat_config["redact"].get(float):
        stats["first_rows"] = series.head(5)

    stats.update(
        histogram_compute(counts, len(counts), name="histogram_frequencies"))

    if config["vars"]["num"]["chi_squared_threshold"].get(float) > 0.0:
        stats["chi_squared"] = list(chisquare(counts.values))

    if cat_config["length"].get(bool):
        stats.update(length_summary(series))
        lengths = stats["length"]
        stats.update(
            histogram_compute(lengths, lengths.nunique(),
                              name="histogram_length"))

    if cat_config["characters"].get(bool):
        stats.update(unicode_summary(series))
        # "n_characters" first holds the distinct count; keep it under its
        # own key before overwriting it with the total count.
        stats["n_characters_distinct"] = stats["n_characters"]
        stats["n_characters"] = stats["character_counts"].values.sum()
        alias_counts = stats["category_alias_counts"]
        alias_counts.index = alias_counts.index.str.replace("_", " ")

    if cat_config["words"]:
        stats.update(word_summary(series))

    if cat_config["coerce_str_to_date"].get(bool):
        stats["date_warning"] = warning_type_date(series)

    return stats
def get_clamped_value_counts(value_counts: pd.Series,
                             max_categories_incl_other: int) -> pd.Series:
    """Clamp a value-counts Series to a maximum number of rows.

    Rows that do not fit are summed into a single extra row whose index
    label is ``OTHERS_GROUPED``.

    IMPORTANT: assumes ``value_counts`` is ALREADY SORTED.

    Args:
        value_counts: Counts per category.
        max_categories_incl_other: Maximum number of rows in the result,
            counting the grouped "others" row.

    Returns:
        The clamped Series.
    """
    if len(value_counts) <= max_categories_incl_other:
        categories_shown_as_is = len(value_counts)
    else:
        # Reserve one row for the grouped remainder.
        categories_shown_as_is = max_categories_incl_other - 1

    # head()/tail() are positional, unlike [] slicing (fix for #10).
    clamped_series = pd.Series(value_counts.head(categories_shown_as_is))
    num_in_tail = len(value_counts) - categories_shown_as_is
    categories_in_other = value_counts.tail(num_in_tail)

    if len(categories_in_other) > 0:
        other_series = pd.Series([categories_in_other.sum()],
                                 index=[OTHERS_GROUPED])
        # Series.append was removed in pandas 2.0; concat is the replacement.
        clamped_series = pd.concat([clamped_series, other_series])
    return clamped_series
def fit(
    X: pd.DataFrame,
    y: pd.Series,
    output_dir: str,
    class_order: Optional[List[str]] = None,
    row_weights: Optional[np.ndarray] = None,
    **kwargs,
):
    """
    Fit hook for running drum in fit mode.

    This hook MUST ALWAYS be implemented for custom tasks. For inference
    models it can stay unimplemented and will not be triggered.

    Parameters
    ----------
    X: pd.DataFrame - training data to perform fit on
    y: pd.Series - target data to perform fit on
    output_dir: the path to write output. This is the path provided in the
        '--output' parameter of the 'drum fit' command.
    class_order: The order of classes to use for modeling. DataRobot always
        passes it for classification tasks and never otherwise. This
        implementation requires it.
    row_weights: An array of non-negative numeric values expressing how
        important each row is. Values are passed during smart downsampling
        and when the user supplies weights explicitly. Not used here.
    kwargs: Added for forwards compatibility

    Returns
    -------
    Nothing

    Raises
    ------
    Exception
        If ``class_order`` is None.
    """
    logging.info(y.head())

    # This task only supports classification, which always gets a class order.
    if class_order is None:
        raise Exception(
            "Running multiclass estimator task: class_order expected to be not None"
        )

    # Boolean targets are turned into strings before fitting.
    if y.dtype == np.dtype("bool"):
        y = y.astype("str")
    model = make_classifier(X)
    model.fit(X, y)

    # The model must be serialized into output_dir. If this is changed, a
    # load_model hook is probably needed to read it back at prediction time.
    # Serialization docs: https://github.com/datarobot/custom-\
    # model-templates/tree/master/custom_model_runner#python
    # NOTE: We currently set a 10GB limit to the size of the serialized model
    artifact_path = "{}/artifact.pkl".format(output_dir)
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(model, artifact_file)

    labels_path = "{}/class_labels.txt".format(output_dir)
    with open(labels_path, "wb") as labels_file:
        label_lines = "\n".join(str(class_) for class_ in model.classes_)
        labels_file.write(label_lines.encode("utf-8"))
def describe_categorical_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series, dict]:
    """Describe a categorical series.

    Args:
        series: The Series to describe.
        summary: The description computed so far; it is updated in place
            and must contain ``"value_counts_without_nan"`` and
            ``"n_distinct"``.

    Returns:
        The string-cast series and the updated summary dict.
    """
    cat_options = config["vars"]["cat"]

    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    frequencies = summary["value_counts_without_nan"]
    frequency_histogram = histogram_compute(
        frequencies, summary["n_distinct"], name="histogram_frequencies"
    )
    summary.update(frequency_histogram)

    if not cat_options["redact"].get(float):
        summary["first_rows"] = series.head(5)

    if config["vars"]["num"]["chi_squared_threshold"].get(float) > 0.0:
        summary["chi_squared"] = chi_square(histogram=frequencies.values)

    if cat_options["length"].get(bool):
        summary.update(length_summary(series))
        lengths = summary["length"]
        summary.update(
            histogram_compute(lengths, lengths.nunique(), name="histogram_length")
        )

    if cat_options["characters"].get(bool):
        summary.update(unicode_summary(series))
        # Keep the distinct count under its own key, then store the total.
        summary["n_characters_distinct"] = summary["n_characters"]
        summary["n_characters"] = summary["character_counts"].values.sum()

        # Best effort: the alias index may not support string operations.
        try:
            alias_counts = summary["category_alias_counts"]
            alias_counts.index = alias_counts.index.str.replace("_", " ")
        except AttributeError:
            pass

    if cat_options["words"]:
        summary.update(word_summary(series))

    return series, summary
def choose_predict_class(predicted_classes: pd.Series) -> int:
    """Return the class label stored at the first position of the index.

    Args:
        predicted_classes: Frequencies indexed by class label. The first row
            is taken as the prediction, so the series is presumably sorted by
            descending frequency -- verify against the caller.

    Returns:
        The first index label converted to a built-in ``int``.
    """
    # Warn when the two leading classes are tied.
    if len(predicted_classes) > 1:
        if predicted_classes.iloc[0] == predicted_classes.iloc[1]:
            print(
                "Prediction ambiguous: At least two classes appear equally often as nearest neighbors."
            )
    # int() also works for labels that are plain Python objects, which have
    # no .astype(), and matches the annotated return type.
    return int(predicted_classes.index[0])
def number_of_access(self, corp_name):
    """Print and plot the number of accesses per day.

    Counts the values of the "日付" (date) column of ``self.df``, prints the
    first 20 counts and shows a line plot of the counts ordered by date.
    ``corp_name`` is accepted but not used.
    """
    # Number of accesses per day
    daily_counts = Series(self.df["日付"].value_counts())
    print(daily_counts.head(20))

    # Sort by date before plotting
    by_date = daily_counts.sort_index()
    by_date.plot()
    plt.show()
def _contains_instance_attrs(series: pd.Series, is_method, class_name: str,
                             attrs: list, sample_size: int = 1) -> bool:
    """Check whether a series holds instances exposing the given attributes.

    Args:
        series: Series whose elements are inspected.
        is_method: Callable ``(element, class_name) -> bool`` applied to the
            first ``sample_size`` elements.
        class_name: Passed through to ``is_method``.
        attrs: Attribute names every element of ``series`` must have.
        sample_size: Number of leading elements checked with ``is_method``.

    Returns:
        True if the sampled elements pass ``is_method`` and every element has
        all attributes in ``attrs``; False otherwise.
    """
    # TODO: user configurable .head or .sample
    # TODO: performance testing for series[0], series.iloc[0], series.head, series.sample
    for sampled in series.head(sample_size):
        if not is_method(sampled, class_name):
            return False

    try:
        for element in series:
            for attr in attrs:
                if not hasattr(element, attr):
                    return False
        return True
    except AttributeError:
        return False
def pandas_describe_categorical_1d(
        config: Settings, series: pd.Series,
        summary: dict) -> Tuple[Settings, pd.Series, dict]:
    """Describe a categorical series.

    Args:
        config: report Settings object
        series: The Series to describe.
        summary: The description computed so far; it is updated in place
            and must contain ``"value_counts_without_nan"``.

    Returns:
        The config, the string-cast series and the updated summary dict.
    """
    cat_settings = config.vars.cat

    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    # The counts index is cast to str in place, so it matches the series.
    value_counts = summary["value_counts_without_nan"]
    value_counts.index = value_counts.index.astype(str)

    if not cat_settings.redact:
        summary["first_rows"] = series.head(5)

    if config.vars.num.chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(histogram=value_counts.values)

    if cat_settings.length:
        summary.update(length_summary_vc(value_counts))
        length_histogram = summary["length_histogram"]
        summary.update(
            histogram_compute(
                config,
                length_histogram.index.values,
                len(length_histogram),
                name="histogram_length",
                weights=length_histogram.values,
            ))

    if cat_settings.characters:
        summary.update(unicode_summary_vc(value_counts))

    if cat_settings.words:
        summary.update(word_summary_vc(value_counts, cat_settings.stop_words))

    return config, series, summary
def get_unclaimed_list(sources: pd.Series) -> str:
    """Get a list of the posts grouped by sources.

    Args:
        sources: Post counts indexed by source. The first five rows are
            listed individually; the remaining rows are summarized.

    Returns:
        One formatted line per listed source, followed by a summary line for
        the remaining sources when there are more than five.
    """
    entry_template = i18n["queue"]["unclaimed_list_entry"]
    # Series.iteritems() was removed in pandas 2.0; items() is equivalent.
    items = [
        entry_template.format(count=count, source=source)
        for source, count in sources.head(5).items()
    ]
    result = "\n".join(items)

    if len(sources) > 5:
        # Positional slice: everything after the five entries listed above.
        rest = sources.iloc[5:]
        result += "\n" + i18n["queue"]["unclaimed_list_others"].format(
            post_count=rest.sum(), source_count=len(rest)
        )

    return result
def encode_X(self, X: pd.Series):
    """Turn a series of texts into padded token-id sequences.

    Missing values are replaced by empty strings, the texts are passed to
    ``self.tokenizer.texts_to_sequences`` and the result is padded to
    ``self.max_text_len``.

    Returns:
        The padded sequences, or None (after logging an error) when no
        tokenizer has been set.
    """
    log = self.logger
    if self.tokenizer is None:
        log.error(
            "Please initial the embedding by Word_Embedding().init_embedding_layer first"
        )
        return None

    # TODO: fix me, this shouldn't happen!!
    X = X.where((pd.notnull(X)), '')
    log.info("X.head={}".format(X.head(5)))
    log.info("X.shape={}".format(X.shape))
    log.info("X.values.shape={}".format(X.values.shape))

    texts = X.values.ravel()
    token_ids = self.tokenizer.texts_to_sequences(texts)
    log.info("sequance X {}".format(token_ids))

    padded = sequence.pad_sequences(token_ids, maxlen=self.max_text_len)
    log.info("padding X {}".format(padded))
    return padded
def test_update_times_mean(self, obj, nogil, parallel, nopython, adjust,
                           ignore_na, halflife_with_times):
    """Online EWM mean with ``times`` must match the regular EWM mean.

    The online object is seeded with the first two rows and then updated
    with the last three; the whole sequence is run twice with a reset in
    between.
    """
    times = Series(
        np.array(
            [
                "2020-01-01", "2020-01-05", "2020-01-07", "2020-01-17",
                "2020-01-21"
            ],
            dtype="datetime64",
        ))
    # Options shared by the reference and the online computation.
    ewm_options = {
        "adjust": adjust,
        "ignore_na": ignore_na,
        "halflife": halflife_with_times,
    }
    expected = obj.ewm(0.5, times=times, **ewm_options).mean()

    engine_kwargs = {
        "nogil": nogil,
        "parallel": parallel,
        "nopython": nopython
    }
    seeded = obj.head(2).ewm(0.5, times=times.head(2), **ewm_options)
    online_ewm = seeded.online(engine_kwargs=engine_kwargs)

    # Test resetting once
    for _ in range(2):
        result = online_ewm.mean()
        tm.assert_equal(result, expected.head(2))

        result = online_ewm.mean(update=obj.tail(3),
                                 update_times=times.tail(3))
        tm.assert_equal(result, expected.tail(3))

        online_ewm.reset()
# Tutorial script (Python 2 print statements) showing basic Series usage:
# construction from a list and from a dict, indexing, and boolean filtering.
from utils import util
dateList = [1, 3, 5, 6, 8]
db = {"No.1": "Wo", "No.2": "Shi", "No.3": "Ni", "No.4": "Da", "No.5": "Ye"}
# Create a Series from a list of values with an explicit label index
s = Series(dateList, index=["A", "B", "C", "D", "E"])
# The index must have the same length as the list; when no index is given it
# defaults to [0, ..., len(data) - 1]
util.report_tag("Series 处理list结构数据")
print "Series data structures is \n", s
print "index is ", s.index
print "values is ", s.values
print "the fist element is ", s[0]
print "0~3 element is \n", s[:3]
print ">3 element is \n", s[s > 3]
# Accessing s[6] would raise an error:
# print "通过 s[6] 会报错", s[6]
print "通过 s.get(6) return None", s.get(6)
print "查看前2行\n", s.head(2)
print "查看最后2行\n", s.tail(2)
util.report_tag("Series 处理dict数据")
s = Series(db)
print "Series data structures is \n", s
# A Series behaves much like a dict: values can be read with s[index] and
# membership tested with `in`. Reading a missing label with s[index] raises
# KeyError, while s.get(index) does not; s.get(index, "default") returns the
# given default when the label is missing.
print "No.1 is ", s['No.1']
print "No.1 is exist", 'No.1' in s
print "No.1 & No.2 is \n", s[['No.1', 'No.2']]
util.report_tag("Series 过滤查询")
s = Series(dateList)
print "data > 5 is \n", s[s > 5]
# NOTE(review): this fragment looks like the body of a per-store loop followed
# by a final export step; the enclosing structure is not visible here, so the
# statements are kept flat exactly as found -- confirm against the full file.
# Remember the test ids, then drop the identifier columns from the features.
store_ids = X_test["Id"]
X_test.drop(["Id", "Store"], axis=1, inplace=True)
print('X_train:\n', X_train)
print('Y_train:\n', Y_train)
print('X_test:\n', X_test)
# Linear Regression
lreg = LinearRegression()
lreg.fit(X_train, Y_train)
Y_pred = lreg.predict(X_test)
print('Y_pred:\n', Y_pred)
# The score is computed on the training data itself.
scores.append(lreg.score(X_train, Y_train))
# Xgboost
# params = {"objective": "reg:linear", "max_depth": 10}
# T_train_xgb = xgb.DMatrix(X_train, Y_train)
# X_test_xgb = xgb.DMatrix(X_test)
# gbm = xgb.train(params, T_train_xgb, 100)
# Y_pred = gbm.predict(X_test_xgb)
# append predicted values of current store to submission
# NOTE(review): Series.append was removed in pandas 2.0; pd.concat is the
# replacement if this script must run on current pandas.
submission = submission.append(Series(Y_pred, index=store_ids))
# append rows(store,date) that were closed, and assign their sales value to 0
submission = submission.append(Series(0, index=closed_store_ids))
print('submission:\n', submission.head())
# save to csv file
submission = pd.DataFrame({"Id": submission.index, "Sales": submission.values})
# The result of this head() call is discarded.
submission.head()
print('scores:\n', scores)
# submission.to_csv('rossmann.csv', index=False)
# Notebook-style notes: bare expressions below only display a value when run
# interactively.
df
s = Series(np.arange(10,14),index=list('abcd'))
s
df+s # Broadcasting happens along the columns.
# NOTE(review): s2 is not defined before this point in the visible code.
df.add(s2, axis=0) # With an explicit axis, broadcasting follows that axis.
# 3. Lambda, map, apply -> this part is fun and practical!
# Very convenient to apply in pandas
import pandas as pd
import numpy as np
from pandas import Series
s1 = Series(np.arange(10))
s1.head(5)
s1.map(lambda x: x**2).head(5)
# Replacing values is quite easy this way.
z = {1: 'A', 2: 'B', 3: 'C'}
s1.map(z).head(5)
s1 = Series(np.arange(10))
s1
s2=Series(np.arange(10,20))
s2
s1.map(s2) # A Series can be mapped through another Series like this
# As in the example below, sex can be encoded via a dict-type mapping.
df = pd.read_csv("C:/djangocym/study_2018/lab_bla/data/wages.csv")
df.head()
#创建一个含有3列的DataFrame来承载这些假想数据 M = 500 df = DataFrame( { 'Monmentum': np.random.randn(M) / 200 + 0.03, 'Value': np.random.randn(M) / 200 + 0.08, 'ShortInterest': np.random.randn(M) / 200 - 0.02 }, index=tickers[:M]) print(df.head()) #随机给公司分类,Financial或tech ind_names = np.array(['FINANCIAL', 'TECH']) sampler = np.random.randint(0, len(ind_names), N) industries = Series(ind_names[sampler], index=tickers, name='industry') print(industries.head()) #现在,就可以根据行业分类进行分组并执行分组聚合和变换了 by_industry = df.groupby(industries) #计算按行分组的平均值 print(by_industry.mean()) print(by_industry.describe()) #行业内标准化过程 def zscore(group): # 每个股票减去所在组的平均值再除以标准差 return (group - group.mean()) / group.std() #这样处理后,各行业的平均值为0,标准差为1 df_stand = by_industry.apply(zscore)
# In[46]: data.head() # In[48]: data.infl.plot() # ### 重采样和频率转换 # In[2]: rng = pd.date_range('3/4/2018', periods=100, freq='D') ts = Series(np.random.randn(100), index=rng) ts.head() # In[5]: ts.resample('M').mean() # In[7]: # ts.resample('M',kind = 'period').mean() ts.resample('M', kind='timestamp').mean() # In[13]: rng = pd.date_range('1/1/2000', periods=12, freq='T') ts = Series(np.arange(12), index=rng) ts
# Demo script: plotting Series and DataFrame objects with matplotlib.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
print("==============Series绘图=================")
s1=Series(np.random.randn(1000)).cumsum() # running total, similar to a reduce
s2=Series(np.random.randn(1000)).cumsum()
print(s1.head(10))
# Draw several lines in one figure
'''一张图绘制多条线'''
s1.plot(label='s1')
s2.plot(label='s2')
plt.legend()
plt.show()
# Subplots: several chart types in one figure
'''子图subplots,一张图绘制多种类型图'''
fig,ax=plt.subplots(2,1)
s1[0:10].plot(ax=ax[0],label='s1',kind='bar') # bar chart
s2.plot(ax=ax[1],label='s2')
plt.legend()
plt.show()
print("==============DataFrame绘图=================")
df=DataFrame(np.random.randint(1,10,40).reshape(10,-1),columns=['A','B','C','D'])
print(df.head())
df.plot(kind='barh',stacked=True) # horizontal stacked bar chart
plt.show()
fields = ['rpt_key', 'updated_at']
# Keep only the btc_krw rows, then drop every column listed below.
series = series[series.rpt_key == 'btc_krw']
drop_column = ['date_id', 'datetime_id', 'market','diff_24h','diff_per_24h','bid','ask','low','high', 'volume', 'updated_at','rpt_key']
series = series.drop(drop_column, axis = 1)
series_super = timeseries_to_superviser(series)
print(series.head())
#series.plot()
#pyplot.show()
def difference(dataset, interval=1):
    """Return a Series of dataset[i] - dataset[i - interval] for each i >= interval."""
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return Series(diff)
# invert differenced value
def inverse_difference(history, yhat, interval=1):
    """Add the value `interval` positions from the end of history back onto yhat."""
    return yhat + history[-interval]
differenced = difference(series.values, 1)
print(differenced.head())
inverted = list()
for i in range(len(differenced)):
    # NOTE(review): `series` is passed here while `difference` was given
    # `series.values`; confirm that history[-interval] indexes as intended.
    value = inverse_difference(series, differenced[i], len(series)-i)
    inverted.append(value)
inverted = Series(inverted)
print(inverted.head())
elif "png" in str(url): ana_df_2=ana_df_2.drop(i) elif "gif" in str(url): ana_df_2=ana_df_2.drop(i) elif "css" in str(url): ana_df_2=ana_df_2.drop(i) elif "js" in str(url): ana_df_2=ana_df_2.drop(i) elif "ico" in str(url): ana_df_2=ana_df_2.drop(i) elif "xml" in str(url): ana_df_2=ana_df_2.drop(i) elif "tmpl" in str(url): ana_df_2=ana_df_2.drop(i) # 2つのデータをまとめる ana_df=pd.concat([ana_df_1,ana_df_2]) # 日にちごとのアクセス数 number_of_access_per_day = ana_df["日付"].value_counts() series=Series(number_of_access_per_day) # 日付順にソート series_sort=series.sort_index() series_sort.plot() ana_df.to_excel("ana_access_log.xlsx") print(series.head(20)) plt.show()
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# source; confirm it against the original file.
# Read every file under the 'tctodd*' directories and turn each one into a
# flat Series built from exactly 100 rows (truncated or zero-padded).
os.chdir('data/')
total_data = DataFrame([])
total_labs = []
for d in os.listdir(os.curdir):
    if d.startswith('tctodd'):
        for f in os.listdir(d):
            # File names look like '<word>-<digit>...': split at the last '-'.
            loc = f.rfind('-')
            word = f[:loc]
            num = int(f[loc + 1:loc + 2])
            f = os.getcwd() + '/' + d + '/' + f
            with open(f, 'r') as fp:
                temp = read_csv(f, delimiter='\t', header=None)
                # NOTE(review): the result of replace() is discarded, so this
                # line has no effect on temp.
                temp.replace([None, np.nan, np.inf], 0)
                # NOTE(review): as_matrix() was removed in pandas 1.0 and
                # DataFrame.append in pandas 2.0.
                if len(temp) > 100:
                    temp = Series(temp.head(100).as_matrix().flatten())
                else:
                    zeroes = DataFrame(np.zeros((100 - len(temp), 22)))
                    temp = Series(concat([temp, zeroes]).as_matrix().flatten())
                    #temp = temp.append(zeroes, ignore_index=True).as_matrix().flatten()
                # Fill up to 100 values
                total_data = total_data.append(temp, ignore_index=True)
                #total_data = total_data.append(temp, ignore_index=True)
                total_labs.append(word)
                # Sample number 3 goes to the test set, all others to training.
                if num == 3:
                    testing[word] = temp
                else:
                    training[word].append(temp)
os.chdir('../')
def describe_categorical_1d(config: Settings, series: pd.Series,
                            summary: dict) -> Tuple[Settings, pd.Series, dict]:
    """Describe a categorical series.

    Args:
        config: report Settings
        series: The Series to describe.
        summary: The description computed so far; it is updated in place
            and must contain ``"value_counts_without_nan"`` and
            ``"n_distinct"``.

    Returns:
        The config, the string-cast series and the updated summary dict.
    """
    cat_settings = config.vars.cat

    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    value_counts = summary["value_counts_without_nan"]

    # Optionally restrict the frequency histogram to the largest categories.
    largest = cat_settings.histogram_largest
    if largest > 0:
        histogram_data = value_counts.nlargest(largest)
    else:
        histogram_data = value_counts
    summary.update(
        histogram_compute(
            config,
            histogram_data,
            summary["n_distinct"],
            name="histogram_frequencies",
        ))

    if not cat_settings.redact:
        summary["first_rows"] = series.head(5)

    if config.vars.num.chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(histogram=value_counts.values)

    if cat_settings.length:
        summary.update(length_summary(series))
        lengths = summary["length"]
        summary.update(
            histogram_compute(
                config,
                lengths,
                lengths.nunique(),
                name="histogram_length",
            ))

    if cat_settings.characters:
        summary.update(unicode_summary(series))
        # Keep the distinct count under its own key, then store the total.
        summary["n_characters_distinct"] = summary["n_characters"]
        summary["n_characters"] = summary["character_counts"].values.sum()

        # Best effort: the alias index may not support string operations.
        with contextlib.suppress(AttributeError):
            alias_counts = summary["category_alias_counts"]
            alias_counts.index = alias_counts.index.str.replace("_", " ")

    if cat_settings.words:
        summary.update(word_summary(series))

    return config, series, summary
print(s.loc['a':'f']) print(s.loc[::-1]) print(s.iloc[::-2]) # Series基本概念 # 可以把Series看作一个定长的有序字典 print(s.shape) print(s.size) print(s.index) # print(s.values) values = s.values print(type(values)) # <class 'numpy.ndarray'> # 可以通过head(), tail()快速查看Series对象的样式 # 当数据量庞大时,通过head(),tail()查看前几个数据和后几个数据,s.head/tail(n), n默认为5 print(s.head()) print(s.tail()) # 当索引没有对应的值时,可能出现缺失数据显示NaN(Not a Number)的情况 s[['d', 'f']] = np.nan print(s) ''' a 65.0 b 70.0 c 59.0 d NaN e 85.0 f NaN g 90.0 h 98.0 i 99.0
# -*- coding: utf-8 -*-
"""Small demo of head() and describe() on a Series, a DataFrame and Excel data."""
from pandas import Series,DataFrame
import pandas as pd

s = Series([1, 2, 3], index=['a', 'b', 'c'])
d = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b', 'c'])

# head() returns the first 5 records by default
print(s.head())
print(s.describe())
print(d.head())
print(d.describe())

# read data from an Excel file
excel_data = pd.read_excel("./server.xlsx")
# head must be called; printing the attribute only shows the bound method
print(excel_data.head())
# pandas.datetime was only a re-export of the standard-library class and has
# been removed from pandas, so import it from the standard library.
from datetime import datetime

from pandas import read_csv
from pandas import Series
from sklearn.preprocessing import MinMaxScaler


# load data
def parser(x):
    """Parse a 'YYYY/MM/DD' string into a datetime."""
    return datetime.strptime(x, '%Y/%m/%d')


# read_csv(squeeze=True) was removed in pandas 2.0; squeezing the single data
# column afterwards yields the same Series.
# NOTE(review): date_parser is deprecated in pandas 2.x but still accepted.
series = read_csv('data_set/shampoo-sales.csv', header=0, parse_dates=[0],
                  index_col=0, date_parser=parser).squeeze('columns')
print(series.head())

# Scaling maps the values into [-1, 1]: the largest value becomes 1, the
# smallest becomes -1, and everything else is placed proportionally between.
X = series.values
# MinMaxScaler expects a matrix, so reshape the 1-D array to n rows x 1 column
X = X.reshape(len(X), 1)
scaler = MinMaxScaler(feature_range=(-1, 1))  # target range of the scaling
scaler = scaler.fit(X)  # learn the min and max of the data
scalered_X = scaler.transform(X)  # matrix with values in [-1, 1]
scalered_series = Series(scalered_X[:, 0])  # first column back as a Series
print(scalered_series.head())

# Inverse scaling: run the transformation backwards to recover the data
inverted_X = scaler.inverse_transform(scalered_X)
inverted_series = Series(inverted_X[:, 0])
print(inverted_series.head())
# Cumulative sums of random data, first as a Series and then as a DataFrame
# (the second assignment replaces the first).
''' plot data Series '''
data = Series(np.random.randn(1000),index=np.arange(1000))
data = data.cumsum()
''' DataFrame '''
data = DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('ABCD'))
data = data.cumsum()
print(data.head())
#data.plot()
# Draw two scatter series on the same axes by passing the first plot's ax on.
ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class 1')
data.plot.scatter(x='A',y='C',color='DarkGreen',label='Class 2',ax=ax)
# NOTE(review): plt.pyplot implies plt is bound to the matplotlib package
# itself rather than matplotlib.pyplot -- confirm against the imports.
plt.pyplot.show()
''' plot methods: 'bar','hist','box','kde','area','scatter','hexbin','pie' '''
print("\n=======================================================")
print("Series_DataFrame_operation")
print("=======================================================")
df = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd"))
s = Series(np.arange(10,14), index=list("abcd"))
print(df+s) # broadcasting happens along the columns
print("\n=======================================================")
print("map_apply_lambda")
print("=======================================================")
# map for series
# map can also be used on data of the pandas Series type
# a dict or another sequence-like object can be given instead of a function
s1 = Series(np.arange(10)) # 0~9
print(s1.head(5))
print("\n")
print(s1.map(lambda x: x**2).head(5))
print("\n")
z = {1: 'A', 2: 'B', 3: 'C'}
print(s1.map(z).head(5)) # values without a mapping become NaN
print("\n")
s2 = Series(np.arange(10,20)) # 10~19
print(s1.map(s2)) # each value of s1 is looked up as an index label of s2
print("\n=======================================================\n")
df = pd.read_csv("./data/wages.csv")
print(df.head(5))
print("\n")
print(df.sex.unique()) # unique(): the distinct values of the series, e.g. ['male' 'female']
print("\n")
# Split the single text column on whitespace and label the columns with col_n
temps = pd.DataFrame(list(temps.col.str.split()), columns=col_n[0])
# Drop the first row, which duplicates the column names
temps = temps.drop(temps.index[0])
# This would strip white space, but I think it's unnecessary:
# temps.apply(lambda x: x.str.strip())

# Change Ms to missing values
import numpy as np
temps.replace('M', np.nan, inplace=True)

# Create a column with TX vs TN, and change MO so it is actually the month
temps['Lvl'] = Series(temps['MO']).str[-2:]
temps['MO'] = Series(temps['MO']).str[:-2]
temps['YRMO'] = Series(temps['YR'] + temps['MO'])

# Make year and month indexes (Q: added Lvl as well, does this shape make sense?)
# Q: added YRMO so we can group and plot, but there must be a way to do this
# with the hierarchical indexing
temps = temps.set_index(['YR', 'MO', 'YRMO', 'Lvl'])
temps = temps.stack().unstack(['Lvl'])

# Adding a name to the day index
temps.head(100)
temps.index.names = ['YR', 'MO', 'YRMO', 'DAY']

# Convert TX and TN to numbers. DataFrame.convert_objects was removed from
# pandas; to_numeric with errors='coerce' likewise turns unconvertible values
# into NaN.
temps = temps.apply(pd.to_numeric, errors='coerce')

# Grouping
yrmo_grouped = temps.groupby(level=(['YRMO'])).mean()  # Q: really don't think that should need YRMO
yr_grouped = temps.groupby(level=(['YR'])).mean()

# Let's try graphing!
import matplotlib.pyplot as plt
yrmo_grouped.plot()  # this is the pandas plot, a wrapper around plt.plot()
yr_grouped.plot()

# You can do rolling aggregates! pd.rolling_sum was removed from pandas; the
# .rolling() accessor is its replacement.
temps.rolling(1000).sum().plot()
# test
s3["a":"e"] # Series的切片 s3.iloc[0:2] # 输出第一行和第二行 '''3.Series的基本概念''' # 3.1 series的基本属性 s = Series(data=np.random.randint(0, 150, size=4), index=['语文', '数学', '英语', 'Python']) print(s.shape) # (4,)表示1维 print(s.size) print(s.values) print(s.index) # 3.2可以通过head(),tail()快速查看Series对象的样式 s = Series(data=np.random.randint(0, 150, size=10)) new_index = pd.date_range('20160101', periods=len(s), freq='D') s.index = new_index # 给s的索引附上新的值 s.head() # 快速查看头五个 s.tail() # 快速查看末尾五个 s.head(3) # 快速查看头3个 s.tail(3) # 快速查看末尾3个 # 3.3 检测缺失数据 !!! s = Series(data={ "a": 10, "b": 20, "c": 30 }, index=list("abcd")) # 当索引没有对应的值时,可能出现缺失数据显示为NaN print(s) print(s[3]) # 可以使用pd.isnull(),pd.notnull(),或自带isnull(),notnull()函数检测缺失数据 pd.isnull(s) # 缺失的数据返回true,否则返回false
data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10] # In[ ]: data['CATEGORY'][:6] # In[ ]: data.describe() # In[ ]: data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) & (data.LONGITUDE > -75) & (data.LONGITUDE < -70) & data.CATEGORY.notnull()] data.head(5) data.info() # In[ ]: def to_cat_list(catstr): stripped = (x.strip() for x in catstr.split(',')) return [x for x in stripped if x] def get_all_categories(cat_series): cat_sets = (set(to_cat_list(x)) for x in cat_series) return sorted(set.union(*cat_sets))
def return_head(ser: pd.Series, num: int) -> pd.core.series.Series:
    """Return the leading ``num`` elements of ``ser``.

    This is a thin wrapper around ``Series.head``; ``num`` is passed through
    unchanged.
    """
    leading = ser.head(num)
    return leading
# Notebook export written for Python 2 (string.lowercase, print statement).
import string
string.lowercase, string.uppercase
# <codecell>
# we can make a list composed of the individual lowercase letters
list(string.lowercase)
# <codecell>
# create a pandas Series out of the list of lowercase letters
lower = Series(list(string.lowercase), name='lower')
print type(lower)
lower.head()
# <codecell>
# create a pandas Series out of the list of uppercase letters
upper = Series(list(string.uppercase), name='upper')
# <codecell>
# concatenate the two Series as columns, using axis=1
# axis=0 would instead stack them end to end into one longer Series
df = pd.concat((lower, upper), axis=1)
df.head()
return Series(diff) # invert differenced value def inverse_difference(history, yhat, interval=1): return yhat + history[-interval] # load dataset def parser(x): return datetime.strptime('190' + x, '%Y-%m') series = read_csv('shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser) print(series.head()) # transform to be stationary differenced = difference(series, 1) print(differenced.head()) # invert transform inverted = list() for i in range(len(differenced)): value = inverse_difference(series, differenced[i], len(series) - i) inverted.append(value) inverted = Series(inverted) print(inverted.head())
def _resample(
    series: pd.Series,
    resampling_startpoint: datetime,
    resampling_endpoint: datetime,
    resolution: str,
    aggregation_methods: Union[str, List[str], Callable] = "mean",
    interpolation_method: str = "linear_interpolation",
    interpolation_limit: str = "8H",
):
    """
    Takes a single series and resamples it.
    See :class:`gordo.machine.dataset.base.GordoBaseDataset.join_timeseries`

    The series is padded with a NaN at the start and/or end point when it
    does not reach them, resampled with ``label="left"``, aggregated, and
    finally interpolated or forward-filled. Rows that are still NaN are
    dropped.

    Parameters
    ----------
    series: Series to resample. Its index must be non-empty; the timezone of
        its first label is used for all comparisons.
    resampling_startpoint: Earliest allowed timestamp of the series.
    resampling_endpoint: Latest allowed timestamp of the series.
    resolution: Resampling frequency, also parsed with ``pd.Timedelta``.
    aggregation_methods: Passed to ``.agg`` on the resampler. With several
        methods the result is a DataFrame with ("tag", "aggregation_method")
        MultiIndex columns.
    interpolation_method: "linear_interpolation" or "ffill".
    interpolation_limit: Longest gap to fill, as a timedelta string, or None
        for no limit.

    Raises
    ------
    RuntimeError
        If the series starts before the start point or ends after the end
        point.
    ValueError
        If the interpolation method is unknown or the interpolation limit is
        shorter than the resolution.
    """
    series_tz = series.index[0].tzinfo
    startpoint_sametz = resampling_startpoint.astimezone(tz=series_tz)
    endpoint_sametz = resampling_endpoint.astimezone(tz=series_tz)

    if series.index[0] > startpoint_sametz:
        # Insert a NaN at the startpoint, to make sure that all resampled
        # indexes are the same. This approach will "pad" most frames with
        # NaNs, that will be removed at the end.
        startpoint = pd.Series([np.nan],
                               index=[startpoint_sametz],
                               name=series.name)
        # Series.append was removed in pandas 2.0; concat is the replacement.
        series = pd.concat([startpoint, series])
        logging.debug(f"Appending NaN to {series.name} "
                      f"at time {startpoint_sametz}")
    elif series.index[0] < startpoint_sametz:
        msg = (f"Error - for {series.name}, first timestamp "
               f"{series.index[0]} is before the resampling start point "
               f"{startpoint_sametz}")
        logging.error(msg)
        raise RuntimeError(msg)

    if series.index[-1] < endpoint_sametz:
        endpoint = pd.Series([np.nan],
                             index=[endpoint_sametz],
                             name=series.name)
        series = pd.concat([series, endpoint])
        logging.debug(f"Appending NaN to {series.name} "
                      f"at time {endpoint_sametz}")
    elif series.index[-1] > endpoint_sametz:
        msg = (
            f"Error - for {series.name}, last timestamp "
            f"{series.index[-1]} is later than the resampling end point "
            f"{endpoint_sametz}")
        logging.error(msg)
        raise RuntimeError(msg)

    logging.debug("Head (3) and tail(3) of dataframe to be resampled:")
    logging.debug(series.head(3))
    logging.debug(series.tail(3))

    resampled = series.resample(resolution,
                                label="left").agg(aggregation_methods)

    # If several aggregation methods are provided, agg returns a dataframe
    # instead of a series. In this dataframe the column names are the
    # aggregation methods, like "max" and "mean", so we have to make a
    # multi-index with the series-name as the top-level and the
    # aggregation-method as the lower-level index.
    # For backwards-compatibility we *dont* return a multi-level index
    # when we have a single resampling method.
    if isinstance(resampled, pd.DataFrame):
        # Several aggregation methods provided
        resampled.columns = pd.MultiIndex.from_product(
            [[series.name], resampled.columns],
            names=["tag", "aggregation_method"])

    if interpolation_method not in ["linear_interpolation", "ffill"]:
        raise ValueError(
            "Interpolation method should be either linear_interpolation of ffill"
        )

    if interpolation_limit is not None:
        # Express the time limit as a number of resampled rows.
        limit = int(
            pd.Timedelta(interpolation_limit).total_seconds() /
            pd.Timedelta(resolution).total_seconds())
        if limit <= 0:
            raise ValueError(
                "Interpolation limit must be larger than given resolution")
    else:
        limit = None

    if interpolation_method == "linear_interpolation":
        return resampled.interpolate(limit=limit).dropna()
    # Only "ffill" can reach this point; fillna(method=...) is deprecated.
    return resampled.ffill(limit=limit).dropna()
# Keep only the btc_krw rows, then drop every column listed below.
series = series[series.rpt_key == 'btc_krw']
drop_column = ['date_id', 'datetime_id', 'market','diff_24h','diff_per_24h','bid','ask','low','high', 'volume', 'updated_at','rpt_key']
series = series.drop(drop_column, axis = 1)
print(series.head())
#series.plot()
#pyplot.show()

from sklearn import preprocessing

# Scale the values into [-1, 1] and invert the scaling again as a round trip.
X = series.values
X = X.reshape(len(X), 1)
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(X)
scaled_X = scaler.transform(X)
inverted_X = scaler.inverse_transform(scaled_X)
inverted_series = Series(inverted_X[:, 0])
print(inverted_series.head())

# NOTE(review): train and test are the first and last 10% of the rows, and X
# has a single column here, so train[:, 0:-1] selects no feature columns --
# confirm this is intended.
train, test = X[0:round(len(X)*0.1)], X[-round(len(X)*0.1):]
X, y = train[:, 0:-1], train[:, -1]
X = X.reshape(X.shape[0], 1, X.shape[1])

# The batch_input_shape tuple was never closed, which was a syntax error.
layer = LSTM(neurous,
             batch_input_shape=(batch_size, X.shape[1], X.shape[2]),
             stateful=True)
model = Sequential()
model.add(layer)
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')