def deserialize(self, item, force_bytes_to_unicode=False):
    """Rebuild a Series from a stored record array.

    The index comes from ``self._index_from_records(item)``; the values are
    read from the last field of ``item`` and that field's name becomes the
    Series name.

    Parameters
    ----------
    item : record array
        Structured array whose last field holds the series values.
    force_bytes_to_unicode : bool, optional
        When true, the values and the index (every level of a MultiIndex)
        are converted with ``astype('unicode')`` if their first element is
        a ``bytes`` object.

    Returns
    -------
    Series
    """
    index = self._index_from_records(item)
    name = item.dtype.names[-1]
    data = item[name]

    if force_bytes_to_unicode:
        if len(data) and isinstance(data[0], bytes):
            data = data.astype('unicode')

        if isinstance(index, MultiIndex):
            # MultiIndex requires a conversion at each level.
            unicode_indexes = []
            for level in range(len(index.levels)):
                _index = index.get_level_values(level)
                # Check the length first: _index[0] raises on an empty index.
                if len(_index) and isinstance(_index[0], bytes):
                    _index = _index.astype('unicode')
                unicode_indexes.append(_index)
            index = unicode_indexes
        elif len(index) and isinstance(index[0], bytes):
            index = index.astype('unicode')

    # NOTE(review): PD_VER is compared as a plain string here — confirm that
    # is adequate for every pandas version this module supports.
    if PD_VER < '0.23.0':
        return Series.from_array(data, index=index, name=name)
    return Series(data, index=index, name=name)
def draw_power_graphics(power, sp=None, before=None, after=None, band=100, maxY=1200):
    """
    Draw the power values as a sequence of figures, `band` samples per figure
    (the number of figures depends on how many samples are left after the
    date filters and on the value of band)

    power : dataframe with the values of the power
    sp : array of [array of x, array of y]
         points we want to highlight in every figure
    before : consider only the values at or before this date
    after : consider only the values at or after this date
    band : how many values of power to draw in each figure (must be > 0)
    maxY : upper limit of the Y axis
    --------------------------------------------------------
    Note: this method uses the 'time' and 'value' columns and
    the dates must be passed in this format -> %Y-%m-%d %H:%M:%S
    """
    # Boolean indexing returns new frames, so the caller's frame is never
    # modified and no defensive copy is needed.
    tmp_power = power
    if before is not None:
        tmp_power = tmp_power[tmp_power['time'] <= before]
    if after is not None:
        tmp_power = tmp_power[tmp_power['time'] >= after]

    serie = Series(tmp_power['value'])

    # range() rejects band == 0 instead of looping forever.
    for start in range(0, len(serie), band):
        pyplot.figure(figsize=(20, 5))
        pyplot.plot(serie[start:start + band], 'ro')
        # xmin, xmax, ymin, ymax
        # NOTE(review): the x limits are positions; this presumably expects a
        # default 0..n-1 index — confirm for frames filtered by date.
        pyplot.axis([start, start + band, 0, maxY])
        # "is not None" instead of "!= None": the latter is evaluated
        # element-wise when sp is a numpy array.
        if sp is not None:
            for points in sp:
                pyplot.scatter(points[0], points[1], marker=(0, 3))
        pyplot.show()
def get_special_point(power, events, borders, eventName, numericValue):
    """
    For every event called eventName, find the power samples recorded around
    it and keep the index label of the sample in the middle of that window.
    Events with no power sample in their window are ignored.
    Prints how many events had power data and returns
    (list of index labels, list with numericValue repeated once per label)

    power : dataframe with the values of the power
    events : dataframe with the events (and respective datetime)
    borders : [min,MAX]
              how many seconds before and after the event we want to consider
    eventName : name of the event we want to consider
    numericValue : value we want to give in the Y axis for each event
    --------------------------------------------------------
    Note: this method uses the 'time', 'value' and 'eventName' columns and
    the dates must be passed in this format -> %Y-%m-%d %H:%M:%S
    """
    event_times = events.loc[events['eventName'] == eventName, 'time']

    event_index = []
    for stamp in event_times:
        date = time.mktime(datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S").timetuple())
        start = str(datetime.fromtimestamp(date - borders[0]))
        end = str(datetime.fromtimestamp(date + borders[1]))
        # The 'time' column is compared against the formatted strings.
        in_window = (power['time'] >= start) & (power['time'] <= end)
        serie = power[in_window]['value']
        if len(serie) > 0:
            event_index.append(serie.index[len(serie) // 2])

    print("number of", eventName, "in groudtruth and power=", len(event_index))
    return event_index, [numericValue] * len(event_index)
def draw_around_event(power, events, borders, eventName, maxY=1200):
    """
    Draw one figure for each event called eventName saved in the events
    dataframe. Each figure shows the levels of power around the event;
    when no power sample falls in the window a message is printed instead

    power : dataframe with the values of the power
    events : dataframe with the events (and respective datetime)
    borders : [min,MAX]
              how many seconds before and after the event we want to consider
    eventName : name of the event we want to consider
    maxY : the max Y value of power shown on the Cartesian axis
    --------------------------------------------------------
    Note: this method uses the 'time', 'value' and 'eventName' columns and
    the dates must be passed in this format -> %Y-%m-%d %H:%M:%S
    """
    event_times = events.loc[events['eventName'] == eventName, 'time']
    print("number of", eventName, "in groudtruth=", len(event_times))

    for stamp in event_times:
        date = time.mktime(datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S").timetuple())
        start = str(datetime.fromtimestamp(date - borders[0]))
        end = str(datetime.fromtimestamp(date + borders[1]))
        print(date, start, end)

        # The 'time' column is compared against the formatted strings.
        in_window = (power['time'] >= start) & (power['time'] <= end)
        serie = power[in_window]['value']
        if len(serie) > 0:
            # xmin, xmax, ymin, ymax
            v = [serie.index[0], serie.index[-1], 0, maxY]
            pyplot.figure(figsize=(20, 5))
            pyplot.plot(serie, 'ro')
            pyplot.axis(v)
            pyplot.show()
        else:
            print("No data of power for this event")
def test_from_M8_structured(self): dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] arr = np.array(dates, dtype=[('Date', 'M8[us]'), ('Forecasting', 'M8[us]')]) df = DataFrame(arr) assert df['Date'][0] == dates[0][0] assert df['Forecasting'][0] == dates[0][1] s = Series(arr['Date']) assert isinstance(s[0], Timestamp) assert s[0] == dates[0][0] s = Series.from_array(arr['Date'], Index([0])) assert s[0] == dates[0][0]
def test_from_M8_structured(self):
    # A structured array with two datetime64[us] fields must keep its values
    # when turned into a DataFrame or a Series.
    date_value = datetime(2012, 9, 9, 0, 0)
    forecast_value = datetime(2012, 9, 8, 15, 10)
    field_types = [("Date", "M8[us]"), ("Forecasting", "M8[us]")]
    arr = np.array([(date_value, forecast_value)], dtype=field_types)

    frame = DataFrame(arr)
    assert frame["Date"][0] == date_value
    assert frame["Forecasting"][0] == forecast_value

    from_ctor = Series(arr["Date"])
    assert isinstance(from_ctor[0], Timestamp)
    assert from_ctor[0] == date_value

    # The from_array call is expected to emit a FutureWarning.
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        from_classmethod = Series.from_array(arr["Date"], Index([0]))
        assert from_classmethod[0] == date_value
def test_from_M8_structured(self): dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] arr = np.array(dates, dtype=[('Date', 'M8[us]'), ('Forecasting', 'M8[us]')]) df = DataFrame(arr) self.assertEqual(df['Date'][0], dates[0][0]) self.assertEqual(df['Forecasting'][0], dates[0][1]) s = Series(arr['Date']) self.assertTrue(s[0], Timestamp) self.assertEqual(s[0], dates[0][0]) s = Series.from_array(arr['Date'], Index([0])) self.assertEqual(s[0], dates[0][0])
def test_from_M8_structured(self):
    # A structured array with two datetime64[us] fields must keep its values
    # when turned into a DataFrame or a Series.
    date_value = datetime(2012, 9, 9, 0, 0)
    forecast_value = datetime(2012, 9, 8, 15, 10)
    field_types = [('Date', 'M8[us]'), ('Forecasting', 'M8[us]')]
    arr = np.array([(date_value, forecast_value)], dtype=field_types)

    frame = DataFrame(arr)
    assert frame['Date'][0] == date_value
    assert frame['Forecasting'][0] == forecast_value

    from_ctor = Series(arr['Date'])
    assert isinstance(from_ctor[0], Timestamp)
    assert from_ctor[0] == date_value

    # The from_array call is expected to emit a FutureWarning.
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        from_classmethod = Series.from_array(arr['Date'], Index([0]))
        assert from_classmethod[0] == date_value
def previsao_matematica(reservatId, data):
    """Forecast 180 steps ahead with an ARIMA(1, 0, 1) model.

    The series returned by ``predict_info.getSeries(reservatId, data)`` is
    differenced with lag 1, the model is fitted on the differenced values,
    and every forecast step is mapped back through
    ``predict_info.inverse_difference``.

    Returns a dict with:
      'calculado' -- True once the forecast has been computed
      'volumes'   -- the non-negative forecast values formatted as "%.4f"
      'dias'      -- how many forecast values were non-negative
    """
    # Series.from_array is deprecated; the constructor is its replacement.
    seriesValues = Series(predict_info.getSeries(reservatId, data)).values
    mathDict = {'calculado': False, 'volumes': [], 'dias': 0}

    # NOTE: the stationarity check (isNonStationary) is disabled; the series
    # is always differenced.
    interval = 1
    differenced = predict_info.difference(seriesValues, interval)

    # fit model
    model_fit = ARIMA(differenced, order=(1, 0, 1)).fit(disp=-1)
    # multi-step out-of-sample forecast
    forecast = model_fit.forecast(steps=180)[0]
    mathDict['calculado'] = True

    # invert the differenced forecast to something usable; each inverted
    # value is appended to the history used for the next step
    history = list(seriesValues)
    for yhat in forecast:
        inverted = predict_info.inverse_difference(history, yhat, interval)
        history.append(inverted)
        if inverted >= 0.0:
            mathDict['volumes'].append("%.4f" % round(inverted, 4))
            mathDict['dias'] += 1
    return mathDict
def previsao_matematica(reservatId, data):
    """Forecast 180 steps ahead with an ARIMA(1, 0, 1) model.

    The series returned by ``predict_info.getSeries(reservatId, data)`` is
    differenced with lag 1, the model is fitted on the differenced values,
    and every forecast step is mapped back through
    ``predict_info.inverse_difference``.

    Returns a dict with:
      'calculado' -- True once the forecast has been computed
      'volumes'   -- the non-negative forecast values formatted as "%.4f"
      'dias'      -- how many forecast values were non-negative
    """
    # Series.from_array is deprecated; the constructor is its replacement.
    seriesValues = Series(predict_info.getSeries(reservatId, data)).values
    mathDict = {'calculado': False, 'volumes': [], 'dias': 0}

    # NOTE: the stationarity check (isNonStationary) is disabled; the series
    # is always differenced.
    interval = 1
    differenced = predict_info.difference(seriesValues, interval)

    # fit model
    model_fit = ARIMA(differenced, order=(1, 0, 1)).fit(disp=-1)
    # multi-step out-of-sample forecast
    forecast = model_fit.forecast(steps=180)[0]
    mathDict['calculado'] = True

    # invert the differenced forecast to something usable; each inverted
    # value is appended to the history used for the next step
    history = list(seriesValues)
    for yhat in forecast:
        inverted = predict_info.inverse_difference(history, yhat, interval)
        history.append(inverted)
        if inverted >= 0.0:
            mathDict['volumes'].append("%.4f" % round(inverted, 4))
            mathDict['dias'] += 1
    return mathDict
def deserialize(self, item):
    """Rebuild a Series from a stored record array.

    The index comes from ``self._index_from_records(item)``; the values are
    the last field of ``item`` and that field's name becomes the Series name.
    """
    index = self._index_from_records(item)
    name = item.dtype.names[-1]
    # Series.from_array is deprecated; the constructor is its replacement.
    return Series(item[name], index=index, name=name)
def from_records(self, recarr):
    """Rebuild a Series from a record array.

    The index comes from ``self._index_from_records(recarr)``; the values
    are the last field of ``recarr`` and that field's name becomes the
    Series name.
    """
    index = self._index_from_records(recarr)
    name = recarr.dtype.names[-1]
    # Series.from_array is deprecated; the constructor is its replacement.
    return Series(recarr[name], index=index, name=name)
tot[:10]


# ## Export or print data

# In[12]:


import sys

# The file is opened explicitly and the handle is given to pandas; the
# context manager closes it as soon as parsing is finished.
with open('F:/电影/数据分析/pydata-book-master/ch06/ex6.csv') as file6:
    data = pd.read_csv(file6)
data.to_csv('F:/电影/数据分析/pydata-book-master/ch06/ooo.csv')
#data.to_csv(sys.stdout, na_rep='NULL',index=False,column=False)
# data.to_csv(sys.stdout, sep='|')


# In[13]:


# Series.from_array() wraps whatever it is given, so passing a path produced
# a Series holding the path string itself. Read the CSV instead: the first
# column becomes the (date-parsed) index and the remaining column the values.
with open('F:/电影/数据分析/pydata-book-master/ch06/tseries.csv') as ts_file:
    kk = pd.read_csv(ts_file, header=None, index_col=0, parse_dates=True).iloc[:, 0]
kk


# ## Manually handling delimiters

# In[14]:


import csv
# Kept open across cells because the reader below is consumed in In[15].
f = open('F:/电影/数据分析/pydata-book-master/ch06/ex7.csv')
reader = csv.reader(f)
reader


# In[15]:


for line in reader:
    print(line)
# The reader is exhausted at this point, so the handle can be released.
f.close()
def chapter_2():
    """Run small linear-algebra, optimisation and pandas examples, printing each result."""
    random.seed(0)
    a = random.randn(16).reshape(4, 4) * 10
    print(linalg.det(a))
    inverse = linalg.inv(a)
    print(inverse)
    print(np.dot(a, inverse))
    eig_value, eig_vector = linalg.eig(a)
    print(eig_value)
    print(eig_vector)

    def sample_func(x):
        return x**2 + 2 * x + 1

    print(sp.optimize.newton(sample_func, 0))
    print(minimize_scalar(sample_func, method="Brent"))

    # Series.from_array is deprecated; the constructor is its replacement.
    sample_pandas_data = Series(
        [12, 23, 34, 45, 56, 67, 78, 89, 90, 121],
        index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
    print(sample_pandas_data)

    attri_data1 = {
        'ID': ['100', '101', '102', '103', '104'],
        'city': ['Tokyo', 'Osaka', 'Kyoto', 'Hokkaidao', 'Tokyo'],
        'birth_year': [1990, 1989, 1992, 1997, 1982],
        'name': ['Hiroshi', 'Akiko', 'Yuki', 'Satoru', 'Steeve']
    }
    attri_data_frame1 = DataFrame(attri_data1, index=["a", "b", "c", "d", "e"])
    print(attri_data_frame1)
    print(attri_data_frame1[["ID", "city"]].T)
    print(attri_data_frame1[attri_data_frame1['city'].isin(["Tokyo", "Osaka"])])
    print(attri_data_frame1.drop(['birth_year'], axis=1))

    attri_data2 = {
        'ID': ['100', '101', '102', '105', '107'],
        'math': [50, 43, 33, 76, 98],
        'English': [90, 30, 20, 50, 30],
        'sex': ['M', 'F', 'F', 'M', 'M']
    }
    attri_data_frame2 = DataFrame.from_dict(attri_data2)
    print(attri_data_frame2)
    # The join type is passed by keyword so it cannot be mistaken for
    # another positional argument.
    print(pd.merge(attri_data_frame1, attri_data_frame2, how="outer"))
    print(attri_data_frame2.groupby("sex")["math"].mean())


def scatter():
    """Show a scatter plot of noisy sin(x) samples."""
    # Scatter plot
    random.seed(0)
    x = np.random.randn(300)
    y = np.sin(x) + np.random.randn(300)
    plt.plot(x, y, "o")
    # plt.scatter(x, y)
    plt.title("Title Name")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.grid(True)
    plt.show()


def continuous():
    """Show a line plot of a cumulative sum of random values."""
    # Continuous curve
    np.random.seed(0)
    numpy_data_x = np.arange(1000)
    numpy_random_data_y = np.random.randn(1000).cumsum()
    plt.plot(numpy_data_x, numpy_random_data_y, label="Label")
    plt.legend()
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.grid(True)
    plt.show()


def sin_and_cos():
    """Show a 2x2 grid of sine curves: sin(x), sin(2x), sin(x), sin(2x)."""
    # NOTE(review): despite the name, only sine curves are drawn and the
    # bottom row repeats the top row — confirm this is intended.
    x = np.linspace(-10, 10, 100)
    for position, factor in enumerate((1, 2, 1, 2), start=1):
        plt.subplot(2, 2, position)
        plt.plot(x, np.sin(factor * x))
    # Called after the last subplot, so only that subplot gets a grid.
    plt.grid(True)
    plt.show()


def hist():
    """Show three stacked histograms: one of scaled normal samples, two of uniform samples."""
    random.seed(0)
    plt.subplot(3, 1, 1)
    plt.hist(np.random.randn(10**5) * 10 + 50, bins=60, range=(20, 80))
    plt.subplot(3, 1, 2)
    plt.hist(random.uniform(0.0, 1.0, 1000), bins=100)
    plt.subplot(3, 1, 3)
    plt.hist(random.uniform(0.0, 1.0, 1000), bins=100)
    plt.grid(True)
    plt.show()


def monte_carlo():
    """Estimate pi from random points in [-1, 1) x [-1, 1) and plot them.

    The estimate is 4 * (points with distance < 1 from the origin) / n; the
    points inside and outside the unit circle are drawn in separate subplots.
    """
    random.seed(0)
    n = 1000000
    x = random.uniform(-1.0, 1.0, n)
    y = random.uniform(-1.0, 1.0, n)
    inside = np.sqrt(x**2 + y**2) < 1
    print("pi =", np.sum(inside) * 4 / n)
    plt.subplot(2, 1, 1)
    plt.scatter(x[inside], y[inside])
    plt.subplot(2, 1, 2)
    plt.scatter(x[~inside], y[~inside])
    plt.show()
def deserialize(self, item, _force_bytes_to_unicode=False):
    """Rebuild a Series from a stored record array.

    The index comes from ``self._index_from_records(item)``; the values are
    the last field of ``item`` and that field's name becomes the Series name.
    ``_force_bytes_to_unicode`` is accepted but not used here.
    """
    index = self._index_from_records(item)
    name = item.dtype.names[-1]
    # Series.from_array is deprecated; the constructor is its replacement.
    return Series(item[name], index=index, name=name)
print(cleaned) coeff = {} for comp in new_s_and_p: coeff[comp] = [] for comp1 in new_s_and_p: count = 0 coeffs = {} for comp2 in new_s_and_p: if(s_and_p.index(comp1)<s_and_p.index(comp2)): nums = [] for i in range (0, len(new_s_and_p[comp1][0])): nums.append(new_s_and_p[comp1][0][i]-new_s_and_p[comp2][0][i]) series = Series.from_array(nums[0:755]) X = series.values result = adfuller(X) coeffs[comp2] = (np.corrcoef(new_s_and_p[comp1][0][0:755],new_s_and_p[comp2][0][0:755])[0][1], result[0], new_s_and_p[comp1][1]) coeff[comp1] = coeffs print("Done with Coeffs. Now selecting pairs.") pairs_0_50 = [] pairs_50_80 = [] pairs_80_90 = [] pairs_90_97 = [] pairs_97_100 = [] for comp1 in coeff.keys(): for comp2 in coeff[comp1].keys(): if coeff[comp1][comp2][1]<-2.579 and coeff[comp1][comp2][0] <.5:
# NOTE(review): sklearn.cross_validation and sklearn.grid_search are, as far
# as I know, gone from recent scikit-learn releases (their contents live in
# sklearn.model_selection) — confirm against the installed version.
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error
#from sklearn.utils import check_array
from sklearn.grid_search import GridSearchCV
#from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt

df = pandas.read_csv("C:/Users/user/Desktop/Drchen/Cap8/capsule8.csv")
df1 = pandas.read_excel("C:/Users/user/Desktop/Drchen/Cap8/cap8.xlsx")
df2 = pandas.read_excel("C:/Users/user/Desktop/Drchen/Cap8/tcp_life.xlsx")

# Series.from_array is deprecated; the constructor is its replacement.
x = Series(df['ts'])
y = Series(df['depth'])
#binwidth = 200

# 1.time-series & histogram for ts
x.hist()
x.plot()

#time-series & histogram for depth
y.hist()
plt.plot(x, y)
plot_1 = plt.plot(df['ts'])
plot_2 = plt.plot(df['ts'], df['depth'])

#time - series & histogram for rate_1 & rate_5
# The first 891 rows of full_set are paired with targets for training and
# validation; the remaining rows form the test set.
#train_valid_X = full_set_X[0:891]
train_valid_X = full_set[0:891]
train_valid_y = targets
#test_X = full_set_X[891:]
test_X = full_set[891:]
# 70% of the labelled rows go to training, the rest to validation.
train_X, valid_X, train_y, valid_y = train_test_split(train_valid_X, train_valid_y, train_size=.7)

print(full_set.shape, train_X.shape, valid_X.shape, train_y.shape, valid_y.shape, test_X.shape)


from pandas import Series
from matplotlib import pyplot
# Series.from_array is deprecated; the constructor is its replacement.
series = Series(train['Age'])
series.hist()
pyplot.show()


cl = RandomForestClassifier(n_estimators=100)
#cl = RandomForestClassifier(n_estimators=50, max_features='sqrt')
#cl = GaussianNB()
#cl = LogisticRegression()
#cl = KNeighborsClassifier(n_neighbors=3)
#cl = GradientBoostingClassifier()
#cl = SVC()

#import xgboost as xgb
from pandas import Series
from pandas import DataFrame
# NOTE(review): TimeGrouper is, as far as I know, no longer exported by
# recent pandas releases (pandas.Grouper(freq=...) is the documented
# replacement) — confirm against the installed version.
from pandas import TimeGrouper
from matplotlib import pyplot
import numpy as np
import pandas as pd

pdf = engineSample50cycleWindow.values

# COMMAND ----------

# Series.from_array is deprecated; the constructor is its replacement.
series = Series(pdf)
# NOTE(review): grouping by time presumably needs a datetime-like index and
# a frequency; `pdf` is a bare values array and 1 is passed as the
# frequency — verify this cell runs as intended.
groups = series.groupby(TimeGrouper(1))
# One column per group, keyed by the year of the group label, then
# transposed so that each year becomes a row of the matrix plot.
years = DataFrame()
for name, group in groups:
    years[name.year] = group.values
years = years.T
pyplot.matshow(years, interpolation=None, aspect='auto')
pyplot.show()

# COMMAND ----------

import matplotlib.pyplot as plt
dataSet = renamed_df.toPandas()
fig, ax = plt.subplots()