def evaluate(model, test_features, test_labels):
    """Print error metrics for a fitted regression model on a test set.

    Args:
        model: Object exposing ``predict(test_features)``.
        test_features: Inputs passed unchanged to ``model.predict``.
        test_labels: True target values, aligned element-wise with the
            predictions.

    Prints the mean absolute error, an "accuracy" figure defined as
    ``100 - MAPE``, the RMSE and R-squared. Nothing is returned.

    Note:
        A label equal to zero makes the percentage error a division by
        zero, so MAPE (and therefore "accuracy") is not meaningful then.
    """
    # Created with help from
    # https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    # MAPE is relative to the magnitude of the true value. Dividing by the
    # signed label would yield negative percentages for negative targets,
    # which cancel in the mean and overstate the accuracy.
    mape = 100 * np.mean(errors / np.abs(test_labels))
    rmse = np.sqrt(mean_squared_error(test_labels, predictions))
    accuracy = 100 - mape
    r2 = r2_score(test_labels, predictions)
    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%'.format(accuracy))
    print('RMSE: {:0.4f}'.format(rmse))
    print('R-Squared: {:0.4f}'.format(r2))
def rank(series):
    """Classify the mean of *series* as 'high', 'medium' or 'low'.

    Args:
        series: A pandas Series or any sequence of numbers accepted by
            ``pd.Series``.

    Returns:
        A two-element list ``[mean, label]`` where ``label`` is ``'high'``
        when the mean is above 90, ``'medium'`` when it is above 60, and
        ``'low'`` otherwise. An empty input has a NaN mean, which fails
        both comparisons and is therefore labelled ``'low'``.
    """
    # pandas has no module-level ``mean`` function; ``pd.mean(series)``
    # raises AttributeError. Going through ``pd.Series`` works for both
    # Series and plain sequences.
    mean = pd.Series(series).mean()
    if mean > 90:
        return [mean, 'high']
    if mean > 60:
        return [mean, 'medium']
    return [mean, 'low']
'Population':['39512223', '1359711', '19453561']}) df3 = pd.merge(left=df1, right=df2, on='State', how='inner') df3 Notice how there are only two rows in the merged dataframe. We can also be more inclusive and match on the `State` column, but retain all rows. This is equivalent to an 'or' join. df3 = pd.merge(left=df1, right=df2, on='State', how='outer') df3 This is a very handy way to merge data when you have lots of files with missing data. See Jake VanderPlas's [tutorial](https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.07-Merge-and-Join.ipynb) for a more in-depth overview. ## Grouping We've seen above that it is very easy to summarize data over columns using built-in DataFrame methods such as `df.mean()`. Sometimes we are interested in summarizing data over different groups of rows. For example, what is the mean of participants in Condition A compared to Condition B? This is surprisingly easy to compute in pandas using the `groupby` operator, where we aggregate data using a specific operation over different labels. One useful way to conceptualize this is using the **Split, Apply, Combine** operation (similar to map-reduce). ![](../images/pandas/split-apply-combine.png) This figure is taken from Jake VanderPlas's tutorial and highlights how input data can be *split* on some key and then an operation such as sum can be *applied* separately to each split. Finally, the results of the applied function for each key can be *combined* into a new data frame. ### Groupby In this example, we will use the `groupby` operator to split the data based on gender labels and separately calculate the mean for each group. df.groupby('gender_name').mean() Other default aggregation methods include `.count()`, `.mean()`, `.median()`, `.min()`, `.max()`, `.std()`, `.var()`, and `.sum()`.
# Notes on pandas MultiIndex: explicit construction, multi-level columns, level-wise aggregation and indexing.
# NOTE(review): several snippets below are not runnable as written (`index=)` has no value, `pd.mean(...)` is not a pandas function and was presumably meant to be called on health_data, `.ix` was removed in pandas 1.0) - confirm the intended calls before reuse.
#3 Explicit construction: pass these objects as the index argument, or update the index via the reindex method df = pd.DataFrame(np.random.rand(4, 2), columns=['data1', 'data2'], index=) pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]]) #arrays pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)]) #tuples pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) #Cartesian product pop.index.names = ['state', 'year'] #multi-level column index index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=['year', 'visit']) columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']], names=['subject', 'type']) health_data = pd.DataFrame(data, index=index, columns=columns) #aggregation methods on a multi-level index pd.mean(level='year') #aggregate over the row index pd.mean(axis = 1,level='name') #aggregate over the column index health_data #multi-dimensional indexing of df with ix() and loc(); for a two-level index, slicing may be buggy if labels contain Chinese characters df.loc['a',1] #second-level index df.loc['a'].loc['1'] #second-level index df.loc['a'] df.loc['a':'c'] #multiple first-level labels df.loc[['a','b']] #multiple first-level labels df[:, 2000] df[df > 22000000] df['a'] #first-level index df['a':'c'] df.ix[['a':'c']] df[:,2] #all of the first level, 2 in the second level df.loc[:,'data1':'data2'] #multiple column labels
def kilojoules(pd):
    """Return the mean of *pd* times its length, divided by 1000.

    Args:
        pd: Any sized object exposing a ``mean()`` method. Note that this
            parameter shadows the conventional pandas alias inside the
            function. Presumably a power series in watts sampled once per
            second, which would make the result kilojoules - TODO confirm
            against callers.

    Returns:
        ``(pd.mean() * len(pd)) / 1000``.
    """
    average = pd.mean()
    sample_count = len(pd)
    total = average * sample_count
    return total / 1000
plt.show() #look at a scatter plot of the data, look for any erroneous points/outliers plt.figure() plt.scatter(<data_raw_1>, <data_raw_2>) plt.title("Raw Data") plt.xlabel("X label") plt.ylabel("Y label") plt.show() #set the bounds on the data set for cleaning and repeat for all columns data_clean[column] = data_raw[column][<lower_bound> <= data_raw[column] <= <upper_bound>] data_clean.head() data_clean.to_pickle("path") #CAUTION!!! Only read pickles that YOU generate!!!!! No Exceptions! #now get the avg, std dev, max, min, mean = pd.mean(data_clean[column]) std = pd.std(data_clean[column]) min = pd.min(data_clean[column]) max = pd.max(data_clean[column]) print("Mean: %f, Stand Dev: %f, Minimum: %f, Maximum: %f").format(mean, std, min, max) #now generate plots using clean data and save fig spec dir plt.figure() plt.<plot type>(<data_raw_1>, <data_raw_2>) #hist, scatter, plot, box plt.title("Data") #describe what the plot is plt.xlabel("X label") plt.ylabel("Y label") plt.savefig("path-to-directory.png") #pick a path that you know you'll find it plt.show()
# Travel time for every label in axes[1][1:13] of each scenario table
# (original, "hope" speed, and limited "hope" speed).
origin_traveltime = [
    travel_time(leave_time, 6, i) for i in leave_time.axes[1][1:13]
]
hope_traveltime = [
    travel_time(leave_time_speed, 6, i) for i in leave_time_speed.axes[1][1:13]
]
limit_hope_traveltime = [
    travel_time(leave_time_speed1, 6, i) for i in leave_time_speed1.axes[1][1:13]
]

# Mean relative change of the limited scenario against the original.
# pandas has no module-level ``mean`` and a lazy ``map`` object cannot be
# averaged directly, so build the ratios as a list and use Series.mean().
relative_increase = [
    (x - y) / y for x, y in zip(limit_hope_traveltime, origin_traveltime)
]
pd.Series(relative_increase).mean()
# Average increase in lost time: 39.48%
# With the "hope" variant there is no loss; speed even improves by 9.8%

# Next, compute how much the headway stability improves ##########
# 1 Headway values
def head1(data):
    dic = pd.DataFrame()
    h = []
    for j in range(11):
        n = j + 1
        b = 'banci%i' % (n)
        b1 = 'banci%i' % (n + 1)