Example #1
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def evaluate(model, test_features, test_labels):
    # Created with help from https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    # Mean absolute percentage error (MAPE), with "accuracy" as its complement
    mape = 100 * np.mean(errors / test_labels)
    rmse = np.sqrt(mean_squared_error(test_labels, predictions))
    accuracy = 100 - mape
    r2 = r2_score(test_labels, predictions)
    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%'.format(accuracy))
    print('RMSE: {:0.4f}'.format(rmse))
    print('R-Squared: {:0.4f}'.format(r2))
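A minimal usage sketch, assuming a scikit-learn regressor on synthetic data (every name and value below is illustrative, not from the original source):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Synthetic regression data; targets shifted away from zero so MAPE is well defined
X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=42)
y = y + 1000
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
evaluate(rf, X_test, y_test)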
Example #2
def rank(series):
    # Calculate the mean of the input series
    # (pd.mean(series) in the source is not a pandas function; use the Series method)
    mean = series.mean()
    # Return the mean and its categorical rank as a list
    if mean > 90:
        return [mean, 'high']
    if mean > 60:
        return [mean, 'medium']
    return [mean, 'low']
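A possible usage sketch, applying `rank` column-wise with `DataFrame.apply` (the score data below is made up):

import pandas as pd

scores = pd.DataFrame({'exam1': [95, 88, 92],
                       'exam2': [70, 65, 72],
                       'exam3': [50, 55, 45]})
print(scores.apply(rank))  # one [mean, label] pair per column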
Example #3
import pandas as pd

# NOTE: the original definitions of df1 and df2 were truncated in the source;
# they are reconstructed here, with state names assumed from the population figures.
df1 = pd.DataFrame({'State':['California', 'New York'],
                    'Abbreviation':['CA', 'NY']})
df2 = pd.DataFrame({'State':['California', 'New Hampshire', 'New York'],
                    'Population':['39512223', '1359711', '19453561']})

df3 = pd.merge(left=df1, right=df2, on='State', how='inner')
df3

Notice how there are only two rows in the merged dataframe.

We can also be more inclusive: match on the `State` column but retain all rows from both dataframes. This is equivalent to an 'or' join.

df3 = pd.merge(left=df1, right=df2, on='State', how='outer')
df3

This is a very handy way to merge data when you have lots of files with missing data. See Jake Vanderplas's [tutorial](https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.07-Merge-and-Join.ipynb) for a more in-depth overview.
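For instance, a rough sketch of that pattern, chaining outer merges over several dataframes with `functools.reduce` (all of the data here is invented):

from functools import reduce
import pandas as pd

frames = [
    pd.DataFrame({'State': ['CA', 'NY'], 'Population': [39512223, 19453561]}),
    pd.DataFrame({'State': ['NY', 'TX'], 'Income': [72000, 64000]}),
    pd.DataFrame({'State': ['CA', 'TX'], 'Area': [163696, 268596]}),
]
# Chain outer merges so every State appears, with NaN where a file lacked it
merged = reduce(lambda left, right: pd.merge(left, right, on='State', how='outer'), frames)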

## Grouping
We've seen above that it is very easy to summarize data over columns using built-in methods such as `df.mean()`. Sometimes we are interested in summarizing data over different groups of rows. For example, what is the mean of participants in Condition A compared to Condition B?

This is surprisingly easy to compute in pandas using the `groupby` operator, where we aggregate data using a specific operation over different labels.

One useful way to conceptualize this is using the **Split, Apply, Combine** operation (similar to map-reduce).

![](../images/pandas/split-apply-combine.png)

This figure is taken from Jake Vanderplas's tutorial and highlights how input data can be *split* on some key and then an operation such as sum can be *applied* separately to each split. Finally, the results of the applied function for each key can be *combined* into a new data frame.

### Groupby
In this example, we will use the `groupby` operator to split the data based on gender labels and separately calculate the mean for each group.

df.groupby('gender_name').mean()

Other built-in aggregation methods include `.count()`, `.mean()`, `.median()`, `.min()`, `.max()`, `.std()`, `.var()`, and `.sum()`.
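A small self-contained sketch of the split-apply-combine pattern, running several aggregations at once via `.agg()` (the condition data is made up):

import pandas as pd

df = pd.DataFrame({'condition': ['A', 'A', 'B', 'B'],
                   'score': [3.1, 4.5, 6.2, 5.8]})
# Split on condition, apply several aggregations, combine into one table
print(df.groupby('condition')['score'].agg(['mean', 'std', 'count']))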
Example #4
import numpy as np
import pandas as pd

# 3. Explicit construction: build a MultiIndex object directly and pass it as
#    the index argument, or update an existing index via the reindex method.
df = pd.DataFrame(np.random.rand(4, 2),
                  columns=['data1', 'data2'],
                  index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]]))
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])  # from arrays
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])  # from tuples
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])  # Cartesian product

# pop is assumed to be a Series with a two-level row index
pop.index.names = ['state', 'year']  # name the levels of the row MultiIndex

index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])
data = np.round(np.random.randn(4, 6), 1)  # placeholder values; not shown in the source
health_data = pd.DataFrame(data, index=index, columns=columns)

# Aggregation methods on a MultiIndex
health_data.mean(level='year')          # aggregate over a row index level
health_data.mean(axis=1, level='type')  # aggregate over a column index level
health_data
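Note that in recent pandas versions the `level=` keyword on reductions like `mean` has been removed; an equivalent sketch with `groupby` is:

health_data.groupby(level='year').mean()      # over a row index level
health_data.T.groupby(level='type').mean().T  # over a column index level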
# Multi-level indexing on a DataFrame: use .loc (.ix is deprecated).
# Slicing a two-level index that contains Chinese labels may be buggy.
df.loc['a', 1]       # one label from each level
df.loc['a'].loc[1]   # equivalent chained lookup on the second level
df.loc['a']          # all rows under first-level label 'a'
df.loc['a':'b']      # slice over first-level labels (index must be sorted)
df.loc[['a', 'b']]   # list of first-level labels
df.xs(2, level=1)    # all of the first level, label 2 in the second level
df.loc[:, 'data1':'data2']  # slice over column labels

# On a MultiIndexed Series such as pop, plain [] does partial indexing:
pop[:, 2000]         # all states for second-level label 2000
pop[pop > 22000000]  # boolean masking on the values
Example #5
def kilojoules(series):
    # Parameter renamed from `pd`, which shadowed the pandas import.
    # Assumes one mean-power reading (in watts) per second, so the sum of
    # watt-seconds divided by 1000 gives kilojoules.
    return (series.mean() * len(series)) / 1000
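A quick usage sketch (the power readings below are made up):

import pandas as pd

power = pd.Series([250.0, 260.0, 245.0, 255.0])  # watts, one sample per second
print(kilojoules(power))  # (252.5 W * 4 s) / 1000 = 1.01 kJ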

import matplotlib.pyplot as plt

# Look at a scatter plot of the data; check for any erroneous points/outliers
plt.figure()
plt.scatter(<data_raw_1>, <data_raw_2>)
plt.title("Raw Data")
plt.xlabel("X label")
plt.ylabel("Y label")
plt.show()

# Set the bounds on the data set for cleaning; repeat for all columns
# (a chained comparison like a <= x <= b does not work on pandas Series)
mask = (data_raw[column] >= <lower_bound>) & (data_raw[column] <= <upper_bound>)
data_clean[column] = data_raw[column][mask]
data_clean.head()
data_clean.to_pickle("path") #CAUTION!!! Only read pickles that YOU generate!!!!! No Exceptions!

# Now get the mean, std dev, min, and max
# (renamed so the results don't shadow the min/max builtins)
col_mean = data_clean[column].mean()
col_std = data_clean[column].std()
col_min = data_clean[column].min()
col_max = data_clean[column].max()
print("Mean: {:f}, Std Dev: {:f}, Minimum: {:f}, Maximum: {:f}".format(
    col_mean, col_std, col_min, col_max))

# Now generate plots using the clean data and save the figure to a specific directory
plt.figure()
plt.<plot type>(<data_raw_1>, <data_raw_2>) #hist, scatter, plot, box
plt.title("Data") #describe what the plot is
plt.xlabel("X label")
plt.ylabel("Y label")
plt.savefig("path-to-directory.png") # pick a path where you know you'll find it
plt.show()
import numpy as np

# Travel times for runs 1-12 under the original timetable
origin_traveltime = []
for i in leave_time.axes[1][1:13]:
    origin_traveltime.append(travel_time(leave_time, 6, i))

# Travel times under the "hope" (desired) timetable
hope_traveltime = []
for i in leave_time_speed.axes[1][1:13]:
    hope_traveltime.append(travel_time(leave_time_speed, 6, i))

# Travel times under the speed-limited "hope" timetable
limit_hope_traveltime = []
for i in leave_time_speed1.axes[1][1:13]:
    limit_hope_traveltime.append(travel_time(leave_time_speed1, 6, i))

# Mean relative change in travel time vs. the original timetable
np.mean([(x - y) / y for x, y in
         zip(limit_hope_traveltime, origin_traveltime)])

# On average, 39.48% extra travel time is lost.
# With the "hope" timetable, nothing is lost and speed even increases by 9.8%.

# Next, compute how much the headway stability improves ##########


# 1. Headway values (time gaps between consecutive runs)
def head1(data):
    dic = pd.DataFrame()
    h = []
    for j in range(11):
        n = j + 1
        b = 'banci%i' % (n)       # column name for run n
        b1 = 'banci%i' % (n + 1)  # column name for the following run