Beispiel #1
0
def test_series_std(ddof):
    """Compare ``Series.std`` with the ``std`` of its ``to_pandas()`` conversion.

    ``ddof`` is forwarded unchanged to both ``std`` calls, and the two results
    must agree according to ``np.testing.assert_approx_equal``.
    """
    np.random.seed(0)  # fixed seed keeps the sample reproducible
    samples = np.random.random(100) - 0.5
    series = Series(samples)
    converted = series.to_pandas()
    actual = series.std(ddof=ddof)
    reference = converted.std(ddof=ddof)
    np.testing.assert_approx_equal(reference, actual)
def standardize(data):
    """Return ``data`` with its mean removed, divided by its standard deviation.

    The mean removal is delegated to ``subtract_mean``; the result is then
    divided by the value returned by its own ``std()`` method.

    NOTE(review): the previous ``pd.std(c_data)`` call fails with
    AttributeError if ``pd`` is pandas, because pandas exposes no top-level
    ``std`` function. The method form assumes ``subtract_mean`` returns an
    object with a ``std()`` method (pandas Series/DataFrame or NumPy array).
    pandas defaults to ``ddof=1`` while NumPy defaults to ``ddof=0`` --
    confirm which is intended.
    """
    centered = subtract_mean(data)
    return centered / centered.std()
# Presumably displays a figure built earlier; that plotting code is not visible here.
plt.show()

# Scatter-plot the raw data to look for erroneous points or outliers before cleaning.
# <data_raw_1> and <data_raw_2> are template placeholders: substitute the two
# raw data sequences to compare before running this section.
plt.figure()
plt.scatter(<data_raw_1>, <data_raw_2>)
plt.title("Raw Data")
plt.xlabel("X label")
plt.ylabel("Y label")
plt.show()

#set the bounds on the data set for cleaning and repeat for all columns
data_clean[column] = data_raw[column][<lower_bound> <= data_raw[column] <= <upper_bound>]
data_clean.head()
data_clean.to_pickle("path") #CAUTION!!! Only read pickles that YOU generate!!!!! No Exceptions!

# Compute the average, standard deviation, minimum and maximum of the cleaned column.
# pandas has no top-level mean/std/min/max functions, so use the methods on the
# column itself (assumes data_clean[column] is a pandas Series -- confirm).
# Series.std() uses ddof=1 (sample standard deviation) by default.
cleaned = data_clean[column]
mean = cleaned.mean()
std = cleaned.std()
# Named minimum/maximum so the builtin min() and max() are not shadowed.
minimum = cleaned.min()
maximum = cleaned.max()
# Format before printing: print() returns None, so calling .format() on its
# result fails, and the %f specifiers need %-formatting rather than str.format.
print("Mean: %f, Stand Dev: %f, Minimum: %f, Maximum: %f" % (mean, std, minimum, maximum))

#now generate plots using clean data and save fig spec dir
plt.figure()
plt.<plot type>(<data_raw_1>, <data_raw_2>) #hist, scatter, plot, box
plt.title("Data") #describe what the plot is
plt.xlabel("X label")
plt.ylabel("Y label")
plt.savefig("path-to-directory.png") #pick a path that you know you'll find it
plt.show()