forked from bognev/abbakumov
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lec5.py
86 lines (73 loc) · 2.64 KB
/
lec5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from scipy import stats
# Outcomes of a hypothesis test:
#
#                        hypothesis true    hypothesis false
#   hypothesis accepted         +            type II error
#   hypothesis rejected    type I error            +
#
# The significance level alpha (commonly 0.05, 0.01 or 0.005) is an upper bound
# on the frequency of type I errors: on average it allows about 1 false
# rejection per 20, 100 or 200 tests respectively.
def two_histograms(x, y):
    """Draw two overlaid, semi-transparent histograms on one new figure.

    Each observation is weighted by ``1 / len(sample)``, so bar heights are
    fractions of the sample rather than raw counts, which keeps samples of
    different sizes comparable.

    Args:
        x: First sample (any sequence accepted by ``Axes.hist``). If it has a
            ``name`` attribute (e.g. a ``pandas.Series``), that name is used
            in the legend.
        y: Second sample, treated the same way as ``x``.

    Returns:
        None. The figure is created and drawn as a side effect.
    """
    x_label = str(getattr(x, "name", None) or "x")
    y_label = str(getattr(y, "name", None) or "y")
    if x_label == y_label:
        # Identical names (e.g. two slices of the same column) would make the
        # legend useless, so fall back to the generic labels.
        x_label, y_label = "x", "y"

    fig, ax = plt.subplots(figsize=(5, 3))
    for sample, label in ((x, x_label), (y, y_label)):
        size = len(sample)
        # An empty sample gets no weights; this avoids dividing by zero.
        weights = np.full(size, 1.0 / size) if size else None
        ax.hist(sample, alpha=0.5, weights=weights, label=label)
    # Without a legend the labels passed to hist() are never displayed.
    ax.legend()
    plt.tight_layout()
    plt.draw()
# Town data: test the log-transformed population column for normality
# (Shapiro-Wilk) and plot the raw and log10 distributions one above the other.
df = pd.read_csv("./Shad_Python_06_2/town_1959_2.csv", encoding='cp1251')
print(df.tail())
df = df.set_index('номер')  # 'номер' is Russian for "number"
population = df['население']  # 'население' is Russian for "population"
res = stats.shapiro(np.log(population))
print('p-value: ', res[1])
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(5, 3))
raw_axis, log_axis = ax
raw_axis.hist(population, bins=100)
log_axis.hist(np.log10(population), bins=50)
plt.tight_layout()
plt.draw()
# Albuquerque home prices: compare PRICE between corner lots (COR == 1) and
# non-corner lots (COR == 0) with the Mann-Whitney U test.
df = pd.read_csv("./Shad_Python_06_2/Albuquerque/Albuquerque Home Prices_data.txt", sep='\t')
print(df.head())
# -9999 is treated as the missing-value marker and converted to NaN.
df.replace(-9999, np.nan, inplace=True)
print(df.head())
# Drop missing prices so they neither distort the histogram weights nor
# propagate NaN into the test statistic.
x = df.loc[df['COR'] == 1, 'PRICE'].dropna().rename('corner')
y = df.loc[df['COR'] == 0, 'PRICE'].dropna().rename('not corner')
two_histograms(x, y)
# Request the two-sided alternative explicitly: the default changed between
# SciPy releases, so relying on it makes the p-value version-dependent.
res = stats.mannwhitneyu(x, y, alternative='two-sided')
print('p-value: ', res[1])
# Age-at-death data: compare the 'age' column between the groups labelled
# 'sovr' and 'aris'.
# The separator is a regular expression, so it must be a raw string; a plain
# '\s+' is an invalid escape sequence and triggers a SyntaxWarning on recent
# Python versions.
df = pd.read_csv("./Shad_Python_06_2/agedeath.dat.txt", sep=r'\s+', header=None, names=['group', 'age', 'index'])
print(df.head())
# Name each sample after its group, as the other sections do for their samples.
x = df.loc[df['group'] == 'sovr', 'age'].rename('sovr')
y = df.loc[df['group'] == 'aris', 'age'].rename('aris')
two_histograms(x, y)
# Fligner-Killeen test for equality of variances between the two groups.
res = stats.fligner(x, y)
print('p-value: ', res[1])
# Welch's t-test (no equal-variance assumption) for equality of means.
res = stats.ttest_ind(x, y, equal_var=False)
print('p-value: ', res[1])
# Interference data: compare the 'DiffCol' and 'Black' columns.
# NOTE(review): both columns come from the same rows; if they are paired
# measurements, a paired test (e.g. stats.ttest_rel) may be more appropriate
# than the independent-samples test used here -- confirm against the data set.
df = pd.read_csv("./Shad_Python_06_2/interference.csv")
print(df.head())
x, y = df['DiffCol'], df['Black']
x.name = 'DiffCol'
y.name = 'Black'
two_histograms(x, y)

# Fligner-Killeen test: do the two columns have equal variances?
res = stats.fligner(x, y)
print('p-value: ', res[1])

# Welch's t-test: do the two columns have equal means?
res = stats.ttest_ind(x, y, equal_var=False)
print('p-value: ', res[1])
# Chi-square test of independence on a 2x2 table of counts:
# rows are 'city' / 'country', columns are 'for' / 'against'.
contingency_table = pd.DataFrame(
    {'for': [28, 20], 'against': [72, 80]},
    index=['city', 'country'],
)
print(contingency_table)
res = stats.chi2_contingency(contingency_table)
print('p-value: ', res[1])
# Albuquerque home prices: scatter PRICE against SQFT, then report the Pearson
# correlation of PRICE and of TAX with SQFT.
df = pd.read_csv("./Shad_Python_06_2/Albuquerque/Albuquerque Home Prices_data.txt", sep='\t')
# -9999 is treated as the missing-value marker and converted to NaN.
df.replace(-9999, np.nan, inplace=True)
print(df.head())
fig, ax = plt.subplots()
ax.scatter(df['PRICE'], df['SQFT'])
ax.set_xlabel('PRICE')
ax.set_ylabel('SQFT')
plt.draw()
for column in ('PRICE', 'TAX'):
    # pearsonr does not skip NaN, so drop rows where either value of the
    # pair is missing; otherwise the coefficient itself comes out as NaN.
    pair = df[[column, 'SQFT']].dropna()
    res = stats.pearsonr(pair[column], pair['SQFT'])
    print('pearson rho: ', res[0])
    print('p-value: ', res[1])
# Block until all figures drawn so far have been closed.
plt.show()