# KNN classification of students by specialization, with accuracy averaged
# over many random train/test splits: first over all four specializations
# at once, then one-vs-rest per specialization.
import window_s_p_ft as win
from sklearn.neighbors import KNeighborsClassifier
# BUG FIX: sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

total_score = 0
stop = 1000  # number of random splits averaged over

# Accuracy over all four specializations at once.
for x in range(stop):
    clf = KNeighborsClassifier()
    data = win.getStudents()
    data_train, data_test = train_test_split(data, test_size=0.2)
    data_train_labels = [s.spec for s in data_train]
    data_test_labels = [s.spec for s in data_test]
    data_train = [s.grades for s in data_train]
    data_test = [s.grades for s in data_test]
    clf.fit(data_train, data_train_labels)
    total_score += clf.score(data_test, data_test_labels)
total_score = total_score / stop
print('all')
print(total_score)

# One-vs-rest accuracy per specialization (positive class sp vs 'NOT ' + sp).
specs = ['FK', 'FM', 'MN', 'OE']
for sp in specs:
    total_score = 0
    for x in range(stop):
        clf = KNeighborsClassifier()
        data = win.getStudents()
        data_train, data_test = train_test_split(data, test_size=0.2)
        data_train_labels = [s.spec if s.spec == sp else 'NOT ' + sp
                             for s in data_train]
        data_test_labels = [s.spec if s.spec == sp else 'NOT ' + sp
                            for s in data_test]
        data_train = [s.grades for s in data_train]
        # NOTE(review): this source chunk appears truncated here — the rest
        # of the inner loop body (fit/score/print) is not visible in this
        # view; confirm against the original file.
# Embed students of one class in 2-D with MDS and build a complete weighted
# graph over them, where edge weights are correlation-derived distances
# between grade vectors.
import networkx as nx
import matplotlib.pyplot as plt
import window_s_p_ft as win
import numpy as np
import math
from sklearn.manifold import MDS

cl = 'L'
data = win.getData(class_=cl)
data = sorted(data, key=lambda s: np.mean(s), reverse=True)
studs = win.getStudents(class_=cl)
studs = sorted(studs, key=lambda s: np.mean(s.grades), reverse=True)

# Pairwise correlation between students (rows are observations per student).
st_corr = np.corrcoef(data, rowvar=1)

# Turn correlations into distances: d = sqrt(2 * (1 - r)), so r = 1 -> d = 0.
mds = MDS(n_components=2, dissimilarity='precomputed')
n_students = len(data)
dists = np.empty((len(st_corr), len(st_corr)))
for ii in range(n_students):
    for jj in range(n_students):
        dists[ii][jj] = math.sqrt(2 * (1 - st_corr[ii][jj]))
pos = mds.fit(dists).embedding_

# Complete graph over students, weighted by the same distance measure.
G = nx.Graph()
G.add_nodes_from(range(n_students))
labels = []
for ii in range(n_students):
    labels.append(str(ii + 1) + " " + str(studs[ii].spec))
    for jj in range(ii + 1, n_students):
        d = math.sqrt(2 * (1 - st_corr[ii][jj]))
        G.add_edge(ii, jj, weight=d)
# plt.ylabel("Log(Explained variance)")
# '''log-log plot'''
# log_x = [math.log(x) for x in range(1, len(pca.explained_variance_)+1)]
# plt.xlabel("Log(no. PCA)")
# plt.ylabel("Log(Explained variance)")
# plt.plot(log_x, log_var, 'o-', label="Log")
# plt.legend(bbox_to_anchor=(1, 1), loc=1, borderaxespad=0.)
# print()
# plt.show()
# '''PCA matrix'''
# log_comp = np.asarray([[math.log(math.fabs(x)) for x in list_] for list_ in pca.components_])
# comp = log_comp.T
# comp = pca.components_.T
students = win.getStudents()

'''correlation student-student'''
# Log of the absolute Pearson correlation between every pair of students'
# grade vectors.
st_corr = np.empty([len(students), len(students)])
for ii, st1 in enumerate(students):
    for jj, st2 in enumerate(students):
        r = pearsonr(st1.grades, st2.grades)[0]  # compute once, reuse below
        try:
            st_corr[ii, jj] = math.log(math.fabs(r))
        except ValueError:
            # r == 0 makes log(|r|) undefined.  BUG FIX: the original left
            # st_corr[ii, jj] UNINITIALISED (np.empty garbage) on this path;
            # store -inf, the limit of log(|r|) as r -> 0, and keep the
            # diagnostic prints.
            st_corr[ii, jj] = -np.inf
            print(r)
            print(st1.grades)
            print(st2.grades)
# courses = win.getCourses()
# '''correlation course-course'''
# co_corr = np.empty([len(courses), len(courses)])
# for ii, c1 in enumerate(courses):
# Pairwise student-student similarity via mutual information between grade
# vectors, plus the index boundaries of each class / specialization group
# (for annotating a heatmap).
import numpy as np
import window_s_p_ft as win
# import window_s_p_foto as win
from scipy.stats import pearsonr
from sklearn.metrics import mutual_info_score
import heatmap
import random

'''OE, MN, FK, FM'''
students = win.getStudents(shuffle=False)

'''correlation student-student'''
st_corr = np.empty([len(students), len(students)])
for ii, st1 in enumerate(students):
    for jj, st2 in enumerate(students):
        # BUG FIX: mutual_info_score returns a plain float, not a tuple;
        # the original indexed it with [0] (a leftover from pearsonr, which
        # returns (r, p-value)) and would raise TypeError at runtime.
        st_corr[ii, jj] = mutual_info_score(st1.grades, st2.grades)

# Record the last index of each class / specialization run, labelled with
# that group's name; students are assumed grouped (shuffle=False above).
cl = students[0].class_
sp = students[0].spec
anno = []
anno2 = []
for ii, st in enumerate(students):
    if st.class_ != cl:
        anno.append((ii - 1, students[ii - 1].class_))
        cl = st.class_
    if st.spec != sp:
        anno2.append((ii - 1, students[ii - 1].spec))
        sp = st.spec
anno.append(((len(students) - 1), students[-1].class_))
anno2.append(((len(students) - 1), students[-1].spec))
anno = anno + anno2
def classify(data=None, clf=None, repeat=10, test_size=0.2, leave=False):
    '''Apply a classification method based on a classifier object `clf`.

    data must be a list of student objects (with .spec and .grades
    attributes); repeat should be an integer and makes the classification
    happen `repeat` times, with results averaged over all repeats;
    test_size is the held-out fraction per random split; leave=True
    switches to leave-one-out validation instead of random splits.

    Returns an OrderedDict of results (accuracy, acc - prevalence, ...).
    '''
    if data is None:
        data = win.getStudents()
    if clf is None:
        clf = LinearDiscriminantAnalysis(solver='lsqr')
    total_score = 0
    stop = repeat
    results = OrderedDict()
    results['method'] = str(clf)
    if leave is False:
        # --- random train/test splits ---
        # Overall accuracy across all four specializations.
        for x in range(stop):
            data_train, data_test = train_test_split(data, test_size=test_size)
            data_train_labels = [s.spec for s in data_train]
            data_test_labels = [s.spec for s in data_test]
            data_train = [s.grades for s in data_train]
            data_test = [s.grades for s in data_test]
            clf.fit(data_train, data_train_labels)
            total_score += clf.score(data_test, data_test_labels)
        total_score = total_score / stop
        results['ACC for all specs'] = round(total_score, 2)
        # One-vs-rest statistics per specialization.
        specs = ['FK', 'FM', 'MN', 'OE']
        for sp in specs:
            total_score = 0
            total_sensitivity = 0
            total_specificity = 0
            total_precision = 0
            total_npv = 0
            total_prevalence = 0
            for x in range(stop):
                data_train, data_test = train_test_split(
                    data, test_size=test_size)
                data_train_labels = [s.spec if s.spec == sp else 'NOT ' + sp
                                     for s in data_train]
                data_test_labels = [s.spec if s.spec == sp else 'NOT ' + sp
                                    for s in data_test]
                data_train = [s.grades for s in data_train]
                data_test = [s.grades for s in data_test]
                clf.fit(data_train, data_train_labels)
                total_score += clf.score(data_test, data_test_labels)
                prediction = clf.predict(data_test)
                tp = 0  # true positives (correctly predicted sp)
                tn = 0  # true negatives (correctly predicted 'NOT ' + sp)
                for ii, d in enumerate(prediction):
                    if d == data_test_labels[ii] and d == sp:
                        tp += 1
                    elif d == data_test_labels[ii] and d != sp:
                        tn += 1
                try:
                    sensitivity = tp / data_test_labels.count(sp)
                except ZeroDivisionError:
                    sensitivity = 0
                try:
                    specificity = tn / data_test_labels.count('NOT ' + sp)
                except ZeroDivisionError:
                    specificity = 0
                # BUG FIX: precision and NPV must be computed from the raw
                # TP / TN counts.  The original normalised sensitivity and
                # specificity first and then divided those RATES by the
                # prediction counts, yielding meaningless values.
                try:
                    precision = tp / prediction.tolist().count(sp)
                except ZeroDivisionError:
                    precision = 0
                try:
                    npv = tn / prediction.tolist().count('NOT ' + sp)
                except ZeroDivisionError:
                    npv = 0
                prevalence = data_test_labels.count(sp) / len(data_test_labels)
                total_sensitivity += sensitivity
                total_specificity += specificity
                total_precision += precision
                total_npv += npv
                total_prevalence += prevalence
            total_score = total_score / stop
            total_sensitivity = total_sensitivity / stop
            total_specificity = total_specificity / stop
            total_precision = total_precision / stop
            total_npv = total_npv / stop
            total_prevalence = total_prevalence / stop
            # results[sp + ' accuracy: '] = total_score
            # results[sp + ' sensitivity: '] = total_sensitivity
            # results[sp + ' specificity: '] = total_specificity
            # results[sp + ' precision: '] = total_precision
            # results[sp + ' negative predictive value: '] = total_npv
            # Report the improvement over the trivial majority-class
            # classifier, whose accuracy is max(prevalence, 1 - prevalence).
            results[sp + ' acc - prevalence: '] = round(
                total_score - max(total_prevalence, 1 - total_prevalence), 2)
    else:
        # --- leave-one-out validation ---
        # NOTE(review): LeaveOneOut(n=...) is the pre-0.18 scikit-learn API,
        # and clf.predict() is fed a 1-D grade vector; modern scikit-learn
        # requires LeaveOneOut().split(X) and a 2-D [data_test] — confirm
        # against the scikit-learn version pinned for this project.
        for x in range(stop):
            loo = LeaveOneOut(n=len(data))
            for train_index, test_index in loo:
                data_train = [data[ii] for ii in train_index]
                data_test = data[test_index[0]]
                data_train_labels = [s.spec for s in data_train]
                data_test_labels = data_test.spec
                data_train = [s.grades for s in data_train]
                data_test = data_test.grades
                clf.fit(data_train, data_train_labels)
                if clf.predict(data_test)[0] == data_test_labels:
                    total_score += 1
        total_score = total_score / stop / len(loo)
        results['ACC for all specs'] = round(total_score, 2)
        specs = ['FK', 'FM', 'MN', 'OE']
        for sp in specs:
            total_score = 0
            total_prevalence = 0
            for x in range(stop):
                loo = LeaveOneOut(n=len(data))
                for train_index, test_index in loo:
                    data_train = [data[ii] for ii in train_index]
                    data_test = data[test_index[0]]
                    data_train_labels = [s.spec if s.spec == sp
                                         else 'NOT ' + sp
                                         for s in data_train]
                    data_test_labels = (data_test.spec
                                        if data_test.spec == sp
                                        else 'NOT ' + sp)
                    data_train = [s.grades for s in data_train]
                    data_test = data_test.grades
                    # BUG FIX: the original called clf.predict() BEFORE
                    # clf.fit(), so every fold was scored with the model
                    # fitted on the PREVIOUS fold's training data.  Fit
                    # first, then predict.
                    clf.fit(data_train, data_train_labels)
                    prediction = clf.predict(data_test)
                    if prediction[0] == data_test_labels:
                        total_score += 1
                    if data_test_labels == sp:
                        total_prevalence += 1
            total_score = total_score / stop / len(loo)
            total_prevalence = total_prevalence / stop / len(loo)
            # results[sp + ' accuracy: '] = round(total_score, 2)
            results[sp + ' acc - prevalence: '] = round(
                total_score - max(total_prevalence, 1 - total_prevalence), 2)
    return results
clf=LinearDiscriminantAnalysis(solver='lsqr'), leave=leave), classify(data=data, repeat=repeat, test_size=test_size, clf=DecisionTreeClassifier(), leave=leave), classify(data=data, repeat=repeat, test_size=test_size, clf=KNeighborsClassifier(n_neighbors=5, weights='uniform'), leave=leave)] for cl in classified: print("") for x, y in cl.items(): print(x, y) repeat = 1000 test_size = 0.25 studs = win.getStudents(spare=True) nustuds = [] for s in studs: if None in s.grades or 0 in s.grades or np.mean(s.grades) < 3.0: pass else: nustuds.append(s) # data = win.getData() # pca = KernelPCA(n_components=None, kernel='sigmoid') # data = pca.fit_transform(data) # mle -> n_components_ = 12 # # data = np.corrcoef(data) # for ii, s in enumerate(studs): # s.grades = data[ii] run(nustuds, test_size, repeat, False)