def init_classifier_fn(self, **kwargs):
    """Build and cache the academic-failure classifier in ``self._clf``.

    Joins per-course, per-semester, per-student and GPA features into a
    single frame, trains a sigmoid-calibrated AdaBoost classifier on them,
    and stores a callable ``self._clf(data) -> [probabilities, quality]``
    where ``quality`` is a heuristic confidence score returned as a string.
    """
    cs_df = self._academic_clusterer.courses_features
    AcademicFailureEstimator.COURSES = cs_df['course'].values
    se_df = self._academic_clusterer.semesters_features
    sf_df = self._academic_clusterer.students_features
    gpa_df = self._academic_clusterer.ha_df.drop_duplicates(['student', 'GPA'])

    # Merge semester + student features, then GPA, then course features.
    ss_df = pd_merge(se_df, sf_df, on='student')
    ss_df = pd_merge(ss_df, gpa_df, on='student')
    ss_df = pd_merge(ss_df, cs_df, on='course')

    data = ss_df.apply(self.get_ss_features, axis=1)
    X = np_array(data.tolist())
    # Label: 1 when 'ha_reprobado' is falsy (never failed), else 0.
    y = ss_df['ha_reprobado'].apply(lambda v: 0 if v else 1).values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=7)

    base = AdaBoostClassifier(random_state=10)
    logreg = CalibratedClassifierCV(base, cv=2, method='sigmoid')
    # BUG fix: the original called logreg.fit(X, y) on the FULL data set and
    # then measured recall on X_test, which the model had already seen
    # (data leakage, inflated recall). Train on the training split only.
    logreg.fit(X_train, y_train)
    logreg_prob = logreg.predict_proba
    logreg_predict = logreg.predict

    y_pred = logreg.predict(X_test)
    recall = recall_score(y_test, y_pred)

    def quality(sample):
        """Heuristic confidence for one sample, blended with held-out recall.

        Returned as ``str`` because downstream consumers expect a string.
        """
        _z_ = logreg_predict(sample)[0]
        same_class = X[y == _z_]
        other_class = X[y == (1 - _z_)]
        # NOTE(review): np_linalg_norm collapses the whole difference matrix
        # to one Frobenius norm (a scalar), so the np_max calls below are
        # no-ops; a per-row norm (axis=1) may have been intended — behavior
        # preserved pending confirmation.
        d = np_linalg_norm([sample] - same_class)
        d_ = np_linalg_norm([sample] - other_class)
        r = np_max(d_) / np_max(d)
        if r > 1:
            r = abs(1 - r)
        r = 0.5 * (r + recall)
        return str(r)

    self._clf = lambda data: [logreg_prob(data), quality(data)]
def mark_duplicates(dataframe):
    """Flag duplicated tender rows and classify each as original or copy.

    A row is a duplicate when (title, publication date, amount) repeats.
    For each duplicate group the min/max of the 'Робот' (robot/source)
    column are computed; groups whose duplicates come from different
    sources are kept and every row is labelled via ``original_or_copy``.
    Returns a new dataframe; helper columns are dropped before returning.
    """
    # Mark every member of a duplicate group (keep=False marks all of them).
    dataframe["is_duplicate"] = dataframe.duplicated(
        subset=["Название тендера и лота", "Опубликован", "Сумма НЦК"],
        keep=False)
    dataframe_duplicates = dataframe.loc[
        dataframe["is_duplicate"],
        ["Название тендера и лота", "Опубликован", "Сумма НЦК", "Робот"]]
    # Per-group minimum of the remaining 'Робот' column.
    gb = dataframe_duplicates.groupby(
        ["Название тендера и лота", "Опубликован", "Сумма НЦК"],
    ).aggregate(min)
    # NOTE(review): index=str converts the MultiIndex labels to strings;
    # the gb["max"] assignment below aligns a groupby result whose index
    # was NOT stringified — this only lines up if the key columns are
    # already strings. Verify against the input schema.
    gb = gb.rename(index=str, columns={"Робот": "min"})
    gb["max"] = dataframe_duplicates.groupby(
        ["Название тендера и лота", "Опубликован", "Сумма НЦК"])['Робот'].max()
    # Keep only groups whose duplicates originate from different sources.
    gb["duplicates_have_different_sources"] = gb.apply(
        duplicates_have_different_sources, axis=1)
    gb = gb[gb["duplicates_have_different_sources"]]
    gb = gb.reset_index()
    # Left-join the group verdicts back onto every row.
    dataframe = pd_merge(
        dataframe, gb,
        on=["Название тендера и лота", "Опубликован", "Сумма НЦК"],
        how='left')
    dataframe["original_or_copy"] = dataframe.apply(original_or_copy, axis=1)
    # Drop helper columns; 'is_duplicate' and 'original_or_copy' remain.
    dataframe.drop(["max", "min", "duplicates_have_different_sources"],
                   axis=1, inplace=True)
    return dataframe
def ah_GPA(ha_df, gpa_df):
    """Attach GPA info to an academic-history frame and derive grade_GPA.

    If ``ha_df`` already carries 'promedio_GPA' it is used as-is; otherwise
    ``gpa_df`` is left-joined on 'student'. Adds a 'grade_GPA' column
    (grade minus GPA), fills missing values with 0, persists the result to
    ./data/kuleuven/ha_df.csv as a side effect, and returns the frame.
    """
    if 'promedio_GPA' in list(ha_df):
        _gpaha_df = ha_df
    else:
        _gpaha_df = pd_merge(ha_df, gpa_df, on='student', how='left')
    _gpaha_df['grade_GPA'] = _gpaha_df['grade'] - _gpaha_df['GPA']
    # BUG fix: fillna returns a new frame; the original discarded the
    # result, so NaNs were never actually filled.
    _gpaha_df = _gpaha_df.fillna(0.)
    # BUG fix: DataFrame.to_csv has no `dtype` parameter — the original
    # passed one, which raises TypeError on current pandas. Removed.
    _gpaha_df.to_csv('./data/kuleuven/ha_df.csv')
    return _gpaha_df
def courses_features_calc(ha_df, population_IDs=None):
    """Compute per-course distribution features from academic histories.

    Groups ``ha_df`` by 'course' and, per course, derives alpha (a
    GPA-weighted grade ratio), beta (mean grade-GPA gap), the skewness of
    grade_GPA (via the R e1071 package), and the record count, then merges
    in course credits from the kuleuven loader.

    Parameters
    ----------
    ha_df : DataFrame with 'course', 'grade', 'GPA' and 'grade_GPA' columns.
    population_IDs : list, optional — kept for backward compatibility;
        unused by the current body.
    """
    global e1071
    # BUG fix: the original used a mutable default argument ([]); use None.
    if population_IDs is None:
        population_IDs = []
    sample_df = ha_df
    skewness = e1071.skewness

    def alpha_calc(chunk):
        # sum(GPA^2) / sum(grade * GPA) for the course's records.
        return (chunk['GPA'].values ** 2).sum() / \
               (chunk['grade'].values * chunk['GPA'].values).sum()

    def beta_calc(chunk):
        # Mean of the grade-minus-GPA gap.
        return (chunk['grade_GPA'].values).sum() / len(chunk)

    def skewness_calc(chunk):
        # Delegates to R's e1071::skewness through rpy2's FloatVector.
        return skewness(FloatVector(chunk['grade_GPA'].values))[0]

    def count_calc(chunk):
        return len(chunk)

    def course_features_record(academic_history):
        cod_materia_acad = academic_history['course'].values[0]
        try:
            # Keep only the code prefix before the first space, if any.
            cod_materia_acad = cod_materia_acad[:cod_materia_acad.index(' ')]
        except Exception:  # narrowed from bare except; no space / non-string
            pass
        return {'course': cod_materia_acad,
                'alpha': alpha_calc(academic_history),
                'beta': beta_calc(academic_history),
                'skewness': skewness_calc(academic_history),
                'count': count_calc(academic_history),
                }

    ha_gb = sample_df.groupby('course')
    abs_df = ha_gb.apply(course_features_record)

    cs_df = kuleuven_loader.cs_df
    cs_df['course'] = cs_df['code'].values
    try:
        # Best-effort: groupby.apply may already return a DataFrame.
        abs_df = DataFrame.from_records(abs_df.tolist())
    except Exception:  # narrowed from bare except; keep best-effort intent
        pass
    abs_df = pd_merge(abs_df, cs_df[['course', 'credits']], on='course')
    return abs_df
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import brier_score_loss, precision_score, recall_score, f1_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection — confirm pinned version.
from sklearn.cross_validation import train_test_split
from numpy import average as np_average
from numpy import array as np_array
from skfuzzy import cmeans, cmeans_predict
from data_loader import kuleuven_loader
from itertools import combinations

# NOTE(review): WSDispatcher, pd_merge and pd_read_csv are used below but not
# imported in this chunk — presumably defined/imported elsewhere; verify.
in_source = "kuleuven"
dispatcher = WSDispatcher(source=in_source)
se_df = dispatcher.academic_clusterer.semesters_features
sf_df = dispatcher.academic_clusterer.students_features
ss_df = pd_merge(se_df, sf_df, on="student")
# cd doc/calibration_test/
# Course-features dump; missing values replaced by a -1000 sentinel.
abs_df = pd_read_csv("../../data/kuleuven/abs_df_1716653621.csv", index_col=0)
abs_df = abs_df.fillna(-1000)
ha_df = pd_read_csv("../../data/kuleuven/students_courses.csv", index_col=0)
ha_df = ha_df.drop_duplicates(["year", "status", "course", "grade", "student"])
# Student histories joined with student features and course features.
sha_df = pd_merge(ha_df, sf_df, on="student")
sha_df = pd_merge(sha_df, abs_df, on="course")
OP = []
OP_append = OP.append


def plot_calibration_curve_from_data(X, y, est, name, fig_index):
    """Plot a calibration curve for ``est`` — body continues beyond this
    chunk; not reviewed here."""