def init_classifier_fn(self, **kwargs):
        cs_df = self._academic_clusterer.courses_features
        AcademicFailureEstimator.COURSES = cs_df['course'].values
        
        se_df = self._academic_clusterer.semesters_features
        sf_df = self._academic_clusterer.students_features
        gpa_df = self._academic_clusterer.ha_df.drop_duplicates(['student','GPA'])
        ss_df = pd_merge( se_df, sf_df, on='student' )
        ss_df = pd_merge( ss_df, gpa_df, on='student' )
        ss_df = pd_merge( ss_df, cs_df, on='course' )
        
        data = ss_df.apply( self.get_ss_features, axis=1 )
        data = np_array( data.tolist() )
        X = data
        y = ss_df['ha_reprobado'].apply(lambda x: 0 if x else 1).values

        # H = np_unique( X[:,0] )
        # H = np_array( [ H, np_zeros( len(H) ) ] ).T
        # l = np_ones( len( H ) )
        # X = np_append( X, H, axis=0)
        # y = np_append( y, l )

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.30,
                                                            random_state=7)

        # logreg = LogisticRegression(random_state=7)
        logreg = AdaBoostClassifier(random_state=10)
        logreg = CalibratedClassifierCV(logreg, cv=2, method='sigmoid')
        # logreg = GaussianNB()
        # fit on the training split only, so the held-out recall computed
        # below is not inflated by evaluating on data the model has seen
        logreg.fit(X_train, y_train)
        logreg_prob = logreg.predict_proba
        logreg_predict = logreg.predict

        y_pred = logreg.predict(X_test)
        recall = recall_score(y_test, y_pred)

        def quality(data):
            # distance-based confidence heuristic: compare the sample's
            # distances to the predicted class against the opposite class
            _z_ = logreg_predict(data)[0]
            sample = X[y == _z_]
            sample_ = X[y == (1 - _z_)]
            # per-row Euclidean distances (axis=1), not one matrix norm
            d = np_linalg_norm(np_array(data) - sample, axis=1)
            d_ = np_linalg_norm(np_array(data) - sample_, axis=1)
            r = np_max(d_) / np_max(d)
            # r = np_mean( d )/np_mean( d_ )
            # r = np_min( d )/np_min( d_ )
            if r > 1:
                r = abs(1 - r)
            r = 0.5 * (r + recall)  # blend the heuristic with held-out recall
            return str(r)
        
        clf = lambda data: [ logreg_prob( data ), quality(data) ]
        self._clf = clf
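# Minimal usage sketch (hypothetical names: `estimator` is an instance whose
# init_classifier_fn has already run, `row_features` a single feature row
# shaped like the output of get_ss_features). The stored classifier returns
# a pair: calibrated class probabilities and the quality string built above.
#
#   row_features = np_array([ss_df.apply(estimator.get_ss_features, axis=1).iloc[0]])
#   probabilities, quality_score = estimator._clf(row_features)
#   probabilities   # predict_proba output, e.g. array([[0.21, 0.79]])
#   quality_score   # distance heuristic blended with held-out recall, as a str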
        """
Esempio n. 2
0
def mark_duplicates(dataframe):
    """Flag duplicated tender rows and mark each group's original vs. copies.

    Rows count as duplicates when they share tender/lot name ("Название
    тендера и лота"), publication date ("Опубликован") and amount ("Сумма
    НЦК"); "Робот" identifies the source of each row.
    """
    dataframe["is_duplicate"] = dataframe.duplicated(
        subset=["Название тендера и лота", "Опубликован", "Сумма НЦК"],
        keep=False)
    dataframe_duplicates = dataframe.loc[
        dataframe["is_duplicate"],
        ["Название тендера и лота", "Опубликован", "Сумма НЦК", "Робот"]]

    gb = dataframe_duplicates.groupby(
        ["Название тендера и лота", "Опубликован", "Сумма НЦК"]).aggregate(min)

    gb = gb.rename(index=str, columns={"Робот": "min"})
    gb["max"] = dataframe_duplicates.groupby(
        ["Название тендера и лота", "Опубликован",
         "Сумма НЦК"])['Робот'].max()
    gb["duplicates_have_different_sources"] = gb.apply(
        duplicates_have_different_sources, axis=1)
    gb = gb[gb["duplicates_have_different_sources"]]
    gb = gb.reset_index()
    dataframe = pd_merge(
        dataframe,
        gb,
        on=["Название тендера и лота", "Опубликован", "Сумма НЦК"],
        how='left')
    dataframe["original_or_copy"] = dataframe.apply(original_or_copy, axis=1)
    dataframe.drop(["max", "min", "duplicates_have_different_sources"],
                   axis=1,
                   inplace=True)
    return dataframe
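# The two helpers mark_duplicates relies on are not defined in this snippet;
# below is a plausible stand-in sketch, not the original code: a duplicate
# group spans different sources when its min and max "Робот" values differ,
# and the row whose "Робот" equals the group minimum is treated as original.
def duplicates_have_different_sources(row):
    return row["min"] != row["max"]


def original_or_copy(row):
    if row["duplicates_have_different_sources"] is not True:
        return "original"  # not part of a multi-source duplicate group
    return "original" if row["Робот"] == row["min"] else "copy"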
def ah_GPA(ha_df, gpa_df):
    # If GPA-derived columns are already present, return the frame as-is;
    # otherwise join each student's GPA and derive the grade-minus-GPA feature.
    if 'promedio_GPA' in list(ha_df):
        _gpaha_df = ha_df
    else:
        _gpaha_df = pd_merge(ha_df, gpa_df, on='student', how='left')
        _gpaha_df['grade_GPA'] = _gpaha_df['grade'] - _gpaha_df['GPA']
        _gpaha_df = _gpaha_df.fillna(0.)  # fillna returns a copy; assign it back
        # DataFrame.to_csv takes no dtype argument; just persist the frame
        _gpaha_df.to_csv('./data/kuleuven/ha_df.csv')
    return _gpaha_df
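# Tiny worked example with made-up data: a grade of 14 from a student whose
# GPA is 12 gives grade_GPA = 14 - 12 = 2, i.e. two points above the
# student's own average.
#
#   ha = DataFrame({'student': [1], 'grade': [14.0]})
#   gpa = DataFrame({'student': [1], 'GPA': [12.0]})
#   ah_GPA(ha, gpa)['grade_GPA'].iloc[0]   # -> 2.0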
def courses_features_calc(ha_df, population_IDs=[]):
    global e1071  # rpy2 handle to the R `e1071` package, provides skewness()
    sample_df = ha_df
    skewness = e1071.skewness
    
    def alpha_calc(chunk):
        # reciprocal of the through-origin least-squares slope of grade on GPA
        alpha = (chunk['GPA'].values ** 2).sum() / (chunk['grade'].values * chunk['GPA'].values).sum()
        return alpha

    def beta_calc(chunk):
        # mean deviation of grades from the students' own GPAs in this course
        beta = chunk['grade_GPA'].values.sum() / len(chunk)
        return beta

    def skewness_calc(chunk):
        # skewness of the grade-GPA deviations, via R's e1071 through rpy2
        _skewness = skewness(FloatVector(chunk['grade_GPA'].values))
        return _skewness[0]

    def count_calc(chunk):
        _count = len(chunk)
        return _count
        
    def course_features_record(academic_history):
        cod_materia_acad = academic_history['course'].values[0]
        try:
            # keep only the course code before the first space, if there is one
            cod_materia_acad = cod_materia_acad[:cod_materia_acad.index(' ')]
        except ValueError:
            pass
        tmp = {'course': cod_materia_acad,
               'alpha': alpha_calc( academic_history ),
               'beta': beta_calc( academic_history ),
               'skewness': skewness_calc( academic_history ),
               'count': count_calc( academic_history ),
               }
        return tmp
    
    ha_gb = sample_df.groupby('course')
    abs_df = ha_gb.apply( course_features_record )
    cs_df = kuleuven_loader.cs_df
    cs_df['course'] = cs_df['code'].values
    # cs_df.info()
    try:
        abs_df = DataFrame.from_records(abs_df.tolist())
    except AttributeError:
        # groupby.apply may already return a DataFrame, which has no tolist()
        pass
    abs_df = pd_merge( abs_df, cs_df[['course','credits']], on='course' )
    return abs_df
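# Alternative skewness without the R bridge (a sketch, not the original code):
# SciPy's skew can stand in for e1071's. e1071::skewness defaults to its
# "type 3" estimator, which rescales SciPy's biased g1 by ((n - 1) / n) ** 1.5.
from scipy.stats import skew

def skewness_calc_scipy(chunk):
    values = chunk['grade_GPA'].values
    n = float(len(values))
    return skew(values, bias=True) * ((n - 1) / n) ** 1.5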
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.metrics import brier_score_loss, precision_score, recall_score, f1_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
from numpy import max as np_max
from numpy.linalg import norm as np_linalg_norm
from pandas import merge as pd_merge
from pandas import read_csv as pd_read_csv
from pandas import DataFrame
from numpy import average as np_average
from numpy import array as np_array
from skfuzzy import cmeans, cmeans_predict
from data_loader import kuleuven_loader
from itertools import combinations

in_source = "kuleuven"
dispatcher = WSDispatcher(source=in_source)
se_df = dispatcher.academic_clusterer.semesters_features
sf_df = dispatcher.academic_clusterer.students_features
ss_df = pd_merge(se_df, sf_df, on="student")

# cd doc/calibration_test/

abs_df = pd_read_csv("../../data/kuleuven/abs_df_1716653621.csv", index_col=0)
abs_df = abs_df.fillna(-1000)  # sentinel value marking missing course features
ha_df = pd_read_csv("../../data/kuleuven/students_courses.csv", index_col=0)
ha_df = ha_df.drop_duplicates(["year", "status", "course", "grade", "student"])
sha_df = pd_merge(ha_df, sf_df, on="student")
sha_df = pd_merge(sha_df, abs_df, on="course")

OP = []
OP_append = OP.append


def plot_calibration_curve_from_data(X, y, est, name, fig_index):