names=['user', 'mark'],
    index_col=0)
# Partition exam-2 results into pass (mark >= 40) and fail (mark < 40) cohorts.
ex2_excellent1 = examResults2[examResults2['mark'] >= 40]
ex2_weak1 = examResults2[examResults2['mark'] < 40]

# Load final exam-3 marks: space-separated "user mark" rows, user id as index.
examResults3 = pd.read_csv(
    '2020-06-29-einstein/marks/2020-ca116-continuous-assessment/2020-ca116-ex3-final.txt',
    sep=' ',
    header=None,
    names=['user', 'mark'],
    index_col=0)
# Same pass/fail partition for exam 3.
ex3_excellent1 = examResults3[examResults3['mark'] >= 40]
ex3_weak1 = examResults3[examResults3['mark'] < 40]

# Filter the 2019 event log down to lecture-related activity.
# NOTE(review): lectureList is joined into a regex alternation — this assumes
# lecture page names contain no regex metacharacters; verify upstream.
lectureList = dataProcessing.getLectureList(ca116_eventLog_2019, ['html|py'])
eventLog_ca116_filtered = ca116_eventLog_2019.loc[
    ca116_eventLog_2019['description'].str.contains('|'.join(lectureList))]

# Drop low-information click events.
eventLog_ca116_filtered = eventLog_ca116_filtered.drop(
    eventLog_ca116_filtered.loc[eventLog_ca116_filtered['concept:name'].isin(
        ['click-0', 'click-1', 'click-2'])].index)

# Tag each event with a coarse page type. The dots are escaped so '.html' and
# '.web' match literal extensions — the original unescaped pattern treated '.'
# as "any character" and could match unintended descriptions.
eventLog_ca116_filtered.loc[
    eventLog_ca116_filtered['description'].str.contains(r'\.html|\.web'),
    'pageType'] = 'Read_Lecture_Note'
eventLog_ca116_filtered.loc[
    eventLog_ca116_filtered['description'].str.contains('correct|incorrect'),
    'pageType'] = 'Exercise'
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Silence all warnings globally (this also hides pandas SettingWithCopyWarning
# raised by the slice assignments below — NOTE(review): consider scoping this).
warnings.filterwarnings("ignore")
# sns.set()
# sns.set_style("whitegrid", {"axes.facecolor": ".9"})
import os
# Make the Graphviz binaries discoverable on Windows (used for graph rendering).
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

# Load the raw CA116 event log and normalise it for analysis.
eventLog_ca116 = pd.read_csv('ca116_eventLog_nonfixed.csv')

# Remove one known-bad record (row label 1160345).
eventLog_ca116 = eventLog_ca116.drop(index=[1160345])

# Parse the timestamp column into proper datetimes.
eventLog_ca116['time:timestamp'] = pd.to_datetime(
    eventLog_ca116['time:timestamp'])

# Discard the auto-generated "Unnamed: N" columns left behind by read_csv.
unnamed_cols = eventLog_ca116.columns.str.contains('^Unnamed')
eventLog_ca116 = eventLog_ca116.loc[:, ~unnamed_cols]

# Filter the loaded event log down to lecture-related activity.
# NOTE(review): lectureList is joined into a regex alternation — this assumes
# lecture page names contain no regex metacharacters; verify upstream.
lectureList = dataProcessing.getLectureList(eventLog_ca116, ['html|py'])
eventLog_ca116_filtered = eventLog_ca116.loc[
    eventLog_ca116['description'].str.contains('|'.join(lectureList))]

# Drop low-information click events.
eventLog_ca116_filtered = eventLog_ca116_filtered.drop(
    eventLog_ca116_filtered.loc[eventLog_ca116_filtered['concept:name'].isin(
        ['click-0', 'click-1', 'click-2'])].index)

# Tag each event with a coarse page type. The dots are escaped so '.html' and
# '.web' match literal extensions — the original unescaped pattern treated '.'
# as "any character" and could match unintended descriptions.
eventLog_ca116_filtered.loc[
    eventLog_ca116_filtered['description'].str.contains(r'\.html|\.web'),
    'pageType'] = 'Read_Lecture_Note'
eventLog_ca116_filtered.loc[
    eventLog_ca116_filtered['description'].str.contains('correct|incorrect'),
    'pageType'] = 'Exercise'
eventLog_ca116_filtered.loc[eventLog_ca116_filtered['description'].str.
# Ejemplo n.º 3
# (scraping artifact: the preceding statement is truncated mid-expression)
def predictionRandomForestProbability(dfActivityMatrixWithResult,
                                      excellentList,
                                      weakList,
                                      practice,
                                      lectureList=None,
                                      pvalue=0.05):
    """Train a random-forest pass/fail classifier on per-user activity counts.

    Parameters
    ----------
    dfActivityMatrixWithResult : event-log DataFrame with a 'description' column.
    excellentList, weakList : index labels (users) of passing / failing students.
    practice : practice-metrics DataFrame. Currently unused — the merge that
        consumed it is commented out; kept for interface compatibility.
    lectureList : optional list of lecture-page names used to filter the log.
        Derived from the log itself when omitted or empty.
    pvalue : significance threshold for feature selection. Currently unused —
        the Pearson-correlation filter is commented out; kept for compatibility.

    Returns
    -------
    list of [metricsResult, feature_imp, clf, LogPageactivityCountByUser,
    testingData], or the string 'No columns correlated with result_exam_1'
    when no feature columns survive selection.
    """
    # None (not []) as default avoids the shared-mutable-default pitfall;
    # an omitted/empty list means "derive the lecture list from the log".
    if not lectureList:
        lectureList = dataProcessing.getLectureList(dfActivityMatrixWithResult)

    # Keep only events on lecture pages.
    # NOTE(review): lectureList is joined into a regex alternation — assumes
    # page names contain no regex metacharacters.
    logPage = dfActivityMatrixWithResult.loc[
        dfActivityMatrixWithResult['description'].str.contains(
            '|'.join(lectureList))]

    # Per-user activity-count matrix; missing activities count as 0.
    LogPageactivityCountByUser = FCAMiner.activityDataMatrixContruct(logPage)
    LogPageactivityCountByUser = LogPageactivityCountByUser.fillna(0)

    # Label the two cohorts. .copy() makes the slices independent frames so
    # the column assignments below don't trigger SettingWithCopyWarning.
    excellent_LogPageActivityCountByUser = LogPageactivityCountByUser.loc[
        LogPageactivityCountByUser.index.isin(excellentList)].copy()
    weak_LogPageActivityCountByUser = LogPageactivityCountByUser.loc[
        LogPageactivityCountByUser.index.isin(weakList)].copy()
    excellent_LogPageActivityCountByUser['result_exam_1'] = 1
    weak_LogPageActivityCountByUser['result_exam_1'] = 0
    LogPageactivityCountByUser = pd.concat([
        excellent_LogPageActivityCountByUser, weak_LogPageActivityCountByUser
    ])

    # Feature selection: currently every column except the label and 'click-3'.
    # (The p-value / Pearson-correlation filter is disabled; `pvalue` is
    # retained only for backward compatibility.)
    colSelection = [
        col for col in LogPageactivityCountByUser.columns
        if col not in ['result_exam_1', 'click-3']
    ]
    if len(colSelection) == 0:
        return 'No columns correlated with result_exam_1'

    X = LogPageactivityCountByUser[colSelection]
    y = LogPageactivityCountByUser['result_exam_1']
    # 80% training / 20% test split, fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=1)

    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)
    # Threshold the positive-class probability at 0.5 to get hard labels.
    y_pred = make_decision(y_pred_proba, 0.5)

    # BUG FIX: the original assignments ended with stray trailing commas,
    # which wrapped accuracy/f1/precision in 1-tuples; store plain floats.
    accuracy_score = metrics.accuracy_score(y_test, y_pred)
    f1_score = metrics.f1_score(y_test, y_pred)
    precision_score = metrics.precision_score(y_test, y_pred)
    recall_score = metrics.recall_score(y_test, y_pred)
    roc_auc = metrics.roc_auc_score(y_test, y_pred)

    metricsResult = {
        'accuracy_score': accuracy_score,
        'f1_score': f1_score,
        'precision_score': precision_score,
        'recall_score': recall_score,
        'roc_auc': roc_auc
    }

    # Feature importances, most influential first.
    feature_imp = pd.Series(clf.feature_importances_,
                            index=X.columns).sort_values(ascending=False)

    # Per-test-case predictions with both class probabilities for inspection.
    testingData = pd.DataFrame(
        columns=['y_test', 'y_pred', 'y_pred_proba_0', 'y_pred_proba_1'])
    testingData['y_test'] = y_test
    testingData['y_pred'] = y_pred
    testingData['y_pred_proba_0'] = y_pred_proba[:, 0]
    testingData['y_pred_proba_1'] = y_pred_proba[:, 1]
    return [
        metricsResult, feature_imp, clf, LogPageactivityCountByUser,
        testingData
    ]