names=['user', 'mark'], index_col=0) ex2_excellent1 = examResults2.loc[examResults2['mark'] >= 40] ex2_weak1 = examResults2.loc[examResults2['mark'] < 40] examResults3 = pd.read_csv( '2020-06-29-einstein/marks/2020-ca116-continuous-assessment/2020-ca116-ex3-final.txt', delimiter=' ', header=None, names=['user', 'mark'], index_col=0) ex3_excellent1 = examResults3.loc[examResults3['mark'] >= 40] ex3_weak1 = examResults3.loc[examResults3['mark'] < 40] #filtered log by activity Type lectureList = dataProcessing.getLectureList(ca116_eventLog_2019, ['html|py']) eventLog_ca116_filtered = ca116_eventLog_2019.loc[ ca116_eventLog_2019['description'].str.contains('|'.join(lectureList))] # ex1_personal_log_1 = dataProcessing.addConceptPageToLog(ex1_personal_log_1) # eventLog_ca116_filtered = eventLog_ca116_filtered.drop(eventLog_ca116_filtered.loc[eventLog_ca116_filtered['description'].str.contains('http|report|ex|dashboard|graphs.html')].index) eventLog_ca116_filtered = eventLog_ca116_filtered.drop( eventLog_ca116_filtered.loc[eventLog_ca116_filtered['concept:name'].isin( ['click-0', 'click-1', 'click-2'])].index) eventLog_ca116_filtered.loc[ eventLog_ca116_filtered['description'].str.contains('.html|.web'), 'pageType'] = 'Read_Lecture_Note' eventLog_ca116_filtered.loc[ eventLog_ca116_filtered['description'].str.contains('correct|incorrect'), 'pageType'] = 'Exercise'
from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler warnings.filterwarnings("ignore") # sns.set() # sns.set_style("whitegrid", {"axes.facecolor": ".9"}) import os os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/' eventLog_ca116 = pd.read_csv('ca116_eventLog_nonfixed.csv') eventLog_ca116 = eventLog_ca116.drop([1160345]) eventLog_ca116['time:timestamp'] = pd.to_datetime( eventLog_ca116['time:timestamp']) eventLog_ca116 = eventLog_ca116.loc[:, ~eventLog_ca116.columns.str. contains('^Unnamed')] lectureList = dataProcessing.getLectureList(eventLog_ca116, ['html|py']) eventLog_ca116_filtered = eventLog_ca116.loc[ eventLog_ca116['description'].str.contains('|'.join(lectureList))] # ex1_personal_log_1 = dataProcessing.addConceptPageToLog(ex1_personal_log_1) # eventLog_ca116_filtered = eventLog_ca116_filtered.drop(eventLog_ca116_filtered.loc[eventLog_ca116_filtered['description'].str.contains('http|report|ex|dashboard|graphs.html')].index) eventLog_ca116_filtered = eventLog_ca116_filtered.drop( eventLog_ca116_filtered.loc[eventLog_ca116_filtered['concept:name'].isin( ['click-0', 'click-1', 'click-2'])].index) eventLog_ca116_filtered.loc[ eventLog_ca116_filtered['description'].str.contains('.html|.web'), 'pageType'] = 'Read_Lecture_Note' eventLog_ca116_filtered.loc[ eventLog_ca116_filtered['description'].str.contains('correct|incorrect'), 'pageType'] = 'Exercise' eventLog_ca116_filtered.loc[eventLog_ca116_filtered['description'].str.
def predictionRandomForestProbability(dfActivityMatrixWithResult,
                                      excellentList,
                                      weakList,
                                      practice,
                                      lectureList=None,
                                      pvalue=0.05):
    """Train a random forest separating "excellent" from "weak" users.

    The event log is restricted to rows whose ``description`` matches any
    entry of ``lectureList``, turned into a per-user activity matrix by
    ``FCAMiner.activityDataMatrixContruct``, labelled (1 = user is in
    ``excellentList``, 0 = user is in ``weakList``) and used to fit a
    ``RandomForestClassifier`` on an 80/20 train/test split.

    Args:
        dfActivityMatrixWithResult: Event log DataFrame with at least a
            string ``description`` column.
        excellentList: Index values (presumably user ids - confirm against
            the caller) labelled ``result_exam_1 = 1``.
        weakList: Index values labelled ``result_exam_1 = 0``.
        practice: Currently unused; kept for interface compatibility.
        lectureList: Patterns joined with ``'|'`` and used as a regular
            expression on ``description``. When ``None`` or empty it is
            obtained from ``dataProcessing.getLectureList``.
        pvalue: Currently unused; the p-value based column filter it was
            meant for is disabled, so every feature column is kept.

    Returns:
        Either the string ``'No columns correlated with result_exam_1'``
        when no feature column is available, or the list
        ``[metricsResult, feature_imp, clf, labelledMatrix, testingData]``
        where ``metricsResult`` is a dict of test-set metrics,
        ``feature_imp`` is a Series of feature importances sorted in
        descending order, ``clf`` is the fitted classifier,
        ``labelledMatrix`` is the labelled activity matrix and
        ``testingData`` holds the true labels, predicted labels and class
        probabilities for the test rows.
    """
    if lectureList is None or len(lectureList) == 0:
        lectureList = dataProcessing.getLectureList(dfActivityMatrixWithResult)

    # Keep only the events whose description matches one of the lectures.
    lecturePattern = '|'.join(lectureList)
    logPage = dfActivityMatrixWithResult.loc[
        dfActivityMatrixWithResult['description'].str.contains(
            lecturePattern)]

    activityMatrix = FCAMiner.activityDataMatrixContruct(logPage).fillna(0)

    # Explicit copies so that adding the label column never writes into (or
    # warns about writing into) a slice of ``activityMatrix``.
    excellentRows = activityMatrix.loc[activityMatrix.index.isin(
        excellentList)].copy()
    weakRows = activityMatrix.loc[activityMatrix.index.isin(weakList)].copy()
    excellentRows['result_exam_1'] = 1
    weakRows['result_exam_1'] = 0
    LogPageactivityCountByUser = pd.concat([excellentRows, weakRows])

    # Every column except the label and 'click-3' is used as a feature.
    colSelection = [
        col for col in LogPageactivityCountByUser.columns
        if col not in ('result_exam_1', 'click-3')
    ]
    if len(colSelection) == 0:
        return 'No columns correlated with result_exam_1'

    X = LogPageactivityCountByUser[colSelection]
    y = LogPageactivityCountByUser['result_exam_1']

    # 80% of the rows are used for training, 20% are held out for testing.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=1)

    # NOTE(review): no random_state is set on the forest, so results vary
    # between runs even though the split itself is fixed.
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X_train, y_train)

    y_pred_proba = clf.predict_proba(X_test)
    # make_decision is defined elsewhere; presumably it thresholds the
    # probabilities at 0.5 - confirm.
    y_pred = make_decision(y_pred_proba, 0.5)

    # NOTE(review): the first three metrics are stored as one-element tuples
    # while recall and roc_auc are plain numbers. This mirrors the original
    # return shape (trailing commas after the assignments), which callers may
    # already index with [0]; unwrap them only after checking the callers.
    accuracy = (metrics.accuracy_score(y_test, y_pred), )
    f1 = (metrics.f1_score(y_test, y_pred), )
    precision = (metrics.precision_score(y_test, y_pred), )
    recall = metrics.recall_score(y_test, y_pred)
    # ROC AUC is a ranking metric: score it on the positive-class probability
    # rather than on the already thresholded labels, which would reduce the
    # curve to a single operating point.
    roc_auc = metrics.roc_auc_score(y_test, y_pred_proba[:, 1])

    metricsResult = {
        'accuracy_score': accuracy,
        'f1_score': f1,
        'precision_score': precision,
        'recall_score': recall,
        'roc_auc': roc_auc
    }

    feature_imp = pd.Series(clf.feature_importances_,
                            index=X.columns).sort_values(ascending=False)

    testingData = pd.DataFrame(
        columns=['y_test', 'y_pred', 'y_pred_proba_0', 'y_pred_proba_1'])
    testingData['y_test'] = y_test
    testingData['y_pred'] = y_pred
    testingData['y_pred_proba_0'] = y_pred_proba[:, 0]
    testingData['y_pred_proba_1'] = y_pred_proba[:, 1]

    return [
        metricsResult, feature_imp, clf, LogPageactivityCountByUser,
        testingData
    ]