def test_essays(): essays_paths = glob.glob("data/csv/*.csv") for essaypath in sorted(essays_paths): essayname = essaypath.split("/")[-1].split(".")[0] essays = EssayCollection(essaypath,essayname) essay_types = essays.meta_data()["essay_type"] print essayname, "TR", (essay_types=="TRAINING").sum(), "VA", (essay_types=="VALIDATION").sum()
def test_essays(): essays_paths = glob.glob("data/csv/*.csv") for essaypath in sorted(essays_paths): essayname = essaypath.split("/")[-1].split(".")[0] essays = EssayCollection(essaypath, essayname) essay_types = essays.meta_data()["essay_type"] print essayname, "TR", (essay_types == "TRAINING").sum(), "VA", ( essay_types == "VALIDATION").sum()
<Student_Test_List> <Student_Test_Details Student_Test_ID="%(test_id)s" Grade="%(final_score)s" Total_CR_Item_Count="1"> <Item_DataPoint_List> <Item_DataPoint_Details Item_ID="%(item_id)s" Data_Point="" Item_No="1" Final_Score="%(final_score)d"> <Read_Details Read_Number="1" Score_Value="%(final_score)s" Reader_ID="1" Date_Time="20141026134100" /> </Item_DataPoint_Details> </Item_DataPoint_List> </Student_Test_Details> </Student_Test_List> </Student_Details> """ for ensemblepath in glob.glob("ensemble/*.csv"): preds = pd.read_csv(ensemblepath) item_id = os.path.split(ensemblepath)[-1][:5] essays = EssayCollection("data/csv/" + item_id + "_1.csv") realscores = essays.meta_data()["score3"] scores = [col for col in preds.columns if col.find("prob") > 0] # optimize probability optpar = np.array([0.0, 0.0]) for grade, col in enumerate(scores): trainpred = np.array(preds.ix[preds.essay_type == "TRAINING", col]) trainreal = np.array( (realscores[preds.essay_type == "TRAINING"] == grade).map(int)) def fnopt(par): loss = log_loss(trainreal, logitinv(par[0] * trainpred + par[1])) return loss opt = fmin(fnopt, np.array([0.0, 0.0]))
# NOTE(review): fragment -- the XML above is the tail of a triple-quoted
# template string opened before this excerpt, with %-style placeholders
# (test_id, item_id, final_score); do not edit it, it is runtime data.
# The loop fits, for each per-grade probability column of each ensemble
# CSV, a 2-parameter recalibration (scale par[0], offset par[1]) of the
# TRAINING-row predictions by minimizing log-loss with scipy's fmin.
# NOTE(review): fnopt closes over trainpred/trainreal, which are rebound
# each iteration; the loop continues past this excerpt (optpar is updated
# below) -- confirm against the full file.
<Student_Test_Details Student_Test_ID="%(test_id)s" Grade="%(final_score)s" Total_CR_Item_Count="1"> <Item_DataPoint_List> <Item_DataPoint_Details Item_ID="%(item_id)s" Data_Point="" Item_No="1" Final_Score="%(final_score)d"> <Read_Details Read_Number="1" Score_Value="%(final_score)s" Reader_ID="1" Date_Time="20141026134100" /> </Item_DataPoint_Details> </Item_DataPoint_List> </Student_Test_Details> </Student_Test_List> </Student_Details> """ for ensemblepath in glob.glob("ensemble/*.csv"): preds = pd.read_csv(ensemblepath) item_id = os.path.split(ensemblepath)[-1][:5] essays = EssayCollection("data/csv/" + item_id + "_1.csv") realscores = essays.meta_data()["score3"] scores = [col for col in preds.columns if col.find("prob") > 0] # optimize probability optpar = np.array([0.0,0.0]) for grade,col in enumerate(scores): trainpred = np.array(preds.ix[preds.essay_type=="TRAINING",col]) trainreal = np.array((realscores[preds.essay_type=="TRAINING"]==grade).map(int)) def fnopt(par): loss = log_loss(trainreal, logitinv(par[0]*trainpred+par[1])) return loss opt = fmin(fnopt, np.array([0.0,0.0])) optpar += opt optpar /= len(scores)
# NOTE(review): fragment -- the XML above is the tail of a triple-quoted
# template (opened before this excerpt); it is runtime data, not markup to
# reformat.  For every ensemble CSV, each grade's probability column is
# recalibrated by fitting (scale, offset) with scipy fmin to minimize
# log-loss on the TRAINING rows; the per-grade optima are summed into
# optpar and averaged over the number of grade columns.
# NOTE(review): averaging a single (scale, offset) pair across all grades
# assumes all grade columns share one calibration -- confirm this is
# intentional in the full file.
# Model-evaluation script (fragment): for every essay CSV under data/csv/
# it loads each per-model prediction file from models/, computes the
# quadratic weighted kappa of the model's scorer-3 predictions against the
# gold "score3" labels on the TRAINING rows, and appends one
# "essay,model,score" row per model to results.csv.
# NOTE(review): Python 2 print statements and the pandas .ix /
# sklearn.cross_validation APIs date this code.  `glob` is used but not
# imported in the visible lines, and `results` is never closed here --
# presumably the script continues below; confirm against the full file.
import os import numpy as np import pandas as pd from essay.essay import EssayCollection from lib import kappa from lib.utils import logit from scipy.optimize import fmin, fmin_bfgs, fmin_cg, fmin_ncg from sklearn import cross_validation from sklearn.linear_model import ElasticNet results = open("results.csv", "w") results.write("essay,model,score\n") for essaypath in sorted(glob.glob("data/csv/*.csv")): print essaypath, essays = EssayCollection(essaypath) essayname = os.path.split(essaypath)[-1][:-4] predictions_all_list = [] trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] for modelpath in glob.glob("models/*" + essayname + "*"): modelname = os.path.split(modelpath)[-1] predictions = pd.read_csv(modelpath) predictions["modelname"] = modelname results.write("%s,%s,%.6f\n" % (essayname, modelname[8:], kappa.quadratic_weighted_kappa( essays.meta_data()["score3"][trainset], predictions["pred_scorer_3"][trainset]))) predictions_all_list.append(predictions.copy())
# Model-evaluation script (fragment): for every essay CSV under data/csv/
# it scores each per-model prediction file from models/ with quadratic
# weighted kappa against the gold "score3" on the TRAINING rows, writes one
# "essay,model,score" row per model to results.csv, then adds a "HUMAN"
# baseline row (agreement between score1 and score2) and concatenates all
# model predictions for further processing.
# NOTE(review): `glob` is used but not imported in the visible lines, and
# `results` is never closed here -- the script appears to continue past
# this excerpt (the trailing `scores = [...]` is unused so far); confirm
# against the full file.
import os import numpy as np import pandas as pd from essay.essay import EssayCollection from lib import kappa from lib.utils import logit from scipy.optimize import fmin, fmin_bfgs, fmin_cg, fmin_ncg from sklearn import cross_validation from sklearn.linear_model import ElasticNet results = open("results.csv","w") results.write("essay,model,score\n") for essaypath in sorted(glob.glob("data/csv/*.csv")): print essaypath, essays = EssayCollection(essaypath) essayname = os.path.split(essaypath)[-1][:-4] predictions_all_list = [] trainset = np.where(essays.meta_data().essay_type=="TRAINING")[0] for modelpath in glob.glob("models/*" + essayname + "*"): modelname = os.path.split(modelpath)[-1] predictions = pd.read_csv(modelpath) predictions["modelname"] = modelname results.write("%s,%s,%.6f\n" % (essayname, modelname[8:], kappa.quadratic_weighted_kappa(essays.meta_data()["score3"][trainset],predictions["pred_scorer_3"][trainset]))) predictions_all_list.append(predictions.copy()) results.write("%s,%s,%.6f\n" % (essayname, "HUMAN", kappa.quadratic_weighted_kappa(essays.meta_data()["score1"][trainset],essays.meta_data()["score2"][trainset]))) predictions_all = pd.concat(predictions_all_list) scores = [col for col in predictions_all.columns if col.find("grade") > 0]
def model_generic_1ofK_clas(pipeline,model_name,model_f,essays_paths,parallel = False): print model_name for essaypath in sorted(essays_paths): print essaypath, essayname = essaypath.split("/")[-1].split(".")[0] save_as = MODEL_PATHS + essayname + "_" + pipeline["name"] + "_" + model_name if os.path.exists(save_as): print "Skipping" continue essays = EssayCollection(essaypath,essayname) essays.apply_datasteps(pipeline["steps"]) essays.create_feature_matrix(min_sparsity=5) predictions = pd.DataFrame({'id':range(essays.meta_data().shape[0])}) trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] testset = np.where(essays.meta_data().essay_type == "")[0] X_all = np.array(essays.feature_matrix.todense(),dtype=np.float32) y_all = np.array(essays.meta_data()["score3"].map(int),dtype=np.int32) non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all)) print "orig dimensions", X_all.shape[1], X_all = X_all[:,non_duplicated] print "reduced dimensions", X_all.shape[1], predictions = pd.DataFrame({'id':range(essays.meta_data().shape[0])}) trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] testset = np.where(essays.meta_data().essay_type == "VALIDATION")[0] for scorer in [3]: kf = cross_validation.KFold(len(trainset), n_folds=7, random_state=0) kf = [(trainset[tr], trainset[te]) for tr, te in kf] + [(trainset,testset)] for grade in sorted(essays.meta_data()["score%d" % (scorer)].unique()): y_all = np.array(essays.meta_data()["score%d" % (scorer)].map(lambda y: 1 if int(y)==int(grade) else 0)) pred_name = "scorer_%d_grade_%d" % (scorer,grade) predictions[pred_name] = 0 if parallel: pool = Pool(processes=4) essay_sets = pool.map(cv, [[kf, X_all, y_all, model_f, n] for n in range(8)]) pool.close() for n, essay_set in enumerate(essay_sets): te_ind = kf[n][1] predictions.ix[te_ind,pred_name] = essay_set[:,1] else: for n,(tr_ind,te_ind) in enumerate(kf): predictions.ix[te_ind,pred_name] = model_f(X_all[tr_ind,:], y_all[tr_ind], X_all[te_ind,:], 
feature_names=essays.feature_names) predictions = predictions.ix[:,sorted(predictions.columns)] predictions["pred_scorer_3"] = np.array(predictions.ix[:,[c for c in predictions.columns if c.startswith("scorer_3")]]).argmax(axis=1) predictions.to_csv(save_as,index=False) print kappa.quadratic_weighted_kappa(essays.meta_data()["score3"][trainset],predictions["pred_scorer_3"][trainset])
def model_generic_DBN(pipeline,model_name,model_f,essays_paths,parallel = False): print model_name for essaypath in sorted(essays_paths): print essaypath, essayname = essaypath.split("/")[-1].split(".")[0] save_as = MODEL_PATHS + essayname + "_" + pipeline["name"] + "_" + model_name if os.path.exists(save_as): print "Skipping" continue essays = EssayCollection(essaypath,essayname) essays.apply_datasteps(pipeline["steps"]) essays.create_feature_matrix(min_sparsity=5) predictions = pd.DataFrame({'id':range(essays.meta_data().shape[0])}) trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] testset = np.where(essays.meta_data().essay_type == "")[0] X_all = np.array(essays.feature_matrix.todense(),dtype=np.float32) y_all = np.array(essays.meta_data()["score3"].map(int),dtype=np.int32) non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all)) print "orig dimensions", X_all.shape[1], X_all = X_all[:,non_duplicated] print "reduced dimensions", X_all.shape[1], predictions = pd.DataFrame({'id':range(essays.meta_data().shape[0])}) trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] testset = np.where(essays.meta_data().essay_type == "VALIDATION")[0] scorer = 3 kf = cross_validation.KFold(len(trainset), n_folds=7, random_state=0) kf = [(trainset[tr], trainset[te]) for tr, te in kf] + [(trainset,testset)] scores = sorted(essays.meta_data()["score%d" % (scorer)].unique()) predictions = np.zeros((essays.meta_data().shape[0],len(scores))) model_f.layer_sizes[0] = X_all.shape[1] model_f.layer_sizes[2] = len(scores) try: for n,(tr_ind,te_ind) in enumerate(kf): print n scaler = StandardScaler() _ = scaler.fit(X_all[tr_ind,:]) X_tr = scaler.transform(X_all[tr_ind,:]) / 50.0 X_te = scaler.transform(X_all[te_ind,:]) / 50.0 model_f.fit(X_tr, y_all[tr_ind]) print kappa.quadratic_weighted_kappa(essays.meta_data()["score3"][tr_ind],model_f.predict(X_tr)) print kappa.quadratic_weighted_kappa(essays.meta_data()["score3"][te_ind],model_f.predict(X_te)) 
predictions[te_ind,:] = model_f.predict_proba(X_te) except: pass predictions = pd.DataFrame(predictions) predictions.columns = ["scorer_%d_grade_%d" % (scorer,grade) for grade in scores] predictions["pred_scorer_3"] = np.array(predictions).argmax(axis=1) predictions.to_csv(save_as,index=False) print kappa.quadratic_weighted_kappa(essays.meta_data()["score3"][trainset],predictions["pred_scorer_3"][trainset])
def model_generic_1ofK_clas(pipeline, model_name, model_f, essays_paths, parallel=False): print model_name for essaypath in sorted(essays_paths): print essaypath, essayname = essaypath.split("/")[-1].split(".")[0] save_as = MODEL_PATHS + essayname + "_" + pipeline[ "name"] + "_" + model_name if os.path.exists(save_as): print "Skipping" continue essays = EssayCollection(essaypath, essayname) essays.apply_datasteps(pipeline["steps"]) essays.create_feature_matrix(min_sparsity=5) predictions = pd.DataFrame({'id': range(essays.meta_data().shape[0])}) trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] testset = np.where(essays.meta_data().essay_type == "")[0] X_all = np.array(essays.feature_matrix.todense(), dtype=np.float32) y_all = np.array(essays.meta_data()["score3"].map(int), dtype=np.int32) non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all)) print "orig dimensions", X_all.shape[1], X_all = X_all[:, non_duplicated] print "reduced dimensions", X_all.shape[1], predictions = pd.DataFrame({'id': range(essays.meta_data().shape[0])}) trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] testset = np.where(essays.meta_data().essay_type == "VALIDATION")[0] for scorer in [3]: kf = cross_validation.KFold(len(trainset), n_folds=7, random_state=0) kf = [(trainset[tr], trainset[te]) for tr, te in kf] + [(trainset, testset)] for grade in sorted(essays.meta_data()["score%d" % (scorer)].unique()): y_all = np.array(essays.meta_data()["score%d" % (scorer)].map( lambda y: 1 if int(y) == int(grade) else 0)) pred_name = "scorer_%d_grade_%d" % (scorer, grade) predictions[pred_name] = 0 if parallel: pool = Pool(processes=4) essay_sets = pool.map(cv, [[kf, X_all, y_all, model_f, n] for n in range(8)]) pool.close() for n, essay_set in enumerate(essay_sets): te_ind = kf[n][1] predictions.ix[te_ind, pred_name] = essay_set[:, 1] else: for n, (tr_ind, te_ind) in enumerate(kf): predictions.ix[te_ind, pred_name] = model_f( X_all[tr_ind, :], y_all[tr_ind], 
X_all[te_ind, :], feature_names=essays.feature_names) predictions = predictions.ix[:, sorted(predictions.columns)] predictions["pred_scorer_3"] = np.array( predictions. ix[:, [c for c in predictions.columns if c.startswith("scorer_3")]]).argmax(axis=1) predictions.to_csv(save_as, index=False) print kappa.quadratic_weighted_kappa( essays.meta_data()["score3"][trainset], predictions["pred_scorer_3"][trainset])
def model_generic_DBN(pipeline, model_name, model_f, essays_paths, parallel=False): print model_name for essaypath in sorted(essays_paths): print essaypath, essayname = essaypath.split("/")[-1].split(".")[0] save_as = MODEL_PATHS + essayname + "_" + pipeline[ "name"] + "_" + model_name if os.path.exists(save_as): print "Skipping" continue essays = EssayCollection(essaypath, essayname) essays.apply_datasteps(pipeline["steps"]) essays.create_feature_matrix(min_sparsity=5) predictions = pd.DataFrame({'id': range(essays.meta_data().shape[0])}) trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] testset = np.where(essays.meta_data().essay_type == "")[0] X_all = np.array(essays.feature_matrix.todense(), dtype=np.float32) y_all = np.array(essays.meta_data()["score3"].map(int), dtype=np.int32) non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all)) print "orig dimensions", X_all.shape[1], X_all = X_all[:, non_duplicated] print "reduced dimensions", X_all.shape[1], predictions = pd.DataFrame({'id': range(essays.meta_data().shape[0])}) trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] testset = np.where(essays.meta_data().essay_type == "VALIDATION")[0] scorer = 3 kf = cross_validation.KFold(len(trainset), n_folds=7, random_state=0) kf = [(trainset[tr], trainset[te]) for tr, te in kf] + [(trainset, testset)] scores = sorted(essays.meta_data()["score%d" % (scorer)].unique()) predictions = np.zeros((essays.meta_data().shape[0], len(scores))) model_f.layer_sizes[0] = X_all.shape[1] model_f.layer_sizes[2] = len(scores) try: for n, (tr_ind, te_ind) in enumerate(kf): print n scaler = StandardScaler() _ = scaler.fit(X_all[tr_ind, :]) X_tr = scaler.transform(X_all[tr_ind, :]) / 50.0 X_te = scaler.transform(X_all[te_ind, :]) / 50.0 model_f.fit(X_tr, y_all[tr_ind]) print kappa.quadratic_weighted_kappa( essays.meta_data()["score3"][tr_ind], model_f.predict(X_tr)) print kappa.quadratic_weighted_kappa( essays.meta_data()["score3"][te_ind], 
model_f.predict(X_te)) predictions[te_ind, :] = model_f.predict_proba(X_te) except: pass predictions = pd.DataFrame(predictions) predictions.columns = [ "scorer_%d_grade_%d" % (scorer, grade) for grade in scores ] predictions["pred_scorer_3"] = np.array(predictions).argmax(axis=1) predictions.to_csv(save_as, index=False) print kappa.quadratic_weighted_kappa( essays.meta_data()["score3"][trainset], predictions["pred_scorer_3"][trainset])