def model_generic_1ofK_clas(pipeline, model_name, model_f, essays_paths, parallel=False):
    """Fit one 0/1 model per grade and save the per-grade predictions as CSV.

    For each path in ``essays_paths`` (visited in sorted order):

    * the output file ``MODEL_PATHS + <essayname>_<pipeline name>_<model_name>``
      is computed; if it already exists the essay set is skipped;
    * the essays are loaded, ``pipeline["steps"]`` are applied, and a dense
      float32 feature matrix is built with the columns selected by
      ``get_nonduplicate_columns``;
    * for every distinct ``score3`` value a 0/1 target is built and the model
      is run on 7 K-fold splits of the TRAINING rows plus one extra split that
      trains on all TRAINING rows and predicts the VALIDATION rows;
    * ``pred_scorer_3`` is set to the argmax over the per-grade columns, the
      frame is written with ``to_csv`` and the quadratic weighted kappa on the
      TRAINING rows is printed.

    Args:
        pipeline: mapping with at least the keys ``"name"`` and ``"steps"``.
        model_name: label used in the output file name and printed first.
        model_f: callable invoked as
            ``model_f(X_train, y_train, X_test, feature_names=...)``; its
            return value is stored as the predictions for the test rows.
        essays_paths: iterable of essay file paths.
        parallel: when true, the splits are dispatched to a 4-process
            ``Pool`` through the project helper ``cv`` and column 1 of each
            result is stored.

    Returns:
        None. Results are written to disk and progress is printed.

    NOTE(review): the visible source defines a function with this name a
    second time further down; in Python the later definition is the one that
    stays bound.
    """
    import sys  # local import: only needed for same-line progress output

    print(model_name)
    for essaypath in sorted(essays_paths):
        # Progress for one essay set stays on a single line until the final
        # kappa (or "Skipping") ends it.
        sys.stdout.write("%s " % essaypath)
        essayname = essaypath.split("/")[-1].split(".")[0]
        save_as = MODEL_PATHS + essayname + "_" + pipeline["name"] + "_" + model_name
        if os.path.exists(save_as):
            print("Skipping")
            continue

        essays = EssayCollection(essaypath, essayname)
        essays.apply_datasteps(pipeline["steps"])
        essays.create_feature_matrix(min_sparsity=5)
        # Nothing below modifies `essays`, so the metadata is fetched once.
        meta = essays.meta_data()

        X_all = np.array(essays.feature_matrix.todense(), dtype=np.float32)
        non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all))
        sys.stdout.write("orig dimensions %s " % X_all.shape[1])
        X_all = X_all[:, non_duplicated]
        sys.stdout.write("reduced dimensions %s " % X_all.shape[1])

        predictions = pd.DataFrame({'id': range(meta.shape[0])})
        trainset = np.where(meta.essay_type == "TRAINING")[0]
        testset = np.where(meta.essay_type == "VALIDATION")[0]

        scorer = 3
        score_col = "score%d" % scorer
        # 7 CV splits over the training rows, then a final split that trains
        # on every training row and predicts the validation rows.
        kf = cross_validation.KFold(len(trainset), n_folds=7, random_state=0)
        kf = [(trainset[tr], trainset[te]) for tr, te in kf] + [(trainset, testset)]

        for grade in sorted(meta[score_col].unique()):
            grade_value = int(grade)
            y_all = np.array(meta[score_col].map(
                lambda y, g=grade_value: 1 if int(y) == g else 0))
            pred_name = "scorer_%d_grade_%d" % (scorer, grade)
            predictions[pred_name] = 0

            if parallel:
                pool = Pool(processes=4)
                try:
                    # One task per split; the count follows kf instead of a
                    # hard-coded 8.
                    essay_sets = pool.map(
                        cv,
                        [[kf, X_all, y_all, model_f, n] for n in range(len(kf))])
                finally:
                    # Always release the workers, even when map() raises.
                    pool.terminate()
                    pool.join()
                for n, essay_set in enumerate(essay_sets):
                    te_ind = kf[n][1]
                    predictions.ix[te_ind, pred_name] = essay_set[:, 1]
            else:
                for tr_ind, te_ind in kf:
                    predictions.ix[te_ind, pred_name] = model_f(
                        X_all[tr_ind, :], y_all[tr_ind], X_all[te_ind, :],
                        feature_names=essays.feature_names)

        # NOTE(review): .ix was removed in pandas 1.0; switch to .loc when the
        # pandas version is upgraded.
        predictions = predictions.ix[:, sorted(predictions.columns)]
        grade_cols = [c for c in predictions.columns if c.startswith("scorer_3")]
        # NOTE(review): argmax yields the column position within the
        # name-sorted grade columns, not the grade label; the two agree only
        # if grades are 0..k-1 with k <= 10 -- confirm against the data.
        predictions["pred_scorer_3"] = np.array(
            predictions.ix[:, grade_cols]).argmax(axis=1)
        predictions.to_csv(save_as, index=False)
        print(kappa.quadratic_weighted_kappa(
            meta["score3"][trainset], predictions["pred_scorer_3"][trainset]))
def model_generic_DBN(pipeline, model_name, model_f, essays_paths, parallel=False):
    """Fit a multi-class network per essay set and save class probabilities.

    For each path in ``essays_paths`` (visited in sorted order):

    * the output file ``MODEL_PATHS + <essayname>_<pipeline name>_<model_name>``
      is computed; if it already exists the essay set is skipped;
    * the essays are loaded, ``pipeline["steps"]`` are applied, and a dense
      float32 feature matrix is built with the columns selected by
      ``get_nonduplicate_columns``;
    * ``model_f.layer_sizes[0]`` and ``[2]`` are overwritten in place with the
      feature count and the number of distinct ``score3`` values;
    * on 7 K-fold splits of the TRAINING rows plus one split that trains on
      all TRAINING rows and predicts the VALIDATION rows, features are
      standardised on the training part, divided by 50, the model is fitted,
      train/test kappa are printed and ``predict_proba`` output is stored;
    * ``pred_scorer_3`` is set to the argmax over the probability columns, the
      frame is written with ``to_csv`` and the kappa on the TRAINING rows is
      printed.

    Args:
        pipeline: mapping with at least the keys ``"name"`` and ``"steps"``.
        model_name: label used in the output file name and printed first.
        model_f: estimator object exposing ``layer_sizes``, ``fit``,
            ``predict`` and ``predict_proba``. It is mutated and re-fitted.
        essays_paths: iterable of essay file paths.
        parallel: accepted for signature parity; not used by this function.

    Returns:
        None. Results are written to disk and progress is printed.

    NOTE(review): the visible source defines a function with this name a
    second time further down; in Python the later definition is the one that
    stays bound.
    """
    import sys  # local imports: progress output and failure reporting
    import traceback

    print(model_name)
    for essaypath in sorted(essays_paths):
        # Progress for one essay set stays on a single line until the first
        # newline-terminated print ends it.
        sys.stdout.write("%s " % essaypath)
        essayname = essaypath.split("/")[-1].split(".")[0]
        save_as = MODEL_PATHS + essayname + "_" + pipeline["name"] + "_" + model_name
        if os.path.exists(save_as):
            print("Skipping")
            continue

        essays = EssayCollection(essaypath, essayname)
        essays.apply_datasteps(pipeline["steps"])
        essays.create_feature_matrix(min_sparsity=5)
        # Nothing below modifies `essays`, so the metadata is fetched once.
        meta = essays.meta_data()

        X_all = np.array(essays.feature_matrix.todense(), dtype=np.float32)
        y_all = np.array(meta["score3"].map(int), dtype=np.int32)
        non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all))
        sys.stdout.write("orig dimensions %s " % X_all.shape[1])
        X_all = X_all[:, non_duplicated]
        sys.stdout.write("reduced dimensions %s " % X_all.shape[1])

        trainset = np.where(meta.essay_type == "TRAINING")[0]
        testset = np.where(meta.essay_type == "VALIDATION")[0]

        scorer = 3
        # 7 CV splits over the training rows, then a final split that trains
        # on every training row and predicts the validation rows.
        kf = cross_validation.KFold(len(trainset), n_folds=7, random_state=0)
        kf = [(trainset[tr], trainset[te]) for tr, te in kf] + [(trainset, testset)]
        scores = sorted(meta["score%d" % scorer].unique())
        predictions = np.zeros((meta.shape[0], len(scores)))

        # NOTE(review): index 2 is presumably the output layer of a
        # three-layer network -- confirm against how model_f is constructed.
        model_f.layer_sizes[0] = X_all.shape[1]
        model_f.layer_sizes[2] = len(scores)

        try:
            for n, (tr_ind, te_ind) in enumerate(kf):
                print(n)
                scaler = StandardScaler()
                scaler.fit(X_all[tr_ind, :])
                X_tr = scaler.transform(X_all[tr_ind, :]) / 50.0
                X_te = scaler.transform(X_all[te_ind, :]) / 50.0
                model_f.fit(X_tr, y_all[tr_ind])
                print(kappa.quadratic_weighted_kappa(
                    meta["score3"][tr_ind], model_f.predict(X_tr)))
                print(kappa.quadratic_weighted_kappa(
                    meta["score3"][te_ind], model_f.predict(X_te)))
                predictions[te_ind, :] = model_f.predict_proba(X_te)
        except Exception:
            # Best-effort is kept (partial predictions are still written
            # below), but the failure is no longer silent and
            # KeyboardInterrupt/SystemExit now propagate.
            # NOTE(review): the partial file makes later runs skip this essay
            # set; delete it to retry.
            sys.stderr.write(
                "model_generic_DBN: fold loop failed for %s; "
                "writing partial predictions\n" % essaypath)
            traceback.print_exc()

        predictions = pd.DataFrame(predictions)
        predictions.columns = ["scorer_%d_grade_%d" % (scorer, grade) for grade in scores]
        # NOTE(review): argmax yields the column position, not the grade
        # label; the two agree only if the grades are exactly 0..k-1.
        predictions["pred_scorer_3"] = np.array(predictions).argmax(axis=1)
        predictions.to_csv(save_as, index=False)
        print(kappa.quadratic_weighted_kappa(
            meta["score3"][trainset], predictions["pred_scorer_3"][trainset]))
def model_generic_1ofK_clas(pipeline, model_name, model_f, essays_paths, parallel=False):
    """Train a one-vs-rest model for every grade and write the predictions.

    Essay sets are processed in sorted path order. An essay set whose output
    file (``MODEL_PATHS + <essayname>_<pipeline name>_<model_name>``) already
    exists is skipped. Otherwise the essays are loaded, ``pipeline["steps"]``
    are applied, a dense float32 feature matrix is built and reduced to the
    columns returned by ``get_nonduplicate_columns``.

    For each distinct ``score3`` value a binary target is created and the
    model is evaluated on 7 K-fold splits of the TRAINING rows followed by
    one split that trains on all TRAINING rows and predicts the VALIDATION
    rows. The column ``pred_scorer_3`` holds the argmax over the per-grade
    columns. The frame is saved with ``to_csv`` and the quadratic weighted
    kappa over the TRAINING rows is printed.

    Args:
        pipeline: mapping providing ``"name"`` and ``"steps"``.
        model_name: label printed first and used in the output file name.
        model_f: callable used as
            ``model_f(X_train, y_train, X_test, feature_names=...)``; what it
            returns is stored as the test-row predictions.
        essays_paths: iterable of essay file paths.
        parallel: if true, splits run through the project helper ``cv`` on a
            4-process ``Pool`` and column 1 of each result is stored.

    Returns:
        None. Output goes to disk and to stdout.

    NOTE(review): a function with this name is also defined earlier in the
    visible source; this later definition is the one that stays bound.
    """
    import sys  # local import: only needed for same-line progress output

    print(model_name)
    for essaypath in sorted(essays_paths):
        # Keep the per-essay progress on one line; the final kappa (or
        # "Skipping") terminates it.
        sys.stdout.write("%s " % essaypath)
        essayname = essaypath.split("/")[-1].split(".")[0]
        save_as = MODEL_PATHS + essayname + "_" + pipeline["name"] + "_" + model_name
        if os.path.exists(save_as):
            print("Skipping")
            continue

        essays = EssayCollection(essaypath, essayname)
        essays.apply_datasteps(pipeline["steps"])
        essays.create_feature_matrix(min_sparsity=5)
        # `essays` is not modified below, so one metadata lookup is enough.
        meta = essays.meta_data()

        X_all = np.array(essays.feature_matrix.todense(), dtype=np.float32)
        non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all))
        sys.stdout.write("orig dimensions %s " % X_all.shape[1])
        X_all = X_all[:, non_duplicated]
        sys.stdout.write("reduced dimensions %s " % X_all.shape[1])

        predictions = pd.DataFrame({'id': range(meta.shape[0])})
        trainset = np.where(meta.essay_type == "TRAINING")[0]
        testset = np.where(meta.essay_type == "VALIDATION")[0]

        scorer = 3
        score_col = "score%d" % scorer
        # Seven CV splits on the training rows plus a train-on-all /
        # predict-validation split appended at the end.
        kf = cross_validation.KFold(len(trainset), n_folds=7, random_state=0)
        kf = [(trainset[tr], trainset[te]) for tr, te in kf] + [(trainset, testset)]

        for grade in sorted(meta[score_col].unique()):
            grade_value = int(grade)
            y_all = np.array(meta[score_col].map(
                lambda y, g=grade_value: 1 if int(y) == g else 0))
            pred_name = "scorer_%d_grade_%d" % (scorer, grade)
            predictions[pred_name] = 0

            if parallel:
                pool = Pool(processes=4)
                try:
                    # Task count is derived from kf rather than the literal 8.
                    essay_sets = pool.map(
                        cv,
                        [[kf, X_all, y_all, model_f, n] for n in range(len(kf))])
                finally:
                    # Workers are released whether or not map() succeeded.
                    pool.terminate()
                    pool.join()
                for n, essay_set in enumerate(essay_sets):
                    te_ind = kf[n][1]
                    predictions.ix[te_ind, pred_name] = essay_set[:, 1]
            else:
                for tr_ind, te_ind in kf:
                    predictions.ix[te_ind, pred_name] = model_f(
                        X_all[tr_ind, :], y_all[tr_ind], X_all[te_ind, :],
                        feature_names=essays.feature_names)

        # NOTE(review): .ix no longer exists in pandas >= 1.0; use .loc after
        # a pandas upgrade.
        predictions = predictions.ix[:, sorted(predictions.columns)]
        grade_cols = [c for c in predictions.columns if c.startswith("scorer_3")]
        # NOTE(review): the stored value is the argmax position among the
        # name-sorted grade columns, which equals the grade only when grades
        # are 0..k-1 with k <= 10 -- confirm against the data.
        predictions["pred_scorer_3"] = np.array(
            predictions.ix[:, grade_cols]).argmax(axis=1)
        predictions.to_csv(save_as, index=False)
        print(kappa.quadratic_weighted_kappa(
            meta["score3"][trainset], predictions["pred_scorer_3"][trainset]))
def model_generic_DBN(pipeline, model_name, model_f, essays_paths, parallel=False):
    """Train a multi-class network on each essay set and write probabilities.

    Essay sets are processed in sorted path order. An essay set whose output
    file (``MODEL_PATHS + <essayname>_<pipeline name>_<model_name>``) already
    exists is skipped. Otherwise the essays are loaded, ``pipeline["steps"]``
    are applied, a dense float32 feature matrix is built and reduced to the
    columns returned by ``get_nonduplicate_columns``.

    ``model_f.layer_sizes[0]`` and ``model_f.layer_sizes[2]`` are overwritten
    in place with the feature count and the number of distinct ``score3``
    values. The model is then fitted on 7 K-fold splits of the TRAINING rows
    and on one split that trains on all TRAINING rows and predicts the
    VALIDATION rows. In each split the features are standardised on the
    training part and divided by 50, train/test kappa are printed, and the
    ``predict_proba`` output is stored for the test rows. ``pred_scorer_3``
    holds the argmax over the probability columns. The frame is saved with
    ``to_csv`` and the kappa over the TRAINING rows is printed.

    Args:
        pipeline: mapping providing ``"name"`` and ``"steps"``.
        model_name: label printed first and used in the output file name.
        model_f: estimator object with ``layer_sizes``, ``fit``, ``predict``
            and ``predict_proba``; it is mutated and re-fitted here.
        essays_paths: iterable of essay file paths.
        parallel: kept for signature parity; this function never reads it.

    Returns:
        None. Output goes to disk and to stdout.

    NOTE(review): a function with this name is also defined earlier in the
    visible source; this later definition is the one that stays bound.
    """
    import sys  # local imports: progress output and failure reporting
    import traceback

    print(model_name)
    for essaypath in sorted(essays_paths):
        # Keep the per-essay progress on one line until the first
        # newline-terminated print.
        sys.stdout.write("%s " % essaypath)
        essayname = essaypath.split("/")[-1].split(".")[0]
        save_as = MODEL_PATHS + essayname + "_" + pipeline["name"] + "_" + model_name
        if os.path.exists(save_as):
            print("Skipping")
            continue

        essays = EssayCollection(essaypath, essayname)
        essays.apply_datasteps(pipeline["steps"])
        essays.create_feature_matrix(min_sparsity=5)
        # `essays` is not modified below, so one metadata lookup is enough.
        meta = essays.meta_data()

        X_all = np.array(essays.feature_matrix.todense(), dtype=np.float32)
        y_all = np.array(meta["score3"].map(int), dtype=np.int32)
        non_duplicated = get_nonduplicate_columns(pd.DataFrame(X_all))
        sys.stdout.write("orig dimensions %s " % X_all.shape[1])
        X_all = X_all[:, non_duplicated]
        sys.stdout.write("reduced dimensions %s " % X_all.shape[1])

        trainset = np.where(meta.essay_type == "TRAINING")[0]
        testset = np.where(meta.essay_type == "VALIDATION")[0]

        scorer = 3
        # Seven CV splits on the training rows plus a train-on-all /
        # predict-validation split appended at the end.
        kf = cross_validation.KFold(len(trainset), n_folds=7, random_state=0)
        kf = [(trainset[tr], trainset[te]) for tr, te in kf] + [(trainset, testset)]
        scores = sorted(meta["score%d" % scorer].unique())
        predictions = np.zeros((meta.shape[0], len(scores)))

        # NOTE(review): this presumably targets the input and output layers
        # of a three-layer network -- confirm against how model_f is built.
        model_f.layer_sizes[0] = X_all.shape[1]
        model_f.layer_sizes[2] = len(scores)

        try:
            for n, (tr_ind, te_ind) in enumerate(kf):
                print(n)
                scaler = StandardScaler()
                scaler.fit(X_all[tr_ind, :])
                X_tr = scaler.transform(X_all[tr_ind, :]) / 50.0
                X_te = scaler.transform(X_all[te_ind, :]) / 50.0
                model_f.fit(X_tr, y_all[tr_ind])
                print(kappa.quadratic_weighted_kappa(
                    meta["score3"][tr_ind], model_f.predict(X_tr)))
                print(kappa.quadratic_weighted_kappa(
                    meta["score3"][te_ind], model_f.predict(X_te)))
                predictions[te_ind, :] = model_f.predict_proba(X_te)
        except Exception:
            # The run stays best-effort (whatever was predicted so far is
            # still saved), but the error is reported instead of being
            # swallowed, and KeyboardInterrupt/SystemExit are not caught.
            # NOTE(review): the saved partial file makes later runs skip this
            # essay set; remove it to retry.
            sys.stderr.write(
                "model_generic_DBN: fold loop failed for %s; "
                "writing partial predictions\n" % essaypath)
            traceback.print_exc()

        predictions = pd.DataFrame(predictions)
        predictions.columns = ["scorer_%d_grade_%d" % (scorer, grade) for grade in scores]
        # NOTE(review): the stored value is the argmax column position, which
        # equals the grade only when the grades are exactly 0..k-1.
        predictions["pred_scorer_3"] = np.array(predictions).argmax(axis=1)
        predictions.to_csv(save_as, index=False)
        print(kappa.quadratic_weighted_kappa(
            meta["score3"][trainset], predictions["pred_scorer_3"][trainset]))