def __init__(self, fs_tuition, sigmasq, opts):
    """Store first-stage tuition results and build the LSN model set.

    Args:
        fs_tuition: first-stage tuition estimates (stored as-is).
        sigmasq: variance estimate from the first stage (stored as-is).
        opts: dict of run options (stored as-is).
    """
    self.fs_tuition = fs_tuition
    self.sigmasq = sigmasq
    self.opts = opts
    # First-stage RHS is the full student-problem variable list; the
    # LSN RHS is the same list minus the rank and tuition columns.
    self.fs_rhs = lu.student_problem_vars()
    self.lsn_rhs = deepcopy(self.fs_rhs)
    self.lsn_rhs.remove('OverallRank')
    self.lsn_rhs.remove('Tuition')
    self.lsn_models = self.gen_matric_ev()
    self.data = None
def lsn_long_est(self, data):
    """Application/admission estimates from lsnLong.csv

    Fits one gradient-boosting classifier per decision stage, each
    trained only on the subpopulation that reached that stage:
    'app' on all rows, 'admit' on applicants, 'matric' on admits.

    Args:
        data: DataFrame containing the student-problem RHS columns
            plus binary 'app', 'admit' and 'matric' indicators.

    Returns:
        dict mapping stage name ('app'/'admit'/'matric') to its
        fitted GradientBoostingClassifier.
    """
    rhs = lu.student_problem_vars()
    # All three stages share one hyperparameter set; define it once
    # instead of repeating it per stage (the old code duplicated it
    # three times, inviting drift).
    hyper = dict(n_estimators=100, learning_rate=1.0, max_depth=1,
                 random_state=0)
    stage_samples = [
        ('app', data),
        ('admit', data.loc[data['app'] == 1]),
        ('matric', data.loc[data['admit'] == 1]),
    ]
    stages = {}
    for name, subset in stage_samples:
        stages[name] = ensemble.GradientBoostingClassifier(
            **hyper
        ).fit(subset[rhs], subset[name])
    if self.opts['verbose']:
        self._lsn_long_diagnostics(stages, data, rhs)
    return stages
def estimator_plot(self, stage, key):
    """Plotting function for any given stage

    Draws a horizontal bar chart of relative feature importances for
    one fitted stage onto the next subplot of ``self.fig``.

    Args:
        stage: fitted estimator exposing ``feature_importances_``.
        key: stage label used in the x-axis caption.
    """
    # Rescale so the most important feature scores 100.
    importance = stage.feature_importances_
    importance = 100.0 * (importance / importance.max())
    order = np.argsort(importance)
    positions = np.arange(order.shape[0]) + .5
    axis = self.fig.add_subplot(2, 2, self.plotnum)
    axis.barh(positions, importance[order], align='center')
    axis.set_yticks(positions)
    # Shorten a few variable names so the tick labels fit the axis.
    labels = np.array(lu.student_problem_vars())
    labels[labels == 'OverallRank'] = 'Rank'
    labels[labels == 'LSDAS_GPA'] = 'GPA'
    labels[labels == 'year'] = 'Year'
    axis.set_yticklabels(labels[order].tolist())
    axis.set_xlabel('Relative Importance: {0}'.format(key))
    self.plotnum += 1
def gen_data(self, treat):
    """Generate random dataset

    Draws a random sample of applicants for each year in the chosen
    window (sample size scaled down by 5 for memory) and stacks their
    application records into one DataFrame.

    Args:
        treat: if truthy, use post-treatment years 2010-2012;
            otherwise pre-treatment years 2007-2009.

    Returns:
        DataFrame of sampled student/school rows; 'id' indexes each
        sampled student within a year.
    """
    years = [2010, 2011, 2012] if treat else [2007, 2008, 2009]
    app_data_treat = self.app_data.loc[
        self.app_data['year'].isin(years)
    ].reset_index(drop=True)
    data = []
    for year in np.unique(app_data_treat['year']):
        app_data_year = self.app_data.loc[
            self.app_data['year'] == year
        ].reset_index(drop=True)
        # '//' keeps the size an int: under Python 3 the old '/ 5'
        # produced a float, which np.random.choice rejects.
        students = np.random.choice(
            app_data_year['user'],
            size=gen_n_apps(year).tolist()[0] // 5  # Scale for memory
        )
        keep_vars = lu.student_problem_vars()
        keep_vars.append('school')
        if self.opts['multiprocessing']:
            # A Manager-backed list is only needed when worker
            # processes must share it; the serial path below uses a
            # plain list and avoids spawning a Manager process.
            out = mp.Manager().list()
            mp_args = ((app_data_year, students, keep_vars, out, i)
                       for i in range(len(students)))
            pool = mp.Pool(processes=lc.N_THREADS)
            pool.map(gen_data_task, mp_args)
            pool.close()
            pool.join()
        else:
            out = []
            # range, not the Python-2-only xrange
            for i in range(len(students)):
                out_data = app_data_year.loc[
                    app_data_year['user'] == students[i], keep_vars
                ]
                out_data['id'] = i
                out.append(out_data)
        data_year = pd.concat(list(out))
        data_year.reset_index(inplace=True)
        data.append(data_year)
    data = pd.concat(data)
    data.reset_index(inplace=True)
    return data
def __init__(self, app_data, firststage, opts):
    """Hold application data, first-stage results and run options.

    Args:
        app_data: DataFrame of application records.
        firststage: first-stage estimation results (stored as-is).
        opts: dict of run options (stored as-is).
    """
    self.app_data = app_data
    self.firststage = firststage
    self.opts = opts
    # RHS variable list for the student problem; generated data is
    # filled in later.
    self.rhs = lu.student_problem_vars()
    self.data = None