def _fetch_data(self):
    """Fetch the modeling data.

    Queries Pinkunhu2015 rows whose county is 'A县', whose
    ny_person_income is not -1 and whose person_year_total_income lies
    strictly between 0 and 7000, then passes every column named in
    self.features, and the self.target column, through normalize().

    Returns a tuple (X, Y): X is a list holding one list of normalized
    feature values per record, Y is the list of normalized target values
    in the same record order.
    """
    session = get_db_session()
    conditions = [
        Pinkunhu2015.county == 'A县',
        # NOTE(review): -1 presumably marks a missing value -- confirm.
        Pinkunhu2015.ny_person_income != -1,
        Pinkunhu2015.person_year_total_income > 0,
        Pinkunhu2015.person_year_total_income < 7000,
    ]
    records = session.query(Pinkunhu2015).filter(*conditions).all()

    X, Y = [], []
    for record in records:
        # Features first, then the target, for each record in turn.
        X.append([normalize(name, getattr(record, name))
                  for name in self.features])
        Y.append(normalize(self.target, getattr(record, self.target)))

    # An earlier experiment tried to screen out possibly erroneous rows
    # here: person_year_total_income was bucketed to multiples of 100 and
    # buckets occurring 5 times or fewer were excluded from the model.
    # It did not improve results and was abandoned.
    return X, Y
def _fetch_test_data(self):
    """Fetch the test data.

    Queries every Pinkunhu2015 row whose county is '彝良县' and passes a
    fixed list of columns, plus the 'poor_status' column, through
    normalize().

    Returns a tuple (X, Y): X is a list holding one list of normalized
    column values per record (in the column order listed below), Y is the
    list of normalized 'poor_status' values in the same record order.
    """
    # Fixed feature columns; this method does not consult self.features.
    columns = (
        'tv', 'washing_machine', 'fridge', 'reason', 'is_danger_house',
        'is_back_poor', 'is_debt', 'standard', 'arable_land', 'debt_total',
        'living_space', 'member_count', 'person_year_total_income',
        'year_total_income', 'subsidy_total', 'wood_land',
        'xin_nong_he_total', 'xin_yang_lao_total', 'call_number',
        'bank_name', 'bank_number', 'help_plan',
    )

    session = get_db_session()
    query = session.query(Pinkunhu2015)
    records = query.filter(Pinkunhu2015.county == '彝良县').all()

    X, Y = [], []
    for record in records:
        # Features first, then the target, for each record in turn.
        X.append([normalize(name, getattr(record, name))
                  for name in columns])
        Y.append(normalize('poor_status', getattr(record, 'poor_status')))
    return X, Y
def _fetch_test_data(self):
    """Fetch the test data with dummy variables expanded.

    Queries Pinkunhu2015 rows whose county is 'B县' and whose
    ny_person_income and person_year_total_income both lie strictly
    between 0 and 7000 (ny_person_income is additionally required to
    differ from -1). Every column named in self.features, and the
    self.target column, is passed through normalize().

    The feature rows are then loaded into a DataFrame, each column named
    in self.dummy_features is expanded with pd.get_dummies (prefixed with
    the column name), and the original columns are dropped.

    Returns a tuple (X, Y): X is the resulting DataFrame, Y is the list
    of normalized target values in record order.
    """
    session = get_db_session()
    conditions = [
        Pinkunhu2015.county == 'B县',
        Pinkunhu2015.ny_person_income != -1,
        Pinkunhu2015.person_year_total_income > 0,
        Pinkunhu2015.person_year_total_income < 7000,
        Pinkunhu2015.ny_person_income > 0,
        Pinkunhu2015.ny_person_income < 7000,
    ]
    records = session.query(Pinkunhu2015).filter(*conditions).all()

    rows, Y = [], []
    for record in records:
        # Features first, then the target, for each record in turn.
        rows.append([normalize(name, getattr(record, name))
                     for name in self.features])
        Y.append(normalize(self.target, getattr(record, self.target)))

    # Build the dummy (indicator) columns.
    frame = pd.DataFrame(rows, columns=self.features)
    for feature in self.dummy_features:
        indicator_columns = pd.get_dummies(frame[feature], prefix=feature)
        frame = frame.join(indicator_columns)

    # Drop the source columns that have been replaced by dummies.
    frame = frame.drop(self.dummy_features, axis=1)
    X = frame.loc[:]
    return X, Y
def _fetch_data(self):
    """Fetch the modeling data.

    Queries every Pinkunhu2015 row whose county is 'A县' and passes each
    column named in self.features, and the self.target column, through
    normalize().

    Returns a tuple (X, Y): X is a list holding one list of normalized
    feature values per record, Y is the list of normalized target values
    in the same record order.
    """
    session = get_db_session()
    query = session.query(Pinkunhu2015)
    records = query.filter(Pinkunhu2015.county == 'A县').all()

    X, Y = [], []
    for record in records:
        # Features first, then the target, for each record in turn.
        X.append([normalize(name, getattr(record, name))
                  for name in self.features])
        Y.append(normalize(self.target, getattr(record, self.target)))
    return X, Y
def _fetch_test_data(self):
    """Fetch the test data.

    Queries Pinkunhu2015 rows whose county is 'B县', whose
    ny_person_income is not -1 and whose person_year_total_income lies
    strictly between 0 and 7000, then passes every column named in
    self.features, and the self.target column, through normalize().

    Returns a tuple (X, Y): X is a list holding one list of normalized
    feature values per record, Y is the list of normalized target values
    in the same record order.
    """
    session = get_db_session()
    conditions = [
        Pinkunhu2015.county == 'B县',
        # NOTE(review): -1 presumably marks a missing value -- confirm.
        Pinkunhu2015.ny_person_income != -1,
        Pinkunhu2015.person_year_total_income > 0,
        Pinkunhu2015.person_year_total_income < 7000,
    ]
    records = session.query(Pinkunhu2015).filter(*conditions).all()

    X, Y = [], []
    for record in records:
        # Features first, then the target, for each record in turn.
        X.append([normalize(name, getattr(record, name))
                  for name in self.features])
        Y.append(normalize(self.target, getattr(record, self.target)))
    return X, Y