Ejemplo n.º 1
0
    def _fetch_test_data(self):
        """ 获取测试数据 """
        session = get_db_session()
        objs = session.query(Pinkunhu2015).filter(
            Pinkunhu2015.county == '彝良县').all()
        X, Y = [], []
        for item in objs:
            col_list = []
            for col in [
                    'tv', 'washing_machine', 'fridge', 'reason',
                    'is_danger_house', 'is_back_poor', 'is_debt', 'standard',
                    'arable_land', 'debt_total', 'living_space',
                    'member_count', 'person_year_total_income',
                    'year_total_income', 'subsidy_total', 'wood_land',
                    'xin_nong_he_total', 'xin_yang_lao_total', 'call_number',
                    'bank_name', 'bank_number', 'help_plan'
            ]:

                normalized_value = normalize(col, getattr(item, col))
                col_list.append(normalized_value)
            X.append(col_list)
            normalized_value = normalize('poor_status',
                                         getattr(item, 'poor_status'))
            Y.append(normalized_value)

        return X, Y
Ejemplo n.º 2
0
    def _fetch_data(self):
        """ 获取建模数据 """
        session = get_db_session()
        objs = session.query(Pinkunhu2015).filter(
            Pinkunhu2015.county == 'A县',
            Pinkunhu2015.ny_person_income != -1,
            Pinkunhu2015.person_year_total_income > 0,
            Pinkunhu2015.person_year_total_income < 7000,
        ).all()
        X, Y = [], []
        for item in objs:
            col_list = []
            for col in self.features:
                normalized_value = normalize(col, getattr(item, col))
                col_list.append(normalized_value)
            X.append(col_list)
            normalized_value = normalize(self.target,
                                         getattr(item, self.target))
            Y.append(normalized_value)

        # # 筛掉可能有错误的数据
        # 人均年收入除以100后,查看分布,少于5次的不纳入模型, 效果不佳,废弃
        # df = pd.DataFrame(X, columns=self.features)
        # print '#df.shape:', df.shape
        # df['person_year_total_income'] = df['person_year_total_income'] / 100
        # df['person_year_total_income'] = df['person_year_total_income'].astype(int)
        # df['person_year_total_income'] = df['person_year_total_income'] * 100
        # df = df.groupby('person_year_total_income').filter(lambda x: len(x) > 5)
        # print '#df.shape:', df.shape
        # X, Y = df.loc[:, self.features[:-1]], df.loc[:, self.target]

        return X, Y
Ejemplo n.º 3
0
    def _fetch_test_data(self):
        """ 获取测试数据 """
        session = get_db_session()
        objs = session.query(Pinkunhu2015).filter(
            Pinkunhu2015.county == 'B县',
            Pinkunhu2015.ny_person_income != -1,
            Pinkunhu2015.person_year_total_income > 0,
            Pinkunhu2015.person_year_total_income < 7000,
            Pinkunhu2015.ny_person_income > 0,
            Pinkunhu2015.ny_person_income < 7000,
        ).all()
        X, Y = [], []
        for item in objs:
            col_list = []
            for col in self.features:
                normalized_value = normalize(col, getattr(item, col))
                col_list.append(normalized_value)
            X.append(col_list)
            normalized_value = normalize(self.target,
                                         getattr(item, self.target))
            Y.append(normalized_value)

        # 设置虚拟变量
        df = pd.DataFrame(X, columns=self.features)
        for item in self.dummy_features:
            dummies = pd.get_dummies(df[item], prefix=item)
            df = df.join(dummies)
        # 删除已设置虚拟变量的原变量
        df = df.drop(self.dummy_features, axis=1)
        X = df.loc[:]

        return X, Y
Ejemplo n.º 4
0
    def _fetch_data(self):
        """ 获取建模数据 """
        session = get_db_session()
        objs = session.query(Pinkunhu2015).filter(Pinkunhu2015.county == 'A县').all()
        X, Y = [], []
        for item in objs:
            col_list = []
            for col in self.features:
                normalized_value = normalize(col, getattr(item, col))
                col_list.append(normalized_value)
            X.append(col_list)
            normalized_value = normalize(self.target, getattr(item, self.target))
            Y.append(normalized_value)

        return X, Y
Ejemplo n.º 5
0
    def stat_col_percent(self, col):
        top_num = 10
        session = get_db_session()
        objs = session.query(getattr(
            Pinkunhu2015, col)).filter(Pinkunhu2015.poor_status != '已脱贫')
        res0 = {}
        for item in objs:
            k = (getattr(item, col) or '')[:2]
            cnt = res0.get(k, 0)
            res0[k] = cnt + 1

        self.plot_col_name(self.count_to_percent(res0)[:20],
                           title='%s-未脱贫' % col)

        objs = session.query(getattr(Pinkunhu2015, col))
        res1 = {}
        for item in objs:
            k = (getattr(item, col) or '')[:2]
            cnt = res1.get(k, 0)
            res1[k] = cnt + 1

        self.plot_col_name(self.count_to_percent(res1)[:20],
                           title='%s-全部' % col)

        objs2 = session.query(getattr(
            Pinkunhu2015, col)).filter(Pinkunhu2015.poor_status == '已脱贫')
        res2 = {}
        for item in objs2:
            k = (getattr(item, col) or '')[:2]
            cnt = res2.get(k, 0)
            res2[k] = cnt + 1

        self.plot_col_name(self.count_to_percent(res2)[:20],
                           title='%s-已脱贫' % col)

        res = []
        for k, v in res1.iteritems():
            print k
            res.append((k, res2.get(k, 0) * 100.0 / v))

        res = sorted(res, key=lambda x: -x[1])
        result = []
        for item in res:
            if res2.get(item[0]) > 200:
                result.append(
                    ('%s (%s户)' % (item[0], res2.get(item[0])), item[1]))

        self.plot_col_name(result[:top_num], title='%s-百分比' % col)
Ejemplo n.º 6
0
    def _fetch_test_data(self):
        """ 获取测试数据 """
        session = get_db_session()
        objs = session.query(Pinkunhu2015).filter(
                Pinkunhu2015.county == 'B县', Pinkunhu2015.ny_person_income != -1,
                Pinkunhu2015.person_year_total_income > 0, Pinkunhu2015.person_year_total_income < 7000,
        ).all()
        X, Y = [], []
        for item in objs:
            col_list = []
            for col in self.features:
                normalized_value = normalize(col, getattr(item, col))
                col_list.append(normalized_value)
            X.append(col_list)
            normalized_value = normalize(self.target, getattr(item, self.target))
            Y.append(normalized_value)

        return X, Y
Ejemplo n.º 7
0
def get_normalize():
    """ 根据数据库列值自动生成标量的map
        文件头部的 MAPPINGS 就是用该方法生成的
    """
    session = get_db_session()
    # 字符串标量 转化成数字标量
    res = OrderedDict()
    for col in [
            'reason', 'is_danger_house', 'is_back_poor', 'is_danger_house',
            'is_debt', 'poor_status', 'standard'
    ]:
        data = OrderedDict()
        idx = 1
        for item, in session.query(getattr(Pinkunhu2015, col)).distinct():
            data[item] = idx
            idx += 1
        res[col] = data
    print json.dumps(res, ensure_ascii=False, indent=2)
Ejemplo n.º 8
0
def chart_person_year_total_income():
    # 获取数据
    session = get_db_session()
    objs = session.query(Pinkunhu2015.person_year_total_income,
                         Pinkunhu2015.ny_person_income).filter(
                             Pinkunhu2015.county == 'A县',
                             Pinkunhu2015.ny_person_income >= -1,
                             Pinkunhu2015.ny_person_income < 7000,
                             Pinkunhu2015.person_year_total_income > 0,
                             Pinkunhu2015.person_year_total_income < 7000,
                         ).all()
    X, Y = [], []
    for item in objs:
        X.append(item.person_year_total_income)
        Y.append(
            item.ny_person_income if item.ny_person_income != -1 else -1000)
    # 绘图
    fig = plt.figure(1, figsize=(10, 10))
    ax1 = fig.add_subplot(221)
    ax2 = fig.add_subplot(222)
    ax3 = fig.add_subplot(223)
    ax4 = fig.add_subplot(224)
    # 画点图
    ax1.plot(X, Y, 'k.', markersize=2)

    # 画热力图
    heatmap_16, extent_16 = myplot(X, Y, nb=16)
    heatmap_32, extent_32 = myplot(X, Y, nb=32)
    heatmap_64, extent_64 = myplot(X, Y, nb=64)

    ax2.imshow(heatmap_16, extent=extent_16, origin='lower', aspect='auto')
    ax2.set_title("Smoothing over 16 neighbors")

    ax3.imshow(heatmap_32, extent=extent_32, origin='lower', aspect='auto')
    ax3.set_title("Smoothing over 32 neighbors")

    #Make the heatmap using a smoothing over 64 neighbors
    ax4.imshow(heatmap_64, extent=extent_64, origin='lower', aspect='auto')
    ax4.set_title("Smoothing over 64 neighbors")

    plt.show()
Ejemplo n.º 9
0
def update_next_year():
    """ 更新下一年是否脱贫以及下一年收入数据 """
    step = 100  # 一次取100条数据
    session = get_db_session()
    # 2016年数据
    mappings2016 = {}
    min_id, max_id = session.query(func.min(Pinkunhu2016.id),
                                   func.max(Pinkunhu2016.id)).one()
    while min_id <= max_id:
        objs = session.query(Pinkunhu2016).filter(
            Pinkunhu2016.id >= min_id, Pinkunhu2016.id <= min_id + step).all()
        for item in objs:
            mappings2016[item.card_number] = True
            mappings2016[item.card_number +
                         'year_total_income'] = item.year_total_income
            mappings2016[
                item.card_number +
                'person_year_total_income'] = item.person_year_total_income

        min_id += step
        print 'min_id:%s' % min_id

    # 2015年数据
    mappings2015 = {}
    min_id, max_id = session.query(func.min(Pinkunhu2015.id),
                                   func.max(Pinkunhu2015.id)).one()
    while min_id <= max_id:
        objs = session.query(Pinkunhu2015).filter(
            Pinkunhu2015.id >= min_id, Pinkunhu2015.id <= min_id + step).all()
        for item in objs:
            # 存取2015年数据
            mappings2015[item.card_number] = True
            mappings2015[item.card_number +
                         'year_total_income'] = item.year_total_income
            mappings2015[
                item.card_number +
                'person_year_total_income'] = item.person_year_total_income

            # 设置下一年数据
            if mappings2016.get(item.card_number):
                item.ny_is_poor = 1
                item.ny_total_income = mappings2016[item.card_number +
                                                    'year_total_income']
                item.ny_person_income = mappings2016[
                    item.card_number + 'person_year_total_income']
            else:
                print item.id, '不存在'

        min_id += step
        print 'min_id:%s' % min_id

    # 2014年数据
    min_id, max_id = session.query(func.min(Pinkunhu2014.id),
                                   func.max(Pinkunhu2014.id)).one()
    while min_id <= max_id:
        objs = session.query(Pinkunhu2014).filter(
            Pinkunhu2014.id >= min_id, Pinkunhu2014.id <= min_id + step).all()
        for item in objs:
            # 设置下一年数据
            if mappings2015.get(item.card_number):
                item.ny_is_poor = 1
                item.ny_total_income = mappings2015[item.card_number +
                                                    'year_total_income']
                item.ny_person_income = mappings2015[
                    item.card_number + 'person_year_total_income']
            else:
                print item.id, '不存在'

        min_id += step
        print 'min_id:%s' % min_id

    session.flush()
    session.commit()