def get_zsxx_info(self):
    """Query zx_sbzsxx_sample_w_2 for ``self.nsrsbh`` and cache the result.

    The cutoff date comes from ``handle(...).get_sbxx_data()``; when
    ``self.sssqz_max`` is unset it is initialised from that result.  Rows
    whose ``sssq_z`` is after the cutoff are excluded, and the ``not exists``
    clause drops rows for which another row (same nsrsbh, same ``sssq_q``
    year) has an ``lrsj`` more than 3/24/60 later.

    Side effects: sets ``self.zsxx_info`` (and possibly ``self.sssqz_max``)
    and prints the head of the result.  Returns ``None``.
    """
    # Format the cutoff only when one is set.  The original formatted it
    # unconditionally and therefore crashed with AttributeError in exactly
    # the "unset" case that the guard below is written for.
    cutoff_arg = self.sssqz_max
    if cutoff_arg:
        cutoff_arg = cutoff_arg.strftime('%Y-%m-%d')
    _, sssqz_max = handle(self.nsrsbh, cutoff_arg).get_sbxx_data()
    if not self.sssqz_max:
        self.sssqz_max = sssqz_max
    sssqz = sssqz_max.strftime('%Y-%m-%d')

    sql = (
        "select distinct nsrsbh, sssq_q, sssq_z, jkqx, jkfsrq, se, zsxm_mc, SKZL_MC "
        "from zx_sbzsxx_sample_w_2 a "
        "where not exists (select 1 from zx_sbzsxx_sample_w_2 b "
        "where a.nsrsbh = b.nsrsbh "
        "and substr(a.sssq_q,1,4)=substr(b.sssq_q,1,4) "
        "and a.lrsj < b.lrsj - 3 / 24 / 60) "
        "and sssq_z <= '{0}' and nsrsbh = '{1}' "
    ).format(sssqz, self.nsrsbh)
    result, col = self.db_handle.query(sql)
    zsxx_info = common.dataframe(result, col)

    self.zsxx_info = zsxx_info
    print(zsxx_info.head())
def get_cw_data(self):
    """Load the zcfzb_xm / lrb_xm rows joined on nsrsbh for ``self.nsrsbh``.

    Stores the resulting frame on ``self.cw_data`` and pretty-prints it.
    Returns ``None``.
    """
    query_template = (
        "select a.*,b.* from zcfzb_xm a, lrb_xm b "
        "where a.nsrsbh=b.nsrsbh and a.nsrsbh='{0}' "
    )
    rows, columns = self.db_handle.query(query_template.format(self.nsrsbh))
    self.cw_data = common.dataframe(rows, columns)
    pprint(self.cw_data)
def get_sbxx_data(self):
    """Fetch the zx_sbxx_sample_w_2 rows for ``self.nsrsbh``.

    The query is limited to ``sssqq`` within the last 365 * 3 days and to the
    two ``ZSXMMC`` values listed in the SQL; rows for which another row of
    the same nsrsbh has an ``lrsj`` more than 3/24/60 later are dropped.

    If ``self.sssqz_max`` is unset it is initialised to the largest ``sssqz``
    found (parsed as ``%Y-%m-%d``).  ``self.data`` receives the rows whose
    ``sssqz`` is not after ``self.sssqz_max``; the unfiltered frame is
    pretty-printed.

    Returns:
        tuple: ``(self.data, self.sssqz_max)``.
    """
    query = (
        "select distinct nsrsbh, t.sssqq, t.sssqz, t.qbxse, t.ysxssr, t.ybtse, "
        "t.yjse, t.jmse, t.sbrq, t.sbqx, t.zsxmmc "
        "from zx_sbxx_sample_w_2 t "
        "where t.nsrsbh = '{0}' "
        "and sssqq >= to_char(sysdate - 365 * 3, 'yyyy-mm-dd') "
        "and ZSXMMC IN ('增值税', '企业所得税') "
        "and not exists (select 1 from zx_sbxx_sample_w_2 b "
        "where t.nsrsbh = b.nsrsbh and t.lrsj < b.lrsj - 3 / 24 / 60)"
    ).format(self.nsrsbh)
    rows, columns = self.db_handle.query(query)
    declarations = common.dataframe(rows, columns)

    latest_period_end = datetime.datetime.strptime(
        declarations['sssqz'].max(), '%Y-%m-%d')
    if not self.sssqz_max:
        self.sssqz_max = latest_period_end

    # These amount columns are cast to float one at a time.
    for amount_column in ('qbxse', 'jmse', 'yjse', 'ybtse'):
        declarations[amount_column] = declarations[amount_column].astype('float')

    # sssqz is compared as a '%Y-%m-%d' string, not as a datetime.
    cutoff = self.sssqz_max.strftime('%Y-%m-%d')
    self.data = declarations[declarations['sssqz'] <= cutoff]
    pprint(declarations)
    return self.data, self.sssqz_max
def get_jc_info(self):
    """Count zx_jcajxx_sample rows per look-back window before ``self.sssqz_max``.

    For each window of 3, 6, 12 and 24 months the number of rows whose
    ``aydjrq`` lies strictly between (``sssqz_max`` - n months) and
    ``sssqz_max`` is stored in column ``jcaj_<n>``.  Dates are compared as
    ``%Y-%m-%d`` strings.

    Returns:
        DataFrame: a single-row frame with one ``jcaj_<n>`` column per window.
    """
    # Inspection-case records
    query = (
        "select distinct wfwzlxmc, aydjrq, jclxmc from zx_jcajxx_sample a "
        "where nsrsbh = '{0}' "
        "and not exists (select 1 from zx_jcajxx_sample b "
        "where a.nsrsbh = b.nsrsbh "
        "and substr(a.AYDJRQ, 1, 4) = substr(b.AYDJRQ, 1, 4) "
        "and a.lrsj < b.lrsj - 3 / 24 / 60) "
    ).format(self.nsrsbh)
    rows, columns = self.db_handle.query(query)
    cases = common.dataframe(rows, columns)
    pprint(cases)

    jc_info = DataFrame(index=[0])
    observed = self.sssqz_max
    # The upper bound is the observation date itself for every window.
    upper = (observed - relativedelta(months=0)).strftime('%Y-%m-%d')
    for months in (3, 6, 12, 24):
        # Lower bound: n months before the observation date.
        lower = (observed - relativedelta(months=months)).strftime('%Y-%m-%d')
        in_window = (cases['aydjrq'] < upper) & (cases['aydjrq'] > lower)
        jc_info['jcaj' + '_' + str(months)] = len(cases[in_window])
    pprint(jc_info)
    return jc_info
def get_nsrjcxx_info(self):
    """Assemble the basic-information feature frame for ``self.nsrsbh``.

    Steps, in order:
      * initialise ``self.sssqz_max`` from ``get_sbxx_data()`` if unset;
      * read the newest (by ``lrsj``) t_nsrjcxx row;
      * left-join dm_hy_2017 to obtain ``hyml_dm`` and map it through
        ``self.transform_hydm`` into ``hy_class``;
      * derive ``kyrq_dis`` as (``sssqz_max`` - ``kyrq``) in days / 365,
        rounded to one decimal;
      * add ``nl_1`` / ``xb_1`` from ``self.transform_sfz`` applied to the
        ``dbr_zjhm`` values read from zx_lxrxx_sample, plus ``dbr_zjhm``.

    Returns:
        DataFrame: the assembled frame (also pretty-printed).
    """
    if not self.sssqz_max:
        _, self.sssqz_max = handle(self.nsrsbh,
                                   self.sssqz_max).get_sbxx_data()

    # Basic taxpayer record
    base_query = (
        "select nsrsbh,sshydm ,zczb,nslxmc,xydj ,kyrq "
        "from (select * from t_nsrjcxx where nsrsbh='{0}' order by lrsj desc) "
        "where rownum=1"
    ).format(self.nsrsbh)
    rows, columns = self.db_handle.query(base_query)
    base_df = common.dataframe(rows, columns)

    # Industry codes
    rows, columns = self.db_handle.query(
        "select hyml_dm,hymx_dm from dm_hy_2017")
    industry_df = common.dataframe(rows, columns)
    # Rename the second column so it can serve as the join key.
    industry_df.columns = ['hyml_dm', 'sshydm']

    nsr_info = pd.merge(base_df, industry_df, how='left', on='sshydm')
    nsr_info['hy_class'] = nsr_info['hyml_dm'].apply(self.transform_hydm)
    elapsed = self.sssqz_max - pd.to_datetime(nsr_info['kyrq'])
    nsr_info['kyrq_dis'] = elapsed.apply(
        lambda delta: round(delta.days / 365.0, 1))
    column_aliases = {'sshydm': 'hy', 'xydj': 'nsrxypj', 'nslxmc': 'nsrlx'}
    nsr_info.rename(columns=column_aliases, inplace=True)
    nsr_info.drop(['hyml_dm', 'kyrq'], axis=1, inplace=True)

    # Age and gender (derived from the dbr_zjhm value)
    contact_query = (
        "select dbr_zjhm from zx_lxrxx_sample a "
        "where not exists (select 1 from zx_lxrxx_sample b "
        "where a.nsrsbh = b.nsrsbh and a.lrsj < b.lrsj - 3 / 24 / 60) "
        "and nsrsbh ='{0}' and bssf = 1"
    ).format(self.nsrsbh)
    rows, columns = self.db_handle.query(contact_query)
    id_numbers = common.dataframe(rows, columns)['dbr_zjhm']
    nsr_info['nl_1'] = id_numbers.apply(
        lambda number: self.transform_sfz(number, True))
    nsr_info['xb_1'] = id_numbers.apply(
        lambda number: self.transform_sfz(number, False))
    nsr_info['dbr_zjhm'] = id_numbers

    pprint(nsr_info)
    return nsr_info
def payh_sbxx_check(self):
    """Compare the computed declaration features with table t_payh_sb.

    Reads the t_payh_sb rows for ``self.nsrsbh``, builds the feature frame
    via ``handle(...).get_concat_df()``, and hands both to
    ``self.check_data``.  The two resulting frames are written to
    ``sb_equal.csv`` and ``sb_diff.csv`` under ``self.path`` and the diff is
    pretty-printed.

    Returns:
        tuple: ``(sb_equal, sb_diff)``.  The original returned ``None``; the
        pair is now returned so this method matches the other
        ``payh_*_check`` methods.
    """
    sql = "select * from t_payh_sb where nsrsbh='{0}' ".format(self.nsrsbh)
    result, col = self.db_handle.query(sql)
    sbxx_info = common.dataframe(result, col)

    sb_feature = handle(self.nsrsbh, self.sssqz_max).get_concat_df()
    sb_equal, sb_diff = self.check_data(sb_feature, sbxx_info)
    pprint(sb_diff)

    sb_equal.to_csv(self.path + 'sb_equal.csv')
    sb_diff.to_csv(self.path + 'sb_diff.csv')
    return sb_equal, sb_diff
def payh_cwzb_check(self):
    """Compare the computed financial-indicator features with the stored ones.

    The stored side is the first row (``rownum=1``) of t_cwzb_list joined to
    t_cwzb_list_1 on nsrsbh; the computed side comes from
    ``handle_cwzb(...).get_concat_df()``.  The outputs of ``self.check_data``
    are written to ``cwzb_equal.csv`` / ``cwzb_diff.csv`` under ``self.path``.

    Returns:
        tuple: ``(cwzb_equal, cwzb_diff)``.
    """
    stored_query = (
        "select a.*,b.* from t_cwzb_list a,t_cwzb_list_1 b "
        "where a.nsrsbh=b.nsrsbh and a.nsrsbh='{0}' and rownum=1"
    ).format(self.nsrsbh)
    rows, columns = self.db_handle.query(stored_query)
    stored = common.dataframe(rows, columns)

    computed = handle_cwzb(self.nsrsbh).get_concat_df()
    matched, mismatched = self.check_data(computed, stored)

    outputs = ((matched, 'cwzb_equal.csv'), (mismatched, 'cwzb_diff.csv'))
    for frame, filename in outputs:
        frame.to_csv(self.path + filename)
    return matched, mismatched
def get_wfwz_info(self):
    """Count zx_wfwzxx_sample rows per type and look-back window.

    For every (type code, column prefix) pair and every window of 3, 6, 9
    and 12 months, the number of rows whose ``djrq`` lies strictly between
    (``self.sssqz_max`` - n months) and ``self.sssqz_max`` is stored in
    column ``<prefix>_<n>m``.  The ``wfwz`` prefix counts all types; an extra
    ``wfwz_24m`` column holds the all-type count for 24 months.  Dates are
    compared as ``%Y-%m-%d`` strings.

    Returns:
        DataFrame: a single-row frame with the count columns.
    """
    sql = (
        "select distinct zywfwzss, djrq, wfwzlxdm from zx_wfwzxx_sample a "
        "where nsrsbh = '{0}' "
        "and not exists (select 1 from zx_wfwzxx_sample b "
        "where a.nsrsbh = b.nsrsbh "
        "and substr(a.DJRQ,1,4)=substr(b.DJRQ,1,4) "
        "and a.lrsj < b.lrsj - 3 / 24 / 60) "
    ).format(self.nsrsbh)
    result, col = self.db_handle.query(sql)
    wfwz_df = common.dataframe(result, col)
    pprint(wfwz_df)

    # (wfwzlxdm code, output column prefix).  The trailing comments carry
    # the labels from the original, previously unused, name list.
    indicators = [
        ('04', 'wfwz_fpwf'),        # invoice violations
        ('06', 'wfwz_fzggy'),       # non-intentional violations
        ('03', 'wfwz_ks'),          # resisting tax
        ('02', 'wfwz_ps'),          # tax fraud
        ('99', 'wfwz_qt'),          # other violations
        ('08', 'wfwz_sszclwwfcs'),  # tax-policy-exception violations
        ('07', 'wfwz_swjg'),        # improper enforcement by tax authority
        ('01', 'wfwz_tbjnsk'),      # evading tax payment
        ('05', 'wfwz_wfssgl'),      # tax-administration violations
        ('', 'wfwz'),               # all types together (code unused)
    ]
    month_bins = [3, 6, 9, 12]

    wfwz_info = DataFrame(index=[0])
    sssqz = self.sssqz_max
    # The upper bound is the observation date for every window.
    upper = (sssqz - relativedelta(months=0)).strftime('%Y-%m-%d')

    for code, prefix in indicators:
        for n in month_bins:
            lower = (sssqz - relativedelta(months=n)).strftime('%Y-%m-%d')
            in_window = (wfwz_df['djrq'] > lower) & (wfwz_df['djrq'] < upper)
            if prefix != 'wfwz':
                in_window = (wfwz_df['wfwzlxdm'] == code) & in_window
            wfwz_info[prefix + '_' + str(n) + 'm'] = len(wfwz_df[in_window])

    # wfwz_24m: all types, 24-month window (kept last to preserve column order).
    lower_24 = (sssqz - relativedelta(months=24)).strftime('%Y-%m-%d')
    data = wfwz_df[(wfwz_df['djrq'] > lower_24) & (wfwz_df['djrq'] < upper)]
    wfwz_info['wfwz_24m'] = len(data)

    pprint(wfwz_info)
    return wfwz_info
def get_bg_info(self):
    """Count zx_bgdjxx_sample rows per item and look-back window.

    Only rows whose ``bgqnr`` differs from ``bghnr`` are queried.  For each
    tracked item (matched on ``bgxmmc``) and each window of 3, 6, 9, 12 and
    24 months, the number of rows whose ``bgrq`` lies strictly between
    (``self.sssqz_max`` - n months) and ``self.sssqz_max`` is stored in
    column ``<prefix>_<n>m``.  Dates are compared as ``%Y-%m-%d`` strings.

    Returns:
        DataFrame: a single-row frame with the count columns.
    """
    query = (
        "select distinct bgrq, bgqnr, bghnr, bgxmmc from zx_bgdjxx_sample a "
        "where nsrsbh ='{0}' and bgqnr != bghnr "
        "and not exists (select 1 from zx_bgdjxx_sample b "
        "where a.nsrsbh = b.nsrsbh "
        "and substr(a.bgrq, 1, 4) = substr(b.bgrq, 1, 4) "
        "and a.lrsj < b.lrsj - 3 / 24 / 60) "
    ).format(self.nsrsbh)
    rows, columns = self.db_handle.query(query)
    changes = common.dataframe(rows, columns)
    pprint(changes)

    # (bgxmmc value or list of accepted values, output column prefix).
    tracked_items = (
        ('办税人员证件号码', 'bg_bsry'),
        ('财务负责人身份证件号码', 'bg_cwfzr'),
        ('生产经营地址', 'bg_dz'),
        ('投资方', 'bg_tzf'),
        ('注册资本', 'bg_zczb'),
        ('经营范围', 'jyfw'),
        (['法定代表人(负责人)移动电话', '法定代表人(负责人、业主)移动电话'],
         'bg_frdbdh'),
        (['国标行业', '国标行业(附)'], 'bg_hybg'),
    )
    windows = (3, 6, 9, 12, 24)

    observed = self.sssqz_max
    bg_info = DataFrame(index=[0])
    for item_names, prefix in tracked_items:
        for months in windows:
            # Window bounds: the observation date and n months before it.
            upper = (observed - relativedelta(months=0)).strftime('%Y-%m-%d')
            lower = (observed - relativedelta(months=months)).strftime('%Y-%m-%d')
            # Entries given as a list accept any of their names.
            if isinstance(item_names, list):
                name_matches = changes['bgxmmc'].isin(item_names)
            else:
                name_matches = changes['bgxmmc'] == item_names
            selected = changes[name_matches
                               & (changes['bgrq'] < upper)
                               & (changes['bgrq'] > lower)]
            bg_info[prefix + '_' + str(months) + 'm'] = len(selected)
    pprint(bg_info)
    return bg_info
def payh_sxyxx_check(self):
    """Compare the features from ``handle_sxyxx`` with table t_payh_sxyxx_2.

    Pretty-prints the stored rows, the head of the computed features and the
    diff, and writes the two outputs of ``self.check_data`` to
    ``sxy_equal.csv`` / ``sxy_diff.csv`` under ``self.path``.

    Returns:
        tuple: ``(sxy_equal, sxy_diff)``.
    """
    stored_query = "select * from t_payh_sxyxx_2 where nsrsbh='{0}' ".format(
        self.nsrsbh)
    rows, columns = self.db_handle.query(stored_query)
    stored = common.dataframe(rows, columns)
    pprint(stored)

    computed = handle_sxyxx(self.nsrsbh, self.sssqz_max).get_concat_df()
    pprint(computed.head())

    matched, mismatched = self.check_data(computed, stored)
    pprint(mismatched)

    outputs = ((matched, 'sxy_equal.csv'), (mismatched, 'sxy_diff.csv'))
    for frame, filename in outputs:
        frame.to_csv(self.path + filename)
    return matched, mismatched
def payh_zsxx_check(self):
    """Compare the features from ``handle_zsxx`` with table t_payh_sbzs_xz.

    Pretty-prints the stored rows, the computed features and the diff, and
    writes the two outputs of ``self.check_data`` to ``zsxx_equal.csv`` /
    ``zsxx_diff.csv`` under ``self.path``.

    Returns:
        tuple: ``(zsxx_equal, zsxx_diff)``.
    """
    stored_query = "select * from t_payh_sbzs_xz where nsrsbh='{0}' ".format(
        self.nsrsbh)
    rows, columns = self.db_handle.query(stored_query)
    stored = common.dataframe(rows, columns)
    pprint(stored)

    computed = handle_zsxx(self.nsrsbh, self.sssqz_max).get_concat_df()
    pprint('zsxx_feature:')
    pprint(computed)

    matched, mismatched = self.check_data(computed, stored)
    pprint(mismatched)

    outputs = ((matched, 'zsxx_equal.csv'), (mismatched, 'zsxx_diff.csv'))
    for frame, filename in outputs:
        frame.to_csv(self.path + filename)
    return matched, mismatched
def payh_jcxx_check(self):
    """Compare the features from ``handle_jcxx`` with table t_payh_jbxx_2.

    Pretty-prints the stored rows, the computed features and the diff (with
    label lines before the first two), and writes the two outputs of
    ``self.check_data`` to ``jcxx_equal.csv`` / ``jcxx_diff.csv`` under
    ``self.path``.

    Returns:
        tuple: ``(jcxx_equal, jcxx_diff)``.
    """
    stored_query = "select * from t_payh_jbxx_2 where nsrsbh='{0}' ".format(
        self.nsrsbh)
    rows, columns = self.db_handle.query(stored_query)
    stored = common.dataframe(rows, columns)
    pprint('jcxx:')
    pprint(stored)

    computed = handle_jcxx(self.nsrsbh, self.sssqz_max).get_concat_df()
    pprint('jcxx_feature:')
    pprint(computed)

    matched, mismatched = self.check_data(computed, stored)
    pprint(mismatched)

    outputs = ((matched, 'jcxx_equal.csv'), (mismatched, 'jcxx_diff.csv'))
    for frame, filename in outputs:
        frame.to_csv(self.path + filename)
    return matched, mismatched
def get_tzf_info(self):
    """Build shareholder features from zx_tzfxx_sample for ``self.nsrsbh``.

    Columns of the returned single-row frame:
      * ``holder_count`` - number of rows returned by the query;
      * ``holder_count_natural`` - rows whose ``tzfjjxzdm`` starts with '4';
      * ``holder_count_corporate`` - the remainder;
      * ``hh_index`` - sum of ``tzbl * tzbl`` over all rows;
      * ``holder_first`` - 0 if a row with the maximum ``tzbl`` has a
        ``tzfjjxzdm`` starting with '4', else 1;
      * ``holder_first_frdb`` - 2 if ``holder_first`` is 1; otherwise 1 if
        such a row has ``zjhm`` equal to the ``dbr_zjhm`` from
        ``get_nsrjcxx_info()`` or equal to 'X', else 0.

    The original wrote the initial 0 to a misspelt ``holdr_first_frdb``
    column, so the later "column not present" check was always true and the
    0 case was reported as 2.  The spelling is fixed here and the stray
    column is no longer produced.

    Returns:
        DataFrame: the single-row feature frame (also pretty-printed).
    """
    # Investor records
    sql = (
        "select distinct nsrsbh,tzfmc,tzfjjxzdm,tzbl,tzfjjxzmc, "
        "zjhm,tzbl * tzbl as tzbl2 from zx_tzfxx_sample a "
        "where nsrsbh = '{0}' "
        "and not exists (select 1 from zx_tzfxx_sample b "
        "where a.nsrsbh = b.nsrsbh and a.lrsj < b.lrsj - 3 / 24 / 60)"
    ).format(self.nsrsbh)
    result, col = self.db_handle.query(sql)
    tzf_df = common.dataframe(result, col)

    tzf_info = DataFrame(index=[0])
    # na=False makes null codes count as "not starting with '4'", which is
    # what the original achieved by and-ing with notnull().
    is_natural = tzf_df['tzfjjxzdm'].str.startswith('4', na=False)
    tzf_info['holder_count'] = len(tzf_df)
    tzf_info['holder_count_natural'] = len(tzf_df[is_natural])
    tzf_info['holder_count_corporate'] = (
        tzf_info['holder_count'] - tzf_info['holder_count_natural'])
    tzf_info['hh_index'] = tzf_df['tzbl2'].sum()

    # Largest shareholder(s): every row sharing the maximum tzbl.
    nsr_df = self.get_nsrjcxx_info()
    legal_rep_id = nsr_df['dbr_zjhm'].values[0]
    tzf_df_new = tzf_df[tzf_df['tzbl'] == tzf_df['tzbl'].max()]
    holder_first = tzf_df_new[
        tzf_df_new['tzfjjxzdm'].str.startswith('4', na=False)]

    if len(holder_first) > 0:
        tzf_info['holder_first'] = 0
        tzf_info['holder_first_frdb'] = 0
    else:
        tzf_info['holder_first'] = 1

    # Rows whose zjhm matches the dbr_zjhm value, or is the literal 'X'.
    holder_frdb = holder_first[(holder_first['zjhm'] == legal_rep_id)
                               | (holder_first['zjhm'] == 'X')]
    if len(holder_frdb) > 0:
        tzf_info['holder_first_frdb'] = 1
    elif 'holder_first_frdb' not in tzf_info.columns:
        # Reached only when no top row has a code starting with '4'.
        tzf_info['holder_first_frdb'] = 2

    pprint(tzf_info)
    return tzf_info
def get_sxy_data(self):
    """Load the zx_jydx rows for ``self.nsrsbh`` and cache related values.

    Requires ``self.sssqz_max`` to be set to an object with ``strftime``.

    Side effects:
      * ``self.qbxse_last`` / ``self.qbxse_pre`` - minima of the
        ``qbxse_last`` and ``qbxse_1_1`` columns of
        ``handle(...).get_xse_info()``;
      * ``self.now_year`` - the four-character year of ``self.sssqz_max``;
        ``self.last_year`` - that year minus one, as a string;
      * ``self.sxy`` - the query result with ``jyje``, ``jyjebl``, ``se``,
        ``pm`` and ``sxybz`` cast to float (also pretty-printed).

    Returns ``None``.
    """
    sssqz_max = self.sssqz_max.strftime('%Y-%m-%d')
    xse_info = handle(self.nsrsbh, sssqz_max).get_xse_info()
    self.qbxse_last = xse_info['qbxse_last'].min()
    self.qbxse_pre = xse_info['qbxse_1_1'].min()

    # The original branched on ``not self.sssqz_max`` here, but that branch
    # was unreachable (the strftime call above already needs a value) and
    # would have failed anyway, since it called strftime on a str.
    self.now_year = sssqz_max[:4]
    self.last_year = str(int(self.now_year) - 1)

    # NOTE(review): sssq is compared with the bare 4-character year string;
    # confirm that is the intended cutoff.
    sql = (
        "select distinct jyje, jyjebl, gfnsrsbh, xfnsrsbh, sssq, sxybz, "
        "nsrsbh, se, pm, nsrmc from zx_jydx a "
        "where nsrsbh = '{0}' and sssq <= '{1}' "
        "and not exists (select * from zx_jydx b "
        "where a.nsrsbh = b.nsrsbh and a.sssq = b.sssq "
        "and a.lrsj < b.lrsj - 3 / 24 / 60)"
    ).format(self.nsrsbh, self.now_year)
    result, col = self.db_handle.query(sql)
    self.sxy = common.dataframe(result, col)

    # A distinct loop name avoids shadowing the query's ``col`` result.
    for numeric_column in ['jyje', 'jyjebl', 'se', 'pm', 'sxybz']:
        self.sxy[numeric_column] = self.sxy[numeric_column].astype('float')
    pprint(self.sxy)