def request_area_personcont(school: str, schooltype: str, buildings: list):
    """Aggregate the 0-17 population for a school district's buildings.

    Queries FT_RKJZT for two population categories (Shenzhen household
    registration and migrant population), summing counts per age across all
    building codes, and tags each output row with the school.

    :param school: school name written into every output row (SCHOOLNAME)
    :param schooltype: school type written into every output row (SCHOOLTYPE)
    :param buildings: building codes (lddm) belonging to the district
    :return: DataFrame with columns AGE / NUM / RKXZ / SCHOOLNAME / SCHOOLTYPE
    """
    # Oracle caps IN(...) lists at 1000 entries, so query in chunks of <=999.
    chunk_count = int(len(buildings) / 999) + 1
    split_buildings = np.array_split(buildings, chunk_count)
    hjrk_totals = {}  # age -> summed count, household-registered population
    ldrk_totals = {}  # age -> summed count, migrant population
    for chunk in split_buildings:
        try:
            in_clause = ', '.join("'%s'" % code for code in chunk)
            # NOTE(review): SQL built by interpolation; building codes come
            # from an internal GIS service, but parameterizing would be safer.
            hjrk_sql = ("SELECT age,sum(num) NUM,RKXZ FROM FT_RKJZT WHERE lddm IN(%s)"
                        " AND age <18 AND AGE >=0 AND rkxz='深圳户籍人口' GROUP BY AGE,RKXZ") % in_clause
            ldrk_sql = ("SELECT age,sum(num) NUM,RKXZ FROM FT_RKJZT WHERE lddm IN(%s)"
                        " AND age <18 AND AGE >=0 AND rkxz='流动人口' GROUP BY AGE,RKXZ") % in_clause
            hjrk_df = db_util.execute2Dataframe(hjrk_sql)
            ldrk_df = db_util.execute2Dataframe(ldrk_sql)
            for _, row in hjrk_df.iterrows():
                hjrk_totals[row['AGE']] = hjrk_totals.get(row['AGE'], 0) + row['NUM']
            for _, row in ldrk_df.iterrows():
                ldrk_totals[row['AGE']] = ldrk_totals.get(row['AGE'], 0) + row['NUM']
        except Exception as e:
            # Best-effort per chunk: log and keep aggregating the rest.
            print(e)
    result_rows = []
    for age, count in hjrk_totals.items():
        result_rows.append({'AGE': age, 'NUM': count, 'RKXZ': '深圳户籍人口',
                            'SCHOOLNAME': school, 'SCHOOLTYPE': schooltype})
    for age, count in ldrk_totals.items():
        result_rows.append({'AGE': age, 'NUM': count, 'RKXZ': '流动人口',
                            'SCHOOLNAME': school, 'SCHOOLTYPE': schooltype})
    return DataFrame(result_rows)
def fenci():
    """Tokenise every enterprise appeal (SQNR), count word frequencies with
    stop words removed, render a word cloud and print the sorted counts.

    Side effects: draws a word cloud via gen_word_cloud and prints the
    frequency table; returns nothing.
    """
    sql = """ select qymc,sqnr,dfnr from ENT_REQUIRENTMENT """
    df = db_util.execute2Dataframe(sql)
    stopwords = get_stopWords()
    words_count = {}
    for _, row in df.iterrows():
        question = row['SQNR']
        # Full-mode segmentation emits every candidate word - suitable here
        # because we only count occurrences, not reconstruct the sentence.
        for word in jieba.cut(question, cut_all=True):
            if word.strip() == '' or word in stopwords:
                continue
            words_count[word] = words_count.get(word, 0) + 1
    words_list = [{'单词': k, '出现次数': v} for k, v in words_count.items()]
    result = DataFrame(words_list).sort_values(by='出现次数', na_position='first')
    gen_word_cloud(words_count)
    print(result)
def request_area_building():
    """Fetch the 0-18 population for every school district polygon.

    For each district polygon, queries the ArcGIS service for intersecting
    buildings, then aggregates the child population per building set via
    request_area_personcont. Results are cached to an Excel file.

    :return: DataFrame of per-school population rows
    """
    file = 'D:\\pypy\\pythonresult\\教育学位\\学校人口信息.xls'
    if os.path.isfile(file):
        area_data = DataFrame(pd.read_excel(file))
        # BUG FIX: original used `or`, which returned even when the cached
        # sheet was empty; only a non-empty cache should short-circuit.
        if area_data is not None and not area_data.empty:
            return area_data
    areas = db_util.execute2Dataframe('SELECT\
 WWYJFX.T_JY_SCHOOLAREA.SCHOOLNAME,\
 WWYJFX.T_JY_SCHOOLAREA.SCHOOL_FULLNAME,\
 WWYJFX.T_JY_SCHOOLAREA.SCHOOLTYPE,\
 WWYJFX.T_JY_SCHOOLAREA.POLYGON_84\
 FROM\
 WWYJFX.T_JY_SCHOOLAREA\
 ')
    data = {
        'f': 'json',
        'returnGeometry': 'false',
        'spatialRel': 'esriSpatialRelIntersects',
        'geometryType': 'esriGeometryPolygon',
        'inSR': 4490,
        'outFields': 'BLDG_NO,NOWNAME',
        'outSR': 4490
    }
    url_prefox = 'http://10.190.55.55:8080/arcgis/rest/services/FTKSJ/JZWDLM_CGCS2000/MapServer/1/query'
    frames = []
    for index, row in areas.iterrows():
        polygon_84 = row['POLYGON_84']
        schoolname = row['SCHOOLNAME']
        schooltype = row['SCHOOLTYPE']
        # BUG FIX: original compared with `is not ''` / `is not np.nan`,
        # which test identity, not equality; pd.notna covers None and NaN.
        if pd.notna(polygon_84) and polygon_84 != '':
            geometry = split_point_to_geometry(polygon_84)
            data['geometry'] = geometry
            result = spider_util.open_url(url_prefox, 5, 20, data=data)  # 20s timeout
            jsondata = demjson.decode(result)
            buildings = get_building(jsondata)
            if buildings is None or len(buildings) == 0:
                print('该学校:' + schoolname + '楼栋id为空')
                continue
            childinfo = request_area_personcont(schoolname, schooltype, buildings)
            frames.append(childinfo)
    # DataFrame.append was deprecated/removed in modern pandas; concat the
    # per-school frames instead (empty result stays an empty DataFrame).
    df = pd.concat(frames, ignore_index=True) if frames else DataFrame()
    df.to_excel(file, index=False)
    return df
def address_format(table, lonField, latField):
    """Reverse-geocode each row's lon/lat to a street name and export to CSV.

    Fills the STREET column via address_standardization and writes the whole
    table to a desktop CSV. The original delete step is kept commented out.

    :param table: DB table name to read (and used for the output file name)
    :param lonField: column holding the longitude (gcj02ll)
    :param latField: column holding the latitude (gcj02ll)
    """
    sql = "select * from " + table
    delete_sql = "delete from " + table  # used only by the commented delete below
    df = db_util.execute2Dataframe(sql)
    length = len(df)
    for i in range(length):
        lon = df.at[i, lonField]
        lat = df.at[i, latField]
        # BUG FIX: validate BEFORE converting - the original called float()
        # first, so float('') raised and the None/'' checks were unreachable.
        # The latitude was also never validated.
        if lon is None or lon == '' or lat is None or lat == '':
            continue
        lon = float(lon)
        lat = float(lat)
        if math.isnan(lon) or math.isnan(lat):
            continue
        addressComponent = address_standardization.location2normaladdress(lon, lat, coordtype='gcj02ll')
        df.at[i, 'STREET'] = addressComponent['town']
        spider_util.log_progress(i, length, detailedLog=True)
    # db_util.delete(delete_sql)
    df.to_csv('C:\\Users\\admin\\Desktop\\' + table + '.csv', index=False, sep=',')
def format():
    """Parse CONST_LOCATION of each construction-permit row into district,
    street, road and Baidu/WGS-84 coordinates, then export to Excel.

    NOTE(review): this shadows the builtin `format`; renaming would break
    callers, so the name is kept.
    """
    df = db_util.execute2Dataframe('select * from T_OPEN_SGXKZXX ')
    total = len(df.index)  # total row count
    for x in range(total):
        addr = df['CONST_LOCATION'].iloc[x]
        try:
            comp = address_standardization.formatAddress(addr)
            # BUG FIX: DataFrame.set_value was removed in pandas 1.0;
            # .at is the supported scalar setter with the same semantics.
            df.at[x, 'QU'] = comp['district']
            df.at[x, 'STREET'] = comp['town']
            df.at[x, 'DL'] = comp['street']
            df.at[x, 'BD_X'] = comp['bd_x']
            df.at[x, 'BD_Y'] = comp['bd_y']
            df.at[x, 'LON84'] = comp['lon84']
            df.at[x, 'LAT84'] = comp['lat84']
        except Exception as e:
            # Best-effort: a single unparseable address must not stop the run.
            print('地址转换错误:', addr, e)
        spider_util.log_progress(x, total)
    print(df)
    df.to_excel('D:\\011111111111111111111111\\00临时文件\\T_OPEN_SGXKZXX.xlsx', index=False)
def loadSimpDat():
    """Build transaction lists for association-rule mining.

    For every enterprise appeal joined with its industry code (GBHY), extract
    the top-10 TextRank keywords from the appeal text (stop words removed)
    and append the industry code, forming one transaction per appeal.

    Side effects kept from the original: counts full-mode tokens across all
    appeals, renders a word cloud and computes (but does not use) the top-20
    words.

    :return: list of keyword lists, one per appeal
    """
    sql = """
    select a.qymc,a.sqnr,a.dfnr,c.GBHY from ENT_REQUIRENTMENT a inner join T_YW_ZZ_FR b on a.qymc=b.jgmc
    inner join OPENDATA_SY_INFO c on b.TYSHXYDM =c.TYSHXYDM
    """
    simpDat = []
    df = db_util.execute2Dataframe(sql)
    wordsCount = {}
    stopwords = get_stopWords()
    # NOTE(review): the original comment said "TF-IDF", but the code actually
    # uses jieba's TextRank keyword extractor.
    textrank = analyse.textrank
    for index, row in df.iterrows():
        hydm = row['GBHY']
        question = row['SQNR']
        # TextRank keyword extraction, top 10 keywords per appeal.
        keywords = textrank(question, topK=10)
        print('关键词:')
        print('/'.join(keywords))
        keywordsFilter = []
        for word in keywords:
            if word.strip() == '' or word in stopwords:
                continue
            keywordsFilter.append(word)
        if hydm is not None:
            keywordsFilter.append(hydm)
        simpDat.append(keywordsFilter)
        # Separate full-mode token count feeding the word cloud.
        for word in jieba.cut(question, cut_all=True):
            if word.strip() == '' or word in stopwords:
                continue
            wordsCount[word] = wordsCount.get(word, 0) + 1
    wordsList = [{'单词': k, '出现次数': v} for k, v in wordsCount.items()]
    result = DataFrame(wordsList).sort_values(by='出现次数', na_position='first')
    gen_word_cloud(wordsCount)
    top100Df = result[-20:]
    top100List = top100Df['单词'].tolist()  # computed but unused, kept for parity
    return simpDat
def baseinfo():
    """Assemble a per-enterprise overview (strikes, lawsuits, land deals,
    listing status) and merge in social-security headcount statistics.

    For each enterprise the monthly social-security headcounts are reduced to
    mean and coefficient of variation; the latest month is dropped as it is
    typically incomplete. Writes the merged result to an Excel file.
    """
    sql = """
    SELECT DISTINCT a.DWMC ,b.cjzs,b.zjzs,c.total_wqzs,d.tdjycs,d.zjtdjyrq,
    CASE WHEN e.stockcode IS NOT NULL THEN '已上市' ELSE '' END AS isListed,
    e.stockcode,e.stockname,e.companylistingdate,e.phone,e.employeenum,
    CASE WHEN f.qymc IS NOT NULL THEN '准备上市' ELSE '' END AS PREPARELIST,
    f.* FROM ENTERPRISE_INFO_ZDGZ a
    LEFT JOIN (SELECT qymc,sum(cjzs) cjzs,sum(zjzs) zjzs FROM ENT_DX_CZJXX GROUP BY qymc) b ON a.dwmc=b.qymc
    LEFT JOIN (SELECT qymc,sum(TOTAL_WQZS) TOTAL_WQZS FROM ENT_DX_WQXX GROUP BY qymc ) c ON a.dwmc=c.qymc
    LEFT JOIN (SELECT gsmc,count(*) TDJYCS,max(jyrq) ZJTDJYRQ FROM LAND_EXCHANGE GROUP BY gsmc)d ON a.dwmc=d.gsmc
    LEFT JOIN ent_listed_company e ON a.dwmc=e.COMPANYNAME
    LEFT JOIN ENT_IPO f ON a.dwmc=f.qymc
    WHERE b.cjzs IS NOT NULL OR b.zjzs IS NOT NULL OR c.total_wqzs IS NOT NULL
    OR d.tdjycs IS NOT NULL OR d.zjtdjyrq IS NOT NULL OR f.qymc IS NOT NULL
    """
    df = db_util.execute2Dataframe(sql)
    # Monthly social-security payer headcount per enterprise, ordered by month.
    s = """
    SELECT a.dwmc ,yjny,count(*) JNRS FROM ENTERPRISE_INFO_ZDGZ A
    INNER JOIN LGL_UNITSOCIAL_SECURITY b ON A .dwmc = b.DWMC
    INNER JOIN T_SJZX_SBMXXX_2017TO2018 c ON b.dwbm=c.sbdwbh
    GROUP BY a.dwmc,yjny ORDER BY yjny
    """
    qysb_df = db_util.execute2Dataframe(s)
    # Group the monthly headcounts by enterprise name, preserving month order.
    qysb_dict = {}
    for index, row in qysb_df.iterrows():
        dwmc = row['DWMC']
        dw_data = qysb_dict.get(dwmc)
        if dw_data is None:
            dw_data = []
            qysb_dict[dwmc] = dw_data
        dw_data.append(row['JNRS'])
    company_list = []
    for k, v in qysb_dict.items():
        v = v[:-1]  # drop the latest month - usually still incomplete
        if len(v) == 0:
            continue
        v_serise = pd.Series(v)
        std = v_serise.std()
        mean = v_serise.mean()
        trend = '上升'
        isdown = False
        if v[-1] - mean < 0:
            trend = '下降'
            isdown = True
        if v[-1] - mean == 0:
            trend = '不变'
        # BUG FIX: guard against a zero mean (all-zero headcounts), which
        # previously raised ZeroDivisionError.
        cov = std / mean if mean != 0 else 0
        company_data = {'DWMC': k, 'SBJNRS': round(mean),
                        'ZJFD': trend, 'cov': cov}
        company_list.append(company_data)
        if isdown:
            print(k, '数据:', v, '标准差:', std, '均值:', mean, '变异系数:', cov, '增减幅度:', trend)
    company_df = DataFrame(company_list)
    merge_df = df.merge(company_df, how='left', left_on='DWMC', right_on='DWMC')
    print(merge_df)
    merge_df.to_excel('D:\\python\\企业社保信息.xlsx')
# NOTE(review): removed a stray module-level `return simpDat` and its orphaned
# commented-out lines - residue duplicated from loadSimpDat's tail; a bare
# `return` outside a function is a SyntaxError.
if __name__ == "__main__":
    sql = """
    select GBHY from OPENDATA_SY_INFO GROUP BY gbhy
    """
    df = db_util.execute2Dataframe(sql)
    gbhylist = df["GBHY"].tolist()
    dataSet = loadSimpDat()
    # Mine frequent itemsets from the appeal-keyword transactions.
    L, suppData = AssociationRulesUtil.apriori(dataSet, minSupport=0.02)
    print('频繁项:', L)
    # Keep only multi-item sets that contain at least one industry code.
    filterItems = []
    for items in L:
        # BUG FIX: the loop variable was named `frozenset`, shadowing the
        # builtin type for the rest of the block.
        for itemset in items:
            for item in itemset:
                if len(itemset) > 1 and item in gbhylist:
                    filterItems.append(itemset)
                    break
    print('过滤项', filterItems)
    # main()