def extract_szdq(content):
    # Region extraction via cpca: returns [province, city, district] for the
    # input string. Fields cpca cannot determine come back as empty strings;
    # inputs with multiple matches may trigger a cpca warning.
    retList = []
    contentList = [content]
    szdq = cpca.transform(contentList)
    # If cpca treated the whole input as the raw address, or matched a
    # province but no city, retry with cut=False for a second attempt.
    if content == szdq['地址'].tolist()[0] or (len(content) > len(szdq['省'].tolist()[0]) and szdq['市'].tolist()[0] == ''):
        szdq = cpca.transform(contentList, cut=False)
    retList += szdq['省'].tolist()
    retList += szdq['市'].tolist()
    retList += szdq['区'].tolist()
    return retList
def test_transform():
    # Smoke-test cpca.transform over a mix of address shapes: district-only,
    # city+district, province+landmark, bare province, and free text.
    addr_list = ["徐汇区虹漕路461号58号楼5楼", "泉州市洛江区万安塘西工业区", "福建省鼓楼区鼓楼医院", "天津市", "我家的地址是江苏淮安清江浦区人民路111号", '我家的地址是江苏淮安清江浦区上海路111号', "上海市浦东新区东明路街道三林路15号", "贵州省黔南布依族苗族自治州长顺县长寨街道和平中路28号", "宁夏", "淮安市市辖区"]
    transed = cpca.transform(addr_list)
    assert_addr(transed)
    # Exercise the pos_sensitive flag as well.
    transed = cpca.transform(addr_list, pos_sensitive=True)
    assert_addr(transed, pos_sensitive=True)
def main():
    # Resolve each listed company's registered address into province/city via
    # cpca, then batch-write the results back to the database.
    duration = TimeDuration()
    duration.start()
    rows = get_corp_list()
    corp_count = len(rows)
    corp_index = 0
    sql_list = []
    for row in rows:
        # Row layout: (corp_id, stock_code, stock_name, register_address) --
        # assumed from the unpacking below; confirm against get_corp_list().
        corp_id = row[0]
        stock_code = row[1]
        stock_name = row[2]
        register_address = row[3]
        print('{} 准备处理第{}支股票,股票代码:{}, 股票名称:{},地址:{}'.format(
            datetime.datetime.now(), corp_index, stock_code, stock_name, register_address))
        df = cpca.transform([register_address])
        # Round-trip through JSON to get plain dicts keyed by column name.
        data = json.loads(df.to_json(orient="records",force_ascii=False))
        province = data[0]['省']
        city = data[0]['市']
        corp_index += 1
        remain_count = corp_count - corp_index
        print('{} 解析成功!省:{},市:{},当前剩余{}支股票,已耗时:{}'.format(datetime.datetime.now(), province, city, remain_count, duration.getTillNow()))
        sql_list.append(get_sql(corp_id, province, city))
    # print(sql_list)
    batch_execute(sql_list)
    duration.stop()
    duration.printDurationInfo()
def earthquake_event():
    """Scan non-noise Weibo posts for earthquake mentions, extract the
    location (province/city/area) with cpca and the event phrase with a
    regex, then bulk-insert the resulting `event` rows.

    Posts already present in the event table are excluded by the SQL query.
    """
    print("earthquake_event begin")
    cursor = connection.cursor()
    sql = f"select weibo_post.post_id,weibo_post.task_id,post_content,post_time from weibo_post,noise_judge where weibo_post.post_id=noise_judge.post_id and weibo_post.task_id=noise_judge.task_id and noise_judge.noise='0' and (noise_judge.task_id={'or noise_judge.task_id='.join(earthquake_id)}) and CONCAT(noise_judge.task_id,'_',noise_judge.post_id) not in (select CONCAT(task_id,'_',post_id) from event);"
    cursor.execute(sql)
    text = cursor.fetchall()
    events = []
    for content in text:
        try:
            post_text = content[2]
            # Initialise so an empty cut_sent() result cannot leave these
            # unbound (the original relied on the bare except to hide that).
            province = city = area = ''
            # Try sentence by sentence until cpca recognises a province.
            for sentence in cut_sent(post_text):
                values = cpca.transform([sentence], open_warning=False).values
                province = values[0][0]
                city = values[0][1]
                area = values[0][2]
                if len(province) != 0:
                    break
            e = re.search("发生.{0,6}?地震", content[2])
            if e is not None:
                e = e.group(0)
                # Skip posts with no recognisable location at all.
                if len(province) == 0 and len(city) == 0 and len(area) == 0:
                    continue
                t = event(task_id=content[1], post_id=content[0], province=province,
                          city=city, area=area, event=e, time=content[3])
                events.append(t)
        except Exception:
            # Best-effort per post: one malformed row must not abort the batch
            # (narrowed from the original bare `except:`).
            continue
    event.objects.bulk_create(events)
    cursor.close()
    print("earthquake_event end")
    return
def analysis_data():
    """Aggregate result.csv rows into per-city counts.

    Returns (city_counts, total): city_counts is a list of (city, count)
    pairs sorted by count descending; total is the number of non-empty rows.
    """
    with open('result.csv', 'r') as f:
        data = f.read().split('\n')
    citys = []
    total = 0
    for l in data:
        l = l.strip()
        if l == '':
            continue
        total += 1
        city = l.split(',')[2]
        df = cpca.transform([city])
        # DataFrame.get_values() was removed in pandas 1.0; .values is the
        # supported equivalent (column 1 is the city column).
        if df.values[0][1] == '':
            continue
        city = df.values[0][1].replace('市', '')
        print(city)
        no_citys = ['池州', '黔西南布依族苗族自治州']  # cities pyecharts does not include
        if city in no_citys:
            continue
        # city = city.replace('省','')
        flag = True
        for c in citys:
            if c['name'] == city:
                c['num'] += 1
                flag = False
                break
        if flag:
            citys.append({'name': city, 'num': 1})
    # Sort by frequency, most common first.
    citys = sorted(citys, key=lambda e: e["num"], reverse=True)
    print(citys)
    c = [(city['name'], city['num']) for city in citys]
    return (c, total)
def get_city_class(lines: List[str]) -> Dict:
    """Find the first people's-court name ("人民法院") in *lines* and resolve
    its city via cpca.

    Returns the province/city/district row as a dict, or an empty dict when
    no court name is found (the original raised UnboundLocalError there).
    """
    import cpca
    import jieba
    import jieba.posseg as pseg
    jieba.enable_paddle()
    court_name = None
    citylist = []
    city_class = None  # guard: stays None when no court name is found
    for line in lines:
        if line == '\n':
            continue
        if "人民法院" in line:
            line = re.sub(r'\s', '', line)
            line = re.split(r'[,:;。\s+]', line)
            for subline in line:
                if "人民法院" in subline:
                    subline = re.sub(r'[\n\s]', '', subline)
                    # paddle mode tags place/org spans (ns/nt/ORG).
                    seg_list = pseg.cut(subline, use_paddle=True)
                    for seg in seg_list:
                        if seg.flag == 'ns' or seg.flag == 'nt' or seg.flag == 'ORG':
                            court_name = re.sub(r'[,:;。]', '', seg.word)
                            break
                if court_name is not None:
                    break
        if court_name is not None:
            # Strip the suffix so cpca sees a bare place name.
            city = re.sub("人民法院", '', court_name)
            citylist.append(city)
            city_class = cpca.transform(citylist)
            break
    if city_class is None:
        return {}
    return city_class.iloc[0].to_dict()
def stu_map_analysis(strlist): print("开始清洗地址数据==============>>>>>>") # json数组字符串解构 address = strlist.get('data') # before_split = list[1:len(list)-1] # address = before_split.split(',') c = {"address": address} df = pd.DataFrame(c, columns=['address']) adlist = df['address'].tolist() # 地址分词 cut = cpca.transform(adlist) # 替换表中的空字符串为NaN cut.replace(to_replace=r'^\s*$', value=np.nan, regex=True, inplace=True) # 删除'省'有空值的行,在原表更改 cut.dropna(subset=['省'], inplace=True) provincelist = cut['省'].tolist() mplist = [] for i in range(len(provincelist)): s = provincelist[i] # 删除两端空白 strips = re.sub('D', '', s) # 删除后缀名 res = re.search('(?P<province>[^省|市|壮族自治区|自治区|维吾尔自治区|回族自治区|特别行政区]+)', strips) mplist.append(res.groupdict().get('province')) mplist = pd.DataFrame(mplist) counts = mplist[0].value_counts() data = [] # 组装响应数据 for i in range(len(counts)): data.append({"name": counts.index[i], "value": counts[i]}) print("学生地址信息清理完成============>>>>>>") return data
def getinfos(str):
    """Look up a company on qichacha.com by name and scrape its basic info.

    Returns a dict containing the original name, the matched name, the
    registration fields, and a derived '城市' (city) entry.
    Raises ValueError when the site has blocked this IP or nothing matches.

    Note: the parameter name shadows the builtin ``str``; kept unchanged for
    backward compatibility with existing callers.
    """
    querystr = "https://www.qichacha.com/search?key="
    info = {}  # renamed from `dict` to stop shadowing the builtin
    info['原公司名称'] = str
    r = requests.get(querystr + str, headers=headers)
    doc = pq(r.text)
    num = doc('#countOld > span').text()
    if r.status_code != 200 or not num:
        raise ValueError(u'查询过多,本日IP已被企查查禁用')
    if int(num.strip()) == 0:
        raise ValueError(u'没有找到相关公司名称')
    infos = doc('#search-result > tr:nth-child(1) > td:nth-child(3)')
    a = infos('a')
    # Append a trailing newline so the last field also matches the regex.
    infos_str = infos.text() + chr(10)
    info['查找到公司名称'] = a.text().split(' ')[0]
    for item in ['负责人', '法定代表人', '注册资本', '成立时间', '邮箱', '电话', '地址']:
        # Raw f-string avoids the invalid-escape warning for \s; reuse the
        # single search instead of running the regex twice per field.
        resp = re.search(rf'{item}:\s?(.+?)\s', infos_str)
        info[item] = resp.group(1) if resp else ''
    if "贵安" in info['地址']:
        info['城市'] = "贵安新区"
    else:
        df = cpca.transform([info['地址']])
        info['城市'] = df.loc[0, '市'] + df.loc[0, '区']
    return info
def parse_address(text):
    """Parse *text* into an address dict using cpca (no jieba cut).

    Returns {"province", "city", "district", "street", "other"}; the street
    and other fields are filled in by get_street/get_other.
    """
    frame = cpca.transform({text}, cut=False)
    addr_json = {
        "province": frame.iat[0, 0],
        "city": frame.iat[0, 1],
        "district": frame.iat[0, 2],
        "street": "",
        "other": "",
    }
    addr_json = get_street(addr_json, text)
    addr_json = get_other(addr_json, text)
    return addr_json
def city_distinguish(str_): ending = []#最后输出为三元列表,分别为出发地,目的地,时间,时间可能因输入多个时间而形式为列表,请识别类型后处理 c = [str_] df1 = cpca.transform(c,cut=False,open_warning=False) prov1 = df1.iloc[0,0] city1 = df1.iloc[0,1] city2 = df1.iloc[0,2] if city2 != "": city1 = city2 if prov1 == city1: n = prov1.find("省") str_ = str_.replace(prov1[:n],"") str_ = str_.replace(prov1[:2],"") elif prov1 != city1: n = prov1.find("省") m = city1.find("市") str_ = str_.replace(prov1[:n],"").replace(city1[:m],"") str_ = str_.replace(prov1[:2],"").replace(city1[:2],"") c1 = [str_] #print(c1) df2 = cpca.transform(c1,cut=False,open_warning=False) #print(df2) if df2.iloc[0,0] != '': if prov1 != city1: start_city = city1 elif prov1 == city1: start_city = prov1 if df2.iloc[0,0] != df2.iloc[0,1]: destination = df2.iloc[0,1] elif df2.iloc[0,0] == df2.iloc[0,1]: destination = df2.iloc[0,0] #print("出发地:",start_city,"目的地:",destination) ending.extend([start_city,destination]) elif df2.iloc[0,0] == '': start_city = "您好像没有输入出发地" if prov1 != city1 and city1 != "": destination = city1 elif prov1 == city1: destination = prov1 elif prov1 != "" and city1 == "": destination = prov1 #print(start_city,"目的地:",destination) ending.extend(["北京市",destination]) time = time_extract(str_) ending.extend(time) return ending
def get_company_info(self, company_url, item):
    """Fetch a company's business-registration info page and fill *item*.

    :param company_url: company detail-page URL to request
    :param item: dict-like container to populate in place
    :return: the populated item, or {} when the request fails
    """
    headers = {
        'User-Agent': UserAgent(verify_ssl=False).random,
        'cookie': self.cookie,
    }
    try:
        response_com = requests.get(company_url, headers=headers, verify=False)
        # Strip spaces and newlines so the regexes below can match the markup
        # without whitespace variance (hence patterns like `tdclass=`).
        company_response = response_com.text.replace(' ', '').replace('\n', '')
    except Exception as e:
        print(f'获取失败! {self.company_name} --> {company_url}--> {e}')
        return {}
    else:
        time.sleep(1)  # throttle between requests
        # NOTE(review): every re.sub below passes re.DOTALL as the *count*
        # positional argument, not flags -- confirm whether flags were meant.
        # company name
        item['company_name'] = self.merge_list(re.findall(r'<h1>(.*?)</h1>', company_response))
        # registered capital
        register_capital = ''.join(
            re.findall(r'<tdclass=["\']tb["\']>注册资本</td><td.*?>(.*?)</td>', company_response, re.DOTALL))
        item['register_capital'], item['register_currency'] = self.money(
            re.sub(r'[\n 注册资本::]', '', register_capital, re.DOTALL))
        # paid-in capital
        register_capital = ''.join(
            re.findall(r'<tdclass=["\']tb["\']>实缴资本</td><td.*?>(.*?)<', company_response, re.DOTALL))
        item['real_capital'], item['real_currency'] = self.money(
            re.sub(r'[\n 实缴资本::]', '', register_capital, re.DOTALL))
        # unified social credit code
        credit_code = ''.join(
            re.findall(r'<tdclass=["\']tb["\']>统一社会信用代码</td><tdclass=["\']["\']>(.*?)</td>', company_response, re.DOTALL))
        item['credit_code'] = re.sub(r'[\n 统一社会信用代码::-]', '', credit_code, re.DOTALL)
        # business registration number
        business_number = ''.join(
            re.findall(r'<tdclass=["\']tb["\']>工商注册号</td><tdclass=["\']["\']>(.*?)</td>', company_response, re.DOTALL))
        item['business_number'] = re.sub(r'[\n 工商注册号::-]', '', business_number, re.DOTALL)
        # registration authority
        reg_addr = ''.join(re.findall(
            r'<tdwidth=["\'][0-9]{0,}%["\']class=["\']tb["\']>登记机关</td><tdwidth=["\'][0-9]{0,}%["\']class=["\']["\']>(.*?)</td>',
            company_response, re.DOTALL))
        item['reg_addr'] = re.sub(r'[\n 登记机关::-]', '', reg_addr, re.DOTALL)
        # industry
        industry = ''.join(
            re.findall(r'<tdclass=["\']tb["\']>所属行业</td><tdclass=["\']["\']>(.*?)</td>', company_response, re.DOTALL)).strip()
        item['industry'] = re.sub(r'[\n 所属行业::-]', '', industry, re.DOTALL)
        # enterprise type
        business_type = ''.join(re.findall(
            r'<tdwidth=["\'][0-9]{0,}%["\']class=["\']tb["\']>企业类型</td><tdwidth=["\'][0-9]{0,}%["\']class=["\']["\']>(.*?)</td>',
            company_response, re.DOTALL)).strip()
        item['business_type'] = re.sub(r'[\n 企业类型::-]', '', business_type, re.DOTALL)
        # Split the address into province/city/district; item['address'] is
        # assumed to be set by an earlier step -- confirm against the caller.
        item['province'], item['city'], item['area'] = cpca.transform([item['address']]).loc[0, ['省', '市', '区']]
        return item
def insertToMongoDB(set1):
    # Read the student-info CSV (gbk-encoded) and insert one document per row
    # into the given MongoDB collection.
    with open('D:\\bysj-workspace\\education_data\\2_student_info.csv', 'r', encoding='gbk') as csvfile:
        # DictReader yields each row as a dict keyed by the header line.
        reader = csv.DictReader(csvfile)
        # counts tracks how many rows were inserted, to verify no duplicates.
        counts = 0
        demo = {}
        place = ['demo']
        for each in reader:
            # Convert the fields that need a type change.
            demo['stu_id'] = int(each['bf_StudentID'])
            demo['stu_name'] = each['bf_Name']
            demo['stu_sex'] = each['bf_sex']
            demo['stu_nation'] = each['bf_nation']
            demo['stu_borndate'] = int(each['bf_BornDate'])
            demo['cla_name'] = each['cla_Name']
            # Was a home address registered at all?
            if each['bf_NativePlace'].strip() == '':
                demo['stu_nativeplace'] = "未登记"
            else:
                place[0] = each['bf_NativePlace']
                df = cpca.transform(place)
                # Convert the pandas result to a plain nested list.
                data_array = np.array(df)
                data_list = data_array.tolist()
                # Inside Zhejiang keep province+city; outside, province only.
                if data_list[0][0] == '浙江省':
                    demo['stu_nativeplace'] = data_list[0][0] + data_list[0][1]
                else:
                    demo['stu_nativeplace'] = data_list[0][0]
            demo['stu_residencetype'] = each['Bf_ResidenceType']
            demo['stu_policy'] = each['bf_policy']
            demo['cla_id'] = int(each['cla_id'])
            demo['cla_term'] = each['cla_term']
            # Is the student currently on leave from school?
            if each['bf_leaveSchool'].strip() == '':
                demo['stu_leaveschool'] = "否"
                if each['bf_zhusu'].strip() == '':
                    demo['stu_zhusu'] = "不住宿"
                    demo['stu_qinshihao'] = 0
                else:
                    demo['stu_zhusu'] = "住宿"
                    demo['stu_qinshihao'] = int(each['bf_qinshihao'])
            else:
                demo['stu_leaveschool'] = "是"
                demo['stu_zhusu'] = "不住宿"
                demo['stu_qinshihao'] = 0
            # Reusing one dict would make MongoDB treat every insert as the
            # same document (same generated _id), so set _id explicitly;
            # MongoDB skips auto-generation when _id is provided.
            demo['_id'] = int(each['bf_StudentID'])
            print(demo)
            set1.insert_one(demo)
            counts += 1
            print('成功添加了' + str(counts) + '条数据 ')
def readfile():
    """Read douban.txt (one JSON record per line), tally poster provinces
    into the module-level addr_dic, collect cut words into text_list, and
    download each record's images to ./image/.
    """
    # `with` replaces the original try/finally close.
    with open('douban.txt', 'r') as file_object:
        for line in file_object:
            item = json.loads(line)
            if item is None:
                continue
            author = item['author']
            text = item['text']
            images = item['images']
            record_id = item['id']
            addr_transform = cpca.transform([text])
            addr = None
            if addr_transform['省'].str.split(' ')[0] is not None:
                addr = addr_transform['省'].str.split(' ')[0][0].rstrip('省')
            if addr is None and author['loc'] is not None:
                # BUG FIX: the original discarded this transform's result and
                # re-checked the previous DataFrame; assign it before reading.
                addr_transform = cpca.transform([author['loc']['name']])
                if addr_transform['省'].str.split(' ')[0] is not None:
                    addr = addr_transform['省'].str.split(' ')[0][0].rstrip('省')
            if addr is not None:
                # Normalise autonomous/special regions to their short names.
                if addr == '广西壮族自治区':
                    addr = '广西'
                if addr == '香港特别行政区':
                    addr = '香港'
                if addr == '澳门特别行政区':
                    addr = '澳门'
                addr_dic[addr] = addr_dic.get(addr, 0) + 1
            seg_list = jieba.cut(text, cut_all=False)
            text_list.extend(seg_list)
            # Download each attached image as <id>-<index>.jpg (1-based).
            for index, i in enumerate(images, start=1):
                url = i.get('large').get('url')
                r = requests.get(url)
                with open('./image/{}-{}.jpg'.format(record_id, index), 'wb') as f:
                    f.write(r.content)
async def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
    # Rasa custom action: parse the city out of the latest user message with
    # cpca and reply with the weather for that city.
    user_in = tracker.latest_message.get("text")
    province, city = cpca.transform([user_in]).loc[0, ["省", "市"]]
    # Municipality-style inputs yield city == "市辖区" (or None); fall back
    # to the province name in that case.
    city = province if city in ["市辖区", None] else city
    text = await self.get_weather(await self.get_location_id(city))
    dispatcher.utter_message(text=text)
    return []
def match_name_tel(s: str):
    """Split ``"<name>,<text with an 11-digit tel>"`` into its parts.

    Returns (name, tel, frame) where frame is the cpca DataFrame parsed from
    the remainder of the string with the phone number removed.
    """
    parts = s.rstrip(".").split(",")
    name = parts[0]
    tail = parts[1]
    tel = re.search('\d{11}', tail).group(0)
    pieces = re.split('\d{11}', tail)
    rest_addr = pieces[0] + pieces[1]
    frame = cpca.transform([rest_addr])
    return (name, tel, frame)
def is_loc(loc):
    """Return True if cpca recognises any province/city/district in *loc*.

    open_warning=False suppresses cpca's multiple-match warning, matching the
    other is_loc definition in this file.
    """
    d = cpca.transform([loc], open_warning=False)
    # A non-empty value in any of the three columns means a location matched.
    return any(str(d[col][0]) for col in ('省', '市', '区'))
def is_loc(loc):
    """True when cpca finds a province, city, or district in *loc*."""
    parsed = cpca.transform([loc], open_warning=False)
    for column in ('省', '市', '区'):
        if str(parsed[column][0]):
            return True
    return False
def process_item(self, item, spider):
    """Normalise a HuocheDealerItem: derive missing province/city from its
    address via cpca and flatten the tags list to a comma-separated string.
    Other item types pass through untouched."""
    if isinstance(item, HuocheDealerItem):
        address = item.get('address')
        if address and (not item.get('province') or not item['city']):
            frame = transform([address])
            item['province'] = frame['省'].values[0]
            item['city'] = frame['市'].values[0]
        tags = item.get('tags')
        if tags and isinstance(tags, list):
            item['tags'] = ','.join(tags)
    return item
def judge_cpca(senta, sentb):
    """Return True when both sentences parse to identical province/city/
    district values (the trailing address column is ignored)."""
    parsed = cpca.transform([senta, sentb], open_warning=False, cut=False, lookahead=3)
    left = parsed.loc[0][:-1]
    right = parsed.loc[1][:-1]
    return all(a == b for a, b in zip(left, right))
def judge_source(_location):
    # Parse the locations into province/city/district and map each value
    # through MapAreas; rows where all three columns are unmapped get the
    # province column set to the sentinel '000000'.
    result = cpca.transform(_location, cut=False)
    result = result[['省', '市', '区']]
    maps = MapAreas()
    # Presumably trans_map converts a place name to an admin code -- confirm
    # against MapAreas.trans_map.
    for var in result.columns:
        result[var] = result[var].apply(lambda x: maps.trans_map(x))
    if result.loc[(result['省'] == '') & (result['市'] == '') & (result['区'] == '')].shape[0] > 0:
        result.loc[(result['省'] == '') & (result['市'] == '') & (result['区'] == ''), ['省']] = '000000'
    return result
def get_area_out_txt():
    """Parse every title in titles.txt with cpca and write a tab-separated
    file (ID/province/city/district/address) to area_out.txt.

    Fix: the original never closed either file handle; ``with`` guarantees
    both are closed even on error.
    """
    with open('../data/deal/titles.txt', mode='r', encoding='utf-8') as file:
        # One title per line; splitlines()[0] strips the trailing newline.
        titles = [str(title).splitlines()[0] for title in file.readlines()]
    df = cpca.transform(titles, cut=False, lookahead=3)
    with open('../data/deal/area_out.txt', mode='w+', encoding='utf-8') as area_out:
        area_out.writelines('ID' + '\t' + '省' + '\t' + '市' + '\t' + '区' + '\t' + '地址' + '\n')
        for index, rows in df.iterrows():
            row = str(index) + '\t' + rows['省'] + '\t' + rows['市'] + '\t' + rows['区'] + '\t' + rows['地址'] + '\n'
            area_out.writelines(row)
    print('地点数据获取完成')
def place(newPlace):
    """Extract (province, country) from *newPlace* via cpca.

    Returns ('', '') when cpca finds no province/city/district at all;
    otherwise (province, '中国').
    """
    parsed = cpca.transform([newPlace], cut=False)
    values = parsed.values
    # Renamed from `place` -- the original local shadowed the function name.
    matched = values[0][0] + values[0][1] + values[0][2]
    if matched != '':
        province = values[0][0]
        country = '中国'
    else:
        province = ''
        country = ''
    return province, country