Example No. 1
def extract_szdq(content):
    # Region detection calls cpca and keeps the resulting province, city and district.
    # Fields cpca cannot determine are left empty; multiple matches may trigger a warning.
    retList = []
    contentList = [content]
    szdq = cpca.transform(contentList)
    if content == szdq['地址'].tolist()[0] or (len(content) > len(szdq['省'].tolist()[0]) and szdq['市'].tolist()[0] == ''):
        szdq = cpca.transform(contentList, cut=False)
    retList += szdq['省'].tolist()
    retList += szdq['市'].tolist()
    retList += szdq['区'].tolist()
    return retList
def test_transform():
    addr_list = ["徐汇区虹漕路461号58号楼5楼", "泉州市洛江区万安塘西工业区", "福建省鼓楼区鼓楼医院",
                 "天津市",
                 "我家的地址是江苏淮安清江浦区人民路111号",
                 '我家的地址是江苏淮安清江浦区上海路111号',
                 "上海市浦东新区东明路街道三林路15号",
                 "贵州省黔南布依族苗族自治州长顺县长寨街道和平中路28号",
                 "宁夏",
                 "淮安市市辖区"]
    transed = cpca.transform(addr_list)
    assert_addr(transed)

    # Test pos_sensitive
    transed = cpca.transform(addr_list, pos_sensitive=True)
    assert_addr(transed, pos_sensitive=True)
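
Every snippet in this collection ultimately calls cpca.transform, so a minimal orientation sketch may help; it assumes the cpca 0.4.x API (the release whose cut / pos_sensitive / open_warning keyword arguments appear throughout these examples):

import cpca

# transform() takes a list of address strings and returns a pandas DataFrame with the
# columns 省 / 市 / 区 / 地址; fields that cannot be matched are left as empty strings.
df = cpca.transform(["徐汇区虹漕路461号58号楼5楼"])
print(df.loc[0, '省'], df.loc[0, '市'], df.loc[0, '区'])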
Example No. 3
def main():
    duration = TimeDuration()
    duration.start()
    rows = get_corp_list()
    corp_count = len(rows)
    corp_index = 0
    sql_list = []
    for row in rows:
        corp_id = row[0]
        stock_code = row[1]
        stock_name = row[2]
        register_address = row[3]

        print('{} 准备处理第{}支股票,股票代码:{}, 股票名称:{},地址:{}'.format(
            datetime.datetime.now(), corp_index, stock_code, stock_name, register_address))

        df = cpca.transform([register_address])
        data = json.loads(df.to_json(orient="records", force_ascii=False))
        province = data[0]['省']
        city = data[0]['市']
        corp_index += 1
        remain_count = corp_count - corp_index
        print('{} 解析成功!省:{},市:{},当前剩余{}支股票,已耗时:{}'.format(datetime.datetime.now(), province, city, remain_count, duration.getTillNow()))
        sql_list.append(get_sql(corp_id, province, city))

    # print(sql_list)
    batch_execute(sql_list)

    duration.stop()
    duration.printDurationInfo()
Example No. 4
def earthquake_event():
    print("earthquake_event begin")
    cursor = connection.cursor()
    sql = f"select weibo_post.post_id,weibo_post.task_id,post_content,post_time from weibo_post,noise_judge where weibo_post.post_id=noise_judge.post_id and weibo_post.task_id=noise_judge.task_id and noise_judge.noise='0' and (noise_judge.task_id={'or noise_judge.task_id='.join(earthquake_id)}) and CONCAT(noise_judge.task_id,'_',noise_judge.post_id) not in (select CONCAT(task_id,'_',post_id) from event);"
    cursor.execute(sql)
    text = cursor.fetchall()
    events = []
    for content in text:
        try:
            post_text = content[2]
            for i in cut_sent(post_text):
                values = cpca.transform([i], open_warning=False).values
                province = values[0][0]
                city = values[0][1]
                area = values[0][2]
                if len(province) != 0:
                    break
            e = re.search("发生.{0,6}?地震", post_text)
            # print(post_text, province, city, area)
            if e is not None:
                e = e.group(0)
            if len(province) == 0 and len(city) == 0 and len(area) == 0:
                continue
            t = event(task_id=content[1], post_id=content[0], province=province, city=city,
                      area=area, event=e, time=content[3])
            events.append(t)
        except Exception:
            continue
    event.objects.bulk_create(events)
    cursor.close()
    print("earthquake_event end")
    return
Example No. 5
def analysis_data():
    with open('result.csv', 'r') as f:
        data = f.read().split('\n')
    citys = []
    total = 0
    for l in data:
        l = l.strip()
        if l == '':
            continue
        total += 1
        city = l.split(',')[2]
        df = cpca.transform([city])
        if df.values[0][1] == '':
            continue
        city = df.values[0][1].replace('市', '')
        print(city)
        no_citys = ['池州', '黔西南布依族苗族自治州']  # drop cities that pyecharts does not include
        if city in no_citys: continue
        #city = city.replace('省','')
        flag = True
        for c in citys:
            if c['name'] == city:
                c['num'] += 1
                flag = False
                break
        if flag:
            citys.append({'name': city, 'num': 1})

    # Sort by count
    citys = sorted(citys, key=lambda e: e["num"], reverse=True)

    print(citys)
    c = [(city['name'], city['num']) for city in citys]
    return (c, total)
Example No. 6
def get_city_class(lines: List[str]) -> Dict:
    import cpca
    import jieba
    import jieba.posseg as pseg
    jieba.enable_paddle()
    court_name = None
    citylist = []
    for line in lines:  # extract the people's court name
        if line == '\n':
            continue
        if "人民法院" in line:
            line = re.sub(r'\s', '', line)
            line = re.split(r'[,:;。\s+]', line)
            for subline in line:
                if "人民法院" in subline:
                    subline = re.sub(r'[\n\s]', '', subline)
                    seg_list = pseg.cut(subline, use_paddle=True)
                    for seg in seg_list:
                        if seg.flag == 'ns' or seg.flag == 'nt' or seg.flag == 'ORG':
                            court_name = re.sub(r'[,:;。]', '', seg.word)
                            break
                if court_name is not None:
                    break
        if court_name is not None:  # classify the city level
            city = re.sub("人民法院", '', court_name)
            citylist.append(city)
            city_class = cpca.transform(citylist)
            break
    return city_class.iloc[0].to_dict()
Example No. 7
def stu_map_analysis(strlist):
    print("开始清洗地址数据==============>>>>>>")
    # Unpack the JSON array payload
    address = strlist.get('data')
    # before_split = list[1:len(list)-1]
    # address = before_split.split(',')
    c = {"address": address}
    df = pd.DataFrame(c, columns=['address'])
    adlist = df['address'].tolist()
    # Parse the addresses
    cut = cpca.transform(adlist)
    # Replace empty strings in the table with NaN
    cut.replace(to_replace=r'^\s*$', value=np.nan, regex=True, inplace=True)
    # Drop rows whose '省' is NaN, modifying the table in place
    cut.dropna(subset=['省'], inplace=True)
    provincelist = cut['省'].tolist()
    mplist = []
    for s in provincelist:
        # Strip surrounding whitespace
        strips = s.strip()
        # Drop the administrative suffix (省/市/自治区/特别行政区) to keep the short province name
        res = re.sub('(维吾尔自治区|回族自治区|壮族自治区|特别行政区|自治区|省|市)$', '', strips)
        mplist.append(res)
    mplist = pd.DataFrame(mplist)
    counts = mplist[0].value_counts()
    data = []
    # Assemble the response payload
    for name, value in counts.items():
        data.append({"name": name, "value": int(value)})
    print("学生地址信息清理完成============>>>>>>")
    return data
Example No. 8
def getinfos(str):
    querystr = "https://www.qichacha.com/search?key="
    dict = {}
    dict['原公司名称'] = str
    r = requests.get(querystr + str, headers=headers)
    doc = pq(r.text)
    num = doc('#countOld > span').text()
    if r.status_code != 200 or not num: raise ValueError(u'查询过多,本日IP已被企查查禁用')
    if int(num.strip()) == 0: raise ValueError(u'没有找到相关公司名称')
    infos = doc('#search-result > tr:nth-child(1) > td:nth-child(3)')
    a = infos('a')
    infos_str = infos.text() + chr(10)  # append a newline after the last line so the regex below can match it

    dict['查找到公司名称'] = a.text().split(' ')[0]
    for item in ['负责人', '法定代表人', '注册资本', '成立时间', '邮箱', '电话', '地址']:
        resp = re.search(rf'{item}:\s?(.+?)\s', infos_str)
        dict[item] = resp.group(1) if resp else ''

    if "贵安" in dict['地址']:
        dict['城市'] = "贵安新区"
    else:
        df = cpca.transform([dict['地址']])
        dict['城市'] = df.loc[0, '市'] + df.loc[0, '区']
    return dict
Example No. 9
def parse_address(text):
    text_arr = [text]
    df = cpca.transform(text_arr, cut=False)

    province = df.iat[0, 0]
    city = df.iat[0, 1]
    district = df.iat[0, 2]
    other = df.iat[0, 3]

    #print(other)

    # pattern = re.compile(district + '(?P<street>.*?(街道|道|站|局|处|街|大道|小区|场|坊|路|团|委员会|区|县|州)){0,1}(\w*){0,1}')
    # result = pattern.findall(text)
    # print(result)

    addr_json = {
        "province": province,
        "city": city,
        "district": district,
        "street": "",
        "other": ""
    }

    addr_json = get_street(addr_json, text)
    addr_json = get_other(addr_json, text)
    # print(addr_json)
    return addr_json
Example No. 10
def city_distinguish(str_):
    # Returns a three-element list: [departure, destination, time]; the time entry may itself be a
    # list when several times were entered, so check its type before further processing.
    ending = []
    c = [str_]
    df1 = cpca.transform(c,cut=False,open_warning=False)
    prov1 = df1.iloc[0,0]
    city1 = df1.iloc[0,1]
    city2 = df1.iloc[0,2]
    if city2 != "":
        city1 = city2
    if prov1 == city1:
        n = prov1.find("省")
        str_ = str_.replace(prov1[:n],"")
        str_ = str_.replace(prov1[:2],"")
    elif prov1 != city1:
        n = prov1.find("省")
        m = city1.find("市")
        str_ = str_.replace(prov1[:n],"").replace(city1[:m],"")
        str_ = str_.replace(prov1[:2],"").replace(city1[:2],"")
    c1 = [str_]
    #print(c1)
    df2 = cpca.transform(c1,cut=False,open_warning=False)
    #print(df2)
    if df2.iloc[0,0] != '':
        if prov1 != city1:
            start_city = city1
        elif prov1 == city1:
            start_city = prov1
        if df2.iloc[0,0] != df2.iloc[0,1]:
            destination = df2.iloc[0,1]
        elif df2.iloc[0,0] == df2.iloc[0,1]:
            destination = df2.iloc[0,0]
        #print("出发地:",start_city,"目的地:",destination)
        ending.extend([start_city,destination])
    elif df2.iloc[0,0] == '':
        start_city = "您好像没有输入出发地"
        if prov1 != city1 and city1 != "":
            destination = city1
        elif prov1 == city1:
            destination = prov1
        elif prov1 != "" and city1 == "":
            destination = prov1
        #print(start_city,"目的地:",destination)
        ending.extend(["北京市",destination])
    time = time_extract(str_)
    ending.extend(time)
    return ending
Example No. 11
 def get_company_info(self, company_url, item):
     """
     获取工商信息和变更信息
     @param item:
     @param company_url: 公司请求url
     @return: 返回获取到的数据
     """
     headers = {
         'User-Agent': UserAgent(verify_ssl=False).random,
         'cookie': self.cookie,
     }
     try:
         response_com = requests.get(company_url, headers=headers, verify=False)
         company_response = response_com.text.replace(' ', '').replace('\n', '')
     except Exception as e:
         print(f'获取失败! {self.company_name} --> {company_url}--> {e}')
         return {}
     else:
         time.sleep(1)
         # Company name
         item['company_name'] = self.merge_list(re.findall(r'<h1>(.*?)</h1>', company_response))
         # Registered capital
         register_capital = ''.join(
             re.findall(r'<tdclass=["\']tb["\']>注册资本</td><td.*?>(.*?)</td>', company_response, re.DOTALL))
         item['register_capital'], item['register_currency'] = self.money(
             re.sub(r'[\n 注册资本::]', '', register_capital))
         # Paid-in capital
         register_capital = ''.join(
             re.findall(r'<tdclass=["\']tb["\']>实缴资本</td><td.*?>(.*?)<', company_response, re.DOTALL))
         item['real_capital'], item['real_currency'] = self.money(
             re.sub(r'[\n 实缴资本::]', '', register_capital))
         # Unified social credit code
         credit_code = ''.join(
             re.findall(r'<tdclass=["\']tb["\']>统一社会信用代码</td><tdclass=["\']["\']>(.*?)</td>', company_response,
                        re.DOTALL))
         item['credit_code'] = re.sub(r'[\n 统一社会信用代码::-]', '', credit_code)
         # Business registration number
         business_number = ''.join(
             re.findall(r'<tdclass=["\']tb["\']>工商注册号</td><tdclass=["\']["\']>(.*?)</td>', company_response,
                        re.DOTALL))
         item['business_number'] = re.sub(r'[\n 工商注册号::-]', '', business_number)
         # Registration authority
         reg_addr = ''.join(re.findall(
             r'<tdwidth=["\'][0-9]{0,}%["\']class=["\']tb["\']>登记机关</td><tdwidth=["\'][0-9]{0,}%["\']class=["\']["\']>(.*?)</td>',
             company_response, re.DOTALL))
         item['reg_addr'] = re.sub(r'[\n 登记机关::-]', '', reg_addr)
         # Industry
         industry = ''.join(
             re.findall(r'<tdclass=["\']tb["\']>所属行业</td><tdclass=["\']["\']>(.*?)</td>', company_response,
                        re.DOTALL)).strip()
         item['industry'] = re.sub(r'[\n 所属行业::-]', '', industry)
         # Enterprise type
         business_type = ''.join(re.findall(
             r'<tdwidth=["\'][0-9]{0,}%["\']class=["\']tb["\']>企业类型</td><tdwidth=["\'][0-9]{0,}%["\']class=["\']["\']>(.*?)</td>',
             company_response, re.DOTALL)).strip()
         item['business_type'] = re.sub(r'[\n 企业类型::-]', '', business_type)
         item['province'], item['city'], item['area'] = cpca.transform([item['address']]).loc[0, ['省', '市', '区']]
         return item
Example No. 12
def insertToMongoDB(set1):
    # Open the student info CSV file
    with open('D:\\bysj-workspace\\education_data\\2_student_info.csv',
              'r',
              encoding='gbk') as csvfile:
        # Use csv.DictReader to read each row directly as a dict
        reader = csv.DictReader(csvfile)
        # Keep a counter to check how many records were added and verify nothing was inserted twice
        counts = 0
        demo = {}
        place = ['demo']
        for each in reader:
            # Convert the fields that need a type conversion.
            demo['stu_id'] = int(each['bf_StudentID'])
            demo['stu_name'] = each['bf_Name']
            demo['stu_sex'] = each['bf_sex']
            demo['stu_nation'] = each['bf_nation']
            demo['stu_borndate'] = int(each['bf_BornDate'])
            demo['cla_name'] = each['cla_Name']
            # Check whether a home address was recorded
            if each['bf_NativePlace'].strip() == '':
                demo['stu_nativeplace'] = "未登记"
            else:
                place[0] = each['bf_NativePlace']
                df = cpca.transform(place)
                # Convert the pandas result to a NumPy array first
                data_array = np.array(df)
                # and then to a plain Python list
                data_list = data_array.tolist()
                # Keep city-level detail inside Zhejiang; outside the province keep only the province name
                if data_list[0][0] == '浙江省':
                    demo['stu_nativeplace'] = data_list[0][0] + data_list[0][1]
                else:
                    demo['stu_nativeplace'] = data_list[0][0]
            demo['stu_residencetype'] = each['Bf_ResidenceType']
            demo['stu_policy'] = each['bf_policy']
            demo['cla_id'] = int(each['cla_id'])
            demo['cla_term'] = each['cla_term']
            # Check whether the student is on leave from school
            if each['bf_leaveSchool'].strip() == '':
                demo['stu_leaveschool'] = "否"
                if each['bf_zhusu'].strip() == '':
                    demo['stu_zhusu'] = "不住宿"
                    demo['stu_qinshihao'] = 0
                else:
                    demo['stu_zhusu'] = "住宿"
                    demo['stu_qinshihao'] = int(each['bf_qinshihao'])
            else:
                demo['stu_leaveschool'] = "是"
                demo['stu_zhusu'] = "不住宿"
                demo['stu_qinshihao'] = 0
            # Reusing the same dict for every row would make MongoDB treat each insert as the same document and reuse one _id.
            # Set _id manually: when the inserted document already carries an _id field, MongoDB does not generate one.
            demo['_id'] = int(each['bf_StudentID'])
            print(demo)
            set1.insert_one(demo)
            counts += 1
            print('成功添加了' + str(counts) + '条数据 ')
Example No. 13
def readfile():
    file_object = open('douban.txt', 'r')
    try:
        for line in file_object:
            item = json.loads(line)
            if item is None:
                continue
            author = item['author']
            text = item['text']
            images = item['images']
            id = item['id']

            addr_transform = cpca.transform([text])
            addr = None

            if addr_transform.loc[0, '省']:
                addr = addr_transform.loc[0, '省'].rstrip('省')

            # Fall back to the author's profile location when the post text yields no province
            if addr is None and author['loc'] is not None:
                addr_transform = cpca.transform([author['loc']['name']])
                if addr_transform.loc[0, '省']:
                    addr = addr_transform.loc[0, '省'].rstrip('省')

            if addr is not None:
                if addr == '广西壮族自治区':
                    addr = '广西'
                if addr == '香港特别行政区':
                    addr = '香港'
                if addr == '澳门特别行政区':
                    addr = '澳门'
                addr_dic[addr] = addr_dic.get(addr, 0) + 1

            seg_list = jieba.cut(text, cut_all=False)
            text_list.extend(seg_list)

            index = 0
            for i in images:
                index = index + 1
                url = i.get('large').get('url')
                r = requests.get(url)
                with open('./image/{}-{}.jpg'.format(id, index), 'wb') as f:
                    f.write(r.content)

    finally:
        file_object.close()
Example No. 14
 async def run(self, dispatcher: CollectingDispatcher,
               tracker: Tracker,
               domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
     user_in = tracker.latest_message.get("text")
     province, city = cpca.transform([user_in]).loc[0, ["省", "市"]]
     city = province if city in ["市辖区", None] else city
     text = await self.get_weather(await self.get_location_id(city))
     dispatcher.utter_message(text=text)
     return []
Example No. 15
def match_name_tel(s: str):  # get name and phone number
    s = s.rstrip(".")
    match = s.split(",")
    name = match[0]  # get name
    tel = re.search(r'\d{11}', match[1]).group(0)  # get phone number
    mat = re.split(r'\d{11}', match[1])
    rest_addr = mat[0] + mat[1]  # join the two remaining strings
    DataFrame = cpca.transform([rest_addr])  # parse the remaining address text
    return (name, tel, DataFrame)
def is_loc(loc):
    d = cpca.transform([loc])
    if str(d['省'][0]):
        return True
    if str(d['市'][0]):
        return True
    if str(d['区'][0]):
        return True
    return False
def is_loc(loc):
    d = cpca.transform([loc], open_warning=False)
    if str(d['省'][0]):
        return True
    if str(d['市'][0]):
        return True
    if str(d['区'][0]):
        return True
    return False
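
The two is_loc variants above lean on cpca returning empty strings for unmatched fields (str('') is falsy); a cpca build that returned None instead would make str(None) truthy and break the check. A couple of illustrative calls, my own sketch rather than part of the original project:

print(is_loc("上海市浦东新区张江路"))  # expected True: the 省 column is non-empty
print(is_loc("今天天气不错"))          # expected False: no province/city/district is recognised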
Example No. 18
 def process_item(self, item, spider):
     if isinstance(item, HuocheDealerItem):
         if item.get('address') and (not item.get('province')
                                     or not item['city']):
             dataframe = transform([item['address']])
             item['province'] = dataframe['省'].values[0]
             item['city'] = dataframe['市'].values[0]
         if item.get('tags') and isinstance(item['tags'], list):
             item['tags'] = ','.join(item['tags'])
     return item
Example No. 19
def judge_cpca(senta, sentb):
    '''
    Check whether the province/city/district information of the two sentences matches
    '''
    # logger.info('judge_cpca...')
    df = cpca.transform([senta, sentb],open_warning=False,cut=False, lookahead=3)
    for i, j in zip(df.loc[0][:-1], df.loc[1][:-1]):
        if i != j:
            return False
    return True
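
A short usage sketch for judge_cpca (illustrative only, assuming a cpca 0.4.x build where transform accepts cut and lookahead and infers the province from a recognised city):

# Both addresses should resolve to 浙江省 / 杭州市 / 西湖区, so True is expected;
# addresses in different regions should disagree in at least one column and yield False.
print(judge_cpca("浙江省杭州市西湖区文一西路100号", "杭州市西湖区文二路1号"))  # expected True
print(judge_cpca("北京市朝阳区建国路", "上海市浦东新区世纪大道"))              # expected False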
Example No. 20
def judge_source(_location):
    result = cpca.transform(_location, cut=False)
    result = result[['省', '市', '区']]
    maps = MapAreas()
    for var in result.columns:
        result[var] = result[var].apply(lambda x: maps.trans_map(x))
    if result.loc[(result['省'] == '') & (result['市'] == '') &
                  (result['区'] == '')].shape[0] > 0:
        result.loc[(result['省'] == '') & (result['市'] == '') &
                   (result['区'] == ''), ['省']] = '000000'
    return result
Example No. 21
def get_area_out_txt():
    file = open('../data/deal/titles.txt', mode='r', encoding='utf-8')
    area_out = open('../data/deal/area_out.txt', mode='w+', encoding='utf-8')
    titles = []
    for title in file.readlines():
        titles.append(str(title).splitlines()[0])
    df = cpca.transform(titles, cut=False, lookahead=3)
    area_out.writelines('ID' + '\t' + '省' + '\t' + '市' + '\t' + '区' + '\t' + '地址' + '\n')
    for index, rows in df.iterrows():
        row = str(index) + '\t' + rows['省'] + '\t' + rows['市'] + '\t' + rows['区'] + '\t' + rows['地址'] + '\n'
        area_out.writelines(row)
    area_out.close()
    file.close()
    print('地点数据获取完成')
Example No. 22
def place(newPlace):
    originalText = [newPlace]
    placeKeyVlue = cpca.transform(originalText, cut=False)
    placeVlue = placeKeyVlue.values
    place = placeVlue[0][0] + placeVlue[0][1] + placeVlue[0][2]
    if place != '':
        province = placeVlue[0][0]
        country = '中国'
    else:
        province = ''
        country = ''
    return province, country