def get_train_station() -> [list, list]:
    """
    Collect railway-station entities and their "位于" (located in) relation to a city.

    The index page at ``URL`` is scanned for ``citylist`` elements; every link
    inside them is treated as one city, and that city's page is fetched to read
    the station links inside its ``onecity`` element.

    Cities whose page cannot be fetched, whose link has no ``href``, or whose
    page has no ``onecity`` element are skipped instead of aborting the crawl.

    :return: entity_info: list of station entity dicts;
             entity_rel: list of [station index, relation dict, city name] triples
    """
    page_content = GetHttp().get_page_content(URL, 3)
    entity_info = []
    entity_rel = []
    if not page_content:
        return entity_info, entity_rel

    # Link texts that denote autonomous prefectures: appending '市' would be
    # wrong for them, so they are mapped to their full official names.
    # Order matters only in that the first matching keyword wins.
    prefecture_names = (
        ('黔西南', '黔西南布依族苗族自治州'),
        ('黔东南', '黔东南苗族侗族自治州'),
        ('黔南', '黔南布依族苗族自治州'),
        ('西双版纳', '西双版纳傣族自治州'),
    )
    # City hrefs are appended to the index URL with '/station.html' removed.
    base_url = URL.replace('/station.html', '')

    soup = BeautifulSoup(page_content, 'html.parser')
    index = 0
    for city_list in soup.find_all(attrs={'class': 'citylist'}):
        for city_a in city_list.find_all('a'):
            city = city_a.text + '市'
            for keyword, full_name in prefecture_names:
                if keyword in city:
                    city = full_name
                    break

            href = city_a.get('href')
            if not href:
                continue
            city_page = GetHttp().get_page_content(base_url + href, 3)
            if not city_page:
                continue
            train_list = BeautifulSoup(city_page, 'html.parser').find(attrs={'class': 'onecity'})
            if train_list is None:
                continue

            for train_a in train_list.find_all('a'):
                train_station = train_a.text + '站'
                entity_info.append({
                    'type': '火车站',
                    'property': {'name': train_station, '域': '地理位置域', 'id': 'CNT' + str(index)},
                })
                # The relation refers to the station by its running index.
                entity_rel.append([index, {'name': '位于', 'property': {}}, city])
                index += 1
    return entity_info, entity_rel
def get_village(province_dict: dict) -> None:
    """
    Fetch the villages / committees below every town and store them in place.

    For each town a page is requested and every ``villagetr`` row is read: the
    first 12 characters of the first cell become the village id and the third
    cell becomes its name. The result is stored under the town's ``'village'``
    key. Towns whose page cannot be fetched are left without that key.

    Provinces, cities or counties that have no ``'city'`` / ``'county'`` /
    ``'town'`` key (for example because an earlier crawl step failed for them)
    are skipped instead of raising ``KeyError``.

    :param province_dict: nested dict keyed by province id; mutated in place
    :return: None
    """
    print('获取【村/社区】')
    for province in province_dict.values():
        for city in province.get('city', {}).values():
            for county in city.get('county', {}).values():
                town_dict = county.get('town', {})
                for town_id in town_dict:
                    village_dict = {}
                    page_content = GetHttp().get_page_content(
                        URL + town_id[0:2] + '/' + town_id[2:4] + '/' + town_id[4:6] + '/' + town_id + '.html', 3)
                    if not page_content:
                        continue
                    soup = BeautifulSoup(page_content, 'html.parser')
                    for village_item in soup.find_all(attrs={'class': 'villagetr'}):
                        cells = village_item.find_all('td')
                        if len(cells) < 3:
                            # Row does not have the id / ... / name layout read below.
                            continue
                        village_id = str(cells[0].text)[0:12]
                        if village_id not in village_dict:
                            village_dict[village_id] = {'id': village_id, 'name': str(cells[2].text)}
                    town_dict[town_id]['village'] = village_dict
def entity_info_extract(entity_property: dict) -> None:
    """
    Fill ``entity_property`` with the summary and basic-info fields of an entity page.

    The page at ``URL + entity_property['name']`` is fetched. The text of the
    ``lemma-summary`` paragraphs is stored under the key ``'介绍'``, and each
    name/value pair found in the ``basic-info`` element is stored as its own key.
    A message is printed when either section is missing.

    Note: only intended for entities whose name is not ambiguous.

    :param entity_property: entity property dict; must contain ``'name'``; mutated in place
    :return: None
    """
    import re  # local import so this block does not depend on the file's import list

    page_content = GetHttp().get_page_content(URL + entity_property['name'], 3, charset='utf-8')
    # A failed fetch is parsed as an empty document, so both "missing"
    # messages below are printed instead of the parser raising on None.
    soup = BeautifulSoup(page_content or '', 'html.parser')

    # summary
    summary_tag = soup.find(attrs={'class': 'lemma-summary'})
    if summary_tag:
        parts = []
        for para in summary_tag.find_all(attrs={'class': 'para'}):
            # str.split() with no argument drops every whitespace character
            # (newlines, spaces, non-breaking spaces, ...).
            text = ''.join(para.text.split())
            # Remove bracketed markers such as "[1]"; a plain str.replace with
            # '[.*?]' would only match that literal text.
            parts.append(re.sub(r'\[.*?\]', '', text))
        entity_property['介绍'] = ''.join(parts)
    else:
        print('实体[%s]介绍缺失!' % entity_property['name'])

    # basic info
    basic_info_tag = soup.find(attrs={'class': 'basic-info'})
    if basic_info_tag:
        name_list = basic_info_tag.find_all(attrs={'class': 'name'})
        value_list = basic_info_tag.find_all(attrs={'class': 'value'})
        # zip stops at the shorter list, so an unmatched name cannot raise IndexError.
        for name_tag, value_tag in zip(name_list, value_list):
            entity_property[''.join(name_tag.text.split())] = ''.join(value_tag.text.split())
    else:
        print('实体[%s]基本属性缺失!' % entity_property['name'])
def get_province(province_dict: dict) -> None:
    """
    Fetch the top-level divisions listed on the index page and store them in place.

    Every link inside a ``provincetr`` row is read; the digits of its ``href``
    become the id and the link text becomes the name. Links without an
    ``href`` or without any digit in it are skipped.

    :param province_dict: dict that receives ``{id: {'id': id, 'name': name}}``; mutated in place
    :return: None
    """
    print('获取【省/直辖市/自治区】')
    page_content = GetHttp().get_page_content(URL, 3)
    if not page_content:
        return
    soup = BeautifulSoup(page_content, 'html.parser')
    for row in soup.find_all(attrs={'class': 'provincetr'}):
        for a in row.find_all('a'):
            # Raw string: "\D" in a normal string literal is an invalid escape sequence.
            link_id = re.sub(r"\D", "", a.get('href') or '')
            if not link_id:
                continue
            province_dict[link_id] = {'id': link_id, 'name': a.text}
def get_city(province_dict: dict) -> None:
    """
    Fetch the cities below every province and store them in place.

    For each province id a page is requested; every ``citytr`` row contributes
    one city whose id is the first 4 characters of the first cell and whose
    name is the second cell. The mapping is stored under the province's
    ``'city'`` key. Provinces whose page cannot be fetched are left untouched.

    :param province_dict: dict keyed by province id; mutated in place
    :return: None
    """
    print('获取【市/市辖区/自治州/地区/直辖县】')
    for province_id, province in province_dict.items():
        html = GetHttp().get_page_content(URL + province_id + '.html', 3)
        if html:
            cities = {}
            rows = BeautifulSoup(html, 'html.parser').find_all(attrs={'class': 'citytr'})
            for row in rows:
                code = str(row.find_all('td')[0].text)[0:4]
                if code in cities:
                    # Keep the first occurrence of each city code.
                    continue
                cities[code] = {'id': code, 'name': str(row.find_all('td')[1].text)}
            province['city'] = cities
def get_county(province_dict: dict) -> None:
    """
    Fetch the counties / districts below every city and store them in place.

    For each city a page is requested; every ``countytr`` row contributes one
    county whose id is the first 6 characters of the first cell and whose name
    is the second cell. Rows named ``'市辖区'`` are ignored. The mapping is
    stored under the city's ``'county'`` key; cities whose page cannot be
    fetched are left without that key.

    Provinces that have no ``'city'`` key (for example because the city crawl
    failed for them) are skipped instead of raising ``KeyError``.

    :param province_dict: nested dict keyed by province id; mutated in place
    :return: None
    """
    print('获取【县/区/自治县/县级市】')
    for province in province_dict.values():
        city_dict = province.get('city', {})
        for city_id in city_dict:
            county_dict = {}
            page_content = GetHttp().get_page_content(URL + city_id[0:2] + '/' + city_id + '.html', 3)
            if not page_content:
                continue
            soup = BeautifulSoup(page_content, 'html.parser')
            for county_item in soup.find_all(attrs={'class': 'countytr'}):
                cells = county_item.find_all('td')
                if len(cells) < 2:
                    # Row does not have the id / name layout read below.
                    continue
                county_id = str(cells[0].text)[0:6]
                county_name = str(cells[1].text)
                if county_id in county_dict or county_name == '市辖区':
                    continue
                county_dict[county_id] = {'id': county_id, 'name': county_name}
            city_dict[city_id]['county'] = county_dict
def get_town(province_dict: dict) -> None:
    """
    Fetch the towns / sub-districts below every county and store them in place.

    For each county a page is requested; every ``towntr`` row contributes one
    town whose id is the first 9 characters of the first cell and whose name is
    the second cell. The mapping is stored under the county's ``'town'`` key;
    counties whose page cannot be fetched are left without that key.

    Provinces or cities that have no ``'city'`` / ``'county'`` key (for example
    because an earlier crawl step failed for them) are skipped instead of
    raising ``KeyError``.

    :param province_dict: nested dict keyed by province id; mutated in place
    :return: None
    """
    print('获取【镇/乡/街道/开发区】')
    for province in province_dict.values():
        for city in province.get('city', {}).values():
            county_dict = city.get('county', {})
            for county_id in county_dict:
                town_dict = {}
                page_content = GetHttp().get_page_content(
                    URL + county_id[0:2] + '/' + county_id[2:4] + '/' + county_id + '.html', 3)
                if not page_content:
                    continue
                soup = BeautifulSoup(page_content, 'html.parser')
                for town_item in soup.find_all(attrs={'class': 'towntr'}):
                    cells = town_item.find_all('td')
                    if len(cells) < 2:
                        # Row does not have the id / name layout read below.
                        continue
                    town_id = str(cells[0].text)[0:9]
                    if town_id not in town_dict:
                        town_dict[town_id] = {'id': town_id, 'name': str(cells[1].text)}
                county_dict[county_id]['town'] = town_dict
def get_airport() -> [list, list]:
    """
    Collect airport entities and their "位于" (located in) relation to a place.

    Every ``table-view`` table on the page at ``URL`` is scanned, skipping the
    first row of each table. The first cell gives the airport name (square
    brackets and digits removed), the second cell gives the location text,
    which is split into segments using the module-level ``city_type``
    collection; the last segment is used as the relation target.

    Rows with fewer than two cells or an empty location are skipped.

    :return: entity_info: list of airport entity dicts;
             entity_rel: list of [airport index, relation dict, place name] triples
    """
    page_content = GetHttp().get_page_content(URL, 3, charset='utf-8')
    entity_info = []
    entity_rel = []
    if not page_content:
        return entity_info, entity_rel

    soup = BeautifulSoup(page_content, 'html.parser')
    index = 0
    for table in soup.find_all(attrs={'class': 'table-view'}):
        for tr in table.find_all('tr')[1:]:
            # --- parse ---
            tds = tr.find_all('td')
            if len(tds) < 2:
                continue
            # str.split() with no argument drops every whitespace character
            # (newlines, spaces, non-breaking / full-width spaces).
            # The regex then removes brackets and ASCII or full-width digits.
            airport = re.sub(r"[\[\]0-9\uff10-\uff19]", "", ''.join(tds[0].text.split()))
            # Start from single characters and merge neighbours until each
            # element ends with a character listed in city_type.
            pos = list(''.join(tds[1].text.split()))
            if not pos:
                continue
            i = 0
            while i < len(pos) - 1:
                if pos[i][-1] not in city_type:
                    # Current segment is not finished: glue it onto the next element.
                    cur_pos = pos.pop(i)
                    pos[i] = cur_pos + pos[i]
                elif pos[i + 1][-1] in city_type:
                    # Next element is itself a type character: it extends this segment.
                    cur_pos = pos.pop(i)
                    pos[i] = cur_pos + pos[i]
                else:
                    i += 1

            # --- add ---
            if '营口兰旗机场' in airport:
                # This row names two airports: emit this one on its own and
                # strip it from the name. str.replace returns a new string,
                # so the result must be assigned back.
                airport = airport.replace('营口兰旗机场', '')
                entity_info.append({
                    'type': '机场',
                    'property': {
                        'name': '营口兰旗机场',
                        '域': '地理位置域',
                        'id': 'CNA' + str(index)
                    }
                })
                entity_rel.append([index, {'name': '位于', 'property': {}}, pos.pop()])
                # Discard one more segment so pos[-1] below is the location
                # used for the remaining airport of this row.
                pos.pop()
                index += 1

            entity_info.append({
                'type': '机场',
                'property': {
                    'name': airport,
                    '域': '地理位置域',
                    'id': 'CNA' + str(index)
                }
            })
            entity_rel.append([index, {'name': '位于', 'property': {}}, pos[-1].replace('、', '')])
            index += 1
    return entity_info, entity_rel