Python GetHttpの例、Http.GetHttp Pythonの例

コード例 #1

0

ファイルを表示

ファイル: TrainStationCrawler.py プロジェクト: strategist922/EntityExtraction

def get_train_station() -> [list, list]:
    """
    Collect every Chinese railway station entity and its relation to a city.

    The index page at ``URL`` is fetched first. Every link inside an element
    of class ``citylist`` is treated as a city; the page behind that link is
    fetched and every link inside its ``onecity`` element is treated as a
    station. Relation type emitted: 位于 ("located in").

    A city is skipped (instead of aborting the whole crawl) when its link has
    no ``href``, when its page cannot be fetched, or when the page contains no
    ``onecity`` element.

    :return: ``(entity_info, entity_rel)`` - the list of station entity dicts
        and the list of ``[station_index, relation, city_name]`` triples.
        Both lists are empty when the index page cannot be fetched.
    """
    # Short names that must be expanded to the full name of the autonomous
    # prefecture instead of keeping the generic '市' suffix.
    prefecture_names = (
        ('黔西南', '黔西南布依族苗族自治州'),
        ('黔东南', '黔东南苗族侗族自治州'),
        ('黔南', '黔南布依族苗族自治州'),
        ('西双版纳', '西双版纳傣族自治州'),
    )

    entity_info = []
    entity_rel = []
    page_content = GetHttp().get_page_content(URL, 3)
    if not page_content:
        return entity_info, entity_rel

    # City links are combined with the index URL minus '/station.html'.
    base_url = URL.replace('/station.html', '')
    soup = BeautifulSoup(page_content, 'html.parser')
    index = 0
    for city_list in soup.find_all(attrs={'class': 'citylist'}):
        for city_a in city_list.find_all('a'):
            href = city_a.get('href')
            if not href:
                # An anchor without a target cannot be followed.
                continue

            city = city_a.text + '市'
            for keyword, full_name in prefecture_names:
                if keyword in city:
                    city = full_name
                    break

            city_page = GetHttp().get_page_content(base_url + href, 3)
            if not city_page:
                # Fetch failed: skip this city, keep crawling the others.
                continue
            train_list = BeautifulSoup(city_page, 'html.parser').find(attrs={'class': 'onecity'})
            if train_list is None:
                continue

            for train_a in train_list.find_all('a'):
                train_station = train_a.text + '站'
                entity_info.append(
                    {'type': '火车站', 'property': {'name': train_station, '域': '地理位置域', 'id': 'CNT' + str(index)}})
                # The relation refers to the station by its running index.
                entity_rel.append([index, {'name': '位于', 'property': {}}, city])
                index += 1

    return entity_info, entity_rel

コード例 #2

0

ファイルを表示

ファイル: CityCrawler.py プロジェクト: strategist922/EntityExtraction

def get_village(province_dict: dict) -> None:
	"""
	Fetch the villages / committees below every town or sub-district.

	For each town found under ``province -> 'city' -> 'county' -> 'town'`` the
	town page is fetched and every row of class ``villagetr`` is stored in the
	town's ``'village'`` dict, keyed by the first 12 characters of the row's
	first cell; the name is taken from the row's third cell.

	A level whose ``'city'`` / ``'county'`` / ``'town'`` key is missing (for
	example because an earlier crawl stage could not fetch its page) is
	skipped instead of raising ``KeyError``. Towns whose page cannot be
	fetched are left without a ``'village'`` key.

	:param province_dict: dict holding the provinces; updated in place
	:return: None
	"""
	# Villages below each town
	print('获取【村/社区】')
	for province in province_dict.values():
		for city in province.get('city', {}).values():
			for county in city.get('county', {}).values():
				town_dict = county.get('town', {})
				for town_id in town_dict:
					page_content = GetHttp().get_page_content(
						URL + town_id[0:2] + '/' + town_id[2:4] + '/' + town_id[4:6] + '/' + town_id + '.html', 3)
					if not page_content:
						continue

					village_dict = {}
					soup = BeautifulSoup(page_content, 'html.parser')
					for village_item in soup.find_all(attrs={'class': 'villagetr'}):
						cells = village_item.find_all('td')
						village_id = str(cells[0].text)[0:12]
						# Keep the first occurrence of each id only.
						if village_id not in village_dict:
							village_dict[village_id] = {'id': village_id, 'name': str(cells[2].text)}

					town_dict[town_id]['village'] = village_dict

コード例 #3

0

ファイルを表示

ファイル: BaiduEncyclopedia.py プロジェクト: hbwzhsh/EntityExtraction

def entity_info_extract(entity_property: dict) -> None:
	"""
	Fill ``entity_property`` in place with data scraped from Baidu Baike.

	The page at ``URL + entity_property['name']`` is fetched and parsed:

	* the text of every ``para`` element inside ``lemma-summary`` is joined
	  (newlines, spaces and bracketed citation markers removed) and stored
	  under the key ``'介绍'``;
	* every name/value pair of the ``basic-info`` element becomes an
	  additional key of ``entity_property``.

	A message is printed when the page, the summary or the basic info is
	missing. Note: only entities whose name is not ambiguous can be handled.

	:param entity_property: entity property dict; must contain ``'name'``
	:return: None
	"""
	# Local import so this function does not depend on a module-level one.
	import re

	page_content = GetHttp().get_page_content(URL + entity_property['name'], 3, charset='utf-8')
	if not page_content:
		# Nothing to parse; report it like the other missing-data cases.
		print('实体[%s]页面获取失败！' % entity_property['name'])
		return
	soup = BeautifulSoup(page_content, 'html.parser')

	# summary
	summary_tag = soup.find(attrs={'class': 'lemma-summary'})
	if summary_tag:
		paragraphs = []
		for para in summary_tag.find_all(attrs={'class': 'para'}):
			text = para.text.replace('\n', '').replace(' ', '')
			# Remove bracketed markers such as "[1]". This must be a regex
			# substitution: str.replace would only match the literal text.
			paragraphs.append(re.sub(r'\[.*?\]', '', text))
		entity_property['介绍'] = ''.join(paragraphs)
	else:
		print('实体[%s]介绍缺失！' % entity_property['name'])

	# basic info
	basic_info_tag = soup.find(attrs={'class': 'basic-info'})
	if basic_info_tag:
		name_list = basic_info_tag.find_all(attrs={'class': 'name'})
		value_list = basic_info_tag.find_all(attrs={'class': 'value'})
		# zip stops at the shorter list, so an unpaired name cannot raise
		# IndexError.
		for name_tag, value_tag in zip(name_list, value_list):
			key = name_tag.text.replace('\n', '').replace(' ', '')
			entity_property[key] = value_tag.text.replace('\n', '').replace(' ', '')
	else:
		print('实体[%s]基本属性缺失！' % entity_property['name'])

コード例 #4

0

ファイルを表示

ファイル: CityCrawler.py プロジェクト: strategist922/EntityExtraction

def get_province(province_dict: dict) -> None:
	"""
	Fetch all provinces and municipalities from the index page at ``URL``.

	Every link inside a row of class ``provincetr`` is stored in
	``province_dict`` under the digits contained in its ``href``, as
	``{'id': <digits>, 'name': <link text>}``. Links without an ``href`` are
	ignored. Nothing is stored when the page cannot be fetched.

	:param province_dict: dict that receives the provinces; updated in place
	:return: None
	"""
	print('获取【省/直辖市/自治区】')
	page_content = GetHttp().get_page_content(URL, 3)
	if not page_content:
		return

	soup = BeautifulSoup(page_content, 'html.parser')
	for row in soup.find_all(attrs={'class': 'provincetr'}):
		for a in row.find_all('a'):
			href = a.get('href')
			if not href:
				continue
			# Raw string: "\D" in a normal string is an invalid escape sequence.
			link_id = re.sub(r"\D", "", href)
			province_dict[link_id] = {'id': link_id, 'name': a.text}

コード例 #5

0

ファイルを表示

ファイル: CityCrawler.py プロジェクト: strategist922/EntityExtraction

def get_city(province_dict: dict) -> None:
	"""
	Fetch the cities directly below every province.

	For each province id the page ``URL + <province id> + '.html'`` is
	fetched; every row of class ``citytr`` is stored in the province's
	``'city'`` dict, keyed by the first 4 characters of the row's first cell,
	with the name taken from the second cell. A province whose page cannot be
	fetched is left without a ``'city'`` key.

	:param province_dict: dict holding the provinces; updated in place
	:return: None
	"""
	print('获取【市/市辖区/自治州/地区/直辖县】')
	for province_id, province in province_dict.items():
		html = GetHttp().get_page_content(URL + province_id + '.html', 3)
		if html:
			cities = {}
			rows = BeautifulSoup(html, 'html.parser').find_all(attrs={'class': 'citytr'})
			for row in rows:
				cells = row.find_all('td')
				code = str(cells[0].text)[0:4]
				# Only the first row seen for a given code is kept.
				if code in cities:
					continue
				cities[code] = {'id': code, 'name': str(cells[1].text)}
			province['city'] = cities

コード例 #6

0

ファイルを表示

ファイル: CityCrawler.py プロジェクト: strategist922/EntityExtraction

def get_county(province_dict: dict) -> None:
	"""
	Fetch the counties / districts below every city.

	For each city under ``province -> 'city'`` the city page is fetched and
	every row of class ``countytr`` is stored in the city's ``'county'`` dict,
	keyed by the first 6 characters of the row's first cell. Rows whose name
	(second cell) is '市辖区' are excluded.

	A province without a ``'city'`` key (for example because its page could
	not be fetched earlier) is skipped instead of raising ``KeyError``.
	Cities whose page cannot be fetched are left without a ``'county'`` key.

	:param province_dict: dict holding the provinces; updated in place
	:return: None
	"""
	print('获取【县/区/自治县/县级市】')
	for province in province_dict.values():
		city_dict = province.get('city', {})
		for city_id in city_dict:
			page_content = GetHttp().get_page_content(URL + city_id[0:2] + '/' + city_id + '.html', 3)
			if not page_content:
				continue

			county_dict = {}
			soup = BeautifulSoup(page_content, 'html.parser')
			for county_item in soup.find_all(attrs={'class': 'countytr'}):
				cells = county_item.find_all('td')
				county_id = str(cells[0].text)[0:6]
				# Keep the first occurrence of each id only.
				if county_id in county_dict:
					continue
				county_name = str(cells[1].text)
				if county_name == '市辖区':
					continue
				county_dict[county_id] = {'id': county_id, 'name': county_name}

			city_dict[city_id]['county'] = county_dict
コード例 #7

0

ファイルを表示

ファイル: CityCrawler.py プロジェクト: strategist922/EntityExtraction

def get_town(province_dict: dict) -> None:
	"""
	Fetch the towns / sub-districts below every county.

	For each county under ``province -> 'city' -> 'county'`` the county page
	is fetched and every row of class ``towntr`` is stored in the county's
	``'town'`` dict, keyed by the first 9 characters of the row's first cell,
	with the name taken from the second cell.

	A level whose ``'city'`` / ``'county'`` key is missing (for example
	because an earlier crawl stage could not fetch its page) is skipped
	instead of raising ``KeyError``. Counties whose page cannot be fetched
	are left without a ``'town'`` key.

	:param province_dict: dict holding the provinces; updated in place
	:return: None
	"""
	print('获取【镇/乡/街道/开发区】')
	for province in province_dict.values():
		for city in province.get('city', {}).values():
			county_dict = city.get('county', {})
			for county_id in county_dict:
				page_content = GetHttp().get_page_content(
					URL + county_id[0:2] + '/' + county_id[2:4] + '/' + county_id + '.html', 3)
				if not page_content:
					continue

				town_dict = {}
				soup = BeautifulSoup(page_content, 'html.parser')
				for town_item in soup.find_all(attrs={'class': 'towntr'}):
					cells = town_item.find_all('td')
					town_id = str(cells[0].text)[0:9]
					# Keep the first occurrence of each id only.
					if town_id not in town_dict:
						town_dict[town_id] = {'id': town_id, 'name': str(cells[1].text)}

				county_dict[county_id]['town'] = town_dict

コード例 #8

0

ファイルを表示

ファイル: AirportCrawler.py プロジェクト: strategist922/EntityExtraction

def get_airport() -> [list, list]:
    """
    Collect every Chinese airport entity and its relation to a city.

    The page at ``URL`` is fetched; in every element of class ``table-view``
    each row after the first is read: the first cell yields the airport name
    and the second cell the location. Relation type emitted: 位于
    ("located in").

    Rows with fewer than two cells or with an empty location are skipped.

    :return: ``(entity_info, entity_rel)`` - the list of airport entity dicts
        and the list of ``[airport_index, relation, location]`` triples.
        Both lists are empty when the page cannot be fetched.
    """
    entity_info = []
    entity_rel = []
    page_content = GetHttp().get_page_content(URL, 3, charset='utf-8')
    if not page_content:
        return entity_info, entity_rel

    soup = BeautifulSoup(page_content, 'html.parser')
    index = 0
    for table in soup.find_all(attrs={'class': 'table-view'}):
        for tr in table.find_all('tr')[1:]:
            # Parse
            tds = tr.find_all('td')
            if len(tds) < 2:
                continue
            raw_name = tds[0].text.replace('\n', '').replace(' ', '').replace('　', '')
            # Drop square brackets and digits (footnote markers) from the name.
            # Raw string: the escapes are meant for the regex engine.
            # NOTE(review): the duplicated 0-9 range looks like it once held
            # full-width digits - confirm against the upstream file.
            airport = re.sub(r"[\[0-90-9\]]", "", raw_name)

            # Start from single characters and glue them into location names.
            # city_type is defined outside this block; presumably it holds the
            # suffix characters that end a place name - TODO confirm.
            pos = list(tds[1].text.replace('\n', '').replace(' ', ''))
            if not pos:
                continue
            i = 0
            while i < len(pos) - 1:
                # Merge the current chunk into the next one while it does not
                # end with a suffix yet, or while the next chunk also ends
                # with a suffix; otherwise the chunk is complete.
                if pos[i][-1] not in city_type or pos[i + 1][-1] in city_type:
                    cur_pos = pos.pop(i)
                    pos[i] = cur_pos + pos[i]
                else:
                    i += 1

            # Add
            if '营口兰旗机场' in airport:
                # This row carries two airports: emit 营口兰旗机场 on its own
                # and remove it from the remaining name. str.replace returns a
                # new string, so the result has to be assigned back.
                airport = airport.replace('营口兰旗机场', '')
                entity_info.append({
                    'type': '机场',
                    'property': {
                        'name': '营口兰旗机场',
                        '域': '地理位置域',
                        'id': 'CNA' + str(index)
                    }
                })
                entity_rel.append(
                    [index, {
                        'name': '位于',
                        'property': {}
                    }, pos.pop()])
                # One more trailing chunk is discarded before the remaining
                # airport is located.
                pos.pop()
                index += 1
            entity_info.append({
                'type': '机场',
                'property': {
                    'name': airport,
                    '域': '地理位置域',
                    'id': 'CNA' + str(index)
                }
            })
            entity_rel.append([
                index, {
                    'name': '位于',
                    'property': {}
                }, pos[-1].replace('、', '')
            ])
            index += 1

    return entity_info, entity_rel