Code Example #1
File: timur.py Project: rizzleduq/kpfu_parser
def gather_name_link_of_employees_mehmat(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    employees = []

    iframe = soup.find('iframe')
    if iframe:
        outer_src = iframe.get('src')

        html = tools.get_html(outer_src)
        soup = BeautifulSoup(html, 'lxml')

        spans = soup.find_all('span', class_='fio')

        for span in spans:
            a = span.find('a')
            if a:
                employees.append((a.text, a.get('href')))
    else:
        tbody = soup.find('tbody')
        trs = tbody.find_all('tr')
        for tr in trs:
            td = tr.find('td')
            p = td.find('p')
            a = p.find('a')
            if a:
                employees.append((a.text, a.get('href')))
    return employees
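
All of the kpfu_parser snippets on this page call a project-local tools.get_html helper that is not shown here. A minimal sketch of what such a helper might look like, assuming a plain requests-based fetch (the real implementation may differ):

# Hypothetical sketch of the project-local tools.get_html helper; the actual
# kpfu_parser version is not included on this page and may differ.
import requests

def get_html(url, encoding=None):
    """Fetch a page and return its HTML text, or None if the request fails."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        if encoding:
            response.encoding = encoding
        return response.text
    except requests.RequestException:
        return None
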
Code Example #2
File: regina.py Project: rizzleduq/kpfu_parser
def gather_name_link_of_employees(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    employees = []

    iframe = soup.find('iframe')
    div = soup.find('div', class_='visit_link')
    if iframe:
        outer_src = iframe.get('src')

        html = tools.get_html(outer_src)
        soup = BeautifulSoup(html, 'lxml')

        spans = soup.find_all('span', class_='fio')

        for span in spans:
            a = span.find('a')
            if a:
                employees.append((a.text, a.get('href')))
    elif div:
        ps = div.find_all('p')

        for p in ps:
            a = p.find('a')
            if a:
                if a.text != 'КФУ' and 'Институт' not in a.text:
                    employees.append((a.text, a.get('href')))
    return employees
Code Example #3
File: timur.py Project: rizzleduq/kpfu_parser
def gather_name_link_of_employees_imo(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    employees = []

    iframe = soup.find('iframe')
    if iframe:
        outer_src = iframe.get('src')

        html = tools.get_html(outer_src)
        soup = BeautifulSoup(html, 'lxml')

        spans = soup.find_all('span', class_='fio')

        for span in spans:
            a = span.find('a')
            if a:
                employees.append((a.text, a.get('href')))
    else:
        div = soup.find('div', class_='visit_link')
        p = div.find('p')
        for row in p.text.split('\r\n'):
            employees.append((row, None))
    return employees
Code Example #4
def urls_with_reviews(news_url, channel):
    '''
    Return the comment-data URL for a news article that has comments; filter out articles without any.
    :param news_url: URL of the news article
    :param channel: news channel name
    :return: [num, data_url], or None if there are no comments
    '''
    path = urlsplit(news_url).path[::-1]
    start_index = path.index(".") + 1
    second = path.index("-")
    news_id = path[start_index:second][::-1]
    news_id = news_id[1:]  # strip the leading 'i' to get the numeric news id
    middle = "channel=" + channel + "&newsid=comos-" + news_id
    comments_url = "http://comment5.news.sina.com.cn/comment/skin/default.html?" + middle + "&group=0"  # comment page for this article
    data_url = url_with_reviews_data(channel=channel, id=news_id)  # endpoint that returns the comment data
    data = get_html(data_url)
    data = json.loads(data)
    try:
        num = data["result"]["count"]["show"]  # number of visible comments
    except Exception as e:
        raise SinaException(data_url) from e
    if int(num) > 0:
        return [num, data_url]
    return None
Code Example #5
def get_data(page_num, url):
    data = []
    for i in range(1, page_num + 1):
        url_ = url.replace("page=1", "page=" + str(i))
        data_ = get_html(url_)  # fetch this page of comment data
        data.append(data_)
    return {"data": data}
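
A hedged usage sketch chaining urls_with_reviews with get_data above; the article URL below is purely illustrative, and it assumes url_with_reviews_data returns a paged endpoint whose query string contains page=1 with roughly 20 comments per page:

# Illustrative only: fetch every page of comments for one article.
news_url = "https://news.sina.com.cn/c/2021-01-01/doc-iabcdefg1234567.shtml"  # hypothetical URL
result = urls_with_reviews(news_url, "gn")
if result is not None:
    num, data_url = result
    page_num = int(num) // 20 + 1  # assumed page size of 20
    comments = get_data(page_num, data_url)
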
Code Example #6
def get_phys_teachers_rad_astr(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    teachers_list = soup.select('.visit_link ul li a[href]')
    rad_astr = []
    for a in teachers_list:
        rad_astr.append((a.text, a.get('href')))
    return rad_astr
Code Example #7
def get_phys_teachers(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    teachers_list = soup.select('p a[href]')
    employees = []
    for a in teachers_list:
        employees.append((a.text, a.get('href')))
    return employees
Code Example #8
File: ildar.py Project: thisisregina/kpfu_parser
def gather_name_link_of_employees(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    iframe = soup.find('iframe')
    outer_src = iframe.get('src')

    html = tools.get_html(outer_src)
    soup = BeautifulSoup(html, 'lxml')

    spans = soup.find_all('span', class_='fio')

    employees = []
    for span in spans:
        a = span.find('a')
        if a:
            employees.append((a.text, a.get('href')))

    return employees
Code Example #9
def get_info_from_html(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    div = soup.find('div', class_='visit_link')
    p = div.find_all('p')

    result = []
    for current in p:
        result.append(current.text)
    return result
Code Example #10
File: ildar.py Project: thisisregina/kpfu_parser
def get_link_from_menu_list_left(link, button_name: str):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    ul = soup.find('ul', class_='menu_list_left')
    lis = ul.find_all('li')

    for li in lis:
        a = li.find('a')
        if a and a.text == button_name:
            return a.get('href')
Code Example #11
def gather_name_link_of_cathedras_of_engineer(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    div = soup.find('div', class_='area_width')
    links = div.find_all('a')

    cathedras = []
    for a in links:
        if a.text.startswith('Кафедра'):
            cathedras.append((a.text, a.get('href')))
    return cathedras
Code Example #12
File: timur.py Project: rizzleduq/kpfu_parser
def gather_name_link_of_cathedras_of_imo(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    div = soup.find('div', class_='visit_link')

    links = div.find_all('a')

    cathedras = []
    for a in links:
        if 'Кафедра' in a.text:
            cathedras.append((a.text, a.get('href')))
    return cathedras
Code Example #13
File: anton.py Project: Karantir73/kpfu_parser
def gather_name_link_of_employess_it_licey(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    uls = soup.find_all('ul', class_='menu_list')

    employees = []
    for ul in uls:
        anchors = ul.find_all('a')
        for a in anchors:
            if "@" not in a.text:
                employees.append((a.text, a.get('href')))
    return employees
Code Example #14
File: sergey.py Project: thisisregina/kpfu_parser
def gather_name_link_of_cathedras_of_psychology(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    ul = soup.find('ul', class_='menu_list_left')

    lis = ul.find_all('li')

    cathedras = []
    for li in lis:
        a = li.find('a')
        if a:
            cathedras.append((a.text, a.get('href')))
    return cathedras
Code Example #15
def gather_name_link_of_cathedras_of_ipot(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    div = soup.find('div', class_='visit_link')

    links = div.find_all('a')

    cathedras = []
    for a in links:
        if a.text.startswith('Центр'):
            cathedras.append((a.text, a.get('href')))
    return cathedras
Code Example #16
File: regina.py Project: rizzleduq/kpfu_parser
def gather_link_of_schools(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    ul = soup.find('ul', class_='menu_list_left')
    lis = ul.find_all('li')

    schools = []
    for li in lis:
        a = li.find('a')
        if a.text.startswith('Высшая школа'):
            schools.append(a.get('href'))
    return schools
Code Example #17
def get_links_from_menu_list_left(link, button_name: str):
    # returns every matching link, not just the first one
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    ul = soup.find('ul', class_='menu_list_left')
    lis = ul.find_all('li')

    links_res = []
    for li in lis:
        a = li.find('a')
        if a and a.text.startswith(button_name):
            links_res.append(a.get('href'))
    return links_res
Code Example #18
File: alsu.py Project: thisisregina/kpfu_parser
def gather_name_link_of_cathedras_of_chill(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    uls = soup.find_all('ul', class_='menu_list')
    cathedras = []
    for ul in uls:
        lis = ul.find_all('li')
        for li in lis:
            a = li.find('a')
            if a and a.text.startswith('кафедра'):
                cathedras.append((a.text, a.get('href')))
    return cathedras
Code Example #19
def gather_name_link_of_employees_engineer(link):
    html = tools.get_html(link)
    if html is None:
        return []
    soup = BeautifulSoup(html, 'lxml')

    table = soup.find('table', class_='cke_show_border')
    links = table.find_all('a')
    employees = []
    for a in links:
        employees.append((a.text, a.get('href')))

    return employees
Code Example #20
File: sergey.py Project: thisisregina/kpfu_parser
def gather_name_link_of_psychology_employees(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    div = soup.find('div', class_='visit_link')

    links = div.find_all('a')

    employees = []
    for a in links:
        employees.append((a.text, a.get('href')))

    return employees
Code Example #21
File: timur.py Project: rizzleduq/kpfu_parser
def gather_name_link_of_cathedras_of_mehmat(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    uls = soup.find_all('ul', class_='menu_list')

    list_links = []
    for ul in uls:
        list_links.append(ul.find_all('a'))

    cathedras = []
    for links in list_links:
        for a in links:
            # the first spelling contains a Latin 'p', likely matching a typo on the source page
            if a.text.startswith('Кафедpа') or a.text.startswith('Кафедра'):
                cathedras.append((a.text, a.get('href')))
    return cathedras
Code Example #22
def gather_name_link_of_cathedras_of_phys(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    div = soup.find('div', class_='visit_link')

    search_cat = {}

    links = div.find_all('a')

    cathedras = []
    for a in links:
        if a.text.startswith('Кафедра'):
            cathedras.append((a.text, a.get('href')))
            search_cat[a.text] = 'Сотрудники'
    return cathedras, search_cat
Code Example #23
File: ildar.py Project: thisisregina/kpfu_parser
def gather_name_link_of_cathedras_of_ivmiit(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    div = soup.find('div', class_='visit_link')
    uls = div.find_all('ul')

    lis = []
    for ul in uls:
        lis += ul.find_all('li', class_='li_spec')

    cathedras = []
    for li in lis:
        a = li.find('a')
        if a.text.startswith('Кафедра'):
            cathedras.append((a.text, a.get('href')))
    return cathedras
Code Example #24
File: main.py Project: rizzleduq/kpfu_parser
def main():
    html = tools.get_html(constants.initial_url)
    institutes = gather_name_link_of_institutes_and_branches(html)
    print(f'institutes: {institutes}')
    print(f'number of institutes: {len(institutes)}')

    parsing_dictionary = {
        'Институт экологии и природопользования': parse_geogr,
        'Институт геологии и нефтегазовых технологий': None,
        'Институт математики и механики им. Н.И. Лобачевского': parse_mehmat,
        'Институт физики': parse_phys,
        'Химический институт им. А.М. Бутлерова': parse_chem,
        'Юридический факультет': parse_law,
        'Институт вычислительной математики и информационных технологий': parse_ivmiit,
        'Институт филологии и межкультурной коммуникации': parse_philology,
        'Институт психологии и образования': parse_psychology,
        'Общеуниверситетская кафедра физического воспитания и спорта': parse_physical,
        'Институт информационных технологий и интеллектуальных систем': None,
        'Институт фундаментальной медицины и биологии': None,
        'Инженерный институт': parse_engineer,
        'Институт международных отношений': parse_imo,
        'Высшая школа бизнеса': parse_higher_school_buisness,
        'Институт социально-философских наук и массовых коммуникаций': None,
        'Институт управления, экономики и финансов': None,
        'Высшая школа государственного и муниципального управления': None,
        'Центр корпоративного обучения': None,
        'IT-лицей-интернат КФУ': parse_IT_licey,
        'Лицей имени Н.И.Лобачевского': parse_lobach_licey,
        'Подготовительный факультет для иностранных учащихся': None,
        'Приволжский центр повышения квалификации и профессиональной переподготовки работников образования': None,
        'Центр непрерывного повышения профессионального мастерства педагогических работников': None,
        'Медико-санитарная часть ФГАОУ ВО КФУ': None,
        'Центр цифровых трансформаций': None,
        'Институт передовых образовательных технологий': parse_ipot,
        'Набережночелнинский институт КФУ': parse_chill,
        'Елабужский институт КФУ': None}

    data = {}
    for name, link in institutes:
        func = parsing_dictionary.get(name)
        if func:
            data[name] = func(link)

    # pprint(data)
    create_visualization(data)
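
main() depends on gather_name_link_of_institutes_and_branches, which is not shown on this page. A rough sketch of what it might do, assuming the institute links sit in the same menu_list markup the other snippets target (the real function may differ):

# Hypothetical sketch; the real gather_name_link_of_institutes_and_branches
# is not included on this page, and the selector below is an assumption.
from bs4 import BeautifulSoup

def gather_name_link_of_institutes_and_branches(html):
    soup = BeautifulSoup(html, 'lxml')
    institutes = []
    for a in soup.select('ul.menu_list a[href]'):
        institutes.append((a.text, a.get('href')))
    return institutes
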
Code Example #25
def get_news_detail_url(index_url, out_path=None):
    '''
    Collect detail-page URLs from a news index page.
    :param index_url: URL of the news index page
    :param out_path: optional directory for saving the raw and parsed data
    :return: dict mapping news_id to (url, channel)
    '''
    channel = get_channel(url=index_url)  # determine the news channel
    home_html = get_html(index_url).strip()  # fetch the index page
    news_data = parse_home_data(home_html)  # parse the news entries
    if out_path is not None:
        mkdir(out_path)
        save_data_txt(out_path, channel + "_resource.txt", home_html)  # save the raw page
        to_csv(out_path, channel + "_parsed.csv", news_data)
    result = dict()
    for news in news_data:
        news_id = news[0]
        url = news[2]
        # tmp = {"news_id":news_id, "url":url, "channel":channel}
        result[str(news_id)] = (url, channel)

    return result
Code Example #26
def gather_name_link_of_cathedras_of_law(link):
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')

    uls = soup.find_all('ul', class_='menu_list')

    lis = []
    search_cat = {}

    for ul in uls[:2]:  # only the first two menu lists hold cathedra links
        lis += ul.find_all('li', class_='li_spec')
    cathedras = []

    for li in lis:
        a = li.find('a')
        if a.text.startswith('Кафедра'):
            cathedras.append((a.text, a.get('href')))
            search_cat[a.text] = 'Сотрудники'
    return cathedras, search_cat
Code Example #27
File: main.py Project: enjoqy/PythonProjects
import tools
import nationwide_make_urls
from lxml import etree
'''
Main routine for crawling the recruitment site
'''

# get the nationwide Java job-listing links
nationwide_java_urls = nationwide_make_urls.get_nationwide_urls()

for province_name in nationwide_java_urls:
    # URL for this province
    province_url = nationwide_java_urls[province_name]

    # find how many result pages this province has
    html = tools.get_html(province_url)
    content = etree.ElementTree(etree.HTML(html))
    page_numbers = content.xpath(
        r'//*[@class="p_in"]/span[@class="td"]/text()')
    if page_numbers:
        # parse the numeric page count out of the label text
        page_number = page_numbers[0].split('页')[0][1:]
    else:
        page_number = 1

    # build the per-page links for this province
    i = 1
    urls = []
    while i <= int(page_number):
        print(page_number)
        print(province_url)
Code Example #28
File: spider.py Project: mickelfeng/pspider
    sp = SpecialSpider.SpecialSpider()
    sp.spider(module, gconfig.special, logger, parser)

    logger.info('The [%s] process is completed.' % module)
    exit(0)
### end }

conf = gconfig.settings[module]

try:
    f = open(conf['data_path'], 'w', 0)
except IOError as e:
    logger.warn('Can NOT open file: %s. [Except]: %s' % (conf['data_path'], e))
    exit(-1)

contents = tools.get_html(module, conf, logger)

for content in contents:
    #{
    #content = contains[i]

    if ('iconv' in conf) and conf['iconv']:
        content = content.decode('gbk', 'ignore').encode('utf-8')
        logger.info('convert code success.')

    # backup
    if 'save' in conf:
        try:
            back_f = open(conf['save'], 'a', 0)
            back_f.write(content)
            logger.info('backup success.')
Code Example #29
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re

from tools import get_html

url = 'http://www.en8848.com.cn/kouyu/basic/yuanlai/218414.html'
html = get_html(url, 'utf-8')
# print(html)
key = html
# regular expression for the download link's window.open(...) call
p = r'\$\("\.jp-download"\)\.click\(function\(\){\s*window\.open\(\S.*'
# p = 'html'
# compile the regular expression
pattern = re.compile(p)
# search the page source for a match
matcher1 = re.search(pattern, key)
# print the match object
print(matcher1)
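
If just the matched text is wanted rather than the match object, a small follow-up (assuming a match was found):

# Print only the matched snippet, guarding against a failed match.
if matcher1:
    print(matcher1.group(0))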