# コード例 #1 (Code example #1)
def get_HSBC_onehtml(url, time, path,
                     pic_id):  ## url to fetch, seminar time, output dir, starting index for saved images
    """Scrape one HSBC seminar detail page into a ``seminar`` object.

    Returns a populated ``seminar`` on success; when the page layout does
    not match expectations, returns the sentinel ``seminar("-100")`` with
    ``right = False`` (the convention shared by the sibling scrapers).
    """
    content = get_html(url, "HSBC")
    soup = BeautifulSoup(content, "html.parser")
    try:
        main_part = soup.find("div", class_="common")
        title = main_part.find("div", class_="title").get_text()
        # Build the seminar record.
        temp_obj = seminar(title)
        temp_obj.time = time
        temp_obj.url = url
        temp_obj.type = "HSBC"
        # Collected paragraph text for the seminar body.
        text_parts = []
        # The useful body lives under this container; its children mix
        # text <div>s and image-bearing <div>s.
        main_part = main_part.find("div", class_="content clearfix")
        seminar.mkdirs(path)
        main_list = main_part.find_all("div")
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(url, e))
        ress = seminar("-100")
        ress.right = False
        return ress

    for ele in main_list:
        try:
            jpeg_list = ele.find_all("img")
            if len(jpeg_list) == 0:
                # Text-only node: keep it if it passes the content filter.
                if util.judge_str(ele.get_text()):
                    text_parts.append(ele.get_text())
            else:
                for singel in jpeg_list:
                    ura = singel.get("src")
                    # Skip decorative user avatars entirely. NOTE: this
                    # check must run BEFORE recording the URL — previously
                    # the URL was appended first, leaving temp_obj.jpeg and
                    # temp_obj.jpeg_real_path out of sync for skipped images.
                    if "img-user" in ura:
                        continue
                    temp_obj.jpeg.append(ura)
                    out_name = path + "/" + "pic_" + str(pic_id) + ".jpg"
                    Web_crawler.get_jpeg_out(ura, out_name)
                    temp_obj.jpeg_real_path.append(out_name)
                    pic_id += 1
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取汇丰{}讲座时遇到问题,重试!".format(title))
    temp_obj.content = text_parts
    My_logger.my_logger.warning("已完成爬取:{}".format(title))
    return temp_obj
# コード例 #2 (Code example #2)
def get_HIT_one_html(url, title, real_time, time, path, pic_id):
    """Scrape one HIT (Shenzhen) seminar detail page into a ``seminar``.

    Returns a populated ``seminar`` on success; on a layout mismatch
    returns the sentinel ``seminar("-100")`` with ``right = False``.
    """
    header = {
        'User-Agent': util.User_Agent,
        'Referer': util.Referer["HIT"],
        "Host": util.Host["HIT"]
    }
    content = Web_crawler.get_html_withheader(url, header)
    soup = BeautifulSoup(content, "html.parser")
    # Collected paragraph text for the seminar body.
    words_list = []
    # Image src attributes on this site are root-relative.
    org_jpeg_url = "http://www.hitsz.edu.cn"
    # Build the seminar record.
    try:
        temp_obj = seminar(title)
        temp_obj.time = time
        temp_obj.real_time = real_time
        temp_obj.url = url
        temp_obj.type = "HIT"
        seminar.mkdirs(path)
        main_part = soup.find("div", class_="detail")
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(url, e))
        ress = seminar("-100")
        ress.right = False
        return ress
    for ele in main_part.find_all("p"):
        jpeg_list = ele.find_all("img")
        try:
            if len(jpeg_list) == 0:
                # Text-only paragraph: keep it if it passes the filter.
                if util.judge_str(ele.get_text()):
                    words_list.append(ele.get_text())
            else:
                for singel in jpeg_list:
                    ura = org_jpeg_url + singel.get("src")
                    # Record the source URL alongside the saved path so the
                    # two lists stay parallel (consistent with the other
                    # site scrapers, which track both).
                    temp_obj.jpeg.append(ura)
                    out_name = path + "/" + "pic_" + str(pic_id) + ".jpg"
                    Web_crawler.get_jpeg_out(ura, out_name)
                    temp_obj.jpeg_real_path.append(out_name)
                    pic_id += 1
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取哈工大的{}讲座时候遇到问题!".format(title))
    temp_obj.content = words_list
    My_logger.my_logger.info("已完成爬取{}!".format(title))
    return temp_obj
# コード例 #3 (Code example #3)
def get_utszlecture_one_html(url, title, real_time, path, pic_id):
    """Scrape one UTSZ (University Town) lecture detail page.

    Returns a populated ``seminar`` on success; on a layout mismatch
    returns the sentinel ``seminar("-100")`` with ``right = False``.
    """
    header = {
        'User-Agent': util.User_Agent,
        "Host": util.Host["utsz_lecture"],
        'Referer': util.Referer["utsz_lecture"]
    }
    # This site can be slow; allow a generous timeout.
    real_content = requests.get(url, headers=header, timeout=120).text
    soup = BeautifulSoup(real_content, "html.parser")

    # Build the seminar record.
    temp_obj = seminar(title)
    temp_obj.url = url
    temp_obj.real_time = real_time
    temp_obj.type = "utsz"
    # Collected paragraph text for the lecture body.
    words_list = []
    seminar.mkdirs(path)
    try:
        main_part = soup.find("div", {
            "class": "conboxcon reset-conboxcon"
        }).find_all("p")
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(url, e))
        ress = seminar("-100")
        ress.right = False
        return ress
    for ele in main_part:
        jpeg_list = ele.find_all("img")
        try:
            if len(jpeg_list) == 0:
                # Text-only paragraph: keep it if it passes the filter.
                if util.judge_str(ele.get_text()):
                    words_list.append(ele.get_text())
            else:
                for singel in jpeg_list:
                    ura = singel.get("src")
                    # Record the source URL alongside the saved path so the
                    # two lists stay parallel (consistent with the other
                    # site scrapers, which track both).
                    temp_obj.jpeg.append(ura)
                    out_name = path + "/" + "pic_" + str(pic_id) + ".jpg"
                    Web_crawler.get_jpeg_out(ura, out_name)
                    temp_obj.jpeg_real_path.append(out_name)
                    pic_id += 1
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取大学城的{}讲座时候遇到问题!".format(title))
    temp_obj.content = words_list
    My_logger.my_logger.warning("已完成爬取:{}".format(title))
    return temp_obj
# コード例 #4 (Code example #4)
def get_html(url, name):
    """Fetch *url* with the per-site headers registered under *name*.

    *name* keys into ``util.Referer`` and ``util.Host``; the shared
    ``util.User_Agent`` is always sent.
    """
    site_headers = {
        "Host": util.Host[name],
        "Referer": util.Referer[name],
        "User-Agent": util.User_Agent,
    }
    return Web_crawler.get_html_withheader(url, site_headers)
# コード例 #5 (Code example #5)
def get_utszlecture_one_html_old(url, title, info, path, pic_id):
    """Legacy scraper for a UTSZ lecture page (older page layout).

    Retries the download up to 10 times with a short timeout, then parses
    the ``edittext`` container for images and paragraph text. Returns a
    populated ``seminar`` object.
    """
    # Force HTTP/1.0 for httplib2-based helpers; presumably a workaround
    # for a server quirk — TODO confirm still needed.
    httplib2.http.client.HTTPConnection._http_vsn = 10
    httplib2.http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    httplib2.Response.version = 10
    header = {
        'User-Agent': util.User_Agent,
        "Host": util.Host["utsz_lecture"],
        'Referer': util.Referer["utsz_lecture"]
    }
    real_content = None
    for i in range(10):
        print("第{}次尝试".format(i + 1))
        try:
            # BUG FIX: the original stored the Response object itself and
            # handed it to BeautifulSoup; we need the decoded body text.
            real_content = requests.get(url, headers=header, timeout=5).text
            break
        except requests.RequestException:
            continue
    if real_content is None:
        # All short-timeout retries failed; make one last attempt with a
        # generous timeout (lets this raise if the site is truly down).
        real_content = requests.get(url, headers=header, timeout=120).text
    soup = BeautifulSoup(real_content, "html.parser")
    # Build the seminar record.
    temp_obj = seminar(title)
    temp_obj.url = url
    temp_obj.type = "utsz"
    words_list = []
    # Images live in the "edittext" container.
    img_content = soup.find("div", {"class": "edittext"})
    if img_content is not None:
        for url_small in img_content.find_all("img",
                                              {"src": re.compile("jpg|png")}):
            out_name = path + "/" + title + "pic_" + str(pic_id) + ".jpg"
            Web_crawler.get_jpeg_out(url_small["src"], out_name)
            temp_obj.jpeg_real_path.append(out_name)
            pic_id += 1
            temp_obj.jpeg.append(url_small["src"])

    # Then the lecture text: one entry per <p> that passes the filter.
    words_content = soup.find("div", {"class": "edittext"})
    if words_content is not None:
        for item in words_content.find_all("p"):
            if util.judge_str(item.string):
                words_list.append(item.string)
        temp_obj.content = words_list
    My_logger.my_logger.warning("已完成爬取:{}".format(title))
    return temp_obj
# コード例 #6 (Code example #6)
def get_TSINGHUA_one_html(url, time, path, pic_id):
    """Scrape one Tsinghua SIGS seminar detail page into a ``seminar``.

    Returns a populated ``seminar`` on success; on a layout mismatch
    returns the sentinel ``seminar("-100")`` with ``right = False``.
    """
    content = get_html(url, "TSINGHUA")
    soup = BeautifulSoup(content, "html.parser")
    try:
        title = soup.find("h1", {"class": "arti_title"})
        # Build the seminar record.
        temp_obj = seminar(title.string)
        temp_obj.url = url
        temp_obj.type = "TSINGHUA"
        temp_obj.time = time
        text_parts = []  ## collected paragraph text
        # Image src attributes on this site are root-relative.
        org_jpeg_url = "https://www.sigs.tsinghua.edu.cn"
        main_part = soup.find("div", class_="wp_articlecontent").find_all("p")
        seminar.mkdirs(path)
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(url, e))
        ress = seminar("-100")
        ress.right = False
        return ress
    for ele in main_part:
        jpeg_list = ele.find_all("img")
        try:
            if len(jpeg_list) == 0:
                # Text-only paragraph: keep it if it passes the filter.
                if util.judge_str(ele.get_text()):
                    text_parts.append(ele.get_text())
            else:
                for singel in jpeg_list:
                    ura = org_jpeg_url + singel.get("src")
                    # BUG FIX: 'ura' is already absolute; the original
                    # prepended org_jpeg_url a second time here, recording
                    # a doubled-prefix URL.
                    temp_obj.jpeg.append(ura)
                    out_name = path + "/" + "pic_" + str(pic_id) + ".jpg"
                    Web_crawler.get_jpeg_out(ura, out_name)
                    temp_obj.jpeg_real_path.append(out_name)
                    pic_id += 1
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取清华的{}讲座时候遇到问题!".format(title.string))

    temp_obj.content = text_parts
    My_logger.my_logger.warning("已完成爬取:{}".format(title.string))
    return temp_obj
# コード例 #7 (Code example #7)
def get_STL_one_html(url, info, path, pic_id):
    """Scrape one STL (School of Transnational Law) seminar page.

    Returns a populated ``seminar`` on success; on a layout mismatch
    returns the sentinel ``seminar("-100")`` with ``right = False``
    (guard added for consistency with the sibling scrapers, which all
    trap layout failures instead of crashing).
    """
    content = get_html(url, "STL")
    soup = BeautifulSoup(content, "html.parser")
    try:
        main_part = soup.find("div", class_="cell large-auto")
        title = main_part.find("h1").get_text()
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(url, e))
        ress = seminar("-100")
        ress.right = False
        return ress
    temp_obj = seminar(title)
    # 'info' carries the raw time string; clamp overly long values.
    if len(info) >= 40:
        temp_obj.real_time = info[:39]
    else:
        temp_obj.real_time = info
    temp_obj.url = url
    temp_obj.type = "STL"
    text_parts = []
    seminar.mkdirs(path)
    # Text paragraphs first.
    for ele in main_part.find_all("p"):
        try:
            if util.judge_str(ele.get_text()):
                text_parts.append(ele.get_text())
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取国法讲座:{}中遇到问题,重试!".format(title))
    temp_obj.content = text_parts
    # Then images, which this site wraps in <figure> elements.
    for ele in main_part.find_all("figure"):
        ura = ele.find("img").get("src")
        temp_obj.jpeg.append(ura)
        out_name = path + "/" + "pic_" + str(pic_id) + ".jpg"
        Web_crawler.get_jpeg_out(ura, out_name)
        temp_obj.jpeg_real_path.append(out_name)
        pic_id += 1
    My_logger.my_logger.warning("已完成爬取:{}".format(title))
    return temp_obj
# コード例 #8 (Code example #8)
def get_TSINGHUA_SEMINAR(org_url):
    """Scan the Tsinghua SIGS seminar listing page.

    The list is rendered client-side, so the page source is fetched via
    the project's selenium helper rather than plain HTTP. Returns three
    parallel lists (detail URLs, titles, times); on a layout mismatch
    returns three empty lists.
    """
    chrome = Web_crawler.web_selenium("chrome")
    content = BeautifulSoup(chrome.get_source(org_url), "html.parser")
    # BUG FIX: soup.find never raises on a miss — it returns None — so the
    # old try/except guard here could never fire and the AttributeError
    # surfaced later, uncaught, at yugao.find_all. Test the result instead.
    yugao = content.find("div", {"class": "contain_news xwlist"})
    if yugao is None:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(
            org_url, "找不到讲座列表"))
        return [], [], []

    url_list = []
    title_simple_list = []
    time_list = []

    # One "mox_list" div per seminar entry; pull title, time, and link.
    for article in yugao.find_all("div", {"class": "mox_list"}):
        title_simple_list.append(
            article.find("div", {
                "class": "news_title"
            }).get_text())
        time_list.append(
            article.find("div", {
                "class": "news_time"
            }).get_text())
        url_list.append(article.find("a")["href"])

    My_logger.my_logger.info("扫描清华信息完毕,需要爬取{}讲座信息".format(
        len(title_simple_list)))
    return url_list, title_simple_list, time_list