def get_HSBC_onehtml(url, time, path, pic_id):
    """Scrape one HSBC business-school seminar detail page.

    Args:
        url: detail-page URL to fetch.
        time: announcement time string stored on the record.
        path: directory where downloaded images are written.
        pic_id: starting index used to name image files (pic_<n>.jpg).

    Returns:
        A populated ``seminar`` object, or a ``seminar("-100")`` with
        ``right = False`` when the page layout does not match expectations.
    """
    content = get_html(url, "HSBC")
    soup = BeautifulSoup(content, "html.parser")
    try:
        main_part = soup.find("div", class_="common")
        title = main_part.find("div", class_="title").get_text()
        # Build the seminar record.
        temp_obj = seminar(title)
        temp_obj.time = time
        temp_obj.url = url
        temp_obj.type = "HSBC"
        content = []  # collected text paragraphs (reuses the name on purpose)
        main_part = main_part.find("div", class_="content clearfix")
        seminar.mkdirs(path)
        main_list = main_part.find_all("div")
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(url, e))
        ress = seminar("-100")
        ress.right = False
        return ress
    for ele in main_list:
        try:
            jpeg_list = ele.find_all("img")
            if len(jpeg_list) == 0:
                # Text-only element: keep it when it passes the content filter.
                if util.judge_str(ele.get_text()) == True:
                    content.append(ele.get_text())
            else:
                for singel in jpeg_list:
                    ura = singel.get("src")
                    # BUG FIX: skip "img-user" images *before* recording the
                    # URL; the original appended to temp_obj.jpeg first, so the
                    # URL list went out of sync with temp_obj.jpeg_real_path.
                    if "img-user" in ura:
                        continue
                    temp_obj.jpeg.append(ura)
                    out_name = path + "/" + "pic_" + str(pic_id) + ".jpg"
                    Web_crawler.get_jpeg_out(ura, out_name)
                    temp_obj.jpeg_real_path.append(out_name)
                    pic_id += 1
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取汇丰{}讲座时遇到问题,重试!".format(title))
    temp_obj.content = content
    My_logger.my_logger.warning("已完成爬取:{}".format(title))
    return temp_obj
def get_HIT_one_html(url, title, real_time, time, path, pic_id):
    """Scrape one HIT (Shenzhen) seminar detail page into a seminar object.

    Downloads the page with site-specific headers, walks every <p> inside the
    "detail" container, collecting text paragraphs and saving any images under
    *path* as pic_<n>.jpg.  Returns a seminar("-100") with right = False when
    the page does not match the expected layout.
    """
    req_headers = {
        'User-Agent': util.User_Agent,
        'Referer': util.Referer["HIT"],
        "Host": util.Host["HIT"],
    }
    page = Web_crawler.get_html_withheader(url, req_headers)
    soup = BeautifulSoup(page, "html.parser")
    paragraphs = []
    base_img_url = "http://www.hitsz.edu.cn"  # image srcs are site-relative
    try:
        # Build the seminar record up front.
        record = seminar(title)
        record.time = time
        record.real_time = real_time
        record.url = url
        record.type = "HIT"
        seminar.mkdirs(path)
        detail = soup.find("div", class_="detail")
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(url, e))
        failed = seminar("-100")
        failed.right = False
        return failed
    for para in detail.find_all("p"):
        imgs = para.find_all("img")
        try:
            if imgs:
                for img in imgs:
                    img_url = base_img_url + img.get("src")
                    save_path = path + "/" + "pic_" + str(pic_id) + ".jpg"
                    Web_crawler.get_jpeg_out(img_url, save_path)
                    record.jpeg_real_path.append(save_path)
                    pic_id += 1
            elif util.judge_str(para.get_text()) == True:
                # Plain-text paragraph that passed the content filter.
                paragraphs.append(para.get_text())
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取哈工大的{}讲座时候遇到问题!".format(title))
    record.content = paragraphs
    My_logger.my_logger.info("已完成爬取{}!".format(title))
    return record
def get_utszlecture_one_html(url, title, real_time, path, pic_id):
    """Scrape one University Town (UTSZ) lecture detail page.

    Fetches the page with site-specific headers, then walks every <p> inside
    the "conboxcon reset-conboxcon" container, keeping text paragraphs and
    downloading images to *path*.  Returns a seminar("-100") with
    right = False when the expected container is missing.
    """
    req_headers = {
        'User-Agent': util.User_Agent,
        "Host": util.Host["utsz_lecture"],
        'Referer': util.Referer["utsz_lecture"],
    }
    page_text = requests.get(url, headers=req_headers, timeout=120).text
    soup = BeautifulSoup(page_text, "html.parser")
    # Build the seminar record.
    record = seminar(title)
    record.url = url
    record.real_time = real_time
    record.type = "utsz"
    paragraphs = []
    seminar.mkdirs(path)
    try:
        body = soup.find("div", {
            "class": "conboxcon reset-conboxcon"
        }).find_all("p")
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(url, e))
        failed = seminar("-100")
        failed.right = False
        return failed
    for para in body:
        imgs = para.find_all("img")
        try:
            if imgs:
                for img in imgs:
                    img_url = img.get("src")
                    save_path = path + "/" + "pic_" + str(pic_id) + ".jpg"
                    Web_crawler.get_jpeg_out(img_url, save_path)
                    record.jpeg_real_path.append(save_path)
                    pic_id += 1
            elif util.judge_str(para.get_text()) == True:
                # Plain-text paragraph that passed the content filter.
                paragraphs.append(para.get_text())
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取大学城的{}讲座时候遇到问题!".format(title))
    record.content = paragraphs
    My_logger.my_logger.warning("已完成爬取:{}".format(title))
    return record
def get_html(url, name):
    """Fetch *url* through Web_crawler using the per-site headers for *name*.

    *name* is a key into util.Referer / util.Host (e.g. "HSBC", "TSINGHUA").
    """
    site_headers = {
        'User-Agent': util.User_Agent,
        'Referer': util.Referer[name],
        "Host": util.Host[name],
    }
    return Web_crawler.get_html_withheader(url, site_headers)
def get_utszlecture_one_html_old(url, title, info, path, pic_id):
    """(Legacy) Scrape one UTSZ lecture page from the old site layout.

    Retries the download up to 10 times with a short timeout, then extracts
    images and text paragraphs from the "edittext" container.

    Returns a populated ``seminar`` object, or ``seminar("-100")`` with
    ``right = False`` when the page could not be fetched at all.
    """
    # Force HTTP/1.0 on httplib2's underlying connection (legacy-site quirk).
    httplib2.http.client.HTTPConnection._http_vsn = 10
    httplib2.http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    httplib2.Response.version = 10
    header = {
        'User-Agent': util.User_Agent,
        "Host": util.Host["utsz_lecture"],
        'Referer': util.Referer["utsz_lecture"]
    }
    # BUG FIX: the original issued a redundant blocking request (timeout=120)
    # whose result was immediately overwritten by the retry loop, used a bare
    # except, and then fed the Response object itself -- not its .text -- to
    # BeautifulSoup.
    real_content = None
    for i in range(10):
        print("第{}次尝试".format(i + 1))
        try:
            real_content = requests.get(url, headers=header, timeout=5)
            break
        except requests.RequestException:
            continue
    if real_content is None:
        # All retries failed: return the standard failure sentinel instead of
        # crashing inside BeautifulSoup.
        My_logger.my_logger.error("{}:不符合爬取预期,跳过!".format(url))
        ress = seminar("-100")
        ress.right = False
        return ress
    soup = BeautifulSoup(real_content.text, "html.parser")
    # Build the seminar record.
    temp_obj = seminar(title)
    temp_obj.url = url
    temp_obj.type = "utsz"
    words_list = []
    # Images and text both live in the "edittext" container.
    edittext = soup.find("div", {"class": "edittext"})
    if edittext is not None:
        for url_small in edittext.find_all("img", {"src": re.compile("jpg|png")}):
            out_name = path + "/" + title + "pic_" + str(pic_id) + ".jpg"
            Web_crawler.get_jpeg_out(url_small["src"], out_name)
            temp_obj.jpeg_real_path.append(out_name)
            pic_id += 1
            temp_obj.jpeg.append(url_small["src"])
        for item in edittext.find_all("p"):
            if util.judge_str(item.string) == True:
                words_list.append(item.string)
    temp_obj.content = words_list
    My_logger.my_logger.warning("已完成爬取:{}".format(title))
    return temp_obj
def get_TSINGHUA_one_html(url, time, path, pic_id):
    """Scrape one Tsinghua SIGS seminar detail page.

    Args:
        url: detail-page URL to fetch.
        time: announcement time string stored on the record.
        path: directory where downloaded images are written.
        pic_id: starting index used to name image files (pic_<n>.jpg).

    Returns:
        A populated ``seminar`` object, or ``seminar("-100")`` with
        ``right = False`` when the page layout does not match expectations.
    """
    content = get_html(url, "TSINGHUA")
    soup = BeautifulSoup(content, "html.parser")
    try:
        title = soup.find("h1", {"class": "arti_title"})
        # Build the seminar record.
        temp_obj = seminar(title.string)
        temp_obj.url = url
        temp_obj.type = "TSINGHUA"
        temp_obj.time = time
        content = []  # collected text paragraphs (reuses the name on purpose)
        # Image srcs on this site are relative; prepend the site origin.
        org_jpeg_url = "https://www.sigs.tsinghua.edu.cn"
        main_part = soup.find("div", class_="wp_articlecontent").find_all("p")
        seminar.mkdirs(path)
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(url, e))
        ress = seminar("-100")
        ress.right = False
        return ress
    for ele in main_part:
        jpeg_list = ele.find_all("img")
        try:
            if len(jpeg_list) == 0:
                # Text-only paragraph: keep it when it passes the filter.
                if util.judge_str(ele.get_text()) == True:
                    content.append(ele.get_text())
            else:
                for singel in jpeg_list:
                    ura = org_jpeg_url + singel.get("src")
                    # BUG FIX: `ura` already carries the site prefix; the
                    # original appended org_jpeg_url + ura, double-prefixing
                    # every recorded image URL.
                    temp_obj.jpeg.append(ura)
                    out_name = path + "/" + "pic_" + str(pic_id) + ".jpg"
                    Web_crawler.get_jpeg_out(ura, out_name)
                    temp_obj.jpeg_real_path.append(out_name)
                    pic_id += 1
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取清华的{}讲座时候遇到问题!".format(title.string))
    temp_obj.content = content
    My_logger.my_logger.warning("已完成爬取:{}".format(title.string))
    return temp_obj
def get_STL_one_html(url, info, path, pic_id):
    """Scrape one STL (Peking University School of Transnational Law) page.

    Collects the title from the first <h1>, all filtered <p> text, and every
    <figure> image (saved to *path* as pic_<n>.jpg).  *info* is stored as the
    record's real_time, truncated to 39 characters when 40 or longer.
    """
    page = get_html(url, "STL")
    soup = BeautifulSoup(page, "html.parser")
    body = soup.find("div", class_="cell large-auto")
    title = body.find("h1").get_text()
    record = seminar(title)
    # Long info strings are truncated before being stored as real_time.
    record.real_time = info[:39] if len(info) >= 40 else info
    record.url = url
    record.type = "STL"
    paragraphs = []
    seminar.mkdirs(path)
    # Text paragraphs first.
    for para in body.find_all("p"):
        try:
            if util.judge_str(para.get_text()) == True:
                paragraphs.append(para.get_text())
        except Exception as e:
            My_logger.my_logger.error("发现问题{}!".format(e))
            My_logger.my_logger.error("爬取国法讲座:{}中遇到问题,重试!".format(title))
    record.content = paragraphs
    # Then every figure's image.
    for fig in body.find_all("figure"):
        img_url = fig.find("img").get("src")
        record.jpeg.append(img_url)
        save_path = path + "/" + "pic_" + str(pic_id) + ".jpg"
        Web_crawler.get_jpeg_out(img_url, save_path)
        record.jpeg_real_path.append(save_path)
        pic_id += 1
    My_logger.my_logger.warning("已完成爬取:{}".format(title))
    return record
def get_TSINGHUA_SEMINAR(org_url):
    """Scan the Tsinghua SIGS seminar listing page.

    Uses a selenium-driven Chrome instance because the listing is rendered
    client-side.  Returns three parallel lists: detail-page URLs, seminar
    titles, and announcement times.  On scrape failure returns three empty
    lists so callers can skip this source.
    """
    chrome = Web_crawler.web_selenium("chrome")
    content = BeautifulSoup(chrome.get_source(org_url), "html.parser")
    try:
        yugao = content.find("div", {"class": "contain_news xwlist"})
        # BUG FIX: soup.find() returns None rather than raising when the
        # container is missing, so the original except never fired and the
        # loop below crashed with AttributeError.  Route the None case
        # through the same failure path explicitly.
        if yugao is None:
            raise ValueError("container 'contain_news xwlist' not found")
    except Exception as e:
        My_logger.my_logger.error("{}:不符合爬取预期,跳过,错误{}!".format(org_url, e))
        return [], [], []
    url_list = []
    title_simple_list = []
    time_list = []
    for article in yugao.find_all("div", {"class": "mox_list"}):
        title_simple_list.append(
            article.find("div", {
                "class": "news_title"
            }).get_text())
        time_list.append(
            article.find("div", {
                "class": "news_time"
            }).get_text())
        url_list.append(article.find("a")["href"])
    My_logger.my_logger.info("扫描清华信息完毕,需要爬取{}讲座信息".format(
        len(title_simple_list)))
    return url_list, title_simple_list, time_list